Diffstat (limited to 'kernel')
-rw-r--r--  kernel/audit.c | 1
-rw-r--r--  kernel/bpf/core.c | 5
-rw-r--r--  kernel/bpf/rqspinlock.c | 1
-rw-r--r--  kernel/bpf/syscall.c | 1
-rw-r--r--  kernel/cgroup/cgroup-internal.h | 8
-rw-r--r--  kernel/cgroup/cgroup-v1.c | 12
-rw-r--r--  kernel/cgroup/cgroup.c | 50
-rw-r--r--  kernel/cgroup/cpuset-internal.h | 54
-rw-r--r--  kernel/cgroup/cpuset-v1.c | 271
-rw-r--r--  kernel/cgroup/cpuset.c | 506
-rw-r--r--  kernel/cgroup/debug.c | 2
-rw-r--r--  kernel/configs/debug.config | 2
-rw-r--r--  kernel/crash_core.c | 17
-rw-r--r--  kernel/crash_dump_dm_crypt.c | 21
-rw-r--r--  kernel/debug/gdbstub.c | 1
-rw-r--r--  kernel/delayacct.c | 33
-rw-r--r--  kernel/dma/Kconfig | 6
-rw-r--r--  kernel/dma/debug.c | 28
-rw-r--r--  kernel/dma/direct.c | 3
-rw-r--r--  kernel/dma/direct.h | 3
-rw-r--r--  kernel/fork.c | 4
-rw-r--r--  kernel/irq/irqdomain.c | 1
-rw-r--r--  kernel/kallsyms.c | 79
-rw-r--r--  kernel/kallsyms_internal.h | 1
-rw-r--r--  kernel/kcsan/kcsan_test.c | 4
-rw-r--r--  kernel/kexec_file.c | 131
-rw-r--r--  kernel/liveupdate/Kconfig | 17
-rw-r--r--  kernel/liveupdate/Makefile | 1
-rw-r--r--  kernel/liveupdate/kexec_handover.c | 147
-rw-r--r--  kernel/liveupdate/luo_core.c | 10
-rw-r--r--  kernel/liveupdate/luo_file.c | 39
-rw-r--r--  kernel/liveupdate/luo_flb.c | 654
-rw-r--r--  kernel/liveupdate/luo_internal.h | 22
-rw-r--r--  kernel/module/kallsyms.c | 9
-rw-r--r--  kernel/panic.c | 164
-rw-r--r--  kernel/power/swap.c | 10
-rw-r--r--  kernel/printk/internal.h | 8
-rw-r--r--  kernel/printk/nbcon.c | 23
-rw-r--r--  kernel/printk/printk.c | 70
-rw-r--r--  kernel/printk/printk_ringbuffer.h | 5
-rw-r--r--  kernel/rcu/srcutree.c | 3
-rw-r--r--  kernel/resource.c | 73
-rw-r--r--  kernel/sched/core.c | 7
-rw-r--r--  kernel/sched/ext.c | 4
-rw-r--r--  kernel/sched/sched.h | 6
-rw-r--r--  kernel/sched/stats.h | 8
-rw-r--r--  kernel/sys.c | 30
-rw-r--r--  kernel/trace/Kconfig | 8
-rw-r--r--  kernel/trace/Makefile | 1
-rw-r--r--  kernel/trace/blktrace.c | 23
-rw-r--r--  kernel/trace/bpf_trace.c | 5
-rw-r--r--  kernel/trace/fgraph.c | 2
-rw-r--r--  kernel/trace/ftrace.c | 12
-rw-r--r--  kernel/trace/ring_buffer.c | 24
-rw-r--r--  kernel/trace/rv/monitors/nrp/nrp.c | 30
-rw-r--r--  kernel/trace/rv/monitors/nrp/nrp.h | 22
-rw-r--r--  kernel/trace/rv/monitors/opid/opid.c | 40
-rw-r--r--  kernel/trace/rv/monitors/opid/opid.h | 24
-rw-r--r--  kernel/trace/rv/monitors/rtapp/rtapp.c | 2
-rw-r--r--  kernel/trace/rv/monitors/sched/sched.c | 2
-rw-r--r--  kernel/trace/rv/monitors/sco/sco.c | 26
-rw-r--r--  kernel/trace/rv/monitors/sco/sco.h | 14
-rw-r--r--  kernel/trace/rv/monitors/scpd/scpd.c | 28
-rw-r--r--  kernel/trace/rv/monitors/scpd/scpd.h | 14
-rw-r--r--  kernel/trace/rv/monitors/snep/snep.c | 28
-rw-r--r--  kernel/trace/rv/monitors/snep/snep.h | 18
-rw-r--r--  kernel/trace/rv/monitors/snroc/snroc.c | 26
-rw-r--r--  kernel/trace/rv/monitors/snroc/snroc.h | 14
-rw-r--r--  kernel/trace/rv/monitors/sssw/sssw.c | 38
-rw-r--r--  kernel/trace/rv/monitors/sssw/sssw.h | 22
-rw-r--r--  kernel/trace/rv/monitors/sts/sts.c | 34
-rw-r--r--  kernel/trace/rv/monitors/sts/sts.h | 28
-rw-r--r--  kernel/trace/rv/monitors/wip/wip.c | 26
-rw-r--r--  kernel/trace/rv/monitors/wip/wip.h | 14
-rw-r--r--  kernel/trace/rv/monitors/wwnr/wwnr.c | 28
-rw-r--r--  kernel/trace/rv/monitors/wwnr/wwnr.h | 14
-rw-r--r--  kernel/trace/trace.c | 1059
-rw-r--r--  kernel/trace/trace.h | 133
-rw-r--r--  kernel/trace/trace_events.c | 163
-rw-r--r--  kernel/trace/trace_events_filter.c | 2
-rw-r--r--  kernel/trace/trace_events_hist.c | 101
-rw-r--r--  kernel/trace/trace_events_synth.c | 6
-rw-r--r--  kernel/trace/trace_events_trigger.c | 62
-rw-r--r--  kernel/trace/trace_hwlat.c | 15
-rw-r--r--  kernel/trace/trace_kprobe.c | 6
-rw-r--r--  kernel/trace/trace_output.c | 30
-rw-r--r--  kernel/trace/trace_pid.c | 246
-rw-r--r--  kernel/trace/trace_printk.c | 430
-rw-r--r--  kernel/trace/trace_selftest.c | 10
-rw-r--r--  kernel/trace/trace_seq.c | 29
-rw-r--r--  kernel/tracepoint.c | 18
-rw-r--r--  kernel/tsacct.c | 2
-rw-r--r--  kernel/ucount.c | 2
-rw-r--r--  kernel/vmcore_info.c | 7
-rw-r--r--  kernel/watchdog.c | 12
-rw-r--r--  kernel/watchdog_perf.c | 50
-rw-r--r--  kernel/workqueue.c | 149
97 files changed, 3586 insertions, 2068 deletions
diff --git a/kernel/audit.c b/kernel/audit.c
index 39c4f26c484d..592d927e70f9 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -32,6 +32,7 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/file.h>
+#include <linux/hex.h>
#include <linux/init.h>
#include <linux/types.h>
#include <linux/atomic.h>
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index dc906dfdff94..5ab6bace7d0d 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -25,6 +25,7 @@
#include <linux/prandom.h>
#include <linux/bpf.h>
#include <linux/btf.h>
+#include <linux/hex.h>
#include <linux/objtool.h>
#include <linux/overflow.h>
#include <linux/rbtree_latch.h>
@@ -716,8 +717,8 @@ static struct bpf_ksym *bpf_ksym_find(unsigned long addr)
return n ? container_of(n, struct bpf_ksym, tnode) : NULL;
}
-int __bpf_address_lookup(unsigned long addr, unsigned long *size,
- unsigned long *off, char *sym)
+int bpf_address_lookup(unsigned long addr, unsigned long *size,
+ unsigned long *off, char *sym)
{
struct bpf_ksym *ksym;
int ret = 0;
diff --git a/kernel/bpf/rqspinlock.c b/kernel/bpf/rqspinlock.c
index 2fdfa828e3d3..e4e338cdb437 100644
--- a/kernel/bpf/rqspinlock.c
+++ b/kernel/bpf/rqspinlock.c
@@ -695,7 +695,6 @@ __bpf_kfunc int bpf_res_spin_lock(struct bpf_res_spin_lock *lock)
int ret;
BUILD_BUG_ON(sizeof(rqspinlock_t) != sizeof(struct bpf_res_spin_lock));
- BUILD_BUG_ON(__alignof__(rqspinlock_t) != __alignof__(struct bpf_res_spin_lock));
preempt_disable();
ret = res_spin_lock((rqspinlock_t *)lock);
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 683c332dbafb..dd89bf809772 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -9,6 +9,7 @@
#include <linux/bpf_verifier.h>
#include <linux/bsearch.h>
#include <linux/btf.h>
+#include <linux/hex.h>
#include <linux/syscalls.h>
#include <linux/slab.h>
#include <linux/sched/signal.h>
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
index 22051b4f1ccb..3bfe37693d68 100644
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -52,7 +52,7 @@ struct cgroup_fs_context {
bool cpuset_clone_children;
bool none; /* User explicitly requested empty subsystem */
bool all_ss; /* Seen 'all' option */
- u16 subsys_mask; /* Selected subsystems */
+ u32 subsys_mask; /* Selected subsystems */
char *name; /* Hierarchy name */
char *release_agent; /* Path for release notifications */
};
@@ -146,7 +146,7 @@ struct cgroup_mgctx {
struct cgroup_taskset tset;
/* subsystems affected by migration */
- u16 ss_mask;
+ u32 ss_mask;
};
#define CGROUP_TASKSET_INIT(tset) \
@@ -235,8 +235,8 @@ int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,
void cgroup_favor_dynmods(struct cgroup_root *root, bool favor);
void cgroup_free_root(struct cgroup_root *root);
void init_cgroup_root(struct cgroup_fs_context *ctx);
-int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask);
-int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask);
+int cgroup_setup_root(struct cgroup_root *root, u32 ss_mask);
+int rebind_subsystems(struct cgroup_root *dst_root, u32 ss_mask);
int cgroup_do_get_tree(struct fs_context *fc);
int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp);
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index a9e029b570c8..724950c4b690 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -28,7 +28,7 @@
#define CGROUP_PIDLIST_DESTROY_DELAY HZ
/* Controllers blocked by the commandline in v1 */
-static u16 cgroup_no_v1_mask;
+static u32 cgroup_no_v1_mask;
/* disable named v1 mounts */
static bool cgroup_no_v1_named;
@@ -1037,13 +1037,13 @@ int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param)
static int check_cgroupfs_options(struct fs_context *fc)
{
struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
- u16 mask = U16_MAX;
- u16 enabled = 0;
+ u32 mask = U32_MAX;
+ u32 enabled = 0;
struct cgroup_subsys *ss;
int i;
#ifdef CONFIG_CPUSETS
- mask = ~((u16)1 << cpuset_cgrp_id);
+ mask = ~((u32)1 << cpuset_cgrp_id);
#endif
for_each_subsys(ss, i)
if (cgroup_ssid_enabled(i) && !cgroup1_ssid_disabled(i) &&
@@ -1095,7 +1095,7 @@ int cgroup1_reconfigure(struct fs_context *fc)
struct kernfs_root *kf_root = kernfs_root_from_sb(fc->root->d_sb);
struct cgroup_root *root = cgroup_root_from_kf(kf_root);
int ret = 0;
- u16 added_mask, removed_mask;
+ u32 added_mask, removed_mask;
cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
@@ -1343,7 +1343,7 @@ static int __init cgroup_no_v1(char *str)
continue;
if (!strcmp(token, "all")) {
- cgroup_no_v1_mask = U16_MAX;
+ cgroup_no_v1_mask = U32_MAX;
continue;
}
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 5f0d33b04910..8af4351536cf 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -203,13 +203,13 @@ EXPORT_SYMBOL_GPL(cgrp_dfl_root);
bool cgrp_dfl_visible;
/* some controllers are not supported in the default hierarchy */
-static u16 cgrp_dfl_inhibit_ss_mask;
+static u32 cgrp_dfl_inhibit_ss_mask;
/* some controllers are implicitly enabled on the default hierarchy */
-static u16 cgrp_dfl_implicit_ss_mask;
+static u32 cgrp_dfl_implicit_ss_mask;
/* some controllers can be threaded on the default hierarchy */
-static u16 cgrp_dfl_threaded_ss_mask;
+static u32 cgrp_dfl_threaded_ss_mask;
/* The list of hierarchy roots */
LIST_HEAD(cgroup_roots);
@@ -231,10 +231,10 @@ static u64 css_serial_nr_next = 1;
* These bitmasks identify subsystems with specific features to avoid
* having to do iterative checks repeatedly.
*/
-static u16 have_fork_callback __read_mostly;
-static u16 have_exit_callback __read_mostly;
-static u16 have_release_callback __read_mostly;
-static u16 have_canfork_callback __read_mostly;
+static u32 have_fork_callback __read_mostly;
+static u32 have_exit_callback __read_mostly;
+static u32 have_release_callback __read_mostly;
+static u32 have_canfork_callback __read_mostly;
static bool have_favordynmods __ro_after_init = IS_ENABLED(CONFIG_CGROUP_FAVOR_DYNMODS);
@@ -472,13 +472,13 @@ static bool cgroup_is_valid_domain(struct cgroup *cgrp)
}
/* subsystems visibly enabled on a cgroup */
-static u16 cgroup_control(struct cgroup *cgrp)
+static u32 cgroup_control(struct cgroup *cgrp)
{
struct cgroup *parent = cgroup_parent(cgrp);
- u16 root_ss_mask = cgrp->root->subsys_mask;
+ u32 root_ss_mask = cgrp->root->subsys_mask;
if (parent) {
- u16 ss_mask = parent->subtree_control;
+ u32 ss_mask = parent->subtree_control;
/* threaded cgroups can only have threaded controllers */
if (cgroup_is_threaded(cgrp))
@@ -493,12 +493,12 @@ static u16 cgroup_control(struct cgroup *cgrp)
}
/* subsystems enabled on a cgroup */
-static u16 cgroup_ss_mask(struct cgroup *cgrp)
+static u32 cgroup_ss_mask(struct cgroup *cgrp)
{
struct cgroup *parent = cgroup_parent(cgrp);
if (parent) {
- u16 ss_mask = parent->subtree_ss_mask;
+ u32 ss_mask = parent->subtree_ss_mask;
/* threaded cgroups can only have threaded controllers */
if (cgroup_is_threaded(cgrp))
@@ -1633,9 +1633,9 @@ static umode_t cgroup_file_mode(const struct cftype *cft)
* This function calculates which subsystems need to be enabled if
* @subtree_control is to be applied while restricted to @this_ss_mask.
*/
-static u16 cgroup_calc_subtree_ss_mask(u16 subtree_control, u16 this_ss_mask)
+static u32 cgroup_calc_subtree_ss_mask(u32 subtree_control, u32 this_ss_mask)
{
- u16 cur_ss_mask = subtree_control;
+ u32 cur_ss_mask = subtree_control;
struct cgroup_subsys *ss;
int ssid;
@@ -1644,7 +1644,7 @@ static u16 cgroup_calc_subtree_ss_mask(u16 subtree_control, u16 this_ss_mask)
cur_ss_mask |= cgrp_dfl_implicit_ss_mask;
while (true) {
- u16 new_ss_mask = cur_ss_mask;
+ u32 new_ss_mask = cur_ss_mask;
do_each_subsys_mask(ss, ssid, cur_ss_mask) {
new_ss_mask |= ss->depends_on;
@@ -1848,12 +1848,12 @@ err:
return ret;
}
-int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
+int rebind_subsystems(struct cgroup_root *dst_root, u32 ss_mask)
{
struct cgroup *dcgrp = &dst_root->cgrp;
struct cgroup_subsys *ss;
int ssid, ret;
- u16 dfl_disable_ss_mask = 0;
+ u32 dfl_disable_ss_mask = 0;
lockdep_assert_held(&cgroup_mutex);
@@ -2149,7 +2149,7 @@ void init_cgroup_root(struct cgroup_fs_context *ctx)
set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
}
-int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
+int cgroup_setup_root(struct cgroup_root *root, u32 ss_mask)
{
LIST_HEAD(tmp_links);
struct cgroup *root_cgrp = &root->cgrp;
@@ -3131,7 +3131,7 @@ void cgroup_procs_write_finish(struct task_struct *task,
put_task_struct(task);
}
-static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask)
+static void cgroup_print_ss_mask(struct seq_file *seq, u32 ss_mask)
{
struct cgroup_subsys *ss;
bool printed = false;
@@ -3496,9 +3496,9 @@ static void cgroup_finalize_control(struct cgroup *cgrp, int ret)
cgroup_apply_control_disable(cgrp);
}
-static int cgroup_vet_subtree_control_enable(struct cgroup *cgrp, u16 enable)
+static int cgroup_vet_subtree_control_enable(struct cgroup *cgrp, u32 enable)
{
- u16 domain_enable = enable & ~cgrp_dfl_threaded_ss_mask;
+ u32 domain_enable = enable & ~cgrp_dfl_threaded_ss_mask;
/* if nothing is getting enabled, nothing to worry about */
if (!enable)
@@ -3541,7 +3541,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
char *buf, size_t nbytes,
loff_t off)
{
- u16 enable = 0, disable = 0;
+ u32 enable = 0, disable = 0;
struct cgroup *cgrp, *child;
struct cgroup_subsys *ss;
char *tok;
@@ -4945,7 +4945,7 @@ bool css_has_online_children(struct cgroup_subsys_state *css)
rcu_read_lock();
css_for_each_child(child, css) {
- if (child->flags & CSS_ONLINE) {
+ if (css_is_online(child)) {
ret = true;
break;
}
@@ -5750,7 +5750,7 @@ static void offline_css(struct cgroup_subsys_state *css)
lockdep_assert_held(&cgroup_mutex);
- if (!(css->flags & CSS_ONLINE))
+ if (!css_is_online(css))
return;
if (ss->css_offline)
@@ -6347,7 +6347,7 @@ int __init cgroup_init(void)
struct cgroup_subsys *ss;
int ssid;
- BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 16);
+ BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 32);
BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files));
BUG_ON(cgroup_init_cftypes(NULL, cgroup_psi_files));
BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files));
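The u16 -> u32 widening above raises the controller limit from 16 to 32. A minimal user-space sketch (illustrative values only, not kernel code) of how such a subsystem bitmask bounds the number of controllers and how its set bits are walked, loosely analogous to do_each_subsys_mask():
#include <stdint.h>
#include <stdio.h>

#define MAX_SUBSYS_BITS 32	/* a u32 ss_mask supports up to 32 controllers */

static void print_enabled(uint32_t ss_mask)
{
	/* walk every set bit of the mask */
	for (int ssid = 0; ssid < MAX_SUBSYS_BITS; ssid++)
		if (ss_mask & (1U << ssid))
			printf("subsystem id %d enabled\n", ssid);
}

int main(void)
{
	uint32_t ss_mask = 0;

	ss_mask |= 1U << 3;	/* a low controller id */
	ss_mask |= 1U << 20;	/* an id >= 16 would not fit in a u16 mask */
	print_enabled(ss_mask);
	return 0;
}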
diff --git a/kernel/cgroup/cpuset-internal.h b/kernel/cgroup/cpuset-internal.h
index 01976c8e7d49..fd7d19842ded 100644
--- a/kernel/cgroup/cpuset-internal.h
+++ b/kernel/cgroup/cpuset-internal.h
@@ -9,6 +9,7 @@
#include <linux/cpuset.h>
#include <linux/spinlock.h>
#include <linux/union_find.h>
+#include <linux/sched/isolation.h>
/* See "Frequency meter" comments, below. */
@@ -144,17 +145,12 @@ struct cpuset {
*/
nodemask_t old_mems_allowed;
- struct fmeter fmeter; /* memory_pressure filter */
-
/*
* Tasks are being attached to this cpuset. Used to prevent
* zeroing cpus/mems_allowed between ->can_attach() and ->attach().
*/
int attach_in_progress;
- /* for custom sched domain */
- int relax_domain_level;
-
/* partition root state */
int partition_root_state;
@@ -179,10 +175,19 @@ struct cpuset {
/* Handle for cpuset.cpus.partition */
struct cgroup_file partition_file;
+#ifdef CONFIG_CPUSETS_V1
+ struct fmeter fmeter; /* memory_pressure filter */
+
+ /* for custom sched domain */
+ int relax_domain_level;
+
/* Used to merge intersecting subsets for generate_sched_domains */
struct uf_node node;
+#endif
};
+extern struct cpuset top_cpuset;
+
static inline struct cpuset *css_cs(struct cgroup_subsys_state *css)
{
return css ? container_of(css, struct cpuset, css) : NULL;
@@ -240,6 +245,30 @@ static inline int is_spread_slab(const struct cpuset *cs)
return test_bit(CS_SPREAD_SLAB, &cs->flags);
}
+/*
+ * Helper routine for generate_sched_domains().
+ * Do cpusets a, b have overlapping effective cpus_allowed masks?
+ */
+static inline int cpusets_overlap(struct cpuset *a, struct cpuset *b)
+{
+ return cpumask_intersects(a->effective_cpus, b->effective_cpus);
+}
+
+static inline int nr_cpusets(void)
+{
+ /* jump label reference count + the top-level cpuset */
+ return static_key_count(&cpusets_enabled_key.key) + 1;
+}
+
+static inline bool cpuset_is_populated(struct cpuset *cs)
+{
+ lockdep_assert_cpuset_lock_held();
+
+ /* Cpusets in the process of attaching should be considered as populated */
+ return cgroup_is_populated(cs->css.cgroup) ||
+ cs->attach_in_progress;
+}
+
/**
* cpuset_for_each_child - traverse online children of a cpuset
* @child_cs: loop cursor pointing to the current child
@@ -285,7 +314,6 @@ void cpuset_full_unlock(void);
*/
#ifdef CONFIG_CPUSETS_V1
extern struct cftype cpuset1_files[];
-void fmeter_init(struct fmeter *fmp);
void cpuset1_update_task_spread_flags(struct cpuset *cs,
struct task_struct *tsk);
void cpuset1_update_tasks_flags(struct cpuset *cs);
@@ -293,8 +321,13 @@ void cpuset1_hotplug_update_tasks(struct cpuset *cs,
struct cpumask *new_cpus, nodemask_t *new_mems,
bool cpus_updated, bool mems_updated);
int cpuset1_validate_change(struct cpuset *cur, struct cpuset *trial);
+bool cpuset1_cpus_excl_conflict(struct cpuset *cs1, struct cpuset *cs2);
+void cpuset1_init(struct cpuset *cs);
+void cpuset1_online_css(struct cgroup_subsys_state *css);
+int cpuset1_generate_sched_domains(cpumask_var_t **domains,
+ struct sched_domain_attr **attributes);
+
#else
-static inline void fmeter_init(struct fmeter *fmp) {}
static inline void cpuset1_update_task_spread_flags(struct cpuset *cs,
struct task_struct *tsk) {}
static inline void cpuset1_update_tasks_flags(struct cpuset *cs) {}
@@ -303,6 +336,13 @@ static inline void cpuset1_hotplug_update_tasks(struct cpuset *cs,
bool cpus_updated, bool mems_updated) {}
static inline int cpuset1_validate_change(struct cpuset *cur,
struct cpuset *trial) { return 0; }
+static inline bool cpuset1_cpus_excl_conflict(struct cpuset *cs1,
+ struct cpuset *cs2) { return false; }
+static inline void cpuset1_init(struct cpuset *cs) {}
+static inline void cpuset1_online_css(struct cgroup_subsys_state *css) {}
+static inline int cpuset1_generate_sched_domains(cpumask_var_t **domains,
+ struct sched_domain_attr **attributes) { return 0; };
+
#endif /* CONFIG_CPUSETS_V1 */
#endif /* __CPUSET_INTERNAL_H */
diff --git a/kernel/cgroup/cpuset-v1.c b/kernel/cgroup/cpuset-v1.c
index 12e76774c75b..7a23b9e8778f 100644
--- a/kernel/cgroup/cpuset-v1.c
+++ b/kernel/cgroup/cpuset-v1.c
@@ -62,7 +62,7 @@ struct cpuset_remove_tasks_struct {
#define FM_SCALE 1000 /* faux fixed point scale */
/* Initialize a frequency meter */
-void fmeter_init(struct fmeter *fmp)
+static void fmeter_init(struct fmeter *fmp)
{
fmp->cnt = 0;
fmp->val = 0;
@@ -368,11 +368,44 @@ int cpuset1_validate_change(struct cpuset *cur, struct cpuset *trial)
if (par && !is_cpuset_subset(trial, par))
goto out;
+ /*
+ * Cpusets with tasks - existing or newly being attached - can't
+ * be changed to have empty cpus_allowed or mems_allowed.
+ */
+ ret = -ENOSPC;
+ if (cpuset_is_populated(cur)) {
+ if (!cpumask_empty(cur->cpus_allowed) &&
+ cpumask_empty(trial->cpus_allowed))
+ goto out;
+ if (!nodes_empty(cur->mems_allowed) &&
+ nodes_empty(trial->mems_allowed))
+ goto out;
+ }
+
ret = 0;
out:
return ret;
}
+/*
+ * cpuset1_cpus_excl_conflict() - Check if two cpusets have exclusive CPU conflicts
+ * under the legacy (v1) hierarchy
+ * @cs1: first cpuset to check
+ * @cs2: second cpuset to check
+ *
+ * Returns: true if CPU exclusivity conflict exists, false otherwise
+ *
+ * If either cpuset is CPU exclusive, their allowed CPUs cannot intersect.
+ */
+bool cpuset1_cpus_excl_conflict(struct cpuset *cs1, struct cpuset *cs2)
+{
+ if (is_cpu_exclusive(cs1) || is_cpu_exclusive(cs2))
+ return cpumask_intersects(cs1->cpus_allowed,
+ cs2->cpus_allowed);
+
+ return false;
+}
+
#ifdef CONFIG_PROC_PID_CPUSET
/*
* proc_cpuset_show()
@@ -499,6 +532,242 @@ out_unlock:
return retval;
}
+void cpuset1_init(struct cpuset *cs)
+{
+ fmeter_init(&cs->fmeter);
+ cs->relax_domain_level = -1;
+}
+
+void cpuset1_online_css(struct cgroup_subsys_state *css)
+{
+ struct cpuset *tmp_cs;
+ struct cgroup_subsys_state *pos_css;
+ struct cpuset *cs = css_cs(css);
+ struct cpuset *parent = parent_cs(cs);
+
+ lockdep_assert_cpus_held();
+ lockdep_assert_cpuset_lock_held();
+
+ if (is_spread_page(parent))
+ set_bit(CS_SPREAD_PAGE, &cs->flags);
+ if (is_spread_slab(parent))
+ set_bit(CS_SPREAD_SLAB, &cs->flags);
+
+ if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
+ return;
+
+ /*
+ * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is
+ * set. This flag handling is implemented in cgroup core for
+ * historical reasons - the flag may be specified during mount.
+ *
+ * Currently, if any sibling cpusets have exclusive cpus or mem, we
+ * refuse to clone the configuration - thereby refusing the task to
+ * be entered, and as a result refusing the sys_unshare() or
+ * clone() which initiated it. If this becomes a problem for some
+ * users who wish to allow that scenario, then this could be
+ * changed to grant parent->cpus_allowed-sibling_cpus_exclusive
+ * (and likewise for mems) to the new cgroup.
+ */
+ rcu_read_lock();
+ cpuset_for_each_child(tmp_cs, pos_css, parent) {
+ if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) {
+ rcu_read_unlock();
+ return;
+ }
+ }
+ rcu_read_unlock();
+
+ cpuset_callback_lock_irq();
+ cs->mems_allowed = parent->mems_allowed;
+ cs->effective_mems = parent->mems_allowed;
+ cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
+ cpumask_copy(cs->effective_cpus, parent->cpus_allowed);
+ cpuset_callback_unlock_irq();
+}
+
+static void
+update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
+{
+ if (dattr->relax_domain_level < c->relax_domain_level)
+ dattr->relax_domain_level = c->relax_domain_level;
+}
+
+static void update_domain_attr_tree(struct sched_domain_attr *dattr,
+ struct cpuset *root_cs)
+{
+ struct cpuset *cp;
+ struct cgroup_subsys_state *pos_css;
+
+ rcu_read_lock();
+ cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
+ /* skip the whole subtree if @cp doesn't have any CPU */
+ if (cpumask_empty(cp->cpus_allowed)) {
+ pos_css = css_rightmost_descendant(pos_css);
+ continue;
+ }
+
+ if (is_sched_load_balance(cp))
+ update_domain_attr(dattr, cp);
+ }
+ rcu_read_unlock();
+}
+
+/*
+ * cpuset1_generate_sched_domains()
+ *
+ * Finding the best partition (set of domains):
+ * The double nested loops below over i, j scan over the load
+ * balanced cpusets (using the array of cpuset pointers in csa[])
+ * looking for pairs of cpusets that have overlapping cpus_allowed
+ * and merging them using a union-find algorithm.
+ *
+ * The union of the cpus_allowed masks from the set of all cpusets
+ * having the same root then form the one element of the partition
+ * (one sched domain) to be passed to partition_sched_domains().
+ */
+int cpuset1_generate_sched_domains(cpumask_var_t **domains,
+ struct sched_domain_attr **attributes)
+{
+ struct cpuset *cp; /* top-down scan of cpusets */
+ struct cpuset **csa; /* array of all cpuset ptrs */
+ int csn; /* how many cpuset ptrs in csa so far */
+ int i, j; /* indices for partition finding loops */
+ cpumask_var_t *doms; /* resulting partition; i.e. sched domains */
+ struct sched_domain_attr *dattr; /* attributes for custom domains */
+ int ndoms = 0; /* number of sched domains in result */
+ int nslot; /* next empty doms[] struct cpumask slot */
+ struct cgroup_subsys_state *pos_css;
+ int nslot_update;
+
+ lockdep_assert_cpuset_lock_held();
+
+ doms = NULL;
+ dattr = NULL;
+ csa = NULL;
+
+ /* Special case for the 99% of systems with one, full, sched domain */
+ if (is_sched_load_balance(&top_cpuset)) {
+ ndoms = 1;
+ doms = alloc_sched_domains(ndoms);
+ if (!doms)
+ goto done;
+
+ dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL);
+ if (dattr) {
+ *dattr = SD_ATTR_INIT;
+ update_domain_attr_tree(dattr, &top_cpuset);
+ }
+ cpumask_and(doms[0], top_cpuset.effective_cpus,
+ housekeeping_cpumask(HK_TYPE_DOMAIN));
+
+ goto done;
+ }
+
+ csa = kmalloc_array(nr_cpusets(), sizeof(cp), GFP_KERNEL);
+ if (!csa)
+ goto done;
+ csn = 0;
+
+ rcu_read_lock();
+ cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) {
+ if (cp == &top_cpuset)
+ continue;
+
+ /*
+ * Continue traversing beyond @cp iff @cp has some CPUs and
+ * isn't load balancing. The former is obvious. The
+ * latter: All child cpusets contain a subset of the
+ * parent's cpus, so just skip them, and then we call
+ * update_domain_attr_tree() to calc relax_domain_level of
+ * the corresponding sched domain.
+ */
+ if (!cpumask_empty(cp->cpus_allowed) &&
+ !(is_sched_load_balance(cp) &&
+ cpumask_intersects(cp->cpus_allowed,
+ housekeeping_cpumask(HK_TYPE_DOMAIN))))
+ continue;
+
+ if (is_sched_load_balance(cp) &&
+ !cpumask_empty(cp->effective_cpus))
+ csa[csn++] = cp;
+
+ /* skip @cp's subtree */
+ pos_css = css_rightmost_descendant(pos_css);
+ continue;
+ }
+ rcu_read_unlock();
+
+ for (i = 0; i < csn; i++)
+ uf_node_init(&csa[i]->node);
+
+ /* Merge overlapping cpusets */
+ for (i = 0; i < csn; i++) {
+ for (j = i + 1; j < csn; j++) {
+ if (cpusets_overlap(csa[i], csa[j]))
+ uf_union(&csa[i]->node, &csa[j]->node);
+ }
+ }
+
+ /* Count the total number of domains */
+ for (i = 0; i < csn; i++) {
+ if (uf_find(&csa[i]->node) == &csa[i]->node)
+ ndoms++;
+ }
+
+ /*
+ * Now we know how many domains to create.
+ * Convert <csn, csa> to <ndoms, doms> and populate cpu masks.
+ */
+ doms = alloc_sched_domains(ndoms);
+ if (!doms)
+ goto done;
+
+ /*
+ * The rest of the code, including the scheduler, can deal with
+ * dattr==NULL case. No need to abort if alloc fails.
+ */
+ dattr = kmalloc_array(ndoms, sizeof(struct sched_domain_attr),
+ GFP_KERNEL);
+
+ for (nslot = 0, i = 0; i < csn; i++) {
+ nslot_update = 0;
+ for (j = i; j < csn; j++) {
+ if (uf_find(&csa[j]->node) == &csa[i]->node) {
+ struct cpumask *dp = doms[nslot];
+
+ if (i == j) {
+ nslot_update = 1;
+ cpumask_clear(dp);
+ if (dattr)
+ *(dattr + nslot) = SD_ATTR_INIT;
+ }
+ cpumask_or(dp, dp, csa[j]->effective_cpus);
+ cpumask_and(dp, dp, housekeeping_cpumask(HK_TYPE_DOMAIN));
+ if (dattr)
+ update_domain_attr_tree(dattr + nslot, csa[j]);
+ }
+ }
+ if (nslot_update)
+ nslot++;
+ }
+ BUG_ON(nslot != ndoms);
+
+done:
+ kfree(csa);
+
+ /*
+ * Fallback to the default domain if kmalloc() failed.
+ * See comments in partition_sched_domains().
+ */
+ if (doms == NULL)
+ ndoms = 1;
+
+ *domains = doms;
+ *attributes = dattr;
+ return ndoms;
+}
+
/*
* for the common functions, 'private' gives the type of file
*/
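A stand-alone sketch of the union-find merge that the cpuset1_generate_sched_domains() comment above describes, assuming toy uint64_t bitmaps in place of cpumasks and hypothetical CPU sets; cpusets whose masks overlap collapse into one sched domain:
#include <stdint.h>
#include <stdio.h>

#define NSETS 4

static int parent[NSETS];

static int uf_find(int x)
{
	while (parent[x] != x) {
		parent[x] = parent[parent[x]];	/* path halving */
		x = parent[x];
	}
	return x;
}

static void uf_union(int a, int b)
{
	int ra = uf_find(a);
	int rb = uf_find(b);

	if (ra != rb)
		parent[ra] = rb;
}

int main(void)
{
	/* hypothetical effective_cpus masks of four load-balanced cpusets */
	uint64_t cpus[NSETS] = { 0x0f, 0x30, 0x20, 0xc0 };
	int i, j, ndoms = 0;

	for (i = 0; i < NSETS; i++)
		parent[i] = i;

	/* merge cpusets whose CPU masks overlap */
	for (i = 0; i < NSETS; i++)
		for (j = i + 1; j < NSETS; j++)
			if (cpus[i] & cpus[j])
				uf_union(i, j);

	/* each union-find root corresponds to one sched domain */
	for (i = 0; i < NSETS; i++)
		if (uf_find(i) == i)
			ndoms++;

	printf("%d sched domains\n", ndoms);	/* sets {0}, {1,2}, {3} -> 3 */
	return 0;
}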
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 01a553caee56..832179236529 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -119,6 +119,17 @@ static bool force_sd_rebuild;
* For simplicity, a local partition can be created under a local or remote
* partition but a remote partition cannot have any partition root in its
* ancestor chain except the cgroup root.
+ *
+ * A valid partition can be formed by setting exclusive_cpus, or cpus_allowed
+ * if exclusive_cpus is not set. In the case of a partition with empty
+ * exclusive_cpus, any conflicting exclusive CPUs specified in the
+ * following cpumasks of sibling cpusets will be removed from its
+ * cpus_allowed when determining its effective_xcpus.
+ * - effective_xcpus
+ * - exclusive_cpus
+ *
+ * The "cpuset.cpus.exclusive" control file should be used for setting up
+ * a partition if users want to get as many CPUs as possible.
*/
#define PRS_MEMBER 0
#define PRS_ROOT 1
@@ -201,12 +212,10 @@ static inline void notify_partition_change(struct cpuset *cs, int old_prs)
* If cpu_online_mask is used while a hotunplug operation is happening in
* parallel, we may leave an offline CPU in cpu_allowed or some other masks.
*/
-static struct cpuset top_cpuset = {
+struct cpuset top_cpuset = {
.flags = BIT(CS_CPU_EXCLUSIVE) |
BIT(CS_MEM_EXCLUSIVE) | BIT(CS_SCHED_LOAD_BALANCE),
.partition_root_state = PRS_ROOT,
- .relax_domain_level = -1,
- .remote_partition = false,
};
/*
@@ -261,6 +270,11 @@ void cpuset_unlock(void)
mutex_unlock(&cpuset_mutex);
}
+void lockdep_assert_cpuset_lock_held(void)
+{
+ lockdep_assert_held(&cpuset_mutex);
+}
+
/**
* cpuset_full_lock - Acquire full protection for cpuset modification
*
@@ -319,7 +333,7 @@ static inline void check_insane_mems_config(nodemask_t *nodes)
*/
static inline void dec_attach_in_progress_locked(struct cpuset *cs)
{
- lockdep_assert_held(&cpuset_mutex);
+ lockdep_assert_cpuset_lock_held();
cs->attach_in_progress--;
if (!cs->attach_in_progress)
@@ -353,15 +367,6 @@ static inline bool is_in_v2_mode(void)
(cpuset_cgrp_subsys.root->flags & CGRP_ROOT_CPUSET_V2_MODE);
}
-static inline bool cpuset_is_populated(struct cpuset *cs)
-{
- lockdep_assert_held(&cpuset_mutex);
-
- /* Cpusets in the process of attaching should be considered as populated */
- return cgroup_is_populated(cs->css.cgroup) ||
- cs->attach_in_progress;
-}
-
/**
* partition_is_populated - check if partition has tasks
* @cs: partition root to be checked
@@ -453,9 +458,8 @@ static void guarantee_active_cpus(struct task_struct *tsk,
*/
static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
{
- while (!nodes_intersects(cs->effective_mems, node_states[N_MEMORY]))
+ while (!nodes_and(*pmask, cs->effective_mems, node_states[N_MEMORY]))
cs = parent_cs(cs);
- nodes_and(*pmask, cs->effective_mems, node_states[N_MEMORY]);
}
/**
@@ -603,36 +607,32 @@ static inline bool cpusets_are_exclusive(struct cpuset *cs1, struct cpuset *cs2)
/**
* cpus_excl_conflict - Check if two cpusets have exclusive CPU conflicts
- * @cs1: first cpuset to check
- * @cs2: second cpuset to check
+ * @trial: the trial cpuset to be checked
+ * @sibling: a sibling cpuset to be checked against
+ * @xcpus_changed: set if exclusive_cpus has been changed
*
* Returns: true if CPU exclusivity conflict exists, false otherwise
*
* Conflict detection rules:
- * 1. If either cpuset is CPU exclusive, they must be mutually exclusive
- * 2. exclusive_cpus masks cannot intersect between cpusets
- * 3. The allowed CPUs of one cpuset cannot be a subset of another's exclusive CPUs
+ * o cgroup v1
+ * See cpuset1_cpus_excl_conflict()
+ * o cgroup v2
+ * - The exclusive_cpus values cannot overlap.
+ * - New exclusive_cpus cannot be a superset of a sibling's cpus_allowed.
*/
-static inline bool cpus_excl_conflict(struct cpuset *cs1, struct cpuset *cs2)
+static inline bool cpus_excl_conflict(struct cpuset *trial, struct cpuset *sibling,
+ bool xcpus_changed)
{
- /* If either cpuset is exclusive, check if they are mutually exclusive */
- if (is_cpu_exclusive(cs1) || is_cpu_exclusive(cs2))
- return !cpusets_are_exclusive(cs1, cs2);
-
- /* Exclusive_cpus cannot intersect */
- if (cpumask_intersects(cs1->exclusive_cpus, cs2->exclusive_cpus))
- return true;
-
- /* The cpus_allowed of one cpuset cannot be a subset of another cpuset's exclusive_cpus */
- if (!cpumask_empty(cs1->cpus_allowed) &&
- cpumask_subset(cs1->cpus_allowed, cs2->exclusive_cpus))
- return true;
+ if (!cpuset_v2())
+ return cpuset1_cpus_excl_conflict(trial, sibling);
- if (!cpumask_empty(cs2->cpus_allowed) &&
- cpumask_subset(cs2->cpus_allowed, cs1->exclusive_cpus))
+ /* The cpus_allowed of a sibling cpuset cannot be a subset of the new exclusive_cpus */
+ if (xcpus_changed && !cpumask_empty(sibling->cpus_allowed) &&
+ cpumask_subset(sibling->cpus_allowed, trial->exclusive_cpus))
return true;
- return false;
+ /* Exclusive_cpus cannot intersect */
+ return cpumask_intersects(trial->exclusive_cpus, sibling->exclusive_cpus);
}
static inline bool mems_excl_conflict(struct cpuset *cs1, struct cpuset *cs2)
@@ -666,6 +666,7 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
{
struct cgroup_subsys_state *css;
struct cpuset *c, *par;
+ bool xcpus_changed;
int ret = 0;
rcu_read_lock();
@@ -682,20 +683,6 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
par = parent_cs(cur);
/*
- * Cpusets with tasks - existing or newly being attached - can't
- * be changed to have empty cpus_allowed or mems_allowed.
- */
- ret = -ENOSPC;
- if (cpuset_is_populated(cur)) {
- if (!cpumask_empty(cur->cpus_allowed) &&
- cpumask_empty(trial->cpus_allowed))
- goto out;
- if (!nodes_empty(cur->mems_allowed) &&
- nodes_empty(trial->mems_allowed))
- goto out;
- }
-
- /*
* We can't shrink if we won't have enough room for SCHED_DEADLINE
* tasks. This check is not done when scheduling is disabled as the
* users should know what they are doing.
@@ -722,10 +709,11 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
* overlap. exclusive_cpus cannot overlap with each other if set.
*/
ret = -EINVAL;
+ xcpus_changed = !cpumask_equal(cur->exclusive_cpus, trial->exclusive_cpus);
cpuset_for_each_child(c, css, par) {
if (c == cur)
continue;
- if (cpus_excl_conflict(trial, c))
+ if (cpus_excl_conflict(trial, c, xcpus_changed))
goto out;
if (mems_excl_conflict(trial, c))
goto out;
@@ -738,49 +726,6 @@ out:
}
#ifdef CONFIG_SMP
-/*
- * Helper routine for generate_sched_domains().
- * Do cpusets a, b have overlapping effective cpus_allowed masks?
- */
-static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
-{
- return cpumask_intersects(a->effective_cpus, b->effective_cpus);
-}
-
-static void
-update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
-{
- if (dattr->relax_domain_level < c->relax_domain_level)
- dattr->relax_domain_level = c->relax_domain_level;
- return;
-}
-
-static void update_domain_attr_tree(struct sched_domain_attr *dattr,
- struct cpuset *root_cs)
-{
- struct cpuset *cp;
- struct cgroup_subsys_state *pos_css;
-
- rcu_read_lock();
- cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
- /* skip the whole subtree if @cp doesn't have any CPU */
- if (cpumask_empty(cp->cpus_allowed)) {
- pos_css = css_rightmost_descendant(pos_css);
- continue;
- }
-
- if (is_sched_load_balance(cp))
- update_domain_attr(dattr, cp);
- }
- rcu_read_unlock();
-}
-
-/* Must be called with cpuset_mutex held. */
-static inline int nr_cpusets(void)
-{
- /* jump label reference count + the top-level cpuset */
- return static_key_count(&cpusets_enabled_key.key) + 1;
-}
/*
* generate_sched_domains()
@@ -820,103 +765,46 @@ static inline int nr_cpusets(void)
* convenient format, that can be easily compared to the prior
* value to determine what partition elements (sched domains)
* were changed (added or removed.)
- *
- * Finding the best partition (set of domains):
- * The double nested loops below over i, j scan over the load
- * balanced cpusets (using the array of cpuset pointers in csa[])
- * looking for pairs of cpusets that have overlapping cpus_allowed
- * and merging them using a union-find algorithm.
- *
- * The union of the cpus_allowed masks from the set of all cpusets
- * having the same root then form the one element of the partition
- * (one sched domain) to be passed to partition_sched_domains().
- *
*/
static int generate_sched_domains(cpumask_var_t **domains,
struct sched_domain_attr **attributes)
{
struct cpuset *cp; /* top-down scan of cpusets */
struct cpuset **csa; /* array of all cpuset ptrs */
- int csn; /* how many cpuset ptrs in csa so far */
int i, j; /* indices for partition finding loops */
cpumask_var_t *doms; /* resulting partition; i.e. sched domains */
struct sched_domain_attr *dattr; /* attributes for custom domains */
int ndoms = 0; /* number of sched domains in result */
- int nslot; /* next empty doms[] struct cpumask slot */
struct cgroup_subsys_state *pos_css;
- bool root_load_balance = is_sched_load_balance(&top_cpuset);
- bool cgrpv2 = cpuset_v2();
- int nslot_update;
+
+ if (!cpuset_v2())
+ return cpuset1_generate_sched_domains(domains, attributes);
doms = NULL;
dattr = NULL;
csa = NULL;
/* Special case for the 99% of systems with one, full, sched domain */
- if (root_load_balance && cpumask_empty(subpartitions_cpus)) {
-single_root_domain:
+ if (cpumask_empty(subpartitions_cpus)) {
ndoms = 1;
- doms = alloc_sched_domains(ndoms);
- if (!doms)
- goto done;
-
- dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL);
- if (dattr) {
- *dattr = SD_ATTR_INIT;
- update_domain_attr_tree(dattr, &top_cpuset);
- }
- cpumask_and(doms[0], top_cpuset.effective_cpus,
- housekeeping_cpumask(HK_TYPE_DOMAIN));
-
- goto done;
+ /* !csa will be checked and can be correctly handled */
+ goto generate_doms;
}
csa = kmalloc_array(nr_cpusets(), sizeof(cp), GFP_KERNEL);
if (!csa)
goto done;
- csn = 0;
+ /* Find how many partitions and cache them to csa[] */
rcu_read_lock();
- if (root_load_balance)
- csa[csn++] = &top_cpuset;
cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) {
- if (cp == &top_cpuset)
- continue;
-
- if (cgrpv2)
- goto v2;
-
- /*
- * v1:
- * Continue traversing beyond @cp iff @cp has some CPUs and
- * isn't load balancing. The former is obvious. The
- * latter: All child cpusets contain a subset of the
- * parent's cpus, so just skip them, and then we call
- * update_domain_attr_tree() to calc relax_domain_level of
- * the corresponding sched domain.
- */
- if (!cpumask_empty(cp->cpus_allowed) &&
- !(is_sched_load_balance(cp) &&
- cpumask_intersects(cp->cpus_allowed,
- housekeeping_cpumask(HK_TYPE_DOMAIN))))
- continue;
-
- if (is_sched_load_balance(cp) &&
- !cpumask_empty(cp->effective_cpus))
- csa[csn++] = cp;
-
- /* skip @cp's subtree */
- pos_css = css_rightmost_descendant(pos_css);
- continue;
-
-v2:
/*
* Only valid partition roots that are not isolated and with
- * non-empty effective_cpus will be saved into csn[].
+ * non-empty effective_cpus will be saved into csa[].
*/
if ((cp->partition_root_state == PRS_ROOT) &&
!cpumask_empty(cp->effective_cpus))
- csa[csn++] = cp;
+ csa[ndoms++] = cp;
/*
* Skip @cp's subtree if not a partition root and has no
@@ -927,40 +815,18 @@ v2:
}
rcu_read_unlock();
- /*
- * If there are only isolated partitions underneath the cgroup root,
- * we can optimize out unneeded sched domains scanning.
- */
- if (root_load_balance && (csn == 1))
- goto single_root_domain;
-
- for (i = 0; i < csn; i++)
- uf_node_init(&csa[i]->node);
-
- /* Merge overlapping cpusets */
- for (i = 0; i < csn; i++) {
- for (j = i + 1; j < csn; j++) {
- if (cpusets_overlap(csa[i], csa[j])) {
+ for (i = 0; i < ndoms; i++) {
+ for (j = i + 1; j < ndoms; j++) {
+ if (cpusets_overlap(csa[i], csa[j]))
/*
* Cgroup v2 shouldn't pass down overlapping
* partition root cpusets.
*/
- WARN_ON_ONCE(cgrpv2);
- uf_union(&csa[i]->node, &csa[j]->node);
- }
+ WARN_ON_ONCE(1);
}
}
- /* Count the total number of domains */
- for (i = 0; i < csn; i++) {
- if (uf_find(&csa[i]->node) == &csa[i]->node)
- ndoms++;
- }
-
- /*
- * Now we know how many domains to create.
- * Convert <csn, csa> to <ndoms, doms> and populate cpu masks.
- */
+generate_doms:
doms = alloc_sched_domains(ndoms);
if (!doms)
goto done;
@@ -977,45 +843,19 @@ v2:
* to SD_ATTR_INIT. Also non-isolating partition root CPUs are a
* subset of HK_TYPE_DOMAIN housekeeping CPUs.
*/
- if (cgrpv2) {
- for (i = 0; i < ndoms; i++) {
- /*
- * The top cpuset may contain some boot time isolated
- * CPUs that need to be excluded from the sched domain.
- */
- if (csa[i] == &top_cpuset)
- cpumask_and(doms[i], csa[i]->effective_cpus,
- housekeeping_cpumask(HK_TYPE_DOMAIN));
- else
- cpumask_copy(doms[i], csa[i]->effective_cpus);
- if (dattr)
- dattr[i] = SD_ATTR_INIT;
- }
- goto done;
- }
-
- for (nslot = 0, i = 0; i < csn; i++) {
- nslot_update = 0;
- for (j = i; j < csn; j++) {
- if (uf_find(&csa[j]->node) == &csa[i]->node) {
- struct cpumask *dp = doms[nslot];
-
- if (i == j) {
- nslot_update = 1;
- cpumask_clear(dp);
- if (dattr)
- *(dattr + nslot) = SD_ATTR_INIT;
- }
- cpumask_or(dp, dp, csa[j]->effective_cpus);
- cpumask_and(dp, dp, housekeeping_cpumask(HK_TYPE_DOMAIN));
- if (dattr)
- update_domain_attr_tree(dattr + nslot, csa[j]);
- }
- }
- if (nslot_update)
- nslot++;
+ for (i = 0; i < ndoms; i++) {
+ /*
+ * The top cpuset may contain some boot time isolated
+ * CPUs that need to be excluded from the sched domain.
+ */
+ if (!csa || csa[i] == &top_cpuset)
+ cpumask_and(doms[i], top_cpuset.effective_cpus,
+ housekeeping_cpumask(HK_TYPE_DOMAIN));
+ else
+ cpumask_copy(doms[i], csa[i]->effective_cpus);
+ if (dattr)
+ dattr[i] = SD_ATTR_INIT;
}
- BUG_ON(nslot != ndoms);
done:
kfree(csa);
@@ -1055,7 +895,7 @@ void dl_rebuild_rd_accounting(void)
int cpu;
u64 cookie = ++dl_cookie;
- lockdep_assert_held(&cpuset_mutex);
+ lockdep_assert_cpuset_lock_held();
lockdep_assert_cpus_held();
lockdep_assert_held(&sched_domains_mutex);
@@ -1100,53 +940,33 @@ void dl_rebuild_rd_accounting(void)
*/
void rebuild_sched_domains_locked(void)
{
- struct cgroup_subsys_state *pos_css;
struct sched_domain_attr *attr;
cpumask_var_t *doms;
- struct cpuset *cs;
int ndoms;
+ int i;
lockdep_assert_cpus_held();
- lockdep_assert_held(&cpuset_mutex);
+ lockdep_assert_cpuset_lock_held();
force_sd_rebuild = false;
- /*
- * If we have raced with CPU hotplug, return early to avoid
- * passing doms with offlined cpu to partition_sched_domains().
- * Anyways, cpuset_handle_hotplug() will rebuild sched domains.
- *
- * With no CPUs in any subpartitions, top_cpuset's effective CPUs
- * should be the same as the active CPUs, so checking only top_cpuset
- * is enough to detect racing CPU offlines.
- */
- if (cpumask_empty(subpartitions_cpus) &&
- !cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
- return;
+ /* Generate domain masks and attrs */
+ ndoms = generate_sched_domains(&doms, &attr);
/*
- * With subpartition CPUs, however, the effective CPUs of a partition
- * root should be only a subset of the active CPUs. Since a CPU in any
- * partition root could be offlined, all must be checked.
- */
- if (!cpumask_empty(subpartitions_cpus)) {
- rcu_read_lock();
- cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
- if (!is_partition_valid(cs)) {
- pos_css = css_rightmost_descendant(pos_css);
- continue;
- }
- if (!cpumask_subset(cs->effective_cpus,
- cpu_active_mask)) {
- rcu_read_unlock();
- return;
- }
- }
- rcu_read_unlock();
+ * cpuset_hotplug_workfn is invoked synchronously now, thus this
+ * function should not race with CPU hotplug. And the effective CPUs
+ * must not include any offline CPUs. Passing an offline CPU in the
+ * doms to partition_sched_domains() will trigger a kernel panic.
+ *
+ * We perform a final check here: if the doms contains any
+ * offline CPUs, a warning is emitted and we return directly to
+ * prevent the panic.
+ */
+ for (i = 0; i < ndoms; ++i) {
+ if (WARN_ON_ONCE(!cpumask_subset(doms[i], cpu_active_mask)))
+ return;
}
- /* Generate domain masks and attrs */
- ndoms = generate_sched_domains(&doms, &attr);
-
/* Have scheduler rebuild the domains */
partition_sched_domains(ndoms, doms, attr);
}
@@ -1501,23 +1321,29 @@ static int rm_siblings_excl_cpus(struct cpuset *parent, struct cpuset *cs,
int retval = 0;
if (cpumask_empty(excpus))
- return retval;
+ return 0;
/*
- * Exclude exclusive CPUs from siblings
+ * Remove exclusive CPUs from siblings
*/
rcu_read_lock();
cpuset_for_each_child(sibling, css, parent) {
+ struct cpumask *sibling_xcpus;
+
if (sibling == cs)
continue;
- if (cpumask_intersects(excpus, sibling->exclusive_cpus)) {
- cpumask_andnot(excpus, excpus, sibling->exclusive_cpus);
- retval++;
- continue;
- }
- if (cpumask_intersects(excpus, sibling->effective_xcpus)) {
- cpumask_andnot(excpus, excpus, sibling->effective_xcpus);
+ /*
+ * If exclusive_cpus is defined, effective_xcpus will always
+ * be a subset. Otherwise, effective_xcpus will only be set
+ * in a valid partition root.
+ */
+ sibling_xcpus = cpumask_empty(sibling->exclusive_cpus)
+ ? sibling->effective_xcpus
+ : sibling->exclusive_cpus;
+
+ if (cpumask_intersects(excpus, sibling_xcpus)) {
+ cpumask_andnot(excpus, excpus, sibling_xcpus);
retval++;
}
}
@@ -1806,7 +1632,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
int parent_prs = parent->partition_root_state;
bool nocpu;
- lockdep_assert_held(&cpuset_mutex);
+ lockdep_assert_cpuset_lock_held();
WARN_ON_ONCE(is_remote_partition(cs)); /* For local partition only */
/*
@@ -2315,17 +2141,13 @@ get_css:
spin_lock_irq(&callback_lock);
cpumask_copy(cp->effective_cpus, tmp->new_cpus);
cp->partition_root_state = new_prs;
- if (!cpumask_empty(cp->exclusive_cpus) && (cp != cs))
- compute_excpus(cp, cp->effective_xcpus);
-
/*
- * Make sure effective_xcpus is properly set for a valid
- * partition root.
+ * Need to compute effective_xcpus if either exclusive_cpus
+ * is non-empty or it is a valid partition root.
*/
- if ((new_prs > 0) && cpumask_empty(cp->exclusive_cpus))
- cpumask_and(cp->effective_xcpus,
- cp->cpus_allowed, parent->effective_xcpus);
- else if (new_prs < 0)
+ if ((new_prs > 0) || !cpumask_empty(cp->exclusive_cpus))
+ compute_excpus(cp, cp->effective_xcpus);
+ if (new_prs <= 0)
reset_partition_data(cp);
spin_unlock_irq(&callback_lock);
@@ -2378,7 +2200,7 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
struct cpuset *sibling;
struct cgroup_subsys_state *pos_css;
- lockdep_assert_held(&cpuset_mutex);
+ lockdep_assert_cpuset_lock_held();
/*
* Check all its siblings and call update_cpumasks_hier()
@@ -2387,27 +2209,20 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
* It is possible a change in parent's effective_cpus
* due to a change in a child partition's effective_xcpus will impact
* its siblings even if they do not inherit parent's effective_cpus
- * directly.
+ * directly. It should not impact valid partitions.
*
* The update_cpumasks_hier() function may sleep. So we have to
* release the RCU read lock before calling it.
*/
rcu_read_lock();
cpuset_for_each_child(sibling, pos_css, parent) {
- if (sibling == cs)
+ if (sibling == cs || is_partition_valid(sibling))
continue;
- if (!is_partition_valid(sibling)) {
- compute_effective_cpumask(tmp->new_cpus, sibling,
- parent);
- if (cpumask_equal(tmp->new_cpus, sibling->effective_cpus))
- continue;
- } else if (is_remote_partition(sibling)) {
- /*
- * Change in a sibling cpuset won't affect a remote
- * partition root.
- */
+
+ compute_effective_cpumask(tmp->new_cpus, sibling,
+ parent);
+ if (cpumask_equal(tmp->new_cpus, sibling->effective_cpus))
continue;
- }
if (!css_tryget_online(&sibling->css))
continue;
@@ -2463,43 +2278,6 @@ static enum prs_errcode validate_partition(struct cpuset *cs, struct cpuset *tri
return PERR_NONE;
}
-static int cpus_allowed_validate_change(struct cpuset *cs, struct cpuset *trialcs,
- struct tmpmasks *tmp)
-{
- int retval;
- struct cpuset *parent = parent_cs(cs);
-
- retval = validate_change(cs, trialcs);
-
- if ((retval == -EINVAL) && cpuset_v2()) {
- struct cgroup_subsys_state *css;
- struct cpuset *cp;
-
- /*
- * The -EINVAL error code indicates that partition sibling
- * CPU exclusivity rule has been violated. We still allow
- * the cpumask change to proceed while invalidating the
- * partition. However, any conflicting sibling partitions
- * have to be marked as invalid too.
- */
- trialcs->prs_err = PERR_NOTEXCL;
- rcu_read_lock();
- cpuset_for_each_child(cp, css, parent) {
- struct cpumask *xcpus = user_xcpus(trialcs);
-
- if (is_partition_valid(cp) &&
- cpumask_intersects(xcpus, cp->effective_xcpus)) {
- rcu_read_unlock();
- update_parent_effective_cpumask(cp, partcmd_invalidate, NULL, tmp);
- rcu_read_lock();
- }
- }
- rcu_read_unlock();
- retval = 0;
- }
- return retval;
-}
-
/**
* partition_cpus_change - Handle partition state changes due to CPU mask updates
* @cs: The target cpuset being modified
@@ -2559,15 +2337,15 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed))
return 0;
- if (alloc_tmpmasks(&tmp))
- return -ENOMEM;
-
compute_trialcs_excpus(trialcs, cs);
trialcs->prs_err = PERR_NONE;
- retval = cpus_allowed_validate_change(cs, trialcs, &tmp);
+ retval = validate_change(cs, trialcs);
if (retval < 0)
- goto out_free;
+ return retval;
+
+ if (alloc_tmpmasks(&tmp))
+ return -ENOMEM;
/*
* Check all the descendants in update_cpumasks_hier() if
@@ -2590,7 +2368,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
/* Update CS_SCHED_LOAD_BALANCE and/or sched_domains, if necessary */
if (cs->partition_root_state)
update_partition_sd_lb(cs, old_prs);
-out_free:
+
free_tmpmasks(&tmp);
return retval;
}
@@ -2843,13 +2621,13 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
cpuset_for_each_descendant_pre(cp, pos_css, cs) {
struct cpuset *parent = parent_cs(cp);
- nodes_and(*new_mems, cp->mems_allowed, parent->effective_mems);
+ bool has_mems = nodes_and(*new_mems, cp->mems_allowed, parent->effective_mems);
/*
* If it becomes empty, inherit the effective mask of the
* parent, which is guaranteed to have some MEMs.
*/
- if (is_in_v2_mode() && nodes_empty(*new_mems))
+ if (is_in_v2_mode() && !has_mems)
*new_mems = parent->effective_mems;
/* Skip the whole subtree if the nodemask remains the same. */
@@ -3249,7 +3027,7 @@ static nodemask_t cpuset_attach_nodemask_to;
static void cpuset_attach_task(struct cpuset *cs, struct task_struct *task)
{
- lockdep_assert_held(&cpuset_mutex);
+ lockdep_assert_cpuset_lock_held();
if (cs != &top_cpuset)
guarantee_active_cpus(task, cpus_attach);
@@ -3605,8 +3383,7 @@ cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
return ERR_PTR(-ENOMEM);
__set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
- fmeter_init(&cs->fmeter);
- cs->relax_domain_level = -1;
+ cpuset1_init(cs);
/* Set CS_MEMORY_MIGRATE for default hierarchy */
if (cpuset_v2())
@@ -3619,17 +3396,11 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
{
struct cpuset *cs = css_cs(css);
struct cpuset *parent = parent_cs(cs);
- struct cpuset *tmp_cs;
- struct cgroup_subsys_state *pos_css;
if (!parent)
return 0;
cpuset_full_lock();
- if (is_spread_page(parent))
- set_bit(CS_SPREAD_PAGE, &cs->flags);
- if (is_spread_slab(parent))
- set_bit(CS_SPREAD_SLAB, &cs->flags);
/*
* For v2, clear CS_SCHED_LOAD_BALANCE if parent is isolated
*/
@@ -3644,39 +3415,8 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
cs->effective_mems = parent->effective_mems;
}
spin_unlock_irq(&callback_lock);
+ cpuset1_online_css(css);
- if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
- goto out_unlock;
-
- /*
- * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is
- * set. This flag handling is implemented in cgroup core for
- * historical reasons - the flag may be specified during mount.
- *
- * Currently, if any sibling cpusets have exclusive cpus or mem, we
- * refuse to clone the configuration - thereby refusing the task to
- * be entered, and as a result refusing the sys_unshare() or
- * clone() which initiated it. If this becomes a problem for some
- * users who wish to allow that scenario, then this could be
- * changed to grant parent->cpus_allowed-sibling_cpus_exclusive
- * (and likewise for mems) to the new cgroup.
- */
- rcu_read_lock();
- cpuset_for_each_child(tmp_cs, pos_css, parent) {
- if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) {
- rcu_read_unlock();
- goto out_unlock;
- }
- }
- rcu_read_unlock();
-
- spin_lock_irq(&callback_lock);
- cs->mems_allowed = parent->mems_allowed;
- cs->effective_mems = parent->mems_allowed;
- cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
- cpumask_copy(cs->effective_cpus, parent->cpus_allowed);
- spin_unlock_irq(&callback_lock);
-out_unlock:
cpuset_full_unlock();
return 0;
}
@@ -3876,7 +3616,7 @@ int __init cpuset_init(void)
cpumask_setall(top_cpuset.exclusive_cpus);
nodes_setall(top_cpuset.effective_mems);
- fmeter_init(&top_cpuset.fmeter);
+ cpuset1_init(&top_cpuset);
BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL));
@@ -4210,7 +3950,7 @@ static void __cpuset_cpus_allowed_locked(struct task_struct *tsk, struct cpumask
*/
void cpuset_cpus_allowed_locked(struct task_struct *tsk, struct cpumask *pmask)
{
- lockdep_assert_held(&cpuset_mutex);
+ lockdep_assert_cpuset_lock_held();
__cpuset_cpus_allowed_locked(tsk, pmask);
}
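A toy user-space model (a uint32_t stands in for a nodemask; the hierarchy and values are made up) of the reworked guarantee_online_mems() loop above, which now computes the intersection and the emptiness test in one nodes_and() step while walking up the parent chain:
#include <stdint.h>
#include <stdio.h>

struct cs {
	struct cs *parent;
	uint32_t effective_mems;	/* toy nodemask */
};

static uint32_t guarantee_online_mems(struct cs *cs, uint32_t online)
{
	uint32_t pmask;

	/* nodes_and() returns true iff the resulting mask is non-empty */
	while (!(pmask = cs->effective_mems & online))
		cs = cs->parent;	/* the top cpuset always intersects the online mask */
	return pmask;
}

int main(void)
{
	struct cs top = { .parent = NULL, .effective_mems = 0x3 };
	struct cs child = { .parent = &top, .effective_mems = 0x4 };
	uint32_t online = 0x3;	/* nodes 0 and 1 online */

	/* child has no online node, so the parent's mask is used: 0x3 */
	printf("mems = 0x%x\n", guarantee_online_mems(&child, online));
	return 0;
}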
diff --git a/kernel/cgroup/debug.c b/kernel/cgroup/debug.c
index 81ea38dd6f9d..a5490097fe52 100644
--- a/kernel/cgroup/debug.c
+++ b/kernel/cgroup/debug.c
@@ -230,7 +230,7 @@ static int cgroup_subsys_states_read(struct seq_file *seq, void *v)
}
static void cgroup_masks_read_one(struct seq_file *seq, const char *name,
- u16 mask)
+ u32 mask)
{
struct cgroup_subsys *ss;
int ssid;
diff --git a/kernel/configs/debug.config b/kernel/configs/debug.config
index 9f6ab7dabf67..774702591d26 100644
--- a/kernel/configs/debug.config
+++ b/kernel/configs/debug.config
@@ -84,7 +84,7 @@ CONFIG_SLUB_DEBUG_ON=y
# Debug Oops, Lockups and Hangs
#
CONFIG_BOOTPARAM_HUNG_TASK_PANIC=0
-# CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC is not set
+CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=0
CONFIG_DEBUG_ATOMIC_SLEEP=y
CONFIG_DETECT_HUNG_TASK=y
CONFIG_PANIC_ON_OOPS=y
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index 99dac1aa972a..3952b3e102e0 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -44,9 +44,15 @@ note_buf_t __percpu *crash_notes;
int kimage_crash_copy_vmcoreinfo(struct kimage *image)
{
- struct page *vmcoreinfo_page;
+ struct page *vmcoreinfo_base;
+ struct page *vmcoreinfo_pages[DIV_ROUND_UP(VMCOREINFO_BYTES, PAGE_SIZE)];
+ unsigned int order, nr_pages;
+ int i;
void *safecopy;
+ nr_pages = DIV_ROUND_UP(VMCOREINFO_BYTES, PAGE_SIZE);
+ order = get_order(VMCOREINFO_BYTES);
+
if (!IS_ENABLED(CONFIG_CRASH_DUMP))
return 0;
if (image->type != KEXEC_TYPE_CRASH)
@@ -61,12 +67,15 @@ int kimage_crash_copy_vmcoreinfo(struct kimage *image)
* happens to generate vmcoreinfo note, hereby we rely on
* vmap for this purpose.
*/
- vmcoreinfo_page = kimage_alloc_control_pages(image, 0);
- if (!vmcoreinfo_page) {
+ vmcoreinfo_base = kimage_alloc_control_pages(image, order);
+ if (!vmcoreinfo_base) {
pr_warn("Could not allocate vmcoreinfo buffer\n");
return -ENOMEM;
}
- safecopy = vmap(&vmcoreinfo_page, 1, VM_MAP, PAGE_KERNEL);
+ for (i = 0; i < nr_pages; i++)
+ vmcoreinfo_pages[i] = vmcoreinfo_base + i;
+
+ safecopy = vmap(vmcoreinfo_pages, nr_pages, VM_MAP, PAGE_KERNEL);
if (!safecopy) {
pr_warn("Could not vmap vmcoreinfo buffer\n");
return -ENOMEM;
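A quick user-space check of the page math used in kimage_crash_copy_vmcoreinfo() above, assuming 4 KiB pages and an example VMCOREINFO_BYTES of two pages: the buffer is allocated as one order-N block, but vmap() still wants an array of per-page pointers.
#include <stdio.h>

#define PAGE_SIZE	4096UL
#define VMCOREINFO_BYTES (PAGE_SIZE * 2)	/* assumed example size */

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

static unsigned int get_order(unsigned long size)
{
	unsigned int order = 0;

	size = (size - 1) >> 12;	/* PAGE_SHIFT == 12 for 4 KiB pages */
	while (size) {
		order++;
		size >>= 1;
	}
	return order;
}

int main(void)
{
	unsigned long nr_pages = DIV_ROUND_UP(VMCOREINFO_BYTES, PAGE_SIZE);
	unsigned int order = get_order(VMCOREINFO_BYTES);

	/* 2 pages -> order 1 (2^1 contiguous pages), 2 page pointers for vmap */
	printf("nr_pages=%lu order=%u\n", nr_pages, order);
	return 0;
}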
diff --git a/kernel/crash_dump_dm_crypt.c b/kernel/crash_dump_dm_crypt.c
index 401423ba477d..37129243054d 100644
--- a/kernel/crash_dump_dm_crypt.c
+++ b/kernel/crash_dump_dm_crypt.c
@@ -143,6 +143,7 @@ static int read_key_from_user_keying(struct dm_crypt_key *dm_key)
{
const struct user_key_payload *ukp;
struct key *key;
+ int ret = 0;
kexec_dprintk("Requesting logon key %s", dm_key->key_desc);
key = request_key(&key_type_logon, dm_key->key_desc, NULL);
@@ -152,20 +153,28 @@ static int read_key_from_user_keying(struct dm_crypt_key *dm_key)
return PTR_ERR(key);
}
+ down_read(&key->sem);
ukp = user_key_payload_locked(key);
- if (!ukp)
- return -EKEYREVOKED;
+ if (!ukp) {
+ ret = -EKEYREVOKED;
+ goto out;
+ }
if (ukp->datalen > KEY_SIZE_MAX) {
pr_err("Key size %u exceeds maximum (%u)\n", ukp->datalen, KEY_SIZE_MAX);
- return -EINVAL;
+ ret = -EINVAL;
+ goto out;
}
memcpy(dm_key->data, ukp->data, ukp->datalen);
dm_key->key_size = ukp->datalen;
kexec_dprintk("Get dm crypt key (size=%u) %s: %8ph\n", dm_key->key_size,
dm_key->key_desc, dm_key->data);
- return 0;
+
+out:
+ up_read(&key->sem);
+ key_put(key);
+ return ret;
}
struct config_key {
@@ -223,7 +232,7 @@ static void config_key_release(struct config_item *item)
key_count--;
}
-static struct configfs_item_operations config_key_item_ops = {
+static const struct configfs_item_operations config_key_item_ops = {
.release = config_key_release,
};
@@ -298,7 +307,7 @@ static struct configfs_attribute *config_keys_attrs[] = {
* Note that, since no extra work is required on ->drop_item(),
* no ->drop_item() is provided.
*/
-static struct configfs_group_operations config_keys_group_ops = {
+static const struct configfs_group_operations config_keys_group_ops = {
.make_item = config_keys_make_item,
};
diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c
index 22fe969c5d2e..f586afd76c80 100644
--- a/kernel/debug/gdbstub.c
+++ b/kernel/debug/gdbstub.c
@@ -27,6 +27,7 @@
#include <linux/kernel.h>
#include <linux/sched/signal.h>
+#include <linux/hex.h>
#include <linux/kgdb.h>
#include <linux/kdb.h>
#include <linux/serial_core.h>
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index 30e7912ebb0d..2e55c493c98b 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -18,6 +18,8 @@
do { \
d->type##_delay_max = tsk->delays->type##_delay_max; \
d->type##_delay_min = tsk->delays->type##_delay_min; \
+ d->type##_delay_max_ts.tv_sec = tsk->delays->type##_delay_max_ts.tv_sec; \
+ d->type##_delay_max_ts.tv_nsec = tsk->delays->type##_delay_max_ts.tv_nsec; \
tmp = d->type##_delay_total + tsk->delays->type##_delay; \
d->type##_delay_total = (tmp < d->type##_delay_total) ? 0 : tmp; \
d->type##_count += tsk->delays->type##_count; \
@@ -104,7 +106,8 @@ void __delayacct_tsk_init(struct task_struct *tsk)
* Finish delay accounting for a statistic using its timestamps (@start),
* accumulator (@total) and @count
*/
-static void delayacct_end(raw_spinlock_t *lock, u64 *start, u64 *total, u32 *count, u64 *max, u64 *min)
+static void delayacct_end(raw_spinlock_t *lock, u64 *start, u64 *total, u32 *count,
+ u64 *max, u64 *min, struct timespec64 *ts)
{
s64 ns = local_clock() - *start;
unsigned long flags;
@@ -113,8 +116,10 @@ static void delayacct_end(raw_spinlock_t *lock, u64 *start, u64 *total, u32 *cou
raw_spin_lock_irqsave(lock, flags);
*total += ns;
(*count)++;
- if (ns > *max)
+ if (ns > *max) {
*max = ns;
+ ktime_get_real_ts64(ts);
+ }
if (*min == 0 || ns < *min)
*min = ns;
raw_spin_unlock_irqrestore(lock, flags);
@@ -137,7 +142,8 @@ void __delayacct_blkio_end(struct task_struct *p)
&p->delays->blkio_delay,
&p->delays->blkio_count,
&p->delays->blkio_delay_max,
- &p->delays->blkio_delay_min);
+ &p->delays->blkio_delay_min,
+ &p->delays->blkio_delay_max_ts);
}
int delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
@@ -170,6 +176,8 @@ int delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
d->cpu_delay_max = tsk->sched_info.max_run_delay;
d->cpu_delay_min = tsk->sched_info.min_run_delay;
+ d->cpu_delay_max_ts.tv_sec = tsk->sched_info.max_run_delay_ts.tv_sec;
+ d->cpu_delay_max_ts.tv_nsec = tsk->sched_info.max_run_delay_ts.tv_nsec;
tmp = (s64)d->cpu_delay_total + t2;
d->cpu_delay_total = (tmp < (s64)d->cpu_delay_total) ? 0 : tmp;
tmp = (s64)d->cpu_run_virtual_total + t3;
@@ -217,7 +225,8 @@ void __delayacct_freepages_end(void)
&current->delays->freepages_delay,
&current->delays->freepages_count,
&current->delays->freepages_delay_max,
- &current->delays->freepages_delay_min);
+ &current->delays->freepages_delay_min,
+ &current->delays->freepages_delay_max_ts);
}
void __delayacct_thrashing_start(bool *in_thrashing)
@@ -241,7 +250,8 @@ void __delayacct_thrashing_end(bool *in_thrashing)
&current->delays->thrashing_delay,
&current->delays->thrashing_count,
&current->delays->thrashing_delay_max,
- &current->delays->thrashing_delay_min);
+ &current->delays->thrashing_delay_min,
+ &current->delays->thrashing_delay_max_ts);
}
void __delayacct_swapin_start(void)
@@ -256,7 +266,8 @@ void __delayacct_swapin_end(void)
&current->delays->swapin_delay,
&current->delays->swapin_count,
&current->delays->swapin_delay_max,
- &current->delays->swapin_delay_min);
+ &current->delays->swapin_delay_min,
+ &current->delays->swapin_delay_max_ts);
}
void __delayacct_compact_start(void)
@@ -271,7 +282,8 @@ void __delayacct_compact_end(void)
&current->delays->compact_delay,
&current->delays->compact_count,
&current->delays->compact_delay_max,
- &current->delays->compact_delay_min);
+ &current->delays->compact_delay_min,
+ &current->delays->compact_delay_max_ts);
}
void __delayacct_wpcopy_start(void)
@@ -286,7 +298,8 @@ void __delayacct_wpcopy_end(void)
&current->delays->wpcopy_delay,
&current->delays->wpcopy_count,
&current->delays->wpcopy_delay_max,
- &current->delays->wpcopy_delay_min);
+ &current->delays->wpcopy_delay_min,
+ &current->delays->wpcopy_delay_max_ts);
}
void __delayacct_irq(struct task_struct *task, u32 delta)
@@ -296,8 +309,10 @@ void __delayacct_irq(struct task_struct *task, u32 delta)
raw_spin_lock_irqsave(&task->delays->lock, flags);
task->delays->irq_delay += delta;
task->delays->irq_count++;
- if (delta > task->delays->irq_delay_max)
+ if (delta > task->delays->irq_delay_max) {
task->delays->irq_delay_max = delta;
+ ktime_get_real_ts64(&task->delays->irq_delay_max_ts);
+ }
if (delta && (!task->delays->irq_delay_min || delta < task->delays->irq_delay_min))
task->delays->irq_delay_min = delta;
raw_spin_unlock_irqrestore(&task->delays->lock, flags);
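The delayacct changes above record, next to each per-type maximum delay, the wall-clock time at which that maximum was observed. A minimal sketch of the bookkeeping, using illustrative field names rather than the exact taskstats layout (the real code does this under delays->lock):

#include <linux/ktime.h>
#include <linux/types.h>

struct delay_stat {
	u64 total;
	u32 count;
	u64 max;
	u64 min;
	struct timespec64 max_ts;	/* when the current maximum was observed */
};

static void delay_stat_add(struct delay_stat *st, u64 ns)
{
	st->total += ns;
	st->count++;
	if (ns > st->max) {
		st->max = ns;
		ktime_get_real_ts64(&st->max_ts);	/* stamp the new maximum */
	}
	if (!st->min || ns < st->min)
		st->min = ns;
}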
diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig
index 31cfdb6b4bc3..159900736f25 100644
--- a/kernel/dma/Kconfig
+++ b/kernel/dma/Kconfig
@@ -47,12 +47,6 @@ config ARCH_HAS_DMA_SET_MASK
config ARCH_HAS_DMA_WRITE_COMBINE
bool
-#
-# Select if the architectures provides the arch_dma_mark_clean hook
-#
-config ARCH_HAS_DMA_MARK_CLEAN
- bool
-
config DMA_DECLARE_COHERENT
bool
diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c
index 138ede653de4..43d6a996d7a7 100644
--- a/kernel/dma/debug.c
+++ b/kernel/dma/debug.c
@@ -63,6 +63,7 @@ enum map_err_types {
* @sg_mapped_ents: 'mapped_ents' from dma_map_sg
* @paddr: physical start address of the mapping
* @map_err_type: track whether dma_mapping_error() was checked
+ * @is_cache_clean: driver promises not to write to buffer while mapped
* @stack_len: number of backtrace entries in @stack_entries
* @stack_entries: stack of backtrace history
*/
@@ -76,7 +77,8 @@ struct dma_debug_entry {
int sg_call_ents;
int sg_mapped_ents;
phys_addr_t paddr;
- enum map_err_types map_err_type;
+ enum map_err_types map_err_type;
+ bool is_cache_clean;
#ifdef CONFIG_STACKTRACE
unsigned int stack_len;
unsigned long stack_entries[DMA_DEBUG_STACKTRACE_ENTRIES];
@@ -472,12 +474,15 @@ static int active_cacheline_dec_overlap(phys_addr_t cln)
return active_cacheline_set_overlap(cln, --overlap);
}
-static int active_cacheline_insert(struct dma_debug_entry *entry)
+static int active_cacheline_insert(struct dma_debug_entry *entry,
+ bool *overlap_cache_clean)
{
phys_addr_t cln = to_cacheline_number(entry);
unsigned long flags;
int rc;
+ *overlap_cache_clean = false;
+
/* If the device is not writing memory then we don't have any
* concerns about the cpu consuming stale data. This mitigates
* legitimate usages of overlapping mappings.
@@ -487,8 +492,16 @@ static int active_cacheline_insert(struct dma_debug_entry *entry)
spin_lock_irqsave(&radix_lock, flags);
rc = radix_tree_insert(&dma_active_cacheline, cln, entry);
- if (rc == -EEXIST)
+ if (rc == -EEXIST) {
+ struct dma_debug_entry *existing;
+
active_cacheline_inc_overlap(cln);
+ existing = radix_tree_lookup(&dma_active_cacheline, cln);
+ /* A lookup failure here after we got -EEXIST is unexpected. */
+ WARN_ON(!existing);
+ if (existing)
+ *overlap_cache_clean = existing->is_cache_clean;
+ }
spin_unlock_irqrestore(&radix_lock, flags);
return rc;
@@ -583,19 +596,24 @@ DEFINE_SHOW_ATTRIBUTE(dump);
*/
static void add_dma_entry(struct dma_debug_entry *entry, unsigned long attrs)
{
+ bool overlap_cache_clean;
struct hash_bucket *bucket;
unsigned long flags;
int rc;
+ entry->is_cache_clean = !!(attrs & DMA_ATTR_CPU_CACHE_CLEAN);
+
bucket = get_hash_bucket(entry, &flags);
hash_bucket_add(bucket, entry);
put_hash_bucket(bucket, flags);
- rc = active_cacheline_insert(entry);
+ rc = active_cacheline_insert(entry, &overlap_cache_clean);
if (rc == -ENOMEM) {
pr_err_once("cacheline tracking ENOMEM, dma-debug disabled\n");
global_disable = true;
- } else if (rc == -EEXIST && !(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
+ } else if (rc == -EEXIST &&
+ !(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
+ !(entry->is_cache_clean && overlap_cache_clean) &&
!(IS_ENABLED(CONFIG_DMA_BOUNCE_UNALIGNED_KMALLOC) &&
is_swiotlb_active(entry->dev))) {
err_printk(entry->dev, entry,
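The dma-debug change above suppresses the cacheline-overlap warning only when both overlapping mappings were created with DMA_ATTR_CPU_CACHE_CLEAN, an attribute introduced elsewhere in this series by which the driver promises not to write to the buffer while it is mapped. A hedged usage sketch (dev, buf, and len are placeholders):

#include <linux/dma-mapping.h>

static dma_addr_t map_receive_buffer(struct device *dev, void *buf, size_t len)
{
	/* CPU promises not to dirty the buffer while mapped, so overlapping
	 * cachelines from two such mappings no longer trigger the warning. */
	return dma_map_single_attrs(dev, buf, len, DMA_FROM_DEVICE,
				    DMA_ATTR_CPU_CACHE_CLEAN);
}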
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 50c3fe2a1d55..c9fa983990cd 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -425,9 +425,6 @@ void dma_direct_sync_sg_for_cpu(struct device *dev,
arch_sync_dma_for_cpu(paddr, sg->length, dir);
swiotlb_sync_single_for_cpu(dev, paddr, sg->length, dir);
-
- if (dir == DMA_FROM_DEVICE)
- arch_dma_mark_clean(paddr, sg->length);
}
if (!dev_is_dma_coherent(dev))
diff --git a/kernel/dma/direct.h b/kernel/dma/direct.h
index da2fadf45bcd..f476c63b668c 100644
--- a/kernel/dma/direct.h
+++ b/kernel/dma/direct.h
@@ -75,9 +75,6 @@ static inline void dma_direct_sync_single_for_cpu(struct device *dev,
}
swiotlb_sync_single_for_cpu(dev, paddr, size, dir);
-
- if (dir == DMA_FROM_DEVICE)
- arch_dma_mark_clean(paddr, size);
}
static inline dma_addr_t dma_direct_map_phys(struct device *dev,
diff --git a/kernel/fork.c b/kernel/fork.c
index 9c5effbdbdc1..e832da9d15a4 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1357,7 +1357,7 @@ struct file *get_task_exe_file(struct task_struct *task)
* @task: The task.
*
* Returns %NULL if the task has no mm. Checks PF_KTHREAD (meaning
- * this kernel workthread has transiently adopted a user mm with use_mm,
+ * this kernel workthread has transiently adopted a user mm with kthread_use_mm,
* to do its AIO) is not set and if so returns a reference to it, after
* bumping up the use count. User must release the mm via mmput()
* after use. Typically used by /proc and ptrace.
@@ -2069,7 +2069,7 @@ __latent_entropy struct task_struct *copy_process(
p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? args->child_tid : NULL;
/*
- * Clear TID on mm_release()?
+ * TID is cleared in mm_release() when the task exits
*/
p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? args->child_tid : NULL;
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 4c01e9d5ccc7..c2258b133939 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -1913,6 +1913,7 @@ void irq_domain_free_irqs(unsigned int virq, unsigned int nr_irqs)
irq_domain_free_irq_data(virq, nr_irqs);
irq_free_descs(virq, nr_irqs);
}
+EXPORT_SYMBOL_GPL(irq_domain_free_irqs);
static void irq_domain_free_one_irq(struct irq_domain *domain, unsigned int virq)
{
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 049e296f586c..aec2f06858af 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -151,8 +151,10 @@ static unsigned int get_symbol_offset(unsigned long pos)
unsigned long kallsyms_sym_address(int idx)
{
- /* values are unsigned offsets */
- return kallsyms_relative_base + (u32)kallsyms_offsets[idx];
+ /* non-relocatable 32-bit kernels just embed the value directly */
+ if (!IS_ENABLED(CONFIG_64BIT) && !IS_ENABLED(CONFIG_RELOCATABLE))
+ return (u32)kallsyms_offsets[idx];
+ return (unsigned long)offset_to_ptr(kallsyms_offsets + idx);
}
static unsigned int get_symbol_seq(int index)
@@ -345,7 +347,7 @@ int kallsyms_lookup_size_offset(unsigned long addr, unsigned long *symbolsize,
return 1;
}
return !!module_address_lookup(addr, symbolsize, offset, NULL, NULL, namebuf) ||
- !!__bpf_address_lookup(addr, symbolsize, offset, namebuf);
+ !!bpf_address_lookup(addr, symbolsize, offset, namebuf);
}
static int kallsyms_lookup_buildid(unsigned long addr,
@@ -355,8 +357,21 @@ static int kallsyms_lookup_buildid(unsigned long addr,
{
int ret;
- namebuf[KSYM_NAME_LEN - 1] = 0;
+ /*
+	 * kallsyms_lookup() returns a pointer to namebuf on success and
+	 * NULL on error. But some callers ignore the return value and
+	 * instead expect @namebuf to be filled with either a valid or
+	 * an empty string.
+ */
namebuf[0] = 0;
+ /*
+ * Initialize the module-related return values. They are not set
+ * when the symbol is in vmlinux or it is a bpf address.
+ */
+ if (modname)
+ *modname = NULL;
+ if (modbuildid)
+ *modbuildid = NULL;
if (is_ksym_addr(addr)) {
unsigned long pos;
@@ -365,10 +380,6 @@ static int kallsyms_lookup_buildid(unsigned long addr,
/* Grab name */
kallsyms_expand_symbol(get_symbol_offset(pos),
namebuf, KSYM_NAME_LEN);
- if (modname)
- *modname = NULL;
- if (modbuildid)
- *modbuildid = NULL;
return strlen(namebuf);
}
@@ -377,12 +388,11 @@ static int kallsyms_lookup_buildid(unsigned long addr,
ret = module_address_lookup(addr, symbolsize, offset,
modname, modbuildid, namebuf);
if (!ret)
- ret = bpf_address_lookup(addr, symbolsize,
- offset, modname, namebuf);
+ ret = bpf_address_lookup(addr, symbolsize, offset, namebuf);
if (!ret)
- ret = ftrace_mod_address_lookup(addr, symbolsize,
- offset, modname, namebuf);
+ ret = ftrace_mod_address_lookup(addr, symbolsize, offset,
+ modname, modbuildid, namebuf);
return ret;
}
@@ -426,6 +436,37 @@ int lookup_symbol_name(unsigned long addr, char *symname)
return lookup_module_symbol_name(addr, symname);
}
+#ifdef CONFIG_STACKTRACE_BUILD_ID
+
+static int append_buildid(char *buffer, const char *modname,
+ const unsigned char *buildid)
+{
+ if (!modname)
+ return 0;
+
+ if (!buildid) {
+ pr_warn_once("Undefined buildid for the module %s\n", modname);
+ return 0;
+ }
+
+ /* build ID should match length of sprintf */
+#ifdef CONFIG_MODULES
+ static_assert(sizeof(typeof_member(struct module, build_id)) == 20);
+#endif
+
+ return sprintf(buffer, " %20phN", buildid);
+}
+
+#else /* CONFIG_STACKTRACE_BUILD_ID */
+
+static int append_buildid(char *buffer, const char *modname,
+ const unsigned char *buildid)
+{
+ return 0;
+}
+
+#endif /* CONFIG_STACKTRACE_BUILD_ID */
+
/* Look up a kernel symbol and return it in a text buffer. */
static int __sprint_symbol(char *buffer, unsigned long address,
int symbol_offset, int add_offset, int add_buildid)
@@ -435,6 +476,9 @@ static int __sprint_symbol(char *buffer, unsigned long address,
unsigned long offset, size;
int len;
+ /* Prevent module removal until modname and modbuildid are printed */
+ guard(rcu)();
+
address += symbol_offset;
len = kallsyms_lookup_buildid(address, &size, &offset, &modname, &buildid,
buffer);
@@ -448,15 +492,8 @@ static int __sprint_symbol(char *buffer, unsigned long address,
if (modname) {
len += sprintf(buffer + len, " [%s", modname);
-#if IS_ENABLED(CONFIG_STACKTRACE_BUILD_ID)
- if (add_buildid && buildid) {
- /* build ID should match length of sprintf */
-#if IS_ENABLED(CONFIG_MODULES)
- static_assert(sizeof(typeof_member(struct module, build_id)) == 20);
-#endif
- len += sprintf(buffer + len, " %20phN", buildid);
- }
-#endif
+ if (add_buildid)
+ len += append_buildid(buffer + len, modname, buildid);
len += sprintf(buffer + len, "]");
}
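With the append_buildid() helper above, __sprint_symbol() still produces the usual "name+offset/size [module <buildid>]" form, where the build ID suffix only appears when CONFIG_STACKTRACE_BUILD_ID is set. A hedged usage sketch of the public wrapper:

#include <linux/kallsyms.h>
#include <linux/printk.h>

static void show_caller(unsigned long addr)
{
	char buf[KSYM_SYMBOL_LEN];

	/* Resolves to "name+offset/size [module <buildid>]" when applicable. */
	sprint_symbol(buf, addr);
	pr_info("caller: %s\n", buf);
}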
diff --git a/kernel/kallsyms_internal.h b/kernel/kallsyms_internal.h
index 9633782f8250..81a867dbe57d 100644
--- a/kernel/kallsyms_internal.h
+++ b/kernel/kallsyms_internal.h
@@ -8,7 +8,6 @@ extern const int kallsyms_offsets[];
extern const u8 kallsyms_names[];
extern const unsigned int kallsyms_num_syms;
-extern const unsigned long kallsyms_relative_base;
extern const char kallsyms_token_table[];
extern const u16 kallsyms_token_index[];
diff --git a/kernel/kcsan/kcsan_test.c b/kernel/kcsan/kcsan_test.c
index 219d22857c98..8ef8167be745 100644
--- a/kernel/kcsan/kcsan_test.c
+++ b/kernel/kcsan/kcsan_test.c
@@ -176,7 +176,7 @@ static bool __report_matches(const struct expect_report *r)
/* Title */
cur = expect[0];
- end = &expect[0][sizeof(expect[0]) - 1];
+ end = ARRAY_END(expect[0]);
cur += scnprintf(cur, end - cur, "BUG: KCSAN: %s in ",
is_assert ? "assert: race" : "data-race");
if (r->access[1].fn) {
@@ -200,7 +200,7 @@ static bool __report_matches(const struct expect_report *r)
/* Access 1 */
cur = expect[1];
- end = &expect[1][sizeof(expect[1]) - 1];
+ end = ARRAY_END(expect[1]);
if (!r->access[1].fn)
cur += scnprintf(cur, end - cur, "race at unknown origin, with ");
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index eb62a9794242..2bfbb2d144e6 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -883,6 +883,60 @@ out_free_sha_regions:
#ifdef CONFIG_ARCH_SUPPORTS_KEXEC_PURGATORY
/*
+ * kexec_purgatory_find_symbol - find a symbol in the purgatory
+ * @pi: Purgatory to search in.
+ * @name: Name of the symbol.
+ *
+ * Return: pointer to symbol in read-only symtab on success, NULL on error.
+ */
+static const Elf_Sym *kexec_purgatory_find_symbol(struct purgatory_info *pi,
+ const char *name)
+{
+ const Elf_Shdr *sechdrs;
+ const Elf_Ehdr *ehdr;
+ const Elf_Sym *syms;
+ const char *strtab;
+ int i, k;
+
+ if (!pi->ehdr)
+ return NULL;
+
+ ehdr = pi->ehdr;
+ sechdrs = (void *)ehdr + ehdr->e_shoff;
+
+ for (i = 0; i < ehdr->e_shnum; i++) {
+ if (sechdrs[i].sh_type != SHT_SYMTAB)
+ continue;
+
+ if (sechdrs[i].sh_link >= ehdr->e_shnum)
+ /* Invalid strtab section number */
+ continue;
+ strtab = (void *)ehdr + sechdrs[sechdrs[i].sh_link].sh_offset;
+ syms = (void *)ehdr + sechdrs[i].sh_offset;
+
+ /* Go through symbols for a match */
+ for (k = 0; k < sechdrs[i].sh_size/sizeof(Elf_Sym); k++) {
+ if (ELF_ST_BIND(syms[k].st_info) != STB_GLOBAL)
+ continue;
+
+ if (strcmp(strtab + syms[k].st_name, name) != 0)
+ continue;
+
+ if (syms[k].st_shndx == SHN_UNDEF ||
+ syms[k].st_shndx >= ehdr->e_shnum) {
+ pr_debug("Symbol: %s has bad section index %d.\n",
+ name, syms[k].st_shndx);
+ return NULL;
+ }
+
+ /* Found the symbol we are looking for */
+ return &syms[k];
+ }
+ }
+
+ return NULL;
+}
+/*
* kexec_purgatory_setup_kbuf - prepare buffer to load purgatory.
* @pi: Purgatory to be loaded.
* @kbuf: Buffer to setup.
@@ -960,6 +1014,10 @@ static int kexec_purgatory_setup_sechdrs(struct purgatory_info *pi,
unsigned long offset;
size_t sechdrs_size;
Elf_Shdr *sechdrs;
+ const Elf_Sym *entry_sym;
+ u16 entry_shndx = 0;
+ unsigned long entry_off = 0;
+ bool start_fixed = false;
int i;
/*
@@ -977,6 +1035,12 @@ static int kexec_purgatory_setup_sechdrs(struct purgatory_info *pi,
bss_addr = kbuf->mem + kbuf->bufsz;
kbuf->image->start = pi->ehdr->e_entry;
+ entry_sym = kexec_purgatory_find_symbol(pi, "purgatory_start");
+ if (entry_sym) {
+ entry_shndx = entry_sym->st_shndx;
+ entry_off = entry_sym->st_value;
+ }
+
for (i = 0; i < pi->ehdr->e_shnum; i++) {
unsigned long align;
void *src, *dst;
@@ -994,6 +1058,13 @@ static int kexec_purgatory_setup_sechdrs(struct purgatory_info *pi,
offset = ALIGN(offset, align);
+ if (!start_fixed && entry_sym && i == entry_shndx &&
+ (sechdrs[i].sh_flags & SHF_EXECINSTR) &&
+ entry_off < sechdrs[i].sh_size) {
+ kbuf->image->start = kbuf->mem + offset + entry_off;
+ start_fixed = true;
+ }
+
/*
* Check if the segment contains the entry point, if so,
* calculate the value of image->start based on it.
@@ -1004,13 +1075,14 @@ static int kexec_purgatory_setup_sechdrs(struct purgatory_info *pi,
* is not set to the initial value, and warn the user so they
* have a chance to fix their purgatory's linker script.
*/
- if (sechdrs[i].sh_flags & SHF_EXECINSTR &&
+ if (!start_fixed && sechdrs[i].sh_flags & SHF_EXECINSTR &&
pi->ehdr->e_entry >= sechdrs[i].sh_addr &&
pi->ehdr->e_entry < (sechdrs[i].sh_addr
+ sechdrs[i].sh_size) &&
- !WARN_ON(kbuf->image->start != pi->ehdr->e_entry)) {
+ kbuf->image->start == pi->ehdr->e_entry) {
kbuf->image->start -= sechdrs[i].sh_addr;
kbuf->image->start += kbuf->mem + offset;
+ start_fixed = true;
}
src = (void *)pi->ehdr + sechdrs[i].sh_offset;
@@ -1128,61 +1200,6 @@ out_free_kbuf:
return ret;
}
-/*
- * kexec_purgatory_find_symbol - find a symbol in the purgatory
- * @pi: Purgatory to search in.
- * @name: Name of the symbol.
- *
- * Return: pointer to symbol in read-only symtab on success, NULL on error.
- */
-static const Elf_Sym *kexec_purgatory_find_symbol(struct purgatory_info *pi,
- const char *name)
-{
- const Elf_Shdr *sechdrs;
- const Elf_Ehdr *ehdr;
- const Elf_Sym *syms;
- const char *strtab;
- int i, k;
-
- if (!pi->ehdr)
- return NULL;
-
- ehdr = pi->ehdr;
- sechdrs = (void *)ehdr + ehdr->e_shoff;
-
- for (i = 0; i < ehdr->e_shnum; i++) {
- if (sechdrs[i].sh_type != SHT_SYMTAB)
- continue;
-
- if (sechdrs[i].sh_link >= ehdr->e_shnum)
- /* Invalid strtab section number */
- continue;
- strtab = (void *)ehdr + sechdrs[sechdrs[i].sh_link].sh_offset;
- syms = (void *)ehdr + sechdrs[i].sh_offset;
-
- /* Go through symbols for a match */
- for (k = 0; k < sechdrs[i].sh_size/sizeof(Elf_Sym); k++) {
- if (ELF_ST_BIND(syms[k].st_info) != STB_GLOBAL)
- continue;
-
- if (strcmp(strtab + syms[k].st_name, name) != 0)
- continue;
-
- if (syms[k].st_shndx == SHN_UNDEF ||
- syms[k].st_shndx >= ehdr->e_shnum) {
- pr_debug("Symbol: %s has bad section index %d.\n",
- name, syms[k].st_shndx);
- return NULL;
- }
-
- /* Found the symbol we are looking for */
- return &syms[k];
- }
- }
-
- return NULL;
-}
-
void *kexec_purgatory_get_symbol_addr(struct kimage *image, const char *name)
{
struct purgatory_info *pi = &image->purgatory_info;
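The reordering above lets kexec_purgatory_setup_sechdrs() resolve the entry point from the purgatory_start symbol first and only fall back to e_entry when that symbol is absent. The address arithmetic it relies on is simply (a sketch, not the kexec code itself):

/*
 * Once the section holding "purgatory_start" has been assigned its load
 * offset within kbuf->mem, the runtime entry point is the loaded section
 * base plus the symbol's section-relative value (st_value).
 */
static unsigned long purgatory_entry(unsigned long kbuf_mem,
				     unsigned long section_offset,
				     unsigned long st_value)
{
	return kbuf_mem + section_offset + st_value;
}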
diff --git a/kernel/liveupdate/Kconfig b/kernel/liveupdate/Kconfig
index d2aeaf13c3ac..1a8513f16ef7 100644
--- a/kernel/liveupdate/Kconfig
+++ b/kernel/liveupdate/Kconfig
@@ -54,7 +54,6 @@ config KEXEC_HANDOVER_ENABLE_DEFAULT
config LIVEUPDATE
bool "Live Update Orchestrator"
depends on KEXEC_HANDOVER
- depends on SHMEM
help
Enable the Live Update Orchestrator. Live Update is a mechanism,
typically based on kexec, that allows the kernel to be updated
@@ -73,4 +72,20 @@ config LIVEUPDATE
If unsure, say N.
+config LIVEUPDATE_MEMFD
+ bool "Live update support for memfd"
+ depends on LIVEUPDATE
+ depends on MEMFD_CREATE
+ depends on SHMEM
+ default LIVEUPDATE
+ help
+ Enable live update support for memfd regions. This allows preserving
+ memfd-backed memory across kernel live updates.
+
+ This can be used to back VM memory with memfds, allowing the guest
+ memory to persist, or for other user workloads needing to preserve
+ pages.
+
+ If unsure, say N.
+
endmenu
diff --git a/kernel/liveupdate/Makefile b/kernel/liveupdate/Makefile
index 7cad2eece32d..d2f779cbe279 100644
--- a/kernel/liveupdate/Makefile
+++ b/kernel/liveupdate/Makefile
@@ -3,6 +3,7 @@
luo-y := \
luo_core.o \
luo_file.o \
+ luo_flb.o \
luo_session.o
obj-$(CONFIG_KEXEC_HANDOVER) += kexec_handover.o
diff --git a/kernel/liveupdate/kexec_handover.c b/kernel/liveupdate/kexec_handover.c
index 90d411a59f76..fb3a7b67676e 100644
--- a/kernel/liveupdate/kexec_handover.c
+++ b/kernel/liveupdate/kexec_handover.c
@@ -15,6 +15,7 @@
#include <linux/count_zeros.h>
#include <linux/kexec.h>
#include <linux/kexec_handover.h>
+#include <linux/kho/abi/kexec_handover.h>
#include <linux/libfdt.h>
#include <linux/list.h>
#include <linux/memblock.h>
@@ -24,7 +25,6 @@
#include <asm/early_ioremap.h>
-#include "kexec_handover_internal.h"
/*
* KHO is tightly coupled with mm init and needs access to some of mm
* internal APIs.
@@ -33,10 +33,7 @@
#include "../kexec_internal.h"
#include "kexec_handover_internal.h"
-#define KHO_FDT_COMPATIBLE "kho-v1"
-#define PROP_PRESERVED_MEMORY_MAP "preserved-memory-map"
-#define PROP_SUB_FDT "fdt"
-
+/* The magic token for preserved pages */
#define KHO_PAGE_MAGIC 0x4b484f50U /* ASCII for 'KHOP' */
/*
@@ -219,10 +216,32 @@ static int __kho_preserve_order(struct kho_mem_track *track, unsigned long pfn,
return 0;
}
+/* For physically contiguous order-0 pages. */
+static void kho_init_pages(struct page *page, unsigned long nr_pages)
+{
+ for (unsigned long i = 0; i < nr_pages; i++)
+ set_page_count(page + i, 1);
+}
+
+static void kho_init_folio(struct page *page, unsigned int order)
+{
+ unsigned long nr_pages = (1 << order);
+
+ /* Head page gets refcount of 1. */
+ set_page_count(page, 1);
+
+ /* For higher order folios, tail pages get a page count of zero. */
+ for (unsigned long i = 1; i < nr_pages; i++)
+ set_page_count(page + i, 0);
+
+ if (order > 0)
+ prep_compound_page(page, order);
+}
+
static struct page *kho_restore_page(phys_addr_t phys, bool is_folio)
{
struct page *page = pfn_to_online_page(PHYS_PFN(phys));
- unsigned int nr_pages, ref_cnt;
+ unsigned long nr_pages;
union kho_page_info info;
if (!page)
@@ -240,20 +259,11 @@ static struct page *kho_restore_page(phys_addr_t phys, bool is_folio)
/* Clear private to make sure later restores on this page error out. */
page->private = 0;
- /* Head page gets refcount of 1. */
- set_page_count(page, 1);
- /*
- * For higher order folios, tail pages get a page count of zero.
- * For physically contiguous order-0 pages every pages gets a page
- * count of 1
- */
- ref_cnt = is_folio ? 0 : 1;
- for (unsigned int i = 1; i < nr_pages; i++)
- set_page_count(page + i, ref_cnt);
-
- if (is_folio && info.order)
- prep_compound_page(page, info.order);
+ if (is_folio)
+ kho_init_folio(page, info.order);
+ else
+ kho_init_pages(page, nr_pages);
/* Always mark headpage's codetag as empty to avoid accounting mismatch */
clear_page_tag_ref(page);
@@ -289,9 +299,9 @@ EXPORT_SYMBOL_GPL(kho_restore_folio);
* Restore a contiguous list of order 0 pages that was preserved with
* kho_preserve_pages().
*
- * Return: 0 on success, error code on failure
+ * Return: the first page on success, NULL on failure.
*/
-struct page *kho_restore_pages(phys_addr_t phys, unsigned int nr_pages)
+struct page *kho_restore_pages(phys_addr_t phys, unsigned long nr_pages)
{
const unsigned long start_pfn = PHYS_PFN(phys);
const unsigned long end_pfn = start_pfn + nr_pages;
@@ -386,7 +396,7 @@ static void kho_update_memory_map(struct khoser_mem_chunk *first_chunk)
void *ptr;
u64 phys;
- ptr = fdt_getprop_w(kho_out.fdt, 0, PROP_PRESERVED_MEMORY_MAP, NULL);
+ ptr = fdt_getprop_w(kho_out.fdt, 0, KHO_FDT_MEMORY_MAP_PROP_NAME, NULL);
/* Check and discard previous memory map */
phys = get_unaligned((u64 *)ptr);
@@ -474,7 +484,7 @@ static phys_addr_t __init kho_get_mem_map_phys(const void *fdt)
const void *mem_ptr;
int len;
- mem_ptr = fdt_getprop(fdt, 0, PROP_PRESERVED_MEMORY_MAP, &len);
+ mem_ptr = fdt_getprop(fdt, 0, KHO_FDT_MEMORY_MAP_PROP_NAME, &len);
if (!mem_ptr || len != sizeof(u64)) {
pr_err("failed to get preserved memory bitmaps\n");
return 0;
@@ -645,11 +655,13 @@ static void __init kho_reserve_scratch(void)
scratch_size_update();
/* FIXME: deal with node hot-plug/remove */
- kho_scratch_cnt = num_online_nodes() + 2;
+ kho_scratch_cnt = nodes_weight(node_states[N_MEMORY]) + 2;
size = kho_scratch_cnt * sizeof(*kho_scratch);
kho_scratch = memblock_alloc(size, PAGE_SIZE);
- if (!kho_scratch)
+ if (!kho_scratch) {
+ pr_err("Failed to reserve scratch array\n");
goto err_disable_kho;
+ }
/*
* reserve scratch area in low memory for lowmem allocations in the
@@ -658,8 +670,10 @@ static void __init kho_reserve_scratch(void)
size = scratch_size_lowmem;
addr = memblock_phys_alloc_range(size, CMA_MIN_ALIGNMENT_BYTES, 0,
ARCH_LOW_ADDRESS_LIMIT);
- if (!addr)
+ if (!addr) {
+ pr_err("Failed to reserve lowmem scratch buffer\n");
goto err_free_scratch_desc;
+ }
kho_scratch[i].addr = addr;
kho_scratch[i].size = size;
@@ -668,20 +682,28 @@ static void __init kho_reserve_scratch(void)
/* reserve large contiguous area for allocations without nid */
size = scratch_size_global;
addr = memblock_phys_alloc(size, CMA_MIN_ALIGNMENT_BYTES);
- if (!addr)
+ if (!addr) {
+ pr_err("Failed to reserve global scratch buffer\n");
goto err_free_scratch_areas;
+ }
kho_scratch[i].addr = addr;
kho_scratch[i].size = size;
i++;
- for_each_online_node(nid) {
+ /*
+	 * Loop over nodes that are online and have memory. Skip memoryless
+	 * nodes, as we cannot allocate scratch areas there.
+ */
+ for_each_node_state(nid, N_MEMORY) {
size = scratch_size_node(nid);
addr = memblock_alloc_range_nid(size, CMA_MIN_ALIGNMENT_BYTES,
0, MEMBLOCK_ALLOC_ACCESSIBLE,
nid, true);
- if (!addr)
+ if (!addr) {
+ pr_err("Failed to reserve nid %d scratch buffer\n", nid);
goto err_free_scratch_areas;
+ }
kho_scratch[i].addr = addr;
kho_scratch[i].size = size;
@@ -735,7 +757,8 @@ int kho_add_subtree(const char *name, void *fdt)
goto out_pack;
}
- err = fdt_setprop(root_fdt, off, PROP_SUB_FDT, &phys, sizeof(phys));
+ err = fdt_setprop(root_fdt, off, KHO_FDT_SUB_TREE_PROP_NAME,
+ &phys, sizeof(phys));
if (err < 0)
goto out_pack;
@@ -766,7 +789,7 @@ void kho_remove_subtree(void *fdt)
const u64 *val;
int len;
- val = fdt_getprop(root_fdt, off, PROP_SUB_FDT, &len);
+ val = fdt_getprop(root_fdt, off, KHO_FDT_SUB_TREE_PROP_NAME, &len);
if (!val || len != sizeof(phys_addr_t))
continue;
@@ -831,7 +854,7 @@ EXPORT_SYMBOL_GPL(kho_unpreserve_folio);
*
* Return: 0 on success, error code on failure
*/
-int kho_preserve_pages(struct page *page, unsigned int nr_pages)
+int kho_preserve_pages(struct page *page, unsigned long nr_pages)
{
struct kho_mem_track *track = &kho_out.track;
const unsigned long start_pfn = page_to_pfn(page);
@@ -875,7 +898,7 @@ EXPORT_SYMBOL_GPL(kho_preserve_pages);
* kho_preserve_pages() call. Unpreserving arbitrary sub-ranges of larger
* preserved blocks is not supported.
*/
-void kho_unpreserve_pages(struct page *page, unsigned int nr_pages)
+void kho_unpreserve_pages(struct page *page, unsigned long nr_pages)
{
struct kho_mem_track *track = &kho_out.track;
const unsigned long start_pfn = page_to_pfn(page);
@@ -885,21 +908,6 @@ void kho_unpreserve_pages(struct page *page, unsigned int nr_pages)
}
EXPORT_SYMBOL_GPL(kho_unpreserve_pages);
-struct kho_vmalloc_hdr {
- DECLARE_KHOSER_PTR(next, struct kho_vmalloc_chunk *);
-};
-
-#define KHO_VMALLOC_SIZE \
- ((PAGE_SIZE - sizeof(struct kho_vmalloc_hdr)) / \
- sizeof(phys_addr_t))
-
-struct kho_vmalloc_chunk {
- struct kho_vmalloc_hdr hdr;
- phys_addr_t phys[KHO_VMALLOC_SIZE];
-};
-
-static_assert(sizeof(struct kho_vmalloc_chunk) == PAGE_SIZE);
-
/* vmalloc flags KHO supports */
#define KHO_VMALLOC_SUPPORTED_FLAGS (VM_ALLOC | VM_ALLOW_HUGE_VMAP)
@@ -1315,7 +1323,7 @@ int kho_retrieve_subtree(const char *name, phys_addr_t *phys)
if (offset < 0)
return -ENOENT;
- val = fdt_getprop(fdt, offset, PROP_SUB_FDT, &len);
+ val = fdt_getprop(fdt, offset, KHO_FDT_SUB_TREE_PROP_NAME, &len);
if (!val || len != sizeof(*val))
return -EINVAL;
@@ -1335,7 +1343,7 @@ static __init int kho_out_fdt_setup(void)
err |= fdt_finish_reservemap(root);
err |= fdt_begin_node(root, "");
err |= fdt_property_string(root, "compatible", KHO_FDT_COMPATIBLE);
- err |= fdt_property(root, PROP_PRESERVED_MEMORY_MAP, &empty_mem_map,
+ err |= fdt_property(root, KHO_FDT_MEMORY_MAP_PROP_NAME, &empty_mem_map,
sizeof(empty_mem_map));
err |= fdt_end_node(root);
err |= fdt_finish(root);
@@ -1451,46 +1459,40 @@ void __init kho_memory_init(void)
void __init kho_populate(phys_addr_t fdt_phys, u64 fdt_len,
phys_addr_t scratch_phys, u64 scratch_len)
{
+ unsigned int scratch_cnt = scratch_len / sizeof(*kho_scratch);
struct kho_scratch *scratch = NULL;
phys_addr_t mem_map_phys;
void *fdt = NULL;
- int err = 0;
- unsigned int scratch_cnt = scratch_len / sizeof(*kho_scratch);
+ int err;
/* Validate the input FDT */
fdt = early_memremap(fdt_phys, fdt_len);
if (!fdt) {
pr_warn("setup: failed to memremap FDT (0x%llx)\n", fdt_phys);
- err = -EFAULT;
- goto out;
+ goto err_report;
}
err = fdt_check_header(fdt);
if (err) {
pr_warn("setup: handover FDT (0x%llx) is invalid: %d\n",
fdt_phys, err);
- err = -EINVAL;
- goto out;
+ goto err_unmap_fdt;
}
err = fdt_node_check_compatible(fdt, 0, KHO_FDT_COMPATIBLE);
if (err) {
pr_warn("setup: handover FDT (0x%llx) is incompatible with '%s': %d\n",
fdt_phys, KHO_FDT_COMPATIBLE, err);
- err = -EINVAL;
- goto out;
+ goto err_unmap_fdt;
}
mem_map_phys = kho_get_mem_map_phys(fdt);
- if (!mem_map_phys) {
- err = -ENOENT;
- goto out;
- }
+ if (!mem_map_phys)
+ goto err_unmap_fdt;
scratch = early_memremap(scratch_phys, scratch_len);
if (!scratch) {
pr_warn("setup: failed to memremap scratch (phys=0x%llx, len=%lld)\n",
scratch_phys, scratch_len);
- err = -EFAULT;
- goto out;
+ goto err_unmap_fdt;
}
/*
@@ -1507,7 +1509,7 @@ void __init kho_populate(phys_addr_t fdt_phys, u64 fdt_len,
if (WARN_ON(err)) {
pr_warn("failed to mark the scratch region 0x%pa+0x%pa: %pe",
&area->addr, &size, ERR_PTR(err));
- goto out;
+ goto err_unmap_scratch;
}
pr_debug("Marked 0x%pa+0x%pa as scratch", &area->addr, &size);
}
@@ -1529,13 +1531,14 @@ void __init kho_populate(phys_addr_t fdt_phys, u64 fdt_len,
kho_scratch_cnt = scratch_cnt;
pr_info("found kexec handover data.\n");
-out:
- if (fdt)
- early_memunmap(fdt, fdt_len);
- if (scratch)
- early_memunmap(scratch, scratch_len);
- if (err)
- pr_warn("disabling KHO revival: %d\n", err);
+ return;
+
+err_unmap_scratch:
+ early_memunmap(scratch, scratch_len);
+err_unmap_fdt:
+ early_memunmap(fdt, fdt_len);
+err_report:
+ pr_warn("disabling KHO revival\n");
}
/* Helper functions for kexec_file_load */
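The KHO interface changes above widen nr_pages to unsigned long and document that kho_restore_pages() returns the first page on success or NULL on failure. A hedged round-trip sketch under those prototypes (error handling kept minimal):

#include <linux/gfp.h>
#include <linux/kexec_handover.h>
#include <linux/mm.h>

static struct page *example_preserve(unsigned long nr_pages)
{
	unsigned int order = get_order(nr_pages << PAGE_SHIFT);
	struct page *page = alloc_pages(GFP_KERNEL, order);

	if (!page)
		return NULL;
	if (kho_preserve_pages(page, nr_pages)) {
		__free_pages(page, order);
		return NULL;
	}
	/* page_to_phys(page) is what gets recorded in a KHO subtree. */
	return page;
}

/* In the next kernel, with the physical address recovered from the FDT: */
static struct page *example_restore(phys_addr_t phys, unsigned long nr_pages)
{
	return kho_restore_pages(phys, nr_pages);	/* NULL on failure */
}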
diff --git a/kernel/liveupdate/luo_core.c b/kernel/liveupdate/luo_core.c
index 944663d99dd9..dda7bb57d421 100644
--- a/kernel/liveupdate/luo_core.c
+++ b/kernel/liveupdate/luo_core.c
@@ -35,8 +35,7 @@
* iommu, interrupts, vfio, participating filesystems, and memory management.
*
* LUO uses Kexec Handover to transfer memory state from the current kernel to
- * the next kernel. For more details see
- * Documentation/core-api/kho/concepts.rst.
+ * the next kernel. For more details see Documentation/core-api/kho/index.rst.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -128,7 +127,9 @@ static int __init luo_early_startup(void)
if (err)
return err;
- return 0;
+ err = luo_flb_setup_incoming(luo_global.fdt_in);
+
+ return err;
}
static int __init liveupdate_early_init(void)
@@ -165,6 +166,7 @@ static int __init luo_fdt_setup(void)
err |= fdt_property_string(fdt_out, "compatible", LUO_FDT_COMPATIBLE);
err |= fdt_property(fdt_out, LUO_FDT_LIVEUPDATE_NUM, &ln, sizeof(ln));
err |= luo_session_setup_outgoing(fdt_out);
+ err |= luo_flb_setup_outgoing(fdt_out);
err |= fdt_end_node(fdt_out);
err |= fdt_finish(fdt_out);
if (err)
@@ -226,6 +228,8 @@ int liveupdate_reboot(void)
if (err)
return err;
+ luo_flb_serialize();
+
err = kho_finalize();
if (err) {
pr_err("kho_finalize failed %d\n", err);
diff --git a/kernel/liveupdate/luo_file.c b/kernel/liveupdate/luo_file.c
index 9f7283379ebc..4c7df52a6507 100644
--- a/kernel/liveupdate/luo_file.c
+++ b/kernel/liveupdate/luo_file.c
@@ -104,6 +104,7 @@
#include <linux/io.h>
#include <linux/kexec_handover.h>
#include <linux/kho/abi/luo.h>
+#include <linux/list_private.h>
#include <linux/liveupdate.h>
#include <linux/module.h>
#include <linux/sizes.h>
@@ -273,7 +274,7 @@ int luo_preserve_file(struct luo_file_set *file_set, u64 token, int fd)
goto err_fput;
err = -ENOENT;
- luo_list_for_each_private(fh, &luo_file_handler_list, list) {
+ list_private_for_each_entry(fh, &luo_file_handler_list, list) {
if (fh->ops->can_preserve(fh, file)) {
err = 0;
break;
@@ -284,10 +285,14 @@ int luo_preserve_file(struct luo_file_set *file_set, u64 token, int fd)
if (err)
goto err_free_files_mem;
+ err = luo_flb_file_preserve(fh);
+ if (err)
+ goto err_free_files_mem;
+
luo_file = kzalloc(sizeof(*luo_file), GFP_KERNEL);
if (!luo_file) {
err = -ENOMEM;
- goto err_free_files_mem;
+ goto err_flb_unpreserve;
}
luo_file->file = file;
@@ -311,6 +316,8 @@ int luo_preserve_file(struct luo_file_set *file_set, u64 token, int fd)
err_kfree:
kfree(luo_file);
+err_flb_unpreserve:
+ luo_flb_file_unpreserve(fh);
err_free_files_mem:
luo_free_files_mem(file_set);
err_fput:
@@ -352,6 +359,7 @@ void luo_file_unpreserve_files(struct luo_file_set *file_set)
args.serialized_data = luo_file->serialized_data;
args.private_data = luo_file->private_data;
luo_file->fh->ops->unpreserve(&args);
+ luo_flb_file_unpreserve(luo_file->fh);
list_del(&luo_file->list);
file_set->count--;
@@ -627,6 +635,7 @@ static void luo_file_finish_one(struct luo_file_set *file_set,
args.retrieved = luo_file->retrieved;
luo_file->fh->ops->finish(&args);
+ luo_flb_file_finish(luo_file->fh);
}
/**
@@ -758,7 +767,7 @@ int luo_file_deserialize(struct luo_file_set *file_set,
bool handler_found = false;
struct luo_file *luo_file;
- luo_list_for_each_private(fh, &luo_file_handler_list, list) {
+ list_private_for_each_entry(fh, &luo_file_handler_list, list) {
if (!strcmp(fh->compatible, file_ser[i].compatible)) {
handler_found = true;
break;
@@ -833,7 +842,7 @@ int liveupdate_register_file_handler(struct liveupdate_file_handler *fh)
return -EBUSY;
/* Check for duplicate compatible strings */
- luo_list_for_each_private(fh_iter, &luo_file_handler_list, list) {
+ list_private_for_each_entry(fh_iter, &luo_file_handler_list, list) {
if (!strcmp(fh_iter->compatible, fh->compatible)) {
pr_err("File handler registration failed: Compatible string '%s' already registered.\n",
fh->compatible);
@@ -848,10 +857,13 @@ int liveupdate_register_file_handler(struct liveupdate_file_handler *fh)
goto err_resume;
}
+ INIT_LIST_HEAD(&ACCESS_PRIVATE(fh, flb_list));
INIT_LIST_HEAD(&ACCESS_PRIVATE(fh, list));
list_add_tail(&ACCESS_PRIVATE(fh, list), &luo_file_handler_list);
luo_session_resume();
+ liveupdate_test_register(fh);
+
return 0;
err_resume:
@@ -868,23 +880,38 @@ err_resume:
*
* It ensures safe removal by checking that:
* No live update session is currently in progress.
+ * No FLB is registered with this file handler.
*
* If the unregistration fails, the internal test state is reverted.
*
* Return: 0 Success. -EOPNOTSUPP when live update is not enabled. -EBUSY A live
- * update is in progress, can't quiesce live update.
+ * update is in progress, can't quiesce live update, or an FLB is still
+ * registered with this file handler.
*/
int liveupdate_unregister_file_handler(struct liveupdate_file_handler *fh)
{
+ int err = -EBUSY;
+
if (!liveupdate_enabled())
return -EOPNOTSUPP;
+ liveupdate_test_unregister(fh);
+
if (!luo_session_quiesce())
- return -EBUSY;
+ goto err_register;
+
+ if (!list_empty(&ACCESS_PRIVATE(fh, flb_list)))
+ goto err_resume;
list_del(&ACCESS_PRIVATE(fh, list));
module_put(fh->ops->owner);
luo_session_resume();
return 0;
+
+err_resume:
+ luo_session_resume();
+err_register:
+ liveupdate_test_register(fh);
+ return err;
}
diff --git a/kernel/liveupdate/luo_flb.c b/kernel/liveupdate/luo_flb.c
new file mode 100644
index 000000000000..4c437de5c0b0
--- /dev/null
+++ b/kernel/liveupdate/luo_flb.c
@@ -0,0 +1,654 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Copyright (c) 2025, Google LLC.
+ * Pasha Tatashin <pasha.tatashin@soleen.com>
+ */
+
+/**
+ * DOC: LUO File Lifecycle Bound Global Data
+ *
+ * File-Lifecycle-Bound (FLB) objects provide a mechanism for managing global
+ * state that is shared across multiple live-updatable files. The lifecycle of
+ * this shared state is tied to the preservation of the files that depend on it.
+ *
+ * An FLB represents a global resource, such as the IOMMU core state, that is
+ * required by multiple file descriptors (e.g., all VFIO fds).
+ *
+ * The preservation of the FLB's state is triggered when the *first* file
+ * depending on it is preserved. The cleanup of this state (unpreserve or
+ * finish) is triggered when the *last* file depending on it is unpreserved or
+ * finished.
+ *
+ * Handler Dependency: A file handler declares its dependency on one or more
+ * FLBs by registering them via liveupdate_register_flb().
+ *
+ * Callback Model: Each FLB is defined by a set of operations
+ * (&struct liveupdate_flb_ops) that LUO invokes at key points:
+ *
+ * - .preserve(): Called for the first file. Saves global state.
+ * - .unpreserve(): Called for the last file (if aborted pre-reboot).
+ * - .retrieve(): Called on-demand in the new kernel to restore the state.
+ * - .finish(): Called for the last file in the new kernel for cleanup.
+ *
+ * This reference-counted approach ensures that shared state is saved exactly
+ * once and restored exactly once, regardless of how many files depend on it,
+ * and that its lifecycle is correctly managed across the kexec transition.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/cleanup.h>
+#include <linux/err.h>
+#include <linux/errno.h>
+#include <linux/io.h>
+#include <linux/kexec_handover.h>
+#include <linux/kho/abi/luo.h>
+#include <linux/libfdt.h>
+#include <linux/list_private.h>
+#include <linux/liveupdate.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include <linux/unaligned.h>
+#include "luo_internal.h"
+
+#define LUO_FLB_PGCNT 1ul
+#define LUO_FLB_MAX (((LUO_FLB_PGCNT << PAGE_SHIFT) - \
+ sizeof(struct luo_flb_header_ser)) / sizeof(struct luo_flb_ser))
+
+struct luo_flb_header {
+ struct luo_flb_header_ser *header_ser;
+ struct luo_flb_ser *ser;
+ bool active;
+};
+
+struct luo_flb_global {
+ struct luo_flb_header incoming;
+ struct luo_flb_header outgoing;
+ struct list_head list;
+ long count;
+};
+
+static struct luo_flb_global luo_flb_global = {
+ .list = LIST_HEAD_INIT(luo_flb_global.list),
+};
+
+/*
+ * struct luo_flb_link - Links an FLB definition to a file handler's internal
+ * list of dependencies.
+ * @flb: A pointer to the registered &struct liveupdate_flb definition.
+ * @list: The list_head for linking.
+ */
+struct luo_flb_link {
+ struct liveupdate_flb *flb;
+ struct list_head list;
+};
+
+/* luo_flb_get_private - Access private field, and if needed initialize it. */
+static struct luo_flb_private *luo_flb_get_private(struct liveupdate_flb *flb)
+{
+ struct luo_flb_private *private = &ACCESS_PRIVATE(flb, private);
+
+ if (!private->initialized) {
+ mutex_init(&private->incoming.lock);
+ mutex_init(&private->outgoing.lock);
+ INIT_LIST_HEAD(&private->list);
+ private->users = 0;
+ private->initialized = true;
+ }
+
+ return private;
+}
+
+static int luo_flb_file_preserve_one(struct liveupdate_flb *flb)
+{
+ struct luo_flb_private *private = luo_flb_get_private(flb);
+
+ scoped_guard(mutex, &private->outgoing.lock) {
+ if (!private->outgoing.count) {
+ struct liveupdate_flb_op_args args = {0};
+ int err;
+
+ args.flb = flb;
+ err = flb->ops->preserve(&args);
+ if (err)
+ return err;
+ private->outgoing.data = args.data;
+ private->outgoing.obj = args.obj;
+ }
+ private->outgoing.count++;
+ }
+
+ return 0;
+}
+
+static void luo_flb_file_unpreserve_one(struct liveupdate_flb *flb)
+{
+ struct luo_flb_private *private = luo_flb_get_private(flb);
+
+ scoped_guard(mutex, &private->outgoing.lock) {
+ private->outgoing.count--;
+ if (!private->outgoing.count) {
+ struct liveupdate_flb_op_args args = {0};
+
+ args.flb = flb;
+ args.data = private->outgoing.data;
+ args.obj = private->outgoing.obj;
+
+ if (flb->ops->unpreserve)
+ flb->ops->unpreserve(&args);
+
+ private->outgoing.data = 0;
+ private->outgoing.obj = NULL;
+ }
+ }
+}
+
+static int luo_flb_retrieve_one(struct liveupdate_flb *flb)
+{
+ struct luo_flb_private *private = luo_flb_get_private(flb);
+ struct luo_flb_header *fh = &luo_flb_global.incoming;
+ struct liveupdate_flb_op_args args = {0};
+ bool found = false;
+ int err;
+
+ guard(mutex)(&private->incoming.lock);
+
+ if (private->incoming.finished)
+ return -ENODATA;
+
+ if (private->incoming.retrieved)
+ return 0;
+
+ if (!fh->active)
+ return -ENODATA;
+
+ for (int i = 0; i < fh->header_ser->count; i++) {
+ if (!strcmp(fh->ser[i].name, flb->compatible)) {
+ private->incoming.data = fh->ser[i].data;
+ private->incoming.count = fh->ser[i].count;
+ found = true;
+ break;
+ }
+ }
+
+ if (!found)
+ return -ENOENT;
+
+ args.flb = flb;
+ args.data = private->incoming.data;
+
+ err = flb->ops->retrieve(&args);
+ if (err)
+ return err;
+
+ private->incoming.obj = args.obj;
+ private->incoming.retrieved = true;
+
+ return 0;
+}
+
+static void luo_flb_file_finish_one(struct liveupdate_flb *flb)
+{
+ struct luo_flb_private *private = luo_flb_get_private(flb);
+ u64 count;
+
+ scoped_guard(mutex, &private->incoming.lock)
+ count = --private->incoming.count;
+
+ if (!count) {
+ struct liveupdate_flb_op_args args = {0};
+
+ if (!private->incoming.retrieved) {
+ int err = luo_flb_retrieve_one(flb);
+
+ if (WARN_ON(err))
+ return;
+ }
+
+ scoped_guard(mutex, &private->incoming.lock) {
+ args.flb = flb;
+ args.obj = private->incoming.obj;
+ flb->ops->finish(&args);
+
+ private->incoming.data = 0;
+ private->incoming.obj = NULL;
+ private->incoming.finished = true;
+ }
+ }
+}
+
+/**
+ * luo_flb_file_preserve - Notifies FLBs that a file is about to be preserved.
+ * @fh: The file handler for the preserved file.
+ *
+ * This function iterates through all FLBs associated with the given file
+ * handler. It increments the reference count for each FLB. If the count becomes
+ * 1, it triggers the FLB's .preserve() callback to save the global state.
+ *
+ * This operation is atomic. If any FLB's .preserve() op fails, it will roll
+ * back by calling .unpreserve() on any FLBs that were successfully preserved
+ * during this call.
+ *
+ * Context: Called from luo_preserve_file()
+ * Return: 0 on success, or a negative errno on failure.
+ */
+int luo_flb_file_preserve(struct liveupdate_file_handler *fh)
+{
+ struct list_head *flb_list = &ACCESS_PRIVATE(fh, flb_list);
+ struct luo_flb_link *iter;
+ int err = 0;
+
+ list_for_each_entry(iter, flb_list, list) {
+ err = luo_flb_file_preserve_one(iter->flb);
+ if (err)
+ goto exit_err;
+ }
+
+ return 0;
+
+exit_err:
+ list_for_each_entry_continue_reverse(iter, flb_list, list)
+ luo_flb_file_unpreserve_one(iter->flb);
+
+ return err;
+}
+
+/**
+ * luo_flb_file_unpreserve - Notifies FLBs that a dependent file was unpreserved.
+ * @fh: The file handler for the unpreserved file.
+ *
+ * This function iterates through all FLBs associated with the given file
+ * handler, in reverse order of registration. It decrements the reference count
+ * for each FLB. If the count becomes 0, it triggers the FLB's .unpreserve()
+ * callback to clean up the global state.
+ *
+ * Context: Called when a preserved file is being cleaned up before reboot
+ * (e.g., from luo_file_unpreserve_files()).
+ */
+void luo_flb_file_unpreserve(struct liveupdate_file_handler *fh)
+{
+ struct list_head *flb_list = &ACCESS_PRIVATE(fh, flb_list);
+ struct luo_flb_link *iter;
+
+ list_for_each_entry_reverse(iter, flb_list, list)
+ luo_flb_file_unpreserve_one(iter->flb);
+}
+
+/**
+ * luo_flb_file_finish - Notifies FLBs that a dependent file has been finished.
+ * @fh: The file handler for the finished file.
+ *
+ * This function iterates through all FLBs associated with the given file
+ * handler, in reverse order of registration. It decrements the incoming
+ * reference count for each FLB. If the count becomes 0, it triggers the FLB's
+ * .finish() callback for final cleanup in the new kernel.
+ *
+ * Context: Called from luo_file_finish() for each file being finished.
+ */
+void luo_flb_file_finish(struct liveupdate_file_handler *fh)
+{
+ struct list_head *flb_list = &ACCESS_PRIVATE(fh, flb_list);
+ struct luo_flb_link *iter;
+
+ list_for_each_entry_reverse(iter, flb_list, list)
+ luo_flb_file_finish_one(iter->flb);
+}
+
+/**
+ * liveupdate_register_flb - Associate an FLB with a file handler and register it globally.
+ * @fh: The file handler that will now depend on the FLB.
+ * @flb: The File-Lifecycle-Bound object to associate.
+ *
+ * Establishes a dependency, informing the LUO core that whenever a file of
+ * type @fh is preserved, the state of @flb must also be managed.
+ *
+ * On the first registration of a given @flb object, it is added to a global
+ * registry. This function checks for duplicate registrations, both for a
+ * specific handler and globally, and ensures the total number of unique
+ * FLBs does not exceed the system limit.
+ *
+ * Context: Typically called from a subsystem's module init function after
+ * both the handler and the FLB have been defined and initialized.
+ * Return: 0 on success. Returns a negative errno on failure:
+ * -EINVAL if arguments are NULL or not initialized.
+ * -ENOMEM on memory allocation failure.
+ * -EEXIST if this FLB is already registered with this handler.
+ * -ENOSPC if the maximum number of global FLBs has been reached.
+ * -EOPNOTSUPP if live update is disabled or not configured.
+ */
+int liveupdate_register_flb(struct liveupdate_file_handler *fh,
+ struct liveupdate_flb *flb)
+{
+ struct luo_flb_private *private = luo_flb_get_private(flb);
+ struct list_head *flb_list = &ACCESS_PRIVATE(fh, flb_list);
+ struct luo_flb_link *link __free(kfree) = NULL;
+ struct liveupdate_flb *gflb;
+ struct luo_flb_link *iter;
+ int err;
+
+ if (!liveupdate_enabled())
+ return -EOPNOTSUPP;
+
+ if (WARN_ON(!flb->ops->preserve || !flb->ops->unpreserve ||
+ !flb->ops->retrieve || !flb->ops->finish)) {
+ return -EINVAL;
+ }
+
+ /*
+	 * The file handler must already be registered, since registration
+	 * initializes its flb_list.
+ */
+ if (WARN_ON(list_empty(&ACCESS_PRIVATE(fh, list))))
+ return -EINVAL;
+
+ link = kzalloc(sizeof(*link), GFP_KERNEL);
+ if (!link)
+ return -ENOMEM;
+
+ /*
+ * Ensure the system is quiescent (no active sessions).
+ * This acts as a global lock for registration: no other thread can
+ * be in this section, and no sessions can be creating/using FDs.
+ */
+ if (!luo_session_quiesce())
+ return -EBUSY;
+
+ /* Check that this FLB is not already linked to this file handler */
+ err = -EEXIST;
+ list_for_each_entry(iter, flb_list, list) {
+ if (iter->flb == flb)
+ goto err_resume;
+ }
+
+ /*
+	 * If this FLB is not yet linked to the global list, this is the first
+	 * time it is being registered.
+ */
+ if (!private->users) {
+ if (WARN_ON(!list_empty(&private->list))) {
+ err = -EINVAL;
+ goto err_resume;
+ }
+
+ if (luo_flb_global.count == LUO_FLB_MAX) {
+ err = -ENOSPC;
+ goto err_resume;
+ }
+
+ /* Check that compatible string is unique in global list */
+ list_private_for_each_entry(gflb, &luo_flb_global.list, private.list) {
+ if (!strcmp(gflb->compatible, flb->compatible))
+ goto err_resume;
+ }
+
+ if (!try_module_get(flb->ops->owner)) {
+ err = -EAGAIN;
+ goto err_resume;
+ }
+
+ list_add_tail(&private->list, &luo_flb_global.list);
+ luo_flb_global.count++;
+ }
+
+ /* Finally, link the FLB to the file handler */
+ private->users++;
+ link->flb = flb;
+ list_add_tail(&no_free_ptr(link)->list, flb_list);
+ luo_session_resume();
+
+ return 0;
+
+err_resume:
+ luo_session_resume();
+ return err;
+}
+
+/**
+ * liveupdate_unregister_flb - Remove an FLB dependency from a file handler.
+ * @fh: The file handler that is currently depending on the FLB.
+ * @flb: The File-Lifecycle-Bound object to remove.
+ *
+ * Removes the association between the specified file handler and the FLB
+ * previously established by liveupdate_register_flb().
+ *
+ * This function manages the global lifecycle of the FLB. It decrements the
+ * FLB's usage count. If this was the last file handler referencing this FLB,
+ * the FLB is removed from the global registry and the reference to its
+ * owner module (acquired during registration) is released.
+ *
+ * Context: This function ensures the session is quiesced (no active FDs
+ * being created) during the update. It is typically called from a
+ * subsystem's module exit function.
+ * Return: 0 on success.
+ * -EOPNOTSUPP if live update is disabled.
+ * -EBUSY if the live update session is active and cannot be quiesced.
+ * -ENOENT if the FLB was not found in the file handler's list.
+ */
+int liveupdate_unregister_flb(struct liveupdate_file_handler *fh,
+ struct liveupdate_flb *flb)
+{
+ struct luo_flb_private *private = luo_flb_get_private(flb);
+ struct list_head *flb_list = &ACCESS_PRIVATE(fh, flb_list);
+ struct luo_flb_link *iter;
+ int err = -ENOENT;
+
+ if (!liveupdate_enabled())
+ return -EOPNOTSUPP;
+
+ /*
+ * Ensure the system is quiescent (no active sessions).
+ * This acts as a global lock for unregistration.
+ */
+ if (!luo_session_quiesce())
+ return -EBUSY;
+
+ /* Find and remove the link from the file handler's list */
+ list_for_each_entry(iter, flb_list, list) {
+ if (iter->flb == flb) {
+ list_del(&iter->list);
+ kfree(iter);
+ err = 0;
+ break;
+ }
+ }
+
+ if (err)
+ goto err_resume;
+
+ private->users--;
+ /*
+	 * If this is the last file handler with which this FLB is registered,
+	 * remove it from the global list and release the module reference.
+ */
+ if (!private->users) {
+ list_del_init(&private->list);
+ luo_flb_global.count--;
+ module_put(flb->ops->owner);
+ }
+
+ luo_session_resume();
+
+ return 0;
+
+err_resume:
+ luo_session_resume();
+ return err;
+}
+
+/**
+ * liveupdate_flb_get_incoming - Retrieve the incoming FLB object.
+ * @flb: The FLB definition.
+ * @objp: Output parameter; will be populated with the live shared object.
+ *
+ * Returns a pointer to its shared live object for the incoming (post-reboot)
+ * path.
+ *
+ * If this is the first time the object is requested in the new kernel, this
+ * function will trigger the FLB's .retrieve() callback to reconstruct the
+ * object from its preserved state. Subsequent calls will return the same
+ * cached object.
+ *
+ * Return: 0 on success, or a negative errno on failure. -ENODATA means there
+ * is no incoming FLB data, -ENOENT means this specific FLB was not found in
+ * the incoming data, and -EOPNOTSUPP means live update is disabled or not
+ * configured.
+ */
+int liveupdate_flb_get_incoming(struct liveupdate_flb *flb, void **objp)
+{
+ struct luo_flb_private *private = luo_flb_get_private(flb);
+
+ if (!liveupdate_enabled())
+ return -EOPNOTSUPP;
+
+ if (!private->incoming.obj) {
+ int err = luo_flb_retrieve_one(flb);
+
+ if (err)
+ return err;
+ }
+
+ guard(mutex)(&private->incoming.lock);
+ *objp = private->incoming.obj;
+
+ return 0;
+}
+
+/**
+ * liveupdate_flb_get_outgoing - Retrieve the outgoing FLB object.
+ * @flb: The FLB definition.
+ * @objp: Output parameter; will be populated with the live shared object.
+ *
+ * Returns a pointer to its shared live object for the outgoing (pre-reboot)
+ * path.
+ *
+ * This function assumes the object has already been created by the FLB's
+ * .preserve() callback, which is triggered when the first dependent file
+ * is preserved.
+ *
+ * Return: 0 on success, or a negative errno on failure.
+ */
+int liveupdate_flb_get_outgoing(struct liveupdate_flb *flb, void **objp)
+{
+ struct luo_flb_private *private = luo_flb_get_private(flb);
+
+ if (!liveupdate_enabled())
+ return -EOPNOTSUPP;
+
+ guard(mutex)(&private->outgoing.lock);
+ *objp = private->outgoing.obj;
+
+ return 0;
+}
+
+int __init luo_flb_setup_outgoing(void *fdt_out)
+{
+ struct luo_flb_header_ser *header_ser;
+ u64 header_ser_pa;
+ int err;
+
+ header_ser = kho_alloc_preserve(LUO_FLB_PGCNT << PAGE_SHIFT);
+ if (IS_ERR(header_ser))
+ return PTR_ERR(header_ser);
+
+ header_ser_pa = virt_to_phys(header_ser);
+
+ err = fdt_begin_node(fdt_out, LUO_FDT_FLB_NODE_NAME);
+ err |= fdt_property_string(fdt_out, "compatible",
+ LUO_FDT_FLB_COMPATIBLE);
+ err |= fdt_property(fdt_out, LUO_FDT_FLB_HEADER, &header_ser_pa,
+ sizeof(header_ser_pa));
+ err |= fdt_end_node(fdt_out);
+
+ if (err)
+ goto err_unpreserve;
+
+ header_ser->pgcnt = LUO_FLB_PGCNT;
+ luo_flb_global.outgoing.header_ser = header_ser;
+ luo_flb_global.outgoing.ser = (void *)(header_ser + 1);
+ luo_flb_global.outgoing.active = true;
+
+ return 0;
+
+err_unpreserve:
+ kho_unpreserve_free(header_ser);
+
+ return err;
+}
+
+int __init luo_flb_setup_incoming(void *fdt_in)
+{
+ struct luo_flb_header_ser *header_ser;
+ int err, header_size, offset;
+ const void *ptr;
+ u64 header_ser_pa;
+
+ offset = fdt_subnode_offset(fdt_in, 0, LUO_FDT_FLB_NODE_NAME);
+ if (offset < 0) {
+ pr_err("Unable to get FLB node [%s]\n", LUO_FDT_FLB_NODE_NAME);
+
+ return -ENOENT;
+ }
+
+ err = fdt_node_check_compatible(fdt_in, offset,
+ LUO_FDT_FLB_COMPATIBLE);
+ if (err) {
+ pr_err("FLB node is incompatible with '%s' [%d]\n",
+ LUO_FDT_FLB_COMPATIBLE, err);
+
+ return -EINVAL;
+ }
+
+ header_size = 0;
+ ptr = fdt_getprop(fdt_in, offset, LUO_FDT_FLB_HEADER, &header_size);
+ if (!ptr || header_size != sizeof(u64)) {
+ pr_err("Unable to get FLB header property '%s' [%d]\n",
+ LUO_FDT_FLB_HEADER, header_size);
+
+ return -EINVAL;
+ }
+
+ header_ser_pa = get_unaligned((u64 *)ptr);
+ header_ser = phys_to_virt(header_ser_pa);
+
+ luo_flb_global.incoming.header_ser = header_ser;
+ luo_flb_global.incoming.ser = (void *)(header_ser + 1);
+ luo_flb_global.incoming.active = true;
+
+ return 0;
+}
+
+/**
+ * luo_flb_serialize - Serializes all active FLB objects for KHO.
+ *
+ * This function is called from the reboot path. It iterates through all
+ * registered File-Lifecycle-Bound (FLB) objects. For each FLB that has been
+ * preserved (i.e., its reference count is greater than zero), it writes its
+ * metadata into the memory region designated for Kexec Handover.
+ *
+ * The serialized data includes the FLB's compatibility string, its opaque
+ * data handle, and the final reference count. This allows the new kernel to
+ * find the appropriate handler and reconstruct the FLB's state.
+ *
+ * Context: Called from liveupdate_reboot() just before kho_finalize().
+ */
+void luo_flb_serialize(void)
+{
+ struct luo_flb_header *fh = &luo_flb_global.outgoing;
+ struct liveupdate_flb *gflb;
+ int i = 0;
+
+ list_private_for_each_entry(gflb, &luo_flb_global.list, private.list) {
+ struct luo_flb_private *private = luo_flb_get_private(gflb);
+
+ if (private->outgoing.count > 0) {
+ strscpy(fh->ser[i].name, gflb->compatible,
+ sizeof(fh->ser[i].name));
+ fh->ser[i].data = private->outgoing.data;
+ fh->ser[i].count = private->outgoing.count;
+ i++;
+ }
+ }
+
+ fh->header_ser->count = i;
+}
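For context, a hedged sketch of how a subsystem would use the new API: define an FLB with the four mandatory callbacks and attach it to an already-registered file handler. The callback bodies and the "example,flb-v1" compatible string are placeholders; the ops and fields mirror those referenced by luo_flb.c above.

#include <linux/liveupdate.h>
#include <linux/module.h>

static int example_flb_preserve(struct liveupdate_flb_op_args *args)
{
	/* First dependent file preserved: serialize the shared state once. */
	args->data = 0;		/* opaque u64 handle handed to the next kernel */
	args->obj = NULL;	/* live object shared by all dependent files */
	return 0;
}

static void example_flb_unpreserve(struct liveupdate_flb_op_args *args)
{
	/* Last dependent file unpreserved before reboot: undo preserve. */
}

static int example_flb_retrieve(struct liveupdate_flb_op_args *args)
{
	/* New kernel: rebuild the shared object from args->data. */
	args->obj = NULL;
	return 0;
}

static void example_flb_finish(struct liveupdate_flb_op_args *args)
{
	/* New kernel: last dependent file finished, release the shared object. */
}

static struct liveupdate_flb_ops example_flb_ops = {
	.preserve	= example_flb_preserve,
	.unpreserve	= example_flb_unpreserve,
	.retrieve	= example_flb_retrieve,
	.finish		= example_flb_finish,
	.owner		= THIS_MODULE,
};

static struct liveupdate_flb example_flb = {
	.ops		= &example_flb_ops,
	.compatible	= "example,flb-v1",
};

/* From the subsystem's init, after liveupdate_register_file_handler(fh): */
static int example_attach_flb(struct liveupdate_file_handler *fh)
{
	return liveupdate_register_flb(fh, &example_flb);
}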
diff --git a/kernel/liveupdate/luo_internal.h b/kernel/liveupdate/luo_internal.h
index c8973b543d1d..8083d8739b09 100644
--- a/kernel/liveupdate/luo_internal.h
+++ b/kernel/liveupdate/luo_internal.h
@@ -40,13 +40,6 @@ static inline int luo_ucmd_respond(struct luo_ucmd *ucmd,
*/
#define luo_restore_fail(__fmt, ...) panic(__fmt, ##__VA_ARGS__)
-/* Mimics list_for_each_entry() but for private list head entries */
-#define luo_list_for_each_private(pos, head, member) \
- for (struct list_head *__iter = (head)->next; \
- __iter != (head) && \
- ({ pos = container_of(__iter, typeof(*(pos)), member); 1; }); \
- __iter = __iter->next)
-
/**
* struct luo_file_set - A set of files that belong to the same sessions.
* @files_list: An ordered list of files associated with this session, it is
@@ -107,4 +100,19 @@ int luo_file_deserialize(struct luo_file_set *file_set,
void luo_file_set_init(struct luo_file_set *file_set);
void luo_file_set_destroy(struct luo_file_set *file_set);
+int luo_flb_file_preserve(struct liveupdate_file_handler *fh);
+void luo_flb_file_unpreserve(struct liveupdate_file_handler *fh);
+void luo_flb_file_finish(struct liveupdate_file_handler *fh);
+int __init luo_flb_setup_outgoing(void *fdt);
+int __init luo_flb_setup_incoming(void *fdt);
+void luo_flb_serialize(void);
+
+#ifdef CONFIG_LIVEUPDATE_TEST
+void liveupdate_test_register(struct liveupdate_file_handler *fh);
+void liveupdate_test_unregister(struct liveupdate_file_handler *fh);
+#else
+static inline void liveupdate_test_register(struct liveupdate_file_handler *fh) { }
+static inline void liveupdate_test_unregister(struct liveupdate_file_handler *fh) { }
+#endif
+
#endif /* _LINUX_LUO_INTERNAL_H */
diff --git a/kernel/module/kallsyms.c b/kernel/module/kallsyms.c
index 00a60796327c..0fc11e45df9b 100644
--- a/kernel/module/kallsyms.c
+++ b/kernel/module/kallsyms.c
@@ -334,13 +334,8 @@ int module_address_lookup(unsigned long addr,
if (mod) {
if (modname)
*modname = mod->name;
- if (modbuildid) {
-#if IS_ENABLED(CONFIG_STACKTRACE_BUILD_ID)
- *modbuildid = mod->build_id;
-#else
- *modbuildid = NULL;
-#endif
- }
+ if (modbuildid)
+ *modbuildid = module_buildid(mod);
sym = find_kallsyms_symbol(mod, addr, size, offset);
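
Editor's note: the hunk above folds the CONFIG_STACKTRACE_BUILD_ID #ifdef into a module_buildid() helper defined elsewhere in this series. A minimal sketch of what such a helper presumably looks like, reconstructed from the removed lines (not copied from the actual header):

/* Editorial sketch of module_buildid(); the real definition lives in the
 * module headers elsewhere in this series. */
static inline const unsigned char *module_buildid(struct module *mod)
{
#if IS_ENABLED(CONFIG_STACKTRACE_BUILD_ID)
	return mod->build_id;
#else
	return NULL;
#endif
}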
diff --git a/kernel/panic.c b/kernel/panic.c
index 0c20fcaae98a..c78600212b6c 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -42,6 +42,7 @@
#define PANIC_TIMER_STEP 100
#define PANIC_BLINK_SPD 18
+#define PANIC_MSG_BUFSZ 1024
#ifdef CONFIG_SMP
/*
@@ -74,6 +75,8 @@ EXPORT_SYMBOL_GPL(panic_timeout);
unsigned long panic_print;
+static int panic_force_cpu = -1;
+
ATOMIC_NOTIFIER_HEAD(panic_notifier_list);
EXPORT_SYMBOL(panic_notifier_list);
@@ -300,6 +303,150 @@ void __weak crash_smp_send_stop(void)
}
atomic_t panic_cpu = ATOMIC_INIT(PANIC_CPU_INVALID);
+atomic_t panic_redirect_cpu = ATOMIC_INIT(PANIC_CPU_INVALID);
+
+#if defined(CONFIG_SMP) && defined(CONFIG_CRASH_DUMP)
+static char *panic_force_buf;
+
+static int __init panic_force_cpu_setup(char *str)
+{
+ int cpu;
+
+ if (!str)
+ return -EINVAL;
+
+ if (kstrtoint(str, 0, &cpu) || cpu < 0 || cpu >= nr_cpu_ids) {
+ pr_warn("panic_force_cpu: invalid value '%s'\n", str);
+ return -EINVAL;
+ }
+
+ panic_force_cpu = cpu;
+ return 0;
+}
+early_param("panic_force_cpu", panic_force_cpu_setup);
+
+static int __init panic_force_cpu_late_init(void)
+{
+ if (panic_force_cpu < 0)
+ return 0;
+
+ panic_force_buf = kmalloc(PANIC_MSG_BUFSZ, GFP_KERNEL);
+
+ return 0;
+}
+late_initcall(panic_force_cpu_late_init);
+
+static void do_panic_on_target_cpu(void *info)
+{
+ panic("%s", (char *)info);
+}
+
+/**
+ * panic_smp_redirect_cpu - Redirect panic to target CPU
+ * @target_cpu: CPU that should handle the panic
+ * @msg: formatted panic message
+ *
+ * Default implementation uses IPI. Architectures with NMI support
+ * can override this for more reliable delivery.
+ *
+ * Return: 0 on success, negative errno on failure
+ */
+int __weak panic_smp_redirect_cpu(int target_cpu, void *msg)
+{
+ static call_single_data_t panic_csd;
+
+ panic_csd.func = do_panic_on_target_cpu;
+ panic_csd.info = msg;
+
+ return smp_call_function_single_async(target_cpu, &panic_csd);
+}
+
+/**
+ * panic_try_force_cpu - Redirect panic to a specific CPU for crash kernel
+ * @fmt: panic message format string
+ * @args: arguments for format string
+ *
+ * Some platforms require panic handling to occur on a specific CPU
+ * for the crash kernel to function correctly. This function redirects
+ * panic handling to the CPU specified via the panic_force_cpu= boot parameter.
+ *
+ * Returns false if panic should proceed on current CPU.
+ * Returns true if panic was redirected.
+ */
+__printf(1, 0)
+static bool panic_try_force_cpu(const char *fmt, va_list args)
+{
+ int this_cpu = raw_smp_processor_id();
+ int old_cpu = PANIC_CPU_INVALID;
+ const char *msg;
+
+ /* Feature not enabled via boot parameter */
+ if (panic_force_cpu < 0)
+ return false;
+
+ /* Already on target CPU - proceed normally */
+ if (this_cpu == panic_force_cpu)
+ return false;
+
+ /* Target CPU is offline, can't redirect */
+ if (!cpu_online(panic_force_cpu)) {
+ pr_warn("panic: target CPU %d is offline, continuing on CPU %d\n",
+ panic_force_cpu, this_cpu);
+ return false;
+ }
+
+ /* Another panic already in progress */
+ if (panic_in_progress())
+ return false;
+
+ /*
+ * Only one CPU can do the redirect. Use atomic cmpxchg to ensure
+ * we don't race with another CPU also trying to redirect.
+ */
+ if (!atomic_try_cmpxchg(&panic_redirect_cpu, &old_cpu, this_cpu))
+ return false;
+
+ /*
+	 * Use the dynamically allocated buffer if available, otherwise fall
+	 * back to a static message for early boot panics or allocation failure.
+ */
+ if (panic_force_buf) {
+ vsnprintf(panic_force_buf, PANIC_MSG_BUFSZ, fmt, args);
+ msg = panic_force_buf;
+ } else {
+ msg = "Redirected panic (buffer unavailable)";
+ }
+
+ console_verbose();
+ bust_spinlocks(1);
+
+ pr_emerg("panic: Redirecting from CPU %d to CPU %d for crash kernel.\n",
+ this_cpu, panic_force_cpu);
+
+ /* Dump original CPU before redirecting */
+ if (!test_taint(TAINT_DIE) &&
+ oops_in_progress <= 1 &&
+ IS_ENABLED(CONFIG_DEBUG_BUGVERBOSE)) {
+ dump_stack();
+ }
+
+ if (panic_smp_redirect_cpu(panic_force_cpu, (void *)msg) != 0) {
+ atomic_set(&panic_redirect_cpu, PANIC_CPU_INVALID);
+ pr_warn("panic: failed to redirect to CPU %d, continuing on CPU %d\n",
+ panic_force_cpu, this_cpu);
+ return false;
+ }
+
+ /* IPI/NMI sent, this CPU should stop */
+ return true;
+}
+#else
+__printf(1, 0)
+static inline bool panic_try_force_cpu(const char *fmt, va_list args)
+{
+ return false;
+}
+#endif /* CONFIG_SMP && CONFIG_CRASH_DUMP */
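
Editor's note: with this block, booting with e.g. panic_force_cpu=0 (typically alongside crashkernel=) makes a panic raised on any other CPU re-enter panic() on CPU 0 through the weak, IPI-based panic_smp_redirect_cpu(). The sketch below shows how an architecture might override it with NMI delivery; arch_send_panic_nmi() is a hypothetical placeholder, not an existing kernel API.

/*
 * Editorial sketch of an arch override. arch_send_panic_nmi() is a
 * made-up stand-in for whatever NMI-capable IPI the architecture
 * provides; it is not a real kernel function.
 */
int panic_smp_redirect_cpu(int target_cpu, void *msg)
{
	/* NMI delivery still lands if the target runs with IRQs disabled. */
	return arch_send_panic_nmi(target_cpu, msg);
}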
bool panic_try_start(void)
{
@@ -428,7 +575,7 @@ static void panic_other_cpus_shutdown(bool crash_kexec)
*/
void vpanic(const char *fmt, va_list args)
{
- static char buf[1024];
+ static char buf[PANIC_MSG_BUFSZ];
long i, i_next = 0, len;
int state = 0;
bool _crash_kexec_post_notifiers = crash_kexec_post_notifiers;
@@ -452,6 +599,15 @@ void vpanic(const char *fmt, va_list args)
local_irq_disable();
preempt_disable_notrace();
+ /* Redirect panic to target CPU if configured via panic_force_cpu=. */
+ if (panic_try_force_cpu(fmt, args)) {
+ /*
+ * Mark ourselves offline so panic_other_cpus_shutdown() won't wait
+ * for us on architectures that check num_online_cpus().
+ */
+ set_cpu_online(smp_processor_id(), false);
+ panic_smp_self_stop();
+ }
/*
* It's possible to come here directly from a panic-assertion and
* not have preempt disabled. Some functions called from here want
@@ -484,7 +640,11 @@ void vpanic(const char *fmt, va_list args)
/*
* Avoid nested stack-dumping if a panic occurs during oops processing
*/
- if (test_taint(TAINT_DIE) || oops_in_progress > 1) {
+ if (atomic_read(&panic_redirect_cpu) != PANIC_CPU_INVALID &&
+ panic_force_cpu == raw_smp_processor_id()) {
+ pr_emerg("panic: Redirected from CPU %d, skipping stack dump.\n",
+ atomic_read(&panic_redirect_cpu));
+ } else if (test_taint(TAINT_DIE) || oops_in_progress > 1) {
panic_this_cpu_backtrace_printed = true;
} else if (IS_ENABLED(CONFIG_DEBUG_BUGVERBOSE)) {
dump_stack();
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 7e462957c9bf..c4eb284b8e72 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -174,10 +174,10 @@ sector_t alloc_swapdev_block(int swap)
* Allocate a swap page and register that it has been allocated, so that
* it can be freed in case of an error.
*/
- offset = swp_offset(get_swap_page_of_type(swap));
+ offset = swp_offset(swap_alloc_hibernation_slot(swap));
if (offset) {
if (swsusp_extents_insert(offset))
- swap_free(swp_entry(swap, offset));
+ swap_free_hibernation_slot(swp_entry(swap, offset));
else
return swapdev_block(swap, offset);
}
@@ -186,6 +186,7 @@ sector_t alloc_swapdev_block(int swap)
void free_all_swap_pages(int swap)
{
+ unsigned long offset;
struct rb_node *node;
/*
@@ -197,8 +198,9 @@ void free_all_swap_pages(int swap)
ext = rb_entry(node, struct swsusp_extent, node);
rb_erase(node, &swsusp_extents);
- swap_free_nr(swp_entry(swap, ext->start),
- ext->end - ext->start + 1);
+
+ for (offset = ext->start; offset <= ext->end; offset++)
+ swap_free_hibernation_slot(swp_entry(swap, offset));
kfree(ext);
}
diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h
index 5f5f626f4279..5fdea5682756 100644
--- a/kernel/printk/internal.h
+++ b/kernel/printk/internal.h
@@ -281,12 +281,20 @@ struct printk_buffers {
* nothing to output and this record should be skipped.
* @seq: The sequence number of the record used for @pbufs->outbuf.
* @dropped: The number of dropped records from reading @seq.
+ * @cpu: CPU on which the message was generated.
+ * @pid: PID of the task that generated the message.
+ * @comm: Name of the task that generated the message.
*/
struct printk_message {
struct printk_buffers *pbufs;
unsigned int outbuf_len;
u64 seq;
unsigned long dropped;
+#ifdef CONFIG_PRINTK_EXECUTION_CTX
+ int cpu;
+ pid_t pid;
+ char comm[TASK_COMM_LEN];
+#endif
};
bool printk_get_next_message(struct printk_message *pmsg, u64 seq,
diff --git a/kernel/printk/nbcon.c b/kernel/printk/nbcon.c
index 32fc12e53675..d558b18505cd 100644
--- a/kernel/printk/nbcon.c
+++ b/kernel/printk/nbcon.c
@@ -946,6 +946,20 @@ void nbcon_reacquire_nobuf(struct nbcon_write_context *wctxt)
}
EXPORT_SYMBOL_GPL(nbcon_reacquire_nobuf);
+#ifdef CONFIG_PRINTK_EXECUTION_CTX
+static void wctxt_load_execution_ctx(struct nbcon_write_context *wctxt,
+ struct printk_message *pmsg)
+{
+ wctxt->cpu = pmsg->cpu;
+ wctxt->pid = pmsg->pid;
+ memcpy(wctxt->comm, pmsg->comm, sizeof(wctxt->comm));
+ static_assert(sizeof(wctxt->comm) == sizeof(pmsg->comm));
+}
+#else
+static void wctxt_load_execution_ctx(struct nbcon_write_context *wctxt,
+ struct printk_message *pmsg) {}
+#endif
+
/**
* nbcon_emit_next_record - Emit a record in the acquired context
* @wctxt: The write context that will be handed to the write function
@@ -1048,6 +1062,8 @@ static bool nbcon_emit_next_record(struct nbcon_write_context *wctxt, bool use_a
/* Initialize the write context for driver callbacks. */
nbcon_write_context_set_buf(wctxt, &pmsg.pbufs->outbuf[0], pmsg.outbuf_len);
+ wctxt_load_execution_ctx(wctxt, &pmsg);
+
if (use_atomic)
con->write_atomic(con, wctxt);
else
@@ -1758,9 +1774,12 @@ bool nbcon_alloc(struct console *con)
/* Synchronize the kthread start. */
lockdep_assert_console_list_lock_held();
- /* The write_thread() callback is mandatory. */
- if (WARN_ON(!con->write_thread))
+ /* Check for mandatory nbcon callbacks. */
+ if (WARN_ON(!con->write_thread ||
+ !con->device_lock ||
+ !con->device_unlock)) {
return false;
+ }
rcuwait_init(&con->rcuwait);
init_irq_work(&con->irq_work, nbcon_irq_work);
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 37d16ef27f13..a181394604d1 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -2133,12 +2133,40 @@ static inline void printk_delay(int level)
}
}
+#define CALLER_ID_MASK 0x80000000
+
static inline u32 printk_caller_id(void)
{
return in_task() ? task_pid_nr(current) :
- 0x80000000 + smp_processor_id();
+ CALLER_ID_MASK + smp_processor_id();
+}
+
+#ifdef CONFIG_PRINTK_EXECUTION_CTX
+/* Store the info complementary to caller_id. */
+static u32 printk_caller_id2(void)
+{
+ return !in_task() ? task_pid_nr(current) :
+ CALLER_ID_MASK + smp_processor_id();
+}
+
+static pid_t printk_info_get_pid(const struct printk_info *info)
+{
+ u32 caller_id = info->caller_id;
+ u32 caller_id2 = info->caller_id2;
+
+ return caller_id & CALLER_ID_MASK ? caller_id2 : caller_id;
}
+static int printk_info_get_cpu(const struct printk_info *info)
+{
+ u32 caller_id = info->caller_id;
+ u32 caller_id2 = info->caller_id2;
+
+ return ((caller_id & CALLER_ID_MASK ?
+ caller_id : caller_id2) & ~CALLER_ID_MASK);
+}
+#endif
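
Editor's note: to make the caller_id/caller_id2 encoding concrete, here is a small userspace model of the decode helpers above. It is editorial, not kernel code, and only assumes the same CALLER_ID_MASK convention; it can be compiled and run as-is.

#include <assert.h>
#include <stdint.h>

#define CALLER_ID_MASK 0x80000000u

static uint32_t get_pid(uint32_t id, uint32_t id2)
{
	/* The value without the mask bit is the PID. */
	return (id & CALLER_ID_MASK) ? id2 : id;
}

static uint32_t get_cpu(uint32_t id, uint32_t id2)
{
	/* The value carrying the mask bit is the CPU, with the bit cleared. */
	return ((id & CALLER_ID_MASK) ? id : id2) & ~CALLER_ID_MASK;
}

int main(void)
{
	/* printk() from task context: PID 1234 running on CPU 2. */
	uint32_t id  = 1234;                 /* caller_id  = PID            */
	uint32_t id2 = CALLER_ID_MASK + 2;   /* caller_id2 = 0x80000000+CPU */

	assert(get_pid(id, id2) == 1234);
	assert(get_cpu(id, id2) == 2);

	/* printk() from IRQ context on CPU 5, interrupting PID 42. */
	id  = CALLER_ID_MASK + 5;
	id2 = 42;
	assert(get_pid(id, id2) == 42);
	assert(get_cpu(id, id2) == 5);

	return 0;
}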
+
/**
* printk_parse_prefix - Parse level and control flags.
*
@@ -2215,6 +2243,28 @@ static u16 printk_sprint(char *text, u16 size, int facility,
return text_len;
}
+#ifdef CONFIG_PRINTK_EXECUTION_CTX
+static void printk_store_execution_ctx(struct printk_info *info)
+{
+ info->caller_id2 = printk_caller_id2();
+ get_task_comm(info->comm, current);
+}
+
+static void pmsg_load_execution_ctx(struct printk_message *pmsg,
+ const struct printk_info *info)
+{
+ pmsg->cpu = printk_info_get_cpu(info);
+ pmsg->pid = printk_info_get_pid(info);
+ memcpy(pmsg->comm, info->comm, sizeof(pmsg->comm));
+ static_assert(sizeof(pmsg->comm) == sizeof(info->comm));
+}
+#else
+static void printk_store_execution_ctx(struct printk_info *info) {}
+
+static void pmsg_load_execution_ctx(struct printk_message *pmsg,
+ const struct printk_info *info) {}
+#endif
+
__printf(4, 0)
int vprintk_store(int facility, int level,
const struct dev_printk_info *dev_info,
@@ -2322,6 +2372,7 @@ int vprintk_store(int facility, int level,
r.info->caller_id = caller_id;
if (dev_info)
memcpy(&r.info->dev_info, dev_info, sizeof(r.info->dev_info));
+ printk_store_execution_ctx(r.info);
/* A message without a trailing newline can be continued. */
if (!(flags & LOG_NEWLINE))
@@ -3004,6 +3055,7 @@ bool printk_get_next_message(struct printk_message *pmsg, u64 seq,
pmsg->seq = r.info->seq;
pmsg->dropped = r.info->seq - seq;
force_con = r.info->flags & LOG_FORCE_CON;
+ pmsg_load_execution_ctx(pmsg, r.info);
/*
* Skip records that are not forced to be printed on consoles and that
@@ -3364,22 +3416,6 @@ void console_unlock(void)
}
EXPORT_SYMBOL(console_unlock);
-/**
- * console_conditional_schedule - yield the CPU if required
- *
- * If the console code is currently allowed to sleep, and
- * if this CPU should yield the CPU to another task, do
- * so here.
- *
- * Must be called within console_lock();.
- */
-void __sched console_conditional_schedule(void)
-{
- if (console_may_schedule)
- cond_resched();
-}
-EXPORT_SYMBOL(console_conditional_schedule);
-
void console_unblank(void)
{
bool found_unblank = false;
diff --git a/kernel/printk/printk_ringbuffer.h b/kernel/printk/printk_ringbuffer.h
index 4ef81349d9fb..1651b53ece34 100644
--- a/kernel/printk/printk_ringbuffer.h
+++ b/kernel/printk/printk_ringbuffer.h
@@ -23,6 +23,11 @@ struct printk_info {
u8 flags:5; /* internal record flags */
u8 level:3; /* syslog level */
u32 caller_id; /* thread id or processor id */
+#ifdef CONFIG_PRINTK_EXECUTION_CTX
+ u32 caller_id2; /* caller_id complement */
+ /* name of the task that generated the message */
+ char comm[TASK_COMM_LEN];
+#endif
struct dev_printk_info dev_info;
};
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index c469c708fdd6..66ba6a2f83d3 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -789,7 +789,8 @@ void __srcu_check_read_flavor(struct srcu_struct *ssp, int read_flavor)
struct srcu_data *sdp;
/* NMI-unsafe use in NMI is a bad sign, as is multi-bit read_flavor values. */
- WARN_ON_ONCE((read_flavor != SRCU_READ_FLAVOR_NMI) && in_nmi());
+ WARN_ON_ONCE(read_flavor != SRCU_READ_FLAVOR_NMI &&
+ read_flavor != SRCU_READ_FLAVOR_FAST && in_nmi());
WARN_ON_ONCE(read_flavor & (read_flavor - 1));
sdp = raw_cpu_ptr(ssp->sda);
diff --git a/kernel/resource.c b/kernel/resource.c
index e4e9bac12e6e..31341bdd7707 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -48,6 +48,14 @@ struct resource iomem_resource = {
};
EXPORT_SYMBOL(iomem_resource);
+struct resource soft_reserve_resource = {
+ .name = "Soft Reserved",
+ .start = 0,
+ .end = -1,
+ .desc = IORES_DESC_SOFT_RESERVED,
+ .flags = IORESOURCE_MEM,
+};
+
static DEFINE_RWLOCK(resource_lock);
/*
@@ -82,7 +90,7 @@ static struct resource *next_resource(struct resource *p, bool skip_children,
#ifdef CONFIG_PROC_FS
-enum { MAX_IORES_LEVEL = 5 };
+enum { MAX_IORES_LEVEL = 8 };
static void *r_start(struct seq_file *m, loff_t *pos)
__acquires(resource_lock)
@@ -321,13 +329,14 @@ static bool is_type_match(struct resource *p, unsigned long flags, unsigned long
}
/**
- * find_next_iomem_res - Finds the lowest iomem resource that covers part of
- * [@start..@end].
+ * find_next_res - Finds the lowest resource that covers part of
+ * [@start..@end].
*
* If a resource is found, returns 0 and @*res is overwritten with the part
* of the resource that's within [@start..@end]; if none is found, returns
* -ENODEV. Returns -EINVAL for invalid parameters.
*
+ * @parent: resource tree root to search
* @start: start address of the resource searched for
* @end: end address of same resource
* @flags: flags which the resource must have
@@ -337,9 +346,9 @@ static bool is_type_match(struct resource *p, unsigned long flags, unsigned long
* The caller must specify @start, @end, @flags, and @desc
* (which may be IORES_DESC_NONE).
*/
-static int find_next_iomem_res(resource_size_t start, resource_size_t end,
- unsigned long flags, unsigned long desc,
- struct resource *res)
+static int find_next_res(struct resource *parent, resource_size_t start,
+ resource_size_t end, unsigned long flags,
+ unsigned long desc, struct resource *res)
{
/* Skip children until we find a top level range that matches */
bool skip_children = true;
@@ -353,7 +362,7 @@ static int find_next_iomem_res(resource_size_t start, resource_size_t end,
read_lock(&resource_lock);
- for_each_resource(&iomem_resource, p, skip_children) {
+ for_each_resource(parent, p, skip_children) {
/* If we passed the resource we are looking for, stop */
if (p->start > end) {
p = NULL;
@@ -390,16 +399,23 @@ static int find_next_iomem_res(resource_size_t start, resource_size_t end,
return p ? 0 : -ENODEV;
}
-static int __walk_iomem_res_desc(resource_size_t start, resource_size_t end,
- unsigned long flags, unsigned long desc,
- void *arg,
- int (*func)(struct resource *, void *))
+static int find_next_iomem_res(resource_size_t start, resource_size_t end,
+ unsigned long flags, unsigned long desc,
+ struct resource *res)
+{
+ return find_next_res(&iomem_resource, start, end, flags, desc, res);
+}
+
+static int walk_res_desc(struct resource *parent, resource_size_t start,
+ resource_size_t end, unsigned long flags,
+ unsigned long desc, void *arg,
+ int (*func)(struct resource *, void *))
{
struct resource res;
int ret = -EINVAL;
while (start < end &&
- !find_next_iomem_res(start, end, flags, desc, &res)) {
+ !find_next_res(parent, start, end, flags, desc, &res)) {
ret = (*func)(&res, arg);
if (ret)
break;
@@ -410,6 +426,15 @@ static int __walk_iomem_res_desc(resource_size_t start, resource_size_t end,
return ret;
}
+static int __walk_iomem_res_desc(resource_size_t start, resource_size_t end,
+ unsigned long flags, unsigned long desc,
+ void *arg,
+ int (*func)(struct resource *, void *))
+{
+ return walk_res_desc(&iomem_resource, start, end, flags, desc, arg, func);
+}
+
/**
* walk_iomem_res_desc - Walks through iomem resources and calls func()
* with matching resource ranges.
@@ -435,6 +460,18 @@ int walk_iomem_res_desc(unsigned long desc, unsigned long flags, u64 start,
EXPORT_SYMBOL_GPL(walk_iomem_res_desc);
/*
+ * In support of device drivers claiming Soft Reserved resources, walk the Soft
+ * Reserved resource deferral tree.
+ */
+int walk_soft_reserve_res(u64 start, u64 end, void *arg,
+ int (*func)(struct resource *, void *))
+{
+ return walk_res_desc(&soft_reserve_resource, start, end, IORESOURCE_MEM,
+ IORES_DESC_SOFT_RESERVED, arg, func);
+}
+EXPORT_SYMBOL_GPL(walk_soft_reserve_res);
+
+/*
* This function calls the @func callback against all memory ranges of type
 * System RAM which are marked as IORESOURCE_SYSTEM_RAM and IORESOURCE_BUSY.
* Now, this function is only for System RAM, it deals with full ranges and
@@ -656,6 +693,18 @@ int region_intersects(resource_size_t start, size_t size, unsigned long flags,
}
EXPORT_SYMBOL_GPL(region_intersects);
+/*
+ * Check if the provided range is registered in the Soft Reserved resource
+ * deferral tree for driver consideration.
+ */
+int region_intersects_soft_reserve(resource_size_t start, size_t size)
+{
+ guard(read_lock)(&resource_lock);
+ return __region_intersects(&soft_reserve_resource, start, size,
+ IORESOURCE_MEM, IORES_DESC_SOFT_RESERVED);
+}
+EXPORT_SYMBOL_GPL(region_intersects_soft_reserve);
+
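
Editor's note: a rough sketch of the intended driver-side usage of the new walker. The callback and the base/size window are hypothetical; only walk_soft_reserve_res() itself comes from this patch.

/* Editorial sketch: enumerate deferred Soft Reserved ranges in a window. */
static int note_soft_reserved(struct resource *res, void *arg)
{
	unsigned int *hits = arg;

	(*hits)++;
	pr_info("deferred Soft Reserved range: %pR\n", res);
	return 0;	/* a non-zero return would stop the walk */
}

static unsigned int count_soft_reserved(u64 base, u64 size)
{
	unsigned int hits = 0;

	walk_soft_reserve_res(base, base + size - 1, &hits, note_soft_reserved);
	return hits;
}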
void __weak arch_remove_reservations(struct resource *avail)
{
}
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 7c8b769c0d0d..759777694c78 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -10788,10 +10788,9 @@ void sched_mm_cid_exit(struct task_struct *t)
return;
/*
* Mode change. The task has the CID unset
- * already. The CPU CID is still valid and
- * does not have MM_CID_TRANSIT set as the
- * mode change has just taken effect under
- * mm::mm_cid::lock. Drop it.
+ * already and dealt with an eventually set
+ * TRANSIT bit. If the CID is owned by the CPU
+ * then drop it.
*/
mm_drop_cid_on_cpu(mm, this_cpu_ptr(mm->mm_cid.pcpu));
}
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index e6bf73456176..c18e81e8ef51 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -3520,8 +3520,8 @@ static void destroy_dsq(struct scx_sched *sch, u64 dsq_id)
* operations inside scheduler locks.
*/
dsq->id = SCX_DSQ_INVALID;
- llist_add(&dsq->free_node, &dsqs_to_free);
- irq_work_queue(&free_dsq_irq_work);
+ if (llist_add(&dsq->free_node, &dsqs_to_free))
+ irq_work_queue(&free_dsq_irq_work);
out_unlock_dsq:
raw_spin_unlock_irqrestore(&dsq->lock, flags);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index e51bfa3586fa..b82fb70a9d54 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -3813,8 +3813,10 @@ static __always_inline void mm_unset_cid_on_task(struct task_struct *t)
static __always_inline void mm_drop_cid_on_cpu(struct mm_struct *mm, struct mm_cid_pcpu *pcp)
{
/* Clear the ONCPU bit, but do not set UNSET in the per CPU storage */
- pcp->cid = cpu_cid_to_cid(pcp->cid);
- mm_drop_cid(mm, pcp->cid);
+ if (cid_on_cpu(pcp->cid)) {
+ pcp->cid = cpu_cid_to_cid(pcp->cid);
+ mm_drop_cid(mm, pcp->cid);
+ }
}
static inline unsigned int __mm_get_cid(struct mm_struct *mm, unsigned int max_cids)
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index c903f1a42891..a612cf253c87 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -253,8 +253,10 @@ static inline void sched_info_dequeue(struct rq *rq, struct task_struct *t)
delta = rq_clock(rq) - t->sched_info.last_queued;
t->sched_info.last_queued = 0;
t->sched_info.run_delay += delta;
- if (delta > t->sched_info.max_run_delay)
+ if (delta > t->sched_info.max_run_delay) {
t->sched_info.max_run_delay = delta;
+ ktime_get_real_ts64(&t->sched_info.max_run_delay_ts);
+ }
if (delta && (!t->sched_info.min_run_delay || delta < t->sched_info.min_run_delay))
t->sched_info.min_run_delay = delta;
rq_sched_info_dequeue(rq, delta);
@@ -278,8 +280,10 @@ static void sched_info_arrive(struct rq *rq, struct task_struct *t)
t->sched_info.run_delay += delta;
t->sched_info.last_arrival = now;
t->sched_info.pcount++;
- if (delta > t->sched_info.max_run_delay)
+ if (delta > t->sched_info.max_run_delay) {
t->sched_info.max_run_delay = delta;
+ ktime_get_real_ts64(&t->sched_info.max_run_delay_ts);
+ }
if (delta && (!t->sched_info.min_run_delay || delta < t->sched_info.min_run_delay))
t->sched_info.min_run_delay = delta;
diff --git a/kernel/sys.c b/kernel/sys.c
index 35ea9d79a42e..c86eba9aa7e9 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2388,6 +2388,21 @@ int __weak arch_lock_shadow_stack_status(struct task_struct *t, unsigned long st
return -EINVAL;
}
+int __weak arch_get_indir_br_lp_status(struct task_struct *t, unsigned long __user *status)
+{
+ return -EINVAL;
+}
+
+int __weak arch_set_indir_br_lp_status(struct task_struct *t, unsigned long status)
+{
+ return -EINVAL;
+}
+
+int __weak arch_lock_indir_br_lp_status(struct task_struct *t, unsigned long status)
+{
+ return -EINVAL;
+}
+
#define PR_IO_FLUSHER (PF_MEMALLOC_NOIO | PF_LOCAL_THROTTLE)
static int prctl_set_vma(unsigned long opt, unsigned long addr,
@@ -2873,6 +2888,21 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
return -EINVAL;
error = rseq_slice_extension_prctl(arg2, arg3);
break;
+ case PR_GET_INDIR_BR_LP_STATUS:
+ if (arg3 || arg4 || arg5)
+ return -EINVAL;
+ error = arch_get_indir_br_lp_status(me, (unsigned long __user *)arg2);
+ break;
+ case PR_SET_INDIR_BR_LP_STATUS:
+ if (arg3 || arg4 || arg5)
+ return -EINVAL;
+ error = arch_set_indir_br_lp_status(me, arg2);
+ break;
+ case PR_LOCK_INDIR_BR_LP_STATUS:
+ if (arg3 || arg4 || arg5)
+ return -EINVAL;
+ error = arch_lock_indir_br_lp_status(me, arg2);
+ break;
default:
trace_task_prctl_unknown(option, arg2, arg3, arg4, arg5);
error = -EINVAL;
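
Editor's note: an assumed userspace usage sketch for the new prctl options. The PR_* constants come from the uapi side of this series and need headers new enough to define them; on kernels without the arch hooks, both calls fail with EINVAL via the weak defaults above.

#include <stdio.h>
#include <sys/prctl.h>

int main(void)
{
	unsigned long status = 0;

	if (prctl(PR_GET_INDIR_BR_LP_STATUS, &status, 0, 0, 0))
		perror("PR_GET_INDIR_BR_LP_STATUS");
	else
		printf("landing-pad status: %#lx\n", status);

	/* arg3..arg5 must be zero, otherwise the kernel returns -EINVAL. */
	if (prctl(PR_LOCK_INDIR_BR_LP_STATUS, status, 0, 0, 0))
		perror("PR_LOCK_INDIR_BR_LP_STATUS");

	return 0;
}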
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index d7042a09fe46..49de13cae428 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -136,6 +136,7 @@ config BUILDTIME_MCOUNT_SORT
config TRACER_MAX_TRACE
bool
+ select TRACER_SNAPSHOT
config TRACE_CLOCK
bool
@@ -425,7 +426,6 @@ config IRQSOFF_TRACER
select GENERIC_TRACER
select TRACER_MAX_TRACE
select RING_BUFFER_ALLOW_SWAP
- select TRACER_SNAPSHOT
select TRACER_SNAPSHOT_PER_CPU_SWAP
help
This option measures the time spent in irqs-off critical
@@ -448,7 +448,6 @@ config PREEMPT_TRACER
select GENERIC_TRACER
select TRACER_MAX_TRACE
select RING_BUFFER_ALLOW_SWAP
- select TRACER_SNAPSHOT
select TRACER_SNAPSHOT_PER_CPU_SWAP
select TRACE_PREEMPT_TOGGLE
help
@@ -470,7 +469,6 @@ config SCHED_TRACER
select GENERIC_TRACER
select CONTEXT_SWITCH_TRACER
select TRACER_MAX_TRACE
- select TRACER_SNAPSHOT
help
This tracer tracks the latency of the highest priority task
to be scheduled in, starting from the point it has woken up.
@@ -620,7 +618,6 @@ config TRACE_SYSCALL_BUF_SIZE_DEFAULT
config TRACER_SNAPSHOT
bool "Create a snapshot trace buffer"
- select TRACER_MAX_TRACE
help
Allow tracing users to take snapshot of the current buffer using the
ftrace interface, e.g.:
@@ -628,6 +625,9 @@ config TRACER_SNAPSHOT
echo 1 > /sys/kernel/tracing/snapshot
cat snapshot
+ Note, the latency tracers select this option. To disable it,
+ all the latency tracers need to be disabled.
+
config TRACER_SNAPSHOT_PER_CPU_SWAP
bool "Allow snapshot to swap per CPU"
depends on TRACER_SNAPSHOT
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index fc5dcc888e13..04096c21d06b 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -68,6 +68,7 @@ obj-$(CONFIG_TRACING) += trace_output.o
obj-$(CONFIG_TRACING) += trace_seq.o
obj-$(CONFIG_TRACING) += trace_stat.o
obj-$(CONFIG_TRACING) += trace_printk.o
+obj-$(CONFIG_TRACING) += trace_pid.o
obj-$(CONFIG_TRACING) += pid_list.o
obj-$(CONFIG_TRACING_MAP) += tracing_map.o
obj-$(CONFIG_PREEMPTIRQ_DELAY_TEST) += preemptirq_delay_test.o
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index c4db5c2e7103..f2de9cf15d0e 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -1832,7 +1832,9 @@ static struct trace_event trace_blk_event = {
.funcs = &trace_blk_event_funcs,
};
-static int __init init_blk_tracer(void)
+static struct work_struct blktrace_works __initdata;
+
+static int __init __init_blk_tracer(void)
{
if (!register_trace_event(&trace_blk_event)) {
pr_warn("Warning: could not register block events\n");
@@ -1852,6 +1854,25 @@ static int __init init_blk_tracer(void)
return 0;
}
+static void __init blktrace_works_func(struct work_struct *work)
+{
+ __init_blk_tracer();
+}
+
+static int __init init_blk_tracer(void)
+{
+ int ret = 0;
+
+ if (trace_init_wq) {
+ INIT_WORK(&blktrace_works, blktrace_works_func);
+ queue_work(trace_init_wq, &blktrace_works);
+ } else {
+ ret = __init_blk_tracer();
+ }
+
+ return ret;
+}
+
device_initcall(init_blk_tracer);
static int blk_trace_remove_queue(struct request_queue *q)
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index f7baeb8278ca..eadaef8592a3 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -2076,7 +2076,7 @@ void __bpf_trace_run(struct bpf_raw_tp_link *link, u64 *args)
struct bpf_run_ctx *old_run_ctx;
struct bpf_trace_run_ctx run_ctx;
- cant_sleep();
+ rcu_read_lock_dont_migrate();
if (unlikely(!bpf_prog_get_recursion_context(prog))) {
bpf_prog_inc_misses_counter(prog);
goto out;
@@ -2085,13 +2085,12 @@ void __bpf_trace_run(struct bpf_raw_tp_link *link, u64 *args)
run_ctx.bpf_cookie = link->cookie;
old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
- rcu_read_lock();
(void) bpf_prog_run(prog, args);
- rcu_read_unlock();
bpf_reset_run_ctx(old_run_ctx);
out:
bpf_prog_put_recursion_context(prog);
+ rcu_read_unlock_migrate();
}
#define UNPACK(...) __VA_ARGS__
diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c
index cc48d16be43e..4df766c690f9 100644
--- a/kernel/trace/fgraph.c
+++ b/kernel/trace/fgraph.c
@@ -1303,7 +1303,7 @@ static void ftrace_graph_enable_direct(bool enable_branch, struct fgraph_ops *go
static_call_update(fgraph_func, func);
static_call_update(fgraph_retfunc, retfunc);
if (enable_branch)
- static_branch_disable(&fgraph_do_direct);
+ static_branch_enable(&fgraph_do_direct);
}
static void ftrace_graph_disable_direct(bool disable_branch)
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index f9b10c633bdd..1ce17c8af409 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1147,6 +1147,7 @@ struct ftrace_page {
};
#define ENTRY_SIZE sizeof(struct dyn_ftrace)
+#define ENTRIES_PER_PAGE_GROUP(order) ((PAGE_SIZE << (order)) / ENTRY_SIZE)
static struct ftrace_page *ftrace_pages_start;
static struct ftrace_page *ftrace_pages;
@@ -3873,7 +3874,7 @@ static int ftrace_allocate_records(struct ftrace_page *pg, int count,
*num_pages += 1 << order;
ftrace_number_of_groups++;
- cnt = (PAGE_SIZE << order) / ENTRY_SIZE;
+ cnt = ENTRIES_PER_PAGE_GROUP(order);
pg->order = order;
if (cnt > count)
@@ -7668,7 +7669,7 @@ static int ftrace_process_locs(struct module *mod,
long skip;
/* Count the number of entries unused and compare it to skipped. */
- pg_remaining = (PAGE_SIZE << pg->order) / ENTRY_SIZE - pg->index;
+ pg_remaining = ENTRIES_PER_PAGE_GROUP(pg->order) - pg->index;
if (!WARN(skipped < pg_remaining, "Extra allocated pages for ftrace")) {
@@ -7676,7 +7677,7 @@ static int ftrace_process_locs(struct module *mod,
for (pg = pg_unuse; pg && skip > 0; pg = pg->next) {
remaining += 1 << pg->order;
- skip -= (PAGE_SIZE << pg->order) / ENTRY_SIZE;
+ skip -= ENTRIES_PER_PAGE_GROUP(pg->order);
}
pages -= remaining;
@@ -8112,7 +8113,8 @@ ftrace_func_address_lookup(struct ftrace_mod_map *mod_map,
int
ftrace_mod_address_lookup(unsigned long addr, unsigned long *size,
- unsigned long *off, char **modname, char *sym)
+ unsigned long *off, char **modname,
+ const unsigned char **modbuildid, char *sym)
{
struct ftrace_mod_map *mod_map;
int ret = 0;
@@ -8124,6 +8126,8 @@ ftrace_mod_address_lookup(unsigned long addr, unsigned long *size,
if (ret) {
if (modname)
*modname = mod_map->mod->name;
+ if (modbuildid)
+ *modbuildid = module_buildid(mod_map->mod);
break;
}
}
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 630221b00838..d33103408955 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -4,6 +4,7 @@
*
* Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com>
*/
+#include <linux/sched/isolation.h>
#include <linux/trace_recursion.h>
#include <linux/trace_events.h>
#include <linux/ring_buffer.h>
@@ -4013,19 +4014,36 @@ static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer)
rb_end_commit(cpu_buffer);
}
+static bool
+rb_irq_work_queue(struct rb_irq_work *irq_work)
+{
+ int cpu;
+
+ /* irq_work_queue_on() is not NMI-safe */
+ if (unlikely(in_nmi()))
+ return irq_work_queue(&irq_work->work);
+
+ /*
+ * If CPU isolation is not active, cpu is always the current
+ * CPU, and the following is equivallent to irq_work_queue().
+ */
+ cpu = housekeeping_any_cpu(HK_TYPE_KERNEL_NOISE);
+ return irq_work_queue_on(&irq_work->work, cpu);
+}
+
static __always_inline void
rb_wakeups(struct trace_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer)
{
if (buffer->irq_work.waiters_pending) {
buffer->irq_work.waiters_pending = false;
		/* irq_work_queue() supplies its own memory barriers */
- irq_work_queue(&buffer->irq_work.work);
+ rb_irq_work_queue(&buffer->irq_work);
}
if (cpu_buffer->irq_work.waiters_pending) {
cpu_buffer->irq_work.waiters_pending = false;
		/* irq_work_queue() supplies its own memory barriers */
- irq_work_queue(&cpu_buffer->irq_work.work);
+ rb_irq_work_queue(&cpu_buffer->irq_work);
}
if (cpu_buffer->last_pages_touch == local_read(&cpu_buffer->pages_touched))
@@ -4045,7 +4063,7 @@ rb_wakeups(struct trace_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer)
cpu_buffer->irq_work.wakeup_full = true;
cpu_buffer->irq_work.full_waiters_pending = false;
		/* irq_work_queue() supplies its own memory barriers */
- irq_work_queue(&cpu_buffer->irq_work.work);
+ rb_irq_work_queue(&cpu_buffer->irq_work);
}
#ifdef CONFIG_RING_BUFFER_RECORD_RECURSION
diff --git a/kernel/trace/rv/monitors/nrp/nrp.c b/kernel/trace/rv/monitors/nrp/nrp.c
index 5a83b7171432..4b5646a70094 100644
--- a/kernel/trace/rv/monitors/nrp/nrp.c
+++ b/kernel/trace/rv/monitors/nrp/nrp.c
@@ -6,7 +6,6 @@
#include <linux/init.h>
#include <linux/rv.h>
#include <rv/instrumentation.h>
-#include <rv/da_monitor.h>
#define MODULE_NAME "nrp"
@@ -15,17 +14,16 @@
#include <rv_trace.h>
#include <monitors/sched/sched.h>
+#define RV_MON_TYPE RV_MON_PER_TASK
#include "nrp.h"
-
-static struct rv_monitor rv_nrp;
-DECLARE_DA_MON_PER_TASK(nrp, unsigned char);
+#include <rv/da_monitor.h>
#ifdef CONFIG_X86_LOCAL_APIC
#include <asm/trace/irq_vectors.h>
static void handle_vector_irq_entry(void *data, int vector)
{
- da_handle_event_nrp(current, irq_entry_nrp);
+ da_handle_event(current, irq_entry_nrp);
}
static void attach_vector_irq(void)
@@ -60,7 +58,7 @@ static void detach_vector_irq(void) { }
static void handle_irq_entry(void *data, int irq, struct irqaction *action)
{
- da_handle_event_nrp(current, irq_entry_nrp);
+ da_handle_event(current, irq_entry_nrp);
}
static void handle_sched_need_resched(void *data, struct task_struct *tsk,
@@ -72,22 +70,22 @@ static void handle_sched_need_resched(void *data, struct task_struct *tsk,
* which may not mirror the system state but makes the monitor simpler,
*/
if (tif == TIF_NEED_RESCHED)
- da_handle_start_event_nrp(tsk, sched_need_resched_nrp);
+ da_handle_start_event(tsk, sched_need_resched_nrp);
}
static void handle_schedule_entry(void *data, bool preempt)
{
if (preempt)
- da_handle_event_nrp(current, schedule_entry_preempt_nrp);
+ da_handle_event(current, schedule_entry_preempt_nrp);
else
- da_handle_event_nrp(current, schedule_entry_nrp);
+ da_handle_event(current, schedule_entry_nrp);
}
static int enable_nrp(void)
{
int retval;
- retval = da_monitor_init_nrp();
+ retval = da_monitor_init();
if (retval)
return retval;
@@ -101,33 +99,33 @@ static int enable_nrp(void)
static void disable_nrp(void)
{
- rv_nrp.enabled = 0;
+ rv_this.enabled = 0;
rv_detach_trace_probe("nrp", irq_handler_entry, handle_irq_entry);
rv_detach_trace_probe("nrp", sched_set_need_resched_tp, handle_sched_need_resched);
rv_detach_trace_probe("nrp", sched_entry_tp, handle_schedule_entry);
detach_vector_irq();
- da_monitor_destroy_nrp();
+ da_monitor_destroy();
}
-static struct rv_monitor rv_nrp = {
+static struct rv_monitor rv_this = {
.name = "nrp",
.description = "need resched preempts.",
.enable = enable_nrp,
.disable = disable_nrp,
- .reset = da_monitor_reset_all_nrp,
+ .reset = da_monitor_reset_all,
.enabled = 0,
};
static int __init register_nrp(void)
{
- return rv_register_monitor(&rv_nrp, &rv_sched);
+ return rv_register_monitor(&rv_this, &rv_sched);
}
static void __exit unregister_nrp(void)
{
- rv_unregister_monitor(&rv_nrp);
+ rv_unregister_monitor(&rv_this);
}
module_init(register_nrp);
diff --git a/kernel/trace/rv/monitors/nrp/nrp.h b/kernel/trace/rv/monitors/nrp/nrp.h
index c9f12207cbf6..3270d4c0139f 100644
--- a/kernel/trace/rv/monitors/nrp/nrp.h
+++ b/kernel/trace/rv/monitors/nrp/nrp.h
@@ -5,22 +5,24 @@
* Documentation/trace/rv/deterministic_automata.rst
*/
+#define MONITOR_NAME nrp
+
enum states_nrp {
- preempt_irq_nrp = 0,
+ preempt_irq_nrp,
any_thread_running_nrp,
nested_preempt_nrp,
rescheduling_nrp,
- state_max_nrp
+ state_max_nrp,
};
#define INVALID_STATE state_max_nrp
enum events_nrp {
- irq_entry_nrp = 0,
+ irq_entry_nrp,
sched_need_resched_nrp,
schedule_entry_nrp,
schedule_entry_preempt_nrp,
- event_max_nrp
+ event_max_nrp,
};
struct automaton_nrp {
@@ -36,38 +38,38 @@ static const struct automaton_nrp automaton_nrp = {
"preempt_irq",
"any_thread_running",
"nested_preempt",
- "rescheduling"
+ "rescheduling",
},
.event_names = {
"irq_entry",
"sched_need_resched",
"schedule_entry",
- "schedule_entry_preempt"
+ "schedule_entry_preempt",
},
.function = {
{
preempt_irq_nrp,
preempt_irq_nrp,
nested_preempt_nrp,
- nested_preempt_nrp
+ nested_preempt_nrp,
},
{
any_thread_running_nrp,
rescheduling_nrp,
any_thread_running_nrp,
- INVALID_STATE
+ INVALID_STATE,
},
{
nested_preempt_nrp,
preempt_irq_nrp,
any_thread_running_nrp,
- any_thread_running_nrp
+ any_thread_running_nrp,
},
{
preempt_irq_nrp,
rescheduling_nrp,
any_thread_running_nrp,
- any_thread_running_nrp
+ any_thread_running_nrp,
},
},
.initial_state = preempt_irq_nrp,
diff --git a/kernel/trace/rv/monitors/opid/opid.c b/kernel/trace/rv/monitors/opid/opid.c
index 50d64e7fb8c4..25a40e90fa40 100644
--- a/kernel/trace/rv/monitors/opid/opid.c
+++ b/kernel/trace/rv/monitors/opid/opid.c
@@ -6,7 +6,6 @@
#include <linux/init.h>
#include <linux/rv.h>
#include <rv/instrumentation.h>
-#include <rv/da_monitor.h>
#define MODULE_NAME "opid"
@@ -16,17 +15,16 @@
#include <rv_trace.h>
#include <monitors/sched/sched.h>
+#define RV_MON_TYPE RV_MON_PER_CPU
#include "opid.h"
-
-static struct rv_monitor rv_opid;
-DECLARE_DA_MON_PER_CPU(opid, unsigned char);
+#include <rv/da_monitor.h>
#ifdef CONFIG_X86_LOCAL_APIC
#include <asm/trace/irq_vectors.h>
static void handle_vector_irq_entry(void *data, int vector)
{
- da_handle_event_opid(irq_entry_opid);
+ da_handle_event(irq_entry_opid);
}
static void attach_vector_irq(void)
@@ -61,52 +59,52 @@ static void detach_vector_irq(void) { }
static void handle_irq_disable(void *data, unsigned long ip, unsigned long parent_ip)
{
- da_handle_event_opid(irq_disable_opid);
+ da_handle_event(irq_disable_opid);
}
static void handle_irq_enable(void *data, unsigned long ip, unsigned long parent_ip)
{
- da_handle_event_opid(irq_enable_opid);
+ da_handle_event(irq_enable_opid);
}
static void handle_irq_entry(void *data, int irq, struct irqaction *action)
{
- da_handle_event_opid(irq_entry_opid);
+ da_handle_event(irq_entry_opid);
}
static void handle_preempt_disable(void *data, unsigned long ip, unsigned long parent_ip)
{
- da_handle_event_opid(preempt_disable_opid);
+ da_handle_event(preempt_disable_opid);
}
static void handle_preempt_enable(void *data, unsigned long ip, unsigned long parent_ip)
{
- da_handle_event_opid(preempt_enable_opid);
+ da_handle_event(preempt_enable_opid);
}
static void handle_sched_need_resched(void *data, struct task_struct *tsk, int cpu, int tif)
{
	/* The monitor's initial state is not in_irq */
if (this_cpu_read(hardirq_context))
- da_handle_event_opid(sched_need_resched_opid);
+ da_handle_event(sched_need_resched_opid);
else
- da_handle_start_event_opid(sched_need_resched_opid);
+ da_handle_start_event(sched_need_resched_opid);
}
static void handle_sched_waking(void *data, struct task_struct *p)
{
	/* The monitor's initial state is not in_irq */
if (this_cpu_read(hardirq_context))
- da_handle_event_opid(sched_waking_opid);
+ da_handle_event(sched_waking_opid);
else
- da_handle_start_event_opid(sched_waking_opid);
+ da_handle_start_event(sched_waking_opid);
}
static int enable_opid(void)
{
int retval;
- retval = da_monitor_init_opid();
+ retval = da_monitor_init();
if (retval)
return retval;
@@ -124,7 +122,7 @@ static int enable_opid(void)
static void disable_opid(void)
{
- rv_opid.enabled = 0;
+ rv_this.enabled = 0;
rv_detach_trace_probe("opid", irq_disable, handle_irq_disable);
rv_detach_trace_probe("opid", irq_enable, handle_irq_enable);
@@ -135,29 +133,29 @@ static void disable_opid(void)
rv_detach_trace_probe("opid", sched_waking, handle_sched_waking);
detach_vector_irq();
- da_monitor_destroy_opid();
+ da_monitor_destroy();
}
/*
* This is the monitor register section.
*/
-static struct rv_monitor rv_opid = {
+static struct rv_monitor rv_this = {
.name = "opid",
.description = "operations with preemption and irq disabled.",
.enable = enable_opid,
.disable = disable_opid,
- .reset = da_monitor_reset_all_opid,
+ .reset = da_monitor_reset_all,
.enabled = 0,
};
static int __init register_opid(void)
{
- return rv_register_monitor(&rv_opid, &rv_sched);
+ return rv_register_monitor(&rv_this, &rv_sched);
}
static void __exit unregister_opid(void)
{
- rv_unregister_monitor(&rv_opid);
+ rv_unregister_monitor(&rv_this);
}
module_init(register_opid);
diff --git a/kernel/trace/rv/monitors/opid/opid.h b/kernel/trace/rv/monitors/opid/opid.h
index b4b8c2ff7f64..092992514970 100644
--- a/kernel/trace/rv/monitors/opid/opid.h
+++ b/kernel/trace/rv/monitors/opid/opid.h
@@ -5,26 +5,28 @@
* Documentation/trace/rv/deterministic_automata.rst
*/
+#define MONITOR_NAME opid
+
enum states_opid {
- disabled_opid = 0,
+ disabled_opid,
enabled_opid,
in_irq_opid,
irq_disabled_opid,
preempt_disabled_opid,
- state_max_opid
+ state_max_opid,
};
#define INVALID_STATE state_max_opid
enum events_opid {
- irq_disable_opid = 0,
+ irq_disable_opid,
irq_enable_opid,
irq_entry_opid,
preempt_disable_opid,
preempt_enable_opid,
sched_need_resched_opid,
sched_waking_opid,
- event_max_opid
+ event_max_opid,
};
struct automaton_opid {
@@ -41,7 +43,7 @@ static const struct automaton_opid automaton_opid = {
"enabled",
"in_irq",
"irq_disabled",
- "preempt_disabled"
+ "preempt_disabled",
},
.event_names = {
"irq_disable",
@@ -50,7 +52,7 @@ static const struct automaton_opid automaton_opid = {
"preempt_disable",
"preempt_enable",
"sched_need_resched",
- "sched_waking"
+ "sched_waking",
},
.function = {
{
@@ -60,7 +62,7 @@ static const struct automaton_opid automaton_opid = {
INVALID_STATE,
irq_disabled_opid,
disabled_opid,
- disabled_opid
+ disabled_opid,
},
{
irq_disabled_opid,
@@ -69,7 +71,7 @@ static const struct automaton_opid automaton_opid = {
preempt_disabled_opid,
enabled_opid,
INVALID_STATE,
- INVALID_STATE
+ INVALID_STATE,
},
{
INVALID_STATE,
@@ -78,7 +80,7 @@ static const struct automaton_opid automaton_opid = {
INVALID_STATE,
INVALID_STATE,
in_irq_opid,
- in_irq_opid
+ in_irq_opid,
},
{
INVALID_STATE,
@@ -87,7 +89,7 @@ static const struct automaton_opid automaton_opid = {
disabled_opid,
INVALID_STATE,
irq_disabled_opid,
- INVALID_STATE
+ INVALID_STATE,
},
{
disabled_opid,
@@ -96,7 +98,7 @@ static const struct automaton_opid automaton_opid = {
INVALID_STATE,
enabled_opid,
INVALID_STATE,
- INVALID_STATE
+ INVALID_STATE,
},
},
.initial_state = disabled_opid,
diff --git a/kernel/trace/rv/monitors/rtapp/rtapp.c b/kernel/trace/rv/monitors/rtapp/rtapp.c
index fd75fc927d65..17f271231c99 100644
--- a/kernel/trace/rv/monitors/rtapp/rtapp.c
+++ b/kernel/trace/rv/monitors/rtapp/rtapp.c
@@ -8,8 +8,6 @@
#include "rtapp.h"
-struct rv_monitor rv_rtapp;
-
struct rv_monitor rv_rtapp = {
.name = "rtapp",
.description = "Collection of monitors for detecting problems with real-time applications",
diff --git a/kernel/trace/rv/monitors/sched/sched.c b/kernel/trace/rv/monitors/sched/sched.c
index d04db4b543f9..dd9d96fc6e21 100644
--- a/kernel/trace/rv/monitors/sched/sched.c
+++ b/kernel/trace/rv/monitors/sched/sched.c
@@ -8,8 +8,6 @@
#include "sched.h"
-struct rv_monitor rv_sched;
-
struct rv_monitor rv_sched = {
.name = "sched",
.description = "container for several scheduler monitor specifications.",
diff --git a/kernel/trace/rv/monitors/sco/sco.c b/kernel/trace/rv/monitors/sco/sco.c
index 04c36405e2e3..5a3bd5e16e62 100644
--- a/kernel/trace/rv/monitors/sco/sco.c
+++ b/kernel/trace/rv/monitors/sco/sco.c
@@ -6,7 +6,6 @@
#include <linux/init.h>
#include <linux/rv.h>
#include <rv/instrumentation.h>
-#include <rv/da_monitor.h>
#define MODULE_NAME "sco"
@@ -14,31 +13,30 @@
#include <rv_trace.h>
#include <monitors/sched/sched.h>
+#define RV_MON_TYPE RV_MON_PER_CPU
#include "sco.h"
-
-static struct rv_monitor rv_sco;
-DECLARE_DA_MON_PER_CPU(sco, unsigned char);
+#include <rv/da_monitor.h>
static void handle_sched_set_state(void *data, struct task_struct *tsk, int state)
{
- da_handle_start_event_sco(sched_set_state_sco);
+ da_handle_start_event(sched_set_state_sco);
}
static void handle_schedule_entry(void *data, bool preempt)
{
- da_handle_event_sco(schedule_entry_sco);
+ da_handle_event(schedule_entry_sco);
}
static void handle_schedule_exit(void *data, bool is_switch)
{
- da_handle_start_event_sco(schedule_exit_sco);
+ da_handle_start_event(schedule_exit_sco);
}
static int enable_sco(void)
{
int retval;
- retval = da_monitor_init_sco();
+ retval = da_monitor_init();
if (retval)
return retval;
@@ -51,32 +49,32 @@ static int enable_sco(void)
static void disable_sco(void)
{
- rv_sco.enabled = 0;
+ rv_this.enabled = 0;
rv_detach_trace_probe("sco", sched_set_state_tp, handle_sched_set_state);
rv_detach_trace_probe("sco", sched_entry_tp, handle_schedule_entry);
rv_detach_trace_probe("sco", sched_exit_tp, handle_schedule_exit);
- da_monitor_destroy_sco();
+ da_monitor_destroy();
}
-static struct rv_monitor rv_sco = {
+static struct rv_monitor rv_this = {
.name = "sco",
.description = "scheduling context operations.",
.enable = enable_sco,
.disable = disable_sco,
- .reset = da_monitor_reset_all_sco,
+ .reset = da_monitor_reset_all,
.enabled = 0,
};
static int __init register_sco(void)
{
- return rv_register_monitor(&rv_sco, &rv_sched);
+ return rv_register_monitor(&rv_this, &rv_sched);
}
static void __exit unregister_sco(void)
{
- rv_unregister_monitor(&rv_sco);
+ rv_unregister_monitor(&rv_this);
}
module_init(register_sco);
diff --git a/kernel/trace/rv/monitors/sco/sco.h b/kernel/trace/rv/monitors/sco/sco.h
index 7a4c1f2d5ca1..bac3beb51e72 100644
--- a/kernel/trace/rv/monitors/sco/sco.h
+++ b/kernel/trace/rv/monitors/sco/sco.h
@@ -5,19 +5,21 @@
* Documentation/trace/rv/deterministic_automata.rst
*/
+#define MONITOR_NAME sco
+
enum states_sco {
- thread_context_sco = 0,
+ thread_context_sco,
scheduling_context_sco,
- state_max_sco
+ state_max_sco,
};
#define INVALID_STATE state_max_sco
enum events_sco {
- sched_set_state_sco = 0,
+ sched_set_state_sco,
schedule_entry_sco,
schedule_exit_sco,
- event_max_sco
+ event_max_sco,
};
struct automaton_sco {
@@ -31,12 +33,12 @@ struct automaton_sco {
static const struct automaton_sco automaton_sco = {
.state_names = {
"thread_context",
- "scheduling_context"
+ "scheduling_context",
},
.event_names = {
"sched_set_state",
"schedule_entry",
- "schedule_exit"
+ "schedule_exit",
},
.function = {
{ thread_context_sco, scheduling_context_sco, INVALID_STATE },
diff --git a/kernel/trace/rv/monitors/scpd/scpd.c b/kernel/trace/rv/monitors/scpd/scpd.c
index 1e351ba52fee..83b48627dc9f 100644
--- a/kernel/trace/rv/monitors/scpd/scpd.c
+++ b/kernel/trace/rv/monitors/scpd/scpd.c
@@ -6,7 +6,6 @@
#include <linux/init.h>
#include <linux/rv.h>
#include <rv/instrumentation.h>
-#include <rv/da_monitor.h>
#define MODULE_NAME "scpd"
@@ -15,36 +14,35 @@
#include <rv_trace.h>
#include <monitors/sched/sched.h>
+#define RV_MON_TYPE RV_MON_PER_CPU
#include "scpd.h"
-
-static struct rv_monitor rv_scpd;
-DECLARE_DA_MON_PER_CPU(scpd, unsigned char);
+#include <rv/da_monitor.h>
static void handle_preempt_disable(void *data, unsigned long ip, unsigned long parent_ip)
{
- da_handle_event_scpd(preempt_disable_scpd);
+ da_handle_event(preempt_disable_scpd);
}
static void handle_preempt_enable(void *data, unsigned long ip, unsigned long parent_ip)
{
- da_handle_start_event_scpd(preempt_enable_scpd);
+ da_handle_start_event(preempt_enable_scpd);
}
static void handle_schedule_entry(void *data, bool preempt)
{
- da_handle_event_scpd(schedule_entry_scpd);
+ da_handle_event(schedule_entry_scpd);
}
static void handle_schedule_exit(void *data, bool is_switch)
{
- da_handle_event_scpd(schedule_exit_scpd);
+ da_handle_event(schedule_exit_scpd);
}
static int enable_scpd(void)
{
int retval;
- retval = da_monitor_init_scpd();
+ retval = da_monitor_init();
if (retval)
return retval;
@@ -58,33 +56,33 @@ static int enable_scpd(void)
static void disable_scpd(void)
{
- rv_scpd.enabled = 0;
+ rv_this.enabled = 0;
rv_detach_trace_probe("scpd", preempt_disable, handle_preempt_disable);
rv_detach_trace_probe("scpd", preempt_enable, handle_preempt_enable);
rv_detach_trace_probe("scpd", sched_entry_tp, handle_schedule_entry);
rv_detach_trace_probe("scpd", sched_exit_tp, handle_schedule_exit);
- da_monitor_destroy_scpd();
+ da_monitor_destroy();
}
-static struct rv_monitor rv_scpd = {
+static struct rv_monitor rv_this = {
.name = "scpd",
.description = "schedule called with preemption disabled.",
.enable = enable_scpd,
.disable = disable_scpd,
- .reset = da_monitor_reset_all_scpd,
+ .reset = da_monitor_reset_all,
.enabled = 0,
};
static int __init register_scpd(void)
{
- return rv_register_monitor(&rv_scpd, &rv_sched);
+ return rv_register_monitor(&rv_this, &rv_sched);
}
static void __exit unregister_scpd(void)
{
- rv_unregister_monitor(&rv_scpd);
+ rv_unregister_monitor(&rv_this);
}
module_init(register_scpd);
diff --git a/kernel/trace/rv/monitors/scpd/scpd.h b/kernel/trace/rv/monitors/scpd/scpd.h
index 295f735a5811..d6329da2671b 100644
--- a/kernel/trace/rv/monitors/scpd/scpd.h
+++ b/kernel/trace/rv/monitors/scpd/scpd.h
@@ -5,20 +5,22 @@
* Documentation/trace/rv/deterministic_automata.rst
*/
+#define MONITOR_NAME scpd
+
enum states_scpd {
- cant_sched_scpd = 0,
+ cant_sched_scpd,
can_sched_scpd,
- state_max_scpd
+ state_max_scpd,
};
#define INVALID_STATE state_max_scpd
enum events_scpd {
- preempt_disable_scpd = 0,
+ preempt_disable_scpd,
preempt_enable_scpd,
schedule_entry_scpd,
schedule_exit_scpd,
- event_max_scpd
+ event_max_scpd,
};
struct automaton_scpd {
@@ -32,13 +34,13 @@ struct automaton_scpd {
static const struct automaton_scpd automaton_scpd = {
.state_names = {
"cant_sched",
- "can_sched"
+ "can_sched",
},
.event_names = {
"preempt_disable",
"preempt_enable",
"schedule_entry",
- "schedule_exit"
+ "schedule_exit",
},
.function = {
{ can_sched_scpd, INVALID_STATE, INVALID_STATE, INVALID_STATE },
diff --git a/kernel/trace/rv/monitors/snep/snep.c b/kernel/trace/rv/monitors/snep/snep.c
index 558950f524a5..b80b73795dec 100644
--- a/kernel/trace/rv/monitors/snep/snep.c
+++ b/kernel/trace/rv/monitors/snep/snep.c
@@ -6,7 +6,6 @@
#include <linux/init.h>
#include <linux/rv.h>
#include <rv/instrumentation.h>
-#include <rv/da_monitor.h>
#define MODULE_NAME "snep"
@@ -15,36 +14,35 @@
#include <rv_trace.h>
#include <monitors/sched/sched.h>
+#define RV_MON_TYPE RV_MON_PER_CPU
#include "snep.h"
-
-static struct rv_monitor rv_snep;
-DECLARE_DA_MON_PER_CPU(snep, unsigned char);
+#include <rv/da_monitor.h>
static void handle_preempt_disable(void *data, unsigned long ip, unsigned long parent_ip)
{
- da_handle_start_event_snep(preempt_disable_snep);
+ da_handle_start_event(preempt_disable_snep);
}
static void handle_preempt_enable(void *data, unsigned long ip, unsigned long parent_ip)
{
- da_handle_start_event_snep(preempt_enable_snep);
+ da_handle_start_event(preempt_enable_snep);
}
static void handle_schedule_entry(void *data, bool preempt)
{
- da_handle_event_snep(schedule_entry_snep);
+ da_handle_event(schedule_entry_snep);
}
static void handle_schedule_exit(void *data, bool is_switch)
{
- da_handle_start_event_snep(schedule_exit_snep);
+ da_handle_start_event(schedule_exit_snep);
}
static int enable_snep(void)
{
int retval;
- retval = da_monitor_init_snep();
+ retval = da_monitor_init();
if (retval)
return retval;
@@ -58,33 +56,33 @@ static int enable_snep(void)
static void disable_snep(void)
{
- rv_snep.enabled = 0;
+ rv_this.enabled = 0;
rv_detach_trace_probe("snep", preempt_disable, handle_preempt_disable);
rv_detach_trace_probe("snep", preempt_enable, handle_preempt_enable);
rv_detach_trace_probe("snep", sched_entry_tp, handle_schedule_entry);
rv_detach_trace_probe("snep", sched_exit_tp, handle_schedule_exit);
- da_monitor_destroy_snep();
+ da_monitor_destroy();
}
-static struct rv_monitor rv_snep = {
+static struct rv_monitor rv_this = {
.name = "snep",
.description = "schedule does not enable preempt.",
.enable = enable_snep,
.disable = disable_snep,
- .reset = da_monitor_reset_all_snep,
+ .reset = da_monitor_reset_all,
.enabled = 0,
};
static int __init register_snep(void)
{
- return rv_register_monitor(&rv_snep, &rv_sched);
+ return rv_register_monitor(&rv_this, &rv_sched);
}
static void __exit unregister_snep(void)
{
- rv_unregister_monitor(&rv_snep);
+ rv_unregister_monitor(&rv_this);
}
module_init(register_snep);
diff --git a/kernel/trace/rv/monitors/snep/snep.h b/kernel/trace/rv/monitors/snep/snep.h
index 4cd9abb77b7b..357520a5b3d1 100644
--- a/kernel/trace/rv/monitors/snep/snep.h
+++ b/kernel/trace/rv/monitors/snep/snep.h
@@ -5,20 +5,22 @@
* Documentation/trace/rv/deterministic_automata.rst
*/
+#define MONITOR_NAME snep
+
enum states_snep {
- non_scheduling_context_snep = 0,
+ non_scheduling_context_snep,
scheduling_contex_snep,
- state_max_snep
+ state_max_snep,
};
#define INVALID_STATE state_max_snep
enum events_snep {
- preempt_disable_snep = 0,
+ preempt_disable_snep,
preempt_enable_snep,
schedule_entry_snep,
schedule_exit_snep,
- event_max_snep
+ event_max_snep,
};
struct automaton_snep {
@@ -32,26 +34,26 @@ struct automaton_snep {
static const struct automaton_snep automaton_snep = {
.state_names = {
"non_scheduling_context",
- "scheduling_contex"
+ "scheduling_contex",
},
.event_names = {
"preempt_disable",
"preempt_enable",
"schedule_entry",
- "schedule_exit"
+ "schedule_exit",
},
.function = {
{
non_scheduling_context_snep,
non_scheduling_context_snep,
scheduling_contex_snep,
- INVALID_STATE
+ INVALID_STATE,
},
{
INVALID_STATE,
INVALID_STATE,
INVALID_STATE,
- non_scheduling_context_snep
+ non_scheduling_context_snep,
},
},
.initial_state = non_scheduling_context_snep,
diff --git a/kernel/trace/rv/monitors/snroc/snroc.c b/kernel/trace/rv/monitors/snroc/snroc.c
index 540e686e699f..f168b1a4b12c 100644
--- a/kernel/trace/rv/monitors/snroc/snroc.c
+++ b/kernel/trace/rv/monitors/snroc/snroc.c
@@ -6,7 +6,6 @@
#include <linux/init.h>
#include <linux/rv.h>
#include <rv/instrumentation.h>
-#include <rv/da_monitor.h>
#define MODULE_NAME "snroc"
@@ -14,14 +13,13 @@
#include <rv_trace.h>
#include <monitors/sched/sched.h>
+#define RV_MON_TYPE RV_MON_PER_TASK
#include "snroc.h"
-
-static struct rv_monitor rv_snroc;
-DECLARE_DA_MON_PER_TASK(snroc, unsigned char);
+#include <rv/da_monitor.h>
static void handle_sched_set_state(void *data, struct task_struct *tsk, int state)
{
- da_handle_event_snroc(tsk, sched_set_state_snroc);
+ da_handle_event(tsk, sched_set_state_snroc);
}
static void handle_sched_switch(void *data, bool preempt,
@@ -29,15 +27,15 @@ static void handle_sched_switch(void *data, bool preempt,
struct task_struct *next,
unsigned int prev_state)
{
- da_handle_start_event_snroc(prev, sched_switch_out_snroc);
- da_handle_event_snroc(next, sched_switch_in_snroc);
+ da_handle_start_event(prev, sched_switch_out_snroc);
+ da_handle_event(next, sched_switch_in_snroc);
}
static int enable_snroc(void)
{
int retval;
- retval = da_monitor_init_snroc();
+ retval = da_monitor_init();
if (retval)
return retval;
@@ -49,31 +47,31 @@ static int enable_snroc(void)
static void disable_snroc(void)
{
- rv_snroc.enabled = 0;
+ rv_this.enabled = 0;
rv_detach_trace_probe("snroc", sched_set_state_tp, handle_sched_set_state);
rv_detach_trace_probe("snroc", sched_switch, handle_sched_switch);
- da_monitor_destroy_snroc();
+ da_monitor_destroy();
}
-static struct rv_monitor rv_snroc = {
+static struct rv_monitor rv_this = {
.name = "snroc",
.description = "set non runnable on its own context.",
.enable = enable_snroc,
.disable = disable_snroc,
- .reset = da_monitor_reset_all_snroc,
+ .reset = da_monitor_reset_all,
.enabled = 0,
};
static int __init register_snroc(void)
{
- return rv_register_monitor(&rv_snroc, &rv_sched);
+ return rv_register_monitor(&rv_this, &rv_sched);
}
static void __exit unregister_snroc(void)
{
- rv_unregister_monitor(&rv_snroc);
+ rv_unregister_monitor(&rv_this);
}
module_init(register_snroc);
diff --git a/kernel/trace/rv/monitors/snroc/snroc.h b/kernel/trace/rv/monitors/snroc/snroc.h
index c3650a2b1b10..88b7328ad31a 100644
--- a/kernel/trace/rv/monitors/snroc/snroc.h
+++ b/kernel/trace/rv/monitors/snroc/snroc.h
@@ -5,19 +5,21 @@
* Documentation/trace/rv/deterministic_automata.rst
*/
+#define MONITOR_NAME snroc
+
enum states_snroc {
- other_context_snroc = 0,
+ other_context_snroc,
own_context_snroc,
- state_max_snroc
+ state_max_snroc,
};
#define INVALID_STATE state_max_snroc
enum events_snroc {
- sched_set_state_snroc = 0,
+ sched_set_state_snroc,
sched_switch_in_snroc,
sched_switch_out_snroc,
- event_max_snroc
+ event_max_snroc,
};
struct automaton_snroc {
@@ -31,12 +33,12 @@ struct automaton_snroc {
static const struct automaton_snroc automaton_snroc = {
.state_names = {
"other_context",
- "own_context"
+ "own_context",
},
.event_names = {
"sched_set_state",
"sched_switch_in",
- "sched_switch_out"
+ "sched_switch_out",
},
.function = {
{ INVALID_STATE, own_context_snroc, INVALID_STATE },
diff --git a/kernel/trace/rv/monitors/sssw/sssw.c b/kernel/trace/rv/monitors/sssw/sssw.c
index 84b8d890d9d4..a91321c890cd 100644
--- a/kernel/trace/rv/monitors/sssw/sssw.c
+++ b/kernel/trace/rv/monitors/sssw/sssw.c
@@ -6,7 +6,6 @@
#include <linux/init.h>
#include <linux/rv.h>
#include <rv/instrumentation.h>
-#include <rv/da_monitor.h>
#define MODULE_NAME "sssw"
@@ -15,17 +14,16 @@
#include <rv_trace.h>
#include <monitors/sched/sched.h>
+#define RV_MON_TYPE RV_MON_PER_TASK
#include "sssw.h"
-
-static struct rv_monitor rv_sssw;
-DECLARE_DA_MON_PER_TASK(sssw, unsigned char);
+#include <rv/da_monitor.h>
static void handle_sched_set_state(void *data, struct task_struct *tsk, int state)
{
if (state == TASK_RUNNING)
- da_handle_start_event_sssw(tsk, sched_set_state_runnable_sssw);
+ da_handle_start_event(tsk, sched_set_state_runnable_sssw);
else
- da_handle_event_sssw(tsk, sched_set_state_sleepable_sssw);
+ da_handle_event(tsk, sched_set_state_sleepable_sssw);
}
static void handle_sched_switch(void *data, bool preempt,
@@ -34,15 +32,15 @@ static void handle_sched_switch(void *data, bool preempt,
unsigned int prev_state)
{
if (preempt)
- da_handle_event_sssw(prev, sched_switch_preempt_sssw);
+ da_handle_event(prev, sched_switch_preempt_sssw);
else if (prev_state == TASK_RUNNING)
- da_handle_event_sssw(prev, sched_switch_yield_sssw);
+ da_handle_event(prev, sched_switch_yield_sssw);
else if (prev_state == TASK_RTLOCK_WAIT)
/* special case of sleeping task with racy conditions */
- da_handle_event_sssw(prev, sched_switch_blocking_sssw);
+ da_handle_event(prev, sched_switch_blocking_sssw);
else
- da_handle_event_sssw(prev, sched_switch_suspend_sssw);
- da_handle_event_sssw(next, sched_switch_in_sssw);
+ da_handle_event(prev, sched_switch_suspend_sssw);
+ da_handle_event(next, sched_switch_in_sssw);
}
static void handle_sched_wakeup(void *data, struct task_struct *p)
@@ -51,21 +49,21 @@ static void handle_sched_wakeup(void *data, struct task_struct *p)
* Wakeup can also lead to signal_wakeup although the system is
* actually runnable. The monitor can safely start with this event.
*/
- da_handle_start_event_sssw(p, sched_wakeup_sssw);
+ da_handle_start_event(p, sched_wakeup_sssw);
}
static void handle_signal_deliver(void *data, int sig,
struct kernel_siginfo *info,
struct k_sigaction *ka)
{
- da_handle_event_sssw(current, signal_deliver_sssw);
+ da_handle_event(current, signal_deliver_sssw);
}
static int enable_sssw(void)
{
int retval;
- retval = da_monitor_init_sssw();
+ retval = da_monitor_init();
if (retval)
return retval;
@@ -79,33 +77,33 @@ static int enable_sssw(void)
static void disable_sssw(void)
{
- rv_sssw.enabled = 0;
+ rv_this.enabled = 0;
rv_detach_trace_probe("sssw", sched_set_state_tp, handle_sched_set_state);
rv_detach_trace_probe("sssw", sched_switch, handle_sched_switch);
rv_detach_trace_probe("sssw", sched_wakeup, handle_sched_wakeup);
rv_detach_trace_probe("sssw", signal_deliver, handle_signal_deliver);
- da_monitor_destroy_sssw();
+ da_monitor_destroy();
}
-static struct rv_monitor rv_sssw = {
+static struct rv_monitor rv_this = {
.name = "sssw",
.description = "set state sleep and wakeup.",
.enable = enable_sssw,
.disable = disable_sssw,
- .reset = da_monitor_reset_all_sssw,
+ .reset = da_monitor_reset_all,
.enabled = 0,
};
static int __init register_sssw(void)
{
- return rv_register_monitor(&rv_sssw, &rv_sched);
+ return rv_register_monitor(&rv_this, &rv_sched);
}
static void __exit unregister_sssw(void)
{
- rv_unregister_monitor(&rv_sssw);
+ rv_unregister_monitor(&rv_this);
}
module_init(register_sssw);
diff --git a/kernel/trace/rv/monitors/sssw/sssw.h b/kernel/trace/rv/monitors/sssw/sssw.h
index 243d54050c94..1a4b806061c3 100644
--- a/kernel/trace/rv/monitors/sssw/sssw.h
+++ b/kernel/trace/rv/monitors/sssw/sssw.h
@@ -5,18 +5,20 @@
* Documentation/trace/rv/deterministic_automata.rst
*/
+#define MONITOR_NAME sssw
+
enum states_sssw {
- runnable_sssw = 0,
+ runnable_sssw,
signal_wakeup_sssw,
sleepable_sssw,
sleeping_sssw,
- state_max_sssw
+ state_max_sssw,
};
#define INVALID_STATE state_max_sssw
enum events_sssw {
- sched_set_state_runnable_sssw = 0,
+ sched_set_state_runnable_sssw,
sched_set_state_sleepable_sssw,
sched_switch_blocking_sssw,
sched_switch_in_sssw,
@@ -25,7 +27,7 @@ enum events_sssw {
sched_switch_yield_sssw,
sched_wakeup_sssw,
signal_deliver_sssw,
- event_max_sssw
+ event_max_sssw,
};
struct automaton_sssw {
@@ -41,7 +43,7 @@ static const struct automaton_sssw automaton_sssw = {
"runnable",
"signal_wakeup",
"sleepable",
- "sleeping"
+ "sleeping",
},
.event_names = {
"sched_set_state_runnable",
@@ -52,7 +54,7 @@ static const struct automaton_sssw automaton_sssw = {
"sched_switch_suspend",
"sched_switch_yield",
"sched_wakeup",
- "signal_deliver"
+ "signal_deliver",
},
.function = {
{
@@ -64,7 +66,7 @@ static const struct automaton_sssw automaton_sssw = {
INVALID_STATE,
runnable_sssw,
runnable_sssw,
- runnable_sssw
+ runnable_sssw,
},
{
INVALID_STATE,
@@ -75,7 +77,7 @@ static const struct automaton_sssw automaton_sssw = {
INVALID_STATE,
signal_wakeup_sssw,
signal_wakeup_sssw,
- runnable_sssw
+ runnable_sssw,
},
{
runnable_sssw,
@@ -86,7 +88,7 @@ static const struct automaton_sssw automaton_sssw = {
sleeping_sssw,
signal_wakeup_sssw,
runnable_sssw,
- sleepable_sssw
+ sleepable_sssw,
},
{
INVALID_STATE,
@@ -97,7 +99,7 @@ static const struct automaton_sssw automaton_sssw = {
INVALID_STATE,
INVALID_STATE,
runnable_sssw,
- INVALID_STATE
+ INVALID_STATE,
},
},
.initial_state = runnable_sssw,
diff --git a/kernel/trace/rv/monitors/sts/sts.c b/kernel/trace/rv/monitors/sts/sts.c
index c4a9cd67c1d2..ce031cbf202a 100644
--- a/kernel/trace/rv/monitors/sts/sts.c
+++ b/kernel/trace/rv/monitors/sts/sts.c
@@ -6,7 +6,6 @@
#include <linux/init.h>
#include <linux/rv.h>
#include <rv/instrumentation.h>
-#include <rv/da_monitor.h>
#define MODULE_NAME "sts"
@@ -16,17 +15,16 @@
#include <rv_trace.h>
#include <monitors/sched/sched.h>
+#define RV_MON_TYPE RV_MON_PER_CPU
#include "sts.h"
-
-static struct rv_monitor rv_sts;
-DECLARE_DA_MON_PER_CPU(sts, unsigned char);
+#include <rv/da_monitor.h>
#ifdef CONFIG_X86_LOCAL_APIC
#include <asm/trace/irq_vectors.h>
static void handle_vector_irq_entry(void *data, int vector)
{
- da_handle_event_sts(irq_entry_sts);
+ da_handle_event(irq_entry_sts);
}
static void attach_vector_irq(void)
@@ -61,17 +59,17 @@ static void detach_vector_irq(void) { }
static void handle_irq_disable(void *data, unsigned long ip, unsigned long parent_ip)
{
- da_handle_event_sts(irq_disable_sts);
+ da_handle_event(irq_disable_sts);
}
static void handle_irq_enable(void *data, unsigned long ip, unsigned long parent_ip)
{
- da_handle_event_sts(irq_enable_sts);
+ da_handle_event(irq_enable_sts);
}
static void handle_irq_entry(void *data, int irq, struct irqaction *action)
{
- da_handle_event_sts(irq_entry_sts);
+ da_handle_event(irq_entry_sts);
}
static void handle_sched_switch(void *data, bool preempt,
@@ -79,24 +77,24 @@ static void handle_sched_switch(void *data, bool preempt,
struct task_struct *next,
unsigned int prev_state)
{
- da_handle_event_sts(sched_switch_sts);
+ da_handle_event(sched_switch_sts);
}
static void handle_schedule_entry(void *data, bool preempt)
{
- da_handle_event_sts(schedule_entry_sts);
+ da_handle_event(schedule_entry_sts);
}
static void handle_schedule_exit(void *data, bool is_switch)
{
- da_handle_start_event_sts(schedule_exit_sts);
+ da_handle_start_event(schedule_exit_sts);
}
static int enable_sts(void)
{
int retval;
- retval = da_monitor_init_sts();
+ retval = da_monitor_init();
if (retval)
return retval;
@@ -113,7 +111,7 @@ static int enable_sts(void)
static void disable_sts(void)
{
- rv_sts.enabled = 0;
+ rv_this.enabled = 0;
rv_detach_trace_probe("sts", irq_disable, handle_irq_disable);
rv_detach_trace_probe("sts", irq_enable, handle_irq_enable);
@@ -123,29 +121,29 @@ static void disable_sts(void)
rv_detach_trace_probe("sts", sched_exit_tp, handle_schedule_exit);
detach_vector_irq();
- da_monitor_destroy_sts();
+ da_monitor_destroy();
}
/*
* This is the monitor register section.
*/
-static struct rv_monitor rv_sts = {
+static struct rv_monitor rv_this = {
.name = "sts",
.description = "schedule implies task switch.",
.enable = enable_sts,
.disable = disable_sts,
- .reset = da_monitor_reset_all_sts,
+ .reset = da_monitor_reset_all,
.enabled = 0,
};
static int __init register_sts(void)
{
- return rv_register_monitor(&rv_sts, &rv_sched);
+ return rv_register_monitor(&rv_this, &rv_sched);
}
static void __exit unregister_sts(void)
{
- rv_unregister_monitor(&rv_sts);
+ rv_unregister_monitor(&rv_this);
}
module_init(register_sts);
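
The sts conversion above (like wip below) uses the per-CPU flavour of the same scheme: RV_MON_TYPE is RV_MON_PER_CPU, so the generic handlers take only the event and the monitored entity is implicitly the current CPU rather than a task. A hedged fragment showing just that difference (names are placeholders):

#define RV_MON_TYPE RV_MON_PER_CPU	/* vs RV_MON_PER_TASK in snroc/sssw */
#include "foo.h"
#include <rv/da_monitor.h>

static void handle_foo_irq(void *data, int irq, struct irqaction *action)
{
	da_handle_event(foo_irq_foo);	/* no task argument for per-CPU monitors */
}
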
diff --git a/kernel/trace/rv/monitors/sts/sts.h b/kernel/trace/rv/monitors/sts/sts.h
index 3368b6599a00..6f7b2d9d72e6 100644
--- a/kernel/trace/rv/monitors/sts/sts.h
+++ b/kernel/trace/rv/monitors/sts/sts.h
@@ -5,27 +5,29 @@
* Documentation/trace/rv/deterministic_automata.rst
*/
+#define MONITOR_NAME sts
+
enum states_sts {
- can_sched_sts = 0,
+ can_sched_sts,
cant_sched_sts,
disable_to_switch_sts,
enable_to_exit_sts,
in_irq_sts,
scheduling_sts,
switching_sts,
- state_max_sts
+ state_max_sts,
};
#define INVALID_STATE state_max_sts
enum events_sts {
- irq_disable_sts = 0,
+ irq_disable_sts,
irq_enable_sts,
irq_entry_sts,
sched_switch_sts,
schedule_entry_sts,
schedule_exit_sts,
- event_max_sts
+ event_max_sts,
};
struct automaton_sts {
@@ -44,7 +46,7 @@ static const struct automaton_sts automaton_sts = {
"enable_to_exit",
"in_irq",
"scheduling",
- "switching"
+ "switching",
},
.event_names = {
"irq_disable",
@@ -52,7 +54,7 @@ static const struct automaton_sts automaton_sts = {
"irq_entry",
"sched_switch",
"schedule_entry",
- "schedule_exit"
+ "schedule_exit",
},
.function = {
{
@@ -61,7 +63,7 @@ static const struct automaton_sts automaton_sts = {
INVALID_STATE,
INVALID_STATE,
scheduling_sts,
- INVALID_STATE
+ INVALID_STATE,
},
{
INVALID_STATE,
@@ -69,7 +71,7 @@ static const struct automaton_sts automaton_sts = {
cant_sched_sts,
INVALID_STATE,
INVALID_STATE,
- INVALID_STATE
+ INVALID_STATE,
},
{
INVALID_STATE,
@@ -77,7 +79,7 @@ static const struct automaton_sts automaton_sts = {
in_irq_sts,
switching_sts,
INVALID_STATE,
- INVALID_STATE
+ INVALID_STATE,
},
{
enable_to_exit_sts,
@@ -85,7 +87,7 @@ static const struct automaton_sts automaton_sts = {
enable_to_exit_sts,
INVALID_STATE,
INVALID_STATE,
- can_sched_sts
+ can_sched_sts,
},
{
INVALID_STATE,
@@ -93,7 +95,7 @@ static const struct automaton_sts automaton_sts = {
in_irq_sts,
INVALID_STATE,
INVALID_STATE,
- INVALID_STATE
+ INVALID_STATE,
},
{
disable_to_switch_sts,
@@ -101,7 +103,7 @@ static const struct automaton_sts automaton_sts = {
INVALID_STATE,
INVALID_STATE,
INVALID_STATE,
- INVALID_STATE
+ INVALID_STATE,
},
{
INVALID_STATE,
@@ -109,7 +111,7 @@ static const struct automaton_sts automaton_sts = {
INVALID_STATE,
INVALID_STATE,
INVALID_STATE,
- INVALID_STATE
+ INVALID_STATE,
},
},
.initial_state = can_sched_sts,
diff --git a/kernel/trace/rv/monitors/wip/wip.c b/kernel/trace/rv/monitors/wip/wip.c
index 4b4e99615a11..22d77ec42463 100644
--- a/kernel/trace/rv/monitors/wip/wip.c
+++ b/kernel/trace/rv/monitors/wip/wip.c
@@ -6,7 +6,6 @@
#include <linux/init.h>
#include <linux/rv.h>
#include <rv/instrumentation.h>
-#include <rv/da_monitor.h>
#define MODULE_NAME "wip"
@@ -14,31 +13,30 @@
#include <trace/events/sched.h>
#include <trace/events/preemptirq.h>
+#define RV_MON_TYPE RV_MON_PER_CPU
#include "wip.h"
-
-static struct rv_monitor rv_wip;
-DECLARE_DA_MON_PER_CPU(wip, unsigned char);
+#include <rv/da_monitor.h>
static void handle_preempt_disable(void *data, unsigned long ip, unsigned long parent_ip)
{
- da_handle_event_wip(preempt_disable_wip);
+ da_handle_event(preempt_disable_wip);
}
static void handle_preempt_enable(void *data, unsigned long ip, unsigned long parent_ip)
{
- da_handle_start_event_wip(preempt_enable_wip);
+ da_handle_start_event(preempt_enable_wip);
}
static void handle_sched_waking(void *data, struct task_struct *task)
{
- da_handle_event_wip(sched_waking_wip);
+ da_handle_event(sched_waking_wip);
}
static int enable_wip(void)
{
int retval;
- retval = da_monitor_init_wip();
+ retval = da_monitor_init();
if (retval)
return retval;
@@ -51,32 +49,32 @@ static int enable_wip(void)
static void disable_wip(void)
{
- rv_wip.enabled = 0;
+ rv_this.enabled = 0;
rv_detach_trace_probe("wip", preempt_disable, handle_preempt_disable);
rv_detach_trace_probe("wip", preempt_enable, handle_preempt_enable);
rv_detach_trace_probe("wip", sched_waking, handle_sched_waking);
- da_monitor_destroy_wip();
+ da_monitor_destroy();
}
-static struct rv_monitor rv_wip = {
+static struct rv_monitor rv_this = {
.name = "wip",
.description = "wakeup in preemptive per-cpu testing monitor.",
.enable = enable_wip,
.disable = disable_wip,
- .reset = da_monitor_reset_all_wip,
+ .reset = da_monitor_reset_all,
.enabled = 0,
};
static int __init register_wip(void)
{
- return rv_register_monitor(&rv_wip, NULL);
+ return rv_register_monitor(&rv_this, NULL);
}
static void __exit unregister_wip(void)
{
- rv_unregister_monitor(&rv_wip);
+ rv_unregister_monitor(&rv_this);
}
module_init(register_wip);
diff --git a/kernel/trace/rv/monitors/wip/wip.h b/kernel/trace/rv/monitors/wip/wip.h
index c7193748bf36..b4c3eea94c86 100644
--- a/kernel/trace/rv/monitors/wip/wip.h
+++ b/kernel/trace/rv/monitors/wip/wip.h
@@ -5,19 +5,21 @@
* Documentation/trace/rv/deterministic_automata.rst
*/
+#define MONITOR_NAME wip
+
enum states_wip {
- preemptive_wip = 0,
+ preemptive_wip,
non_preemptive_wip,
- state_max_wip
+ state_max_wip,
};
#define INVALID_STATE state_max_wip
enum events_wip {
- preempt_disable_wip = 0,
+ preempt_disable_wip,
preempt_enable_wip,
sched_waking_wip,
- event_max_wip
+ event_max_wip,
};
struct automaton_wip {
@@ -31,12 +33,12 @@ struct automaton_wip {
static const struct automaton_wip automaton_wip = {
.state_names = {
"preemptive",
- "non_preemptive"
+ "non_preemptive",
},
.event_names = {
"preempt_disable",
"preempt_enable",
- "sched_waking"
+ "sched_waking",
},
.function = {
{ non_preemptive_wip, INVALID_STATE, INVALID_STATE },
diff --git a/kernel/trace/rv/monitors/wwnr/wwnr.c b/kernel/trace/rv/monitors/wwnr/wwnr.c
index 4145bea2729e..579e7e217ee0 100644
--- a/kernel/trace/rv/monitors/wwnr/wwnr.c
+++ b/kernel/trace/rv/monitors/wwnr/wwnr.c
@@ -6,40 +6,38 @@
#include <linux/init.h>
#include <linux/rv.h>
#include <rv/instrumentation.h>
-#include <rv/da_monitor.h>
#define MODULE_NAME "wwnr"
#include <rv_trace.h>
#include <trace/events/sched.h>
+#define RV_MON_TYPE RV_MON_PER_TASK
#include "wwnr.h"
-
-static struct rv_monitor rv_wwnr;
-DECLARE_DA_MON_PER_TASK(wwnr, unsigned char);
+#include <rv/da_monitor.h>
static void handle_switch(void *data, bool preempt, struct task_struct *p,
struct task_struct *n, unsigned int prev_state)
{
/* start monitoring only after the first suspension */
if (prev_state == TASK_INTERRUPTIBLE)
- da_handle_start_event_wwnr(p, switch_out_wwnr);
+ da_handle_start_event(p, switch_out_wwnr);
else
- da_handle_event_wwnr(p, switch_out_wwnr);
+ da_handle_event(p, switch_out_wwnr);
- da_handle_event_wwnr(n, switch_in_wwnr);
+ da_handle_event(n, switch_in_wwnr);
}
static void handle_wakeup(void *data, struct task_struct *p)
{
- da_handle_event_wwnr(p, wakeup_wwnr);
+ da_handle_event(p, wakeup_wwnr);
}
static int enable_wwnr(void)
{
int retval;
- retval = da_monitor_init_wwnr();
+ retval = da_monitor_init();
if (retval)
return retval;
@@ -51,31 +49,31 @@ static int enable_wwnr(void)
static void disable_wwnr(void)
{
- rv_wwnr.enabled = 0;
+ rv_this.enabled = 0;
rv_detach_trace_probe("wwnr", sched_switch, handle_switch);
rv_detach_trace_probe("wwnr", sched_wakeup, handle_wakeup);
- da_monitor_destroy_wwnr();
+ da_monitor_destroy();
}
-static struct rv_monitor rv_wwnr = {
+static struct rv_monitor rv_this = {
.name = "wwnr",
.description = "wakeup while not running per-task testing model.",
.enable = enable_wwnr,
.disable = disable_wwnr,
- .reset = da_monitor_reset_all_wwnr,
+ .reset = da_monitor_reset_all,
.enabled = 0,
};
static int __init register_wwnr(void)
{
- return rv_register_monitor(&rv_wwnr, NULL);
+ return rv_register_monitor(&rv_this, NULL);
}
static void __exit unregister_wwnr(void)
{
- rv_unregister_monitor(&rv_wwnr);
+ rv_unregister_monitor(&rv_this);
}
module_init(register_wwnr);
diff --git a/kernel/trace/rv/monitors/wwnr/wwnr.h b/kernel/trace/rv/monitors/wwnr/wwnr.h
index 0a59d23edf61..a28006512c9b 100644
--- a/kernel/trace/rv/monitors/wwnr/wwnr.h
+++ b/kernel/trace/rv/monitors/wwnr/wwnr.h
@@ -5,19 +5,21 @@
* Documentation/trace/rv/deterministic_automata.rst
*/
+#define MONITOR_NAME wwnr
+
enum states_wwnr {
- not_running_wwnr = 0,
+ not_running_wwnr,
running_wwnr,
- state_max_wwnr
+ state_max_wwnr,
};
#define INVALID_STATE state_max_wwnr
enum events_wwnr {
- switch_in_wwnr = 0,
+ switch_in_wwnr,
switch_out_wwnr,
wakeup_wwnr,
- event_max_wwnr
+ event_max_wwnr,
};
struct automaton_wwnr {
@@ -31,12 +33,12 @@ struct automaton_wwnr {
static const struct automaton_wwnr automaton_wwnr = {
.state_names = {
"not_running",
- "running"
+ "running",
},
.event_names = {
"switch_in",
"switch_out",
- "wakeup"
+ "wakeup",
},
.function = {
{ running_wwnr, INVALID_STATE, not_running_wwnr },
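
Across the monitor headers touched above, the automaton tables are unchanged in content; the hunks only add trailing commas and drop the explicit "= 0" enum initialisers. The .function member remains a state-by-event transition table in which INVALID_STATE (the state_max_* sentinel) marks transitions the model forbids. An illustrative step function over the wwnr table, assuming the conventional unsigned char element type; this is not the code in <rv/da_monitor.h>:

/*
 * Illustrative only: how one step of a deterministic-automaton monitor
 * consults automaton_wwnr.function. The real generic implementation lives
 * in <rv/da_monitor.h>.
 */
static bool wwnr_step(unsigned char *state, enum events_wwnr event)
{
	unsigned char next = automaton_wwnr.function[*state][event];

	if (next == INVALID_STATE)
		return false;	/* unexpected event: the monitor reacts */

	*state = next;
	return true;
}
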
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 8bd4ec08fb36..2f6fbf9e7caf 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -67,7 +67,7 @@
* insertions into the ring-buffer such as trace_printk could occurred
* at the same time, giving false positive or negative results.
*/
-static bool __read_mostly tracing_selftest_running;
+bool __read_mostly tracing_selftest_running;
/*
* If boot-time tracing including tracers/events via kernel cmdline
@@ -83,7 +83,6 @@ void __init disable_tracing_selftest(const char *reason)
}
}
#else
-#define tracing_selftest_running 0
#define tracing_selftest_disabled 0
#endif
@@ -114,7 +113,7 @@ DEFINE_PER_CPU(bool, trace_taskinfo_save);
* of the tracer is successful. But that is the only place that sets
* this back to zero.
*/
-static int tracing_disabled = 1;
+int tracing_disabled = 1;
cpumask_var_t __read_mostly tracing_buffer_mask;
@@ -535,22 +534,11 @@ static struct trace_array global_trace = {
.trace_flags = TRACE_DEFAULT_FLAGS,
};
-static struct trace_array *printk_trace = &global_trace;
+struct trace_array *printk_trace = &global_trace;
/* List of trace_arrays interested in the top level trace_marker */
static LIST_HEAD(marker_copies);
-static __always_inline bool printk_binsafe(struct trace_array *tr)
-{
- /*
- * The binary format of traceprintk can cause a crash if used
- * by a buffer from another boot. Force the use of the
- * non binary version of trace_printk if the trace_printk
- * buffer is a boot mapped ring buffer.
- */
- return !(tr->flags & TRACE_ARRAY_FL_BOOT);
-}
-
static void update_printk_trace(struct trace_array *tr)
{
if (printk_trace == tr)
@@ -649,248 +637,6 @@ int tracing_check_open_get_tr(struct trace_array *tr)
return 0;
}
-/**
- * trace_find_filtered_pid - check if a pid exists in a filtered_pid list
- * @filtered_pids: The list of pids to check
- * @search_pid: The PID to find in @filtered_pids
- *
- * Returns true if @search_pid is found in @filtered_pids, and false otherwise.
- */
-bool
-trace_find_filtered_pid(struct trace_pid_list *filtered_pids, pid_t search_pid)
-{
- return trace_pid_list_is_set(filtered_pids, search_pid);
-}
-
-/**
- * trace_ignore_this_task - should a task be ignored for tracing
- * @filtered_pids: The list of pids to check
- * @filtered_no_pids: The list of pids not to be traced
- * @task: The task that should be ignored if not filtered
- *
- * Checks if @task should be traced or not from @filtered_pids.
- * Returns true if @task should *NOT* be traced.
- * Returns false if @task should be traced.
- */
-bool
-trace_ignore_this_task(struct trace_pid_list *filtered_pids,
- struct trace_pid_list *filtered_no_pids,
- struct task_struct *task)
-{
- /*
- * If filtered_no_pids is not empty, and the task's pid is listed
- * in filtered_no_pids, then return true.
- * Otherwise, if filtered_pids is empty, that means we can
- * trace all tasks. If it has content, then only trace pids
- * within filtered_pids.
- */
-
- return (filtered_pids &&
- !trace_find_filtered_pid(filtered_pids, task->pid)) ||
- (filtered_no_pids &&
- trace_find_filtered_pid(filtered_no_pids, task->pid));
-}
-
-/**
- * trace_filter_add_remove_task - Add or remove a task from a pid_list
- * @pid_list: The list to modify
- * @self: The current task for fork or NULL for exit
- * @task: The task to add or remove
- *
- * If adding a task, if @self is defined, the task is only added if @self
- * is also included in @pid_list. This happens on fork and tasks should
- * only be added when the parent is listed. If @self is NULL, then the
- * @task pid will be removed from the list, which would happen on exit
- * of a task.
- */
-void trace_filter_add_remove_task(struct trace_pid_list *pid_list,
- struct task_struct *self,
- struct task_struct *task)
-{
- if (!pid_list)
- return;
-
- /* For forks, we only add if the forking task is listed */
- if (self) {
- if (!trace_find_filtered_pid(pid_list, self->pid))
- return;
- }
-
- /* "self" is set for forks, and NULL for exits */
- if (self)
- trace_pid_list_set(pid_list, task->pid);
- else
- trace_pid_list_clear(pid_list, task->pid);
-}
-
-/**
- * trace_pid_next - Used for seq_file to get to the next pid of a pid_list
- * @pid_list: The pid list to show
- * @v: The last pid that was shown (+1 the actual pid to let zero be displayed)
- * @pos: The position of the file
- *
- * This is used by the seq_file "next" operation to iterate the pids
- * listed in a trace_pid_list structure.
- *
- * Returns the pid+1 as we want to display pid of zero, but NULL would
- * stop the iteration.
- */
-void *trace_pid_next(struct trace_pid_list *pid_list, void *v, loff_t *pos)
-{
- long pid = (unsigned long)v;
- unsigned int next;
-
- (*pos)++;
-
- /* pid already is +1 of the actual previous bit */
- if (trace_pid_list_next(pid_list, pid, &next) < 0)
- return NULL;
-
- pid = next;
-
- /* Return pid + 1 to allow zero to be represented */
- return (void *)(pid + 1);
-}
-
-/**
- * trace_pid_start - Used for seq_file to start reading pid lists
- * @pid_list: The pid list to show
- * @pos: The position of the file
- *
- * This is used by seq_file "start" operation to start the iteration
- * of listing pids.
- *
- * Returns the pid+1 as we want to display pid of zero, but NULL would
- * stop the iteration.
- */
-void *trace_pid_start(struct trace_pid_list *pid_list, loff_t *pos)
-{
- unsigned long pid;
- unsigned int first;
- loff_t l = 0;
-
- if (trace_pid_list_first(pid_list, &first) < 0)
- return NULL;
-
- pid = first;
-
- /* Return pid + 1 so that zero can be the exit value */
- for (pid++; pid && l < *pos;
- pid = (unsigned long)trace_pid_next(pid_list, (void *)pid, &l))
- ;
- return (void *)pid;
-}
-
-/**
- * trace_pid_show - show the current pid in seq_file processing
- * @m: The seq_file structure to write into
- * @v: A void pointer of the pid (+1) value to display
- *
- * Can be directly used by seq_file operations to display the current
- * pid value.
- */
-int trace_pid_show(struct seq_file *m, void *v)
-{
- unsigned long pid = (unsigned long)v - 1;
-
- seq_printf(m, "%lu\n", pid);
- return 0;
-}
-
-/* 128 should be much more than enough */
-#define PID_BUF_SIZE 127
-
-int trace_pid_write(struct trace_pid_list *filtered_pids,
- struct trace_pid_list **new_pid_list,
- const char __user *ubuf, size_t cnt)
-{
- struct trace_pid_list *pid_list;
- struct trace_parser parser;
- unsigned long val;
- int nr_pids = 0;
- ssize_t read = 0;
- ssize_t ret;
- loff_t pos;
- pid_t pid;
-
- if (trace_parser_get_init(&parser, PID_BUF_SIZE + 1))
- return -ENOMEM;
-
- /*
- * Always recreate a new array. The write is an all or nothing
- * operation. Always create a new array when adding new pids by
- * the user. If the operation fails, then the current list is
- * not modified.
- */
- pid_list = trace_pid_list_alloc();
- if (!pid_list) {
- trace_parser_put(&parser);
- return -ENOMEM;
- }
-
- if (filtered_pids) {
- /* copy the current bits to the new max */
- ret = trace_pid_list_first(filtered_pids, &pid);
- while (!ret) {
- ret = trace_pid_list_set(pid_list, pid);
- if (ret < 0)
- goto out;
-
- ret = trace_pid_list_next(filtered_pids, pid + 1, &pid);
- nr_pids++;
- }
- }
-
- ret = 0;
- while (cnt > 0) {
-
- pos = 0;
-
- ret = trace_get_user(&parser, ubuf, cnt, &pos);
- if (ret < 0)
- break;
-
- read += ret;
- ubuf += ret;
- cnt -= ret;
-
- if (!trace_parser_loaded(&parser))
- break;
-
- ret = -EINVAL;
- if (kstrtoul(parser.buffer, 0, &val))
- break;
-
- pid = (pid_t)val;
-
- if (trace_pid_list_set(pid_list, pid) < 0) {
- ret = -1;
- break;
- }
- nr_pids++;
-
- trace_parser_clear(&parser);
- ret = 0;
- }
- out:
- trace_parser_put(&parser);
-
- if (ret < 0) {
- trace_pid_list_free(pid_list);
- return ret;
- }
-
- if (!nr_pids) {
- /* Cleared the list of pids */
- trace_pid_list_free(pid_list);
- pid_list = NULL;
- }
-
- *new_pid_list = pid_list;
-
- return read;
-}
-
static u64 buffer_ftrace_now(struct array_buffer *buf, int cpu)
{
u64 ts;
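
The pid-filter helpers deleted in the hunk above take their kernel-doc with them; the one convention worth keeping in mind when reading their callers is that seq_file iteration treats NULL as end-of-stream, so a pid N is passed around as the cookie N + 1 and decoded before display, which is how pid 0 stays representable. A small sketch of that convention, not the removed code itself:

/* Sketch of the pid <-> seq_file cookie convention described in the
 * removed kernel-doc: encode as pid + 1 so pid 0 never becomes NULL.
 */
static void *pid_to_cookie(unsigned long pid)
{
	return (void *)(pid + 1);
}

static unsigned long cookie_to_pid(void *v)
{
	return (unsigned long)v - 1;	/* matches trace_pid_show() above */
}
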
@@ -1033,56 +779,6 @@ static inline void trace_access_lock_init(void)
#endif
-#ifdef CONFIG_STACKTRACE
-static void __ftrace_trace_stack(struct trace_array *tr,
- struct trace_buffer *buffer,
- unsigned int trace_ctx,
- int skip, struct pt_regs *regs);
-static inline void ftrace_trace_stack(struct trace_array *tr,
- struct trace_buffer *buffer,
- unsigned int trace_ctx,
- int skip, struct pt_regs *regs);
-
-#else
-static inline void __ftrace_trace_stack(struct trace_array *tr,
- struct trace_buffer *buffer,
- unsigned int trace_ctx,
- int skip, struct pt_regs *regs)
-{
-}
-static inline void ftrace_trace_stack(struct trace_array *tr,
- struct trace_buffer *buffer,
- unsigned long trace_ctx,
- int skip, struct pt_regs *regs)
-{
-}
-
-#endif
-
-static __always_inline void
-trace_event_setup(struct ring_buffer_event *event,
- int type, unsigned int trace_ctx)
-{
- struct trace_entry *ent = ring_buffer_event_data(event);
-
- tracing_generic_entry_update(ent, type, trace_ctx);
-}
-
-static __always_inline struct ring_buffer_event *
-__trace_buffer_lock_reserve(struct trace_buffer *buffer,
- int type,
- unsigned long len,
- unsigned int trace_ctx)
-{
- struct ring_buffer_event *event;
-
- event = ring_buffer_lock_reserve(buffer, len);
- if (event != NULL)
- trace_event_setup(event, type, trace_ctx);
-
- return event;
-}
-
void tracer_tracing_on(struct trace_array *tr)
{
if (tr->array_buffer.buffer)
@@ -1110,130 +806,10 @@ void tracing_on(void)
}
EXPORT_SYMBOL_GPL(tracing_on);
-
-static __always_inline void
-__buffer_unlock_commit(struct trace_buffer *buffer, struct ring_buffer_event *event)
-{
- __this_cpu_write(trace_taskinfo_save, true);
-
- /* If this is the temp buffer, we need to commit fully */
- if (this_cpu_read(trace_buffered_event) == event) {
- /* Length is in event->array[0] */
- ring_buffer_write(buffer, event->array[0], &event->array[1]);
- /* Release the temp buffer */
- this_cpu_dec(trace_buffered_event_cnt);
- /* ring_buffer_unlock_commit() enables preemption */
- preempt_enable_notrace();
- } else
- ring_buffer_unlock_commit(buffer);
-}
-
-int __trace_array_puts(struct trace_array *tr, unsigned long ip,
- const char *str, int size)
-{
- struct ring_buffer_event *event;
- struct trace_buffer *buffer;
- struct print_entry *entry;
- unsigned int trace_ctx;
- int alloc;
-
- if (!(tr->trace_flags & TRACE_ITER(PRINTK)))
- return 0;
-
- if (unlikely(tracing_selftest_running && tr == &global_trace))
- return 0;
-
- if (unlikely(tracing_disabled))
- return 0;
-
- alloc = sizeof(*entry) + size + 2; /* possible \n added */
-
- trace_ctx = tracing_gen_ctx();
- buffer = tr->array_buffer.buffer;
- guard(ring_buffer_nest)(buffer);
- event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, alloc,
- trace_ctx);
- if (!event)
- return 0;
-
- entry = ring_buffer_event_data(event);
- entry->ip = ip;
-
- memcpy(&entry->buf, str, size);
-
- /* Add a newline if necessary */
- if (entry->buf[size - 1] != '\n') {
- entry->buf[size] = '\n';
- entry->buf[size + 1] = '\0';
- } else
- entry->buf[size] = '\0';
-
- __buffer_unlock_commit(buffer, event);
- ftrace_trace_stack(tr, buffer, trace_ctx, 4, NULL);
- return size;
-}
-EXPORT_SYMBOL_GPL(__trace_array_puts);
-
-/**
- * __trace_puts - write a constant string into the trace buffer.
- * @ip: The address of the caller
- * @str: The constant string to write
- * @size: The size of the string.
- */
-int __trace_puts(unsigned long ip, const char *str, int size)
-{
- return __trace_array_puts(printk_trace, ip, str, size);
-}
-EXPORT_SYMBOL_GPL(__trace_puts);
-
-/**
- * __trace_bputs - write the pointer to a constant string into trace buffer
- * @ip: The address of the caller
- * @str: The constant string to write to the buffer to
- */
-int __trace_bputs(unsigned long ip, const char *str)
-{
- struct trace_array *tr = READ_ONCE(printk_trace);
- struct ring_buffer_event *event;
- struct trace_buffer *buffer;
- struct bputs_entry *entry;
- unsigned int trace_ctx;
- int size = sizeof(struct bputs_entry);
-
- if (!printk_binsafe(tr))
- return __trace_puts(ip, str, strlen(str));
-
- if (!(tr->trace_flags & TRACE_ITER(PRINTK)))
- return 0;
-
- if (unlikely(tracing_selftest_running || tracing_disabled))
- return 0;
-
- trace_ctx = tracing_gen_ctx();
- buffer = tr->array_buffer.buffer;
-
- guard(ring_buffer_nest)(buffer);
- event = __trace_buffer_lock_reserve(buffer, TRACE_BPUTS, size,
- trace_ctx);
- if (!event)
- return 0;
-
- entry = ring_buffer_event_data(event);
- entry->ip = ip;
- entry->str = str;
-
- __buffer_unlock_commit(buffer, event);
- ftrace_trace_stack(tr, buffer, trace_ctx, 4, NULL);
-
- return 1;
-}
-EXPORT_SYMBOL_GPL(__trace_bputs);
-
#ifdef CONFIG_TRACER_SNAPSHOT
static void tracing_snapshot_instance_cond(struct trace_array *tr,
void *cond_data)
{
- struct tracer *tracer = tr->current_trace;
unsigned long flags;
if (in_nmi()) {
@@ -1249,15 +825,15 @@ static void tracing_snapshot_instance_cond(struct trace_array *tr,
return;
}
- /* Note, snapshot can not be used when the tracer uses it */
- if (tracer->use_max_tr) {
- trace_array_puts(tr, "*** LATENCY TRACER ACTIVE ***\n");
+ if (tr->mapped) {
+ trace_array_puts(tr, "*** BUFFER MEMORY MAPPED ***\n");
trace_array_puts(tr, "*** Can not use snapshot (sorry) ***\n");
return;
}
- if (tr->mapped) {
- trace_array_puts(tr, "*** BUFFER MEMORY MAPPED ***\n");
+ /* Note, snapshot can not be used when the tracer uses it */
+ if (tracer_uses_snapshot(tr->current_trace)) {
+ trace_array_puts(tr, "*** LATENCY TRACER ACTIVE ***\n");
trace_array_puts(tr, "*** Can not use snapshot (sorry) ***\n");
return;
}
@@ -1357,12 +933,12 @@ int tracing_alloc_snapshot_instance(struct trace_array *tr)
/* Make the snapshot buffer have the same order as main buffer */
order = ring_buffer_subbuf_order_get(tr->array_buffer.buffer);
- ret = ring_buffer_subbuf_order_set(tr->max_buffer.buffer, order);
+ ret = ring_buffer_subbuf_order_set(tr->snapshot_buffer.buffer, order);
if (ret < 0)
return ret;
/* allocate spare buffer */
- ret = resize_buffer_duplicate_size(&tr->max_buffer,
+ ret = resize_buffer_duplicate_size(&tr->snapshot_buffer,
&tr->array_buffer, RING_BUFFER_ALL_CPUS);
if (ret < 0)
return ret;
@@ -1380,10 +956,10 @@ static void free_snapshot(struct trace_array *tr)
* The max_tr ring buffer has some state (e.g. ring->clock) and
* we want preserve it.
*/
- ring_buffer_subbuf_order_set(tr->max_buffer.buffer, 0);
- ring_buffer_resize(tr->max_buffer.buffer, 1, RING_BUFFER_ALL_CPUS);
- set_buffer_entries(&tr->max_buffer, 1);
- tracing_reset_online_cpus(&tr->max_buffer);
+ ring_buffer_subbuf_order_set(tr->snapshot_buffer.buffer, 0);
+ ring_buffer_resize(tr->snapshot_buffer.buffer, 1, RING_BUFFER_ALL_CPUS);
+ set_buffer_entries(&tr->snapshot_buffer, 1);
+ tracing_reset_online_cpus(&tr->snapshot_buffer);
tr->allocated_snapshot = false;
}
@@ -1499,7 +1075,7 @@ int tracing_snapshot_cond_enable(struct trace_array *tr, void *cond_data,
guard(mutex)(&trace_types_lock);
- if (tr->current_trace->use_max_tr)
+ if (tracer_uses_snapshot(tr->current_trace))
return -EBUSY;
/*
@@ -1666,9 +1242,18 @@ EXPORT_SYMBOL_GPL(tracing_off);
void disable_trace_on_warning(void)
{
if (__disable_trace_on_warning) {
+ struct trace_array *tr = READ_ONCE(printk_trace);
+
trace_array_printk_buf(global_trace.array_buffer.buffer, _THIS_IP_,
"Disabling tracing due to warning\n");
tracing_off();
+
+ /* Disable trace_printk() buffer too */
+ if (tr != &global_trace) {
+ trace_array_printk_buf(tr->array_buffer.buffer, _THIS_IP_,
+ "Disabling tracing due to warning\n");
+ tracer_tracing_off(tr);
+ }
}
}
@@ -1903,10 +1488,7 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt)
unsigned long __read_mostly tracing_thresh;
#ifdef CONFIG_TRACER_MAX_TRACE
-static const struct file_operations tracing_max_lat_fops;
-
#ifdef LATENCY_FS_NOTIFY
-
static struct workqueue_struct *fsnotify_wq;
static void latency_fsnotify_workfn(struct work_struct *work)
@@ -1923,17 +1505,6 @@ static void latency_fsnotify_workfn_irq(struct irq_work *iwork)
queue_work(fsnotify_wq, &tr->fsnotify_work);
}
-static void trace_create_maxlat_file(struct trace_array *tr,
- struct dentry *d_tracer)
-{
- INIT_WORK(&tr->fsnotify_work, latency_fsnotify_workfn);
- init_irq_work(&tr->fsnotify_irqwork, latency_fsnotify_workfn_irq);
- tr->d_max_latency = trace_create_file("tracing_max_latency",
- TRACE_MODE_WRITE,
- d_tracer, tr,
- &tracing_max_lat_fops);
-}
-
__init static int latency_fsnotify_init(void)
{
fsnotify_wq = alloc_workqueue("tr_max_lat_wq",
@@ -1958,14 +1529,22 @@ void latency_fsnotify(struct trace_array *tr)
*/
irq_work_queue(&tr->fsnotify_irqwork);
}
+#endif /* !LATENCY_FS_NOTIFY */
-#else /* !LATENCY_FS_NOTIFY */
-
-#define trace_create_maxlat_file(tr, d_tracer) \
- trace_create_file("tracing_max_latency", TRACE_MODE_WRITE, \
- d_tracer, tr, &tracing_max_lat_fops)
+static const struct file_operations tracing_max_lat_fops;
+static void trace_create_maxlat_file(struct trace_array *tr,
+ struct dentry *d_tracer)
+{
+#ifdef LATENCY_FS_NOTIFY
+ INIT_WORK(&tr->fsnotify_work, latency_fsnotify_workfn);
+ init_irq_work(&tr->fsnotify_irqwork, latency_fsnotify_workfn_irq);
#endif
+ tr->d_max_latency = trace_create_file("tracing_max_latency",
+ TRACE_MODE_WRITE,
+ d_tracer, tr,
+ &tracing_max_lat_fops);
+}
/*
* Copy the new maximum trace into the separate maximum-trace
@@ -1976,8 +1555,8 @@ static void
__update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
{
struct array_buffer *trace_buf = &tr->array_buffer;
- struct array_buffer *max_buf = &tr->max_buffer;
struct trace_array_cpu *data = per_cpu_ptr(trace_buf->data, cpu);
+ struct array_buffer *max_buf = &tr->snapshot_buffer;
struct trace_array_cpu *max_data = per_cpu_ptr(max_buf->data, cpu);
max_buf->cpu = cpu;
@@ -2006,7 +1585,14 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
tracing_record_cmdline(tsk);
latency_fsnotify(tr);
}
+#else
+static inline void trace_create_maxlat_file(struct trace_array *tr,
+ struct dentry *d_tracer) { }
+static inline void __update_max_tr(struct trace_array *tr,
+ struct task_struct *tsk, int cpu) { }
+#endif /* CONFIG_TRACER_MAX_TRACE */
+#ifdef CONFIG_TRACER_SNAPSHOT
/**
* update_max_tr - snapshot all trace buffers from global_trace to max_tr
* @tr: tracer
@@ -2036,17 +1622,16 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu,
/* Inherit the recordable setting from array_buffer */
if (ring_buffer_record_is_set_on(tr->array_buffer.buffer))
- ring_buffer_record_on(tr->max_buffer.buffer);
+ ring_buffer_record_on(tr->snapshot_buffer.buffer);
else
- ring_buffer_record_off(tr->max_buffer.buffer);
+ ring_buffer_record_off(tr->snapshot_buffer.buffer);
-#ifdef CONFIG_TRACER_SNAPSHOT
if (tr->cond_snapshot && !tr->cond_snapshot->update(tr, cond_data)) {
arch_spin_unlock(&tr->max_lock);
return;
}
-#endif
- swap(tr->array_buffer.buffer, tr->max_buffer.buffer);
+
+ swap(tr->array_buffer.buffer, tr->snapshot_buffer.buffer);
__update_max_tr(tr, tsk, cpu);
@@ -2081,7 +1666,7 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
arch_spin_lock(&tr->max_lock);
- ret = ring_buffer_swap_cpu(tr->max_buffer.buffer, tr->array_buffer.buffer, cpu);
+ ret = ring_buffer_swap_cpu(tr->snapshot_buffer.buffer, tr->array_buffer.buffer, cpu);
if (ret == -EBUSY) {
/*
@@ -2091,7 +1676,7 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
* and flag that it failed.
* Another reason is resize is in progress.
*/
- trace_array_printk_buf(tr->max_buffer.buffer, _THIS_IP_,
+ trace_array_printk_buf(tr->snapshot_buffer.buffer, _THIS_IP_,
"Failed to swap buffers due to commit or resize in progress\n");
}
@@ -2100,8 +1685,7 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
__update_max_tr(tr, tsk, cpu);
arch_spin_unlock(&tr->max_lock);
}
-
-#endif /* CONFIG_TRACER_MAX_TRACE */
+#endif /* CONFIG_TRACER_SNAPSHOT */
struct pipe_wait {
struct trace_iterator *iter;
@@ -2134,13 +1718,13 @@ static int wait_on_pipe(struct trace_iterator *iter, int full)
ret = ring_buffer_wait(iter->array_buffer->buffer, iter->cpu_file, full,
wait_pipe_cond, &pwait);
-#ifdef CONFIG_TRACER_MAX_TRACE
+#ifdef CONFIG_TRACER_SNAPSHOT
/*
* Make sure this is still the snapshot buffer, as if a snapshot were
* to happen, this would now be the main buffer.
*/
if (iter->snapshot)
- iter->array_buffer = &iter->tr->max_buffer;
+ iter->array_buffer = &iter->tr->snapshot_buffer;
#endif
return ret;
}
@@ -2205,10 +1789,10 @@ static int run_tracer_selftest(struct tracer *type)
tr->current_trace_flags = type->flags ? : type->default_flags;
#ifdef CONFIG_TRACER_MAX_TRACE
- if (type->use_max_tr) {
+ if (tracer_uses_snapshot(type)) {
/* If we expanded the buffers, make sure the max is expanded too */
if (tr->ring_buffer_expanded)
- ring_buffer_resize(tr->max_buffer.buffer, trace_buf_size,
+ ring_buffer_resize(tr->snapshot_buffer.buffer, trace_buf_size,
RING_BUFFER_ALL_CPUS);
tr->allocated_snapshot = true;
}
@@ -2230,12 +1814,12 @@ static int run_tracer_selftest(struct tracer *type)
tracing_reset_online_cpus(&tr->array_buffer);
#ifdef CONFIG_TRACER_MAX_TRACE
- if (type->use_max_tr) {
+ if (tracer_uses_snapshot(type)) {
tr->allocated_snapshot = false;
/* Shrink the max buffer again */
if (tr->ring_buffer_expanded)
- ring_buffer_resize(tr->max_buffer.buffer, 1,
+ ring_buffer_resize(tr->snapshot_buffer.buffer, 1,
RING_BUFFER_ALL_CPUS);
}
#endif
@@ -2477,8 +2061,8 @@ void tracing_reset_all_online_cpus_unlocked(void)
continue;
tr->clear_trace = false;
tracing_reset_online_cpus(&tr->array_buffer);
-#ifdef CONFIG_TRACER_MAX_TRACE
- tracing_reset_online_cpus(&tr->max_buffer);
+#ifdef CONFIG_TRACER_SNAPSHOT
+ tracing_reset_online_cpus(&tr->snapshot_buffer);
#endif
}
}
@@ -2517,8 +2101,8 @@ static void tracing_start_tr(struct trace_array *tr)
if (buffer)
ring_buffer_record_enable(buffer);
-#ifdef CONFIG_TRACER_MAX_TRACE
- buffer = tr->max_buffer.buffer;
+#ifdef CONFIG_TRACER_SNAPSHOT
+ buffer = tr->snapshot_buffer.buffer;
if (buffer)
ring_buffer_record_enable(buffer);
#endif
@@ -2553,8 +2137,8 @@ static void tracing_stop_tr(struct trace_array *tr)
if (buffer)
ring_buffer_record_disable(buffer);
-#ifdef CONFIG_TRACER_MAX_TRACE
- buffer = tr->max_buffer.buffer;
+#ifdef CONFIG_TRACER_SNAPSHOT
+ buffer = tr->snapshot_buffer.buffer;
if (buffer)
ring_buffer_record_disable(buffer);
#endif
@@ -3002,10 +2586,10 @@ struct ftrace_stacks {
static DEFINE_PER_CPU(struct ftrace_stacks, ftrace_stacks);
static DEFINE_PER_CPU(int, ftrace_stack_reserve);
-static void __ftrace_trace_stack(struct trace_array *tr,
- struct trace_buffer *buffer,
- unsigned int trace_ctx,
- int skip, struct pt_regs *regs)
+void __ftrace_trace_stack(struct trace_array *tr,
+ struct trace_buffer *buffer,
+ unsigned int trace_ctx,
+ int skip, struct pt_regs *regs)
{
struct ring_buffer_event *event;
unsigned int size, nr_entries;
@@ -3088,17 +2672,6 @@ static void __ftrace_trace_stack(struct trace_array *tr,
trace_clear_recursion(bit);
}
-static inline void ftrace_trace_stack(struct trace_array *tr,
- struct trace_buffer *buffer,
- unsigned int trace_ctx,
- int skip, struct pt_regs *regs)
-{
- if (!(tr->trace_flags & TRACE_ITER(STACKTRACE)))
- return;
-
- __ftrace_trace_stack(tr, buffer, trace_ctx, skip, regs);
-}
-
void __trace_stack(struct trace_array *tr, unsigned int trace_ctx,
int skip)
{
@@ -3233,324 +2806,6 @@ void trace_last_func_repeats(struct trace_array *tr,
__buffer_unlock_commit(buffer, event);
}
-/* created for use with alloc_percpu */
-struct trace_buffer_struct {
- int nesting;
- char buffer[4][TRACE_BUF_SIZE];
-};
-
-static struct trace_buffer_struct __percpu *trace_percpu_buffer;
-
-/*
- * This allows for lockless recording. If we're nested too deeply, then
- * this returns NULL.
- */
-static char *get_trace_buf(void)
-{
- struct trace_buffer_struct *buffer = this_cpu_ptr(trace_percpu_buffer);
-
- if (!trace_percpu_buffer || buffer->nesting >= 4)
- return NULL;
-
- buffer->nesting++;
-
- /* Interrupts must see nesting incremented before we use the buffer */
- barrier();
- return &buffer->buffer[buffer->nesting - 1][0];
-}
-
-static void put_trace_buf(void)
-{
- /* Don't let the decrement of nesting leak before this */
- barrier();
- this_cpu_dec(trace_percpu_buffer->nesting);
-}
-
-static int alloc_percpu_trace_buffer(void)
-{
- struct trace_buffer_struct __percpu *buffers;
-
- if (trace_percpu_buffer)
- return 0;
-
- buffers = alloc_percpu(struct trace_buffer_struct);
- if (MEM_FAIL(!buffers, "Could not allocate percpu trace_printk buffer"))
- return -ENOMEM;
-
- trace_percpu_buffer = buffers;
- return 0;
-}
-
-static int buffers_allocated;
-
-void trace_printk_init_buffers(void)
-{
- if (buffers_allocated)
- return;
-
- if (alloc_percpu_trace_buffer())
- return;
-
- /* trace_printk() is for debug use only. Don't use it in production. */
-
- pr_warn("\n");
- pr_warn("**********************************************************\n");
- pr_warn("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n");
- pr_warn("** **\n");
- pr_warn("** trace_printk() being used. Allocating extra memory. **\n");
- pr_warn("** **\n");
- pr_warn("** This means that this is a DEBUG kernel and it is **\n");
- pr_warn("** unsafe for production use. **\n");
- pr_warn("** **\n");
- pr_warn("** If you see this message and you are not debugging **\n");
- pr_warn("** the kernel, report this immediately to your vendor! **\n");
- pr_warn("** **\n");
- pr_warn("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n");
- pr_warn("**********************************************************\n");
-
- /* Expand the buffers to set size */
- tracing_update_buffers(&global_trace);
-
- buffers_allocated = 1;
-
- /*
- * trace_printk_init_buffers() can be called by modules.
- * If that happens, then we need to start cmdline recording
- * directly here. If the global_trace.buffer is already
- * allocated here, then this was called by module code.
- */
- if (global_trace.array_buffer.buffer)
- tracing_start_cmdline_record();
-}
-EXPORT_SYMBOL_GPL(trace_printk_init_buffers);
-
-void trace_printk_start_comm(void)
-{
- /* Start tracing comms if trace printk is set */
- if (!buffers_allocated)
- return;
- tracing_start_cmdline_record();
-}
-
-static void trace_printk_start_stop_comm(int enabled)
-{
- if (!buffers_allocated)
- return;
-
- if (enabled)
- tracing_start_cmdline_record();
- else
- tracing_stop_cmdline_record();
-}
-
-/**
- * trace_vbprintk - write binary msg to tracing buffer
- * @ip: The address of the caller
- * @fmt: The string format to write to the buffer
- * @args: Arguments for @fmt
- */
-int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
-{
- struct ring_buffer_event *event;
- struct trace_buffer *buffer;
- struct trace_array *tr = READ_ONCE(printk_trace);
- struct bprint_entry *entry;
- unsigned int trace_ctx;
- char *tbuffer;
- int len = 0, size;
-
- if (!printk_binsafe(tr))
- return trace_vprintk(ip, fmt, args);
-
- if (unlikely(tracing_selftest_running || tracing_disabled))
- return 0;
-
- /* Don't pollute graph traces with trace_vprintk internals */
- pause_graph_tracing();
-
- trace_ctx = tracing_gen_ctx();
- guard(preempt_notrace)();
-
- tbuffer = get_trace_buf();
- if (!tbuffer) {
- len = 0;
- goto out_nobuffer;
- }
-
- len = vbin_printf((u32 *)tbuffer, TRACE_BUF_SIZE/sizeof(int), fmt, args);
-
- if (len > TRACE_BUF_SIZE/sizeof(int) || len < 0)
- goto out_put;
-
- size = sizeof(*entry) + sizeof(u32) * len;
- buffer = tr->array_buffer.buffer;
- scoped_guard(ring_buffer_nest, buffer) {
- event = __trace_buffer_lock_reserve(buffer, TRACE_BPRINT, size,
- trace_ctx);
- if (!event)
- goto out_put;
- entry = ring_buffer_event_data(event);
- entry->ip = ip;
- entry->fmt = fmt;
-
- memcpy(entry->buf, tbuffer, sizeof(u32) * len);
- __buffer_unlock_commit(buffer, event);
- ftrace_trace_stack(tr, buffer, trace_ctx, 6, NULL);
- }
-out_put:
- put_trace_buf();
-
-out_nobuffer:
- unpause_graph_tracing();
-
- return len;
-}
-EXPORT_SYMBOL_GPL(trace_vbprintk);
-
-static __printf(3, 0)
-int __trace_array_vprintk(struct trace_buffer *buffer,
- unsigned long ip, const char *fmt, va_list args)
-{
- struct ring_buffer_event *event;
- int len = 0, size;
- struct print_entry *entry;
- unsigned int trace_ctx;
- char *tbuffer;
-
- if (tracing_disabled)
- return 0;
-
- /* Don't pollute graph traces with trace_vprintk internals */
- pause_graph_tracing();
-
- trace_ctx = tracing_gen_ctx();
- guard(preempt_notrace)();
-
-
- tbuffer = get_trace_buf();
- if (!tbuffer) {
- len = 0;
- goto out_nobuffer;
- }
-
- len = vscnprintf(tbuffer, TRACE_BUF_SIZE, fmt, args);
-
- size = sizeof(*entry) + len + 1;
- scoped_guard(ring_buffer_nest, buffer) {
- event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
- trace_ctx);
- if (!event)
- goto out;
- entry = ring_buffer_event_data(event);
- entry->ip = ip;
-
- memcpy(&entry->buf, tbuffer, len + 1);
- __buffer_unlock_commit(buffer, event);
- ftrace_trace_stack(printk_trace, buffer, trace_ctx, 6, NULL);
- }
-out:
- put_trace_buf();
-
-out_nobuffer:
- unpause_graph_tracing();
-
- return len;
-}
-
-int trace_array_vprintk(struct trace_array *tr,
- unsigned long ip, const char *fmt, va_list args)
-{
- if (tracing_selftest_running && tr == &global_trace)
- return 0;
-
- return __trace_array_vprintk(tr->array_buffer.buffer, ip, fmt, args);
-}
-
-/**
- * trace_array_printk - Print a message to a specific instance
- * @tr: The instance trace_array descriptor
- * @ip: The instruction pointer that this is called from.
- * @fmt: The format to print (printf format)
- *
- * If a subsystem sets up its own instance, they have the right to
- * printk strings into their tracing instance buffer using this
- * function. Note, this function will not write into the top level
- * buffer (use trace_printk() for that), as writing into the top level
- * buffer should only have events that can be individually disabled.
- * trace_printk() is only used for debugging a kernel, and should not
- * be ever incorporated in normal use.
- *
- * trace_array_printk() can be used, as it will not add noise to the
- * top level tracing buffer.
- *
- * Note, trace_array_init_printk() must be called on @tr before this
- * can be used.
- */
-int trace_array_printk(struct trace_array *tr,
- unsigned long ip, const char *fmt, ...)
-{
- int ret;
- va_list ap;
-
- if (!tr)
- return -ENOENT;
-
- /* This is only allowed for created instances */
- if (tr == &global_trace)
- return 0;
-
- if (!(tr->trace_flags & TRACE_ITER(PRINTK)))
- return 0;
-
- va_start(ap, fmt);
- ret = trace_array_vprintk(tr, ip, fmt, ap);
- va_end(ap);
- return ret;
-}
-EXPORT_SYMBOL_GPL(trace_array_printk);
-
-/**
- * trace_array_init_printk - Initialize buffers for trace_array_printk()
- * @tr: The trace array to initialize the buffers for
- *
- * As trace_array_printk() only writes into instances, they are OK to
- * have in the kernel (unlike trace_printk()). This needs to be called
- * before trace_array_printk() can be used on a trace_array.
- */
-int trace_array_init_printk(struct trace_array *tr)
-{
- if (!tr)
- return -ENOENT;
-
- /* This is only allowed for created instances */
- if (tr == &global_trace)
- return -EINVAL;
-
- return alloc_percpu_trace_buffer();
-}
-EXPORT_SYMBOL_GPL(trace_array_init_printk);
-
-int trace_array_printk_buf(struct trace_buffer *buffer,
- unsigned long ip, const char *fmt, ...)
-{
- int ret;
- va_list ap;
-
- if (!(printk_trace->trace_flags & TRACE_ITER(PRINTK)))
- return 0;
-
- va_start(ap, fmt);
- ret = __trace_array_vprintk(buffer, ip, fmt, ap);
- va_end(ap);
- return ret;
-}
-
-int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
-{
- return trace_array_vprintk(printk_trace, ip, fmt, args);
-}
-EXPORT_SYMBOL_GPL(trace_vprintk);
-
static void trace_iterator_increment(struct trace_iterator *iter)
{
struct ring_buffer_iter *buf_iter = trace_buffer_iter(iter, iter->cpu);
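
The trace_printk()/trace_array_printk() machinery removed in the hunk above includes the kernel-doc spelling out how an instance is expected to use the API: trace_array_init_printk() must succeed before trace_array_printk() may be called, and the top-level buffer is off limits. A usage sketch based on that documentation; my_tr and the message are hypothetical, and error handling is minimal:

/* Usage sketch for the instance-printk API documented in the removed
 * comments above. "my_tr" is assumed to be an instance obtained elsewhere
 * (e.g. via trace_array_get_by_name()), never &global_trace.
 */
static int my_instance_log_init(struct trace_array *my_tr)
{
	int ret;

	ret = trace_array_init_printk(my_tr);	/* allocate percpu buffers first */
	if (ret)
		return ret;

	trace_array_printk(my_tr, _THIS_IP_, "instance logging ready\n");
	return 0;
}
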
@@ -3987,10 +3242,8 @@ static void *s_start(struct seq_file *m, loff_t *pos)
}
mutex_unlock(&trace_types_lock);
-#ifdef CONFIG_TRACER_MAX_TRACE
- if (iter->snapshot && iter->trace->use_max_tr)
+ if (iter->snapshot && tracer_uses_snapshot(iter->trace))
return ERR_PTR(-EBUSY);
-#endif
if (*pos != iter->pos) {
iter->ent = NULL;
@@ -4029,10 +3282,8 @@ static void s_stop(struct seq_file *m, void *p)
{
struct trace_iterator *iter = m->private;
-#ifdef CONFIG_TRACER_MAX_TRACE
- if (iter->snapshot && iter->trace->use_max_tr)
+ if (iter->snapshot && tracer_uses_snapshot(iter->trace))
return;
-#endif
trace_access_unlock(iter->cpu_file);
trace_event_read_unlock();
@@ -4286,7 +3537,7 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
/* ftrace and system call events are still OK */
if ((event->type > __TRACE_LAST_TYPE) &&
!is_syscall_event(event))
- return print_event_fields(iter, event);
+ return print_event_fields(iter, event);
}
return event->funcs->trace(iter, sym_flags, event);
}
@@ -4509,7 +3760,7 @@ static void test_ftrace_alive(struct seq_file *m)
"# MAY BE MISSING FUNCTION EVENTS\n");
}
-#ifdef CONFIG_TRACER_MAX_TRACE
+#ifdef CONFIG_TRACER_SNAPSHOT
static void show_snapshot_main_help(struct seq_file *m)
{
seq_puts(m, "# echo 0 > snapshot : Clears and frees snapshot buffer\n"
@@ -4687,10 +3938,10 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot)
iter->tr = tr;
-#ifdef CONFIG_TRACER_MAX_TRACE
+#ifdef CONFIG_TRACER_SNAPSHOT
/* Currently only the top directory has a snapshot */
if (tr->current_trace->print_max || snapshot)
- iter->array_buffer = &tr->max_buffer;
+ iter->array_buffer = &tr->snapshot_buffer;
else
#endif
iter->array_buffer = &tr->array_buffer;
@@ -4759,11 +4010,6 @@ int tracing_open_generic(struct inode *inode, struct file *filp)
return 0;
}
-bool tracing_is_disabled(void)
-{
- return (tracing_disabled) ? true: false;
-}
-
/*
* Open and update trace_array ref count.
* Must have the current trace_array passed to it.
@@ -4881,6 +4127,8 @@ static int tracing_single_release_tr(struct inode *inode, struct file *file)
return single_release(inode, file);
}
+static bool update_last_data_if_empty(struct trace_array *tr);
+
static int tracing_open(struct inode *inode, struct file *file)
{
struct trace_array *tr = inode->i_private;
@@ -4898,13 +4146,15 @@ static int tracing_open(struct inode *inode, struct file *file)
#ifdef CONFIG_TRACER_MAX_TRACE
if (tr->current_trace->print_max)
- trace_buf = &tr->max_buffer;
+ trace_buf = &tr->snapshot_buffer;
#endif
if (cpu == RING_BUFFER_ALL_CPUS)
tracing_reset_online_cpus(trace_buf);
else
tracing_reset_cpu(trace_buf, cpu);
+
+ update_last_data_if_empty(tr);
}
if (file->f_mode & FMODE_READ) {
@@ -4929,11 +4179,9 @@ static int tracing_open(struct inode *inode, struct file *file)
static bool
trace_ok_for_array(struct tracer *t, struct trace_array *tr)
{
-#ifdef CONFIG_TRACER_SNAPSHOT
/* arrays with mapped buffer range do not have snapshots */
- if (tr->range_addr_start && t->use_max_tr)
+ if (tr->range_addr_start && tracer_uses_snapshot(t))
return false;
-#endif
return (tr->flags & TRACE_ARRAY_FL_GLOBAL) || t->allow_instances;
}
@@ -5110,15 +4358,15 @@ int tracing_set_cpumask(struct trace_array *tr,
if (cpumask_test_cpu(cpu, tr->tracing_cpumask) &&
!cpumask_test_cpu(cpu, tracing_cpumask_new)) {
ring_buffer_record_disable_cpu(tr->array_buffer.buffer, cpu);
-#ifdef CONFIG_TRACER_MAX_TRACE
- ring_buffer_record_disable_cpu(tr->max_buffer.buffer, cpu);
+#ifdef CONFIG_TRACER_SNAPSHOT
+ ring_buffer_record_disable_cpu(tr->snapshot_buffer.buffer, cpu);
#endif
}
if (!cpumask_test_cpu(cpu, tr->tracing_cpumask) &&
cpumask_test_cpu(cpu, tracing_cpumask_new)) {
ring_buffer_record_enable_cpu(tr->array_buffer.buffer, cpu);
-#ifdef CONFIG_TRACER_MAX_TRACE
- ring_buffer_record_enable_cpu(tr->max_buffer.buffer, cpu);
+#ifdef CONFIG_TRACER_SNAPSHOT
+ ring_buffer_record_enable_cpu(tr->snapshot_buffer.buffer, cpu);
#endif
}
}
@@ -5327,8 +4575,8 @@ int set_tracer_flag(struct trace_array *tr, u64 mask, int enabled)
case TRACE_ITER(OVERWRITE):
ring_buffer_change_overwrite(tr->array_buffer.buffer, enabled);
-#ifdef CONFIG_TRACER_MAX_TRACE
- ring_buffer_change_overwrite(tr->max_buffer.buffer, enabled);
+#ifdef CONFIG_TRACER_SNAPSHOT
+ ring_buffer_change_overwrite(tr->snapshot_buffer.buffer, enabled);
#endif
break;
@@ -5971,6 +5219,7 @@ tracing_set_trace_read(struct file *filp, char __user *ubuf,
int tracer_init(struct tracer *t, struct trace_array *tr)
{
tracing_reset_online_cpus(&tr->array_buffer);
+ update_last_data_if_empty(tr);
return t->init(tr);
}
@@ -5991,7 +5240,7 @@ static void update_buffer_entries(struct array_buffer *buf, int cpu)
}
}
-#ifdef CONFIG_TRACER_MAX_TRACE
+#ifdef CONFIG_TRACER_SNAPSHOT
/* resize @tr's buffer to the size of @size_tr's entries */
static int resize_buffer_duplicate_size(struct array_buffer *trace_buf,
struct array_buffer *size_buf, int cpu_id)
@@ -6017,7 +5266,7 @@ static int resize_buffer_duplicate_size(struct array_buffer *trace_buf,
return ret;
}
-#endif /* CONFIG_TRACER_MAX_TRACE */
+#endif /* CONFIG_TRACER_SNAPSHOT */
static int __tracing_resize_ring_buffer(struct trace_array *tr,
unsigned long size, int cpu)
@@ -6042,11 +5291,11 @@ static int __tracing_resize_ring_buffer(struct trace_array *tr,
if (ret < 0)
goto out_start;
-#ifdef CONFIG_TRACER_MAX_TRACE
+#ifdef CONFIG_TRACER_SNAPSHOT
if (!tr->allocated_snapshot)
goto out;
- ret = ring_buffer_resize(tr->max_buffer.buffer, size, cpu);
+ ret = ring_buffer_resize(tr->snapshot_buffer.buffer, size, cpu);
if (ret < 0) {
int r = resize_buffer_duplicate_size(&tr->array_buffer,
&tr->array_buffer, cpu);
@@ -6071,10 +5320,10 @@ static int __tracing_resize_ring_buffer(struct trace_array *tr,
goto out_start;
}
- update_buffer_entries(&tr->max_buffer, cpu);
+ update_buffer_entries(&tr->snapshot_buffer, cpu);
out:
-#endif /* CONFIG_TRACER_MAX_TRACE */
+#endif /* CONFIG_TRACER_SNAPSHOT */
update_buffer_entries(&tr->array_buffer, cpu);
out_start:
@@ -6265,6 +5514,9 @@ int tracing_update_buffers(struct trace_array *tr)
{
int ret = 0;
+ if (!tr)
+ tr = &global_trace;
+
guard(mutex)(&trace_types_lock);
update_last_data(tr);
@@ -6299,9 +5551,7 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf)
{
struct tracer *trace = NULL;
struct tracers *t;
-#ifdef CONFIG_TRACER_MAX_TRACE
bool had_max_tr;
-#endif
int ret;
guard(mutex)(&trace_types_lock);
@@ -6329,7 +5579,7 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf)
return 0;
#ifdef CONFIG_TRACER_SNAPSHOT
- if (trace->use_max_tr) {
+ if (tracer_uses_snapshot(trace)) {
local_irq_disable();
arch_spin_lock(&tr->max_lock);
ret = tr->cond_snapshot ? -EBUSY : 0;
@@ -6361,14 +5611,13 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf)
if (tr->current_trace->reset)
tr->current_trace->reset(tr);
-#ifdef CONFIG_TRACER_MAX_TRACE
- had_max_tr = tr->current_trace->use_max_tr;
+ had_max_tr = tracer_uses_snapshot(tr->current_trace);
/* Current trace needs to be nop_trace before synchronize_rcu */
tr->current_trace = &nop_trace;
tr->current_trace_flags = nop_trace.flags;
- if (had_max_tr && !trace->use_max_tr) {
+ if (had_max_tr && !tracer_uses_snapshot(trace)) {
/*
* We need to make sure that the update_max_tr sees that
* current_trace changed to nop_trace to keep it from
@@ -6381,24 +5630,19 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf)
tracing_disarm_snapshot(tr);
}
- if (!had_max_tr && trace->use_max_tr) {
+ if (!had_max_tr && tracer_uses_snapshot(trace)) {
ret = tracing_arm_snapshot_locked(tr);
if (ret)
return ret;
}
-#else
- tr->current_trace = &nop_trace;
-#endif
tr->current_trace_flags = t->flags ? : t->tracer->flags;
if (trace->init) {
ret = tracer_init(trace, tr);
if (ret) {
-#ifdef CONFIG_TRACER_MAX_TRACE
- if (trace->use_max_tr)
+ if (tracer_uses_snapshot(trace))
tracing_disarm_snapshot(tr);
-#endif
tr->current_trace_flags = nop_trace.flags;
return ret;
}
@@ -7603,7 +6847,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
unsigned long ip;
char *buf;
- if (tracing_disabled)
+ if (unlikely(tracing_disabled))
return -EINVAL;
if (!(tr->trace_flags & TRACE_ITER(MARKERS)))
@@ -7683,7 +6927,7 @@ tracing_mark_raw_write(struct file *filp, const char __user *ubuf,
ssize_t written = -ENODEV;
char *buf;
- if (tracing_disabled)
+ if (unlikely(tracing_disabled))
return -EINVAL;
if (!(tr->trace_flags & TRACE_ITER(MARKERS)))
@@ -7784,11 +7028,12 @@ int tracing_set_clock(struct trace_array *tr, const char *clockstr)
*/
tracing_reset_online_cpus(&tr->array_buffer);
-#ifdef CONFIG_TRACER_MAX_TRACE
- if (tr->max_buffer.buffer)
- ring_buffer_set_clock(tr->max_buffer.buffer, trace_clocks[i].func);
- tracing_reset_online_cpus(&tr->max_buffer);
+#ifdef CONFIG_TRACER_SNAPSHOT
+ if (tr->snapshot_buffer.buffer)
+ ring_buffer_set_clock(tr->snapshot_buffer.buffer, trace_clocks[i].func);
+ tracing_reset_online_cpus(&tr->snapshot_buffer);
#endif
+ update_last_data_if_empty(tr);
if (tr->scratch && !(tr->flags & TRACE_ARRAY_FL_LAST_BOOT)) {
struct trace_scratch *tscratch = tr->scratch;
@@ -7881,26 +7126,6 @@ u64 tracing_event_time_stamp(struct trace_buffer *buffer, struct ring_buffer_eve
return ring_buffer_event_time_stamp(buffer, rbe);
}
-/*
- * Set or disable using the per CPU trace_buffer_event when possible.
- */
-int tracing_set_filter_buffering(struct trace_array *tr, bool set)
-{
- guard(mutex)(&trace_types_lock);
-
- if (set && tr->no_filter_buffering_ref++)
- return 0;
-
- if (!set) {
- if (WARN_ON_ONCE(!tr->no_filter_buffering_ref))
- return -EINVAL;
-
- --tr->no_filter_buffering_ref;
- }
-
- return 0;
-}
-
struct ftrace_buffer_info {
struct trace_iterator iter;
void *spare;
@@ -7939,7 +7164,7 @@ static int tracing_snapshot_open(struct inode *inode, struct file *file)
ret = 0;
iter->tr = tr;
- iter->array_buffer = &tr->max_buffer;
+ iter->array_buffer = &tr->snapshot_buffer;
iter->cpu_file = tracing_get_cpu(inode);
m->private = iter;
file->private_data = m;
@@ -7976,7 +7201,7 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt,
guard(mutex)(&trace_types_lock);
- if (tr->current_trace->use_max_tr)
+ if (tracer_uses_snapshot(tr->current_trace))
return -EBUSY;
local_irq_disable();
@@ -8002,7 +7227,7 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt,
return -EINVAL;
#endif
if (tr->allocated_snapshot)
- ret = resize_buffer_duplicate_size(&tr->max_buffer,
+ ret = resize_buffer_duplicate_size(&tr->snapshot_buffer,
&tr->array_buffer, iter->cpu_file);
ret = tracing_arm_snapshot_locked(tr);
@@ -8023,9 +7248,9 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt,
default:
if (tr->allocated_snapshot) {
if (iter->cpu_file == RING_BUFFER_ALL_CPUS)
- tracing_reset_online_cpus(&tr->max_buffer);
+ tracing_reset_online_cpus(&tr->snapshot_buffer);
else
- tracing_reset_cpu(&tr->max_buffer, iter->cpu_file);
+ tracing_reset_cpu(&tr->snapshot_buffer, iter->cpu_file);
}
break;
}
@@ -8075,13 +7300,13 @@ static int snapshot_raw_open(struct inode *inode, struct file *filp)
info = filp->private_data;
- if (info->iter.trace->use_max_tr) {
+ if (tracer_uses_snapshot(info->iter.trace)) {
tracing_buffers_release(inode, filp);
return -EBUSY;
}
info->iter.snapshot = true;
- info->iter.array_buffer = &info->iter.tr->max_buffer;
+ info->iter.array_buffer = &info->iter.tr->snapshot_buffer;
return ret;
}
@@ -8631,10 +7856,8 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
if (!count)
return 0;
-#ifdef CONFIG_TRACER_MAX_TRACE
- if (iter->snapshot && iter->tr->current_trace->use_max_tr)
+ if (iter->snapshot && tracer_uses_snapshot(iter->tr->current_trace))
return -EBUSY;
-#endif
page_size = ring_buffer_subbuf_size_get(iter->array_buffer->buffer);
@@ -8818,10 +8041,8 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
int entries, i;
ssize_t ret = 0;
-#ifdef CONFIG_TRACER_MAX_TRACE
- if (iter->snapshot && iter->tr->current_trace->use_max_tr)
+ if (iter->snapshot && tracer_uses_snapshot(iter->tr->current_trace))
return -EBUSY;
-#endif
page_size = ring_buffer_subbuf_size_get(iter->array_buffer->buffer);
if (*ppos & (page_size - 1))
@@ -8955,7 +8176,7 @@ static long tracing_buffers_ioctl(struct file *file, unsigned int cmd, unsigned
return 0;
}
-#ifdef CONFIG_TRACER_MAX_TRACE
+#ifdef CONFIG_TRACER_SNAPSHOT
static int get_snapshot_map(struct trace_array *tr)
{
int err = 0;
@@ -9398,7 +8619,7 @@ tracing_init_tracefs_percpu(struct trace_array *tr, long cpu)
trace_create_cpu_file("stats", TRACE_MODE_READ, d_cpu,
tr, cpu, &tracing_stats_fops);
- trace_create_cpu_file("buffer_size_kb", TRACE_MODE_READ, d_cpu,
+ trace_create_cpu_file("buffer_size_kb", TRACE_MODE_WRITE, d_cpu,
tr, cpu, &tracing_entries_fops);
if (tr->range_addr_start)
@@ -9959,12 +9180,12 @@ buffer_subbuf_size_write(struct file *filp, const char __user *ubuf,
if (ret)
goto out;
-#ifdef CONFIG_TRACER_MAX_TRACE
+#ifdef CONFIG_TRACER_SNAPSHOT
if (!tr->allocated_snapshot)
goto out_max;
- ret = ring_buffer_subbuf_order_set(tr->max_buffer.buffer, order);
+ ret = ring_buffer_subbuf_order_set(tr->snapshot_buffer.buffer, order);
if (ret) {
/* Put back the old order */
cnt = ring_buffer_subbuf_order_set(tr->array_buffer.buffer, old_order);
@@ -10180,12 +9401,12 @@ static int allocate_trace_buffers(struct trace_array *tr, int size)
if (ret)
return ret;
-#ifdef CONFIG_TRACER_MAX_TRACE
+#ifdef CONFIG_TRACER_SNAPSHOT
/* Fix mapped buffer trace arrays do not have snapshot buffers */
if (tr->range_addr_start)
return 0;
- ret = allocate_trace_buffer(tr, &tr->max_buffer,
+ ret = allocate_trace_buffer(tr, &tr->snapshot_buffer,
allocate_snapshot ? size : 1);
if (MEM_FAIL(ret, "Failed to allocate trace buffer\n")) {
free_trace_buffer(&tr->array_buffer);
@@ -10207,8 +9428,8 @@ static void free_trace_buffers(struct trace_array *tr)
free_trace_buffer(&tr->array_buffer);
kfree(tr->module_delta);
-#ifdef CONFIG_TRACER_MAX_TRACE
- free_trace_buffer(&tr->max_buffer);
+#ifdef CONFIG_TRACER_SNAPSHOT
+ free_trace_buffer(&tr->snapshot_buffer);
#endif
}
@@ -10349,7 +9570,7 @@ trace_array_create_systems(const char *name, const char *systems,
tr->syscall_buf_sz = global_trace.syscall_buf_sz;
tr->max_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
-#ifdef CONFIG_TRACER_MAX_TRACE
+#ifdef CONFIG_TRACER_SNAPSHOT
spin_lock_init(&tr->snapshot_trigger_lock);
#endif
tr->current_trace = &nop_trace;
@@ -10674,9 +9895,7 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
create_trace_options_dir(tr);
-#ifdef CONFIG_TRACER_MAX_TRACE
trace_create_maxlat_file(tr, d_tracer);
-#endif
if (ftrace_create_function_files(tr, d_tracer))
MEM_FAIL(1, "Could not allocate function filter files");
@@ -10775,7 +9994,7 @@ int tracing_init_dentry(void)
extern struct trace_eval_map *__start_ftrace_eval_maps[];
extern struct trace_eval_map *__stop_ftrace_eval_maps[];
-static struct workqueue_struct *eval_map_wq __initdata;
+struct workqueue_struct *trace_init_wq __initdata;
static struct work_struct eval_map_work __initdata;
static struct work_struct tracerfs_init_work __initdata;
@@ -10791,15 +10010,15 @@ static int __init trace_eval_init(void)
{
INIT_WORK(&eval_map_work, eval_map_work_func);
- eval_map_wq = alloc_workqueue("eval_map_wq", WQ_UNBOUND, 0);
- if (!eval_map_wq) {
- pr_err("Unable to allocate eval_map_wq\n");
+ trace_init_wq = alloc_workqueue("trace_init_wq", WQ_UNBOUND, 0);
+ if (!trace_init_wq) {
+ pr_err("Unable to allocate trace_init_wq\n");
/* Do work here */
eval_map_work_func(&eval_map_work);
return -ENOMEM;
}
- queue_work(eval_map_wq, &eval_map_work);
+ queue_work(trace_init_wq, &eval_map_work);
return 0;
}
@@ -10808,8 +10027,8 @@ subsys_initcall(trace_eval_init);
static int __init trace_eval_sync(void)
{
/* Make sure the eval map updates are finished */
- if (eval_map_wq)
- destroy_workqueue(eval_map_wq);
+ if (trace_init_wq)
+ destroy_workqueue(trace_init_wq);
return 0;
}
@@ -10970,9 +10189,9 @@ static __init int tracer_init_tracefs(void)
if (ret)
return 0;
- if (eval_map_wq) {
+ if (trace_init_wq) {
INIT_WORK(&tracerfs_init_work, tracer_init_tracefs_work_func);
- queue_work(eval_map_wq, &tracerfs_init_work);
+ queue_work(trace_init_wq, &tracerfs_init_work);
} else {
tracer_init_tracefs_work_func(NULL);
}
@@ -11305,7 +10524,7 @@ ssize_t trace_parse_run_command(struct file *file, const char __user *buffer,
return done;
}
-#ifdef CONFIG_TRACER_MAX_TRACE
+#ifdef CONFIG_TRACER_SNAPSHOT
__init static bool tr_needs_alloc_snapshot(const char *name)
{
char *test;
@@ -11495,7 +10714,7 @@ __init static void enable_instances(void)
}
} else {
/* Only non mapped buffers have snapshot buffers */
- if (IS_ENABLED(CONFIG_TRACER_MAX_TRACE))
+ if (IS_ENABLED(CONFIG_TRACER_SNAPSHOT))
do_allocate_snapshot(name);
}
@@ -11622,7 +10841,7 @@ __init static int tracer_alloc_buffers(void)
global_trace.current_trace_flags = nop_trace.flags;
global_trace.max_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
-#ifdef CONFIG_TRACER_MAX_TRACE
+#ifdef CONFIG_TRACER_SNAPSHOT
spin_lock_init(&global_trace.snapshot_trigger_lock);
#endif
ftrace_init_global_array_ops(&global_trace);
@@ -11690,7 +10909,7 @@ struct trace_array *trace_get_global_array(void)
void __init ftrace_boot_snapshot(void)
{
-#ifdef CONFIG_TRACER_MAX_TRACE
+#ifdef CONFIG_TRACER_SNAPSHOT
struct trace_array *tr;
if (!snapshot_at_boot)
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index c11edec5d8f5..b8f3804586a0 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -131,7 +131,7 @@ enum trace_type {
#define FAULT_STRING "(fault)"
-#define HIST_STACKTRACE_DEPTH 16
+#define HIST_STACKTRACE_DEPTH 31
#define HIST_STACKTRACE_SIZE (HIST_STACKTRACE_DEPTH * sizeof(unsigned long))
#define HIST_STACKTRACE_SKIP 5
@@ -332,29 +332,33 @@ struct trace_array {
struct list_head list;
char *name;
struct array_buffer array_buffer;
-#ifdef CONFIG_TRACER_MAX_TRACE
+#ifdef CONFIG_TRACER_SNAPSHOT
/*
- * The max_buffer is used to snapshot the trace when a maximum
+ * The snapshot_buffer is used to snapshot the trace when a maximum
* latency is reached, or when the user initiates a snapshot.
* Some tracers will use this to store a maximum trace while
* it continues examining live traces.
*
- * The buffers for the max_buffer are set up the same as the array_buffer
- * When a snapshot is taken, the buffer of the max_buffer is swapped
- * with the buffer of the array_buffer and the buffers are reset for
- * the array_buffer so the tracing can continue.
+ * The buffers for the snapshot_buffer are set up the same as the
+ * array_buffer. When a snapshot is taken, the buffer of the
+ * snapshot_buffer is swapped with the buffer of the array_buffer
+ * and the buffers are reset for the array_buffer so the tracing can
+ * continue.
*/
- struct array_buffer max_buffer;
+ struct array_buffer snapshot_buffer;
bool allocated_snapshot;
spinlock_t snapshot_trigger_lock;
unsigned int snapshot;
+#ifdef CONFIG_TRACER_MAX_TRACE
unsigned long max_latency;
-#ifdef CONFIG_FSNOTIFY
struct dentry *d_max_latency;
+#ifdef CONFIG_FSNOTIFY
struct work_struct fsnotify_work;
struct irq_work fsnotify_irqwork;
-#endif
-#endif
+#endif /* CONFIG_FSNOTIFY */
+#endif /* CONFIG_TRACER_MAX_TRACE */
+#endif /* CONFIG_TRACER_SNAPSHOT */
+
/* The below is for memory mapped ring buffer */
unsigned int mapped;
unsigned long range_addr_start;
@@ -380,7 +384,7 @@ struct trace_array {
*
* It is also used in other places outside the update_max_tr
* so it needs to be defined outside of the
- * CONFIG_TRACER_MAX_TRACE.
+ * CONFIG_TRACER_SNAPSHOT.
*/
arch_spinlock_t max_lock;
#ifdef CONFIG_FTRACE_SYSCALLS
@@ -479,13 +483,14 @@ extern struct trace_array *trace_array_find(const char *instance);
extern struct trace_array *trace_array_find_get(const char *instance);
extern u64 tracing_event_time_stamp(struct trace_buffer *buffer, struct ring_buffer_event *rbe);
-extern int tracing_set_filter_buffering(struct trace_array *tr, bool set);
extern int tracing_set_clock(struct trace_array *tr, const char *clockstr);
extern bool trace_clock_in_ns(struct trace_array *tr);
extern unsigned long trace_adjust_address(struct trace_array *tr, unsigned long addr);
+extern struct trace_array *printk_trace;
+
/*
* The global tracer (top) should be the first trace array added,
* but we check the flag anyway.
@@ -661,6 +666,8 @@ trace_buffer_iter(struct trace_iterator *iter, int cpu)
return iter->buffer_iter ? iter->buffer_iter[cpu] : NULL;
}
+extern int tracing_disabled;
+
int tracer_init(struct tracer *t, struct trace_array *tr);
int tracing_is_enabled(void);
void tracing_reset_online_cpus(struct array_buffer *buf);
@@ -672,7 +679,6 @@ int tracing_release_generic_tr(struct inode *inode, struct file *file);
int tracing_open_file_tr(struct inode *inode, struct file *filp);
int tracing_release_file_tr(struct inode *inode, struct file *filp);
int tracing_single_release_file_tr(struct inode *inode, struct file *filp);
-bool tracing_is_disabled(void);
bool tracer_tracing_is_on(struct trace_array *tr);
void tracer_tracing_on(struct trace_array *tr);
void tracer_tracing_off(struct trace_array *tr);
@@ -772,6 +778,7 @@ extern cpumask_var_t __read_mostly tracing_buffer_mask;
extern unsigned long nsecs_to_usecs(unsigned long nsecs);
extern unsigned long tracing_thresh;
+extern struct workqueue_struct *trace_init_wq __initdata;
/* PID filtering */
@@ -790,22 +797,22 @@ int trace_pid_write(struct trace_pid_list *filtered_pids,
struct trace_pid_list **new_pid_list,
const char __user *ubuf, size_t cnt);
-#ifdef CONFIG_TRACER_MAX_TRACE
+#ifdef CONFIG_TRACER_SNAPSHOT
void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu,
void *cond_data);
void update_max_tr_single(struct trace_array *tr,
struct task_struct *tsk, int cpu);
-#ifdef CONFIG_FSNOTIFY
-#define LATENCY_FS_NOTIFY
+#if defined(CONFIG_TRACER_MAX_TRACE) && defined(CONFIG_FSNOTIFY)
+# define LATENCY_FS_NOTIFY
#endif
-#endif /* CONFIG_TRACER_MAX_TRACE */
#ifdef LATENCY_FS_NOTIFY
void latency_fsnotify(struct trace_array *tr);
#else
static inline void latency_fsnotify(struct trace_array *tr) { }
#endif
+#endif /* CONFIG_TRACER_SNAPSHOT */
#ifdef CONFIG_STACKTRACE
void __trace_stack(struct trace_array *tr, unsigned int trace_ctx, int skip);
@@ -816,6 +823,18 @@ static inline void __trace_stack(struct trace_array *tr, unsigned int trace_ctx,
}
#endif /* CONFIG_STACKTRACE */
+#ifdef CONFIG_TRACER_MAX_TRACE
+static inline bool tracer_uses_snapshot(struct tracer *tracer)
+{
+ return tracer->use_max_tr;
+}
+#else
+static inline bool tracer_uses_snapshot(struct tracer *tracer)
+{
+ return false;
+}
+#endif
+
void trace_last_func_repeats(struct trace_array *tr,
struct trace_func_repeats *last_info,
unsigned int trace_ctx);
@@ -865,6 +884,7 @@ extern int trace_selftest_startup_nop(struct tracer *trace,
struct trace_array *tr);
extern int trace_selftest_startup_branch(struct tracer *trace,
struct trace_array *tr);
+extern bool __read_mostly tracing_selftest_running;
/*
* Tracer data references selftest functions that only occur
* on boot up. These can be __init functions. Thus, when selftests
@@ -877,6 +897,7 @@ static inline void __init disable_tracing_selftest(const char *reason)
}
/* Tracers are seldom changed. Optimize when selftests are disabled. */
#define __tracer_data __read_mostly
+#define tracing_selftest_running 0
#endif /* CONFIG_FTRACE_STARTUP_TEST */
extern void *head_page(struct trace_array_cpu *data);
@@ -1414,6 +1435,7 @@ extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
C(COPY_MARKER, "copy_trace_marker"), \
C(PAUSE_ON_TRACE, "pause-on-trace"), \
C(HASH_PTR, "hash-ptr"), /* Print hashed pointer */ \
+ C(BITMASK_LIST, "bitmask-list"), \
FUNCTION_FLAGS \
FGRAPH_FLAGS \
STACK_FLAGS \
@@ -1567,6 +1589,47 @@ char *trace_user_fault_read(struct trace_user_buf_info *tinfo,
const char __user *ptr, size_t size,
trace_user_buf_copy copy_func, void *data);
+static __always_inline void
+trace_event_setup(struct ring_buffer_event *event,
+ int type, unsigned int trace_ctx)
+{
+ struct trace_entry *ent = ring_buffer_event_data(event);
+
+ tracing_generic_entry_update(ent, type, trace_ctx);
+}
+
+static __always_inline struct ring_buffer_event *
+__trace_buffer_lock_reserve(struct trace_buffer *buffer,
+ int type,
+ unsigned long len,
+ unsigned int trace_ctx)
+{
+ struct ring_buffer_event *event;
+
+ event = ring_buffer_lock_reserve(buffer, len);
+ if (event != NULL)
+ trace_event_setup(event, type, trace_ctx);
+
+ return event;
+}
+
+static __always_inline void
+__buffer_unlock_commit(struct trace_buffer *buffer, struct ring_buffer_event *event)
+{
+ __this_cpu_write(trace_taskinfo_save, true);
+
+ /* If this is the temp buffer, we need to commit fully */
+ if (this_cpu_read(trace_buffered_event) == event) {
+ /* Length is in event->array[0] */
+ ring_buffer_write(buffer, event->array[0], &event->array[1]);
+ /* Release the temp buffer */
+ this_cpu_dec(trace_buffered_event_cnt);
+ /* ring_buffer_unlock_commit() enables preemption */
+ preempt_enable_notrace();
+ } else
+ ring_buffer_unlock_commit(buffer);
+}
+
static inline void
__trace_event_discard_commit(struct trace_buffer *buffer,
struct ring_buffer_event *event)
@@ -2087,6 +2150,7 @@ extern const char *__stop___tracepoint_str[];
void trace_printk_control(bool enabled);
void trace_printk_start_comm(void);
+void trace_printk_start_stop_comm(int enabled);
int trace_keep_overwrite(struct tracer *tracer, u64 mask, int set);
int set_tracer_flag(struct trace_array *tr, u64 mask, int enabled);
@@ -2119,7 +2183,7 @@ extern void tracing_log_err(struct trace_array *tr,
* about performance). The internal_trace_puts() is for such
* a purpose.
*/
-#define internal_trace_puts(str) __trace_puts(_THIS_IP_, str, strlen(str))
+#define internal_trace_puts(str) __trace_puts(_THIS_IP_, str)
#undef FTRACE_ENTRY
#define FTRACE_ENTRY(call, struct_name, id, tstruct, print) \
@@ -2237,6 +2301,37 @@ static inline void sanitize_event_name(char *name)
*name = '_';
}
+#ifdef CONFIG_STACKTRACE
+void __ftrace_trace_stack(struct trace_array *tr,
+ struct trace_buffer *buffer,
+ unsigned int trace_ctx,
+ int skip, struct pt_regs *regs);
+
+static __always_inline void ftrace_trace_stack(struct trace_array *tr,
+ struct trace_buffer *buffer,
+ unsigned int trace_ctx,
+ int skip, struct pt_regs *regs)
+{
+ if (!(tr->trace_flags & TRACE_ITER(STACKTRACE)))
+ return;
+
+ __ftrace_trace_stack(tr, buffer, trace_ctx, skip, regs);
+}
+#else
+static inline void __ftrace_trace_stack(struct trace_array *tr,
+ struct trace_buffer *buffer,
+ unsigned int trace_ctx,
+ int skip, struct pt_regs *regs)
+{
+}
+static inline void ftrace_trace_stack(struct trace_array *tr,
+ struct trace_buffer *buffer,
+ unsigned long trace_ctx,
+ int skip, struct pt_regs *regs)
+{
+}
+#endif
+
/*
* This is a generic way to read and write a u64 value from a file in tracefs.
*
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 137b4d9bb116..61fe01dce7a6 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -649,6 +649,22 @@ bool trace_event_ignore_this_pid(struct trace_event_file *trace_file)
}
EXPORT_SYMBOL_GPL(trace_event_ignore_this_pid);
+/**
+ * trace_event_buffer_reserve - reserve space on the ring buffer for an event
+ * @fbuffer: information about how to save the event
+ * @trace_file: the instance file descriptor for the event
+ * @len: The length of the event
+ *
+ * The @fbuffer has information about the ring buffer and data will
+ * be added to it to be used by the call to trace_event_buffer_commit().
+ * The @trace_file is the descriptor with information about the status
+ * of the given event for a specific trace_array instance.
+ * The @len is the length of data to save for the event.
+ *
+ * Returns a pointer to the data on the ring buffer or NULL if the
+ * event was not reserved (event was filtered, too big, or the buffer
+ * simply was disabled for write).
+ */
void *trace_event_buffer_reserve(struct trace_event_buffer *fbuffer,
struct trace_event_file *trace_file,
unsigned long len)
@@ -1662,6 +1678,82 @@ static void t_stop(struct seq_file *m, void *p)
mutex_unlock(&event_mutex);
}
+static int get_call_len(struct trace_event_call *call)
+{
+ int len;
+
+ /* Get the length of "<system>:<event>" */
+ len = strlen(call->class->system) + 1;
+ len += strlen(trace_event_name(call));
+
+	/* Return padding so the data starts at column 32 (at least one space) */
+ return len >= 32 ? 1 : 32 - len;
+}
+
+/**
+ * t_show_filters - seq_file callback to display active event filters
+ * @m: The seq_file interface for formatted output
+ * @v: The current trace_event_file being iterated
+ *
+ * Identifies active filters for the current event file in the
+ * iteration. Checks whether a filter is applied to the current event
+ * and, if so, prints the system name, event name, and the filter string.
+ */
+static int t_show_filters(struct seq_file *m, void *v)
+{
+ struct trace_event_file *file = v;
+ struct trace_event_call *call = file->event_call;
+ struct event_filter *filter;
+ int len;
+
+ guard(rcu)();
+ filter = rcu_dereference(file->filter);
+ if (!filter || !filter->filter_string)
+ return 0;
+
+ len = get_call_len(call);
+
+ seq_printf(m, "%s:%s%*.s%s\n", call->class->system,
+ trace_event_name(call), len, "", filter->filter_string);
+
+ return 0;
+}
+
+/**
+ * t_show_triggers - seq_file callback to display active event triggers
+ * @m: The seq_file interface for formatted output
+ * @v: The current trace_event_file being iterated
+ *
+ * Iterates through the trigger list of the current event file and prints
+ * each active trigger's configuration using its associated print
+ * operation.
+ */
+static int t_show_triggers(struct seq_file *m, void *v)
+{
+ struct trace_event_file *file = v;
+ struct trace_event_call *call = file->event_call;
+ struct event_trigger_data *data;
+ int len;
+
+ /*
+ * The event_mutex is held by t_start(), protecting the
+ * file->triggers list traversal.
+ */
+ if (list_empty(&file->triggers))
+ return 0;
+
+ len = get_call_len(call);
+
+ list_for_each_entry_rcu(data, &file->triggers, list) {
+ seq_printf(m, "%s:%s%*.s", call->class->system,
+ trace_event_name(call), len, "");
+
+ data->cmd_ops->print(m, data);
+ }
+
+ return 0;
+}
+
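
The get_call_len() helper above computes how many spaces are needed so that the filter or trigger text starts at column 32, and the seq_printf() calls consume that count through a "%*.s" width specifier. A minimal user-space sketch of the same alignment trick (the event names and filter strings here are made up for illustration, not taken from the patch):

#include <stdio.h>
#include <string.h>

/* Mirror of get_call_len(): pad to column 32, keep at least one space. */
static int pad_len(const char *system, const char *event)
{
	int len = strlen(system) + 1 + strlen(event);

	return len >= 32 ? 1 : 32 - len;
}

int main(void)
{
	const char *rows[][3] = {
		{ "sched", "sched_switch", "prev_prio < 100" },
		{ "irq", "irq_handler_entry", "irq == 30" },
	};

	for (int i = 0; i < 2; i++) {
		const char **r = rows[i];

		/* "%*.s" prints an empty string padded to the given width. */
		printf("%s:%s%*.s%s\n", r[0], r[1],
		       pad_len(r[0], r[1]), "", r[2]);
	}
	return 0;
}
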
#ifdef CONFIG_MODULES
static int s_show(struct seq_file *m, void *v)
{
@@ -2176,7 +2268,7 @@ static int subsystem_open(struct inode *inode, struct file *filp)
struct event_subsystem *system = NULL;
int ret;
- if (tracing_is_disabled())
+ if (unlikely(tracing_disabled))
return -ENODEV;
/* Make sure the system still exists */
@@ -2489,6 +2581,8 @@ ftrace_event_npid_write(struct file *filp, const char __user *ubuf,
static int ftrace_event_avail_open(struct inode *inode, struct file *file);
static int ftrace_event_set_open(struct inode *inode, struct file *file);
+static int ftrace_event_show_filters_open(struct inode *inode, struct file *file);
+static int ftrace_event_show_triggers_open(struct inode *inode, struct file *file);
static int ftrace_event_set_pid_open(struct inode *inode, struct file *file);
static int ftrace_event_set_npid_open(struct inode *inode, struct file *file);
static int ftrace_event_release(struct inode *inode, struct file *file);
@@ -2507,6 +2601,20 @@ static const struct seq_operations show_set_event_seq_ops = {
.stop = s_stop,
};
+static const struct seq_operations show_show_event_filters_seq_ops = {
+ .start = t_start,
+ .next = t_next,
+ .show = t_show_filters,
+ .stop = t_stop,
+};
+
+static const struct seq_operations show_show_event_triggers_seq_ops = {
+ .start = t_start,
+ .next = t_next,
+ .show = t_show_triggers,
+ .stop = t_stop,
+};
+
static const struct seq_operations show_set_pid_seq_ops = {
.start = p_start,
.next = p_next,
@@ -2536,6 +2644,20 @@ static const struct file_operations ftrace_set_event_fops = {
.release = ftrace_event_release,
};
+static const struct file_operations ftrace_show_event_filters_fops = {
+ .open = ftrace_event_show_filters_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static const struct file_operations ftrace_show_event_triggers_fops = {
+ .open = ftrace_event_show_triggers_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
static const struct file_operations ftrace_set_event_pid_fops = {
.open = ftrace_event_set_pid_open,
.read = seq_read,
@@ -2680,6 +2802,34 @@ ftrace_event_set_open(struct inode *inode, struct file *file)
return ret;
}
+/**
+ * ftrace_event_show_filters_open - open interface for show_event_filters
+ * @inode: The inode of the file
+ * @file: The file being opened
+ *
+ * Connects the show_event_filters file to the sequence operations
+ * required to iterate over and display active event filters.
+ */
+static int
+ftrace_event_show_filters_open(struct inode *inode, struct file *file)
+{
+ return ftrace_event_open(inode, file, &show_show_event_filters_seq_ops);
+}
+
+/**
+ * ftrace_event_show_triggers_open - open interface for show_event_triggers
+ * @inode: The inode of the file
+ * @file: The file being opened
+ *
+ * Connects the show_event_triggers file to the sequence operations
+ * required to iterate over and display active event triggers.
+ */
+static int
+ftrace_event_show_triggers_open(struct inode *inode, struct file *file)
+{
+ return ftrace_event_open(inode, file, &show_show_event_triggers_seq_ops);
+}
+
static int
ftrace_event_set_pid_open(struct inode *inode, struct file *file)
{
@@ -3963,11 +4113,6 @@ void trace_put_event_file(struct trace_event_file *file)
EXPORT_SYMBOL_GPL(trace_put_event_file);
#ifdef CONFIG_DYNAMIC_FTRACE
-
-/* Avoid typos */
-#define ENABLE_EVENT_STR "enable_event"
-#define DISABLE_EVENT_STR "disable_event"
-
struct event_probe_data {
struct trace_event_file *file;
unsigned long count;
@@ -4400,6 +4545,12 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr)
if (!entry)
return -ENOMEM;
+ trace_create_file("show_event_filters", TRACE_MODE_READ, parent, tr,
+ &ftrace_show_event_filters_fops);
+
+ trace_create_file("show_event_triggers", TRACE_MODE_READ, parent, tr,
+ &ftrace_show_event_triggers_fops);
+
nr_entries = ARRAY_SIZE(events_entries);
e_events = eventfs_create_events_dir("events", parent, events_entries,
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 385af8405392..7001e34476ee 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -1375,7 +1375,7 @@ static void free_filter_list_tasks(struct rcu_head *rhp)
struct filter_head *filter_list = container_of(rhp, struct filter_head, rcu);
INIT_RCU_WORK(&filter_list->rwork, free_filter_list_work);
- queue_rcu_work(system_wq, &filter_list->rwork);
+ queue_rcu_work(system_dfl_wq, &filter_list->rwork);
}
/*
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index c97bb2fda5c0..e6f449f53afc 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -105,38 +105,44 @@ enum field_op_id {
FIELD_OP_MULT,
};
+#define FIELD_FUNCS \
+ C(NOP, "nop"), \
+ C(VAR_REF, "var_ref"), \
+ C(COUNTER, "counter"), \
+ C(CONST, "const"), \
+ C(LOG2, "log2"), \
+ C(BUCKET, "bucket"), \
+ C(TIMESTAMP, "timestamp"), \
+ C(CPU, "cpu"), \
+ C(COMM, "comm"), \
+ C(STRING, "string"), \
+ C(DYNSTRING, "dynstring"), \
+ C(RELDYNSTRING, "reldynstring"), \
+ C(PSTRING, "pstring"), \
+ C(S64, "s64"), \
+ C(U64, "u64"), \
+ C(S32, "s32"), \
+ C(U32, "u32"), \
+ C(S16, "s16"), \
+ C(U16, "u16"), \
+ C(S8, "s8"), \
+ C(U8, "u8"), \
+ C(UMINUS, "uminus"), \
+ C(MINUS, "minus"), \
+ C(PLUS, "plus"), \
+ C(DIV, "div"), \
+ C(MULT, "mult"), \
+ C(DIV_POWER2, "div_power2"), \
+ C(DIV_NOT_POWER2, "div_not_power2"), \
+ C(DIV_MULT_SHIFT, "div_mult_shift"), \
+ C(EXECNAME, "execname"), \
+ C(STACK, "stack"),
+
+#undef C
+#define C(a, b) HIST_FIELD_FN_##a
+
enum hist_field_fn {
- HIST_FIELD_FN_NOP,
- HIST_FIELD_FN_VAR_REF,
- HIST_FIELD_FN_COUNTER,
- HIST_FIELD_FN_CONST,
- HIST_FIELD_FN_LOG2,
- HIST_FIELD_FN_BUCKET,
- HIST_FIELD_FN_TIMESTAMP,
- HIST_FIELD_FN_CPU,
- HIST_FIELD_FN_COMM,
- HIST_FIELD_FN_STRING,
- HIST_FIELD_FN_DYNSTRING,
- HIST_FIELD_FN_RELDYNSTRING,
- HIST_FIELD_FN_PSTRING,
- HIST_FIELD_FN_S64,
- HIST_FIELD_FN_U64,
- HIST_FIELD_FN_S32,
- HIST_FIELD_FN_U32,
- HIST_FIELD_FN_S16,
- HIST_FIELD_FN_U16,
- HIST_FIELD_FN_S8,
- HIST_FIELD_FN_U8,
- HIST_FIELD_FN_UMINUS,
- HIST_FIELD_FN_MINUS,
- HIST_FIELD_FN_PLUS,
- HIST_FIELD_FN_DIV,
- HIST_FIELD_FN_MULT,
- HIST_FIELD_FN_DIV_POWER2,
- HIST_FIELD_FN_DIV_NOT_POWER2,
- HIST_FIELD_FN_DIV_MULT_SHIFT,
- HIST_FIELD_FN_EXECNAME,
- HIST_FIELD_FN_STACK,
+ FIELD_FUNCS
};
/*
@@ -3157,7 +3163,7 @@ static inline void __update_field_vars(struct tracing_map_elt *elt,
u64 var_val;
/* Make sure stacktrace can fit in the string variable length */
- BUILD_BUG_ON((HIST_STACKTRACE_DEPTH + 1) * sizeof(long) >= STR_VAR_LEN_MAX);
+ BUILD_BUG_ON((HIST_STACKTRACE_DEPTH + 1) * sizeof(long) > STR_VAR_LEN_MAX);
for (i = 0, j = field_var_str_start; i < n_field_vars; i++) {
struct field_var *field_var = field_vars[i];
@@ -5854,6 +5860,12 @@ const struct file_operations event_hist_fops = {
};
#ifdef CONFIG_HIST_TRIGGERS_DEBUG
+
+#undef C
+#define C(a, b) b
+
+static const char * const field_funcs[] = { FIELD_FUNCS };
+
static void hist_field_debug_show_flags(struct seq_file *m,
unsigned long flags)
{
@@ -5918,6 +5930,7 @@ static int hist_field_debug_show(struct seq_file *m,
seq_printf(m, " type: %s\n", field->type);
seq_printf(m, " size: %u\n", field->size);
seq_printf(m, " is_signed: %u\n", field->is_signed);
+ seq_printf(m, " function: hist_field_%s()\n", field_funcs[field->fn_num]);
return 0;
}
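
The FIELD_FUNCS list above is an X-macro: the same C(a, b) entries expand once into the hist_field_fn enum and once, under CONFIG_HIST_TRIGGERS_DEBUG, into the field_funcs[] name table, so the two can never drift out of sync. A stripped-down sketch of the pattern (the OPS list and names here are invented for illustration):

#include <stdio.h>

#define OPS		\
	C(NOP, "nop"),	\
	C(PLUS, "plus"),\
	C(MULT, "mult"),

/* First expansion: the enumerators. */
#undef C
#define C(a, b) OP_##a
enum op { OPS };

/* Second expansion: matching name strings, indexable by the enum. */
#undef C
#define C(a, b) b
static const char * const op_names[] = { OPS };

int main(void)
{
	enum op fn = OP_MULT;

	printf("op %d is \"%s\"\n", fn, op_names[fn]);
	return 0;
}
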
@@ -6518,6 +6531,26 @@ static bool existing_hist_update_only(char *glob,
return updated;
}
+/*
+ * Set or disable using the per CPU trace_buffer_event when possible.
+ */
+static int tracing_set_filter_buffering(struct trace_array *tr, bool set)
+{
+ guard(mutex)(&trace_types_lock);
+
+ if (set && tr->no_filter_buffering_ref++)
+ return 0;
+
+ if (!set) {
+ if (WARN_ON_ONCE(!tr->no_filter_buffering_ref))
+ return -EINVAL;
+
+ --tr->no_filter_buffering_ref;
+ }
+
+ return 0;
+}
+
static int hist_register_trigger(char *glob,
struct event_trigger_data *data,
struct trace_event_file *file)
@@ -6907,11 +6940,9 @@ static int event_hist_trigger_parse(struct event_command *cmd_ops,
out_unreg:
event_trigger_unregister(cmd_ops, file, glob+1, trigger_data);
out_free:
- event_trigger_reset_filter(cmd_ops, trigger_data);
-
remove_hist_vars(hist_data);
- kfree(trigger_data);
+ trigger_data_free(trigger_data);
destroy_hist_data(hist_data);
goto out;
diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c
index 45c187e77e21..ce42fbf16f4a 100644
--- a/kernel/trace/trace_events_synth.c
+++ b/kernel/trace/trace_events_synth.c
@@ -499,9 +499,9 @@ static unsigned int trace_stack(struct synth_trace_event *entry,
return len;
}
-static notrace void trace_event_raw_event_synth(void *__data,
- u64 *var_ref_vals,
- unsigned int *var_ref_idx)
+static void trace_event_raw_event_synth(void *__data,
+ u64 *var_ref_vals,
+ unsigned int *var_ref_idx)
{
unsigned int i, n_u64, val_idx, len, data_size = 0;
struct trace_event_file *trace_file = __data;
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index 06b75bcfc7b8..7fa26327c9c7 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -1347,18 +1347,13 @@ traceon_trigger(struct event_trigger_data *data,
{
struct trace_event_file *file = data->private_data;
- if (file) {
- if (tracer_tracing_is_on(file->tr))
- return;
-
- tracer_tracing_on(file->tr);
+ if (WARN_ON_ONCE(!file))
return;
- }
- if (tracing_is_on())
+ if (tracer_tracing_is_on(file->tr))
return;
- tracing_on();
+ tracer_tracing_on(file->tr);
}
static bool
@@ -1368,13 +1363,11 @@ traceon_count_func(struct event_trigger_data *data,
{
struct trace_event_file *file = data->private_data;
- if (file) {
- if (tracer_tracing_is_on(file->tr))
- return false;
- } else {
- if (tracing_is_on())
- return false;
- }
+ if (WARN_ON_ONCE(!file))
+ return false;
+
+ if (tracer_tracing_is_on(file->tr))
+ return false;
if (!data->count)
return false;
@@ -1392,18 +1385,13 @@ traceoff_trigger(struct event_trigger_data *data,
{
struct trace_event_file *file = data->private_data;
- if (file) {
- if (!tracer_tracing_is_on(file->tr))
- return;
-
- tracer_tracing_off(file->tr);
+ if (WARN_ON_ONCE(!file))
return;
- }
- if (!tracing_is_on())
+ if (!tracer_tracing_is_on(file->tr))
return;
- tracing_off();
+ tracer_tracing_off(file->tr);
}
static bool
@@ -1413,13 +1401,11 @@ traceoff_count_func(struct event_trigger_data *data,
{
struct trace_event_file *file = data->private_data;
- if (file) {
- if (!tracer_tracing_is_on(file->tr))
- return false;
- } else {
- if (!tracing_is_on())
- return false;
- }
+ if (WARN_ON_ONCE(!file))
+ return false;
+
+ if (!tracer_tracing_is_on(file->tr))
+ return false;
if (!data->count)
return false;
@@ -1481,10 +1467,10 @@ snapshot_trigger(struct event_trigger_data *data,
{
struct trace_event_file *file = data->private_data;
- if (file)
- tracing_snapshot_instance(file->tr);
- else
- tracing_snapshot();
+ if (WARN_ON_ONCE(!file))
+ return;
+
+ tracing_snapshot_instance(file->tr);
}
static int
@@ -1570,10 +1556,10 @@ stacktrace_trigger(struct event_trigger_data *data,
{
struct trace_event_file *file = data->private_data;
- if (file)
- __trace_stack(file->tr, tracing_gen_ctx_dec(), STACK_SKIP);
- else
- trace_dump_stack(STACK_SKIP);
+ if (WARN_ON_ONCE(!file))
+ return;
+
+ __trace_stack(file->tr, tracing_gen_ctx_dec(), STACK_SKIP);
}
static int
diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c
index 2f7b94e98317..3fe274b84f1c 100644
--- a/kernel/trace/trace_hwlat.c
+++ b/kernel/trace/trace_hwlat.c
@@ -102,9 +102,9 @@ struct hwlat_sample {
/* keep the global state somewhere. */
static struct hwlat_data {
- struct mutex lock; /* protect changes */
+ struct mutex lock; /* protect changes */
- u64 count; /* total since reset */
+ atomic64_t count; /* total since reset */
u64 sample_window; /* total sampling window (on+off) */
u64 sample_width; /* active sampling portion of window */
@@ -193,8 +193,7 @@ void trace_hwlat_callback(bool enter)
* get_sample - sample the CPU TSC and look for likely hardware latencies
*
* Used to repeatedly capture the CPU TSC (or similar), looking for potential
- * hardware-induced latency. Called with interrupts disabled and with
- * hwlat_data.lock held.
+ * hardware-induced latency. Called with interrupts disabled.
*/
static int get_sample(void)
{
@@ -204,6 +203,7 @@ static int get_sample(void)
time_type start, t1, t2, last_t2;
s64 diff, outer_diff, total, last_total = 0;
u64 sample = 0;
+ u64 sample_width = READ_ONCE(hwlat_data.sample_width);
u64 thresh = tracing_thresh;
u64 outer_sample = 0;
int ret = -1;
@@ -267,7 +267,7 @@ static int get_sample(void)
if (diff > sample)
sample = diff; /* only want highest value */
- } while (total <= hwlat_data.sample_width);
+ } while (total <= sample_width);
barrier(); /* finish the above in the view for NMIs */
trace_hwlat_callback_enabled = false;
@@ -285,8 +285,7 @@ static int get_sample(void)
if (kdata->nmi_total_ts)
do_div(kdata->nmi_total_ts, NSEC_PER_USEC);
- hwlat_data.count++;
- s.seqnum = hwlat_data.count;
+ s.seqnum = atomic64_inc_return(&hwlat_data.count);
s.duration = sample;
s.outer_duration = outer_sample;
s.nmi_total_ts = kdata->nmi_total_ts;
@@ -832,7 +831,7 @@ static int hwlat_tracer_init(struct trace_array *tr)
hwlat_trace = tr;
- hwlat_data.count = 0;
+ atomic64_set(&hwlat_data.count, 0);
tr->max_latency = 0;
save_tracing_thresh = tracing_thresh;
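
The hwlat change above stops relying on hwlat_data.lock to serialize the sample counter and instead derives each sample's seqnum from an atomic increment, so per-CPU sampling threads can number samples without sharing a lock. A user-space sketch of the same idea with C11 atomics (thread names and iteration counts are illustrative):

#include <stdatomic.h>
#include <stdio.h>
#include <pthread.h>

static atomic_ulong seqnum;	/* total since reset, like hwlat_data.count */

static void *sampler(void *name)
{
	for (int i = 0; i < 3; i++) {
		/* Counterpart of atomic64_inc_return(): unique ids, no lock. */
		unsigned long seq = atomic_fetch_add(&seqnum, 1) + 1;

		printf("%s: sample #%lu\n", (const char *)name, seq);
	}
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, sampler, "cpu0");
	pthread_create(&b, NULL, sampler, "cpu1");
	pthread_join(a, NULL);
	pthread_join(b, NULL);
	return 0;
}

Build with -pthread; the sequence numbers stay unique and gap-free however the two threads interleave.
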
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 9953506370a5..061658518605 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -2048,6 +2048,10 @@ static __init int init_kprobe_trace(void)
trace_create_file("kprobe_profile", TRACE_MODE_READ,
NULL, NULL, &kprobe_profile_ops);
+ /* If no 'kprobe_event=' cmd is provided, return directly. */
+ if (kprobe_boot_events_buf[0] == '\0')
+ return 0;
+
setup_boot_kprobe_events();
return 0;
@@ -2079,7 +2083,7 @@ static __init int kprobe_trace_self_tests_init(void)
struct trace_kprobe *tk;
struct trace_event_file *file;
- if (tracing_is_disabled())
+ if (unlikely(tracing_disabled))
return -ENODEV;
if (tracing_selftest_disabled)
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index cc2d3306bb60..1996d7aba038 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -194,13 +194,37 @@ trace_print_symbols_seq_u64(struct trace_seq *p, unsigned long long val,
EXPORT_SYMBOL(trace_print_symbols_seq_u64);
#endif
+/**
+ * trace_print_bitmask_seq - print a bitmask to a sequence buffer
+ * @iter: The trace iterator for the current event instance
+ * @bitmask_ptr: The pointer to the bitmask data
+ * @bitmask_size: The size of the bitmask in bytes
+ *
+ * Prints a bitmask into a sequence buffer as either a hex string or a
+ * human-readable range list, depending on the instance's "bitmask-list"
+ * trace option. The bitmask is formatted into the iterator's temporary
+ * scratchpad rather than the primary sequence buffer. This avoids
+ * duplication and pointer-collision issues when the returned string is
+ * processed by a "%s" specifier in a TP_printk() macro.
+ *
+ * Returns a pointer to the formatted string within the temporary buffer.
+ */
const char *
-trace_print_bitmask_seq(struct trace_seq *p, void *bitmask_ptr,
+trace_print_bitmask_seq(struct trace_iterator *iter, void *bitmask_ptr,
unsigned int bitmask_size)
{
- const char *ret = trace_seq_buffer_ptr(p);
+ struct trace_seq *p = &iter->tmp_seq;
+ const struct trace_array *tr = iter->tr;
+ const char *ret;
+
+ trace_seq_init(p);
+ ret = trace_seq_buffer_ptr(p);
+
+ if (tr->trace_flags & TRACE_ITER(BITMASK_LIST))
+ trace_seq_bitmask_list(p, bitmask_ptr, bitmask_size * 8);
+ else
+ trace_seq_bitmask(p, bitmask_ptr, bitmask_size * 8);
- trace_seq_bitmask(p, bitmask_ptr, bitmask_size * 8);
trace_seq_putc(p, 0);
return ret;
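
With the new bitmask-list option, an event's bitmask field can be rendered either as the usual hex words ("%*pb") or as a cpulist-style range list ("%*pbl", e.g. 0-3,5). Those printk extensions only exist in the kernel, so the sketch below re-implements the range-list rendering for a plain unsigned long to show what the option changes (the mask value is illustrative):

#include <stdio.h>

/* Print the set bits of @mask as a range list such as "0-3,5,9-10". */
static void print_bit_list(unsigned long mask)
{
	const int nbits = 8 * sizeof(mask);
	int first = 1;

	for (int bit = 0; bit < nbits; bit++) {
		if (!(mask & (1UL << bit)))
			continue;

		int start = bit;

		while (bit + 1 < nbits && (mask & (1UL << (bit + 1))))
			bit++;

		printf("%s%d", first ? "" : ",", start);
		if (bit > start)
			printf("-%d", bit);
		first = 0;
	}
	printf("\n");
}

int main(void)
{
	unsigned long mask = 0x62f;	/* bits 0-3, 5 and 9-10 set */

	printf("%lx\n", mask);		/* roughly what "%*pb" shows  */
	print_bit_list(mask);		/* roughly what "%*pbl" shows */
	return 0;
}
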
diff --git a/kernel/trace/trace_pid.c b/kernel/trace/trace_pid.c
new file mode 100644
index 000000000000..7127c8de4174
--- /dev/null
+++ b/kernel/trace/trace_pid.c
@@ -0,0 +1,246 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "trace.h"
+
+/**
+ * trace_find_filtered_pid - check if a pid exists in a filtered_pid list
+ * @filtered_pids: The list of pids to check
+ * @search_pid: The PID to find in @filtered_pids
+ *
+ * Returns true if @search_pid is found in @filtered_pids, and false otherwise.
+ */
+bool
+trace_find_filtered_pid(struct trace_pid_list *filtered_pids, pid_t search_pid)
+{
+ return trace_pid_list_is_set(filtered_pids, search_pid);
+}
+
+/**
+ * trace_ignore_this_task - should a task be ignored for tracing
+ * @filtered_pids: The list of pids to check
+ * @filtered_no_pids: The list of pids not to be traced
+ * @task: The task that should be ignored if not filtered
+ *
+ * Checks if @task should be traced or not from @filtered_pids.
+ * Returns true if @task should *NOT* be traced.
+ * Returns false if @task should be traced.
+ */
+bool
+trace_ignore_this_task(struct trace_pid_list *filtered_pids,
+ struct trace_pid_list *filtered_no_pids,
+ struct task_struct *task)
+{
+ /*
+ * If filtered_no_pids is not empty, and the task's pid is listed
+ * in filtered_no_pids, then return true.
+ * Otherwise, if filtered_pids is empty, that means we can
+ * trace all tasks. If it has content, then only trace pids
+ * within filtered_pids.
+ */
+
+ return (filtered_pids &&
+ !trace_find_filtered_pid(filtered_pids, task->pid)) ||
+ (filtered_no_pids &&
+ trace_find_filtered_pid(filtered_no_pids, task->pid));
+}
+
+/**
+ * trace_filter_add_remove_task - Add or remove a task from a pid_list
+ * @pid_list: The list to modify
+ * @self: The current task for fork or NULL for exit
+ * @task: The task to add or remove
+ *
+ * If adding a task, if @self is defined, the task is only added if @self
+ * is also included in @pid_list. This happens on fork and tasks should
+ * only be added when the parent is listed. If @self is NULL, then the
+ * @task pid will be removed from the list, which would happen on exit
+ * of a task.
+ */
+void trace_filter_add_remove_task(struct trace_pid_list *pid_list,
+ struct task_struct *self,
+ struct task_struct *task)
+{
+ if (!pid_list)
+ return;
+
+ /* For forks, we only add if the forking task is listed */
+ if (self) {
+ if (!trace_find_filtered_pid(pid_list, self->pid))
+ return;
+ }
+
+ /* "self" is set for forks, and NULL for exits */
+ if (self)
+ trace_pid_list_set(pid_list, task->pid);
+ else
+ trace_pid_list_clear(pid_list, task->pid);
+}
+
+/**
+ * trace_pid_next - Used for seq_file to get to the next pid of a pid_list
+ * @pid_list: The pid list to show
+ * @v: The last pid that was shown (+1 the actual pid to let zero be displayed)
+ * @pos: The position of the file
+ *
+ * This is used by the seq_file "next" operation to iterate the pids
+ * listed in a trace_pid_list structure.
+ *
+ * Returns the pid+1 as we want to display pid of zero, but NULL would
+ * stop the iteration.
+ */
+void *trace_pid_next(struct trace_pid_list *pid_list, void *v, loff_t *pos)
+{
+ long pid = (unsigned long)v;
+ unsigned int next;
+
+ (*pos)++;
+
+ /* pid already is +1 of the actual previous bit */
+ if (trace_pid_list_next(pid_list, pid, &next) < 0)
+ return NULL;
+
+ pid = next;
+
+ /* Return pid + 1 to allow zero to be represented */
+ return (void *)(pid + 1);
+}
+
+/**
+ * trace_pid_start - Used for seq_file to start reading pid lists
+ * @pid_list: The pid list to show
+ * @pos: The position of the file
+ *
+ * This is used by seq_file "start" operation to start the iteration
+ * of listing pids.
+ *
+ * Returns the pid+1 as we want to display pid of zero, but NULL would
+ * stop the iteration.
+ */
+void *trace_pid_start(struct trace_pid_list *pid_list, loff_t *pos)
+{
+ unsigned long pid;
+ unsigned int first;
+ loff_t l = 0;
+
+ if (trace_pid_list_first(pid_list, &first) < 0)
+ return NULL;
+
+ pid = first;
+
+ /* Return pid + 1 so that zero can be the exit value */
+ for (pid++; pid && l < *pos;
+ pid = (unsigned long)trace_pid_next(pid_list, (void *)pid, &l))
+ ;
+ return (void *)pid;
+}
+
+/**
+ * trace_pid_show - show the current pid in seq_file processing
+ * @m: The seq_file structure to write into
+ * @v: A void pointer of the pid (+1) value to display
+ *
+ * Can be directly used by seq_file operations to display the current
+ * pid value.
+ */
+int trace_pid_show(struct seq_file *m, void *v)
+{
+ unsigned long pid = (unsigned long)v - 1;
+
+ seq_printf(m, "%lu\n", pid);
+ return 0;
+}
+
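
trace_pid_start() and trace_pid_next() above hand the seq_file core the pid biased by one because a NULL (zero) cookie means the iteration is finished, while pid 0 is a real, displayable pid. A tiny stand-alone sketch of that encoding (the pid array is made up):

#include <stdio.h>

static const int pids[] = { 0, 7, 42 };
#define NPIDS (sizeof(pids) / sizeof(pids[0]))

/* Return the next cookie, or NULL when the iteration is done. */
static void *pid_next(void *v)
{
	unsigned long idx = (unsigned long)v;	/* 0 means "not started" */

	if (idx >= NPIDS)
		return NULL;
	/* Bias by one so the first valid element never looks like NULL. */
	return (void *)(idx + 1);
}

int main(void)
{
	for (void *v = pid_next(NULL); v; v = pid_next(v))
		printf("%d\n", pids[(unsigned long)v - 1]);
	return 0;
}

Without the bias, the cookie for the first entry would be indistinguishable from the end-of-list NULL, which is the same problem the kernel code solves for pid 0.
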
+/* 128 should be much more than enough */
+#define PID_BUF_SIZE 127
+
+int trace_pid_write(struct trace_pid_list *filtered_pids,
+ struct trace_pid_list **new_pid_list,
+ const char __user *ubuf, size_t cnt)
+{
+ struct trace_pid_list *pid_list;
+ struct trace_parser parser;
+ unsigned long val;
+ int nr_pids = 0;
+ ssize_t read = 0;
+ ssize_t ret;
+ loff_t pos;
+ pid_t pid;
+
+ if (trace_parser_get_init(&parser, PID_BUF_SIZE + 1))
+ return -ENOMEM;
+
+ /*
+ * Always recreate a new array. The write is an all or nothing
+ * operation. Always create a new array when adding new pids by
+ * the user. If the operation fails, then the current list is
+ * not modified.
+ */
+ pid_list = trace_pid_list_alloc();
+ if (!pid_list) {
+ trace_parser_put(&parser);
+ return -ENOMEM;
+ }
+
+ if (filtered_pids) {
+ /* copy the current bits to the new max */
+ ret = trace_pid_list_first(filtered_pids, &pid);
+ while (!ret) {
+ ret = trace_pid_list_set(pid_list, pid);
+ if (ret < 0)
+ goto out;
+
+ ret = trace_pid_list_next(filtered_pids, pid + 1, &pid);
+ nr_pids++;
+ }
+ }
+
+ ret = 0;
+ while (cnt > 0) {
+
+ pos = 0;
+
+ ret = trace_get_user(&parser, ubuf, cnt, &pos);
+ if (ret < 0)
+ break;
+
+ read += ret;
+ ubuf += ret;
+ cnt -= ret;
+
+ if (!trace_parser_loaded(&parser))
+ break;
+
+ ret = -EINVAL;
+ if (kstrtoul(parser.buffer, 0, &val))
+ break;
+
+ pid = (pid_t)val;
+
+ if (trace_pid_list_set(pid_list, pid) < 0) {
+ ret = -1;
+ break;
+ }
+ nr_pids++;
+
+ trace_parser_clear(&parser);
+ ret = 0;
+ }
+ out:
+ trace_parser_put(&parser);
+
+ if (ret < 0) {
+ trace_pid_list_free(pid_list);
+ return ret;
+ }
+
+ if (!nr_pids) {
+ /* Cleared the list of pids */
+ trace_pid_list_free(pid_list);
+ pid_list = NULL;
+ }
+
+ *new_pid_list = pid_list;
+
+ return read;
+}
+
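
trace_pid_write() never edits the live pid list: it copies the current bits into a freshly allocated list, parses the user input into that copy, and only publishes the new list if every step succeeded (an empty result frees it and publishes NULL). A sketch of that all-or-nothing update for a simple integer set, with the parsing and set type as simplified stand-ins:

#define _POSIX_C_SOURCE 200809L
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct pid_set { int *pids; size_t n; };

/* Build a replacement set from @old plus the pids listed in @input. */
static struct pid_set *pid_set_update(const struct pid_set *old,
				      const char *input)
{
	struct pid_set *new = calloc(1, sizeof(*new));
	char *buf = strdup(input), *tok, *save;

	if (!new || !buf)
		goto fail;

	/* Copy the current entries first, as trace_pid_write() does. */
	if (old && old->n) {
		new->pids = malloc(old->n * sizeof(int));
		if (!new->pids)
			goto fail;
		memcpy(new->pids, old->pids, old->n * sizeof(int));
		new->n = old->n;
	}

	for (tok = strtok_r(buf, " \n", &save); tok;
	     tok = strtok_r(NULL, " \n", &save)) {
		int *tmp = realloc(new->pids, (new->n + 1) * sizeof(int));

		if (!tmp)
			goto fail;
		new->pids = tmp;
		new->pids[new->n++] = atoi(tok);
	}
	free(buf);
	return new;			/* publish only on full success */
fail:
	free(buf);
	if (new)
		free(new->pids);
	free(new);
	return NULL;			/* the old set stays untouched */
}

int main(void)
{
	struct pid_set *set = pid_set_update(NULL, "1 2 42");

	for (size_t i = 0; set && i < set->n; i++)
		printf("%d\n", set->pids[i]);
	return 0;
}
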
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c
index 29f6e95439b6..6a29e4350b55 100644
--- a/kernel/trace/trace_printk.c
+++ b/kernel/trace/trace_printk.c
@@ -376,6 +376,436 @@ static const struct file_operations ftrace_formats_fops = {
.release = seq_release,
};
+static __always_inline bool printk_binsafe(struct trace_array *tr)
+{
+ /*
+ * The binary format of traceprintk can cause a crash if used
+ * by a buffer from another boot. Force the use of the
+ * non binary version of trace_printk if the trace_printk
+ * buffer is a boot mapped ring buffer.
+ */
+ return !(tr->flags & TRACE_ARRAY_FL_BOOT);
+}
+
+int __trace_array_puts(struct trace_array *tr, unsigned long ip,
+ const char *str, int size)
+{
+ struct ring_buffer_event *event;
+ struct trace_buffer *buffer;
+ struct print_entry *entry;
+ unsigned int trace_ctx;
+ int alloc;
+
+ if (!(tr->trace_flags & TRACE_ITER(PRINTK)))
+ return 0;
+
+ if (unlikely(tracing_selftest_running &&
+ (tr->flags & TRACE_ARRAY_FL_GLOBAL)))
+ return 0;
+
+ if (unlikely(tracing_disabled))
+ return 0;
+
+ alloc = sizeof(*entry) + size + 2; /* possible \n added */
+
+ trace_ctx = tracing_gen_ctx();
+ buffer = tr->array_buffer.buffer;
+ guard(ring_buffer_nest)(buffer);
+ event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, alloc,
+ trace_ctx);
+ if (!event)
+ return 0;
+
+ entry = ring_buffer_event_data(event);
+ entry->ip = ip;
+
+ memcpy(&entry->buf, str, size);
+
+ /* Add a newline if necessary */
+ if (entry->buf[size - 1] != '\n') {
+ entry->buf[size] = '\n';
+ entry->buf[size + 1] = '\0';
+ } else
+ entry->buf[size] = '\0';
+
+ __buffer_unlock_commit(buffer, event);
+ ftrace_trace_stack(tr, buffer, trace_ctx, 4, NULL);
+ return size;
+}
+EXPORT_SYMBOL_GPL(__trace_array_puts);
+
+/**
+ * __trace_puts - write a constant string into the trace buffer.
+ * @ip: The address of the caller
+ * @str: The constant string to write
+ */
+int __trace_puts(unsigned long ip, const char *str)
+{
+ return __trace_array_puts(printk_trace, ip, str, strlen(str));
+}
+EXPORT_SYMBOL_GPL(__trace_puts);
+
+/**
+ * __trace_bputs - write the pointer to a constant string into trace buffer
+ * @ip: The address of the caller
+ * @str: The constant string to write to the buffer to
+ */
+int __trace_bputs(unsigned long ip, const char *str)
+{
+ struct trace_array *tr = READ_ONCE(printk_trace);
+ struct ring_buffer_event *event;
+ struct trace_buffer *buffer;
+ struct bputs_entry *entry;
+ unsigned int trace_ctx;
+ int size = sizeof(struct bputs_entry);
+
+ if (!printk_binsafe(tr))
+ return __trace_puts(ip, str);
+
+ if (!(tr->trace_flags & TRACE_ITER(PRINTK)))
+ return 0;
+
+ if (unlikely(tracing_selftest_running || tracing_disabled))
+ return 0;
+
+ trace_ctx = tracing_gen_ctx();
+ buffer = tr->array_buffer.buffer;
+
+ guard(ring_buffer_nest)(buffer);
+ event = __trace_buffer_lock_reserve(buffer, TRACE_BPUTS, size,
+ trace_ctx);
+ if (!event)
+ return 0;
+
+ entry = ring_buffer_event_data(event);
+ entry->ip = ip;
+ entry->str = str;
+
+ __buffer_unlock_commit(buffer, event);
+ ftrace_trace_stack(tr, buffer, trace_ctx, 4, NULL);
+
+ return 1;
+}
+EXPORT_SYMBOL_GPL(__trace_bputs);
+
+/* created for use with alloc_percpu */
+struct trace_buffer_struct {
+ int nesting;
+ char buffer[4][TRACE_BUF_SIZE];
+};
+
+static struct trace_buffer_struct __percpu *trace_percpu_buffer;
+
+/*
+ * This allows for lockless recording. If we're nested too deeply, then
+ * this returns NULL.
+ */
+static char *get_trace_buf(void)
+{
+ struct trace_buffer_struct *buffer = this_cpu_ptr(trace_percpu_buffer);
+
+ if (!trace_percpu_buffer || buffer->nesting >= 4)
+ return NULL;
+
+ buffer->nesting++;
+
+ /* Interrupts must see nesting incremented before we use the buffer */
+ barrier();
+ return &buffer->buffer[buffer->nesting - 1][0];
+}
+
+static void put_trace_buf(void)
+{
+ /* Don't let the decrement of nesting leak before this */
+ barrier();
+ this_cpu_dec(trace_percpu_buffer->nesting);
+}
+
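
get_trace_buf() and put_trace_buf() above hand out one of four per-CPU scratch lines indexed by a nesting counter, so an interrupt (or NMI) that fires in the middle of a trace_printk() gets its own line instead of scribbling over the one already in use. A single-threaded sketch of the nesting discipline; the depth of four matches the patch, but the compiler barrier the kernel needs between the increment and the use is omitted because there is no concurrency here:

#include <stdio.h>

#define MAX_NEST	4
#define LINE_SIZE	64

static struct {
	int nesting;
	char line[MAX_NEST][LINE_SIZE];
} scratch;

static char *get_buf(void)
{
	if (scratch.nesting >= MAX_NEST)
		return NULL;	/* nested too deeply: drop the message */
	return scratch.line[scratch.nesting++];
}

static void put_buf(void)
{
	scratch.nesting--;
}

/* Pretend an "interrupt" reuses the facility while a buffer is held. */
static void fake_irq(void)
{
	char *buf = get_buf();

	if (buf) {
		snprintf(buf, LINE_SIZE, "from irq");
		printf("level %d: %s\n", scratch.nesting - 1, buf);
		put_buf();
	}
}

int main(void)
{
	char *buf = get_buf();

	snprintf(buf, LINE_SIZE, "from task");
	fake_irq();		/* nests inside the held buffer */
	printf("level %d: %s\n", scratch.nesting - 1, buf);
	put_buf();
	return 0;
}
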
+static int alloc_percpu_trace_buffer(void)
+{
+ struct trace_buffer_struct __percpu *buffers;
+
+ if (trace_percpu_buffer)
+ return 0;
+
+ buffers = alloc_percpu(struct trace_buffer_struct);
+ if (MEM_FAIL(!buffers, "Could not allocate percpu trace_printk buffer"))
+ return -ENOMEM;
+
+ trace_percpu_buffer = buffers;
+ return 0;
+}
+
+static int buffers_allocated;
+
+void trace_printk_init_buffers(void)
+{
+ if (buffers_allocated)
+ return;
+
+ if (alloc_percpu_trace_buffer())
+ return;
+
+ /* trace_printk() is for debug use only. Don't use it in production. */
+
+ pr_warn("\n");
+ pr_warn("**********************************************************\n");
+ pr_warn("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n");
+ pr_warn("** **\n");
+ pr_warn("** trace_printk() being used. Allocating extra memory. **\n");
+ pr_warn("** **\n");
+ pr_warn("** This means that this is a DEBUG kernel and it is **\n");
+ pr_warn("** unsafe for production use. **\n");
+ pr_warn("** **\n");
+ pr_warn("** If you see this message and you are not debugging **\n");
+ pr_warn("** the kernel, report this immediately to your vendor! **\n");
+ pr_warn("** **\n");
+ pr_warn("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n");
+ pr_warn("**********************************************************\n");
+
+ /* Expand the buffers to set size */
+ if (tracing_update_buffers(NULL) < 0)
+ pr_err("Failed to expand tracing buffers for trace_printk() calls\n");
+ else
+ buffers_allocated = 1;
+
+ /*
+ * trace_printk_init_buffers() can be called by modules.
+ * If that happens, then we need to start cmdline recording
+ * directly here.
+ */
+ if (system_state == SYSTEM_RUNNING)
+ tracing_start_cmdline_record();
+}
+EXPORT_SYMBOL_GPL(trace_printk_init_buffers);
+
+void trace_printk_start_comm(void)
+{
+ /* Start tracing comms if trace printk is set */
+ if (!buffers_allocated)
+ return;
+ tracing_start_cmdline_record();
+}
+
+void trace_printk_start_stop_comm(int enabled)
+{
+ if (!buffers_allocated)
+ return;
+
+ if (enabled)
+ tracing_start_cmdline_record();
+ else
+ tracing_stop_cmdline_record();
+}
+
+/**
+ * trace_vbprintk - write binary msg to tracing buffer
+ * @ip: The address of the caller
+ * @fmt: The string format to write to the buffer
+ * @args: Arguments for @fmt
+ */
+int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
+{
+ struct ring_buffer_event *event;
+ struct trace_buffer *buffer;
+ struct trace_array *tr = READ_ONCE(printk_trace);
+ struct bprint_entry *entry;
+ unsigned int trace_ctx;
+ char *tbuffer;
+ int len = 0, size;
+
+ if (!printk_binsafe(tr))
+ return trace_vprintk(ip, fmt, args);
+
+ if (unlikely(tracing_selftest_running || tracing_disabled))
+ return 0;
+
+ /* Don't pollute graph traces with trace_vprintk internals */
+ pause_graph_tracing();
+
+ trace_ctx = tracing_gen_ctx();
+ guard(preempt_notrace)();
+
+ tbuffer = get_trace_buf();
+ if (!tbuffer) {
+ len = 0;
+ goto out_nobuffer;
+ }
+
+ len = vbin_printf((u32 *)tbuffer, TRACE_BUF_SIZE/sizeof(int), fmt, args);
+
+ if (len > TRACE_BUF_SIZE/sizeof(int) || len < 0)
+ goto out_put;
+
+ size = sizeof(*entry) + sizeof(u32) * len;
+ buffer = tr->array_buffer.buffer;
+ scoped_guard(ring_buffer_nest, buffer) {
+ event = __trace_buffer_lock_reserve(buffer, TRACE_BPRINT, size,
+ trace_ctx);
+ if (!event)
+ goto out_put;
+ entry = ring_buffer_event_data(event);
+ entry->ip = ip;
+ entry->fmt = fmt;
+
+ memcpy(entry->buf, tbuffer, sizeof(u32) * len);
+ __buffer_unlock_commit(buffer, event);
+ ftrace_trace_stack(tr, buffer, trace_ctx, 6, NULL);
+ }
+out_put:
+ put_trace_buf();
+
+out_nobuffer:
+ unpause_graph_tracing();
+
+ return len;
+}
+EXPORT_SYMBOL_GPL(trace_vbprintk);
+
+static __printf(3, 0)
+int __trace_array_vprintk(struct trace_buffer *buffer,
+ unsigned long ip, const char *fmt, va_list args)
+{
+ struct ring_buffer_event *event;
+ int len = 0, size;
+ struct print_entry *entry;
+ unsigned int trace_ctx;
+ char *tbuffer;
+
+ if (unlikely(tracing_disabled))
+ return 0;
+
+ /* Don't pollute graph traces with trace_vprintk internals */
+ pause_graph_tracing();
+
+ trace_ctx = tracing_gen_ctx();
+ guard(preempt_notrace)();
+
+
+ tbuffer = get_trace_buf();
+ if (!tbuffer) {
+ len = 0;
+ goto out_nobuffer;
+ }
+
+ len = vscnprintf(tbuffer, TRACE_BUF_SIZE, fmt, args);
+
+ size = sizeof(*entry) + len + 1;
+ scoped_guard(ring_buffer_nest, buffer) {
+ event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
+ trace_ctx);
+ if (!event)
+ goto out;
+ entry = ring_buffer_event_data(event);
+ entry->ip = ip;
+
+ memcpy(&entry->buf, tbuffer, len + 1);
+ __buffer_unlock_commit(buffer, event);
+ ftrace_trace_stack(printk_trace, buffer, trace_ctx, 6, NULL);
+ }
+out:
+ put_trace_buf();
+
+out_nobuffer:
+ unpause_graph_tracing();
+
+ return len;
+}
+
+int trace_array_vprintk(struct trace_array *tr,
+ unsigned long ip, const char *fmt, va_list args)
+{
+ if (tracing_selftest_running && (tr->flags & TRACE_ARRAY_FL_GLOBAL))
+ return 0;
+
+ return __trace_array_vprintk(tr->array_buffer.buffer, ip, fmt, args);
+}
+
+/**
+ * trace_array_printk - Print a message to a specific instance
+ * @tr: The instance trace_array descriptor
+ * @ip: The instruction pointer that this is called from.
+ * @fmt: The format to print (printf format)
+ *
+ * If a subsystem sets up its own instance, it has the right to
+ * printk strings into their tracing instance buffer using this
+ * function. Note, this function will not write into the top level
+ * buffer (use trace_printk() for that), as writing into the top level
+ * buffer should only have events that can be individually disabled.
+ * trace_printk() is only used for debugging a kernel, and should never
+ * be incorporated in normal use.
+ *
+ * trace_array_printk() can be used, as it will not add noise to the
+ * top level tracing buffer.
+ *
+ * Note, trace_array_init_printk() must be called on @tr before this
+ * can be used.
+ */
+int trace_array_printk(struct trace_array *tr,
+ unsigned long ip, const char *fmt, ...)
+{
+ int ret;
+ va_list ap;
+
+ if (!tr)
+ return -ENOENT;
+
+ /* This is only allowed for created instances */
+ if (tr->flags & TRACE_ARRAY_FL_GLOBAL)
+ return 0;
+
+ if (!(tr->trace_flags & TRACE_ITER(PRINTK)))
+ return 0;
+
+ va_start(ap, fmt);
+ ret = trace_array_vprintk(tr, ip, fmt, ap);
+ va_end(ap);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(trace_array_printk);
+
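
The comment block above spells out the contract: trace_array_printk() only writes into instances a subsystem created itself, and trace_array_init_printk() must run once per instance first. A minimal module-style sketch of that flow; the instance name is made up, and trace_array_get_by_name() is assumed to have the recent two-argument form (older kernels take only the name):

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/trace.h>

static struct trace_array *tr;

static int __init demo_init(void)
{
	/* Creates (or looks up) the "demo" instance under instances/. */
	tr = trace_array_get_by_name("demo", NULL);
	if (!tr)
		return -ENOMEM;

	if (trace_array_init_printk(tr)) {	/* allocate percpu buffers */
		trace_array_put(tr);
		return -ENOMEM;
	}

	trace_array_printk(tr, _THIS_IP_, "hello from %s\n", KBUILD_MODNAME);
	return 0;
}

static void __exit demo_exit(void)
{
	trace_array_put(tr);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");
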
+/**
+ * trace_array_init_printk - Initialize buffers for trace_array_printk()
+ * @tr: The trace array to initialize the buffers for
+ *
+ * As trace_array_printk() only writes into instances, they are OK to
+ * have in the kernel (unlike trace_printk()). This needs to be called
+ * before trace_array_printk() can be used on a trace_array.
+ */
+int trace_array_init_printk(struct trace_array *tr)
+{
+ if (!tr)
+ return -ENOENT;
+
+ /* This is only allowed for created instances */
+ if (tr->flags & TRACE_ARRAY_FL_GLOBAL)
+ return -EINVAL;
+
+ return alloc_percpu_trace_buffer();
+}
+EXPORT_SYMBOL_GPL(trace_array_init_printk);
+
+int trace_array_printk_buf(struct trace_buffer *buffer,
+ unsigned long ip, const char *fmt, ...)
+{
+ int ret;
+ va_list ap;
+
+ if (!(printk_trace->trace_flags & TRACE_ITER(PRINTK)))
+ return 0;
+
+ va_start(ap, fmt);
+ ret = __trace_array_vprintk(buffer, ip, fmt, ap);
+ va_end(ap);
+ return ret;
+}
+
+int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
+{
+ return trace_array_vprintk(printk_trace, ip, fmt, args);
+}
+EXPORT_SYMBOL_GPL(trace_vprintk);
+
static __init int init_trace_printk_function_export(void)
{
int ret;
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index d88c44f1dfa5..be53fe6fee6a 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -1225,7 +1225,7 @@ trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr)
/* check both trace buffers */
ret = trace_test_buffer(&tr->array_buffer, NULL);
if (!ret)
- ret = trace_test_buffer(&tr->max_buffer, &count);
+ ret = trace_test_buffer(&tr->snapshot_buffer, &count);
trace->reset(tr);
tracing_start();
@@ -1287,7 +1287,7 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr)
/* check both trace buffers */
ret = trace_test_buffer(&tr->array_buffer, NULL);
if (!ret)
- ret = trace_test_buffer(&tr->max_buffer, &count);
+ ret = trace_test_buffer(&tr->snapshot_buffer, &count);
trace->reset(tr);
tracing_start();
@@ -1355,7 +1355,7 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *
if (ret)
goto out;
- ret = trace_test_buffer(&tr->max_buffer, &count);
+ ret = trace_test_buffer(&tr->snapshot_buffer, &count);
if (ret)
goto out;
@@ -1385,7 +1385,7 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *
if (ret)
goto out;
- ret = trace_test_buffer(&tr->max_buffer, &count);
+ ret = trace_test_buffer(&tr->snapshot_buffer, &count);
if (!ret && !count) {
printk(KERN_CONT ".. no entries found ..");
@@ -1513,7 +1513,7 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
/* check both trace buffers */
ret = trace_test_buffer(&tr->array_buffer, NULL);
if (!ret)
- ret = trace_test_buffer(&tr->max_buffer, &count);
+ ret = trace_test_buffer(&tr->snapshot_buffer, &count);
trace->reset(tr);
diff --git a/kernel/trace/trace_seq.c b/kernel/trace/trace_seq.c
index 32684ef4fb9d..85f6f10d107f 100644
--- a/kernel/trace/trace_seq.c
+++ b/kernel/trace/trace_seq.c
@@ -106,7 +106,7 @@ EXPORT_SYMBOL_GPL(trace_seq_printf);
* Writes a ASCII representation of a bitmask string into @s.
*/
void trace_seq_bitmask(struct trace_seq *s, const unsigned long *maskp,
- int nmaskbits)
+ int nmaskbits)
{
unsigned int save_len = s->seq.len;
@@ -125,6 +125,33 @@ void trace_seq_bitmask(struct trace_seq *s, const unsigned long *maskp,
EXPORT_SYMBOL_GPL(trace_seq_bitmask);
/**
+ * trace_seq_bitmask_list - write a bitmask array in its list representation
+ * @s: trace sequence descriptor
+ * @maskp: points to an array of unsigned longs that represent a bitmask
+ * @nmaskbits: The number of bits that are valid in @maskp
+ *
+ * Writes a list representation (e.g., 0-3,5-7) of a bitmask string into @s.
+ */
+void trace_seq_bitmask_list(struct trace_seq *s, const unsigned long *maskp,
+ int nmaskbits)
+{
+ unsigned int save_len = s->seq.len;
+
+ if (s->full)
+ return;
+
+ __trace_seq_init(s);
+
+ seq_buf_printf(&s->seq, "%*pbl", nmaskbits, maskp);
+
+ if (unlikely(seq_buf_has_overflowed(&s->seq))) {
+ s->seq.len = save_len;
+ s->full = 1;
+ }
+}
+EXPORT_SYMBOL_GPL(trace_seq_bitmask_list);
+
+/**
* trace_seq_vprintf - sequence printing of trace information
* @s: trace sequence descriptor
* @fmt: printf format string
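For context on the new trace_seq_bitmask_list() export above, here is a sketch of how a print callback might emit a CPU mask both ways; the helper and its caller are illustrative, not taken from this patch:

#include <linux/cpumask.h>
#include <linux/trace_seq.h>

/* Emit a CPU mask as hex (e.g. "f") and as a range list (e.g. "0-3"). */
static void print_cpus(struct trace_seq *s, const struct cpumask *mask)
{
	trace_seq_puts(s, "cpus=");
	trace_seq_bitmask(s, cpumask_bits(mask), nr_cpu_ids);
	trace_seq_puts(s, " cpus_list=");
	trace_seq_bitmask_list(s, cpumask_bits(mask), nr_cpu_ids);
	trace_seq_putc(s, '\n');
}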
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index 62719d2941c9..fd2ee879815c 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -34,9 +34,13 @@ enum tp_transition_sync {
struct tp_transition_snapshot {
unsigned long rcu;
+ unsigned long srcu_gp;
bool ongoing;
};
+DEFINE_SRCU_FAST(tracepoint_srcu);
+EXPORT_SYMBOL_GPL(tracepoint_srcu);
+
/* Protected by tracepoints_mutex */
static struct tp_transition_snapshot tp_transition_snapshot[_NR_TP_TRANSITION_SYNC];
@@ -46,6 +50,7 @@ static void tp_rcu_get_state(enum tp_transition_sync sync)
/* Keep the latest get_state snapshot. */
snapshot->rcu = get_state_synchronize_rcu();
+ snapshot->srcu_gp = start_poll_synchronize_srcu(&tracepoint_srcu);
snapshot->ongoing = true;
}
@@ -56,6 +61,8 @@ static void tp_rcu_cond_sync(enum tp_transition_sync sync)
if (!snapshot->ongoing)
return;
cond_synchronize_rcu(snapshot->rcu);
+ if (!poll_state_synchronize_srcu(&tracepoint_srcu, snapshot->srcu_gp))
+ synchronize_srcu(&tracepoint_srcu);
snapshot->ongoing = false;
}
@@ -112,10 +119,13 @@ static inline void release_probes(struct tracepoint *tp, struct tracepoint_func
struct tp_probes *tp_probes = container_of(old,
struct tp_probes, probes[0]);
- if (tracepoint_is_faultable(tp))
- call_rcu_tasks_trace(&tp_probes->rcu, rcu_free_old_probes);
- else
- call_rcu(&tp_probes->rcu, rcu_free_old_probes);
+ if (tracepoint_is_faultable(tp)) {
+ call_rcu_tasks_trace(&tp_probes->rcu,
+ rcu_free_old_probes);
+ } else {
+ call_srcu(&tracepoint_srcu, &tp_probes->rcu,
+ rcu_free_old_probes);
+ }
}
}
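The release_probes() change above defers freeing of the old probe array with call_srcu() so that in-flight tracepoint_srcu readers finish first. Below is a generic, self-contained illustration of that publish/defer-free pattern using the plain DEFINE_SRCU/srcu_read_lock() API and invented demo_* names; it is not the tracepoint reader itself (which uses the SRCU-fast variants):

#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/srcu.h>

DEFINE_SRCU(demo_srcu);

struct demo_probes {
	struct rcu_head rcu;
	void (*funcs[8])(void);
};

static struct demo_probes __rcu *demo_table;

static void demo_free_old(struct rcu_head *head)
{
	kfree(container_of(head, struct demo_probes, rcu));
}

/* Reader: walk the probe array inside an SRCU read-side section. */
static void demo_call_probes(void)
{
	struct demo_probes *p;
	int idx, i;

	idx = srcu_read_lock(&demo_srcu);
	p = srcu_dereference(demo_table, &demo_srcu);
	for (i = 0; p && i < 8 && p->funcs[i]; i++)
		p->funcs[i]();
	srcu_read_unlock(&demo_srcu, idx);
}

/* Updater: publish a new array, free the old one only after a grace period. */
static void demo_replace(struct demo_probes *new)
{
	struct demo_probes *old;

	old = rcu_replace_pointer(demo_table, new, true);
	if (old)
		call_srcu(&demo_srcu, &old->rcu, demo_free_old);
}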
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 6ea2f6363b90..5c153106e642 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -125,7 +125,7 @@ static void __acct_update_integrals(struct task_struct *tsk,
{
u64 time, delta;
- if (!likely(tsk->mm))
+ if (unlikely(!tsk->mm || (tsk->flags & PF_KTHREAD)))
return;
time = stime + utime;
diff --git a/kernel/ucount.c b/kernel/ucount.c
index 586af49fc03e..fc4a8f2d3096 100644
--- a/kernel/ucount.c
+++ b/kernel/ucount.c
@@ -47,7 +47,7 @@ static int set_permissions(struct ctl_table_header *head,
int mode;
/* Allow users with CAP_SYS_RESOURCE unrestrained access */
- if (ns_capable(user_ns, CAP_SYS_RESOURCE))
+ if (ns_capable_noaudit(user_ns, CAP_SYS_RESOURCE))
mode = (table->mode & S_IRWXU) >> 6;
else
/* Allow all others at most read-only access */
diff --git a/kernel/vmcore_info.c b/kernel/vmcore_info.c
index e2784038bbed..8d82913223a1 100644
--- a/kernel/vmcore_info.c
+++ b/kernel/vmcore_info.c
@@ -141,7 +141,9 @@ EXPORT_SYMBOL_GPL(hwerr_log_error_type);
static int __init crash_save_vmcoreinfo_init(void)
{
- vmcoreinfo_data = (unsigned char *)get_zeroed_page(GFP_KERNEL);
+ int order;
+ order = get_order(VMCOREINFO_BYTES);
+ vmcoreinfo_data = (unsigned char *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, order);
if (!vmcoreinfo_data) {
pr_warn("Memory allocation for vmcoreinfo_data failed\n");
return -ENOMEM;
@@ -150,7 +152,7 @@ static int __init crash_save_vmcoreinfo_init(void)
vmcoreinfo_note = alloc_pages_exact(VMCOREINFO_NOTE_SIZE,
GFP_KERNEL | __GFP_ZERO);
if (!vmcoreinfo_note) {
- free_page((unsigned long)vmcoreinfo_data);
+ free_pages((unsigned long)vmcoreinfo_data, order);
vmcoreinfo_data = NULL;
pr_warn("Memory allocation for vmcoreinfo_note failed\n");
return -ENOMEM;
@@ -242,7 +244,6 @@ static int __init crash_save_vmcoreinfo_init(void)
VMCOREINFO_SYMBOL(kallsyms_token_table);
VMCOREINFO_SYMBOL(kallsyms_token_index);
VMCOREINFO_SYMBOL(kallsyms_offsets);
- VMCOREINFO_SYMBOL(kallsyms_relative_base);
#endif /* CONFIG_KALLSYMS */
arch_crash_save_vmcoreinfo();
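The vmcoreinfo change above sizes the data buffer by page order instead of assuming VMCOREINFO_BYTES fits in one page. A small sketch of the get_order()/__get_free_pages()/free_pages() pairing it relies on (comment values assume 4 KiB pages; the helper names are made up):

#include <linux/gfp.h>
#include <linux/types.h>
#include <asm/page.h>

/* get_order() rounds a byte count up to a power-of-two number of pages. */
static void *alloc_zeroed_region(size_t bytes, unsigned int *order)
{
	*order = get_order(bytes);	/* 4096 -> 0, 8192 -> 1, 12288 -> 2 */
	return (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, *order);
}

static void free_region(void *p, unsigned int order)
{
	/* The same order must be passed back when freeing. */
	free_pages((unsigned long)p, order);
}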
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 366122f4a0f8..7d675781bc91 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -363,7 +363,7 @@ static struct cpumask watchdog_allowed_mask __read_mostly;
/* Global variables, exported for sysctl */
unsigned int __read_mostly softlockup_panic =
- IS_ENABLED(CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC);
+ CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC;
static bool softlockup_initialized __read_mostly;
static u64 __read_mostly sample_period;
@@ -550,7 +550,7 @@ static bool need_counting_irqs(void)
u8 util;
int tail = __this_cpu_read(cpustat_tail);
- tail = (tail + NUM_HARDIRQ_REPORT - 1) % NUM_HARDIRQ_REPORT;
+ tail = (tail + NUM_SAMPLE_PERIODS - 1) % NUM_SAMPLE_PERIODS;
util = __this_cpu_read(cpustat_util[tail][STATS_HARDIRQ]);
return util > HARDIRQ_PERCENT_THRESH;
}
@@ -774,8 +774,8 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
{
unsigned long touch_ts, period_ts, now;
struct pt_regs *regs = get_irq_regs();
- int duration;
int softlockup_all_cpu_backtrace;
+ int duration, thresh_count;
unsigned long flags;
if (!watchdog_enabled)
@@ -879,7 +879,9 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK);
sys_info(softlockup_si_mask & ~SYS_INFO_ALL_BT);
- if (softlockup_panic)
+ thresh_count = duration / get_softlockup_thresh();
+
+ if (softlockup_panic && thresh_count >= softlockup_panic)
panic("softlockup: hung tasks");
}
@@ -1228,7 +1230,7 @@ static const struct ctl_table watchdog_sysctls[] = {
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_ONE,
+ .extra2 = SYSCTL_INT_MAX,
},
{
.procname = "softlockup_sys_info",
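Rough arithmetic for the reworked softlockup panic condition above, sketched as a standalone helper; it assumes the usual get_softlockup_thresh() of twice watchdog_thresh, and the numbers are examples only:

#include <linux/types.h>

/*
 * With watchdog_thresh = 10, the softlockup threshold is presumably 20 s,
 * so setting the now integer-valued sysctl (e.g. kernel.softlockup_panic = 3)
 * delays the panic until a CPU has appeared stuck for roughly 60 seconds
 * rather than panicking on the first report.
 */
static bool would_panic(int duration, int softlockup_thresh, int softlockup_panic)
{
	int thresh_count = duration / softlockup_thresh;

	/* would_panic(63, 20, 3) -> true; would_panic(45, 20, 3) -> false */
	return softlockup_panic && thresh_count >= softlockup_panic;
}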
diff --git a/kernel/watchdog_perf.c b/kernel/watchdog_perf.c
index d3ca70e3c256..cf05775a96d3 100644
--- a/kernel/watchdog_perf.c
+++ b/kernel/watchdog_perf.c
@@ -118,18 +118,11 @@ static void watchdog_overflow_callback(struct perf_event *event,
watchdog_hardlockup_check(smp_processor_id(), regs);
}
-static int hardlockup_detector_event_create(void)
+static struct perf_event *hardlockup_detector_event_create(unsigned int cpu)
{
- unsigned int cpu;
struct perf_event_attr *wd_attr;
struct perf_event *evt;
- /*
- * Preemption is not disabled because memory will be allocated.
- * Ensure CPU-locality by calling this in per-CPU kthread.
- */
- WARN_ON(!is_percpu_thread());
- cpu = raw_smp_processor_id();
wd_attr = &wd_hw_attr;
wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh);
@@ -143,14 +136,7 @@ static int hardlockup_detector_event_create(void)
watchdog_overflow_callback, NULL);
}
- if (IS_ERR(evt)) {
- pr_debug("Perf event create on CPU %d failed with %ld\n", cpu,
- PTR_ERR(evt));
- return PTR_ERR(evt);
- }
- WARN_ONCE(this_cpu_read(watchdog_ev), "unexpected watchdog_ev leak");
- this_cpu_write(watchdog_ev, evt);
- return 0;
+ return evt;
}
/**
@@ -159,17 +145,26 @@ static int hardlockup_detector_event_create(void)
*/
void watchdog_hardlockup_enable(unsigned int cpu)
{
+ struct perf_event *evt;
+
WARN_ON_ONCE(cpu != smp_processor_id());
- if (hardlockup_detector_event_create())
+ evt = hardlockup_detector_event_create(cpu);
+ if (IS_ERR(evt)) {
+ pr_debug("Perf event create on CPU %d failed with %ld\n", cpu,
+ PTR_ERR(evt));
return;
+ }
/* use original value for check */
if (!atomic_fetch_inc(&watchdog_cpus))
pr_info("Enabled. Permanently consumes one hw-PMU counter.\n");
+ WARN_ONCE(this_cpu_read(watchdog_ev), "unexpected watchdog_ev leak");
+ this_cpu_write(watchdog_ev, evt);
+
watchdog_init_timestamp();
- perf_event_enable(this_cpu_read(watchdog_ev));
+ perf_event_enable(evt);
}
/**
@@ -263,19 +258,30 @@ bool __weak __init arch_perf_nmi_is_available(void)
*/
int __init watchdog_hardlockup_probe(void)
{
+ struct perf_event *evt;
+ unsigned int cpu;
int ret;
if (!arch_perf_nmi_is_available())
return -ENODEV;
- ret = hardlockup_detector_event_create();
+ if (!hw_nmi_get_sample_period(watchdog_thresh))
+ return -EINVAL;
- if (ret) {
+ /*
+ * Test hardware PMU availability by creating a temporary perf event.
+ * The event is released immediately.
+ */
+ cpu = raw_smp_processor_id();
+ evt = hardlockup_detector_event_create(cpu);
+ if (IS_ERR(evt)) {
pr_info("Perf NMI watchdog permanently disabled\n");
+ ret = PTR_ERR(evt);
} else {
- perf_event_release_kernel(this_cpu_read(watchdog_ev));
- this_cpu_write(watchdog_ev, NULL);
+ perf_event_release_kernel(evt);
+ ret = 0;
}
+
return ret;
}
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index eb5660013222..c515cff01828 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -117,6 +117,8 @@ enum wq_internal_consts {
MAYDAY_INTERVAL = HZ / 10, /* and then every 100ms */
CREATE_COOLDOWN = HZ, /* time to breath after fail */
+ RESCUER_BATCH = 16, /* process items per turn */
+
/*
* Rescue workers are used only on emergencies and shared by
* all cpus. Give MIN_NICE.
@@ -286,6 +288,7 @@ struct pool_workqueue {
struct list_head pending_node; /* LN: node on wq_node_nr_active->pending_pwqs */
struct list_head pwqs_node; /* WR: node on wq->pwqs */
struct list_head mayday_node; /* MD: node on wq->maydays */
+ struct work_struct mayday_cursor; /* L: cursor on pool->worklist */
u64 stats[PWQ_NR_STATS];
@@ -1120,6 +1123,12 @@ static struct worker *find_worker_executing_work(struct worker_pool *pool,
return NULL;
}
+static void mayday_cursor_func(struct work_struct *work)
+{
+ /* should not be processed, only for marking position */
+ BUG();
+}
+
/**
* move_linked_works - move linked works to a list
* @work: start of series of works to be scheduled
@@ -1182,6 +1191,16 @@ static bool assign_work(struct work_struct *work, struct worker *worker,
lockdep_assert_held(&pool->lock);
+ /* The cursor work should not be processed */
+ if (unlikely(work->func == mayday_cursor_func)) {
+ /* only worker_thread() can possibly take this branch */
+ WARN_ON_ONCE(worker->rescue_wq);
+ if (nextp)
+ *nextp = list_next_entry(work, entry);
+ list_del_init(&work->entry);
+ return false;
+ }
+
/*
* A single work shouldn't be executed concurrently by multiple workers.
* __queue_work() ensures that @work doesn't jump to a different pool
@@ -2976,9 +2995,8 @@ static void idle_cull_fn(struct work_struct *work)
reap_dying_workers(&cull_list);
}
-static void send_mayday(struct work_struct *work)
+static void send_mayday(struct pool_workqueue *pwq)
{
- struct pool_workqueue *pwq = get_work_pwq(work);
struct workqueue_struct *wq = pwq->wq;
lockdep_assert_held(&wq_mayday_lock);
@@ -3016,7 +3034,7 @@ static void pool_mayday_timeout(struct timer_list *t)
* rescuers.
*/
list_for_each_entry(work, &pool->worklist, entry)
- send_mayday(work);
+ send_mayday(get_work_pwq(work));
}
raw_spin_unlock(&wq_mayday_lock);
@@ -3440,22 +3458,57 @@ sleep:
static bool assign_rescuer_work(struct pool_workqueue *pwq, struct worker *rescuer)
{
struct worker_pool *pool = pwq->pool;
+ struct work_struct *cursor = &pwq->mayday_cursor;
struct work_struct *work, *n;
- /* need rescue? */
- if (!pwq->nr_active || !need_to_create_worker(pool))
+ /* have work items to rescue? */
+ if (!pwq->nr_active)
return false;
- /*
- * Slurp in all works issued via this workqueue and
- * process'em.
- */
- list_for_each_entry_safe(work, n, &pool->worklist, entry) {
- if (get_work_pwq(work) == pwq && assign_work(work, rescuer, &n))
+ /* need rescue? */
+ if (!need_to_create_worker(pool)) {
+ /*
+ * The pool has idle workers and doesn't need the rescuer, so it
+ * could simply return false here.
+ *
+ * However, the memory pressure might not be fully relieved.
+ * In PERCPU pool with concurrency enabled, having idle workers
+ * does not necessarily mean memory pressure is gone; it may
+ * simply mean regular workers have woken up, completed their
+ * work, and gone idle again due to concurrency limits.
+ *
+ * In this case, those working workers may later sleep again,
+ * the pool may run out of idle workers, and it will have to
+ * allocate new ones and wait for the timer to send mayday,
+ * causing unnecessary delay - especially if memory pressure
+ * was never resolved throughout.
+ *
+ * To reduce such relapses, keep rescuing while memory pressure
+ * still appears to be on, using (pool->flags & POOL_MANAGER_ACTIVE)
+ * as a rough signal, unless other PWQs are waiting for help.
+ */
+ if (!(pool->flags & POOL_MANAGER_ACTIVE) ||
+ !list_empty(&pwq->wq->maydays))
+ return false;
+ }
+
+ /* search from the start or cursor if available */
+ if (list_empty(&cursor->entry))
+ work = list_first_entry(&pool->worklist, struct work_struct, entry);
+ else
+ work = list_next_entry(cursor, entry);
+
+ /* find the next work item to rescue */
+ list_for_each_entry_safe_from(work, n, &pool->worklist, entry) {
+ if (get_work_pwq(work) == pwq && assign_work(work, rescuer, &n)) {
pwq->stats[PWQ_STAT_RESCUED]++;
+ /* put the cursor for next search */
+ list_move_tail(&cursor->entry, &n->entry);
+ return true;
+ }
}
- return !list_empty(&rescuer->scheduled);
+ return false;
}
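The mayday cursor used above is a bookmark: a dummy node parked on pool->worklist so the next batch resumes where the previous one stopped, even if the list changed in between. A generic, self-contained sketch of that pattern with invented names (plain list_head code, not the workqueue implementation):

#include <linux/list.h>

struct item {
	struct list_head entry;
	bool is_bookmark;	/* the caller sets this on its cursor node */
};

/* Handle up to @batch real items, leaving @cursor after the last one. */
static int process_batch(struct list_head *list, struct item *cursor, int batch)
{
	struct item *it, *n;
	int done = 0;

	if (list_empty(list))
		return 0;

	/* Resume from the bookmark if it is on the list, else from the head. */
	if (list_empty(&cursor->entry))
		it = list_first_entry(list, struct item, entry);
	else
		it = list_next_entry(cursor, entry);

	list_for_each_entry_safe_from(it, n, list, entry) {
		if (it->is_bookmark)
			continue;
		/* ... do the real work on @it here ... */
		if (++done == batch) {
			/* Park the bookmark right after the handled item. */
			list_move(&cursor->entry, &it->entry);
			return done;
		}
	}

	/* Reached the end of the list; drop the bookmark. */
	list_del_init(&cursor->entry);
	return done;
}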
/**
@@ -3512,6 +3565,7 @@ repeat:
struct pool_workqueue *pwq = list_first_entry(&wq->maydays,
struct pool_workqueue, mayday_node);
struct worker_pool *pool = pwq->pool;
+ unsigned int count = 0;
__set_current_state(TASK_RUNNING);
list_del_init(&pwq->mayday_node);
@@ -3524,31 +3578,27 @@ repeat:
WARN_ON_ONCE(!list_empty(&rescuer->scheduled));
- if (assign_rescuer_work(pwq, rescuer)) {
+ while (assign_rescuer_work(pwq, rescuer)) {
process_scheduled_works(rescuer);
/*
- * The above execution of rescued work items could
- * have created more to rescue through
- * pwq_activate_first_inactive() or chained
- * queueing. Let's put @pwq back on mayday list so
- * that such back-to-back work items, which may be
- * being used to relieve memory pressure, don't
- * incur MAYDAY_INTERVAL delay inbetween.
+ * If the per-turn work item limit is reached and other
+ * PWQs are in mayday, requeue mayday for this PWQ and
+ * let the rescuer handle the other PWQs first.
*/
- if (pwq->nr_active && need_to_create_worker(pool)) {
+ if (++count > RESCUER_BATCH && !list_empty(&pwq->wq->maydays) &&
+ pwq->nr_active && need_to_create_worker(pool)) {
raw_spin_lock(&wq_mayday_lock);
- /*
- * Queue iff somebody else hasn't queued it already.
- */
- if (list_empty(&pwq->mayday_node)) {
- get_pwq(pwq);
- list_add_tail(&pwq->mayday_node, &wq->maydays);
- }
+ send_mayday(pwq);
raw_spin_unlock(&wq_mayday_lock);
+ break;
}
}
+ /* The cursor cannot be left behind without the rescuer watching it. */
+ if (!list_empty(&pwq->mayday_cursor.entry) && list_empty(&pwq->mayday_node))
+ list_del_init(&pwq->mayday_cursor.entry);
+
/*
* Leave this pool. Notify regular workers; otherwise, we end up
* with 0 concurrency and stalling the execution.
@@ -5167,6 +5217,19 @@ static void init_pwq(struct pool_workqueue *pwq, struct workqueue_struct *wq,
INIT_LIST_HEAD(&pwq->pwqs_node);
INIT_LIST_HEAD(&pwq->mayday_node);
kthread_init_work(&pwq->release_work, pwq_release_workfn);
+
+ /*
+ * Set up the dummy cursor work with a valid function and a valid
+ * get_work_pwq() result.
+ *
+ * The cursor work should only ever sit on pwq->pool->worklist and
+ * must not be treated as a processable work item.
+ *
+ * WORK_STRUCT_PENDING and WORK_STRUCT_INACTIVE just make it less
+ * surprising to kernel debugging tools and reviewers.
+ */
+ INIT_WORK(&pwq->mayday_cursor, mayday_cursor_func);
+ atomic_long_set(&pwq->mayday_cursor.data, (unsigned long)pwq |
+ WORK_STRUCT_PENDING | WORK_STRUCT_PWQ | WORK_STRUCT_INACTIVE);
}
/* sync @pwq with the current state of its associated wq and link it */
@@ -7508,9 +7571,13 @@ static struct timer_list wq_watchdog_timer;
static unsigned long wq_watchdog_touched = INITIAL_JIFFIES;
static DEFINE_PER_CPU(unsigned long, wq_watchdog_touched_cpu) = INITIAL_JIFFIES;
-static unsigned int wq_panic_on_stall;
+static unsigned int wq_panic_on_stall = CONFIG_BOOTPARAM_WQ_STALL_PANIC;
module_param_named(panic_on_stall, wq_panic_on_stall, uint, 0644);
+static unsigned int wq_panic_on_stall_time;
+module_param_named(panic_on_stall_time, wq_panic_on_stall_time, uint, 0644);
+MODULE_PARM_DESC(panic_on_stall_time, "Panic if stall exceeds this many seconds (0=disabled)");
+
/*
* Show workers that might prevent the processing of pending work items.
* The only candidates are CPU-bound workers in the running state.
@@ -7562,14 +7629,25 @@ static void show_cpu_pools_hogs(void)
rcu_read_unlock();
}
-static void panic_on_wq_watchdog(void)
+/*
+ * Trigger a panic in either of two scenarios: when the total number of
+ * stall reports reaches wq_panic_on_stall, or when a single stall lasts
+ * at least wq_panic_on_stall_time seconds.
+ */
+static void panic_on_wq_watchdog(unsigned int stall_time_sec)
{
static unsigned int wq_stall;
if (wq_panic_on_stall) {
wq_stall++;
- BUG_ON(wq_stall >= wq_panic_on_stall);
+ if (wq_stall >= wq_panic_on_stall)
+ panic("workqueue: %u stall(s) exceeded threshold %u\n",
+ wq_stall, wq_panic_on_stall);
}
+
+ if (wq_panic_on_stall_time && stall_time_sec >= wq_panic_on_stall_time)
+ panic("workqueue: stall lasted %us, exceeding threshold %us\n",
+ stall_time_sec, wq_panic_on_stall_time);
}
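Both knobs above are module parameters of built-in workqueue code, so they can presumably be set on the kernel command line in the usual "<module>.<param>" form (for example workqueue.panic_on_stall=5 workqueue.panic_on_stall_time=120) or adjusted at runtime under /sys/module/workqueue/parameters/; the values shown are only an example.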
static void wq_watchdog_reset_touched(void)
@@ -7584,10 +7662,12 @@ static void wq_watchdog_reset_touched(void)
static void wq_watchdog_timer_fn(struct timer_list *unused)
{
unsigned long thresh = READ_ONCE(wq_watchdog_thresh) * HZ;
+ unsigned int max_stall_time = 0;
bool lockup_detected = false;
bool cpu_pool_stall = false;
unsigned long now = jiffies;
struct worker_pool *pool;
+ unsigned int stall_time;
int pi;
if (!thresh)
@@ -7621,14 +7701,15 @@ static void wq_watchdog_timer_fn(struct timer_list *unused)
/* did we stall? */
if (time_after(now, ts + thresh)) {
lockup_detected = true;
+ stall_time = jiffies_to_msecs(now - pool_ts) / 1000;
+ max_stall_time = max(max_stall_time, stall_time);
if (pool->cpu >= 0 && !(pool->flags & POOL_BH)) {
pool->cpu_stall = true;
cpu_pool_stall = true;
}
pr_emerg("BUG: workqueue lockup - pool");
pr_cont_pool_info(pool);
- pr_cont(" stuck for %us!\n",
- jiffies_to_msecs(now - pool_ts) / 1000);
+ pr_cont(" stuck for %us!\n", stall_time);
}
@@ -7641,7 +7722,7 @@ static void wq_watchdog_timer_fn(struct timer_list *unused)
show_cpu_pools_hogs();
if (lockup_detected)
- panic_on_wq_watchdog();
+ panic_on_wq_watchdog(max_stall_time);
wq_watchdog_reset_touched();
mod_timer(&wq_watchdog_timer, jiffies + thresh);