summaryrefslogtreecommitdiff
path: root/kernel/trace/trace.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2026-04-15 15:59:46 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2026-04-15 15:59:46 -0700
commite4bf304f000e6fcceaf60b1455a5124b783b3a66 (patch)
tree27880cd98f6c232dbfecc6c5b6561c1f81148db2 /kernel/trace/trace.c
parent15218296329e489d861a3e4fd2bd299afc115b8e (diff)
parent6170922f137231b98fc568571befef63e1edff3f (diff)
Merge tag 'trace-ringbuffer-v7.1' of git://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace
Pull ring-buffer updates from Steven Rostedt: - Add remote buffers for pKVM pKVM has a hypervisor component that is used to protect the guest from the host kernel. This hypervisor is a black box to the kernel as the kernel is to user space. The remote buffers are used to have a memory mapping between the hypervisor and the kernel where the kernel may send commands to enable tracing within the hypervisor. Then the kernel will read this memory mapping just like user space can read the memory mapped ring buffer of the kernel tracing system. Since the hypervisor only has a single context, it doesn't need to worry about races between normal context, interrupt context and NMIs like the kernel does. The ring buffer it uses doesn't need to be as complex. The remote buffers are a simple version of the ring buffer that works in a single context. They are still per-CPU and use sub buffers. The data layout is the same as the kernel's ring buffer to share the same parsing. Currently, only ARM64 implements pKVM, but there's work to implement it also in x86. The remote buffer code is separated out from the ARM implementation so that it can be used in the future by x86. The ARM64 updates for pKVM are in the ARM/KVM tree and it merged in the remote buffers of this tree. - Make the backup instance non reusable The backup instance is a copy of the persistent ring buffer so that the persistent ring buffer could start recording again without using the data from the previous boot. The backup isn't for normal tracing. It is made read-only, and after it is consumed, it is automatically removed. - Have backup copy persistent instance before it starts recording To allow the persistent ring buffer to start recording from the kernel command line commands, move the copy of the backup instance to before the command line options start recording. 
- Report header_page overwrite field as "char" and not "int" The rust parser of the header_page file was triggering a warning when it defined the overwrite variable as "int" but it was only a single byte in size. - Fix memory barriers for the trace_buffer CPU mask When a CPU comes online, the bit is set to allow readers to know that the CPU buffer is allocated. The bit is set after the allocation is done, and a smp_wmb() is performed after the allocation and before the setting of the bit. A smp_rmb() is not added to all readers, because once a buffer is created for a CPU it is not deleted if that CPU goes offline, so this allocation is almost always done at boot up before any readers exist. In the unlikely case where a CPU comes online for the first time after the system boot has finished, send an IPI to all CPUs to force the smp_rmb() for each CPU. - Show clock function being used in debugging ring buffer data When the ring buffer checks are enabled and the ring buffer detects an inconsistency in the times of the events, print out the clock being used when the error occurred. There was a very hard to hit bug that would happen every so often and it ended up being only triggered when the jiffies clock was being used. If the bug showed the clock being used, it would have been much easier to find the problem (which was an internal function being traced, which caused the clock accounting to go off). 
* tag 'trace-ringbuffer-v7.1' of git://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace: (26 commits) ring-buffer: Prevent off-by-one array access in ring_buffer_desc_page() ring-buffer: Report header_page overwrite as char tracing: Allow backup to save persistent ring buffer before it starts tracing/Documentation: Add a section about backup instance tracing: Remove the backup instance automatically after read tracing: Make the backup instance non-reusable ring-buffer: Enforce read ordering of trace_buffer cpumask and buffers ring-buffer: Show what clock function is used on timestamp errors tracing: Check for undefined symbols in simple_ring_buffer tracing: load/unload page callbacks for simple_ring_buffer Documentation: tracing: Add tracing remotes tracing: selftests: Add trace remote tests tracing: Add a trace remote module for testing tracing: Introduce simple_ring_buffer ring-buffer: Export buffer_data_page and macros tracing: Add helpers to create trace remote events tracing: Add events/ root files to trace remotes tracing: Add events to trace remotes tracing: Add init callback to trace remotes tracing: Add non-consuming read to trace remotes ...
Diffstat (limited to 'kernel/trace/trace.c')
-rw-r--r--kernel/trace/trace.c186
1 files changed, 147 insertions, 39 deletions
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index a626211ceb9a..e9455d46ec16 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -578,8 +578,59 @@ void trace_set_ring_buffer_expanded(struct trace_array *tr)
tr->ring_buffer_expanded = true;
}
+static void trace_array_autoremove(struct work_struct *work)
+{
+ struct trace_array *tr = container_of(work, struct trace_array, autoremove_work);
+
+ trace_array_destroy(tr);
+}
+
+static struct workqueue_struct *autoremove_wq;
+
+static void trace_array_kick_autoremove(struct trace_array *tr)
+{
+ if (autoremove_wq)
+ queue_work(autoremove_wq, &tr->autoremove_work);
+}
+
+static void trace_array_cancel_autoremove(struct trace_array *tr)
+{
+ /*
+ * Since this can be called inside trace_array_autoremove(),
+ * it has to avoid deadlock of the workqueue.
+ */
+ if (work_pending(&tr->autoremove_work))
+ cancel_work_sync(&tr->autoremove_work);
+}
+
+static void trace_array_init_autoremove(struct trace_array *tr)
+{
+ INIT_WORK(&tr->autoremove_work, trace_array_autoremove);
+}
+
+static void trace_array_start_autoremove(void)
+{
+ if (autoremove_wq)
+ return;
+
+ autoremove_wq = alloc_workqueue("tr_autoremove_wq",
+ WQ_UNBOUND | WQ_HIGHPRI, 0);
+ if (!autoremove_wq)
+ pr_warn("Unable to allocate tr_autoremove_wq. autoremove disabled.\n");
+}
+
LIST_HEAD(ftrace_trace_arrays);
+static int __trace_array_get(struct trace_array *this_tr)
+{
+ /* When free_on_close is set, this is not available anymore. */
+ if (autoremove_wq && this_tr->free_on_close)
+ return -ENODEV;
+
+ this_tr->ref++;
+ return 0;
+}
+
int trace_array_get(struct trace_array *this_tr)
{
struct trace_array *tr;
@@ -587,8 +638,7 @@ int trace_array_get(struct trace_array *this_tr)
guard(mutex)(&trace_types_lock);
list_for_each_entry(tr, &ftrace_trace_arrays, list) {
if (tr == this_tr) {
- tr->ref++;
- return 0;
+ return __trace_array_get(tr);
}
}
@@ -599,6 +649,12 @@ static void __trace_array_put(struct trace_array *this_tr)
{
WARN_ON(!this_tr->ref);
this_tr->ref--;
+ /*
+ * When free_on_close is set, prepare removing the array
+ * when the last reference is released.
+ */
+ if (this_tr->ref == 1 && this_tr->free_on_close)
+ trace_array_kick_autoremove(this_tr);
}
/**
@@ -3856,7 +3912,7 @@ static int s_show(struct seq_file *m, void *v)
* Should be used after trace_array_get(), trace_types_lock
* ensures that i_cdev was already initialized.
*/
-static inline int tracing_get_cpu(struct inode *inode)
+int tracing_get_cpu(struct inode *inode)
{
if (inode->i_cdev) /* See trace_create_cpu_file() */
return (long)inode->i_cdev - 1;
@@ -4022,6 +4078,11 @@ int tracing_open_generic_tr(struct inode *inode, struct file *filp)
if (ret)
return ret;
+ if ((filp->f_mode & FMODE_WRITE) && trace_array_is_readonly(tr)) {
+ trace_array_put(tr);
+ return -EACCES;
+ }
+
filp->private_data = inode->i_private;
return 0;
@@ -5462,6 +5523,10 @@ static void update_last_data(struct trace_array *tr)
/* Only if the buffer has previous boot data clear and update it. */
tr->flags &= ~TRACE_ARRAY_FL_LAST_BOOT;
+ /* If this is a backup instance, mark it for autoremove. */
+ if (tr->flags & TRACE_ARRAY_FL_VMALLOC)
+ tr->free_on_close = true;
+
/* Reset the module list and reload them */
if (tr->scratch) {
struct trace_scratch *tscratch = tr->scratch;
@@ -7097,6 +7162,11 @@ static int tracing_clock_open(struct inode *inode, struct file *file)
if (ret)
return ret;
+ if ((file->f_mode & FMODE_WRITE) && trace_array_is_readonly(tr)) {
+ trace_array_put(tr);
+ return -EACCES;
+ }
+
ret = single_open(file, tracing_clock_show, inode->i_private);
if (ret < 0)
trace_array_put(tr);
@@ -8606,7 +8676,7 @@ static struct dentry *tracing_dentry_percpu(struct trace_array *tr, int cpu)
return tr->percpu_dir;
}
-static struct dentry *
+struct dentry *
trace_create_cpu_file(const char *name, umode_t mode, struct dentry *parent,
void *data, long cpu, const struct file_operations *fops)
{
@@ -9527,8 +9597,8 @@ struct trace_array *trace_array_find_get(const char *instance)
guard(mutex)(&trace_types_lock);
tr = trace_array_find(instance);
- if (tr)
- tr->ref++;
+ if (tr && __trace_array_get(tr) < 0)
+ tr = NULL;
return tr;
}
@@ -9625,6 +9695,8 @@ trace_array_create_systems(const char *name, const char *systems,
if (ftrace_allocate_ftrace_ops(tr) < 0)
goto out_free_tr;
+ trace_array_init_autoremove(tr);
+
ftrace_init_trace_array(tr);
init_trace_flags_index(tr);
@@ -9735,7 +9807,9 @@ struct trace_array *trace_array_get_by_name(const char *name, const char *system
list_for_each_entry(tr, &ftrace_trace_arrays, list) {
if (tr->name && strcmp(tr->name, name) == 0) {
- tr->ref++;
+ /* if this fails, @tr is going to be removed. */
+ if (__trace_array_get(tr) < 0)
+ tr = NULL;
return tr;
}
}
@@ -9774,6 +9848,7 @@ static int __remove_instance(struct trace_array *tr)
set_tracer_flag(tr, 1ULL << i, 0);
}
+ trace_array_cancel_autoremove(tr);
tracing_set_nop(tr);
clear_ftrace_function_probes(tr);
event_trace_del_tracer(tr);
@@ -9866,17 +9941,22 @@ static __init void create_trace_instances(struct dentry *d_tracer)
static void
init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
{
+ umode_t writable_mode = TRACE_MODE_WRITE;
int cpu;
+ if (trace_array_is_readonly(tr))
+ writable_mode = TRACE_MODE_READ;
+
trace_create_file("available_tracers", TRACE_MODE_READ, d_tracer,
- tr, &show_traces_fops);
+ tr, &show_traces_fops);
- trace_create_file("current_tracer", TRACE_MODE_WRITE, d_tracer,
- tr, &set_tracer_fops);
+ trace_create_file("current_tracer", writable_mode, d_tracer,
+ tr, &set_tracer_fops);
- trace_create_file("tracing_cpumask", TRACE_MODE_WRITE, d_tracer,
+ trace_create_file("tracing_cpumask", writable_mode, d_tracer,
tr, &tracing_cpumask_fops);
+ /* Options are used for changing print-format even for readonly instance. */
trace_create_file("trace_options", TRACE_MODE_WRITE, d_tracer,
tr, &tracing_iter_fops);
@@ -9886,12 +9966,36 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
trace_create_file("trace_pipe", TRACE_MODE_READ, d_tracer,
tr, &tracing_pipe_fops);
- trace_create_file("buffer_size_kb", TRACE_MODE_WRITE, d_tracer,
+ trace_create_file("buffer_size_kb", writable_mode, d_tracer,
tr, &tracing_entries_fops);
trace_create_file("buffer_total_size_kb", TRACE_MODE_READ, d_tracer,
tr, &tracing_total_entries_fops);
+ trace_create_file("trace_clock", writable_mode, d_tracer, tr,
+ &trace_clock_fops);
+
+ trace_create_file("timestamp_mode", TRACE_MODE_READ, d_tracer, tr,
+ &trace_time_stamp_mode_fops);
+
+ tr->buffer_percent = 50;
+
+ trace_create_file("buffer_subbuf_size_kb", writable_mode, d_tracer,
+ tr, &buffer_subbuf_size_fops);
+
+ create_trace_options_dir(tr);
+
+ if (tr->range_addr_start)
+ trace_create_file("last_boot_info", TRACE_MODE_READ, d_tracer,
+ tr, &last_boot_fops);
+
+ for_each_tracing_cpu(cpu)
+ tracing_init_tracefs_percpu(tr, cpu);
+
+ /* Read-only instance has above files only. */
+ if (trace_array_is_readonly(tr))
+ return;
+
trace_create_file("free_buffer", 0200, d_tracer,
tr, &tracing_free_buffer_fops);
@@ -9903,49 +10007,29 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
trace_create_file("trace_marker_raw", 0220, d_tracer,
tr, &tracing_mark_raw_fops);
- trace_create_file("trace_clock", TRACE_MODE_WRITE, d_tracer, tr,
- &trace_clock_fops);
-
- trace_create_file("tracing_on", TRACE_MODE_WRITE, d_tracer,
- tr, &rb_simple_fops);
-
- trace_create_file("timestamp_mode", TRACE_MODE_READ, d_tracer, tr,
- &trace_time_stamp_mode_fops);
-
- tr->buffer_percent = 50;
-
trace_create_file("buffer_percent", TRACE_MODE_WRITE, d_tracer,
- tr, &buffer_percent_fops);
-
- trace_create_file("buffer_subbuf_size_kb", TRACE_MODE_WRITE, d_tracer,
- tr, &buffer_subbuf_size_fops);
+ tr, &buffer_percent_fops);
trace_create_file("syscall_user_buf_size", TRACE_MODE_WRITE, d_tracer,
- tr, &tracing_syscall_buf_fops);
+ tr, &tracing_syscall_buf_fops);
- create_trace_options_dir(tr);
+ trace_create_file("tracing_on", TRACE_MODE_WRITE, d_tracer,
+ tr, &rb_simple_fops);
trace_create_maxlat_file(tr, d_tracer);
if (ftrace_create_function_files(tr, d_tracer))
MEM_FAIL(1, "Could not allocate function filter files");
- if (tr->range_addr_start) {
- trace_create_file("last_boot_info", TRACE_MODE_READ, d_tracer,
- tr, &last_boot_fops);
#ifdef CONFIG_TRACER_SNAPSHOT
- } else {
+ if (!tr->range_addr_start)
trace_create_file("snapshot", TRACE_MODE_WRITE, d_tracer,
tr, &snapshot_fops);
#endif
- }
trace_create_file("error_log", TRACE_MODE_WRITE, d_tracer,
tr, &tracing_err_log_fops);
- for_each_tracing_cpu(cpu)
- tracing_init_tracefs_percpu(tr, cpu);
-
ftrace_init_tracefs(tr, d_tracer);
}
@@ -10771,17 +10855,41 @@ __init static void enable_instances(void)
/*
* Backup buffers can be freed but need vfree().
*/
- if (backup)
- tr->flags |= TRACE_ARRAY_FL_VMALLOC;
+ if (backup) {
+ tr->flags |= TRACE_ARRAY_FL_VMALLOC | TRACE_ARRAY_FL_RDONLY;
+ trace_array_start_autoremove();
+ }
if (start || backup) {
tr->flags |= TRACE_ARRAY_FL_BOOT | TRACE_ARRAY_FL_LAST_BOOT;
tr->range_name = no_free_ptr(rname);
}
+ /*
+ * Save the events to start and enable them after all boot instances
+ * have been created.
+ */
+ tr->boot_events = curr_str;
+ }
+
+ /* Enable the events after all boot instances have been created */
+ list_for_each_entry(tr, &ftrace_trace_arrays, list) {
+
+ if (!tr->boot_events || !(*tr->boot_events)) {
+ tr->boot_events = NULL;
+ continue;
+ }
+
+ curr_str = tr->boot_events;
+
+ /* Clear the instance if this is a persistent buffer */
+ if (tr->flags & TRACE_ARRAY_FL_LAST_BOOT)
+ update_last_data(tr);
+
while ((tok = strsep(&curr_str, ","))) {
early_enable_events(tr, tok, true);
}
+ tr->boot_events = NULL;
}
}