From a1f157c7a3bb342b972c2830a0ad53f627000a04 Mon Sep 17 00:00:00 2001 From: Zheng Yejian Date: Wed, 6 Sep 2023 17:18:37 +0800 Subject: tracing: Expand all ring buffers individually The ring buffer of global_trace is set to the minimum size in order to save memory on boot up and then it will be expand when some trace feature enabled. However currently operations under an instance can also cause global_trace ring buffer being expanded, and the expanded memory would be wasted if global_trace then not being used. See following case, we enable 'sched_switch' event in instance 'A', then ring buffer of global_trace is unexpectedly expanded to be 1410KB, also the '(expanded: 1408)' from 'buffer_size_kb' of instance is confusing. # cd /sys/kernel/tracing # mkdir instances/A # cat buffer_size_kb 7 (expanded: 1408) # cat instances/A/buffer_size_kb 1410 (expanded: 1408) # echo sched:sched_switch > instances/A/set_event # cat buffer_size_kb 1410 # cat instances/A/buffer_size_kb 1410 To fix it, we can: - Make 'ring_buffer_expanded' as a member of 'struct trace_array'; - Make 'ring_buffer_expanded' of instance is defaultly true, global_trace is defaultly false; - In order not to expose 'global_trace' outside of file 'kernel/trace/trace.c', introduce trace_set_ring_buffer_expanded() to set 'ring_buffer_expanded' as 'true'; - Pass the expected trace_array to tracing_update_buffers(). Link: https://lore.kernel.org/linux-trace-kernel/20230906091837.3998020-1-zhengyejian1@huawei.com Signed-off-by: Zheng Yejian Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 47 +++++++++++++++++++++++++-------------------- kernel/trace/trace.h | 9 +++++++-- kernel/trace/trace_events.c | 22 +++++++++++---------- 3 files changed, 45 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index abaaf516fcae..dd6395692ff9 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -54,12 +54,6 @@ #include "trace.h" #include "trace_output.h" -/* - * On boot up, the ring buffer is set to the minimum size, so that - * we do not waste memory on systems that are not using tracing. - */ -bool ring_buffer_expanded; - #ifdef CONFIG_FTRACE_STARTUP_TEST /* * We need to change this state when a selftest is running. @@ -202,7 +196,7 @@ static int __init set_cmdline_ftrace(char *str) strscpy(bootup_tracer_buf, str, MAX_TRACER_SIZE); default_bootup_tracer = bootup_tracer_buf; /* We are using ftrace early, expand it */ - ring_buffer_expanded = true; + trace_set_ring_buffer_expanded(NULL); return 1; } __setup("ftrace=", set_cmdline_ftrace); @@ -247,7 +241,7 @@ static int __init boot_alloc_snapshot(char *str) } else { allocate_snapshot = true; /* We also need the main ring buffer expanded */ - ring_buffer_expanded = true; + trace_set_ring_buffer_expanded(NULL); } return 1; } @@ -490,6 +484,13 @@ static struct trace_array global_trace = { .trace_flags = TRACE_DEFAULT_FLAGS, }; +void trace_set_ring_buffer_expanded(struct trace_array *tr) +{ + if (!tr) + tr = &global_trace; + tr->ring_buffer_expanded = true; +} + LIST_HEAD(ftrace_trace_arrays); int trace_array_get(struct trace_array *this_tr) @@ -2012,7 +2013,7 @@ static int run_tracer_selftest(struct tracer *type) #ifdef CONFIG_TRACER_MAX_TRACE if (type->use_max_tr) { /* If we expanded the buffers, make sure the max is expanded too */ - if (ring_buffer_expanded) + if (tr->ring_buffer_expanded) ring_buffer_resize(tr->max_buffer.buffer, trace_buf_size, RING_BUFFER_ALL_CPUS); tr->allocated_snapshot = true; @@ -2038,7 +2039,7 @@ static int run_tracer_selftest(struct tracer *type) tr->allocated_snapshot = false; /* Shrink the max buffer again */ - if (ring_buffer_expanded) + if (tr->ring_buffer_expanded) ring_buffer_resize(tr->max_buffer.buffer, 1, RING_BUFFER_ALL_CPUS); } @@ -3403,7 +3404,7 @@ void trace_printk_init_buffers(void) pr_warn("**********************************************************\n"); /* Expand the buffers to set size */ - tracing_update_buffers(); + tracing_update_buffers(&global_trace); buffers_allocated = 1; @@ -6374,7 +6375,7 @@ static int __tracing_resize_ring_buffer(struct trace_array *tr, * we use the size that was given, and we can forget about * expanding it later. */ - ring_buffer_expanded = true; + trace_set_ring_buffer_expanded(tr); /* May be called before buffers are initialized */ if (!tr->array_buffer.buffer) @@ -6452,6 +6453,7 @@ out: /** * tracing_update_buffers - used by tracing facility to expand ring buffers + * @tr: The tracing instance * * To save on memory when the tracing is never used on a system with it * configured in. The ring buffers are set to a minimum size. But once @@ -6460,13 +6462,13 @@ out: * * This function is to be called when a tracer is about to be used. */ -int tracing_update_buffers(void) +int tracing_update_buffers(struct trace_array *tr) { int ret = 0; mutex_lock(&trace_types_lock); - if (!ring_buffer_expanded) - ret = __tracing_resize_ring_buffer(&global_trace, trace_buf_size, + if (!tr->ring_buffer_expanded) + ret = __tracing_resize_ring_buffer(tr, trace_buf_size, RING_BUFFER_ALL_CPUS); mutex_unlock(&trace_types_lock); @@ -6520,7 +6522,7 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf) mutex_lock(&trace_types_lock); - if (!ring_buffer_expanded) { + if (!tr->ring_buffer_expanded) { ret = __tracing_resize_ring_buffer(tr, trace_buf_size, RING_BUFFER_ALL_CPUS); if (ret < 0) @@ -7192,7 +7194,7 @@ tracing_entries_read(struct file *filp, char __user *ubuf, } if (buf_size_same) { - if (!ring_buffer_expanded) + if (!tr->ring_buffer_expanded) r = sprintf(buf, "%lu (expanded: %lu)\n", size >> 10, trace_buf_size >> 10); @@ -7249,10 +7251,10 @@ tracing_total_entries_read(struct file *filp, char __user *ubuf, mutex_lock(&trace_types_lock); for_each_tracing_cpu(cpu) { size += per_cpu_ptr(tr->array_buffer.data, cpu)->entries >> 10; - if (!ring_buffer_expanded) + if (!tr->ring_buffer_expanded) expanded_size += trace_buf_size >> 10; } - if (ring_buffer_expanded) + if (tr->ring_buffer_expanded) r = sprintf(buf, "%lu\n", size); else r = sprintf(buf, "%lu (expanded: %lu)\n", size, expanded_size); @@ -7646,7 +7648,7 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt, unsigned long val; int ret; - ret = tracing_update_buffers(); + ret = tracing_update_buffers(tr); if (ret < 0) return ret; @@ -9550,6 +9552,9 @@ static struct trace_array *trace_array_create(const char *name) if (allocate_trace_buffers(tr, trace_buf_size) < 0) goto out_free_tr; + /* The ring buffer is defaultly expanded */ + trace_set_ring_buffer_expanded(tr); + if (ftrace_allocate_ftrace_ops(tr) < 0) goto out_free_tr; @@ -10444,7 +10449,7 @@ __init static int tracer_alloc_buffers(void) trace_printk_init_buffers(); /* To save memory, keep the ring buffer size to its minimum */ - if (ring_buffer_expanded) + if (global_trace.ring_buffer_expanded) ring_buf_size = trace_buf_size; else ring_buf_size = 1; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 77debe53f07c..e92cb9c1292f 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -410,6 +410,11 @@ struct trace_array { struct cond_snapshot *cond_snapshot; #endif struct trace_func_repeats __percpu *last_func_repeats; + /* + * On boot up, the ring buffer is set to the minimum size, so that + * we do not waste memory on systems that are not using tracing. + */ + bool ring_buffer_expanded; }; enum { @@ -761,7 +766,7 @@ extern int DYN_FTRACE_TEST_NAME(void); #define DYN_FTRACE_TEST_NAME2 trace_selftest_dynamic_test_func2 extern int DYN_FTRACE_TEST_NAME2(void); -extern bool ring_buffer_expanded; +extern void trace_set_ring_buffer_expanded(struct trace_array *tr); extern bool tracing_selftest_disabled; #ifdef CONFIG_FTRACE_STARTUP_TEST @@ -1305,7 +1310,7 @@ static inline void trace_branch_disable(void) #endif /* CONFIG_BRANCH_TRACER */ /* set ring buffers to default size if not already done so */ -int tracing_update_buffers(void); +int tracing_update_buffers(struct trace_array *tr); union trace_synth_field { u8 as_u8; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index f49d6ddb6342..099b9b6bbdc4 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1166,7 +1166,7 @@ ftrace_event_write(struct file *file, const char __user *ubuf, if (!cnt) return 0; - ret = tracing_update_buffers(); + ret = tracing_update_buffers(tr); if (ret < 0) return ret; @@ -1397,18 +1397,20 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, if (ret) return ret; - ret = tracing_update_buffers(); - if (ret < 0) - return ret; - switch (val) { case 0: case 1: ret = -ENODEV; mutex_lock(&event_mutex); file = event_file_data(filp); - if (likely(file)) + if (likely(file)) { + ret = tracing_update_buffers(file->tr); + if (ret < 0) { + mutex_unlock(&event_mutex); + return ret; + } ret = ftrace_event_enable_disable(file, val); + } mutex_unlock(&event_mutex); break; @@ -1482,7 +1484,7 @@ system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, if (ret) return ret; - ret = tracing_update_buffers(); + ret = tracing_update_buffers(dir->tr); if (ret < 0) return ret; @@ -1956,7 +1958,7 @@ event_pid_write(struct file *filp, const char __user *ubuf, if (!cnt) return 0; - ret = tracing_update_buffers(); + ret = tracing_update_buffers(tr); if (ret < 0) return ret; @@ -2824,7 +2826,7 @@ static __init int setup_trace_triggers(char *str) int i; strscpy(bootup_trigger_buf, str, COMMAND_LINE_SIZE); - ring_buffer_expanded = true; + trace_set_ring_buffer_expanded(NULL); disable_tracing_selftest("running event triggers"); buf = bootup_trigger_buf; @@ -3614,7 +3616,7 @@ static char bootup_event_buf[COMMAND_LINE_SIZE] __initdata; static __init int setup_trace_event(char *str) { strscpy(bootup_event_buf, str, COMMAND_LINE_SIZE); - ring_buffer_expanded = true; + trace_set_ring_buffer_expanded(NULL); disable_tracing_selftest("running event tracing"); return 1; -- cgit v1.2.3 From bdf4fb628093b4ab2f4eb312256233a2ae0594b7 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Thu, 14 Sep 2023 18:34:02 +0200 Subject: ring_buffer: Use try_cmpxchg instead of cmpxchg in rb_insert_pages Use try_cmpxchg instead of cmpxchg (*ptr, old, new) == old in rb_insert_pages. x86 CMPXCHG instruction returns success in ZF flag, so this change saves a compare after cmpxchg (and related move instruction in front of cmpxchg). No functional change intended. Link: https://lore.kernel.org/linux-trace-kernel/20230914163420.12923-1-ubizjak@gmail.com Cc: Steven Rostedt Cc: Masami Hiramatsu Signed-off-by: Uros Bizjak Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 515cafdb18d9..43cc47d7faaf 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2056,7 +2056,7 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer) retries = 10; success = false; while (retries--) { - struct list_head *head_page, *prev_page, *r; + struct list_head *head_page, *prev_page; struct list_head *last_page, *first_page; struct list_head *head_page_with_bit; struct buffer_page *hpage = rb_set_head_page(cpu_buffer); @@ -2075,9 +2075,9 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer) last_page->next = head_page_with_bit; first_page->prev = prev_page; - r = cmpxchg(&prev_page->next, head_page_with_bit, first_page); - - if (r == head_page_with_bit) { + /* caution: head_page_with_bit gets updated on cmpxchg failure */ + if (try_cmpxchg(&prev_page->next, + &head_page_with_bit, first_page)) { /* * yay, we replaced the page pointer to our new list, * now, we just have to update to head page's prev -- cgit v1.2.3 From 5dbd04eddb2c0841d1b3930e0a9944a2343c9cac Mon Sep 17 00:00:00 2001 From: Beau Belgrave Date: Tue, 12 Sep 2023 18:07:02 +0000 Subject: tracing/user_events: Allow events to persist for perfmon_capable users There are several scenarios that have come up where having a user_event persist even if the process that registered it exits. The main one is having a daemon create events on bootup that shouldn't get deleted if the daemon has to exit or reload. Another is within OpenTelemetry exporters, they wish to potentially check if a user_event exists on the system to determine if exporting the data out should occur. The user_event in this case must exist even in the absence of the owning process running (such as the above daemon case). Expose the previously internal flag USER_EVENT_REG_PERSIST to user processes. Upon register or delete of events with this flag, ensure the user is perfmon_capable to prevent random user processes with access to tracefs from creating events that persist after exit. Link: https://lkml.kernel.org/r/20230912180704.1284-2-beaub@linux.microsoft.com Signed-off-by: Beau Belgrave Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_user.c | 36 ++++++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index b87f41187c6a..9365ce407426 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -49,18 +49,6 @@ #define EVENT_STATUS_PERF BIT(1) #define EVENT_STATUS_OTHER BIT(7) -/* - * User register flags are not allowed yet, keep them here until we are - * ready to expose them out to the user ABI. - */ -enum user_reg_flag { - /* Event will not delete upon last reference closing */ - USER_EVENT_REG_PERSIST = 1U << 0, - - /* This value or above is currently non-ABI */ - USER_EVENT_REG_MAX = 1U << 1, -}; - /* * Stores the system name, tables, and locks for a group of events. This * allows isolation for events by various means. @@ -220,6 +208,17 @@ static u32 user_event_key(char *name) return jhash(name, strlen(name), 0); } +static bool user_event_capable(u16 reg_flags) +{ + /* Persistent events require CAP_PERFMON / CAP_SYS_ADMIN */ + if (reg_flags & USER_EVENT_REG_PERSIST) { + if (!perfmon_capable()) + return false; + } + + return true; +} + static struct user_event *user_event_get(struct user_event *user) { refcount_inc(&user->refcnt); @@ -1811,6 +1810,9 @@ static int user_event_free(struct dyn_event *ev) if (!user_event_last_ref(user)) return -EBUSY; + if (!user_event_capable(user->reg_flags)) + return -EPERM; + return destroy_user_event(user); } @@ -1926,10 +1928,13 @@ static int user_event_parse(struct user_event_group *group, char *name, int argc = 0; char **argv; - /* User register flags are not ready yet */ - if (reg_flags != 0 || flags != NULL) + /* Currently don't support any text based flags */ + if (flags != NULL) return -EINVAL; + if (!user_event_capable(reg_flags)) + return -EPERM; + /* Prevent dyn_event from racing */ mutex_lock(&event_mutex); user = find_user_event(group, name, &key); @@ -2062,6 +2067,9 @@ static int delete_user_event(struct user_event_group *group, char *name) if (!user_event_last_ref(user)) return -EBUSY; + if (!user_event_capable(user->reg_flags)) + return -EPERM; + return destroy_user_event(user); } -- cgit v1.2.3 From 5790b1fb3d672d9a1fe3881a7181dfdbe741568f Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Wed, 4 Oct 2023 16:50:07 -0400 Subject: eventfs: Remove eventfs_file and just use eventfs_inode Instead of having a descriptor for every file represented in the eventfs directory, only have the directory itself represented. Change the API to send in a list of entries that represent all the files in the directory (but not other directories). The entry list contains a name and a callback function that will be used to create the files when they are accessed. struct eventfs_inode *eventfs_create_events_dir(const char *name, struct dentry *parent, const struct eventfs_entry *entries, int size, void *data); is used for the top level eventfs directory, and returns an eventfs_inode that will be used by: struct eventfs_inode *eventfs_create_dir(const char *name, struct eventfs_inode *parent, const struct eventfs_entry *entries, int size, void *data); where both of the above take an array of struct eventfs_entry entries for every file that is in the directory. The entries are defined by: typedef int (*eventfs_callback)(const char *name, umode_t *mode, void **data, const struct file_operations **fops); struct eventfs_entry { const char *name; eventfs_callback callback; }; Where the name is the name of the file and the callback gets called when the file is being created. The callback passes in the name (in case the same callback is used for multiple files), a pointer to the mode, data and fops. The data will be pointing to the data that was passed in eventfs_create_dir() or eventfs_create_events_dir() but may be overridden to point to something else, as it will be used to point to the inode->i_private that is created. The information passed back from the callback is used to create the dentry/inode. If the callback fills the data and the file should be created, it must return a positive number. On zero or negative, the file is ignored. This logic may also be used as a prototype to convert entire pseudo file systems into just-in-time allocation. The "show_events_dentry" file has been updated to show the directories, and any files they have. With just the eventfs_file allocations: Before after deltas for meminfo (in kB): MemFree: -14360 MemAvailable: -14260 Buffers: 40 Cached: 24 Active: 44 Inactive: 48 Inactive(anon): 28 Active(file): 44 Inactive(file): 20 Dirty: -4 AnonPages: 28 Mapped: 4 KReclaimable: 132 Slab: 1604 SReclaimable: 132 SUnreclaim: 1472 Committed_AS: 12 Before after deltas for slabinfo: : [ * = ] ext4_inode_cache 27 [* 1184 = 31968 ] extent_status 102 [* 40 = 4080 ] tracefs_inode_cache 144 [* 656 = 94464 ] buffer_head 39 [* 104 = 4056 ] shmem_inode_cache 49 [* 800 = 39200 ] filp -53 [* 256 = -13568 ] dentry 251 [* 192 = 48192 ] lsm_file_cache 277 [* 32 = 8864 ] vm_area_struct -14 [* 184 = -2576 ] trace_event_file 1748 [* 88 = 153824 ] kmalloc-1k 35 [* 1024 = 35840 ] kmalloc-256 49 [* 256 = 12544 ] kmalloc-192 -28 [* 192 = -5376 ] kmalloc-128 -30 [* 128 = -3840 ] kmalloc-96 10581 [* 96 = 1015776 ] kmalloc-64 3056 [* 64 = 195584 ] kmalloc-32 1291 [* 32 = 41312 ] kmalloc-16 2310 [* 16 = 36960 ] kmalloc-8 9216 [* 8 = 73728 ] Free memory dropped by 14,360 kB Available memory dropped by 14,260 kB Total slab additions in size: 1,771,032 bytes With this change: Before after deltas for meminfo (in kB): MemFree: -12084 MemAvailable: -11976 Buffers: 32 Cached: 32 Active: 72 Inactive: 168 Inactive(anon): 176 Active(file): 72 Inactive(file): -8 Dirty: 24 AnonPages: 196 Mapped: 8 KReclaimable: 148 Slab: 836 SReclaimable: 148 SUnreclaim: 688 Committed_AS: 324 Before after deltas for slabinfo: : [ * = ] tracefs_inode_cache 144 [* 656 = 94464 ] shmem_inode_cache -23 [* 800 = -18400 ] filp -92 [* 256 = -23552 ] dentry 179 [* 192 = 34368 ] lsm_file_cache -3 [* 32 = -96 ] vm_area_struct -13 [* 184 = -2392 ] trace_event_file 1748 [* 88 = 153824 ] kmalloc-1k -49 [* 1024 = -50176 ] kmalloc-256 -27 [* 256 = -6912 ] kmalloc-128 1864 [* 128 = 238592 ] kmalloc-64 4685 [* 64 = 299840 ] kmalloc-32 -72 [* 32 = -2304 ] kmalloc-16 256 [* 16 = 4096 ] total = 721352 Free memory dropped by 12,084 kB Available memory dropped by 11,976 kB Total slab additions in size: 721,352 bytes That's over 2 MB in savings per instance for free and available memory, and over 1 MB in savings per instance of slab memory. Link: https://lore.kernel.org/linux-trace-kernel/20231003184059.4924468e@gandalf.local.home Link: https://lore.kernel.org/linux-trace-kernel/20231004165007.43d79161@gandalf.local.home Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Andrew Morton Cc: Ajay Kaher Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 7 +- kernel/trace/trace.h | 4 +- kernel/trace/trace_events.c | 313 ++++++++++++++++++++++++++++++-------------- 3 files changed, 221 insertions(+), 103 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index dd6395692ff9..4383be8fa1b0 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -9764,7 +9764,6 @@ static __init void create_trace_instances(struct dentry *d_tracer) static void init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) { - struct trace_event_file *file; int cpu; trace_create_file("available_tracers", TRACE_MODE_READ, d_tracer, @@ -9797,11 +9796,7 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) trace_create_file("trace_marker", 0220, d_tracer, tr, &tracing_mark_fops); - file = __find_event_file(tr, "ftrace", "print"); - if (file && file->ef) - eventfs_add_file("trigger", TRACE_MODE_WRITE, file->ef, - file, &event_trigger_fops); - tr->trace_marker_file = file; + tr->trace_marker_file = __find_event_file(tr, "ftrace", "print"); trace_create_file("trace_marker_raw", 0220, d_tracer, tr, &tracing_mark_raw_fops); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index e92cb9c1292f..0e1405abf4f7 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -381,7 +381,7 @@ struct trace_array { struct dentry *dir; struct dentry *options; struct dentry *percpu_dir; - struct dentry *event_dir; + struct eventfs_inode *event_dir; struct trace_options *topts; struct list_head systems; struct list_head events; @@ -1349,7 +1349,7 @@ struct trace_subsystem_dir { struct list_head list; struct event_subsystem *subsystem; struct trace_array *tr; - struct eventfs_file *ef; + struct eventfs_inode *ei; int ref_count; int nr_events; }; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 099b9b6bbdc4..a3b9d9423824 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -984,7 +984,7 @@ static void remove_subsystem(struct trace_subsystem_dir *dir) return; if (!--dir->nr_events) { - eventfs_remove(dir->ef); + eventfs_remove_dir(dir->ei); list_del(&dir->list); __put_system_dir(dir); } @@ -992,7 +992,7 @@ static void remove_subsystem(struct trace_subsystem_dir *dir) static void remove_event_file_dir(struct trace_event_file *file) { - eventfs_remove(file->ef); + eventfs_remove_dir(file->ei); list_del(&file->list); remove_subsystem(file->system); free_event_filter(file->filter); @@ -2282,14 +2282,40 @@ create_new_subsystem(const char *name) return NULL; } -static struct eventfs_file * +int system_callback(const char *name, umode_t *mode, void **data, + const struct file_operations **fops) +{ + if (strcmp(name, "filter") == 0) + *fops = &ftrace_subsystem_filter_fops; + + else if (strcmp(name, "enable") == 0) + *fops = &ftrace_system_enable_fops; + + else + return 0; + + *mode = TRACE_MODE_WRITE; + return 1; +} + +static struct eventfs_inode * event_subsystem_dir(struct trace_array *tr, const char *name, - struct trace_event_file *file, struct dentry *parent) + struct trace_event_file *file, struct eventfs_inode *parent) { struct event_subsystem *system, *iter; struct trace_subsystem_dir *dir; - struct eventfs_file *ef; - int res; + struct eventfs_inode *ei; + int nr_entries; + static struct eventfs_entry system_entries[] = { + { + .name = "filter", + .callback = system_callback, + }, + { + .name = "enable", + .callback = system_callback, + } + }; /* First see if we did not already create this dir */ list_for_each_entry(dir, &tr->systems, list) { @@ -2297,7 +2323,7 @@ event_subsystem_dir(struct trace_array *tr, const char *name, if (strcmp(system->name, name) == 0) { dir->nr_events++; file->system = dir; - return dir->ef; + return dir->ei; } } @@ -2321,39 +2347,29 @@ event_subsystem_dir(struct trace_array *tr, const char *name, } else __get_system(system); - ef = eventfs_add_subsystem_dir(name, parent); - if (IS_ERR(ef)) { + /* ftrace only has directories no files */ + if (strcmp(name, "ftrace") == 0) + nr_entries = 0; + else + nr_entries = ARRAY_SIZE(system_entries); + + ei = eventfs_create_dir(name, parent, system_entries, nr_entries, dir); + if (!ei) { pr_warn("Failed to create system directory %s\n", name); __put_system(system); goto out_free; } - dir->ef = ef; + dir->ei = ei; dir->tr = tr; dir->ref_count = 1; dir->nr_events = 1; dir->subsystem = system; file->system = dir; - /* the ftrace system is special, do not create enable or filter files */ - if (strcmp(name, "ftrace") != 0) { - - res = eventfs_add_file("filter", TRACE_MODE_WRITE, - dir->ef, dir, - &ftrace_subsystem_filter_fops); - if (res) { - kfree(system->filter); - system->filter = NULL; - pr_warn("Could not create tracefs '%s/filter' entry\n", name); - } - - eventfs_add_file("enable", TRACE_MODE_WRITE, dir->ef, dir, - &ftrace_system_enable_fops); - } - list_add(&dir->list, &tr->systems); - return dir->ef; + return dir->ei; out_free: kfree(dir); @@ -2402,15 +2418,134 @@ event_define_fields(struct trace_event_call *call) return ret; } +static int event_callback(const char *name, umode_t *mode, void **data, + const struct file_operations **fops) +{ + struct trace_event_file *file = *data; + struct trace_event_call *call = file->event_call; + + if (strcmp(name, "format") == 0) { + *mode = TRACE_MODE_READ; + *fops = &ftrace_event_format_fops; + *data = call; + return 1; + } + + /* + * Only event directories that can be enabled should have + * triggers or filters, with the exception of the "print" + * event that can have a "trigger" file. + */ + if (!(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)) { + if (call->class->reg && strcmp(name, "enable") == 0) { + *mode = TRACE_MODE_WRITE; + *fops = &ftrace_enable_fops; + return 1; + } + + if (strcmp(name, "filter") == 0) { + *mode = TRACE_MODE_WRITE; + *fops = &ftrace_event_filter_fops; + return 1; + } + } + + if (!(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE) || + strcmp(trace_event_name(call), "print") == 0) { + if (strcmp(name, "trigger") == 0) { + *mode = TRACE_MODE_WRITE; + *fops = &event_trigger_fops; + return 1; + } + } + +#ifdef CONFIG_PERF_EVENTS + if (call->event.type && call->class->reg && + strcmp(name, "id") == 0) { + *mode = TRACE_MODE_READ; + *data = (void *)(long)call->event.type; + *fops = &ftrace_event_id_fops; + return 1; + } +#endif + +#ifdef CONFIG_HIST_TRIGGERS + if (strcmp(name, "hist") == 0) { + *mode = TRACE_MODE_READ; + *fops = &event_hist_fops; + return 1; + } +#endif +#ifdef CONFIG_HIST_TRIGGERS_DEBUG + if (strcmp(name, "hist_debug") == 0) { + *mode = TRACE_MODE_READ; + *fops = &event_hist_debug_fops; + return 1; + } +#endif +#ifdef CONFIG_TRACE_EVENT_INJECT + if (call->event.type && call->class->reg && + strcmp(name, "inject") == 0) { + *mode = 0200; + *fops = &event_inject_fops; + return 1; + } +#endif + return 0; +} + static int -event_create_dir(struct dentry *parent, struct trace_event_file *file) +event_create_dir(struct eventfs_inode *parent, struct trace_event_file *file) { struct trace_event_call *call = file->event_call; - struct eventfs_file *ef_subsystem = NULL; struct trace_array *tr = file->tr; - struct eventfs_file *ef; + struct eventfs_inode *e_events; + struct eventfs_inode *ei; const char *name; + int nr_entries; int ret; + static struct eventfs_entry event_entries[] = { + { + .name = "enable", + .callback = event_callback, + }, + { + .name = "filter", + .callback = event_callback, + }, + { + .name = "trigger", + .callback = event_callback, + }, + { + .name = "format", + .callback = event_callback, + }, +#ifdef CONFIG_PERF_EVENTS + { + .name = "id", + .callback = event_callback, + }, +#endif +#ifdef CONFIG_HIST_TRIGGERS + { + .name = "hist", + .callback = event_callback, + }, +#endif +#ifdef CONFIG_HIST_TRIGGERS_DEBUG + { + .name = "hist_debug", + .callback = event_callback, + }, +#endif +#ifdef CONFIG_TRACE_EVENT_INJECT + { + .name = "inject", + .callback = event_callback, + }, +#endif + }; /* * If the trace point header did not define TRACE_SYSTEM @@ -2420,29 +2555,20 @@ event_create_dir(struct dentry *parent, struct trace_event_file *file) if (WARN_ON_ONCE(strcmp(call->class->system, TRACE_SYSTEM) == 0)) return -ENODEV; - ef_subsystem = event_subsystem_dir(tr, call->class->system, file, parent); - if (!ef_subsystem) + e_events = event_subsystem_dir(tr, call->class->system, file, parent); + if (!e_events) return -ENOMEM; + nr_entries = ARRAY_SIZE(event_entries); + name = trace_event_name(call); - ef = eventfs_add_dir(name, ef_subsystem); - if (IS_ERR(ef)) { + ei = eventfs_create_dir(name, e_events, event_entries, nr_entries, file); + if (IS_ERR(ei)) { pr_warn("Could not create tracefs '%s' directory\n", name); return -1; } - file->ef = ef; - - if (call->class->reg && !(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)) - eventfs_add_file("enable", TRACE_MODE_WRITE, file->ef, file, - &ftrace_enable_fops); - -#ifdef CONFIG_PERF_EVENTS - if (call->event.type && call->class->reg) - eventfs_add_file("id", TRACE_MODE_READ, file->ef, - (void *)(long)call->event.type, - &ftrace_event_id_fops); -#endif + file->ei = ei; ret = event_define_fields(call); if (ret < 0) { @@ -2450,35 +2576,6 @@ event_create_dir(struct dentry *parent, struct trace_event_file *file) return ret; } - /* - * Only event directories that can be enabled should have - * triggers or filters. - */ - if (!(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)) { - eventfs_add_file("filter", TRACE_MODE_WRITE, file->ef, - file, &ftrace_event_filter_fops); - - eventfs_add_file("trigger", TRACE_MODE_WRITE, file->ef, - file, &event_trigger_fops); - } - -#ifdef CONFIG_HIST_TRIGGERS - eventfs_add_file("hist", TRACE_MODE_READ, file->ef, file, - &event_hist_fops); -#endif -#ifdef CONFIG_HIST_TRIGGERS_DEBUG - eventfs_add_file("hist_debug", TRACE_MODE_READ, file->ef, file, - &event_hist_debug_fops); -#endif - eventfs_add_file("format", TRACE_MODE_READ, file->ef, call, - &ftrace_event_format_fops); - -#ifdef CONFIG_TRACE_EVENT_INJECT - if (call->event.type && call->class->reg) - eventfs_add_file("inject", 0200, file->ef, file, - &event_inject_fops); -#endif - return 0; } @@ -3623,30 +3720,65 @@ static __init int setup_trace_event(char *str) } __setup("trace_event=", setup_trace_event); +static int events_callback(const char *name, umode_t *mode, void **data, + const struct file_operations **fops) +{ + if (strcmp(name, "enable") == 0) { + *mode = TRACE_MODE_WRITE; + *fops = &ftrace_tr_enable_fops; + return 1; + } + + if (strcmp(name, "header_page") == 0) + *data = ring_buffer_print_page_header; + + else if (strcmp(name, "header_event") == 0) + *data = ring_buffer_print_entry_header; + + else + return 0; + + *mode = TRACE_MODE_READ; + *fops = &ftrace_show_header_fops; + return 1; +} + /* Expects to have event_mutex held when called */ static int create_event_toplevel_files(struct dentry *parent, struct trace_array *tr) { - struct dentry *d_events; + struct eventfs_inode *e_events; struct dentry *entry; - int error = 0; + int nr_entries; + static struct eventfs_entry events_entries[] = { + { + .name = "enable", + .callback = events_callback, + }, + { + .name = "header_page", + .callback = events_callback, + }, + { + .name = "header_event", + .callback = events_callback, + }, + }; entry = trace_create_file("set_event", TRACE_MODE_WRITE, parent, tr, &ftrace_set_event_fops); if (!entry) return -ENOMEM; - d_events = eventfs_create_events_dir("events", parent); - if (IS_ERR(d_events)) { + nr_entries = ARRAY_SIZE(events_entries); + + e_events = eventfs_create_events_dir("events", parent, events_entries, + nr_entries, tr); + if (IS_ERR(e_events)) { pr_warn("Could not create tracefs 'events' directory\n"); return -ENOMEM; } - error = eventfs_add_events_file("enable", TRACE_MODE_WRITE, d_events, - tr, &ftrace_tr_enable_fops); - if (error) - return -ENOMEM; - /* There are not as crucial, just warn if they are not created */ trace_create_file("set_event_pid", TRACE_MODE_WRITE, parent, @@ -3656,16 +3788,7 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr) TRACE_MODE_WRITE, parent, tr, &ftrace_set_event_notrace_pid_fops); - /* ring buffer internal formats */ - eventfs_add_events_file("header_page", TRACE_MODE_READ, d_events, - ring_buffer_print_page_header, - &ftrace_show_header_fops); - - eventfs_add_events_file("header_event", TRACE_MODE_READ, d_events, - ring_buffer_print_entry_header, - &ftrace_show_header_fops); - - tr->event_dir = d_events; + tr->event_dir = e_events; return 0; } @@ -3749,7 +3872,7 @@ int event_trace_del_tracer(struct trace_array *tr) down_write(&trace_event_sem); __trace_remove_event_dirs(tr); - eventfs_remove_events_dir(tr->event_dir); + eventfs_remove_dir(tr->event_dir); up_write(&trace_event_sem); tr->event_dir = NULL; -- cgit v1.2.3 From 2819f23ac12ce93ff79ca7a54597df9a4a1f6331 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Thu, 5 Oct 2023 09:13:48 -0400 Subject: eventfs: Use eventfs_remove_events_dir() The update to removing the eventfs_file changed the way the events top level directory was handled. Instead of returning a dentry, it now returns the eventfs_inode. In this changed, the removing of the events top level directory is not much different than removing any of the other directories. Because of this, the removal just called eventfs_remove_dir() instead of eventfs_remove_events_dir(). Although eventfs_remove_dir() does the clean up, it misses out on the dget() of the ei->dentry done in eventfs_create_events_dir(). It makes more sense to match eventfs_create_events_dir() with a specific function eventfs_remove_events_dir() and this specific function can then perform the dput() to the dentry that had the dget() when it was created. Fixes: 5790b1fb3d67 ("eventfs: Remove eventfs_file and just use eventfs_inode") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202310051743.y9EobbUr-lkp@intel.com/ Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index a3b9d9423824..0e3a1c70e410 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -3872,7 +3872,7 @@ int event_trace_del_tracer(struct trace_array *tr) down_write(&trace_event_sem); __trace_remove_event_dirs(tr); - eventfs_remove_dir(tr->event_dir); + eventfs_remove_events_dir(tr->event_dir); up_write(&trace_event_sem); tr->event_dir = NULL; -- cgit v1.2.3 From 5ddd8baa4857709b4e5d84b376d735152851955b Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Thu, 5 Oct 2023 10:47:45 -0400 Subject: tracing: Make system_callback() function static The system_callback() function in trace_events.c is only used within that file. The "static" annotation was missed. Fixes: 5790b1fb3d672 ("eventfs: Remove eventfs_file and just use eventfs_inode") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202310051743.y9EobbUr-lkp@intel.com/ Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 0e3a1c70e410..db46d2116500 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -2282,7 +2282,7 @@ create_new_subsystem(const char *name) return NULL; } -int system_callback(const char *name, umode_t *mode, void **data, +static int system_callback(const char *name, umode_t *mode, void **data, const struct file_operations **fops) { if (strcmp(name, "filter") == 0) -- cgit v1.2.3 From 5264a2f4bb3baf712e19f1f053caaa8d7d3afa2e Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 20 Oct 2023 16:52:45 +0300 Subject: tracing: Fix a NULL vs IS_ERR() bug in event_subsystem_dir() The eventfs_create_dir() function returns error pointers, it never returns NULL. Update the check to reflect that. Link: https://lore.kernel.org/linux-trace-kernel/ff641474-84e2-46a7-9d7a-62b251a1050c@moroto.mountain Cc: Masami Hiramatsu Fixes: 5790b1fb3d67 ("eventfs: Remove eventfs_file and just use eventfs_inode") Signed-off-by: Dan Carpenter Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index db46d2116500..f9e3e24d8796 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -2354,7 +2354,7 @@ event_subsystem_dir(struct trace_array *tr, const char *name, nr_entries = ARRAY_SIZE(system_entries); ei = eventfs_create_dir(name, parent, system_entries, nr_entries, dir); - if (!ei) { + if (IS_ERR(ei)) { pr_warn("Failed to create system directory %s\n", name); __put_system(system); goto out_free; -- cgit v1.2.3 From d0ed46b60396cfa7e0056f55e1ce0b43c7db57b6 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 20 Oct 2023 04:35:45 +0100 Subject: tracing: Move readpos from seq_buf to trace_seq To make seq_buf more lightweight as a string buf, move the readpos member from seq_buf to its container, trace_seq. That puts the responsibility of maintaining the readpos entirely in the tracing code. If some future users want to package up the readpos with a seq_buf, we can define a new struct then. Link: https://lore.kernel.org/linux-trace-kernel/20231020033545.2587554-2-willy@infradead.org Cc: Kees Cook Cc: Justin Stitt Cc: Kent Overstreet Cc: Petr Mladek Cc: Andy Shevchenko Cc: Rasmus Villemoes Cc: Sergey Senozhatsky Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Acked-by: Greg Kroah-Hartman Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 10 +++++----- kernel/trace/trace_seq.c | 6 +++++- 2 files changed, 10 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 4383be8fa1b0..d629065c2383 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1731,15 +1731,15 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt) { int len; - if (trace_seq_used(s) <= s->seq.readpos) + if (trace_seq_used(s) <= s->readpos) return -EBUSY; - len = trace_seq_used(s) - s->seq.readpos; + len = trace_seq_used(s) - s->readpos; if (cnt > len) cnt = len; - memcpy(buf, s->buffer + s->seq.readpos, cnt); + memcpy(buf, s->buffer + s->readpos, cnt); - s->seq.readpos += cnt; + s->readpos += cnt; return cnt; } @@ -7008,7 +7008,7 @@ waitagain: /* Now copy what we have to the user */ sret = trace_seq_to_user(&iter->seq, ubuf, cnt); - if (iter->seq.seq.readpos >= trace_seq_used(&iter->seq)) + if (iter->seq.readpos >= trace_seq_used(&iter->seq)) trace_seq_init(&iter->seq); /* diff --git a/kernel/trace/trace_seq.c b/kernel/trace/trace_seq.c index bac06ee3b98b..7be97229ddf8 100644 --- a/kernel/trace/trace_seq.c +++ b/kernel/trace/trace_seq.c @@ -370,8 +370,12 @@ EXPORT_SYMBOL_GPL(trace_seq_path); */ int trace_seq_to_user(struct trace_seq *s, char __user *ubuf, int cnt) { + int ret; __trace_seq_init(s); - return seq_buf_to_user(&s->seq, ubuf, cnt); + ret = seq_buf_to_user(&s->seq, ubuf, s->readpos, cnt); + if (ret > 0) + s->readpos += ret; + return ret; } EXPORT_SYMBOL_GPL(trace_seq_to_user); -- cgit v1.2.3 From 545db7e21e64766e6b7cb987fbfd3e79419726ce Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sat, 21 Oct 2023 16:42:41 +0200 Subject: tracing/histograms: Simplify last_cmd_set() Turn a kzalloc()+strcpy()+strncat() into an equivalent and less verbose kasprintf(). Link: https://lore.kernel.org/linux-trace-kernel/30b6fb04dadc10a03cc1ad08f5d8a93ef623a167.1697899346.git.christophe.jaillet@wanadoo.fr Cc: Masami Hiramatsu Signed-off-by: Christophe JAILLET Reviewed-by: Mukesh ojha Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_hist.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index d06938ae0717..1abc07fba1b9 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -774,23 +774,16 @@ static void last_cmd_set(struct trace_event_file *file, char *str) { const char *system = NULL, *name = NULL; struct trace_event_call *call; - int len; if (!str) return; - /* sizeof() contains the nul byte */ - len = sizeof(HIST_PREFIX) + strlen(str); kfree(last_cmd); - last_cmd = kzalloc(len, GFP_KERNEL); + + last_cmd = kasprintf(GFP_KERNEL, HIST_PREFIX "%s", str); if (!last_cmd) return; - strcpy(last_cmd, HIST_PREFIX); - /* Again, sizeof() contains the nul byte */ - len -= sizeof(HIST_PREFIX); - strncat(last_cmd, str, len); - if (file) { call = file->event_call; system = call->class->system; -- cgit v1.2.3 From dcc4e5728eeaeda84878ca0018758cff1abfca21 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 27 Oct 2023 08:56:38 -0700 Subject: seq_buf: Introduce DECLARE_SEQ_BUF and seq_buf_str() Solve two ergonomic issues with struct seq_buf; 1) Too much boilerplate is required to initialize: struct seq_buf s; char buf[32]; seq_buf_init(s, buf, sizeof(buf)); Instead, we can build this directly on the stack. Provide DECLARE_SEQ_BUF() macro to do this: DECLARE_SEQ_BUF(s, 32); 2) %NUL termination is fragile and requires 2 steps to get a valid C String (and is a layering violation exposing the "internals" of seq_buf): seq_buf_terminate(s); do_something(s->buffer); Instead, we can just return s->buffer directly after terminating it in the refactored seq_buf_terminate(), now known as seq_buf_str(): do_something(seq_buf_str(s)); Link: https://lore.kernel.org/linux-trace-kernel/20231027155634.make.260-kees@kernel.org Link: https://lore.kernel.org/linux-trace-kernel/20231026194033.it.702-kees@kernel.org/ Cc: Yosry Ahmed Cc: "Matthew Wilcox (Oracle)" Cc: Christoph Hellwig Cc: Justin Stitt Cc: Kent Overstreet Cc: Petr Mladek Cc: Andy Shevchenko Cc: Rasmus Villemoes Cc: Sergey Senozhatsky Cc: Masami Hiramatsu Cc: Greg Kroah-Hartman Cc: Arnd Bergmann Cc: Jonathan Corbet Cc: Yun Zhou Cc: Jacob Keller Cc: Zhen Lei Signed-off-by: Kees Cook Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d629065c2383..2539cfc20a97 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3828,15 +3828,6 @@ static bool trace_safe_str(struct trace_iterator *iter, const char *str, return false; } -static const char *show_buffer(struct trace_seq *s) -{ - struct seq_buf *seq = &s->seq; - - seq_buf_terminate(seq); - - return seq->buffer; -} - static DEFINE_STATIC_KEY_FALSE(trace_no_verify); static int test_can_verify_check(const char *fmt, ...) @@ -3976,7 +3967,7 @@ void trace_check_vprintf(struct trace_iterator *iter, const char *fmt, */ if (WARN_ONCE(!trace_safe_str(iter, str, star, len), "fmt: '%s' current_buffer: '%s'", - fmt, show_buffer(&iter->seq))) { + fmt, seq_buf_str(&iter->seq.seq))) { int ret; /* Try to safely read the string */ -- cgit v1.2.3 From bb32500fb9b78215e4ef6ee8b4345c5f5d7eafb4 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 31 Oct 2023 12:24:53 -0400 Subject: tracing: Have trace_event_file have ref counters The following can crash the kernel: # cd /sys/kernel/tracing # echo 'p:sched schedule' > kprobe_events # exec 5>>events/kprobes/sched/enable # > kprobe_events # exec 5>&- The above commands: 1. Change directory to the tracefs directory 2. Create a kprobe event (doesn't matter what one) 3. Open bash file descriptor 5 on the enable file of the kprobe event 4. Delete the kprobe event (removes the files too) 5. Close the bash file descriptor 5 The above causes a crash! BUG: kernel NULL pointer dereference, address: 0000000000000028 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 0 P4D 0 Oops: 0000 [#1] PREEMPT SMP PTI CPU: 6 PID: 877 Comm: bash Not tainted 6.5.0-rc4-test-00008-g2c6b6b1029d4-dirty #186 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.2-debian-1.16.2-1 04/01/2014 RIP: 0010:tracing_release_file_tr+0xc/0x50 What happens here is that the kprobe event creates a trace_event_file "file" descriptor that represents the file in tracefs to the event. It maintains state of the event (is it enabled for the given instance?). Opening the "enable" file gets a reference to the event "file" descriptor via the open file descriptor. When the kprobe event is deleted, the file is also deleted from the tracefs system which also frees the event "file" descriptor. But as the tracefs file is still opened by user space, it will not be totally removed until the final dput() is called on it. But this is not true with the event "file" descriptor that is already freed. If the user does a write to or simply closes the file descriptor it will reference the event "file" descriptor that was just freed, causing a use-after-free bug. To solve this, add a ref count to the event "file" descriptor as well as a new flag called "FREED". The "file" will not be freed until the last reference is released. But the FREE flag will be set when the event is removed to prevent any more modifications to that event from happening, even if there's still a reference to the event "file" descriptor. Link: https://lore.kernel.org/linux-trace-kernel/20231031000031.1e705592@gandalf.local.home/ Link: https://lore.kernel.org/linux-trace-kernel/20231031122453.7a48b923@gandalf.local.home Cc: stable@vger.kernel.org Cc: Mark Rutland Fixes: f5ca233e2e66d ("tracing: Increase trace array ref count on enable and filter files") Reported-by: Beau Belgrave Tested-by: Beau Belgrave Reviewed-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 15 +++++++++++++++ kernel/trace/trace.h | 3 +++ kernel/trace/trace_events.c | 31 +++++++++++++++++++++++++++---- kernel/trace/trace_events_filter.c | 3 +++ 4 files changed, 48 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 2539cfc20a97..9aebf904ff97 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4978,6 +4978,20 @@ int tracing_open_file_tr(struct inode *inode, struct file *filp) if (ret) return ret; + mutex_lock(&event_mutex); + + /* Fail if the file is marked for removal */ + if (file->flags & EVENT_FILE_FL_FREED) { + trace_array_put(file->tr); + ret = -ENODEV; + } else { + event_file_get(file); + } + + mutex_unlock(&event_mutex); + if (ret) + return ret; + filp->private_data = inode->i_private; return 0; @@ -4988,6 +5002,7 @@ int tracing_release_file_tr(struct inode *inode, struct file *filp) struct trace_event_file *file = inode->i_private; trace_array_put(file->tr); + event_file_put(file); return 0; } diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 0e1405abf4f7..b7f4ea25a194 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1669,6 +1669,9 @@ extern void event_trigger_unregister(struct event_command *cmd_ops, char *glob, struct event_trigger_data *trigger_data); +extern void event_file_get(struct trace_event_file *file); +extern void event_file_put(struct trace_event_file *file); + /** * struct event_trigger_ops - callbacks for trace event triggers * diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index f9e3e24d8796..f29e815ca5b2 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -990,13 +990,35 @@ static void remove_subsystem(struct trace_subsystem_dir *dir) } } +void event_file_get(struct trace_event_file *file) +{ + atomic_inc(&file->ref); +} + +void event_file_put(struct trace_event_file *file) +{ + if (WARN_ON_ONCE(!atomic_read(&file->ref))) { + if (file->flags & EVENT_FILE_FL_FREED) + kmem_cache_free(file_cachep, file); + return; + } + + if (atomic_dec_and_test(&file->ref)) { + /* Count should only go to zero when it is freed */ + if (WARN_ON_ONCE(!(file->flags & EVENT_FILE_FL_FREED))) + return; + kmem_cache_free(file_cachep, file); + } +} + static void remove_event_file_dir(struct trace_event_file *file) { eventfs_remove_dir(file->ei); list_del(&file->list); remove_subsystem(file->system); free_event_filter(file->filter); - kmem_cache_free(file_cachep, file); + file->flags |= EVENT_FILE_FL_FREED; + event_file_put(file); } /* @@ -1369,7 +1391,7 @@ event_enable_read(struct file *filp, char __user *ubuf, size_t cnt, flags = file->flags; mutex_unlock(&event_mutex); - if (!file) + if (!file || flags & EVENT_FILE_FL_FREED) return -ENODEV; if (flags & EVENT_FILE_FL_ENABLED && @@ -1403,7 +1425,7 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, ret = -ENODEV; mutex_lock(&event_mutex); file = event_file_data(filp); - if (likely(file)) { + if (likely(file && !(file->flags & EVENT_FILE_FL_FREED))) { ret = tracing_update_buffers(file->tr); if (ret < 0) { mutex_unlock(&event_mutex); @@ -1683,7 +1705,7 @@ event_filter_read(struct file *filp, char __user *ubuf, size_t cnt, mutex_lock(&event_mutex); file = event_file_data(filp); - if (file) + if (file && !(file->flags & EVENT_FILE_FL_FREED)) print_event_filter(file, s); mutex_unlock(&event_mutex); @@ -2902,6 +2924,7 @@ trace_create_new_event(struct trace_event_call *call, atomic_set(&file->tm_ref, 0); INIT_LIST_HEAD(&file->triggers); list_add(&file->list, &tr->events); + event_file_get(file); return file; } diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 33264e510d16..0c611b281a5b 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -2349,6 +2349,9 @@ int apply_event_filter(struct trace_event_file *file, char *filter_string) struct event_filter *filter = NULL; int err; + if (file->flags & EVENT_FILE_FL_FREED) + return -ENODEV; + if (!strcmp(strstrip(filter_string), "0")) { filter_disable(file); filter = event_filter(file); -- cgit v1.2.3 From 4f7969bcd6d33042d62e249b41b5578161e4c868 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 31 Oct 2023 15:10:33 -0400 Subject: tracing: Have the user copy of synthetic event address use correct context A synthetic event is created by the synthetic event interface that can read both user or kernel address memory. In reality, it reads any arbitrary memory location from within the kernel. If the address space is in USER (where CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE is set) then it uses strncpy_from_user_nofault() to copy strings otherwise it uses strncpy_from_kernel_nofault(). But since both functions use the same variable there's no annotation to what that variable is (ie. __user). This makes sparse complain. Quiet sparse by typecasting the strncpy_from_user_nofault() variable to a __user pointer. Link: https://lore.kernel.org/linux-trace-kernel/20231031151033.73c42e23@gandalf.local.home Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mark Rutland Fixes: 0934ae9977c2 ("tracing: Fix reading strings from synthetic events"); Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202311010013.fm8WTxa5-lkp@intel.com/ Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_synth.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c index 14cb275a0bab..846e02c0fb59 100644 --- a/kernel/trace/trace_events_synth.c +++ b/kernel/trace/trace_events_synth.c @@ -452,7 +452,7 @@ static unsigned int trace_string(struct synth_trace_event *entry, #ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE if ((unsigned long)str_val < TASK_SIZE) - ret = strncpy_from_user_nofault(str_field, str_val, STR_VAR_LEN_MAX); + ret = strncpy_from_user_nofault(str_field, (const void __user *)str_val, STR_VAR_LEN_MAX); else #endif ret = strncpy_from_kernel_nofault(str_field, str_val, STR_VAR_LEN_MAX); -- cgit v1.2.3