diff options
| author | Dmitry Torokhov <dmitry.torokhov@gmail.com> | 2023-11-06 15:42:08 -0800 |
|---|---|---|
| committer | Dmitry Torokhov <dmitry.torokhov@gmail.com> | 2023-11-06 15:42:08 -0800 |
| commit | cdd5b5a9761fd66d17586e4f4ba6588c70e640ea (patch) | |
| tree | aba8409818be01f6af8683bf76594c790942d0bc /kernel/trace | |
| parent | 5c15c60e7be615f05a45cd905093a54b11f461bc (diff) | |
| parent | 28d3fe32354701decc3e76d89712569c269b5e4f (diff) | |
Merge branch 'next' into for-linus
Prepare input updates for 6.7 merge window.
Diffstat (limited to 'kernel/trace')
35 files changed, 2914 insertions, 376 deletions
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 8cf97fa4a4b3..61c541c36596 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -31,6 +31,9 @@ config HAVE_FUNCTION_GRAPH_TRACER help See Documentation/trace/ftrace-design.rst +config HAVE_FUNCTION_GRAPH_RETVAL + bool + config HAVE_DYNAMIC_FTRACE bool help @@ -227,6 +230,18 @@ config FUNCTION_GRAPH_TRACER the return value. This is done by setting the current return address on the current task structure into a stack of calls. +config FUNCTION_GRAPH_RETVAL + bool "Kernel Function Graph Return Value" + depends on HAVE_FUNCTION_GRAPH_RETVAL + depends on FUNCTION_GRAPH_TRACER + default n + help + Support recording and printing the function return value when + using function graph tracer. It can be helpful to locate functions + that return errors. This feature is off by default, and you can + enable it via the trace option funcgraph-retval. + See Documentation/trace/ftrace.rst + config DYNAMIC_FTRACE bool "enable/disable function tracing dynamically" depends on FUNCTION_TRACER @@ -650,6 +665,32 @@ config BLK_DEV_IO_TRACE If unsure, say N. +config FPROBE_EVENTS + depends on FPROBE + depends on HAVE_REGS_AND_STACK_ACCESS_API + bool "Enable fprobe-based dynamic events" + select TRACING + select PROBE_EVENTS + select DYNAMIC_EVENTS + default y + help + This allows user to add tracing events on the function entry and + exit via ftrace interface. The syntax is same as the kprobe events + and the kprobe events on function entry and exit will be + transparently converted to this fprobe events. + +config PROBE_EVENTS_BTF_ARGS + depends on HAVE_FUNCTION_ARG_ACCESS_API + depends on FPROBE_EVENTS || KPROBE_EVENTS + depends on DEBUG_INFO_BTF && BPF_SYSCALL + bool "Support BTF function arguments for probe events" + default y + help + The user can specify the arguments of the probe event using the names + of the arguments of the probed function, when the probe location is a + kernel function entry or a tracepoint. + This is available only if BTF (BPF Type Format) support is enabled. + config KPROBE_EVENTS depends on KPROBES depends on HAVE_REGS_AND_STACK_ACCESS_API diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index c6651e16b557..64b61f67a403 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -104,6 +104,7 @@ obj-$(CONFIG_BOOTTIME_TRACING) += trace_boot.o obj-$(CONFIG_FTRACE_RECORD_RECURSION) += trace_recursion_record.o obj-$(CONFIG_FPROBE) += fprobe.o obj-$(CONFIG_RETHOOK) += rethook.o +obj-$(CONFIG_FPROBE_EVENTS) += trace_fprobe.o obj-$(CONFIG_TRACEPOINT_BENCHMARK) += trace_benchmark.o obj-$(CONFIG_RV) += rv/ diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 1f4b07da327a..bd1a42b23f3f 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -661,8 +661,7 @@ static DEFINE_PER_CPU(int, bpf_trace_nest_level); BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map, u64, flags, void *, data, u64, size) { - struct bpf_trace_sample_data *sds = this_cpu_ptr(&bpf_trace_sds); - int nest_level = this_cpu_inc_return(bpf_trace_nest_level); + struct bpf_trace_sample_data *sds; struct perf_raw_record raw = { .frag = { .size = size, @@ -670,7 +669,11 @@ BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map, }, }; struct perf_sample_data *sd; - int err; + int nest_level, err; + + preempt_disable(); + sds = this_cpu_ptr(&bpf_trace_sds); + nest_level = this_cpu_inc_return(bpf_trace_nest_level); if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(sds->sds))) { err = -EBUSY; @@ -688,9 +691,9 @@ BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map, perf_sample_save_raw_data(sd, &raw); err = __bpf_perf_event_output(regs, map, flags, sd); - out: this_cpu_dec(bpf_trace_nest_level); + preempt_enable(); return err; } @@ -715,7 +718,6 @@ static DEFINE_PER_CPU(struct bpf_trace_sample_data, bpf_misc_sds); u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy) { - int nest_level = this_cpu_inc_return(bpf_event_output_nest_level); struct perf_raw_frag frag = { .copy = ctx_copy, .size = ctx_size, @@ -732,8 +734,12 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, }; struct perf_sample_data *sd; struct pt_regs *regs; + int nest_level; u64 ret; + preempt_disable(); + nest_level = this_cpu_inc_return(bpf_event_output_nest_level); + if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(bpf_misc_sds.sds))) { ret = -EBUSY; goto out; @@ -748,6 +754,7 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, ret = __bpf_perf_event_output(regs, map, flags, sd); out: this_cpu_dec(bpf_event_output_nest_level); + preempt_enable(); return ret; } @@ -1359,9 +1366,9 @@ __bpf_kfunc int bpf_verify_pkcs7_signature(struct bpf_dynptr_kern *data_ptr, } return verify_pkcs7_signature(data_ptr->data, - bpf_dynptr_get_size(data_ptr), + __bpf_dynptr_size(data_ptr), sig_ptr->data, - bpf_dynptr_get_size(sig_ptr), + __bpf_dynptr_size(sig_ptr), trusted_keyring->key, VERIFYING_UNSPECIFIED_SIGNATURE, NULL, NULL); @@ -2652,7 +2659,8 @@ kprobe_multi_link_prog_run(struct bpf_kprobe_multi_link *link, static int kprobe_multi_link_handler(struct fprobe *fp, unsigned long fentry_ip, - struct pt_regs *regs, void *data) + unsigned long ret_ip, struct pt_regs *regs, + void *data) { struct bpf_kprobe_multi_link *link; @@ -2663,7 +2671,8 @@ kprobe_multi_link_handler(struct fprobe *fp, unsigned long fentry_ip, static void kprobe_multi_link_exit_handler(struct fprobe *fp, unsigned long fentry_ip, - struct pt_regs *regs, void *data) + unsigned long ret_ip, struct pt_regs *regs, + void *data) { struct bpf_kprobe_multi_link *link; diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index 218cd95bf8e4..c83c005e654e 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -15,6 +15,7 @@ #include <trace/events/sched.h> #include "ftrace_internal.h" +#include "trace.h" #ifdef CONFIG_DYNAMIC_FTRACE #define ASSIGN_OPS_HASH(opsname, val) \ @@ -236,16 +237,23 @@ static struct notifier_block ftrace_suspend_notifier = { .notifier_call = ftrace_suspend_notifier_call, }; +/* fgraph_ret_regs is not defined without CONFIG_FUNCTION_GRAPH_RETVAL */ +struct fgraph_ret_regs; + /* * Send the trace to the ring-buffer. * @return the original return address. */ -unsigned long ftrace_return_to_handler(unsigned long frame_pointer) +static unsigned long __ftrace_return_to_handler(struct fgraph_ret_regs *ret_regs, + unsigned long frame_pointer) { struct ftrace_graph_ret trace; unsigned long ret; ftrace_pop_return_trace(&trace, &ret, frame_pointer); +#ifdef CONFIG_FUNCTION_GRAPH_RETVAL + trace.retval = fgraph_ret_regs_return_value(ret_regs); +#endif trace.rettime = trace_clock_local(); ftrace_graph_return(&trace); /* @@ -266,6 +274,23 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer) return ret; } +/* + * After all architecures have selected HAVE_FUNCTION_GRAPH_RETVAL, we can + * leave only ftrace_return_to_handler(ret_regs). + */ +#ifdef CONFIG_HAVE_FUNCTION_GRAPH_RETVAL +unsigned long ftrace_return_to_handler(struct fgraph_ret_regs *ret_regs) +{ + return __ftrace_return_to_handler(ret_regs, + fgraph_ret_regs_frame_pointer(ret_regs)); +} +#else +unsigned long ftrace_return_to_handler(unsigned long frame_pointer) +{ + return __ftrace_return_to_handler(NULL, frame_pointer); +} +#endif + /** * ftrace_graph_get_ret_stack - return the entry of the shadow stack * @task: The task to read the shadow stack from diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c index 18d36842faf5..3b21f4063258 100644 --- a/kernel/trace/fprobe.c +++ b/kernel/trace/fprobe.c @@ -46,7 +46,7 @@ static inline void __fprobe_handler(unsigned long ip, unsigned long parent_ip, } if (fp->entry_handler) - ret = fp->entry_handler(fp, ip, ftrace_get_regs(fregs), entry_data); + ret = fp->entry_handler(fp, ip, parent_ip, ftrace_get_regs(fregs), entry_data); /* If entry_handler returns !0, nmissed is not counted. */ if (rh) { @@ -100,19 +100,27 @@ static void fprobe_kprobe_handler(unsigned long ip, unsigned long parent_ip, return; } + /* + * This user handler is shared with other kprobes and is not expected to be + * called recursively. So if any other kprobe handler is running, this will + * exit as kprobe does. See the section 'Share the callbacks with kprobes' + * in Documentation/trace/fprobe.rst for more information. + */ if (unlikely(kprobe_running())) { fp->nmissed++; - return; + goto recursion_unlock; } kprobe_busy_begin(); __fprobe_handler(ip, parent_ip, ops, fregs); kprobe_busy_end(); + +recursion_unlock: ftrace_test_recursion_unlock(bit); } static void fprobe_exit_handler(struct rethook_node *rh, void *data, - struct pt_regs *regs) + unsigned long ret_ip, struct pt_regs *regs) { struct fprobe *fp = (struct fprobe *)data; struct fprobe_rethook_node *fpr; @@ -133,7 +141,7 @@ static void fprobe_exit_handler(struct rethook_node *rh, void *data, return; } - fp->exit_handler(fp, fpr->entry_ip, regs, + fp->exit_handler(fp, fpr->entry_ip, ret_ip, regs, fp->entry_data_size ? (void *)fpr->data : NULL); ftrace_test_recursion_unlock(bit); } @@ -348,6 +356,14 @@ int register_fprobe_syms(struct fprobe *fp, const char **syms, int num) } EXPORT_SYMBOL_GPL(register_fprobe_syms); +bool fprobe_is_registered(struct fprobe *fp) +{ + if (!fp || (fp->ops.saved_func != fprobe_handler && + fp->ops.saved_func != fprobe_kprobe_handler)) + return false; + return true; +} + /** * unregister_fprobe() - Unregister fprobe from ftrace * @fp: A fprobe data structure to be unregistered. @@ -360,23 +376,19 @@ int unregister_fprobe(struct fprobe *fp) { int ret; - if (!fp || (fp->ops.saved_func != fprobe_handler && - fp->ops.saved_func != fprobe_kprobe_handler)) + if (!fprobe_is_registered(fp)) return -EINVAL; - /* - * rethook_free() starts disabling the rethook, but the rethook handlers - * may be running on other processors at this point. To make sure that all - * current running handlers are finished, call unregister_ftrace_function() - * after this. - */ if (fp->rethook) - rethook_free(fp->rethook); + rethook_stop(fp->rethook); ret = unregister_ftrace_function(&fp->ops); if (ret < 0) return ret; + if (fp->rethook) + rethook_free(fp->rethook); + ftrace_free_filter(&fp->ops); return ret; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 764668467155..05c0024815bf 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3305,6 +3305,22 @@ static int ftrace_allocate_records(struct ftrace_page *pg, int count) return cnt; } +static void ftrace_free_pages(struct ftrace_page *pages) +{ + struct ftrace_page *pg = pages; + + while (pg) { + if (pg->records) { + free_pages((unsigned long)pg->records, pg->order); + ftrace_number_of_pages -= 1 << pg->order; + } + pages = pg->next; + kfree(pg); + pg = pages; + ftrace_number_of_groups--; + } +} + static struct ftrace_page * ftrace_allocate_pages(unsigned long num_to_init) { @@ -3343,17 +3359,7 @@ ftrace_allocate_pages(unsigned long num_to_init) return start_pg; free_pages: - pg = start_pg; - while (pg) { - if (pg->records) { - free_pages((unsigned long)pg->records, pg->order); - ftrace_number_of_pages -= 1 << pg->order; - } - start_pg = pg->next; - kfree(pg); - pg = start_pg; - ftrace_number_of_groups--; - } + ftrace_free_pages(start_pg); pr_info("ftrace: FAILED to allocate memory for functions\n"); return NULL; } @@ -3861,6 +3867,9 @@ static int t_show(struct seq_file *m, void *v) if (!rec) return 0; + if (iter->flags & FTRACE_ITER_ADDRS) + seq_printf(m, "%lx ", rec->ip); + if (print_rec(m, rec->ip)) { /* This should only happen when a rec is disabled */ WARN_ON_ONCE(!(rec->flags & FTRACE_FL_DISABLED)); @@ -3996,6 +4005,30 @@ ftrace_touched_open(struct inode *inode, struct file *file) return 0; } +static int +ftrace_avail_addrs_open(struct inode *inode, struct file *file) +{ + struct ftrace_iterator *iter; + int ret; + + ret = security_locked_down(LOCKDOWN_TRACEFS); + if (ret) + return ret; + + if (unlikely(ftrace_disabled)) + return -ENODEV; + + iter = __seq_open_private(file, &show_ftrace_seq_ops, sizeof(*iter)); + if (!iter) + return -ENOMEM; + + iter->pg = ftrace_pages_start; + iter->flags = FTRACE_ITER_ADDRS; + iter->ops = &global_ops; + + return 0; +} + /** * ftrace_regex_open - initialize function tracer filter files * @ops: The ftrace_ops that hold the hash filters @@ -5743,7 +5776,7 @@ bool ftrace_filter_param __initdata; static int __init set_ftrace_notrace(char *str) { ftrace_filter_param = true; - strlcpy(ftrace_notrace_buf, str, FTRACE_FILTER_SIZE); + strscpy(ftrace_notrace_buf, str, FTRACE_FILTER_SIZE); return 1; } __setup("ftrace_notrace=", set_ftrace_notrace); @@ -5751,7 +5784,7 @@ __setup("ftrace_notrace=", set_ftrace_notrace); static int __init set_ftrace_filter(char *str) { ftrace_filter_param = true; - strlcpy(ftrace_filter_buf, str, FTRACE_FILTER_SIZE); + strscpy(ftrace_filter_buf, str, FTRACE_FILTER_SIZE); return 1; } __setup("ftrace_filter=", set_ftrace_filter); @@ -5763,14 +5796,14 @@ static int ftrace_graph_set_hash(struct ftrace_hash *hash, char *buffer); static int __init set_graph_function(char *str) { - strlcpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE); + strscpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE); return 1; } __setup("ftrace_graph_filter=", set_graph_function); static int __init set_graph_notrace_function(char *str) { - strlcpy(ftrace_graph_notrace_buf, str, FTRACE_FILTER_SIZE); + strscpy(ftrace_graph_notrace_buf, str, FTRACE_FILTER_SIZE); return 1; } __setup("ftrace_graph_notrace=", set_graph_notrace_function); @@ -5916,6 +5949,13 @@ static const struct file_operations ftrace_touched_fops = { .release = seq_release_private, }; +static const struct file_operations ftrace_avail_addrs_fops = { + .open = ftrace_avail_addrs_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + static const struct file_operations ftrace_filter_fops = { .open = ftrace_filter_open, .read = seq_read, @@ -6377,6 +6417,9 @@ static __init int ftrace_init_dyn_tracefs(struct dentry *d_tracer) trace_create_file("available_filter_functions", TRACE_MODE_READ, d_tracer, NULL, &ftrace_avail_fops); + trace_create_file("available_filter_functions_addrs", TRACE_MODE_READ, + d_tracer, NULL, &ftrace_avail_addrs_fops); + trace_create_file("enabled_functions", TRACE_MODE_READ, d_tracer, NULL, &ftrace_enabled_fops); @@ -6434,9 +6477,11 @@ static int ftrace_process_locs(struct module *mod, unsigned long *start, unsigned long *end) { + struct ftrace_page *pg_unuse = NULL; struct ftrace_page *start_pg; struct ftrace_page *pg; struct dyn_ftrace *rec; + unsigned long skipped = 0; unsigned long count; unsigned long *p; unsigned long addr; @@ -6499,8 +6544,10 @@ static int ftrace_process_locs(struct module *mod, * object files to satisfy alignments. * Skip any NULL pointers. */ - if (!addr) + if (!addr) { + skipped++; continue; + } end_offset = (pg->index+1) * sizeof(pg->records[0]); if (end_offset > PAGE_SIZE << pg->order) { @@ -6514,8 +6561,10 @@ static int ftrace_process_locs(struct module *mod, rec->ip = addr; } - /* We should have used all pages */ - WARN_ON(pg->next); + if (pg->next) { + pg_unuse = pg->next; + pg->next = NULL; + } /* Assign the last page to ftrace_pages */ ftrace_pages = pg; @@ -6537,6 +6586,11 @@ static int ftrace_process_locs(struct module *mod, out: mutex_unlock(&ftrace_lock); + /* We should have used all pages unless we skipped some */ + if (pg_unuse) { + WARN_ON(!skipped); + ftrace_free_pages(pg_unuse); + } return ret; } @@ -6569,8 +6623,8 @@ static int ftrace_get_trampoline_kallsym(unsigned int symnum, continue; *value = op->trampoline; *type = 't'; - strlcpy(name, FTRACE_TRAMPOLINE_SYM, KSYM_NAME_LEN); - strlcpy(module_name, FTRACE_TRAMPOLINE_MOD, MODULE_NAME_LEN); + strscpy(name, FTRACE_TRAMPOLINE_SYM, KSYM_NAME_LEN); + strscpy(module_name, FTRACE_TRAMPOLINE_MOD, MODULE_NAME_LEN); *exported = 0; return 0; } @@ -6933,7 +6987,7 @@ ftrace_func_address_lookup(struct ftrace_mod_map *mod_map, if (off) *off = addr - found_func->ip; if (sym) - strlcpy(sym, found_func->name, KSYM_NAME_LEN); + strscpy(sym, found_func->name, KSYM_NAME_LEN); return found_func->name; } @@ -6987,8 +7041,8 @@ int ftrace_mod_get_kallsym(unsigned int symnum, unsigned long *value, *value = mod_func->ip; *type = 'T'; - strlcpy(name, mod_func->name, KSYM_NAME_LEN); - strlcpy(module_name, mod_map->mod->name, MODULE_NAME_LEN); + strscpy(name, mod_func->name, KSYM_NAME_LEN); + strscpy(module_name, mod_map->mod->name, MODULE_NAME_LEN); *exported = 1; preempt_enable(); return 0; diff --git a/kernel/trace/ftrace_internal.h b/kernel/trace/ftrace_internal.h index 382775edf690..5012c04f92c0 100644 --- a/kernel/trace/ftrace_internal.h +++ b/kernel/trace/ftrace_internal.h @@ -2,6 +2,9 @@ #ifndef _LINUX_KERNEL_FTRACE_INTERNAL_H #define _LINUX_KERNEL_FTRACE_INTERNAL_H +int __register_ftrace_function(struct ftrace_ops *ops); +int __unregister_ftrace_function(struct ftrace_ops *ops); + #ifdef CONFIG_FUNCTION_TRACER extern struct mutex ftrace_lock; @@ -15,8 +18,6 @@ int ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs); #else /* !CONFIG_DYNAMIC_FTRACE */ -int __register_ftrace_function(struct ftrace_ops *ops); -int __unregister_ftrace_function(struct ftrace_ops *ops); /* Keep as macros so we do not need to define the commands */ # define ftrace_startup(ops, command) \ ({ \ diff --git a/kernel/trace/rethook.c b/kernel/trace/rethook.c index 60f6cb2b486b..5eb9b598f4e9 100644 --- a/kernel/trace/rethook.c +++ b/kernel/trace/rethook.c @@ -54,6 +54,19 @@ static void rethook_free_rcu(struct rcu_head *head) } /** + * rethook_stop() - Stop using a rethook. + * @rh: the struct rethook to stop. + * + * Stop using a rethook to prepare for freeing it. If you want to wait for + * all running rethook handler before calling rethook_free(), you need to + * call this first and wait RCU, and call rethook_free(). + */ +void rethook_stop(struct rethook *rh) +{ + WRITE_ONCE(rh->handler, NULL); +} + +/** * rethook_free() - Free struct rethook. * @rh: the struct rethook to be freed. * @@ -301,7 +314,8 @@ unsigned long rethook_trampoline_handler(struct pt_regs *regs, break; handler = READ_ONCE(rhn->rethook->handler); if (handler) - handler(rhn, rhn->rethook->data, regs); + handler(rhn, rhn->rethook->data, + correct_ret_addr, regs); if (first == node) break; diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 834b361a4a66..52dea5dd5362 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -523,6 +523,8 @@ struct ring_buffer_per_cpu { rb_time_t before_stamp; u64 event_stamp[MAX_NEST]; u64 read_stamp; + /* pages removed since last reset */ + unsigned long pages_removed; /* ring buffer pages to update, > 0 to add, < 0 to remove */ long nr_pages_to_update; struct list_head new_pages; /* new pages to add */ @@ -536,6 +538,7 @@ struct trace_buffer { unsigned flags; int cpus; atomic_t record_disabled; + atomic_t resizing; cpumask_var_t cpumask; struct lock_class_key *reader_lock_key; @@ -558,6 +561,7 @@ struct ring_buffer_iter { struct buffer_page *head_page; struct buffer_page *cache_reader_page; unsigned long cache_read; + unsigned long cache_pages_removed; u64 read_stamp; u64 page_stamp; struct ring_buffer_event *event; @@ -946,6 +950,7 @@ static void rb_wake_up_waiters(struct irq_work *work) /** * ring_buffer_wake_waiters - wake up any waiters on this ring buffer * @buffer: The ring buffer to wake waiters on + * @cpu: The CPU buffer to wake waiters on * * In the case of a file that represents a ring buffer is closing, * it is prudent to wake up any waiters that are on this. @@ -1956,6 +1961,8 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned long nr_pages) to_remove = rb_list_head(to_remove)->next; head_bit |= (unsigned long)to_remove & RB_PAGE_HEAD; } + /* Read iterators need to reset themselves when some pages removed */ + cpu_buffer->pages_removed += nr_removed; next_page = rb_list_head(to_remove)->next; @@ -1977,12 +1984,6 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned long nr_pages) cpu_buffer->head_page = list_entry(next_page, struct buffer_page, list); - /* - * change read pointer to make sure any read iterators reset - * themselves - */ - cpu_buffer->read = 0; - /* pages are removed, resume tracing and then free the pages */ atomic_dec(&cpu_buffer->record_disabled); raw_spin_unlock_irq(&cpu_buffer->reader_lock); @@ -2167,7 +2168,7 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size, /* prevent another thread from changing buffer sizes */ mutex_lock(&buffer->mutex); - + atomic_inc(&buffer->resizing); if (cpu_id == RING_BUFFER_ALL_CPUS) { /* @@ -2322,6 +2323,7 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size, atomic_dec(&buffer->record_disabled); } + atomic_dec(&buffer->resizing); mutex_unlock(&buffer->mutex); return 0; @@ -2342,6 +2344,7 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size, } } out_err_unlock: + atomic_dec(&buffer->resizing); mutex_unlock(&buffer->mutex); return err; } @@ -3373,7 +3376,6 @@ void ring_buffer_nest_end(struct trace_buffer *buffer) /** * ring_buffer_unlock_commit - commit a reserved * @buffer: The buffer to commit to - * @event: The event pointer to commit. * * This commits the data to the ring buffer, and releases any locks held. * @@ -4392,6 +4394,7 @@ static void rb_iter_reset(struct ring_buffer_iter *iter) iter->cache_reader_page = iter->head_page; iter->cache_read = cpu_buffer->read; + iter->cache_pages_removed = cpu_buffer->pages_removed; if (iter->head) { iter->read_stamp = cpu_buffer->read_stamp; @@ -4846,12 +4849,13 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts) buffer = cpu_buffer->buffer; /* - * Check if someone performed a consuming read to - * the buffer. A consuming read invalidates the iterator - * and we need to reset the iterator in this case. + * Check if someone performed a consuming read to the buffer + * or removed some pages from the buffer. In these cases, + * iterator was invalidated and we need to reset it. */ if (unlikely(iter->cache_read != cpu_buffer->read || - iter->cache_reader_page != cpu_buffer->reader_page)) + iter->cache_reader_page != cpu_buffer->reader_page || + iter->cache_pages_removed != cpu_buffer->pages_removed)) rb_iter_reset(iter); again: @@ -5242,28 +5246,34 @@ unsigned long ring_buffer_size(struct trace_buffer *buffer, int cpu) } EXPORT_SYMBOL_GPL(ring_buffer_size); +static void rb_clear_buffer_page(struct buffer_page *page) +{ + local_set(&page->write, 0); + local_set(&page->entries, 0); + rb_init_page(page->page); + page->read = 0; +} + static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) { + struct buffer_page *page; + rb_head_page_deactivate(cpu_buffer); cpu_buffer->head_page = list_entry(cpu_buffer->pages, struct buffer_page, list); - local_set(&cpu_buffer->head_page->write, 0); - local_set(&cpu_buffer->head_page->entries, 0); - local_set(&cpu_buffer->head_page->page->commit, 0); - - cpu_buffer->head_page->read = 0; + rb_clear_buffer_page(cpu_buffer->head_page); + list_for_each_entry(page, cpu_buffer->pages, list) { + rb_clear_buffer_page(page); + } cpu_buffer->tail_page = cpu_buffer->head_page; cpu_buffer->commit_page = cpu_buffer->head_page; INIT_LIST_HEAD(&cpu_buffer->reader_page->list); INIT_LIST_HEAD(&cpu_buffer->new_pages); - local_set(&cpu_buffer->reader_page->write, 0); - local_set(&cpu_buffer->reader_page->entries, 0); - local_set(&cpu_buffer->reader_page->page->commit, 0); - cpu_buffer->reader_page->read = 0; + rb_clear_buffer_page(cpu_buffer->reader_page); local_set(&cpu_buffer->entries_bytes, 0); local_set(&cpu_buffer->overrun, 0); @@ -5289,6 +5299,7 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) cpu_buffer->last_overrun = 0; rb_head_page_activate(cpu_buffer); + cpu_buffer->pages_removed = 0; } /* Must have disabled the cpu buffer then done a synchronize_rcu */ @@ -5347,7 +5358,6 @@ EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu); /** * ring_buffer_reset_online_cpus - reset a ring buffer per CPU buffer * @buffer: The ring buffer to reset a per cpu buffer of - * @cpu: The CPU buffer to be reset */ void ring_buffer_reset_online_cpus(struct trace_buffer *buffer) { @@ -5535,6 +5545,15 @@ int ring_buffer_swap_cpu(struct trace_buffer *buffer_a, if (local_read(&cpu_buffer_b->committing)) goto out_dec; + /* + * When resize is in progress, we cannot swap it because + * it will mess the state of the cpu buffer. + */ + if (atomic_read(&buffer_a->resizing)) + goto out_dec; + if (atomic_read(&buffer_b->resizing)) + goto out_dec; + buffer_a->buffers[cpu] = cpu_buffer_b; buffer_b->buffers[cpu] = cpu_buffer_a; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 64a4dde073ef..8e64aaad5361 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -199,7 +199,7 @@ static int boot_snapshot_index; static int __init set_cmdline_ftrace(char *str) { - strlcpy(bootup_tracer_buf, str, MAX_TRACER_SIZE); + strscpy(bootup_tracer_buf, str, MAX_TRACER_SIZE); default_bootup_tracer = bootup_tracer_buf; /* We are using ftrace early, expand it */ ring_buffer_expanded = true; @@ -284,7 +284,7 @@ static char trace_boot_options_buf[MAX_TRACER_SIZE] __initdata; static int __init set_trace_boot_options(char *str) { - strlcpy(trace_boot_options_buf, str, MAX_TRACER_SIZE); + strscpy(trace_boot_options_buf, str, MAX_TRACER_SIZE); return 1; } __setup("trace_options=", set_trace_boot_options); @@ -294,7 +294,7 @@ static char *trace_boot_clock __initdata; static int __init set_trace_boot_clock(char *str) { - strlcpy(trace_boot_clock_buf, str, MAX_TRACER_SIZE); + strscpy(trace_boot_clock_buf, str, MAX_TRACER_SIZE); trace_boot_clock = trace_boot_clock_buf; return 1; } @@ -1928,9 +1928,10 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) * place on this CPU. We fail to record, but we reset * the max trace buffer (no one writes directly to it) * and flag that it failed. + * Another reason is resize is in progress. */ trace_array_printk_buf(tr->max_buffer.buffer, _THIS_IP_, - "Failed to swap buffers due to commit in progress\n"); + "Failed to swap buffers due to commit or resize in progress\n"); } WARN_ON_ONCE(ret && ret != -EAGAIN && ret != -EBUSY); @@ -2546,7 +2547,7 @@ static void __trace_find_cmdline(int pid, char comm[]) if (map != NO_CMDLINE_MAP) { tpid = savedcmd->map_cmdline_to_pid[map]; if (tpid == pid) { - strlcpy(comm, get_saved_cmdlines(map), TASK_COMM_LEN); + strscpy(comm, get_saved_cmdlines(map), TASK_COMM_LEN); return; } } @@ -3118,6 +3119,7 @@ static void __ftrace_trace_stack(struct trace_buffer *buffer, struct ftrace_stack *fstack; struct stack_entry *entry; int stackidx; + void *ptr; /* * Add one, for this function and the call to save_stack_trace() @@ -3161,9 +3163,25 @@ static void __ftrace_trace_stack(struct trace_buffer *buffer, trace_ctx); if (!event) goto out; - entry = ring_buffer_event_data(event); + ptr = ring_buffer_event_data(event); + entry = ptr; + + /* + * For backward compatibility reasons, the entry->caller is an + * array of 8 slots to store the stack. This is also exported + * to user space. The amount allocated on the ring buffer actually + * holds enough for the stack specified by nr_entries. This will + * go into the location of entry->caller. Due to string fortifiers + * checking the size of the destination of memcpy() it triggers + * when it detects that size is greater than 8. To hide this from + * the fortifiers, we use "ptr" and pointer arithmetic to assign caller. + * + * The below is really just: + * memcpy(&entry->caller, fstack->calls, size); + */ + ptr += offsetof(typeof(*entry), caller); + memcpy(ptr, fstack->calls, size); - memcpy(&entry->caller, fstack->calls, size); entry->size = nr_entries; if (!call_filter_check_discard(call, entry, buffer, event)) @@ -4195,8 +4213,15 @@ static void *s_start(struct seq_file *m, loff_t *pos) * will point to the same string as current_trace->name. */ mutex_lock(&trace_types_lock); - if (unlikely(tr->current_trace && iter->trace->name != tr->current_trace->name)) + if (unlikely(tr->current_trace && iter->trace->name != tr->current_trace->name)) { + /* Close iter->trace before switching to the new current tracer */ + if (iter->trace->close) + iter->trace->close(iter); *iter->trace = *tr->current_trace; + /* Reopen the new current tracer */ + if (iter->trace->open) + iter->trace->open(iter); + } mutex_unlock(&trace_types_lock); #ifdef CONFIG_TRACER_MAX_TRACE @@ -5199,7 +5224,7 @@ static const struct file_operations tracing_fops = { .open = tracing_open, .read = seq_read, .read_iter = seq_read_iter, - .splice_read = generic_file_splice_read, + .splice_read = copy_splice_read, .write = tracing_write_stub, .llseek = tracing_lseek, .release = tracing_release, @@ -5259,11 +5284,17 @@ int tracing_set_cpumask(struct trace_array *tr, !cpumask_test_cpu(cpu, tracing_cpumask_new)) { atomic_inc(&per_cpu_ptr(tr->array_buffer.data, cpu)->disabled); ring_buffer_record_disable_cpu(tr->array_buffer.buffer, cpu); +#ifdef CONFIG_TRACER_MAX_TRACE + ring_buffer_record_disable_cpu(tr->max_buffer.buffer, cpu); +#endif } if (!cpumask_test_cpu(cpu, tr->tracing_cpumask) && cpumask_test_cpu(cpu, tracing_cpumask_new)) { atomic_dec(&per_cpu_ptr(tr->array_buffer.data, cpu)->disabled); ring_buffer_record_enable_cpu(tr->array_buffer.buffer, cpu); +#ifdef CONFIG_TRACER_MAX_TRACE + ring_buffer_record_enable_cpu(tr->max_buffer.buffer, cpu); +#endif } } arch_spin_unlock(&tr->max_lock); @@ -5672,10 +5703,17 @@ static const char readme_msg[] = " uprobe_events\t\t- Create/append/remove/show the userspace dynamic events\n" "\t\t\t Write into this file to define/undefine new trace events.\n" #endif -#if defined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS) +#if defined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS) || \ + defined(CONFIG_FPROBE_EVENTS) "\t accepts: event-definitions (one definition per line)\n" +#if defined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS) "\t Format: p[:[<group>/][<event>]] <place> [<args>]\n" "\t r[maxactive][:[<group>/][<event>]] <place> [<args>]\n" +#endif +#ifdef CONFIG_FPROBE_EVENTS + "\t f[:[<group>/][<event>]] <func-name>[%return] [<args>]\n" + "\t t[:[<group>/][<event>]] <tracepoint> [<args>]\n" +#endif #ifdef CONFIG_HIST_TRIGGERS "\t s:[synthetic/]<event> <field> [<field>]\n" #endif @@ -5691,7 +5729,11 @@ static const char readme_msg[] = "\t args: <name>=fetcharg[:type]\n" "\t fetcharg: (%<register>|$<efield>), @<address>, @<symbol>[+|-<offset>],\n" #ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API +#ifdef CONFIG_PROBE_EVENTS_BTF_ARGS + "\t $stack<index>, $stack, $retval, $comm, $arg<N>, <argname>\n" +#else "\t $stack<index>, $stack, $retval, $comm, $arg<N>,\n" +#endif #else "\t $stack<index>, $stack, $retval, $comm,\n" #endif @@ -6676,10 +6718,36 @@ tracing_max_lat_write(struct file *filp, const char __user *ubuf, #endif +static int open_pipe_on_cpu(struct trace_array *tr, int cpu) +{ + if (cpu == RING_BUFFER_ALL_CPUS) { + if (cpumask_empty(tr->pipe_cpumask)) { + cpumask_setall(tr->pipe_cpumask); + return 0; + } + } else if (!cpumask_test_cpu(cpu, tr->pipe_cpumask)) { + cpumask_set_cpu(cpu, tr->pipe_cpumask); + return 0; + } + return -EBUSY; +} + +static void close_pipe_on_cpu(struct trace_array *tr, int cpu) +{ + if (cpu == RING_BUFFER_ALL_CPUS) { + WARN_ON(!cpumask_full(tr->pipe_cpumask)); + cpumask_clear(tr->pipe_cpumask); + } else { + WARN_ON(!cpumask_test_cpu(cpu, tr->pipe_cpumask)); + cpumask_clear_cpu(cpu, tr->pipe_cpumask); + } +} + static int tracing_open_pipe(struct inode *inode, struct file *filp) { struct trace_array *tr = inode->i_private; struct trace_iterator *iter; + int cpu; int ret; ret = tracing_check_open_get_tr(tr); @@ -6687,13 +6755,16 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) return ret; mutex_lock(&trace_types_lock); + cpu = tracing_get_cpu(inode); + ret = open_pipe_on_cpu(tr, cpu); + if (ret) + goto fail_pipe_on_cpu; /* create a buffer to store the information to pass to userspace */ iter = kzalloc(sizeof(*iter), GFP_KERNEL); if (!iter) { ret = -ENOMEM; - __trace_array_put(tr); - goto out; + goto fail_alloc_iter; } trace_seq_init(&iter->seq); @@ -6716,7 +6787,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) iter->tr = tr; iter->array_buffer = &tr->array_buffer; - iter->cpu_file = tracing_get_cpu(inode); + iter->cpu_file = cpu; mutex_init(&iter->mutex); filp->private_data = iter; @@ -6726,12 +6797,15 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) nonseekable_open(inode, filp); tr->trace_ref++; -out: + mutex_unlock(&trace_types_lock); return ret; fail: kfree(iter); +fail_alloc_iter: + close_pipe_on_cpu(tr, cpu); +fail_pipe_on_cpu: __trace_array_put(tr); mutex_unlock(&trace_types_lock); return ret; @@ -6748,11 +6822,12 @@ static int tracing_release_pipe(struct inode *inode, struct file *file) if (iter->trace->pipe_close) iter->trace->pipe_close(iter); - + close_pipe_on_cpu(tr, iter->cpu_file); mutex_unlock(&trace_types_lock); free_cpumask_var(iter->started); kfree(iter->fmt); + kfree(iter->temp); mutex_destroy(&iter->mutex); kfree(iter); @@ -8135,7 +8210,7 @@ static const struct file_operations tracing_err_log_fops = { .open = tracing_err_log_open, .write = tracing_err_log_write, .read = seq_read, - .llseek = seq_lseek, + .llseek = tracing_lseek, .release = tracing_err_log_release, }; @@ -9411,6 +9486,9 @@ static struct trace_array *trace_array_create(const char *name) if (!alloc_cpumask_var(&tr->tracing_cpumask, GFP_KERNEL)) goto out_free_tr; + if (!alloc_cpumask_var(&tr->pipe_cpumask, GFP_KERNEL)) + goto out_free_tr; + tr->trace_flags = global_trace.trace_flags & ~ZEROED_TRACE_FLAGS; cpumask_copy(tr->tracing_cpumask, cpu_all_mask); @@ -9452,6 +9530,7 @@ static struct trace_array *trace_array_create(const char *name) out_free_tr: ftrace_free_ftrace_ops(tr); free_trace_buffers(tr); + free_cpumask_var(tr->pipe_cpumask); free_cpumask_var(tr->tracing_cpumask); kfree(tr->name); kfree(tr); @@ -9554,6 +9633,7 @@ static int __remove_instance(struct trace_array *tr) } kfree(tr->topts); + free_cpumask_var(tr->pipe_cpumask); free_cpumask_var(tr->tracing_cpumask); kfree(tr->name); kfree(tr); @@ -10351,12 +10431,14 @@ __init static int tracer_alloc_buffers(void) if (trace_create_savedcmd() < 0) goto out_free_temp_buffer; + if (!alloc_cpumask_var(&global_trace.pipe_cpumask, GFP_KERNEL)) + goto out_free_savedcmd; + /* TODO: make the number of buffers hot pluggable with CPUS */ if (allocate_trace_buffers(&global_trace, ring_buf_size) < 0) { MEM_FAIL(1, "tracer: failed to allocate ring buffer!\n"); - goto out_free_savedcmd; + goto out_free_pipe_cpumask; } - if (global_trace.buffer_disabled) tracing_off(); @@ -10409,6 +10491,8 @@ __init static int tracer_alloc_buffers(void) return 0; +out_free_pipe_cpumask: + free_cpumask_var(global_trace.pipe_cpumask); out_free_savedcmd: free_saved_cmdlines_buffer(savedcmd); out_free_temp_buffer: diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 79bdefe9261b..73eaec158473 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -113,6 +113,8 @@ enum trace_type { #define MEM_FAIL(condition, fmt, ...) \ DO_ONCE_LITE_IF(condition, pr_err, "ERROR: " fmt, ##__VA_ARGS__) +#define FAULT_STRING "(fault)" + #define HIST_STACKTRACE_DEPTH 16 #define HIST_STACKTRACE_SIZE (HIST_STACKTRACE_DEPTH * sizeof(unsigned long)) #define HIST_STACKTRACE_SKIP 5 @@ -148,6 +150,17 @@ struct kretprobe_trace_entry_head { unsigned long ret_ip; }; +struct fentry_trace_entry_head { + struct trace_entry ent; + unsigned long ip; +}; + +struct fexit_trace_entry_head { + struct trace_entry ent; + unsigned long func; + unsigned long ret_ip; +}; + #define TRACE_BUF_SIZE 1024 struct trace_array; @@ -364,6 +377,8 @@ struct trace_array { struct list_head events; struct trace_event_file *trace_marker_file; cpumask_var_t tracing_cpumask; /* only trace on set CPUs */ + /* one per_cpu trace_pipe can be opened by only one user */ + cpumask_var_t pipe_cpumask; int ref; int trace_ref; #ifdef CONFIG_FUNCTION_TRACER @@ -832,6 +847,8 @@ static __always_inline bool ftrace_hash_empty(struct ftrace_hash *hash) #define TRACE_GRAPH_PRINT_TAIL 0x100 #define TRACE_GRAPH_SLEEP_TIME 0x200 #define TRACE_GRAPH_GRAPH_TIME 0x400 +#define TRACE_GRAPH_PRINT_RETVAL 0x800 +#define TRACE_GRAPH_PRINT_RETVAL_HEX 0x1000 #define TRACE_GRAPH_PRINT_FILL_SHIFT 28 #define TRACE_GRAPH_PRINT_FILL_MASK (0x3 << TRACE_GRAPH_PRINT_FILL_SHIFT) @@ -1280,6 +1297,14 @@ static inline void trace_branch_disable(void) /* set ring buffers to default size if not already done so */ int tracing_update_buffers(void); +union trace_synth_field { + u8 as_u8; + u16 as_u16; + u32 as_u32; + u64 as_u64; + struct trace_dynamic_info as_dynamic; +}; + struct ftrace_event_field { struct list_head link; const char *name; diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index 778200dd8ede..7ccc7a8e155b 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -31,7 +31,7 @@ trace_boot_set_instance_options(struct trace_array *tr, struct xbc_node *node) /* Common ftrace options */ xbc_node_for_each_array_value(node, "options", anode, p) { - if (strlcpy(buf, p, ARRAY_SIZE(buf)) >= ARRAY_SIZE(buf)) { + if (strscpy(buf, p, ARRAY_SIZE(buf)) < 0) { pr_err("String is too long: %s\n", p); continue; } @@ -87,7 +87,7 @@ trace_boot_enable_events(struct trace_array *tr, struct xbc_node *node) const char *p; xbc_node_for_each_array_value(node, "events", anode, p) { - if (strlcpy(buf, p, ARRAY_SIZE(buf)) >= ARRAY_SIZE(buf)) { + if (strscpy(buf, p, ARRAY_SIZE(buf)) < 0) { pr_err("String is too long: %s\n", p); continue; } @@ -486,7 +486,7 @@ trace_boot_init_one_event(struct trace_array *tr, struct xbc_node *gnode, p = xbc_node_find_value(enode, "filter", NULL); if (p && *p != '\0') { - if (strlcpy(buf, p, ARRAY_SIZE(buf)) >= ARRAY_SIZE(buf)) + if (strscpy(buf, p, ARRAY_SIZE(buf)) < 0) pr_err("filter string is too long: %s\n", p); else if (apply_event_filter(file, buf) < 0) pr_err("Failed to apply filter: %s\n", buf); @@ -494,7 +494,7 @@ trace_boot_init_one_event(struct trace_array *tr, struct xbc_node *gnode, if (IS_ENABLED(CONFIG_HIST_TRIGGERS)) { xbc_node_for_each_array_value(enode, "actions", anode, p) { - if (strlcpy(buf, p, ARRAY_SIZE(buf)) >= ARRAY_SIZE(buf)) + if (strscpy(buf, p, ARRAY_SIZE(buf)) < 0) pr_err("action string is too long: %s\n", p); else if (trigger_process_regex(file, buf) < 0) pr_err("Failed to apply an action: %s\n", p); diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index cd41e863b51c..340b2fa98218 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -86,6 +86,30 @@ FTRACE_ENTRY_PACKED(funcgraph_entry, ftrace_graph_ent_entry, ); /* Function return entry */ +#ifdef CONFIG_FUNCTION_GRAPH_RETVAL + +FTRACE_ENTRY_PACKED(funcgraph_exit, ftrace_graph_ret_entry, + + TRACE_GRAPH_RET, + + F_STRUCT( + __field_struct( struct ftrace_graph_ret, ret ) + __field_packed( unsigned long, ret, func ) + __field_packed( unsigned long, ret, retval ) + __field_packed( int, ret, depth ) + __field_packed( unsigned int, ret, overrun ) + __field_packed( unsigned long long, ret, calltime) + __field_packed( unsigned long long, ret, rettime ) + ), + + F_printk("<-- %ps (%d) (start: %llx end: %llx) over: %d retval: %lx", + (void *)__entry->func, __entry->depth, + __entry->calltime, __entry->rettime, + __entry->depth, __entry->retval) +); + +#else + FTRACE_ENTRY_PACKED(funcgraph_exit, ftrace_graph_ret_entry, TRACE_GRAPH_RET, @@ -105,6 +129,8 @@ FTRACE_ENTRY_PACKED(funcgraph_exit, ftrace_graph_ret_entry, __entry->depth) ); +#endif + /* * Context switch trace entry - which task (and prio) we switched from/to: * diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c index 67e854979d53..a0a704ba27db 100644 --- a/kernel/trace/trace_eprobe.c +++ b/kernel/trace/trace_eprobe.c @@ -227,37 +227,6 @@ error: return ERR_PTR(ret); } -static int trace_eprobe_tp_arg_update(struct trace_eprobe *ep, int i) -{ - struct probe_arg *parg = &ep->tp.args[i]; - struct ftrace_event_field *field; - struct list_head *head; - int ret = -ENOENT; - - head = trace_get_fields(ep->event); - list_for_each_entry(field, head, link) { - if (!strcmp(parg->code->data, field->name)) { - kfree(parg->code->data); - parg->code->data = field; - return 0; - } - } - - /* - * Argument not found on event. But allow for comm and COMM - * to be used to get the current->comm. - */ - if (strcmp(parg->code->data, "COMM") == 0 || - strcmp(parg->code->data, "comm") == 0) { - parg->code->op = FETCH_OP_COMM; - ret = 0; - } - - kfree(parg->code->data); - parg->code->data = NULL; - return ret; -} - static int eprobe_event_define_fields(struct trace_event_call *event_call) { struct eprobe_trace_entry_head field; @@ -675,6 +644,7 @@ static int enable_trace_eprobe(struct trace_event_call *call, struct trace_eprobe *ep; bool enabled; int ret = 0; + int cnt = 0; tp = trace_probe_primary_from_call(call); if (WARN_ON_ONCE(!tp)) @@ -698,12 +668,25 @@ static int enable_trace_eprobe(struct trace_event_call *call, if (ret) break; enabled = true; + cnt++; } if (ret) { /* Failed to enable one of them. Roll back all */ - if (enabled) - disable_eprobe(ep, file->tr); + if (enabled) { + /* + * It's a bug if one failed for something other than memory + * not being available but another eprobe succeeded. + */ + WARN_ON_ONCE(ret != -ENOMEM); + + list_for_each_entry(pos, trace_probe_probe_list(tp), list) { + ep = container_of(pos, struct trace_eprobe, tp); + disable_eprobe(ep, file->tr); + if (!--cnt) + break; + } + } if (file) trace_probe_remove_file(tp, file); else @@ -817,19 +800,16 @@ find_and_get_event(const char *system, const char *event_name) static int trace_eprobe_tp_update_arg(struct trace_eprobe *ep, const char *argv[], int i) { - unsigned int flags = TPARG_FL_KERNEL | TPARG_FL_TPOINT; + struct traceprobe_parse_context ctx = { + .event = ep->event, + .flags = TPARG_FL_KERNEL | TPARG_FL_TEVENT, + }; int ret; - ret = traceprobe_parse_probe_arg(&ep->tp, i, argv[i], flags); + ret = traceprobe_parse_probe_arg(&ep->tp, i, argv[i], &ctx); if (ret) return ret; - if (ep->tp.args[i].code->op == FETCH_OP_TP_ARG) { - ret = trace_eprobe_tp_arg_update(ep, i); - if (ret) - trace_probe_log_err(0, BAD_ATTACH_ARG); - } - /* Handle symbols "@" */ if (!ret) ret = traceprobe_update_arg(&ep->tp.args[i]); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 57e539d47989..578f1f7d49a6 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -611,7 +611,6 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file, { struct trace_event_call *call = file->event_call; struct trace_array *tr = file->tr; - unsigned long file_flags = file->flags; int ret = 0; int disable; @@ -635,6 +634,8 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file, break; disable = file->flags & EVENT_FILE_FL_SOFT_DISABLED; clear_bit(EVENT_FILE_FL_SOFT_MODE_BIT, &file->flags); + /* Disable use of trace_buffered_event */ + trace_buffered_event_disable(); } else disable = !(file->flags & EVENT_FILE_FL_SOFT_MODE); @@ -673,6 +674,8 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file, if (atomic_inc_return(&file->sm_ref) > 1) break; set_bit(EVENT_FILE_FL_SOFT_MODE_BIT, &file->flags); + /* Enable use of trace_buffered_event */ + trace_buffered_event_enable(); } if (!(file->flags & EVENT_FILE_FL_ENABLED)) { @@ -712,15 +715,6 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file, break; } - /* Enable or disable use of trace_buffered_event */ - if ((file_flags & EVENT_FILE_FL_SOFT_DISABLED) != - (file->flags & EVENT_FILE_FL_SOFT_DISABLED)) { - if (file->flags & EVENT_FILE_FL_SOFT_DISABLED) - trace_buffered_event_enable(); - else - trace_buffered_event_disable(); - } - return ret; } @@ -2833,7 +2827,7 @@ static __init int setup_trace_triggers(char *str) char *buf; int i; - strlcpy(bootup_trigger_buf, str, COMMAND_LINE_SIZE); + strscpy(bootup_trigger_buf, str, COMMAND_LINE_SIZE); ring_buffer_expanded = true; disable_tracing_selftest("running event triggers"); @@ -3623,7 +3617,7 @@ static char bootup_event_buf[COMMAND_LINE_SIZE] __initdata; static __init int setup_trace_event(char *str) { - strlcpy(bootup_event_buf, str, COMMAND_LINE_SIZE); + strscpy(bootup_event_buf, str, COMMAND_LINE_SIZE); ring_buffer_expanded = true; disable_tracing_selftest("running event tracing"); diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index b97d3ad832f1..d06938ae0717 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -6663,13 +6663,16 @@ static int event_hist_trigger_parse(struct event_command *cmd_ops, if (get_named_trigger_data(trigger_data)) goto enable; - if (has_hist_vars(hist_data)) - save_hist_vars(hist_data); - ret = create_actions(hist_data); if (ret) goto out_unreg; + if (has_hist_vars(hist_data) || hist_data->n_var_refs) { + ret = save_hist_vars(hist_data); + if (ret) + goto out_unreg; + } + ret = tracing_map_init(hist_data->map); if (ret) goto out_unreg; diff --git a/kernel/trace/trace_events_inject.c b/kernel/trace/trace_events_inject.c index d6b4935a78c0..abe805d471eb 100644 --- a/kernel/trace/trace_events_inject.c +++ b/kernel/trace/trace_events_inject.c @@ -217,7 +217,7 @@ static int parse_entry(char *str, struct trace_event_call *call, void **pentry) char *addr = (char *)(unsigned long) val; if (field->filter_type == FILTER_STATIC_STRING) { - strlcpy(entry + field->offset, addr, field->size); + strscpy(entry + field->offset, addr, field->size); } else if (field->filter_type == FILTER_DYN_STRING || field->filter_type == FILTER_RDYN_STRING) { int str_len = strlen(addr) + 1; @@ -232,7 +232,7 @@ static int parse_entry(char *str, struct trace_event_call *call, void **pentry) } entry = *pentry; - strlcpy(entry + (entry_size - str_len), addr, str_len); + strscpy(entry + (entry_size - str_len), addr, str_len); str_item = (u32 *)(entry + field->offset); if (field->filter_type == FILTER_RDYN_STRING) str_loc -= field->offset + field->size; diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c index d6a70aff2410..9897d0bfcab7 100644 --- a/kernel/trace/trace_events_synth.c +++ b/kernel/trace/trace_events_synth.c @@ -127,7 +127,7 @@ static bool synth_event_match(const char *system, const char *event, struct synth_trace_event { struct trace_entry ent; - u64 fields[]; + union trace_synth_field fields[]; }; static int synth_event_define_fields(struct trace_event_call *call) @@ -321,19 +321,19 @@ static const char *synth_field_fmt(char *type) static void print_synth_event_num_val(struct trace_seq *s, char *print_fmt, char *name, - int size, u64 val, char *space) + int size, union trace_synth_field *val, char *space) { switch (size) { case 1: - trace_seq_printf(s, print_fmt, name, (u8)val, space); + trace_seq_printf(s, print_fmt, name, val->as_u8, space); break; case 2: - trace_seq_printf(s, print_fmt, name, (u16)val, space); + trace_seq_printf(s, print_fmt, name, val->as_u16, space); break; case 4: - trace_seq_printf(s, print_fmt, name, (u32)val, space); + trace_seq_printf(s, print_fmt, name, val->as_u32, space); break; default: @@ -350,7 +350,7 @@ static enum print_line_t print_synth_event(struct trace_iterator *iter, struct trace_seq *s = &iter->seq; struct synth_trace_event *entry; struct synth_event *se; - unsigned int i, n_u64; + unsigned int i, j, n_u64; char print_fmt[32]; const char *fmt; @@ -374,43 +374,28 @@ static enum print_line_t print_synth_event(struct trace_iterator *iter, /* parameter values */ if (se->fields[i]->is_string) { if (se->fields[i]->is_dynamic) { - u32 offset, data_offset; - char *str_field; - - offset = (u32)entry->fields[n_u64]; - data_offset = offset & 0xffff; - - str_field = (char *)entry + data_offset; + union trace_synth_field *data = &entry->fields[n_u64]; trace_seq_printf(s, print_fmt, se->fields[i]->name, STR_VAR_LEN_MAX, - str_field, + (char *)entry + data->as_dynamic.offset, i == se->n_fields - 1 ? "" : " "); n_u64++; } else { trace_seq_printf(s, print_fmt, se->fields[i]->name, STR_VAR_LEN_MAX, - (char *)&entry->fields[n_u64], + (char *)&entry->fields[n_u64].as_u64, i == se->n_fields - 1 ? "" : " "); n_u64 += STR_VAR_LEN_MAX / sizeof(u64); } } else if (se->fields[i]->is_stack) { - u32 offset, data_offset, len; - unsigned long *p, *end; - - offset = (u32)entry->fields[n_u64]; - data_offset = offset & 0xffff; - len = offset >> 16; - - p = (void *)entry + data_offset; - end = (void *)p + len - (sizeof(long) - 1); + union trace_synth_field *data = &entry->fields[n_u64]; + unsigned long *p = (void *)entry + data->as_dynamic.offset; trace_seq_printf(s, "%s=STACK:\n", se->fields[i]->name); - - for (; *p && p < end; p++) - trace_seq_printf(s, "=> %pS\n", (void *)*p); + for (j = 1; j < data->as_dynamic.len / sizeof(long); j++) + trace_seq_printf(s, "=> %pS\n", (void *)p[j]); n_u64++; - } else { struct trace_print_flags __flags[] = { __def_gfpflag_names, {-1, NULL} }; @@ -419,13 +404,13 @@ static enum print_line_t print_synth_event(struct trace_iterator *iter, print_synth_event_num_val(s, print_fmt, se->fields[i]->name, se->fields[i]->size, - entry->fields[n_u64], + &entry->fields[n_u64], space); if (strcmp(se->fields[i]->type, "gfp_t") == 0) { trace_seq_puts(s, " ("); trace_print_flags_seq(s, "|", - entry->fields[n_u64], + entry->fields[n_u64].as_u64, __flags); trace_seq_putc(s, ')'); } @@ -454,21 +439,16 @@ static unsigned int trace_string(struct synth_trace_event *entry, int ret; if (is_dynamic) { - u32 data_offset; + union trace_synth_field *data = &entry->fields[*n_u64]; - data_offset = struct_size(entry, fields, event->n_u64); - data_offset += data_size; - - len = fetch_store_strlen((unsigned long)str_val); - - data_offset |= len << 16; - *(u32 *)&entry->fields[*n_u64] = data_offset; + data->as_dynamic.offset = struct_size(entry, fields, event->n_u64) + data_size; + data->as_dynamic.len = fetch_store_strlen((unsigned long)str_val); ret = fetch_store_string((unsigned long)str_val, &entry->fields[*n_u64], entry); (*n_u64)++; } else { - str_field = (char *)&entry->fields[*n_u64]; + str_field = (char *)&entry->fields[*n_u64].as_u64; #ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE if ((unsigned long)str_val < TASK_SIZE) @@ -492,6 +472,7 @@ static unsigned int trace_stack(struct synth_trace_event *entry, unsigned int data_size, unsigned int *n_u64) { + union trace_synth_field *data = &entry->fields[*n_u64]; unsigned int len; u32 data_offset; void *data_loc; @@ -504,10 +485,6 @@ static unsigned int trace_stack(struct synth_trace_event *entry, break; } - /* Include the zero'd element if it fits */ - if (len < HIST_STACKTRACE_DEPTH) - len++; - len *= sizeof(long); /* Find the dynamic section to copy the stack into. */ @@ -515,8 +492,9 @@ static unsigned int trace_stack(struct synth_trace_event *entry, memcpy(data_loc, stack, len); /* Fill in the field that holds the offset/len combo */ - data_offset |= len << 16; - *(u32 *)&entry->fields[*n_u64] = data_offset; + + data->as_dynamic.offset = data_offset; + data->as_dynamic.len = len; (*n_u64)++; @@ -550,7 +528,8 @@ static notrace void trace_event_raw_event_synth(void *__data, str_val = (char *)(long)var_ref_vals[val_idx]; if (event->dynamic_fields[i]->is_stack) { - len = *((unsigned long *)str_val); + /* reserve one extra element for size */ + len = *((unsigned long *)str_val) + 1; len *= sizeof(unsigned long); } else { len = fetch_store_strlen((unsigned long)str_val); @@ -592,19 +571,19 @@ static notrace void trace_event_raw_event_synth(void *__data, switch (field->size) { case 1: - *(u8 *)&entry->fields[n_u64] = (u8)val; + entry->fields[n_u64].as_u8 = (u8)val; break; case 2: - *(u16 *)&entry->fields[n_u64] = (u16)val; + entry->fields[n_u64].as_u16 = (u16)val; break; case 4: - *(u32 *)&entry->fields[n_u64] = (u32)val; + entry->fields[n_u64].as_u32 = (u32)val; break; default: - entry->fields[n_u64] = val; + entry->fields[n_u64].as_u64 = val; break; } n_u64++; @@ -1230,6 +1209,7 @@ EXPORT_SYMBOL_GPL(__synth_event_gen_cmd_start); * synth_event_gen_cmd_array_start - Start synthetic event command from an array * @cmd: A pointer to the dynevent_cmd struct representing the new event * @name: The name of the synthetic event + * @mod: The module creating the event, NULL if not created from a module * @fields: An array of type/name field descriptions * @n_fields: The number of field descriptions contained in the fields array * @@ -1790,19 +1770,19 @@ int synth_event_trace(struct trace_event_file *file, unsigned int n_vals, ...) switch (field->size) { case 1: - *(u8 *)&state.entry->fields[n_u64] = (u8)val; + state.entry->fields[n_u64].as_u8 = (u8)val; break; case 2: - *(u16 *)&state.entry->fields[n_u64] = (u16)val; + state.entry->fields[n_u64].as_u16 = (u16)val; break; case 4: - *(u32 *)&state.entry->fields[n_u64] = (u32)val; + state.entry->fields[n_u64].as_u32 = (u32)val; break; default: - state.entry->fields[n_u64] = val; + state.entry->fields[n_u64].as_u64 = val; break; } n_u64++; @@ -1883,19 +1863,19 @@ int synth_event_trace_array(struct trace_event_file *file, u64 *vals, switch (field->size) { case 1: - *(u8 *)&state.entry->fields[n_u64] = (u8)val; + state.entry->fields[n_u64].as_u8 = (u8)val; break; case 2: - *(u16 *)&state.entry->fields[n_u64] = (u16)val; + state.entry->fields[n_u64].as_u16 = (u16)val; break; case 4: - *(u32 *)&state.entry->fields[n_u64] = (u32)val; + state.entry->fields[n_u64].as_u32 = (u32)val; break; default: - state.entry->fields[n_u64] = val; + state.entry->fields[n_u64].as_u64 = val; break; } n_u64++; @@ -2030,19 +2010,19 @@ static int __synth_event_add_val(const char *field_name, u64 val, } else { switch (field->size) { case 1: - *(u8 *)&trace_state->entry->fields[field->offset] = (u8)val; + trace_state->entry->fields[field->offset].as_u8 = (u8)val; break; case 2: - *(u16 *)&trace_state->entry->fields[field->offset] = (u16)val; + trace_state->entry->fields[field->offset].as_u16 = (u16)val; break; case 4: - *(u32 *)&trace_state->entry->fields[field->offset] = (u32)val; + trace_state->entry->fields[field->offset].as_u32 = (u32)val; break; default: - trace_state->entry->fields[field->offset] = val; + trace_state->entry->fields[field->offset].as_u64 = val; break; } } diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index e535959939d3..46439e3bcec4 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -31,7 +31,9 @@ void trigger_data_free(struct event_trigger_data *data) /** * event_triggers_call - Call triggers associated with a trace event * @file: The trace_event_file associated with the event + * @buffer: The ring buffer that the event is being written to * @rec: The trace entry for the event, NULL for unconditional invocation + * @event: The event meta data in the ring buffer * * For each trigger associated with an event, invoke the trigger * function registered with the associated trigger command. If rec is diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index 8df0550415e7..33cb6af31f39 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -498,7 +498,7 @@ static int user_event_enabler_write(struct user_event_mm *mm, return -EBUSY; ret = pin_user_pages_remote(mm->mm, uaddr, 1, FOLL_WRITE | FOLL_NOFAULT, - &page, NULL, NULL); + &page, NULL); if (unlikely(ret <= 0)) { if (!fixup_fault) @@ -1317,6 +1317,9 @@ static int user_field_set_string(struct ftrace_event_field *field, pos += snprintf(buf + pos, LEN_OR_ZERO, " "); pos += snprintf(buf + pos, LEN_OR_ZERO, "%s", field->name); + if (str_has_prefix(field->type, "struct ")) + pos += snprintf(buf + pos, LEN_OR_ZERO, " %d", field->size); + if (colon) pos += snprintf(buf + pos, LEN_OR_ZERO, ";"); @@ -2096,7 +2099,8 @@ static ssize_t user_events_write_core(struct file *file, struct iov_iter *i) if (unlikely(faulted)) return -EFAULT; - } + } else + return -EBADF; return ret; } diff --git a/kernel/trace/trace_fprobe.c b/kernel/trace/trace_fprobe.c new file mode 100644 index 000000000000..dfe2e546acdc --- /dev/null +++ b/kernel/trace/trace_fprobe.c @@ -0,0 +1,1199 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Fprobe-based tracing events + * Copyright (C) 2022 Google LLC. + */ +#define pr_fmt(fmt) "trace_fprobe: " fmt + +#include <linux/fprobe.h> +#include <linux/module.h> +#include <linux/rculist.h> +#include <linux/security.h> +#include <linux/tracepoint.h> +#include <linux/uaccess.h> + +#include "trace_dynevent.h" +#include "trace_probe.h" +#include "trace_probe_kernel.h" +#include "trace_probe_tmpl.h" + +#define FPROBE_EVENT_SYSTEM "fprobes" +#define TRACEPOINT_EVENT_SYSTEM "tracepoints" +#define RETHOOK_MAXACTIVE_MAX 4096 + +static int trace_fprobe_create(const char *raw_command); +static int trace_fprobe_show(struct seq_file *m, struct dyn_event *ev); +static int trace_fprobe_release(struct dyn_event *ev); +static bool trace_fprobe_is_busy(struct dyn_event *ev); +static bool trace_fprobe_match(const char *system, const char *event, + int argc, const char **argv, struct dyn_event *ev); + +static struct dyn_event_operations trace_fprobe_ops = { + .create = trace_fprobe_create, + .show = trace_fprobe_show, + .is_busy = trace_fprobe_is_busy, + .free = trace_fprobe_release, + .match = trace_fprobe_match, +}; + +/* + * Fprobe event core functions + */ +struct trace_fprobe { + struct dyn_event devent; + struct fprobe fp; + const char *symbol; + struct tracepoint *tpoint; + struct module *mod; + struct trace_probe tp; +}; + +static bool is_trace_fprobe(struct dyn_event *ev) +{ + return ev->ops == &trace_fprobe_ops; +} + +static struct trace_fprobe *to_trace_fprobe(struct dyn_event *ev) +{ + return container_of(ev, struct trace_fprobe, devent); +} + +/** + * for_each_trace_fprobe - iterate over the trace_fprobe list + * @pos: the struct trace_fprobe * for each entry + * @dpos: the struct dyn_event * to use as a loop cursor + */ +#define for_each_trace_fprobe(pos, dpos) \ + for_each_dyn_event(dpos) \ + if (is_trace_fprobe(dpos) && (pos = to_trace_fprobe(dpos))) + +static bool trace_fprobe_is_return(struct trace_fprobe *tf) +{ + return tf->fp.exit_handler != NULL; +} + +static bool trace_fprobe_is_tracepoint(struct trace_fprobe *tf) +{ + return tf->tpoint != NULL; +} + +static const char *trace_fprobe_symbol(struct trace_fprobe *tf) +{ + return tf->symbol ? tf->symbol : "unknown"; +} + +static bool trace_fprobe_is_busy(struct dyn_event *ev) +{ + struct trace_fprobe *tf = to_trace_fprobe(ev); + + return trace_probe_is_enabled(&tf->tp); +} + +static bool trace_fprobe_match_command_head(struct trace_fprobe *tf, + int argc, const char **argv) +{ + char buf[MAX_ARGSTR_LEN + 1]; + + if (!argc) + return true; + + snprintf(buf, sizeof(buf), "%s", trace_fprobe_symbol(tf)); + if (strcmp(buf, argv[0])) + return false; + argc--; argv++; + + return trace_probe_match_command_args(&tf->tp, argc, argv); +} + +static bool trace_fprobe_match(const char *system, const char *event, + int argc, const char **argv, struct dyn_event *ev) +{ + struct trace_fprobe *tf = to_trace_fprobe(ev); + + if (event[0] != '\0' && strcmp(trace_probe_name(&tf->tp), event)) + return false; + + if (system && strcmp(trace_probe_group_name(&tf->tp), system)) + return false; + + return trace_fprobe_match_command_head(tf, argc, argv); +} + +static bool trace_fprobe_is_registered(struct trace_fprobe *tf) +{ + return fprobe_is_registered(&tf->fp); +} + +/* + * Note that we don't verify the fetch_insn code, since it does not come + * from user space. + */ +static int +process_fetch_insn(struct fetch_insn *code, void *rec, void *dest, + void *base) +{ + struct pt_regs *regs = rec; + unsigned long val; + int ret; + +retry: + /* 1st stage: get value from context */ + switch (code->op) { + case FETCH_OP_STACK: + val = regs_get_kernel_stack_nth(regs, code->param); + break; + case FETCH_OP_STACKP: + val = kernel_stack_pointer(regs); + break; + case FETCH_OP_RETVAL: + val = regs_return_value(regs); + break; +#ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API + case FETCH_OP_ARG: + val = regs_get_kernel_argument(regs, code->param); + break; +#endif + case FETCH_NOP_SYMBOL: /* Ignore a place holder */ + code++; + goto retry; + default: + ret = process_common_fetch_insn(code, &val); + if (ret < 0) + return ret; + } + code++; + + return process_fetch_insn_bottom(code, val, dest, base); +} +NOKPROBE_SYMBOL(process_fetch_insn) + +/* function entry handler */ +static nokprobe_inline void +__fentry_trace_func(struct trace_fprobe *tf, unsigned long entry_ip, + struct pt_regs *regs, + struct trace_event_file *trace_file) +{ + struct fentry_trace_entry_head *entry; + struct trace_event_call *call = trace_probe_event_call(&tf->tp); + struct trace_event_buffer fbuffer; + int dsize; + + if (WARN_ON_ONCE(call != trace_file->event_call)) + return; + + if (trace_trigger_soft_disabled(trace_file)) + return; + + dsize = __get_data_size(&tf->tp, regs); + + entry = trace_event_buffer_reserve(&fbuffer, trace_file, + sizeof(*entry) + tf->tp.size + dsize); + if (!entry) + return; + + fbuffer.regs = regs; + entry = fbuffer.entry = ring_buffer_event_data(fbuffer.event); + entry->ip = entry_ip; + store_trace_args(&entry[1], &tf->tp, regs, sizeof(*entry), dsize); + + trace_event_buffer_commit(&fbuffer); +} + +static void +fentry_trace_func(struct trace_fprobe *tf, unsigned long entry_ip, + struct pt_regs *regs) +{ + struct event_file_link *link; + + trace_probe_for_each_link_rcu(link, &tf->tp) + __fentry_trace_func(tf, entry_ip, regs, link->file); +} +NOKPROBE_SYMBOL(fentry_trace_func); + +/* Kretprobe handler */ +static nokprobe_inline void +__fexit_trace_func(struct trace_fprobe *tf, unsigned long entry_ip, + unsigned long ret_ip, struct pt_regs *regs, + struct trace_event_file *trace_file) +{ + struct fexit_trace_entry_head *entry; + struct trace_event_buffer fbuffer; + struct trace_event_call *call = trace_probe_event_call(&tf->tp); + int dsize; + + if (WARN_ON_ONCE(call != trace_file->event_call)) + return; + + if (trace_trigger_soft_disabled(trace_file)) + return; + + dsize = __get_data_size(&tf->tp, regs); + + entry = trace_event_buffer_reserve(&fbuffer, trace_file, + sizeof(*entry) + tf->tp.size + dsize); + if (!entry) + return; + + fbuffer.regs = regs; + entry = fbuffer.entry = ring_buffer_event_data(fbuffer.event); + entry->func = entry_ip; + entry->ret_ip = ret_ip; + store_trace_args(&entry[1], &tf->tp, regs, sizeof(*entry), dsize); + + trace_event_buffer_commit(&fbuffer); +} + +static void +fexit_trace_func(struct trace_fprobe *tf, unsigned long entry_ip, + unsigned long ret_ip, struct pt_regs *regs) +{ + struct event_file_link *link; + + trace_probe_for_each_link_rcu(link, &tf->tp) + __fexit_trace_func(tf, entry_ip, ret_ip, regs, link->file); +} +NOKPROBE_SYMBOL(fexit_trace_func); + +#ifdef CONFIG_PERF_EVENTS + +static int fentry_perf_func(struct trace_fprobe *tf, unsigned long entry_ip, + struct pt_regs *regs) +{ + struct trace_event_call *call = trace_probe_event_call(&tf->tp); + struct fentry_trace_entry_head *entry; + struct hlist_head *head; + int size, __size, dsize; + int rctx; + + head = this_cpu_ptr(call->perf_events); + if (hlist_empty(head)) + return 0; + + dsize = __get_data_size(&tf->tp, regs); + __size = sizeof(*entry) + tf->tp.size + dsize; + size = ALIGN(__size + sizeof(u32), sizeof(u64)); + size -= sizeof(u32); + + entry = perf_trace_buf_alloc(size, NULL, &rctx); + if (!entry) + return 0; + + entry->ip = entry_ip; + memset(&entry[1], 0, dsize); + store_trace_args(&entry[1], &tf->tp, regs, sizeof(*entry), dsize); + perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs, + head, NULL); + return 0; +} +NOKPROBE_SYMBOL(fentry_perf_func); + +static void +fexit_perf_func(struct trace_fprobe *tf, unsigned long entry_ip, + unsigned long ret_ip, struct pt_regs *regs) +{ + struct trace_event_call *call = trace_probe_event_call(&tf->tp); + struct fexit_trace_entry_head *entry; + struct hlist_head *head; + int size, __size, dsize; + int rctx; + + head = this_cpu_ptr(call->perf_events); + if (hlist_empty(head)) + return; + + dsize = __get_data_size(&tf->tp, regs); + __size = sizeof(*entry) + tf->tp.size + dsize; + size = ALIGN(__size + sizeof(u32), sizeof(u64)); + size -= sizeof(u32); + + entry = perf_trace_buf_alloc(size, NULL, &rctx); + if (!entry) + return; + + entry->func = entry_ip; + entry->ret_ip = ret_ip; + store_trace_args(&entry[1], &tf->tp, regs, sizeof(*entry), dsize); + perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs, + head, NULL); +} +NOKPROBE_SYMBOL(fexit_perf_func); +#endif /* CONFIG_PERF_EVENTS */ + +static int fentry_dispatcher(struct fprobe *fp, unsigned long entry_ip, + unsigned long ret_ip, struct pt_regs *regs, + void *entry_data) +{ + struct trace_fprobe *tf = container_of(fp, struct trace_fprobe, fp); + int ret = 0; + + if (trace_probe_test_flag(&tf->tp, TP_FLAG_TRACE)) + fentry_trace_func(tf, entry_ip, regs); +#ifdef CONFIG_PERF_EVENTS + if (trace_probe_test_flag(&tf->tp, TP_FLAG_PROFILE)) + ret = fentry_perf_func(tf, entry_ip, regs); +#endif + return ret; +} +NOKPROBE_SYMBOL(fentry_dispatcher); + +static void fexit_dispatcher(struct fprobe *fp, unsigned long entry_ip, + unsigned long ret_ip, struct pt_regs *regs, + void *entry_data) +{ + struct trace_fprobe *tf = container_of(fp, struct trace_fprobe, fp); + + if (trace_probe_test_flag(&tf->tp, TP_FLAG_TRACE)) + fexit_trace_func(tf, entry_ip, ret_ip, regs); +#ifdef CONFIG_PERF_EVENTS + if (trace_probe_test_flag(&tf->tp, TP_FLAG_PROFILE)) + fexit_perf_func(tf, entry_ip, ret_ip, regs); +#endif +} +NOKPROBE_SYMBOL(fexit_dispatcher); + +static void free_trace_fprobe(struct trace_fprobe *tf) +{ + if (tf) { + trace_probe_cleanup(&tf->tp); + kfree(tf->symbol); + kfree(tf); + } +} + +/* + * Allocate new trace_probe and initialize it (including fprobe). + */ +static struct trace_fprobe *alloc_trace_fprobe(const char *group, + const char *event, + const char *symbol, + struct tracepoint *tpoint, + int maxactive, + int nargs, bool is_return) +{ + struct trace_fprobe *tf; + int ret = -ENOMEM; + + tf = kzalloc(struct_size(tf, tp.args, nargs), GFP_KERNEL); + if (!tf) + return ERR_PTR(ret); + + tf->symbol = kstrdup(symbol, GFP_KERNEL); + if (!tf->symbol) + goto error; + + if (is_return) + tf->fp.exit_handler = fexit_dispatcher; + else + tf->fp.entry_handler = fentry_dispatcher; + + tf->tpoint = tpoint; + tf->fp.nr_maxactive = maxactive; + + ret = trace_probe_init(&tf->tp, event, group, false); + if (ret < 0) + goto error; + + dyn_event_init(&tf->devent, &trace_fprobe_ops); + return tf; +error: + free_trace_fprobe(tf); + return ERR_PTR(ret); +} + +static struct trace_fprobe *find_trace_fprobe(const char *event, + const char *group) +{ + struct dyn_event *pos; + struct trace_fprobe *tf; + + for_each_trace_fprobe(tf, pos) + if (strcmp(trace_probe_name(&tf->tp), event) == 0 && + strcmp(trace_probe_group_name(&tf->tp), group) == 0) + return tf; + return NULL; +} + +static inline int __enable_trace_fprobe(struct trace_fprobe *tf) +{ + if (trace_fprobe_is_registered(tf)) + enable_fprobe(&tf->fp); + + return 0; +} + +static void __disable_trace_fprobe(struct trace_probe *tp) +{ + struct trace_fprobe *tf; + + list_for_each_entry(tf, trace_probe_probe_list(tp), tp.list) { + if (!trace_fprobe_is_registered(tf)) + continue; + disable_fprobe(&tf->fp); + } +} + +/* + * Enable trace_probe + * if the file is NULL, enable "perf" handler, or enable "trace" handler. + */ +static int enable_trace_fprobe(struct trace_event_call *call, + struct trace_event_file *file) +{ + struct trace_probe *tp; + struct trace_fprobe *tf; + bool enabled; + int ret = 0; + + tp = trace_probe_primary_from_call(call); + if (WARN_ON_ONCE(!tp)) + return -ENODEV; + enabled = trace_probe_is_enabled(tp); + + /* This also changes "enabled" state */ + if (file) { + ret = trace_probe_add_file(tp, file); + if (ret) + return ret; + } else + trace_probe_set_flag(tp, TP_FLAG_PROFILE); + + if (!enabled) { + list_for_each_entry(tf, trace_probe_probe_list(tp), tp.list) { + /* TODO: check the fprobe is gone */ + __enable_trace_fprobe(tf); + } + } + + return 0; +} + +/* + * Disable trace_probe + * if the file is NULL, disable "perf" handler, or disable "trace" handler. + */ +static int disable_trace_fprobe(struct trace_event_call *call, + struct trace_event_file *file) +{ + struct trace_probe *tp; + + tp = trace_probe_primary_from_call(call); + if (WARN_ON_ONCE(!tp)) + return -ENODEV; + + if (file) { + if (!trace_probe_get_file_link(tp, file)) + return -ENOENT; + if (!trace_probe_has_single_file(tp)) + goto out; + trace_probe_clear_flag(tp, TP_FLAG_TRACE); + } else + trace_probe_clear_flag(tp, TP_FLAG_PROFILE); + + if (!trace_probe_is_enabled(tp)) + __disable_trace_fprobe(tp); + + out: + if (file) + /* + * Synchronization is done in below function. For perf event, + * file == NULL and perf_trace_event_unreg() calls + * tracepoint_synchronize_unregister() to ensure synchronize + * event. We don't need to care about it. + */ + trace_probe_remove_file(tp, file); + + return 0; +} + +/* Event entry printers */ +static enum print_line_t +print_fentry_event(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + struct fentry_trace_entry_head *field; + struct trace_seq *s = &iter->seq; + struct trace_probe *tp; + + field = (struct fentry_trace_entry_head *)iter->ent; + tp = trace_probe_primary_from_call( + container_of(event, struct trace_event_call, event)); + if (WARN_ON_ONCE(!tp)) + goto out; + + trace_seq_printf(s, "%s: (", trace_probe_name(tp)); + + if (!seq_print_ip_sym(s, field->ip, flags | TRACE_ITER_SYM_OFFSET)) + goto out; + + trace_seq_putc(s, ')'); + + if (trace_probe_print_args(s, tp->args, tp->nr_args, + (u8 *)&field[1], field) < 0) + goto out; + + trace_seq_putc(s, '\n'); + out: + return trace_handle_return(s); +} + +static enum print_line_t +print_fexit_event(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + struct fexit_trace_entry_head *field; + struct trace_seq *s = &iter->seq; + struct trace_probe *tp; + + field = (struct fexit_trace_entry_head *)iter->ent; + tp = trace_probe_primary_from_call( + container_of(event, struct trace_event_call, event)); + if (WARN_ON_ONCE(!tp)) + goto out; + + trace_seq_printf(s, "%s: (", trace_probe_name(tp)); + + if (!seq_print_ip_sym(s, field->ret_ip, flags | TRACE_ITER_SYM_OFFSET)) + goto out; + + trace_seq_puts(s, " <- "); + + if (!seq_print_ip_sym(s, field->func, flags & ~TRACE_ITER_SYM_OFFSET)) + goto out; + + trace_seq_putc(s, ')'); + + if (trace_probe_print_args(s, tp->args, tp->nr_args, + (u8 *)&field[1], field) < 0) + goto out; + + trace_seq_putc(s, '\n'); + + out: + return trace_handle_return(s); +} + +static int fentry_event_define_fields(struct trace_event_call *event_call) +{ + int ret; + struct fentry_trace_entry_head field; + struct trace_probe *tp; + + tp = trace_probe_primary_from_call(event_call); + if (WARN_ON_ONCE(!tp)) + return -ENOENT; + + DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0); + + return traceprobe_define_arg_fields(event_call, sizeof(field), tp); +} + +static int fexit_event_define_fields(struct trace_event_call *event_call) +{ + int ret; + struct fexit_trace_entry_head field; + struct trace_probe *tp; + + tp = trace_probe_primary_from_call(event_call); + if (WARN_ON_ONCE(!tp)) + return -ENOENT; + + DEFINE_FIELD(unsigned long, func, FIELD_STRING_FUNC, 0); + DEFINE_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP, 0); + + return traceprobe_define_arg_fields(event_call, sizeof(field), tp); +} + +static struct trace_event_functions fentry_funcs = { + .trace = print_fentry_event +}; + +static struct trace_event_functions fexit_funcs = { + .trace = print_fexit_event +}; + +static struct trace_event_fields fentry_fields_array[] = { + { .type = TRACE_FUNCTION_TYPE, + .define_fields = fentry_event_define_fields }, + {} +}; + +static struct trace_event_fields fexit_fields_array[] = { + { .type = TRACE_FUNCTION_TYPE, + .define_fields = fexit_event_define_fields }, + {} +}; + +static int fprobe_register(struct trace_event_call *event, + enum trace_reg type, void *data); + +static inline void init_trace_event_call(struct trace_fprobe *tf) +{ + struct trace_event_call *call = trace_probe_event_call(&tf->tp); + + if (trace_fprobe_is_return(tf)) { + call->event.funcs = &fexit_funcs; + call->class->fields_array = fexit_fields_array; + } else { + call->event.funcs = &fentry_funcs; + call->class->fields_array = fentry_fields_array; + } + + call->flags = TRACE_EVENT_FL_FPROBE; + call->class->reg = fprobe_register; +} + +static int register_fprobe_event(struct trace_fprobe *tf) +{ + init_trace_event_call(tf); + + return trace_probe_register_event_call(&tf->tp); +} + +static int unregister_fprobe_event(struct trace_fprobe *tf) +{ + return trace_probe_unregister_event_call(&tf->tp); +} + +/* Internal register function - just handle fprobe and flags */ +static int __register_trace_fprobe(struct trace_fprobe *tf) +{ + int i, ret; + + /* Should we need new LOCKDOWN flag for fprobe? */ + ret = security_locked_down(LOCKDOWN_KPROBES); + if (ret) + return ret; + + if (trace_fprobe_is_registered(tf)) + return -EINVAL; + + for (i = 0; i < tf->tp.nr_args; i++) { + ret = traceprobe_update_arg(&tf->tp.args[i]); + if (ret) + return ret; + } + + /* Set/clear disabled flag according to tp->flag */ + if (trace_probe_is_enabled(&tf->tp)) + tf->fp.flags &= ~FPROBE_FL_DISABLED; + else + tf->fp.flags |= FPROBE_FL_DISABLED; + + if (trace_fprobe_is_tracepoint(tf)) { + struct tracepoint *tpoint = tf->tpoint; + unsigned long ip = (unsigned long)tpoint->probestub; + /* + * Here, we do 2 steps to enable fprobe on a tracepoint. + * At first, put __probestub_##TP function on the tracepoint + * and put a fprobe on the stub function. + */ + ret = tracepoint_probe_register_prio_may_exist(tpoint, + tpoint->probestub, NULL, 0); + if (ret < 0) + return ret; + return register_fprobe_ips(&tf->fp, &ip, 1); + } + + /* TODO: handle filter, nofilter or symbol list */ + return register_fprobe(&tf->fp, tf->symbol, NULL); +} + +/* Internal unregister function - just handle fprobe and flags */ +static void __unregister_trace_fprobe(struct trace_fprobe *tf) +{ + if (trace_fprobe_is_registered(tf)) { + unregister_fprobe(&tf->fp); + memset(&tf->fp, 0, sizeof(tf->fp)); + if (trace_fprobe_is_tracepoint(tf)) { + tracepoint_probe_unregister(tf->tpoint, + tf->tpoint->probestub, NULL); + tf->tpoint = NULL; + tf->mod = NULL; + } + } +} + +/* TODO: make this trace_*probe common function */ +/* Unregister a trace_probe and probe_event */ +static int unregister_trace_fprobe(struct trace_fprobe *tf) +{ + /* If other probes are on the event, just unregister fprobe */ + if (trace_probe_has_sibling(&tf->tp)) + goto unreg; + + /* Enabled event can not be unregistered */ + if (trace_probe_is_enabled(&tf->tp)) + return -EBUSY; + + /* If there's a reference to the dynamic event */ + if (trace_event_dyn_busy(trace_probe_event_call(&tf->tp))) + return -EBUSY; + + /* Will fail if probe is being used by ftrace or perf */ + if (unregister_fprobe_event(tf)) + return -EBUSY; + +unreg: + __unregister_trace_fprobe(tf); + dyn_event_remove(&tf->devent); + trace_probe_unlink(&tf->tp); + + return 0; +} + +static bool trace_fprobe_has_same_fprobe(struct trace_fprobe *orig, + struct trace_fprobe *comp) +{ + struct trace_probe_event *tpe = orig->tp.event; + int i; + + list_for_each_entry(orig, &tpe->probes, tp.list) { + if (strcmp(trace_fprobe_symbol(orig), + trace_fprobe_symbol(comp))) + continue; + + /* + * trace_probe_compare_arg_type() ensured that nr_args and + * each argument name and type are same. Let's compare comm. + */ + for (i = 0; i < orig->tp.nr_args; i++) { + if (strcmp(orig->tp.args[i].comm, + comp->tp.args[i].comm)) + break; + } + + if (i == orig->tp.nr_args) + return true; + } + + return false; +} + +static int append_trace_fprobe(struct trace_fprobe *tf, struct trace_fprobe *to) +{ + int ret; + + if (trace_fprobe_is_return(tf) != trace_fprobe_is_return(to) || + trace_fprobe_is_tracepoint(tf) != trace_fprobe_is_tracepoint(to)) { + trace_probe_log_set_index(0); + trace_probe_log_err(0, DIFF_PROBE_TYPE); + return -EEXIST; + } + ret = trace_probe_compare_arg_type(&tf->tp, &to->tp); + if (ret) { + /* Note that argument starts index = 2 */ + trace_probe_log_set_index(ret + 1); + trace_probe_log_err(0, DIFF_ARG_TYPE); + return -EEXIST; + } + if (trace_fprobe_has_same_fprobe(to, tf)) { + trace_probe_log_set_index(0); + trace_probe_log_err(0, SAME_PROBE); + return -EEXIST; + } + + /* Append to existing event */ + ret = trace_probe_append(&tf->tp, &to->tp); + if (ret) + return ret; + + ret = __register_trace_fprobe(tf); + if (ret) + trace_probe_unlink(&tf->tp); + else + dyn_event_add(&tf->devent, trace_probe_event_call(&tf->tp)); + + return ret; +} + +/* Register a trace_probe and probe_event */ +static int register_trace_fprobe(struct trace_fprobe *tf) +{ + struct trace_fprobe *old_tf; + int ret; + + mutex_lock(&event_mutex); + + old_tf = find_trace_fprobe(trace_probe_name(&tf->tp), + trace_probe_group_name(&tf->tp)); + if (old_tf) { + ret = append_trace_fprobe(tf, old_tf); + goto end; + } + + /* Register new event */ + ret = register_fprobe_event(tf); + if (ret) { + if (ret == -EEXIST) { + trace_probe_log_set_index(0); + trace_probe_log_err(0, EVENT_EXIST); + } else + pr_warn("Failed to register probe event(%d)\n", ret); + goto end; + } + + /* Register fprobe */ + ret = __register_trace_fprobe(tf); + if (ret < 0) + unregister_fprobe_event(tf); + else + dyn_event_add(&tf->devent, trace_probe_event_call(&tf->tp)); + +end: + mutex_unlock(&event_mutex); + return ret; +} + +#ifdef CONFIG_MODULES +static int __tracepoint_probe_module_cb(struct notifier_block *self, + unsigned long val, void *data) +{ + struct tp_module *tp_mod = data; + struct trace_fprobe *tf; + struct dyn_event *pos; + + if (val != MODULE_STATE_GOING) + return NOTIFY_DONE; + + mutex_lock(&event_mutex); + for_each_trace_fprobe(tf, pos) { + if (tp_mod->mod == tf->mod) { + tracepoint_probe_unregister(tf->tpoint, + tf->tpoint->probestub, NULL); + tf->tpoint = NULL; + tf->mod = NULL; + } + } + mutex_unlock(&event_mutex); + + return NOTIFY_DONE; +} + +static struct notifier_block tracepoint_module_nb = { + .notifier_call = __tracepoint_probe_module_cb, +}; +#endif /* CONFIG_MODULES */ + +struct __find_tracepoint_cb_data { + const char *tp_name; + struct tracepoint *tpoint; +}; + +static void __find_tracepoint_cb(struct tracepoint *tp, void *priv) +{ + struct __find_tracepoint_cb_data *data = priv; + + if (!data->tpoint && !strcmp(data->tp_name, tp->name)) + data->tpoint = tp; +} + +static struct tracepoint *find_tracepoint(const char *tp_name) +{ + struct __find_tracepoint_cb_data data = { + .tp_name = tp_name, + }; + + for_each_kernel_tracepoint(__find_tracepoint_cb, &data); + + return data.tpoint; +} + +static int __trace_fprobe_create(int argc, const char *argv[]) +{ + /* + * Argument syntax: + * - Add fentry probe: + * f[:[GRP/][EVENT]] [MOD:]KSYM [FETCHARGS] + * - Add fexit probe: + * f[N][:[GRP/][EVENT]] [MOD:]KSYM%return [FETCHARGS] + * - Add tracepoint probe: + * t[:[GRP/][EVENT]] TRACEPOINT [FETCHARGS] + * + * Fetch args: + * $retval : fetch return value + * $stack : fetch stack address + * $stackN : fetch Nth entry of stack (N:0-) + * $argN : fetch Nth argument (N:1-) + * $comm : fetch current task comm + * @ADDR : fetch memory at ADDR (ADDR should be in kernel) + * @SYM[+|-offs] : fetch memory at SYM +|- offs (SYM is a data symbol) + * Dereferencing memory fetch: + * +|-offs(ARG) : fetch memory at ARG +|- offs address. + * Alias name of args: + * NAME=FETCHARG : set NAME as alias of FETCHARG. + * Type of args: + * FETCHARG:TYPE : use TYPE instead of unsigned long. + */ + struct trace_fprobe *tf = NULL; + int i, len, new_argc = 0, ret = 0; + bool is_return = false; + char *symbol = NULL, *tmp = NULL; + const char *event = NULL, *group = FPROBE_EVENT_SYSTEM; + const char **new_argv = NULL; + int maxactive = 0; + char buf[MAX_EVENT_NAME_LEN]; + char gbuf[MAX_EVENT_NAME_LEN]; + char sbuf[KSYM_NAME_LEN]; + char abuf[MAX_BTF_ARGS_LEN]; + bool is_tracepoint = false; + struct tracepoint *tpoint = NULL; + struct traceprobe_parse_context ctx = { + .flags = TPARG_FL_KERNEL | TPARG_FL_FPROBE, + }; + + if ((argv[0][0] != 'f' && argv[0][0] != 't') || argc < 2) + return -ECANCELED; + + if (argv[0][0] == 't') { + is_tracepoint = true; + group = TRACEPOINT_EVENT_SYSTEM; + } + + trace_probe_log_init("trace_fprobe", argc, argv); + + event = strchr(&argv[0][1], ':'); + if (event) + event++; + + if (isdigit(argv[0][1])) { + if (event) + len = event - &argv[0][1] - 1; + else + len = strlen(&argv[0][1]); + if (len > MAX_EVENT_NAME_LEN - 1) { + trace_probe_log_err(1, BAD_MAXACT); + goto parse_error; + } + memcpy(buf, &argv[0][1], len); + buf[len] = '\0'; + ret = kstrtouint(buf, 0, &maxactive); + if (ret || !maxactive) { + trace_probe_log_err(1, BAD_MAXACT); + goto parse_error; + } + /* fprobe rethook instances are iterated over via a list. The + * maximum should stay reasonable. + */ + if (maxactive > RETHOOK_MAXACTIVE_MAX) { + trace_probe_log_err(1, MAXACT_TOO_BIG); + goto parse_error; + } + } + + trace_probe_log_set_index(1); + + /* a symbol(or tracepoint) must be specified */ + symbol = kstrdup(argv[1], GFP_KERNEL); + if (!symbol) + return -ENOMEM; + + tmp = strchr(symbol, '%'); + if (tmp) { + if (!is_tracepoint && !strcmp(tmp, "%return")) { + *tmp = '\0'; + is_return = true; + } else { + trace_probe_log_err(tmp - symbol, BAD_ADDR_SUFFIX); + goto parse_error; + } + } + if (!is_return && maxactive) { + trace_probe_log_set_index(0); + trace_probe_log_err(1, BAD_MAXACT_TYPE); + goto parse_error; + } + + trace_probe_log_set_index(0); + if (event) { + ret = traceprobe_parse_event_name(&event, &group, gbuf, + event - argv[0]); + if (ret) + goto parse_error; + } + + if (!event) { + /* Make a new event name */ + if (is_tracepoint) + snprintf(buf, MAX_EVENT_NAME_LEN, "%s%s", + isdigit(*symbol) ? "_" : "", symbol); + else + snprintf(buf, MAX_EVENT_NAME_LEN, "%s__%s", symbol, + is_return ? "exit" : "entry"); + sanitize_event_name(buf); + event = buf; + } + + if (is_return) + ctx.flags |= TPARG_FL_RETURN; + else + ctx.flags |= TPARG_FL_FENTRY; + + if (is_tracepoint) { + ctx.flags |= TPARG_FL_TPOINT; + tpoint = find_tracepoint(symbol); + if (!tpoint) { + trace_probe_log_set_index(1); + trace_probe_log_err(0, NO_TRACEPOINT); + goto parse_error; + } + ctx.funcname = kallsyms_lookup( + (unsigned long)tpoint->probestub, + NULL, NULL, NULL, sbuf); + } else + ctx.funcname = symbol; + + argc -= 2; argv += 2; + new_argv = traceprobe_expand_meta_args(argc, argv, &new_argc, + abuf, MAX_BTF_ARGS_LEN, &ctx); + if (IS_ERR(new_argv)) { + ret = PTR_ERR(new_argv); + new_argv = NULL; + goto out; + } + if (new_argv) { + argc = new_argc; + argv = new_argv; + } + + /* setup a probe */ + tf = alloc_trace_fprobe(group, event, symbol, tpoint, maxactive, + argc, is_return); + if (IS_ERR(tf)) { + ret = PTR_ERR(tf); + /* This must return -ENOMEM, else there is a bug */ + WARN_ON_ONCE(ret != -ENOMEM); + goto out; /* We know tf is not allocated */ + } + + if (is_tracepoint) + tf->mod = __module_text_address( + (unsigned long)tf->tpoint->probestub); + + /* parse arguments */ + for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { + trace_probe_log_set_index(i + 2); + ctx.offset = 0; + ret = traceprobe_parse_probe_arg(&tf->tp, i, argv[i], &ctx); + if (ret) + goto error; /* This can be -ENOMEM */ + } + + ret = traceprobe_set_print_fmt(&tf->tp, + is_return ? PROBE_PRINT_RETURN : PROBE_PRINT_NORMAL); + if (ret < 0) + goto error; + + ret = register_trace_fprobe(tf); + if (ret) { + trace_probe_log_set_index(1); + if (ret == -EILSEQ) + trace_probe_log_err(0, BAD_INSN_BNDRY); + else if (ret == -ENOENT) + trace_probe_log_err(0, BAD_PROBE_ADDR); + else if (ret != -ENOMEM && ret != -EEXIST) + trace_probe_log_err(0, FAIL_REG_PROBE); + goto error; + } + +out: + trace_probe_log_clear(); + kfree(new_argv); + kfree(symbol); + return ret; + +parse_error: + ret = -EINVAL; +error: + free_trace_fprobe(tf); + goto out; +} + +static int trace_fprobe_create(const char *raw_command) +{ + return trace_probe_create(raw_command, __trace_fprobe_create); +} + +static int trace_fprobe_release(struct dyn_event *ev) +{ + struct trace_fprobe *tf = to_trace_fprobe(ev); + int ret = unregister_trace_fprobe(tf); + + if (!ret) + free_trace_fprobe(tf); + return ret; +} + +static int trace_fprobe_show(struct seq_file *m, struct dyn_event *ev) +{ + struct trace_fprobe *tf = to_trace_fprobe(ev); + int i; + + if (trace_fprobe_is_tracepoint(tf)) + seq_putc(m, 't'); + else + seq_putc(m, 'f'); + if (trace_fprobe_is_return(tf) && tf->fp.nr_maxactive) + seq_printf(m, "%d", tf->fp.nr_maxactive); + seq_printf(m, ":%s/%s", trace_probe_group_name(&tf->tp), + trace_probe_name(&tf->tp)); + + seq_printf(m, " %s%s", trace_fprobe_symbol(tf), + trace_fprobe_is_return(tf) ? "%return" : ""); + + for (i = 0; i < tf->tp.nr_args; i++) + seq_printf(m, " %s=%s", tf->tp.args[i].name, tf->tp.args[i].comm); + seq_putc(m, '\n'); + + return 0; +} + +/* + * called by perf_trace_init() or __ftrace_set_clr_event() under event_mutex. + */ +static int fprobe_register(struct trace_event_call *event, + enum trace_reg type, void *data) +{ + struct trace_event_file *file = data; + + switch (type) { + case TRACE_REG_REGISTER: + return enable_trace_fprobe(event, file); + case TRACE_REG_UNREGISTER: + return disable_trace_fprobe(event, file); + +#ifdef CONFIG_PERF_EVENTS + case TRACE_REG_PERF_REGISTER: + return enable_trace_fprobe(event, NULL); + case TRACE_REG_PERF_UNREGISTER: + return disable_trace_fprobe(event, NULL); + case TRACE_REG_PERF_OPEN: + case TRACE_REG_PERF_CLOSE: + case TRACE_REG_PERF_ADD: + case TRACE_REG_PERF_DEL: + return 0; +#endif + } + return 0; +} + +/* + * Register dynevent at core_initcall. This allows kernel to setup fprobe + * events in postcore_initcall without tracefs. + */ +static __init int init_fprobe_trace_early(void) +{ + int ret; + + ret = dyn_event_register(&trace_fprobe_ops); + if (ret) + return ret; + +#ifdef CONFIG_MODULES + ret = register_tracepoint_module_notifier(&tracepoint_module_nb); + if (ret) + return ret; +#endif + + return 0; +} +core_initcall(init_fprobe_trace_early); diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 203204cadf92..c35fbaab2a47 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -58,6 +58,12 @@ static struct tracer_opt trace_opts[] = { { TRACER_OPT(funcgraph-irqs, TRACE_GRAPH_PRINT_IRQS) }, /* Display function name after trailing } */ { TRACER_OPT(funcgraph-tail, TRACE_GRAPH_PRINT_TAIL) }, +#ifdef CONFIG_FUNCTION_GRAPH_RETVAL + /* Display function return value ? */ + { TRACER_OPT(funcgraph-retval, TRACE_GRAPH_PRINT_RETVAL) }, + /* Display function return value in hexadecimal format ? */ + { TRACER_OPT(funcgraph-retval-hex, TRACE_GRAPH_PRINT_RETVAL_HEX) }, +#endif /* Include sleep time (scheduled out) between entry and return */ { TRACER_OPT(sleep-time, TRACE_GRAPH_SLEEP_TIME) }, @@ -619,6 +625,56 @@ print_graph_duration(struct trace_array *tr, unsigned long long duration, trace_seq_puts(s, "| "); } +#ifdef CONFIG_FUNCTION_GRAPH_RETVAL + +#define __TRACE_GRAPH_PRINT_RETVAL TRACE_GRAPH_PRINT_RETVAL + +static void print_graph_retval(struct trace_seq *s, unsigned long retval, + bool leaf, void *func, bool hex_format) +{ + unsigned long err_code = 0; + + if (retval == 0 || hex_format) + goto done; + + /* Check if the return value matches the negative format */ + if (IS_ENABLED(CONFIG_64BIT) && (retval & BIT(31)) && + (((u64)retval) >> 32) == 0) { + /* sign extension */ + err_code = (unsigned long)(s32)retval; + } else { + err_code = retval; + } + + if (!IS_ERR_VALUE(err_code)) + err_code = 0; + +done: + if (leaf) { + if (hex_format || (err_code == 0)) + trace_seq_printf(s, "%ps(); /* = 0x%lx */\n", + func, retval); + else + trace_seq_printf(s, "%ps(); /* = %ld */\n", + func, err_code); + } else { + if (hex_format || (err_code == 0)) + trace_seq_printf(s, "} /* %ps = 0x%lx */\n", + func, retval); + else + trace_seq_printf(s, "} /* %ps = %ld */\n", + func, err_code); + } +} + +#else + +#define __TRACE_GRAPH_PRINT_RETVAL 0 + +#define print_graph_retval(_seq, _retval, _leaf, _func, _format) do {} while (0) + +#endif + /* Case of a leaf function on its call entry */ static enum print_line_t print_graph_entry_leaf(struct trace_iterator *iter, @@ -663,7 +719,15 @@ print_graph_entry_leaf(struct trace_iterator *iter, for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) trace_seq_putc(s, ' '); - trace_seq_printf(s, "%ps();\n", (void *)call->func); + /* + * Write out the function return value if the option function-retval is + * enabled. + */ + if (flags & __TRACE_GRAPH_PRINT_RETVAL) + print_graph_retval(s, graph_ret->retval, true, (void *)call->func, + !!(flags & TRACE_GRAPH_PRINT_RETVAL_HEX)); + else + trace_seq_printf(s, "%ps();\n", (void *)call->func); print_graph_irq(iter, graph_ret->func, TRACE_GRAPH_RET, cpu, iter->ent->pid, flags); @@ -942,16 +1006,25 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, trace_seq_putc(s, ' '); /* - * If the return function does not have a matching entry, - * then the entry was lost. Instead of just printing - * the '}' and letting the user guess what function this - * belongs to, write out the function name. Always do - * that if the funcgraph-tail option is enabled. + * Always write out the function name and its return value if the + * function-retval option is enabled. */ - if (func_match && !(flags & TRACE_GRAPH_PRINT_TAIL)) - trace_seq_puts(s, "}\n"); - else - trace_seq_printf(s, "} /* %ps */\n", (void *)trace->func); + if (flags & __TRACE_GRAPH_PRINT_RETVAL) { + print_graph_retval(s, trace->retval, false, (void *)trace->func, + !!(flags & TRACE_GRAPH_PRINT_RETVAL_HEX)); + } else { + /* + * If the return function does not have a matching entry, + * then the entry was lost. Instead of just printing + * the '}' and letting the user guess what function this + * belongs to, write out the function name. Always do + * that if the funcgraph-tail option is enabled. + */ + if (func_match && !(flags & TRACE_GRAPH_PRINT_TAIL)) + trace_seq_puts(s, "}\n"); + else + trace_seq_printf(s, "} /* %ps */\n", (void *)trace->func); + } /* Overrun */ if (flags & TRACE_GRAPH_PRINT_OVERRUN) diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 590b3d51afae..ba37f768e2f2 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -231,7 +231,8 @@ static void irqsoff_trace_open(struct trace_iterator *iter) { if (is_graph(iter->tr)) graph_trace_open(iter); - + else + iter->private = NULL; } static void irqsoff_trace_close(struct trace_iterator *iter) diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 59cda19a9033..23dba01831f7 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -30,7 +30,7 @@ static char kprobe_boot_events_buf[COMMAND_LINE_SIZE] __initdata; static int __init set_kprobe_boot_events(char *str) { - strlcpy(kprobe_boot_events_buf, str, COMMAND_LINE_SIZE); + strscpy(kprobe_boot_events_buf, str, COMMAND_LINE_SIZE); disable_tracing_selftest("running kprobe events"); return 1; @@ -732,9 +732,10 @@ static int __trace_kprobe_create(int argc, const char *argv[]) * FETCHARG:TYPE : use TYPE instead of unsigned long. */ struct trace_kprobe *tk = NULL; - int i, len, ret = 0; + int i, len, new_argc = 0, ret = 0; bool is_return = false; char *symbol = NULL, *tmp = NULL; + const char **new_argv = NULL; const char *event = NULL, *group = KPROBE_EVENT_SYSTEM; enum probe_print_type ptype; int maxactive = 0; @@ -742,7 +743,8 @@ static int __trace_kprobe_create(int argc, const char *argv[]) void *addr = NULL; char buf[MAX_EVENT_NAME_LEN]; char gbuf[MAX_EVENT_NAME_LEN]; - unsigned int flags = TPARG_FL_KERNEL; + char abuf[MAX_BTF_ARGS_LEN]; + struct traceprobe_parse_context ctx = { .flags = TPARG_FL_KERNEL }; switch (argv[0][0]) { case 'r': @@ -764,7 +766,7 @@ static int __trace_kprobe_create(int argc, const char *argv[]) if (isdigit(argv[0][1])) { if (!is_return) { - trace_probe_log_err(1, MAXACT_NO_KPROBE); + trace_probe_log_err(1, BAD_MAXACT_TYPE); goto parse_error; } if (event) @@ -823,10 +825,10 @@ static int __trace_kprobe_create(int argc, const char *argv[]) goto parse_error; } if (is_return) - flags |= TPARG_FL_RETURN; + ctx.flags |= TPARG_FL_RETURN; ret = kprobe_on_func_entry(NULL, symbol, offset); - if (ret == 0) - flags |= TPARG_FL_FENTRY; + if (ret == 0 && !is_return) + ctx.flags |= TPARG_FL_FENTRY; /* Defer the ENOENT case until register kprobe */ if (ret == -EINVAL && is_return) { trace_probe_log_err(0, BAD_RETPROBE); @@ -854,21 +856,35 @@ static int __trace_kprobe_create(int argc, const char *argv[]) event = buf; } + argc -= 2; argv += 2; + ctx.funcname = symbol; + new_argv = traceprobe_expand_meta_args(argc, argv, &new_argc, + abuf, MAX_BTF_ARGS_LEN, &ctx); + if (IS_ERR(new_argv)) { + ret = PTR_ERR(new_argv); + new_argv = NULL; + goto out; + } + if (new_argv) { + argc = new_argc; + argv = new_argv; + } + /* setup a probe */ tk = alloc_trace_kprobe(group, event, addr, symbol, offset, maxactive, - argc - 2, is_return); + argc, is_return); if (IS_ERR(tk)) { ret = PTR_ERR(tk); /* This must return -ENOMEM, else there is a bug */ WARN_ON_ONCE(ret != -ENOMEM); goto out; /* We know tk is not allocated */ } - argc -= 2; argv += 2; /* parse arguments */ for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { trace_probe_log_set_index(i + 2); - ret = traceprobe_parse_probe_arg(&tk->tp, i, argv[i], flags); + ctx.offset = 0; + ret = traceprobe_parse_probe_arg(&tk->tp, i, argv[i], &ctx); if (ret) goto error; /* This can be -ENOMEM */ } @@ -892,6 +908,7 @@ static int __trace_kprobe_create(int argc, const char *argv[]) out: trace_probe_log_clear(); + kfree(new_argv); kfree(symbol); return ret; diff --git a/kernel/trace/trace_kprobe_selftest.c b/kernel/trace/trace_kprobe_selftest.c index 16548ee4c8c6..3851cd1e6a62 100644 --- a/kernel/trace/trace_kprobe_selftest.c +++ b/kernel/trace/trace_kprobe_selftest.c @@ -1,4 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 + +#include "trace_kprobe_selftest.h" + /* * Function used during the kprobe self test. This function is in a separate * compile unit so it can be compile with CC_FLAGS_FTRACE to ensure that it diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index e97e3fa5cbed..bd0d01d00fb9 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -181,6 +181,7 @@ struct osn_irq { #define IRQ_CONTEXT 0 #define THREAD_CONTEXT 1 +#define THREAD_URET 2 /* * sofirq runtime info. */ @@ -238,6 +239,7 @@ struct timerlat_variables { u64 abs_period; bool tracing_thread; u64 count; + bool uthread_migrate; }; static DEFINE_PER_CPU(struct timerlat_variables, per_cpu_timerlat_var); @@ -1181,6 +1183,78 @@ thread_exit(struct osnoise_variables *osn_var, struct task_struct *t) osn_var->thread.arrival_time = 0; } +#ifdef CONFIG_TIMERLAT_TRACER +/* + * osnoise_stop_exception - Stop tracing and the tracer. + */ +static __always_inline void osnoise_stop_exception(char *msg, int cpu) +{ + struct osnoise_instance *inst; + struct trace_array *tr; + + rcu_read_lock(); + list_for_each_entry_rcu(inst, &osnoise_instances, list) { + tr = inst->tr; + trace_array_printk_buf(tr->array_buffer.buffer, _THIS_IP_, + "stop tracing hit on cpu %d due to exception: %s\n", + smp_processor_id(), + msg); + + if (test_bit(OSN_PANIC_ON_STOP, &osnoise_options)) + panic("tracer hit on cpu %d due to exception: %s\n", + smp_processor_id(), + msg); + + tracer_tracing_off(tr); + } + rcu_read_unlock(); +} + +/* + * trace_sched_migrate_callback - sched:sched_migrate_task trace event handler + * + * his function is hooked to the sched:sched_migrate_task trace event, and monitors + * timerlat user-space thread migration. + */ +static void trace_sched_migrate_callback(void *data, struct task_struct *p, int dest_cpu) +{ + struct osnoise_variables *osn_var; + long cpu = task_cpu(p); + + osn_var = per_cpu_ptr(&per_cpu_osnoise_var, cpu); + if (osn_var->pid == p->pid && dest_cpu != cpu) { + per_cpu_ptr(&per_cpu_timerlat_var, cpu)->uthread_migrate = 1; + osnoise_taint("timerlat user-thread migrated\n"); + osnoise_stop_exception("timerlat user-thread migrated", cpu); + } +} + +static int register_migration_monitor(void) +{ + int ret = 0; + + /* + * Timerlat thread migration check is only required when running timerlat in user-space. + * Thus, enable callback only if timerlat is set with no workload. + */ + if (timerlat_enabled() && !test_bit(OSN_WORKLOAD, &osnoise_options)) + ret = register_trace_sched_migrate_task(trace_sched_migrate_callback, NULL); + + return ret; +} + +static void unregister_migration_monitor(void) +{ + if (timerlat_enabled() && !test_bit(OSN_WORKLOAD, &osnoise_options)) + unregister_trace_sched_migrate_task(trace_sched_migrate_callback, NULL); +} +#else +static int register_migration_monitor(void) +{ + return 0; +} +static void unregister_migration_monitor(void) {} +#endif /* * trace_sched_switch - sched:sched_switch trace event handler * @@ -1204,7 +1278,7 @@ trace_sched_switch_callback(void *data, bool preempt, } /* - * hook_thread_events - Hook the insturmentation for thread noise + * hook_thread_events - Hook the instrumentation for thread noise * * Hook the osnoise tracer callbacks to handle the noise from other * threads on the necessary kernel events. @@ -1217,11 +1291,19 @@ static int hook_thread_events(void) if (ret) return -EINVAL; + ret = register_migration_monitor(); + if (ret) + goto out_unreg; + return 0; + +out_unreg: + unregister_trace_sched_switch(trace_sched_switch_callback, NULL); + return -EINVAL; } /* - * unhook_thread_events - *nhook the insturmentation for thread noise + * unhook_thread_events - unhook the instrumentation for thread noise * * Unook the osnoise tracer callbacks to handle the noise from other * threads on the necessary kernel events. @@ -1229,6 +1311,7 @@ static int hook_thread_events(void) static void unhook_thread_events(void) { unregister_trace_sched_switch(trace_sched_switch_callback, NULL); + unregister_migration_monitor(); } /* @@ -1286,6 +1369,22 @@ static __always_inline void osnoise_stop_tracing(void) } /* + * osnoise_has_tracing_on - Check if there is at least one instance on + */ +static __always_inline int osnoise_has_tracing_on(void) +{ + struct osnoise_instance *inst; + int trace_is_on = 0; + + rcu_read_lock(); + list_for_each_entry_rcu(inst, &osnoise_instances, list) + trace_is_on += tracer_tracing_is_on(inst->tr); + rcu_read_unlock(); + + return trace_is_on; +} + +/* * notify_new_max_latency - Notify a new max latency via fsnotify interface. */ static void notify_new_max_latency(u64 latency) @@ -1517,13 +1616,16 @@ static struct cpumask save_cpumask; /* * osnoise_sleep - sleep until the next period */ -static void osnoise_sleep(void) +static void osnoise_sleep(bool skip_period) { u64 interval; ktime_t wake_time; mutex_lock(&interface_lock); - interval = osnoise_data.sample_period - osnoise_data.sample_runtime; + if (skip_period) + interval = osnoise_data.sample_period; + else + interval = osnoise_data.sample_period - osnoise_data.sample_runtime; mutex_unlock(&interface_lock); /* @@ -1546,6 +1648,39 @@ static void osnoise_sleep(void) } /* + * osnoise_migration_pending - checks if the task needs to migrate + * + * osnoise/timerlat threads are per-cpu. If there is a pending request to + * migrate the thread away from the current CPU, something bad has happened. + * Play the good citizen and leave. + * + * Returns 0 if it is safe to continue, 1 otherwise. + */ +static inline int osnoise_migration_pending(void) +{ + if (!current->migration_pending) + return 0; + + /* + * If migration is pending, there is a task waiting for the + * tracer to enable migration. The tracer does not allow migration, + * thus: taint and leave to unblock the blocked thread. + */ + osnoise_taint("migration requested to osnoise threads, leaving."); + + /* + * Unset this thread from the threads managed by the interface. + * The tracers are responsible for cleaning their env before + * exiting. + */ + mutex_lock(&interface_lock); + this_cpu_osn_var()->kthread = NULL; + mutex_unlock(&interface_lock); + + return 1; +} + +/* * osnoise_main - The osnoise detection kernel thread * * Calls run_osnoise() function to measure the osnoise for the configured runtime, @@ -1553,12 +1688,35 @@ static void osnoise_sleep(void) */ static int osnoise_main(void *data) { + unsigned long flags; + + /* + * This thread was created pinned to the CPU using PF_NO_SETAFFINITY. + * The problem is that cgroup does not allow PF_NO_SETAFFINITY thread. + * + * To work around this limitation, disable migration and remove the + * flag. + */ + migrate_disable(); + raw_spin_lock_irqsave(¤t->pi_lock, flags); + current->flags &= ~(PF_NO_SETAFFINITY); + raw_spin_unlock_irqrestore(¤t->pi_lock, flags); while (!kthread_should_stop()) { + if (osnoise_migration_pending()) + break; + + /* skip a period if tracing is off on all instances */ + if (!osnoise_has_tracing_on()) { + osnoise_sleep(true); + continue; + } + run_osnoise(); - osnoise_sleep(); + osnoise_sleep(false); } + migrate_enable(); return 0; } @@ -1706,6 +1864,7 @@ static int timerlat_main(void *data) struct timerlat_variables *tlat = this_cpu_tmr_var(); struct timerlat_sample s; struct sched_param sp; + unsigned long flags; u64 now, diff; /* @@ -1714,6 +1873,18 @@ static int timerlat_main(void *data) sp.sched_priority = DEFAULT_TIMERLAT_PRIO; sched_setscheduler_nocheck(current, SCHED_FIFO, &sp); + /* + * This thread was created pinned to the CPU using PF_NO_SETAFFINITY. + * The problem is that cgroup does not allow PF_NO_SETAFFINITY thread. + * + * To work around this limitation, disable migration and remove the + * flag. + */ + migrate_disable(); + raw_spin_lock_irqsave(¤t->pi_lock, flags); + current->flags &= ~(PF_NO_SETAFFINITY); + raw_spin_unlock_irqrestore(¤t->pi_lock, flags); + tlat->count = 0; tlat->tracing_thread = false; @@ -1731,6 +1902,7 @@ static int timerlat_main(void *data) osn_var->sampling = 1; while (!kthread_should_stop()) { + now = ktime_to_ns(hrtimer_cb_get_time(&tlat->timer)); diff = now - tlat->abs_period; @@ -1749,10 +1921,14 @@ static int timerlat_main(void *data) if (time_to_us(diff) >= osnoise_data.stop_tracing_total) osnoise_stop_tracing(); + if (osnoise_migration_pending()) + break; + wait_next_period(tlat); } hrtimer_cancel(&tlat->timer); + migrate_enable(); return 0; } #else /* CONFIG_TIMERLAT_TRACER */ @@ -1771,10 +1947,24 @@ static void stop_kthread(unsigned int cpu) kthread = per_cpu(per_cpu_osnoise_var, cpu).kthread; if (kthread) { - kthread_stop(kthread); + if (test_bit(OSN_WORKLOAD, &osnoise_options)) { + kthread_stop(kthread); + } else { + /* + * This is a user thread waiting on the timerlat_fd. We need + * to close all users, and the best way to guarantee this is + * by killing the thread. NOTE: this is a purpose specific file. + */ + kill_pid(kthread->thread_pid, SIGKILL, 1); + put_task_struct(kthread); + } per_cpu(per_cpu_osnoise_var, cpu).kthread = NULL; } else { + /* if no workload, just return */ if (!test_bit(OSN_WORKLOAD, &osnoise_options)) { + /* + * This is set in the osnoise tracer case. + */ per_cpu(per_cpu_osnoise_var, cpu).sampling = false; barrier(); return; @@ -1819,7 +2009,6 @@ static int start_kthread(unsigned int cpu) barrier(); return 0; } - snprintf(comm, 24, "osnoise/%d", cpu); } @@ -1848,6 +2037,11 @@ static int start_per_cpu_kthreads(void) int retval = 0; int cpu; + if (!test_bit(OSN_WORKLOAD, &osnoise_options)) { + if (timerlat_enabled()) + return 0; + } + cpus_read_lock(); /* * Run only on online CPUs in which osnoise is allowed to run. @@ -2188,6 +2382,223 @@ err_free: return err; } +#ifdef CONFIG_TIMERLAT_TRACER +static int timerlat_fd_open(struct inode *inode, struct file *file) +{ + struct osnoise_variables *osn_var; + struct timerlat_variables *tlat; + long cpu = (long) inode->i_cdev; + + mutex_lock(&interface_lock); + + /* + * This file is accessible only if timerlat is enabled, and + * NO_OSNOISE_WORKLOAD is set. + */ + if (!timerlat_enabled() || test_bit(OSN_WORKLOAD, &osnoise_options)) { + mutex_unlock(&interface_lock); + return -EINVAL; + } + + migrate_disable(); + + osn_var = this_cpu_osn_var(); + + /* + * The osn_var->pid holds the single access to this file. + */ + if (osn_var->pid) { + mutex_unlock(&interface_lock); + migrate_enable(); + return -EBUSY; + } + + /* + * timerlat tracer is a per-cpu tracer. Check if the user-space too + * is pinned to a single CPU. The tracer laters monitor if the task + * migrates and then disables tracer if it does. However, it is + * worth doing this basic acceptance test to avoid obviusly wrong + * setup. + */ + if (current->nr_cpus_allowed > 1 || cpu != smp_processor_id()) { + mutex_unlock(&interface_lock); + migrate_enable(); + return -EPERM; + } + + /* + * From now on, it is good to go. + */ + file->private_data = inode->i_cdev; + + get_task_struct(current); + + osn_var->kthread = current; + osn_var->pid = current->pid; + + /* + * Setup is done. + */ + mutex_unlock(&interface_lock); + + tlat = this_cpu_tmr_var(); + tlat->count = 0; + + migrate_enable(); + return 0; +}; + +/* + * timerlat_fd_read - Read function for "timerlat_fd" file + * @file: The active open file structure + * @ubuf: The userspace provided buffer to read value into + * @cnt: The maximum number of bytes to read + * @ppos: The current "file" position + * + * Prints 1 on timerlat, the number of interferences on osnoise, -1 on error. + */ +static ssize_t +timerlat_fd_read(struct file *file, char __user *ubuf, size_t count, + loff_t *ppos) +{ + long cpu = (long) file->private_data; + struct osnoise_variables *osn_var; + struct timerlat_variables *tlat; + struct timerlat_sample s; + s64 diff; + u64 now; + + migrate_disable(); + + tlat = this_cpu_tmr_var(); + + /* + * While in user-space, the thread is migratable. There is nothing + * we can do about it. + * So, if the thread is running on another CPU, stop the machinery. + */ + if (cpu == smp_processor_id()) { + if (tlat->uthread_migrate) { + migrate_enable(); + return -EINVAL; + } + } else { + per_cpu_ptr(&per_cpu_timerlat_var, cpu)->uthread_migrate = 1; + osnoise_taint("timerlat user thread migrate\n"); + osnoise_stop_tracing(); + migrate_enable(); + return -EINVAL; + } + + osn_var = this_cpu_osn_var(); + + /* + * The timerlat in user-space runs in a different order: + * the read() starts from the execution of the previous occurrence, + * sleeping for the next occurrence. + * + * So, skip if we are entering on read() before the first wakeup + * from timerlat IRQ: + */ + if (likely(osn_var->sampling)) { + now = ktime_to_ns(hrtimer_cb_get_time(&tlat->timer)); + diff = now - tlat->abs_period; + + /* + * it was not a timer firing, but some other signal? + */ + if (diff < 0) + goto out; + + s.seqnum = tlat->count; + s.timer_latency = diff; + s.context = THREAD_URET; + + trace_timerlat_sample(&s); + + notify_new_max_latency(diff); + + tlat->tracing_thread = false; + if (osnoise_data.stop_tracing_total) + if (time_to_us(diff) >= osnoise_data.stop_tracing_total) + osnoise_stop_tracing(); + } else { + tlat->tracing_thread = false; + tlat->kthread = current; + + hrtimer_init(&tlat->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD); + tlat->timer.function = timerlat_irq; + + /* Annotate now to drift new period */ + tlat->abs_period = hrtimer_cb_get_time(&tlat->timer); + + osn_var->sampling = 1; + } + + /* wait for the next period */ + wait_next_period(tlat); + + /* This is the wakeup from this cycle */ + now = ktime_to_ns(hrtimer_cb_get_time(&tlat->timer)); + diff = now - tlat->abs_period; + + /* + * it was not a timer firing, but some other signal? + */ + if (diff < 0) + goto out; + + s.seqnum = tlat->count; + s.timer_latency = diff; + s.context = THREAD_CONTEXT; + + trace_timerlat_sample(&s); + + if (osnoise_data.stop_tracing_total) { + if (time_to_us(diff) >= osnoise_data.stop_tracing_total) { + timerlat_dump_stack(time_to_us(diff)); + notify_new_max_latency(diff); + osnoise_stop_tracing(); + } + } + +out: + migrate_enable(); + return 0; +} + +static int timerlat_fd_release(struct inode *inode, struct file *file) +{ + struct osnoise_variables *osn_var; + struct timerlat_variables *tlat_var; + long cpu = (long) file->private_data; + + migrate_disable(); + mutex_lock(&interface_lock); + + osn_var = per_cpu_ptr(&per_cpu_osnoise_var, cpu); + tlat_var = per_cpu_ptr(&per_cpu_timerlat_var, cpu); + + hrtimer_cancel(&tlat_var->timer); + memset(tlat_var, 0, sizeof(*tlat_var)); + + osn_var->sampling = 0; + osn_var->pid = 0; + + /* + * We are leaving, not being stopped... see stop_kthread(); + */ + if (osn_var->kthread) { + put_task_struct(osn_var->kthread); + osn_var->kthread = NULL; + } + + mutex_unlock(&interface_lock); + migrate_enable(); + return 0; +} +#endif + /* * osnoise/runtime_us: cannot be greater than the period. */ @@ -2251,6 +2662,13 @@ static struct trace_min_max_param timerlat_period = { .max = &timerlat_max_period, .min = &timerlat_min_period, }; + +static const struct file_operations timerlat_fd_fops = { + .open = timerlat_fd_open, + .read = timerlat_fd_read, + .release = timerlat_fd_release, + .llseek = generic_file_llseek, +}; #endif static const struct file_operations cpus_fops = { @@ -2288,18 +2706,63 @@ static int init_timerlat_stack_tracefs(struct dentry *top_dir) } #endif /* CONFIG_STACKTRACE */ +static int osnoise_create_cpu_timerlat_fd(struct dentry *top_dir) +{ + struct dentry *timerlat_fd; + struct dentry *per_cpu; + struct dentry *cpu_dir; + char cpu_str[30]; /* see trace.c: tracing_init_tracefs_percpu() */ + long cpu; + + /* + * Why not using tracing instance per_cpu/ dir? + * + * Because osnoise/timerlat have a single workload, having + * multiple files like these are wast of memory. + */ + per_cpu = tracefs_create_dir("per_cpu", top_dir); + if (!per_cpu) + return -ENOMEM; + + for_each_possible_cpu(cpu) { + snprintf(cpu_str, 30, "cpu%ld", cpu); + cpu_dir = tracefs_create_dir(cpu_str, per_cpu); + if (!cpu_dir) + goto out_clean; + + timerlat_fd = trace_create_file("timerlat_fd", TRACE_MODE_READ, + cpu_dir, NULL, &timerlat_fd_fops); + if (!timerlat_fd) + goto out_clean; + + /* Record the CPU */ + d_inode(timerlat_fd)->i_cdev = (void *)(cpu); + } + + return 0; + +out_clean: + tracefs_remove(per_cpu); + return -ENOMEM; +} + /* * init_timerlat_tracefs - A function to initialize the timerlat interface files */ static int init_timerlat_tracefs(struct dentry *top_dir) { struct dentry *tmp; + int retval; tmp = tracefs_create_file("timerlat_period_us", TRACE_MODE_WRITE, top_dir, &timerlat_period, &trace_min_max_fops); if (!tmp) return -ENOMEM; + retval = osnoise_create_cpu_timerlat_fd(top_dir); + if (retval) + return retval; + return init_timerlat_stack_tracefs(top_dir); } #else /* CONFIG_TIMERLAT_TRACER */ diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 1e33f367783e..db575094c498 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -1446,6 +1446,8 @@ static struct trace_event trace_osnoise_event = { }; /* TRACE_TIMERLAT */ + +static char *timerlat_lat_context[] = {"irq", "thread", "user-ret"}; static enum print_line_t trace_timerlat_print(struct trace_iterator *iter, int flags, struct trace_event *event) @@ -1458,7 +1460,7 @@ trace_timerlat_print(struct trace_iterator *iter, int flags, trace_seq_printf(s, "#%-5u context %6s timer_latency %9llu ns\n", field->seqnum, - field->context ? "thread" : "irq", + timerlat_lat_context[field->context], field->timer_latency); return trace_handle_return(s); diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 2d2616678295..c68a72707852 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -11,6 +11,8 @@ */ #define pr_fmt(fmt) "trace_probe: " fmt +#include <linux/bpf.h> + #include "trace_probe.h" #undef C @@ -65,7 +67,7 @@ int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s, void *data, void *ent) int len = *(u32 *)data >> 16; if (!len) - trace_seq_puts(s, "(fault)"); + trace_seq_puts(s, FAULT_STRING); else trace_seq_printf(s, "\"%s\"", (const char *)get_loc_data(data, ent)); @@ -254,7 +256,7 @@ int traceprobe_parse_event_name(const char **pevent, const char **pgroup, trace_probe_log_err(offset, GROUP_TOO_LONG); return -EINVAL; } - strlcpy(buf, event, slash - event + 1); + strscpy(buf, event, slash - event + 1); if (!is_good_system_name(buf)) { trace_probe_log_err(offset, BAD_GROUP_NAME); return -EINVAL; @@ -283,69 +285,354 @@ int traceprobe_parse_event_name(const char **pevent, const char **pgroup, return 0; } +static int parse_trace_event_arg(char *arg, struct fetch_insn *code, + struct traceprobe_parse_context *ctx) +{ + struct ftrace_event_field *field; + struct list_head *head; + + head = trace_get_fields(ctx->event); + list_for_each_entry(field, head, link) { + if (!strcmp(arg, field->name)) { + code->op = FETCH_OP_TP_ARG; + code->data = field; + return 0; + } + } + return -ENOENT; +} + +#ifdef CONFIG_PROBE_EVENTS_BTF_ARGS + +static struct btf *traceprobe_get_btf(void) +{ + struct btf *btf = bpf_get_btf_vmlinux(); + + if (IS_ERR_OR_NULL(btf)) + return NULL; + + return btf; +} + +static u32 btf_type_int(const struct btf_type *t) +{ + return *(u32 *)(t + 1); +} + +static const char *type_from_btf_id(struct btf *btf, s32 id) +{ + const struct btf_type *t; + u32 intdata; + s32 tid; + + /* TODO: const char * could be converted as a string */ + t = btf_type_skip_modifiers(btf, id, &tid); + + switch (BTF_INFO_KIND(t->info)) { + case BTF_KIND_ENUM: + /* enum is "int", so convert to "s32" */ + return "s32"; + case BTF_KIND_ENUM64: + return "s64"; + case BTF_KIND_PTR: + /* pointer will be converted to "x??" */ + if (IS_ENABLED(CONFIG_64BIT)) + return "x64"; + else + return "x32"; + case BTF_KIND_INT: + intdata = btf_type_int(t); + if (BTF_INT_ENCODING(intdata) & BTF_INT_SIGNED) { + switch (BTF_INT_BITS(intdata)) { + case 8: + return "s8"; + case 16: + return "s16"; + case 32: + return "s32"; + case 64: + return "s64"; + } + } else { /* unsigned */ + switch (BTF_INT_BITS(intdata)) { + case 8: + return "u8"; + case 16: + return "u16"; + case 32: + return "u32"; + case 64: + return "u64"; + } + } + } + /* TODO: support other types */ + + return NULL; +} + +static const struct btf_type *find_btf_func_proto(const char *funcname) +{ + struct btf *btf = traceprobe_get_btf(); + const struct btf_type *t; + s32 id; + + if (!btf || !funcname) + return ERR_PTR(-EINVAL); + + id = btf_find_by_name_kind(btf, funcname, BTF_KIND_FUNC); + if (id <= 0) + return ERR_PTR(-ENOENT); + + /* Get BTF_KIND_FUNC type */ + t = btf_type_by_id(btf, id); + if (!t || !btf_type_is_func(t)) + return ERR_PTR(-ENOENT); + + /* The type of BTF_KIND_FUNC is BTF_KIND_FUNC_PROTO */ + t = btf_type_by_id(btf, t->type); + if (!t || !btf_type_is_func_proto(t)) + return ERR_PTR(-ENOENT); + + return t; +} + +static const struct btf_param *find_btf_func_param(const char *funcname, s32 *nr, + bool tracepoint) +{ + const struct btf_param *param; + const struct btf_type *t; + + if (!funcname || !nr) + return ERR_PTR(-EINVAL); + + t = find_btf_func_proto(funcname); + if (IS_ERR(t)) + return (const struct btf_param *)t; + + *nr = btf_type_vlen(t); + param = (const struct btf_param *)(t + 1); + + /* Hide the first 'data' argument of tracepoint */ + if (tracepoint) { + (*nr)--; + param++; + } + + if (*nr > 0) + return param; + else + return NULL; +} + +static int parse_btf_arg(const char *varname, struct fetch_insn *code, + struct traceprobe_parse_context *ctx) +{ + struct btf *btf = traceprobe_get_btf(); + const struct btf_param *params; + int i; + + if (!btf) { + trace_probe_log_err(ctx->offset, NOSUP_BTFARG); + return -EOPNOTSUPP; + } + + if (WARN_ON_ONCE(!ctx->funcname)) + return -EINVAL; + + if (!ctx->params) { + params = find_btf_func_param(ctx->funcname, &ctx->nr_params, + ctx->flags & TPARG_FL_TPOINT); + if (IS_ERR_OR_NULL(params)) { + trace_probe_log_err(ctx->offset, NO_BTF_ENTRY); + return PTR_ERR(params); + } + ctx->params = params; + } else + params = ctx->params; + + for (i = 0; i < ctx->nr_params; i++) { + const char *name = btf_name_by_offset(btf, params[i].name_off); + + if (name && !strcmp(name, varname)) { + code->op = FETCH_OP_ARG; + if (ctx->flags & TPARG_FL_TPOINT) + code->param = i + 1; + else + code->param = i; + return 0; + } + } + trace_probe_log_err(ctx->offset, NO_BTFARG); + return -ENOENT; +} + +static const struct fetch_type *parse_btf_arg_type(int arg_idx, + struct traceprobe_parse_context *ctx) +{ + struct btf *btf = traceprobe_get_btf(); + const char *typestr = NULL; + + if (btf && ctx->params) { + if (ctx->flags & TPARG_FL_TPOINT) + arg_idx--; + typestr = type_from_btf_id(btf, ctx->params[arg_idx].type); + } + + return find_fetch_type(typestr, ctx->flags); +} + +static const struct fetch_type *parse_btf_retval_type( + struct traceprobe_parse_context *ctx) +{ + struct btf *btf = traceprobe_get_btf(); + const char *typestr = NULL; + const struct btf_type *t; + + if (btf && ctx->funcname) { + t = find_btf_func_proto(ctx->funcname); + if (!IS_ERR(t)) + typestr = type_from_btf_id(btf, t->type); + } + + return find_fetch_type(typestr, ctx->flags); +} + +static bool is_btf_retval_void(const char *funcname) +{ + const struct btf_type *t; + + t = find_btf_func_proto(funcname); + if (IS_ERR(t)) + return false; + + return t->type == 0; +} +#else +static struct btf *traceprobe_get_btf(void) +{ + return NULL; +} + +static const struct btf_param *find_btf_func_param(const char *funcname, s32 *nr, + bool tracepoint) +{ + return ERR_PTR(-EOPNOTSUPP); +} + +static int parse_btf_arg(const char *varname, struct fetch_insn *code, + struct traceprobe_parse_context *ctx) +{ + trace_probe_log_err(ctx->offset, NOSUP_BTFARG); + return -EOPNOTSUPP; +} + +#define parse_btf_arg_type(idx, ctx) \ + find_fetch_type(NULL, ctx->flags) + +#define parse_btf_retval_type(ctx) \ + find_fetch_type(NULL, ctx->flags) + +#define is_btf_retval_void(funcname) (false) + +#endif + #define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long)) static int parse_probe_vars(char *arg, const struct fetch_type *t, - struct fetch_insn *code, unsigned int flags, int offs) + struct fetch_insn *code, + struct traceprobe_parse_context *ctx) { unsigned long param; + int err = TP_ERR_BAD_VAR; int ret = 0; int len; - if (flags & TPARG_FL_TPOINT) { + if (ctx->flags & TPARG_FL_TEVENT) { if (code->data) return -EFAULT; - code->data = kstrdup(arg, GFP_KERNEL); - if (!code->data) - return -ENOMEM; - code->op = FETCH_OP_TP_ARG; - } else if (strcmp(arg, "retval") == 0) { - if (flags & TPARG_FL_RETURN) { + ret = parse_trace_event_arg(arg, code, ctx); + if (!ret) + return 0; + if (strcmp(arg, "comm") == 0 || strcmp(arg, "COMM") == 0) { + code->op = FETCH_OP_COMM; + return 0; + } + /* backward compatibility */ + ctx->offset = 0; + goto inval; + } + + if (strcmp(arg, "retval") == 0) { + if (ctx->flags & TPARG_FL_RETURN) { + if ((ctx->flags & TPARG_FL_KERNEL) && + is_btf_retval_void(ctx->funcname)) { + err = TP_ERR_NO_RETVAL; + goto inval; + } code->op = FETCH_OP_RETVAL; - } else { - trace_probe_log_err(offs, RETVAL_ON_PROBE); - ret = -EINVAL; + return 0; } - } else if ((len = str_has_prefix(arg, "stack"))) { + err = TP_ERR_RETVAL_ON_PROBE; + goto inval; + } + + len = str_has_prefix(arg, "stack"); + if (len) { + if (arg[len] == '\0') { code->op = FETCH_OP_STACKP; - } else if (isdigit(arg[len])) { + return 0; + } + + if (isdigit(arg[len])) { ret = kstrtoul(arg + len, 10, ¶m); - if (ret) { - goto inval_var; - } else if ((flags & TPARG_FL_KERNEL) && - param > PARAM_MAX_STACK) { - trace_probe_log_err(offs, BAD_STACK_NUM); - ret = -EINVAL; - } else { - code->op = FETCH_OP_STACK; - code->param = (unsigned int)param; + if (ret) + goto inval; + + if ((ctx->flags & TPARG_FL_KERNEL) && + param > PARAM_MAX_STACK) { + err = TP_ERR_BAD_STACK_NUM; + goto inval; } - } else - goto inval_var; - } else if (strcmp(arg, "comm") == 0 || strcmp(arg, "COMM") == 0) { + code->op = FETCH_OP_STACK; + code->param = (unsigned int)param; + return 0; + } + goto inval; + } + + if (strcmp(arg, "comm") == 0 || strcmp(arg, "COMM") == 0) { code->op = FETCH_OP_COMM; + return 0; + } + #ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API - } else if (((flags & TPARG_FL_MASK) == - (TPARG_FL_KERNEL | TPARG_FL_FENTRY)) && - (len = str_has_prefix(arg, "arg"))) { + len = str_has_prefix(arg, "arg"); + if (len && tparg_is_function_entry(ctx->flags)) { ret = kstrtoul(arg + len, 10, ¶m); - if (ret) { - goto inval_var; - } else if (!param || param > PARAM_MAX_STACK) { - trace_probe_log_err(offs, BAD_ARG_NUM); - return -EINVAL; + if (ret) + goto inval; + + if (!param || param > PARAM_MAX_STACK) { + err = TP_ERR_BAD_ARG_NUM; + goto inval; } + code->op = FETCH_OP_ARG; code->param = (unsigned int)param - 1; + /* + * The tracepoint probe will probe a stub function, and the + * first parameter of the stub is a dummy and should be ignored. + */ + if (ctx->flags & TPARG_FL_TPOINT) + code->param++; + return 0; + } #endif - } else - goto inval_var; - return ret; - -inval_var: - trace_probe_log_err(offs, BAD_VAR); +inval: + __trace_probe_log_err(ctx->offset, err); return -EINVAL; } @@ -378,7 +665,7 @@ static int __parse_imm_string(char *str, char **pbuf, int offs) static int parse_probe_arg(char *arg, const struct fetch_type *type, struct fetch_insn **pcode, struct fetch_insn *end, - unsigned int flags, int offs) + struct traceprobe_parse_context *ctx) { struct fetch_insn *code = *pcode; unsigned long param; @@ -389,13 +676,13 @@ parse_probe_arg(char *arg, const struct fetch_type *type, switch (arg[0]) { case '$': - ret = parse_probe_vars(arg + 1, type, code, flags, offs); + ret = parse_probe_vars(arg + 1, type, code, ctx); break; case '%': /* named register */ - if (flags & TPARG_FL_TPOINT) { - /* eprobes do not handle registers */ - trace_probe_log_err(offs, BAD_VAR); + if (ctx->flags & (TPARG_FL_TEVENT | TPARG_FL_FPROBE)) { + /* eprobe and fprobe do not handle registers */ + trace_probe_log_err(ctx->offset, BAD_VAR); break; } ret = regs_query_register_offset(arg + 1); @@ -404,14 +691,14 @@ parse_probe_arg(char *arg, const struct fetch_type *type, code->param = (unsigned int)ret; ret = 0; } else - trace_probe_log_err(offs, BAD_REG_NAME); + trace_probe_log_err(ctx->offset, BAD_REG_NAME); break; case '@': /* memory, file-offset or symbol */ if (isdigit(arg[1])) { ret = kstrtoul(arg + 1, 0, ¶m); if (ret) { - trace_probe_log_err(offs, BAD_MEM_ADDR); + trace_probe_log_err(ctx->offset, BAD_MEM_ADDR); break; } /* load address */ @@ -419,13 +706,13 @@ parse_probe_arg(char *arg, const struct fetch_type *type, code->immediate = param; } else if (arg[1] == '+') { /* kprobes don't support file offsets */ - if (flags & TPARG_FL_KERNEL) { - trace_probe_log_err(offs, FILE_ON_KPROBE); + if (ctx->flags & TPARG_FL_KERNEL) { + trace_probe_log_err(ctx->offset, FILE_ON_KPROBE); return -EINVAL; } ret = kstrtol(arg + 2, 0, &offset); if (ret) { - trace_probe_log_err(offs, BAD_FILE_OFFS); + trace_probe_log_err(ctx->offset, BAD_FILE_OFFS); break; } @@ -433,8 +720,8 @@ parse_probe_arg(char *arg, const struct fetch_type *type, code->immediate = (unsigned long)offset; // imm64? } else { /* uprobes don't support symbols */ - if (!(flags & TPARG_FL_KERNEL)) { - trace_probe_log_err(offs, SYM_ON_UPROBE); + if (!(ctx->flags & TPARG_FL_KERNEL)) { + trace_probe_log_err(ctx->offset, SYM_ON_UPROBE); return -EINVAL; } /* Preserve symbol for updating */ @@ -443,7 +730,7 @@ parse_probe_arg(char *arg, const struct fetch_type *type, if (!code->data) return -ENOMEM; if (++code == end) { - trace_probe_log_err(offs, TOO_MANY_OPS); + trace_probe_log_err(ctx->offset, TOO_MANY_OPS); return -EINVAL; } code->op = FETCH_OP_IMM; @@ -451,7 +738,7 @@ parse_probe_arg(char *arg, const struct fetch_type *type, } /* These are fetching from memory */ if (++code == end) { - trace_probe_log_err(offs, TOO_MANY_OPS); + trace_probe_log_err(ctx->offset, TOO_MANY_OPS); return -EINVAL; } *pcode = code; @@ -470,36 +757,38 @@ parse_probe_arg(char *arg, const struct fetch_type *type, arg++; /* Skip '+', because kstrtol() rejects it. */ tmp = strchr(arg, '('); if (!tmp) { - trace_probe_log_err(offs, DEREF_NEED_BRACE); + trace_probe_log_err(ctx->offset, DEREF_NEED_BRACE); return -EINVAL; } *tmp = '\0'; ret = kstrtol(arg, 0, &offset); if (ret) { - trace_probe_log_err(offs, BAD_DEREF_OFFS); + trace_probe_log_err(ctx->offset, BAD_DEREF_OFFS); break; } - offs += (tmp + 1 - arg) + (arg[0] != '-' ? 1 : 0); + ctx->offset += (tmp + 1 - arg) + (arg[0] != '-' ? 1 : 0); arg = tmp + 1; tmp = strrchr(arg, ')'); if (!tmp) { - trace_probe_log_err(offs + strlen(arg), + trace_probe_log_err(ctx->offset + strlen(arg), DEREF_OPEN_BRACE); return -EINVAL; } else { - const struct fetch_type *t2 = find_fetch_type(NULL, flags); + const struct fetch_type *t2 = find_fetch_type(NULL, ctx->flags); + int cur_offs = ctx->offset; *tmp = '\0'; - ret = parse_probe_arg(arg, t2, &code, end, flags, offs); + ret = parse_probe_arg(arg, t2, &code, end, ctx); if (ret) break; + ctx->offset = cur_offs; if (code->op == FETCH_OP_COMM || code->op == FETCH_OP_DATA) { - trace_probe_log_err(offs, COMM_CANT_DEREF); + trace_probe_log_err(ctx->offset, COMM_CANT_DEREF); return -EINVAL; } if (++code == end) { - trace_probe_log_err(offs, TOO_MANY_OPS); + trace_probe_log_err(ctx->offset, TOO_MANY_OPS); return -EINVAL; } *pcode = code; @@ -510,7 +799,7 @@ parse_probe_arg(char *arg, const struct fetch_type *type, break; case '\\': /* Immediate value */ if (arg[1] == '"') { /* Immediate string */ - ret = __parse_imm_string(arg + 2, &tmp, offs + 2); + ret = __parse_imm_string(arg + 2, &tmp, ctx->offset + 2); if (ret) break; code->op = FETCH_OP_DATA; @@ -518,15 +807,24 @@ parse_probe_arg(char *arg, const struct fetch_type *type, } else { ret = str_to_immediate(arg + 1, &code->immediate); if (ret) - trace_probe_log_err(offs + 1, BAD_IMM); + trace_probe_log_err(ctx->offset + 1, BAD_IMM); else code->op = FETCH_OP_IMM; } break; + default: + if (isalpha(arg[0]) || arg[0] == '_') { /* BTF variable */ + if (!tparg_is_function_entry(ctx->flags)) { + trace_probe_log_err(ctx->offset, NOSUP_BTFARG); + return -EINVAL; + } + ret = parse_btf_arg(arg, code, ctx); + break; + } } if (!ret && code->op == FETCH_OP_NOP) { /* Parsed, but do not find fetch method */ - trace_probe_log_err(offs, BAD_FETCH_ARG); + trace_probe_log_err(ctx->offset, BAD_FETCH_ARG); ret = -EINVAL; } return ret; @@ -571,12 +869,13 @@ static int __parse_bitfield_probe_arg(const char *bf, /* String length checking wrapper */ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size, - struct probe_arg *parg, unsigned int flags, int offset) + struct probe_arg *parg, + struct traceprobe_parse_context *ctx) { struct fetch_insn *code, *scode, *tmp = NULL; char *t, *t2, *t3; - char *arg; int ret, len; + char *arg; arg = kstrdup(argv, GFP_KERNEL); if (!arg) @@ -585,10 +884,10 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size, ret = -EINVAL; len = strlen(arg); if (len > MAX_ARGSTR_LEN) { - trace_probe_log_err(offset, ARG_TOO_LONG); + trace_probe_log_err(ctx->offset, ARG_TOO_LONG); goto out; } else if (len == 0) { - trace_probe_log_err(offset, NO_ARG_BODY); + trace_probe_log_err(ctx->offset, NO_ARG_BODY); goto out; } @@ -606,23 +905,24 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size, *t2++ = '\0'; t3 = strchr(t2, ']'); if (!t3) { - offset += t2 + strlen(t2) - arg; - trace_probe_log_err(offset, + int offs = t2 + strlen(t2) - arg; + + trace_probe_log_err(ctx->offset + offs, ARRAY_NO_CLOSE); goto out; } else if (t3[1] != '\0') { - trace_probe_log_err(offset + t3 + 1 - arg, + trace_probe_log_err(ctx->offset + t3 + 1 - arg, BAD_ARRAY_SUFFIX); goto out; } *t3 = '\0'; if (kstrtouint(t2, 0, &parg->count) || !parg->count) { - trace_probe_log_err(offset + t2 - arg, + trace_probe_log_err(ctx->offset + t2 - arg, BAD_ARRAY_NUM); goto out; } if (parg->count > MAX_ARRAY_LEN) { - trace_probe_log_err(offset + t2 - arg, + trace_probe_log_err(ctx->offset + t2 - arg, ARRAY_TOO_BIG); goto out; } @@ -633,17 +933,17 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size, * Since $comm and immediate string can not be dereferenced, * we can find those by strcmp. But ignore for eprobes. */ - if (!(flags & TPARG_FL_TPOINT) && + if (!(ctx->flags & TPARG_FL_TEVENT) && (strcmp(arg, "$comm") == 0 || strcmp(arg, "$COMM") == 0 || strncmp(arg, "\\\"", 2) == 0)) { /* The type of $comm must be "string", and not an array. */ if (parg->count || (t && strcmp(t, "string"))) goto out; - parg->type = find_fetch_type("string", flags); + parg->type = find_fetch_type("string", ctx->flags); } else - parg->type = find_fetch_type(t, flags); + parg->type = find_fetch_type(t, ctx->flags); if (!parg->type) { - trace_probe_log_err(offset + (t ? (t - arg) : 0), BAD_TYPE); + trace_probe_log_err(ctx->offset + (t ? (t - arg) : 0), BAD_TYPE); goto out; } parg->offset = *size; @@ -665,10 +965,18 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size, code[FETCH_INSN_MAX - 1].op = FETCH_OP_END; ret = parse_probe_arg(arg, parg->type, &code, &code[FETCH_INSN_MAX - 1], - flags, offset); + ctx); if (ret) goto fail; + /* Update storing type if BTF is available */ + if (IS_ENABLED(CONFIG_PROBE_EVENTS_BTF_ARGS) && !t) { + if (code->op == FETCH_OP_ARG) + parg->type = parse_btf_arg_type(code->param, ctx); + else if (code->op == FETCH_OP_RETVAL) + parg->type = parse_btf_retval_type(ctx); + } + ret = -EINVAL; /* Store operation */ if (parg->type->is_string) { @@ -676,7 +984,7 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size, if (code->op != FETCH_OP_REG && code->op != FETCH_OP_STACK && code->op != FETCH_OP_RETVAL && code->op != FETCH_OP_ARG && code->op != FETCH_OP_DEREF && code->op != FETCH_OP_TP_ARG) { - trace_probe_log_err(offset + (t ? (t - arg) : 0), + trace_probe_log_err(ctx->offset + (t ? (t - arg) : 0), BAD_SYMSTRING); goto fail; } @@ -684,7 +992,7 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size, if (code->op != FETCH_OP_DEREF && code->op != FETCH_OP_UDEREF && code->op != FETCH_OP_IMM && code->op != FETCH_OP_COMM && code->op != FETCH_OP_DATA && code->op != FETCH_OP_TP_ARG) { - trace_probe_log_err(offset + (t ? (t - arg) : 0), + trace_probe_log_err(ctx->offset + (t ? (t - arg) : 0), BAD_STRING); goto fail; } @@ -703,7 +1011,7 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size, */ code++; if (code->op != FETCH_OP_NOP) { - trace_probe_log_err(offset, TOO_MANY_OPS); + trace_probe_log_err(ctx->offset, TOO_MANY_OPS); goto fail; } } @@ -726,7 +1034,7 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size, } else { code++; if (code->op != FETCH_OP_NOP) { - trace_probe_log_err(offset, TOO_MANY_OPS); + trace_probe_log_err(ctx->offset, TOO_MANY_OPS); goto fail; } code->op = FETCH_OP_ST_RAW; @@ -737,7 +1045,7 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size, if (t != NULL) { ret = __parse_bitfield_probe_arg(t, parg->type, &code); if (ret) { - trace_probe_log_err(offset + t - arg, BAD_BITFIELD); + trace_probe_log_err(ctx->offset + t - arg, BAD_BITFIELD); goto fail; } } @@ -747,13 +1055,13 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size, if (scode->op != FETCH_OP_ST_MEM && scode->op != FETCH_OP_ST_STRING && scode->op != FETCH_OP_ST_USTRING) { - trace_probe_log_err(offset + (t ? (t - arg) : 0), + trace_probe_log_err(ctx->offset + (t ? (t - arg) : 0), BAD_STRING); goto fail; } code++; if (code->op != FETCH_OP_NOP) { - trace_probe_log_err(offset, TOO_MANY_OPS); + trace_probe_log_err(ctx->offset, TOO_MANY_OPS); goto fail; } code->op = FETCH_OP_LP_ARRAY; @@ -801,8 +1109,35 @@ static int traceprobe_conflict_field_name(const char *name, return 0; } +static char *generate_probe_arg_name(const char *arg, int idx) +{ + char *name = NULL; + const char *end; + + /* + * If argument name is omitted, try arg as a name (BTF variable) + * or "argN". + */ + if (IS_ENABLED(CONFIG_PROBE_EVENTS_BTF_ARGS)) { + end = strchr(arg, ':'); + if (!end) + end = arg + strlen(arg); + + name = kmemdup_nul(arg, end - arg, GFP_KERNEL); + if (!name || !is_good_name(name)) { + kfree(name); + name = NULL; + } + } + + if (!name) + name = kasprintf(GFP_KERNEL, "arg%d", idx + 1); + + return name; +} + int traceprobe_parse_probe_arg(struct trace_probe *tp, int i, const char *arg, - unsigned int flags) + struct traceprobe_parse_context *ctx) { struct probe_arg *parg = &tp->args[i]; const char *body; @@ -822,8 +1157,7 @@ int traceprobe_parse_probe_arg(struct trace_probe *tp, int i, const char *arg, parg->name = kmemdup_nul(arg, body - arg, GFP_KERNEL); body++; } else { - /* If argument name is omitted, set "argN" */ - parg->name = kasprintf(GFP_KERNEL, "arg%d", i + 1); + parg->name = generate_probe_arg_name(arg, i); body = arg; } if (!parg->name) @@ -837,9 +1171,9 @@ int traceprobe_parse_probe_arg(struct trace_probe *tp, int i, const char *arg, trace_probe_log_err(0, USED_ARG_NAME); return -EINVAL; } + ctx->offset = body - arg; /* Parse fetch argument */ - return traceprobe_parse_probe_arg_body(body, &tp->size, parg, flags, - body - arg); + return traceprobe_parse_probe_arg_body(body, &tp->size, parg, ctx); } void traceprobe_free_probe_arg(struct probe_arg *arg) @@ -858,6 +1192,151 @@ void traceprobe_free_probe_arg(struct probe_arg *arg) kfree(arg->fmt); } +static int argv_has_var_arg(int argc, const char *argv[], int *args_idx, + struct traceprobe_parse_context *ctx) +{ + int i, found = 0; + + for (i = 0; i < argc; i++) + if (str_has_prefix(argv[i], "$arg")) { + trace_probe_log_set_index(i + 2); + + if (!tparg_is_function_entry(ctx->flags)) { + trace_probe_log_err(0, NOFENTRY_ARGS); + return -EINVAL; + } + + if (isdigit(argv[i][4])) { + found = 1; + continue; + } + + if (argv[i][4] != '*') { + trace_probe_log_err(0, BAD_VAR); + return -EINVAL; + } + + if (*args_idx >= 0 && *args_idx < argc) { + trace_probe_log_err(0, DOUBLE_ARGS); + return -EINVAL; + } + found = 1; + *args_idx = i; + } + + return found; +} + +static int sprint_nth_btf_arg(int idx, const char *type, + char *buf, int bufsize, + struct traceprobe_parse_context *ctx) +{ + struct btf *btf = traceprobe_get_btf(); + const char *name; + int ret; + + if (idx >= ctx->nr_params) { + trace_probe_log_err(0, NO_BTFARG); + return -ENOENT; + } + name = btf_name_by_offset(btf, ctx->params[idx].name_off); + if (!name) { + trace_probe_log_err(0, NO_BTF_ENTRY); + return -ENOENT; + } + ret = snprintf(buf, bufsize, "%s%s", name, type); + if (ret >= bufsize) { + trace_probe_log_err(0, ARGS_2LONG); + return -E2BIG; + } + return ret; +} + +/* Return new_argv which must be freed after use */ +const char **traceprobe_expand_meta_args(int argc, const char *argv[], + int *new_argc, char *buf, int bufsize, + struct traceprobe_parse_context *ctx) +{ + const struct btf_param *params = NULL; + int i, j, n, used, ret, args_idx = -1; + const char **new_argv = NULL; + int nr_params; + + ret = argv_has_var_arg(argc, argv, &args_idx, ctx); + if (ret < 0) + return ERR_PTR(ret); + + if (!ret) { + *new_argc = argc; + return NULL; + } + + params = find_btf_func_param(ctx->funcname, &nr_params, + ctx->flags & TPARG_FL_TPOINT); + if (IS_ERR_OR_NULL(params)) { + if (args_idx != -1) { + /* $arg* requires BTF info */ + trace_probe_log_err(0, NOSUP_BTFARG); + return (const char **)params; + } + *new_argc = argc; + return NULL; + } + ctx->params = params; + ctx->nr_params = nr_params; + + if (args_idx >= 0) + *new_argc = argc + ctx->nr_params - 1; + else + *new_argc = argc; + + new_argv = kcalloc(*new_argc, sizeof(char *), GFP_KERNEL); + if (!new_argv) + return ERR_PTR(-ENOMEM); + + used = 0; + for (i = 0, j = 0; i < argc; i++) { + trace_probe_log_set_index(i + 2); + if (i == args_idx) { + for (n = 0; n < nr_params; n++) { + ret = sprint_nth_btf_arg(n, "", buf + used, + bufsize - used, ctx); + if (ret < 0) + goto error; + + new_argv[j++] = buf + used; + used += ret + 1; + } + continue; + } + + if (str_has_prefix(argv[i], "$arg")) { + char *type = NULL; + + n = simple_strtoul(argv[i] + 4, &type, 10); + if (type && !(*type == ':' || *type == '\0')) { + trace_probe_log_err(0, BAD_VAR); + ret = -ENOENT; + goto error; + } + /* Note: $argN starts from $arg1 */ + ret = sprint_nth_btf_arg(n - 1, type, buf + used, + bufsize - used, ctx); + if (ret < 0) + goto error; + new_argv[j++] = buf + used; + used += ret + 1; + } else + new_argv[j++] = argv[i]; + } + + return new_argv; + +error: + kfree(new_argv); + return ERR_PTR(ret); +} + int traceprobe_update_arg(struct probe_arg *arg) { struct fetch_insn *code = arg->code; diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 6a4ecfb1da43..01ea148723de 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -23,6 +23,7 @@ #include <linux/limits.h> #include <linux/uaccess.h> #include <linux/bitops.h> +#include <linux/btf.h> #include <asm/bitsperlong.h> #include "trace.h" @@ -32,7 +33,9 @@ #define MAX_ARGSTR_LEN 63 #define MAX_ARRAY_LEN 64 #define MAX_ARG_NAME_LEN 32 +#define MAX_BTF_ARGS_LEN 128 #define MAX_STRING_SIZE PATH_MAX +#define MAX_ARG_BUF_LEN (MAX_TRACE_ARGS * MAX_ARG_NAME_LEN) /* Reserved field names */ #define FIELD_STRING_IP "__probe_ip" @@ -357,15 +360,42 @@ int trace_probe_print_args(struct trace_seq *s, struct probe_arg *args, int nr_a #define trace_probe_for_each_link_rcu(pos, tp) \ list_for_each_entry_rcu(pos, &(tp)->event->files, list) +/* + * The flags used for parsing trace_probe arguments. + * TPARG_FL_RETURN, TPARG_FL_FENTRY and TPARG_FL_TEVENT are mutually exclusive. + * TPARG_FL_KERNEL and TPARG_FL_USER are also mutually exclusive. + * TPARG_FL_FPROBE and TPARG_FL_TPOINT are optional but it should be with + * TPARG_FL_KERNEL. + */ #define TPARG_FL_RETURN BIT(0) #define TPARG_FL_KERNEL BIT(1) #define TPARG_FL_FENTRY BIT(2) -#define TPARG_FL_TPOINT BIT(3) +#define TPARG_FL_TEVENT BIT(3) #define TPARG_FL_USER BIT(4) -#define TPARG_FL_MASK GENMASK(4, 0) +#define TPARG_FL_FPROBE BIT(5) +#define TPARG_FL_TPOINT BIT(6) +#define TPARG_FL_LOC_MASK GENMASK(4, 0) + +static inline bool tparg_is_function_entry(unsigned int flags) +{ + return (flags & TPARG_FL_LOC_MASK) == (TPARG_FL_KERNEL | TPARG_FL_FENTRY); +} + +struct traceprobe_parse_context { + struct trace_event_call *event; + const struct btf_param *params; + s32 nr_params; + const char *funcname; + unsigned int flags; + int offset; +}; extern int traceprobe_parse_probe_arg(struct trace_probe *tp, int i, - const char *argv, unsigned int flags); + const char *argv, + struct traceprobe_parse_context *ctx); +const char **traceprobe_expand_meta_args(int argc, const char *argv[], + int *new_argc, char *buf, int bufsize, + struct traceprobe_parse_context *ctx); extern int traceprobe_update_arg(struct probe_arg *arg); extern void traceprobe_free_probe_arg(struct probe_arg *arg); @@ -404,11 +434,12 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call, C(REFCNT_OPEN_BRACE, "Reference counter brace is not closed"), \ C(BAD_REFCNT_SUFFIX, "Reference counter has wrong suffix"), \ C(BAD_UPROBE_OFFS, "Invalid uprobe offset"), \ - C(MAXACT_NO_KPROBE, "Maxactive is not for kprobe"), \ + C(BAD_MAXACT_TYPE, "Maxactive is only for function exit"), \ C(BAD_MAXACT, "Invalid maxactive number"), \ C(MAXACT_TOO_BIG, "Maxactive is too big"), \ C(BAD_PROBE_ADDR, "Invalid probed address or symbol"), \ C(BAD_RETPROBE, "Retprobe address must be an function entry"), \ + C(NO_TRACEPOINT, "Tracepoint is not found"), \ C(BAD_ADDR_SUFFIX, "Invalid probed address suffix"), \ C(NO_GROUP_NAME, "Group name is not specified"), \ C(GROUP_TOO_LONG, "Group name is too long"), \ @@ -418,6 +449,7 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call, C(BAD_EVENT_NAME, "Event name must follow the same rules as C identifiers"), \ C(EVENT_EXIST, "Given group/event name is already used by another event"), \ C(RETVAL_ON_PROBE, "$retval is not available on probe"), \ + C(NO_RETVAL, "This function returns 'void' type"), \ C(BAD_STACK_NUM, "Invalid stack number"), \ C(BAD_ARG_NUM, "Invalid argument number"), \ C(BAD_VAR, "Invalid $-valiable specified"), \ @@ -456,7 +488,14 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call, C(NO_EVENT_INFO, "This requires both group and event name to attach"),\ C(BAD_ATTACH_EVENT, "Attached event does not exist"),\ C(BAD_ATTACH_ARG, "Attached event does not have this field"),\ - C(NO_EP_FILTER, "No filter rule after 'if'"), + C(NO_EP_FILTER, "No filter rule after 'if'"), \ + C(NOSUP_BTFARG, "BTF is not available or not supported"), \ + C(NO_BTFARG, "This variable is not found at this probe point"),\ + C(NO_BTF_ENTRY, "No BTF entry for this probe point"), \ + C(BAD_VAR_ARGS, "$arg* must be an independent parameter without name etc."),\ + C(NOFENTRY_ARGS, "$arg* can be used only on function entry"), \ + C(DOUBLE_ARGS, "$arg* can be used only once in the parameters"), \ + C(ARGS_2LONG, "$arg* failed because the argument list is too long"), #undef C #define C(a, b) TP_ERR_##a diff --git a/kernel/trace/trace_probe_kernel.h b/kernel/trace/trace_probe_kernel.h index c4e1d4c03a85..bb723eefd7b7 100644 --- a/kernel/trace/trace_probe_kernel.h +++ b/kernel/trace/trace_probe_kernel.h @@ -2,8 +2,6 @@ #ifndef __TRACE_PROBE_KERNEL_H_ #define __TRACE_PROBE_KERNEL_H_ -#define FAULT_STRING "(fault)" - /* * This depends on trace_probe.h, but can not include it due to * the way trace_probe_tmpl.h is used by trace_kprobe.c and trace_eprobe.c. @@ -15,16 +13,8 @@ static nokprobe_inline int fetch_store_strlen_user(unsigned long addr) { const void __user *uaddr = (__force const void __user *)addr; - int ret; - ret = strnlen_user_nofault(uaddr, MAX_STRING_SIZE); - /* - * strnlen_user_nofault returns zero on fault, insert the - * FAULT_STRING when that occurs. - */ - if (ret <= 0) - return strlen(FAULT_STRING) + 1; - return ret; + return strnlen_user_nofault(uaddr, MAX_STRING_SIZE); } /* Return the length of string -- including null terminal byte */ @@ -44,18 +34,14 @@ fetch_store_strlen(unsigned long addr) len++; } while (c && ret == 0 && len < MAX_STRING_SIZE); - /* For faults, return enough to hold the FAULT_STRING */ - return (ret < 0) ? strlen(FAULT_STRING) + 1 : len; + return (ret < 0) ? ret : len; } -static nokprobe_inline void set_data_loc(int ret, void *dest, void *__dest, void *base, int len) +static nokprobe_inline void set_data_loc(int ret, void *dest, void *__dest, void *base) { - if (ret >= 0) { - *(u32 *)dest = make_data_loc(ret, __dest - base); - } else { - strscpy(__dest, FAULT_STRING, len); - ret = strlen(__dest) + 1; - } + if (ret < 0) + ret = 0; + *(u32 *)dest = make_data_loc(ret, __dest - base); } /* @@ -76,7 +62,7 @@ fetch_store_string_user(unsigned long addr, void *dest, void *base) __dest = get_loc_data(dest, base); ret = strncpy_from_user_nofault(__dest, uaddr, maxlen); - set_data_loc(ret, dest, __dest, base, maxlen); + set_data_loc(ret, dest, __dest, base); return ret; } @@ -107,7 +93,7 @@ fetch_store_string(unsigned long addr, void *dest, void *base) * probing. */ ret = strncpy_from_kernel_nofault(__dest, (void *)addr, maxlen); - set_data_loc(ret, dest, __dest, base, maxlen); + set_data_loc(ret, dest, __dest, base); return ret; } diff --git a/kernel/trace/trace_probe_tmpl.h b/kernel/trace/trace_probe_tmpl.h index 00707630788d..3935b347f874 100644 --- a/kernel/trace/trace_probe_tmpl.h +++ b/kernel/trace/trace_probe_tmpl.h @@ -156,11 +156,11 @@ stage3: code++; goto array; case FETCH_OP_ST_USTRING: - ret += fetch_store_strlen_user(val + code->offset); + ret = fetch_store_strlen_user(val + code->offset); code++; goto array; case FETCH_OP_ST_SYMSTR: - ret += fetch_store_symstrlen(val + code->offset); + ret = fetch_store_symstrlen(val + code->offset); code++; goto array; default: @@ -204,6 +204,8 @@ stage3: array: /* the last stage: Loop on array */ if (code->op == FETCH_OP_LP_ARRAY) { + if (ret < 0) + ret = 0; total += ret; if (++i < code->param) { code = s3; @@ -265,9 +267,7 @@ store_trace_args(void *data, struct trace_probe *tp, void *rec, if (unlikely(arg->dynamic)) *dl = make_data_loc(maxlen, dyndata - base); ret = process_fetch_insn(arg->code, rec, dl, base); - if (unlikely(ret < 0 && arg->dynamic)) { - *dl = make_data_loc(0, dyndata - base); - } else { + if (arg->dynamic && likely(ret > 0)) { dyndata += ret; maxlen -= ret; } diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 330aee1c1a49..0469a04a355f 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -168,6 +168,8 @@ static void wakeup_trace_open(struct trace_iterator *iter) { if (is_graph(iter->tr)) graph_trace_open(iter); + else + iter->private = NULL; } static void wakeup_trace_close(struct trace_iterator *iter) diff --git a/kernel/trace/trace_seq.c b/kernel/trace/trace_seq.c index e5e299260d0c..bac06ee3b98b 100644 --- a/kernel/trace/trace_seq.c +++ b/kernel/trace/trace_seq.c @@ -131,6 +131,7 @@ EXPORT_SYMBOL_GPL(trace_seq_bitmask); * trace_seq_vprintf - sequence printing of trace information * @s: trace sequence descriptor * @fmt: printf format string + * @args: Arguments for the format string * * The tracer may use either sequence operations or its own * copy to user routines. To simplify formatting of a trace diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 8b92e34ff0c8..688bf579f2f1 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -170,7 +170,8 @@ fetch_store_string(unsigned long addr, void *dest, void *base) */ ret++; *(u32 *)dest = make_data_loc(ret, (void *)dst - base); - } + } else + *(u32 *)dest = make_data_loc(0, (void *)dst - base); return ret; } @@ -686,10 +687,12 @@ static int __trace_uprobe_create(int argc, const char **argv) /* parse arguments */ for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { + struct traceprobe_parse_context ctx = { + .flags = (is_return ? TPARG_FL_RETURN : 0) | TPARG_FL_USER, + }; + trace_probe_log_set_index(i + 2); - ret = traceprobe_parse_probe_arg(&tu->tp, i, argv[i], - (is_return ? TPARG_FL_RETURN : 0) | - TPARG_FL_USER); + ret = traceprobe_parse_probe_arg(&tu->tp, i, argv[i], &ctx); if (ret) goto error; } diff --git a/kernel/trace/tracing_map.h b/kernel/trace/tracing_map.h index 2c765ee2a4d4..99c37eeebc16 100644 --- a/kernel/trace/tracing_map.h +++ b/kernel/trace/tracing_map.h @@ -272,10 +272,6 @@ extern u64 tracing_map_read_sum(struct tracing_map_elt *elt, unsigned int i); extern u64 tracing_map_read_var(struct tracing_map_elt *elt, unsigned int i); extern u64 tracing_map_read_var_once(struct tracing_map_elt *elt, unsigned int i); -extern void tracing_map_set_field_descr(struct tracing_map *map, - unsigned int i, - unsigned int key_offset, - tracing_map_cmp_fn_t cmp_fn); extern int tracing_map_sort_entries(struct tracing_map *map, struct tracing_map_sort_key *sort_keys, |
