From 8f7262cd66699a4b02eb7549b35c81b2116aad95 Mon Sep 17 00:00:00 2001 From: Punit Agrawal Date: Tue, 14 Sep 2021 23:38:37 +0900 Subject: kprobes: Do not use local variable when creating debugfs file debugfs_create_file() takes a pointer argument that can be used during file operation callbacks (accessible via i_private in the inode structure). An obvious requirement is for the pointer to refer to valid memory when used. When creating the debugfs file to dynamically enable / disable kprobes, a pointer to local variable is passed to debugfs_create_file(); which will go out of scope when the init function returns. The reason this hasn't triggered random memory corruption is because the pointer is not accessed during the debugfs file callbacks. Since the enabled state is managed by the kprobes_all_disabled global variable, the local variable is not needed. Fix the incorrect (and unnecessary) usage of local variable during debugfs_file_create() by passing NULL instead. Link: https://lkml.kernel.org/r/163163031686.489837.4476867635937014973.stgit@devnote2 Fixes: bf8f6e5b3e51 ("Kprobes: The ON/OFF knob thru debugfs") Signed-off-by: Punit Agrawal Acked-by: Masami Hiramatsu Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/kprobes.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 790a573bbe00..1cf8bca1ea86 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -2809,13 +2809,12 @@ static const struct file_operations fops_kp = { static int __init debugfs_kprobe_init(void) { struct dentry *dir; - unsigned int value = 1; dir = debugfs_create_dir("kprobes", NULL); debugfs_create_file("list", 0400, dir, NULL, &kprobes_fops); - debugfs_create_file("enabled", 0600, dir, &value, &fops_kp); + debugfs_create_file("enabled", 0600, dir, NULL, &fops_kp); debugfs_create_file("blacklist", 0400, dir, NULL, &kprobe_blacklist_fops); -- cgit v1.2.3 From 5d6de7d7fb4b0f752adff80ca003b4fd4b467b64 Mon Sep 17 00:00:00 2001 From: Punit Agrawal Date: Tue, 14 Sep 2021 23:38:46 +0900 Subject: kprobes: Use helper to parse boolean input from userspace The "enabled" file provides a debugfs interface to arm / disarm kprobes in the kernel. In order to parse the buffer containing the values written from userspace, the callback manually parses the user input to convert it to a boolean value. As taking a string value from userspace and converting it to boolean is a common operation, a helper kstrtobool_from_user() is already available in the kernel. Update the callback to use the common helper to parse the write buffer from userspace. Link: https://lkml.kernel.org/r/163163032637.489837.10678039554832855327.stgit@devnote2 Signed-off-by: Punit Agrawal Acked-by: Masami Hiramatsu Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/kprobes.c | 28 ++++++---------------------- 1 file changed, 6 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 1cf8bca1ea86..26fc9904c3b1 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -2770,30 +2770,14 @@ static ssize_t read_enabled_file_bool(struct file *file, static ssize_t write_enabled_file_bool(struct file *file, const char __user *user_buf, size_t count, loff_t *ppos) { - char buf[32]; - size_t buf_size; - int ret = 0; - - buf_size = min(count, (sizeof(buf)-1)); - if (copy_from_user(buf, user_buf, buf_size)) - return -EFAULT; + bool enable; + int ret; - buf[buf_size] = '\0'; - switch (buf[0]) { - case 'y': - case 'Y': - case '1': - ret = arm_all_kprobes(); - break; - case 'n': - case 'N': - case '0': - ret = disarm_all_kprobes(); - break; - default: - return -EINVAL; - } + ret = kstrtobool_from_user(user_buf, count, &enable); + if (ret) + return ret; + ret = enable ? arm_all_kprobes() : disarm_all_kprobes(); if (ret) return ret; -- cgit v1.2.3 From 02afb8d6048d6526619e6e2dcdc95ce9c2bdb52f Mon Sep 17 00:00:00 2001 From: Punit Agrawal Date: Tue, 14 Sep 2021 23:38:57 +0900 Subject: kprobe: Simplify prepare_kprobe() by dropping redundant version The function prepare_kprobe() is called during kprobe registration and is responsible for ensuring any architecture related preparation for the kprobe is done before returning. One of two versions of prepare_kprobe() is chosen depending on the availability of KPROBE_ON_FTRACE in the kernel configuration. Simplify the code by dropping the version when KPROBE_ON_FTRACE is not selected - instead relying on kprobe_ftrace() to return false when KPROBE_ON_FTRACE is not set. No functional change. Link: https://lkml.kernel.org/r/163163033696.489837.9264661820279300788.stgit@devnote2 Signed-off-by: Punit Agrawal Acked-by: Masami Hiramatsu Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/kprobes.c | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 26fc9904c3b1..cfa9d3c263eb 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1033,15 +1033,6 @@ static struct ftrace_ops kprobe_ipmodify_ops __read_mostly = { static int kprobe_ipmodify_enabled; static int kprobe_ftrace_enabled; -/* Must ensure p->addr is really on ftrace */ -static int prepare_kprobe(struct kprobe *p) -{ - if (!kprobe_ftrace(p)) - return arch_prepare_kprobe(p); - - return arch_prepare_kprobe_ftrace(p); -} - /* Caller must lock kprobe_mutex */ static int __arm_kprobe_ftrace(struct kprobe *p, struct ftrace_ops *ops, int *cnt) @@ -1113,11 +1104,6 @@ static int disarm_kprobe_ftrace(struct kprobe *p) ipmodify ? &kprobe_ipmodify_enabled : &kprobe_ftrace_enabled); } #else /* !CONFIG_KPROBES_ON_FTRACE */ -static inline int prepare_kprobe(struct kprobe *p) -{ - return arch_prepare_kprobe(p); -} - static inline int arm_kprobe_ftrace(struct kprobe *p) { return -ENODEV; @@ -1129,6 +1115,15 @@ static inline int disarm_kprobe_ftrace(struct kprobe *p) } #endif +static int prepare_kprobe(struct kprobe *p) +{ + /* Must ensure p->addr is really on ftrace */ + if (kprobe_ftrace(p)) + return arch_prepare_kprobe_ftrace(p); + + return arch_prepare_kprobe(p); +} + /* Arm a kprobe with text_mutex */ static int arm_kprobe(struct kprobe *kp) { -- cgit v1.2.3 From 4402deae8993fb0e25a19bb999b38df13e25a7e0 Mon Sep 17 00:00:00 2001 From: Punit Agrawal Date: Tue, 14 Sep 2021 23:39:16 +0900 Subject: kprobes: Make arch_check_ftrace_location static arch_check_ftrace_location() was introduced as a weak function in commit f7f242ff004499 ("kprobes: introduce weak arch_check_ftrace_location() helper function") to allow architectures to handle kprobes call site on their own. Recently, the only architecture (csky) to implement arch_check_ftrace_location() was migrated to using the common version. As a result, further cleanup the code to drop the weak attribute and rename the function to remove the architecture specific implementation. Link: https://lkml.kernel.org/r/163163035673.489837.2367816318195254104.stgit@devnote2 Signed-off-by: Punit Agrawal Acked-by: Masami Hiramatsu Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/kprobes.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index cfa9d3c263eb..30199bfcc74a 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1524,7 +1524,7 @@ static inline int warn_kprobe_rereg(struct kprobe *p) return ret; } -int __weak arch_check_ftrace_location(struct kprobe *p) +static int check_ftrace_location(struct kprobe *p) { unsigned long ftrace_addr; @@ -1547,7 +1547,7 @@ static int check_kprobe_address_safe(struct kprobe *p, { int ret; - ret = arch_check_ftrace_location(p); + ret = check_ftrace_location(p); if (ret) return ret; jump_label_lock(); -- cgit v1.2.3 From 9c89bb8e327203bc27e09ebd82d8f61ac2ae8b24 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 14 Sep 2021 23:39:25 +0900 Subject: kprobes: treewide: Cleanup the error messages for kprobes This clean up the error/notification messages in kprobes related code. Basically this defines 'pr_fmt()' macros for each files and update the messages which describes - what happened, - what is the kernel going to do or not do, - is the kernel fine, - what can the user do about it. Also, if the message is not needed (e.g. the function returns unique error code, or other error message is already shown.) remove it, and replace the message with WARN_*() macros if suitable. Link: https://lkml.kernel.org/r/163163036568.489837.14085396178727185469.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/kprobes.c | 36 ++++++++++++++++-------------------- 1 file changed, 16 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 30199bfcc74a..7663c8a51889 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -18,6 +18,9 @@ * and Prasanna S Panchamukhi * added function-return probes. */ + +#define pr_fmt(fmt) "kprobes: " fmt + #include #include #include @@ -892,7 +895,7 @@ static void optimize_all_kprobes(void) optimize_kprobe(p); } cpus_read_unlock(); - printk(KERN_INFO "Kprobes globally optimized\n"); + pr_info("kprobe jump-optimization is enabled. All kprobes are optimized if possible.\n"); out: mutex_unlock(&kprobe_mutex); } @@ -925,7 +928,7 @@ static void unoptimize_all_kprobes(void) /* Wait for unoptimizing completion */ wait_for_kprobe_optimizer(); - printk(KERN_INFO "Kprobes globally unoptimized\n"); + pr_info("kprobe jump-optimization is disabled. All kprobes are based on software breakpoint.\n"); } static DEFINE_MUTEX(kprobe_sysctl_mutex); @@ -1003,7 +1006,7 @@ static int reuse_unused_kprobe(struct kprobe *ap) * unregistered. * Thus there should be no chance to reuse unused kprobe. */ - printk(KERN_ERR "Error: There should be no unused kprobe here.\n"); + WARN_ON_ONCE(1); return -EINVAL; } @@ -1040,18 +1043,13 @@ static int __arm_kprobe_ftrace(struct kprobe *p, struct ftrace_ops *ops, int ret = 0; ret = ftrace_set_filter_ip(ops, (unsigned long)p->addr, 0, 0); - if (ret) { - pr_debug("Failed to arm kprobe-ftrace at %pS (%d)\n", - p->addr, ret); + if (WARN_ONCE(ret < 0, "Failed to arm kprobe-ftrace at %pS (error %d)\n", p->addr, ret)) return ret; - } if (*cnt == 0) { ret = register_ftrace_function(ops); - if (ret) { - pr_debug("Failed to init kprobe-ftrace (%d)\n", ret); + if (WARN(ret < 0, "Failed to register kprobe-ftrace (error %d)\n", ret)) goto err_ftrace; - } } (*cnt)++; @@ -1083,14 +1081,14 @@ static int __disarm_kprobe_ftrace(struct kprobe *p, struct ftrace_ops *ops, if (*cnt == 1) { ret = unregister_ftrace_function(ops); - if (WARN(ret < 0, "Failed to unregister kprobe-ftrace (%d)\n", ret)) + if (WARN(ret < 0, "Failed to unregister kprobe-ftrace (error %d)\n", ret)) return ret; } (*cnt)--; ret = ftrace_set_filter_ip(ops, (unsigned long)p->addr, 1, 0); - WARN_ONCE(ret < 0, "Failed to disarm kprobe-ftrace at %pS (%d)\n", + WARN_ONCE(ret < 0, "Failed to disarm kprobe-ftrace at %pS (error %d)\n", p->addr, ret); return ret; } @@ -1880,7 +1878,7 @@ unsigned long __kretprobe_trampoline_handler(struct pt_regs *regs, node = node->next; } - pr_err("Oops! Kretprobe fails to find correct return address.\n"); + pr_err("kretprobe: Return address not found, not execute handler. Maybe there is a bug in the kernel.\n"); BUG_ON(1); found: @@ -2209,8 +2207,7 @@ EXPORT_SYMBOL_GPL(enable_kprobe); /* Caller must NOT call this in usual path. This is only for critical case */ void dump_kprobe(struct kprobe *kp) { - pr_err("Dumping kprobe:\n"); - pr_err("Name: %s\nOffset: %x\nAddress: %pS\n", + pr_err("Dump kprobe:\n.symbol_name = %s, .offset = %x, .addr = %pS\n", kp->symbol_name, kp->offset, kp->addr); } NOKPROBE_SYMBOL(dump_kprobe); @@ -2473,8 +2470,7 @@ static int __init init_kprobes(void) err = populate_kprobe_blacklist(__start_kprobe_blacklist, __stop_kprobe_blacklist); if (err) { - pr_err("kprobes: failed to populate blacklist: %d\n", err); - pr_err("Please take care of using kprobes.\n"); + pr_err("Failed to populate blacklist (error %d), kprobes not restricted, be careful using them!\n", err); } if (kretprobe_blacklist_size) { @@ -2483,7 +2479,7 @@ static int __init init_kprobes(void) kretprobe_blacklist[i].addr = kprobe_lookup_name(kretprobe_blacklist[i].name, 0); if (!kretprobe_blacklist[i].addr) - printk("kretprobe: lookup failed: %s\n", + pr_err("Failed to lookup symbol '%s' for kretprobe blacklist. Maybe the target function is removed or renamed.\n", kretprobe_blacklist[i].name); } } @@ -2687,7 +2683,7 @@ static int arm_all_kprobes(void) } if (errors) - pr_warn("Kprobes globally enabled, but failed to arm %d out of %d probes\n", + pr_warn("Kprobes globally enabled, but failed to enable %d out of %d probes. Please check which kprobes are kept disabled via debugfs.\n", errors, total); else pr_info("Kprobes globally enabled\n"); @@ -2730,7 +2726,7 @@ static int disarm_all_kprobes(void) } if (errors) - pr_warn("Kprobes globally disabled, but failed to disarm %d out of %d probes\n", + pr_warn("Kprobes globally disabled, but failed to disable %d out of %d probes. Please check which kprobes are kept enabled via debugfs.\n", errors, total); else pr_info("Kprobes globally disabled\n"); -- cgit v1.2.3 From 223a76b268c9cfa265d454879ae09e2c9c808f87 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 14 Sep 2021 23:39:34 +0900 Subject: kprobes: Fix coding style issues Fix coding style issues reported by checkpatch.pl and update comments to quote variable names and add "()" to function name. One TODO comment in __disarm_kprobe() is removed because it has been done by following commit. Link: https://lkml.kernel.org/r/163163037468.489837.4282347782492003960.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/kprobes.c | 236 ++++++++++++++++++++++++++++--------------------------- 1 file changed, 122 insertions(+), 114 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 7663c8a51889..ad39eeaa4371 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1,7 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-or-later /* * Kernel Probes (KProbes) - * kernel/kprobes.c * * Copyright (C) IBM Corporation, 2002, 2004 * @@ -52,18 +51,18 @@ static int kprobes_initialized; /* kprobe_table can be accessed by - * - Normal hlist traversal and RCU add/del under kprobe_mutex is held. + * - Normal hlist traversal and RCU add/del under 'kprobe_mutex' is held. * Or * - RCU hlist traversal under disabling preempt (breakpoint handlers) */ static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE]; -/* NOTE: change this value only with kprobe_mutex held */ +/* NOTE: change this value only with 'kprobe_mutex' held */ static bool kprobes_all_disarmed; -/* This protects kprobe_table and optimizing_list */ +/* This protects 'kprobe_table' and 'optimizing_list' */ static DEFINE_MUTEX(kprobe_mutex); -static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; +static DEFINE_PER_CPU(struct kprobe *, kprobe_instance); kprobe_opcode_t * __weak kprobe_lookup_name(const char *name, unsigned int __unused) @@ -71,12 +70,15 @@ kprobe_opcode_t * __weak kprobe_lookup_name(const char *name, return ((kprobe_opcode_t *)(kallsyms_lookup_name(name))); } -/* Blacklist -- list of struct kprobe_blacklist_entry */ +/* + * Blacklist -- list of 'struct kprobe_blacklist_entry' to store info where + * kprobes can not probe. + */ static LIST_HEAD(kprobe_blacklist); #ifdef __ARCH_WANT_KPROBES_INSN_SLOT /* - * kprobe->ainsn.insn points to the copy of the instruction to be + * 'kprobe::ainsn.insn' points to the copy of the instruction to be * single-stepped. x86_64, POWER4 and above have no-exec support and * stepping on the instruction on a vmalloced/kmalloced/data page * is a recipe for disaster @@ -107,6 +109,12 @@ enum kprobe_slot_state { void __weak *alloc_insn_page(void) { + /* + * Use module_alloc() so this page is within +/- 2GB of where the + * kernel image and loaded module images reside. This is required + * for most of the architectures. + * (e.g. x86-64 needs this to handle the %rip-relative fixups.) + */ return module_alloc(PAGE_SIZE); } @@ -142,6 +150,7 @@ kprobe_opcode_t *__get_insn_slot(struct kprobe_insn_cache *c) list_for_each_entry_rcu(kip, &c->pages, list) { if (kip->nused < slots_per_page(c)) { int i; + for (i = 0; i < slots_per_page(c); i++) { if (kip->slot_used[i] == SLOT_CLEAN) { kip->slot_used[i] = SLOT_USED; @@ -167,11 +176,6 @@ kprobe_opcode_t *__get_insn_slot(struct kprobe_insn_cache *c) if (!kip) goto out; - /* - * Use module_alloc so this page is within +/- 2GB of where the - * kernel image and loaded module images reside. This is required - * so x86_64 can correctly handle the %rip-relative fixups. - */ kip->insns = c->alloc(); if (!kip->insns) { kfree(kip); @@ -233,6 +237,7 @@ static int collect_garbage_slots(struct kprobe_insn_cache *c) list_for_each_entry_safe(kip, next, &c->pages, list) { int i; + if (kip->ngarbage == 0) continue; kip->ngarbage = 0; /* we will collect all garbages */ @@ -313,7 +318,7 @@ int kprobe_cache_get_kallsym(struct kprobe_insn_cache *c, unsigned int *symnum, list_for_each_entry_rcu(kip, &c->pages, list) { if ((*symnum)--) continue; - strlcpy(sym, c->sym, KSYM_NAME_LEN); + strscpy(sym, c->sym, KSYM_NAME_LEN); *type = 't'; *value = (unsigned long)kip->insns; ret = 0; @@ -361,9 +366,9 @@ static inline void reset_kprobe_instance(void) /* * This routine is called either: - * - under the kprobe_mutex - during kprobe_[un]register() - * OR - * - with preemption disabled - from arch/xxx/kernel/kprobes.c + * - under the 'kprobe_mutex' - during kprobe_[un]register(). + * OR + * - with preemption disabled - from architecture specific code. */ struct kprobe *get_kprobe(void *addr) { @@ -383,22 +388,20 @@ NOKPROBE_SYMBOL(get_kprobe); static int aggr_pre_handler(struct kprobe *p, struct pt_regs *regs); -/* Return true if the kprobe is an aggregator */ +/* Return true if 'p' is an aggregator */ static inline int kprobe_aggrprobe(struct kprobe *p) { return p->pre_handler == aggr_pre_handler; } -/* Return true(!0) if the kprobe is unused */ +/* Return true if 'p' is unused */ static inline int kprobe_unused(struct kprobe *p) { return kprobe_aggrprobe(p) && kprobe_disabled(p) && list_empty(&p->list); } -/* - * Keep all fields in the kprobe consistent - */ +/* Keep all fields in the kprobe consistent. */ static inline void copy_kprobe(struct kprobe *ap, struct kprobe *p) { memcpy(&p->opcode, &ap->opcode, sizeof(kprobe_opcode_t)); @@ -406,11 +409,11 @@ static inline void copy_kprobe(struct kprobe *ap, struct kprobe *p) } #ifdef CONFIG_OPTPROBES -/* NOTE: change this value only with kprobe_mutex held */ +/* NOTE: This is protected by 'kprobe_mutex'. */ static bool kprobes_allow_optimization; /* - * Call all pre_handler on the list, but ignores its return value. + * Call all 'kprobe::pre_handler' on the list, but ignores its return value. * This must be called from arch-dep optimized caller. */ void opt_pre_handler(struct kprobe *p, struct pt_regs *regs) @@ -438,7 +441,7 @@ static void free_aggr_kprobe(struct kprobe *p) kfree(op); } -/* Return true(!0) if the kprobe is ready for optimization. */ +/* Return true if the kprobe is ready for optimization. */ static inline int kprobe_optready(struct kprobe *p) { struct optimized_kprobe *op; @@ -451,7 +454,7 @@ static inline int kprobe_optready(struct kprobe *p) return 0; } -/* Return true(!0) if the kprobe is disarmed. Note: p must be on hash list */ +/* Return true if the kprobe is disarmed. Note: p must be on hash list */ static inline int kprobe_disarmed(struct kprobe *p) { struct optimized_kprobe *op; @@ -465,7 +468,7 @@ static inline int kprobe_disarmed(struct kprobe *p) return kprobe_disabled(p) && list_empty(&op->list); } -/* Return true(!0) if the probe is queued on (un)optimizing lists */ +/* Return true if the probe is queued on (un)optimizing lists */ static int kprobe_queued(struct kprobe *p) { struct optimized_kprobe *op; @@ -480,7 +483,7 @@ static int kprobe_queued(struct kprobe *p) /* * Return an optimized kprobe whose optimizing code replaces - * instructions including addr (exclude breakpoint). + * instructions including 'addr' (exclude breakpoint). */ static struct kprobe *get_optimized_kprobe(unsigned long addr) { @@ -501,7 +504,7 @@ static struct kprobe *get_optimized_kprobe(unsigned long addr) return NULL; } -/* Optimization staging list, protected by kprobe_mutex */ +/* Optimization staging list, protected by 'kprobe_mutex' */ static LIST_HEAD(optimizing_list); static LIST_HEAD(unoptimizing_list); static LIST_HEAD(freeing_list); @@ -512,20 +515,20 @@ static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer); /* * Optimize (replace a breakpoint with a jump) kprobes listed on - * optimizing_list. + * 'optimizing_list'. */ static void do_optimize_kprobes(void) { lockdep_assert_held(&text_mutex); /* - * The optimization/unoptimization refers online_cpus via - * stop_machine() and cpu-hotplug modifies online_cpus. - * And same time, text_mutex will be held in cpu-hotplug and here. - * This combination can cause a deadlock (cpu-hotplug try to lock - * text_mutex but stop_machine can not be done because online_cpus - * has been changed) - * To avoid this deadlock, caller must have locked cpu hotplug - * for preventing cpu-hotplug outside of text_mutex locking. + * The optimization/unoptimization refers 'online_cpus' via + * stop_machine() and cpu-hotplug modifies the 'online_cpus'. + * And same time, 'text_mutex' will be held in cpu-hotplug and here. + * This combination can cause a deadlock (cpu-hotplug tries to lock + * 'text_mutex' but stop_machine() can not be done because + * the 'online_cpus' has been changed) + * To avoid this deadlock, caller must have locked cpu-hotplug + * for preventing cpu-hotplug outside of 'text_mutex' locking. */ lockdep_assert_cpus_held(); @@ -539,7 +542,7 @@ static void do_optimize_kprobes(void) /* * Unoptimize (replace a jump with a breakpoint and remove the breakpoint - * if need) kprobes listed on unoptimizing_list. + * if need) kprobes listed on 'unoptimizing_list'. */ static void do_unoptimize_kprobes(void) { @@ -554,7 +557,7 @@ static void do_unoptimize_kprobes(void) return; arch_unoptimize_kprobes(&unoptimizing_list, &freeing_list); - /* Loop free_list for disarming */ + /* Loop on 'freeing_list' for disarming */ list_for_each_entry_safe(op, tmp, &freeing_list, list) { /* Switching from detour code to origin */ op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED; @@ -565,7 +568,7 @@ static void do_unoptimize_kprobes(void) /* * Remove unused probes from hash list. After waiting * for synchronization, these probes are reclaimed. - * (reclaiming is done by do_free_cleaned_kprobes.) + * (reclaiming is done by do_free_cleaned_kprobes().) */ hlist_del_rcu(&op->kp.hlist); } else @@ -573,7 +576,7 @@ static void do_unoptimize_kprobes(void) } } -/* Reclaim all kprobes on the free_list */ +/* Reclaim all kprobes on the 'freeing_list' */ static void do_free_cleaned_kprobes(void) { struct optimized_kprobe *op, *tmp; @@ -645,9 +648,9 @@ void wait_for_kprobe_optimizer(void) while (!list_empty(&optimizing_list) || !list_empty(&unoptimizing_list)) { mutex_unlock(&kprobe_mutex); - /* this will also make optimizing_work execute immmediately */ + /* This will also make 'optimizing_work' execute immmediately */ flush_delayed_work(&optimizing_work); - /* @optimizing_work might not have been queued yet, relax */ + /* 'optimizing_work' might not have been queued yet, relax */ cpu_relax(); mutex_lock(&kprobe_mutex); @@ -678,7 +681,7 @@ static void optimize_kprobe(struct kprobe *p) (kprobe_disabled(p) || kprobes_all_disarmed)) return; - /* kprobes with post_handler can not be optimized */ + /* kprobes with 'post_handler' can not be optimized */ if (p->post_handler) return; @@ -698,7 +701,10 @@ static void optimize_kprobe(struct kprobe *p) } op->kp.flags |= KPROBE_FLAG_OPTIMIZED; - /* On unoptimizing/optimizing_list, op must have OPTIMIZED flag */ + /* + * On the 'unoptimizing_list' and 'optimizing_list', + * 'op' must have OPTIMIZED flag + */ if (WARN_ON_ONCE(!list_empty(&op->list))) return; @@ -768,7 +774,7 @@ static int reuse_unused_kprobe(struct kprobe *ap) WARN_ON_ONCE(list_empty(&op->list)); /* Enable the probe again */ ap->flags &= ~KPROBE_FLAG_DISABLED; - /* Optimize it again (remove from op->list) */ + /* Optimize it again. (remove from 'op->list') */ if (!kprobe_optready(ap)) return -EINVAL; @@ -818,7 +824,7 @@ static void prepare_optimized_kprobe(struct kprobe *p) __prepare_optimized_kprobe(op, p); } -/* Allocate new optimized_kprobe and try to prepare optimized instructions */ +/* Allocate new optimized_kprobe and try to prepare optimized instructions. */ static struct kprobe *alloc_aggr_kprobe(struct kprobe *p) { struct optimized_kprobe *op; @@ -837,19 +843,19 @@ static struct kprobe *alloc_aggr_kprobe(struct kprobe *p) static void init_aggr_kprobe(struct kprobe *ap, struct kprobe *p); /* - * Prepare an optimized_kprobe and optimize it - * NOTE: p must be a normal registered kprobe + * Prepare an optimized_kprobe and optimize it. + * NOTE: 'p' must be a normal registered kprobe. */ static void try_to_optimize_kprobe(struct kprobe *p) { struct kprobe *ap; struct optimized_kprobe *op; - /* Impossible to optimize ftrace-based kprobe */ + /* Impossible to optimize ftrace-based kprobe. */ if (kprobe_ftrace(p)) return; - /* For preparing optimization, jump_label_text_reserved() is called */ + /* For preparing optimization, jump_label_text_reserved() is called. */ cpus_read_lock(); jump_label_lock(); mutex_lock(&text_mutex); @@ -860,14 +866,14 @@ static void try_to_optimize_kprobe(struct kprobe *p) op = container_of(ap, struct optimized_kprobe, kp); if (!arch_prepared_optinsn(&op->optinsn)) { - /* If failed to setup optimizing, fallback to kprobe */ + /* If failed to setup optimizing, fallback to kprobe. */ arch_remove_optimized_kprobe(op); kfree(op); goto out; } init_aggr_kprobe(ap, p); - optimize_kprobe(ap); /* This just kicks optimizer thread */ + optimize_kprobe(ap); /* This just kicks optimizer thread. */ out: mutex_unlock(&text_mutex); @@ -882,7 +888,7 @@ static void optimize_all_kprobes(void) unsigned int i; mutex_lock(&kprobe_mutex); - /* If optimization is already allowed, just return */ + /* If optimization is already allowed, just return. */ if (kprobes_allow_optimization) goto out; @@ -908,7 +914,7 @@ static void unoptimize_all_kprobes(void) unsigned int i; mutex_lock(&kprobe_mutex); - /* If optimization is already prohibited, just return */ + /* If optimization is already prohibited, just return. */ if (!kprobes_allow_optimization) { mutex_unlock(&kprobe_mutex); return; @@ -926,7 +932,7 @@ static void unoptimize_all_kprobes(void) cpus_read_unlock(); mutex_unlock(&kprobe_mutex); - /* Wait for unoptimizing completion */ + /* Wait for unoptimizing completion. */ wait_for_kprobe_optimizer(); pr_info("kprobe jump-optimization is disabled. All kprobes are based on software breakpoint.\n"); } @@ -953,12 +959,12 @@ int proc_kprobes_optimization_handler(struct ctl_table *table, int write, } #endif /* CONFIG_SYSCTL */ -/* Put a breakpoint for a probe. Must be called with text_mutex locked */ +/* Put a breakpoint for a probe. Must be called with 'text_mutex' locked. */ static void __arm_kprobe(struct kprobe *p) { struct kprobe *_p; - /* Check collision with other optimized kprobes */ + /* Find the overlapping optimized kprobes. */ _p = get_optimized_kprobe((unsigned long)p->addr); if (unlikely(_p)) /* Fallback to unoptimized kprobe */ @@ -968,7 +974,7 @@ static void __arm_kprobe(struct kprobe *p) optimize_kprobe(p); /* Try to optimize (add kprobe to a list) */ } -/* Remove the breakpoint of a probe. Must be called with text_mutex locked */ +/* Remove the breakpoint of a probe. Must be called with 'text_mutex' locked. */ static void __disarm_kprobe(struct kprobe *p, bool reopt) { struct kprobe *_p; @@ -978,12 +984,17 @@ static void __disarm_kprobe(struct kprobe *p, bool reopt) if (!kprobe_queued(p)) { arch_disarm_kprobe(p); - /* If another kprobe was blocked, optimize it. */ + /* If another kprobe was blocked, re-optimize it. */ _p = get_optimized_kprobe((unsigned long)p->addr); if (unlikely(_p) && reopt) optimize_kprobe(_p); } - /* TODO: reoptimize others after unoptimized this probe */ + /* + * TODO: Since unoptimization and real disarming will be done by + * the worker thread, we can not check whether another probe are + * unoptimized because of this probe here. It should be re-optimized + * by the worker thread. + */ } #else /* !CONFIG_OPTPROBES */ @@ -1036,7 +1047,7 @@ static struct ftrace_ops kprobe_ipmodify_ops __read_mostly = { static int kprobe_ipmodify_enabled; static int kprobe_ftrace_enabled; -/* Caller must lock kprobe_mutex */ +/* Caller must lock 'kprobe_mutex' */ static int __arm_kprobe_ftrace(struct kprobe *p, struct ftrace_ops *ops, int *cnt) { @@ -1073,7 +1084,7 @@ static int arm_kprobe_ftrace(struct kprobe *p) ipmodify ? &kprobe_ipmodify_enabled : &kprobe_ftrace_enabled); } -/* Caller must lock kprobe_mutex */ +/* Caller must lock 'kprobe_mutex'. */ static int __disarm_kprobe_ftrace(struct kprobe *p, struct ftrace_ops *ops, int *cnt) { @@ -1122,7 +1133,7 @@ static int prepare_kprobe(struct kprobe *p) return arch_prepare_kprobe(p); } -/* Arm a kprobe with text_mutex */ +/* Arm a kprobe with 'text_mutex'. */ static int arm_kprobe(struct kprobe *kp) { if (unlikely(kprobe_ftrace(kp))) @@ -1137,7 +1148,7 @@ static int arm_kprobe(struct kprobe *kp) return 0; } -/* Disarm a kprobe with text_mutex */ +/* Disarm a kprobe with 'text_mutex'. */ static int disarm_kprobe(struct kprobe *kp, bool reopt) { if (unlikely(kprobe_ftrace(kp))) @@ -1187,17 +1198,17 @@ static void aggr_post_handler(struct kprobe *p, struct pt_regs *regs, } NOKPROBE_SYMBOL(aggr_post_handler); -/* Walks the list and increments nmissed count for multiprobe case */ +/* Walks the list and increments 'nmissed' if 'p' has child probes. */ void kprobes_inc_nmissed_count(struct kprobe *p) { struct kprobe *kp; + if (!kprobe_aggrprobe(p)) { p->nmissed++; } else { list_for_each_entry_rcu(kp, &p->list, list) kp->nmissed++; } - return; } NOKPROBE_SYMBOL(kprobes_inc_nmissed_count); @@ -1215,9 +1226,9 @@ static void recycle_rp_inst(struct kretprobe_instance *ri) { struct kretprobe *rp = get_kretprobe(ri); - if (likely(rp)) { + if (likely(rp)) freelist_add(&ri->freelist, &rp->freelist); - } else + else call_rcu(&ri->rcu, free_rp_inst_rcu); } NOKPROBE_SYMBOL(recycle_rp_inst); @@ -1243,8 +1254,8 @@ void kprobe_busy_end(void) } /* - * This function is called from finish_task_switch when task tk becomes dead, - * so that we can recycle any function-return probe instances associated + * This function is called from finish_task_switch() when task 'tk' becomes + * dead, so that we can recycle any kretprobe instances associated * with this task. These left over instances represent probed functions * that have been called but will never return. */ @@ -1292,7 +1303,7 @@ static inline void free_rp_inst(struct kretprobe *rp) } } -/* Add the new probe to ap->list */ +/* Add the new probe to 'ap->list'. */ static int add_new_kprobe(struct kprobe *ap, struct kprobe *p) { if (p->post_handler) @@ -1306,12 +1317,12 @@ static int add_new_kprobe(struct kprobe *ap, struct kprobe *p) } /* - * Fill in the required fields of the "manager kprobe". Replace the - * earlier kprobe in the hlist with the manager kprobe + * Fill in the required fields of the aggregator kprobe. Replace the + * earlier kprobe in the hlist with the aggregator kprobe. */ static void init_aggr_kprobe(struct kprobe *ap, struct kprobe *p) { - /* Copy p's insn slot to ap */ + /* Copy the insn slot of 'p' to 'ap'. */ copy_kprobe(p, ap); flush_insn_slot(ap); ap->addr = p->addr; @@ -1329,8 +1340,7 @@ static void init_aggr_kprobe(struct kprobe *ap, struct kprobe *p) } /* - * This is the second or subsequent kprobe at the address - handle - * the intricacies + * This registers the second or subsequent kprobe at the same address. */ static int register_aggr_kprobe(struct kprobe *orig_p, struct kprobe *p) { @@ -1344,7 +1354,7 @@ static int register_aggr_kprobe(struct kprobe *orig_p, struct kprobe *p) mutex_lock(&text_mutex); if (!kprobe_aggrprobe(orig_p)) { - /* If orig_p is not an aggr_kprobe, create new aggr_kprobe. */ + /* If 'orig_p' is not an 'aggr_kprobe', create new one. */ ap = alloc_aggr_kprobe(orig_p); if (!ap) { ret = -ENOMEM; @@ -1369,8 +1379,8 @@ static int register_aggr_kprobe(struct kprobe *orig_p, struct kprobe *p) if (ret) /* * Even if fail to allocate new slot, don't need to - * free aggr_probe. It will be used next time, or - * freed by unregister_kprobe. + * free the 'ap'. It will be used next time, or + * freed by unregister_kprobe(). */ goto out; @@ -1385,7 +1395,7 @@ static int register_aggr_kprobe(struct kprobe *orig_p, struct kprobe *p) | KPROBE_FLAG_DISABLED; } - /* Copy ap's insn slot to p */ + /* Copy the insn slot of 'p' to 'ap'. */ copy_kprobe(ap, p); ret = add_new_kprobe(ap, p); @@ -1411,7 +1421,7 @@ out: bool __weak arch_within_kprobe_blacklist(unsigned long addr) { - /* The __kprobes marked functions and entry code must not be probed */ + /* The '__kprobes' functions and entry code must not be probed. */ return addr >= (unsigned long)__kprobes_text_start && addr < (unsigned long)__kprobes_text_end; } @@ -1423,8 +1433,8 @@ static bool __within_kprobe_blacklist(unsigned long addr) if (arch_within_kprobe_blacklist(addr)) return true; /* - * If there exists a kprobe_blacklist, verify and - * fail any probe registration in the prohibited area + * If 'kprobe_blacklist' is defined, check the address and + * reject any probe registration in the prohibited area. */ list_for_each_entry(ent, &kprobe_blacklist, list) { if (addr >= ent->start_addr && addr < ent->end_addr) @@ -1454,7 +1464,7 @@ bool within_kprobe_blacklist(unsigned long addr) } /* - * If we have a symbol_name argument, look it up and add the offset field + * If 'symbol_name' is specified, look it up and add the 'offset' * to it. This way, we can specify a relative address to a symbol. * This returns encoded errors if it fails to look up symbol or invalid * combination of parameters. @@ -1484,7 +1494,10 @@ static kprobe_opcode_t *kprobe_addr(struct kprobe *p) return _kprobe_addr(p->addr, p->symbol_name, p->offset); } -/* Check passed kprobe is valid and return kprobe in kprobe_table. */ +/* + * Check the 'p' is valid and return the aggregator kprobe + * at the same address. + */ static struct kprobe *__get_valid_kprobe(struct kprobe *p) { struct kprobe *ap, *list_p; @@ -1561,7 +1574,7 @@ static int check_kprobe_address_safe(struct kprobe *p, goto out; } - /* Check if are we probing a module */ + /* Check if 'p' is probing a module. */ *probed_mod = __module_text_address((unsigned long) p->addr); if (*probed_mod) { /* @@ -1574,7 +1587,7 @@ static int check_kprobe_address_safe(struct kprobe *p, } /* - * If the module freed .init.text, we couldn't insert + * If the module freed '.init.text', we couldn't insert * kprobes in there. */ if (within_module_init((unsigned long)p->addr, *probed_mod) && @@ -1621,7 +1634,7 @@ int register_kprobe(struct kprobe *p) old_p = get_kprobe(p->addr); if (old_p) { - /* Since this may unoptimize old_p, locking text_mutex. */ + /* Since this may unoptimize 'old_p', locking 'text_mutex'. */ ret = register_aggr_kprobe(old_p, p); goto out; } @@ -1660,7 +1673,7 @@ out: } EXPORT_SYMBOL_GPL(register_kprobe); -/* Check if all probes on the aggrprobe are disabled */ +/* Check if all probes on the 'ap' are disabled. */ static int aggr_kprobe_disabled(struct kprobe *ap) { struct kprobe *kp; @@ -1670,15 +1683,15 @@ static int aggr_kprobe_disabled(struct kprobe *ap) list_for_each_entry(kp, &ap->list, list) if (!kprobe_disabled(kp)) /* - * There is an active probe on the list. - * We can't disable this ap. + * Since there is an active probe on the list, + * we can't disable this 'ap'. */ return 0; return 1; } -/* Disable one kprobe: Make sure called under kprobe_mutex is locked */ +/* Disable one kprobe: Make sure called under 'kprobe_mutex' is locked. */ static struct kprobe *__disable_kprobe(struct kprobe *p) { struct kprobe *orig_p; @@ -1697,7 +1710,7 @@ static struct kprobe *__disable_kprobe(struct kprobe *p) /* Try to disarm and disable this/parent probe */ if (p == orig_p || aggr_kprobe_disabled(orig_p)) { /* - * If kprobes_all_disarmed is set, orig_p + * If 'kprobes_all_disarmed' is set, 'orig_p' * should have already been disarmed, so * skip unneed disarming process. */ @@ -1984,7 +1997,7 @@ int register_kretprobe(struct kretprobe *rp) if (ret) return ret; - /* If only rp->kp.addr is specified, check reregistering kprobes */ + /* If only 'rp->kp.addr' is specified, check reregistering kprobes */ if (rp->kp.addr && warn_kprobe_rereg(&rp->kp)) return -EINVAL; @@ -2089,13 +2102,13 @@ EXPORT_SYMBOL_GPL(unregister_kretprobes); #else /* CONFIG_KRETPROBES */ int register_kretprobe(struct kretprobe *rp) { - return -ENOSYS; + return -EOPNOTSUPP; } EXPORT_SYMBOL_GPL(register_kretprobe); int register_kretprobes(struct kretprobe **rps, int num) { - return -ENOSYS; + return -EOPNOTSUPP; } EXPORT_SYMBOL_GPL(register_kretprobes); @@ -2144,7 +2157,7 @@ static void kill_kprobe(struct kprobe *p) /* * The module is going away. We should disarm the kprobe which * is using ftrace, because ftrace framework is still available at - * MODULE_STATE_GOING notification. + * 'MODULE_STATE_GOING' notification. */ if (kprobe_ftrace(p) && !kprobe_disabled(p) && !kprobes_all_disarmed) disarm_kprobe_ftrace(p); @@ -2317,13 +2330,13 @@ static int __init populate_kprobe_blacklist(unsigned long *start, return ret; } - /* Symbols in __kprobes_text are blacklisted */ + /* Symbols in '__kprobes_text' are blacklisted */ ret = kprobe_add_area_blacklist((unsigned long)__kprobes_text_start, (unsigned long)__kprobes_text_end); if (ret) return ret; - /* Symbols in noinstr section are blacklisted */ + /* Symbols in 'noinstr' section are blacklisted */ ret = kprobe_add_area_blacklist((unsigned long)__noinstr_text_start, (unsigned long)__noinstr_text_end); @@ -2395,9 +2408,9 @@ static int kprobes_module_callback(struct notifier_block *nb, return NOTIFY_DONE; /* - * When MODULE_STATE_GOING was notified, both of module .text and - * .init.text sections would be freed. When MODULE_STATE_LIVE was - * notified, only .init.text section would be freed. We need to + * When 'MODULE_STATE_GOING' was notified, both of module '.text' and + * '.init.text' sections would be freed. When 'MODULE_STATE_LIVE' was + * notified, only '.init.text' section would be freed. We need to * disable kprobes which have been inserted in the sections. */ mutex_lock(&kprobe_mutex); @@ -2414,9 +2427,9 @@ static int kprobes_module_callback(struct notifier_block *nb, * * Note, this will also move any optimized probes * that are pending to be removed from their - * corresponding lists to the freeing_list and + * corresponding lists to the 'freeing_list' and * will not be touched by the delayed - * kprobe_optimizer work handler. + * kprobe_optimizer() work handler. */ kill_kprobe(p); } @@ -2432,10 +2445,6 @@ static struct notifier_block kprobe_module_nb = { .priority = 0 }; -/* Markers of _kprobe_blacklist section */ -extern unsigned long __start_kprobe_blacklist[]; -extern unsigned long __stop_kprobe_blacklist[]; - void kprobe_free_init_mem(void) { void *start = (void *)(&__init_begin); @@ -2446,7 +2455,7 @@ void kprobe_free_init_mem(void) mutex_lock(&kprobe_mutex); - /* Kill all kprobes on initmem */ + /* Kill all kprobes on initmem because the target code has been freed. */ for (i = 0; i < KPROBE_TABLE_SIZE; i++) { head = &kprobe_table[i]; hlist_for_each_entry(p, head, hlist) { @@ -2469,9 +2478,8 @@ static int __init init_kprobes(void) err = populate_kprobe_blacklist(__start_kprobe_blacklist, __stop_kprobe_blacklist); - if (err) { + if (err) pr_err("Failed to populate blacklist (error %d), kprobes not restricted, be careful using them!\n", err); - } if (kretprobe_blacklist_size) { /* lookup the function address from its name */ @@ -2488,7 +2496,7 @@ static int __init init_kprobes(void) kprobes_all_disarmed = false; #if defined(CONFIG_OPTPROBES) && defined(__ARCH_WANT_KPROBES_INSN_SLOT) - /* Init kprobe_optinsn_slots for allocation */ + /* Init 'kprobe_optinsn_slots' for allocation */ kprobe_optinsn_slots.insn_size = MAX_OPTINSN_SIZE; #endif @@ -2622,7 +2630,7 @@ static int kprobe_blacklist_seq_show(struct seq_file *m, void *v) list_entry(v, struct kprobe_blacklist_entry, list); /* - * If /proc/kallsyms is not showing kernel address, we won't + * If '/proc/kallsyms' is not showing kernel address, we won't * show them here either. */ if (!kallsyms_show_value(m->file->f_cred)) -- cgit v1.2.3 From 57d4e31780106ad97516bfd197fac47a81482353 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 14 Sep 2021 23:39:55 +0900 Subject: kprobes: Add assertions for required lock Add assertions for required locks instead of comment it so that the lockdep can inspect locks automatically. Link: https://lkml.kernel.org/r/163163039572.489837.18011973177537476885.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/kprobes.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index ad39eeaa4371..ec3d97fd8c6b 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -959,11 +959,13 @@ int proc_kprobes_optimization_handler(struct ctl_table *table, int write, } #endif /* CONFIG_SYSCTL */ -/* Put a breakpoint for a probe. Must be called with 'text_mutex' locked. */ +/* Put a breakpoint for a probe. */ static void __arm_kprobe(struct kprobe *p) { struct kprobe *_p; + lockdep_assert_held(&text_mutex); + /* Find the overlapping optimized kprobes. */ _p = get_optimized_kprobe((unsigned long)p->addr); if (unlikely(_p)) @@ -974,11 +976,13 @@ static void __arm_kprobe(struct kprobe *p) optimize_kprobe(p); /* Try to optimize (add kprobe to a list) */ } -/* Remove the breakpoint of a probe. Must be called with 'text_mutex' locked. */ +/* Remove the breakpoint of a probe. */ static void __disarm_kprobe(struct kprobe *p, bool reopt) { struct kprobe *_p; + lockdep_assert_held(&text_mutex); + /* Try to unoptimize */ unoptimize_kprobe(p, kprobes_all_disarmed); @@ -1047,12 +1051,13 @@ static struct ftrace_ops kprobe_ipmodify_ops __read_mostly = { static int kprobe_ipmodify_enabled; static int kprobe_ftrace_enabled; -/* Caller must lock 'kprobe_mutex' */ static int __arm_kprobe_ftrace(struct kprobe *p, struct ftrace_ops *ops, int *cnt) { int ret = 0; + lockdep_assert_held(&kprobe_mutex); + ret = ftrace_set_filter_ip(ops, (unsigned long)p->addr, 0, 0); if (WARN_ONCE(ret < 0, "Failed to arm kprobe-ftrace at %pS (error %d)\n", p->addr, ret)) return ret; @@ -1084,12 +1089,13 @@ static int arm_kprobe_ftrace(struct kprobe *p) ipmodify ? &kprobe_ipmodify_enabled : &kprobe_ftrace_enabled); } -/* Caller must lock 'kprobe_mutex'. */ static int __disarm_kprobe_ftrace(struct kprobe *p, struct ftrace_ops *ops, int *cnt) { int ret = 0; + lockdep_assert_held(&kprobe_mutex); + if (*cnt == 1) { ret = unregister_ftrace_function(ops); if (WARN(ret < 0, "Failed to unregister kprobe-ftrace (error %d)\n", ret)) @@ -1133,7 +1139,6 @@ static int prepare_kprobe(struct kprobe *p) return arch_prepare_kprobe(p); } -/* Arm a kprobe with 'text_mutex'. */ static int arm_kprobe(struct kprobe *kp) { if (unlikely(kprobe_ftrace(kp))) @@ -1148,7 +1153,6 @@ static int arm_kprobe(struct kprobe *kp) return 0; } -/* Disarm a kprobe with 'text_mutex'. */ static int disarm_kprobe(struct kprobe *kp, bool reopt) { if (unlikely(kprobe_ftrace(kp))) @@ -1691,12 +1695,13 @@ static int aggr_kprobe_disabled(struct kprobe *ap) return 1; } -/* Disable one kprobe: Make sure called under 'kprobe_mutex' is locked. */ static struct kprobe *__disable_kprobe(struct kprobe *p) { struct kprobe *orig_p; int ret; + lockdep_assert_held(&kprobe_mutex); + /* Get an original kprobe for return */ orig_p = __get_valid_kprobe(p); if (unlikely(orig_p == NULL)) -- cgit v1.2.3 From c42421e205fc2570a4d019184ea7d6c382c93f4c Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 14 Sep 2021 23:40:07 +0900 Subject: kprobes: treewide: Use 'kprobe_opcode_t *' for the code address in get_optimized_kprobe() Since get_optimized_kprobe() is only used inside kprobes, it doesn't need to use 'unsigned long' type for 'addr' parameter. Make it use 'kprobe_opcode_t *' for the 'addr' parameter and subsequent call of arch_within_optimized_kprobe() also should use 'kprobe_opcode_t *'. Note that MAX_OPTIMIZED_LENGTH and RELATIVEJUMP_SIZE are defined by byte-size, but the size of 'kprobe_opcode_t' depends on the architecture. Therefore, we must be careful when calculating addresses using those macros. Link: https://lkml.kernel.org/r/163163040680.489837.12133032364499833736.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/kprobes.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index ec3d97fd8c6b..b6f1dcf4bff3 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -485,15 +485,15 @@ static int kprobe_queued(struct kprobe *p) * Return an optimized kprobe whose optimizing code replaces * instructions including 'addr' (exclude breakpoint). */ -static struct kprobe *get_optimized_kprobe(unsigned long addr) +static struct kprobe *get_optimized_kprobe(kprobe_opcode_t *addr) { int i; struct kprobe *p = NULL; struct optimized_kprobe *op; /* Don't check i == 0, since that is a breakpoint case. */ - for (i = 1; !p && i < MAX_OPTIMIZED_LENGTH; i++) - p = get_kprobe((void *)(addr - i)); + for (i = 1; !p && i < MAX_OPTIMIZED_LENGTH / sizeof(kprobe_opcode_t); i++) + p = get_kprobe(addr - i); if (p && kprobe_optready(p)) { op = container_of(p, struct optimized_kprobe, kp); @@ -967,7 +967,7 @@ static void __arm_kprobe(struct kprobe *p) lockdep_assert_held(&text_mutex); /* Find the overlapping optimized kprobes. */ - _p = get_optimized_kprobe((unsigned long)p->addr); + _p = get_optimized_kprobe(p->addr); if (unlikely(_p)) /* Fallback to unoptimized kprobe */ unoptimize_kprobe(_p, true); @@ -989,7 +989,7 @@ static void __disarm_kprobe(struct kprobe *p, bool reopt) if (!kprobe_queued(p)) { arch_disarm_kprobe(p); /* If another kprobe was blocked, re-optimize it. */ - _p = get_optimized_kprobe((unsigned long)p->addr); + _p = get_optimized_kprobe(p->addr); if (unlikely(_p) && reopt) optimize_kprobe(_p); } -- cgit v1.2.3 From 29e8077ae2beea6a85ad2d0bae9c550bd5d05ed9 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 14 Sep 2021 23:40:16 +0900 Subject: kprobes: Use bool type for functions which returns boolean value Use the 'bool' type instead of 'int' for the functions which returns a boolean value, because this makes clear that those functions don't return any error code. Link: https://lkml.kernel.org/r/163163041649.489837.17311187321419747536.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/kprobes.c | 26 +++++++++++++------------- kernel/trace/trace_kprobe.c | 2 +- 2 files changed, 14 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index b6f1dcf4bff3..8021bccb7770 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -198,8 +198,8 @@ out: return slot; } -/* Return 1 if all garbages are collected, otherwise 0. */ -static int collect_one_slot(struct kprobe_insn_page *kip, int idx) +/* Return true if all garbages are collected, otherwise false. */ +static bool collect_one_slot(struct kprobe_insn_page *kip, int idx) { kip->slot_used[idx] = SLOT_CLEAN; kip->nused--; @@ -223,9 +223,9 @@ static int collect_one_slot(struct kprobe_insn_page *kip, int idx) kip->cache->free(kip->insns); kfree(kip); } - return 1; + return true; } - return 0; + return false; } static int collect_garbage_slots(struct kprobe_insn_cache *c) @@ -389,13 +389,13 @@ NOKPROBE_SYMBOL(get_kprobe); static int aggr_pre_handler(struct kprobe *p, struct pt_regs *regs); /* Return true if 'p' is an aggregator */ -static inline int kprobe_aggrprobe(struct kprobe *p) +static inline bool kprobe_aggrprobe(struct kprobe *p) { return p->pre_handler == aggr_pre_handler; } /* Return true if 'p' is unused */ -static inline int kprobe_unused(struct kprobe *p) +static inline bool kprobe_unused(struct kprobe *p) { return kprobe_aggrprobe(p) && kprobe_disabled(p) && list_empty(&p->list); @@ -455,7 +455,7 @@ static inline int kprobe_optready(struct kprobe *p) } /* Return true if the kprobe is disarmed. Note: p must be on hash list */ -static inline int kprobe_disarmed(struct kprobe *p) +static inline bool kprobe_disarmed(struct kprobe *p) { struct optimized_kprobe *op; @@ -469,16 +469,16 @@ static inline int kprobe_disarmed(struct kprobe *p) } /* Return true if the probe is queued on (un)optimizing lists */ -static int kprobe_queued(struct kprobe *p) +static bool kprobe_queued(struct kprobe *p) { struct optimized_kprobe *op; if (kprobe_aggrprobe(p)) { op = container_of(p, struct optimized_kprobe, kp); if (!list_empty(&op->list)) - return 1; + return true; } - return 0; + return false; } /* @@ -1678,7 +1678,7 @@ out: EXPORT_SYMBOL_GPL(register_kprobe); /* Check if all probes on the 'ap' are disabled. */ -static int aggr_kprobe_disabled(struct kprobe *ap) +static bool aggr_kprobe_disabled(struct kprobe *ap) { struct kprobe *kp; @@ -1690,9 +1690,9 @@ static int aggr_kprobe_disabled(struct kprobe *ap) * Since there is an active probe on the list, * we can't disable this 'ap'. */ - return 0; + return false; - return 1; + return true; } static struct kprobe *__disable_kprobe(struct kprobe *p) diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 3a64ba4bbad6..0e1e7ce5f7ed 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -97,7 +97,7 @@ static nokprobe_inline unsigned long trace_kprobe_offset(struct trace_kprobe *tk static nokprobe_inline bool trace_kprobe_has_gone(struct trace_kprobe *tk) { - return !!(kprobe_gone(&tk->rp.kp)); + return kprobe_gone(&tk->rp.kp); } static nokprobe_inline bool trace_kprobe_within_module(struct trace_kprobe *tk, -- cgit v1.2.3 From f2ec8d9a3b8c0f22cd6a2b4f5a2d9aee5206e3b7 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 14 Sep 2021 23:40:36 +0900 Subject: kprobes: treewide: Replace arch_deref_entry_point() with dereference_symbol_descriptor() ~15 years ago kprobes grew the 'arch_deref_entry_point()' __weak function: 3d7e33825d87: ("jprobes: make jprobes a little safer for users") But this is just open-coded dereference_symbol_descriptor() in essence, and its obscure nature was causing bugs. Just use the real thing and remove arch_deref_entry_point(). Link: https://lkml.kernel.org/r/163163043630.489837.7924988885652708696.stgit@devnote2 Signed-off-by: Masami Hiramatsu Tested-by: Andrii Nakryiko Signed-off-by: Steven Rostedt (VMware) --- kernel/kprobes.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 8021bccb7770..550042d9a6ef 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1861,11 +1861,6 @@ static struct notifier_block kprobe_exceptions_nb = { .priority = 0x7fffffff /* we need to be notified first */ }; -unsigned long __weak arch_deref_entry_point(void *entry) -{ - return (unsigned long)entry; -} - #ifdef CONFIG_KRETPROBES unsigned long __kretprobe_trampoline_handler(struct pt_regs *regs, @@ -2327,7 +2322,7 @@ static int __init populate_kprobe_blacklist(unsigned long *start, int ret; for (iter = start; iter < end; iter++) { - entry = arch_deref_entry_point((void *)*iter); + entry = (unsigned long)dereference_symbol_descriptor((void *)*iter); ret = kprobe_add_ksym_blacklist(entry); if (ret == -EINVAL) continue; -- cgit v1.2.3 From 96fed8ac2bb64ab45497fdd8e3d390165b7a9be8 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 14 Sep 2021 23:40:45 +0900 Subject: kprobes: treewide: Remove trampoline_address from kretprobe_trampoline_handler() The __kretprobe_trampoline_handler() callback, called from low level arch kprobes methods, has the 'trampoline_address' parameter, which is entirely superfluous as it basically just replicates: dereference_kernel_function_descriptor(kretprobe_trampoline) In fact we had bugs in arch code where it wasn't replicated correctly. So remove this superfluous parameter and use kretprobe_trampoline_addr() instead. Link: https://lkml.kernel.org/r/163163044546.489837.13505751885476015002.stgit@devnote2 Signed-off-by: Masami Hiramatsu Tested-by: Andrii Nakryiko Signed-off-by: Steven Rostedt (VMware) --- kernel/kprobes.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 550042d9a6ef..6ed755111eea 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1864,7 +1864,6 @@ static struct notifier_block kprobe_exceptions_nb = { #ifdef CONFIG_KRETPROBES unsigned long __kretprobe_trampoline_handler(struct pt_regs *regs, - void *trampoline_address, void *frame_pointer) { kprobe_opcode_t *correct_ret_addr = NULL; @@ -1879,7 +1878,7 @@ unsigned long __kretprobe_trampoline_handler(struct pt_regs *regs, BUG_ON(ri->fp != frame_pointer); - if (ri->ret_addr != trampoline_address) { + if (ri->ret_addr != kretprobe_trampoline_addr()) { correct_ret_addr = ri->ret_addr; /* * This is the real return address. Any other -- cgit v1.2.3 From adf8a61a940c49fea6fab9c3865f2b69b8ceef28 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 14 Sep 2021 23:40:54 +0900 Subject: kprobes: treewide: Make it harder to refer kretprobe_trampoline directly Since now there is kretprobe_trampoline_addr() for referring the address of kretprobe trampoline code, we don't need to access kretprobe_trampoline directly. Make it harder to refer by renaming it to __kretprobe_trampoline(). Link: https://lkml.kernel.org/r/163163045446.489837.14510577516938803097.stgit@devnote2 Suggested-by: Ingo Molnar Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_output.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index c2ca40e8595b..5a5949c659d0 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -349,7 +349,7 @@ EXPORT_SYMBOL_GPL(trace_output_call); #ifdef CONFIG_KRETPROBES static inline const char *kretprobed(const char *name) { - static const char tramp_name[] = "kretprobe_trampoline"; + static const char tramp_name[] = "__kretprobe_trampoline"; int size = sizeof(tramp_name); if (strncmp(tramp_name, name, size) == 0) -- cgit v1.2.3 From 03bac0df2886882c43e6d0bfff9dee84a184fc7e Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 14 Sep 2021 23:41:04 +0900 Subject: kprobes: Add kretprobe_find_ret_addr() for searching return address Introduce kretprobe_find_ret_addr() and is_kretprobe_trampoline(). These APIs will be used by the ORC stack unwinder and ftrace, so that they can check whether the given address points kretprobe trampoline code and query the correct return address in that case. Link: https://lkml.kernel.org/r/163163046461.489837.1044778356430293962.stgit@devnote2 Signed-off-by: Masami Hiramatsu Tested-by: Andrii Nakryiko Signed-off-by: Steven Rostedt (VMware) --- kernel/kprobes.c | 109 ++++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 83 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 6ed755111eea..833f07f33115 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1863,45 +1863,87 @@ static struct notifier_block kprobe_exceptions_nb = { #ifdef CONFIG_KRETPROBES -unsigned long __kretprobe_trampoline_handler(struct pt_regs *regs, - void *frame_pointer) +/* This assumes the 'tsk' is the current task or the is not running. */ +static kprobe_opcode_t *__kretprobe_find_ret_addr(struct task_struct *tsk, + struct llist_node **cur) { - kprobe_opcode_t *correct_ret_addr = NULL; struct kretprobe_instance *ri = NULL; - struct llist_node *first, *node; - struct kretprobe *rp; + struct llist_node *node = *cur; + + if (!node) + node = tsk->kretprobe_instances.first; + else + node = node->next; - /* Find all nodes for this frame. */ - first = node = current->kretprobe_instances.first; while (node) { ri = container_of(node, struct kretprobe_instance, llist); - - BUG_ON(ri->fp != frame_pointer); - if (ri->ret_addr != kretprobe_trampoline_addr()) { - correct_ret_addr = ri->ret_addr; - /* - * This is the real return address. Any other - * instances associated with this task are for - * other calls deeper on the call stack - */ - goto found; + *cur = node; + return ri->ret_addr; } - node = node->next; } - pr_err("kretprobe: Return address not found, not execute handler. Maybe there is a bug in the kernel.\n"); - BUG_ON(1); + return NULL; +} +NOKPROBE_SYMBOL(__kretprobe_find_ret_addr); -found: - /* Unlink all nodes for this frame. */ - current->kretprobe_instances.first = node->next; - node->next = NULL; +/** + * kretprobe_find_ret_addr -- Find correct return address modified by kretprobe + * @tsk: Target task + * @fp: A frame pointer + * @cur: a storage of the loop cursor llist_node pointer for next call + * + * Find the correct return address modified by a kretprobe on @tsk in unsigned + * long type. If it finds the return address, this returns that address value, + * or this returns 0. + * The @tsk must be 'current' or a task which is not running. @fp is a hint + * to get the currect return address - which is compared with the + * kretprobe_instance::fp field. The @cur is a loop cursor for searching the + * kretprobe return addresses on the @tsk. The '*@cur' should be NULL at the + * first call, but '@cur' itself must NOT NULL. + */ +unsigned long kretprobe_find_ret_addr(struct task_struct *tsk, void *fp, + struct llist_node **cur) +{ + struct kretprobe_instance *ri = NULL; + kprobe_opcode_t *ret; + + if (WARN_ON_ONCE(!cur)) + return 0; + + do { + ret = __kretprobe_find_ret_addr(tsk, cur); + if (!ret) + break; + ri = container_of(*cur, struct kretprobe_instance, llist); + } while (ri->fp != fp); - /* Run them.. */ + return (unsigned long)ret; +} +NOKPROBE_SYMBOL(kretprobe_find_ret_addr); + +unsigned long __kretprobe_trampoline_handler(struct pt_regs *regs, + void *frame_pointer) +{ + kprobe_opcode_t *correct_ret_addr = NULL; + struct kretprobe_instance *ri = NULL; + struct llist_node *first, *node = NULL; + struct kretprobe *rp; + + /* Find correct address and all nodes for this frame. */ + correct_ret_addr = __kretprobe_find_ret_addr(current, &node); + if (!correct_ret_addr) { + pr_err("kretprobe: Return address not found, not execute handler. Maybe there is a bug in the kernel.\n"); + BUG_ON(1); + } + + /* Run the user handler of the nodes. */ + first = current->kretprobe_instances.first; while (first) { ri = container_of(first, struct kretprobe_instance, llist); - first = first->next; + + if (WARN_ON_ONCE(ri->fp != frame_pointer)) + break; rp = get_kretprobe(ri); if (rp && rp->handler) { @@ -1912,6 +1954,21 @@ found: rp->handler(ri, regs); __this_cpu_write(current_kprobe, prev); } + if (first == node) + break; + + first = first->next; + } + + /* Unlink all nodes for this frame. */ + first = current->kretprobe_instances.first; + current->kretprobe_instances.first = node->next; + node->next = NULL; + + /* Recycle free instances. */ + while (first) { + ri = container_of(first, struct kretprobe_instance, llist); + first = first->next; recycle_rp_inst(ri); } -- cgit v1.2.3 From df91c5bccb0c2cb868b54bd68a6ddf1fcbede6b1 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 14 Sep 2021 23:42:12 +0900 Subject: kprobes: Enable stacktrace from pt_regs in kretprobe handler Since the ORC unwinder from pt_regs requires setting up regs->ip correctly, set the correct return address to the regs->ip before calling user kretprobe handler. This allows the kretrprobe handler to trace stack from the kretprobe's pt_regs by stack_trace_save_regs() (eBPF will do this), instead of stack tracing from the handler context by stack_trace_save() (ftrace will do this). Link: https://lkml.kernel.org/r/163163053237.489837.4272653874525136832.stgit@devnote2 Suggested-by: Josh Poimboeuf Signed-off-by: Masami Hiramatsu Tested-by: Andrii Nakryiko Acked-by: Josh Poimboeuf Signed-off-by: Steven Rostedt (VMware) --- kernel/kprobes.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 833f07f33115..ebc587b9a346 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1937,6 +1937,13 @@ unsigned long __kretprobe_trampoline_handler(struct pt_regs *regs, BUG_ON(1); } + /* + * Set the return address as the instruction pointer, because if the + * user handler calls stack_trace_save_regs() with this 'regs', + * the stack trace will start from the instruction pointer. + */ + instruction_pointer_set(regs, (unsigned long)correct_ret_addr); + /* Run the user handler of the nodes. */ first = current->kretprobe_instances.first; while (first) { -- cgit v1.2.3 From 7da89495d500d6a1e6fe1019587c3b611c7bd217 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 14 Sep 2021 23:42:40 +0900 Subject: tracing: Show kretprobe unknown indicator only for kretprobe_trampoline ftrace shows "[unknown/kretprobe'd]" indicator all addresses in the kretprobe_trampoline, but the modified address by kretprobe should be only kretprobe_trampoline+0. Link: https://lkml.kernel.org/r/163163056044.489837.794883849706638013.stgit@devnote2 Signed-off-by: Masami Hiramatsu Acked-by: Steven Rostedt (VMware) Tested-by: Andrii Nakryiko Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_output.c | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 5a5949c659d0..3547e7176ff7 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include @@ -346,22 +347,12 @@ int trace_output_call(struct trace_iterator *iter, char *name, char *fmt, ...) } EXPORT_SYMBOL_GPL(trace_output_call); -#ifdef CONFIG_KRETPROBES -static inline const char *kretprobed(const char *name) +static inline const char *kretprobed(const char *name, unsigned long addr) { - static const char tramp_name[] = "__kretprobe_trampoline"; - int size = sizeof(tramp_name); - - if (strncmp(tramp_name, name, size) == 0) + if (is_kretprobe_trampoline(addr)) return "[unknown/kretprobe'd]"; return name; } -#else -static inline const char *kretprobed(const char *name) -{ - return name; -} -#endif /* CONFIG_KRETPROBES */ void trace_seq_print_sym(struct trace_seq *s, unsigned long address, bool offset) @@ -374,7 +365,7 @@ trace_seq_print_sym(struct trace_seq *s, unsigned long address, bool offset) sprint_symbol(str, address); else kallsyms_lookup(address, NULL, NULL, NULL, str); - name = kretprobed(str); + name = kretprobed(str, address); if (name && strlen(name)) { trace_seq_puts(s, name); -- cgit v1.2.3 From bf094cffea2a6503ce84062f9f0243bef77c58f9 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 14 Sep 2021 23:42:51 +0900 Subject: x86/kprobes: Fixup return address in generic trampoline handler In x86, the fake return address on the stack saved by __kretprobe_trampoline() will be replaced with the real return address after returning from trampoline_handler(). Before fixing the return address, the real return address can be found in the 'current->kretprobe_instances'. However, since there is a window between updating the 'current->kretprobe_instances' and fixing the address on the stack, if an interrupt happens at that timing and the interrupt handler does stacktrace, it may fail to unwind because it can not get the correct return address from 'current->kretprobe_instances'. This will eliminate that window by fixing the return address right before updating 'current->kretprobe_instances'. Link: https://lkml.kernel.org/r/163163057094.489837.9044470370440745866.stgit@devnote2 Signed-off-by: Masami Hiramatsu Tested-by: Andrii Nakryiko Signed-off-by: Steven Rostedt (VMware) --- kernel/kprobes.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index ebc587b9a346..b62af9fc3607 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1922,6 +1922,15 @@ unsigned long kretprobe_find_ret_addr(struct task_struct *tsk, void *fp, } NOKPROBE_SYMBOL(kretprobe_find_ret_addr); +void __weak arch_kretprobe_fixup_return(struct pt_regs *regs, + kprobe_opcode_t *correct_ret_addr) +{ + /* + * Do nothing by default. Please fill this to update the fake return + * address on the stack with the correct one on each arch if possible. + */ +} + unsigned long __kretprobe_trampoline_handler(struct pt_regs *regs, void *frame_pointer) { @@ -1967,6 +1976,8 @@ unsigned long __kretprobe_trampoline_handler(struct pt_regs *regs, first = first->next; } + arch_kretprobe_fixup_return(regs, correct_ret_addr); + /* Unlink all nodes for this frame. */ first = current->kretprobe_instances.first; current->kretprobe_instances.first = node->next; -- cgit v1.2.3 From 6954e415264eeb5ee6be0d22d789ad12c995ee64 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 23 Sep 2021 21:03:49 -0400 Subject: tracing: Place trace_pid_list logic into abstract functions Instead of having the logic that does trace_pid_list open coded, wrap it in abstract functions. This will allow a rewrite of the logic that implements the trace_pid_list without affecting the users. Note, this causes a change in behavior. Every time a pid is written into the set_*_pid file, it creates a new list and uses RCU to update it. If pid_max is lowered, but there was a pid currently in the list that was higher than pid_max, those pids will now be removed on updating the list. The old behavior kept that from happening. The rewrite of the pid_list logic will no longer depend on pid_max, and will return the old behavior. Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/Makefile | 1 + kernel/trace/ftrace.c | 6 +- kernel/trace/pid_list.c | 160 ++++++++++++++++++++++++++++++++++++++++++++ kernel/trace/pid_list.h | 13 ++++ kernel/trace/trace.c | 78 ++++++++------------- kernel/trace/trace.h | 14 ++-- kernel/trace/trace_events.c | 6 +- 7 files changed, 217 insertions(+), 61 deletions(-) create mode 100644 kernel/trace/pid_list.c create mode 100644 kernel/trace/pid_list.h (limited to 'kernel') diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 6de5d4d63165..bedc5caceec7 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -47,6 +47,7 @@ obj-$(CONFIG_TRACING) += trace_output.o obj-$(CONFIG_TRACING) += trace_seq.o obj-$(CONFIG_TRACING) += trace_stat.o obj-$(CONFIG_TRACING) += trace_printk.o +obj-$(CONFIG_TRACING) += pid_list.o obj-$(CONFIG_TRACING_MAP) += tracing_map.o obj-$(CONFIG_PREEMPTIRQ_DELAY_TEST) += preemptirq_delay_test.o obj-$(CONFIG_SYNTH_EVENT_GEN_TEST) += synth_event_gen_test.o diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 7efbc8aaf7f6..3eec6792f115 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -7184,10 +7184,10 @@ static void clear_ftrace_pids(struct trace_array *tr, int type) synchronize_rcu(); if ((type & TRACE_PIDS) && pid_list) - trace_free_pid_list(pid_list); + trace_pid_list_free(pid_list); if ((type & TRACE_NO_PIDS) && no_pid_list) - trace_free_pid_list(no_pid_list); + trace_pid_list_free(no_pid_list); } void ftrace_clear_pids(struct trace_array *tr) @@ -7428,7 +7428,7 @@ pid_write(struct file *filp, const char __user *ubuf, if (filtered_pids) { synchronize_rcu(); - trace_free_pid_list(filtered_pids); + trace_pid_list_free(filtered_pids); } else if (pid_list && !other_pids) { /* Register a probe to set whether to ignore the tracing of a task */ register_trace_sched_switch(ftrace_filter_pid_sched_switch_probe, tr); diff --git a/kernel/trace/pid_list.c b/kernel/trace/pid_list.c new file mode 100644 index 000000000000..4483ef70b562 --- /dev/null +++ b/kernel/trace/pid_list.c @@ -0,0 +1,160 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2021 VMware Inc, Steven Rostedt + */ +#include +#include +#include "trace.h" + +/** + * trace_pid_list_is_set - test if the pid is set in the list + * @pid_list: The pid list to test + * @pid: The pid to to see if set in the list. + * + * Tests if @pid is is set in the @pid_list. This is usually called + * from the scheduler when a task is scheduled. Its pid is checked + * if it should be traced or not. + * + * Return true if the pid is in the list, false otherwise. + */ +bool trace_pid_list_is_set(struct trace_pid_list *pid_list, unsigned int pid) +{ + /* + * If pid_max changed after filtered_pids was created, we + * by default ignore all pids greater than the previous pid_max. + */ + if (pid >= pid_list->pid_max) + return false; + + return test_bit(pid, pid_list->pids); +} + +/** + * trace_pid_list_set - add a pid to the list + * @pid_list: The pid list to add the @pid to. + * @pid: The pid to add. + * + * Adds @pid to @pid_list. This is usually done explicitly by a user + * adding a task to be traced, or indirectly by the fork function + * when children should be traced and a task's pid is in the list. + * + * Return 0 on success, negative otherwise. + */ +int trace_pid_list_set(struct trace_pid_list *pid_list, unsigned int pid) +{ + /* Sorry, but we don't support pid_max changing after setting */ + if (pid >= pid_list->pid_max) + return -EINVAL; + + set_bit(pid, pid_list->pids); + + return 0; +} + +/** + * trace_pid_list_clear - remove a pid from the list + * @pid_list: The pid list to remove the @pid from. + * @pid: The pid to remove. + * + * Removes @pid from @pid_list. This is usually done explicitly by a user + * removing tasks from tracing, or indirectly by the exit function + * when a task that is set to be traced exits. + * + * Return 0 on success, negative otherwise. + */ +int trace_pid_list_clear(struct trace_pid_list *pid_list, unsigned int pid) +{ + /* Sorry, but we don't support pid_max changing after setting */ + if (pid >= pid_list->pid_max) + return -EINVAL; + + clear_bit(pid, pid_list->pids); + + return 0; +} + +/** + * trace_pid_list_next - return the next pid in the list + * @pid_list: The pid list to examine. + * @pid: The pid to start from + * @next: The pointer to place the pid that is set starting from @pid. + * + * Looks for the next consecutive pid that is in @pid_list starting + * at the pid specified by @pid. If one is set (including @pid), then + * that pid is placed into @next. + * + * Return 0 when a pid is found, -1 if there are no more pids included. + */ +int trace_pid_list_next(struct trace_pid_list *pid_list, unsigned int pid, + unsigned int *next) +{ + pid = find_next_bit(pid_list->pids, pid_list->pid_max, pid); + + if (pid < pid_list->pid_max) { + *next = pid; + return 0; + } + return -1; +} + +/** + * trace_pid_list_first - return the first pid in the list + * @pid_list: The pid list to examine. + * @pid: The pointer to place the pid first found pid that is set. + * + * Looks for the first pid that is set in @pid_list, and places it + * into @pid if found. + * + * Return 0 when a pid is found, -1 if there are no pids set. + */ +int trace_pid_list_first(struct trace_pid_list *pid_list, unsigned int *pid) +{ + unsigned int first; + + first = find_first_bit(pid_list->pids, pid_list->pid_max); + + if (first < pid_list->pid_max) { + *pid = first; + return 0; + } + return -1; +} + +/** + * trace_pid_list_alloc - create a new pid_list + * + * Allocates a new pid_list to store pids into. + * + * Returns the pid_list on success, NULL otherwise. + */ +struct trace_pid_list *trace_pid_list_alloc(void) +{ + struct trace_pid_list *pid_list; + + pid_list = kmalloc(sizeof(*pid_list), GFP_KERNEL); + if (!pid_list) + return NULL; + + pid_list->pid_max = READ_ONCE(pid_max); + + pid_list->pids = vzalloc((pid_list->pid_max + 7) >> 3); + if (!pid_list->pids) { + kfree(pid_list); + return NULL; + } + return pid_list; +} + +/** + * trace_pid_list_free - Frees an allocated pid_list. + * + * Frees the memory for a pid_list that was allocated. + */ +void trace_pid_list_free(struct trace_pid_list *pid_list) +{ + if (!pid_list) + return; + + vfree(pid_list->pids); + kfree(pid_list); +} diff --git a/kernel/trace/pid_list.h b/kernel/trace/pid_list.h new file mode 100644 index 000000000000..80d0ecfe1536 --- /dev/null +++ b/kernel/trace/pid_list.h @@ -0,0 +1,13 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* Do not include this file directly. */ + +#ifndef _TRACE_INTERNAL_PID_LIST_H +#define _TRACE_INTERNAL_PID_LIST_H + +struct trace_pid_list { + int pid_max; + unsigned long *pids; +}; + +#endif /* _TRACE_INTERNAL_PID_LIST_H */ diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 7896d30d90f7..dcced07a45e6 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -512,12 +512,6 @@ int call_filter_check_discard(struct trace_event_call *call, void *rec, return 0; } -void trace_free_pid_list(struct trace_pid_list *pid_list) -{ - vfree(pid_list->pids); - kfree(pid_list); -} - /** * trace_find_filtered_pid - check if a pid exists in a filtered_pid list * @filtered_pids: The list of pids to check @@ -528,14 +522,7 @@ void trace_free_pid_list(struct trace_pid_list *pid_list) bool trace_find_filtered_pid(struct trace_pid_list *filtered_pids, pid_t search_pid) { - /* - * If pid_max changed after filtered_pids was created, we - * by default ignore all pids greater than the previous pid_max. - */ - if (search_pid >= filtered_pids->pid_max) - return false; - - return test_bit(search_pid, filtered_pids->pids); + return trace_pid_list_is_set(filtered_pids, search_pid); } /** @@ -592,15 +579,11 @@ void trace_filter_add_remove_task(struct trace_pid_list *pid_list, return; } - /* Sorry, but we don't support pid_max changing after setting */ - if (task->pid >= pid_list->pid_max) - return; - /* "self" is set for forks, and NULL for exits */ if (self) - set_bit(task->pid, pid_list->pids); + trace_pid_list_set(pid_list, task->pid); else - clear_bit(task->pid, pid_list->pids); + trace_pid_list_clear(pid_list, task->pid); } /** @@ -617,18 +600,19 @@ void trace_filter_add_remove_task(struct trace_pid_list *pid_list, */ void *trace_pid_next(struct trace_pid_list *pid_list, void *v, loff_t *pos) { - unsigned long pid = (unsigned long)v; + long pid = (unsigned long)v; + unsigned int next; (*pos)++; /* pid already is +1 of the actual previous bit */ - pid = find_next_bit(pid_list->pids, pid_list->pid_max, pid); + if (trace_pid_list_next(pid_list, pid, &next) < 0) + return NULL; - /* Return pid + 1 to allow zero to be represented */ - if (pid < pid_list->pid_max) - return (void *)(pid + 1); + pid = next; - return NULL; + /* Return pid + 1 to allow zero to be represented */ + return (void *)(pid + 1); } /** @@ -645,12 +629,14 @@ void *trace_pid_next(struct trace_pid_list *pid_list, void *v, loff_t *pos) void *trace_pid_start(struct trace_pid_list *pid_list, loff_t *pos) { unsigned long pid; + unsigned int first; loff_t l = 0; - pid = find_first_bit(pid_list->pids, pid_list->pid_max); - if (pid >= pid_list->pid_max) + if (trace_pid_list_first(pid_list, &first) < 0) return NULL; + pid = first; + /* Return pid + 1 so that zero can be the exit value */ for (pid++; pid && l < *pos; pid = (unsigned long)trace_pid_next(pid_list, (void *)pid, &l)) @@ -686,7 +672,7 @@ int trace_pid_write(struct trace_pid_list *filtered_pids, unsigned long val; int nr_pids = 0; ssize_t read = 0; - ssize_t ret = 0; + ssize_t ret; loff_t pos; pid_t pid; @@ -699,34 +685,23 @@ int trace_pid_write(struct trace_pid_list *filtered_pids, * the user. If the operation fails, then the current list is * not modified. */ - pid_list = kmalloc(sizeof(*pid_list), GFP_KERNEL); + pid_list = trace_pid_list_alloc(); if (!pid_list) { trace_parser_put(&parser); return -ENOMEM; } - pid_list->pid_max = READ_ONCE(pid_max); - - /* Only truncating will shrink pid_max */ - if (filtered_pids && filtered_pids->pid_max > pid_list->pid_max) - pid_list->pid_max = filtered_pids->pid_max; - - pid_list->pids = vzalloc((pid_list->pid_max + 7) >> 3); - if (!pid_list->pids) { - trace_parser_put(&parser); - kfree(pid_list); - return -ENOMEM; - } - if (filtered_pids) { /* copy the current bits to the new max */ - for_each_set_bit(pid, filtered_pids->pids, - filtered_pids->pid_max) { - set_bit(pid, pid_list->pids); + ret = trace_pid_list_first(filtered_pids, &pid); + while (!ret) { + trace_pid_list_set(pid_list, pid); + ret = trace_pid_list_next(filtered_pids, pid + 1, &pid); nr_pids++; } } + ret = 0; while (cnt > 0) { pos = 0; @@ -742,12 +717,13 @@ int trace_pid_write(struct trace_pid_list *filtered_pids, ret = -EINVAL; if (kstrtoul(parser.buffer, 0, &val)) break; - if (val >= pid_list->pid_max) - break; pid = (pid_t)val; - set_bit(pid, pid_list->pids); + if (trace_pid_list_set(pid_list, pid) < 0) { + ret = -1; + break; + } nr_pids++; trace_parser_clear(&parser); @@ -756,13 +732,13 @@ int trace_pid_write(struct trace_pid_list *filtered_pids, trace_parser_put(&parser); if (ret < 0) { - trace_free_pid_list(pid_list); + trace_pid_list_free(pid_list); return ret; } if (!nr_pids) { /* Cleared the list of pids */ - trace_free_pid_list(pid_list); + trace_pid_list_free(pid_list); read = ret; pid_list = NULL; } diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index b7c0f8e160fb..2ec500186992 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -22,6 +22,8 @@ #include #include +#include "pid_list.h" + #ifdef CONFIG_FTRACE_SYSCALLS #include /* For NR_SYSCALLS */ #include /* some archs define it here */ @@ -188,10 +190,14 @@ struct trace_options { struct trace_option_dentry *topts; }; -struct trace_pid_list { - int pid_max; - unsigned long *pids; -}; +struct trace_pid_list *trace_pid_list_alloc(void); +void trace_pid_list_free(struct trace_pid_list *pid_list); +bool trace_pid_list_is_set(struct trace_pid_list *pid_list, unsigned int pid); +int trace_pid_list_set(struct trace_pid_list *pid_list, unsigned int pid); +int trace_pid_list_clear(struct trace_pid_list *pid_list, unsigned int pid); +int trace_pid_list_first(struct trace_pid_list *pid_list, unsigned int *pid); +int trace_pid_list_next(struct trace_pid_list *pid_list, unsigned int pid, + unsigned int *next); enum { TRACE_PIDS = BIT(0), diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 830b3b9940f4..bf54d2803261 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -885,10 +885,10 @@ static void __ftrace_clear_event_pids(struct trace_array *tr, int type) tracepoint_synchronize_unregister(); if ((type & TRACE_PIDS) && pid_list) - trace_free_pid_list(pid_list); + trace_pid_list_free(pid_list); if ((type & TRACE_NO_PIDS) && no_pid_list) - trace_free_pid_list(no_pid_list); + trace_pid_list_free(no_pid_list); } static void ftrace_clear_event_pids(struct trace_array *tr, int type) @@ -1967,7 +1967,7 @@ event_pid_write(struct file *filp, const char __user *ubuf, if (filtered_pids) { tracepoint_synchronize_unregister(); - trace_free_pid_list(filtered_pids); + trace_pid_list_free(filtered_pids); } else if (pid_list && !other_pids) { register_pid_events(tr); } -- cgit v1.2.3 From 8d6e90983ade25ec7925211ac31d9ccaf64b7edf Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 23 Sep 2021 22:20:57 -0400 Subject: tracing: Create a sparse bitmask for pid filtering When the trace_pid_list was created, the default pid max was 32768. Creating a bitmask that can hold one bit for all 32768 took up 4096 (one page). Having a one page bitmask was not much of a problem, and that was used for mapping pids. But today, systems are bigger and can run more tasks, and now the default pid_max is usually set to 4194304. Which means to handle that many pids requires 524288 bytes. Worse yet, the pid_max can be set to 2^30 (1073741824 or 1G) which would take 134217728 (128M) of memory to store this array. Since the pid_list array is very sparsely populated, it is a huge waste of memory to store all possible bits for each pid when most will not be set. Instead, use a page table scheme to store the array, and allow this to handle up to 30 bit pids. The pid_mask will start out with 256 entries for the first 8 MSB bits. This will cost 1K for 32 bit architectures and 2K for 64 bit. Each of these will have a 256 array to store the next 8 bits of the pid (another 1 or 2K). These will hold an 2K byte bitmask (which will cover the LSB 14 bits or 16384 pids). When the trace_pid_list is allocated, it will have the 1/2K upper bits allocated, and then it will allocate a cache for the next upper chunks and the lower chunks (default 6 of each). Then when a bit is "set", these chunks will be pulled from the free list and added to the array. If the free list gets down to a lever (default 2), it will trigger an irqwork that will refill the cache back up. On clearing a bit, if the clear causes the bitmask to be zero, that chunk will then be placed back into the free cache for later use, keeping the need to allocate more down to a minimum. Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/pid_list.c | 401 ++++++++++++++++++++++++++++++++++++++++++++---- kernel/trace/pid_list.h | 79 +++++++++- 2 files changed, 445 insertions(+), 35 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/pid_list.c b/kernel/trace/pid_list.c index 4483ef70b562..cbf8031b2b99 100644 --- a/kernel/trace/pid_list.c +++ b/kernel/trace/pid_list.c @@ -2,10 +2,119 @@ /* * Copyright (C) 2021 VMware Inc, Steven Rostedt */ -#include +#include +#include #include #include "trace.h" +/* See pid_list.h for details */ + +static inline union lower_chunk *get_lower_chunk(struct trace_pid_list *pid_list) +{ + union lower_chunk *chunk; + + lockdep_assert_held(&pid_list->lock); + + if (!pid_list->lower_list) + return NULL; + + chunk = pid_list->lower_list; + pid_list->lower_list = chunk->next; + pid_list->free_lower_chunks--; + WARN_ON_ONCE(pid_list->free_lower_chunks < 0); + chunk->next = NULL; + /* + * If a refill needs to happen, it can not happen here + * as the scheduler run queue locks are held. + */ + if (pid_list->free_lower_chunks <= CHUNK_REALLOC) + irq_work_queue(&pid_list->refill_irqwork); + + return chunk; +} + +static inline union upper_chunk *get_upper_chunk(struct trace_pid_list *pid_list) +{ + union upper_chunk *chunk; + + lockdep_assert_held(&pid_list->lock); + + if (!pid_list->upper_list) + return NULL; + + chunk = pid_list->upper_list; + pid_list->upper_list = chunk->next; + pid_list->free_upper_chunks--; + WARN_ON_ONCE(pid_list->free_upper_chunks < 0); + chunk->next = NULL; + /* + * If a refill needs to happen, it can not happen here + * as the scheduler run queue locks are held. + */ + if (pid_list->free_upper_chunks <= CHUNK_REALLOC) + irq_work_queue(&pid_list->refill_irqwork); + + return chunk; +} + +static inline void put_lower_chunk(struct trace_pid_list *pid_list, + union lower_chunk *chunk) +{ + lockdep_assert_held(&pid_list->lock); + + chunk->next = pid_list->lower_list; + pid_list->lower_list = chunk; + pid_list->free_lower_chunks++; +} + +static inline void put_upper_chunk(struct trace_pid_list *pid_list, + union upper_chunk *chunk) +{ + lockdep_assert_held(&pid_list->lock); + + chunk->next = pid_list->upper_list; + pid_list->upper_list = chunk; + pid_list->free_upper_chunks++; +} + +static inline bool upper_empty(union upper_chunk *chunk) +{ + /* + * If chunk->data has no lower chunks, it will be the same + * as a zeroed bitmask. Use find_first_bit() to test it + * and if it doesn't find any bits set, then the array + * is empty. + */ + int bit = find_first_bit((unsigned long *)chunk->data, + sizeof(chunk->data) * 8); + return bit >= sizeof(chunk->data) * 8; +} + +static inline int pid_split(unsigned int pid, unsigned int *upper1, + unsigned int *upper2, unsigned int *lower) +{ + /* MAX_PID should cover all pids */ + BUILD_BUG_ON(MAX_PID < PID_MAX_LIMIT); + + /* In case a bad pid is passed in, then fail */ + if (unlikely(pid >= MAX_PID)) + return -1; + + *upper1 = (pid >> UPPER1_SHIFT) & UPPER_MASK; + *upper2 = (pid >> UPPER2_SHIFT) & UPPER_MASK; + *lower = pid & LOWER_MASK; + + return 0; +} + +static inline unsigned int pid_join(unsigned int upper1, + unsigned int upper2, unsigned int lower) +{ + return ((upper1 & UPPER_MASK) << UPPER1_SHIFT) | + ((upper2 & UPPER_MASK) << UPPER2_SHIFT) | + (lower & LOWER_MASK); +} + /** * trace_pid_list_is_set - test if the pid is set in the list * @pid_list: The pid list to test @@ -19,14 +128,30 @@ */ bool trace_pid_list_is_set(struct trace_pid_list *pid_list, unsigned int pid) { - /* - * If pid_max changed after filtered_pids was created, we - * by default ignore all pids greater than the previous pid_max. - */ - if (pid >= pid_list->pid_max) + union upper_chunk *upper_chunk; + union lower_chunk *lower_chunk; + unsigned long flags; + unsigned int upper1; + unsigned int upper2; + unsigned int lower; + bool ret = false; + + if (!pid_list) return false; - return test_bit(pid, pid_list->pids); + if (pid_split(pid, &upper1, &upper2, &lower) < 0) + return false; + + raw_spin_lock_irqsave(&pid_list->lock, flags); + upper_chunk = pid_list->upper[upper1]; + if (upper_chunk) { + lower_chunk = upper_chunk->data[upper2]; + if (lower_chunk) + ret = test_bit(lower, lower_chunk->data); + } + raw_spin_unlock_irqrestore(&pid_list->lock, flags); + + return ret; } /** @@ -42,13 +167,44 @@ bool trace_pid_list_is_set(struct trace_pid_list *pid_list, unsigned int pid) */ int trace_pid_list_set(struct trace_pid_list *pid_list, unsigned int pid) { - /* Sorry, but we don't support pid_max changing after setting */ - if (pid >= pid_list->pid_max) - return -EINVAL; + union upper_chunk *upper_chunk; + union lower_chunk *lower_chunk; + unsigned long flags; + unsigned int upper1; + unsigned int upper2; + unsigned int lower; + int ret; - set_bit(pid, pid_list->pids); + if (!pid_list) + return -ENODEV; - return 0; + if (pid_split(pid, &upper1, &upper2, &lower) < 0) + return -EINVAL; + + raw_spin_lock_irqsave(&pid_list->lock, flags); + upper_chunk = pid_list->upper[upper1]; + if (!upper_chunk) { + upper_chunk = get_upper_chunk(pid_list); + if (!upper_chunk) { + ret = -ENOMEM; + goto out; + } + pid_list->upper[upper1] = upper_chunk; + } + lower_chunk = upper_chunk->data[upper2]; + if (!lower_chunk) { + lower_chunk = get_lower_chunk(pid_list); + if (!lower_chunk) { + ret = -ENOMEM; + goto out; + } + upper_chunk->data[upper2] = lower_chunk; + } + set_bit(lower, lower_chunk->data); + ret = 0; + out: + raw_spin_unlock_irqrestore(&pid_list->lock, flags); + return ret; } /** @@ -64,12 +220,41 @@ int trace_pid_list_set(struct trace_pid_list *pid_list, unsigned int pid) */ int trace_pid_list_clear(struct trace_pid_list *pid_list, unsigned int pid) { - /* Sorry, but we don't support pid_max changing after setting */ - if (pid >= pid_list->pid_max) + union upper_chunk *upper_chunk; + union lower_chunk *lower_chunk; + unsigned long flags; + unsigned int upper1; + unsigned int upper2; + unsigned int lower; + + if (!pid_list) + return -ENODEV; + + if (pid_split(pid, &upper1, &upper2, &lower) < 0) return -EINVAL; - clear_bit(pid, pid_list->pids); + raw_spin_lock_irqsave(&pid_list->lock, flags); + upper_chunk = pid_list->upper[upper1]; + if (!upper_chunk) + goto out; + lower_chunk = upper_chunk->data[upper2]; + if (!lower_chunk) + goto out; + + clear_bit(lower, lower_chunk->data); + + /* if there's no more bits set, add it to the free list */ + if (find_first_bit(lower_chunk->data, LOWER_MAX) >= LOWER_MAX) { + put_lower_chunk(pid_list, lower_chunk); + upper_chunk->data[upper2] = NULL; + if (upper_empty(upper_chunk)) { + put_upper_chunk(pid_list, upper_chunk); + pid_list->upper[upper1] = NULL; + } + } + out: + raw_spin_unlock_irqrestore(&pid_list->lock, flags); return 0; } @@ -88,13 +273,45 @@ int trace_pid_list_clear(struct trace_pid_list *pid_list, unsigned int pid) int trace_pid_list_next(struct trace_pid_list *pid_list, unsigned int pid, unsigned int *next) { - pid = find_next_bit(pid_list->pids, pid_list->pid_max, pid); + union upper_chunk *upper_chunk; + union lower_chunk *lower_chunk; + unsigned long flags; + unsigned int upper1; + unsigned int upper2; + unsigned int lower; + + if (!pid_list) + return -ENODEV; + + if (pid_split(pid, &upper1, &upper2, &lower) < 0) + return -EINVAL; - if (pid < pid_list->pid_max) { - *next = pid; - return 0; + raw_spin_lock_irqsave(&pid_list->lock, flags); + for (; upper1 <= UPPER_MASK; upper1++, upper2 = 0) { + upper_chunk = pid_list->upper[upper1]; + + if (!upper_chunk) + continue; + + for (; upper2 <= UPPER_MASK; upper2++, lower = 0) { + lower_chunk = upper_chunk->data[upper2]; + if (!lower_chunk) + continue; + + lower = find_next_bit(lower_chunk->data, LOWER_MAX, + lower); + if (lower < LOWER_MAX) + goto found; + } } - return -1; + + found: + raw_spin_unlock_irqrestore(&pid_list->lock, flags); + if (upper1 > UPPER_MASK) + return -1; + + *next = pid_join(upper1, upper2, lower); + return 0; } /** @@ -109,15 +326,79 @@ int trace_pid_list_next(struct trace_pid_list *pid_list, unsigned int pid, */ int trace_pid_list_first(struct trace_pid_list *pid_list, unsigned int *pid) { - unsigned int first; + return trace_pid_list_next(pid_list, 0, pid); +} + +static void pid_list_refill_irq(struct irq_work *iwork) +{ + struct trace_pid_list *pid_list = container_of(iwork, struct trace_pid_list, + refill_irqwork); + union upper_chunk *upper; + union lower_chunk *lower; + union upper_chunk **upper_next = &upper; + union lower_chunk **lower_next = &lower; + int upper_count; + int lower_count; + int ucnt = 0; + int lcnt = 0; - first = find_first_bit(pid_list->pids, pid_list->pid_max); + again: + raw_spin_lock(&pid_list->lock); + upper_count = CHUNK_ALLOC - pid_list->free_upper_chunks; + lower_count = CHUNK_ALLOC - pid_list->free_lower_chunks; + raw_spin_unlock(&pid_list->lock); + + if (upper_count <= 0 && lower_count <= 0) + return; - if (first < pid_list->pid_max) { - *pid = first; - return 0; + while (upper_count-- > 0) { + union upper_chunk *chunk; + + chunk = kzalloc(sizeof(*chunk), GFP_KERNEL); + if (!chunk) + break; + *upper_next = chunk; + upper_next = &chunk->next; + ucnt++; } - return -1; + + while (lower_count-- > 0) { + union lower_chunk *chunk; + + chunk = kzalloc(sizeof(*chunk), GFP_KERNEL); + if (!chunk) + break; + *lower_next = chunk; + lower_next = &chunk->next; + lcnt++; + } + + raw_spin_lock(&pid_list->lock); + if (upper) { + *upper_next = pid_list->upper_list; + pid_list->upper_list = upper; + pid_list->free_upper_chunks += ucnt; + } + if (lower) { + *lower_next = pid_list->lower_list; + pid_list->lower_list = lower; + pid_list->free_lower_chunks += lcnt; + } + raw_spin_unlock(&pid_list->lock); + + /* + * On success of allocating all the chunks, both counters + * will be less than zero. If they are not, then an allocation + * failed, and we should not try again. + */ + if (upper_count >= 0 || lower_count >= 0) + return; + /* + * When the locks were released, free chunks could have + * been used and allocation needs to be done again. Might as + * well allocate it now. + */ + goto again; } /** @@ -130,18 +411,41 @@ int trace_pid_list_first(struct trace_pid_list *pid_list, unsigned int *pid) struct trace_pid_list *trace_pid_list_alloc(void) { struct trace_pid_list *pid_list; + int i; + + /* According to linux/thread.h, pids can be no bigger that 30 bits */ + WARN_ON_ONCE(pid_max > (1 << 30)); - pid_list = kmalloc(sizeof(*pid_list), GFP_KERNEL); + pid_list = kzalloc(sizeof(*pid_list), GFP_KERNEL); if (!pid_list) return NULL; - pid_list->pid_max = READ_ONCE(pid_max); + init_irq_work(&pid_list->refill_irqwork, pid_list_refill_irq); - pid_list->pids = vzalloc((pid_list->pid_max + 7) >> 3); - if (!pid_list->pids) { - kfree(pid_list); - return NULL; + raw_spin_lock_init(&pid_list->lock); + + for (i = 0; i < CHUNK_ALLOC; i++) { + union upper_chunk *chunk; + + chunk = kzalloc(sizeof(*chunk), GFP_KERNEL); + if (!chunk) + break; + chunk->next = pid_list->upper_list; + pid_list->upper_list = chunk; + pid_list->free_upper_chunks++; } + + for (i = 0; i < CHUNK_ALLOC; i++) { + union lower_chunk *chunk; + + chunk = kzalloc(sizeof(*chunk), GFP_KERNEL); + if (!chunk) + break; + chunk->next = pid_list->lower_list; + pid_list->lower_list = chunk; + pid_list->free_lower_chunks++; + } + return pid_list; } @@ -152,9 +456,40 @@ struct trace_pid_list *trace_pid_list_alloc(void) */ void trace_pid_list_free(struct trace_pid_list *pid_list) { + union upper_chunk *upper; + union lower_chunk *lower; + int i, j; + if (!pid_list) return; - vfree(pid_list->pids); + irq_work_sync(&pid_list->refill_irqwork); + + while (pid_list->lower_list) { + union lower_chunk *chunk; + + chunk = pid_list->lower_list; + pid_list->lower_list = pid_list->lower_list->next; + kfree(chunk); + } + + while (pid_list->upper_list) { + union upper_chunk *chunk; + + chunk = pid_list->upper_list; + pid_list->upper_list = pid_list->upper_list->next; + kfree(chunk); + } + + for (i = 0; i < UPPER1_SIZE; i++) { + upper = pid_list->upper[i]; + if (upper) { + for (j = 0; j < UPPER2_SIZE; j++) { + lower = upper->data[j]; + kfree(lower); + } + kfree(upper); + } + } kfree(pid_list); } diff --git a/kernel/trace/pid_list.h b/kernel/trace/pid_list.h index 80d0ecfe1536..62e73f1ac85f 100644 --- a/kernel/trace/pid_list.h +++ b/kernel/trace/pid_list.h @@ -5,9 +5,84 @@ #ifndef _TRACE_INTERNAL_PID_LIST_H #define _TRACE_INTERNAL_PID_LIST_H +/* + * In order to keep track of what pids to trace, a tree is created much + * like page tables are used. This creates a sparse bit map, where + * the tree is filled in when needed. A PID is at most 30 bits (see + * linux/thread.h), and is broken up into 3 sections based on the bit map + * of the bits. The 8 MSB is the "upper1" section. The next 8 MSB is the + * "upper2" section and the 14 LSB is the "lower" section. + * + * A trace_pid_list structure holds the "upper1" section, in an + * array of 256 pointers (1 or 2K in size) to "upper_chunk" unions, where + * each has an array of 256 pointers (1 or 2K in size) to the "lower_chunk" + * structures, where each has an array of size 2K bytes representing a bitmask + * of the 14 LSB of the PID (256 * 8 = 2048) + * + * When a trace_pid_list is allocated, it includes the 256 pointer array + * of the upper1 unions. Then a "cache" of upper and lower is allocated + * where these will be assigned as needed. + * + * When a bit is set in the pid_list bitmask, the pid to use has + * the 8 MSB masked, and this is used to index the array in the + * pid_list to find the next upper union. If the element is NULL, + * then one is retrieved from the upper_list cache. If none is + * available, then -ENOMEM is returned. + * + * The next 8 MSB is used to index into the "upper2" section. If this + * element is NULL, then it is retrieved from the lower_list cache. + * Again, if one is not available -ENOMEM is returned. + * + * Finally the 14 LSB of the PID is used to set the bit in the 16384 + * bitmask (made up of 2K bytes). + * + * When the second upper section or the lower section has their last + * bit cleared, they are added back to the free list to be reused + * when needed. + */ + +#define UPPER_BITS 8 +#define UPPER_MAX (1 << UPPER_BITS) +#define UPPER1_SIZE (1 << UPPER_BITS) +#define UPPER2_SIZE (1 << UPPER_BITS) + +#define LOWER_BITS 14 +#define LOWER_MAX (1 << LOWER_BITS) +#define LOWER_SIZE (LOWER_MAX / BITS_PER_LONG) + +#define UPPER1_SHIFT (LOWER_BITS + UPPER_BITS) +#define UPPER2_SHIFT LOWER_BITS +#define LOWER_MASK (LOWER_MAX - 1) + +#define UPPER_MASK (UPPER_MAX - 1) + +/* According to linux/thread.h pids can not be bigger than or equal to 1 << 30 */ +#define MAX_PID (1 << 30) + +/* Just keep 6 chunks of both upper and lower in the cache on alloc */ +#define CHUNK_ALLOC 6 + +/* Have 2 chunks free, trigger a refill of the cache */ +#define CHUNK_REALLOC 2 + +union lower_chunk { + union lower_chunk *next; + unsigned long data[LOWER_SIZE]; // 2K in size +}; + +union upper_chunk { + union upper_chunk *next; + union lower_chunk *data[UPPER2_SIZE]; // 1 or 2K in size +}; + struct trace_pid_list { - int pid_max; - unsigned long *pids; + raw_spinlock_t lock; + struct irq_work refill_irqwork; + union upper_chunk *upper[UPPER1_SIZE]; // 1 or 2K in size + union upper_chunk *upper_list; + union lower_chunk *lower_list; + int free_upper_chunks; + int free_lower_chunks; }; #endif /* _TRACE_INTERNAL_PID_LIST_H */ -- cgit v1.2.3 From b30a779d5c557e99b93917f33d441948c9aead97 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 7 Oct 2021 09:53:53 -0400 Subject: tracing: Initialize upper and lower vars in pid_list_refill_irq() The upper and lower variables are set as link lists to add into the sparse array. If they are NULL, after the needed allocations are done, then there is nothing to add. But they need to be initialized to NULL for this to work. Link: https://lore.kernel.org/all/221bc7ba-a475-1cb9-1bbe-730bb9c2d448@canonical.com/ Fixes: 8d6e90983ade ("tracing: Create a sparse bitmask for pid filtering") Reported-by: Colin Ian King Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/pid_list.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/pid_list.c b/kernel/trace/pid_list.c index cbf8031b2b99..a2ef1d18126a 100644 --- a/kernel/trace/pid_list.c +++ b/kernel/trace/pid_list.c @@ -333,8 +333,8 @@ static void pid_list_refill_irq(struct irq_work *iwork) { struct trace_pid_list *pid_list = container_of(iwork, struct trace_pid_list, refill_irqwork); - union upper_chunk *upper; - union lower_chunk *lower; + union upper_chunk *upper = NULL; + union lower_chunk *lower = NULL; union upper_chunk **upper_next = &upper; union lower_chunk **lower_next = &lower; int upper_count; -- cgit v1.2.3 From 21ccc9cd72116289469e5519b6159c675a2fa58f Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 18 Aug 2021 11:24:51 -0400 Subject: tracing: Disable "other" permission bits in the tracefs files When building the files in the tracefs file system, do not by default set any permissions for OTH (other). This will make it easier for admins who want to define a group for accessing tracefs and not having to first disable all the permission bits for "other" in the file system. As tracing can leak sensitive information, it should never by default allowing all users access. An admin can still set the permission bits for others to have access, which may be useful for creating a honeypot and seeing who takes advantage of it and roots the machine. Link: https://lkml.kernel.org/r/20210818153038.864149276@goodmis.org Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 23 +++++------ kernel/trace/trace.c | 73 ++++++++++++++++++----------------- kernel/trace/trace.h | 3 ++ kernel/trace/trace_dynevent.c | 2 +- kernel/trace/trace_events.c | 42 ++++++++++---------- kernel/trace/trace_events_synth.c | 4 +- kernel/trace/trace_functions_graph.c | 2 +- kernel/trace/trace_hwlat.c | 6 +-- kernel/trace/trace_kprobe.c | 8 ++-- kernel/trace/trace_osnoise.c | 14 +++---- kernel/trace/trace_printk.c | 2 +- kernel/trace/trace_recursion_record.c | 4 +- kernel/trace/trace_stack.c | 6 +-- kernel/trace/trace_stat.c | 6 +-- kernel/trace/trace_uprobe.c | 4 +- 15 files changed, 103 insertions(+), 96 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 3eec6792f115..0a0dbc2d411b 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -988,8 +988,9 @@ static __init void ftrace_profile_tracefs(struct dentry *d_tracer) } } - entry = tracefs_create_file("function_profile_enabled", 0644, - d_tracer, NULL, &ftrace_profile_fops); + entry = tracefs_create_file("function_profile_enabled", + TRACE_MODE_WRITE, d_tracer, NULL, + &ftrace_profile_fops); if (!entry) pr_warn("Could not create tracefs 'function_profile_enabled' entry\n"); } @@ -6109,10 +6110,10 @@ void ftrace_create_filter_files(struct ftrace_ops *ops, struct dentry *parent) { - trace_create_file("set_ftrace_filter", 0644, parent, + trace_create_file("set_ftrace_filter", TRACE_MODE_WRITE, parent, ops, &ftrace_filter_fops); - trace_create_file("set_ftrace_notrace", 0644, parent, + trace_create_file("set_ftrace_notrace", TRACE_MODE_WRITE, parent, ops, &ftrace_notrace_fops); } @@ -6139,19 +6140,19 @@ void ftrace_destroy_filter_files(struct ftrace_ops *ops) static __init int ftrace_init_dyn_tracefs(struct dentry *d_tracer) { - trace_create_file("available_filter_functions", 0444, + trace_create_file("available_filter_functions", TRACE_MODE_READ, d_tracer, NULL, &ftrace_avail_fops); - trace_create_file("enabled_functions", 0444, + trace_create_file("enabled_functions", TRACE_MODE_READ, d_tracer, NULL, &ftrace_enabled_fops); ftrace_create_filter_files(&global_ops, d_tracer); #ifdef CONFIG_FUNCTION_GRAPH_TRACER - trace_create_file("set_graph_function", 0644, d_tracer, + trace_create_file("set_graph_function", TRACE_MODE_WRITE, d_tracer, NULL, &ftrace_graph_fops); - trace_create_file("set_graph_notrace", 0644, d_tracer, + trace_create_file("set_graph_notrace", TRACE_MODE_WRITE, d_tracer, NULL, &ftrace_graph_notrace_fops); #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ @@ -7494,10 +7495,10 @@ static const struct file_operations ftrace_no_pid_fops = { void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d_tracer) { - trace_create_file("set_ftrace_pid", 0644, d_tracer, + trace_create_file("set_ftrace_pid", TRACE_MODE_WRITE, d_tracer, tr, &ftrace_pid_fops); - trace_create_file("set_ftrace_notrace_pid", 0644, d_tracer, - tr, &ftrace_no_pid_fops); + trace_create_file("set_ftrace_notrace_pid", TRACE_MODE_WRITE, + d_tracer, tr, &ftrace_no_pid_fops); } void __init ftrace_init_tracefs_toplevel(struct trace_array *tr, diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index dcced07a45e6..985390cb8441 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1690,7 +1690,8 @@ static void trace_create_maxlat_file(struct trace_array *tr, { INIT_WORK(&tr->fsnotify_work, latency_fsnotify_workfn); init_irq_work(&tr->fsnotify_irqwork, latency_fsnotify_workfn_irq); - tr->d_max_latency = trace_create_file("tracing_max_latency", 0644, + tr->d_max_latency = trace_create_file("tracing_max_latency", + TRACE_MODE_WRITE, d_tracer, &tr->max_latency, &tracing_max_lat_fops); } @@ -1727,8 +1728,8 @@ void latency_fsnotify(struct trace_array *tr) #else #define trace_create_maxlat_file(tr, d_tracer) \ - trace_create_file("tracing_max_latency", 0644, d_tracer, \ - &tr->max_latency, &tracing_max_lat_fops) + trace_create_file("tracing_max_latency", TRACE_MODE_WRITE, \ + d_tracer, &tr->max_latency, &tracing_max_lat_fops) #endif @@ -6054,7 +6055,7 @@ trace_insert_eval_map_file(struct module *mod, struct trace_eval_map **start, static void trace_create_eval_file(struct dentry *d_tracer) { - trace_create_file("eval_map", 0444, d_tracer, + trace_create_file("eval_map", TRACE_MODE_READ, d_tracer, NULL, &tracing_eval_map_fops); } @@ -8567,27 +8568,27 @@ tracing_init_tracefs_percpu(struct trace_array *tr, long cpu) } /* per cpu trace_pipe */ - trace_create_cpu_file("trace_pipe", 0444, d_cpu, + trace_create_cpu_file("trace_pipe", TRACE_MODE_READ, d_cpu, tr, cpu, &tracing_pipe_fops); /* per cpu trace */ - trace_create_cpu_file("trace", 0644, d_cpu, + trace_create_cpu_file("trace", TRACE_MODE_WRITE, d_cpu, tr, cpu, &tracing_fops); - trace_create_cpu_file("trace_pipe_raw", 0444, d_cpu, + trace_create_cpu_file("trace_pipe_raw", TRACE_MODE_READ, d_cpu, tr, cpu, &tracing_buffers_fops); - trace_create_cpu_file("stats", 0444, d_cpu, + trace_create_cpu_file("stats", TRACE_MODE_READ, d_cpu, tr, cpu, &tracing_stats_fops); - trace_create_cpu_file("buffer_size_kb", 0444, d_cpu, + trace_create_cpu_file("buffer_size_kb", TRACE_MODE_READ, d_cpu, tr, cpu, &tracing_entries_fops); #ifdef CONFIG_TRACER_SNAPSHOT - trace_create_cpu_file("snapshot", 0644, d_cpu, + trace_create_cpu_file("snapshot", TRACE_MODE_WRITE, d_cpu, tr, cpu, &snapshot_fops); - trace_create_cpu_file("snapshot_raw", 0444, d_cpu, + trace_create_cpu_file("snapshot_raw", TRACE_MODE_READ, d_cpu, tr, cpu, &snapshot_raw_fops); #endif } @@ -8793,8 +8794,8 @@ create_trace_option_file(struct trace_array *tr, topt->opt = opt; topt->tr = tr; - topt->entry = trace_create_file(opt->name, 0644, t_options, topt, - &trace_options_fops); + topt->entry = trace_create_file(opt->name, TRACE_MODE_WRITE, + t_options, topt, &trace_options_fops); } @@ -8869,7 +8870,7 @@ create_trace_option_core_file(struct trace_array *tr, if (!t_options) return NULL; - return trace_create_file(option, 0644, t_options, + return trace_create_file(option, TRACE_MODE_WRITE, t_options, (void *)&tr->trace_flags_index[index], &trace_options_core_fops); } @@ -9394,28 +9395,28 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) struct trace_event_file *file; int cpu; - trace_create_file("available_tracers", 0444, d_tracer, + trace_create_file("available_tracers", TRACE_MODE_READ, d_tracer, tr, &show_traces_fops); - trace_create_file("current_tracer", 0644, d_tracer, + trace_create_file("current_tracer", TRACE_MODE_WRITE, d_tracer, tr, &set_tracer_fops); - trace_create_file("tracing_cpumask", 0644, d_tracer, + trace_create_file("tracing_cpumask", TRACE_MODE_WRITE, d_tracer, tr, &tracing_cpumask_fops); - trace_create_file("trace_options", 0644, d_tracer, + trace_create_file("trace_options", TRACE_MODE_WRITE, d_tracer, tr, &tracing_iter_fops); - trace_create_file("trace", 0644, d_tracer, + trace_create_file("trace", TRACE_MODE_WRITE, d_tracer, tr, &tracing_fops); - trace_create_file("trace_pipe", 0444, d_tracer, + trace_create_file("trace_pipe", TRACE_MODE_READ, d_tracer, tr, &tracing_pipe_fops); - trace_create_file("buffer_size_kb", 0644, d_tracer, + trace_create_file("buffer_size_kb", TRACE_MODE_WRITE, d_tracer, tr, &tracing_entries_fops); - trace_create_file("buffer_total_size_kb", 0444, d_tracer, + trace_create_file("buffer_total_size_kb", TRACE_MODE_READ, d_tracer, tr, &tracing_total_entries_fops); trace_create_file("free_buffer", 0200, d_tracer, @@ -9426,25 +9427,25 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) file = __find_event_file(tr, "ftrace", "print"); if (file && file->dir) - trace_create_file("trigger", 0644, file->dir, file, - &event_trigger_fops); + trace_create_file("trigger", TRACE_MODE_WRITE, file->dir, + file, &event_trigger_fops); tr->trace_marker_file = file; trace_create_file("trace_marker_raw", 0220, d_tracer, tr, &tracing_mark_raw_fops); - trace_create_file("trace_clock", 0644, d_tracer, tr, + trace_create_file("trace_clock", TRACE_MODE_WRITE, d_tracer, tr, &trace_clock_fops); - trace_create_file("tracing_on", 0644, d_tracer, + trace_create_file("tracing_on", TRACE_MODE_WRITE, d_tracer, tr, &rb_simple_fops); - trace_create_file("timestamp_mode", 0444, d_tracer, tr, + trace_create_file("timestamp_mode", TRACE_MODE_READ, d_tracer, tr, &trace_time_stamp_mode_fops); tr->buffer_percent = 50; - trace_create_file("buffer_percent", 0444, d_tracer, + trace_create_file("buffer_percent", TRACE_MODE_READ, d_tracer, tr, &buffer_percent_fops); create_trace_options_dir(tr); @@ -9457,11 +9458,11 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) MEM_FAIL(1, "Could not allocate function filter files"); #ifdef CONFIG_TRACER_SNAPSHOT - trace_create_file("snapshot", 0644, d_tracer, + trace_create_file("snapshot", TRACE_MODE_WRITE, d_tracer, tr, &snapshot_fops); #endif - trace_create_file("error_log", 0644, d_tracer, + trace_create_file("error_log", TRACE_MODE_WRITE, d_tracer, tr, &tracing_err_log_fops); for_each_tracing_cpu(cpu) @@ -9654,19 +9655,19 @@ static __init int tracer_init_tracefs(void) init_tracer_tracefs(&global_trace, NULL); ftrace_init_tracefs_toplevel(&global_trace, NULL); - trace_create_file("tracing_thresh", 0644, NULL, + trace_create_file("tracing_thresh", TRACE_MODE_WRITE, NULL, &global_trace, &tracing_thresh_fops); - trace_create_file("README", 0444, NULL, + trace_create_file("README", TRACE_MODE_READ, NULL, NULL, &tracing_readme_fops); - trace_create_file("saved_cmdlines", 0444, NULL, + trace_create_file("saved_cmdlines", TRACE_MODE_READ, NULL, NULL, &tracing_saved_cmdlines_fops); - trace_create_file("saved_cmdlines_size", 0644, NULL, + trace_create_file("saved_cmdlines_size", TRACE_MODE_WRITE, NULL, NULL, &tracing_saved_cmdlines_size_fops); - trace_create_file("saved_tgids", 0444, NULL, + trace_create_file("saved_tgids", TRACE_MODE_READ, NULL, NULL, &tracing_saved_tgids_fops); trace_eval_init(); @@ -9678,7 +9679,7 @@ static __init int tracer_init_tracefs(void) #endif #ifdef CONFIG_DYNAMIC_FTRACE - trace_create_file("dyn_ftrace_total_info", 0444, NULL, + trace_create_file("dyn_ftrace_total_info", TRACE_MODE_READ, NULL, NULL, &tracing_dyn_info_fops); #endif diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 2ec500186992..6c3808132b16 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -29,6 +29,9 @@ #include /* some archs define it here */ #endif +#define TRACE_MODE_WRITE 0640 +#define TRACE_MODE_READ 0440 + enum trace_type { __TRACE_FIRST_TYPE = 0, diff --git a/kernel/trace/trace_dynevent.c b/kernel/trace/trace_dynevent.c index 1110112e55bd..e34e8182ee4b 100644 --- a/kernel/trace/trace_dynevent.c +++ b/kernel/trace/trace_dynevent.c @@ -262,7 +262,7 @@ static __init int init_dynamic_event(void) if (ret) return 0; - entry = tracefs_create_file("dynamic_events", 0644, NULL, + entry = tracefs_create_file("dynamic_events", TRACE_MODE_WRITE, NULL, NULL, &dynamic_events_ops); /* Event list interface */ diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index bf54d2803261..4021b9a79f93 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -2312,7 +2312,8 @@ event_subsystem_dir(struct trace_array *tr, const char *name, /* the ftrace system is special, do not create enable or filter files */ if (strcmp(name, "ftrace") != 0) { - entry = tracefs_create_file("filter", 0644, dir->entry, dir, + entry = tracefs_create_file("filter", TRACE_MODE_WRITE, + dir->entry, dir, &ftrace_subsystem_filter_fops); if (!entry) { kfree(system->filter); @@ -2320,7 +2321,7 @@ event_subsystem_dir(struct trace_array *tr, const char *name, pr_warn("Could not create tracefs '%s/filter' entry\n", name); } - trace_create_file("enable", 0644, dir->entry, dir, + trace_create_file("enable", TRACE_MODE_WRITE, dir->entry, dir, &ftrace_system_enable_fops); } @@ -2402,12 +2403,12 @@ event_create_dir(struct dentry *parent, struct trace_event_file *file) } if (call->class->reg && !(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)) - trace_create_file("enable", 0644, file->dir, file, + trace_create_file("enable", TRACE_MODE_WRITE, file->dir, file, &ftrace_enable_fops); #ifdef CONFIG_PERF_EVENTS if (call->event.type && call->class->reg) - trace_create_file("id", 0444, file->dir, + trace_create_file("id", TRACE_MODE_READ, file->dir, (void *)(long)call->event.type, &ftrace_event_id_fops); #endif @@ -2423,22 +2424,22 @@ event_create_dir(struct dentry *parent, struct trace_event_file *file) * triggers or filters. */ if (!(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)) { - trace_create_file("filter", 0644, file->dir, file, - &ftrace_event_filter_fops); + trace_create_file("filter", TRACE_MODE_WRITE, file->dir, + file, &ftrace_event_filter_fops); - trace_create_file("trigger", 0644, file->dir, file, - &event_trigger_fops); + trace_create_file("trigger", TRACE_MODE_WRITE, file->dir, + file, &event_trigger_fops); } #ifdef CONFIG_HIST_TRIGGERS - trace_create_file("hist", 0444, file->dir, file, + trace_create_file("hist", TRACE_MODE_READ, file->dir, file, &event_hist_fops); #endif #ifdef CONFIG_HIST_TRIGGERS_DEBUG - trace_create_file("hist_debug", 0444, file->dir, file, + trace_create_file("hist_debug", TRACE_MODE_READ, file->dir, file, &event_hist_debug_fops); #endif - trace_create_file("format", 0444, file->dir, call, + trace_create_file("format", TRACE_MODE_READ, file->dir, call, &ftrace_event_format_fops); #ifdef CONFIG_TRACE_EVENT_INJECT @@ -3433,7 +3434,7 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr) struct dentry *d_events; struct dentry *entry; - entry = tracefs_create_file("set_event", 0644, parent, + entry = tracefs_create_file("set_event", TRACE_MODE_WRITE, parent, tr, &ftrace_set_event_fops); if (!entry) { pr_warn("Could not create tracefs 'set_event' entry\n"); @@ -3446,7 +3447,7 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr) return -ENOMEM; } - entry = trace_create_file("enable", 0644, d_events, + entry = trace_create_file("enable", TRACE_MODE_WRITE, d_events, tr, &ftrace_tr_enable_fops); if (!entry) { pr_warn("Could not create tracefs 'enable' entry\n"); @@ -3455,24 +3456,25 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr) /* There are not as crucial, just warn if they are not created */ - entry = tracefs_create_file("set_event_pid", 0644, parent, + entry = tracefs_create_file("set_event_pid", TRACE_MODE_WRITE, parent, tr, &ftrace_set_event_pid_fops); if (!entry) pr_warn("Could not create tracefs 'set_event_pid' entry\n"); - entry = tracefs_create_file("set_event_notrace_pid", 0644, parent, - tr, &ftrace_set_event_notrace_pid_fops); + entry = tracefs_create_file("set_event_notrace_pid", + TRACE_MODE_WRITE, parent, tr, + &ftrace_set_event_notrace_pid_fops); if (!entry) pr_warn("Could not create tracefs 'set_event_notrace_pid' entry\n"); /* ring buffer internal formats */ - entry = trace_create_file("header_page", 0444, d_events, + entry = trace_create_file("header_page", TRACE_MODE_READ, d_events, ring_buffer_print_page_header, &ftrace_show_header_fops); if (!entry) pr_warn("Could not create tracefs 'header_page' entry\n"); - entry = trace_create_file("header_event", 0444, d_events, + entry = trace_create_file("header_event", TRACE_MODE_READ, d_events, ring_buffer_print_entry_header, &ftrace_show_header_fops); if (!entry) @@ -3689,8 +3691,8 @@ __init int event_trace_init(void) if (!tr) return -ENODEV; - entry = tracefs_create_file("available_events", 0444, NULL, - tr, &ftrace_avail_fops); + entry = tracefs_create_file("available_events", TRACE_MODE_READ, + NULL, tr, &ftrace_avail_fops); if (!entry) pr_warn("Could not create tracefs 'available_events' entry\n"); diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c index d54094b7a9d7..22db3ce95e74 100644 --- a/kernel/trace/trace_events_synth.c +++ b/kernel/trace/trace_events_synth.c @@ -2227,8 +2227,8 @@ static __init int trace_events_synth_init(void) if (err) goto err; - entry = tracefs_create_file("synthetic_events", 0644, NULL, - NULL, &synth_events_fops); + entry = tracefs_create_file("synthetic_events", TRACE_MODE_WRITE, + NULL, NULL, &synth_events_fops); if (!entry) { err = -ENODEV; goto err; diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 0de6837722da..6b5ff3ba4251 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -1340,7 +1340,7 @@ static __init int init_graph_tracefs(void) if (ret) return 0; - trace_create_file("max_graph_depth", 0644, NULL, + trace_create_file("max_graph_depth", TRACE_MODE_WRITE, NULL, NULL, &graph_depth_fops); return 0; diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c index 1b83d75eb103..d0a730d99a33 100644 --- a/kernel/trace/trace_hwlat.c +++ b/kernel/trace/trace_hwlat.c @@ -782,21 +782,21 @@ static int init_tracefs(void) if (!top_dir) return -ENOMEM; - hwlat_sample_window = tracefs_create_file("window", 0640, + hwlat_sample_window = tracefs_create_file("window", TRACE_MODE_WRITE, top_dir, &hwlat_window, &trace_min_max_fops); if (!hwlat_sample_window) goto err; - hwlat_sample_width = tracefs_create_file("width", 0644, + hwlat_sample_width = tracefs_create_file("width", TRACE_MODE_WRITE, top_dir, &hwlat_width, &trace_min_max_fops); if (!hwlat_sample_width) goto err; - hwlat_thread_mode = trace_create_file("mode", 0644, + hwlat_thread_mode = trace_create_file("mode", TRACE_MODE_WRITE, top_dir, NULL, &thread_mode_fops); diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 0e1e7ce5f7ed..33272a7b6912 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1925,16 +1925,16 @@ static __init int init_kprobe_trace(void) if (ret) return 0; - entry = tracefs_create_file("kprobe_events", 0644, NULL, - NULL, &kprobe_events_ops); + entry = tracefs_create_file("kprobe_events", TRACE_MODE_WRITE, + NULL, NULL, &kprobe_events_ops); /* Event list interface */ if (!entry) pr_warn("Could not create tracefs 'kprobe_events' entry\n"); /* Profile interface */ - entry = tracefs_create_file("kprobe_profile", 0444, NULL, - NULL, &kprobe_profile_ops); + entry = tracefs_create_file("kprobe_profile", TRACE_MODE_READ, + NULL, NULL, &kprobe_profile_ops); if (!entry) pr_warn("Could not create tracefs 'kprobe_profile' entry\n"); diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index ce053619f289..c4f14fb98aaa 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -1856,38 +1856,38 @@ static int init_tracefs(void) if (!top_dir) return 0; - tmp = tracefs_create_file("period_us", 0640, top_dir, + tmp = tracefs_create_file("period_us", TRACE_MODE_WRITE, top_dir, &osnoise_period, &trace_min_max_fops); if (!tmp) goto err; - tmp = tracefs_create_file("runtime_us", 0644, top_dir, + tmp = tracefs_create_file("runtime_us", TRACE_MODE_WRITE, top_dir, &osnoise_runtime, &trace_min_max_fops); if (!tmp) goto err; - tmp = tracefs_create_file("stop_tracing_us", 0640, top_dir, + tmp = tracefs_create_file("stop_tracing_us", TRACE_MODE_WRITE, top_dir, &osnoise_stop_tracing_in, &trace_min_max_fops); if (!tmp) goto err; - tmp = tracefs_create_file("stop_tracing_total_us", 0640, top_dir, + tmp = tracefs_create_file("stop_tracing_total_us", TRACE_MODE_WRITE, top_dir, &osnoise_stop_tracing_total, &trace_min_max_fops); if (!tmp) goto err; - tmp = trace_create_file("cpus", 0644, top_dir, NULL, &cpus_fops); + tmp = trace_create_file("cpus", TRACE_MODE_WRITE, top_dir, NULL, &cpus_fops); if (!tmp) goto err; #ifdef CONFIG_TIMERLAT_TRACER #ifdef CONFIG_STACKTRACE - tmp = tracefs_create_file("print_stack", 0640, top_dir, + tmp = tracefs_create_file("print_stack", TRACE_MODE_WRITE, top_dir, &osnoise_print_stack, &trace_min_max_fops); if (!tmp) goto err; #endif - tmp = tracefs_create_file("timerlat_period_us", 0640, top_dir, + tmp = tracefs_create_file("timerlat_period_us", TRACE_MODE_WRITE, top_dir, &timerlat_period, &trace_min_max_fops); if (!tmp) goto err; diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index 4b320fe7df70..29f6e95439b6 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c @@ -384,7 +384,7 @@ static __init int init_trace_printk_function_export(void) if (ret) return 0; - trace_create_file("printk_formats", 0444, NULL, + trace_create_file("printk_formats", TRACE_MODE_READ, NULL, NULL, &ftrace_formats_fops); return 0; diff --git a/kernel/trace/trace_recursion_record.c b/kernel/trace/trace_recursion_record.c index b2edac1fe156..4d4b78c8ca25 100644 --- a/kernel/trace/trace_recursion_record.c +++ b/kernel/trace/trace_recursion_record.c @@ -226,8 +226,8 @@ __init static int create_recursed_functions(void) { struct dentry *dentry; - dentry = trace_create_file("recursed_functions", 0644, NULL, NULL, - &recursed_functions_fops); + dentry = trace_create_file("recursed_functions", TRACE_MODE_WRITE, + NULL, NULL, &recursed_functions_fops); if (!dentry) pr_warn("WARNING: Failed to create recursed_functions\n"); return 0; diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 63c285042051..5a48dba912ea 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -559,14 +559,14 @@ static __init int stack_trace_init(void) if (ret) return 0; - trace_create_file("stack_max_size", 0644, NULL, + trace_create_file("stack_max_size", TRACE_MODE_WRITE, NULL, &stack_trace_max_size, &stack_max_size_fops); - trace_create_file("stack_trace", 0444, NULL, + trace_create_file("stack_trace", TRACE_MODE_READ, NULL, NULL, &stack_trace_fops); #ifdef CONFIG_DYNAMIC_FTRACE - trace_create_file("stack_trace_filter", 0644, NULL, + trace_create_file("stack_trace_filter", TRACE_MODE_WRITE, NULL, &trace_ops, &stack_trace_filter_fops); #endif diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index 8d141c3825a9..bb247beec447 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -297,9 +297,9 @@ static int init_stat_file(struct stat_session *session) if (!stat_dir && (ret = tracing_stat_init())) return ret; - session->file = tracefs_create_file(session->ts->name, 0644, - stat_dir, - session, &tracing_stat_fops); + session->file = tracefs_create_file(session->ts->name, TRACE_MODE_WRITE, + stat_dir, session, + &tracing_stat_fops); if (!session->file) return -ENOMEM; return 0; diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 225ce569bf8f..0a5c0db3137e 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -1655,10 +1655,10 @@ static __init int init_uprobe_trace(void) if (ret) return 0; - trace_create_file("uprobe_events", 0644, NULL, + trace_create_file("uprobe_events", TRACE_MODE_WRITE, NULL, NULL, &uprobe_events_ops); /* Profile interface */ - trace_create_file("uprobe_profile", 0444, NULL, + trace_create_file("uprobe_profile", TRACE_MODE_READ, NULL, NULL, &uprobe_profile_ops); return 0; } -- cgit v1.2.3 From 6644c654ea70e0d8b8d5111e1272f8f29df00f21 Mon Sep 17 00:00:00 2001 From: Weizhao Ouyang Date: Thu, 9 Sep 2021 17:02:16 +0800 Subject: ftrace: Cleanup ftrace_dyn_arch_init() Most of ARCHs use empty ftrace_dyn_arch_init(), introduce a weak common ftrace_dyn_arch_init() to cleanup them. Link: https://lkml.kernel.org/r/20210909090216.1955240-1-o451686892@gmail.com Acked-by: Heiko Carstens (s390) Acked-by: Helge Deller (parisc) Signed-off-by: Weizhao Ouyang Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 0a0dbc2d411b..2c3e9760df7f 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -6847,6 +6847,11 @@ void __init ftrace_free_init_mem(void) ftrace_free_mem(NULL, start, end); } +int __init __weak ftrace_dyn_arch_init(void) +{ + return 0; +} + void __init ftrace_init(void) { extern unsigned long __start_mcount_loc[]; -- cgit v1.2.3 From 43c9dd8ddf4efdce126e0a0b176d729c72445b0f Mon Sep 17 00:00:00 2001 From: Carles Pey Date: Sat, 18 Sep 2021 19:30:43 +0400 Subject: ftrace: Add unit test for removing trace function A self test is provided for the trace function removal functionality. Link: https://lkml.kernel.org/r/20210918153043.318016-2-carles.pey@gmail.com Signed-off-by: Carles Pey Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_selftest.c | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index adf7ef194005..875b4f1a0476 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -287,6 +287,40 @@ static int trace_selftest_ops(struct trace_array *tr, int cnt) if (trace_selftest_test_probe3_cnt != 4) goto out_free; + /* Remove trace function from probe 3 */ + func1_name = "!" __stringify(DYN_FTRACE_TEST_NAME); + len1 = strlen(func1_name); + + ftrace_set_filter(&test_probe3, func1_name, len1, 0); + + DYN_FTRACE_TEST_NAME(); + + print_counts(); + + if (trace_selftest_test_probe1_cnt != 3) + goto out_free; + if (trace_selftest_test_probe2_cnt != 2) + goto out_free; + if (trace_selftest_test_probe3_cnt != 4) + goto out_free; + if (cnt > 1) { + if (trace_selftest_test_global_cnt == 0) + goto out_free; + } + if (trace_selftest_test_dyn_cnt == 0) + goto out_free; + + DYN_FTRACE_TEST_NAME2(); + + print_counts(); + + if (trace_selftest_test_probe1_cnt != 3) + goto out_free; + if (trace_selftest_test_probe2_cnt != 3) + goto out_free; + if (trace_selftest_test_probe3_cnt != 5) + goto out_free; + ret = 0; out_free: unregister_ftrace_function(dyn_ops); -- cgit v1.2.3 From affc659246293df42ba2d184c674cc959c05aa02 Mon Sep 17 00:00:00 2001 From: Changbin Du Date: Thu, 30 Sep 2021 08:03:42 +0800 Subject: tracing: in_irq() cleanup Replace the obsolete and ambiguos macro in_irq() with new macro in_hardirq(). Link: https://lkml.kernel.org/r/20210930000342.6016-1-changbin.du@gmail.com Reviewed-by: Petr Mladek Signed-off-by: Changbin Du Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.h | 2 +- kernel/trace/trace_functions_graph.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 6c3808132b16..6b60ab9475ed 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -890,7 +890,7 @@ static inline int ftrace_graph_addr(struct ftrace_graph_ent *trace) * is set, and called by an interrupt handler, we still * want to trace it. */ - if (in_irq()) + if (in_hardirq()) trace_recursion_set(TRACE_IRQ_BIT); else trace_recursion_clear(TRACE_IRQ_BIT); diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 6b5ff3ba4251..203204cadf92 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -120,7 +120,7 @@ static inline int ftrace_graph_ignore_irqs(void) if (!ftrace_graph_skip_irqs || trace_recursion_test(TRACE_IRQ_BIT)) return 0; - return in_irq(); + return in_hardirq(); } int trace_graph_entry(struct ftrace_graph_ent *trace) -- cgit v1.2.3 From 34cdd18b8d245f3e901e5325313c27de727ab80d Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 17 Jun 2020 16:56:16 -0400 Subject: tracing: Use linker magic instead of recasting ftrace_ops_list_func() In an effort to enable -Wcast-function-type in the top-level Makefile to support Control Flow Integrity builds, all function casts need to be removed. This means that ftrace_ops_list_func() can no longer be defined as ftrace_ops_no_ops(). The reason for ftrace_ops_no_ops() is to use that when an architecture calls ftrace_ops_list_func() with only two parameters (called from assembly). And to make sure there's no C side-effects, those archs call ftrace_ops_no_ops() which only has two parameters, as ftrace_ops_list_func() has four parameters. Instead of a typecast, use vmlinux.lds.h to define ftrace_ops_list_func() to arch_ftrace_ops_list_func() that will define the proper set of parameters. Link: https://lore.kernel.org/r/20200614070154.6039-1-oscar.carter@gmx.com Link: https://lkml.kernel.org/r/20200617165616.52241bde@oasis.local.home Link: https://lore.kernel.org/all/20211005053922.GA702049@embeddedor/ Requested-by: Oscar Carter Reported-by: kernel test robot Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 2c3e9760df7f..8b5801881271 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -119,14 +119,9 @@ struct ftrace_ops __rcu *ftrace_ops_list __read_mostly = &ftrace_list_end; ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; struct ftrace_ops global_ops; -#if ARCH_SUPPORTS_FTRACE_OPS -static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op, struct ftrace_regs *fregs); -#else -/* See comment below, where ftrace_ops_list_func is defined */ -static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip); -#define ftrace_ops_list_func ((ftrace_func_t)ftrace_ops_no_ops) -#endif +/* Defined by vmlinux.lds.h see the commment above arch_ftrace_ops_list_func for details */ +void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct ftrace_regs *fregs); static inline void ftrace_ops_init(struct ftrace_ops *ops) { @@ -7032,21 +7027,23 @@ out: * Note, CONFIG_DYNAMIC_FTRACE_WITH_REGS expects a full regs to be saved. * An architecture can pass partial regs with ftrace_ops and still * set the ARCH_SUPPORTS_FTRACE_OPS. + * + * In vmlinux.lds.h, ftrace_ops_list_func() is defined to be + * arch_ftrace_ops_list_func. */ #if ARCH_SUPPORTS_FTRACE_OPS -static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op, struct ftrace_regs *fregs) +void arch_ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct ftrace_regs *fregs) { __ftrace_ops_list_func(ip, parent_ip, NULL, fregs); } -NOKPROBE_SYMBOL(ftrace_ops_list_func); #else -static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip) +void arch_ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip) { __ftrace_ops_list_func(ip, parent_ip, NULL, NULL); } -NOKPROBE_SYMBOL(ftrace_ops_no_ops); #endif +NOKPROBE_SYMBOL(arch_ftrace_ops_list_func); /* * If there's only one function registered but it does not support -- cgit v1.2.3 From 7ce1bb83a14019f8c396d57ec704d19478747716 Mon Sep 17 00:00:00 2001 From: Kalesh Singh Date: Wed, 13 Oct 2021 21:52:17 -0700 Subject: tracing/cfi: Fix cmp_entries_* functions signature mismatch If CONFIG_CFI_CLANG=y, attempting to read an event histogram will cause the kernel to panic due to failed CFI check. 1. echo 'hist:keys=common_pid' >> events/sched/sched_switch/trigger 2. cat events/sched/sched_switch/hist 3. kernel panics on attempting to read hist This happens because the sort() function expects a generic int (*)(const void *, const void *) pointer for the compare function. To prevent this CFI failure, change tracing map cmp_entries_* function signatures to match this. Also, fix the build error reported by the kernel test robot [1]. [1] https://lore.kernel.org/r/202110141140.zzi4dRh4-lkp@intel.com/ Link: https://lkml.kernel.org/r/20211014045217.3265162-1-kaleshsingh@google.com Signed-off-by: Kalesh Singh Reported-by: kernel test robot Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/tracing_map.c | 40 +++++++++++++++++++++++----------------- 1 file changed, 23 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c index d6bddb157ef2..39bb56d2dcbe 100644 --- a/kernel/trace/tracing_map.c +++ b/kernel/trace/tracing_map.c @@ -834,29 +834,35 @@ int tracing_map_init(struct tracing_map *map) return err; } -static int cmp_entries_dup(const struct tracing_map_sort_entry **a, - const struct tracing_map_sort_entry **b) +static int cmp_entries_dup(const void *A, const void *B) { + const struct tracing_map_sort_entry *a, *b; int ret = 0; - if (memcmp((*a)->key, (*b)->key, (*a)->elt->map->key_size)) + a = *(const struct tracing_map_sort_entry **)A; + b = *(const struct tracing_map_sort_entry **)B; + + if (memcmp(a->key, b->key, a->elt->map->key_size)) ret = 1; return ret; } -static int cmp_entries_sum(const struct tracing_map_sort_entry **a, - const struct tracing_map_sort_entry **b) +static int cmp_entries_sum(const void *A, const void *B) { const struct tracing_map_elt *elt_a, *elt_b; + const struct tracing_map_sort_entry *a, *b; struct tracing_map_sort_key *sort_key; struct tracing_map_field *field; tracing_map_cmp_fn_t cmp_fn; void *val_a, *val_b; int ret = 0; - elt_a = (*a)->elt; - elt_b = (*b)->elt; + a = *(const struct tracing_map_sort_entry **)A; + b = *(const struct tracing_map_sort_entry **)B; + + elt_a = a->elt; + elt_b = b->elt; sort_key = &elt_a->map->sort_key; @@ -873,18 +879,21 @@ static int cmp_entries_sum(const struct tracing_map_sort_entry **a, return ret; } -static int cmp_entries_key(const struct tracing_map_sort_entry **a, - const struct tracing_map_sort_entry **b) +static int cmp_entries_key(const void *A, const void *B) { const struct tracing_map_elt *elt_a, *elt_b; + const struct tracing_map_sort_entry *a, *b; struct tracing_map_sort_key *sort_key; struct tracing_map_field *field; tracing_map_cmp_fn_t cmp_fn; void *val_a, *val_b; int ret = 0; - elt_a = (*a)->elt; - elt_b = (*b)->elt; + a = *(const struct tracing_map_sort_entry **)A; + b = *(const struct tracing_map_sort_entry **)B; + + elt_a = a->elt; + elt_b = b->elt; sort_key = &elt_a->map->sort_key; @@ -989,10 +998,8 @@ static void sort_secondary(struct tracing_map *map, struct tracing_map_sort_key *primary_key, struct tracing_map_sort_key *secondary_key) { - int (*primary_fn)(const struct tracing_map_sort_entry **, - const struct tracing_map_sort_entry **); - int (*secondary_fn)(const struct tracing_map_sort_entry **, - const struct tracing_map_sort_entry **); + int (*primary_fn)(const void *, const void *); + int (*secondary_fn)(const void *, const void *); unsigned i, start = 0, n_sub = 1; if (is_key(map, primary_key->field_idx)) @@ -1061,8 +1068,7 @@ int tracing_map_sort_entries(struct tracing_map *map, unsigned int n_sort_keys, struct tracing_map_sort_entry ***sort_entries) { - int (*cmp_entries_fn)(const struct tracing_map_sort_entry **, - const struct tracing_map_sort_entry **); + int (*cmp_entries_fn)(const void *, const void *); struct tracing_map_sort_entry *sort_entry, **entries; int i, n_entries, ret; -- cgit v1.2.3 From 9b84fadc444de5456ab5f5487e2108311c724c3f Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 15 Oct 2021 13:42:40 -0400 Subject: tracing: Reuse logic from perf's get_recursion_context() Instead of having branches that adds noise to the branch prediction, use the addition logic to set the bit for the level of interrupt context that the state is currently in. This copies the logic from perf's get_recursion_context() function. Link: https://lore.kernel.org/all/20211015161702.GF174703@worktop.programming.kicks-ass.net/ Suggested-by: Peter Zijlstra Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ring_buffer.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index c5a3fbf19617..15d4380006e3 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -3168,13 +3168,13 @@ trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer) { unsigned int val = cpu_buffer->current_context; unsigned long pc = preempt_count(); - int bit; + int bit = 0; - if (!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET))) - bit = RB_CTX_NORMAL; - else - bit = pc & NMI_MASK ? RB_CTX_NMI : - pc & HARDIRQ_MASK ? RB_CTX_IRQ : RB_CTX_SOFTIRQ; + bit += !!(pc & (NMI_MASK)); + bit += !!(pc & (NMI_MASK | HARDIRQ_MASK)); + bit += !!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET)); + + bit = RB_CTX_NORMAL - bit; if (unlikely(val & (1 << (bit + cpu_buffer->nest)))) { /* -- cgit v1.2.3 From 91ebe8bcbff9d2ff21303e73bf7434f39a98b255 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 15 Oct 2021 15:01:19 -0400 Subject: tracing/perf: Add interrupt_context_level() helper Now that there are three different instances of doing the addition trick to the preempt_count() and NMI_MASK, HARDIRQ_MASK and SOFTIRQ_OFFSET macros, it deserves a helper function defined in the preempt.h header. Add the interrupt_context_level() helper and replace the three instances that do that logic with it. Link: https://lore.kernel.org/all/20211015142541.4badd8a9@gandalf.local.home/ Signed-off-by: Steven Rostedt (VMware) --- kernel/events/internal.h | 7 +------ kernel/trace/ring_buffer.c | 7 +------ 2 files changed, 2 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/events/internal.h b/kernel/events/internal.h index 228801e20788..082832738c8f 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -205,12 +205,7 @@ DEFINE_OUTPUT_COPY(__output_copy_user, arch_perf_out_copy_user) static inline int get_recursion_context(int *recursion) { - unsigned int pc = preempt_count(); - unsigned char rctx = 0; - - rctx += !!(pc & (NMI_MASK)); - rctx += !!(pc & (NMI_MASK | HARDIRQ_MASK)); - rctx += !!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET)); + unsigned char rctx = interrupt_context_level(); if (recursion[rctx]) return -1; diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 15d4380006e3..f6520d0a4c8c 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -3167,12 +3167,7 @@ static __always_inline int trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer) { unsigned int val = cpu_buffer->current_context; - unsigned long pc = preempt_count(); - int bit = 0; - - bit += !!(pc & (NMI_MASK)); - bit += !!(pc & (NMI_MASK | HARDIRQ_MASK)); - bit += !!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET)); + int bit = interrupt_context_level(); bit = RB_CTX_NORMAL - bit; -- cgit v1.2.3 From 0c0593b45c9b4e5b212ffb3fb28bb8d3c0ec0dc8 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 8 Oct 2021 11:13:31 +0200 Subject: x86/ftrace: Make function graph use ftrace directly We don't need special hook for graph tracer entry point, but instead we can use graph_ops::func function to install the return_hooker. This moves the graph tracing setup _before_ the direct trampoline prepares the stack, so the return_hooker will be called when the direct trampoline is finished. This simplifies the code, because we don't need to take into account the direct trampoline setup when preparing the graph tracer hooker and we can allow function graph tracer on entries registered with direct trampoline. Link: https://lkml.kernel.org/r/20211008091336.33616-4-jolsa@kernel.org [fixed compile error reported by kernel test robot ] Signed-off-by: Steven Rostedt (VMware) Signed-off-by: Jiri Olsa Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/fgraph.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index b8a0d1d564fb..22061d38fc00 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -115,6 +115,7 @@ int function_graph_enter(unsigned long ret, unsigned long func, { struct ftrace_graph_ent trace; +#ifndef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS /* * Skip graph tracing if the return location is served by direct trampoline, * since call sequence and return addresses are unpredictable anyway. @@ -124,6 +125,7 @@ int function_graph_enter(unsigned long ret, unsigned long func, if (ftrace_direct_func_count && ftrace_find_rec_direct(ret - MCOUNT_INSN_SIZE)) return -EBUSY; +#endif trace.func = func; trace.depth = ++current->curr_ret_depth; @@ -333,10 +335,10 @@ unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx, #endif /* HAVE_FUNCTION_GRAPH_RET_ADDR_PTR */ static struct ftrace_ops graph_ops = { - .func = ftrace_stub, + .func = ftrace_graph_func, .flags = FTRACE_OPS_FL_INITIALIZED | FTRACE_OPS_FL_PID | - FTRACE_OPS_FL_STUB, + FTRACE_OPS_GRAPH_STUB, #ifdef FTRACE_GRAPH_TRAMP_ADDR .trampoline = FTRACE_GRAPH_TRAMP_ADDR, /* trampoline_size is only needed for dynamically allocated tramps */ -- cgit v1.2.3 From 130c08065848a98163b243b55e99f66c24609efb Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 8 Oct 2021 11:13:32 +0200 Subject: tracing: Add trampoline/graph selftest Adding selftest for checking that direct trampoline can co-exist together with graph tracer on same function. This is supported for CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS config option, which is defined only for x86_64 for now. Link: https://lkml.kernel.org/r/20211008091336.33616-5-jolsa@kernel.org Signed-off-by: Jiri Olsa Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_selftest.c | 54 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 53 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 875b4f1a0476..3404a245417e 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -784,6 +784,8 @@ static struct fgraph_ops fgraph_ops __initdata = { .retfunc = &trace_graph_return, }; +noinline __noclone static void trace_direct_tramp(void) { } + /* * Pretty much the same than for the function tracer from which the selftest * has been borrowed. @@ -794,6 +796,7 @@ trace_selftest_startup_function_graph(struct tracer *trace, { int ret; unsigned long count; + char *func_name __maybe_unused; #ifdef CONFIG_DYNAMIC_FTRACE if (ftrace_filter_param) { @@ -842,8 +845,57 @@ trace_selftest_startup_function_graph(struct tracer *trace, goto out; } - /* Don't test dynamic tracing, the function tracer already did */ +#ifdef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS + tracing_reset_online_cpus(&tr->array_buffer); + set_graph_array(tr); + + /* + * Some archs *cough*PowerPC*cough* add characters to the + * start of the function names. We simply put a '*' to + * accommodate them. + */ + func_name = "*" __stringify(DYN_FTRACE_TEST_NAME); + ftrace_set_global_filter(func_name, strlen(func_name), 1); + + /* + * Register direct function together with graph tracer + * and make sure we get graph trace. + */ + ret = register_ftrace_direct((unsigned long) DYN_FTRACE_TEST_NAME, + (unsigned long) trace_direct_tramp); + if (ret) + goto out; + + ret = register_ftrace_graph(&fgraph_ops); + if (ret) { + warn_failed_init_tracer(trace, ret); + goto out; + } + + DYN_FTRACE_TEST_NAME(); + + count = 0; + + tracing_stop(); + /* check the trace buffer */ + ret = trace_test_buffer(&tr->array_buffer, &count); + + unregister_ftrace_graph(&fgraph_ops); + + ret = unregister_ftrace_direct((unsigned long) DYN_FTRACE_TEST_NAME, + (unsigned long) trace_direct_tramp); + if (ret) + goto out; + + tracing_start(); + if (!ret && !count) { + ret = -1; + goto out; + } +#endif + + /* Don't test dynamic tracing, the function tracer already did */ out: /* Stop it if we failed */ if (ret) -- cgit v1.2.3 From 4e341cad6b7a58376bfc6d1c8347727d094a6274 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 21 Oct 2021 13:43:57 -0400 Subject: tracing: Fix selftest config check for function graph start up test There's a new test in trace_selftest_startup_function_graph() that requires the use of ftrace args being supported as well does some tricks with dynamic tracing. Although this code checks HAVE_DYNAMIC_FTRACE_WITH_ARGS it fails to check DYNAMIC_FTRACE, and the kernel fails to build due to that dependency. Also only define the prototype of trace_direct_tramp() if it is used. Link: https://lkml.kernel.org/r/20211021134357.7f48e173@gandalf.local.home Acked-by: Jiri Olsa Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_selftest.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 3404a245417e..afd937a46496 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -784,7 +784,11 @@ static struct fgraph_ops fgraph_ops __initdata = { .retfunc = &trace_graph_return, }; +#if defined(CONFIG_DYNAMIC_FTRACE) && \ + defined(CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS) +#define TEST_DIRECT_TRAMP noinline __noclone static void trace_direct_tramp(void) { } +#endif /* * Pretty much the same than for the function tracer from which the selftest @@ -845,7 +849,7 @@ trace_selftest_startup_function_graph(struct tracer *trace, goto out; } -#ifdef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS +#ifdef TEST_DIRECT_TRAMP tracing_reset_online_cpus(&tr->array_buffer); set_graph_array(tr); -- cgit v1.2.3 From 1904a8144598031af85406873c5fbec806ee3fd7 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 8 Oct 2021 11:13:33 +0200 Subject: ftrace: Add ftrace_add_rec_direct function Factor out the code that adds (ip, addr) tuple to direct_functions hash in new ftrace_add_rec_direct function. It will be used in following patches. Link: https://lkml.kernel.org/r/20211008091336.33616-6-jolsa@kernel.org Signed-off-by: Jiri Olsa Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 64 +++++++++++++++++++++++++++++---------------------- 1 file changed, 37 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 8b5801881271..ccbd8377e580 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2390,6 +2390,39 @@ unsigned long ftrace_find_rec_direct(unsigned long ip) return entry->direct; } +static struct ftrace_func_entry* +ftrace_add_rec_direct(unsigned long ip, unsigned long addr, + struct ftrace_hash **free_hash) +{ + struct ftrace_func_entry *entry; + + if (ftrace_hash_empty(direct_functions) || + direct_functions->count > 2 * (1 << direct_functions->size_bits)) { + struct ftrace_hash *new_hash; + int size = ftrace_hash_empty(direct_functions) ? 0 : + direct_functions->count + 1; + + if (size < 32) + size = 32; + + new_hash = dup_hash(direct_functions, size); + if (!new_hash) + return NULL; + + *free_hash = direct_functions; + direct_functions = new_hash; + } + + entry = kmalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) + return NULL; + + entry->ip = ip; + entry->direct = addr; + __add_hash_entry(direct_functions, entry); + return entry; +} + static void call_direct_funcs(unsigned long ip, unsigned long pip, struct ftrace_ops *ops, struct ftrace_regs *fregs) { @@ -5106,39 +5139,16 @@ int register_ftrace_direct(unsigned long ip, unsigned long addr) } ret = -ENOMEM; - if (ftrace_hash_empty(direct_functions) || - direct_functions->count > 2 * (1 << direct_functions->size_bits)) { - struct ftrace_hash *new_hash; - int size = ftrace_hash_empty(direct_functions) ? 0 : - direct_functions->count + 1; - - if (size < 32) - size = 32; - - new_hash = dup_hash(direct_functions, size); - if (!new_hash) - goto out_unlock; - - free_hash = direct_functions; - direct_functions = new_hash; - } - - entry = kmalloc(sizeof(*entry), GFP_KERNEL); - if (!entry) - goto out_unlock; - direct = ftrace_find_direct_func(addr); if (!direct) { direct = ftrace_alloc_direct_func(addr); - if (!direct) { - kfree(entry); + if (!direct) goto out_unlock; - } } - entry->ip = ip; - entry->direct = addr; - __add_hash_entry(direct_functions, entry); + entry = ftrace_add_rec_direct(ip, addr, &free_hash); + if (!entry) + goto out_unlock; ret = ftrace_set_filter_ip(&direct_ops, ip, 0, 0); if (ret) -- cgit v1.2.3 From f64dd4627ec6edc39bf1430fe6dbc923d2300a88 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 8 Oct 2021 11:13:34 +0200 Subject: ftrace: Add multi direct register/unregister interface Adding interface to register multiple direct functions within single call. Adding following functions: register_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr) unregister_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr) The register_ftrace_direct_multi registers direct function (addr) with all functions in ops filter. The ops filter can be updated before with ftrace_set_filter_ip calls. All requested functions must not have direct function currently registered, otherwise register_ftrace_direct_multi will fail. The unregister_ftrace_direct_multi unregisters ops related direct functions. Link: https://lkml.kernel.org/r/20211008091336.33616-7-jolsa@kernel.org Signed-off-by: Jiri Olsa Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 142 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 142 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index ccbd8377e580..a05b25fb77d8 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -5401,6 +5401,148 @@ int modify_ftrace_direct(unsigned long ip, return ret; } EXPORT_SYMBOL_GPL(modify_ftrace_direct); + +#define MULTI_FLAGS (FTRACE_OPS_FL_IPMODIFY | FTRACE_OPS_FL_DIRECT | \ + FTRACE_OPS_FL_SAVE_REGS) + +static int check_direct_multi(struct ftrace_ops *ops) +{ + if (!(ops->flags & FTRACE_OPS_FL_INITIALIZED)) + return -EINVAL; + if ((ops->flags & MULTI_FLAGS) != MULTI_FLAGS) + return -EINVAL; + return 0; +} + +static void remove_direct_functions_hash(struct ftrace_hash *hash, unsigned long addr) +{ + struct ftrace_func_entry *entry, *del; + int size, i; + + size = 1 << hash->size_bits; + for (i = 0; i < size; i++) { + hlist_for_each_entry(entry, &hash->buckets[i], hlist) { + del = __ftrace_lookup_ip(direct_functions, entry->ip); + if (del && del->direct == addr) { + remove_hash_entry(direct_functions, del); + kfree(del); + } + } + } +} + +/** + * register_ftrace_direct_multi - Call a custom trampoline directly + * for multiple functions registered in @ops + * @ops: The address of the struct ftrace_ops object + * @addr: The address of the trampoline to call at @ops functions + * + * This is used to connect a direct calls to @addr from the nop locations + * of the functions registered in @ops (with by ftrace_set_filter_ip + * function). + * + * The location that it calls (@addr) must be able to handle a direct call, + * and save the parameters of the function being traced, and restore them + * (or inject new ones if needed), before returning. + * + * Returns: + * 0 on success + * -EINVAL - The @ops object was already registered with this call or + * when there are no functions in @ops object. + * -EBUSY - Another direct function is already attached (there can be only one) + * -ENODEV - @ip does not point to a ftrace nop location (or not supported) + * -ENOMEM - There was an allocation failure. + */ +int register_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr) +{ + struct ftrace_hash *hash, *free_hash = NULL; + struct ftrace_func_entry *entry, *new; + int err = -EBUSY, size, i; + + if (ops->func || ops->trampoline) + return -EINVAL; + if (!(ops->flags & FTRACE_OPS_FL_INITIALIZED)) + return -EINVAL; + if (ops->flags & FTRACE_OPS_FL_ENABLED) + return -EINVAL; + + hash = ops->func_hash->filter_hash; + if (ftrace_hash_empty(hash)) + return -EINVAL; + + mutex_lock(&direct_mutex); + + /* Make sure requested entries are not already registered.. */ + size = 1 << hash->size_bits; + for (i = 0; i < size; i++) { + hlist_for_each_entry(entry, &hash->buckets[i], hlist) { + if (ftrace_find_rec_direct(entry->ip)) + goto out_unlock; + } + } + + /* ... and insert them to direct_functions hash. */ + err = -ENOMEM; + for (i = 0; i < size; i++) { + hlist_for_each_entry(entry, &hash->buckets[i], hlist) { + new = ftrace_add_rec_direct(entry->ip, addr, &free_hash); + if (!new) + goto out_remove; + entry->direct = addr; + } + } + + ops->func = call_direct_funcs; + ops->flags = MULTI_FLAGS; + ops->trampoline = FTRACE_REGS_ADDR; + + err = register_ftrace_function(ops); + + out_remove: + if (err) + remove_direct_functions_hash(hash, addr); + + out_unlock: + mutex_unlock(&direct_mutex); + + if (free_hash) { + synchronize_rcu_tasks(); + free_ftrace_hash(free_hash); + } + return err; +} +EXPORT_SYMBOL_GPL(register_ftrace_direct_multi); + +/** + * unregister_ftrace_direct_multi - Remove calls to custom trampoline + * previously registered by register_ftrace_direct_multi for @ops object. + * @ops: The address of the struct ftrace_ops object + * + * This is used to remove a direct calls to @addr from the nop locations + * of the functions registered in @ops (with by ftrace_set_filter_ip + * function). + * + * Returns: + * 0 on success + * -EINVAL - The @ops object was not properly registered. + */ +int unregister_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr) +{ + struct ftrace_hash *hash = ops->func_hash->filter_hash; + int err; + + if (check_direct_multi(ops)) + return -EINVAL; + if (!(ops->flags & FTRACE_OPS_FL_ENABLED)) + return -EINVAL; + + mutex_lock(&direct_mutex); + err = unregister_ftrace_function(ops); + remove_direct_functions_hash(hash, addr); + mutex_unlock(&direct_mutex); + return err; +} +EXPORT_SYMBOL_GPL(unregister_ftrace_direct_multi); #endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */ /** -- cgit v1.2.3 From ccf5a89efd6f0a9483cea8acd4a0822b1a47e59a Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 8 Oct 2021 11:13:35 +0200 Subject: ftrace: Add multi direct modify interface Adding interface to modify registered direct function for ftrace_ops. Adding following function: modify_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr) The function changes the currently registered direct function for all attached functions. Link: https://lkml.kernel.org/r/20211008091336.33616-8-jolsa@kernel.org Signed-off-by: Jiri Olsa Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 62 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index a05b25fb77d8..30120342176e 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -5543,6 +5543,68 @@ int unregister_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr) return err; } EXPORT_SYMBOL_GPL(unregister_ftrace_direct_multi); + +/** + * modify_ftrace_direct_multi - Modify an existing direct 'multi' call + * to call something else + * @ops: The address of the struct ftrace_ops object + * @addr: The address of the new trampoline to call at @ops functions + * + * This is used to unregister currently registered direct caller and + * register new one @addr on functions registered in @ops object. + * + * Note there's window between ftrace_shutdown and ftrace_startup calls + * where there will be no callbacks called. + * + * Returns: zero on success. Non zero on error, which includes: + * -EINVAL - The @ops object was not properly registered. + */ +int modify_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr) +{ + struct ftrace_hash *hash = ops->func_hash->filter_hash; + struct ftrace_func_entry *entry, *iter; + int i, size; + int err; + + if (check_direct_multi(ops)) + return -EINVAL; + if (!(ops->flags & FTRACE_OPS_FL_ENABLED)) + return -EINVAL; + + mutex_lock(&direct_mutex); + mutex_lock(&ftrace_lock); + + /* + * Shutdown the ops, change 'direct' pointer for each + * ops entry in direct_functions hash and startup the + * ops back again. + * + * Note there is no callback called for @ops object after + * this ftrace_shutdown call until ftrace_startup is called + * later on. + */ + err = ftrace_shutdown(ops, 0); + if (err) + goto out_unlock; + + size = 1 << hash->size_bits; + for (i = 0; i < size; i++) { + hlist_for_each_entry(iter, &hash->buckets[i], hlist) { + entry = __ftrace_lookup_ip(direct_functions, iter->ip); + if (!entry) + continue; + entry->direct = addr; + } + } + + err = ftrace_startup(ops, 0); + + out_unlock: + mutex_unlock(&ftrace_lock); + mutex_unlock(&direct_mutex); + return err; +} +EXPORT_SYMBOL_GPL(modify_ftrace_direct_multi); #endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */ /** -- cgit v1.2.3 From ed29271894aa92826d308231593b7ee7ac5a4932 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 14 Oct 2021 16:11:14 -0400 Subject: ftrace/direct: Do not disable when switching direct callers Currently to switch a set of "multi" direct trampolines from one trampoline to another, a full shutdown of the current set needs to be done, followed by an update to what trampoline the direct callers would call, and then re-enabling the callers. This leaves a time when the functions will not be calling anything, and events may be missed. Instead, use a trick to allow all the functions with direct trampolines attached will always call either the new or old trampoline while the switch is happening. To do this, first attach a "dummy" callback via ftrace to all the functions that the current direct trampoline is attached to. This will cause the functions to call the "list func" instead of the direct trampoline. The list function will call the direct trampoline "helper" that will set the function it should call as it returns back to the ftrace trampoline. At this moment, the direct caller descriptor can safely update the direct call trampoline. The list function will pick either the new or old function (depending on the memory coherency model of the architecture). Now removing the dummy function from each of the locations of the direct trampoline caller, will put back the direct call, but now to the new trampoline. A better visual is: [ Changing direct call from my_direct_1 to my_direct_2 ] : call my_direct_1 |||||||||||||||||||| vvvvvvvvvvvvvvvvvvvv : call ftrace_caller : [..] call ftrace_ops_list_func ftrace_ops_list_func() { ops->func() -> direct_helper -> set rax to my_direct_1 or my_direct_2 } call rax (to either my_direct_1 or my_direct_2 |||||||||||||||||||| vvvvvvvvvvvvvvvvvvvv : call my_direct_2 Link: https://lore.kernel.org/all/20211014162819.5c85618b@gandalf.local.home/ Acked-by: Jiri Olsa Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 34 ++++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 30120342176e..f90ed00a6d5b 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -5561,8 +5561,12 @@ EXPORT_SYMBOL_GPL(unregister_ftrace_direct_multi); */ int modify_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr) { - struct ftrace_hash *hash = ops->func_hash->filter_hash; + struct ftrace_hash *hash; struct ftrace_func_entry *entry, *iter; + static struct ftrace_ops tmp_ops = { + .func = ftrace_stub, + .flags = FTRACE_OPS_FL_STUB, + }; int i, size; int err; @@ -5572,21 +5576,22 @@ int modify_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr) return -EINVAL; mutex_lock(&direct_mutex); - mutex_lock(&ftrace_lock); + + /* Enable the tmp_ops to have the same functions as the direct ops */ + ftrace_ops_init(&tmp_ops); + tmp_ops.func_hash = ops->func_hash; + + err = register_ftrace_function(&tmp_ops); + if (err) + goto out_direct; /* - * Shutdown the ops, change 'direct' pointer for each - * ops entry in direct_functions hash and startup the - * ops back again. - * - * Note there is no callback called for @ops object after - * this ftrace_shutdown call until ftrace_startup is called - * later on. + * Now the ftrace_ops_list_func() is called to do the direct callers. + * We can safely change the direct functions attached to each entry. */ - err = ftrace_shutdown(ops, 0); - if (err) - goto out_unlock; + mutex_lock(&ftrace_lock); + hash = ops->func_hash->filter_hash; size = 1 << hash->size_bits; for (i = 0; i < size; i++) { hlist_for_each_entry(iter, &hash->buckets[i], hlist) { @@ -5597,10 +5602,11 @@ int modify_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr) } } - err = ftrace_startup(ops, 0); + /* Removing the tmp_ops will add the updated direct callers to the functions */ + unregister_ftrace_function(&tmp_ops); - out_unlock: mutex_unlock(&ftrace_lock); + out_direct: mutex_unlock(&direct_mutex); return err; } -- cgit v1.2.3 From 8720aeecc246837bc6da64c5118dc3177c162e14 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 19 Oct 2021 17:33:13 +0200 Subject: tracing: use %ps format string to print symbols clang started warning about excessive stack usage in hist_trigger_print_key() kernel/trace/trace_events_hist.c:4723:13: error: stack frame size (1336) exceeds limit (1024) in function 'hist_trigger_print_key' [-Werror,-Wframe-larger-than] The problem is that there are two 512-byte arrays on the stack if hist_trigger_stacktrace_print() gets inlined. I don't think this has changed in the past five years, but something probably changed the inlining decisions made by the compiler, so the problem is now made more obvious. Rather than printing the symbol names into separate buffers, it seems we can simply use the special %ps format string modifier to print the pointers symbolically and get rid of both buffers. Marking hist_trigger_stacktrace_print() would be a simpler way of avoiding the warning, but that would not address the excessive stack usage. Link: https://lkml.kernel.org/r/20211019153337.294790-1-arnd@kernel.org Fixes: 69a0200c2e25 ("tracing: Add hist trigger support for stacktraces as keys") Link: https://lore.kernel.org/all/20211015095704.49a99859@gandalf.local.home/ Reviewed-by: Tom Zanussi Tested-by: Tom Zanussi Signed-off-by: Arnd Bergmann Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index a6061a69aa84..b64aed538628 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -4706,7 +4706,6 @@ static void hist_trigger_stacktrace_print(struct seq_file *m, unsigned long *stacktrace_entries, unsigned int max_entries) { - char str[KSYM_SYMBOL_LEN]; unsigned int spaces = 8; unsigned int i; @@ -4715,8 +4714,7 @@ static void hist_trigger_stacktrace_print(struct seq_file *m, return; seq_printf(m, "%*c", 1 + spaces, ' '); - sprint_symbol(str, stacktrace_entries[i]); - seq_printf(m, "%s\n", str); + seq_printf(m, "%pS\n", (void*)stacktrace_entries[i]); } } @@ -4726,7 +4724,6 @@ static void hist_trigger_print_key(struct seq_file *m, struct tracing_map_elt *elt) { struct hist_field *key_field; - char str[KSYM_SYMBOL_LEN]; bool multiline = false; const char *field_name; unsigned int i; @@ -4747,14 +4744,12 @@ static void hist_trigger_print_key(struct seq_file *m, seq_printf(m, "%s: %llx", field_name, uval); } else if (key_field->flags & HIST_FIELD_FL_SYM) { uval = *(u64 *)(key + key_field->offset); - sprint_symbol_no_offset(str, uval); - seq_printf(m, "%s: [%llx] %-45s", field_name, - uval, str); + seq_printf(m, "%s: [%llx] %-45ps", field_name, + uval, (void *)(uintptr_t)uval); } else if (key_field->flags & HIST_FIELD_FL_SYM_OFFSET) { uval = *(u64 *)(key + key_field->offset); - sprint_symbol(str, uval); - seq_printf(m, "%s: [%llx] %-55s", field_name, - uval, str); + seq_printf(m, "%s: [%llx] %-55pS", field_name, + uval, (void *)(uintptr_t)uval); } else if (key_field->flags & HIST_FIELD_FL_EXECNAME) { struct hist_elt_data *elt_data = elt->private_data; char *comm; -- cgit v1.2.3 From e44e81c5b90f698025eadceb7eef8661eda117d5 Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Thu, 21 Oct 2021 09:54:24 +0900 Subject: kprobes: convert tests to kunit This converts the kprobes testcases to use the kunit framework. It adds a dependency on CONFIG_KUNIT, and the output will change to TAP: TAP version 14 1..1 # Subtest: kprobes_test 1..4 random: crng init done ok 1 - test_kprobe ok 2 - test_kprobes ok 3 - test_kretprobe ok 4 - test_kretprobes ok 1 - kprobes_test Note that the kprobes testcases are no longer run immediately after kprobes initialization, but as a late initcall when kunit is initialized. kprobes itself is initialized with an early initcall, so the order is still correct. Signed-off-by: Sven Schnelle Acked-by: Masami Hiramatsu Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/kprobes.c | 3 - kernel/test_kprobes.c | 222 ++++++++++++++------------------------------------ 2 files changed, 59 insertions(+), 166 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index b62af9fc3607..4676627cb066 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -2581,9 +2581,6 @@ static int __init init_kprobes(void) err = register_module_notifier(&kprobe_module_nb); kprobes_initialized = (err == 0); - - if (!err) - init_test_probes(); return err; } early_initcall(init_kprobes); diff --git a/kernel/test_kprobes.c b/kernel/test_kprobes.c index 76c997fdbc9d..e78f18144145 100644 --- a/kernel/test_kprobes.c +++ b/kernel/test_kprobes.c @@ -5,18 +5,17 @@ * Copyright IBM Corp. 2008 */ -#define pr_fmt(fmt) "Kprobe smoke test: " fmt - #include #include #include +#include #define div_factor 3 static u32 rand1, preh_val, posth_val; -static int errors, handler_errors, num_tests; static u32 (*target)(u32 value); static u32 (*target2)(u32 value); +static struct kunit *current_test; static noinline u32 kprobe_target(u32 value) { @@ -25,10 +24,7 @@ static noinline u32 kprobe_target(u32 value) static int kp_pre_handler(struct kprobe *p, struct pt_regs *regs) { - if (preemptible()) { - handler_errors++; - pr_err("pre-handler is preemptible\n"); - } + KUNIT_EXPECT_FALSE(current_test, preemptible()); preh_val = (rand1 / div_factor); return 0; } @@ -36,14 +32,8 @@ static int kp_pre_handler(struct kprobe *p, struct pt_regs *regs) static void kp_post_handler(struct kprobe *p, struct pt_regs *regs, unsigned long flags) { - if (preemptible()) { - handler_errors++; - pr_err("post-handler is preemptible\n"); - } - if (preh_val != (rand1 / div_factor)) { - handler_errors++; - pr_err("incorrect value in post_handler\n"); - } + KUNIT_EXPECT_FALSE(current_test, preemptible()); + KUNIT_EXPECT_EQ(current_test, preh_val, (rand1 / div_factor)); posth_val = preh_val + div_factor; } @@ -53,30 +43,14 @@ static struct kprobe kp = { .post_handler = kp_post_handler }; -static int test_kprobe(void) +static void test_kprobe(struct kunit *test) { - int ret; - - ret = register_kprobe(&kp); - if (ret < 0) { - pr_err("register_kprobe returned %d\n", ret); - return ret; - } - - ret = target(rand1); + current_test = test; + KUNIT_EXPECT_EQ(test, 0, register_kprobe(&kp)); + target(rand1); unregister_kprobe(&kp); - - if (preh_val == 0) { - pr_err("kprobe pre_handler not called\n"); - handler_errors++; - } - - if (posth_val == 0) { - pr_err("kprobe post_handler not called\n"); - handler_errors++; - } - - return 0; + KUNIT_EXPECT_NE(test, 0, preh_val); + KUNIT_EXPECT_NE(test, 0, posth_val); } static noinline u32 kprobe_target2(u32 value) @@ -93,10 +67,7 @@ static int kp_pre_handler2(struct kprobe *p, struct pt_regs *regs) static void kp_post_handler2(struct kprobe *p, struct pt_regs *regs, unsigned long flags) { - if (preh_val != (rand1 / div_factor) + 1) { - handler_errors++; - pr_err("incorrect value in post_handler2\n"); - } + KUNIT_EXPECT_EQ(current_test, preh_val, (rand1 / div_factor) + 1); posth_val = preh_val + div_factor; } @@ -106,51 +77,31 @@ static struct kprobe kp2 = { .post_handler = kp_post_handler2 }; -static int test_kprobes(void) +static void test_kprobes(struct kunit *test) { - int ret; struct kprobe *kps[2] = {&kp, &kp2}; + current_test = test; + /* addr and flags should be cleard for reusing kprobe. */ kp.addr = NULL; kp.flags = 0; - ret = register_kprobes(kps, 2); - if (ret < 0) { - pr_err("register_kprobes returned %d\n", ret); - return ret; - } + KUNIT_EXPECT_EQ(test, 0, register_kprobes(kps, 2)); preh_val = 0; posth_val = 0; - ret = target(rand1); + target(rand1); - if (preh_val == 0) { - pr_err("kprobe pre_handler not called\n"); - handler_errors++; - } - - if (posth_val == 0) { - pr_err("kprobe post_handler not called\n"); - handler_errors++; - } + KUNIT_EXPECT_NE(test, 0, preh_val); + KUNIT_EXPECT_NE(test, 0, posth_val); preh_val = 0; posth_val = 0; - ret = target2(rand1); - - if (preh_val == 0) { - pr_err("kprobe pre_handler2 not called\n"); - handler_errors++; - } - - if (posth_val == 0) { - pr_err("kprobe post_handler2 not called\n"); - handler_errors++; - } + target2(rand1); + KUNIT_EXPECT_NE(test, 0, preh_val); + KUNIT_EXPECT_NE(test, 0, posth_val); unregister_kprobes(kps, 2); - return 0; - } #ifdef CONFIG_KRETPROBES @@ -158,10 +109,7 @@ static u32 krph_val; static int entry_handler(struct kretprobe_instance *ri, struct pt_regs *regs) { - if (preemptible()) { - handler_errors++; - pr_err("kretprobe entry handler is preemptible\n"); - } + KUNIT_EXPECT_FALSE(current_test, preemptible()); krph_val = (rand1 / div_factor); return 0; } @@ -170,19 +118,9 @@ static int return_handler(struct kretprobe_instance *ri, struct pt_regs *regs) { unsigned long ret = regs_return_value(regs); - if (preemptible()) { - handler_errors++; - pr_err("kretprobe return handler is preemptible\n"); - } - if (ret != (rand1 / div_factor)) { - handler_errors++; - pr_err("incorrect value in kretprobe handler\n"); - } - if (krph_val == 0) { - handler_errors++; - pr_err("call to kretprobe entry handler failed\n"); - } - + KUNIT_EXPECT_FALSE(current_test, preemptible()); + KUNIT_EXPECT_EQ(current_test, ret, rand1 / div_factor); + KUNIT_EXPECT_NE(current_test, krph_val, 0); krph_val = rand1; return 0; } @@ -193,39 +131,21 @@ static struct kretprobe rp = { .kp.symbol_name = "kprobe_target" }; -static int test_kretprobe(void) +static void test_kretprobe(struct kunit *test) { - int ret; - - ret = register_kretprobe(&rp); - if (ret < 0) { - pr_err("register_kretprobe returned %d\n", ret); - return ret; - } - - ret = target(rand1); + current_test = test; + KUNIT_EXPECT_EQ(test, 0, register_kretprobe(&rp)); + target(rand1); unregister_kretprobe(&rp); - if (krph_val != rand1) { - pr_err("kretprobe handler not called\n"); - handler_errors++; - } - - return 0; + KUNIT_EXPECT_EQ(test, krph_val, rand1); } static int return_handler2(struct kretprobe_instance *ri, struct pt_regs *regs) { unsigned long ret = regs_return_value(regs); - if (ret != (rand1 / div_factor) + 1) { - handler_errors++; - pr_err("incorrect value in kretprobe handler2\n"); - } - if (krph_val == 0) { - handler_errors++; - pr_err("call to kretprobe entry handler failed\n"); - } - + KUNIT_EXPECT_EQ(current_test, ret, (rand1 / div_factor) + 1); + KUNIT_EXPECT_NE(current_test, krph_val, 0); krph_val = rand1; return 0; } @@ -236,78 +156,54 @@ static struct kretprobe rp2 = { .kp.symbol_name = "kprobe_target2" }; -static int test_kretprobes(void) +static void test_kretprobes(struct kunit *test) { - int ret; struct kretprobe *rps[2] = {&rp, &rp2}; + current_test = test; /* addr and flags should be cleard for reusing kprobe. */ rp.kp.addr = NULL; rp.kp.flags = 0; - ret = register_kretprobes(rps, 2); - if (ret < 0) { - pr_err("register_kretprobe returned %d\n", ret); - return ret; - } + KUNIT_EXPECT_EQ(test, 0, register_kretprobes(rps, 2)); krph_val = 0; - ret = target(rand1); - if (krph_val != rand1) { - pr_err("kretprobe handler not called\n"); - handler_errors++; - } + target(rand1); + KUNIT_EXPECT_EQ(test, krph_val, rand1); krph_val = 0; - ret = target2(rand1); - if (krph_val != rand1) { - pr_err("kretprobe handler2 not called\n"); - handler_errors++; - } + target2(rand1); + KUNIT_EXPECT_EQ(test, krph_val, rand1); unregister_kretprobes(rps, 2); - return 0; } #endif /* CONFIG_KRETPROBES */ -int init_test_probes(void) +static int kprobes_test_init(struct kunit *test) { - int ret; - target = kprobe_target; target2 = kprobe_target2; do { rand1 = prandom_u32(); } while (rand1 <= div_factor); + return 0; +} - pr_info("started\n"); - num_tests++; - ret = test_kprobe(); - if (ret < 0) - errors++; - - num_tests++; - ret = test_kprobes(); - if (ret < 0) - errors++; - +static struct kunit_case kprobes_testcases[] = { + KUNIT_CASE(test_kprobe), + KUNIT_CASE(test_kprobes), #ifdef CONFIG_KRETPROBES - num_tests++; - ret = test_kretprobe(); - if (ret < 0) - errors++; - - num_tests++; - ret = test_kretprobes(); - if (ret < 0) - errors++; -#endif /* CONFIG_KRETPROBES */ + KUNIT_CASE(test_kretprobe), + KUNIT_CASE(test_kretprobes), +#endif + {} +}; - if (errors) - pr_err("BUG: %d out of %d tests failed\n", errors, num_tests); - else if (handler_errors) - pr_err("BUG: %d error(s) running handlers\n", handler_errors); - else - pr_info("passed successfully\n"); +static struct kunit_suite kprobes_test_suite = { + .name = "kprobes_test", + .init = kprobes_test_init, + .test_cases = kprobes_testcases, +}; - return 0; -} +kunit_test_suites(&kprobes_test_suite); + +MODULE_LICENSE("GPL"); -- cgit v1.2.3 From 172f7ba9772cae12f099fc563352e905dc9a1921 Mon Sep 17 00:00:00 2001 From: chongjiapeng Date: Tue, 19 Oct 2021 18:48:54 +0800 Subject: ftrace: Make ftrace_profile_pages_init static This symbol is not used outside of ftrace.c, so marks it static. Fixes the following sparse warning: kernel/trace/ftrace.c:579:5: warning: symbol 'ftrace_profile_pages_init' was not declared. Should it be static? Link: https://lkml.kernel.org/r/1634640534-18280-1-git-send-email-jiapeng.chong@linux.alibaba.com Reported-by: Abaci Robot Fixes: cafb168a1c92 ("tracing: make the function profiler per cpu") Signed-off-by: chongjiapeng Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index f90ed00a6d5b..2057ad363772 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -576,7 +576,7 @@ static void ftrace_profile_reset(struct ftrace_profile_stat *stat) FTRACE_PROFILE_HASH_SIZE * sizeof(struct hlist_head)); } -int ftrace_profile_pages_init(struct ftrace_profile_stat *stat) +static int ftrace_profile_pages_init(struct ftrace_profile_stat *stat) { struct ftrace_profile_page *pg; int functions; -- cgit v1.2.3 From 9e20028b529dfc26666b3f527d34ea4d36f60f66 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Wed, 6 Oct 2021 14:07:32 -0700 Subject: perf/core: allow ftrace for functions in kernel/event/core.c It is useful to trace functions in kernel/event/core.c. Allow ftrace for them by removing $(CC_FLAGS_FTRACE) from Makefile. Link: https://lkml.kernel.org/r/20211006210732.2826289-1-songliubraving@fb.com Cc: Peter Zijlstra Cc: Andrii Nakryiko Cc: KP Singh Signed-off-by: Song Liu Signed-off-by: Steven Rostedt (VMware) --- kernel/events/Makefile | 5 ----- 1 file changed, 5 deletions(-) (limited to 'kernel') diff --git a/kernel/events/Makefile b/kernel/events/Makefile index 3c022e33c109..8591c180b52b 100644 --- a/kernel/events/Makefile +++ b/kernel/events/Makefile @@ -1,10 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 -ifdef CONFIG_FUNCTION_TRACER -CFLAGS_REMOVE_core.o = $(CC_FLAGS_FTRACE) -endif - obj-y := core.o ring_buffer.o callchain.o obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o obj-$(CONFIG_UPROBES) += uprobes.o - -- cgit v1.2.3 From e0f3b18be733ac4a3b6deb2ff586bc1936ad0368 Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Fri, 15 Oct 2021 17:07:50 +0200 Subject: trace/osnoise: Add migrate-disabled field to the osnoise header Since "54357f0c9149 tracing: Add migrate-disabled counter to tracing output," the migrate disabled field is also printed in the !PREEMPR_RT kernel config. While this information was added to the vast majority of tracers, osnoise and timerlat were not updated (because they are new tracers). Fix osnoise header by adding the information about migrate disabled. Link: https://lkml.kernel.org/r/9cb3d54e29e0588dbba12e81486bd8a09adcd8ca.1634308385.git.bristot@kernel.org Cc: Daniel Bristot de Oliveira Cc: Jonathan Corbet Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: "H. Peter Anvin" Cc: Sebastian Andrzej Siewior Cc: x86@kernel.org Cc: linux-doc@vger.kernel.org Cc: linux-kernel@vger.kernel.org Fixes: 54357f0c9149 ("tracing: Add migrate-disabled counter to tracing output.") Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_osnoise.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index c4f14fb98aaa..34f26c632442 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -294,19 +294,19 @@ static void print_osnoise_headers(struct seq_file *s) seq_puts(s, "# _-----=> irqs-off\n"); seq_puts(s, "# / _----=> need-resched\n"); seq_puts(s, "# | / _---=> hardirq/softirq\n"); - seq_puts(s, "# || / _--=> preempt-depth "); - seq_puts(s, " MAX\n"); - - seq_puts(s, "# || / "); + seq_puts(s, "# || / _--=> preempt-depth\n"); + seq_puts(s, "# ||| / _-=> migrate-disable "); + seq_puts(s, " MAX\n"); + seq_puts(s, "# |||| / delay "); seq_puts(s, " SINGLE Interference counters:\n"); - seq_puts(s, "# |||| RUNTIME "); + seq_puts(s, "# ||||| RUNTIME "); seq_puts(s, " NOISE %% OF CPU NOISE +-----------------------------+\n"); - seq_puts(s, "# TASK-PID CPU# |||| TIMESTAMP IN US "); + seq_puts(s, "# TASK-PID CPU# ||||| TIMESTAMP IN US "); seq_puts(s, " IN US AVAILABLE IN US HW NMI IRQ SIRQ THREAD\n"); - seq_puts(s, "# | | | |||| | | "); + seq_puts(s, "# | | | ||||| | | "); seq_puts(s, " | | | | | | | |\n"); } #endif /* CONFIG_PREEMPT_RT */ -- cgit v1.2.3 From aeafcb82d99c97ff5c6054a4091eeb12aefca9ab Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Fri, 15 Oct 2021 17:07:51 +0200 Subject: trace/timerlat: Add migrate-disabled field to the timerlat header Since "54357f0c9149 tracing: Add migrate-disabled counter to tracing output," the migrate disabled field is also printed in the !PREEMPR_RT kernel config. While this information was added to the vast majority of tracers, osnoise and timerlat were not updated (because they are new tracers). Fix timerlat header by adding the information about migrate disabled. Link: https://lkml.kernel.org/r/bc0c234ab49946cdd63effa6584e1d5e8662cb44.1634308385.git.bristot@kernel.org Cc: Daniel Bristot de Oliveira Cc: Jonathan Corbet Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: "H. Peter Anvin" Cc: Sebastian Andrzej Siewior Cc: x86@kernel.org Cc: linux-doc@vger.kernel.org Cc: linux-kernel@vger.kernel.org Fixes: 54357f0c9149 ("tracing: Add migrate-disabled counter to tracing output.") Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_osnoise.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index 34f26c632442..d11b41784fac 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -378,11 +378,12 @@ static void print_timerlat_headers(struct seq_file *s) seq_puts(s, "# / _----=> need-resched\n"); seq_puts(s, "# | / _---=> hardirq/softirq\n"); seq_puts(s, "# || / _--=> preempt-depth\n"); - seq_puts(s, "# || /\n"); - seq_puts(s, "# |||| ACTIVATION\n"); - seq_puts(s, "# TASK-PID CPU# |||| TIMESTAMP ID "); - seq_puts(s, " CONTEXT LATENCY\n"); - seq_puts(s, "# | | | |||| | | "); + seq_puts(s, "# ||| / _-=> migrate-disable\n"); + seq_puts(s, "# |||| / delay\n"); + seq_puts(s, "# ||||| ACTIVATION\n"); + seq_puts(s, "# TASK-PID CPU# ||||| TIMESTAMP ID "); + seq_puts(s, " CONTEXT LATENCY\n"); + seq_puts(s, "# | | | ||||| | | "); seq_puts(s, " | |\n"); } #endif /* CONFIG_PREEMPT_RT */ -- cgit v1.2.3 From 3c20bd3af535d64771b193bb4dd41ed662c464ce Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Fri, 15 Oct 2021 15:55:50 -0400 Subject: tracing: Fix missing trace_boot_init_histograms kstrdup NULL checks trace_boot_init_histograms misses NULL pointer checks for kstrdup failure. Link: https://lkml.kernel.org/r/20211015195550.22742-1-mathieu.desnoyers@efficios.com Fixes: 64dc7f6958ef5 ("tracing/boot: Show correct histogram error command") Acked-by: Masami Hiramatsu Signed-off-by: Mathieu Desnoyers Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_boot.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index 8d252f63cd78..0580287d7a0d 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -430,6 +430,8 @@ trace_boot_init_histograms(struct trace_event_file *file, /* All digit started node should be instances. */ if (trace_boot_compose_hist_cmd(node, buf, size) == 0) { tmp = kstrdup(buf, GFP_KERNEL); + if (!tmp) + return; if (trigger_process_regex(file, buf) < 0) pr_err("Failed to apply hist trigger: %s\n", tmp); kfree(tmp); @@ -439,6 +441,8 @@ trace_boot_init_histograms(struct trace_event_file *file, if (xbc_node_find_subkey(hnode, "keys")) { if (trace_boot_compose_hist_cmd(hnode, buf, size) == 0) { tmp = kstrdup(buf, GFP_KERNEL); + if (!tmp) + return; if (trigger_process_regex(file, buf) < 0) pr_err("Failed to apply hist trigger: %s\n", tmp); kfree(tmp); -- cgit v1.2.3 From 1d6288914264a22c0efdfb3a5748c101c0d12baa Mon Sep 17 00:00:00 2001 From: Wang ShaoBo Date: Thu, 21 Oct 2021 11:52:25 +0800 Subject: tracing/hwlat: Make some internal symbols static The sparse tool complains as follows: kernel/trace/trace_hwlat.c:82:27: warning: symbol 'hwlat_single_cpu_data' was not declared. Should it be static? kernel/trace/trace_hwlat.c:83:1: warning: symbol '__pcpu_scope_hwlat_per_cpu_data' was not declared. Should it be static? This symbol is not used outside of trace_hwlat.c, so this commit marks it static. Link: https://lkml.kernel.org/r/20211021035225.1050685-1-bobo.shaobowang@huawei.com Signed-off-by: Wang ShaoBo Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_hwlat.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c index d0a730d99a33..56bb7b890578 100644 --- a/kernel/trace/trace_hwlat.c +++ b/kernel/trace/trace_hwlat.c @@ -79,8 +79,8 @@ struct hwlat_kthread_data { int nmi_cpu; }; -struct hwlat_kthread_data hwlat_single_cpu_data; -DEFINE_PER_CPU(struct hwlat_kthread_data, hwlat_per_cpu_data); +static struct hwlat_kthread_data hwlat_single_cpu_data; +static DEFINE_PER_CPU(struct hwlat_kthread_data, hwlat_per_cpu_data); /* Tells NMIs to call back to the hwlat tracer to record timestamps */ bool trace_hwlat_callback_enabled; -- cgit v1.2.3 From 1f6d3a8f5e397f5d31afbc58d84e1dc68318b874 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 25 Oct 2021 20:41:52 +0900 Subject: kprobes: Add a test case for stacktrace from kretprobe handler Add a test case for stacktrace from kretprobe handler and nested kretprobe handlers. This test checks both of stack trace inside kretprobe handler and stack trace from pt_regs. Those stack trace must include actual function return address instead of kretprobe trampoline. The nested kretprobe stacktrace test checks whether the unwinder can correctly unwind the call frame on the stack which has been modified by the kretprobe. Since the stacktrace on kretprobe is correctly fixed only on x86, this introduces a meta kconfig ARCH_CORRECT_STACKTRACE_ON_KRETPROBE which tells user that the stacktrace on kretprobe is correct or not. The test results will be shown like below; TAP version 14 1..1 # Subtest: kprobes_test 1..6 ok 1 - test_kprobe ok 2 - test_kprobes ok 3 - test_kretprobe ok 4 - test_kretprobes ok 5 - test_stacktrace_on_kretprobe ok 6 - test_stacktrace_on_nested_kretprobe # kprobes_test: pass:6 fail:0 skip:0 total:6 # Totals: pass:6 fail:0 skip:0 total:6 ok 1 - kprobes_test Link: https://lkml.kernel.org/r/163516211244.604541.18350507860972214415.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/test_kprobes.c | 162 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 162 insertions(+) (limited to 'kernel') diff --git a/kernel/test_kprobes.c b/kernel/test_kprobes.c index e78f18144145..a5edc2ebc947 100644 --- a/kernel/test_kprobes.c +++ b/kernel/test_kprobes.c @@ -17,6 +17,11 @@ static u32 (*target)(u32 value); static u32 (*target2)(u32 value); static struct kunit *current_test; +static unsigned long (*internal_target)(void); +static unsigned long (*stacktrace_target)(void); +static unsigned long (*stacktrace_driver)(void); +static unsigned long target_return_address[2]; + static noinline u32 kprobe_target(u32 value) { return (value / div_factor); @@ -58,6 +63,33 @@ static noinline u32 kprobe_target2(u32 value) return (value / div_factor) + 1; } +static noinline unsigned long kprobe_stacktrace_internal_target(void) +{ + if (!target_return_address[0]) + target_return_address[0] = (unsigned long)__builtin_return_address(0); + return target_return_address[0]; +} + +static noinline unsigned long kprobe_stacktrace_target(void) +{ + if (!target_return_address[1]) + target_return_address[1] = (unsigned long)__builtin_return_address(0); + + if (internal_target) + internal_target(); + + return target_return_address[1]; +} + +static noinline unsigned long kprobe_stacktrace_driver(void) +{ + if (stacktrace_target) + stacktrace_target(); + + /* This is for preventing inlining the function */ + return (unsigned long)__builtin_return_address(0); +} + static int kp_pre_handler2(struct kprobe *p, struct pt_regs *regs) { preh_val = (rand1 / div_factor) + 1; @@ -175,12 +207,138 @@ static void test_kretprobes(struct kunit *test) KUNIT_EXPECT_EQ(test, krph_val, rand1); unregister_kretprobes(rps, 2); } + +#ifdef CONFIG_ARCH_CORRECT_STACKTRACE_ON_KRETPROBE +#define STACK_BUF_SIZE 16 +static unsigned long stack_buf[STACK_BUF_SIZE]; + +static int stacktrace_return_handler(struct kretprobe_instance *ri, struct pt_regs *regs) +{ + unsigned long retval = regs_return_value(regs); + int i, ret; + + KUNIT_EXPECT_FALSE(current_test, preemptible()); + KUNIT_EXPECT_EQ(current_test, retval, target_return_address[1]); + + /* + * Test stacktrace inside the kretprobe handler, this will involves + * kretprobe trampoline, but must include correct return address + * of the target function. + */ + ret = stack_trace_save(stack_buf, STACK_BUF_SIZE, 0); + KUNIT_EXPECT_NE(current_test, ret, 0); + + for (i = 0; i < ret; i++) { + if (stack_buf[i] == target_return_address[1]) + break; + } + KUNIT_EXPECT_NE(current_test, i, ret); + +#if !IS_MODULE(CONFIG_KPROBES_SANITY_TEST) + /* + * Test stacktrace from pt_regs at the return address. Thus the stack + * trace must start from the target return address. + */ + ret = stack_trace_save_regs(regs, stack_buf, STACK_BUF_SIZE, 0); + KUNIT_EXPECT_NE(current_test, ret, 0); + KUNIT_EXPECT_EQ(current_test, stack_buf[0], target_return_address[1]); +#endif + + return 0; +} + +static struct kretprobe rp3 = { + .handler = stacktrace_return_handler, + .kp.symbol_name = "kprobe_stacktrace_target" +}; + +static void test_stacktrace_on_kretprobe(struct kunit *test) +{ + unsigned long myretaddr = (unsigned long)__builtin_return_address(0); + + current_test = test; + rp3.kp.addr = NULL; + rp3.kp.flags = 0; + + /* + * Run the stacktrace_driver() to record correct return address in + * stacktrace_target() and ensure stacktrace_driver() call is not + * inlined by checking the return address of stacktrace_driver() + * and the return address of this function is different. + */ + KUNIT_ASSERT_NE(test, myretaddr, stacktrace_driver()); + + KUNIT_ASSERT_EQ(test, 0, register_kretprobe(&rp3)); + KUNIT_ASSERT_NE(test, myretaddr, stacktrace_driver()); + unregister_kretprobe(&rp3); +} + +static int stacktrace_internal_return_handler(struct kretprobe_instance *ri, struct pt_regs *regs) +{ + unsigned long retval = regs_return_value(regs); + int i, ret; + + KUNIT_EXPECT_FALSE(current_test, preemptible()); + KUNIT_EXPECT_EQ(current_test, retval, target_return_address[0]); + + /* + * Test stacktrace inside the kretprobe handler for nested case. + * The unwinder will find the kretprobe_trampoline address on the + * return address, and kretprobe must solve that. + */ + ret = stack_trace_save(stack_buf, STACK_BUF_SIZE, 0); + KUNIT_EXPECT_NE(current_test, ret, 0); + + for (i = 0; i < ret - 1; i++) { + if (stack_buf[i] == target_return_address[0]) { + KUNIT_EXPECT_EQ(current_test, stack_buf[i + 1], target_return_address[1]); + break; + } + } + KUNIT_EXPECT_NE(current_test, i, ret); + +#if !IS_MODULE(CONFIG_KPROBES_SANITY_TEST) + /* Ditto for the regs version. */ + ret = stack_trace_save_regs(regs, stack_buf, STACK_BUF_SIZE, 0); + KUNIT_EXPECT_NE(current_test, ret, 0); + KUNIT_EXPECT_EQ(current_test, stack_buf[0], target_return_address[0]); + KUNIT_EXPECT_EQ(current_test, stack_buf[1], target_return_address[1]); +#endif + + return 0; +} + +static struct kretprobe rp4 = { + .handler = stacktrace_internal_return_handler, + .kp.symbol_name = "kprobe_stacktrace_internal_target" +}; + +static void test_stacktrace_on_nested_kretprobe(struct kunit *test) +{ + unsigned long myretaddr = (unsigned long)__builtin_return_address(0); + struct kretprobe *rps[2] = {&rp3, &rp4}; + + current_test = test; + rp3.kp.addr = NULL; + rp3.kp.flags = 0; + + //KUNIT_ASSERT_NE(test, myretaddr, stacktrace_driver()); + + KUNIT_ASSERT_EQ(test, 0, register_kretprobes(rps, 2)); + KUNIT_ASSERT_NE(test, myretaddr, stacktrace_driver()); + unregister_kretprobes(rps, 2); +} +#endif /* CONFIG_ARCH_CORRECT_STACKTRACE_ON_KRETPROBE */ + #endif /* CONFIG_KRETPROBES */ static int kprobes_test_init(struct kunit *test) { target = kprobe_target; target2 = kprobe_target2; + stacktrace_target = kprobe_stacktrace_target; + internal_target = kprobe_stacktrace_internal_target; + stacktrace_driver = kprobe_stacktrace_driver; do { rand1 = prandom_u32(); @@ -194,6 +352,10 @@ static struct kunit_case kprobes_testcases[] = { #ifdef CONFIG_KRETPROBES KUNIT_CASE(test_kretprobe), KUNIT_CASE(test_kretprobes), +#ifdef CONFIG_ARCH_CORRECT_STACKTRACE_ON_KRETPROBE + KUNIT_CASE(test_stacktrace_on_kretprobe), + KUNIT_CASE(test_stacktrace_on_nested_kretprobe), +#endif #endif {} }; -- cgit v1.2.3 From b9e94a7bb6fad880ba1ec0d58897480c62f587cf Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Tue, 26 Oct 2021 09:51:30 +0800 Subject: test_kprobes: Move it from kernel/ to lib/ Since config KPROBES_SANITY_TEST is in lib/Kconfig.debug, it is better to let test_kprobes.c in lib/, just like other similar tests found in lib/. Link: https://lkml.kernel.org/r/1635213091-24387-4-git-send-email-yangtiezhu@loongson.cn Signed-off-by: Tiezhu Yang Acked-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/Makefile | 1 - kernel/test_kprobes.c | 371 -------------------------------------------------- 2 files changed, 372 deletions(-) delete mode 100644 kernel/test_kprobes.c (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 4df609be42d0..9e4d33dce8a5 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -85,7 +85,6 @@ obj-$(CONFIG_PID_NS) += pid_namespace.o obj-$(CONFIG_IKCONFIG) += configs.o obj-$(CONFIG_IKHEADERS) += kheaders.o obj-$(CONFIG_SMP) += stop_machine.o -obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o obj-$(CONFIG_AUDIT) += audit.o auditfilter.o obj-$(CONFIG_AUDITSYSCALL) += auditsc.o audit_watch.o audit_fsnotify.o audit_tree.o obj-$(CONFIG_GCOV_KERNEL) += gcov/ diff --git a/kernel/test_kprobes.c b/kernel/test_kprobes.c deleted file mode 100644 index a5edc2ebc947..000000000000 --- a/kernel/test_kprobes.c +++ /dev/null @@ -1,371 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * test_kprobes.c - simple sanity test for *probes - * - * Copyright IBM Corp. 2008 - */ - -#include -#include -#include -#include - -#define div_factor 3 - -static u32 rand1, preh_val, posth_val; -static u32 (*target)(u32 value); -static u32 (*target2)(u32 value); -static struct kunit *current_test; - -static unsigned long (*internal_target)(void); -static unsigned long (*stacktrace_target)(void); -static unsigned long (*stacktrace_driver)(void); -static unsigned long target_return_address[2]; - -static noinline u32 kprobe_target(u32 value) -{ - return (value / div_factor); -} - -static int kp_pre_handler(struct kprobe *p, struct pt_regs *regs) -{ - KUNIT_EXPECT_FALSE(current_test, preemptible()); - preh_val = (rand1 / div_factor); - return 0; -} - -static void kp_post_handler(struct kprobe *p, struct pt_regs *regs, - unsigned long flags) -{ - KUNIT_EXPECT_FALSE(current_test, preemptible()); - KUNIT_EXPECT_EQ(current_test, preh_val, (rand1 / div_factor)); - posth_val = preh_val + div_factor; -} - -static struct kprobe kp = { - .symbol_name = "kprobe_target", - .pre_handler = kp_pre_handler, - .post_handler = kp_post_handler -}; - -static void test_kprobe(struct kunit *test) -{ - current_test = test; - KUNIT_EXPECT_EQ(test, 0, register_kprobe(&kp)); - target(rand1); - unregister_kprobe(&kp); - KUNIT_EXPECT_NE(test, 0, preh_val); - KUNIT_EXPECT_NE(test, 0, posth_val); -} - -static noinline u32 kprobe_target2(u32 value) -{ - return (value / div_factor) + 1; -} - -static noinline unsigned long kprobe_stacktrace_internal_target(void) -{ - if (!target_return_address[0]) - target_return_address[0] = (unsigned long)__builtin_return_address(0); - return target_return_address[0]; -} - -static noinline unsigned long kprobe_stacktrace_target(void) -{ - if (!target_return_address[1]) - target_return_address[1] = (unsigned long)__builtin_return_address(0); - - if (internal_target) - internal_target(); - - return target_return_address[1]; -} - -static noinline unsigned long kprobe_stacktrace_driver(void) -{ - if (stacktrace_target) - stacktrace_target(); - - /* This is for preventing inlining the function */ - return (unsigned long)__builtin_return_address(0); -} - -static int kp_pre_handler2(struct kprobe *p, struct pt_regs *regs) -{ - preh_val = (rand1 / div_factor) + 1; - return 0; -} - -static void kp_post_handler2(struct kprobe *p, struct pt_regs *regs, - unsigned long flags) -{ - KUNIT_EXPECT_EQ(current_test, preh_val, (rand1 / div_factor) + 1); - posth_val = preh_val + div_factor; -} - -static struct kprobe kp2 = { - .symbol_name = "kprobe_target2", - .pre_handler = kp_pre_handler2, - .post_handler = kp_post_handler2 -}; - -static void test_kprobes(struct kunit *test) -{ - struct kprobe *kps[2] = {&kp, &kp2}; - - current_test = test; - - /* addr and flags should be cleard for reusing kprobe. */ - kp.addr = NULL; - kp.flags = 0; - - KUNIT_EXPECT_EQ(test, 0, register_kprobes(kps, 2)); - preh_val = 0; - posth_val = 0; - target(rand1); - - KUNIT_EXPECT_NE(test, 0, preh_val); - KUNIT_EXPECT_NE(test, 0, posth_val); - - preh_val = 0; - posth_val = 0; - target2(rand1); - - KUNIT_EXPECT_NE(test, 0, preh_val); - KUNIT_EXPECT_NE(test, 0, posth_val); - unregister_kprobes(kps, 2); -} - -#ifdef CONFIG_KRETPROBES -static u32 krph_val; - -static int entry_handler(struct kretprobe_instance *ri, struct pt_regs *regs) -{ - KUNIT_EXPECT_FALSE(current_test, preemptible()); - krph_val = (rand1 / div_factor); - return 0; -} - -static int return_handler(struct kretprobe_instance *ri, struct pt_regs *regs) -{ - unsigned long ret = regs_return_value(regs); - - KUNIT_EXPECT_FALSE(current_test, preemptible()); - KUNIT_EXPECT_EQ(current_test, ret, rand1 / div_factor); - KUNIT_EXPECT_NE(current_test, krph_val, 0); - krph_val = rand1; - return 0; -} - -static struct kretprobe rp = { - .handler = return_handler, - .entry_handler = entry_handler, - .kp.symbol_name = "kprobe_target" -}; - -static void test_kretprobe(struct kunit *test) -{ - current_test = test; - KUNIT_EXPECT_EQ(test, 0, register_kretprobe(&rp)); - target(rand1); - unregister_kretprobe(&rp); - KUNIT_EXPECT_EQ(test, krph_val, rand1); -} - -static int return_handler2(struct kretprobe_instance *ri, struct pt_regs *regs) -{ - unsigned long ret = regs_return_value(regs); - - KUNIT_EXPECT_EQ(current_test, ret, (rand1 / div_factor) + 1); - KUNIT_EXPECT_NE(current_test, krph_val, 0); - krph_val = rand1; - return 0; -} - -static struct kretprobe rp2 = { - .handler = return_handler2, - .entry_handler = entry_handler, - .kp.symbol_name = "kprobe_target2" -}; - -static void test_kretprobes(struct kunit *test) -{ - struct kretprobe *rps[2] = {&rp, &rp2}; - - current_test = test; - /* addr and flags should be cleard for reusing kprobe. */ - rp.kp.addr = NULL; - rp.kp.flags = 0; - KUNIT_EXPECT_EQ(test, 0, register_kretprobes(rps, 2)); - - krph_val = 0; - target(rand1); - KUNIT_EXPECT_EQ(test, krph_val, rand1); - - krph_val = 0; - target2(rand1); - KUNIT_EXPECT_EQ(test, krph_val, rand1); - unregister_kretprobes(rps, 2); -} - -#ifdef CONFIG_ARCH_CORRECT_STACKTRACE_ON_KRETPROBE -#define STACK_BUF_SIZE 16 -static unsigned long stack_buf[STACK_BUF_SIZE]; - -static int stacktrace_return_handler(struct kretprobe_instance *ri, struct pt_regs *regs) -{ - unsigned long retval = regs_return_value(regs); - int i, ret; - - KUNIT_EXPECT_FALSE(current_test, preemptible()); - KUNIT_EXPECT_EQ(current_test, retval, target_return_address[1]); - - /* - * Test stacktrace inside the kretprobe handler, this will involves - * kretprobe trampoline, but must include correct return address - * of the target function. - */ - ret = stack_trace_save(stack_buf, STACK_BUF_SIZE, 0); - KUNIT_EXPECT_NE(current_test, ret, 0); - - for (i = 0; i < ret; i++) { - if (stack_buf[i] == target_return_address[1]) - break; - } - KUNIT_EXPECT_NE(current_test, i, ret); - -#if !IS_MODULE(CONFIG_KPROBES_SANITY_TEST) - /* - * Test stacktrace from pt_regs at the return address. Thus the stack - * trace must start from the target return address. - */ - ret = stack_trace_save_regs(regs, stack_buf, STACK_BUF_SIZE, 0); - KUNIT_EXPECT_NE(current_test, ret, 0); - KUNIT_EXPECT_EQ(current_test, stack_buf[0], target_return_address[1]); -#endif - - return 0; -} - -static struct kretprobe rp3 = { - .handler = stacktrace_return_handler, - .kp.symbol_name = "kprobe_stacktrace_target" -}; - -static void test_stacktrace_on_kretprobe(struct kunit *test) -{ - unsigned long myretaddr = (unsigned long)__builtin_return_address(0); - - current_test = test; - rp3.kp.addr = NULL; - rp3.kp.flags = 0; - - /* - * Run the stacktrace_driver() to record correct return address in - * stacktrace_target() and ensure stacktrace_driver() call is not - * inlined by checking the return address of stacktrace_driver() - * and the return address of this function is different. - */ - KUNIT_ASSERT_NE(test, myretaddr, stacktrace_driver()); - - KUNIT_ASSERT_EQ(test, 0, register_kretprobe(&rp3)); - KUNIT_ASSERT_NE(test, myretaddr, stacktrace_driver()); - unregister_kretprobe(&rp3); -} - -static int stacktrace_internal_return_handler(struct kretprobe_instance *ri, struct pt_regs *regs) -{ - unsigned long retval = regs_return_value(regs); - int i, ret; - - KUNIT_EXPECT_FALSE(current_test, preemptible()); - KUNIT_EXPECT_EQ(current_test, retval, target_return_address[0]); - - /* - * Test stacktrace inside the kretprobe handler for nested case. - * The unwinder will find the kretprobe_trampoline address on the - * return address, and kretprobe must solve that. - */ - ret = stack_trace_save(stack_buf, STACK_BUF_SIZE, 0); - KUNIT_EXPECT_NE(current_test, ret, 0); - - for (i = 0; i < ret - 1; i++) { - if (stack_buf[i] == target_return_address[0]) { - KUNIT_EXPECT_EQ(current_test, stack_buf[i + 1], target_return_address[1]); - break; - } - } - KUNIT_EXPECT_NE(current_test, i, ret); - -#if !IS_MODULE(CONFIG_KPROBES_SANITY_TEST) - /* Ditto for the regs version. */ - ret = stack_trace_save_regs(regs, stack_buf, STACK_BUF_SIZE, 0); - KUNIT_EXPECT_NE(current_test, ret, 0); - KUNIT_EXPECT_EQ(current_test, stack_buf[0], target_return_address[0]); - KUNIT_EXPECT_EQ(current_test, stack_buf[1], target_return_address[1]); -#endif - - return 0; -} - -static struct kretprobe rp4 = { - .handler = stacktrace_internal_return_handler, - .kp.symbol_name = "kprobe_stacktrace_internal_target" -}; - -static void test_stacktrace_on_nested_kretprobe(struct kunit *test) -{ - unsigned long myretaddr = (unsigned long)__builtin_return_address(0); - struct kretprobe *rps[2] = {&rp3, &rp4}; - - current_test = test; - rp3.kp.addr = NULL; - rp3.kp.flags = 0; - - //KUNIT_ASSERT_NE(test, myretaddr, stacktrace_driver()); - - KUNIT_ASSERT_EQ(test, 0, register_kretprobes(rps, 2)); - KUNIT_ASSERT_NE(test, myretaddr, stacktrace_driver()); - unregister_kretprobes(rps, 2); -} -#endif /* CONFIG_ARCH_CORRECT_STACKTRACE_ON_KRETPROBE */ - -#endif /* CONFIG_KRETPROBES */ - -static int kprobes_test_init(struct kunit *test) -{ - target = kprobe_target; - target2 = kprobe_target2; - stacktrace_target = kprobe_stacktrace_target; - internal_target = kprobe_stacktrace_internal_target; - stacktrace_driver = kprobe_stacktrace_driver; - - do { - rand1 = prandom_u32(); - } while (rand1 <= div_factor); - return 0; -} - -static struct kunit_case kprobes_testcases[] = { - KUNIT_CASE(test_kprobe), - KUNIT_CASE(test_kprobes), -#ifdef CONFIG_KRETPROBES - KUNIT_CASE(test_kretprobe), - KUNIT_CASE(test_kretprobes), -#ifdef CONFIG_ARCH_CORRECT_STACKTRACE_ON_KRETPROBE - KUNIT_CASE(test_stacktrace_on_kretprobe), - KUNIT_CASE(test_stacktrace_on_nested_kretprobe), -#endif -#endif - {} -}; - -static struct kunit_suite kprobes_test_suite = { - .name = "kprobes_test", - .init = kprobes_test_init, - .test_cases = kprobes_testcases, -}; - -kunit_test_suites(&kprobes_test_suite); - -MODULE_LICENSE("GPL"); -- cgit v1.2.3 From 52cfb373536a7fb744b0ec4b748518e5dc874fb7 Mon Sep 17 00:00:00 2001 From: Kalesh Singh Date: Mon, 25 Oct 2021 13:08:33 -0700 Subject: tracing: Add support for creating hist trigger variables from literal Currently hist trigger expressions don't support the use of numeric literals: e.g. echo 'hist:keys=common_pid:x=$y-1234' --> is not valid expression syntax Having the ability to use numeric constants in hist triggers supports a wider range of expressions for creating variables. Add support for creating trace event histogram variables from numeric literals. e.g. echo 'hist:keys=common_pid:x=1234,y=size-1024' >> event/trigger A negative numeric constant is created, using unary minus operator (parentheses are required). e.g. echo 'hist:keys=common_pid:z=-(2)' >> event/trigger Constants can be used with division/multiplication (added in the next patch in this series) to implement granularity filters for frequent trace events. For instance we can limit emitting the rss_stat trace event to when there is a 512KB cross over in the rss size: # Create a synthetic event to monitor instead of the high frequency # rss_stat event echo 'rss_stat_throttled unsigned int mm_id; unsigned int curr; int member; long size' >> tracing/synthetic_events # Create a hist trigger that emits the synthetic rss_stat_throttled # event only when the rss size crosses a 512KB boundary. echo 'hist:keys=keys=mm_id,member:bucket=size/0x80000:onchange($bucket) .rss_stat_throttled(mm_id,curr,member,size)' >> events/kmem/rss_stat/trigger A use case for using constants with addition/subtraction is not yet known, but for completeness the use of constants are supported for all operators. Link: https://lkml.kernel.org/r/20211025200852.3002369-2-kaleshsingh@google.com Signed-off-by: Kalesh Singh Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 71 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 70 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index b64aed538628..e6165e36d3b6 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -66,7 +66,8 @@ C(EMPTY_SORT_FIELD, "Empty sort field"), \ C(TOO_MANY_SORT_FIELDS, "Too many sort fields (Max = 2)"), \ C(INVALID_SORT_FIELD, "Sort field must be a key or a val"), \ - C(INVALID_STR_OPERAND, "String type can not be an operand in expression"), + C(INVALID_STR_OPERAND, "String type can not be an operand in expression"), \ + C(EXPECT_NUMBER, "Expecting numeric literal"), #undef C #define C(a, b) HIST_ERR_##a @@ -89,6 +90,7 @@ typedef u64 (*hist_field_fn_t) (struct hist_field *field, #define HIST_FIELD_OPERANDS_MAX 2 #define HIST_FIELDS_MAX (TRACING_MAP_FIELDS_MAX + TRACING_MAP_VARS_MAX) #define HIST_ACTIONS_MAX 8 +#define HIST_CONST_DIGITS_MAX 21 enum field_op_id { FIELD_OP_NONE, @@ -152,6 +154,9 @@ struct hist_field { bool read_once; unsigned int var_str_idx; + + /* Numeric literals are represented as u64 */ + u64 constant; }; static u64 hist_field_none(struct hist_field *field, @@ -163,6 +168,15 @@ static u64 hist_field_none(struct hist_field *field, return 0; } +static u64 hist_field_const(struct hist_field *field, + struct tracing_map_elt *elt, + struct trace_buffer *buffer, + struct ring_buffer_event *rbe, + void *event) +{ + return field->constant; +} + static u64 hist_field_counter(struct hist_field *field, struct tracing_map_elt *elt, struct trace_buffer *buffer, @@ -341,6 +355,7 @@ enum hist_field_flags { HIST_FIELD_FL_CPU = 1 << 15, HIST_FIELD_FL_ALIAS = 1 << 16, HIST_FIELD_FL_BUCKET = 1 << 17, + HIST_FIELD_FL_CONST = 1 << 18, }; struct var_defs { @@ -1516,6 +1531,12 @@ static void expr_field_str(struct hist_field *field, char *expr) { if (field->flags & HIST_FIELD_FL_VAR_REF) strcat(expr, "$"); + else if (field->flags & HIST_FIELD_FL_CONST) { + char str[HIST_CONST_DIGITS_MAX]; + + snprintf(str, HIST_CONST_DIGITS_MAX, "%llu", field->constant); + strcat(expr, str); + } strcat(expr, hist_field_name(field, 0)); @@ -1689,6 +1710,15 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data, goto out; } + if (flags & HIST_FIELD_FL_CONST) { + hist_field->fn = hist_field_const; + hist_field->size = sizeof(u64); + hist_field->type = kstrdup("u64", GFP_KERNEL); + if (!hist_field->type) + goto free; + goto out; + } + if (flags & HIST_FIELD_FL_STACKTRACE) { hist_field->fn = hist_field_none; goto out; @@ -2090,6 +2120,29 @@ static struct hist_field *create_alias(struct hist_trigger_data *hist_data, return alias; } +static struct hist_field *parse_const(struct hist_trigger_data *hist_data, + char *str, char *var_name, + unsigned long *flags) +{ + struct trace_array *tr = hist_data->event_file->tr; + struct hist_field *field = NULL; + u64 constant; + + if (kstrtoull(str, 0, &constant)) { + hist_err(tr, HIST_ERR_EXPECT_NUMBER, errpos(str)); + return NULL; + } + + *flags |= HIST_FIELD_FL_CONST; + field = create_hist_field(hist_data, NULL, *flags, var_name); + if (!field) + return NULL; + + field->constant = constant; + + return field; +} + static struct hist_field *parse_atom(struct hist_trigger_data *hist_data, struct trace_event_file *file, char *str, unsigned long *flags, char *var_name) @@ -2100,6 +2153,15 @@ static struct hist_field *parse_atom(struct hist_trigger_data *hist_data, unsigned long buckets = 0; int ret = 0; + if (isdigit(str[0])) { + hist_field = parse_const(hist_data, str, var_name, flags); + if (!hist_field) { + ret = -EINVAL; + goto out; + } + return hist_field; + } + s = strchr(str, '.'); if (s) { s = strchr(++s, '.'); @@ -4945,6 +5007,8 @@ static void hist_field_debug_show_flags(struct seq_file *m, if (flags & HIST_FIELD_FL_ALIAS) seq_puts(m, " HIST_FIELD_FL_ALIAS\n"); + else if (flags & HIST_FIELD_FL_CONST) + seq_puts(m, " HIST_FIELD_FL_CONST\n"); } static int hist_field_debug_show(struct seq_file *m, @@ -4966,6 +5030,9 @@ static int hist_field_debug_show(struct seq_file *m, field->var.idx); } + if (field->flags & HIST_FIELD_FL_CONST) + seq_printf(m, " constant: %llu\n", field->constant); + if (field->flags & HIST_FIELD_FL_ALIAS) seq_printf(m, " var_ref_idx (into hist_data->var_refs[]): %u\n", field->var_ref_idx); @@ -5208,6 +5275,8 @@ static void hist_field_print(struct seq_file *m, struct hist_field *hist_field) if (hist_field->flags & HIST_FIELD_FL_CPU) seq_puts(m, "common_cpu"); + else if (hist_field->flags & HIST_FIELD_FL_CONST) + seq_printf(m, "%llu", hist_field->constant); else if (field_name) { if (hist_field->flags & HIST_FIELD_FL_VAR_REF || hist_field->flags & HIST_FIELD_FL_ALIAS) -- cgit v1.2.3 From bcef044150320217e2a00c65050114e509c222b8 Mon Sep 17 00:00:00 2001 From: Kalesh Singh Date: Mon, 25 Oct 2021 13:08:34 -0700 Subject: tracing: Add division and multiplication support for hist triggers Adds basic support for division and multiplication operations for hist trigger variable expressions. For simplicity this patch only supports, division and multiplication for a single operation expression (e.g. x=$a/$b), as currently expressions are always evaluated right to left. This can lead to some incorrect results: e.g. echo 'hist:keys=common_pid:x=8-4-2' >> event/trigger 8-4-2 should evaluate to 2 i.e. (8-4)-2 but currently x evaluate to 6 i.e. 8-(4-2) Multiplication and division in sub-expressions will work correctly, once correct operator precedence support is added (See next patch in this series). For the undefined case of division by 0, the histogram expression evaluates to (u64)(-1). Since this cannot be detected when the expression is created, it is the responsibility of the user to be aware and account for this possibility. Examples: echo 'hist:keys=common_pid:a=8,b=4,x=$a/$b' \ >> event/trigger echo 'hist:keys=common_pid:y=5*$b' \ >> event/trigger Link: https://lkml.kernel.org/r/20211025200852.3002369-3-kaleshsingh@google.com Signed-off-by: Kalesh Singh Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 72 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 71 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index e6165e36d3b6..1edec5d471c1 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -97,6 +97,8 @@ enum field_op_id { FIELD_OP_PLUS, FIELD_OP_MINUS, FIELD_OP_UNARY_MINUS, + FIELD_OP_DIV, + FIELD_OP_MULT, }; /* @@ -285,6 +287,40 @@ static u64 hist_field_minus(struct hist_field *hist_field, return val1 - val2; } +static u64 hist_field_div(struct hist_field *hist_field, + struct tracing_map_elt *elt, + struct trace_buffer *buffer, + struct ring_buffer_event *rbe, + void *event) +{ + struct hist_field *operand1 = hist_field->operands[0]; + struct hist_field *operand2 = hist_field->operands[1]; + + u64 val1 = operand1->fn(operand1, elt, buffer, rbe, event); + u64 val2 = operand2->fn(operand2, elt, buffer, rbe, event); + + /* Return -1 for the undefined case */ + if (!val2) + return -1; + + return div64_u64(val1, val2); +} + +static u64 hist_field_mult(struct hist_field *hist_field, + struct tracing_map_elt *elt, + struct trace_buffer *buffer, + struct ring_buffer_event *rbe, + void *event) +{ + struct hist_field *operand1 = hist_field->operands[0]; + struct hist_field *operand2 = hist_field->operands[1]; + + u64 val1 = operand1->fn(operand1, elt, buffer, rbe, event); + u64 val2 = operand2->fn(operand2, elt, buffer, rbe, event); + + return val1 * val2; +} + static u64 hist_field_unary_minus(struct hist_field *hist_field, struct tracing_map_elt *elt, struct trace_buffer *buffer, @@ -1592,6 +1628,12 @@ static char *expr_str(struct hist_field *field, unsigned int level) case FIELD_OP_PLUS: strcat(expr, "+"); break; + case FIELD_OP_DIV: + strcat(expr, "/"); + break; + case FIELD_OP_MULT: + strcat(expr, "*"); + break; default: kfree(expr); return NULL; @@ -1607,7 +1649,7 @@ static int contains_operator(char *str) enum field_op_id field_op = FIELD_OP_NONE; char *op; - op = strpbrk(str, "+-"); + op = strpbrk(str, "+-/*"); if (!op) return FIELD_OP_NONE; @@ -1628,6 +1670,12 @@ static int contains_operator(char *str) case '+': field_op = FIELD_OP_PLUS; break; + case '/': + field_op = FIELD_OP_DIV; + break; + case '*': + field_op = FIELD_OP_MULT; + break; default: break; } @@ -2361,10 +2409,26 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data, case FIELD_OP_PLUS: sep = "+"; break; + case FIELD_OP_DIV: + sep = "/"; + break; + case FIELD_OP_MULT: + sep = "*"; + break; default: goto free; } + /* + * Multiplication and division are only supported in single operator + * expressions, since the expression is always evaluated from right + * to left. + */ + if ((field_op == FIELD_OP_DIV || field_op == FIELD_OP_MULT) && level > 0) { + hist_err(file->tr, HIST_ERR_TOO_MANY_SUBEXPR, errpos(str)); + return ERR_PTR(-EINVAL); + } + operand1_str = strsep(&str, sep); if (!operand1_str || !str) goto free; @@ -2436,6 +2500,12 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data, case FIELD_OP_PLUS: expr->fn = hist_field_plus; break; + case FIELD_OP_DIV: + expr->fn = hist_field_div; + break; + case FIELD_OP_MULT: + expr->fn = hist_field_mult; + break; default: ret = -EINVAL; goto free; -- cgit v1.2.3 From 9710b2f341a0d96f35b911580639853cfda4677d Mon Sep 17 00:00:00 2001 From: Kalesh Singh Date: Mon, 25 Oct 2021 13:08:35 -0700 Subject: tracing: Fix operator precedence for hist triggers expression The current histogram expression evaluation logic evaluates the expression from right to left. This can lead to incorrect results if the operations are not associative (as is the case for subtraction and, the now added, division operators). e.g. 16-8-4-2 should be 2 not 10 --> 16-8-4-2 = ((16-8)-4)-2 64/8/4/2 should be 1 not 16 --> 64/8/4/2 = ((64/8)/4)/2 Division and multiplication are currently limited to single operation expression due to operator precedence support not yet implemented. Rework the expression parsing to support the correct evaluation of expressions containing operators of different precedences; and fix the associativity error by evaluating expressions with operators of the same precedence from left to right. Examples: (1) echo 'hist:keys=common_pid:a=8,b=4,c=2,d=1,w=$a-$b-$c-$d' \ >> event/trigger (2) echo 'hist:keys=common_pid:x=$a/$b/3/2' >> event/trigger (3) echo 'hist:keys=common_pid:y=$a+10/$c*1024' >> event/trigger (4) echo 'hist:keys=common_pid:z=$a/$b+$c*$d' >> event/trigger Link: https://lkml.kernel.org/r/20211025200852.3002369-4-kaleshsingh@google.com Signed-off-by: Kalesh Singh Reviewed-by: Namhyung Kim Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 210 ++++++++++++++++++++++++++------------- 1 file changed, 140 insertions(+), 70 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 1edec5d471c1..7a50ea2ac6b1 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -67,7 +67,9 @@ C(TOO_MANY_SORT_FIELDS, "Too many sort fields (Max = 2)"), \ C(INVALID_SORT_FIELD, "Sort field must be a key or a val"), \ C(INVALID_STR_OPERAND, "String type can not be an operand in expression"), \ - C(EXPECT_NUMBER, "Expecting numeric literal"), + C(EXPECT_NUMBER, "Expecting numeric literal"), \ + C(UNARY_MINUS_SUBEXPR, "Unary minus not supported in sub-expressions"), \ + C(SYM_OFFSET_SUBEXPR, ".sym-offset not supported in sub-expressions"), #undef C #define C(a, b) HIST_ERR_##a @@ -1644,40 +1646,96 @@ static char *expr_str(struct hist_field *field, unsigned int level) return expr; } -static int contains_operator(char *str) +/* + * If field_op != FIELD_OP_NONE, *sep points to the root operator + * of the expression tree to be evaluated. + */ +static int contains_operator(char *str, char **sep) { enum field_op_id field_op = FIELD_OP_NONE; - char *op; + char *minus_op, *plus_op, *div_op, *mult_op; + + + /* + * Report the last occurrence of the operators first, so that the + * expression is evaluated left to right. This is important since + * subtraction and division are not associative. + * + * e.g + * 64/8/4/2 is 1, i.e 64/8/4/2 = ((64/8)/4)/2 + * 14-7-5-2 is 0, i.e 14-7-5-2 = ((14-7)-5)-2 + */ - op = strpbrk(str, "+-/*"); - if (!op) - return FIELD_OP_NONE; + /* + * First, find lower precedence addition and subtraction + * since the expression will be evaluated recursively. + */ + minus_op = strrchr(str, '-'); + if (minus_op) { + /* Unfortunately, the modifier ".sym-offset" can confuse things. */ + if (minus_op - str >= 4 && !strncmp(minus_op - 4, ".sym-offset", 11)) + goto out; - switch (*op) { - case '-': /* - * Unfortunately, the modifier ".sym-offset" - * can confuse things. + * Unary minus is not supported in sub-expressions. If + * present, it is always the next root operator. */ - if (op - str >= 4 && !strncmp(op - 4, ".sym-offset", 11)) - return FIELD_OP_NONE; - - if (*str == '-') + if (minus_op == str) { field_op = FIELD_OP_UNARY_MINUS; - else - field_op = FIELD_OP_MINUS; - break; - case '+': - field_op = FIELD_OP_PLUS; - break; - case '/': + goto out; + } + + field_op = FIELD_OP_MINUS; + } + + plus_op = strrchr(str, '+'); + if (plus_op || minus_op) { + /* + * For operators of the same precedence use to rightmost as the + * root, so that the expression is evaluated left to right. + */ + if (plus_op > minus_op) + field_op = FIELD_OP_PLUS; + goto out; + } + + /* + * Multiplication and division have higher precedence than addition and + * subtraction. + */ + div_op = strrchr(str, '/'); + if (div_op) field_op = FIELD_OP_DIV; - break; - case '*': + + mult_op = strrchr(str, '*'); + /* + * For operators of the same precedence use to rightmost as the + * root, so that the expression is evaluated left to right. + */ + if (mult_op > div_op) field_op = FIELD_OP_MULT; - break; - default: - break; + +out: + if (sep) { + switch (field_op) { + case FIELD_OP_UNARY_MINUS: + case FIELD_OP_MINUS: + *sep = minus_op; + break; + case FIELD_OP_PLUS: + *sep = plus_op; + break; + case FIELD_OP_DIV: + *sep = div_op; + break; + case FIELD_OP_MULT: + *sep = mult_op; + break; + case FIELD_OP_NONE: + default: + *sep = NULL; + break; + } } return field_op; @@ -2003,7 +2061,7 @@ static char *field_name_from_var(struct hist_trigger_data *hist_data, if (strcmp(var_name, name) == 0) { field = hist_data->attrs->var_defs.expr[i]; - if (contains_operator(field) || is_var_ref(field)) + if (contains_operator(field, NULL) || is_var_ref(field)) continue; return field; } @@ -2266,21 +2324,24 @@ static struct hist_field *parse_atom(struct hist_trigger_data *hist_data, static struct hist_field *parse_expr(struct hist_trigger_data *hist_data, struct trace_event_file *file, char *str, unsigned long flags, - char *var_name, unsigned int level); + char *var_name, unsigned int *n_subexprs); static struct hist_field *parse_unary(struct hist_trigger_data *hist_data, struct trace_event_file *file, char *str, unsigned long flags, - char *var_name, unsigned int level) + char *var_name, unsigned int *n_subexprs) { struct hist_field *operand1, *expr = NULL; unsigned long operand_flags; int ret = 0; char *s; + /* Unary minus operator, increment n_subexprs */ + ++*n_subexprs; + /* we support only -(xxx) i.e. explicit parens required */ - if (level > 3) { + if (*n_subexprs > 3) { hist_err(file->tr, HIST_ERR_TOO_MANY_SUBEXPR, errpos(str)); ret = -EINVAL; goto free; @@ -2297,8 +2358,16 @@ static struct hist_field *parse_unary(struct hist_trigger_data *hist_data, } s = strrchr(str, ')'); - if (s) + if (s) { + /* unary minus not supported in sub-expressions */ + if (*(s+1) != '\0') { + hist_err(file->tr, HIST_ERR_UNARY_MINUS_SUBEXPR, + errpos(str)); + ret = -EINVAL; + goto free; + } *s = '\0'; + } else { ret = -EINVAL; /* no closing ')' */ goto free; @@ -2312,7 +2381,7 @@ static struct hist_field *parse_unary(struct hist_trigger_data *hist_data, } operand_flags = 0; - operand1 = parse_expr(hist_data, file, str, operand_flags, NULL, ++level); + operand1 = parse_expr(hist_data, file, str, operand_flags, NULL, n_subexprs); if (IS_ERR(operand1)) { ret = PTR_ERR(operand1); goto free; @@ -2382,60 +2451,61 @@ static int check_expr_operands(struct trace_array *tr, static struct hist_field *parse_expr(struct hist_trigger_data *hist_data, struct trace_event_file *file, char *str, unsigned long flags, - char *var_name, unsigned int level) + char *var_name, unsigned int *n_subexprs) { struct hist_field *operand1 = NULL, *operand2 = NULL, *expr = NULL; unsigned long operand_flags; int field_op, ret = -EINVAL; char *sep, *operand1_str; - if (level > 3) { + if (*n_subexprs > 3) { hist_err(file->tr, HIST_ERR_TOO_MANY_SUBEXPR, errpos(str)); return ERR_PTR(-EINVAL); } - field_op = contains_operator(str); + /* + * ".sym-offset" in expressions has no effect on their evaluation, + * but can confuse operator parsing. + */ + if (*n_subexprs == 0) { + sep = strstr(str, ".sym-offset"); + if (sep) { + *sep = '\0'; + if (strpbrk(str, "+-/*") || strpbrk(sep + 11, "+-/*")) { + *sep = '.'; + hist_err(file->tr, HIST_ERR_SYM_OFFSET_SUBEXPR, + errpos(sep)); + return ERR_PTR(-EINVAL); + } + *sep = '.'; + } + } + + field_op = contains_operator(str, &sep); if (field_op == FIELD_OP_NONE) return parse_atom(hist_data, file, str, &flags, var_name); if (field_op == FIELD_OP_UNARY_MINUS) - return parse_unary(hist_data, file, str, flags, var_name, ++level); + return parse_unary(hist_data, file, str, flags, var_name, n_subexprs); - switch (field_op) { - case FIELD_OP_MINUS: - sep = "-"; - break; - case FIELD_OP_PLUS: - sep = "+"; - break; - case FIELD_OP_DIV: - sep = "/"; - break; - case FIELD_OP_MULT: - sep = "*"; - break; - default: - goto free; - } + /* Binary operator found, increment n_subexprs */ + ++*n_subexprs; - /* - * Multiplication and division are only supported in single operator - * expressions, since the expression is always evaluated from right - * to left. - */ - if ((field_op == FIELD_OP_DIV || field_op == FIELD_OP_MULT) && level > 0) { - hist_err(file->tr, HIST_ERR_TOO_MANY_SUBEXPR, errpos(str)); - return ERR_PTR(-EINVAL); - } + /* Split the expression string at the root operator */ + if (!sep) + goto free; + *sep = '\0'; + operand1_str = str; + str = sep+1; - operand1_str = strsep(&str, sep); if (!operand1_str || !str) goto free; operand_flags = 0; - operand1 = parse_atom(hist_data, file, operand1_str, - &operand_flags, NULL); + + /* LHS of string is an expression e.g. a+b in a+b+c */ + operand1 = parse_expr(hist_data, file, operand1_str, operand_flags, NULL, n_subexprs); if (IS_ERR(operand1)) { ret = PTR_ERR(operand1); operand1 = NULL; @@ -2447,9 +2517,9 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data, goto free; } - /* rest of string could be another expression e.g. b+c in a+b+c */ + /* RHS of string is another expression e.g. c in a+b+c */ operand_flags = 0; - operand2 = parse_expr(hist_data, file, str, operand_flags, NULL, ++level); + operand2 = parse_expr(hist_data, file, str, operand_flags, NULL, n_subexprs); if (IS_ERR(operand2)) { ret = PTR_ERR(operand2); operand2 = NULL; @@ -3883,9 +3953,9 @@ static int __create_val_field(struct hist_trigger_data *hist_data, unsigned long flags) { struct hist_field *hist_field; - int ret = 0; + int ret = 0, n_subexprs = 0; - hist_field = parse_expr(hist_data, file, field_str, flags, var_name, 0); + hist_field = parse_expr(hist_data, file, field_str, flags, var_name, &n_subexprs); if (IS_ERR(hist_field)) { ret = PTR_ERR(hist_field); goto out; @@ -4026,7 +4096,7 @@ static int create_key_field(struct hist_trigger_data *hist_data, struct hist_field *hist_field = NULL; unsigned long flags = 0; unsigned int key_size; - int ret = 0; + int ret = 0, n_subexprs = 0; if (WARN_ON(key_idx >= HIST_FIELDS_MAX)) return -EINVAL; @@ -4039,7 +4109,7 @@ static int create_key_field(struct hist_trigger_data *hist_data, hist_field = create_hist_field(hist_data, NULL, flags, NULL); } else { hist_field = parse_expr(hist_data, file, field_str, flags, - NULL, 0); + NULL, &n_subexprs); if (IS_ERR(hist_field)) { ret = PTR_ERR(hist_field); goto out; -- cgit v1.2.3 From c5eac6ee8bc5d32e48b3845472b547574061f49f Mon Sep 17 00:00:00 2001 From: Kalesh Singh Date: Mon, 25 Oct 2021 13:08:36 -0700 Subject: tracing/histogram: Simplify handling of .sym-offset in expressions The '-' in .sym-offset can confuse the hist trigger arithmetic expression parsing. Simplify the handling of this by replacing the 'sym-offset' with 'symXoffset'. This allows us to correctly evaluate expressions where the user may have inadvertently added a .sym-offset modifier to one of the operands in an expression, instead of bailing out. In this case the .sym-offset has no effect on the evaluation of the expression. The only valid use of the .sym-offset is as a hist key modifier. Link: https://lkml.kernel.org/r/20211025200852.3002369-5-kaleshsingh@google.com Signed-off-by: Kalesh Singh Suggested-by: Steven Rostedt Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 43 ++++++++++++++++------------------------ 1 file changed, 17 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 7a50ea2ac6b1..bbaf2e16b7ae 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -68,8 +68,7 @@ C(INVALID_SORT_FIELD, "Sort field must be a key or a val"), \ C(INVALID_STR_OPERAND, "String type can not be an operand in expression"), \ C(EXPECT_NUMBER, "Expecting numeric literal"), \ - C(UNARY_MINUS_SUBEXPR, "Unary minus not supported in sub-expressions"), \ - C(SYM_OFFSET_SUBEXPR, ".sym-offset not supported in sub-expressions"), + C(UNARY_MINUS_SUBEXPR, "Unary minus not supported in sub-expressions"), #undef C #define C(a, b) HIST_ERR_##a @@ -1672,10 +1671,6 @@ static int contains_operator(char *str, char **sep) */ minus_op = strrchr(str, '-'); if (minus_op) { - /* Unfortunately, the modifier ".sym-offset" can confuse things. */ - if (minus_op - str >= 4 && !strncmp(minus_op - 4, ".sym-offset", 11)) - goto out; - /* * Unary minus is not supported in sub-expressions. If * present, it is always the next root operator. @@ -2138,7 +2133,11 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file, *flags |= HIST_FIELD_FL_HEX; else if (strcmp(modifier, "sym") == 0) *flags |= HIST_FIELD_FL_SYM; - else if (strcmp(modifier, "sym-offset") == 0) + /* + * 'sym-offset' occurrences in the trigger string are modified + * to 'symXoffset' to simplify arithmetic expression parsing. + */ + else if (strcmp(modifier, "symXoffset") == 0) *flags |= HIST_FIELD_FL_SYM_OFFSET; else if ((strcmp(modifier, "execname") == 0) && (strcmp(field_name, "common_pid") == 0)) @@ -2463,24 +2462,6 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data, return ERR_PTR(-EINVAL); } - /* - * ".sym-offset" in expressions has no effect on their evaluation, - * but can confuse operator parsing. - */ - if (*n_subexprs == 0) { - sep = strstr(str, ".sym-offset"); - if (sep) { - *sep = '\0'; - if (strpbrk(str, "+-/*") || strpbrk(sep + 11, "+-/*")) { - *sep = '.'; - hist_err(file->tr, HIST_ERR_SYM_OFFSET_SUBEXPR, - errpos(sep)); - return ERR_PTR(-EINVAL); - } - *sep = '.'; - } - } - field_op = contains_operator(str, &sep); if (field_op == FIELD_OP_NONE) @@ -5999,7 +5980,7 @@ static int event_hist_trigger_func(struct event_command *cmd_ops, struct synth_event *se; const char *se_name; bool remove = false; - char *trigger, *p; + char *trigger, *p, *start; int ret = 0; lockdep_assert_held(&event_mutex); @@ -6047,6 +6028,16 @@ static int event_hist_trigger_func(struct event_command *cmd_ops, trigger = strstrip(trigger); } + /* + * To simplify arithmetic expression parsing, replace occurrences of + * '.sym-offset' modifier with '.symXoffset' + */ + start = strstr(trigger, ".sym-offset"); + while (start) { + *(start + 4) = 'X'; + start = strstr(start + 11, ".sym-offset"); + }; + attrs = parse_hist_trigger_attrs(file->tr, trigger); if (IS_ERR(attrs)) return PTR_ERR(attrs); -- cgit v1.2.3 From f47716b7a955e40e2591b960d1eccb1fde967a70 Mon Sep 17 00:00:00 2001 From: Kalesh Singh Date: Mon, 25 Oct 2021 13:08:37 -0700 Subject: tracing/histogram: Covert expr to const if both operands are constants If both operands of a hist trigger expression are constants, convert the expression to a constant. This optimization avoids having to perform the same calculation multiple times and also saves on memory since the merged constants are represented by a single struct hist_field instead or multiple. Link: https://lkml.kernel.org/r/20211025200852.3002369-6-kaleshsingh@google.com Signed-off-by: Kalesh Singh Suggested-by: Steven Rostedt Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 104 ++++++++++++++++++++++++++++----------- 1 file changed, 74 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index bbaf2e16b7ae..71b453576d85 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -2411,9 +2411,15 @@ static struct hist_field *parse_unary(struct hist_trigger_data *hist_data, return ERR_PTR(ret); } +/* + * If the operands are var refs, return pointers the + * variable(s) referenced in var1 and var2, else NULL. + */ static int check_expr_operands(struct trace_array *tr, struct hist_field *operand1, - struct hist_field *operand2) + struct hist_field *operand2, + struct hist_field **var1, + struct hist_field **var2) { unsigned long operand1_flags = operand1->flags; unsigned long operand2_flags = operand2->flags; @@ -2426,6 +2432,7 @@ static int check_expr_operands(struct trace_array *tr, if (!var) return -EINVAL; operand1_flags = var->flags; + *var1 = var; } if ((operand2_flags & HIST_FIELD_FL_VAR_REF) || @@ -2436,6 +2443,7 @@ static int check_expr_operands(struct trace_array *tr, if (!var) return -EINVAL; operand2_flags = var->flags; + *var2 = var; } if ((operand1_flags & HIST_FIELD_FL_TIMESTAMP_USECS) != @@ -2453,9 +2461,12 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data, char *var_name, unsigned int *n_subexprs) { struct hist_field *operand1 = NULL, *operand2 = NULL, *expr = NULL; - unsigned long operand_flags; + struct hist_field *var1 = NULL, *var2 = NULL; + unsigned long operand_flags, operand2_flags; int field_op, ret = -EINVAL; char *sep, *operand1_str; + hist_field_fn_t op_fn; + bool combine_consts; if (*n_subexprs > 3) { hist_err(file->tr, HIST_ERR_TOO_MANY_SUBEXPR, errpos(str)); @@ -2512,11 +2523,38 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data, goto free; } - ret = check_expr_operands(file->tr, operand1, operand2); + switch (field_op) { + case FIELD_OP_MINUS: + op_fn = hist_field_minus; + break; + case FIELD_OP_PLUS: + op_fn = hist_field_plus; + break; + case FIELD_OP_DIV: + op_fn = hist_field_div; + break; + case FIELD_OP_MULT: + op_fn = hist_field_mult; + break; + default: + ret = -EINVAL; + goto free; + } + + ret = check_expr_operands(file->tr, operand1, operand2, &var1, &var2); if (ret) goto free; - flags |= HIST_FIELD_FL_EXPR; + operand_flags = var1 ? var1->flags : operand1->flags; + operand2_flags = var2 ? var2->flags : operand2->flags; + + /* + * If both operands are constant, the expression can be + * collapsed to a single constant. + */ + combine_consts = operand_flags & operand2_flags & HIST_FIELD_FL_CONST; + + flags |= combine_consts ? HIST_FIELD_FL_CONST : HIST_FIELD_FL_EXPR; flags |= operand1->flags & (HIST_FIELD_FL_TIMESTAMP | HIST_FIELD_FL_TIMESTAMP_USECS); @@ -2533,37 +2571,43 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data, expr->operands[0] = operand1; expr->operands[1] = operand2; - /* The operand sizes should be the same, so just pick one */ - expr->size = operand1->size; + if (combine_consts) { + if (var1) + expr->operands[0] = var1; + if (var2) + expr->operands[1] = var2; - expr->operator = field_op; - expr->name = expr_str(expr, 0); - expr->type = kstrdup_const(operand1->type, GFP_KERNEL); - if (!expr->type) { - ret = -ENOMEM; - goto free; - } + expr->constant = op_fn(expr, NULL, NULL, NULL, NULL); - switch (field_op) { - case FIELD_OP_MINUS: - expr->fn = hist_field_minus; - break; - case FIELD_OP_PLUS: - expr->fn = hist_field_plus; - break; - case FIELD_OP_DIV: - expr->fn = hist_field_div; - break; - case FIELD_OP_MULT: - expr->fn = hist_field_mult; - break; - default: - ret = -EINVAL; - goto free; + expr->operands[0] = NULL; + expr->operands[1] = NULL; + + /* + * var refs won't be destroyed immediately + * See: destroy_hist_field() + */ + destroy_hist_field(operand2, 0); + destroy_hist_field(operand1, 0); + + expr->name = expr_str(expr, 0); + } else { + expr->fn = op_fn; + + /* The operand sizes should be the same, so just pick one */ + expr->size = operand1->size; + + expr->operator = field_op; + expr->type = kstrdup_const(operand1->type, GFP_KERNEL); + if (!expr->type) { + ret = -ENOMEM; + goto free; + } + + expr->name = expr_str(expr, 0); } return expr; - free: +free: destroy_hist_field(operand1, 0); destroy_hist_field(operand2, 0); destroy_hist_field(expr, 0); -- cgit v1.2.3 From 722eddaa4043acee8f031cf238ced5f7514ad638 Mon Sep 17 00:00:00 2001 From: Kalesh Singh Date: Mon, 25 Oct 2021 13:08:38 -0700 Subject: tracing/histogram: Optimize division by a power of 2 The division is a slow operation. If the divisor is a power of 2, use a shift instead. Results were obtained using Android's version of perf (simpleperf[1]) as described below: 1. hist_field_div() is modified to call 2 test functions: test_hist_field_div_[not]_optimized(); passing them the same args. Use noinline and volatile to ensure these are not optimized out by the compiler. 2. Create a hist event trigger that uses division: events/kmem/rss_stat$ echo 'hist:keys=common_pid:x=size/' >> trigger events/kmem/rss_stat$ echo 'hist:keys=common_pid:vals=$x' >> trigger 3. Run Android's lmkd_test[2] to generate rss_stat events, and record CPU samples with Android's simpleperf: simpleperf record -a --exclude-perf --post-unwind=yes -m 16384 -g -f 2000 -o perf.data == Results == Divisor is a power of 2 (divisor == 32): test_hist_field_div_not_optimized | 8,717,091 cpu-cycles test_hist_field_div_optimized | 1,643,137 cpu-cycles If the divisor is a power of 2, the optimized version is ~5.3x faster. Divisor is not a power of 2 (divisor == 33): test_hist_field_div_not_optimized | 4,444,324 cpu-cycles test_hist_field_div_optimized | 5,497,958 cpu-cycles If the divisor is not a power of 2, as expected, the optimized version is slightly slower (~24% slower). [1] https://android.googlesource.com/platform/system/extras/+/master/simpleperf/doc/README.md [2] https://cs.android.com/android/platform/superproject/+/master:system/memory/lmkd/tests/lmkd_test.cpp Link: https://lkml.kernel.org/r/20211025200852.3002369-7-kaleshsingh@google.com Signed-off-by: Kalesh Singh Suggested-by: Steven Rostedt Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 71b453576d85..452daad7cfb3 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -304,6 +304,10 @@ static u64 hist_field_div(struct hist_field *hist_field, if (!val2) return -1; + /* Use shift if the divisor is a power of 2 */ + if (!(val2 & (val2 - 1))) + return val1 >> __ffs64(val2); + return div64_u64(val1, val2); } -- cgit v1.2.3 From ce5e48036c9e76a2a5bd4d9079eac273087a533a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E8=B4=87?= Date: Wed, 27 Oct 2021 11:14:44 +0800 Subject: ftrace: disable preemption when recursion locked As the documentation explained, ftrace_test_recursion_trylock() and ftrace_test_recursion_unlock() were supposed to disable and enable preemption properly, however currently this work is done outside of the function, which could be missing by mistake. And since the internal using of trace_test_and_set_recursion() and trace_clear_recursion() also require preemption disabled, we can just merge the logical. This patch will make sure the preemption has been disabled when trace_test_and_set_recursion() return bit >= 0, and trace_clear_recursion() will enable the preemption if previously enabled. Link: https://lkml.kernel.org/r/13bde807-779c-aa4c-0672-20515ae365ea@linux.alibaba.com CC: Petr Mladek Cc: Guo Ren Cc: Ingo Molnar Cc: "James E.J. Bottomley" Cc: Helge Deller Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Paul Walmsley Cc: Palmer Dabbelt Cc: Albert Ou Cc: Thomas Gleixner Cc: Borislav Petkov Cc: "H. Peter Anvin" Cc: Josh Poimboeuf Cc: Jiri Kosina Cc: Joe Lawrence Cc: Masami Hiramatsu Cc: Nicholas Piggin Cc: Jisheng Zhang CC: Steven Rostedt CC: Miroslav Benes Reported-by: Abaci Suggested-by: Peter Zijlstra Signed-off-by: Michael Wang [ Removed extra line in comment - SDR ] Signed-off-by: Steven Rostedt (VMware) --- kernel/livepatch/patch.c | 12 ++++++------ kernel/trace/ftrace.c | 15 +++++---------- kernel/trace/trace_functions.c | 5 ----- 3 files changed, 11 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/livepatch/patch.c b/kernel/livepatch/patch.c index e8029aea67f1..fe316c021d73 100644 --- a/kernel/livepatch/patch.c +++ b/kernel/livepatch/patch.c @@ -49,14 +49,15 @@ static void notrace klp_ftrace_handler(unsigned long ip, ops = container_of(fops, struct klp_ops, fops); + /* + * The ftrace_test_recursion_trylock() will disable preemption, + * which is required for the variant of synchronize_rcu() that is + * used to allow patching functions where RCU is not watching. + * See klp_synchronize_transition() for more details. + */ bit = ftrace_test_recursion_trylock(ip, parent_ip); if (WARN_ON_ONCE(bit < 0)) return; - /* - * A variant of synchronize_rcu() is used to allow patching functions - * where RCU is not watching, see klp_synchronize_transition(). - */ - preempt_disable_notrace(); func = list_first_or_null_rcu(&ops->func_stack, struct klp_func, stack_node); @@ -120,7 +121,6 @@ static void notrace klp_ftrace_handler(unsigned long ip, klp_arch_set_pc(fregs, (unsigned long)func->new_func); unlock: - preempt_enable_notrace(); ftrace_test_recursion_unlock(bit); } diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 2057ad363772..b4ed1a301232 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -7198,16 +7198,15 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op; int bit; + /* + * The ftrace_test_and_set_recursion() will disable preemption, + * which is required since some of the ops may be dynamically + * allocated, they must be freed after a synchronize_rcu(). + */ bit = trace_test_and_set_recursion(ip, parent_ip, TRACE_LIST_START, TRACE_LIST_MAX); if (bit < 0) return; - /* - * Some of the ops may be dynamically allocated, - * they must be freed after a synchronize_rcu(). - */ - preempt_disable_notrace(); - do_for_each_ftrace_op(op, ftrace_ops_list) { /* Stub functions don't need to be called nor tested */ if (op->flags & FTRACE_OPS_FL_STUB) @@ -7231,7 +7230,6 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, } } while_for_each_ftrace_op(op); out: - preempt_enable_notrace(); trace_clear_recursion(bit); } @@ -7279,12 +7277,9 @@ static void ftrace_ops_assist_func(unsigned long ip, unsigned long parent_ip, if (bit < 0) return; - preempt_disable_notrace(); - if (!(op->flags & FTRACE_OPS_FL_RCU) || rcu_is_watching()) op->func(ip, parent_ip, op, fregs); - preempt_enable_notrace(); trace_clear_recursion(bit); } NOKPROBE_SYMBOL(ftrace_ops_assist_func); diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 1f0e63f5d1f9..9f1bfbe105e8 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -186,7 +186,6 @@ function_trace_call(unsigned long ip, unsigned long parent_ip, return; trace_ctx = tracing_gen_ctx(); - preempt_disable_notrace(); cpu = smp_processor_id(); data = per_cpu_ptr(tr->array_buffer.data, cpu); @@ -194,7 +193,6 @@ function_trace_call(unsigned long ip, unsigned long parent_ip, trace_function(tr, ip, parent_ip, trace_ctx); ftrace_test_recursion_unlock(bit); - preempt_enable_notrace(); } #ifdef CONFIG_UNWINDER_ORC @@ -298,8 +296,6 @@ function_no_repeats_trace_call(unsigned long ip, unsigned long parent_ip, if (bit < 0) return; - preempt_disable_notrace(); - cpu = smp_processor_id(); data = per_cpu_ptr(tr->array_buffer.data, cpu); if (atomic_read(&data->disabled)) @@ -324,7 +320,6 @@ function_no_repeats_trace_call(unsigned long ip, unsigned long parent_ip, out: ftrace_test_recursion_unlock(bit); - preempt_enable_notrace(); } static void -- cgit v1.2.3 From d33cc657372366a8959f099c619a208b4c5dc664 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E8=B4=87?= Date: Wed, 27 Oct 2021 11:15:11 +0800 Subject: ftrace: do CPU checking after preemption disabled With CONFIG_DEBUG_PREEMPT we observed reports like: BUG: using smp_processor_id() in preemptible caller is perf_ftrace_function_call+0x6f/0x2e0 CPU: 1 PID: 680 Comm: a.out Not tainted Call Trace: dump_stack_lvl+0x8d/0xcf check_preemption_disabled+0x104/0x110 ? optimize_nops.isra.7+0x230/0x230 ? text_poke_bp_batch+0x9f/0x310 perf_ftrace_function_call+0x6f/0x2e0 ... __text_poke+0x5/0x620 text_poke_bp_batch+0x9f/0x310 This telling us the CPU could be changed after task is preempted, and the checking on CPU before preemption will be invalid. Since now ftrace_test_recursion_trylock() will help to disable the preemption, this patch just do the checking after trylock() to address the issue. Link: https://lkml.kernel.org/r/54880691-5fe2-33e7-d12f-1fa6136f5183@linux.alibaba.com CC: Steven Rostedt Cc: Guo Ren Cc: Ingo Molnar Cc: "James E.J. Bottomley" Cc: Helge Deller Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Paul Walmsley Cc: Palmer Dabbelt Cc: Albert Ou Cc: Thomas Gleixner Cc: Borislav Petkov Cc: "H. Peter Anvin" Cc: Josh Poimboeuf Cc: Jiri Kosina Cc: Miroslav Benes Cc: Petr Mladek Cc: Joe Lawrence Cc: Masami Hiramatsu Cc: "Peter Zijlstra (Intel)" Cc: Nicholas Piggin Cc: Jisheng Zhang Reported-by: Abaci Signed-off-by: Michael Wang Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_event_perf.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 6aed10e2f7ce..fba8cb77a73a 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -441,13 +441,13 @@ perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip, if (!rcu_is_watching()) return; - if ((unsigned long)ops->private != smp_processor_id()) - return; - bit = ftrace_test_recursion_trylock(ip, parent_ip); if (bit < 0) return; + if ((unsigned long)ops->private != smp_processor_id()) + goto out; + event = container_of(ops, struct perf_event, ftrace_ops); /* -- cgit v1.2.3 From a90afe8d020da9298c98fddb19b7a6372e2feb45 Mon Sep 17 00:00:00 2001 From: "Robin H. Johnson" Date: Mon, 30 Aug 2021 21:37:22 -0700 Subject: tracing: Show size of requested perf buffer If the perf buffer isn't large enough, provide a hint about how large it needs to be for whatever is running. Link: https://lkml.kernel.org/r/20210831043723.13481-1-robbat2@gentoo.org Signed-off-by: Robin H. Johnson Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_event_perf.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index fba8cb77a73a..a114549720d6 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -400,7 +400,8 @@ void *perf_trace_buf_alloc(int size, struct pt_regs **regs, int *rctxp) BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(unsigned long)); if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, - "perf buffer not large enough")) + "perf buffer not large enough, wanted %d, have %d", + size, PERF_MAX_TRACE_SIZE)) return NULL; *rctxp = rctx = perf_swevent_get_recursion_context(); -- cgit v1.2.3 From feea69ec121f067073868cebe0cb9d003e64ad80 Mon Sep 17 00:00:00 2001 From: kernel test robot Date: Sat, 30 Oct 2021 08:56:15 +0800 Subject: tracing/histogram: Fix semicolon.cocci warnings kernel/trace/trace_events_hist.c:6039:2-3: Unneeded semicolon Remove unneeded semicolon. Generated by: scripts/coccinelle/misc/semicolon.cocci Link: https://lkml.kernel.org/r/20211030005615.GA41257@3074f0d39c61 Fixes: c5eac6ee8bc5 ("tracing/histogram: Simplify handling of .sym-offset in expressions") CC: Kalesh Singh Reported-by: kernel test robot Signed-off-by: kernel test robot Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 452daad7cfb3..682870d004c4 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -6084,7 +6084,7 @@ static int event_hist_trigger_func(struct event_command *cmd_ops, while (start) { *(start + 4) = 'X'; start = strstr(start + 11, ".sym-offset"); - }; + } attrs = parse_hist_trigger_attrs(file->tr, trigger); if (IS_ERR(attrs)) -- cgit v1.2.3