From 52c3fb1a0f822fd64529ca64f3792095524de450 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 18 Jul 2024 10:21:49 +0200 Subject: perf/x86: Add hw_perf_event::aux_config Start a new section for AUX PMUs in hw_perf_event. Signed-off-by: Peter Zijlstra (Intel) --- include/linux/perf_event.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 1a8942277dda..6bb0c21d6335 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -168,6 +168,9 @@ struct hw_perf_event { struct hw_perf_event_extra extra_reg; struct hw_perf_event_extra branch_reg; }; + struct { /* aux / Intel-PT */ + u64 aux_config; + }; struct { /* software */ struct hrtimer hrtimer; }; -- cgit v1.2.3 From 79bd233010859463e46a0a4b3926eaaba25a6110 Mon Sep 17 00:00:00 2001 From: Ben Gainey Date: Tue, 30 Jul 2024 09:44:14 +0100 Subject: perf: Rename perf_event_context.nr_pending to nr_no_switch_fast. nr_pending counts the number of events in the context that either pending_sigtrap or pending_work, but it is used to prevent taking the fast path in perf_event_context_sched_out. Renamed to reflect what it is used for, rather than what it counts. This change allows using the field to track other event properties that also require skipping the fast path without possible confusion over the name. Signed-off-by: Ben Gainey Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20240730084417.7693-2-ben.gainey@arm.com --- include/linux/perf_event.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 6bb0c21d6335..655f66b18418 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -966,12 +966,13 @@ struct perf_event_context { struct rcu_head rcu_head; /* - * Sum (event->pending_work + event->pending_work) + * The count of events for which using the switch-out fast path + * should be avoided. * * The SIGTRAP is targeted at ctx->task, as such it won't do changing * that until the signal is delivered. */ - local_t nr_pending; + local_t nr_no_switch_fast; }; struct perf_cpu_pmu_context { -- cgit v1.2.3 From 7e8b255650fcfa1d05a57e4093d8405e6d8dd488 Mon Sep 17 00:00:00 2001 From: Ben Gainey Date: Tue, 30 Jul 2024 09:44:15 +0100 Subject: perf: Support PERF_SAMPLE_READ with inherit This change allows events to use PERF_SAMPLE_READ with inherit so long as PERF_SAMPLE_TID is also set. This enables sample based profiling of a group of counters over a hierarchy of processes or threads. This is useful, for example, for collecting per-thread counters/metrics, event based sampling of multiple counters as a unit, access to the enabled and running time when using multiplexing and so on. Prior to this, users were restricted to either collecting aggregate statistics for a multi-threaded/-process application (e.g. with "perf stat"), or to sample individual threads, or to profile the entire system (which requires root or CAP_PERFMON, and may produce much more data than is required). Theoretically a tool could poll for or otherwise monitor thread/process creation and construct whatever events the user is interested in using perf_event_open, for each new thread or process, but this is racy, can lead to file-descriptor exhaustion, and ultimately just replicates the behaviour of inherit, but in userspace. This configuration differs from inherit without PERF_SAMPLE_READ in that the accumulated event count, and consequently any sample (such as if triggered by overflow of sample_period) will be on a per-thread rather than on an aggregate basis. The meaning of read_format::value field of both PERF_RECORD_READ and PERF_RECORD_SAMPLE is changed such that if the sampled event uses this new configuration then the values reported will be per-thread rather than the global aggregate value. This is a change from the existing semantics of read_format (where PERF_SAMPLE_READ is used without inherit), but it is necessary to expose the per-thread counter values, and it avoids reinventing a separate "read_format_thread" field that otherwise replicates the same behaviour. This change should not break existing tools, since this configuration was not previously valid and was rejected by the kernel. Tools that opt into this new mode will need to account for this when calculating the counter delta for a given sample. Tools that wish to have both the per-thread and aggregate value can perform the global aggregation themselves from the per-thread values. The change to read_format::value does not affect existing valid perf_event_attr configurations, nor does it change the behaviour of calls to "read" on an event descriptor. Both continue to report the aggregate value for the entire thread/process hierarchy. The difference between the results reported by "read" and PERF_RECORD_SAMPLE in this new configuration is justified on the basis that it is not (easily) possible for "read" to target a specific thread (the caller only has the fd for the original parent event). Signed-off-by: Ben Gainey Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20240730084417.7693-3-ben.gainey@arm.com --- include/linux/perf_event.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 655f66b18418..701549967c18 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -969,6 +969,9 @@ struct perf_event_context { * The count of events for which using the switch-out fast path * should be avoided. * + * Sum (event->pending_work + events with + * (attr->inherit && (attr->sample_type & PERF_SAMPLE_READ))) + * * The SIGTRAP is targeted at ctx->task, as such it won't do changing * that until the signal is delivered. */ -- cgit v1.2.3 From cfa7f3d2c526c224a6271cc78a4a27a0de06f4f0 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 29 Jul 2024 10:52:23 -0700 Subject: perf,x86: avoid missing caller address in stack traces captured in uprobe When tracing user functions with uprobe functionality, it's common to install the probe (e.g., a BPF program) at the first instruction of the function. This is often going to be `push %rbp` instruction in function preamble, which means that within that function frame pointer hasn't been established yet. This leads to consistently missing an actual caller of the traced function, because perf_callchain_user() only records current IP (capturing traced function) and then following frame pointer chain (which would be caller's frame, containing the address of caller's caller). So when we have target_1 -> target_2 -> target_3 call chain and we are tracing an entry to target_3, captured stack trace will report target_1 -> target_3 call chain, which is wrong and confusing. This patch proposes a x86-64-specific heuristic to detect `push %rbp` (`push %ebp` on 32-bit architecture) instruction being traced. Given entire kernel implementation of user space stack trace capturing works under assumption that user space code was compiled with frame pointer register (%rbp/%ebp) preservation, it seems pretty reasonable to use this instruction as a strong indicator that this is the entry to the function. In that case, return address is still pointed to by %rsp/%esp, so we fetch it and add to stack trace before proceeding to unwind the rest using frame pointer-based logic. We also check for `endbr64` (for 64-bit modes) as another common pattern for function entry, as suggested by Josh Poimboeuf. Even if we get this wrong sometimes for uprobes attached not at the function entry, it's OK because stack trace will still be overall meaningful, just with one extra bogus entry. If we don't detect this, we end up with guaranteed to be missing caller function entry in the stack trace, which is worse overall. Signed-off-by: Andrii Nakryiko Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20240729175223.23914-1-andrii@kernel.org --- include/linux/uprobes.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index b503fafb7fb3..a270a5892ab4 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -76,6 +76,8 @@ struct uprobe_task { struct uprobe *active_uprobe; unsigned long xol_vaddr; + struct arch_uprobe *auprobe; + struct return_instance *return_instances; unsigned int depth; }; -- cgit v1.2.3 From e04332ebc8ac128fa551e83f1161ab1c094d13a9 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 1 Aug 2024 15:27:28 +0200 Subject: uprobes: kill uprobe_register_refctr() It doesn't make any sense to have 2 versions of _register(). Note that trace_uprobe_enable(), the only user of uprobe_register(), doesn't need to check tu->ref_ctr_offset to decide which one should be used, it could safely pass ref_ctr_offset == 0 to uprobe_register_refctr(). Add this argument to uprobe_register(), update the callers, and kill uprobe_register_refctr(). Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Jiri Olsa Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20240801132728.GA8800@redhat.com --- include/linux/uprobes.h | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index a270a5892ab4..788813c0b8fc 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -112,8 +112,7 @@ extern bool is_trap_insn(uprobe_opcode_t *insn); extern unsigned long uprobe_get_swbp_addr(struct pt_regs *regs); extern unsigned long uprobe_get_trap_addr(struct pt_regs *regs); extern int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr, uprobe_opcode_t); -extern int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *uc); -extern int uprobe_register_refctr(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc); +extern int uprobe_register(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc); extern int uprobe_apply(struct inode *inode, loff_t offset, struct uprobe_consumer *uc, bool); extern void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc); extern int uprobe_mmap(struct vm_area_struct *vma); @@ -154,11 +153,7 @@ static inline void uprobes_init(void) #define uprobe_get_trap_addr(regs) instruction_pointer(regs) static inline int -uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *uc) -{ - return -ENOSYS; -} -static inline int uprobe_register_refctr(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc) +uprobe_register(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc) { return -ENOSYS; } -- cgit v1.2.3 From 3c83a9ad0295eb63bdeb81d821b8c3b9417fbcac Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 1 Aug 2024 15:27:34 +0200 Subject: uprobes: make uprobe_register() return struct uprobe * This way uprobe_unregister() and uprobe_apply() can use "struct uprobe *" rather than inode + offset. This simplifies the code and allows to avoid the unnecessary find_uprobe() + put_uprobe() in these functions. TODO: uprobe_unregister() still needs get_uprobe/put_uprobe to ensure that this uprobe can't be freed before up_write(&uprobe->register_rwsem). Co-developed-by: Andrii Nakryiko Signed-off-by: Andrii Nakryiko Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Jiri Olsa Link: https://lore.kernel.org/r/20240801132734.GA8803@redhat.com --- include/linux/uprobes.h | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index 788813c0b8fc..f50df1fa93e7 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -16,6 +16,7 @@ #include #include +struct uprobe; struct vm_area_struct; struct mm_struct; struct inode; @@ -112,9 +113,9 @@ extern bool is_trap_insn(uprobe_opcode_t *insn); extern unsigned long uprobe_get_swbp_addr(struct pt_regs *regs); extern unsigned long uprobe_get_trap_addr(struct pt_regs *regs); extern int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr, uprobe_opcode_t); -extern int uprobe_register(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc); -extern int uprobe_apply(struct inode *inode, loff_t offset, struct uprobe_consumer *uc, bool); -extern void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc); +extern struct uprobe *uprobe_register(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc); +extern int uprobe_apply(struct uprobe *uprobe, struct uprobe_consumer *uc, bool); +extern void uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc); extern int uprobe_mmap(struct vm_area_struct *vma); extern void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end); extern void uprobe_start_dup_mmap(void); @@ -152,18 +153,18 @@ static inline void uprobes_init(void) #define uprobe_get_trap_addr(regs) instruction_pointer(regs) -static inline int +static inline struct uprobe * uprobe_register(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc) { - return -ENOSYS; + return ERR_PTR(-ENOSYS); } static inline int -uprobe_apply(struct inode *inode, loff_t offset, struct uprobe_consumer *uc, bool add) +uprobe_apply(struct uprobe* uprobe, struct uprobe_consumer *uc, bool add) { return -ENOSYS; } static inline void -uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc) +uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc) { } static inline int uprobe_mmap(struct vm_area_struct *vma) -- cgit v1.2.3 From 59da880afed211c989ef65da577b24215ce57774 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 3 Sep 2024 10:45:58 -0700 Subject: uprobes: get rid of enum uprobe_filter_ctx in uprobe filter callbacks It serves no purpose beyond adding unnecessray argument passed to the filter callback. Just get rid of it, no one is actually using it. Signed-off-by: Andrii Nakryiko Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Oleg Nesterov Link: https://lore.kernel.org/r/20240903174603.3554182-4-andrii@kernel.org --- include/linux/uprobes.h | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index f50df1fa93e7..63ae2ade3487 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -28,20 +28,12 @@ struct page; #define MAX_URETPROBE_DEPTH 64 -enum uprobe_filter_ctx { - UPROBE_FILTER_REGISTER, - UPROBE_FILTER_UNREGISTER, - UPROBE_FILTER_MMAP, -}; - struct uprobe_consumer { int (*handler)(struct uprobe_consumer *self, struct pt_regs *regs); int (*ret_handler)(struct uprobe_consumer *self, unsigned long func, struct pt_regs *regs); - bool (*filter)(struct uprobe_consumer *self, - enum uprobe_filter_ctx ctx, - struct mm_struct *mm); + bool (*filter)(struct uprobe_consumer *self, struct mm_struct *mm); struct uprobe_consumer *next; }; -- cgit v1.2.3 From cc01bd044e6a521d2cd128f685ee8d23ef0067f2 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 3 Sep 2024 10:45:59 -0700 Subject: uprobes: travers uprobe's consumer list locklessly under SRCU protection uprobe->register_rwsem is one of a few big bottlenecks to scalability of uprobes, so we need to get rid of it to improve uprobe performance and multi-CPU scalability. First, we turn uprobe's consumer list to a typical doubly-linked list and utilize existing RCU-aware helpers for traversing such lists, as well as adding and removing elements from it. For entry uprobes we already have SRCU protection active since before uprobe lookup. For uretprobe we keep refcount, guaranteeing that uprobe won't go away from under us, but we add SRCU protection around consumer list traversal. Lastly, to keep handler_chain()'s UPROBE_HANDLER_REMOVE handling simple, we remember whether any removal was requested during handler calls, but then we double-check the decision under a proper register_rwsem using consumers' filter callbacks. Handler removal is very rare, so this extra lock won't hurt performance, overall, but we also avoid the need for any extra protection (e.g., seqcount locks). Signed-off-by: Andrii Nakryiko Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Oleg Nesterov Link: https://lore.kernel.org/r/20240903174603.3554182-5-andrii@kernel.org --- include/linux/uprobes.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index 63ae2ade3487..f112b56e9479 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -29,13 +29,21 @@ struct page; #define MAX_URETPROBE_DEPTH 64 struct uprobe_consumer { + /* + * handler() can return UPROBE_HANDLER_REMOVE to signal the need to + * unregister uprobe for current process. If UPROBE_HANDLER_REMOVE is + * returned, filter() callback has to be implemented as well and it + * should return false to "confirm" the decision to uninstall uprobe + * for the current process. If filter() is omitted or returns true, + * UPROBE_HANDLER_REMOVE is effectively ignored. + */ int (*handler)(struct uprobe_consumer *self, struct pt_regs *regs); int (*ret_handler)(struct uprobe_consumer *self, unsigned long func, struct pt_regs *regs); bool (*filter)(struct uprobe_consumer *self, struct mm_struct *mm); - struct uprobe_consumer *next; + struct list_head cons_node; }; #ifdef CONFIG_UPROBES -- cgit v1.2.3 From 04b01625da130c7521b768996cd5e48052198b97 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 3 Sep 2024 10:46:00 -0700 Subject: perf/uprobe: split uprobe_unregister() With uprobe_unregister() having grown a synchronize_srcu(), it becomes fairly slow to call. Esp. since both users of this API call it in a loop. Peel off the sync_srcu() and do it once, after the loop. We also need to add uprobe_unregister_sync() into uprobe_register()'s error handling path, as we need to be careful about returning to the caller before we have a guarantee that partially attached consumer won't be called anymore. This is an unlikely slow path and this should be totally fine to be slow in the case of a failed attach. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: "Peter Zijlstra (Intel)" Co-developed-by: Andrii Nakryiko Signed-off-by: Andrii Nakryiko Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Oleg Nesterov Link: https://lore.kernel.org/r/20240903174603.3554182-6-andrii@kernel.org --- include/linux/uprobes.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index f112b56e9479..2b294bf1881f 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -115,7 +115,8 @@ extern unsigned long uprobe_get_trap_addr(struct pt_regs *regs); extern int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr, uprobe_opcode_t); extern struct uprobe *uprobe_register(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc); extern int uprobe_apply(struct uprobe *uprobe, struct uprobe_consumer *uc, bool); -extern void uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc); +extern void uprobe_unregister_nosync(struct uprobe *uprobe, struct uprobe_consumer *uc); +extern void uprobe_unregister_sync(void); extern int uprobe_mmap(struct vm_area_struct *vma); extern void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end); extern void uprobe_start_dup_mmap(void); @@ -164,7 +165,10 @@ uprobe_apply(struct uprobe* uprobe, struct uprobe_consumer *uc, bool add) return -ENOSYS; } static inline void -uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc) +uprobe_unregister_nosync(struct uprobe *uprobe, struct uprobe_consumer *uc) +{ +} +static inline void uprobe_unregister_sync(void) { } static inline int uprobe_mmap(struct vm_area_struct *vma) -- cgit v1.2.3 From 50a38035ed5ccc2ab8a28eaf70c3c7a87e060345 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 3 Sep 2024 10:46:01 -0700 Subject: rbtree: provide rb_find_rcu() / rb_find_add_rcu() Much like latch_tree, add two RCU methods for the regular RB-tree, which can be used in conjunction with a seqcount to provide lockless lookups. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: "Peter Zijlstra (Intel)" Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Oleg Nesterov Reviewed-by: "Masami Hiramatsu (Google)" Link: https://lore.kernel.org/r/20240903174603.3554182-7-andrii@kernel.org --- include/linux/rbtree.h | 67 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 67 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h index f7edca369eda..7c173aa64e1e 100644 --- a/include/linux/rbtree.h +++ b/include/linux/rbtree.h @@ -244,6 +244,42 @@ rb_find_add(struct rb_node *node, struct rb_root *tree, return NULL; } +/** + * rb_find_add_rcu() - find equivalent @node in @tree, or add @node + * @node: node to look-for / insert + * @tree: tree to search / modify + * @cmp: operator defining the node order + * + * Adds a Store-Release for link_node. + * + * Returns the rb_node matching @node, or NULL when no match is found and @node + * is inserted. + */ +static __always_inline struct rb_node * +rb_find_add_rcu(struct rb_node *node, struct rb_root *tree, + int (*cmp)(struct rb_node *, const struct rb_node *)) +{ + struct rb_node **link = &tree->rb_node; + struct rb_node *parent = NULL; + int c; + + while (*link) { + parent = *link; + c = cmp(node, parent); + + if (c < 0) + link = &parent->rb_left; + else if (c > 0) + link = &parent->rb_right; + else + return parent; + } + + rb_link_node_rcu(node, parent, link); + rb_insert_color(node, tree); + return NULL; +} + /** * rb_find() - find @key in tree @tree * @key: key to match @@ -272,6 +308,37 @@ rb_find(const void *key, const struct rb_root *tree, return NULL; } +/** + * rb_find_rcu() - find @key in tree @tree + * @key: key to match + * @tree: tree to search + * @cmp: operator defining the node order + * + * Notably, tree descent vs concurrent tree rotations is unsound and can result + * in false-negatives. + * + * Returns the rb_node matching @key or NULL. + */ +static __always_inline struct rb_node * +rb_find_rcu(const void *key, const struct rb_root *tree, + int (*cmp)(const void *key, const struct rb_node *)) +{ + struct rb_node *node = tree->rb_node; + + while (node) { + int c = cmp(key, node); + + if (c < 0) + node = rcu_dereference_raw(node->rb_left); + else if (c > 0) + node = rcu_dereference_raw(node->rb_right); + else + return node; + } + + return NULL; +} + /** * rb_find_first() - find the first @key in @tree * @key: key to match -- cgit v1.2.3 From 4ba4f1afb6a9fed8ef896c2363076e36572f71da Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 2 Aug 2024 08:16:37 -0700 Subject: perf: Generic hotplug support for a PMU with a scope The perf subsystem assumes that the counters of a PMU are per-CPU. So the user space tool reads a counter from each CPU in the system wide mode. However, many PMUs don't have a per-CPU counter. The counter is effective for a scope, e.g., a die or a socket. To address this, a cpumask is exposed by the kernel driver to restrict to one CPU to stand for a specific scope. In case the given CPU is removed, the hotplug support has to be implemented for each such driver. The codes to support the cpumask and hotplug are very similar. - Expose a cpumask into sysfs - Pickup another CPU in the same scope if the given CPU is removed. - Invoke the perf_pmu_migrate_context() to migrate to a new CPU. - In event init, always set the CPU in the cpumask to event->cpu Similar duplicated codes are implemented for each such PMU driver. It would be good to introduce a generic infrastructure to avoid such duplication. 5 popular scopes are implemented here, core, die, cluster, pkg, and the system-wide. The scope can be set when a PMU is registered. If so, a "cpumask" is automatically exposed for the PMU. The "cpumask" is from the perf_online__mask, which is to track the active CPU for each scope. They are set when the first CPU of the scope is online via the generic perf hotplug support. When a corresponding CPU is removed, the perf_online__mask is updated accordingly and the PMU will be moved to a new CPU from the same scope if possible. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20240802151643.1691631-2-kan.liang@linux.intel.com --- include/linux/perf_event.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 701549967c18..a3cbcd727366 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -295,6 +295,19 @@ struct perf_event_pmu_context; #define PERF_PMU_CAP_AUX_OUTPUT 0x0080 #define PERF_PMU_CAP_EXTENDED_HW_TYPE 0x0100 +/** + * pmu::scope + */ +enum perf_pmu_scope { + PERF_PMU_SCOPE_NONE = 0, + PERF_PMU_SCOPE_CORE, + PERF_PMU_SCOPE_DIE, + PERF_PMU_SCOPE_CLUSTER, + PERF_PMU_SCOPE_PKG, + PERF_PMU_SCOPE_SYS_WIDE, + PERF_PMU_MAX_SCOPE, +}; + struct perf_output_handle; #define PMU_NULL_DEV ((void *)(~0UL)) @@ -318,6 +331,11 @@ struct pmu { */ int capabilities; + /* + * PMU scope + */ + unsigned int scope; + int __percpu *pmu_disable_count; struct perf_cpu_pmu_context __percpu *cpu_pmu_context; atomic_t exclusive_cnt; /* < 0: cpu; > 0: tsk */ -- cgit v1.2.3 From a48a36b316ae5d3ab83f9b545dba15998e96d59c Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 2 Aug 2024 08:16:38 -0700 Subject: perf: Add PERF_EV_CAP_READ_SCOPE Usually, an event can be read from any CPU of the scope. It doesn't need to be read from the advertised CPU. Add a new event cap, PERF_EV_CAP_READ_SCOPE. An event of a PMU with scope can be read from any active CPU in the scope. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20240802151643.1691631-3-kan.liang@linux.intel.com --- include/linux/perf_event.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index a3cbcd727366..794f66057878 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -636,10 +636,13 @@ typedef void (*perf_overflow_handler_t)(struct perf_event *, * PERF_EV_CAP_SIBLING: An event with this flag must be a group sibling and * cannot be a group leader. If an event with this flag is detached from the * group it is scheduled out and moved into an unrecoverable ERROR state. + * PERF_EV_CAP_READ_SCOPE: A CPU event that can be read from any CPU of the + * PMU scope where it is active. */ #define PERF_EV_CAP_SOFTWARE BIT(0) #define PERF_EV_CAP_READ_ACTIVE_PKG BIT(1) #define PERF_EV_CAP_SIBLING BIT(2) +#define PERF_EV_CAP_READ_SCOPE BIT(3) #define SWEVENT_HLIST_BITS 8 #define SWEVENT_HLIST_SIZE (1 << SWEVENT_HLIST_BITS) -- cgit v1.2.3 From 08155c7f2a2cc6ff99886ea5743da274be24ebe4 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 2 Aug 2024 08:16:39 -0700 Subject: perf/x86/intel/cstate: Clean up cpumask and hotplug There are three cstate PMUs with different scopes, core, die and module. The scopes are supported by the generic perf_event subsystem now. Set the scope for each PMU and remove all the cpumask and hotplug codes. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20240802151643.1691631-4-kan.liang@linux.intel.com --- include/linux/cpuhotplug.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 9316c39260e0..2101ae2ecfca 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -152,7 +152,6 @@ enum cpuhp_state { CPUHP_AP_PERF_X86_AMD_UNCORE_STARTING, CPUHP_AP_PERF_X86_STARTING, CPUHP_AP_PERF_X86_AMD_IBS_STARTING, - CPUHP_AP_PERF_X86_CSTATE_STARTING, CPUHP_AP_PERF_XTENSA_STARTING, CPUHP_AP_ARM_VFP_STARTING, CPUHP_AP_ARM64_DEBUG_MONITORS_STARTING, @@ -209,7 +208,6 @@ enum cpuhp_state { CPUHP_AP_PERF_X86_AMD_UNCORE_ONLINE, CPUHP_AP_PERF_X86_AMD_POWER_ONLINE, CPUHP_AP_PERF_X86_RAPL_ONLINE, - CPUHP_AP_PERF_X86_CSTATE_ONLINE, CPUHP_AP_PERF_S390_CF_ONLINE, CPUHP_AP_PERF_S390_SF_ONLINE, CPUHP_AP_PERF_ARM_CCI_ONLINE, -- cgit v1.2.3