From e12f03d7031a977356e3d7b75a68c2185ff8d155 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Wed, 6 Dec 2017 14:45:15 -0800 Subject: perf/core: Implement the 'perf_kprobe' PMU A new PMU type, perf_kprobe is added. Based on attr from perf_event_open(), perf_kprobe creates a kprobe (or kretprobe) for the perf_event. This kprobe is private to this perf_event, and thus not added to global lists, and not available in tracefs. Two functions, create_local_trace_kprobe() and destroy_local_trace_kprobe() are added to create and destroy these local trace_kprobe. Signed-off-by: Song Liu Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Yonghong Song Reviewed-by: Josef Bacik Cc: Cc: Cc: Cc: Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20171206224518.3598254-6-songliubraving@fb.com Signed-off-by: Ingo Molnar --- include/linux/trace_events.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index af44e7c2d577..21c5d43a21af 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -533,6 +533,10 @@ extern int perf_trace_init(struct perf_event *event); extern void perf_trace_destroy(struct perf_event *event); extern int perf_trace_add(struct perf_event *event, int flags); extern void perf_trace_del(struct perf_event *event, int flags); +#ifdef CONFIG_KPROBE_EVENTS +extern int perf_kprobe_init(struct perf_event *event, bool is_retprobe); +extern void perf_kprobe_destroy(struct perf_event *event); +#endif extern int ftrace_profile_set_filter(struct perf_event *event, int event_id, char *filter_str); extern void ftrace_profile_free_filter(struct perf_event *event); -- cgit v1.2.3 From 33ea4b24277b06dbc55d7f5772a46f029600255e Mon Sep 17 00:00:00 2001 From: Song Liu Date: Wed, 6 Dec 2017 14:45:16 -0800 Subject: perf/core: Implement the 'perf_uprobe' PMU This patch adds 
perf_uprobe support with similar pattern as previous patch (for kprobe). Two functions, create_local_trace_uprobe() and destroy_local_trace_uprobe(), are created so a uprobe can be created and attached to the file descriptor created by perf_event_open(). Signed-off-by: Song Liu Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Yonghong Song Reviewed-by: Josef Bacik Cc: Cc: Cc: Cc: Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20171206224518.3598254-7-songliubraving@fb.com Signed-off-by: Ingo Molnar --- include/linux/trace_events.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 21c5d43a21af..0d9d6cb454b1 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -537,6 +537,10 @@ extern void perf_trace_del(struct perf_event *event, int flags); extern int perf_kprobe_init(struct perf_event *event, bool is_retprobe); extern void perf_kprobe_destroy(struct perf_event *event); #endif +#ifdef CONFIG_UPROBE_EVENTS +extern int perf_uprobe_init(struct perf_event *event, bool is_retprobe); +extern void perf_uprobe_destroy(struct perf_event *event); +#endif extern int ftrace_profile_set_filter(struct perf_event *event, int event_id, char *filter_str); extern void ftrace_profile_free_filter(struct perf_event *event); -- cgit v1.2.3 From 8e1a2031e4b556b01ca53cd1fb2d83d811a6605b Mon Sep 17 00:00:00 2001 From: Alexey Budankov Date: Fri, 8 Sep 2017 11:47:03 +0300 Subject: perf/cor: Use RB trees for pinned/flexible groups Change event groups into RB trees sorted by CPU and then by a 64bit index, so that multiplexing hrtimer interrupt handler would be able to skip to the current CPU's list and ignore groups allocated for the other CPUs. 
New API for manipulating event groups in the trees is implemented as well as adoption of the API in the current implementation. pinned_group_sched_in() and flexible_group_sched_in() API are introduced to consolidate code enabling the whole group from pinned and flexible groups appropriately. Signed-off-by: Alexey Budankov Signed-off-by: Peter Zijlstra (Intel) Acked-by: Mark Rutland Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: David Carrillo-Cisneros Cc: Dmitri Prokhorov Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Valery Cherepennikov Cc: Vince Weaver Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/372f9c8b-0cfe-4240-e44d-83d863d40813@linux.intel.com Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 7546822a1d74..6e3f854a34d8 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -558,7 +558,11 @@ struct perf_event { */ struct list_head group_entry; struct list_head sibling_list; - + /* + * Node on the pinned or flexible tree located at the event context; + */ + struct rb_node group_node; + u64 group_index; /* * We need storage to track the entries in perf_pmu_migrate_context; we * cannot use the event_entry because of RCU and we want to keep the @@ -690,6 +694,12 @@ struct perf_event { #endif /* CONFIG_PERF_EVENTS */ }; + +struct perf_event_groups { + struct rb_root tree; + u64 index; +}; + /** * struct perf_event_context - event context structure * @@ -710,8 +720,8 @@ struct perf_event_context { struct mutex mutex; struct list_head active_ctx_list; - struct list_head pinned_groups; - struct list_head flexible_groups; + struct perf_event_groups pinned_groups; + struct perf_event_groups flexible_groups; struct list_head event_list; int nr_events; int nr_active; -- cgit v1.2.3 From 
8343aae66167df6708128a778e750d48dbe31302 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 13 Nov 2017 14:28:33 +0100 Subject: perf/core: Remove perf_event::group_entry Now that all the grouping is done with RB trees, we no longer need group_entry and can replace the whole thing with sibling_list. Signed-off-by: Peter Zijlstra (Intel) Acked-by: Mark Rutland Cc: Alexander Shishkin Cc: Alexey Budankov Cc: Arnaldo Carvalho de Melo Cc: David Carrillo-Cisneros Cc: Dmitri Prokhorov Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Valery Cherepennikov Cc: Vince Weaver Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 6e3f854a34d8..84044ec21b31 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -549,14 +549,9 @@ struct perf_event { struct list_head event_entry; /* - * XXX: group_entry and sibling_list should be mutually exclusive; - * either you're a sibling on a group, or you're the group leader. - * Rework the code to always use the same list element. - * * Locked for modification by both ctx->mutex and ctx->lock; holding * either sufficies for read. */ - struct list_head group_entry; struct list_head sibling_list; /* * Node on the pinned or flexible tree located at the event context; -- cgit v1.2.3 From 6668128a9e25f7a11d25359e46df2541e6b43fc9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 13 Nov 2017 14:28:38 +0100 Subject: perf/core: Optimize ctx_sched_out() When an event group contains more events than can be scheduled on the hardware, iterating the full event group for ctx_sched_out is a waste of time. Keep track of the events that got programmed on the hardware, such that we can iterate this smaller list in order to schedule them out. 
Signed-off-by: Peter Zijlstra (Intel) Acked-by: Mark Rutland Cc: Alexander Shishkin Cc: Alexey Budankov Cc: Arnaldo Carvalho de Melo Cc: David Carrillo-Cisneros Cc: Dmitri Prokhorov Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Valery Cherepennikov Cc: Vince Weaver Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 84044ec21b31..2bb200e1bbea 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -553,6 +553,7 @@ struct perf_event { * either sufficies for read. */ struct list_head sibling_list; + struct list_head active_list; /* * Node on the pinned or flexible tree located at the event context; */ @@ -718,6 +719,10 @@ struct perf_event_context { struct perf_event_groups pinned_groups; struct perf_event_groups flexible_groups; struct list_head event_list; + + struct list_head pinned_active; + struct list_head flexible_active; + int nr_events; int nr_active; int is_active; -- cgit v1.2.3 From 32ff77e8cc9e66cc4fb38098f64fd54cc8f54573 Mon Sep 17 00:00:00 2001 From: Milind Chabbi Date: Mon, 12 Mar 2018 14:45:47 +0100 Subject: perf/core: Implement fast breakpoint modification via _IOC_MODIFY_ATTRIBUTES Problem and motivation: Once a breakpoint perf event (PERF_TYPE_BREAKPOINT) is created, there is no flexibility to change the breakpoint type (bp_type), breakpoint address (bp_addr), or breakpoint length (bp_len). The only option is to close the perf event and configure a new breakpoint event. This inflexibility has a significant performance overhead. 
For example, sampling-based, lightweight performance profilers (and also concurrency bug detection tools), monitor different addresses for a short duration using PERF_TYPE_BREAKPOINT and change the address (bp_addr) to another address or change the kind of breakpoint (bp_type) from "write" to "read" or vice-versa or change the length (bp_len) of the address being monitored. The cost of these modifications is prohibitive since it involves unmapping the circular buffer associated with the perf event, closing the perf event, opening another perf event and mmaping another circular buffer. Solution: The new ioctl flag for perf events, PERF_EVENT_IOC_MODIFY_ATTRIBUTES, introduced in this patch takes a pointer to a struct perf_event_attr as an argument to update an old breakpoint event with new address, type, and size. This facility allows retaining a previously mmapped perf event ring buffer and avoids having to close and reopen another perf event. This patch supports only changing PERF_TYPE_BREAKPOINT event type; future implementations can extend this feature. The patch replicates some of the functionality of modify_user_hw_breakpoint() in kernel/events/hw_breakpoint.c. modify_user_hw_breakpoint cannot be called directly since perf_event_ctx_lock() is already held in _perf_ioctl(). Evidence: Experiments show that the baseline (not able to modify an already created breakpoint) costs an order of magnitude (~10x) more than the suggested optimization (having the ability to dynamically modify a configured breakpoint via ioctl). When the breakpoints typically do not trap, the speedup due to the suggested optimization is ~10x; even when the breakpoints always trap, the speedup is ~4x due to the suggested optimization. Testing: tests posted at https://github.com/linux-contrib/perf_event_modify_bp demonstrate the performance significance of this patch. Tests also check the functional correctness of the patch. 
Signed-off-by: Milind Chabbi [ Using modify_user_hw_breakpoint_check function. ] [ Reformated PERF_EVENT_IOC_*, so the values are all in one column. ] Signed-off-by: Jiri Olsa Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Frederic Weisbecker Cc: Hari Bathini Cc: Jin Yao Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Michael Ellerman Cc: Namhyung Kim Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Sukadev Bhattiprolu Cc: Thomas Gleixner Cc: Will Deacon Link: http://lkml.kernel.org/r/20180312134548.31532-8-jolsa@kernel.org Signed-off-by: Ingo Molnar --- include/linux/hw_breakpoint.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hw_breakpoint.h b/include/linux/hw_breakpoint.h index cf045885a499..6058c3844a76 100644 --- a/include/linux/hw_breakpoint.h +++ b/include/linux/hw_breakpoint.h @@ -53,6 +53,9 @@ register_user_hw_breakpoint(struct perf_event_attr *attr, /* FIXME: only change from the attr, and don't unregister */ extern int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr); +extern int +modify_user_hw_breakpoint_check(struct perf_event *bp, struct perf_event_attr *attr, + bool check); /* * Kernel breakpoints are not associated with any particular thread. 
@@ -97,6 +100,10 @@ register_user_hw_breakpoint(struct perf_event_attr *attr, static inline int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr) { return -ENOSYS; } +static inline int +modify_user_hw_breakpoint_check(struct perf_event *bp, struct perf_event_attr *attr, + bool check) { return -ENOSYS; } + static inline struct perf_event * register_wide_hw_breakpoint_cpu(struct perf_event_attr *attr, perf_overflow_handler_t triggered, -- cgit v1.2.3 From edb39592a5877bd91b2e6ee15194268f35b04892 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 15 Mar 2018 17:36:56 +0100 Subject: perf: Fix sibling iteration Mark noticed that the change to sibling_list changed some iteration semantics; because previously we used group_list as list entry, sibling events would always have an empty sibling_list. But because we now use sibling_list for both list head and list entry, siblings will report as having siblings. Fix this with a custom for_each_sibling_event() iterator. 
Fixes: 8343aae66167 ("perf/core: Remove perf_event::group_entry") Reported-by: Mark Rutland Suggested-by: Mark Rutland Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Cc: vincent.weaver@maine.edu Cc: alexander.shishkin@linux.intel.com Cc: torvalds@linux-foundation.org Cc: alexey.budankov@linux.intel.com Cc: valery.cherepennikov@intel.com Cc: eranian@google.com Cc: acme@redhat.com Cc: linux-tip-commits@vger.kernel.org Cc: davidcc@google.com Cc: kan.liang@intel.com Cc: Dmitry.Prohorov@intel.com Cc: jolsa@redhat.com Link: https://lkml.kernel.org/r/20180315170129.GX4043@hirez.programming.kicks-ass.net --- include/linux/perf_event.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 2bb200e1bbea..ff39ab011376 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -536,6 +536,10 @@ struct pmu_event_list { struct list_head list; }; +#define for_each_sibling_event(sibling, event) \ + if ((event)->group_leader == (event)) \ + list_for_each_entry((sibling), &(event)->sibling_list, sibling_list) + /** * struct perf_event - performance event kernel representation: */ -- cgit v1.2.3 From 6ed70cf342de03c7b11cd4eb032705faeb29d284 Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Thu, 29 Mar 2018 15:06:48 +0300 Subject: perf/x86/pt, coresight: Clean up address filter structure This is a cosmetic patch that deals with the address filter structure's ambiguous fields 'filter' and 'range'. The former stands to mean that the filter's *action* should be to filter the traces to its address range if it's set or stop tracing if it's unset. This is confusing and hard on the eyes, so this patch replaces it with 'action' enum. The 'range' field is completely redundant (meaning that the filter is an address range as opposed to a single address trigger), as we can use zero size to mean the same thing. 
Signed-off-by: Alexander Shishkin Acked-by: Mathieu Poirier Acked-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: Will Deacon Link: http://lkml.kernel.org/r/20180329120648.11902-1-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index ff39ab011376..e71e99eb9a4e 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -449,14 +449,19 @@ struct pmu { int (*filter_match) (struct perf_event *event); /* optional */ }; +enum perf_addr_filter_action_t { + PERF_ADDR_FILTER_ACTION_STOP = 0, + PERF_ADDR_FILTER_ACTION_START, + PERF_ADDR_FILTER_ACTION_FILTER, +}; + /** * struct perf_addr_filter - address range filter definition * @entry: event's filter list linkage * @inode: object file's inode for file-based filters * @offset: filter range offset - * @size: filter range size - * @range: 1: range, 0: address - * @filter: 1: filter/start, 0: stop + * @size: filter range size (size==0 means single address trigger) + * @action: filter/start/stop * * This is a hardware-agnostic filter configuration as specified by the user. */ @@ -465,8 +470,7 @@ struct perf_addr_filter { struct inode *inode; unsigned long offset; unsigned long size; - unsigned int range : 1, - filter : 1; + enum perf_addr_filter_action_t action; }; /** -- cgit v1.2.3