From ab573844e3058eef2788803d373019f8bebead57 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 1 May 2013 17:25:44 +0200 Subject: perf: Fix hw breakpoints overflow period sampling The hw breakpoint pmu 'add' function is missing the period_left update needed for SW events. The perf HW breakpoint events use the SW events framework to process the overflow, so it needs to be properly initialized in the PMU 'add' method. Signed-off-by: Jiri Olsa Reviewed-by: Peter Zijlstra Cc: H. Peter Anvin Cc: Oleg Nesterov Cc: Arnaldo Carvalho de Melo Cc: Ingo Molnar Cc: Paul Mackerras Cc: Corey Ashford Cc: Frederic Weisbecker Cc: Vince Weaver Cc: Stephane Eranian Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1367421944-19082-5-git-send-email-jolsa@redhat.com Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index f463a46424e2..fa38612d70b6 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -743,6 +743,7 @@ extern unsigned int perf_output_skip(struct perf_output_handle *handle, unsigned int len); extern int perf_swevent_get_recursion_context(void); extern void perf_swevent_put_recursion_context(int rctx); +extern u64 perf_swevent_set_period(struct perf_event *event); extern void perf_event_enable(struct perf_event *event); extern void perf_event_disable(struct perf_event *event); extern int __perf_event_disable(void *info); @@ -782,6 +783,7 @@ static inline void perf_event_fork(struct task_struct *tsk) { } static inline void perf_event_init(void) { } static inline int perf_swevent_get_recursion_context(void) { return -1; } static inline void perf_swevent_put_recursion_context(int rctx) { } +static inline u64 perf_swevent_set_period(struct perf_event *event) { return 0; } static inline void perf_event_enable(struct perf_event *event) { } static inline void perf_event_disable(struct perf_event *event) { } static inline int __perf_event_disable(void *info) { return -1; } -- cgit v1.2.3 From 9e6302056f8029f438e853432a856b9f13de26a6 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Wed, 3 Apr 2013 14:21:33 +0200 Subject: perf: Use hrtimers for event multiplexing The current scheme of using the timer tick was fine for per-thread events. However, it was causing bias issues in system-wide mode (including for uncore PMUs). Event groups would not get their fair share of runtime on the PMU. With tickless kernels, if a core is idle there is no timer tick, and thus no event rotation (multiplexing). However, there are events (especially uncore events) which do count even though cores are asleep. This patch changes the timer source for multiplexing. It introduces a per-PMU per-cpu hrtimer. The advantage is that even when a core goes idle, it will come back to service the hrtimer, thus multiplexing on system-wide events works much better. The per-PMU implementation (suggested by PeterZ) enables adjusting the multiplexing interval per PMU. The preferred interval is stashed into the struct pmu. If not set, it will be forced to the default interval value. In order to minimize the impact of the hrtimer, it is turned on and off on demand. When the PMU on a CPU is overcommited, the hrtimer is activated. It is stopped when the PMU is not overcommitted. In order for this to work properly, we had to change the order of initialization in start_kernel() such that hrtimer_init() is run before perf_event_init(). The default interval in milliseconds is set to a timer tick just like with the old code. We will provide a sysctl to tune this in another patch. Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra Cc: Frederic Weisbecker Cc: Arnaldo Carvalho de Melo Link: http://lkml.kernel.org/r/1364991694-5876-2-git-send-email-eranian@google.com Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index fa38612d70b6..72138d75a60a 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -501,8 +501,9 @@ struct perf_cpu_context { struct perf_event_context *task_ctx; int active_oncpu; int exclusive; + struct hrtimer hrtimer; + ktime_t hrtimer_interval; struct list_head rotation_list; - int jiffies_interval; struct pmu *unique_pmu; struct perf_cgroup *cgrp; }; -- cgit v1.2.3 From 62b8563979273424d6ebe9201e34d1acc133ad4f Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Wed, 3 Apr 2013 14:21:34 +0200 Subject: perf: Add sysfs entry to adjust multiplexing interval per PMU This patch adds /sys/device/xxx/perf_event_mux_interval_ms to ajust the multiplexing interval per PMU. The unit is milliseconds. Value has to be >= 1. In the 4th version, we renamed the sysfs file to be more consistent with the other /proc/sys/kernel entries for perf_events. In the 5th version, we handle the reprogramming of the hrtimer using hrtimer_forward_now(). That way, we sync up to new timer value quickly (suggested by Jiri Olsa). Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra Cc: Frederic Weisbecker Cc: Arnaldo Carvalho de Melo Link: http://lkml.kernel.org/r/1364991694-5876-3-git-send-email-eranian@google.com Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 72138d75a60a..6fddac1b27cb 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -194,6 +194,7 @@ struct pmu { int * __percpu pmu_disable_count; struct perf_cpu_context * __percpu pmu_cpu_context; int task_ctx_nr; + int hrtimer_interval_ms; /* * Fully disable/enable this PMU, can be used to protect from the PMI -- cgit v1.2.3 From 03d8e80beb7db78a13c192431205b9c83f7e0cd1 Mon Sep 17 00:00:00 2001 From: Mischa Jonker Date: Tue, 4 Jun 2013 11:45:48 +0200 Subject: perf: Add const qualifier to perf_pmu_register's 'name' arg This allows us to use pdev->name for registering a PMU device. IMO the name is not supposed to be changed anyway. Signed-off-by: Mischa Jonker Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1370339148-5566-1-git-send-email-mjonker@synopsys.com Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 74a4e14ab60b..4bc57d017fc8 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -188,7 +188,7 @@ struct pmu { struct device *dev; const struct attribute_group **attr_groups; - char *name; + const char *name; int type; int * __percpu pmu_disable_count; @@ -519,7 +519,7 @@ struct perf_output_handle { #ifdef CONFIG_PERF_EVENTS -extern int perf_pmu_register(struct pmu *pmu, char *name, int type); +extern int perf_pmu_register(struct pmu *pmu, const char *name, int type); extern void perf_pmu_unregister(struct pmu *pmu); extern int perf_num_counters(void); -- cgit v1.2.3 From 43b4578071c0e6d87761e113e05d45776cc75437 Mon Sep 17 00:00:00 2001 From: Andrew Hunter Date: Thu, 23 May 2013 11:07:03 -0700 Subject: perf/x86: Reduce stack usage of x86_schedule_events() x86_schedule_events() caches event constraints on the stack during scheduling. Given the number of possible events, this is 512 bytes of stack; since it can be invoked under schedule() under god-knows-what, this is causing stack blowouts. Trade some space usage for stack safety: add a place to cache the constraint pointer to struct perf_event. For 8 bytes per event (1% of its size) we can save the giant stack frame. This shouldn't change any aspect of scheduling whatsoever and while in theory the locality's a tiny bit worse, I doubt we'll see any performance impact either. Tested: `perf stat whatever` does not blow up and produces results that aren't hugely obviously wrong. I'm not sure how to run particularly good tests of perf code, but this should not produce any functional change whatsoever. Signed-off-by: Andrew Hunter Reviewed-by: Stephane Eranian Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1369332423-4400-1-git-send-email-ahh@google.com Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 4bc57d017fc8..33e8d65836d6 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -113,6 +113,8 @@ struct hw_perf_event_extra { int idx; /* index in shared_regs->regs[] */ }; +struct event_constraint; + /** * struct hw_perf_event - performance event hardware details: */ @@ -131,6 +133,8 @@ struct hw_perf_event { struct hw_perf_event_extra extra_reg; struct hw_perf_event_extra branch_reg; + + struct event_constraint *constraint; }; struct { /* software */ struct hrtimer hrtimer; -- cgit v1.2.3 From 135c5612c460f89657c4698fe2ea753f6f667963 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 17 Jun 2013 17:36:51 -0700 Subject: perf/x86/intel: Support Haswell/v4 LBR format Haswell has two additional LBR from flags for TSX: in_tx and abort_tx, implemented as a new "v4" version of the LBR format. Handle those in and adjust the sign extension code to still correctly extend. The flags are exported similarly in the LBR record to the existing misprediction flag Signed-off-by: Andi Kleen Cc: Andi Kleen Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Link: http://lkml.kernel.org/r/1371515812-9646-6-git-send-email-andi@firstfloor.org Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 7 ++++++- include/uapi/linux/perf_event.h | 5 ++++- 2 files changed, 10 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 33e8d65836d6..056f93a7990f 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -73,13 +73,18 @@ struct perf_raw_record { * * support for mispred, predicted is optional. In case it * is not supported mispred = predicted = 0. + * + * in_tx: running in a hardware transaction + * abort: aborting a hardware transaction */ struct perf_branch_entry { __u64 from; __u64 to; __u64 mispred:1, /* target mispredicted */ predicted:1,/* target predicted */ - reserved:62; + in_tx:1, /* in transaction */ + abort:1, /* transaction abort */ + reserved:60; }; /* diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index fb104e51496e..0b1df41691e8 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -157,8 +157,11 @@ enum perf_branch_sample_type { PERF_SAMPLE_BRANCH_ANY_CALL = 1U << 4, /* any call branch */ PERF_SAMPLE_BRANCH_ANY_RETURN = 1U << 5, /* any return branch */ PERF_SAMPLE_BRANCH_IND_CALL = 1U << 6, /* indirect calls */ + PERF_SAMPLE_BRANCH_ABORT_TX = 1U << 7, /* transaction aborts */ + PERF_SAMPLE_BRANCH_IN_TX = 1U << 8, /* in transaction */ + PERF_SAMPLE_BRANCH_NO_TX = 1U << 9, /* not in transaction */ - PERF_SAMPLE_BRANCH_MAX = 1U << 7, /* non-ABI */ + PERF_SAMPLE_BRANCH_MAX = 1U << 10, /* non-ABI */ }; #define PERF_SAMPLE_BRANCH_PLM_ALL \ -- cgit v1.2.3 From 14c63f17b1fde5a575a28e96547a22b451c71fb5 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 21 Jun 2013 08:51:36 -0700 Subject: perf: Drop sample rate when sampling is too slow This patch keeps track of how long perf's NMI handler is taking, and also calculates how many samples perf can take a second. If the sample length times the expected max number of samples exceeds a configurable threshold, it drops the sample rate. This way, we don't have a runaway sampling process eating up the CPU. This patch can tend to drop the sample rate down to level where perf doesn't work very well. *BUT* the alternative is that my system hangs because it spends all of its time handling NMIs. I'll take a busted performance tool over an entire system that's busted and undebuggable any day. BTW, my suspicion is that there's still an underlying bug here. Using the HPET instead of the TSC is definitely a contributing factor, but I suspect there are some other things going on. But, I can't go dig down on a bug like that with my machine hanging all the time. Signed-off-by: Dave Hansen Acked-by: Peter Zijlstra Cc: paulus@samba.org Cc: acme@ghostprotocols.net Cc: Dave Hansen [ Prettified it a bit. ] Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 056f93a7990f..50b3efd14d29 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -706,10 +706,17 @@ static inline void perf_callchain_store(struct perf_callchain_entry *entry, u64 extern int sysctl_perf_event_paranoid; extern int sysctl_perf_event_mlock; extern int sysctl_perf_event_sample_rate; +extern int sysctl_perf_cpu_time_max_percent; + +extern void perf_sample_event_took(u64 sample_len_ns); extern int perf_proc_update_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); +extern int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos); + static inline bool perf_paranoid_tracepoint_raw(void) { -- cgit v1.2.3 From 0c4df02d739fed5ab081b330d67403206dd3967e Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 21 Jun 2013 08:51:38 -0700 Subject: x86: Add NMI duration tracepoints This patch has been invaluable in my adventures finding issues in the perf NMI handler. I'm as big a fan of printk() as anybody is, but using printk() in NMIs is deadly when they're happening frequently. Even hacking in trace_printk() ended up eating enough CPU to throw off some of the measurements I was making. Signed-off-by: Dave Hansen Acked-by: Peter Zijlstra Cc: paulus@samba.org Cc: acme@ghostprotocols.net Cc: Dave Hansen Signed-off-by: Ingo Molnar --- include/trace/events/nmi.h | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) create mode 100644 include/trace/events/nmi.h (limited to 'include') diff --git a/include/trace/events/nmi.h b/include/trace/events/nmi.h new file mode 100644 index 000000000000..da3ee96b8d03 --- /dev/null +++ b/include/trace/events/nmi.h @@ -0,0 +1,37 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM nmi + +#if !defined(_TRACE_NMI_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_NMI_H + +#include +#include + +TRACE_EVENT(nmi_handler, + + TP_PROTO(void *handler, s64 delta_ns, int handled), + + TP_ARGS(handler, delta_ns, handled), + + TP_STRUCT__entry( + __field( void *, handler ) + __field( s64, delta_ns) + __field( int, handled ) + ), + + TP_fast_assign( + __entry->handler = handler; + __entry->delta_ns = delta_ns; + __entry->handled = handled; + ), + + TP_printk("%ps() delta_ns: %lld handled: %d", + __entry->handler, + __entry->delta_ns, + __entry->handled) +); + +#endif /* _TRACE_NMI_H */ + +/* This part ust be outside protection */ +#include -- cgit v1.2.3