From 35edc2a5095efb189e60dc32bbb9d2663aec6d24 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sun, 20 Nov 2011 20:36:02 +0100 Subject: perf, arch: Rework perf_event_index() Put the logic to compute the event index into a per pmu method. This is required because the x86 rules are weird and wonderful and don't match the capabilities of the current scheme. AFAIK only powerpc actually has a usable userspace read of the PMCs but I'm not at all sure anybody actually used that. ARM is restored to the default since it currently does not support userspace access at all. And all software events are provided with a method that reports their index as 0 (disabled). Signed-off-by: Peter Zijlstra Cc: Michael Cree Cc: Will Deacon Cc: Deng-Cheng Zhu Cc: Anton Blanchard Cc: Eric B Munson Cc: Heiko Carstens Cc: Paul Mundt Cc: David S. Miller Cc: Richard Kuo Cc: Stephane Eranian Cc: Arun Sharma Link: http://lkml.kernel.org/n/tip-dfydxodki16lylkt3gl2j7cw@git.kernel.org Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 08855613ceb3..02545e6df95b 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -680,6 +680,12 @@ struct pmu { * for each successful ->add() during the transaction. */ void (*cancel_txn) (struct pmu *pmu); /* optional */ + + /* + * Will return the value for perf_event_mmap_page::index for this event, + * if no implementation is provided it will default to: event->hw.idx + 1. + */ + int (*event_idx) (struct perf_event *event); /*optional */ }; /** -- cgit v1.2.3 From 0c9d42ed4cee2aa1dfc3a260b741baae8615744f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sun, 20 Nov 2011 23:30:47 +0100 Subject: perf, x86: Provide means for disabling userspace RDPMC Allow the disabling of RDPMC via a pmu specific attribute: echo 0 > /sys/bus/event_source/devices/cpu/rdpmc Signed-off-by: Peter Zijlstra Cc: Stephane Eranian Cc: Arun Sharma Link: http://lkml.kernel.org/n/tip-pqeog465zo5hsimtkfz73f27@git.kernel.org Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 02545e6df95b..5311b79fe62c 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -615,6 +615,7 @@ struct pmu { struct list_head entry; struct device *dev; + const struct attribute_group **attr_groups; char *name; int type; -- cgit v1.2.3 From e3f3541c19c89a4daae39300defba68943301949 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 21 Nov 2011 11:43:53 +0100 Subject: perf: Extend the mmap control page with time (TSC) fields Extend the mmap control page with fields so that userspace can compute time deltas relative to the provided time fields. Currently only implemented for x86 with constant and nonstop TSC. 
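For reference, userspace is expected to combine a raw TSC read with the new time_mult/time_shift/time_offset fields roughly as follows (a sketch only; the helper name is hypothetical and the seqcount handling around the mmap page's lock field is omitted):

    #include <stdint.h>

    /* Hypothetical helper: convert a raw TSC value to perf time using the
     * time_mult/time_shift/time_offset fields from the mmap control page. */
    static uint64_t tsc_to_perf_time(uint64_t cyc, uint32_t time_mult,
                                     uint32_t time_shift, uint64_t time_offset)
    {
            uint64_t quot = cyc >> time_shift;
            uint64_t rem  = cyc & (((uint64_t)1 << time_shift) - 1);

            /* split the multiply so the 64-bit intermediate does not overflow */
            return time_offset + quot * time_mult +
                   ((rem * time_mult) >> time_shift);
    }
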
Signed-off-by: Peter Zijlstra Cc: Stephane Eranian Cc: Arun Sharma Link: http://lkml.kernel.org/n/tip-3u1jucza77j3wuvs0x2bic0f@git.kernel.org Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 5311b79fe62c..0b91db2522cc 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -291,12 +291,14 @@ struct perf_event_mmap_page { __s64 offset; /* add to hardware event value */ __u64 time_enabled; /* time event active */ __u64 time_running; /* time event on cpu */ + __u32 time_mult, time_shift; + __u64 time_offset; /* * Hole for extension of the self monitor capabilities */ - __u64 __reserved[123]; /* align to 1k */ + __u64 __reserved[121]; /* align to 1k */ /* * Control data for the mmap() data buffer. -- cgit v1.2.3 From efb3040d481a1594592b1defb4526c406c7a4751 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 26 Jan 2012 13:32:15 +0100 Subject: jump_label: Add some documentation akpm figured we could do with a blub explaining what static_branch() is and why it lives... Grumpily-requested-by: Andrew Morton Cc: Jason Baron Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-h02wu6kabpoojxf03wke704k@git.kernel.org Signed-off-by: Ingo Molnar --- include/linux/jump_label.h | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) (limited to 'include/linux') diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h index 5ce8b140428f..f7c69580fea7 100644 --- a/include/linux/jump_label.h +++ b/include/linux/jump_label.h @@ -1,6 +1,38 @@ #ifndef _LINUX_JUMP_LABEL_H #define _LINUX_JUMP_LABEL_H +/* + * Jump label support + * + * Copyright (C) 2009-2012 Jason Baron + * Copyright (C) 2011-2012 Peter Zijlstra + * + * Jump labels provide an interface to generate dynamic branches using + * self-modifying code. Assuming toolchain and architecture support the result + * of a "if (static_branch(&key))" statement is a unconditional branch (which + * defaults to false - and the true block is placed out of line). + * + * However at runtime we can change the 'static' branch target using + * jump_label_{inc,dec}(). These function as a 'reference' count on the key + * object and for as long as there are references all branches referring to + * that particular key will point to the (out of line) true block. + * + * Since this relies on modifying code the jump_label_{inc,dec}() functions + * must be considered absolute slow paths (machine wide synchronization etc.). + * OTOH, since the affected branches are unconditional their runtime overhead + * will be absolutely minimal, esp. in the default (off) case where the total + * effect is a single NOP of appropriate size. The on case will patch in a jump + * to the out-of-line block. + * + * When the control is directly exposed to userspace it is prudent to delay the + * decrement to avoid high frequency code modifications which can (and do) + * cause significant performance degradation. Struct jump_label_key_deferred and + * jump_label_dec_deferred() provide for this. + * + * Lacking toolchain and or architecture support, it falls back to a simple + * conditional branch. 
+ */ + #include #include #include -- cgit v1.2.3 From ac483c446b67870444c9eeaf8325d3d2af9b91bc Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 2 Jan 2012 10:04:14 +0100 Subject: ftrace: Change filter/notrace set functions to return exit code Currently the ftrace_set_filter and ftrace_set_notrace functions do not return any return code. So there's no way for ftrace_ops user to tell wether the filter was correctly applied. The set_ftrace_filter interface returns error in case the filter did not match: # echo krava > set_ftrace_filter bash: echo: write error: Invalid argument Changing both ftrace_set_filter and ftrace_set_notrace functions to return zero if the filter was applied correctly or -E* values in case of error. Link: http://lkml.kernel.org/r/1325495060-6402-2-git-send-email-jolsa@redhat.com Acked-by: Frederic Weisbecker Signed-off-by: Jiri Olsa Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 028e26f0bf08..f33fb3b041c6 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -178,9 +178,9 @@ struct dyn_ftrace { }; int ftrace_force_update(void); -void ftrace_set_filter(struct ftrace_ops *ops, unsigned char *buf, +int ftrace_set_filter(struct ftrace_ops *ops, unsigned char *buf, int len, int reset); -void ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf, +int ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf, int len, int reset); void ftrace_set_global_filter(unsigned char *buf, int len, int reset); void ftrace_set_global_notrace(unsigned char *buf, int len, int reset); -- cgit v1.2.3 From f069686e4bdc60a637d210ea3eea25fcdb82df88 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 25 Jan 2012 20:18:55 -0500 Subject: tracing/softirq: Move __raise_softirq_irqoff() out of header The __raise_softirq_irqoff() contains a tracepoint. As tracepoints in headers can cause issues, and not to mention, bloats the kernel when they are in a static inline, it is best to move the function that contains the tracepoint out of the header and into softirq.c. Link: http://lkml.kernel.org/r/20120118120711.GB14863@elte.hu Suggested-by: Ingo Molnar Signed-off-by: Steven Rostedt --- include/linux/interrupt.h | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index a64b00e286f5..3f830e005118 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -20,7 +20,6 @@ #include #include #include -#include /* * These correspond to the IORESOURCE_IRQ_* defines in @@ -456,11 +455,7 @@ asmlinkage void do_softirq(void); asmlinkage void __do_softirq(void); extern void open_softirq(int nr, void (*action)(struct softirq_action *)); extern void softirq_init(void); -static inline void __raise_softirq_irqoff(unsigned int nr) -{ - trace_softirq_raise(nr); - or_softirq_pending(1UL << nr); -} +extern void __raise_softirq_irqoff(unsigned int nr); extern void raise_softirq_irqoff(unsigned int nr); extern void raise_softirq(unsigned int nr); -- cgit v1.2.3 From 2fbb90db1b8fcc78f43830f1a009f3af243c5f42 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 7 Feb 2012 09:32:43 -0500 Subject: tracing/rcu: Add trace_##name##__rcuidle() static tracepoint for inside rcu_idle_exit() sections Added is a new static inline function that lets *any* tracepoint be used inside a rcu_idle_exit() section. 
And this also solves the problem where the same tracepoint may be used inside a rcu_idle_exit() section as well as outside of one. I added a new tracepoint function with a "_rcuidle" extension. All tracepoints can be used with either the normal "trace_foobar()" function, or the "trace_foobar_rcuidle()" function when inside a rcu_idle_exit() section. All tracepoints defined by TRACE_EVENT() or any of the derivatives will have a "_rcuidle()" function also defined. When a tracepoint is used within an rcu_idle_exit() section, the "_rcuidle()" version must be used. This denotes that the tracepoint is within rcu_idle_exit() and it allows the rcu read locks within the tracepoint to still be valid, as this version takes us out of rcu_idle_exit(). Another nice aspect about this patch is that "static inline"s are not compiled into text when not used. So only the tracepoints that actually use the _rcuidle() version will have them defined in the actual text that is booted. Link: http://lkml.kernel.org/r/1328563113.2200.39.camel@gandalf.stny.rr.com> Acked-by: Paul E. McKenney Reviewed-by: Josh Triplett Signed-off-by: Steven Rostedt --- include/linux/tracepoint.h | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index df0a779c1bbd..fc36da97ff7e 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -114,7 +114,7 @@ static inline void tracepoint_synchronize_unregister(void) * as "(void *, void)". The DECLARE_TRACE_NOARGS() will pass in just * "void *data", where as the DECLARE_TRACE() will pass in "void *data, proto". */ -#define __DO_TRACE(tp, proto, args, cond) \ +#define __DO_TRACE(tp, proto, args, cond, prercu, postrcu) \ do { \ struct tracepoint_func *it_func_ptr; \ void *it_func; \ @@ -122,6 +122,7 @@ static inline void tracepoint_synchronize_unregister(void) \ if (!(cond)) \ return; \ + prercu; \ rcu_read_lock_sched_notrace(); \ it_func_ptr = rcu_dereference_sched((tp)->funcs); \ if (it_func_ptr) { \ @@ -132,6 +133,7 @@ static inline void tracepoint_synchronize_unregister(void) } while ((++it_func_ptr)->func); \ } \ rcu_read_unlock_sched_notrace(); \ + postrcu; \ } while (0) /* @@ -139,7 +141,7 @@ static inline void tracepoint_synchronize_unregister(void) * not add unwanted padding between the beginning of the section and the * structure. Force alignment to the same alignment as the section start. 
*/ -#define __DECLARE_TRACE(name, proto, args, cond, data_proto, data_args) \ +#define __DECLARE_TRACE(name, proto, args, cond, data_proto, data_args) \ extern struct tracepoint __tracepoint_##name; \ static inline void trace_##name(proto) \ { \ @@ -147,7 +149,17 @@ static inline void tracepoint_synchronize_unregister(void) __DO_TRACE(&__tracepoint_##name, \ TP_PROTO(data_proto), \ TP_ARGS(data_args), \ - TP_CONDITION(cond)); \ + TP_CONDITION(cond),,); \ + } \ + static inline void trace_##name##_rcuidle(proto) \ + { \ + if (static_branch(&__tracepoint_##name.key)) \ + __DO_TRACE(&__tracepoint_##name, \ + TP_PROTO(data_proto), \ + TP_ARGS(data_args), \ + TP_CONDITION(cond), \ + rcu_idle_exit(), \ + rcu_idle_enter()); \ } \ static inline int \ register_trace_##name(void (*probe)(data_proto), void *data) \ @@ -190,9 +202,11 @@ static inline void tracepoint_synchronize_unregister(void) EXPORT_SYMBOL(__tracepoint_##name) #else /* !CONFIG_TRACEPOINTS */ -#define __DECLARE_TRACE(name, proto, args, cond, data_proto, data_args) \ +#define __DECLARE_TRACE(name, proto, args, cond, data_proto, data_args) \ static inline void trace_##name(proto) \ { } \ + static inline void trace_##name##_rcuidle(proto) \ + { } \ static inline int \ register_trace_##name(void (*probe)(data_proto), \ void *data) \ -- cgit v1.2.3 From e248491ac283b516958ca9ab62c8e74b6718bca8 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 15 Feb 2012 15:51:48 +0100 Subject: ftrace: Add enable/disable ftrace_ops control interface Adding a way to temporarily enable/disable ftrace_ops. The change follows the same way as 'global' ftrace_ops are done. Introducing 2 global ftrace_ops - control_ops and ftrace_control_list which take over all ftrace_ops registered with FTRACE_OPS_FL_CONTROL flag. In addition new per cpu flag called 'disabled' is also added to ftrace_ops to provide the control information for each cpu. When ftrace_ops with FTRACE_OPS_FL_CONTROL is registered, it is set as disabled for all cpus. The ftrace_control_list contains all the registered 'control' ftrace_ops. The control_ops provides function which iterates ftrace_control_list and does the check for 'disabled' flag on current cpu. Adding 3 inline functions: ftrace_function_local_disable/ftrace_function_local_enable - enable/disable the ftrace_ops on current cpu ftrace_function_local_disabled - get disabled ftrace_ops::disabled value for current cpu Link: http://lkml.kernel.org/r/1329317514-8131-2-git-send-email-jolsa@redhat.com Acked-by: Frederic Weisbecker Signed-off-by: Jiri Olsa Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 66 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index f33fb3b041c6..64a309d2fbd6 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -31,16 +31,33 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, typedef void (*ftrace_func_t)(unsigned long ip, unsigned long parent_ip); +/* + * FTRACE_OPS_FL_* bits denote the state of ftrace_ops struct and are + * set in the flags member. + * + * ENABLED - set/unset when ftrace_ops is registered/unregistered + * GLOBAL - set manualy by ftrace_ops user to denote the ftrace_ops + * is part of the global tracers sharing the same filter + * via set_ftrace_* debugfs files. 
+ * DYNAMIC - set when ftrace_ops is registered to denote dynamically + * allocated ftrace_ops which need special care + * CONTROL - set manualy by ftrace_ops user to denote the ftrace_ops + * could be controled by following calls: + * ftrace_function_local_enable + * ftrace_function_local_disable + */ enum { FTRACE_OPS_FL_ENABLED = 1 << 0, FTRACE_OPS_FL_GLOBAL = 1 << 1, FTRACE_OPS_FL_DYNAMIC = 1 << 2, + FTRACE_OPS_FL_CONTROL = 1 << 3, }; struct ftrace_ops { ftrace_func_t func; struct ftrace_ops *next; unsigned long flags; + int __percpu *disabled; #ifdef CONFIG_DYNAMIC_FTRACE struct ftrace_hash *notrace_hash; struct ftrace_hash *filter_hash; @@ -97,6 +114,55 @@ int register_ftrace_function(struct ftrace_ops *ops); int unregister_ftrace_function(struct ftrace_ops *ops); void clear_ftrace_function(void); +/** + * ftrace_function_local_enable - enable controlled ftrace_ops on current cpu + * + * This function enables tracing on current cpu by decreasing + * the per cpu control variable. + * It must be called with preemption disabled and only on ftrace_ops + * registered with FTRACE_OPS_FL_CONTROL. If called without preemption + * disabled, this_cpu_ptr will complain when CONFIG_DEBUG_PREEMPT is enabled. + */ +static inline void ftrace_function_local_enable(struct ftrace_ops *ops) +{ + if (WARN_ON_ONCE(!(ops->flags & FTRACE_OPS_FL_CONTROL))) + return; + + (*this_cpu_ptr(ops->disabled))--; +} + +/** + * ftrace_function_local_disable - enable controlled ftrace_ops on current cpu + * + * This function enables tracing on current cpu by decreasing + * the per cpu control variable. + * It must be called with preemption disabled and only on ftrace_ops + * registered with FTRACE_OPS_FL_CONTROL. If called without preemption + * disabled, this_cpu_ptr will complain when CONFIG_DEBUG_PREEMPT is enabled. + */ +static inline void ftrace_function_local_disable(struct ftrace_ops *ops) +{ + if (WARN_ON_ONCE(!(ops->flags & FTRACE_OPS_FL_CONTROL))) + return; + + (*this_cpu_ptr(ops->disabled))++; +} + +/** + * ftrace_function_local_disabled - returns ftrace_ops disabled value + * on current cpu + * + * This function returns value of ftrace_ops::disabled on current cpu. + * It must be called with preemption disabled and only on ftrace_ops + * registered with FTRACE_OPS_FL_CONTROL. If called without preemption + * disabled, this_cpu_ptr will complain when CONFIG_DEBUG_PREEMPT is enabled. + */ +static inline int ftrace_function_local_disabled(struct ftrace_ops *ops) +{ + WARN_ON_ONCE(!(ops->flags & FTRACE_OPS_FL_CONTROL)); + return *this_cpu_ptr(ops->disabled); +} + extern void ftrace_stub(unsigned long a0, unsigned long a1); #else /* !CONFIG_FUNCTION_TRACER */ -- cgit v1.2.3 From ceec0b6fc7cd43b38a40c2d40223f9cd0616f0cd Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 15 Feb 2012 15:51:49 +0100 Subject: ftrace, perf: Add open/close tracepoint perf registration actions Adding TRACE_REG_PERF_OPEN and TRACE_REG_PERF_CLOSE to differentiate register/unregister from open/close actions. The register/unregister actions are invoked for the first/last tracepoint user when opening/closing the event. The open/close actions are invoked for each tracepoint user when opening/closing the event. 
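A sketch of how an event class ->reg() callback can dispatch on the new actions (handler names are hypothetical; the real tracepoint handling lives in the ftrace/perf glue code):

    #include <linux/ftrace_event.h>

    static int my_event_reg(struct ftrace_event_call *call,
                            enum trace_reg type, void *data)
    {
            switch (type) {
            case TRACE_REG_PERF_REGISTER:   /* first perf user: hook the tracepoint */
                    return my_probe_register(call);
            case TRACE_REG_PERF_UNREGISTER: /* last perf user went away */
                    my_probe_unregister(call);
                    return 0;
            case TRACE_REG_PERF_OPEN:       /* every individual perf_event open */
                    return my_event_open(call, data);
            case TRACE_REG_PERF_CLOSE:      /* every individual perf_event close */
                    my_event_close(call, data);
                    return 0;
            default:
                    return 0;
            }
    }
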
Link: http://lkml.kernel.org/r/1329317514-8131-3-git-send-email-jolsa@redhat.com Acked-by: Frederic Weisbecker Signed-off-by: Jiri Olsa Signed-off-by: Steven Rostedt --- include/linux/ftrace_event.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index c3da42dd22ba..195e3606ddd7 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -146,6 +146,8 @@ enum trace_reg { TRACE_REG_UNREGISTER, TRACE_REG_PERF_REGISTER, TRACE_REG_PERF_UNREGISTER, + TRACE_REG_PERF_OPEN, + TRACE_REG_PERF_CLOSE, }; struct ftrace_event_call; @@ -157,7 +159,7 @@ struct ftrace_event_class { void *perf_probe; #endif int (*reg)(struct ftrace_event_call *event, - enum trace_reg type); + enum trace_reg type, void *data); int (*define_fields)(struct ftrace_event_call *); struct list_head *(*get_fields)(struct ftrace_event_call *); struct list_head fields; @@ -165,7 +167,7 @@ struct ftrace_event_class { }; extern int ftrace_event_reg(struct ftrace_event_call *event, - enum trace_reg type); + enum trace_reg type, void *data); enum { TRACE_EVENT_FL_ENABLED_BIT, -- cgit v1.2.3 From 489c75c3b333dfda4c8d2b7ad1b00e5da024bfa7 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 15 Feb 2012 15:51:50 +0100 Subject: ftrace, perf: Add add/del tracepoint perf registration actions Adding TRACE_REG_PERF_ADD and TRACE_REG_PERF_DEL to handle perf event schedule in/out actions. The add action is invoked for when the perf event is scheduled in, while the del action is invoked when the event is scheduled out. Link: http://lkml.kernel.org/r/1329317514-8131-4-git-send-email-jolsa@redhat.com Acked-by: Frederic Weisbecker Signed-off-by: Jiri Olsa Signed-off-by: Steven Rostedt --- include/linux/ftrace_event.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 195e3606ddd7..2bf677cb2d6f 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -148,6 +148,8 @@ enum trace_reg { TRACE_REG_PERF_UNREGISTER, TRACE_REG_PERF_OPEN, TRACE_REG_PERF_CLOSE, + TRACE_REG_PERF_ADD, + TRACE_REG_PERF_DEL, }; struct ftrace_event_call; -- cgit v1.2.3 From ced39002f5ea736b716ae233fb68b26d59783912 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 15 Feb 2012 15:51:52 +0100 Subject: ftrace, perf: Add support to use function tracepoint in perf Adding perf registration support for the ftrace function event, so it is now possible to register it via perf interface. The perf_event struct statically contains ftrace_ops as a handle for function tracer. The function tracer is registered/unregistered in open/close actions. To be efficient, we enable/disable ftrace_ops each time the traced process is scheduled in/out (via TRACE_REG_PERF_(ADD|DELL) handlers). This way tracing is enabled only when the process is running. Intentionally using this way instead of the event's hw state PERF_HES_STOPPED, which would not disable the ftrace_ops. It is now possible to use function trace within perf commands like: perf record -e ftrace:function ls perf stat -e ftrace:function ls Allowed only for root. 
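A simplified sketch of the schedule in/out handling described above (function name hypothetical; the real handler also sets up ops->func and the CONTROL flag before the ops is registered in the open action):

    #include <linux/ftrace.h>
    #include <linux/perf_event.h>

    /* The ADD/DEL actions only flip the per-cpu disabled counter, so the
     * already-registered ftrace_ops traces the task only while it runs. */
    static int perf_ftrace_sched_reg(struct ftrace_event_call *call,
                                     enum trace_reg type, void *data)
    {
            struct perf_event *event = data;

            switch (type) {
            case TRACE_REG_PERF_ADD:        /* event scheduled in on this cpu */
                    ftrace_function_local_enable(&event->ftrace_ops);
                    return 0;
            case TRACE_REG_PERF_DEL:        /* event scheduled out on this cpu */
                    ftrace_function_local_disable(&event->ftrace_ops);
                    return 0;
            default:
                    return 0;
            }
    }
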
Link: http://lkml.kernel.org/r/1329317514-8131-6-git-send-email-jolsa@redhat.com Acked-by: Frederic Weisbecker Signed-off-by: Jiri Olsa Signed-off-by: Steven Rostedt --- include/linux/perf_event.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 412b790f5da6..92a056f6d18d 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -859,6 +859,9 @@ struct perf_event { #ifdef CONFIG_EVENT_TRACING struct ftrace_event_call *tp_event; struct event_filter *filter; +#ifdef CONFIG_FUNCTION_TRACER + struct ftrace_ops ftrace_ops; +#endif #endif #ifdef CONFIG_CGROUP_PERF -- cgit v1.2.3 From 02aa3162edaa166a01d193f80ccde890be8b55da Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 15 Feb 2012 15:51:53 +0100 Subject: ftrace: Allow to specify filter field type for ftrace events Adding FILTER_TRACE_FN event field type for function tracepoint event, so it can be properly recognized within filtering code. Currently all fields of ftrace subsystem events share the common field type FILTER_OTHER. Since the function trace fields need special care within the filtering code we need to recognize it properly, hence adding the FILTER_TRACE_FN event type. Adding filter parameter to the FTRACE_ENTRY macro, to specify the filter field type for the event. Link: http://lkml.kernel.org/r/1329317514-8131-7-git-send-email-jolsa@redhat.com Signed-off-by: Jiri Olsa Signed-off-by: Steven Rostedt --- include/linux/ftrace_event.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 2bf677cb2d6f..dd478fc8f9f5 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -245,6 +245,7 @@ enum { FILTER_STATIC_STRING, FILTER_DYN_STRING, FILTER_PTR_STRING, + FILTER_TRACE_FN, }; #define EVENT_STORAGE_SIZE 128 -- cgit v1.2.3 From 5500fa51199aee770ce53718853732600543619e Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 15 Feb 2012 15:51:54 +0100 Subject: ftrace, perf: Add filter support for function trace event Adding support to filter function trace event via perf interface. It is now possible to use filter interface in the perf tool like: perf record -e ftrace:function --filter="(ip == mm_*)" ls The filter syntax is restricted to the the 'ip' field only, and following operators are accepted '==' '!=' '||', ending up with the filter strings like: ip == f1[, ]f2 ... || ip != f3[, ]f4 ... with comma ',' or space ' ' as a function separator. If the space ' ' is used as a separator, the right side of the assignment needs to be enclosed in double quotes '"', e.g.: perf record -e ftrace:function --filter '(ip == do_execve,sys_*,ext*)' ls perf record -e ftrace:function --filter '(ip == "do_execve,sys_*,ext*")' ls perf record -e ftrace:function --filter '(ip == "do_execve sys_* ext*")' ls The '==' operator adds trace filter with same effect as would be added via set_ftrace_filter file. The '!=' operator adds trace filter with same effect as would be added via set_ftrace_notrace file. The right side of the '!=', '==' operators is list of functions or regexp. to be added to filter separated by space. The '||' operator is used for connecting multiple filter definitions together. It is possible to have more than one '==' and '!=' operators within one filter string. 
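On the kernel side the two operators map onto the existing filter primitives; a rough sketch (helper name hypothetical, ops assumed to be set up by the caller):

    #include <linux/types.h>
    #include <linux/ftrace.h>

    /* '==' feeds the filter hash, '!=' the notrace hash. Both calls now
     * return 0 or -E* since ftrace_set_filter()/ftrace_set_notrace()
     * report whether the pattern matched anything. */
    static int apply_ip_filter(struct ftrace_ops *ops, unsigned char *funcs,
                               int len, bool notrace)
    {
            if (notrace)
                    return ftrace_set_notrace(ops, funcs, len, 1);

            return ftrace_set_filter(ops, funcs, len, 1);
    }
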
Link: http://lkml.kernel.org/r/1329317514-8131-8-git-send-email-jolsa@redhat.com Signed-off-by: Jiri Olsa Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 64a309d2fbd6..72a6cabb4d5b 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -250,6 +250,7 @@ int ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf, int len, int reset); void ftrace_set_global_filter(unsigned char *buf, int len, int reset); void ftrace_set_global_notrace(unsigned char *buf, int len, int reset); +void ftrace_free_filter(struct ftrace_ops *ops); int register_ftrace_command(struct ftrace_func_command *cmd); int unregister_ftrace_command(struct ftrace_func_command *cmd); @@ -380,9 +381,6 @@ extern void ftrace_enable_daemon(void); #else static inline int skip_trace(unsigned long ip) { return 0; } static inline int ftrace_force_update(void) { return 0; } -static inline void ftrace_set_filter(unsigned char *buf, int len, int reset) -{ -} static inline void ftrace_disable_daemon(void) { } static inline void ftrace_enable_daemon(void) { } static inline void ftrace_release_mod(struct module *mod) {} @@ -406,6 +404,9 @@ static inline int ftrace_text_reserved(void *start, void *end) */ #define ftrace_regex_open(ops, flag, inod, file) ({ -ENODEV; }) #define ftrace_set_early_filter(ops, buf, enable) do { } while (0) +#define ftrace_set_filter(ops, buf, len, reset) ({ -ENODEV; }) +#define ftrace_set_notrace(ops, buf, len, reset) ({ -ENODEV; }) +#define ftrace_free_filter(ops) do { } while (0) static inline ssize_t ftrace_filter_write(struct file *file, const char __user *ubuf, size_t cnt, loff_t *ppos) { return -ENODEV; } -- cgit v1.2.3 From c5905afb0ee6550b42c49213da1c22d67316c194 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 24 Feb 2012 08:31:31 +0100 Subject: static keys: Introduce 'struct static_key', static_key_true()/false() and static_key_slow_[inc|dec]() So here's a boot tested patch on top of Jason's series that does all the cleanups I talked about and turns jump labels into a more intuitive to use facility. It should also address the various misconceptions and confusions that surround jump labels. Typical usage scenarios: #include struct static_key key = STATIC_KEY_INIT_TRUE; if (static_key_false(&key)) do unlikely code else do likely code Or: if (static_key_true(&key)) do likely code else do unlikely code The static key is modified via: static_key_slow_inc(&key); ... static_key_slow_dec(&key); The 'slow' prefix makes it abundantly clear that this is an expensive operation. I've updated all in-kernel code to use this everywhere. Note that I (intentionally) have not pushed through the rename blindly through to the lowest levels: the actual jump-label patching arch facility should be named like that, so we want to decouple jump labels from the static-key facility a bit. On non-jump-label enabled architectures static keys default to likely()/unlikely() branches. 
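A self-contained sketch of the common default-false pattern (symbol names hypothetical, API as introduced by this patch):

    #include <linux/types.h>
    #include <linux/static_key.h>

    static struct static_key my_feature_key; /* all-zero == STATIC_KEY_INIT_FALSE */

    static void hot_path(void)
    {
            if (static_key_false(&my_feature_key)) {
                    /* out-of-line, unlikely: feature enabled */
            }
            /* default: a single NOP on jump-label capable architectures */
    }

    static void set_feature(bool on)        /* slow path, may patch code */
    {
            if (on)
                    static_key_slow_inc(&my_feature_key);
            else
                    static_key_slow_dec(&my_feature_key);
    }
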
Signed-off-by: Ingo Molnar Acked-by: Jason Baron Acked-by: Steven Rostedt Cc: a.p.zijlstra@chello.nl Cc: mathieu.desnoyers@efficios.com Cc: davem@davemloft.net Cc: ddaney.cavm@gmail.com Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20120222085809.GA26397@elte.hu Signed-off-by: Ingo Molnar --- include/linux/jump_label.h | 139 ++++++++++++++++++++++++++++++++------------- include/linux/netdevice.h | 4 +- include/linux/netfilter.h | 6 +- include/linux/perf_event.h | 12 ++-- include/linux/static_key.h | 1 + include/linux/tracepoint.h | 8 +-- 6 files changed, 116 insertions(+), 54 deletions(-) create mode 100644 include/linux/static_key.h (limited to 'include/linux') diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h index f7c69580fea7..2172da2d9bb4 100644 --- a/include/linux/jump_label.h +++ b/include/linux/jump_label.h @@ -9,15 +9,15 @@ * * Jump labels provide an interface to generate dynamic branches using * self-modifying code. Assuming toolchain and architecture support the result - * of a "if (static_branch(&key))" statement is a unconditional branch (which + * of a "if (static_key_false(&key))" statement is a unconditional branch (which * defaults to false - and the true block is placed out of line). * - * However at runtime we can change the 'static' branch target using - * jump_label_{inc,dec}(). These function as a 'reference' count on the key + * However at runtime we can change the branch target using + * static_key_slow_{inc,dec}(). These function as a 'reference' count on the key * object and for as long as there are references all branches referring to * that particular key will point to the (out of line) true block. * - * Since this relies on modifying code the jump_label_{inc,dec}() functions + * Since this relies on modifying code the static_key_slow_{inc,dec}() functions * must be considered absolute slow paths (machine wide synchronization etc.). * OTOH, since the affected branches are unconditional their runtime overhead * will be absolutely minimal, esp. in the default (off) case where the total @@ -26,12 +26,26 @@ * * When the control is directly exposed to userspace it is prudent to delay the * decrement to avoid high frequency code modifications which can (and do) - * cause significant performance degradation. Struct jump_label_key_deferred and - * jump_label_dec_deferred() provide for this. + * cause significant performance degradation. Struct static_key_deferred and + * static_key_slow_dec_deferred() provide for this. * * Lacking toolchain and or architecture support, it falls back to a simple * conditional branch. - */ + * + * struct static_key my_key = STATIC_KEY_INIT_TRUE; + * + * if (static_key_true(&my_key)) { + * } + * + * will result in the true case being in-line and starts the key with a single + * reference. Mixing static_key_true() and static_key_false() on the same key is not + * allowed. + * + * Not initializing the key (static data is initialized to 0s anyway) is the + * same as using STATIC_KEY_INIT_FALSE and static_key_false() is + * equivalent with static_branch(). 
+ * +*/ #include #include @@ -39,16 +53,17 @@ #if defined(CC_HAVE_ASM_GOTO) && defined(CONFIG_JUMP_LABEL) -struct jump_label_key { +struct static_key { atomic_t enabled; +/* Set lsb bit to 1 if branch is default true, 0 ot */ struct jump_entry *entries; #ifdef CONFIG_MODULES - struct jump_label_mod *next; + struct static_key_mod *next; #endif }; -struct jump_label_key_deferred { - struct jump_label_key key; +struct static_key_deferred { + struct static_key key; unsigned long timeout; struct delayed_work work; }; @@ -66,13 +81,34 @@ struct module; #ifdef HAVE_JUMP_LABEL -#ifdef CONFIG_MODULES -#define JUMP_LABEL_INIT {ATOMIC_INIT(0), NULL, NULL} -#else -#define JUMP_LABEL_INIT {ATOMIC_INIT(0), NULL} -#endif +#define JUMP_LABEL_TRUE_BRANCH 1UL + +static +inline struct jump_entry *jump_label_get_entries(struct static_key *key) +{ + return (struct jump_entry *)((unsigned long)key->entries + & ~JUMP_LABEL_TRUE_BRANCH); +} + +static inline bool jump_label_get_branch_default(struct static_key *key) +{ + if ((unsigned long)key->entries & JUMP_LABEL_TRUE_BRANCH) + return true; + return false; +} + +static __always_inline bool static_key_false(struct static_key *key) +{ + return arch_static_branch(key); +} -static __always_inline bool static_branch(struct jump_label_key *key) +static __always_inline bool static_key_true(struct static_key *key) +{ + return !static_key_false(key); +} + +/* Deprecated. Please use 'static_key_false() instead. */ +static __always_inline bool static_branch(struct static_key *key) { return arch_static_branch(key); } @@ -88,21 +124,24 @@ extern void arch_jump_label_transform(struct jump_entry *entry, extern void arch_jump_label_transform_static(struct jump_entry *entry, enum jump_label_type type); extern int jump_label_text_reserved(void *start, void *end); -extern void jump_label_inc(struct jump_label_key *key); -extern void jump_label_dec(struct jump_label_key *key); -extern void jump_label_dec_deferred(struct jump_label_key_deferred *key); -extern bool jump_label_enabled(struct jump_label_key *key); +extern void static_key_slow_inc(struct static_key *key); +extern void static_key_slow_dec(struct static_key *key); +extern void static_key_slow_dec_deferred(struct static_key_deferred *key); +extern bool static_key_enabled(struct static_key *key); extern void jump_label_apply_nops(struct module *mod); -extern void jump_label_rate_limit(struct jump_label_key_deferred *key, - unsigned long rl); +extern void +jump_label_rate_limit(struct static_key_deferred *key, unsigned long rl); + +#define STATIC_KEY_INIT_TRUE ((struct static_key) \ + { .enabled = ATOMIC_INIT(1), .entries = (void *)1 }) +#define STATIC_KEY_INIT_FALSE ((struct static_key) \ + { .enabled = ATOMIC_INIT(0), .entries = (void *)0 }) #else /* !HAVE_JUMP_LABEL */ #include -#define JUMP_LABEL_INIT {ATOMIC_INIT(0)} - -struct jump_label_key { +struct static_key { atomic_t enabled; }; @@ -110,30 +149,45 @@ static __always_inline void jump_label_init(void) { } -struct jump_label_key_deferred { - struct jump_label_key key; +struct static_key_deferred { + struct static_key key; }; -static __always_inline bool static_branch(struct jump_label_key *key) +static __always_inline bool static_key_false(struct static_key *key) +{ + if (unlikely(atomic_read(&key->enabled)) > 0) + return true; + return false; +} + +static __always_inline bool static_key_true(struct static_key *key) { - if (unlikely(atomic_read(&key->enabled))) + if (likely(atomic_read(&key->enabled)) > 0) return true; return false; } -static inline void 
jump_label_inc(struct jump_label_key *key) +/* Deprecated. Please use 'static_key_false() instead. */ +static __always_inline bool static_branch(struct static_key *key) +{ + if (unlikely(atomic_read(&key->enabled)) > 0) + return true; + return false; +} + +static inline void static_key_slow_inc(struct static_key *key) { atomic_inc(&key->enabled); } -static inline void jump_label_dec(struct jump_label_key *key) +static inline void static_key_slow_dec(struct static_key *key) { atomic_dec(&key->enabled); } -static inline void jump_label_dec_deferred(struct jump_label_key_deferred *key) +static inline void static_key_slow_dec_deferred(struct static_key_deferred *key) { - jump_label_dec(&key->key); + static_key_slow_dec(&key->key); } static inline int jump_label_text_reserved(void *start, void *end) @@ -144,9 +198,9 @@ static inline int jump_label_text_reserved(void *start, void *end) static inline void jump_label_lock(void) {} static inline void jump_label_unlock(void) {} -static inline bool jump_label_enabled(struct jump_label_key *key) +static inline bool static_key_enabled(struct static_key *key) { - return !!atomic_read(&key->enabled); + return (atomic_read(&key->enabled) > 0); } static inline int jump_label_apply_nops(struct module *mod) @@ -154,13 +208,20 @@ static inline int jump_label_apply_nops(struct module *mod) return 0; } -static inline void jump_label_rate_limit(struct jump_label_key_deferred *key, +static inline void +jump_label_rate_limit(struct static_key_deferred *key, unsigned long rl) { } + +#define STATIC_KEY_INIT_TRUE ((struct static_key) \ + { .enabled = ATOMIC_INIT(1) }) +#define STATIC_KEY_INIT_FALSE ((struct static_key) \ + { .enabled = ATOMIC_INIT(0) }) + #endif /* HAVE_JUMP_LABEL */ -#define jump_label_key_enabled ((struct jump_label_key){ .enabled = ATOMIC_INIT(1), }) -#define jump_label_key_disabled ((struct jump_label_key){ .enabled = ATOMIC_INIT(0), }) +#define STATIC_KEY_INIT STATIC_KEY_INIT_FALSE +#define jump_label_enabled static_key_enabled #endif /* _LINUX_JUMP_LABEL_H */ diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 0eac07c95255..7dfaae7846ab 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -214,8 +214,8 @@ enum { #include #ifdef CONFIG_RPS -#include -extern struct jump_label_key rps_needed; +#include +extern struct static_key rps_needed; #endif struct neighbour; diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index b809265607d0..29734be334c1 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -163,13 +163,13 @@ extern struct ctl_path nf_net_ipv4_netfilter_sysctl_path[]; extern struct list_head nf_hooks[NFPROTO_NUMPROTO][NF_MAX_HOOKS]; #if defined(CONFIG_JUMP_LABEL) -#include -extern struct jump_label_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS]; +#include +extern struct static_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS]; static inline bool nf_hooks_active(u_int8_t pf, unsigned int hook) { if (__builtin_constant_p(pf) && __builtin_constant_p(hook)) - return static_branch(&nf_hooks_needed[pf][hook]); + return static_key_false(&nf_hooks_needed[pf][hook]); return !list_empty(&nf_hooks[pf][hook]); } diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 412b790f5da6..0d21e6f1cf53 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -514,7 +514,7 @@ struct perf_guest_info_callbacks { #include #include #include -#include +#include #include #include @@ -1038,7 +1038,7 @@ static inline int is_software_event(struct 
perf_event *event) return event->pmu->task_ctx_nr == perf_sw_context; } -extern struct jump_label_key perf_swevent_enabled[PERF_COUNT_SW_MAX]; +extern struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX]; extern void __perf_sw_event(u32, u64, struct pt_regs *, u64); @@ -1066,7 +1066,7 @@ perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) { struct pt_regs hot_regs; - if (static_branch(&perf_swevent_enabled[event_id])) { + if (static_key_false(&perf_swevent_enabled[event_id])) { if (!regs) { perf_fetch_caller_regs(&hot_regs); regs = &hot_regs; @@ -1075,12 +1075,12 @@ perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) } } -extern struct jump_label_key_deferred perf_sched_events; +extern struct static_key_deferred perf_sched_events; static inline void perf_event_task_sched_in(struct task_struct *prev, struct task_struct *task) { - if (static_branch(&perf_sched_events.key)) + if (static_key_false(&perf_sched_events.key)) __perf_event_task_sched_in(prev, task); } @@ -1089,7 +1089,7 @@ static inline void perf_event_task_sched_out(struct task_struct *prev, { perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, NULL, 0); - if (static_branch(&perf_sched_events.key)) + if (static_key_false(&perf_sched_events.key)) __perf_event_task_sched_out(prev, next); } diff --git a/include/linux/static_key.h b/include/linux/static_key.h new file mode 100644 index 000000000000..27bd3f8a0857 --- /dev/null +++ b/include/linux/static_key.h @@ -0,0 +1 @@ +#include diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index fc36da97ff7e..bd96ecd0e05c 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -17,7 +17,7 @@ #include #include #include -#include +#include struct module; struct tracepoint; @@ -29,7 +29,7 @@ struct tracepoint_func { struct tracepoint { const char *name; /* Tracepoint name */ - struct jump_label_key key; + struct static_key key; void (*regfunc)(void); void (*unregfunc)(void); struct tracepoint_func __rcu *funcs; @@ -145,7 +145,7 @@ static inline void tracepoint_synchronize_unregister(void) extern struct tracepoint __tracepoint_##name; \ static inline void trace_##name(proto) \ { \ - if (static_branch(&__tracepoint_##name.key)) \ + if (static_key_false(&__tracepoint_##name.key)) \ __DO_TRACE(&__tracepoint_##name, \ TP_PROTO(data_proto), \ TP_ARGS(data_args), \ @@ -188,7 +188,7 @@ static inline void tracepoint_synchronize_unregister(void) __attribute__((section("__tracepoints_strings"))) = #name; \ struct tracepoint __tracepoint_##name \ __attribute__((section("__tracepoints"))) = \ - { __tpstrtab_##name, JUMP_LABEL_INIT, reg, unreg, NULL };\ + { __tpstrtab_##name, STATIC_KEY_INIT_FALSE, reg, unreg, NULL };\ static struct tracepoint * const __tracepoint_ptr_##name __used \ __attribute__((section("__tracepoints_ptrs"))) = \ &__tracepoint_##name; -- cgit v1.2.3 From 8eedce996556d7d06522cd3a0e6069141c8dffe0 Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Tue, 28 Feb 2012 13:49:01 -0500 Subject: static keys: Inline the static_key_enabled() function In the jump label enabled case, calling static_key_enabled() results in a function call. The function returns the results of a compare, so it really doesn't need the overhead of a full function call. Let's make it 'static inline' for both the jump label enabled and disabled cases. 
Signed-off-by: Jason Baron Cc: a.p.zijlstra@chello.nl Cc: rostedt@goodmis.org Cc: mathieu.desnoyers@polymtl.ca Cc: Andrew Morton Cc: Linus Torvalds Link: http://lkml.kernel.org/r/201202281849.q1SIn1p2023270@int-mx10.intmail.prod.int.phx2.redhat.com Signed-off-by: Ingo Molnar --- include/linux/jump_label.h | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h index 2172da2d9bb4..c513a40510f5 100644 --- a/include/linux/jump_label.h +++ b/include/linux/jump_label.h @@ -127,7 +127,6 @@ extern int jump_label_text_reserved(void *start, void *end); extern void static_key_slow_inc(struct static_key *key); extern void static_key_slow_dec(struct static_key *key); extern void static_key_slow_dec_deferred(struct static_key_deferred *key); -extern bool static_key_enabled(struct static_key *key); extern void jump_label_apply_nops(struct module *mod); extern void jump_label_rate_limit(struct static_key_deferred *key, unsigned long rl); @@ -198,11 +197,6 @@ static inline int jump_label_text_reserved(void *start, void *end) static inline void jump_label_lock(void) {} static inline void jump_label_unlock(void) {} -static inline bool static_key_enabled(struct static_key *key) -{ - return (atomic_read(&key->enabled) > 0); -} - static inline int jump_label_apply_nops(struct module *mod) { return 0; @@ -224,4 +218,9 @@ jump_label_rate_limit(struct static_key_deferred *key, #define STATIC_KEY_INIT STATIC_KEY_INIT_FALSE #define jump_label_enabled static_key_enabled +static inline bool static_key_enabled(struct static_key *key) +{ + return (atomic_read(&key->enabled) > 0); +} + #endif /* _LINUX_JUMP_LABEL_H */ -- cgit v1.2.3 From bce38cd53e5ddba9cb6d708c4ef3d04a4016ec7e Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Thu, 9 Feb 2012 23:20:51 +0100 Subject: perf: Add generic taken branch sampling support This patch adds the ability to sample taken branches to the perf_event interface. The ability to capture taken branches is very useful for all sorts of analysis. For instance, basic block profiling, call counts, statistical call graph. This new capability requires hardware assist and as such may not be available on all HW platforms. On Intel x86 it is implemented on top of the Last Branch Record (LBR) facility. To enable taken branches sampling, the PERF_SAMPLE_BRANCH_STACK bit must be set in attr->sample_type. Sampled taken branches may be filtered by type and/or priv levels. The patch adds a new field, called branch_sample_type, to the perf_event_attr structure. It contains a bitmask of filters to apply to the sampled taken branches. Filters may be implemented in HW. If the HW filter does not exist or is not good enough, some arch may also implement a SW filter. The following generic filters are currently defined: - PERF_SAMPLE_USER only branches whose targets are at the user level - PERF_SAMPLE_KERNEL only branches whose targets are at the kernel level - PERF_SAMPLE_HV only branches whose targets are at the hypervisor level - PERF_SAMPLE_ANY any type of branches (subject to priv levels filters) - PERF_SAMPLE_ANY_CALL any call branches (may incl. syscall on some arch) - PERF_SAMPLE_ANY_RET any return branches (may incl. syscall returns on some arch) - PERF_SAMPLE_IND_CALL indirect call branches Obviously filter may be combined. The priv level bits are optional. If not provided, the priv level of the associated event are used. 
It is possible to collect branches at a priv level different from the associated event. Use of kernel, hv priv levels is subject to permissions and availability (hv). The number of taken branch records present in each sample may vary based on HW, the type of sampled branches, the executed code. Therefore each sample contains the number of taken branches it contains. Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1328826068-11713-2-git-send-email-eranian@google.com Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 71 +++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 67 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 64426b71381f..5fc494f4a094 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -129,10 +129,39 @@ enum perf_event_sample_format { PERF_SAMPLE_PERIOD = 1U << 8, PERF_SAMPLE_STREAM_ID = 1U << 9, PERF_SAMPLE_RAW = 1U << 10, + PERF_SAMPLE_BRANCH_STACK = 1U << 11, - PERF_SAMPLE_MAX = 1U << 11, /* non-ABI */ + PERF_SAMPLE_MAX = 1U << 12, /* non-ABI */ }; +/* + * values to program into branch_sample_type when PERF_SAMPLE_BRANCH is set + * + * If the user does not pass priv level information via branch_sample_type, + * the kernel uses the event's priv level. Branch and event priv levels do + * not have to match. Branch priv level is checked for permissions. + * + * The branch types can be combined, however BRANCH_ANY covers all types + * of branches and therefore it supersedes all the other types. + */ +enum perf_branch_sample_type { + PERF_SAMPLE_BRANCH_USER = 1U << 0, /* user branches */ + PERF_SAMPLE_BRANCH_KERNEL = 1U << 1, /* kernel branches */ + PERF_SAMPLE_BRANCH_HV = 1U << 2, /* hypervisor branches */ + + PERF_SAMPLE_BRANCH_ANY = 1U << 3, /* any branch types */ + PERF_SAMPLE_BRANCH_ANY_CALL = 1U << 4, /* any call branch */ + PERF_SAMPLE_BRANCH_ANY_RETURN = 1U << 5, /* any return branch */ + PERF_SAMPLE_BRANCH_IND_CALL = 1U << 6, /* indirect calls */ + + PERF_SAMPLE_BRANCH_MAX = 1U << 7, /* non-ABI */ +}; + +#define PERF_SAMPLE_BRANCH_PLM_ALL \ + (PERF_SAMPLE_BRANCH_USER|\ + PERF_SAMPLE_BRANCH_KERNEL|\ + PERF_SAMPLE_BRANCH_HV) + /* * The format of the data returned by read() on a perf event fd, * as specified by attr.read_format: @@ -240,6 +269,7 @@ struct perf_event_attr { __u64 bp_len; __u64 config2; /* extension of config1 */ }; + __u64 branch_sample_type; /* enum branch_sample_type */ }; /* @@ -458,6 +488,8 @@ enum perf_event_type { * * { u32 size; * char data[size];}&& PERF_SAMPLE_RAW + * + * { u64 from, to, flags } lbr[nr];} && PERF_SAMPLE_BRANCH_STACK * }; */ PERF_RECORD_SAMPLE = 9, @@ -530,12 +562,34 @@ struct perf_raw_record { void *data; }; +/* + * single taken branch record layout: + * + * from: source instruction (may not always be a branch insn) + * to: branch target + * mispred: branch target was mispredicted + * predicted: branch target was predicted + * + * support for mispred, predicted is optional. In case it + * is not supported mispred = predicted = 0. 
+ */ struct perf_branch_entry { - __u64 from; - __u64 to; - __u64 flags; + __u64 from; + __u64 to; + __u64 mispred:1, /* target mispredicted */ + predicted:1,/* target predicted */ + reserved:62; }; +/* + * branch stack layout: + * nr: number of taken branches stored in entries[] + * + * Note that nr can vary from sample to sample + * branches (to, from) are stored from most recent + * to least recent, i.e., entries[0] contains the most + * recent branch. + */ struct perf_branch_stack { __u64 nr; struct perf_branch_entry entries[0]; @@ -566,7 +620,9 @@ struct hw_perf_event { unsigned long event_base; int idx; int last_cpu; + struct hw_perf_event_extra extra_reg; + struct hw_perf_event_extra branch_reg; }; struct { /* software */ struct hrtimer hrtimer; @@ -1007,12 +1063,14 @@ struct perf_sample_data { u64 period; struct perf_callchain_entry *callchain; struct perf_raw_record *raw; + struct perf_branch_stack *br_stack; }; static inline void perf_sample_data_init(struct perf_sample_data *data, u64 addr) { data->addr = addr; data->raw = NULL; + data->br_stack = NULL; } extern void perf_output_sample(struct perf_output_handle *handle, @@ -1151,6 +1209,11 @@ extern void perf_bp_event(struct perf_event *event, void *data); # define perf_instruction_pointer(regs) instruction_pointer(regs) #endif +static inline bool has_branch_stack(struct perf_event *event) +{ + return event->attr.sample_type & PERF_SAMPLE_BRANCH_STACK; +} + extern int perf_output_begin(struct perf_output_handle *handle, struct perf_event *event, unsigned int size); extern void perf_output_end(struct perf_output_handle *handle); -- cgit v1.2.3 From d010b3326cf06b3406cdd88af16dcf4e4b6fec2e Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Thu, 9 Feb 2012 23:21:00 +0100 Subject: perf: Add callback to flush branch_stack on context switch With branch stack sampling, it is possible to filter by priv levels. In system-wide mode, that means it is possible to capture only user level branches. The builtin SW LBR filter needs to disassemble code based on LBR captured addresses. For that, it needs to know the task the addresses are associated with. Because of context switches, the content of the branch stack buffer may contain addresses from different tasks. We need a callback on context switch to either flush the branch stack or save it. This patch adds a new callback in struct pmu which is called during context switches. The callback is called only when necessary. That is when a system-wide context has, at least, one event which uses PERF_SAMPLE_BRANCH_STACK. The callback is never called for per-thread context. In this version, the Intel x86 code simply flushes (resets) the LBR on context switches (fills it with zeroes). Those zeroed branches are then filtered out by the SW filter. Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1328826068-11713-11-git-send-email-eranian@google.com Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 5fc494f4a094..fbbf5e598368 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -746,6 +746,11 @@ struct pmu { * if no implementation is provided it will default to: event->hw.idx + 1. 
*/ int (*event_idx) (struct perf_event *event); /*optional */ + + /* + * flush branch stack on context-switches (needed in cpu-wide mode) + */ + void (*flush_branch_stack) (void); }; /** @@ -979,7 +984,8 @@ struct perf_event_context { u64 parent_gen; u64 generation; int pin_count; - int nr_cgroups; /* cgroup events present */ + int nr_cgroups; /* cgroup evts */ + int nr_branch_stack; /* branch_stack evt */ struct rcu_head rcu_head; }; @@ -1044,6 +1050,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, extern u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running); + struct perf_sample_data { u64 type; -- cgit v1.2.3 From cb5d76999029ae7a517cb07dfa732c1b5a934fc2 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Thu, 9 Feb 2012 23:21:05 +0100 Subject: perf: Add ABI reference sizes This patch adds reference sizes for revision 1 and 2 of the perf_event ABI, i.e., the size of the perf_event_attr struct. With Rev1: config2 was added = +8 bytes With Rev2: branch_sample_type was added = +8 bytes Adds the definition for Rev1, Rev2. This is useful for tools trying to decode the revision numbers based on the size of the struct. Signed-off-by: Stephane Eranian Cc: peterz@infradead.org Cc: acme@redhat.com Cc: robert.richter@amd.com Cc: ming.m.lin@intel.com Cc: andi@firstfloor.org Cc: asharma@fb.com Cc: ravitillo@lbl.gov Cc: vweaver1@eecs.utk.edu Cc: khandual@linux.vnet.ibm.com Cc: dsahern@gmail.com Link: http://lkml.kernel.org/r/1328826068-11713-16-git-send-email-eranian@google.com Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index fbbf5e598368..bd9f55a5958d 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -192,6 +192,8 @@ enum perf_event_read_format { }; #define PERF_ATTR_SIZE_VER0 64 /* sizeof first published struct */ +#define PERF_ATTR_SIZE_VER1 72 /* add: config2 */ +#define PERF_ATTR_SIZE_VER2 80 /* add: branch_sample_type */ /* * Hardware event_id to monitor via a performance monitoring event: -- cgit v1.2.3
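Taken together, the branch-stack and ABI-size changes let userspace request taken-branch sampling; a rough userspace sketch (helper name hypothetical, error handling omitted):

    #include <string.h>
    #include <unistd.h>
    #include <sys/syscall.h>
    #include <linux/perf_event.h>

    static int open_branch_sampling_event(void)
    {
            struct perf_event_attr attr;

            memset(&attr, 0, sizeof(attr));
            attr.type               = PERF_TYPE_HARDWARE;
            attr.config             = PERF_COUNT_HW_CPU_CYCLES;
            attr.size               = PERF_ATTR_SIZE_VER2;  /* includes branch_sample_type */
            attr.sample_period      = 100000;
            attr.sample_type        = PERF_SAMPLE_IP | PERF_SAMPLE_BRANCH_STACK;
            /* user-level calls only; priv bits are optional and default to
             * the event's own priv level when omitted */
            attr.branch_sample_type = PERF_SAMPLE_BRANCH_USER |
                                      PERF_SAMPLE_BRANCH_ANY_CALL;

            /* measure the current task on any cpu */
            return syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
    }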