summaryrefslogtreecommitdiff
path: root/include
diff options
context:
space:
mode:
authorDavid S. Miller <davem@davemloft.net>2016-02-20 00:21:44 -0500
committerDavid S. Miller <davem@davemloft.net>2016-02-20 00:21:44 -0500
commit80c804bfc487c6df783c258b9034b9d81c34f7a0 (patch)
treeec8f20a73795ff16ed02887eb72870eeeee186a8 /include
parent6b83d28a55a891a9d70fc61ccb1c138e47dcbe74 (diff)
parenta6ffe7b9df6228d11c5689914eceb488bc4e38df (diff)
Merge branch 'bpf-get-stackid'
Alexei Starovoitov says: ==================== bpf_get_stackid() and stack_trace map This patch set introduces new map type to store stack traces and corresponding bpf_get_stackid() helper. BPF programs already can walk the stack via unrolled loop of bpf_probe_read()s which is ok for simple analysis, but it's not efficient and limited to <30 frames after that the programs don't fit into MAX_BPF_STACK. With bpf_get_stackid() helper the programs can collect up to PERF_MAX_STACK_DEPTH both user and kernel frames. Using stack traces as a key in a map turned out to be very useful for generating flame graphs, off-cpu graphs, waker and chain graphs. Patch 3 is a simplified version of 'offwaketime' tool which is described in detail here: http://brendangregg.com/blog/2016-02-01/linux-wakeup-offwake-profiling.html Earlier version of this patch were using save_stack_trace() helper, but 'unreliable' frames add to much noise and two equiavlent stack traces produce different 'stackid's. Using lockdep style of storing frames with MAX_STACK_TRACE_ENTRIES is great for lockdep, but not acceptable for bpf, since the stack_trace map needs to be freed when user Ctrl-C the tool. The ftrace style with per_cpu(struct ftrace_stack) is great, but it's tightly coupled with ftrace ring buffer and has the same 'unreliable' noise. perf_event's perf_callchain() mechanism is also very efficient and it only needed minor generalization which is done in patch 1 to be used by bpf stack_trace maps. Peter, please take a look at patch 1. If you're ok with it, I'd like to take the whole set via net-next. Patch 1 - generalization of perf_callchain() Patch 2 - stack_trace map done as lock-less hashtable without link list to avoid spinlock on insertion which is critical path when bpf_get_stackid() helper is called for every task switch event Patch 3 - offwaketime example After the patch the 'perf report' for artificial 'sched_bench' benchmark that doing pthread_cond_wait/signal and 'offwaketime' example is running in the background: 16.35% swapper [kernel.vmlinux] [k] intel_idle 2.18% sched_bench [kernel.vmlinux] [k] __switch_to 2.18% sched_bench libpthread-2.12.so [.] pthread_cond_signal@@GLIBC_2.3.2 1.72% sched_bench libpthread-2.12.so [.] pthread_mutex_unlock 1.53% sched_bench [kernel.vmlinux] [k] bpf_get_stackid 1.44% sched_bench [kernel.vmlinux] [k] entry_SYSCALL_64 1.39% sched_bench [kernel.vmlinux] [k] __call_rcu.constprop.73 1.13% sched_bench libpthread-2.12.so [.] pthread_mutex_lock 1.07% sched_bench libpthread-2.12.so [.] pthread_cond_wait@@GLIBC_2.3.2 1.07% sched_bench [kernel.vmlinux] [k] hash_futex 1.05% sched_bench [kernel.vmlinux] [k] do_futex 1.05% sched_bench [kernel.vmlinux] [k] get_futex_key_refs.isra.13 The hotest part of bpf_get_stackid() is inlined jhash2, so we may consider using some faster hash in the future, but it's good enough for now. ==================== Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'include')
-rw-r--r--include/linux/bpf.h1
-rw-r--r--include/linux/perf_event.h13
-rw-r--r--include/uapi/linux/bpf.h21
3 files changed, 33 insertions, 2 deletions
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 90ee6ab24bc5..0cadbb7456c0 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -237,6 +237,7 @@ extern const struct bpf_func_proto bpf_get_current_uid_gid_proto;
extern const struct bpf_func_proto bpf_get_current_comm_proto;
extern const struct bpf_func_proto bpf_skb_vlan_push_proto;
extern const struct bpf_func_proto bpf_skb_vlan_pop_proto;
+extern const struct bpf_func_proto bpf_get_stackid_proto;
/* Shared helpers among cBPF and eBPF. */
void bpf_user_rnd_init_once(void);
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index b35a61a481fa..7da3c25999df 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -964,11 +964,20 @@ DECLARE_PER_CPU(struct perf_callchain_entry, perf_callchain_entry);
extern void perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs);
extern void perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs);
+extern struct perf_callchain_entry *
+get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
+ bool crosstask, bool add_mark);
+extern int get_callchain_buffers(void);
+extern void put_callchain_buffers(void);
-static inline void perf_callchain_store(struct perf_callchain_entry *entry, u64 ip)
+static inline int perf_callchain_store(struct perf_callchain_entry *entry, u64 ip)
{
- if (entry->nr < PERF_MAX_STACK_DEPTH)
+ if (entry->nr < PERF_MAX_STACK_DEPTH) {
entry->ip[entry->nr++] = ip;
+ return 0;
+ } else {
+ return -1; /* no more room, stop walking the stack */
+ }
}
extern int sysctl_perf_event_paranoid;
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 2ee0fde1bf96..d3e77da8e9e8 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -83,6 +83,7 @@ enum bpf_map_type {
BPF_MAP_TYPE_PERF_EVENT_ARRAY,
BPF_MAP_TYPE_PERCPU_HASH,
BPF_MAP_TYPE_PERCPU_ARRAY,
+ BPF_MAP_TYPE_STACK_TRACE,
};
enum bpf_prog_type {
@@ -272,6 +273,20 @@ enum bpf_func_id {
*/
BPF_FUNC_perf_event_output,
BPF_FUNC_skb_load_bytes,
+
+ /**
+ * bpf_get_stackid(ctx, map, flags) - walk user or kernel stack and return id
+ * @ctx: struct pt_regs*
+ * @map: pointer to stack_trace map
+ * @flags: bits 0-7 - numer of stack frames to skip
+ * bit 8 - collect user stack instead of kernel
+ * bit 9 - compare stacks by hash only
+ * bit 10 - if two different stacks hash into the same stackid
+ * discard old
+ * other bits - reserved
+ * Return: >= 0 stackid on success or negative error
+ */
+ BPF_FUNC_get_stackid,
__BPF_FUNC_MAX_ID,
};
@@ -294,6 +309,12 @@ enum bpf_func_id {
/* BPF_FUNC_skb_set_tunnel_key and BPF_FUNC_skb_get_tunnel_key flags. */
#define BPF_F_TUNINFO_IPV6 (1ULL << 0)
+/* BPF_FUNC_get_stackid flags. */
+#define BPF_F_SKIP_FIELD_MASK 0xffULL
+#define BPF_F_USER_STACK (1ULL << 8)
+#define BPF_F_FAST_STACK_CMP (1ULL << 9)
+#define BPF_F_REUSE_STACKID (1ULL << 10)
+
/* user accessible mirror of in-kernel sk_buff.
* new fields can only be added to the end of this structure
*/