From f1a66f85b74c5ef7b503f746ea97742dacd56419 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sun, 1 Mar 2015 12:31:43 +0100 Subject: ebpf: export BPF_PSEUDO_MAP_FD to uapi We need to export BPF_PSEUDO_MAP_FD to user space, as it's used in the ELF BPF loader where instructions are being loaded that need map fixups. An initial stage loads all maps into the kernel, and later on replaces related instructions in the eBPF blob with BPF_PSEUDO_MAP_FD as source register and the actual fd as immediate value. The kernel verifier recognizes this keyword and replaces the map fd with a real pointer internally. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/uapi/linux/bpf.h') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 45da7ec7d274..0248180bf2e2 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -120,6 +120,8 @@ enum bpf_prog_type { BPF_PROG_TYPE_SOCKET_FILTER, }; +#define BPF_PSEUDO_MAP_FD 1 + /* flags for BPF_MAP_UPDATE_ELEM command */ #define BPF_ANY 0 /* create new element or update existing */ #define BPF_NOEXIST 1 /* create new element if it didn't exist */ -- cgit v1.2.3 From 96be4325f443dbbfeb37d2a157675ac0736531a1 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sun, 1 Mar 2015 12:31:46 +0100 Subject: ebpf: add sched_cls_type and map it to sk_filter's verifier ops As discussed recently and at netconf/netdev01, we want to prevent making bpf_verifier_ops registration available for modules, but have them at a controlled place inside the kernel instead. The reason for this is, that out-of-tree modules can go crazy and define and register any verfifier ops they want, doing all sorts of crap, even bypassing available GPLed eBPF helper functions. We don't want to offer such a shiny playground, of course, but keep strict control to ourselves inside the core kernel. This also encourages us to design eBPF user helpers carefully and generically, so they can be shared among various subsystems using eBPF. For the eBPF traffic classifier (cls_bpf), it's a good start to share the same helper facilities as we currently do in eBPF for socket filters. That way, we have BPF_PROG_TYPE_SCHED_CLS look like it's own type, thus one day if there's a good reason to diverge the set of helper functions from the set available to socket filters, we keep ABI compatibility. In future, we could place all bpf_prog_type_list at a central place, perhaps. Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux/bpf.h') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 0248180bf2e2..3fa1af8a58d7 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -118,6 +118,7 @@ enum bpf_map_type { enum bpf_prog_type { BPF_PROG_TYPE_UNSPEC, BPF_PROG_TYPE_SOCKET_FILTER, + BPF_PROG_TYPE_SCHED_CLS, }; #define BPF_PSEUDO_MAP_FD 1 -- cgit v1.2.3 From 03e69b508b6f7c51743055c9f61d1dfeadf4b635 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 14 Mar 2015 02:27:16 +0100 Subject: ebpf: add prandom helper for packet sampling This work is similar to commit 4cd3675ebf74 ("filter: added BPF random opcode") and adds a possibility for packet sampling in eBPF. Currently, this is only possible in classic BPF and useful to combine sampling with f.e. packet sockets, possible also with tc. Example function proto-type looks like: u32 (*prandom_u32)(void) = (void *)BPF_FUNC_get_prandom_u32; Signed-off-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux/bpf.h') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 3fa1af8a58d7..1c2ca2b477c8 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -165,6 +165,7 @@ enum bpf_func_id { BPF_FUNC_map_lookup_elem, /* void *map_lookup_elem(&map, &key) */ BPF_FUNC_map_update_elem, /* int map_update_elem(&map, &key, &value, flags) */ BPF_FUNC_map_delete_elem, /* int map_delete_elem(&map, &key) */ + BPF_FUNC_get_prandom_u32, /* u32 prandom_u32(void) */ __BPF_FUNC_MAX_ID, }; -- cgit v1.2.3 From c04167ce2ca0ecaeaafef006cb0d65cf01b68e42 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 14 Mar 2015 02:27:17 +0100 Subject: ebpf: add helper for obtaining current processor id This patch adds the possibility to obtain raw_smp_processor_id() in eBPF. Currently, this is only possible in classic BPF where commit da2033c28226 ("filter: add SKF_AD_RXHASH and SKF_AD_CPU") has added facilities for this. Perhaps most importantly, this would also allow us to track per CPU statistics with eBPF maps, or to implement a poor-man's per CPU data structure through eBPF maps. Example function proto-type looks like: u32 (*smp_processor_id)(void) = (void *)BPF_FUNC_get_smp_processor_id; Signed-off-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux/bpf.h') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 1c2ca2b477c8..de1f63668daf 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -166,6 +166,7 @@ enum bpf_func_id { BPF_FUNC_map_update_elem, /* int map_update_elem(&map, &key, &value, flags) */ BPF_FUNC_map_delete_elem, /* int map_delete_elem(&map, &key) */ BPF_FUNC_get_prandom_u32, /* u32 prandom_u32(void) */ + BPF_FUNC_get_smp_processor_id, /* u32 raw_smp_processor_id(void) */ __BPF_FUNC_MAX_ID, }; -- cgit v1.2.3 From 9bac3d6d548e5cc925570b263f35b70a00a00ffd Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 13 Mar 2015 11:57:42 -0700 Subject: bpf: allow extended BPF programs access skb fields introduce user accessible mirror of in-kernel 'struct sk_buff': struct __sk_buff { __u32 len; __u32 pkt_type; __u32 mark; __u32 queue_mapping; }; bpf programs can do: int bpf_prog(struct __sk_buff *skb) { __u32 var = skb->pkt_type; which will be compiled to bpf assembler as: dst_reg = *(u32 *)(src_reg + 4) // 4 == offsetof(struct __sk_buff, pkt_type) bpf verifier will check validity of access and will convert it to: dst_reg = *(u8 *)(src_reg + offsetof(struct sk_buff, __pkt_type_offset)) dst_reg &= 7 since skb->pkt_type is a bitfield. Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/uapi/linux/bpf.h') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index de1f63668daf..929545a27546 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -170,4 +170,14 @@ enum bpf_func_id { __BPF_FUNC_MAX_ID, }; +/* user accessible mirror of in-kernel sk_buff. + * new fields can only be added to the end of this structure + */ +struct __sk_buff { + __u32 len; + __u32 pkt_type; + __u32 mark; + __u32 queue_mapping; +}; + #endif /* _UAPI__LINUX_BPF_H__ */ -- cgit v1.2.3 From c24973957975403521ca76a776c2dfd12fbe9add Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 16 Mar 2015 18:06:02 -0700 Subject: bpf: allow BPF programs access 'protocol' and 'vlan_tci' fields as a follow on to patch 70006af95515 ("bpf: allow eBPF access skb fields") this patch allows 'protocol' and 'vlan_tci' fields to be accessible from extended BPF programs. The usage of 'protocol', 'vlan_present' and 'vlan_tci' fields is the same as corresponding SKF_AD_PROTOCOL, SKF_AD_VLAN_TAG_PRESENT and SKF_AD_VLAN_TAG accesses in classic BPF. Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/uapi/linux/bpf.h') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 929545a27546..1623047af463 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -178,6 +178,9 @@ struct __sk_buff { __u32 pkt_type; __u32 mark; __u32 queue_mapping; + __u32 protocol; + __u32 vlan_present; + __u32 vlan_tci; }; #endif /* _UAPI__LINUX_BPF_H__ */ -- cgit v1.2.3 From 94caee8c312d96522bcdae88791aaa9ebcd5f22c Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 20 Mar 2015 15:11:11 +0100 Subject: ebpf: add sched_act_type and map it to sk_filter's verifier ops In order to prepare eBPF support for tc action, we need to add sched_act_type, so that the eBPF verifier is aware of what helper function act_bpf may use, that it can load skb data and read out currently available skb fields. This is bascially analogous to 96be4325f443 ("ebpf: add sched_cls_type and map it to sk_filter's verifier ops"). BPF_PROG_TYPE_SCHED_CLS and BPF_PROG_TYPE_SCHED_ACT need to be separate since both will have a different set of functionality in future (classifier vs action), thus we won't run into ABI troubles when the point in time comes to diverge functionality from the classifier. The future plan for act_bpf would be that it will be able to write into skb->data and alter selected fields mirrored in struct __sk_buff. For an initial support, it's sufficient to map it to sk_filter_ops. Signed-off-by: Daniel Borkmann Cc: Jiri Pirko Reviewed-by: Jiri Pirko Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux/bpf.h') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 1623047af463..3dd314a45d0d 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -119,6 +119,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_UNSPEC, BPF_PROG_TYPE_SOCKET_FILTER, BPF_PROG_TYPE_SCHED_CLS, + BPF_PROG_TYPE_SCHED_ACT, }; #define BPF_PSEUDO_MAP_FD 1 -- cgit v1.2.3 From 27cd5452476978283decb19e429e81fc6c71e74b Mon Sep 17 00:00:00 2001 From: Michal Sekletar Date: Tue, 24 Mar 2015 14:48:41 +0100 Subject: filter: introduce SKF_AD_VLAN_TPID BPF extension If vlan offloading takes place then vlan header is removed from frame and its contents, both vlan_tci and vlan_proto, is available to user space via TPACKET interface. However, only vlan_tci can be used in BPF filters. This commit introduces a new BPF extension. It makes possible to load the value of vlan_proto (vlan TPID) to register A. Support for classic BPF and eBPF is being added, analogous to skb->protocol. Cc: Daniel Borkmann Cc: Alexei Starovoitov Cc: Jiri Pirko Signed-off-by: Michal Sekletar Acked-by: Daniel Borkmann Acked-by: Alexei Starovoitov Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux/bpf.h') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 3dd314a45d0d..27dc4ec58840 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -182,6 +182,7 @@ struct __sk_buff { __u32 protocol; __u32 vlan_present; __u32 vlan_tci; + __u32 vlan_proto; }; #endif /* _UAPI__LINUX_BPF_H__ */ -- cgit v1.2.3 From 608cd71a9c7c9db76e78a792c5a4101e12fea43f Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 26 Mar 2015 19:53:57 -0700 Subject: tc: bpf: generalize pedit action existing TC action 'pedit' can munge any bits of the packet. Generalize it for use in bpf programs attached as cls_bpf and act_bpf via bpf_skb_store_bytes() helper function. Signed-off-by: Alexei Starovoitov Reviewed-by: Jiri Pirko Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux/bpf.h') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 27dc4ec58840..74aab6e0d964 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -168,6 +168,7 @@ enum bpf_func_id { BPF_FUNC_map_delete_elem, /* int map_delete_elem(&map, &key) */ BPF_FUNC_get_prandom_u32, /* u32 prandom_u32(void) */ BPF_FUNC_get_smp_processor_id, /* u32 raw_smp_processor_id(void) */ + BPF_FUNC_skb_store_bytes, /* int skb_store_bytes(skb, offset, from, len) */ __BPF_FUNC_MAX_ID, }; -- cgit v1.2.3 From 2541517c32be2531e0da59dfd7efc1ce844644f5 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 25 Mar 2015 12:49:20 -0700 Subject: tracing, perf: Implement BPF programs attached to kprobes BPF programs, attached to kprobes, provide a safe way to execute user-defined BPF byte-code programs without being able to crash or hang the kernel in any way. The BPF engine makes sure that such programs have a finite execution time and that they cannot break out of their sandbox. The user interface is to attach to a kprobe via the perf syscall: struct perf_event_attr attr = { .type = PERF_TYPE_TRACEPOINT, .config = event_id, ... }; event_fd = perf_event_open(&attr,...); ioctl(event_fd, PERF_EVENT_IOC_SET_BPF, prog_fd); 'prog_fd' is a file descriptor associated with BPF program previously loaded. 'event_id' is an ID of the kprobe created. Closing 'event_fd': close(event_fd); ... automatically detaches BPF program from it. BPF programs can call in-kernel helper functions to: - lookup/update/delete elements in maps - probe_read - wraper of probe_kernel_read() used to access any kernel data structures BPF programs receive 'struct pt_regs *' as an input ('struct pt_regs' is architecture dependent) and return 0 to ignore the event and 1 to store kprobe event into the ring buffer. Note, kprobes are a fundamentally _not_ a stable kernel ABI, so BPF programs attached to kprobes must be recompiled for every kernel version and user must supply correct LINUX_VERSION_CODE in attr.kern_version during bpf_prog_load() call. Signed-off-by: Alexei Starovoitov Reviewed-by: Steven Rostedt Reviewed-by: Masami Hiramatsu Cc: Andrew Morton Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Daniel Borkmann Cc: David S. Miller Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1427312966-8434-4-git-send-email-ast@plumgrid.com Signed-off-by: Ingo Molnar --- include/uapi/linux/bpf.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/uapi/linux/bpf.h') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 45da7ec7d274..b2948feeb70b 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -118,6 +118,7 @@ enum bpf_map_type { enum bpf_prog_type { BPF_PROG_TYPE_UNSPEC, BPF_PROG_TYPE_SOCKET_FILTER, + BPF_PROG_TYPE_KPROBE, }; /* flags for BPF_MAP_UPDATE_ELEM command */ @@ -151,6 +152,7 @@ union bpf_attr { __u32 log_level; /* verbosity level of verifier */ __u32 log_size; /* size of user buffer */ __aligned_u64 log_buf; /* user supplied buffer */ + __u32 kern_version; /* checked when prog_type=kprobe */ }; } __attribute__((aligned(8))); @@ -162,6 +164,7 @@ enum bpf_func_id { BPF_FUNC_map_lookup_elem, /* void *map_lookup_elem(&map, &key) */ BPF_FUNC_map_update_elem, /* int map_update_elem(&map, &key, &value, flags) */ BPF_FUNC_map_delete_elem, /* int map_delete_elem(&map, &key) */ + BPF_FUNC_probe_read, /* int bpf_probe_read(void *dst, int size, void *src) */ __BPF_FUNC_MAX_ID, }; -- cgit v1.2.3 From d9847d310ab4003725e6ed1822682e24bd406908 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 25 Mar 2015 12:49:21 -0700 Subject: tracing: Allow BPF programs to call bpf_ktime_get_ns() bpf_ktime_get_ns() is used by programs to compute time delta between events or as a timestamp Signed-off-by: Alexei Starovoitov Reviewed-by: Steven Rostedt Cc: Andrew Morton Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Daniel Borkmann Cc: David S. Miller Cc: Jiri Olsa Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1427312966-8434-5-git-send-email-ast@plumgrid.com Signed-off-by: Ingo Molnar --- include/uapi/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux/bpf.h') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index b2948feeb70b..238c6883877b 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -165,6 +165,7 @@ enum bpf_func_id { BPF_FUNC_map_update_elem, /* int map_update_elem(&map, &key, &value, flags) */ BPF_FUNC_map_delete_elem, /* int map_delete_elem(&map, &key) */ BPF_FUNC_probe_read, /* int bpf_probe_read(void *dst, int size, void *src) */ + BPF_FUNC_ktime_get_ns, /* u64 bpf_ktime_get_ns(void) */ __BPF_FUNC_MAX_ID, }; -- cgit v1.2.3 From 9c959c863f8217a2ff3d7c296e8223654d240569 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 25 Mar 2015 12:49:22 -0700 Subject: tracing: Allow BPF programs to call bpf_trace_printk() Debugging of BPF programs needs some form of printk from the program, so let programs call limited trace_printk() with %d %u %x %p modifiers only. Similar to kernel modules, during program load verifier checks whether program is calling bpf_trace_printk() and if so, kernel allocates trace_printk buffers and emits big 'this is debug only' banner. Signed-off-by: Alexei Starovoitov Reviewed-by: Steven Rostedt Cc: Andrew Morton Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Daniel Borkmann Cc: David S. Miller Cc: Jiri Olsa Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1427312966-8434-6-git-send-email-ast@plumgrid.com Signed-off-by: Ingo Molnar --- include/uapi/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux/bpf.h') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 238c6883877b..cc47ef41076a 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -166,6 +166,7 @@ enum bpf_func_id { BPF_FUNC_map_delete_elem, /* int map_delete_elem(&map, &key) */ BPF_FUNC_probe_read, /* int bpf_probe_read(void *dst, int size, void *src) */ BPF_FUNC_ktime_get_ns, /* u64 bpf_ktime_get_ns(void) */ + BPF_FUNC_trace_printk, /* int bpf_trace_printk(const char *fmt, int fmt_size, ...) */ __BPF_FUNC_MAX_ID, }; -- cgit v1.2.3 From bcad57182425426dd4aa14deb27f97acb329f3cd Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 3 Apr 2015 20:52:24 +0200 Subject: ebpf: add skb->priority to offset map for usage in {cls, act}_bpf This adds the ability to read out the skb->priority from an eBPF program, so that it can be taken into account from a tc filter or action for the use-case where the priority is not being used to directly override the filter classification in a qdisc, but to tag traffic otherwise for the classifier; the priority can be assigned from various places incl. user space, in future we may also mangle it from an eBPF program. Signed-off-by: Daniel Borkmann Cc: Alexei Starovoitov Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux/bpf.h') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 74aab6e0d964..0db8580f3cca 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -184,6 +184,7 @@ struct __sk_buff { __u32 vlan_present; __u32 vlan_tci; __u32 vlan_proto; + __u32 priority; }; #endif /* _UAPI__LINUX_BPF_H__ */ -- cgit v1.2.3 From 91bc4822c3d61b9bb7ef66d3b77948a4f9177954 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 1 Apr 2015 17:12:13 -0700 Subject: tc: bpf: add checksum helpers Commit 608cd71a9c7c ("tc: bpf: generalize pedit action") has added the possibility to mangle packet data to BPF programs in the tc pipeline. This patch adds two helpers bpf_l3_csum_replace() and bpf_l4_csum_replace() for fixing up the protocol checksums after the packet mangling. It also adds 'flags' argument to bpf_skb_store_bytes() helper to avoid unnecessary checksum recomputations when BPF programs adjusting l3/l4 checksums and documents all three helpers in uapi header. Moreover, a sample program is added to show how BPF programs can make use of the mangle and csum helpers. Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 38 +++++++++++++++++++++++++++++++++++++- 1 file changed, 37 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux/bpf.h') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 0db8580f3cca..23df3e7f8e7d 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -168,7 +168,43 @@ enum bpf_func_id { BPF_FUNC_map_delete_elem, /* int map_delete_elem(&map, &key) */ BPF_FUNC_get_prandom_u32, /* u32 prandom_u32(void) */ BPF_FUNC_get_smp_processor_id, /* u32 raw_smp_processor_id(void) */ - BPF_FUNC_skb_store_bytes, /* int skb_store_bytes(skb, offset, from, len) */ + + /** + * skb_store_bytes(skb, offset, from, len, flags) - store bytes into packet + * @skb: pointer to skb + * @offset: offset within packet from skb->data + * @from: pointer where to copy bytes from + * @len: number of bytes to store into packet + * @flags: bit 0 - if true, recompute skb->csum + * other bits - reserved + * Return: 0 on success + */ + BPF_FUNC_skb_store_bytes, + + /** + * l3_csum_replace(skb, offset, from, to, flags) - recompute IP checksum + * @skb: pointer to skb + * @offset: offset within packet where IP checksum is located + * @from: old value of header field + * @to: new value of header field + * @flags: bits 0-3 - size of header field + * other bits - reserved + * Return: 0 on success + */ + BPF_FUNC_l3_csum_replace, + + /** + * l4_csum_replace(skb, offset, from, to, flags) - recompute TCP/UDP checksum + * @skb: pointer to skb + * @offset: offset within packet where TCP/UDP checksum is located + * @from: old value of header field + * @to: new value of header field + * @flags: bits 0-3 - size of header field + * bit 4 - is pseudo header + * other bits - reserved + * Return: 0 on success + */ + BPF_FUNC_l4_csum_replace, __BPF_FUNC_MAX_ID, }; -- cgit v1.2.3 From a166151cbe33b53221c24259e4a7201064b3ba79 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 15 Apr 2015 12:55:45 -0700 Subject: bpf: fix bpf helpers to use skb->mac_header relative offsets For the short-term solution, lets fix bpf helper functions to use skb->mac_header relative offsets instead of skb->data in order to get the same eBPF programs with cls_bpf and act_bpf work on ingress and egress qdisc path. We need to ensure that mac_header is set before calling into programs. This is effectively the first option from below referenced discussion. More long term solution for LD_ABS|LD_IND instructions will be more intrusive but also more beneficial than this, and implemented later as it's too risky at this point in time. I.e., we plan to look into the option of moving skb_pull() out of eth_type_trans() and into netif_receive_skb() as has been suggested as second option. Meanwhile, this solution ensures ingress can be used with eBPF, too, and that we won't run into ABI troubles later. For dealing with negative offsets inside eBPF helper functions, we've implemented bpf_skb_clone_unwritable() to test for unwriteable headers. Reference: http://thread.gmane.org/gmane.linux.network/359129/focus=359694 Fixes: 608cd71a9c7c ("tc: bpf: generalize pedit action") Fixes: 91bc4822c3d6 ("tc: bpf: add checksum helpers") Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi/linux/bpf.h') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 5c1cee11f777..a9ebdf5701e8 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -177,7 +177,7 @@ enum bpf_func_id { /** * skb_store_bytes(skb, offset, from, len, flags) - store bytes into packet * @skb: pointer to skb - * @offset: offset within packet from skb->data + * @offset: offset within packet from skb->mac_header * @from: pointer where to copy bytes from * @len: number of bytes to store into packet * @flags: bit 0 - if true, recompute skb->csum -- cgit v1.2.3 From 04fd61ab36ec065e194ab5e74ae34a5240d992bb Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 19 May 2015 16:59:03 -0700 Subject: bpf: allow bpf programs to tail-call other bpf programs introduce bpf_tail_call(ctx, &jmp_table, index) helper function which can be used from BPF programs like: int bpf_prog(struct pt_regs *ctx) { ... bpf_tail_call(ctx, &jmp_table, index); ... } that is roughly equivalent to: int bpf_prog(struct pt_regs *ctx) { ... if (jmp_table[index]) return (*jmp_table[index])(ctx); ... } The important detail that it's not a normal call, but a tail call. The kernel stack is precious, so this helper reuses the current stack frame and jumps into another BPF program without adding extra call frame. It's trivially done in interpreter and a bit trickier in JITs. In case of x64 JIT the bigger part of generated assembler prologue is common for all programs, so it is simply skipped while jumping. Other JITs can do similar prologue-skipping optimization or do stack unwind before jumping into the next program. bpf_tail_call() arguments: ctx - context pointer jmp_table - one of BPF_MAP_TYPE_PROG_ARRAY maps used as the jump table index - index in the jump table Since all BPF programs are idenitified by file descriptor, user space need to populate the jmp_table with FDs of other BPF programs. If jmp_table[index] is empty the bpf_tail_call() doesn't jump anywhere and program execution continues as normal. New BPF_MAP_TYPE_PROG_ARRAY map type is introduced so that user space can populate this jmp_table array with FDs of other bpf programs. Programs can share the same jmp_table array or use multiple jmp_tables. The chain of tail calls can form unpredictable dynamic loops therefore tail_call_cnt is used to limit the number of calls and currently is set to 32. Use cases: Acked-by: Daniel Borkmann ========== - simplify complex programs by splitting them into a sequence of small programs - dispatch routine For tracing and future seccomp the program may be triggered on all system calls, but processing of syscall arguments will be different. It's more efficient to implement them as: int syscall_entry(struct seccomp_data *ctx) { bpf_tail_call(ctx, &syscall_jmp_table, ctx->nr /* syscall number */); ... default: process unknown syscall ... } int sys_write_event(struct seccomp_data *ctx) {...} int sys_read_event(struct seccomp_data *ctx) {...} syscall_jmp_table[__NR_write] = sys_write_event; syscall_jmp_table[__NR_read] = sys_read_event; For networking the program may call into different parsers depending on packet format, like: int packet_parser(struct __sk_buff *skb) { ... parse L2, L3 here ... __u8 ipproto = load_byte(skb, ... offsetof(struct iphdr, protocol)); bpf_tail_call(skb, &ipproto_jmp_table, ipproto); ... default: process unknown protocol ... } int parse_tcp(struct __sk_buff *skb) {...} int parse_udp(struct __sk_buff *skb) {...} ipproto_jmp_table[IPPROTO_TCP] = parse_tcp; ipproto_jmp_table[IPPROTO_UDP] = parse_udp; - for TC use case, bpf_tail_call() allows to implement reclassify-like logic - bpf_map_update_elem/delete calls into BPF_MAP_TYPE_PROG_ARRAY jump table are atomic, so user space can build chains of BPF programs on the fly Implementation details: ======================= - high performance of bpf_tail_call() is the goal. It could have been implemented without JIT changes as a wrapper on top of BPF_PROG_RUN() macro, but with two downsides: . all programs would have to pay performance penalty for this feature and tail call itself would be slower, since mandatory stack unwind, return, stack allocate would be done for every tailcall. . tailcall would be limited to programs running preempt_disabled, since generic 'void *ctx' doesn't have room for 'tail_call_cnt' and it would need to be either global per_cpu variable accessed by helper and by wrapper or global variable protected by locks. In this implementation x64 JIT bypasses stack unwind and jumps into the callee program after prologue. - bpf_prog_array_compatible() ensures that prog_type of callee and caller are the same and JITed/non-JITed flag is the same, since calling JITed program from non-JITed is invalid, since stack frames are different. Similarly calling kprobe type program from socket type program is invalid. - jump table is implemented as BPF_MAP_TYPE_PROG_ARRAY to reuse 'map' abstraction, its user space API and all of verifier logic. It's in the existing arraymap.c file, since several functions are shared with regular array map. Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/uapi/linux/bpf.h') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index a9ebdf5701e8..f0a9af8b4dae 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -113,6 +113,7 @@ enum bpf_map_type { BPF_MAP_TYPE_UNSPEC, BPF_MAP_TYPE_HASH, BPF_MAP_TYPE_ARRAY, + BPF_MAP_TYPE_PROG_ARRAY, }; enum bpf_prog_type { @@ -210,6 +211,15 @@ enum bpf_func_id { * Return: 0 on success */ BPF_FUNC_l4_csum_replace, + + /** + * bpf_tail_call(ctx, prog_array_map, index) - jump into another BPF program + * @ctx: context pointer passed to next program + * @prog_array_map: pointer to map which type is BPF_MAP_TYPE_PROG_ARRAY + * @index: index inside array that selects specific program to run + * Return: 0 on success + */ + BPF_FUNC_tail_call, __BPF_FUNC_MAX_ID, }; -- cgit v1.2.3 From 37e82c2f974b72c9ab49c787ef7b5bb1aec12768 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 27 May 2015 15:30:39 -0700 Subject: bpf: allow BPF programs access skb->skb_iif and skb->dev->ifindex fields classic BPF already exposes skb->dev->ifindex via SKF_AD_IFINDEX extension. Allow eBPF program to access it as well. Note that classic aborts execution of the program if 'skb->dev == NULL' (which is inconvenient for program writers), whereas eBPF returns zero in such case. Also expose the 'skb_iif' field, since programs triggered by redirected packet need to known the original interface index. Summary: __skb->ifindex -> skb->dev->ifindex __skb->ingress_ifindex -> skb->skb_iif Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/uapi/linux/bpf.h') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index f0a9af8b4dae..72f3080afa1e 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -236,6 +236,8 @@ struct __sk_buff { __u32 vlan_tci; __u32 vlan_proto; __u32 priority; + __u32 ingress_ifindex; + __u32 ifindex; }; #endif /* _UAPI__LINUX_BPF_H__ */ -- cgit v1.2.3 From 3896d655f4d491c67d669a15f275a39f713410f8 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 2 Jun 2015 16:03:14 -0700 Subject: bpf: introduce bpf_clone_redirect() helper Allow eBPF programs attached to classifier/actions to call bpf_clone_redirect(skb, ifindex, flags) helper which will mirror or redirect the packet by dynamic ifindex selection from within the program to a target device either at ingress or at egress. Can be used for various scenarios, for example, to load balance skbs into veths, split parts of the traffic to local taps, etc. Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/uapi/linux/bpf.h') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 72f3080afa1e..42aa19abab86 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -220,6 +220,16 @@ enum bpf_func_id { * Return: 0 on success */ BPF_FUNC_tail_call, + + /** + * bpf_clone_redirect(skb, ifindex, flags) - redirect to another netdev + * @skb: pointer to skb + * @ifindex: ifindex of the net device + * @flags: bit 0 - if set, redirect to ingress instead of egress + * other bits - reserved + * Return: 0 on success + */ + BPF_FUNC_clone_redirect, __BPF_FUNC_MAX_ID, }; -- cgit v1.2.3 From d691f9e8d4405c334aa10d556e73c8bf44cb0e01 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 4 Jun 2015 10:11:54 -0700 Subject: bpf: allow programs to write to certain skb fields allow programs read/write skb->mark, tc_index fields and ((struct qdisc_skb_cb *)cb)->data. mark and tc_index are generically useful in TC. cb[0]-cb[4] are primarily used to pass arguments from one program to another called via bpf_tail_call() which can be seen in sockex3_kern.c example. All fields of 'struct __sk_buff' are readable to socket and tc_cls_act progs. mark, tc_index are writeable from tc_cls_act only. cb[0]-cb[4] are writeable by both sockets and tc_cls_act. Add verifier tests and improve sample code. Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/uapi/linux/bpf.h') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 42aa19abab86..602f05b7a275 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -248,6 +248,8 @@ struct __sk_buff { __u32 priority; __u32 ingress_ifindex; __u32 ifindex; + __u32 tc_index; + __u32 cb[5]; }; #endif /* _UAPI__LINUX_BPF_H__ */ -- cgit v1.2.3 From ffeedafbf0236f03aeb2e8db273b3e5ae5f5bc89 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 12 Jun 2015 19:39:12 -0700 Subject: bpf: introduce current->pid, tgid, uid, gid, comm accessors eBPF programs attached to kprobes need to filter based on current->pid, uid and other fields, so introduce helper functions: u64 bpf_get_current_pid_tgid(void) Return: current->tgid << 32 | current->pid u64 bpf_get_current_uid_gid(void) Return: current_gid << 32 | current_uid bpf_get_current_comm(char *buf, int size_of_buf) stores current->comm into buf They can be used from the programs attached to TC as well to classify packets based on current task fields. Update tracex2 example to print histogram of write syscalls for each process instead of aggregated for all. Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'include/uapi/linux/bpf.h') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 602f05b7a275..29ef6f99e43d 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -230,6 +230,25 @@ enum bpf_func_id { * Return: 0 on success */ BPF_FUNC_clone_redirect, + + /** + * u64 bpf_get_current_pid_tgid(void) + * Return: current->tgid << 32 | current->pid + */ + BPF_FUNC_get_current_pid_tgid, + + /** + * u64 bpf_get_current_uid_gid(void) + * Return: current_gid << 32 | current_uid + */ + BPF_FUNC_get_current_uid_gid, + + /** + * bpf_get_current_comm(char *buf, int size_of_buf) + * stores current->comm into buf + * Return: 0 on success + */ + BPF_FUNC_get_current_comm, __BPF_FUNC_MAX_ID, }; -- cgit v1.2.3