From 824bd0ce6c7c43a9e1e210abf124958e54d88342 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 1 Feb 2016 22:39:53 -0800 Subject: bpf: introduce BPF_MAP_TYPE_PERCPU_HASH map Introduce BPF_MAP_TYPE_PERCPU_HASH map type which is used to do accurate counters without need to use BPF_XADD instruction which turned out to be too costly for high-performance network monitoring. In the typical use case the 'key' is the flow tuple or other long living object that sees a lot of events per second. bpf_map_lookup_elem() returns per-cpu area. Example: struct { u32 packets; u32 bytes; } * ptr = bpf_map_lookup_elem(&map, &key); /* ptr points to this_cpu area of the value, so the following * increments will not collide with other cpus */ ptr->packets ++; ptr->bytes += skb->len; bpf_update_elem() atomically creates a new element where all per-cpu values are zero initialized and this_cpu value is populated with given 'value'. Note that non-per-cpu hash map always allocates new element and then deletes old after rcu grace period to maintain atomicity of update. Per-cpu hash map updates element values in-place. Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux/bpf.h') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index aa6f8571de13..43ae40c8763e 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -81,6 +81,7 @@ enum bpf_map_type { BPF_MAP_TYPE_ARRAY, BPF_MAP_TYPE_PROG_ARRAY, BPF_MAP_TYPE_PERF_EVENT_ARRAY, + BPF_MAP_TYPE_PERCPU_HASH, }; enum bpf_prog_type { -- cgit v1.2.3 From a10423b87a7eae75da79ce80a8d9475047a674ee Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 1 Feb 2016 22:39:54 -0800 Subject: bpf: introduce BPF_MAP_TYPE_PERCPU_ARRAY map Primary use case is a histogram array of latency where bpf program computes the latency of block requests or other events and stores histogram of latency into array of 64 elements. All cpus are constantly running, so normal increment is not accurate, bpf_xadd causes cache ping-pong and this per-cpu approach allows fastest collision-free counters. Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux/bpf.h') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 43ae40c8763e..2ee0fde1bf96 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -82,6 +82,7 @@ enum bpf_map_type { BPF_MAP_TYPE_PROG_ARRAY, BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_MAP_TYPE_PERCPU_HASH, + BPF_MAP_TYPE_PERCPU_ARRAY, }; enum bpf_prog_type { -- cgit v1.2.3 From d5a3b1f691865be576c2bffa708549b8cdccda19 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 17 Feb 2016 19:58:58 -0800 Subject: bpf: introduce BPF_MAP_TYPE_STACK_TRACE add new map type to store stack traces and corresponding helper bpf_get_stackid(ctx, map, flags) - walk user or kernel stack and return id @ctx: struct pt_regs* @map: pointer to stack_trace map @flags: bits 0-7 - numer of stack frames to skip bit 8 - collect user stack instead of kernel bit 9 - compare stacks by hash only bit 10 - if two different stacks hash into the same stackid discard old other bits - reserved Return: >= 0 stackid on success or negative error stackid is a 32-bit integer handle that can be further combined with other data (including other stackid) and used as a key into maps. Userspace will access stackmap using standard lookup/delete syscall commands to retrieve full stack trace for given stackid. Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'include/uapi/linux/bpf.h') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 2ee0fde1bf96..d3e77da8e9e8 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -83,6 +83,7 @@ enum bpf_map_type { BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_MAP_TYPE_PERCPU_HASH, BPF_MAP_TYPE_PERCPU_ARRAY, + BPF_MAP_TYPE_STACK_TRACE, }; enum bpf_prog_type { @@ -272,6 +273,20 @@ enum bpf_func_id { */ BPF_FUNC_perf_event_output, BPF_FUNC_skb_load_bytes, + + /** + * bpf_get_stackid(ctx, map, flags) - walk user or kernel stack and return id + * @ctx: struct pt_regs* + * @map: pointer to stack_trace map + * @flags: bits 0-7 - numer of stack frames to skip + * bit 8 - collect user stack instead of kernel + * bit 9 - compare stacks by hash only + * bit 10 - if two different stacks hash into the same stackid + * discard old + * other bits - reserved + * Return: >= 0 stackid on success or negative error + */ + BPF_FUNC_get_stackid, __BPF_FUNC_MAX_ID, }; @@ -294,6 +309,12 @@ enum bpf_func_id { /* BPF_FUNC_skb_set_tunnel_key and BPF_FUNC_skb_get_tunnel_key flags. */ #define BPF_F_TUNINFO_IPV6 (1ULL << 0) +/* BPF_FUNC_get_stackid flags. */ +#define BPF_F_SKIP_FIELD_MASK 0xffULL +#define BPF_F_USER_STACK (1ULL << 8) +#define BPF_F_FAST_STACK_CMP (1ULL << 9) +#define BPF_F_REUSE_STACKID (1ULL << 10) + /* user accessible mirror of in-kernel sk_buff. * new fields can only be added to the end of this structure */ -- cgit v1.2.3 From 7d672345ed295b1356a5d9f7111da1d1d7d65867 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 19 Feb 2016 23:05:23 +0100 Subject: bpf: add generic bpf_csum_diff helper For L4 checksums, we currently have bpf_l4_csum_replace() helper. It's currently limited to handle 2 and 4 byte changes in a header and feeds the from/to into inet_proto_csum_replace{2,4}() helpers of the kernel. When working with IPv6, for example, this makes it rather cumbersome to deal with, similarly when editing larger parts of a header. Instead, extend the API in a more generic way: For bpf_l4_csum_replace(), add a case for header field mask of 0 to change the checksum at a given offset through inet_proto_csum_replace_by_diff(), and provide a helper bpf_csum_diff() that can generically calculate a from/to diff for arbitrary amounts of data. This can be used in multiple ways: for the bpf_l4_csum_replace() only part, this even provides us with the option to insert precalculated diffs from user space f.e. from a map, or from bpf_csum_diff() during runtime. bpf_csum_diff() has a optional from/to stack buffer input, so we can calculate a diff by using a scratchbuffer for scenarios where we're inserting (from is NULL), removing (to is NULL) or diffing (from/to buffers don't need to be of equal size) data. Also, bpf_csum_diff() allows to feed a previous csum into csum_partial(), so the function can also be cascaded. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/uapi/linux/bpf.h') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index d3e77da8e9e8..48d0a6c54609 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -287,6 +287,17 @@ enum bpf_func_id { * Return: >= 0 stackid on success or negative error */ BPF_FUNC_get_stackid, + + /** + * bpf_csum_diff(from, from_size, to, to_size, seed) - calculate csum diff + * @from: raw from buffer + * @from_size: length of from buffer + * @to: raw to buffer + * @to_size: length of to buffer + * @seed: optional seed + * Return: csum result + */ + BPF_FUNC_csum_diff, __BPF_FUNC_MAX_ID, }; -- cgit v1.2.3 From 2f72959a9c1260ade234f353ccca91118151af66 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 19 Feb 2016 23:05:26 +0100 Subject: bpf: fix csum update in bpf_l4_csum_replace helper for udp When using this helper for updating UDP checksums, we need to extend this in order to write CSUM_MANGLED_0 for csum computations that result into 0 as sum. Reason we need this is because packets with a checksum could otherwise become incorrectly marked as a packet without a checksum. Likewise, if the user indicates BPF_F_MARK_MANGLED_0, then we should not turn packets without a checksum into ones with a checksum. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux/bpf.h') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 48d0a6c54609..6496f98d3d68 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -313,6 +313,7 @@ enum bpf_func_id { /* BPF_FUNC_l4_csum_replace flags. */ #define BPF_F_PSEUDO_HDR (1ULL << 4) +#define BPF_F_MARK_MANGLED_0 (1ULL << 5) /* BPF_FUNC_clone_redirect and BPF_FUNC_redirect flags. */ #define BPF_F_INGRESS (1ULL << 0) -- cgit v1.2.3 From 8afd54c87ad7089734ef0527937a256586ba828a Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 4 Mar 2016 15:15:03 +0100 Subject: bpf: add flags to bpf_skb_store_bytes for clearing hash When overwriting parts of the packet with bpf_skb_store_bytes() that were fed previously into skb->hash calculation, we should clear the current hash with skb_clear_hash(), so that a next skb_get_hash() call can determine the correct hash related to this skb. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux/bpf.h') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index ee2193287cbe..2e3e90309904 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -305,6 +305,7 @@ enum bpf_func_id { /* BPF_FUNC_skb_store_bytes flags. */ #define BPF_F_RECOMPUTE_CSUM (1ULL << 0) +#define BPF_F_INVALIDATE_HASH (1ULL << 1) /* BPF_FUNC_l3_csum_replace and BPF_FUNC_l4_csum_replace flags. * First 4 bits are for passing the header field size. -- cgit v1.2.3 From 2208087061c4ad88de188911367effc550144836 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 4 Mar 2016 15:15:05 +0100 Subject: bpf: allow to propagate df in bpf_skb_set_tunnel_key Added by 9a628224a61b ("ip_tunnel: Add dont fragment flag."), allow to feed df flag into tunneling facilities (currently supported on TX by vxlan, geneve and gre) as a hint from eBPF's bpf_skb_set_tunnel_key() helper. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux/bpf.h') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 2e3e90309904..21ee6d52016f 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -330,6 +330,7 @@ enum bpf_func_id { /* BPF_FUNC_skb_set_tunnel_key flags. */ #define BPF_F_ZERO_CSUM_TX (1ULL << 1) +#define BPF_F_DONT_FRAGMENT (1ULL << 2) /* user accessible mirror of in-kernel sk_buff. * new fields can only be added to the end of this structure -- cgit v1.2.3 From 14ca0751c96f8d3d0f52e8ed3b3236f8b34d3460 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 4 Mar 2016 15:15:06 +0100 Subject: bpf: support for access to tunnel options After eBPF being able to programmatically access/manage tunnel key meta data via commit d3aa45ce6b94 ("bpf: add helpers to access tunnel metadata") and more recently also for IPv6 through c6c33454072f ("bpf: support ipv6 for bpf_skb_{set,get}_tunnel_key"), this work adds two complementary helpers to generically access their auxiliary tunnel options. Geneve and vxlan support this facility. For geneve, TLVs can be pushed, and for the vxlan case its GBP extension. I.e. setting tunnel key for geneve case only makes sense, if we can also read/write TLVs into it. In the GBP case, it provides the flexibility to easily map the group policy ID in combination with other helpers or maps. I chose to model this as two separate helpers, bpf_skb_{set,get}_tunnel_opt(), for a couple of reasons. bpf_skb_{set,get}_tunnel_key() is already rather complex by itself, and there may be cases for tunnel key backends where tunnel options are not always needed. If we would have integrated this into bpf_skb_{set,get}_tunnel_key() nevertheless, we are very limited with remaining helper arguments, so keeping compatibility on structs in case of passing in a flat buffer gets more cumbersome. Separating both also allows for more flexibility and future extensibility, f.e. options could be fed directly from a map, etc. Moreover, change geneve's xmit path to test only for info->options_len instead of TUNNEL_GENEVE_OPT flag. This makes it more consistent with vxlan's xmit path and allows for avoiding to specify a protocol flag in the API on xmit, so it can be protocol agnostic. Having info->options_len is enough information that is needed. Tested with vxlan and geneve. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/uapi/linux/bpf.h') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 21ee6d52016f..9221f653fee3 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -298,6 +298,17 @@ enum bpf_func_id { * Return: csum result */ BPF_FUNC_csum_diff, + + /** + * bpf_skb_[gs]et_tunnel_opt(skb, opt, size) + * retrieve or populate tunnel options metadata + * @skb: pointer to skb + * @opt: pointer to raw tunnel option data + * @size: size of @opt + * Return: 0 on success for set, option size for get + */ + BPF_FUNC_skb_get_tunnel_opt, + BPF_FUNC_skb_set_tunnel_opt, __BPF_FUNC_MAX_ID, }; -- cgit v1.2.3 From 6c90598174322b8888029e40dd84a4eb01f56afe Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 7 Mar 2016 21:57:15 -0800 Subject: bpf: pre-allocate hash map elements If kprobe is placed on spin_unlock then calling kmalloc/kfree from bpf programs is not safe, since the following dead lock is possible: kfree->spin_lock(kmem_cache_node->lock)...spin_unlock->kprobe-> bpf_prog->map_update->kmalloc->spin_lock(of the same kmem_cache_node->lock) and deadlocks. The following solutions were considered and some implemented, but eventually discarded - kmem_cache_create for every map - add recursion check to slow-path of slub - use reserved memory in bpf_map_update for in_irq or in preempt_disabled - kmalloc via irq_work At the end pre-allocation of all map elements turned out to be the simplest solution and since the user is charged upfront for all the memory, such pre-allocation doesn't affect the user space visible behavior. Since it's impossible to tell whether kprobe is triggered in a safe location from kmalloc point of view, use pre-allocation by default and introduce new BPF_F_NO_PREALLOC flag. While testing of per-cpu hash maps it was discovered that alloc_percpu(GFP_ATOMIC) has odd corner cases and often fails to allocate memory even when 90% of it is free. The pre-allocation of per-cpu hash elements solves this problem as well. Turned out that bpf_map_update() quickly followed by bpf_map_lookup()+bpf_map_delete() is very common pattern used in many of iovisor/bcc/tools, so there is additional benefit of pre-allocation, since such use cases are must faster. Since all hash map elements are now pre-allocated we can remove atomic increment of htab->count and save few more cycles. Also add bpf_map_precharge_memlock() to check rlimit_memlock early to avoid large malloc/free done by users who don't have sufficient limits. Pre-allocation is done with vmalloc and alloc/free is done via percpu_freelist. Here are performance numbers for different pre-allocation algorithms that were implemented, but discarded in favor of percpu_freelist: 1 cpu: pcpu_ida 2.1M pcpu_ida nolock 2.3M bt 2.4M kmalloc 1.8M hlist+spinlock 2.3M pcpu_freelist 2.6M 4 cpu: pcpu_ida 1.5M pcpu_ida nolock 1.8M bt w/smp_align 1.7M bt no/smp_align 1.1M kmalloc 0.7M hlist+spinlock 0.2M pcpu_freelist 2.0M 8 cpu: pcpu_ida 0.7M bt w/smp_align 0.8M kmalloc 0.4M pcpu_freelist 1.5M 32 cpu: kmalloc 0.13M pcpu_freelist 0.49M pcpu_ida nolock is a modified percpu_ida algorithm without percpu_ida_cpu locks and without cross-cpu tag stealing. It's faster than existing percpu_ida, but not as fast as pcpu_freelist. bt is a variant of block/blk-mq-tag.c simlified and customized for bpf use case. bt w/smp_align is using cache line for every 'long' (similar to blk-mq-tag). bt no/smp_align allocates 'long' bitmasks continuously to save memory. It's comparable to percpu_ida and in some cases faster, but slower than percpu_freelist hlist+spinlock is the simplest free list with single spinlock. As expeceted it has very bad scaling in SMP. kmalloc is existing implementation which is still available via BPF_F_NO_PREALLOC flag. It's significantly slower in single cpu and in 8 cpu setup it's 3 times slower than pre-allocation with pcpu_freelist, but saves memory, so in cases where map->max_entries can be large and number of map update/delete per second is low, it may make sense to use it. Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/uapi/linux/bpf.h') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 9221f653fee3..0e30b19012a5 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -101,12 +101,15 @@ enum bpf_prog_type { #define BPF_NOEXIST 1 /* create new element if it didn't exist */ #define BPF_EXIST 2 /* update existing element */ +#define BPF_F_NO_PREALLOC (1U << 0) + union bpf_attr { struct { /* anonymous struct used by BPF_MAP_CREATE command */ __u32 map_type; /* one of enum bpf_map_type */ __u32 key_size; /* size of key in bytes */ __u32 value_size; /* size of value in bytes */ __u32 max_entries; /* max number of entries in a map */ + __u32 map_flags; /* prealloc or not */ }; struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */ -- cgit v1.2.3 From 4018ab1875e0d00b84ac61bc15427136ad55849e Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 9 Mar 2016 03:00:05 +0100 Subject: bpf: support flow label for bpf_skb_{set, get}_tunnel_key This patch extends bpf_tunnel_key with a tunnel_label member, that maps to ip_tunnel_key's label so underlying backends like vxlan and geneve can propagate the label to udp_tunnel6_xmit_skb(), where it's being set in the IPv6 header. It allows for having 20 more bits to encode/decode flow related meta information programmatically. Tested with vxlan and geneve. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux/bpf.h') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 0e30b19012a5..924f537183fd 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -375,6 +375,7 @@ struct bpf_tunnel_key { }; __u8 tunnel_tos; __u8 tunnel_ttl; + __u32 tunnel_label; }; #endif /* _UAPI__LINUX_BPF_H__ */ -- cgit v1.2.3