From b0768a86585d4d951a30ff565f19598dbbd67897 Mon Sep 17 00:00:00 2001
From: Toshiaki Makita
Date: Fri, 3 Aug 2018 16:58:09 +0900
Subject: net: Export skb_headers_offset_update

This is needed for veth XDP, which does an skb_copy_expand()-like
operation.

v2:
- Drop the skb_copy_header part because it has already been exported now.

Signed-off-by: Toshiaki Makita
Signed-off-by: Daniel Borkmann
---
 include/linux/skbuff.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 7ebdf158a795..e93b157f526c 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1038,6 +1038,7 @@ static inline struct sk_buff *alloc_skb_fclone(unsigned int size,
 }
 
 struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src);
+void skb_headers_offset_update(struct sk_buff *skb, int off);
 int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask);
 struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t priority);
 void skb_copy_header(struct sk_buff *new, const struct sk_buff *old);
-- cgit v1.2.3

From 0b19cc0a8694d4295383f88bc3441765875a57bc Mon Sep 17 00:00:00 2001
From: Toshiaki Makita
Date: Fri, 3 Aug 2018 16:58:15 +0900
Subject: bpf: Make redirect_info accessible from modules

We are going to add a kern_flags field in redirect_info for kernel
internal use. In order to avoid a function call to access the flags,
make redirect_info accessible from modules. Also, as it is now
non-static, add the prefix bpf_ to redirect_info.

v6:
- Fix sparse warning around EXPORT_SYMBOL.

Signed-off-by: Toshiaki Makita
Signed-off-by: Daniel Borkmann
---
 include/linux/filter.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index c73dd7396886..4717af8b95e6 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -537,6 +537,16 @@ struct sk_msg_buff {
 	struct list_head list;
 };
 
+struct bpf_redirect_info {
+	u32 ifindex;
+	u32 flags;
+	struct bpf_map *map;
+	struct bpf_map *map_to_flush;
+	unsigned long map_owner;
+};
+
+DECLARE_PER_CPU(struct bpf_redirect_info, bpf_redirect_info);
+
 /* Compute the linear packet data range [data, data_end) which
  * will be accessed by various program types (cls_bpf, act_bpf,
  * lwt, ...). Subsystems allowing direct data access must (!)
-- cgit v1.2.3

From 2539650fadbf63a431e76535a9de7bff6ea5e409 Mon Sep 17 00:00:00 2001
From: Toshiaki Makita
Date: Fri, 3 Aug 2018 16:58:16 +0900
Subject: xdp: Helpers for disabling napi_direct of xdp_return_frame

We need some mechanism to disable napi_direct when calling
xdp_return_frame_rx_napi() from some contexts.

When veth gets support for XDP_REDIRECT, it will redirect packets which
are redirected from other devices. On redirection veth will reuse the
xdp_mem_info of the redirection source device to make return_frame
work. But in this case the .ndo_xdp_xmit() called from veth redirection
uses an xdp_mem_info which is not guarded by NAPI, because the
.ndo_xdp_xmit() is not called directly from the rxq which owns the
xdp_mem_info.

This approach introduces a flag in bpf_redirect_info to indicate that
napi_direct should be disabled even when the _rx_napi variant is used,
as well as helper functions to use it. A NAPI handler that wants to use
this flag needs to call xdp_set_return_frame_no_direct() before
processing packets, and call xdp_clear_return_frame_no_direct() after
xdp_do_flush_map() before exiting NAPI.

v4:
- Use bpf_redirect_info for storing the flag instead of xdp_mem_info to
  avoid per-frame copy cost.
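As an illustration of the intended calling pattern, a NAPI poll handler
that redirects frames would wrap its receive loop roughly as follows (a
minimal sketch only; veth_poll() and veth_xdp_rcv() are hypothetical
placeholders, not part of this patch):

  static int veth_poll(struct napi_struct *napi, int budget)
  {
          int done;

          xdp_set_return_frame_no_direct();

          /* Receive and possibly XDP_REDIRECT frames; returned frames
           * must not take the napi_direct path because the xdp_mem_info
           * belongs to another device's rxq.
           */
          done = veth_xdp_rcv(napi, budget);      /* hypothetical */

          xdp_do_flush_map();
          xdp_clear_return_frame_no_direct();

          if (done < budget)
                  napi_complete_done(napi, done);

          return done;
  }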
Signed-off-by: Toshiaki Makita
Signed-off-by: Daniel Borkmann
---
 include/linux/filter.h | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 4717af8b95e6..2b072dab32c0 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -543,10 +543,14 @@ struct bpf_redirect_info {
 	struct bpf_map *map;
 	struct bpf_map *map_to_flush;
 	unsigned long map_owner;
+	u32 kern_flags;
 };
 
 DECLARE_PER_CPU(struct bpf_redirect_info, bpf_redirect_info);
 
+/* flags for bpf_redirect_info kern_flags */
+#define BPF_RI_F_RF_NO_DIRECT	BIT(0)	/* no napi_direct on return_frame */
+
 /* Compute the linear packet data range [data, data_end) which
  * will be accessed by various program types (cls_bpf, act_bpf,
  * lwt, ...). Subsystems allowing direct data access must (!)
@@ -775,6 +779,27 @@ static inline bool bpf_dump_raw_ok(void)
 struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
				       const struct bpf_insn *patch, u32 len);
 
+static inline bool xdp_return_frame_no_direct(void)
+{
+	struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
+
+	return ri->kern_flags & BPF_RI_F_RF_NO_DIRECT;
+}
+
+static inline void xdp_set_return_frame_no_direct(void)
+{
+	struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
+
+	ri->kern_flags |= BPF_RI_F_RF_NO_DIRECT;
+}
+
+static inline void xdp_clear_return_frame_no_direct(void)
+{
+	struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
+
+	ri->kern_flags &= ~BPF_RI_F_RF_NO_DIRECT;
+}
+
 static inline int xdp_ok_fwd_dev(const struct net_device *fwd,
				 unsigned int pktlen)
 {
-- cgit v1.2.3

From 5dc4c4b7d4e8115e7cde96a030f98cb3ab2e458c Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau
Date: Wed, 8 Aug 2018 01:01:24 -0700
Subject: bpf: Introduce BPF_MAP_TYPE_REUSEPORT_SOCKARRAY

This patch introduces a new map type BPF_MAP_TYPE_REUSEPORT_SOCKARRAY.

To unleash the full potential of a bpf prog, it is essential for the
userspace to be capable of directly setting up a bpf map which can then
be consumed by the bpf prog to make decisions. In this case, decide
which SO_REUSEPORT sk to serve the incoming request.

By adding BPF_MAP_TYPE_REUSEPORT_SOCKARRAY, the userspace has total
control and visibility on where a SO_REUSEPORT sk should be located in
a bpf map. The later patch will introduce BPF_PROG_TYPE_SK_REUSEPORT
such that the bpf prog can directly select a sk from the bpf map. That
will raise the programmability of the bpf prog attached to a reuseport
group (a group of sk serving the same IP:PORT).

For example, in UDP, the bpf prog can peek into the payload (e.g.
through the "data" pointer introduced in the later patch) to learn the
application level's connection information and then decide which sk to
pick from a bpf map. The userspace can tightly couple the sk's location
in a bpf map with the application logic in generating the UDP payload's
connection information. This connection info contract/API stays within
the userspace.

Also, when used with map-in-map, the userspace can switch the
old-server-process's inner map to a new-server-process's inner map in
one call "bpf_map_update_elem(outer_map, &index, &new_reuseport_array)".
The bpf prog will then direct incoming requests to the new process
instead of the old process. The old process can finish draining the
pending requests (e.g. by "accept()") before closing the old-fds.
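For illustration, the map-in-map hand-off described above boils down to
a single map update from userspace, roughly like this (a sketch using
the bpf_map_update_elem() wrapper from tools/lib/bpf; the fd variables
and the index slot are hypothetical):

  #include <bpf/bpf.h>    /* bpf_map_update_elem() wrapper; header path may differ */

  /* Assumption: outer_map_fd is an array-of-maps whose values are
   * BPF_MAP_TYPE_REUSEPORT_SOCKARRAY maps, and new_reuseport_array_fd
   * is the fully populated inner map of the new server process.
   */
  static int switch_server_generation(int outer_map_fd, int new_reuseport_array_fd)
  {
          __u32 index = 0;        /* slot the bpf prog looks up; hypothetical */

          /* One update re-points new requests at the new process; the old
           * process keeps accept()ing whatever is already queued.
           */
          return bpf_map_update_elem(outer_map_fd, &index,
                                     &new_reuseport_array_fd, BPF_ANY);
  }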
[Note that deleting a fd from a bpf map does not necessarily mean the
fd is closed.]

During map_update_elem(), only a SO_REUSEPORT sk (i.e. one which has
already been added to a reuse->socks[]) can be used. That means a
SO_REUSEPORT sk that is "bind()"-ed for UDP or "bind()+listen()"-ed for
TCP. These conditions are ensured in "reuseport_array_update_check()".

A SO_REUSEPORT sk can only be added once to a map (i.e. the same sk
cannot be added twice even to the same map). SO_REUSEPORT already
allows another sk to be created for the same IP:PORT. There is no need
to re-create a similar usage in the BPF side.

When a SO_REUSEPORT sk is deleted from the "reuse->socks[]" (e.g.
"close()"), it will notify the bpf map to remove it from the map also.
It is done through "bpf_sk_reuseport_detach()" and it will only be
called if >=1 of the "reuse->socks[]" has ever been added to a bpf map.

The map_update()/map_delete() has to be in-sync with the
"reuse->socks[]". Hence, the same "reuseport_lock" used by
"reuse->socks[]" has to be used here also. Care has been taken to
ensure the lock is only acquired when the adding sk passes some strict
tests, and freeing the map does not require the reuseport_lock.

The reuseport_array will also support lookup from the syscall side. It
will return a sock_gen_cookie(). The sock_gen_cookie() is on-demand
(i.e. a sk's cookie is not generated until the very first
map_lookup_elem()).

The lookup cookie is 64 bits, but it goes against the logical userspace
expectation of a 32-bit sizeof(fd) (as other fd based bpf maps do
also). It may catch users by surprise if we enforce value_size=8 while
userspace still passes a 32-bit fd during update. Supporting different
value_size between lookup and update seems unintuitive also. We also
need to consider what if other existing fd based maps want to return a
64-bit value from syscall's lookup in the future. Hence,
reuseport_array supports both value_size 4 and 8, assuming the user
will usually use value_size=4. The syscall's lookup will return ENOSPC
on value_size=4. It will only return the 64-bit value from
sock_gen_cookie() when the user consciously chooses value_size=8 (as a
signal that lookup is desired), which then requires a 64-bit value in
both lookup and update.
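To make the value_size semantics concrete, a userspace setup could look
like this (a sketch using the tools/lib/bpf syscall wrappers; max_socks,
the key and the error handling are illustrative):

  #include <bpf/bpf.h>    /* bpf_create_map(), bpf_map_*_elem() wrappers */
  #include <stdint.h>

  /* Assumption: listen_fd is a SO_REUSEPORT socket that is already
   * bind()+listen()-ed (TCP) or bind()-ed (UDP).
   */
  static int setup_reuseport_array(int listen_fd, uint32_t max_socks)
  {
          uint64_t fd64 = listen_fd;  /* value_size=8 => 64-bit value on update too */
          uint64_t cookie;
          uint32_t key = 0;
          int map_fd;

          /* value_size=8 signals that syscall-side lookup is desired */
          map_fd = bpf_create_map(BPF_MAP_TYPE_REUSEPORT_SOCKARRAY,
                                  sizeof(uint32_t), sizeof(uint64_t),
                                  max_socks, 0);
          if (map_fd < 0)
                  return -1;

          if (bpf_map_update_elem(map_fd, &key, &fd64, BPF_ANY))
                  return -1;

          /* lookup returns the sk's sock_gen_cookie(), not the fd */
          if (bpf_map_lookup_elem(map_fd, &key, &cookie))
                  return -1;

          return map_fd;
  }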
Signed-off-by: Martin KaFai Lau
Acked-by: Alexei Starovoitov
Signed-off-by: Daniel Borkmann
---
 include/linux/bpf.h       | 28 ++++++++++++++++++++++++++++
 include/linux/bpf_types.h |  3 +++
 2 files changed, 31 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index cd8790d2c6ed..db11662faea6 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -524,6 +524,7 @@ static inline int bpf_map_attr_numa_node(const union bpf_attr *attr)
 }
 
 struct bpf_prog *bpf_prog_get_type_path(const char *name, enum bpf_prog_type type);
+int array_map_alloc_check(union bpf_attr *attr);
 
 #else /* !CONFIG_BPF_SYSCALL */
 static inline struct bpf_prog *bpf_prog_get(u32 ufd)
@@ -769,6 +770,33 @@ static inline void __xsk_map_flush(struct bpf_map *map)
 }
 #endif
 
+#if defined(CONFIG_INET) && defined(CONFIG_BPF_SYSCALL)
+void bpf_sk_reuseport_detach(struct sock *sk);
+int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map, void *key,
+				       void *value);
+int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, void *key,
+				       void *value, u64 map_flags);
+#else
+static inline void bpf_sk_reuseport_detach(struct sock *sk)
+{
+}
+
+#ifdef CONFIG_BPF_SYSCALL
+static inline int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map,
+						     void *key, void *value)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline int bpf_fd_reuseport_array_update_elem(struct bpf_map *map,
+						     void *key, void *value,
+						     u64 map_flags)
+{
+	return -EOPNOTSUPP;
+}
+#endif /* CONFIG_BPF_SYSCALL */
+#endif /* defined(CONFIG_INET) && defined(CONFIG_BPF_SYSCALL) */
+
 /* verifier prototypes for helper functions called from eBPF programs */
 extern const struct bpf_func_proto bpf_map_lookup_elem_proto;
 extern const struct bpf_func_proto bpf_map_update_elem_proto;
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index add08be53b6f..14fd6c02d258 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -60,4 +60,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_CPUMAP, cpu_map_ops)
 #if defined(CONFIG_XDP_SOCKETS)
 BPF_MAP_TYPE(BPF_MAP_TYPE_XSKMAP, xsk_map_ops)
 #endif
+#ifdef CONFIG_INET
+BPF_MAP_TYPE(BPF_MAP_TYPE_REUSEPORT_SOCKARRAY, reuseport_array_ops)
+#endif
 #endif
-- cgit v1.2.3

From 2dbb9b9e6df67d444fbe425c7f6014858d337adf Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau
Date: Wed, 8 Aug 2018 01:01:25 -0700
Subject: bpf: Introduce BPF_PROG_TYPE_SK_REUSEPORT

This patch adds a BPF_PROG_TYPE_SK_REUSEPORT which can select a
SO_REUSEPORT sk from a BPF_MAP_TYPE_REUSEPORT_SOCKARRAY. Like other
non-SK_FILTER/CGROUP_SKB programs, it requires CAP_SYS_ADMIN.

BPF_PROG_TYPE_SK_REUSEPORT introduces "struct sk_reuseport_kern" to
store the bpf context instead of using the skb->cb[48].

At the SO_REUSEPORT sk lookup time, it is in the middle of transiting
from a lower layer (ipv4/ipv6) to an upper layer (udp/tcp). At this
point, it is not always clear where the bpf context can be appended in
the skb->cb[48] to avoid saving-and-restoring cb[]. Even putting aside
the difference between ipv4-vs-ipv6 and udp-vs-tcp, it is not clear
whether the lower layer will only ever be ipv4 and ipv6 in the future,
and whether it will touch the cb[] again before transiting to the
upper layer. For example, in udp_gro_receive(), it uses the 48 byte
NAPI_GRO_CB instead of IP[6]CB and it may still modify the cb[] after
calling the udp[46]_lib_lookup_skb(). Because of the above reason, if
skb->cb is used for the bpf ctx, saving-and-restoring is needed and
likely the whole 48 bytes of cb[] has to be saved and restored.
Instead of saving, setting and restoring the cb[], this patch opts to
create a new "struct sk_reuseport_kern" and set the needed values in
there.

The new BPF_PROG_TYPE_SK_REUSEPORT and "struct sk_reuseport_(kern|md)"
will serve all ipv4/ipv6 + udp/tcp combinations. There is no protocol
specific usage at this point and it is also in line with the current
sock_reuseport.c implementation (i.e. no protocol specific
requirement).

In "struct sk_reuseport_md", this patch exposes data/data_end/len with
semantics similar to other existing usages. Together with
"bpf_skb_load_bytes()" and "bpf_skb_load_bytes_relative()", the bpf
prog can peek anywhere in the skb. The "bind_inany" tells the bpf prog
that the reuseport group is bind-ed to a local INANY address which
cannot be learned from the skb.

The new "bind_inany" is added to "struct sock_reuseport" which will be
used when running the new "BPF_PROG_TYPE_SK_REUSEPORT" bpf prog in
order to avoid repeating the "bind INANY" test on
"sk_v6_rcv_saddr/sk->sk_rcv_saddr" every time a bpf prog is run. It can
only be properly initialized when a "sk->sk_reuseport" enabled sk is
being added to a hashtable (i.e. during "reuseport_alloc()" and
"reuseport_add_sock()").

The new "sk_select_reuseport()" is the main helper that the bpf prog
will use to select a SO_REUSEPORT sk. It is the only function that can
use the new BPF_MAP_TYPE_REUSEPORT_SOCKARRAY. As mentioned in the
earlier patch, the validity of a selected sk is checked at run time in
"sk_select_reuseport()". Doing the check at verification time is
difficult and inflexible (consider the map-in-map use case). The
runtime check is to compare the selected sk's reuseport_id with the
reuseport_id that we want. This helper will return -EXXX if the
selected sk cannot serve the incoming request (e.g. reuseport_id does
not match). The bpf prog can decide if it wants to do SK_DROP at its
discretion.

When the bpf prog returns SK_PASS, the kernel will check if a valid sk
has been selected (i.e. "reuse_kern->selected_sk != NULL"). If it has,
it will use the selected sk. If not, the kernel will select one from
"reuse->socks[]" (as before this patch).

The SK_DROP and SK_PASS handling logic will be in the next patch.

Signed-off-by: Martin KaFai Lau
Acked-by: Alexei Starovoitov
Signed-off-by: Daniel Borkmann
---
 include/linux/bpf_types.h |  3 +++
 include/linux/filter.h    | 15 +++++++++++++++
 2 files changed, 18 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index 14fd6c02d258..cd26c090e7c0 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -29,6 +29,9 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_DEVICE, cg_dev)
 #ifdef CONFIG_BPF_LIRC_MODE2
 BPF_PROG_TYPE(BPF_PROG_TYPE_LIRC_MODE2, lirc_mode2)
 #endif
+#ifdef CONFIG_INET
+BPF_PROG_TYPE(BPF_PROG_TYPE_SK_REUSEPORT, sk_reuseport)
+#endif
 
 BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_ARRAY, percpu_array_map_ops)
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 2b072dab32c0..70e9d57677fe 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -32,6 +32,7 @@ struct seccomp_data;
 struct bpf_prog_aux;
 struct xdp_rxq_info;
 struct xdp_buff;
+struct sock_reuseport;
 
 /* ArgX, context and stack frame pointer register positions. Note,
 * Arg1, Arg2, Arg3, etc are used as argument mappings of function
@@ -833,6 +834,20 @@ void bpf_warn_invalid_xdp_action(u32 act);
 struct sock *do_sk_redirect_map(struct sk_buff *skb);
 struct sock *do_msg_redirect_map(struct sk_msg_buff *md);
 
+#ifdef CONFIG_INET
+struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk,
+				  struct bpf_prog *prog, struct sk_buff *skb,
+				  u32 hash);
+#else
+static inline struct sock *
+bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk,
+		     struct bpf_prog *prog, struct sk_buff *skb,
+		     u32 hash)
+{
+	return NULL;
+}
+#endif
+
 #ifdef CONFIG_BPF_JIT
 extern int bpf_jit_enable;
 extern int bpf_jit_harden;
-- cgit v1.2.3

From 8217ca653ec601246832d562207bc24bdf652d2f Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau
Date: Wed, 8 Aug 2018 01:01:26 -0700
Subject: bpf: Enable BPF_PROG_TYPE_SK_REUSEPORT bpf prog in reuseport selection

This patch allows a BPF_PROG_TYPE_SK_REUSEPORT bpf prog to select a
SO_REUSEPORT sk from a BPF_MAP_TYPE_REUSEPORT_SOCKARRAY introduced in
the earlier patch. "bpf_run_sk_reuseport()" will return -ECONNREFUSED
when the BPF_PROG_TYPE_SK_REUSEPORT prog returns SK_DROP. The callers,
in inet[6]_hashtables.c and ipv[46]/udp.c, are modified to handle this
case and return NULL immediately instead of continuing the sk search
from its hashtable.

It re-uses the existing SO_ATTACH_REUSEPORT_EBPF setsockopt to attach
BPF_PROG_TYPE_SK_REUSEPORT. The "sk_reuseport_attach_bpf()" will check
if the attaching bpf prog is in the new SK_REUSEPORT or the existing
SOCKET_FILTER type and then check different things accordingly.

One level of "__reuseport_attach_prog()" call is removed. The
"sk_unhashed() && ..." and "sk->sk_reuseport_cb" tests are pushed back
to "reuseport_attach_prog()" in sock_reuseport.c. sock_reuseport.c
seems to have more knowledge on those test requirements than filter.c.
In "reuseport_attach_prog()", after new_prog is attached to
reuse->prog, the old_prog (if any) is also directly freed instead of
returning the old_prog to the caller and asking the caller to free.

The sysctl_optmem_max check is moved back to the
"sk_reuseport_attach_filter()" and "sk_reuseport_attach_bpf()". As with
other bpf prog types, the new BPF_PROG_TYPE_SK_REUSEPORT is only
bounded by the usual "bpf_prog_charge_memlock()" during load time
instead of being bounded by both bpf_prog_charge_memlock() and
sysctl_optmem_max.
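For reference, attaching such a program from userspace keeps using the
same setsockopt as the classic reuseport filter (a minimal sketch;
prog_fd is assumed to be a loaded BPF_PROG_TYPE_SK_REUSEPORT program
and sk_fd a socket that already has SO_REUSEPORT set):

  #include <sys/socket.h>

  static int attach_reuseport_prog(int sk_fd, int prog_fd)
  {
          /* sk_reuseport_attach_bpf() now accepts both SOCKET_FILTER and
           * SK_REUSEPORT program types behind this option.
           */
          return setsockopt(sk_fd, SOL_SOCKET, SO_ATTACH_REUSEPORT_EBPF,
                            &prog_fd, sizeof(prog_fd));
  }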
Signed-off-by: Martin KaFai Lau
Acked-by: Alexei Starovoitov
Signed-off-by: Daniel Borkmann
---
 include/linux/filter.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 70e9d57677fe..5d565c50bcb2 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -753,6 +753,7 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk);
 int sk_attach_bpf(u32 ufd, struct sock *sk);
 int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk);
 int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk);
+void sk_reuseport_prog_free(struct bpf_prog *prog);
 int sk_detach_filter(struct sock *sk);
 int sk_get_filter(struct sock *sk, struct sock_filter __user *filter,
		  unsigned int len);
-- cgit v1.2.3

From e8d2bec0457962e8f348a9a3627b398f7fe5c5fc Mon Sep 17 00:00:00 2001
From: Daniel Borkmann
Date: Sun, 12 Aug 2018 01:59:17 +0200
Subject: bpf: decouple btf from seq bpf fs dump and enable more maps

Commit a26ca7c982cb ("bpf: btf: Add pretty print support to the basic
arraymap") and 699c86d6ec21 ("bpf: btf: add pretty print for
hash/lru_hash maps") enabled support for BTF and dumping via BPF fs for
array and hash/lru maps. However, both can be decoupled from each other
such that regular BPF maps can be supported for attaching BTF key/value
information, while not all maps necessarily need to be dumped via the
map_seq_show_elem() callback.

The basic sanity check which is a prerequisite for all maps is that
key/value size has to match in any case, and some maps can have extra
checks via the map_check_btf() callback, e.g. probing certain types or
indicating no support in general. With that we can also enable
retrieving BTF info for per-cpu map types and lpm.

Signed-off-by: Daniel Borkmann
Acked-by: Alexei Starovoitov
Acked-by: Yonghong Song
---
 include/linux/bpf.h | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index db11662faea6..523481a3471b 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -23,7 +23,7 @@ struct bpf_prog;
 struct bpf_map;
 struct sock;
 struct seq_file;
-struct btf;
+struct btf_type;
 
 /* map is generic key/value storage optionally accesible by eBPF programs */
 struct bpf_map_ops {
@@ -48,8 +48,9 @@ struct bpf_map_ops {
 	u32 (*map_fd_sys_lookup_elem)(void *ptr);
 	void (*map_seq_show_elem)(struct bpf_map *map, void *key,
				  struct seq_file *m);
-	int (*map_check_btf)(const struct bpf_map *map, const struct btf *btf,
-			     u32 key_type_id, u32 value_type_id);
+	int (*map_check_btf)(const struct bpf_map *map,
+			     const struct btf_type *key_type,
+			     const struct btf_type *value_type);
 };
 
 struct bpf_map {
@@ -118,9 +119,13 @@ static inline bool bpf_map_offload_neutral(const struct bpf_map *map)
 
 static inline bool bpf_map_support_seq_show(const struct bpf_map *map)
 {
-	return map->ops->map_seq_show_elem && map->ops->map_check_btf;
+	return map->btf && map->ops->map_seq_show_elem;
 }
 
+int map_check_no_btf(const struct bpf_map *map,
+		     const struct btf_type *key_type,
+		     const struct btf_type *value_type);
+
 extern const struct bpf_map_ops bpf_map_offload_ops;
 
 /* function argument constraints */
-- cgit v1.2.3

From 7723628101aaeb1d723786747529b4ea65c5b5c5 Mon Sep 17 00:00:00 2001
From: Andrey Ignatov
Date: Sun, 12 Aug 2018 10:49:27 -0700
Subject: bpf: Introduce bpf_skb_ancestor_cgroup_id helper

== Problem description ==

It's useful to be able to identify the cgroup associated with an skb in
TC so that a policy can be applied
to this skb, and the existing bpf_skb_cgroup_id helper can help with
this.

Though in real life the cgroup hierarchy and the hierarchy to apply a
policy to don't map 1:1.

It's often the case that there is a container and a corresponding
cgroup, but there are many more sub-cgroups inside the container, e.g.
because it's delegated to the containerized application to control
resources for its subsystems, or to separate an application inside the
container from infra that belongs to the containerization system (e.g.
sshd).

At the same time it may be useful to apply a policy to the container as
a whole.

If multiple containers like this are run on a host (which is often the
case) and many of them have sub-cgroups, it may not be possible to
apply a per-container policy in TC with existing helpers such as
bpf_skb_under_cgroup or bpf_skb_cgroup_id:

* bpf_skb_cgroup_id will return the id of the immediate cgroup
  associated with the skb, i.e. if it's a sub-cgroup inside a
  container, it can't be used to identify the container's cgroup;

* bpf_skb_under_cgroup can work only with one cgroup and doesn't scale,
  i.e. if there are N containers on a host and a policy has to be
  applied to M of them (0 <= M <= N), it'd require M calls to
  bpf_skb_under_cgroup, and, if M changes, it'd require rebuilding &
  loading a new BPF program.

== Solution ==

The patch introduces a new helper bpf_skb_ancestor_cgroup_id that can
be used to get the id of the cgroup v2 that is an ancestor of the
cgroup associated with the skb at a specified level of the cgroup
hierarchy.

That way an admin can place all containers on one level of the cgroup
hierarchy (which is a good practice in general and already used in many
configurations) and identify the specific cgroup on this level no
matter what sub-cgroup the skb is associated with.

E.g. if there is a cgroup hierarchy:
  root/
  root/container1/
  root/container1/app11/
  root/container1/app11/sub-app-a/
  root/container1/app12/
  root/container2/
  root/container2/app21/
  root/container2/app22/
  root/container2/app22/sub-app-b/

, then having an skb associated with root/container1/app11/sub-app-a/
it's possible to get the ancestor at level 1, which is container1, and
apply a policy for this container, or apply another policy if it's
container2. Policies can be kept e.g. in a hash map where the key is a
container cgroup id and the value is an action.

Levels where container cgroups are created are usually known in advance
whereas the cgroup hierarchy inside a container may be hard to predict,
especially when its creation is delegated to the containerized
application.

== Implementation details ==

The helper gets the ancestor by walking parents up to the specified
level.

Another option would be to get a different kind of "id" from
cgroup->ancestor_ids[level] and use it with idr_find() to get the
struct cgroup for the ancestor. But that would require a radix lookup,
which doesn't seem to be better (at least it's not obviously better).

The format of the return value of the new helper is the same as that of
bpf_skb_cgroup_id.
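To make the policy lookup concrete, a TC classifier built on the new
helper could look roughly like this (an illustrative sketch; the map
layout, section names and the bpf_helpers.h location are assumptions,
and level 1 matches the example hierarchy above):

  #include <linux/bpf.h>
  #include <linux/pkt_cls.h>
  #include "bpf_helpers.h"        /* SEC() and helper declarations; path is an assumption */

  /* Hypothetical policy map: container cgroup id -> action (non-zero = drop). */
  struct bpf_map_def SEC("maps") container_policy = {
          .type        = BPF_MAP_TYPE_HASH,
          .key_size    = sizeof(__u64),
          .value_size  = sizeof(__u32),
          .max_entries = 1024,
  };

  SEC("classifier")
  int tc_container_policy(struct __sk_buff *skb)
  {
          /* Level 1 is where the container cgroups sit in the example
           * hierarchy (root/containerX/); adjust to the real layout.
           */
          __u64 cgrp_id = bpf_skb_ancestor_cgroup_id(skb, 1);
          __u32 *action = bpf_map_lookup_elem(&container_policy, &cgrp_id);

          if (action && *action)
                  return TC_ACT_SHOT;     /* drop this container's traffic */

          return TC_ACT_OK;
  }

  char _license[] SEC("license") = "GPL";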
Signed-off-by: Andrey Ignatov
Signed-off-by: Daniel Borkmann
---
 include/linux/cgroup.h | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index c9fdf6f57913..32c553556bbd 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -553,6 +553,36 @@ static inline bool cgroup_is_descendant(struct cgroup *cgrp,
 	return cgrp->ancestor_ids[ancestor->level] == ancestor->id;
 }
 
+/**
+ * cgroup_ancestor - find ancestor of cgroup
+ * @cgrp: cgroup to find ancestor of
+ * @ancestor_level: level of ancestor to find starting from root
+ *
+ * Find ancestor of cgroup at specified level starting from root if it exists
+ * and return pointer to it. Return NULL if @cgrp doesn't have ancestor at
+ * @ancestor_level.
+ *
+ * This function is safe to call as long as @cgrp is accessible.
+ */
+static inline struct cgroup *cgroup_ancestor(struct cgroup *cgrp,
+					     int ancestor_level)
+{
+	struct cgroup *ptr;
+
+	if (cgrp->level < ancestor_level)
+		return NULL;
+
+	for (ptr = cgrp;
+	     ptr && ptr->level > ancestor_level;
+	     ptr = cgroup_parent(ptr))
+		;
+
+	if (ptr && ptr->level == ancestor_level)
+		return ptr;
+
+	return NULL;
+}
+
 /**
  * task_under_cgroup_hierarchy - test task's membership of cgroup ancestry
  * @task: the task to be tested
-- cgit v1.2.3
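For reference, a caller such as the new bpf_skb_ancestor_cgroup_id
helper would combine cgroup_ancestor() with the socket's cgroup data
roughly as follows (an illustrative sketch only; the real
implementation lives in net/core/filter.c, outside this
include/linux-limited diff, and the cgroup-id accessor shown is an
assumption):

  #include <linux/cgroup.h>
  #include <linux/skbuff.h>
  #include <net/sock.h>
  #include <net/inet_sock.h>      /* skb_to_full_sk() */

  static u64 skb_ancestor_cgroup_id(const struct sk_buff *skb, int ancestor_level)
  {
          struct sock *sk = skb_to_full_sk(skb);
          struct cgroup *cgrp, *ancestor;

          if (!sk || !sk_fullsock(sk))
                  return 0;

          cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
          ancestor = cgroup_ancestor(cgrp, ancestor_level);
          if (!ancestor)
                  return 0;

          return ancestor->kn->id.id;     /* cgroup v2 id; exact accessor is an assumption */
  }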