From ca34ce29fc4b0e929cc6aada40829d17ab50fee4 Mon Sep 17 00:00:00 2001 From: Dave Marchevsky Date: Mon, 8 Aug 2022 09:47:23 -0700 Subject: bpf: Improve docstring for BPF_F_USER_BUILD_ID flag Most tools which use bpf_get_stack or bpf_get_stackid symbolicate the stack - meaning the stack of addresses in the target process' address space is transformed into meaningful symbol names. The BPF_F_USER_BUILD_ID flag eases this process by finding the build_id of the file-backed vma which the address falls in and translating the address to an offset within the backing file. To be more specific, the offset is a "file offset" from the beginning of the backing file. The symbols in ET_DYN ELF objects have a st_value which is also described as an "offset" - but an offset in the process address space, relative to the base address of the object. It's necessary to translate between the "file offset" and "virtual address offset" during symbolication before they can be directly compared. Failure to do so can lead to confusing bugs, so this patch clarifies language in the documentation in an attempt to keep this from happening. Signed-off-by: Dave Marchevsky Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220808164723.3107500-1-davemarchevsky@fb.com --- tools/include/uapi/linux/bpf.h | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'tools/include') diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 59a217ca2dfd..f58d58e1d547 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3008,8 +3008,18 @@ union bpf_attr { * **BPF_F_USER_STACK** * Collect a user space stack instead of a kernel stack. * **BPF_F_USER_BUILD_ID** - * Collect buildid+offset instead of ips for user stack, - * only valid if **BPF_F_USER_STACK** is also specified. + * Collect (build_id, file_offset) instead of ips for user + * stack, only valid if **BPF_F_USER_STACK** is also + * specified. + * + * *file_offset* is an offset relative to the beginning + * of the executable or shared object file backing the vma + * which the *ip* falls in. It is *not* an offset relative + * to that object's base address. Accordingly, it must be + * adjusted by adding (sh_addr - sh_offset), where + * sh_{addr,offset} correspond to the executable section + * containing *file_offset* in the object, for comparisons + * to symbols' st_value to be valid. * * **bpf_get_stack**\ () can collect up to * **PERF_MAX_STACK_DEPTH** both kernel and user frames, subject -- cgit v1.2.3
From c8996c98f703b09afe77a1d247dae691c9849dc1 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 9 Aug 2022 08:08:02 +0200 Subject: bpf: Add BPF-helper for accessing CLOCK_TAI Commit 3dc6ffae2da2 ("timekeeping: Introduce fast accessor to clock tai") introduced a fast and NMI-safe accessor for CLOCK_TAI. Especially in time-sensitive networks (TSN), where all nodes are synchronized by Precision Time Protocol (PTP), it's helpful to be able to generate timestamps based on CLOCK_TAI instead of CLOCK_MONOTONIC. With a BPF helper for TAI in place, it becomes very convenient to correlate activity across different machines in the network. Use cases for such a BPF helper include Tx launch time (e.g. ETF and TAPRIO Qdiscs) and timestamping. Note: CLOCK_TAI is nothing new per se, only the NMI-safe variant of it is.
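As an illustration of the launch-time use case, here is a minimal sketch of a tc egress program that stamps each packet with a TAI-based departure time. The attach point, the 100us offset, and the presence of an ETF qdisc configured with clockid CLOCK_TAI are assumptions of the sketch, as is a libbpf whose generated helper definitions already declare bpf_ktime_get_tai_ns():

    #include <linux/bpf.h>
    #include <bpf/bpf_helpers.h>

    SEC("tc")
    int stamp_tai(struct __sk_buff *skb)
    {
            /* NMI-safe CLOCK_TAI timestamp in nanoseconds */
            __u64 now = bpf_ktime_get_tai_ns();

            /* Ask the ETF qdisc to launch this packet ~100us from now
             * (assumes the qdisc was set up with clockid CLOCK_TAI). */
            skb->tstamp = now + 100000;
            return 0; /* TC_ACT_OK */
    }

    char LICENSE[] SEC("license") = "GPL";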
Signed-off-by: Jesper Dangaard Brouer [Kurt: Wrote changelog and renamed helper] Signed-off-by: Kurt Kanzenbach Link: https://lore.kernel.org/r/20220809060803.5773-2-kurt@linutronix.de Signed-off-by: Alexei Starovoitov --- tools/include/uapi/linux/bpf.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'tools/include') diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index f58d58e1d547..e174ad28aeb7 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -5341,6 +5341,18 @@ union bpf_attr { * **-EACCES** if the SYN cookie is not valid. * * **-EPROTONOSUPPORT** if CONFIG_IPV6 is not builtin. + * + * u64 bpf_ktime_get_tai_ns(void) + * Description + * A nonsettable system-wide clock derived from wall-clock time but + * ignoring leap seconds. This clock does not experience + * discontinuities and backwards jumps caused by NTP inserting leap + * seconds as CLOCK_REALTIME does. + * + * See: **clock_gettime**\ (**CLOCK_TAI**) + * Return + * Current *ktime*. + * */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5551,6 +5563,7 @@ union bpf_attr { FN(tcp_raw_gen_syncookie_ipv6), \ FN(tcp_raw_check_syncookie_ipv4), \ FN(tcp_raw_check_syncookie_ipv6), \ + FN(ktime_get_tai_ns), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper -- cgit v1.2.3 From 4961d0772578e8737afe61370743f3bc22867111 Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Fri, 12 Aug 2022 16:37:27 +0100 Subject: bpf: Clear up confusion in bpf_skb_adjust_room()'s documentation Adding or removing room space _below_ layers 2 or 3, as the description mentions, is ambiguous. This was written with a mental image of the packet with layer 2 at the top, layer 3 under it, and so on. But it has led users to believe that it was on lower layers (before the beginning of the L2 and L3 headers respectively). Let's make it more explicit, and specify between which layers the room space is adjusted. Reported-by: Rumen Telbizov Signed-off-by: Quentin Monnet Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20220812153727.224500-3-quentin@isovalent.com --- tools/include/uapi/linux/bpf.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'tools/include') diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index e174ad28aeb7..1d6085e15fc8 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -2573,10 +2573,12 @@ union bpf_attr { * There are two supported modes at this time: * * * **BPF_ADJ_ROOM_MAC**: Adjust room at the mac layer - * (room space is added or removed below the layer 2 header). + * (room space is added or removed between the layer 2 and + * layer 3 headers). * * * **BPF_ADJ_ROOM_NET**: Adjust room at the network layer - * (room space is added or removed below the layer 3 header). + * (room space is added or removed between the layer 3 and + * layer 4 headers). * * The following flags are supported at this time: * -- cgit v1.2.3 From 91350fe152930c0d61a362af68272526490efea5 Mon Sep 17 00:00:00 2001 From: Shmulik Ladkani Date: Sun, 21 Aug 2022 14:35:17 +0300 Subject: bpf, flow_dissector: Introduce BPF_FLOW_DISSECTOR_CONTINUE retcode for bpf progs Currently, attaching BPF_PROG_TYPE_FLOW_DISSECTOR programs completely replaces the flow-dissector logic with custom dissection logic. This forces implementors to write programs that handle dissection for any flows expected in the namespace. 
It makes sense for flow-dissector BPF programs to just augment the dissector with custom logic (e.g. dissecting certain flows or custom protocols), while enjoying the broad capabilities of the standard dissector for any other traffic. Introduce BPF_FLOW_DISSECTOR_CONTINUE retcode. Flow-dissector BPF programs may return this to indicate no dissection was made, and fallback to the standard dissector is requested. Signed-off-by: Shmulik Ladkani Signed-off-by: Daniel Borkmann Reviewed-by: Stanislav Fomichev Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20220821113519.116765-3-shmulik.ladkani@gmail.com --- tools/include/uapi/linux/bpf.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'tools/include') diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 1d6085e15fc8..f38814fbb618 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -5861,6 +5861,11 @@ enum bpf_ret_code { * represented by BPF_REDIRECT above). */ BPF_LWT_REROUTE = 128, + /* BPF_FLOW_DISSECTOR_CONTINUE: used by BPF_PROG_TYPE_FLOW_DISSECTOR + * to indicate that no custom dissection was performed, and + * fallback to standard dissector is requested. + */ + BPF_FLOW_DISSECTOR_CONTINUE = 129, }; struct bpf_sock { -- cgit v1.2.3 From 2172fb8007eaafbef18563afb6c1ae5a976bf787 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 23 Aug 2022 15:25:54 -0700 Subject: bpf: update bpf_{g,s}et_retval documentation * replace 'syscall' with 'upper layers', still mention that it's being exported via syscall errno * describe what happens in set_retval(-EPERM) + return 1 * describe what happens with bind's 'return 3' Acked-by: Martin KaFai Lau Signed-off-by: Stanislav Fomichev Link: https://lore.kernel.org/r/20220823222555.523590-5-sdf@google.com Signed-off-by: Alexei Starovoitov --- tools/include/uapi/linux/bpf.h | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) (limited to 'tools/include') diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index f38814fbb618..4fb685591035 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -5085,17 +5085,29 @@ union bpf_attr { * * int bpf_get_retval(void) * Description - * Get the syscall's return value that will be returned to userspace. + * Get the BPF program's return value that will be returned to the upper layers. * - * This helper is currently supported by cgroup programs only. + * This helper is currently supported by cgroup programs and only by the hooks + * where BPF program's return value is returned to the userspace via errno. * Return - * The syscall's return value. + * The BPF program's return value. * * int bpf_set_retval(int retval) * Description - * Set the syscall's return value that will be returned to userspace. + * Set the BPF program's return value that will be returned to the upper layers. + * + * This helper is currently supported by cgroup programs and only by the hooks + * where BPF program's return value is returned to the userspace via errno. + * + * Note that there is the following corner case where the program exports an error + * via bpf_set_retval but signals success via 'return 1': + * + * bpf_set_retval(-EPERM); + * return 1; + * + * In this case, the BPF program's return value will use helper's -EPERM. This + * still holds true for cgroup/bind{4,6} which supports extra 'return 3' success case. * - * This helper is currently supported by cgroup programs only. 
* Return * 0 on success, or a negative error in case of failure. * -- cgit v1.2.3
From d4ccaf58a8472123ac97e6db03932c375b5c45ba Mon Sep 17 00:00:00 2001 From: Hao Luo Date: Wed, 24 Aug 2022 16:31:13 -0700 Subject: bpf: Introduce cgroup iter Cgroup_iter is a type of bpf_iter. It walks over cgroups in four modes: - walking a cgroup's descendants in pre-order. - walking a cgroup's descendants in post-order. - walking a cgroup's ancestors. - processing only the given cgroup. When attaching cgroup_iter, one can set a cgroup to the iter_link created from attaching. This cgroup is passed as a file descriptor or cgroup id and serves as the starting point of the walk. If no cgroup is specified, the starting point will be the cgroup v2 root. For walking descendants, one can specify the order: either pre-order or post-order. For walking ancestors, the walk starts at the specified cgroup and ends at the root. One can also terminate the walk early by returning 1 from the iter program. Note that because walking the cgroup hierarchy holds cgroup_mutex, the iter program is called with cgroup_mutex held. Currently only one session is supported, which means, depending on the volume of data the BPF program intends to send to user space, the number of cgroups that can be walked is limited. For example, given that the current buffer size is 8 * PAGE_SIZE, if the program sends 64B of data for each cgroup, assuming PAGE_SIZE is 4KB, the total number of cgroups that can be walked is 512. This is a limitation of cgroup_iter. If the output data is larger than the kernel buffer size, after all data in the kernel buffer is consumed by user space, the subsequent read() syscall will signal EOPNOTSUPP. To work around this, the user may have to update their program to reduce the volume of data sent to output. For example, skip some uninteresting cgroups. In the future, we may extend bpf_iter flags to allow customizing the buffer size. Acked-by: Yonghong Song Acked-by: Tejun Heo Signed-off-by: Hao Luo Link: https://lore.kernel.org/r/20220824233117.1312810-2-haoluo@google.com Signed-off-by: Alexei Starovoitov --- tools/include/uapi/linux/bpf.h | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) (limited to 'tools/include') diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 4fb685591035..5056cef2112f 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -87,10 +87,29 @@ struct bpf_cgroup_storage_key { __u32 attach_type; /* program attach type (enum bpf_attach_type) */ }; +enum bpf_cgroup_iter_order { + BPF_ITER_ORDER_UNSPEC = 0, + BPF_ITER_SELF_ONLY, /* process only a single object. */ + BPF_ITER_DESCENDANTS_PRE, /* walk descendants in pre-order. */ + BPF_ITER_DESCENDANTS_POST, /* walk descendants in post-order. */ + BPF_ITER_ANCESTORS_UP, /* walk ancestors upward. */ +}; + union bpf_iter_link_info { struct { __u32 map_fd; } map; + struct { + enum bpf_cgroup_iter_order order; + + /* At most one of cgroup_fd and cgroup_id can be non-zero. If + * both are zero, the walk starts from the default cgroup v2 + * root. For walking v1 hierarchy, one should always explicitly + * specify cgroup_fd. + */ + __u32 cgroup_fd; + __u64 cgroup_id; + } cgroup; }; /* BPF syscall commands, see bpf(2) man-page for more details. 
*/ @@ -6176,11 +6195,22 @@ struct bpf_link_info { struct { __aligned_u64 target_name; /* in/out: target_name buffer ptr */ __u32 target_name_len; /* in/out: target_name buffer len */ + + /* If the iter specific field is 32 bits, it can be put + * in the first or second union. Otherwise it should be + * put in the second union. + */ union { struct { __u32 map_id; } map; }; + union { + struct { + __u64 cgroup_id; + __u32 order; + } cgroup; + }; } iter; struct { __u32 netns_ino; -- cgit v1.2.3
From d4ffb6f39f1a1b260966b43a4ffdb64779c650dd Mon Sep 17 00:00:00 2001 From: Hao Luo Date: Thu, 25 Aug 2022 15:39:36 -0700 Subject: bpf: Add CGROUP prefix to cgroup_iter_order bpf_cgroup_iter_order is globally visible but the entries do not have a CGROUP prefix. As requested by Andrii, put a CGROUP prefix in the names in bpf_cgroup_iter_order. This patch fixes two previous commits: one introduced the API and the other used the API in a bpf selftest (that is, the selftest cgroup_hierarchical_stats). I tested this patch via the following command: test_progs -t cgroup,iter,btf_dump Fixes: d4ccaf58a847 ("bpf: Introduce cgroup iter") Fixes: 88886309d2e8 ("selftests/bpf: add a selftest for cgroup hierarchical stats collection") Suggested-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Signed-off-by: Hao Luo Link: https://lore.kernel.org/r/20220825223936.1865810-1-haoluo@google.com Signed-off-by: Martin KaFai Lau --- tools/include/uapi/linux/bpf.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'tools/include') diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 5056cef2112f..92f7387e378a 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -88,11 +88,11 @@ struct bpf_cgroup_storage_key { }; enum bpf_cgroup_iter_order { - BPF_ITER_ORDER_UNSPEC = 0, - BPF_ITER_SELF_ONLY, /* process only a single object. */ - BPF_ITER_DESCENDANTS_PRE, /* walk descendants in pre-order. */ - BPF_ITER_DESCENDANTS_POST, /* walk descendants in post-order. */ - BPF_ITER_ANCESTORS_UP, /* walk ancestors upward. */ + BPF_CGROUP_ITER_ORDER_UNSPEC = 0, + BPF_CGROUP_ITER_SELF_ONLY, /* process only a single object. */ + BPF_CGROUP_ITER_DESCENDANTS_PRE, /* walk descendants in pre-order. */ + BPF_CGROUP_ITER_DESCENDANTS_POST, /* walk descendants in post-order. */ + BPF_CGROUP_ITER_ANCESTORS_UP, /* walk ancestors upward. */ }; union bpf_iter_link_info { -- cgit v1.2.3
From aa75622c3be4d5819ce69c714acbcbd67bba5d65 Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Thu, 25 Aug 2022 23:08:06 +0100 Subject: bpf: Fix a few typos in BPF helpers documentation Address a few typos in the documentation for the BPF helper functions. They were reported by Jakub [0], who ran spell checkers on the generated man page [1].
[0] https://lore.kernel.org/linux-man/d22dcd47-023c-8f52-d369-7b5308e6c842@gmail.com/T/#mb02e7d4b7fb61d98fa914c77b581184e9a9537af [1] https://lore.kernel.org/linux-man/eb6a1e41-c48e-ac45-5154-ac57a2c76108@gmail.com/T/#m4a8d1b003616928013ffcd1450437309ab652f9f v3: Do not copy unrelated (and breaking) elements to tools/ header v2: Turn a ',' into a ';' Reported-by: Jakub Wilk Signed-off-by: Quentin Monnet Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220825220806.107143-1-quentin@isovalent.com --- tools/include/uapi/linux/bpf.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'tools/include') diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 92f7387e378a..f4ba82a1eace 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -4456,7 +4456,7 @@ union bpf_attr { * * **-EEXIST** if the option already exists. * - * **-EFAULT** on failrue to parse the existing header options. + * **-EFAULT** on failure to parse the existing header options. * * **-EPERM** if the helper cannot be used under the current * *skops*\ **->op**. @@ -4665,7 +4665,7 @@ union bpf_attr { * a *map* with *task* as the **key**. From this * perspective, the usage is not much different from * **bpf_map_lookup_elem**\ (*map*, **&**\ *task*) except this - * helper enforces the key must be an task_struct and the map must also + * helper enforces the key must be a task_struct and the map must also * be a **BPF_MAP_TYPE_TASK_STORAGE**. * * Underneath, the value is stored locally at *task* instead of @@ -4723,7 +4723,7 @@ union bpf_attr { * * long bpf_ima_inode_hash(struct inode *inode, void *dst, u32 size) * Description - * Returns the stored IMA hash of the *inode* (if it's avaialable). + * Returns the stored IMA hash of the *inode* (if it's available). * If the hash is larger than *size*, then only *size* * bytes will be copied to *dst* * Return @@ -4747,12 +4747,12 @@ union bpf_attr { * * The argument *len_diff* can be used for querying with a planned * size change. This allows to check MTU prior to changing packet - * ctx. Providing an *len_diff* adjustment that is larger than the + * ctx. Providing a *len_diff* adjustment that is larger than the * actual packet size (resulting in negative packet size) will in - * principle not exceed the MTU, why it is not considered a - * failure. Other BPF-helpers are needed for performing the - * planned size change, why the responsability for catch a negative - * packet size belong in those helpers. + * principle not exceed the MTU, which is why it is not considered + * a failure. Other BPF helpers are needed for performing the + * planned size change; therefore the responsibility for catching + * a negative packet size belongs in those helpers. * * Specifying *ifindex* zero means the MTU check is performed * against the current net device. This is practical if this isn't -- cgit v1.2.3 From 44c51472bef83bb70b43e2f4b7a592096f32a855 Mon Sep 17 00:00:00 2001 From: Shmulik Ladkani Date: Wed, 31 Aug 2022 17:40:09 +0300 Subject: bpf: Support getting tunnel flags Existing 'bpf_skb_get_tunnel_key' extracts various tunnel parameters (id, ttl, tos, local and remote) but does not expose ip_tunnel_info's tun_flags to the BPF program. It makes sense to expose tun_flags to the BPF program. Assume for example multiple GRE tunnels maintained on a single GRE interface in collect_md mode. 
The program expects origins to initiate over GRE; however, different origins use different GRE characteristics (e.g. some prefer to use GRE checksum, some do not; some pass a GRE key, some do not, etc.). A BPF program getting tun_flags can therefore remember the relevant flags (e.g. TUNNEL_CSUM, TUNNEL_SEQ...) for each initiating remote. In the reply path, the program can use 'bpf_skb_set_tunnel_key' in order to correctly reply to the remote, using similar characteristics, based on the stored tunnel flags. Introduce the BPF_F_TUNINFO_FLAGS flag for bpf_skb_get_tunnel_key. If specified, 'bpf_tunnel_key->tunnel_flags' is set with the tun_flags. I decided to use the existing unused 'tunnel_ext' as the storage for the 'tunnel_flags' in order to avoid changing bpf_tunnel_key's layout. Also, the following has been considered during the design: 1. Convert the "interesting" internal TUNNEL_xxx flags back to BPF_F_yyy and place into the new 'tunnel_flags' field. This has 2 drawbacks: - The BPF_F_yyy flags are from the *set_tunnel_key* enumeration space, e.g. BPF_F_ZERO_CSUM_TX. It is awkward that it is "returned" into tunnel_flags from a *get_tunnel_key* call. - Not all "interesting" TUNNEL_xxx flags can be mapped to existing BPF_F_yyy flags, and it doesn't make sense to create new BPF_F_yyy flags just for the purposes of the returned tunnel_flags. 2. Place key.tun_flags into 'tunnel_flags' but mask them, keeping only "interesting" flags. That's ok, but the drawback is that what's "interesting" for my use case might be limiting for other use cases. Therefore I decided to expose what's in key.tun_flags *as is*, which seems most flexible. The BPF user can just choose to ignore bits they are not interested in. The TUNNEL_xxx flags are also UAPI, so there is no harm in exposing them back in the get_tunnel_key call. Signed-off-by: Shmulik Ladkani Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20220831144010.174110-1-shmulik.ladkani@gmail.com --- tools/include/uapi/linux/bpf.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'tools/include') diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index f4ba82a1eace..793103b10eab 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -5659,6 +5659,11 @@ enum { BPF_F_SEQ_NUMBER = (1ULL << 3), }; +/* BPF_FUNC_skb_get_tunnel_key flags. */ +enum { + BPF_F_TUNINFO_FLAGS = (1ULL << 4), +}; + /* BPF_FUNC_perf_event_output, BPF_FUNC_perf_event_read and * BPF_FUNC_perf_event_read_value flags. */ @@ -5848,7 +5853,10 @@ struct bpf_tunnel_key { }; __u8 tunnel_tos; __u8 tunnel_ttl; - __u16 tunnel_ext; /* Padding, future use. */ + union { + __u16 tunnel_ext; /* compat */ + __be16 tunnel_flags; + }; __u32 tunnel_label; union { __u32 local_ipv4; -- cgit v1.2.3
From 27ed9353aec9de4277b3389c9f2b04beb6ab7622 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 31 Aug 2022 08:26:57 -0700 Subject: bpf: Update descriptions for helpers bpf_get_func_arg[_cnt]() Now, instead of the number of arguments, the number of registers holding argument values is stored in the trampoline. Update the descriptions of the bpf_get_func_arg[_cnt]() helpers accordingly. Previous programs without struct arguments should continue to work as usual.
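For reference, a minimal sketch of how the two helpers compose in an fentry program; the traced function (do_unlinkat) and the cap of six registers are illustrative assumptions, not part of the API:

    #include "vmlinux.h"
    #include <bpf/bpf_helpers.h>
    #include <bpf/bpf_tracing.h>

    SEC("fentry/do_unlinkat")
    int BPF_PROG(dump_arg_regs)
    {
            /* After this change: counts argument registers, not arguments */
            __u64 cnt = bpf_get_func_arg_cnt(ctx);
            __u64 val;
            __u32 i;

            for (i = 0; i < cnt && i < 6; i++) {
                    /* 0 on success, -EINVAL past the last argument register */
                    if (!bpf_get_func_arg(ctx, i, &val))
                            bpf_printk("arg reg %u: 0x%llx", i, val);
            }
            return 0;
    }

    char LICENSE[] SEC("license") = "GPL";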
Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20220831152657.2078805-1-yhs@fb.com Signed-off-by: Alexei Starovoitov --- tools/include/uapi/linux/bpf.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'tools/include') diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 793103b10eab..3df78c56c1bf 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -5079,12 +5079,12 @@ union bpf_attr { * * long bpf_get_func_arg(void *ctx, u32 n, u64 *value) * Description - * Get **n**-th argument (zero based) of the traced function (for tracing programs) + * Get **n**-th argument register (zero based) of the traced function (for tracing programs) * returned in **value**. * * Return * 0 on success. - * **-EINVAL** if n >= arguments count of traced function. + * **-EINVAL** if n >= argument register count of traced function. * * long bpf_get_func_ret(void *ctx, u64 *value) * Description @@ -5097,10 +5097,11 @@ union bpf_attr { * * long bpf_get_func_arg_cnt(void *ctx) * Description - * Get number of arguments of the traced function (for tracing programs). + * Get number of registers of the traced function (for tracing programs) where + * function arguments are stored in these registers. * * Return - * The number of arguments of the traced function. + * The number of argument registers of the traced function. * * int bpf_get_retval(void) * Description -- cgit v1.2.3
From 0e426a3ae030a9e891899370229e117158b35de6 Mon Sep 17 00:00:00 2001 From: Pu Lehui Date: Wed, 21 Sep 2022 10:46:02 +0000 Subject: bpf, cgroup: Reject prog_attach_flags array when effective query Attach flags are only valid for progs directly attached at this cgroup layer, but not for effective progs. For querying with EFFECTIVE flags, exporting attach flags does not make sense. So when handling an effective query, we reject the prog_attach_flags array and don't need to populate it. Also, we make attach_flags output 0 during an effective query. Fixes: b79c9fc9551b ("bpf: implement BPF_PROG_QUERY for BPF_LSM_CGROUP") Signed-off-by: Pu Lehui Link: https://lore.kernel.org/r/20220921104604.2340580-2-pulehui@huaweicloud.com Signed-off-by: Martin KaFai Lau --- tools/include/uapi/linux/bpf.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'tools/include') diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 59a217ca2dfd..4eff7fc7ae58 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1233,7 +1233,7 @@ enum { /* Query effective (directly attached + inherited from ancestor cgroups) * programs that will be executed for events within a cgroup. - * attach_flags with this flag are returned only for directly attached programs. + * attach_flags with this flag are always returned 0. */ #define BPF_F_QUERY_EFFECTIVE (1U << 0) @@ -1432,7 +1432,10 @@ union bpf_attr { __u32 attach_flags; __aligned_u64 prog_ids; __u32 prog_cnt; - __aligned_u64 prog_attach_flags; /* output: per-program attach_flags */ + /* output: per-program attach_flags. + * not allowed to be set during effective query. 
+ */ + __aligned_u64 prog_attach_flags; } query; struct { /* anonymous struct used by BPF_RAW_TRACEPOINT_OPEN command */ -- cgit v1.2.3
From 583c1f420173f7d84413a1a1fbf5109d798b4faa Mon Sep 17 00:00:00 2001 From: David Vernet Date: Mon, 19 Sep 2022 19:00:57 -0500 Subject: bpf: Define new BPF_MAP_TYPE_USER_RINGBUF map type We want to support a ringbuf map type where samples are published from user-space, to be consumed by BPF programs. BPF currently supports a kernel -> user-space circular ring buffer via the BPF_MAP_TYPE_RINGBUF map type. We'll need to define a new map type for user-space -> kernel, as none of the helpers exported for BPF_MAP_TYPE_RINGBUF will apply to a user-space producer ring buffer, and we'll want to add one or more helper functions that would not apply for a kernel-producer ring buffer. This patch therefore adds a new BPF_MAP_TYPE_USER_RINGBUF map type definition. The map type is useless in its current form, as there is no way to access or use it for anything until we add one or more BPF helpers. A follow-on patch will therefore add a new helper function that allows BPF programs to run callbacks on samples that are published to the ring buffer. Signed-off-by: David Vernet Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220920000100.477320-2-void@manifault.com --- tools/include/uapi/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'tools/include') diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 3df78c56c1bf..e18c85324db6 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -928,6 +928,7 @@ enum bpf_map_type { BPF_MAP_TYPE_INODE_STORAGE, BPF_MAP_TYPE_TASK_STORAGE, BPF_MAP_TYPE_BLOOM_FILTER, + BPF_MAP_TYPE_USER_RINGBUF, }; /* Note that tracing related programs such as -- cgit v1.2.3
From 20571567384428dfc9fe5cf9f2e942e1df13c2dd Mon Sep 17 00:00:00 2001 From: David Vernet Date: Mon, 19 Sep 2022 19:00:58 -0500 Subject: bpf: Add bpf_user_ringbuf_drain() helper In a prior change, we added a new BPF_MAP_TYPE_USER_RINGBUF map type which will allow user-space applications to publish messages to a ring buffer that is consumed by a BPF program in kernel-space. In order for this map type to be useful, it will require a BPF helper function that BPF programs can invoke to drain samples from the ring buffer, and invoke callbacks on those samples. This change adds that capability via a new BPF helper function: bpf_user_ringbuf_drain(struct bpf_map *map, void *callback_fn, void *ctx, u64 flags) BPF programs may invoke this function to run callback_fn() on a series of samples in the ring buffer. callback_fn() has the following signature: long callback_fn(struct bpf_dynptr *dynptr, void *context); Samples are provided to the callback in the form of struct bpf_dynptr *'s, which the program can read using BPF helper functions for querying struct bpf_dynptr's. In order to support bpf_user_ringbuf_drain(), a new PTR_TO_DYNPTR register type is added to the verifier to reflect a dynptr that was allocated by a helper function and passed to a BPF program. Unlike PTR_TO_STACK dynptrs which are allocated on the stack by a BPF program, PTR_TO_DYNPTR dynptrs need not use reference tracking, as the BPF helper is trusted to properly free the dynptr before returning. The verifier currently only supports PTR_TO_DYNPTR registers that are also DYNPTR_TYPE_LOCAL.
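To make the callback contract concrete, here is a minimal sketch of a kernel-side consumer; the map size, the fentry attach point, and the 8-byte sample header are assumptions of the sketch rather than part of the UAPI:

    #include "vmlinux.h"
    #include <bpf/bpf_helpers.h>
    #include <bpf/bpf_tracing.h>

    struct {
            __uint(type, BPF_MAP_TYPE_USER_RINGBUF);
            __uint(max_entries, 4096); /* page-aligned power of 2 */
    } user_rb SEC(".maps");

    static long handle_sample(struct bpf_dynptr *dynptr, void *ctx)
    {
            long *count = ctx;
            __u64 hdr;

            /* Copy out the first 8 bytes of the user-published sample */
            if (bpf_dynptr_read(&hdr, sizeof(hdr), dynptr, 0, 0))
                    return 0; /* sample too short; try the next one */

            (*count)++;
            return 0; /* 0 keeps draining, 1 stops early */
    }

    SEC("fentry/__x64_sys_getpgid")
    int BPF_PROG(drain_user_rb)
    {
            long handled = 0;

            bpf_user_ringbuf_drain(&user_rb, handle_sample, &handled, 0);
            return 0;
    }

    char LICENSE[] SEC("license") = "GPL";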
Note that while the corresponding user-space libbpf logic will be added in a subsequent patch, this patch does contain an implementation of the .map_poll() callback for BPF_MAP_TYPE_USER_RINGBUF maps. This .map_poll() callback guarantees that an epoll-waiting user-space producer will receive at least one event notification whenever at least one sample is drained in an invocation of bpf_user_ringbuf_drain(), provided that the function is not invoked with the BPF_RB_NO_WAKEUP flag. If the BPF_RB_FORCE_WAKEUP flag is provided, a wakeup notification is sent even if no sample was drained. Signed-off-by: David Vernet Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220920000100.477320-3-void@manifault.com --- tools/include/uapi/linux/bpf.h | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) (limited to 'tools/include') diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index e18c85324db6..ead35f39f185 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -5388,6 +5388,43 @@ union bpf_attr { * Return * Current *ktime*. * + * long bpf_user_ringbuf_drain(struct bpf_map *map, void *callback_fn, void *ctx, u64 flags) + * Description + * Drain samples from the specified user ring buffer, and invoke + * the provided callback for each such sample: + * + * long (\*callback_fn)(struct bpf_dynptr \*dynptr, void \*ctx); + * + * If **callback_fn** returns 0, the helper will continue to try + * and drain the next sample, up to a maximum of + * BPF_MAX_USER_RINGBUF_SAMPLES samples. If the return value is 1, + * the helper will skip the rest of the samples and return. Other + * return values are not used now, and will be rejected by the + * verifier. + * Return + * The number of drained samples if no error was encountered while + * draining samples, or 0 if no samples were present in the ring + * buffer. If a user-space producer was epoll-waiting on this map, + * and at least one sample was drained, they will receive an event + * notification notifying them of available space in the ring + * buffer. If the BPF_RB_NO_WAKEUP flag is passed to this + * function, no wakeup notification will be sent. If the + * BPF_RB_FORCE_WAKEUP flag is passed, a wakeup notification will + * be sent even if no sample was drained. + * + * On failure, the returned value is one of the following: + * + * **-EBUSY** if the ring buffer is contended, and another calling + * context was concurrently draining the ring buffer. + * + * **-EINVAL** if user-space is not properly tracking the ring + * buffer due to the producer position not being aligned to 8 + * bytes, a sample not being aligned to 8 bytes, or the producer + * position not matching the advertised length of a sample. + * + * **-E2BIG** if user-space has tried to publish a sample which is + * larger than the size of the ring buffer, or which cannot fit + * within a struct bpf_dynptr. 
*/ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5599,6 +5636,7 @@ union bpf_attr { FN(tcp_raw_check_syncookie_ipv4), \ FN(tcp_raw_check_syncookie_ipv6), \ FN(ktime_get_tai_ns), \ + FN(user_ringbuf_drain), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper -- cgit v1.2.3
From 0e253f7e558a3e250902ba2034091e0185448836 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 26 Sep 2022 17:33:39 +0200 Subject: bpf: Return value in kprobe get_func_ip only for entry address Change the return value of the kprobe version of bpf_get_func_ip to zero if the attach address is not the function's entry point. For kprobes attached in the middle of a function, we can't easily get to the function address, especially now with CONFIG_X86_KERNEL_IBT support. If the user cares about the current IP for kprobes attached within the function body, they can get it with PT_REGS_IP(ctx). Suggested-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Acked-by: Martynas Pumputis Signed-off-by: Jiri Olsa Link: https://lore.kernel.org/r/20220926153340.1621984-6-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- tools/include/uapi/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'tools/include') diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index ead35f39f185..d6bd10759eaf 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -4951,6 +4951,7 @@ union bpf_attr { * Get address of the traced function (for tracing and kprobe programs). * Return * Address of the traced function. + * 0 for kprobes placed within the function (not at the entry). * * u64 bpf_get_attach_cookie(void *ctx) * Description -- cgit v1.2.3
From 73dfe93ea1b319482e6d82a54fe06f953ceeeccb Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Thu, 22 Sep 2022 20:41:40 +0200 Subject: headers: Remove some left-over license text Remove some left-over license text from commit e2be04c7f995 ("License cleanup: add SPDX license identifier to uapi header files with a license"). When the SPDX-License-Identifier tag was added, the corresponding license text was not removed. Signed-off-by: Christophe JAILLET Acked-by: Alexander Duyck Reviewed-by: Jiri Pirko Acked-by: Jamal Hadi Salim Link: https://lore.kernel.org/r/88410cddd31197ea26840d7dd71612bece8c6acf.1663871981.git.christophe.jaillet@wanadoo.fr Signed-off-by: Jakub Kicinski --- tools/include/uapi/linux/tc_act/tc_bpf.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'tools/include') diff --git a/tools/include/uapi/linux/tc_act/tc_bpf.h b/tools/include/uapi/linux/tc_act/tc_bpf.h index 653c4f94f76e..fe6c8f8f3e8c 100644 --- a/tools/include/uapi/linux/tc_act/tc_bpf.h +++ b/tools/include/uapi/linux/tc_act/tc_bpf.h @@ -1,11 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */ /* * Copyright (c) 2015 Jiri Pirko - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. */ #ifndef __LINUX_TC_BPF_H -- cgit v1.2.3
From f0d74c4da1f060d2a66976193712a5e6abd361f5 Mon Sep 17 00:00:00 2001 From: Kui-Feng Lee Date: Mon, 26 Sep 2022 11:49:53 -0700 Subject: bpf: Parameterize task iterators. Allow creating an iterator that loops through the resources of one thread/process.
Previously, people could only create iterators to loop through all the files, vmas, and tasks in the system, even though they were interested in only the resources of a specific task or process. By passing the additional parameters, people can now create an iterator that goes through all resources or only the resources of one task. Signed-off-by: Kui-Feng Lee Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20220926184957.208194-2-kuifeng@fb.com --- tools/include/uapi/linux/bpf.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'tools/include') diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index d6bd10759eaf..455b21a53aac 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -110,6 +110,12 @@ union bpf_iter_link_info { __u32 cgroup_fd; __u64 cgroup_id; } cgroup; + /* Parameters of task iterators. */ + struct { + __u32 tid; + __u32 pid; + __u32 pid_fd; + } task; }; /* BPF syscall commands, see bpf(2) man-page for more details. */ -- cgit v1.2.3
From 21fb6f2aa3890b0d0abf88b7756d0098e9367a7c Mon Sep 17 00:00:00 2001 From: Kui-Feng Lee Date: Mon, 26 Sep 2022 11:49:54 -0700 Subject: bpf: Handle bpf_link_info for the parameterized task BPF iterators. Add new fields to bpf_link_info so that users can query them through bpf_obj_get_info_by_fd(). Signed-off-by: Kui-Feng Lee Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20220926184957.208194-3-kuifeng@fb.com --- tools/include/uapi/linux/bpf.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'tools/include') diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 455b21a53aac..3075018a4ef8 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -6265,6 +6265,10 @@ struct bpf_link_info { __u64 cgroup_id; __u32 order; } cgroup; + struct { + __u32 tid; + __u32 pid; + } task; }; } iter; struct { -- cgit v1.2.3
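To show how the new parameters are intended to be consumed, here is a hedged user-space sketch using libbpf's existing iterator attach API; the function name create_task_iter is hypothetical, error handling is trimmed, and it assumes at most one of tid, pid, and pid_fd may be set at a time:

    #include <bpf/bpf.h>
    #include <bpf/libbpf.h>

    /* "prog" is an already-loaded iter/task (or task_file/task_vma) program. */
    static int create_task_iter(struct bpf_program *prog, int target_pid)
    {
            union bpf_iter_link_info linfo = {};
            LIBBPF_OPTS(bpf_iter_attach_opts, opts);
            struct bpf_link *link;

            linfo.task.pid = target_pid; /* visit only this process's tasks */
            opts.link_info = &linfo;
            opts.link_info_len = sizeof(linfo);

            link = bpf_program__attach_iter(prog, &opts);
            if (!link)
                    return -1;

            /* read(2) on the returned fd runs the iterator program */
            return bpf_iter_create(bpf_link__fd(link));
    }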