From 980737282232b752bb14dab96d77665c15889c36 Mon Sep 17 00:00:00 2001 From: Alexey Budankov Date: Thu, 2 Apr 2020 11:45:31 +0300 Subject: capabilities: Introduce CAP_PERFMON to kernel and user space Introduce the CAP_PERFMON capability designed to secure system performance monitoring and observability operations so that CAP_PERFMON can assist CAP_SYS_ADMIN capability in its governing role for performance monitoring and observability subsystems. CAP_PERFMON hardens system security and integrity during performance monitoring and observability operations by decreasing attack surface that is available to a CAP_SYS_ADMIN privileged process [2]. Providing the access to system performance monitoring and observability operations under CAP_PERFMON capability singly, without the rest of CAP_SYS_ADMIN credentials, excludes chances to misuse the credentials and makes the operation more secure. Thus, CAP_PERFMON implements the principle of least privilege for performance monitoring and observability operations (POSIX IEEE 1003.1e: 2.2.2.39 principle of least privilege: A security design principle that states that a process or program be granted only those privileges (e.g., capabilities) necessary to accomplish its legitimate function, and only for the time that such privileges are actually required) CAP_PERFMON meets the demand to secure system performance monitoring and observability operations for adoption in security sensitive, restricted, multiuser production environments (e.g. HPC clusters, cloud and virtual compute environments), where root or CAP_SYS_ADMIN credentials are not available to mass users of a system, and securely unblocks applicability and scalability of system performance monitoring and observability operations beyond root and CAP_SYS_ADMIN use cases. CAP_PERFMON takes over CAP_SYS_ADMIN credentials related to system performance monitoring and observability operations and balances amount of CAP_SYS_ADMIN credentials following the recommendations in the capabilities man page [1] for CAP_SYS_ADMIN: "Note: this capability is overloaded; see Notes to kernel developers, below." For backward compatibility reasons access to system performance monitoring and observability subsystems of the kernel remains open for CAP_SYS_ADMIN privileged processes but CAP_SYS_ADMIN capability usage for secure system performance monitoring and observability operations is discouraged with respect to the designed CAP_PERFMON capability. Although the software running under CAP_PERFMON can not ensure avoidance of related hardware issues, the software can still mitigate these issues following the official hardware issues mitigation procedure [2]. The bugs in the software itself can be fixed following the standard kernel development process [3] to maintain and harden security of system performance monitoring and observability operations. [1] http://man7.org/linux/man-pages/man7/capabilities.7.html [2] https://www.kernel.org/doc/html/latest/process/embargoed-hardware-issues.html [3] https://www.kernel.org/doc/html/latest/admin-guide/security-bugs.html Signed-off-by: Alexey Budankov Acked-by: James Morris Acked-by: Serge E. Hallyn Acked-by: Song Liu Acked-by: Stephen Smalley Tested-by: Arnaldo Carvalho de Melo Cc: Alexei Starovoitov Cc: Andi Kleen Cc: Igor Lubashev Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: intel-gfx@lists.freedesktop.org Cc: linux-doc@vger.kernel.org Cc: linux-man@vger.kernel.org Cc: linux-security-module@vger.kernel.org Cc: selinux@vger.kernel.org Link: http://lore.kernel.org/lkml/5590d543-82c6-490a-6544-08e6a5517db0@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- include/linux/capability.h | 4 ++++ include/uapi/linux/capability.h | 8 +++++++- 2 files changed, 11 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/capability.h b/include/linux/capability.h index ecce0f43c73a..027d7e4a853b 100644 --- a/include/linux/capability.h +++ b/include/linux/capability.h @@ -251,6 +251,10 @@ extern bool privileged_wrt_inode_uidgid(struct user_namespace *ns, const struct extern bool capable_wrt_inode_uidgid(const struct inode *inode, int cap); extern bool file_ns_capable(const struct file *file, struct user_namespace *ns, int cap); extern bool ptracer_capable(struct task_struct *tsk, struct user_namespace *ns); +static inline bool perfmon_capable(void) +{ + return capable(CAP_PERFMON) || capable(CAP_SYS_ADMIN); +} /* audit system wants to get cap info from files as well */ extern int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data *cpu_caps); diff --git a/include/uapi/linux/capability.h b/include/uapi/linux/capability.h index 272dc69fa080..e58c9636741b 100644 --- a/include/uapi/linux/capability.h +++ b/include/uapi/linux/capability.h @@ -367,8 +367,14 @@ struct vfs_ns_cap_data { #define CAP_AUDIT_READ 37 +/* + * Allow system performance and observability privileged operations + * using perf_events, i915_perf and other kernel subsystems + */ + +#define CAP_PERFMON 38 -#define CAP_LAST_CAP CAP_AUDIT_READ +#define CAP_LAST_CAP CAP_PERFMON #define cap_valid(x) ((x) >= 0 && (x) <= CAP_LAST_CAP) -- cgit v1.2.3 From d26c0cc53950464a24adfa76867f1d71f0cbbea6 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 30 Apr 2020 23:30:47 +0200 Subject: bpf: Avoid gcc-10 stringop-overflow warning in struct bpf_prog gcc-10 warns about accesses to zero-length arrays: kernel/bpf/core.c: In function 'bpf_patch_insn_single': cc1: warning: writing 8 bytes into a region of size 0 [-Wstringop-overflow=] In file included from kernel/bpf/core.c:21: include/linux/filter.h:550:20: note: at offset 0 to object 'insnsi' with size 0 declared here 550 | struct bpf_insn insnsi[0]; | ^~~~~~ In this case, we really want to have two flexible-array members, but that is not possible. Removing the union to make insnsi a flexible-array member while leaving insns as a zero-length array fixes the warning, as nothing writes to the other one in that way. This trick only works on linux-3.18 or higher, as older versions had additional members in the union. Fixes: 60a3b2253c41 ("net: bpf: make eBPF interpreter images read-only") Signed-off-by: Arnd Bergmann Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200430213101.135134-6-arnd@arndb.de --- include/linux/filter.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/filter.h b/include/linux/filter.h index af37318bb1c5..73d06a39e2d6 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -545,10 +545,8 @@ struct bpf_prog { unsigned int (*bpf_func)(const void *ctx, const struct bpf_insn *insn); /* Instructions for interpreter */ - union { - struct sock_filter insns[0]; - struct bpf_insn insnsi[0]; - }; + struct sock_filter insns[0]; + struct bpf_insn insnsi[]; }; struct sk_filter { -- cgit v1.2.3 From e4e5aefc113510c03d34e182ab30bc0cc196675c Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Mon, 4 May 2020 15:33:51 +0200 Subject: xsk: Change two variable names for increased clarity Change two variables names so that it is clearer what they represent. The first one is xsk_list that in fact only contains the list of AF_XDP sockets with a Tx component. Change this to xsk_tx_list for improved clarity. The second variable is size in the ring structure. One might think that this is the size of the ring, but it is in fact the size of the umem, copied into the ring structure to improve performance. Rename this variable umem_size to avoid any confusion. Signed-off-by: Magnus Karlsson Signed-off-by: Daniel Borkmann Acked-by: Jonathan Lemon Link: https://lore.kernel.org/bpf/1588599232-24897-2-git-send-email-magnus.karlsson@intel.com --- include/net/xdp_sock.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index e86ec48ef627..b72f1f4c3b15 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -62,8 +62,8 @@ struct xdp_umem { struct net_device *dev; struct xdp_umem_fq_reuse *fq_reuse; bool zc; - spinlock_t xsk_list_lock; - struct list_head xsk_list; + spinlock_t xsk_tx_list_lock; + struct list_head xsk_tx_list; }; /* Nodes are linked in the struct xdp_sock map_list field, and used to -- cgit v1.2.3 From 07bf2d97d1f37e7ac8d7be2d84ff108d43556a1d Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Mon, 4 May 2020 15:33:52 +0200 Subject: xsk: Remove unnecessary member in xdp_umem Remove the unnecessary member of address in struct xdp_umem as it is only used during the umem registration. No need to carry this around as it is not used during run-time nor when unregistering the umem. Signed-off-by: Magnus Karlsson Signed-off-by: Daniel Borkmann Acked-by: Jonathan Lemon Link: https://lore.kernel.org/bpf/1588599232-24897-3-git-send-email-magnus.karlsson@intel.com --- include/net/xdp_sock.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index b72f1f4c3b15..67191ccaab85 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -50,7 +50,6 @@ struct xdp_umem { u32 headroom; u32 chunk_size_nohr; struct user_struct *user; - unsigned long address; refcount_t users; struct work_struct work; struct page **pgs; -- cgit v1.2.3 From cb0721c7e200750907bb8ef59b12646a5cb2dadf Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Fri, 8 May 2020 10:46:10 -0700 Subject: net: Refactor arguments of inet{,6}_bind The intent is to add an additional bind parameter in the next commit. Instead of adding another argument, let's convert all existing flag arguments into an extendable bit field. No functional changes. Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann Acked-by: Andrey Ignatov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20200508174611.228805-4-sdf@google.com --- include/net/inet_common.h | 6 +++++- include/net/ipv6_stubs.h | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/inet_common.h b/include/net/inet_common.h index ae2ba897675c..c38f4f7d660a 100644 --- a/include/net/inet_common.h +++ b/include/net/inet_common.h @@ -35,8 +35,12 @@ int inet_shutdown(struct socket *sock, int how); int inet_listen(struct socket *sock, int backlog); void inet_sock_destruct(struct sock *sk); int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len); +/* Don't allocate port at this moment, defer to connect. */ +#define BIND_FORCE_ADDRESS_NO_PORT (1 << 0) +/* Grab and release socket lock. */ +#define BIND_WITH_LOCK (1 << 1) int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, - bool force_bind_address_no_port, bool with_lock); + u32 flags); int inet_getname(struct socket *sock, struct sockaddr *uaddr, int peer); int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg); diff --git a/include/net/ipv6_stubs.h b/include/net/ipv6_stubs.h index a5f7c12c326a..6e622dd3122e 100644 --- a/include/net/ipv6_stubs.h +++ b/include/net/ipv6_stubs.h @@ -63,7 +63,7 @@ extern const struct ipv6_stub *ipv6_stub __read_mostly; /* A stub used by bpf helpers. Similarly ugly as ipv6_stub */ struct ipv6_bpf_stub { int (*inet6_bind)(struct sock *sk, struct sockaddr *uaddr, int addr_len, - bool force_bind_address_no_port, bool with_lock); + u32 flags); struct sock *(*udp6_lib_lookup)(struct net *net, const struct in6_addr *saddr, __be16 sport, const struct in6_addr *daddr, __be16 dport, -- cgit v1.2.3 From 8086fbaf49345f988deec539ec8e182b02914401 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Fri, 8 May 2020 10:46:11 -0700 Subject: bpf: Allow any port in bpf_bind helper We want to have a tighter control on what ports we bind to in the BPF_CGROUP_INET{4,6}_CONNECT hooks even if it means connect() becomes slightly more expensive. The expensive part comes from the fact that we now need to call inet_csk_get_port() that verifies that the port is not used and allocates an entry in the hash table for it. Since we can't rely on "snum || !bind_address_no_port" to prevent us from calling POST_BIND hook anymore, let's add another bind flag to indicate that the call site is BPF program. v5: * fix wrong AF_INET (should be AF_INET6) in the bpf program for v6 v3: * More bpf_bind documentation refinements (Martin KaFai Lau) * Add UDP tests as well (Martin KaFai Lau) * Don't start the thread, just do socket+bind+listen (Martin KaFai Lau) v2: * Update documentation (Andrey Ignatov) * Pass BIND_FORCE_ADDRESS_NO_PORT conditionally (Andrey Ignatov) Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann Acked-by: Andrey Ignatov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20200508174611.228805-5-sdf@google.com --- include/net/inet_common.h | 2 ++ include/uapi/linux/bpf.h | 9 +++++---- 2 files changed, 7 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/net/inet_common.h b/include/net/inet_common.h index c38f4f7d660a..cb2818862919 100644 --- a/include/net/inet_common.h +++ b/include/net/inet_common.h @@ -39,6 +39,8 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len); #define BIND_FORCE_ADDRESS_NO_PORT (1 << 0) /* Grab and release socket lock. */ #define BIND_WITH_LOCK (1 << 1) +/* Called from BPF program. */ +#define BIND_FROM_BPF (1 << 2) int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, u32 flags); int inet_getname(struct socket *sock, struct sockaddr *uaddr, diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index b3643e27e264..6e5e7caa3739 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1994,10 +1994,11 @@ union bpf_attr { * * This helper works for IPv4 and IPv6, TCP and UDP sockets. The * domain (*addr*\ **->sa_family**) must be **AF_INET** (or - * **AF_INET6**). Looking for a free port to bind to can be - * expensive, therefore binding to port is not permitted by the - * helper: *addr*\ **->sin_port** (or **sin6_port**, respectively) - * must be set to zero. + * **AF_INET6**). It's advised to pass zero port (**sin_port** + * or **sin6_port**) which triggers IP_BIND_ADDRESS_NO_PORT-like + * behavior and lets the kernel efficiently pick up an unused + * port as long as 4-tuple is unique. Passing non-zero port might + * lead to degraded performance. * Return * 0 on success, or a negative error in case of failure. * -- cgit v1.2.3 From ae24345da54e452880808b011fa2d8a0bbd191ba Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 9 May 2020 10:58:59 -0700 Subject: bpf: Implement an interface to register bpf_iter targets The target can call bpf_iter_reg_target() to register itself. The needed information: target: target name seq_ops: the seq_file operations for the target init_seq_private target callback to initialize seq_priv during file open fini_seq_private target callback to clean up seq_priv during file release seq_priv_size: the private_data size needed by the seq_file operations The target name represents a target which provides a seq_ops for iterating objects. The target can provide two callback functions, init_seq_private and fini_seq_private, called during file open/release time. For example, /proc/net/{tcp6, ipv6_route, netlink, ...}, net name space needs to be setup properly during file open and released properly during file release. Function bpf_iter_unreg_target() is also implemented to unregister a particular target. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200509175859.2474669-1-yhs@fb.com --- include/linux/bpf.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 1262ec460ab3..40c78b86fe38 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -31,6 +31,7 @@ struct seq_file; struct btf; struct btf_type; struct exception_table_entry; +struct seq_operations; extern struct idr btf_idr; extern spinlock_t btf_idr_lock; @@ -1126,6 +1127,20 @@ struct bpf_link *bpf_link_get_from_fd(u32 ufd); int bpf_obj_pin_user(u32 ufd, const char __user *pathname); int bpf_obj_get_user(const char __user *pathname, int flags); +typedef int (*bpf_iter_init_seq_priv_t)(void *private_data); +typedef void (*bpf_iter_fini_seq_priv_t)(void *private_data); + +struct bpf_iter_reg { + const char *target; + const struct seq_operations *seq_ops; + bpf_iter_init_seq_priv_t init_seq_private; + bpf_iter_fini_seq_priv_t fini_seq_private; + u32 seq_priv_size; +}; + +int bpf_iter_reg_target(struct bpf_iter_reg *reg_info); +void bpf_iter_unreg_target(const char *target); + int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value); int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value); int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value, -- cgit v1.2.3 From 15d83c4d7cef5c067a8b075ce59e97df4f60706e Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 9 May 2020 10:59:00 -0700 Subject: bpf: Allow loading of a bpf_iter program A bpf_iter program is a tracing program with attach type BPF_TRACE_ITER. The load attribute attach_btf_id is used by the verifier against a particular kernel function, which represents a target, e.g., __bpf_iter__bpf_map for target bpf_map which is implemented later. The program return value must be 0 or 1 for now. 0 : successful, except potential seq_file buffer overflow which is handled by seq_file reader. 1 : request to restart the same object In the future, other return values may be used for filtering or teminating the iterator. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200509175900.2474947-1-yhs@fb.com --- include/linux/bpf.h | 3 +++ include/uapi/linux/bpf.h | 1 + 2 files changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 40c78b86fe38..f28bdd714754 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1127,6 +1127,8 @@ struct bpf_link *bpf_link_get_from_fd(u32 ufd); int bpf_obj_pin_user(u32 ufd, const char __user *pathname); int bpf_obj_get_user(const char __user *pathname, int flags); +#define BPF_ITER_FUNC_PREFIX "__bpf_iter__" + typedef int (*bpf_iter_init_seq_priv_t)(void *private_data); typedef void (*bpf_iter_fini_seq_priv_t)(void *private_data); @@ -1140,6 +1142,7 @@ struct bpf_iter_reg { int bpf_iter_reg_target(struct bpf_iter_reg *reg_info); void bpf_iter_unreg_target(const char *target); +bool bpf_iter_prog_supported(struct bpf_prog *prog); int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value); int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 6e5e7caa3739..c8a5325cc8d0 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -218,6 +218,7 @@ enum bpf_attach_type { BPF_TRACE_FEXIT, BPF_MODIFY_RETURN, BPF_LSM_MAC, + BPF_TRACE_ITER, __MAX_BPF_ATTACH_TYPE }; -- cgit v1.2.3 From de4e05cac46d206f9090051ef09930514bff73e4 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 9 May 2020 10:59:01 -0700 Subject: bpf: Support bpf tracing/iter programs for BPF_LINK_CREATE Given a bpf program, the step to create an anonymous bpf iterator is: - create a bpf_iter_link, which combines bpf program and the target. In the future, there could be more information recorded in the link. A link_fd will be returned to the user space. - create an anonymous bpf iterator with the given link_fd. The bpf_iter_link can be pinned to bpffs mount file system to create a file based bpf iterator as well. The benefit to use of bpf_iter_link: - using bpf link simplifies design and implementation as bpf link is used for other tracing bpf programs. - for file based bpf iterator, bpf_iter_link provides a standard way to replace underlying bpf programs. - for both anonymous and free based iterators, bpf link query capability can be leveraged. The patch added support of tracing/iter programs for BPF_LINK_CREATE. A new link type BPF_LINK_TYPE_ITER is added to facilitate link querying. Currently, only prog_id is needed, so there is no additional in-kernel show_fdinfo() and fill_link_info() hook is needed for BPF_LINK_TYPE_ITER link. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200509175901.2475084-1-yhs@fb.com --- include/linux/bpf.h | 1 + include/linux/bpf_types.h | 1 + include/uapi/linux/bpf.h | 1 + 3 files changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index f28bdd714754..e93d2d33c82c 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1143,6 +1143,7 @@ struct bpf_iter_reg { int bpf_iter_reg_target(struct bpf_iter_reg *reg_info); void bpf_iter_unreg_target(const char *target); bool bpf_iter_prog_supported(struct bpf_prog *prog); +int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog); int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value); int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value); diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 8345cdf553b8..29d22752fc87 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -124,3 +124,4 @@ BPF_LINK_TYPE(BPF_LINK_TYPE_TRACING, tracing) #ifdef CONFIG_CGROUP_BPF BPF_LINK_TYPE(BPF_LINK_TYPE_CGROUP, cgroup) #endif +BPF_LINK_TYPE(BPF_LINK_TYPE_ITER, iter) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index c8a5325cc8d0..1e8dfff5d5d4 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -229,6 +229,7 @@ enum bpf_link_type { BPF_LINK_TYPE_RAW_TRACEPOINT = 1, BPF_LINK_TYPE_TRACING = 2, BPF_LINK_TYPE_CGROUP = 3, + BPF_LINK_TYPE_ITER = 4, MAX_BPF_LINK_TYPE, }; -- cgit v1.2.3 From ac51d99bf81caac8d8881fe52098948110d0de68 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 9 May 2020 10:59:05 -0700 Subject: bpf: Create anonymous bpf iterator A new bpf command BPF_ITER_CREATE is added. The anonymous bpf iterator is seq_file based. The seq_file private data are referenced by targets. The bpf_iter infrastructure allocated additional space at seq_file->private before the space used by targets to store some meta data, e.g., prog: prog to run session_id: an unique id for each opened seq_file seq_num: how many times bpf programs are queried in this session done_stop: an internal state to decide whether bpf program should be called in seq_ops->stop() or not The seq_num will start from 0 for valid objects. The bpf program may see the same seq_num more than once if - seq_file buffer overflow happens and the same object is retried by bpf_seq_read(), or - the bpf program explicitly requests a retry of the same object Since module is not supported for bpf_iter, all target registeration happens at __init time, so there is no need to change bpf_iter_unreg_target() as it is used mostly in error path of the init function at which time no bpf iterators have been created yet. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200509175905.2475770-1-yhs@fb.com --- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 6 ++++++ 2 files changed, 7 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index e93d2d33c82c..80b1b9d8a638 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1144,6 +1144,7 @@ int bpf_iter_reg_target(struct bpf_iter_reg *reg_info); void bpf_iter_unreg_target(const char *target); bool bpf_iter_prog_supported(struct bpf_prog *prog); int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog); +int bpf_iter_new_fd(struct bpf_link *link); int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value); int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 1e8dfff5d5d4..708763f702e1 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -116,6 +116,7 @@ enum bpf_cmd { BPF_LINK_GET_FD_BY_ID, BPF_LINK_GET_NEXT_ID, BPF_ENABLE_STATS, + BPF_ITER_CREATE, }; enum bpf_map_type { @@ -614,6 +615,11 @@ union bpf_attr { __u32 type; } enable_stats; + struct { /* struct used by BPF_ITER_CREATE command */ + __u32 link_fd; + __u32 flags; + } iter_create; + } __attribute__((aligned(8))); /* The description below is an attempt at providing documentation to eBPF -- cgit v1.2.3 From 367ec3e4834cbd611401c2c40a23c22c825474f1 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 9 May 2020 10:59:06 -0700 Subject: bpf: Create file bpf iterator To produce a file bpf iterator, the fd must be corresponding to a link_fd assocciated with a trace/iter program. When the pinned file is opened, a seq_file will be generated. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200509175906.2475893-1-yhs@fb.com --- include/linux/bpf.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 80b1b9d8a638..b06653ab3476 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1022,6 +1022,7 @@ static inline void bpf_enable_instrumentation(void) extern const struct file_operations bpf_map_fops; extern const struct file_operations bpf_prog_fops; +extern const struct file_operations bpf_iter_fops; #define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \ extern const struct bpf_prog_ops _name ## _prog_ops; \ @@ -1145,6 +1146,7 @@ void bpf_iter_unreg_target(const char *target); bool bpf_iter_prog_supported(struct bpf_prog *prog); int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog); int bpf_iter_new_fd(struct bpf_link *link); +bool bpf_link_is_iter(struct bpf_link *link); int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value); int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value); -- cgit v1.2.3 From e5158d987b72c3f318b4b52a01ac6f3997bd0c00 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 9 May 2020 10:59:07 -0700 Subject: bpf: Implement common macros/helpers for target iterators Macro DEFINE_BPF_ITER_FUNC is implemented so target can define an init function to capture the BTF type which represents the target. The bpf_iter_meta is a structure holding meta data, common to all targets in the bpf program. Additional marker functions are called before or after bpf_seq_read() show()/next()/stop() callback functions to help calculate precise seq_num and whether call bpf_prog inside stop(). Two functions, bpf_iter_get_info() and bpf_iter_run_prog(), are implemented so target can get needed information from bpf_iter infrastructure and can run the program. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200509175907.2475956-1-yhs@fb.com --- include/linux/bpf.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index b06653ab3476..ffe0b9b669bf 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1129,6 +1129,9 @@ int bpf_obj_pin_user(u32 ufd, const char __user *pathname); int bpf_obj_get_user(const char __user *pathname, int flags); #define BPF_ITER_FUNC_PREFIX "__bpf_iter__" +#define DEFINE_BPF_ITER_FUNC(target, args...) \ + extern int __bpf_iter__ ## target(args); \ + int __init __bpf_iter__ ## target(args) { return 0; } typedef int (*bpf_iter_init_seq_priv_t)(void *private_data); typedef void (*bpf_iter_fini_seq_priv_t)(void *private_data); @@ -1141,12 +1144,20 @@ struct bpf_iter_reg { u32 seq_priv_size; }; +struct bpf_iter_meta { + __bpf_md_ptr(struct seq_file *, seq); + u64 session_id; + u64 seq_num; +}; + int bpf_iter_reg_target(struct bpf_iter_reg *reg_info); void bpf_iter_unreg_target(const char *target); bool bpf_iter_prog_supported(struct bpf_prog *prog); int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog); int bpf_iter_new_fd(struct bpf_link *link); bool bpf_link_is_iter(struct bpf_link *link); +struct bpf_prog *bpf_iter_get_info(struct bpf_iter_meta *meta, bool in_stop); +int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx); int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value); int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value); -- cgit v1.2.3 From 6086d29def80edd78f9832ea6eafa74e3818f6a7 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 9 May 2020 10:59:09 -0700 Subject: bpf: Add bpf_map iterator Implement seq_file operations to traverse all bpf_maps. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200509175909.2476096-1-yhs@fb.com --- include/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index ffe0b9b669bf..363ab0751967 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1082,6 +1082,7 @@ int generic_map_update_batch(struct bpf_map *map, int generic_map_delete_batch(struct bpf_map *map, const union bpf_attr *attr, union bpf_attr __user *uattr); +struct bpf_map *bpf_map_get_curr_or_next(u32 *id); extern int sysctl_unprivileged_bpf_disabled; -- cgit v1.2.3 From 138d0be35b141e09f6b267c6ae4094318d4e4491 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 9 May 2020 10:59:10 -0700 Subject: net: bpf: Add netlink and ipv6_route bpf_iter targets This patch added netlink and ipv6_route targets, using the same seq_ops (except show() and minor changes for stop()) for /proc/net/{netlink,ipv6_route}. The net namespace for these targets are the current net namespace at file open stage, similar to /proc/net/{netlink,ipv6_route} reference counting the net namespace at seq_file open stage. Since module is not supported for now, ipv6_route is supported only if the IPV6 is built-in, i.e., not compiled as a module. The restriction can be lifted once module is properly supported for bpf_iter. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200509175910.2476329-1-yhs@fb.com --- include/linux/proc_fs.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h index 45c05fd9c99d..03953c59807d 100644 --- a/include/linux/proc_fs.h +++ b/include/linux/proc_fs.h @@ -105,6 +105,9 @@ struct proc_dir_entry *proc_create_net_single_write(const char *name, umode_t mo void *data); extern struct pid *tgid_pidfd_to_pid(const struct file *file); +extern int bpf_iter_init_seq_net(void *priv_data); +extern void bpf_iter_fini_seq_net(void *priv_data); + #ifdef CONFIG_PROC_PID_ARCH_STATUS /* * The architecture which selects CONFIG_PROC_PID_ARCH_STATUS must -- cgit v1.2.3 From b121b341e5983bdccf7a5d6cf9236a45c965a31f Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 9 May 2020 10:59:12 -0700 Subject: bpf: Add PTR_TO_BTF_ID_OR_NULL support Add bpf_reg_type PTR_TO_BTF_ID_OR_NULL support. For tracing/iter program, the bpf program context definition, e.g., for previous bpf_map target, looks like struct bpf_iter__bpf_map { struct bpf_iter_meta *meta; struct bpf_map *map; }; The kernel guarantees that meta is not NULL, but map pointer maybe NULL. The NULL map indicates that all objects have been traversed, so bpf program can take proper action, e.g., do final aggregation and/or send final report to user space. Add btf_id_or_null_non0_off to prog->aux structure, to indicate that if the context access offset is not 0, set to PTR_TO_BTF_ID_OR_NULL instead of PTR_TO_BTF_ID. This bit is set for tracing/iter program. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200509175912.2476576-1-yhs@fb.com --- include/linux/bpf.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 363ab0751967..cf4b6e44f2bc 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -320,6 +320,7 @@ enum bpf_reg_type { PTR_TO_TP_BUFFER, /* reg points to a writable raw tp's buffer */ PTR_TO_XDP_SOCK, /* reg points to struct xdp_sock */ PTR_TO_BTF_ID, /* reg points to kernel struct */ + PTR_TO_BTF_ID_OR_NULL, /* reg points to kernel struct or NULL */ }; /* The information passed from prog-specific *_is_valid_access @@ -658,6 +659,7 @@ struct bpf_prog_aux { bool offload_requested; bool attach_btf_trace; /* true if attaching to BTF-enabled raw tp */ bool func_proto_unreliable; + bool btf_id_or_null_non0_off; enum bpf_tramp_prog_type trampoline_prog_type; struct bpf_trampoline *trampoline; struct hlist_node tramp_hlist; -- cgit v1.2.3 From 492e639f0c222784e2e0f121966375f641c61b15 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 9 May 2020 10:59:14 -0700 Subject: bpf: Add bpf_seq_printf and bpf_seq_write helpers Two helpers bpf_seq_printf and bpf_seq_write, are added for writing data to the seq_file buffer. bpf_seq_printf supports common format string flag/width/type fields so at least I can get identical results for netlink and ipv6_route targets. For bpf_seq_printf and bpf_seq_write, return value -EOVERFLOW specifically indicates a write failure due to overflow, which means the object will be repeated in the next bpf invocation if object collection stays the same. Note that if the object collection is changed, depending how collection traversal is done, even if the object still in the collection, it may not be visited. For bpf_seq_printf, format %s, %p{i,I}{4,6} needs to read kernel memory. Reading kernel memory may fail in the following two cases: - invalid kernel address, or - valid kernel address but requiring a major fault If reading kernel memory failed, the %s string will be an empty string and %p{i,I}{4,6} will be all 0. Not returning error to bpf program is consistent with what bpf_trace_printk() does for now. bpf_seq_printf may return -EBUSY meaning that internal percpu buffer for memory copy of strings or other pointees is not available. Bpf program can return 1 to indicate it wants the same object to be repeated. Right now, this should not happen on no-RT kernels since migrate_disable(), which guards bpf prog call, calls preempt_disable(). Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200509175914.2476661-1-yhs@fb.com --- include/uapi/linux/bpf.h | 39 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 38 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 708763f702e1..9d1932e23cec 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3077,6 +3077,41 @@ union bpf_attr { * See: clock_gettime(CLOCK_BOOTTIME) * Return * Current *ktime*. + * + * int bpf_seq_printf(struct seq_file *m, const char *fmt, u32 fmt_size, const void *data, u32 data_len) + * Description + * seq_printf uses seq_file seq_printf() to print out the format string. + * The *m* represents the seq_file. The *fmt* and *fmt_size* are for + * the format string itself. The *data* and *data_len* are format string + * arguments. The *data* are a u64 array and corresponding format string + * values are stored in the array. For strings and pointers where pointees + * are accessed, only the pointer values are stored in the *data* array. + * The *data_len* is the *data* size in term of bytes. + * + * Formats **%s**, **%p{i,I}{4,6}** requires to read kernel memory. + * Reading kernel memory may fail due to either invalid address or + * valid address but requiring a major memory fault. If reading kernel memory + * fails, the string for **%s** will be an empty string, and the ip + * address for **%p{i,I}{4,6}** will be 0. Not returning error to + * bpf program is consistent with what bpf_trace_printk() does for now. + * Return + * 0 on success, or a negative errno in case of failure. + * + * * **-EBUSY** Percpu memory copy buffer is busy, can try again + * by returning 1 from bpf program. + * * **-EINVAL** Invalid arguments, or invalid/unsupported formats. + * * **-E2BIG** Too many format specifiers. + * * **-EOVERFLOW** Overflow happens, the same object will be tried again. + * + * int bpf_seq_write(struct seq_file *m, const void *data, u32 len) + * Description + * seq_write uses seq_file seq_write() to write the data. + * The *m* represents the seq_file. The *data* and *len* represent the + * data to write in bytes. + * Return + * 0 on success, or a negative errno in case of failure. + * + * * **-EOVERFLOW** Overflow happens, the same object will be tried again. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3204,7 +3239,9 @@ union bpf_attr { FN(get_netns_cookie), \ FN(get_current_ancestor_cgroup_id), \ FN(sk_assign), \ - FN(ktime_get_boot_ns), + FN(ktime_get_boot_ns), \ + FN(seq_printf), \ + FN(seq_write), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call -- cgit v1.2.3 From ab8d78093dfa2e7820ca0c28dda9142aa771c510 Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Mon, 11 May 2020 17:15:35 +0100 Subject: bpf: Minor fixes to BPF helpers documentation Minor improvements to the documentation for BPF helpers: * Fix formatting for the description of "bpf_socket" for bpf_getsockopt() and bpf_setsockopt(), thus suppressing two warnings from rst2man about "Unexpected indentation". * Fix formatting for return values for bpf_sk_assign() and seq_file helpers. * Fix and harmonise formatting, in particular for function/struct names. * Remove blank lines before "Return:" sections. * Replace tabs found in the middle of text lines. * Fix typos. * Add a note to the footer (in Python script) about "bpftool feature probe", including for listing features available to unprivileged users, and add a reference to bpftool man page. Thanks to Florian for reporting two typos (duplicated words). Signed-off-by: Quentin Monnet Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200511161536.29853-4-quentin@isovalent.com --- include/uapi/linux/bpf.h | 109 +++++++++++++++++++++++++---------------------- 1 file changed, 59 insertions(+), 50 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 9d1932e23cec..bfb31c1be219 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -675,8 +675,8 @@ union bpf_attr { * For tracing programs, safely attempt to read *size* bytes from * kernel space address *unsafe_ptr* and store the data in *dst*. * - * Generally, use bpf_probe_read_user() or bpf_probe_read_kernel() - * instead. + * Generally, use **bpf_probe_read_user**\ () or + * **bpf_probe_read_kernel**\ () instead. * Return * 0 on success, or a negative error in case of failure. * @@ -684,7 +684,7 @@ union bpf_attr { * Description * Return the time elapsed since system boot, in nanoseconds. * Does not include time the system was suspended. - * See: clock_gettime(CLOCK_MONOTONIC) + * See: **clock_gettime**\ (**CLOCK_MONOTONIC**) * Return * Current *ktime*. * @@ -1543,11 +1543,11 @@ union bpf_attr { * int bpf_probe_read_str(void *dst, u32 size, const void *unsafe_ptr) * Description * Copy a NUL terminated string from an unsafe kernel address - * *unsafe_ptr* to *dst*. See bpf_probe_read_kernel_str() for + * *unsafe_ptr* to *dst*. See **bpf_probe_read_kernel_str**\ () for * more details. * - * Generally, use bpf_probe_read_user_str() or bpf_probe_read_kernel_str() - * instead. + * Generally, use **bpf_probe_read_user_str**\ () or + * **bpf_probe_read_kernel_str**\ () instead. * Return * On success, the strictly positive length of the string, * including the trailing NUL character. On error, a negative @@ -1575,7 +1575,7 @@ union bpf_attr { * * u64 bpf_get_socket_cookie(struct bpf_sock_ops *ctx) * Description - * Equivalent to bpf_get_socket_cookie() helper that accepts + * Equivalent to **bpf_get_socket_cookie**\ () helper that accepts * *skb*, but gets socket from **struct bpf_sock_ops** context. * Return * A 8-byte long non-decreasing number. @@ -1604,6 +1604,7 @@ union bpf_attr { * The option value of length *optlen* is pointed by *optval*. * * *bpf_socket* should be one of the following: + * * * **struct bpf_sock_ops** for **BPF_PROG_TYPE_SOCK_OPS**. * * **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT** * and **BPF_CGROUP_INET6_CONNECT**. @@ -1672,12 +1673,12 @@ union bpf_attr { * * The lower two bits of *flags* are used as the return code if * the map lookup fails. This is so that the return value can be - * one of the XDP program return codes up to XDP_TX, as chosen by - * the caller. Any higher bits in the *flags* argument must be + * one of the XDP program return codes up to **XDP_TX**, as chosen + * by the caller. Any higher bits in the *flags* argument must be * unset. * - * See also bpf_redirect(), which only supports redirecting to an - * ifindex, but doesn't require a map to do so. + * See also **bpf_redirect**\ (), which only supports redirecting + * to an ifindex, but doesn't require a map to do so. * Return * **XDP_REDIRECT** on success, or the value of the two lower bits * of the *flags* argument on error. @@ -1785,7 +1786,7 @@ union bpf_attr { * the time running for event since last normalization. The * enabled and running times are accumulated since the perf event * open. To achieve scaling factor between two invocations of an - * eBPF program, users can can use CPU id as the key (which is + * eBPF program, users can use CPU id as the key (which is * typical for perf array usage model) to remember the previous * value and do the calculation inside the eBPF program. * Return @@ -1812,6 +1813,7 @@ union bpf_attr { * *opval* and of length *optlen*. * * *bpf_socket* should be one of the following: + * * * **struct bpf_sock_ops** for **BPF_PROG_TYPE_SOCK_OPS**. * * **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT** * and **BPF_CGROUP_INET6_CONNECT**. @@ -1833,7 +1835,7 @@ union bpf_attr { * The first argument is the context *regs* on which the kprobe * works. * - * This helper works by setting setting the PC (program counter) + * This helper works by setting the PC (program counter) * to an override function which is run in place of the original * probed function. This means the probed function is not run at * all. The replacement function just returns with the required @@ -2300,7 +2302,7 @@ union bpf_attr { * **bpf_rc_keydown**\ () again with the same values, or calling * **bpf_rc_repeat**\ (). * - * Some protocols include a toggle bit, in case the button was + * Some protocols include a toggle bit, in case the button was * released and pressed again between consecutive scancodes. * * The *ctx* should point to the lirc sample as passed into @@ -2646,7 +2648,6 @@ union bpf_attr { * * *th* points to the start of the TCP header, while *th_len* * contains **sizeof**\ (**struct tcphdr**). - * * Return * 0 if *iph* and *th* are a valid SYN cookie ACK, or a negative * error otherwise. @@ -2829,7 +2830,6 @@ union bpf_attr { * * *th* points to the start of the TCP header, while *th_len* * contains the length of the TCP header. - * * Return * On success, lower 32 bits hold the generated SYN cookie in * followed by 16 bits which hold the MSS value for that cookie, @@ -2912,7 +2912,7 @@ union bpf_attr { * // size, after checking its boundaries. * } * - * In comparison, using **bpf_probe_read_user()** helper here + * In comparison, using **bpf_probe_read_user**\ () helper here * instead to read the string would require to estimate the length * at compile time, and would often result in copying more memory * than necessary. @@ -2930,14 +2930,14 @@ union bpf_attr { * int bpf_probe_read_kernel_str(void *dst, u32 size, const void *unsafe_ptr) * Description * Copy a NUL terminated string from an unsafe kernel address *unsafe_ptr* - * to *dst*. Same semantics as with bpf_probe_read_user_str() apply. + * to *dst*. Same semantics as with **bpf_probe_read_user_str**\ () apply. * Return - * On success, the strictly positive length of the string, including + * On success, the strictly positive length of the string, including * the trailing NUL character. On error, a negative value. * * int bpf_tcp_send_ack(void *tp, u32 rcv_nxt) * Description - * Send out a tcp-ack. *tp* is the in-kernel struct tcp_sock. + * Send out a tcp-ack. *tp* is the in-kernel struct **tcp_sock**. * *rcv_nxt* is the ack_seq to be sent out. * Return * 0 on success, or a negative error in case of failure. @@ -2965,19 +2965,19 @@ union bpf_attr { * int bpf_read_branch_records(struct bpf_perf_event_data *ctx, void *buf, u32 size, u64 flags) * Description * For an eBPF program attached to a perf event, retrieve the - * branch records (struct perf_branch_entry) associated to *ctx* - * and store it in the buffer pointed by *buf* up to size + * branch records (**struct perf_branch_entry**) associated to *ctx* + * and store it in the buffer pointed by *buf* up to size * *size* bytes. * Return * On success, number of bytes written to *buf*. On error, a * negative value. * * The *flags* can be set to **BPF_F_GET_BRANCH_RECORDS_SIZE** to - * instead return the number of bytes required to store all the + * instead return the number of bytes required to store all the * branch entries. If this flag is set, *buf* may be NULL. * * **-EINVAL** if arguments invalid or **size** not a multiple - * of sizeof(struct perf_branch_entry). + * of **sizeof**\ (**struct perf_branch_entry**\ ). * * **-ENOENT** if architecture does not support branch records. * @@ -2985,8 +2985,8 @@ union bpf_attr { * Description * Returns 0 on success, values for *pid* and *tgid* as seen from the current * *namespace* will be returned in *nsdata*. - * - * On failure, the returned value is one of the following: + * Return + * 0 on success, or one of the following in case of failure: * * **-EINVAL** if dev and inum supplied don't match dev_t and inode number * with nsfs of current task, or if dev conversion to dev_t lost high bits. @@ -3025,8 +3025,8 @@ union bpf_attr { * a global identifier that can be assumed unique. If *ctx* is * NULL, then the helper returns the cookie for the initial * network namespace. The cookie itself is very similar to that - * of bpf_get_socket_cookie() helper, but for network namespaces - * instead of sockets. + * of **bpf_get_socket_cookie**\ () helper, but for network + * namespaces instead of sockets. * Return * A 8-byte long opaque number. * @@ -3061,57 +3061,66 @@ union bpf_attr { * * The *flags* argument must be zero. * Return - * 0 on success, or a negative errno in case of failure. + * 0 on success, or a negative error in case of failure: * - * * **-EINVAL** Unsupported flags specified. - * * **-ENOENT** Socket is unavailable for assignment. - * * **-ENETUNREACH** Socket is unreachable (wrong netns). - * * **-EOPNOTSUPP** Unsupported operation, for example a - * call from outside of TC ingress. - * * **-ESOCKTNOSUPPORT** Socket type not supported (reuseport). + * **-EINVAL** if specified *flags* are not supported. + * + * **-ENOENT** if the socket is unavailable for assignment. + * + * **-ENETUNREACH** if the socket is unreachable (wrong netns). + * + * **-EOPNOTSUPP** if the operation is not supported, for example + * a call from outside of TC ingress. + * + * **-ESOCKTNOSUPPORT** if the socket type is not supported + * (reuseport). * * u64 bpf_ktime_get_boot_ns(void) * Description * Return the time elapsed since system boot, in nanoseconds. * Does include the time the system was suspended. - * See: clock_gettime(CLOCK_BOOTTIME) + * See: **clock_gettime**\ (**CLOCK_BOOTTIME**) * Return * Current *ktime*. * * int bpf_seq_printf(struct seq_file *m, const char *fmt, u32 fmt_size, const void *data, u32 data_len) * Description - * seq_printf uses seq_file seq_printf() to print out the format string. + * **bpf_seq_printf**\ () uses seq_file **seq_printf**\ () to print + * out the format string. * The *m* represents the seq_file. The *fmt* and *fmt_size* are for * the format string itself. The *data* and *data_len* are format string - * arguments. The *data* are a u64 array and corresponding format string + * arguments. The *data* are a **u64** array and corresponding format string * values are stored in the array. For strings and pointers where pointees * are accessed, only the pointer values are stored in the *data* array. - * The *data_len* is the *data* size in term of bytes. + * The *data_len* is the size of *data* in bytes. * * Formats **%s**, **%p{i,I}{4,6}** requires to read kernel memory. * Reading kernel memory may fail due to either invalid address or * valid address but requiring a major memory fault. If reading kernel memory * fails, the string for **%s** will be an empty string, and the ip * address for **%p{i,I}{4,6}** will be 0. Not returning error to - * bpf program is consistent with what bpf_trace_printk() does for now. + * bpf program is consistent with what **bpf_trace_printk**\ () does for now. * Return - * 0 on success, or a negative errno in case of failure. + * 0 on success, or a negative error in case of failure: + * + * **-EBUSY** if per-CPU memory copy buffer is busy, can try again + * by returning 1 from bpf program. + * + * **-EINVAL** if arguments are invalid, or if *fmt* is invalid/unsupported. + * + * **-E2BIG** if *fmt* contains too many format specifiers. * - * * **-EBUSY** Percpu memory copy buffer is busy, can try again - * by returning 1 from bpf program. - * * **-EINVAL** Invalid arguments, or invalid/unsupported formats. - * * **-E2BIG** Too many format specifiers. - * * **-EOVERFLOW** Overflow happens, the same object will be tried again. + * **-EOVERFLOW** if an overflow happened: The same object will be tried again. * * int bpf_seq_write(struct seq_file *m, const void *data, u32 len) * Description - * seq_write uses seq_file seq_write() to write the data. + * **bpf_seq_write**\ () uses seq_file **seq_write**\ () to write the data. * The *m* represents the seq_file. The *data* and *len* represent the - * data to write in bytes. + * data to write in bytes. * Return - * 0 on success, or a negative errno in case of failure. + * 0 on success, or a negative error in case of failure: * - * * **-EOVERFLOW** Overflow happens, the same object will be tried again. + * **-EOVERFLOW** if an overflow happened: The same object will be tried again. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ -- cgit v1.2.3 From 21aef70eade22a656297c28d5da93301915d2ac2 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 13 May 2020 11:02:16 -0700 Subject: bpf: Change btf_iter func proto prefix to "bpf_iter_" This is to be consistent with tracing and lsm programs which have prefix "bpf_trace_" and "bpf_lsm_" respectively. Suggested-by: Alexei Starovoitov Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200513180216.2949387-1-yhs@fb.com --- include/linux/bpf.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index cf4b6e44f2bc..ab94dfd8826f 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1131,10 +1131,10 @@ struct bpf_link *bpf_link_get_from_fd(u32 ufd); int bpf_obj_pin_user(u32 ufd, const char __user *pathname); int bpf_obj_get_user(const char __user *pathname, int flags); -#define BPF_ITER_FUNC_PREFIX "__bpf_iter__" +#define BPF_ITER_FUNC_PREFIX "bpf_iter_" #define DEFINE_BPF_ITER_FUNC(target, args...) \ - extern int __bpf_iter__ ## target(args); \ - int __init __bpf_iter__ ## target(args) { return 0; } + extern int bpf_iter_ ## target(args); \ + int __init bpf_iter_ ## target(args) { return 0; } typedef int (*bpf_iter_init_seq_priv_t)(void *private_data); typedef void (*bpf_iter_fini_seq_priv_t)(void *private_data); -- cgit v1.2.3 From 15172a46fa2796c1a1358a36babd31274716ed41 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 13 May 2020 11:02:19 -0700 Subject: bpf: net: Refactor bpf_iter target registration Currently bpf_iter_reg_target takes parameters from target and allocates memory to save them. This is really not necessary, esp. in the future we may grow information passed from targets to bpf_iter manager. The patch refactors the code so target reg_info becomes static and bpf_iter manager can just take a reference to it. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200513180219.2949605-1-yhs@fb.com --- include/linux/bpf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index ab94dfd8826f..6fa773e2d1bf 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1153,7 +1153,7 @@ struct bpf_iter_meta { u64 seq_num; }; -int bpf_iter_reg_target(struct bpf_iter_reg *reg_info); +int bpf_iter_reg_target(const struct bpf_iter_reg *reg_info); void bpf_iter_unreg_target(const char *target); bool bpf_iter_prog_supported(struct bpf_prog *prog); int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog); -- cgit v1.2.3 From ab2ee4fcb9d61fd57db70db694adbcf54662bd80 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 13 May 2020 11:02:20 -0700 Subject: bpf: Change func bpf_iter_unreg_target() signature Change func bpf_iter_unreg_target() parameter from target name to target reg_info, similar to bpf_iter_reg_target(). Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200513180220.2949737-1-yhs@fb.com --- include/linux/bpf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 6fa773e2d1bf..534174eca86b 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1154,7 +1154,7 @@ struct bpf_iter_meta { }; int bpf_iter_reg_target(const struct bpf_iter_reg *reg_info); -void bpf_iter_unreg_target(const char *target); +void bpf_iter_unreg_target(const struct bpf_iter_reg *reg_info); bool bpf_iter_prog_supported(struct bpf_prog *prog); int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog); int bpf_iter_new_fd(struct bpf_link *link); -- cgit v1.2.3 From 3c32cc1bceba8a1755dc35cd97516f6c67856844 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 13 May 2020 11:02:21 -0700 Subject: bpf: Enable bpf_iter targets registering ctx argument types Commit b121b341e598 ("bpf: Add PTR_TO_BTF_ID_OR_NULL support") adds a field btf_id_or_null_non0_off to bpf_prog->aux structure to indicate that the first ctx argument is PTR_TO_BTF_ID reg_type and all others are PTR_TO_BTF_ID_OR_NULL. This approach does not really scale if we have other different reg types in the future, e.g., a pointer to a buffer. This patch enables bpf_iter targets registering ctx argument reg types which may be different from the default one. For example, for pointers to structures, the default reg_type is PTR_TO_BTF_ID for tracing program. The target can register a particular pointer type as PTR_TO_BTF_ID_OR_NULL which can be used by the verifier to enforce accesses. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200513180221.2949882-1-yhs@fb.com --- include/linux/bpf.h | 12 +++++++++++- include/net/ip6_fib.h | 7 +++++++ 2 files changed, 18 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 534174eca86b..c45d198ac38c 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -643,6 +643,12 @@ struct bpf_jit_poke_descriptor { u16 reason; }; +/* reg_type info for ctx arguments */ +struct bpf_ctx_arg_aux { + u32 offset; + enum bpf_reg_type reg_type; +}; + struct bpf_prog_aux { atomic64_t refcnt; u32 used_map_cnt; @@ -654,12 +660,13 @@ struct bpf_prog_aux { u32 func_cnt; /* used by non-func prog as the number of func progs */ u32 func_idx; /* 0 for non-func prog, the index in func array for func prog */ u32 attach_btf_id; /* in-kernel BTF type id to attach to */ + u32 ctx_arg_info_size; + const struct bpf_ctx_arg_aux *ctx_arg_info; struct bpf_prog *linked_prog; bool verifier_zext; /* Zero extensions has been inserted by verifier. */ bool offload_requested; bool attach_btf_trace; /* true if attaching to BTF-enabled raw tp */ bool func_proto_unreliable; - bool btf_id_or_null_non0_off; enum bpf_tramp_prog_type trampoline_prog_type; struct bpf_trampoline *trampoline; struct hlist_node tramp_hlist; @@ -1139,12 +1146,15 @@ int bpf_obj_get_user(const char __user *pathname, int flags); typedef int (*bpf_iter_init_seq_priv_t)(void *private_data); typedef void (*bpf_iter_fini_seq_priv_t)(void *private_data); +#define BPF_ITER_CTX_ARG_MAX 2 struct bpf_iter_reg { const char *target; const struct seq_operations *seq_ops; bpf_iter_init_seq_priv_t init_seq_private; bpf_iter_fini_seq_priv_t fini_seq_private; u32 seq_priv_size; + u32 ctx_arg_info_size; + struct bpf_ctx_arg_aux ctx_arg_info[BPF_ITER_CTX_ARG_MAX]; }; struct bpf_iter_meta { diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 80262d2980f5..870b646c5797 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -540,6 +540,13 @@ static inline bool fib6_metric_locked(struct fib6_info *f6i, int metric) return !!(f6i->fib6_metrics->metrics[RTAX_LOCK - 1] & (1 << metric)); } +#if IS_BUILTIN(CONFIG_IPV6) && defined(CONFIG_BPF_SYSCALL) +struct bpf_iter__ipv6_route { + __bpf_md_ptr(struct bpf_iter_meta *, meta); + __bpf_md_ptr(struct fib6_info *, rt); +}; +#endif + #ifdef CONFIG_IPV6_MULTIPLE_TABLES static inline bool fib6_has_custom_rules(const struct net *net) { -- cgit v1.2.3 From 7aebfa1b3885b5aa29fcb4a596d0485ac463bbe8 Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Wed, 13 May 2020 18:50:27 -0700 Subject: bpf: Support narrow loads from bpf_sock_addr.user_port bpf_sock_addr.user_port supports only 4-byte load and it leads to ugly code in BPF programs, like: volatile __u32 user_port = ctx->user_port; __u16 port = bpf_ntohs(user_port); Since otherwise clang may optimize the load to be 2-byte and it's rejected by verifier. Add support for 1- and 2-byte loads same way as it's supported for other fields in bpf_sock_addr like user_ip4, msg_src_ip4, etc. Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/c1e983f4c17573032601d0b2b1f9d1274f24bc16.1589420814.git.rdna@fb.com --- include/uapi/linux/bpf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index bfb31c1be219..85cfdffde182 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3728,7 +3728,7 @@ struct bpf_sock_addr { __u32 user_ip6[4]; /* Allows 1,2,4,8-byte read and 4,8-byte write. * Stored in network byte order. */ - __u32 user_port; /* Allows 4-byte read and write. + __u32 user_port; /* Allows 1,2,4-byte read and 4-byte write. * Stored in network byte order */ __u32 family; /* Allows 4-byte read, but no write */ -- cgit v1.2.3 From f307fa2cb4c935f7f1ff0aeb880c7b44fb9a642b Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Thu, 14 May 2020 13:03:47 -0700 Subject: bpf: Introduce bpf_sk_{, ancestor_}cgroup_id helpers With having ability to lookup sockets in cgroup skb programs it becomes useful to access cgroup id of retrieved sockets so that policies can be implemented based on origin cgroup of such socket. For example, a container running in a cgroup can have cgroup skb ingress program that can lookup peer socket that is sending packets to a process inside the container and decide whether those packets should be allowed or denied based on cgroup id of the peer. More specifically such ingress program can implement intra-host policy "allow incoming packets only from this same container and not from any other container on same host" w/o relying on source IP addresses since quite often it can be the case that containers share same IP address on the host. Introduce two new helpers for this use-case: bpf_sk_cgroup_id() and bpf_sk_ancestor_cgroup_id(). These helpers are similar to existing bpf_skb_{,ancestor_}cgroup_id helpers with the only difference that sk is used to get cgroup id instead of skb, and share code with them. See documentation in UAPI for more details. Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/f5884981249ce911f63e9b57ecd5d7d19154ff39.1589486450.git.rdna@fb.com --- include/uapi/linux/bpf.h | 36 +++++++++++++++++++++++++++++++++++- 1 file changed, 35 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 85cfdffde182..146c742f1d49 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3121,6 +3121,38 @@ union bpf_attr { * 0 on success, or a negative error in case of failure: * * **-EOVERFLOW** if an overflow happened: The same object will be tried again. + * + * u64 bpf_sk_cgroup_id(struct bpf_sock *sk) + * Description + * Return the cgroup v2 id of the socket *sk*. + * + * *sk* must be a non-**NULL** pointer to a full socket, e.g. one + * returned from **bpf_sk_lookup_xxx**\ (), + * **bpf_sk_fullsock**\ (), etc. The format of returned id is + * same as in **bpf_skb_cgroup_id**\ (). + * + * This helper is available only if the kernel was compiled with + * the **CONFIG_SOCK_CGROUP_DATA** configuration option. + * Return + * The id is returned or 0 in case the id could not be retrieved. + * + * u64 bpf_sk_ancestor_cgroup_id(struct bpf_sock *sk, int ancestor_level) + * Description + * Return id of cgroup v2 that is ancestor of cgroup associated + * with the *sk* at the *ancestor_level*. The root cgroup is at + * *ancestor_level* zero and each step down the hierarchy + * increments the level. If *ancestor_level* == level of cgroup + * associated with *sk*, then return value will be same as that + * of **bpf_sk_cgroup_id**\ (). + * + * The helper is useful to implement policies based on cgroups + * that are upper in hierarchy than immediate cgroup associated + * with *sk*. + * + * The format of returned id and helper limitations are same as in + * **bpf_sk_cgroup_id**\ (). + * Return + * The id is returned or 0 in case the id could not be retrieved. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3250,7 +3282,9 @@ union bpf_attr { FN(sk_assign), \ FN(ktime_get_boot_ns), \ FN(seq_printf), \ - FN(seq_write), + FN(seq_write), \ + FN(sk_cgroup_id), \ + FN(sk_ancestor_cgroup_id), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call -- cgit v1.2.3