From 4d8926a0407cff0c864b759b59104f4fb6f8efab Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 13 Mar 2024 17:01:27 -0700 Subject: bpf: preserve sleepable bit in subprog info Copy over main program's sleepable bit into subprog's info. This might be important for, e.g., freplace cases. Suggested-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Acked-by: Stanislav Fomichev Message-ID: <20240314000127.3881569-1-andrii@kernel.org> Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel/bpf') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 63749ad5ac6b..7b208e5d38f6 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -19158,6 +19158,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) if (bpf_prog_calc_tag(func[i])) goto out_free; func[i]->is_func = 1; + func[i]->sleepable = prog->sleepable; func[i]->aux->func_idx = i; /* Below members will be freed only at prog->aux */ func[i]->aux->btf = prog->aux->btf; -- cgit v1.2.3 From 7d2cc63eca0c993c99d18893214abf8f85d566d8 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 8 Mar 2024 06:38:07 +0100 Subject: bpf: Take return from set_memory_ro() into account with bpf_prog_lock_ro() set_memory_ro() can fail, leaving memory unprotected. Check its return and take it into account as an error. Link: https://github.com/KSPP/linux/issues/7 Signed-off-by: Christophe Leroy Cc: linux-hardening@vger.kernel.org Reviewed-by: Kees Cook Message-ID: <286def78955e04382b227cb3e4b6ba272a7442e3.1709850515.git.christophe.leroy@csgroup.eu> Signed-off-by: Alexei Starovoitov --- kernel/bpf/core.c | 4 +++- kernel/bpf/verifier.c | 8 ++++++-- 2 files changed, 9 insertions(+), 3 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 696bc55de8e8..63f100def31b 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2403,7 +2403,9 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) } finalize: - bpf_prog_lock_ro(fp); + *err = bpf_prog_lock_ro(fp); + if (*err) + return fp; /* The tail call compatibility check can only be done at * this late stage as we need to determine, if we deal diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 7b208e5d38f6..de7813947981 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -19263,10 +19263,14 @@ static int jit_subprogs(struct bpf_verifier_env *env) * bpf_prog_load will add the kallsyms for the main program. */ for (i = 1; i < env->subprog_cnt; i++) { - bpf_prog_lock_ro(func[i]); - bpf_prog_kallsyms_add(func[i]); + err = bpf_prog_lock_ro(func[i]); + if (err) + goto out_free; } + for (i = 1; i < env->subprog_cnt; i++) + bpf_prog_kallsyms_add(func[i]); + /* Last step: make now unused interpreter insns from main * prog consistent for later dump requests, so they can * later look the same as if they were interpreted only. -- cgit v1.2.3 From 7f3edd0c72c3f7214f8f28495f2e6466348eb128 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Fri, 15 Mar 2024 12:21:12 -0700 Subject: bpf: Remove unnecessary err < 0 check in bpf_struct_ops_map_update_elem There is a "if (err)" check earlier, so the "if (err < 0)" check that this patch removing is unnecessary. It was my overlook when making adjustments to the bpf_struct_ops_prepare_trampoline() such that the caller does not have to worry about the new page when the function returns error. Fixes: 187e2af05abe ("bpf: struct_ops supports more than one page for trampolines.") Signed-off-by: Martin KaFai Lau Signed-off-by: Andrii Nakryiko Acked-by: Stanislav Fomichev Link: https://lore.kernel.org/bpf/20240315192112.2825039-1-martin.lau@linux.dev --- kernel/bpf/bpf_struct_ops.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index 43356faaa057..3fcd35314ce5 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -728,8 +728,6 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, cur_image = image; trampoline_start = 0; } - if (err < 0) - goto reset_unlock; *(void **)(kdata + moff) = image + trampoline_start + cfi_get_offset(); -- cgit v1.2.3 From e3362acd796789dc0562eb1a3937007b0beb0c5b Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Sat, 16 Mar 2024 08:35:40 +0100 Subject: bpf: Remove arch_unprotect_bpf_trampoline() Last user of arch_unprotect_bpf_trampoline() was removed by commit 187e2af05abe ("bpf: struct_ops supports more than one page for trampolines.") Remove arch_unprotect_bpf_trampoline() Reported-by: Daniel Borkmann Fixes: 187e2af05abe ("bpf: struct_ops supports more than one page for trampolines.") Signed-off-by: Christophe Leroy Link: https://lore.kernel.org/r/42c635bb54d3af91db0f9b85d724c7c290069f67.1710574353.git.christophe.leroy@csgroup.eu Signed-off-by: Martin KaFai Lau --- kernel/bpf/trampoline.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index db7599c59c78..04fd1abd3661 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -1078,13 +1078,6 @@ void __weak arch_protect_bpf_trampoline(void *image, unsigned int size) set_memory_rox((long)image, 1); } -void __weak arch_unprotect_bpf_trampoline(void *image, unsigned int size) -{ - WARN_ON_ONCE(size > PAGE_SIZE); - set_memory_nx((long)image, 1); - set_memory_rw((long)image, 1); -} - int __weak arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags, struct bpf_tramp_links *tlinks, void *func_addr) { -- cgit v1.2.3 From c733239f8f530872a1f80d8c45dcafbaff368737 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Sat, 16 Mar 2024 08:35:41 +0100 Subject: bpf: Check return from set_memory_rox() arch_protect_bpf_trampoline() and alloc_new_pack() call set_memory_rox() which can fail, leading to unprotected memory. Take into account return from set_memory_rox() function and add __must_check flag to arch_protect_bpf_trampoline(). Signed-off-by: Christophe Leroy Reviewed-by: Kees Cook Link: https://lore.kernel.org/r/fe1c163c83767fde5cab31d209a4a6be3ddb3a73.1710574353.git.christophe.leroy@csgroup.eu Signed-off-by: Martin KaFai Lau --- kernel/bpf/bpf_struct_ops.c | 8 ++++++-- kernel/bpf/core.c | 28 +++++++++++++++++++++------- kernel/bpf/trampoline.c | 8 +++++--- 3 files changed, 32 insertions(+), 12 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index 3fcd35314ce5..86c7884abaf8 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -740,8 +740,12 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, if (err) goto reset_unlock; } - for (i = 0; i < st_map->image_pages_cnt; i++) - arch_protect_bpf_trampoline(st_map->image_pages[i], PAGE_SIZE); + for (i = 0; i < st_map->image_pages_cnt; i++) { + err = arch_protect_bpf_trampoline(st_map->image_pages[i], + PAGE_SIZE); + if (err) + goto reset_unlock; + } if (st_map->map.map_flags & BPF_F_LINK) { err = 0; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 63f100def31b..5aacb1d3c4cc 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -908,23 +908,30 @@ static LIST_HEAD(pack_list); static struct bpf_prog_pack *alloc_new_pack(bpf_jit_fill_hole_t bpf_fill_ill_insns) { struct bpf_prog_pack *pack; + int err; pack = kzalloc(struct_size(pack, bitmap, BITS_TO_LONGS(BPF_PROG_CHUNK_COUNT)), GFP_KERNEL); if (!pack) return NULL; pack->ptr = bpf_jit_alloc_exec(BPF_PROG_PACK_SIZE); - if (!pack->ptr) { - kfree(pack); - return NULL; - } + if (!pack->ptr) + goto out; bpf_fill_ill_insns(pack->ptr, BPF_PROG_PACK_SIZE); bitmap_zero(pack->bitmap, BPF_PROG_PACK_SIZE / BPF_PROG_CHUNK_SIZE); - list_add_tail(&pack->list, &pack_list); set_vm_flush_reset_perms(pack->ptr); - set_memory_rox((unsigned long)pack->ptr, BPF_PROG_PACK_SIZE / PAGE_SIZE); + err = set_memory_rox((unsigned long)pack->ptr, + BPF_PROG_PACK_SIZE / PAGE_SIZE); + if (err) + goto out; + list_add_tail(&pack->list, &pack_list); return pack; + +out: + bpf_jit_free_exec(pack->ptr); + kfree(pack); + return NULL; } void *bpf_prog_pack_alloc(u32 size, bpf_jit_fill_hole_t bpf_fill_ill_insns) @@ -939,9 +946,16 @@ void *bpf_prog_pack_alloc(u32 size, bpf_jit_fill_hole_t bpf_fill_ill_insns) size = round_up(size, PAGE_SIZE); ptr = bpf_jit_alloc_exec(size); if (ptr) { + int err; + bpf_fill_ill_insns(ptr, size); set_vm_flush_reset_perms(ptr); - set_memory_rox((unsigned long)ptr, size / PAGE_SIZE); + err = set_memory_rox((unsigned long)ptr, + size / PAGE_SIZE); + if (err) { + bpf_jit_free_exec(ptr); + ptr = NULL; + } } goto out; } diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 04fd1abd3661..cc50607f8d8c 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -456,7 +456,9 @@ again: if (err < 0) goto out_free; - arch_protect_bpf_trampoline(im->image, im->size); + err = arch_protect_bpf_trampoline(im->image, im->size); + if (err) + goto out_free; WARN_ON(tr->cur_image && total == 0); if (tr->cur_image) @@ -1072,10 +1074,10 @@ void __weak arch_free_bpf_trampoline(void *image, unsigned int size) bpf_jit_free_exec(image); } -void __weak arch_protect_bpf_trampoline(void *image, unsigned int size) +int __weak arch_protect_bpf_trampoline(void *image, unsigned int size) { WARN_ON_ONCE(size > PAGE_SIZE); - set_memory_rox((long)image, 1); + return set_memory_rox((long)image, 1); } int __weak arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags, -- cgit v1.2.3 From 1a4a0cb7985f921548f1a7ac17686afbefe67f87 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Mon, 18 Mar 2024 14:25:26 +0100 Subject: bpf/lpm_trie: Inline longest_prefix_match for fastpath The BPF map type LPM (Longest Prefix Match) is used heavily in production by multiple products that have BPF components. Perf data shows trie_lookup_elem() and longest_prefix_match() being part of kernels perf top. For every level in the LPM tree trie_lookup_elem() calls out to longest_prefix_match(). The compiler is free to inline this call, but chooses not to inline, because other slowpath callers (that can be invoked via syscall) exists like trie_update_elem(), trie_delete_elem() or trie_get_next_key(). bcc/tools/funccount -Ti 1 'trie_lookup_elem|longest_prefix_match.isra.0' FUNC COUNT trie_lookup_elem 664945 longest_prefix_match.isra.0 8101507 Observation on a single random machine shows a factor 12 between the two functions. Given an average of 12 levels in the trie being searched. This patch force inlining longest_prefix_match(), but only for the lookup fastpath to balance object instruction size. In production with AMD CPUs, measuring the function latency of 'trie_lookup_elem' (bcc/tools/funclatency) we are seeing an improvement function latency reduction 7-8% with this patch applied (to production kernels 6.6 and 6.1). Analyzing perf data, we can explain this rather large improvement due to reducing the overhead for AMD side-channel mitigation SRSO (Speculative Return Stack Overflow). Fixes: fb3bd914b3ec ("x86/srso: Add a Speculative RAS Overflow mitigation") Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/171076828575.2141737.18370644069389889027.stgit@firesoul --- kernel/bpf/lpm_trie.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 050fe1ebf0f7..939620b91c0e 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -155,16 +155,17 @@ static inline int extract_bit(const u8 *data, size_t index) } /** - * longest_prefix_match() - determine the longest prefix + * __longest_prefix_match() - determine the longest prefix * @trie: The trie to get internal sizes from * @node: The node to operate on * @key: The key to compare to @node * * Determine the longest prefix of @node that matches the bits in @key. */ -static size_t longest_prefix_match(const struct lpm_trie *trie, - const struct lpm_trie_node *node, - const struct bpf_lpm_trie_key_u8 *key) +static __always_inline +size_t __longest_prefix_match(const struct lpm_trie *trie, + const struct lpm_trie_node *node, + const struct bpf_lpm_trie_key_u8 *key) { u32 limit = min(node->prefixlen, key->prefixlen); u32 prefixlen = 0, i = 0; @@ -224,6 +225,13 @@ static size_t longest_prefix_match(const struct lpm_trie *trie, return prefixlen; } +static size_t longest_prefix_match(const struct lpm_trie *trie, + const struct lpm_trie_node *node, + const struct bpf_lpm_trie_key_u8 *key) +{ + return __longest_prefix_match(trie, node, key); +} + /* Called from syscall or from eBPF program */ static void *trie_lookup_elem(struct bpf_map *map, void *_key) { @@ -245,7 +253,7 @@ static void *trie_lookup_elem(struct bpf_map *map, void *_key) * If it's the maximum possible prefix for this trie, we have * an exact match and can return it directly. */ - matchlen = longest_prefix_match(trie, node, key); + matchlen = __longest_prefix_match(trie, node, key); if (matchlen == trie->max_prefixlen) { found = node; break; -- cgit v1.2.3 From eb166e522c77699fc19bfa705652327a1e51a117 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Fri, 15 Mar 2024 11:48:54 -0700 Subject: bpf: Allow helper bpf_get_[ns_]current_pid_tgid() for all prog types Currently bpf_get_current_pid_tgid() is allowed in tracing, cgroup and sk_msg progs while bpf_get_ns_current_pid_tgid() is only allowed in tracing progs. We have an internal use case where for an application running in a container (with pid namespace), user wants to get the pid associated with the pid namespace in a cgroup bpf program. Currently, cgroup bpf progs already allow bpf_get_current_pid_tgid(). Let us allow bpf_get_ns_current_pid_tgid() as well. With auditing the code, bpf_get_current_pid_tgid() is also used by sk_msg prog. But there are no side effect to expose these two helpers to all prog types since they do not reveal any kernel specific data. The detailed discussion is in [1]. So with this patch, both bpf_get_current_pid_tgid() and bpf_get_ns_current_pid_tgid() are put in bpf_base_func_proto(), making them available to all program types. [1] https://lore.kernel.org/bpf/20240307232659.1115872-1-yonghong.song@linux.dev/ Signed-off-by: Yonghong Song Signed-off-by: Andrii Nakryiko Acked-by: Jiri Olsa Link: https://lore.kernel.org/bpf/20240315184854.2975190-1-yonghong.song@linux.dev --- kernel/bpf/cgroup.c | 2 -- kernel/bpf/helpers.c | 4 ++++ 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 82243cb6c54d..8ba73042a239 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -2575,8 +2575,6 @@ cgroup_current_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) switch (func_id) { case BPF_FUNC_get_current_uid_gid: return &bpf_get_current_uid_gid_proto; - case BPF_FUNC_get_current_pid_tgid: - return &bpf_get_current_pid_tgid_proto; case BPF_FUNC_get_current_comm: return &bpf_get_current_comm_proto; #ifdef CONFIG_CGROUP_NET_CLASSID diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index a89587859571..9234174ccb21 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1730,6 +1730,10 @@ bpf_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_strtol_proto; case BPF_FUNC_strtoul: return &bpf_strtoul_proto; + case BPF_FUNC_get_current_pid_tgid: + return &bpf_get_current_pid_tgid_proto; + case BPF_FUNC_get_ns_current_pid_tgid: + return &bpf_get_ns_current_pid_tgid_proto; default: break; } -- cgit v1.2.3 From d4dfc5700e867b22ab94f960f9a9972696a637d5 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 19 Mar 2024 16:38:49 -0700 Subject: bpf: pass whole link instead of prog when triggering raw tracepoint Instead of passing prog as an argument to bpf_trace_runX() helpers, that are called from tracepoint triggering calls, store BPF link itself (struct bpf_raw_tp_link for raw tracepoints). This will allow to pass extra information like BPF cookie into raw tracepoint registration. Instead of replacing `struct bpf_prog *prog = __data;` with corresponding `struct bpf_raw_tp_link *link = __data;` assignment in `__bpf_trace_##call` I just passed `__data` through into underlying bpf_trace_runX() call. This works well because we implicitly cast `void *`, and it also avoids naming clashes with arguments coming from tracepoint's "proto" list. We could have run into the same problem with "prog", we just happened to not have a tracepoint that has "prog" input argument. We are less lucky with "link", as there are tracepoints using "link" argument name already. So instead of trying to avoid naming conflicts, let's just remove intermediate local variable. It doesn't hurt readibility, it's either way a bit of a maze of calls and macros, that requires careful reading. Acked-by: Stanislav Fomichev Signed-off-by: Andrii Nakryiko Message-ID: <20240319233852.1977493-3-andrii@kernel.org> Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index ae2ff73bde7e..1cb4c3809af4 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3469,17 +3469,12 @@ out_put_prog: return err; } -struct bpf_raw_tp_link { - struct bpf_link link; - struct bpf_raw_event_map *btp; -}; - static void bpf_raw_tp_link_release(struct bpf_link *link) { struct bpf_raw_tp_link *raw_tp = container_of(link, struct bpf_raw_tp_link, link); - bpf_probe_unregister(raw_tp->btp, raw_tp->link.prog); + bpf_probe_unregister(raw_tp->btp, raw_tp); bpf_put_raw_tracepoint(raw_tp->btp); } @@ -3833,7 +3828,7 @@ static int bpf_raw_tp_link_attach(struct bpf_prog *prog, goto out_put_btp; } - err = bpf_probe_register(link->btp, prog); + err = bpf_probe_register(link->btp, link); if (err) { bpf_link_cleanup(&link_primer); goto out_put_btp; -- cgit v1.2.3 From 68ca5d4eebb8c4de246ee5f634eee26bc689562d Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 19 Mar 2024 16:38:50 -0700 Subject: bpf: support BPF cookie in raw tracepoint (raw_tp, tp_btf) programs Wire up BPF cookie for raw tracepoint programs (both BTF and non-BTF aware variants). This brings them up to part w.r.t. BPF cookie usage with classic tracepoint and fentry/fexit programs. Acked-by: Stanislav Fomichev Acked-by: Eduard Zingerman Signed-off-by: Andrii Nakryiko Message-ID: <20240319233852.1977493-4-andrii@kernel.org> Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 1cb4c3809af4..e44c276e8617 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3774,7 +3774,7 @@ static int bpf_perf_link_attach(const union bpf_attr *attr, struct bpf_prog *pro #endif /* CONFIG_PERF_EVENTS */ static int bpf_raw_tp_link_attach(struct bpf_prog *prog, - const char __user *user_tp_name) + const char __user *user_tp_name, u64 cookie) { struct bpf_link_primer link_primer; struct bpf_raw_tp_link *link; @@ -3821,6 +3821,7 @@ static int bpf_raw_tp_link_attach(struct bpf_prog *prog, bpf_link_init(&link->link, BPF_LINK_TYPE_RAW_TRACEPOINT, &bpf_raw_tp_link_lops, prog); link->btp = btp; + link->cookie = cookie; err = bpf_link_prime(&link->link, &link_primer); if (err) { @@ -3841,11 +3842,13 @@ out_put_btp: return err; } -#define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.prog_fd +#define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.cookie static int bpf_raw_tracepoint_open(const union bpf_attr *attr) { struct bpf_prog *prog; + void __user *tp_name; + __u64 cookie; int fd; if (CHECK_ATTR(BPF_RAW_TRACEPOINT_OPEN)) @@ -3855,7 +3858,9 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr) if (IS_ERR(prog)) return PTR_ERR(prog); - fd = bpf_raw_tp_link_attach(prog, u64_to_user_ptr(attr->raw_tracepoint.name)); + tp_name = u64_to_user_ptr(attr->raw_tracepoint.name); + cookie = attr->raw_tracepoint.cookie; + fd = bpf_raw_tp_link_attach(prog, tp_name, cookie); if (fd < 0) bpf_prog_put(prog); return fd; @@ -5193,7 +5198,7 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) goto out; } if (prog->expected_attach_type == BPF_TRACE_RAW_TP) - ret = bpf_raw_tp_link_attach(prog, NULL); + ret = bpf_raw_tp_link_attach(prog, NULL, attr->link_create.tracing.cookie); else if (prog->expected_attach_type == BPF_TRACE_ITER) ret = bpf_iter_link_attach(attr, uattr, prog); else if (prog->expected_attach_type == BPF_LSM_CGROUP) -- cgit v1.2.3 From 4c2a26fc80bcb851dc630590f2eec157991eccbf Mon Sep 17 00:00:00 2001 From: Harishankar Vishwanathan Date: Wed, 20 Mar 2024 20:29:54 -0400 Subject: bpf-next: Avoid goto in regs_refine_cond_op() In case of GE/GT/SGE/JST instructions, regs_refine_cond_op() reuses the logic that does analysis of LE/LT/SLE/SLT instructions. This commit avoids the use of a goto to perform the reuse. Signed-off-by: Harishankar Vishwanathan Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240321002955.808604-1-harishankar.vishwanathan@gmail.com --- kernel/bpf/verifier.c | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index de7813947981..ca6cacf7b42f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -14544,7 +14544,19 @@ static void regs_refine_cond_op(struct bpf_reg_state *reg1, struct bpf_reg_state struct tnum t; u64 val; -again: + /* In case of GE/GT/SGE/JST, reuse LE/LT/SLE/SLT logic from below */ + switch (opcode) { + case BPF_JGE: + case BPF_JGT: + case BPF_JSGE: + case BPF_JSGT: + opcode = flip_opcode(opcode); + swap(reg1, reg2); + break; + default: + break; + } + switch (opcode) { case BPF_JEQ: if (is_jmp32) { @@ -14687,14 +14699,6 @@ again: reg2->smin_value = max(reg1->smin_value + 1, reg2->smin_value); } break; - case BPF_JGE: - case BPF_JGT: - case BPF_JSGE: - case BPF_JSGT: - /* just reuse LE/LT logic above */ - opcode = flip_opcode(opcode); - swap(reg1, reg2); - goto again; default: return; } -- cgit v1.2.3 From 786bf0e7e2ec90b349a7bab6e97947982ab31f2c Mon Sep 17 00:00:00 2001 From: Mykyta Yatsenko Date: Mon, 25 Mar 2024 15:22:10 +0000 Subject: bpf: improve error message for unsupported helper BPF verifier emits "unknown func" message when given BPF program type does not support BPF helper. This message may be confusing for users, as important context that helper is unknown only to current program type is not provided. This patch changes message to "program of this type cannot use helper " and aligns dependent code in libbpf and tests. Any suggestions on improving/changing this message are welcome. Signed-off-by: Mykyta Yatsenko Acked-by: Andrii Nakryiko Acked-by: Quentin Monnet Link: https://lore.kernel.org/r/20240325152210.377548-1-yatsenko@meta.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 17f26ba1a9e0..edb650667f44 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -10201,8 +10201,8 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn if (env->ops->get_func_proto) fn = env->ops->get_func_proto(func_id, env->prog); if (!fn) { - verbose(env, "unknown func %s#%d\n", func_id_name(func_id), - func_id); + verbose(env, "program of this type cannot use helper %s#%d\n", + func_id_name(func_id), func_id); return -EINVAL; } -- cgit v1.2.3 From 55fc888ded83ed542f3de3e51bae03936a998349 Mon Sep 17 00:00:00 2001 From: Haiyue Wang Date: Wed, 27 Mar 2024 14:53:29 +0800 Subject: bpf,arena: Use helper sizeof_field in struct accessors Use the well defined helper sizeof_field() to calculate the size of a struct member, instead of doing custom calculations. Signed-off-by: Haiyue Wang Link: https://lore.kernel.org/r/20240327065334.8140-1-haiyue.wang@intel.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/arena.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c index 343c3456c8dd..6c81630c5293 100644 --- a/kernel/bpf/arena.c +++ b/kernel/bpf/arena.c @@ -37,7 +37,7 @@ */ /* number of bytes addressable by LDX/STX insn with 16-bit 'off' field */ -#define GUARD_SZ (1ull << sizeof(((struct bpf_insn *)0)->off) * 8) +#define GUARD_SZ (1ull << sizeof_field(struct bpf_insn, off) * 8) #define KERN_VM_SZ (SZ_4G + GUARD_SZ) struct bpf_arena { -- cgit v1.2.3 From 3124591f686115aca25d772c2ccb7b1e202c3197 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 26 Mar 2024 09:21:50 -0700 Subject: bpf: add bpf_modify_return_test_tp() kfunc triggering tracepoint Add a simple bpf_modify_return_test_tp() kfunc, available to all program types, that is useful for various testing and benchmarking scenarios, as it allows to trigger most tracing BPF program types from BPF side, allowing to do complex testing and benchmarking scenarios. It is also attachable to for fmod_ret programs, making it a good and simple way to trigger fmod_ret program under test/benchmark. Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20240326162151.3981687-6-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel/bpf') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index f860adab3eb9..d9e7aca8ae9e 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -2625,6 +2625,7 @@ BTF_ID_FLAGS(func, bpf_dynptr_is_null) BTF_ID_FLAGS(func, bpf_dynptr_is_rdonly) BTF_ID_FLAGS(func, bpf_dynptr_size) BTF_ID_FLAGS(func, bpf_dynptr_clone) +BTF_ID_FLAGS(func, bpf_modify_return_test_tp) BTF_KFUNCS_END(common_btf_ids) static const struct btf_kfunc_id_set common_kfunc_set = { -- cgit v1.2.3 From ee3bad033d01066348913f3220ea81987721ed53 Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Wed, 27 Mar 2024 11:20:22 +0800 Subject: bpf: Mitigate latency spikes associated with freeing non-preallocated htab Following the recent upgrade of one of our BPF programs, we encountered significant latency spikes affecting other applications running on the same host. After thorough investigation, we identified that these spikes were primarily caused by the prolonged duration required to free a non-preallocated htab with approximately 2 million keys. Notably, our kernel configuration lacks the presence of CONFIG_PREEMPT. In scenarios where kernel execution extends excessively, other threads might be starved of CPU time, resulting in latency issues across the system. To mitigate this, we've adopted a proactive approach by incorporating cond_resched() calls within the kernel code. This ensures that during lengthy kernel operations, the scheduler is invoked periodically to provide opportunities for other threads to execute. Signed-off-by: Yafang Shao Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20240327032022.78391-1-laoar.shao@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/hashtab.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel/bpf') diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 3a088a5349bc..e81059faae63 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -1490,6 +1490,7 @@ static void delete_all_elements(struct bpf_htab *htab) hlist_nulls_del_rcu(&l->hash_node); htab_elem_free(htab, l); } + cond_resched(); } migrate_enable(); } -- cgit v1.2.3 From e8742081db7d01f980c6161ae1e8a1dbc1e30979 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 28 Mar 2024 11:58:01 -0700 Subject: bpf: Mark bpf prog stack with kmsan_unposion_memory in interpreter mode syzbot reported uninit memory usages during map_{lookup,delete}_elem. ========== BUG: KMSAN: uninit-value in __dev_map_lookup_elem kernel/bpf/devmap.c:441 [inline] BUG: KMSAN: uninit-value in dev_map_lookup_elem+0xf3/0x170 kernel/bpf/devmap.c:796 __dev_map_lookup_elem kernel/bpf/devmap.c:441 [inline] dev_map_lookup_elem+0xf3/0x170 kernel/bpf/devmap.c:796 ____bpf_map_lookup_elem kernel/bpf/helpers.c:42 [inline] bpf_map_lookup_elem+0x5c/0x80 kernel/bpf/helpers.c:38 ___bpf_prog_run+0x13fe/0xe0f0 kernel/bpf/core.c:1997 __bpf_prog_run256+0xb5/0xe0 kernel/bpf/core.c:2237 ========== The reproducer should be in the interpreter mode. The C reproducer is trying to run the following bpf prog: 0: (18) r0 = 0x0 2: (18) r1 = map[id:49] 4: (b7) r8 = 16777216 5: (7b) *(u64 *)(r10 -8) = r8 6: (bf) r2 = r10 7: (07) r2 += -229 ^^^^^^^^^^ 8: (b7) r3 = 8 9: (b7) r4 = 0 10: (85) call dev_map_lookup_elem#1543472 11: (95) exit It is due to the "void *key" (r2) passed to the helper. bpf allows uninit stack memory access for bpf prog with the right privileges. This patch uses kmsan_unpoison_memory() to mark the stack as initialized. This should address different syzbot reports on the uninit "void *key" argument during map_{lookup,delete}_elem. Reported-by: syzbot+603bcd9b0bf1d94dbb9b@syzkaller.appspotmail.com Closes: https://lore.kernel.org/bpf/000000000000f9ce6d061494e694@google.com/ Reported-by: syzbot+eb02dc7f03dce0ef39f3@syzkaller.appspotmail.com Closes: https://lore.kernel.org/bpf/000000000000a5c69c06147c2238@google.com/ Reported-by: syzbot+b4e65ca24fd4d0c734c3@syzkaller.appspotmail.com Closes: https://lore.kernel.org/bpf/000000000000ac56fb06143b6cfa@google.com/ Reported-by: syzbot+d2b113dc9fea5e1d2848@syzkaller.appspotmail.com Closes: https://lore.kernel.org/bpf/0000000000000d69b206142d1ff7@google.com/ Reported-by: syzbot+1a3cf6f08d68868f9db3@syzkaller.appspotmail.com Closes: https://lore.kernel.org/bpf/0000000000006f876b061478e878@google.com/ Tested-by: syzbot+1a3cf6f08d68868f9db3@syzkaller.appspotmail.com Suggested-by: Yonghong Song Suggested-by: Alexei Starovoitov Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20240328185801.1843078-1-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/core.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel/bpf') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 5aacb1d3c4cc..ab400cdd7d7a 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2218,6 +2218,7 @@ static unsigned int PROG_NAME(stack_size)(const void *ctx, const struct bpf_insn u64 stack[stack_size / sizeof(u64)]; \ u64 regs[MAX_BPF_EXT_REG] = {}; \ \ + kmsan_unpoison_memory(stack, sizeof(stack)); \ FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; \ ARG1 = (u64) (unsigned long) ctx; \ return ___bpf_prog_run(regs, insn); \ @@ -2231,6 +2232,7 @@ static u64 PROG_NAME_ARGS(stack_size)(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5, \ u64 stack[stack_size / sizeof(u64)]; \ u64 regs[MAX_BPF_EXT_REG]; \ \ + kmsan_unpoison_memory(stack, sizeof(stack)); \ FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; \ BPF_R1 = r1; \ BPF_R2 = r2; \ -- cgit v1.2.3 From 59f2f841179aa6a0899cb9cf53659149a35749b7 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 29 Mar 2024 10:14:39 -0700 Subject: bpf: Avoid kfree_rcu() under lock in bpf_lpm_trie. syzbot reported the following lock sequence: cpu 2: grabs timer_base lock spins on bpf_lpm lock cpu 1: grab rcu krcp lock spins on timer_base lock cpu 0: grab bpf_lpm lock spins on rcu krcp lock bpf_lpm lock can be the same. timer_base lock can also be the same due to timer migration. but rcu krcp lock is always per-cpu, so it cannot be the same lock. Hence it's a false positive. To avoid lockdep complaining move kfree_rcu() after spin_unlock. Reported-by: syzbot+1fa663a2100308ab6eab@syzkaller.appspotmail.com Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240329171439.37813-1-alexei.starovoitov@gmail.com --- kernel/bpf/lpm_trie.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 939620b91c0e..0218a5132ab5 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -316,6 +316,7 @@ static long trie_update_elem(struct bpf_map *map, { struct lpm_trie *trie = container_of(map, struct lpm_trie, map); struct lpm_trie_node *node, *im_node = NULL, *new_node = NULL; + struct lpm_trie_node *free_node = NULL; struct lpm_trie_node __rcu **slot; struct bpf_lpm_trie_key_u8 *key = _key; unsigned long irq_flags; @@ -390,7 +391,7 @@ static long trie_update_elem(struct bpf_map *map, trie->n_entries--; rcu_assign_pointer(*slot, new_node); - kfree_rcu(node, rcu); + free_node = node; goto out; } @@ -437,6 +438,7 @@ out: } spin_unlock_irqrestore(&trie->lock, irq_flags); + kfree_rcu(free_node, rcu); return ret; } @@ -445,6 +447,7 @@ out: static long trie_delete_elem(struct bpf_map *map, void *_key) { struct lpm_trie *trie = container_of(map, struct lpm_trie, map); + struct lpm_trie_node *free_node = NULL, *free_parent = NULL; struct bpf_lpm_trie_key_u8 *key = _key; struct lpm_trie_node __rcu **trim, **trim2; struct lpm_trie_node *node, *parent; @@ -514,8 +517,8 @@ static long trie_delete_elem(struct bpf_map *map, void *_key) else rcu_assign_pointer( *trim2, rcu_access_pointer(parent->child[0])); - kfree_rcu(parent, rcu); - kfree_rcu(node, rcu); + free_parent = parent; + free_node = node; goto out; } @@ -529,10 +532,12 @@ static long trie_delete_elem(struct bpf_map *map, void *_key) rcu_assign_pointer(*trim, rcu_access_pointer(node->child[1])); else RCU_INIT_POINTER(*trim, NULL); - kfree_rcu(node, rcu); + free_node = node; out: spin_unlock_irqrestore(&trie->lock, irq_flags); + kfree_rcu(free_parent, rcu); + kfree_rcu(free_node, rcu); return ret; } -- cgit v1.2.3 From 9dc182c58b5f5d4ac125ac85ad553f7142aa08d4 Mon Sep 17 00:00:00 2001 From: Anton Protopopov Date: Tue, 2 Apr 2024 07:33:47 +0000 Subject: bpf: Add a verbose message if map limit is reached When more than 64 maps are used by a program and its subprograms the verifier returns -E2BIG. Add a verbose message which highlights the source of the error and also print the actual limit. Signed-off-by: Anton Protopopov Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20240402073347.195920-1-aspsk@isovalent.com --- kernel/bpf/verifier.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel/bpf') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index edb650667f44..fcb62300f407 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -18348,6 +18348,8 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env) } if (env->used_map_cnt >= MAX_USED_MAPS) { + verbose(env, "The total number of maps per program has reached the limit of %u\n", + MAX_USED_MAPS); fdput(f); return -E2BIG; } -- cgit v1.2.3 From ce09cbdd988887662546a1175bcfdfc6c8fdd150 Mon Sep 17 00:00:00 2001 From: Jose Fernandez Date: Mon, 1 Apr 2024 21:40:10 -0600 Subject: bpf: Improve program stats run-time calculation This patch improves the run-time calculation for program stats by capturing the duration as soon as possible after the program returns. Previously, the duration included u64_stats_t operations. While the instrumentation overhead is part of the total time spent when stats are enabled, distinguishing between the program's native execution time and the time spent due to instrumentation is crucial for accurate performance analysis. By making this change, the patch facilitates more precise optimization of BPF programs, enabling users to understand their performance in environments without stats enabled. I used a virtualized environment to measure the run-time over one minute for a basic raw_tracepoint/sys_enter program, which just increments a local counter. Although the virtualization introduced some performance degradation that could affect the results, I observed approximately a 16% decrease in average run-time reported by stats with this change (310 -> 260 nsec). Signed-off-by: Jose Fernandez Signed-off-by: Daniel Borkmann Acked-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20240402034010.25060-1-josef@netflix.com --- kernel/bpf/trampoline.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index cc50607f8d8c..26ae703d3c3b 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -885,12 +885,13 @@ static void notrace update_prog_stats(struct bpf_prog *prog, * Hence check that 'start' is valid. */ start > NO_START_TIME) { + u64 duration = sched_clock() - start; unsigned long flags; stats = this_cpu_ptr(prog->stats); flags = u64_stats_update_begin_irqsave(&stats->syncp); u64_stats_inc(&stats->cnt); - u64_stats_add(&stats->nsecs, sched_clock() - start); + u64_stats_add(&stats->nsecs, duration); u64_stats_update_end_irqrestore(&stats->syncp, flags); } } -- cgit v1.2.3 From 2e114248e086fb376405ed3f89b220f8586a2541 Mon Sep 17 00:00:00 2001 From: Justin Stitt Date: Tue, 2 Apr 2024 23:52:50 +0000 Subject: bpf: Replace deprecated strncpy with strscpy strncpy() is deprecated for use on NUL-terminated destination strings [1] and as such we should prefer more robust and less ambiguous string interfaces. bpf sym names get looked up and compared/cleaned with various string apis. This suggests they need to be NUL-terminated (strncpy() suggests this but does not guarantee it). | static int compare_symbol_name(const char *name, char *namebuf) | { | cleanup_symbol_name(namebuf); | return strcmp(name, namebuf); | } | static void cleanup_symbol_name(char *s) | { | ... | res = strstr(s, ".llvm."); | ... | } Use strscpy() as this method guarantees NUL-termination on the destination buffer. This patch also replaces two uses of strncpy() used in log.c. These are simple replacements as postfix has been zero-initialized on the stack and has source arguments with a size less than the destination's size. Note that this patch uses the new 2-argument version of strscpy introduced in commit e6584c3964f2f ("string: Allow 2-argument strscpy()"). Signed-off-by: Justin Stitt Signed-off-by: Daniel Borkmann Acked-by: Daniel Borkmann Link: https://www.kernel.org/doc/html/latest/process/deprecated.html#strncpy-on-nul-terminated-strings [1] Link: https://manpages.debian.org/testing/linux-manual-4.8/strscpy.9.en.html [2] Link: https://github.com/KSPP/linux/issues/90 Link: https://lore.kernel.org/bpf/20240402-strncpy-kernel-bpf-core-c-v1-1-7cb07a426e78@google.com --- kernel/bpf/core.c | 4 ++-- kernel/bpf/log.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index ab400cdd7d7a..ae406a2814db 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -747,7 +747,7 @@ const char *__bpf_address_lookup(unsigned long addr, unsigned long *size, unsigned long symbol_start = ksym->start; unsigned long symbol_end = ksym->end; - strncpy(sym, ksym->name, KSYM_NAME_LEN); + strscpy(sym, ksym->name, KSYM_NAME_LEN); ret = sym; if (size) @@ -813,7 +813,7 @@ int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type, if (it++ != symnum) continue; - strncpy(sym, ksym->name, KSYM_NAME_LEN); + strscpy(sym, ksym->name, KSYM_NAME_LEN); *value = ksym->start; *type = BPF_SYM_ELF_TYPE; diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c index 2a243cf37c60..4bd8f17a9f24 100644 --- a/kernel/bpf/log.c +++ b/kernel/bpf/log.c @@ -467,9 +467,9 @@ const char *reg_type_str(struct bpf_verifier_env *env, enum bpf_reg_type type) if (type & PTR_MAYBE_NULL) { if (base_type(type) == PTR_TO_BTF_ID) - strncpy(postfix, "or_null_", 16); + strscpy(postfix, "or_null_"); else - strncpy(postfix, "_or_null", 16); + strscpy(postfix, "_or_null"); } snprintf(prefix, sizeof(prefix), "%s%s%s%s%s%s%s", -- cgit v1.2.3 From 7bdbf7446305cb65c510c16d57cde82bc76b234a Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 1 Apr 2024 19:13:02 -0700 Subject: bpf: add special internal-only MOV instruction to resolve per-CPU addrs Add a new BPF instruction for resolving absolute addresses of per-CPU data from their per-CPU offsets. This instruction is internal-only and users are not allowed to use them directly. They will only be used for internal inlining optimizations for now between BPF verifier and BPF JITs. We use a special BPF_MOV | BPF_ALU64 | BPF_X form with insn->off field set to BPF_ADDR_PERCPU = -1. I used negative offset value to distinguish them from positive ones used by user-exposed instructions. Such instruction performs a resolution of a per-CPU offset stored in a register to a valid kernel address which can be dereferenced. It is useful in any use case where absolute address of a per-CPU data has to be resolved (e.g., in inlining bpf_map_lookup_elem()). BPF disassembler is also taught to recognize them to support dumping final BPF assembly code (non-JIT'ed version). Add arch-specific way for BPF JITs to mark support for this instructions. This patch also adds support for these instructions in x86-64 BPF JIT. Signed-off-by: Andrii Nakryiko Acked-by: John Fastabend Link: https://lore.kernel.org/r/20240402021307.1012571-2-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/core.c | 5 +++++ kernel/bpf/disasm.c | 14 ++++++++++++++ 2 files changed, 19 insertions(+) (limited to 'kernel/bpf') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index ae406a2814db..7a33a3a7e63c 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2945,6 +2945,11 @@ bool __weak bpf_jit_supports_subprog_tailcalls(void) return false; } +bool __weak bpf_jit_supports_percpu_insn(void) +{ + return false; +} + bool __weak bpf_jit_supports_kfunc_call(void) { return false; diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c index bd2e2dd04740..309c4aa1b026 100644 --- a/kernel/bpf/disasm.c +++ b/kernel/bpf/disasm.c @@ -172,6 +172,17 @@ static bool is_addr_space_cast(const struct bpf_insn *insn) insn->off == BPF_ADDR_SPACE_CAST; } +/* Special (internal-only) form of mov, used to resolve per-CPU addrs: + * dst_reg = src_reg + + * BPF_ADDR_PERCPU is used as a special insn->off value. + */ +#define BPF_ADDR_PERCPU (-1) + +static inline bool is_mov_percpu_addr(const struct bpf_insn *insn) +{ + return insn->code == (BPF_ALU64 | BPF_MOV | BPF_X) && insn->off == BPF_ADDR_PERCPU; +} + void print_bpf_insn(const struct bpf_insn_cbs *cbs, const struct bpf_insn *insn, bool allow_ptr_leaks) @@ -194,6 +205,9 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs, verbose(cbs->private_data, "(%02x) r%d = addr_space_cast(r%d, %d, %d)\n", insn->code, insn->dst_reg, insn->src_reg, ((u32)insn->imm) >> 16, (u16)insn->imm); + } else if (is_mov_percpu_addr(insn)) { + verbose(cbs->private_data, "(%02x) r%d = &(void __percpu *)(r%d)\n", + insn->code, insn->dst_reg, insn->src_reg); } else if (BPF_SRC(insn->code) == BPF_X) { verbose(cbs->private_data, "(%02x) %c%d %s %s%c%d\n", insn->code, class == BPF_ALU ? 'w' : 'r', -- cgit v1.2.3 From 1ae6921009e5d72787e07ccc04754514ccf6bc99 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 1 Apr 2024 19:13:03 -0700 Subject: bpf: inline bpf_get_smp_processor_id() helper If BPF JIT supports per-CPU MOV instruction, inline bpf_get_smp_processor_id() to eliminate unnecessary function calls. Signed-off-by: Andrii Nakryiko Acked-by: John Fastabend Link: https://lore.kernel.org/r/20240402021307.1012571-3-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'kernel/bpf') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index fcb62300f407..17c06f1505e4 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -20074,6 +20074,30 @@ patch_map_ops_generic: goto next_insn; } +#ifdef CONFIG_X86_64 + /* Implement bpf_get_smp_processor_id() inline. */ + if (insn->imm == BPF_FUNC_get_smp_processor_id && + prog->jit_requested && bpf_jit_supports_percpu_insn()) { + /* BPF_FUNC_get_smp_processor_id inlining is an + * optimization, so if pcpu_hot.cpu_number is ever + * changed in some incompatible and hard to support + * way, it's fine to back out this inlining logic + */ + insn_buf[0] = BPF_MOV32_IMM(BPF_REG_0, (u32)(unsigned long)&pcpu_hot.cpu_number); + insn_buf[1] = BPF_MOV64_PERCPU_REG(BPF_REG_0, BPF_REG_0); + insn_buf[2] = BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_0, 0); + cnt = 3; + + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; + goto next_insn; + } +#endif /* Implement bpf_get_func_arg inline. */ if (prog_type == BPF_PROG_TYPE_TRACING && insn->imm == BPF_FUNC_get_func_arg) { -- cgit v1.2.3 From db69718b8efac802c7cc20d5a6c7dfc913f99c43 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 1 Apr 2024 19:13:04 -0700 Subject: bpf: inline bpf_map_lookup_elem() for PERCPU_ARRAY maps Using new per-CPU BPF instruction implement inlining for per-CPU ARRAY map lookup helper, if BPF JIT support is present. Signed-off-by: Andrii Nakryiko Acked-by: John Fastabend Link: https://lore.kernel.org/r/20240402021307.1012571-4-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/arraymap.c | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) (limited to 'kernel/bpf') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 13358675ff2e..8c1e6d7654bb 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -246,6 +246,38 @@ static void *percpu_array_map_lookup_elem(struct bpf_map *map, void *key) return this_cpu_ptr(array->pptrs[index & array->index_mask]); } +/* emit BPF instructions equivalent to C code of percpu_array_map_lookup_elem() */ +static int percpu_array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + struct bpf_insn *insn = insn_buf; + + if (!bpf_jit_supports_percpu_insn()) + return -EOPNOTSUPP; + + if (map->map_flags & BPF_F_INNER_MAP) + return -EOPNOTSUPP; + + BUILD_BUG_ON(offsetof(struct bpf_array, map) != 0); + *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, offsetof(struct bpf_array, pptrs)); + + *insn++ = BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_2, 0); + if (!map->bypass_spec_v1) { + *insn++ = BPF_JMP_IMM(BPF_JGE, BPF_REG_0, map->max_entries, 6); + *insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_0, array->index_mask); + } else { + *insn++ = BPF_JMP_IMM(BPF_JGE, BPF_REG_0, map->max_entries, 5); + } + + *insn++ = BPF_ALU64_IMM(BPF_LSH, BPF_REG_0, 3); + *insn++ = BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1); + *insn++ = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0); + *insn++ = BPF_MOV64_PERCPU_REG(BPF_REG_0, BPF_REG_0); + *insn++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1); + *insn++ = BPF_MOV64_IMM(BPF_REG_0, 0); + return insn - insn_buf; +} + static void *percpu_array_map_lookup_percpu_elem(struct bpf_map *map, void *key, u32 cpu) { struct bpf_array *array = container_of(map, struct bpf_array, map); @@ -776,6 +808,7 @@ const struct bpf_map_ops percpu_array_map_ops = { .map_free = array_map_free, .map_get_next_key = array_map_get_next_key, .map_lookup_elem = percpu_array_map_lookup_elem, + .map_gen_lookup = percpu_array_map_gen_lookup, .map_update_elem = array_map_update_elem, .map_delete_elem = array_map_delete_elem, .map_lookup_percpu_elem = percpu_array_map_lookup_percpu_elem, -- cgit v1.2.3 From 0b56e637f705836b9aa51e2b1058c3c814c121a8 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 1 Apr 2024 19:13:05 -0700 Subject: bpf: inline bpf_map_lookup_elem() helper for PERCPU_HASH map Using new per-CPU BPF instruction, partially inline bpf_map_lookup_elem() helper for per-CPU hashmap BPF map. Just like for normal HASH map, we still generate a call into __htab_map_lookup_elem(), but after that we resolve per-CPU element address using a new instruction, saving on extra functions calls. Signed-off-by: Andrii Nakryiko Acked-by: John Fastabend Link: https://lore.kernel.org/r/20240402021307.1012571-5-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/hashtab.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'kernel/bpf') diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index e81059faae63..83a9a74260e9 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -2308,6 +2308,26 @@ static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key) return NULL; } +/* inline bpf_map_lookup_elem() call for per-CPU hashmap */ +static int htab_percpu_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) +{ + struct bpf_insn *insn = insn_buf; + + if (!bpf_jit_supports_percpu_insn()) + return -EOPNOTSUPP; + + BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem, + (void *(*)(struct bpf_map *map, void *key))NULL)); + *insn++ = BPF_EMIT_CALL(__htab_map_lookup_elem); + *insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 3); + *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, + offsetof(struct htab_elem, key) + map->key_size); + *insn++ = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0); + *insn++ = BPF_MOV64_PERCPU_REG(BPF_REG_0, BPF_REG_0); + + return insn - insn_buf; +} + static void *htab_percpu_map_lookup_percpu_elem(struct bpf_map *map, void *key, u32 cpu) { struct htab_elem *l; @@ -2436,6 +2456,7 @@ const struct bpf_map_ops htab_percpu_map_ops = { .map_free = htab_map_free, .map_get_next_key = htab_map_get_next_key, .map_lookup_elem = htab_percpu_map_lookup_elem, + .map_gen_lookup = htab_percpu_map_gen_lookup, .map_lookup_and_delete_elem = htab_percpu_map_lookup_and_delete_elem, .map_update_elem = htab_percpu_map_update_elem, .map_delete_elem = htab_map_delete_elem, -- cgit v1.2.3 From af682b767a41772499f8e54ca7d7e1deb3395f44 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 1 Apr 2024 16:38:00 -0700 Subject: bpf: Optimize emit_mov_imm64(). Turned out that bpf prog callback addresses, bpf prog addresses used in bpf_trampoline, and in other cases the 64-bit address can be represented as sign extended 32-bit value. According to https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82339 "Skylake has 0.64c throughput for mov r64, imm64, vs. 0.25 for mov r32, imm32." So use shorter encoding and faster instruction when possible. Special care is needed in jit_subprogs(), since bpf_pseudo_func() instruction cannot change its size during the last step of JIT. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/CAADnVQKFfpY-QZBrOU2CG8v2du8Lgyb7MNVmOZVK_yTyOdNbBA@mail.gmail.com Link: https://lore.kernel.org/bpf/20240401233800.42737-1-alexei.starovoitov@gmail.com --- kernel/bpf/verifier.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 17c06f1505e4..1e03ba9ed07b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -19147,12 +19147,19 @@ static int jit_subprogs(struct bpf_verifier_env *env) env->insn_aux_data[i].call_imm = insn->imm; /* point imm to __bpf_call_base+1 from JITs point of view */ insn->imm = 1; - if (bpf_pseudo_func(insn)) + if (bpf_pseudo_func(insn)) { +#if defined(MODULES_VADDR) + u64 addr = MODULES_VADDR; +#else + u64 addr = VMALLOC_START; +#endif /* jit (e.g. x86_64) may emit fewer instructions * if it learns a u32 imm is the same as a u64 imm. - * Force a non zero here. + * Set close enough to possible prog address. */ - insn[1].imm = 1; + insn[0].imm = (u32)addr; + insn[1].imm = addr >> 32; + } } err = bpf_prog_alloc_jited_linfo(prog); -- cgit v1.2.3 From 314a53623cd4e62e1b88126e5ed2bc87073d90ee Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 3 Apr 2024 17:26:40 -0700 Subject: bpf: inline bpf_get_branch_snapshot() helper Inline bpf_get_branch_snapshot() helper using architecture-agnostic inline BPF code which calls directly into underlying callback of perf_snapshot_branch_stack static call. This callback is set early during kernel initialization and is never updated or reset, so it's ok to fetch actual implementation using static_call_query() and call directly into it. This change eliminates a full function call and saves one LBR entry in PERF_SAMPLE_BRANCH_ANY LBR mode. Acked-by: John Fastabend Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20240404002640.1774210-3-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) (limited to 'kernel/bpf') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 1e03ba9ed07b..ffaa9f7f153c 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -20188,6 +20188,61 @@ patch_map_ops_generic: goto next_insn; } + /* Implement bpf_get_branch_snapshot inline. */ + if (prog->jit_requested && BITS_PER_LONG == 64 && + insn->imm == BPF_FUNC_get_branch_snapshot) { + /* We are dealing with the following func protos: + * u64 bpf_get_branch_snapshot(void *buf, u32 size, u64 flags); + * int perf_snapshot_branch_stack(struct perf_branch_entry *entries, u32 cnt); + */ + const u32 br_entry_size = sizeof(struct perf_branch_entry); + + /* struct perf_branch_entry is part of UAPI and is + * used as an array element, so extremely unlikely to + * ever grow or shrink + */ + BUILD_BUG_ON(br_entry_size != 24); + + /* if (unlikely(flags)) return -EINVAL */ + insn_buf[0] = BPF_JMP_IMM(BPF_JNE, BPF_REG_3, 0, 7); + + /* Transform size (bytes) into number of entries (cnt = size / 24). + * But to avoid expensive division instruction, we implement + * divide-by-3 through multiplication, followed by further + * division by 8 through 3-bit right shift. + * Refer to book "Hacker's Delight, 2nd ed." by Henry S. Warren, Jr., + * p. 227, chapter "Unsigned Divison by 3" for details and proofs. + * + * N / 3 <=> M * N / 2^33, where M = (2^33 + 1) / 3 = 0xaaaaaaab. + */ + insn_buf[1] = BPF_MOV32_IMM(BPF_REG_0, 0xaaaaaaab); + insn_buf[2] = BPF_ALU64_REG(BPF_MUL, BPF_REG_2, BPF_REG_0); + insn_buf[3] = BPF_ALU64_IMM(BPF_RSH, BPF_REG_2, 36); + + /* call perf_snapshot_branch_stack implementation */ + insn_buf[4] = BPF_EMIT_CALL(static_call_query(perf_snapshot_branch_stack)); + /* if (entry_cnt == 0) return -ENOENT */ + insn_buf[5] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4); + /* return entry_cnt * sizeof(struct perf_branch_entry) */ + insn_buf[6] = BPF_ALU32_IMM(BPF_MUL, BPF_REG_0, br_entry_size); + insn_buf[7] = BPF_JMP_A(3); + /* return -EINVAL; */ + insn_buf[8] = BPF_MOV64_IMM(BPF_REG_0, -EINVAL); + insn_buf[9] = BPF_JMP_A(1); + /* return -ENOENT; */ + insn_buf[10] = BPF_MOV64_IMM(BPF_REG_0, -ENOENT); + cnt = 11; + + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; + continue; + } + /* Implement bpf_kptr_xchg inline */ if (prog->jit_requested && BITS_PER_LONG == 64 && insn->imm == BPF_FUNC_kptr_xchg && -- cgit v1.2.3 From 1f2a74b41ea8b902687eb97c4e7e3f558801865b Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 4 Apr 2024 14:45:35 -0700 Subject: bpf: prevent r10 register from being marked as precise r10 is a special register that is not under BPF program's control and is always effectively precise. The rest of precision logic assumes that only r0-r9 SCALAR registers are marked as precise, so prevent r10 from being marked precise. This can happen due to signed cast instruction allowing to do something like `r0 = (s8)r10;`, which later, if r0 needs to be precise, would lead to an attempt to mark r10 as precise. Prevent this with an extra check during instruction backtracking. Fixes: 8100928c8814 ("bpf: Support new sign-extension mov insns") Reported-by: syzbot+148110ee7cf72f39f33e@syzkaller.appspotmail.com Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20240404214536.3551295-1-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index ffaa9f7f153c..41446bea8fab 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3615,7 +3615,8 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, * sreg needs precision before this insn */ bt_clear_reg(bt, dreg); - bt_set_reg(bt, sreg); + if (sreg != BPF_REG_FP) + bt_set_reg(bt, sreg); } else { /* dreg = K * dreg needs precision after this insn. @@ -3631,7 +3632,8 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, * both dreg and sreg need precision * before this insn */ - bt_set_reg(bt, sreg); + if (sreg != BPF_REG_FP) + bt_set_reg(bt, sreg); } /* else dreg += K * dreg still needs precision before this insn */ -- cgit v1.2.3 From 58babe27180c8d4cb54d831589cf801bd9268876 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 5 Apr 2024 16:26:25 +0200 Subject: bpf: fix perf_snapshot_branch_stack link failure The newly added code to handle bpf_get_branch_snapshot fails to link when CONFIG_PERF_EVENTS is disabled: aarch64-linux-ld: kernel/bpf/verifier.o: in function `do_misc_fixups': verifier.c:(.text+0x1090c): undefined reference to `__SCK__perf_snapshot_branch_stack' Add a build-time check for that Kconfig symbol around the code to remove the link time dependency. Fixes: 314a53623cd4 ("bpf: inline bpf_get_branch_snapshot() helper") Signed-off-by: Arnd Bergmann Link: https://lore.kernel.org/r/20240405142637.577046-1-arnd@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 41446bea8fab..4353cb09c35b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -20191,7 +20191,8 @@ patch_map_ops_generic: } /* Implement bpf_get_branch_snapshot inline. */ - if (prog->jit_requested && BITS_PER_LONG == 64 && + if (IS_ENABLED(CONFIG_PERF_EVENTS) && + prog->jit_requested && BITS_PER_LONG == 64 && insn->imm == BPF_FUNC_get_branch_snapshot) { /* We are dealing with the following func protos: * u64 bpf_get_branch_snapshot(void *buf, u32 size, u64 flags); -- cgit v1.2.3 From 0a525621b7e5b49202b19d8f75382c6778fdd0c1 Mon Sep 17 00:00:00 2001 From: Philo Lu Date: Fri, 5 Apr 2024 10:55:34 +0800 Subject: bpf: store both map ptr and state in bpf_insn_aux_data Currently, bpf_insn_aux_data->map_ptr_state is used to store either map_ptr or its poison state (i.e., BPF_MAP_PTR_POISON). Thus BPF_MAP_PTR_POISON must be checked before reading map_ptr. In certain cases, we may need valid map_ptr even in case of poison state. This will be explained in next patch with bpf_for_each_map_elem() helper. This patch changes map_ptr_state into a new struct including both map pointer and its state (poison/unpriv). It's in the same union with struct bpf_loop_inline_state, so there is no extra memory overhead. Besides, macros BPF_MAP_PTR_UNPRIV/BPF_MAP_PTR_POISON/BPF_MAP_PTR are no longer needed. This patch does not change any existing functionality. Signed-off-by: Philo Lu Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20240405025536.18113-2-lulie@linux.alibaba.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 36 ++++++++++++++++-------------------- 1 file changed, 16 insertions(+), 20 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 4353cb09c35b..62fedb1f3312 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -190,11 +190,6 @@ struct bpf_verifier_stack_elem { #define BPF_MAP_KEY_POISON (1ULL << 63) #define BPF_MAP_KEY_SEEN (1ULL << 62) -#define BPF_MAP_PTR_UNPRIV 1UL -#define BPF_MAP_PTR_POISON ((void *)((0xeB9FUL << 1) + \ - POISON_POINTER_DELTA)) -#define BPF_MAP_PTR(X) ((struct bpf_map *)((X) & ~BPF_MAP_PTR_UNPRIV)) - #define BPF_GLOBAL_PERCPU_MA_MAX_SIZE 512 static int acquire_reference_state(struct bpf_verifier_env *env, int insn_idx); @@ -209,21 +204,22 @@ static bool is_trusted_reg(const struct bpf_reg_state *reg); static bool bpf_map_ptr_poisoned(const struct bpf_insn_aux_data *aux) { - return BPF_MAP_PTR(aux->map_ptr_state) == BPF_MAP_PTR_POISON; + return aux->map_ptr_state.poison; } static bool bpf_map_ptr_unpriv(const struct bpf_insn_aux_data *aux) { - return aux->map_ptr_state & BPF_MAP_PTR_UNPRIV; + return aux->map_ptr_state.unpriv; } static void bpf_map_ptr_store(struct bpf_insn_aux_data *aux, - const struct bpf_map *map, bool unpriv) + struct bpf_map *map, + bool unpriv, bool poison) { - BUILD_BUG_ON((unsigned long)BPF_MAP_PTR_POISON & BPF_MAP_PTR_UNPRIV); unpriv |= bpf_map_ptr_unpriv(aux); - aux->map_ptr_state = (unsigned long)map | - (unpriv ? BPF_MAP_PTR_UNPRIV : 0UL); + aux->map_ptr_state.unpriv = unpriv; + aux->map_ptr_state.poison = poison; + aux->map_ptr_state.map_ptr = map; } static bool bpf_map_key_poisoned(const struct bpf_insn_aux_data *aux) @@ -9660,7 +9656,7 @@ static int set_map_elem_callback_state(struct bpf_verifier_env *env, return -EINVAL; } - map = BPF_MAP_PTR(insn_aux->map_ptr_state); + map = insn_aux->map_ptr_state.map_ptr; if (!map->ops->map_set_for_each_callback_args || !map->ops->map_for_each_callback) { verbose(env, "callback function not allowed for map\n"); @@ -10019,12 +10015,12 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, return -EACCES; } - if (!BPF_MAP_PTR(aux->map_ptr_state)) + if (!aux->map_ptr_state.map_ptr) + bpf_map_ptr_store(aux, meta->map_ptr, + !meta->map_ptr->bypass_spec_v1, false); + else if (aux->map_ptr_state.map_ptr != meta->map_ptr) bpf_map_ptr_store(aux, meta->map_ptr, - !meta->map_ptr->bypass_spec_v1); - else if (BPF_MAP_PTR(aux->map_ptr_state) != meta->map_ptr) - bpf_map_ptr_store(aux, BPF_MAP_PTR_POISON, - !meta->map_ptr->bypass_spec_v1); + !meta->map_ptr->bypass_spec_v1, true); return 0; } @@ -19840,7 +19836,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env) !bpf_map_ptr_unpriv(aux)) { struct bpf_jit_poke_descriptor desc = { .reason = BPF_POKE_REASON_TAIL_CALL, - .tail_call.map = BPF_MAP_PTR(aux->map_ptr_state), + .tail_call.map = aux->map_ptr_state.map_ptr, .tail_call.key = bpf_map_key_immediate(aux), .insn_idx = i + delta, }; @@ -19869,7 +19865,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env) return -EINVAL; } - map_ptr = BPF_MAP_PTR(aux->map_ptr_state); + map_ptr = aux->map_ptr_state.map_ptr; insn_buf[0] = BPF_JMP_IMM(BPF_JGE, BPF_REG_3, map_ptr->max_entries, 2); insn_buf[1] = BPF_ALU32_IMM(BPF_AND, BPF_REG_3, @@ -19977,7 +19973,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env) if (bpf_map_ptr_poisoned(aux)) goto patch_call_imm; - map_ptr = BPF_MAP_PTR(aux->map_ptr_state); + map_ptr = aux->map_ptr_state.map_ptr; ops = map_ptr->ops; if (insn->imm == BPF_FUNC_map_lookup_elem && ops->map_gen_lookup) { -- cgit v1.2.3 From 9d482da9e17a4ddd5563428f74302a36b2610306 Mon Sep 17 00:00:00 2001 From: Philo Lu Date: Fri, 5 Apr 2024 10:55:35 +0800 Subject: bpf: allow invoking bpf_for_each_map_elem with different maps Taking different maps within a single bpf_for_each_map_elem call is not allowed before, because from the second map, bpf_insn_aux_data->map_ptr_state will be marked as *poison*. In fact both map_ptr and state are needed to support this use case: map_ptr is used by set_map_elem_callback_state() while poison state is needed to determine whether to use direct call. Signed-off-by: Philo Lu Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20240405025536.18113-3-lulie@linux.alibaba.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 62fedb1f3312..590db4e4c071 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -9651,11 +9651,7 @@ static int set_map_elem_callback_state(struct bpf_verifier_env *env, struct bpf_map *map; int err; - if (bpf_map_ptr_poisoned(insn_aux)) { - verbose(env, "tail_call abusing map_ptr\n"); - return -EINVAL; - } - + /* valid map_ptr and poison value does not matter */ map = insn_aux->map_ptr_state.map_ptr; if (!map->ops->map_set_for_each_callback_args || !map->ops->map_for_each_callback) { -- cgit v1.2.3 From a8e03b6bbb2cc7cf387d1ce335e4ce4c3bdfef9b Mon Sep 17 00:00:00 2001 From: David Vernet Date: Fri, 5 Apr 2024 09:30:40 -0500 Subject: bpf: Allow invoking kfuncs from BPF_PROG_TYPE_SYSCALL progs Currently, a set of core BPF kfuncs (e.g. bpf_task_*, bpf_cgroup_*, bpf_cpumask_*, etc) cannot be invoked from BPF_PROG_TYPE_SYSCALL programs. The whitelist approach taken for enabling kfuncs makes sense: it not safe to call these kfuncs from every program type. For example, it may not be safe to call bpf_task_acquire() in an fentry to free_task(). BPF_PROG_TYPE_SYSCALL, on the other hand, is a perfectly safe program type from which to invoke these kfuncs, as it's a very controlled environment, and we should never be able to run into any of the typical problems such as recursive invoations, acquiring references on freeing kptrs, etc. Being able to invoke these kfuncs would be useful, as BPF_PROG_TYPE_SYSCALL can be invoked with BPF_PROG_RUN, and would therefore enable user space programs to synchronously call into BPF to manipulate these kptrs. This patch therefore enables invoking the aforementioned core kfuncs from BPF_PROG_TYPE_SYSCALL progs. Signed-off-by: David Vernet Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20240405143041.632519-2-void@manifault.com --- kernel/bpf/cpumask.c | 1 + kernel/bpf/helpers.c | 1 + 2 files changed, 2 insertions(+) (limited to 'kernel/bpf') diff --git a/kernel/bpf/cpumask.c b/kernel/bpf/cpumask.c index dad0fb1c8e87..33c473d676a5 100644 --- a/kernel/bpf/cpumask.c +++ b/kernel/bpf/cpumask.c @@ -474,6 +474,7 @@ static int __init cpumask_kfunc_init(void) ret = bpf_mem_alloc_init(&bpf_cpumask_ma, sizeof(struct bpf_cpumask), false); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &cpumask_kfunc_set); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &cpumask_kfunc_set); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &cpumask_kfunc_set); return ret ?: register_btf_id_dtor_kfuncs(cpumask_dtors, ARRAY_SIZE(cpumask_dtors), THIS_MODULE); diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index d9e7aca8ae9e..8cde717137bd 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -2653,6 +2653,7 @@ static int __init kfunc_init(void) ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &generic_kfunc_set); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &generic_kfunc_set); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &generic_kfunc_set); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &generic_kfunc_set); ret = ret ?: register_btf_id_dtor_kfuncs(generic_dtors, ARRAY_SIZE(generic_dtors), THIS_MODULE); -- cgit v1.2.3 From b993115b44d769cb34b394d92658226df81a6c89 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 22 Feb 2024 10:16:17 -0800 Subject: bpf: Select new NEED_TASKS_RCU Kconfig option Currently, if a Kconfig option depends on TASKS_RCU, it conditionally does "select TASKS_RCU if PREEMPTION". This works, but requires any change in this enablement logic to be replicated across all such "select" clauses. A new NEED_TASKS_RCU Kconfig option has been created to allow this enablement logic to be in one place in kernel/rcu/Kconfig. Therefore, make BPF select the new NEED_TASKS_RCU Kconfig option. Signed-off-by: Paul E. McKenney Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: Andrii Nakryiko Cc: Martin KaFai Lau Cc: Song Liu Cc: Yonghong Song Cc: John Fastabend Cc: KP Singh Cc: Stanislav Fomichev Cc: Hao Luo Cc: Jiri Olsa Cc: Cc: Ankur Arora Cc: Thomas Gleixner Cc: Steven Rostedt Acked-by: Mark Rutland Signed-off-by: Uladzislau Rezki (Sony) --- kernel/bpf/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/Kconfig b/kernel/bpf/Kconfig index bc25f5098a25..4100df44c665 100644 --- a/kernel/bpf/Kconfig +++ b/kernel/bpf/Kconfig @@ -28,7 +28,7 @@ config BPF_SYSCALL bool "Enable bpf() system call" select BPF select IRQ_WORK - select TASKS_RCU if PREEMPTION + select NEED_TASKS_RCU select TASKS_TRACE_RCU select BINARY_PRINTF select NET_SOCK_MSG if NET -- cgit v1.2.3 From d503a04f8bc0c75dc9db9452d8cc79d748afb752 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 5 Apr 2024 16:11:33 -0700 Subject: bpf: Add support for certain atomics in bpf_arena to x86 JIT Support atomics in bpf_arena that can be JITed as a single x86 instruction. Instructions that are JITed as loops are not supported at the moment, since they require more complex extable and loop logic. JITs can choose to do smarter things with bpf_jit_supports_insn(). Like arm64 may decide to support all bpf atomics instructions when emit_lse_atomic is available and none in ll_sc mode. bpf_jit_supports_percpu_insn(), bpf_jit_supports_ptr_xchg() and other such callbacks can be replaced with bpf_jit_supports_insn() in the future. Signed-off-by: Alexei Starovoitov Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20240405231134.17274-1-alexei.starovoitov@gmail.com Signed-off-by: Martin KaFai Lau --- kernel/bpf/core.c | 5 +++++ kernel/bpf/verifier.c | 19 ++++++++++++++++++- 2 files changed, 23 insertions(+), 1 deletion(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 7a33a3a7e63c..a41718eaeefe 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2965,6 +2965,11 @@ bool __weak bpf_jit_supports_arena(void) return false; } +bool __weak bpf_jit_supports_insn(struct bpf_insn *insn, bool in_arena) +{ + return false; +} + /* Return TRUE if the JIT backend satisfies the following two conditions: * 1) JIT backend supports atomic_xchg() on pointer-sized words. * 2) Under the specific arch, the implementation of xchg() is the same diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 590db4e4c071..2aad6d90550f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6970,6 +6970,9 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn return err; } +static int save_aux_ptr_type(struct bpf_verifier_env *env, enum bpf_reg_type type, + bool allow_trust_missmatch); + static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_insn *insn) { int load_reg; @@ -7030,7 +7033,7 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i is_pkt_reg(env, insn->dst_reg) || is_flow_key_reg(env, insn->dst_reg) || is_sk_reg(env, insn->dst_reg) || - is_arena_reg(env, insn->dst_reg)) { + (is_arena_reg(env, insn->dst_reg) && !bpf_jit_supports_insn(insn, true))) { verbose(env, "BPF_ATOMIC stores into R%d %s is not allowed\n", insn->dst_reg, reg_type_str(env, reg_state(env, insn->dst_reg)->type)); @@ -7066,6 +7069,11 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i if (err) return err; + if (is_arena_reg(env, insn->dst_reg)) { + err = save_aux_ptr_type(env, PTR_TO_ARENA, false); + if (err) + return err; + } /* Check whether we can write into the same memory. */ err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, BPF_SIZE(insn->code), BPF_WRITE, -1, true, false); @@ -18955,6 +18963,12 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) insn->code == (BPF_ST | BPF_MEM | BPF_W) || insn->code == (BPF_ST | BPF_MEM | BPF_DW)) { type = BPF_WRITE; + } else if ((insn->code == (BPF_STX | BPF_ATOMIC | BPF_W) || + insn->code == (BPF_STX | BPF_ATOMIC | BPF_DW)) && + env->insn_aux_data[i + delta].ptr_type == PTR_TO_ARENA) { + insn->code = BPF_STX | BPF_PROBE_ATOMIC | BPF_SIZE(insn->code); + env->prog->aux->num_exentries++; + continue; } else { continue; } @@ -19226,6 +19240,9 @@ static int jit_subprogs(struct bpf_verifier_env *env) BPF_CLASS(insn->code) == BPF_ST) && BPF_MODE(insn->code) == BPF_PROBE_MEM32) num_exentries++; + if (BPF_CLASS(insn->code) == BPF_STX && + BPF_MODE(insn->code) == BPF_PROBE_ATOMIC) + num_exentries++; } func[i]->aux->num_exentries = num_exentries; func[i]->aux->tail_call_reachable = env->subprog_info[i].tail_call_reachable; -- cgit v1.2.3 From 699c23f02c65cbfc3e638f14ce0d70c23a2e1f02 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 9 Apr 2024 21:35:27 -0700 Subject: bpf: Add bpf_link support for sk_msg and sk_skb progs Add bpf_link support for sk_msg and sk_skb programs. We have an internal request to support bpf_link for sk_msg programs so user space can have a uniform handling with bpf_link based libbpf APIs. Using bpf_link based libbpf API also has a benefit which makes system robust by decoupling prog life cycle and attachment life cycle. Reviewed-by: John Fastabend Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20240410043527.3737160-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel/bpf') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index e44c276e8617..7d392ec83655 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -5213,6 +5213,10 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) case BPF_PROG_TYPE_SK_LOOKUP: ret = netns_bpf_link_create(attr, prog); break; + case BPF_PROG_TYPE_SK_MSG: + case BPF_PROG_TYPE_SK_SKB: + ret = sock_map_link_create(attr, prog); + break; #ifdef CONFIG_NET case BPF_PROG_TYPE_XDP: ret = bpf_xdp_link_attach(attr, prog); -- cgit v1.2.3 From 1e52af7f023c44859868306fac1a4b6b556cf47b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 28 Feb 2024 11:17:28 -0800 Subject: bpf: Choose RCU Tasks based on TASKS_RCU rather than PREEMPTION The advent of CONFIG_PREEMPT_AUTO, AKA lazy preemption, will mean that even kernels built with CONFIG_PREEMPT_NONE or CONFIG_PREEMPT_VOLUNTARY might see the occasional preemption, and that this preemption just might happen within a trampoline. Therefore, update bpf_tramp_image_put() to choose call_rcu_tasks() based on CONFIG_TASKS_RCU instead of CONFIG_PREEMPTION. This change might enable further simplifications, but the goal of this effort is to make the code safe, not necessarily optimal. Signed-off-by: Paul E. McKenney Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: John Fastabend Cc: Andrii Nakryiko Cc: Martin KaFai Lau Cc: Song Liu Cc: Yonghong Song Cc: KP Singh Cc: Stanislav Fomichev Cc: Hao Luo Cc: Jiri Olsa Cc: Ankur Arora Cc: Thomas Gleixner Cc: Signed-off-by: Uladzislau Rezki (Sony) --- kernel/bpf/trampoline.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index db7599c59c78..88673a4267eb 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -333,7 +333,7 @@ static void bpf_tramp_image_put(struct bpf_tramp_image *im) int err = bpf_arch_text_poke(im->ip_after_call, BPF_MOD_JUMP, NULL, im->ip_epilogue); WARN_ON(err); - if (IS_ENABLED(CONFIG_PREEMPTION)) + if (IS_ENABLED(CONFIG_TASKS_RCU)) call_rcu_tasks(&im->rcu, __bpf_tramp_image_put_rcu_tasks); else percpu_ref_kill(&im->pcref); -- cgit v1.2.3 From 37eacb9f6e89fb399a79e952bc9c78eb3e16290e Mon Sep 17 00:00:00 2001 From: Anton Protopopov Date: Fri, 12 Apr 2024 16:11:00 +0200 Subject: bpf: Fix a verifier verbose message Long ago a map file descriptor in a pseudo ldimm64 instruction could only be present as an immediate value insn[0].imm, and thus this value was used in a verbose verifier message printed when the file descriptor wasn't valid. Since addition of BPF_PSEUDO_MAP_IDX_VALUE/BPF_PSEUDO_MAP_IDX the insn[0].imm field can also contain an index pointing to the file descriptor in the attr.fd_array array. However, if the file descriptor is invalid, the verifier still prints the verbose message containing value of insn[0].imm. Patch the verifier message to always print the actual file descriptor value. Fixes: 387544bfa291 ("bpf: Introduce fd_idx") Signed-off-by: Anton Protopopov Signed-off-by: Daniel Borkmann Acked-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20240412141100.3562942-1-aspsk@isovalent.com --- kernel/bpf/verifier.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 98188379d5c7..aa24beb65393 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -18289,8 +18289,7 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env) f = fdget(fd); map = __bpf_map_get(f); if (IS_ERR(map)) { - verbose(env, "fd %d is not pointing to valid bpf_map\n", - insn[0].imm); + verbose(env, "fd %d is not pointing to valid bpf_map\n", fd); return PTR_ERR(map); } -- cgit v1.2.3 From fc5eb4a84e4c063e75a6a6e92308e9533c0f19b5 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 15 Apr 2024 18:20:45 +0200 Subject: btf: Avoid weak external references If the BTF code is enabled in the build configuration, the start/stop BTF markers are guaranteed to exist. Only when CONFIG_DEBUG_INFO_BTF=n, the references in btf_parse_vmlinux() will remain unsatisfied, relying on the weak linkage of the external references to avoid breaking the build. Avoid GOT based relocations to these markers in the final executable by dropping the weak attribute and instead, make btf_parse_vmlinux() return ERR_PTR(-ENOENT) directly if CONFIG_DEBUG_INFO_BTF is not enabled to begin with. The compiler will drop any subsequent references to __start_BTF and __stop_BTF in that case, allowing the link to succeed. Note that Clang will notice that taking the address of __start_BTF can no longer yield NULL, so testing for that condition becomes unnecessary. Signed-off-by: Ard Biesheuvel Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Acked-by: Arnd Bergmann Acked-by: Jiri Olsa Link: https://lore.kernel.org/bpf/20240415162041.2491523-8-ardb+git@google.com --- kernel/bpf/btf.c | 7 +++++-- kernel/bpf/sysfs_btf.c | 6 +++--- 2 files changed, 8 insertions(+), 5 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 90c4a32d89ff..6d46cee47ae3 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -5642,8 +5642,8 @@ errout_free: return ERR_PTR(err); } -extern char __weak __start_BTF[]; -extern char __weak __stop_BTF[]; +extern char __start_BTF[]; +extern char __stop_BTF[]; extern struct btf *btf_vmlinux; #define BPF_MAP_TYPE(_id, _ops) @@ -5971,6 +5971,9 @@ struct btf *btf_parse_vmlinux(void) struct btf *btf = NULL; int err; + if (!IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) + return ERR_PTR(-ENOENT); + env = kzalloc(sizeof(*env), GFP_KERNEL | __GFP_NOWARN); if (!env) return ERR_PTR(-ENOMEM); diff --git a/kernel/bpf/sysfs_btf.c b/kernel/bpf/sysfs_btf.c index ef6911aee3bb..fedb54c94cdb 100644 --- a/kernel/bpf/sysfs_btf.c +++ b/kernel/bpf/sysfs_btf.c @@ -9,8 +9,8 @@ #include /* See scripts/link-vmlinux.sh, gen_btf() func for details */ -extern char __weak __start_BTF[]; -extern char __weak __stop_BTF[]; +extern char __start_BTF[]; +extern char __stop_BTF[]; static ssize_t btf_vmlinux_read(struct file *file, struct kobject *kobj, @@ -32,7 +32,7 @@ static int __init btf_vmlinux_init(void) { bin_attr_btf_vmlinux.size = __stop_BTF - __start_BTF; - if (!__start_BTF || bin_attr_btf_vmlinux.size == 0) + if (bin_attr_btf_vmlinux.size == 0) return 0; btf_kobj = kobject_create_and_add("btf", kernel_kobj); -- cgit v1.2.3 From 1f586614f3ffa80fdf2116b2a1bebcdb5969cef8 Mon Sep 17 00:00:00 2001 From: Harishankar Vishwanathan Date: Tue, 16 Apr 2024 07:53:02 -0400 Subject: bpf: Harden and/or/xor value tracking in verifier This patch addresses a latent unsoundness issue in the scalar(32)_min_max_and/or/xor functions. While it is not a bugfix, it ensures that the functions produce sound outputs for all inputs. The issue occurs in these functions when setting signed bounds. The following example illustrates the issue for scalar_min_max_and(), but it applies to the other functions. In scalar_min_max_and() the following clause is executed when ANDing positive numbers: /* ANDing two positives gives a positive, so safe to * cast result into s64. */ dst_reg->smin_value = dst_reg->umin_value; dst_reg->smax_value = dst_reg->umax_value; However, if umin_value and umax_value of dst_reg cross the sign boundary (i.e., if (s64)dst_reg->umin_value > (s64)dst_reg->umax_value), then we will end up with smin_value > smax_value, which is unsound. Previous works [1, 2] have discovered and reported this issue. Our tool Agni [2, 3] consideres it a false positive. This is because, during the verification of the abstract operator scalar_min_max_and(), Agni restricts its inputs to those passing through reg_bounds_sync(). This mimics real-world verifier behavior, as reg_bounds_sync() is invariably executed at the tail of every abstract operator. Therefore, such behavior is unlikely in an actual verifier execution. However, it is still unsound for an abstract operator to set signed bounds such that smin_value > smax_value. This patch fixes it, making the abstract operator sound for all (well-formed) inputs. It is worth noting that while the previous code updated the signed bounds (using the output unsigned bounds) only when the *input signed* bounds were positive, the new code updates them whenever the *output unsigned* bounds do not cross the sign boundary. An alternative approach to fix this latent unsoundness would be to unconditionally set the signed bounds to unbounded [S64_MIN, S64_MAX], and let reg_bounds_sync() refine the signed bounds using the unsigned bounds and the tnum. We found that our approach produces more precise (tighter) bounds. For example, consider these inputs to BPF_AND: /* dst_reg */ var_off.value: 8608032320201083347 var_off.mask: 615339716653692460 smin_value: 8070450532247928832 smax_value: 8070450532247928832 umin_value: 13206380674380886586 umax_value: 13206380674380886586 s32_min_value: -2110561598 s32_max_value: -133438816 u32_min_value: 4135055354 u32_max_value: 4135055354 /* src_reg */ var_off.value: 8584102546103074815 var_off.mask: 9862641527606476800 smin_value: 2920655011908158522 smax_value: 7495731535348625717 umin_value: 7001104867969363969 umax_value: 8584102543730304042 s32_min_value: -2097116671 s32_max_value: 71704632 u32_min_value: 1047457619 u32_max_value: 4268683090 After going through tnum_and() -> scalar32_min_max_and() -> scalar_min_max_and() -> reg_bounds_sync(), our patch produces the following bounds for s32: s32_min_value: -1263875629 s32_max_value: -159911942 Whereas, setting the signed bounds to unbounded in scalar_min_max_and() produces: s32_min_value: -1263875629 s32_max_value: -1 As observed, our patch produces a tighter s32 bound. We also confirmed using Agni and SMT verification that our patch always produces signed bounds that are equal to or more precise than setting the signed bounds to unbounded in scalar_min_max_and(). [1] https://sanjit-bhat.github.io/assets/pdf/ebpf-verifier-range-analysis22.pdf [2] https://link.springer.com/chapter/10.1007/978-3-031-37709-9_12 [3] https://github.com/bpfverif/agni Co-developed-by: Matan Shachnai Signed-off-by: Matan Shachnai Co-developed-by: Srinivas Narayana Signed-off-by: Srinivas Narayana Co-developed-by: Santosh Nagarakatte Signed-off-by: Santosh Nagarakatte Signed-off-by: Harishankar Vishwanathan Signed-off-by: Daniel Borkmann Acked-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20240402212039.51815-1-harishankar.vishwanathan@gmail.com Link: https://lore.kernel.org/bpf/20240416115303.331688-1-harishankar.vishwanathan@gmail.com --- kernel/bpf/verifier.c | 94 ++++++++++++++++++++++----------------------------- 1 file changed, 40 insertions(+), 54 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 2aad6d90550f..68cfd6fc6ad4 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -13320,7 +13320,6 @@ static void scalar32_min_max_and(struct bpf_reg_state *dst_reg, bool src_known = tnum_subreg_is_const(src_reg->var_off); bool dst_known = tnum_subreg_is_const(dst_reg->var_off); struct tnum var32_off = tnum_subreg(dst_reg->var_off); - s32 smin_val = src_reg->s32_min_value; u32 umax_val = src_reg->u32_max_value; if (src_known && dst_known) { @@ -13333,18 +13332,16 @@ static void scalar32_min_max_and(struct bpf_reg_state *dst_reg, */ dst_reg->u32_min_value = var32_off.value; dst_reg->u32_max_value = min(dst_reg->u32_max_value, umax_val); - if (dst_reg->s32_min_value < 0 || smin_val < 0) { - /* Lose signed bounds when ANDing negative numbers, - * ain't nobody got time for that. - */ - dst_reg->s32_min_value = S32_MIN; - dst_reg->s32_max_value = S32_MAX; - } else { - /* ANDing two positives gives a positive, so safe to - * cast result into s64. - */ + + /* Safe to set s32 bounds by casting u32 result into s32 when u32 + * doesn't cross sign boundary. Otherwise set s32 bounds to unbounded. + */ + if ((s32)dst_reg->u32_min_value <= (s32)dst_reg->u32_max_value) { dst_reg->s32_min_value = dst_reg->u32_min_value; dst_reg->s32_max_value = dst_reg->u32_max_value; + } else { + dst_reg->s32_min_value = S32_MIN; + dst_reg->s32_max_value = S32_MAX; } } @@ -13353,7 +13350,6 @@ static void scalar_min_max_and(struct bpf_reg_state *dst_reg, { bool src_known = tnum_is_const(src_reg->var_off); bool dst_known = tnum_is_const(dst_reg->var_off); - s64 smin_val = src_reg->smin_value; u64 umax_val = src_reg->umax_value; if (src_known && dst_known) { @@ -13366,18 +13362,16 @@ static void scalar_min_max_and(struct bpf_reg_state *dst_reg, */ dst_reg->umin_value = dst_reg->var_off.value; dst_reg->umax_value = min(dst_reg->umax_value, umax_val); - if (dst_reg->smin_value < 0 || smin_val < 0) { - /* Lose signed bounds when ANDing negative numbers, - * ain't nobody got time for that. - */ - dst_reg->smin_value = S64_MIN; - dst_reg->smax_value = S64_MAX; - } else { - /* ANDing two positives gives a positive, so safe to - * cast result into s64. - */ + + /* Safe to set s64 bounds by casting u64 result into s64 when u64 + * doesn't cross sign boundary. Otherwise set s64 bounds to unbounded. + */ + if ((s64)dst_reg->umin_value <= (s64)dst_reg->umax_value) { dst_reg->smin_value = dst_reg->umin_value; dst_reg->smax_value = dst_reg->umax_value; + } else { + dst_reg->smin_value = S64_MIN; + dst_reg->smax_value = S64_MAX; } /* We may learn something more from the var_off */ __update_reg_bounds(dst_reg); @@ -13389,7 +13383,6 @@ static void scalar32_min_max_or(struct bpf_reg_state *dst_reg, bool src_known = tnum_subreg_is_const(src_reg->var_off); bool dst_known = tnum_subreg_is_const(dst_reg->var_off); struct tnum var32_off = tnum_subreg(dst_reg->var_off); - s32 smin_val = src_reg->s32_min_value; u32 umin_val = src_reg->u32_min_value; if (src_known && dst_known) { @@ -13402,18 +13395,16 @@ static void scalar32_min_max_or(struct bpf_reg_state *dst_reg, */ dst_reg->u32_min_value = max(dst_reg->u32_min_value, umin_val); dst_reg->u32_max_value = var32_off.value | var32_off.mask; - if (dst_reg->s32_min_value < 0 || smin_val < 0) { - /* Lose signed bounds when ORing negative numbers, - * ain't nobody got time for that. - */ - dst_reg->s32_min_value = S32_MIN; - dst_reg->s32_max_value = S32_MAX; - } else { - /* ORing two positives gives a positive, so safe to - * cast result into s64. - */ + + /* Safe to set s32 bounds by casting u32 result into s32 when u32 + * doesn't cross sign boundary. Otherwise set s32 bounds to unbounded. + */ + if ((s32)dst_reg->u32_min_value <= (s32)dst_reg->u32_max_value) { dst_reg->s32_min_value = dst_reg->u32_min_value; dst_reg->s32_max_value = dst_reg->u32_max_value; + } else { + dst_reg->s32_min_value = S32_MIN; + dst_reg->s32_max_value = S32_MAX; } } @@ -13422,7 +13413,6 @@ static void scalar_min_max_or(struct bpf_reg_state *dst_reg, { bool src_known = tnum_is_const(src_reg->var_off); bool dst_known = tnum_is_const(dst_reg->var_off); - s64 smin_val = src_reg->smin_value; u64 umin_val = src_reg->umin_value; if (src_known && dst_known) { @@ -13435,18 +13425,16 @@ static void scalar_min_max_or(struct bpf_reg_state *dst_reg, */ dst_reg->umin_value = max(dst_reg->umin_value, umin_val); dst_reg->umax_value = dst_reg->var_off.value | dst_reg->var_off.mask; - if (dst_reg->smin_value < 0 || smin_val < 0) { - /* Lose signed bounds when ORing negative numbers, - * ain't nobody got time for that. - */ - dst_reg->smin_value = S64_MIN; - dst_reg->smax_value = S64_MAX; - } else { - /* ORing two positives gives a positive, so safe to - * cast result into s64. - */ + + /* Safe to set s64 bounds by casting u64 result into s64 when u64 + * doesn't cross sign boundary. Otherwise set s64 bounds to unbounded. + */ + if ((s64)dst_reg->umin_value <= (s64)dst_reg->umax_value) { dst_reg->smin_value = dst_reg->umin_value; dst_reg->smax_value = dst_reg->umax_value; + } else { + dst_reg->smin_value = S64_MIN; + dst_reg->smax_value = S64_MAX; } /* We may learn something more from the var_off */ __update_reg_bounds(dst_reg); @@ -13458,7 +13446,6 @@ static void scalar32_min_max_xor(struct bpf_reg_state *dst_reg, bool src_known = tnum_subreg_is_const(src_reg->var_off); bool dst_known = tnum_subreg_is_const(dst_reg->var_off); struct tnum var32_off = tnum_subreg(dst_reg->var_off); - s32 smin_val = src_reg->s32_min_value; if (src_known && dst_known) { __mark_reg32_known(dst_reg, var32_off.value); @@ -13469,10 +13456,10 @@ static void scalar32_min_max_xor(struct bpf_reg_state *dst_reg, dst_reg->u32_min_value = var32_off.value; dst_reg->u32_max_value = var32_off.value | var32_off.mask; - if (dst_reg->s32_min_value >= 0 && smin_val >= 0) { - /* XORing two positive sign numbers gives a positive, - * so safe to cast u32 result into s32. - */ + /* Safe to set s32 bounds by casting u32 result into s32 when u32 + * doesn't cross sign boundary. Otherwise set s32 bounds to unbounded. + */ + if ((s32)dst_reg->u32_min_value <= (s32)dst_reg->u32_max_value) { dst_reg->s32_min_value = dst_reg->u32_min_value; dst_reg->s32_max_value = dst_reg->u32_max_value; } else { @@ -13486,7 +13473,6 @@ static void scalar_min_max_xor(struct bpf_reg_state *dst_reg, { bool src_known = tnum_is_const(src_reg->var_off); bool dst_known = tnum_is_const(dst_reg->var_off); - s64 smin_val = src_reg->smin_value; if (src_known && dst_known) { /* dst_reg->var_off.value has been updated earlier */ @@ -13498,10 +13484,10 @@ static void scalar_min_max_xor(struct bpf_reg_state *dst_reg, dst_reg->umin_value = dst_reg->var_off.value; dst_reg->umax_value = dst_reg->var_off.value | dst_reg->var_off.mask; - if (dst_reg->smin_value >= 0 && smin_val >= 0) { - /* XORing two positive sign numbers gives a positive, - * so safe to cast u64 result into s64. - */ + /* Safe to set s64 bounds by casting u64 result into s64 when u64 + * doesn't cross sign boundary. Otherwise set s64 bounds to unbounded. + */ + if ((s64)dst_reg->umin_value <= (s64)dst_reg->umax_value) { dst_reg->smin_value = dst_reg->umin_value; dst_reg->smax_value = dst_reg->umax_value; } else { -- cgit v1.2.3 From e1a7545981e2086feaa40dcb7b0db8de0bdada19 Mon Sep 17 00:00:00 2001 From: Rafael Passos Date: Wed, 17 Apr 2024 14:52:26 -0300 Subject: bpf: Fix typo in function save_aux_ptr_type I found this typo in the save_aux_ptr_type function. s/allow_trust_missmatch/allow_trust_mismatch/ I did not find this anywhere else in the codebase. Signed-off-by: Rafael Passos Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/fbe1d636-8172-4698-9a5a-5a3444b55322@smtp-relay.sendinblue.com --- kernel/bpf/verifier.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 68cfd6fc6ad4..7b2aa065e552 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6971,7 +6971,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn } static int save_aux_ptr_type(struct bpf_verifier_env *env, enum bpf_reg_type type, - bool allow_trust_missmatch); + bool allow_trust_mismatch); static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_insn *insn) { @@ -17530,7 +17530,7 @@ static bool reg_type_mismatch(enum bpf_reg_type src, enum bpf_reg_type prev) } static int save_aux_ptr_type(struct bpf_verifier_env *env, enum bpf_reg_type type, - bool allow_trust_missmatch) + bool allow_trust_mismatch) { enum bpf_reg_type *prev_type = &env->insn_aux_data[env->insn_idx].ptr_type; @@ -17548,7 +17548,7 @@ static int save_aux_ptr_type(struct bpf_verifier_env *env, enum bpf_reg_type typ * src_reg == stack|map in some other branch. * Reject it. */ - if (allow_trust_missmatch && + if (allow_trust_mismatch && base_type(type) == PTR_TO_BTF_ID && base_type(*prev_type) == PTR_TO_BTF_ID) { /* -- cgit v1.2.3 From a7de265cb2d849f8986a197499ad58dca0a4f209 Mon Sep 17 00:00:00 2001 From: Rafael Passos Date: Wed, 17 Apr 2024 15:49:14 -0300 Subject: bpf: Fix typos in comments Found the following typos in comments, and fixed them: s/unpriviledged/unprivileged/ s/reponsible/responsible/ s/possiblities/possibilities/ s/Divison/Division/ s/precsion/precision/ s/havea/have a/ s/reponsible/responsible/ s/responsibile/responsible/ s/tigher/tighter/ s/respecitve/respective/ Signed-off-by: Rafael Passos Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/6af7deb4-bb24-49e8-b3f1-8dd410597337@smtp-relay.sendinblue.com --- kernel/bpf/bpf_local_storage.c | 2 +- kernel/bpf/core.c | 2 +- kernel/bpf/hashtab.c | 2 +- kernel/bpf/helpers.c | 2 +- kernel/bpf/verifier.c | 12 ++++++------ 5 files changed, 10 insertions(+), 10 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index bdea1a459153..976cb258a0ed 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -318,7 +318,7 @@ static bool check_storage_bpf_ma(struct bpf_local_storage *local_storage, * * If the local_storage->list is already empty, the caller will not * care about the bpf_ma value also because the caller is not - * responsibile to free the local_storage. + * responsible to free the local_storage. */ if (storage_smap) diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index a41718eaeefe..95c7fd093e55 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2814,7 +2814,7 @@ void bpf_prog_free(struct bpf_prog *fp) } EXPORT_SYMBOL_GPL(bpf_prog_free); -/* RNG for unpriviledged user space with separated state from prandom_u32(). */ +/* RNG for unprivileged user space with separated state from prandom_u32(). */ static DEFINE_PER_CPU(struct rnd_state, bpf_user_rnd_state); void bpf_user_rnd_init_once(void) diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 83a9a74260e9..c3e79a0b9361 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -1539,7 +1539,7 @@ static void htab_map_free(struct bpf_map *map) */ /* htab no longer uses call_rcu() directly. bpf_mem_alloc does it - * underneath and is reponsible for waiting for callbacks to finish + * underneath and is responsible for waiting for callbacks to finish * during bpf_mem_alloc_destroy(). */ if (!htab_is_prealloc(htab)) { diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 8cde717137bd..61126180d398 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -2412,7 +2412,7 @@ __bpf_kfunc void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr_kern *ptr, u32 o /* bpf_dynptr_slice_rdwr is the same logic as bpf_dynptr_slice. * * For skb-type dynptrs, it is safe to write into the returned pointer - * if the bpf program allows skb data writes. There are two possiblities + * if the bpf program allows skb data writes. There are two possibilities * that may occur when calling bpf_dynptr_slice_rdwr: * * 1) The requested slice is in the head of the skb. In this case, the diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 7b2aa065e552..5a7e34e83a5b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -172,7 +172,7 @@ static bool bpf_global_percpu_ma_set; /* verifier_state + insn_idx are pushed to stack when branch is encountered */ struct bpf_verifier_stack_elem { - /* verifer state is 'st' + /* verifier state is 'st' * before processing instruction 'insn_idx' * and after processing instruction 'prev_insn_idx' */ @@ -2131,7 +2131,7 @@ static void __reg64_deduce_bounds(struct bpf_reg_state *reg) static void __reg_deduce_mixed_bounds(struct bpf_reg_state *reg) { /* Try to tighten 64-bit bounds from 32-bit knowledge, using 32-bit - * values on both sides of 64-bit range in hope to have tigher range. + * values on both sides of 64-bit range in hope to have tighter range. * E.g., if r1 is [0x1'00000000, 0x3'80000000], and we learn from * 32-bit signed > 0 operation that s32 bounds are now [1; 0x7fffffff]. * With this, we can substitute 1 as low 32-bits of _low_ 64-bit bound @@ -2139,7 +2139,7 @@ static void __reg_deduce_mixed_bounds(struct bpf_reg_state *reg) * _high_ 64-bit bound (0x380000000 -> 0x37fffffff) and arrive at a * better overall bounds for r1 as [0x1'000000001; 0x3'7fffffff]. * We just need to make sure that derived bounds we are intersecting - * with are well-formed ranges in respecitve s64 or u64 domain, just + * with are well-formed ranges in respective s64 or u64 domain, just * like we do with similar kinds of 32-to-64 or 64-to-32 adjustments. */ __u64 new_umin, new_umax; @@ -14714,7 +14714,7 @@ static void regs_refine_cond_op(struct bpf_reg_state *reg1, struct bpf_reg_state /* Adjusts the register min/max values in the case that the dst_reg and * src_reg are both SCALAR_VALUE registers (or we are simply doing a BPF_K - * check, in which case we havea fake SCALAR_VALUE representing insn->imm). + * check, in which case we have a fake SCALAR_VALUE representing insn->imm). * Technically we can do similar adjustments for pointers to the same object, * but we don't support that right now. */ @@ -17352,7 +17352,7 @@ hit: err = propagate_liveness(env, &sl->state, cur); /* if previous state reached the exit with precision and - * current state is equivalent to it (except precsion marks) + * current state is equivalent to it (except precision marks) * the precision needs to be propagated back in * the current state. */ @@ -20209,7 +20209,7 @@ patch_map_ops_generic: * divide-by-3 through multiplication, followed by further * division by 8 through 3-bit right shift. * Refer to book "Hacker's Delight, 2nd ed." by Henry S. Warren, Jr., - * p. 227, chapter "Unsigned Divison by 3" for details and proofs. + * p. 227, chapter "Unsigned Division by 3" for details and proofs. * * N / 3 <=> M * N / 2^33, where M = (2^33 + 1) / 3 = 0xaaaaaaab. */ -- cgit v1.2.3 From be2749beff62e0d63cf97fe63cabc79a68443139 Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Sat, 20 Apr 2024 11:09:01 +0200 Subject: bpf: make timer data struct more generic To be able to add workqueues and reuse most of the timer code, we need to make bpf_hrtimer more generic. There is no code change except that the new struct gets a new u64 flags attribute. We are still below 2 cache lines, so this shouldn't impact the current running codes. The ordering is also changed. Everything related to async callback is now on top of bpf_hrtimer. Signed-off-by: Benjamin Tissoires Link: https://lore.kernel.org/r/20240420-bpf_wq-v2-1-6c986a5a741f@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 71 ++++++++++++++++++++++++++++------------------------ 1 file changed, 38 insertions(+), 33 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 61126180d398..7c7f31dd87ba 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1079,11 +1079,20 @@ const struct bpf_func_proto bpf_snprintf_proto = { .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; +struct bpf_async_cb { + struct bpf_map *map; + struct bpf_prog *prog; + void __rcu *callback_fn; + void *value; + struct rcu_head rcu; + u64 flags; +}; + /* BPF map elements can contain 'struct bpf_timer'. * Such map owns all of its BPF timers. * 'struct bpf_timer' is allocated as part of map element allocation * and it's zero initialized. - * That space is used to keep 'struct bpf_timer_kern'. + * That space is used to keep 'struct bpf_async_kern'. * bpf_timer_init() allocates 'struct bpf_hrtimer', inits hrtimer, and * remembers 'struct bpf_map *' pointer it's part of. * bpf_timer_set_callback() increments prog refcnt and assign bpf callback_fn. @@ -1096,16 +1105,12 @@ const struct bpf_func_proto bpf_snprintf_proto = { * freeing the timers when inner map is replaced or deleted by user space. */ struct bpf_hrtimer { + struct bpf_async_cb cb; struct hrtimer timer; - struct bpf_map *map; - struct bpf_prog *prog; - void __rcu *callback_fn; - void *value; - struct rcu_head rcu; }; /* the actual struct hidden inside uapi struct bpf_timer */ -struct bpf_timer_kern { +struct bpf_async_kern { struct bpf_hrtimer *timer; /* bpf_spin_lock is used here instead of spinlock_t to make * sure that it always fits into space reserved by struct bpf_timer @@ -1119,14 +1124,14 @@ static DEFINE_PER_CPU(struct bpf_hrtimer *, hrtimer_running); static enum hrtimer_restart bpf_timer_cb(struct hrtimer *hrtimer) { struct bpf_hrtimer *t = container_of(hrtimer, struct bpf_hrtimer, timer); - struct bpf_map *map = t->map; - void *value = t->value; + struct bpf_map *map = t->cb.map; + void *value = t->cb.value; bpf_callback_t callback_fn; void *key; u32 idx; BTF_TYPE_EMIT(struct bpf_timer); - callback_fn = rcu_dereference_check(t->callback_fn, rcu_read_lock_bh_held()); + callback_fn = rcu_dereference_check(t->cb.callback_fn, rcu_read_lock_bh_held()); if (!callback_fn) goto out; @@ -1155,7 +1160,7 @@ out: return HRTIMER_NORESTART; } -BPF_CALL_3(bpf_timer_init, struct bpf_timer_kern *, timer, struct bpf_map *, map, +BPF_CALL_3(bpf_timer_init, struct bpf_async_kern *, timer, struct bpf_map *, map, u64, flags) { clockid_t clockid = flags & (MAX_CLOCKS - 1); @@ -1163,8 +1168,8 @@ BPF_CALL_3(bpf_timer_init, struct bpf_timer_kern *, timer, struct bpf_map *, map int ret = 0; BUILD_BUG_ON(MAX_CLOCKS != 16); - BUILD_BUG_ON(sizeof(struct bpf_timer_kern) > sizeof(struct bpf_timer)); - BUILD_BUG_ON(__alignof__(struct bpf_timer_kern) != __alignof__(struct bpf_timer)); + BUILD_BUG_ON(sizeof(struct bpf_async_kern) > sizeof(struct bpf_timer)); + BUILD_BUG_ON(__alignof__(struct bpf_async_kern) != __alignof__(struct bpf_timer)); if (in_nmi()) return -EOPNOTSUPP; @@ -1187,10 +1192,10 @@ BPF_CALL_3(bpf_timer_init, struct bpf_timer_kern *, timer, struct bpf_map *, map ret = -ENOMEM; goto out; } - t->value = (void *)timer - map->record->timer_off; - t->map = map; - t->prog = NULL; - rcu_assign_pointer(t->callback_fn, NULL); + t->cb.value = (void *)timer - map->record->timer_off; + t->cb.map = map; + t->cb.prog = NULL; + rcu_assign_pointer(t->cb.callback_fn, NULL); hrtimer_init(&t->timer, clockid, HRTIMER_MODE_REL_SOFT); t->timer.function = bpf_timer_cb; WRITE_ONCE(timer->timer, t); @@ -1222,7 +1227,7 @@ static const struct bpf_func_proto bpf_timer_init_proto = { .arg3_type = ARG_ANYTHING, }; -BPF_CALL_3(bpf_timer_set_callback, struct bpf_timer_kern *, timer, void *, callback_fn, +BPF_CALL_3(bpf_timer_set_callback, struct bpf_async_kern *, timer, void *, callback_fn, struct bpf_prog_aux *, aux) { struct bpf_prog *prev, *prog = aux->prog; @@ -1237,7 +1242,7 @@ BPF_CALL_3(bpf_timer_set_callback, struct bpf_timer_kern *, timer, void *, callb ret = -EINVAL; goto out; } - if (!atomic64_read(&t->map->usercnt)) { + if (!atomic64_read(&t->cb.map->usercnt)) { /* maps with timers must be either held by user space * or pinned in bpffs. Otherwise timer might still be * running even when bpf prog is detached and user space @@ -1246,7 +1251,7 @@ BPF_CALL_3(bpf_timer_set_callback, struct bpf_timer_kern *, timer, void *, callb ret = -EPERM; goto out; } - prev = t->prog; + prev = t->cb.prog; if (prev != prog) { /* Bump prog refcnt once. Every bpf_timer_set_callback() * can pick different callback_fn-s within the same prog. @@ -1259,9 +1264,9 @@ BPF_CALL_3(bpf_timer_set_callback, struct bpf_timer_kern *, timer, void *, callb if (prev) /* Drop prev prog refcnt when swapping with new prog */ bpf_prog_put(prev); - t->prog = prog; + t->cb.prog = prog; } - rcu_assign_pointer(t->callback_fn, callback_fn); + rcu_assign_pointer(t->cb.callback_fn, callback_fn); out: __bpf_spin_unlock_irqrestore(&timer->lock); return ret; @@ -1275,7 +1280,7 @@ static const struct bpf_func_proto bpf_timer_set_callback_proto = { .arg2_type = ARG_PTR_TO_FUNC, }; -BPF_CALL_3(bpf_timer_start, struct bpf_timer_kern *, timer, u64, nsecs, u64, flags) +BPF_CALL_3(bpf_timer_start, struct bpf_async_kern *, timer, u64, nsecs, u64, flags) { struct bpf_hrtimer *t; int ret = 0; @@ -1287,7 +1292,7 @@ BPF_CALL_3(bpf_timer_start, struct bpf_timer_kern *, timer, u64, nsecs, u64, fla return -EINVAL; __bpf_spin_lock_irqsave(&timer->lock); t = timer->timer; - if (!t || !t->prog) { + if (!t || !t->cb.prog) { ret = -EINVAL; goto out; } @@ -1315,18 +1320,18 @@ static const struct bpf_func_proto bpf_timer_start_proto = { .arg3_type = ARG_ANYTHING, }; -static void drop_prog_refcnt(struct bpf_hrtimer *t) +static void drop_prog_refcnt(struct bpf_async_cb *async) { - struct bpf_prog *prog = t->prog; + struct bpf_prog *prog = async->prog; if (prog) { bpf_prog_put(prog); - t->prog = NULL; - rcu_assign_pointer(t->callback_fn, NULL); + async->prog = NULL; + rcu_assign_pointer(async->callback_fn, NULL); } } -BPF_CALL_1(bpf_timer_cancel, struct bpf_timer_kern *, timer) +BPF_CALL_1(bpf_timer_cancel, struct bpf_async_kern *, timer) { struct bpf_hrtimer *t; int ret = 0; @@ -1348,7 +1353,7 @@ BPF_CALL_1(bpf_timer_cancel, struct bpf_timer_kern *, timer) ret = -EDEADLK; goto out; } - drop_prog_refcnt(t); + drop_prog_refcnt(&t->cb); out: __bpf_spin_unlock_irqrestore(&timer->lock); /* Cancel the timer and wait for associated callback to finish @@ -1371,7 +1376,7 @@ static const struct bpf_func_proto bpf_timer_cancel_proto = { */ void bpf_timer_cancel_and_free(void *val) { - struct bpf_timer_kern *timer = val; + struct bpf_async_kern *timer = val; struct bpf_hrtimer *t; /* Performance optimization: read timer->timer without lock first. */ @@ -1383,7 +1388,7 @@ void bpf_timer_cancel_and_free(void *val) t = timer->timer; if (!t) goto out; - drop_prog_refcnt(t); + drop_prog_refcnt(&t->cb); /* The subsequent bpf_timer_start/cancel() helpers won't be able to use * this timer, since it won't be initialized. */ @@ -1410,7 +1415,7 @@ out: */ if (this_cpu_read(hrtimer_running) != t) hrtimer_cancel(&t->timer); - kfree_rcu(t, rcu); + kfree_rcu(t, cb.rcu); } BPF_CALL_2(bpf_kptr_xchg, void *, map_value, void *, ptr) -- cgit v1.2.3 From 56b4a177ae6322173360a93ea828ad18570a5a14 Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Sat, 20 Apr 2024 11:09:02 +0200 Subject: bpf: replace bpf_timer_init with a generic helper No code change except for the new flags argument being stored in the local data struct. Signed-off-by: Benjamin Tissoires Link: https://lore.kernel.org/r/20240420-bpf_wq-v2-2-6c986a5a741f@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 91 ++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 63 insertions(+), 28 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 7c7f31dd87ba..0f0201171576 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1111,7 +1111,10 @@ struct bpf_hrtimer { /* the actual struct hidden inside uapi struct bpf_timer */ struct bpf_async_kern { - struct bpf_hrtimer *timer; + union { + struct bpf_async_cb *cb; + struct bpf_hrtimer *timer; + }; /* bpf_spin_lock is used here instead of spinlock_t to make * sure that it always fits into space reserved by struct bpf_timer * regardless of LOCKDEP and spinlock debug flags. @@ -1119,6 +1122,10 @@ struct bpf_async_kern { struct bpf_spin_lock lock; } __attribute__((aligned(8))); +enum bpf_async_type { + BPF_ASYNC_TYPE_TIMER = 0, +}; + static DEFINE_PER_CPU(struct bpf_hrtimer *, hrtimer_running); static enum hrtimer_restart bpf_timer_cb(struct hrtimer *hrtimer) @@ -1160,46 +1167,55 @@ out: return HRTIMER_NORESTART; } -BPF_CALL_3(bpf_timer_init, struct bpf_async_kern *, timer, struct bpf_map *, map, - u64, flags) +static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u64 flags, + enum bpf_async_type type) { - clockid_t clockid = flags & (MAX_CLOCKS - 1); + struct bpf_async_cb *cb; struct bpf_hrtimer *t; + clockid_t clockid; + size_t size; int ret = 0; - BUILD_BUG_ON(MAX_CLOCKS != 16); - BUILD_BUG_ON(sizeof(struct bpf_async_kern) > sizeof(struct bpf_timer)); - BUILD_BUG_ON(__alignof__(struct bpf_async_kern) != __alignof__(struct bpf_timer)); - if (in_nmi()) return -EOPNOTSUPP; - if (flags >= MAX_CLOCKS || - /* similar to timerfd except _ALARM variants are not supported */ - (clockid != CLOCK_MONOTONIC && - clockid != CLOCK_REALTIME && - clockid != CLOCK_BOOTTIME)) + switch (type) { + case BPF_ASYNC_TYPE_TIMER: + size = sizeof(struct bpf_hrtimer); + break; + default: return -EINVAL; - __bpf_spin_lock_irqsave(&timer->lock); - t = timer->timer; + } + + __bpf_spin_lock_irqsave(&async->lock); + t = async->timer; if (t) { ret = -EBUSY; goto out; } + /* allocate hrtimer via map_kmalloc to use memcg accounting */ - t = bpf_map_kmalloc_node(map, sizeof(*t), GFP_ATOMIC, map->numa_node); - if (!t) { + cb = bpf_map_kmalloc_node(map, size, GFP_ATOMIC, map->numa_node); + if (!cb) { ret = -ENOMEM; goto out; } - t->cb.value = (void *)timer - map->record->timer_off; - t->cb.map = map; - t->cb.prog = NULL; - rcu_assign_pointer(t->cb.callback_fn, NULL); - hrtimer_init(&t->timer, clockid, HRTIMER_MODE_REL_SOFT); - t->timer.function = bpf_timer_cb; - WRITE_ONCE(timer->timer, t); - /* Guarantee the order between timer->timer and map->usercnt. So + + if (type == BPF_ASYNC_TYPE_TIMER) { + clockid = flags & (MAX_CLOCKS - 1); + t = (struct bpf_hrtimer *)cb; + + hrtimer_init(&t->timer, clockid, HRTIMER_MODE_REL_SOFT); + t->timer.function = bpf_timer_cb; + cb->value = (void *)async - map->record->timer_off; + } + cb->map = map; + cb->prog = NULL; + cb->flags = flags; + rcu_assign_pointer(cb->callback_fn, NULL); + + WRITE_ONCE(async->cb, cb); + /* Guarantee the order between async->cb and map->usercnt. So * when there are concurrent uref release and bpf timer init, either * bpf_timer_cancel_and_free() called by uref release reads a no-NULL * timer or atomic64_read() below returns a zero usercnt. @@ -1209,15 +1225,34 @@ BPF_CALL_3(bpf_timer_init, struct bpf_async_kern *, timer, struct bpf_map *, map /* maps with timers must be either held by user space * or pinned in bpffs. */ - WRITE_ONCE(timer->timer, NULL); - kfree(t); + WRITE_ONCE(async->cb, NULL); + kfree(cb); ret = -EPERM; } out: - __bpf_spin_unlock_irqrestore(&timer->lock); + __bpf_spin_unlock_irqrestore(&async->lock); return ret; } +BPF_CALL_3(bpf_timer_init, struct bpf_async_kern *, timer, struct bpf_map *, map, + u64, flags) +{ + clock_t clockid = flags & (MAX_CLOCKS - 1); + + BUILD_BUG_ON(MAX_CLOCKS != 16); + BUILD_BUG_ON(sizeof(struct bpf_async_kern) > sizeof(struct bpf_timer)); + BUILD_BUG_ON(__alignof__(struct bpf_async_kern) != __alignof__(struct bpf_timer)); + + if (flags >= MAX_CLOCKS || + /* similar to timerfd except _ALARM variants are not supported */ + (clockid != CLOCK_MONOTONIC && + clockid != CLOCK_REALTIME && + clockid != CLOCK_BOOTTIME)) + return -EINVAL; + + return __bpf_async_init(timer, map, flags, BPF_ASYNC_TYPE_TIMER); +} + static const struct bpf_func_proto bpf_timer_init_proto = { .func = bpf_timer_init, .gpl_only = true, -- cgit v1.2.3 From 073f11b0264310b85754b6a0946afee753790c66 Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Sat, 20 Apr 2024 11:09:03 +0200 Subject: bpf: replace bpf_timer_set_callback with a generic helper In the same way we have a generic __bpf_async_init(), we also need to share code between timer and workqueue for the set_callback call. We just add an unused flags parameter, as it will be used for workqueues. Signed-off-by: Benjamin Tissoires Link: https://lore.kernel.org/r/20240420-bpf_wq-v2-3-6c986a5a741f@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 0f0201171576..1cbbf28e9398 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1262,22 +1262,23 @@ static const struct bpf_func_proto bpf_timer_init_proto = { .arg3_type = ARG_ANYTHING, }; -BPF_CALL_3(bpf_timer_set_callback, struct bpf_async_kern *, timer, void *, callback_fn, - struct bpf_prog_aux *, aux) +static int __bpf_async_set_callback(struct bpf_async_kern *async, void *callback_fn, + struct bpf_prog_aux *aux, unsigned int flags, + enum bpf_async_type type) { struct bpf_prog *prev, *prog = aux->prog; - struct bpf_hrtimer *t; + struct bpf_async_cb *cb; int ret = 0; if (in_nmi()) return -EOPNOTSUPP; - __bpf_spin_lock_irqsave(&timer->lock); - t = timer->timer; - if (!t) { + __bpf_spin_lock_irqsave(&async->lock); + cb = async->cb; + if (!cb) { ret = -EINVAL; goto out; } - if (!atomic64_read(&t->cb.map->usercnt)) { + if (!atomic64_read(&cb->map->usercnt)) { /* maps with timers must be either held by user space * or pinned in bpffs. Otherwise timer might still be * running even when bpf prog is detached and user space @@ -1286,7 +1287,7 @@ BPF_CALL_3(bpf_timer_set_callback, struct bpf_async_kern *, timer, void *, callb ret = -EPERM; goto out; } - prev = t->cb.prog; + prev = cb->prog; if (prev != prog) { /* Bump prog refcnt once. Every bpf_timer_set_callback() * can pick different callback_fn-s within the same prog. @@ -1299,14 +1300,20 @@ BPF_CALL_3(bpf_timer_set_callback, struct bpf_async_kern *, timer, void *, callb if (prev) /* Drop prev prog refcnt when swapping with new prog */ bpf_prog_put(prev); - t->cb.prog = prog; + cb->prog = prog; } - rcu_assign_pointer(t->cb.callback_fn, callback_fn); + rcu_assign_pointer(cb->callback_fn, callback_fn); out: - __bpf_spin_unlock_irqrestore(&timer->lock); + __bpf_spin_unlock_irqrestore(&async->lock); return ret; } +BPF_CALL_3(bpf_timer_set_callback, struct bpf_async_kern *, timer, void *, callback_fn, + struct bpf_prog_aux *, aux) +{ + return __bpf_async_set_callback(timer, callback_fn, aux, 0, BPF_ASYNC_TYPE_TIMER); +} + static const struct bpf_func_proto bpf_timer_set_callback_proto = { .func = bpf_timer_set_callback, .gpl_only = true, -- cgit v1.2.3 From fc22d9495f0b32d75b5d25a17b300b7aad05c55d Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Sat, 20 Apr 2024 11:09:04 +0200 Subject: bpf: replace bpf_timer_cancel_and_free with a generic helper Same reason than most bpf_timer* functions, we need almost the same for workqueues. So extract the generic part out of it so bpf_wq_cancel_and_free can reuse it. Signed-off-by: Benjamin Tissoires Link: https://lore.kernel.org/r/20240420-bpf_wq-v2-4-6c986a5a741f@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 42 +++++++++++++++++++++++++----------------- 1 file changed, 25 insertions(+), 17 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 1cbbf28e9398..0b8ba6c81994 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1413,36 +1413,44 @@ static const struct bpf_func_proto bpf_timer_cancel_proto = { .arg1_type = ARG_PTR_TO_TIMER, }; -/* This function is called by map_delete/update_elem for individual element and - * by ops->map_release_uref when the user space reference to a map reaches zero. - */ -void bpf_timer_cancel_and_free(void *val) +static struct bpf_async_cb *__bpf_async_cancel_and_free(struct bpf_async_kern *async) { - struct bpf_async_kern *timer = val; - struct bpf_hrtimer *t; + struct bpf_async_cb *cb; - /* Performance optimization: read timer->timer without lock first. */ - if (!READ_ONCE(timer->timer)) - return; + /* Performance optimization: read async->cb without lock first. */ + if (!READ_ONCE(async->cb)) + return NULL; - __bpf_spin_lock_irqsave(&timer->lock); + __bpf_spin_lock_irqsave(&async->lock); /* re-read it under lock */ - t = timer->timer; - if (!t) + cb = async->cb; + if (!cb) goto out; - drop_prog_refcnt(&t->cb); + drop_prog_refcnt(cb); /* The subsequent bpf_timer_start/cancel() helpers won't be able to use * this timer, since it won't be initialized. */ - WRITE_ONCE(timer->timer, NULL); + WRITE_ONCE(async->cb, NULL); out: - __bpf_spin_unlock_irqrestore(&timer->lock); + __bpf_spin_unlock_irqrestore(&async->lock); + return cb; +} + +/* This function is called by map_delete/update_elem for individual element and + * by ops->map_release_uref when the user space reference to a map reaches zero. + */ +void bpf_timer_cancel_and_free(void *val) +{ + struct bpf_hrtimer *t; + + t = (struct bpf_hrtimer *)__bpf_async_cancel_and_free(val); + if (!t) return; /* Cancel the timer and wait for callback to complete if it was running. * If hrtimer_cancel() can be safely called it's safe to call kfree(t) * right after for both preallocated and non-preallocated maps. - * The timer->timer = NULL was already done and no code path can + * The async->cb = NULL was already done and no code path can * see address 't' anymore. * * Check that bpf_map_delete/update_elem() wasn't called from timer @@ -1451,7 +1459,7 @@ out: * return -1). Though callback_fn is still running on this cpu it's * safe to do kfree(t) because bpf_timer_cb() read everything it needed * from 't'. The bpf subprog callback_fn won't be able to access 't', - * since timer->timer = NULL was already done. The timer will be + * since async->cb = NULL was already done. The timer will be * effectively cancelled because bpf_timer_cb() will return * HRTIMER_NORESTART. */ -- cgit v1.2.3 From d56b63cf0c0f71e1b2e04dd8220b408f049e67ff Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Sat, 20 Apr 2024 11:09:05 +0200 Subject: bpf: add support for bpf_wq user type Mostly a copy/paste from the bpf_timer API, without the initialization and free, as they will be done in a separate patch. Signed-off-by: Benjamin Tissoires Link: https://lore.kernel.org/r/20240420-bpf_wq-v2-5-6c986a5a741f@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/btf.c | 17 +++++++++++++++++ kernel/bpf/syscall.c | 6 +++++- kernel/bpf/verifier.c | 9 +++++++++ 3 files changed, 31 insertions(+), 1 deletion(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 6d46cee47ae3..8291fbfd27b1 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3464,6 +3464,15 @@ static int btf_get_field_type(const char *name, u32 field_mask, u32 *seen_mask, goto end; } } + if (field_mask & BPF_WORKQUEUE) { + if (!strcmp(name, "bpf_wq")) { + if (*seen_mask & BPF_WORKQUEUE) + return -E2BIG; + *seen_mask |= BPF_WORKQUEUE; + type = BPF_WORKQUEUE; + goto end; + } + } field_mask_test_name(BPF_LIST_HEAD, "bpf_list_head"); field_mask_test_name(BPF_LIST_NODE, "bpf_list_node"); field_mask_test_name(BPF_RB_ROOT, "bpf_rb_root"); @@ -3515,6 +3524,7 @@ static int btf_find_struct_field(const struct btf *btf, switch (field_type) { case BPF_SPIN_LOCK: case BPF_TIMER: + case BPF_WORKQUEUE: case BPF_LIST_NODE: case BPF_RB_NODE: case BPF_REFCOUNT: @@ -3582,6 +3592,7 @@ static int btf_find_datasec_var(const struct btf *btf, const struct btf_type *t, switch (field_type) { case BPF_SPIN_LOCK: case BPF_TIMER: + case BPF_WORKQUEUE: case BPF_LIST_NODE: case BPF_RB_NODE: case BPF_REFCOUNT: @@ -3816,6 +3827,7 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type rec->spin_lock_off = -EINVAL; rec->timer_off = -EINVAL; + rec->wq_off = -EINVAL; rec->refcount_off = -EINVAL; for (i = 0; i < cnt; i++) { field_type_size = btf_field_type_size(info_arr[i].type); @@ -3846,6 +3858,11 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type /* Cache offset for faster lookup at runtime */ rec->timer_off = rec->fields[i].offset; break; + case BPF_WORKQUEUE: + WARN_ON_ONCE(rec->wq_off >= 0); + /* Cache offset for faster lookup at runtime */ + rec->wq_off = rec->fields[i].offset; + break; case BPF_REFCOUNT: WARN_ON_ONCE(rec->refcount_off >= 0); /* Cache offset for faster lookup at runtime */ diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 7d392ec83655..0848e4141b00 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -559,6 +559,7 @@ void btf_record_free(struct btf_record *rec) case BPF_SPIN_LOCK: case BPF_TIMER: case BPF_REFCOUNT: + case BPF_WORKQUEUE: /* Nothing to release */ break; default: @@ -608,6 +609,7 @@ struct btf_record *btf_record_dup(const struct btf_record *rec) case BPF_SPIN_LOCK: case BPF_TIMER: case BPF_REFCOUNT: + case BPF_WORKQUEUE: /* Nothing to acquire */ break; default: @@ -679,6 +681,8 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj) case BPF_TIMER: bpf_timer_cancel_and_free(field_ptr); break; + case BPF_WORKQUEUE: + break; case BPF_KPTR_UNREF: WRITE_ONCE(*(u64 *)field_ptr, 0); break; @@ -1085,7 +1089,7 @@ static int map_check_btf(struct bpf_map *map, struct bpf_token *token, map->record = btf_parse_fields(btf, value_type, BPF_SPIN_LOCK | BPF_TIMER | BPF_KPTR | BPF_LIST_HEAD | - BPF_RB_ROOT | BPF_REFCOUNT, + BPF_RB_ROOT | BPF_REFCOUNT | BPF_WORKQUEUE, map->value_size); if (!IS_ERR_OR_NULL(map->record)) { int i; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 5a7e34e83a5b..89490a95b120 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1838,6 +1838,8 @@ static void mark_ptr_not_null_reg(struct bpf_reg_state *reg) */ if (btf_record_has_field(map->inner_map_meta->record, BPF_TIMER)) reg->map_uid = reg->id; + if (btf_record_has_field(map->inner_map_meta->record, BPF_WORKQUEUE)) + reg->map_uid = reg->id; } else if (map->map_type == BPF_MAP_TYPE_XSKMAP) { reg->type = PTR_TO_XDP_SOCK; } else if (map->map_type == BPF_MAP_TYPE_SOCKMAP || @@ -18141,6 +18143,13 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, } } + if (btf_record_has_field(map->record, BPF_WORKQUEUE)) { + if (is_tracing_prog_type(prog_type)) { + verbose(env, "tracing progs cannot use bpf_wq yet\n"); + return -EINVAL; + } + } + if ((bpf_prog_is_offloaded(prog->aux) || bpf_map_is_offloaded(map)) && !bpf_offload_prog_map_match(prog, map)) { verbose(env, "offload device mismatch between prog and map\n"); -- cgit v1.2.3 From ad2c03e691be3268eefc75ff1d892db3f0e79f62 Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Sat, 20 Apr 2024 11:09:07 +0200 Subject: bpf: verifier: bail out if the argument is not a map When a kfunc is declared with a KF_ARG_PTR_TO_MAP, we should have reg->map_ptr set to a non NULL value, otherwise, that means that the underlying type is not a map. Signed-off-by: Benjamin Tissoires Link: https://lore.kernel.org/r/20240420-bpf_wq-v2-7-6c986a5a741f@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel/bpf') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 89490a95b120..4adf7fc33e5a 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -11720,6 +11720,11 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ case KF_ARG_PTR_TO_NULL: continue; case KF_ARG_PTR_TO_MAP: + if (!reg->map_ptr) { + verbose(env, "pointer in R%d isn't map pointer\n", regno); + return -EINVAL; + } + fallthrough; case KF_ARG_PTR_TO_ALLOC_BTF_ID: case KF_ARG_PTR_TO_BTF_ID: if (!is_kfunc_trusted_args(meta) && !is_kfunc_rcu(meta)) -- cgit v1.2.3 From d940c9b94d7e6d9cff288623e3e8bf5fdea98b79 Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Sat, 20 Apr 2024 11:09:08 +0200 Subject: bpf: add support for KF_ARG_PTR_TO_WORKQUEUE Introduce support for KF_ARG_PTR_TO_WORKQUEUE. The kfuncs will use bpf_wq as argument and that will be recognized as workqueue argument by verifier. bpf_wq_kern casting can happen inside kfunc, but using bpf_wq in argument makes life easier for users who work with non-kern type in BPF progs. Duplicate process_timer_func into process_wq_func. meta argument is only needed to ensure bpf_wq_init's workqueue and map arguments are coming from the same map (map_uid logic is necessary for correct inner-map handling), so also amend check_kfunc_args() to match what helpers functions check is doing. Signed-off-by: Benjamin Tissoires Link: https://lore.kernel.org/r/20240420-bpf_wq-v2-8-6c986a5a741f@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 65 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 65 insertions(+) (limited to 'kernel/bpf') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 4adf7fc33e5a..cc20fda907fb 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -332,6 +332,10 @@ struct bpf_kfunc_call_arg_meta { u8 spi; u8 frameno; } iter; + struct { + struct bpf_map *ptr; + int uid; + } map; u64 mem_size; }; @@ -7598,6 +7602,23 @@ static int process_timer_func(struct bpf_verifier_env *env, int regno, return 0; } +static int process_wq_func(struct bpf_verifier_env *env, int regno, + struct bpf_kfunc_call_arg_meta *meta) +{ + struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; + struct bpf_map *map = reg->map_ptr; + u64 val = reg->var_off.value; + + if (map->record->wq_off != val + reg->off) { + verbose(env, "off %lld doesn't point to 'struct bpf_wq' that is at %d\n", + val + reg->off, map->record->wq_off); + return -EINVAL; + } + meta->map.uid = reg->map_uid; + meta->map.ptr = map; + return 0; +} + static int process_kptr_func(struct bpf_verifier_env *env, int regno, struct bpf_call_arg_meta *meta) { @@ -10843,6 +10864,7 @@ enum { KF_ARG_LIST_NODE_ID, KF_ARG_RB_ROOT_ID, KF_ARG_RB_NODE_ID, + KF_ARG_WORKQUEUE_ID, }; BTF_ID_LIST(kf_arg_btf_ids) @@ -10851,6 +10873,7 @@ BTF_ID(struct, bpf_list_head) BTF_ID(struct, bpf_list_node) BTF_ID(struct, bpf_rb_root) BTF_ID(struct, bpf_rb_node) +BTF_ID(struct, bpf_wq) static bool __is_kfunc_ptr_arg_type(const struct btf *btf, const struct btf_param *arg, int type) @@ -10894,6 +10917,11 @@ static bool is_kfunc_arg_rbtree_node(const struct btf *btf, const struct btf_par return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_RB_NODE_ID); } +static bool is_kfunc_arg_wq(const struct btf *btf, const struct btf_param *arg) +{ + return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_WORKQUEUE_ID); +} + static bool is_kfunc_arg_callback(struct bpf_verifier_env *env, const struct btf *btf, const struct btf_param *arg) { @@ -10963,6 +10991,7 @@ enum kfunc_ptr_arg_type { KF_ARG_PTR_TO_NULL, KF_ARG_PTR_TO_CONST_STR, KF_ARG_PTR_TO_MAP, + KF_ARG_PTR_TO_WORKQUEUE, }; enum special_kfunc_type { @@ -11119,6 +11148,9 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env, if (is_kfunc_arg_map(meta->btf, &args[argno])) return KF_ARG_PTR_TO_MAP; + if (is_kfunc_arg_wq(meta->btf, &args[argno])) + return KF_ARG_PTR_TO_WORKQUEUE; + if ((base_type(reg->type) == PTR_TO_BTF_ID || reg2btf_ids[base_type(reg->type)])) { if (!btf_type_is_struct(ref_t)) { verbose(env, "kernel function %s args#%d pointer type %s %s is not supported\n", @@ -11724,6 +11756,29 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ verbose(env, "pointer in R%d isn't map pointer\n", regno); return -EINVAL; } + if (meta->map.ptr && reg->map_ptr->record->wq_off >= 0) { + /* Use map_uid (which is unique id of inner map) to reject: + * inner_map1 = bpf_map_lookup_elem(outer_map, key1) + * inner_map2 = bpf_map_lookup_elem(outer_map, key2) + * if (inner_map1 && inner_map2) { + * wq = bpf_map_lookup_elem(inner_map1); + * if (wq) + * // mismatch would have been allowed + * bpf_wq_init(wq, inner_map2); + * } + * + * Comparing map_ptr is enough to distinguish normal and outer maps. + */ + if (meta->map.ptr != reg->map_ptr || + meta->map.uid != reg->map_uid) { + verbose(env, + "workqueue pointer in R1 map_uid=%d doesn't match map pointer in R2 map_uid=%d\n", + meta->map.uid, reg->map_uid); + return -EINVAL; + } + } + meta->map.ptr = reg->map_ptr; + meta->map.uid = reg->map_uid; fallthrough; case KF_ARG_PTR_TO_ALLOC_BTF_ID: case KF_ARG_PTR_TO_BTF_ID: @@ -11757,6 +11812,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ case KF_ARG_PTR_TO_CALLBACK: case KF_ARG_PTR_TO_REFCOUNTED_KPTR: case KF_ARG_PTR_TO_CONST_STR: + case KF_ARG_PTR_TO_WORKQUEUE: /* Trusted by default */ break; default: @@ -12043,6 +12099,15 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ if (ret) return ret; break; + case KF_ARG_PTR_TO_WORKQUEUE: + if (reg->type != PTR_TO_MAP_VALUE) { + verbose(env, "arg#%d doesn't point to a map value\n", i); + return -EINVAL; + } + ret = process_wq_func(env, regno, meta); + if (ret < 0) + return ret; + break; } } -- cgit v1.2.3 From 246331e3f1eac905170a923f0ec76725c2558232 Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Sat, 20 Apr 2024 11:09:09 +0200 Subject: bpf: allow struct bpf_wq to be embedded in arraymaps and hashmaps Currently bpf_wq_cancel_and_free() is just a placeholder as there is no memory allocation for bpf_wq just yet. Again, duplication of the bpf_timer approach Signed-off-by: Benjamin Tissoires Link: https://lore.kernel.org/r/20240420-bpf_wq-v2-9-6c986a5a741f@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/arraymap.c | 18 ++++++++++------- kernel/bpf/hashtab.c | 55 ++++++++++++++++++++++++++++++++++++++++----------- kernel/bpf/helpers.c | 8 ++++++++ kernel/bpf/syscall.c | 9 +++++++++ 4 files changed, 71 insertions(+), 19 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 8c1e6d7654bb..580d07b15471 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -428,17 +428,21 @@ static void *array_map_vmalloc_addr(struct bpf_array *array) return (void *)round_down((unsigned long)array, PAGE_SIZE); } -static void array_map_free_timers(struct bpf_map *map) +static void array_map_free_timers_wq(struct bpf_map *map) { struct bpf_array *array = container_of(map, struct bpf_array, map); int i; - /* We don't reset or free fields other than timer on uref dropping to zero. */ - if (!btf_record_has_field(map->record, BPF_TIMER)) - return; + /* We don't reset or free fields other than timer and workqueue + * on uref dropping to zero. + */ + if (btf_record_has_field(map->record, BPF_TIMER)) + for (i = 0; i < array->map.max_entries; i++) + bpf_obj_free_timer(map->record, array_map_elem_ptr(array, i)); - for (i = 0; i < array->map.max_entries; i++) - bpf_obj_free_timer(map->record, array_map_elem_ptr(array, i)); + if (btf_record_has_field(map->record, BPF_WORKQUEUE)) + for (i = 0; i < array->map.max_entries; i++) + bpf_obj_free_workqueue(map->record, array_map_elem_ptr(array, i)); } /* Called when map->refcnt goes to zero, either from workqueue or from syscall */ @@ -782,7 +786,7 @@ const struct bpf_map_ops array_map_ops = { .map_alloc = array_map_alloc, .map_free = array_map_free, .map_get_next_key = array_map_get_next_key, - .map_release_uref = array_map_free_timers, + .map_release_uref = array_map_free_timers_wq, .map_lookup_elem = array_map_lookup_elem, .map_update_elem = array_map_update_elem, .map_delete_elem = array_map_delete_elem, diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index c3e79a0b9361..0179183c543a 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -240,6 +240,26 @@ static void htab_free_prealloced_timers(struct bpf_htab *htab) } } +static void htab_free_prealloced_wq(struct bpf_htab *htab) +{ + u32 num_entries = htab->map.max_entries; + int i; + + if (!btf_record_has_field(htab->map.record, BPF_WORKQUEUE)) + return; + if (htab_has_extra_elems(htab)) + num_entries += num_possible_cpus(); + + for (i = 0; i < num_entries; i++) { + struct htab_elem *elem; + + elem = get_htab_elem(htab, i); + bpf_obj_free_workqueue(htab->map.record, + elem->key + round_up(htab->map.key_size, 8)); + cond_resched(); + } +} + static void htab_free_prealloced_fields(struct bpf_htab *htab) { u32 num_entries = htab->map.max_entries; @@ -1495,7 +1515,7 @@ static void delete_all_elements(struct bpf_htab *htab) migrate_enable(); } -static void htab_free_malloced_timers(struct bpf_htab *htab) +static void htab_free_malloced_timers_or_wq(struct bpf_htab *htab, bool is_timer) { int i; @@ -1507,24 +1527,35 @@ static void htab_free_malloced_timers(struct bpf_htab *htab) hlist_nulls_for_each_entry(l, n, head, hash_node) { /* We only free timer on uref dropping to zero */ - bpf_obj_free_timer(htab->map.record, l->key + round_up(htab->map.key_size, 8)); + if (is_timer) + bpf_obj_free_timer(htab->map.record, + l->key + round_up(htab->map.key_size, 8)); + else + bpf_obj_free_workqueue(htab->map.record, + l->key + round_up(htab->map.key_size, 8)); } cond_resched_rcu(); } rcu_read_unlock(); } -static void htab_map_free_timers(struct bpf_map *map) +static void htab_map_free_timers_and_wq(struct bpf_map *map) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - /* We only free timer on uref dropping to zero */ - if (!btf_record_has_field(htab->map.record, BPF_TIMER)) - return; - if (!htab_is_prealloc(htab)) - htab_free_malloced_timers(htab); - else - htab_free_prealloced_timers(htab); + /* We only free timer and workqueue on uref dropping to zero */ + if (btf_record_has_field(htab->map.record, BPF_TIMER)) { + if (!htab_is_prealloc(htab)) + htab_free_malloced_timers_or_wq(htab, true); + else + htab_free_prealloced_timers(htab); + } + if (btf_record_has_field(htab->map.record, BPF_WORKQUEUE)) { + if (!htab_is_prealloc(htab)) + htab_free_malloced_timers_or_wq(htab, false); + else + htab_free_prealloced_wq(htab); + } } /* Called when map->refcnt goes to zero, either from workqueue or from syscall */ @@ -2260,7 +2291,7 @@ const struct bpf_map_ops htab_map_ops = { .map_alloc = htab_map_alloc, .map_free = htab_map_free, .map_get_next_key = htab_map_get_next_key, - .map_release_uref = htab_map_free_timers, + .map_release_uref = htab_map_free_timers_and_wq, .map_lookup_elem = htab_map_lookup_elem, .map_lookup_and_delete_elem = htab_map_lookup_and_delete_elem, .map_update_elem = htab_map_update_elem, @@ -2281,7 +2312,7 @@ const struct bpf_map_ops htab_lru_map_ops = { .map_alloc = htab_map_alloc, .map_free = htab_map_free, .map_get_next_key = htab_map_get_next_key, - .map_release_uref = htab_map_free_timers, + .map_release_uref = htab_map_free_timers_and_wq, .map_lookup_elem = htab_lru_map_lookup_elem, .map_lookup_and_delete_elem = htab_lru_map_lookup_and_delete_elem, .map_lookup_elem_sys_only = htab_lru_map_lookup_elem_sys, diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 0b8ba6c81994..fe827f6e5234 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1468,6 +1468,14 @@ void bpf_timer_cancel_and_free(void *val) kfree_rcu(t, cb.rcu); } +/* This function is called by map_delete/update_elem for individual element and + * by ops->map_release_uref when the user space reference to a map reaches zero. + */ +void bpf_wq_cancel_and_free(void *val) +{ + BTF_TYPE_EMIT(struct bpf_wq); +} + BPF_CALL_2(bpf_kptr_xchg, void *, map_value, void *, ptr) { unsigned long *kptr = map_value; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 0848e4141b00..63e368337483 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -661,6 +661,13 @@ void bpf_obj_free_timer(const struct btf_record *rec, void *obj) bpf_timer_cancel_and_free(obj + rec->timer_off); } +void bpf_obj_free_workqueue(const struct btf_record *rec, void *obj) +{ + if (WARN_ON_ONCE(!btf_record_has_field(rec, BPF_WORKQUEUE))) + return; + bpf_wq_cancel_and_free(obj + rec->wq_off); +} + void bpf_obj_free_fields(const struct btf_record *rec, void *obj) { const struct btf_field *fields; @@ -682,6 +689,7 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj) bpf_timer_cancel_and_free(field_ptr); break; case BPF_WORKQUEUE: + bpf_wq_cancel_and_free(field_ptr); break; case BPF_KPTR_UNREF: WRITE_ONCE(*(u64 *)field_ptr, 0); @@ -1119,6 +1127,7 @@ static int map_check_btf(struct bpf_map *map, struct bpf_token *token, } break; case BPF_TIMER: + case BPF_WORKQUEUE: if (map->map_type != BPF_MAP_TYPE_HASH && map->map_type != BPF_MAP_TYPE_LRU_HASH && map->map_type != BPF_MAP_TYPE_ARRAY) { -- cgit v1.2.3 From eb48f6cd41a0f7803770a76bbffb6bd5b1b2ae2f Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Sat, 20 Apr 2024 11:09:11 +0200 Subject: bpf: wq: add bpf_wq_init We need to teach the verifier about the second argument which is declared as void * but which is of type KF_ARG_PTR_TO_MAP. We could have dropped this extra case if we declared the second argument as struct bpf_map *, but that means users will have to do extra casting to have their program compile. We also need to duplicate the timer code for the checking if the map argument is matching the provided workqueue. Signed-off-by: Benjamin Tissoires Link: https://lore.kernel.org/r/20240420-bpf_wq-v2-11-6c986a5a741f@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 104 ++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 102 insertions(+), 2 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index fe827f6e5234..1fc002cfe2b1 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1109,11 +1109,18 @@ struct bpf_hrtimer { struct hrtimer timer; }; -/* the actual struct hidden inside uapi struct bpf_timer */ +struct bpf_work { + struct bpf_async_cb cb; + struct work_struct work; + struct work_struct delete_work; +}; + +/* the actual struct hidden inside uapi struct bpf_timer and bpf_wq */ struct bpf_async_kern { union { struct bpf_async_cb *cb; struct bpf_hrtimer *timer; + struct bpf_work *work; }; /* bpf_spin_lock is used here instead of spinlock_t to make * sure that it always fits into space reserved by struct bpf_timer @@ -1124,6 +1131,7 @@ struct bpf_async_kern { enum bpf_async_type { BPF_ASYNC_TYPE_TIMER = 0, + BPF_ASYNC_TYPE_WQ, }; static DEFINE_PER_CPU(struct bpf_hrtimer *, hrtimer_running); @@ -1167,11 +1175,64 @@ out: return HRTIMER_NORESTART; } +static void bpf_wq_work(struct work_struct *work) +{ + struct bpf_work *w = container_of(work, struct bpf_work, work); + struct bpf_tramp_run_ctx __maybe_unused run_ctx; + struct bpf_async_cb *cb = &w->cb; + struct bpf_prog *prog = cb->prog; + struct bpf_map *map = cb->map; + bpf_callback_t callback_fn; + void *value = cb->value; + void *key; + u32 idx; + + BTF_TYPE_EMIT(struct bpf_wq); + + callback_fn = READ_ONCE(cb->callback_fn); + if (!callback_fn || !prog) + return; + + if (map->map_type == BPF_MAP_TYPE_ARRAY) { + struct bpf_array *array = container_of(map, struct bpf_array, map); + + /* compute the key */ + idx = ((char *)value - array->value) / array->elem_size; + key = &idx; + } else { /* hash or lru */ + key = value - round_up(map->key_size, 8); + } + + run_ctx.bpf_cookie = 0; + + if (!__bpf_prog_enter_sleepable_recur(prog, &run_ctx)) { + /* recursion detected */ + __bpf_prog_exit_sleepable_recur(prog, 0, &run_ctx); + return; + } + + callback_fn((u64)(long)map, (u64)(long)key, (u64)(long)value, 0, 0); + /* The verifier checked that return value is zero. */ + + __bpf_prog_exit_sleepable_recur(prog, 0 /* bpf_prog_run does runtime stats */, + &run_ctx); +} + +static void bpf_wq_delete_work(struct work_struct *work) +{ + struct bpf_work *w = container_of(work, struct bpf_work, delete_work); + + cancel_work_sync(&w->work); + + kfree_rcu(w, cb.rcu); +} + static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u64 flags, enum bpf_async_type type) { struct bpf_async_cb *cb; struct bpf_hrtimer *t; + struct bpf_work *w; clockid_t clockid; size_t size; int ret = 0; @@ -1183,6 +1244,9 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u case BPF_ASYNC_TYPE_TIMER: size = sizeof(struct bpf_hrtimer); break; + case BPF_ASYNC_TYPE_WQ: + size = sizeof(struct bpf_work); + break; default: return -EINVAL; } @@ -1201,13 +1265,22 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u goto out; } - if (type == BPF_ASYNC_TYPE_TIMER) { + switch (type) { + case BPF_ASYNC_TYPE_TIMER: clockid = flags & (MAX_CLOCKS - 1); t = (struct bpf_hrtimer *)cb; hrtimer_init(&t->timer, clockid, HRTIMER_MODE_REL_SOFT); t->timer.function = bpf_timer_cb; cb->value = (void *)async - map->record->timer_off; + break; + case BPF_ASYNC_TYPE_WQ: + w = (struct bpf_work *)cb; + + INIT_WORK(&w->work, bpf_wq_work); + INIT_WORK(&w->delete_work, bpf_wq_delete_work); + cb->value = (void *)async - map->record->wq_off; + break; } cb->map = map; cb->prog = NULL; @@ -1473,7 +1546,19 @@ void bpf_timer_cancel_and_free(void *val) */ void bpf_wq_cancel_and_free(void *val) { + struct bpf_work *work; + BTF_TYPE_EMIT(struct bpf_wq); + + work = (struct bpf_work *)__bpf_async_cancel_and_free(val); + if (!work) + return; + /* Trigger cancel of the sleepable work, but *do not* wait for + * it to finish if it was running as we might not be in a + * sleepable context. + * kfree will be called once the work has finished. + */ + schedule_work(&work->delete_work); } BPF_CALL_2(bpf_kptr_xchg, void *, map_value, void *, ptr) @@ -2612,6 +2697,20 @@ __bpf_kfunc void bpf_throw(u64 cookie) WARN(1, "A call to BPF exception callback should never return\n"); } +__bpf_kfunc int bpf_wq_init(struct bpf_wq *wq, void *p__map, unsigned int flags) +{ + struct bpf_async_kern *async = (struct bpf_async_kern *)wq; + struct bpf_map *map = p__map; + + BUILD_BUG_ON(sizeof(struct bpf_async_kern) > sizeof(struct bpf_wq)); + BUILD_BUG_ON(__alignof__(struct bpf_async_kern) != __alignof__(struct bpf_wq)); + + if (flags) + return -EINVAL; + + return __bpf_async_init(async, map, flags, BPF_ASYNC_TYPE_WQ); +} + __bpf_kfunc_end_defs(); BTF_KFUNCS_START(generic_btf_ids) @@ -2689,6 +2788,7 @@ BTF_ID_FLAGS(func, bpf_dynptr_is_rdonly) BTF_ID_FLAGS(func, bpf_dynptr_size) BTF_ID_FLAGS(func, bpf_dynptr_clone) BTF_ID_FLAGS(func, bpf_modify_return_test_tp) +BTF_ID_FLAGS(func, bpf_wq_init) BTF_KFUNCS_END(common_btf_ids) static const struct btf_kfunc_id_set common_kfunc_set = { -- cgit v1.2.3 From 81f1d7a583fa1fa14f0c4e6140d34b5e3d08d227 Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Sat, 20 Apr 2024 11:09:13 +0200 Subject: bpf: wq: add bpf_wq_set_callback_impl To support sleepable async callbacks, we need to tell push_async_cb() whether the cb is sleepable or not. The verifier now detects that we are in bpf_wq_set_callback_impl and can allow a sleepable callback to happen. Signed-off-by: Benjamin Tissoires Link: https://lore.kernel.org/r/20240420-bpf_wq-v2-13-6c986a5a741f@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 15 +++++++++++++ kernel/bpf/verifier.c | 60 +++++++++++++++++++++++++++++++++++++++++++++------ 2 files changed, 69 insertions(+), 6 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 1fc002cfe2b1..dd2eaaf77c93 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -2711,6 +2711,20 @@ __bpf_kfunc int bpf_wq_init(struct bpf_wq *wq, void *p__map, unsigned int flags) return __bpf_async_init(async, map, flags, BPF_ASYNC_TYPE_WQ); } +__bpf_kfunc int bpf_wq_set_callback_impl(struct bpf_wq *wq, + int (callback_fn)(void *map, int *key, struct bpf_wq *wq), + unsigned int flags, + void *aux__ign) +{ + struct bpf_prog_aux *aux = (struct bpf_prog_aux *)aux__ign; + struct bpf_async_kern *async = (struct bpf_async_kern *)wq; + + if (flags) + return -EINVAL; + + return __bpf_async_set_callback(async, callback_fn, aux, flags, BPF_ASYNC_TYPE_WQ); +} + __bpf_kfunc_end_defs(); BTF_KFUNCS_START(generic_btf_ids) @@ -2789,6 +2803,7 @@ BTF_ID_FLAGS(func, bpf_dynptr_size) BTF_ID_FLAGS(func, bpf_dynptr_clone) BTF_ID_FLAGS(func, bpf_modify_return_test_tp) BTF_ID_FLAGS(func, bpf_wq_init) +BTF_ID_FLAGS(func, bpf_wq_set_callback_impl) BTF_KFUNCS_END(common_btf_ids) static const struct btf_kfunc_id_set common_kfunc_set = { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index cc20fda907fb..9715c88cc025 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -501,8 +501,12 @@ static bool is_dynptr_ref_function(enum bpf_func_id func_id) } static bool is_sync_callback_calling_kfunc(u32 btf_id); +static bool is_async_callback_calling_kfunc(u32 btf_id); +static bool is_callback_calling_kfunc(u32 btf_id); static bool is_bpf_throw_kfunc(struct bpf_insn *insn); +static bool is_bpf_wq_set_callback_impl_kfunc(u32 btf_id); + static bool is_sync_callback_calling_function(enum bpf_func_id func_id) { return func_id == BPF_FUNC_for_each_map_elem || @@ -530,7 +534,8 @@ static bool is_sync_callback_calling_insn(struct bpf_insn *insn) static bool is_async_callback_calling_insn(struct bpf_insn *insn) { - return bpf_helper_call(insn) && is_async_callback_calling_function(insn->imm); + return (bpf_helper_call(insn) && is_async_callback_calling_function(insn->imm)) || + (bpf_pseudo_kfunc_call(insn) && is_async_callback_calling_kfunc(insn->imm)); } static bool is_may_goto_insn(struct bpf_insn *insn) @@ -1429,6 +1434,7 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state, } dst_state->speculative = src->speculative; dst_state->active_rcu_lock = src->active_rcu_lock; + dst_state->in_sleepable = src->in_sleepable; dst_state->curframe = src->curframe; dst_state->active_lock.ptr = src->active_lock.ptr; dst_state->active_lock.id = src->active_lock.id; @@ -2404,7 +2410,7 @@ static void init_func_state(struct bpf_verifier_env *env, /* Similar to push_stack(), but for async callbacks */ static struct bpf_verifier_state *push_async_cb(struct bpf_verifier_env *env, int insn_idx, int prev_insn_idx, - int subprog) + int subprog, bool is_sleepable) { struct bpf_verifier_stack_elem *elem; struct bpf_func_state *frame; @@ -2431,6 +2437,7 @@ static struct bpf_verifier_state *push_async_cb(struct bpf_verifier_env *env, * Initialize it similar to do_check_common(). */ elem->st.branches = 1; + elem->st.in_sleepable = is_sleepable; frame = kzalloc(sizeof(*frame), GFP_KERNEL); if (!frame) goto err; @@ -5278,7 +5285,8 @@ bad_type: static bool in_sleepable(struct bpf_verifier_env *env) { - return env->prog->sleepable; + return env->prog->sleepable || + (env->cur_state && env->cur_state->in_sleepable); } /* The non-sleepable programs and sleepable programs with explicit bpf_rcu_read_lock() @@ -9513,7 +9521,7 @@ static int push_callback_call(struct bpf_verifier_env *env, struct bpf_insn *ins */ env->subprog_info[subprog].is_cb = true; if (bpf_pseudo_kfunc_call(insn) && - !is_sync_callback_calling_kfunc(insn->imm)) { + !is_callback_calling_kfunc(insn->imm)) { verbose(env, "verifier bug: kfunc %s#%d not marked as callback-calling\n", func_id_name(insn->imm), insn->imm); return -EFAULT; @@ -9527,10 +9535,11 @@ static int push_callback_call(struct bpf_verifier_env *env, struct bpf_insn *ins if (is_async_callback_calling_insn(insn)) { struct bpf_verifier_state *async_cb; - /* there is no real recursion here. timer callbacks are async */ + /* there is no real recursion here. timer and workqueue callbacks are async */ env->subprog_info[subprog].is_async_cb = true; async_cb = push_async_cb(env, env->subprog_info[subprog].start, - insn_idx, subprog); + insn_idx, subprog, + is_bpf_wq_set_callback_impl_kfunc(insn->imm)); if (!async_cb) return -EFAULT; callee = async_cb->frame[0]; @@ -11017,6 +11026,7 @@ enum special_kfunc_type { KF_bpf_percpu_obj_new_impl, KF_bpf_percpu_obj_drop_impl, KF_bpf_throw, + KF_bpf_wq_set_callback_impl, KF_bpf_iter_css_task_new, }; @@ -11041,6 +11051,7 @@ BTF_ID(func, bpf_dynptr_clone) BTF_ID(func, bpf_percpu_obj_new_impl) BTF_ID(func, bpf_percpu_obj_drop_impl) BTF_ID(func, bpf_throw) +BTF_ID(func, bpf_wq_set_callback_impl) #ifdef CONFIG_CGROUPS BTF_ID(func, bpf_iter_css_task_new) #endif @@ -11069,6 +11080,7 @@ BTF_ID(func, bpf_dynptr_clone) BTF_ID(func, bpf_percpu_obj_new_impl) BTF_ID(func, bpf_percpu_obj_drop_impl) BTF_ID(func, bpf_throw) +BTF_ID(func, bpf_wq_set_callback_impl) #ifdef CONFIG_CGROUPS BTF_ID(func, bpf_iter_css_task_new) #else @@ -11402,12 +11414,28 @@ static bool is_sync_callback_calling_kfunc(u32 btf_id) return btf_id == special_kfunc_list[KF_bpf_rbtree_add_impl]; } +static bool is_async_callback_calling_kfunc(u32 btf_id) +{ + return btf_id == special_kfunc_list[KF_bpf_wq_set_callback_impl]; +} + static bool is_bpf_throw_kfunc(struct bpf_insn *insn) { return bpf_pseudo_kfunc_call(insn) && insn->off == 0 && insn->imm == special_kfunc_list[KF_bpf_throw]; } +static bool is_bpf_wq_set_callback_impl_kfunc(u32 btf_id) +{ + return btf_id == special_kfunc_list[KF_bpf_wq_set_callback_impl]; +} + +static bool is_callback_calling_kfunc(u32 btf_id) +{ + return is_sync_callback_calling_kfunc(btf_id) || + is_async_callback_calling_kfunc(btf_id); +} + static bool is_rbtree_lock_required_kfunc(u32 btf_id) { return is_bpf_rbtree_api_kfunc(btf_id); @@ -12219,6 +12247,16 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, } } + if (is_bpf_wq_set_callback_impl_kfunc(meta.func_id)) { + err = push_callback_call(env, insn, insn_idx, meta.subprogno, + set_timer_callback_state); + if (err) { + verbose(env, "kfunc %s#%d failed callback verification\n", + func_name, meta.func_id); + return err; + } + } + rcu_lock = is_kfunc_bpf_rcu_read_lock(&meta); rcu_unlock = is_kfunc_bpf_rcu_read_unlock(&meta); @@ -16968,6 +17006,9 @@ static bool states_equal(struct bpf_verifier_env *env, if (old->active_rcu_lock != cur->active_rcu_lock) return false; + if (old->in_sleepable != cur->in_sleepable) + return false; + /* for states to be equal callsites have to be the same * and all frame states need to be equivalent */ @@ -19639,6 +19680,13 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, desc->func_id == special_kfunc_list[KF_bpf_rdonly_cast]) { insn_buf[0] = BPF_MOV64_REG(BPF_REG_0, BPF_REG_1); *cnt = 1; + } else if (is_bpf_wq_set_callback_impl_kfunc(desc->func_id)) { + struct bpf_insn ld_addrs[2] = { BPF_LD_IMM64(BPF_REG_4, (long)env->prog->aux) }; + + insn_buf[0] = ld_addrs[0]; + insn_buf[1] = ld_addrs[1]; + insn_buf[2] = *insn; + *cnt = 3; } return 0; } -- cgit v1.2.3 From 8e83da9732d91c60fdc651b2486c8e5935eb0ca2 Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Sat, 20 Apr 2024 11:09:15 +0200 Subject: bpf: add bpf_wq_start again, copy/paste from bpf_timer_start(). Signed-off-by: Benjamin Tissoires Link: https://lore.kernel.org/r/20240420-bpf_wq-v2-15-6c986a5a741f@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'kernel/bpf') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index dd2eaaf77c93..047a21b7e4ba 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -2711,6 +2711,23 @@ __bpf_kfunc int bpf_wq_init(struct bpf_wq *wq, void *p__map, unsigned int flags) return __bpf_async_init(async, map, flags, BPF_ASYNC_TYPE_WQ); } +__bpf_kfunc int bpf_wq_start(struct bpf_wq *wq, unsigned int flags) +{ + struct bpf_async_kern *async = (struct bpf_async_kern *)wq; + struct bpf_work *w; + + if (in_nmi()) + return -EOPNOTSUPP; + if (flags) + return -EINVAL; + w = READ_ONCE(async->work); + if (!w || !READ_ONCE(w->cb.prog)) + return -EINVAL; + + schedule_work(&w->work); + return 0; +} + __bpf_kfunc int bpf_wq_set_callback_impl(struct bpf_wq *wq, int (callback_fn)(void *map, int *key, struct bpf_wq *wq), unsigned int flags, @@ -2804,6 +2821,7 @@ BTF_ID_FLAGS(func, bpf_dynptr_clone) BTF_ID_FLAGS(func, bpf_modify_return_test_tp) BTF_ID_FLAGS(func, bpf_wq_init) BTF_ID_FLAGS(func, bpf_wq_set_callback_impl) +BTF_ID_FLAGS(func, bpf_wq_start) BTF_KFUNCS_END(common_btf_ids) static const struct btf_kfunc_id_set common_kfunc_set = { -- cgit v1.2.3 From 1adb825af946fa6b2104c1713ad4999283e09fbc Mon Sep 17 00:00:00 2001 From: Joel Granados Date: Tue, 27 Jun 2023 15:30:19 +0200 Subject: bpf: Remove the now superfluous sentinel elements from ctl_table array This commit comes at the tail end of a greater effort to remove the empty elements at the end of the ctl_table arrays (sentinels) which will reduce the overall build time size of the kernel and run time memory bloat by ~64 bytes per sentinel (further information Link : https://lore.kernel.org/all/ZO5Yx5JFogGi%2FcBo@bombadil.infradead.org/) Remove sentinel element from bpf_syscall_table. Acked-by: Andrii Nakryiko Signed-off-by: Joel Granados --- kernel/bpf/syscall.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index ae2ff73bde7e..c7e805087b06 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -5979,7 +5979,6 @@ static struct ctl_table bpf_syscall_table[] = { .mode = 0644, .proc_handler = bpf_stats_handler, }, - { } }; static int __init bpf_syscall_sysctl_init(void) -- cgit v1.2.3 From dc92febf7b93da5049fe177804e6b1961fcc6bd7 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 24 Apr 2024 09:00:23 -0700 Subject: bpf: Don't check for recursion in bpf_wq_work. __bpf_prog_enter_sleepable_recur does recursion check which is not applicable to wq callback. The callback function is part of bpf program and bpf prog might be running on the same cpu. So recursion check would incorrectly prevent callback from running. The code can call __bpf_prog_enter_sleepable(), but run_ctx would be fake, hence use explicit rcu_read_lock_trace(); migrate_disable(); to address this problem. Another reason to open code is __bpf_prog_enter* are not available in !JIT configs. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202404241719.IIGdpAku-lkp@intel.com/ Closes: https://lore.kernel.org/oe-kbuild-all/202404241811.FFV4Bku3-lkp@intel.com/ Fixes: eb48f6cd41a0 ("bpf: wq: add bpf_wq_init") Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 047a21b7e4ba..5f61ffa7505a 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1178,9 +1178,7 @@ out: static void bpf_wq_work(struct work_struct *work) { struct bpf_work *w = container_of(work, struct bpf_work, work); - struct bpf_tramp_run_ctx __maybe_unused run_ctx; struct bpf_async_cb *cb = &w->cb; - struct bpf_prog *prog = cb->prog; struct bpf_map *map = cb->map; bpf_callback_t callback_fn; void *value = cb->value; @@ -1190,7 +1188,7 @@ static void bpf_wq_work(struct work_struct *work) BTF_TYPE_EMIT(struct bpf_wq); callback_fn = READ_ONCE(cb->callback_fn); - if (!callback_fn || !prog) + if (!callback_fn) return; if (map->map_type == BPF_MAP_TYPE_ARRAY) { @@ -1203,19 +1201,13 @@ static void bpf_wq_work(struct work_struct *work) key = value - round_up(map->key_size, 8); } - run_ctx.bpf_cookie = 0; - - if (!__bpf_prog_enter_sleepable_recur(prog, &run_ctx)) { - /* recursion detected */ - __bpf_prog_exit_sleepable_recur(prog, 0, &run_ctx); - return; - } + rcu_read_lock_trace(); + migrate_disable(); callback_fn((u64)(long)map, (u64)(long)key, (u64)(long)value, 0, 0); - /* The verifier checked that return value is zero. */ - __bpf_prog_exit_sleepable_recur(prog, 0 /* bpf_prog_run does runtime stats */, - &run_ctx); + migrate_enable(); + rcu_read_unlock_trace(); } static void bpf_wq_delete_work(struct work_struct *work) -- cgit v1.2.3 From fc7566ad0a826cdc8886c5dbbb39ce72a0dc6333 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Wed, 24 Apr 2024 03:13:14 +0000 Subject: bpf: Introduce bpf_preempt_[disable,enable] kfuncs Introduce two new BPF kfuncs, bpf_preempt_disable and bpf_preempt_enable. These kfuncs allow disabling preemption in BPF programs. Nesting is allowed, since the intended use cases includes building native BPF spin locks without kernel helper involvement. Apart from that, this can be used to per-CPU data structures for cases where programs (or userspace) may preempt one or the other. Currently, while per-CPU access is stable, whether it will be consistent is not guaranteed, as only migration is disabled for BPF programs. Global functions are disallowed from being called, but support for them will be added as a follow up not just preempt kfuncs, but rcu_read_lock kfuncs as well. Static subprog calls are permitted. Sleepable helpers and kfuncs are disallowed in non-preemptible regions. Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20240424031315.2757363-2-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 12 +++++++++ kernel/bpf/verifier.c | 71 +++++++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 81 insertions(+), 2 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 5f61ffa7505a..14c401b78e12 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -2734,6 +2734,16 @@ __bpf_kfunc int bpf_wq_set_callback_impl(struct bpf_wq *wq, return __bpf_async_set_callback(async, callback_fn, aux, flags, BPF_ASYNC_TYPE_WQ); } +__bpf_kfunc void bpf_preempt_disable(void) +{ + preempt_disable(); +} + +__bpf_kfunc void bpf_preempt_enable(void) +{ + preempt_enable(); +} + __bpf_kfunc_end_defs(); BTF_KFUNCS_START(generic_btf_ids) @@ -2814,6 +2824,8 @@ BTF_ID_FLAGS(func, bpf_modify_return_test_tp) BTF_ID_FLAGS(func, bpf_wq_init) BTF_ID_FLAGS(func, bpf_wq_set_callback_impl) BTF_ID_FLAGS(func, bpf_wq_start) +BTF_ID_FLAGS(func, bpf_preempt_disable) +BTF_ID_FLAGS(func, bpf_preempt_enable) BTF_KFUNCS_END(common_btf_ids) static const struct btf_kfunc_id_set common_kfunc_set = { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 9715c88cc025..8312c7a0ee19 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1434,6 +1434,7 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state, } dst_state->speculative = src->speculative; dst_state->active_rcu_lock = src->active_rcu_lock; + dst_state->active_preempt_lock = src->active_preempt_lock; dst_state->in_sleepable = src->in_sleepable; dst_state->curframe = src->curframe; dst_state->active_lock.ptr = src->active_lock.ptr; @@ -9599,6 +9600,13 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, return -EINVAL; } + /* Only global subprogs cannot be called with preemption disabled. */ + if (env->cur_state->active_preempt_lock) { + verbose(env, "global function calls are not allowed with preemption disabled,\n" + "use static function instead\n"); + return -EINVAL; + } + if (err) { verbose(env, "Caller passes invalid args into func#%d ('%s')\n", subprog, sub_name); @@ -10285,6 +10293,17 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn env->insn_aux_data[insn_idx].storage_get_func_atomic = true; } + if (env->cur_state->active_preempt_lock) { + if (fn->might_sleep) { + verbose(env, "sleepable helper %s#%d in non-preemptible region\n", + func_id_name(func_id), func_id); + return -EINVAL; + } + + if (in_sleepable(env) && is_storage_get_function(func_id)) + env->insn_aux_data[insn_idx].storage_get_func_atomic = true; + } + meta.func_id = func_id; /* check args */ for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) { @@ -11027,6 +11046,8 @@ enum special_kfunc_type { KF_bpf_percpu_obj_drop_impl, KF_bpf_throw, KF_bpf_wq_set_callback_impl, + KF_bpf_preempt_disable, + KF_bpf_preempt_enable, KF_bpf_iter_css_task_new, }; @@ -11081,6 +11102,8 @@ BTF_ID(func, bpf_percpu_obj_new_impl) BTF_ID(func, bpf_percpu_obj_drop_impl) BTF_ID(func, bpf_throw) BTF_ID(func, bpf_wq_set_callback_impl) +BTF_ID(func, bpf_preempt_disable) +BTF_ID(func, bpf_preempt_enable) #ifdef CONFIG_CGROUPS BTF_ID(func, bpf_iter_css_task_new) #else @@ -11107,6 +11130,16 @@ static bool is_kfunc_bpf_rcu_read_unlock(struct bpf_kfunc_call_arg_meta *meta) return meta->func_id == special_kfunc_list[KF_bpf_rcu_read_unlock]; } +static bool is_kfunc_bpf_preempt_disable(struct bpf_kfunc_call_arg_meta *meta) +{ + return meta->func_id == special_kfunc_list[KF_bpf_preempt_disable]; +} + +static bool is_kfunc_bpf_preempt_enable(struct bpf_kfunc_call_arg_meta *meta) +{ + return meta->func_id == special_kfunc_list[KF_bpf_preempt_enable]; +} + static enum kfunc_ptr_arg_type get_kfunc_ptr_arg_type(struct bpf_verifier_env *env, struct bpf_kfunc_call_arg_meta *meta, @@ -12195,11 +12228,11 @@ static int check_return_code(struct bpf_verifier_env *env, int regno, const char static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, int *insn_idx_p) { - const struct btf_type *t, *ptr_type; + bool sleepable, rcu_lock, rcu_unlock, preempt_disable, preempt_enable; u32 i, nargs, ptr_type_id, release_ref_obj_id; struct bpf_reg_state *regs = cur_regs(env); const char *func_name, *ptr_type_name; - bool sleepable, rcu_lock, rcu_unlock; + const struct btf_type *t, *ptr_type; struct bpf_kfunc_call_arg_meta meta; struct bpf_insn_aux_data *insn_aux; int err, insn_idx = *insn_idx_p; @@ -12260,6 +12293,9 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, rcu_lock = is_kfunc_bpf_rcu_read_lock(&meta); rcu_unlock = is_kfunc_bpf_rcu_read_unlock(&meta); + preempt_disable = is_kfunc_bpf_preempt_disable(&meta); + preempt_enable = is_kfunc_bpf_preempt_enable(&meta); + if (env->cur_state->active_rcu_lock) { struct bpf_func_state *state; struct bpf_reg_state *reg; @@ -12292,6 +12328,22 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, return -EINVAL; } + if (env->cur_state->active_preempt_lock) { + if (preempt_disable) { + env->cur_state->active_preempt_lock++; + } else if (preempt_enable) { + env->cur_state->active_preempt_lock--; + } else if (sleepable) { + verbose(env, "kernel func %s is sleepable within non-preemptible region\n", func_name); + return -EACCES; + } + } else if (preempt_disable) { + env->cur_state->active_preempt_lock++; + } else if (preempt_enable) { + verbose(env, "unmatched attempt to enable preemption (kernel function %s)\n", func_name); + return -EINVAL; + } + /* In case of release function, we get register number of refcounted * PTR_TO_BTF_ID in bpf_kfunc_arg_meta, do the release now. */ @@ -15439,6 +15491,11 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) return -EINVAL; } + if (env->cur_state->active_preempt_lock) { + verbose(env, "BPF_LD_[ABS|IND] cannot be used inside bpf_preempt_disable-ed region\n"); + return -EINVAL; + } + if (regs[ctx_reg].type != PTR_TO_CTX) { verbose(env, "at the time of BPF_LD_ABS|IND R6 != pointer to skb\n"); @@ -17006,6 +17063,9 @@ static bool states_equal(struct bpf_verifier_env *env, if (old->active_rcu_lock != cur->active_rcu_lock) return false; + if (old->active_preempt_lock != cur->active_preempt_lock) + return false; + if (old->in_sleepable != cur->in_sleepable) return false; @@ -17957,6 +18017,13 @@ process_bpf_exit_full: return -EINVAL; } + if (env->cur_state->active_preempt_lock && !env->cur_state->curframe) { + verbose(env, "%d bpf_preempt_enable%s missing\n", + env->cur_state->active_preempt_lock, + env->cur_state->active_preempt_lock == 1 ? " is" : "(s) are"); + return -EINVAL; + } + /* We must do check_reference_leak here before * prepare_func_exit to handle the case when * state->curframe > 0, it may be a callback -- cgit v1.2.3 From 3e1c6f35409f9e447bf37f64840f5b65576bfb78 Mon Sep 17 00:00:00 2001 From: Vadim Fedorenko Date: Mon, 22 Apr 2024 15:50:21 -0700 Subject: bpf: make common crypto API for TC/XDP programs Add crypto API support to BPF to be able to decrypt or encrypt packets in TC/XDP BPF programs. Special care should be taken for initialization part of crypto algo because crypto alloc) doesn't work with preemtion disabled, it can be run only in sleepable BPF program. Also async crypto is not supported because of the very same issue - TC/XDP BPF programs are not sleepable. Signed-off-by: Vadim Fedorenko Link: https://lore.kernel.org/r/20240422225024.2847039-2-vadfed@meta.com Signed-off-by: Martin KaFai Lau --- kernel/bpf/Makefile | 3 + kernel/bpf/crypto.c | 385 ++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/helpers.c | 2 +- kernel/bpf/verifier.c | 1 + 4 files changed, 390 insertions(+), 1 deletion(-) create mode 100644 kernel/bpf/crypto.c (limited to 'kernel/bpf') diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 368c5d86b5b7..736bd22e5ce0 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -44,6 +44,9 @@ obj-$(CONFIG_BPF_SYSCALL) += bpf_struct_ops.o obj-$(CONFIG_BPF_SYSCALL) += cpumask.o obj-${CONFIG_BPF_LSM} += bpf_lsm.o endif +ifeq ($(CONFIG_CRYPTO),y) +obj-$(CONFIG_BPF_SYSCALL) += crypto.o +endif obj-$(CONFIG_BPF_PRELOAD) += preload/ obj-$(CONFIG_BPF_SYSCALL) += relo_core.o diff --git a/kernel/bpf/crypto.c b/kernel/bpf/crypto.c new file mode 100644 index 000000000000..2bee4af91e38 --- /dev/null +++ b/kernel/bpf/crypto.c @@ -0,0 +1,385 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2024 Meta, Inc */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct bpf_crypto_type_list { + const struct bpf_crypto_type *type; + struct list_head list; +}; + +/* BPF crypto initialization parameters struct */ +/** + * struct bpf_crypto_params - BPF crypto initialization parameters structure + * @type: The string of crypto operation type. + * @reserved: Reserved member, will be reused for more options in future + * Values: + * 0 + * @algo: The string of algorithm to initialize. + * @key: The cipher key used to init crypto algorithm. + * @key_len: The length of cipher key. + * @authsize: The length of authentication tag used by algorithm. + */ +struct bpf_crypto_params { + char type[14]; + u8 reserved[2]; + char algo[128]; + u8 key[256]; + u32 key_len; + u32 authsize; +}; + +static LIST_HEAD(bpf_crypto_types); +static DECLARE_RWSEM(bpf_crypto_types_sem); + +/** + * struct bpf_crypto_ctx - refcounted BPF crypto context structure + * @type: The pointer to bpf crypto type + * @tfm: The pointer to instance of crypto API struct. + * @siv_len: Size of IV and state storage for cipher + * @rcu: The RCU head used to free the crypto context with RCU safety. + * @usage: Object reference counter. When the refcount goes to 0, the + * memory is released back to the BPF allocator, which provides + * RCU safety. + */ +struct bpf_crypto_ctx { + const struct bpf_crypto_type *type; + void *tfm; + u32 siv_len; + struct rcu_head rcu; + refcount_t usage; +}; + +int bpf_crypto_register_type(const struct bpf_crypto_type *type) +{ + struct bpf_crypto_type_list *node; + int err = -EEXIST; + + down_write(&bpf_crypto_types_sem); + list_for_each_entry(node, &bpf_crypto_types, list) { + if (!strcmp(node->type->name, type->name)) + goto unlock; + } + + node = kmalloc(sizeof(*node), GFP_KERNEL); + err = -ENOMEM; + if (!node) + goto unlock; + + node->type = type; + list_add(&node->list, &bpf_crypto_types); + err = 0; + +unlock: + up_write(&bpf_crypto_types_sem); + + return err; +} +EXPORT_SYMBOL_GPL(bpf_crypto_register_type); + +int bpf_crypto_unregister_type(const struct bpf_crypto_type *type) +{ + struct bpf_crypto_type_list *node; + int err = -ENOENT; + + down_write(&bpf_crypto_types_sem); + list_for_each_entry(node, &bpf_crypto_types, list) { + if (strcmp(node->type->name, type->name)) + continue; + + list_del(&node->list); + kfree(node); + err = 0; + break; + } + up_write(&bpf_crypto_types_sem); + + return err; +} +EXPORT_SYMBOL_GPL(bpf_crypto_unregister_type); + +static const struct bpf_crypto_type *bpf_crypto_get_type(const char *name) +{ + const struct bpf_crypto_type *type = ERR_PTR(-ENOENT); + struct bpf_crypto_type_list *node; + + down_read(&bpf_crypto_types_sem); + list_for_each_entry(node, &bpf_crypto_types, list) { + if (strcmp(node->type->name, name)) + continue; + + if (try_module_get(node->type->owner)) + type = node->type; + break; + } + up_read(&bpf_crypto_types_sem); + + return type; +} + +__bpf_kfunc_start_defs(); + +/** + * bpf_crypto_ctx_create() - Create a mutable BPF crypto context. + * + * Allocates a crypto context that can be used, acquired, and released by + * a BPF program. The crypto context returned by this function must either + * be embedded in a map as a kptr, or freed with bpf_crypto_ctx_release(). + * As crypto API functions use GFP_KERNEL allocations, this function can + * only be used in sleepable BPF programs. + * + * bpf_crypto_ctx_create() allocates memory for crypto context. + * It may return NULL if no memory is available. + * @params: pointer to struct bpf_crypto_params which contains all the + * details needed to initialise crypto context. + * @params__sz: size of steuct bpf_crypto_params usef by bpf program + * @err: integer to store error code when NULL is returned. + */ +__bpf_kfunc struct bpf_crypto_ctx * +bpf_crypto_ctx_create(const struct bpf_crypto_params *params, u32 params__sz, + int *err) +{ + const struct bpf_crypto_type *type; + struct bpf_crypto_ctx *ctx; + + if (!params || params->reserved[0] || params->reserved[1] || + params__sz != sizeof(struct bpf_crypto_params)) { + *err = -EINVAL; + return NULL; + } + + type = bpf_crypto_get_type(params->type); + if (IS_ERR(type)) { + *err = PTR_ERR(type); + return NULL; + } + + if (!type->has_algo(params->algo)) { + *err = -EOPNOTSUPP; + goto err_module_put; + } + + if (!!params->authsize ^ !!type->setauthsize) { + *err = -EOPNOTSUPP; + goto err_module_put; + } + + if (!params->key_len || params->key_len > sizeof(params->key)) { + *err = -EINVAL; + goto err_module_put; + } + + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) { + *err = -ENOMEM; + goto err_module_put; + } + + ctx->type = type; + ctx->tfm = type->alloc_tfm(params->algo); + if (IS_ERR(ctx->tfm)) { + *err = PTR_ERR(ctx->tfm); + goto err_free_ctx; + } + + if (params->authsize) { + *err = type->setauthsize(ctx->tfm, params->authsize); + if (*err) + goto err_free_tfm; + } + + *err = type->setkey(ctx->tfm, params->key, params->key_len); + if (*err) + goto err_free_tfm; + + if (type->get_flags(ctx->tfm) & CRYPTO_TFM_NEED_KEY) { + *err = -EINVAL; + goto err_free_tfm; + } + + ctx->siv_len = type->ivsize(ctx->tfm) + type->statesize(ctx->tfm); + + refcount_set(&ctx->usage, 1); + + return ctx; + +err_free_tfm: + type->free_tfm(ctx->tfm); +err_free_ctx: + kfree(ctx); +err_module_put: + module_put(type->owner); + + return NULL; +} + +static void crypto_free_cb(struct rcu_head *head) +{ + struct bpf_crypto_ctx *ctx; + + ctx = container_of(head, struct bpf_crypto_ctx, rcu); + ctx->type->free_tfm(ctx->tfm); + module_put(ctx->type->owner); + kfree(ctx); +} + +/** + * bpf_crypto_ctx_acquire() - Acquire a reference to a BPF crypto context. + * @ctx: The BPF crypto context being acquired. The ctx must be a trusted + * pointer. + * + * Acquires a reference to a BPF crypto context. The context returned by this function + * must either be embedded in a map as a kptr, or freed with + * bpf_crypto_ctx_release(). + */ +__bpf_kfunc struct bpf_crypto_ctx * +bpf_crypto_ctx_acquire(struct bpf_crypto_ctx *ctx) +{ + if (!refcount_inc_not_zero(&ctx->usage)) + return NULL; + return ctx; +} + +/** + * bpf_crypto_ctx_release() - Release a previously acquired BPF crypto context. + * @ctx: The crypto context being released. + * + * Releases a previously acquired reference to a BPF crypto context. When the final + * reference of the BPF crypto context has been released, its memory + * will be released. + */ +__bpf_kfunc void bpf_crypto_ctx_release(struct bpf_crypto_ctx *ctx) +{ + if (refcount_dec_and_test(&ctx->usage)) + call_rcu(&ctx->rcu, crypto_free_cb); +} + +static int bpf_crypto_crypt(const struct bpf_crypto_ctx *ctx, + const struct bpf_dynptr_kern *src, + const struct bpf_dynptr_kern *dst, + const struct bpf_dynptr_kern *siv, + bool decrypt) +{ + u32 src_len, dst_len, siv_len; + const u8 *psrc; + u8 *pdst, *piv; + int err; + + if (__bpf_dynptr_is_rdonly(dst)) + return -EINVAL; + + siv_len = __bpf_dynptr_size(siv); + src_len = __bpf_dynptr_size(src); + dst_len = __bpf_dynptr_size(dst); + if (!src_len || !dst_len) + return -EINVAL; + + if (siv_len != ctx->siv_len) + return -EINVAL; + + psrc = __bpf_dynptr_data(src, src_len); + if (!psrc) + return -EINVAL; + pdst = __bpf_dynptr_data_rw(dst, dst_len); + if (!pdst) + return -EINVAL; + + piv = siv_len ? __bpf_dynptr_data_rw(siv, siv_len) : NULL; + if (siv_len && !piv) + return -EINVAL; + + err = decrypt ? ctx->type->decrypt(ctx->tfm, psrc, pdst, src_len, piv) + : ctx->type->encrypt(ctx->tfm, psrc, pdst, src_len, piv); + + return err; +} + +/** + * bpf_crypto_decrypt() - Decrypt buffer using configured context and IV provided. + * @ctx: The crypto context being used. The ctx must be a trusted pointer. + * @src: bpf_dynptr to the encrypted data. Must be a trusted pointer. + * @dst: bpf_dynptr to the buffer where to store the result. Must be a trusted pointer. + * @siv: bpf_dynptr to IV data and state data to be used by decryptor. + * + * Decrypts provided buffer using IV data and the crypto context. Crypto context must be configured. + */ +__bpf_kfunc int bpf_crypto_decrypt(struct bpf_crypto_ctx *ctx, + const struct bpf_dynptr_kern *src, + const struct bpf_dynptr_kern *dst, + const struct bpf_dynptr_kern *siv) +{ + return bpf_crypto_crypt(ctx, src, dst, siv, true); +} + +/** + * bpf_crypto_encrypt() - Encrypt buffer using configured context and IV provided. + * @ctx: The crypto context being used. The ctx must be a trusted pointer. + * @src: bpf_dynptr to the plain data. Must be a trusted pointer. + * @dst: bpf_dynptr to buffer where to store the result. Must be a trusted pointer. + * @siv: bpf_dynptr to IV data and state data to be used by decryptor. + * + * Encrypts provided buffer using IV data and the crypto context. Crypto context must be configured. + */ +__bpf_kfunc int bpf_crypto_encrypt(struct bpf_crypto_ctx *ctx, + const struct bpf_dynptr_kern *src, + const struct bpf_dynptr_kern *dst, + const struct bpf_dynptr_kern *siv) +{ + return bpf_crypto_crypt(ctx, src, dst, siv, false); +} + +__bpf_kfunc_end_defs(); + +BTF_KFUNCS_START(crypt_init_kfunc_btf_ids) +BTF_ID_FLAGS(func, bpf_crypto_ctx_create, KF_ACQUIRE | KF_RET_NULL | KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_crypto_ctx_release, KF_RELEASE) +BTF_ID_FLAGS(func, bpf_crypto_ctx_acquire, KF_ACQUIRE | KF_RCU | KF_RET_NULL) +BTF_KFUNCS_END(crypt_init_kfunc_btf_ids) + +static const struct btf_kfunc_id_set crypt_init_kfunc_set = { + .owner = THIS_MODULE, + .set = &crypt_init_kfunc_btf_ids, +}; + +BTF_KFUNCS_START(crypt_kfunc_btf_ids) +BTF_ID_FLAGS(func, bpf_crypto_decrypt, KF_RCU) +BTF_ID_FLAGS(func, bpf_crypto_encrypt, KF_RCU) +BTF_KFUNCS_END(crypt_kfunc_btf_ids) + +static const struct btf_kfunc_id_set crypt_kfunc_set = { + .owner = THIS_MODULE, + .set = &crypt_kfunc_btf_ids, +}; + +BTF_ID_LIST(bpf_crypto_dtor_ids) +BTF_ID(struct, bpf_crypto_ctx) +BTF_ID(func, bpf_crypto_ctx_release) + +static int __init crypto_kfunc_init(void) +{ + int ret; + const struct btf_id_dtor_kfunc bpf_crypto_dtors[] = { + { + .btf_id = bpf_crypto_dtor_ids[0], + .kfunc_btf_id = bpf_crypto_dtor_ids[1] + }, + }; + + ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &crypt_kfunc_set); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_ACT, &crypt_kfunc_set); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &crypt_kfunc_set); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, + &crypt_init_kfunc_set); + return ret ?: register_btf_id_dtor_kfuncs(bpf_crypto_dtors, + ARRAY_SIZE(bpf_crypto_dtors), + THIS_MODULE); +} + +late_initcall(crypto_kfunc_init); diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 14c401b78e12..2a69a9a36c0f 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1583,7 +1583,7 @@ static const struct bpf_func_proto bpf_kptr_xchg_proto = { #define DYNPTR_SIZE_MASK 0xFFFFFF #define DYNPTR_RDONLY_BIT BIT(31) -static bool __bpf_dynptr_is_rdonly(const struct bpf_dynptr_kern *ptr) +bool __bpf_dynptr_is_rdonly(const struct bpf_dynptr_kern *ptr) { return ptr->size & DYNPTR_RDONLY_BIT; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 8312c7a0ee19..4e474ef44e9c 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5310,6 +5310,7 @@ BTF_ID(struct, cgroup) BTF_ID(struct, bpf_cpumask) #endif BTF_ID(struct, task_struct) +BTF_ID(struct, bpf_crypto_ctx) BTF_SET_END(rcu_protected_types) static bool rcu_protected_object(const struct btf *btf, u32 btf_id) -- cgit v1.2.3 From 91b71e78b8e478e1a9af36b15ff1d38810572866 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Sat, 16 Mar 2024 01:58:03 +0000 Subject: mm: memcg: add NULL check to obj_cgroup_put() 9 out of 16 callers perform a NULL check before calling obj_cgroup_put(). Move the NULL check in the function, similar to mem_cgroup_put(). The unlikely() NULL check in current_objcg_update() was left alone to avoid dropping the unlikey() annotation as this a fast path. Link: https://lkml.kernel.org/r/20240316015803.2777252-1-yosryahmed@google.com Signed-off-by: Yosry Ahmed Acked-by: Johannes Weiner Acked-by: Roman Gushchin Cc: Michal Hocko Cc: Muchun Song Cc: Shakeel Butt Signed-off-by: Andrew Morton --- kernel/bpf/memalloc.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c index 550f02e2cb13..a546aba46d5d 100644 --- a/kernel/bpf/memalloc.c +++ b/kernel/bpf/memalloc.c @@ -759,8 +759,7 @@ void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma) rcu_in_progress += atomic_read(&c->call_rcu_ttrace_in_progress); rcu_in_progress += atomic_read(&c->call_rcu_in_progress); } - if (ma->objcg) - obj_cgroup_put(ma->objcg); + obj_cgroup_put(ma->objcg); destroy_mem_alloc(ma, rcu_in_progress); } if (ma->caches) { @@ -776,8 +775,7 @@ void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma) rcu_in_progress += atomic_read(&c->call_rcu_in_progress); } } - if (ma->objcg) - obj_cgroup_put(ma->objcg); + obj_cgroup_put(ma->objcg); destroy_mem_alloc(ma, rcu_in_progress); } } -- cgit v1.2.3 From 529ce23a764f25d172198b4c6ba90f1e2ad17f93 Mon Sep 17 00:00:00 2001 From: Rick Edgecombe Date: Mon, 25 Mar 2024 19:16:44 -0700 Subject: mm: switch mm->get_unmapped_area() to a flag The mm_struct contains a function pointer *get_unmapped_area(), which is set to either arch_get_unmapped_area() or arch_get_unmapped_area_topdown() during the initialization of the mm. Since the function pointer only ever points to two functions that are named the same across all arch's, a function pointer is not really required. In addition future changes will want to add versions of the functions that take additional arguments. So to save a pointers worth of bytes in mm_struct, and prevent adding additional function pointers to mm_struct in future changes, remove it and keep the information about which get_unmapped_area() to use in a flag. Add the new flag to MMF_INIT_MASK so it doesn't get clobbered on fork by mmf_init_flags(). Most MM flags get clobbered on fork. In the pre-existing behavior mm->get_unmapped_area() would get copied to the new mm in dup_mm(), so not clobbering the flag preserves the existing behavior around inheriting the topdown-ness. Introduce a helper, mm_get_unmapped_area(), to easily convert code that refers to the old function pointer to instead select and call either arch_get_unmapped_area() or arch_get_unmapped_area_topdown() based on the flag. Then drop the mm->get_unmapped_area() function pointer. Leave the get_unmapped_area() pointer in struct file_operations alone. The main purpose of this change is to reorganize in preparation for future changes, but it also converts the calls of mm->get_unmapped_area() from indirect branches into a direct ones. The stress-ng bigheap benchmark calls realloc a lot, which calls through get_unmapped_area() in the kernel. On x86, the change yielded a ~1% improvement there on a retpoline config. In testing a few x86 configs, removing the pointer unfortunately didn't result in any actual size reductions in the compiled layout of mm_struct. But depending on compiler or arch alignment requirements, the change could shrink the size of mm_struct. Link: https://lkml.kernel.org/r/20240326021656.202649-3-rick.p.edgecombe@intel.com Signed-off-by: Rick Edgecombe Acked-by: Dave Hansen Acked-by: Liam R. Howlett Reviewed-by: Kirill A. Shutemov Acked-by: Alexei Starovoitov Cc: Dan Williams Cc: Andy Lutomirski Cc: Aneesh Kumar K.V Cc: Borislav Petkov (AMD) Cc: Christophe Leroy Cc: Deepak Gupta Cc: Guo Ren Cc: Helge Deller Cc: H. Peter Anvin (Intel) Cc: Ingo Molnar Cc: "James E.J. Bottomley" Cc: Kees Cook Cc: Mark Brown Cc: Michael Ellerman Cc: Naveen N. Rao Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Andrew Morton --- kernel/bpf/arena.c | 2 +- kernel/bpf/syscall.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c index 343c3456c8dd..16dbf4f6b77f 100644 --- a/kernel/bpf/arena.c +++ b/kernel/bpf/arena.c @@ -314,7 +314,7 @@ static unsigned long arena_get_unmapped_area(struct file *filp, unsigned long ad return -EINVAL; } - ret = current->mm->get_unmapped_area(filp, addr, len * 2, 0, flags); + ret = mm_get_unmapped_area(current->mm, filp, addr, len * 2, 0, flags); if (IS_ERR_VALUE(ret)) return ret; if ((ret >> 32) == ((ret + len - 1) >> 32)) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index c287925471f6..9cb89e875f0d 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -980,7 +980,7 @@ static unsigned long bpf_get_unmapped_area(struct file *filp, unsigned long addr if (map->ops->map_get_unmapped_area) return map->ops->map_get_unmapped_area(filp, addr, len, pgoff, flags); #ifdef CONFIG_MMU - return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags); + return mm_get_unmapped_area(current->mm, filp, addr, len, pgoff, flags); #else return addr; #endif -- cgit v1.2.3 From 66e13b615a0ce76b785d780ecc9776ba71983629 Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Wed, 24 Apr 2024 10:02:08 +0000 Subject: bpf: verifier: prevent userspace memory access With BPF_PROBE_MEM, BPF allows de-referencing an untrusted pointer. To thwart invalid memory accesses, the JITs add an exception table entry for all such accesses. But in case the src_reg + offset is a userspace address, the BPF program might read that memory if the user has mapped it. Make the verifier add guard instructions around such memory accesses and skip the load if the address falls into the userspace region. The JITs need to implement bpf_arch_uaddress_limit() to define where the userspace addresses end for that architecture or TASK_SIZE is taken as default. The implementation is as follows: REG_AX = SRC_REG if(offset) REG_AX += offset; REG_AX >>= 32; if (REG_AX <= (uaddress_limit >> 32)) DST_REG = 0; else DST_REG = *(size *)(SRC_REG + offset); Comparing just the upper 32 bits of the load address with the upper 32 bits of uaddress_limit implies that the values are being aligned down to a 4GB boundary before comparison. The above means that all loads with address <= uaddress_limit + 4GB are skipped. This is acceptable because there is a large hole (much larger than 4GB) between userspace and kernel space memory, therefore a correctly functioning BPF program should not access this 4GB memory above the userspace. Let's analyze what this patch does to the following fentry program dereferencing an untrusted pointer: SEC("fentry/tcp_v4_connect") int BPF_PROG(fentry_tcp_v4_connect, struct sock *sk) { *(volatile long *)sk; return 0; } BPF Program before | BPF Program after ------------------ | ----------------- 0: (79) r1 = *(u64 *)(r1 +0) 0: (79) r1 = *(u64 *)(r1 +0) ----------------------------------------------------------------------- 1: (79) r1 = *(u64 *)(r1 +0) --\ 1: (bf) r11 = r1 ----------------------------\ \ 2: (77) r11 >>= 32 2: (b7) r0 = 0 \ \ 3: (b5) if r11 <= 0x8000 goto pc+2 3: (95) exit \ \-> 4: (79) r1 = *(u64 *)(r1 +0) \ 5: (05) goto pc+1 \ 6: (b7) r1 = 0 \-------------------------------------- 7: (b7) r0 = 0 8: (95) exit As you can see from above, in the best case (off=0), 5 extra instructions are emitted. Now, we analyze the same program after it has gone through the JITs of ARM64 and RISC-V architectures. We follow the single load instruction that has the untrusted pointer and see what instrumentation has been added around it. x86-64 JIT ========== JIT's Instrumentation (upstream) --------------------- 0: nopl 0x0(%rax,%rax,1) 5: xchg %ax,%ax 7: push %rbp 8: mov %rsp,%rbp b: mov 0x0(%rdi),%rdi --------------------------------- f: movabs $0x800000000000,%r11 19: cmp %r11,%rdi 1c: jb 0x000000000000002a 1e: mov %rdi,%r11 21: add $0x0,%r11 28: jae 0x000000000000002e 2a: xor %edi,%edi 2c: jmp 0x0000000000000032 2e: mov 0x0(%rdi),%rdi --------------------------------- 32: xor %eax,%eax 34: leave 35: ret The x86-64 JIT already emits some instructions to protect against user memory access. This patch doesn't make any changes for the x86-64 JIT. ARM64 JIT ========= No Intrumentation Verifier's Instrumentation (upstream) (This patch) ----------------- -------------------------- 0: add x9, x30, #0x0 0: add x9, x30, #0x0 4: nop 4: nop 8: paciasp 8: paciasp c: stp x29, x30, [sp, #-16]! c: stp x29, x30, [sp, #-16]! 10: mov x29, sp 10: mov x29, sp 14: stp x19, x20, [sp, #-16]! 14: stp x19, x20, [sp, #-16]! 18: stp x21, x22, [sp, #-16]! 18: stp x21, x22, [sp, #-16]! 1c: stp x25, x26, [sp, #-16]! 1c: stp x25, x26, [sp, #-16]! 20: stp x27, x28, [sp, #-16]! 20: stp x27, x28, [sp, #-16]! 24: mov x25, sp 24: mov x25, sp 28: mov x26, #0x0 28: mov x26, #0x0 2c: sub x27, x25, #0x0 2c: sub x27, x25, #0x0 30: sub sp, sp, #0x0 30: sub sp, sp, #0x0 34: ldr x0, [x0] 34: ldr x0, [x0] -------------------------------------------------------------------------------- 38: ldr x0, [x0] ----------\ 38: add x9, x0, #0x0 -----------------------------------\\ 3c: lsr x9, x9, #32 3c: mov x7, #0x0 \\ 40: cmp x9, #0x10, lsl #12 40: mov sp, sp \\ 44: b.ls 0x0000000000000050 44: ldp x27, x28, [sp], #16 \\--> 48: ldr x0, [x0] 48: ldp x25, x26, [sp], #16 \ 4c: b 0x0000000000000054 4c: ldp x21, x22, [sp], #16 \ 50: mov x0, #0x0 50: ldp x19, x20, [sp], #16 \--------------------------------------- 54: ldp x29, x30, [sp], #16 54: mov x7, #0x0 58: add x0, x7, #0x0 58: mov sp, sp 5c: autiasp 5c: ldp x27, x28, [sp], #16 60: ret 60: ldp x25, x26, [sp], #16 64: nop 64: ldp x21, x22, [sp], #16 68: ldr x10, 0x0000000000000070 68: ldp x19, x20, [sp], #16 6c: br x10 6c: ldp x29, x30, [sp], #16 70: add x0, x7, #0x0 74: autiasp 78: ret 7c: nop 80: ldr x10, 0x0000000000000088 84: br x10 There are 6 extra instructions added in ARM64 in the best case. This will become 7 in the worst case (off != 0). RISC-V JIT (RISCV_ISA_C Disabled) ========== No Intrumentation Verifier's Instrumentation (upstream) (This patch) ----------------- -------------------------- 0: nop 0: nop 4: nop 4: nop 8: li a6, 33 8: li a6, 33 c: addi sp, sp, -16 c: addi sp, sp, -16 10: sd s0, 8(sp) 10: sd s0, 8(sp) 14: addi s0, sp, 16 14: addi s0, sp, 16 18: ld a0, 0(a0) 18: ld a0, 0(a0) --------------------------------------------------------------- 1c: ld a0, 0(a0) --\ 1c: mv t0, a0 --------------------------\ \ 20: srli t0, t0, 32 20: li a5, 0 \ \ 24: lui t1, 4096 24: ld s0, 8(sp) \ \ 28: sext.w t1, t1 28: addi sp, sp, 16 \ \ 2c: bgeu t1, t0, 12 2c: sext.w a0, a5 \ \--> 30: ld a0, 0(a0) 30: ret \ 34: j 8 \ 38: li a0, 0 \------------------------------ 3c: li a5, 0 40: ld s0, 8(sp) 44: addi sp, sp, 16 48: sext.w a0, a5 4c: ret There are 7 extra instructions added in RISC-V. Fixes: 800834285361 ("bpf, arm64: Add BPF exception tables") Reported-by: Breno Leitao Suggested-by: Alexei Starovoitov Acked-by: Ilya Leoshkevich Signed-off-by: Puranjay Mohan Link: https://lore.kernel.org/r/20240424100210.11982-2-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/core.c | 9 +++++++++ kernel/bpf/verifier.c | 30 ++++++++++++++++++++++++++++++ 2 files changed, 39 insertions(+) (limited to 'kernel/bpf') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 696bc55de8e8..1ea5ce5bb599 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2942,6 +2942,15 @@ bool __weak bpf_jit_supports_arena(void) return false; } +u64 __weak bpf_arch_uaddress_limit(void) +{ +#if defined(CONFIG_64BIT) && defined(CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE) + return TASK_SIZE; +#else + return 0; +#endif +} + /* Return TRUE if the JIT backend satisfies the following two conditions: * 1) JIT backend supports atomic_xchg() on pointer-sized words. * 2) Under the specific arch, the implementation of xchg() is the same diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index aa24beb65393..cb7ad1f795e1 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -19675,6 +19675,36 @@ static int do_misc_fixups(struct bpf_verifier_env *env) goto next_insn; } + /* Make it impossible to de-reference a userspace address */ + if (BPF_CLASS(insn->code) == BPF_LDX && + (BPF_MODE(insn->code) == BPF_PROBE_MEM || + BPF_MODE(insn->code) == BPF_PROBE_MEMSX)) { + struct bpf_insn *patch = &insn_buf[0]; + u64 uaddress_limit = bpf_arch_uaddress_limit(); + + if (!uaddress_limit) + goto next_insn; + + *patch++ = BPF_MOV64_REG(BPF_REG_AX, insn->src_reg); + if (insn->off) + *patch++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_AX, insn->off); + *patch++ = BPF_ALU64_IMM(BPF_RSH, BPF_REG_AX, 32); + *patch++ = BPF_JMP_IMM(BPF_JLE, BPF_REG_AX, uaddress_limit >> 32, 2); + *patch++ = *insn; + *patch++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1); + *patch++ = BPF_MOV64_IMM(insn->dst_reg, 0); + + cnt = patch - insn_buf; + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; + goto next_insn; + } + /* Implement LD_ABS and LD_IND with a rewrite, if supported by the program type. */ if (BPF_CLASS(insn->code) == BPF_LD && (BPF_MODE(insn->code) == BPF_ABS || -- cgit v1.2.3 From 0db63c0b86e981a1e97d2596d64ceceba1a5470e Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 26 Apr 2024 17:25:44 -0700 Subject: bpf: Fix verifier assumptions about socket->sk The verifier assumes that 'sk' field in 'struct socket' is valid and non-NULL when 'socket' pointer itself is trusted and non-NULL. That may not be the case when socket was just created and passed to LSM socket_accept hook. Fix this verifier assumption and adjust tests. Reported-by: Liam Wisehart Acked-by: Kumar Kartikeya Dwivedi Fixes: 6fcd486b3a0a ("bpf: Refactor RCU enforcement in the verifier.") Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/r/20240427002544.68803-1-alexei.starovoitov@gmail.com Signed-off-by: Martin KaFai Lau --- kernel/bpf/verifier.c | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 87ff414899cf..5d42db05315e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2368,6 +2368,8 @@ static void mark_btf_ld_reg(struct bpf_verifier_env *env, regs[regno].type = PTR_TO_BTF_ID | flag; regs[regno].btf = btf; regs[regno].btf_id = btf_id; + if (type_may_be_null(flag)) + regs[regno].id = ++env->id_gen; } #define DEF_NOT_SUBREG (0) @@ -5400,8 +5402,6 @@ static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno, */ mark_btf_ld_reg(env, cur_regs(env), value_regno, PTR_TO_BTF_ID, kptr_field->kptr.btf, kptr_field->kptr.btf_id, btf_ld_kptr_type(env, kptr_field)); - /* For mark_ptr_or_null_reg */ - val_reg->id = ++env->id_gen; } else if (class == BPF_STX) { val_reg = reg_state(env, value_regno); if (!register_is_null(val_reg) && @@ -5719,7 +5719,8 @@ static bool is_trusted_reg(const struct bpf_reg_state *reg) return true; /* Types listed in the reg2btf_ids are always trusted */ - if (reg2btf_ids[base_type(reg->type)]) + if (reg2btf_ids[base_type(reg->type)] && + !bpf_type_has_unsafe_modifiers(reg->type)) return true; /* If a register is not referenced, it is trusted if it has the @@ -6339,6 +6340,7 @@ static int bpf_map_direct_read(struct bpf_map *map, int off, int size, u64 *val, #define BTF_TYPE_SAFE_RCU(__type) __PASTE(__type, __safe_rcu) #define BTF_TYPE_SAFE_RCU_OR_NULL(__type) __PASTE(__type, __safe_rcu_or_null) #define BTF_TYPE_SAFE_TRUSTED(__type) __PASTE(__type, __safe_trusted) +#define BTF_TYPE_SAFE_TRUSTED_OR_NULL(__type) __PASTE(__type, __safe_trusted_or_null) /* * Allow list few fields as RCU trusted or full trusted. @@ -6402,7 +6404,7 @@ BTF_TYPE_SAFE_TRUSTED(struct dentry) { struct inode *d_inode; }; -BTF_TYPE_SAFE_TRUSTED(struct socket) { +BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct socket) { struct sock *sk; }; @@ -6437,11 +6439,20 @@ static bool type_is_trusted(struct bpf_verifier_env *env, BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct linux_binprm)); BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct file)); BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct dentry)); - BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct socket)); return btf_nested_type_is_trusted(&env->log, reg, field_name, btf_id, "__safe_trusted"); } +static bool type_is_trusted_or_null(struct bpf_verifier_env *env, + struct bpf_reg_state *reg, + const char *field_name, u32 btf_id) +{ + BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct socket)); + + return btf_nested_type_is_trusted(&env->log, reg, field_name, btf_id, + "__safe_trusted_or_null"); +} + static int check_ptr_to_btf_access(struct bpf_verifier_env *env, struct bpf_reg_state *regs, int regno, int off, int size, @@ -6550,6 +6561,8 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, */ if (type_is_trusted(env, reg, field_name, btf_id)) { flag |= PTR_TRUSTED; + } else if (type_is_trusted_or_null(env, reg, field_name, btf_id)) { + flag |= PTR_TRUSTED | PTR_MAYBE_NULL; } else if (in_rcu_cs(env) && !type_may_be_null(reg->type)) { if (type_is_rcu(env, reg, field_name, btf_id)) { /* ignore __rcu tag and mark it MEM_RCU */ -- cgit v1.2.3 From cb01621b6d91567ac74c8b95e4db731febdbdec3 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 29 Apr 2024 15:13:22 +0300 Subject: bpf: Use struct_size() Use struct_size() instead of hand writing it. This is less verbose and more robust. Signed-off-by: Andy Shevchenko Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20240429121323.3818497-1-andriy.shevchenko@linux.intel.com --- kernel/bpf/core.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 95c7fd093e55..a8ca6dd6e614 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -2455,13 +2456,14 @@ EXPORT_SYMBOL(bpf_empty_prog_array); struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags) { + struct bpf_prog_array *p; + if (prog_cnt) - return kzalloc(sizeof(struct bpf_prog_array) + - sizeof(struct bpf_prog_array_item) * - (prog_cnt + 1), - flags); + p = kzalloc(struct_size(p, items, prog_cnt + 1), flags); + else + p = &bpf_empty_prog_array.hdr; - return &bpf_empty_prog_array.hdr; + return p; } void bpf_prog_array_free(struct bpf_prog_array *progs) -- cgit v1.2.3 From a3034872cd90a6881ad4e10ca6d30e1215a99ada Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 29 Apr 2024 15:00:05 +0300 Subject: bpf: Switch to krealloc_array() Let the krealloc_array() copy the original data and check for a multiplication overflow. Signed-off-by: Andy Shevchenko Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20240429120005.3539116-1-andriy.shevchenko@linux.intel.com --- kernel/bpf/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index a8ca6dd6e614..99b8b1c9a248 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -850,7 +850,7 @@ int bpf_jit_add_poke_descriptor(struct bpf_prog *prog, return -EINVAL; } - tab = krealloc(tab, size * sizeof(*poke), GFP_KERNEL); + tab = krealloc_array(tab, size, sizeof(*poke), GFP_KERNEL); if (!tab) return -ENOMEM; -- cgit v1.2.3 From b98a5c68ccaa94e93b9e898091fe2cf21c1500e6 Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Tue, 30 Apr 2024 12:43:24 +0200 Subject: bpf: Do not walk twice the map on free If someone stores both a timer and a workqueue in a map, on free we would walk it twice. Add a check in array_map_free_timers_wq and free the timers and workqueues if they are present. Fixes: 246331e3f1ea ("bpf: allow struct bpf_wq to be embedded in arraymaps and hashmaps") Signed-off-by: Benjamin Tissoires Signed-off-by: Daniel Borkmann Acked-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/bpf/20240430-bpf-next-v3-1-27afe7f3b17c@kernel.org --- kernel/bpf/arraymap.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 580d07b15471..feabc0193852 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -436,13 +436,14 @@ static void array_map_free_timers_wq(struct bpf_map *map) /* We don't reset or free fields other than timer and workqueue * on uref dropping to zero. */ - if (btf_record_has_field(map->record, BPF_TIMER)) - for (i = 0; i < array->map.max_entries; i++) - bpf_obj_free_timer(map->record, array_map_elem_ptr(array, i)); - - if (btf_record_has_field(map->record, BPF_WORKQUEUE)) - for (i = 0; i < array->map.max_entries; i++) - bpf_obj_free_workqueue(map->record, array_map_elem_ptr(array, i)); + if (btf_record_has_field(map->record, BPF_TIMER | BPF_WORKQUEUE)) { + for (i = 0; i < array->map.max_entries; i++) { + if (btf_record_has_field(map->record, BPF_TIMER)) + bpf_obj_free_timer(map->record, array_map_elem_ptr(array, i)); + if (btf_record_has_field(map->record, BPF_WORKQUEUE)) + bpf_obj_free_workqueue(map->record, array_map_elem_ptr(array, i)); + } + } } /* Called when map->refcnt goes to zero, either from workqueue or from syscall */ -- cgit v1.2.3 From a891711d0166133ec5120615fcf365d9745d82b2 Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Tue, 30 Apr 2024 12:43:25 +0200 Subject: bpf: Do not walk twice the hash map on free If someone stores both a timer and a workqueue in a hash map, on free, we would walk it twice. Add a check in htab_free_malloced_timers_or_wq and free the timers and workqueues if they are present. Fixes: 246331e3f1ea ("bpf: allow struct bpf_wq to be embedded in arraymaps and hashmaps") Signed-off-by: Benjamin Tissoires Signed-off-by: Daniel Borkmann Acked-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/bpf/20240430-bpf-next-v3-2-27afe7f3b17c@kernel.org --- kernel/bpf/hashtab.c | 49 +++++++++++++------------------------------------ 1 file changed, 13 insertions(+), 36 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 0179183c543a..06115f8728e8 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -221,13 +221,11 @@ static bool htab_has_extra_elems(struct bpf_htab *htab) return !htab_is_percpu(htab) && !htab_is_lru(htab); } -static void htab_free_prealloced_timers(struct bpf_htab *htab) +static void htab_free_prealloced_timers_and_wq(struct bpf_htab *htab) { u32 num_entries = htab->map.max_entries; int i; - if (!btf_record_has_field(htab->map.record, BPF_TIMER)) - return; if (htab_has_extra_elems(htab)) num_entries += num_possible_cpus(); @@ -235,27 +233,12 @@ static void htab_free_prealloced_timers(struct bpf_htab *htab) struct htab_elem *elem; elem = get_htab_elem(htab, i); - bpf_obj_free_timer(htab->map.record, elem->key + round_up(htab->map.key_size, 8)); - cond_resched(); - } -} - -static void htab_free_prealloced_wq(struct bpf_htab *htab) -{ - u32 num_entries = htab->map.max_entries; - int i; - - if (!btf_record_has_field(htab->map.record, BPF_WORKQUEUE)) - return; - if (htab_has_extra_elems(htab)) - num_entries += num_possible_cpus(); - - for (i = 0; i < num_entries; i++) { - struct htab_elem *elem; - - elem = get_htab_elem(htab, i); - bpf_obj_free_workqueue(htab->map.record, - elem->key + round_up(htab->map.key_size, 8)); + if (btf_record_has_field(htab->map.record, BPF_TIMER)) + bpf_obj_free_timer(htab->map.record, + elem->key + round_up(htab->map.key_size, 8)); + if (btf_record_has_field(htab->map.record, BPF_WORKQUEUE)) + bpf_obj_free_workqueue(htab->map.record, + elem->key + round_up(htab->map.key_size, 8)); cond_resched(); } } @@ -1515,7 +1498,7 @@ static void delete_all_elements(struct bpf_htab *htab) migrate_enable(); } -static void htab_free_malloced_timers_or_wq(struct bpf_htab *htab, bool is_timer) +static void htab_free_malloced_timers_and_wq(struct bpf_htab *htab) { int i; @@ -1527,10 +1510,10 @@ static void htab_free_malloced_timers_or_wq(struct bpf_htab *htab, bool is_timer hlist_nulls_for_each_entry(l, n, head, hash_node) { /* We only free timer on uref dropping to zero */ - if (is_timer) + if (btf_record_has_field(htab->map.record, BPF_TIMER)) bpf_obj_free_timer(htab->map.record, l->key + round_up(htab->map.key_size, 8)); - else + if (btf_record_has_field(htab->map.record, BPF_WORKQUEUE)) bpf_obj_free_workqueue(htab->map.record, l->key + round_up(htab->map.key_size, 8)); } @@ -1544,17 +1527,11 @@ static void htab_map_free_timers_and_wq(struct bpf_map *map) struct bpf_htab *htab = container_of(map, struct bpf_htab, map); /* We only free timer and workqueue on uref dropping to zero */ - if (btf_record_has_field(htab->map.record, BPF_TIMER)) { - if (!htab_is_prealloc(htab)) - htab_free_malloced_timers_or_wq(htab, true); - else - htab_free_prealloced_timers(htab); - } - if (btf_record_has_field(htab->map.record, BPF_WORKQUEUE)) { + if (btf_record_has_field(htab->map.record, BPF_TIMER | BPF_WORKQUEUE)) { if (!htab_is_prealloc(htab)) - htab_free_malloced_timers_or_wq(htab, false); + htab_free_malloced_timers_and_wq(htab); else - htab_free_prealloced_wq(htab); + htab_free_prealloced_timers_and_wq(htab); } } -- cgit v1.2.3 From 535a3692ba7245792e6f23654507865d4293c850 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 30 Apr 2024 13:28:24 +0200 Subject: bpf: Add support for kprobe session attach Adding support to attach bpf program for entry and return probe of the same function. This is common use case which at the moment requires to create two kprobe multi links. Adding new BPF_TRACE_KPROBE_SESSION attach type that instructs kernel to attach single link program to both entry and exit probe. It's possible to control execution of the bpf program on return probe simply by returning zero or non zero from the entry bpf program execution to execute or not the bpf program on return probe respectively. Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240430112830.1184228-2-jolsa@kernel.org --- kernel/bpf/syscall.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index f655adf42e39..13ad74ecf2cd 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -4016,11 +4016,15 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, if (prog->expected_attach_type == BPF_TRACE_KPROBE_MULTI && attach_type != BPF_TRACE_KPROBE_MULTI) return -EINVAL; + if (prog->expected_attach_type == BPF_TRACE_KPROBE_SESSION && + attach_type != BPF_TRACE_KPROBE_SESSION) + return -EINVAL; if (prog->expected_attach_type == BPF_TRACE_UPROBE_MULTI && attach_type != BPF_TRACE_UPROBE_MULTI) return -EINVAL; if (attach_type != BPF_PERF_EVENT && attach_type != BPF_TRACE_KPROBE_MULTI && + attach_type != BPF_TRACE_KPROBE_SESSION && attach_type != BPF_TRACE_UPROBE_MULTI) return -EINVAL; return 0; @@ -5281,7 +5285,8 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) case BPF_PROG_TYPE_KPROBE: if (attr->link_create.attach_type == BPF_PERF_EVENT) ret = bpf_perf_link_attach(attr, prog); - else if (attr->link_create.attach_type == BPF_TRACE_KPROBE_MULTI) + else if (attr->link_create.attach_type == BPF_TRACE_KPROBE_MULTI || + attr->link_create.attach_type == BPF_TRACE_KPROBE_SESSION) ret = bpf_kprobe_multi_link_attach(attr, prog); else if (attr->link_create.attach_type == BPF_TRACE_UPROBE_MULTI) ret = bpf_uprobe_multi_link_attach(attr, prog); -- cgit v1.2.3 From adf46d88ae4b2557f7e2e02547a25fb866935711 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 30 Apr 2024 13:28:25 +0200 Subject: bpf: Add support for kprobe session context Adding struct bpf_session_run_ctx object to hold session related data, which is atm is_return bool and data pointer coming in following changes. Placing bpf_session_run_ctx layer in between bpf_run_ctx and bpf_kprobe_multi_run_ctx so the session data can be retrieved regardless of if it's kprobe_multi or uprobe_multi link, which support is coming in future. This way both kprobe_multi and uprobe_multi can use same kfuncs to access the session data. Adding bpf_session_is_return kfunc that returns true if the bpf program is executed from the exit probe of the kprobe multi link attached in wrapper mode. It returns false otherwise. Adding new kprobe hook for kprobe program type. Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240430112830.1184228-3-jolsa@kernel.org --- kernel/bpf/btf.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel/bpf') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 8291fbfd27b1..821063660d9f 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -218,6 +218,7 @@ enum btf_kfunc_hook { BTF_KFUNC_HOOK_SOCKET_FILTER, BTF_KFUNC_HOOK_LWT, BTF_KFUNC_HOOK_NETFILTER, + BTF_KFUNC_HOOK_KPROBE, BTF_KFUNC_HOOK_MAX, }; @@ -8157,6 +8158,8 @@ static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type) return BTF_KFUNC_HOOK_LWT; case BPF_PROG_TYPE_NETFILTER: return BTF_KFUNC_HOOK_NETFILTER; + case BPF_PROG_TYPE_KPROBE: + return BTF_KFUNC_HOOK_KPROBE; default: return BTF_KFUNC_HOOK_MAX; } -- cgit v1.2.3 From 5c919acef85147886eb2abf86fb147f94680a8b0 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 30 Apr 2024 13:28:26 +0200 Subject: bpf: Add support for kprobe session cookie Adding support for cookie within the session of kprobe multi entry and return program. The session cookie is u64 value and can be retrieved be new kfunc bpf_session_cookie, which returns pointer to the cookie value. The bpf program can use the pointer to store (on entry) and load (on return) the value. The cookie value is implemented via fprobe feature that allows to share values between entry and return ftrace fprobe callbacks. Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240430112830.1184228-4-jolsa@kernel.org --- kernel/bpf/verifier.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel/bpf') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 5d42db05315e..7360f04f9ec7 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -11063,6 +11063,7 @@ enum special_kfunc_type { KF_bpf_preempt_disable, KF_bpf_preempt_enable, KF_bpf_iter_css_task_new, + KF_bpf_session_cookie, }; BTF_SET_START(special_kfunc_set) @@ -11123,6 +11124,7 @@ BTF_ID(func, bpf_iter_css_task_new) #else BTF_ID_UNUSED #endif +BTF_ID(func, bpf_session_cookie) static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta) { @@ -12294,6 +12296,11 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, } } + if (meta.func_id == special_kfunc_list[KF_bpf_session_cookie]) { + meta.r0_size = sizeof(u64); + meta.r0_rdonly = false; + } + if (is_bpf_wq_set_callback_impl_kfunc(meta.func_id)) { err = push_callback_call(env, insn, insn_idx, meta.subprogno, set_timer_callback_state); -- cgit v1.2.3 From 543576ec15b17c0c93301ac8297333c7b6e84ac7 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Fri, 26 Apr 2024 16:16:18 -0700 Subject: bpf: Add BPF_PROG_TYPE_CGROUP_SKB attach type enforcement in BPF_LINK_CREATE bpf_prog_attach uses attach_type_to_prog_type to enforce proper attach type for BPF_PROG_TYPE_CGROUP_SKB. link_create uses bpf_prog_get and relies on bpf_prog_attach_check_attach_type to properly verify prog_type <> attach_type association. Add missing attach_type enforcement for the link_create case. Otherwise, it's currently possible to attach cgroup_skb prog types to other cgroup hooks. Fixes: af6eea57437a ("bpf: Implement bpf_link-based cgroup BPF program attachment") Link: https://lore.kernel.org/bpf/0000000000004792a90615a1dde0@google.com/ Reported-by: syzbot+838346b979830606c854@syzkaller.appspotmail.com Signed-off-by: Stanislav Fomichev Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20240426231621.2716876-2-sdf@google.com Signed-off-by: Martin KaFai Lau --- kernel/bpf/syscall.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel/bpf') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index c287925471f6..cb61d8880dbe 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3985,6 +3985,11 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, * check permissions at attach time. */ return -EPERM; + + ptype = attach_type_to_prog_type(attach_type); + if (prog->type != ptype) + return -EINVAL; + return prog->enforce_expected_attach_type && prog->expected_attach_type != attach_type ? -EINVAL : 0; -- cgit v1.2.3 From ac2f438c2a85acd07e0ac7dc2f69d45bda1bb498 Mon Sep 17 00:00:00 2001 From: Vadim Fedorenko Date: Wed, 1 May 2024 10:01:30 -0700 Subject: bpf: crypto: fix build when CONFIG_CRYPTO=m Crypto subsytem can be build as a module. In this case we still have to build BPF crypto framework otherwise the build will fail. Fixes: 3e1c6f35409f ("bpf: make common crypto API for TC/XDP programs") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202405011634.4JK40epY-lkp@intel.com/ Signed-off-by: Vadim Fedorenko Link: https://lore.kernel.org/r/20240501170130.1682309-1-vadfed@meta.com Signed-off-by: Martin KaFai Lau --- kernel/bpf/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 85786fd97d2a..7eb9ad3a3ae6 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -44,7 +44,7 @@ obj-$(CONFIG_BPF_SYSCALL) += bpf_struct_ops.o obj-$(CONFIG_BPF_SYSCALL) += cpumask.o obj-${CONFIG_BPF_LSM} += bpf_lsm.o endif -ifeq ($(CONFIG_CRYPTO),y) +ifneq ($(CONFIG_CRYPTO),) obj-$(CONFIG_BPF_SYSCALL) += crypto.o endif obj-$(CONFIG_BPF_PRELOAD) += preload/ -- cgit v1.2.3 From d786957ebd3fb4cfd9147dbcccd1e8f3871b45ce Mon Sep 17 00:00:00 2001 From: Cupertino Miranda Date: Mon, 6 May 2024 15:18:44 +0100 Subject: bpf/verifier: replace calls to mark_reg_unknown. In order to further simplify the code in adjust_scalar_min_max_vals all the calls to mark_reg_unknown are replaced by __mark_reg_unknown. static void mark_reg_unknown(struct bpf_verifier_env *env, struct bpf_reg_state *regs, u32 regno) { if (WARN_ON(regno >= MAX_BPF_REG)) { ... mark all regs not init ... return; } __mark_reg_unknown(env, regs + regno); } The 'regno >= MAX_BPF_REG' does not apply to adjust_scalar_min_max_vals(), because it is only called from the following stack: - check_alu_op - adjust_reg_min_max_vals - adjust_scalar_min_max_vals The check_alu_op() does check_reg_arg() which verifies that both src and dst register numbers are within bounds. Signed-off-by: Cupertino Miranda Acked-by: Eduard Zingerman Cc: Yonghong Song Cc: Alexei Starovoitov Cc: David Faust Cc: Jose Marchesi Cc: Elena Zannoni Cc: Andrii Nakryiko Link: https://lore.kernel.org/r/20240506141849.185293-2-cupertino.miranda@oracle.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 7360f04f9ec7..41c66cc6db80 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -13887,7 +13887,6 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, struct bpf_reg_state *dst_reg, struct bpf_reg_state src_reg) { - struct bpf_reg_state *regs = cur_regs(env); u8 opcode = BPF_OP(insn->code); bool src_known; s64 smin_val, smax_val; @@ -13994,7 +13993,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, /* Shifts greater than 31 or 63 are undefined. * This includes shifts by a negative number. */ - mark_reg_unknown(env, regs, insn->dst_reg); + __mark_reg_unknown(env, dst_reg); break; } if (alu32) @@ -14007,7 +14006,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, /* Shifts greater than 31 or 63 are undefined. * This includes shifts by a negative number. */ - mark_reg_unknown(env, regs, insn->dst_reg); + __mark_reg_unknown(env, dst_reg); break; } if (alu32) @@ -14020,7 +14019,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, /* Shifts greater than 31 or 63 are undefined. * This includes shifts by a negative number. */ - mark_reg_unknown(env, regs, insn->dst_reg); + __mark_reg_unknown(env, dst_reg); break; } if (alu32) @@ -14029,7 +14028,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, scalar_min_max_arsh(dst_reg, &src_reg); break; default: - mark_reg_unknown(env, regs, insn->dst_reg); + __mark_reg_unknown(env, dst_reg); break; } -- cgit v1.2.3 From 0922c78f592c60e5a8fe6ab968479def124d4ff3 Mon Sep 17 00:00:00 2001 From: Cupertino Miranda Date: Mon, 6 May 2024 15:18:45 +0100 Subject: bpf/verifier: refactor checks for range computation Split range computation checks in its own function, isolating pessimitic range set for dst_reg and failing return to a single point. Signed-off-by: Cupertino Miranda Acked-by: Eduard Zingerman Cc: Yonghong Song Cc: Alexei Starovoitov Cc: David Faust Cc: Jose Marchesi Cc: Elena Zannoni Cc: Andrii Nakryiko bpf/verifier: improve code after range computation recent changes. Link: https://lore.kernel.org/r/20240506141849.185293-3-cupertino.miranda@oracle.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 109 +++++++++++++++++++++----------------------------- 1 file changed, 45 insertions(+), 64 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 41c66cc6db80..bdaf0413bf06 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -13878,6 +13878,50 @@ static void scalar_min_max_arsh(struct bpf_reg_state *dst_reg, __update_reg_bounds(dst_reg); } +static bool is_safe_to_compute_dst_reg_range(struct bpf_insn *insn, + const struct bpf_reg_state *src_reg) +{ + bool src_is_const = false; + u64 insn_bitness = (BPF_CLASS(insn->code) == BPF_ALU64) ? 64 : 32; + + if (insn_bitness == 32) { + if (tnum_subreg_is_const(src_reg->var_off) + && src_reg->s32_min_value == src_reg->s32_max_value + && src_reg->u32_min_value == src_reg->u32_max_value) + src_is_const = true; + } else { + if (tnum_is_const(src_reg->var_off) + && src_reg->smin_value == src_reg->smax_value + && src_reg->umin_value == src_reg->umax_value) + src_is_const = true; + } + + switch (BPF_OP(insn->code)) { + case BPF_ADD: + case BPF_SUB: + case BPF_AND: + return true; + + /* Compute range for the following only if the src_reg is const. + */ + case BPF_XOR: + case BPF_OR: + case BPF_MUL: + return src_is_const; + + /* Shift operators range is only computable if shift dimension operand + * is a constant. Shifts greater than 31 or 63 are undefined. This + * includes shifts by a negative number. + */ + case BPF_LSH: + case BPF_RSH: + case BPF_ARSH: + return (src_is_const && src_reg->umax_value < insn_bitness); + default: + return false; + } +} + /* WARNING: This function does calculations on 64-bit values, but the actual * execution may occur on 32-bit values. Therefore, things like bitshifts * need extra checks in the 32-bit case. @@ -13888,51 +13932,10 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, struct bpf_reg_state src_reg) { u8 opcode = BPF_OP(insn->code); - bool src_known; - s64 smin_val, smax_val; - u64 umin_val, umax_val; - s32 s32_min_val, s32_max_val; - u32 u32_min_val, u32_max_val; - u64 insn_bitness = (BPF_CLASS(insn->code) == BPF_ALU64) ? 64 : 32; bool alu32 = (BPF_CLASS(insn->code) != BPF_ALU64); int ret; - smin_val = src_reg.smin_value; - smax_val = src_reg.smax_value; - umin_val = src_reg.umin_value; - umax_val = src_reg.umax_value; - - s32_min_val = src_reg.s32_min_value; - s32_max_val = src_reg.s32_max_value; - u32_min_val = src_reg.u32_min_value; - u32_max_val = src_reg.u32_max_value; - - if (alu32) { - src_known = tnum_subreg_is_const(src_reg.var_off); - if ((src_known && - (s32_min_val != s32_max_val || u32_min_val != u32_max_val)) || - s32_min_val > s32_max_val || u32_min_val > u32_max_val) { - /* Taint dst register if offset had invalid bounds - * derived from e.g. dead branches. - */ - __mark_reg_unknown(env, dst_reg); - return 0; - } - } else { - src_known = tnum_is_const(src_reg.var_off); - if ((src_known && - (smin_val != smax_val || umin_val != umax_val)) || - smin_val > smax_val || umin_val > umax_val) { - /* Taint dst register if offset had invalid bounds - * derived from e.g. dead branches. - */ - __mark_reg_unknown(env, dst_reg); - return 0; - } - } - - if (!src_known && - opcode != BPF_ADD && opcode != BPF_SUB && opcode != BPF_AND) { + if (!is_safe_to_compute_dst_reg_range(insn, &src_reg)) { __mark_reg_unknown(env, dst_reg); return 0; } @@ -13989,46 +13992,24 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, scalar_min_max_xor(dst_reg, &src_reg); break; case BPF_LSH: - if (umax_val >= insn_bitness) { - /* Shifts greater than 31 or 63 are undefined. - * This includes shifts by a negative number. - */ - __mark_reg_unknown(env, dst_reg); - break; - } if (alu32) scalar32_min_max_lsh(dst_reg, &src_reg); else scalar_min_max_lsh(dst_reg, &src_reg); break; case BPF_RSH: - if (umax_val >= insn_bitness) { - /* Shifts greater than 31 or 63 are undefined. - * This includes shifts by a negative number. - */ - __mark_reg_unknown(env, dst_reg); - break; - } if (alu32) scalar32_min_max_rsh(dst_reg, &src_reg); else scalar_min_max_rsh(dst_reg, &src_reg); break; case BPF_ARSH: - if (umax_val >= insn_bitness) { - /* Shifts greater than 31 or 63 are undefined. - * This includes shifts by a negative number. - */ - __mark_reg_unknown(env, dst_reg); - break; - } if (alu32) scalar32_min_max_arsh(dst_reg, &src_reg); else scalar_min_max_arsh(dst_reg, &src_reg); break; default: - __mark_reg_unknown(env, dst_reg); break; } -- cgit v1.2.3 From 138cc42c05d11fd5ee82ee1606d2c9823373a926 Mon Sep 17 00:00:00 2001 From: Cupertino Miranda Date: Mon, 6 May 2024 15:18:46 +0100 Subject: bpf/verifier: improve XOR and OR range computation Range for XOR and OR operators would not be attempted unless src_reg would resolve to a single value, i.e. a known constant value. This condition is unnecessary, and the following XOR/OR operator handling could compute a possible better range. Acked-by: Eduard Zingerman Signed-off-by: Cupertino Miranda Cc: Yonghong Song Cc: Alexei Starovoitov Cc: David Faust Cc: Jose Marchesi Cc: Elena Zannoni Cc: Andrii Nakryiko Link: https://lore.kernel.org/r/20240506141849.185293-4-cupertino.miranda@oracle.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index bdaf0413bf06..1f6deb3e44c5 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -13900,12 +13900,12 @@ static bool is_safe_to_compute_dst_reg_range(struct bpf_insn *insn, case BPF_ADD: case BPF_SUB: case BPF_AND: + case BPF_XOR: + case BPF_OR: return true; /* Compute range for the following only if the src_reg is const. */ - case BPF_XOR: - case BPF_OR: case BPF_MUL: return src_is_const; -- cgit v1.2.3 From 41d047a871062f1a4d1871a1908d380c14e75428 Mon Sep 17 00:00:00 2001 From: Cupertino Miranda Date: Mon, 6 May 2024 15:18:48 +0100 Subject: bpf/verifier: relax MUL range computation check MUL instruction required that src_reg would be a known value (i.e. src_reg would be a const value). The condition in this case can be relaxed, since the range computation algorithm used in current code already supports a proper range computation for any valid range value on its operands. Signed-off-by: Cupertino Miranda Acked-by: Eduard Zingerman Acked-by: Andrii Nakryiko Cc: Yonghong Song Cc: Alexei Starovoitov Cc: David Faust Cc: Jose Marchesi Cc: Elena Zannoni Link: https://lore.kernel.org/r/20240506141849.185293-6-cupertino.miranda@oracle.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 1f6deb3e44c5..9e3aba08984e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -13902,12 +13902,8 @@ static bool is_safe_to_compute_dst_reg_range(struct bpf_insn *insn, case BPF_AND: case BPF_XOR: case BPF_OR: - return true; - - /* Compute range for the following only if the src_reg is const. - */ case BPF_MUL: - return src_is_const; + return true; /* Shift operators range is only computable if shift dimension operand * is a constant. Shifts greater than 31 or 63 are undefined. This -- cgit v1.2.3 From 75b0fbf15d8466be618a997cae774eef445c0c7d Mon Sep 17 00:00:00 2001 From: Haiyue Wang Date: Tue, 7 May 2024 14:33:39 +0800 Subject: bpf: Remove redundant page mask of vmf->address As the comment described in "struct vm_fault": ".address" : 'Faulting virtual address - masked' ".real_address" : 'Faulting virtual address - unmasked' The link [1] said: "Whatever the routes, all architectures end up to the invocation of handle_mm_fault() which, in turn, (likely) ends up calling __handle_mm_fault() to carry out the actual work of allocating the page tables." __handle_mm_fault() does address assignment: .address = address & PAGE_MASK, .real_address = address, This is debug dump by running `./test_progs -a "*arena*"`: [ 69.767494] arena fault: vmf->address = 10000001d000, vmf->real_address = 10000001d008 [ 69.767496] arena fault: vmf->address = 10000001c000, vmf->real_address = 10000001c008 [ 69.767499] arena fault: vmf->address = 10000001b000, vmf->real_address = 10000001b008 [ 69.767501] arena fault: vmf->address = 10000001a000, vmf->real_address = 10000001a008 [ 69.767504] arena fault: vmf->address = 100000019000, vmf->real_address = 100000019008 [ 69.769388] arena fault: vmf->address = 10000001e000, vmf->real_address = 10000001e1e8 So we can use the value of 'vmf->address' to do BPF arena kernel address space cast directly. [1] https://docs.kernel.org/mm/page_tables.html Signed-off-by: Haiyue Wang Link: https://lore.kernel.org/r/20240507063358.8048-1-haiyue.wang@intel.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/arena.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c index 6c81630c5293..f5953f1a95cd 100644 --- a/kernel/bpf/arena.c +++ b/kernel/bpf/arena.c @@ -251,7 +251,7 @@ static vm_fault_t arena_vm_fault(struct vm_fault *vmf) int ret; kbase = bpf_arena_get_kern_vm_start(arena); - kaddr = kbase + (u32)(vmf->address & PAGE_MASK); + kaddr = kbase + (u32)(vmf->address); guard(mutex)(&arena->lock); page = vmalloc_to_page((void *)kaddr); -- cgit v1.2.3 From 2ddec2c80b4402c293c7e6e0881cecaaf77e8cec Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Thu, 2 May 2024 15:18:52 +0000 Subject: riscv, bpf: inline bpf_get_smp_processor_id() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Inline the calls to bpf_get_smp_processor_id() in the riscv bpf jit. RISCV saves the pointer to the CPU's task_struct in the TP (thread pointer) register. This makes it trivial to get the CPU's processor id. As thread_info is the first member of task_struct, we can read the processor id from TP + offsetof(struct thread_info, cpu). RISCV64 JIT output for `call bpf_get_smp_processor_id` ====================================================== Before After -------- ------- auipc t1,0x848c ld a5,32(tp) jalr 604(t1) mv a5,a0 Benchmark using [1] on Qemu. ./benchs/run_bench_trigger.sh glob-arr-inc arr-inc hash-inc +---------------+------------------+------------------+--------------+ | Name | Before | After | % change | |---------------+------------------+------------------+--------------| | glob-arr-inc | 1.077 ± 0.006M/s | 1.336 ± 0.010M/s | + 24.04% | | arr-inc | 1.078 ± 0.002M/s | 1.332 ± 0.015M/s | + 23.56% | | hash-inc | 0.494 ± 0.004M/s | 0.653 ± 0.001M/s | + 32.18% | +---------------+------------------+------------------+--------------+ NOTE: This benchmark includes changes from this patch and the previous patch that implemented the per-cpu insn. [1] https://github.com/anakryiko/linux/commit/8dec900975ef Signed-off-by: Puranjay Mohan Acked-by: Kumar Kartikeya Dwivedi Acked-by: Andrii Nakryiko Acked-by: Björn Töpel Link: https://lore.kernel.org/r/20240502151854.9810-3-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/core.c | 11 +++++++++++ kernel/bpf/verifier.c | 4 ++++ 2 files changed, 15 insertions(+) (limited to 'kernel/bpf') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 99b8b1c9a248..aa59af9f9bd9 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2941,6 +2941,17 @@ bool __weak bpf_jit_needs_zext(void) return false; } +/* Return true if the JIT inlines the call to the helper corresponding to + * the imm. + * + * The verifier will not patch the insn->imm for the call to the helper if + * this returns true. + */ +bool __weak bpf_jit_inlines_helper_call(s32 imm) +{ + return false; +} + /* Return TRUE if the JIT backend supports mixing bpf2bpf and tailcalls. */ bool __weak bpf_jit_supports_subprog_tailcalls(void) { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 9e3aba08984e..1658ca4136a3 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -19996,6 +19996,10 @@ static int do_misc_fixups(struct bpf_verifier_env *env) goto next_insn; } + /* Skip inlining the helper call if the JIT does it. */ + if (bpf_jit_inlines_helper_call(insn->imm)) + goto next_insn; + if (insn->imm == BPF_FUNC_get_route_realm) prog->dst_needed = 1; if (insn->imm == BPF_FUNC_get_prandom_u32) -- cgit v1.2.3 From 12af2b83d0b17ec8b379b721dd4a8fbcd5d791f3 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (IBM)" Date: Sun, 5 May 2024 19:06:18 +0300 Subject: mm: introduce execmem_alloc() and execmem_free() module_alloc() is used everywhere as a mean to allocate memory for code. Beside being semantically wrong, this unnecessarily ties all subsystems that need to allocate code, such as ftrace, kprobes and BPF to modules and puts the burden of code allocation to the modules code. Several architectures override module_alloc() because of various constraints where the executable memory can be located and this causes additional obstacles for improvements of code allocation. Start splitting code allocation from modules by introducing execmem_alloc() and execmem_free() APIs. Initially, execmem_alloc() is a wrapper for module_alloc() and execmem_free() is a replacement of module_memfree() to allow updating all call sites to use the new APIs. Since architectures define different restrictions on placement, permissions, alignment and other parameters for memory that can be used by different subsystems that allocate executable memory, execmem_alloc() takes a type argument, that will be used to identify the calling subsystem and to allow architectures define parameters for ranges suitable for that subsystem. No functional changes. Signed-off-by: Mike Rapoport (IBM) Acked-by: Masami Hiramatsu (Google) Acked-by: Song Liu Acked-by: Steven Rostedt (Google) Signed-off-by: Luis Chamberlain --- kernel/bpf/core.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 1ea5ce5bb599..892e50afda59 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -22,7 +22,6 @@ #include #include #include -#include #include #include #include @@ -37,6 +36,7 @@ #include #include #include +#include #include #include @@ -1050,12 +1050,12 @@ void bpf_jit_uncharge_modmem(u32 size) void *__weak bpf_jit_alloc_exec(unsigned long size) { - return module_alloc(size); + return execmem_alloc(EXECMEM_BPF, size); } void __weak bpf_jit_free_exec(void *addr) { - module_memfree(addr); + execmem_free(addr); } struct bpf_binary_header * -- cgit v1.2.3 From 2c9e5d4a008293407836d29d35dfd4353615bd2f Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (IBM)" Date: Sun, 5 May 2024 19:06:28 +0300 Subject: bpf: remove CONFIG_BPF_JIT dependency on CONFIG_MODULES of MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit BPF just-in-time compiler depended on CONFIG_MODULES because it used module_alloc() to allocate memory for the generated code. Since code allocations are now implemented with execmem, drop dependency of CONFIG_BPF_JIT on CONFIG_MODULES and make it select CONFIG_EXECMEM. Suggested-by: Björn Töpel Signed-off-by: Mike Rapoport (IBM) Signed-off-by: Luis Chamberlain --- kernel/bpf/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/Kconfig b/kernel/bpf/Kconfig index 4100df44c665..17067dcb4386 100644 --- a/kernel/bpf/Kconfig +++ b/kernel/bpf/Kconfig @@ -43,7 +43,7 @@ config BPF_JIT bool "Enable BPF Just In Time compiler" depends on BPF depends on HAVE_CBPF_JIT || HAVE_EBPF_JIT - depends on MODULES + select EXECMEM help BPF programs are normally handled by a BPF interpreter. This option allows the kernel to generate native code when a program is loaded -- cgit v1.2.3 From 9ee98229083186837199912a7debb666146b8c17 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 14 May 2024 23:24:39 -0700 Subject: bpf: save extended inner map info for percpu array maps as well ARRAY_OF_MAPS and HASH_OF_MAPS map types have special logic to save a few extra fields required for correct operations of ARRAY maps, when they are used as inner maps. PERCPU_ARRAY maps have similar requirements as they now support generating inline element lookup logic. So make sure that both classes of maps are handled correctly. Reported-by: Jakub Kicinski Fixes: db69718b8efa ("bpf: inline bpf_map_lookup_elem() for PERCPU_ARRAY maps") Signed-off-by: Andrii Nakryiko Acked-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20240515062440.846086-1-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/map_in_map.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c index 8ef269e66ba5..b4f18c85d7bc 100644 --- a/kernel/bpf/map_in_map.c +++ b/kernel/bpf/map_in_map.c @@ -32,7 +32,7 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) inner_map_meta_size = sizeof(*inner_map_meta); /* In some cases verifier needs to access beyond just base map. */ - if (inner_map->ops == &array_map_ops) + if (inner_map->ops == &array_map_ops || inner_map->ops == &percpu_array_map_ops) inner_map_meta_size = sizeof(struct bpf_array); inner_map_meta = kzalloc(inner_map_meta_size, GFP_USER); @@ -68,7 +68,7 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) /* Misc members not needed in bpf_map_meta_equal() check. */ inner_map_meta->ops = inner_map->ops; - if (inner_map->ops == &array_map_ops) { + if (inner_map->ops == &array_map_ops || inner_map->ops == &percpu_array_map_ops) { struct bpf_array *inner_array_meta = container_of(inner_map_meta, struct bpf_array, map); struct bpf_array *inner_array = container_of(inner_map, struct bpf_array, map); -- cgit v1.2.3