From 2b986b9e917bc88f81aa1ed386af63b26c983f1d Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 14 Aug 2025 20:24:37 +0200 Subject: bpf, cpumap: Disable page_pool direct xdp_return need larger scope When running an XDP bpf_prog on the remote CPU in cpumap code then we must disable the direct return optimization that xdp_return can perform for mem_type page_pool. This optimization assumes code is still executing under RX-NAPI of the original receiving CPU, which isn't true on this remote CPU. The cpumap code already disabled this via helpers xdp_set_return_frame_no_direct() and xdp_clear_return_frame_no_direct(), but the scope didn't include xdp_do_flush(). When doing XDP_REDIRECT towards e.g devmap this causes the function bq_xmit_all() to run with direct return optimization enabled. This can lead to hard to find bugs. The issue only happens when bq_xmit_all() cannot ndo_xdp_xmit all frames and them frees them via xdp_return_frame_rx_napi(). Fix by expanding scope to include xdp_do_flush(). This was found by Dragos Tatulea. Fixes: 11941f8a8536 ("bpf: cpumap: Implement generic cpumap") Reported-by: Dragos Tatulea Reported-by: Chris Arges Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann Tested-by: Chris Arges Link: https://patch.msgid.link/175519587755.3008742.1088294435150406835.stgit@firesoul --- kernel/bpf/cpumap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index b2b7b8ec2c2a..c46360b27871 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -186,7 +186,6 @@ static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu, struct xdp_buff xdp; int i, nframes = 0; - xdp_set_return_frame_no_direct(); xdp.rxq = &rxq; for (i = 0; i < n; i++) { @@ -231,7 +230,6 @@ static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu, } } - xdp_clear_return_frame_no_direct(); stats->pass += nframes; return nframes; @@ -255,6 +253,7 @@ static void cpu_map_bpf_prog_run(struct bpf_cpu_map_entry *rcpu, void **frames, rcu_read_lock(); bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); + xdp_set_return_frame_no_direct(); ret->xdp_n = cpu_map_bpf_prog_run_xdp(rcpu, frames, ret->xdp_n, stats); if (unlikely(ret->skb_n)) @@ -264,6 +263,7 @@ static void cpu_map_bpf_prog_run(struct bpf_cpu_map_entry *rcpu, void **frames, if (stats->redirect) xdp_do_flush(); + xdp_clear_return_frame_no_direct(); bpf_net_ctx_clear(bpf_net_ctx); rcu_read_unlock(); -- cgit v1.2.3 From e4414b01c1cd9887bbde92f946c1ba94e40d6d64 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 14 Aug 2025 22:06:55 +0200 Subject: bpf: Check the helper function is valid in get_helper_proto kernel test robot reported verifier bug [1] where the helper func pointer could be NULL due to disabled config option. As Alexei suggested we could check on that in get_helper_proto directly. Marking tail_call helper func with BPF_PTR_POISON, because it is unused by design. [1] https://lore.kernel.org/oe-lkp/202507160818.68358831-lkp@intel.com Reported-by: kernel test robot Reported-by: syzbot+a9ed3d9132939852d0df@syzkaller.appspotmail.com Suggested-by: Alexei Starovoitov Signed-off-by: Jiri Olsa Signed-off-by: Daniel Borkmann Acked-by: Paul Chaignon Acked-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20250814200655.945632-1-jolsa@kernel.org Closes: https://lore.kernel.org/oe-lkp/202507160818.68358831-lkp@intel.com --- kernel/bpf/core.c | 5 ++++- kernel/bpf/verifier.c | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 5d1650af899d..f8ac77d08ca7 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -3024,7 +3024,10 @@ EXPORT_SYMBOL_GPL(bpf_event_output); /* Always built-in helper functions. */ const struct bpf_func_proto bpf_tail_call_proto = { - .func = NULL, + /* func is unused for tail_call, we set it to pass the + * get_helper_proto check + */ + .func = BPF_PTR_POISON, .gpl_only = false, .ret_type = RET_VOID, .arg1_type = ARG_PTR_TO_CTX, diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c4f69a9e9af6..c89e2b1bc644 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -11354,7 +11354,7 @@ static int get_helper_proto(struct bpf_verifier_env *env, int func_id, return -EINVAL; *ptr = env->ops->get_func_proto(func_id, env->prog); - return *ptr ? 0 : -EINVAL; + return *ptr && (*ptr)->func ? 0 : -EINVAL; } static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn, -- cgit v1.2.3 From f9bb6ffa7f5ad0f8ee0f53fc4a10655872ee4a14 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 29 Aug 2025 16:36:56 +0200 Subject: bpf: Fix out-of-bounds dynptr write in bpf_crypto_crypt Stanislav reported that in bpf_crypto_crypt() the destination dynptr's size is not validated to be at least as large as the source dynptr's size before calling into the crypto backend with 'len = src_len'. This can result in an OOB write when the destination is smaller than the source. Concretely, in mentioned function, psrc and pdst are both linear buffers fetched from each dynptr: psrc = __bpf_dynptr_data(src, src_len); [...] pdst = __bpf_dynptr_data_rw(dst, dst_len); [...] err = decrypt ? ctx->type->decrypt(ctx->tfm, psrc, pdst, src_len, piv) : ctx->type->encrypt(ctx->tfm, psrc, pdst, src_len, piv); The crypto backend expects pdst to be large enough with a src_len length that can be written. Add an additional src_len > dst_len check and bail out if it's the case. Note that these kfuncs are accessible under root privileges only. Fixes: 3e1c6f35409f ("bpf: make common crypto API for TC/XDP programs") Reported-by: Stanislav Fort Signed-off-by: Daniel Borkmann Cc: Vadim Fedorenko Reviewed-by: Vadim Fedorenko Link: https://lore.kernel.org/r/20250829143657.318524-1-daniel@iogearbox.net Signed-off-by: Alexei Starovoitov --- kernel/bpf/crypto.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/crypto.c b/kernel/bpf/crypto.c index 94854cd9c4cc..83c4d9943084 100644 --- a/kernel/bpf/crypto.c +++ b/kernel/bpf/crypto.c @@ -278,7 +278,7 @@ static int bpf_crypto_crypt(const struct bpf_crypto_ctx *ctx, siv_len = siv ? __bpf_dynptr_size(siv) : 0; src_len = __bpf_dynptr_size(src); dst_len = __bpf_dynptr_size(dst); - if (!src_len || !dst_len) + if (!src_len || !dst_len || src_len > dst_len) return -EINVAL; if (siv_len != ctx->siv_len) -- cgit v1.2.3 From 7edfc024708258d75f65fadffd7e5f6ac46810b6 Mon Sep 17 00:00:00 2001 From: Rong Tao Date: Sat, 30 Aug 2025 00:31:58 +0800 Subject: bpf: Fix bpf_strnstr() to handle suffix match cases better bpf_strnstr() should not treat the ending '\0' of s2 as a matching character if the parameter 'len' equal to s2 string length, for example: 1. bpf_strnstr("openat", "open", 4) = -ENOENT 2. bpf_strnstr("openat", "open", 5) = 0 This patch makes (1) return 0, fix just the `len == strlen(s2)` case. And fix a more general case when s2 is a suffix of the first len characters of s1. Fixes: e91370550f1f ("bpf: Add kfuncs for read-only string operations") Signed-off-by: Rong Tao Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/tencent_17DC57B9D16BC443837021BEACE84B7C1507@qq.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 6b4877e85a68..b9b0c5fe33f6 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -3664,10 +3664,17 @@ __bpf_kfunc int bpf_strnstr(const char *s1__ign, const char *s2__ign, size_t len guard(pagefault)(); for (i = 0; i < XATTR_SIZE_MAX; i++) { - for (j = 0; i + j < len && j < XATTR_SIZE_MAX; j++) { + for (j = 0; i + j <= len && j < XATTR_SIZE_MAX; j++) { __get_kernel_nofault(&c2, s2__ign + j, char, err_out); if (c2 == '\0') return i; + /* + * We allow reading an extra byte from s2 (note the + * `i + j <= len` above) to cover the case when s2 is + * a suffix of the first len chars of s1. + */ + if (i + j == len) + break; __get_kernel_nofault(&c1, s1__ign + j, char, err_out); if (c1 == '\0') return -ENOENT; -- cgit v1.2.3 From 0d80e7f951be1bdd08d328fd87694be0d6e8aaa8 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Tue, 9 Sep 2025 18:49:59 +0000 Subject: rqspinlock: Choose trylock fallback for NMI waiters Currently, out of all 3 types of waiters in the rqspinlock slow path (i.e., pending bit waiter, wait queue head waiter, and wait queue non-head waiter), only the pending bit waiter and wait queue head waiters apply deadlock checks and a timeout on their waiting loop. The assumption here was that the wait queue head's forward progress would be sufficient to identify cases where the lock owner or pending bit waiter is stuck, and non-head waiters relying on the head waiter would prove to be sufficient for their own forward progress. However, the head waiter itself can be preempted by a non-head waiter for the same lock (AA) or a different lock (ABBA) in a manner that impedes its forward progress. In such a case, non-head waiters not performing deadlock and timeout checks becomes insufficient, and the system can enter a state of lockup. This is typically not a concern with non-NMI lock acquisitions, as lock holders which in run in different contexts (IRQ, non-IRQ) use "irqsave" variants of the lock APIs, which naturally excludes such lock holders from preempting one another on the same CPU. It might seem likely that a similar case may occur for rqspinlock when programs are attached to contention tracepoints (begin, end), however, these tracepoints either precede the enqueue into the wait queue, or succeed it, therefore cannot be used to preempt a head waiter's waiting loop. We must still be careful against nested kprobe and fentry programs that may attach to the middle of the head's waiting loop to stall forward progress and invoke another rqspinlock acquisition that proceeds as a non-head waiter. To this end, drop CC_FLAGS_FTRACE from the rqspinlock.o object file. For now, this issue is resolved by falling back to a repeated trylock on the lock word from NMI context, while performing the deadlock checks to break out early in case forward progress is impossible, and use the timeout as a final fallback. A more involved fix to terminate the queue when such a condition occurs will be made as a follow up. A selftest to stress this aspect of nested NMI/non-NMI locking attempts will be added in a subsequent patch to the bpf-next tree when this fix lands and trees are synchronized. Reported-by: Josef Bacik Fixes: 164c246571e9 ("rqspinlock: Protect waiters in queue from stalls") Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20250909184959.3509085-1-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/Makefile | 1 + kernel/bpf/rqspinlock.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 269c04a24664..f6cf8c2af5f7 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -62,3 +62,4 @@ CFLAGS_REMOVE_bpf_lru_list.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_queue_stack_maps.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_lpm_trie.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_ringbuf.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_rqspinlock.o = $(CC_FLAGS_FTRACE) diff --git a/kernel/bpf/rqspinlock.c b/kernel/bpf/rqspinlock.c index 5ab354d55d82..a00561b1d3e5 100644 --- a/kernel/bpf/rqspinlock.c +++ b/kernel/bpf/rqspinlock.c @@ -471,7 +471,7 @@ queue: * any MCS node. This is not the most elegant solution, but is * simple enough. */ - if (unlikely(idx >= _Q_MAX_NODES)) { + if (unlikely(idx >= _Q_MAX_NODES || in_nmi())) { lockevent_inc(lock_no_node); RES_RESET_TIMEOUT(ts, RES_DEF_TIMEOUT); while (!queued_spin_trylock(lock)) { -- cgit v1.2.3 From df0cb5cb50bd54d3cd4d0d83417ceec6a66404aa Mon Sep 17 00:00:00 2001 From: KaFai Wan Date: Tue, 9 Sep 2025 22:46:14 +0800 Subject: bpf: Allow fall back to interpreter for programs with stack size <= 512 OpenWRT users reported regression on ARMv6 devices after updating to latest HEAD, where tcpdump filter: tcpdump "not ether host 3c37121a2b3c and not ether host 184ecbca2a3a \ and not ether host 14130b4d3f47 and not ether host f0f61cf440b7 \ and not ether host a84b4dedf471 and not ether host d022be17e1d7 \ and not ether host 5c497967208b and not ether host 706655784d5b" fails with warning: "Kernel filter failed: No error information" when using config: # CONFIG_BPF_JIT_ALWAYS_ON is not set CONFIG_BPF_JIT_DEFAULT_ON=y The issue arises because commits: 1. "bpf: Fix array bounds error with may_goto" changed default runtime to __bpf_prog_ret0_warn when jit_requested = 1 2. "bpf: Avoid __bpf_prog_ret0_warn when jit fails" returns error when jit_requested = 1 but jit fails This change restores interpreter fallback capability for BPF programs with stack size <= 512 bytes when jit fails. Reported-by: Felix Fietkau Closes: https://lore.kernel.org/bpf/2e267b4b-0540-45d8-9310-e127bf95fc63@nbd.name/ Fixes: 6ebc5030e0c5 ("bpf: Fix array bounds error with may_goto") Signed-off-by: KaFai Wan Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20250909144614.2991253-1-kafai.wan@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/core.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index f8ac77d08ca7..e4568d44e827 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2366,8 +2366,7 @@ static unsigned int __bpf_prog_ret0_warn(const void *ctx, const struct bpf_insn *insn) { /* If this handler ever gets executed, then BPF_JIT_ALWAYS_ON - * is not working properly, or interpreter is being used when - * prog->jit_requested is not 0, so warn about it! + * is not working properly, so warn about it! */ WARN_ON_ONCE(1); return 0; @@ -2468,8 +2467,9 @@ out: return ret; } -static void bpf_prog_select_func(struct bpf_prog *fp) +static bool bpf_prog_select_interpreter(struct bpf_prog *fp) { + bool select_interpreter = false; #ifndef CONFIG_BPF_JIT_ALWAYS_ON u32 stack_depth = max_t(u32, fp->aux->stack_depth, 1); u32 idx = (round_up(stack_depth, 32) / 32) - 1; @@ -2478,15 +2478,16 @@ static void bpf_prog_select_func(struct bpf_prog *fp) * But for non-JITed programs, we don't need bpf_func, so no bounds * check needed. */ - if (!fp->jit_requested && - !WARN_ON_ONCE(idx >= ARRAY_SIZE(interpreters))) { + if (idx < ARRAY_SIZE(interpreters)) { fp->bpf_func = interpreters[idx]; + select_interpreter = true; } else { fp->bpf_func = __bpf_prog_ret0_warn; } #else fp->bpf_func = __bpf_prog_ret0_warn; #endif + return select_interpreter; } /** @@ -2505,7 +2506,7 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) /* In case of BPF to BPF calls, verifier did all the prep * work with regards to JITing, etc. */ - bool jit_needed = fp->jit_requested; + bool jit_needed = false; if (fp->bpf_func) goto finalize; @@ -2514,7 +2515,8 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) bpf_prog_has_kfunc_call(fp)) jit_needed = true; - bpf_prog_select_func(fp); + if (!bpf_prog_select_interpreter(fp)) + jit_needed = true; /* eBPF JITs can rewrite the program in case constant * blinding is active. However, in case of error during -- cgit v1.2.3 From 6d78b4473cdb08b74662355a9e8510bde09c511e Mon Sep 17 00:00:00 2001 From: Peilin Ye Date: Tue, 9 Sep 2025 09:52:20 +0000 Subject: bpf: Tell memcg to use allow_spinning=false path in bpf_timer_init() Currently, calling bpf_map_kmalloc_node() from __bpf_async_init() can cause various locking issues; see the following stack trace (edited for style) as one example: ... [10.011566] do_raw_spin_lock.cold [10.011570] try_to_wake_up (5) double-acquiring the same [10.011575] kick_pool rq_lock, causing a hardlockup [10.011579] __queue_work [10.011582] queue_work_on [10.011585] kernfs_notify [10.011589] cgroup_file_notify [10.011593] try_charge_memcg (4) memcg accounting raises an [10.011597] obj_cgroup_charge_pages MEMCG_MAX event [10.011599] obj_cgroup_charge_account [10.011600] __memcg_slab_post_alloc_hook [10.011603] __kmalloc_node_noprof ... [10.011611] bpf_map_kmalloc_node [10.011612] __bpf_async_init [10.011615] bpf_timer_init (3) BPF calls bpf_timer_init() [10.011617] bpf_prog_xxxxxxxxxxxxxxxx_fcg_runnable [10.011619] bpf__sched_ext_ops_runnable [10.011620] enqueue_task_scx (2) BPF runs with rq_lock held [10.011622] enqueue_task [10.011626] ttwu_do_activate [10.011629] sched_ttwu_pending (1) grabs rq_lock ... The above was reproduced on bpf-next (b338cf849ec8) by modifying ./tools/sched_ext/scx_flatcg.bpf.c to call bpf_timer_init() during ops.runnable(), and hacking the memcg accounting code a bit to make a bpf_timer_init() call more likely to raise an MEMCG_MAX event. We have also run into other similar variants (both internally and on bpf-next), including double-acquiring cgroup_file_kn_lock, the same worker_pool::lock, etc. As suggested by Shakeel, fix this by using __GFP_HIGH instead of GFP_ATOMIC in __bpf_async_init(), so that e.g. if try_charge_memcg() raises an MEMCG_MAX event, we call __memcg_memory_event() with @allow_spinning=false and avoid calling cgroup_file_notify() there. Depends on mm patch "memcg: skip cgroup_file_notify if spinning is not allowed": https://lore.kernel.org/bpf/20250905201606.66198-1-shakeel.butt@linux.dev/ v0 approach s/bpf_map_kmalloc_node/bpf_mem_alloc/ https://lore.kernel.org/bpf/20250905061919.439648-1-yepeilin@google.com/ v1 approach: https://lore.kernel.org/bpf/20250905234547.862249-1-yepeilin@google.com/ Fixes: b00628b1c7d5 ("bpf: Introduce bpf timers.") Suggested-by: Shakeel Butt Signed-off-by: Peilin Ye Link: https://lore.kernel.org/r/20250909095222.2121438-1-yepeilin@google.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index b9b0c5fe33f6..8af62cb243d9 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1274,8 +1274,11 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u goto out; } - /* allocate hrtimer via map_kmalloc to use memcg accounting */ - cb = bpf_map_kmalloc_node(map, size, GFP_ATOMIC, map->numa_node); + /* Allocate via bpf_map_kmalloc_node() for memcg accounting. Until + * kmalloc_nolock() is available, avoid locking issues by using + * __GFP_HIGH (GFP_ATOMIC & ~__GFP_RECLAIM). + */ + cb = bpf_map_kmalloc_node(map, size, __GFP_HIGH, map->numa_node); if (!cb) { ret = -ENOMEM; goto out; -- cgit v1.2.3 From e25ddfb388c8b7e5f20e3bf38d627fb485003781 Mon Sep 17 00:00:00 2001 From: Leon Hwang Date: Wed, 10 Sep 2025 20:57:39 +0800 Subject: bpf: Reject bpf_timer for PREEMPT_RT When enable CONFIG_PREEMPT_RT, the kernel will warn when run timer selftests by './test_progs -t timer': BUG: sleeping function called from invalid context at kernel/locking/spinlock_rt.c:48 In order to avoid such warning, reject bpf_timer in verifier when PREEMPT_RT is enabled. Signed-off-by: Leon Hwang Link: https://lore.kernel.org/r/20250910125740.52172-2-leon.hwang@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c89e2b1bc644..9fb1f957a093 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -8547,6 +8547,10 @@ static int process_timer_func(struct bpf_verifier_env *env, int regno, verifier_bug(env, "Two map pointers in a timer helper"); return -EFAULT; } + if (IS_ENABLED(CONFIG_PREEMPT_RT)) { + verbose(env, "bpf_timer cannot be used for PREEMPT_RT.\n"); + return -EOPNOTSUPP; + } meta->map_uid = reg->map_uid; meta->map_ptr = map; return 0; -- cgit v1.2.3