From 8b62645b09f870d70c7910e7550289d444239a46 Mon Sep 17 00:00:00 2001 From: Wander Lairson Costa Date: Fri, 20 Sep 2024 16:06:59 -0300 Subject: bpf: Use raw_spinlock_t in ringbuf The function __bpf_ringbuf_reserve is invoked from a tracepoint, which disables preemption. Using spinlock_t in this context can lead to a "sleep in atomic" warning in the RT variant. This issue is illustrated in the example below: BUG: sleeping function called from invalid context at kernel/locking/spinlock_rt.c:48 in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 556208, name: test_progs preempt_count: 1, expected: 0 RCU nest depth: 1, expected: 1 INFO: lockdep is turned off. Preemption disabled at: [] migrate_enable+0xc0/0x39c CPU: 7 PID: 556208 Comm: test_progs Tainted: G Hardware name: Qualcomm SA8775P Ride (DT) Call trace: dump_backtrace+0xac/0x130 show_stack+0x1c/0x30 dump_stack_lvl+0xac/0xe8 dump_stack+0x18/0x30 __might_resched+0x3bc/0x4fc rt_spin_lock+0x8c/0x1a4 __bpf_ringbuf_reserve+0xc4/0x254 bpf_ringbuf_reserve_dynptr+0x5c/0xdc bpf_prog_ac3d15160d62622a_test_read_write+0x104/0x238 trace_call_bpf+0x238/0x774 perf_call_bpf_enter.isra.0+0x104/0x194 perf_syscall_enter+0x2f8/0x510 trace_sys_enter+0x39c/0x564 syscall_trace_enter+0x220/0x3c0 do_el0_svc+0x138/0x1dc el0_svc+0x54/0x130 el0t_64_sync_handler+0x134/0x150 el0t_64_sync+0x17c/0x180 Switch the spinlock to raw_spinlock_t to avoid this error. Fixes: 457f44363a88 ("bpf: Implement BPF ring buffer and verifier support for it") Reported-by: Brian Grech Signed-off-by: Wander Lairson Costa Signed-off-by: Wander Lairson Costa Signed-off-by: Daniel Borkmann Acked-by: Daniel Borkmann Link: https://lore.kernel.org/r/20240920190700.617253-1-wander@redhat.com --- kernel/bpf/ringbuf.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c index e20b90c36131..de3b681d1d13 100644 --- a/kernel/bpf/ringbuf.c +++ b/kernel/bpf/ringbuf.c @@ -29,7 +29,7 @@ struct bpf_ringbuf { u64 mask; struct page **pages; int nr_pages; - spinlock_t spinlock ____cacheline_aligned_in_smp; + raw_spinlock_t spinlock ____cacheline_aligned_in_smp; /* For user-space producer ring buffers, an atomic_t busy bit is used * to synchronize access to the ring buffers in the kernel, rather than * the spinlock that is used for kernel-producer ring buffers. This is @@ -173,7 +173,7 @@ static struct bpf_ringbuf *bpf_ringbuf_alloc(size_t data_sz, int numa_node) if (!rb) return NULL; - spin_lock_init(&rb->spinlock); + raw_spin_lock_init(&rb->spinlock); atomic_set(&rb->busy, 0); init_waitqueue_head(&rb->waitq); init_irq_work(&rb->work, bpf_ringbuf_notify); @@ -421,10 +421,10 @@ static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size) cons_pos = smp_load_acquire(&rb->consumer_pos); if (in_nmi()) { - if (!spin_trylock_irqsave(&rb->spinlock, flags)) + if (!raw_spin_trylock_irqsave(&rb->spinlock, flags)) return NULL; } else { - spin_lock_irqsave(&rb->spinlock, flags); + raw_spin_lock_irqsave(&rb->spinlock, flags); } pend_pos = rb->pending_pos; @@ -450,7 +450,7 @@ static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size) */ if (new_prod_pos - cons_pos > rb->mask || new_prod_pos - pend_pos > rb->mask) { - spin_unlock_irqrestore(&rb->spinlock, flags); + raw_spin_unlock_irqrestore(&rb->spinlock, flags); return NULL; } @@ -462,7 +462,7 @@ static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size) /* pairs with consumer's smp_load_acquire() */ smp_store_release(&rb->producer_pos, new_prod_pos); - spin_unlock_irqrestore(&rb->spinlock, flags); + raw_spin_unlock_irqrestore(&rb->spinlock, flags); return (void *)hdr + BPF_RINGBUF_HDR_SZ; } -- cgit v1.2.3 From e9bd9c498cb0f5843996dbe5cbce7a1836a83c70 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Tue, 24 Sep 2024 14:08:43 -0700 Subject: bpf: sync_linked_regs() must preserve subreg_def Range propagation must not affect subreg_def marks, otherwise the following example is rewritten by verifier incorrectly when BPF_F_TEST_RND_HI32 flag is set: 0: call bpf_ktime_get_ns call bpf_ktime_get_ns 1: r0 &= 0x7fffffff after verifier r0 &= 0x7fffffff 2: w1 = w0 rewrites w1 = w0 3: if w0 < 10 goto +0 --------------> r11 = 0x2f5674a6 (r) 4: r1 >>= 32 r11 <<= 32 (r) 5: r0 = r1 r1 |= r11 (r) 6: exit; if w0 < 0xa goto pc+0 r1 >>= 32 r0 = r1 exit (or zero extension of w1 at (2) is missing for architectures that require zero extension for upper register half). The following happens w/o this patch: - r0 is marked as not a subreg at (0); - w1 is marked as subreg at (2); - w1 subreg_def is overridden at (3) by copy_register_state(); - w1 is read at (5) but mark_insn_zext() does not mark (2) for zero extension, because w1 subreg_def is not set; - because of BPF_F_TEST_RND_HI32 flag verifier inserts random value for hi32 bits of (2) (marked (r)); - this random value is read at (5). Fixes: 75748837b7e5 ("bpf: Propagate scalar ranges through register assignments.") Reported-by: Lonial Con Signed-off-by: Lonial Con Signed-off-by: Eduard Zingerman Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Daniel Borkmann Closes: https://lore.kernel.org/bpf/7e2aa30a62d740db182c170fdd8f81c596df280d.camel@gmail.com Link: https://lore.kernel.org/bpf/20240924210844.1758441-1-eddyz87@gmail.com --- kernel/bpf/verifier.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel/bpf') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 9a7ed527e47e..434de48cd24b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -15326,8 +15326,12 @@ static void sync_linked_regs(struct bpf_verifier_state *vstate, struct bpf_reg_s continue; if ((!(reg->id & BPF_ADD_CONST) && !(known_reg->id & BPF_ADD_CONST)) || reg->off == known_reg->off) { + s32 saved_subreg_def = reg->subreg_def; + copy_register_state(reg, known_reg); + reg->subreg_def = saved_subreg_def; } else { + s32 saved_subreg_def = reg->subreg_def; s32 saved_off = reg->off; fake_reg.type = SCALAR_VALUE; @@ -15340,6 +15344,7 @@ static void sync_linked_regs(struct bpf_verifier_state *vstate, struct bpf_reg_s * otherwise another sync_linked_regs() will be incorrect. */ reg->off = saved_off; + reg->subreg_def = saved_subreg_def; scalar32_min_max_add(reg, &fake_reg); scalar_min_max_add(reg, &fake_reg); -- cgit v1.2.3 From ca9984c5f0ab3690d98b13937b2485a978c8dd73 Mon Sep 17 00:00:00 2001 From: Florian Kauer Date: Wed, 11 Sep 2024 10:41:18 +0200 Subject: bpf: devmap: provide rxq after redirect MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit rxq contains a pointer to the device from where the redirect happened. Currently, the BPF program that was executed after a redirect via BPF_MAP_TYPE_DEVMAP* does not have it set. This is particularly bad since accessing ingress_ifindex, e.g. SEC("xdp") int prog(struct xdp_md *pkt) { return bpf_redirect_map(&dev_redirect_map, 0, 0); } SEC("xdp/devmap") int prog_after_redirect(struct xdp_md *pkt) { bpf_printk("ifindex %i", pkt->ingress_ifindex); return XDP_PASS; } depends on access to rxq, so a NULL pointer gets dereferenced: <1>[ 574.475170] BUG: kernel NULL pointer dereference, address: 0000000000000000 <1>[ 574.475188] #PF: supervisor read access in kernel mode <1>[ 574.475194] #PF: error_code(0x0000) - not-present page <6>[ 574.475199] PGD 0 P4D 0 <4>[ 574.475207] Oops: Oops: 0000 [#1] PREEMPT SMP NOPTI <4>[ 574.475217] CPU: 4 UID: 0 PID: 217 Comm: kworker/4:1 Not tainted 6.11.0-rc5-reduced-00859-g780801200300 #23 <4>[ 574.475226] Hardware name: Intel(R) Client Systems NUC13ANHi7/NUC13ANBi7, BIOS ANRPL357.0026.2023.0314.1458 03/14/2023 <4>[ 574.475231] Workqueue: mld mld_ifc_work <4>[ 574.475247] RIP: 0010:bpf_prog_5e13354d9cf5018a_prog_after_redirect+0x17/0x3c <4>[ 574.475257] Code: cc cc cc cc cc cc cc 80 00 00 00 cc cc cc cc cc cc cc cc f3 0f 1e fa 0f 1f 44 00 00 66 90 55 48 89 e5 f3 0f 1e fa 48 8b 57 20 <48> 8b 52 00 8b 92 e0 00 00 00 48 bf f8 a6 d5 c4 5d a0 ff ff be 0b <4>[ 574.475263] RSP: 0018:ffffa62440280c98 EFLAGS: 00010206 <4>[ 574.475269] RAX: ffffa62440280cd8 RBX: 0000000000000001 RCX: 0000000000000000 <4>[ 574.475274] RDX: 0000000000000000 RSI: ffffa62440549048 RDI: ffffa62440280ce0 <4>[ 574.475278] RBP: ffffa62440280c98 R08: 0000000000000002 R09: 0000000000000001 <4>[ 574.475281] R10: ffffa05dc8b98000 R11: ffffa05f577fca40 R12: ffffa05dcab24000 <4>[ 574.475285] R13: ffffa62440280ce0 R14: ffffa62440549048 R15: ffffa62440549000 <4>[ 574.475289] FS: 0000000000000000(0000) GS:ffffa05f4f700000(0000) knlGS:0000000000000000 <4>[ 574.475294] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 <4>[ 574.475298] CR2: 0000000000000000 CR3: 000000025522e000 CR4: 0000000000f50ef0 <4>[ 574.475303] PKRU: 55555554 <4>[ 574.475306] Call Trace: <4>[ 574.475313] <4>[ 574.475318] ? __die+0x23/0x70 <4>[ 574.475329] ? page_fault_oops+0x180/0x4c0 <4>[ 574.475339] ? skb_pp_cow_data+0x34c/0x490 <4>[ 574.475346] ? kmem_cache_free+0x257/0x280 <4>[ 574.475357] ? exc_page_fault+0x67/0x150 <4>[ 574.475368] ? asm_exc_page_fault+0x26/0x30 <4>[ 574.475381] ? bpf_prog_5e13354d9cf5018a_prog_after_redirect+0x17/0x3c <4>[ 574.475386] bq_xmit_all+0x158/0x420 <4>[ 574.475397] __dev_flush+0x30/0x90 <4>[ 574.475407] veth_poll+0x216/0x250 [veth] <4>[ 574.475421] __napi_poll+0x28/0x1c0 <4>[ 574.475430] net_rx_action+0x32d/0x3a0 <4>[ 574.475441] handle_softirqs+0xcb/0x2c0 <4>[ 574.475451] do_softirq+0x40/0x60 <4>[ 574.475458] <4>[ 574.475461] <4>[ 574.475464] __local_bh_enable_ip+0x66/0x70 <4>[ 574.475471] __dev_queue_xmit+0x268/0xe40 <4>[ 574.475480] ? selinux_ip_postroute+0x213/0x420 <4>[ 574.475491] ? alloc_skb_with_frags+0x4a/0x1d0 <4>[ 574.475502] ip6_finish_output2+0x2be/0x640 <4>[ 574.475512] ? nf_hook_slow+0x42/0xf0 <4>[ 574.475521] ip6_finish_output+0x194/0x300 <4>[ 574.475529] ? __pfx_ip6_finish_output+0x10/0x10 <4>[ 574.475538] mld_sendpack+0x17c/0x240 <4>[ 574.475548] mld_ifc_work+0x192/0x410 <4>[ 574.475557] process_one_work+0x15d/0x380 <4>[ 574.475566] worker_thread+0x29d/0x3a0 <4>[ 574.475573] ? __pfx_worker_thread+0x10/0x10 <4>[ 574.475580] ? __pfx_worker_thread+0x10/0x10 <4>[ 574.475587] kthread+0xcd/0x100 <4>[ 574.475597] ? __pfx_kthread+0x10/0x10 <4>[ 574.475606] ret_from_fork+0x31/0x50 <4>[ 574.475615] ? __pfx_kthread+0x10/0x10 <4>[ 574.475623] ret_from_fork_asm+0x1a/0x30 <4>[ 574.475635] <4>[ 574.475637] Modules linked in: veth br_netfilter bridge stp llc iwlmvm x86_pkg_temp_thermal iwlwifi efivarfs nvme nvme_core <4>[ 574.475662] CR2: 0000000000000000 <4>[ 574.475668] ---[ end trace 0000000000000000 ]--- Therefore, provide it to the program by setting rxq properly. Fixes: cb261b594b41 ("bpf: Run devmap xdp_prog on flush instead of bulk enqueue") Reviewed-by: Toke Høiland-Jørgensen Signed-off-by: Florian Kauer Acked-by: Jakub Kicinski Link: https://lore.kernel.org/r/20240911-devel-koalo-fix-ingress-ifindex-v4-1-5c643ae10258@linutronix.de Signed-off-by: Martin KaFai Lau --- kernel/bpf/devmap.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 9e0e3b0a18e4..7878be18e9d2 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -333,9 +333,11 @@ static int dev_map_hash_get_next_key(struct bpf_map *map, void *key, static int dev_map_bpf_prog_run(struct bpf_prog *xdp_prog, struct xdp_frame **frames, int n, - struct net_device *dev) + struct net_device *tx_dev, + struct net_device *rx_dev) { - struct xdp_txq_info txq = { .dev = dev }; + struct xdp_txq_info txq = { .dev = tx_dev }; + struct xdp_rxq_info rxq = { .dev = rx_dev }; struct xdp_buff xdp; int i, nframes = 0; @@ -346,6 +348,7 @@ static int dev_map_bpf_prog_run(struct bpf_prog *xdp_prog, xdp_convert_frame_to_buff(xdpf, &xdp); xdp.txq = &txq; + xdp.rxq = &rxq; act = bpf_prog_run_xdp(xdp_prog, &xdp); switch (act) { @@ -360,7 +363,7 @@ static int dev_map_bpf_prog_run(struct bpf_prog *xdp_prog, bpf_warn_invalid_xdp_action(NULL, xdp_prog, act); fallthrough; case XDP_ABORTED: - trace_xdp_exception(dev, xdp_prog, act); + trace_xdp_exception(tx_dev, xdp_prog, act); fallthrough; case XDP_DROP: xdp_return_frame_rx_napi(xdpf); @@ -388,7 +391,7 @@ static void bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags) } if (bq->xdp_prog) { - to_send = dev_map_bpf_prog_run(bq->xdp_prog, bq->q, cnt, dev); + to_send = dev_map_bpf_prog_run(bq->xdp_prog, bq->q, cnt, dev, bq->dev_rx); if (!to_send) goto out; } -- cgit v1.2.3 From 45126b155e3b5201179cdc038504bf93a8ccd921 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 7 Oct 2024 18:09:58 +0200 Subject: bpf: Fix memory leak in bpf_core_apply We need to free specs properly. Fixes: 3d2786d65aaa ("bpf: correctly handle malformed BPF_CORE_TYPE_ID_LOCAL relos") Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Acked-by: Eduard Zingerman Link: https://lore.kernel.org/bpf/20241007160958.607434-1-jolsa@kernel.org --- kernel/bpf/btf.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel/bpf') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 75e4fe83c509..a05da5f43547 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -8961,6 +8961,7 @@ int bpf_core_apply(struct bpf_core_ctx *ctx, const struct bpf_core_relo *relo, if (!type) { bpf_log(ctx->log, "relo #%u: bad type id %u\n", relo_idx, relo->type_id); + kfree(specs); return -EINVAL; } -- cgit v1.2.3 From 117932eea99b729ee5d12783601a4f7f5fd58a23 Mon Sep 17 00:00:00 2001 From: Chen Ridong Date: Tue, 8 Oct 2024 11:24:56 +0000 Subject: cgroup/bpf: use a dedicated workqueue for cgroup bpf destruction A hung_task problem shown below was found: INFO: task kworker/0:0:8 blocked for more than 327 seconds. "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. Workqueue: events cgroup_bpf_release Call Trace: __schedule+0x5a2/0x2050 ? find_held_lock+0x33/0x100 ? wq_worker_sleeping+0x9e/0xe0 schedule+0x9f/0x180 schedule_preempt_disabled+0x25/0x50 __mutex_lock+0x512/0x740 ? cgroup_bpf_release+0x1e/0x4d0 ? cgroup_bpf_release+0xcf/0x4d0 ? process_scheduled_works+0x161/0x8a0 ? cgroup_bpf_release+0x1e/0x4d0 ? mutex_lock_nested+0x2b/0x40 ? __pfx_delay_tsc+0x10/0x10 mutex_lock_nested+0x2b/0x40 cgroup_bpf_release+0xcf/0x4d0 ? process_scheduled_works+0x161/0x8a0 ? trace_event_raw_event_workqueue_execute_start+0x64/0xd0 ? process_scheduled_works+0x161/0x8a0 process_scheduled_works+0x23a/0x8a0 worker_thread+0x231/0x5b0 ? __pfx_worker_thread+0x10/0x10 kthread+0x14d/0x1c0 ? __pfx_kthread+0x10/0x10 ret_from_fork+0x59/0x70 ? __pfx_kthread+0x10/0x10 ret_from_fork_asm+0x1b/0x30 This issue can be reproduced by the following pressuse test: 1. A large number of cpuset cgroups are deleted. 2. Set cpu on and off repeatly. 3. Set watchdog_thresh repeatly. The scripts can be obtained at LINK mentioned above the signature. The reason for this issue is cgroup_mutex and cpu_hotplug_lock are acquired in different tasks, which may lead to deadlock. It can lead to a deadlock through the following steps: 1. A large number of cpusets are deleted asynchronously, which puts a large number of cgroup_bpf_release works into system_wq. The max_active of system_wq is WQ_DFL_ACTIVE(256). Consequently, all active works are cgroup_bpf_release works, and many cgroup_bpf_release works will be put into inactive queue. As illustrated in the diagram, there are 256 (in the acvtive queue) + n (in the inactive queue) works. 2. Setting watchdog_thresh will hold cpu_hotplug_lock.read and put smp_call_on_cpu work into system_wq. However step 1 has already filled system_wq, 'sscs.work' is put into inactive queue. 'sscs.work' has to wait until the works that were put into the inacvtive queue earlier have executed (n cgroup_bpf_release), so it will be blocked for a while. 3. Cpu offline requires cpu_hotplug_lock.write, which is blocked by step 2. 4. Cpusets that were deleted at step 1 put cgroup_release works into cgroup_destroy_wq. They are competing to get cgroup_mutex all the time. When cgroup_metux is acqured by work at css_killed_work_fn, it will call cpuset_css_offline, which needs to acqure cpu_hotplug_lock.read. However, cpuset_css_offline will be blocked for step 3. 5. At this moment, there are 256 works in active queue that are cgroup_bpf_release, they are attempting to acquire cgroup_mutex, and as a result, all of them are blocked. Consequently, sscs.work can not be executed. Ultimately, this situation leads to four processes being blocked, forming a deadlock. system_wq(step1) WatchDog(step2) cpu offline(step3) cgroup_destroy_wq(step4) ... 2000+ cgroups deleted asyn 256 actives + n inactives __lockup_detector_reconfigure P(cpu_hotplug_lock.read) put sscs.work into system_wq 256 + n + 1(sscs.work) sscs.work wait to be executed warting sscs.work finish percpu_down_write P(cpu_hotplug_lock.write) ...blocking... css_killed_work_fn P(cgroup_mutex) cpuset_css_offline P(cpu_hotplug_lock.read) ...blocking... 256 cgroup_bpf_release mutex_lock(&cgroup_mutex); ..blocking... To fix the problem, place cgroup_bpf_release works on a dedicated workqueue which can break the loop and solve the problem. System wqs are for misc things which shouldn't create a large number of concurrent work items. If something is going to generate >WQ_DFL_ACTIVE(256) concurrent work items, it should use its own dedicated workqueue. Fixes: 4bfc0bb2c60e ("bpf: decouple the lifetime of cgroup_bpf from cgroup itself") Cc: stable@vger.kernel.org # v5.3+ Link: https://lore.kernel.org/cgroups/e90c32d2-2a85-4f28-9154-09c7d320cb60@huawei.com/T/#t Tested-by: Vishal Chourasia Signed-off-by: Chen Ridong Signed-off-by: Tejun Heo --- kernel/bpf/cgroup.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index e7113d700b87..025d7e2214ae 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -24,6 +24,23 @@ DEFINE_STATIC_KEY_ARRAY_FALSE(cgroup_bpf_enabled_key, MAX_CGROUP_BPF_ATTACH_TYPE); EXPORT_SYMBOL(cgroup_bpf_enabled_key); +/* + * cgroup bpf destruction makes heavy use of work items and there can be a lot + * of concurrent destructions. Use a separate workqueue so that cgroup bpf + * destruction work items don't end up filling up max_active of system_wq + * which may lead to deadlock. + */ +static struct workqueue_struct *cgroup_bpf_destroy_wq; + +static int __init cgroup_bpf_wq_init(void) +{ + cgroup_bpf_destroy_wq = alloc_workqueue("cgroup_bpf_destroy", 0, 1); + if (!cgroup_bpf_destroy_wq) + panic("Failed to alloc workqueue for cgroup bpf destroy.\n"); + return 0; +} +core_initcall(cgroup_bpf_wq_init); + /* __always_inline is necessary to prevent indirect call through run_prog * function pointer. */ @@ -334,7 +351,7 @@ static void cgroup_bpf_release_fn(struct percpu_ref *ref) struct cgroup *cgrp = container_of(ref, struct cgroup, bpf.refcnt); INIT_WORK(&cgrp->bpf.release_work, cgroup_bpf_release); - queue_work(system_wq, &cgrp->bpf.release_work); + queue_work(cgroup_bpf_destroy_wq, &cgrp->bpf.release_work); } /* Get underlying bpf_prog of bpf_prog_list entry, regardless if it's through -- cgit v1.2.3 From b24d7f0da6ef5a23456a301eaf51b170f961d4ae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Sat, 5 Oct 2024 02:06:28 +0200 Subject: bpf, lsm: Remove bpf_lsm_key_free hook MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The key_free LSM hook has been removed. Remove the corresponding BPF hook. Avoid warnings during the build: BTFIDS vmlinux WARN: resolve_btfids: unresolved symbol bpf_lsm_key_free Fixes: 5f8d28f6d7d5 ("lsm: infrastructure management of the key security blob") Signed-off-by: Thomas Weißschuh Signed-off-by: Andrii Nakryiko Acked-by: Song Liu Acked-by: Jiri Olsa Link: https://lore.kernel.org/bpf/20241005-lsm-key_free-v1-1-42ea801dbd63@weissschuh.net --- kernel/bpf/bpf_lsm.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index 6292ac5f9bd1..3bc61628ab25 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -339,10 +339,6 @@ BTF_ID(func, bpf_lsm_path_chmod) BTF_ID(func, bpf_lsm_path_chown) #endif /* CONFIG_SECURITY_PATH */ -#ifdef CONFIG_KEYS -BTF_ID(func, bpf_lsm_key_free) -#endif /* CONFIG_KEYS */ - BTF_ID(func, bpf_lsm_mmap_file) BTF_ID(func, bpf_lsm_netlink_send) BTF_ID(func, bpf_lsm_path_notify) -- cgit v1.2.3 From 797d73ee232dd1833dec4824bc53a22032e97c1c Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Tue, 8 Oct 2024 15:11:13 +0800 Subject: bpf: Check the remaining info_cnt before repeating btf fields When trying to repeat the btf fields for array of nested struct, it doesn't check the remaining info_cnt. The following splat will be reported when the value of ret * nelems is greater than BTF_FIELDS_MAX: ------------[ cut here ]------------ UBSAN: array-index-out-of-bounds in ../kernel/bpf/btf.c:3951:49 index 11 is out of range for type 'btf_field_info [11]' CPU: 6 UID: 0 PID: 411 Comm: test_progs ...... 6.11.0-rc4+ #1 Tainted: [O]=OOT_MODULE Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS ... Call Trace: dump_stack_lvl+0x57/0x70 dump_stack+0x10/0x20 ubsan_epilogue+0x9/0x40 __ubsan_handle_out_of_bounds+0x6f/0x80 ? kallsyms_lookup_name+0x48/0xb0 btf_parse_fields+0x992/0xce0 map_create+0x591/0x770 __sys_bpf+0x229/0x2410 __x64_sys_bpf+0x1f/0x30 x64_sys_call+0x199/0x9f0 do_syscall_64+0x3b/0xc0 entry_SYSCALL_64_after_hwframe+0x4b/0x53 RIP: 0033:0x7fea56f2cc5d ...... ---[ end trace ]--- Fix it by checking the remaining info_cnt in btf_repeat_fields() before repeating the btf fields. Fixes: 64e8ee814819 ("bpf: look into the types of the fields of a struct type recursively.") Signed-off-by: Hou Tao Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20241008071114.3718177-2-houtao@huaweicloud.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/btf.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index a05da5f43547..5cd1c7a23848 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3523,7 +3523,7 @@ end: * (i + 1) * elem_size * where i is the repeat index and elem_size is the size of an element. */ -static int btf_repeat_fields(struct btf_field_info *info, +static int btf_repeat_fields(struct btf_field_info *info, int info_cnt, u32 field_cnt, u32 repeat_cnt, u32 elem_size) { u32 i, j; @@ -3543,6 +3543,12 @@ static int btf_repeat_fields(struct btf_field_info *info, } } + /* The type of struct size or variable size is u32, + * so the multiplication will not overflow. + */ + if (field_cnt * (repeat_cnt + 1) > info_cnt) + return -E2BIG; + cur = field_cnt; for (i = 0; i < repeat_cnt; i++) { memcpy(&info[cur], &info[0], field_cnt * sizeof(info[0])); @@ -3587,7 +3593,7 @@ static int btf_find_nested_struct(const struct btf *btf, const struct btf_type * info[i].off += off; if (nelems > 1) { - err = btf_repeat_fields(info, ret, nelems - 1, t->size); + err = btf_repeat_fields(info, info_cnt, ret, nelems - 1, t->size); if (err == 0) ret *= nelems; else @@ -3681,10 +3687,10 @@ static int btf_find_field_one(const struct btf *btf, if (ret == BTF_FIELD_IGNORE) return 0; - if (nelems > info_cnt) + if (!info_cnt) return -E2BIG; if (nelems > 1) { - ret = btf_repeat_fields(info, 1, nelems - 1, sz); + ret = btf_repeat_fields(info, info_cnt, 1, nelems - 1, sz); if (ret < 0) return ret; } -- cgit v1.2.3 From 434247637c66e1be2bc71a9987d4c3f0d8672387 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Tue, 8 Oct 2024 17:07:35 -0400 Subject: bpf: use kvzmalloc to allocate BPF verifier environment The kzmalloc call in bpf_check can fail when memory is very fragmented, which in turn can lead to an OOM kill. Use kvzmalloc to fall back to vmalloc when memory is too fragmented to allocate an order 3 sized bpf verifier environment. Admittedly this is not a very common case, and only happens on systems where memory has already been squeezed close to the limit, but this does not seem like much of a hot path, and it's a simple enough fix. Signed-off-by: Rik van Riel Reviewed-by: Shakeel Butt Link: https://lore.kernel.org/r/20241008170735.16766766@imladris.surriel.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 434de48cd24b..633fd6da40c2 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -22315,7 +22315,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3 /* 'struct bpf_verifier_env' can be global, but since it's not small, * allocate/free it every time bpf_check() is called */ - env = kzalloc(sizeof(struct bpf_verifier_env), GFP_KERNEL); + env = kvzalloc(sizeof(struct bpf_verifier_env), GFP_KERNEL); if (!env) return -ENOMEM; @@ -22551,6 +22551,6 @@ err_unlock: mutex_unlock(&bpf_verifier_lock); vfree(env->insn_aux_data); err_free_env: - kfree(env); + kvfree(env); return ret; } -- cgit v1.2.3 From 4deecdd29cf29844c7bd164d72dc38d2e672f64e Mon Sep 17 00:00:00 2001 From: Tyrone Wu Date: Tue, 8 Oct 2024 16:43:11 +0000 Subject: bpf: fix unpopulated name_len field in perf_event link info Previously when retrieving `bpf_link_info.perf_event` for kprobe/uprobe/tracepoint, the `name_len` field was not populated by the kernel, leaving it to reflect the value initially set by the user. This behavior was inconsistent with how other input/output string buffer fields function (e.g. `raw_tracepoint.tp_name_len`). This patch fills `name_len` with the actual size of the string name. Fixes: 1b715e1b0ec5 ("bpf: Support ->fill_link_info for perf_event") Signed-off-by: Tyrone Wu Acked-by: Jiri Olsa Acked-by: Yafang Shao Link: https://lore.kernel.org/r/20241008164312.46269-1-wudevelops@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index a8f1808a1ca5..8cfa7183d2ef 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3565,15 +3565,16 @@ static void bpf_perf_link_dealloc(struct bpf_link *link) } static int bpf_perf_link_fill_common(const struct perf_event *event, - char __user *uname, u32 ulen, + char __user *uname, u32 *ulenp, u64 *probe_offset, u64 *probe_addr, u32 *fd_type, unsigned long *missed) { const char *buf; - u32 prog_id; + u32 prog_id, ulen; size_t len; int err; + ulen = *ulenp; if (!ulen ^ !uname) return -EINVAL; @@ -3581,10 +3582,17 @@ static int bpf_perf_link_fill_common(const struct perf_event *event, probe_offset, probe_addr, missed); if (err) return err; + + if (buf) { + len = strlen(buf); + *ulenp = len + 1; + } else { + *ulenp = 1; + } if (!uname) return 0; + if (buf) { - len = strlen(buf); err = bpf_copy_to_user(uname, buf, ulen, len); if (err) return err; @@ -3609,7 +3617,7 @@ static int bpf_perf_link_fill_kprobe(const struct perf_event *event, uname = u64_to_user_ptr(info->perf_event.kprobe.func_name); ulen = info->perf_event.kprobe.name_len; - err = bpf_perf_link_fill_common(event, uname, ulen, &offset, &addr, + err = bpf_perf_link_fill_common(event, uname, &ulen, &offset, &addr, &type, &missed); if (err) return err; @@ -3617,7 +3625,7 @@ static int bpf_perf_link_fill_kprobe(const struct perf_event *event, info->perf_event.type = BPF_PERF_EVENT_KRETPROBE; else info->perf_event.type = BPF_PERF_EVENT_KPROBE; - + info->perf_event.kprobe.name_len = ulen; info->perf_event.kprobe.offset = offset; info->perf_event.kprobe.missed = missed; if (!kallsyms_show_value(current_cred())) @@ -3639,7 +3647,7 @@ static int bpf_perf_link_fill_uprobe(const struct perf_event *event, uname = u64_to_user_ptr(info->perf_event.uprobe.file_name); ulen = info->perf_event.uprobe.name_len; - err = bpf_perf_link_fill_common(event, uname, ulen, &offset, &addr, + err = bpf_perf_link_fill_common(event, uname, &ulen, &offset, &addr, &type, NULL); if (err) return err; @@ -3648,6 +3656,7 @@ static int bpf_perf_link_fill_uprobe(const struct perf_event *event, info->perf_event.type = BPF_PERF_EVENT_URETPROBE; else info->perf_event.type = BPF_PERF_EVENT_UPROBE; + info->perf_event.uprobe.name_len = ulen; info->perf_event.uprobe.offset = offset; info->perf_event.uprobe.cookie = event->bpf_cookie; return 0; @@ -3673,12 +3682,18 @@ static int bpf_perf_link_fill_tracepoint(const struct perf_event *event, { char __user *uname; u32 ulen; + int err; uname = u64_to_user_ptr(info->perf_event.tracepoint.tp_name); ulen = info->perf_event.tracepoint.name_len; + err = bpf_perf_link_fill_common(event, uname, &ulen, NULL, NULL, NULL, NULL); + if (err) + return err; + info->perf_event.type = BPF_PERF_EVENT_TRACEPOINT; + info->perf_event.tracepoint.name_len = ulen; info->perf_event.tracepoint.cookie = event->bpf_cookie; - return bpf_perf_link_fill_common(event, uname, ulen, NULL, NULL, NULL, NULL); + return 0; } static int bpf_perf_link_fill_perf_event(const struct perf_event *event, -- cgit v1.2.3 From 6cb86a0fdece87e126323ec1bb19deb16a52aedf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Thu, 10 Oct 2024 15:27:07 +0200 Subject: bpf: fix kfunc btf caching for modules MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The verifier contains a cache for looking up module BTF objects when calling kfuncs defined in modules. This cache uses a 'struct bpf_kfunc_btf_tab', which contains a sorted list of BTF objects that were already seen in the current verifier run, and the BTF objects are looked up by the offset stored in the relocated call instruction using bsearch(). The first time a given offset is seen, the module BTF is loaded from the file descriptor passed in by libbpf, and stored into the cache. However, there's a bug in the code storing the new entry: it stores a pointer to the new cache entry, then calls sort() to keep the cache sorted for the next lookup using bsearch(), and then returns the entry that was just stored through the stored pointer. However, because sort() modifies the list of entries in place *by value*, the stored pointer may no longer point to the right entry, in which case the wrong BTF object will be returned. The end result of this is an intermittent bug where, if a BPF program calls two functions with the same signature in two different modules, the function from the wrong module may sometimes end up being called. Whether this happens depends on the order of the calls in the BPF program (as that affects whether sort() reorders the array of BTF objects), making it especially hard to track down. Simon, credited as reporter below, spent significant effort analysing and creating a reproducer for this issue. The reproducer is added as a selftest in a subsequent patch. The fix is straight forward: simply don't use the stored pointer after calling sort(). Since we already have an on-stack pointer to the BTF object itself at the point where the function return, just use that, and populate it from the cache entry in the branch where the lookup succeeds. Fixes: 2357672c54c3 ("bpf: Introduce BPF support for kernel module function calls") Reported-by: Simon Sundberg Acked-by: Jiri Olsa Acked-by: Kumar Kartikeya Dwivedi Signed-off-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/r/20241010-fix-kfunc-btf-caching-for-modules-v2-1-745af6c1af98@redhat.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 633fd6da40c2..bf9996ea34fe 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2750,10 +2750,16 @@ static struct btf *__find_kfunc_desc_btf(struct bpf_verifier_env *env, b->module = mod; b->offset = offset; + /* sort() reorders entries by value, so b may no longer point + * to the right entry after this + */ sort(tab->descs, tab->nr_descs, sizeof(tab->descs[0]), kfunc_btf_cmp_by_off, NULL); + } else { + btf = b->btf; } - return b->btf; + + return btf; } void bpf_free_kfunc_btf_tab(struct bpf_kfunc_btf_tab *tab) -- cgit v1.2.3 From ae67b9fb8c4e981e929a665dcaa070f4b05ebdb4 Mon Sep 17 00:00:00 2001 From: Dimitar Kanaliev Date: Mon, 14 Oct 2024 15:11:53 +0300 Subject: bpf: Fix truncation bug in coerce_reg_to_size_sx() coerce_reg_to_size_sx() updates the register state after a sign-extension operation. However, there's a bug in the assignment order of the unsigned min/max values, leading to incorrect truncation: 0: (85) call bpf_get_prandom_u32#7 ; R0_w=scalar() 1: (57) r0 &= 1 ; R0_w=scalar(smin=smin32=0,smax=umax=smax32=umax32=1,var_off=(0x0; 0x1)) 2: (07) r0 += 254 ; R0_w=scalar(smin=umin=smin32=umin32=254,smax=umax=smax32=umax32=255,var_off=(0xfe; 0x1)) 3: (bf) r0 = (s8)r0 ; R0_w=scalar(smin=smin32=-2,smax=smax32=-1,umin=umin32=0xfffffffe,umax=0xffffffff,var_off=(0xfffffffffffffffe; 0x1)) In the current implementation, the unsigned 32-bit min/max values (u32_min_value and u32_max_value) are assigned directly from the 64-bit signed min/max values (s64_min and s64_max): reg->umin_value = reg->u32_min_value = s64_min; reg->umax_value = reg->u32_max_value = s64_max; Due to the chain assigmnent, this is equivalent to: reg->u32_min_value = s64_min; // Unintended truncation reg->umin_value = reg->u32_min_value; reg->u32_max_value = s64_max; // Unintended truncation reg->umax_value = reg->u32_max_value; Fixes: 1f9a1ea821ff ("bpf: Support new sign-extension load insns") Reported-by: Shung-Hsi Yu Reported-by: Zac Ecob Signed-off-by: Dimitar Kanaliev Acked-by: Yonghong Song Reviewed-by: Shung-Hsi Yu Link: https://lore.kernel.org/r/20241014121155.92887-2-dimitar.kanaliev@siteground.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index bf9996ea34fe..a8a0b6e4110e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6339,10 +6339,10 @@ static void coerce_reg_to_size_sx(struct bpf_reg_state *reg, int size) /* both of s64_max/s64_min positive or negative */ if ((s64_max >= 0) == (s64_min >= 0)) { - reg->smin_value = reg->s32_min_value = s64_min; - reg->smax_value = reg->s32_max_value = s64_max; - reg->umin_value = reg->u32_min_value = s64_min; - reg->umax_value = reg->u32_max_value = s64_max; + reg->s32_min_value = reg->smin_value = s64_min; + reg->s32_max_value = reg->smax_value = s64_max; + reg->u32_min_value = reg->umin_value = s64_min; + reg->u32_max_value = reg->umax_value = s64_max; reg->var_off = tnum_range(s64_min, s64_max); return; } -- cgit v1.2.3 From 9495a5b731fcaf580448a3438d63601c88367661 Mon Sep 17 00:00:00 2001 From: Jordan Rome Date: Wed, 16 Oct 2024 14:00:47 -0700 Subject: bpf: Fix iter/task tid filtering In userspace, you can add a tid filter by setting the "task.tid" field for "bpf_iter_link_info". However, `get_pid_task` when called for the `BPF_TASK_ITER_TID` type should have been using `PIDTYPE_PID` (tid) instead of `PIDTYPE_TGID` (pid). Fixes: f0d74c4da1f0 ("bpf: Parameterize task iterators.") Signed-off-by: Jordan Rome Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20241016210048.1213935-1-linux@jordanrome.com --- kernel/bpf/task_iter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index 02aa9db8d796..5af9e130e500 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -99,7 +99,7 @@ static struct task_struct *task_seq_get_next(struct bpf_iter_seq_task_common *co rcu_read_lock(); pid = find_pid_ns(common->pid, common->ns); if (pid) { - task = get_pid_task(pid, PIDTYPE_TGID); + task = get_pid_task(pid, PIDTYPE_PID); *tid = common->pid; } rcu_read_unlock(); -- cgit v1.2.3 From 3878ae04e9fc24dacb77a1d32bd87e7d8108599e Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 16 Oct 2024 15:49:11 +0200 Subject: bpf: Fix incorrect delta propagation between linked registers Nathaniel reported a bug in the linked scalar delta tracking, which can lead to accepting a program with OOB access. The specific code is related to the sync_linked_regs() function and the BPF_ADD_CONST flag, which signifies a constant offset between two scalar registers tracked by the same register id. The verifier attempts to track "similar" scalars in order to propagate bounds information learned about one scalar to others. For instance, if r1 and r2 are known to contain the same value, then upon encountering 'if (r1 != 0x1234) goto xyz', not only does it know that r1 is equal to 0x1234 on the path where that conditional jump is not taken, it also knows that r2 is. Additionally, with env->bpf_capable set, the verifier will track scalars which should be a constant delta apart (if r1 is known to be one greater than r2, then if r1 is known to be equal to 0x1234, r2 must be equal to 0x1233.) The code path for the latter in adjust_reg_min_max_vals() is reached when processing both 32 and 64-bit addition operations. While adjust_reg_min_max_vals() knows whether dst_reg was produced by a 32 or a 64-bit addition (based on the alu32 bool), the only information saved in dst_reg is the id of the source register (reg->id, or'ed by BPF_ADD_CONST) and the value of the constant offset (reg->off). Later, the function sync_linked_regs() will attempt to use this information to propagate bounds information from one register (known_reg) to others, meaning, for all R in linked_regs, it copies known_reg range (and possibly adjusting delta) into R for the case of R->id == known_reg->id. For the delta adjustment, meaning, matching reg->id with BPF_ADD_CONST, the verifier adjusts the register as reg = known_reg; reg += delta where delta is computed as (s32)reg->off - (s32)known_reg->off and placed as a scalar into a fake_reg to then simulate the addition of reg += fake_reg. This is only correct, however, if the value in reg was created by a 64-bit addition. When reg contains the result of a 32-bit addition operation, its upper 32 bits will always be zero. sync_linked_regs() on the other hand, may cause the verifier to believe that the addition between fake_reg and reg overflows into those upper bits. For example, if reg was generated by adding the constant 1 to known_reg using a 32-bit alu operation, then reg->off is 1 and known_reg->off is 0. If known_reg is known to be the constant 0xFFFFFFFF, sync_linked_regs() will tell the verifier that reg is equal to the constant 0x100000000. This is incorrect as the actual value of reg will be 0, as the 32-bit addition will wrap around. Example: 0: (b7) r0 = 0; R0_w=0 1: (18) r1 = 0x80000001; R1_w=0x80000001 3: (37) r1 /= 1; R1_w=scalar() 4: (bf) r2 = r1; R1_w=scalar(id=1) R2_w=scalar(id=1) 5: (bf) r4 = r1; R1_w=scalar(id=1) R4_w=scalar(id=1) 6: (04) w2 += 2147483647; R2_w=scalar(id=1+2147483647,smin=0,smax=umax=0xffffffff,var_off=(0x0; 0xffffffff)) 7: (04) w4 += 0 ; R4_w=scalar(id=1+0,smin=0,smax=umax=0xffffffff,var_off=(0x0; 0xffffffff)) 8: (15) if r2 == 0x0 goto pc+1 10: R0=0 R1=0xffffffff80000001 R2=0x7fffffff R4=0xffffffff80000001 R10=fp0 What can be seen here is that r1 is copied to r2 and r4, such that {r1,r2,r4}.id are all the same which later lets sync_linked_regs() to be invoked. Then, in a next step constants are added with alu32 to r2 and r4, setting their ->off, as well as id |= BPF_ADD_CONST. Next, the conditional will bind r2 and propagate ranges to its linked registers. The verifier now believes the upper 32 bits of r4 are r4=0xffffffff80000001, while actually r4=r1=0x80000001. One approach for a simple fix suitable also for stable is to limit the constant delta tracking to only 64-bit alu addition. If necessary at some later point, BPF_ADD_CONST could be split into BPF_ADD_CONST64 and BPF_ADD_CONST32 to avoid mixing the two under the tradeoff to further complicate sync_linked_regs(). However, none of the added tests from dedf56d775c0 ("selftests/bpf: Add tests for add_const") make this necessary at this point, meaning, BPF CI also passes with just limiting tracking to 64-bit alu addition. Fixes: 98d7ca374ba4 ("bpf: Track delta between "linked" registers.") Reported-by: Nathaniel Theis Signed-off-by: Daniel Borkmann Signed-off-by: Andrii Nakryiko Reviewed-by: Eduard Zingerman Link: https://lore.kernel.org/bpf/20241016134913.32249-1-daniel@iogearbox.net --- kernel/bpf/verifier.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a8a0b6e4110e..411ab1b57af4 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -14270,12 +14270,13 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, * r1 += 0x1 * if r2 < 1000 goto ... * use r1 in memory access - * So remember constant delta between r2 and r1 and update r1 after - * 'if' condition. + * So for 64-bit alu remember constant delta between r2 and r1 and + * update r1 after 'if' condition. */ - if (env->bpf_capable && BPF_OP(insn->code) == BPF_ADD && - dst_reg->id && is_reg_const(src_reg, alu32)) { - u64 val = reg_const_value(src_reg, alu32); + if (env->bpf_capable && + BPF_OP(insn->code) == BPF_ADD && !alu32 && + dst_reg->id && is_reg_const(src_reg, false)) { + u64 val = reg_const_value(src_reg, false); if ((dst_reg->id & BPF_ADD_CONST) || /* prevent overflow in sync_linked_regs() later */ -- cgit v1.2.3 From 3e9e708757ca3b7eb65a820031d62fea1a265709 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 16 Oct 2024 15:49:12 +0200 Subject: bpf: Fix print_reg_state's constant scalar dump print_reg_state() should not consider adding reg->off to reg->var_off.value when dumping scalars. Scalars can be produced with reg->off != 0 through BPF_ADD_CONST, and thus as-is this can skew the register log dump. Fixes: 98d7ca374ba4 ("bpf: Track delta between "linked" registers.") Reported-by: Nathaniel Theis Signed-off-by: Daniel Borkmann Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20241016134913.32249-2-daniel@iogearbox.net --- kernel/bpf/log.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c index 5aebfc3051e3..4a858fdb6476 100644 --- a/kernel/bpf/log.c +++ b/kernel/bpf/log.c @@ -688,8 +688,7 @@ static void print_reg_state(struct bpf_verifier_env *env, if (t == SCALAR_VALUE && reg->precise) verbose(env, "P"); if (t == SCALAR_VALUE && tnum_is_const(reg->var_off)) { - /* reg->off should be 0 for SCALAR_VALUE */ - verbose_snum(env, reg->var_off.value + reg->off); + verbose_snum(env, reg->var_off.value); return; } -- cgit v1.2.3 From 1f97c03f43fadc407de5b5cb01c07755053e1c22 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Tue, 22 Oct 2024 21:01:33 +0800 Subject: bpf: Preserve param->string when parsing mount options In bpf_parse_param(), keep the value of param->string intact so it can be freed later. Otherwise, the kmalloc area pointed to by param->string will be leaked as shown below: unreferenced object 0xffff888118c46d20 (size 8): comm "new_name", pid 12109, jiffies 4295580214 hex dump (first 8 bytes): 61 6e 79 00 38 c9 5c 7e any.8.\~ backtrace (crc e1b7f876): [<00000000c6848ac7>] kmemleak_alloc+0x4b/0x80 [<00000000de9f7d00>] __kmalloc_node_track_caller_noprof+0x36e/0x4a0 [<000000003e29b886>] memdup_user+0x32/0xa0 [<0000000007248326>] strndup_user+0x46/0x60 [<0000000035b3dd29>] __x64_sys_fsconfig+0x368/0x3d0 [<0000000018657927>] x64_sys_call+0xff/0x9f0 [<00000000c0cabc95>] do_syscall_64+0x3b/0xc0 [<000000002f331597>] entry_SYSCALL_64_after_hwframe+0x4b/0x53 Fixes: 6c1752e0b6ca ("bpf: Support symbolic BPF FS delegation mount options") Signed-off-by: Hou Tao Signed-off-by: Andrii Nakryiko Acked-by: Jiri Olsa Link: https://lore.kernel.org/bpf/20241022130133.3798232-1-houtao@huaweicloud.com --- kernel/bpf/inode.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index d8fc5eba529d..9aaf5124648b 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -880,7 +880,7 @@ static int bpf_parse_param(struct fs_context *fc, struct fs_parameter *param) const struct btf_type *enum_t; const char *enum_pfx; u64 *delegate_msk, msk = 0; - char *p; + char *p, *str; int val; /* ignore errors, fallback to hex */ @@ -911,7 +911,8 @@ static int bpf_parse_param(struct fs_context *fc, struct fs_parameter *param) return -EINVAL; } - while ((p = strsep(¶m->string, ":"))) { + str = param->string; + while ((p = strsep(&str, ":"))) { if (strcmp(p, "any") == 0) { msk |= ~0ULL; } else if (find_btf_enum_const(info.btf, enum_t, enum_pfx, p, &val)) { -- cgit v1.2.3 From 6fad274f06f038c29660aa53fbad14241c9fd976 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 21 Oct 2024 17:28:05 +0200 Subject: bpf: Add MEM_WRITE attribute Add a MEM_WRITE attribute for BPF helper functions which can be used in bpf_func_proto to annotate an argument type in order to let the verifier know that the helper writes into the memory passed as an argument. In the past MEM_UNINIT has been (ab)used for this function, but the latter merely tells the verifier that the passed memory can be uninitialized. There have been bugs with overloading the latter but aside from that there are also cases where the passed memory is read + written which currently cannot be expressed, see also 4b3786a6c539 ("bpf: Zero former ARG_PTR_TO_{LONG,INT} args in case of error"). Signed-off-by: Daniel Borkmann Acked-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20241021152809.33343-1-daniel@iogearbox.net Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 10 +++++----- kernel/bpf/ringbuf.c | 2 +- kernel/bpf/syscall.c | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 1a43d06eab28..ca3f0a2e5ed5 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -111,7 +111,7 @@ const struct bpf_func_proto bpf_map_pop_elem_proto = { .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, - .arg2_type = ARG_PTR_TO_MAP_VALUE | MEM_UNINIT, + .arg2_type = ARG_PTR_TO_MAP_VALUE | MEM_UNINIT | MEM_WRITE, }; BPF_CALL_2(bpf_map_peek_elem, struct bpf_map *, map, void *, value) @@ -124,7 +124,7 @@ const struct bpf_func_proto bpf_map_peek_elem_proto = { .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, - .arg2_type = ARG_PTR_TO_MAP_VALUE | MEM_UNINIT, + .arg2_type = ARG_PTR_TO_MAP_VALUE | MEM_UNINIT | MEM_WRITE, }; BPF_CALL_3(bpf_map_lookup_percpu_elem, struct bpf_map *, map, void *, key, u32, cpu) @@ -538,7 +538,7 @@ const struct bpf_func_proto bpf_strtol_proto = { .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg2_type = ARG_CONST_SIZE, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_ALIGNED, + .arg4_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_WRITE | MEM_ALIGNED, .arg4_size = sizeof(s64), }; @@ -566,7 +566,7 @@ const struct bpf_func_proto bpf_strtoul_proto = { .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg2_type = ARG_CONST_SIZE, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_ALIGNED, + .arg4_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_WRITE | MEM_ALIGNED, .arg4_size = sizeof(u64), }; @@ -1742,7 +1742,7 @@ static const struct bpf_func_proto bpf_dynptr_from_mem_proto = { .arg1_type = ARG_PTR_TO_UNINIT_MEM, .arg2_type = ARG_CONST_SIZE_OR_ZERO, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL | MEM_UNINIT, + .arg4_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL | MEM_UNINIT | MEM_WRITE, }; BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, const struct bpf_dynptr_kern *, src, diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c index de3b681d1d13..e1cfe890e0be 100644 --- a/kernel/bpf/ringbuf.c +++ b/kernel/bpf/ringbuf.c @@ -632,7 +632,7 @@ const struct bpf_func_proto bpf_ringbuf_reserve_dynptr_proto = { .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_RINGBUF | MEM_UNINIT, + .arg4_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_RINGBUF | MEM_UNINIT | MEM_WRITE, }; BPF_CALL_2(bpf_ringbuf_submit_dynptr, struct bpf_dynptr_kern *, ptr, u64, flags) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 8cfa7183d2ef..67179529080b 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -5892,7 +5892,7 @@ static const struct bpf_func_proto bpf_kallsyms_lookup_name_proto = { .arg1_type = ARG_PTR_TO_MEM, .arg2_type = ARG_CONST_SIZE_OR_ZERO, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_ALIGNED, + .arg4_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_WRITE | MEM_ALIGNED, .arg4_size = sizeof(u64), }; -- cgit v1.2.3 From 8ea607330a39184f51737c6ae706db7fdca7628e Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 21 Oct 2024 17:28:06 +0200 Subject: bpf: Fix overloading of MEM_UNINIT's meaning Lonial reported an issue in the BPF verifier where check_mem_size_reg() has the following code: if (!tnum_is_const(reg->var_off)) /* For unprivileged variable accesses, disable raw * mode so that the program is required to * initialize all the memory that the helper could * just partially fill up. */ meta = NULL; This means that writes are not checked when the register containing the size of the passed buffer has not a fixed size. Through this bug, a BPF program can write to a map which is marked as read-only, for example, .rodata global maps. The problem is that MEM_UNINIT's initial meaning that "the passed buffer to the BPF helper does not need to be initialized" which was added back in commit 435faee1aae9 ("bpf, verifier: add ARG_PTR_TO_RAW_STACK type") got overloaded over time with "the passed buffer is being written to". The problem however is that checks such as the above which were added later via 06c1c049721a ("bpf: allow helpers access to variable memory") set meta to NULL in order force the user to always initialize the passed buffer to the helper. Due to the current double meaning of MEM_UNINIT, this bypasses verifier write checks to the memory (not boundary checks though) and only assumes the latter memory is read instead. Fix this by reverting MEM_UNINIT back to its original meaning, and having MEM_WRITE as an annotation to BPF helpers in order to then trigger the BPF verifier checks for writing to memory. Some notes: check_arg_pair_ok() ensures that for ARG_CONST_SIZE{,_OR_ZERO} we can access fn->arg_type[arg - 1] since it must contain a preceding ARG_PTR_TO_MEM. For check_mem_reg() the meta argument can be removed altogether since we do check both BPF_READ and BPF_WRITE. Same for the equivalent check_kfunc_mem_size_reg(). Fixes: 7b3552d3f9f6 ("bpf: Reject writes for PTR_TO_MAP_KEY in check_helper_mem_access") Fixes: 97e6d7dab1ca ("bpf: Check PTR_TO_MEM | MEM_RDONLY in check_helper_mem_access") Fixes: 15baa55ff5b0 ("bpf/verifier: allow all functions to read user provided context") Reported-by: Lonial Con Signed-off-by: Daniel Borkmann Acked-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20241021152809.33343-2-daniel@iogearbox.net Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 73 ++++++++++++++++++++++++--------------------------- 1 file changed, 35 insertions(+), 38 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 411ab1b57af4..f9e2f1cd4975 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -7438,7 +7438,8 @@ mark: } static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, - int access_size, bool zero_size_allowed, + int access_size, enum bpf_access_type access_type, + bool zero_size_allowed, struct bpf_call_arg_meta *meta) { struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; @@ -7450,7 +7451,7 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, return check_packet_access(env, regno, reg->off, access_size, zero_size_allowed); case PTR_TO_MAP_KEY: - if (meta && meta->raw_mode) { + if (access_type == BPF_WRITE) { verbose(env, "R%d cannot write into %s\n", regno, reg_type_str(env, reg->type)); return -EACCES; @@ -7458,15 +7459,13 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, return check_mem_region_access(env, regno, reg->off, access_size, reg->map_ptr->key_size, false); case PTR_TO_MAP_VALUE: - if (check_map_access_type(env, regno, reg->off, access_size, - meta && meta->raw_mode ? BPF_WRITE : - BPF_READ)) + if (check_map_access_type(env, regno, reg->off, access_size, access_type)) return -EACCES; return check_map_access(env, regno, reg->off, access_size, zero_size_allowed, ACCESS_HELPER); case PTR_TO_MEM: if (type_is_rdonly_mem(reg->type)) { - if (meta && meta->raw_mode) { + if (access_type == BPF_WRITE) { verbose(env, "R%d cannot write into %s\n", regno, reg_type_str(env, reg->type)); return -EACCES; @@ -7477,7 +7476,7 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, zero_size_allowed); case PTR_TO_BUF: if (type_is_rdonly_mem(reg->type)) { - if (meta && meta->raw_mode) { + if (access_type == BPF_WRITE) { verbose(env, "R%d cannot write into %s\n", regno, reg_type_str(env, reg->type)); return -EACCES; @@ -7505,7 +7504,6 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, * Dynamically check it now. */ if (!env->ops->convert_ctx_access) { - enum bpf_access_type atype = meta && meta->raw_mode ? BPF_WRITE : BPF_READ; int offset = access_size - 1; /* Allow zero-byte read from PTR_TO_CTX */ @@ -7513,7 +7511,7 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, return zero_size_allowed ? 0 : -EACCES; return check_mem_access(env, env->insn_idx, regno, offset, BPF_B, - atype, -1, false, false); + access_type, -1, false, false); } fallthrough; @@ -7538,6 +7536,7 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, */ static int check_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, u32 regno, + enum bpf_access_type access_type, bool zero_size_allowed, struct bpf_call_arg_meta *meta) { @@ -7553,15 +7552,12 @@ static int check_mem_size_reg(struct bpf_verifier_env *env, */ meta->msize_max_value = reg->umax_value; - /* The register is SCALAR_VALUE; the access check - * happens using its boundaries. + /* The register is SCALAR_VALUE; the access check happens using + * its boundaries. For unprivileged variable accesses, disable + * raw mode so that the program is required to initialize all + * the memory that the helper could just partially fill up. */ if (!tnum_is_const(reg->var_off)) - /* For unprivileged variable accesses, disable raw - * mode so that the program is required to - * initialize all the memory that the helper could - * just partially fill up. - */ meta = NULL; if (reg->smin_value < 0) { @@ -7581,9 +7577,8 @@ static int check_mem_size_reg(struct bpf_verifier_env *env, regno); return -EACCES; } - err = check_helper_mem_access(env, regno - 1, - reg->umax_value, - zero_size_allowed, meta); + err = check_helper_mem_access(env, regno - 1, reg->umax_value, + access_type, zero_size_allowed, meta); if (!err) err = mark_chain_precision(env, regno); return err; @@ -7594,13 +7589,11 @@ static int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg { bool may_be_null = type_may_be_null(reg->type); struct bpf_reg_state saved_reg; - struct bpf_call_arg_meta meta; int err; if (register_is_null(reg)) return 0; - memset(&meta, 0, sizeof(meta)); /* Assuming that the register contains a value check if the memory * access is safe. Temporarily save and restore the register's state as * the conversion shouldn't be visible to a caller. @@ -7610,10 +7603,8 @@ static int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg mark_ptr_not_null_reg(reg); } - err = check_helper_mem_access(env, regno, mem_size, true, &meta); - /* Check access for BPF_WRITE */ - meta.raw_mode = true; - err = err ?: check_helper_mem_access(env, regno, mem_size, true, &meta); + err = check_helper_mem_access(env, regno, mem_size, BPF_READ, true, NULL); + err = err ?: check_helper_mem_access(env, regno, mem_size, BPF_WRITE, true, NULL); if (may_be_null) *reg = saved_reg; @@ -7639,13 +7630,12 @@ static int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg mark_ptr_not_null_reg(mem_reg); } - err = check_mem_size_reg(env, reg, regno, true, &meta); - /* Check access for BPF_WRITE */ - meta.raw_mode = true; - err = err ?: check_mem_size_reg(env, reg, regno, true, &meta); + err = check_mem_size_reg(env, reg, regno, BPF_READ, true, &meta); + err = err ?: check_mem_size_reg(env, reg, regno, BPF_WRITE, true, &meta); if (may_be_null) *mem_reg = saved_reg; + return err; } @@ -8948,9 +8938,8 @@ skip_type_check: verbose(env, "invalid map_ptr to access map->key\n"); return -EACCES; } - err = check_helper_mem_access(env, regno, - meta->map_ptr->key_size, false, - NULL); + err = check_helper_mem_access(env, regno, meta->map_ptr->key_size, + BPF_READ, false, NULL); break; case ARG_PTR_TO_MAP_VALUE: if (type_may_be_null(arg_type) && register_is_null(reg)) @@ -8965,9 +8954,9 @@ skip_type_check: return -EACCES; } meta->raw_mode = arg_type & MEM_UNINIT; - err = check_helper_mem_access(env, regno, - meta->map_ptr->value_size, false, - meta); + err = check_helper_mem_access(env, regno, meta->map_ptr->value_size, + arg_type & MEM_WRITE ? BPF_WRITE : BPF_READ, + false, meta); break; case ARG_PTR_TO_PERCPU_BTF_ID: if (!reg->btf_id) { @@ -9009,7 +8998,9 @@ skip_type_check: */ meta->raw_mode = arg_type & MEM_UNINIT; if (arg_type & MEM_FIXED_SIZE) { - err = check_helper_mem_access(env, regno, fn->arg_size[arg], false, meta); + err = check_helper_mem_access(env, regno, fn->arg_size[arg], + arg_type & MEM_WRITE ? BPF_WRITE : BPF_READ, + false, meta); if (err) return err; if (arg_type & MEM_ALIGNED) @@ -9017,10 +9008,16 @@ skip_type_check: } break; case ARG_CONST_SIZE: - err = check_mem_size_reg(env, reg, regno, false, meta); + err = check_mem_size_reg(env, reg, regno, + fn->arg_type[arg - 1] & MEM_WRITE ? + BPF_WRITE : BPF_READ, + false, meta); break; case ARG_CONST_SIZE_OR_ZERO: - err = check_mem_size_reg(env, reg, regno, true, meta); + err = check_mem_size_reg(env, reg, regno, + fn->arg_type[arg - 1] & MEM_WRITE ? + BPF_WRITE : BPF_READ, + true, meta); break; case ARG_PTR_TO_DYNPTR: err = process_dynptr_func(env, regno, insn_idx, arg_type, 0); -- cgit v1.2.3 From 9806f283140ef3e4d259b7646bd8c66026bbaac5 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 23 Oct 2024 09:19:16 -0700 Subject: bpf: fix do_misc_fixups() for bpf_get_branch_snapshot() We need `goto next_insn;` at the end of patching instead of `continue;`. It currently works by accident by making verifier re-process patched instructions. Reported-by: Shung-Hsi Yu Fixes: 314a53623cd4 ("bpf: inline bpf_get_branch_snapshot() helper") Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Acked-by: Shung-Hsi Yu Link: https://lore.kernel.org/r/20241023161916.2896274-1-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f9e2f1cd4975..587a6c76e564 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -21210,7 +21210,7 @@ patch_map_ops_generic: delta += cnt - 1; env->prog = prog = new_prog; insn = new_prog->insnsi + i + delta; - continue; + goto next_insn; } /* Implement bpf_kptr_xchg inline */ -- cgit v1.2.3 From 8421d4c8762bd022cb491f2f0f7019ef51b4f0a7 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Thu, 24 Oct 2024 09:35:58 +0800 Subject: bpf: Check validity of link->type in bpf_link_show_fdinfo() If a newly-added link type doesn't invoke BPF_LINK_TYPE(), accessing bpf_link_type_strs[link->type] may result in an out-of-bounds access. To spot such missed invocations early in the future, checking the validity of link->type in bpf_link_show_fdinfo() and emitting a warning when such invocations are missed. Signed-off-by: Hou Tao Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20241024013558.1135167-3-houtao@huaweicloud.com --- kernel/bpf/syscall.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 67179529080b..c5aa127ed4cc 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3069,13 +3069,17 @@ static void bpf_link_show_fdinfo(struct seq_file *m, struct file *filp) { const struct bpf_link *link = filp->private_data; const struct bpf_prog *prog = link->prog; + enum bpf_link_type type = link->type; char prog_tag[sizeof(prog->tag) * 2 + 1] = { }; - seq_printf(m, - "link_type:\t%s\n" - "link_id:\t%u\n", - bpf_link_type_strs[link->type], - link->id); + if (type < ARRAY_SIZE(bpf_link_type_strs) && bpf_link_type_strs[type]) { + seq_printf(m, "link_type:\t%s\n", bpf_link_type_strs[type]); + } else { + WARN_ONCE(1, "missing BPF_LINK_TYPE(...) for link type %u\n", type); + seq_printf(m, "link_type:\t<%u>\n", type); + } + seq_printf(m, "link_id:\t%u\n", link->id); + if (prog) { bin2hex(prog_tag, prog->tag, sizeof(prog->tag)); seq_printf(m, -- cgit v1.2.3 From aa30eb3260b2dea3a68d3c42a39f9a09c5e99cee Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Tue, 29 Oct 2024 10:26:40 -0700 Subject: bpf: Force checkpoint when jmp history is too long A specifically crafted program might trick verifier into growing very long jump history within a single bpf_verifier_state instance. Very long jump history makes mark_chain_precision() unreasonably slow, especially in case if verifier processes a loop. Mitigate this by forcing new state in is_state_visited() in case if current state's jump history is too long. Use same constant as in `skip_inf_loop_check`, but multiply it by arbitrarily chosen value 2 to account for jump history containing not only information about jumps, but also information about stack access. For an example of problematic program consider the code below, w/o this patch the example is processed by verifier for ~15 minutes, before failing to allocate big-enough chunk for jmp_history. 0: r7 = *(u16 *)(r1 +0);" 1: r7 += 0x1ab064b9;" 2: if r7 & 0x702000 goto 1b; 3: r7 &= 0x1ee60e;" 4: r7 += r1;" 5: if r7 s> 0x37d2 goto +0;" 6: r0 = 0;" 7: exit;" Perf profiling shows that most of the time is spent in mark_chain_precision() ~95%. The easiest way to explain why this program causes problems is to apply the following patch: diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 0c216e71cec7..4b4823961abe 100644 \--- a/include/linux/bpf.h \+++ b/include/linux/bpf.h \@@ -1926,7 +1926,7 @@ struct bpf_array { }; }; -#define BPF_COMPLEXITY_LIMIT_INSNS 1000000 /* yes. 1M insns */ +#define BPF_COMPLEXITY_LIMIT_INSNS 256 /* yes. 1M insns */ #define MAX_TAIL_CALL_CNT 33 /* Maximum number of loops for bpf_loop and bpf_iter_num. diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f514247ba8ba..75e88be3bb3e 100644 \--- a/kernel/bpf/verifier.c \+++ b/kernel/bpf/verifier.c \@@ -18024,8 +18024,13 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) skip_inf_loop_check: if (!force_new_state && env->jmps_processed - env->prev_jmps_processed < 20 && - env->insn_processed - env->prev_insn_processed < 100) + env->insn_processed - env->prev_insn_processed < 100) { + verbose(env, "is_state_visited: suppressing checkpoint at %d, %d jmps processed, cur->jmp_history_cnt is %d\n", + env->insn_idx, + env->jmps_processed - env->prev_jmps_processed, + cur->jmp_history_cnt); add_new_state = false; + } goto miss; } /* If sl->state is a part of a loop and this loop's entry is a part of \@@ -18142,6 +18147,9 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) if (!add_new_state) return 0; + verbose(env, "is_state_visited: new checkpoint at %d, resetting env->jmps_processed\n", + env->insn_idx); + /* There were no equivalent states, remember the current one. * Technically the current state is not proven to be safe yet, * but it will either reach outer most bpf_exit (which means it's safe) And observe verification log: ... is_state_visited: new checkpoint at 5, resetting env->jmps_processed 5: R1=ctx() R7=ctx(...) 5: (65) if r7 s> 0x37d2 goto pc+0 ; R7=ctx(...) 6: (b7) r0 = 0 ; R0_w=0 7: (95) exit from 5 to 6: R1=ctx() R7=ctx(...) R10=fp0 6: R1=ctx() R7=ctx(...) R10=fp0 6: (b7) r0 = 0 ; R0_w=0 7: (95) exit is_state_visited: suppressing checkpoint at 1, 3 jmps processed, cur->jmp_history_cnt is 74 from 2 to 1: R1=ctx() R7_w=scalar(...) R10=fp0 1: R1=ctx() R7_w=scalar(...) R10=fp0 1: (07) r7 += 447767737 is_state_visited: suppressing checkpoint at 2, 3 jmps processed, cur->jmp_history_cnt is 75 2: R7_w=scalar(...) 2: (45) if r7 & 0x702000 goto pc-2 ... mark_precise 152 steps for r7 ... 2: R7_w=scalar(...) is_state_visited: suppressing checkpoint at 1, 4 jmps processed, cur->jmp_history_cnt is 75 1: (07) r7 += 447767737 is_state_visited: suppressing checkpoint at 2, 4 jmps processed, cur->jmp_history_cnt is 76 2: R7_w=scalar(...) 2: (45) if r7 & 0x702000 goto pc-2 ... BPF program is too large. Processed 257 insn The log output shows that checkpoint at label (1) is never created, because it is suppressed by `skip_inf_loop_check` logic: a. When 'if' at (2) is processed it pushes a state with insn_idx (1) onto stack and proceeds to (3); b. At (5) checkpoint is created, and this resets env->{jmps,insns}_processed. c. Verification proceeds and reaches `exit`; d. State saved at step (a) is popped from stack and is_state_visited() considers if checkpoint needs to be added, but because env->{jmps,insns}_processed had been just reset at step (b) the `skip_inf_loop_check` logic forces `add_new_state` to false. e. Verifier proceeds with current state, which slowly accumulates more and more entries in the jump history. The accumulation of entries in the jump history is a problem because of two factors: - it eventually exhausts memory available for kmalloc() allocation; - mark_chain_precision() traverses the jump history of a state, meaning that if `r7` is marked precise, verifier would iterate ever growing jump history until parent state boundary is reached. (note: the log also shows a REG INVARIANTS VIOLATION warning upon jset processing, but that's another bug to fix). With this patch applied, the example above is rejected by verifier under 1s of time, reaching 1M instructions limit. The program is a simplified reproducer from syzbot report. Previous discussion could be found at [1]. The patch does not cause any changes in verification performance, when tested on selftests from veristat.cfg and cilium programs taken from [2]. [1] https://lore.kernel.org/bpf/20241009021254.2805446-1-eddyz87@gmail.com/ [2] https://github.com/anakryiko/cilium Changelog: - v1 -> v2: - moved patch to bpf tree; - moved force_new_state variable initialization after declaration and shortened the comment. v1: https://lore.kernel.org/bpf/20241018020307.1766906-1-eddyz87@gmail.com/ Fixes: 2589726d12a1 ("bpf: introduce bounded loops") Reported-by: syzbot+7e46cdef14bf496a3ab4@syzkaller.appspotmail.com Signed-off-by: Eduard Zingerman Signed-off-by: Andrii Nakryiko Acked-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20241029172641.1042523-1-eddyz87@gmail.com Closes: https://lore.kernel.org/bpf/670429f6.050a0220.49194.0517.GAE@google.com/ --- kernel/bpf/verifier.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 587a6c76e564..3254c27085b8 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -17886,9 +17886,11 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) struct bpf_verifier_state_list *sl, **pprev; struct bpf_verifier_state *cur = env->cur_state, *new, *loop_entry; int i, j, n, err, states_cnt = 0; - bool force_new_state = env->test_state_freq || is_force_checkpoint(env, insn_idx); - bool add_new_state = force_new_state; - bool force_exact; + bool force_new_state, add_new_state, force_exact; + + force_new_state = env->test_state_freq || is_force_checkpoint(env, insn_idx) || + /* Avoid accumulating infinitely long jmp history */ + cur->jmp_history_cnt > 40; /* bpf progs typically have pruning point every 4 instructions * http://vger.kernel.org/bpfconf2019.html#session-1 @@ -17898,6 +17900,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) * In tests that amounts to up to 50% reduction into total verifier * memory consumption and 20% verifier time speedup. */ + add_new_state = force_new_state; if (env->jmps_processed - env->prev_jmps_processed >= 2 && env->insn_processed - env->prev_insn_processed >= 8) add_new_state = true; -- cgit v1.2.3 From 13400ac8fb80c57c2bfb12ebd35ee121ce9b4d21 Mon Sep 17 00:00:00 2001 From: Byeonguk Jeong Date: Sat, 26 Oct 2024 14:02:43 +0900 Subject: bpf: Fix out-of-bounds write in trie_get_next_key() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit trie_get_next_key() allocates a node stack with size trie->max_prefixlen, while it writes (trie->max_prefixlen + 1) nodes to the stack when it has full paths from the root to leaves. For example, consider a trie with max_prefixlen is 8, and the nodes with key 0x00/0, 0x00/1, 0x00/2, ... 0x00/8 inserted. Subsequent calls to trie_get_next_key with _key with .prefixlen = 8 make 9 nodes be written on the node stack with size 8. Fixes: b471f2f1de8b ("bpf: implement MAP_GET_NEXT_KEY command for LPM_TRIE map") Signed-off-by: Byeonguk Jeong Reviewed-by: Toke Høiland-Jørgensen Tested-by: Hou Tao Acked-by: Hou Tao Link: https://lore.kernel.org/r/Zxx384ZfdlFYnz6J@localhost.localdomain Signed-off-by: Alexei Starovoitov --- kernel/bpf/lpm_trie.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 0218a5132ab5..9b60eda0f727 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -655,7 +655,7 @@ static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key) if (!key || key->prefixlen > trie->max_prefixlen) goto find_leftmost; - node_stack = kmalloc_array(trie->max_prefixlen, + node_stack = kmalloc_array(trie->max_prefixlen + 1, sizeof(struct lpm_trie_node *), GFP_ATOMIC | __GFP_NOWARN); if (!node_stack) -- cgit v1.2.3 From d0b98f6a17a5cb336121302bce0c97eb5fe32d16 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Tue, 29 Oct 2024 12:39:11 -0700 Subject: bpf: disallow 40-bytes extra stack for bpf_fastcall patterns Hou Tao reported an issue with bpf_fastcall patterns allowing extra stack space above MAX_BPF_STACK limit. This extra stack allowance is not integrated properly with the following verifier parts: - backtracking logic still assumes that stack can't exceed MAX_BPF_STACK; - bpf_verifier_env->scratched_stack_slots assumes only 64 slots are available. Here is an example of an issue with precision tracking (note stack slot -8 tracked as precise instead of -520): 0: (b7) r1 = 42 ; R1_w=42 1: (b7) r2 = 42 ; R2_w=42 2: (7b) *(u64 *)(r10 -512) = r1 ; R1_w=42 R10=fp0 fp-512_w=42 3: (7b) *(u64 *)(r10 -520) = r2 ; R2_w=42 R10=fp0 fp-520_w=42 4: (85) call bpf_get_smp_processor_id#8 ; R0_w=scalar(...) 5: (79) r2 = *(u64 *)(r10 -520) ; R2_w=42 R10=fp0 fp-520_w=42 6: (79) r1 = *(u64 *)(r10 -512) ; R1_w=42 R10=fp0 fp-512_w=42 7: (bf) r3 = r10 ; R3_w=fp0 R10=fp0 8: (0f) r3 += r2 mark_precise: frame0: last_idx 8 first_idx 0 subseq_idx -1 mark_precise: frame0: regs=r2 stack= before 7: (bf) r3 = r10 mark_precise: frame0: regs=r2 stack= before 6: (79) r1 = *(u64 *)(r10 -512) mark_precise: frame0: regs=r2 stack= before 5: (79) r2 = *(u64 *)(r10 -520) mark_precise: frame0: regs= stack=-8 before 4: (85) call bpf_get_smp_processor_id#8 mark_precise: frame0: regs= stack=-8 before 3: (7b) *(u64 *)(r10 -520) = r2 mark_precise: frame0: regs=r2 stack= before 2: (7b) *(u64 *)(r10 -512) = r1 mark_precise: frame0: regs=r2 stack= before 1: (b7) r2 = 42 9: R2_w=42 R3_w=fp42 9: (95) exit This patch disables the additional allowance for the moment. Also, two test cases are removed: - bpf_fastcall_max_stack_ok: it fails w/o additional stack allowance; - bpf_fastcall_max_stack_fail: this test is no longer necessary, stack size follows regular rules, pattern invalidation is checked by other test cases. Reported-by: Hou Tao Closes: https://lore.kernel.org/bpf/20241023022752.172005-1-houtao@huaweicloud.com/ Fixes: 5b5f51bff1b6 ("bpf: no_caller_saved_registers attribute for helper calls") Signed-off-by: Eduard Zingerman Acked-by: Andrii Nakryiko Tested-by: Hou Tao Link: https://lore.kernel.org/r/20241029193911.1575719-1-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 3254c27085b8..bb99bada7e2e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6804,20 +6804,10 @@ static int check_stack_slot_within_bounds(struct bpf_verifier_env *env, struct bpf_func_state *state, enum bpf_access_type t) { - struct bpf_insn_aux_data *aux = &env->insn_aux_data[env->insn_idx]; - int min_valid_off, max_bpf_stack; - - /* If accessing instruction is a spill/fill from bpf_fastcall pattern, - * add room for all caller saved registers below MAX_BPF_STACK. - * In case if bpf_fastcall rewrite won't happen maximal stack depth - * would be checked by check_max_stack_depth_subprog(). - */ - max_bpf_stack = MAX_BPF_STACK; - if (aux->fastcall_pattern) - max_bpf_stack += CALLER_SAVED_REGS * BPF_REG_SIZE; + int min_valid_off; if (t == BPF_WRITE || env->allow_uninit_stack) - min_valid_off = -max_bpf_stack; + min_valid_off = -MAX_BPF_STACK; else min_valid_off = -state->allocated_stack; -- cgit v1.2.3 From 101ccfbabf4738041273ce64e2b116cf440dea13 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Wed, 30 Oct 2024 18:05:12 +0800 Subject: bpf: Free dynamically allocated bits in bpf_iter_bits_destroy() bpf_iter_bits_destroy() uses "kit->nr_bits <= 64" to check whether the bits are dynamically allocated. However, the check is incorrect and may cause a kmemleak as shown below: unreferenced object 0xffff88812628c8c0 (size 32): comm "swapper/0", pid 1, jiffies 4294727320 hex dump (first 32 bytes): b0 c1 55 f5 81 88 ff ff f0 f0 f0 f0 f0 f0 f0 f0 ..U........... f0 f0 f0 f0 f0 f0 f0 f0 00 00 00 00 00 00 00 00 .............. backtrace (crc 781e32cc): [<00000000c452b4ab>] kmemleak_alloc+0x4b/0x80 [<0000000004e09f80>] __kmalloc_node_noprof+0x480/0x5c0 [<00000000597124d6>] __alloc.isra.0+0x89/0xb0 [<000000004ebfffcd>] alloc_bulk+0x2af/0x720 [<00000000d9c10145>] prefill_mem_cache+0x7f/0xb0 [<00000000ff9738ff>] bpf_mem_alloc_init+0x3e2/0x610 [<000000008b616eac>] bpf_global_ma_init+0x19/0x30 [<00000000fc473efc>] do_one_initcall+0xd3/0x3c0 [<00000000ec81498c>] kernel_init_freeable+0x66a/0x940 [<00000000b119f72f>] kernel_init+0x20/0x160 [<00000000f11ac9a7>] ret_from_fork+0x3c/0x70 [<0000000004671da4>] ret_from_fork_asm+0x1a/0x30 That is because nr_bits will be set as zero in bpf_iter_bits_next() after all bits have been iterated. Fix the issue by setting kit->bit to kit->nr_bits instead of setting kit->nr_bits to zero when the iteration completes in bpf_iter_bits_next(). In addition, use "!nr_bits || bits >= nr_bits" to check whether the iteration is complete and still use "nr_bits > 64" to indicate whether bits are dynamically allocated. The "!nr_bits" check is necessary because bpf_iter_bits_new() may fail before setting kit->nr_bits, and this condition will stop the iteration early instead of accessing the zeroed or freed kit->bits. Considering the initial value of kit->bits is -1 and the type of kit->nr_bits is unsigned int, change the type of kit->nr_bits to int. The potential overflow problem will be handled in the following patch. Fixes: 4665415975b0 ("bpf: Add bits iterator") Acked-by: Yafang Shao Signed-off-by: Hou Tao Link: https://lore.kernel.org/r/20241030100516.3633640-2-houtao@huaweicloud.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index ca3f0a2e5ed5..d913a8f1fbd9 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -2856,7 +2856,7 @@ struct bpf_iter_bits_kern { unsigned long *bits; unsigned long bits_copy; }; - u32 nr_bits; + int nr_bits; int bit; } __aligned(8); @@ -2930,17 +2930,16 @@ bpf_iter_bits_new(struct bpf_iter_bits *it, const u64 *unsafe_ptr__ign, u32 nr_w __bpf_kfunc int *bpf_iter_bits_next(struct bpf_iter_bits *it) { struct bpf_iter_bits_kern *kit = (void *)it; - u32 nr_bits = kit->nr_bits; + int bit = kit->bit, nr_bits = kit->nr_bits; const unsigned long *bits; - int bit; - if (nr_bits == 0) + if (!nr_bits || bit >= nr_bits) return NULL; bits = nr_bits == 64 ? &kit->bits_copy : kit->bits; - bit = find_next_bit(bits, nr_bits, kit->bit + 1); + bit = find_next_bit(bits, nr_bits, bit + 1); if (bit >= nr_bits) { - kit->nr_bits = 0; + kit->bit = bit; return NULL; } -- cgit v1.2.3 From 62a898b07b83f6f407003d8a70f0827a5af08a59 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Wed, 30 Oct 2024 18:05:13 +0800 Subject: bpf: Add bpf_mem_alloc_check_size() helper Introduce bpf_mem_alloc_check_size() to check whether the allocation size exceeds the limitation for the kmalloc-equivalent allocator. The upper limit for percpu allocation is LLIST_NODE_SZ bytes larger than non-percpu allocation, so a percpu argument is added to the helper. The helper will be used in the following patch to check whether the size parameter passed to bpf_mem_alloc() is too big. Signed-off-by: Hou Tao Link: https://lore.kernel.org/r/20241030100516.3633640-3-houtao@huaweicloud.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/memalloc.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c index b3858a76e0b3..146f5b57cfb1 100644 --- a/kernel/bpf/memalloc.c +++ b/kernel/bpf/memalloc.c @@ -35,6 +35,8 @@ */ #define LLIST_NODE_SZ sizeof(struct llist_node) +#define BPF_MEM_ALLOC_SIZE_MAX 4096 + /* similar to kmalloc, but sizeof == 8 bucket is gone */ static u8 size_index[24] __ro_after_init = { 3, /* 8 */ @@ -65,7 +67,7 @@ static u8 size_index[24] __ro_after_init = { static int bpf_mem_cache_idx(size_t size) { - if (!size || size > 4096) + if (!size || size > BPF_MEM_ALLOC_SIZE_MAX) return -1; if (size <= 192) @@ -1005,3 +1007,13 @@ void notrace *bpf_mem_cache_alloc_flags(struct bpf_mem_alloc *ma, gfp_t flags) return !ret ? NULL : ret + LLIST_NODE_SZ; } + +int bpf_mem_alloc_check_size(bool percpu, size_t size) +{ + /* The size of percpu allocation doesn't have LLIST_NODE_SZ overhead */ + if ((percpu && size > BPF_MEM_ALLOC_SIZE_MAX) || + (!percpu && size > BPF_MEM_ALLOC_SIZE_MAX - LLIST_NODE_SZ)) + return -E2BIG; + + return 0; +} -- cgit v1.2.3 From 393397fbdcad7396639d7077c33f86169184ba99 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Wed, 30 Oct 2024 18:05:14 +0800 Subject: bpf: Check the validity of nr_words in bpf_iter_bits_new() Check the validity of nr_words in bpf_iter_bits_new(). Without this check, when multiplication overflow occurs for nr_bits (e.g., when nr_words = 0x0400-0001, nr_bits becomes 64), stack corruption may occur due to bpf_probe_read_kernel_common(..., nr_bytes = 0x2000-0008). Fix it by limiting the maximum value of nr_words to 511. The value is derived from the current implementation of BPF memory allocator. To ensure compatibility if the BPF memory allocator's size limitation changes in the future, use the helper bpf_mem_alloc_check_size() to check whether nr_bytes is too larger. And return -E2BIG instead of -ENOMEM for oversized nr_bytes. Fixes: 4665415975b0 ("bpf: Add bits iterator") Signed-off-by: Hou Tao Link: https://lore.kernel.org/r/20241030100516.3633640-4-houtao@huaweicloud.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index d913a8f1fbd9..018985ebc5ce 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -2851,6 +2851,8 @@ struct bpf_iter_bits { __u64 __opaque[2]; } __aligned(8); +#define BITS_ITER_NR_WORDS_MAX 511 + struct bpf_iter_bits_kern { union { unsigned long *bits; @@ -2865,7 +2867,8 @@ struct bpf_iter_bits_kern { * @it: The new bpf_iter_bits to be created * @unsafe_ptr__ign: A pointer pointing to a memory area to be iterated over * @nr_words: The size of the specified memory area, measured in 8-byte units. - * Due to the limitation of memalloc, it can't be greater than 512. + * The maximum value of @nr_words is @BITS_ITER_NR_WORDS_MAX. This limit may be + * further reduced by the BPF memory allocator implementation. * * This function initializes a new bpf_iter_bits structure for iterating over * a memory area which is specified by the @unsafe_ptr__ign and @nr_words. It @@ -2892,6 +2895,8 @@ bpf_iter_bits_new(struct bpf_iter_bits *it, const u64 *unsafe_ptr__ign, u32 nr_w if (!unsafe_ptr__ign || !nr_words) return -EINVAL; + if (nr_words > BITS_ITER_NR_WORDS_MAX) + return -E2BIG; /* Optimization for u64 mask */ if (nr_bits == 64) { @@ -2903,6 +2908,9 @@ bpf_iter_bits_new(struct bpf_iter_bits *it, const u64 *unsafe_ptr__ign, u32 nr_w return 0; } + if (bpf_mem_alloc_check_size(false, nr_bytes)) + return -E2BIG; + /* Fallback to memalloc */ kit->bits = bpf_mem_alloc(&bpf_global_ma, nr_bytes); if (!kit->bits) -- cgit v1.2.3 From e1339383675063ae4760d81ffe13a79981841b8d Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Wed, 30 Oct 2024 18:05:15 +0800 Subject: bpf: Use __u64 to save the bits in bits iterator On 32-bit hosts (e.g., arm32), when a bpf program passes a u64 to bpf_iter_bits_new(), bpf_iter_bits_new() will use bits_copy to store the content of the u64. However, bits_copy is only 4 bytes, leading to stack corruption. The straightforward solution would be to replace u64 with unsigned long in bpf_iter_bits_new(). However, this introduces confusion and problems for 32-bit hosts because the size of ulong in bpf program is 8 bytes, but it is treated as 4-bytes after passed to bpf_iter_bits_new(). Fix it by changing the type of both bits and bit_count from unsigned long to u64. However, the change is not enough. The main reason is that bpf_iter_bits_next() uses find_next_bit() to find the next bit and the pointer passed to find_next_bit() is an unsigned long pointer instead of a u64 pointer. For 32-bit little-endian host, it is fine but it is not the case for 32-bit big-endian host. Because under 32-bit big-endian host, the first iterated unsigned long will be the bits 32-63 of the u64 instead of the expected bits 0-31. Therefore, in addition to changing the type, swap the two unsigned longs within the u64 for 32-bit big-endian host. Signed-off-by: Hou Tao Link: https://lore.kernel.org/r/20241030100516.3633640-5-houtao@huaweicloud.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 33 ++++++++++++++++++++++++++++++--- 1 file changed, 30 insertions(+), 3 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 018985ebc5ce..3d45ebe8afb4 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -2855,13 +2855,36 @@ struct bpf_iter_bits { struct bpf_iter_bits_kern { union { - unsigned long *bits; - unsigned long bits_copy; + __u64 *bits; + __u64 bits_copy; }; int nr_bits; int bit; } __aligned(8); +/* On 64-bit hosts, unsigned long and u64 have the same size, so passing + * a u64 pointer and an unsigned long pointer to find_next_bit() will + * return the same result, as both point to the same 8-byte area. + * + * For 32-bit little-endian hosts, using a u64 pointer or unsigned long + * pointer also makes no difference. This is because the first iterated + * unsigned long is composed of bits 0-31 of the u64 and the second unsigned + * long is composed of bits 32-63 of the u64. + * + * However, for 32-bit big-endian hosts, this is not the case. The first + * iterated unsigned long will be bits 32-63 of the u64, so swap these two + * ulong values within the u64. + */ +static void swap_ulong_in_u64(u64 *bits, unsigned int nr) +{ +#if (BITS_PER_LONG == 32) && defined(__BIG_ENDIAN) + unsigned int i; + + for (i = 0; i < nr; i++) + bits[i] = (bits[i] >> 32) | ((u64)(u32)bits[i] << 32); +#endif +} + /** * bpf_iter_bits_new() - Initialize a new bits iterator for a given memory area * @it: The new bpf_iter_bits to be created @@ -2904,6 +2927,8 @@ bpf_iter_bits_new(struct bpf_iter_bits *it, const u64 *unsafe_ptr__ign, u32 nr_w if (err) return -EFAULT; + swap_ulong_in_u64(&kit->bits_copy, nr_words); + kit->nr_bits = nr_bits; return 0; } @@ -2922,6 +2947,8 @@ bpf_iter_bits_new(struct bpf_iter_bits *it, const u64 *unsafe_ptr__ign, u32 nr_w return err; } + swap_ulong_in_u64(kit->bits, nr_words); + kit->nr_bits = nr_bits; return 0; } @@ -2939,7 +2966,7 @@ __bpf_kfunc int *bpf_iter_bits_next(struct bpf_iter_bits *it) { struct bpf_iter_bits_kern *kit = (void *)it; int bit = kit->bit, nr_bits = kit->nr_bits; - const unsigned long *bits; + const void *bits; if (!nr_bits || bit >= nr_bits) return NULL; -- cgit v1.2.3