From 0f6ede9fbc747e2553612271bce108f7517e7a45 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 4 Dec 2024 12:54:55 +0000 Subject: net: defer final 'struct net' free in netns dismantle Ilya reported a slab-use-after-free in dst_destroy [1] Issue is in xfrm6_net_init() and xfrm4_net_init() : They copy xfrm[46]_dst_ops_template into net->xfrm.xfrm[46]_dst_ops. But net structure might be freed before all the dst callbacks are called. So when dst_destroy() calls later : if (dst->ops->destroy) dst->ops->destroy(dst); dst->ops points to the old net->xfrm.xfrm[46]_dst_ops, which has been freed. See a relevant issue fixed in : ac888d58869b ("net: do not delay dst_entries_add() in dst_release()") A fix is to queue the 'struct net' to be freed after one another cleanup_net() round (and existing rcu_barrier()) [1] BUG: KASAN: slab-use-after-free in dst_destroy (net/core/dst.c:112) Read of size 8 at addr ffff8882137ccab0 by task swapper/37/0 Dec 03 05:46:18 kernel: CPU: 37 UID: 0 PID: 0 Comm: swapper/37 Kdump: loaded Not tainted 6.12.0 #67 Hardware name: Red Hat KVM/RHEL, BIOS 1.16.1-1.el9 04/01/2014 Call Trace: dump_stack_lvl (lib/dump_stack.c:124) print_address_description.constprop.0 (mm/kasan/report.c:378) ? dst_destroy (net/core/dst.c:112) print_report (mm/kasan/report.c:489) ? dst_destroy (net/core/dst.c:112) ? kasan_addr_to_slab (mm/kasan/common.c:37) kasan_report (mm/kasan/report.c:603) ? dst_destroy (net/core/dst.c:112) ? rcu_do_batch (kernel/rcu/tree.c:2567) dst_destroy (net/core/dst.c:112) rcu_do_batch (kernel/rcu/tree.c:2567) ? __pfx_rcu_do_batch (kernel/rcu/tree.c:2491) ? lockdep_hardirqs_on_prepare (kernel/locking/lockdep.c:4339 kernel/locking/lockdep.c:4406) rcu_core (kernel/rcu/tree.c:2825) handle_softirqs (kernel/softirq.c:554) __irq_exit_rcu (kernel/softirq.c:589 kernel/softirq.c:428 kernel/softirq.c:637) irq_exit_rcu (kernel/softirq.c:651) sysvec_apic_timer_interrupt (arch/x86/kernel/apic/apic.c:1049 arch/x86/kernel/apic/apic.c:1049) asm_sysvec_apic_timer_interrupt (./arch/x86/include/asm/idtentry.h:702) RIP: 0010:default_idle (./arch/x86/include/asm/irqflags.h:37 ./arch/x86/include/asm/irqflags.h:92 arch/x86/kernel/process.c:743) Code: 00 4d 29 c8 4c 01 c7 4c 29 c2 e9 6e ff ff ff 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 66 90 0f 00 2d c7 c9 27 00 fb f4 c3 cc cc cc cc 66 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 90 RSP: 0018:ffff888100d2fe00 EFLAGS: 00000246 RAX: 00000000001870ed RBX: 1ffff110201a5fc2 RCX: ffffffffb61a3e46 RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffffffffb3d4d123 RBP: 0000000000000000 R08: 0000000000000001 R09: ffffed11c7e1835d R10: ffff888e3f0c1aeb R11: 0000000000000000 R12: 0000000000000000 R13: ffff888100d20000 R14: dffffc0000000000 R15: 0000000000000000 ? ct_kernel_exit.constprop.0 (kernel/context_tracking.c:148) ? cpuidle_idle_call (kernel/sched/idle.c:186) default_idle_call (./include/linux/cpuidle.h:143 kernel/sched/idle.c:118) cpuidle_idle_call (kernel/sched/idle.c:186) ? __pfx_cpuidle_idle_call (kernel/sched/idle.c:168) ? lock_release (kernel/locking/lockdep.c:467 kernel/locking/lockdep.c:5848) ? lockdep_hardirqs_on_prepare (kernel/locking/lockdep.c:4347 kernel/locking/lockdep.c:4406) ? tsc_verify_tsc_adjust (arch/x86/kernel/tsc_sync.c:59) do_idle (kernel/sched/idle.c:326) cpu_startup_entry (kernel/sched/idle.c:423 (discriminator 1)) start_secondary (arch/x86/kernel/smpboot.c:202 arch/x86/kernel/smpboot.c:282) ? __pfx_start_secondary (arch/x86/kernel/smpboot.c:232) ? soft_restart_cpu (arch/x86/kernel/head_64.S:452) common_startup_64 (arch/x86/kernel/head_64.S:414) Dec 03 05:46:18 kernel: Allocated by task 12184: kasan_save_stack (mm/kasan/common.c:48) kasan_save_track (./arch/x86/include/asm/current.h:49 mm/kasan/common.c:60 mm/kasan/common.c:69) __kasan_slab_alloc (mm/kasan/common.c:319 mm/kasan/common.c:345) kmem_cache_alloc_noprof (mm/slub.c:4085 mm/slub.c:4134 mm/slub.c:4141) copy_net_ns (net/core/net_namespace.c:421 net/core/net_namespace.c:480) create_new_namespaces (kernel/nsproxy.c:110) unshare_nsproxy_namespaces (kernel/nsproxy.c:228 (discriminator 4)) ksys_unshare (kernel/fork.c:3313) __x64_sys_unshare (kernel/fork.c:3382) do_syscall_64 (arch/x86/entry/common.c:52 arch/x86/entry/common.c:83) entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:130) Dec 03 05:46:18 kernel: Freed by task 11: kasan_save_stack (mm/kasan/common.c:48) kasan_save_track (./arch/x86/include/asm/current.h:49 mm/kasan/common.c:60 mm/kasan/common.c:69) kasan_save_free_info (mm/kasan/generic.c:582) __kasan_slab_free (mm/kasan/common.c:271) kmem_cache_free (mm/slub.c:4579 mm/slub.c:4681) cleanup_net (net/core/net_namespace.c:456 net/core/net_namespace.c:446 net/core/net_namespace.c:647) process_one_work (kernel/workqueue.c:3229) worker_thread (kernel/workqueue.c:3304 kernel/workqueue.c:3391) kthread (kernel/kthread.c:389) ret_from_fork (arch/x86/kernel/process.c:147) ret_from_fork_asm (arch/x86/entry/entry_64.S:257) Dec 03 05:46:18 kernel: Last potentially related work creation: kasan_save_stack (mm/kasan/common.c:48) __kasan_record_aux_stack (mm/kasan/generic.c:541) insert_work (./include/linux/instrumented.h:68 ./include/asm-generic/bitops/instrumented-non-atomic.h:141 kernel/workqueue.c:788 kernel/workqueue.c:795 kernel/workqueue.c:2186) __queue_work (kernel/workqueue.c:2340) queue_work_on (kernel/workqueue.c:2391) xfrm_policy_insert (net/xfrm/xfrm_policy.c:1610) xfrm_add_policy (net/xfrm/xfrm_user.c:2116) xfrm_user_rcv_msg (net/xfrm/xfrm_user.c:3321) netlink_rcv_skb (net/netlink/af_netlink.c:2536) xfrm_netlink_rcv (net/xfrm/xfrm_user.c:3344) netlink_unicast (net/netlink/af_netlink.c:1316 net/netlink/af_netlink.c:1342) netlink_sendmsg (net/netlink/af_netlink.c:1886) sock_write_iter (net/socket.c:729 net/socket.c:744 net/socket.c:1165) vfs_write (fs/read_write.c:590 fs/read_write.c:683) ksys_write (fs/read_write.c:736) do_syscall_64 (arch/x86/entry/common.c:52 arch/x86/entry/common.c:83) entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:130) Dec 03 05:46:18 kernel: Second to last potentially related work creation: kasan_save_stack (mm/kasan/common.c:48) __kasan_record_aux_stack (mm/kasan/generic.c:541) insert_work (./include/linux/instrumented.h:68 ./include/asm-generic/bitops/instrumented-non-atomic.h:141 kernel/workqueue.c:788 kernel/workqueue.c:795 kernel/workqueue.c:2186) __queue_work (kernel/workqueue.c:2340) queue_work_on (kernel/workqueue.c:2391) __xfrm_state_insert (./include/linux/workqueue.h:723 net/xfrm/xfrm_state.c:1150 net/xfrm/xfrm_state.c:1145 net/xfrm/xfrm_state.c:1513) xfrm_state_update (./include/linux/spinlock.h:396 net/xfrm/xfrm_state.c:1940) xfrm_add_sa (net/xfrm/xfrm_user.c:912) xfrm_user_rcv_msg (net/xfrm/xfrm_user.c:3321) netlink_rcv_skb (net/netlink/af_netlink.c:2536) xfrm_netlink_rcv (net/xfrm/xfrm_user.c:3344) netlink_unicast (net/netlink/af_netlink.c:1316 net/netlink/af_netlink.c:1342) netlink_sendmsg (net/netlink/af_netlink.c:1886) sock_write_iter (net/socket.c:729 net/socket.c:744 net/socket.c:1165) vfs_write (fs/read_write.c:590 fs/read_write.c:683) ksys_write (fs/read_write.c:736) do_syscall_64 (arch/x86/entry/common.c:52 arch/x86/entry/common.c:83) entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:130) Fixes: a8a572a6b5f2 ("xfrm: dst_entries_init() per-net dst_ops") Reported-by: Ilya Maximets Closes: https://lore.kernel.org/netdev/CANn89iKKYDVpB=MtmfH7nyv2p=rJWSLedO5k7wSZgtY_tO8WQg@mail.gmail.com/T/#m02c98c3009fe66382b73cfb4db9cf1df6fab3fbf Signed-off-by: Eric Dumazet Acked-by: Paolo Abeni Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20241204125455.3871859-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/core/net_namespace.c | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index ae34ac818cda..b5cd3ae4f04c 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -449,6 +449,21 @@ out_free: goto out; } +static LLIST_HEAD(defer_free_list); + +static void net_complete_free(void) +{ + struct llist_node *kill_list; + struct net *net, *next; + + /* Get the list of namespaces to free from last round. */ + kill_list = llist_del_all(&defer_free_list); + + llist_for_each_entry_safe(net, next, kill_list, defer_free_list) + kmem_cache_free(net_cachep, net); + +} + static void net_free(struct net *net) { if (refcount_dec_and_test(&net->passive)) { @@ -457,7 +472,8 @@ static void net_free(struct net *net) /* There should not be any trackers left there. */ ref_tracker_dir_exit(&net->notrefcnt_tracker); - kmem_cache_free(net_cachep, net); + /* Wait for an extra rcu_barrier() before final free. */ + llist_add(&net->defer_free_list, &defer_free_list); } } @@ -642,6 +658,8 @@ static void cleanup_net(struct work_struct *work) */ rcu_barrier(); + net_complete_free(); + /* Finally it is safe to free my network namespace structure */ list_for_each_entry_safe(net, tmp, &net_exit_list, exit_list) { list_del_init(&net->exit_list); -- cgit v1.2.3 From 09310cfd4ea5c3ab2c7a610420205e0a1660bf7e Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 6 Dec 2024 15:32:52 +0300 Subject: rtnetlink: fix error code in rtnl_newlink() If rtnl_get_peer_net() fails, then propagate the error code. Don't return success. Fixes: 48327566769a ("rtnetlink: fix double call of rtnl_link_get_net_ifla()") Signed-off-by: Dan Carpenter Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/a2d20cd4-387a-4475-887c-bb7d0e88e25a@stanley.mountain Signed-off-by: Jakub Kicinski --- net/core/rtnetlink.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index ab5f201bf0ab..ebcfc2debf1a 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3972,8 +3972,10 @@ static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, if (ops->peer_type) { peer_net = rtnl_get_peer_net(ops, data, extack); - if (IS_ERR(peer_net)) + if (IS_ERR(peer_net)) { + ret = PTR_ERR(peer_net); goto put_ops; + } if (peer_net) rtnl_nets_add(&rtnl_nets, peer_net); } -- cgit v1.2.3 From 75e072a390da9a22e7ae4a4e8434dfca5da499fb Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Mon, 2 Dec 2024 12:29:23 +0100 Subject: bpf, sockmap: Fix update element with same Consider a sockmap entry being updated with the same socket: osk = stab->sks[idx]; sock_map_add_link(psock, link, map, &stab->sks[idx]); stab->sks[idx] = sk; if (osk) sock_map_unref(osk, &stab->sks[idx]); Due to sock_map_unref(), which invokes sock_map_del_link(), all the psock's links for stab->sks[idx] are torn: list_for_each_entry_safe(link, tmp, &psock->link, list) { if (link->link_raw == link_raw) { ... list_del(&link->list); sk_psock_free_link(link); } } And that includes the new link sock_map_add_link() added just before the unref. This results in a sockmap holding a socket, but without the respective link. This in turn means that close(sock) won't trigger the cleanup, i.e. a closed socket will not be automatically removed from the sockmap. Stop tearing the links when a matching link_raw is found. Fixes: 604326b41a6f ("bpf, sockmap: convert to generic sk_msg interface") Signed-off-by: Michal Luczaj Signed-off-by: Daniel Borkmann Reviewed-by: John Fastabend Link: https://lore.kernel.org/bpf/20241202-sockmap-replace-v1-1-1e88579e7bd5@rbox.co --- net/core/sock_map.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/core') diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 78347d7d25ef..20b348b1964a 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -159,6 +159,7 @@ static void sock_map_del_link(struct sock *sk, verdict_stop = true; list_del(&link->list); sk_psock_free_link(link); + break; } } spin_unlock_bh(&psock->link_lock); -- cgit v1.2.3 From ed1fc5d76b81a4d681211333c026202cad4d5649 Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Mon, 2 Dec 2024 12:29:25 +0100 Subject: bpf, sockmap: Fix race between element replace and close() Element replace (with a socket different from the one stored) may race with socket's close() link popping & unlinking. __sock_map_delete() unconditionally unrefs the (wrong) element: // set map[0] = s0 map_update_elem(map, 0, s0) // drop fd of s0 close(s0) sock_map_close() lock_sock(sk) (s0!) sock_map_remove_links(sk) link = sk_psock_link_pop() sock_map_unlink(sk, link) sock_map_delete_from_link // replace map[0] with s1 map_update_elem(map, 0, s1) sock_map_update_elem (s1!) lock_sock(sk) sock_map_update_common psock = sk_psock(sk) spin_lock(&stab->lock) osk = stab->sks[idx] sock_map_add_link(..., &stab->sks[idx]) sock_map_unref(osk, &stab->sks[idx]) psock = sk_psock(osk) sk_psock_put(sk, psock) if (refcount_dec_and_test(&psock)) sk_psock_drop(sk, psock) spin_unlock(&stab->lock) unlock_sock(sk) __sock_map_delete spin_lock(&stab->lock) sk = *psk // s1 replaced s0; sk == s1 if (!sk_test || sk_test == sk) // sk_test (s0) != sk (s1); no branch sk = xchg(psk, NULL) if (sk) sock_map_unref(sk, psk) // unref s1; sks[idx] will dangle psock = sk_psock(sk) sk_psock_put(sk, psock) if (refcount_dec_and_test()) sk_psock_drop(sk, psock) spin_unlock(&stab->lock) release_sock(sk) Then close(map) enqueues bpf_map_free_deferred, which finally calls sock_map_free(). This results in some refcount_t warnings along with a KASAN splat [1]. Fix __sock_map_delete(), do not allow sock_map_unref() on elements that may have been replaced. [1]: BUG: KASAN: slab-use-after-free in sock_map_free+0x10e/0x330 Write of size 4 at addr ffff88811f5b9100 by task kworker/u64:12/1063 CPU: 14 UID: 0 PID: 1063 Comm: kworker/u64:12 Not tainted 6.12.0+ #125 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Arch Linux 1.16.3-1-1 04/01/2014 Workqueue: events_unbound bpf_map_free_deferred Call Trace: dump_stack_lvl+0x68/0x90 print_report+0x174/0x4f6 kasan_report+0xb9/0x190 kasan_check_range+0x10f/0x1e0 sock_map_free+0x10e/0x330 bpf_map_free_deferred+0x173/0x320 process_one_work+0x846/0x1420 worker_thread+0x5b3/0xf80 kthread+0x29e/0x360 ret_from_fork+0x2d/0x70 ret_from_fork_asm+0x1a/0x30 Allocated by task 1202: kasan_save_stack+0x1e/0x40 kasan_save_track+0x10/0x30 __kasan_slab_alloc+0x85/0x90 kmem_cache_alloc_noprof+0x131/0x450 sk_prot_alloc+0x5b/0x220 sk_alloc+0x2c/0x870 unix_create1+0x88/0x8a0 unix_create+0xc5/0x180 __sock_create+0x241/0x650 __sys_socketpair+0x1ce/0x420 __x64_sys_socketpair+0x92/0x100 do_syscall_64+0x93/0x180 entry_SYSCALL_64_after_hwframe+0x76/0x7e Freed by task 46: kasan_save_stack+0x1e/0x40 kasan_save_track+0x10/0x30 kasan_save_free_info+0x37/0x60 __kasan_slab_free+0x4b/0x70 kmem_cache_free+0x1a1/0x590 __sk_destruct+0x388/0x5a0 sk_psock_destroy+0x73e/0xa50 process_one_work+0x846/0x1420 worker_thread+0x5b3/0xf80 kthread+0x29e/0x360 ret_from_fork+0x2d/0x70 ret_from_fork_asm+0x1a/0x30 The buggy address belongs to the object at ffff88811f5b9080 which belongs to the cache UNIX-STREAM of size 1984 The buggy address is located 128 bytes inside of freed 1984-byte region [ffff88811f5b9080, ffff88811f5b9840) The buggy address belongs to the physical page: page: refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x11f5b8 head: order:3 mapcount:0 entire_mapcount:0 nr_pages_mapped:0 pincount:0 memcg:ffff888127d49401 flags: 0x17ffffc0000040(head|node=0|zone=2|lastcpupid=0x1fffff) page_type: f5(slab) raw: 0017ffffc0000040 ffff8881042e4500 dead000000000122 0000000000000000 raw: 0000000000000000 00000000800f000f 00000001f5000000 ffff888127d49401 head: 0017ffffc0000040 ffff8881042e4500 dead000000000122 0000000000000000 head: 0000000000000000 00000000800f000f 00000001f5000000 ffff888127d49401 head: 0017ffffc0000003 ffffea00047d6e01 ffffffffffffffff 0000000000000000 head: 0000000000000008 0000000000000000 00000000ffffffff 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff88811f5b9000: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ffff88811f5b9080: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ^ ffff88811f5b9180: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff88811f5b9200: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb Disabling lock debugging due to kernel taint refcount_t: addition on 0; use-after-free. WARNING: CPU: 14 PID: 1063 at lib/refcount.c:25 refcount_warn_saturate+0xce/0x150 CPU: 14 UID: 0 PID: 1063 Comm: kworker/u64:12 Tainted: G B 6.12.0+ #125 Tainted: [B]=BAD_PAGE Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Arch Linux 1.16.3-1-1 04/01/2014 Workqueue: events_unbound bpf_map_free_deferred RIP: 0010:refcount_warn_saturate+0xce/0x150 Code: 34 73 eb 03 01 e8 82 53 ad fe 0f 0b eb b1 80 3d 27 73 eb 03 00 75 a8 48 c7 c7 80 bd 95 84 c6 05 17 73 eb 03 01 e8 62 53 ad fe <0f> 0b eb 91 80 3d 06 73 eb 03 00 75 88 48 c7 c7 e0 bd 95 84 c6 05 RSP: 0018:ffff88815c49fc70 EFLAGS: 00010282 RAX: 0000000000000000 RBX: ffff88811f5b9100 RCX: 0000000000000000 RDX: 0000000000000000 RSI: 0000000000000004 RDI: 0000000000000001 RBP: 0000000000000002 R08: 0000000000000001 R09: ffffed10bcde6349 R10: ffff8885e6f31a4b R11: 0000000000000000 R12: ffff88813be0b000 R13: ffff88811f5b9100 R14: ffff88811f5b9080 R15: ffff88813be0b024 FS: 0000000000000000(0000) GS:ffff8885e6f00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000055dda99b0250 CR3: 000000015dbac000 CR4: 0000000000752ef0 PKRU: 55555554 Call Trace: ? __warn.cold+0x5f/0x1ff ? refcount_warn_saturate+0xce/0x150 ? report_bug+0x1ec/0x390 ? handle_bug+0x58/0x90 ? exc_invalid_op+0x13/0x40 ? asm_exc_invalid_op+0x16/0x20 ? refcount_warn_saturate+0xce/0x150 sock_map_free+0x2e5/0x330 bpf_map_free_deferred+0x173/0x320 process_one_work+0x846/0x1420 worker_thread+0x5b3/0xf80 kthread+0x29e/0x360 ret_from_fork+0x2d/0x70 ret_from_fork_asm+0x1a/0x30 irq event stamp: 10741 hardirqs last enabled at (10741): [] asm_sysvec_apic_timer_interrupt+0x16/0x20 hardirqs last disabled at (10740): [] handle_softirqs+0x60d/0x770 softirqs last enabled at (10506): [] __irq_exit_rcu+0x109/0x210 softirqs last disabled at (10301): [] __irq_exit_rcu+0x109/0x210 refcount_t: underflow; use-after-free. WARNING: CPU: 14 PID: 1063 at lib/refcount.c:28 refcount_warn_saturate+0xee/0x150 CPU: 14 UID: 0 PID: 1063 Comm: kworker/u64:12 Tainted: G B W 6.12.0+ #125 Tainted: [B]=BAD_PAGE, [W]=WARN Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Arch Linux 1.16.3-1-1 04/01/2014 Workqueue: events_unbound bpf_map_free_deferred RIP: 0010:refcount_warn_saturate+0xee/0x150 Code: 17 73 eb 03 01 e8 62 53 ad fe 0f 0b eb 91 80 3d 06 73 eb 03 00 75 88 48 c7 c7 e0 bd 95 84 c6 05 f6 72 eb 03 01 e8 42 53 ad fe <0f> 0b e9 6e ff ff ff 80 3d e6 72 eb 03 00 0f 85 61 ff ff ff 48 c7 RSP: 0018:ffff88815c49fc70 EFLAGS: 00010282 RAX: 0000000000000000 RBX: ffff88811f5b9100 RCX: 0000000000000000 RDX: 0000000000000000 RSI: 0000000000000004 RDI: 0000000000000001 RBP: 0000000000000003 R08: 0000000000000001 R09: ffffed10bcde6349 R10: ffff8885e6f31a4b R11: 0000000000000000 R12: ffff88813be0b000 R13: ffff88811f5b9100 R14: ffff88811f5b9080 R15: ffff88813be0b024 FS: 0000000000000000(0000) GS:ffff8885e6f00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000055dda99b0250 CR3: 000000015dbac000 CR4: 0000000000752ef0 PKRU: 55555554 Call Trace: ? __warn.cold+0x5f/0x1ff ? refcount_warn_saturate+0xee/0x150 ? report_bug+0x1ec/0x390 ? handle_bug+0x58/0x90 ? exc_invalid_op+0x13/0x40 ? asm_exc_invalid_op+0x16/0x20 ? refcount_warn_saturate+0xee/0x150 sock_map_free+0x2d3/0x330 bpf_map_free_deferred+0x173/0x320 process_one_work+0x846/0x1420 worker_thread+0x5b3/0xf80 kthread+0x29e/0x360 ret_from_fork+0x2d/0x70 ret_from_fork_asm+0x1a/0x30 irq event stamp: 10741 hardirqs last enabled at (10741): [] asm_sysvec_apic_timer_interrupt+0x16/0x20 hardirqs last disabled at (10740): [] handle_softirqs+0x60d/0x770 softirqs last enabled at (10506): [] __irq_exit_rcu+0x109/0x210 softirqs last disabled at (10301): [] __irq_exit_rcu+0x109/0x210 Fixes: 604326b41a6f ("bpf, sockmap: convert to generic sk_msg interface") Signed-off-by: Michal Luczaj Signed-off-by: Daniel Borkmann Reviewed-by: John Fastabend Link: https://lore.kernel.org/bpf/20241202-sockmap-replace-v1-3-1e88579e7bd5@rbox.co --- net/core/sock_map.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'net/core') diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 20b348b1964a..f1b9b3958792 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -412,12 +412,11 @@ static void *sock_map_lookup_sys(struct bpf_map *map, void *key) static int __sock_map_delete(struct bpf_stab *stab, struct sock *sk_test, struct sock **psk) { - struct sock *sk; + struct sock *sk = NULL; int err = 0; spin_lock_bh(&stab->lock); - sk = *psk; - if (!sk_test || sk_test == sk) + if (!sk_test || sk_test == *psk) sk = xchg(psk, NULL); if (likely(sk)) -- cgit v1.2.3 From b238e187b4a2d3b54d80aec05a9cab6466b79dde Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Mon, 9 Dec 2024 20:10:54 -0800 Subject: bpf: refactor bpf_helper_changes_pkt_data to use helper number Use BPF helper number instead of function pointer in bpf_helper_changes_pkt_data(). This would simplify usage of this function in verifier.c:check_cfg() (in a follow-up patch), where only helper number is easily available and there is no real need to lookup helper proto. Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20241210041100.1898468-3-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 63 +++++++++++++++++++++++++------------------------------ 1 file changed, 28 insertions(+), 35 deletions(-) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index 6625b3f563a4..efb75eed2e35 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -7899,42 +7899,35 @@ static const struct bpf_func_proto bpf_tcp_raw_check_syncookie_ipv6_proto = { #endif /* CONFIG_INET */ -bool bpf_helper_changes_pkt_data(void *func) -{ - if (func == bpf_skb_vlan_push || - func == bpf_skb_vlan_pop || - func == bpf_skb_store_bytes || - func == bpf_skb_change_proto || - func == bpf_skb_change_head || - func == sk_skb_change_head || - func == bpf_skb_change_tail || - func == sk_skb_change_tail || - func == bpf_skb_adjust_room || - func == sk_skb_adjust_room || - func == bpf_skb_pull_data || - func == sk_skb_pull_data || - func == bpf_clone_redirect || - func == bpf_l3_csum_replace || - func == bpf_l4_csum_replace || - func == bpf_xdp_adjust_head || - func == bpf_xdp_adjust_meta || - func == bpf_msg_pull_data || - func == bpf_msg_push_data || - func == bpf_msg_pop_data || - func == bpf_xdp_adjust_tail || -#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF) - func == bpf_lwt_seg6_store_bytes || - func == bpf_lwt_seg6_adjust_srh || - func == bpf_lwt_seg6_action || -#endif -#ifdef CONFIG_INET - func == bpf_sock_ops_store_hdr_opt || -#endif - func == bpf_lwt_in_push_encap || - func == bpf_lwt_xmit_push_encap) +bool bpf_helper_changes_pkt_data(enum bpf_func_id func_id) +{ + switch (func_id) { + case BPF_FUNC_clone_redirect: + case BPF_FUNC_l3_csum_replace: + case BPF_FUNC_l4_csum_replace: + case BPF_FUNC_lwt_push_encap: + case BPF_FUNC_lwt_seg6_action: + case BPF_FUNC_lwt_seg6_adjust_srh: + case BPF_FUNC_lwt_seg6_store_bytes: + case BPF_FUNC_msg_pop_data: + case BPF_FUNC_msg_pull_data: + case BPF_FUNC_msg_push_data: + case BPF_FUNC_skb_adjust_room: + case BPF_FUNC_skb_change_head: + case BPF_FUNC_skb_change_proto: + case BPF_FUNC_skb_change_tail: + case BPF_FUNC_skb_pull_data: + case BPF_FUNC_skb_store_bytes: + case BPF_FUNC_skb_vlan_pop: + case BPF_FUNC_skb_vlan_push: + case BPF_FUNC_store_hdr_opt: + case BPF_FUNC_xdp_adjust_head: + case BPF_FUNC_xdp_adjust_meta: + case BPF_FUNC_xdp_adjust_tail: return true; - - return false; + default: + return false; + } } const struct bpf_func_proto bpf_event_output_data_proto __weak; -- cgit v1.2.3 From 1a4607ffba35bf2a630aab299e34dd3f6e658d70 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Mon, 9 Dec 2024 20:10:59 -0800 Subject: bpf: consider that tail calls invalidate packet pointers Tail-called programs could execute any of the helpers that invalidate packet pointers. Hence, conservatively assume that each tail call invalidates packet pointers. Making the change in bpf_helper_changes_pkt_data() automatically makes use of check_cfg() logic that computes 'changes_pkt_data' effect for global sub-programs, such that the following program could be rejected: int tail_call(struct __sk_buff *sk) { bpf_tail_call_static(sk, &jmp_table, 0); return 0; } SEC("tc") int not_safe(struct __sk_buff *sk) { int *p = (void *)(long)sk->data; ... make p valid ... tail_call(sk); *p = 42; /* this is unsafe */ ... } The tc_bpf2bpf.c:subprog_tc() needs change: mark it as a function that can invalidate packet pointers. Otherwise, it can't be freplaced with tailcall_freplace.c:entry_freplace() that does a tail call. Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20241210041100.1898468-8-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index efb75eed2e35..21131ec25f24 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -7924,6 +7924,8 @@ bool bpf_helper_changes_pkt_data(enum bpf_func_id func_id) case BPF_FUNC_xdp_adjust_head: case BPF_FUNC_xdp_adjust_meta: case BPF_FUNC_xdp_adjust_tail: + /* tail-called program could call any of the above */ + case BPF_FUNC_tail_call: return true; default: return false; -- cgit v1.2.3 From b1f3a2f5a742c1e939a73031bd31b9e557a2d77d Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 13 Dec 2024 07:22:40 -0800 Subject: netdev: fix repeated netlink messages in queue dump The context is supposed to record the next queue to dump, not last dumped. If the dump doesn't fit we will restart from the already-dumped queue, duplicating the message. Before this fix and with the selftest improvements later in this series we see: # ./run_kselftest.sh -t drivers/net:queues.py timeout set to 45 selftests: drivers/net: queues.py KTAP version 1 1..2 # Check| At /root/ksft-net-drv/drivers/net/./queues.py, line 32, in get_queues: # Check| ksft_eq(queues, expected) # Check failed 102 != 100 # Check| At /root/ksft-net-drv/drivers/net/./queues.py, line 32, in get_queues: # Check| ksft_eq(queues, expected) # Check failed 101 != 100 not ok 1 queues.get_queues ok 2 queues.addremove_queues # Totals: pass:1 fail:1 xfail:0 xpass:0 skip:0 error:0 not ok 1 selftests: drivers/net: queues.py # exit=1 With the fix: # ./ksft-net-drv/run_kselftest.sh -t drivers/net:queues.py timeout set to 45 selftests: drivers/net: queues.py KTAP version 1 1..2 ok 1 queues.get_queues ok 2 queues.addremove_queues # Totals: pass:2 fail:0 xfail:0 xpass:0 skip:0 error:0 Fixes: 6b6171db7fc8 ("netdev-genl: Add netlink framework functions for queue") Reviewed-by: Joe Damato Link: https://patch.msgid.link/20241213152244.3080955-2-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/core/netdev-genl.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'net/core') diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index 9527dd46e4dc..9f086b190619 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -488,24 +488,21 @@ netdev_nl_queue_dump_one(struct net_device *netdev, struct sk_buff *rsp, struct netdev_nl_dump_ctx *ctx) { int err = 0; - int i; if (!(netdev->flags & IFF_UP)) return err; - for (i = ctx->rxq_idx; i < netdev->real_num_rx_queues;) { - err = netdev_nl_queue_fill_one(rsp, netdev, i, + for (; ctx->rxq_idx < netdev->real_num_rx_queues; ctx->rxq_idx++) { + err = netdev_nl_queue_fill_one(rsp, netdev, ctx->rxq_idx, NETDEV_QUEUE_TYPE_RX, info); if (err) return err; - ctx->rxq_idx = i++; } - for (i = ctx->txq_idx; i < netdev->real_num_tx_queues;) { - err = netdev_nl_queue_fill_one(rsp, netdev, i, + for (; ctx->txq_idx < netdev->real_num_tx_queues; ctx->txq_idx++) { + err = netdev_nl_queue_fill_one(rsp, netdev, ctx->txq_idx, NETDEV_QUEUE_TYPE_TX, info); if (err) return err; - ctx->txq_idx = i++; } return err; -- cgit v1.2.3 From ecc391a541573da46b7ccc188105efedd40aef1b Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 13 Dec 2024 07:22:41 -0800 Subject: netdev: fix repeated netlink messages in queue stats The context is supposed to record the next queue to dump, not last dumped. If the dump doesn't fit we will restart from the already-dumped queue, duplicating the message. Before this fix and with the selftest improvements later in this series we see: # ./run_kselftest.sh -t drivers/net:stats.py timeout set to 45 selftests: drivers/net: stats.py KTAP version 1 1..5 ok 1 stats.check_pause ok 2 stats.check_fec ok 3 stats.pkt_byte_sum # Check| At /root/ksft-net-drv/drivers/net/./stats.py, line 125, in qstat_by_ifindex: # Check| ksft_eq(len(queues[qtype]), len(set(queues[qtype])), # Check failed 45 != 44 repeated queue keys # Check| At /root/ksft-net-drv/drivers/net/./stats.py, line 127, in qstat_by_ifindex: # Check| ksft_eq(len(queues[qtype]), max(queues[qtype]) + 1, # Check failed 45 != 44 missing queue keys # Check| At /root/ksft-net-drv/drivers/net/./stats.py, line 125, in qstat_by_ifindex: # Check| ksft_eq(len(queues[qtype]), len(set(queues[qtype])), # Check failed 45 != 44 repeated queue keys # Check| At /root/ksft-net-drv/drivers/net/./stats.py, line 127, in qstat_by_ifindex: # Check| ksft_eq(len(queues[qtype]), max(queues[qtype]) + 1, # Check failed 45 != 44 missing queue keys # Check| At /root/ksft-net-drv/drivers/net/./stats.py, line 125, in qstat_by_ifindex: # Check| ksft_eq(len(queues[qtype]), len(set(queues[qtype])), # Check failed 103 != 100 repeated queue keys # Check| At /root/ksft-net-drv/drivers/net/./stats.py, line 127, in qstat_by_ifindex: # Check| ksft_eq(len(queues[qtype]), max(queues[qtype]) + 1, # Check failed 103 != 100 missing queue keys # Check| At /root/ksft-net-drv/drivers/net/./stats.py, line 125, in qstat_by_ifindex: # Check| ksft_eq(len(queues[qtype]), len(set(queues[qtype])), # Check failed 102 != 100 repeated queue keys # Check| At /root/ksft-net-drv/drivers/net/./stats.py, line 127, in qstat_by_ifindex: # Check| ksft_eq(len(queues[qtype]), max(queues[qtype]) + 1, # Check failed 102 != 100 missing queue keys not ok 4 stats.qstat_by_ifindex ok 5 stats.check_down # Totals: pass:4 fail:1 xfail:0 xpass:0 skip:0 error:0 With the fix: # ./ksft-net-drv/run_kselftest.sh -t drivers/net:stats.py timeout set to 45 selftests: drivers/net: stats.py KTAP version 1 1..5 ok 1 stats.check_pause ok 2 stats.check_fec ok 3 stats.pkt_byte_sum ok 4 stats.qstat_by_ifindex ok 5 stats.check_down # Totals: pass:5 fail:0 xfail:0 xpass:0 skip:0 error:0 Fixes: ab63a2387cb9 ("netdev: add per-queue statistics") Reviewed-by: Joe Damato Link: https://patch.msgid.link/20241213152244.3080955-3-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/core/netdev-genl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index 9f086b190619..1be8c7c21d19 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -668,7 +668,7 @@ netdev_nl_stats_by_queue(struct net_device *netdev, struct sk_buff *rsp, i, info); if (err) return err; - ctx->rxq_idx = i++; + ctx->rxq_idx = ++i; } i = ctx->txq_idx; while (ops->get_queue_stats_tx && i < netdev->real_num_tx_queues) { @@ -676,7 +676,7 @@ netdev_nl_stats_by_queue(struct net_device *netdev, struct sk_buff *rsp, i, info); if (err) return err; - ctx->txq_idx = i++; + ctx->txq_idx = ++i; } ctx->rxq_idx = 0; -- cgit v1.2.3 From 954a2b40719a21e763a1bba2f0da92347e058fce Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 16 Dec 2024 20:04:32 +0900 Subject: rtnetlink: Try the outer netns attribute in rtnl_get_peer_net(). Xiao Liang reported that the cited commit changed netns handling in newlink() of netkit, veth, and vxcan. Before the patch, if we don't find a netns attribute in the peer device attributes, we tried to find another netns attribute in the outer netlink attributes by passing it to rtnl_link_get_net(). Let's restore the original behaviour. Fixes: 48327566769a ("rtnetlink: fix double call of rtnl_link_get_net_ifla()") Reported-by: Xiao Liang Closes: https://lore.kernel.org/netdev/CABAhCORBVVU8P6AHcEkENMj+gD2d3ce9t=A_o48E0yOQp8_wUQ@mail.gmail.com/#t Signed-off-by: Kuniyuki Iwashima Tested-by: Xiao Liang Link: https://patch.msgid.link/20241216110432.51488-1-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/core/rtnetlink.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index ebcfc2debf1a..d9f959c619d9 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3819,6 +3819,7 @@ out_unregister: } static struct net *rtnl_get_peer_net(const struct rtnl_link_ops *ops, + struct nlattr *tbp[], struct nlattr *data[], struct netlink_ext_ack *extack) { @@ -3826,7 +3827,7 @@ static struct net *rtnl_get_peer_net(const struct rtnl_link_ops *ops, int err; if (!data || !data[ops->peer_type]) - return NULL; + return rtnl_link_get_net_ifla(tbp); err = rtnl_nla_parse_ifinfomsg(tb, data[ops->peer_type], extack); if (err < 0) @@ -3971,7 +3972,7 @@ static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, } if (ops->peer_type) { - peer_net = rtnl_get_peer_net(ops, data, extack); + peer_net = rtnl_get_peer_net(ops, tb, data, extack); if (IS_ERR(peer_net)) { ret = PTR_ERR(peer_net); goto put_ops; -- cgit v1.2.3 From 5eb70dbebf32c2fd1f2814c654ae17fc47d6e859 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 17 Dec 2024 18:25:08 -0800 Subject: netdev-genl: avoid empty messages in queue dump Empty netlink responses from do() are not correct (as opposed to dump() where not dumping anything is perfectly fine). We should return an error if the target object does not exist, in this case if the netdev is down it has no queues. Fixes: 6b6171db7fc8 ("netdev-genl: Add netlink framework functions for queue") Reported-by: syzbot+0a884bc2d304ce4af70f@syzkaller.appspotmail.com Reviewed-by: Eric Dumazet Reviewed-by: Joe Damato Link: https://patch.msgid.link/20241218022508.815344-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/core/netdev-genl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index 1be8c7c21d19..2d3ae0cd3ad2 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -430,10 +430,10 @@ static int netdev_nl_queue_fill(struct sk_buff *rsp, struct net_device *netdev, u32 q_idx, u32 q_type, const struct genl_info *info) { - int err = 0; + int err; if (!(netdev->flags & IFF_UP)) - return err; + return -ENOENT; err = netdev_nl_queue_validate(netdev, q_idx, q_type); if (err) -- cgit v1.2.3 From d888b7af7c149c115dd6ac772cc11c375da3e17c Mon Sep 17 00:00:00 2001 From: Zijian Zhang Date: Tue, 10 Dec 2024 01:20:39 +0000 Subject: tcp_bpf: Add sk_rmem_alloc related logic for tcp_bpf ingress redirection When we do sk_psock_verdict_apply->sk_psock_skb_ingress, an sk_msg will be created out of the skb, and the rmem accounting of the sk_msg will be handled by the skb. For skmsgs in __SK_REDIRECT case of tcp_bpf_send_verdict, when redirecting to the ingress of a socket, although we sk_rmem_schedule and add sk_msg to the ingress_msg of sk_redir, we do not update sk_rmem_alloc. As a result, except for the global memory limit, the rmem of sk_redir is nearly unlimited. Thus, add sk_rmem_alloc related logic to limit the recv buffer. Since the function sk_msg_recvmsg and __sk_psock_purge_ingress_msg are used in these two paths. We use "msg->skb" to test whether the sk_msg is skb backed up. If it's not, we shall do the memory accounting explicitly. Fixes: 604326b41a6f ("bpf, sockmap: convert to generic sk_msg interface") Signed-off-by: Zijian Zhang Signed-off-by: Daniel Borkmann Reviewed-by: John Fastabend Link: https://lore.kernel.org/bpf/20241210012039.1669389-3-zijianzhang@bytedance.com --- net/core/skmsg.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/skmsg.c b/net/core/skmsg.c index e90fbab703b2..8ad7e6755fd6 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -445,8 +445,10 @@ int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, if (likely(!peek)) { sge->offset += copy; sge->length -= copy; - if (!msg_rx->skb) + if (!msg_rx->skb) { sk_mem_uncharge(sk, copy); + atomic_sub(copy, &sk->sk_rmem_alloc); + } msg_rx->sg.size -= copy; if (!sge->length) { @@ -772,6 +774,8 @@ static void __sk_psock_purge_ingress_msg(struct sk_psock *psock) list_for_each_entry_safe(msg, tmp, &psock->ingress_msg, list) { list_del(&msg->list); + if (!msg->skb) + atomic_sub(msg->sg.size, &psock->sk->sk_rmem_alloc); sk_msg_free(psock->sk, msg); kfree(msg); } -- cgit v1.2.3 From 4a25201aa46ce88e8e31f9ccdec0e4e3dd6bb736 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 18 Dec 2024 19:28:32 -0800 Subject: netdev-genl: avoid empty messages in napi get Empty netlink responses from do() are not correct (as opposed to dump() where not dumping anything is perfectly fine). We should return an error if the target object does not exist, in this case if the netdev is down we "hide" the NAPI instances. Fixes: 27f91aaf49b3 ("netdev-genl: Add netlink framework functions for napi") Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20241219032833.1165433-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/core/netdev-genl.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index 2d3ae0cd3ad2..b0772d135efb 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -246,8 +246,12 @@ int netdev_nl_napi_get_doit(struct sk_buff *skb, struct genl_info *info) rcu_read_unlock(); rtnl_unlock(); - if (err) + if (err) { + goto err_free_msg; + } else if (!rsp->len) { + err = -ENOENT; goto err_free_msg; + } return genlmsg_reply(rsp, info); -- cgit v1.2.3 From fdf478d236dcf0f1f68534df5d456ced625195bd Mon Sep 17 00:00:00 2001 From: Levi Zim Date: Sat, 30 Nov 2024 21:38:22 +0800 Subject: skmsg: Return copied bytes in sk_msg_memcopy_from_iter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously sk_msg_memcopy_from_iter returns the copied bytes from the last copy_from_iter{,_nocache} call upon success. This commit changes it to return the total number of copied bytes on success. Signed-off-by: Levi Zim Signed-off-by: Daniel Borkmann Tested-by: Björn Töpel Reviewed-by: John Fastabend Link: https://lore.kernel.org/bpf/20241130-tcp-bpf-sendmsg-v1-1-bae583d014f3@outlook.com --- net/core/skmsg.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 8ad7e6755fd6..61f3f3d4e528 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -369,8 +369,8 @@ int sk_msg_memcopy_from_iter(struct sock *sk, struct iov_iter *from, struct sk_msg *msg, u32 bytes) { int ret = -ENOSPC, i = msg->sg.curr; + u32 copy, buf_size, copied = 0; struct scatterlist *sge; - u32 copy, buf_size; void *to; do { @@ -397,6 +397,7 @@ int sk_msg_memcopy_from_iter(struct sock *sk, struct iov_iter *from, goto out; } bytes -= copy; + copied += copy; if (!bytes) break; msg->sg.copybreak = 0; @@ -404,7 +405,7 @@ int sk_msg_memcopy_from_iter(struct sock *sk, struct iov_iter *from, } while (i != msg->sg.end); out: msg->sg.curr = i; - return ret; + return (ret < 0) ? ret : copied; } EXPORT_SYMBOL_GPL(sk_msg_memcopy_from_iter); -- cgit v1.2.3 From 9ecc4d858b92c1bb0673ad9c327298e600c55659 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Thu, 12 Dec 2024 19:40:54 -0800 Subject: bpf: Check negative offsets in __bpf_skb_min_len() skb_network_offset() and skb_transport_offset() can be negative when they are called after we pull the transport header, for example, when we use eBPF sockmap at the point of ->sk_data_ready(). __bpf_skb_min_len() uses an unsigned int to get these offsets, this leads to a very large number which then causes bpf_skb_change_tail() failed unexpectedly. Fix this by using a signed int to get these offsets and ensure the minimum is at least zero. Fixes: 5293efe62df8 ("bpf: add bpf_skb_change_tail helper") Signed-off-by: Cong Wang Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20241213034057.246437-2-xiyou.wangcong@gmail.com --- net/core/filter.c | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index 21131ec25f24..834614071727 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3734,13 +3734,22 @@ static const struct bpf_func_proto bpf_skb_adjust_room_proto = { static u32 __bpf_skb_min_len(const struct sk_buff *skb) { - u32 min_len = skb_network_offset(skb); + int offset = skb_network_offset(skb); + u32 min_len = 0; - if (skb_transport_header_was_set(skb)) - min_len = skb_transport_offset(skb); - if (skb->ip_summed == CHECKSUM_PARTIAL) - min_len = skb_checksum_start_offset(skb) + - skb->csum_offset + sizeof(__sum16); + if (offset > 0) + min_len = offset; + if (skb_transport_header_was_set(skb)) { + offset = skb_transport_offset(skb); + if (offset > 0) + min_len = offset; + } + if (skb->ip_summed == CHECKSUM_PARTIAL) { + offset = skb_checksum_start_offset(skb) + + skb->csum_offset + sizeof(__sum16); + if (offset > 0) + min_len = offset; + } return min_len; } -- cgit v1.2.3 From 68e068cabd2c6c533ef934c2e5151609cf6ecc6d Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Wed, 1 Jan 2025 11:47:40 -0500 Subject: net: reenable NETIF_F_IPV6_CSUM offload for BIG TCP packets The blamed commit disabled hardware offoad of IPv6 packets with extension headers on devices that advertise NETIF_F_IPV6_CSUM, based on the definition of that feature in skbuff.h: * * - %NETIF_F_IPV6_CSUM * - Driver (device) is only able to checksum plain * TCP or UDP packets over IPv6. These are specifically * unencapsulated packets of the form IPv6|TCP or * IPv6|UDP where the Next Header field in the IPv6 * header is either TCP or UDP. IPv6 extension headers * are not supported with this feature. This feature * cannot be set in features for a device with * NETIF_F_HW_CSUM also set. This feature is being * DEPRECATED (see below). The change causes skb_warn_bad_offload to fire for BIG TCP packets. [ 496.310233] WARNING: CPU: 13 PID: 23472 at net/core/dev.c:3129 skb_warn_bad_offload+0xc4/0xe0 [ 496.310297] ? skb_warn_bad_offload+0xc4/0xe0 [ 496.310300] skb_checksum_help+0x129/0x1f0 [ 496.310303] skb_csum_hwoffload_help+0x150/0x1b0 [ 496.310306] validate_xmit_skb+0x159/0x270 [ 496.310309] validate_xmit_skb_list+0x41/0x70 [ 496.310312] sch_direct_xmit+0x5c/0x250 [ 496.310317] __qdisc_run+0x388/0x620 BIG TCP introduced an IPV6_TLV_JUMBO IPv6 extension header to communicate packet length, as this is an IPv6 jumbogram. But, the feature is only enabled on devices that support BIG TCP TSO. The header is only present for PF_PACKET taps like tcpdump, and not transmitted by physical devices. For this specific case of extension headers that are not transmitted, return to the situation before the blamed commit and support hardware offload. ipv6_has_hopopt_jumbo() tests not only whether this header is present, but also that it is the only extension header before a terminal (L4) header. Fixes: 04c20a9356f2 ("net: skip offload for NETIF_F_IPV6_CSUM if ipv6 header contains extension") Reported-by: syzbot Reported-by: Eric Dumazet Closes: https://lore.kernel.org/netdev/CANn89iK1hdC3Nt8KPhOtTF8vCPc1AHDCtse_BTNki1pWxAByTQ@mail.gmail.com/ Signed-off-by: Willem de Bruijn Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250101164909.1331680-1-willemdebruijn.kernel@gmail.com Signed-off-by: Jakub Kicinski --- net/core/dev.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 45a8c3dd4a64..faa23042df38 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3642,8 +3642,10 @@ int skb_csum_hwoffload_help(struct sk_buff *skb, if (features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) { if (vlan_get_protocol(skb) == htons(ETH_P_IPV6) && - skb_network_header_len(skb) != sizeof(struct ipv6hdr)) + skb_network_header_len(skb) != sizeof(struct ipv6hdr) && + !ipv6_has_hopopt_jumbo(skb)) goto sw_checksum; + switch (skb->csum_offset) { case offsetof(struct tcphdr, check): case offsetof(struct udphdr, check): -- cgit v1.2.3 From 5b0af621c3f6ef9261cf6067812f2fd9943acb4b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 31 Dec 2024 16:05:27 +0000 Subject: net: restrict SO_REUSEPORT to inet sockets After blamed commit, crypto sockets could accidentally be destroyed from RCU call back, as spotted by zyzbot [1]. Trying to acquire a mutex in RCU callback is not allowed. Restrict SO_REUSEPORT socket option to inet sockets. v1 of this patch supported TCP, UDP and SCTP sockets, but fcnal-test.sh test needed RAW and ICMP support. [1] BUG: sleeping function called from invalid context at kernel/locking/mutex.c:562 in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 24, name: ksoftirqd/1 preempt_count: 100, expected: 0 RCU nest depth: 0, expected: 0 1 lock held by ksoftirqd/1/24: #0: ffffffff8e937ba0 (rcu_callback){....}-{0:0}, at: rcu_lock_acquire include/linux/rcupdate.h:337 [inline] #0: ffffffff8e937ba0 (rcu_callback){....}-{0:0}, at: rcu_do_batch kernel/rcu/tree.c:2561 [inline] #0: ffffffff8e937ba0 (rcu_callback){....}-{0:0}, at: rcu_core+0xa37/0x17a0 kernel/rcu/tree.c:2823 Preemption disabled at: [] softirq_handle_begin kernel/softirq.c:402 [inline] [] handle_softirqs+0x128/0x9b0 kernel/softirq.c:537 CPU: 1 UID: 0 PID: 24 Comm: ksoftirqd/1 Not tainted 6.13.0-rc3-syzkaller-00174-ga024e377efed #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 09/13/2024 Call Trace: __dump_stack lib/dump_stack.c:94 [inline] dump_stack_lvl+0x241/0x360 lib/dump_stack.c:120 __might_resched+0x5d4/0x780 kernel/sched/core.c:8758 __mutex_lock_common kernel/locking/mutex.c:562 [inline] __mutex_lock+0x131/0xee0 kernel/locking/mutex.c:735 crypto_put_default_null_skcipher+0x18/0x70 crypto/crypto_null.c:179 aead_release+0x3d/0x50 crypto/algif_aead.c:489 alg_do_release crypto/af_alg.c:118 [inline] alg_sock_destruct+0x86/0xc0 crypto/af_alg.c:502 __sk_destruct+0x58/0x5f0 net/core/sock.c:2260 rcu_do_batch kernel/rcu/tree.c:2567 [inline] rcu_core+0xaaa/0x17a0 kernel/rcu/tree.c:2823 handle_softirqs+0x2d4/0x9b0 kernel/softirq.c:561 run_ksoftirqd+0xca/0x130 kernel/softirq.c:950 smpboot_thread_fn+0x544/0xa30 kernel/smpboot.c:164 kthread+0x2f0/0x390 kernel/kthread.c:389 ret_from_fork+0x4b/0x80 arch/x86/kernel/process.c:147 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:244 Fixes: 8c7138b33e5c ("net: Unpublish sk from sk_reuseport_cb before call_rcu") Reported-by: syzbot+b3e02953598f447d4d2a@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/6772f2f4.050a0220.2f3838.04cb.GAE@google.com/T/#u Signed-off-by: Eric Dumazet Cc: Martin KaFai Lau Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20241231160527.3994168-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/core/sock.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/sock.c b/net/core/sock.c index 74729d20cd00..be84885f9290 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1295,7 +1295,10 @@ int sk_setsockopt(struct sock *sk, int level, int optname, sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE); break; case SO_REUSEPORT: - sk->sk_reuseport = valbool; + if (valbool && !sk_is_inet(sk)) + ret = -EOPNOTSUPP; + else + sk->sk_reuseport = valbool; break; case SO_DONTROUTE: sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool); -- cgit v1.2.3