From cfc83a3c71517b59c1047db57da31e26a9dc2f33 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Mon, 16 Feb 2026 11:20:29 +0100 Subject: batman-adv: Avoid double-rtnl_lock ELP metric worker MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit batadv_v_elp_get_throughput() might be called when the RTNL lock is already held. This could be problematic when the work queue item is cancelled via cancel_delayed_work_sync() in batadv_v_elp_iface_disable(). In this case, an rtnl_lock() would cause a deadlock. To avoid this, rtnl_trylock() was used in this function to skip the retrieval of the ethtool information in case the RTNL lock was already held. But for cfg80211 interfaces, batadv_get_real_netdev() was called - which also uses rtnl_lock(). The approach for __ethtool_get_link_ksettings() must also be used instead and the lockless version __batadv_get_real_netdev() has to be called. Cc: stable@vger.kernel.org Fixes: 8c8ecc98f5c6 ("batman-adv: Drop unmanaged ELP metric worker") Reported-by: Christian Schmidbauer Signed-off-by: Sven Eckelmann Tested-by: Sören Skaarup Signed-off-by: Simon Wunderlich --- net/batman-adv/bat_v_elp.c | 10 +++++++++- net/batman-adv/hard-interface.c | 8 ++++---- net/batman-adv/hard-interface.h | 1 + 3 files changed, 14 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c index cb16c1ed2a58..fe832093d421 100644 --- a/net/batman-adv/bat_v_elp.c +++ b/net/batman-adv/bat_v_elp.c @@ -111,7 +111,15 @@ static bool batadv_v_elp_get_throughput(struct batadv_hardif_neigh_node *neigh, /* unsupported WiFi driver version */ goto default_throughput; - real_netdev = batadv_get_real_netdev(hard_iface->net_dev); + /* only use rtnl_trylock because the elp worker will be cancelled while + * the rntl_lock is held. the cancel_delayed_work_sync() would otherwise + * wait forever when the elp work_item was started and it is then also + * trying to rtnl_lock + */ + if (!rtnl_trylock()) + return false; + real_netdev = __batadv_get_real_netdev(hard_iface->net_dev); + rtnl_unlock(); if (!real_netdev) goto default_throughput; diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index 5113f879736b..1c488049d554 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -204,7 +204,7 @@ static bool batadv_is_valid_iface(const struct net_device *net_dev) } /** - * batadv_get_real_netdevice() - check if the given netdev struct is a virtual + * __batadv_get_real_netdev() - check if the given netdev struct is a virtual * interface on top of another 'real' interface * @netdev: the device to check * @@ -214,7 +214,7 @@ static bool batadv_is_valid_iface(const struct net_device *net_dev) * Return: the 'real' net device or the original net device and NULL in case * of an error. */ -static struct net_device *batadv_get_real_netdevice(struct net_device *netdev) +struct net_device *__batadv_get_real_netdev(struct net_device *netdev) { struct batadv_hard_iface *hard_iface = NULL; struct net_device *real_netdev = NULL; @@ -267,7 +267,7 @@ struct net_device *batadv_get_real_netdev(struct net_device *net_device) struct net_device *real_netdev; rtnl_lock(); - real_netdev = batadv_get_real_netdevice(net_device); + real_netdev = __batadv_get_real_netdev(net_device); rtnl_unlock(); return real_netdev; @@ -336,7 +336,7 @@ static u32 batadv_wifi_flags_evaluate(struct net_device *net_device) if (batadv_is_cfg80211_netdev(net_device)) wifi_flags |= BATADV_HARDIF_WIFI_CFG80211_DIRECT; - real_netdev = batadv_get_real_netdevice(net_device); + real_netdev = __batadv_get_real_netdev(net_device); if (!real_netdev) return wifi_flags; diff --git a/net/batman-adv/hard-interface.h b/net/batman-adv/hard-interface.h index 9db8a310961e..9ba8fb2bdceb 100644 --- a/net/batman-adv/hard-interface.h +++ b/net/batman-adv/hard-interface.h @@ -67,6 +67,7 @@ enum batadv_hard_if_bcast { extern struct notifier_block batadv_hard_if_notifier; +struct net_device *__batadv_get_real_netdev(struct net_device *net_device); struct net_device *batadv_get_real_netdev(struct net_device *net_device); bool batadv_is_cfg80211_hardif(struct batadv_hard_iface *hard_iface); bool batadv_is_wifi_hardif(struct batadv_hard_iface *hard_iface); -- cgit v1.2.3 From e35626f610f3d2b7953ccddf6a77453da22b3a9e Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Tue, 24 Feb 2026 21:28:32 +0100 Subject: net/sched: ets: fix divide by zero in the offload path Offloading ETS requires computing each class' WRR weight: this is done by averaging over the sums of quanta as 'q_sum' and 'q_psum'. Using unsigned int, the same integer size as the individual DRR quanta, can overflow and even cause division by zero, like it happened in the following splat: Oops: divide error: 0000 [#1] SMP PTI CPU: 13 UID: 0 PID: 487 Comm: tc Tainted: G E 6.19.0-virtme #45 PREEMPT(full) Tainted: [E]=UNSIGNED_MODULE Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 RIP: 0010:ets_offload_change+0x11f/0x290 [sch_ets] Code: e4 45 31 ff eb 03 41 89 c7 41 89 cb 89 ce 83 f9 0f 0f 87 b7 00 00 00 45 8b 08 31 c0 45 01 cc 45 85 c9 74 09 41 6b c4 64 31 d2 <41> f7 f2 89 c2 44 29 fa 45 89 df 41 83 fb 0f 0f 87 c7 00 00 00 44 RSP: 0018:ffffd0a180d77588 EFLAGS: 00010246 RAX: 00000000ffffff38 RBX: ffff8d3d482ca000 RCX: 0000000000000000 RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffffd0a180d77660 RBP: ffffd0a180d77690 R08: ffff8d3d482ca2d8 R09: 00000000fffffffe R10: 0000000000000000 R11: 0000000000000000 R12: 00000000fffffffe R13: ffff8d3d472f2000 R14: 0000000000000003 R15: 0000000000000000 FS: 00007f440b6c2740(0000) GS:ffff8d3dc9803000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000000003cdd2000 CR3: 0000000007b58002 CR4: 0000000000172ef0 Call Trace: ets_qdisc_change+0x870/0xf40 [sch_ets] qdisc_create+0x12b/0x540 tc_modify_qdisc+0x6d7/0xbd0 rtnetlink_rcv_msg+0x168/0x6b0 netlink_rcv_skb+0x5c/0x110 netlink_unicast+0x1d6/0x2b0 netlink_sendmsg+0x22e/0x470 ____sys_sendmsg+0x38a/0x3c0 ___sys_sendmsg+0x99/0xe0 __sys_sendmsg+0x8a/0xf0 do_syscall_64+0x111/0xf80 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7f440b81c77e Code: 4d 89 d8 e8 d4 bc 00 00 4c 8b 5d f8 41 8b 93 08 03 00 00 59 5e 48 83 f8 fc 74 11 c9 c3 0f 1f 80 00 00 00 00 48 8b 45 10 0f 05 c3 83 e2 39 83 fa 08 75 e7 e8 13 ff ff ff 0f 1f 00 f3 0f 1e fa RSP: 002b:00007fff951e4c10 EFLAGS: 00000202 ORIG_RAX: 000000000000002e RAX: ffffffffffffffda RBX: 0000000000481820 RCX: 00007f440b81c77e RDX: 0000000000000000 RSI: 00007fff951e4cd0 RDI: 0000000000000003 RBP: 00007fff951e4c20 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000202 R12: 00007fff951f4fa8 R13: 00000000699ddede R14: 00007f440bb01000 R15: 0000000000486980 Modules linked in: sch_ets(E) netdevsim(E) ---[ end trace 0000000000000000 ]--- RIP: 0010:ets_offload_change+0x11f/0x290 [sch_ets] Code: e4 45 31 ff eb 03 41 89 c7 41 89 cb 89 ce 83 f9 0f 0f 87 b7 00 00 00 45 8b 08 31 c0 45 01 cc 45 85 c9 74 09 41 6b c4 64 31 d2 <41> f7 f2 89 c2 44 29 fa 45 89 df 41 83 fb 0f 0f 87 c7 00 00 00 44 RSP: 0018:ffffd0a180d77588 EFLAGS: 00010246 RAX: 00000000ffffff38 RBX: ffff8d3d482ca000 RCX: 0000000000000000 RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffffd0a180d77660 RBP: ffffd0a180d77690 R08: ffff8d3d482ca2d8 R09: 00000000fffffffe R10: 0000000000000000 R11: 0000000000000000 R12: 00000000fffffffe R13: ffff8d3d472f2000 R14: 0000000000000003 R15: 0000000000000000 FS: 00007f440b6c2740(0000) GS:ffff8d3dc9803000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000000003cdd2000 CR3: 0000000007b58002 CR4: 0000000000172ef0 Kernel panic - not syncing: Fatal exception Kernel Offset: 0x30000000 from 0xffffffff81000000 (relocation range: 0xffffffff80000000-0xffffffffbfffffff) ---[ end Kernel panic - not syncing: Fatal exception ]--- Fix this using 64-bit integers for 'q_sum' and 'q_psum'. Cc: stable@vger.kernel.org Fixes: d35eb52bd2ac ("net: sch_ets: Make the ETS qdisc offloadable") Signed-off-by: Davide Caratti Reviewed-by: Jamal Hadi Salim Reviewed-by: Petr Machata Link: https://patch.msgid.link/28504887df314588c7255e9911769c36f751edee.1771964872.git.dcaratti@redhat.com Signed-off-by: Jakub Kicinski --- net/sched/sch_ets.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/sched/sch_ets.c b/net/sched/sch_ets.c index 306e046276d4..a4b07b661b77 100644 --- a/net/sched/sch_ets.c +++ b/net/sched/sch_ets.c @@ -115,12 +115,12 @@ static void ets_offload_change(struct Qdisc *sch) struct ets_sched *q = qdisc_priv(sch); struct tc_ets_qopt_offload qopt; unsigned int w_psum_prev = 0; - unsigned int q_psum = 0; - unsigned int q_sum = 0; unsigned int quantum; unsigned int w_psum; unsigned int weight; unsigned int i; + u64 q_psum = 0; + u64 q_sum = 0; if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc) return; @@ -138,8 +138,12 @@ static void ets_offload_change(struct Qdisc *sch) for (i = 0; i < q->nbands; i++) { quantum = q->classes[i].quantum; - q_psum += quantum; - w_psum = quantum ? q_psum * 100 / q_sum : 0; + if (quantum) { + q_psum += quantum; + w_psum = div64_u64(q_psum * 100, q_sum); + } else { + w_psum = 0; + } weight = w_psum - w_psum_prev; w_psum_prev = w_psum; -- cgit v1.2.3 From 2ef2b20cf4e04ac8a6ba68493f8780776ff84300 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 25 Feb 2026 13:15:47 +0000 Subject: net: annotate data-races around sk->sk_{data_ready,write_space} skmsg (and probably other layers) are changing these pointers while other cpus might read them concurrently. Add corresponding READ_ONCE()/WRITE_ONCE() annotations for UDP, TCP and AF_UNIX. Fixes: 604326b41a6f ("bpf, sockmap: convert to generic sk_msg interface") Reported-by: syzbot+87f770387a9e5dc6b79b@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/699ee9fc.050a0220.1cd54b.0009.GAE@google.com/ Signed-off-by: Eric Dumazet Cc: Daniel Borkmann Cc: John Fastabend Cc: Jakub Sitnicki Cc: Willem de Bruijn Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20260225131547.1085509-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/core/skmsg.c | 14 +++++++------- net/ipv4/tcp.c | 4 ++-- net/ipv4/tcp_bpf.c | 2 +- net/ipv4/tcp_input.c | 14 ++++++++------ net/ipv4/tcp_minisocks.c | 2 +- net/ipv4/udp.c | 2 +- net/ipv4/udp_bpf.c | 2 +- net/unix/af_unix.c | 8 ++++---- 8 files changed, 25 insertions(+), 23 deletions(-) (limited to 'net') diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 2e26174c9919..3261793abe83 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -1205,8 +1205,8 @@ void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock) return; psock->saved_data_ready = sk->sk_data_ready; - sk->sk_data_ready = sk_psock_strp_data_ready; - sk->sk_write_space = sk_psock_write_space; + WRITE_ONCE(sk->sk_data_ready, sk_psock_strp_data_ready); + WRITE_ONCE(sk->sk_write_space, sk_psock_write_space); } void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock) @@ -1216,8 +1216,8 @@ void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock) if (!psock->saved_data_ready) return; - sk->sk_data_ready = psock->saved_data_ready; - psock->saved_data_ready = NULL; + WRITE_ONCE(sk->sk_data_ready, psock->saved_data_ready); + WRITE_ONCE(psock->saved_data_ready, NULL); strp_stop(&psock->strp); } @@ -1296,8 +1296,8 @@ void sk_psock_start_verdict(struct sock *sk, struct sk_psock *psock) return; psock->saved_data_ready = sk->sk_data_ready; - sk->sk_data_ready = sk_psock_verdict_data_ready; - sk->sk_write_space = sk_psock_write_space; + WRITE_ONCE(sk->sk_data_ready, sk_psock_verdict_data_ready); + WRITE_ONCE(sk->sk_write_space, sk_psock_write_space); } void sk_psock_stop_verdict(struct sock *sk, struct sk_psock *psock) @@ -1308,6 +1308,6 @@ void sk_psock_stop_verdict(struct sock *sk, struct sk_psock *psock) if (!psock->saved_data_ready) return; - sk->sk_data_ready = psock->saved_data_ready; + WRITE_ONCE(sk->sk_data_ready, psock->saved_data_ready); psock->saved_data_ready = NULL; } diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index f84d9a45cc9d..8cdc26e8ad68 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1446,7 +1446,7 @@ out_err: err = sk_stream_error(sk, flags, err); /* make sure we wake any epoll edge trigger waiter */ if (unlikely(tcp_rtx_and_write_queues_empty(sk) && err == -EAGAIN)) { - sk->sk_write_space(sk); + READ_ONCE(sk->sk_write_space)(sk); tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED); } if (binding) @@ -4181,7 +4181,7 @@ ao_parse: break; case TCP_NOTSENT_LOWAT: WRITE_ONCE(tp->notsent_lowat, val); - sk->sk_write_space(sk); + READ_ONCE(sk->sk_write_space)(sk); break; case TCP_INQ: if (val > 1 || val < 0) diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index c449a044895e..813d2e498c93 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -725,7 +725,7 @@ int tcp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore) WRITE_ONCE(sk->sk_prot->unhash, psock->saved_unhash); tcp_update_ulp(sk, psock->sk_proto, psock->saved_write_space); } else { - sk->sk_write_space = psock->saved_write_space; + WRITE_ONCE(sk->sk_write_space, psock->saved_write_space); /* Pairs with lockless read in sk_clone_lock() */ sock_replace_proto(sk, psock->sk_proto); } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 41c57efd125c..6404e53382ca 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5425,7 +5425,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) if (unlikely(tcp_try_rmem_schedule(sk, skb, skb->truesize))) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFODROP); - sk->sk_data_ready(sk); + READ_ONCE(sk->sk_data_ready)(sk); tcp_drop_reason(sk, skb, SKB_DROP_REASON_PROTO_MEM); return; } @@ -5635,7 +5635,7 @@ err: void tcp_data_ready(struct sock *sk) { if (tcp_epollin_ready(sk, sk->sk_rcvlowat) || sock_flag(sk, SOCK_DONE)) - sk->sk_data_ready(sk); + READ_ONCE(sk->sk_data_ready)(sk); } static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) @@ -5691,7 +5691,7 @@ queue_and_out: inet_csk(sk)->icsk_ack.pending |= (ICSK_ACK_NOMEM | ICSK_ACK_NOW); inet_csk_schedule_ack(sk); - sk->sk_data_ready(sk); + READ_ONCE(sk->sk_data_ready)(sk); if (skb_queue_len(&sk->sk_receive_queue) && skb->len) { reason = SKB_DROP_REASON_PROTO_MEM; @@ -6114,7 +6114,9 @@ static void tcp_new_space(struct sock *sk) tp->snd_cwnd_stamp = tcp_jiffies32; } - INDIRECT_CALL_1(sk->sk_write_space, sk_stream_write_space, sk); + INDIRECT_CALL_1(READ_ONCE(sk->sk_write_space), + sk_stream_write_space, + sk); } /* Caller made space either from: @@ -6325,7 +6327,7 @@ static void tcp_urg(struct sock *sk, struct sk_buff *skb, const struct tcphdr *t BUG(); WRITE_ONCE(tp->urg_data, TCP_URG_VALID | tmp); if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_data_ready(sk); + READ_ONCE(sk->sk_data_ready)(sk); } } } @@ -7792,7 +7794,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, sock_put(fastopen_sk); goto drop_and_free; } - sk->sk_data_ready(sk); + READ_ONCE(sk->sk_data_ready)(sk); bh_unlock_sock(fastopen_sk); sock_put(fastopen_sk); } else { diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index d9c5a43bd281..dafb63b923d0 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -1004,7 +1004,7 @@ enum skb_drop_reason tcp_child_process(struct sock *parent, struct sock *child, reason = tcp_rcv_state_process(child, skb); /* Wakeup parent, send SIGIO */ if (state == TCP_SYN_RECV && child->sk_state != state) - parent->sk_data_ready(parent); + READ_ONCE(parent->sk_data_ready)(parent); } else { /* Alas, it is possible again, because we do lookup * in main socket hash table and lock on listening diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 6c6b68a66dcd..014fdfdd331b 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1787,7 +1787,7 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb) * using prepare_to_wait_exclusive(). */ while (nb) { - INDIRECT_CALL_1(sk->sk_data_ready, + INDIRECT_CALL_1(READ_ONCE(sk->sk_data_ready), sock_def_readable, sk); nb--; } diff --git a/net/ipv4/udp_bpf.c b/net/ipv4/udp_bpf.c index 91233e37cd97..779a3a03762f 100644 --- a/net/ipv4/udp_bpf.c +++ b/net/ipv4/udp_bpf.c @@ -158,7 +158,7 @@ int udp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore) int family = sk->sk_family == AF_INET ? UDP_BPF_IPV4 : UDP_BPF_IPV6; if (restore) { - sk->sk_write_space = psock->saved_write_space; + WRITE_ONCE(sk->sk_write_space, psock->saved_write_space); sock_replace_proto(sk, psock->sk_proto); return 0; } diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 3756a93dc63a..7eaa5b187fef 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1785,7 +1785,7 @@ restart: __skb_queue_tail(&other->sk_receive_queue, skb); spin_unlock(&other->sk_receive_queue.lock); unix_state_unlock(other); - other->sk_data_ready(other); + READ_ONCE(other->sk_data_ready)(other); sock_put(other); return 0; @@ -2278,7 +2278,7 @@ restart_locked: scm_stat_add(other, skb); skb_queue_tail(&other->sk_receive_queue, skb); unix_state_unlock(other); - other->sk_data_ready(other); + READ_ONCE(other->sk_data_ready)(other); sock_put(other); scm_destroy(&scm); return len; @@ -2351,7 +2351,7 @@ static int queue_oob(struct sock *sk, struct msghdr *msg, struct sock *other, sk_send_sigurg(other); unix_state_unlock(other); - other->sk_data_ready(other); + READ_ONCE(other->sk_data_ready)(other); return 0; out_unlock: @@ -2477,7 +2477,7 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, spin_unlock(&other->sk_receive_queue.lock); unix_state_unlock(other); - other->sk_data_ready(other); + READ_ONCE(other->sk_data_ready)(other); sent += size; } -- cgit v1.2.3 From 93c9475c04acad2457a7e7ea4e3ec40a6e6d94a7 Mon Sep 17 00:00:00 2001 From: Danielle Ratson Date: Wed, 25 Feb 2026 16:39:55 +0200 Subject: bridge: Check relevant per-VLAN options in VLAN range grouping The br_vlan_opts_eq_range() function determines if consecutive VLANs can be grouped together in a range for compact netlink notifications. It currently checks state, tunnel info, and multicast router configuration, but misses two categories of per-VLAN options that affect the output: 1. User-visible priv_flags (neigh_suppress, mcast_enabled) 2. Port multicast context (mcast_max_groups, mcast_n_groups) When VLANs have different settings for these options, they are incorrectly grouped into ranges, causing netlink notifications to report only one VLAN's settings for the entire range. Fix by checking priv_flags equality, but only for flags that affect netlink output (BR_VLFLAG_NEIGH_SUPPRESS_ENABLED and BR_VLFLAG_MCAST_ENABLED), and comparing multicast context (mcast_max_groups and mcast_n_groups). Example showing the bugs before the fix: $ bridge vlan set vid 10 dev dummy1 neigh_suppress on $ bridge vlan set vid 11 dev dummy1 neigh_suppress off $ bridge -d vlan show dev dummy1 port vlan-id dummy1 10-11 ... neigh_suppress on $ bridge vlan set vid 10 dev dummy1 mcast_max_groups 100 $ bridge vlan set vid 11 dev dummy1 mcast_max_groups 200 $ bridge -d vlan show dev dummy1 port vlan-id dummy1 10-11 ... mcast_max_groups 100 After the fix, VLANs 10 and 11 are shown as separate entries with their correct individual settings. Fixes: a1aee20d5db2 ("net: bridge: Add netlink knobs for number / maximum MDB entries") Fixes: 83f6d600796c ("bridge: vlan: Allow setting VLAN neighbor suppression state") Signed-off-by: Danielle Ratson Reviewed-by: Ido Schimmel Acked-by: Nikolay Aleksandrov Link: https://patch.msgid.link/20260225143956.3995415-2-danieller@nvidia.com Signed-off-by: Jakub Kicinski --- net/bridge/br_private.h | 10 ++++++++++ net/bridge/br_vlan_options.c | 26 +++++++++++++++++++++++--- 2 files changed, 33 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index b9b2981c4841..9b55d38ea9ed 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -1344,6 +1344,16 @@ br_multicast_ctx_options_equal(const struct net_bridge_mcast *brmctx1, true; } +static inline bool +br_multicast_port_ctx_options_equal(const struct net_bridge_mcast_port *pmctx1, + const struct net_bridge_mcast_port *pmctx2) +{ + return br_multicast_ngroups_get(pmctx1) == + br_multicast_ngroups_get(pmctx2) && + br_multicast_ngroups_get_max(pmctx1) == + br_multicast_ngroups_get_max(pmctx2); +} + static inline bool br_multicast_ctx_matches_vlan_snooping(const struct net_bridge_mcast *brmctx) { diff --git a/net/bridge/br_vlan_options.c b/net/bridge/br_vlan_options.c index 8fa89b04ee94..5514e1fc8d1f 100644 --- a/net/bridge/br_vlan_options.c +++ b/net/bridge/br_vlan_options.c @@ -43,9 +43,29 @@ bool br_vlan_opts_eq_range(const struct net_bridge_vlan *v_curr, u8 range_mc_rtr = br_vlan_multicast_router(range_end); u8 curr_mc_rtr = br_vlan_multicast_router(v_curr); - return v_curr->state == range_end->state && - __vlan_tun_can_enter_range(v_curr, range_end) && - curr_mc_rtr == range_mc_rtr; + if (v_curr->state != range_end->state) + return false; + + if (!__vlan_tun_can_enter_range(v_curr, range_end)) + return false; + + if (curr_mc_rtr != range_mc_rtr) + return false; + + /* Check user-visible priv_flags that affect output */ + if ((v_curr->priv_flags ^ range_end->priv_flags) & + (BR_VLFLAG_NEIGH_SUPPRESS_ENABLED | BR_VLFLAG_MCAST_ENABLED)) + return false; + +#ifdef CONFIG_BRIDGE_IGMP_SNOOPING + if (!br_vlan_is_master(v_curr) && + !br_multicast_port_ctx_vlan_disabled(&v_curr->port_mcast_ctx) && + !br_multicast_port_ctx_options_equal(&v_curr->port_mcast_ctx, + &range_end->port_mcast_ctx)) + return false; +#endif + + return true; } bool br_vlan_opts_fill(struct sk_buff *skb, const struct net_bridge_vlan *v, -- cgit v1.2.3 From 7b6275c80a0c81c5f8943272292dfe67730ce849 Mon Sep 17 00:00:00 2001 From: Eric Badger Date: Mon, 23 Feb 2026 10:28:55 -0800 Subject: xprtrdma: Decrement re_receiving on the early exit paths In the event that rpcrdma_post_recvs() fails to create a work request (due to memory allocation failure, say) or otherwise exits early, we should decrement ep->re_receiving before returning. Otherwise we will hang in rpcrdma_xprt_drain() as re_receiving will never reach zero and the completion will never be triggered. On a system with high memory pressure, this can appear as the following hung task: INFO: task kworker/u385:17:8393 blocked for more than 122 seconds. Tainted: G S E 6.19.0 #3 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. task:kworker/u385:17 state:D stack:0 pid:8393 tgid:8393 ppid:2 task_flags:0x4248060 flags:0x00080000 Workqueue: xprtiod xprt_autoclose [sunrpc] Call Trace: __schedule+0x48b/0x18b0 ? ib_post_send_mad+0x247/0xae0 [ib_core] schedule+0x27/0xf0 schedule_timeout+0x104/0x110 __wait_for_common+0x98/0x180 ? __pfx_schedule_timeout+0x10/0x10 wait_for_completion+0x24/0x40 rpcrdma_xprt_disconnect+0x444/0x460 [rpcrdma] xprt_rdma_close+0x12/0x40 [rpcrdma] xprt_autoclose+0x5f/0x120 [sunrpc] process_one_work+0x191/0x3e0 worker_thread+0x2e3/0x420 ? __pfx_worker_thread+0x10/0x10 kthread+0x10d/0x230 ? __pfx_kthread+0x10/0x10 ret_from_fork+0x273/0x2b0 ? __pfx_kthread+0x10/0x10 ret_from_fork_asm+0x1a/0x30 Fixes: 15788d1d1077 ("xprtrdma: Do not refresh Receive Queue while it is draining") Signed-off-by: Eric Badger Reviewed-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/verbs.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 15bbf953dfad..b51a162885bb 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -1362,7 +1362,7 @@ void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, int needed) needed += RPCRDMA_MAX_RECV_BATCH; if (atomic_inc_return(&ep->re_receiving) > 1) - goto out; + goto out_dec; /* fast path: all needed reps can be found on the free list */ wr = NULL; @@ -1385,7 +1385,7 @@ void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, int needed) ++count; } if (!wr) - goto out; + goto out_dec; rc = ib_post_recv(ep->re_id->qp, wr, (const struct ib_recv_wr **)&bad_wr); @@ -1400,9 +1400,10 @@ void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, int needed) --count; } } + +out_dec: if (atomic_dec_return(&ep->re_receiving) > 0) complete(&ep->re_done); - out: trace_xprtrdma_post_recvs(r_xprt, count); ep->re_receive_count += count; -- cgit v1.2.3 From 62413a9c3cb183afb9bb6e94dd68caf4e4145f4c Mon Sep 17 00:00:00 2001 From: Paul Moses Date: Mon, 23 Feb 2026 15:05:44 +0000 Subject: net/sched: act_gate: snapshot parameters with RCU on replace The gate action can be replaced while the hrtimer callback or dump path is walking the schedule list. Convert the parameters to an RCU-protected snapshot and swap updates under tcf_lock, freeing the previous snapshot via call_rcu(). When REPLACE omits the entry list, preserve the existing schedule so the effective state is unchanged. Fixes: a51c328df310 ("net: qos: introduce a gate control flow action") Cc: stable@vger.kernel.org Signed-off-by: Paul Moses Tested-by: Vladimir Oltean Acked-by: Jamal Hadi Salim Reviewed-by: Victor Nogueira Link: https://patch.msgid.link/20260223150512.2251594-2-p@1g4.org Signed-off-by: Jakub Kicinski --- net/sched/act_gate.c | 265 ++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 186 insertions(+), 79 deletions(-) (limited to 'net') diff --git a/net/sched/act_gate.c b/net/sched/act_gate.c index 686eaed81b81..fdbfcaa3e2ab 100644 --- a/net/sched/act_gate.c +++ b/net/sched/act_gate.c @@ -32,9 +32,12 @@ static ktime_t gate_get_time(struct tcf_gate *gact) return KTIME_MAX; } -static void gate_get_start_time(struct tcf_gate *gact, ktime_t *start) +static void tcf_gate_params_free_rcu(struct rcu_head *head); + +static void gate_get_start_time(struct tcf_gate *gact, + const struct tcf_gate_params *param, + ktime_t *start) { - struct tcf_gate_params *param = &gact->param; ktime_t now, base, cycle; u64 n; @@ -69,12 +72,14 @@ static enum hrtimer_restart gate_timer_func(struct hrtimer *timer) { struct tcf_gate *gact = container_of(timer, struct tcf_gate, hitimer); - struct tcf_gate_params *p = &gact->param; struct tcfg_gate_entry *next; + struct tcf_gate_params *p; ktime_t close_time, now; spin_lock(&gact->tcf_lock); + p = rcu_dereference_protected(gact->param, + lockdep_is_held(&gact->tcf_lock)); next = gact->next_entry; /* cycle start, clear pending bit, clear total octets */ @@ -225,6 +230,35 @@ static void release_entry_list(struct list_head *entries) } } +static int tcf_gate_copy_entries(struct tcf_gate_params *dst, + const struct tcf_gate_params *src, + struct netlink_ext_ack *extack) +{ + struct tcfg_gate_entry *entry; + int i = 0; + + list_for_each_entry(entry, &src->entries, list) { + struct tcfg_gate_entry *new; + + new = kzalloc(sizeof(*new), GFP_ATOMIC); + if (!new) { + NL_SET_ERR_MSG(extack, "Not enough memory for entry"); + return -ENOMEM; + } + + new->index = entry->index; + new->gate_state = entry->gate_state; + new->interval = entry->interval; + new->ipv = entry->ipv; + new->maxoctets = entry->maxoctets; + list_add_tail(&new->list, &dst->entries); + i++; + } + + dst->num_entries = i; + return 0; +} + static int parse_gate_list(struct nlattr *list_attr, struct tcf_gate_params *sched, struct netlink_ext_ack *extack) @@ -270,24 +304,44 @@ release_list: return err; } -static void gate_setup_timer(struct tcf_gate *gact, u64 basetime, - enum tk_offsets tko, s32 clockid, - bool do_init) +static bool gate_timer_needs_cancel(u64 basetime, u64 old_basetime, + enum tk_offsets tko, + enum tk_offsets old_tko, + s32 clockid, s32 old_clockid) { - if (!do_init) { - if (basetime == gact->param.tcfg_basetime && - tko == gact->tk_offset && - clockid == gact->param.tcfg_clockid) - return; + return basetime != old_basetime || + clockid != old_clockid || + tko != old_tko; +} - spin_unlock_bh(&gact->tcf_lock); - hrtimer_cancel(&gact->hitimer); - spin_lock_bh(&gact->tcf_lock); +static int gate_clock_resolve(s32 clockid, enum tk_offsets *tko, + struct netlink_ext_ack *extack) +{ + switch (clockid) { + case CLOCK_REALTIME: + *tko = TK_OFFS_REAL; + return 0; + case CLOCK_MONOTONIC: + *tko = TK_OFFS_MAX; + return 0; + case CLOCK_BOOTTIME: + *tko = TK_OFFS_BOOT; + return 0; + case CLOCK_TAI: + *tko = TK_OFFS_TAI; + return 0; + default: + NL_SET_ERR_MSG(extack, "Invalid 'clockid'"); + return -EINVAL; } - gact->param.tcfg_basetime = basetime; - gact->param.tcfg_clockid = clockid; - gact->tk_offset = tko; - hrtimer_setup(&gact->hitimer, gate_timer_func, clockid, HRTIMER_MODE_ABS_SOFT); +} + +static void gate_setup_timer(struct tcf_gate *gact, s32 clockid, + enum tk_offsets tko) +{ + WRITE_ONCE(gact->tk_offset, tko); + hrtimer_setup(&gact->hitimer, gate_timer_func, clockid, + HRTIMER_MODE_ABS_SOFT); } static int tcf_gate_init(struct net *net, struct nlattr *nla, @@ -296,15 +350,22 @@ static int tcf_gate_init(struct net *net, struct nlattr *nla, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, act_gate_ops.net_id); - enum tk_offsets tk_offset = TK_OFFS_TAI; + u64 cycletime = 0, basetime = 0, cycletime_ext = 0; + struct tcf_gate_params *p = NULL, *old_p = NULL; + enum tk_offsets old_tk_offset = TK_OFFS_TAI; + const struct tcf_gate_params *cur_p = NULL; bool bind = flags & TCA_ACT_FLAGS_BIND; struct nlattr *tb[TCA_GATE_MAX + 1]; + enum tk_offsets tko = TK_OFFS_TAI; struct tcf_chain *goto_ch = NULL; - u64 cycletime = 0, basetime = 0; - struct tcf_gate_params *p; + s32 timer_clockid = CLOCK_TAI; + bool use_old_entries = false; + s32 old_clockid = CLOCK_TAI; + bool need_cancel = false; s32 clockid = CLOCK_TAI; struct tcf_gate *gact; struct tc_gate *parm; + u64 old_basetime = 0; int ret = 0, err; u32 gflags = 0; s32 prio = -1; @@ -321,26 +382,8 @@ static int tcf_gate_init(struct net *net, struct nlattr *nla, if (!tb[TCA_GATE_PARMS]) return -EINVAL; - if (tb[TCA_GATE_CLOCKID]) { + if (tb[TCA_GATE_CLOCKID]) clockid = nla_get_s32(tb[TCA_GATE_CLOCKID]); - switch (clockid) { - case CLOCK_REALTIME: - tk_offset = TK_OFFS_REAL; - break; - case CLOCK_MONOTONIC: - tk_offset = TK_OFFS_MAX; - break; - case CLOCK_BOOTTIME: - tk_offset = TK_OFFS_BOOT; - break; - case CLOCK_TAI: - tk_offset = TK_OFFS_TAI; - break; - default: - NL_SET_ERR_MSG(extack, "Invalid 'clockid'"); - return -EINVAL; - } - } parm = nla_data(tb[TCA_GATE_PARMS]); index = parm->index; @@ -366,6 +409,60 @@ static int tcf_gate_init(struct net *net, struct nlattr *nla, return -EEXIST; } + gact = to_gate(*a); + + err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack); + if (err < 0) + goto release_idr; + + p = kzalloc(sizeof(*p), GFP_KERNEL); + if (!p) { + err = -ENOMEM; + goto chain_put; + } + INIT_LIST_HEAD(&p->entries); + + use_old_entries = !tb[TCA_GATE_ENTRY_LIST]; + if (!use_old_entries) { + err = parse_gate_list(tb[TCA_GATE_ENTRY_LIST], p, extack); + if (err < 0) + goto err_free; + use_old_entries = !err; + } + + if (ret == ACT_P_CREATED && use_old_entries) { + NL_SET_ERR_MSG(extack, "The entry list is empty"); + err = -EINVAL; + goto err_free; + } + + if (ret != ACT_P_CREATED) { + rcu_read_lock(); + cur_p = rcu_dereference(gact->param); + + old_basetime = cur_p->tcfg_basetime; + old_clockid = cur_p->tcfg_clockid; + old_tk_offset = READ_ONCE(gact->tk_offset); + + basetime = old_basetime; + cycletime_ext = cur_p->tcfg_cycletime_ext; + prio = cur_p->tcfg_priority; + gflags = cur_p->tcfg_flags; + + if (!tb[TCA_GATE_CLOCKID]) + clockid = old_clockid; + + err = 0; + if (use_old_entries) { + err = tcf_gate_copy_entries(p, cur_p, extack); + if (!err && !tb[TCA_GATE_CYCLE_TIME]) + cycletime = cur_p->tcfg_cycletime; + } + rcu_read_unlock(); + if (err) + goto err_free; + } + if (tb[TCA_GATE_PRIORITY]) prio = nla_get_s32(tb[TCA_GATE_PRIORITY]); @@ -375,25 +472,26 @@ static int tcf_gate_init(struct net *net, struct nlattr *nla, if (tb[TCA_GATE_FLAGS]) gflags = nla_get_u32(tb[TCA_GATE_FLAGS]); - gact = to_gate(*a); - if (ret == ACT_P_CREATED) - INIT_LIST_HEAD(&gact->param.entries); + if (tb[TCA_GATE_CYCLE_TIME]) + cycletime = nla_get_u64(tb[TCA_GATE_CYCLE_TIME]); - err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack); - if (err < 0) - goto release_idr; + if (tb[TCA_GATE_CYCLE_TIME_EXT]) + cycletime_ext = nla_get_u64(tb[TCA_GATE_CYCLE_TIME_EXT]); - spin_lock_bh(&gact->tcf_lock); - p = &gact->param; + err = gate_clock_resolve(clockid, &tko, extack); + if (err) + goto err_free; + timer_clockid = clockid; - if (tb[TCA_GATE_CYCLE_TIME]) - cycletime = nla_get_u64(tb[TCA_GATE_CYCLE_TIME]); + need_cancel = ret != ACT_P_CREATED && + gate_timer_needs_cancel(basetime, old_basetime, + tko, old_tk_offset, + timer_clockid, old_clockid); - if (tb[TCA_GATE_ENTRY_LIST]) { - err = parse_gate_list(tb[TCA_GATE_ENTRY_LIST], p, extack); - if (err < 0) - goto chain_put; - } + if (need_cancel) + hrtimer_cancel(&gact->hitimer); + + spin_lock_bh(&gact->tcf_lock); if (!cycletime) { struct tcfg_gate_entry *entry; @@ -402,22 +500,20 @@ static int tcf_gate_init(struct net *net, struct nlattr *nla, list_for_each_entry(entry, &p->entries, list) cycle = ktime_add_ns(cycle, entry->interval); cycletime = cycle; - if (!cycletime) { - err = -EINVAL; - goto chain_put; - } } p->tcfg_cycletime = cycletime; + p->tcfg_cycletime_ext = cycletime_ext; - if (tb[TCA_GATE_CYCLE_TIME_EXT]) - p->tcfg_cycletime_ext = - nla_get_u64(tb[TCA_GATE_CYCLE_TIME_EXT]); - - gate_setup_timer(gact, basetime, tk_offset, clockid, - ret == ACT_P_CREATED); + if (need_cancel || ret == ACT_P_CREATED) + gate_setup_timer(gact, timer_clockid, tko); p->tcfg_priority = prio; p->tcfg_flags = gflags; - gate_get_start_time(gact, &start); + p->tcfg_basetime = basetime; + p->tcfg_clockid = timer_clockid; + gate_get_start_time(gact, p, &start); + + old_p = rcu_replace_pointer(gact->param, p, + lockdep_is_held(&gact->tcf_lock)); gact->current_close_time = start; gact->current_gate_status = GATE_ACT_GATE_OPEN | GATE_ACT_PENDING; @@ -434,11 +530,15 @@ static int tcf_gate_init(struct net *net, struct nlattr *nla, if (goto_ch) tcf_chain_put_by_act(goto_ch); + if (old_p) + call_rcu(&old_p->rcu, tcf_gate_params_free_rcu); + return ret; +err_free: + release_entry_list(&p->entries); + kfree(p); chain_put: - spin_unlock_bh(&gact->tcf_lock); - if (goto_ch) tcf_chain_put_by_act(goto_ch); release_idr: @@ -446,21 +546,29 @@ release_idr: * without taking tcf_lock. */ if (ret == ACT_P_CREATED) - gate_setup_timer(gact, gact->param.tcfg_basetime, - gact->tk_offset, gact->param.tcfg_clockid, - true); + gate_setup_timer(gact, timer_clockid, tko); + tcf_idr_release(*a, bind); return err; } +static void tcf_gate_params_free_rcu(struct rcu_head *head) +{ + struct tcf_gate_params *p = container_of(head, struct tcf_gate_params, rcu); + + release_entry_list(&p->entries); + kfree(p); +} + static void tcf_gate_cleanup(struct tc_action *a) { struct tcf_gate *gact = to_gate(a); struct tcf_gate_params *p; - p = &gact->param; hrtimer_cancel(&gact->hitimer); - release_entry_list(&p->entries); + p = rcu_dereference_protected(gact->param, 1); + if (p) + call_rcu(&p->rcu, tcf_gate_params_free_rcu); } static int dumping_entry(struct sk_buff *skb, @@ -509,10 +617,9 @@ static int tcf_gate_dump(struct sk_buff *skb, struct tc_action *a, struct nlattr *entry_list; struct tcf_t t; - spin_lock_bh(&gact->tcf_lock); - opt.action = gact->tcf_action; - - p = &gact->param; + rcu_read_lock(); + opt.action = READ_ONCE(gact->tcf_action); + p = rcu_dereference(gact->param); if (nla_put(skb, TCA_GATE_PARMS, sizeof(opt), &opt)) goto nla_put_failure; @@ -552,12 +659,12 @@ static int tcf_gate_dump(struct sk_buff *skb, struct tc_action *a, tcf_tm_dump(&t, &gact->tcf_tm); if (nla_put_64bit(skb, TCA_GATE_TM, sizeof(t), &t, TCA_GATE_PAD)) goto nla_put_failure; - spin_unlock_bh(&gact->tcf_lock); + rcu_read_unlock(); return skb->len; nla_put_failure: - spin_unlock_bh(&gact->tcf_lock); + rcu_read_unlock(); nlmsg_trim(skb, b); return -1; } -- cgit v1.2.3 From 29252397bcc1e0a1f85e5c3bee59c325f5c26341 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 25 Feb 2026 20:35:45 +0000 Subject: inet: annotate data-races around isk->inet_num UDP/TCP lookups are using RCU, thus isk->inet_num accesses should use READ_ONCE() and WRITE_ONCE() where needed. Fixes: 3ab5aee7fe84 ("net: Convert TCP & DCCP hash tables to use RCU / hlist_nulls") Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20260225203545.1512417-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/inet_hashtables.c | 8 ++++---- net/ipv4/tcp_diag.c | 2 +- net/ipv6/inet6_hashtables.c | 3 ++- 3 files changed, 7 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index fca980772c81..9bfccc283fa6 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -200,7 +200,7 @@ static bool inet_bind2_bucket_addr_match(const struct inet_bind2_bucket *tb2, void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb, struct inet_bind2_bucket *tb2, unsigned short port) { - inet_sk(sk)->inet_num = port; + WRITE_ONCE(inet_sk(sk)->inet_num, port); inet_csk(sk)->icsk_bind_hash = tb; inet_csk(sk)->icsk_bind2_hash = tb2; sk_add_bind_node(sk, &tb2->owners); @@ -224,7 +224,7 @@ static void __inet_put_port(struct sock *sk) spin_lock(&head->lock); tb = inet_csk(sk)->icsk_bind_hash; inet_csk(sk)->icsk_bind_hash = NULL; - inet_sk(sk)->inet_num = 0; + WRITE_ONCE(inet_sk(sk)->inet_num, 0); sk->sk_userlocks &= ~SOCK_CONNECT_BIND; spin_lock(&head2->lock); @@ -352,7 +352,7 @@ static inline int compute_score(struct sock *sk, const struct net *net, { int score = -1; - if (net_eq(sock_net(sk), net) && sk->sk_num == hnum && + if (net_eq(sock_net(sk), net) && READ_ONCE(sk->sk_num) == hnum && !ipv6_only_sock(sk)) { if (sk->sk_rcv_saddr != daddr) return -1; @@ -1206,7 +1206,7 @@ error: sk->sk_hash = 0; inet_sk(sk)->inet_sport = 0; - inet_sk(sk)->inet_num = 0; + WRITE_ONCE(inet_sk(sk)->inet_num, 0); if (tw) inet_twsk_bind_unhash(tw, hinfo); diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c index d83efd91f461..7935702e394b 100644 --- a/net/ipv4/tcp_diag.c +++ b/net/ipv4/tcp_diag.c @@ -509,7 +509,7 @@ next_chunk: if (r->sdiag_family != AF_UNSPEC && sk->sk_family != r->sdiag_family) goto next_normal; - if (r->id.idiag_sport != htons(sk->sk_num) && + if (r->id.idiag_sport != htons(READ_ONCE(sk->sk_num)) && r->id.idiag_sport) goto next_normal; if (r->id.idiag_dport != sk->sk_dport && diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index 5e1da088d8e1..182d38e6d6d8 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -95,7 +95,8 @@ static inline int compute_score(struct sock *sk, const struct net *net, { int score = -1; - if (net_eq(sock_net(sk), net) && inet_sk(sk)->inet_num == hnum && + if (net_eq(sock_net(sk), net) && + READ_ONCE(inet_sk(sk)->inet_num) == hnum && sk->sk_family == PF_INET6) { if (!ipv6_addr_equal(&sk->sk_v6_rcv_saddr, daddr)) return -1; -- cgit v1.2.3 From 0b3cd139be565b85f4a3579e376152b9def6256a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jonas=20K=C3=B6ppeler?= Date: Thu, 26 Feb 2026 12:40:15 +0100 Subject: net/sched: sch_cake: avoid sync overhead when unlimited MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Skip inter-instance sync when no rate limit is configured, as it serves no purpose and only adds overhead. Fixes: 1bddd758bac2 ("net/sched: sch_cake: share shaper state across sub-instances of cake_mq") Signed-off-by: Jonas Köppeler Acked-by: Toke Høiland-Jørgensen Link: https://patch.msgid.link/20260226-cake-mq-skip-sync-bandwidth-unlimited-v1-1-01830bb4db87@tu-berlin.de Signed-off-by: Jakub Kicinski --- net/sched/sch_cake.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c index a01f14b1c216..b23fc7d9fafe 100644 --- a/net/sched/sch_cake.c +++ b/net/sched/sch_cake.c @@ -2013,7 +2013,8 @@ static struct sk_buff *cake_dequeue(struct Qdisc *sch) u64 delay; u32 len; - if (q->config->is_shared && now - q->last_checked_active >= q->config->sync_time) { + if (q->config->is_shared && q->rate_ns && + now - q->last_checked_active >= q->config->sync_time) { struct net_device *dev = qdisc_dev(sch); struct cake_sched_data *other_priv; u64 new_rate = q->config->rate_bps; -- cgit v1.2.3 From 15c2715a52645fd8e6e18b7abc3449292d118c7c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jonas=20K=C3=B6ppeler?= Date: Thu, 26 Feb 2026 12:40:16 +0100 Subject: net/sched: sch_cake: fixup cake_mq rate adjustment for diffserv config MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cake_mq's rate adjustment during the sync periods did not adjust the rates for every tin in a diffserv config. This lead to inconsistencies of rates between the tins. Fix this by setting the rates for all tins during synchronization. Fixes: 1bddd758bac2 ("net/sched: sch_cake: share shaper state across sub-instances of cake_mq") Signed-off-by: Jonas Köppeler Acked-by: Toke Høiland-Jørgensen Link: https://patch.msgid.link/20260226-cake-mq-skip-sync-bandwidth-unlimited-v1-2-01830bb4db87@tu-berlin.de Signed-off-by: Jakub Kicinski --- net/sched/sch_cake.c | 50 +++++++++++++++++++++++--------------------------- 1 file changed, 23 insertions(+), 27 deletions(-) (limited to 'net') diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c index b23fc7d9fafe..9efe23f8371b 100644 --- a/net/sched/sch_cake.c +++ b/net/sched/sch_cake.c @@ -391,8 +391,8 @@ static const u32 inv_sqrt_cache[REC_INV_SQRT_CACHE] = { 1239850263, 1191209601, 1147878294, 1108955788 }; -static void cake_set_rate(struct cake_tin_data *b, u64 rate, u32 mtu, - u64 target_ns, u64 rtt_est_ns); +static void cake_configure_rates(struct Qdisc *sch, u64 rate, bool rate_adjust); + /* http://en.wikipedia.org/wiki/Methods_of_computing_square_roots * new_invsqrt = (invsqrt / 2) * (3 - count * invsqrt^2) * @@ -2040,12 +2040,9 @@ static struct sk_buff *cake_dequeue(struct Qdisc *sch) if (num_active_qs > 1) new_rate = div64_u64(q->config->rate_bps, num_active_qs); - /* mtu = 0 is used to only update the rate and not mess with cobalt params */ - cake_set_rate(b, new_rate, 0, 0, 0); + cake_configure_rates(sch, new_rate, true); q->last_checked_active = now; q->active_queues = num_active_qs; - q->rate_ns = b->tin_rate_ns; - q->rate_shft = b->tin_rate_shft; } begin: @@ -2362,12 +2359,10 @@ static void cake_set_rate(struct cake_tin_data *b, u64 rate, u32 mtu, b->cparams.p_dec = 1 << 20; /* 1/4096 */ } -static int cake_config_besteffort(struct Qdisc *sch) +static int cake_config_besteffort(struct Qdisc *sch, u64 rate, u32 mtu) { struct cake_sched_data *q = qdisc_priv(sch); struct cake_tin_data *b = &q->tins[0]; - u32 mtu = psched_mtu(qdisc_dev(sch)); - u64 rate = q->config->rate_bps; q->tin_cnt = 1; @@ -2381,12 +2376,10 @@ static int cake_config_besteffort(struct Qdisc *sch) return 0; } -static int cake_config_precedence(struct Qdisc *sch) +static int cake_config_precedence(struct Qdisc *sch, u64 rate, u32 mtu) { /* convert high-level (user visible) parameters into internal format */ struct cake_sched_data *q = qdisc_priv(sch); - u32 mtu = psched_mtu(qdisc_dev(sch)); - u64 rate = q->config->rate_bps; u32 quantum = 256; u32 i; @@ -2457,7 +2450,7 @@ static int cake_config_precedence(struct Qdisc *sch) * Total 12 traffic classes. */ -static int cake_config_diffserv8(struct Qdisc *sch) +static int cake_config_diffserv8(struct Qdisc *sch, u64 rate, u32 mtu) { /* Pruned list of traffic classes for typical applications: * @@ -2474,8 +2467,6 @@ static int cake_config_diffserv8(struct Qdisc *sch) */ struct cake_sched_data *q = qdisc_priv(sch); - u32 mtu = psched_mtu(qdisc_dev(sch)); - u64 rate = q->config->rate_bps; u32 quantum = 256; u32 i; @@ -2505,7 +2496,7 @@ static int cake_config_diffserv8(struct Qdisc *sch) return 0; } -static int cake_config_diffserv4(struct Qdisc *sch) +static int cake_config_diffserv4(struct Qdisc *sch, u64 rate, u32 mtu) { /* Further pruned list of traffic classes for four-class system: * @@ -2518,8 +2509,6 @@ static int cake_config_diffserv4(struct Qdisc *sch) */ struct cake_sched_data *q = qdisc_priv(sch); - u32 mtu = psched_mtu(qdisc_dev(sch)); - u64 rate = q->config->rate_bps; u32 quantum = 1024; q->tin_cnt = 4; @@ -2547,7 +2536,7 @@ static int cake_config_diffserv4(struct Qdisc *sch) return 0; } -static int cake_config_diffserv3(struct Qdisc *sch) +static int cake_config_diffserv3(struct Qdisc *sch, u64 rate, u32 mtu) { /* Simplified Diffserv structure with 3 tins. * Latency Sensitive (CS7, CS6, EF, VA, TOS4) @@ -2555,8 +2544,6 @@ static int cake_config_diffserv3(struct Qdisc *sch) * Low Priority (LE, CS1) */ struct cake_sched_data *q = qdisc_priv(sch); - u32 mtu = psched_mtu(qdisc_dev(sch)); - u64 rate = q->config->rate_bps; u32 quantum = 1024; q->tin_cnt = 3; @@ -2581,32 +2568,33 @@ static int cake_config_diffserv3(struct Qdisc *sch) return 0; } -static void cake_reconfigure(struct Qdisc *sch) +static void cake_configure_rates(struct Qdisc *sch, u64 rate, bool rate_adjust) { + u32 mtu = likely(rate_adjust) ? 0 : psched_mtu(qdisc_dev(sch)); struct cake_sched_data *qd = qdisc_priv(sch); struct cake_sched_config *q = qd->config; int c, ft; switch (q->tin_mode) { case CAKE_DIFFSERV_BESTEFFORT: - ft = cake_config_besteffort(sch); + ft = cake_config_besteffort(sch, rate, mtu); break; case CAKE_DIFFSERV_PRECEDENCE: - ft = cake_config_precedence(sch); + ft = cake_config_precedence(sch, rate, mtu); break; case CAKE_DIFFSERV_DIFFSERV8: - ft = cake_config_diffserv8(sch); + ft = cake_config_diffserv8(sch, rate, mtu); break; case CAKE_DIFFSERV_DIFFSERV4: - ft = cake_config_diffserv4(sch); + ft = cake_config_diffserv4(sch, rate, mtu); break; case CAKE_DIFFSERV_DIFFSERV3: default: - ft = cake_config_diffserv3(sch); + ft = cake_config_diffserv3(sch, rate, mtu); break; } @@ -2617,6 +2605,14 @@ static void cake_reconfigure(struct Qdisc *sch) qd->rate_ns = qd->tins[ft].tin_rate_ns; qd->rate_shft = qd->tins[ft].tin_rate_shft; +} + +static void cake_reconfigure(struct Qdisc *sch) +{ + struct cake_sched_data *qd = qdisc_priv(sch); + struct cake_sched_config *q = qd->config; + + cake_configure_rates(sch, qd->config->rate_bps, false); if (q->buffer_config_limit) { qd->buffer_limit = q->buffer_config_limit; -- cgit v1.2.3 From 11cb63b0d1a0685e0831ae3c77223e002ef18189 Mon Sep 17 00:00:00 2001 From: Victor Nogueira Date: Wed, 25 Feb 2026 10:43:48 -0300 Subject: net/sched: Only allow act_ct to bind to clsact/ingress qdiscs and shared blocks As Paolo said earlier [1]: "Since the blamed commit below, classify can return TC_ACT_CONSUMED while the current skb being held by the defragmentation engine. As reported by GangMin Kim, if such packet is that may cause a UaF when the defrag engine later on tries to tuch again such packet." act_ct was never meant to be used in the egress path, however some users are attaching it to egress today [2]. Attempting to reach a middle ground, we noticed that, while most qdiscs are not handling TC_ACT_CONSUMED, clsact/ingress qdiscs are. With that in mind, we address the issue by only allowing act_ct to bind to clsact/ingress qdiscs and shared blocks. That way it's still possible to attach act_ct to egress (albeit only with clsact). [1] https://lore.kernel.org/netdev/674b8cbfc385c6f37fb29a1de08d8fe5c2b0fbee.1771321118.git.pabeni@redhat.com/ [2] https://lore.kernel.org/netdev/cc6bfb4a-4a2b-42d8-b9ce-7ef6644fb22b@ovn.org/ Reported-by: GangMin Kim Fixes: 3f14b377d01d ("net/sched: act_ct: fix skb leak and crash on ooo frags") CC: stable@vger.kernel.org Signed-off-by: Victor Nogueira Acked-by: Jamal Hadi Salim Link: https://patch.msgid.link/20260225134349.1287037-1-victor@mojatatu.com Signed-off-by: Jakub Kicinski --- net/sched/act_ct.c | 6 ++++++ net/sched/cls_api.c | 7 +++++++ 2 files changed, 13 insertions(+) (limited to 'net') diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c index bd51522c7953..7d5e50c921a0 100644 --- a/net/sched/act_ct.c +++ b/net/sched/act_ct.c @@ -1360,6 +1360,12 @@ static int tcf_ct_init(struct net *net, struct nlattr *nla, return -EINVAL; } + if (bind && !(flags & TCA_ACT_FLAGS_AT_INGRESS_OR_CLSACT)) { + NL_SET_ERR_MSG_MOD(extack, + "Attaching ct to a non ingress/clsact qdisc is unsupported"); + return -EOPNOTSUPP; + } + err = nla_parse_nested(tb, TCA_CT_MAX, nla, ct_policy, extack); if (err < 0) return err; diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 343309e9009b..4829c27446e3 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -2228,6 +2228,11 @@ static bool is_qdisc_ingress(__u32 classid) return (TC_H_MIN(classid) == TC_H_MIN(TC_H_MIN_INGRESS)); } +static bool is_ingress_or_clsact(struct tcf_block *block, struct Qdisc *q) +{ + return tcf_block_shared(block) || (q && !!(q->flags & TCQ_F_INGRESS)); +} + static int tc_new_tfilter(struct sk_buff *skb, struct nlmsghdr *n, struct netlink_ext_ack *extack) { @@ -2420,6 +2425,8 @@ replay: flags |= TCA_ACT_FLAGS_NO_RTNL; if (is_qdisc_ingress(parent)) flags |= TCA_ACT_FLAGS_AT_INGRESS; + if (is_ingress_or_clsact(block, q)) + flags |= TCA_ACT_FLAGS_AT_INGRESS_OR_CLSACT; err = tp->ops->change(net, skb, tp, cl, t->tcm_handle, tca, &fh, flags, extack); if (err == 0) { -- cgit v1.2.3 From 6996a2d2d0a64808c19c98002aeb5d9d1b2df6a4 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 27 Feb 2026 03:55:35 +0000 Subject: udp: Unhash auto-bound connected sk from 4-tuple hash table when disconnected. Let's say we bind() an UDP socket to the wildcard address with a non-zero port, connect() it to an address, and disconnect it from the address. bind() sets SOCK_BINDPORT_LOCK on sk->sk_userlocks (but not SOCK_BINDADDR_LOCK), and connect() calls udp_lib_hash4() to put the socket into the 4-tuple hash table. Then, __udp_disconnect() calls sk->sk_prot->rehash(sk). It computes a new hash based on the wildcard address and moves the socket to a new slot in the 4-tuple hash table, leaving a garbage in the chain that no packet hits. Let's remove such a socket from 4-tuple hash table when disconnected. Note that udp_sk(sk)->udp_portaddr_hash needs to be udpated after udp_hash4_dec(hslot2) in udp_unhash4(). Fixes: 78c91ae2c6de ("ipv4/udp: Add 4-tuple hash for connected socket") Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20260227035547.3321327-1-kuniyu@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/udp.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 014fdfdd331b..b60fad393e18 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -2287,7 +2287,6 @@ void udp_lib_rehash(struct sock *sk, u16 newhash, u16 newhash4) udp_sk(sk)->udp_port_hash); hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash); nhslot2 = udp_hashslot2(udptable, newhash); - udp_sk(sk)->udp_portaddr_hash = newhash; if (hslot2 != nhslot2 || rcu_access_pointer(sk->sk_reuseport_cb)) { @@ -2321,19 +2320,25 @@ void udp_lib_rehash(struct sock *sk, u16 newhash, u16 newhash4) if (udp_hashed4(sk)) { spin_lock_bh(&hslot->lock); - udp_rehash4(udptable, sk, newhash4); - if (hslot2 != nhslot2) { - spin_lock(&hslot2->lock); - udp_hash4_dec(hslot2); - spin_unlock(&hslot2->lock); - - spin_lock(&nhslot2->lock); - udp_hash4_inc(nhslot2); - spin_unlock(&nhslot2->lock); + if (inet_rcv_saddr_any(sk)) { + udp_unhash4(udptable, sk); + } else { + udp_rehash4(udptable, sk, newhash4); + if (hslot2 != nhslot2) { + spin_lock(&hslot2->lock); + udp_hash4_dec(hslot2); + spin_unlock(&hslot2->lock); + + spin_lock(&nhslot2->lock); + udp_hash4_inc(nhslot2); + spin_unlock(&nhslot2->lock); + } } spin_unlock_bh(&hslot->lock); } + + udp_sk(sk)->udp_portaddr_hash = newhash; } } EXPORT_IPV6_MOD(udp_lib_rehash); -- cgit v1.2.3 From 026dfef287c07f37d4d4eef7a0b5a4bfdb29b32d Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 26 Feb 2026 16:33:59 -0800 Subject: tcp: give up on stronger sk_rcvbuf checks (for now) We hit another corner case which leads to TcpExtTCPRcvQDrop Connections which send RPCs in the 20-80kB range over loopback experience spurious drops. The exact conditions for most of the drops I investigated are that: - socket exchanged >1MB of data so its not completely fresh - rcvbuf is around 128kB (default, hasn't grown) - there is ~60kB of data in rcvq - skb > 64kB arrives The sum of skb->len (!) of both of the skbs (the one already in rcvq and the arriving one) is larger than rwnd. My suspicion is that this happens because __tcp_select_window() rounds the rwnd up to (1 << wscale) if less than half of the rwnd has been consumed. Eric suggests that given the number of Fixes we already have pointing to 1d2fbaad7cd8 it's probably time to give up on it, until a bigger revamp of rmem management. Also while we could risk tweaking the rwnd math, there are other drops on workloads I investigated, after the commit in question, not explained by this phenomenon. Suggested-by: Eric Dumazet Link: https://lore.kernel.org/20260225122355.585fd57b@kernel.org Fixes: 1d2fbaad7cd8 ("tcp: stronger sk_rcvbuf checks") Reviewed-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20260227003359.2391017-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/ipv4/tcp_input.c | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 6404e53382ca..7b03f2460751 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5374,25 +5374,11 @@ static void tcp_ofo_queue(struct sock *sk) static bool tcp_prune_ofo_queue(struct sock *sk, const struct sk_buff *in_skb); static int tcp_prune_queue(struct sock *sk, const struct sk_buff *in_skb); -/* Check if this incoming skb can be added to socket receive queues - * while satisfying sk->sk_rcvbuf limit. - * - * In theory we should use skb->truesize, but this can cause problems - * when applications use too small SO_RCVBUF values. - * When LRO / hw gro is used, the socket might have a high tp->scaling_ratio, - * allowing RWIN to be close to available space. - * Whenever the receive queue gets full, we can receive a small packet - * filling RWIN, but with a high skb->truesize, because most NIC use 4K page - * plus sk_buff metadata even when receiving less than 1500 bytes of payload. - * - * Note that we use skb->len to decide to accept or drop this packet, - * but sk->sk_rmem_alloc is the sum of all skb->truesize. - */ static bool tcp_can_ingest(const struct sock *sk, const struct sk_buff *skb) { unsigned int rmem = atomic_read(&sk->sk_rmem_alloc); - return rmem + skb->len <= sk->sk_rcvbuf; + return rmem <= sk->sk_rcvbuf; } static int tcp_try_rmem_schedule(struct sock *sk, const struct sk_buff *skb, -- cgit v1.2.3 From 60abb0ac11dccd6b98fd9182bc5f85b621688861 Mon Sep 17 00:00:00 2001 From: "Nikhil P. Rao" Date: Wed, 25 Feb 2026 00:00:26 +0000 Subject: xsk: Fix fragment node deletion to prevent buffer leak After commit b692bf9a7543 ("xsk: Get rid of xdp_buff_xsk::xskb_list_node"), the list_node field is reused for both the xskb pool list and the buffer free list, this causes a buffer leak as described below. xp_free() checks if a buffer is already on the free list using list_empty(&xskb->list_node). When list_del() is used to remove a node from the xskb pool list, it doesn't reinitialize the node pointers. This means list_empty() will return false even after the node has been removed, causing xp_free() to incorrectly skip adding the buffer to the free list. Fix this by using list_del_init() instead of list_del() in all fragment handling paths, this ensures the list node is reinitialized after removal, allowing the list_empty() to work correctly. Fixes: b692bf9a7543 ("xsk: Get rid of xdp_buff_xsk::xskb_list_node") Acked-by: Maciej Fijalkowski Signed-off-by: Nikhil P. Rao Link: https://patch.msgid.link/20260225000456.107806-2-nikhil.rao@amd.com Signed-off-by: Jakub Kicinski --- net/xdp/xsk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 3b46bc635c43..be882a8d473c 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -186,7 +186,7 @@ static int xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) err = __xsk_rcv_zc(xs, pos, len, contd); if (err) goto err; - list_del(&pos->list_node); + list_del_init(&pos->list_node); } return 0; -- cgit v1.2.3 From f7387d6579d65efd490a864254101cb665f2e7a7 Mon Sep 17 00:00:00 2001 From: "Nikhil P. Rao" Date: Wed, 25 Feb 2026 00:00:27 +0000 Subject: xsk: Fix zero-copy AF_XDP fragment drop AF_XDP should ensure that only a complete packet is sent to application. In the zero-copy case, if the Rx queue gets full as fragments are being enqueued, the remaining fragments are dropped. For the multi-buffer case, add a check to ensure that the Rx queue has enough space for all fragments of a packet before starting to enqueue them. Fixes: 24ea50127ecf ("xsk: support mbuf on ZC RX") Signed-off-by: Nikhil P. Rao Link: https://patch.msgid.link/20260225000456.107806-3-nikhil.rao@amd.com Signed-off-by: Jakub Kicinski --- net/xdp/xsk.c | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index be882a8d473c..6149f6a79897 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -167,25 +167,31 @@ static int xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) struct xdp_buff_xsk *pos, *tmp; struct list_head *xskb_list; u32 contd = 0; + u32 num_desc; int err; - if (frags) - contd = XDP_PKT_CONTD; + if (likely(!frags)) { + err = __xsk_rcv_zc(xs, xskb, len, contd); + if (err) + goto err; + return 0; + } - err = __xsk_rcv_zc(xs, xskb, len, contd); - if (err) + contd = XDP_PKT_CONTD; + num_desc = xdp_get_shared_info_from_buff(xdp)->nr_frags + 1; + if (xskq_prod_nb_free(xs->rx, num_desc) < num_desc) { + xs->rx_queue_full++; + err = -ENOBUFS; goto err; - if (likely(!frags)) - return 0; + } + __xsk_rcv_zc(xs, xskb, len, contd); xskb_list = &xskb->pool->xskb_list; list_for_each_entry_safe(pos, tmp, xskb_list, list_node) { if (list_is_singular(xskb_list)) contd = 0; len = pos->xdp.data_end - pos->xdp.data; - err = __xsk_rcv_zc(xs, pos, len, contd); - if (err) - goto err; + __xsk_rcv_zc(xs, pos, len, contd); list_del_init(&pos->list_node); } -- cgit v1.2.3 From 101bacb303e89dc2e0640ae6a5e0fb97c4eb45bb Mon Sep 17 00:00:00 2001 From: Jiayuan Chen Date: Wed, 25 Feb 2026 20:32:40 +0800 Subject: atm: lec: fix null-ptr-deref in lec_arp_clear_vccs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit syzkaller reported a null-ptr-deref in lec_arp_clear_vccs(). This issue can be easily reproduced using the syzkaller reproducer. In the ATM LANE (LAN Emulation) module, the same atm_vcc can be shared by multiple lec_arp_table entries (e.g., via entry->vcc or entry->recv_vcc). When the underlying VCC is closed, lec_vcc_close() iterates over all ARP entries and calls lec_arp_clear_vccs() for each matched entry. For example, when lec_vcc_close() iterates through the hlists in priv->lec_arp_empty_ones or other ARP tables: 1. In the first iteration, for the first matched ARP entry sharing the VCC, lec_arp_clear_vccs() frees the associated vpriv (which is vcc->user_back) and sets vcc->user_back to NULL. 2. In the second iteration, for the next matched ARP entry sharing the same VCC, lec_arp_clear_vccs() is called again. It obtains a NULL vpriv from vcc->user_back (via LEC_VCC_PRIV(vcc)) and then attempts to dereference it via `vcc->pop = vpriv->old_pop`, leading to a null-ptr-deref crash. Fix this by adding a null check for vpriv before dereferencing it. If vpriv is already NULL, it means the VCC has been cleared by a previous call, so we can safely skip the cleanup and just clear the entry's vcc/recv_vcc pointers. The entire cleanup block (including vcc_release_async()) is placed inside the vpriv guard because a NULL vpriv indicates the VCC has already been fully released by a prior iteration — repeating the teardown would redundantly set flags and trigger callbacks on an already-closing socket. The Fixes tag points to the initial commit because the entry->vcc path has been vulnerable since the original code. The entry->recv_vcc path was later added by commit 8d9f73c0ad2f ("atm: fix a memory leak of vcc->user_back") with the same pattern, and both paths are fixed here. Reported-by: syzbot+72e3ea390c305de0e259@syzkaller.appspotmail.com Closes: https://lore.kernel.org/all/68c95a83.050a0220.3c6139.0e5c.GAE@google.com/T/ Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Suggested-by: Dan Carpenter Reviewed-by: Simon Horman Signed-off-by: Jiayuan Chen Link: https://patch.msgid.link/20260225123250.189289-1-jiayuan.chen@linux.dev Signed-off-by: Jakub Kicinski --- net/atm/lec.c | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/atm/lec.c b/net/atm/lec.c index a107375c0629..fb93c6e1c329 100644 --- a/net/atm/lec.c +++ b/net/atm/lec.c @@ -1260,24 +1260,28 @@ static void lec_arp_clear_vccs(struct lec_arp_table *entry) struct lec_vcc_priv *vpriv = LEC_VCC_PRIV(vcc); struct net_device *dev = (struct net_device *)vcc->proto_data; - vcc->pop = vpriv->old_pop; - if (vpriv->xoff) - netif_wake_queue(dev); - kfree(vpriv); - vcc->user_back = NULL; - vcc->push = entry->old_push; - vcc_release_async(vcc, -EPIPE); + if (vpriv) { + vcc->pop = vpriv->old_pop; + if (vpriv->xoff) + netif_wake_queue(dev); + kfree(vpriv); + vcc->user_back = NULL; + vcc->push = entry->old_push; + vcc_release_async(vcc, -EPIPE); + } entry->vcc = NULL; } if (entry->recv_vcc) { struct atm_vcc *vcc = entry->recv_vcc; struct lec_vcc_priv *vpriv = LEC_VCC_PRIV(vcc); - kfree(vpriv); - vcc->user_back = NULL; + if (vpriv) { + kfree(vpriv); + vcc->user_back = NULL; - entry->recv_vcc->push = entry->old_recv_push; - vcc_release_async(entry->recv_vcc, -EPIPE); + entry->recv_vcc->push = entry->old_recv_push; + vcc_release_async(entry->recv_vcc, -EPIPE); + } entry->recv_vcc = NULL; } } -- cgit v1.2.3 From c35636e91e392e1540949bbc67932167cb48bc3a Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Wed, 18 Feb 2026 11:58:06 +0100 Subject: can: bcm: fix locking for bcm_op runtime updates Commit c2aba69d0c36 ("can: bcm: add locking for bcm_op runtime updates") added a locking for some variables that can be modified at runtime when updating the sending bcm_op with a new TX_SETUP command in bcm_tx_setup(). Usually the RX_SETUP only handles and filters incoming traffic with one exception: When the RX_RTR_FRAME flag is set a predefined CAN frame is sent when a specific RTR frame is received. Therefore the rx bcm_op uses bcm_can_tx() which uses the bcm_tx_lock that was only initialized in bcm_tx_setup(). Add the missing spin_lock_init() when allocating the bcm_op in bcm_rx_setup() to handle the RTR case properly. Fixes: c2aba69d0c36 ("can: bcm: add locking for bcm_op runtime updates") Reported-by: syzbot+5b11eccc403dd1cea9f8@syzkaller.appspotmail.com Closes: https://lore.kernel.org/linux-can/699466e4.a70a0220.2c38d7.00ff.GAE@google.com/ Signed-off-by: Oliver Hartkopp Link: https://patch.msgid.link/20260218-bcm_spin_lock_init-v1-1-592634c8a5b5@hartkopp.net Signed-off-by: Marc Kleine-Budde --- net/can/bcm.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/can/bcm.c b/net/can/bcm.c index b7324e9c955b..fd9fa072881e 100644 --- a/net/can/bcm.c +++ b/net/can/bcm.c @@ -1176,6 +1176,7 @@ static int bcm_rx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg, if (!op) return -ENOMEM; + spin_lock_init(&op->bcm_tx_lock); op->can_id = msg_head->can_id; op->nframes = msg_head->nframes; op->cfsiz = CFSIZ(msg_head->flags); -- cgit v1.2.3 From 8fb54c7307f6add04246d2f7845e42ecd41950fe Mon Sep 17 00:00:00 2001 From: MeiChia Chiu Date: Tue, 3 Mar 2026 13:47:25 +0800 Subject: wifi: mac80211: fix missing ieee80211_eml_params member initialization The missing initialization causes driver to misinterpret the EML control bitmap, resulting in incorrect link bitmap handling. Fixes: 0d95280a2d54e ("wifi: mac80211: Add eMLSR/eMLMR action frame parsing support") Signed-off-by: MeiChia Chiu Acked-by: Lorenzo Bianconi Link: https://patch.msgid.link/20260303054725.471548-1-MeiChia.Chiu@mediatek.com Signed-off-by: Johannes Berg --- net/mac80211/eht.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/mac80211/eht.c b/net/mac80211/eht.c index 75096b2195d2..078e1e23d8d1 100644 --- a/net/mac80211/eht.c +++ b/net/mac80211/eht.c @@ -154,6 +154,7 @@ void ieee80211_rx_eml_op_mode_notif(struct ieee80211_sub_if_data *sdata, u8 *ptr = mgmt->u.action.u.eml_omn.variable; struct ieee80211_eml_params eml_params = { .link_id = status->link_id, + .control = control, }; struct sta_info *sta; int opt_len = 0; -- cgit v1.2.3 From 6a877ececd6daa002a9a0002cd0fbca6592a9244 Mon Sep 17 00:00:00 2001 From: Allison Henderson Date: Fri, 27 Feb 2026 13:23:36 -0700 Subject: net/rds: Fix circular locking dependency in rds_tcp_tune syzbot reported a circular locking dependency in rds_tcp_tune() where sk_net_refcnt_upgrade() is called while holding the socket lock: ====================================================== WARNING: possible circular locking dependency detected ====================================================== kworker/u10:8/15040 is trying to acquire lock: ffffffff8e9aaf80 (fs_reclaim){+.+.}-{0:0}, at: __kmalloc_cache_noprof+0x4b/0x6f0 but task is already holding lock: ffff88805a3c1ce0 (k-sk_lock-AF_INET6){+.+.}-{0:0}, at: rds_tcp_tune+0xd7/0x930 The issue occurs because sk_net_refcnt_upgrade() performs memory allocation (via get_net_track() -> ref_tracker_alloc()) while the socket lock is held, creating a circular dependency with fs_reclaim. Fix this by moving sk_net_refcnt_upgrade() outside the socket lock critical section. This is safe because the fields modified by the sk_net_refcnt_upgrade() call (sk_net_refcnt, ns_tracker) are not accessed by any concurrent code path at this point. v2: - Corrected fixes tag - check patch line wrap nits - ai commentary nits Reported-by: syzbot+2e2cf5331207053b8106@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=2e2cf5331207053b8106 Fixes: 3a58f13a881e ("net: rds: acquire refcount on TCP sockets") Signed-off-by: Allison Henderson Link: https://patch.msgid.link/20260227202336.167757-1-achender@kernel.org Signed-off-by: Paolo Abeni --- net/rds/tcp.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/rds/tcp.c b/net/rds/tcp.c index 04f310255692..654e23d13e3d 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -490,18 +490,24 @@ bool rds_tcp_tune(struct socket *sock) struct rds_tcp_net *rtn; tcp_sock_set_nodelay(sock->sk); - lock_sock(sk); /* TCP timer functions might access net namespace even after * a process which created this net namespace terminated. */ if (!sk->sk_net_refcnt) { - if (!maybe_get_net(net)) { - release_sock(sk); + if (!maybe_get_net(net)) return false; - } + /* + * sk_net_refcnt_upgrade() must be called before lock_sock() + * because it does a GFP_KERNEL allocation, which can trigger + * fs_reclaim and create a circular lock dependency with the + * socket lock. The fields it modifies (sk_net_refcnt, + * ns_tracker) are not accessed by any concurrent code path + * at this point. + */ sk_net_refcnt_upgrade(sk); put_net(net); } + lock_sock(sk); rtn = net_generic(net, rds_tcp_netid); if (rtn->sndbuf_size > 0) { sk->sk_sndbuf = rtn->sndbuf_size; -- cgit v1.2.3 From 1a86a1f7d88996085934139fa4c063b6299a2dd3 Mon Sep 17 00:00:00 2001 From: YiFei Zhu Date: Fri, 27 Feb 2026 22:19:37 +0000 Subject: net: Fix rcu_tasks stall in threaded busypoll I was debugging a NIC driver when I noticed that when I enable threaded busypoll, bpftrace hangs when starting up. dmesg showed: rcu_tasks_wait_gp: rcu_tasks grace period number 85 (since boot) is 10658 jiffies old. rcu_tasks_wait_gp: rcu_tasks grace period number 85 (since boot) is 40793 jiffies old. rcu_tasks_wait_gp: rcu_tasks grace period number 85 (since boot) is 131273 jiffies old. rcu_tasks_wait_gp: rcu_tasks grace period number 85 (since boot) is 402058 jiffies old. INFO: rcu_tasks detected stalls on tasks: 00000000769f52cd: .N nvcsw: 2/2 holdout: 1 idle_cpu: -1/64 task:napi/eth2-8265 state:R running task stack:0 pid:48300 tgid:48300 ppid:2 task_flags:0x208040 flags:0x00004000 Call Trace: ? napi_threaded_poll_loop+0x27c/0x2c0 ? __pfx_napi_threaded_poll+0x10/0x10 ? napi_threaded_poll+0x26/0x80 ? kthread+0xfa/0x240 ? __pfx_kthread+0x10/0x10 ? ret_from_fork+0x31/0x50 ? __pfx_kthread+0x10/0x10 ? ret_from_fork_asm+0x1a/0x30 The cause is that in threaded busypoll, the main loop is in napi_threaded_poll rather than napi_threaded_poll_loop, where the latter rarely iterates more than once within its loop. For rcu_softirq_qs_periodic inside napi_threaded_poll_loop to report its qs state, the last_qs must be 100ms behind, and this can't happen because napi_threaded_poll_loop rarely iterates in threaded busypoll, and each time napi_threaded_poll_loop is called last_qs is reset to latest jiffies. This patch changes so that in threaded busypoll, last_qs is saved in the outer napi_threaded_poll, and whether busy_poll_last_qs is NULL indicates whether napi_threaded_poll_loop is called for busypoll. This way last_qs would not reset to latest jiffies on each invocation of napi_threaded_poll_loop. Fixes: c18d4b190a46 ("net: Extend NAPI threaded polling to allow kthread based busy polling") Cc: stable@vger.kernel.org Signed-off-by: YiFei Zhu Reviewed-by: Samiullah Khawaja Link: https://patch.msgid.link/20260227221937.1060857-1-zhuyifei@google.com Signed-off-by: Paolo Abeni --- net/core/dev.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index c1a9f7fdcffa..4af4cf2d63a4 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -7794,11 +7794,12 @@ static int napi_thread_wait(struct napi_struct *napi) return -1; } -static void napi_threaded_poll_loop(struct napi_struct *napi, bool busy_poll) +static void napi_threaded_poll_loop(struct napi_struct *napi, + unsigned long *busy_poll_last_qs) { + unsigned long last_qs = busy_poll_last_qs ? *busy_poll_last_qs : jiffies; struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; struct softnet_data *sd; - unsigned long last_qs = jiffies; for (;;) { bool repoll = false; @@ -7827,12 +7828,12 @@ static void napi_threaded_poll_loop(struct napi_struct *napi, bool busy_poll) /* When busy poll is enabled, the old packets are not flushed in * napi_complete_done. So flush them here. */ - if (busy_poll) + if (busy_poll_last_qs) gro_flush_normal(&napi->gro, HZ >= 1000); local_bh_enable(); /* Call cond_resched here to avoid watchdog warnings. */ - if (repoll || busy_poll) { + if (repoll || busy_poll_last_qs) { rcu_softirq_qs_periodic(last_qs); cond_resched(); } @@ -7840,11 +7841,15 @@ static void napi_threaded_poll_loop(struct napi_struct *napi, bool busy_poll) if (!repoll) break; } + + if (busy_poll_last_qs) + *busy_poll_last_qs = last_qs; } static int napi_threaded_poll(void *data) { struct napi_struct *napi = data; + unsigned long last_qs = jiffies; bool want_busy_poll; bool in_busy_poll; unsigned long val; @@ -7862,7 +7867,7 @@ static int napi_threaded_poll(void *data) assign_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state, want_busy_poll); - napi_threaded_poll_loop(napi, want_busy_poll); + napi_threaded_poll_loop(napi, want_busy_poll ? &last_qs : NULL); } return 0; @@ -13175,7 +13180,7 @@ static void run_backlog_napi(unsigned int cpu) { struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu); - napi_threaded_poll_loop(&sd->backlog, false); + napi_threaded_poll_loop(&sd->backlog, NULL); } static void backlog_napi_setup(unsigned int cpu) -- cgit v1.2.3 From 2ffb4f5c2ccb2fa1c049dd11899aee7967deef5a Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sun, 1 Mar 2026 11:45:48 -0800 Subject: ipv6: fix NULL pointer deref in ip6_rt_get_dev_rcu() l3mdev_master_dev_rcu() can return NULL when the slave device is being un-slaved from a VRF. All other callers deal with this, but we lost the fallback to loopback in ip6_rt_pcpu_alloc() -> ip6_rt_get_dev_rcu() with commit 4832c30d5458 ("net: ipv6: put host and anycast routes on device with address"). KASAN: null-ptr-deref in range [0x0000000000000108-0x000000000000010f] RIP: 0010:ip6_rt_pcpu_alloc (net/ipv6/route.c:1418) Call Trace: ip6_pol_route (net/ipv6/route.c:2318) fib6_rule_lookup (net/ipv6/fib6_rules.c:115) ip6_route_output_flags (net/ipv6/route.c:2607) vrf_process_v6_outbound (drivers/net/vrf.c:437) I was tempted to rework the un-slaving code to clear the flag first and insert synchronize_rcu() before we remove the upper. But looks like the explicit fallback to loopback_dev is an established pattern. And I guess avoiding the synchronize_rcu() is nice, too. Fixes: 4832c30d5458 ("net: ipv6: put host and anycast routes on device with address") Reviewed-by: David Ahern Link: https://patch.msgid.link/20260301194548.927324-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/ipv6/route.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 85df25c36409..7db0c837196c 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1063,7 +1063,8 @@ static struct net_device *ip6_rt_get_dev_rcu(const struct fib6_result *res) */ if (netif_is_l3_slave(dev) && !rt6_need_strict(&res->f6i->fib6_dst.addr)) - dev = l3mdev_master_dev_rcu(dev); + dev = l3mdev_master_dev_rcu(dev) ? : + dev_net(dev)->loopback_dev; else if (!netif_is_l3_master(dev)) dev = dev_net(dev)->loopback_dev; /* last case is netif_is_l3_master(dev) is true in which -- cgit v1.2.3 From 67edfec516d30d3e62925c397be4a1e5185802fc Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 2 Mar 2026 12:36:00 -0800 Subject: net/tcp-ao: Fix MAC comparison to be constant-time To prevent timing attacks, MACs need to be compared in constant time. Use the appropriate helper function for this. Fixes: 0a3a809089eb ("net/tcp: Verify inbound TCP-AO signed segments") Cc: stable@vger.kernel.org Signed-off-by: Eric Biggers Reviewed-by: Dmitry Safonov <0x7f454c46@gmail.com> Link: https://patch.msgid.link/20260302203600.13561-1-ebiggers@kernel.org Signed-off-by: Jakub Kicinski --- net/ipv4/Kconfig | 1 + net/ipv4/tcp_ao.c | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index b71c22475c51..3ab6247be585 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -748,6 +748,7 @@ config TCP_SIGPOOL config TCP_AO bool "TCP: Authentication Option (RFC5925)" select CRYPTO + select CRYPTO_LIB_UTILS select TCP_SIGPOOL depends on 64BIT && IPV6 != m # seq-number extension needs WRITE_ONCE(u64) help diff --git a/net/ipv4/tcp_ao.c b/net/ipv4/tcp_ao.c index 4980caddb0fc..a97cdf3e6af4 100644 --- a/net/ipv4/tcp_ao.c +++ b/net/ipv4/tcp_ao.c @@ -10,6 +10,7 @@ #define pr_fmt(fmt) "TCP: " fmt #include +#include #include #include @@ -922,7 +923,7 @@ tcp_ao_verify_hash(const struct sock *sk, const struct sk_buff *skb, /* XXX: make it per-AF callback? */ tcp_ao_hash_skb(family, hash_buf, key, sk, skb, traffic_key, (phash - (u8 *)th), sne); - if (memcmp(phash, hash_buf, maclen)) { + if (crypto_memneq(phash, hash_buf, maclen)) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAOBAD); atomic64_inc(&info->counters.pkt_bad); atomic64_inc(&key->pkt_bad); -- cgit v1.2.3 From 4ee7fa6cf78ff26d783d39e2949d14c4c1cd5e7f Mon Sep 17 00:00:00 2001 From: Yung Chih Su Date: Mon, 2 Mar 2026 14:02:47 +0800 Subject: net: ipv4: fix ARM64 alignment fault in multipath hash seed `struct sysctl_fib_multipath_hash_seed` contains two u32 fields (user_seed and mp_seed), making it an 8-byte structure with a 4-byte alignment requirement. In `fib_multipath_hash_from_keys()`, the code evaluates the entire struct atomically via `READ_ONCE()`: mp_seed = READ_ONCE(net->ipv4.sysctl_fib_multipath_hash_seed).mp_seed; While this silently works on GCC by falling back to unaligned regular loads which the ARM64 kernel tolerates, it causes a fatal kernel panic when compiled with Clang and LTO enabled. Commit e35123d83ee3 ("arm64: lto: Strengthen READ_ONCE() to acquire when CONFIG_LTO=y") strengthens `READ_ONCE()` to use Load-Acquire instructions (`ldar` / `ldapr`) to prevent compiler reordering bugs under Clang LTO. Since the macro evaluates the full 8-byte struct, Clang emits a 64-bit `ldar` instruction. ARM64 architecture strictly requires `ldar` to be naturally aligned, thus executing it on a 4-byte aligned address triggers a strict Alignment Fault (FSC = 0x21). Fix the read side by moving the `READ_ONCE()` directly to the `u32` member, which emits a safe 32-bit `ldar Wn`. Furthermore, Eric Dumazet pointed out that `WRITE_ONCE()` on the entire struct in `proc_fib_multipath_hash_set_seed()` is also flawed. Analysis shows that Clang splits this 8-byte write into two separate 32-bit `str` instructions. While this avoids an alignment fault, it destroys atomicity and exposes a tear-write vulnerability. Fix this by explicitly splitting the write into two 32-bit `WRITE_ONCE()` operations. Finally, add the missing `READ_ONCE()` when reading `user_seed` in `proc_fib_multipath_hash_seed()` to ensure proper pairing and concurrency safety. Fixes: 4ee2a8cace3f ("net: ipv4: Add a sysctl to set multipath hash seed") Signed-off-by: Yung Chih Su Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20260302060247.7066-1-yuuchihsu@gmail.com Signed-off-by: Jakub Kicinski --- net/ipv4/sysctl_net_ipv4.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 643763bc2142..5654cc9c8a0b 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -486,7 +486,8 @@ static void proc_fib_multipath_hash_set_seed(struct net *net, u32 user_seed) proc_fib_multipath_hash_rand_seed), }; - WRITE_ONCE(net->ipv4.sysctl_fib_multipath_hash_seed, new); + WRITE_ONCE(net->ipv4.sysctl_fib_multipath_hash_seed.user_seed, new.user_seed); + WRITE_ONCE(net->ipv4.sysctl_fib_multipath_hash_seed.mp_seed, new.mp_seed); } static int proc_fib_multipath_hash_seed(const struct ctl_table *table, int write, @@ -500,7 +501,7 @@ static int proc_fib_multipath_hash_seed(const struct ctl_table *table, int write int ret; mphs = &net->ipv4.sysctl_fib_multipath_hash_seed; - user_seed = mphs->user_seed; + user_seed = READ_ONCE(mphs->user_seed); tmp = *table; tmp.data = &user_seed; -- cgit v1.2.3 From 46d0d6f50dab706637f4c18a470aac20a21900d3 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 2 Mar 2026 12:34:09 -0800 Subject: net/tcp-md5: Fix MAC comparison to be constant-time To prevent timing attacks, MACs need to be compared in constant time. Use the appropriate helper function for this. Fixes: cfb6eeb4c860 ("[TCP]: MD5 Signature Option (RFC2385) support.") Fixes: 658ddaaf6694 ("tcp: md5: RST: getting md5 key from listener") Cc: stable@vger.kernel.org Signed-off-by: Eric Biggers Link: https://patch.msgid.link/20260302203409.13388-1-ebiggers@kernel.org Signed-off-by: Jakub Kicinski --- net/ipv4/Kconfig | 1 + net/ipv4/tcp.c | 3 ++- net/ipv4/tcp_ipv4.c | 3 ++- net/ipv6/tcp_ipv6.c | 3 ++- 4 files changed, 7 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 3ab6247be585..df922f9f5289 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -762,6 +762,7 @@ config TCP_AO config TCP_MD5SIG bool "TCP: MD5 Signature Option support (RFC2385)" select CRYPTO_LIB_MD5 + select CRYPTO_LIB_UTILS help RFC2385 specifies a method of giving MD5 protection to TCP sessions. Its main (only?) use is to protect BGP sessions between core routers diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 8cdc26e8ad68..202a4e57a218 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -244,6 +244,7 @@ #define pr_fmt(fmt) "TCP: " fmt #include +#include #include #include #include @@ -4970,7 +4971,7 @@ tcp_inbound_md5_hash(const struct sock *sk, const struct sk_buff *skb, tcp_v4_md5_hash_skb(newhash, key, NULL, skb); else tp->af_specific->calc_md5_hash(newhash, key, NULL, skb); - if (memcmp(hash_location, newhash, 16) != 0) { + if (crypto_memneq(hash_location, newhash, 16)) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE); trace_tcp_hash_md5_mismatch(sk, skb); return SKB_DROP_REASON_TCP_MD5FAILURE; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index d53d39be291a..910c25cb24e1 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -88,6 +88,7 @@ #include #include +#include #include @@ -839,7 +840,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb, goto out; tcp_v4_md5_hash_skb(newhash, key, NULL, skb); - if (memcmp(md5_hash_location, newhash, 16) != 0) + if (crypto_memneq(md5_hash_location, newhash, 16)) goto out; } diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index e46a0efae012..5195a46b951e 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -68,6 +68,7 @@ #include #include +#include #include @@ -1048,7 +1049,7 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb, key.type = TCP_KEY_MD5; tcp_v6_md5_hash_skb(newhash, key.md5_key, NULL, skb); - if (memcmp(md5_hash_location, newhash, 16) != 0) + if (crypto_memneq(md5_hash_location, newhash, 16)) goto out; } #endif -- cgit v1.2.3 From 165573e41f2f66ef98940cf65f838b2cb575d9d1 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 2 Mar 2026 20:55:27 +0000 Subject: tcp: secure_seq: add back ports to TS offset This reverts 28ee1b746f49 ("secure_seq: downgrade to per-host timestamp offsets") tcp_tw_recycle went away in 2017. Zhouyan Deng reported off-path TCP source port leakage via SYN cookie side-channel that can be fixed in multiple ways. One of them is to bring back TCP ports in TS offset randomization. As a bonus, we perform a single siphash() computation to provide both an ISN and a TS offset. Fixes: 28ee1b746f49 ("secure_seq: downgrade to per-host timestamp offsets") Reported-by: Zhouyan Deng Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Acked-by: Florian Westphal Link: https://patch.msgid.link/20260302205527.1982836-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/core/secure_seq.c | 80 +++++++++++++++++++-------------------------------- net/ipv4/syncookies.c | 11 +++++-- net/ipv4/tcp_input.c | 8 ++++-- net/ipv4/tcp_ipv4.c | 37 +++++++++++------------- net/ipv6/syncookies.c | 11 +++++-- net/ipv6/tcp_ipv6.c | 37 +++++++++++------------- 6 files changed, 85 insertions(+), 99 deletions(-) (limited to 'net') diff --git a/net/core/secure_seq.c b/net/core/secure_seq.c index 9a3965680451..6a6f2cda5aae 100644 --- a/net/core/secure_seq.c +++ b/net/core/secure_seq.c @@ -20,7 +20,6 @@ #include static siphash_aligned_key_t net_secret; -static siphash_aligned_key_t ts_secret; #define EPHEMERAL_PORT_SHUFFLE_PERIOD (10 * HZ) @@ -28,11 +27,6 @@ static __always_inline void net_secret_init(void) { net_get_random_once(&net_secret, sizeof(net_secret)); } - -static __always_inline void ts_secret_init(void) -{ - net_get_random_once(&ts_secret, sizeof(ts_secret)); -} #endif #ifdef CONFIG_INET @@ -53,28 +47,9 @@ static u32 seq_scale(u32 seq) #endif #if IS_ENABLED(CONFIG_IPV6) -u32 secure_tcpv6_ts_off(const struct net *net, - const __be32 *saddr, const __be32 *daddr) -{ - const struct { - struct in6_addr saddr; - struct in6_addr daddr; - } __aligned(SIPHASH_ALIGNMENT) combined = { - .saddr = *(struct in6_addr *)saddr, - .daddr = *(struct in6_addr *)daddr, - }; - - if (READ_ONCE(net->ipv4.sysctl_tcp_timestamps) != 1) - return 0; - - ts_secret_init(); - return siphash(&combined, offsetofend(typeof(combined), daddr), - &ts_secret); -} -EXPORT_IPV6_MOD(secure_tcpv6_ts_off); - -u32 secure_tcpv6_seq(const __be32 *saddr, const __be32 *daddr, - __be16 sport, __be16 dport) +union tcp_seq_and_ts_off +secure_tcpv6_seq_and_ts_off(const struct net *net, const __be32 *saddr, + const __be32 *daddr, __be16 sport, __be16 dport) { const struct { struct in6_addr saddr; @@ -87,14 +62,20 @@ u32 secure_tcpv6_seq(const __be32 *saddr, const __be32 *daddr, .sport = sport, .dport = dport }; - u32 hash; + union tcp_seq_and_ts_off st; net_secret_init(); - hash = siphash(&combined, offsetofend(typeof(combined), dport), - &net_secret); - return seq_scale(hash); + + st.hash64 = siphash(&combined, offsetofend(typeof(combined), dport), + &net_secret); + + if (READ_ONCE(net->ipv4.sysctl_tcp_timestamps) != 1) + st.ts_off = 0; + + st.seq = seq_scale(st.seq); + return st; } -EXPORT_SYMBOL(secure_tcpv6_seq); +EXPORT_SYMBOL(secure_tcpv6_seq_and_ts_off); u64 secure_ipv6_port_ephemeral(const __be32 *saddr, const __be32 *daddr, __be16 dport) @@ -118,33 +99,30 @@ EXPORT_SYMBOL(secure_ipv6_port_ephemeral); #endif #ifdef CONFIG_INET -u32 secure_tcp_ts_off(const struct net *net, __be32 saddr, __be32 daddr) -{ - if (READ_ONCE(net->ipv4.sysctl_tcp_timestamps) != 1) - return 0; - - ts_secret_init(); - return siphash_2u32((__force u32)saddr, (__force u32)daddr, - &ts_secret); -} - /* secure_tcp_seq_and_tsoff(a, b, 0, d) == secure_ipv4_port_ephemeral(a, b, d), * but fortunately, `sport' cannot be 0 in any circumstances. If this changes, * it would be easy enough to have the former function use siphash_4u32, passing * the arguments as separate u32. */ -u32 secure_tcp_seq(__be32 saddr, __be32 daddr, - __be16 sport, __be16 dport) +union tcp_seq_and_ts_off +secure_tcp_seq_and_ts_off(const struct net *net, __be32 saddr, __be32 daddr, + __be16 sport, __be16 dport) { - u32 hash; + u32 ports = (__force u32)sport << 16 | (__force u32)dport; + union tcp_seq_and_ts_off st; net_secret_init(); - hash = siphash_3u32((__force u32)saddr, (__force u32)daddr, - (__force u32)sport << 16 | (__force u32)dport, - &net_secret); - return seq_scale(hash); + + st.hash64 = siphash_3u32((__force u32)saddr, (__force u32)daddr, + ports, &net_secret); + + if (READ_ONCE(net->ipv4.sysctl_tcp_timestamps) != 1) + st.ts_off = 0; + + st.seq = seq_scale(st.seq); + return st; } -EXPORT_SYMBOL_GPL(secure_tcp_seq); +EXPORT_SYMBOL_GPL(secure_tcp_seq_and_ts_off); u64 secure_ipv4_port_ephemeral(__be32 saddr, __be32 daddr, __be16 dport) { diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 061751aabc8e..fc3affd9c801 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -378,9 +378,14 @@ static struct request_sock *cookie_tcp_check(struct net *net, struct sock *sk, tcp_parse_options(net, skb, &tcp_opt, 0, NULL); if (tcp_opt.saw_tstamp && tcp_opt.rcv_tsecr) { - tsoff = secure_tcp_ts_off(net, - ip_hdr(skb)->daddr, - ip_hdr(skb)->saddr); + union tcp_seq_and_ts_off st; + + st = secure_tcp_seq_and_ts_off(net, + ip_hdr(skb)->daddr, + ip_hdr(skb)->saddr, + tcp_hdr(skb)->dest, + tcp_hdr(skb)->source); + tsoff = st.ts_off; tcp_opt.rcv_tsecr -= tsoff; } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 7b03f2460751..cba89733d121 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -7646,6 +7646,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, const struct tcp_sock *tp = tcp_sk(sk); struct net *net = sock_net(sk); struct sock *fastopen_sk = NULL; + union tcp_seq_and_ts_off st; struct request_sock *req; bool want_cookie = false; struct dst_entry *dst; @@ -7715,9 +7716,12 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, if (!dst) goto drop_and_free; + if (tmp_opt.tstamp_ok || (!want_cookie && !isn)) + st = af_ops->init_seq_and_ts_off(net, skb); + if (tmp_opt.tstamp_ok) { tcp_rsk(req)->req_usec_ts = dst_tcp_usec_ts(dst); - tcp_rsk(req)->ts_off = af_ops->init_ts_off(net, skb); + tcp_rsk(req)->ts_off = st.ts_off; } if (!want_cookie && !isn) { int max_syn_backlog = READ_ONCE(net->ipv4.sysctl_max_syn_backlog); @@ -7739,7 +7743,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, goto drop_and_release; } - isn = af_ops->init_seq(skb); + isn = st.seq; } tcp_ecn_create_request(req, skb, sk, dst); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 910c25cb24e1..c7b2463c2e25 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -105,17 +105,14 @@ static DEFINE_PER_CPU(struct sock_bh_locked, ipv4_tcp_sk) = { static DEFINE_MUTEX(tcp_exit_batch_mutex); -static u32 tcp_v4_init_seq(const struct sk_buff *skb) +static union tcp_seq_and_ts_off +tcp_v4_init_seq_and_ts_off(const struct net *net, const struct sk_buff *skb) { - return secure_tcp_seq(ip_hdr(skb)->daddr, - ip_hdr(skb)->saddr, - tcp_hdr(skb)->dest, - tcp_hdr(skb)->source); -} - -static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb) -{ - return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr); + return secure_tcp_seq_and_ts_off(net, + ip_hdr(skb)->daddr, + ip_hdr(skb)->saddr, + tcp_hdr(skb)->dest, + tcp_hdr(skb)->source); } int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp) @@ -327,15 +324,16 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len rt = NULL; if (likely(!tp->repair)) { + union tcp_seq_and_ts_off st; + + st = secure_tcp_seq_and_ts_off(net, + inet->inet_saddr, + inet->inet_daddr, + inet->inet_sport, + usin->sin_port); if (!tp->write_seq) - WRITE_ONCE(tp->write_seq, - secure_tcp_seq(inet->inet_saddr, - inet->inet_daddr, - inet->inet_sport, - usin->sin_port)); - WRITE_ONCE(tp->tsoffset, - secure_tcp_ts_off(net, inet->inet_saddr, - inet->inet_daddr)); + WRITE_ONCE(tp->write_seq, st.seq); + WRITE_ONCE(tp->tsoffset, st.ts_off); } atomic_set(&inet->inet_id, get_random_u16()); @@ -1677,8 +1675,7 @@ const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = { .cookie_init_seq = cookie_v4_init_sequence, #endif .route_req = tcp_v4_route_req, - .init_seq = tcp_v4_init_seq, - .init_ts_off = tcp_v4_init_ts_off, + .init_seq_and_ts_off = tcp_v4_init_seq_and_ts_off, .send_synack = tcp_v4_send_synack, }; diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index 7e007f013ec8..4f6f0d751d6c 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -151,9 +151,14 @@ static struct request_sock *cookie_tcp_check(struct net *net, struct sock *sk, tcp_parse_options(net, skb, &tcp_opt, 0, NULL); if (tcp_opt.saw_tstamp && tcp_opt.rcv_tsecr) { - tsoff = secure_tcpv6_ts_off(net, - ipv6_hdr(skb)->daddr.s6_addr32, - ipv6_hdr(skb)->saddr.s6_addr32); + union tcp_seq_and_ts_off st; + + st = secure_tcpv6_seq_and_ts_off(net, + ipv6_hdr(skb)->daddr.s6_addr32, + ipv6_hdr(skb)->saddr.s6_addr32, + tcp_hdr(skb)->dest, + tcp_hdr(skb)->source); + tsoff = st.ts_off; tcp_opt.rcv_tsecr -= tsoff; } diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 5195a46b951e..bb09d5ccf599 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -105,18 +105,14 @@ static void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) } } -static u32 tcp_v6_init_seq(const struct sk_buff *skb) +static union tcp_seq_and_ts_off +tcp_v6_init_seq_and_ts_off(const struct net *net, const struct sk_buff *skb) { - return secure_tcpv6_seq(ipv6_hdr(skb)->daddr.s6_addr32, - ipv6_hdr(skb)->saddr.s6_addr32, - tcp_hdr(skb)->dest, - tcp_hdr(skb)->source); -} - -static u32 tcp_v6_init_ts_off(const struct net *net, const struct sk_buff *skb) -{ - return secure_tcpv6_ts_off(net, ipv6_hdr(skb)->daddr.s6_addr32, - ipv6_hdr(skb)->saddr.s6_addr32); + return secure_tcpv6_seq_and_ts_off(net, + ipv6_hdr(skb)->daddr.s6_addr32, + ipv6_hdr(skb)->saddr.s6_addr32, + tcp_hdr(skb)->dest, + tcp_hdr(skb)->source); } static int tcp_v6_pre_connect(struct sock *sk, struct sockaddr_unsized *uaddr, @@ -320,14 +316,16 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr_unsized *uaddr, sk_set_txhash(sk); if (likely(!tp->repair)) { + union tcp_seq_and_ts_off st; + + st = secure_tcpv6_seq_and_ts_off(net, + np->saddr.s6_addr32, + sk->sk_v6_daddr.s6_addr32, + inet->inet_sport, + inet->inet_dport); if (!tp->write_seq) - WRITE_ONCE(tp->write_seq, - secure_tcpv6_seq(np->saddr.s6_addr32, - sk->sk_v6_daddr.s6_addr32, - inet->inet_sport, - inet->inet_dport)); - tp->tsoffset = secure_tcpv6_ts_off(net, np->saddr.s6_addr32, - sk->sk_v6_daddr.s6_addr32); + WRITE_ONCE(tp->write_seq, st.seq); + tp->tsoffset = st.ts_off; } if (tcp_fastopen_defer_connect(sk, &err)) @@ -817,8 +815,7 @@ const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = { .cookie_init_seq = cookie_v6_init_sequence, #endif .route_req = tcp_v6_route_req, - .init_seq = tcp_v6_init_seq, - .init_ts_off = tcp_v6_init_ts_off, + .init_seq_and_ts_off = tcp_v6_init_seq_and_ts_off, .send_synack = tcp_v6_send_synack, }; -- cgit v1.2.3 From f7d92f11bd33a6eb49c7c812255ef4ab13681f0f Mon Sep 17 00:00:00 2001 From: Ian Ray Date: Mon, 2 Mar 2026 18:32:37 +0200 Subject: net: nfc: nci: Fix zero-length proprietary notifications NCI NFC controllers may have proprietary OIDs with zero-length payload. One example is: drivers/nfc/nxp-nci/core.c, NXP_NCI_RF_TXLDO_ERROR_NTF. Allow a zero length payload in proprietary notifications *only*. Before: -- >8 -- kernel: nci: nci_recv_frame: len 3 -- >8 -- After: -- >8 -- kernel: nci: nci_recv_frame: len 3 kernel: nci: nci_ntf_packet: NCI RX: MT=ntf, PBF=0, GID=0x1, OID=0x23, plen=0 kernel: nci: nci_ntf_packet: unknown ntf opcode 0x123 kernel: nfc nfc0: NFC: RF transmitter couldn't start. Bad power and/or configuration? -- >8 -- After fixing the hardware: -- >8 -- kernel: nci: nci_recv_frame: len 27 kernel: nci: nci_ntf_packet: NCI RX: MT=ntf, PBF=0, GID=0x1, OID=0x5, plen=24 kernel: nci: nci_rf_intf_activated_ntf_packet: rf_discovery_id 1 -- >8 -- Fixes: d24b03535e5e ("nfc: nci: Fix uninit-value in nci_dev_up and nci_ntf_packet") Signed-off-by: Ian Ray Link: https://patch.msgid.link/20260302163238.140576-1-ian.ray@gehealthcare.com Signed-off-by: Jakub Kicinski --- net/nfc/nci/core.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/nfc/nci/core.c b/net/nfc/nci/core.c index 6e9b76e2cc56..e95eef85971f 100644 --- a/net/nfc/nci/core.c +++ b/net/nfc/nci/core.c @@ -1482,10 +1482,20 @@ static bool nci_valid_size(struct sk_buff *skb) unsigned int hdr_size = NCI_CTRL_HDR_SIZE; if (skb->len < hdr_size || - !nci_plen(skb->data) || skb->len < hdr_size + nci_plen(skb->data)) { return false; } + + if (!nci_plen(skb->data)) { + /* Allow zero length in proprietary notifications (0x20 - 0x3F). */ + if (nci_opcode_oid(nci_opcode(skb->data)) >= 0x20 && + nci_mt(skb->data) == NCI_MT_NTF_PKT) + return true; + + /* Disallow zero length otherwise. */ + return false; + } + return true; } -- cgit v1.2.3 From a4c2b8be2e5329e7fac6e8f64ddcb8958155cfcb Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 4 Mar 2026 01:56:40 +0000 Subject: net_sched: sch_fq: clear q->band_pkt_count[] in fq_reset() When/if a NIC resets, queues are deactivated by dev_deactivate_many(), then reactivated when the reset operation completes. fq_reset() removes all the skbs from various queues. If we do not clear q->band_pkt_count[], these counters keep growing and can eventually reach sch->limit, preventing new packets to be queued. Many thanks to Praveen for discovering the root cause. Fixes: 29f834aa326e ("net_sched: sch_fq: add 3 bands and WRR scheduling") Diagnosed-by: Praveen Kaligineedi Signed-off-by: Eric Dumazet Reviewed-by: Neal Cardwell Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20260304015640.961780-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/sched/sch_fq.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index 80235e85f844..05084c9af48e 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -827,6 +827,7 @@ static void fq_reset(struct Qdisc *sch) for (idx = 0; idx < FQ_BANDS; idx++) { q->band_flows[idx].new_flows.first = NULL; q->band_flows[idx].old_flows.first = NULL; + q->band_pkt_count[idx] = 0; } q->delayed = RB_ROOT; q->flows = 0; -- cgit v1.2.3 From 40bf00ec2ee271df5ba67593991760adf8b5d0ed Mon Sep 17 00:00:00 2001 From: Bobby Eshleman Date: Mon, 2 Mar 2026 16:32:56 -0800 Subject: net: devmem: use READ_ONCE/WRITE_ONCE on binding->dev binding->dev is protected on the write-side in mp_dmabuf_devmem_uninstall() against concurrent writes, but due to the concurrent bare reads in net_devmem_get_binding() and validate_xmit_unreadable_skb() it should be wrapped in a READ_ONCE/WRITE_ONCE pair to make sure no compiler optimizations play with the underlying register in unforeseen ways. Doesn't present a critical bug because the known compiler optimizations don't result in bad behavior. There is no tearing on u64, and load omissions/invented loads would only break if additional binding->dev references were inlined together (they aren't right now). This just more strictly follows the linux memory model (i.e., "Lock-Protected Writes With Lockless Reads" in tools/memory-model/Documentation/access-marking.txt). Fixes: bd61848900bf ("net: devmem: Implement TX path") Signed-off-by: Bobby Eshleman Link: https://patch.msgid.link/20260302-devmem-membar-fix-v2-1-5b33c9cbc28b@meta.com Signed-off-by: Jakub Kicinski --- net/core/dev.c | 2 +- net/core/devmem.c | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 4af4cf2d63a4..20610a192ec7 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3987,7 +3987,7 @@ static struct sk_buff *validate_xmit_unreadable_skb(struct sk_buff *skb, if (shinfo->nr_frags > 0) { niov = netmem_to_net_iov(skb_frag_netmem(&shinfo->frags[0])); if (net_is_devmem_iov(niov) && - net_devmem_iov_binding(niov)->dev != dev) + READ_ONCE(net_devmem_iov_binding(niov)->dev) != dev) goto out_free; } diff --git a/net/core/devmem.c b/net/core/devmem.c index 8c9aad776bf4..69d79aee07ef 100644 --- a/net/core/devmem.c +++ b/net/core/devmem.c @@ -396,7 +396,8 @@ struct net_devmem_dmabuf_binding *net_devmem_get_binding(struct sock *sk, * net_device. */ dst_dev = dst_dev_rcu(dst); - if (unlikely(!dst_dev) || unlikely(dst_dev != binding->dev)) { + if (unlikely(!dst_dev) || + unlikely(dst_dev != READ_ONCE(binding->dev))) { err = -ENODEV; goto out_unlock; } @@ -513,7 +514,8 @@ static void mp_dmabuf_devmem_uninstall(void *mp_priv, xa_erase(&binding->bound_rxqs, xa_idx); if (xa_empty(&binding->bound_rxqs)) { mutex_lock(&binding->lock); - binding->dev = NULL; + ASSERT_EXCLUSIVE_WRITER(binding->dev); + WRITE_ONCE(binding->dev, NULL); mutex_unlock(&binding->lock); } break; -- cgit v1.2.3 From 7bd4b0c4779f978a6528c9b7937d2ca18e936e2c Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 3 Mar 2026 08:23:41 -0800 Subject: nfc: nci: free skb on nci_transceive early error paths nci_transceive() takes ownership of the skb passed by the caller, but the -EPROTO, -EINVAL, and -EBUSY error paths return without freeing it. Due to issues clearing NCI_DATA_EXCHANGE fixed by subsequent changes the nci/nci_dev selftest hits the error path occasionally in NIPA, and kmemleak detects leaks: unreferenced object 0xff11000015ce6a40 (size 640): comm "nci_dev", pid 3954, jiffies 4295441246 hex dump (first 32 bytes): 6b 6b 6b 6b 00 a4 00 0c 02 e1 03 6b 6b 6b 6b 6b kkkk.......kkkkk 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b kkkkkkkkkkkkkkkk backtrace (crc 7c40cc2a): kmem_cache_alloc_node_noprof+0x492/0x630 __alloc_skb+0x11e/0x5f0 alloc_skb_with_frags+0xc6/0x8f0 sock_alloc_send_pskb+0x326/0x3f0 nfc_alloc_send_skb+0x94/0x1d0 rawsock_sendmsg+0x162/0x4c0 do_syscall_64+0x117/0xfc0 Fixes: 6a2968aaf50c ("NFC: basic NCI protocol implementation") Reviewed-by: Joe Damato Link: https://patch.msgid.link/20260303162346.2071888-2-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/nfc/nci/core.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/nfc/nci/core.c b/net/nfc/nci/core.c index e95eef85971f..e859b834c0f1 100644 --- a/net/nfc/nci/core.c +++ b/net/nfc/nci/core.c @@ -1035,18 +1035,23 @@ static int nci_transceive(struct nfc_dev *nfc_dev, struct nfc_target *target, struct nci_conn_info *conn_info; conn_info = ndev->rf_conn_info; - if (!conn_info) + if (!conn_info) { + kfree_skb(skb); return -EPROTO; + } pr_debug("target_idx %d, len %d\n", target->idx, skb->len); if (!ndev->target_active_prot) { pr_err("unable to exchange data, no active target\n"); + kfree_skb(skb); return -EINVAL; } - if (test_and_set_bit(NCI_DATA_EXCHANGE, &ndev->flags)) + if (test_and_set_bit(NCI_DATA_EXCHANGE, &ndev->flags)) { + kfree_skb(skb); return -EBUSY; + } /* store cb and context to be used on receiving data */ conn_info->data_exchange_cb = cb; -- cgit v1.2.3 From d42449d2c17cdf06d1f63268557450bd3f051e9a Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 3 Mar 2026 08:23:42 -0800 Subject: nfc: digital: free skb on digital_in_send error paths digital_in_send() takes ownership of the skb passed by the caller (nfc_data_exchange), make sure it's freed on all error paths. Found looking around the real driver for similar bugs to the one just fixed in nci. Fixes: 2c66daecc409 ("NFC Digital: Add NFC-A technology support") Reviewed-by: Joe Damato Link: https://patch.msgid.link/20260303162346.2071888-3-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/nfc/digital_core.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/nfc/digital_core.c b/net/nfc/digital_core.c index 3670bb33732e..7cb1e6aaae90 100644 --- a/net/nfc/digital_core.c +++ b/net/nfc/digital_core.c @@ -707,8 +707,10 @@ static int digital_in_send(struct nfc_dev *nfc_dev, struct nfc_target *target, int rc; data_exch = kzalloc_obj(*data_exch); - if (!data_exch) + if (!data_exch) { + kfree_skb(skb); return -ENOMEM; + } data_exch->cb = cb; data_exch->cb_context = cb_context; @@ -731,8 +733,10 @@ static int digital_in_send(struct nfc_dev *nfc_dev, struct nfc_target *target, data_exch); exit: - if (rc) + if (rc) { + kfree_skb(skb); kfree(data_exch); + } return rc; } -- cgit v1.2.3 From 66083581945bd5b8e99fe49b5aeb83d03f62d053 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 3 Mar 2026 08:23:43 -0800 Subject: nfc: nci: complete pending data exchange on device close In nci_close_device(), complete any pending data exchange before closing. The data exchange callback (e.g. rawsock_data_exchange_complete) holds a socket reference. NIPA occasionally hits this leak: unreferenced object 0xff1100000f435000 (size 2048): comm "nci_dev", pid 3954, jiffies 4295441245 hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 27 00 01 40 00 00 00 00 00 00 00 00 00 00 00 00 '..@............ backtrace (crc ec2b3c5): __kmalloc_noprof+0x4db/0x730 sk_prot_alloc.isra.0+0xe4/0x1d0 sk_alloc+0x36/0x760 rawsock_create+0xd1/0x540 nfc_sock_create+0x11f/0x280 __sock_create+0x22d/0x630 __sys_socket+0x115/0x1d0 __x64_sys_socket+0x72/0xd0 do_syscall_64+0x117/0xfc0 entry_SYSCALL_64_after_hwframe+0x4b/0x53 Fixes: 38f04c6b1b68 ("NFC: protect nci_data_exchange transactions") Reviewed-by: Joe Damato Link: https://patch.msgid.link/20260303162346.2071888-4-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/nfc/nci/core.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'net') diff --git a/net/nfc/nci/core.c b/net/nfc/nci/core.c index e859b834c0f1..43d871525dbc 100644 --- a/net/nfc/nci/core.c +++ b/net/nfc/nci/core.c @@ -567,6 +567,10 @@ static int nci_close_device(struct nci_dev *ndev) flush_workqueue(ndev->cmd_wq); timer_delete_sync(&ndev->cmd_timer); timer_delete_sync(&ndev->data_timer); + if (test_bit(NCI_DATA_EXCHANGE, &ndev->flags)) + nci_data_exchange_complete(ndev, NULL, + ndev->cur_conn_id, + -ENODEV); mutex_unlock(&ndev->req_lock); return 0; } @@ -598,6 +602,11 @@ static int nci_close_device(struct nci_dev *ndev) flush_workqueue(ndev->cmd_wq); timer_delete_sync(&ndev->cmd_timer); + timer_delete_sync(&ndev->data_timer); + + if (test_bit(NCI_DATA_EXCHANGE, &ndev->flags)) + nci_data_exchange_complete(ndev, NULL, ndev->cur_conn_id, + -ENODEV); /* Clear flags except NCI_UNREG */ ndev->flags &= BIT(NCI_UNREG); -- cgit v1.2.3 From 0efdc02f4f6d52f8ca5d5889560f325a836ce0a8 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 3 Mar 2026 08:23:44 -0800 Subject: nfc: nci: clear NCI_DATA_EXCHANGE before calling completion callback Move clear_bit(NCI_DATA_EXCHANGE) before invoking the data exchange callback in nci_data_exchange_complete(). The callback (e.g. rawsock_data_exchange_complete) may immediately schedule another data exchange via schedule_work(tx_work). On a multi-CPU system, tx_work can run and reach nci_transceive() before the current nci_data_exchange_complete() clears the flag, causing test_and_set_bit(NCI_DATA_EXCHANGE) to return -EBUSY and the new transfer to fail. This causes intermittent flakes in nci/nci_dev in NIPA: # # RUN NCI.NCI1_0.t4t_tag_read ... # # t4t_tag_read: Test terminated by timeout # # FAIL NCI.NCI1_0.t4t_tag_read # not ok 3 NCI.NCI1_0.t4t_tag_read Fixes: 38f04c6b1b68 ("NFC: protect nci_data_exchange transactions") Reviewed-by: Joe Damato Link: https://patch.msgid.link/20260303162346.2071888-5-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/nfc/nci/data.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/nfc/nci/data.c b/net/nfc/nci/data.c index 78f4131af3cf..5f98c73db5af 100644 --- a/net/nfc/nci/data.c +++ b/net/nfc/nci/data.c @@ -33,7 +33,8 @@ void nci_data_exchange_complete(struct nci_dev *ndev, struct sk_buff *skb, conn_info = nci_get_conn_info_by_conn_id(ndev, conn_id); if (!conn_info) { kfree_skb(skb); - goto exit; + clear_bit(NCI_DATA_EXCHANGE, &ndev->flags); + return; } cb = conn_info->data_exchange_cb; @@ -45,6 +46,12 @@ void nci_data_exchange_complete(struct nci_dev *ndev, struct sk_buff *skb, timer_delete_sync(&ndev->data_timer); clear_bit(NCI_DATA_EXCHANGE_TO, &ndev->flags); + /* Mark the exchange as done before calling the callback. + * The callback (e.g. rawsock_data_exchange_complete) may + * want to immediately queue another data exchange. + */ + clear_bit(NCI_DATA_EXCHANGE, &ndev->flags); + if (cb) { /* forward skb to nfc core */ cb(cb_context, skb, err); @@ -54,9 +61,6 @@ void nci_data_exchange_complete(struct nci_dev *ndev, struct sk_buff *skb, /* no waiting callback, free skb */ kfree_skb(skb); } - -exit: - clear_bit(NCI_DATA_EXCHANGE, &ndev->flags); } /* ----------------- NCI TX Data ----------------- */ -- cgit v1.2.3 From d793458c45df2aed498d7f74145eab7ee22d25aa Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 3 Mar 2026 08:23:45 -0800 Subject: nfc: rawsock: cancel tx_work before socket teardown In rawsock_release(), cancel any pending tx_work and purge the write queue before orphaning the socket. rawsock_tx_work runs on the system workqueue and calls nfc_data_exchange which dereferences the NCI device. Without synchronization, tx_work can race with socket and device teardown when a process is killed (e.g. by SIGKILL), leading to use-after-free or leaked references. Set SEND_SHUTDOWN first so that if tx_work is already running it will see the flag and skip transmitting, then use cancel_work_sync to wait for any in-progress execution to finish, and finally purge any remaining queued skbs. Fixes: 23b7869c0fd0 ("NFC: add the NFC socket raw protocol") Reviewed-by: Joe Damato Link: https://patch.msgid.link/20260303162346.2071888-6-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/nfc/rawsock.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'net') diff --git a/net/nfc/rawsock.c b/net/nfc/rawsock.c index b049022399ae..f7d7a599fade 100644 --- a/net/nfc/rawsock.c +++ b/net/nfc/rawsock.c @@ -67,6 +67,17 @@ static int rawsock_release(struct socket *sock) if (sock->type == SOCK_RAW) nfc_sock_unlink(&raw_sk_list, sk); + if (sk->sk_state == TCP_ESTABLISHED) { + /* Prevent rawsock_tx_work from starting new transmits and + * wait for any in-progress work to finish. This must happen + * before the socket is orphaned to avoid a race where + * rawsock_tx_work runs after the NCI device has been freed. + */ + sk->sk_shutdown |= SEND_SHUTDOWN; + cancel_work_sync(&nfc_rawsock(sk)->tx_work); + rawsock_write_queue_purge(sk); + } + sock_orphan(sk); sock_put(sk); -- cgit v1.2.3 From fb8d0bccb221080630efcd9660c9f9349e53cc9e Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Tue, 3 Mar 2026 11:56:03 +0100 Subject: mptcp: pm: avoid sending RM_ADDR over same subflow RM_ADDR are sent over an active subflow, the first one in the subflows list. There is then a high chance the initial subflow is picked. With the in-kernel PM, when an endpoint is removed, a RM_ADDR is sent, then linked subflows are closed. This is done for each active MPTCP connection. MPTCP endpoints are likely removed because the attached network is no longer available or usable. In this case, it is better to avoid sending this RM_ADDR over the subflow that is going to be removed, but prefer sending it over another active and non stale subflow, if any. This modification avoids situations where the other end is not notified when a subflow is no longer usable: typically when the endpoint linked to the initial subflow is removed, especially on the server side. Fixes: 8dd5efb1f91b ("mptcp: send ack for rm_addr") Cc: stable@vger.kernel.org Reported-by: Frank Lorenz Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/612 Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20260303-net-mptcp-misc-fixes-7-0-rc2-v1-2-4b5462b6f016@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/pm.c | 55 +++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 43 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index 7298836469b3..57a456690406 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -212,9 +212,24 @@ void mptcp_pm_send_ack(struct mptcp_sock *msk, spin_lock_bh(&msk->pm.lock); } -void mptcp_pm_addr_send_ack(struct mptcp_sock *msk) +static bool subflow_in_rm_list(const struct mptcp_subflow_context *subflow, + const struct mptcp_rm_list *rm_list) +{ + u8 i, id = subflow_get_local_id(subflow); + + for (i = 0; i < rm_list->nr; i++) { + if (rm_list->ids[i] == id) + return true; + } + + return false; +} + +static void +mptcp_pm_addr_send_ack_avoid_list(struct mptcp_sock *msk, + const struct mptcp_rm_list *rm_list) { - struct mptcp_subflow_context *subflow, *alt = NULL; + struct mptcp_subflow_context *subflow, *stale = NULL, *same_id = NULL; msk_owned_by_me(msk); lockdep_assert_held(&msk->pm.lock); @@ -224,19 +239,35 @@ void mptcp_pm_addr_send_ack(struct mptcp_sock *msk) return; mptcp_for_each_subflow(msk, subflow) { - if (__mptcp_subflow_active(subflow)) { - if (!subflow->stale) { - mptcp_pm_send_ack(msk, subflow, false, false); - return; - } + if (!__mptcp_subflow_active(subflow)) + continue; - if (!alt) - alt = subflow; + if (unlikely(subflow->stale)) { + if (!stale) + stale = subflow; + } else if (unlikely(rm_list && + subflow_in_rm_list(subflow, rm_list))) { + if (!same_id) + same_id = subflow; + } else { + goto send_ack; } } - if (alt) - mptcp_pm_send_ack(msk, alt, false, false); + if (same_id) + subflow = same_id; + else if (stale) + subflow = stale; + else + return; + +send_ack: + mptcp_pm_send_ack(msk, subflow, false, false); +} + +void mptcp_pm_addr_send_ack(struct mptcp_sock *msk) +{ + mptcp_pm_addr_send_ack_avoid_list(msk, NULL); } int mptcp_pm_mp_prio_send_ack(struct mptcp_sock *msk, @@ -470,7 +501,7 @@ int mptcp_pm_remove_addr(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_ msk->pm.rm_list_tx = *rm_list; rm_addr |= BIT(MPTCP_RM_ADDR_SIGNAL); WRITE_ONCE(msk->pm.addr_signal, rm_addr); - mptcp_pm_addr_send_ack(msk); + mptcp_pm_addr_send_ack_avoid_list(msk, rm_list); return 0; } -- cgit v1.2.3 From 579a752464a64cb5f9139102f0e6b90a1f595ceb Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Tue, 3 Mar 2026 11:56:05 +0100 Subject: mptcp: pm: in-kernel: always mark signal+subflow endp as used Syzkaller managed to find a combination of actions that was generating this warning: msk->pm.local_addr_used == 0 WARNING: net/mptcp/pm_kernel.c:1071 at __mark_subflow_endp_available net/mptcp/pm_kernel.c:1071 [inline], CPU#1: syz.2.17/961 WARNING: net/mptcp/pm_kernel.c:1071 at mptcp_nl_remove_subflow_and_signal_addr net/mptcp/pm_kernel.c:1103 [inline], CPU#1: syz.2.17/961 WARNING: net/mptcp/pm_kernel.c:1071 at mptcp_pm_nl_del_addr_doit+0x81d/0x8f0 net/mptcp/pm_kernel.c:1210, CPU#1: syz.2.17/961 Modules linked in: CPU: 1 UID: 0 PID: 961 Comm: syz.2.17 Not tainted 6.19.0-08368-gfafda3b4b06b #22 PREEMPT(full) Hardware name: QEMU Ubuntu 25.10 PC v2 (i440FX + PIIX, + 10.1 machine, 1996), BIOS 1.17.0-debian-1.17.0-1build1 04/01/2014 RIP: 0010:__mark_subflow_endp_available net/mptcp/pm_kernel.c:1071 [inline] RIP: 0010:mptcp_nl_remove_subflow_and_signal_addr net/mptcp/pm_kernel.c:1103 [inline] RIP: 0010:mptcp_pm_nl_del_addr_doit+0x81d/0x8f0 net/mptcp/pm_kernel.c:1210 Code: 89 c5 e8 46 30 6f fe e9 21 fd ff ff 49 83 ed 80 e8 38 30 6f fe 4c 89 ef be 03 00 00 00 e8 db 49 df fe eb ac e8 24 30 6f fe 90 <0f> 0b 90 e9 1d ff ff ff e8 16 30 6f fe eb 05 e8 0f 30 6f fe e8 9a RSP: 0018:ffffc90001663880 EFLAGS: 00010293 RAX: ffffffff82de1a6c RBX: 0000000000000000 RCX: ffff88800722b500 RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000 RBP: ffff8880158b22d0 R08: 0000000000010425 R09: ffffffffffffffff R10: ffffffff82de18ba R11: 0000000000000000 R12: ffff88800641a640 R13: ffff8880158b1880 R14: ffff88801ec3c900 R15: ffff88800641a650 FS: 00005555722c3500(0000) GS:ffff8880f909d000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f66346e0f60 CR3: 000000001607c000 CR4: 0000000000350ef0 Call Trace: genl_family_rcv_msg_doit+0x117/0x180 net/netlink/genetlink.c:1115 genl_family_rcv_msg net/netlink/genetlink.c:1195 [inline] genl_rcv_msg+0x3a8/0x3f0 net/netlink/genetlink.c:1210 netlink_rcv_skb+0x16d/0x240 net/netlink/af_netlink.c:2550 genl_rcv+0x28/0x40 net/netlink/genetlink.c:1219 netlink_unicast_kernel net/netlink/af_netlink.c:1318 [inline] netlink_unicast+0x3e9/0x4c0 net/netlink/af_netlink.c:1344 netlink_sendmsg+0x4aa/0x5b0 net/netlink/af_netlink.c:1894 sock_sendmsg_nosec net/socket.c:727 [inline] __sock_sendmsg+0xc9/0xf0 net/socket.c:742 ____sys_sendmsg+0x272/0x3b0 net/socket.c:2592 ___sys_sendmsg+0x2de/0x320 net/socket.c:2646 __sys_sendmsg net/socket.c:2678 [inline] __do_sys_sendmsg net/socket.c:2683 [inline] __se_sys_sendmsg net/socket.c:2681 [inline] __x64_sys_sendmsg+0x110/0x1a0 net/socket.c:2681 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] do_syscall_64+0x143/0x440 arch/x86/entry/syscall_64.c:94 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7f66346f826d Code: ff c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 e8 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007ffc83d8bdc8 EFLAGS: 00000246 ORIG_RAX: 000000000000002e RAX: ffffffffffffffda RBX: 00007f6634985fa0 RCX: 00007f66346f826d RDX: 00000000040000b0 RSI: 0000200000000740 RDI: 0000000000000007 RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 00007f6634985fa8 R13: 00007f6634985fac R14: 0000000000000000 R15: 0000000000001770 The actions that caused that seem to be: - Set the MPTCP subflows limit to 0 - Create an MPTCP endpoint with both the 'signal' and 'subflow' flags - Create a new MPTCP connection from a different address: an ADD_ADDR linked to the MPTCP endpoint will be sent ('signal' flag), but no subflows is initiated ('subflow' flag) - Remove the MPTCP endpoint In this case, msk->pm.local_addr_used has been kept to 0 -- because no subflows have been created -- but the corresponding bit in msk->pm.id_avail_bitmap has been cleared when the ADD_ADDR has been sent. This later causes a splat when removing the MPTCP endpoint because msk->pm.local_addr_used has been kept to 0. Now, if an endpoint has both the signal and subflow flags, but it is not possible to create subflows because of the limits or the c-flag case, then the local endpoint counter is still incremented: the endpoint is used at the end. This avoids issues later when removing the endpoint and calling __mark_subflow_endp_available(), which expects msk->pm.local_addr_used to have been previously incremented if the endpoint was marked as used according to msk->pm.id_avail_bitmap. Note that signal_and_subflow variable is reset to false when the limits and the c-flag case allows subflows creation. Also, local_addr_used is only incremented for non ID0 subflows. Fixes: 85df533a787b ("mptcp: pm: do not ignore 'subflow' if 'signal' flag is also set") Cc: stable@vger.kernel.org Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/613 Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20260303-net-mptcp-misc-fixes-7-0-rc2-v1-4-4b5462b6f016@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/pm_kernel.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'net') diff --git a/net/mptcp/pm_kernel.c b/net/mptcp/pm_kernel.c index b5316a6c7d1b..b2b9df43960e 100644 --- a/net/mptcp/pm_kernel.c +++ b/net/mptcp/pm_kernel.c @@ -418,6 +418,15 @@ subflow: } exit: + /* If an endpoint has both the signal and subflow flags, but it is not + * possible to create subflows -- the 'while' loop body above never + * executed -- then still mark the endp as used, which is somehow the + * case. This avoids issues later when removing the endpoint and calling + * __mark_subflow_endp_available(), which expects the increment here. + */ + if (signal_and_subflow && local.addr.id != msk->mpc_endpoint_id) + msk->pm.local_addr_used++; + mptcp_pm_nl_check_work_pending(msk); } -- cgit v1.2.3 From b824c3e16c1904bf80df489e293d1e3cbf98896d Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Mon, 2 Mar 2026 17:26:31 +0100 Subject: net: Provide a PREEMPT_RT specific check for netdev_queue::_xmit_lock After acquiring netdev_queue::_xmit_lock the number of the CPU owning the lock is recorded in netdev_queue::xmit_lock_owner. This works as long as the BH context is not preemptible. On PREEMPT_RT the softirq context is preemptible and without the softirq-lock it is possible to have multiple user in __dev_queue_xmit() submitting a skb on the same CPU. This is fine in general but this means also that the current CPU is recorded as netdev_queue::xmit_lock_owner. This in turn leads to the recursion alert and the skb is dropped. Instead checking the for CPU number, that owns the lock, PREEMPT_RT can check if the lockowner matches the current task. Add netif_tx_owned() which returns true if the current context owns the lock by comparing the provided CPU number with the recorded number. This resembles the current check by negating the condition (the current check returns true if the lock is not owned). On PREEMPT_RT use rt_mutex_owner() to return the lock owner and compare the current task against it. Use the new helper in __dev_queue_xmit() and netif_local_xmit_active() which provides a similar check. Update comments regarding pairing READ_ONCE(). Reported-by: Bert Karwatzki Closes: https://lore.kernel.org/all/20260216134333.412332-1-spasswolf@web.de Fixes: 3253cb49cbad4 ("softirq: Allow to drop the softirq-BKL lock on PREEMPT_RT") Signed-off-by: Sebastian Andrzej Siewior Reported-by: Bert Karwatzki Signed-off-by: Sebastian Andrzej Siewior Link: https://patch.msgid.link/20260302162631.uGUyIqDT@linutronix.de Signed-off-by: Paolo Abeni --- net/core/dev.c | 5 +---- net/core/netpoll.c | 2 +- 2 files changed, 2 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 20610a192ec7..14a83f2035b9 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4818,10 +4818,7 @@ int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev) if (dev->flags & IFF_UP) { int cpu = smp_processor_id(); /* ok because BHs are off */ - /* Other cpus might concurrently change txq->xmit_lock_owner - * to -1 or to their cpu id, but not to our id. - */ - if (READ_ONCE(txq->xmit_lock_owner) != cpu) { + if (!netif_tx_owned(txq, cpu)) { bool is_list = false; if (dev_xmit_recursion()) diff --git a/net/core/netpoll.c b/net/core/netpoll.c index a8558a52884f..cd74beffd209 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -132,7 +132,7 @@ static int netif_local_xmit_active(struct net_device *dev) for (i = 0; i < dev->num_tx_queues; i++) { struct netdev_queue *txq = netdev_get_tx_queue(dev, i); - if (READ_ONCE(txq->xmit_lock_owner) == smp_processor_id()) + if (netif_tx_owned(txq, smp_processor_id())) return 1; } -- cgit v1.2.3 From def602e498a4f951da95c95b1b8ce8ae68aa733a Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 2 Mar 2026 23:12:37 +0100 Subject: netfilter: nf_tables: unconditionally bump set->nelems before insertion In case that the set is full, a new element gets published then removed without waiting for the RCU grace period, while RCU reader can be walking over it already. To address this issue, add the element transaction even if set is full, but toggle the set_full flag to report -ENFILE so the abort path safely unwinds the set to its previous state. As for element updates, decrement set->nelems to restore it. A simpler fix is to call synchronize_rcu() in the error path. However, with a large batch adding elements to already maxed-out set, this could cause noticeable slowdown of such batches. Fixes: 35d0ac9070ef ("netfilter: nf_tables: fix set->nelems counting with no NLM_F_EXCL") Reported-by: Inseo An Signed-off-by: Pablo Neira Ayuso Signed-off-by: Florian Westphal --- net/netfilter/nf_tables_api.c | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index fd7f7e4e2a43..df67932d3e09 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -7170,6 +7170,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, struct nft_data_desc desc; enum nft_registers dreg; struct nft_trans *trans; + bool set_full = false; u64 expiration; u64 timeout; int err, i; @@ -7461,10 +7462,18 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, if (err < 0) goto err_elem_free; + if (!(flags & NFT_SET_ELEM_CATCHALL)) { + unsigned int max = nft_set_maxsize(set), nelems; + + nelems = atomic_inc_return(&set->nelems); + if (nelems > max) + set_full = true; + } + trans = nft_trans_elem_alloc(ctx, NFT_MSG_NEWSETELEM, set); if (trans == NULL) { err = -ENOMEM; - goto err_elem_free; + goto err_set_size; } ext->genmask = nft_genmask_cur(ctx->net); @@ -7516,7 +7525,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, ue->priv = elem_priv; nft_trans_commit_list_add_elem(ctx->net, trans); - goto err_elem_free; + goto err_set_size; } } } @@ -7534,23 +7543,16 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, goto err_element_clash; } - if (!(flags & NFT_SET_ELEM_CATCHALL)) { - unsigned int max = nft_set_maxsize(set); - - if (!atomic_add_unless(&set->nelems, 1, max)) { - err = -ENFILE; - goto err_set_full; - } - } - nft_trans_container_elem(trans)->elems[0].priv = elem.priv; nft_trans_commit_list_add_elem(ctx->net, trans); - return 0; -err_set_full: - nft_setelem_remove(ctx->net, set, elem.priv); + return set_full ? -ENFILE : 0; + err_element_clash: kfree(trans); +err_set_size: + if (!(flags & NFT_SET_ELEM_CATCHALL)) + atomic_dec(&set->nelems); err_elem_free: nf_tables_set_elem_destroy(ctx, set, elem.priv); err_parse_data: -- cgit v1.2.3 From fb7fb4016300ac622c964069e286dc83166a5d52 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 2 Mar 2026 23:28:15 +0100 Subject: netfilter: nf_tables: clone set on flush only Syzbot with fault injection triggered a failing memory allocation with GFP_KERNEL which results in a WARN splat: iter.err WARNING: net/netfilter/nf_tables_api.c:845 at nft_map_deactivate+0x34e/0x3c0 net/netfilter/nf_tables_api.c:845, CPU#0: syz.0.17/5992 Modules linked in: CPU: 0 UID: 0 PID: 5992 Comm: syz.0.17 Not tainted syzkaller #0 PREEMPT(full) Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 02/12/2026 RIP: 0010:nft_map_deactivate+0x34e/0x3c0 net/netfilter/nf_tables_api.c:845 Code: 8b 05 86 5a 4e 09 48 3b 84 24 a0 00 00 00 75 62 48 8d 65 d8 5b 41 5c 41 5d 41 5e 41 5f 5d c3 cc cc cc cc cc e8 63 6d fa f7 90 <0f> 0b 90 43 +80 7c 35 00 00 0f 85 23 fe ff ff e9 26 fe ff ff 89 d9 RSP: 0018:ffffc900045af780 EFLAGS: 00010293 RAX: ffffffff89ca45bd RBX: 00000000fffffff4 RCX: ffff888028111e40 RDX: 0000000000000000 RSI: 00000000fffffff4 RDI: 0000000000000000 RBP: ffffc900045af870 R08: 0000000000400dc0 R09: 00000000ffffffff R10: dffffc0000000000 R11: fffffbfff1d141db R12: ffffc900045af7e0 R13: 1ffff920008b5f24 R14: dffffc0000000000 R15: ffffc900045af920 FS: 000055557a6a5500(0000) GS:ffff888125496000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fb5ea271fc0 CR3: 000000003269e000 CR4: 00000000003526f0 Call Trace: __nft_release_table+0xceb/0x11f0 net/netfilter/nf_tables_api.c:12115 nft_rcv_nl_event+0xc25/0xdb0 net/netfilter/nf_tables_api.c:12187 notifier_call_chain+0x19d/0x3a0 kernel/notifier.c:85 blocking_notifier_call_chain+0x6a/0x90 kernel/notifier.c:380 netlink_release+0x123b/0x1ad0 net/netlink/af_netlink.c:761 __sock_release net/socket.c:662 [inline] sock_close+0xc3/0x240 net/socket.c:1455 Restrict set clone to the flush set command in the preparation phase. Add NFT_ITER_UPDATE_CLONE and use it for this purpose, update the rbtree and pipapo backends to only clone the set when this iteration type is used. As for the existing NFT_ITER_UPDATE type, update the pipapo backend to use the existing set clone if available, otherwise use the existing set representation. After this update, there is no need to clone a set that is being deleted, this includes bound anonymous set. An alternative approach to NFT_ITER_UPDATE_CLONE is to add a .clone interface and call it from the flush set path. Reported-by: syzbot+4924a0edc148e8b4b342@syzkaller.appspotmail.com Fixes: 3f1d886cc7c3 ("netfilter: nft_set_pipapo: move cloning of match info to insert/removal path") Signed-off-by: Pablo Neira Ayuso Signed-off-by: Florian Westphal --- net/netfilter/nf_tables_api.c | 10 +++++++++- net/netfilter/nft_set_hash.c | 1 + net/netfilter/nft_set_pipapo.c | 11 +++++++++-- net/netfilter/nft_set_rbtree.c | 8 +++++--- 4 files changed, 24 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index df67932d3e09..058f7004cb2b 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -833,6 +833,11 @@ static void nft_map_catchall_deactivate(const struct nft_ctx *ctx, } } +/* Use NFT_ITER_UPDATE iterator even if this may be called from the preparation + * phase, the set clone might already exist from a previous command, or it might + * be a set that is going away and does not require a clone. The netns and + * netlink release paths also need to work on the live set. + */ static void nft_map_deactivate(const struct nft_ctx *ctx, struct nft_set *set) { struct nft_set_iter iter = { @@ -7903,9 +7908,12 @@ static int nft_set_catchall_flush(const struct nft_ctx *ctx, static int nft_set_flush(struct nft_ctx *ctx, struct nft_set *set, u8 genmask) { + /* The set backend might need to clone the set, do it now from the + * preparation phase, use NFT_ITER_UPDATE_CLONE iterator type. + */ struct nft_set_iter iter = { .genmask = genmask, - .type = NFT_ITER_UPDATE, + .type = NFT_ITER_UPDATE_CLONE, .fn = nft_setelem_flush, }; diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c index 739b992bde59..b0e571c8e3f3 100644 --- a/net/netfilter/nft_set_hash.c +++ b/net/netfilter/nft_set_hash.c @@ -374,6 +374,7 @@ static void nft_rhash_walk(const struct nft_ctx *ctx, struct nft_set *set, { switch (iter->type) { case NFT_ITER_UPDATE: + case NFT_ITER_UPDATE_CLONE: /* only relevant for netlink dumps which use READ type */ WARN_ON_ONCE(iter->skip != 0); diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c index 7ef4b44471d3..c091898df710 100644 --- a/net/netfilter/nft_set_pipapo.c +++ b/net/netfilter/nft_set_pipapo.c @@ -2144,13 +2144,20 @@ static void nft_pipapo_walk(const struct nft_ctx *ctx, struct nft_set *set, const struct nft_pipapo_match *m; switch (iter->type) { - case NFT_ITER_UPDATE: + case NFT_ITER_UPDATE_CLONE: m = pipapo_maybe_clone(set); if (!m) { iter->err = -ENOMEM; return; } - + nft_pipapo_do_walk(ctx, set, m, iter); + break; + case NFT_ITER_UPDATE: + if (priv->clone) + m = priv->clone; + else + m = rcu_dereference_protected(priv->match, + nft_pipapo_transaction_mutex_held(set)); nft_pipapo_do_walk(ctx, set, m, iter); break; case NFT_ITER_READ: diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index 3f02e4478216..ee3d4f5b9ff7 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -861,13 +861,15 @@ static void nft_rbtree_walk(const struct nft_ctx *ctx, struct nft_rbtree *priv = nft_set_priv(set); switch (iter->type) { - case NFT_ITER_UPDATE: - lockdep_assert_held(&nft_pernet(ctx->net)->commit_mutex); - + case NFT_ITER_UPDATE_CLONE: if (nft_array_may_resize(set) < 0) { iter->err = -ENOMEM; break; } + fallthrough; + case NFT_ITER_UPDATE: + lockdep_assert_held(&nft_pernet(ctx->net)->commit_mutex); + nft_rbtree_do_walk(ctx, set, iter); break; case NFT_ITER_READ: -- cgit v1.2.3 From 9df95785d3d8302f7c066050117b04cd3c2048c2 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 3 Mar 2026 16:31:32 +0100 Subject: netfilter: nft_set_pipapo: split gc into unlink and reclaim phase Yiming Qian reports Use-after-free in the pipapo set type: Under a large number of expired elements, commit-time GC can run for a very long time in a non-preemptible context, triggering soft lockup warnings and RCU stall reports (local denial of service). We must split GC in an unlink and a reclaim phase. We cannot queue elements for freeing until pointers have been swapped. Expired elements are still exposed to both the packet path and userspace dumpers via the live copy of the data structure. call_rcu() does not protect us: dump operations or element lookups starting after call_rcu has fired can still observe the free'd element, unless the commit phase has made enough progress to swap the clone and live pointers before any new reader has picked up the old version. This a similar approach as done recently for the rbtree backend in commit 35f83a75529a ("netfilter: nft_set_rbtree: don't gc elements on insert"). Fixes: 3c4287f62044 ("nf_tables: Add set type for arbitrary concatenation of ranges") Reported-by: Yiming Qian Signed-off-by: Florian Westphal --- net/netfilter/nf_tables_api.c | 5 ----- net/netfilter/nft_set_pipapo.c | 51 +++++++++++++++++++++++++++++++++++------- net/netfilter/nft_set_pipapo.h | 2 ++ 3 files changed, 45 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 058f7004cb2b..1862bd7fe804 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -10493,11 +10493,6 @@ static void nft_trans_gc_queue_work(struct nft_trans_gc *trans) schedule_work(&trans_gc_work); } -static int nft_trans_gc_space(struct nft_trans_gc *trans) -{ - return NFT_TRANS_GC_BATCHCOUNT - trans->count; -} - struct nft_trans_gc *nft_trans_gc_queue_async(struct nft_trans_gc *gc, unsigned int gc_seq, gfp_t gfp) { diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c index c091898df710..a34632ae6048 100644 --- a/net/netfilter/nft_set_pipapo.c +++ b/net/netfilter/nft_set_pipapo.c @@ -1680,11 +1680,11 @@ static void nft_pipapo_gc_deactivate(struct net *net, struct nft_set *set, } /** - * pipapo_gc() - Drop expired entries from set, destroy start and end elements + * pipapo_gc_scan() - Drop expired entries from set and link them to gc list * @set: nftables API set representation * @m: Matching data */ -static void pipapo_gc(struct nft_set *set, struct nft_pipapo_match *m) +static void pipapo_gc_scan(struct nft_set *set, struct nft_pipapo_match *m) { struct nft_pipapo *priv = nft_set_priv(set); struct net *net = read_pnet(&set->net); @@ -1697,6 +1697,8 @@ static void pipapo_gc(struct nft_set *set, struct nft_pipapo_match *m) if (!gc) return; + list_add(&gc->list, &priv->gc_head); + while ((rules_f0 = pipapo_rules_same_key(m->f, first_rule))) { union nft_pipapo_map_bucket rulemap[NFT_PIPAPO_MAX_FIELDS]; const struct nft_pipapo_field *f; @@ -1724,9 +1726,13 @@ static void pipapo_gc(struct nft_set *set, struct nft_pipapo_match *m) * NFT_SET_ELEM_DEAD_BIT. */ if (__nft_set_elem_expired(&e->ext, tstamp)) { - gc = nft_trans_gc_queue_sync(gc, GFP_KERNEL); - if (!gc) - return; + if (!nft_trans_gc_space(gc)) { + gc = nft_trans_gc_alloc(set, 0, GFP_KERNEL); + if (!gc) + return; + + list_add(&gc->list, &priv->gc_head); + } nft_pipapo_gc_deactivate(net, set, e); pipapo_drop(m, rulemap); @@ -1740,10 +1746,30 @@ static void pipapo_gc(struct nft_set *set, struct nft_pipapo_match *m) } } - gc = nft_trans_gc_catchall_sync(gc); + priv->last_gc = jiffies; +} + +/** + * pipapo_gc_queue() - Free expired elements + * @set: nftables API set representation + */ +static void pipapo_gc_queue(struct nft_set *set) +{ + struct nft_pipapo *priv = nft_set_priv(set); + struct nft_trans_gc *gc, *next; + + /* always do a catchall cycle: */ + gc = nft_trans_gc_alloc(set, 0, GFP_KERNEL); if (gc) { + gc = nft_trans_gc_catchall_sync(gc); + if (gc) + nft_trans_gc_queue_sync_done(gc); + } + + /* always purge queued gc elements. */ + list_for_each_entry_safe(gc, next, &priv->gc_head, list) { + list_del(&gc->list); nft_trans_gc_queue_sync_done(gc); - priv->last_gc = jiffies; } } @@ -1797,6 +1823,10 @@ static void pipapo_reclaim_match(struct rcu_head *rcu) * * We also need to create a new working copy for subsequent insertions and * deletions. + * + * After the live copy has been replaced by the clone, we can safely queue + * expired elements that have been collected by pipapo_gc_scan() for + * memory reclaim. */ static void nft_pipapo_commit(struct nft_set *set) { @@ -1807,7 +1837,7 @@ static void nft_pipapo_commit(struct nft_set *set) return; if (time_after_eq(jiffies, priv->last_gc + nft_set_gc_interval(set))) - pipapo_gc(set, priv->clone); + pipapo_gc_scan(set, priv->clone); old = rcu_replace_pointer(priv->match, priv->clone, nft_pipapo_transaction_mutex_held(set)); @@ -1815,6 +1845,8 @@ static void nft_pipapo_commit(struct nft_set *set) if (old) call_rcu(&old->rcu, pipapo_reclaim_match); + + pipapo_gc_queue(set); } static void nft_pipapo_abort(const struct nft_set *set) @@ -2279,6 +2311,7 @@ static int nft_pipapo_init(const struct nft_set *set, f->mt = NULL; } + INIT_LIST_HEAD(&priv->gc_head); rcu_assign_pointer(priv->match, m); return 0; @@ -2328,6 +2361,8 @@ static void nft_pipapo_destroy(const struct nft_ctx *ctx, struct nft_pipapo *priv = nft_set_priv(set); struct nft_pipapo_match *m; + WARN_ON_ONCE(!list_empty(&priv->gc_head)); + m = rcu_dereference_protected(priv->match, true); if (priv->clone) { diff --git a/net/netfilter/nft_set_pipapo.h b/net/netfilter/nft_set_pipapo.h index eaab422aa56a..9aee9a9eaeb7 100644 --- a/net/netfilter/nft_set_pipapo.h +++ b/net/netfilter/nft_set_pipapo.h @@ -156,12 +156,14 @@ struct nft_pipapo_match { * @clone: Copy where pending insertions and deletions are kept * @width: Total bytes to be matched for one packet, including padding * @last_gc: Timestamp of last garbage collection run, jiffies + * @gc_head: list of nft_trans_gc to queue up for mem reclaim */ struct nft_pipapo { struct nft_pipapo_match __rcu *match; struct nft_pipapo_match *clone; int width; unsigned long last_gc; + struct list_head gc_head; }; struct nft_pipapo_elem; -- cgit v1.2.3 From e5e890630533bdc15b26a34bb8e7ef539bdf1322 Mon Sep 17 00:00:00 2001 From: Fernando Fernandez Mancera Date: Wed, 4 Mar 2026 13:03:56 +0100 Subject: net: bridge: fix nd_tbl NULL dereference when IPv6 is disabled When booting with the 'ipv6.disable=1' parameter, the nd_tbl is never initialized because inet6_init() exits before ndisc_init() is called which initializes it. Then, if neigh_suppress is enabled and an ICMPv6 Neighbor Discovery packet reaches the bridge, br_do_suppress_nd() will dereference ipv6_stub->nd_tbl which is NULL, passing it to neigh_lookup(). This causes a kernel NULL pointer dereference. BUG: kernel NULL pointer dereference, address: 0000000000000268 Oops: 0000 [#1] PREEMPT SMP NOPTI [...] RIP: 0010:neigh_lookup+0x16/0xe0 [...] Call Trace: ? neigh_lookup+0x16/0xe0 br_do_suppress_nd+0x160/0x290 [bridge] br_handle_frame_finish+0x500/0x620 [bridge] br_handle_frame+0x353/0x440 [bridge] __netif_receive_skb_core.constprop.0+0x298/0x1110 __netif_receive_skb_one_core+0x3d/0xa0 process_backlog+0xa0/0x140 __napi_poll+0x2c/0x170 net_rx_action+0x2c4/0x3a0 handle_softirqs+0xd0/0x270 do_softirq+0x3f/0x60 Fix this by replacing IS_ENABLED(IPV6) call with ipv6_mod_enabled() in the callers. This is in essence disabling NS/NA suppression when IPv6 is disabled. Fixes: ed842faeb2bd ("bridge: suppress nd pkts on BR_NEIGH_SUPPRESS ports") Reported-by: Guruprasad C P Closes: https://lore.kernel.org/netdev/CAHXs0ORzd62QOG-Fttqa2Cx_A_VFp=utE2H2VTX5nqfgs7LDxQ@mail.gmail.com/ Signed-off-by: Fernando Fernandez Mancera Reviewed-by: Ido Schimmel Acked-by: Nikolay Aleksandrov Link: https://patch.msgid.link/20260304120357.9778-1-fmancera@suse.de Signed-off-by: Jakub Kicinski --- net/bridge/br_device.c | 2 +- net/bridge/br_input.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index ee01122f466f..f7502e62dd35 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -74,7 +74,7 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev) eth_hdr(skb)->h_proto == htons(ETH_P_RARP)) && br_opt_get(br, BROPT_NEIGH_SUPPRESS_ENABLED)) { br_do_proxy_suppress_arp(skb, br, vid, NULL); - } else if (IS_ENABLED(CONFIG_IPV6) && + } else if (ipv6_mod_enabled() && skb->protocol == htons(ETH_P_IPV6) && br_opt_get(br, BROPT_NEIGH_SUPPRESS_ENABLED) && pskb_may_pull(skb, sizeof(struct ipv6hdr) + diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index 1405f1061a54..2cbae0f9ae1f 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -170,7 +170,7 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb (skb->protocol == htons(ETH_P_ARP) || skb->protocol == htons(ETH_P_RARP))) { br_do_proxy_suppress_arp(skb, br, vid, p); - } else if (IS_ENABLED(CONFIG_IPV6) && + } else if (ipv6_mod_enabled() && skb->protocol == htons(ETH_P_IPV6) && br_opt_get(br, BROPT_NEIGH_SUPPRESS_ENABLED) && pskb_may_pull(skb, sizeof(struct ipv6hdr) + -- cgit v1.2.3 From 21ec92774d1536f71bdc90b0e3d052eff99cf093 Mon Sep 17 00:00:00 2001 From: Jiayuan Chen Date: Wed, 4 Mar 2026 19:38:13 +0800 Subject: net: ipv6: fix panic when IPv4 route references loopback IPv6 nexthop When a standalone IPv6 nexthop object is created with a loopback device (e.g., "ip -6 nexthop add id 100 dev lo"), fib6_nh_init() misclassifies it as a reject route. This is because nexthop objects have no destination prefix (fc_dst=::), causing fib6_is_reject() to match any loopback nexthop. The reject path skips fib_nh_common_init(), leaving nhc_pcpu_rth_output unallocated. If an IPv4 route later references this nexthop, __mkroute_output() dereferences NULL nhc_pcpu_rth_output and panics. Simplify the check in fib6_nh_init() to only match explicit reject routes (RTF_REJECT) instead of using fib6_is_reject(). The loopback promotion heuristic in fib6_is_reject() is handled separately by ip6_route_info_create_nh(). After this change, the three cases behave as follows: 1. Explicit reject route ("ip -6 route add unreachable 2001:db8::/64"): RTF_REJECT is set, enters reject path, skips fib_nh_common_init(). No behavior change. 2. Implicit loopback reject route ("ip -6 route add 2001:db8::/32 dev lo"): RTF_REJECT is not set, takes normal path, fib_nh_common_init() is called. ip6_route_info_create_nh() still promotes it to reject afterward. nhc_pcpu_rth_output is allocated but unused, which is harmless. 3. Standalone nexthop object ("ip -6 nexthop add id 100 dev lo"): RTF_REJECT is not set, takes normal path, fib_nh_common_init() is called. nhc_pcpu_rth_output is properly allocated, fixing the crash when IPv4 routes reference this nexthop. Suggested-by: Ido Schimmel Fixes: 493ced1ac47c ("ipv4: Allow routes to use nexthop objects") Reported-by: syzbot+334190e097a98a1b81bb@syzkaller.appspotmail.com Closes: https://lore.kernel.org/all/698f8482.a70a0220.2c38d7.00ca.GAE@google.com/T/ Signed-off-by: Jiayuan Chen Reviewed-by: Ido Schimmel Reviewed-by: David Ahern Link: https://patch.msgid.link/20260304113817.294966-2-jiayuan.chen@linux.dev Signed-off-by: Jakub Kicinski --- net/ipv6/route.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 7db0c837196c..08cd86f49bf9 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -3583,7 +3583,6 @@ int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh, netdevice_tracker *dev_tracker = &fib6_nh->fib_nh_dev_tracker; struct net_device *dev = NULL; struct inet6_dev *idev = NULL; - int addr_type; int err; fib6_nh->fib_nh_family = AF_INET6; @@ -3625,11 +3624,10 @@ int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh, fib6_nh->fib_nh_weight = 1; - /* We cannot add true routes via loopback here, - * they would result in kernel looping; promote them to reject routes + /* Reset the nexthop device to the loopback device in case of reject + * routes. */ - addr_type = ipv6_addr_type(&cfg->fc_dst); - if (fib6_is_reject(cfg->fc_flags, dev, addr_type)) { + if (cfg->fc_flags & RTF_REJECT) { /* hold loopback dev/idev if we haven't done so. */ if (dev != net->loopback_dev) { if (dev) { -- cgit v1.2.3 From e2cedd400c3ec0302ffca2490e8751772906ac23 Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Wed, 4 Mar 2026 09:06:02 -0500 Subject: net/sched: act_ife: Fix metalist update behavior Whenever an ife action replace changes the metalist, instead of replacing the old data on the metalist, the current ife code is appending the new metadata. Aside from being innapropriate behavior, this may lead to an unbounded addition of metadata to the metalist which might cause an out of bounds error when running the encode op: [ 138.423369][ C1] ================================================================== [ 138.424317][ C1] BUG: KASAN: slab-out-of-bounds in ife_tlv_meta_encode (net/ife/ife.c:168) [ 138.424906][ C1] Write of size 4 at addr ffff8880077f4ffe by task ife_out_out_bou/255 [ 138.425778][ C1] CPU: 1 UID: 0 PID: 255 Comm: ife_out_out_bou Not tainted 7.0.0-rc1-00169-gfbdfa8da05b6 #624 PREEMPT(full) [ 138.425795][ C1] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 [ 138.425800][ C1] Call Trace: [ 138.425804][ C1] [ 138.425808][ C1] dump_stack_lvl (lib/dump_stack.c:122) [ 138.425828][ C1] print_report (mm/kasan/report.c:379 mm/kasan/report.c:482) [ 138.425839][ C1] ? srso_alias_return_thunk (arch/x86/lib/retpoline.S:221) [ 138.425844][ C1] ? __virt_addr_valid (./arch/x86/include/asm/preempt.h:95 (discriminator 1) ./include/linux/rcupdate.h:975 (discriminator 1) ./include/linux/mmzone.h:2207 (discriminator 1) arch/x86/mm/physaddr.c:54 (discriminator 1)) [ 138.425853][ C1] ? ife_tlv_meta_encode (net/ife/ife.c:168) [ 138.425859][ C1] kasan_report (mm/kasan/report.c:221 mm/kasan/report.c:597) [ 138.425868][ C1] ? ife_tlv_meta_encode (net/ife/ife.c:168) [ 138.425878][ C1] kasan_check_range (mm/kasan/generic.c:186 (discriminator 1) mm/kasan/generic.c:200 (discriminator 1)) [ 138.425884][ C1] __asan_memset (mm/kasan/shadow.c:84 (discriminator 2)) [ 138.425889][ C1] ife_tlv_meta_encode (net/ife/ife.c:168) [ 138.425893][ C1] ? ife_tlv_meta_encode (net/ife/ife.c:171) [ 138.425898][ C1] ? srso_alias_return_thunk (arch/x86/lib/retpoline.S:221) [ 138.425903][ C1] ife_encode_meta_u16 (net/sched/act_ife.c:57) [ 138.425910][ C1] ? __pfx_do_raw_spin_lock (kernel/locking/spinlock_debug.c:114) [ 138.425916][ C1] ? __asan_memcpy (mm/kasan/shadow.c:105 (discriminator 3)) [ 138.425921][ C1] ? __pfx_ife_encode_meta_u16 (net/sched/act_ife.c:45) [ 138.425927][ C1] ? srso_alias_return_thunk (arch/x86/lib/retpoline.S:221) [ 138.425931][ C1] tcf_ife_act (net/sched/act_ife.c:847 net/sched/act_ife.c:879) To solve this issue, fix the replace behavior by adding the metalist to the ife rcu data structure. Fixes: aa9fd9a325d51 ("sched: act: ife: update parameters via rcu handling") Reported-by: Ruitong Liu Tested-by: Ruitong Liu Co-developed-by: Victor Nogueira Signed-off-by: Victor Nogueira Signed-off-by: Jamal Hadi Salim Link: https://patch.msgid.link/20260304140603.76500-1-jhs@mojatatu.com Signed-off-by: Jakub Kicinski --- net/sched/act_ife.c | 93 +++++++++++++++++++++++++---------------------------- 1 file changed, 44 insertions(+), 49 deletions(-) (limited to 'net') diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c index 79df81d12894..d5e8a91bb4eb 100644 --- a/net/sched/act_ife.c +++ b/net/sched/act_ife.c @@ -293,8 +293,8 @@ static int load_metaops_and_vet(u32 metaid, void *val, int len, bool rtnl_held) /* called when adding new meta information */ static int __add_metainfo(const struct tcf_meta_ops *ops, - struct tcf_ife_info *ife, u32 metaid, void *metaval, - int len, bool atomic, bool exists) + struct tcf_ife_params *p, u32 metaid, void *metaval, + int len, bool atomic) { struct tcf_meta_info *mi = NULL; int ret = 0; @@ -313,45 +313,40 @@ static int __add_metainfo(const struct tcf_meta_ops *ops, } } - if (exists) - spin_lock_bh(&ife->tcf_lock); - list_add_tail(&mi->metalist, &ife->metalist); - if (exists) - spin_unlock_bh(&ife->tcf_lock); + list_add_tail(&mi->metalist, &p->metalist); return ret; } static int add_metainfo_and_get_ops(const struct tcf_meta_ops *ops, - struct tcf_ife_info *ife, u32 metaid, - bool exists) + struct tcf_ife_params *p, u32 metaid) { int ret; if (!try_module_get(ops->owner)) return -ENOENT; - ret = __add_metainfo(ops, ife, metaid, NULL, 0, true, exists); + ret = __add_metainfo(ops, p, metaid, NULL, 0, true); if (ret) module_put(ops->owner); return ret; } -static int add_metainfo(struct tcf_ife_info *ife, u32 metaid, void *metaval, - int len, bool exists) +static int add_metainfo(struct tcf_ife_params *p, u32 metaid, void *metaval, + int len) { const struct tcf_meta_ops *ops = find_ife_oplist(metaid); int ret; if (!ops) return -ENOENT; - ret = __add_metainfo(ops, ife, metaid, metaval, len, false, exists); + ret = __add_metainfo(ops, p, metaid, metaval, len, false); if (ret) /*put back what find_ife_oplist took */ module_put(ops->owner); return ret; } -static int use_all_metadata(struct tcf_ife_info *ife, bool exists) +static int use_all_metadata(struct tcf_ife_params *p) { struct tcf_meta_ops *o; int rc = 0; @@ -359,7 +354,7 @@ static int use_all_metadata(struct tcf_ife_info *ife, bool exists) read_lock(&ife_mod_lock); list_for_each_entry(o, &ifeoplist, list) { - rc = add_metainfo_and_get_ops(o, ife, o->metaid, exists); + rc = add_metainfo_and_get_ops(o, p, o->metaid); if (rc == 0) installed += 1; } @@ -371,7 +366,7 @@ static int use_all_metadata(struct tcf_ife_info *ife, bool exists) return -EINVAL; } -static int dump_metalist(struct sk_buff *skb, struct tcf_ife_info *ife) +static int dump_metalist(struct sk_buff *skb, struct tcf_ife_params *p) { struct tcf_meta_info *e; struct nlattr *nest; @@ -379,14 +374,14 @@ static int dump_metalist(struct sk_buff *skb, struct tcf_ife_info *ife) int total_encoded = 0; /*can only happen on decode */ - if (list_empty(&ife->metalist)) + if (list_empty(&p->metalist)) return 0; nest = nla_nest_start_noflag(skb, TCA_IFE_METALST); if (!nest) goto out_nlmsg_trim; - list_for_each_entry(e, &ife->metalist, metalist) { + list_for_each_entry(e, &p->metalist, metalist) { if (!e->ops->get(skb, e)) total_encoded += 1; } @@ -403,13 +398,11 @@ out_nlmsg_trim: return -1; } -/* under ife->tcf_lock */ -static void _tcf_ife_cleanup(struct tc_action *a) +static void __tcf_ife_cleanup(struct tcf_ife_params *p) { - struct tcf_ife_info *ife = to_ife(a); struct tcf_meta_info *e, *n; - list_for_each_entry_safe(e, n, &ife->metalist, metalist) { + list_for_each_entry_safe(e, n, &p->metalist, metalist) { list_del(&e->metalist); if (e->metaval) { if (e->ops->release) @@ -422,18 +415,23 @@ static void _tcf_ife_cleanup(struct tc_action *a) } } +static void tcf_ife_cleanup_params(struct rcu_head *head) +{ + struct tcf_ife_params *p = container_of(head, struct tcf_ife_params, + rcu); + + __tcf_ife_cleanup(p); + kfree(p); +} + static void tcf_ife_cleanup(struct tc_action *a) { struct tcf_ife_info *ife = to_ife(a); struct tcf_ife_params *p; - spin_lock_bh(&ife->tcf_lock); - _tcf_ife_cleanup(a); - spin_unlock_bh(&ife->tcf_lock); - p = rcu_dereference_protected(ife->params, 1); if (p) - kfree_rcu(p, rcu); + call_rcu(&p->rcu, tcf_ife_cleanup_params); } static int load_metalist(struct nlattr **tb, bool rtnl_held) @@ -455,8 +453,7 @@ static int load_metalist(struct nlattr **tb, bool rtnl_held) return 0; } -static int populate_metalist(struct tcf_ife_info *ife, struct nlattr **tb, - bool exists, bool rtnl_held) +static int populate_metalist(struct tcf_ife_params *p, struct nlattr **tb) { int len = 0; int rc = 0; @@ -468,7 +465,7 @@ static int populate_metalist(struct tcf_ife_info *ife, struct nlattr **tb, val = nla_data(tb[i]); len = nla_len(tb[i]); - rc = add_metainfo(ife, i, val, len, exists); + rc = add_metainfo(p, i, val, len); if (rc) return rc; } @@ -523,6 +520,7 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla, p = kzalloc_obj(*p); if (!p) return -ENOMEM; + INIT_LIST_HEAD(&p->metalist); if (tb[TCA_IFE_METALST]) { err = nla_parse_nested_deprecated(tb2, IFE_META_MAX, @@ -567,8 +565,6 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla, } ife = to_ife(*a); - if (ret == ACT_P_CREATED) - INIT_LIST_HEAD(&ife->metalist); err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack); if (err < 0) @@ -600,8 +596,7 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla, } if (tb[TCA_IFE_METALST]) { - err = populate_metalist(ife, tb2, exists, - !(flags & TCA_ACT_FLAGS_NO_RTNL)); + err = populate_metalist(p, tb2); if (err) goto metadata_parse_err; } else { @@ -610,7 +605,7 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla, * as we can. You better have at least one else we are * going to bail out */ - err = use_all_metadata(ife, exists); + err = use_all_metadata(p); if (err) goto metadata_parse_err; } @@ -626,13 +621,14 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla, if (goto_ch) tcf_chain_put_by_act(goto_ch); if (p) - kfree_rcu(p, rcu); + call_rcu(&p->rcu, tcf_ife_cleanup_params); return ret; metadata_parse_err: if (goto_ch) tcf_chain_put_by_act(goto_ch); release_idr: + __tcf_ife_cleanup(p); kfree(p); tcf_idr_release(*a, bind); return err; @@ -679,7 +675,7 @@ static int tcf_ife_dump(struct sk_buff *skb, struct tc_action *a, int bind, if (nla_put(skb, TCA_IFE_TYPE, 2, &p->eth_type)) goto nla_put_failure; - if (dump_metalist(skb, ife)) { + if (dump_metalist(skb, p)) { /*ignore failure to dump metalist */ pr_info("Failed to dump metalist\n"); } @@ -693,13 +689,13 @@ nla_put_failure: return -1; } -static int find_decode_metaid(struct sk_buff *skb, struct tcf_ife_info *ife, +static int find_decode_metaid(struct sk_buff *skb, struct tcf_ife_params *p, u16 metaid, u16 mlen, void *mdata) { struct tcf_meta_info *e; /* XXX: use hash to speed up */ - list_for_each_entry(e, &ife->metalist, metalist) { + list_for_each_entry_rcu(e, &p->metalist, metalist) { if (metaid == e->metaid) { if (e->ops) { /* We check for decode presence already */ @@ -716,10 +712,13 @@ static int tcf_ife_decode(struct sk_buff *skb, const struct tc_action *a, { struct tcf_ife_info *ife = to_ife(a); int action = ife->tcf_action; + struct tcf_ife_params *p; u8 *ifehdr_end; u8 *tlv_data; u16 metalen; + p = rcu_dereference_bh(ife->params); + bstats_update(this_cpu_ptr(ife->common.cpu_bstats), skb); tcf_lastuse_update(&ife->tcf_tm); @@ -745,7 +744,7 @@ static int tcf_ife_decode(struct sk_buff *skb, const struct tc_action *a, return TC_ACT_SHOT; } - if (find_decode_metaid(skb, ife, mtype, dlen, curr_data)) { + if (find_decode_metaid(skb, p, mtype, dlen, curr_data)) { /* abuse overlimits to count when we receive metadata * but dont have an ops for it */ @@ -769,12 +768,12 @@ static int tcf_ife_decode(struct sk_buff *skb, const struct tc_action *a, /*XXX: check if we can do this at install time instead of current * send data path **/ -static int ife_get_sz(struct sk_buff *skb, struct tcf_ife_info *ife) +static int ife_get_sz(struct sk_buff *skb, struct tcf_ife_params *p) { - struct tcf_meta_info *e, *n; + struct tcf_meta_info *e; int tot_run_sz = 0, run_sz = 0; - list_for_each_entry_safe(e, n, &ife->metalist, metalist) { + list_for_each_entry_rcu(e, &p->metalist, metalist) { if (e->ops->check_presence) { run_sz = e->ops->check_presence(skb, e); tot_run_sz += run_sz; @@ -795,7 +794,7 @@ static int tcf_ife_encode(struct sk_buff *skb, const struct tc_action *a, OUTERHDR:TOTMETALEN:{TLVHDR:Metadatum:TLVHDR..}:ORIGDATA where ORIGDATA = original ethernet header ... */ - u16 metalen = ife_get_sz(skb, ife); + u16 metalen = ife_get_sz(skb, p); int hdrm = metalen + skb->dev->hard_header_len + IFE_METAHDRLEN; unsigned int skboff = 0; int new_len = skb->len + hdrm; @@ -833,25 +832,21 @@ drop: if (!ife_meta) goto drop; - spin_lock(&ife->tcf_lock); - /* XXX: we dont have a clever way of telling encode to * not repeat some of the computations that are done by * ops->presence_check... */ - list_for_each_entry(e, &ife->metalist, metalist) { + list_for_each_entry_rcu(e, &p->metalist, metalist) { if (e->ops->encode) { err = e->ops->encode(skb, (void *)(ife_meta + skboff), e); } if (err < 0) { /* too corrupt to keep around if overwritten */ - spin_unlock(&ife->tcf_lock); goto drop; } skboff += err; } - spin_unlock(&ife->tcf_lock); oethh = (struct ethhdr *)skb->data; if (!is_zero_ether_addr(p->eth_src)) -- cgit v1.2.3 From 88b6b7f7b216108a09887b074395fa7b751880b1 Mon Sep 17 00:00:00 2001 From: Larysa Zaremba Date: Thu, 5 Mar 2026 12:12:42 +0100 Subject: xdp: use modulo operation to calculate XDP frag tailroom The current formula for calculating XDP tailroom in mbuf packets works only if each frag has its own page (if rxq->frag_size is PAGE_SIZE), this defeats the purpose of the parameter overall and without any indication leads to negative calculated tailroom on at least half of frags, if shared pages are used. There are not many drivers that set rxq->frag_size. Among them: * i40e and enetc always split page uniformly between frags, use shared pages * ice uses page_pool frags via libeth, those are power-of-2 and uniformly distributed across page * idpf has variable frag_size with XDP on, so current API is not applicable * mlx5, mtk and mvneta use PAGE_SIZE or 0 as frag_size for page_pool As for AF_XDP ZC, only ice, i40e and idpf declare frag_size for it. Modulo operation yields good results for aligned chunks, they are all power-of-2, between 2K and PAGE_SIZE. Formula without modulo fails when chunk_size is 2K. Buffers in unaligned mode are not distributed uniformly, so modulo operation would not work. To accommodate unaligned buffers, we could define frag_size as data + tailroom, and hence do not subtract offset when calculating tailroom, but this would necessitate more changes in the drivers. Define rxq->frag_size as an even portion of a page that fully belongs to a single frag. When calculating tailroom, locate the data start within such portion by performing a modulo operation on page offset. Fixes: bf25146a5595 ("bpf: add frags support to the bpf_xdp_adjust_tail() API") Acked-by: Jakub Kicinski Reviewed-by: Aleksandr Loktionov Signed-off-by: Larysa Zaremba Link: https://patch.msgid.link/20260305111253.2317394-2-larysa.zaremba@intel.com Signed-off-by: Jakub Kicinski --- net/core/filter.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index 0d5d5a17acb2..d6fafb3633b0 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4155,7 +4155,8 @@ static int bpf_xdp_frags_increase_tail(struct xdp_buff *xdp, int offset) if (!rxq->frag_size || rxq->frag_size > xdp->frame_sz) return -EOPNOTSUPP; - tailroom = rxq->frag_size - skb_frag_size(frag) - skb_frag_off(frag); + tailroom = rxq->frag_size - skb_frag_size(frag) - + skb_frag_off(frag) % rxq->frag_size; if (unlikely(offset > tailroom)) return -EINVAL; -- cgit v1.2.3 From 8821e857759be9db3cde337ad328b71fe5c8a55f Mon Sep 17 00:00:00 2001 From: Larysa Zaremba Date: Thu, 5 Mar 2026 12:12:50 +0100 Subject: xdp: produce a warning when calculated tailroom is negative MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Many ethernet drivers report xdp Rx queue frag size as being the same as DMA write size. However, the only user of this field, namely bpf_xdp_frags_increase_tail(), clearly expects a truesize. Such difference leads to unspecific memory corruption issues under certain circumstances, e.g. in ixgbevf maximum DMA write size is 3 KB, so when running xskxceiver's XDP_ADJUST_TAIL_GROW_MULTI_BUFF, 6K packet fully uses all DMA-writable space in 2 buffers. This would be fine, if only rxq->frag_size was properly set to 4K, but value of 3K results in a negative tailroom, because there is a non-zero page offset. We are supposed to return -EINVAL and be done with it in such case, but due to tailroom being stored as an unsigned int, it is reported to be somewhere near UINT_MAX, resulting in a tail being grown, even if the requested offset is too much (it is around 2K in the abovementioned test). This later leads to all kinds of unspecific calltraces. [ 7340.337579] xskxceiver[1440]: segfault at 1da718 ip 00007f4161aeac9d sp 00007f41615a6a00 error 6 [ 7340.338040] xskxceiver[1441]: segfault at 7f410000000b ip 00000000004042b5 sp 00007f415bffecf0 error 4 [ 7340.338179] in libc.so.6[61c9d,7f4161aaf000+160000] [ 7340.339230] in xskxceiver[42b5,400000+69000] [ 7340.340300] likely on CPU 6 (core 0, socket 6) [ 7340.340302] Code: ff ff 01 e9 f4 fe ff ff 0f 1f 44 00 00 4c 39 f0 74 73 31 c0 ba 01 00 00 00 f0 0f b1 17 0f 85 ba 00 00 00 49 8b 87 88 00 00 00 <4c> 89 70 08 eb cc 0f 1f 44 00 00 48 8d bd f0 fe ff ff 89 85 ec fe [ 7340.340888] likely on CPU 3 (core 0, socket 3) [ 7340.345088] Code: 00 00 00 ba 00 00 00 00 be 00 00 00 00 89 c7 e8 31 ca ff ff 89 45 ec 8b 45 ec 85 c0 78 07 b8 00 00 00 00 eb 46 e8 0b c8 ff ff <8b> 00 83 f8 69 74 24 e8 ff c7 ff ff 8b 00 83 f8 0b 74 18 e8 f3 c7 [ 7340.404334] Oops: general protection fault, probably for non-canonical address 0x6d255010bdffc: 0000 [#1] SMP NOPTI [ 7340.405972] CPU: 7 UID: 0 PID: 1439 Comm: xskxceiver Not tainted 6.19.0-rc1+ #21 PREEMPT(lazy) [ 7340.408006] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.17.0-5.fc42 04/01/2014 [ 7340.409716] RIP: 0010:lookup_swap_cgroup_id+0x44/0x80 [ 7340.410455] Code: 83 f8 1c 73 39 48 ba ff ff ff ff ff ff ff 03 48 8b 04 c5 20 55 fa bd 48 21 d1 48 89 ca 83 e1 01 48 d1 ea c1 e1 04 48 8d 04 90 <8b> 00 48 83 c4 10 d3 e8 c3 cc cc cc cc 31 c0 e9 98 b7 dd 00 48 89 [ 7340.412787] RSP: 0018:ffffcc5c04f7f6d0 EFLAGS: 00010202 [ 7340.413494] RAX: 0006d255010bdffc RBX: ffff891f477895a8 RCX: 0000000000000010 [ 7340.414431] RDX: 0001c17e3fffffff RSI: 00fa070000000000 RDI: 000382fc7fffffff [ 7340.415354] RBP: 00fa070000000000 R08: ffffcc5c04f7f8f8 R09: ffffcc5c04f7f7d0 [ 7340.416283] R10: ffff891f4c1a7000 R11: ffffcc5c04f7f9c8 R12: ffffcc5c04f7f7d0 [ 7340.417218] R13: 03ffffffffffffff R14: 00fa06fffffffe00 R15: ffff891f47789500 [ 7340.418229] FS: 0000000000000000(0000) GS:ffff891ffdfaa000(0000) knlGS:0000000000000000 [ 7340.419489] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 7340.420286] CR2: 00007f415bfffd58 CR3: 0000000103f03002 CR4: 0000000000772ef0 [ 7340.421237] PKRU: 55555554 [ 7340.421623] Call Trace: [ 7340.421987] [ 7340.422309] ? softleaf_from_pte+0x77/0xa0 [ 7340.422855] swap_pte_batch+0xa7/0x290 [ 7340.423363] zap_nonpresent_ptes.constprop.0.isra.0+0xd1/0x270 [ 7340.424102] zap_pte_range+0x281/0x580 [ 7340.424607] zap_pmd_range.isra.0+0xc9/0x240 [ 7340.425177] unmap_page_range+0x24d/0x420 [ 7340.425714] unmap_vmas+0xa1/0x180 [ 7340.426185] exit_mmap+0xe1/0x3b0 [ 7340.426644] __mmput+0x41/0x150 [ 7340.427098] exit_mm+0xb1/0x110 [ 7340.427539] do_exit+0x1b2/0x460 [ 7340.427992] do_group_exit+0x2d/0xc0 [ 7340.428477] get_signal+0x79d/0x7e0 [ 7340.428957] arch_do_signal_or_restart+0x34/0x100 [ 7340.429571] exit_to_user_mode_loop+0x8e/0x4c0 [ 7340.430159] do_syscall_64+0x188/0x6b0 [ 7340.430672] ? __do_sys_clone3+0xd9/0x120 [ 7340.431212] ? switch_fpu_return+0x4e/0xd0 [ 7340.431761] ? arch_exit_to_user_mode_prepare.isra.0+0xa1/0xc0 [ 7340.432498] ? do_syscall_64+0xbb/0x6b0 [ 7340.433015] ? __handle_mm_fault+0x445/0x690 [ 7340.433582] ? count_memcg_events+0xd6/0x210 [ 7340.434151] ? handle_mm_fault+0x212/0x340 [ 7340.434697] ? do_user_addr_fault+0x2b4/0x7b0 [ 7340.435271] ? clear_bhb_loop+0x30/0x80 [ 7340.435788] ? clear_bhb_loop+0x30/0x80 [ 7340.436299] ? clear_bhb_loop+0x30/0x80 [ 7340.436812] ? clear_bhb_loop+0x30/0x80 [ 7340.437323] entry_SYSCALL_64_after_hwframe+0x76/0x7e [ 7340.437973] RIP: 0033:0x7f4161b14169 [ 7340.438468] Code: Unable to access opcode bytes at 0x7f4161b1413f. [ 7340.439242] RSP: 002b:00007ffc6ebfa770 EFLAGS: 00000246 ORIG_RAX: 00000000000000ca [ 7340.440173] RAX: fffffffffffffe00 RBX: 00000000000005a1 RCX: 00007f4161b14169 [ 7340.441061] RDX: 00000000000005a1 RSI: 0000000000000109 RDI: 00007f415bfff990 [ 7340.441943] RBP: 00007ffc6ebfa7a0 R08: 0000000000000000 R09: 00000000ffffffff [ 7340.442824] R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000 [ 7340.443707] R13: 0000000000000000 R14: 00007f415bfff990 R15: 00007f415bfff6c0 [ 7340.444586] [ 7340.444922] Modules linked in: rfkill intel_rapl_msr intel_rapl_common intel_uncore_frequency_common skx_edac_common nfit libnvdimm kvm_intel vfat fat kvm snd_pcm irqbypass rapl iTCO_wdt snd_timer intel_pmc_bxt iTCO_vendor_support snd ixgbevf virtio_net soundcore i2c_i801 pcspkr libeth_xdp net_failover i2c_smbus lpc_ich failover libeth virtio_balloon joydev 9p fuse loop zram lz4hc_compress lz4_compress 9pnet_virtio 9pnet netfs ghash_clmulni_intel serio_raw qemu_fw_cfg [ 7340.449650] ---[ end trace 0000000000000000 ]--- The issue can be fixed in all in-tree drivers, but we cannot just trust OOT drivers to not do this. Therefore, make tailroom a signed int and produce a warning when it is negative to prevent such mistakes in the future. Fixes: bf25146a5595 ("bpf: add frags support to the bpf_xdp_adjust_tail() API") Reviewed-by: Aleksandr Loktionov Reviewed-by: Toke Høiland-Jørgensen Acked-by: Martin KaFai Lau Signed-off-by: Larysa Zaremba Link: https://patch.msgid.link/20260305111253.2317394-10-larysa.zaremba@intel.com Signed-off-by: Jakub Kicinski --- net/core/filter.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index d6fafb3633b0..a77d23fe2359 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4150,13 +4150,14 @@ static int bpf_xdp_frags_increase_tail(struct xdp_buff *xdp, int offset) struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp); skb_frag_t *frag = &sinfo->frags[sinfo->nr_frags - 1]; struct xdp_rxq_info *rxq = xdp->rxq; - unsigned int tailroom; + int tailroom; if (!rxq->frag_size || rxq->frag_size > xdp->frame_sz) return -EOPNOTSUPP; tailroom = rxq->frag_size - skb_frag_size(frag) - skb_frag_off(frag) % rxq->frag_size; + WARN_ON_ONCE(tailroom < 0); if (unlikely(offset > tailroom)) return -EINVAL; -- cgit v1.2.3 From 5c3398a54266541610c8d0a7082e654e9ff3e259 Mon Sep 17 00:00:00 2001 From: Jian Zhang Date: Thu, 5 Mar 2026 14:06:55 +0800 Subject: net: ncsi: fix skb leak in error paths Early return paths in NCSI RX and AEN handlers fail to release the received skb, resulting in a memory leak. Specifically, ncsi_aen_handler() returns on invalid AEN packets without consuming the skb. Similarly, ncsi_rcv_rsp() exits early when failing to resolve the NCSI device, response handler, or request, leaving the skb unfreed. CC: stable@vger.kernel.org Fixes: 7a82ecf4cfb8 ("net/ncsi: NCSI AEN packet handler") Fixes: 138635cc27c9 ("net/ncsi: NCSI response packet handler") Signed-off-by: Jian Zhang Link: https://patch.msgid.link/20260305060656.3357250-1-zhangjian.3032@bytedance.com Signed-off-by: Jakub Kicinski --- net/ncsi/ncsi-aen.c | 3 ++- net/ncsi/ncsi-rsp.c | 16 ++++++++++++---- 2 files changed, 14 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/ncsi/ncsi-aen.c b/net/ncsi/ncsi-aen.c index 62fb1031763d..040a31557201 100644 --- a/net/ncsi/ncsi-aen.c +++ b/net/ncsi/ncsi-aen.c @@ -224,7 +224,8 @@ int ncsi_aen_handler(struct ncsi_dev_priv *ndp, struct sk_buff *skb) if (!nah) { netdev_warn(ndp->ndev.dev, "Invalid AEN (0x%x) received\n", h->type); - return -ENOENT; + ret = -ENOENT; + goto out; } ret = ncsi_validate_aen_pkt(h, nah->payload); diff --git a/net/ncsi/ncsi-rsp.c b/net/ncsi/ncsi-rsp.c index 271ec6c3929e..fbd84bc8026a 100644 --- a/net/ncsi/ncsi-rsp.c +++ b/net/ncsi/ncsi-rsp.c @@ -1176,8 +1176,10 @@ int ncsi_rcv_rsp(struct sk_buff *skb, struct net_device *dev, /* Find the NCSI device */ nd = ncsi_find_dev(orig_dev); ndp = nd ? TO_NCSI_DEV_PRIV(nd) : NULL; - if (!ndp) - return -ENODEV; + if (!ndp) { + ret = -ENODEV; + goto err_free_skb; + } /* Check if it is AEN packet */ hdr = (struct ncsi_pkt_hdr *)skb_network_header(skb); @@ -1199,7 +1201,8 @@ int ncsi_rcv_rsp(struct sk_buff *skb, struct net_device *dev, if (!nrh) { netdev_err(nd->dev, "Received unrecognized packet (0x%x)\n", hdr->type); - return -ENOENT; + ret = -ENOENT; + goto err_free_skb; } /* Associate with the request */ @@ -1207,7 +1210,8 @@ int ncsi_rcv_rsp(struct sk_buff *skb, struct net_device *dev, nr = &ndp->requests[hdr->id]; if (!nr->used) { spin_unlock_irqrestore(&ndp->lock, flags); - return -ENODEV; + ret = -ENODEV; + goto err_free_skb; } nr->rsp = skb; @@ -1261,4 +1265,8 @@ out_netlink: out: ncsi_free_request(nr); return ret; + +err_free_skb: + kfree_skb(skb); + return ret; } -- cgit v1.2.3 From 0cc0c2e661af418bbf7074179ea5cfffc0a5c466 Mon Sep 17 00:00:00 2001 From: Weiming Shi Date: Wed, 4 Mar 2026 12:42:18 +0800 Subject: net/sched: teql: fix NULL pointer dereference in iptunnel_xmit on TEQL slave xmit teql_master_xmit() calls netdev_start_xmit(skb, slave) to transmit through slave devices, but does not update skb->dev to the slave device beforehand. When a gretap tunnel is a TEQL slave, the transmit path reaches iptunnel_xmit() which saves dev = skb->dev (still pointing to teql0 master) and later calls iptunnel_xmit_stats(dev, pkt_len). This function does: get_cpu_ptr(dev->tstats) Since teql_master_setup() does not set dev->pcpu_stat_type to NETDEV_PCPU_STAT_TSTATS, the core network stack never allocates tstats for teql0, so dev->tstats is NULL. get_cpu_ptr(NULL) computes NULL + __per_cpu_offset[cpu], resulting in a page fault. BUG: unable to handle page fault for address: ffff8880e6659018 #PF: supervisor write access in kernel mode #PF: error_code(0x0002) - not-present page PGD 68bc067 P4D 68bc067 PUD 0 Oops: Oops: 0002 [#1] SMP KASAN PTI RIP: 0010:iptunnel_xmit (./include/net/ip_tunnels.h:664 net/ipv4/ip_tunnel_core.c:89) Call Trace: ip_tunnel_xmit (net/ipv4/ip_tunnel.c:847) __gre_xmit (net/ipv4/ip_gre.c:478) gre_tap_xmit (net/ipv4/ip_gre.c:779) teql_master_xmit (net/sched/sch_teql.c:319) dev_hard_start_xmit (net/core/dev.c:3887) sch_direct_xmit (net/sched/sch_generic.c:347) __dev_queue_xmit (net/core/dev.c:4802) neigh_direct_output (net/core/neighbour.c:1660) ip_finish_output2 (net/ipv4/ip_output.c:237) __ip_finish_output.part.0 (net/ipv4/ip_output.c:315) ip_mc_output (net/ipv4/ip_output.c:369) ip_send_skb (net/ipv4/ip_output.c:1508) udp_send_skb (net/ipv4/udp.c:1195) udp_sendmsg (net/ipv4/udp.c:1485) inet_sendmsg (net/ipv4/af_inet.c:859) __sys_sendto (net/socket.c:2206) Fix this by setting skb->dev = slave before calling netdev_start_xmit(), so that tunnel xmit functions see the correct slave device with properly allocated tstats. Fixes: 039f50629b7f ("ip_tunnel: Move stats update to iptunnel_xmit()") Reported-by: Xiang Mei Signed-off-by: Weiming Shi Link: https://patch.msgid.link/20260304044216.3517851-3-bestswngs@gmail.com Signed-off-by: Jakub Kicinski --- net/sched/sch_teql.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c index 6e4bdaa876ed..783300d8b019 100644 --- a/net/sched/sch_teql.c +++ b/net/sched/sch_teql.c @@ -315,6 +315,7 @@ restart: if (__netif_tx_trylock(slave_txq)) { unsigned int length = qdisc_pkt_len(skb); + skb->dev = slave; if (!netif_xmit_frozen_or_stopped(slave_txq) && netdev_start_xmit(skb, slave, slave_txq, false) == NETDEV_TX_OK) { -- cgit v1.2.3 From 4245a79003adf30e67f8e9060915bd05cb31d142 Mon Sep 17 00:00:00 2001 From: Miaoqian Lin Date: Thu, 5 Mar 2026 12:31:01 +0000 Subject: rxrpc, afs: Fix missing error pointer check after rxrpc_kernel_lookup_peer() rxrpc_kernel_lookup_peer() can also return error pointers in addition to NULL, so just checking for NULL is not sufficient. Fix this by: (1) Changing rxrpc_kernel_lookup_peer() to return -ENOMEM rather than NULL on allocation failure. (2) Making the callers in afs use IS_ERR() and PTR_ERR() to pass on the error code returned. Fixes: 72904d7b9bfb ("rxrpc, afs: Allow afs to pin rxrpc_peer objects") Signed-off-by: Miaoqian Lin Co-developed-by: David Howells Signed-off-by: David Howells cc: Marc Dionne cc: Simon Horman cc: linux-afs@lists.infradead.org Link: https://patch.msgid.link/368272.1772713861@warthog.procyon.org.uk Signed-off-by: Jakub Kicinski --- net/rxrpc/af_rxrpc.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 0c2c68c4b07e..0f90272ac254 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -267,12 +267,13 @@ static int rxrpc_listen(struct socket *sock, int backlog) * Lookup or create a remote transport endpoint record for the specified * address. * - * Return: The peer record found with a reference, %NULL if no record is found - * or a negative error code if the address is invalid or unsupported. + * Return: The peer record found with a reference or a negative error code if + * the address is invalid or unsupported. */ struct rxrpc_peer *rxrpc_kernel_lookup_peer(struct socket *sock, struct sockaddr_rxrpc *srx, gfp_t gfp) { + struct rxrpc_peer *peer; struct rxrpc_sock *rx = rxrpc_sk(sock->sk); int ret; @@ -280,7 +281,8 @@ struct rxrpc_peer *rxrpc_kernel_lookup_peer(struct socket *sock, if (ret < 0) return ERR_PTR(ret); - return rxrpc_lookup_peer(rx->local, srx, gfp); + peer = rxrpc_lookup_peer(rx->local, srx, gfp); + return peer ?: ERR_PTR(-ENOMEM); } EXPORT_SYMBOL(rxrpc_kernel_lookup_peer); -- cgit v1.2.3 From b2662e7593e94ae09b1cf7ee5f09160a3612bcb2 Mon Sep 17 00:00:00 2001 From: Mehul Rao Date: Fri, 6 Mar 2026 18:38:20 -0500 Subject: net: nexthop: fix percpu use-after-free in remove_nh_grp_entry When removing a nexthop from a group, remove_nh_grp_entry() publishes the new group via rcu_assign_pointer() then immediately frees the removed entry's percpu stats with free_percpu(). However, the synchronize_net() grace period in the caller remove_nexthop_from_groups() runs after the free. RCU readers that entered before the publish still see the old group and can dereference the freed stats via nh_grp_entry_stats_inc() -> get_cpu_ptr(nhge->stats), causing a use-after-free on percpu memory. Fix by deferring the free_percpu() until after synchronize_net() in the caller. Removed entries are chained via nh_list onto a local deferred free list. After the grace period completes and all RCU readers have finished, the percpu stats are safely freed. Fixes: f4676ea74b85 ("net: nexthop: Add nexthop group entry stats") Cc: stable@vger.kernel.org Signed-off-by: Mehul Rao Reviewed-by: Eric Dumazet Reviewed-by: Ido Schimmel Link: https://patch.msgid.link/20260306233821.196789-1-mehulrao@gmail.com Signed-off-by: Jakub Kicinski --- net/ipv4/nexthop.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c index 1aa2b05ee8de..c942f1282236 100644 --- a/net/ipv4/nexthop.c +++ b/net/ipv4/nexthop.c @@ -2002,7 +2002,8 @@ static void nh_hthr_group_rebalance(struct nh_group *nhg) } static void remove_nh_grp_entry(struct net *net, struct nh_grp_entry *nhge, - struct nl_info *nlinfo) + struct nl_info *nlinfo, + struct list_head *deferred_free) { struct nh_grp_entry *nhges, *new_nhges; struct nexthop *nhp = nhge->nh_parent; @@ -2062,8 +2063,8 @@ static void remove_nh_grp_entry(struct net *net, struct nh_grp_entry *nhge, rcu_assign_pointer(nhp->nh_grp, newg); list_del(&nhge->nh_list); - free_percpu(nhge->stats); nexthop_put(nhge->nh); + list_add(&nhge->nh_list, deferred_free); /* Removal of a NH from a resilient group is notified through * bucket notifications. @@ -2083,6 +2084,7 @@ static void remove_nexthop_from_groups(struct net *net, struct nexthop *nh, struct nl_info *nlinfo) { struct nh_grp_entry *nhge, *tmp; + LIST_HEAD(deferred_free); /* If there is nothing to do, let's avoid the costly call to * synchronize_net() @@ -2091,10 +2093,16 @@ static void remove_nexthop_from_groups(struct net *net, struct nexthop *nh, return; list_for_each_entry_safe(nhge, tmp, &nh->grp_list, nh_list) - remove_nh_grp_entry(net, nhge, nlinfo); + remove_nh_grp_entry(net, nhge, nlinfo, &deferred_free); /* make sure all see the newly published array before releasing rtnl */ synchronize_net(); + + /* Now safe to free percpu stats — all RCU readers have finished */ + list_for_each_entry_safe(nhge, tmp, &deferred_free, nh_list) { + list_del(&nhge->nh_list); + free_percpu(nhge->stats); + } } static void remove_nexthop_group(struct nexthop *nh, struct nl_info *nlinfo) -- cgit v1.2.3 From 7d86aa41c073c4e7eb75fd2e674f1fd8f289728a Mon Sep 17 00:00:00 2001 From: Chengfeng Ye Date: Fri, 6 Mar 2026 03:14:02 +0000 Subject: mctp: route: hold key->lock in mctp_flow_prepare_output() mctp_flow_prepare_output() checks key->dev and may call mctp_dev_set_key(), but it does not hold key->lock while doing so. mctp_dev_set_key() and mctp_dev_release_key() are annotated with __must_hold(&key->lock), so key->dev access is intended to be serialized by key->lock. The mctp_sendmsg() transmit path reaches mctp_flow_prepare_output() via mctp_local_output() -> mctp_dst_output() without holding key->lock, so the check-and-set sequence is racy. Example interleaving: CPU0 CPU1 ---- ---- mctp_flow_prepare_output(key, devA) if (!key->dev) // sees NULL mctp_flow_prepare_output( key, devB) if (!key->dev) // still NULL mctp_dev_set_key(devB, key) mctp_dev_hold(devB) key->dev = devB mctp_dev_set_key(devA, key) mctp_dev_hold(devA) key->dev = devA // overwrites devB Now both devA and devB references were acquired, but only the final key->dev value is tracked for release. One reference can be lost, causing a resource leak as mctp_dev_release_key() would only decrease the reference on one dev. Fix by taking key->lock around the key->dev check and mctp_dev_set_key() call. Fixes: 67737c457281 ("mctp: Pass flow data & flow release events to drivers") Signed-off-by: Chengfeng Ye Link: https://patch.msgid.link/20260306031402.857224-1-dg573847474@gmail.com Signed-off-by: Paolo Abeni --- net/mctp/route.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/mctp/route.c b/net/mctp/route.c index 0381377ab760..59ad60b88563 100644 --- a/net/mctp/route.c +++ b/net/mctp/route.c @@ -359,6 +359,7 @@ static void mctp_flow_prepare_output(struct sk_buff *skb, struct mctp_dev *dev) { struct mctp_sk_key *key; struct mctp_flow *flow; + unsigned long flags; flow = skb_ext_find(skb, SKB_EXT_MCTP); if (!flow) @@ -366,12 +367,14 @@ static void mctp_flow_prepare_output(struct sk_buff *skb, struct mctp_dev *dev) key = flow->key; - if (key->dev) { + spin_lock_irqsave(&key->lock, flags); + + if (!key->dev) + mctp_dev_set_key(dev, key); + else WARN_ON(key->dev != dev); - return; - } - mctp_dev_set_key(dev, key); + spin_unlock_irqrestore(&key->lock, flags); } #else static void mctp_skb_set_flow(struct sk_buff *skb, struct mctp_sk_key *key) {} -- cgit v1.2.3 From 69fb5d91bba44ecf7eb80530b85fa4fb028921d5 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Sun, 8 Mar 2026 17:38:00 +0100 Subject: libceph: prevent potential out-of-bounds reads in process_message_header() If the message frame is (maliciously) corrupted in a way that the length of the control segment ends up being less than the size of the message header or a different frame is made to look like a message frame, out-of-bounds reads may ensue in process_message_header(). Perform an explicit bounds check before decoding the message header. Cc: stable@vger.kernel.org Reported-by: Raphael Zimmer Signed-off-by: Ilya Dryomov Reviewed-by: Alex Markuze Reviewed-by: Viacheslav Dubeyko --- net/ceph/messenger_v2.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ceph/messenger_v2.c b/net/ceph/messenger_v2.c index 5ec3272cd2dd..ed618435d33a 100644 --- a/net/ceph/messenger_v2.c +++ b/net/ceph/messenger_v2.c @@ -2833,12 +2833,15 @@ static int process_message_header(struct ceph_connection *con, void *p, void *end) { struct ceph_frame_desc *desc = &con->v2.in_desc; - struct ceph_msg_header2 *hdr2 = p; + struct ceph_msg_header2 *hdr2; struct ceph_msg_header hdr; int skip; int ret; u64 seq; + ceph_decode_need(&p, end, sizeof(*hdr2), bad); + hdr2 = p; + /* verify seq# */ seq = le64_to_cpu(hdr2->seq); if ((s64)seq - (s64)con->in_seq < 1) { @@ -2869,6 +2872,10 @@ static int process_message_header(struct ceph_connection *con, WARN_ON(!con->in_msg); WARN_ON(con->in_msg->con != con); return 1; + +bad: + pr_err("failed to decode message header\n"); + return -EINVAL; } static int process_message(struct ceph_connection *con) -- cgit v1.2.3 From a5a373705081d7cc6363e16990e2361b0b362314 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Sun, 8 Mar 2026 17:57:23 +0100 Subject: libceph: admit message frames only in CEPH_CON_S_OPEN state Similar checks are performed for all control frames, but an early check for message frames was missing. process_message() is already set up to terminate the loop in case the state changes while con->ops->dispatch() handler is being executed. Cc: stable@vger.kernel.org Signed-off-by: Ilya Dryomov Reviewed-by: Alex Markuze Reviewed-by: Viacheslav Dubeyko --- net/ceph/messenger_v2.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net') diff --git a/net/ceph/messenger_v2.c b/net/ceph/messenger_v2.c index ed618435d33a..c4ddf7911f7d 100644 --- a/net/ceph/messenger_v2.c +++ b/net/ceph/messenger_v2.c @@ -2905,6 +2905,11 @@ static int __handle_control(struct ceph_connection *con, void *p) if (con->v2.in_desc.fd_tag != FRAME_TAG_MESSAGE) return process_control(con, p, end); + if (con->state != CEPH_CON_S_OPEN) { + con->error_msg = "protocol error, unexpected message"; + return -EINVAL; + } + ret = process_message_header(con, p, end); if (ret < 0) return ret; -- cgit v1.2.3 From c4c22b846eceff05b1129b8844a80310e55a7f87 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Sun, 8 Mar 2026 20:01:27 +0100 Subject: libceph: reject preamble if control segment is empty While head_onwire_len() has a branch to handle ctrl_len == 0 case, prepare_read_control() always sets up a kvec for the CRC meaning that a non-empty control segment is effectively assumed. All frames that clients deal with meet that assumption, so let's make it official and treat the preamble with an empty control segment as malformed. Cc: stable@vger.kernel.org Signed-off-by: Ilya Dryomov Reviewed-by: Alex Markuze --- net/ceph/messenger_v2.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/ceph/messenger_v2.c b/net/ceph/messenger_v2.c index c4ddf7911f7d..50f65820f623 100644 --- a/net/ceph/messenger_v2.c +++ b/net/ceph/messenger_v2.c @@ -392,7 +392,7 @@ static int head_onwire_len(int ctrl_len, bool secure) int head_len; int rem_len; - BUG_ON(ctrl_len < 0 || ctrl_len > CEPH_MSG_MAX_CONTROL_LEN); + BUG_ON(ctrl_len < 1 || ctrl_len > CEPH_MSG_MAX_CONTROL_LEN); if (secure) { head_len = CEPH_PREAMBLE_SECURE_LEN; @@ -401,9 +401,7 @@ static int head_onwire_len(int ctrl_len, bool secure) head_len += padded_len(rem_len) + CEPH_GCM_TAG_LEN; } } else { - head_len = CEPH_PREAMBLE_PLAIN_LEN; - if (ctrl_len) - head_len += ctrl_len + CEPH_CRC_LEN; + head_len = CEPH_PREAMBLE_PLAIN_LEN + ctrl_len + CEPH_CRC_LEN; } return head_len; } @@ -528,11 +526,16 @@ static int decode_preamble(void *p, struct ceph_frame_desc *desc) desc->fd_aligns[i] = ceph_decode_16(&p); } - if (desc->fd_lens[0] < 0 || + /* + * This would fire for FRAME_TAG_WAIT (it has one empty + * segment), but we should never get it as client. + */ + if (desc->fd_lens[0] < 1 || desc->fd_lens[0] > CEPH_MSG_MAX_CONTROL_LEN) { pr_err("bad control segment length %d\n", desc->fd_lens[0]); return -EINVAL; } + if (desc->fd_lens[1] < 0 || desc->fd_lens[1] > CEPH_MSG_MAX_FRONT_LEN) { pr_err("bad front segment length %d\n", desc->fd_lens[1]); @@ -549,10 +552,6 @@ static int decode_preamble(void *p, struct ceph_frame_desc *desc) return -EINVAL; } - /* - * This would fire for FRAME_TAG_WAIT (it has one empty - * segment), but we should never get it as client. - */ if (!desc->fd_lens[desc->fd_seg_cnt - 1]) { pr_err("last segment empty, segment count %d\n", desc->fd_seg_cnt); -- cgit v1.2.3 From 6f1a9140ecda3baba3d945b9a6155af4268aafc4 Mon Sep 17 00:00:00 2001 From: Weiming Shi Date: Sat, 7 Mar 2026 00:01:34 +0800 Subject: net: add xmit recursion limit to tunnel xmit functions Tunnel xmit functions (iptunnel_xmit, ip6tunnel_xmit) lack their own recursion limit. When a bond device in broadcast mode has GRE tap interfaces as slaves, and those GRE tunnels route back through the bond, multicast/broadcast traffic triggers infinite recursion between bond_xmit_broadcast() and ip_tunnel_xmit()/ip6_tnl_xmit(), causing kernel stack overflow. The existing XMIT_RECURSION_LIMIT (8) in the no-qdisc path is not sufficient because tunnel recursion involves route lookups and full IP output, consuming much more stack per level. Use a lower limit of 4 (IP_TUNNEL_RECURSION_LIMIT) to prevent overflow. Add recursion detection using dev_xmit_recursion helpers directly in iptunnel_xmit() and ip6tunnel_xmit() to cover all IPv4/IPv6 tunnel paths including UDP encapsulated tunnels (VXLAN, Geneve, etc.). Move dev_xmit_recursion helpers from net/core/dev.h to public header include/linux/netdevice.h so they can be used by tunnel code. BUG: KASAN: stack-out-of-bounds in blake2s.constprop.0+0xe7/0x160 Write of size 32 at addr ffff88810033fed0 by task kworker/0:1/11 Workqueue: mld mld_ifc_work Call Trace: __build_flow_key.constprop.0 (net/ipv4/route.c:515) ip_rt_update_pmtu (net/ipv4/route.c:1073) iptunnel_xmit (net/ipv4/ip_tunnel_core.c:84) ip_tunnel_xmit (net/ipv4/ip_tunnel.c:847) gre_tap_xmit (net/ipv4/ip_gre.c:779) dev_hard_start_xmit (net/core/dev.c:3887) sch_direct_xmit (net/sched/sch_generic.c:347) __dev_queue_xmit (net/core/dev.c:4802) bond_dev_queue_xmit (drivers/net/bonding/bond_main.c:312) bond_xmit_broadcast (drivers/net/bonding/bond_main.c:5279) bond_start_xmit (drivers/net/bonding/bond_main.c:5530) dev_hard_start_xmit (net/core/dev.c:3887) __dev_queue_xmit (net/core/dev.c:4841) ip_finish_output2 (net/ipv4/ip_output.c:237) ip_output (net/ipv4/ip_output.c:438) iptunnel_xmit (net/ipv4/ip_tunnel_core.c:86) gre_tap_xmit (net/ipv4/ip_gre.c:779) dev_hard_start_xmit (net/core/dev.c:3887) sch_direct_xmit (net/sched/sch_generic.c:347) __dev_queue_xmit (net/core/dev.c:4802) bond_dev_queue_xmit (drivers/net/bonding/bond_main.c:312) bond_xmit_broadcast (drivers/net/bonding/bond_main.c:5279) bond_start_xmit (drivers/net/bonding/bond_main.c:5530) dev_hard_start_xmit (net/core/dev.c:3887) __dev_queue_xmit (net/core/dev.c:4841) ip_finish_output2 (net/ipv4/ip_output.c:237) ip_output (net/ipv4/ip_output.c:438) iptunnel_xmit (net/ipv4/ip_tunnel_core.c:86) ip_tunnel_xmit (net/ipv4/ip_tunnel.c:847) gre_tap_xmit (net/ipv4/ip_gre.c:779) dev_hard_start_xmit (net/core/dev.c:3887) sch_direct_xmit (net/sched/sch_generic.c:347) __dev_queue_xmit (net/core/dev.c:4802) bond_dev_queue_xmit (drivers/net/bonding/bond_main.c:312) bond_xmit_broadcast (drivers/net/bonding/bond_main.c:5279) bond_start_xmit (drivers/net/bonding/bond_main.c:5530) dev_hard_start_xmit (net/core/dev.c:3887) __dev_queue_xmit (net/core/dev.c:4841) mld_sendpack mld_ifc_work process_one_work worker_thread Fixes: 745e20f1b626 ("net: add a recursion limit in xmit path") Reported-by: Xiang Mei Signed-off-by: Weiming Shi Link: https://patch.msgid.link/20260306160133.3852900-2-bestswngs@gmail.com Signed-off-by: Paolo Abeni --- net/core/dev.h | 35 ----------------------------------- net/ipv4/ip_tunnel_core.c | 13 +++++++++++++ 2 files changed, 13 insertions(+), 35 deletions(-) (limited to 'net') diff --git a/net/core/dev.h b/net/core/dev.h index 98793a738f43..781619e76b3e 100644 --- a/net/core/dev.h +++ b/net/core/dev.h @@ -366,41 +366,6 @@ static inline void napi_assert_will_not_race(const struct napi_struct *napi) void kick_defer_list_purge(unsigned int cpu); -#define XMIT_RECURSION_LIMIT 8 - -#ifndef CONFIG_PREEMPT_RT -static inline bool dev_xmit_recursion(void) -{ - return unlikely(__this_cpu_read(softnet_data.xmit.recursion) > - XMIT_RECURSION_LIMIT); -} - -static inline void dev_xmit_recursion_inc(void) -{ - __this_cpu_inc(softnet_data.xmit.recursion); -} - -static inline void dev_xmit_recursion_dec(void) -{ - __this_cpu_dec(softnet_data.xmit.recursion); -} -#else -static inline bool dev_xmit_recursion(void) -{ - return unlikely(current->net_xmit.recursion > XMIT_RECURSION_LIMIT); -} - -static inline void dev_xmit_recursion_inc(void) -{ - current->net_xmit.recursion++; -} - -static inline void dev_xmit_recursion_dec(void) -{ - current->net_xmit.recursion--; -} -#endif - int dev_set_hwtstamp_phylib(struct net_device *dev, struct kernel_hwtstamp_config *cfg, struct netlink_ext_ack *extack); diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index 2e61ac137128..b1b6bf949f65 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -58,6 +58,17 @@ void iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb, struct iphdr *iph; int err; + if (dev_recursion_level() > IP_TUNNEL_RECURSION_LIMIT) { + net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n", + dev->name); + DEV_STATS_INC(dev, tx_errors); + ip_rt_put(rt); + kfree_skb(skb); + return; + } + + dev_xmit_recursion_inc(); + skb_scrub_packet(skb, xnet); skb_clear_hash_if_not_l4(skb); @@ -88,6 +99,8 @@ void iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb, pkt_len = 0; iptunnel_xmit_stats(dev, pkt_len); } + + dev_xmit_recursion_dec(); } EXPORT_SYMBOL_GPL(iptunnel_xmit); -- cgit v1.2.3 From b7cdc5a97d02c943f4bdde4d5767ad0c13cad92b Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Thu, 5 Mar 2026 13:01:44 +0100 Subject: netfilter: nf_tables: Fix for duplicate device in netdev hooks When handling NETDEV_REGISTER notification, duplicate device registration must be avoided since the device may have been added by nft_netdev_hook_alloc() already when creating the hook. Suggested-by: Florian Westphal Reported-by: syzbot+bb9127e278fa198e110c@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=bb9127e278fa198e110c Fixes: a331b78a5525 ("netfilter: nf_tables: Respect NETDEV_REGISTER events") Tested-by: Helen Koike Signed-off-by: Phil Sutter Signed-off-by: Florian Westphal --- net/netfilter/nf_tables_api.c | 2 +- net/netfilter/nft_chain_filter.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 1862bd7fe804..710f0ee21a34 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -9688,7 +9688,7 @@ static int nft_flowtable_event(unsigned long event, struct net_device *dev, break; case NETDEV_REGISTER: /* NOP if not matching or already registered */ - if (!match || (changename && ops)) + if (!match || ops) continue; ops = kzalloc_obj(struct nf_hook_ops, diff --git a/net/netfilter/nft_chain_filter.c b/net/netfilter/nft_chain_filter.c index b16185e9a6dd..041426e3bdbf 100644 --- a/net/netfilter/nft_chain_filter.c +++ b/net/netfilter/nft_chain_filter.c @@ -344,7 +344,7 @@ static int nft_netdev_event(unsigned long event, struct net_device *dev, break; case NETDEV_REGISTER: /* NOP if not matching or already registered */ - if (!match || (changename && ops)) + if (!match || ops) continue; ops = kmemdup(&basechain->ops, -- cgit v1.2.3 From 7cb9a23d7ae40a702577d3d8bacb7026f04ac2a9 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 5 Mar 2026 21:32:00 +0100 Subject: netfilter: nf_tables: always walk all pending catchall elements During transaction processing we might have more than one catchall element: 1 live catchall element and 1 pending element that is coming as part of the new batch. If the map holding the catchall elements is also going away, its required to toggle all catchall elements and not just the first viable candidate. Otherwise, we get: WARNING: ./include/net/netfilter/nf_tables.h:1281 at nft_data_release+0xb7/0xe0 [nf_tables], CPU#2: nft/1404 RIP: 0010:nft_data_release+0xb7/0xe0 [nf_tables] [..] __nft_set_elem_destroy+0x106/0x380 [nf_tables] nf_tables_abort_release+0x348/0x8d0 [nf_tables] nf_tables_abort+0xcf2/0x3ac0 [nf_tables] nfnetlink_rcv_batch+0x9c9/0x20e0 [..] Fixes: 628bd3e49cba ("netfilter: nf_tables: drop map element references from preparation phase") Reported-by: Yiming Qian Signed-off-by: Florian Westphal --- net/netfilter/nf_tables_api.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 710f0ee21a34..dacec5f8a11c 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -829,7 +829,6 @@ static void nft_map_catchall_deactivate(const struct nft_ctx *ctx, nft_set_elem_change_active(ctx->net, set, ext); nft_setelem_data_deactivate(ctx->net, set, catchall->elem); - break; } } @@ -5873,7 +5872,6 @@ static void nft_map_catchall_activate(const struct nft_ctx *ctx, nft_clear(ctx->net, ext); nft_setelem_data_activate(ctx->net, set, catchall->elem); - break; } } -- cgit v1.2.3 From d6d8cd2db236a9dd13dbc2d05843b3445cc964b5 Mon Sep 17 00:00:00 2001 From: Jenny Guanni Qu Date: Fri, 6 Mar 2026 19:12:38 +0000 Subject: netfilter: nft_set_pipapo: fix stack out-of-bounds read in pipapo_drop() pipapo_drop() passes rulemap[i + 1].n to pipapo_unmap() as the to_offset argument on every iteration, including the last one where i == m->field_count - 1. This reads one element past the end of the stack-allocated rulemap array (declared as rulemap[NFT_PIPAPO_MAX_FIELDS] with NFT_PIPAPO_MAX_FIELDS == 16). Although pipapo_unmap() returns early when is_last is true without using the to_offset value, the argument is evaluated at the call site before the function body executes, making this a genuine out-of-bounds stack read confirmed by KASAN: BUG: KASAN: stack-out-of-bounds in pipapo_drop+0x50c/0x57c [nf_tables] Read of size 4 at addr ffff8000810e71a4 This frame has 1 object: [32, 160) 'rulemap' The buggy address is at offset 164 -- exactly 4 bytes past the end of the rulemap array. Pass 0 instead of rulemap[i + 1].n on the last iteration to avoid the out-of-bounds read. Fixes: 3c4287f62044 ("nf_tables: Add set type for arbitrary concatenation of ranges") Signed-off-by: Jenny Guanni Qu Signed-off-by: Florian Westphal --- net/netfilter/nft_set_pipapo.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c index a34632ae6048..7fd24e0cc428 100644 --- a/net/netfilter/nft_set_pipapo.c +++ b/net/netfilter/nft_set_pipapo.c @@ -1640,6 +1640,7 @@ static void pipapo_drop(struct nft_pipapo_match *m, int i; nft_pipapo_for_each_field(f, i, m) { + bool last = i == m->field_count - 1; int g; for (g = 0; g < f->groups; g++) { @@ -1659,7 +1660,7 @@ static void pipapo_drop(struct nft_pipapo_match *m, } pipapo_unmap(f->mt, f->rules, rulemap[i].to, rulemap[i].n, - rulemap[i + 1].n, i == m->field_count - 1); + last ? 0 : rulemap[i + 1].n, last); if (pipapo_resize(f, f->rules, f->rules - rulemap[i].n)) { /* We can ignore this, a failure to shrink tables down * doesn't make tables invalid. -- cgit v1.2.3 From cfe770220ac2dbd3e104c6b45094037455da81d4 Mon Sep 17 00:00:00 2001 From: David Dull Date: Sat, 7 Mar 2026 20:26:21 +0200 Subject: netfilter: x_tables: guard option walkers against 1-byte tail reads When the last byte of options is a non-single-byte option kind, walkers that advance with i += op[i + 1] ? : 1 can read op[i + 1] past the end of the option area. Add an explicit i == optlen - 1 check before dereferencing op[i + 1] in xt_tcpudp and xt_dccp option walkers. Fixes: 2e4e6a17af35 ("[NETFILTER] x_tables: Abstraction layer for {ip,ip6,arp}_tables") Signed-off-by: David Dull Signed-off-by: Florian Westphal --- net/netfilter/xt_dccp.c | 4 ++-- net/netfilter/xt_tcpudp.c | 6 ++++-- 2 files changed, 6 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/netfilter/xt_dccp.c b/net/netfilter/xt_dccp.c index e5a13ecbe67a..037ab93e25d0 100644 --- a/net/netfilter/xt_dccp.c +++ b/net/netfilter/xt_dccp.c @@ -62,10 +62,10 @@ dccp_find_option(u_int8_t option, return true; } - if (op[i] < 2) + if (op[i] < 2 || i == optlen - 1) i++; else - i += op[i+1]?:1; + i += op[i + 1] ? : 1; } spin_unlock_bh(&dccp_buflock); diff --git a/net/netfilter/xt_tcpudp.c b/net/netfilter/xt_tcpudp.c index e8991130a3de..f76cf18f1a24 100644 --- a/net/netfilter/xt_tcpudp.c +++ b/net/netfilter/xt_tcpudp.c @@ -59,8 +59,10 @@ tcp_find_option(u_int8_t option, for (i = 0; i < optlen; ) { if (op[i] == option) return !invert; - if (op[i] < 2) i++; - else i += op[i+1]?:1; + if (op[i] < 2 || i == optlen - 1) + i++; + else + i += op[i + 1] ? : 1; } return invert; -- cgit v1.2.3 From f1ba83755d81c6fc66ac7acd723d238f974091e9 Mon Sep 17 00:00:00 2001 From: Hyunwoo Kim Date: Sun, 8 Mar 2026 02:24:06 +0900 Subject: netfilter: nfnetlink_queue: fix entry leak in bridge verdict error path nfqnl_recv_verdict() calls find_dequeue_entry() to remove the queue entry from the queue data structures, taking ownership of the entry. For PF_BRIDGE packets, it then calls nfqa_parse_bridge() to parse VLAN attributes. If nfqa_parse_bridge() returns an error (e.g. NFQA_VLAN present but NFQA_VLAN_TCI missing), the function returns immediately without freeing the dequeued entry or its sk_buff. This leaks the nf_queue_entry, its associated sk_buff, and all held references (net_device refcounts, struct net refcount). Repeated triggering exhausts kernel memory. Fix this by dropping the entry via nfqnl_reinject() with NF_DROP verdict on the error path, consistent with other error handling in this file. Fixes: 8d45ff22f1b4 ("netfilter: bridge: nf queue verdict to use NFQA_VLAN and NFQA_L2HDR") Reviewed-by: David Dull Signed-off-by: Hyunwoo Kim Signed-off-by: Florian Westphal --- net/netfilter/nfnetlink_queue.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index 7f5248b5f1ee..47f7f62906e2 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -1546,8 +1546,10 @@ static int nfqnl_recv_verdict(struct sk_buff *skb, const struct nfnl_info *info, if (entry->state.pf == PF_BRIDGE) { err = nfqa_parse_bridge(entry, nfqa); - if (err < 0) + if (err < 0) { + nfqnl_reinject(entry, NF_DROP); return err; + } } if (nfqa[NFQA_PAYLOAD]) { -- cgit v1.2.3 From 6dcee8496d53165b2d8a5909b3050b62ae71fe89 Mon Sep 17 00:00:00 2001 From: Hyunwoo Kim Date: Sun, 8 Mar 2026 02:23:34 +0900 Subject: netfilter: nfnetlink_cthelper: fix OOB read in nfnl_cthelper_dump_table() nfnl_cthelper_dump_table() has a 'goto restart' that jumps to a label inside the for loop body. When the "last" helper saved in cb->args[1] is deleted between dump rounds, every entry fails the (cur != last) check, so cb->args[1] is never cleared. The for loop finishes with cb->args[0] == nf_ct_helper_hsize, and the 'goto restart' jumps back into the loop body bypassing the bounds check, causing an 8-byte out-of-bounds read on nf_ct_helper_hash[nf_ct_helper_hsize]. The 'goto restart' block was meant to re-traverse the current bucket when "last" is no longer found, but it was placed after the for loop instead of inside it. Move the block into the for loop body so that the restart only occurs while cb->args[0] is still within bounds. BUG: KASAN: slab-out-of-bounds in nfnl_cthelper_dump_table+0x9f/0x1b0 Read of size 8 at addr ffff888104ca3000 by task poc_cthelper/131 Call Trace: nfnl_cthelper_dump_table+0x9f/0x1b0 netlink_dump+0x333/0x880 netlink_recvmsg+0x3e2/0x4b0 sock_recvmsg+0xde/0xf0 __sys_recvfrom+0x150/0x200 __x64_sys_recvfrom+0x76/0x90 do_syscall_64+0xc3/0x6e0 Allocated by task 1: __kvmalloc_node_noprof+0x21b/0x700 nf_ct_alloc_hashtable+0x65/0xd0 nf_conntrack_helper_init+0x21/0x60 nf_conntrack_init_start+0x18d/0x300 nf_conntrack_standalone_init+0x12/0xc0 Fixes: 12f7a505331e ("netfilter: add user-space connection tracking helper infrastructure") Signed-off-by: Hyunwoo Kim Signed-off-by: Florian Westphal --- net/netfilter/nfnetlink_cthelper.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/netfilter/nfnetlink_cthelper.c b/net/netfilter/nfnetlink_cthelper.c index d658b1478fa0..d545fa459455 100644 --- a/net/netfilter/nfnetlink_cthelper.c +++ b/net/netfilter/nfnetlink_cthelper.c @@ -601,10 +601,10 @@ restart: goto out; } } - } - if (cb->args[1]) { - cb->args[1] = 0; - goto restart; + if (cb->args[1]) { + cb->args[1] = 0; + goto restart; + } } out: rcu_read_unlock(); -- cgit v1.2.3 From 329f0b9b48ee6ab59d1ab72fef55fe8c6463a6cf Mon Sep 17 00:00:00 2001 From: Yuan Tan Date: Mon, 9 Mar 2026 03:41:46 -0700 Subject: netfilter: xt_IDLETIMER: reject rev0 reuse of ALARM timer labels IDLETIMER revision 0 rules reuse existing timers by label and always call mod_timer() on timer->timer. If the label was created first by revision 1 with XT_IDLETIMER_ALARM, the object uses alarm timer semantics and timer->timer is never initialized. Reusing that object from revision 0 causes mod_timer() on an uninitialized timer_list, triggering debugobjects warnings and possible panic when panic_on_warn=1. Fix this by rejecting revision 0 rule insertion when an existing timer with the same label is of ALARM type. Fixes: 68983a354a65 ("netfilter: xtables: Add snapshot of hardidletimer target") Co-developed-by: Yifan Wu Signed-off-by: Yifan Wu Co-developed-by: Juefei Pu Signed-off-by: Juefei Pu Signed-off-by: Yuan Tan Signed-off-by: Xin Liu Signed-off-by: Florian Westphal --- net/netfilter/xt_IDLETIMER.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net') diff --git a/net/netfilter/xt_IDLETIMER.c b/net/netfilter/xt_IDLETIMER.c index 5d93e225d0f8..517106165ad2 100644 --- a/net/netfilter/xt_IDLETIMER.c +++ b/net/netfilter/xt_IDLETIMER.c @@ -318,6 +318,12 @@ static int idletimer_tg_checkentry(const struct xt_tgchk_param *par) info->timer = __idletimer_tg_find_by_label(info->label); if (info->timer) { + if (info->timer->timer_type & XT_IDLETIMER_ALARM) { + pr_debug("Adding/Replacing rule with same label and different timer type is not allowed\n"); + mutex_unlock(&list_mutex); + return -EINVAL; + } + info->timer->refcnt++; mod_timer(&info->timer->timer, secs_to_jiffies(info->timeout) + jiffies); -- cgit v1.2.3 From 28b225282d44e2ef40e7f46cfdbd5d1b20b8874f Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 9 Mar 2026 17:39:07 -0700 Subject: page_pool: store detach_time as ktime_t to avoid false-negatives While testing other changes in vng I noticed that nl_netdev.page_pool_check flakes. This never happens in real CI. Turns out vng may boot and get to that test in less than a second. page_pool_detached() records the detach time in seconds, so if vng is fast enough detach time is set to 0. Other code treats 0 as "not detached". detach_time is only used to report the state to the user, so it's not a huge deal in practice but let's fix it. Store the raw ktime_t (nanoseconds) instead. A nanosecond value of 0 is practically impossible. Acked-by: Jesper Dangaard Brouer Fixes: 69cb4952b6f6 ("net: page_pool: report when page pool was destroyed") Link: https://patch.msgid.link/20260310003907.3540019-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/core/page_pool_user.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/page_pool_user.c b/net/core/page_pool_user.c index c82a95beceff..ee5060d8eec0 100644 --- a/net/core/page_pool_user.c +++ b/net/core/page_pool_user.c @@ -245,7 +245,7 @@ page_pool_nl_fill(struct sk_buff *rsp, const struct page_pool *pool, goto err_cancel; if (pool->user.detach_time && nla_put_uint(rsp, NETDEV_A_PAGE_POOL_DETACH_TIME, - pool->user.detach_time)) + ktime_divns(pool->user.detach_time, NSEC_PER_SEC))) goto err_cancel; if (pool->mp_ops && pool->mp_ops->nl_fill(pool->mp_priv, rsp, NULL)) @@ -337,7 +337,7 @@ err_unlock: void page_pool_detached(struct page_pool *pool) { mutex_lock(&page_pools_lock); - pool->user.detach_time = ktime_get_boottime_seconds(); + pool->user.detach_time = ktime_get_boottime(); netdev_nl_page_pool_event(pool, NETDEV_CMD_PAGE_POOL_CHANGE_NTF); mutex_unlock(&page_pools_lock); } -- cgit v1.2.3 From 57885276cc16a2e2b76282c808a4e84cbecb3aae Mon Sep 17 00:00:00 2001 From: Paul Moses Date: Mon, 9 Mar 2026 17:35:10 +0000 Subject: net-shapers: don't free reply skb after genlmsg_reply() genlmsg_reply() hands the reply skb to netlink, and netlink_unicast() consumes it on all return paths, whether the skb is queued successfully or freed on an error path. net_shaper_nl_get_doit() and net_shaper_nl_cap_get_doit() currently jump to free_msg after genlmsg_reply() fails and call nlmsg_free(msg), which can hit the same skb twice. Return the genlmsg_reply() error directly and keep free_msg only for pre-reply failures. Fixes: 4b623f9f0f59 ("net-shapers: implement NL get operation") Fixes: 553ea9f1efd6 ("net: shaper: implement introspection support") Cc: stable@vger.kernel.org Signed-off-by: Paul Moses Link: https://patch.msgid.link/20260309173450.538026-2-p@1g4.org Signed-off-by: Jakub Kicinski --- net/shaper/shaper.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/shaper/shaper.c b/net/shaper/shaper.c index 005bfc766e22..3fd6629cb999 100644 --- a/net/shaper/shaper.c +++ b/net/shaper/shaper.c @@ -759,11 +759,7 @@ int net_shaper_nl_get_doit(struct sk_buff *skb, struct genl_info *info) if (ret) goto free_msg; - ret = genlmsg_reply(msg, info); - if (ret) - goto free_msg; - - return 0; + return genlmsg_reply(msg, info); free_msg: nlmsg_free(msg); @@ -1313,10 +1309,7 @@ int net_shaper_nl_cap_get_doit(struct sk_buff *skb, struct genl_info *info) if (ret) goto free_msg; - ret = genlmsg_reply(msg, info); - if (ret) - goto free_msg; - return 0; + return genlmsg_reply(msg, info); free_msg: nlmsg_free(msg); -- cgit v1.2.3 From 770444611f047dbfd4517ec0bc1b179d40c2f346 Mon Sep 17 00:00:00 2001 From: Raphael Zimmer Date: Thu, 26 Feb 2026 16:07:31 +0100 Subject: libceph: Use u32 for non-negative values in ceph_monmap_decode() This patch fixes unnecessary implicit conversions that change signedness of blob_len and num_mon in ceph_monmap_decode(). Currently blob_len and num_mon are (signed) int variables. They are used to hold values that are always non-negative and get assigned in ceph_decode_32_safe(), which is meant to assign u32 values. Both variables are subsequently used as unsigned values, and the value of num_mon is further assigned to monmap->num_mon, which is of type u32. Therefore, both variables should be of type u32. This is especially relevant for num_mon. If the value read from the incoming message is very large, it is interpreted as a negative value, and the check for num_mon > CEPH_MAX_MON does not catch it. This leads to the attempt to allocate a very large chunk of memory for monmap, which will most likely fail. In this case, an unnecessary attempt to allocate memory is performed, and -ENOMEM is returned instead of -EINVAL. Cc: stable@vger.kernel.org Signed-off-by: Raphael Zimmer Reviewed-by: Viacheslav Dubeyko Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- net/ceph/mon_client.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index 5136b3766c44..d5080530ce0c 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -72,8 +72,8 @@ static struct ceph_monmap *ceph_monmap_decode(void **p, void *end, bool msgr2) struct ceph_monmap *monmap = NULL; struct ceph_fsid fsid; u32 struct_len; - int blob_len; - int num_mon; + u32 blob_len; + u32 num_mon; u8 struct_v; u32 epoch; int ret; @@ -112,7 +112,7 @@ static struct ceph_monmap *ceph_monmap_decode(void **p, void *end, bool msgr2) } ceph_decode_32_safe(p, end, num_mon, e_inval); - dout("%s fsid %pU epoch %u num_mon %d\n", __func__, &fsid, epoch, + dout("%s fsid %pU epoch %u num_mon %u\n", __func__, &fsid, epoch, num_mon); if (num_mon > CEPH_MAX_MON) goto e_inval; -- cgit v1.2.3 From b282c43ed156ae15ea76748fc15cd5c39dc9ab72 Mon Sep 17 00:00:00 2001 From: Raphael Zimmer Date: Tue, 10 Mar 2026 15:28:15 +0100 Subject: libceph: Fix potential out-of-bounds access in ceph_handle_auth_reply() This patch fixes an out-of-bounds access in ceph_handle_auth_reply() that can be triggered by a message of type CEPH_MSG_AUTH_REPLY. In ceph_handle_auth_reply(), the value of the payload_len field of such a message is stored in a variable of type int. A value greater than INT_MAX leads to an integer overflow and is interpreted as a negative value. This leads to decrementing the pointer address by this value and subsequently accessing it because ceph_decode_need() only checks that the memory access does not exceed the end address of the allocation. This patch fixes the issue by changing the data type of payload_len to u32. Additionally, the data type of result_msg_len is changed to u32, as it is also a variable holding a non-negative length. Also, an additional layer of sanity checks is introduced, ensuring that directly after reading it from the message, payload_len and result_msg_len are not greater than the overall segment length. BUG: KASAN: slab-out-of-bounds in ceph_handle_auth_reply+0x642/0x7a0 [libceph] Read of size 4 at addr ffff88811404df14 by task kworker/20:1/262 CPU: 20 UID: 0 PID: 262 Comm: kworker/20:1 Not tainted 6.19.2 #5 PREEMPT(voluntary) Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.3-debian-1.16.3-2 04/01/2014 Workqueue: ceph-msgr ceph_con_workfn [libceph] Call Trace: dump_stack_lvl+0x76/0xa0 print_report+0xd1/0x620 ? __pfx__raw_spin_lock_irqsave+0x10/0x10 ? kasan_complete_mode_report_info+0x72/0x210 kasan_report+0xe7/0x130 ? ceph_handle_auth_reply+0x642/0x7a0 [libceph] ? ceph_handle_auth_reply+0x642/0x7a0 [libceph] __asan_report_load_n_noabort+0xf/0x20 ceph_handle_auth_reply+0x642/0x7a0 [libceph] mon_dispatch+0x973/0x23d0 [libceph] ? apparmor_socket_recvmsg+0x6b/0xa0 ? __pfx_mon_dispatch+0x10/0x10 [libceph] ? __kasan_check_write+0x14/0x30i ? mutex_unlock+0x7f/0xd0 ? __pfx_mutex_unlock+0x10/0x10 ? __pfx_do_recvmsg+0x10/0x10 [libceph] ceph_con_process_message+0x1f1/0x650 [libceph] process_message+0x1e/0x450 [libceph] ceph_con_v2_try_read+0x2e48/0x6c80 [libceph] ? __pfx_ceph_con_v2_try_read+0x10/0x10 [libceph] ? save_fpregs_to_fpstate+0xb0/0x230 ? raw_spin_rq_unlock+0x17/0xa0 ? finish_task_switch.isra.0+0x13b/0x760 ? __switch_to+0x385/0xda0 ? __kasan_check_write+0x14/0x30 ? mutex_lock+0x8d/0xe0 ? __pfx_mutex_lock+0x10/0x10 ceph_con_workfn+0x248/0x10c0 [libceph] process_one_work+0x629/0xf80 ? __kasan_check_write+0x14/0x30 worker_thread+0x87f/0x1570 ? __pfx__raw_spin_lock_irqsave+0x10/0x10 ? __pfx_try_to_wake_up+0x10/0x10 ? kasan_print_address_stack_frame+0x1f7/0x280 ? __pfx_worker_thread+0x10/0x10 kthread+0x396/0x830 ? __pfx__raw_spin_lock_irq+0x10/0x10 ? __pfx_kthread+0x10/0x10 ? __kasan_check_write+0x14/0x30 ? recalc_sigpending+0x180/0x210 ? __pfx_kthread+0x10/0x10 ret_from_fork+0x3f7/0x610 ? __pfx_ret_from_fork+0x10/0x10 ? __switch_to+0x385/0xda0 ? __pfx_kthread+0x10/0x10 ret_from_fork_asm+0x1a/0x30 [ idryomov: replace if statements with ceph_decode_need() for payload_len and result_msg_len ] Cc: stable@vger.kernel.org Signed-off-by: Raphael Zimmer Reviewed-by: Viacheslav Dubeyko Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- net/ceph/auth.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ceph/auth.c b/net/ceph/auth.c index 343c841784ce..901b93530b21 100644 --- a/net/ceph/auth.c +++ b/net/ceph/auth.c @@ -205,9 +205,9 @@ int ceph_handle_auth_reply(struct ceph_auth_client *ac, s32 result; u64 global_id; void *payload, *payload_end; - int payload_len; + u32 payload_len; char *result_msg; - int result_msg_len; + u32 result_msg_len; int ret = -EINVAL; mutex_lock(&ac->mutex); @@ -217,10 +217,12 @@ int ceph_handle_auth_reply(struct ceph_auth_client *ac, result = ceph_decode_32(&p); global_id = ceph_decode_64(&p); payload_len = ceph_decode_32(&p); + ceph_decode_need(&p, end, payload_len, bad); payload = p; p += payload_len; ceph_decode_need(&p, end, sizeof(u32), bad); result_msg_len = ceph_decode_32(&p); + ceph_decode_need(&p, end, result_msg_len, bad); result_msg = p; p += result_msg_len; if (p != end) -- cgit v1.2.3 From 94a4b1f959989de9c54d43c3a102fb1ee92e1414 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sat, 7 Mar 2026 17:50:53 -0300 Subject: ipv6: move the disable_ipv6_mod knob to core code From: Jakub Kicinski Make sure disable_ipv6_mod itself is not part of the IPv6 module, in case core code wants to refer to it. We will remove support for IPv6=m soon, this change helps make fixes we commit before that less messy. Link: https://patch.msgid.link/20260307-net-nd_tbl_fixes-v4-1-e2677e85628c@suse.com Signed-off-by: Jakub Kicinski --- net/ipv4/af_inet.c | 6 ++++++ net/ipv6/af_inet6.c | 8 -------- 2 files changed, 6 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 8036e76aa1e4..c7731e300a44 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -124,6 +124,12 @@ #include +/* Keep the definition of IPv6 disable here for now, to avoid annoying linker + * issues in case IPv6=m + */ +int disable_ipv6_mod; +EXPORT_SYMBOL(disable_ipv6_mod); + /* The inetsw table contains everything that inet_create needs to * build a new socket. */ diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 23cc9b4cb2f1..4cbd45b68088 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -86,8 +86,6 @@ struct ipv6_params ipv6_defaults = { .autoconf = 1, }; -static int disable_ipv6_mod; - module_param_named(disable, disable_ipv6_mod, int, 0444); MODULE_PARM_DESC(disable, "Disable IPv6 module such that it is non-functional"); @@ -97,12 +95,6 @@ MODULE_PARM_DESC(disable_ipv6, "Disable IPv6 on all interfaces"); module_param_named(autoconf, ipv6_defaults.autoconf, int, 0444); MODULE_PARM_DESC(autoconf, "Enable IPv6 address autoconfiguration on all interfaces"); -bool ipv6_mod_enabled(void) -{ - return disable_ipv6_mod == 0; -} -EXPORT_SYMBOL_GPL(ipv6_mod_enabled); - static struct ipv6_pinfo *inet6_sk_generic(struct sock *sk) { const int offset = sk->sk_prot->ipv6_pinfo_offset; -- cgit v1.2.3 From dcb4e2231469523d20cf0a2477d68245795c205d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20B=2E=20Marli=C3=A8re?= Date: Sat, 7 Mar 2026 17:50:55 -0300 Subject: bpf: bpf_out_neigh_v4: Fix nd_tbl NULL dereference when IPv6 is disabled MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When booting with the 'ipv6.disable=1' parameter, the nd_tbl is never initialized because inet6_init() exits before ndisc_init() is called which initializes it. If bpf_redirect_neigh() is called from tc with an explicit nexthop of nh_family == AF_INET6, bpf_out_neigh_v4() takes the AF_INET6 branch and calls ip_neigh_gw6(), which relies on ipv6_stub->nd_tbl. BUG: kernel NULL pointer dereference, address: 0000000000000248 Oops: Oops: 0000 [#1] SMP NOPTI RIP: 0010:skb_do_redirect+0xb93/0xf00 Call Trace: ? srso_alias_return_thunk+0x5/0xfbef5 ? __tcf_classify.constprop.0+0x83/0x160 ? srso_alias_return_thunk+0x5/0xfbef5 ? tcf_classify+0x2b/0x50 ? srso_alias_return_thunk+0x5/0xfbef5 ? tc_run+0xb8/0x120 ? srso_alias_return_thunk+0x5/0xfbef5 __dev_queue_xmit+0x6fa/0x1000 ? srso_alias_return_thunk+0x5/0xfbef5 ? srso_alias_return_thunk+0x5/0xfbef5 ? alloc_skb_with_frags+0x58/0x200 packet_sendmsg+0x10da/0x1700 ? srso_alias_return_thunk+0x5/0xfbef5 __sys_sendto+0x1f3/0x220 __x64_sys_sendto+0x24/0x30 do_syscall_64+0x101/0xf80 ? exc_page_fault+0x6e/0x170 ? srso_alias_return_thunk+0x5/0xfbef5 entry_SYSCALL_64_after_hwframe+0x77/0x7f Fix this by adding an early check in the AF_INET6 branch of bpf_out_neigh_v4(). If IPv6 is disabled, unlock RCU and drop the packet. Suggested-by: Fernando Fernandez Mancera Fixes: ba452c9e996d ("bpf: Fix bpf_redirect_neigh helper api to support supplying nexthop") Signed-off-by: Ricardo B. Marlière Acked-by: Daniel Borkmann Link: https://patch.msgid.link/20260307-net-nd_tbl_fixes-v4-3-e2677e85628c@suse.com Signed-off-by: Jakub Kicinski --- net/core/filter.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index a77d23fe2359..fd38b6f8b7a8 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2335,6 +2335,10 @@ static int bpf_out_neigh_v4(struct net *net, struct sk_buff *skb, neigh = ip_neigh_for_gw(rt, skb, &is_v6gw); } else if (nh->nh_family == AF_INET6) { + if (unlikely(!ipv6_mod_enabled())) { + rcu_read_unlock(); + goto out_drop; + } neigh = ip_neigh_gw6(dev, &nh->ipv6_nh); is_v6gw = true; } else if (nh->nh_family == AF_INET) { -- cgit v1.2.3 From d56b5d163458c45ab8fa1f00bd875af01b3ce28c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20B=2E=20Marli=C3=A8re?= Date: Sat, 7 Mar 2026 17:50:56 -0300 Subject: bpf: bpf_out_neigh_v6: Fix nd_tbl NULL dereference when IPv6 is disabled MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When booting with the 'ipv6.disable=1' parameter, the nd_tbl is never initialized because inet6_init() exits before ndisc_init() is called which initializes it. If bpf_redirect_neigh() is called with explicit AF_INET6 nexthop parameters, __bpf_redirect_neigh_v6() can skip the IPv6 FIB lookup and call bpf_out_neigh_v6() directly. bpf_out_neigh_v6() then calls ip_neigh_gw6(), which uses ipv6_stub->nd_tbl. BUG: kernel NULL pointer dereference, address: 0000000000000248 Oops: Oops: 0000 [#1] SMP NOPTI RIP: 0010:skb_do_redirect+0x44f/0xf40 Call Trace: ? srso_alias_return_thunk+0x5/0xfbef5 ? __tcf_classify.constprop.0+0x83/0x160 ? srso_alias_return_thunk+0x5/0xfbef5 ? tcf_classify+0x2b/0x50 ? srso_alias_return_thunk+0x5/0xfbef5 ? tc_run+0xb8/0x120 ? srso_alias_return_thunk+0x5/0xfbef5 __dev_queue_xmit+0x6fa/0x1000 ? srso_alias_return_thunk+0x5/0xfbef5 packet_sendmsg+0x10da/0x1700 ? srso_alias_return_thunk+0x5/0xfbef5 __sys_sendto+0x1f3/0x220 __x64_sys_sendto+0x24/0x30 do_syscall_64+0x101/0xf80 ? exc_page_fault+0x6e/0x170 ? srso_alias_return_thunk+0x5/0xfbef5 entry_SYSCALL_64_after_hwframe+0x77/0x7f Fix this by adding an early check in bpf_out_neigh_v6(). If IPv6 is disabled, drop the packet before neighbor lookup. Suggested-by: Fernando Fernandez Mancera Fixes: ba452c9e996d ("bpf: Fix bpf_redirect_neigh helper api to support supplying nexthop") Signed-off-by: Ricardo B. Marlière Acked-by: Daniel Borkmann Link: https://patch.msgid.link/20260307-net-nd_tbl_fixes-v4-4-e2677e85628c@suse.com Signed-off-by: Jakub Kicinski --- net/core/filter.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index fd38b6f8b7a8..78b548158fb0 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2228,6 +2228,9 @@ static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb, return -ENOMEM; } + if (unlikely(!ipv6_mod_enabled())) + goto out_drop; + rcu_read_lock(); if (!nh) { dst = skb_dst(skb); -- cgit v1.2.3 From 6c5a9baa15de240e747263aba435a0951da8d8d2 Mon Sep 17 00:00:00 2001 From: Mehul Rao Date: Tue, 10 Mar 2026 13:07:30 -0400 Subject: tipc: fix divide-by-zero in tipc_sk_filter_connect() A user can set conn_timeout to any value via setsockopt(TIPC_CONN_TIMEOUT), including values less than 4. When a SYN is rejected with TIPC_ERR_OVERLOAD and the retry path in tipc_sk_filter_connect() executes: delay %= (tsk->conn_timeout / 4); If conn_timeout is in the range [0, 3], the integer division yields 0, and the modulo operation triggers a divide-by-zero exception, causing a kernel oops/panic. Fix this by clamping conn_timeout to a minimum of 4 at the point of use in tipc_sk_filter_connect(). Oops: divide error: 0000 [#1] SMP KASAN NOPTI CPU: 0 UID: 0 PID: 119 Comm: poc-F144 Not tainted 7.0.0-rc2+ RIP: 0010:tipc_sk_filter_rcv (net/tipc/socket.c:2236 net/tipc/socket.c:2362) Call Trace: tipc_sk_backlog_rcv (include/linux/instrumented.h:82 include/linux/atomic/atomic-instrumented.h:32 include/net/sock.h:2357 net/tipc/socket.c:2406) __release_sock (include/net/sock.h:1185 net/core/sock.c:3213) release_sock (net/core/sock.c:3797) tipc_connect (net/tipc/socket.c:2570) __sys_connect (include/linux/file.h:62 include/linux/file.h:83 net/socket.c:2098) Fixes: 6787927475e5 ("tipc: buffer overflow handling in listener socket") Cc: stable@vger.kernel.org Signed-off-by: Mehul Rao Reviewed-by: Tung Nguyen Link: https://patch.msgid.link/20260310170730.28841-1-mehulrao@gmail.com Signed-off-by: Jakub Kicinski --- net/tipc/socket.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 4c618c2b871d..9329919fb07f 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -2233,6 +2233,8 @@ static bool tipc_sk_filter_connect(struct tipc_sock *tsk, struct sk_buff *skb, if (skb_queue_empty(&sk->sk_write_queue)) break; get_random_bytes(&delay, 2); + if (tsk->conn_timeout < 4) + tsk->conn_timeout = 4; delay %= (tsk->conn_timeout / 4); delay = msecs_to_jiffies(delay + 100); sk_reset_timer(sk, &sk->sk_timer, jiffies + delay); -- cgit v1.2.3 From cbada1048847a348797aec63a1d8056621cbe653 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Tue, 10 Mar 2026 22:59:16 +0100 Subject: neighbour: restore protocol != 0 check in pneigh update Prior to commit dc2a27e524ac ("neighbour: Update pneigh_entry in pneigh_create()."), a pneigh's protocol was updated only when the value of the NDA_PROTOCOL attribute was non-0. While moving the code, that check was removed. This is a small change of user-visible behavior, and inconsistent with the (non-proxy) neighbour behavior. Fixes: dc2a27e524ac ("neighbour: Update pneigh_entry in pneigh_create().") Signed-off-by: Sabrina Dubroca Reviewed-by: David Ahern Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/38c61de1bb032871a886aff9b9b52fe1cdd4cada.1772894876.git.sd@queasysnail.net Signed-off-by: Jakub Kicinski --- net/core/neighbour.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/neighbour.c b/net/core/neighbour.c index a95cfe77f7f0..c56a4e7bf790 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -820,7 +820,8 @@ int pneigh_create(struct neigh_table *tbl, struct net *net, update: WRITE_ONCE(n->flags, flags); n->permanent = permanent; - WRITE_ONCE(n->protocol, protocol); + if (protocol) + WRITE_ONCE(n->protocol, protocol); out: mutex_unlock(&tbl->phash_lock); return err; -- cgit v1.2.3 From c38b8f5f791ecce13ab77e2257f8fd2444ba80f6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 12 Mar 2026 04:39:08 +0000 Subject: net: prevent NULL deref in ip[6]tunnel_xmit() Blamed commit missed that both functions can be called with dev == NULL. Also add unlikely() hints for these conditions that only fuzzers can hit. Fixes: 6f1a9140ecda ("net: add xmit recursion limit to tunnel xmit functions") Signed-off-by: Eric Dumazet CC: Weiming Shi Link: https://patch.msgid.link/20260312043908.2790803-1-edumazet@google.com Signed-off-by: Paolo Abeni --- net/ipv4/ip_tunnel_core.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index b1b6bf949f65..5683c328990f 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -58,10 +58,12 @@ void iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb, struct iphdr *iph; int err; - if (dev_recursion_level() > IP_TUNNEL_RECURSION_LIMIT) { - net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n", - dev->name); - DEV_STATS_INC(dev, tx_errors); + if (unlikely(dev_recursion_level() > IP_TUNNEL_RECURSION_LIMIT)) { + if (dev) { + net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n", + dev->name); + DEV_STATS_INC(dev, tx_errors); + } ip_rt_put(rt); kfree_skb(skb); return; -- cgit v1.2.3