From 1dbfb0363224f6da56f6655d596dc5097308d6f5 Mon Sep 17 00:00:00 2001 From: Alok Tiwari Date: Fri, 5 Sep 2025 06:57:27 -0700 Subject: genetlink: fix genl_bind() invoking bind() after -EPERM Per family bind/unbind callbacks were introduced to allow families to track multicast group consumer presence, e.g. to start or stop producing events depending on listeners. However, in genl_bind() the bind() callback was invoked even if capability checks failed and ret was set to -EPERM. This means that callbacks could run on behalf of unauthorized callers while the syscall still returned failure to user space. Fix this by only invoking bind() after "if (ret) break;" check i.e. after permission checks have succeeded. Fixes: 3de21a8990d3 ("genetlink: Add per family bind/unbind callbacks") Signed-off-by: Alok Tiwari Link: https://patch.msgid.link/20250905135731.3026965-1-alok.a.tiwari@oracle.com Signed-off-by: Jakub Kicinski --- net/netlink/genetlink.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index 104732d34543..978c129c6095 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -1836,6 +1836,9 @@ static int genl_bind(struct net *net, int group) !ns_capable(net->user_ns, CAP_SYS_ADMIN)) ret = -EPERM; + if (ret) + break; + if (family->bind) family->bind(i); -- cgit v1.2.3 From 8625f5748fea960d2af4f3c3e9891ee8f6f80906 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Fri, 5 Sep 2025 13:12:33 +0200 Subject: net: bridge: Bounce invalid boolopts The bridge driver currently tolerates options that it does not recognize. Instead, it should bounce them. Fixes: a428afe82f98 ("net: bridge: add support for user-controlled bool options") Signed-off-by: Petr Machata Reviewed-by: Ido Schimmel Acked-by: Nikolay Aleksandrov Link: https://patch.msgid.link/e6fdca3b5a8d54183fbda075daffef38bdd7ddce.1757070067.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- net/bridge/br.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'net') diff --git a/net/bridge/br.c b/net/bridge/br.c index 1885d0c315f0..c683baa3847f 100644 --- a/net/bridge/br.c +++ b/net/bridge/br.c @@ -324,6 +324,13 @@ int br_boolopt_multi_toggle(struct net_bridge *br, int err = 0; int opt_id; + opt_id = find_next_bit(&bitmap, BITS_PER_LONG, BR_BOOLOPT_MAX); + if (opt_id != BITS_PER_LONG) { + NL_SET_ERR_MSG_FMT_MOD(extack, "Unknown boolean option %d", + opt_id); + return -EINVAL; + } + for_each_set_bit(opt_id, &bitmap, BR_BOOLOPT_MAX) { bool on = !!(bm->optval & BIT(opt_id)); -- cgit v1.2.3 From e3c674db356c4303804b2415e7c2b11776cdd8c3 Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Thu, 4 Sep 2025 14:53:50 +0200 Subject: tunnels: reset the GSO metadata before reusing the skb If a GSO skb is sent through a Geneve tunnel and if Geneve options are added, the split GSO skb might not fit in the MTU anymore and an ICMP frag needed packet can be generated. In such case the ICMP packet might go through the segmentation logic (and dropped) later if it reaches a path were the GSO status is checked and segmentation is required. This is especially true when an OvS bridge is used with a Geneve tunnel attached to it. The following set of actions could lead to the ICMP packet being wrongfully segmented: 1. An skb is constructed by the TCP layer (e.g. gso_type SKB_GSO_TCPV4, segs >= 2). 2. The skb hits the OvS bridge where Geneve options are added by an OvS action before being sent through the tunnel. 3. When the skb is xmited in the tunnel, the split skb does not fit anymore in the MTU and iptunnel_pmtud_build_icmp is called to generate an ICMP fragmentation needed packet. This is done by reusing the original (GSO!) skb. The GSO metadata is not cleared. 4. The ICMP packet being sent back hits the OvS bridge again and because skb_is_gso returns true, it goes through queue_gso_packets... 5. ...where __skb_gso_segment is called. The skb is then dropped. 6. Note that in the above example on re-transmission the skb won't be a GSO one as it would be segmented (len > MSS) and the ICMP packet should go through. Fix this by resetting the GSO information before reusing an skb in iptunnel_pmtud_build_icmp and iptunnel_pmtud_build_icmpv6. Fixes: 4cb47a8644cc ("tunnels: PMTU discovery support for directly bridged IP packets") Reported-by: Adrian Moreno Signed-off-by: Antoine Tenart Reviewed-by: Stefano Brivio Link: https://patch.msgid.link/20250904125351.159740-1-atenart@kernel.org Signed-off-by: Paolo Abeni --- net/ipv4/ip_tunnel_core.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net') diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index cc9915543637..2e61ac137128 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -206,6 +206,9 @@ static int iptunnel_pmtud_build_icmp(struct sk_buff *skb, int mtu) if (!pskb_may_pull(skb, ETH_HLEN + sizeof(struct iphdr))) return -EINVAL; + if (skb_is_gso(skb)) + skb_gso_reset(skb); + skb_copy_bits(skb, skb_mac_offset(skb), &eh, ETH_HLEN); pskb_pull(skb, ETH_HLEN); skb_reset_network_header(skb); @@ -300,6 +303,9 @@ static int iptunnel_pmtud_build_icmpv6(struct sk_buff *skb, int mtu) if (!pskb_may_pull(skb, ETH_HLEN + sizeof(struct ipv6hdr))) return -EINVAL; + if (skb_is_gso(skb)) + skb_gso_reset(skb); + skb_copy_bits(skb, skb_mac_offset(skb), &eh, ETH_HLEN); pskb_pull(skb, ETH_HLEN); skb_reset_network_header(skb); -- cgit v1.2.3 From 686cab5a18e443e1d5f2abb17bed45837836425f Mon Sep 17 00:00:00 2001 From: Carolina Jubran Date: Sun, 7 Sep 2025 11:08:21 +0300 Subject: net: dev_ioctl: take ops lock in hwtstamp lower paths ndo hwtstamp callbacks are expected to run under the per-device ops lock. Make the lower get/set paths consistent with the rest of ndo invocations. Kernel log: WARNING: CPU: 13 PID: 51364 at ./include/net/netdev_lock.h:70 __netdev_update_features+0x4bd/0xe60 ... RIP: 0010:__netdev_update_features+0x4bd/0xe60 ... Call Trace: netdev_update_features+0x1f/0x60 mlx5_hwtstamp_set+0x181/0x290 [mlx5_core] mlx5e_hwtstamp_set+0x19/0x30 [mlx5_core] dev_set_hwtstamp_phylib+0x9f/0x220 dev_set_hwtstamp_phylib+0x9f/0x220 dev_set_hwtstamp+0x13d/0x240 dev_ioctl+0x12f/0x4b0 sock_ioctl+0x171/0x370 __x64_sys_ioctl+0x3f7/0x900 ? __sys_setsockopt+0x69/0xb0 do_syscall_64+0x6f/0x2e0 entry_SYSCALL_64_after_hwframe+0x4b/0x53 ... .... ---[ end trace 0000000000000000 ]--- Note that the mlx5_hwtstamp_set and mlx5e_hwtstamp_set functions shown in the trace come from an in progress patch converting the legacy ioctl to ndo_hwtstamp_get/set and are not present in mainline. Fixes: ffb7ed19ac0a ("net: hold netdev instance lock during ioctl operations") Signed-off-by: Carolina Jubran Reviewed-by: Cosmin Ratiu Reviewed-by: Dragos Tatulea Link: https://patch.msgid.link/20250907080821.2353388-1-cjubran@nvidia.com Signed-off-by: Jakub Kicinski --- net/core/dev_ioctl.c | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c index 9c0ad7f4b5d8..ad54b12d4b4c 100644 --- a/net/core/dev_ioctl.c +++ b/net/core/dev_ioctl.c @@ -464,8 +464,15 @@ int generic_hwtstamp_get_lower(struct net_device *dev, if (!netif_device_present(dev)) return -ENODEV; - if (ops->ndo_hwtstamp_get) - return dev_get_hwtstamp_phylib(dev, kernel_cfg); + if (ops->ndo_hwtstamp_get) { + int err; + + netdev_lock_ops(dev); + err = dev_get_hwtstamp_phylib(dev, kernel_cfg); + netdev_unlock_ops(dev); + + return err; + } /* Legacy path: unconverted lower driver */ return generic_hwtstamp_ioctl_lower(dev, SIOCGHWTSTAMP, kernel_cfg); @@ -481,8 +488,15 @@ int generic_hwtstamp_set_lower(struct net_device *dev, if (!netif_device_present(dev)) return -ENODEV; - if (ops->ndo_hwtstamp_set) - return dev_set_hwtstamp_phylib(dev, kernel_cfg, extack); + if (ops->ndo_hwtstamp_set) { + int err; + + netdev_lock_ops(dev); + err = dev_set_hwtstamp_phylib(dev, kernel_cfg, extack); + netdev_unlock_ops(dev); + + return err; + } /* Legacy path: unconverted lower driver */ return generic_hwtstamp_ioctl_lower(dev, SIOCSHWTSTAMP, kernel_cfg); -- cgit v1.2.3 From 648de37416b301f046f62f1b65715c7fa8ebaa67 Mon Sep 17 00:00:00 2001 From: Krister Johansen Date: Mon, 8 Sep 2025 11:16:01 -0700 Subject: mptcp: sockopt: make sync_socket_options propagate SOCK_KEEPOPEN Users reported a scenario where MPTCP connections that were configured with SO_KEEPALIVE prior to connect would fail to enable their keepalives if MTPCP fell back to TCP mode. After investigating, this affects keepalives for any connection where sync_socket_options is called on a socket that is in the closed or listening state. Joins are handled properly. For connects, sync_socket_options is called when the socket is still in the closed state. The tcp_set_keepalive() function does not act on sockets that are closed or listening, hence keepalive is not immediately enabled. Since the SO_KEEPOPEN flag is absent, it is not enabled later in the connect sequence via tcp_finish_connect. Setting the keepalive via sockopt after connect does work, but would not address any subsequently created flows. Fortunately, the fix here is straight-forward: set SOCK_KEEPOPEN on the subflow when calling sync_socket_options. The fix was valdidated both by using tcpdump to observe keepalive packets not being sent before the fix, and being sent after the fix. It was also possible to observe via ss that the keepalive timer was not enabled on these sockets before the fix, but was enabled afterwards. Fixes: 1b3e7ede1365 ("mptcp: setsockopt: handle SO_KEEPALIVE and SO_PRIORITY") Cc: stable@vger.kernel.org Signed-off-by: Krister Johansen Reviewed-by: Geliang Tang Reviewed-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/aL8dYfPZrwedCIh9@templeofstupid.com Signed-off-by: Jakub Kicinski --- net/mptcp/sockopt.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c index 2c267aff95be..2abe6f1e9940 100644 --- a/net/mptcp/sockopt.c +++ b/net/mptcp/sockopt.c @@ -1532,13 +1532,12 @@ static void sync_socket_options(struct mptcp_sock *msk, struct sock *ssk) { static const unsigned int tx_rx_locks = SOCK_RCVBUF_LOCK | SOCK_SNDBUF_LOCK; struct sock *sk = (struct sock *)msk; + bool keep_open; - if (ssk->sk_prot->keepalive) { - if (sock_flag(sk, SOCK_KEEPOPEN)) - ssk->sk_prot->keepalive(ssk, 1); - else - ssk->sk_prot->keepalive(ssk, 0); - } + keep_open = sock_flag(sk, SOCK_KEEPOPEN); + if (ssk->sk_prot->keepalive) + ssk->sk_prot->keepalive(ssk, keep_open); + sock_valbool_flag(ssk, SOCK_KEEPOPEN, keep_open); ssk->sk_priority = sk->sk_priority; ssk->sk_bound_dev_if = sk->sk_bound_dev_if; -- cgit v1.2.3 From 8cc71fc3b82b51e155fbe20876b1aa17a315ac4c Mon Sep 17 00:00:00 2001 From: Nithyanantham Paramasivam Date: Fri, 5 Sep 2025 18:18:00 +0530 Subject: wifi: cfg80211: Fix "no buffer space available" error in nl80211_get_station() for MLO Currently, nl80211_get_station() allocates a fixed buffer size using NLMSG_DEFAULT_SIZE. In multi-link scenarios - particularly when the number of links exceeds two - this buffer size is often insufficient to accommodate complete station statistics, resulting in "no buffer space available" errors. To address this, modify nl80211_get_station() to return only accumulated station statistics and exclude per link stats. Pass a new flag (link_stats) to nl80211_send_station() to control the inclusion of per link statistics. This allows retaining detailed output with per link data in dump commands, while excluding it from other commands where it is not needed. This change modifies the handling of per link stats introduced in commit 82d7f841d9bd ("wifi: cfg80211: extend to embed link level statistics in NL message") to enable them only for nl80211_dump_station(). Apply the same fix to cfg80211_del_sta_sinfo() by skipping per link stats to avoid buffer issues. cfg80211_new_sta() doesn't include stats and is therefore not impacted. Fixes: 82d7f841d9bd ("wifi: cfg80211: extend to embed link level statistics in NL message") Signed-off-by: Nithyanantham Paramasivam Link: https://patch.msgid.link/20250905124800.1448493-1-nithyanantham.paramasivam@oss.qualcomm.com Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 89519aa52893..f2f7424e930c 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -7062,7 +7062,8 @@ static int nl80211_send_station(struct sk_buff *msg, u32 cmd, u32 portid, u32 seq, int flags, struct cfg80211_registered_device *rdev, struct net_device *dev, - const u8 *mac_addr, struct station_info *sinfo) + const u8 *mac_addr, struct station_info *sinfo, + bool link_stats) { void *hdr; struct nlattr *sinfoattr, *bss_param; @@ -7283,7 +7284,7 @@ static int nl80211_send_station(struct sk_buff *msg, u32 cmd, u32 portid, goto nla_put_failure; } - if (sinfo->valid_links) { + if (link_stats && sinfo->valid_links) { links = nla_nest_start(msg, NL80211_ATTR_MLO_LINKS); if (!links) goto nla_put_failure; @@ -7574,7 +7575,7 @@ static int nl80211_dump_station(struct sk_buff *skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, rdev, wdev->netdev, mac_addr, - &sinfo) < 0) + &sinfo, true) < 0) goto out; sta_idx++; @@ -7635,7 +7636,7 @@ static int nl80211_get_station(struct sk_buff *skb, struct genl_info *info) if (nl80211_send_station(msg, NL80211_CMD_NEW_STATION, info->snd_portid, info->snd_seq, 0, - rdev, dev, mac_addr, &sinfo) < 0) { + rdev, dev, mac_addr, &sinfo, false) < 0) { nlmsg_free(msg); return -ENOBUFS; } @@ -19680,7 +19681,7 @@ void cfg80211_new_sta(struct net_device *dev, const u8 *mac_addr, return; if (nl80211_send_station(msg, NL80211_CMD_NEW_STATION, 0, 0, 0, - rdev, dev, mac_addr, sinfo) < 0) { + rdev, dev, mac_addr, sinfo, false) < 0) { nlmsg_free(msg); return; } @@ -19710,7 +19711,7 @@ void cfg80211_del_sta_sinfo(struct net_device *dev, const u8 *mac_addr, } if (nl80211_send_station(msg, NL80211_CMD_DEL_STATION, 0, 0, 0, - rdev, dev, mac_addr, sinfo) < 0) { + rdev, dev, mac_addr, sinfo, false) < 0) { nlmsg_free(msg); return; } -- cgit v1.2.3 From 7fcbe5b2c6a4b5407bf2241fdb71e0a390f6ab9a Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Mon, 25 Aug 2025 23:07:24 +0900 Subject: can: j1939: implement NETDEV_UNREGISTER notification handler syzbot is reporting unregister_netdevice: waiting for vcan0 to become free. Usage count = 2 problem, for j1939 protocol did not have NETDEV_UNREGISTER notification handler for undoing changes made by j1939_sk_bind(). Commit 25fe97cb7620 ("can: j1939: move j1939_priv_put() into sk_destruct callback") expects that a call to j1939_priv_put() can be unconditionally delayed until j1939_sk_sock_destruct() is called. But we need to call j1939_priv_put() against an extra ref held by j1939_sk_bind() call (as a part of undoing changes made by j1939_sk_bind()) as soon as NETDEV_UNREGISTER notification fires (i.e. before j1939_sk_sock_destruct() is called via j1939_sk_release()). Otherwise, the extra ref on "struct j1939_priv" held by j1939_sk_bind() call prevents "struct net_device" from dropping the usage count to 1; making it impossible for unregister_netdevice() to continue. Reported-by: syzbot Closes: https://syzkaller.appspot.com/bug?extid=881d65229ca4f9ae8c84 Tested-by: syzbot Fixes: 9d71dd0c7009 ("can: add support of SAE J1939 protocol") Fixes: 25fe97cb7620 ("can: j1939: move j1939_priv_put() into sk_destruct callback") Signed-off-by: Tetsuo Handa Tested-by: Oleksij Rempel Acked-by: Oleksij Rempel Link: https://patch.msgid.link/ac9db9a4-6c30-416e-8b94-96e6559d55b2@I-love.SAKURA.ne.jp [mkl: remove space in front of label] Signed-off-by: Marc Kleine-Budde --- net/can/j1939/j1939-priv.h | 1 + net/can/j1939/main.c | 3 +++ net/can/j1939/socket.c | 49 ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 53 insertions(+) (limited to 'net') diff --git a/net/can/j1939/j1939-priv.h b/net/can/j1939/j1939-priv.h index 31a93cae5111..81f58924b4ac 100644 --- a/net/can/j1939/j1939-priv.h +++ b/net/can/j1939/j1939-priv.h @@ -212,6 +212,7 @@ void j1939_priv_get(struct j1939_priv *priv); /* notify/alert all j1939 sockets bound to ifindex */ void j1939_sk_netdev_event_netdown(struct j1939_priv *priv); +void j1939_sk_netdev_event_unregister(struct j1939_priv *priv); int j1939_cancel_active_session(struct j1939_priv *priv, struct sock *sk); void j1939_tp_init(struct j1939_priv *priv); diff --git a/net/can/j1939/main.c b/net/can/j1939/main.c index 7e8a20f2fc42..3706a872ecaf 100644 --- a/net/can/j1939/main.c +++ b/net/can/j1939/main.c @@ -377,6 +377,9 @@ static int j1939_netdev_notify(struct notifier_block *nb, j1939_sk_netdev_event_netdown(priv); j1939_ecu_unmap_all(priv); break; + case NETDEV_UNREGISTER: + j1939_sk_netdev_event_unregister(priv); + break; } j1939_priv_put(priv); diff --git a/net/can/j1939/socket.c b/net/can/j1939/socket.c index 3d8b588822f9..70ebc861ea2a 100644 --- a/net/can/j1939/socket.c +++ b/net/can/j1939/socket.c @@ -1300,6 +1300,55 @@ void j1939_sk_netdev_event_netdown(struct j1939_priv *priv) read_unlock_bh(&priv->j1939_socks_lock); } +void j1939_sk_netdev_event_unregister(struct j1939_priv *priv) +{ + struct sock *sk; + struct j1939_sock *jsk; + bool wait_rcu = false; + +rescan: /* The caller is holding a ref on this "priv" via j1939_priv_get_by_ndev(). */ + read_lock_bh(&priv->j1939_socks_lock); + list_for_each_entry(jsk, &priv->j1939_socks, list) { + /* Skip if j1939_jsk_add() is not called on this socket. */ + if (!(jsk->state & J1939_SOCK_BOUND)) + continue; + sk = &jsk->sk; + sock_hold(sk); + read_unlock_bh(&priv->j1939_socks_lock); + /* Check if j1939_jsk_del() is not yet called on this socket after holding + * socket's lock, for both j1939_sk_bind() and j1939_sk_release() call + * j1939_jsk_del() with socket's lock held. + */ + lock_sock(sk); + if (jsk->state & J1939_SOCK_BOUND) { + /* Neither j1939_sk_bind() nor j1939_sk_release() called j1939_jsk_del(). + * Make this socket no longer bound, by pretending as if j1939_sk_bind() + * dropped old references but did not get new references. + */ + j1939_jsk_del(priv, jsk); + j1939_local_ecu_put(priv, jsk->addr.src_name, jsk->addr.sa); + j1939_netdev_stop(priv); + /* Call j1939_priv_put() now and prevent j1939_sk_sock_destruct() from + * calling the corresponding j1939_priv_put(). + * + * j1939_sk_sock_destruct() is supposed to call j1939_priv_put() after + * an RCU grace period. But since the caller is holding a ref on this + * "priv", we can defer synchronize_rcu() until immediately before + * the caller calls j1939_priv_put(). + */ + j1939_priv_put(priv); + jsk->priv = NULL; + wait_rcu = true; + } + release_sock(sk); + sock_put(sk); + goto rescan; + } + read_unlock_bh(&priv->j1939_socks_lock); + if (wait_rcu) + synchronize_rcu(); +} + static int j1939_sk_no_ioctlcmd(struct socket *sock, unsigned int cmd, unsigned long arg) { -- cgit v1.2.3 From f214744c8a27c3c1da6b538c232da22cd027530e Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Sun, 24 Aug 2025 19:30:09 +0900 Subject: can: j1939: j1939_sk_bind(): call j1939_priv_put() immediately when j1939_local_ecu_get() failed Commit 25fe97cb7620 ("can: j1939: move j1939_priv_put() into sk_destruct callback") expects that a call to j1939_priv_put() can be unconditionally delayed until j1939_sk_sock_destruct() is called. But a refcount leak will happen when j1939_sk_bind() is called again after j1939_local_ecu_get() from previous j1939_sk_bind() call returned an error. We need to call j1939_priv_put() before j1939_sk_bind() returns an error. Fixes: 25fe97cb7620 ("can: j1939: move j1939_priv_put() into sk_destruct callback") Signed-off-by: Tetsuo Handa Tested-by: Oleksij Rempel Acked-by: Oleksij Rempel Link: https://patch.msgid.link/4f49a1bc-a528-42ad-86c0-187268ab6535@I-love.SAKURA.ne.jp Signed-off-by: Marc Kleine-Budde --- net/can/j1939/socket.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/can/j1939/socket.c b/net/can/j1939/socket.c index 70ebc861ea2a..88e7160d4248 100644 --- a/net/can/j1939/socket.c +++ b/net/can/j1939/socket.c @@ -521,6 +521,9 @@ static int j1939_sk_bind(struct socket *sock, struct sockaddr *uaddr, int len) ret = j1939_local_ecu_get(priv, jsk->addr.src_name, jsk->addr.sa); if (ret) { j1939_netdev_stop(priv); + jsk->priv = NULL; + synchronize_rcu(); + j1939_priv_put(priv); goto out_release_sock; } -- cgit v1.2.3 From 06e02da29f6f1a45fc07bd60c7eaf172dc21e334 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Sun, 24 Aug 2025 19:27:40 +0900 Subject: can: j1939: j1939_local_ecu_get(): undo increment when j1939_local_ecu_get() fails Since j1939_sk_bind() and j1939_sk_release() call j1939_local_ecu_put() when J1939_SOCK_BOUND was already set, but the error handling path for j1939_sk_bind() will not set J1939_SOCK_BOUND when j1939_local_ecu_get() fails, j1939_local_ecu_get() needs to undo priv->ents[sa].nusers++ when j1939_local_ecu_get() returns an error. Fixes: 9d71dd0c7009 ("can: add support of SAE J1939 protocol") Signed-off-by: Tetsuo Handa Tested-by: Oleksij Rempel Acked-by: Oleksij Rempel Link: https://patch.msgid.link/e7f80046-4ff7-4ce2-8ad8-7c3c678a42c9@I-love.SAKURA.ne.jp Signed-off-by: Marc Kleine-Budde --- net/can/j1939/bus.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/can/j1939/bus.c b/net/can/j1939/bus.c index 39844f14eed8..797719cb227e 100644 --- a/net/can/j1939/bus.c +++ b/net/can/j1939/bus.c @@ -290,8 +290,11 @@ int j1939_local_ecu_get(struct j1939_priv *priv, name_t name, u8 sa) if (!ecu) ecu = j1939_ecu_create_locked(priv, name); err = PTR_ERR_OR_ZERO(ecu); - if (err) + if (err) { + if (j1939_address_is_unicast(sa)) + priv->ents[sa].nusers--; goto done; + } ecu->nusers++; /* TODO: do we care if ecu->addr != sa? */ -- cgit v1.2.3 From 5e13f2c491a4100d208e77e92fe577fe3dbad6c2 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 9 Sep 2025 14:45:21 +0200 Subject: netfilter: nft_set_bitmap: fix lockdep splat due to missing annotation Running new 'set_flush_add_atomic_bitmap' test case for nftables.git with CONFIG_PROVE_RCU_LIST=y yields: net/netfilter/nft_set_bitmap.c:231 RCU-list traversed in non-reader section!! rcu_scheduler_active = 2, debug_locks = 1 1 lock held by nft/4008: #0: ffff888147f79cd8 (&nft_net->commit_mutex){+.+.}-{4:4}, at: nf_tables_valid_genid+0x2f/0xd0 lockdep_rcu_suspicious+0x116/0x160 nft_bitmap_walk+0x22d/0x240 nf_tables_delsetelem+0x1010/0x1a00 .. This is a false positive, the list cannot be altered while the transaction mutex is held, so pass the relevant argument to the iterator. Fixes tag intentionally wrong; no point in picking this up if earlier false-positive-fixups were not applied. Fixes: 28b7a6b84c0a ("netfilter: nf_tables: avoid false-positive lockdep splats in set walker") Signed-off-by: Florian Westphal --- net/netfilter/nft_set_bitmap.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nft_set_bitmap.c b/net/netfilter/nft_set_bitmap.c index c24c922f895d..8d3f040a904a 100644 --- a/net/netfilter/nft_set_bitmap.c +++ b/net/netfilter/nft_set_bitmap.c @@ -226,7 +226,8 @@ static void nft_bitmap_walk(const struct nft_ctx *ctx, const struct nft_bitmap *priv = nft_set_priv(set); struct nft_bitmap_elem *be; - list_for_each_entry_rcu(be, &priv->list, head) { + list_for_each_entry_rcu(be, &priv->list, head, + lockdep_is_held(&nft_pernet(ctx->net)->commit_mutex)) { if (iter->count < iter->skip) goto cont; -- cgit v1.2.3 From c4eaca2e1052adfd67bed0a36a9d4b8e515666e4 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 10 Sep 2025 10:02:18 +0200 Subject: netfilter: nft_set_pipapo: don't check genbit from packetpath lookups The pipapo set type is special in that it has two copies of its datastructure: one live copy containing only valid elements and one on-demand clone used during transaction where adds/deletes happen. This clone is not visible to the datapath. This is unlike all other set types in nftables, those all link new elements into their live hlist/tree. For those sets, the lookup functions must skip the new elements while the transaction is ongoing to ensure consistency. As the clone is shallow, removal does have an effect on the packet path: once the transaction enters the commit phase the 'gencursor' bit that determines which elements are active and which elements should be ignored (because they are no longer valid) is flipped. This causes the datapath lookup to ignore these elements if they are found during lookup. This opens up a small race window where pipapo has an inconsistent view of the dataset from when the transaction-cpu flipped the genbit until the transaction-cpu calls nft_pipapo_commit() to swap live/clone pointers: cpu0 cpu1 has added new elements to clone has marked elements as being inactive in new generation perform lookup in the set enters commit phase: I) increments the genbit A) observes new genbit removes elements from the clone so they won't be found anymore B) lookup in datastructure can't see new elements yet, but old elements are ignored -> Only matches elements that were not changed in the transaction II) calls nft_pipapo_commit(), clone and live pointers are swapped. C New nft_lookup happening now will find matching elements. Consider a packet matching range r1-r2: cpu0 processes following transaction: 1. remove r1-r2 2. add r1-r3 P is contained in both ranges. Therefore, cpu1 should always find a match for P. Due to above race, this is not the case: cpu1 does find r1-r2, but then ignores it due to the genbit indicating the range has been removed. At the same time, r1-r3 is not visible yet, because it can only be found in the clone. The situation persists for all lookups until after cpu0 hits II). The fix is easy: Don't check the genbit from pipapo lookup functions. This is possible because unlike the other set types, the new elements are not reachable from the live copy of the dataset. The clone/live pointer swap is enough to avoid matching on old elements while at the same time all new elements are exposed in one go. After this change, step B above returns a match in r1-r2. This is fine: r1-r2 only becomes truly invalid the moment they get freed. This happens after a synchronize_rcu() call and rcu read lock is held via netfilter hook traversal (nf_hook_slow()). Cc: Stefano Brivio Fixes: 3c4287f62044 ("nf_tables: Add set type for arbitrary concatenation of ranges") Signed-off-by: Florian Westphal --- net/netfilter/nft_set_pipapo.c | 20 ++++++++++++++++++-- net/netfilter/nft_set_pipapo_avx2.c | 4 +--- 2 files changed, 19 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c index 9a10251228fd..793790d79d13 100644 --- a/net/netfilter/nft_set_pipapo.c +++ b/net/netfilter/nft_set_pipapo.c @@ -510,6 +510,23 @@ out: * * This function is called from the data path. It will search for * an element matching the given key in the current active copy. + * Unlike other set types, this uses NFT_GENMASK_ANY instead of + * nft_genmask_cur(). + * + * This is because new (future) elements are not reachable from + * priv->match, they get added to priv->clone instead. + * When the commit phase flips the generation bitmask, the + * 'now old' entries are skipped but without the 'now current' + * elements becoming visible. Using nft_genmask_cur() thus creates + * inconsistent state: matching old entries get skipped but thew + * newly matching entries are unreachable. + * + * GENMASK will still find the 'now old' entries which ensures consistent + * priv->match view. + * + * nft_pipapo_commit swaps ->clone and ->match shortly after the + * genbit flip. As ->clone doesn't contain the old entries in the first + * place, lookup will only find the now-current ones. * * Return: ntables API extension pointer or NULL if no match. */ @@ -518,12 +535,11 @@ nft_pipapo_lookup(const struct net *net, const struct nft_set *set, const u32 *key) { struct nft_pipapo *priv = nft_set_priv(set); - u8 genmask = nft_genmask_cur(net); const struct nft_pipapo_match *m; const struct nft_pipapo_elem *e; m = rcu_dereference(priv->match); - e = pipapo_get(m, (const u8 *)key, genmask, get_jiffies_64()); + e = pipapo_get(m, (const u8 *)key, NFT_GENMASK_ANY, get_jiffies_64()); return e ? &e->ext : NULL; } diff --git a/net/netfilter/nft_set_pipapo_avx2.c b/net/netfilter/nft_set_pipapo_avx2.c index 2f090e253caf..c0884fa68c79 100644 --- a/net/netfilter/nft_set_pipapo_avx2.c +++ b/net/netfilter/nft_set_pipapo_avx2.c @@ -1152,7 +1152,6 @@ nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set, struct nft_pipapo *priv = nft_set_priv(set); const struct nft_set_ext *ext = NULL; struct nft_pipapo_scratch *scratch; - u8 genmask = nft_genmask_cur(net); const struct nft_pipapo_match *m; const struct nft_pipapo_field *f; const u8 *rp = (const u8 *)key; @@ -1248,8 +1247,7 @@ next_match: if (last) { const struct nft_set_ext *e = &f->mt[ret].e->ext; - if (unlikely(nft_set_elem_expired(e) || - !nft_set_elem_active(e, genmask))) + if (unlikely(nft_set_elem_expired(e))) goto next_match; ext = e; -- cgit v1.2.3 From a60f7bf4a1524d8896b76ba89623080aebf44272 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 10 Sep 2025 10:02:19 +0200 Subject: netfilter: nft_set_rbtree: continue traversal if element is inactive When the rbtree lookup function finds a match in the rbtree, it sets the range start interval to a potentially inactive element. Then, after tree lookup, if the matching element is inactive, it returns NULL and suppresses a matching result. This is wrong and leads to false negative matches when a transaction has already entered the commit phase. cpu0 cpu1 has added new elements to clone has marked elements as being inactive in new generation perform lookup in the set enters commit phase: I) increments the genbit A) observes new genbit B) finds matching range C) returns no match: found range invalid in new generation II) removes old elements from the tree C New nft_lookup happening now will find matching element, because it is no longer obscured by old, inactive one. Consider a packet matching range r1-r2: cpu0 processes following transaction: 1. remove r1-r2 2. add r1-r3 P is contained in both ranges. Therefore, cpu1 should always find a match for P. Due to above race, this is not the case: cpu1 does find r1-r2, but then ignores it due to the genbit indicating the range has been removed. It does NOT test for further matches. The situation persists for all lookups until after cpu0 hits II) after which r1-r3 range start node is tested for the first time. Move the "interval start is valid" check ahead so that tree traversal continues if the starting interval is not valid in this generation. Thanks to Stefan Hanreich for providing an initial reproducer for this bug. Reported-by: Stefan Hanreich Fixes: c1eda3c6394f ("netfilter: nft_rbtree: ignore inactive matching element with no descendants") Signed-off-by: Florian Westphal --- net/netfilter/nft_set_rbtree.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index 938a257c069e..b1f04168ec93 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -77,7 +77,9 @@ __nft_rbtree_lookup(const struct net *net, const struct nft_set *set, nft_rbtree_interval_end(rbe) && nft_rbtree_interval_start(interval)) continue; - interval = rbe; + if (nft_set_elem_active(&rbe->ext, genmask) && + !nft_rbtree_elem_expired(rbe)) + interval = rbe; } else if (d > 0) parent = rcu_dereference_raw(parent->rb_right); else { @@ -102,8 +104,6 @@ __nft_rbtree_lookup(const struct net *net, const struct nft_set *set, } if (set->flags & NFT_SET_INTERVAL && interval != NULL && - nft_set_elem_active(&interval->ext, genmask) && - !nft_rbtree_elem_expired(interval) && nft_rbtree_interval_start(interval)) return &interval->ext; -- cgit v1.2.3 From 64102d9bbc3d41dac5188b8fba75b1344c438970 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 10 Sep 2025 10:02:20 +0200 Subject: netfilter: nf_tables: place base_seq in struct net This will soon be read from packet path around same time as the gencursor. Both gencursor and base_seq get incremented almost at the same time, so it makes sense to place them in the same structure. This doesn't increase struct net size on 64bit due to padding. Signed-off-by: Florian Westphal --- net/netfilter/nf_tables_api.c | 65 ++++++++++++++++++++++--------------------- 1 file changed, 33 insertions(+), 32 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index c1082de09656..9518b50695ba 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -1131,11 +1131,14 @@ nf_tables_chain_type_lookup(struct net *net, const struct nlattr *nla, return ERR_PTR(-ENOENT); } -static __be16 nft_base_seq(const struct net *net) +static unsigned int nft_base_seq(const struct net *net) { - struct nftables_pernet *nft_net = nft_pernet(net); + return READ_ONCE(net->nft.base_seq); +} - return htons(nft_net->base_seq & 0xffff); +static __be16 nft_base_seq_be16(const struct net *net) +{ + return htons(nft_base_seq(net) & 0xffff); } static const struct nla_policy nft_table_policy[NFTA_TABLE_MAX + 1] = { @@ -1155,7 +1158,7 @@ static int nf_tables_fill_table_info(struct sk_buff *skb, struct net *net, nlh = nfnl_msg_put(skb, portid, seq, nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event), - flags, family, NFNETLINK_V0, nft_base_seq(net)); + flags, family, NFNETLINK_V0, nft_base_seq_be16(net)); if (!nlh) goto nla_put_failure; @@ -1248,7 +1251,7 @@ static int nf_tables_dump_tables(struct sk_buff *skb, rcu_read_lock(); nft_net = nft_pernet(net); - cb->seq = READ_ONCE(nft_net->base_seq); + cb->seq = nft_base_seq(net); list_for_each_entry_rcu(table, &nft_net->tables, list) { if (family != NFPROTO_UNSPEC && family != table->family) @@ -2030,7 +2033,7 @@ static int nf_tables_fill_chain_info(struct sk_buff *skb, struct net *net, nlh = nfnl_msg_put(skb, portid, seq, nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event), - flags, family, NFNETLINK_V0, nft_base_seq(net)); + flags, family, NFNETLINK_V0, nft_base_seq_be16(net)); if (!nlh) goto nla_put_failure; @@ -2133,7 +2136,7 @@ static int nf_tables_dump_chains(struct sk_buff *skb, rcu_read_lock(); nft_net = nft_pernet(net); - cb->seq = READ_ONCE(nft_net->base_seq); + cb->seq = nft_base_seq(net); list_for_each_entry_rcu(table, &nft_net->tables, list) { if (family != NFPROTO_UNSPEC && family != table->family) @@ -3671,7 +3674,7 @@ static int nf_tables_fill_rule_info(struct sk_buff *skb, struct net *net, u16 type = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event); nlh = nfnl_msg_put(skb, portid, seq, type, flags, family, NFNETLINK_V0, - nft_base_seq(net)); + nft_base_seq_be16(net)); if (!nlh) goto nla_put_failure; @@ -3839,7 +3842,7 @@ static int nf_tables_dump_rules(struct sk_buff *skb, rcu_read_lock(); nft_net = nft_pernet(net); - cb->seq = READ_ONCE(nft_net->base_seq); + cb->seq = nft_base_seq(net); list_for_each_entry_rcu(table, &nft_net->tables, list) { if (family != NFPROTO_UNSPEC && family != table->family) @@ -4050,7 +4053,7 @@ static int nf_tables_getrule_reset(struct sk_buff *skb, buf = kasprintf(GFP_ATOMIC, "%.*s:%u", nla_len(nla[NFTA_RULE_TABLE]), (char *)nla_data(nla[NFTA_RULE_TABLE]), - nft_net->base_seq); + nft_base_seq(net)); audit_log_nfcfg(buf, info->nfmsg->nfgen_family, 1, AUDIT_NFT_OP_RULE_RESET, GFP_ATOMIC); kfree(buf); @@ -4887,7 +4890,7 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx, nlh = nfnl_msg_put(skb, portid, seq, nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event), flags, ctx->family, NFNETLINK_V0, - nft_base_seq(ctx->net)); + nft_base_seq_be16(ctx->net)); if (!nlh) goto nla_put_failure; @@ -5032,7 +5035,7 @@ static int nf_tables_dump_sets(struct sk_buff *skb, struct netlink_callback *cb) rcu_read_lock(); nft_net = nft_pernet(net); - cb->seq = READ_ONCE(nft_net->base_seq); + cb->seq = nft_base_seq(net); list_for_each_entry_rcu(table, &nft_net->tables, list) { if (ctx->family != NFPROTO_UNSPEC && @@ -6209,7 +6212,7 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) rcu_read_lock(); nft_net = nft_pernet(net); - cb->seq = READ_ONCE(nft_net->base_seq); + cb->seq = nft_base_seq(net); list_for_each_entry_rcu(table, &nft_net->tables, list) { if (dump_ctx->ctx.family != NFPROTO_UNSPEC && @@ -6238,7 +6241,7 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) seq = cb->nlh->nlmsg_seq; nlh = nfnl_msg_put(skb, portid, seq, event, NLM_F_MULTI, - table->family, NFNETLINK_V0, nft_base_seq(net)); + table->family, NFNETLINK_V0, nft_base_seq_be16(net)); if (!nlh) goto nla_put_failure; @@ -6331,7 +6334,7 @@ static int nf_tables_fill_setelem_info(struct sk_buff *skb, event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event); nlh = nfnl_msg_put(skb, portid, seq, event, flags, ctx->family, - NFNETLINK_V0, nft_base_seq(ctx->net)); + NFNETLINK_V0, nft_base_seq_be16(ctx->net)); if (!nlh) goto nla_put_failure; @@ -6630,7 +6633,7 @@ static int nf_tables_getsetelem_reset(struct sk_buff *skb, } nelems++; } - audit_log_nft_set_reset(dump_ctx.ctx.table, nft_net->base_seq, nelems); + audit_log_nft_set_reset(dump_ctx.ctx.table, nft_base_seq(info->net), nelems); out_unlock: rcu_read_unlock(); @@ -8381,7 +8384,7 @@ static int nf_tables_fill_obj_info(struct sk_buff *skb, struct net *net, nlh = nfnl_msg_put(skb, portid, seq, nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event), - flags, family, NFNETLINK_V0, nft_base_seq(net)); + flags, family, NFNETLINK_V0, nft_base_seq_be16(net)); if (!nlh) goto nla_put_failure; @@ -8446,7 +8449,7 @@ static int nf_tables_dump_obj(struct sk_buff *skb, struct netlink_callback *cb) rcu_read_lock(); nft_net = nft_pernet(net); - cb->seq = READ_ONCE(nft_net->base_seq); + cb->seq = nft_base_seq(net); list_for_each_entry_rcu(table, &nft_net->tables, list) { if (family != NFPROTO_UNSPEC && family != table->family) @@ -8480,7 +8483,7 @@ cont: idx++; } if (ctx->reset && entries) - audit_log_obj_reset(table, nft_net->base_seq, entries); + audit_log_obj_reset(table, nft_base_seq(net), entries); if (rc < 0) break; } @@ -8649,7 +8652,7 @@ static int nf_tables_getobj_reset(struct sk_buff *skb, buf = kasprintf(GFP_ATOMIC, "%.*s:%u", nla_len(nla[NFTA_OBJ_TABLE]), (char *)nla_data(nla[NFTA_OBJ_TABLE]), - nft_net->base_seq); + nft_base_seq(net)); audit_log_nfcfg(buf, info->nfmsg->nfgen_family, 1, AUDIT_NFT_OP_OBJ_RESET, GFP_ATOMIC); kfree(buf); @@ -8754,9 +8757,8 @@ void nft_obj_notify(struct net *net, const struct nft_table *table, struct nft_object *obj, u32 portid, u32 seq, int event, u16 flags, int family, int report, gfp_t gfp) { - struct nftables_pernet *nft_net = nft_pernet(net); char *buf = kasprintf(gfp, "%s:%u", - table->name, nft_net->base_seq); + table->name, nft_base_seq(net)); audit_log_nfcfg(buf, family, @@ -9442,7 +9444,7 @@ static int nf_tables_fill_flowtable_info(struct sk_buff *skb, struct net *net, nlh = nfnl_msg_put(skb, portid, seq, nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event), - flags, family, NFNETLINK_V0, nft_base_seq(net)); + flags, family, NFNETLINK_V0, nft_base_seq_be16(net)); if (!nlh) goto nla_put_failure; @@ -9511,7 +9513,7 @@ static int nf_tables_dump_flowtable(struct sk_buff *skb, rcu_read_lock(); nft_net = nft_pernet(net); - cb->seq = READ_ONCE(nft_net->base_seq); + cb->seq = nft_base_seq(net); list_for_each_entry_rcu(table, &nft_net->tables, list) { if (family != NFPROTO_UNSPEC && family != table->family) @@ -9696,17 +9698,16 @@ static void nf_tables_flowtable_destroy(struct nft_flowtable *flowtable) static int nf_tables_fill_gen_info(struct sk_buff *skb, struct net *net, u32 portid, u32 seq) { - struct nftables_pernet *nft_net = nft_pernet(net); struct nlmsghdr *nlh; char buf[TASK_COMM_LEN]; int event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, NFT_MSG_NEWGEN); nlh = nfnl_msg_put(skb, portid, seq, event, 0, AF_UNSPEC, - NFNETLINK_V0, nft_base_seq(net)); + NFNETLINK_V0, nft_base_seq_be16(net)); if (!nlh) goto nla_put_failure; - if (nla_put_be32(skb, NFTA_GEN_ID, htonl(nft_net->base_seq)) || + if (nla_put_be32(skb, NFTA_GEN_ID, htonl(nft_base_seq(net))) || nla_put_be32(skb, NFTA_GEN_PROC_PID, htonl(task_pid_nr(current))) || nla_put_string(skb, NFTA_GEN_PROC_NAME, get_task_comm(buf, current))) goto nla_put_failure; @@ -10968,11 +10969,11 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) * Bump generation counter, invalidate any dump in progress. * Cannot fail after this point. */ - base_seq = READ_ONCE(nft_net->base_seq); + base_seq = nft_base_seq(net); while (++base_seq == 0) ; - WRITE_ONCE(nft_net->base_seq, base_seq); + WRITE_ONCE(net->nft.base_seq, base_seq); gc_seq = nft_gc_seq_begin(nft_net); @@ -11181,7 +11182,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) nft_commit_notify(net, NETLINK_CB(skb).portid); nf_tables_gen_notify(net, skb, NFT_MSG_NEWGEN); - nf_tables_commit_audit_log(&adl, nft_net->base_seq); + nf_tables_commit_audit_log(&adl, nft_base_seq(net)); nft_gc_seq_end(nft_net, gc_seq); nft_net->validate_state = NFT_VALIDATE_SKIP; @@ -11506,7 +11507,7 @@ static bool nf_tables_valid_genid(struct net *net, u32 genid) mutex_lock(&nft_net->commit_mutex); nft_net->tstamp = get_jiffies_64(); - genid_ok = genid == 0 || nft_net->base_seq == genid; + genid_ok = genid == 0 || nft_base_seq(net) == genid; if (!genid_ok) mutex_unlock(&nft_net->commit_mutex); @@ -12143,7 +12144,7 @@ static int __net_init nf_tables_init_net(struct net *net) INIT_LIST_HEAD(&nft_net->module_list); INIT_LIST_HEAD(&nft_net->notify_list); mutex_init(&nft_net->commit_mutex); - nft_net->base_seq = 1; + net->nft.base_seq = 1; nft_net->gc_seq = 0; nft_net->validate_state = NFT_VALIDATE_SKIP; INIT_WORK(&nft_net->destroy_work, nf_tables_trans_destroy_work); -- cgit v1.2.3 From 11fe5a82e53ac3581a80c88e0e35fb8a80e15f48 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 10 Sep 2025 10:02:21 +0200 Subject: netfilter: nf_tables: make nft_set_do_lookup available unconditionally This function was added for retpoline mitigation and is replaced by a static inline helper if mitigations are not enabled. Enable this helper function unconditionally so next patch can add a lookup restart mechanism to fix possible false negatives while transactions are in progress. Adding lookup restarts in nft_lookup_eval doesn't work as nft_objref would then need the same copypaste loop. This patch is separate to ease review of the actual bug fix. Suggested-by: Pablo Neira Ayuso Signed-off-by: Florian Westphal --- net/netfilter/nft_lookup.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c index 40c602ffbcba..2c6909bf1b40 100644 --- a/net/netfilter/nft_lookup.c +++ b/net/netfilter/nft_lookup.c @@ -24,11 +24,11 @@ struct nft_lookup { struct nft_set_binding binding; }; -#ifdef CONFIG_MITIGATION_RETPOLINE -const struct nft_set_ext * -nft_set_do_lookup(const struct net *net, const struct nft_set *set, - const u32 *key) +static const struct nft_set_ext * +__nft_set_do_lookup(const struct net *net, const struct nft_set *set, + const u32 *key) { +#ifdef CONFIG_MITIGATION_RETPOLINE if (set->ops == &nft_set_hash_fast_type.ops) return nft_hash_lookup_fast(net, set, key); if (set->ops == &nft_set_hash_type.ops) @@ -51,10 +51,17 @@ nft_set_do_lookup(const struct net *net, const struct nft_set *set, return nft_rbtree_lookup(net, set, key); WARN_ON_ONCE(1); +#endif return set->ops->lookup(net, set, key); } + +const struct nft_set_ext * +nft_set_do_lookup(const struct net *net, const struct nft_set *set, + const u32 *key) +{ + return __nft_set_do_lookup(net, set, key); +} EXPORT_SYMBOL_GPL(nft_set_do_lookup); -#endif void nft_lookup_eval(const struct nft_expr *expr, struct nft_regs *regs, -- cgit v1.2.3 From b2f742c846cab9afc5953a5d8f17b54922dcc723 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 10 Sep 2025 10:02:22 +0200 Subject: netfilter: nf_tables: restart set lookup on base_seq change The hash, hash_fast, rhash and bitwise sets may indicate no result even though a matching element exists during a short time window while other cpu is finalizing the transaction. This happens when the hash lookup/bitwise lookup function has picked up the old genbit, right before it was toggled by nf_tables_commit(), but then the same cpu managed to unlink the matching old element from the hash table: cpu0 cpu1 has added new elements to clone has marked elements as being inactive in new generation perform lookup in the set enters commit phase: A) observes old genbit increments base_seq I) increments the genbit II) removes old element from the set B) finds matching element C) returns no match: found element is not valid in old generation Next lookup observes new genbit and finds matching e2. Consider a packet matching element e1, e2. cpu0 processes following transaction: 1. remove e1 2. adds e2, which has same key as e1. P matches both e1 and e2. Therefore, cpu1 should always find a match for P. Due to above race, this is not the case: cpu1 observed the old genbit. e2 will not be considered once it is found. The element e1 is not found anymore if cpu0 managed to unlink it from the hlist before cpu1 found it during list traversal. The situation only occurs for a brief time period, lookups happening after I) observe new genbit and return e2. This problem exists in all set types except nft_set_pipapo, so fix it once in nft_lookup rather than each set ops individually. Sample the base sequence counter, which gets incremented right before the genbit is changed. Then, if no match is found, retry the lookup if the base sequence was altered in between. If the base sequence hasn't changed: - No update took place: no-match result is expected. This is the common case. or: - nf_tables_commit() hasn't progressed to genbit update yet. Old elements were still visible and nomatch result is expected, or: - nf_tables_commit updated the genbit: We picked up the new base_seq, so the lookup function also picked up the new genbit, no-match result is expected. If the old genbit was observed, then nft_lookup also picked up the old base_seq: nft_lookup_should_retry() returns true and relookup is performed in the new generation. This problem was added when the unconditional synchronize_rcu() call that followed the current/next generation bit toggle was removed. Thanks to Pablo Neira Ayuso for reviewing an earlier version of this patchset, for suggesting re-use of existing base_seq and placement of the restart loop in nft_set_do_lookup(). Fixes: 0cbc06b3faba ("netfilter: nf_tables: remove synchronize_rcu in commit phase") Signed-off-by: Florian Westphal --- net/netfilter/nf_tables_api.c | 3 ++- net/netfilter/nft_lookup.c | 31 ++++++++++++++++++++++++++++++- 2 files changed, 32 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 9518b50695ba..c3c73411c40c 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -10973,7 +10973,8 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) while (++base_seq == 0) ; - WRITE_ONCE(net->nft.base_seq, base_seq); + /* pairs with smp_load_acquire in nft_lookup_eval */ + smp_store_release(&net->nft.base_seq, base_seq); gc_seq = nft_gc_seq_begin(nft_net); diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c index 2c6909bf1b40..58c5b14889c4 100644 --- a/net/netfilter/nft_lookup.c +++ b/net/netfilter/nft_lookup.c @@ -55,11 +55,40 @@ __nft_set_do_lookup(const struct net *net, const struct nft_set *set, return set->ops->lookup(net, set, key); } +static unsigned int nft_base_seq(const struct net *net) +{ + /* pairs with smp_store_release() in nf_tables_commit() */ + return smp_load_acquire(&net->nft.base_seq); +} + +static bool nft_lookup_should_retry(const struct net *net, unsigned int seq) +{ + return unlikely(seq != nft_base_seq(net)); +} + const struct nft_set_ext * nft_set_do_lookup(const struct net *net, const struct nft_set *set, const u32 *key) { - return __nft_set_do_lookup(net, set, key); + const struct nft_set_ext *ext; + unsigned int base_seq; + + do { + base_seq = nft_base_seq(net); + + ext = __nft_set_do_lookup(net, set, key); + if (ext) + break; + /* No match? There is a small chance that lookup was + * performed in the old generation, but nf_tables_commit() + * already unlinked a (matching) element. + * + * We need to repeat the lookup to make sure that we didn't + * miss a matching element in the new generation. + */ + } while (nft_lookup_should_retry(net, base_seq)); + + return ext; } EXPORT_SYMBOL_GPL(nft_set_do_lookup); -- cgit v1.2.3 From c3f8d13357deab1e04f8a52b499d6b9b704e578e Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 10 Sep 2025 15:11:21 +0200 Subject: wifi: nl80211: completely disable per-link stats for now After commit 8cc71fc3b82b ("wifi: cfg80211: Fix "no buffer space available" error in nl80211_get_station() for MLO"), the per-link data is only included in station dumps, where the size limit is somewhat less of an issue. However, it's still an issue, depending on how many links a station has and how much per-link data there is. Thus, for now, disable per-link statistics entirely. A complete fix will need to take this into account, make it opt-in by userspace, and change the dump format to be able to split a single station's data across multiple netlink dump messages, which all together is too much development for a fix. Fixes: 82d7f841d9bd ("wifi: cfg80211: extend to embed link level statistics in NL message") Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index f2f7424e930c..852573423e52 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -7575,7 +7575,7 @@ static int nl80211_dump_station(struct sk_buff *skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, rdev, wdev->netdev, mac_addr, - &sinfo, true) < 0) + &sinfo, false) < 0) goto out; sta_idx++; -- cgit v1.2.3 From 8884c693991333ae065830554b9b0c96590b1bb2 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Fri, 5 Sep 2025 09:15:31 +0000 Subject: hsr: use rtnl lock when iterating over ports hsr_for_each_port is called in many places without holding the RCU read lock, this may trigger warnings on debug kernels. Most of the callers are actually hold rtnl lock. So add a new helper hsr_for_each_port_rtnl to allow callers in suitable contexts to iterate ports safely without explicit RCU locking. This patch only fixed the callers that is hold rtnl lock. Other caller issues will be fixed in later patches. Fixes: c5a759117210 ("net/hsr: Use list_head (and rcu) instead of array for slave devices.") Signed-off-by: Hangbin Liu Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250905091533.377443-2-liuhangbin@gmail.com Signed-off-by: Paolo Abeni --- net/hsr/hsr_device.c | 18 +++++++++--------- net/hsr/hsr_main.c | 2 +- net/hsr/hsr_main.h | 3 +++ 3 files changed, 13 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c index 88657255fec1..bce7b4061ce0 100644 --- a/net/hsr/hsr_device.c +++ b/net/hsr/hsr_device.c @@ -49,7 +49,7 @@ static bool hsr_check_carrier(struct hsr_port *master) ASSERT_RTNL(); - hsr_for_each_port(master->hsr, port) { + hsr_for_each_port_rtnl(master->hsr, port) { if (port->type != HSR_PT_MASTER && is_slave_up(port->dev)) { netif_carrier_on(master->dev); return true; @@ -105,7 +105,7 @@ int hsr_get_max_mtu(struct hsr_priv *hsr) struct hsr_port *port; mtu_max = ETH_DATA_LEN; - hsr_for_each_port(hsr, port) + hsr_for_each_port_rtnl(hsr, port) if (port->type != HSR_PT_MASTER) mtu_max = min(port->dev->mtu, mtu_max); @@ -139,7 +139,7 @@ static int hsr_dev_open(struct net_device *dev) hsr = netdev_priv(dev); - hsr_for_each_port(hsr, port) { + hsr_for_each_port_rtnl(hsr, port) { if (port->type == HSR_PT_MASTER) continue; switch (port->type) { @@ -172,7 +172,7 @@ static int hsr_dev_close(struct net_device *dev) struct hsr_priv *hsr; hsr = netdev_priv(dev); - hsr_for_each_port(hsr, port) { + hsr_for_each_port_rtnl(hsr, port) { if (port->type == HSR_PT_MASTER) continue; switch (port->type) { @@ -205,7 +205,7 @@ static netdev_features_t hsr_features_recompute(struct hsr_priv *hsr, * may become enabled. */ features &= ~NETIF_F_ONE_FOR_ALL; - hsr_for_each_port(hsr, port) + hsr_for_each_port_rtnl(hsr, port) features = netdev_increment_features(features, port->dev->features, mask); @@ -484,7 +484,7 @@ static void hsr_set_rx_mode(struct net_device *dev) hsr = netdev_priv(dev); - hsr_for_each_port(hsr, port) { + hsr_for_each_port_rtnl(hsr, port) { if (port->type == HSR_PT_MASTER) continue; switch (port->type) { @@ -506,7 +506,7 @@ static void hsr_change_rx_flags(struct net_device *dev, int change) hsr = netdev_priv(dev); - hsr_for_each_port(hsr, port) { + hsr_for_each_port_rtnl(hsr, port) { if (port->type == HSR_PT_MASTER) continue; switch (port->type) { @@ -534,7 +534,7 @@ static int hsr_ndo_vlan_rx_add_vid(struct net_device *dev, hsr = netdev_priv(dev); - hsr_for_each_port(hsr, port) { + hsr_for_each_port_rtnl(hsr, port) { if (port->type == HSR_PT_MASTER || port->type == HSR_PT_INTERLINK) continue; @@ -580,7 +580,7 @@ static int hsr_ndo_vlan_rx_kill_vid(struct net_device *dev, hsr = netdev_priv(dev); - hsr_for_each_port(hsr, port) { + hsr_for_each_port_rtnl(hsr, port) { switch (port->type) { case HSR_PT_SLAVE_A: case HSR_PT_SLAVE_B: diff --git a/net/hsr/hsr_main.c b/net/hsr/hsr_main.c index 192893c3f2ec..ac1eb1db1a52 100644 --- a/net/hsr/hsr_main.c +++ b/net/hsr/hsr_main.c @@ -22,7 +22,7 @@ static bool hsr_slave_empty(struct hsr_priv *hsr) { struct hsr_port *port; - hsr_for_each_port(hsr, port) + hsr_for_each_port_rtnl(hsr, port) if (port->type != HSR_PT_MASTER) return false; return true; diff --git a/net/hsr/hsr_main.h b/net/hsr/hsr_main.h index 135ec5fce019..33b0d2460c9b 100644 --- a/net/hsr/hsr_main.h +++ b/net/hsr/hsr_main.h @@ -224,6 +224,9 @@ struct hsr_priv { #define hsr_for_each_port(hsr, port) \ list_for_each_entry_rcu((port), &(hsr)->ports, port_list) +#define hsr_for_each_port_rtnl(hsr, port) \ + list_for_each_entry_rcu((port), &(hsr)->ports, port_list, lockdep_rtnl_is_held()) + struct hsr_port *hsr_port_get_hsr(struct hsr_priv *hsr, enum hsr_port_type pt); /* Caller must ensure skb is a valid HSR frame */ -- cgit v1.2.3 From 393c841fe4333cdd856d0ca37b066d72746cfaa6 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Fri, 5 Sep 2025 09:15:32 +0000 Subject: hsr: use hsr_for_each_port_rtnl in hsr_port_get_hsr hsr_port_get_hsr() iterates over ports using hsr_for_each_port(), but many of its callers do not hold the required RCU lock. Switch to hsr_for_each_port_rtnl(), since most callers already hold the rtnl lock. After review, all callers are covered by either the rtnl lock or the RCU lock, except hsr_dev_xmit(). Fix this by adding an RCU read lock there. Fixes: c5a759117210 ("net/hsr: Use list_head (and rcu) instead of array for slave devices.") Signed-off-by: Hangbin Liu Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250905091533.377443-3-liuhangbin@gmail.com Signed-off-by: Paolo Abeni --- net/hsr/hsr_device.c | 3 +++ net/hsr/hsr_main.c | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c index bce7b4061ce0..702da1f9aaa9 100644 --- a/net/hsr/hsr_device.c +++ b/net/hsr/hsr_device.c @@ -226,6 +226,7 @@ static netdev_tx_t hsr_dev_xmit(struct sk_buff *skb, struct net_device *dev) struct hsr_priv *hsr = netdev_priv(dev); struct hsr_port *master; + rcu_read_lock(); master = hsr_port_get_hsr(hsr, HSR_PT_MASTER); if (master) { skb->dev = master->dev; @@ -238,6 +239,8 @@ static netdev_tx_t hsr_dev_xmit(struct sk_buff *skb, struct net_device *dev) dev_core_stats_tx_dropped_inc(dev); dev_kfree_skb_any(skb); } + rcu_read_unlock(); + return NETDEV_TX_OK; } diff --git a/net/hsr/hsr_main.c b/net/hsr/hsr_main.c index ac1eb1db1a52..bc94b07101d8 100644 --- a/net/hsr/hsr_main.c +++ b/net/hsr/hsr_main.c @@ -134,7 +134,7 @@ struct hsr_port *hsr_port_get_hsr(struct hsr_priv *hsr, enum hsr_port_type pt) { struct hsr_port *port; - hsr_for_each_port(hsr, port) + hsr_for_each_port_rtnl(hsr, port) if (port->type == pt) return port; return NULL; -- cgit v1.2.3 From 847748fc66d08a89135a74e29362a66ba4e3ab15 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Fri, 5 Sep 2025 09:15:33 +0000 Subject: hsr: hold rcu and dev lock for hsr_get_port_ndev hsr_get_port_ndev calls hsr_for_each_port, which need to hold rcu lock. On the other hand, before return the port device, we need to hold the device reference to avoid UaF in the caller function. Suggested-by: Paolo Abeni Fixes: 9c10dd8eed74 ("net: hsr: Create and export hsr_get_port_ndev()") Signed-off-by: Hangbin Liu Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250905091533.377443-4-liuhangbin@gmail.com Signed-off-by: Paolo Abeni --- net/hsr/hsr_device.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c index 702da1f9aaa9..fbbc3ccf9df6 100644 --- a/net/hsr/hsr_device.c +++ b/net/hsr/hsr_device.c @@ -675,9 +675,14 @@ struct net_device *hsr_get_port_ndev(struct net_device *ndev, struct hsr_priv *hsr = netdev_priv(ndev); struct hsr_port *port; + rcu_read_lock(); hsr_for_each_port(hsr, port) - if (port->type == pt) + if (port->type == pt) { + dev_hold(port->dev); + rcu_read_unlock(); return port->dev; + } + rcu_read_unlock(); return NULL; } EXPORT_SYMBOL(hsr_get_port_ndev); -- cgit v1.2.3