From 0a7de4a8f898c480ffafe024c4a0a8b8819597f1 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 7 Apr 2025 16:36:02 +0000 Subject: net: rps: remove kfree_rcu_mightsleep() use Add an rcu_head to sd_flow_limit and rps_sock_flow_table structs to use the more conventional and predictable k[v]free_rcu(). Signed-off-by: Eric Dumazet Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20250407163602.170356-5-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/rps.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/rps.h b/include/net/rps.h index e358e9711f27..507f4aa5d39b 100644 --- a/include/net/rps.h +++ b/include/net/rps.h @@ -57,9 +57,10 @@ struct rps_dev_flow_table { * meaning we use 32-6=26 bits for the hash. */ struct rps_sock_flow_table { - u32 mask; + struct rcu_head rcu; + u32 mask; - u32 ents[] ____cacheline_aligned_in_smp; + u32 ents[] ____cacheline_aligned_in_smp; }; #define RPS_SOCK_FLOW_TABLE_SIZE(_num) (offsetof(struct rps_sock_flow_table, ents[_num])) -- cgit v1.2.3 From a36283e2b683f172aa1760c77325e50b16c0f792 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Mon, 7 Apr 2025 17:45:41 +0200 Subject: udp_tunnel: create a fastpath GRO lookup. Most UDP tunnels bind a socket to a local port, with ANY address, no peer and no interface index specified. Additionally it's quite common to have a single tunnel device per namespace. Track in each namespace the UDP tunnel socket respecting the above. When only a single one is present, store a reference in the netns. When such reference is not NULL, UDP tunnel GRO lookup just need to match the incoming packet destination port vs the socket local port. The tunnel socket never sets the reuse[port] flag[s]. When bound to no address and interface, no other socket can exist in the same netns matching the specified local port. 
Matching packets with non-local destination addresses will be aggregated, and eventually segmented as needed - no behavior changes intended. Restrict the optimization to kernel sockets only: it covers all the relevant use-cases, and user-space owned sockets could be disconnected and rebound after setup_udp_tunnel_sock(), breaking the uniqueness assumption. Note that the UDP tunnel socket reference is stored into struct netns_ipv4 for both IPv4 and IPv6 tunnels. That is intentional to keep all the fastpath-related netns fields in the same struct and allow cacheline-based optimization. Currently both the IPv4 and IPv6 socket pointers share the same cacheline as the `udp_table` field. Signed-off-by: Paolo Abeni Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/41d16bc8d1257d567f9344c445b4ae0b4a91ede4.1744040675.git.pabeni@redhat.com Signed-off-by: Jakub Kicinski --- include/linux/udp.h | 16 ++++++++++++++++ include/net/netns/ipv4.h | 11 +++++++++++ include/net/udp.h | 1 + include/net/udp_tunnel.h | 12 ++++++++++++ 4 files changed, 40 insertions(+) (limited to 'include') diff --git a/include/linux/udp.h b/include/linux/udp.h index 0807e21cfec9..895240177f4f 100644 --- a/include/linux/udp.h +++ b/include/linux/udp.h @@ -101,6 +101,13 @@ struct udp_sock { /* Cache friendly copy of sk->sk_peek_off >= 0 */ bool peeking_with_offset; + + /* + * Accounting for the tunnel GRO fastpath. + * Unprotected by compilers guard, as it uses space available in + * the last UDP socket cacheline. 
+ */ + struct hlist_node tunnel_list; }; #define udp_test_bit(nr, sk) \ @@ -219,4 +226,13 @@ static inline void udp_allow_gso(struct sock *sk) #define IS_UDPLITE(__sk) (__sk->sk_protocol == IPPROTO_UDPLITE) +static inline struct sock *udp_tunnel_sk(const struct net *net, bool is_ipv6) +{ +#if IS_ENABLED(CONFIG_NET_UDP_TUNNEL) + return rcu_dereference(net->ipv4.udp_tunnel_gro[is_ipv6].sk); +#else + return NULL; +#endif +} + #endif /* _LINUX_UDP_H */ diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 650b2dc9199f..6373e3f17da8 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -47,6 +47,11 @@ struct sysctl_fib_multipath_hash_seed { }; #endif +struct udp_tunnel_gro { + struct sock __rcu *sk; + struct hlist_head list; +}; + struct netns_ipv4 { /* Cacheline organization can be found documented in * Documentation/networking/net_cachelines/netns_ipv4_sysctl.rst. @@ -85,6 +90,11 @@ struct netns_ipv4 { struct inet_timewait_death_row tcp_death_row; struct udp_table *udp_table; +#if IS_ENABLED(CONFIG_NET_UDP_TUNNEL) + /* Not in a pernet subsys because need to be available at GRO stage */ + struct udp_tunnel_gro udp_tunnel_gro[2]; +#endif + #ifdef CONFIG_SYSCTL struct ctl_table_header *forw_hdr; struct ctl_table_header *frags_hdr; @@ -277,4 +287,5 @@ struct netns_ipv4 { struct hlist_head *inet_addr_lst; struct delayed_work addr_chk_work; }; + #endif diff --git a/include/net/udp.h b/include/net/udp.h index 6e89520e100d..a772510b2aa5 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -290,6 +290,7 @@ static inline void udp_lib_init_sock(struct sock *sk) struct udp_sock *up = udp_sk(sk); skb_queue_head_init(&up->reader_queue); + INIT_HLIST_NODE(&up->tunnel_list); up->forward_threshold = sk->sk_rcvbuf >> 2; set_bit(SOCK_CUSTOM_SOCKOPT, &sk->sk_socket->flags); } diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h index a93dc51f6323..1bb2b852e90e 100644 --- a/include/net/udp_tunnel.h +++ b/include/net/udp_tunnel.h @@ 
-191,6 +191,18 @@ static inline int udp_tunnel_handle_offloads(struct sk_buff *skb, bool udp_csum) } #endif +#if IS_ENABLED(CONFIG_NET_UDP_TUNNEL) +void udp_tunnel_update_gro_lookup(struct net *net, struct sock *sk, bool add); +#else +static inline void udp_tunnel_update_gro_lookup(struct net *net, + struct sock *sk, bool add) {} +#endif + +static inline void udp_tunnel_cleanup_gro(struct sock *sk) +{ + udp_tunnel_update_gro_lookup(sock_net(sk), sk, false); +} + static inline void udp_tunnel_encap_enable(struct sock *sk) { if (udp_test_and_set_bit(ENCAP_ENABLED, sk)) -- cgit v1.2.3 From 5d7f5b2f6b935517ee5fd8058dc32342a5cba3e1 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Mon, 7 Apr 2025 17:45:42 +0200 Subject: udp_tunnel: use static call for GRO hooks when possible It's quite common to have a single UDP tunnel type active in the whole system. In such a case we can replace the indirect call for the UDP tunnel GRO callback with a static call. Add the related accounting in the control path and switch to static call when possible. To keep the code simple use a static array for the registered tunnel types, and size such array based on the kernel config. Note that there are valid kernel configurations leading to UDP_MAX_TUNNEL_TYPES == 0 even with IS_ENABLED(CONFIG_NET_UDP_TUNNEL). Explicitly skip the accounting in such a case, to avoid a compile warning when accessing "udp_tunnel_gro_types". 
Signed-off-by: Paolo Abeni Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/53d156cdfddcc9678449e873cc83e68fa1582653.1744040675.git.pabeni@redhat.com Signed-off-by: Jakub Kicinski --- include/net/udp_tunnel.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h index 1bb2b852e90e..288f06f23a80 100644 --- a/include/net/udp_tunnel.h +++ b/include/net/udp_tunnel.h @@ -193,13 +193,16 @@ static inline int udp_tunnel_handle_offloads(struct sk_buff *skb, bool udp_csum) #if IS_ENABLED(CONFIG_NET_UDP_TUNNEL) void udp_tunnel_update_gro_lookup(struct net *net, struct sock *sk, bool add); +void udp_tunnel_update_gro_rcv(struct sock *sk, bool add); #else static inline void udp_tunnel_update_gro_lookup(struct net *net, struct sock *sk, bool add) {} +static inline void udp_tunnel_update_gro_rcv(struct sock *sk, bool add) {} #endif static inline void udp_tunnel_cleanup_gro(struct sock *sk) { + udp_tunnel_update_gro_rcv(sk, false); udp_tunnel_update_gro_lookup(sock_net(sk), sk, false); } @@ -212,6 +215,7 @@ static inline void udp_tunnel_encap_enable(struct sock *sk) if (READ_ONCE(sk->sk_family) == PF_INET6) ipv6_stub->udpv6_encap_enable(); #endif + udp_tunnel_update_gro_rcv(sk, true); udp_encap_enable(); } -- cgit v1.2.3 From 420aabef3ab5fa743afb4d3d391f03ef0e777ca8 Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Mon, 7 Apr 2025 21:01:02 +0200 Subject: net: Drop unused @sk of __skb_try_recv_from_queue() __skb_try_recv_from_queue() deals with a queue, @sk is not used since commit e427cad6eee4 ("net: datagram: drop 'destructor' argument from several helpers"). Remove sk from function parameters, adapt callers. No functional change intended. 
Signed-off-by: Michal Luczaj Reviewed-by: Joe Damato Link: https://patch.msgid.link/20250407-cleanup-drop-param-sk-v1-1-cd076979afac@rbox.co Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index b974a277975a..f1381aff0f89 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -4105,8 +4105,7 @@ static inline void skb_frag_list_init(struct sk_buff *skb) int __skb_wait_for_more_packets(struct sock *sk, struct sk_buff_head *queue, int *err, long *timeo_p, const struct sk_buff *skb); -struct sk_buff *__skb_try_recv_from_queue(struct sock *sk, - struct sk_buff_head *queue, +struct sk_buff *__skb_try_recv_from_queue(struct sk_buff_head *queue, unsigned int flags, int *off, int *err, struct sk_buff **last); -- cgit v1.2.3 From a82dc19db13649aa4232ce37cb6f4ceff851e2fe Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 8 Apr 2025 12:59:48 -0700 Subject: net: avoid potential race between netdev_get_by_index_lock() and netns switch netdev_get_by_index_lock() performs following steps: rcu_lock(); dev = lookup(netns, ifindex); dev_get(dev); rcu_unlock(); [... lock & validate the dev ...] return dev Validation right now only checks if the device is registered but since the lookup is netns-aware we must also protect against the device switching netns right after we dropped the RCU lock. Otherwise the caller in netns1 may get a pointer to a device which has just switched to netns2. We can't hold the lock for the entire netns change process (because of the NETDEV_UNREGISTER notifier), and there's no existing marking to indicate that the netns is unlisted because of netns move, so add one. 
AFAIU none of the existing netdev_get_by_index_lock() callers can suffer from this problem (NAPI code double checks the netns membership and other callers are either under rtnl_lock or not ns-sensitive), so this patch does not have to be treated as a fix. Reviewed-by: Joe Damato Acked-by: Stanislav Fomichev Link: https://patch.msgid.link/20250408195956.412733-2-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index cf3b6445817b..8e9be80bc167 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1952,6 +1952,7 @@ enum netdev_reg_state { * @priv_destructor: Called from unregister * @npinfo: XXX: need comments on this one * @nd_net: Network namespace this network device is inside + * protected by @lock * * @ml_priv: Mid-layer private * @ml_priv_type: Mid-layer private type @@ -2359,6 +2360,9 @@ struct net_device { bool dismantle; + /** @moving_ns: device is changing netns, protected by @lock */ + bool moving_ns; + enum { RTNL_LINK_INITIALIZED, RTNL_LINK_INITIALIZING, @@ -2521,7 +2525,7 @@ struct net_device { * @net_shaper_hierarchy, @reg_state, @threaded * * Double protects: - * @up + * @up, @moving_ns, @nd_net * * Double ops protects: * @real_num_rx_queues, @real_num_tx_queues -- cgit v1.2.3 From 606048cbd8346e616cfaee01b0143d072534136d Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 8 Apr 2025 12:59:49 -0700 Subject: net: designate XSK pool pointers in queues as "ops protected" Read accesses go via xsk_get_pool_from_qid(), the call coming from the core and gve look safe (other "ops locked" drivers don't support XSK). Write accesses go via xsk_reg_pool_at_qid() and xsk_clear_pool_at_qid(). Former is already under the ops lock, latter is not (both coming from the workqueue via xp_clear_dev() and NETDEV_UNREGISTER via xsk_notifier()). 
Acked-by: Stanislav Fomichev Signed-off-by: Stanislav Fomichev Link: https://patch.msgid.link/20250408195956.412733-3-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 1 + include/net/netdev_rx_queue.h | 6 +++--- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 8e9be80bc167..7242fb8a22fc 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -688,6 +688,7 @@ struct netdev_queue { /* Subordinate device that the queue has been assigned to */ struct net_device *sb_dev; #ifdef CONFIG_XDP_SOCKETS + /* "ops protected", see comment about net_device::lock */ struct xsk_buff_pool *pool; #endif diff --git a/include/net/netdev_rx_queue.h b/include/net/netdev_rx_queue.h index b2238b551dce..8cdcd138b33f 100644 --- a/include/net/netdev_rx_queue.h +++ b/include/net/netdev_rx_queue.h @@ -20,12 +20,12 @@ struct netdev_rx_queue { struct net_device *dev; netdevice_tracker dev_tracker; + /* All fields below are "ops protected", + * see comment about net_device::lock + */ #ifdef CONFIG_XDP_SOCKETS struct xsk_buff_pool *pool; #endif - /* NAPI instance for the queue - * "ops protected", see comment about net_device::lock - */ struct napi_struct *napi; struct pp_memory_provider_params mp_params; } ____cacheline_aligned_in_smp; -- cgit v1.2.3 From 4ec9031cbeb73a66979560bbb6d355329be762de Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 8 Apr 2025 12:59:50 -0700 Subject: netdev: add "ops compat locking" helpers Add helpers to "lock a netdev in a backward-compatible way", which for ops-locked netdevs will mean take the instance lock. For drivers which haven't opted into the ops locking we'll take rtnl_lock. The scoped foreach is dropping and re-taking the lock for each device, even if prev and next are both under rtnl_lock. 
I hope that's fine since we expect that netdev nl to be mostly supported by modern drivers, and modern drivers should also opt into the instance locking. Note that these helpers are mostly needed for queue related state, because drivers modify queue config in their ops in a non-atomic way. Or differently put, queue changes don't have a clear-cut API like NAPI configuration. Any state that can should just use the instance lock directly, not the "compat" hacks. Reviewed-by: Joe Damato Acked-by: Stanislav Fomichev Link: https://patch.msgid.link/20250408195956.412733-4-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/net/netdev_lock.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include') diff --git a/include/net/netdev_lock.h b/include/net/netdev_lock.h index c316b551df8d..5706835a660c 100644 --- a/include/net/netdev_lock.h +++ b/include/net/netdev_lock.h @@ -64,6 +64,22 @@ netdev_ops_assert_locked_or_invisible(const struct net_device *dev) netdev_ops_assert_locked(dev); } +static inline void netdev_lock_ops_compat(struct net_device *dev) +{ + if (netdev_need_ops_lock(dev)) + netdev_lock(dev); + else + rtnl_lock(); +} + +static inline void netdev_unlock_ops_compat(struct net_device *dev) +{ + if (netdev_need_ops_lock(dev)) + netdev_unlock(dev); + else + rtnl_unlock(); +} + static inline int netdev_lock_cmp_fn(const struct lockdep_map *a, const struct lockdep_map *b) { -- cgit v1.2.3 From 03df156dd3a6d5992f17682cd5c3b11e5ffdae02 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 8 Apr 2025 12:59:52 -0700 Subject: xdp: double protect netdev->xdp_flags with netdev->lock Protect xdp_features with netdev->lock. This way pure readers no longer have to take rtnl_lock to access the field. This includes calling NETDEV_XDP_FEAT_CHANGE under the lock. Looks like that's fine for bonding, the only "real" listener, it's the same as ethtool feature change. 
In terms of normal drivers - only GVE needs special consideration (other drivers don't use instance lock or don't support XDP). It calls xdp_set_features_flag() helper from gve_init_priv() which in turn is called from gve_reset_recovery() (locked), or prior to netdev registration. So switch to _locked. Reviewed-by: Joe Damato Acked-by: Stanislav Fomichev Acked-by: Harshitha Ramamurthy Acked-by: Martin KaFai Lau Link: https://patch.msgid.link/20250408195956.412733-6-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 2 +- include/net/xdp.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 7242fb8a22fc..dece2ae396a1 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2526,7 +2526,7 @@ struct net_device { * @net_shaper_hierarchy, @reg_state, @threaded * * Double protects: - * @up, @moving_ns, @nd_net + * @up, @moving_ns, @nd_net, @xdp_flags * * Double ops protects: * @real_num_rx_queues, @real_num_tx_queues diff --git a/include/net/xdp.h b/include/net/xdp.h index 48efacbaa35d..20e41b5ff319 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -616,6 +616,7 @@ struct xdp_metadata_ops { u32 bpf_xdp_metadata_kfunc_id(int id); bool bpf_dev_bound_kfunc_id(u32 btf_id); void xdp_set_features_flag(struct net_device *dev, xdp_features_t val); +void xdp_set_features_flag_locked(struct net_device *dev, xdp_features_t val); void xdp_features_set_redirect_target(struct net_device *dev, bool support_sg); void xdp_features_clear_redirect_target(struct net_device *dev); #else -- cgit v1.2.3 From ce7b14947484e6190372f2c3dbfb69aafbc4c0fc Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 8 Apr 2025 12:59:55 -0700 Subject: netdev: depend on netdev->lock for qstats in ops locked drivers We mostly needed rtnl_lock in qstat to make sure the queue count is stable while we work. 
For "ops locked" drivers the instance lock protects the queue count, so we don't have to take rtnl_lock. For currently ops-locked drivers: netdevsim and bnxt need the protection from netdev going down while we dump, which instance lock provides. gve doesn't care. Reviewed-by: Joe Damato Acked-by: Stanislav Fomichev Link: https://patch.msgid.link/20250408195956.412733-9-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/net/netdev_queues.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/netdev_queues.h b/include/net/netdev_queues.h index 825141d675e5..ea709b59d827 100644 --- a/include/net/netdev_queues.h +++ b/include/net/netdev_queues.h @@ -85,9 +85,11 @@ struct netdev_queue_stats_tx { * for some of the events is not maintained, and reliable "total" cannot * be provided). * + * Ops are called under the instance lock if netdev_need_ops_lock() + * returns true, otherwise under rtnl_lock. * Device drivers can assume that when collecting total device stats, * the @get_base_stats and subsequent per-queue calls are performed - * "atomically" (without releasing the rtnl_lock). + * "atomically" (without releasing the relevant lock). * * Device drivers are encouraged to reset the per-queue statistics when * number of queues change. This is because the primary use case for -- cgit v1.2.3 From 229671ac60e298b85c2644f52d7e487e9f487d06 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 8 Apr 2025 20:27:42 +0000 Subject: net: remove cpu stall in txq_trans_update() txq_trans_update() currently uses txq->xmit_lock_owner to conditionally update txq->trans_start. For regular devices, txq->xmit_lock_owner is updated from HARD_TX_LOCK() and HARD_TX_UNLOCK(), and this apparently causes cpu stalls. Using dev->lltx, which sits in a read-mostly cache-line, and already used in HARD_TX_LOCK() and HARD_TX_UNLOCK() helps cpu prediction. 
On an AMD EPYC 7B12 dual socket server, tcp_rr with 128 threads and 30,000 flows gets a 5 % increase in throughput. As explained in commit 95ecba62e2fd ("net: fix races in netdev_tx_sent_queue()/dev_watchdog()") I am planning to no longer update txq->trans_start in the fast path in a followup patch. Signed-off-by: Eric Dumazet Link: https://patch.msgid.link/20250408202742.2145516-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index dece2ae396a1..a28a08046615 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4693,9 +4693,10 @@ static inline void __netif_tx_unlock_bh(struct netdev_queue *txq) /* * txq->trans_start can be read locklessly from dev_watchdog() */ -static inline void txq_trans_update(struct netdev_queue *txq) +static inline void txq_trans_update(const struct net_device *dev, + struct netdev_queue *txq) { - if (txq->xmit_lock_owner != -1) + if (!dev->lltx) WRITE_ONCE(txq->trans_start, jiffies); } @@ -5214,7 +5215,7 @@ static inline netdev_tx_t netdev_start_xmit(struct sk_buff *skb, struct net_devi rc = __netdev_start_xmit(ops, skb, dev, more); if (rc == NETDEV_TX_OK) - txq_trans_update(txq); + txq_trans_update(dev, txq); return rc; } -- cgit v1.2.3 From 04271411121a58d37f47b065bc872f333274bf1f Mon Sep 17 00:00:00 2001 From: Jiayuan Chen Date: Wed, 9 Apr 2025 19:26:04 +0800 Subject: tcp: add TCP_RFC7323_TW_PAWS drop reason Devices in the networking path, such as firewalls, NATs, or routers, which can perform SNAT or DNAT, use addresses from their own limited address pools to masquerade the source address during forwarding, causing PAWS verification to fail more easily. 
Currently, packet loss statistics for PAWS can only be viewed through MIB, which is a global metric and cannot be precisely obtained through tracing to get the specific 4-tuple of the dropped packet. In the past, we had to use kprobe ret to retrieve relevant skb information from tcp_timewait_state_process(). We add a drop_reason pointer, similar to what previous commit does: commit e34100c2ecbb ("tcp: add a drop_reason pointer to tcp_check_req()") This commit addresses the PAWSESTABREJECTED case and also sets the corresponding drop reason. We use 'pwru' to test. Before this commit: '''' ./pwru 'port 9999' 2025/04/07 13:40:19 Listening for events.. TUPLE FUNC 172.31.75.115:12345->172.31.75.114:9999(tcp) sk_skb_reason_drop(SKB_DROP_REASON_NOT_SPECIFIED) ''' After this commit: ''' ./pwru 'port 9999' 2025/04/07 13:51:34 Listening for events.. TUPLE FUNC 172.31.75.115:12345->172.31.75.114:9999(tcp) sk_skb_reason_drop(SKB_DROP_REASON_TCP_RFC7323_TW_PAWS) ''' Suggested-by: Eric Dumazet Signed-off-by: Jiayuan Chen Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250409112614.16153-2-jiayuan.chen@linux.dev Signed-off-by: Jakub Kicinski --- include/net/dropreason-core.h | 6 ++++++ include/net/tcp.h | 3 ++- 2 files changed, 8 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h index e4fdc6b54cef..9701d7f936f6 100644 --- a/include/net/dropreason-core.h +++ b/include/net/dropreason-core.h @@ -40,6 +40,7 @@ FN(TCP_OFOMERGE) \ FN(TCP_RFC7323_PAWS) \ FN(TCP_RFC7323_PAWS_ACK) \ + FN(TCP_RFC7323_TW_PAWS) \ FN(TCP_RFC7323_TSECR) \ FN(TCP_LISTEN_OVERFLOW) \ FN(TCP_OLD_SEQUENCE) \ @@ -283,6 +284,11 @@ enum skb_drop_reason { * Corresponds to LINUX_MIB_PAWS_OLD_ACK. */ SKB_DROP_REASON_TCP_RFC7323_PAWS_ACK, + /** + * @SKB_DROP_REASON_TCP_RFC7323_TW_PAWS: PAWS check, socket is in + * TIME_WAIT state. 
+ */ + SKB_DROP_REASON_TCP_RFC7323_TW_PAWS, /** * @SKB_DROP_REASON_TCP_RFC7323_TSECR: PAWS check, invalid TSEcr. * Corresponds to LINUX_MIB_TSECRREJECTED. diff --git a/include/net/tcp.h b/include/net/tcp.h index 4450c384ef17..5078ad868fee 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -427,7 +427,8 @@ enum tcp_tw_status { enum tcp_tw_status tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, const struct tcphdr *th, - u32 *tw_isn); + u32 *tw_isn, + enum skb_drop_reason *drop_reason); struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, struct request_sock *req, bool fastopen, bool *lost_race, enum skb_drop_reason *drop_reason); -- cgit v1.2.3 From c449d5f3a3d70b6223af8df2cadca3ca6eacb613 Mon Sep 17 00:00:00 2001 From: Jiayuan Chen Date: Wed, 9 Apr 2025 19:26:05 +0800 Subject: tcp: add LINUX_MIB_PAWS_TW_REJECTED counter When TCP is in TIME_WAIT state, PAWS verification uses LINUX_PAWSESTABREJECTED, which is ambiguous and cannot be distinguished from other PAWS verification processes. We added a new counter, like the existing PAWS_OLD_ACK one. Also we update the doc with previously missing PAWS_OLD_ACK. usage: ''' nstat -az | grep PAWSTimewait TcpExtPAWSTimewait 1 0.0 ''' Suggested-by: Eric Dumazet Signed-off-by: Jiayuan Chen Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250409112614.16153-3-jiayuan.chen@linux.dev Signed-off-by: Jakub Kicinski --- include/net/dropreason-core.h | 1 + include/uapi/linux/snmp.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h index 9701d7f936f6..bea77934a235 100644 --- a/include/net/dropreason-core.h +++ b/include/net/dropreason-core.h @@ -287,6 +287,7 @@ enum skb_drop_reason { /** * @SKB_DROP_REASON_TCP_RFC7323_TW_PAWS: PAWS check, socket is in * TIME_WAIT state. + * Corresponds to LINUX_MIB_PAWS_TW_REJECTED. 
*/ SKB_DROP_REASON_TCP_RFC7323_TW_PAWS, /** diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h index ec47f9b68a1b..1d234d7e1892 100644 --- a/include/uapi/linux/snmp.h +++ b/include/uapi/linux/snmp.h @@ -188,6 +188,7 @@ enum LINUX_MIB_PAWSESTABREJECTED, /* PAWSEstabRejected */ LINUX_MIB_TSECRREJECTED, /* TSEcrRejected */ LINUX_MIB_PAWS_OLD_ACK, /* PAWSOldAck */ + LINUX_MIB_PAWS_TW_REJECTED, /* PAWSTimewait */ LINUX_MIB_DELAYEDACKS, /* DelayedACKs */ LINUX_MIB_DELAYEDACKLOCKED, /* DelayedACKLocked */ LINUX_MIB_DELAYEDACKLOST, /* DelayedACKLost */ -- cgit v1.2.3 From b1e904999542ad6764eafa54545f1c55776006d1 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Tue, 8 Apr 2025 11:32:01 -0700 Subject: net: pass const to msg_data_left() The msg_data_left() function doesn't modify the struct msghdr parameter, so mark it as const. This allows the function to be used with const references, improving type safety and making the API more flexible. Signed-off-by: Breno Leitao Reviewed-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250408-tcpsendmsg-v3-1-208b87064c28@debian.org Signed-off-by: Jakub Kicinski --- include/linux/socket.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/socket.h b/include/linux/socket.h index c3322eb3d686..3b262487ec06 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -168,7 +168,7 @@ static inline struct cmsghdr * cmsg_nxthdr (struct msghdr *__msg, struct cmsghdr return __cmsg_nxthdr(__msg->msg_control, __msg->msg_controllen, __cmsg); } -static inline size_t msg_data_left(struct msghdr *msg) +static inline size_t msg_data_left(const struct msghdr *msg) { return iov_iter_count(&msg->msg_iter); } -- cgit v1.2.3 From 0f08335ade71273f89d19412268b48b55f3e3726 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Tue, 8 Apr 2025 11:32:02 -0700 Subject: trace: tcp: Add tracepoint for tcp_sendmsg_locked() Add a tracepoint to monitor TCP send 
operations, enabling detailed visibility into TCP message transmission. Create a new tracepoint within the tcp_sendmsg_locked function, capturing traditional fields along with size_goal, which indicates the optimal data size for a single TCP segment. Additionally, a reference to the struct sock sk is passed, allowing direct access for BPF programs. The implementation is largely based on David's patch[1] and suggestions. Link: https://lore.kernel.org/all/70168c8f-bf52-4279-b4c4-be64527aa1ac@kernel.org/ [1] Signed-off-by: Breno Leitao Reviewed-by: David Ahern Reviewed-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250408-tcpsendmsg-v3-2-208b87064c28@debian.org Signed-off-by: Jakub Kicinski --- include/trace/events/tcp.h | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'include') diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h index 1a40c41ff8c3..75d3d53a3832 100644 --- a/include/trace/events/tcp.h +++ b/include/trace/events/tcp.h @@ -259,6 +259,30 @@ TRACE_EVENT(tcp_retransmit_synack, __entry->saddr_v6, __entry->daddr_v6) ); +TRACE_EVENT(tcp_sendmsg_locked, + TP_PROTO(const struct sock *sk, const struct msghdr *msg, + const struct sk_buff *skb, int size_goal), + + TP_ARGS(sk, msg, skb, size_goal), + + TP_STRUCT__entry( + __field(const void *, skb_addr) + __field(int, skb_len) + __field(int, msg_left) + __field(int, size_goal) + ), + + TP_fast_assign( + __entry->skb_addr = skb; + __entry->skb_len = skb ? 
skb->len : 0; + __entry->msg_left = msg_data_left(msg); + __entry->size_goal = size_goal; + ), + + TP_printk("skb_addr %p skb_len %d msg_left %d size_goal %d", + __entry->skb_addr, __entry->skb_len, __entry->msg_left, + __entry->size_goal)); + DECLARE_TRACE(tcp_cwnd_reduction_tp, TP_PROTO(const struct sock *sk, int newly_acked_sacked, int newly_lost, int flag), -- cgit v1.2.3 From 2a63dd0edf388802074f1d4d6b588a3b4c380688 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 9 Apr 2025 19:36:45 -0700 Subject: net: Retire DCCP socket. DCCP was orphaned in 2021 by commit 054c4610bd05 ("MAINTAINERS: dccp: move Gerrit Renker to CREDITS"), which noted that the last maintainer had been inactive for five years. In recent years, it has become a playground for syzbot, and most changes to DCCP have been odd bug fixes triggered by syzbot. Apart from that, the only changes have been driven by treewide or networking API updates or adjustments related to TCP. Thus, in 2023, we announced we would remove DCCP in 2025 via commit b144fcaf46d4 ("dccp: Print deprecation notice."). Since then, only one individual has contacted the netdev mailing list. [0] There is ongoing research for Multipath DCCP. The repository is hosted on GitHub [1], and development is not taking place through the upstream community. While the repository is published under the GPLv2 license, the scheduling part remains proprietary, with a LICENSE file [2] stating: "This is not Open Source software." The researcher mentioned a plan to address the licensing issue, upstream the patches, and step up as a maintainer, but there has been no further communication since then. Maintaining DCCP for a decade without any real users has become a burden. Therefore, it's time to remove it. Removing DCCP will also provide significant benefits to TCP. 
It allows us to freely reorganize the layout of struct inet_connection_sock, which is currently shared with DCCP, and optimize it to reduce the number of cachelines accessed in the TCP fast path. Note that we keep DCCP netfilter modules as requested. [3] Link: https://lore.kernel.org/netdev/20230710182253.81446-1-kuniyu@amazon.com/T/#u #[0] Link: https://github.com/telekom/mp-dccp #[1] Link: https://github.com/telekom/mp-dccp/blob/mpdccp_v03_k5.10/net/dccp/non_gpl_scheduler/LICENSE #[2] Link: https://lore.kernel.org/netdev/Z_VQ0KlCRkqYWXa-@calendula/ #[3] Signed-off-by: Kuniyuki Iwashima Acked-by: Paul Moore (LSM and SELinux) Acked-by: Casey Schaufler Link: https://patch.msgid.link/20250410023921.11307-3-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- include/linux/dccp.h | 289 ------------------------------------------ include/linux/tfrc.h | 51 -------- include/net/inet_hashtables.h | 6 +- include/net/rstreason.h | 2 +- include/net/secure_seq.h | 4 - include/trace/events/sock.h | 1 - include/trace/events/sunrpc.h | 2 - 7 files changed, 2 insertions(+), 353 deletions(-) delete mode 100644 include/linux/tfrc.h (limited to 'include') diff --git a/include/linux/dccp.h b/include/linux/dccp.h index 325af611909f..0b61b8b996d4 100644 --- a/include/linux/dccp.h +++ b/include/linux/dccp.h @@ -2,79 +2,8 @@ #ifndef _LINUX_DCCP_H #define _LINUX_DCCP_H - -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include #include -enum dccp_state { - DCCP_OPEN = TCP_ESTABLISHED, - DCCP_REQUESTING = TCP_SYN_SENT, - DCCP_LISTEN = TCP_LISTEN, - DCCP_RESPOND = TCP_SYN_RECV, - /* - * States involved in closing a DCCP connection: - * 1) ACTIVE_CLOSEREQ is entered by a server sending a CloseReq. - * - * 2) CLOSING can have three different meanings (RFC 4340, 8.3): - * a. Client has performed active-close, has sent a Close to the server - * from state OPEN or PARTOPEN, and is waiting for the final Reset - * (in this case, SOCK_DONE == 1). - * b. 
Client is asked to perform passive-close, by receiving a CloseReq - * in (PART)OPEN state. It sends a Close and waits for final Reset - * (in this case, SOCK_DONE == 0). - * c. Server performs an active-close as in (a), keeps TIMEWAIT state. - * - * 3) The following intermediate states are employed to give passively - * closing nodes a chance to process their unread data: - * - PASSIVE_CLOSE (from OPEN => CLOSED) and - * - PASSIVE_CLOSEREQ (from (PART)OPEN to CLOSING; case (b) above). - */ - DCCP_ACTIVE_CLOSEREQ = TCP_FIN_WAIT1, - DCCP_PASSIVE_CLOSE = TCP_CLOSE_WAIT, /* any node receiving a Close */ - DCCP_CLOSING = TCP_CLOSING, - DCCP_TIME_WAIT = TCP_TIME_WAIT, - DCCP_CLOSED = TCP_CLOSE, - DCCP_NEW_SYN_RECV = TCP_NEW_SYN_RECV, - DCCP_PARTOPEN = TCP_MAX_STATES, - DCCP_PASSIVE_CLOSEREQ, /* clients receiving CloseReq */ - DCCP_MAX_STATES -}; - -enum { - DCCPF_OPEN = TCPF_ESTABLISHED, - DCCPF_REQUESTING = TCPF_SYN_SENT, - DCCPF_LISTEN = TCPF_LISTEN, - DCCPF_RESPOND = TCPF_SYN_RECV, - DCCPF_ACTIVE_CLOSEREQ = TCPF_FIN_WAIT1, - DCCPF_CLOSING = TCPF_CLOSING, - DCCPF_TIME_WAIT = TCPF_TIME_WAIT, - DCCPF_CLOSED = TCPF_CLOSE, - DCCPF_NEW_SYN_RECV = TCPF_NEW_SYN_RECV, - DCCPF_PARTOPEN = (1 << DCCP_PARTOPEN), -}; - -static inline struct dccp_hdr *dccp_hdr(const struct sk_buff *skb) -{ - return (struct dccp_hdr *)skb_transport_header(skb); -} - -static inline struct dccp_hdr *dccp_zeroed_hdr(struct sk_buff *skb, int headlen) -{ - skb_push(skb, headlen); - skb_reset_transport_header(skb); - return memset(skb_transport_header(skb), 0, headlen); -} - static inline struct dccp_hdr_ext *dccp_hdrx(const struct dccp_hdr *dh) { return (struct dccp_hdr_ext *)((unsigned char *)dh + sizeof(*dh)); @@ -85,12 +14,6 @@ static inline unsigned int __dccp_basic_hdr_len(const struct dccp_hdr *dh) return sizeof(*dh) + (dh->dccph_x ? 
sizeof(struct dccp_hdr_ext) : 0); } -static inline unsigned int dccp_basic_hdr_len(const struct sk_buff *skb) -{ - const struct dccp_hdr *dh = dccp_hdr(skb); - return __dccp_basic_hdr_len(dh); -} - static inline __u64 dccp_hdr_seq(const struct dccp_hdr *dh) { __u64 seq_nr = ntohs(dh->dccph_seq); @@ -103,222 +26,10 @@ static inline __u64 dccp_hdr_seq(const struct dccp_hdr *dh) return seq_nr; } -static inline struct dccp_hdr_request *dccp_hdr_request(struct sk_buff *skb) -{ - return (struct dccp_hdr_request *)(skb_transport_header(skb) + - dccp_basic_hdr_len(skb)); -} - -static inline struct dccp_hdr_ack_bits *dccp_hdr_ack_bits(const struct sk_buff *skb) -{ - return (struct dccp_hdr_ack_bits *)(skb_transport_header(skb) + - dccp_basic_hdr_len(skb)); -} - -static inline u64 dccp_hdr_ack_seq(const struct sk_buff *skb) -{ - const struct dccp_hdr_ack_bits *dhack = dccp_hdr_ack_bits(skb); - return ((u64)ntohs(dhack->dccph_ack_nr_high) << 32) + ntohl(dhack->dccph_ack_nr_low); -} - -static inline struct dccp_hdr_response *dccp_hdr_response(struct sk_buff *skb) -{ - return (struct dccp_hdr_response *)(skb_transport_header(skb) + - dccp_basic_hdr_len(skb)); -} - -static inline struct dccp_hdr_reset *dccp_hdr_reset(struct sk_buff *skb) -{ - return (struct dccp_hdr_reset *)(skb_transport_header(skb) + - dccp_basic_hdr_len(skb)); -} - static inline unsigned int __dccp_hdr_len(const struct dccp_hdr *dh) { return __dccp_basic_hdr_len(dh) + dccp_packet_hdr_len(dh->dccph_type); } -static inline unsigned int dccp_hdr_len(const struct sk_buff *skb) -{ - return __dccp_hdr_len(dccp_hdr(skb)); -} - -/** - * struct dccp_request_sock - represent DCCP-specific connection request - * @dreq_inet_rsk: structure inherited from - * @dreq_iss: initial sequence number, sent on the first Response (RFC 4340, 7.1) - * @dreq_gss: greatest sequence number sent (for retransmitted Responses) - * @dreq_isr: initial sequence number received in the first Request - * @dreq_gsr: greatest sequence number 
received (for retransmitted Request(s)) - * @dreq_service: service code present on the Request (there is just one) - * @dreq_featneg: feature negotiation options for this connection - * The following two fields are analogous to the ones in dccp_sock: - * @dreq_timestamp_echo: last received timestamp to echo (13.1) - * @dreq_timestamp_echo: the time of receiving the last @dreq_timestamp_echo - */ -struct dccp_request_sock { - struct inet_request_sock dreq_inet_rsk; - __u64 dreq_iss; - __u64 dreq_gss; - __u64 dreq_isr; - __u64 dreq_gsr; - __be32 dreq_service; - spinlock_t dreq_lock; - struct list_head dreq_featneg; - __u32 dreq_timestamp_echo; - __u32 dreq_timestamp_time; -}; - -static inline struct dccp_request_sock *dccp_rsk(const struct request_sock *req) -{ - return (struct dccp_request_sock *)req; -} - -extern struct inet_timewait_death_row dccp_death_row; - -extern int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq, - struct sk_buff *skb); - -struct dccp_options_received { - u64 dccpor_ndp:48; - u32 dccpor_timestamp; - u32 dccpor_timestamp_echo; - u32 dccpor_elapsed_time; -}; - -struct ccid; - -enum dccp_role { - DCCP_ROLE_UNDEFINED, - DCCP_ROLE_LISTEN, - DCCP_ROLE_CLIENT, - DCCP_ROLE_SERVER, -}; - -struct dccp_service_list { - __u32 dccpsl_nr; - __be32 dccpsl_list[]; -}; - -#define DCCP_SERVICE_INVALID_VALUE htonl((__u32)-1) -#define DCCP_SERVICE_CODE_IS_ABSENT 0 - -static inline bool dccp_list_has_service(const struct dccp_service_list *sl, - const __be32 service) -{ - if (likely(sl != NULL)) { - u32 i = sl->dccpsl_nr; - while (i--) - if (sl->dccpsl_list[i] == service) - return true; - } - return false; -} - -struct dccp_ackvec; - -/** - * struct dccp_sock - DCCP socket state - * - * @dccps_swl - sequence number window low - * @dccps_swh - sequence number window high - * @dccps_awl - acknowledgement number window low - * @dccps_awh - acknowledgement number window high - * @dccps_iss - initial sequence number sent - * @dccps_isr - initial 
sequence number received - * @dccps_osr - first OPEN sequence number received - * @dccps_gss - greatest sequence number sent - * @dccps_gsr - greatest valid sequence number received - * @dccps_gar - greatest valid ack number received on a non-Sync; initialized to %dccps_iss - * @dccps_service - first (passive sock) or unique (active sock) service code - * @dccps_service_list - second .. last service code on passive socket - * @dccps_timestamp_echo - latest timestamp received on a TIMESTAMP option - * @dccps_timestamp_time - time of receiving latest @dccps_timestamp_echo - * @dccps_l_ack_ratio - feature-local Ack Ratio - * @dccps_r_ack_ratio - feature-remote Ack Ratio - * @dccps_l_seq_win - local Sequence Window (influences ack number validity) - * @dccps_r_seq_win - remote Sequence Window (influences seq number validity) - * @dccps_pcslen - sender partial checksum coverage (via sockopt) - * @dccps_pcrlen - receiver partial checksum coverage (via sockopt) - * @dccps_send_ndp_count - local Send NDP Count feature (7.7.2) - * @dccps_ndp_count - number of Non Data Packets since last data packet - * @dccps_mss_cache - current value of MSS (path MTU minus header sizes) - * @dccps_rate_last - timestamp for rate-limiting DCCP-Sync (RFC 4340, 7.5.4) - * @dccps_featneg - tracks feature-negotiation state (mostly during handshake) - * @dccps_hc_rx_ackvec - rx half connection ack vector - * @dccps_hc_rx_ccid - CCID used for the receiver (or receiving half-connection) - * @dccps_hc_tx_ccid - CCID used for the sender (or sending half-connection) - * @dccps_options_received - parsed set of retrieved options - * @dccps_qpolicy - TX dequeueing policy, one of %dccp_packet_dequeueing_policy - * @dccps_tx_qlen - maximum length of the TX queue - * @dccps_role - role of this sock, one of %dccp_role - * @dccps_hc_rx_insert_options - receiver wants to add options when acking - * @dccps_hc_tx_insert_options - sender wants to add options when sending - * @dccps_server_timewait - server holds 
timewait state on close (RFC 4340, 8.3) - * @dccps_sync_scheduled - flag which signals "send out-of-band message soon" - * @dccps_xmitlet - tasklet scheduled by the TX CCID to dequeue data packets - * @dccps_xmit_timer - used by the TX CCID to delay sending (rate-based pacing) - * @dccps_syn_rtt - RTT sample from Request/Response exchange (in usecs) - */ -struct dccp_sock { - /* inet_connection_sock has to be the first member of dccp_sock */ - struct inet_connection_sock dccps_inet_connection; -#define dccps_syn_rtt dccps_inet_connection.icsk_ack.lrcvtime - __u64 dccps_swl; - __u64 dccps_swh; - __u64 dccps_awl; - __u64 dccps_awh; - __u64 dccps_iss; - __u64 dccps_isr; - __u64 dccps_osr; - __u64 dccps_gss; - __u64 dccps_gsr; - __u64 dccps_gar; - __be32 dccps_service; - __u32 dccps_mss_cache; - struct dccp_service_list *dccps_service_list; - __u32 dccps_timestamp_echo; - __u32 dccps_timestamp_time; - __u16 dccps_l_ack_ratio; - __u16 dccps_r_ack_ratio; - __u64 dccps_l_seq_win:48; - __u64 dccps_r_seq_win:48; - __u8 dccps_pcslen:4; - __u8 dccps_pcrlen:4; - __u8 dccps_send_ndp_count:1; - __u64 dccps_ndp_count:48; - unsigned long dccps_rate_last; - struct list_head dccps_featneg; - struct dccp_ackvec *dccps_hc_rx_ackvec; - struct ccid *dccps_hc_rx_ccid; - struct ccid *dccps_hc_tx_ccid; - struct dccp_options_received dccps_options_received; - __u8 dccps_qpolicy; - __u32 dccps_tx_qlen; - enum dccp_role dccps_role:2; - __u8 dccps_hc_rx_insert_options:1; - __u8 dccps_hc_tx_insert_options:1; - __u8 dccps_server_timewait:1; - __u8 dccps_sync_scheduled:1; - struct tasklet_struct dccps_xmitlet; - struct timer_list dccps_xmit_timer; -}; - -#define dccp_sk(ptr) container_of_const(ptr, struct dccp_sock, \ - dccps_inet_connection.icsk_inet.sk) - -static inline const char *dccp_role(const struct sock *sk) -{ - switch (dccp_sk(sk)->dccps_role) { - case DCCP_ROLE_UNDEFINED: return "undefined"; - case DCCP_ROLE_LISTEN: return "listen"; - case DCCP_ROLE_SERVER: return "server"; - case 
DCCP_ROLE_CLIENT: return "client"; - } - return NULL; -} - -extern void dccp_syn_ack_timeout(const struct request_sock *req); - #endif /* _LINUX_DCCP_H */ diff --git a/include/linux/tfrc.h b/include/linux/tfrc.h deleted file mode 100644 index a5acc768085d..000000000000 --- a/include/linux/tfrc.h +++ /dev/null @@ -1,51 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -#ifndef _LINUX_TFRC_H_ -#define _LINUX_TFRC_H_ -/* - * TFRC - Data Structures for the TCP-Friendly Rate Control congestion - * control mechanism as specified in RFC 3448. - * - * Copyright (c) 2005 The University of Waikato, Hamilton, New Zealand. - * Copyright (c) 2005 Ian McDonald - * Copyright (c) 2005 Arnaldo Carvalho de Melo - * Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon - */ -#include - -/** tfrc_rx_info - TFRC Receiver Data Structure - * - * @tfrcrx_x_recv: receiver estimate of sending rate (3.2.2) - * @tfrcrx_rtt: round-trip-time (communicated by sender) - * @tfrcrx_p: current estimate of loss event rate (3.2.2) - */ -struct tfrc_rx_info { - __u32 tfrcrx_x_recv; - __u32 tfrcrx_rtt; - __u32 tfrcrx_p; -}; - -/** tfrc_tx_info - TFRC Sender Data Structure - * - * @tfrctx_x: computed transmit rate (4.3 (4)) - * @tfrctx_x_recv: receiver estimate of send rate (4.3) - * @tfrctx_x_calc: return value of throughput equation (3.1) - * @tfrctx_rtt: (moving average) estimate of RTT (4.3) - * @tfrctx_p: current loss event rate (5.4) - * @tfrctx_rto: estimate of RTO, equals 4*RTT (4.3) - * @tfrctx_ipi: inter-packet interval (4.6) - * - * Note: X and X_recv are both maintained in units of 64 * bytes/second. This - * enables a finer resolution of sending rates and avoids problems with - * integer arithmetic; u32 is not sufficient as scaling consumes 6 bits. 
- */ -struct tfrc_tx_info { - __u64 tfrctx_x; - __u64 tfrctx_x_recv; - __u32 tfrctx_x_calc; - __u32 tfrctx_rtt; - __u32 tfrctx_p; - __u32 tfrctx_rto; - __u32 tfrctx_ipi; -}; - -#endif /* _LINUX_TFRC_H_ */ diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index 949641e92539..d172b64a6320 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -177,12 +177,8 @@ struct inet_hashinfo { static inline struct inet_hashinfo *tcp_or_dccp_get_hashinfo(const struct sock *sk) { -#if IS_ENABLED(CONFIG_IP_DCCP) - return sk->sk_prot->h.hashinfo ? : - sock_net(sk)->ipv4.tcp_death_row.hashinfo; -#else + /* TODO: rename function */ return sock_net(sk)->ipv4.tcp_death_row.hashinfo; -#endif } static inline struct inet_listen_hashbucket * diff --git a/include/net/rstreason.h b/include/net/rstreason.h index 69cb2e52b7da..979ac87b5d99 100644 --- a/include/net/rstreason.h +++ b/include/net/rstreason.h @@ -36,7 +36,7 @@ /** * enum sk_rst_reason - the reasons of socket reset * - * The reasons of sk reset, which are used in DCCP/TCP/MPTCP protocols. + * The reasons of sk reset, which are used in TCP/MPTCP protocols. 
* * There are three parts in order: * 1) skb drop reasons: relying on drop reasons for such as passive reset diff --git a/include/net/secure_seq.h b/include/net/secure_seq.h index 21e7fa2a1813..cddebafb9f77 100644 --- a/include/net/secure_seq.h +++ b/include/net/secure_seq.h @@ -16,9 +16,5 @@ u32 secure_tcpv6_seq(const __be32 *saddr, const __be32 *daddr, __be16 sport, __be16 dport); u32 secure_tcpv6_ts_off(const struct net *net, const __be32 *saddr, const __be32 *daddr); -u64 secure_dccp_sequence_number(__be32 saddr, __be32 daddr, - __be16 sport, __be16 dport); -u64 secure_dccpv6_sequence_number(__be32 *saddr, __be32 *daddr, - __be16 sport, __be16 dport); #endif /* _NET_SECURE_SEQ */ diff --git a/include/trace/events/sock.h b/include/trace/events/sock.h index 3836de435d9d..b5310439536e 100644 --- a/include/trace/events/sock.h +++ b/include/trace/events/sock.h @@ -19,7 +19,6 @@ /* The protocol traced by inet_sock_set_state */ #define inet_protocol_names \ EM(IPPROTO_TCP) \ - EM(IPPROTO_DCCP) \ EM(IPPROTO_SCTP) \ EMe(IPPROTO_MPTCP) diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h index 5d331383047b..de214f1dea58 100644 --- a/include/trace/events/sunrpc.h +++ b/include/trace/events/sunrpc.h @@ -21,7 +21,6 @@ TRACE_DEFINE_ENUM(SOCK_DGRAM); TRACE_DEFINE_ENUM(SOCK_RAW); TRACE_DEFINE_ENUM(SOCK_RDM); TRACE_DEFINE_ENUM(SOCK_SEQPACKET); -TRACE_DEFINE_ENUM(SOCK_DCCP); TRACE_DEFINE_ENUM(SOCK_PACKET); #define show_socket_type(type) \ @@ -31,7 +30,6 @@ TRACE_DEFINE_ENUM(SOCK_PACKET); { SOCK_RAW, "RAW" }, \ { SOCK_RDM, "RDM" }, \ { SOCK_SEQPACKET, "SEQPACKET" }, \ - { SOCK_DCCP, "DCCP" }, \ { SOCK_PACKET, "PACKET" }) /* This list is known to be incomplete, add new enums as needed. */ -- cgit v1.2.3 From 22d6c9eebf2e68e6ab831ded37daaa83daff6bb8 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 9 Apr 2025 19:36:46 -0700 Subject: net: Unexport shared functions for DCCP. DCCP was removed, so many inet functions no longer need to be exported. 
Let's unexport or use EXPORT_IPV6_MOD() for such functions. sk_free_unlock_clone() is inlined in sk_clone_lock() as it's the only caller. Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250410023921.11307-4-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- include/net/sock.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index 694f954258d4..bb4d6189292f 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1781,7 +1781,6 @@ void sk_free(struct sock *sk); void sk_net_refcnt_upgrade(struct sock *sk); void sk_destruct(struct sock *sk); struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority); -void sk_free_unlock_clone(struct sock *sk); struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, gfp_t priority); -- cgit v1.2.3 From 235bd9d21fcdf07dd125daa3e60ab64f8aefb927 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 9 Apr 2025 19:36:47 -0700 Subject: tcp: Rename tcp_or_dccp_get_hashinfo(). DCCP was removed, so tcp_or_dccp_get_hashinfo() should be renamed. Let's rename it to tcp_get_hashinfo(). 
Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250410023921.11307-5-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- include/net/inet_hashtables.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index d172b64a6320..4564b5d348b1 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -175,9 +175,8 @@ struct inet_hashinfo { bool pernet; } ____cacheline_aligned_in_smp; -static inline struct inet_hashinfo *tcp_or_dccp_get_hashinfo(const struct sock *sk) +static inline struct inet_hashinfo *tcp_get_hashinfo(const struct sock *sk) { - /* TODO: rename function */ return sock_net(sk)->ipv4.tcp_death_row.hashinfo; } -- cgit v1.2.3 From c26c192c3d486a2a7d83d254bae294c2f8f50abf Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Wed, 9 Apr 2025 22:00:56 +0200 Subject: udp: properly deal with xfrm encap and ADDRFORM UDP GRO accounting assumes that the GRO receive callback is always set when the UDP tunnel is enabled, but syzkaller proved otherwise, leading to the following splat: WARNING: CPU: 0 PID: 5837 at net/ipv4/udp_offload.c:123 udp_tunnel_update_gro_rcv+0x28d/0x4c0 net/ipv4/udp_offload.c:123 Modules linked in: CPU: 0 UID: 0 PID: 5837 Comm: syz-executor850 Not tainted 6.14.0-syzkaller-13320-g420aabef3ab5 #0 PREEMPT(full) Hardware name: Google Compute Engine/Google Compute Engine, BIOS Google 02/12/2025 RIP: 0010:udp_tunnel_update_gro_rcv+0x28d/0x4c0 net/ipv4/udp_offload.c:123 Code: 00 00 e8 c6 5a 2f f7 48 c1 e5 04 48 8d b5 20 53 c7 9a ba 10 00 00 00 4c 89 ff e8 ce 87 99 f7 e9 ce 00 00 00 e8 a4 5a 2f f7 90 <0f> 0b 90 e9 de fd ff ff bf 01 00 00 00 89 ee e8 cf 5e 2f f7 85 ed RSP: 0018:ffffc90003effa88 EFLAGS: 00010293 RAX: ffffffff8a93fc9c RBX: 0000000000000000 RCX: ffff8880306f9e00 RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000 RBP: 0000000000000000 R08: ffffffff8a93fabe R09: 1ffffffff20bfb2e R10:
dffffc0000000000 R11: fffffbfff20bfb2f R12: ffff88814ef21738 R13: dffffc0000000000 R14: ffff88814ef21778 R15: 1ffff11029de42ef FS: 0000000000000000(0000) GS:ffff888124f96000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f04eec760d0 CR3: 000000000eb38000 CR4: 00000000003526f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: udp_tunnel_cleanup_gro include/net/udp_tunnel.h:205 [inline] udpv6_destroy_sock+0x212/0x270 net/ipv6/udp.c:1829 sk_common_release+0x71/0x2e0 net/core/sock.c:3896 inet_release+0x17d/0x200 net/ipv4/af_inet.c:435 __sock_release net/socket.c:647 [inline] sock_close+0xbc/0x240 net/socket.c:1391 __fput+0x3e9/0x9f0 fs/file_table.c:465 task_work_run+0x251/0x310 kernel/task_work.c:227 exit_task_work include/linux/task_work.h:40 [inline] do_exit+0xa11/0x27f0 kernel/exit.c:953 do_group_exit+0x207/0x2c0 kernel/exit.c:1102 __do_sys_exit_group kernel/exit.c:1113 [inline] __se_sys_exit_group kernel/exit.c:1111 [inline] __x64_sys_exit_group+0x3f/0x40 kernel/exit.c:1111 x64_sys_call+0x26c3/0x26d0 arch/x86/include/generated/asm/syscalls_64.h:232 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/syscall_64.c:94 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7f04eebfac79 Code: Unable to access opcode bytes at 0x7f04eebfac4f. RSP: 002b:00007fffdcaa34a8 EFLAGS: 00000246 ORIG_RAX: 00000000000000e7 RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f04eebfac79 RDX: 000000000000003c RSI: 00000000000000e7 RDI: 0000000000000000 RBP: 00007f04eec75270 R08: ffffffffffffffb8 R09: 00007fffdcaa36c8 R10: 0000200000000000 R11: 0000000000000246 R12: 00007f04eec75270 R13: 0000000000000000 R14: 00007f04eec75cc0 R15: 00007f04eebcca70 Address the issue moving the accounting hook into setup_udp_tunnel_sock() and set_xfrm_gro_udp_encap_rcv(), where the GRO callback is actually set. 
set_xfrm_gro_udp_encap_rcv() is prone to races with IPV6_ADDRFORM, run the relevant setsockopt under the socket lock to ensure using consistent values of sk_family and up->encap_type. Refactor the GRO callback selection code, to make it clear that the function pointer is always initialized. Reported-by: syzbot+8c469a2260132cd095c1@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=8c469a2260132cd095c1 Fixes: 172bf009c18d ("xfrm: Support GRO for IPv4 ESP in UDP encapsulation") Fixes: 5d7f5b2f6b935 ("udp_tunnel: use static call for GRO hooks when possible") Signed-off-by: Paolo Abeni Reviewed-by: Sabrina Dubroca Link: https://patch.msgid.link/92bcdb6899145a9a387c8fa9e3ca656642a43634.1744228733.git.pabeni@redhat.com Signed-off-by: Jakub Kicinski --- include/net/udp_tunnel.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h index 288f06f23a80..2df3b8344eb5 100644 --- a/include/net/udp_tunnel.h +++ b/include/net/udp_tunnel.h @@ -215,7 +215,6 @@ static inline void udp_tunnel_encap_enable(struct sock *sk) if (READ_ONCE(sk->sk_family) == PF_INET6) ipv6_stub->udpv6_encap_enable(); #endif - udp_tunnel_update_gro_rcv(sk, true); udp_encap_enable(); } -- cgit v1.2.3 From 097f171f98289cf737437599c40b0d1e81266e9e Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 9 Apr 2025 18:42:46 -0700 Subject: net: convert dev->rtnl_link_state to a bool netdevice reg_state was split into two 16 bit enums back in 2010 in commit a2835763e130 ("rtnetlink: handle rtnl_link netlink notifications manually"). Since the split the fields have been moved apart, and last year we converted reg_state to a normal u8 in commit 4d42b37def70 ("net: convert dev->reg_state to u8"). rtnl_link_state being a 16 bitfield makes no sense. Convert it to a single bool, it seems very unlikely after 15 years that we'll need more values in it. 
We could drop dev->rtnl_link_ops from the conditions but feels like having it there more clearly points at the reason for this hack. Acked-by: Stanislav Fomichev Link: https://patch.msgid.link/20250410014246.780885-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index d8544f6a680c..e6036b82ef4c 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1946,9 +1946,6 @@ enum netdev_reg_state { * * @reg_state: Register/unregister state machine * @dismantle: Device is going to be freed - * @rtnl_link_state: This enum represents the phases of creating - * a new link - * * @needs_free_netdev: Should unregister perform free_netdev? * @priv_destructor: Called from unregister * @npinfo: XXX: need comments on this one @@ -2363,11 +2360,8 @@ struct net_device { /** @moving_ns: device is changing netns, protected by @lock */ bool moving_ns; - - enum { - RTNL_LINK_INITIALIZED, - RTNL_LINK_INITIALIZING, - } rtnl_link_state:16; + /** @rtnl_link_initializing: Device being created, suppress events */ + bool rtnl_link_initializing; bool needs_free_netdev; void (*priv_destructor)(struct net_device *dev); -- cgit v1.2.3 From e846fb5e7c5243c65ff67247cb29a9d76bbcc4e8 Mon Sep 17 00:00:00 2001 From: Joseph Huang Date: Fri, 11 Apr 2025 11:03:16 -0400 Subject: net: bridge: mcast: Add offload failed mdb flag Add MDB_FLAGS_OFFLOAD_FAILED and MDB_PG_FLAGS_OFFLOAD_FAILED to indicate that an attempt to offload the MDB entry to switchdev has failed. 
Signed-off-by: Joseph Huang Acked-by: Nikolay Aleksandrov Link: https://patch.msgid.link/20250411150323.1117797-2-Joseph.Huang@garmin.com Signed-off-by: Jakub Kicinski --- include/uapi/linux/if_bridge.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index a5b743a2f775..f2a6de424f3f 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -699,10 +699,11 @@ struct br_mdb_entry { #define MDB_TEMPORARY 0 #define MDB_PERMANENT 1 __u8 state; -#define MDB_FLAGS_OFFLOAD (1 << 0) -#define MDB_FLAGS_FAST_LEAVE (1 << 1) -#define MDB_FLAGS_STAR_EXCL (1 << 2) -#define MDB_FLAGS_BLOCKED (1 << 3) +#define MDB_FLAGS_OFFLOAD (1 << 0) +#define MDB_FLAGS_FAST_LEAVE (1 << 1) +#define MDB_FLAGS_STAR_EXCL (1 << 2) +#define MDB_FLAGS_BLOCKED (1 << 3) +#define MDB_FLAGS_OFFLOAD_FAILED (1 << 4) __u8 flags; __u16 vid; struct { -- cgit v1.2.3 From 9fbe1e3e61c21508861a72324087aeeea85f796f Mon Sep 17 00:00:00 2001 From: Joseph Huang Date: Fri, 11 Apr 2025 11:03:17 -0400 Subject: net: bridge: Add offload_fail_notification bopt Add BR_BOOLOPT_MDB_OFFLOAD_FAIL_NOTIFICATION bool option. 
Signed-off-by: Joseph Huang Acked-by: Nikolay Aleksandrov Link: https://patch.msgid.link/20250411150323.1117797-3-Joseph.Huang@garmin.com Signed-off-by: Jakub Kicinski --- include/uapi/linux/if_bridge.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index f2a6de424f3f..73876c0e2bba 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -831,6 +831,7 @@ enum br_boolopt_id { BR_BOOLOPT_NO_LL_LEARN, BR_BOOLOPT_MCAST_VLAN_SNOOPING, BR_BOOLOPT_MST_ENABLE, + BR_BOOLOPT_MDB_OFFLOAD_FAIL_NOTIFICATION, BR_BOOLOPT_MAX }; -- cgit v1.2.3 From cd3c93167da0e760b5819246eae7a4ea30fd014b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Wed, 9 Apr 2025 12:41:36 +0200 Subject: page_pool: Move pp_magic check into helper functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since we are about to stash some more information into the pp_magic field, let's move the magic signature checks into a pair of helper functions so it can be changed in one place. Reviewed-by: Mina Almasry Tested-by: Yonglong Liu Acked-by: Jesper Dangaard Brouer Reviewed-by: Ilias Apalodimas Signed-off-by: Toke Høiland-Jørgensen Link: https://patch.msgid.link/20250409-page-pool-track-dma-v9-1-6a9ef2e0cba8@redhat.com Signed-off-by: Jakub Kicinski --- include/linux/mm.h | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index b7f13f087954..56c47f4a38ca 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4248,4 +4248,24 @@ int arch_lock_shadow_stack_status(struct task_struct *t, unsigned long status); #define VM_SEALED_SYSMAP VM_NONE #endif +/* Mask used for checking in page_pool_page_is_pp() below. 
page->pp_magic is + * OR'ed with PP_SIGNATURE after the allocation in order to preserve bit 0 for + * the head page of compound page and bit 1 for pfmemalloc page. + * page_is_pfmemalloc() is checked in __page_pool_put_page() to avoid recycling + * the pfmemalloc page. + */ +#define PP_MAGIC_MASK ~0x3UL + +#ifdef CONFIG_PAGE_POOL +static inline bool page_pool_page_is_pp(struct page *page) +{ + return (page->pp_magic & PP_MAGIC_MASK) == PP_SIGNATURE; +} +#else +static inline bool page_pool_page_is_pp(struct page *page) +{ + return false; +} +#endif + #endif /* _LINUX_MM_H */ -- cgit v1.2.3 From ee62ce7a1d909ccba0399680a03c2dee83bcae95 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Wed, 9 Apr 2025 12:41:37 +0200 Subject: page_pool: Track DMA-mapped pages and unmap them when destroying the pool MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When enabling DMA mapping in page_pool, pages are kept DMA mapped until they are released from the pool, to avoid the overhead of re-mapping the pages every time they are used. This causes resource leaks and/or crashes when there are pages still outstanding while the device is torn down, because page_pool will attempt an unmap through a non-existent DMA device on the subsequent page return. To fix this, implement a simple tracking of outstanding DMA-mapped pages in page pool using an xarray. This was first suggested by Mina[0], and turns out to be fairly straight forward: We simply store pointers to pages directly in the xarray with xa_alloc() when they are first DMA mapped, and remove them from the array on unmap. Then, when a page pool is torn down, it can simply walk the xarray and unmap all pages still present there before returning, which also allows us to get rid of the get/put_device() calls in page_pool. Using xa_cmpxchg(), no additional synchronisation is needed, as a page will only ever be unmapped once. 
To avoid having to walk the entire xarray on unmap to find the page reference, we stash the ID assigned by xa_alloc() into the page structure itself, using the upper bits of the pp_magic field. This requires a couple of defines to avoid conflicting with the POISON_POINTER_DELTA define, but this is all evaluated at compile-time, so does not affect run-time performance. The bitmap calculations in this patch give the following number of bits for different architectures: - 23 bits on 32-bit architectures - 21 bits on PPC64 (because of the definition of ILLEGAL_POINTER_VALUE) - 32 bits on other 64-bit architectures Stashing a value into the unused bits of pp_magic does have the effect that it can make the value stored there lie outside the unmappable range (as governed by the mmap_min_addr sysctl), for architectures that don't define ILLEGAL_POINTER_VALUE. This means that if one of the pointers that is aliased to the pp_magic field (such as page->lru.next) is dereferenced while the page is owned by page_pool, that could lead to a dereference into userspace, which is a security concern. The risk of this is mitigated by the fact that (a) we always clear pp_magic before releasing a page from page_pool, and (b) this would need a use-after-free bug for struct page, which can have many other risks since page->lru.next is used as a generic list pointer in multiple places in the kernel. As such, with this patch we take the position that this risk is negligible in practice. For more discussion, see [1]. Since all the tracking added in this patch is performed on DMA map/unmap, no additional code is needed in the fast path, meaning the performance overhead of this tracking is negligible there. A micro-benchmark shows that the total overhead of the tracking itself is about 400 ns (39 cycles(tsc) 395.218 ns; sum for both map and unmap[2]). Since this cost is only paid on DMA map and unmap, it seems like an acceptable cost to fix the late unmap issue.
Further optimisation can narrow the cases where this cost is paid (for instance by eliding the tracking when DMA map/unmap is a no-op). The extra memory needed to track the pages is neatly encapsulated inside xarray, which uses the 'struct xa_node' structure to track items. This structure is 576 bytes long, with slots for 64 items, meaning that a full node incurs only 9 bytes of overhead per slot it tracks (in practice, it probably won't be this efficient, but in any case it should be an acceptable overhead). [0] https://lore.kernel.org/all/CAHS8izPg7B5DwKfSuzz-iOop_YRbk3Sd6Y4rX7KBG9DcVJcyWg@mail.gmail.com/ [1] https://lore.kernel.org/r/20250320023202.GA25514@openwall.com [2] https://lore.kernel.org/r/ae07144c-9295-4c9d-a400-153bb689fe9e@huawei.com Reported-by: Yonglong Liu Closes: https://lore.kernel.org/r/8743264a-9700-4227-a556-5f931c720211@huawei.com Fixes: ff7d6b27f894 ("page_pool: refurbish version of page_pool code") Suggested-by: Mina Almasry Reviewed-by: Mina Almasry Reviewed-by: Jesper Dangaard Brouer Tested-by: Jesper Dangaard Brouer Tested-by: Qiuling Ren Tested-by: Yuying Ma Tested-by: Yonglong Liu Acked-by: Jesper Dangaard Brouer Signed-off-by: Toke Høiland-Jørgensen Link: https://patch.msgid.link/20250409-page-pool-track-dma-v9-2-6a9ef2e0cba8@redhat.com Signed-off-by: Jakub Kicinski --- include/linux/mm.h | 46 +++++++++++++++++++++++++++++++++++++++---- include/linux/poison.h | 4 ++++ include/net/page_pool/types.h | 6 ++++++ 3 files changed, 52 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 56c47f4a38ca..130d3c9d2ee4 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4248,13 +4248,51 @@ int arch_lock_shadow_stack_status(struct task_struct *t, unsigned long status); #define VM_SEALED_SYSMAP VM_NONE #endif +/* + * DMA mapping IDs for page_pool + * + * When DMA-mapping a page, page_pool allocates an ID (from an xarray) and + * stashes it in the upper bits of page->pp_magic.
We always want to be able to + * unambiguously identify page pool pages (using page_pool_page_is_pp()). Non-PP + * pages can have arbitrary kernel pointers stored in the same field as pp_magic + * (since it overlaps with page->lru.next), so we must ensure that we cannot + * mistake a valid kernel pointer with any of the values we write into this + * field. + * + * On architectures that set POISON_POINTER_DELTA, this is already ensured, + * since this value becomes part of PP_SIGNATURE; meaning we can just use the + * space between the PP_SIGNATURE value (without POISON_POINTER_DELTA), and the + * lowest bits of POISON_POINTER_DELTA. On arches where POISON_POINTER_DELTA is + * 0, we make sure that we leave the two topmost bits empty, as that guarantees + * we won't mistake a valid kernel pointer for a value we set, regardless of the + * VMSPLIT setting. + * + * Altogether, this means that the number of bits available is constrained by + * the size of an unsigned long (at the upper end, subtracting two bits per the + * above), and the definition of PP_SIGNATURE (with or without + * POISON_POINTER_DELTA). + */ +#define PP_DMA_INDEX_SHIFT (1 + __fls(PP_SIGNATURE - POISON_POINTER_DELTA)) +#if POISON_POINTER_DELTA > 0 +/* PP_SIGNATURE includes POISON_POINTER_DELTA, so limit the size of the DMA + * index to not overlap with that if set + */ +#define PP_DMA_INDEX_BITS MIN(32, __ffs(POISON_POINTER_DELTA) - PP_DMA_INDEX_SHIFT) +#else +/* Always leave out the topmost two; see above. */ +#define PP_DMA_INDEX_BITS MIN(32, BITS_PER_LONG - PP_DMA_INDEX_SHIFT - 2) +#endif + +#define PP_DMA_INDEX_MASK GENMASK(PP_DMA_INDEX_BITS + PP_DMA_INDEX_SHIFT - 1, \ + PP_DMA_INDEX_SHIFT) + /* Mask used for checking in page_pool_page_is_pp() below. page->pp_magic is * OR'ed with PP_SIGNATURE after the allocation in order to preserve bit 0 for - * the head page of compound page and bit 1 for pfmemalloc page. 
- * page_is_pfmemalloc() is checked in __page_pool_put_page() to avoid recycling - * the pfmemalloc page. + * the head page of compound page and bit 1 for pfmemalloc page, as well as the + * bits used for the DMA index. page_is_pfmemalloc() is checked in + * __page_pool_put_page() to avoid recycling the pfmemalloc page. */ -#define PP_MAGIC_MASK ~0x3UL +#define PP_MAGIC_MASK ~(PP_DMA_INDEX_MASK | 0x3UL) #ifdef CONFIG_PAGE_POOL static inline bool page_pool_page_is_pp(struct page *page) diff --git a/include/linux/poison.h b/include/linux/poison.h index 331a9a996fa8..8ca2235f78d5 100644 --- a/include/linux/poison.h +++ b/include/linux/poison.h @@ -70,6 +70,10 @@ #define KEY_DESTROY 0xbd /********** net/core/page_pool.c **********/ +/* + * page_pool uses additional free bits within this value to store data, see the + * definition of PP_DMA_INDEX_MASK in mm.h + */ #define PP_SIGNATURE (0x40 + POISON_POINTER_DELTA) /********** net/core/skbuff.c **********/ diff --git a/include/net/page_pool/types.h b/include/net/page_pool/types.h index 36eb57d73abc..431b593de709 100644 --- a/include/net/page_pool/types.h +++ b/include/net/page_pool/types.h @@ -6,6 +6,7 @@ #include #include #include +#include #include #define PP_FLAG_DMA_MAP BIT(0) /* Should page_pool do the DMA @@ -33,6 +34,9 @@ #define PP_FLAG_ALL (PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV | \ PP_FLAG_SYSTEM_POOL | PP_FLAG_ALLOW_UNREADABLE_NETMEM) +/* Index limit to stay within PP_DMA_INDEX_BITS for DMA indices */ +#define PP_DMA_INDEX_LIMIT XA_LIMIT(1, BIT(PP_DMA_INDEX_BITS) - 1) + /* * Fast allocation side cache array/stack * @@ -221,6 +225,8 @@ struct page_pool { void *mp_priv; const struct memory_provider_ops *mp_ops; + struct xarray dma_mapped; + #ifdef CONFIG_PAGE_POOL_STATS /* recycle stats are per-cpu to avoid locking */ struct page_pool_recycle_stats __percpu *recycle_stats; -- cgit v1.2.3 From ceaceaf79ea0fe337344fc5c1fb10a421a362410 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Fri, 11 Apr 2025 
23:04:19 +0100 Subject: net: ethtool: fix get_ts_stats() documentation Commit 0e9c127729be ("ethtool: add interface to read Tx hardware timestamping statistics") added documentation for timestamping statistics, but added the detailed explanation for this method to the get_ts_info() rather than get_ts_stats(). Move it to the correct entry. Cc: Rahul Rameshbabu Signed-off-by: Russell King (Oracle) Link: https://patch.msgid.link/E1u3MTz-000Crx-IW@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- include/linux/ethtool.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 8210ece94fa6..013d25858642 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -926,10 +926,11 @@ struct kernel_ethtool_ts_info { * @get_ts_info: Get the time stamping and PTP hardware clock capabilities. * It may be called with RCU, or rtnl or reference on the device. * Drivers supporting transmit time stamps in software should set this to - * ethtool_op_get_ts_info(). Drivers must not zero statistics which they - * don't report. The stats structure is initialized to ETHTOOL_STAT_NOT_SET - * indicating driver does not report statistics. - * @get_ts_stats: Query the device hardware timestamping statistics. + * ethtool_op_get_ts_info(). + * @get_ts_stats: Query the device hardware timestamping statistics. Drivers + * must not zero statistics which they don't report. The stats structure + * is initialized to ETHTOOL_STAT_NOT_SET indicating driver does not + * report statistics. * @get_module_info: Get the size and type of the eeprom contained within * a plug-in module. * @get_module_eeprom: Get the eeprom information from the plug-in module -- cgit v1.2.3 From 7a60d91c690bf73c2c78e763efa29f294e217c3a Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 11 Apr 2025 13:52:32 -0700 Subject: net: Add ->exit_rtnl() hook to struct pernet_operations. 
struct pernet_operations provides two batching hooks; ->exit_batch() and ->exit_batch_rtnl(). The batching variant is beneficial if ->exit() meets any of the following conditions: 1) ->exit() repeatedly acquires a global lock for each netns 2) ->exit() has a time-consuming operation that can be factored out (e.g. synchronize_rcu(), smp_mb(), etc) 3) ->exit() does not need to repeat the same iterations for each netns (e.g. inet_twsk_purge()) Currently, none of the ->exit_batch_rtnl() functions satisfy any of the above conditions because RTNL is factored out and held by the caller and all of these functions iterate over the dying netns list. Also, we want to hold per-netns RTNL there but avoid spreading __rtnl_net_lock() across multiple locations. Let's add ->exit_rtnl() hook and run it under __rtnl_net_lock(). The following patches will convert all ->exit_batch_rtnl() users to ->exit_rtnl(). Signed-off-by: Kuniyuki Iwashima Reviewed-by: Sabrina Dubroca Link: https://patch.msgid.link/20250411205258.63164-4-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- include/net/net_namespace.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index bd57d8fb54f1..b071e6eed9d5 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -475,6 +475,8 @@ struct pernet_operations { void (*exit)(struct net *net); void (*exit_batch)(struct list_head *net_exit_list); /* Following method is called with RTNL held. */ + void (*exit_rtnl)(struct net *net, + struct list_head *dev_kill_list); void (*exit_batch_rtnl)(struct list_head *net_exit_list, struct list_head *dev_kill_list); unsigned int * const id; -- cgit v1.2.3 From a967e01e2ad201f6ddc778ed65a5dae1c68ee8a5 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 11 Apr 2025 13:52:35 -0700 Subject: ipv4: ip_tunnel: Convert ip_tunnel_delete_nets() callers to ->exit_rtnl(). 
ip_tunnel_delete_nets() iterates the dying netns list and performs the same operations for each. Let's export ip_tunnel_destroy() as ip_tunnel_delete_net() and call it from ->exit_rtnl(). Signed-off-by: Kuniyuki Iwashima Reviewed-by: David Ahern Reviewed-by: Sabrina Dubroca Link: https://patch.msgid.link/20250411205258.63164-7-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- include/net/ip_tunnels.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index a36a335cef9f..0c3d571a04a1 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -377,10 +377,9 @@ struct net *ip_tunnel_get_link_net(const struct net_device *dev); int ip_tunnel_get_iflink(const struct net_device *dev); int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id, struct rtnl_link_ops *ops, char *devname); - -void ip_tunnel_delete_nets(struct list_head *list_net, unsigned int id, - struct rtnl_link_ops *ops, - struct list_head *dev_to_kill); +void ip_tunnel_delete_net(struct net *net, unsigned int id, + struct rtnl_link_ops *ops, + struct list_head *dev_to_kill); void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, const struct iphdr *tnl_params, const u8 protocol); -- cgit v1.2.3 From c57a9c503543cd8829eeaaf88362199e0491c0d7 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 11 Apr 2025 13:52:43 -0700 Subject: net: Remove ->exit_batch_rtnl(). There are no ->exit_batch_rtnl() users remaining. Let's remove the hook. 
Signed-off-by: Kuniyuki Iwashima Reviewed-by: Sabrina Dubroca Link: https://patch.msgid.link/20250411205258.63164-15-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- include/net/net_namespace.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index b071e6eed9d5..025a7574b275 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -477,8 +477,6 @@ struct pernet_operations { /* Following method is called with RTNL held. */ void (*exit_rtnl)(struct net *net, struct list_head *dev_kill_list); - void (*exit_batch_rtnl)(struct list_head *net_exit_list, - struct list_head *dev_kill_list); unsigned int * const id; const size_t size; }; -- cgit v1.2.3 From 651f88cb046c5e002f7c11de2cebf207787d2346 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Sat, 12 Apr 2025 09:08:45 +0100 Subject: net: stmmac: remove eee_usecs_rate plat_dat->eee_usecs_rate is now unused, so remove this member. Reviewed-by: Andrew Lunn Signed-off-by: Russell King (Oracle) Link: https://patch.msgid.link/E1u3Vuv-000E7y-9k@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- include/linux/stmmac.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index c4ec8bb8144e..8aed09d65b4a 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -276,7 +276,6 @@ struct plat_stmmacenet_data { int mac_port_sel_speed; int has_xgmac; u8 vlan_fail_q; - unsigned long eee_usecs_rate; struct pci_dev *pdev; int int_snapshot_num; int msi_mac_vec; -- cgit v1.2.3 From 23738cc8048322cf324f330cd697380fb3455da5 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 11 Apr 2025 10:52:47 +0100 Subject: rxrpc: Pull out certain app callback funcs into an ops table A number of functions separately furnish an AF_RXRPC socket with callback function pointers into a kernel app (such as the AFS filesystem) that is using it.
Replace most of these with an ops table for the entire socket. This makes it easier to add more callback functions. Note that the call incoming data processing callback is retained as that gets set to different things, depending on the type of op. Signed-off-by: David Howells cc: Marc Dionne cc: Simon Horman cc: linux-afs@lists.infradead.org Link: https://patch.msgid.link/20250411095303.2316168-3-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- include/net/af_rxrpc.h | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/net/af_rxrpc.h b/include/net/af_rxrpc.h index cf793d18e5df..ebb6092c488b 100644 --- a/include/net/af_rxrpc.h +++ b/include/net/af_rxrpc.h @@ -29,18 +29,23 @@ enum rxrpc_interruptibility { */ extern atomic_t rxrpc_debug_id; +/* + * Operations table for rxrpc to call out to a kernel application (e.g. kAFS). + */ +struct rxrpc_kernel_ops { + void (*notify_new_call)(struct sock *sk, struct rxrpc_call *call, + unsigned long user_call_ID); + void (*discard_new_call)(struct rxrpc_call *call, unsigned long user_call_ID); + void (*user_attach_call)(struct rxrpc_call *call, unsigned long user_call_ID); +}; + typedef void (*rxrpc_notify_rx_t)(struct sock *, struct rxrpc_call *, unsigned long); typedef void (*rxrpc_notify_end_tx_t)(struct sock *, struct rxrpc_call *, unsigned long); -typedef void (*rxrpc_notify_new_call_t)(struct sock *, struct rxrpc_call *, - unsigned long); -typedef void (*rxrpc_discard_new_call_t)(struct rxrpc_call *, unsigned long); -typedef void (*rxrpc_user_attach_call_t)(struct rxrpc_call *, unsigned long); -void rxrpc_kernel_new_call_notification(struct socket *, - rxrpc_notify_new_call_t, - rxrpc_discard_new_call_t); +void rxrpc_kernel_set_notifications(struct socket *sock, + const struct rxrpc_kernel_ops *app_ops); struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock, struct rxrpc_peer *peer, struct key *key, @@ -72,9 +77,9 @@ const struct sockaddr
*rxrpc_kernel_remote_addr(const struct rxrpc_peer *peer); unsigned long rxrpc_kernel_set_peer_data(struct rxrpc_peer *peer, unsigned long app_data); unsigned long rxrpc_kernel_get_peer_data(const struct rxrpc_peer *peer); unsigned int rxrpc_kernel_get_srtt(const struct rxrpc_peer *); -int rxrpc_kernel_charge_accept(struct socket *, rxrpc_notify_rx_t, - rxrpc_user_attach_call_t, unsigned long, gfp_t, - unsigned int); +int rxrpc_kernel_charge_accept(struct socket *sock, rxrpc_notify_rx_t notify_rx, + unsigned long user_call_ID, gfp_t gfp, + unsigned int debug_id); void rxrpc_kernel_set_tx_length(struct socket *, struct rxrpc_call *, s64); bool rxrpc_kernel_check_life(const struct socket *, const struct rxrpc_call *); u32 rxrpc_kernel_get_epoch(struct socket *, struct rxrpc_call *); -- cgit v1.2.3 From 5800b1cf3fd8ccab752a101865be1e76dac33142 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 11 Apr 2025 10:52:49 +0100 Subject: rxrpc: Allow CHALLENGEs to be passed to the app for a RESPONSE Allow the app to request that CHALLENGEs be passed to it through an out-of-band queue that allows recvmsg() to pick it up so that the app can add data to it with sendmsg(). This will allow the application (AFS or userspace) to interact with the process if it wants to and put values into user-defined fields. This will be used by AFS when talking to a fileserver to supply that fileserver with a crypto key by which callback RPCs can be encrypted (i.e. notifications from the fileserver to the client).
Signed-off-by: David Howells cc: Marc Dionne cc: Simon Horman cc: linux-afs@lists.infradead.org Link: https://patch.msgid.link/20250411095303.2316168-5-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- include/net/af_rxrpc.h | 24 +++++++++++++++++++++++ include/trace/events/rxrpc.h | 18 ++++++++++++++++- include/uapi/linux/rxrpc.h | 46 ++++++++++++++++++++++++++++++++------------ 3 files changed, 75 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/net/af_rxrpc.h b/include/net/af_rxrpc.h index ebb6092c488b..0b209f703ffc 100644 --- a/include/net/af_rxrpc.h +++ b/include/net/af_rxrpc.h @@ -16,6 +16,7 @@ struct sock; struct socket; struct rxrpc_call; struct rxrpc_peer; +struct krb5_buffer; enum rxrpc_abort_reason; enum rxrpc_interruptibility { @@ -24,6 +25,10 @@ enum rxrpc_interruptibility { RXRPC_UNINTERRUPTIBLE, /* Call should not be interruptible at all */ }; +enum rxrpc_oob_type { + RXRPC_OOB_CHALLENGE, /* Security challenge for a connection */ +}; + /* * Debug ID counter for tracing. 
*/ @@ -37,6 +42,7 @@ struct rxrpc_kernel_ops { unsigned long user_call_ID); void (*discard_new_call)(struct rxrpc_call *call, unsigned long user_call_ID); void (*user_attach_call)(struct rxrpc_call *call, unsigned long user_call_ID); + void (*notify_oob)(struct sock *sk, struct sk_buff *oob); }; typedef void (*rxrpc_notify_rx_t)(struct sock *, struct rxrpc_call *, @@ -88,5 +94,23 @@ void rxrpc_kernel_set_max_life(struct socket *, struct rxrpc_call *, int rxrpc_sock_set_min_security_level(struct sock *sk, unsigned int val); int rxrpc_sock_set_security_keyring(struct sock *, struct key *); +int rxrpc_sock_set_manage_response(struct sock *sk, bool set); + +enum rxrpc_oob_type rxrpc_kernel_query_oob(struct sk_buff *oob, + struct rxrpc_peer **_peer, + unsigned long *_peer_appdata); +struct sk_buff *rxrpc_kernel_dequeue_oob(struct socket *sock, + enum rxrpc_oob_type *_type); +void rxrpc_kernel_free_oob(struct sk_buff *oob); +void rxrpc_kernel_query_challenge(struct sk_buff *challenge, + struct rxrpc_peer **_peer, + unsigned long *_peer_appdata, + u16 *_service_id, u8 *_security_index); +int rxrpc_kernel_reject_challenge(struct sk_buff *challenge, u32 abort_code, + int error, enum rxrpc_abort_reason why); +int rxkad_kernel_respond_to_challenge(struct sk_buff *challenge); +u32 rxgk_kernel_query_challenge(struct sk_buff *challenge); +int rxgk_kernel_respond_to_challenge(struct sk_buff *challenge, + struct krb5_buffer *appdata); #endif /* _NET_RXRPC_H */ diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index cad50d91077e..08ecebd90595 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -25,6 +25,7 @@ EM(afs_abort_probeuuid_negative, "afs-probeuuid-neg") \ EM(afs_abort_send_data_error, "afs-send-data") \ EM(afs_abort_unmarshal_error, "afs-unmarshal") \ + EM(afs_abort_unsupported_sec_class, "afs-unsup-sec-class") \ /* rxperf errors */ \ EM(rxperf_abort_general_error, "rxperf-error") \ EM(rxperf_abort_oom, "rxperf-oom") \ @@ 
-77,6 +78,7 @@ EM(rxrpc_abort_call_timeout, "call-timeout") \ EM(rxrpc_abort_no_service_key, "no-serv-key") \ EM(rxrpc_abort_nomem, "nomem") \ + EM(rxrpc_abort_response_sendmsg, "resp-sendmsg") \ EM(rxrpc_abort_service_not_offered, "serv-not-offered") \ EM(rxrpc_abort_shut_down, "shut-down") \ EM(rxrpc_abort_unsupported_security, "unsup-sec") \ @@ -133,24 +135,33 @@ EM(rxrpc_skb_get_conn_secured, "GET conn-secd") \ EM(rxrpc_skb_get_conn_work, "GET conn-work") \ EM(rxrpc_skb_get_local_work, "GET locl-work") \ + EM(rxrpc_skb_get_post_oob, "GET post-oob ") \ EM(rxrpc_skb_get_reject_work, "GET rej-work ") \ EM(rxrpc_skb_get_to_recvmsg, "GET to-recv ") \ EM(rxrpc_skb_get_to_recvmsg_oos, "GET to-recv-o") \ EM(rxrpc_skb_new_encap_rcv, "NEW encap-rcv") \ EM(rxrpc_skb_new_error_report, "NEW error-rpt") \ EM(rxrpc_skb_new_jumbo_subpacket, "NEW jumbo-sub") \ + EM(rxrpc_skb_new_response_rxgk, "NEW resp-rxgk") \ + EM(rxrpc_skb_new_response_rxkad, "NEW resp-rxkd") \ EM(rxrpc_skb_new_unshared, "NEW unshared ") \ EM(rxrpc_skb_put_call_rx, "PUT call-rx ") \ + EM(rxrpc_skb_put_challenge, "PUT challenge") \ EM(rxrpc_skb_put_conn_secured, "PUT conn-secd") \ EM(rxrpc_skb_put_conn_work, "PUT conn-work") \ EM(rxrpc_skb_put_error_report, "PUT error-rep") \ EM(rxrpc_skb_put_input, "PUT input ") \ EM(rxrpc_skb_put_jumbo_subpacket, "PUT jumbo-sub") \ + EM(rxrpc_skb_put_oob, "PUT oob ") \ EM(rxrpc_skb_put_purge, "PUT purge ") \ + EM(rxrpc_skb_put_purge_oob, "PUT purge-oob") \ + EM(rxrpc_skb_put_response, "PUT response ") \ EM(rxrpc_skb_put_rotate, "PUT rotate ") \ EM(rxrpc_skb_put_unknown, "PUT unknown ") \ EM(rxrpc_skb_see_conn_work, "SEE conn-work") \ + EM(rxrpc_skb_see_oob_challenge, "SEE oob-chall") \ EM(rxrpc_skb_see_recvmsg, "SEE recvmsg ") \ + EM(rxrpc_skb_see_recvmsg_oob, "SEE recvm-oob") \ EM(rxrpc_skb_see_reject, "SEE reject ") \ EM(rxrpc_skb_see_rotate, "SEE rotate ") \ E_(rxrpc_skb_see_version, "SEE version ") @@ -216,9 +227,11 @@ EM(rxrpc_conn_free, "FREE ") \ 
EM(rxrpc_conn_get_activate_call, "GET act-call") \ EM(rxrpc_conn_get_call_input, "GET inp-call") \ + EM(rxrpc_conn_get_challenge_input, "GET inp-chal") \ EM(rxrpc_conn_get_conn_input, "GET inp-conn") \ EM(rxrpc_conn_get_idle, "GET idle ") \ EM(rxrpc_conn_get_poke_abort, "GET pk-abort") \ + EM(rxrpc_conn_get_poke_response, "GET response") \ EM(rxrpc_conn_get_poke_secured, "GET secured ") \ EM(rxrpc_conn_get_poke_timer, "GET poke ") \ EM(rxrpc_conn_get_service_conn, "GET svc-conn") \ @@ -226,10 +239,12 @@ EM(rxrpc_conn_new_service, "NEW service ") \ EM(rxrpc_conn_put_call, "PUT call ") \ EM(rxrpc_conn_put_call_input, "PUT inp-call") \ + EM(rxrpc_conn_put_challenge_input, "PUT inp-chal") \ EM(rxrpc_conn_put_conn_input, "PUT inp-conn") \ EM(rxrpc_conn_put_discard_idle, "PUT disc-idl") \ EM(rxrpc_conn_put_local_dead, "PUT loc-dead") \ EM(rxrpc_conn_put_noreuse, "PUT noreuse ") \ + EM(rxrpc_conn_put_oob, "PUT oob ") \ EM(rxrpc_conn_put_poke, "PUT poke ") \ EM(rxrpc_conn_put_service_reaped, "PUT svc-reap") \ EM(rxrpc_conn_put_unbundle, "PUT unbundle") \ @@ -331,6 +346,7 @@ EM(rxrpc_recvmsg_full, "FULL") \ EM(rxrpc_recvmsg_hole, "HOLE") \ EM(rxrpc_recvmsg_next, "NEXT") \ + EM(rxrpc_recvmsg_oobq, "OOBQ") \ EM(rxrpc_recvmsg_requeue, "REQU") \ EM(rxrpc_recvmsg_return, "RETN") \ EM(rxrpc_recvmsg_terminal, "TERM") \ @@ -456,7 +472,7 @@ EM(rxrpc_tx_point_conn_abort, "ConnAbort") \ EM(rxrpc_tx_point_reject, "Reject") \ EM(rxrpc_tx_point_rxkad_challenge, "RxkadChall") \ - EM(rxrpc_tx_point_rxkad_response, "RxkadResp") \ + EM(rxrpc_tx_point_response, "Response") \ EM(rxrpc_tx_point_version_keepalive, "VerKeepalive") \ E_(rxrpc_tx_point_version_reply, "VerReply") diff --git a/include/uapi/linux/rxrpc.h b/include/uapi/linux/rxrpc.h index 8f8dc7a937a4..c4e9833b0a12 100644 --- a/include/uapi/linux/rxrpc.h +++ b/include/uapi/linux/rxrpc.h @@ -36,26 +36,33 @@ struct sockaddr_rxrpc { #define RXRPC_MIN_SECURITY_LEVEL 4 /* minimum security level */ #define RXRPC_UPGRADEABLE_SERVICE 5 /* 
Upgrade service[0] -> service[1] */ #define RXRPC_SUPPORTED_CMSG 6 /* Get highest supported control message type */ +#define RXRPC_MANAGE_RESPONSE 7 /* [clnt] Want to manage RESPONSE packets */ /* * RxRPC control messages * - If neither abort or accept are specified, the message is a data message. * - terminal messages mean that a user call ID tag can be recycled + * - C/S/- indicate whether these are applicable to client, server or both * - s/r/- indicate whether these are applicable to sendmsg() and/or recvmsg() */ enum rxrpc_cmsg_type { - RXRPC_USER_CALL_ID = 1, /* sr: user call ID specifier */ - RXRPC_ABORT = 2, /* sr: abort request / notification [terminal] */ - RXRPC_ACK = 3, /* -r: [Service] RPC op final ACK received [terminal] */ - RXRPC_NET_ERROR = 5, /* -r: network error received [terminal] */ - RXRPC_BUSY = 6, /* -r: server busy received [terminal] */ - RXRPC_LOCAL_ERROR = 7, /* -r: local error generated [terminal] */ - RXRPC_NEW_CALL = 8, /* -r: [Service] new incoming call notification */ - RXRPC_EXCLUSIVE_CALL = 10, /* s-: Call should be on exclusive connection */ - RXRPC_UPGRADE_SERVICE = 11, /* s-: Request service upgrade for client call */ - RXRPC_TX_LENGTH = 12, /* s-: Total length of Tx data */ - RXRPC_SET_CALL_TIMEOUT = 13, /* s-: Set one or more call timeouts */ - RXRPC_CHARGE_ACCEPT = 14, /* s-: Charge the accept pool with a user call ID */ + RXRPC_USER_CALL_ID = 1, /* -sr: User call ID specifier */ + RXRPC_ABORT = 2, /* -sr: Abort request / notification [terminal] */ + RXRPC_ACK = 3, /* S-r: RPC op final ACK received [terminal] */ + RXRPC_NET_ERROR = 5, /* --r: Network error received [terminal] */ + RXRPC_BUSY = 6, /* C-r: Server busy received [terminal] */ + RXRPC_LOCAL_ERROR = 7, /* --r: Local error generated [terminal] */ + RXRPC_NEW_CALL = 8, /* S-r: New incoming call notification */ + RXRPC_EXCLUSIVE_CALL = 10, /* Cs-: Call should be on exclusive connection */ + RXRPC_UPGRADE_SERVICE = 11, /* Cs-: Request service upgrade for client call 
*/ + RXRPC_TX_LENGTH = 12, /* -s-: Total length of Tx data */ + RXRPC_SET_CALL_TIMEOUT = 13, /* -s-: Set one or more call timeouts */ + RXRPC_CHARGE_ACCEPT = 14, /* Ss-: Charge the accept pool with a user call ID */ + RXRPC_OOB_ID = 15, /* -sr: OOB message ID */ + RXRPC_CHALLENGED = 16, /* C-r: Info on a received CHALLENGE */ + RXRPC_RESPOND = 17, /* Cs-: Respond to a challenge */ + RXRPC_RESPONDED = 18, /* S-r: Data received in RESPONSE */ + RXRPC_RESP_RXGK_APPDATA = 19, /* Cs-: RESPONSE: RxGK app data to include */ RXRPC__SUPPORTED }; @@ -118,4 +125,19 @@ enum rxrpc_cmsg_type { #define RXKADDATALEN 19270411 /* user data too long */ #define RXKADILLEGALLEVEL 19270412 /* caller not authorised to use encrypted conns */ +/* + * Challenge information in the RXRPC_CHALLENGED control message. + */ +struct rxrpc_challenge { + __u16 service_id; /* The service ID of the connection (may be upgraded) */ + __u8 security_index; /* The security index of the connection */ + __u8 pad; /* Round out to a multiple of 4 bytes. */ + /* ... The security class gets to append extra information ... */ +}; + +struct rxgk_challenge { + struct rxrpc_challenge base; + __u32 enctype; /* Krb5 encoding type */ +}; + #endif /* _UAPI_LINUX_RXRPC_H */ -- cgit v1.2.3 From 01af64269751f261421a9e80a527c8e987aeda8d Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 11 Apr 2025 10:52:50 +0100 Subject: rxrpc: Add the security index for yfs-rxgk Add the security index and abort codes for the YFS variant of rxgk. 
Signed-off-by: David Howells Link: https://patch.msgid.link/20250411095303.2316168-6-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- include/crypto/krb5.h | 5 +++++ include/uapi/linux/rxrpc.h | 31 +++++++++++++++++++++++++++++++ 2 files changed, 36 insertions(+) (limited to 'include') diff --git a/include/crypto/krb5.h b/include/crypto/krb5.h index 62d998e62f47..71dd38f59be1 100644 --- a/include/crypto/krb5.h +++ b/include/crypto/krb5.h @@ -63,6 +63,11 @@ struct scatterlist; #define KEY_USAGE_SEED_ENCRYPTION (0xAA) #define KEY_USAGE_SEED_INTEGRITY (0x55) +/* + * Standard Kerberos error codes. + */ +#define KRB5_PROG_KEYTYPE_NOSUPP -1765328233 + /* * Mode of operation. */ diff --git a/include/uapi/linux/rxrpc.h b/include/uapi/linux/rxrpc.h index c4e9833b0a12..d9735abd4c79 100644 --- a/include/uapi/linux/rxrpc.h +++ b/include/uapi/linux/rxrpc.h @@ -80,6 +80,7 @@ enum rxrpc_cmsg_type { #define RXRPC_SECURITY_RXKAD 2 /* kaserver or kerberos 4 */ #define RXRPC_SECURITY_RXGK 4 /* gssapi-based */ #define RXRPC_SECURITY_RXK5 5 /* kerberos 5 */ +#define RXRPC_SECURITY_YFS_RXGK 6 /* YFS gssapi-based */ /* * RxRPC-level abort codes @@ -125,6 +126,36 @@ enum rxrpc_cmsg_type { #define RXKADDATALEN 19270411 /* user data too long */ #define RXKADILLEGALLEVEL 19270412 /* caller not authorised to use encrypted conns */ +/* + * RxGK GSSAPI security abort codes. 
+ */ +#if 0 /* Original standard abort codes (used by OpenAFS) */ +#define RXGK_INCONSISTENCY 1233242880 /* Security module structure inconsistent */ +#define RXGK_PACKETSHORT 1233242881 /* Packet too short for security challenge */ +#define RXGK_BADCHALLENGE 1233242882 /* Invalid security challenge */ +#define RXGK_BADETYPE 1233242883 /* Invalid or impermissible encryption type */ +#define RXGK_BADLEVEL 1233242884 /* Invalid or impermissible security level */ +#define RXGK_BADKEYNO 1233242885 /* Key version number not found */ +#define RXGK_EXPIRED 1233242886 /* Token has expired */ +#define RXGK_NOTAUTH 1233242887 /* Caller not authorized */ +#define RXGK_BAD_TOKEN 1233242888 /* Security object was passed a bad token */ +#define RXGK_SEALED_INCON 1233242889 /* Sealed data inconsistent */ +#define RXGK_DATA_LEN 1233242890 /* User data too long */ +#define RXGK_BAD_QOP 1233242891 /* Inadequate quality of protection available */ +#else /* Revised standard abort codes (used by YFS) */ +#define RXGK_INCONSISTENCY 1233242880 /* Security module structure inconsistent */ +#define RXGK_PACKETSHORT 1233242881 /* Packet too short for security challenge */ +#define RXGK_BADCHALLENGE 1233242882 /* Security challenge/response failed */ +#define RXGK_SEALEDINCON 1233242883 /* Sealed data is inconsistent */ +#define RXGK_NOTAUTH 1233242884 /* Caller not authorised */ +#define RXGK_EXPIRED 1233242885 /* Authentication expired */ +#define RXGK_BADLEVEL 1233242886 /* Unsupported or not permitted security level */ +#define RXGK_BADKEYNO 1233242887 /* Bad transport key number */ +#define RXGK_NOTRXGK 1233242888 /* Security layer is not rxgk */ +#define RXGK_UNSUPPORTED 1233242889 /* Endpoint does not support rxgk */ +#define RXGK_GSSERROR 1233242890 /* GSSAPI mechanism error */ +#endif + /* * Challenge information in the RXRPC_CHALLENGED control message. 
*/ -- cgit v1.2.3 From 0ca100ff4df64f5d0f6c1dd5080c3e096786bea6 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 11 Apr 2025 10:52:51 +0100 Subject: rxrpc: Add YFS RxGK (GSSAPI) security class Add support for the YFS-variant RxGK security class to support GSSAPI-derived authentication. This also allows the use of better crypto over the rxkad security class. The key payload is XDR encoded of the form: typedef int64_t opr_time; const AFSTOKEN_RK_TIX_MAX = 12000; /* Matches entry in rxkad.h */ struct token_rxkad { afs_int32 viceid; afs_int32 kvno; afs_int64 key; afs_int32 begintime; afs_int32 endtime; afs_int32 primary_flag; opaque ticket; }; struct token_rxgk { opr_time begintime; opr_time endtime; afs_int64 level; afs_int64 lifetime; afs_int64 bytelife; afs_int64 enctype; opaque key<>; opaque ticket<>; }; const AFSTOKEN_UNION_NOAUTH = 0; const AFSTOKEN_UNION_KAD = 2; const AFSTOKEN_UNION_YFSGK = 6; union ktc_tokenUnion switch (afs_int32 type) { case AFSTOKEN_UNION_KAD: token_rxkad kad; case AFSTOKEN_UNION_YFSGK: token_rxgk gk; }; const AFSTOKEN_LENGTH_MAX = 16384; typedef opaque token_opaque; const AFSTOKEN_MAX = 8; const AFSTOKEN_CELL_MAX = 64; struct ktc_setTokenData { afs_int32 flags; string cell; token_opaque tokens; }; The parser for the basic token struct is already present, as is the rxkad token type. This adds a parser for the rxgk token type. 
Signed-off-by: David Howells cc: Marc Dionne cc: Herbert Xu cc: Chuck Lever cc: Simon Horman cc: linux-afs@lists.infradead.org Link: https://patch.msgid.link/20250411095303.2316168-7-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- include/keys/rxrpc-type.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include') diff --git a/include/keys/rxrpc-type.h b/include/keys/rxrpc-type.h index 333c0f49a9cd..0ddbe197a261 100644 --- a/include/keys/rxrpc-type.h +++ b/include/keys/rxrpc-type.h @@ -9,6 +9,7 @@ #define _KEYS_RXRPC_TYPE_H #include +#include /* * key type for AF_RXRPC keys @@ -31,6 +32,21 @@ struct rxkad_key { u8 ticket[]; /* the encrypted ticket */ }; +/* + * RxRPC key for YFS-RxGK (type-6 security) + */ +struct rxgk_key { + s64 begintime; /* Time at which the ticket starts */ + s64 endtime; /* Time at which the ticket ends */ + u64 lifetime; /* Maximum lifespan of a connection (seconds) */ + u64 bytelife; /* Maximum number of bytes on a connection */ + unsigned int enctype; /* Encoding type */ + s8 level; /* Negotiated security RXRPC_SECURITY_PLAIN/AUTH/ENCRYPT */ + struct krb5_buffer key; /* Master key, K0 */ + struct krb5_buffer ticket; /* Ticket to be passed to server */ + u8 _key[]; /* Key storage */ +}; + /* * list of tokens attached to an rxrpc key */ @@ -40,6 +56,7 @@ struct rxrpc_key_token { struct rxrpc_key_token *next; /* the next token in the list */ union { struct rxkad_key *kad; + struct rxgk_key *rxgk; }; }; -- cgit v1.2.3 From 9d1d2b59341f58126a69b51f9f5f8ccb9f12e54a Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 11 Apr 2025 10:52:53 +0100 Subject: rxrpc: rxgk: Implement the yfs-rxgk security class (GSSAPI) Implement the basic parts of the yfs-rxgk security class (security index 6) to support GSSAPI-negotiated security. 
Signed-off-by: David Howells cc: Marc Dionne cc: Herbert Xu cc: Chuck Lever cc: Simon Horman cc: linux-afs@lists.infradead.org Link: https://patch.msgid.link/20250411095303.2316168-9-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- include/trace/events/rxrpc.h | 45 ++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 43 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 08ecebd90595..aab81e8196ae 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -69,6 +69,38 @@ EM(rxkad_abort_resp_tkt_sname, "rxkad-resp-tk-sname") \ EM(rxkad_abort_resp_unknown_tkt, "rxkad-resp-unknown-tkt") \ EM(rxkad_abort_resp_version, "rxkad-resp-version") \ + /* RxGK security errors */ \ + EM(rxgk_abort_1_verify_mic_eproto, "rxgk1-vfy-mic-eproto") \ + EM(rxgk_abort_2_decrypt_eproto, "rxgk2-dec-eproto") \ + EM(rxgk_abort_2_short_data, "rxgk2-short-data") \ + EM(rxgk_abort_2_short_encdata, "rxgk2-short-encdata") \ + EM(rxgk_abort_2_short_header, "rxgk2-short-hdr") \ + EM(rxgk_abort_bad_key_number, "rxgk-bad-key-num") \ + EM(rxgk_abort_chall_key_expired, "rxgk-chall-key-exp") \ + EM(rxgk_abort_chall_no_key, "rxgk-chall-nokey") \ + EM(rxgk_abort_chall_short, "rxgk-chall-short") \ + EM(rxgk_abort_resp_auth_dec, "rxgk-resp-auth-dec") \ + EM(rxgk_abort_resp_bad_callid, "rxgk-resp-bad-callid") \ + EM(rxgk_abort_resp_bad_nonce, "rxgk-resp-bad-nonce") \ + EM(rxgk_abort_resp_bad_param, "rxgk-resp-bad-param") \ + EM(rxgk_abort_resp_call_ctr, "rxgk-resp-call-ctr") \ + EM(rxgk_abort_resp_call_state, "rxgk-resp-call-state") \ + EM(rxgk_abort_resp_internal_error, "rxgk-resp-int-error") \ + EM(rxgk_abort_resp_nopkg, "rxgk-resp-nopkg") \ + EM(rxgk_abort_resp_short_applen, "rxgk-resp-short-applen") \ + EM(rxgk_abort_resp_short_auth, "rxgk-resp-short-auth") \ + EM(rxgk_abort_resp_short_call_list, "rxgk-resp-short-callls") \ + EM(rxgk_abort_resp_short_packet, "rxgk-resp-short-packet") \ + 
EM(rxgk_abort_resp_short_yfs_klen, "rxgk-resp-short-yfs-klen") \ + EM(rxgk_abort_resp_short_yfs_key, "rxgk-resp-short-yfs-key") \ + EM(rxgk_abort_resp_short_yfs_tkt, "rxgk-resp-short-yfs-tkt") \ + EM(rxgk_abort_resp_tok_dec, "rxgk-resp-tok-dec") \ + EM(rxgk_abort_resp_tok_internal_error, "rxgk-resp-tok-int-err") \ + EM(rxgk_abort_resp_tok_keyerr, "rxgk-resp-tok-keyerr") \ + EM(rxgk_abort_resp_tok_nokey, "rxgk-resp-tok-nokey") \ + EM(rxgk_abort_resp_tok_nopkg, "rxgk-resp-tok-nopkg") \ + EM(rxgk_abort_resp_tok_short, "rxgk-resp-tok-short") \ + EM(rxgk_abort_resp_xdr_align, "rxgk-resp-xdr-align") \ /* rxrpc errors */ \ EM(rxrpc_abort_call_improper_term, "call-improper-term") \ EM(rxrpc_abort_call_reset, "call-reset") \ @@ -471,6 +503,7 @@ EM(rxrpc_tx_point_call_final_resend, "CallFinalResend") \ EM(rxrpc_tx_point_conn_abort, "ConnAbort") \ EM(rxrpc_tx_point_reject, "Reject") \ + EM(rxrpc_tx_point_rxgk_challenge, "RxGKChall") \ EM(rxrpc_tx_point_rxkad_challenge, "RxkadChall") \ EM(rxrpc_tx_point_response, "Response") \ EM(rxrpc_tx_point_version_keepalive, "VerKeepalive") \ @@ -489,6 +522,7 @@ #define rxrpc_txbuf_traces \ EM(rxrpc_txbuf_alloc_data, "ALLOC DATA ") \ + EM(rxrpc_txbuf_alloc_response, "ALLOC RESP ") \ EM(rxrpc_txbuf_free, "FREE ") \ EM(rxrpc_txbuf_get_buffer, "GET BUFFER ") \ EM(rxrpc_txbuf_get_trans, "GET TRANS ") \ @@ -496,6 +530,7 @@ EM(rxrpc_txbuf_put_cleaned, "PUT CLEANED") \ EM(rxrpc_txbuf_put_nomem, "PUT NOMEM ") \ EM(rxrpc_txbuf_put_rotated, "PUT ROTATED") \ + EM(rxrpc_txbuf_put_response_tx, "PUT RESP TX") \ EM(rxrpc_txbuf_put_send_aborted, "PUT SEND-X ") \ EM(rxrpc_txbuf_put_trans, "PUT TRANS ") \ EM(rxrpc_txbuf_see_lost, "SEE LOST ") \ @@ -1178,6 +1213,7 @@ TRACE_EVENT(rxrpc_rx_challenge, __field(u32, version) __field(u32, nonce) __field(u32, min_level) + __field(u8, security_ix) ), TP_fast_assign( @@ -1186,11 +1222,13 @@ TRACE_EVENT(rxrpc_rx_challenge, __entry->version = version; __entry->nonce = nonce; __entry->min_level = min_level; + 
__entry->security_ix = conn->security_ix; ), - TP_printk("C=%08x CHALLENGE %08x v=%x n=%x ml=%x", + TP_printk("C=%08x CHALLENGE r=%08x sx=%u v=%x n=%x ml=%x", __entry->conn, __entry->serial, + __entry->security_ix, __entry->version, __entry->nonce, __entry->min_level) @@ -1208,6 +1246,7 @@ TRACE_EVENT(rxrpc_rx_response, __field(u32, version) __field(u32, kvno) __field(u32, ticket_len) + __field(u8, security_ix) ), TP_fast_assign( @@ -1216,11 +1255,13 @@ TRACE_EVENT(rxrpc_rx_response, __entry->version = version; __entry->kvno = kvno; __entry->ticket_len = ticket_len; + __entry->security_ix = conn->security_ix; ), - TP_printk("C=%08x RESPONSE %08x v=%x kvno=%x tl=%x", + TP_printk("C=%08x RESPONSE r=%08x sx=%u v=%x kvno=%x tl=%x", __entry->conn, __entry->serial, + __entry->security_ix, __entry->version, __entry->kvno, __entry->ticket_len) -- cgit v1.2.3 From 7a7513a3081c6a2729d8570c77bbed1978277dc9 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 11 Apr 2025 10:52:54 +0100 Subject: rxrpc: rxgk: Implement connection rekeying Implement rekeying of connections with the RxGK security class. This involves regenerating the keys with a different key number as part of the input data after a certain amount of time or a certain amount of bytes encrypted. Rekeying may be triggered by either end. The LSW of the key number is inserted into the security-specific field in the RX header, and we try and expand it to 32-bits to make it last longer. 
Signed-off-by: David Howells cc: Marc Dionne cc: Herbert Xu cc: Chuck Lever cc: Simon Horman cc: linux-afs@lists.infradead.org Link: https://patch.msgid.link/20250411095303.2316168-10-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- include/trace/events/rxrpc.h | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'include') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index aab81e8196ae..920439df1f6f 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -2725,6 +2725,30 @@ TRACE_EVENT(rxrpc_rack_timer, ktime_to_us(__entry->delay)) ); +TRACE_EVENT(rxrpc_rxgk_rekey, + TP_PROTO(struct rxrpc_connection *conn, + unsigned int current_key, unsigned int requested_key), + + TP_ARGS(conn, current_key, requested_key), + + TP_STRUCT__entry( + __field(unsigned int, conn) + __field(unsigned int, current_key) + __field(unsigned int, requested_key) + ), + + TP_fast_assign( + __entry->conn = conn->debug_id; + __entry->current_key = current_key; + __entry->requested_key = requested_key; + ), + + TP_printk("C=%08x cur=%x req=%x", + __entry->conn, + __entry->current_key, + __entry->requested_key) + ); + #undef EM #undef E_ -- cgit v1.2.3 From d03539d5c2dec9b028297c15e57bd3c01d0d9c0d Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 11 Apr 2025 10:52:56 +0100 Subject: rxrpc: Display security params in the afs_cb_call tracepoint Make the afs_cb_call tracepoint display some security parameters to make debugging easier. 
Signed-off-by: David Howells cc: Marc Dionne cc: Simon Horman cc: linux-afs@lists.infradead.org Link: https://patch.msgid.link/20250411095303.2316168-12-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- include/net/af_rxrpc.h | 2 ++ include/trace/events/afs.h | 11 +++++++++-- 2 files changed, 11 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/af_rxrpc.h b/include/net/af_rxrpc.h index 0b209f703ffc..f15341594cc8 100644 --- a/include/net/af_rxrpc.h +++ b/include/net/af_rxrpc.h @@ -112,5 +112,7 @@ int rxkad_kernel_respond_to_challenge(struct sk_buff *challenge); u32 rxgk_kernel_query_challenge(struct sk_buff *challenge); int rxgk_kernel_respond_to_challenge(struct sk_buff *challenge, struct krb5_buffer *appdata); +u8 rxrpc_kernel_query_call_security(struct rxrpc_call *call, + u16 *_service_id, u32 *_enctype); #endif /* _NET_RXRPC_H */ diff --git a/include/trace/events/afs.h b/include/trace/events/afs.h index 8857f5ea77d4..7f83d242c8e9 100644 --- a/include/trace/events/afs.h +++ b/include/trace/events/afs.h @@ -663,19 +663,26 @@ TRACE_EVENT(afs_cb_call, __field(unsigned int, call) __field(u32, op) __field(u16, service_id) + __field(u8, security_ix) + __field(u32, enctype) ), TP_fast_assign( __entry->call = call->debug_id; __entry->op = call->operation_ID; __entry->service_id = call->service_id; + __entry->security_ix = call->security_ix; + __entry->enctype = call->enctype; ), - TP_printk("c=%08x %s", + TP_printk("c=%08x %s sv=%u sx=%u en=%u", __entry->call, __entry->service_id == 2501 ? 
__print_symbolic(__entry->op, yfs_cm_operations) : - __print_symbolic(__entry->op, afs_cm_operations)) + __print_symbolic(__entry->op, afs_cm_operations), + __entry->service_id, + __entry->security_ix, + __entry->enctype) ); TRACE_EVENT(afs_call, -- cgit v1.2.3 From fba6995798c6085a0c2fc67e0cacd489a6971044 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 11 Apr 2025 10:52:58 +0100 Subject: rxrpc: Add more CHALLENGE/RESPONSE packet tracing Add more tracing for CHALLENGE and RESPONSE packets. Currently, rxrpc only has client-relevant tracepoints (rx_challenge and tx_response), but add the server-side ones too. Further, record the service ID in the rx_challenge tracepoint as well. Signed-off-by: David Howells cc: Marc Dionne cc: Simon Horman cc: linux-afs@lists.infradead.org Link: https://patch.msgid.link/20250411095303.2316168-14-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- include/trace/events/rxrpc.h | 78 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 77 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 920439df1f6f..378d2dfc7392 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -1201,6 +1201,39 @@ TRACE_EVENT(rxrpc_rx_conn_abort, __entry->abort_code) ); +TRACE_EVENT(rxrpc_tx_challenge, + TP_PROTO(struct rxrpc_connection *conn, rxrpc_serial_t serial, + u32 version, u32 nonce), + + TP_ARGS(conn, serial, version, nonce), + + TP_STRUCT__entry( + __field(unsigned int, conn) + __field(rxrpc_serial_t, serial) + __field(u32, version) + __field(u32, nonce) + __field(u16, service_id) + __field(u8, security_ix) + ), + + TP_fast_assign( + __entry->conn = conn->debug_id; + __entry->serial = serial; + __entry->version = version; + __entry->nonce = nonce; + __entry->service_id = conn->service_id; + __entry->security_ix = conn->security_ix; + ), + + TP_printk("C=%08x CHALLENGE r=%08x sv=%u+%u v=%x n=%x", + __entry->conn, + __entry->serial, + 
__entry->service_id, + __entry->security_ix, + __entry->version, + __entry->nonce) + ); + TRACE_EVENT(rxrpc_rx_challenge, TP_PROTO(struct rxrpc_connection *conn, rxrpc_serial_t serial, u32 version, u32 nonce, u32 min_level), @@ -1213,6 +1246,7 @@ TRACE_EVENT(rxrpc_rx_challenge, __field(u32, version) __field(u32, nonce) __field(u32, min_level) + __field(u16, service_id) __field(u8, security_ix) ), @@ -1222,18 +1256,60 @@ TRACE_EVENT(rxrpc_rx_challenge, __entry->version = version; __entry->nonce = nonce; __entry->min_level = min_level; + __entry->service_id = conn->service_id; __entry->security_ix = conn->security_ix; ), - TP_printk("C=%08x CHALLENGE r=%08x sx=%u v=%x n=%x ml=%x", + TP_printk("C=%08x CHALLENGE r=%08x sv=%u+%u v=%x n=%x ml=%x", __entry->conn, __entry->serial, + __entry->service_id, __entry->security_ix, __entry->version, __entry->nonce, __entry->min_level) ); +TRACE_EVENT(rxrpc_tx_response, + TP_PROTO(struct rxrpc_connection *conn, rxrpc_serial_t serial, + struct rxrpc_skb_priv *rsp), + + TP_ARGS(conn, serial, rsp), + + TP_STRUCT__entry( + __field(unsigned int, conn) + __field(rxrpc_serial_t, serial) + __field(rxrpc_serial_t, challenge) + __field(u32, version) + __field(u32, kvno) + __field(u16, ticket_len) + __field(u16, appdata_len) + __field(u16, service_id) + __field(u8, security_ix) + ), + + TP_fast_assign( + __entry->conn = conn->debug_id; + __entry->serial = serial; + __entry->challenge = rsp->resp.challenge_serial; + __entry->version = rsp->resp.version; + __entry->kvno = rsp->resp.kvno; + __entry->ticket_len = rsp->resp.ticket_len; + __entry->service_id = conn->service_id; + __entry->security_ix = conn->security_ix; + ), + + TP_printk("C=%08x RESPONSE r=%08x cr=%08x sv=%u+%u v=%x kv=%x tl=%u", + __entry->conn, + __entry->serial, + __entry->challenge, + __entry->service_id, + __entry->security_ix, + __entry->version, + __entry->kvno, + __entry->ticket_len) + ); + TRACE_EVENT(rxrpc_rx_response, TP_PROTO(struct rxrpc_connection *conn, 
rxrpc_serial_t serial, u32 version, u32 kvno, u32 ticket_len), -- cgit v1.2.3 From 6e83166dd8003e8611f253426b85e0c3d933e1c2 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Sun, 13 Apr 2025 11:34:32 +0200 Subject: mptcp: sched: remove mptcp_sched_data This is a follow-up of commit b68b106b0f15 ("mptcp: sched: reduce size for unused data"), now removing the mptcp_sched_data structure. Now is a good time to do that, because the previously mentioned WIP work has been updated, no longer depending on this structure. Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250413-net-next-mptcp-sched-mib-sft-misc-v2-1-0f83a4350150@kernel.org Signed-off-by: Jakub Kicinski --- include/net/mptcp.h | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/net/mptcp.h b/include/net/mptcp.h index bfbad695951c..f7263fe2a2e4 100644 --- a/include/net/mptcp.h +++ b/include/net/mptcp.h @@ -101,18 +101,9 @@ struct mptcp_out_options { #define MPTCP_SCHED_MAX 128 #define MPTCP_SCHED_BUF_MAX (MPTCP_SCHED_NAME_MAX * MPTCP_SCHED_MAX) -#define MPTCP_SUBFLOWS_MAX 8 - -struct mptcp_sched_data { - u8 subflows; - struct mptcp_subflow_context *contexts[MPTCP_SUBFLOWS_MAX]; -}; - struct mptcp_sched_ops { - int (*get_send)(struct mptcp_sock *msk, - struct mptcp_sched_data *data); - int (*get_retrans)(struct mptcp_sock *msk, - struct mptcp_sched_data *data); + int (*get_send)(struct mptcp_sock *msk); + int (*get_retrans)(struct mptcp_sock *msk); char name[MPTCP_SCHED_NAME_MAX]; struct module *owner; -- cgit v1.2.3 From f99564688f38458d86b64f099ebf03f19517cf77 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Sun, 13 Apr 2025 16:09:40 +0200 Subject: net: phy: remove device_phy_find_device AFAICS this function has never had a user. 
Signed-off-by: Heiner Kallweit Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/ab7b8094-2eea-4e82-a047-fd60117f220b@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/phy.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include') diff --git a/include/linux/phy.h b/include/linux/phy.h index a2bfae80c449..fb755358d965 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -1757,7 +1757,6 @@ struct phy_device *phy_device_create(struct mii_bus *bus, int addr, u32 phy_id, int fwnode_get_phy_id(struct fwnode_handle *fwnode, u32 *phy_id); struct mdio_device *fwnode_mdio_find_device(struct fwnode_handle *fwnode); struct phy_device *fwnode_phy_find_device(struct fwnode_handle *phy_fwnode); -struct phy_device *device_phy_find_device(struct device *dev); struct fwnode_handle *fwnode_get_phy_node(const struct fwnode_handle *fwnode); struct phy_device *get_phy_device(struct mii_bus *bus, int addr, bool is_c45); int phy_device_register(struct phy_device *phy); @@ -1779,11 +1778,6 @@ struct phy_device *fwnode_phy_find_device(struct fwnode_handle *phy_fwnode) return NULL; } -static inline struct phy_device *device_phy_find_device(struct device *dev) -{ - return NULL; -} - static inline struct fwnode_handle *fwnode_get_phy_node(struct fwnode_handle *fwnode) { -- cgit v1.2.3 From 95d06e92a401928fe46fda7616e460f39cb7211b Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Mon, 14 Apr 2025 06:24:07 -0700 Subject: netlink: Introduce nlmsg_payload helper Create a new helper function, nlmsg_payload(), to simplify checking and retrieving Netlink message payloads. This reduces boilerplate code for users who need to verify the message length before accessing its data. 
Suggested-by: Jakub Kicinski Signed-off-by: Breno Leitao Reviewed-by: Jakub Kicinski Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250414-nlmsg-v2-1-3d90cb42c6af@debian.org Signed-off-by: Jakub Kicinski --- include/net/netlink.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include') diff --git a/include/net/netlink.h b/include/net/netlink.h index 29e0db940382..82e07e272290 100644 --- a/include/net/netlink.h +++ b/include/net/netlink.h @@ -611,6 +611,22 @@ static inline int nlmsg_len(const struct nlmsghdr *nlh) return nlh->nlmsg_len - NLMSG_HDRLEN; } +/** + * nlmsg_payload - message payload if the data fits in the len + * @nlh: netlink message header + * @len: struct length + * + * Returns: The netlink message payload/data if the length is sufficient, + * otherwise NULL. + */ +static inline void *nlmsg_payload(const struct nlmsghdr *nlh, size_t len) +{ + if (nlh->nlmsg_len < nlmsg_msg_size(len)) + return NULL; + + return nlmsg_data(nlh); +} + /** * nlmsg_attrdata - head of attributes data * @nlh: netlink message header -- cgit v1.2.3 From 7c571ac57d9d97190dcba18212fabf99888b0c48 Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Mon, 14 Apr 2025 14:26:30 -0700 Subject: net: ptp: introduce .supported_extts_flags to ptp_clock_info MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The PTP_EXTTS_REQUEST(2) ioctl has a flags field which specifies how the external timestamp request should behave. This includes which edge of the signal to timestamp, as well as a specialized "offset" mode. It is expected that more flags will be added in the future. Driver authors routinely do not check the flags, often accepting requests with flags which they do not support. Even drivers which do check flags may not be future-proofed to reject flags not yet defined. Thus, any future flag additions often require manually updating drivers to reject these flags. 
This approach of hoping we catch flag checks during review, or playing whack-a-mole after the fact is the wrong approach. Introduce the "supported_extts_flags" field to the ptp_clock_info structure. This field defines the set of flags the device actually supports. Update the core character device logic to check this field and reject unsupported requests. Getting this right is somewhat tricky. First, to avoid unnecessary repetition and make basic functionality work when .supported_extts_flags is 0, the core always accepts the PTP_ENABLE_FEATURE flag. This flag is used to set the 'on' parameter to the .enable function and is thus always 'supported' by all drivers. For backwards compatibility, the PTP_RISING_EDGE and PTP_FALLING_EDGE flags are merely "hints" when using the old PTP_EXTTS_REQUEST ioctl, and are not expected to be enforced. If the user issues PTP_EXTTS_REQUEST2, the PTP_STRICT_FLAGS flag is added which is supposed to inform the driver to strictly validate the flags and reject unsupported requests. To handle this, first check if the driver reports PTP_STRICT_FLAGS support. If it does not, then always allow the PTP_RISING_EDGE and PTP_FALLING_EDGE flags. This keeps backwards compatibility with the original PTP_EXTTS_REQUEST ioctl where these flags are not guaranteed to be honored. This way, drivers which do not set the supported_extts_flags will continue to accept requests for the original PTP_EXTTS_REQUEST ioctl. The core will automatically reject requests with new flags, and correctly reject requests with PTP_STRICT_FLAGS, where the driver is supposed to strictly validate the flags. Update the various drivers, refactoring their validation logic into the .supported_extts_flags field. For consistency and readability, PTP_ENABLE_FEATURE is not set in the supported flags list, and PTP_EXTTS_EDGES is expanded to PTP_RISING_EDGE | PTP_FALLING_EDGE in all cases. 
Note the following driver files set n_ext_ts to a non-zero value but did not check flags at all: • drivers/net/ethernet/freescale/dpaa2/dpaa2-ptp.c • drivers/net/ethernet/freescale/enetc/enetc_ptp.c • drivers/net/ethernet/intel/i40e/i40e_ptp.c • drivers/net/ethernet/marvell/octeontx2/nic/otx2_ptp.c • drivers/net/ethernet/renesas/ravb_ptp.c • drivers/net/ethernet/renesas/rtsn.c • drivers/net/ethernet/renesas/rtsn.h • drivers/net/ethernet/ti/am65-cpts.c • drivers/net/ethernet/ti/cpts.h • drivers/net/ethernet/ti/icssg/icss_iep.c • drivers/net/ethernet/xscale/ptp_ixp46x.c • drivers/net/phy/bcm-phy-ptp.c • drivers/ptp/ptp_ocp.c • drivers/ptp/ptp_pch.c • drivers/ptp/ptp_qoriq.c These drivers' behavior does change slightly: they will now reject the PTP_EXTTS_REQUEST2 ioctl, because they do not strictly validate their flags. This also makes them no longer incorrectly accept PTP_EXT_OFFSET. Also note that the renesas ravb driver does not support PTP_STRICT_FLAGS. We could leave the .supported_extts_flags as 0, but I added the PTP_RISING_EDGE | PTP_FALLING_EDGE since the driver previously manually validated these flags. This is equivalent to 0 because the core will allow these flags regardless unless PTP_STRICT_FLAGS is also set. Signed-off-by: Jacob Keller Reviewed-by: Kory Maincent Link: https://patch.msgid.link/20250414-jk-supported-perout-flags-v2-1-f6b17d15475c@intel.com Signed-off-by: Jakub Kicinski --- include/linux/ptp_clock_kernel.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include') diff --git a/include/linux/ptp_clock_kernel.h b/include/linux/ptp_clock_kernel.h index 0d68d09bedd1..25cba2e5ee69 100644 --- a/include/linux/ptp_clock_kernel.h +++ b/include/linux/ptp_clock_kernel.h @@ -68,6 +68,17 @@ struct ptp_system_timestamp { * @n_per_out: The number of programmable periodic signals. * @n_pins: The number of programmable pins. * @pps: Indicates whether the clock supports a PPS callback.
+ * + * @supported_extts_flags: The set of flags the driver supports for the + * PTP_EXTTS_REQUEST ioctl. The PTP core will use + * this list to reject unsupported requests. + * PTP_ENABLE_FEATURE is assumed and does not need to + * be included. If PTP_STRICT_FLAGS is *not* set, + * then both PTP_RISING_EDGE and PTP_FALLING_EDGE + * will be assumed. Note that PTP_STRICT_FLAGS must + * be set if the driver wants to honor + * PTP_EXTTS_REQUEST2 and any future flags. + * * @pin_config: Array of length 'n_pins'. If the number of * programmable pins is nonzero, then drivers must * allocate and initialize this array. @@ -174,6 +185,7 @@ struct ptp_clock_info { int n_per_out; int n_pins; int pps; + unsigned int supported_extts_flags; struct ptp_pin_desc *pin_config; int (*adjfine)(struct ptp_clock_info *ptp, long scaled_ppm); int (*adjphase)(struct ptp_clock_info *ptp, s32 phase); -- cgit v1.2.3 From d9f3e9ecc4562ae07aaf614cf0a6690ef7ca0e10 Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Mon, 14 Apr 2025 14:26:31 -0700 Subject: net: ptp: introduce .supported_perout_flags to ptp_clock_info MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The PTP_PEROUT_REQUEST2 ioctl has gained support for flags specifying specific output behavior including PTP_PEROUT_ONE_SHOT, PTP_PEROUT_DUTY_CYCLE, PTP_PEROUT_PHASE. Driver authors are notorious for not checking the flags of the request. This results in misinterpreting the request, generating an output signal that does not match the requested value. It is anticipated that even more flags will be added in the future, resulting in even more broken requests. Expecting these issues to be caught during review or playing whack-a-mole after the fact is not a great solution. Instead, introduce the supported_perout_flags field in the ptp_clock_info structure. Update the core character device logic to explicitly reject any request which has a flag not on this list.
This ensures that drivers must 'opt in' to the flags they support. Drivers which don't set the .supported_perout_flags field will not need to check that unsupported flags aren't passed, as the core takes care of this. Update the drivers which do support flags to set this new field. Note the following driver files set n_per_out to a non-zero value but did not check the flags at all: • drivers/ptp/ptp_clockmatrix.c • drivers/ptp/ptp_idt82p33.c • drivers/ptp/ptp_fc3.c • drivers/net/ethernet/ti/am65-cpts.c • drivers/net/ethernet/aquantia/atlantic/aq_ptp.c • drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.c • drivers/net/dsa/sja1105/sja1105_ptp.c • drivers/net/ethernet/freescale/dpaa2/dpaa2-ptp.c • drivers/net/ethernet/mscc/ocelot_vsc7514.c • drivers/net/ethernet/intel/i40e/i40e_ptp.c Reviewed-by: Vadim Fedorenko Signed-off-by: Jacob Keller Reviewed-by: Kory Maincent Link: https://patch.msgid.link/20250414-jk-supported-perout-flags-v2-2-f6b17d15475c@intel.com Signed-off-by: Jakub Kicinski --- include/linux/ptp_clock_kernel.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/ptp_clock_kernel.h b/include/linux/ptp_clock_kernel.h index 25cba2e5ee69..eced7e9bf69a 100644 --- a/include/linux/ptp_clock_kernel.h +++ b/include/linux/ptp_clock_kernel.h @@ -69,6 +69,11 @@ struct ptp_system_timestamp { * @n_pins: The number of programmable pins. * @pps: Indicates whether the clock supports a PPS callback. * + * @supported_perout_flags: The set of flags the driver supports for the + * PTP_PEROUT_REQUEST ioctl. The PTP core will + * reject a request with any flag not specified + * here. + * * @supported_extts_flags: The set of flags the driver supports for the * PTP_EXTTS_REQUEST ioctl. The PTP core will use * this list to reject unsupported requests. 
@@ -185,6 +190,7 @@ struct ptp_clock_info { int n_per_out; int n_pins; int pps; + unsigned int supported_perout_flags; unsigned int supported_extts_flags; struct ptp_pin_desc *pin_config; int (*adjfine)(struct ptp_clock_info *ptp, long scaled_ppm); -- cgit v1.2.3 From 43eca05b6a3b917c600e10cc6b06bfa57fa57401 Mon Sep 17 00:00:00 2001 From: Cosmin Ratiu Date: Fri, 11 Apr 2025 10:49:56 +0300 Subject: xfrm: Add explicit dev to .xdo_dev_state_{add,delete,free} Previously, device driver IPSec offload implementations would fall into two categories: 1. Those that used xso.dev to determine the offload device. 2. Those that used xso.real_dev to determine the offload device. The first category didn't work with bonding while the second did. In a non-bonding setup the two pointers are the same. This commit adds explicit pointers for the offload netdevice to .xdo_dev_state_add() / .xdo_dev_state_delete() / .xdo_dev_state_free() which eliminates the confusion and allows drivers from the first category to work with bonding. xso.real_dev now becomes a private pointer managed by the bonding driver. 
Signed-off-by: Cosmin Ratiu Reviewed-by: Leon Romanovsky Reviewed-by: Nikolay Aleksandrov Signed-off-by: Steffen Klassert --- include/linux/netdevice.h | 10 +++++++--- include/net/xfrm.h | 8 ++++++++ 2 files changed, 15 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index d8544f6a680c..88dfb8aeed3c 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1013,9 +1013,13 @@ struct netdev_bpf { #ifdef CONFIG_XFRM_OFFLOAD struct xfrmdev_ops { - int (*xdo_dev_state_add) (struct xfrm_state *x, struct netlink_ext_ack *extack); - void (*xdo_dev_state_delete) (struct xfrm_state *x); - void (*xdo_dev_state_free) (struct xfrm_state *x); + int (*xdo_dev_state_add)(struct net_device *dev, + struct xfrm_state *x, + struct netlink_ext_ack *extack); + void (*xdo_dev_state_delete)(struct net_device *dev, + struct xfrm_state *x); + void (*xdo_dev_state_free)(struct net_device *dev, + struct xfrm_state *x); bool (*xdo_dev_offload_ok) (struct sk_buff *skb, struct xfrm_state *x); void (*xdo_dev_state_advance_esn) (struct xfrm_state *x); diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 39365fd2ea17..3d2f6c879311 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -147,8 +147,16 @@ enum { }; struct xfrm_dev_offload { + /* The device for this offload. + * Device drivers should not use this directly, as that will prevent + * them from working with a bonding device. Instead, the device passed + * to the add/delete callbacks should be used. + */ struct net_device *dev; netdevice_tracker dev_tracker; + /* This is a private pointer used by the bonding driver. + * Device drivers should not use it.
+ */ struct net_device *real_dev; unsigned long offload_handle; u8 dir : 2; -- cgit v1.2.3 From d2fddbd3479928e52061e1c8dd302006b6283ce8 Mon Sep 17 00:00:00 2001 From: Cosmin Ratiu Date: Fri, 11 Apr 2025 10:49:58 +0300 Subject: bonding: Fix multiple long standing offload races Refactor the bonding ipsec offload operations to fix a number of long-standing control plane races between state migration and user deletion and a few other issues. xfrm state deletion can happen concurrently with bond_change_active_slave() operation. This manifests itself as a bond_ipsec_del_sa() call with x->lock held, followed by a bond_ipsec_free_sa() a bit later from a wq. The alternate path of these calls coming from xfrm_dev_state_flush() can't happen, as that needs the RTNL lock and bond_change_active_slave() already holds it. 1. bond_ipsec_del_sa_all() might call xdo_dev_state_delete() a second time on an xfrm state that was concurrently killed. This is bad. 2. bond_ipsec_add_sa_all() can add a state on the new device, but pending bond_ipsec_free_sa() calls from the old device will then hit the WARN_ON() and then, worse, call xdo_dev_state_free() on the new device without a corresponding xdo_dev_state_delete(). 3. Resolve a sleeping in atomic context introduced by the mentioned "Fixes" commit. bond_ipsec_del_sa_all() and bond_ipsec_add_sa_all() now acquire x->lock and check for x->km.state to help with problems 1 and 2. And since xso.real_dev is now a private pointer managed by the bonding driver in xfrm state, make better use of it to fully fix problems 1 and 2. In bond_ipsec_del_sa_all(), set xso.real_dev to NULL while holding both the mutex and x->lock, which makes sure that neither bond_ipsec_del_sa() nor bond_ipsec_free_sa() could run concurrently. 
Fix problem 3 by moving the list cleanup (which requires the mutex) from bond_ipsec_del_sa() (called from atomic context) to bond_ipsec_free_sa(). Finally, simplify bond_ipsec_del_sa() and bond_ipsec_free_sa() by using xso->real_dev directly, since it's now protected by locks and can be trusted to always reflect the offload device. Fixes: 2aeeef906d5a ("bonding: change ipsec_lock from spin lock to mutex") Signed-off-by: Cosmin Ratiu Reviewed-by: Leon Romanovsky Reviewed-by: Nikolay Aleksandrov Reviewed-by: Hangbin Liu Tested-by: Hangbin Liu Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 3d2f6c879311..b7e8f3f49627 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -154,8 +154,11 @@ struct xfrm_dev_offload { */ struct net_device *dev; netdevice_tracker dev_tracker; - /* This is a private pointer used by the bonding driver. - * Device drivers should not use it. + /* This is a private pointer used by the bonding driver (and eventually + * should be moved there). Device drivers should not use it. + * Protected by xfrm_state.lock AND bond.ipsec_lock in most cases, + * except in the .xdo_dev_state_delete() flow, where only xfrm_state.lock + * is held. */ struct net_device *real_dev; unsigned long offload_handle; -- cgit v1.2.3 From cd1fafe7da1f6f2aa25723e317f6e8e9d0c050a1 Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Tue, 15 Apr 2025 05:24:58 +0000 Subject: eth: bnxt: add support rx side device memory TCP Currently, bnxt_en driver satisfies the requirements of the Device memory TCP, which is HDS. So, it implements rx-side Device memory TCP for bnxt_en driver. It requires only converting the page API to netmem API. `struct page` of agg rings are changed to `netmem_ref netmem` and corresponding functions are changed to a variant of netmem API.
It also passes PP_FLAG_ALLOW_UNREADABLE_NETMEM flag to a parameter of page_pool. The netmem will be activated only when a user requests devmem TCP. When netmem is activated, received data is unreadable; when netmem is disabled, received data is readable. But drivers don't need to handle both cases because netmem core API will handle it properly. So, using proper netmem API is enough for drivers. Device memory TCP can be tested with tools/testing/selftests/drivers/net/hw/ncdevmem. This is tested with BCM57504-N425G and firmware version 232.0.155.8/pkg 232.1.132.8. Reviewed-by: Mina Almasry Tested-by: David Wei Signed-off-by: Taehee Yoo Link: https://patch.msgid.link/20250415052458.1260575-1-ap420073@gmail.com Signed-off-by: Jakub Kicinski --- include/net/page_pool/helpers.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include') diff --git a/include/net/page_pool/helpers.h b/include/net/page_pool/helpers.h index 582a3d00cbe2..93f2c31baf9b 100644 --- a/include/net/page_pool/helpers.h +++ b/include/net/page_pool/helpers.h @@ -395,6 +395,12 @@ static inline void page_pool_recycle_direct(struct page_pool *pool, page_pool_put_full_page(pool, page, true); } +static inline void page_pool_recycle_direct_netmem(struct page_pool *pool, + netmem_ref netmem) +{ + page_pool_put_full_netmem(pool, netmem, true); +} + #define PAGE_POOL_32BIT_ARCH_WITH_64BIT_DMA \ (sizeof(dma_addr_t) > sizeof(unsigned long)) @@ -492,4 +498,9 @@ static inline void page_pool_nid_changed(struct page_pool *pool, int new_nid) page_pool_update_nid(pool, new_nid); } +static inline bool page_pool_is_unreadable(struct page_pool *pool) +{ + return !!pool->mp_ops; +} + #endif /* _NET_PAGE_POOL_HELPERS_H */ -- cgit v1.2.3 From ab244a394c7f13f6573744b9ca72bb22151a3ec4 Mon Sep 17 00:00:00 2001 From: Chiachang Wang Date: Thu, 13 Mar 2025 02:36:40 +0000 Subject: xfrm: Migrate offload configuration Add hardware offload configuration to XFRM_MSG_MIGRATE using an optional netlink attribute
XFRMA_OFFLOAD_DEV. In the existing xfrm_state_migrate(), the xfrm_init_state() is called assuming no hardware offload by default. Even if the original xfrm_state is configured with offload, the setting will be reset. If the device is configured with hardware offload, it's reasonable to allow the device to maintain its hardware offload mode. But the device will end up with offload disabled after receiving a migration event when the device migrates the connection from one netdev to another one. The devices that support migration may work with different underlying networks, such as mobile devices. The hardware setting should be forwarded to the different netdev based on the migration configuration. This change provides the capability for user space to migrate from one netdev to another. Test: Tested with kernel test in the Android tree located in https://android.googlesource.com/kernel/tests/ The xfrm_tunnel_test.py under the tests folder in particular. Signed-off-by: Chiachang Wang Reviewed-by: Leon Romanovsky Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index b7e8f3f49627..466423a1a70a 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1904,12 +1904,16 @@ struct xfrm_state *xfrm_migrate_state_find(struct xfrm_migrate *m, struct net *n u32 if_id); struct xfrm_state *xfrm_state_migrate(struct xfrm_state *x, struct xfrm_migrate *m, - struct xfrm_encap_tmpl *encap); + struct xfrm_encap_tmpl *encap, + struct net *net, + struct xfrm_user_offload *xuo, + struct netlink_ext_ack *extack); int xfrm_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, struct xfrm_migrate *m, int num_bundles, struct xfrm_kmaddress *k, struct net *net, struct xfrm_encap_tmpl *encap, u32 if_id, - struct netlink_ext_ack *extack); + struct netlink_ext_ack *extack, + struct xfrm_user_offload *xuo); #endif int km_new_mapping(struct xfrm_state
*x, xfrm_address_t *ipaddr, __be16 sport); -- cgit v1.2.3 From b7a63391aa982295bbb3125e7d4470f51f31ff0f Mon Sep 17 00:00:00 2001 From: Antonio Quartulli Date: Tue, 15 Apr 2025 13:17:19 +0200 Subject: ovpn: add basic netlink support This commit introduces basic netlink support with family registration/unregistration functionalities and stub pre/post-doit. More importantly it introduces the YAML uAPI description along with its auto-generated files: - include/uapi/linux/ovpn.h - drivers/net/ovpn/netlink-gen.c - drivers/net/ovpn/netlink-gen.h Reviewed-by: Donald Hunter Signed-off-by: Antonio Quartulli Link: https://patch.msgid.link/20250415-b4-ovpn-v26-2-577f6097b964@openvpn.net Reviewed-by: Sabrina Dubroca Tested-by: Oleksandr Natalenko Signed-off-by: Paolo Abeni --- include/uapi/linux/ovpn.h | 109 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 109 insertions(+) create mode 100644 include/uapi/linux/ovpn.h (limited to 'include') diff --git a/include/uapi/linux/ovpn.h b/include/uapi/linux/ovpn.h new file mode 100644 index 000000000000..680d1522dc87 --- /dev/null +++ b/include/uapi/linux/ovpn.h @@ -0,0 +1,109 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ +/* Do not edit directly, auto-generated from: */ +/* Documentation/netlink/specs/ovpn.yaml */ +/* YNL-GEN uapi header */ + +#ifndef _UAPI_LINUX_OVPN_H +#define _UAPI_LINUX_OVPN_H + +#define OVPN_FAMILY_NAME "ovpn" +#define OVPN_FAMILY_VERSION 1 + +#define OVPN_NONCE_TAIL_SIZE 8 + +enum ovpn_cipher_alg { + OVPN_CIPHER_ALG_NONE, + OVPN_CIPHER_ALG_AES_GCM, + OVPN_CIPHER_ALG_CHACHA20_POLY1305, +}; + +enum ovpn_del_peer_reason { + OVPN_DEL_PEER_REASON_TEARDOWN, + OVPN_DEL_PEER_REASON_USERSPACE, + OVPN_DEL_PEER_REASON_EXPIRED, + OVPN_DEL_PEER_REASON_TRANSPORT_ERROR, + OVPN_DEL_PEER_REASON_TRANSPORT_DISCONNECT, +}; + +enum ovpn_key_slot { + OVPN_KEY_SLOT_PRIMARY, + OVPN_KEY_SLOT_SECONDARY, +}; + +enum { + OVPN_A_PEER_ID = 1, + OVPN_A_PEER_REMOTE_IPV4, + 
OVPN_A_PEER_REMOTE_IPV6, + OVPN_A_PEER_REMOTE_IPV6_SCOPE_ID, + OVPN_A_PEER_REMOTE_PORT, + OVPN_A_PEER_SOCKET, + OVPN_A_PEER_SOCKET_NETNSID, + OVPN_A_PEER_VPN_IPV4, + OVPN_A_PEER_VPN_IPV6, + OVPN_A_PEER_LOCAL_IPV4, + OVPN_A_PEER_LOCAL_IPV6, + OVPN_A_PEER_LOCAL_PORT, + OVPN_A_PEER_KEEPALIVE_INTERVAL, + OVPN_A_PEER_KEEPALIVE_TIMEOUT, + OVPN_A_PEER_DEL_REASON, + OVPN_A_PEER_VPN_RX_BYTES, + OVPN_A_PEER_VPN_TX_BYTES, + OVPN_A_PEER_VPN_RX_PACKETS, + OVPN_A_PEER_VPN_TX_PACKETS, + OVPN_A_PEER_LINK_RX_BYTES, + OVPN_A_PEER_LINK_TX_BYTES, + OVPN_A_PEER_LINK_RX_PACKETS, + OVPN_A_PEER_LINK_TX_PACKETS, + + __OVPN_A_PEER_MAX, + OVPN_A_PEER_MAX = (__OVPN_A_PEER_MAX - 1) +}; + +enum { + OVPN_A_KEYCONF_PEER_ID = 1, + OVPN_A_KEYCONF_SLOT, + OVPN_A_KEYCONF_KEY_ID, + OVPN_A_KEYCONF_CIPHER_ALG, + OVPN_A_KEYCONF_ENCRYPT_DIR, + OVPN_A_KEYCONF_DECRYPT_DIR, + + __OVPN_A_KEYCONF_MAX, + OVPN_A_KEYCONF_MAX = (__OVPN_A_KEYCONF_MAX - 1) +}; + +enum { + OVPN_A_KEYDIR_CIPHER_KEY = 1, + OVPN_A_KEYDIR_NONCE_TAIL, + + __OVPN_A_KEYDIR_MAX, + OVPN_A_KEYDIR_MAX = (__OVPN_A_KEYDIR_MAX - 1) +}; + +enum { + OVPN_A_IFINDEX = 1, + OVPN_A_PEER, + OVPN_A_KEYCONF, + + __OVPN_A_MAX, + OVPN_A_MAX = (__OVPN_A_MAX - 1) +}; + +enum { + OVPN_CMD_PEER_NEW = 1, + OVPN_CMD_PEER_SET, + OVPN_CMD_PEER_GET, + OVPN_CMD_PEER_DEL, + OVPN_CMD_PEER_DEL_NTF, + OVPN_CMD_KEY_NEW, + OVPN_CMD_KEY_GET, + OVPN_CMD_KEY_SWAP, + OVPN_CMD_KEY_SWAP_NTF, + OVPN_CMD_KEY_DEL, + + __OVPN_CMD_MAX, + OVPN_CMD_MAX = (__OVPN_CMD_MAX - 1) +}; + +#define OVPN_MCGRP_PEERS "peers" + +#endif /* _UAPI_LINUX_OVPN_H */ -- cgit v1.2.3 From c2d950c4672a012ea9765c15a389cdcdf919f652 Mon Sep 17 00:00:00 2001 From: Antonio Quartulli Date: Tue, 15 Apr 2025 13:17:20 +0200 Subject: ovpn: add basic interface creation/destruction/management routines Add basic infrastructure for handling ovpn interfaces. 
Tested-by: Donald Hunter Signed-off-by: Antonio Quartulli Link: https://patch.msgid.link/20250415-b4-ovpn-v26-3-577f6097b964@openvpn.net Reviewed-by: Sabrina Dubroca Tested-by: Oleksandr Natalenko Signed-off-by: Paolo Abeni --- include/uapi/linux/if_link.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 318386cc5b0d..3ad2d5d98034 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -1986,4 +1986,19 @@ enum { #define IFLA_DSA_MAX (__IFLA_DSA_MAX - 1) +/* OVPN section */ + +enum ovpn_mode { + OVPN_MODE_P2P, + OVPN_MODE_MP, +}; + +enum { + IFLA_OVPN_UNSPEC, + IFLA_OVPN_MODE, + __IFLA_OVPN_MAX, +}; + +#define IFLA_OVPN_MAX (__IFLA_OVPN_MAX - 1) + #endif /* _UAPI_LINUX_IF_LINK_H */ -- cgit v1.2.3 From f6226ae7a0cd47aaa9175aca6a1e19600f884cbf Mon Sep 17 00:00:00 2001 From: Antonio Quartulli Date: Tue, 15 Apr 2025 13:17:23 +0200 Subject: ovpn: introduce the ovpn_socket object This specific structure is used in the ovpn kernel module to wrap and carry around a standard kernel socket. ovpn takes ownership of passed sockets and therefore an ovpn-specific object is attached to them for status tracking purposes. Initially only UDP support is introduced. TCP will come in a later patch. Cc: willemdebruijn.kernel@gmail.com Signed-off-by: Antonio Quartulli Link: https://patch.msgid.link/20250415-b4-ovpn-v26-6-577f6097b964@openvpn.net Reviewed-by: Sabrina Dubroca Tested-by: Oleksandr Natalenko Signed-off-by: Paolo Abeni --- include/uapi/linux/udp.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/udp.h b/include/uapi/linux/udp.h index d85d671deed3..edca3e430305 100644 --- a/include/uapi/linux/udp.h +++ b/include/uapi/linux/udp.h @@ -43,5 +43,6 @@ struct udphdr { #define UDP_ENCAP_GTP1U 5 /* 3GPP TS 29.060 */ #define UDP_ENCAP_RXRPC 6 #define TCP_ENCAP_ESPINTCP 7 /* Yikes, this is really xfrm encap types.
*/ +#define UDP_ENCAP_OVPNINUDP 8 /* OpenVPN traffic */ #endif /* _UAPI_LINUX_UDP_H */ -- cgit v1.2.3 From 17240749f26e07cafa676688d8a3326086498447 Mon Sep 17 00:00:00 2001 From: Antonio Quartulli Date: Tue, 15 Apr 2025 13:17:29 +0200 Subject: skb: implement skb_send_sock_locked_with_flags() When sending an skb over a socket using skb_send_sock_locked(), it is currently not possible to specify any flag to be set in msghdr->msg_flags. However, we may want to pass flags the user may have specified, like MSG_NOSIGNAL. Extend __skb_send_sock() with a new argument 'flags' and add a new interface named skb_send_sock_locked_with_flags(). Cc: Eric Dumazet Cc: Jakub Kicinski Cc: Paolo Abeni Cc: Simon Horman Signed-off-by: Antonio Quartulli Link: https://patch.msgid.link/20250415-b4-ovpn-v26-12-577f6097b964@openvpn.net Reviewed-by: Sabrina Dubroca Tested-by: Oleksandr Natalenko Signed-off-by: Paolo Abeni --- include/linux/skbuff.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index f1381aff0f89..beb084ee4f4d 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -4145,6 +4145,8 @@ int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset, unsigned int flags); int skb_send_sock_locked(struct sock *sk, struct sk_buff *skb, int offset, int len); +int skb_send_sock_locked_with_flags(struct sock *sk, struct sk_buff *skb, + int offset, int len, int flags); int skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, int len); void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to); unsigned int skb_zerocopy_headlen(const struct sk_buff *from); -- cgit v1.2.3 From a1b669ea16c4d7c1a1a8fc7e25aaf651ea0078c3 Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Wed, 9 Apr 2025 14:45:57 -0700 Subject: bpf: Prepare to reuse get_ctx_arg_idx MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rename get_ctx_arg_idx to btf_ctx_arg_idx,
and allow others to call it. No functional change. Signed-off-by: Amery Hung Signed-off-by: Martin KaFai Lau Acked-by: Toke Høiland-Jørgensen Link: https://patch.msgid.link/20250409214606.2000194-2-ameryhung@gmail.com --- include/linux/btf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/btf.h b/include/linux/btf.h index ebc0c0c9b944..b2983706292f 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -522,6 +522,7 @@ bool btf_param_match_suffix(const struct btf *btf, const char *suffix); int btf_ctx_arg_offset(const struct btf *btf, const struct btf_type *func_proto, u32 arg_no); +u32 btf_ctx_arg_idx(struct btf *btf, const struct btf_type *func_proto, int off); struct bpf_verifier_log; -- cgit v1.2.3 From 151e13ece86d234213b7f224f0e26a957c0eeb3e Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 15 Apr 2025 18:02:15 -0700 Subject: net: ethtool: Adjust exactly ETH_GSTRING_LEN-long stats to use memcpy Many drivers populate the stats buffer using C-String based APIs (e.g. ethtool_sprintf() and ethtool_puts()), usually when building up the list of stats individually (i.e. with a for() loop). This, however, requires that the source strings be populated in such a way as to have a terminating NUL byte in the source. Other drivers populate the stats buffer directly using one big memcpy() of an entire array of strings. No NUL termination is needed here, as the bytes are being directly passed through. Yet others will build up the stats buffer individually, but also use memcpy(). This, too, does not need NUL termination of the source strings. However, there are cases where the strings that populate the source stats strings are exactly ETH_GSTRING_LEN long, and GCC 15's -Wunterminated-string-initialization option complains that the trailing NUL byte has been truncated. This situation is fine only if the driver is using the memcpy() approach. 
If the C-String APIs are used, the destination string name will have its final byte truncated by the required trailing NUL byte applied by the C-string API. For drivers that are already using memcpy() but have initializers that truncate the NUL terminator, mark their source strings as __nonstring to silence the GCC warnings. For drivers that have initializers that truncate the NUL terminator and are using the C-String APIs, switch to memcpy() to avoid destination string truncation and mark their source strings as __nonstring to silence the GCC warnings. (Also introduce ethtool_cpy() as a helper to make this an easy replacement). Specifically the following warnings were investigated and addressed: ../drivers/net/ethernet/chelsio/cxgb/cxgb2.c:364:9: warning: initializer-string for array of 'char' truncates NUL terminator but destination lacks 'nonstring' attribute (33 chars into 32 available) [-Wunterminated-string-initialization] 364 | "TxFramesAbortedDueToXSCollisions", | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ../drivers/net/ethernet/freescale/enetc/enetc_ethtool.c:165:33: warning: initializer-string for array of 'char' truncates NUL terminator but destination lacks 'nonstring' attribute (33 chars into 32 available) [-Wunterminated-string-initialization] 165 | { ENETC_PM_R1523X(0), "MAC rx 1523 to max-octet packets" }, | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ../drivers/net/ethernet/freescale/enetc/enetc_ethtool.c:190:33: warning: initializer-string for array of 'char' truncates NUL terminator but destination lacks 'nonstring' attribute (33 chars into 32 available) [-Wunterminated-string-initialization] 190 | { ENETC_PM_T1523X(0), "MAC tx 1523 to max-octet packets" }, | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ../drivers/net/ethernet/google/gve/gve_ethtool.c:76:9: warning: initializer-string for array of 'char' truncates NUL terminator but destination lacks 'nonstring' attribute (33 chars into 32 available) [-Wunterminated-string-initialization] 76 | 
"adminq_dcfg_device_resources_cnt", "adminq_set_driver_parameter_cnt", | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ../drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c:117:53: warning: initializer-string for array of 'char' truncates NUL terminator but destination lacks 'nonstring' attribute (33 chars into 32 available) [-Wunterminated-string-initialization] 117 | STMMAC_STAT(ptp_rx_msg_type_pdelay_follow_up), | ^ ../drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c:46:12: note: in definition of macro 'STMMAC_STAT' 46 | { #m, sizeof_field(struct stmmac_extra_stats, m), \ | ^ ../drivers/net/ethernet/mellanox/mlxsw/spectrum_ethtool.c:328:24: warning: initializer-string for array of 'char' truncates NUL terminator but destination lacks 'nonstring' attribute (33 chars into 32 available) [-Wunterminated-string-initialization] 328 | .str = "a_mac_control_frames_transmitted", | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ../drivers/net/ethernet/mellanox/mlxsw/spectrum_ethtool.c:340:24: warning: initializer-string for array of 'char' truncates NUL terminator but destination lacks 'nonstring' attribute (33 chars into 32 available) [-Wunterminated-string-initialization] 340 | .str = "a_pause_mac_ctrl_frames_received", | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Signed-off-by: Kees Cook Reviewed-by: Petr Machata # for mlxsw Reviewed-by: Harshitha Ramamurthy Link: https://patch.msgid.link/20250416010210.work.904-kees@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/ethtool.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 013d25858642..7edb5f5e7134 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -1330,6 +1330,17 @@ extern __printf(2, 3) void ethtool_sprintf(u8 **data, const char *fmt, ...); */ extern void ethtool_puts(u8 **data, const char *str); +/** + * ethtool_cpy - Write possibly-not-NUL-terminated string to ethtool string data + * @data: Pointer to a pointer to the 
start of string to write into + * @str: NUL-byte padded char array of size ETH_GSTRING_LEN to copy from + */ +#define ethtool_cpy(data, str) do { \ + BUILD_BUG_ON(sizeof(str) != ETH_GSTRING_LEN); \ + memcpy(*(data), str, ETH_GSTRING_LEN); \ + *(data) += ETH_GSTRING_LEN; \ +} while (0) + /* Link mode to forced speed capabilities maps */ struct ethtool_forced_speed_map { u32 speed; -- cgit v1.2.3 From 22cbc1ee268b7ec0000848708944daa61c6e4909 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 15 Apr 2025 20:04:47 -0700 Subject: netdev: fix the locking for netdev notifications Kuniyuki reports that the assert for netdev lock fires when there are netdev event listeners (otherwise we skip the netlink event generation). Correct the locking when coming from the notifier. The NETDEV_XDP_FEAT_CHANGE notifier is already fully locked, it's the documentation that's incorrect. Fixes: 99e44f39a8f7 ("netdev: depend on netdev->lock for xdp features") Reported-by: syzkaller Reported-by: Kuniyuki Iwashima Link: https://lore.kernel.org/20250410171019.62128-1-kuniyu@amazon.com Acked-by: Stanislav Fomichev Link: https://patch.msgid.link/20250416030447.1077551-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 2 +- include/net/netdev_lock.h | 16 ++++++++++++++++ 2 files changed, 17 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index e6036b82ef4c..0321fd952f70 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2520,7 +2520,7 @@ struct net_device { * @net_shaper_hierarchy, @reg_state, @threaded * * Double protects: - * @up, @moving_ns, @nd_net, @xdp_flags + * @up, @moving_ns, @nd_net, @xdp_features * * Double ops protects: * @real_num_rx_queues, @real_num_tx_queues diff --git a/include/net/netdev_lock.h b/include/net/netdev_lock.h index 5706835a660c..2a753813f849 100644 --- a/include/net/netdev_lock.h +++ b/include/net/netdev_lock.h @@ -48,6 +48,22 @@ static 
inline void netdev_unlock_ops(struct net_device *dev) netdev_unlock(dev); } +static inline void netdev_lock_ops_to_full(struct net_device *dev) +{ + if (netdev_need_ops_lock(dev)) + netdev_assert_locked(dev); + else + netdev_lock(dev); +} + +static inline void netdev_unlock_full_to_ops(struct net_device *dev) +{ + if (netdev_need_ops_lock(dev)) + netdev_assert_locked(dev); + else + netdev_unlock(dev); +} + static inline void netdev_ops_assert_locked(const struct net_device *dev) { if (netdev_need_ops_lock(dev)) -- cgit v1.2.3 From 2b905deb43ea0b67fa8448fc9c15dacb068f45b6 Mon Sep 17 00:00:00 2001 From: Zijun Hu Date: Wed, 16 Apr 2025 19:56:23 +0800 Subject: net: Delete the outer () duplicated of macro SOCK_SKB_CB_OFFSET definition For macro SOCK_SKB_CB_OFFSET definition, Delete the outer () duplicated. Signed-off-by: Zijun Hu Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250416-fix_net-v1-1-d544c9f3f169@quicinc.com Signed-off-by: Jakub Kicinski --- include/net/sock.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index bb4d6189292f..e223102337c7 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2604,8 +2604,8 @@ struct sock_skb_cb { * using skb->cb[] would keep using it directly and utilize its * alignment guarantee. */ -#define SOCK_SKB_CB_OFFSET ((sizeof_field(struct sk_buff, cb) - \ - sizeof(struct sock_skb_cb))) +#define SOCK_SKB_CB_OFFSET (sizeof_field(struct sk_buff, cb) - \ + sizeof(struct sock_skb_cb)) #define SOCK_SKB_CB(__skb) ((struct sock_skb_cb *)((__skb)->cb + \ SOCK_SKB_CB_OFFSET)) -- cgit v1.2.3 From 1df4a945444f071a9c5e09580a485919c42d4de5 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Wed, 16 Apr 2025 10:06:12 -0700 Subject: trace: tcp: Add const qualifier to skb parameter in tcp_probe event Change the tcp_probe tracepoint to accept a const struct sk_buff parameter instead of a non-const one. 
This improves type safety and better reflects that the skb is not modified within the tracepoint implementation. Signed-off-by: Breno Leitao Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250416-tcp_probe-v1-1-1edc3c5a1cb8@debian.org Signed-off-by: Jakub Kicinski --- include/trace/events/tcp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h index 75d3d53a3832..53e878fa14d1 100644 --- a/include/trace/events/tcp.h +++ b/include/trace/events/tcp.h @@ -293,7 +293,7 @@ DECLARE_TRACE(tcp_cwnd_reduction_tp, TRACE_EVENT(tcp_probe, - TP_PROTO(struct sock *sk, struct sk_buff *skb), + TP_PROTO(struct sock *sk, const struct sk_buff *skb), TP_ARGS(sk, skb), -- cgit v1.2.3 From 8066e388be48f1ad62b0449dc1d31a25489fa12a Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 16 Apr 2025 13:08:40 -0700 Subject: net: add UAPI to the header guard in various network headers fib_rule, ip6_tunnel, and a whole lot of if_* headers lack the customary _UAPI in the header guard. Without it YNL build can't protect from in tree and system headers both getting included. YNL doesn't need most of these but it's annoying to have to fix them one by one. Note that header installation strips this _UAPI prefix so this should result in no change to the end user. 
Acked-by: Jamal Hadi Salim Reviewed-by: Jason Xing Reviewed-by: Ido Schimmel Link: https://patch.msgid.link/20250416200840.1338195-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/uapi/linux/fib_rules.h | 4 ++-- include/uapi/linux/if_addr.h | 4 ++-- include/uapi/linux/if_addrlabel.h | 4 ++-- include/uapi/linux/if_alg.h | 6 +++--- include/uapi/linux/if_arcnet.h | 6 +++--- include/uapi/linux/if_bonding.h | 6 +++--- include/uapi/linux/if_fc.h | 6 +++--- include/uapi/linux/if_hippi.h | 6 +++--- include/uapi/linux/if_packet.h | 4 ++-- include/uapi/linux/if_plip.h | 4 ++-- include/uapi/linux/if_slip.h | 4 ++-- include/uapi/linux/if_x25.h | 6 +++--- include/uapi/linux/if_xdp.h | 6 +++--- include/uapi/linux/ip6_tunnel.h | 4 ++-- include/uapi/linux/net_dropmon.h | 4 ++-- include/uapi/linux/net_tstamp.h | 6 +++--- include/uapi/linux/netlink_diag.h | 4 ++-- include/uapi/linux/pkt_cls.h | 4 ++-- include/uapi/linux/pkt_sched.h | 4 ++-- 19 files changed, 46 insertions(+), 46 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/fib_rules.h b/include/uapi/linux/fib_rules.h index 2df6e4035d50..418c4be697ad 100644 --- a/include/uapi/linux/fib_rules.h +++ b/include/uapi/linux/fib_rules.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -#ifndef __LINUX_FIB_RULES_H -#define __LINUX_FIB_RULES_H +#ifndef _UAPI__LINUX_FIB_RULES_H +#define _UAPI__LINUX_FIB_RULES_H #include #include diff --git a/include/uapi/linux/if_addr.h b/include/uapi/linux/if_addr.h index 1c392dd95a5e..aa7958b4e41d 100644 --- a/include/uapi/linux/if_addr.h +++ b/include/uapi/linux/if_addr.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -#ifndef __LINUX_IF_ADDR_H -#define __LINUX_IF_ADDR_H +#ifndef _UAPI__LINUX_IF_ADDR_H +#define _UAPI__LINUX_IF_ADDR_H #include #include diff --git a/include/uapi/linux/if_addrlabel.h b/include/uapi/linux/if_addrlabel.h index d1f5974c76e1..e69db764fbba 100644 --- a/include/uapi/linux/if_addrlabel.h 
+++ b/include/uapi/linux/if_addrlabel.h @@ -8,8 +8,8 @@ * YOSHIFUJI Hideaki @ USAGI/WIDE */ -#ifndef __LINUX_IF_ADDRLABEL_H -#define __LINUX_IF_ADDRLABEL_H +#ifndef _UAPI__LINUX_IF_ADDRLABEL_H +#define _UAPI__LINUX_IF_ADDRLABEL_H #include diff --git a/include/uapi/linux/if_alg.h b/include/uapi/linux/if_alg.h index 0824fbc026a1..b35871cbeed7 100644 --- a/include/uapi/linux/if_alg.h +++ b/include/uapi/linux/if_alg.h @@ -11,8 +11,8 @@ * */ -#ifndef _LINUX_IF_ALG_H -#define _LINUX_IF_ALG_H +#ifndef _UAPI_LINUX_IF_ALG_H +#define _UAPI_LINUX_IF_ALG_H #include @@ -58,4 +58,4 @@ struct af_alg_iv { #define ALG_OP_DECRYPT 0 #define ALG_OP_ENCRYPT 1 -#endif /* _LINUX_IF_ALG_H */ +#endif /* _UAPI_LINUX_IF_ALG_H */ diff --git a/include/uapi/linux/if_arcnet.h b/include/uapi/linux/if_arcnet.h index b122cfac7128..473569eaf692 100644 --- a/include/uapi/linux/if_arcnet.h +++ b/include/uapi/linux/if_arcnet.h @@ -14,8 +14,8 @@ * 2 of the License, or (at your option) any later version. */ -#ifndef _LINUX_IF_ARCNET_H -#define _LINUX_IF_ARCNET_H +#ifndef _UAPI_LINUX_IF_ARCNET_H +#define _UAPI_LINUX_IF_ARCNET_H #include #include @@ -127,4 +127,4 @@ struct archdr { } soft; }; -#endif /* _LINUX_IF_ARCNET_H */ +#endif /* _UAPI_LINUX_IF_ARCNET_H */ diff --git a/include/uapi/linux/if_bonding.h b/include/uapi/linux/if_bonding.h index d174914a837d..3bcc03f3aa4f 100644 --- a/include/uapi/linux/if_bonding.h +++ b/include/uapi/linux/if_bonding.h @@ -41,8 +41,8 @@ * - added definitions for various XOR hashing policies */ -#ifndef _LINUX_IF_BONDING_H -#define _LINUX_IF_BONDING_H +#ifndef _UAPI_LINUX_IF_BONDING_H +#define _UAPI_LINUX_IF_BONDING_H #include #include @@ -152,4 +152,4 @@ enum { }; #define BOND_3AD_STAT_MAX (__BOND_3AD_STAT_MAX - 1) -#endif /* _LINUX_IF_BONDING_H */ +#endif /* _UAPI_LINUX_IF_BONDING_H */ diff --git a/include/uapi/linux/if_fc.h b/include/uapi/linux/if_fc.h index 3e3173282cc3..ff5ab92d16c2 100644 --- a/include/uapi/linux/if_fc.h +++ b/include/uapi/linux/if_fc.h @@ -18,8 
+18,8 @@ * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. */ -#ifndef _LINUX_IF_FC_H -#define _LINUX_IF_FC_H +#ifndef _UAPI_LINUX_IF_FC_H +#define _UAPI_LINUX_IF_FC_H #include @@ -49,4 +49,4 @@ struct fcllc { __be16 ethertype; /* ether type field */ }; -#endif /* _LINUX_IF_FC_H */ +#endif /* _UAPI_LINUX_IF_FC_H */ diff --git a/include/uapi/linux/if_hippi.h b/include/uapi/linux/if_hippi.h index 785a1452a66c..42c4ffd11dae 100644 --- a/include/uapi/linux/if_hippi.h +++ b/include/uapi/linux/if_hippi.h @@ -20,8 +20,8 @@ * 2 of the License, or (at your option) any later version. */ -#ifndef _LINUX_IF_HIPPI_H -#define _LINUX_IF_HIPPI_H +#ifndef _UAPI_LINUX_IF_HIPPI_H +#define _UAPI_LINUX_IF_HIPPI_H #include #include @@ -151,4 +151,4 @@ struct hippi_hdr { struct hippi_snap_hdr snap; } __attribute__((packed)); -#endif /* _LINUX_IF_HIPPI_H */ +#endif /* _UAPI_LINUX_IF_HIPPI_H */ diff --git a/include/uapi/linux/if_packet.h b/include/uapi/linux/if_packet.h index 1d2718dd9647..6cd1d7a41dfb 100644 --- a/include/uapi/linux/if_packet.h +++ b/include/uapi/linux/if_packet.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -#ifndef __LINUX_IF_PACKET_H -#define __LINUX_IF_PACKET_H +#ifndef _UAPI__LINUX_IF_PACKET_H +#define _UAPI__LINUX_IF_PACKET_H #include #include diff --git a/include/uapi/linux/if_plip.h b/include/uapi/linux/if_plip.h index 495a366112f2..054d86a9c6e6 100644 --- a/include/uapi/linux/if_plip.h +++ b/include/uapi/linux/if_plip.h @@ -9,8 +9,8 @@ * */ -#ifndef _LINUX_IF_PLIP_H -#define _LINUX_IF_PLIP_H +#ifndef _UAPI_LINUX_IF_PLIP_H +#define _UAPI_LINUX_IF_PLIP_H #include diff --git a/include/uapi/linux/if_slip.h b/include/uapi/linux/if_slip.h index 65937be53103..299bf7adc862 100644 --- a/include/uapi/linux/if_slip.h +++ b/include/uapi/linux/if_slip.h @@ -6,8 +6,8 @@ * KISS TNC driver. 
*/ -#ifndef __LINUX_SLIP_H -#define __LINUX_SLIP_H +#ifndef _UAPI__LINUX_SLIP_H +#define _UAPI__LINUX_SLIP_H #define SL_MODE_SLIP 0 #define SL_MODE_CSLIP 1 diff --git a/include/uapi/linux/if_x25.h b/include/uapi/linux/if_x25.h index 3a5938e38370..861cfa983db4 100644 --- a/include/uapi/linux/if_x25.h +++ b/include/uapi/linux/if_x25.h @@ -13,8 +13,8 @@ * GNU General Public License for more details. */ -#ifndef _IF_X25_H -#define _IF_X25_H +#ifndef _UAPI_IF_X25_H +#define _UAPI_IF_X25_H #include @@ -24,4 +24,4 @@ #define X25_IFACE_DISCONNECT 0x02 #define X25_IFACE_PARAMS 0x03 -#endif /* _IF_X25_H */ +#endif /* _UAPI_IF_X25_H */ diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h index 42869770776e..44f2bb93e7e6 100644 --- a/include/uapi/linux/if_xdp.h +++ b/include/uapi/linux/if_xdp.h @@ -7,8 +7,8 @@ * Magnus Karlsson */ -#ifndef _LINUX_IF_XDP_H -#define _LINUX_IF_XDP_H +#ifndef _UAPI_LINUX_IF_XDP_H +#define _UAPI_LINUX_IF_XDP_H #include @@ -180,4 +180,4 @@ struct xdp_desc { /* TX packet carries valid metadata. */ #define XDP_TX_METADATA (1 << 1) -#endif /* _LINUX_IF_XDP_H */ +#endif /* _UAPI_LINUX_IF_XDP_H */ diff --git a/include/uapi/linux/ip6_tunnel.h b/include/uapi/linux/ip6_tunnel.h index 0245269b037c..85182a839d42 100644 --- a/include/uapi/linux/ip6_tunnel.h +++ b/include/uapi/linux/ip6_tunnel.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -#ifndef _IP6_TUNNEL_H -#define _IP6_TUNNEL_H +#ifndef _UAPI_IP6_TUNNEL_H +#define _UAPI_IP6_TUNNEL_H #include #include /* For IFNAMSIZ. 
*/ diff --git a/include/uapi/linux/net_dropmon.h b/include/uapi/linux/net_dropmon.h index 84f622a66a7a..9dd41c2f58a6 100644 --- a/include/uapi/linux/net_dropmon.h +++ b/include/uapi/linux/net_dropmon.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -#ifndef __NET_DROPMON_H -#define __NET_DROPMON_H +#ifndef _UAPI__NET_DROPMON_H +#define _UAPI__NET_DROPMON_H #include #include diff --git a/include/uapi/linux/net_tstamp.h b/include/uapi/linux/net_tstamp.h index 383213de612a..a93e6ea37fb3 100644 --- a/include/uapi/linux/net_tstamp.h +++ b/include/uapi/linux/net_tstamp.h @@ -7,8 +7,8 @@ * */ -#ifndef _NET_TIMESTAMPING_H -#define _NET_TIMESTAMPING_H +#ifndef _UAPI_NET_TIMESTAMPING_H +#define _UAPI_NET_TIMESTAMPING_H #include #include /* for SO_TIMESTAMPING */ @@ -216,4 +216,4 @@ struct sock_txtime { __u32 flags; /* as defined by enum txtime_flags */ }; -#endif /* _NET_TIMESTAMPING_H */ +#endif /* _UAPI_NET_TIMESTAMPING_H */ diff --git a/include/uapi/linux/netlink_diag.h b/include/uapi/linux/netlink_diag.h index dfa61be43d2f..ff28200204bb 100644 --- a/include/uapi/linux/netlink_diag.h +++ b/include/uapi/linux/netlink_diag.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -#ifndef __NETLINK_DIAG_H__ -#define __NETLINK_DIAG_H__ +#ifndef _UAPI__NETLINK_DIAG_H__ +#define _UAPI__NETLINK_DIAG_H__ #include diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index 2c32080416b5..490821364165 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -#ifndef __LINUX_PKT_CLS_H -#define __LINUX_PKT_CLS_H +#ifndef _UAPI__LINUX_PKT_CLS_H +#define _UAPI__LINUX_PKT_CLS_H #include #include diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index 25a9a47001cd..9ea874395717 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -1,6 +1,6 @@ /* 
SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -#ifndef __LINUX_PKT_SCHED_H -#define __LINUX_PKT_SCHED_H +#ifndef _UAPI__LINUX_PKT_SCHED_H +#define _UAPI__LINUX_PKT_SCHED_H #include #include -- cgit v1.2.3 From 9ff2aa4206eff40a202e425f232036bc84ad4c0e Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Mon, 17 Mar 2025 23:07:30 -0400 Subject: net: ethtool: mm: extract stmmac verification logic into common library It appears that stmmac is not the only hardware which requires a software-driven verification state machine for the MAC Merge layer. While on the one hand it's good to encourage hardware implementations, on the other hand it's quite difficult to tolerate multiple drivers implementing independently fairly non-trivial logic. Extract the hardware-independent logic from stmmac into library code and put it in ethtool. Name the state structure "mmsv" for MAC Merge Software Verification. Let this expose an operations structure for executing the hardware stuff: sync hardware with the tx_active boolean (result of verification process), enable/disable the pMAC, send mPackets, notify library of external events (reception of mPackets), as well as link state changes. Note that it is assumed that the external events are received in hardirq context. If they are not, it is probably a good idea to disable hardirqs when calling ethtool_mmsv_event_handle(), because the library does not do so. Also, the MM software verification process has no business with the tx_min_frag_size, that is all the driver's to handle. 
Signed-off-by: Vladimir Oltean Co-developed-by: Choong Yong Liang Signed-off-by: Choong Yong Liang Tested-by: Choong Yong Liang Tested-by: Furong Xu <0x1207@gmail.com> Reviewed-by: Vladimir Oltean Signed-off-by: Faizal Rahim Signed-off-by: Tony Nguyen --- include/linux/ethtool.h | 73 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) (limited to 'include') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 7edb5f5e7134..117718c24814 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -17,9 +17,13 @@ #include #include #include +#include #include #include +#define ETHTOOL_MM_MAX_VERIFY_TIME_MS 128 +#define ETHTOOL_MM_MAX_VERIFY_RETRIES 3 + struct compat_ethtool_rx_flow_spec { u32 flow_type; union ethtool_flow_union h_u; @@ -718,6 +722,75 @@ struct ethtool_mm_stats { u64 MACMergeHoldCount; }; +enum ethtool_mmsv_event { + ETHTOOL_MMSV_LP_SENT_VERIFY_MPACKET, + ETHTOOL_MMSV_LD_SENT_VERIFY_MPACKET, + ETHTOOL_MMSV_LP_SENT_RESPONSE_MPACKET, +}; + +/* MAC Merge verification mPacket type */ +enum ethtool_mpacket { + ETHTOOL_MPACKET_VERIFY, + ETHTOOL_MPACKET_RESPONSE, +}; + +struct ethtool_mmsv; + +/** + * struct ethtool_mmsv_ops - Operations for MAC Merge Software Verification + * @configure_tx: Driver callback for the event where the preemptible TX + * becomes active or inactive. Preemptible traffic + * classes must be committed to hardware only while + * preemptible TX is active. + * @configure_pmac: Driver callback for the event where the pMAC state + * changes as result of an administrative setting + * (ethtool) or a call to ethtool_mmsv_link_state_handle(). + * @send_mpacket: Driver-provided method for sending a Verify or a Response + * mPacket. 
+ */ +struct ethtool_mmsv_ops { + void (*configure_tx)(struct ethtool_mmsv *mmsv, bool tx_active); + void (*configure_pmac)(struct ethtool_mmsv *mmsv, bool pmac_enabled); + void (*send_mpacket)(struct ethtool_mmsv *mmsv, enum ethtool_mpacket mpacket); +}; + +/** + * struct ethtool_mmsv - MAC Merge Software Verification + * @ops: operations for MAC Merge Software Verification + * @dev: pointer to net_device structure + * @lock: serialize access to MAC Merge state between + * ethtool requests and link state updates. + * @status: current verification FSM state + * @verify_timer: timer for verification in local TX direction + * @verify_enabled: indicates if verification is enabled + * @verify_retries: number of retries for verification + * @pmac_enabled: indicates if the preemptible MAC is enabled + * @verify_time: time for verification in milliseconds + * @tx_enabled: indicates if transmission is enabled + */ +struct ethtool_mmsv { + const struct ethtool_mmsv_ops *ops; + struct net_device *dev; + spinlock_t lock; + enum ethtool_mm_verify_status status; + struct timer_list verify_timer; + bool verify_enabled; + int verify_retries; + bool pmac_enabled; + u32 verify_time; + bool tx_enabled; +}; + +void ethtool_mmsv_stop(struct ethtool_mmsv *mmsv); +void ethtool_mmsv_link_state_handle(struct ethtool_mmsv *mmsv, bool up); +void ethtool_mmsv_event_handle(struct ethtool_mmsv *mmsv, + enum ethtool_mmsv_event event); +void ethtool_mmsv_get_mm(struct ethtool_mmsv *mmsv, + struct ethtool_mm_state *state); +void ethtool_mmsv_set_mm(struct ethtool_mmsv *mmsv, struct ethtool_mm_cfg *cfg); +void ethtool_mmsv_init(struct ethtool_mmsv *mmsv, struct net_device *dev, + const struct ethtool_mmsv_ops *ops); + /** * struct ethtool_rxfh_param - RXFH (RSS) parameters * @hfunc: Defines the current RSS hash function used by HW (or to be set to). 
-- cgit v1.2.3 From 094adad91310d9f8f8485251129482f4f3e2c5b3 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 15 Apr 2025 15:11:34 +0300 Subject: vxlan: Use a single lock to protect the FDB table Currently, the VXLAN driver stores FDB entries in a hash table with a fixed number of buckets (256). Subsequent patches are going to convert this table to rhashtable with a linked list for entry traversal, as rhashtable is more scalable. In preparation for this conversion, move from a per-bucket spin lock to a single spin lock that protects the entire FDB table. The per-bucket spin locks were introduced by commit fe1e0713bbe8 ("vxlan: Use FDB_HASH_SIZE hash_locks to reduce contention") citing "huge contention when inserting/deleting vxlan_fdbs into the fdb_head". It is not clear from the commit message which code path was holding the spin lock for long periods of time, but the obvious suspect is the FDB cleanup routine (vxlan_cleanup()) that periodically traverses the entire table in order to delete aged-out entries. This will be solved by subsequent patches that will convert the FDB cleanup routine to traverse the linked list of FDB entries using RCU, only acquiring the spin lock when deleting an aged-out entry. The change reduces the size of the VXLAN device structure from 3600 bytes to 2576 bytes. 
Reviewed-by: Petr Machata Signed-off-by: Ido Schimmel Link: https://patch.msgid.link/20250415121143.345227-7-idosch@nvidia.com Reviewed-by: Nikolay Aleksandrov Signed-off-by: Paolo Abeni --- include/net/vxlan.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/vxlan.h b/include/net/vxlan.h index 2dd23ee2bacd..272e11708a33 100644 --- a/include/net/vxlan.h +++ b/include/net/vxlan.h @@ -296,7 +296,7 @@ struct vxlan_dev { struct vxlan_rdst default_dst; /* default destination */ struct timer_list age_timer; - spinlock_t hash_lock[FDB_HASH_SIZE]; + spinlock_t hash_lock; unsigned int addrcnt; struct gro_cells gro_cells; -- cgit v1.2.3 From 8d45673d2d2e59d03e108c569a3e8c031aa534c8 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 15 Apr 2025 15:11:35 +0300 Subject: vxlan: Add a linked list of FDB entries Currently, FDB entries are stored in a hash table with a fixed number of buckets. The table is used for both lookups and entry traversal. Subsequent patches will convert the table to rhashtable which is not suitable for entry traversal. In preparation for this conversion, add FDB entries to a linked list. Subsequent patches will convert the driver to use this list when traversing entries during dump, flush, etc. 
Reviewed-by: Petr Machata Signed-off-by: Ido Schimmel Link: https://patch.msgid.link/20250415121143.345227-8-idosch@nvidia.com Reviewed-by: Nikolay Aleksandrov Signed-off-by: Paolo Abeni --- include/net/vxlan.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/vxlan.h b/include/net/vxlan.h index 272e11708a33..96a6c6f45c2e 100644 --- a/include/net/vxlan.h +++ b/include/net/vxlan.h @@ -307,6 +307,7 @@ struct vxlan_dev { struct hlist_head fdb_head[FDB_HASH_SIZE]; struct rhashtable mdb_tbl; + struct hlist_head fdb_list; struct hlist_head mdb_list; unsigned int mdb_seq; }; -- cgit v1.2.3 From 1f763fa808e92a67feea8364ef80ca3065d74702 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 15 Apr 2025 15:11:43 +0300 Subject: vxlan: Convert FDB table to rhashtable FDB entries are currently stored in a hash table with a fixed number of buckets (256), resulting in performance degradation as the number of entries grows. Solve this by converting the driver to use rhashtable which maintains more or less constant performance regardless of the number of entries. Measured transmitted packets per second using a single pktgen thread with varying number of entries when the transmitted packet always hits the default entry (worst case): Number of entries | Improvement ------------------|------------ 1k | +1.12% 4k | +9.22% 16k | +55% 64k | +585% 256k | +2460% In addition, the change reduces the size of the VXLAN device structure from 2584 bytes to 672 bytes. 
Reviewed-by: Petr Machata Signed-off-by: Ido Schimmel Link: https://patch.msgid.link/20250415121143.345227-16-idosch@nvidia.com Reviewed-by: Nikolay Aleksandrov Signed-off-by: Paolo Abeni --- include/net/vxlan.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/vxlan.h b/include/net/vxlan.h index 96a6c6f45c2e..e2f7ca045d3e 100644 --- a/include/net/vxlan.h +++ b/include/net/vxlan.h @@ -304,7 +304,7 @@ struct vxlan_dev { struct vxlan_vni_group __rcu *vnigrp; - struct hlist_head fdb_head[FDB_HASH_SIZE]; + struct rhashtable fdb_hash_tbl; struct rhashtable mdb_tbl; struct hlist_head fdb_list; -- cgit v1.2.3 From 45bd443bfd8697a7da308c16c3e75e2bb353b3d1 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Fri, 18 Apr 2025 02:15:19 +0100 Subject: net: 802: Remove unused p8022 code p8022.c defines two external functions, register_8022_client() and unregister_8022_client(), the last use of which was removed in 2018 by commit 7a2e838d28cf ("staging: ipx: delete it from the tree") Remove the p8022.c file, its corresponding header, and glue surrounding it. There was one place the header was included in vlan.c but it didn't use the functions it declared. There was a comment in net/802/Makefile about checking against net/core/Makefile, but that's at least 20 years old and there's no sign of net/core/Makefile mentioning it. Signed-off-by: Dr.
David Alan Gilbert Link: https://patch.msgid.link/20250418011519.145320-1-linux@treblig.org Signed-off-by: Jakub Kicinski --- include/net/p8022.h | 16 ---------------- 1 file changed, 16 deletions(-) delete mode 100644 include/net/p8022.h (limited to 'include') diff --git a/include/net/p8022.h b/include/net/p8022.h deleted file mode 100644 index a29e224ac498..000000000000 --- a/include/net/p8022.h +++ /dev/null @@ -1,16 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _NET_P8022_H -#define _NET_P8022_H - -struct net_device; -struct packet_type; -struct sk_buff; - -struct datalink_proto * -register_8022_client(unsigned char type, - int (*func)(struct sk_buff *skb, - struct net_device *dev, - struct packet_type *pt, - struct net_device *orig_dev)); -void unregister_8022_client(struct datalink_proto *proto); -#endif -- cgit v1.2.3 From 145436ae01193c0a379fd3ea9c4fbdf32863db1f Mon Sep 17 00:00:00 2001 From: Dimitri Fedrau Date: Wed, 16 Apr 2025 19:14:49 +0200 Subject: net: phy: Add helper for getting MAC termination resistance Add helper which returns the MAC termination resistance value. Modifying the resistance to an appropriate value can reduce signal reflections and therefore improve signal quality. 
Reviewed-by: Russell King (Oracle) Signed-off-by: Dimitri Fedrau Link: https://patch.msgid.link/20250416-dp83822-mac-impedance-v3-3-028ac426cddb@liebherr.com Signed-off-by: Jakub Kicinski --- include/linux/phy.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/phy.h b/include/linux/phy.h index fb755358d965..066a28a4b64b 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -2040,6 +2040,9 @@ int phy_get_tx_amplitude_gain(struct phy_device *phydev, struct device *dev, enum ethtool_link_mode_bit_indices linkmode, u32 *val); +int phy_get_mac_termination(struct phy_device *phydev, struct device *dev, + u32 *val); + void phy_resolve_pause(unsigned long *local_adv, unsigned long *partner_adv, bool *tx_pause, bool *rx_pause); -- cgit v1.2.3 From 0e0a7e3719bc8cbe6d6e30b3e81f21472ecba5bc Mon Sep 17 00:00:00 2001 From: Joshua Washington Date: Mon, 21 Apr 2025 18:16:32 -0700 Subject: xdp: create locked/unlocked instances of xdp redirect target setters Commit 03df156dd3a6 ("xdp: double protect netdev->xdp_flags with netdev->lock") introduces the netdev lock to xdp_set_features_flag(). The change includes a _locked version of the method, as it is possible for a driver to have already acquired the netdev lock before calling this helper. 
However, the same applies to xdp_features_(set|clear)_redirect_flags(), which ends up calling the unlocked version of xdp_set_features_flags() leading to deadlocks in GVE, which grabs the netdev lock as part of its suspend, reset, and shutdown processes: [ 833.265543] WARNING: possible recursive locking detected [ 833.270949] 6.15.0-rc1 #6 Tainted: G E [ 833.276271] -------------------------------------------- [ 833.281681] systemd-shutdow/1 is trying to acquire lock: [ 833.287090] ffff949d2b148c68 (&dev->lock){+.+.}-{4:4}, at: xdp_set_features_flag+0x29/0x90 [ 833.295470] [ 833.295470] but task is already holding lock: [ 833.301400] ffff949d2b148c68 (&dev->lock){+.+.}-{4:4}, at: gve_shutdown+0x44/0x90 [gve] [ 833.309508] [ 833.309508] other info that might help us debug this: [ 833.316130] Possible unsafe locking scenario: [ 833.316130] [ 833.322142] CPU0 [ 833.324681] ---- [ 833.327220] lock(&dev->lock); [ 833.330455] lock(&dev->lock); [ 833.333689] [ 833.333689] *** DEADLOCK *** [ 833.333689] [ 833.339701] May be due to missing lock nesting notation [ 833.339701] [ 833.346582] 5 locks held by systemd-shutdow/1: [ 833.351205] #0: ffffffffa9c89130 (system_transition_mutex){+.+.}-{4:4}, at: __se_sys_reboot+0xe6/0x210 [ 833.360695] #1: ffff93b399e5c1b8 (&dev->mutex){....}-{4:4}, at: device_shutdown+0xb4/0x1f0 [ 833.369144] #2: ffff949d19a471b8 (&dev->mutex){....}-{4:4}, at: device_shutdown+0xc2/0x1f0 [ 833.377603] #3: ffffffffa9eca050 (rtnl_mutex){+.+.}-{4:4}, at: gve_shutdown+0x33/0x90 [gve] [ 833.386138] #4: ffff949d2b148c68 (&dev->lock){+.+.}-{4:4}, at: gve_shutdown+0x44/0x90 [gve] Introduce xdp_features_(set|clear)_redirect_target_locked() versions which assume that the netdev lock has already been acquired before setting the XDP feature flag and update GVE to use the locked version. 
Fixes: 03df156dd3a6 ("xdp: double protect netdev->xdp_flags with netdev->lock") Tested-by: Mina Almasry Reviewed-by: Willem de Bruijn Reviewed-by: Harshitha Ramamurthy Signed-off-by: Joshua Washington Acked-by: Stanislav Fomichev Acked-by: Martin KaFai Lau Link: https://patch.msgid.link/20250422011643.3509287-1-joshwash@google.com Signed-off-by: Jakub Kicinski --- include/net/xdp.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/net/xdp.h b/include/net/xdp.h index 20e41b5ff319..b40f1f96cb11 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -618,7 +618,10 @@ bool bpf_dev_bound_kfunc_id(u32 btf_id); void xdp_set_features_flag(struct net_device *dev, xdp_features_t val); void xdp_set_features_flag_locked(struct net_device *dev, xdp_features_t val); void xdp_features_set_redirect_target(struct net_device *dev, bool support_sg); +void xdp_features_set_redirect_target_locked(struct net_device *dev, + bool support_sg); void xdp_features_clear_redirect_target(struct net_device *dev); +void xdp_features_clear_redirect_target_locked(struct net_device *dev); #else static inline u32 bpf_xdp_metadata_kfunc_id(int id) { return 0; } static inline bool bpf_dev_bound_kfunc_id(u32 btf_id) { return false; } -- cgit v1.2.3 From 76a853f86c97b348dc96e75a6e6f94d8750715ee Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 13 Mar 2025 13:49:39 +0100 Subject: wifi: free SKBTX_WIFI_STATUS skb tx_flags flag Jason mentioned at netdevconf that we've run out of tx_flags in the skb_shinfo(). Gain one bit back by removing the wifi bit. We can do that because the only userspace application for it (hostapd) doesn't change the setting on the socket, it just uses different sockets, and normally doesn't even use this any more, sending the frames over nl80211 instead. 
Reviewed-by: Jason Xing Link: https://patch.msgid.link/20250313134942.52ff54a140ec.If390bbdc46904cf451256ba989d7a056c457af6e@changeid Signed-off-by: Johannes Berg --- include/linux/skbuff.h | 3 --- include/net/sock.h | 2 -- 2 files changed, 5 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index b974a277975a..9ee39670e8f4 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -481,9 +481,6 @@ enum { /* generate software time stamp on packet tx completion */ SKBTX_COMPLETION_TSTAMP = 1 << 3, - /* generate wifi status information (where possible) */ - SKBTX_WIFI_STATUS = 1 << 4, - /* determine hardware time stamp based on time or cycles */ SKBTX_HW_TSTAMP_NETDEV = 1 << 5, diff --git a/include/net/sock.h b/include/net/sock.h index 694f954258d4..36b219109790 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2736,8 +2736,6 @@ static inline void _sock_tx_timestamp(struct sock *sk, *tskey = atomic_inc_return(&sk->sk_tskey) - 1; } } - if (unlikely(sock_flag(sk, SOCK_WIFI_STATUS))) - *tx_flags |= SKBTX_WIFI_STATUS; } static inline void sock_tx_timestamp(struct sock *sk, -- cgit v1.2.3 From 996c15bd30a9caf5d3a32414a28503f3389fc96e Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Sat, 29 Mar 2025 22:14:20 +0100 Subject: wifi: cfg80211/mac80211: remove more 5/10 MHz code We still have ieee80211_chandef_rate_flags() and all that, but all the users seem pretty much broken (deflink, etc.) Remove all the code. It's been two years since last anyone even vaguely entertained the notion of looking at this and fixing it. 
Link: https://patch.msgid.link/20250329221419.c31da7ae8c84.I1a3a4b6008134d66ca75a5bdfc004f4594da8145@changeid Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 37 ------------------------------------- 1 file changed, 37 deletions(-) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index efbd79c67be2..6df4e17f1437 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1096,43 +1096,6 @@ int cfg80211_chandef_primary(const struct cfg80211_chan_def *chandef, **/ int nl80211_send_chandef(struct sk_buff *msg, const struct cfg80211_chan_def *chandef); -/** - * ieee80211_chanwidth_rate_flags - return rate flags for channel width - * @width: the channel width of the channel - * - * In some channel types, not all rates may be used - for example CCK - * rates may not be used in 5/10 MHz channels. - * - * Returns: rate flags which apply for this channel width - */ -static inline enum ieee80211_rate_flags -ieee80211_chanwidth_rate_flags(enum nl80211_chan_width width) -{ - switch (width) { - case NL80211_CHAN_WIDTH_5: - return IEEE80211_RATE_SUPPORTS_5MHZ; - case NL80211_CHAN_WIDTH_10: - return IEEE80211_RATE_SUPPORTS_10MHZ; - default: - break; - } - return 0; -} - -/** - * ieee80211_chandef_rate_flags - returns rate flags for a channel - * @chandef: channel definition for the channel - * - * See ieee80211_chanwidth_rate_flags(). 
- * - * Returns: rate flags which apply for this channel - */ -static inline enum ieee80211_rate_flags -ieee80211_chandef_rate_flags(struct cfg80211_chan_def *chandef) -{ - return ieee80211_chanwidth_rate_flags(chandef->width); -} - /** * ieee80211_chandef_max_power - maximum transmission power for the chandef * -- cgit v1.2.3 From 4876376988081d636a4c4e5f03a5556386b49087 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Thu, 3 Apr 2025 20:39:28 +0200 Subject: Revert "mac80211: Dynamically set CoDel parameters per station" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit 484a54c2e597dbc4ace79c1687022282905afba0. The CoDel parameter change essentially disables CoDel on slow stations, with some questionable assumptions, as Dave pointed out in [0]. Quoting from there: But here are my pithy comments as to why this part of mac80211 is so wrong... static void sta_update_codel_params(struct sta_info *sta, u32 thr) { - if (thr && thr < STA_SLOW_THRESHOLD * sta->local->num_sta) { 1) sta->local->num_sta is the number of associated, rather than active, stations. "Active" stations in the last 50ms or so, might have been a better thing to use, but as most people have far more than that associated, we end up with really lousy codel parameters, all the time. Mistake numero uno! 2) The STA_SLOW_THRESHOLD was completely arbitrary in 2016. - sta->cparams.target = MS2TIME(50); This, by itself, was probably not too bad. 30ms might have been better, at the time, when we were battling powersave etc, but 20ms was enough, really, to cover most scenarios, even where we had low rate 2Ghz multicast to cope with. Even then, codel has a hard time finding any sane drop rate at all, with a target this high. - sta->cparams.interval = MS2TIME(300); But this was horrible, a total mistake, that is leading to codel being completely ineffective in almost any scenario on clients or APS. 
100ms, even 80ms, here, would be vastly better than this insanity. I'm seeing 5+seconds of delay accumulated in a bunch of otherwise happily fq-ing APs.... 100ms of observed jitter during a flow is enough. Certainly (in 2016) there were interactions with powersave that I did not understand, and still don't, but if you are transmitting in the first place, powersave shouldn't be a problemmmm..... - sta->cparams.ecn = false; At the time we were pretty nervous about ecn, I'm kind of sanguine about it now, and reliably indicating ecn seems better than turning it off for any reason. [...] In production, on p2p wireless, I've had 8ms and 80ms for target and interval for years now, and it works great. I think Dave's arguments above are basically sound on the face of it, and various experimentation with tighter CoDel parameters in the OpenWrt community have shown promising results[1]. So I don't think there's any reason to keep this parameter fiddling; hence this revert. [0] https://lore.kernel.org/linux-wireless/CAA93jw6NJ2cmLmMauz0xAgC2MGbBq6n0ZiZzAdkK0u4b+O2yXg@mail.gmail.com/ [1] https://forum.openwrt.org/t/reducing-multiplexing-latencies-still-further-in-wifi/133605/130 Suggested-By: Dave Taht In-memory-of: Dave Taht Signed-off-by: Toke Høiland-Jørgensen Link: https://patch.msgid.link/20250403183930.197716-1-toke@toke.dk Signed-off-by: Johannes Berg --- include/net/mac80211.h | 16 ---------------- 1 file changed, 16 deletions(-) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index c498f685d01f..5349df596157 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -5346,22 +5346,6 @@ void ieee80211_get_tx_rates(struct ieee80211_vif *vif, struct ieee80211_tx_rate *dest, int max_rates); -/** - * ieee80211_sta_set_expected_throughput - set the expected tpt for a station - * - * Call this function to notify mac80211 about a change in expected throughput - * to a station.
A driver for a device that does rate control in firmware can - * call this function when the expected throughput estimate towards a station - * changes. The information is used to tune the CoDel AQM applied to traffic - * going towards that station (which can otherwise be too aggressive and cause - * slow stations to starve). - * - * @pubsta: the station to set throughput for. - * @thr: the current expected throughput in kbps. - */ -void ieee80211_sta_set_expected_throughput(struct ieee80211_sta *pubsta, - u32 thr); - /** * ieee80211_tx_rate_update - transmit rate update callback * -- cgit v1.2.3 From fcc2d3e11bcc8f01d52a8c419f49f86ff8343b7c Mon Sep 17 00:00:00 2001 From: Karthikeyan Kathirvel Date: Mon, 21 Apr 2025 16:45:05 +0530 Subject: wifi: ieee80211: define beacon protection bit field An AP supporting Beacon Protection should set bit 84 in the extended capabilities IE (9.4.2.25 in the 802.11be D7 spec). So the *4th* bit of the 10th byte should be checked to figure out whether beacon protection is enabled or disabled. 
Signed-off-by: Karthikeyan Kathirvel Reviewed-by: Jeff Johnson Link: https://patch.msgid.link/20250421111505.3633992-1-karthikeyan.kathirvel@oss.qualcomm.com Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 508d466de1cc..cbc3928aa504 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -4087,6 +4087,9 @@ enum ieee80211_tdls_actioncode { /* Defines support for enhanced multi-bssid advertisement*/ #define WLAN_EXT_CAPA11_EMA_SUPPORT BIT(3) +/* Enable Beacon Protection */ +#define WLAN_EXT_CAPA11_BCN_PROTECT BIT(4) + /* TDLS specific payload type in the LLC/SNAP header */ #define WLAN_TDLS_SNAP_RFTYPE 0x2 -- cgit v1.2.3 From 53160d0edf7336acaed4c74c6af8549d87c92ae6 Mon Sep 17 00:00:00 2001 From: Ramasamy Kaliappan Date: Thu, 27 Mar 2025 10:43:17 +0530 Subject: wifi: cfg80211: Add support to get EMLSR capabilities of non-AP MLD The Enhanced multi-link single-radio (EMLSR) operation allows a non-AP MLD with multiple receive chains to listen on one or more EMLSR links when the corresponding non-AP STA(s) affiliated with the non-AP MLD is (are) in the awake state. [IEEE 802.11be-2024, (35.3.17 Enhanced multi-link single-radio (EMLSR) operation)] An MLD which intends to enable EMLSR operations will set the EML Capabilities Present subfield to 1 and shall set the EMLSR Support subfield in the Common Info field of the Basic Multi-Link element to 1 in all Management frames that include the Basic Multi-Link element except Authentication frames. EML capabilities contain information such as EML Transition timeout, Padding delay and Transition delay. These fields need to be passed to drivers to trigger EMLSR operation and to transmit and receive initial control frame and data frames.
Add support to receive EML Capabilities subfield that non-AP MLD advertises during (re)association request and send it to underlying drivers during ADD/SET station. Signed-off-by: Ramasamy Kaliappan Signed-off-by: Rameshkumar Sundaram Link: https://patch.msgid.link/20250327051320.3253783-2-quic_ramess@quicinc.com [accept EMLSR capabilities only for unassoc AP STA] Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 6df4e17f1437..87cb66fba621 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1733,6 +1733,9 @@ struct cfg80211_ttlm_params { * @supported_oper_classes_len: number of supported operating classes * @support_p2p_ps: information if station supports P2P PS mechanism * @airtime_weight: airtime scheduler weight for this station + * @eml_cap_present: Specifies if EML capabilities field (@eml_cap) is + * present/updated + * @eml_cap: EML capabilities of this station * @link_sta_params: link related params. */ struct station_parameters { @@ -1757,6 +1760,8 @@ struct station_parameters { u8 supported_oper_classes_len; int support_p2p_ps; u16 airtime_weight; + bool eml_cap_present; + u16 eml_cap; struct link_station_parameters link_sta_params; }; -- cgit v1.2.3 From 14e0f59a88cc22ceeb36e26b89b70b22292d23de Mon Sep 17 00:00:00 2001 From: Ramasamy Kaliappan Date: Thu, 27 Mar 2025 10:43:18 +0530 Subject: wifi: mac80211: update ML STA with EML capabilities When an AP and Non-AP MLD operates in EMLSR mode, EML capabilities advertised during Association contains information such as EMLSR transition delay, padding delay and transition timeout values. Save the EML capabilities information that is received during station addition and capabilities update in ieee80211_sta so that drivers can use it for triggering EMLSR operation. 
Signed-off-by: Ramasamy Kaliappan Signed-off-by: Rameshkumar Sundaram Link: https://patch.msgid.link/20250327051320.3253783-3-quic_ramess@quicinc.com Signed-off-by: Johannes Berg --- include/net/mac80211.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 5349df596157..c305ebfa6e45 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -2488,6 +2488,7 @@ struct ieee80211_link_sta { * @max_amsdu_subframes: indicates the maximal number of MSDUs in a single * A-MSDU. Taken from the Extended Capabilities element. 0 means * unlimited. + * @eml_cap: EML capabilities of this MLO station * @cur: currently valid data as aggregated from the active links * For non MLO STA it will point to the deflink data. For MLO STA * ieee80211_sta_recalc_aggregates() must be called to update it. @@ -2522,6 +2523,7 @@ struct ieee80211_sta { bool mlo; bool spp_amsdu; u8 max_amsdu_subframes; + u16 eml_cap; struct ieee80211_sta_aggregates *cur; -- cgit v1.2.3 From 91ea0489dc97bfda72ed74f98ab66dc0ab4235c7 Mon Sep 17 00:00:00 2001 From: Rameshkumar Sundaram Date: Thu, 27 Mar 2025 10:43:19 +0530 Subject: wifi: ieee80211: Add helpers to fetch EMLSR delay and timeout values Add helpers to get EMLSR transition delay, padding delay and transition timeout values from EML capabilities field of Multi-link Element. 
Signed-off-by: Rameshkumar Sundaram Link: https://patch.msgid.link/20250327051320.3253783-4-quic_ramess@quicinc.com Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 74 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) (limited to 'include') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index cbc3928aa504..15a87f522017 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -5618,6 +5618,80 @@ static inline bool ieee80211_tid_to_link_map_size_ok(const u8 *data, size_t len) return len >= fixed + elem_len; } +/** + * ieee80211_emlsr_pad_delay_in_us - Fetch the EMLSR Padding delay + * in microseconds + * @eml_cap: EML capabilities field value from common info field of + * the Multi-link element + * Return: the EMLSR Padding delay (in microseconds) encoded in the + * EML Capabilities field + */ + +static inline u32 ieee80211_emlsr_pad_delay_in_us(u16 eml_cap) +{ + /* IEEE Std 802.11be-2024 Table 9-417i—Encoding of the EMLSR + * Padding Delay subfield. + */ + u32 pad_delay = u16_get_bits(eml_cap, + IEEE80211_EML_CAP_EMLSR_PADDING_DELAY); + + if (!pad_delay || + pad_delay > IEEE80211_EML_CAP_EMLSR_PADDING_DELAY_256US) + return 0; + + return 32 * (1 << (pad_delay - 1)); +} + +/** + * ieee80211_emlsr_trans_delay_in_us - Fetch the EMLSR Transition + * delay in microseconds + * @eml_cap: EML capabilities field value from common info field of + * the Multi-link element + * Return: the EMLSR Transition delay (in microseconds) encoded in the + * EML Capabilities field + */ + +static inline u32 ieee80211_emlsr_trans_delay_in_us(u16 eml_cap) +{ + /* IEEE Std 802.11be-2024 Table 9-417j—Encoding of the EMLSR + * Transition Delay subfield. 
+ */ + u32 trans_delay = + u16_get_bits(eml_cap, + IEEE80211_EML_CAP_EMLSR_TRANSITION_DELAY); + + /* invalid values also just use 0 */ + if (!trans_delay || + trans_delay > IEEE80211_EML_CAP_EMLSR_TRANSITION_DELAY_256US) + return 0; + + return 16 * (1 << (trans_delay - 1)); +} + +/** + * ieee80211_eml_trans_timeout_in_us - Fetch the EMLSR Transition + * timeout value in microseconds + * @eml_cap: EML capabilities field value from common info field of + * the Multi-link element + * Return: the EMLSR Transition timeout (in microseconds) encoded in + * the EML Capabilities field + */ + +static inline u32 ieee80211_eml_trans_timeout_in_us(u16 eml_cap) +{ + /* IEEE Std 802.11be-2024 Table 9-417m—Encoding of the + * Transition Timeout subfield. + */ + u8 timeout = u16_get_bits(eml_cap, + IEEE80211_EML_CAP_TRANSITION_TIMEOUT); + + /* invalid values also just use 0 */ + if (!timeout || timeout > IEEE80211_EML_CAP_TRANSITION_TIMEOUT_128TU) + return 0; + + return 128 * (1 << (timeout - 1)); +} + #define for_each_mle_subelement(_elem, _data, _len) \ if (ieee80211_mle_size_ok(_data, _len)) \ for_each_element(_elem, \ -- cgit v1.2.3 From 37523c3c47b3f3cc4c7d2ff47d28ee9ec99317c1 Mon Sep 17 00:00:00 2001 From: Rameshkumar Sundaram Date: Tue, 8 Apr 2025 11:44:59 -0700 Subject: wifi: nl80211: add link id of transmitted profile for MLO MBSSID During non-transmitted (nontx) profile configuration, interface index of the transmitted (tx) profile is used to retrieve the wireless device (wdev) associated with it. With MLO, this 'wdev' may be part of an MLD with more than one link, hence only interface index is not sufficient anymore to retrieve the correct tx profile. Add a new attribute to configure link id of tx profile. 
Signed-off-by: Rameshkumar Sundaram Co-developed-by: Muna Sinada Signed-off-by: Muna Sinada Co-developed-by: Aloka Dixit Signed-off-by: Aloka Dixit Link: https://patch.msgid.link/20250408184501.3715887-2-aloka.dixit@oss.qualcomm.com Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 2 ++ include/uapi/linux/nl80211.h | 6 ++++++ 2 files changed, 8 insertions(+) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 87cb66fba621..d1848dc8ec99 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1263,11 +1263,13 @@ struct cfg80211_crypto_settings { * struct cfg80211_mbssid_config - AP settings for multi bssid * * @tx_wdev: pointer to the transmitted interface in the MBSSID set + * @tx_link_id: link ID of the transmitted profile in an MLD. * @index: index of this AP in the multi bssid group. * @ema: set to true if the beacons should be sent out in EMA mode. */ struct cfg80211_mbssid_config { struct wireless_dev *tx_wdev; + u8 tx_link_id; u8 index; bool ema; }; diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index ddcc4cda74af..e9ccf43fe3c6 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -8036,6 +8036,11 @@ enum nl80211_sar_specs_attrs { * Setting this flag is permitted only if the driver advertises EMA support * by setting wiphy->ema_max_profile_periodicity to non-zero. * + * @NL80211_MBSSID_CONFIG_ATTR_TX_LINK_ID: Link ID of the transmitted profile. + * This parameter is mandatory when NL80211_ATTR_MBSSID_CONFIG attributes + * are sent for a non-transmitted profile and if the transmitted profile + * is part of an MLD. For all other cases this parameter is unnecessary. 
+ * * @__NL80211_MBSSID_CONFIG_ATTR_LAST: Internal * @NL80211_MBSSID_CONFIG_ATTR_MAX: highest attribute */ @@ -8047,6 +8052,7 @@ enum nl80211_mbssid_config_attributes { NL80211_MBSSID_CONFIG_ATTR_INDEX, NL80211_MBSSID_CONFIG_ATTR_TX_IFINDEX, NL80211_MBSSID_CONFIG_ATTR_EMA, + NL80211_MBSSID_CONFIG_ATTR_TX_LINK_ID, /* keep last */ __NL80211_MBSSID_CONFIG_ATTR_LAST, -- cgit v1.2.3 From f600832794c91d7021d7337104734246b02a2b86 Mon Sep 17 00:00:00 2001 From: Rameshkumar Sundaram Date: Tue, 8 Apr 2025 11:45:00 -0700 Subject: wifi: mac80211: restructure tx profile retrieval for MLO MBSSID For MBSSID, each vif (struct ieee80211_vif) stores another vif pointer for the transmitting profile of MBSSID set. This won't suffice for MLO as there may be multiple links, each of which can be part of different MBSSID sets. Hence the information needs to be stored per-link. Additionally, the transmitted profile itself may be part of an MLD hence storing vif will not suffice either. Fix MLO by storing an instance of struct ieee80211_bss_conf for each link. Modify following operations to reflect the above structure updates: - channel switch completion - BSS color change completion - Removing nontransmitted links in ieee80211_stop_mbssid() - drivers retrieving the transmitted link for beacon templates. Signed-off-by: Rameshkumar Sundaram Co-developed-by: Muna Sinada Signed-off-by: Muna Sinada Co-developed-by: Aloka Dixit Signed-off-by: Aloka Dixit Link: https://patch.msgid.link/20250408184501.3715887-3-aloka.dixit@oss.qualcomm.com Signed-off-by: Johannes Berg --- include/net/mac80211.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index c305ebfa6e45..fdafc37d17cc 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -682,6 +682,9 @@ struct ieee80211_parsed_tpe { * responder functionality. * @ftmr_params: configurable lci/civic parameter when enabling FTM responder. 
* @nontransmitted: this BSS is a nontransmitted BSS profile + * @tx_bss_conf: Pointer to the BSS configuration of transmitting interface + * if MBSSID is enabled. This pointer is RCU-protected due to CSA finish + * and BSS color change flows accessing it. * @transmitter_bssid: the address of transmitter AP * @bssid_index: index inside the multiple BSSID set * @bssid_indicator: 2^bssid_indicator is the maximum number of APs in set @@ -804,6 +807,7 @@ struct ieee80211_bss_conf { struct ieee80211_ftm_responder_params *ftmr_params; /* Multiple BSSID data */ bool nontransmitted; + struct ieee80211_bss_conf __rcu *tx_bss_conf; u8 transmitter_bssid[ETH_ALEN]; u8 bssid_index; u8 bssid_indicator; @@ -2023,7 +2027,6 @@ enum ieee80211_neg_ttlm_res { * @txq: the multicast data TX queue * @offload_flags: 802.3 -> 802.11 enapsulation offload flags, see * &enum ieee80211_offload_flags. - * @mbssid_tx_vif: Pointer to the transmitting interface if MBSSID is enabled. */ struct ieee80211_vif { enum nl80211_iftype type; @@ -2052,8 +2055,6 @@ struct ieee80211_vif { bool probe_req_reg; bool rx_mcast_action_reg; - struct ieee80211_vif *mbssid_tx_vif; - /* must be last */ u8 drv_priv[] __aligned(sizeof(void *)); }; -- cgit v1.2.3 From 52358dd63e348c3b6c488acc105be1aeda8fb923 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Fri, 18 Apr 2025 11:04:01 +0200 Subject: net: phy: remove function stubs All callers of these functions depend on PHYLIB or select it directly or indirectly by selecting PHYLINK. Stubs make sense for optional functionality, but that's not the case here. MDIO_XGENE usually is selected by NET_XGENE which also selects PHYLIB. Add a dependency to PHYLIB nevertheless, in order not to break randconfig builds. 
Signed-off-by: Heiner Kallweit Link: https://patch.msgid.link/f7a69a1f-60e9-4ac0-8b7c-481e0cc850e7@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/phy.h | 37 ------------------------------------- 1 file changed, 37 deletions(-) (limited to 'include') diff --git a/include/linux/phy.h b/include/linux/phy.h index 066a28a4b64b..3beaf225ee88 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -1753,7 +1753,6 @@ int phy_modify_paged(struct phy_device *phydev, int page, u32 regnum, struct phy_device *phy_device_create(struct mii_bus *bus, int addr, u32 phy_id, bool is_c45, struct phy_c45_device_ids *c45_ids); -#if IS_ENABLED(CONFIG_PHYLIB) int fwnode_get_phy_id(struct fwnode_handle *fwnode, u32 *phy_id); struct mdio_device *fwnode_mdio_find_device(struct fwnode_handle *fwnode); struct phy_device *fwnode_phy_find_device(struct fwnode_handle *phy_fwnode); @@ -1761,42 +1760,6 @@ struct fwnode_handle *fwnode_get_phy_node(const struct fwnode_handle *fwnode); struct phy_device *get_phy_device(struct mii_bus *bus, int addr, bool is_c45); int phy_device_register(struct phy_device *phy); void phy_device_free(struct phy_device *phydev); -#else -static inline int fwnode_get_phy_id(struct fwnode_handle *fwnode, u32 *phy_id) -{ - return 0; -} -static inline -struct mdio_device *fwnode_mdio_find_device(struct fwnode_handle *fwnode) -{ - return 0; -} - -static inline -struct phy_device *fwnode_phy_find_device(struct fwnode_handle *phy_fwnode) -{ - return NULL; -} - -static inline -struct fwnode_handle *fwnode_get_phy_node(struct fwnode_handle *fwnode) -{ - return NULL; -} - -static inline -struct phy_device *get_phy_device(struct mii_bus *bus, int addr, bool is_c45) -{ - return NULL; -} - -static inline int phy_device_register(struct phy_device *phy) -{ - return 0; -} - -static inline void phy_device_free(struct phy_device *phydev) { } -#endif /* CONFIG_PHYLIB */ void phy_device_remove(struct phy_device *phydev); int phy_get_c45_ids(struct phy_device *phydev); int 
phy_init_hw(struct phy_device *phydev); -- cgit v1.2.3 From 834d97843e3bca86f17cc517885f54f3433427b2 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 17 Apr 2025 17:03:53 -0700 Subject: ipv6: Protect fib6_link_table() with spinlock. We will get rid of RTNL from RTM_NEWROUTE and SIOCADDRT. If the request specifies a new table ID, fib6_new_table() is called to create a new routing table. Two concurrent requests could specify the same table ID, so we need a lock to protect net->ipv6.fib_table_hash[h]. Let's add a spinlock to protect the hash bucket linkage. Signed-off-by: Kuniyuki Iwashima Acked-by: Paolo Abeni Link: https://patch.msgid.link/20250418000443.43734-13-kuniyu@amazon.com Signed-off-by: Paolo Abeni --- include/net/netns/ipv6.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h index 5f2cfd84570a..47dc70d8100a 100644 --- a/include/net/netns/ipv6.h +++ b/include/net/netns/ipv6.h @@ -72,6 +72,7 @@ struct netns_ipv6 { struct rt6_statistics *rt6_stats; struct timer_list ip6_fib_timer; struct hlist_head *fib_table_hash; + spinlock_t fib_table_hash_lock; struct fib6_table *fib6_main_tbl; struct list_head fib6_walkers; rwlock_t fib6_walker_lock; -- cgit v1.2.3 From accb46b56bc3bc99ee69ba18b06ca60266ad6fca Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 17 Apr 2025 17:03:54 -0700 Subject: ipv6: Defer fib6_purge_rt() in fib6_add_rt2node() to fib6_add(). The next patch adds a per-nexthop spinlock which protects nh->f6i_list. When rt->nh is not NULL, fib6_add_rt2node() will be called under the lock. fib6_add_rt2node() could call fib6_purge_rt() for another route, which could hold another nexthop lock. Then, a deadlock could happen between two nexthops. Let's defer fib6_purge_rt() after fib6_add_rt2node(). 
Signed-off-by: Kuniyuki Iwashima Acked-by: Paolo Abeni Link: https://patch.msgid.link/20250418000443.43734-14-kuniyu@amazon.com Signed-off-by: Paolo Abeni --- include/net/ip6_fib.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 7c87873ae211..88b0dd4d8e09 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -198,6 +198,7 @@ struct fib6_info { fib6_destroying:1, unused:4; + struct list_head purge_link; struct rcu_head rcu; struct nexthop *nh; struct fib6_nh fib6_nh[]; -- cgit v1.2.3 From 081efd18326e353c6fbfdeff903a83edde953f72 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 17 Apr 2025 17:03:55 -0700 Subject: ipv6: Protect nh->f6i_list with spinlock and flag. We will get rid of RTNL from RTM_NEWROUTE and SIOCADDRT. Then, we may be going to add a route tied to a dying nexthop. The nexthop itself is not freed during the RCU grace period, but if we link a route after __remove_nexthop_fib() is called for the nexthop, the route will be leaked. To avoid the race between IPv6 route addition under RCU vs nexthop deletion under RTNL, let's add a dead flag and protect it and nh->f6i_list with a spinlock. __remove_nexthop_fib() acquires the nexthop's spinlock and sets nh->dead to true, then calls ip6_del_rt() for the linked routes one by one without the spinlock because fib6_purge_rt() acquires it later. While adding an IPv6 route, fib6_add() acquires the nexthop lock and checks the dead flag just before inserting the route. 
Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250418000443.43734-15-kuniyu@amazon.com Signed-off-by: Paolo Abeni --- include/net/nexthop.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/nexthop.h b/include/net/nexthop.h index d9fb44e8b321..572e69cda476 100644 --- a/include/net/nexthop.h +++ b/include/net/nexthop.h @@ -152,6 +152,8 @@ struct nexthop { u8 protocol; /* app managing this nh */ u8 nh_flags; bool is_group; + bool dead; + spinlock_t lock; /* protect dead and f6i_list */ refcount_t refcnt; struct rcu_head rcu; -- cgit v1.2.3 From 39144062ea335495d659b08c9e3133ab746a0b1b Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Wed, 23 Apr 2025 00:51:47 +0100 Subject: rxrpc: Remove deadcode Remove three functions that are no longer used. rxrpc_get_txbuf() last use was removed by 2020's commit 5e6ef4f1017c ("rxrpc: Make the I/O thread take over the call and local processor work") rxrpc_kernel_get_epoch() last use was removed by 2020's commit 44746355ccb1 ("afs: Don't get epoch from a server because it may be ambiguous") rxrpc_kernel_set_max_life() last use was removed by 2023's commit db099c625b13 ("rxrpc: Fix timeout of a call that hasn't yet been granted a channel") Both of the rxrpc_kernel_* functions were documented. Remove that documentation as well as the code. Signed-off-by: Dr. 
David Alan Gilbert Acked-by: David Howells Link: https://patch.msgid.link/20250422235147.146460-1-linux@treblig.org Signed-off-by: Jakub Kicinski --- include/net/af_rxrpc.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include') diff --git a/include/net/af_rxrpc.h b/include/net/af_rxrpc.h index f15341594cc8..0fb4c41c9bbf 100644 --- a/include/net/af_rxrpc.h +++ b/include/net/af_rxrpc.h @@ -88,9 +88,6 @@ int rxrpc_kernel_charge_accept(struct socket *sock, rxrpc_notify_rx_t notify_rx, unsigned int debug_id); void rxrpc_kernel_set_tx_length(struct socket *, struct rxrpc_call *, s64); bool rxrpc_kernel_check_life(const struct socket *, const struct rxrpc_call *); -u32 rxrpc_kernel_get_epoch(struct socket *, struct rxrpc_call *); -void rxrpc_kernel_set_max_life(struct socket *, struct rxrpc_call *, - unsigned long); int rxrpc_sock_set_min_security_level(struct sock *sk, unsigned int val); int rxrpc_sock_set_security_keyring(struct sock *, struct key *); -- cgit v1.2.3 From bc2550b4e195754fbb24aac1f012d3dd9e3b4edc Mon Sep 17 00:00:00 2001 From: Jeremy Harris Date: Wed, 23 Apr 2025 13:43:33 +0100 Subject: tcp: fastopen: note that a child socket was created tcp: fastopen: note that a child socket was created This uses up the last bit in a field of tcp_sock. Signed-off-by: Jeremy Harris Reviewed-by: Eric Dumazet Reviewed-by: Neal Cardwell Link: https://patch.msgid.link/20250423124334.4916-2-jgh@exim.org Signed-off-by: Jakub Kicinski --- include/linux/tcp.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 1669d95bb0f9..a8af71623ba7 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -385,7 +385,8 @@ struct tcp_sock { syn_fastopen:1, /* SYN includes Fast Open option */ syn_fastopen_exp:1,/* SYN includes Fast Open exp. 
option */ syn_fastopen_ch:1, /* Active TFO re-enabling probe */ - syn_data_acked:1;/* data in SYN is acked by SYN-ACK */ + syn_data_acked:1,/* data in SYN is acked by SYN-ACK */ + syn_fastopen_child:1; /* created TFO passive child socket */ u8 keepalive_probes; /* num of allowed keep alive probes */ u32 tcp_tx_delay; /* delay (in usec) added to TX packets */ -- cgit v1.2.3 From 2b13042d3636327eb50c8a0ee06f629d52d1b8fb Mon Sep 17 00:00:00 2001 From: Jeremy Harris Date: Wed, 23 Apr 2025 13:43:34 +0100 Subject: tcp: fastopen: pass TFO child indication through getsockopt tcp: fastopen: pass TFO child indication through getsockopt Note that this uses up the last bit of a field in struct tcp_info Signed-off-by: Jeremy Harris Reviewed-by: Eric Dumazet Reviewed-by: Neal Cardwell Link: https://patch.msgid.link/20250423124334.4916-3-jgh@exim.org Signed-off-by: Jakub Kicinski --- include/uapi/linux/tcp.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index dc8fdc80e16b..bdac8c42fa82 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -184,6 +184,7 @@ enum tcp_fastopen_client_fail { #define TCPI_OPT_ECN_SEEN 16 /* we received at least one packet with ECT */ #define TCPI_OPT_SYN_DATA 32 /* SYN-ACK acked data in SYN sent or rcvd */ #define TCPI_OPT_USEC_TS 64 /* usec timestamps */ +#define TCPI_OPT_TFO_CHILD 128 /* child from a Fast Open option on SYN */ /* * Sender's congestion state indicating normal or abnormal situations -- cgit v1.2.3 From d57ee99831e336576359beb26e2b140511c99106 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 23 Apr 2025 17:08:08 +0200 Subject: net: ethernet: mtk_wed: annotate RCU release in attach() There are some sparse warnings in wifi, and it seems that it's actually possible to annotate a function pointer with __releases(), making the sparse warnings go away. 
In a way that also serves as documentation that rcu_read_unlock() must be called in the attach method, so add that annotation. Signed-off-by: Johannes Berg Link: https://patch.msgid.link/20250423150811.456205-2-johannes@sipsolutions.net Signed-off-by: Jakub Kicinski --- include/linux/soc/mediatek/mtk_wed.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/soc/mediatek/mtk_wed.h b/include/linux/soc/mediatek/mtk_wed.h index a476648858a6..d8949a4ed0dc 100644 --- a/include/linux/soc/mediatek/mtk_wed.h +++ b/include/linux/soc/mediatek/mtk_wed.h @@ -192,7 +192,7 @@ struct mtk_wed_device { }; struct mtk_wed_ops { - int (*attach)(struct mtk_wed_device *dev); + int (*attach)(struct mtk_wed_device *dev) __releases(RCU); int (*tx_ring_setup)(struct mtk_wed_device *dev, int ring, void __iomem *regs, bool reset); int (*rx_ring_setup)(struct mtk_wed_device *dev, int ring, -- cgit v1.2.3 From 34dd0fecaa02d654c447d43a7e4c72f9b18b7033 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Fri, 25 Apr 2025 16:55:31 +0200 Subject: net: sched: generalize check for no-queue qdisc on TX queue MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The "noqueue" qdisc can either be directly attached, or get default attached if net_device priv_flags has IFF_NO_QUEUE. In both cases, the allocated Qdisc structure gets its enqueue function pointer reset to NULL by noqueue_init() via noqueue_qdisc_ops. This is a common case for software virtual net_devices. For these devices with no-queue, the transmission path in __dev_queue_xmit() will bypass the qdisc layer, directly invoking the device driver's ndo_start_xmit (via dev_hard_start_xmit). In this mode the device driver is not allowed to ask for packets to be queued (either via returning NETDEV_TX_BUSY or stopping the TXQ). The simplest and most reliable way to identify this no-queue case is by checking if enqueue == NULL. 
The vrf driver currently open-codes this check (!qdisc->enqueue). While functionally correct, this low-level detail is better encapsulated in a dedicated helper for clarity and long-term maintainability. To make this behavior more explicit and reusable, this patch introduces a new helper: qdisc_txq_has_no_queue(). The helper will also be used by the veth driver in the next patch, which introduces optional qdisc-based backpressure. This is a non-functional change. Reviewed-by: David Ahern Reviewed-by: Toke Høiland-Jørgensen Signed-off-by: Jesper Dangaard Brouer Link: https://patch.msgid.link/174559293172.827981.7583862632045264175.stgit@firesoul Signed-off-by: Jakub Kicinski --- include/net/sch_generic.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index d48c657191cd..b6c177f7141c 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -803,6 +803,14 @@ static inline bool qdisc_tx_changing(const struct net_device *dev) return false; } +/* "noqueue" qdisc identified by not having any enqueue, see noqueue_init() */ +static inline bool qdisc_txq_has_no_queue(const struct netdev_queue *txq) +{ + struct Qdisc *qdisc = rcu_access_pointer(txq->qdisc); + + return qdisc->enqueue == NULL; +} + /* Is the device using the noop qdisc on all queues? */ static inline bool qdisc_tx_is_noop(const struct net_device *dev) { -- cgit v1.2.3 From 0014af802193aa3547484b5db0f1a258bad28c81 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 8 Apr 2025 15:55:53 +0200 Subject: netfilter: nf_tables: export set count and backend name to userspace nf_tables picks a suitable set backend implementation (bitmap, hash, rbtree..) based on the userspace requirements. Figuring out the chosen backend requires information about the set flags and the kernel version. Export this to userspace so nft can include this information in '--debug=netlink' output. 
Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 49c944e78463..7d6bc19a0153 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -394,6 +394,8 @@ enum nft_set_field_attributes { * @NFTA_SET_HANDLE: set handle (NLA_U64) * @NFTA_SET_EXPR: set expression (NLA_NESTED: nft_expr_attributes) * @NFTA_SET_EXPRESSIONS: list of expressions (NLA_NESTED: nft_list_attributes) + * @NFTA_SET_TYPE: set backend type (NLA_STRING) + * @NFTA_SET_COUNT: number of set elements (NLA_U32) */ enum nft_set_attributes { NFTA_SET_UNSPEC, @@ -415,6 +417,8 @@ enum nft_set_attributes { NFTA_SET_HANDLE, NFTA_SET_EXPR, NFTA_SET_EXPRESSIONS, + NFTA_SET_TYPE, + NFTA_SET_COUNT, __NFTA_SET_MAX }; #define NFTA_SET_MAX (__NFTA_SET_MAX - 1) -- cgit v1.2.3 From 32607a332cfea5a4b2a185f3e3d605a9bf4f8df0 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Thu, 24 Apr 2025 10:35:18 -0400 Subject: ipv4: prefer multipath nexthop that matches source address With multipath routes, try to ensure that packets leave on the device that is associated with the source address. Avoid the following tcpdump example: veth0 Out IP 10.1.0.2.38640 > 10.2.0.3.8000: Flags [S] veth1 Out IP 10.1.0.2.38648 > 10.2.0.3.8000: Flags [S] Which can happen easily with the most straightforward setup: ip addr add 10.0.0.1/24 dev veth0 ip addr add 10.1.0.1/24 dev veth1 ip route add 10.2.0.3 nexthop via 10.0.0.2 dev veth0 \ nexthop via 10.1.0.2 dev veth1 This is apparently considered WAI, based on the comment in ip_route_output_key_hash_rcu: * 2. Moreover, we are allowed to send packets with saddr * of another iface. --ANK It may be ok for some uses of multipath, but not all. For instance, when using two ISPs, a router may drop packets with unknown source. 
The behavior occurs because tcp_v4_connect makes three route lookups when establishing a connection: 1. ip_route_connect calls to select a source address, with saddr zero. 2. ip_route_connect calls again now that saddr and daddr are known. 3. ip_route_newports calls again after a source port is also chosen. With a route with multiple nexthops, each lookup may make a different choice depending on available entropy to fib_select_multipath. So it is possible for 1 to select the saddr from the first entry, but 3 to select the second entry. Leading to the above situation. Address this by preferring a match that matches the flowi4 saddr. This will make 2 and 3 make the same choice as 1. Continue to update the backup choice until a choice that matches saddr is found. Do this in fib_select_multipath itself, rather than passing an fl4_oif constraint, to avoid changing non-multipath route selection. Commit e6b45241c57a ("ipv4: reset flowi parameters on route connect") shows how that may cause regressions. Also read ipv4.sysctl_fib_multipath_use_neigh only once. No need to refresh in the loop. This does not happen in IPv6, which performs only one lookup. 
Signed-off-by: Willem de Bruijn Reviewed-by: David Ahern Reviewed-by: Eric Dumazet Reviewed-by: Ido Schimmel Link: https://patch.msgid.link/20250424143549.669426-2-willemdebruijn.kernel@gmail.com Signed-off-by: Paolo Abeni --- include/net/ip_fib.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index e3864b74e92a..48bb3cf41469 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -574,7 +574,8 @@ static inline u32 fib_multipath_hash_from_keys(const struct net *net, int fib_check_nh(struct net *net, struct fib_nh *nh, u32 table, u8 scope, struct netlink_ext_ack *extack); -void fib_select_multipath(struct fib_result *res, int hash); +void fib_select_multipath(struct fib_result *res, int hash, + const struct flowi4 *fl4); void fib_select_path(struct net *net, struct fib_result *res, struct flowi4 *fl4, const struct sk_buff *skb); -- cgit v1.2.3 From 65e9024643c7512ade3aedbb341e11d77ed7abc2 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Thu, 24 Apr 2025 10:35:19 -0400 Subject: ip: load balance tcp connections to single dst addr and port Load balance new TCP connections across nexthops also when they connect to the same service at a single remote address and port. This affects only port-based multipath hashing: fib_multipath_hash_policy 1 or 3. Local connections must choose both a source address and port when connecting to a remote service, in ip_route_connect. This "chicken-and-egg problem" (commit 2d7192d6cbab ("ipv4: Sanitize and simplify ip_route_{connect,newports}()")) is resolved by first selecting a source address, by looking up a route using the zero wildcard source port and address. As a result multiple connections to the same destination address and port have no entropy in fib_multipath_hash. This is not a problem when forwarding, as skb-based hashing has a 4-tuple. 
Nor when establishing UDP connections, as autobind there selects a port before reaching ip_route_connect. Load balance also TCP, by using a random port in fib_multipath_hash. Port assignment in inet_hash_connect is not atomic with ip_route_connect. Thus ports are unpredictable, effectively random. Implementation details: Do not actually pass a random fl4_sport, as that affects not only hashing, but routing more broadly, and can match a source port based policy route, which existing wildcard port 0 will not. Instead, define a new wildcard flowi flag that is used only for hashing. Selecting a random source is equivalent to just selecting a random hash entirely. But for code clarity, follow the normal 4-tuple hash process and only update this field. fib_multipath_hash can be reached with zero sport from other code paths, so explicitly pass this flowi flag, rather than trying to infer this case in the function itself. Signed-off-by: Willem de Bruijn Reviewed-by: David Ahern Reviewed-by: Eric Dumazet Reviewed-by: Ido Schimmel Link: https://patch.msgid.link/20250424143549.669426-3-willemdebruijn.kernel@gmail.com Signed-off-by: Paolo Abeni --- include/net/flow.h | 1 + include/net/route.h | 3 +++ 2 files changed, 4 insertions(+) (limited to 'include') diff --git a/include/net/flow.h b/include/net/flow.h index 2a3f0c42f092..a1839c278d87 100644 --- a/include/net/flow.h +++ b/include/net/flow.h @@ -39,6 +39,7 @@ struct flowi_common { #define FLOWI_FLAG_ANYSRC 0x01 #define FLOWI_FLAG_KNOWN_NH 0x02 #define FLOWI_FLAG_L3MDEV_OIF 0x04 +#define FLOWI_FLAG_ANY_SPORT 0x08 __u32 flowic_secid; kuid_t flowic_uid; __u32 flowic_multipath_hash; diff --git a/include/net/route.h b/include/net/route.h index c605fd5ec0c0..8e39aa822cf9 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -326,6 +326,9 @@ static inline void ip_route_connect_init(struct flowi4 *fl4, __be32 dst, if (inet_test_bit(TRANSPARENT, sk)) flow_flags |= FLOWI_FLAG_ANYSRC; + if 
(IS_ENABLED(CONFIG_IP_ROUTE_MULTIPATH) && !sport) + flow_flags |= FLOWI_FLAG_ANY_SPORT; + flowi4_init_output(fl4, oif, READ_ONCE(sk->sk_mark), ip_sock_rt_tos(sk), ip_sock_rt_scope(sk), protocol, flow_flags, dst, src, dport, sport, sk->sk_uid); -- cgit v1.2.3 From 144530c15ec7fa95b29812d86f4be527338ea204 Mon Sep 17 00:00:00 2001 From: Shannon Nelson Date: Fri, 25 Apr 2025 13:46:16 -0700 Subject: pds_core: remove extra name description Fix the kernel-doc complaint include/linux/pds/pds_adminq.h:481: warning: Excess struct member 'name' description in 'pds_core_lif_getattr_comp' Reviewed-by: Simon Horman Signed-off-by: Shannon Nelson Signed-off-by: David S. Miller --- include/linux/pds/pds_adminq.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/pds/pds_adminq.h b/include/linux/pds/pds_adminq.h index ddd111f04ca0..339156113fa5 100644 --- a/include/linux/pds/pds_adminq.h +++ b/include/linux/pds/pds_adminq.h @@ -463,7 +463,6 @@ struct pds_core_lif_getattr_cmd { * @rsvd: Word boundary padding * @comp_index: Index in the descriptor ring for which this is the completion * @state: LIF state (enum pds_core_lif_state) - * @name: LIF name string, 0 terminated * @features: Features (enum pds_core_hw_features) * @rsvd2: Word boundary padding * @color: Color bit -- cgit v1.2.3 From 7c4f4c4fa9b6fbb7e483bebd02f7b9cbc20ca5cc Mon Sep 17 00:00:00 2001 From: Shannon Nelson Date: Fri, 25 Apr 2025 13:46:17 -0700 Subject: pds_core: smaller adminq poll starting interval Shorten the adminq poll starting interval in order to notice any transaction errors more quickly. Signed-off-by: Shannon Nelson Reviewed-by: Simon Horman Signed-off-by: David S. 
Miller --- include/linux/pds/pds_adminq.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/pds/pds_adminq.h b/include/linux/pds/pds_adminq.h index 339156113fa5..40ff0ec2b879 100644 --- a/include/linux/pds/pds_adminq.h +++ b/include/linux/pds/pds_adminq.h @@ -4,7 +4,7 @@ #ifndef _PDS_CORE_ADMINQ_H_ #define _PDS_CORE_ADMINQ_H_ -#define PDSC_ADMINQ_MAX_POLL_INTERVAL 256 +#define PDSC_ADMINQ_MAX_POLL_INTERVAL 256000 /* usecs */ enum pds_core_adminq_flags { PDS_AQ_FLAG_FASTPOLL = BIT(1), /* completion poll at 1ms */ -- cgit v1.2.3 From 468d8b462ac64659caec53eff34f02963d5f52c8 Mon Sep 17 00:00:00 2001 From: Dave Ertman Date: Tue, 15 Apr 2025 21:15:45 -0500 Subject: iidc/ice/irdma: Rename IDC header file To prepare for the IDC upgrade to support different CORE PCI drivers, rename the header file from iidc.h to iidc_rdma.h since this file's functionality is specifically for RDMA support. Use net/dscp.h include in irdma osdep.h and DSCP_MAX type.h, instead of iidc header and define. Reviewed-by: Przemek Kitszel Signed-off-by: Dave Ertman Signed-off-by: Tatyana Nikolova Signed-off-by: Tony Nguyen --- include/linux/net/intel/iidc.h | 109 ------------------------------------ include/linux/net/intel/iidc_rdma.h | 109 ++++++++++++++++++++++++++++++++++++ 2 files changed, 109 insertions(+), 109 deletions(-) delete mode 100644 include/linux/net/intel/iidc.h create mode 100644 include/linux/net/intel/iidc_rdma.h (limited to 'include') diff --git a/include/linux/net/intel/iidc.h b/include/linux/net/intel/iidc.h deleted file mode 100644 index 13274c3def66..000000000000 --- a/include/linux/net/intel/iidc.h +++ /dev/null @@ -1,109 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2021, Intel Corporation. 
*/ - -#ifndef _IIDC_H_ -#define _IIDC_H_ - -#include -#include -#include -#include -#include -#include - -enum iidc_event_type { - IIDC_EVENT_BEFORE_MTU_CHANGE, - IIDC_EVENT_AFTER_MTU_CHANGE, - IIDC_EVENT_BEFORE_TC_CHANGE, - IIDC_EVENT_AFTER_TC_CHANGE, - IIDC_EVENT_CRIT_ERR, - IIDC_EVENT_NBITS /* must be last */ -}; - -enum iidc_reset_type { - IIDC_PFR, - IIDC_CORER, - IIDC_GLOBR, -}; - -enum iidc_rdma_protocol { - IIDC_RDMA_PROTOCOL_IWARP = BIT(0), - IIDC_RDMA_PROTOCOL_ROCEV2 = BIT(1), -}; - -#define IIDC_MAX_USER_PRIORITY 8 -#define IIDC_MAX_DSCP_MAPPING 64 -#define IIDC_DSCP_PFC_MODE 0x1 - -/* Struct to hold per RDMA Qset info */ -struct iidc_rdma_qset_params { - /* Qset TEID returned to the RDMA driver in - * ice_add_rdma_qset and used by RDMA driver - * for calls to ice_del_rdma_qset - */ - u32 teid; /* Qset TEID */ - u16 qs_handle; /* RDMA driver provides this */ - u16 vport_id; /* VSI index */ - u8 tc; /* TC branch the Qset should belong to */ -}; - -struct iidc_qos_info { - u64 tc_ctx; - u8 rel_bw; - u8 prio_type; - u8 egress_virt_up; - u8 ingress_virt_up; -}; - -/* Struct to pass QoS info */ -struct iidc_qos_params { - struct iidc_qos_info tc_info[IEEE_8021QAZ_MAX_TCS]; - u8 up2tc[IIDC_MAX_USER_PRIORITY]; - u8 vport_relative_bw; - u8 vport_priority_type; - u8 num_tc; - u8 pfc_mode; - u8 dscp_map[IIDC_MAX_DSCP_MAPPING]; -}; - -struct iidc_event { - DECLARE_BITMAP(type, IIDC_EVENT_NBITS); - u32 reg; -}; - -struct ice_pf; - -int ice_add_rdma_qset(struct ice_pf *pf, struct iidc_rdma_qset_params *qset); -int ice_del_rdma_qset(struct ice_pf *pf, struct iidc_rdma_qset_params *qset); -int ice_rdma_request_reset(struct ice_pf *pf, enum iidc_reset_type reset_type); -int ice_rdma_update_vsi_filter(struct ice_pf *pf, u16 vsi_id, bool enable); -void ice_get_qos_params(struct ice_pf *pf, struct iidc_qos_params *qos); -int ice_alloc_rdma_qvector(struct ice_pf *pf, struct msix_entry *entry); -void ice_free_rdma_qvector(struct ice_pf *pf, struct msix_entry *entry); - -/* 
Structure representing auxiliary driver tailored information about the core - * PCI dev, each auxiliary driver using the IIDC interface will have an - * instance of this struct dedicated to it. - */ - -struct iidc_auxiliary_dev { - struct auxiliary_device adev; - struct ice_pf *pf; -}; - -/* structure representing the auxiliary driver. This struct is to be - * allocated and populated by the auxiliary driver's owner. The core PCI - * driver will access these ops by performing a container_of on the - * auxiliary_device->dev.driver. - */ -struct iidc_auxiliary_drv { - struct auxiliary_driver adrv; - /* This event_handler is meant to be a blocking call. For instance, - * when a BEFORE_MTU_CHANGE event comes in, the event_handler will not - * return until the auxiliary driver is ready for the MTU change to - * happen. - */ - void (*event_handler)(struct ice_pf *pf, struct iidc_event *event); -}; - -#endif /* _IIDC_H_*/ diff --git a/include/linux/net/intel/iidc_rdma.h b/include/linux/net/intel/iidc_rdma.h new file mode 100644 index 000000000000..0cd75404e459 --- /dev/null +++ b/include/linux/net/intel/iidc_rdma.h @@ -0,0 +1,109 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2021, Intel Corporation. 
*/ + +#ifndef _IIDC_RDMA_H_ +#define _IIDC_RDMA_H_ + +#include +#include +#include +#include +#include +#include + +enum iidc_event_type { + IIDC_EVENT_BEFORE_MTU_CHANGE, + IIDC_EVENT_AFTER_MTU_CHANGE, + IIDC_EVENT_BEFORE_TC_CHANGE, + IIDC_EVENT_AFTER_TC_CHANGE, + IIDC_EVENT_CRIT_ERR, + IIDC_EVENT_NBITS /* must be last */ +}; + +enum iidc_reset_type { + IIDC_PFR, + IIDC_CORER, + IIDC_GLOBR, +}; + +enum iidc_rdma_protocol { + IIDC_RDMA_PROTOCOL_IWARP = BIT(0), + IIDC_RDMA_PROTOCOL_ROCEV2 = BIT(1), +}; + +#define IIDC_MAX_USER_PRIORITY 8 +#define IIDC_MAX_DSCP_MAPPING 64 +#define IIDC_DSCP_PFC_MODE 0x1 + +/* Struct to hold per RDMA Qset info */ +struct iidc_rdma_qset_params { + /* Qset TEID returned to the RDMA driver in + * ice_add_rdma_qset and used by RDMA driver + * for calls to ice_del_rdma_qset + */ + u32 teid; /* Qset TEID */ + u16 qs_handle; /* RDMA driver provides this */ + u16 vport_id; /* VSI index */ + u8 tc; /* TC branch the Qset should belong to */ +}; + +struct iidc_qos_info { + u64 tc_ctx; + u8 rel_bw; + u8 prio_type; + u8 egress_virt_up; + u8 ingress_virt_up; +}; + +/* Struct to pass QoS info */ +struct iidc_qos_params { + struct iidc_qos_info tc_info[IEEE_8021QAZ_MAX_TCS]; + u8 up2tc[IIDC_MAX_USER_PRIORITY]; + u8 vport_relative_bw; + u8 vport_priority_type; + u8 num_tc; + u8 pfc_mode; + u8 dscp_map[IIDC_MAX_DSCP_MAPPING]; +}; + +struct iidc_event { + DECLARE_BITMAP(type, IIDC_EVENT_NBITS); + u32 reg; +}; + +struct ice_pf; + +int ice_add_rdma_qset(struct ice_pf *pf, struct iidc_rdma_qset_params *qset); +int ice_del_rdma_qset(struct ice_pf *pf, struct iidc_rdma_qset_params *qset); +int ice_rdma_request_reset(struct ice_pf *pf, enum iidc_reset_type reset_type); +int ice_rdma_update_vsi_filter(struct ice_pf *pf, u16 vsi_id, bool enable); +void ice_get_qos_params(struct ice_pf *pf, struct iidc_qos_params *qos); +int ice_alloc_rdma_qvector(struct ice_pf *pf, struct msix_entry *entry); +void ice_free_rdma_qvector(struct ice_pf *pf, struct msix_entry 
*entry); + +/* Structure representing auxiliary driver tailored information about the core + * PCI dev, each auxiliary driver using the IIDC interface will have an + * instance of this struct dedicated to it. + */ + +struct iidc_auxiliary_dev { + struct auxiliary_device adev; + struct ice_pf *pf; +}; + +/* structure representing the auxiliary driver. This struct is to be + * allocated and populated by the auxiliary driver's owner. The core PCI + * driver will access these ops by performing a container_of on the + * auxiliary_device->dev.driver. + */ +struct iidc_auxiliary_drv { + struct auxiliary_driver adrv; + /* This event_handler is meant to be a blocking call. For instance, + * when a BEFORE_MTU_CHANGE event comes in, the event_handler will not + * return until the auxiliary driver is ready for the MTU change to + * happen. + */ + void (*event_handler)(struct ice_pf *pf, struct iidc_event *event); +}; + +#endif /* _IIDC_RDMA_H_*/ -- cgit v1.2.3 From 97b5631aae6896369712d6b7131afbc95c753587 Mon Sep 17 00:00:00 2001 From: Dave Ertman Date: Tue, 15 Apr 2025 21:15:46 -0500 Subject: iidc/ice/irdma: Rename to iidc_* convention In preparation of supporting more than a single core PCI driver for RDMA, homogenize naming to iidc_rdma_* and IIDC_RDMA_* form. 
Reviewed-by: Przemek Kitszel Signed-off-by: Dave Ertman Signed-off-by: Tatyana Nikolova Signed-off-by: Tony Nguyen --- include/linux/net/intel/iidc_rdma.h | 38 +++++++++++++++++++------------------ 1 file changed, 20 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/include/linux/net/intel/iidc_rdma.h b/include/linux/net/intel/iidc_rdma.h index 0cd75404e459..2b24a9912fa0 100644 --- a/include/linux/net/intel/iidc_rdma.h +++ b/include/linux/net/intel/iidc_rdma.h @@ -11,16 +11,16 @@ #include #include -enum iidc_event_type { - IIDC_EVENT_BEFORE_MTU_CHANGE, - IIDC_EVENT_AFTER_MTU_CHANGE, - IIDC_EVENT_BEFORE_TC_CHANGE, - IIDC_EVENT_AFTER_TC_CHANGE, - IIDC_EVENT_CRIT_ERR, - IIDC_EVENT_NBITS /* must be last */ +enum iidc_rdma_event_type { + IIDC_RDMA_EVENT_BEFORE_MTU_CHANGE, + IIDC_RDMA_EVENT_AFTER_MTU_CHANGE, + IIDC_RDMA_EVENT_BEFORE_TC_CHANGE, + IIDC_RDMA_EVENT_AFTER_TC_CHANGE, + IIDC_RDMA_EVENT_CRIT_ERR, + IIDC_RDMA_EVENT_NBITS /* must be last */ }; -enum iidc_reset_type { +enum iidc_rdma_reset_type { IIDC_PFR, IIDC_CORER, IIDC_GLOBR, @@ -47,7 +47,7 @@ struct iidc_rdma_qset_params { u8 tc; /* TC branch the Qset should belong to */ }; -struct iidc_qos_info { +struct iidc_rdma_qos_info { u64 tc_ctx; u8 rel_bw; u8 prio_type; @@ -56,8 +56,8 @@ struct iidc_qos_info { }; /* Struct to pass QoS info */ -struct iidc_qos_params { - struct iidc_qos_info tc_info[IEEE_8021QAZ_MAX_TCS]; +struct iidc_rdma_qos_params { + struct iidc_rdma_qos_info tc_info[IEEE_8021QAZ_MAX_TCS]; u8 up2tc[IIDC_MAX_USER_PRIORITY]; u8 vport_relative_bw; u8 vport_priority_type; @@ -66,8 +66,8 @@ struct iidc_qos_params { u8 dscp_map[IIDC_MAX_DSCP_MAPPING]; }; -struct iidc_event { - DECLARE_BITMAP(type, IIDC_EVENT_NBITS); +struct iidc_rdma_event { + DECLARE_BITMAP(type, IIDC_RDMA_EVENT_NBITS); u32 reg; }; @@ -75,9 +75,11 @@ struct ice_pf; int ice_add_rdma_qset(struct ice_pf *pf, struct iidc_rdma_qset_params *qset); int ice_del_rdma_qset(struct ice_pf *pf, struct iidc_rdma_qset_params *qset); 
-int ice_rdma_request_reset(struct ice_pf *pf, enum iidc_reset_type reset_type); +int ice_rdma_request_reset(struct ice_pf *pf, + enum iidc_rdma_reset_type reset_type); int ice_rdma_update_vsi_filter(struct ice_pf *pf, u16 vsi_id, bool enable); -void ice_get_qos_params(struct ice_pf *pf, struct iidc_qos_params *qos); +void ice_get_qos_params(struct ice_pf *pf, + struct iidc_rdma_qos_params *qos); int ice_alloc_rdma_qvector(struct ice_pf *pf, struct msix_entry *entry); void ice_free_rdma_qvector(struct ice_pf *pf, struct msix_entry *entry); @@ -86,7 +88,7 @@ void ice_free_rdma_qvector(struct ice_pf *pf, struct msix_entry *entry); * instance of this struct dedicated to it. */ -struct iidc_auxiliary_dev { +struct iidc_rdma_core_auxiliary_dev { struct auxiliary_device adev; struct ice_pf *pf; }; @@ -96,14 +98,14 @@ struct iidc_auxiliary_dev { * driver will access these ops by performing a container_of on the * auxiliary_device->dev.driver. */ -struct iidc_auxiliary_drv { +struct iidc_rdma_core_auxiliary_drv { struct auxiliary_driver adrv; /* This event_handler is meant to be a blocking call. For instance, * when a BEFORE_MTU_CHANGE event comes in, the event_handler will not * return until the auxiliary driver is ready for the MTU change to * happen. */ - void (*event_handler)(struct ice_pf *pf, struct iidc_event *event); + void (*event_handler)(struct ice_pf *pf, struct iidc_rdma_event *event); }; #endif /* _IIDC_RDMA_H_*/ -- cgit v1.2.3 From d9251a560ba67bbedd53b81aee32e1ad95f42000 Mon Sep 17 00:00:00 2001 From: Dave Ertman Date: Tue, 15 Apr 2025 21:15:47 -0500 Subject: iidc/ice/irdma: Break iidc.h into two headers In preparation of supporting more than a single core PCI driver for RDMA, break the iidc_rdma.h header file into two more focused headers. Only the elements universal to all Intel drivers will remain in the generic iidc_rdma.h header. Move the ice specific information to an ice specific header file named iidc_rdma_ice.h. 
Reviewed-by: Przemek Kitszel Signed-off-by: Dave Ertman Signed-off-by: Tatyana Nikolova Signed-off-by: Tony Nguyen --- include/linux/net/intel/iidc_rdma.h | 14 +------------- include/linux/net/intel/iidc_rdma_ice.h | 19 +++++++++++++++++++ 2 files changed, 20 insertions(+), 13 deletions(-) create mode 100644 include/linux/net/intel/iidc_rdma_ice.h (limited to 'include') diff --git a/include/linux/net/intel/iidc_rdma.h b/include/linux/net/intel/iidc_rdma.h index 2b24a9912fa0..1e8136395154 100644 --- a/include/linux/net/intel/iidc_rdma.h +++ b/include/linux/net/intel/iidc_rdma.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2021, Intel Corporation. */ +/* Copyright (C) 2021-2025, Intel Corporation. */ #ifndef _IIDC_RDMA_H_ #define _IIDC_RDMA_H_ @@ -71,18 +71,6 @@ struct iidc_rdma_event { u32 reg; }; -struct ice_pf; - -int ice_add_rdma_qset(struct ice_pf *pf, struct iidc_rdma_qset_params *qset); -int ice_del_rdma_qset(struct ice_pf *pf, struct iidc_rdma_qset_params *qset); -int ice_rdma_request_reset(struct ice_pf *pf, - enum iidc_rdma_reset_type reset_type); -int ice_rdma_update_vsi_filter(struct ice_pf *pf, u16 vsi_id, bool enable); -void ice_get_qos_params(struct ice_pf *pf, - struct iidc_rdma_qos_params *qos); -int ice_alloc_rdma_qvector(struct ice_pf *pf, struct msix_entry *entry); -void ice_free_rdma_qvector(struct ice_pf *pf, struct msix_entry *entry); - /* Structure representing auxiliary driver tailored information about the core * PCI dev, each auxiliary driver using the IIDC interface will have an * instance of this struct dedicated to it. diff --git a/include/linux/net/intel/iidc_rdma_ice.h b/include/linux/net/intel/iidc_rdma_ice.h new file mode 100644 index 000000000000..78d10003d776 --- /dev/null +++ b/include/linux/net/intel/iidc_rdma_ice.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2021-2025, Intel Corporation. 
*/ + +#ifndef _IIDC_RDMA_ICE_H_ +#define _IIDC_RDMA_ICE_H_ + +struct ice_pf; + +int ice_add_rdma_qset(struct ice_pf *pf, struct iidc_rdma_qset_params *qset); +int ice_del_rdma_qset(struct ice_pf *pf, struct iidc_rdma_qset_params *qset); +int ice_rdma_request_reset(struct ice_pf *pf, + enum iidc_rdma_reset_type reset_type); +int ice_rdma_update_vsi_filter(struct ice_pf *pf, u16 vsi_id, bool enable); +void ice_get_qos_params(struct ice_pf *pf, + struct iidc_rdma_qos_params *qos); +int ice_alloc_rdma_qvector(struct ice_pf *pf, struct msix_entry *entry); +void ice_free_rdma_qvector(struct ice_pf *pf, struct msix_entry *entry); + +#endif /* _IIDC_RDMA_ICE_H_*/ -- cgit v1.2.3 From 8239b771b94b639556c1987185fd82b2a896c923 Mon Sep 17 00:00:00 2001 From: Tatyana Nikolova Date: Tue, 15 Apr 2025 21:15:48 -0500 Subject: ice: Replace ice specific DSCP mapping num with a kernel define Replace ice driver specific DSCP mapping number defines ICE_DSCP_NUM_VAL and IIDC_MAX_DSCP_MAPPING with an equivalent kernel define DSCP_MAX. 
Reviewed-by: Przemek Kitszel Signed-off-by: Tatyana Nikolova Signed-off-by: Dave Ertman Reviewed-by: Simon Horman Signed-off-by: Tony Nguyen --- include/linux/net/intel/iidc_rdma.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/net/intel/iidc_rdma.h b/include/linux/net/intel/iidc_rdma.h index 1e8136395154..7f1910289534 100644 --- a/include/linux/net/intel/iidc_rdma.h +++ b/include/linux/net/intel/iidc_rdma.h @@ -10,6 +10,7 @@ #include #include #include +#include enum iidc_rdma_event_type { IIDC_RDMA_EVENT_BEFORE_MTU_CHANGE, @@ -32,7 +33,6 @@ enum iidc_rdma_protocol { }; #define IIDC_MAX_USER_PRIORITY 8 -#define IIDC_MAX_DSCP_MAPPING 64 #define IIDC_DSCP_PFC_MODE 0x1 /* Struct to hold per RDMA Qset info */ @@ -63,7 +63,7 @@ struct iidc_rdma_qos_params { u8 vport_priority_type; u8 num_tc; u8 pfc_mode; - u8 dscp_map[IIDC_MAX_DSCP_MAPPING]; + u8 dscp_map[DSCP_MAX]; }; struct iidc_rdma_event { -- cgit v1.2.3 From a3e1c0ad835702555d90565584ab6f723adf7f94 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Tue, 29 Apr 2025 08:04:46 +0200 Subject: net: phy: factor out provider part from mdio_bus.c After 52358dd63e34 ("net: phy: remove function stubs") there's a problem if CONFIG_MDIO_BUS is set, but CONFIG_PHYLIB is not. mdiobus_scan() uses phylib functions like get_phy_device(). Bringing back the stub wouldn't make much sense, because it would allow to compile mdiobus_scan(), but the function would be unusable. The stub returned NULL, and we have the following in mdiobus_scan(): phydev = get_phy_device(bus, addr, c45); if (IS_ERR(phydev)) return phydev; So calling mdiobus_scan() w/o CONFIG_PHYLIB would cause a crash later in mdiobus_scan(). In general the PHYLIB functionality isn't optional here. Consequently, MDIO bus providers depend on PHYLIB. Therefore factor it out and build it together with the libphy core modules. In addition make all MDIO bus providers under /drivers/net/mdio depend on PHYLIB. 
Same applies to enetc MDIO bus provider. Note that PHYLIB selects MDIO_DEVRES, therefore we can omit this here. Fixes: 52358dd63e34 ("net: phy: remove function stubs") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202504270639.mT0lh2o1-lkp@intel.com/ Reviewed-by: Jacob Keller Signed-off-by: Heiner Kallweit Link: https://patch.msgid.link/c74772a9-dab6-44bf-a657-389df89d85c2@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/phy.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/phy.h b/include/linux/phy.h index 3beaf225ee88..d62d292024bc 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -2062,6 +2062,7 @@ int __phy_hwtstamp_set(struct phy_device *phydev, struct netlink_ext_ack *extack); extern const struct bus_type mdio_bus_type; +extern const struct class mdio_bus_class; struct mdio_board_info { const char *bus_id; -- cgit v1.2.3 From 66d454e99d71857faf249486912e381ec83760b4 Mon Sep 17 00:00:00 2001 From: Jordan Rife Date: Fri, 2 May 2025 09:15:21 -0700 Subject: bpf: udp: Make sure iter->batch always contains a full bucket snapshot Require that iter->batch always contains a full bucket snapshot. This invariant is important to avoid skipping or repeating sockets during iteration when combined with the next few patches. Before, there were two cases where a call to bpf_iter_udp_batch may only capture part of a bucket: 1. When bpf_iter_udp_realloc_batch() returns -ENOMEM [1]. 2. When more sockets are added to the bucket while calling bpf_iter_udp_realloc_batch(), making the updated batch size insufficient [2]. In cases where the batch size only covers part of a bucket, it is possible to forget which sockets were already visited, especially if we have to process a bucket in more than two batches. This forces us to choose between repeating or skipping sockets, so don't allow this: 1. 
Stop iteration and propagate -ENOMEM up to userspace if reallocation fails instead of continuing with a partial batch. 2. Try bpf_iter_udp_realloc_batch() with GFP_USER just as before, but if we still aren't able to capture the full bucket, call bpf_iter_udp_realloc_batch() again while holding the bucket lock to guarantee the bucket does not change. On the second attempt use GFP_NOWAIT since we hold onto the spin lock. Introduce the udp_portaddr_for_each_entry_from macro and use it instead of udp_portaddr_for_each_entry to make it possible to continue iteration from an arbitrary socket. This is required for this patch in the GFP_NOWAIT case to allow us to fill the rest of a batch starting from the middle of a bucket and the later patch which skips sockets that were already seen. Testing all scenarios directly is a bit difficult, but I did some manual testing to exercise the code paths where GFP_NOWAIT is used and where ERR_PTR(err) is returned. I used the realloc test case included later in this series to trigger a scenario where a realloc happens inside bpf_iter_udp_batch and made a small code tweak to force the first realloc attempt to allocate a too-small batch, thus requiring another attempt with GFP_NOWAIT. Some printks showed both reallocs with the tests passing: Apr 25 23:16:24 crow kernel: go again GFP_USER Apr 25 23:16:24 crow kernel: go again GFP_NOWAIT With this setup, I also forced each of the bpf_iter_udp_realloc_batch calls to return -ENOMEM to ensure that iteration ends and that the read() in userspace fails. 
[1]: https://lore.kernel.org/bpf/CABi4-ogUtMrH8-NVB6W8Xg_F_KDLq=yy-yu-tKr2udXE2Mu1Lg@mail.gmail.com/ [2]: https://lore.kernel.org/bpf/7ed28273-a716-4638-912d-f86f965e54bb@linux.dev/ Signed-off-by: Jordan Rife Signed-off-by: Martin KaFai Lau --- include/linux/udp.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/udp.h b/include/linux/udp.h index 895240177f4f..4e1a672af4c5 100644 --- a/include/linux/udp.h +++ b/include/linux/udp.h @@ -216,6 +216,9 @@ static inline void udp_allow_gso(struct sock *sk) #define udp_portaddr_for_each_entry(__sk, list) \ hlist_for_each_entry(__sk, list, __sk_common.skc_portaddr_node) +#define udp_portaddr_for_each_entry_from(__sk) \ + hlist_for_each_entry_from(__sk, __sk_common.skc_portaddr_node) + #define udp_portaddr_for_each_entry_rcu(__sk, list) \ hlist_for_each_entry_rcu(__sk, list, __sk_common.skc_portaddr_node) -- cgit v1.2.3 From ca732e990fc8222a2d6782ae750304719e212fe8 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Thu, 1 May 2025 12:45:11 +0100 Subject: net: stmmac: add get_interfaces() platform method Add a get_interfaces() platform method to allow platforms to indicate to phylink which interface modes they support - which then allows phylink to validate on initialisation that the configured PHY interface mode is actually supported. 
Signed-off-by: Russell King (Oracle) Link: https://patch.msgid.link/E1uASLn-0021Qd-Mi@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- include/linux/stmmac.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index 8aed09d65b4a..537bced69c46 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -233,6 +233,8 @@ struct plat_stmmacenet_data { u8 tx_sched_algorithm; struct stmmac_rxq_cfg rx_queues_cfg[MTL_MAX_RX_QUEUES]; struct stmmac_txq_cfg tx_queues_cfg[MTL_MAX_TX_QUEUES]; + void (*get_interfaces)(struct stmmac_priv *priv, void *bsp_priv, + unsigned long *interfaces); int (*set_clk_tx_rate)(void *priv, struct clk *clk_tx_i, phy_interface_t interface, int speed); void (*fix_mac_speed)(void *priv, int speed, unsigned int mode); -- cgit v1.2.3 From 9d165dc58055d98658941a33fef9e5da866af3e9 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Thu, 1 May 2025 12:45:27 +0100 Subject: net: stmmac: remove speed_mode_2500() method Remove the speed_mode_2500() platform method which is no longer used or necessary, being superseded by the more flexible get_interfaces() method. 
Signed-off-by: Russell King (Oracle) Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/E1uASM3-0021R3-2B@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- include/linux/stmmac.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index 537bced69c46..26ddf95d23f9 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -241,7 +241,6 @@ struct plat_stmmacenet_data { int (*fix_soc_reset)(void *priv, void __iomem *ioaddr); int (*serdes_powerup)(struct net_device *ndev, void *priv); void (*serdes_powerdown)(struct net_device *ndev, void *priv); - void (*speed_mode_2500)(struct net_device *ndev, void *priv); int (*mac_finish)(struct net_device *ndev, void *priv, unsigned int mode, -- cgit v1.2.3 From 320a66f84022028f1277bf568a5e8987eac6e797 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Thu, 1 May 2025 01:24:02 +0100 Subject: strparser: Remove unused __strp_unpause The last use of __strp_unpause() was removed in 2022 by commit 84c61fe1a75b ("tls: rx: do not use the standard strparser") Remove it. Signed-off-by: Dr. 
David Alan Gilbert Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250501002402.308843-1-linux@treblig.org Signed-off-by: Jakub Kicinski --- include/net/strparser.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/net/strparser.h b/include/net/strparser.h index 0a83010b3a64..0ed73e364faa 100644 --- a/include/net/strparser.h +++ b/include/net/strparser.h @@ -114,8 +114,6 @@ static inline void strp_pause(struct strparser *strp) /* May be called without holding lock for attached socket */ void strp_unpause(struct strparser *strp); -/* Must be called with process lock held (lock_sock) */ -void __strp_unpause(struct strparser *strp); static inline void save_strp_stats(struct strparser *strp, struct strp_aggr_stats *agg_stats) -- cgit v1.2.3 From ac8f09b9210c48934c78fdc6bc167e660eaac928 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Fri, 2 May 2025 00:38:15 +0100 Subject: sctp: Remove unused sctp_assoc_del_peer and sctp_chunk_iif sctp_assoc_del_peer() last use was removed in 2015 by commit 73e6742027f5 ("sctp: Do not try to search for the transport twice") which now uses rm_peer instead of del_peer. sctp_chunk_iif() last use was removed in 2016 by commit 1f45f78f8e51 ("sctp: allow GSO frags to access the chunk too") Remove them. Signed-off-by: Dr. 
David Alan Gilbert Acked-by: Xin Long Link: https://patch.msgid.link/20250501233815.99832-1-linux@treblig.org Signed-off-by: Jakub Kicinski --- include/net/sctp/sm.h | 1 - include/net/sctp/structs.h | 2 -- 2 files changed, 3 deletions(-) (limited to 'include') diff --git a/include/net/sctp/sm.h b/include/net/sctp/sm.h index 64c42bd56bb2..3bfd261a53cc 100644 --- a/include/net/sctp/sm.h +++ b/include/net/sctp/sm.h @@ -161,7 +161,6 @@ const struct sctp_sm_table_entry *sctp_sm_lookup_event( enum sctp_event_type event_type, enum sctp_state state, union sctp_subtype event_subtype); -int sctp_chunk_iif(const struct sctp_chunk *); struct sctp_association *sctp_make_temp_asoc(const struct sctp_endpoint *, struct sctp_chunk *, gfp_t gfp); diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index dcd288fa1bb6..1ad7ce71d0a7 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -2152,8 +2152,6 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *, const union sctp_addr *address, const gfp_t gfp, const int peer_state); -void sctp_assoc_del_peer(struct sctp_association *asoc, - const union sctp_addr *addr); void sctp_assoc_rm_peer(struct sctp_association *asoc, struct sctp_transport *peer); void sctp_assoc_control_transport(struct sctp_association *asoc, -- cgit v1.2.3 From 429ac6211494c12b668dac59811ea8a96db6d757 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 5 May 2025 13:45:11 +0200 Subject: devlink: define enum for attr types of dynamic attributes Devlink param and health reporter fmsg use attributes with dynamic type which is determined according to a different type. Currently used values are NLA_*. The problem is, they are not part of UAPI. They may change, which would cause a break. To make this future safe, introduce an enum that shadows NLA_* values in it and is part of UAPI. Also, this allows to possibly carry types that are unrelated to NLA_* values.
Signed-off-by: Saeed Mahameed Signed-off-by: Jiri Pirko Link: https://patch.msgid.link/20250505114513.53370-3-jiri@resnulli.us Signed-off-by: Jakub Kicinski --- include/uapi/linux/devlink.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 9401aa343673..a5ee0f13740a 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -385,6 +385,21 @@ enum devlink_linecard_state { DEVLINK_LINECARD_STATE_MAX = __DEVLINK_LINECARD_STATE_MAX - 1 }; +/* Variable attribute type. */ +enum devlink_var_attr_type { + /* Following values relate to the internal NLA_* values */ + DEVLINK_VAR_ATTR_TYPE_U8 = 1, + DEVLINK_VAR_ATTR_TYPE_U16, + DEVLINK_VAR_ATTR_TYPE_U32, + DEVLINK_VAR_ATTR_TYPE_U64, + DEVLINK_VAR_ATTR_TYPE_STRING, + DEVLINK_VAR_ATTR_TYPE_FLAG, + DEVLINK_VAR_ATTR_TYPE_NUL_STRING = 10, + DEVLINK_VAR_ATTR_TYPE_BINARY, + __DEVLINK_VAR_ATTR_TYPE_CUSTOM_BASE = 0x80, + /* Any possible custom types, unrelated to NLA_* values go below */ +}; + enum devlink_attr { /* don't change the order or add anything between, this is ABI! */ DEVLINK_ATTR_UNSPEC, -- cgit v1.2.3 From f9e78932eac650cf1385244482b85e65ccaa87cf Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 5 May 2025 13:45:12 +0200 Subject: devlink: avoid param type value translations Assign DEVLINK_PARAM_TYPE_* enum values to DEVLINK_VAR_ATTR_TYPE_* to ensure the same values are used internally and in UAPI. Benefit from that by removing the value translations. 
Signed-off-by: Jiri Pirko Link: https://patch.msgid.link/20250505114513.53370-4-jiri@resnulli.us Signed-off-by: Jakub Kicinski --- include/net/devlink.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/net/devlink.h b/include/net/devlink.h index b8783126c1ed..0091f23a40f7 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -420,11 +420,11 @@ typedef u64 devlink_resource_occ_get_t(void *priv); #define __DEVLINK_PARAM_MAX_STRING_VALUE 32 enum devlink_param_type { - DEVLINK_PARAM_TYPE_U8, - DEVLINK_PARAM_TYPE_U16, - DEVLINK_PARAM_TYPE_U32, - DEVLINK_PARAM_TYPE_STRING, - DEVLINK_PARAM_TYPE_BOOL, + DEVLINK_PARAM_TYPE_U8 = DEVLINK_VAR_ATTR_TYPE_U8, + DEVLINK_PARAM_TYPE_U16 = DEVLINK_VAR_ATTR_TYPE_U16, + DEVLINK_PARAM_TYPE_U32 = DEVLINK_VAR_ATTR_TYPE_U32, + DEVLINK_PARAM_TYPE_STRING = DEVLINK_VAR_ATTR_TYPE_STRING, + DEVLINK_PARAM_TYPE_BOOL = DEVLINK_VAR_ATTR_TYPE_FLAG, }; union devlink_param_value { -- cgit v1.2.3 From 22c64f37e1d4e757b0073a72f1439c2c3509c5cb Mon Sep 17 00:00:00 2001 From: Mohan Kumar G Date: Mon, 5 May 2025 20:58:36 +0530 Subject: wifi: mac80211: Update MCS15 support in link_conf As per IEEE 802.11be-2024 - 9.4.2.321, EHT operation element contains MCS15 Disable subfield as the sixth bit, which is set when MCS15 support is not enabled. Get MCS15 support from EHT operation params and add it in link_conf so that driver can use this value to know if EHT-MCS 15 reception is enabled. Co-developed-by: Dhanavandhana Kannan Signed-off-by: Dhanavandhana Kannan Signed-off-by: Mohan Kumar G Link: https://patch.msgid.link/20250505152836.3266829-1-quic_mkumarg@quicinc.com [remove pointless !! 
for bool assignment] Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 1 + include/net/mac80211.h | 3 +++ 2 files changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 17f917cb4540..420c7f9aa6ee 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -2325,6 +2325,7 @@ struct ieee80211_eht_cap_elem { #define IEEE80211_EHT_OPER_EHT_DEF_PE_DURATION 0x04 #define IEEE80211_EHT_OPER_GROUP_ADDRESSED_BU_IND_LIMIT 0x08 #define IEEE80211_EHT_OPER_GROUP_ADDRESSED_BU_IND_EXP_MASK 0x30 +#define IEEE80211_EHT_OPER_MCS15_DISABLE 0x40 /** * struct ieee80211_eht_operation - eht operation element diff --git a/include/net/mac80211.h b/include/net/mac80211.h index fdafc37d17cc..82617579d910 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -744,6 +744,7 @@ struct ieee80211_parsed_tpe { * @eht_80mhz_full_bw_ul_mumimo: in AP-mode, does this BSS support the * reception of an EHT TB PPDU on an RU that spans the entire PPDU * bandwidth + * @eht_disable_mcs15: disable EHT-MCS 15 reception capability. * @bss_param_ch_cnt: in BSS-mode, the BSS params change count. This * information is the latest known value. It can come from this link's * beacon or from a beacon sent by another link. @@ -852,6 +853,8 @@ struct ieee80211_bss_conf { bool eht_su_beamformee; bool eht_mu_beamformer; bool eht_80mhz_full_bw_ul_mumimo; + bool eht_disable_mcs15; + u8 bss_param_ch_cnt; u8 bss_param_ch_cnt_link_id; }; -- cgit v1.2.3 From 4701073c3debd16d7f534f3eb808bd9b50601c0c Mon Sep 17 00:00:00 2001 From: Wei Fang Date: Tue, 6 May 2025 16:07:22 +0800 Subject: net: enetc: add initial netc-lib driver to support NTMP Some NETC functionality is controlled using control messages sent to the hardware using BD ring interface with 32B descriptor similar to transmit BD ring used on ENETC. This BD ring interface is referred to as command BD ring. 
It is used to configure functionality where the underlying resources may be shared between different entities or are too large to configure using direct registers. Therefore, a messaging protocol called NETC Table Management Protocol (NTMP) is provided for exchanging configuration and management information between the software and the hardware using the command BD ring interface. The management protocol of LS1028A has been retroactively named NTMP 1.0, and its implementation is in enetc_cbdr.c and enetc_qos.c. However, NTMP of i.MX95 has been upgraded to version 2.0, which is incompatible with LS1028A, because the message formats have been changed. Therefore, add the netc-lib driver to support NTMP 2.0 to operate various tables. Note that only the MAC address filter table and the RSS table are supported at the moment. More tables will be supported in subsequent patches. It is worth mentioning that the purpose of the netc-lib driver is to provide some NTMP-based generic interfaces for ENETC and NETC Switch drivers. Currently, it only supports the configurations of some tables. Interfaces such as tc flower and debugfs will be added in the future.
Signed-off-by: Wei Fang Reviewed-by: Vladimir Oltean Link: https://patch.msgid.link/20250506080735.3444381-2-wei.fang@nxp.com Signed-off-by: Jakub Kicinski --- include/linux/fsl/ntmp.h | 121 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 121 insertions(+) create mode 100644 include/linux/fsl/ntmp.h (limited to 'include') diff --git a/include/linux/fsl/ntmp.h b/include/linux/fsl/ntmp.h new file mode 100644 index 000000000000..916dc4fe7de3 --- /dev/null +++ b/include/linux/fsl/ntmp.h @@ -0,0 +1,121 @@ +/* SPDX-License-Identifier: (GPL-2.0+ OR BSD-3-Clause) */ +/* Copyright 2025 NXP */ +#ifndef __NETC_NTMP_H +#define __NETC_NTMP_H + +#include +#include + +struct maft_keye_data { + u8 mac_addr[ETH_ALEN]; + __le16 resv; +}; + +struct maft_cfge_data { + __le16 si_bitmap; + __le16 resv; +}; + +struct netc_cbdr_regs { + void __iomem *pir; + void __iomem *cir; + void __iomem *mr; + + void __iomem *bar0; + void __iomem *bar1; + void __iomem *lenr; +}; + +struct netc_tbl_vers { + u8 maft_ver; + u8 rsst_ver; +}; + +struct netc_cbdr { + struct device *dev; + struct netc_cbdr_regs regs; + + int bd_num; + int next_to_use; + int next_to_clean; + + int dma_size; + void *addr_base; + void *addr_base_align; + dma_addr_t dma_base; + dma_addr_t dma_base_align; + + /* Serialize the order of command BD ring */ + spinlock_t ring_lock; +}; + +struct ntmp_user { + int cbdr_num; /* number of control BD ring */ + struct device *dev; + struct netc_cbdr *ring; + struct netc_tbl_vers tbl; +}; + +struct maft_entry_data { + struct maft_keye_data keye; + struct maft_cfge_data cfge; +}; + +#if IS_ENABLED(CONFIG_NXP_NETC_LIB) +int ntmp_init_cbdr(struct netc_cbdr *cbdr, struct device *dev, + const struct netc_cbdr_regs *regs); +void ntmp_free_cbdr(struct netc_cbdr *cbdr); + +/* NTMP APIs */ +int ntmp_maft_add_entry(struct ntmp_user *user, u32 entry_id, + struct maft_entry_data *maft); +int ntmp_maft_query_entry(struct ntmp_user *user, u32 entry_id, + struct maft_entry_data *maft); 
+int ntmp_maft_delete_entry(struct ntmp_user *user, u32 entry_id); +int ntmp_rsst_update_entry(struct ntmp_user *user, const u32 *table, + int count); +int ntmp_rsst_query_entry(struct ntmp_user *user, + u32 *table, int count); +#else +static inline int ntmp_init_cbdr(struct netc_cbdr *cbdr, struct device *dev, + const struct netc_cbdr_regs *regs) +{ + return 0; +} + +static inline void ntmp_free_cbdr(struct netc_cbdr *cbdr) +{ +} + +static inline int ntmp_maft_add_entry(struct ntmp_user *user, u32 entry_id, + struct maft_entry_data *maft) +{ + return 0; +} + +static inline int ntmp_maft_query_entry(struct ntmp_user *user, u32 entry_id, + struct maft_entry_data *maft) +{ + return 0; +} + +static inline int ntmp_maft_delete_entry(struct ntmp_user *user, u32 entry_id) +{ + return 0; +} + +static inline int ntmp_rsst_update_entry(struct ntmp_user *user, + const u32 *table, int count) +{ + return 0; +} + +static inline int ntmp_rsst_query_entry(struct ntmp_user *user, + u32 *table, int count) +{ + return 0; +} + +#endif + +#endif -- cgit v1.2.3 From c24a65b6a27c78d8540409800886b6622ea86ebf Mon Sep 17 00:00:00 2001 From: Dave Ertman Date: Tue, 15 Apr 2025 21:15:49 -0500 Subject: iidc/ice/irdma: Update IDC to support multiple consumers In preparation of supporting more than a single core PCI driver for RDMA, move ice specific structs like qset_params, qos_info and qos_params from iidc_rdma.h to iidc_rdma_ice.h. Previously, the ice driver was just exporting its entire PF struct to the auxiliary driver, but since each core driver will have its own different PF struct, implement a universal struct that all core drivers can provide to the auxiliary driver through the probe call. 
Reviewed-by: Przemek Kitszel Signed-off-by: Dave Ertman Co-developed-by: Mustafa Ismail Signed-off-by: Mustafa Ismail Co-developed-by: Shiraz Saleem Signed-off-by: Shiraz Saleem Co-developed-by: Tatyana Nikolova Signed-off-by: Tatyana Nikolova Signed-off-by: Tony Nguyen --- include/linux/net/intel/iidc_rdma.h | 67 +++++++++----------------------- include/linux/net/intel/iidc_rdma_ice.h | 69 ++++++++++++++++++++++++++++----- 2 files changed, 78 insertions(+), 58 deletions(-) (limited to 'include') diff --git a/include/linux/net/intel/iidc_rdma.h b/include/linux/net/intel/iidc_rdma.h index 7f1910289534..8baad1082042 100644 --- a/include/linux/net/intel/iidc_rdma.h +++ b/include/linux/net/intel/iidc_rdma.h @@ -5,7 +5,6 @@ #define _IIDC_RDMA_H_ #include -#include #include #include #include @@ -17,14 +16,19 @@ enum iidc_rdma_event_type { IIDC_RDMA_EVENT_AFTER_MTU_CHANGE, IIDC_RDMA_EVENT_BEFORE_TC_CHANGE, IIDC_RDMA_EVENT_AFTER_TC_CHANGE, + IIDC_RDMA_EVENT_WARN_RESET, IIDC_RDMA_EVENT_CRIT_ERR, IIDC_RDMA_EVENT_NBITS /* must be last */ }; +struct iidc_rdma_event { + DECLARE_BITMAP(type, IIDC_RDMA_EVENT_NBITS); + u32 reg; +}; + enum iidc_rdma_reset_type { - IIDC_PFR, - IIDC_CORER, - IIDC_GLOBR, + IIDC_FUNC_RESET, + IIDC_DEV_RESET, }; enum iidc_rdma_protocol { @@ -32,53 +36,22 @@ enum iidc_rdma_protocol { IIDC_RDMA_PROTOCOL_ROCEV2 = BIT(1), }; -#define IIDC_MAX_USER_PRIORITY 8 -#define IIDC_DSCP_PFC_MODE 0x1 - -/* Struct to hold per RDMA Qset info */ -struct iidc_rdma_qset_params { - /* Qset TEID returned to the RDMA driver in - * ice_add_rdma_qset and used by RDMA driver - * for calls to ice_del_rdma_qset - */ - u32 teid; /* Qset TEID */ - u16 qs_handle; /* RDMA driver provides this */ - u16 vport_id; /* VSI index */ - u8 tc; /* TC branch the Qset should belong to */ -}; - -struct iidc_rdma_qos_info { - u64 tc_ctx; - u8 rel_bw; - u8 prio_type; - u8 egress_virt_up; - u8 ingress_virt_up; -}; - -/* Struct to pass QoS info */ -struct iidc_rdma_qos_params { - struct 
iidc_rdma_qos_info tc_info[IEEE_8021QAZ_MAX_TCS]; - u8 up2tc[IIDC_MAX_USER_PRIORITY]; - u8 vport_relative_bw; - u8 vport_priority_type; - u8 num_tc; - u8 pfc_mode; - u8 dscp_map[DSCP_MAX]; -}; - -struct iidc_rdma_event { - DECLARE_BITMAP(type, IIDC_RDMA_EVENT_NBITS); - u32 reg; +/* Structure to be populated by core LAN PCI driver */ +struct iidc_rdma_core_dev_info { + struct pci_dev *pdev; /* PCI device of corresponding to main function */ + struct auxiliary_device *adev; + /* Current active RDMA protocol */ + enum iidc_rdma_protocol rdma_protocol; + void *iidc_priv; /* elements unique to each driver */ }; /* Structure representing auxiliary driver tailored information about the core * PCI dev, each auxiliary driver using the IIDC interface will have an * instance of this struct dedicated to it. */ - struct iidc_rdma_core_auxiliary_dev { struct auxiliary_device adev; - struct ice_pf *pf; + struct iidc_rdma_core_dev_info *cdev_info; }; /* structure representing the auxiliary driver. This struct is to be @@ -88,12 +61,8 @@ struct iidc_rdma_core_auxiliary_dev { */ struct iidc_rdma_core_auxiliary_drv { struct auxiliary_driver adrv; - /* This event_handler is meant to be a blocking call. For instance, - * when a BEFORE_MTU_CHANGE event comes in, the event_handler will not - * return until the auxiliary driver is ready for the MTU change to - * happen. 
- */ - void (*event_handler)(struct ice_pf *pf, struct iidc_rdma_event *event); + void (*event_handler)(struct iidc_rdma_core_dev_info *cdev, + struct iidc_rdma_event *event); }; #endif /* _IIDC_RDMA_H_*/ diff --git a/include/linux/net/intel/iidc_rdma_ice.h b/include/linux/net/intel/iidc_rdma_ice.h index 78d10003d776..b40eed0e13fe 100644 --- a/include/linux/net/intel/iidc_rdma_ice.h +++ b/include/linux/net/intel/iidc_rdma_ice.h @@ -4,16 +4,67 @@ #ifndef _IIDC_RDMA_ICE_H_ #define _IIDC_RDMA_ICE_H_ -struct ice_pf; +#include -int ice_add_rdma_qset(struct ice_pf *pf, struct iidc_rdma_qset_params *qset); -int ice_del_rdma_qset(struct ice_pf *pf, struct iidc_rdma_qset_params *qset); -int ice_rdma_request_reset(struct ice_pf *pf, +#define IIDC_MAX_USER_PRIORITY 8 +#define IIDC_DSCP_PFC_MODE 0x1 + +/** + * struct iidc_rdma_qset_params - Struct to hold per RDMA Qset info + * @teid: TEID of the Qset node + * @qs_handle: SW index of the Qset, RDMA provides this + * @vport_id: VSI index + * @tc: Traffic Class branch the QSet should belong to + */ +struct iidc_rdma_qset_params { + /* Qset TEID returned to the RDMA driver in + * ice_add_rdma_qset and used by RDMA driver + * for calls to ice_del_rdma_qset + */ + u32 teid; + u16 qs_handle; + u16 vport_id; + u8 tc; +}; + +struct iidc_rdma_qos_info { + u64 tc_ctx; + u8 rel_bw; + u8 prio_type; + u8 egress_virt_up; + u8 ingress_virt_up; +}; + +/* Struct to pass QoS info */ +struct iidc_rdma_qos_params { + struct iidc_rdma_qos_info tc_info[IEEE_8021QAZ_MAX_TCS]; + u8 up2tc[IIDC_MAX_USER_PRIORITY]; + u8 vport_relative_bw; + u8 vport_priority_type; + u8 num_tc; + u8 pfc_mode; + u8 dscp_map[DSCP_MAX]; +}; + +struct iidc_rdma_priv_dev_info { + u8 pf_id; + u16 vport_id; + struct net_device *netdev; + struct iidc_rdma_qos_params qos_info; + u8 __iomem *hw_addr; +}; + +int ice_add_rdma_qset(struct iidc_rdma_core_dev_info *cdev, + struct iidc_rdma_qset_params *qset); +int ice_del_rdma_qset(struct iidc_rdma_core_dev_info *cdev, + struct 
iidc_rdma_qset_params *qset); +int ice_rdma_request_reset(struct iidc_rdma_core_dev_info *cdev, enum iidc_rdma_reset_type reset_type); -int ice_rdma_update_vsi_filter(struct ice_pf *pf, u16 vsi_id, bool enable); -void ice_get_qos_params(struct ice_pf *pf, - struct iidc_rdma_qos_params *qos); -int ice_alloc_rdma_qvector(struct ice_pf *pf, struct msix_entry *entry); -void ice_free_rdma_qvector(struct ice_pf *pf, struct msix_entry *entry); +int ice_rdma_update_vsi_filter(struct iidc_rdma_core_dev_info *cdev, u16 vsi_id, + bool enable); +int ice_alloc_rdma_qvector(struct iidc_rdma_core_dev_info *cdev, + struct msix_entry *entry); +void ice_free_rdma_qvector(struct iidc_rdma_core_dev_info *cdev, + struct msix_entry *entry); #endif /* _IIDC_RDMA_ICE_H_*/ -- cgit v1.2.3 From 1b2900db0119c02e6445bb61ec3fba982d10cc8d Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Thu, 8 May 2025 13:30:34 +0300 Subject: ethtool: Block setting of symmetric RSS when non-symmetric rx-flow-hash is requested Symmetric RSS hash requires that: * No other fields besides IP src/dst and/or L4 src/dst are set * If src is set, dst must also be set This restriction was only enforced when RXNFC was configured after symmetric hash was enabled. In the opposite order of operations (RXNFC then symmetric enablement) the check was not performed. Perform the sanity check on set_rxfh as well, by iterating over all flow types hash fields and making sure they are all symmetric. Introduce a function that returns whether a flow type is hashable (not spec only) and needs to be iterated over. To make sure that no one forgets to update the list of hashable flow types when adding new flow types, a static assert is added to draw the developer's attention. The conversion of uapi #defines to enum is not ideal, but as Jakub mentioned [1], we have precedent for that. 
[1] https://lore.kernel.org/netdev/20250324073509.6571ade3@kernel.org/ Reviewed-by: Tariq Toukan Signed-off-by: Gal Pressman Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250508103034.885536-1-gal@nvidia.com Signed-off-by: Jakub Kicinski --- include/uapi/linux/ethtool.h | 134 ++++++++++++++++++++++--------------------- 1 file changed, 69 insertions(+), 65 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index 84833cca29fe..707c1844010c 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -2295,71 +2295,75 @@ static inline int ethtool_validate_duplex(__u8 duplex) #define RXH_XFRM_SYM_OR_XOR (1 << 1) #define RXH_XFRM_NO_CHANGE 0xff -/* L2-L4 network traffic flow types */ -#define TCP_V4_FLOW 0x01 /* hash or spec (tcp_ip4_spec) */ -#define UDP_V4_FLOW 0x02 /* hash or spec (udp_ip4_spec) */ -#define SCTP_V4_FLOW 0x03 /* hash or spec (sctp_ip4_spec) */ -#define AH_ESP_V4_FLOW 0x04 /* hash only */ -#define TCP_V6_FLOW 0x05 /* hash or spec (tcp_ip6_spec; nfc only) */ -#define UDP_V6_FLOW 0x06 /* hash or spec (udp_ip6_spec; nfc only) */ -#define SCTP_V6_FLOW 0x07 /* hash or spec (sctp_ip6_spec; nfc only) */ -#define AH_ESP_V6_FLOW 0x08 /* hash only */ -#define AH_V4_FLOW 0x09 /* hash or spec (ah_ip4_spec) */ -#define ESP_V4_FLOW 0x0a /* hash or spec (esp_ip4_spec) */ -#define AH_V6_FLOW 0x0b /* hash or spec (ah_ip6_spec; nfc only) */ -#define ESP_V6_FLOW 0x0c /* hash or spec (esp_ip6_spec; nfc only) */ -#define IPV4_USER_FLOW 0x0d /* spec only (usr_ip4_spec) */ -#define IP_USER_FLOW IPV4_USER_FLOW -#define IPV6_USER_FLOW 0x0e /* spec only (usr_ip6_spec; nfc only) */ -#define IPV4_FLOW 0x10 /* hash only */ -#define IPV6_FLOW 0x11 /* hash only */ -#define ETHER_FLOW 0x12 /* spec only (ether_spec) */ - -/* Used for GTP-U IPv4 and IPv6. - * The format of GTP packets only includes - * elements such as TEID and GTP version. 
- * It is primarily intended for data communication of the UE. - */ -#define GTPU_V4_FLOW 0x13 /* hash only */ -#define GTPU_V6_FLOW 0x14 /* hash only */ - -/* Use for GTP-C IPv4 and v6. - * The format of these GTP packets does not include TEID. - * Primarily expected to be used for communication - * to create sessions for UE data communication, - * commonly referred to as CSR (Create Session Request). - */ -#define GTPC_V4_FLOW 0x15 /* hash only */ -#define GTPC_V6_FLOW 0x16 /* hash only */ - -/* Use for GTP-C IPv4 and v6. - * Unlike GTPC_V4_FLOW, the format of these GTP packets includes TEID. - * After session creation, it becomes this packet. - * This is mainly used for requests to realize UE handover. - */ -#define GTPC_TEID_V4_FLOW 0x17 /* hash only */ -#define GTPC_TEID_V6_FLOW 0x18 /* hash only */ - -/* Use for GTP-U and extended headers for the PSC (PDU Session Container). - * The format of these GTP packets includes TEID and QFI. - * In 5G communication using UPF (User Plane Function), - * data communication with this extended header is performed. - */ -#define GTPU_EH_V4_FLOW 0x19 /* hash only */ -#define GTPU_EH_V6_FLOW 0x1a /* hash only */ - -/* Use for GTP-U IPv4 and v6 PSC (PDU Session Container) extended headers. - * This differs from GTPU_EH_V(4|6)_FLOW in that it is distinguished by - * UL/DL included in the PSC. - * There are differences in the data included based on Downlink/Uplink, - * and can be used to distinguish packets. - * The functions described so far are useful when you want to - * handle communication from the mobile network in UPF, PGW, etc. 
- */ -#define GTPU_UL_V4_FLOW 0x1b /* hash only */ -#define GTPU_UL_V6_FLOW 0x1c /* hash only */ -#define GTPU_DL_V4_FLOW 0x1d /* hash only */ -#define GTPU_DL_V6_FLOW 0x1e /* hash only */ +enum { + /* L2-L4 network traffic flow types */ + TCP_V4_FLOW = 0x01, /* hash or spec (tcp_ip4_spec) */ + UDP_V4_FLOW = 0x02, /* hash or spec (udp_ip4_spec) */ + SCTP_V4_FLOW = 0x03, /* hash or spec (sctp_ip4_spec) */ + AH_ESP_V4_FLOW = 0x04, /* hash only */ + TCP_V6_FLOW = 0x05, /* hash or spec (tcp_ip6_spec; nfc only) */ + UDP_V6_FLOW = 0x06, /* hash or spec (udp_ip6_spec; nfc only) */ + SCTP_V6_FLOW = 0x07, /* hash or spec (sctp_ip6_spec; nfc only) */ + AH_ESP_V6_FLOW = 0x08, /* hash only */ + AH_V4_FLOW = 0x09, /* hash or spec (ah_ip4_spec) */ + ESP_V4_FLOW = 0x0a, /* hash or spec (esp_ip4_spec) */ + AH_V6_FLOW = 0x0b, /* hash or spec (ah_ip6_spec; nfc only) */ + ESP_V6_FLOW = 0x0c, /* hash or spec (esp_ip6_spec; nfc only) */ + IPV4_USER_FLOW = 0x0d, /* spec only (usr_ip4_spec) */ + IP_USER_FLOW = IPV4_USER_FLOW, + IPV6_USER_FLOW = 0x0e, /* spec only (usr_ip6_spec; nfc only) */ + IPV4_FLOW = 0x10, /* hash only */ + IPV6_FLOW = 0x11, /* hash only */ + ETHER_FLOW = 0x12, /* spec only (ether_spec) */ + + /* Used for GTP-U IPv4 and IPv6. + * The format of GTP packets only includes + * elements such as TEID and GTP version. + * It is primarily intended for data communication of the UE. + */ + GTPU_V4_FLOW = 0x13, /* hash only */ + GTPU_V6_FLOW = 0x14, /* hash only */ + + /* Use for GTP-C IPv4 and v6. + * The format of these GTP packets does not include TEID. + * Primarily expected to be used for communication + * to create sessions for UE data communication, + * commonly referred to as CSR (Create Session Request). + */ + GTPC_V4_FLOW = 0x15, /* hash only */ + GTPC_V6_FLOW = 0x16, /* hash only */ + + /* Use for GTP-C IPv4 and v6. + * Unlike GTPC_V4_FLOW, the format of these GTP packets includes TEID. + * After session creation, it becomes this packet. 
+ * This is mainly used for requests to realize UE handover. + */ + GTPC_TEID_V4_FLOW = 0x17, /* hash only */ + GTPC_TEID_V6_FLOW = 0x18, /* hash only */ + + /* Use for GTP-U and extended headers for the PSC (PDU Session Container). + * The format of these GTP packets includes TEID and QFI. + * In 5G communication using UPF (User Plane Function), + * data communication with this extended header is performed. + */ + GTPU_EH_V4_FLOW = 0x19, /* hash only */ + GTPU_EH_V6_FLOW = 0x1a, /* hash only */ + + /* Use for GTP-U IPv4 and v6 PSC (PDU Session Container) extended headers. + * This differs from GTPU_EH_V(4|6)_FLOW in that it is distinguished by + * UL/DL included in the PSC. + * There are differences in the data included based on Downlink/Uplink, + * and can be used to distinguish packets. + * The functions described so far are useful when you want to + * handle communication from the mobile network in UPF, PGW, etc. + */ + GTPU_UL_V4_FLOW = 0x1b, /* hash only */ + GTPU_UL_V6_FLOW = 0x1c, /* hash only */ + GTPU_DL_V4_FLOW = 0x1d, /* hash only */ + GTPU_DL_V6_FLOW = 0x1e, /* hash only */ + + __FLOW_TYPE_COUNT, +}; /* Flag to enable additional fields in struct ethtool_rx_flow_spec */ #define FLOW_EXT 0x80000000 -- cgit v1.2.3 From 6c14058edfd01cdc0d3018b9069643b0da7c3e80 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 8 May 2025 12:52:36 +0300 Subject: net: dsa: convert to ndo_hwtstamp_get() and ndo_hwtstamp_set() New timestamping API was introduced in commit 66f7223039c0 ("net: add NDOs for configuring hardware timestamping") from kernel v6.6. It is time to convert DSA to the new API, so that the ndo_eth_ioctl() path can be removed completely. Move the ds->ops->port_hwtstamp_get() and ds->ops->port_hwtstamp_set() calls from dsa_user_ioctl() to dsa_user_hwtstamp_get() and dsa_user_hwtstamp_set(). 
Due to the fact that the underlying ifreq type changes to kernel_hwtstamp_config, the drivers and the Ocelot switchdev front-end, all hooked up directly or indirectly, must also be converted all at once. The conversion also updates the comment from dsa_port_supports_hwtstamp(), which is no longer true because kernel_hwtstamp_config is kernel memory and does not need copy_to_user(). I've deliberated whether it is necessary to also update "err != -EOPNOTSUPP" to a more general "!err", but all drivers now either return 0 or -EOPNOTSUPP. The existing logic from the ocelot_ioctl() function, to avoid configuring timestamping if the PHY supports the operation, is obsoleted by more advanced core logic in dev_set_hwtstamp_phylib(). This is only a partial preparation for proper PHY timestamping support. None of these switch drivers currently sets up PTP traps for PHY timestamping, so setting dev->see_all_hwtstamp_requests is not yet necessary and the conversion is relatively trivial. Signed-off-by: Vladimir Oltean Tested-by: Vladimir Oltean # felix, sja1105, mv88e6xxx Reviewed-by: Vadim Fedorenko Link: https://patch.msgid.link/20250508095236.887789-1-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- include/net/dsa.h | 5 +++-- include/soc/mscc/ocelot.h | 7 +++++-- 2 files changed, 8 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index a0a9481c52c2..55e2d97f247e 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -1131,9 +1131,10 @@ struct dsa_switch_ops { * PTP functionality */ int (*port_hwtstamp_get)(struct dsa_switch *ds, int port, - struct ifreq *ifr); + struct kernel_hwtstamp_config *config); int (*port_hwtstamp_set)(struct dsa_switch *ds, int port, - struct ifreq *ifr); + struct kernel_hwtstamp_config *config, + struct netlink_ext_ack *extack); void (*port_txtstamp)(struct dsa_switch *ds, int port, struct sk_buff *skb); bool (*port_rxtstamp)(struct dsa_switch *ds, int port, diff --git
a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h index 6db7fc9dbaa4..48d6deb3efd7 100644 --- a/include/soc/mscc/ocelot.h +++ b/include/soc/mscc/ocelot.h @@ -1073,8 +1073,11 @@ int ocelot_vlan_prepare(struct ocelot *ocelot, int port, u16 vid, bool pvid, int ocelot_vlan_add(struct ocelot *ocelot, int port, u16 vid, bool pvid, bool untagged); int ocelot_vlan_del(struct ocelot *ocelot, int port, u16 vid); -int ocelot_hwstamp_get(struct ocelot *ocelot, int port, struct ifreq *ifr); -int ocelot_hwstamp_set(struct ocelot *ocelot, int port, struct ifreq *ifr); +void ocelot_hwstamp_get(struct ocelot *ocelot, int port, + struct kernel_hwtstamp_config *cfg); +int ocelot_hwstamp_set(struct ocelot *ocelot, int port, + struct kernel_hwtstamp_config *cfg, + struct netlink_ext_ack *extack); int ocelot_port_txtstamp_request(struct ocelot *ocelot, int port, struct sk_buff *skb, struct sk_buff **clone); -- cgit v1.2.3 From a96876057b9e44f60d936f8e4887543555b0593c Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 9 May 2025 14:27:51 -0700 Subject: netlink: fix policy dump for int with validation callback Recent devlink change added validation of an integer value via NLA_POLICY_VALIDATE_FN, for sparse enums. Handle this in policy dump. We can't extract any info out of the callback, so report only the type. 
Fixes: 429ac6211494 ("devlink: define enum for attr types of dynamic attributes") Reported-by: syzbot+01eb26848144516e7f0a@syzkaller.appspotmail.com Link: https://patch.msgid.link/20250509212751.1905149-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/net/netlink.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/net/netlink.h b/include/net/netlink.h index 82e07e272290..90a560dc167a 100644 --- a/include/net/netlink.h +++ b/include/net/netlink.h @@ -321,7 +321,13 @@ enum nla_policy_validation { * All other Unused - but note that it's a union * * Meaning of `validate' field, use via NLA_POLICY_VALIDATE_FN: + * NLA_U8, NLA_U16, + * NLA_U32, NLA_U64, + * NLA_S8, NLA_S16, + * NLA_S32, NLA_S64, + * NLA_MSECS, * NLA_BINARY Validation function called for the attribute. + * * All other Unused - but note that it's a union * * Example: -- cgit v1.2.3 From 03e96b8c11d140fb4ead0b30c2d6e1a294b501ef Mon Sep 17 00:00:00 2001 From: Mina Almasry Date: Thu, 8 May 2025 00:48:21 +0000 Subject: netmem: add niov->type attribute to distinguish different net_iov types Later patches in the series add TX net_iovs where there is no pp associated, so we can't rely on niov->pp->mp_ops to tell what is the type of the net_iov. Add a type enum to the net_iov which tells us the net_iov type. Signed-off-by: Mina Almasry Link: https://patch.msgid.link/20250508004830.4100853-2-almasrymina@google.com Signed-off-by: Paolo Abeni --- include/net/netmem.h | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/netmem.h b/include/net/netmem.h index c61d5b21e7b4..973fdbcfef38 100644 --- a/include/net/netmem.h +++ b/include/net/netmem.h @@ -20,8 +20,17 @@ DECLARE_STATIC_KEY_FALSE(page_pool_mem_providers); */ #define NET_IOV 0x01UL +enum net_iov_type { + NET_IOV_DMABUF, + NET_IOV_IOURING, + + /* Force size to unsigned long to make the NET_IOV_ASSERTS below pass.
+ */ + NET_IOV_MAX = ULONG_MAX +}; + struct net_iov { - unsigned long __unused_padding; + enum net_iov_type type; unsigned long pp_magic; struct page_pool *pp; struct net_iov_area *owner; -- cgit v1.2.3 From e9f3d61db5cb29b3f17f0dc40c3ec2cda2ee93e5 Mon Sep 17 00:00:00 2001 From: Mina Almasry Date: Thu, 8 May 2025 00:48:22 +0000 Subject: net: add get_netmem/put_netmem support Currently net_iovs support only pp ref counts, and do not support a page ref equivalent. This is fine for the RX path as net_iovs are used exclusively with the pp and only pp refcounting is needed there. The TX path however does not use pp ref counts, thus, support for get_page/put_page equivalent is needed for netmem. Support get_netmem/put_netmem. Check the type of the netmem before passing it to page or net_iov specific code to obtain a page ref equivalent. For dmabuf net_iovs, we obtain a ref on the underlying binding. This ensures the entire binding doesn't disappear until all the net_iovs have been put_netmem'ed. We do not need to track the refcount of individual dmabuf net_iovs as we don't allocate/free them from a pool similar to what the buddy allocator does for pages. This code is written to be extensible by other net_iov implementers. 
get_netmem/put_netmem will check the type of the netmem and route it to the correct helper: pages -> [get|put]_page() dmabuf net_iovs -> net_devmem_[get|put]_net_iov() new net_iovs -> new helpers Signed-off-by: Mina Almasry Acked-by: Stanislav Fomichev Link: https://patch.msgid.link/20250508004830.4100853-3-almasrymina@google.com Signed-off-by: Paolo Abeni --- include/linux/skbuff_ref.h | 4 ++-- include/net/netmem.h | 3 +++ 2 files changed, 5 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff_ref.h b/include/linux/skbuff_ref.h index 0f3c58007488..9e49372ef1a0 100644 --- a/include/linux/skbuff_ref.h +++ b/include/linux/skbuff_ref.h @@ -17,7 +17,7 @@ */ static inline void __skb_frag_ref(skb_frag_t *frag) { - get_page(skb_frag_page(frag)); + get_netmem(skb_frag_netmem(frag)); } /** @@ -40,7 +40,7 @@ static inline void skb_page_unref(netmem_ref netmem, bool recycle) if (recycle && napi_pp_put_page(netmem)) return; #endif - put_page(netmem_to_page(netmem)); + put_netmem(netmem); } /** diff --git a/include/net/netmem.h b/include/net/netmem.h index 973fdbcfef38..ecb6b29c93f6 100644 --- a/include/net/netmem.h +++ b/include/net/netmem.h @@ -273,4 +273,7 @@ static inline unsigned long netmem_get_dma_addr(netmem_ref netmem) return __netmem_clear_lsb(netmem)->dma_addr; } +void get_netmem(netmem_ref netmem); +void put_netmem(netmem_ref netmem); + #endif /* _NET_NETMEM_H */ -- cgit v1.2.3 From 8802087d20c0e1c26c4b4fe30e22264bf8285e51 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Thu, 8 May 2025 00:48:23 +0000 Subject: net: devmem: TCP tx netlink api Add bind-tx netlink call to attach dmabuf for TX; queue is not required, only ifindex and dmabuf fd for attachment. 
Signed-off-by: Stanislav Fomichev Signed-off-by: Mina Almasry Link: https://patch.msgid.link/20250508004830.4100853-4-almasrymina@google.com Signed-off-by: Paolo Abeni --- include/uapi/linux/netdev.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h index 7600bf62dbdf..7eb9571786b8 100644 --- a/include/uapi/linux/netdev.h +++ b/include/uapi/linux/netdev.h @@ -219,6 +219,7 @@ enum { NETDEV_CMD_QSTATS_GET, NETDEV_CMD_BIND_RX, NETDEV_CMD_NAPI_SET, + NETDEV_CMD_BIND_TX, __NETDEV_CMD_MAX, NETDEV_CMD_MAX = (__NETDEV_CMD_MAX - 1) -- cgit v1.2.3 From bd61848900bff597764238f3a8ec67c815cd316e Mon Sep 17 00:00:00 2001 From: Mina Almasry Date: Thu, 8 May 2025 00:48:24 +0000 Subject: net: devmem: Implement TX path Augment dmabuf binding to be able to handle TX. In addition to all the RX binding, we also create tx_vec needed for the TX path. Provide API for sendmsg to be able to send dmabufs bound to this device: - Provide a new dmabuf_tx_cmsg which includes the dmabuf to send from. - MSG_ZEROCOPY with SCM_DEVMEM_DMABUF cmsg indicates send from dma-buf. Devmem is uncopyable, so piggyback off the existing MSG_ZEROCOPY implementation, while disabling instances where MSG_ZEROCOPY falls back to copying. We additionally pipe the binding down to the new zerocopy_fill_skb_from_devmem which fills a TX skb with net_iov netmems instead of the traditional page netmems. We also special case skb_frag_dma_map to return the dma-address of these dmabuf net_iovs instead of attempting to map pages. The TX path may release the dmabuf in a context where we cannot wait. This happens when the user unbinds a TX dmabuf while there are still references to its netmems in the TX path. In that case, the netmems will be put_netmem'd from a context where we can't unmap the dmabuf. Resolve this by making __net_devmem_dmabuf_binding_free schedule_work'd. Based on work by Stanislav Fomichev .
A lot of the meat of the implementation came from devmem TCP RFC v1[1], which included the TX path, but Stan did all the rebasing on top of netmem/net_iov. Cc: Stanislav Fomichev Signed-off-by: Kaiyuan Zhang Signed-off-by: Mina Almasry Acked-by: Stanislav Fomichev Link: https://patch.msgid.link/20250508004830.4100853-5-almasrymina@google.com Signed-off-by: Paolo Abeni --- include/linux/skbuff.h | 17 +++++++++++++---- include/net/sock.h | 1 + 2 files changed, 14 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index f3e72be6f634..c7397b17bb08 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1707,13 +1707,16 @@ static inline void skb_set_end_offset(struct sk_buff *skb, unsigned int offset) extern const struct ubuf_info_ops msg_zerocopy_ubuf_ops; struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size, - struct ubuf_info *uarg); + struct ubuf_info *uarg, bool devmem); void msg_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref); +struct net_devmem_dmabuf_binding; + int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk, struct sk_buff *skb, struct iov_iter *from, - size_t length); + size_t length, + struct net_devmem_dmabuf_binding *binding); int zerocopy_fill_skb_from_iter(struct sk_buff *skb, struct iov_iter *from, size_t length); @@ -1721,12 +1724,14 @@ int zerocopy_fill_skb_from_iter(struct sk_buff *skb, static inline int skb_zerocopy_iter_dgram(struct sk_buff *skb, struct msghdr *msg, int len) { - return __zerocopy_sg_from_iter(msg, skb->sk, skb, &msg->msg_iter, len); + return __zerocopy_sg_from_iter(msg, skb->sk, skb, &msg->msg_iter, len, + NULL); } int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb, struct msghdr *msg, int len, - struct ubuf_info *uarg); + struct ubuf_info *uarg, + struct net_devmem_dmabuf_binding *binding); /* Internal */ #define skb_shinfo(SKB) ((struct skb_shared_info *)(skb_end_pointer(SKB))) @@ -3697,6 +3702,10 @@ 
static inline dma_addr_t __skb_frag_dma_map(struct device *dev, size_t offset, size_t size, enum dma_data_direction dir) { + if (skb_frag_is_net_iov(frag)) { + return netmem_to_net_iov(frag->netmem)->dma_addr + offset + + frag->offset; + } return dma_map_page(dev, skb_frag_page(frag), skb_frag_off(frag) + offset, size, dir); } diff --git a/include/net/sock.h b/include/net/sock.h index f0fabb9fd28a..3e15d7105ad2 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1851,6 +1851,7 @@ struct sockcm_cookie { u32 tsflags; u32 ts_opt_id; u32 priority; + u32 dmabuf_id; }; static inline void sockcm_init(struct sockcm_cookie *sockc, -- cgit v1.2.3 From 383faec0fd64b9bff15eb5f700f023ec35520a96 Mon Sep 17 00:00:00 2001 From: Mina Almasry Date: Thu, 8 May 2025 00:48:26 +0000 Subject: net: enable driver support for netmem TX Drivers need to make sure not to pass netmem dma-addrs to the dma-mapping API in order to support netmem TX. Add helpers and netmem_dma_*() helpers that enables special handling of netmem dma-addrs that drivers can use. Document in netmem.rst what drivers need to do to support netmem TX. Signed-off-by: Mina Almasry Acked-by: Stanislav Fomichev Link: https://patch.msgid.link/20250508004830.4100853-7-almasrymina@google.com Signed-off-by: Paolo Abeni --- include/linux/netdevice.h | 2 ++ include/net/netmem.h | 20 ++++++++++++++++++++ 2 files changed, 22 insertions(+) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 773167508c82..32a1e41636a9 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1772,6 +1772,7 @@ enum netdev_reg_state { * @lltx: device supports lockless Tx. Deprecated for real HW * drivers. Mainly used by logical interfaces, such as * bonding and tunnels + * @netmem_tx: device support netmem_tx. * * @name: This is the first field of the "visible" part of this structure * (i.e. as seen by users in the "Space.c" file). 
It is the name @@ -2087,6 +2088,7 @@ struct net_device { struct_group(priv_flags_fast, unsigned long priv_flags:32; unsigned long lltx:1; + unsigned long netmem_tx:1; ); const struct net_device_ops *netdev_ops; const struct header_ops *header_ops; diff --git a/include/net/netmem.h b/include/net/netmem.h index ecb6b29c93f6..386164fb9c18 100644 --- a/include/net/netmem.h +++ b/include/net/netmem.h @@ -8,6 +8,7 @@ #ifndef _NET_NETMEM_H #define _NET_NETMEM_H +#include #include #include @@ -276,4 +277,23 @@ static inline unsigned long netmem_get_dma_addr(netmem_ref netmem) void get_netmem(netmem_ref netmem); void put_netmem(netmem_ref netmem); +#define netmem_dma_unmap_addr_set(NETMEM, PTR, ADDR_NAME, VAL) \ + do { \ + if (!netmem_is_net_iov(NETMEM)) \ + dma_unmap_addr_set(PTR, ADDR_NAME, VAL); \ + else \ + dma_unmap_addr_set(PTR, ADDR_NAME, 0); \ + } while (0) + +static inline void netmem_dma_unmap_page_attrs(struct device *dev, + dma_addr_t addr, size_t size, + enum dma_data_direction dir, + unsigned long attrs) +{ + if (!addr) + return; + + dma_unmap_page_attrs(dev, addr, size, dir, attrs); +} + #endif /* _NET_NETMEM_H */ -- cgit v1.2.3 From dc75c3ced10c611f524e9e444303a0249ce32e43 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Mon, 12 May 2025 22:20:59 +0200 Subject: net: phy: remove stub for mdiobus_register_board_info The functionality of mdiobus_register_board_info() typically isn't optional for the caller. Therefore remove the stub. Note: Currently we have only one caller of mdiobus_register_board_info(), in a DSA/PHYLINK context. Therefore CONFIG_MDIO_DEVICE is selected anyway. 
Signed-off-by: Heiner Kallweit Link: https://patch.msgid.link/410a2222-c4e8-45b0-9091-d49674caeb00@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/phy.h | 9 --------- 1 file changed, 9 deletions(-) (limited to 'include') diff --git a/include/linux/phy.h b/include/linux/phy.h index d62d292024bc..7c29d346d4b3 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -2071,17 +2071,8 @@ struct mdio_board_info { const void *platform_data; }; -#if IS_ENABLED(CONFIG_MDIO_DEVICE) int mdiobus_register_board_info(const struct mdio_board_info *info, unsigned int n); -#else -static inline int mdiobus_register_board_info(const struct mdio_board_info *i, - unsigned int n) -{ - return 0; -} -#endif - /** * phy_module_driver() - Helper macro for registering PHY drivers -- cgit v1.2.3 From c16608005ccb99fbde3a4cd96eab28e16f148abf Mon Sep 17 00:00:00 2001 From: Shay Drory Date: Tue, 13 May 2025 11:19:22 +0300 Subject: net: Look for bonding slaves in the bond's network namespace Update the for_each_netdev_in_bond_rcu macro to iterate through network devices in the bond's network namespace instead of always using init_net. This change is safe because: 1. **Bond-Slave Namespace Relationship**: A bond device and its slaves must reside in the same network namespace. The bond device's namespace is established at creation time and cannot change. 2. **Slave Movement Implications**: Any attempt to move a slave device to a different namespace automatically removes it from the bond, as per kernel networking stack rules. This maintains the invariant that slaves must exist in the same namespace as their bond. This change is part of an effort to enable Link Aggregation (LAG) to work properly inside custom network namespaces. Previously, the macro would only find slave devices in the initial network namespace, preventing proper bonding functionality in custom namespaces. 
Signed-off-by: Shay Drory Signed-off-by: Mark Bloch Link: https://patch.msgid.link/20250513081922.525716-1-mbloch@nvidia.com Signed-off-by: Paolo Abeni --- include/linux/netdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 32a1e41636a9..9e3a2d8452d6 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3268,7 +3268,7 @@ int call_netdevice_notifiers_info(unsigned long val, #define for_each_netdev_continue_rcu(net, d) \ list_for_each_entry_continue_rcu(d, &(net)->dev_base_head, dev_list) #define for_each_netdev_in_bond_rcu(bond, slave) \ - for_each_netdev_rcu(&init_net, slave) \ + for_each_netdev_rcu(dev_net_rcu(bond), slave) \ if (netdev_master_upper_dev_get_rcu(slave) == (bond)) #define net_device_entry(lh) list_entry(lh, struct net_device, dev_list) -- cgit v1.2.3 From b9eef3391de028fdd88fd7a2f81a4834fc98c9ac Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Mon, 12 May 2025 11:27:26 +0200 Subject: xdp: Use nested-BH locking for system_page_pool MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit system_page_pool is a per-CPU variable and relies on disabled BH for its locking. Without per-CPU locking in local_bh_disable() on PREEMPT_RT this data structure requires explicit locking. Make a struct with a page_pool member (original system_page_pool) and a local_lock_t and use local_lock_nested_bh() for locking. This change adds only lockdep coverage and does not alter the functional behaviour for !PREEMPT_RT. 
Cc: Andrew Lunn Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: Jesper Dangaard Brouer Cc: John Fastabend Reviewed-by: Toke Høiland-Jørgensen Signed-off-by: Sebastian Andrzej Siewior Link: https://patch.msgid.link/20250512092736.229935-6-bigeasy@linutronix.de Signed-off-by: Paolo Abeni --- include/linux/netdevice.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 9e3a2d8452d6..73a97cf1bbce 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3503,7 +3503,12 @@ struct softnet_data { }; DECLARE_PER_CPU_ALIGNED(struct softnet_data, softnet_data); -DECLARE_PER_CPU(struct page_pool *, system_page_pool); + +struct page_pool_bh { + struct page_pool *pool; + local_lock_t bh_lock; +}; +DECLARE_PER_CPU(struct page_pool_bh, system_page_pool); #ifndef CONFIG_PREEMPT_RT static inline int dev_recursion_level(void) -- cgit v1.2.3 From 7fe70c06a182a140be9996b02256d907e114479a Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Mon, 12 May 2025 11:27:31 +0200 Subject: net/sched: act_mirred: Move the recursion counter struct netdev_xmit mirred_nest_level is a per-CPU variable and relies on disabled BH for its locking. Without per-CPU locking in local_bh_disable() on PREEMPT_RT this data structure requires explicit locking. Move mirred_nest_level to struct netdev_xmit as u8, provide wrappers. 
Cc: Jamal Hadi Salim Cc: Cong Wang Cc: Jiri Pirko Signed-off-by: Sebastian Andrzej Siewior Reviewed-by: Juri Lelli Link: https://patch.msgid.link/20250512092736.229935-11-bigeasy@linutronix.de Signed-off-by: Paolo Abeni --- include/linux/netdevice_xmit.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/netdevice_xmit.h b/include/linux/netdevice_xmit.h index 38325e070296..848735b3a7c0 100644 --- a/include/linux/netdevice_xmit.h +++ b/include/linux/netdevice_xmit.h @@ -8,6 +8,9 @@ struct netdev_xmit { #ifdef CONFIG_NET_EGRESS u8 skip_txqueue; #endif +#if IS_ENABLED(CONFIG_NET_ACT_MIRRED) + u8 sched_mirred_nest; +#endif }; #endif -- cgit v1.2.3 From c1269d3d12b88151ee4c109624b5022d53a11738 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 13 May 2025 19:39:09 +0000 Subject: tcp: add tcp_rcvbuf_grow() tracepoint Provide a new tracepoint to better understand tcp_rcv_space_adjust() (currently broken) behavior. Call it only when tcp_rcv_space_adjust() has a chance to make a change. I chose to leave trace_tcp_rcv_space_adjust() as is, because commit 6163849d289b ("net: introduce a new tracepoint for tcp_rcv_space_adjust") intent was to get it called after each data delivery to user space. Tested: Pair of hosts in the same rack. Ideally, sk->sk_rcvbuf should be kept small. echo "4096 131072 33554432" >/proc/sys/net/ipv4/tcp_rmem ./netserver perf record -C10 -e tcp:tcp_rcvbuf_grow sleep 30 Trace for a TS enabled TCP flow (with standard ms granularity) perf script // We can see that sk_rcvbuf is growing very fast to tcp_mem[2] 260.500397: tcp:tcp_rcvbuf_grow: time=291 rtt_us=274 copied=110592 inq=0 space=41080 ooo=0 scaling_ratio=230 rcvbuf=131072 ... 260.501333: tcp:tcp_rcvbuf_grow: time=555 rtt_us=364 copied=333824 inq=0 space=110592 ooo=0 scaling_ratio=230 rcvbuf=1399144 ... 260.501664: tcp:tcp_rcvbuf_grow: time=331 rtt_us=330 copied=798720 inq=0 space=333824 ooo=0 scaling_ratio=230 rcvbuf=4110551 ... 
260.502003: tcp:tcp_rcvbuf_grow: time=340 rtt_us=330 copied=1040384 inq=49152 space=798720 ooo=0 scaling_ratio=230 rcvbuf=7006410 ... 260.502483: tcp:tcp_rcvbuf_grow: time=479 rtt_us=330 copied=2658304 inq=49152 space=1040384 ooo=0 scaling_ratio=230 rcvbuf=7006410 ... 260.502899: tcp:tcp_rcvbuf_grow: time=416 rtt_us=413 copied=4026368 inq=147456 space=2658304 ooo=0 scaling_ratio=230 rcvbuf=24622616 ... 260.504233: tcp:tcp_rcvbuf_grow: time=493 rtt_us=487 copied=4800512 inq=196608 space=4026368 ooo=0 scaling_ratio=230 rcvbuf=24622616 ... 260.504792: tcp:tcp_rcvbuf_grow: time=559 rtt_us=551 copied=5672960 inq=49152 space=4800512 ooo=0 scaling_ratio=230 rcvbuf=24622616 ... 260.506614: tcp:tcp_rcvbuf_grow: time=610 rtt_us=607 copied=6688768 inq=180224 space=5672960 ooo=0 scaling_ratio=230 rcvbuf=24622616 ... 260.507280: tcp:tcp_rcvbuf_grow: time=666 rtt_us=656 copied=6868992 inq=49152 space=6688768 ooo=0 scaling_ratio=230 rcvbuf=24622616 ... 260.507979: tcp:tcp_rcvbuf_grow: time=699 rtt_us=699 copied=7000064 inq=0 space=6868992 ooo=0 scaling_ratio=230 rcvbuf=24622616 ... 260.508681: tcp:tcp_rcvbuf_grow: time=703 rtt_us=699 copied=7208960 inq=0 space=7000064 ooo=0 scaling_ratio=230 rcvbuf=24622616 ... 260.509426: tcp:tcp_rcvbuf_grow: time=744 rtt_us=737 copied=7569408 inq=0 space=7208960 ooo=0 scaling_ratio=230 rcvbuf=24622616 ... 260.510213: tcp:tcp_rcvbuf_grow: time=787 rtt_us=770 copied=7880704 inq=49152 space=7569408 ooo=0 scaling_ratio=230 rcvbuf=24622616 ... 260.511013: tcp:tcp_rcvbuf_grow: time=801 rtt_us=798 copied=8339456 inq=0 space=7880704 ooo=0 scaling_ratio=230 rcvbuf=24622616 ... 260.511860: tcp:tcp_rcvbuf_grow: time=847 rtt_us=824 copied=8601600 inq=49152 space=8339456 ooo=0 scaling_ratio=230 rcvbuf=24622616 ... 260.512710: tcp:tcp_rcvbuf_grow: time=850 rtt_us=846 copied=8814592 inq=65536 space=8601600 ooo=0 scaling_ratio=230 rcvbuf=24622616 ... 
260.514428: tcp:tcp_rcvbuf_grow: time=871 rtt_us=865 copied=8855552 inq=49152 space=8814592 ooo=0 scaling_ratio=230 rcvbuf=24622616 ... 260.515333: tcp:tcp_rcvbuf_grow: time=905 rtt_us=882 copied=9228288 inq=49152 space=8855552 ooo=0 scaling_ratio=230 rcvbuf=24622616 ... 260.516237: tcp:tcp_rcvbuf_grow: time=905 rtt_us=896 copied=9371648 inq=49152 space=9228288 ooo=0 scaling_ratio=230 rcvbuf=24622616 ... 260.517149: tcp:tcp_rcvbuf_grow: time=911 rtt_us=909 copied=9543680 inq=49152 space=9371648 ooo=0 scaling_ratio=230 rcvbuf=24622616 ... 260.518070: tcp:tcp_rcvbuf_grow: time=921 rtt_us=921 copied=9793536 inq=0 space=9543680 ooo=0 scaling_ratio=230 rcvbuf=24622616 ... 260.520895: tcp:tcp_rcvbuf_grow: time=948 rtt_us=947 copied=10203136 inq=114688 space=9793536 ooo=0 scaling_ratio=230 rcvbuf=24622616 ... 260.521853: tcp:tcp_rcvbuf_grow: time=959 rtt_us=954 copied=10293248 inq=57344 space=10203136 ooo=0 scaling_ratio=230 rcvbuf=24691992 ... 260.522818: tcp:tcp_rcvbuf_grow: time=964 rtt_us=959 copied=10330112 inq=0 space=10293248 ooo=0 scaling_ratio=230 rcvbuf=24691992 ... 260.524760: tcp:tcp_rcvbuf_grow: time=979 rtt_us=969 copied=10633216 inq=49152 space=10330112 ooo=0 scaling_ratio=230 rcvbuf=24691992 ... 260.526709: tcp:tcp_rcvbuf_grow: time=975 rtt_us=973 copied=12013568 inq=163840 space=10633216 ooo=0 scaling_ratio=230 rcvbuf=25136755 ... 260.527694: tcp:tcp_rcvbuf_grow: time=985 rtt_us=976 copied=12025856 inq=32768 space=12013568 ooo=0 scaling_ratio=230 rcvbuf=33554432 ... 260.530655: tcp:tcp_rcvbuf_grow: time=991 rtt_us=986 copied=12050432 inq=98304 space=12025856 ooo=0 scaling_ratio=230 rcvbuf=33554432 ... 260.533626: tcp:tcp_rcvbuf_grow: time=993 rtt_us=989 copied=12124160 inq=0 space=12050432 ooo=0 scaling_ratio=230 rcvbuf=33554432 ... 260.538606: tcp:tcp_rcvbuf_grow: time=1000 rtt_us=994 copied=12222464 inq=49152 space=12124160 ooo=0 scaling_ratio=230 rcvbuf=33554432 ... 
260.545605: tcp:tcp_rcvbuf_grow: time=1005 rtt_us=998 copied=12263424 inq=81920 space=12222464 ooo=0 scaling_ratio=230 rcvbuf=33554432 ... 260.553626: tcp:tcp_rcvbuf_grow: time=1005 rtt_us=999 copied=12320768 inq=12288 space=12263424 ooo=0 scaling_ratio=230 rcvbuf=33554432 ... 260.589749: tcp:tcp_rcvbuf_grow: time=1001 rtt_us=1000 copied=12398592 inq=16384 space=12320768 ooo=0 scaling_ratio=230 rcvbuf=33554432 ... 260.806577: tcp:tcp_rcvbuf_grow: time=1010 rtt_us=1000 copied=12402688 inq=32768 space=12398592 ooo=0 scaling_ratio=230 rcvbuf=33554432 ... 261.002386: tcp:tcp_rcvbuf_grow: time=1002 rtt_us=1000 copied=12419072 inq=98304 space=12402688 ooo=0 scaling_ratio=230 rcvbuf=33554432 ... 261.803432: tcp:tcp_rcvbuf_grow: time=1013 rtt_us=1000 copied=12468224 inq=49152 space=12419072 ooo=0 scaling_ratio=230 rcvbuf=33554432 ... 261.829533: tcp:tcp_rcvbuf_grow: time=1004 rtt_us=1000 copied=12615680 inq=0 space=12468224 ooo=0 scaling_ratio=230 rcvbuf=33554432 ... 265.505435: tcp:tcp_rcvbuf_grow: time=1007 rtt_us=1000 copied=12632064 inq=32768 space=12615680 ooo=0 scaling_ratio=230 rcvbuf=33554432 ... We also see rtt_us going gradually to 1000 usec, causing massive overshoot. Trace for a usec TS enabled TCP flow (us granularity) perf script // We can see that sk_rcvbuf is growing to a smaller value, thanks to tight rtt_us values. 1509.273955: tcp:tcp_rcvbuf_grow: time=396 rtt_us=377 copied=110592 inq=0 space=41080 ooo=0 scaling_ratio=230 rcvbuf=131072 ... 1509.274366: tcp:tcp_rcvbuf_grow: time=412 rtt_us=365 copied=129024 inq=0 space=110592 ooo=0 scaling_ratio=230 rcvbuf=1399144 ... 1509.274738: tcp:tcp_rcvbuf_grow: time=372 rtt_us=355 copied=194560 inq=0 space=129024 ooo=0 scaling_ratio=230 rcvbuf=1399144 ... 1509.275020: tcp:tcp_rcvbuf_grow: time=282 rtt_us=257 copied=401408 inq=0 space=194560 ooo=0 scaling_ratio=230 rcvbuf=1399144 ... 1509.275190: tcp:tcp_rcvbuf_grow: time=170 rtt_us=144 copied=741376 inq=229376 space=401408 ooo=0 scaling_ratio=230 rcvbuf=3021625 ... 
1509.275300: tcp:tcp_rcvbuf_grow: time=110 rtt_us=110 copied=1146880 inq=65536 space=741376 ooo=0 scaling_ratio=230 rcvbuf=4642390 ... 1509.275449: tcp:tcp_rcvbuf_grow: time=149 rtt_us=106 copied=1310720 inq=737280 space=1146880 ooo=0 scaling_ratio=230 rcvbuf=5498637 ... 1509.275560: tcp:tcp_rcvbuf_grow: time=111 rtt_us=107 copied=1388544 inq=430080 space=1310720 ooo=0 scaling_ratio=230 rcvbuf=5498637 ... 1509.275674: tcp:tcp_rcvbuf_grow: time=114 rtt_us=113 copied=1495040 inq=421888 space=1388544 ooo=0 scaling_ratio=230 rcvbuf=5498637 ... 1509.275800: tcp:tcp_rcvbuf_grow: time=126 rtt_us=126 copied=1572864 inq=77824 space=1495040 ooo=0 scaling_ratio=230 rcvbuf=5498637 ... 1509.275968: tcp:tcp_rcvbuf_grow: time=168 rtt_us=161 copied=1863680 inq=172032 space=1572864 ooo=0 scaling_ratio=230 rcvbuf=5498637 ... 1509.276129: tcp:tcp_rcvbuf_grow: time=161 rtt_us=161 copied=1941504 inq=204800 space=1863680 ooo=0 scaling_ratio=230 rcvbuf=5782790 ... 1509.276288: tcp:tcp_rcvbuf_grow: time=159 rtt_us=158 copied=1990656 inq=131072 space=1941504 ooo=0 scaling_ratio=230 rcvbuf=5782790 ... 1509.276900: tcp:tcp_rcvbuf_grow: time=228 rtt_us=226 copied=2883584 inq=266240 space=1990656 ooo=0 scaling_ratio=230 rcvbuf=5782790 ... 1509.277819: tcp:tcp_rcvbuf_grow: time=242 rtt_us=236 copied=3022848 inq=0 space=2883584 ooo=0 scaling_ratio=230 rcvbuf=12316197 ... 1509.278072: tcp:tcp_rcvbuf_grow: time=253 rtt_us=247 copied=3055616 inq=49152 space=3022848 ooo=0 scaling_ratio=230 rcvbuf=12316197 ... 1509.279560: tcp:tcp_rcvbuf_grow: time=268 rtt_us=264 copied=3133440 inq=180224 space=3055616 ooo=0 scaling_ratio=230 rcvbuf=12316197 ... 1509.279833: tcp:tcp_rcvbuf_grow: time=274 rtt_us=270 copied=3424256 inq=0 space=3133440 ooo=0 scaling_ratio=230 rcvbuf=12316197 ... 1509.282187: tcp:tcp_rcvbuf_grow: time=277 rtt_us=273 copied=3465216 inq=180224 space=3424256 ooo=0 scaling_ratio=230 rcvbuf=12316197 ... 
1509.284685: tcp:tcp_rcvbuf_grow: time=292 rtt_us=292 copied=3481600 inq=147456 space=3465216 ooo=0 scaling_ratio=230 rcvbuf=12316197 ... 1509.284983: tcp:tcp_rcvbuf_grow: time=297 rtt_us=295 copied=3702784 inq=45056 space=3481600 ooo=0 scaling_ratio=230 rcvbuf=12316197 ... 1509.285596: tcp:tcp_rcvbuf_grow: time=311 rtt_us=310 copied=3723264 inq=40960 space=3702784 ooo=0 scaling_ratio=230 rcvbuf=12316197 ... 1509.285909: tcp:tcp_rcvbuf_grow: time=313 rtt_us=304 copied=3846144 inq=196608 space=3723264 ooo=0 scaling_ratio=230 rcvbuf=12316197 ... 1509.291654: tcp:tcp_rcvbuf_grow: time=322 rtt_us=311 copied=3960832 inq=49152 space=3846144 ooo=0 scaling_ratio=230 rcvbuf=12316197 ... 1509.291986: tcp:tcp_rcvbuf_grow: time=333 rtt_us=330 copied=4075520 inq=360448 space=3960832 ooo=0 scaling_ratio=230 rcvbuf=12316197 ... 1509.292319: tcp:tcp_rcvbuf_grow: time=332 rtt_us=332 copied=4079616 inq=65536 space=4075520 ooo=0 scaling_ratio=230 rcvbuf=12316197 ... 1509.292666: tcp:tcp_rcvbuf_grow: time=348 rtt_us=347 copied=4177920 inq=212992 space=4079616 ooo=0 scaling_ratio=230 rcvbuf=12316197 ... 1509.293015: tcp:tcp_rcvbuf_grow: time=349 rtt_us=345 copied=4276224 inq=262144 space=4177920 ooo=0 scaling_ratio=230 rcvbuf=12316197 ... 1509.293371: tcp:tcp_rcvbuf_grow: time=356 rtt_us=346 copied=4415488 inq=49152 space=4276224 ooo=0 scaling_ratio=230 rcvbuf=12316197 ... 1509.515798: tcp:tcp_rcvbuf_grow: time=424 rtt_us=411 copied=4833280 inq=81920 space=4415488 ooo=0 scaling_ratio=230 rcvbuf=12316197 ... 
Signed-off-by: Eric Dumazet Reviewed-by: Wei Wang Reviewed-by: Neal Cardwell Link: https://patch.msgid.link/20250513193919.1089692-2-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/trace/events/tcp.h | 73 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) (limited to 'include') diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h index 53e878fa14d1..006c2116c8f6 100644 --- a/include/trace/events/tcp.h +++ b/include/trace/events/tcp.h @@ -213,6 +213,79 @@ DEFINE_EVENT(tcp_event_sk, tcp_rcv_space_adjust, TP_ARGS(sk) ); +TRACE_EVENT(tcp_rcvbuf_grow, + + TP_PROTO(struct sock *sk, int time), + + TP_ARGS(sk, time), + + TP_STRUCT__entry( + __field(int, time) + __field(__u32, rtt_us) + __field(__u32, copied) + __field(__u32, inq) + __field(__u32, space) + __field(__u32, ooo_space) + __field(__u32, rcvbuf) + __field(__u8, scaling_ratio) + __field(__u16, sport) + __field(__u16, dport) + __field(__u16, family) + __array(__u8, saddr, 4) + __array(__u8, daddr, 4) + __array(__u8, saddr_v6, 16) + __array(__u8, daddr_v6, 16) + __field(const void *, skaddr) + __field(__u64, sock_cookie) + ), + + TP_fast_assign( + struct inet_sock *inet = inet_sk(sk); + struct tcp_sock *tp = tcp_sk(sk); + __be32 *p32; + + __entry->time = time; + __entry->rtt_us = tp->rcv_rtt_est.rtt_us >> 3; + __entry->copied = tp->copied_seq - tp->rcvq_space.seq; + __entry->inq = tp->rcv_nxt - tp->copied_seq; + __entry->space = tp->rcvq_space.space; + __entry->ooo_space = RB_EMPTY_ROOT(&tp->out_of_order_queue) ? 
0 : + TCP_SKB_CB(tp->ooo_last_skb)->end_seq - + tp->rcv_nxt; + + __entry->rcvbuf = sk->sk_rcvbuf; + __entry->scaling_ratio = tp->scaling_ratio; + __entry->sport = ntohs(inet->inet_sport); + __entry->dport = ntohs(inet->inet_dport); + __entry->family = sk->sk_family; + + p32 = (__be32 *) __entry->saddr; + *p32 = inet->inet_saddr; + + p32 = (__be32 *) __entry->daddr; + *p32 = inet->inet_daddr; + + TP_STORE_ADDRS(__entry, inet->inet_saddr, inet->inet_daddr, + sk->sk_v6_rcv_saddr, sk->sk_v6_daddr); + + __entry->skaddr = sk; + __entry->sock_cookie = sock_gen_cookie(sk); + ), + + TP_printk("time=%u rtt_us=%u copied=%u inq=%u space=%u ooo=%u scaling_ratio=%u rcvbuf=%u " + "family=%s sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 " + "saddrv6=%pI6c daddrv6=%pI6c skaddr=%p sock_cookie=%llx", + __entry->time, __entry->rtt_us, __entry->copied, + __entry->inq, __entry->space, __entry->ooo_space, + __entry->scaling_ratio, __entry->rcvbuf, + show_family_name(__entry->family), + __entry->sport, __entry->dport, + __entry->saddr, __entry->daddr, + __entry->saddr_v6, __entry->daddr_v6, + __entry->skaddr, + __entry->sock_cookie) +); + TRACE_EVENT(tcp_retransmit_synack, TP_PROTO(const struct sock *sk, const struct request_sock *req), -- cgit v1.2.3 From ea33537d82921e71f852ea2ed985acc562125efe Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 13 May 2025 19:39:12 +0000 Subject: tcp: add receive queue awareness in tcp_rcv_space_adjust() If the application can not drain fast enough a TCP socket queue, tcp_rcv_space_adjust() can overestimate tp->rcvq_space.space. Then sk->sk_rcvbuf can grow and hit tcp_rmem[2] for no good reason. Fix this by taking into account the number of available bytes. Keeping sk->sk_rcvbuf at the right size allows better cache efficiency.
Signed-off-by: Eric Dumazet Reviewed-by: Wei Wang Link: https://patch.msgid.link/20250513193919.1089692-5-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/tcp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index a8af71623ba7..29f59d50dc73 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -340,7 +340,7 @@ struct tcp_sock { } rcv_rtt_est; /* Receiver queue space */ struct { - u32 space; + int space; u32 seq; u64 time; } rcvq_space; -- cgit v1.2.3 From 1119e5519dcdb7b3527f5d85accf9c7aa02b2b28 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 13 May 2025 15:17:52 -0700 Subject: net: sched: uapi: add more sanely named duplicate defines The TCA_FLOWER_KEY_CFM enum has a UNSPEC and MAX with _OPT in the name, but the real attributes don't. Add a MAX that more reasonably matches the attrs. The PAD in TCA_TAPRIO is the only attr which doesn't have _ATTR in it, perhaps signifying that it's not a real attr? If so interesting idea in abstract but it makes codegen painful. 
Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250513221752.843102-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/uapi/linux/pkt_cls.h | 1 + include/uapi/linux/pkt_sched.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index 490821364165..28d94b11d1aa 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -697,6 +697,7 @@ enum { }; #define TCA_FLOWER_KEY_CFM_OPT_MAX (__TCA_FLOWER_KEY_CFM_OPT_MAX - 1) +#define TCA_FLOWER_KEY_CFM_MAX (__TCA_FLOWER_KEY_CFM_OPT_MAX - 1) #define TCA_FLOWER_MASK_FLAGS_RANGE (1 << 0) /* Range-based match */ diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index 9ea874395717..3e41349f3fa2 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -1182,6 +1182,7 @@ enum { TCA_TAPRIO_ATTR_SCHED_SINGLE_ENTRY, /* single entry */ TCA_TAPRIO_ATTR_SCHED_CLOCKID, /* s32 */ TCA_TAPRIO_PAD, + TCA_TAPRIO_ATTR_PAD = TCA_TAPRIO_PAD, TCA_TAPRIO_ATTR_ADMIN_SCHED, /* The admin sched, only used in dump */ TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME, /* s64 */ TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME_EXTENSION, /* s64 */ -- cgit v1.2.3 From 9cd5ef0b8c04c46a15c8f5d002f02ea0d0477790 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 15 May 2025 10:03:54 +0000 Subject: net: rfs: add sock_rps_delete_flow() helper RFS can exhibit lower performance for workloads using short-lived flows and a small set of 4-tuple. This is often the case for load-testers, using a pair of hosts, if the server has a single listener port. Typical use case : Server : tcp_crr -T128 -F1000 -6 -U -l30 -R 14250 Client : tcp_crr -T128 -F1000 -6 -U -l30 -c -H server | grep local_throughput This is because RFS global hash table contains stale information, when the same RSS key is recycled for another socket and another cpu. Make sure to undo the changes and go back to initial state when a flow is disconnected. 
Performance of the above test is increased by 22 %, going from 372604 transactions per second to 457773. Signed-off-by: Eric Dumazet Reported-by: Octavian Purdila Reviewed-by: Neal Cardwell Link: https://patch.msgid.link/20250515100354.3339920-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/rps.h | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'include') diff --git a/include/net/rps.h b/include/net/rps.h index 507f4aa5d39b..d8ab3a08bcc4 100644 --- a/include/net/rps.h +++ b/include/net/rps.h @@ -123,6 +123,30 @@ static inline void sock_rps_record_flow(const struct sock *sk) #endif } +static inline void sock_rps_delete_flow(const struct sock *sk) +{ +#ifdef CONFIG_RPS + struct rps_sock_flow_table *table; + u32 hash, index; + + if (!static_branch_unlikely(&rfs_needed)) + return; + + hash = READ_ONCE(sk->sk_rxhash); + if (!hash) + return; + + rcu_read_lock(); + table = rcu_dereference(net_hotdata.rps_sock_flow_table); + if (table) { + index = hash & table->mask; + if (READ_ONCE(table->ents[index]) != RPS_NO_CPU) + WRITE_ONCE(table->ents[index], RPS_NO_CPU); + } + rcu_read_unlock(); +#endif +} + static inline u32 rps_input_queue_tail_incr(struct softnet_data *sd) { #ifdef CONFIG_RPS -- cgit v1.2.3 From 7b151e4efdde7cc7cfaae66e497d12487a70c6e9 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Wed, 14 May 2025 20:54:29 +0200 Subject: net: phy: fixed_phy: remove fixed_phy_register_with_gpiod Since its introduction 6 yrs ago this function has never had a user. So remove it.
Signed-off-by: Heiner Kallweit Link: https://patch.msgid.link/ccbeef28-65ae-4e28-b1db-816c44338dee@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/phy_fixed.h | 14 -------------- 1 file changed, 14 deletions(-) (limited to 'include') diff --git a/include/linux/phy_fixed.h b/include/linux/phy_fixed.h index 1acafd86ab13..3392c09b5d24 100644 --- a/include/linux/phy_fixed.h +++ b/include/linux/phy_fixed.h @@ -13,7 +13,6 @@ struct fixed_phy_status { }; struct device_node; -struct gpio_desc; struct net_device; #if IS_ENABLED(CONFIG_FIXED_PHY) @@ -24,11 +23,6 @@ extern struct phy_device *fixed_phy_register(unsigned int irq, struct fixed_phy_status *status, struct device_node *np); -extern struct phy_device * -fixed_phy_register_with_gpiod(unsigned int irq, - struct fixed_phy_status *status, - struct gpio_desc *gpiod); - extern void fixed_phy_unregister(struct phy_device *phydev); extern int fixed_phy_set_link_update(struct phy_device *phydev, int (*link_update)(struct net_device *, @@ -46,14 +40,6 @@ static inline struct phy_device *fixed_phy_register(unsigned int irq, return ERR_PTR(-ENODEV); } -static inline struct phy_device * -fixed_phy_register_with_gpiod(unsigned int irq, - struct fixed_phy_status *status, - struct gpio_desc *gpiod) -{ - return ERR_PTR(-ENODEV); -} - static inline void fixed_phy_unregister(struct phy_device *phydev) { } -- cgit v1.2.3 From a462903fa22541f212134fba81084315ad843e6e Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 16 May 2025 13:59:27 +0200 Subject: net: netlink: reduce extack cookie size Seems like the extack cookie hasn't found any users outside of wireless, which always uses nl_set_extack_cookie_u64(). Thus, allocating 20 bytes for it is pointless, reduce that to 8 bytes, and add a BUILD_BUG_ON() to ensure it's enough (obviously it is, for a u64, but in case it changes again.) 
Signed-off-by: Johannes Berg Link: https://patch.msgid.link/20250516115927.38209-2-johannes@sipsolutions.net Signed-off-by: Jakub Kicinski --- include/linux/netlink.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/netlink.h b/include/linux/netlink.h index c3ae84a77e16..882e9c1b6c1d 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -63,7 +63,7 @@ netlink_kernel_create(struct net *net, int unit, struct netlink_kernel_cfg *cfg) } /* this can be increased when necessary - don't expose to userland */ -#define NETLINK_MAX_COOKIE_LEN 20 +#define NETLINK_MAX_COOKIE_LEN 8 #define NETLINK_MAX_FMTMSG_LEN 80 /** @@ -212,6 +212,7 @@ static inline void nl_set_extack_cookie_u64(struct netlink_ext_ack *extack, { if (!extack) return; + BUILD_BUG_ON(sizeof(extack->cookie) < sizeof(cookie)); memcpy(extack->cookie, &cookie, sizeof(cookie)); extack->cookie_len = sizeof(cookie); } -- cgit v1.2.3 From 84b21e61ebd64931d865ce3df49d930db8c9e2cd Mon Sep 17 00:00:00 2001 From: Gur Stavi Date: Sun, 18 May 2025 13:00:54 +0300 Subject: queue_api: reduce risk of name collision over txq Rename local variable in macros from txq to _txq. When macro parameter get_desc is expanded it is likely to have a txq token that refers to a different txq variable at the caller's site.
Signed-off-by: Gur Stavi Reviewed-by: Jakub Kicinski Link: https://patch.msgid.link/95b60d218f004308486d92ed17c8cc6f28bac09d.1747559621.git.gur.stavi@huawei.com Signed-off-by: Jakub Kicinski --- include/net/netdev_queues.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/net/netdev_queues.h b/include/net/netdev_queues.h index 069ff35a72de..ba2eaf39089b 100644 --- a/include/net/netdev_queues.h +++ b/include/net/netdev_queues.h @@ -288,27 +288,27 @@ netdev_txq_completed_mb(struct netdev_queue *dev_queue, #define netif_subqueue_try_stop(dev, idx, get_desc, start_thrs) \ ({ \ - struct netdev_queue *txq; \ + struct netdev_queue *_txq; \ \ - txq = netdev_get_tx_queue(dev, idx); \ - netif_txq_try_stop(txq, get_desc, start_thrs); \ + _txq = netdev_get_tx_queue(dev, idx); \ + netif_txq_try_stop(_txq, get_desc, start_thrs); \ }) #define netif_subqueue_maybe_stop(dev, idx, get_desc, stop_thrs, start_thrs) \ ({ \ - struct netdev_queue *txq; \ + struct netdev_queue *_txq; \ \ - txq = netdev_get_tx_queue(dev, idx); \ - netif_txq_maybe_stop(txq, get_desc, stop_thrs, start_thrs); \ + _txq = netdev_get_tx_queue(dev, idx); \ + netif_txq_maybe_stop(_txq, get_desc, stop_thrs, start_thrs); \ }) #define netif_subqueue_completed_wake(dev, idx, pkts, bytes, \ get_desc, start_thrs) \ ({ \ - struct netdev_queue *txq; \ + struct netdev_queue *_txq; \ \ - txq = netdev_get_tx_queue(dev, idx); \ - netif_txq_completed_wake(txq, pkts, bytes, \ + _txq = netdev_get_tx_queue(dev, idx); \ + netif_txq_completed_wake(_txq, pkts, bytes, \ get_desc, start_thrs); \ }) -- cgit v1.2.3 From 31be641d74267d98317ef5a2b90e6200511cabb3 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Thu, 15 May 2025 10:11:54 +0200 Subject: net: phy: make mdio consumer / device layer a separate module After having factored out the provider part from mdio_bus.c, we can make the mdio consumer / device layer a separate module. 
This also allows to remove Kconfig symbol MDIO_DEVICE. The module init / exit functions from mdio_bus.c no longer have to be called from phy_device.c. The link order defined in drivers/net/phy/Makefile ensures that init / exit functions are called in the right order. Signed-off-by: Heiner Kallweit Link: https://patch.msgid.link/dba6b156-5748-44ce-b5e2-e8dc2fcee5a7@gmail.com Signed-off-by: Paolo Abeni --- include/linux/phy.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include') diff --git a/include/linux/phy.h b/include/linux/phy.h index 7c29d346d4b3..92a88b5ce356 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -2033,9 +2033,6 @@ int phy_ethtool_set_link_ksettings(struct net_device *ndev, const struct ethtool_link_ksettings *cmd); int phy_ethtool_nway_reset(struct net_device *ndev); -int __init mdio_bus_init(void); -void mdio_bus_exit(void); - int phy_ethtool_get_strings(struct phy_device *phydev, u8 *data); int phy_ethtool_get_sset_count(struct phy_device *phydev); int phy_ethtool_get_stats(struct phy_device *phydev, -- cgit v1.2.3 From 4c2bd7913f52b1e5c978edf56cdef39c30a1f603 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sat, 17 May 2025 13:08:10 -0700 Subject: net: let lockdep compare instance locks AFAIU always returning -1 from lockdep's compare function basically disables checking of dependencies between given locks. Try to be a little more precise about what guarantees that instance locks won't deadlock. Right now we only nest them under protection of rtnl_lock. Mostly in unregister_netdevice_many() and dev_close_many(). 
Acked-by: Stanislav Fomichev Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250517200810.466531-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/net/netdev_lock.h | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/net/netdev_lock.h b/include/net/netdev_lock.h index 2a753813f849..c345afecd4c5 100644 --- a/include/net/netdev_lock.h +++ b/include/net/netdev_lock.h @@ -99,16 +99,15 @@ static inline void netdev_unlock_ops_compat(struct net_device *dev) static inline int netdev_lock_cmp_fn(const struct lockdep_map *a, const struct lockdep_map *b) { - /* Only lower devices currently grab the instance lock, so no - * real ordering issues can occur. In the near future, only - * hardware devices will grab instance lock which also does not - * involve any ordering. Suppress lockdep ordering warnings - * until (if) we start grabbing instance lock on pure SW - * devices (bond/team/veth/etc). - */ if (a == b) return 0; - return -1; + + /* Allow locking multiple devices only under rtnl_lock, + * the exact order doesn't matter. + * Note that upper devices don't lock their ops, so nesting + * mostly happens in batched device removal for now. + */ + return lockdep_rtnl_is_held() ? -1 : 1; } #define netdev_lockdep_set_classes(dev) \ -- cgit v1.2.3 From 3f1716ee0f6c63795e6d225e3f5ec3825cd2bd57 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Sat, 17 May 2025 22:34:32 +0200 Subject: net: phy: fixed_phy: remove irq argument from fixed_phy_add All callers pass PHY_POLL, therefore remove irq argument from fixed_phy_add(). 
Signed-off-by: Heiner Kallweit Reviewed-by: Simon Horman Reviewed-by: Florian Fainelli Acked-by: Greg Ungerer Link: https://patch.msgid.link/b3b9b3bc-c310-4a54-b376-c909c83575de@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/phy_fixed.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/phy_fixed.h b/include/linux/phy_fixed.h index 3392c09b5d24..316bb4deda37 100644 --- a/include/linux/phy_fixed.h +++ b/include/linux/phy_fixed.h @@ -17,8 +17,7 @@ struct net_device; #if IS_ENABLED(CONFIG_FIXED_PHY) extern int fixed_phy_change_carrier(struct net_device *dev, bool new_carrier); -extern int fixed_phy_add(unsigned int irq, int phy_id, - struct fixed_phy_status *status); +int fixed_phy_add(int phy_id, struct fixed_phy_status *status); extern struct phy_device *fixed_phy_register(unsigned int irq, struct fixed_phy_status *status, struct device_node *np); @@ -28,7 +27,7 @@ extern int fixed_phy_set_link_update(struct phy_device *phydev, int (*link_update)(struct net_device *, struct fixed_phy_status *)); #else -static inline int fixed_phy_add(unsigned int irq, int phy_id, +static inline int fixed_phy_add(int phy_id, struct fixed_phy_status *status) { return -ENODEV; -- cgit v1.2.3 From d23b4af5df3900fb0b4e1a05cb8119dd1c395519 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Sat, 17 May 2025 22:35:56 +0200 Subject: net: phy: fixed_phy: remove irq argument from fixed_phy_register All callers pass PHY_POLL, therefore remove irq argument from fixed_phy_register(). Note: I keep the irq argument in fixed_phy_add_gpiod() for now, for the case that somebody may want to use a GPIO interrupt in the future, by e.g. adding a call to fwnode_irq_get() to fixed_phy_get_gpiod(). 
Signed-off-by: Heiner Kallweit Reviewed-by: Simon Horman Reviewed-by: Florian Fainelli Link: https://patch.msgid.link/31cdb232-a5e9-4997-a285-cb9a7d208124@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/phy_fixed.h | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/phy_fixed.h b/include/linux/phy_fixed.h index 316bb4deda37..634149a73c2a 100644 --- a/include/linux/phy_fixed.h +++ b/include/linux/phy_fixed.h @@ -18,9 +18,8 @@ struct net_device; #if IS_ENABLED(CONFIG_FIXED_PHY) extern int fixed_phy_change_carrier(struct net_device *dev, bool new_carrier); int fixed_phy_add(int phy_id, struct fixed_phy_status *status); -extern struct phy_device *fixed_phy_register(unsigned int irq, - struct fixed_phy_status *status, - struct device_node *np); +struct phy_device *fixed_phy_register(struct fixed_phy_status *status, + struct device_node *np); extern void fixed_phy_unregister(struct phy_device *phydev); extern int fixed_phy_set_link_update(struct phy_device *phydev, @@ -32,9 +31,9 @@ static inline int fixed_phy_add(int phy_id, { return -ENODEV; } -static inline struct phy_device *fixed_phy_register(unsigned int irq, - struct fixed_phy_status *status, - struct device_node *np) +static inline struct phy_device * +fixed_phy_register(struct fixed_phy_status *status, + struct device_node *np) { return ERR_PTR(-ENODEV); } -- cgit v1.2.3 From 4ba1c5bb4811f560a86697311cb4e9741e047a5d Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Sat, 17 May 2025 22:37:29 +0200 Subject: net: phy: fixed_phy: constify status argument where possible Constify the passed struct fixed_phy_status *status where possible. 
Signed-off-by: Heiner Kallweit Reviewed-by: Simon Horman Reviewed-by: Florian Fainelli Link: https://patch.msgid.link/d1764b62-8538-408b-a4e3-b63715481a38@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/phy_fixed.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/phy_fixed.h b/include/linux/phy_fixed.h index 634149a73c2a..5399b9e41e35 100644 --- a/include/linux/phy_fixed.h +++ b/include/linux/phy_fixed.h @@ -17,8 +17,8 @@ struct net_device; #if IS_ENABLED(CONFIG_FIXED_PHY) extern int fixed_phy_change_carrier(struct net_device *dev, bool new_carrier); -int fixed_phy_add(int phy_id, struct fixed_phy_status *status); -struct phy_device *fixed_phy_register(struct fixed_phy_status *status, +int fixed_phy_add(int phy_id, const struct fixed_phy_status *status); +struct phy_device *fixed_phy_register(const struct fixed_phy_status *status, struct device_node *np); extern void fixed_phy_unregister(struct phy_device *phydev); @@ -27,12 +27,12 @@ extern int fixed_phy_set_link_update(struct phy_device *phydev, struct fixed_phy_status *)); #else static inline int fixed_phy_add(int phy_id, - struct fixed_phy_status *status) + const struct fixed_phy_status *status) { return -ENODEV; } static inline struct phy_device * -fixed_phy_register(struct fixed_phy_status *status, +fixed_phy_register(const struct fixed_phy_status *status, struct device_node *np) { return ERR_PTR(-ENODEV); -- cgit v1.2.3 From f0a56c17e64bb5e7cdb9295df2b5fc21e4949005 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 15 May 2025 19:27:18 -0700 Subject: inet: Remove rtnl_is_held arg of lwtunnel_valid_encap_type(_attr)?(). Commit f130a0cc1b4f ("inet: fix lwtunnel_valid_encap_type() lock imbalance") added the rtnl_is_held argument as a temporary fix while I'm converting nexthop and IPv6 routing table to per-netns RTNL or RCU. Now all callers of lwtunnel_valid_encap_type() do not hold RTNL. Let's remove the argument. 
Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250516022759.44392-3-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- include/net/lwtunnel.h | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/net/lwtunnel.h b/include/net/lwtunnel.h index 39cd50300a18..c306ebe379a0 100644 --- a/include/net/lwtunnel.h +++ b/include/net/lwtunnel.h @@ -116,11 +116,9 @@ int lwtunnel_encap_add_ops(const struct lwtunnel_encap_ops *op, int lwtunnel_encap_del_ops(const struct lwtunnel_encap_ops *op, unsigned int num); int lwtunnel_valid_encap_type(u16 encap_type, - struct netlink_ext_ack *extack, - bool rtnl_is_held); + struct netlink_ext_ack *extack); int lwtunnel_valid_encap_type_attr(struct nlattr *attr, int len, - struct netlink_ext_ack *extack, - bool rtnl_is_held); + struct netlink_ext_ack *extack); int lwtunnel_build_state(struct net *net, u16 encap_type, struct nlattr *encap, unsigned int family, const void *cfg, @@ -203,15 +201,14 @@ static inline int lwtunnel_encap_del_ops(const struct lwtunnel_encap_ops *op, } static inline int lwtunnel_valid_encap_type(u16 encap_type, - struct netlink_ext_ack *extack, - bool rtnl_is_held) + struct netlink_ext_ack *extack) { NL_SET_ERR_MSG(extack, "CONFIG_LWTUNNEL is not enabled in this kernel"); return -EOPNOTSUPP; } + static inline int lwtunnel_valid_encap_type_attr(struct nlattr *attr, int len, - struct netlink_ext_ack *extack, - bool rtnl_is_held) + struct netlink_ext_ack *extack) { /* return 0 since we are not walking attr looking for * RTA_ENCAP_TYPE attribute on nexthops. -- cgit v1.2.3 From d7500fbfb12067ee7313f13f4c58f771be3018ab Mon Sep 17 00:00:00 2001 From: Bert Karwatzki Date: Wed, 21 May 2025 00:34:29 +0200 Subject: wifi: check if socket flags are valid Checking the SOCK_WIFI_STATUS flag bit in sk_flags may give wrong results since sk_flags are part of a union and the union is used otherwise. 
Add sk_requests_wifi_status() which checks if sk is non-NULL, sk is a full socket (so flags are valid) and checks the flag bit. Fixes: 76a853f86c97 ("wifi: free SKBTX_WIFI_STATUS skb tx_flags flag") Suggested-by: Johannes Berg Signed-off-by: Bert Karwatzki Reviewed-by: Jason Xing Link: https://patch.msgid.link/20250520223430.6875-1-spasswolf@web.de [edit commit message, fix indentation] Signed-off-by: Johannes Berg --- include/net/sock.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index f0fabb9fd28a..75c12e14fc47 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2821,6 +2821,12 @@ sk_is_refcounted(struct sock *sk) return !sk_fullsock(sk) || !sock_flag(sk, SOCK_RCU_FREE); } +static inline bool +sk_requests_wifi_status(struct sock *sk) +{ + return sk && sk_fullsock(sk) && sock_flag(sk, SOCK_WIFI_STATUS); +} + /* Checks if this SKB belongs to an HW offloaded socket * and whether any SW fallbacks are required based on dev. * Check decrypted mark in case skb_orphan() cleared socket. -- cgit v1.2.3 From b803c4a4f78834b31ebfbbcea350473333760559 Mon Sep 17 00:00:00 2001 From: Vincent Mailhol Date: Fri, 2 May 2025 02:12:10 +0900 Subject: can: dev: add struct data_bittiming_params to group FD parameters This is a preparation patch for the introduction of CAN XL. CAN FD and CAN XL use similar bittiming parameters. Add one level of nesting for all the CAN FD parameters. Typically: priv->can.data_bittiming; becomes: priv->can.fd.data_bittiming; This way, the CAN XL equivalent (to be introduced later) would be: priv->can.xl.data_bittiming; Add the new struct data_bittiming_params which contains all the data bittiming parameters, including the TDC and the callback functions. With this done, update all the CAN FD drivers to make use of the new layout.
Acked-by: Oliver Hartkopp Signed-off-by: Vincent Mailhol Link: https://patch.msgid.link/20250501171213.2161572-2-mailhol.vincent@wanadoo.fr [mkl: fix rcar_canfd] Signed-off-by: Marc Kleine-Budde --- include/linux/can/dev.h | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/linux/can/dev.h b/include/linux/can/dev.h index 23492213ea35..492d23bec7be 100644 --- a/include/linux/can/dev.h +++ b/include/linux/can/dev.h @@ -38,6 +38,17 @@ enum can_termination_gpio { CAN_TERMINATION_GPIO_MAX, }; +struct data_bittiming_params { + const struct can_bittiming_const *data_bittiming_const; + struct can_bittiming data_bittiming; + const struct can_tdc_const *tdc_const; + struct can_tdc tdc; + const u32 *data_bitrate_const; + unsigned int data_bitrate_const_cnt; + int (*do_set_data_bittiming)(struct net_device *dev); + int (*do_get_auto_tdcv)(const struct net_device *dev, u32 *tdcv); +}; + /* * CAN common private data */ @@ -45,16 +56,11 @@ struct can_priv { struct net_device *dev; struct can_device_stats can_stats; - const struct can_bittiming_const *bittiming_const, - *data_bittiming_const; - struct can_bittiming bittiming, data_bittiming; - const struct can_tdc_const *tdc_const; - struct can_tdc tdc; - + const struct can_bittiming_const *bittiming_const; + struct can_bittiming bittiming; + struct data_bittiming_params fd; unsigned int bitrate_const_cnt; const u32 *bitrate_const; - const u32 *data_bitrate_const; - unsigned int data_bitrate_const_cnt; u32 bitrate_max; struct can_clock clock; @@ -77,14 +83,12 @@ struct can_priv { struct delayed_work restart_work; int (*do_set_bittiming)(struct net_device *dev); - int (*do_set_data_bittiming)(struct net_device *dev); int (*do_set_mode)(struct net_device *dev, enum can_mode mode); int (*do_set_termination)(struct net_device *dev, u16 term); int (*do_get_state)(const struct net_device *dev, enum can_state *state); int (*do_get_berr_counter)(const struct 
net_device *dev, struct can_berr_counter *bec); - int (*do_get_auto_tdcv)(const struct net_device *dev, u32 *tdcv); }; static inline bool can_tdc_is_enabled(const struct can_priv *priv) @@ -114,11 +118,11 @@ static inline bool can_tdc_is_enabled(const struct can_priv *priv) */ static inline s32 can_get_relative_tdco(const struct can_priv *priv) { - const struct can_bittiming *dbt = &priv->data_bittiming; + const struct can_bittiming *dbt = &priv->fd.data_bittiming; s32 sample_point_in_tc = (CAN_SYNC_SEG + dbt->prop_seg + dbt->phase_seg1) * dbt->brp; - return (s32)priv->tdc.tdco - sample_point_in_tc; + return (s32)priv->fd.tdc.tdco - sample_point_in_tc; } /* helper to define static CAN controller features at device creation time */ -- cgit v1.2.3 From 04425292a62c15d1fde714522beaf8f3c2ed1de9 Mon Sep 17 00:00:00 2001 From: Hsin-chen Chuang Date: Wed, 16 Apr 2025 09:53:35 +0000 Subject: Bluetooth: Introduce HCI Driver protocol Although commit 75ddcd5ad40e ("Bluetooth: btusb: Configure altsetting for HCI_USER_CHANNEL") has enabled the HCI_USER_CHANNEL user to send out SCO data through USB Bluetooth chips, it's observed that with the patch HFP is flaky on most of the existing USB Bluetooth controllers: Intel chips sometimes send out no packet for Transparent codec; MTK chips may generate SCO data with a wrong handle for CVSD codec; RTK could split the data with a wrong packet size for Transparent codec; ... etc. To address the issue above one needs to reset the altsetting back to zero when there is no active SCO connection, which is the same as the BlueZ behavior, and another benefit is the bus doesn't need to reserve bandwidth when no SCO connection. 
This patch adds the infrastructure that allows the user space program to talk to Bluetooth drivers directly: - Define the new packet type HCI_DRV_PKT which is specifically used for communication between the user space program and the Bluetooth drivers - hci_send_frame intercepts the packets and invokes drivers' HCI Drv callbacks (so far only defined for btusb) - 2 kinds of events to user space: Command Status and Command Complete; the former simply returns the status while the latter may contain additional response data. Cc: chromeos-bluetooth-upstreaming@chromium.org Fixes: b16b327edb4d ("Bluetooth: btusb: add sysfs attribute to control USB alt setting") Signed-off-by: Hsin-chen Chuang Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci.h | 1 + include/net/bluetooth/hci_core.h | 3 ++ include/net/bluetooth/hci_drv.h | 76 ++++++++++++++++++++++++++++++++++++++++ include/net/bluetooth/hci_mon.h | 2 ++ 4 files changed, 82 insertions(+) create mode 100644 include/net/bluetooth/hci_drv.h (limited to 'include') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 797992019f9e..2502febf4da7 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -494,6 +494,7 @@ enum { #define HCI_EVENT_PKT 0x04 #define HCI_ISODATA_PKT 0x05 #define HCI_DIAG_PKT 0xf0 +#define HCI_DRV_PKT 0xf1 #define HCI_VENDOR_PKT 0xff /* HCI packet types */ diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 54bfeeaa0995..57f6175fd1cd 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -31,6 +31,7 @@ #include #include +#include #include #include #include @@ -613,6 +614,8 @@ struct hci_dev { struct list_head monitored_devices; bool advmon_pend_notify; + struct hci_drv *hci_drv; + #if IS_ENABLED(CONFIG_BT_LEDS) struct led_trigger *power_led; #endif diff --git a/include/net/bluetooth/hci_drv.h b/include/net/bluetooth/hci_drv.h new file mode 100644 index
000000000000..2f01c44f05ec --- /dev/null +++ b/include/net/bluetooth/hci_drv.h @@ -0,0 +1,76 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2025 Google Corporation + */ + +#ifndef __HCI_DRV_H +#define __HCI_DRV_H + +#include + +#include +#include + +struct hci_drv_cmd_hdr { + __le16 opcode; + __le16 len; +} __packed; + +struct hci_drv_ev_hdr { + __le16 opcode; + __le16 len; +} __packed; + +#define HCI_DRV_EV_CMD_STATUS 0x0000 +struct hci_drv_ev_cmd_status { + __le16 opcode; + __u8 status; +} __packed; + +#define HCI_DRV_EV_CMD_COMPLETE 0x0001 +struct hci_drv_ev_cmd_complete { + __le16 opcode; + __u8 status; + __u8 data[]; +} __packed; + +#define HCI_DRV_STATUS_SUCCESS 0x00 +#define HCI_DRV_STATUS_UNSPECIFIED_ERROR 0x01 +#define HCI_DRV_STATUS_UNKNOWN_COMMAND 0x02 +#define HCI_DRV_STATUS_INVALID_PARAMETERS 0x03 + +#define HCI_DRV_MAX_DRIVER_NAME_LENGTH 32 + +/* Common commands that make sense on all drivers start from 0x0000 */ +#define HCI_DRV_OP_READ_INFO 0x0000 +#define HCI_DRV_READ_INFO_SIZE 0 +struct hci_drv_rp_read_info { + __u8 driver_name[HCI_DRV_MAX_DRIVER_NAME_LENGTH]; + __le16 num_supported_commands; + __le16 supported_commands[]; +} __packed; + +/* Driver specific OGF (Opcode Group Field) + * Commands in this group may have different meanings across different drivers. 
+ */ +#define HCI_DRV_OGF_DRIVER_SPECIFIC 0x01 + +int hci_drv_cmd_status(struct hci_dev *hdev, u16 cmd, u8 status); +int hci_drv_cmd_complete(struct hci_dev *hdev, u16 cmd, u8 status, void *rp, + size_t rp_len); +int hci_drv_process_cmd(struct hci_dev *hdev, struct sk_buff *cmd_skb); + +struct hci_drv_handler { + int (*func)(struct hci_dev *hdev, void *data, u16 data_len); + size_t data_len; +}; + +struct hci_drv { + size_t common_handler_count; + const struct hci_drv_handler *common_handlers; + + size_t specific_handler_count; + const struct hci_drv_handler *specific_handlers; +}; + +#endif /* __HCI_DRV_H */ diff --git a/include/net/bluetooth/hci_mon.h b/include/net/bluetooth/hci_mon.h index 082f89531b88..bbd752494ef9 100644 --- a/include/net/bluetooth/hci_mon.h +++ b/include/net/bluetooth/hci_mon.h @@ -51,6 +51,8 @@ struct hci_mon_hdr { #define HCI_MON_CTRL_EVENT 17 #define HCI_MON_ISO_TX_PKT 18 #define HCI_MON_ISO_RX_PKT 19 +#define HCI_MON_DRV_TX_PKT 20 +#define HCI_MON_DRV_RX_PKT 21 struct hci_mon_new_index { __u8 type; -- cgit v1.2.3 From dd0ccf858057b793beb3779be7576d92c93cf828 Mon Sep 17 00:00:00 2001 From: Pauli Virtanen Date: Sun, 27 Apr 2025 14:27:25 +0300 Subject: Bluetooth: add support for SIOCETHTOOL ETHTOOL_GET_TS_INFO Bluetooth needs some way for user to get supported so_timestamping flags for the different socket types. Use SIOCETHTOOL API for this purpose. As hci_dev is not associated with struct net_device, the existing implementation can't be reused, so we add a small one here. Add support (only) for ETHTOOL_GET_TS_INFO command. The API differs slightly from netdev in that the result depends also on socket type. 
Signed-off-by: Pauli Virtanen Acked-by: Willem de Bruijn Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/bluetooth.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h index bbefde319f95..114299bd8b98 100644 --- a/include/net/bluetooth/bluetooth.h +++ b/include/net/bluetooth/bluetooth.h @@ -29,6 +29,7 @@ #include #include #include +#include #define BT_SUBSYS_VERSION 2 #define BT_SUBSYS_REVISION 22 @@ -448,6 +449,9 @@ void hci_req_cmd_complete(struct hci_dev *hdev, u16 opcode, u8 status, hci_req_complete_t *req_complete, hci_req_complete_skb_t *req_complete_skb); +int hci_ethtool_ts_info(unsigned int index, int sk_proto, + struct kernel_ethtool_ts_info *ts_info); + #define HCI_REQ_START BIT(0) #define HCI_REQ_SKB BIT(1) -- cgit v1.2.3 From 23205562ffc8de20f57afdd984858cab29e77968 Mon Sep 17 00:00:00 2001 From: Pauli Virtanen Date: Sat, 3 May 2025 17:08:21 +0300 Subject: Bluetooth: separate CIS_LINK and BIS_LINK link types Use separate link type id for unicast and broadcast ISO connections. These connection types are handled with separate HCI commands, socket API is different, and hci_conn has union fields that are different in the two cases, so they shall not be mixed up. Currently in most places it is attempted to distinguish ucast by bacmp(&c->dst, BDADDR_ANY) but it is wrong as dst is set for bcast sink hci_conn in iso_conn_ready(). Additionally checking sync_handle might be OK, but depends on details of bcast conn configuration flow. To avoid complicating it, use separate link types. 
Fixes: f764a6c2c1e4 ("Bluetooth: ISO: Add broadcast support") Signed-off-by: Pauli Virtanen Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci.h | 3 ++- include/net/bluetooth/hci_core.h | 48 ++++++++++++++++++---------------------- 2 files changed, 23 insertions(+), 28 deletions(-) (limited to 'include') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 2502febf4da7..82cbd54443ac 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -558,7 +558,8 @@ enum { #define ESCO_LINK 0x02 /* Low Energy links do not have defined link type. Use invented one */ #define LE_LINK 0x80 -#define ISO_LINK 0x82 +#define CIS_LINK 0x82 +#define BIS_LINK 0x83 #define INVALID_LINK 0xff /* LMP features */ diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 57f6175fd1cd..2b261e74e2c4 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -999,7 +999,8 @@ static inline void hci_conn_hash_add(struct hci_dev *hdev, struct hci_conn *c) case ESCO_LINK: h->sco_num++; break; - case ISO_LINK: + case CIS_LINK: + case BIS_LINK: h->iso_num++; break; } @@ -1025,7 +1026,8 @@ static inline void hci_conn_hash_del(struct hci_dev *hdev, struct hci_conn *c) case ESCO_LINK: h->sco_num--; break; - case ISO_LINK: + case CIS_LINK: + case BIS_LINK: h->iso_num--; break; } @@ -1042,7 +1044,8 @@ static inline unsigned int hci_conn_num(struct hci_dev *hdev, __u8 type) case SCO_LINK: case ESCO_LINK: return h->sco_num; - case ISO_LINK: + case CIS_LINK: + case BIS_LINK: return h->iso_num; default: return 0; @@ -1103,7 +1106,7 @@ static inline struct hci_conn *hci_conn_hash_lookup_bis(struct hci_dev *hdev, rcu_read_lock(); list_for_each_entry_rcu(c, &h->list, list) { - if (bacmp(&c->dst, ba) || c->type != ISO_LINK) + if (bacmp(&c->dst, ba) || c->type != BIS_LINK) continue; if (c->iso_qos.bcast.bis == bis) { @@ -1125,7 +1128,7 @@ hci_conn_hash_lookup_create_pa_sync(struct hci_dev *hdev) 
rcu_read_lock(); list_for_each_entry_rcu(c, &h->list, list) { - if (c->type != ISO_LINK) + if (c->type != BIS_LINK) continue; if (!test_bit(HCI_CONN_CREATE_PA_SYNC, &c->flags)) @@ -1151,8 +1154,8 @@ hci_conn_hash_lookup_per_adv_bis(struct hci_dev *hdev, rcu_read_lock(); list_for_each_entry_rcu(c, &h->list, list) { - if (bacmp(&c->dst, ba) || c->type != ISO_LINK || - !test_bit(HCI_CONN_PER_ADV, &c->flags)) + if (bacmp(&c->dst, ba) || c->type != BIS_LINK || + !test_bit(HCI_CONN_PER_ADV, &c->flags)) continue; if (c->iso_qos.bcast.big == big && @@ -1241,7 +1244,7 @@ static inline struct hci_conn *hci_conn_hash_lookup_cis(struct hci_dev *hdev, rcu_read_lock(); list_for_each_entry_rcu(c, &h->list, list) { - if (c->type != ISO_LINK || !bacmp(&c->dst, BDADDR_ANY)) + if (c->type != CIS_LINK) continue; /* Match CIG ID if set */ @@ -1273,7 +1276,7 @@ static inline struct hci_conn *hci_conn_hash_lookup_cig(struct hci_dev *hdev, rcu_read_lock(); list_for_each_entry_rcu(c, &h->list, list) { - if (c->type != ISO_LINK || !bacmp(&c->dst, BDADDR_ANY)) + if (c->type != CIS_LINK) continue; if (handle == c->iso_qos.ucast.cig) { @@ -1296,17 +1299,7 @@ static inline struct hci_conn *hci_conn_hash_lookup_big(struct hci_dev *hdev, rcu_read_lock(); list_for_each_entry_rcu(c, &h->list, list) { - if (c->type != ISO_LINK) - continue; - - /* An ISO_LINK hcon with BDADDR_ANY as destination - * address is a Broadcast connection. A Broadcast - * slave connection is associated with a PA train, - * so the sync_handle can be used to differentiate - * from unicast. 
- */ - if (bacmp(&c->dst, BDADDR_ANY) && - c->sync_handle == HCI_SYNC_HANDLE_INVALID) + if (c->type != BIS_LINK) continue; if (handle == c->iso_qos.bcast.big) { @@ -1330,7 +1323,7 @@ hci_conn_hash_lookup_big_sync_pend(struct hci_dev *hdev, rcu_read_lock(); list_for_each_entry_rcu(c, &h->list, list) { - if (c->type != ISO_LINK) + if (c->type != BIS_LINK) continue; if (handle == c->iso_qos.bcast.big && num_bis == c->num_bis) { @@ -1353,8 +1346,8 @@ hci_conn_hash_lookup_big_state(struct hci_dev *hdev, __u8 handle, __u16 state) rcu_read_lock(); list_for_each_entry_rcu(c, &h->list, list) { - if (bacmp(&c->dst, BDADDR_ANY) || c->type != ISO_LINK || - c->state != state) + if (c->type != BIS_LINK || bacmp(&c->dst, BDADDR_ANY) || + c->state != state) continue; if (handle == c->iso_qos.bcast.big) { @@ -1377,8 +1370,8 @@ hci_conn_hash_lookup_pa_sync_big_handle(struct hci_dev *hdev, __u8 big) rcu_read_lock(); list_for_each_entry_rcu(c, &h->list, list) { - if (c->type != ISO_LINK || - !test_bit(HCI_CONN_PA_SYNC, &c->flags)) + if (c->type != BIS_LINK || + !test_bit(HCI_CONN_PA_SYNC, &c->flags)) continue; if (c->iso_qos.bcast.big == big) { @@ -1400,7 +1393,7 @@ hci_conn_hash_lookup_pa_sync_handle(struct hci_dev *hdev, __u16 sync_handle) rcu_read_lock(); list_for_each_entry_rcu(c, &h->list, list) { - if (c->type != ISO_LINK) + if (c->type != BIS_LINK) continue; /* Ignore the listen hcon, we are looking @@ -2015,7 +2008,8 @@ static inline int hci_proto_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, case ESCO_LINK: return sco_connect_ind(hdev, bdaddr, flags); - case ISO_LINK: + case CIS_LINK: + case BIS_LINK: return iso_connect_ind(hdev, bdaddr, flags); default: -- cgit v1.2.3 From 8b8762eeec59b959fbca60afffe21265bce67168 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 20 May 2025 09:19:05 -0700 Subject: tools: ynl-gen: add makefile deps for neigh Kory is reporting build issues after recent additions to YNL if the system headers are old. 
Link: https://lore.kernel.org/20250519164949.597d6e92@kmaincent-XPS-13-7390 Reported-by: Kory Maincent Fixes: 0939a418b3b0 ("tools: ynl: submsg: reverse parse / error reporting") Tested-by: Kory Maincent Link: https://patch.msgid.link/20250520161916.413298-2-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/uapi/linux/neighbour.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/neighbour.h b/include/uapi/linux/neighbour.h index 5e67a7eaf4a7..b851c36ad25d 100644 --- a/include/uapi/linux/neighbour.h +++ b/include/uapi/linux/neighbour.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -#ifndef __LINUX_NEIGHBOUR_H -#define __LINUX_NEIGHBOUR_H +#ifndef _UAPI__LINUX_NEIGHBOUR_H +#define _UAPI__LINUX_NEIGHBOUR_H #include #include -- cgit v1.2.3 From a5bd029c733b8ae790d5873e2afeb88b58e3a151 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 19 May 2025 10:50:04 -0700 Subject: net: add skb_crc32c() Add skb_crc32c(), which calculates the CRC32C of a sk_buff. It will replace __skb_checksum(), which unnecessarily supports arbitrary checksums. Compared to __skb_checksum(), skb_crc32c(): - Uses the correct type for CRC32C values (u32, not __wsum). - Does not require the caller to provide a skb_checksum_ops struct. - Is faster because it does not use indirect calls and does not use the very slow crc32c_combine(). According to commit 2817a336d4d5 ("net: skb_checksum: allow custom update/combine for walking skb") which added __skb_checksum(), the original motivation for the abstraction layer was to avoid code duplication for CRC32C and other checksums in the future. However: - No additional checksums showed up after CRC32C. __skb_checksum() is only used with the "regular" net checksum and CRC32C. - Indirect calls are expensive. Commit 2544af0344ba ("net: avoid indirect calls in L4 checksum calculation") worked around this using the INDIRECT_CALL_1 macro. 
But that only avoided the indirect call for the net checksum, and at the cost of an extra branch. - The checksums use different types (__wsum and u32), causing casts to be needed. - It made the checksums of fragments be combined (rather than chained) for both checksums, despite this being highly counterproductive for CRC32C due to how slow crc32c_combine() is. This can clearly be seen in commit 4c2f24549644 ("sctp: linearize early if it's not GSO") which tried to work around this performance bug. With a dedicated function for each checksum, we can instead just use the proper strategy for each checksum. As shown by the following tables, the new function skb_crc32c() is faster than __skb_checksum(), with the improvement varying greatly from 5% to 2500% depending on the case. The largest improvements come from fragmented packets, mainly due to eliminating the inefficient crc32c_combine(). But linear packets are improved too, especially shorter ones, mainly due to eliminating indirect calls. These benchmarks were done on AMD Zen 5. 
On that CPU, Linux uses IBRS instead of retpoline; an even greater improvement might be seen with retpoline: Linear sk_buffs Length in bytes __skb_checksum cycles skb_crc32c cycles =============== ===================== ================= 64 43 18 256 94 77 1420 204 161 16384 1735 1642 Nonlinear sk_buffs (even split between head and one fragment) Length in bytes __skb_checksum cycles skb_crc32c cycles =============== ===================== ================= 64 579 22 256 829 77 1420 1506 194 16384 4365 1682 Signed-off-by: Eric Biggers Reviewed-by: Hannes Reinecke Link: https://patch.msgid.link/20250519175012.36581-3-ebiggers@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index c7397b17bb08..7ccc6356acac 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -4203,6 +4203,7 @@ __wsum __skb_checksum(const struct sk_buff *skb, int offset, int len, __wsum csum, const struct skb_checksum_ops *ops); __wsum skb_checksum(const struct sk_buff *skb, int offset, int len, __wsum csum); +u32 skb_crc32c(const struct sk_buff *skb, int offset, int len, u32 crc); static inline void * __must_check __skb_header_pointer(const struct sk_buff *skb, int offset, int len, -- cgit v1.2.3 From 99de9d4022e5004f95f425f798f0aa01e87949ff Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 19 May 2025 10:50:07 -0700 Subject: sctp: use skb_crc32c() instead of __skb_checksum() Make sctp_compute_cksum() just use the new function skb_crc32c(), instead of calling __skb_checksum() with a skb_checksum_ops struct that does CRC32C. This is faster and simpler. 
Signed-off-by: Eric Biggers Reviewed-by: Hannes Reinecke Link: https://patch.msgid.link/20250519175012.36581-6-ebiggers@kernel.org Signed-off-by: Jakub Kicinski --- include/net/sctp/checksum.h | 29 +++-------------------------- 1 file changed, 3 insertions(+), 26 deletions(-) (limited to 'include') diff --git a/include/net/sctp/checksum.h b/include/net/sctp/checksum.h index 291465c25810..654d37ec0402 100644 --- a/include/net/sctp/checksum.h +++ b/include/net/sctp/checksum.h @@ -15,8 +15,6 @@ * Dinakaran Joseph * Jon Grimm * Sridhar Samudrala - * - * Rewritten to use libcrc32c by: * Vlad Yasevich */ @@ -25,39 +23,18 @@ #include #include -#include -#include - -static inline __wsum sctp_csum_update(const void *buff, int len, __wsum sum) -{ - return (__force __wsum)crc32c((__force __u32)sum, buff, len); -} - -static inline __wsum sctp_csum_combine(__wsum csum, __wsum csum2, - int offset, int len) -{ - return (__force __wsum)crc32c_combine((__force __u32)csum, - (__force __u32)csum2, len); -} - -static const struct skb_checksum_ops sctp_csum_ops = { - .update = sctp_csum_update, - .combine = sctp_csum_combine, -}; static inline __le32 sctp_compute_cksum(const struct sk_buff *skb, unsigned int offset) { struct sctphdr *sh = (struct sctphdr *)(skb->data + offset); __le32 old = sh->checksum; - __wsum new; + u32 new; sh->checksum = 0; - new = ~__skb_checksum(skb, offset, skb->len - offset, ~(__wsum)0, - &sctp_csum_ops); + new = ~skb_crc32c(skb, offset, skb->len - offset, ~0); sh->checksum = old; - - return cpu_to_le32((__force __u32)new); + return cpu_to_le32(new); } #endif /* __sctp_checksum_h__ */ -- cgit v1.2.3 From 70c96c7cb9f035d5b960021f2450afa6240e66b4 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 19 May 2025 10:50:08 -0700 Subject: net: fold __skb_checksum() into skb_checksum() Now that the only remaining caller of __skb_checksum() is skb_checksum(), fold __skb_checksum() into skb_checksum(). 
This makes struct skb_checksum_ops unnecessary, so remove that too and simply do the "regular" net checksum. It also makes the wrapper functions csum_partial_ext() and csum_block_add_ext() unnecessary, so remove those too and just use the underlying functions. Signed-off-by: Eric Biggers Reviewed-by: Hannes Reinecke Link: https://patch.msgid.link/20250519175012.36581-7-ebiggers@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 9 --------- include/net/checksum.h | 12 ------------ 2 files changed, 21 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 7ccc6356acac..018c07230513 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -4192,15 +4192,6 @@ static inline int memcpy_to_msg(struct msghdr *msg, void *data, int len) return copy_to_iter(data, len, &msg->msg_iter) == len ? 0 : -EFAULT; } -struct skb_checksum_ops { - __wsum (*update)(const void *mem, int len, __wsum wsum); - __wsum (*combine)(__wsum csum, __wsum csum2, int offset, int len); -}; - -extern const struct skb_checksum_ops *crc32c_csum_stub __read_mostly; - -__wsum __skb_checksum(const struct sk_buff *skb, int offset, int len, - __wsum csum, const struct skb_checksum_ops *ops); __wsum skb_checksum(const struct sk_buff *skb, int offset, int len, __wsum csum); u32 skb_crc32c(const struct sk_buff *skb, int offset, int len, u32 crc); diff --git a/include/net/checksum.h b/include/net/checksum.h index 243f972267b8..e57986b173f8 100644 --- a/include/net/checksum.h +++ b/include/net/checksum.h @@ -98,12 +98,6 @@ csum_block_add(__wsum csum, __wsum csum2, int offset) return csum_add(csum, csum_shift(csum2, offset)); } -static __always_inline __wsum -csum_block_add_ext(__wsum csum, __wsum csum2, int offset, int len) -{ - return csum_block_add(csum, csum2, offset); -} - static __always_inline __wsum csum_block_sub(__wsum csum, __wsum csum2, int offset) { @@ -115,12 +109,6 @@ static __always_inline __wsum csum_unfold(__sum16 n) 
return (__force __wsum)n; } -static __always_inline -__wsum csum_partial_ext(const void *buff, int len, __wsum sum) -{ - return csum_partial(buff, len, sum); -} - #define CSUM_MANGLED_0 ((__force __sum16)0xffff) static __always_inline void csum_replace_by_diff(__sum16 *sum, __wsum diff) -- cgit v1.2.3 From b82f72292ab4c65250bd734281464a6ab1ff4133 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 19 May 2025 10:50:09 -0700 Subject: lib/crc32: remove unused support for CRC32C combination crc32c_combine() and crc32c_shift() are no longer used (except by the KUnit test that tests them), and their current implementation is very slow. Remove them. Signed-off-by: Eric Biggers Reviewed-by: Hannes Reinecke Link: https://patch.msgid.link/20250519175012.36581-8-ebiggers@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/crc32.h | 23 ----------------------- 1 file changed, 23 deletions(-) (limited to 'include') diff --git a/include/linux/crc32.h b/include/linux/crc32.h index 69c2e8bb3782..7f7d0be8a0ac 100644 --- a/include/linux/crc32.h +++ b/include/linux/crc32.h @@ -76,29 +76,6 @@ static inline u32 crc32_le_combine(u32 crc1, u32 crc2, size_t len2) return crc32_le_shift(crc1, len2) ^ crc2; } -u32 crc32c_shift(u32 crc, size_t len); - -/** - * crc32c_combine - Combine two crc32c check values into one. For two sequences - * of bytes, seq1 and seq2 with lengths len1 and len2, crc32c() - * check values were calculated for each, crc1 and crc2. - * - * @crc1: crc32c of the first block - * @crc2: crc32c of the second block - * @len2: length of the second block - * - * Return: The crc32c() check value of seq1 and seq2 concatenated, requiring - * only crc1, crc2, and len2. Note: If seq_full denotes the concatenated - * memory area of seq1 with seq2, and crc_full the crc32c() value of - * seq_full, then crc_full == crc32c_combine(crc1, crc2, len2) when - * crc_full was seeded with the same initializer as crc1, and crc2 seed - * was 0. See also crc_combine_test(). 
- */ -static inline u32 crc32c_combine(u32 crc1, u32 crc2, size_t len2) -{ - return crc32c_shift(crc1, len2) ^ crc2; -} - #define crc32(seed, data, length) crc32_le(seed, (unsigned char const *)(data), length) /* -- cgit v1.2.3 From ea6342d98928e243f2024fb97a9b4d42ee55dfba Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 19 May 2025 10:50:10 -0700 Subject: net: add skb_copy_and_crc32c_datagram_iter() Since skb_copy_and_hash_datagram_iter() is used only with CRC32C, the crypto_ahash abstraction provides no value. Add skb_copy_and_crc32c_datagram_iter() which just calls crc32c() directly. This is faster and simpler. It also doesn't have the weird dependency issue where skb_copy_and_hash_datagram_iter() depends on CONFIG_CRYPTO_HASH=y without that being expressed explicitly in the kconfig (presumably because it was too heavyweight for NET to select). The new function is conditional on the hidden boolean symbol NET_CRC32C, which selects CRC32. So it gets compiled only when something that actually needs CRC32C packet checksums is enabled, it has no implicit dependency, and it doesn't depend on the heavyweight crypto layer. 
Signed-off-by: Eric Biggers Reviewed-by: Hannes Reinecke Link: https://patch.msgid.link/20250519175012.36581-9-ebiggers@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 018c07230513..510adf63c211 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -4137,6 +4137,8 @@ int skb_copy_and_csum_datagram_msg(struct sk_buff *skb, int hlen, int skb_copy_and_hash_datagram_iter(const struct sk_buff *skb, int offset, struct iov_iter *to, int len, struct ahash_request *hash); +int skb_copy_and_crc32c_datagram_iter(const struct sk_buff *skb, int offset, + struct iov_iter *to, int len, u32 *crcp); int skb_copy_datagram_from_iter(struct sk_buff *skb, int offset, struct iov_iter *from, int len); int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *frm); -- cgit v1.2.3 From c93f75b2d755c35b596084ddd3feb3528284a53f Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 19 May 2025 10:50:12 -0700 Subject: net: remove skb_copy_and_hash_datagram_iter() Now that skb_copy_and_hash_datagram_iter() is no longer used, remove it. 
Signed-off-by: Eric Biggers Reviewed-by: Hannes Reinecke Link: https://patch.msgid.link/20250519175012.36581-11-ebiggers@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 510adf63c211..5520524c93bf 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -274,7 +274,6 @@ SKB_DATA_ALIGN(sizeof(struct sk_buff)) + \ SKB_DATA_ALIGN(sizeof(struct skb_shared_info))) -struct ahash_request; struct net_device; struct scatterlist; struct pipe_inode_info; @@ -4134,9 +4133,6 @@ static inline int skb_copy_datagram_msg(const struct sk_buff *from, int offset, } int skb_copy_and_csum_datagram_msg(struct sk_buff *skb, int hlen, struct msghdr *msg); -int skb_copy_and_hash_datagram_iter(const struct sk_buff *skb, int offset, - struct iov_iter *to, int len, - struct ahash_request *hash); int skb_copy_and_crc32c_datagram_iter(const struct sk_buff *skb, int offset, struct iov_iter *to, int len, u32 *crcp); int skb_copy_datagram_from_iter(struct sk_buff *skb, int offset, -- cgit v1.2.3 From 31afd6bc55cc0093c3e5b0a368319e423d4de8ea Mon Sep 17 00:00:00 2001 From: Christian Marangi Date: Sat, 17 May 2025 22:13:45 +0200 Subject: net: phy: pass PHY driver to .match_phy_device OP Pass PHY driver pointer to .match_phy_device OP in addition to phydev. Having access to the PHY driver struct might be useful to check the PHY ID of the driver is being matched for in case the PHY ID scanned in the phydev is not consistent. A scenario for this is a PHY that change PHY ID after a firmware is loaded, in such case, the PHY ID stored in PHY device struct is not valid anymore and PHY will manually scan the ID in the match_phy_device function. Having the PHY driver info is also useful for those PHY driver that implement multiple simple .match_phy_device OP to match specific MMD PHY ID. 
With this extra info if the parsing logic is the same, the matching function can be generalized by using the phy_id in the PHY driver instead of hardcoding. Rust wrapper callback is updated to align to the new match_phy_device arguments. Suggested-by: Russell King (Oracle) Reviewed-by: Russell King (Oracle) Signed-off-by: Christian Marangi Reviewed-by: Benno Lossin # for Rust Reviewed-by: FUJITA Tomonori Link: https://patch.msgid.link/20250517201353.5137-2-ansuelsmth@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/phy.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/phy.h b/include/linux/phy.h index 92a88b5ce356..10e66d45a8e8 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -990,7 +990,8 @@ struct phy_driver { * driver for the given phydev. If NULL, matching is based on * phy_id and phy_id_mask. */ - int (*match_phy_device)(struct phy_device *phydev); + int (*match_phy_device)(struct phy_device *phydev, + const struct phy_driver *phydrv); /** * @set_wol: Some devices (e.g. qnap TS-119P II) require PHY -- cgit v1.2.3 From d6c45707ac84c2d9f274ece1cea4dddb97996bde Mon Sep 17 00:00:00 2001 From: Christian Marangi Date: Sat, 17 May 2025 22:13:48 +0200 Subject: net: phy: introduce genphy_match_phy_device() Introduce new API, genphy_match_phy_device(), to provide a way to check to match a PHY driver for a PHY device based on the info stored in the PHY device struct. The function generalize the logic used in phy_bus_match() to check the PHY ID whether if C45 or C22 ID should be used for matching. This is useful for custom .match_phy_device function that wants to use the generic logic under some condition. 
(example a PHY is already setup and provide the correct PHY ID) Reviewed-by: Russell King (Oracle) Signed-off-by: Christian Marangi Link: https://patch.msgid.link/20250517201353.5137-5-ansuelsmth@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/phy.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/phy.h b/include/linux/phy.h index 10e66d45a8e8..32b9da274115 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -1868,6 +1868,9 @@ char *phy_attached_info_irq(struct phy_device *phydev) __malloc; void phy_attached_info(struct phy_device *phydev); +int genphy_match_phy_device(struct phy_device *phydev, + const struct phy_driver *phydrv); + /* Clause 22 PHY */ int genphy_read_abilities(struct phy_device *phydev); int genphy_setup_forced(struct phy_device *phydev); -- cgit v1.2.3 From 4ff4d86f6cceb6bea583bdb230e5439655778cce Mon Sep 17 00:00:00 2001 From: Kory Maincent Date: Mon, 19 May 2025 10:45:05 +0200 Subject: net: Add support for providing the PTP hardware source in tsinfo Multi-PTP source support within a network topology has been merged, but the hardware timestamp source is not yet exposed to users. Currently, users only see the PTP index, which does not indicate whether the timestamp comes from a PHY or a MAC. Add support for reporting the hwtstamp source using a hwtstamp-source field, alongside hwtstamp-phyindex, to describe the origin of the hardware timestamp. Remove HWTSTAMP_SOURCE_UNSPEC enum value as it is not used at all. 
Signed-off-by: Kory Maincent Link: https://patch.msgid.link/20250519-feature_ptp_source-v4-1-5d10e19a0265@bootlin.com Signed-off-by: Paolo Abeni --- include/linux/ethtool.h | 5 +++++ include/linux/net_tstamp.h | 7 +------ include/uapi/linux/ethtool_netlink_generated.h | 14 ++++++++++++++ 3 files changed, 20 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 117718c24814..5e0dd333ad1f 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -19,6 +19,7 @@ #include #include #include +#include #include #define ETHTOOL_MM_MAX_VERIFY_TIME_MS 128 @@ -830,6 +831,8 @@ struct ethtool_rxfh_param { * @so_timestamping: bit mask of the sum of the supported SO_TIMESTAMPING flags * @phc_index: device index of the associated PHC, or -1 if there is none * @phc_qualifier: qualifier of the associated PHC + * @phc_source: source device of the associated PHC + * @phc_phyindex: index of PHY device source of the associated PHC * @tx_types: bit mask of the supported hwtstamp_tx_types enumeration values * @rx_filters: bit mask of the supported hwtstamp_rx_filters enumeration values */ @@ -838,6 +841,8 @@ struct kernel_ethtool_ts_info { u32 so_timestamping; int phc_index; enum hwtstamp_provider_qualifier phc_qualifier; + enum hwtstamp_source phc_source; + int phc_phyindex; enum hwtstamp_tx_types tx_types; enum hwtstamp_rx_filters rx_filters; }; diff --git a/include/linux/net_tstamp.h b/include/linux/net_tstamp.h index ff0758e88ea1..f4936d9c2b3c 100644 --- a/include/linux/net_tstamp.h +++ b/include/linux/net_tstamp.h @@ -4,6 +4,7 @@ #define _LINUX_NET_TIMESTAMPING_H_ #include +#include #define SOF_TIMESTAMPING_SOFTWARE_MASK (SOF_TIMESTAMPING_RX_SOFTWARE | \ SOF_TIMESTAMPING_TX_SOFTWARE | \ @@ -13,12 +14,6 @@ SOF_TIMESTAMPING_TX_HARDWARE | \ SOF_TIMESTAMPING_RAW_HARDWARE) -enum hwtstamp_source { - HWTSTAMP_SOURCE_UNSPEC, - HWTSTAMP_SOURCE_NETDEV, - HWTSTAMP_SOURCE_PHYLIB, -}; - /** * struct 
hwtstamp_provider_desc - hwtstamp provider description * diff --git a/include/uapi/linux/ethtool_netlink_generated.h b/include/uapi/linux/ethtool_netlink_generated.h index 30c8dad6214e..9a02f579de22 100644 --- a/include/uapi/linux/ethtool_netlink_generated.h +++ b/include/uapi/linux/ethtool_netlink_generated.h @@ -37,6 +37,18 @@ enum ethtool_tcp_data_split { ETHTOOL_TCP_DATA_SPLIT_ENABLED, }; +/** + * enum hwtstamp_source - Source of the hardware timestamp + * @HWTSTAMP_SOURCE_NETDEV: Hardware timestamp comes from a MAC or a device + * which has MAC and PHY integrated + * @HWTSTAMP_SOURCE_PHYLIB: Hardware timestamp comes from one PHY device of the + * network topology + */ +enum hwtstamp_source { + HWTSTAMP_SOURCE_NETDEV = 1, + HWTSTAMP_SOURCE_PHYLIB, +}; + enum { ETHTOOL_A_HEADER_UNSPEC, ETHTOOL_A_HEADER_DEV_INDEX, @@ -401,6 +413,8 @@ enum { ETHTOOL_A_TSINFO_PHC_INDEX, ETHTOOL_A_TSINFO_STATS, ETHTOOL_A_TSINFO_HWTSTAMP_PROVIDER, + ETHTOOL_A_TSINFO_HWTSTAMP_SOURCE, + ETHTOOL_A_TSINFO_HWTSTAMP_PHYINDEX, __ETHTOOL_A_TSINFO_CNT, ETHTOOL_A_TSINFO_MAX = (__ETHTOOL_A_TSINFO_CNT - 1) -- cgit v1.2.3 From 38b95d588f8fd07027ad8dbca3e1d2b5c13413ae Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 19 May 2025 13:57:54 -0700 Subject: scm: Move scm_recv() from scm.h to scm.c. scm_recv() has been placed in scm.h since the pre-git era for no particular reason (I think), which makes the file really fragile. For example, when you move SOCK_PASSCRED from include/linux/net.h to enum sock_flags in include/net/sock.h, you will see weird build failure due to terrible dependency. To avoid the build failure in the future, let's move scm_recv(_unix())? and its callees to scm.c. Note that only scm_recv() needs to be exported for Bluetooth. scm_send() should be moved to scm.c too, but I'll revisit later. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Willem de Bruijn Signed-off-by: David S. 
Miller --- include/net/scm.h | 121 ++---------------------------------------------------- 1 file changed, 4 insertions(+), 117 deletions(-) (limited to 'include') diff --git a/include/net/scm.h b/include/net/scm.h index 22bb49589fde..84c4707e78a5 100644 --- a/include/net/scm.h +++ b/include/net/scm.h @@ -102,123 +102,10 @@ static __inline__ int scm_send(struct socket *sock, struct msghdr *msg, return __scm_send(sock, msg, scm); } -#ifdef CONFIG_SECURITY_NETWORK -static inline void scm_passec(struct socket *sock, struct msghdr *msg, struct scm_cookie *scm) -{ - struct lsm_context ctx; - int err; - - if (test_bit(SOCK_PASSSEC, &sock->flags)) { - err = security_secid_to_secctx(scm->secid, &ctx); - - if (err >= 0) { - put_cmsg(msg, SOL_SOCKET, SCM_SECURITY, ctx.len, - ctx.context); - security_release_secctx(&ctx); - } - } -} - -static inline bool scm_has_secdata(struct socket *sock) -{ - return test_bit(SOCK_PASSSEC, &sock->flags); -} -#else -static inline void scm_passec(struct socket *sock, struct msghdr *msg, struct scm_cookie *scm) -{ } - -static inline bool scm_has_secdata(struct socket *sock) -{ - return false; -} -#endif /* CONFIG_SECURITY_NETWORK */ - -static __inline__ void scm_pidfd_recv(struct msghdr *msg, struct scm_cookie *scm) -{ - struct file *pidfd_file = NULL; - int len, pidfd; - - /* put_cmsg() doesn't return an error if CMSG is truncated, - * that's why we need to opencode these checks here. 
- */ - if (msg->msg_flags & MSG_CMSG_COMPAT) - len = sizeof(struct compat_cmsghdr) + sizeof(int); - else - len = sizeof(struct cmsghdr) + sizeof(int); - - if (msg->msg_controllen < len) { - msg->msg_flags |= MSG_CTRUNC; - return; - } - - if (!scm->pid) - return; - - pidfd = pidfd_prepare(scm->pid, 0, &pidfd_file); - - if (put_cmsg(msg, SOL_SOCKET, SCM_PIDFD, sizeof(int), &pidfd)) { - if (pidfd_file) { - put_unused_fd(pidfd); - fput(pidfd_file); - } - - return; - } - - if (pidfd_file) - fd_install(pidfd, pidfd_file); -} - -static inline bool __scm_recv_common(struct socket *sock, struct msghdr *msg, - struct scm_cookie *scm, int flags) -{ - if (!msg->msg_control) { - if (test_bit(SOCK_PASSCRED, &sock->flags) || - test_bit(SOCK_PASSPIDFD, &sock->flags) || - scm->fp || scm_has_secdata(sock)) - msg->msg_flags |= MSG_CTRUNC; - scm_destroy(scm); - return false; - } - - if (test_bit(SOCK_PASSCRED, &sock->flags)) { - struct user_namespace *current_ns = current_user_ns(); - struct ucred ucreds = { - .pid = scm->creds.pid, - .uid = from_kuid_munged(current_ns, scm->creds.uid), - .gid = from_kgid_munged(current_ns, scm->creds.gid), - }; - put_cmsg(msg, SOL_SOCKET, SCM_CREDENTIALS, sizeof(ucreds), &ucreds); - } - - scm_passec(sock, msg, scm); - - if (scm->fp) - scm_detach_fds(msg, scm); - - return true; -} - -static inline void scm_recv(struct socket *sock, struct msghdr *msg, - struct scm_cookie *scm, int flags) -{ - if (!__scm_recv_common(sock, msg, scm, flags)) - return; - - scm_destroy_cred(scm); -} - -static inline void scm_recv_unix(struct socket *sock, struct msghdr *msg, - struct scm_cookie *scm, int flags) -{ - if (!__scm_recv_common(sock, msg, scm, flags)) - return; - - if (test_bit(SOCK_PASSPIDFD, &sock->flags)) - scm_pidfd_recv(msg, scm); - - scm_destroy_cred(scm); -} +void scm_recv(struct socket *sock, struct msghdr *msg, + struct scm_cookie *scm, int flags); +void scm_recv_unix(struct socket *sock, struct msghdr *msg, + struct scm_cookie *scm, int flags); static 
inline int scm_recv_one_fd(struct file *f, int __user *ufd, unsigned int flags) -- cgit v1.2.3 From 7d8d93fdde50b86bbbf46a203c368ed320e729ab Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 19 May 2025 13:57:56 -0700 Subject: net: Restrict SO_PASS{CRED,PIDFD,SEC} to AF_{UNIX,NETLINK,BLUETOOTH}. SCM_CREDENTIALS and SCM_SECURITY can be recv()ed by calling scm_recv() or scm_recv_unix(), and SCM_PIDFD is only used by scm_recv_unix(). scm_recv() is called from AF_NETLINK and AF_BLUETOOTH. scm_recv_unix() is literally called from AF_UNIX. Let's restrict SO_PASSCRED and SO_PASSSEC to such sockets and SO_PASSPIDFD to AF_UNIX only. Later, SOCK_PASS{CRED,PIDFD,SEC} will be moved to struct sock and united with another field. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/net/sock.h | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index 35ca6b13c6d2..483522377955 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2773,9 +2773,14 @@ static inline bool sk_is_udp(const struct sock *sk) sk->sk_protocol == IPPROTO_UDP; } +static inline bool sk_is_unix(const struct sock *sk) +{ + return sk->sk_family == AF_UNIX; +} + static inline bool sk_is_stream_unix(const struct sock *sk) { - return sk->sk_family == AF_UNIX && sk->sk_type == SOCK_STREAM; + return sk_is_unix(sk) && sk->sk_type == SOCK_STREAM; } static inline bool sk_is_vsock(const struct sock *sk) @@ -2783,6 +2788,13 @@ static inline bool sk_is_vsock(const struct sock *sk) return sk->sk_family == AF_VSOCK; } +static inline bool sk_may_scm_recv(const struct sock *sk) +{ + return (IS_ENABLED(CONFIG_UNIX) && sk->sk_family == AF_UNIX) || + sk->sk_family == AF_NETLINK || + (IS_ENABLED(CONFIG_BT) && sk->sk_family == AF_BLUETOOTH); +} + /** * sk_eat_skb - Release a skb if it is no longer needed * @sk: socket to eat this skb from -- cgit v1.2.3 From 
0e81cfd971dc4833c699dcd8924e54a5021bc4e8 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 19 May 2025 13:57:57 -0700 Subject: af_unix: Move SOCK_PASS{CRED,PIDFD,SEC} to struct sock. As explained in the next patch, SO_PASSRIGHTS would have a problem if we assigned a corresponding bit to socket->flags, so it must be managed in struct sock. Mixing socket->flags and sk->sk_flags for similar options will look confusing, and sk->sk_flags does not have enough space on 32bit system. Also, as mentioned in commit 16e572626961 ("af_unix: dont send SCM_CREDENTIALS by default"), SOCK_PASSCRED and SOCK_PASSPID handling is known to be slow, and managing the flags in struct socket cannot avoid that for embryo sockets. Let's move SOCK_PASS{CRED,PIDFD,SEC} to struct sock. While at it, other SOCK_XXX flags in net.h are grouped as enum. Note that assign_bit() was atomic, so the writer side is moved down after lock_sock() in setsockopt(), but the bit is only read once in sendmsg() and recvmsg(), so lock_sock() is not needed there. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/linux/net.h | 15 +++++++-------- include/net/sock.h | 16 +++++++++++++++- 2 files changed, 22 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/net.h b/include/linux/net.h index 0ff950eecc6b..f8418d6e33e0 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -36,14 +36,13 @@ struct net; * in sock->flags, but moved into sk->sk_wq->flags to be RCU protected. * Eventually all flags will be in sk->sk_wq->flags. 
*/ -#define SOCKWQ_ASYNC_NOSPACE 0 -#define SOCKWQ_ASYNC_WAITDATA 1 -#define SOCK_NOSPACE 2 -#define SOCK_PASSCRED 3 -#define SOCK_PASSSEC 4 -#define SOCK_SUPPORT_ZC 5 -#define SOCK_CUSTOM_SOCKOPT 6 -#define SOCK_PASSPIDFD 7 +enum socket_flags { + SOCKWQ_ASYNC_NOSPACE, + SOCKWQ_ASYNC_WAITDATA, + SOCK_NOSPACE, + SOCK_SUPPORT_ZC, + SOCK_CUSTOM_SOCKOPT, +}; #ifndef ARCH_HAS_SOCKET_TYPES /** diff --git a/include/net/sock.h b/include/net/sock.h index 483522377955..d90a71f66ab8 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -337,6 +337,11 @@ struct sk_filter; * @sk_txtime_deadline_mode: set deadline mode for SO_TXTIME * @sk_txtime_report_errors: set report errors mode for SO_TXTIME * @sk_txtime_unused: unused txtime flags + * @sk_scm_recv_flags: all flags used by scm_recv() + * @sk_scm_credentials: flagged by SO_PASSCRED to recv SCM_CREDENTIALS + * @sk_scm_security: flagged by SO_PASSSEC to recv SCM_SECURITY + * @sk_scm_pidfd: flagged by SO_PASSPIDFD to recv SCM_PIDFD + * @sk_scm_unused: unused flags for scm_recv() * @ns_tracker: tracker for netns reference * @sk_user_frags: xarray of pages the user is holding a reference on. * @sk_owner: reference to the real owner of the socket that calls @@ -523,7 +528,16 @@ struct sock { #endif int sk_disconnects; - u8 sk_txrehash; + union { + u8 sk_txrehash; + u8 sk_scm_recv_flags; + struct { + u8 sk_scm_credentials : 1, + sk_scm_security : 1, + sk_scm_pidfd : 1, + sk_scm_unused : 5; + }; + }; u8 sk_clockid; u8 sk_txtime_deadline_mode : 1, sk_txtime_report_errors : 1, -- cgit v1.2.3 From 77cbe1a6d8730a07f99f9263c2d5f2304cf5e830 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 19 May 2025 13:57:59 -0700 Subject: af_unix: Introduce SO_PASSRIGHTS. As long as recvmsg() or recvmmsg() is used with cmsg, it is not possible to avoid receiving file descriptors via SCM_RIGHTS. 
This behaviour has occasionally been flagged as problematic, as it can be (ab)used to trigger DoS during close(), for example, by passing a FUSE-controlled fd or a hung NFS fd. For instance, as noted on the uAPI Group page [0], an untrusted peer could send a file descriptor pointing to a hung NFS mount and then close it. Once the receiver calls recvmsg() with msg_control, the descriptor is automatically installed, and then the responsibility for the final close() now falls on the receiver, which may result in blocking the process for a long time. Regarding this, systemd calls cmsg_close_all() [1] after each recvmsg() to close() unwanted file descriptors sent via SCM_RIGHTS. However, this cannot work around the issue at all, because the final fput() may still occur on the receiver's side once sendmsg() with SCM_RIGHTS succeeds. Also, even filtering by LSM at recvmsg() does not work for the same reason. Thus, we need a better way to refuse SCM_RIGHTS at sendmsg(). Let's introduce SO_PASSRIGHTS to disable SCM_RIGHTS. Note that this option is enabled by default for backward compatibility. Link: https://uapi-group.org/kernel-features/#disabling-reception-of-scm_rights-for-af_unix-sockets #[0] Link: https://github.com/systemd/systemd/blob/v257.5/src/basic/fd-util.c#L612-L628 #[1] Signed-off-by: Kuniyuki Iwashima Reviewed-by: Willem de Bruijn Signed-off-by: David S. 
Miller --- include/net/sock.h | 4 +++- include/uapi/asm-generic/socket.h | 2 ++ 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index d90a71f66ab8..92e7c1aae3cc 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -341,6 +341,7 @@ struct sk_filter; * @sk_scm_credentials: flagged by SO_PASSCRED to recv SCM_CREDENTIALS * @sk_scm_security: flagged by SO_PASSSEC to recv SCM_SECURITY * @sk_scm_pidfd: flagged by SO_PASSPIDFD to recv SCM_PIDFD + * @sk_scm_rights: flagged by SO_PASSRIGHTS to recv SCM_RIGHTS * @sk_scm_unused: unused flags for scm_recv() * @ns_tracker: tracker for netns reference * @sk_user_frags: xarray of pages the user is holding a reference on. @@ -535,7 +536,8 @@ struct sock { u8 sk_scm_credentials : 1, sk_scm_security : 1, sk_scm_pidfd : 1, - sk_scm_unused : 5; + sk_scm_rights : 1, + sk_scm_unused : 4; }; }; u8 sk_clockid; diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h index aa5016ff3d91..f333a0ac4ee4 100644 --- a/include/uapi/asm-generic/socket.h +++ b/include/uapi/asm-generic/socket.h @@ -145,6 +145,8 @@ #define SO_RCVPRIORITY 82 +#define SO_PASSRIGHTS 83 + #if !defined(__KERNEL__) #if __BITS_PER_LONG == 64 || (defined(__x86_64__) && defined(__ILP32__)) -- cgit v1.2.3 From 9a119669fb1924cd9658c16da39a5a585e129e50 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 21 May 2025 11:38:48 +0200 Subject: netfilter: nf_tables: nft_fib: consistent l3mdev handling fib has two modes: 1. Obtain output device according to source or destination address 2. Obtain the type of the address, e.g. local, unicast, multicast. 'fib daddr type' should return 'local' if the address is configured in this netns or unicast otherwise. 'fib daddr . iif type' should return 'local' if the address is configured on the input interface or unicast otherwise, i.e. more restrictive. 
However, if the interface is part of a VRF, then 'fib daddr type' returns unicast even if the address is configured on the incoming interface. This is broken for both ipv4 and ipv6. In the ipv4 case, inet_dev_addr_type must only be used if the 'iif' or 'oif' (strict mode) was requested. Else inet_addr_type_dev_table() needs to be used and the correct dev argument must be passed as well so the correct fib (vrf) table is used. In the ipv6 case, the bug is similar, without strict mode, dev is NULL so .flowi6_l3mdev will be set to 0. Add a new 'nft_fib_l3mdev_master_ifindex_rcu()' helper and use that to init the .l3mdev structure member. For ipv6, use it from nft_fib6_flowi_init() which gets called from both the 'type' and the 'route' mode eval functions. This provides consistent behaviour for all modes for both ipv4 and ipv6: If strict matching is requested, the input respectively output device of the netfilter hooks is used. Otherwise, use skb->dev to obtain the l3mdev ifindex. Without this, most type checks in updated nft_fib.sh selftest fail: FAIL: did not find veth0 . 10.9.9.1 . local in fibtype4 FAIL: did not find veth0 . dead:1::1 . local in fibtype6 FAIL: did not find veth0 . dead:9::1 . local in fibtype6 FAIL: did not find tvrf . 10.0.1.1 . local in fibtype4 FAIL: did not find tvrf . 10.9.9.1 . local in fibtype4 FAIL: did not find tvrf . dead:1::1 . local in fibtype6 FAIL: did not find tvrf . dead:9::1 . local in fibtype6 FAIL: fib expression address types match (iif in vrf) (fib erroneously returns 'unicast' for all of them, even though all of these addresses are local to the vrf). 
Fixes: f6d0cbcf09c5 ("netfilter: nf_tables: add fib expression") Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nft_fib.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include') diff --git a/include/net/netfilter/nft_fib.h b/include/net/netfilter/nft_fib.h index 6e202ed5e63f..7370fba844ef 100644 --- a/include/net/netfilter/nft_fib.h +++ b/include/net/netfilter/nft_fib.h @@ -2,6 +2,7 @@ #ifndef _NFT_FIB_H_ #define _NFT_FIB_H_ +#include #include struct nft_fib { @@ -39,6 +40,14 @@ static inline bool nft_fib_can_skip(const struct nft_pktinfo *pkt) return nft_fib_is_loopback(pkt->skb, indev); } +static inline int nft_fib_l3mdev_master_ifindex_rcu(const struct nft_pktinfo *pkt, + const struct net_device *iif) +{ + const struct net_device *dev = iif ? iif : pkt->skb->dev; + + return l3mdev_master_ifindex_rcu(dev); +} + int nft_fib_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset); int nft_fib_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]); -- cgit v1.2.3 From a1f1acb9c5db9b385c9b3eb1f27f897c06df49ae Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Mon, 12 May 2025 12:28:44 +0200 Subject: netfilter: nf_dup{4, 6}: Move duplication check to task_struct nf_skb_duplicated is a per-CPU variable and relies on disabled BH for its locking. Without per-CPU locking in local_bh_disable() on PREEMPT_RT this data structure requires explicit locking. Due to the recursion involved, the simplest change is to make it a per-task variable. Move the per-CPU variable nf_skb_duplicated to task_struct and name it in_nf_duplicate. Add it to the existing bitfield so it doesn't use additional memory. 
Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Juri Lelli Cc: Vincent Guittot Cc: Dietmar Eggemann Cc: Steven Rostedt Cc: Ben Segall Cc: Mel Gorman Cc: Valentin Schneider Acked-by: Peter Zijlstra (Intel) Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter.h | 11 ----------- include/linux/sched.h | 1 + 2 files changed, 1 insertion(+), 11 deletions(-) (limited to 'include') diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index 2b8aac2c70ad..892d12823ed4 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -497,17 +497,6 @@ struct nf_defrag_hook { extern const struct nf_defrag_hook __rcu *nf_defrag_v4_hook; extern const struct nf_defrag_hook __rcu *nf_defrag_v6_hook; -/* - * nf_skb_duplicated - TEE target has sent a packet - * - * When a xtables target sends a packet, the OUTPUT and POSTROUTING - * hooks are traversed again, i.e. nft and xtables are invoked recursively. - * - * This is used by xtables TEE target to prevent the duplicated skb from - * being duplicated again. - */ -DECLARE_PER_CPU(bool, nf_skb_duplicated); - /* * Contains bitmask of ctnetlink event subscribers, if any. * Can't be pernet due to NETLINK_LISTEN_ALL_NSID setsockopt flag. diff --git a/include/linux/sched.h b/include/linux/sched.h index f96ac1982893..52d9c52dc8f2 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1044,6 +1044,7 @@ struct task_struct { /* delay due to memory thrashing */ unsigned in_thrashing:1; #endif + unsigned in_nf_duplicate:1; #ifdef CONFIG_PREEMPT_RT struct netdev_xmit net_xmit; #endif -- cgit v1.2.3 From f37ad91270397a6d053e8623bdb3cf79859691d2 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Mon, 12 May 2025 12:28:46 +0200 Subject: netfilter: nf_dup_netdev: Move the recursion counter struct netdev_xmit nf_dup_skb_recursion is a per-CPU variable and relies on disabled BH for its locking. 
Without per-CPU locking in local_bh_disable() on PREEMPT_RT this data structure requires explicit locking. Move nf_dup_skb_recursion to struct netdev_xmit, provide wrappers. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Pablo Neira Ayuso --- include/linux/netdevice_xmit.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/netdevice_xmit.h b/include/linux/netdevice_xmit.h index 848735b3a7c0..813a19122ebb 100644 --- a/include/linux/netdevice_xmit.h +++ b/include/linux/netdevice_xmit.h @@ -11,6 +11,9 @@ struct netdev_xmit { #if IS_ENABLED(CONFIG_NET_ACT_MIRRED) u8 sched_mirred_nest; #endif +#if IS_ENABLED(CONFIG_NF_DUP_NETDEV) + u8 nf_dup_skb_recursion; +#endif }; #endif -- cgit v1.2.3 From 90869f43d06dfc836def2f53850a878f829e443e Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 22 May 2025 15:49:33 +0200 Subject: netfilter: conntrack: make nf_conntrack_id callable without a module dependency While nf_conntrack_id() doesn't need any functionality from conntrack, it does reside in nf_conntrack_core.c -- callers add a module dependency on conntrack. Followup patch will need to compute the conntrack id from nf_tables_trace.c to include it in nf_trace messages emitted to userspace via netlink. I don't want to introduce a module dependency between nf_tables and conntrack for this. Since trace is slowpath, the added indirection is ok. One alternative is to move nf_conntrack_id to the netfilter/core.c, but I don't see a compelling reason so far. 
Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index 892d12823ed4..20947f2c685b 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -470,6 +470,7 @@ struct nf_ct_hook { void (*attach)(struct sk_buff *nskb, const struct sk_buff *skb); void (*set_closing)(struct nf_conntrack *nfct); int (*confirm)(struct sk_buff *skb); + u32 (*get_id)(const struct nf_conntrack *nfct); }; extern const struct nf_ct_hook __rcu *nf_ct_hook; -- cgit v1.2.3 From 7e5c6aa67e6f6133c5a2c53852e1dd9af2c0c3fc Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 22 May 2025 15:49:34 +0200 Subject: netfilter: nf_tables: add packets conntrack state to debug trace info Add the minimal relevant info needed for userspace ("nftables monitor trace") to provide the conntrack view of the packet: - state (new, related, established) - direction (original, reply) - status (e.g., if connection is subject to dnat) - id (allows to query ctnetlink for remaining conntrack state info) Example: trace id a62 inet filter PRE_RAW packet: iif "enp0s3" ether [..] [..] trace id a62 inet filter PRE_MANGLE conntrack: ct direction original ct state new ct id 32 trace id a62 inet filter PRE_MANGLE packet: [..] [..] trace id a62 inet filter IN conntrack: ct direction original ct state new ct status dnat-done ct id 32 [..] In this case one can see that while NAT is active, the new connection isn't subject to a translation. 
Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 7d6bc19a0153..2beb30be2c5f 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -1841,6 +1841,10 @@ enum nft_xfrm_keys { * @NFTA_TRACE_MARK: nfmark (NLA_U32) * @NFTA_TRACE_NFPROTO: nf protocol processed (NLA_U32) * @NFTA_TRACE_POLICY: policy that decided fate of packet (NLA_U32) + * @NFTA_TRACE_CT_ID: conntrack id (NLA_U32) + * @NFTA_TRACE_CT_DIRECTION: packets direction (NLA_U8) + * @NFTA_TRACE_CT_STATUS: conntrack status (NLA_U32) + * @NFTA_TRACE_CT_STATE: packet state (new, established, ...) (NLA_U32) */ enum nft_trace_attributes { NFTA_TRACE_UNSPEC, @@ -1861,6 +1865,10 @@ enum nft_trace_attributes { NFTA_TRACE_NFPROTO, NFTA_TRACE_POLICY, NFTA_TRACE_PAD, + NFTA_TRACE_CT_ID, + NFTA_TRACE_CT_DIRECTION, + NFTA_TRACE_CT_STATUS, + NFTA_TRACE_CT_STATE, __NFTA_TRACE_MAX }; #define NFTA_TRACE_MAX (__NFTA_TRACE_MAX - 1) -- cgit v1.2.3 From e225376d78fb2d85e99a2436a9e65765dc1ac234 Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Wed, 21 May 2025 22:44:23 +0200 Subject: netfilter: nf_tables: Introduce nft_hook_find_ops{,_rcu}() Also a pretty dull wrapper around the hook->ops.dev comparison for now. Will search the embedded nf_hook_ops list in future. The ugly cast to eliminate the const qualifier will vanish then, too. Since this future list will be RCU-protected, also introduce an _rcu() variant here. 
Signed-off-by: Phil Sutter Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 803d5f1601f9..df0b151743a2 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -1205,6 +1205,11 @@ struct nft_hook { u8 ifnamelen; }; +struct nf_hook_ops *nft_hook_find_ops(const struct nft_hook *hook, + const struct net_device *dev); +struct nf_hook_ops *nft_hook_find_ops_rcu(const struct nft_hook *hook, + const struct net_device *dev); + /** * struct nft_base_chain - nf_tables base chain * -- cgit v1.2.3 From 73319a8ee18b9cf0b2dac87f8521595e0381ba0c Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Wed, 21 May 2025 22:44:26 +0200 Subject: netfilter: nf_tables: Have a list of nf_hook_ops in nft_hook Supporting a 1:n relationship between nft_hook and nf_hook_ops is convenient since a chain's or flowtable's nft_hooks may remain in place despite matching interfaces disappearing. This stabilizes ruleset dumps in that regard and opens the possibility to claim newly added interfaces which match the spec. Also it prepares for wildcard interface specs since these will potentially match multiple interfaces. All spots dealing with hook registration are updated to handle a list of multiple nf_hook_ops, but nft_netdev_hook_alloc() only adds a single item for now to retain the old behaviour. The only expected functional change here is how vanishing interfaces are handled: Instead of dropping the respective nft_hook, only the matching nf_hook_ops are dropped. To safely remove individual ops from the list in netdev handlers, an rcu_head is added to struct nf_hook_ops so kfree_rcu() may be used. There is at least nft_flowtable_find_dev() which may be iterating through the list at the same time. 
Signed-off-by: Phil Sutter Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter.h | 3 +++ include/net/netfilter/nf_tables.h | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index 20947f2c685b..5f896fcc074d 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -95,6 +95,9 @@ enum nf_hook_ops_type { }; struct nf_hook_ops { + struct list_head list; + struct rcu_head rcu; + /* User fills in from here down. */ nf_hookfn *hook; struct net_device *dev; diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index df0b151743a2..5e49619ae49c 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -1199,7 +1199,7 @@ struct nft_stats { struct nft_hook { struct list_head list; - struct nf_hook_ops ops; + struct list_head ops_list; struct rcu_head rcu; char ifname[IFNAMSIZ]; u8 ifnamelen; -- cgit v1.2.3 From 465b9ee0ee7bc268d7f261356afd6c4262e48d82 Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Wed, 21 May 2025 22:44:33 +0200 Subject: netfilter: nf_tables: Add notifications for hook changes Notify user space if netdev hooks are updated due to netdev add/remove events. Send minimal notification messages by introducing NFT_MSG_NEWDEV/DELDEV message types describing a single device only. Upon NETDEV_CHANGENAME, the callback has no information about the interface's old name. To provide a clear message to user space, include the hook's stored interface name in the notification. 
Signed-off-by: Phil Sutter Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 5 +++++ include/uapi/linux/netfilter/nf_tables.h | 10 ++++++++++ include/uapi/linux/netfilter/nfnetlink.h | 2 ++ 3 files changed, 17 insertions(+) (limited to 'include') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 5e49619ae49c..e4d8e451e935 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -1142,6 +1142,11 @@ int nft_set_catchall_validate(const struct nft_ctx *ctx, struct nft_set *set); int nf_tables_bind_chain(const struct nft_ctx *ctx, struct nft_chain *chain); void nf_tables_unbind_chain(const struct nft_ctx *ctx, struct nft_chain *chain); +struct nft_hook; +void nf_tables_chain_device_notify(const struct nft_chain *chain, + const struct nft_hook *hook, + const struct net_device *dev, int event); + enum nft_chain_types { NFT_CHAIN_T_DEFAULT = 0, NFT_CHAIN_T_ROUTE, diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 2beb30be2c5f..518ba144544c 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -142,6 +142,8 @@ enum nf_tables_msg_types { NFT_MSG_DESTROYOBJ, NFT_MSG_DESTROYFLOWTABLE, NFT_MSG_GETSETELEM_RESET, + NFT_MSG_NEWDEV, + NFT_MSG_DELDEV, NFT_MSG_MAX, }; @@ -1784,10 +1786,18 @@ enum nft_synproxy_attributes { * enum nft_device_attributes - nf_tables device netlink attributes * * @NFTA_DEVICE_NAME: name of this device (NLA_STRING) + * @NFTA_DEVICE_TABLE: table containing the flowtable or chain hooking into the device (NLA_STRING) + * @NFTA_DEVICE_FLOWTABLE: flowtable hooking into the device (NLA_STRING) + * @NFTA_DEVICE_CHAIN: chain hooking into the device (NLA_STRING) + * @NFTA_DEVICE_SPEC: hook spec matching the device (NLA_STRING) */ enum nft_devices_attributes { NFTA_DEVICE_UNSPEC, NFTA_DEVICE_NAME, + NFTA_DEVICE_TABLE, + NFTA_DEVICE_FLOWTABLE, + NFTA_DEVICE_CHAIN, + 
NFTA_DEVICE_SPEC, __NFTA_DEVICE_MAX }; #define NFTA_DEVICE_MAX (__NFTA_DEVICE_MAX - 1) diff --git a/include/uapi/linux/netfilter/nfnetlink.h b/include/uapi/linux/netfilter/nfnetlink.h index 6cd58cd2a6f0..50d807af2649 100644 --- a/include/uapi/linux/netfilter/nfnetlink.h +++ b/include/uapi/linux/netfilter/nfnetlink.h @@ -25,6 +25,8 @@ enum nfnetlink_groups { #define NFNLGRP_ACCT_QUOTA NFNLGRP_ACCT_QUOTA NFNLGRP_NFTRACE, #define NFNLGRP_NFTRACE NFNLGRP_NFTRACE + NFNLGRP_NFT_DEV, +#define NFNLGRP_NFT_DEV NFNLGRP_NFT_DEV __NFNLGRP_MAX, }; #define NFNLGRP_MAX (__NFNLGRP_MAX - 1) -- cgit v1.2.3 From 384492c48e6a88c9a7f0376d8e8ac7f557988e92 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 20 May 2025 13:30:42 -0700 Subject: net: devmem: support single IOV with sendmsg sendmsg() with a single iov becomes ITER_UBUF, sendmsg() with multiple iovs becomes ITER_IOVEC. iter_iov_len does not return the correct value for UBUF, so teach it to treat UBUF differently. Cc: Al Viro Cc: Pavel Begunkov Cc: Mina Almasry Fixes: bd61848900bf ("net: devmem: Implement TX path") Signed-off-by: Stanislav Fomichev Acked-by: Mina Almasry Reviewed-by: Pavel Begunkov Signed-off-by: David S. 
Miller --- include/linux/uio.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/uio.h b/include/linux/uio.h index 49ece9e1888f..393d0622cc28 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -99,7 +99,13 @@ static inline const struct iovec *iter_iov(const struct iov_iter *iter) } #define iter_iov_addr(iter) (iter_iov(iter)->iov_base + (iter)->iov_offset) -#define iter_iov_len(iter) (iter_iov(iter)->iov_len - (iter)->iov_offset) + +static inline size_t iter_iov_len(const struct iov_iter *i) +{ + if (i->iter_type == ITER_UBUF) + return i->count; + return iter_iov(i)->iov_len - i->iov_offset; +} static inline enum iter_type iov_iter_type(const struct iov_iter *i) { -- cgit v1.2.3 From e45b7196df60a4aef86c3998611c91fcc93d21f3 Mon Sep 17 00:00:00 2001 From: Qiu Yutan Date: Wed, 21 May 2025 10:14:08 +0800 Subject: net: neigh: use kfree_skb_reason() in neigh_resolve_output() and neigh_connected_output() Replace kfree_skb() used in neigh_resolve_output() and neigh_connected_output() with kfree_skb_reason(). The following new skb drop reason is added: /* failed to fill the device hard header */ SKB_DROP_REASON_NEIGH_HH_FILLFAIL Signed-off-by: Qiu Yutan Signed-off-by: Jiang Kun Reviewed-by: Kuniyuki Iwashima Reviewed-by: Xu Xin Signed-off-by: David S. 
Miller --- include/net/dropreason-core.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h index bea77934a235..bcf9d7467e1a 100644 --- a/include/net/dropreason-core.h +++ b/include/net/dropreason-core.h @@ -62,6 +62,7 @@ FN(NEIGH_FAILED) \ FN(NEIGH_QUEUEFULL) \ FN(NEIGH_DEAD) \ + FN(NEIGH_HH_FILLFAIL) \ FN(TC_EGRESS) \ FN(SECURITY_HOOK) \ FN(QDISC_DROP) \ @@ -348,6 +349,8 @@ enum skb_drop_reason { SKB_DROP_REASON_NEIGH_QUEUEFULL, /** @SKB_DROP_REASON_NEIGH_DEAD: neigh entry is dead */ SKB_DROP_REASON_NEIGH_DEAD, + /** @SKB_DROP_REASON_NEIGH_HH_FILLFAIL: failed to fill the device hard header */ + SKB_DROP_REASON_NEIGH_HH_FILLFAIL, /** @SKB_DROP_REASON_TC_EGRESS: dropped in TC egress HOOK */ SKB_DROP_REASON_TC_EGRESS, /** @SKB_DROP_REASON_SECURITY_HOOK: dropped due to security HOOK */ -- cgit v1.2.3 From 45ca7e9f0730ae36fc610e675b990e9cc9ca0714 Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Wed, 21 May 2025 14:17:05 +0200 Subject: vsock/virtio: fix `rx_bytes` accounting for stream sockets MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In `struct virtio_vsock_sock`, we maintain two counters: - `rx_bytes`: used internally to track how many bytes have been read. This supports mechanisms like .stream_has_data() and sock_rcvlowat(). - `fwd_cnt`: used for the credit mechanism to inform available receive buffer space to the remote peer. These counters are updated via virtio_transport_inc_rx_pkt() and virtio_transport_dec_rx_pkt(). Since the beginning with commit 06a8fc78367d ("VSOCK: Introduce virtio_vsock_common.ko"), we call virtio_transport_dec_rx_pkt() in virtio_transport_stream_do_dequeue() only when we consume the entire packet, so partial reads, do not update `rx_bytes` and `fwd_cnt`. 
This is fine for `fwd_cnt`, because we still have space used for the entire packet, and we don't want to update the credit for the other peer until we free the space of the entire packet. However, this causes `rx_bytes` to be stale on partial reads. Previously, this didn’t cause issues because `rx_bytes` was used only by .stream_has_data(), and any unread portion of a packet implied data was still available. However, since commit 93b808876682 ("virtio/vsock: fix logic which reduces credit update messages"), we now rely on `rx_bytes` to determine if a credit update should be sent when the data in the RX queue drops below SO_RCVLOWAT value. This patch fixes the accounting by updating `rx_bytes` with the number of bytes actually read, even on partial reads, while leaving `fwd_cnt` untouched until the packet is fully consumed. Also introduce a new `buf_used` counter to check that the remote peer is honoring the given credit; this was previously done via `rx_bytes`. Fixes: 93b808876682 ("virtio/vsock: fix logic which reduces credit update messages") Signed-off-by: Stefano Garzarella Link: https://patch.msgid.link/20250521121705.196379-1-sgarzare@redhat.com Signed-off-by: Paolo Abeni --- include/linux/virtio_vsock.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h index 0387d64e2c66..36fb3edfa403 100644 --- a/include/linux/virtio_vsock.h +++ b/include/linux/virtio_vsock.h @@ -140,6 +140,7 @@ struct virtio_vsock_sock { u32 last_fwd_cnt; u32 rx_bytes; u32 buf_alloc; + u32 buf_used; struct sk_buff_head rx_queue; u32 msg_count; }; -- cgit v1.2.3 From ed449ddbd867f2cc02d6890c231431f264a876eb Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 21 May 2025 13:46:09 -0700 Subject: net: core: Convert inet_addr_is_any() to sockaddr_storage All the callers of inet_addr_is_any() have a sockaddr_storage-backed sockaddr. Avoid casts and switch prototype to the actual object being used. 
Reviewed-by: Kuniyuki Iwashima Reviewed-by: Martin K. Petersen # SCSI Signed-off-by: Kees Cook Link: https://patch.msgid.link/20250521204619.2301870-1-kees@kernel.org Signed-off-by: Paolo Abeni --- include/linux/inet.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/inet.h b/include/linux/inet.h index bd8276e96e60..9158772f3559 100644 --- a/include/linux/inet.h +++ b/include/linux/inet.h @@ -55,6 +55,6 @@ extern int in6_pton(const char *src, int srclen, u8 *dst, int delim, const char extern int inet_pton_with_scope(struct net *net, unsigned short af, const char *src, const char *port, struct sockaddr_storage *addr); -extern bool inet_addr_is_any(struct sockaddr *addr); +bool inet_addr_is_any(struct sockaddr_storage *addr); #endif /* _LINUX_INET_H */ -- cgit v1.2.3 From 161972650d6795ea00f8b72557cf3c3e593ed250 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 21 May 2025 13:46:10 -0700 Subject: net: core: Switch netif_set_mac_address() to struct sockaddr_storage In order to avoid passing around struct sockaddr that has a size the compiler cannot reason about (nor track at runtime), convert netif_set_mac_address() to take struct sockaddr_storage. This is just a cast conversion, so there are no binary changes. Following patches will make actual allocation changes. Acked-by: Gustavo A. R. 
Silva Signed-off-by: Kees Cook Link: https://patch.msgid.link/20250521204619.2301870-2-kees@kernel.org Signed-off-by: Paolo Abeni --- include/linux/netdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index ea9d335de130..47200a394a02 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4212,7 +4212,7 @@ int netif_set_mtu(struct net_device *dev, int new_mtu); int dev_set_mtu(struct net_device *, int); int dev_pre_changeaddr_notify(struct net_device *dev, const char *addr, struct netlink_ext_ack *extack); -int netif_set_mac_address(struct net_device *dev, struct sockaddr *sa, +int netif_set_mac_address(struct net_device *dev, struct sockaddr_storage *ss, struct netlink_ext_ack *extack); int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa, struct netlink_ext_ack *extack); -- cgit v1.2.3 From 9ca6804ab7c34f65fcf2e29333a39e7807c30b60 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 21 May 2025 13:46:14 -0700 Subject: net: core: Convert dev_set_mac_address() to struct sockaddr_storage All users of dev_set_mac_address() are now using a struct sockaddr_storage. Convert the internal data type to struct sockaddr_storage, drop the casts, and update pointer types. Acked-by: Gustavo A. R. 
Silva Signed-off-by: Kees Cook Link: https://patch.msgid.link/20250521204619.2301870-6-kees@kernel.org Signed-off-by: Paolo Abeni --- include/linux/netdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 47200a394a02..b4242b997373 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4214,7 +4214,7 @@ int dev_pre_changeaddr_notify(struct net_device *dev, const char *addr, struct netlink_ext_ack *extack); int netif_set_mac_address(struct net_device *dev, struct sockaddr_storage *ss, struct netlink_ext_ack *extack); -int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa, +int dev_set_mac_address(struct net_device *dev, struct sockaddr_storage *ss, struct netlink_ext_ack *extack); int dev_set_mac_address_user(struct net_device *dev, struct sockaddr *sa, struct netlink_ext_ack *extack); -- cgit v1.2.3 From ae9fcd5a0f8ab7e12619e1c66312a03b842935c3 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 21 May 2025 13:46:16 -0700 Subject: net: core: Convert dev_set_mac_address_user() to use struct sockaddr_storage Convert callers of dev_set_mac_address_user() to use struct sockaddr_storage. Add sanity checks on dev->addr_len usage. Signed-off-by: Kees Cook Acked-by: Gustavo A. R. 
Silva Link: https://patch.msgid.link/20250521204619.2301870-8-kees@kernel.org Signed-off-by: Paolo Abeni --- include/linux/netdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index b4242b997373..adb14db25798 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4216,7 +4216,7 @@ int netif_set_mac_address(struct net_device *dev, struct sockaddr_storage *ss, struct netlink_ext_ack *extack); int dev_set_mac_address(struct net_device *dev, struct sockaddr_storage *ss, struct netlink_ext_ack *extack); -int dev_set_mac_address_user(struct net_device *dev, struct sockaddr *sa, +int dev_set_mac_address_user(struct net_device *dev, struct sockaddr_storage *ss, struct netlink_ext_ack *extack); int dev_get_mac_address(struct sockaddr *sa, struct net *net, char *dev_name); int dev_get_port_parent_id(struct net_device *dev, -- cgit v1.2.3 From ba3d7b93dbe3202bf8ead473d75885af773068bc Mon Sep 17 00:00:00 2001 From: Jordan Rife Date: Wed, 21 May 2025 23:27:06 +0200 Subject: wireguard: allowedips: add WGALLOWEDIP_F_REMOVE_ME flag The current netlink API for WireGuard does not directly support removal of allowed ips from a peer. A user can remove an allowed ip from a peer in one of two ways: 1. By using the WGPEER_F_REPLACE_ALLOWEDIPS flag and providing a new list of allowed ips which omits the allowed ip that is to be removed. 2. By reassigning an allowed ip to a "dummy" peer then removing that peer with WGPEER_F_REMOVE_ME. With the first approach, the driver completely rebuilds the allowed ip list for a peer. If my current configuration is such that a peer has allowed ips 192.168.0.2 and 192.168.0.3 and I want to remove 192.168.0.2 the actual transition looks like this. 
[192.168.0.2, 192.168.0.3] <-- Initial state [] <-- Step 1: Allowed ips removed for peer [192.168.0.3] <-- Step 2: Allowed ips added back for peer This is true even if the allowed ip list is small and the update does not need to be batched into multiple WG_CMD_SET_DEVICE requests, as the removal and subsequent addition of ips is non-atomic within a single request. Consequently, wg_allowedips_lookup_dst and wg_allowedips_lookup_src may return NULL while reconfiguring a peer even for packets bound for ips a user did not intend to remove leading to unintended interruptions in connectivity. This presents in userspace as failed calls to sendto and sendmsg for UDP sockets. In my case, I ran netperf while repeatedly reconfiguring the allowed ips for a peer with wg. /usr/local/bin/netperf -H 10.102.73.72 -l 10m -t UDP_STREAM -- -R 1 -m 1024 send_data: data send error: No route to host (errno 113) netperf: send_omni: send_data failed: No route to host While this may not be of particular concern for environments where peers and allowed ips are mostly static, systems like Cilium manage peers and allowed ips in a dynamic environment where peers (i.e. Kubernetes nodes) and allowed ips (i.e. pods running on those nodes) can frequently change making WGPEER_F_REPLACE_ALLOWEDIPS problematic. The second approach avoids any possible connectivity interruptions but is hacky and less direct, requiring the creation of a temporary peer just to dispose of an allowed ip. Introduce a new flag called WGALLOWEDIP_F_REMOVE_ME which in the same way that WGPEER_F_REMOVE_ME allows a user to remove a single peer from a WireGuard device's configuration allows a user to remove an ip from a peer's set of allowed ips. This enables incremental updates to a device's configuration without any connectivity blips or messy workarounds. A corresponding patch for wg extends the existing `wg set` interface to leverage this feature. 
$ wg set wg0 peer allowed-ips +192.168.88.0/24,-192.168.0.1/32 When '+' or '-' is prepended to any ip in the list, wg clears WGPEER_F_REPLACE_ALLOWEDIPS and sets the WGALLOWEDIP_F_REMOVE_ME flag on any ip prefixed with '-'. Signed-off-by: Jordan Rife [Jason: minor style nits, fixes to selftest, bump of wireguard-tools version] Signed-off-by: Jason A. Donenfeld Link: https://patch.msgid.link/20250521212707.1767879-5-Jason@zx2c4.com Signed-off-by: Paolo Abeni --- include/uapi/linux/wireguard.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/wireguard.h b/include/uapi/linux/wireguard.h index ae88be14c947..8c26391196d5 100644 --- a/include/uapi/linux/wireguard.h +++ b/include/uapi/linux/wireguard.h @@ -101,6 +101,10 @@ * WGALLOWEDIP_A_FAMILY: NLA_U16 * WGALLOWEDIP_A_IPADDR: struct in_addr or struct in6_addr * WGALLOWEDIP_A_CIDR_MASK: NLA_U8 + * WGALLOWEDIP_A_FLAGS: NLA_U32, WGALLOWEDIP_F_REMOVE_ME if + * the specified IP should be removed; + * otherwise, this IP will be added if + * it is not already present. * 0: NLA_NESTED * ... * 0: NLA_NESTED @@ -184,11 +188,16 @@ enum wgpeer_attribute { }; #define WGPEER_A_MAX (__WGPEER_A_LAST - 1) +enum wgallowedip_flag { + WGALLOWEDIP_F_REMOVE_ME = 1U << 0, + __WGALLOWEDIP_F_ALL = WGALLOWEDIP_F_REMOVE_ME +}; enum wgallowedip_attribute { WGALLOWEDIP_A_UNSPEC, WGALLOWEDIP_A_FAMILY, WGALLOWEDIP_A_IPADDR, WGALLOWEDIP_A_CIDR_MASK, + WGALLOWEDIP_A_FLAGS, __WGALLOWEDIP_A_LAST }; #define WGALLOWEDIP_A_MAX (__WGALLOWEDIP_A_LAST - 1) -- cgit v1.2.3 From 5ec40864aaecc4bd66fe67541d4a41091ed664a5 Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Thu, 22 May 2025 01:18:22 +0200 Subject: vsock: Move lingering logic to af_vsock core Lingering should be transport-independent in the long run. In preparation for supporting other transports, as well as the linger on shutdown(), move code to core. 
Generalize by querying vsock_transport::unsent_bytes(), guard against the callback being unimplemented. Do not pass sk_lingertime explicitly. Pull SOCK_LINGER check into vsock_linger(). Flatten the function. Remove the nested block by inverting the condition: return early on !timeout. Suggested-by: Stefano Garzarella Reviewed-by: Stefano Garzarella Signed-off-by: Michal Luczaj Link: https://patch.msgid.link/20250522-vsock-linger-v6-2-2ad00b0e447e@rbox.co Signed-off-by: Paolo Abeni --- include/net/af_vsock.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h index 9e85424c8343..d56e6e135158 100644 --- a/include/net/af_vsock.h +++ b/include/net/af_vsock.h @@ -221,6 +221,7 @@ void vsock_for_each_connected_socket(struct vsock_transport *transport, void (*fn)(struct sock *sk)); int vsock_assign_transport(struct vsock_sock *vsk, struct vsock_sock *psk); bool vsock_find_cid(unsigned int cid); +void vsock_linger(struct sock *sk); /**** TAP ****/ -- cgit v1.2.3 From e9cb929670a1e98b592b30f03f06e9e20110f318 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Thu, 22 May 2025 13:21:47 +0200 Subject: net: phy: fix up const issues in to_mdio_device() and to_phy_device() Both to_mdio_device() and to_phy_device() "throw away" the const pointer attribute passed to them and return a non-const pointer, which generally is not a good thing overall. Fix this up by using container_of_const() which was designed for this very problem. 
Cc: Alexander Lobakin Cc: Andrew Lunn Cc: Heiner Kallweit Cc: Russell King Fixes: 7eab14de73a8 ("mdio, phy: fix -Wshadow warnings triggered by nested container_of()") Signed-off-by: Greg Kroah-Hartman Link: https://patch.msgid.link/2025052246-conduit-glory-8fc9@gregkh Signed-off-by: Jakub Kicinski --- include/linux/mdio.h | 5 +---- include/linux/phy.h | 5 +---- 2 files changed, 2 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/mdio.h b/include/linux/mdio.h index 3c3deac57894..e43ff9f980a4 100644 --- a/include/linux/mdio.h +++ b/include/linux/mdio.h @@ -45,10 +45,7 @@ struct mdio_device { unsigned int reset_deassert_delay; }; -static inline struct mdio_device *to_mdio_device(const struct device *dev) -{ - return container_of(dev, struct mdio_device, dev); -} +#define to_mdio_device(__dev) container_of_const(__dev, struct mdio_device, dev) /* struct mdio_driver_common: Common to all MDIO drivers */ struct mdio_driver_common { diff --git a/include/linux/phy.h b/include/linux/phy.h index 32b9da274115..e194dad1623d 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -744,10 +744,7 @@ struct phy_device { #define PHY_F_NO_IRQ 0x80000000 #define PHY_F_RXC_ALWAYS_ON 0x40000000 -static inline struct phy_device *to_phy_device(const struct device *dev) -{ - return container_of(to_mdio_device(dev), struct phy_device, mdio); -} +#define to_phy_device(__dev) container_of_const(to_mdio_device(__dev), struct phy_device, mdio) /** * struct phy_tdr_config - Configuration of a TDR raw test -- cgit v1.2.3 From 33f1b3677a13dda60a2a59858f7916672e7f1546 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 26 May 2025 07:47:45 +0200 Subject: sctp: mark sctp_do_peeloff static sctp_do_peeloff is only used inside of net/sctp/socket.c, so mark it static. 
Signed-off-by: Christoph Hellwig Acked-by: Xin Long Link: https://patch.msgid.link/20250526054745.2329201-1-hch@lst.de Signed-off-by: Jakub Kicinski --- include/net/sctp/sctp.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h index d8da764cf6de..e96d1bd087f6 100644 --- a/include/net/sctp/sctp.h +++ b/include/net/sctp/sctp.h @@ -364,8 +364,6 @@ sctp_assoc_to_state(const struct sctp_association *asoc) /* Look up the association by its id. */ struct sctp_association *sctp_id2assoc(struct sock *sk, sctp_assoc_t id); -int sctp_do_peeloff(struct sock *sk, sctp_assoc_t id, struct socket **sockp); - /* A macro to walk a list of skbs. */ #define sctp_skb_for_each(pos, head, tmp) \ skb_queue_walk_safe(head, pos, tmp) -- cgit v1.2.3 From 290e5d3c49f687c1567bde634dc33d57b0674919 Mon Sep 17 00:00:00 2001 From: Haiyang Zhang Date: Mon, 19 May 2025 09:20:36 -0700 Subject: net: mana: Add support for Multi Vports on Bare metal To support Multi Vports on Bare metal, increase the device config response version. And, skip the register HW vport, and register filter steps, when the Bare metal hostmode is set. Signed-off-by: Haiyang Zhang Link: https://patch.msgid.link/1747671636-5810-1-git-send-email-haiyangz@microsoft.com Signed-off-by: Paolo Abeni --- include/net/mana/mana.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h index 0f78065de8fe..38238c1d00bf 100644 --- a/include/net/mana/mana.h +++ b/include/net/mana/mana.h @@ -408,6 +408,7 @@ struct mana_context { struct gdma_dev *gdma_dev; u16 num_ports; + u8 bm_hostmode; struct mana_eq *eqs; struct dentry *mana_eqs_debugfs; @@ -557,7 +558,8 @@ struct mana_query_device_cfg_resp { u64 pf_cap_flags4; u16 max_num_vports; - u16 reserved; + u8 bm_hostmode; /* response v3: Bare Metal Host Mode */ + u8 reserved; u32 max_num_eqs; /* response v2: */ -- cgit v1.2.3