From 0a7de4a8f898c480ffafe024c4a0a8b8819597f1 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 7 Apr 2025 16:36:02 +0000
Subject: net: rps: remove kfree_rcu_mightsleep() use

Add an rcu_head to sd_flow_limit and rps_sock_flow_table structs
to use the more conventional and predictable k[v]free_rcu().

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Link: https://patch.msgid.link/20250407163602.170356-5-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/rps.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/rps.h b/include/net/rps.h
index e358e9711f27..507f4aa5d39b 100644
--- a/include/net/rps.h
+++ b/include/net/rps.h
@@ -57,9 +57,10 @@ struct rps_dev_flow_table {
  * meaning we use 32-6=26 bits for the hash.
  */
 struct rps_sock_flow_table {
-	u32	mask;
+	struct rcu_head	rcu;
+	u32		mask;
 
-	u32	ents[] ____cacheline_aligned_in_smp;
+	u32		ents[] ____cacheline_aligned_in_smp;
 };
 #define	RPS_SOCK_FLOW_TABLE_SIZE(_num) (offsetof(struct rps_sock_flow_table, ents[_num]))
 
-- 
cgit v1.2.3


From a36283e2b683f172aa1760c77325e50b16c0f792 Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Mon, 7 Apr 2025 17:45:41 +0200
Subject: udp_tunnel: create a fastpath GRO lookup.

Most UDP tunnels bind a socket to a local port, with ANY address, no
peer and no interface index specified.
Additionally it's quite common to have a single tunnel device per
namespace.

Track in each namespace the UDP tunnel socket respecting the above.
When only a single one is present, store a reference in the netns.

When such reference is not NULL, UDP tunnel GRO lookup just needs to
match the incoming packet destination port vs the socket local port.

The tunnel socket never sets the reuse[port] flag[s]. When bound to no
address and interface, no other socket can exist in the same netns
matching the specified local port.

Matching packets with non-local destination addresses will be
aggregated, and eventually segmented as needed - no behavior changes
intended.

Restrict the optimization to kernel sockets only: it covers all the
relevant use-cases, and user-space owned sockets could be disconnected
and rebound after setup_udp_tunnel_sock(), breaking the uniqueness
assumption.

Note that the UDP tunnel socket reference is stored into struct
netns_ipv4 for both IPv4 and IPv6 tunnels. That is intentional to keep
all the fastpath-related netns fields in the same struct and allow
cacheline-based optimization. Currently both the IPv4 and IPv6 socket
pointers share the same cacheline as the `udp_table` field.

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Link: https://patch.msgid.link/41d16bc8d1257d567f9344c445b4ae0b4a91ede4.1744040675.git.pabeni@redhat.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/udp.h      | 16 ++++++++++++++++
 include/net/netns/ipv4.h | 11 +++++++++++
 include/net/udp.h        |  1 +
 include/net/udp_tunnel.h | 12 ++++++++++++
 4 files changed, 40 insertions(+)

(limited to 'include')

diff --git a/include/linux/udp.h b/include/linux/udp.h
index 0807e21cfec9..895240177f4f 100644
--- a/include/linux/udp.h
+++ b/include/linux/udp.h
@@ -101,6 +101,13 @@ struct udp_sock {
 
 	/* Cache friendly copy of sk->sk_peek_off >= 0 */
 	bool		peeking_with_offset;
+
+	/*
+	 * Accounting for the tunnel GRO fastpath.
+	 * Unprotected by compilers guard, as it uses space available in
+	 * the last UDP socket cacheline.
+	 */
+	struct hlist_node	tunnel_list;
 };
 
 #define udp_test_bit(nr, sk)			\
@@ -219,4 +226,13 @@ static inline void udp_allow_gso(struct sock *sk)
 
 #define IS_UDPLITE(__sk) (__sk->sk_protocol == IPPROTO_UDPLITE)
 
+static inline struct sock *udp_tunnel_sk(const struct net *net, bool is_ipv6)
+{
+#if IS_ENABLED(CONFIG_NET_UDP_TUNNEL)
+	return rcu_dereference(net->ipv4.udp_tunnel_gro[is_ipv6].sk);
+#else
+	return NULL;
+#endif
+}
+
 #endif	/* _LINUX_UDP_H */
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 650b2dc9199f..6373e3f17da8 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -47,6 +47,11 @@ struct sysctl_fib_multipath_hash_seed {
 };
 #endif
 
+struct udp_tunnel_gro {
+	struct sock __rcu *sk;
+	struct hlist_head list;
+};
+
 struct netns_ipv4 {
 	/* Cacheline organization can be found documented in
 	 * Documentation/networking/net_cachelines/netns_ipv4_sysctl.rst.
@@ -85,6 +90,11 @@ struct netns_ipv4 {
 	struct inet_timewait_death_row tcp_death_row;
 	struct udp_table *udp_table;
 
+#if IS_ENABLED(CONFIG_NET_UDP_TUNNEL)
+	/* Not in a pernet subsys because need to be available at GRO stage */
+	struct udp_tunnel_gro udp_tunnel_gro[2];
+#endif
+
 #ifdef CONFIG_SYSCTL
 	struct ctl_table_header	*forw_hdr;
 	struct ctl_table_header	*frags_hdr;
@@ -277,4 +287,5 @@ struct netns_ipv4 {
 	struct hlist_head	*inet_addr_lst;
 	struct delayed_work	addr_chk_work;
 };
+
 #endif
diff --git a/include/net/udp.h b/include/net/udp.h
index 6e89520e100d..a772510b2aa5 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -290,6 +290,7 @@ static inline void udp_lib_init_sock(struct sock *sk)
 	struct udp_sock *up = udp_sk(sk);
 
 	skb_queue_head_init(&up->reader_queue);
+	INIT_HLIST_NODE(&up->tunnel_list);
 	up->forward_threshold = sk->sk_rcvbuf >> 2;
 	set_bit(SOCK_CUSTOM_SOCKOPT, &sk->sk_socket->flags);
 }
diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h
index a93dc51f6323..1bb2b852e90e 100644
--- a/include/net/udp_tunnel.h
+++ b/include/net/udp_tunnel.h
@@ -191,6 +191,18 @@ static inline int udp_tunnel_handle_offloads(struct sk_buff *skb, bool udp_csum)
 }
 #endif
 
+#if IS_ENABLED(CONFIG_NET_UDP_TUNNEL)
+void udp_tunnel_update_gro_lookup(struct net *net, struct sock *sk, bool add);
+#else
+static inline void udp_tunnel_update_gro_lookup(struct net *net,
+						struct sock *sk, bool add) {}
+#endif
+
+static inline void udp_tunnel_cleanup_gro(struct sock *sk)
+{
+	udp_tunnel_update_gro_lookup(sock_net(sk), sk, false);
+}
+
 static inline void udp_tunnel_encap_enable(struct sock *sk)
 {
 	if (udp_test_and_set_bit(ENCAP_ENABLED, sk))
-- 
cgit v1.2.3


From 5d7f5b2f6b935517ee5fd8058dc32342a5cba3e1 Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Mon, 7 Apr 2025 17:45:42 +0200
Subject: udp_tunnel: use static call for GRO hooks when possible

It's quite common to have a single UDP tunnel type active in the
whole system. In such a case we can replace the indirect call for
the UDP tunnel GRO callback with a static call.

Add the related accounting in the control path and switch to static
call when possible. To keep the code simple use a static array for
the registered tunnel types, and size such array based on the kernel
config.

Note that there are valid kernel configurations leading to
UDP_MAX_TUNNEL_TYPES == 0 even with IS_ENABLED(CONFIG_NET_UDP_TUNNEL).
Explicitly skip the accounting in such a case, to avoid a compile warning
when accessing "udp_tunnel_gro_types".

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Link: https://patch.msgid.link/53d156cdfddcc9678449e873cc83e68fa1582653.1744040675.git.pabeni@redhat.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/udp_tunnel.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include')

diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h
index 1bb2b852e90e..288f06f23a80 100644
--- a/include/net/udp_tunnel.h
+++ b/include/net/udp_tunnel.h
@@ -193,13 +193,16 @@ static inline int udp_tunnel_handle_offloads(struct sk_buff *skb, bool udp_csum)
 
 #if IS_ENABLED(CONFIG_NET_UDP_TUNNEL)
 void udp_tunnel_update_gro_lookup(struct net *net, struct sock *sk, bool add);
+void udp_tunnel_update_gro_rcv(struct sock *sk, bool add);
 #else
 static inline void udp_tunnel_update_gro_lookup(struct net *net,
 						struct sock *sk, bool add) {}
+static inline void udp_tunnel_update_gro_rcv(struct sock *sk, bool add) {}
 #endif
 
 static inline void udp_tunnel_cleanup_gro(struct sock *sk)
 {
+	udp_tunnel_update_gro_rcv(sk, false);
 	udp_tunnel_update_gro_lookup(sock_net(sk), sk, false);
 }
 
@@ -212,6 +215,7 @@ static inline void udp_tunnel_encap_enable(struct sock *sk)
 	if (READ_ONCE(sk->sk_family) == PF_INET6)
 		ipv6_stub->udpv6_encap_enable();
 #endif
+	udp_tunnel_update_gro_rcv(sk, true);
 	udp_encap_enable();
 }
 
-- 
cgit v1.2.3


From 420aabef3ab5fa743afb4d3d391f03ef0e777ca8 Mon Sep 17 00:00:00 2001
From: Michal Luczaj <mhal@rbox.co>
Date: Mon, 7 Apr 2025 21:01:02 +0200
Subject: net: Drop unused @sk of __skb_try_recv_from_queue()

__skb_try_recv_from_queue() deals with a queue, @sk is not used
since commit e427cad6eee4 ("net: datagram: drop 'destructor'
argument from several helpers"). Remove sk from function parameters,
adapt callers.

No functional change intended.

Signed-off-by: Michal Luczaj <mhal@rbox.co>
Reviewed-by: Joe Damato <jdamato@fastly.com>
Link: https://patch.msgid.link/20250407-cleanup-drop-param-sk-v1-1-cd076979afac@rbox.co
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/skbuff.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index b974a277975a..f1381aff0f89 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -4105,8 +4105,7 @@ static inline void skb_frag_list_init(struct sk_buff *skb)
 int __skb_wait_for_more_packets(struct sock *sk, struct sk_buff_head *queue,
 				int *err, long *timeo_p,
 				const struct sk_buff *skb);
-struct sk_buff *__skb_try_recv_from_queue(struct sock *sk,
-					  struct sk_buff_head *queue,
+struct sk_buff *__skb_try_recv_from_queue(struct sk_buff_head *queue,
 					  unsigned int flags,
 					  int *off, int *err,
 					  struct sk_buff **last);
-- 
cgit v1.2.3


From a82dc19db13649aa4232ce37cb6f4ceff851e2fe Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Tue, 8 Apr 2025 12:59:48 -0700
Subject: net: avoid potential race between netdev_get_by_index_lock() and
 netns switch

netdev_get_by_index_lock() performs the following steps:

  rcu_lock();
  dev = lookup(netns, ifindex);
  dev_get(dev);
  rcu_unlock();
  [... lock & validate the dev ...]
  return dev

Validation right now only checks if the device is registered but since
the lookup is netns-aware we must also protect against the device
switching netns right after we dropped the RCU lock. Otherwise
the caller in netns1 may get a pointer to a device which has just
switched to netns2.

We can't hold the lock for the entire netns change process (because of
the NETDEV_UNREGISTER notifier), and there's no existing marking to
indicate that the netdev is unlisted because of a netns move, so add one.

AFAIU none of the existing netdev_get_by_index_lock() callers can
suffer from this problem (NAPI code double checks the netns membership
and other callers are either under rtnl_lock or not ns-sensitive),
so this patch does not have to be treated as a fix.

Reviewed-by: Joe Damato <jdamato@fastly.com>
Acked-by: Stanislav Fomichev <sdf@fomichev.me>
Link: https://patch.msgid.link/20250408195956.412733-2-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/netdevice.h | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index cf3b6445817b..8e9be80bc167 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1952,6 +1952,7 @@ enum netdev_reg_state {
  *	@priv_destructor:	Called from unregister
  *	@npinfo:		XXX: need comments on this one
  * 	@nd_net:		Network namespace this network device is inside
+ *				protected by @lock
  *
  * 	@ml_priv:	Mid-layer private
  *	@ml_priv_type:  Mid-layer private type
@@ -2359,6 +2360,9 @@ struct net_device {
 
 	bool dismantle;
 
+	/** @moving_ns: device is changing netns, protected by @lock */
+	bool moving_ns;
+
 	enum {
 		RTNL_LINK_INITIALIZED,
 		RTNL_LINK_INITIALIZING,
@@ -2521,7 +2525,7 @@ struct net_device {
 	 *	@net_shaper_hierarchy, @reg_state, @threaded
 	 *
 	 * Double protects:
-	 *	@up
+	 *	@up, @moving_ns, @nd_net
 	 *
 	 * Double ops protects:
 	 *	@real_num_rx_queues, @real_num_tx_queues
-- 
cgit v1.2.3


From 606048cbd8346e616cfaee01b0143d072534136d Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Tue, 8 Apr 2025 12:59:49 -0700
Subject: net: designate XSK pool pointers in queues as "ops protected"

Read accesses go via xsk_get_pool_from_qid(), the calls coming
from the core and gve look safe (other "ops locked" drivers
don't support XSK).

Write accesses go via xsk_reg_pool_at_qid() and xsk_clear_pool_at_qid().
Former is already under the ops lock, latter is not (both coming from
the workqueue via xp_clear_dev() and NETDEV_UNREGISTER via xsk_notifier()).

Acked-by: Stanislav Fomichev <sdf@fomichev.me>
Signed-off-by: Stanislav Fomichev <sdf@fomichev.me>
Link: https://patch.msgid.link/20250408195956.412733-3-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/netdevice.h     | 1 +
 include/net/netdev_rx_queue.h | 6 +++---
 2 files changed, 4 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 8e9be80bc167..7242fb8a22fc 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -688,6 +688,7 @@ struct netdev_queue {
 	/* Subordinate device that the queue has been assigned to */
 	struct net_device	*sb_dev;
 #ifdef CONFIG_XDP_SOCKETS
+	/* "ops protected", see comment about net_device::lock */
 	struct xsk_buff_pool    *pool;
 #endif
 
diff --git a/include/net/netdev_rx_queue.h b/include/net/netdev_rx_queue.h
index b2238b551dce..8cdcd138b33f 100644
--- a/include/net/netdev_rx_queue.h
+++ b/include/net/netdev_rx_queue.h
@@ -20,12 +20,12 @@ struct netdev_rx_queue {
 	struct net_device		*dev;
 	netdevice_tracker		dev_tracker;
 
+	/* All fields below are "ops protected",
+	 * see comment about net_device::lock
+	 */
 #ifdef CONFIG_XDP_SOCKETS
 	struct xsk_buff_pool            *pool;
 #endif
-	/* NAPI instance for the queue
-	 * "ops protected", see comment about net_device::lock
-	 */
 	struct napi_struct		*napi;
 	struct pp_memory_provider_params mp_params;
 } ____cacheline_aligned_in_smp;
-- 
cgit v1.2.3


From 4ec9031cbeb73a66979560bbb6d355329be762de Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Tue, 8 Apr 2025 12:59:50 -0700
Subject: netdev: add "ops compat locking" helpers

Add helpers to "lock a netdev in a backward-compatible way",
which for ops-locked netdevs will mean take the instance lock.
For drivers which haven't opted into the ops locking we'll take
rtnl_lock.

The scoped foreach is dropping and re-taking the lock for each
device, even if prev and next are both under rtnl_lock.
I hope that's fine since we expect netdev nl to be mostly
supported by modern drivers, and modern drivers should also
opt into the instance locking.

Note that these helpers are mostly needed for queue related state,
because drivers modify queue config in their ops in a non-atomic
way. Or differently put, queue changes don't have a clear-cut API
like NAPI configuration. Any state that can should just use the
instance lock directly, not the "compat" hacks.

Reviewed-by: Joe Damato <jdamato@fastly.com>
Acked-by: Stanislav Fomichev <sdf@fomichev.me>
Link: https://patch.msgid.link/20250408195956.412733-4-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/netdev_lock.h | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

(limited to 'include')

diff --git a/include/net/netdev_lock.h b/include/net/netdev_lock.h
index c316b551df8d..5706835a660c 100644
--- a/include/net/netdev_lock.h
+++ b/include/net/netdev_lock.h
@@ -64,6 +64,22 @@ netdev_ops_assert_locked_or_invisible(const struct net_device *dev)
 		netdev_ops_assert_locked(dev);
 }
 
+static inline void netdev_lock_ops_compat(struct net_device *dev)
+{
+	if (netdev_need_ops_lock(dev))
+		netdev_lock(dev);
+	else
+		rtnl_lock();
+}
+
+static inline void netdev_unlock_ops_compat(struct net_device *dev)
+{
+	if (netdev_need_ops_lock(dev))
+		netdev_unlock(dev);
+	else
+		rtnl_unlock();
+}
+
 static inline int netdev_lock_cmp_fn(const struct lockdep_map *a,
 				     const struct lockdep_map *b)
 {
-- 
cgit v1.2.3


From 03df156dd3a6d5992f17682cd5c3b11e5ffdae02 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Tue, 8 Apr 2025 12:59:52 -0700
Subject: xdp: double protect netdev->xdp_flags with netdev->lock

Protect xdp_features with netdev->lock. This way pure readers
no longer have to take rtnl_lock to access the field.

This includes calling NETDEV_XDP_FEAT_CHANGE under the lock.
Looks like that's fine for bonding, the only "real" listener,
it's the same as ethtool feature change.

In terms of normal drivers - only GVE needs special consideration
(other drivers don't use instance lock or don't support XDP).
It calls xdp_set_features_flag() helper from gve_init_priv() which
in turn is called from gve_reset_recovery() (locked), or prior
to netdev registration. So switch to _locked.

Reviewed-by: Joe Damato <jdamato@fastly.com>
Acked-by: Stanislav Fomichev <sdf@fomichev.me>
Acked-by: Harshitha Ramamurthy <hramamurthy@google.com>
Acked-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://patch.msgid.link/20250408195956.412733-6-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/netdevice.h | 2 +-
 include/net/xdp.h         | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 7242fb8a22fc..dece2ae396a1 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2526,7 +2526,7 @@ struct net_device {
 	 *	@net_shaper_hierarchy, @reg_state, @threaded
 	 *
 	 * Double protects:
-	 *	@up, @moving_ns, @nd_net
+	 *	@up, @moving_ns, @nd_net, @xdp_features
 	 *
 	 * Double ops protects:
 	 *	@real_num_rx_queues, @real_num_tx_queues
diff --git a/include/net/xdp.h b/include/net/xdp.h
index 48efacbaa35d..20e41b5ff319 100644
--- a/include/net/xdp.h
+++ b/include/net/xdp.h
@@ -616,6 +616,7 @@ struct xdp_metadata_ops {
 u32 bpf_xdp_metadata_kfunc_id(int id);
 bool bpf_dev_bound_kfunc_id(u32 btf_id);
 void xdp_set_features_flag(struct net_device *dev, xdp_features_t val);
+void xdp_set_features_flag_locked(struct net_device *dev, xdp_features_t val);
 void xdp_features_set_redirect_target(struct net_device *dev, bool support_sg);
 void xdp_features_clear_redirect_target(struct net_device *dev);
 #else
-- 
cgit v1.2.3


From ce7b14947484e6190372f2c3dbfb69aafbc4c0fc Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Tue, 8 Apr 2025 12:59:55 -0700
Subject: netdev: depend on netdev->lock for qstats in ops locked drivers

We mostly needed rtnl_lock in qstat to make sure the queue count
is stable while we work. For "ops locked" drivers the instance
lock protects the queue count, so we don't have to take rtnl_lock.

For currently ops-locked drivers: netdevsim and bnxt need
the protection from netdev going down while we dump, which
instance lock provides. gve doesn't care.

Reviewed-by: Joe Damato <jdamato@fastly.com>
Acked-by: Stanislav Fomichev <sdf@fomichev.me>
Link: https://patch.msgid.link/20250408195956.412733-9-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/netdev_queues.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/netdev_queues.h b/include/net/netdev_queues.h
index 825141d675e5..ea709b59d827 100644
--- a/include/net/netdev_queues.h
+++ b/include/net/netdev_queues.h
@@ -85,9 +85,11 @@ struct netdev_queue_stats_tx {
  * for some of the events is not maintained, and reliable "total" cannot
  * be provided).
  *
+ * Ops are called under the instance lock if netdev_need_ops_lock()
+ * returns true, otherwise under rtnl_lock.
  * Device drivers can assume that when collecting total device stats,
  * the @get_base_stats and subsequent per-queue calls are performed
- * "atomically" (without releasing the rtnl_lock).
+ * "atomically" (without releasing the relevant lock).
  *
  * Device drivers are encouraged to reset the per-queue statistics when
  * number of queues change. This is because the primary use case for
-- 
cgit v1.2.3


From 229671ac60e298b85c2644f52d7e487e9f487d06 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 8 Apr 2025 20:27:42 +0000
Subject: net: remove cpu stall in txq_trans_update()

txq_trans_update() currently uses txq->xmit_lock_owner
to conditionally update txq->trans_start.

For regular devices, txq->xmit_lock_owner is updated
from HARD_TX_LOCK() and HARD_TX_UNLOCK(), and this apparently
causes cpu stalls.

Using dev->lltx, which sits in a read-mostly cache-line,
and is already used in HARD_TX_LOCK() and HARD_TX_UNLOCK(),
helps cpu prediction.

On an AMD EPYC 7B12 dual socket server, tcp_rr with 128 threads
and 30,000 flows gets a 5 % increase in throughput.

As explained in commit 95ecba62e2fd ("net: fix races in
netdev_tx_sent_queue()/dev_watchdog()") I am planning
to no longer update txq->trans_start in the fast path
in a followup patch.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Link: https://patch.msgid.link/20250408202742.2145516-1-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/netdevice.h | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index dece2ae396a1..a28a08046615 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -4693,9 +4693,10 @@ static inline void __netif_tx_unlock_bh(struct netdev_queue *txq)
 /*
  * txq->trans_start can be read locklessly from dev_watchdog()
  */
-static inline void txq_trans_update(struct netdev_queue *txq)
+static inline void txq_trans_update(const struct net_device *dev,
+				    struct netdev_queue *txq)
 {
-	if (txq->xmit_lock_owner != -1)
+	if (!dev->lltx)
 		WRITE_ONCE(txq->trans_start, jiffies);
 }
 
@@ -5214,7 +5215,7 @@ static inline netdev_tx_t netdev_start_xmit(struct sk_buff *skb, struct net_devi
 
 	rc = __netdev_start_xmit(ops, skb, dev, more);
 	if (rc == NETDEV_TX_OK)
-		txq_trans_update(txq);
+		txq_trans_update(dev, txq);
 
 	return rc;
 }
-- 
cgit v1.2.3


From 04271411121a58d37f47b065bc872f333274bf1f Mon Sep 17 00:00:00 2001
From: Jiayuan Chen <jiayuan.chen@linux.dev>
Date: Wed, 9 Apr 2025 19:26:04 +0800
Subject: tcp: add TCP_RFC7323_TW_PAWS drop reason

Devices in the networking path, such as firewalls, NATs, or routers, which
can perform SNAT or DNAT, use addresses from their own limited address
pools to masquerade the source address during forwarding, causing PAWS
verification to fail more easily.

Currently, packet loss statistics for PAWS can only be viewed through MIB,
which is a global metric and cannot be precisely obtained through tracing
to get the specific 4-tuple of the dropped packet. In the past, we had to
use kprobe ret to retrieve relevant skb information from
tcp_timewait_state_process().

We add a drop_reason pointer, similar to what previous commit does:
commit e34100c2ecbb ("tcp: add a drop_reason pointer to tcp_check_req()")

This commit addresses the PAWSESTABREJECTED case and also sets the
corresponding drop reason.

We use 'pwru' to test.

Before this commit:
'''
./pwru 'port 9999'
2025/04/07 13:40:19 Listening for events..
TUPLE                                        FUNC
172.31.75.115:12345->172.31.75.114:9999(tcp) sk_skb_reason_drop(SKB_DROP_REASON_NOT_SPECIFIED)
'''

After this commit:
'''
./pwru 'port 9999'
2025/04/07 13:51:34 Listening for events..
TUPLE                                        FUNC
172.31.75.115:12345->172.31.75.114:9999(tcp) sk_skb_reason_drop(SKB_DROP_REASON_TCP_RFC7323_TW_PAWS)
'''

Suggested-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Jiayuan Chen <jiayuan.chen@linux.dev>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Link: https://patch.msgid.link/20250409112614.16153-2-jiayuan.chen@linux.dev
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/dropreason-core.h | 6 ++++++
 include/net/tcp.h             | 3 ++-
 2 files changed, 8 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h
index e4fdc6b54cef..9701d7f936f6 100644
--- a/include/net/dropreason-core.h
+++ b/include/net/dropreason-core.h
@@ -40,6 +40,7 @@
 	FN(TCP_OFOMERGE)		\
 	FN(TCP_RFC7323_PAWS)		\
 	FN(TCP_RFC7323_PAWS_ACK)	\
+	FN(TCP_RFC7323_TW_PAWS)		\
 	FN(TCP_RFC7323_TSECR)		\
 	FN(TCP_LISTEN_OVERFLOW)		\
 	FN(TCP_OLD_SEQUENCE)		\
@@ -283,6 +284,11 @@ enum skb_drop_reason {
 	 * Corresponds to LINUX_MIB_PAWS_OLD_ACK.
 	 */
 	SKB_DROP_REASON_TCP_RFC7323_PAWS_ACK,
+	/**
+	 * @SKB_DROP_REASON_TCP_RFC7323_TW_PAWS: PAWS check, socket is in
+	 * TIME_WAIT state.
+	 */
+	SKB_DROP_REASON_TCP_RFC7323_TW_PAWS,
 	/**
 	 * @SKB_DROP_REASON_TCP_RFC7323_TSECR: PAWS check, invalid TSEcr.
 	 * Corresponds to LINUX_MIB_TSECRREJECTED.
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 4450c384ef17..5078ad868fee 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -427,7 +427,8 @@ enum tcp_tw_status {
 enum tcp_tw_status tcp_timewait_state_process(struct inet_timewait_sock *tw,
 					      struct sk_buff *skb,
 					      const struct tcphdr *th,
-					      u32 *tw_isn);
+					      u32 *tw_isn,
+					      enum skb_drop_reason *drop_reason);
 struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
 			   struct request_sock *req, bool fastopen,
 			   bool *lost_race, enum skb_drop_reason *drop_reason);
-- 
cgit v1.2.3


From c449d5f3a3d70b6223af8df2cadca3ca6eacb613 Mon Sep 17 00:00:00 2001
From: Jiayuan Chen <jiayuan.chen@linux.dev>
Date: Wed, 9 Apr 2025 19:26:05 +0800
Subject: tcp: add LINUX_MIB_PAWS_TW_REJECTED counter

When TCP is in TIME_WAIT state, PAWS verification uses
LINUX_MIB_PAWSESTABREJECTED, which is ambiguous and cannot be distinguished
from other PAWS verification processes.

We added a new counter, like the existing PAWS_OLD_ACK one.

Also we update the doc with previously missing PAWS_OLD_ACK.

usage:
'''
nstat -az | grep PAWSTimewait
TcpExtPAWSTimewait              1                  0.0
'''

Suggested-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Jiayuan Chen <jiayuan.chen@linux.dev>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Link: https://patch.msgid.link/20250409112614.16153-3-jiayuan.chen@linux.dev
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/dropreason-core.h | 1 +
 include/uapi/linux/snmp.h     | 1 +
 2 files changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h
index 9701d7f936f6..bea77934a235 100644
--- a/include/net/dropreason-core.h
+++ b/include/net/dropreason-core.h
@@ -287,6 +287,7 @@ enum skb_drop_reason {
 	/**
 	 * @SKB_DROP_REASON_TCP_RFC7323_TW_PAWS: PAWS check, socket is in
 	 * TIME_WAIT state.
+	 * Corresponds to LINUX_MIB_PAWS_TW_REJECTED.
 	 */
 	SKB_DROP_REASON_TCP_RFC7323_TW_PAWS,
 	/**
diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h
index ec47f9b68a1b..1d234d7e1892 100644
--- a/include/uapi/linux/snmp.h
+++ b/include/uapi/linux/snmp.h
@@ -188,6 +188,7 @@ enum
 	LINUX_MIB_PAWSESTABREJECTED,		/* PAWSEstabRejected */
 	LINUX_MIB_TSECRREJECTED,		/* TSEcrRejected */
 	LINUX_MIB_PAWS_OLD_ACK,			/* PAWSOldAck */
+	LINUX_MIB_PAWS_TW_REJECTED,		/* PAWSTimewait */
 	LINUX_MIB_DELAYEDACKS,			/* DelayedACKs */
 	LINUX_MIB_DELAYEDACKLOCKED,		/* DelayedACKLocked */
 	LINUX_MIB_DELAYEDACKLOST,		/* DelayedACKLost */
-- 
cgit v1.2.3


From b1e904999542ad6764eafa54545f1c55776006d1 Mon Sep 17 00:00:00 2001
From: Breno Leitao <leitao@debian.org>
Date: Tue, 8 Apr 2025 11:32:01 -0700
Subject: net: pass const to msg_data_left()

The msg_data_left() function doesn't modify the struct msghdr parameter,
so mark it as const. This allows the function to be used with const
references, improving type safety and making the API more flexible.

Signed-off-by: Breno Leitao <leitao@debian.org>
Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Link: https://patch.msgid.link/20250408-tcpsendmsg-v3-1-208b87064c28@debian.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/socket.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/socket.h b/include/linux/socket.h
index c3322eb3d686..3b262487ec06 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -168,7 +168,7 @@ static inline struct cmsghdr * cmsg_nxthdr (struct msghdr *__msg, struct cmsghdr
 	return __cmsg_nxthdr(__msg->msg_control, __msg->msg_controllen, __cmsg);
 }
 
-static inline size_t msg_data_left(struct msghdr *msg)
+static inline size_t msg_data_left(const struct msghdr *msg)
 {
 	return iov_iter_count(&msg->msg_iter);
 }
-- 
cgit v1.2.3


From 0f08335ade71273f89d19412268b48b55f3e3726 Mon Sep 17 00:00:00 2001
From: Breno Leitao <leitao@debian.org>
Date: Tue, 8 Apr 2025 11:32:02 -0700
Subject: trace: tcp: Add tracepoint for tcp_sendmsg_locked()

Add a tracepoint to monitor TCP send operations, enabling detailed
visibility into TCP message transmission.

Create a new tracepoint within the tcp_sendmsg_locked function,
capturing traditional fields along with size_goal, which indicates the
optimal data size for a single TCP segment. Additionally, a reference to
the struct sock sk is passed, allowing direct access for BPF programs.
The implementation is largely based on David's patch[1] and suggestions.

Link: https://lore.kernel.org/all/70168c8f-bf52-4279-b4c4-be64527aa1ac@kernel.org/ [1]
Signed-off-by: Breno Leitao <leitao@debian.org>
Reviewed-by: David Ahern <dsahern@kernel.org>
Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Link: https://patch.msgid.link/20250408-tcpsendmsg-v3-2-208b87064c28@debian.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/trace/events/tcp.h | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

(limited to 'include')

diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h
index 1a40c41ff8c3..75d3d53a3832 100644
--- a/include/trace/events/tcp.h
+++ b/include/trace/events/tcp.h
@@ -259,6 +259,30 @@ TRACE_EVENT(tcp_retransmit_synack,
 		  __entry->saddr_v6, __entry->daddr_v6)
 );
 
+TRACE_EVENT(tcp_sendmsg_locked,
+	TP_PROTO(const struct sock *sk, const struct msghdr *msg,
+		 const struct sk_buff *skb, int size_goal),
+
+	TP_ARGS(sk, msg, skb, size_goal),
+
+	TP_STRUCT__entry(
+		__field(const void *, skb_addr)
+		__field(int, skb_len)
+		__field(int, msg_left)
+		__field(int, size_goal)
+	),
+
+	TP_fast_assign(
+		__entry->skb_addr = skb;
+		__entry->skb_len = skb ? skb->len : 0;
+		__entry->msg_left = msg_data_left(msg);
+		__entry->size_goal = size_goal;
+	),
+
+	TP_printk("skb_addr %p skb_len %d msg_left %d size_goal %d",
+		  __entry->skb_addr, __entry->skb_len, __entry->msg_left,
+		  __entry->size_goal));
+
 DECLARE_TRACE(tcp_cwnd_reduction_tp,
 	TP_PROTO(const struct sock *sk, int newly_acked_sacked,
 		 int newly_lost, int flag),
-- 
cgit v1.2.3


From 2a63dd0edf388802074f1d4d6b588a3b4c380688 Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@amazon.com>
Date: Wed, 9 Apr 2025 19:36:45 -0700
Subject: net: Retire DCCP socket.

DCCP was orphaned in 2021 by commit 054c4610bd05 ("MAINTAINERS: dccp:
move Gerrit Renker to CREDITS"), which noted that the last maintainer
had been inactive for five years.

In recent years, it has become a playground for syzbot, and most changes
to DCCP have been odd bug fixes triggered by syzbot.  Apart from that,
the only changes have been driven by treewide or networking API updates
or adjustments related to TCP.

Thus, in 2023, we announced we would remove DCCP in 2025 via commit
b144fcaf46d4 ("dccp: Print deprecation notice.").

Since then, only one individual has contacted the netdev mailing list. [0]

There is ongoing research for Multipath DCCP.  The repository is hosted
on GitHub [1], and development is not taking place through the upstream
community.  While the repository is published under the GPLv2 license,
the scheduling part remains proprietary, with a LICENSE file [2] stating:

  "This is not Open Source software."

The researcher mentioned a plan to address the licensing issue, upstream
the patches, and step up as a maintainer, but there has been no further
communication since then.

Maintaining DCCP for a decade without any real users has become a burden.

Therefore, it's time to remove it.

Removing DCCP will also provide significant benefits to TCP.  It allows
us to freely reorganize the layout of struct inet_connection_sock, which
is currently shared with DCCP, and optimize it to reduce the number of
cachelines accessed in the TCP fast path.

Note that we keep DCCP netfilter modules as requested.  [3]

Link: https://lore.kernel.org/netdev/20230710182253.81446-1-kuniyu@amazon.com/T/#u #[0]
Link: https://github.com/telekom/mp-dccp #[1]
Link: https://github.com/telekom/mp-dccp/blob/mpdccp_v03_k5.10/net/dccp/non_gpl_scheduler/LICENSE #[2]
Link: https://lore.kernel.org/netdev/Z_VQ0KlCRkqYWXa-@calendula/ #[3]
Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Acked-by: Paul Moore <paul@paul-moore.com> (LSM and SELinux)
Acked-by: Casey Schaufler <casey@schaufler-ca.com>
Link: https://patch.msgid.link/20250410023921.11307-3-kuniyu@amazon.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/dccp.h          | 289 ------------------------------------------
 include/linux/tfrc.h          |  51 --------
 include/net/inet_hashtables.h |   6 +-
 include/net/rstreason.h       |   2 +-
 include/net/secure_seq.h      |   4 -
 include/trace/events/sock.h   |   1 -
 include/trace/events/sunrpc.h |   2 -
 7 files changed, 2 insertions(+), 353 deletions(-)
 delete mode 100644 include/linux/tfrc.h

(limited to 'include')

diff --git a/include/linux/dccp.h b/include/linux/dccp.h
index 325af611909f..0b61b8b996d4 100644
--- a/include/linux/dccp.h
+++ b/include/linux/dccp.h
@@ -2,79 +2,8 @@
 #ifndef _LINUX_DCCP_H
 #define _LINUX_DCCP_H
 
-
-#include <linux/in.h>
-#include <linux/interrupt.h>
-#include <linux/ktime.h>
-#include <linux/list.h>
-#include <linux/uio.h>
-#include <linux/workqueue.h>
-
-#include <net/inet_connection_sock.h>
-#include <net/inet_sock.h>
-#include <net/inet_timewait_sock.h>
-#include <net/tcp_states.h>
 #include <uapi/linux/dccp.h>
 
-enum dccp_state {
-	DCCP_OPEN	     = TCP_ESTABLISHED,
-	DCCP_REQUESTING	     = TCP_SYN_SENT,
-	DCCP_LISTEN	     = TCP_LISTEN,
-	DCCP_RESPOND	     = TCP_SYN_RECV,
-	/*
-	 * States involved in closing a DCCP connection:
-	 * 1) ACTIVE_CLOSEREQ is entered by a server sending a CloseReq.
-	 *
-	 * 2) CLOSING can have three different meanings (RFC 4340, 8.3):
-	 *  a. Client has performed active-close, has sent a Close to the server
-	 *     from state OPEN or PARTOPEN, and is waiting for the final Reset
-	 *     (in this case, SOCK_DONE == 1).
-	 *  b. Client is asked to perform passive-close, by receiving a CloseReq
-	 *     in (PART)OPEN state. It sends a Close and waits for final Reset
-	 *     (in this case, SOCK_DONE == 0).
-	 *  c. Server performs an active-close as in (a), keeps TIMEWAIT state.
-	 *
-	 * 3) The following intermediate states are employed to give passively
-	 *    closing nodes a chance to process their unread data:
-	 *    - PASSIVE_CLOSE    (from OPEN => CLOSED) and
-	 *    - PASSIVE_CLOSEREQ (from (PART)OPEN to CLOSING; case (b) above).
-	 */
-	DCCP_ACTIVE_CLOSEREQ = TCP_FIN_WAIT1,
-	DCCP_PASSIVE_CLOSE   = TCP_CLOSE_WAIT,	/* any node receiving a Close */
-	DCCP_CLOSING	     = TCP_CLOSING,
-	DCCP_TIME_WAIT	     = TCP_TIME_WAIT,
-	DCCP_CLOSED	     = TCP_CLOSE,
-	DCCP_NEW_SYN_RECV    = TCP_NEW_SYN_RECV,
-	DCCP_PARTOPEN	     = TCP_MAX_STATES,
-	DCCP_PASSIVE_CLOSEREQ,			/* clients receiving CloseReq */
-	DCCP_MAX_STATES
-};
-
-enum {
-	DCCPF_OPEN	      = TCPF_ESTABLISHED,
-	DCCPF_REQUESTING      = TCPF_SYN_SENT,
-	DCCPF_LISTEN	      = TCPF_LISTEN,
-	DCCPF_RESPOND	      = TCPF_SYN_RECV,
-	DCCPF_ACTIVE_CLOSEREQ = TCPF_FIN_WAIT1,
-	DCCPF_CLOSING	      = TCPF_CLOSING,
-	DCCPF_TIME_WAIT	      = TCPF_TIME_WAIT,
-	DCCPF_CLOSED	      = TCPF_CLOSE,
-	DCCPF_NEW_SYN_RECV    = TCPF_NEW_SYN_RECV,
-	DCCPF_PARTOPEN	      = (1 << DCCP_PARTOPEN),
-};
-
-static inline struct dccp_hdr *dccp_hdr(const struct sk_buff *skb)
-{
-	return (struct dccp_hdr *)skb_transport_header(skb);
-}
-
-static inline struct dccp_hdr *dccp_zeroed_hdr(struct sk_buff *skb, int headlen)
-{
-	skb_push(skb, headlen);
-	skb_reset_transport_header(skb);
-	return memset(skb_transport_header(skb), 0, headlen);
-}
-
 static inline struct dccp_hdr_ext *dccp_hdrx(const struct dccp_hdr *dh)
 {
 	return (struct dccp_hdr_ext *)((unsigned char *)dh + sizeof(*dh));
@@ -85,12 +14,6 @@ static inline unsigned int __dccp_basic_hdr_len(const struct dccp_hdr *dh)
 	return sizeof(*dh) + (dh->dccph_x ? sizeof(struct dccp_hdr_ext) : 0);
 }
 
-static inline unsigned int dccp_basic_hdr_len(const struct sk_buff *skb)
-{
-	const struct dccp_hdr *dh = dccp_hdr(skb);
-	return __dccp_basic_hdr_len(dh);
-}
-
 static inline __u64 dccp_hdr_seq(const struct dccp_hdr *dh)
 {
 	__u64 seq_nr =  ntohs(dh->dccph_seq);
@@ -103,222 +26,10 @@ static inline __u64 dccp_hdr_seq(const struct dccp_hdr *dh)
 	return seq_nr;
 }
 
-static inline struct dccp_hdr_request *dccp_hdr_request(struct sk_buff *skb)
-{
-	return (struct dccp_hdr_request *)(skb_transport_header(skb) +
-					   dccp_basic_hdr_len(skb));
-}
-
-static inline struct dccp_hdr_ack_bits *dccp_hdr_ack_bits(const struct sk_buff *skb)
-{
-	return (struct dccp_hdr_ack_bits *)(skb_transport_header(skb) +
-					    dccp_basic_hdr_len(skb));
-}
-
-static inline u64 dccp_hdr_ack_seq(const struct sk_buff *skb)
-{
-	const struct dccp_hdr_ack_bits *dhack = dccp_hdr_ack_bits(skb);
-	return ((u64)ntohs(dhack->dccph_ack_nr_high) << 32) + ntohl(dhack->dccph_ack_nr_low);
-}
-
-static inline struct dccp_hdr_response *dccp_hdr_response(struct sk_buff *skb)
-{
-	return (struct dccp_hdr_response *)(skb_transport_header(skb) +
-					    dccp_basic_hdr_len(skb));
-}
-
-static inline struct dccp_hdr_reset *dccp_hdr_reset(struct sk_buff *skb)
-{
-	return (struct dccp_hdr_reset *)(skb_transport_header(skb) +
-					 dccp_basic_hdr_len(skb));
-}
-
 static inline unsigned int __dccp_hdr_len(const struct dccp_hdr *dh)
 {
 	return __dccp_basic_hdr_len(dh) +
 	       dccp_packet_hdr_len(dh->dccph_type);
 }
 
-static inline unsigned int dccp_hdr_len(const struct sk_buff *skb)
-{
-	return __dccp_hdr_len(dccp_hdr(skb));
-}
-
-/**
- * struct dccp_request_sock  -  represent DCCP-specific connection request
- * @dreq_inet_rsk: structure inherited from
- * @dreq_iss: initial sequence number, sent on the first Response (RFC 4340, 7.1)
- * @dreq_gss: greatest sequence number sent (for retransmitted Responses)
- * @dreq_isr: initial sequence number received in the first Request
- * @dreq_gsr: greatest sequence number received (for retransmitted Request(s))
- * @dreq_service: service code present on the Request (there is just one)
- * @dreq_featneg: feature negotiation options for this connection
- * The following two fields are analogous to the ones in dccp_sock:
- * @dreq_timestamp_echo: last received timestamp to echo (13.1)
- * @dreq_timestamp_echo: the time of receiving the last @dreq_timestamp_echo
- */
-struct dccp_request_sock {
-	struct inet_request_sock dreq_inet_rsk;
-	__u64			 dreq_iss;
-	__u64			 dreq_gss;
-	__u64			 dreq_isr;
-	__u64			 dreq_gsr;
-	__be32			 dreq_service;
-	spinlock_t		 dreq_lock;
-	struct list_head	 dreq_featneg;
-	__u32			 dreq_timestamp_echo;
-	__u32			 dreq_timestamp_time;
-};
-
-static inline struct dccp_request_sock *dccp_rsk(const struct request_sock *req)
-{
-	return (struct dccp_request_sock *)req;
-}
-
-extern struct inet_timewait_death_row dccp_death_row;
-
-extern int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq,
-			      struct sk_buff *skb);
-
-struct dccp_options_received {
-	u64	dccpor_ndp:48;
-	u32	dccpor_timestamp;
-	u32	dccpor_timestamp_echo;
-	u32	dccpor_elapsed_time;
-};
-
-struct ccid;
-
-enum dccp_role {
-	DCCP_ROLE_UNDEFINED,
-	DCCP_ROLE_LISTEN,
-	DCCP_ROLE_CLIENT,
-	DCCP_ROLE_SERVER,
-};
-
-struct dccp_service_list {
-	__u32	dccpsl_nr;
-	__be32	dccpsl_list[];
-};
-
-#define DCCP_SERVICE_INVALID_VALUE htonl((__u32)-1)
-#define DCCP_SERVICE_CODE_IS_ABSENT		0
-
-static inline bool dccp_list_has_service(const struct dccp_service_list *sl,
-					const __be32 service)
-{
-	if (likely(sl != NULL)) {
-		u32 i = sl->dccpsl_nr;
-		while (i--)
-			if (sl->dccpsl_list[i] == service)
-				return true;
-	}
-	return false;
-}
-
-struct dccp_ackvec;
-
-/**
- * struct dccp_sock - DCCP socket state
- *
- * @dccps_swl - sequence number window low
- * @dccps_swh - sequence number window high
- * @dccps_awl - acknowledgement number window low
- * @dccps_awh - acknowledgement number window high
- * @dccps_iss - initial sequence number sent
- * @dccps_isr - initial sequence number received
- * @dccps_osr - first OPEN sequence number received
- * @dccps_gss - greatest sequence number sent
- * @dccps_gsr - greatest valid sequence number received
- * @dccps_gar - greatest valid ack number received on a non-Sync; initialized to %dccps_iss
- * @dccps_service - first (passive sock) or unique (active sock) service code
- * @dccps_service_list - second .. last service code on passive socket
- * @dccps_timestamp_echo - latest timestamp received on a TIMESTAMP option
- * @dccps_timestamp_time - time of receiving latest @dccps_timestamp_echo
- * @dccps_l_ack_ratio - feature-local Ack Ratio
- * @dccps_r_ack_ratio - feature-remote Ack Ratio
- * @dccps_l_seq_win - local Sequence Window (influences ack number validity)
- * @dccps_r_seq_win - remote Sequence Window (influences seq number validity)
- * @dccps_pcslen - sender   partial checksum coverage (via sockopt)
- * @dccps_pcrlen - receiver partial checksum coverage (via sockopt)
- * @dccps_send_ndp_count - local Send NDP Count feature (7.7.2)
- * @dccps_ndp_count - number of Non Data Packets since last data packet
- * @dccps_mss_cache - current value of MSS (path MTU minus header sizes)
- * @dccps_rate_last - timestamp for rate-limiting DCCP-Sync (RFC 4340, 7.5.4)
- * @dccps_featneg - tracks feature-negotiation state (mostly during handshake)
- * @dccps_hc_rx_ackvec - rx half connection ack vector
- * @dccps_hc_rx_ccid - CCID used for the receiver (or receiving half-connection)
- * @dccps_hc_tx_ccid - CCID used for the sender (or sending half-connection)
- * @dccps_options_received - parsed set of retrieved options
- * @dccps_qpolicy - TX dequeueing policy, one of %dccp_packet_dequeueing_policy
- * @dccps_tx_qlen - maximum length of the TX queue
- * @dccps_role - role of this sock, one of %dccp_role
- * @dccps_hc_rx_insert_options - receiver wants to add options when acking
- * @dccps_hc_tx_insert_options - sender wants to add options when sending
- * @dccps_server_timewait - server holds timewait state on close (RFC 4340, 8.3)
- * @dccps_sync_scheduled - flag which signals "send out-of-band message soon"
- * @dccps_xmitlet - tasklet scheduled by the TX CCID to dequeue data packets
- * @dccps_xmit_timer - used by the TX CCID to delay sending (rate-based pacing)
- * @dccps_syn_rtt - RTT sample from Request/Response exchange (in usecs)
- */
-struct dccp_sock {
-	/* inet_connection_sock has to be the first member of dccp_sock */
-	struct inet_connection_sock	dccps_inet_connection;
-#define dccps_syn_rtt			dccps_inet_connection.icsk_ack.lrcvtime
-	__u64				dccps_swl;
-	__u64				dccps_swh;
-	__u64				dccps_awl;
-	__u64				dccps_awh;
-	__u64				dccps_iss;
-	__u64				dccps_isr;
-	__u64				dccps_osr;
-	__u64				dccps_gss;
-	__u64				dccps_gsr;
-	__u64				dccps_gar;
-	__be32				dccps_service;
-	__u32				dccps_mss_cache;
-	struct dccp_service_list	*dccps_service_list;
-	__u32				dccps_timestamp_echo;
-	__u32				dccps_timestamp_time;
-	__u16				dccps_l_ack_ratio;
-	__u16				dccps_r_ack_ratio;
-	__u64				dccps_l_seq_win:48;
-	__u64				dccps_r_seq_win:48;
-	__u8				dccps_pcslen:4;
-	__u8				dccps_pcrlen:4;
-	__u8				dccps_send_ndp_count:1;
-	__u64				dccps_ndp_count:48;
-	unsigned long			dccps_rate_last;
-	struct list_head		dccps_featneg;
-	struct dccp_ackvec		*dccps_hc_rx_ackvec;
-	struct ccid			*dccps_hc_rx_ccid;
-	struct ccid			*dccps_hc_tx_ccid;
-	struct dccp_options_received	dccps_options_received;
-	__u8				dccps_qpolicy;
-	__u32				dccps_tx_qlen;
-	enum dccp_role			dccps_role:2;
-	__u8				dccps_hc_rx_insert_options:1;
-	__u8				dccps_hc_tx_insert_options:1;
-	__u8				dccps_server_timewait:1;
-	__u8				dccps_sync_scheduled:1;
-	struct tasklet_struct		dccps_xmitlet;
-	struct timer_list		dccps_xmit_timer;
-};
-
-#define dccp_sk(ptr)	container_of_const(ptr, struct dccp_sock, \
-					   dccps_inet_connection.icsk_inet.sk)
-
-static inline const char *dccp_role(const struct sock *sk)
-{
-	switch (dccp_sk(sk)->dccps_role) {
-	case DCCP_ROLE_UNDEFINED: return "undefined";
-	case DCCP_ROLE_LISTEN:	  return "listen";
-	case DCCP_ROLE_SERVER:	  return "server";
-	case DCCP_ROLE_CLIENT:	  return "client";
-	}
-	return NULL;
-}
-
-extern void dccp_syn_ack_timeout(const struct request_sock *req);
-
 #endif /* _LINUX_DCCP_H */
diff --git a/include/linux/tfrc.h b/include/linux/tfrc.h
deleted file mode 100644
index a5acc768085d..000000000000
--- a/include/linux/tfrc.h
+++ /dev/null
@@ -1,51 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-#ifndef _LINUX_TFRC_H_
-#define _LINUX_TFRC_H_
-/*
- *  TFRC - Data Structures for the TCP-Friendly Rate Control congestion
- *         control mechanism as specified in RFC 3448.
- *
- *  Copyright (c) 2005 The University of Waikato, Hamilton, New Zealand.
- *  Copyright (c) 2005 Ian McDonald <iam4@cs.waikato.ac.nz>
- *  Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br>
- *  Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon
- */
-#include <linux/types.h>
-
-/** 	tfrc_rx_info    -    TFRC Receiver Data Structure
- *
- * 	@tfrcrx_x_recv:	receiver estimate of sending rate (3.2.2)
- * 	@tfrcrx_rtt:	round-trip-time (communicated by sender)
- * 	@tfrcrx_p:	current estimate of loss event rate (3.2.2)
- */
-struct tfrc_rx_info {
-  	__u32 tfrcrx_x_recv;
-	__u32 tfrcrx_rtt;
-  	__u32 tfrcrx_p;
-};
-
-/** 	tfrc_tx_info    -    TFRC Sender Data Structure
- *
- * 	@tfrctx_x:	computed transmit rate (4.3 (4))
- * 	@tfrctx_x_recv: receiver estimate of send rate (4.3)
- * 	@tfrctx_x_calc:	return value of throughput equation (3.1)
- * 	@tfrctx_rtt:	(moving average) estimate of RTT (4.3)
- * 	@tfrctx_p:	current loss event rate (5.4)
- * 	@tfrctx_rto:	estimate of RTO, equals 4*RTT (4.3)
- * 	@tfrctx_ipi:	inter-packet interval (4.6)
- *
- *  Note: X and X_recv are both maintained in units of 64 * bytes/second. This
- *        enables a finer resolution of sending rates and avoids problems with
- *        integer arithmetic; u32 is not sufficient as scaling consumes 6 bits.
- */
-struct tfrc_tx_info {
-	__u64 tfrctx_x;
-	__u64 tfrctx_x_recv;
-	__u32 tfrctx_x_calc;
-	__u32 tfrctx_rtt;
-	__u32 tfrctx_p;
-	__u32 tfrctx_rto;
-	__u32 tfrctx_ipi;
-};
-
-#endif /* _LINUX_TFRC_H_ */
diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index 949641e92539..d172b64a6320 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -177,12 +177,8 @@ struct inet_hashinfo {
 
 static inline struct inet_hashinfo *tcp_or_dccp_get_hashinfo(const struct sock *sk)
 {
-#if IS_ENABLED(CONFIG_IP_DCCP)
-	return sk->sk_prot->h.hashinfo ? :
-		sock_net(sk)->ipv4.tcp_death_row.hashinfo;
-#else
+	/* TODO: rename function */
 	return sock_net(sk)->ipv4.tcp_death_row.hashinfo;
-#endif
 }
 
 static inline struct inet_listen_hashbucket *
diff --git a/include/net/rstreason.h b/include/net/rstreason.h
index 69cb2e52b7da..979ac87b5d99 100644
--- a/include/net/rstreason.h
+++ b/include/net/rstreason.h
@@ -36,7 +36,7 @@
 /**
  * enum sk_rst_reason - the reasons of socket reset
  *
- * The reasons of sk reset, which are used in DCCP/TCP/MPTCP protocols.
+ * The reasons of sk reset, which are used in TCP/MPTCP protocols.
  *
  * There are three parts in order:
  * 1) skb drop reasons: relying on drop reasons for such as passive reset
diff --git a/include/net/secure_seq.h b/include/net/secure_seq.h
index 21e7fa2a1813..cddebafb9f77 100644
--- a/include/net/secure_seq.h
+++ b/include/net/secure_seq.h
@@ -16,9 +16,5 @@ u32 secure_tcpv6_seq(const __be32 *saddr, const __be32 *daddr,
 		     __be16 sport, __be16 dport);
 u32 secure_tcpv6_ts_off(const struct net *net,
 			const __be32 *saddr, const __be32 *daddr);
-u64 secure_dccp_sequence_number(__be32 saddr, __be32 daddr,
-				__be16 sport, __be16 dport);
-u64 secure_dccpv6_sequence_number(__be32 *saddr, __be32 *daddr,
-				  __be16 sport, __be16 dport);
 
 #endif /* _NET_SECURE_SEQ */
diff --git a/include/trace/events/sock.h b/include/trace/events/sock.h
index 3836de435d9d..b5310439536e 100644
--- a/include/trace/events/sock.h
+++ b/include/trace/events/sock.h
@@ -19,7 +19,6 @@
 /* The protocol traced by inet_sock_set_state */
 #define inet_protocol_names		\
 		EM(IPPROTO_TCP)			\
-		EM(IPPROTO_DCCP)		\
 		EM(IPPROTO_SCTP)		\
 		EMe(IPPROTO_MPTCP)
 
diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h
index 5d331383047b..de214f1dea58 100644
--- a/include/trace/events/sunrpc.h
+++ b/include/trace/events/sunrpc.h
@@ -21,7 +21,6 @@ TRACE_DEFINE_ENUM(SOCK_DGRAM);
 TRACE_DEFINE_ENUM(SOCK_RAW);
 TRACE_DEFINE_ENUM(SOCK_RDM);
 TRACE_DEFINE_ENUM(SOCK_SEQPACKET);
-TRACE_DEFINE_ENUM(SOCK_DCCP);
 TRACE_DEFINE_ENUM(SOCK_PACKET);
 
 #define show_socket_type(type)					\
@@ -31,7 +30,6 @@ TRACE_DEFINE_ENUM(SOCK_PACKET);
 		{ SOCK_RAW,		"RAW" },		\
 		{ SOCK_RDM,		"RDM" },		\
 		{ SOCK_SEQPACKET,	"SEQPACKET" },		\
-		{ SOCK_DCCP,		"DCCP" },		\
 		{ SOCK_PACKET,		"PACKET" })
 
 /* This list is known to be incomplete, add new enums as needed. */
-- 
cgit v1.2.3


From 22d6c9eebf2e68e6ab831ded37daaa83daff6bb8 Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@amazon.com>
Date: Wed, 9 Apr 2025 19:36:46 -0700
Subject: net: Unexport shared functions for DCCP.

DCCP was removed, so many inet functions no longer need to
be exported.

Let's unexport or use EXPORT_IPV6_MOD() for such functions.

sk_free_unlock_clone() is inlined in sk_clone_lock() as it's
the only caller.

Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Link: https://patch.msgid.link/20250410023921.11307-4-kuniyu@amazon.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/sock.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include')

diff --git a/include/net/sock.h b/include/net/sock.h
index 694f954258d4..bb4d6189292f 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1781,7 +1781,6 @@ void sk_free(struct sock *sk);
 void sk_net_refcnt_upgrade(struct sock *sk);
 void sk_destruct(struct sock *sk);
 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority);
-void sk_free_unlock_clone(struct sock *sk);
 
 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
 			     gfp_t priority);
-- 
cgit v1.2.3


From 235bd9d21fcdf07dd125daa3e60ab64f8aefb927 Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@amazon.com>
Date: Wed, 9 Apr 2025 19:36:47 -0700
Subject: tcp: Rename tcp_or_dccp_get_hashinfo().

DCCP was removed, so tcp_or_dccp_get_hashinfo() should be renamed.

Let's rename it to tcp_get_hashinfo().

Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Link: https://patch.msgid.link/20250410023921.11307-5-kuniyu@amazon.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/inet_hashtables.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index d172b64a6320..4564b5d348b1 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -175,9 +175,8 @@ struct inet_hashinfo {
 	bool				pernet;
 } ____cacheline_aligned_in_smp;
 
-static inline struct inet_hashinfo *tcp_or_dccp_get_hashinfo(const struct sock *sk)
+static inline struct inet_hashinfo *tcp_get_hashinfo(const struct sock *sk)
 {
-	/* TODO: rename function */
 	return sock_net(sk)->ipv4.tcp_death_row.hashinfo;
 }
 
-- 
cgit v1.2.3


From c26c192c3d486a2a7d83d254bae294c2f8f50abf Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Wed, 9 Apr 2025 22:00:56 +0200
Subject: udp: properly deal with xfrm encap and ADDRFORM

UDP GRO accounting assumes that the GRO receive callback is always
set when the UDP tunnel is enabled, but syzkaller proved otherwise,
leading to the following splat:

WARNING: CPU: 0 PID: 5837 at net/ipv4/udp_offload.c:123 udp_tunnel_update_gro_rcv+0x28d/0x4c0 net/ipv4/udp_offload.c:123
Modules linked in:
CPU: 0 UID: 0 PID: 5837 Comm: syz-executor850 Not tainted 6.14.0-syzkaller-13320-g420aabef3ab5 #0 PREEMPT(full)
Hardware name: Google Compute Engine/Google Compute Engine, BIOS Google 02/12/2025
RIP: 0010:udp_tunnel_update_gro_rcv+0x28d/0x4c0 net/ipv4/udp_offload.c:123
Code: 00 00 e8 c6 5a 2f f7 48 c1 e5 04 48 8d b5 20 53 c7 9a ba 10
      00 00 00 4c 89 ff e8 ce 87 99 f7 e9 ce 00 00 00 e8 a4 5a 2f
      f7 90 <0f> 0b 90 e9 de fd ff ff bf 01 00 00 00 89 ee e8 cf
      5e 2f f7 85 ed
RSP: 0018:ffffc90003effa88 EFLAGS: 00010293
RAX: ffffffff8a93fc9c RBX: 0000000000000000 RCX: ffff8880306f9e00
RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000
RBP: 0000000000000000 R08: ffffffff8a93fabe R09: 1ffffffff20bfb2e
R10: dffffc0000000000 R11: fffffbfff20bfb2f R12: ffff88814ef21738
R13: dffffc0000000000 R14: ffff88814ef21778 R15: 1ffff11029de42ef
FS:  0000000000000000(0000) GS:ffff888124f96000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007f04eec760d0 CR3: 000000000eb38000 CR4: 00000000003526f0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
 <TASK>
 udp_tunnel_cleanup_gro include/net/udp_tunnel.h:205 [inline]
 udpv6_destroy_sock+0x212/0x270 net/ipv6/udp.c:1829
 sk_common_release+0x71/0x2e0 net/core/sock.c:3896
 inet_release+0x17d/0x200 net/ipv4/af_inet.c:435
 __sock_release net/socket.c:647 [inline]
 sock_close+0xbc/0x240 net/socket.c:1391
 __fput+0x3e9/0x9f0 fs/file_table.c:465
 task_work_run+0x251/0x310 kernel/task_work.c:227
 exit_task_work include/linux/task_work.h:40 [inline]
 do_exit+0xa11/0x27f0 kernel/exit.c:953
 do_group_exit+0x207/0x2c0 kernel/exit.c:1102
 __do_sys_exit_group kernel/exit.c:1113 [inline]
 __se_sys_exit_group kernel/exit.c:1111 [inline]
 __x64_sys_exit_group+0x3f/0x40 kernel/exit.c:1111
 x64_sys_call+0x26c3/0x26d0 arch/x86/include/generated/asm/syscalls_64.h:232
 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline]
 do_syscall_64+0xf3/0x230 arch/x86/entry/syscall_64.c:94
 entry_SYSCALL_64_after_hwframe+0x77/0x7f
RIP: 0033:0x7f04eebfac79
Code: Unable to access opcode bytes at 0x7f04eebfac4f.
RSP: 002b:00007fffdcaa34a8 EFLAGS: 00000246 ORIG_RAX: 00000000000000e7
RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f04eebfac79
RDX: 000000000000003c RSI: 00000000000000e7 RDI: 0000000000000000
RBP: 00007f04eec75270 R08: ffffffffffffffb8 R09: 00007fffdcaa36c8
R10: 0000200000000000 R11: 0000000000000246 R12: 00007f04eec75270
R13: 0000000000000000 R14: 00007f04eec75cc0 R15: 00007f04eebcca70

Address the issue by moving the accounting hook into
setup_udp_tunnel_sock() and set_xfrm_gro_udp_encap_rcv(), where
the GRO callback is actually set.

set_xfrm_gro_udp_encap_rcv() is prone to races with IPV6_ADDRFORM;
run the relevant setsockopt under the socket lock to ensure that
consistent values of sk_family and up->encap_type are used.

Refactor the GRO callback selection code, to make it clear that
the function pointer is always initialized.

Reported-by: syzbot+8c469a2260132cd095c1@syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=8c469a2260132cd095c1
Fixes: 172bf009c18d ("xfrm: Support GRO for IPv4 ESP in UDP encapsulation")
Fixes: 5d7f5b2f6b935 ("udp_tunnel: use static call for GRO hooks when possible")
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Link: https://patch.msgid.link/92bcdb6899145a9a387c8fa9e3ca656642a43634.1744228733.git.pabeni@redhat.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/udp_tunnel.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include')

diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h
index 288f06f23a80..2df3b8344eb5 100644
--- a/include/net/udp_tunnel.h
+++ b/include/net/udp_tunnel.h
@@ -215,7 +215,6 @@ static inline void udp_tunnel_encap_enable(struct sock *sk)
 	if (READ_ONCE(sk->sk_family) == PF_INET6)
 		ipv6_stub->udpv6_encap_enable();
 #endif
-	udp_tunnel_update_gro_rcv(sk, true);
 	udp_encap_enable();
 }
 
-- 
cgit v1.2.3


From 097f171f98289cf737437599c40b0d1e81266e9e Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Wed, 9 Apr 2025 18:42:46 -0700
Subject: net: convert dev->rtnl_link_state to a bool

netdevice reg_state was split into two 16 bit enums back in 2010
in commit a2835763e130 ("rtnetlink: handle rtnl_link netlink
notifications manually"). Since the split the fields have been
moved apart, and last year we converted reg_state to a normal
u8 in commit 4d42b37def70 ("net: convert dev->reg_state to u8").

rtnl_link_state being a 16-bit bitfield makes no sense. Convert it
to a single bool; it seems very unlikely after 15 years that
we'll need more values in it.

We could drop dev->rtnl_link_ops from the conditions, but it feels
like having it there more clearly points at the reason for this
hack.

Acked-by: Stanislav Fomichev <sdf@fomichev.me>
Link: https://patch.msgid.link/20250410014246.780885-1-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/netdevice.h | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index d8544f6a680c..e6036b82ef4c 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1946,9 +1946,6 @@ enum netdev_reg_state {
  *
  *	@reg_state:		Register/unregister state machine
  *	@dismantle:		Device is going to be freed
- *	@rtnl_link_state:	This enum represents the phases of creating
- *				a new link
- *
  *	@needs_free_netdev:	Should unregister perform free_netdev?
  *	@priv_destructor:	Called from unregister
  *	@npinfo:		XXX: need comments on this one
@@ -2363,11 +2360,8 @@ struct net_device {
 
 	/** @moving_ns: device is changing netns, protected by @lock */
 	bool moving_ns;
-
-	enum {
-		RTNL_LINK_INITIALIZED,
-		RTNL_LINK_INITIALIZING,
-	} rtnl_link_state:16;
+	/** @rtnl_link_initializing: Device being created, suppress events */
+	bool rtnl_link_initializing;
 
 	bool needs_free_netdev;
 	void (*priv_destructor)(struct net_device *dev);
-- 
cgit v1.2.3


From e846fb5e7c5243c65ff67247cb29a9d76bbcc4e8 Mon Sep 17 00:00:00 2001
From: Joseph Huang <Joseph.Huang@garmin.com>
Date: Fri, 11 Apr 2025 11:03:16 -0400
Subject: net: bridge: mcast: Add offload failed mdb flag

Add MDB_FLAGS_OFFLOAD_FAILED and MDB_PG_FLAGS_OFFLOAD_FAILED to indicate
that an attempt to offload the MDB entry to switchdev has failed.

Signed-off-by: Joseph Huang <Joseph.Huang@garmin.com>
Acked-by: Nikolay Aleksandrov <razor@blackwall.org>
Link: https://patch.msgid.link/20250411150323.1117797-2-Joseph.Huang@garmin.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/uapi/linux/if_bridge.h | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h
index a5b743a2f775..f2a6de424f3f 100644
--- a/include/uapi/linux/if_bridge.h
+++ b/include/uapi/linux/if_bridge.h
@@ -699,10 +699,11 @@ struct br_mdb_entry {
 #define MDB_TEMPORARY 0
 #define MDB_PERMANENT 1
 	__u8 state;
-#define MDB_FLAGS_OFFLOAD	(1 << 0)
-#define MDB_FLAGS_FAST_LEAVE	(1 << 1)
-#define MDB_FLAGS_STAR_EXCL	(1 << 2)
-#define MDB_FLAGS_BLOCKED	(1 << 3)
+#define MDB_FLAGS_OFFLOAD		(1 << 0)
+#define MDB_FLAGS_FAST_LEAVE		(1 << 1)
+#define MDB_FLAGS_STAR_EXCL		(1 << 2)
+#define MDB_FLAGS_BLOCKED		(1 << 3)
+#define MDB_FLAGS_OFFLOAD_FAILED	(1 << 4)
 	__u8 flags;
 	__u16 vid;
 	struct {
-- 
cgit v1.2.3


From 9fbe1e3e61c21508861a72324087aeeea85f796f Mon Sep 17 00:00:00 2001
From: Joseph Huang <Joseph.Huang@garmin.com>
Date: Fri, 11 Apr 2025 11:03:17 -0400
Subject: net: bridge: Add offload_fail_notification bopt

Add BR_BOOLOPT_MDB_OFFLOAD_FAIL_NOTIFICATION bool option.

Signed-off-by: Joseph Huang <Joseph.Huang@garmin.com>
Acked-by: Nikolay Aleksandrov <razor@blackwall.org>
Link: https://patch.msgid.link/20250411150323.1117797-3-Joseph.Huang@garmin.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/uapi/linux/if_bridge.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h
index f2a6de424f3f..73876c0e2bba 100644
--- a/include/uapi/linux/if_bridge.h
+++ b/include/uapi/linux/if_bridge.h
@@ -831,6 +831,7 @@ enum br_boolopt_id {
 	BR_BOOLOPT_NO_LL_LEARN,
 	BR_BOOLOPT_MCAST_VLAN_SNOOPING,
 	BR_BOOLOPT_MST_ENABLE,
+	BR_BOOLOPT_MDB_OFFLOAD_FAIL_NOTIFICATION,
 	BR_BOOLOPT_MAX
 };
 
-- 
cgit v1.2.3


From cd3c93167da0e760b5819246eae7a4ea30fd014b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= <toke@redhat.com>
Date: Wed, 9 Apr 2025 12:41:36 +0200
Subject: page_pool: Move pp_magic check into helper functions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Since we are about to stash some more information into the pp_magic
field, let's move the magic signature checks into a pair of helper
functions so it can be changed in one place.

Reviewed-by: Mina Almasry <almasrymina@google.com>
Tested-by: Yonglong Liu <liuyonglong@huawei.com>
Acked-by: Jesper Dangaard Brouer <hawk@kernel.org>
Reviewed-by: Ilias Apalodimas <ilias.apalodimas@linaro.org>
Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com>
Link: https://patch.msgid.link/20250409-page-pool-track-dma-v9-1-6a9ef2e0cba8@redhat.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/mm.h | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

(limited to 'include')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index b7f13f087954..56c47f4a38ca 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -4248,4 +4248,24 @@ int arch_lock_shadow_stack_status(struct task_struct *t, unsigned long status);
 #define VM_SEALED_SYSMAP	VM_NONE
 #endif
 
+/* Mask used for checking in page_pool_page_is_pp() below. page->pp_magic is
+ * OR'ed with PP_SIGNATURE after the allocation in order to preserve bit 0 for
+ * the head page of compound page and bit 1 for pfmemalloc page.
+ * page_is_pfmemalloc() is checked in __page_pool_put_page() to avoid recycling
+ * the pfmemalloc page.
+ */
+#define PP_MAGIC_MASK ~0x3UL
+
+#ifdef CONFIG_PAGE_POOL
+static inline bool page_pool_page_is_pp(struct page *page)
+{
+	return (page->pp_magic & PP_MAGIC_MASK) == PP_SIGNATURE;
+}
+#else
+static inline bool page_pool_page_is_pp(struct page *page)
+{
+	return false;
+}
+#endif
+
 #endif /* _LINUX_MM_H */
-- 
cgit v1.2.3


From ee62ce7a1d909ccba0399680a03c2dee83bcae95 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= <toke@redhat.com>
Date: Wed, 9 Apr 2025 12:41:37 +0200
Subject: page_pool: Track DMA-mapped pages and unmap them when destroying the
 pool
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When enabling DMA mapping in page_pool, pages are kept DMA mapped until
they are released from the pool, to avoid the overhead of re-mapping the
pages every time they are used. This causes resource leaks and/or
crashes when there are pages still outstanding while the device is torn
down, because page_pool will attempt an unmap through a non-existent DMA
device on the subsequent page return.

To fix this, implement a simple tracking of outstanding DMA-mapped pages
in page pool using an xarray. This was first suggested by Mina[0], and
turns out to be fairly straightforward: We simply store pointers to
pages directly in the xarray with xa_alloc() when they are first DMA
mapped, and remove them from the array on unmap. Then, when a page pool
is torn down, it can simply walk the xarray and unmap all pages still
present there before returning, which also allows us to get rid of the
get/put_device() calls in page_pool. Using xa_cmpxchg(), no additional
synchronisation is needed, as a page will only ever be unmapped once.

To avoid having to walk the entire xarray on unmap to find the page
reference, we stash the ID assigned by xa_alloc() into the page
structure itself, using the upper bits of the pp_magic field. This
requires a couple of defines to avoid conflicting with the
POINTER_POISON_DELTA define, but this is all evaluated at compile-time,
so does not affect run-time performance. The bitmap calculations in this
patch give the following number of bits for different architectures:

- 23 bits on 32-bit architectures
- 21 bits on PPC64 (because of the definition of ILLEGAL_POINTER_VALUE)
- 32 bits on other 64-bit architectures

Stashing a value into the unused bits of pp_magic does have the effect
that it can make the value stored there lie outside the unmappable
range (as governed by the mmap_min_addr sysctl), for architectures that
don't define ILLEGAL_POINTER_VALUE. This means that if one of the
pointers that is aliased to the pp_magic field (such as page->lru.next)
is dereferenced while the page is owned by page_pool, that could lead to
a dereference into userspace, which is a security concern. The risk of
this is mitigated by the fact that (a) we always clear pp_magic before
releasing a page from page_pool, and (b) this would need a
use-after-free bug for struct page, which can have many other risks
since page->lru.next is used as a generic list pointer in multiple
places in the kernel. As such, with this patch we take the position that
this risk is negligible in practice. For more discussion, see[1].

Since all the tracking added in this patch is performed on DMA
map/unmap, no additional code is needed in the fast path, meaning the
performance overhead of this tracking is negligible there. A
micro-benchmark shows that the total overhead of the tracking itself is
about 400 ns (39 cycles(tsc) 395.218 ns; sum for both map and unmap[2]).
Since this cost is only paid on DMA map and unmap, it seems like an
acceptable cost to fix the late unmap issue. Further optimisation can
narrow the cases where this cost is paid (for instance by eliding the
tracking when DMA map/unmap is a no-op).

The extra memory needed to track the pages is neatly encapsulated inside
xarray, which uses the 'struct xa_node' structure to track items. This
structure is 576 bytes long, with slots for 64 items, meaning that a
full node incurs only 9 bytes of overhead per slot it tracks (in
practice, it probably won't be this efficient, but in any case it should
be an acceptable overhead).

[0] https://lore.kernel.org/all/CAHS8izPg7B5DwKfSuzz-iOop_YRbk3Sd6Y4rX7KBG9DcVJcyWg@mail.gmail.com/
[1] https://lore.kernel.org/r/20250320023202.GA25514@openwall.com
[2] https://lore.kernel.org/r/ae07144c-9295-4c9d-a400-153bb689fe9e@huawei.com

Reported-by: Yonglong Liu <liuyonglong@huawei.com>
Closes: https://lore.kernel.org/r/8743264a-9700-4227-a556-5f931c720211@huawei.com
Fixes: ff7d6b27f894 ("page_pool: refurbish version of page_pool code")
Suggested-by: Mina Almasry <almasrymina@google.com>
Reviewed-by: Mina Almasry <almasrymina@google.com>
Reviewed-by: Jesper Dangaard Brouer <hawk@kernel.org>
Tested-by: Jesper Dangaard Brouer <hawk@kernel.org>
Tested-by: Qiuling Ren <qren@redhat.com>
Tested-by: Yuying Ma <yuma@redhat.com>
Tested-by: Yonglong Liu <liuyonglong@huawei.com>
Acked-by: Jesper Dangaard Brouer <hawk@kernel.org>
Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com>
Link: https://patch.msgid.link/20250409-page-pool-track-dma-v9-2-6a9ef2e0cba8@redhat.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/mm.h            | 46 +++++++++++++++++++++++++++++++++++++++----
 include/linux/poison.h        |  4 ++++
 include/net/page_pool/types.h |  6 ++++++
 3 files changed, 52 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 56c47f4a38ca..130d3c9d2ee4 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -4248,13 +4248,51 @@ int arch_lock_shadow_stack_status(struct task_struct *t, unsigned long status);
 #define VM_SEALED_SYSMAP	VM_NONE
 #endif
 
+/*
+ * DMA mapping IDs for page_pool
+ *
+ * When DMA-mapping a page, page_pool allocates an ID (from an xarray) and
+ * stashes it in the upper bits of page->pp_magic. We always want to be able to
+ * unambiguously identify page pool pages (using page_pool_page_is_pp()). Non-PP
+ * pages can have arbitrary kernel pointers stored in the same field as pp_magic
+ * (since it overlaps with page->lru.next), so we must ensure that we cannot
+ * mistake a valid kernel pointer with any of the values we write into this
+ * field.
+ *
+ * On architectures that set POISON_POINTER_DELTA, this is already ensured,
+ * since this value becomes part of PP_SIGNATURE; meaning we can just use the
+ * space between the PP_SIGNATURE value (without POISON_POINTER_DELTA), and the
+ * lowest bits of POISON_POINTER_DELTA. On arches where POISON_POINTER_DELTA is
+ * 0, we make sure that we leave the two topmost bits empty, as that guarantees
+ * we won't mistake a valid kernel pointer for a value we set, regardless of the
+ * VMSPLIT setting.
+ *
+ * Altogether, this means that the number of bits available is constrained by
+ * the size of an unsigned long (at the upper end, subtracting two bits per the
+ * above), and the definition of PP_SIGNATURE (with or without
+ * POISON_POINTER_DELTA).
+ */
+#define PP_DMA_INDEX_SHIFT (1 + __fls(PP_SIGNATURE - POISON_POINTER_DELTA))
+#if POISON_POINTER_DELTA > 0
+/* PP_SIGNATURE includes POISON_POINTER_DELTA, so limit the size of the DMA
+ * index to not overlap with that if set
+ */
+#define PP_DMA_INDEX_BITS MIN(32, __ffs(POISON_POINTER_DELTA) - PP_DMA_INDEX_SHIFT)
+#else
+/* Always leave out the topmost two; see above. */
+#define PP_DMA_INDEX_BITS MIN(32, BITS_PER_LONG - PP_DMA_INDEX_SHIFT - 2)
+#endif
+
+#define PP_DMA_INDEX_MASK GENMASK(PP_DMA_INDEX_BITS + PP_DMA_INDEX_SHIFT - 1, \
+				  PP_DMA_INDEX_SHIFT)
+
 /* Mask used for checking in page_pool_page_is_pp() below. page->pp_magic is
  * OR'ed with PP_SIGNATURE after the allocation in order to preserve bit 0 for
- * the head page of compound page and bit 1 for pfmemalloc page.
- * page_is_pfmemalloc() is checked in __page_pool_put_page() to avoid recycling
- * the pfmemalloc page.
+ * the head page of compound page and bit 1 for pfmemalloc page, as well as the
+ * bits used for the DMA index. page_is_pfmemalloc() is checked in
+ * __page_pool_put_page() to avoid recycling the pfmemalloc page.
  */
-#define PP_MAGIC_MASK ~0x3UL
+#define PP_MAGIC_MASK ~(PP_DMA_INDEX_MASK | 0x3UL)
 
 #ifdef CONFIG_PAGE_POOL
 static inline bool page_pool_page_is_pp(struct page *page)
diff --git a/include/linux/poison.h b/include/linux/poison.h
index 331a9a996fa8..8ca2235f78d5 100644
--- a/include/linux/poison.h
+++ b/include/linux/poison.h
@@ -70,6 +70,10 @@
 #define KEY_DESTROY		0xbd
 
 /********** net/core/page_pool.c **********/
+/*
+ * page_pool uses additional free bits within this value to store data, see the
+ * definition of PP_DMA_INDEX_MASK in mm.h
+ */
 #define PP_SIGNATURE		(0x40 + POISON_POINTER_DELTA)
 
 /********** net/core/skbuff.c **********/
diff --git a/include/net/page_pool/types.h b/include/net/page_pool/types.h
index 36eb57d73abc..431b593de709 100644
--- a/include/net/page_pool/types.h
+++ b/include/net/page_pool/types.h
@@ -6,6 +6,7 @@
 #include <linux/dma-direction.h>
 #include <linux/ptr_ring.h>
 #include <linux/types.h>
+#include <linux/xarray.h>
 #include <net/netmem.h>
 
 #define PP_FLAG_DMA_MAP		BIT(0) /* Should page_pool do the DMA
@@ -33,6 +34,9 @@
 #define PP_FLAG_ALL		(PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV | \
 				 PP_FLAG_SYSTEM_POOL | PP_FLAG_ALLOW_UNREADABLE_NETMEM)
 
+/* Index limit to stay within PP_DMA_INDEX_BITS for DMA indices */
+#define PP_DMA_INDEX_LIMIT XA_LIMIT(1, BIT(PP_DMA_INDEX_BITS) - 1)
+
 /*
  * Fast allocation side cache array/stack
  *
@@ -221,6 +225,8 @@ struct page_pool {
 	void *mp_priv;
 	const struct memory_provider_ops *mp_ops;
 
+	struct xarray dma_mapped;
+
 #ifdef CONFIG_PAGE_POOL_STATS
 	/* recycle stats are per-cpu to avoid locking */
 	struct page_pool_recycle_stats __percpu *recycle_stats;
-- 
cgit v1.2.3


From ceaceaf79ea0fe337344fc5c1fb10a421a362410 Mon Sep 17 00:00:00 2001
From: "Russell King (Oracle)" <rmk+kernel@armlinux.org.uk>
Date: Fri, 11 Apr 2025 23:04:19 +0100
Subject: net: ethtool: fix get_ts_stats() documentation

Commit 0e9c127729be ("ethtool: add interface to read Tx hardware
timestamping statistics") added documentation for timestamping
statistics, but added the detailed explanation for this method to
the get_ts_info() rather than get_ts_stats(). Move it to the correct
entry.

Cc: Rahul Rameshbabu <rrameshbabu@nvidia.com>
Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://patch.msgid.link/E1u3MTz-000Crx-IW@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/ethtool.h | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index 8210ece94fa6..013d25858642 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -926,10 +926,11 @@ struct kernel_ethtool_ts_info {
  * @get_ts_info: Get the time stamping and PTP hardware clock capabilities.
  *	It may be called with RCU, or rtnl or reference on the device.
  *	Drivers supporting transmit time stamps in software should set this to
- *	ethtool_op_get_ts_info(). Drivers must not zero statistics which they
- *	don't report. The stats	structure is initialized to ETHTOOL_STAT_NOT_SET
- *	indicating driver does not report statistics.
- * @get_ts_stats: Query the device hardware timestamping statistics.
+ *	ethtool_op_get_ts_info().
+ * @get_ts_stats: Query the device hardware timestamping statistics. Drivers
+ *	must not zero statistics which they don't report. The stats structure
+ *	is initialized to ETHTOOL_STAT_NOT_SET indicating driver does not
+ *	report statistics.
  * @get_module_info: Get the size and type of the eeprom contained within
  *	a plug-in module.
  * @get_module_eeprom: Get the eeprom information from the plug-in module
-- 
cgit v1.2.3


From 7a60d91c690bf73c2c78e763efa29f294e217c3a Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@amazon.com>
Date: Fri, 11 Apr 2025 13:52:32 -0700
Subject: net: Add ->exit_rtnl() hook to struct pernet_operations.

struct pernet_operations provides two batching hooks; ->exit_batch()
and ->exit_batch_rtnl().

The batching variant is beneficial if ->exit() meets any of the
following conditions:

  1) ->exit() repeatedly acquires a global lock for each netns

  2) ->exit() has a time-consuming operation that can be factored
     out (e.g. synchronize_rcu(), smp_mb(), etc)

  3) ->exit() does not need to repeat the same iterations for each
     netns (e.g. inet_twsk_purge())

Currently, none of the ->exit_batch_rtnl() functions satisfy any of
the above conditions because RTNL is factored out and held by the
caller and all of these functions iterate over the dying netns list.

Also, we want to hold per-netns RTNL there but avoid spreading
__rtnl_net_lock() across multiple locations.

Let's add ->exit_rtnl() hook and run it under __rtnl_net_lock().

The following patches will convert all ->exit_batch_rtnl() users
to ->exit_rtnl().

Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Link: https://patch.msgid.link/20250411205258.63164-4-kuniyu@amazon.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/net_namespace.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index bd57d8fb54f1..b071e6eed9d5 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -475,6 +475,8 @@ struct pernet_operations {
 	void (*exit)(struct net *net);
 	void (*exit_batch)(struct list_head *net_exit_list);
 	/* Following method is called with RTNL held. */
+	void (*exit_rtnl)(struct net *net,
+			  struct list_head *dev_kill_list);
 	void (*exit_batch_rtnl)(struct list_head *net_exit_list,
 				struct list_head *dev_kill_list);
 	unsigned int * const id;
-- 
cgit v1.2.3


From a967e01e2ad201f6ddc778ed65a5dae1c68ee8a5 Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@amazon.com>
Date: Fri, 11 Apr 2025 13:52:35 -0700
Subject: ipv4: ip_tunnel: Convert ip_tunnel_delete_nets() callers to
 ->exit_rtnl().

ip_tunnel_delete_nets() iterates the dying netns list and performs the
same operations for each.

Let's export ip_tunnel_destroy() as ip_tunnel_delete_net() and call it
from ->exit_rtnl().

Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Link: https://patch.msgid.link/20250411205258.63164-7-kuniyu@amazon.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/ip_tunnels.h | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
index a36a335cef9f..0c3d571a04a1 100644
--- a/include/net/ip_tunnels.h
+++ b/include/net/ip_tunnels.h
@@ -377,10 +377,9 @@ struct net *ip_tunnel_get_link_net(const struct net_device *dev);
 int ip_tunnel_get_iflink(const struct net_device *dev);
 int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id,
 		       struct rtnl_link_ops *ops, char *devname);
-
-void ip_tunnel_delete_nets(struct list_head *list_net, unsigned int id,
-			   struct rtnl_link_ops *ops,
-			   struct list_head *dev_to_kill);
+void ip_tunnel_delete_net(struct net *net, unsigned int id,
+			  struct rtnl_link_ops *ops,
+			  struct list_head *dev_to_kill);
 
 void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
 		    const struct iphdr *tnl_params, const u8 protocol);
-- 
cgit v1.2.3


From c57a9c503543cd8829eeaaf88362199e0491c0d7 Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@amazon.com>
Date: Fri, 11 Apr 2025 13:52:43 -0700
Subject: net: Remove ->exit_batch_rtnl().

There are no ->exit_batch_rtnl() users remaining.

Let's remove the hook.

Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Link: https://patch.msgid.link/20250411205258.63164-15-kuniyu@amazon.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/net_namespace.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include')

diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index b071e6eed9d5..025a7574b275 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -477,8 +477,6 @@ struct pernet_operations {
 	/* Following method is called with RTNL held. */
 	void (*exit_rtnl)(struct net *net,
 			  struct list_head *dev_kill_list);
-	void (*exit_batch_rtnl)(struct list_head *net_exit_list,
-				struct list_head *dev_kill_list);
 	unsigned int * const id;
 	const size_t size;
 };
-- 
cgit v1.2.3


From 651f88cb046c5e002f7c11de2cebf207787d2346 Mon Sep 17 00:00:00 2001
From: "Russell King (Oracle)" <rmk+kernel@armlinux.org.uk>
Date: Sat, 12 Apr 2025 09:08:45 +0100
Subject: net: stmmac: remove eee_usecs_rate

plat_dat->eee_usecs_rate is now unused, so remove this member.

Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://patch.msgid.link/E1u3Vuv-000E7y-9k@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/stmmac.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h
index c4ec8bb8144e..8aed09d65b4a 100644
--- a/include/linux/stmmac.h
+++ b/include/linux/stmmac.h
@@ -276,7 +276,6 @@ struct plat_stmmacenet_data {
 	int mac_port_sel_speed;
 	int has_xgmac;
 	u8 vlan_fail_q;
-	unsigned long eee_usecs_rate;
 	struct pci_dev *pdev;
 	int int_snapshot_num;
 	int msi_mac_vec;
-- 
cgit v1.2.3


From 23738cc8048322cf324f330cd697380fb3455da5 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 11 Apr 2025 10:52:47 +0100
Subject: rxrpc: Pull out certain app callback funcs into an ops table

A number of functions separately furnish an AF_RXRPC socket with callback
function pointers into a kernel app (such as the AFS filesystem) that is
using it.  Replace most of these with an ops table for the entire socket.
This makes it easier to add more callback functions.

Note that the call incoming data processing callback is retained as that
gets set to different things, depending on the type of op.

Signed-off-by: David Howells <dhowells@redhat.com>
cc: Marc Dionne <marc.dionne@auristor.com>
cc: Simon Horman <horms@kernel.org>
cc: linux-afs@lists.infradead.org
Link: https://patch.msgid.link/20250411095303.2316168-3-dhowells@redhat.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/af_rxrpc.h | 25 +++++++++++++++----------
 1 file changed, 15 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/include/net/af_rxrpc.h b/include/net/af_rxrpc.h
index cf793d18e5df..ebb6092c488b 100644
--- a/include/net/af_rxrpc.h
+++ b/include/net/af_rxrpc.h
@@ -29,18 +29,23 @@ enum rxrpc_interruptibility {
  */
 extern atomic_t rxrpc_debug_id;
 
+/*
+ * Operations table for rxrpc to call out to a kernel application (e.g. kAFS).
+ */
+struct rxrpc_kernel_ops {
+	void (*notify_new_call)(struct sock *sk, struct rxrpc_call *call,
+				unsigned long user_call_ID);
+	void (*discard_new_call)(struct rxrpc_call *call, unsigned long user_call_ID);
+	void (*user_attach_call)(struct rxrpc_call *call, unsigned long user_call_ID);
+};
+
 typedef void (*rxrpc_notify_rx_t)(struct sock *, struct rxrpc_call *,
 				  unsigned long);
 typedef void (*rxrpc_notify_end_tx_t)(struct sock *, struct rxrpc_call *,
 				      unsigned long);
-typedef void (*rxrpc_notify_new_call_t)(struct sock *, struct rxrpc_call *,
-					unsigned long);
-typedef void (*rxrpc_discard_new_call_t)(struct rxrpc_call *, unsigned long);
-typedef void (*rxrpc_user_attach_call_t)(struct rxrpc_call *, unsigned long);
 
-void rxrpc_kernel_new_call_notification(struct socket *,
-					rxrpc_notify_new_call_t,
-					rxrpc_discard_new_call_t);
+void rxrpc_kernel_set_notifications(struct socket *sock,
+				    const struct rxrpc_kernel_ops *app_ops);
 struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock,
 					   struct rxrpc_peer *peer,
 					   struct key *key,
@@ -72,9 +77,9 @@ const struct sockaddr *rxrpc_kernel_remote_addr(const struct rxrpc_peer *peer);
 unsigned long rxrpc_kernel_set_peer_data(struct rxrpc_peer *peer, unsigned long app_data);
 unsigned long rxrpc_kernel_get_peer_data(const struct rxrpc_peer *peer);
 unsigned int rxrpc_kernel_get_srtt(const struct rxrpc_peer *);
-int rxrpc_kernel_charge_accept(struct socket *, rxrpc_notify_rx_t,
-			       rxrpc_user_attach_call_t, unsigned long, gfp_t,
-			       unsigned int);
+int rxrpc_kernel_charge_accept(struct socket *sock, rxrpc_notify_rx_t notify_rx,
+			       unsigned long user_call_ID, gfp_t gfp,
+			       unsigned int debug_id);
 void rxrpc_kernel_set_tx_length(struct socket *, struct rxrpc_call *, s64);
 bool rxrpc_kernel_check_life(const struct socket *, const struct rxrpc_call *);
 u32 rxrpc_kernel_get_epoch(struct socket *, struct rxrpc_call *);
-- 
cgit v1.2.3


From 5800b1cf3fd8ccab752a101865be1e76dac33142 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 11 Apr 2025 10:52:49 +0100
Subject: rxrpc: Allow CHALLENGEs to be passed to the app for a RESPONSE

Allow the app to request that CHALLENGEs be passed to it through an
out-of-band queue that allows recvmsg() to pick it up so that the app can
add data to it with sendmsg().

This will allow the application (AFS or userspace) to interact with the
process if it wants to and put values into user-defined fields.  This will
be used by AFS when talking to a fileserver to supply that fileserver with
a crypto key by which callback RPCs can be encrypted (ie. notifications
from the fileserver to the client).

Signed-off-by: David Howells <dhowells@redhat.com>
cc: Marc Dionne <marc.dionne@auristor.com>
cc: Simon Horman <horms@kernel.org>
cc: linux-afs@lists.infradead.org
Link: https://patch.msgid.link/20250411095303.2316168-5-dhowells@redhat.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/af_rxrpc.h       | 24 +++++++++++++++++++++++
 include/trace/events/rxrpc.h | 18 ++++++++++++++++-
 include/uapi/linux/rxrpc.h   | 46 ++++++++++++++++++++++++++++++++------------
 3 files changed, 75 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/include/net/af_rxrpc.h b/include/net/af_rxrpc.h
index ebb6092c488b..0b209f703ffc 100644
--- a/include/net/af_rxrpc.h
+++ b/include/net/af_rxrpc.h
@@ -16,6 +16,7 @@ struct sock;
 struct socket;
 struct rxrpc_call;
 struct rxrpc_peer;
+struct krb5_buffer;
 enum rxrpc_abort_reason;
 
 enum rxrpc_interruptibility {
@@ -24,6 +25,10 @@ enum rxrpc_interruptibility {
 	RXRPC_UNINTERRUPTIBLE,	/* Call should not be interruptible at all */
 };
 
+enum rxrpc_oob_type {
+	RXRPC_OOB_CHALLENGE,	/* Security challenge for a connection */
+};
+
 /*
  * Debug ID counter for tracing.
  */
@@ -37,6 +42,7 @@ struct rxrpc_kernel_ops {
 				unsigned long user_call_ID);
 	void (*discard_new_call)(struct rxrpc_call *call, unsigned long user_call_ID);
 	void (*user_attach_call)(struct rxrpc_call *call, unsigned long user_call_ID);
+	void (*notify_oob)(struct sock *sk, struct sk_buff *oob);
 };
 
 typedef void (*rxrpc_notify_rx_t)(struct sock *, struct rxrpc_call *,
@@ -88,5 +94,23 @@ void rxrpc_kernel_set_max_life(struct socket *, struct rxrpc_call *,
 
 int rxrpc_sock_set_min_security_level(struct sock *sk, unsigned int val);
 int rxrpc_sock_set_security_keyring(struct sock *, struct key *);
+int rxrpc_sock_set_manage_response(struct sock *sk, bool set);
+
+enum rxrpc_oob_type rxrpc_kernel_query_oob(struct sk_buff *oob,
+					   struct rxrpc_peer **_peer,
+					   unsigned long *_peer_appdata);
+struct sk_buff *rxrpc_kernel_dequeue_oob(struct socket *sock,
+					 enum rxrpc_oob_type *_type);
+void rxrpc_kernel_free_oob(struct sk_buff *oob);
+void rxrpc_kernel_query_challenge(struct sk_buff *challenge,
+				  struct rxrpc_peer **_peer,
+				  unsigned long *_peer_appdata,
+				  u16 *_service_id, u8 *_security_index);
+int rxrpc_kernel_reject_challenge(struct sk_buff *challenge, u32 abort_code,
+				  int error, enum rxrpc_abort_reason why);
+int rxkad_kernel_respond_to_challenge(struct sk_buff *challenge);
+u32 rxgk_kernel_query_challenge(struct sk_buff *challenge);
+int rxgk_kernel_respond_to_challenge(struct sk_buff *challenge,
+				     struct krb5_buffer *appdata);
 
 #endif /* _NET_RXRPC_H */
diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h
index cad50d91077e..08ecebd90595 100644
--- a/include/trace/events/rxrpc.h
+++ b/include/trace/events/rxrpc.h
@@ -25,6 +25,7 @@
 	EM(afs_abort_probeuuid_negative,	"afs-probeuuid-neg")	\
 	EM(afs_abort_send_data_error,		"afs-send-data")	\
 	EM(afs_abort_unmarshal_error,		"afs-unmarshal")	\
+	EM(afs_abort_unsupported_sec_class,	"afs-unsup-sec-class")	\
 	/* rxperf errors */						\
 	EM(rxperf_abort_general_error,		"rxperf-error")		\
 	EM(rxperf_abort_oom,			"rxperf-oom")		\
@@ -77,6 +78,7 @@
 	EM(rxrpc_abort_call_timeout,		"call-timeout")		\
 	EM(rxrpc_abort_no_service_key,		"no-serv-key")		\
 	EM(rxrpc_abort_nomem,			"nomem")		\
+	EM(rxrpc_abort_response_sendmsg,	"resp-sendmsg")		\
 	EM(rxrpc_abort_service_not_offered,	"serv-not-offered")	\
 	EM(rxrpc_abort_shut_down,		"shut-down")		\
 	EM(rxrpc_abort_unsupported_security,	"unsup-sec")		\
@@ -133,24 +135,33 @@
 	EM(rxrpc_skb_get_conn_secured,		"GET conn-secd") \
 	EM(rxrpc_skb_get_conn_work,		"GET conn-work") \
 	EM(rxrpc_skb_get_local_work,		"GET locl-work") \
+	EM(rxrpc_skb_get_post_oob,		"GET post-oob ") \
 	EM(rxrpc_skb_get_reject_work,		"GET rej-work ") \
 	EM(rxrpc_skb_get_to_recvmsg,		"GET to-recv  ") \
 	EM(rxrpc_skb_get_to_recvmsg_oos,	"GET to-recv-o") \
 	EM(rxrpc_skb_new_encap_rcv,		"NEW encap-rcv") \
 	EM(rxrpc_skb_new_error_report,		"NEW error-rpt") \
 	EM(rxrpc_skb_new_jumbo_subpacket,	"NEW jumbo-sub") \
+	EM(rxrpc_skb_new_response_rxgk,		"NEW resp-rxgk") \
+	EM(rxrpc_skb_new_response_rxkad,	"NEW resp-rxkd") \
 	EM(rxrpc_skb_new_unshared,		"NEW unshared ") \
 	EM(rxrpc_skb_put_call_rx,		"PUT call-rx  ") \
+	EM(rxrpc_skb_put_challenge,		"PUT challenge") \
 	EM(rxrpc_skb_put_conn_secured,		"PUT conn-secd") \
 	EM(rxrpc_skb_put_conn_work,		"PUT conn-work") \
 	EM(rxrpc_skb_put_error_report,		"PUT error-rep") \
 	EM(rxrpc_skb_put_input,			"PUT input    ") \
 	EM(rxrpc_skb_put_jumbo_subpacket,	"PUT jumbo-sub") \
+	EM(rxrpc_skb_put_oob,			"PUT oob      ") \
 	EM(rxrpc_skb_put_purge,			"PUT purge    ") \
+	EM(rxrpc_skb_put_purge_oob,		"PUT purge-oob") \
+	EM(rxrpc_skb_put_response,		"PUT response ") \
 	EM(rxrpc_skb_put_rotate,		"PUT rotate   ") \
 	EM(rxrpc_skb_put_unknown,		"PUT unknown  ") \
 	EM(rxrpc_skb_see_conn_work,		"SEE conn-work") \
+	EM(rxrpc_skb_see_oob_challenge,		"SEE oob-chall") \
 	EM(rxrpc_skb_see_recvmsg,		"SEE recvmsg  ") \
+	EM(rxrpc_skb_see_recvmsg_oob,		"SEE recvm-oob") \
 	EM(rxrpc_skb_see_reject,		"SEE reject   ") \
 	EM(rxrpc_skb_see_rotate,		"SEE rotate   ") \
 	E_(rxrpc_skb_see_version,		"SEE version  ")
@@ -216,9 +227,11 @@
 	EM(rxrpc_conn_free,			"FREE        ") \
 	EM(rxrpc_conn_get_activate_call,	"GET act-call") \
 	EM(rxrpc_conn_get_call_input,		"GET inp-call") \
+	EM(rxrpc_conn_get_challenge_input,	"GET inp-chal") \
 	EM(rxrpc_conn_get_conn_input,		"GET inp-conn") \
 	EM(rxrpc_conn_get_idle,			"GET idle    ") \
 	EM(rxrpc_conn_get_poke_abort,		"GET pk-abort") \
+	EM(rxrpc_conn_get_poke_response,	"GET response") \
 	EM(rxrpc_conn_get_poke_secured,		"GET secured ") \
 	EM(rxrpc_conn_get_poke_timer,		"GET poke    ") \
 	EM(rxrpc_conn_get_service_conn,		"GET svc-conn") \
@@ -226,10 +239,12 @@
 	EM(rxrpc_conn_new_service,		"NEW service ") \
 	EM(rxrpc_conn_put_call,			"PUT call    ") \
 	EM(rxrpc_conn_put_call_input,		"PUT inp-call") \
+	EM(rxrpc_conn_put_challenge_input,	"PUT inp-chal") \
 	EM(rxrpc_conn_put_conn_input,		"PUT inp-conn") \
 	EM(rxrpc_conn_put_discard_idle,		"PUT disc-idl") \
 	EM(rxrpc_conn_put_local_dead,		"PUT loc-dead") \
 	EM(rxrpc_conn_put_noreuse,		"PUT noreuse ") \
+	EM(rxrpc_conn_put_oob,			"PUT oob     ") \
 	EM(rxrpc_conn_put_poke,			"PUT poke    ") \
 	EM(rxrpc_conn_put_service_reaped,	"PUT svc-reap") \
 	EM(rxrpc_conn_put_unbundle,		"PUT unbundle") \
@@ -331,6 +346,7 @@
 	EM(rxrpc_recvmsg_full,			"FULL") \
 	EM(rxrpc_recvmsg_hole,			"HOLE") \
 	EM(rxrpc_recvmsg_next,			"NEXT") \
+	EM(rxrpc_recvmsg_oobq,			"OOBQ") \
 	EM(rxrpc_recvmsg_requeue,		"REQU") \
 	EM(rxrpc_recvmsg_return,		"RETN") \
 	EM(rxrpc_recvmsg_terminal,		"TERM") \
@@ -456,7 +472,7 @@
 	EM(rxrpc_tx_point_conn_abort,		"ConnAbort") \
 	EM(rxrpc_tx_point_reject,		"Reject") \
 	EM(rxrpc_tx_point_rxkad_challenge,	"RxkadChall") \
-	EM(rxrpc_tx_point_rxkad_response,	"RxkadResp") \
+	EM(rxrpc_tx_point_response,		"Response") \
 	EM(rxrpc_tx_point_version_keepalive,	"VerKeepalive") \
 	E_(rxrpc_tx_point_version_reply,	"VerReply")
 
diff --git a/include/uapi/linux/rxrpc.h b/include/uapi/linux/rxrpc.h
index 8f8dc7a937a4..c4e9833b0a12 100644
--- a/include/uapi/linux/rxrpc.h
+++ b/include/uapi/linux/rxrpc.h
@@ -36,26 +36,33 @@ struct sockaddr_rxrpc {
 #define RXRPC_MIN_SECURITY_LEVEL	4	/* minimum security level */
 #define RXRPC_UPGRADEABLE_SERVICE	5	/* Upgrade service[0] -> service[1] */
 #define RXRPC_SUPPORTED_CMSG		6	/* Get highest supported control message type */
+#define RXRPC_MANAGE_RESPONSE		7	/* [clnt] Want to manage RESPONSE packets */
 
 /*
  * RxRPC control messages
  * - If neither abort or accept are specified, the message is a data message.
  * - terminal messages mean that a user call ID tag can be recycled
+ * - C/S/- indicate whether these are applicable to client, server or both
  * - s/r/- indicate whether these are applicable to sendmsg() and/or recvmsg()
  */
 enum rxrpc_cmsg_type {
-	RXRPC_USER_CALL_ID	= 1,	/* sr: user call ID specifier */
-	RXRPC_ABORT		= 2,	/* sr: abort request / notification [terminal] */
-	RXRPC_ACK		= 3,	/* -r: [Service] RPC op final ACK received [terminal] */
-	RXRPC_NET_ERROR		= 5,	/* -r: network error received [terminal] */
-	RXRPC_BUSY		= 6,	/* -r: server busy received [terminal] */
-	RXRPC_LOCAL_ERROR	= 7,	/* -r: local error generated [terminal] */
-	RXRPC_NEW_CALL		= 8,	/* -r: [Service] new incoming call notification */
-	RXRPC_EXCLUSIVE_CALL	= 10,	/* s-: Call should be on exclusive connection */
-	RXRPC_UPGRADE_SERVICE	= 11,	/* s-: Request service upgrade for client call */
-	RXRPC_TX_LENGTH		= 12,	/* s-: Total length of Tx data */
-	RXRPC_SET_CALL_TIMEOUT	= 13,	/* s-: Set one or more call timeouts */
-	RXRPC_CHARGE_ACCEPT	= 14,	/* s-: Charge the accept pool with a user call ID */
+	RXRPC_USER_CALL_ID	= 1,	/* -sr: User call ID specifier */
+	RXRPC_ABORT		= 2,	/* -sr: Abort request / notification [terminal] */
+	RXRPC_ACK		= 3,	/* S-r: RPC op final ACK received [terminal] */
+	RXRPC_NET_ERROR		= 5,	/* --r: Network error received [terminal] */
+	RXRPC_BUSY		= 6,	/* C-r: Server busy received [terminal] */
+	RXRPC_LOCAL_ERROR	= 7,	/* --r: Local error generated [terminal] */
+	RXRPC_NEW_CALL		= 8,	/* S-r: New incoming call notification */
+	RXRPC_EXCLUSIVE_CALL	= 10,	/* Cs-: Call should be on exclusive connection */
+	RXRPC_UPGRADE_SERVICE	= 11,	/* Cs-: Request service upgrade for client call */
+	RXRPC_TX_LENGTH		= 12,	/* -s-: Total length of Tx data */
+	RXRPC_SET_CALL_TIMEOUT	= 13,	/* -s-: Set one or more call timeouts */
+	RXRPC_CHARGE_ACCEPT	= 14,	/* Ss-: Charge the accept pool with a user call ID */
+	RXRPC_OOB_ID		= 15,	/* -sr: OOB message ID */
+	RXRPC_CHALLENGED	= 16,	/* C-r: Info on a received CHALLENGE */
+	RXRPC_RESPOND		= 17,	/* Cs-: Respond to a challenge */
+	RXRPC_RESPONDED		= 18,	/* S-r: Data received in RESPONSE */
+	RXRPC_RESP_RXGK_APPDATA	= 19,	/* Cs-: RESPONSE: RxGK app data to include */
 	RXRPC__SUPPORTED
 };
 
@@ -118,4 +125,19 @@ enum rxrpc_cmsg_type {
 #define RXKADDATALEN		19270411	/* user data too long */
 #define RXKADILLEGALLEVEL	19270412	/* caller not authorised to use encrypted conns */
 
+/*
+ * Challenge information in the RXRPC_CHALLENGED control message.
+ */
+struct rxrpc_challenge {
+	__u16		service_id;	/* The service ID of the connection (may be upgraded) */
+	__u8		security_index;	/* The security index of the connection */
+	__u8		pad;		/* Round out to a multiple of 4 bytes. */
+	/* ... The security class gets to append extra information ... */
+};
+
+struct rxgk_challenge {
+	struct rxrpc_challenge	base;
+	__u32			enctype;	/* Krb5 encoding type */
+};
+
 #endif /* _UAPI_LINUX_RXRPC_H */
-- 
cgit v1.2.3


From 01af64269751f261421a9e80a527c8e987aeda8d Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 11 Apr 2025 10:52:50 +0100
Subject: rxrpc: Add the security index for yfs-rxgk

Add the security index and abort codes for the YFS variant of rxgk.

Signed-off-by: David Howells <dhowells@redhat.com>
Link: https://patch.msgid.link/20250411095303.2316168-6-dhowells@redhat.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/crypto/krb5.h      |  5 +++++
 include/uapi/linux/rxrpc.h | 31 +++++++++++++++++++++++++++++++
 2 files changed, 36 insertions(+)

(limited to 'include')

diff --git a/include/crypto/krb5.h b/include/crypto/krb5.h
index 62d998e62f47..71dd38f59be1 100644
--- a/include/crypto/krb5.h
+++ b/include/crypto/krb5.h
@@ -63,6 +63,11 @@ struct scatterlist;
 #define KEY_USAGE_SEED_ENCRYPTION       (0xAA)
 #define KEY_USAGE_SEED_INTEGRITY        (0x55)
 
+/*
+ * Standard Kerberos error codes.
+ */
+#define KRB5_PROG_KEYTYPE_NOSUPP		-1765328233
+
 /*
  * Mode of operation.
  */
diff --git a/include/uapi/linux/rxrpc.h b/include/uapi/linux/rxrpc.h
index c4e9833b0a12..d9735abd4c79 100644
--- a/include/uapi/linux/rxrpc.h
+++ b/include/uapi/linux/rxrpc.h
@@ -80,6 +80,7 @@ enum rxrpc_cmsg_type {
 #define RXRPC_SECURITY_RXKAD	2	/* kaserver or kerberos 4 */
 #define RXRPC_SECURITY_RXGK	4	/* gssapi-based */
 #define RXRPC_SECURITY_RXK5	5	/* kerberos 5 */
+#define RXRPC_SECURITY_YFS_RXGK	6	/* YFS gssapi-based */
 
 /*
  * RxRPC-level abort codes
@@ -125,6 +126,36 @@ enum rxrpc_cmsg_type {
 #define RXKADDATALEN		19270411	/* user data too long */
 #define RXKADILLEGALLEVEL	19270412	/* caller not authorised to use encrypted conns */
 
+/*
+ * RxGK GSSAPI security abort codes.
+ */
+#if 0 /* Original standard abort codes (used by OpenAFS) */
+#define RXGK_INCONSISTENCY	1233242880	/* Security module structure inconsistent */
+#define RXGK_PACKETSHORT	1233242881	/* Packet too short for security challenge */
+#define RXGK_BADCHALLENGE	1233242882	/* Invalid security challenge */
+#define RXGK_BADETYPE		1233242883	/* Invalid or impermissible encryption type */
+#define RXGK_BADLEVEL		1233242884	/* Invalid or impermissible security level */
+#define RXGK_BADKEYNO		1233242885	/* Key version number not found */
+#define RXGK_EXPIRED		1233242886	/* Token has expired */
+#define RXGK_NOTAUTH		1233242887	/* Caller not authorized */
+#define RXGK_BAD_TOKEN		1233242888	/* Security object was passed a bad token */
+#define RXGK_SEALED_INCON	1233242889	/* Sealed data inconsistent */
+#define RXGK_DATA_LEN		1233242890	/* User data too long */
+#define RXGK_BAD_QOP		1233242891	/* Inadequate quality of protection available */
+#else /* Revised standard abort codes (used by YFS) */
+#define RXGK_INCONSISTENCY	1233242880	/* Security module structure inconsistent */
+#define RXGK_PACKETSHORT	1233242881	/* Packet too short for security challenge */
+#define RXGK_BADCHALLENGE	1233242882	/* Security challenge/response failed */
+#define RXGK_SEALEDINCON	1233242883	/* Sealed data is inconsistent */
+#define RXGK_NOTAUTH		1233242884	/* Caller not authorised */
+#define RXGK_EXPIRED		1233242885	/* Authentication expired */
+#define RXGK_BADLEVEL		1233242886	/* Unsupported or not permitted security level */
+#define RXGK_BADKEYNO		1233242887	/* Bad transport key number */
+#define RXGK_NOTRXGK		1233242888	/* Security layer is not rxgk */
+#define RXGK_UNSUPPORTED	1233242889	/* Endpoint does not support rxgk */
+#define RXGK_GSSERROR		1233242890	/* GSSAPI mechanism error */
+#endif
+
 /*
  * Challenge information in the RXRPC_CHALLENGED control message.
  */
-- 
cgit v1.2.3


From 0ca100ff4df64f5d0f6c1dd5080c3e096786bea6 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 11 Apr 2025 10:52:51 +0100
Subject: rxrpc: Add YFS RxGK (GSSAPI) security class

Add support for the YFS-variant RxGK security class to support
GSSAPI-derived authentication.  This also allows the use of better crypto
over the rxkad security class.

The key payload is XDR-encoded and takes the form:

    typedef int64_t opr_time;

    const AFSTOKEN_RK_TIX_MAX = 12000; 	/* Matches entry in rxkad.h */

    struct token_rxkad {
	afs_int32 viceid;
	afs_int32 kvno;
	afs_int64 key;
	afs_int32 begintime;
	afs_int32 endtime;
	afs_int32 primary_flag;
	opaque ticket<AFSTOKEN_RK_TIX_MAX>;
    };

    struct token_rxgk {
	opr_time begintime;
	opr_time endtime;
	afs_int64 level;
	afs_int64 lifetime;
	afs_int64 bytelife;
	afs_int64 enctype;
	opaque key<>;
	opaque ticket<>;
    };

    const AFSTOKEN_UNION_NOAUTH = 0;
    const AFSTOKEN_UNION_KAD = 2;
    const AFSTOKEN_UNION_YFSGK = 6;

    union ktc_tokenUnion switch (afs_int32 type) {
	case AFSTOKEN_UNION_KAD:
	    token_rxkad kad;
	case AFSTOKEN_UNION_YFSGK:
	    token_rxgk  gk;
    };

    const AFSTOKEN_LENGTH_MAX = 16384;
    typedef opaque token_opaque<AFSTOKEN_LENGTH_MAX>;

    const AFSTOKEN_MAX = 8;
    const AFSTOKEN_CELL_MAX = 64;

    struct ktc_setTokenData {
	afs_int32 flags;
	string cell<AFSTOKEN_CELL_MAX>;
	token_opaque tokens<AFSTOKEN_MAX>;
    };

The parser for the basic token struct is already present, as is the rxkad
token type.  This adds a parser for the rxgk token type.

Signed-off-by: David Howells <dhowells@redhat.com>
cc: Marc Dionne <marc.dionne@auristor.com>
cc: Herbert Xu <herbert@gondor.apana.org.au>
cc: Chuck Lever <chuck.lever@oracle.com>
cc: Simon Horman <horms@kernel.org>
cc: linux-afs@lists.infradead.org
Link: https://patch.msgid.link/20250411095303.2316168-7-dhowells@redhat.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/keys/rxrpc-type.h | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

(limited to 'include')

diff --git a/include/keys/rxrpc-type.h b/include/keys/rxrpc-type.h
index 333c0f49a9cd..0ddbe197a261 100644
--- a/include/keys/rxrpc-type.h
+++ b/include/keys/rxrpc-type.h
@@ -9,6 +9,7 @@
 #define _KEYS_RXRPC_TYPE_H
 
 #include <linux/key.h>
+#include <crypto/krb5.h>
 
 /*
  * key type for AF_RXRPC keys
@@ -31,6 +32,21 @@ struct rxkad_key {
 	u8	ticket[];		/* the encrypted ticket */
 };
 
+/*
+ * RxRPC key for YFS-RxGK (type-6 security)
+ */
+struct rxgk_key {
+	s64		begintime;	/* Time at which the ticket starts */
+	s64		endtime;	/* Time at which the ticket ends */
+	u64		lifetime;	/* Maximum lifespan of a connection (seconds) */
+	u64		bytelife;	/* Maximum number of bytes on a connection */
+	unsigned int	enctype;	/* Encoding type */
+	s8		level;		/* Negotiated security RXRPC_SECURITY_PLAIN/AUTH/ENCRYPT */
+	struct krb5_buffer key;		/* Master key, K0 */
+	struct krb5_buffer ticket;	/* Ticket to be passed to server */
+	u8		_key[];		/* Key storage */
+};
+
 /*
  * list of tokens attached to an rxrpc key
  */
@@ -40,6 +56,7 @@ struct rxrpc_key_token {
 	struct rxrpc_key_token *next;	/* the next token in the list */
 	union {
 		struct rxkad_key *kad;
+		struct rxgk_key *rxgk;
 	};
 };
 
-- 
cgit v1.2.3


From 9d1d2b59341f58126a69b51f9f5f8ccb9f12e54a Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 11 Apr 2025 10:52:53 +0100
Subject: rxrpc: rxgk: Implement the yfs-rxgk security class (GSSAPI)

Implement the basic parts of the yfs-rxgk security class (security index 6)
to support GSSAPI-negotiated security.

Signed-off-by: David Howells <dhowells@redhat.com>
cc: Marc Dionne <marc.dionne@auristor.com>
cc: Herbert Xu <herbert@gondor.apana.org.au>
cc: Chuck Lever <chuck.lever@oracle.com>
cc: Simon Horman <horms@kernel.org>
cc: linux-afs@lists.infradead.org
Link: https://patch.msgid.link/20250411095303.2316168-9-dhowells@redhat.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/trace/events/rxrpc.h | 45 ++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 43 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h
index 08ecebd90595..aab81e8196ae 100644
--- a/include/trace/events/rxrpc.h
+++ b/include/trace/events/rxrpc.h
@@ -69,6 +69,38 @@
 	EM(rxkad_abort_resp_tkt_sname,		"rxkad-resp-tk-sname")	\
 	EM(rxkad_abort_resp_unknown_tkt,	"rxkad-resp-unknown-tkt") \
 	EM(rxkad_abort_resp_version,		"rxkad-resp-version")	\
+	/* RxGK security errors */					\
+	EM(rxgk_abort_1_verify_mic_eproto,	"rxgk1-vfy-mic-eproto")	\
+	EM(rxgk_abort_2_decrypt_eproto,		"rxgk2-dec-eproto")	\
+	EM(rxgk_abort_2_short_data,		"rxgk2-short-data")	\
+	EM(rxgk_abort_2_short_encdata,		"rxgk2-short-encdata")	\
+	EM(rxgk_abort_2_short_header,		"rxgk2-short-hdr")	\
+	EM(rxgk_abort_bad_key_number,		"rxgk-bad-key-num")	\
+	EM(rxgk_abort_chall_key_expired,	"rxgk-chall-key-exp")	\
+	EM(rxgk_abort_chall_no_key,		"rxgk-chall-nokey")	\
+	EM(rxgk_abort_chall_short,		"rxgk-chall-short")	\
+	EM(rxgk_abort_resp_auth_dec,		"rxgk-resp-auth-dec")	\
+	EM(rxgk_abort_resp_bad_callid,		"rxgk-resp-bad-callid")	\
+	EM(rxgk_abort_resp_bad_nonce,		"rxgk-resp-bad-nonce")	\
+	EM(rxgk_abort_resp_bad_param,		"rxgk-resp-bad-param")	\
+	EM(rxgk_abort_resp_call_ctr,		"rxgk-resp-call-ctr")	\
+	EM(rxgk_abort_resp_call_state,		"rxgk-resp-call-state")	\
+	EM(rxgk_abort_resp_internal_error,	"rxgk-resp-int-error")	\
+	EM(rxgk_abort_resp_nopkg,		"rxgk-resp-nopkg")	\
+	EM(rxgk_abort_resp_short_applen,	"rxgk-resp-short-applen") \
+	EM(rxgk_abort_resp_short_auth,		"rxgk-resp-short-auth") \
+	EM(rxgk_abort_resp_short_call_list,	"rxgk-resp-short-callls") \
+	EM(rxgk_abort_resp_short_packet,	"rxgk-resp-short-packet") \
+	EM(rxgk_abort_resp_short_yfs_klen,	"rxgk-resp-short-yfs-klen") \
+	EM(rxgk_abort_resp_short_yfs_key,	"rxgk-resp-short-yfs-key") \
+	EM(rxgk_abort_resp_short_yfs_tkt,	"rxgk-resp-short-yfs-tkt") \
+	EM(rxgk_abort_resp_tok_dec,		"rxgk-resp-tok-dec")	\
+	EM(rxgk_abort_resp_tok_internal_error,	"rxgk-resp-tok-int-err") \
+	EM(rxgk_abort_resp_tok_keyerr,		"rxgk-resp-tok-keyerr")	\
+	EM(rxgk_abort_resp_tok_nokey,		"rxgk-resp-tok-nokey")	\
+	EM(rxgk_abort_resp_tok_nopkg,		"rxgk-resp-tok-nopkg")	\
+	EM(rxgk_abort_resp_tok_short,		"rxgk-resp-tok-short")	\
+	EM(rxgk_abort_resp_xdr_align,		"rxgk-resp-xdr-align")	\
 	/* rxrpc errors */						\
 	EM(rxrpc_abort_call_improper_term,	"call-improper-term")	\
 	EM(rxrpc_abort_call_reset,		"call-reset")		\
@@ -471,6 +503,7 @@
 	EM(rxrpc_tx_point_call_final_resend,	"CallFinalResend") \
 	EM(rxrpc_tx_point_conn_abort,		"ConnAbort") \
 	EM(rxrpc_tx_point_reject,		"Reject") \
+	EM(rxrpc_tx_point_rxgk_challenge,	"RxGKChall") \
 	EM(rxrpc_tx_point_rxkad_challenge,	"RxkadChall") \
 	EM(rxrpc_tx_point_response,		"Response") \
 	EM(rxrpc_tx_point_version_keepalive,	"VerKeepalive") \
@@ -489,6 +522,7 @@
 
 #define rxrpc_txbuf_traces \
 	EM(rxrpc_txbuf_alloc_data,		"ALLOC DATA ")	\
+	EM(rxrpc_txbuf_alloc_response,		"ALLOC RESP ")	\
 	EM(rxrpc_txbuf_free,			"FREE       ")	\
 	EM(rxrpc_txbuf_get_buffer,		"GET BUFFER ")	\
 	EM(rxrpc_txbuf_get_trans,		"GET TRANS  ")	\
@@ -496,6 +530,7 @@
 	EM(rxrpc_txbuf_put_cleaned,		"PUT CLEANED")	\
 	EM(rxrpc_txbuf_put_nomem,		"PUT NOMEM  ")	\
 	EM(rxrpc_txbuf_put_rotated,		"PUT ROTATED")	\
+	EM(rxrpc_txbuf_put_response_tx,		"PUT RESP TX")	\
 	EM(rxrpc_txbuf_put_send_aborted,	"PUT SEND-X ")	\
 	EM(rxrpc_txbuf_put_trans,		"PUT TRANS  ")	\
 	EM(rxrpc_txbuf_see_lost,		"SEE LOST   ")	\
@@ -1178,6 +1213,7 @@ TRACE_EVENT(rxrpc_rx_challenge,
 		    __field(u32,		version)
 		    __field(u32,		nonce)
 		    __field(u32,		min_level)
+		    __field(u8,			security_ix)
 			     ),
 
 	    TP_fast_assign(
@@ -1186,11 +1222,13 @@ TRACE_EVENT(rxrpc_rx_challenge,
 		    __entry->version = version;
 		    __entry->nonce = nonce;
 		    __entry->min_level = min_level;
+		    __entry->security_ix = conn->security_ix;
 			   ),
 
-	    TP_printk("C=%08x CHALLENGE %08x v=%x n=%x ml=%x",
+	    TP_printk("C=%08x CHALLENGE r=%08x sx=%u v=%x n=%x ml=%x",
 		      __entry->conn,
 		      __entry->serial,
+		      __entry->security_ix,
 		      __entry->version,
 		      __entry->nonce,
 		      __entry->min_level)
@@ -1208,6 +1246,7 @@ TRACE_EVENT(rxrpc_rx_response,
 		    __field(u32,		version)
 		    __field(u32,		kvno)
 		    __field(u32,		ticket_len)
+		    __field(u8,			security_ix)
 			     ),
 
 	    TP_fast_assign(
@@ -1216,11 +1255,13 @@ TRACE_EVENT(rxrpc_rx_response,
 		    __entry->version = version;
 		    __entry->kvno = kvno;
 		    __entry->ticket_len = ticket_len;
+		    __entry->security_ix = conn->security_ix;
 			   ),
 
-	    TP_printk("C=%08x RESPONSE %08x v=%x kvno=%x tl=%x",
+	    TP_printk("C=%08x RESPONSE r=%08x sx=%u v=%x kvno=%x tl=%x",
 		      __entry->conn,
 		      __entry->serial,
+		      __entry->security_ix,
 		      __entry->version,
 		      __entry->kvno,
 		      __entry->ticket_len)
-- 
cgit v1.2.3


From 7a7513a3081c6a2729d8570c77bbed1978277dc9 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 11 Apr 2025 10:52:54 +0100
Subject: rxrpc: rxgk: Implement connection rekeying

Implement rekeying of connections with the RxGK security class.  This
involves regenerating the keys with a different key number as part of the
input data after a certain amount of time or a certain amount of bytes
encrypted.  Rekeying may be triggered by either end.

The LSW of the key number is inserted into the security-specific field in
the RX header, and we try to expand it to 32 bits to make it last longer.

Signed-off-by: David Howells <dhowells@redhat.com>
cc: Marc Dionne <marc.dionne@auristor.com>
cc: Herbert Xu <herbert@gondor.apana.org.au>
cc: Chuck Lever <chuck.lever@oracle.com>
cc: Simon Horman <horms@kernel.org>
cc: linux-afs@lists.infradead.org
Link: https://patch.msgid.link/20250411095303.2316168-10-dhowells@redhat.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/trace/events/rxrpc.h | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

(limited to 'include')

diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h
index aab81e8196ae..920439df1f6f 100644
--- a/include/trace/events/rxrpc.h
+++ b/include/trace/events/rxrpc.h
@@ -2725,6 +2725,30 @@ TRACE_EVENT(rxrpc_rack_timer,
 		      ktime_to_us(__entry->delay))
 	    );
 
+TRACE_EVENT(rxrpc_rxgk_rekey,
+	    TP_PROTO(struct rxrpc_connection *conn,
+		     unsigned int current_key, unsigned int requested_key),
+
+	    TP_ARGS(conn, current_key, requested_key),
+
+	    TP_STRUCT__entry(
+		    __field(unsigned int,	conn)
+		    __field(unsigned int,	current_key)
+		    __field(unsigned int,	requested_key)
+			     ),
+
+	    TP_fast_assign(
+		    __entry->conn		= conn->debug_id;
+		    __entry->current_key	= current_key;
+		    __entry->requested_key	= requested_key;
+			   ),
+
+	    TP_printk("C=%08x cur=%x req=%x",
+		      __entry->conn,
+		      __entry->current_key,
+		      __entry->requested_key)
+	    );
+
 #undef EM
 #undef E_
 
-- 
cgit v1.2.3


From d03539d5c2dec9b028297c15e57bd3c01d0d9c0d Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 11 Apr 2025 10:52:56 +0100
Subject: rxrpc: Display security params in the afs_cb_call tracepoint

Make the afs_cb_call tracepoint display some security parameters to make
debugging easier.

Signed-off-by: David Howells <dhowells@redhat.com>
cc: Marc Dionne <marc.dionne@auristor.com>
cc: Simon Horman <horms@kernel.org>
cc: linux-afs@lists.infradead.org
Link: https://patch.msgid.link/20250411095303.2316168-12-dhowells@redhat.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/af_rxrpc.h     |  2 ++
 include/trace/events/afs.h | 11 +++++++++--
 2 files changed, 11 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/af_rxrpc.h b/include/net/af_rxrpc.h
index 0b209f703ffc..f15341594cc8 100644
--- a/include/net/af_rxrpc.h
+++ b/include/net/af_rxrpc.h
@@ -112,5 +112,7 @@ int rxkad_kernel_respond_to_challenge(struct sk_buff *challenge);
 u32 rxgk_kernel_query_challenge(struct sk_buff *challenge);
 int rxgk_kernel_respond_to_challenge(struct sk_buff *challenge,
 				     struct krb5_buffer *appdata);
+u8 rxrpc_kernel_query_call_security(struct rxrpc_call *call,
+				    u16 *_service_id, u32 *_enctype);
 
 #endif /* _NET_RXRPC_H */
diff --git a/include/trace/events/afs.h b/include/trace/events/afs.h
index 8857f5ea77d4..7f83d242c8e9 100644
--- a/include/trace/events/afs.h
+++ b/include/trace/events/afs.h
@@ -663,19 +663,26 @@ TRACE_EVENT(afs_cb_call,
 		    __field(unsigned int,		call)
 		    __field(u32,			op)
 		    __field(u16,			service_id)
+		    __field(u8,				security_ix)
+		    __field(u32,			enctype)
 			     ),
 
 	    TP_fast_assign(
 		    __entry->call	= call->debug_id;
 		    __entry->op		= call->operation_ID;
 		    __entry->service_id	= call->service_id;
+		    __entry->security_ix = call->security_ix;
+		    __entry->enctype	= call->enctype;
 			   ),
 
-	    TP_printk("c=%08x %s",
+	    TP_printk("c=%08x %s sv=%u sx=%u en=%u",
 		      __entry->call,
 		      __entry->service_id == 2501 ?
 		      __print_symbolic(__entry->op, yfs_cm_operations) :
-		      __print_symbolic(__entry->op, afs_cm_operations))
+		      __print_symbolic(__entry->op, afs_cm_operations),
+		      __entry->service_id,
+		      __entry->security_ix,
+		      __entry->enctype)
 	    );
 
 TRACE_EVENT(afs_call,
-- 
cgit v1.2.3


From fba6995798c6085a0c2fc67e0cacd489a6971044 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 11 Apr 2025 10:52:58 +0100
Subject: rxrpc: Add more CHALLENGE/RESPONSE packet tracing

Add more tracing for CHALLENGE and RESPONSE packets.  Currently, rxrpc only
has client-relevant tracepoints (rx_challenge and tx_response); add the
server-side ones too.

Further, record the service ID in the rx_challenge tracepoint as well.

Signed-off-by: David Howells <dhowells@redhat.com>
cc: Marc Dionne <marc.dionne@auristor.com>
cc: Simon Horman <horms@kernel.org>
cc: linux-afs@lists.infradead.org
Link: https://patch.msgid.link/20250411095303.2316168-14-dhowells@redhat.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/trace/events/rxrpc.h | 78 +++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 77 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h
index 920439df1f6f..378d2dfc7392 100644
--- a/include/trace/events/rxrpc.h
+++ b/include/trace/events/rxrpc.h
@@ -1201,6 +1201,39 @@ TRACE_EVENT(rxrpc_rx_conn_abort,
 		      __entry->abort_code)
 	    );
 
+TRACE_EVENT(rxrpc_tx_challenge,
+	    TP_PROTO(struct rxrpc_connection *conn, rxrpc_serial_t serial,
+		     u32 version, u32 nonce),
+
+	    TP_ARGS(conn, serial, version, nonce),
+
+	    TP_STRUCT__entry(
+		    __field(unsigned int,	conn)
+		    __field(rxrpc_serial_t,	serial)
+		    __field(u32,		version)
+		    __field(u32,		nonce)
+		    __field(u16,		service_id)
+		    __field(u8,			security_ix)
+			     ),
+
+	    TP_fast_assign(
+		    __entry->conn = conn->debug_id;
+		    __entry->serial = serial;
+		    __entry->version = version;
+		    __entry->nonce = nonce;
+		    __entry->service_id = conn->service_id;
+		    __entry->security_ix = conn->security_ix;
+			   ),
+
+	    TP_printk("C=%08x CHALLENGE r=%08x sv=%u+%u v=%x n=%x",
+		      __entry->conn,
+		      __entry->serial,
+		      __entry->service_id,
+		      __entry->security_ix,
+		      __entry->version,
+		      __entry->nonce)
+	    );
+
 TRACE_EVENT(rxrpc_rx_challenge,
 	    TP_PROTO(struct rxrpc_connection *conn, rxrpc_serial_t serial,
 		     u32 version, u32 nonce, u32 min_level),
@@ -1213,6 +1246,7 @@ TRACE_EVENT(rxrpc_rx_challenge,
 		    __field(u32,		version)
 		    __field(u32,		nonce)
 		    __field(u32,		min_level)
+		    __field(u16,		service_id)
 		    __field(u8,			security_ix)
 			     ),
 
@@ -1222,18 +1256,60 @@ TRACE_EVENT(rxrpc_rx_challenge,
 		    __entry->version = version;
 		    __entry->nonce = nonce;
 		    __entry->min_level = min_level;
+		    __entry->service_id = conn->service_id;
 		    __entry->security_ix = conn->security_ix;
 			   ),
 
-	    TP_printk("C=%08x CHALLENGE r=%08x sx=%u v=%x n=%x ml=%x",
+	    TP_printk("C=%08x CHALLENGE r=%08x sv=%u+%u v=%x n=%x ml=%x",
 		      __entry->conn,
 		      __entry->serial,
+		      __entry->service_id,
 		      __entry->security_ix,
 		      __entry->version,
 		      __entry->nonce,
 		      __entry->min_level)
 	    );
 
+TRACE_EVENT(rxrpc_tx_response,
+	    TP_PROTO(struct rxrpc_connection *conn, rxrpc_serial_t serial,
+		     struct rxrpc_skb_priv *rsp),
+
+	    TP_ARGS(conn, serial, rsp),
+
+	    TP_STRUCT__entry(
+		    __field(unsigned int,	conn)
+		    __field(rxrpc_serial_t,	serial)
+		    __field(rxrpc_serial_t,	challenge)
+		    __field(u32,		version)
+		    __field(u32,		kvno)
+		    __field(u16,		ticket_len)
+		    __field(u16,		appdata_len)
+		    __field(u16,		service_id)
+		    __field(u8,			security_ix)
+			     ),
+
+	    TP_fast_assign(
+		    __entry->conn	= conn->debug_id;
+		    __entry->serial	= serial;
+		    __entry->challenge	= rsp->resp.challenge_serial;
+		    __entry->version	= rsp->resp.version;
+		    __entry->kvno	= rsp->resp.kvno;
+		    __entry->ticket_len = rsp->resp.ticket_len;
+		    __entry->service_id = conn->service_id;
+		    __entry->security_ix = conn->security_ix;
+			   ),
+
+	    TP_printk("C=%08x RESPONSE r=%08x cr=%08x sv=%u+%u v=%x kv=%x tl=%u",
+		      __entry->conn,
+		      __entry->serial,
+		      __entry->challenge,
+		      __entry->service_id,
+		      __entry->security_ix,
+		      __entry->version,
+		      __entry->kvno,
+		      __entry->ticket_len)
+	    );
+
 TRACE_EVENT(rxrpc_rx_response,
 	    TP_PROTO(struct rxrpc_connection *conn, rxrpc_serial_t serial,
 		     u32 version, u32 kvno, u32 ticket_len),
-- 
cgit v1.2.3


From 6e83166dd8003e8611f253426b85e0c3d933e1c2 Mon Sep 17 00:00:00 2001
From: "Matthieu Baerts (NGI0)" <matttbe@kernel.org>
Date: Sun, 13 Apr 2025 11:34:32 +0200
Subject: mptcp: sched: remove mptcp_sched_data

This is a follow-up of commit b68b106b0f15 ("mptcp: sched: reduce size
for unused data"), now removing the mptcp_sched_data structure.

Now is a good time to do that, because the previously mentioned WIP work
has been updated, no longer depending on this structure.

Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
Link: https://patch.msgid.link/20250413-net-next-mptcp-sched-mib-sft-misc-v2-1-0f83a4350150@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/mptcp.h | 13 ++-----------
 1 file changed, 2 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/net/mptcp.h b/include/net/mptcp.h
index bfbad695951c..f7263fe2a2e4 100644
--- a/include/net/mptcp.h
+++ b/include/net/mptcp.h
@@ -101,18 +101,9 @@ struct mptcp_out_options {
 #define MPTCP_SCHED_MAX		128
 #define MPTCP_SCHED_BUF_MAX	(MPTCP_SCHED_NAME_MAX * MPTCP_SCHED_MAX)
 
-#define MPTCP_SUBFLOWS_MAX	8
-
-struct mptcp_sched_data {
-	u8	subflows;
-	struct mptcp_subflow_context *contexts[MPTCP_SUBFLOWS_MAX];
-};
-
 struct mptcp_sched_ops {
-	int (*get_send)(struct mptcp_sock *msk,
-			struct mptcp_sched_data *data);
-	int (*get_retrans)(struct mptcp_sock *msk,
-			   struct mptcp_sched_data *data);
+	int (*get_send)(struct mptcp_sock *msk);
+	int (*get_retrans)(struct mptcp_sock *msk);
 
 	char			name[MPTCP_SCHED_NAME_MAX];
 	struct module		*owner;
-- 
cgit v1.2.3


From f99564688f38458d86b64f099ebf03f19517cf77 Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Sun, 13 Apr 2025 16:09:40 +0200
Subject: net: phy: remove device_phy_find_device

AFAICS this function has never had a user.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Link: https://patch.msgid.link/ab7b8094-2eea-4e82-a047-fd60117f220b@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/phy.h | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'include')

diff --git a/include/linux/phy.h b/include/linux/phy.h
index a2bfae80c449..fb755358d965 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -1757,7 +1757,6 @@ struct phy_device *phy_device_create(struct mii_bus *bus, int addr, u32 phy_id,
 int fwnode_get_phy_id(struct fwnode_handle *fwnode, u32 *phy_id);
 struct mdio_device *fwnode_mdio_find_device(struct fwnode_handle *fwnode);
 struct phy_device *fwnode_phy_find_device(struct fwnode_handle *phy_fwnode);
-struct phy_device *device_phy_find_device(struct device *dev);
 struct fwnode_handle *fwnode_get_phy_node(const struct fwnode_handle *fwnode);
 struct phy_device *get_phy_device(struct mii_bus *bus, int addr, bool is_c45);
 int phy_device_register(struct phy_device *phy);
@@ -1779,11 +1778,6 @@ struct phy_device *fwnode_phy_find_device(struct fwnode_handle *phy_fwnode)
 	return NULL;
 }
 
-static inline struct phy_device *device_phy_find_device(struct device *dev)
-{
-	return NULL;
-}
-
 static inline
 struct fwnode_handle *fwnode_get_phy_node(struct fwnode_handle *fwnode)
 {
-- 
cgit v1.2.3


From 95d06e92a401928fe46fda7616e460f39cb7211b Mon Sep 17 00:00:00 2001
From: Breno Leitao <leitao@debian.org>
Date: Mon, 14 Apr 2025 06:24:07 -0700
Subject: netlink: Introduce nlmsg_payload helper

Create a new helper function, nlmsg_payload(), to simplify checking and
retrieving Netlink message payloads.

This reduces boilerplate code for users who need to verify the message
length before accessing its data.

Suggested-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Breno Leitao <leitao@debian.org>
Reviewed-by: Jakub Kicinski <kuba@kernel.org>
Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Link: https://patch.msgid.link/20250414-nlmsg-v2-1-3d90cb42c6af@debian.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/netlink.h | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

(limited to 'include')

diff --git a/include/net/netlink.h b/include/net/netlink.h
index 29e0db940382..82e07e272290 100644
--- a/include/net/netlink.h
+++ b/include/net/netlink.h
@@ -611,6 +611,22 @@ static inline int nlmsg_len(const struct nlmsghdr *nlh)
 	return nlh->nlmsg_len - NLMSG_HDRLEN;
 }
 
+/**
+ * nlmsg_payload - message payload if the data fits in the len
+ * @nlh: netlink message header
+ * @len: struct length
+ *
+ * Returns: The netlink message payload/data if the length is sufficient,
+ * otherwise NULL.
+ */
+static inline void *nlmsg_payload(const struct nlmsghdr *nlh, size_t len)
+{
+	if (nlh->nlmsg_len < nlmsg_msg_size(len))
+		return NULL;
+
+	return nlmsg_data(nlh);
+}
+
 /**
  * nlmsg_attrdata - head of attributes data
  * @nlh: netlink message header
-- 
cgit v1.2.3


From 7c571ac57d9d97190dcba18212fabf99888b0c48 Mon Sep 17 00:00:00 2001
From: Jacob Keller <jacob.e.keller@intel.com>
Date: Mon, 14 Apr 2025 14:26:30 -0700
Subject: net: ptp: introduce .supported_extts_flags to ptp_clock_info
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The PTP_EXTTS_REQUEST(2) ioctl has a flags field which specifies how the
external timestamp request should behave. This includes which edge of the
signal to timestamp, as well as a specialized "offset" mode. It is expected
that more flags will be added in the future.

Driver authors routinely do not check the flags, often accepting requests
with flags which they do not support. Even drivers which do check flags may
not be future-proofed to reject flags not yet defined. Thus, any future
flag additions often require manually updating drivers to reject these
flags.

Hoping that we catch missing flag checks during review, or playing
whack-a-mole after the fact, is the wrong approach.

Introduce the "supported_extts_flags" field to the ptp_clock_info
structure. This field defines the set of flags the device actually
supports.

Update the core character device logic to check this field and reject
unsupported requests. Getting this right is somewhat tricky. First, to
avoid unnecessary repetition and make basic functionality work when
.supported_extts_flags is 0, the core always accepts the PTP_ENABLE_FEATURE
flag. This flag is used to set the 'on' parameter to the .enable function
and is thus always 'supported' by all drivers.

For backwards compatibility, the PTP_RISING_EDGE and PTP_FALLING_EDGE flags
are merely "hints" when using the old PTP_EXTTS_REQUEST ioctl, and are not
expected to be enforced. If the user issues PTP_EXTTS_REQUEST2, the
PTP_STRICT_FLAGS flag is added which is supposed to inform the driver to
strictly validate the flags and reject unsupported requests. To handle
this, first check if the driver reports PTP_STRICT_FLAGS support. If it
does not, then always allow the PTP_RISING_EDGE and PTP_FALLING_EDGE flags.
This keeps backwards compatibility with the original PTP_EXTTS_REQUEST
ioctl where these flags are not guaranteed to be honored.

This way, drivers which do not set the supported_extts_flags will continue
to accept requests for the original PTP_EXTTS_REQUEST ioctl. The core will
automatically reject requests with new flags, and correctly reject requests
with PTP_STRICT_FLAGS, where the driver is supposed to strictly validate
the flags.

Update the various drivers, refactoring their validation logic into the
.supported_extts_flags field. For consistency and readability,
PTP_ENABLE_FEATURE is not set in the supported flags list, and
PTP_EXTTS_EDGES is expanded to PTP_RISING_EDGE | PTP_FALLING_EDGE in all
cases.

Note the following driver files set n_ext_ts to a non-zero value but did
not check flags at all:

 • drivers/net/ethernet/freescale/dpaa2/dpaa2-ptp.c
 • drivers/net/ethernet/freescale/enetc/enetc_ptp.c
 • drivers/net/ethernet/intel/i40e/i40e_ptp.c
 • drivers/net/ethernet/marvell/octeontx2/nic/otx2_ptp.c
 • drivers/net/ethernet/renesas/ravb_ptp.c
 • drivers/net/ethernet/renesas/rtsn.c
 • drivers/net/ethernet/renesas/rtsn.h
 • drivers/net/ethernet/ti/am65-cpts.c
 • drivers/net/ethernet/ti/cpts.h
 • drivers/net/ethernet/ti/icssg/icss_iep.c
 • drivers/net/ethernet/xscale/ptp_ixp46x.c
 • drivers/net/phy/bcm-phy-ptp.c
 • drivers/ptp/ptp_ocp.c
 • drivers/ptp/ptp_pch.c
 • drivers/ptp/ptp_qoriq.c

These drivers' behavior does change slightly: they will now reject the
PTP_EXTTS_REQUEST2 ioctl, because they do not strictly validate their
flags. This also makes them no longer incorrectly accept PTP_EXT_OFFSET.

Also note that the renesas ravb driver does not support PTP_STRICT_FLAGS.
We could leave the .supported_extts_flags as 0, but I added the
PTP_RISING_EDGE | PTP_FALLING_EDGE since the driver previously manually
validated these flags. This is equivalent to 0 because the core will allow
these flags regardless unless PTP_STRICT_FLAGS is also set.

Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
Reviewed-by: Kory Maincent <kory.maincent@bootlin.com>
Link: https://patch.msgid.link/20250414-jk-supported-perout-flags-v2-1-f6b17d15475c@intel.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/ptp_clock_kernel.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'include')

diff --git a/include/linux/ptp_clock_kernel.h b/include/linux/ptp_clock_kernel.h
index 0d68d09bedd1..25cba2e5ee69 100644
--- a/include/linux/ptp_clock_kernel.h
+++ b/include/linux/ptp_clock_kernel.h
@@ -68,6 +68,17 @@ struct ptp_system_timestamp {
  * @n_per_out: The number of programmable periodic signals.
  * @n_pins:    The number of programmable pins.
  * @pps:       Indicates whether the clock supports a PPS callback.
+ *
+ * @supported_extts_flags:  The set of flags the driver supports for the
+ *                          PTP_EXTTS_REQUEST ioctl. The PTP core will use
+ *                          this list to reject unsupported requests.
+ *                          PTP_ENABLE_FEATURE is assumed and does not need to
+ *                          be included. If PTP_STRICT_FLAGS is *not* set,
+ *                          then both PTP_RISING_EDGE and PTP_FALLING_EDGE
+ *                          will be assumed. Note that PTP_STRICT_FLAGS must
+ *                          be set if the driver wants to honor
+ *                          PTP_EXTTS_REQUEST2 and any future flags.
+ *
  * @pin_config: Array of length 'n_pins'. If the number of
  *              programmable pins is nonzero, then drivers must
  *              allocate and initialize this array.
@@ -174,6 +185,7 @@ struct ptp_clock_info {
 	int n_per_out;
 	int n_pins;
 	int pps;
+	unsigned int supported_extts_flags;
 	struct ptp_pin_desc *pin_config;
 	int (*adjfine)(struct ptp_clock_info *ptp, long scaled_ppm);
 	int (*adjphase)(struct ptp_clock_info *ptp, s32 phase);
-- 
cgit v1.2.3


From d9f3e9ecc4562ae07aaf614cf0a6690ef7ca0e10 Mon Sep 17 00:00:00 2001
From: Jacob Keller <jacob.e.keller@intel.com>
Date: Mon, 14 Apr 2025 14:26:31 -0700
Subject: net: ptp: introduce .supported_perout_flags to ptp_clock_info
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The PTP_PEROUT_REQUEST2 ioctl has gained support for flags specifying
specific output behavior including PTP_PEROUT_ONE_SHOT,
PTP_PEROUT_DUTY_CYCLE, PTP_PEROUT_PHASE.

Driver authors are notorious for not checking the flags of the request.
This results in misinterpreting the request, generating an output signal
that does not match the requested value. It is anticipated that even more
flags will be added in the future, resulting in even more broken requests.

Expecting these issues to be caught during review or playing whack-a-mole
after the fact is not a great solution.

Instead, introduce the supported_perout_flags field in the ptp_clock_info
structure. Update the core character device logic to explicitly reject any
request which has a flag not on this list.

This ensures that drivers must 'opt in' to the flags they support. Drivers
which don't set the .supported_perout_flags field will not need to check
that unsupported flags aren't passed, as the core takes care of this.

Update the drivers which do support flags to set this new field.

Note the following driver files set n_per_out to a non-zero value but did
not check the flags at all:

 • drivers/ptp/ptp_clockmatrix.c
 • drivers/ptp/ptp_idt82p33.c
 • drivers/ptp/ptp_fc3.c
 • drivers/net/ethernet/ti/am65-cpts.c
 • drivers/net/ethernet/aquantia/atlantic/aq_ptp.c
 • drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.c
 • drivers/net/dsa/sja1105/sja1105_ptp.c
 • drivers/net/ethernet/freescale/dpaa2/dpaa2-ptp.c
 • drivers/net/ethernet/mscc/ocelot_vsc7514.c
 • drivers/net/ethernet/intel/i40e/i40e_ptp.c

Reviewed-by: Vadim Fedorenko <vadim.fedorenko@linux.dev>
Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
Reviewed-by: Kory Maincent <kory.maincent@bootlin.com>
Link: https://patch.msgid.link/20250414-jk-supported-perout-flags-v2-2-f6b17d15475c@intel.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/ptp_clock_kernel.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include')

diff --git a/include/linux/ptp_clock_kernel.h b/include/linux/ptp_clock_kernel.h
index 25cba2e5ee69..eced7e9bf69a 100644
--- a/include/linux/ptp_clock_kernel.h
+++ b/include/linux/ptp_clock_kernel.h
@@ -69,6 +69,11 @@ struct ptp_system_timestamp {
  * @n_pins:    The number of programmable pins.
  * @pps:       Indicates whether the clock supports a PPS callback.
  *
+ * @supported_perout_flags:  The set of flags the driver supports for the
+ *                           PTP_PEROUT_REQUEST ioctl. The PTP core will
+ *                           reject a request with any flag not specified
+ *                           here.
+ *
  * @supported_extts_flags:  The set of flags the driver supports for the
  *                          PTP_EXTTS_REQUEST ioctl. The PTP core will use
  *                          this list to reject unsupported requests.
@@ -185,6 +190,7 @@ struct ptp_clock_info {
 	int n_per_out;
 	int n_pins;
 	int pps;
+	unsigned int supported_perout_flags;
 	unsigned int supported_extts_flags;
 	struct ptp_pin_desc *pin_config;
 	int (*adjfine)(struct ptp_clock_info *ptp, long scaled_ppm);
-- 
cgit v1.2.3


From 43eca05b6a3b917c600e10cc6b06bfa57fa57401 Mon Sep 17 00:00:00 2001
From: Cosmin Ratiu <cratiu@nvidia.com>
Date: Fri, 11 Apr 2025 10:49:56 +0300
Subject: xfrm: Add explicit dev to .xdo_dev_state_{add,delete,free}

Previously, device driver IPSec offload implementations would fall into
two categories:
1. Those that used xso.dev to determine the offload device.
2. Those that used xso.real_dev to determine the offload device.

The first category didn't work with bonding while the second did.
In a non-bonding setup the two pointers are the same.

This commit adds explicit pointers for the offload netdevice to
.xdo_dev_state_add() / .xdo_dev_state_delete() / .xdo_dev_state_free()
which eliminates the confusion and allows drivers from the first
category to work with bonding.

xso.real_dev now becomes a private pointer managed by the bonding
driver.

Signed-off-by: Cosmin Ratiu <cratiu@nvidia.com>
Reviewed-by: Leon Romanovsky <leonro@nvidia.com>
Reviewed-by: Nikolay Aleksandrov <razor@blackwall.org>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 include/linux/netdevice.h | 10 +++++++---
 include/net/xfrm.h        |  8 ++++++++
 2 files changed, 15 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index d8544f6a680c..88dfb8aeed3c 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1013,9 +1013,13 @@ struct netdev_bpf {
 
 #ifdef CONFIG_XFRM_OFFLOAD
 struct xfrmdev_ops {
-	int	(*xdo_dev_state_add) (struct xfrm_state *x, struct netlink_ext_ack *extack);
-	void	(*xdo_dev_state_delete) (struct xfrm_state *x);
-	void	(*xdo_dev_state_free) (struct xfrm_state *x);
+	int	(*xdo_dev_state_add)(struct net_device *dev,
+				     struct xfrm_state *x,
+				     struct netlink_ext_ack *extack);
+	void	(*xdo_dev_state_delete)(struct net_device *dev,
+					struct xfrm_state *x);
+	void	(*xdo_dev_state_free)(struct net_device *dev,
+				      struct xfrm_state *x);
 	bool	(*xdo_dev_offload_ok) (struct sk_buff *skb,
 				       struct xfrm_state *x);
 	void	(*xdo_dev_state_advance_esn) (struct xfrm_state *x);
diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 39365fd2ea17..3d2f6c879311 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -147,8 +147,16 @@ enum {
 };
 
 struct xfrm_dev_offload {
+	/* The device for this offload.
+	 * Device drivers should not use this directly, as that will prevent
+	 * them from working with bonding device. Instead, the device passed
+	 * to the add/delete callbacks should be used.
+	 */
 	struct net_device	*dev;
 	netdevice_tracker	dev_tracker;
+	/* This is a private pointer used by the bonding driver.
+	 * Device drivers should not use it.
+	 */
 	struct net_device	*real_dev;
 	unsigned long		offload_handle;
 	u8			dir : 2;
-- 
cgit v1.2.3


From d2fddbd3479928e52061e1c8dd302006b6283ce8 Mon Sep 17 00:00:00 2001
From: Cosmin Ratiu <cratiu@nvidia.com>
Date: Fri, 11 Apr 2025 10:49:58 +0300
Subject: bonding: Fix multiple long standing offload races

Refactor the bonding ipsec offload operations to fix a number of
long-standing control plane races between state migration and user
deletion and a few other issues.

xfrm state deletion can happen concurrently with
bond_change_active_slave() operation. This manifests itself as a
bond_ipsec_del_sa() call with x->lock held, followed by a
bond_ipsec_free_sa() a bit later from a wq. The alternate path of
these calls coming from xfrm_dev_state_flush() can't happen, as that
needs the RTNL lock and bond_change_active_slave() already holds it.

1. bond_ipsec_del_sa_all() might call xdo_dev_state_delete() a second
   time on an xfrm state that was concurrently killed. This is bad.
2. bond_ipsec_add_sa_all() can add a state on the new device, but
   pending bond_ipsec_free_sa() calls from the old device will then hit
   the WARN_ON() and then, worse, call xdo_dev_state_free() on the new
   device without a corresponding xdo_dev_state_delete().
3. Resolve a sleeping in atomic context introduced by the mentioned
   "Fixes" commit.

bond_ipsec_del_sa_all() and bond_ipsec_add_sa_all() now acquire x->lock
and check for x->km.state to help with problems 1 and 2. And since
xso.real_dev is now a private pointer managed by the bonding driver in
xfrm state, make better use of it to fully fix problems 1 and 2. In
bond_ipsec_del_sa_all(), set xso.real_dev to NULL while holding both the
mutex and x->lock, which makes sure that neither bond_ipsec_del_sa() nor
bond_ipsec_free_sa() could run concurrently.

Fix problem 3 by moving the list cleanup (which requires the mutex) from
bond_ipsec_del_sa() (called from atomic context) to bond_ipsec_free_sa().

Finally, simplify bond_ipsec_del_sa() and bond_ipsec_free_sa() by using
xso->real_dev directly, since it's now protected by locks and can be
trusted to always reflect the offload device.

Fixes: 2aeeef906d5a ("bonding: change ipsec_lock from spin lock to mutex")
Signed-off-by: Cosmin Ratiu <cratiu@nvidia.com>
Reviewed-by: Leon Romanovsky <leonro@nvidia.com>
Reviewed-by: Nikolay Aleksandrov <razor@blackwall.org>
Reviewed-by: Hangbin Liu <liuhangbin@gmail.com>
Tested-by: Hangbin Liu <liuhangbin@gmail.com>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 include/net/xfrm.h | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 3d2f6c879311..b7e8f3f49627 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -154,8 +154,11 @@ struct xfrm_dev_offload {
 	 */
 	struct net_device	*dev;
 	netdevice_tracker	dev_tracker;
-	/* This is a private pointer used by the bonding driver.
-	 * Device drivers should not use it.
+	/* This is a private pointer used by the bonding driver (and eventually
+	 * should be moved there). Device drivers should not use it.
+	 * Protected by xfrm_state.lock AND bond.ipsec_lock in most cases,
+	 * except in the .xdo_dev_state_delete() flow, where only
+	 * xfrm_state.lock is held.
 	 */
 	struct net_device	*real_dev;
 	unsigned long		offload_handle;
-- 
cgit v1.2.3


From cd1fafe7da1f6f2aa25723e317f6e8e9d0c050a1 Mon Sep 17 00:00:00 2001
From: Taehee Yoo <ap420073@gmail.com>
Date: Tue, 15 Apr 2025 05:24:58 +0000
Subject: eth: bnxt: add support rx side device memory TCP

The bnxt_en driver already satisfies the requirement of Device memory
TCP, which is HDS.
So, implement rx-side Device memory TCP for the bnxt_en driver.
It requires only converting the page API to netmem API.
`struct page` of agg rings are changed to `netmem_ref netmem` and
corresponding functions are changed to a variant of netmem API.

It also passes PP_FLAG_ALLOW_UNREADABLE_NETMEM flag to a parameter of
page_pool.
The netmem will be activated only when a user requests devmem TCP.

When netmem is activated, received data is unreadable; when netmem is
disabled, received data is readable.
But drivers don't need to handle both cases because netmem core API will
handle it properly.
So, using proper netmem API is enough for drivers.

Device memory TCP can be tested with
tools/testing/selftests/drivers/net/hw/ncdevmem.
This is tested with BCM57504-N425G and firmware version 232.0.155.8/pkg
232.1.132.8.

Reviewed-by: Mina Almasry <almasrymina@google.com>
Tested-by: David Wei <dw@davidwei.uk>
Signed-off-by: Taehee Yoo <ap420073@gmail.com>
Link: https://patch.msgid.link/20250415052458.1260575-1-ap420073@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/page_pool/helpers.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'include')

diff --git a/include/net/page_pool/helpers.h b/include/net/page_pool/helpers.h
index 582a3d00cbe2..93f2c31baf9b 100644
--- a/include/net/page_pool/helpers.h
+++ b/include/net/page_pool/helpers.h
@@ -395,6 +395,12 @@ static inline void page_pool_recycle_direct(struct page_pool *pool,
 	page_pool_put_full_page(pool, page, true);
 }
 
+static inline void page_pool_recycle_direct_netmem(struct page_pool *pool,
+						   netmem_ref netmem)
+{
+	page_pool_put_full_netmem(pool, netmem, true);
+}
+
 #define PAGE_POOL_32BIT_ARCH_WITH_64BIT_DMA	\
 		(sizeof(dma_addr_t) > sizeof(unsigned long))
 
@@ -492,4 +498,9 @@ static inline void page_pool_nid_changed(struct page_pool *pool, int new_nid)
 		page_pool_update_nid(pool, new_nid);
 }
 
+static inline bool page_pool_is_unreadable(struct page_pool *pool)
+{
+	return !!pool->mp_ops;
+}
+
 #endif /* _NET_PAGE_POOL_HELPERS_H */
-- 
cgit v1.2.3


From ab244a394c7f13f6573744b9ca72bb22151a3ec4 Mon Sep 17 00:00:00 2001
From: Chiachang Wang <chiachangwang@google.com>
Date: Thu, 13 Mar 2025 02:36:40 +0000
Subject: xfrm: Migrate offload configuration

Add hardware offload configuration to XFRM_MSG_MIGRATE
using an optional netlink attribute XFRMA_OFFLOAD_DEV.

In the existing xfrm_state_migrate(), xfrm_init_state()
is called assuming no hardware offload by default. Even if the
original xfrm_state is configured with offload, the setting will
be reset. If the device is configured with hardware offload,
it's reasonable to allow the device to maintain its hardware
offload mode. But the device will end up with offload disabled
after receiving a migration event when the device migrates the
connection from one netdev to another one.

The devices that support migration may work with different
underlying networks, such as mobile devices. The hardware setting
should be forwarded to the different netdev based on the
migration configuration. This change provides the capability
for user space to migrate from one netdev to another.

Test: Tested with kernel test in the Android tree located
      in https://android.googlesource.com/kernel/tests/
      The xfrm_tunnel_test.py under the tests folder in
      particular.
Signed-off-by: Chiachang Wang <chiachangwang@google.com>
Reviewed-by: Leon Romanovsky <leonro@nvidia.com>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 include/net/xfrm.h | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index b7e8f3f49627..466423a1a70a 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -1904,12 +1904,16 @@ struct xfrm_state *xfrm_migrate_state_find(struct xfrm_migrate *m, struct net *n
 						u32 if_id);
 struct xfrm_state *xfrm_state_migrate(struct xfrm_state *x,
 				      struct xfrm_migrate *m,
-				      struct xfrm_encap_tmpl *encap);
+				      struct xfrm_encap_tmpl *encap,
+				      struct net *net,
+				      struct xfrm_user_offload *xuo,
+				      struct netlink_ext_ack *extack);
 int xfrm_migrate(const struct xfrm_selector *sel, u8 dir, u8 type,
 		 struct xfrm_migrate *m, int num_bundles,
 		 struct xfrm_kmaddress *k, struct net *net,
 		 struct xfrm_encap_tmpl *encap, u32 if_id,
-		 struct netlink_ext_ack *extack);
+		 struct netlink_ext_ack *extack,
+		 struct xfrm_user_offload *xuo);
 #endif
 
 int km_new_mapping(struct xfrm_state *x, xfrm_address_t *ipaddr, __be16 sport);
-- 
cgit v1.2.3


From b7a63391aa982295bbb3125e7d4470f51f31ff0f Mon Sep 17 00:00:00 2001
From: Antonio Quartulli <antonio@openvpn.net>
Date: Tue, 15 Apr 2025 13:17:19 +0200
Subject: ovpn: add basic netlink support

This commit introduces basic netlink support with family
registration/unregistration functionalities and stub pre/post-doit.

More importantly it introduces the YAML uAPI description along
with its auto-generated files:
- include/uapi/linux/ovpn.h
- drivers/net/ovpn/netlink-gen.c
- drivers/net/ovpn/netlink-gen.h

Reviewed-by: Donald Hunter <donald.hunter@gmail.com>
Signed-off-by: Antonio Quartulli <antonio@openvpn.net>
Link: https://patch.msgid.link/20250415-b4-ovpn-v26-2-577f6097b964@openvpn.net
Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Tested-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/uapi/linux/ovpn.h | 109 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 109 insertions(+)
 create mode 100644 include/uapi/linux/ovpn.h

(limited to 'include')

diff --git a/include/uapi/linux/ovpn.h b/include/uapi/linux/ovpn.h
new file mode 100644
index 000000000000..680d1522dc87
--- /dev/null
+++ b/include/uapi/linux/ovpn.h
@@ -0,0 +1,109 @@
+/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */
+/* Do not edit directly, auto-generated from: */
+/*	Documentation/netlink/specs/ovpn.yaml */
+/* YNL-GEN uapi header */
+
+#ifndef _UAPI_LINUX_OVPN_H
+#define _UAPI_LINUX_OVPN_H
+
+#define OVPN_FAMILY_NAME	"ovpn"
+#define OVPN_FAMILY_VERSION	1
+
+#define OVPN_NONCE_TAIL_SIZE	8
+
+enum ovpn_cipher_alg {
+	OVPN_CIPHER_ALG_NONE,
+	OVPN_CIPHER_ALG_AES_GCM,
+	OVPN_CIPHER_ALG_CHACHA20_POLY1305,
+};
+
+enum ovpn_del_peer_reason {
+	OVPN_DEL_PEER_REASON_TEARDOWN,
+	OVPN_DEL_PEER_REASON_USERSPACE,
+	OVPN_DEL_PEER_REASON_EXPIRED,
+	OVPN_DEL_PEER_REASON_TRANSPORT_ERROR,
+	OVPN_DEL_PEER_REASON_TRANSPORT_DISCONNECT,
+};
+
+enum ovpn_key_slot {
+	OVPN_KEY_SLOT_PRIMARY,
+	OVPN_KEY_SLOT_SECONDARY,
+};
+
+enum {
+	OVPN_A_PEER_ID = 1,
+	OVPN_A_PEER_REMOTE_IPV4,
+	OVPN_A_PEER_REMOTE_IPV6,
+	OVPN_A_PEER_REMOTE_IPV6_SCOPE_ID,
+	OVPN_A_PEER_REMOTE_PORT,
+	OVPN_A_PEER_SOCKET,
+	OVPN_A_PEER_SOCKET_NETNSID,
+	OVPN_A_PEER_VPN_IPV4,
+	OVPN_A_PEER_VPN_IPV6,
+	OVPN_A_PEER_LOCAL_IPV4,
+	OVPN_A_PEER_LOCAL_IPV6,
+	OVPN_A_PEER_LOCAL_PORT,
+	OVPN_A_PEER_KEEPALIVE_INTERVAL,
+	OVPN_A_PEER_KEEPALIVE_TIMEOUT,
+	OVPN_A_PEER_DEL_REASON,
+	OVPN_A_PEER_VPN_RX_BYTES,
+	OVPN_A_PEER_VPN_TX_BYTES,
+	OVPN_A_PEER_VPN_RX_PACKETS,
+	OVPN_A_PEER_VPN_TX_PACKETS,
+	OVPN_A_PEER_LINK_RX_BYTES,
+	OVPN_A_PEER_LINK_TX_BYTES,
+	OVPN_A_PEER_LINK_RX_PACKETS,
+	OVPN_A_PEER_LINK_TX_PACKETS,
+
+	__OVPN_A_PEER_MAX,
+	OVPN_A_PEER_MAX = (__OVPN_A_PEER_MAX - 1)
+};
+
+enum {
+	OVPN_A_KEYCONF_PEER_ID = 1,
+	OVPN_A_KEYCONF_SLOT,
+	OVPN_A_KEYCONF_KEY_ID,
+	OVPN_A_KEYCONF_CIPHER_ALG,
+	OVPN_A_KEYCONF_ENCRYPT_DIR,
+	OVPN_A_KEYCONF_DECRYPT_DIR,
+
+	__OVPN_A_KEYCONF_MAX,
+	OVPN_A_KEYCONF_MAX = (__OVPN_A_KEYCONF_MAX - 1)
+};
+
+enum {
+	OVPN_A_KEYDIR_CIPHER_KEY = 1,
+	OVPN_A_KEYDIR_NONCE_TAIL,
+
+	__OVPN_A_KEYDIR_MAX,
+	OVPN_A_KEYDIR_MAX = (__OVPN_A_KEYDIR_MAX - 1)
+};
+
+enum {
+	OVPN_A_IFINDEX = 1,
+	OVPN_A_PEER,
+	OVPN_A_KEYCONF,
+
+	__OVPN_A_MAX,
+	OVPN_A_MAX = (__OVPN_A_MAX - 1)
+};
+
+enum {
+	OVPN_CMD_PEER_NEW = 1,
+	OVPN_CMD_PEER_SET,
+	OVPN_CMD_PEER_GET,
+	OVPN_CMD_PEER_DEL,
+	OVPN_CMD_PEER_DEL_NTF,
+	OVPN_CMD_KEY_NEW,
+	OVPN_CMD_KEY_GET,
+	OVPN_CMD_KEY_SWAP,
+	OVPN_CMD_KEY_SWAP_NTF,
+	OVPN_CMD_KEY_DEL,
+
+	__OVPN_CMD_MAX,
+	OVPN_CMD_MAX = (__OVPN_CMD_MAX - 1)
+};
+
+#define OVPN_MCGRP_PEERS	"peers"
+
+#endif /* _UAPI_LINUX_OVPN_H */
-- 
cgit v1.2.3


From c2d950c4672a012ea9765c15a389cdcdf919f652 Mon Sep 17 00:00:00 2001
From: Antonio Quartulli <antonio@openvpn.net>
Date: Tue, 15 Apr 2025 13:17:20 +0200
Subject: ovpn: add basic interface creation/destruction/management routines

Add basic infrastructure for handling ovpn interfaces.

Tested-by: Donald Hunter <donald.hunter@gmail.com>
Signed-off-by: Antonio Quartulli <antonio@openvpn.net>
Link: https://patch.msgid.link/20250415-b4-ovpn-v26-3-577f6097b964@openvpn.net
Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Tested-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/uapi/linux/if_link.h | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 318386cc5b0d..3ad2d5d98034 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -1986,4 +1986,19 @@ enum {
 
 #define IFLA_DSA_MAX	(__IFLA_DSA_MAX - 1)
 
+/* OVPN section */
+
+enum ovpn_mode {
+	OVPN_MODE_P2P,
+	OVPN_MODE_MP,
+};
+
+enum {
+	IFLA_OVPN_UNSPEC,
+	IFLA_OVPN_MODE,
+	__IFLA_OVPN_MAX,
+};
+
+#define IFLA_OVPN_MAX	(__IFLA_OVPN_MAX - 1)
+
 #endif /* _UAPI_LINUX_IF_LINK_H */
-- 
cgit v1.2.3


From f6226ae7a0cd47aaa9175aca6a1e19600f884cbf Mon Sep 17 00:00:00 2001
From: Antonio Quartulli <antonio@openvpn.net>
Date: Tue, 15 Apr 2025 13:17:23 +0200
Subject: ovpn: introduce the ovpn_socket object

This specific structure is used in the ovpn kernel module
to wrap and carry around a standard kernel socket.

ovpn takes ownership of passed sockets and therefore an ovpn
specific object is attached to them for status tracking
purposes.

Initially only UDP support is introduced. TCP will come in a later
patch.

Cc: willemdebruijn.kernel@gmail.com
Signed-off-by: Antonio Quartulli <antonio@openvpn.net>
Link: https://patch.msgid.link/20250415-b4-ovpn-v26-6-577f6097b964@openvpn.net
Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Tested-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/uapi/linux/udp.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/uapi/linux/udp.h b/include/uapi/linux/udp.h
index d85d671deed3..edca3e430305 100644
--- a/include/uapi/linux/udp.h
+++ b/include/uapi/linux/udp.h
@@ -43,5 +43,6 @@ struct udphdr {
 #define UDP_ENCAP_GTP1U		5 /* 3GPP TS 29.060 */
 #define UDP_ENCAP_RXRPC		6
 #define TCP_ENCAP_ESPINTCP	7 /* Yikes, this is really xfrm encap types. */
+#define UDP_ENCAP_OVPNINUDP	8 /* OpenVPN traffic */
 
 #endif /* _UAPI_LINUX_UDP_H */
-- 
cgit v1.2.3


From 17240749f26e07cafa676688d8a3326086498447 Mon Sep 17 00:00:00 2001
From: Antonio Quartulli <antonio@openvpn.net>
Date: Tue, 15 Apr 2025 13:17:29 +0200
Subject: skb: implement skb_send_sock_locked_with_flags()

When sending an skb over a socket using skb_send_sock_locked(),
it is currently not possible to specify any flag to be set in
msghdr->msg_flags.

However, we may want to pass flags the user may have specified,
like MSG_NOSIGNAL.

Extend __skb_send_sock() with a new argument 'flags' and add a
new interface named skb_send_sock_locked_with_flags().

Cc: Eric Dumazet <edumazet@google.com>
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: Paolo Abeni <pabeni@redhat.com>
Cc: Simon Horman <horms@kernel.org>
Signed-off-by: Antonio Quartulli <antonio@openvpn.net>
Link: https://patch.msgid.link/20250415-b4-ovpn-v26-12-577f6097b964@openvpn.net
Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Tested-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/linux/skbuff.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index f1381aff0f89..beb084ee4f4d 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -4145,6 +4145,8 @@ int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset,
 		    unsigned int flags);
 int skb_send_sock_locked(struct sock *sk, struct sk_buff *skb, int offset,
 			 int len);
+int skb_send_sock_locked_with_flags(struct sock *sk, struct sk_buff *skb,
+				    int offset, int len, int flags);
 int skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, int len);
 void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to);
 unsigned int skb_zerocopy_headlen(const struct sk_buff *from);
-- 
cgit v1.2.3


From a1b669ea16c4d7c1a1a8fc7e25aaf651ea0078c3 Mon Sep 17 00:00:00 2001
From: Amery Hung <ameryhung@gmail.com>
Date: Wed, 9 Apr 2025 14:45:57 -0700
Subject: bpf: Prepare to reuse get_ctx_arg_idx
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Rename get_ctx_arg_idx to btf_ctx_arg_idx, and allow others to call it.
No functional change.

Signed-off-by: Amery Hung <ameryhung@gmail.com>
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Acked-by: Toke Høiland-Jørgensen <toke@redhat.com>
Link: https://patch.msgid.link/20250409214606.2000194-2-ameryhung@gmail.com
---
 include/linux/btf.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/linux/btf.h b/include/linux/btf.h
index ebc0c0c9b944..b2983706292f 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -522,6 +522,7 @@ bool btf_param_match_suffix(const struct btf *btf,
 			    const char *suffix);
 int btf_ctx_arg_offset(const struct btf *btf, const struct btf_type *func_proto,
 		       u32 arg_no);
+u32 btf_ctx_arg_idx(struct btf *btf, const struct btf_type *func_proto, int off);
 
 struct bpf_verifier_log;
 
-- 
cgit v1.2.3


From 151e13ece86d234213b7f224f0e26a957c0eeb3e Mon Sep 17 00:00:00 2001
From: Kees Cook <kees@kernel.org>
Date: Tue, 15 Apr 2025 18:02:15 -0700
Subject: net: ethtool: Adjust exactly ETH_GSTRING_LEN-long stats to use memcpy

Many drivers populate the stats buffer using C-String based APIs (e.g.
ethtool_sprintf() and ethtool_puts()), usually when building up the
list of stats individually (i.e. with a for() loop). This, however,
requires that the source strings be populated in such a way as to have
a terminating NUL byte in the source.

Other drivers populate the stats buffer directly using one big memcpy()
of an entire array of strings. No NUL termination is needed here, as the
bytes are being directly passed through. Yet others will build up the
stats buffer individually, but also use memcpy(). This, too, does not
need NUL termination of the source strings.

However, there are cases where the strings that populate the
source stats strings are exactly ETH_GSTRING_LEN long, and GCC
15's -Wunterminated-string-initialization option complains that the
trailing NUL byte has been truncated. This situation is fine only if the
driver is using the memcpy() approach. If the C-String APIs are used,
the destination string name will have its final byte truncated by the
required trailing NUL byte applied by the C-string API.

For drivers that are already using memcpy() but have initializers that
truncate the NUL terminator, mark their source strings as __nonstring to
silence the GCC warnings.

For drivers that have initializers that truncate the NUL terminator and
are using the C-String APIs, switch to memcpy() to avoid destination
string truncation and mark their source strings as __nonstring to silence
the GCC warnings. (Also introduce ethtool_cpy() as a helper to make this
an easy replacement).

Specifically the following warnings were investigated and addressed:

../drivers/net/ethernet/chelsio/cxgb/cxgb2.c:364:9: warning: initializer-string for array of 'char' truncates NUL terminator but destination lacks 'nonstring' attribute (33 chars into 32 available) [-Wunterminated-string-initialization]
  364 |         "TxFramesAbortedDueToXSCollisions",
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
../drivers/net/ethernet/freescale/enetc/enetc_ethtool.c:165:33: warning: initializer-string for array of 'char' truncates NUL terminator but destination lacks 'nonstring' attribute (33 chars into 32 available) [-Wunterminated-string-initialization]
  165 |         { ENETC_PM_R1523X(0),   "MAC rx 1523 to max-octet packets" },
      |                                 ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
../drivers/net/ethernet/freescale/enetc/enetc_ethtool.c:190:33: warning: initializer-string for array of 'char' truncates NUL terminator but destination lacks 'nonstring' attribute (33 chars into 32 available) [-Wunterminated-string-initialization]
  190 |         { ENETC_PM_T1523X(0),   "MAC tx 1523 to max-octet packets" },
      |                                 ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
../drivers/net/ethernet/google/gve/gve_ethtool.c:76:9: warning: initializer-string for array of 'char' truncates NUL terminator but destination lacks 'nonstring' attribute (33 chars into 32 available) [-Wunterminated-string-initialization]
   76 |         "adminq_dcfg_device_resources_cnt", "adminq_set_driver_parameter_cnt",
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
../drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c:117:53: warning: initializer-string for array of 'char' truncates NUL terminator but destination lacks 'nonstring' attribute (33 chars into 32 available) [-Wunterminated-string-initialization]
  117 |         STMMAC_STAT(ptp_rx_msg_type_pdelay_follow_up),
      |                                                     ^
../drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c:46:12: note: in definition of macro 'STMMAC_STAT'
   46 |         { #m, sizeof_field(struct stmmac_extra_stats, m),       \
      |            ^
../drivers/net/ethernet/mellanox/mlxsw/spectrum_ethtool.c:328:24: warning: initializer-string for array of 'char' truncates NUL terminator but destination lacks 'nonstring' attribute (33 chars into 32 available) [-Wunterminated-string-initialization]
  328 |                 .str = "a_mac_control_frames_transmitted",
      |                        ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
../drivers/net/ethernet/mellanox/mlxsw/spectrum_ethtool.c:340:24: warning: initializer-string for array of 'char' truncates NUL terminator but destination lacks 'nonstring' attribute (33 chars into 32 available) [-Wunterminated-string-initialization]
  340 |                 .str = "a_pause_mac_ctrl_frames_received",
      |                        ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Signed-off-by: Kees Cook <kees@kernel.org>
Reviewed-by: Petr Machata <petrm@nvidia.com> # for mlxsw
Reviewed-by: Harshitha Ramamurthy <hramamurthy@google.com>
Link: https://patch.msgid.link/20250416010210.work.904-kees@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/ethtool.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'include')

diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index 013d25858642..7edb5f5e7134 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -1330,6 +1330,17 @@ extern __printf(2, 3) void ethtool_sprintf(u8 **data, const char *fmt, ...);
  */
 extern void ethtool_puts(u8 **data, const char *str);
 
+/**
+ * ethtool_cpy - Write possibly-not-NUL-terminated string to ethtool string data
+ * @data: Pointer to a pointer to the start of string to write into
+ * @str: NUL-byte padded char array of size ETH_GSTRING_LEN to copy from
+ */
+#define ethtool_cpy(data, str)	do {				\
+	BUILD_BUG_ON(sizeof(str) != ETH_GSTRING_LEN);		\
+	memcpy(*(data), str, ETH_GSTRING_LEN);			\
+	*(data) += ETH_GSTRING_LEN;				\
+} while (0)
+
 /* Link mode to forced speed capabilities maps */
 struct ethtool_forced_speed_map {
 	u32		speed;
-- 
cgit v1.2.3


From 22cbc1ee268b7ec0000848708944daa61c6e4909 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Tue, 15 Apr 2025 20:04:47 -0700
Subject: netdev: fix the locking for netdev notifications

Kuniyuki reports that the assert for netdev lock fires when
there are netdev event listeners (otherwise we skip the netlink
event generation).

Correct the locking when coming from the notifier.

The NETDEV_XDP_FEAT_CHANGE notifier is already fully locked,
it's the documentation that's incorrect.

Fixes: 99e44f39a8f7 ("netdev: depend on netdev->lock for xdp features")
Reported-by: syzkaller <syzkaller@googlegroups.com>
Reported-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Link: https://lore.kernel.org/20250410171019.62128-1-kuniyu@amazon.com
Acked-by: Stanislav Fomichev <sdf@fomichev.me>
Link: https://patch.msgid.link/20250416030447.1077551-1-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/netdevice.h |  2 +-
 include/net/netdev_lock.h | 16 ++++++++++++++++
 2 files changed, 17 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index e6036b82ef4c..0321fd952f70 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2520,7 +2520,7 @@ struct net_device {
 	 *	@net_shaper_hierarchy, @reg_state, @threaded
 	 *
 	 * Double protects:
-	 *	@up, @moving_ns, @nd_net, @xdp_flags
+	 *	@up, @moving_ns, @nd_net, @xdp_features
 	 *
 	 * Double ops protects:
 	 *	@real_num_rx_queues, @real_num_tx_queues
diff --git a/include/net/netdev_lock.h b/include/net/netdev_lock.h
index 5706835a660c..2a753813f849 100644
--- a/include/net/netdev_lock.h
+++ b/include/net/netdev_lock.h
@@ -48,6 +48,22 @@ static inline void netdev_unlock_ops(struct net_device *dev)
 		netdev_unlock(dev);
 }
 
+static inline void netdev_lock_ops_to_full(struct net_device *dev)
+{
+	if (netdev_need_ops_lock(dev))
+		netdev_assert_locked(dev);
+	else
+		netdev_lock(dev);
+}
+
+static inline void netdev_unlock_full_to_ops(struct net_device *dev)
+{
+	if (netdev_need_ops_lock(dev))
+		netdev_assert_locked(dev);
+	else
+		netdev_unlock(dev);
+}
+
 static inline void netdev_ops_assert_locked(const struct net_device *dev)
 {
 	if (netdev_need_ops_lock(dev))
-- 
cgit v1.2.3


From 2b905deb43ea0b67fa8448fc9c15dacb068f45b6 Mon Sep 17 00:00:00 2001
From: Zijun Hu <quic_zijuhu@quicinc.com>
Date: Wed, 16 Apr 2025 19:56:23 +0800
Subject: net: Delete the outer () duplicated of macro SOCK_SKB_CB_OFFSET
 definition

Delete the duplicated outer parentheses in the definition of macro
SOCK_SKB_CB_OFFSET.

Signed-off-by: Zijun Hu <quic_zijuhu@quicinc.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Link: https://patch.msgid.link/20250416-fix_net-v1-1-d544c9f3f169@quicinc.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/sock.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/sock.h b/include/net/sock.h
index bb4d6189292f..e223102337c7 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -2604,8 +2604,8 @@ struct sock_skb_cb {
  * using skb->cb[] would keep using it directly and utilize its
  * alignment guarantee.
  */
-#define SOCK_SKB_CB_OFFSET ((sizeof_field(struct sk_buff, cb) - \
-			    sizeof(struct sock_skb_cb)))
+#define SOCK_SKB_CB_OFFSET (sizeof_field(struct sk_buff, cb) - \
+			    sizeof(struct sock_skb_cb))
 
 #define SOCK_SKB_CB(__skb) ((struct sock_skb_cb *)((__skb)->cb + \
 			    SOCK_SKB_CB_OFFSET))
-- 
cgit v1.2.3


From 1df4a945444f071a9c5e09580a485919c42d4de5 Mon Sep 17 00:00:00 2001
From: Breno Leitao <leitao@debian.org>
Date: Wed, 16 Apr 2025 10:06:12 -0700
Subject: trace: tcp: Add const qualifier to skb parameter in tcp_probe event

Change the tcp_probe tracepoint to accept a const struct sk_buff
parameter instead of a non-const one. This improves type safety and
better reflects that the skb is not modified within the tracepoint
implementation.

Signed-off-by: Breno Leitao <leitao@debian.org>
Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Link: https://patch.msgid.link/20250416-tcp_probe-v1-1-1edc3c5a1cb8@debian.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/trace/events/tcp.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h
index 75d3d53a3832..53e878fa14d1 100644
--- a/include/trace/events/tcp.h
+++ b/include/trace/events/tcp.h
@@ -293,7 +293,7 @@ DECLARE_TRACE(tcp_cwnd_reduction_tp,
 
 TRACE_EVENT(tcp_probe,
 
-	TP_PROTO(struct sock *sk, struct sk_buff *skb),
+	TP_PROTO(struct sock *sk, const struct sk_buff *skb),
 
 	TP_ARGS(sk, skb),
 
-- 
cgit v1.2.3


From 8066e388be48f1ad62b0449dc1d31a25489fa12a Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Wed, 16 Apr 2025 13:08:40 -0700
Subject: net: add UAPI to the header guard in various network headers

fib_rule, ip6_tunnel, and a whole lot of if_* headers lack the customary
_UAPI in the header guard. Without it YNL build can't protect from in tree
and system headers both getting included. YNL doesn't need most of these
but it's annoying to have to fix them one by one.

Note that header installation strips this _UAPI prefix so this should
result in no change to the end user.

Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Reviewed-by: Jason Xing <kerneljasonxing@gmail.com>
Reviewed-by: Ido Schimmel <idosch@nvidia.com>
Link: https://patch.msgid.link/20250416200840.1338195-1-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/uapi/linux/fib_rules.h    | 4 ++--
 include/uapi/linux/if_addr.h      | 4 ++--
 include/uapi/linux/if_addrlabel.h | 4 ++--
 include/uapi/linux/if_alg.h       | 6 +++---
 include/uapi/linux/if_arcnet.h    | 6 +++---
 include/uapi/linux/if_bonding.h   | 6 +++---
 include/uapi/linux/if_fc.h        | 6 +++---
 include/uapi/linux/if_hippi.h     | 6 +++---
 include/uapi/linux/if_packet.h    | 4 ++--
 include/uapi/linux/if_plip.h      | 4 ++--
 include/uapi/linux/if_slip.h      | 4 ++--
 include/uapi/linux/if_x25.h       | 6 +++---
 include/uapi/linux/if_xdp.h       | 6 +++---
 include/uapi/linux/ip6_tunnel.h   | 4 ++--
 include/uapi/linux/net_dropmon.h  | 4 ++--
 include/uapi/linux/net_tstamp.h   | 6 +++---
 include/uapi/linux/netlink_diag.h | 4 ++--
 include/uapi/linux/pkt_cls.h      | 4 ++--
 include/uapi/linux/pkt_sched.h    | 4 ++--
 19 files changed, 46 insertions(+), 46 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/fib_rules.h b/include/uapi/linux/fib_rules.h
index 2df6e4035d50..418c4be697ad 100644
--- a/include/uapi/linux/fib_rules.h
+++ b/include/uapi/linux/fib_rules.h
@@ -1,6 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-#ifndef __LINUX_FIB_RULES_H
-#define __LINUX_FIB_RULES_H
+#ifndef _UAPI__LINUX_FIB_RULES_H
+#define _UAPI__LINUX_FIB_RULES_H
 
 #include <linux/types.h>
 #include <linux/rtnetlink.h>
diff --git a/include/uapi/linux/if_addr.h b/include/uapi/linux/if_addr.h
index 1c392dd95a5e..aa7958b4e41d 100644
--- a/include/uapi/linux/if_addr.h
+++ b/include/uapi/linux/if_addr.h
@@ -1,6 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-#ifndef __LINUX_IF_ADDR_H
-#define __LINUX_IF_ADDR_H
+#ifndef _UAPI__LINUX_IF_ADDR_H
+#define _UAPI__LINUX_IF_ADDR_H
 
 #include <linux/types.h>
 #include <linux/netlink.h>
diff --git a/include/uapi/linux/if_addrlabel.h b/include/uapi/linux/if_addrlabel.h
index d1f5974c76e1..e69db764fbba 100644
--- a/include/uapi/linux/if_addrlabel.h
+++ b/include/uapi/linux/if_addrlabel.h
@@ -8,8 +8,8 @@
  *	YOSHIFUJI Hideaki @ USAGI/WIDE <yoshfuji@linux-ipv6.org>
  */
 
-#ifndef __LINUX_IF_ADDRLABEL_H
-#define __LINUX_IF_ADDRLABEL_H
+#ifndef _UAPI__LINUX_IF_ADDRLABEL_H
+#define _UAPI__LINUX_IF_ADDRLABEL_H
 
 #include <linux/types.h>
 
diff --git a/include/uapi/linux/if_alg.h b/include/uapi/linux/if_alg.h
index 0824fbc026a1..b35871cbeed7 100644
--- a/include/uapi/linux/if_alg.h
+++ b/include/uapi/linux/if_alg.h
@@ -11,8 +11,8 @@
  *
  */
 
-#ifndef _LINUX_IF_ALG_H
-#define _LINUX_IF_ALG_H
+#ifndef _UAPI_LINUX_IF_ALG_H
+#define _UAPI_LINUX_IF_ALG_H
 
 #include <linux/types.h>
 
@@ -58,4 +58,4 @@ struct af_alg_iv {
 #define ALG_OP_DECRYPT			0
 #define ALG_OP_ENCRYPT			1
 
-#endif	/* _LINUX_IF_ALG_H */
+#endif	/* _UAPI_LINUX_IF_ALG_H */
diff --git a/include/uapi/linux/if_arcnet.h b/include/uapi/linux/if_arcnet.h
index b122cfac7128..473569eaf692 100644
--- a/include/uapi/linux/if_arcnet.h
+++ b/include/uapi/linux/if_arcnet.h
@@ -14,8 +14,8 @@
  *              2 of the License, or (at your option) any later version.
  */
 
-#ifndef _LINUX_IF_ARCNET_H
-#define _LINUX_IF_ARCNET_H
+#ifndef _UAPI_LINUX_IF_ARCNET_H
+#define _UAPI_LINUX_IF_ARCNET_H
 
 #include <linux/types.h>
 #include <linux/if_ether.h>
@@ -127,4 +127,4 @@ struct archdr {
 	} soft;
 };
 
-#endif				/* _LINUX_IF_ARCNET_H */
+#endif				/* _UAPI_LINUX_IF_ARCNET_H */
diff --git a/include/uapi/linux/if_bonding.h b/include/uapi/linux/if_bonding.h
index d174914a837d..3bcc03f3aa4f 100644
--- a/include/uapi/linux/if_bonding.h
+++ b/include/uapi/linux/if_bonding.h
@@ -41,8 +41,8 @@
  *      - added definitions for various XOR hashing policies
  */
 
-#ifndef _LINUX_IF_BONDING_H
-#define _LINUX_IF_BONDING_H
+#ifndef _UAPI_LINUX_IF_BONDING_H
+#define _UAPI_LINUX_IF_BONDING_H
 
 #include <linux/if.h>
 #include <linux/types.h>
@@ -152,4 +152,4 @@ enum {
 };
 #define BOND_3AD_STAT_MAX (__BOND_3AD_STAT_MAX - 1)
 
-#endif /* _LINUX_IF_BONDING_H */
+#endif /* _UAPI_LINUX_IF_BONDING_H */
diff --git a/include/uapi/linux/if_fc.h b/include/uapi/linux/if_fc.h
index 3e3173282cc3..ff5ab92d16c2 100644
--- a/include/uapi/linux/if_fc.h
+++ b/include/uapi/linux/if_fc.h
@@ -18,8 +18,8 @@
  *		as published by the Free Software Foundation; either version
  *		2 of the License, or (at your option) any later version.
  */
-#ifndef _LINUX_IF_FC_H
-#define _LINUX_IF_FC_H
+#ifndef _UAPI_LINUX_IF_FC_H
+#define _UAPI_LINUX_IF_FC_H
 
 #include <linux/types.h>
 
@@ -49,4 +49,4 @@ struct fcllc {
 	__be16 ethertype;		/* ether type field */
 };
 
-#endif	/* _LINUX_IF_FC_H */
+#endif	/* _UAPI_LINUX_IF_FC_H */
diff --git a/include/uapi/linux/if_hippi.h b/include/uapi/linux/if_hippi.h
index 785a1452a66c..42c4ffd11dae 100644
--- a/include/uapi/linux/if_hippi.h
+++ b/include/uapi/linux/if_hippi.h
@@ -20,8 +20,8 @@
  *		2 of the License, or (at your option) any later version.
  */
  
-#ifndef _LINUX_IF_HIPPI_H
-#define _LINUX_IF_HIPPI_H
+#ifndef _UAPI_LINUX_IF_HIPPI_H
+#define _UAPI_LINUX_IF_HIPPI_H
 
 #include <linux/types.h>
 #include <asm/byteorder.h>
@@ -151,4 +151,4 @@ struct hippi_hdr {
 	struct hippi_snap_hdr	snap;
 } __attribute__((packed));
 
-#endif	/* _LINUX_IF_HIPPI_H */
+#endif	/* _UAPI_LINUX_IF_HIPPI_H */
diff --git a/include/uapi/linux/if_packet.h b/include/uapi/linux/if_packet.h
index 1d2718dd9647..6cd1d7a41dfb 100644
--- a/include/uapi/linux/if_packet.h
+++ b/include/uapi/linux/if_packet.h
@@ -1,6 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-#ifndef __LINUX_IF_PACKET_H
-#define __LINUX_IF_PACKET_H
+#ifndef _UAPI__LINUX_IF_PACKET_H
+#define _UAPI__LINUX_IF_PACKET_H
 
 #include <asm/byteorder.h>
 #include <linux/types.h>
diff --git a/include/uapi/linux/if_plip.h b/include/uapi/linux/if_plip.h
index 495a366112f2..054d86a9c6e6 100644
--- a/include/uapi/linux/if_plip.h
+++ b/include/uapi/linux/if_plip.h
@@ -9,8 +9,8 @@
  *
  */
  
-#ifndef _LINUX_IF_PLIP_H
-#define _LINUX_IF_PLIP_H
+#ifndef _UAPI_LINUX_IF_PLIP_H
+#define _UAPI_LINUX_IF_PLIP_H
 
 #include <linux/sockios.h>
 
diff --git a/include/uapi/linux/if_slip.h b/include/uapi/linux/if_slip.h
index 65937be53103..299bf7adc862 100644
--- a/include/uapi/linux/if_slip.h
+++ b/include/uapi/linux/if_slip.h
@@ -6,8 +6,8 @@
  *	KISS TNC driver.
  */
  
-#ifndef __LINUX_SLIP_H
-#define __LINUX_SLIP_H
+#ifndef _UAPI__LINUX_SLIP_H
+#define _UAPI__LINUX_SLIP_H
 
 #define		SL_MODE_SLIP		0
 #define		SL_MODE_CSLIP		1
diff --git a/include/uapi/linux/if_x25.h b/include/uapi/linux/if_x25.h
index 3a5938e38370..861cfa983db4 100644
--- a/include/uapi/linux/if_x25.h
+++ b/include/uapi/linux/if_x25.h
@@ -13,8 +13,8 @@
  *  GNU General Public License for more details.
  */
 
-#ifndef _IF_X25_H
-#define _IF_X25_H
+#ifndef _UAPI_IF_X25_H
+#define _UAPI_IF_X25_H
 
 #include <linux/types.h>
 
@@ -24,4 +24,4 @@
 #define X25_IFACE_DISCONNECT	0x02
 #define X25_IFACE_PARAMS	0x03
 
-#endif /* _IF_X25_H */
+#endif /* _UAPI_IF_X25_H */
diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h
index 42869770776e..44f2bb93e7e6 100644
--- a/include/uapi/linux/if_xdp.h
+++ b/include/uapi/linux/if_xdp.h
@@ -7,8 +7,8 @@
  *	      Magnus Karlsson <magnus.karlsson@intel.com>
  */
 
-#ifndef _LINUX_IF_XDP_H
-#define _LINUX_IF_XDP_H
+#ifndef _UAPI_LINUX_IF_XDP_H
+#define _UAPI_LINUX_IF_XDP_H
 
 #include <linux/types.h>
 
@@ -180,4 +180,4 @@ struct xdp_desc {
 /* TX packet carries valid metadata. */
 #define XDP_TX_METADATA (1 << 1)
 
-#endif /* _LINUX_IF_XDP_H */
+#endif /* _UAPI_LINUX_IF_XDP_H */
diff --git a/include/uapi/linux/ip6_tunnel.h b/include/uapi/linux/ip6_tunnel.h
index 0245269b037c..85182a839d42 100644
--- a/include/uapi/linux/ip6_tunnel.h
+++ b/include/uapi/linux/ip6_tunnel.h
@@ -1,6 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-#ifndef _IP6_TUNNEL_H
-#define _IP6_TUNNEL_H
+#ifndef _UAPI_IP6_TUNNEL_H
+#define _UAPI_IP6_TUNNEL_H
 
 #include <linux/types.h>
 #include <linux/if.h>		/* For IFNAMSIZ. */
diff --git a/include/uapi/linux/net_dropmon.h b/include/uapi/linux/net_dropmon.h
index 84f622a66a7a..9dd41c2f58a6 100644
--- a/include/uapi/linux/net_dropmon.h
+++ b/include/uapi/linux/net_dropmon.h
@@ -1,6 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-#ifndef __NET_DROPMON_H
-#define __NET_DROPMON_H
+#ifndef _UAPI__NET_DROPMON_H
+#define _UAPI__NET_DROPMON_H
 
 #include <linux/types.h>
 #include <linux/netlink.h>
diff --git a/include/uapi/linux/net_tstamp.h b/include/uapi/linux/net_tstamp.h
index 383213de612a..a93e6ea37fb3 100644
--- a/include/uapi/linux/net_tstamp.h
+++ b/include/uapi/linux/net_tstamp.h
@@ -7,8 +7,8 @@
  *
  */
 
-#ifndef _NET_TIMESTAMPING_H
-#define _NET_TIMESTAMPING_H
+#ifndef _UAPI_NET_TIMESTAMPING_H
+#define _UAPI_NET_TIMESTAMPING_H
 
 #include <linux/types.h>
 #include <linux/socket.h>   /* for SO_TIMESTAMPING */
@@ -216,4 +216,4 @@ struct sock_txtime {
 	__u32			flags;	/* as defined by enum txtime_flags */
 };
 
-#endif /* _NET_TIMESTAMPING_H */
+#endif /* _UAPI_NET_TIMESTAMPING_H */
diff --git a/include/uapi/linux/netlink_diag.h b/include/uapi/linux/netlink_diag.h
index dfa61be43d2f..ff28200204bb 100644
--- a/include/uapi/linux/netlink_diag.h
+++ b/include/uapi/linux/netlink_diag.h
@@ -1,6 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-#ifndef __NETLINK_DIAG_H__
-#define __NETLINK_DIAG_H__
+#ifndef _UAPI__NETLINK_DIAG_H__
+#define _UAPI__NETLINK_DIAG_H__
 
 #include <linux/types.h>
 
diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h
index 2c32080416b5..490821364165 100644
--- a/include/uapi/linux/pkt_cls.h
+++ b/include/uapi/linux/pkt_cls.h
@@ -1,6 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-#ifndef __LINUX_PKT_CLS_H
-#define __LINUX_PKT_CLS_H
+#ifndef _UAPI__LINUX_PKT_CLS_H
+#define _UAPI__LINUX_PKT_CLS_H
 
 #include <linux/types.h>
 #include <linux/pkt_sched.h>
diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
index 25a9a47001cd..9ea874395717 100644
--- a/include/uapi/linux/pkt_sched.h
+++ b/include/uapi/linux/pkt_sched.h
@@ -1,6 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-#ifndef __LINUX_PKT_SCHED_H
-#define __LINUX_PKT_SCHED_H
+#ifndef _UAPI__LINUX_PKT_SCHED_H
+#define _UAPI__LINUX_PKT_SCHED_H
 
 #include <linux/const.h>
 #include <linux/types.h>
-- 
cgit v1.2.3


From 9ff2aa4206eff40a202e425f232036bc84ad4c0e Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Mon, 17 Mar 2025 23:07:30 -0400
Subject: net: ethtool: mm: extract stmmac verification logic into common
 library

It appears that stmmac is not the only hardware which requires a
software-driven verification state machine for the MAC Merge layer.

While on the one hand it's good to encourage hardware implementations,
on the other hand it's quite difficult to tolerate multiple drivers
implementing independently fairly non-trivial logic.

Extract the hardware-independent logic from stmmac into library code and
put it in ethtool. Name the state structure "mmsv" for MAC Merge
Software Verification. Let this expose an operations structure for
executing the hardware stuff: sync hardware with the tx_active boolean
(result of verification process), enable/disable the pMAC, send mPackets,
notify library of external events (reception of mPackets), as well as
link state changes.

Note that it is assumed that the external events are received in hardirq
context. If they are not, it is probably a good idea to disable hardirqs
when calling ethtool_mmsv_event_handle(), because the library does not
do so.

Also, the MM software verification process has no business with the
tx_min_frag_size, that is all the driver's to handle.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Co-developed-by: Choong Yong Liang <yong.liang.choong@linux.intel.com>
Signed-off-by: Choong Yong Liang <yong.liang.choong@linux.intel.com>
Tested-by: Choong Yong Liang <yong.liang.choong@linux.intel.com>
Tested-by: Furong Xu <0x1207@gmail.com>
Reviewed-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: Faizal Rahim <faizal.abdul.rahim@linux.intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
 include/linux/ethtool.h | 73 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 73 insertions(+)

(limited to 'include')

diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index 7edb5f5e7134..117718c24814 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -17,9 +17,13 @@
 #include <linux/compat.h>
 #include <linux/if_ether.h>
 #include <linux/netlink.h>
+#include <linux/timer_types.h>
 #include <uapi/linux/ethtool.h>
 #include <uapi/linux/net_tstamp.h>
 
+#define ETHTOOL_MM_MAX_VERIFY_TIME_MS		128
+#define ETHTOOL_MM_MAX_VERIFY_RETRIES		3
+
 struct compat_ethtool_rx_flow_spec {
 	u32		flow_type;
 	union ethtool_flow_union h_u;
@@ -718,6 +722,75 @@ struct ethtool_mm_stats {
 	u64 MACMergeHoldCount;
 };
 
+enum ethtool_mmsv_event {
+	ETHTOOL_MMSV_LP_SENT_VERIFY_MPACKET,
+	ETHTOOL_MMSV_LD_SENT_VERIFY_MPACKET,
+	ETHTOOL_MMSV_LP_SENT_RESPONSE_MPACKET,
+};
+
+/* MAC Merge verification mPacket type */
+enum ethtool_mpacket {
+	ETHTOOL_MPACKET_VERIFY,
+	ETHTOOL_MPACKET_RESPONSE,
+};
+
+struct ethtool_mmsv;
+
+/**
+ * struct ethtool_mmsv_ops - Operations for MAC Merge Software Verification
+ * @configure_tx: Driver callback for the event where the preemptible TX
+ *		  becomes active or inactive. Preemptible traffic
+ *		  classes must be committed to hardware only while
+ *		  preemptible TX is active.
+ * @configure_pmac: Driver callback for the event where the pMAC state
+ *		    changes as result of an administrative setting
+ *		    (ethtool) or a call to ethtool_mmsv_link_state_handle().
+ * @send_mpacket: Driver-provided method for sending a Verify or a Response
+ *		  mPacket.
+ */
+struct ethtool_mmsv_ops {
+	void (*configure_tx)(struct ethtool_mmsv *mmsv, bool tx_active);
+	void (*configure_pmac)(struct ethtool_mmsv *mmsv, bool pmac_enabled);
+	void (*send_mpacket)(struct ethtool_mmsv *mmsv, enum ethtool_mpacket mpacket);
+};
+
+/**
+ * struct ethtool_mmsv - MAC Merge Software Verification
+ * @ops: operations for MAC Merge Software Verification
+ * @dev: pointer to net_device structure
+ * @lock: serialize access to MAC Merge state between
+ *	  ethtool requests and link state updates.
+ * @status: current verification FSM state
+ * @verify_timer: timer for verification in local TX direction
+ * @verify_enabled: indicates if verification is enabled
+ * @verify_retries: number of retries for verification
+ * @pmac_enabled: indicates if the preemptible MAC is enabled
+ * @verify_time: time for verification in milliseconds
+ * @tx_enabled: indicates if transmission is enabled
+ */
+struct ethtool_mmsv {
+	const struct ethtool_mmsv_ops *ops;
+	struct net_device *dev;
+	spinlock_t lock;
+	enum ethtool_mm_verify_status status;
+	struct timer_list verify_timer;
+	bool verify_enabled;
+	int verify_retries;
+	bool pmac_enabled;
+	u32 verify_time;
+	bool tx_enabled;
+};
+
+void ethtool_mmsv_stop(struct ethtool_mmsv *mmsv);
+void ethtool_mmsv_link_state_handle(struct ethtool_mmsv *mmsv, bool up);
+void ethtool_mmsv_event_handle(struct ethtool_mmsv *mmsv,
+			       enum ethtool_mmsv_event event);
+void ethtool_mmsv_get_mm(struct ethtool_mmsv *mmsv,
+			 struct ethtool_mm_state *state);
+void ethtool_mmsv_set_mm(struct ethtool_mmsv *mmsv, struct ethtool_mm_cfg *cfg);
+void ethtool_mmsv_init(struct ethtool_mmsv *mmsv, struct net_device *dev,
+		       const struct ethtool_mmsv_ops *ops);
+
 /**
  * struct ethtool_rxfh_param - RXFH (RSS) parameters
  * @hfunc: Defines the current RSS hash function used by HW (or to be set to).
-- 
cgit v1.2.3


From 094adad91310d9f8f8485251129482f4f3e2c5b3 Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@nvidia.com>
Date: Tue, 15 Apr 2025 15:11:34 +0300
Subject: vxlan: Use a single lock to protect the FDB table

Currently, the VXLAN driver stores FDB entries in a hash table with a
fixed number of buckets (256). Subsequent patches are going to convert
this table to rhashtable with a linked list for entry traversal, as
rhashtable is more scalable.

In preparation for this conversion, move from a per-bucket spin lock to
a single spin lock that protects the entire FDB table.

The per-bucket spin locks were introduced by commit fe1e0713bbe8
("vxlan: Use FDB_HASH_SIZE hash_locks to reduce contention") citing
"huge contention when inserting/deleting vxlan_fdbs into the fdb_head".

It is not clear from the commit message which code path was holding the
spin lock for long periods of time, but the obvious suspect is the FDB
cleanup routine (vxlan_cleanup()) that periodically traverses the entire
table in order to delete aged-out entries.

This will be solved by subsequent patches that will convert the FDB
cleanup routine to traverse the linked list of FDB entries using RCU,
only acquiring the spin lock when deleting an aged-out entry.

The change reduces the size of the VXLAN device structure from 3600
bytes to 2576 bytes.

Reviewed-by: Petr Machata <petrm@nvidia.com>
Signed-off-by: Ido Schimmel <idosch@nvidia.com>
Link: https://patch.msgid.link/20250415121143.345227-7-idosch@nvidia.com
Reviewed-by: Nikolay Aleksandrov <razor@blackwall.org>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/net/vxlan.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/vxlan.h b/include/net/vxlan.h
index 2dd23ee2bacd..272e11708a33 100644
--- a/include/net/vxlan.h
+++ b/include/net/vxlan.h
@@ -296,7 +296,7 @@ struct vxlan_dev {
 	struct vxlan_rdst default_dst;	/* default destination */
 
 	struct timer_list age_timer;
-	spinlock_t	  hash_lock[FDB_HASH_SIZE];
+	spinlock_t	  hash_lock;
 	unsigned int	  addrcnt;
 	struct gro_cells  gro_cells;
 
-- 
cgit v1.2.3


From 8d45673d2d2e59d03e108c569a3e8c031aa534c8 Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@nvidia.com>
Date: Tue, 15 Apr 2025 15:11:35 +0300
Subject: vxlan: Add a linked list of FDB entries

Currently, FDB entries are stored in a hash table with a fixed number of
buckets. The table is used for both lookups and entry traversal.
Subsequent patches will convert the table to rhashtable which is not
suitable for entry traversal.

In preparation for this conversion, add FDB entries to a linked list.
Subsequent patches will convert the driver to use this list when
traversing entries during dump, flush, etc.

Reviewed-by: Petr Machata <petrm@nvidia.com>
Signed-off-by: Ido Schimmel <idosch@nvidia.com>
Link: https://patch.msgid.link/20250415121143.345227-8-idosch@nvidia.com
Reviewed-by: Nikolay Aleksandrov <razor@blackwall.org>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/net/vxlan.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/net/vxlan.h b/include/net/vxlan.h
index 272e11708a33..96a6c6f45c2e 100644
--- a/include/net/vxlan.h
+++ b/include/net/vxlan.h
@@ -307,6 +307,7 @@ struct vxlan_dev {
 	struct hlist_head fdb_head[FDB_HASH_SIZE];
 
 	struct rhashtable mdb_tbl;
+	struct hlist_head fdb_list;
 	struct hlist_head mdb_list;
 	unsigned int mdb_seq;
 };
-- 
cgit v1.2.3


From 1f763fa808e92a67feea8364ef80ca3065d74702 Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@nvidia.com>
Date: Tue, 15 Apr 2025 15:11:43 +0300
Subject: vxlan: Convert FDB table to rhashtable

FDB entries are currently stored in a hash table with a fixed number of
buckets (256), resulting in performance degradation as the number of
entries grows. Solve this by converting the driver to use rhashtable
which maintains more or less constant performance regardless of the
number of entries.

Measured transmitted packets per second using a single pktgen thread
with varying number of entries when the transmitted packet always hits
the default entry (worst case):

Number of entries | Improvement
------------------|------------
1k                | +1.12%
4k                | +9.22%
16k               | +55%
64k               | +585%
256k              | +2460%

In addition, the change reduces the size of the VXLAN device structure
from 2584 bytes to 672 bytes.

Reviewed-by: Petr Machata <petrm@nvidia.com>
Signed-off-by: Ido Schimmel <idosch@nvidia.com>
Link: https://patch.msgid.link/20250415121143.345227-16-idosch@nvidia.com
Reviewed-by: Nikolay Aleksandrov <razor@blackwall.org>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/net/vxlan.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/vxlan.h b/include/net/vxlan.h
index 96a6c6f45c2e..e2f7ca045d3e 100644
--- a/include/net/vxlan.h
+++ b/include/net/vxlan.h
@@ -304,7 +304,7 @@ struct vxlan_dev {
 
 	struct vxlan_vni_group  __rcu *vnigrp;
 
-	struct hlist_head fdb_head[FDB_HASH_SIZE];
+	struct rhashtable fdb_hash_tbl;
 
 	struct rhashtable mdb_tbl;
 	struct hlist_head fdb_list;
-- 
cgit v1.2.3


From 45bd443bfd8697a7da308c16c3e75e2bb353b3d1 Mon Sep 17 00:00:00 2001
From: "Dr. David Alan Gilbert" <linux@treblig.org>
Date: Fri, 18 Apr 2025 02:15:19 +0100
Subject: net: 802: Remove unused p8022 code

p8022.c defines two external functions, register_8022_client()
and unregister_8022_client(), the last use of which was removed in
2018 by
commit 7a2e838d28cf ("staging: ipx: delete it from the tree")

Remove the p8022.c file, its corresponding header, and glue
surrounding it.  There was one place the header was included in vlan.c
but it didn't use the functions it declared.

There was a comment in net/802/Makefile about checking
against net/core/Makefile, but that's at least 20 years old and
there's no sign of net/core/Makefile mentioning it.

Signed-off-by: Dr. David Alan Gilbert <linux@treblig.org>
Link: https://patch.msgid.link/20250418011519.145320-1-linux@treblig.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/p8022.h | 16 ----------------
 1 file changed, 16 deletions(-)
 delete mode 100644 include/net/p8022.h

(limited to 'include')

diff --git a/include/net/p8022.h b/include/net/p8022.h
deleted file mode 100644
index a29e224ac498..000000000000
--- a/include/net/p8022.h
+++ /dev/null
@@ -1,16 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _NET_P8022_H
-#define _NET_P8022_H
-
-struct net_device;
-struct packet_type;
-struct sk_buff;
-
-struct datalink_proto *
-register_8022_client(unsigned char type,
-		     int (*func)(struct sk_buff *skb,
-				 struct net_device *dev,
-				 struct packet_type *pt,
-				 struct net_device *orig_dev));
-void unregister_8022_client(struct datalink_proto *proto);
-#endif
-- 
cgit v1.2.3


From 145436ae01193c0a379fd3ea9c4fbdf32863db1f Mon Sep 17 00:00:00 2001
From: Dimitri Fedrau <dimitri.fedrau@liebherr.com>
Date: Wed, 16 Apr 2025 19:14:49 +0200
Subject: net: phy: Add helper for getting MAC termination resistance

Add helper which returns the MAC termination resistance value. Modifying
the resistance to an appropriate value can reduce signal reflections and
therefore improve signal quality.

Reviewed-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Signed-off-by: Dimitri Fedrau <dimitri.fedrau@liebherr.com>
Link: https://patch.msgid.link/20250416-dp83822-mac-impedance-v3-3-028ac426cddb@liebherr.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/phy.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include')

diff --git a/include/linux/phy.h b/include/linux/phy.h
index fb755358d965..066a28a4b64b 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -2040,6 +2040,9 @@ int phy_get_tx_amplitude_gain(struct phy_device *phydev, struct device *dev,
 			      enum ethtool_link_mode_bit_indices linkmode,
 			      u32 *val);
 
+int phy_get_mac_termination(struct phy_device *phydev, struct device *dev,
+			    u32 *val);
+
 void phy_resolve_pause(unsigned long *local_adv, unsigned long *partner_adv,
 		       bool *tx_pause, bool *rx_pause);
 
-- 
cgit v1.2.3


From 0e0a7e3719bc8cbe6d6e30b3e81f21472ecba5bc Mon Sep 17 00:00:00 2001
From: Joshua Washington <joshwash@google.com>
Date: Mon, 21 Apr 2025 18:16:32 -0700
Subject: xdp: create locked/unlocked instances of xdp redirect target setters

Commit 03df156dd3a6 ("xdp: double protect netdev->xdp_flags with
netdev->lock") introduces the netdev lock to xdp_set_features_flag().
The change includes a _locked version of the method, as it is possible
for a driver to have already acquired the netdev lock before calling
this helper. However, the same applies to
xdp_features_(set|clear)_redirect_target(), which ends up calling the
unlocked version of xdp_set_features_flag(), leading to deadlocks in
GVE, which grabs the netdev lock as part of its suspend, reset, and
shutdown processes:

[  833.265543] WARNING: possible recursive locking detected
[  833.270949] 6.15.0-rc1 #6 Tainted: G            E
[  833.276271] --------------------------------------------
[  833.281681] systemd-shutdow/1 is trying to acquire lock:
[  833.287090] ffff949d2b148c68 (&dev->lock){+.+.}-{4:4}, at: xdp_set_features_flag+0x29/0x90
[  833.295470]
[  833.295470] but task is already holding lock:
[  833.301400] ffff949d2b148c68 (&dev->lock){+.+.}-{4:4}, at: gve_shutdown+0x44/0x90 [gve]
[  833.309508]
[  833.309508] other info that might help us debug this:
[  833.316130]  Possible unsafe locking scenario:
[  833.316130]
[  833.322142]        CPU0
[  833.324681]        ----
[  833.327220]   lock(&dev->lock);
[  833.330455]   lock(&dev->lock);
[  833.333689]
[  833.333689]  *** DEADLOCK ***
[  833.333689]
[  833.339701]  May be due to missing lock nesting notation
[  833.339701]
[  833.346582] 5 locks held by systemd-shutdow/1:
[  833.351205]  #0: ffffffffa9c89130 (system_transition_mutex){+.+.}-{4:4}, at: __se_sys_reboot+0xe6/0x210
[  833.360695]  #1: ffff93b399e5c1b8 (&dev->mutex){....}-{4:4}, at: device_shutdown+0xb4/0x1f0
[  833.369144]  #2: ffff949d19a471b8 (&dev->mutex){....}-{4:4}, at: device_shutdown+0xc2/0x1f0
[  833.377603]  #3: ffffffffa9eca050 (rtnl_mutex){+.+.}-{4:4}, at: gve_shutdown+0x33/0x90 [gve]
[  833.386138]  #4: ffff949d2b148c68 (&dev->lock){+.+.}-{4:4}, at: gve_shutdown+0x44/0x90 [gve]

Introduce xdp_features_(set|clear)_redirect_target_locked() versions
which assume that the netdev lock has already been acquired before
setting the XDP feature flag and update GVE to use the locked version.

Fixes: 03df156dd3a6 ("xdp: double protect netdev->xdp_flags with netdev->lock")
Tested-by: Mina Almasry <almasrymina@google.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Reviewed-by: Harshitha Ramamurthy <hramamurthy@google.com>
Signed-off-by: Joshua Washington <joshwash@google.com>
Acked-by: Stanislav Fomichev <sdf@fomichev.me>
Acked-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://patch.msgid.link/20250422011643.3509287-1-joshwash@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/xdp.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include')

diff --git a/include/net/xdp.h b/include/net/xdp.h
index 20e41b5ff319..b40f1f96cb11 100644
--- a/include/net/xdp.h
+++ b/include/net/xdp.h
@@ -618,7 +618,10 @@ bool bpf_dev_bound_kfunc_id(u32 btf_id);
 void xdp_set_features_flag(struct net_device *dev, xdp_features_t val);
 void xdp_set_features_flag_locked(struct net_device *dev, xdp_features_t val);
 void xdp_features_set_redirect_target(struct net_device *dev, bool support_sg);
+void xdp_features_set_redirect_target_locked(struct net_device *dev,
+					     bool support_sg);
 void xdp_features_clear_redirect_target(struct net_device *dev);
+void xdp_features_clear_redirect_target_locked(struct net_device *dev);
 #else
 static inline u32 bpf_xdp_metadata_kfunc_id(int id) { return 0; }
 static inline bool bpf_dev_bound_kfunc_id(u32 btf_id) { return false; }
-- 
cgit v1.2.3


From 76a853f86c97b348dc96e75a6e6f94d8750715ee Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Thu, 13 Mar 2025 13:49:39 +0100
Subject: wifi: free SKBTX_WIFI_STATUS skb tx_flags flag

Jason mentioned at netdevconf that we've run out of tx_flags in
the skb_shinfo(). Gain one bit back by removing the wifi bit.

We can do that because the only userspace application for it
(hostapd) doesn't change the setting on the socket, it just
uses different sockets, and normally doesn't even use this any
more, sending the frames over nl80211 instead.

Reviewed-by: Jason Xing <kerneljasonxing@gmail.com>
Link: https://patch.msgid.link/20250313134942.52ff54a140ec.If390bbdc46904cf451256ba989d7a056c457af6e@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/skbuff.h | 3 ---
 include/net/sock.h     | 2 --
 2 files changed, 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index b974a277975a..9ee39670e8f4 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -481,9 +481,6 @@ enum {
 	/* generate software time stamp on packet tx completion */
 	SKBTX_COMPLETION_TSTAMP = 1 << 3,
 
-	/* generate wifi status information (where possible) */
-	SKBTX_WIFI_STATUS = 1 << 4,
-
 	/* determine hardware time stamp based on time or cycles */
 	SKBTX_HW_TSTAMP_NETDEV = 1 << 5,
 
diff --git a/include/net/sock.h b/include/net/sock.h
index 694f954258d4..36b219109790 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -2736,8 +2736,6 @@ static inline void _sock_tx_timestamp(struct sock *sk,
 				*tskey = atomic_inc_return(&sk->sk_tskey) - 1;
 		}
 	}
-	if (unlikely(sock_flag(sk, SOCK_WIFI_STATUS)))
-		*tx_flags |= SKBTX_WIFI_STATUS;
 }
 
 static inline void sock_tx_timestamp(struct sock *sk,
-- 
cgit v1.2.3


From 996c15bd30a9caf5d3a32414a28503f3389fc96e Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Sat, 29 Mar 2025 22:14:20 +0100
Subject: wifi: cfg80211/mac80211: remove more 5/10 MHz code

We still have ieee80211_chandef_rate_flags() and all that,
but all the users seem pretty much broken (deflink, etc.)
Remove all the code. It's been two years since anyone last
even vaguely entertained the notion of looking at this and
fixing it.

Link: https://patch.msgid.link/20250329221419.c31da7ae8c84.I1a3a4b6008134d66ca75a5bdfc004f4594da8145@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h | 37 -------------------------------------
 1 file changed, 37 deletions(-)

(limited to 'include')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index efbd79c67be2..6df4e17f1437 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -1096,43 +1096,6 @@ int cfg80211_chandef_primary(const struct cfg80211_chan_def *chandef,
  **/
 int nl80211_send_chandef(struct sk_buff *msg, const struct cfg80211_chan_def *chandef);
 
-/**
- * ieee80211_chanwidth_rate_flags - return rate flags for channel width
- * @width: the channel width of the channel
- *
- * In some channel types, not all rates may be used - for example CCK
- * rates may not be used in 5/10 MHz channels.
- *
- * Returns: rate flags which apply for this channel width
- */
-static inline enum ieee80211_rate_flags
-ieee80211_chanwidth_rate_flags(enum nl80211_chan_width width)
-{
-	switch (width) {
-	case NL80211_CHAN_WIDTH_5:
-		return IEEE80211_RATE_SUPPORTS_5MHZ;
-	case NL80211_CHAN_WIDTH_10:
-		return IEEE80211_RATE_SUPPORTS_10MHZ;
-	default:
-		break;
-	}
-	return 0;
-}
-
-/**
- * ieee80211_chandef_rate_flags - returns rate flags for a channel
- * @chandef: channel definition for the channel
- *
- * See ieee80211_chanwidth_rate_flags().
- *
- * Returns: rate flags which apply for this channel
- */
-static inline enum ieee80211_rate_flags
-ieee80211_chandef_rate_flags(struct cfg80211_chan_def *chandef)
-{
-	return ieee80211_chanwidth_rate_flags(chandef->width);
-}
-
 /**
  * ieee80211_chandef_max_power - maximum transmission power for the chandef
  *
-- 
cgit v1.2.3


From 4876376988081d636a4c4e5f03a5556386b49087 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= <toke@toke.dk>
Date: Thu, 3 Apr 2025 20:39:28 +0200
Subject: Revert "mac80211: Dynamically set CoDel parameters per station"
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This reverts commit 484a54c2e597dbc4ace79c1687022282905afba0. The CoDel
parameter change essentially disables CoDel on slow stations, with some
questionable assumptions, as Dave pointed out in [0]. Quoting from
there:

  But here are my pithy comments as to why this part of mac80211 is so
  wrong...

   static void sta_update_codel_params(struct sta_info *sta, u32 thr)
   {
  -       if (thr && thr < STA_SLOW_THRESHOLD * sta->local->num_sta) {

  1) sta->local->num_sta is the number of associated, rather than
  active, stations. "Active" stations in the last 50ms or so, might have
  been a better thing to use, but as most people have far more than that
  associated, we end up with really lousy codel parameters, all the
  time. Mistake numero uno!

  2) The STA_SLOW_THRESHOLD was completely arbitrary in 2016.

  -               sta->cparams.target = MS2TIME(50);

  This, by itself, was probably not too bad. 30ms might have been
  better, at the time, when we were battling powersave etc, but 20ms was
  enough, really, to cover most scenarios, even where we had low rate
  2Ghz multicast to cope with. Even then, codel has a hard time finding
  any sane drop rate at all, with a target this high.

  -               sta->cparams.interval = MS2TIME(300);

  But this was horrible, a total mistake, that is leading to codel being
  completely ineffective in almost any scenario on clients or APS.
  100ms, even 80ms, here, would be vastly better than this insanity. I'm
  seeing 5+seconds of delay accumulated in a bunch of otherwise happily
  fq-ing APs....

  100ms of observed jitter during a flow is enough. Certainly (in 2016)
  there were interactions with powersave that I did not understand, and
  still don't, but if you are transmitting in the first place, powersave
  shouldn't be a problemmmm.....

  -               sta->cparams.ecn = false;

  At the time we were pretty nervous about ecn, I'm kind of sanguine
  about it now, and reliably indicating ecn seems better than turning it
  off for any reason.

  [...]

  In production, on p2p wireless, I've had 8ms and 80ms for target and
  interval for years now, and it works great.

I think Dave's arguments above are basically sound on the face of it,
and various experiments with tighter CoDel parameters in the OpenWrt
community have shown promising results[1]. So I don't think there's any
reason to keep this parameter fiddling; hence this revert.

[0] https://lore.kernel.org/linux-wireless/CAA93jw6NJ2cmLmMauz0xAgC2MGbBq6n0ZiZzAdkK0u4b+O2yXg@mail.gmail.com/
[1] https://forum.openwrt.org/t/reducing-multiplexing-latencies-still-further-in-wifi/133605/130

Suggested-By: Dave Taht <dave.taht@gmail.com>
In-memory-of: Dave Taht <dave.taht@gmail.com>
Signed-off-by: Toke Høiland-Jørgensen <toke@toke.dk>
Link: https://patch.msgid.link/20250403183930.197716-1-toke@toke.dk
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h | 16 ----------------
 1 file changed, 16 deletions(-)

(limited to 'include')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index c498f685d01f..5349df596157 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -5346,22 +5346,6 @@ void ieee80211_get_tx_rates(struct ieee80211_vif *vif,
 			    struct ieee80211_tx_rate *dest,
 			    int max_rates);
 
-/**
- * ieee80211_sta_set_expected_throughput - set the expected tpt for a station
- *
- * Call this function to notify mac80211 about a change in expected throughput
- * to a station. A driver for a device that does rate control in firmware can
- * call this function when the expected throughput estimate towards a station
- * changes. The information is used to tune the CoDel AQM applied to traffic
- * going towards that station (which can otherwise be too aggressive and cause
- * slow stations to starve).
- *
- * @pubsta: the station to set throughput for.
- * @thr: the current expected throughput in kbps.
- */
-void ieee80211_sta_set_expected_throughput(struct ieee80211_sta *pubsta,
-					   u32 thr);
-
 /**
  * ieee80211_tx_rate_update - transmit rate update callback
  *
-- 
cgit v1.2.3


From fcc2d3e11bcc8f01d52a8c419f49f86ff8343b7c Mon Sep 17 00:00:00 2001
From: Karthikeyan Kathirvel <karthikeyan.kathirvel@oss.qualcomm.com>
Date: Mon, 21 Apr 2025 16:45:05 +0530
Subject: wifi: ieee80211: define beacon protection bit field

An AP supporting Beacon Protection should set bit 84 in
the extended capabilities IE (9.4.2.25 in the 802.11be D7 spec).
So the *4th* bit of the 10th byte should be checked to figure out
whether beacon protection is enabled or disabled.

Signed-off-by: Karthikeyan Kathirvel <karthikeyan.kathirvel@oss.qualcomm.com>
Reviewed-by: Jeff Johnson <jeff.johnson@oss.qualcomm.com>
Link: https://patch.msgid.link/20250421111505.3633992-1-karthikeyan.kathirvel@oss.qualcomm.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/ieee80211.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index 508d466de1cc..cbc3928aa504 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -4087,6 +4087,9 @@ enum ieee80211_tdls_actioncode {
 /* Defines support for enhanced multi-bssid advertisement*/
 #define WLAN_EXT_CAPA11_EMA_SUPPORT	BIT(3)
 
+/* Enable Beacon Protection */
+#define WLAN_EXT_CAPA11_BCN_PROTECT	BIT(4)
+
 /* TDLS specific payload type in the LLC/SNAP header */
 #define WLAN_TDLS_SNAP_RFTYPE	0x2
 
-- 
cgit v1.2.3


From 53160d0edf7336acaed4c74c6af8549d87c92ae6 Mon Sep 17 00:00:00 2001
From: Ramasamy Kaliappan <quic_rkaliapp@quicinc.com>
Date: Thu, 27 Mar 2025 10:43:17 +0530
Subject: wifi: cfg80211: Add support to get EMLSR capabilities of non-AP MLD

The Enhanced multi-link single-radio (EMLSR) operation allows a non-AP MLD
with multiple receive chains to listen on one or more EMLSR links when the
corresponding non-AP STA(s) affiliated with the non-AP MLD is (are) in
the awake state. [IEEE 802.11be-2024, (35.3.17 Enhanced multi-link
single-radio (EMLSR) operation)]

An MLD which intends to enable EMLSR operations will set the EML
Capabilities Present subfield to 1 and shall set the EMLSR Support
subfield in the Common Info field of the Basic Multi-Link element to 1 in
all Management frames that include the Basic Multi-Link element except
Authentication frames. EML capabilities contains information such as
EML Transition timeout, Padding delay and Transition delay. These fields
need to be passed to drivers to trigger EMLSR operation and to transmit and
receive the initial control frame and data frames.

Add support to receive EML Capabilities subfield that non-AP MLD
advertises during (re)association request and send it to underlying
drivers during ADD/SET station.

Signed-off-by: Ramasamy Kaliappan <quic_rkaliapp@quicinc.com>
Signed-off-by: Rameshkumar Sundaram <quic_ramess@quicinc.com>
Link: https://patch.msgid.link/20250327051320.3253783-2-quic_ramess@quicinc.com
[accept EMLSR capabilities only for unassoc AP STA]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 6df4e17f1437..87cb66fba621 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -1733,6 +1733,9 @@ struct cfg80211_ttlm_params {
  * @supported_oper_classes_len: number of supported operating classes
  * @support_p2p_ps: information if station supports P2P PS mechanism
  * @airtime_weight: airtime scheduler weight for this station
+ * @eml_cap_present: Specifies if EML capabilities field (@eml_cap) is
+ *	present/updated
+ * @eml_cap: EML capabilities of this station
  * @link_sta_params: link related params.
  */
 struct station_parameters {
@@ -1757,6 +1760,8 @@ struct station_parameters {
 	u8 supported_oper_classes_len;
 	int support_p2p_ps;
 	u16 airtime_weight;
+	bool eml_cap_present;
+	u16 eml_cap;
 	struct link_station_parameters link_sta_params;
 };
 
-- 
cgit v1.2.3


From 14e0f59a88cc22ceeb36e26b89b70b22292d23de Mon Sep 17 00:00:00 2001
From: Ramasamy Kaliappan <quic_rkaliapp@quicinc.com>
Date: Thu, 27 Mar 2025 10:43:18 +0530
Subject: wifi: mac80211: update ML STA with EML capabilities

When an AP and Non-AP MLD operate in EMLSR mode, EML capabilities
advertised during Association contains information such as EMLSR
transition delay, padding delay and transition timeout values.

Save the EML capabilities information that is received during station
addition and capabilities update in ieee80211_sta so that drivers can use
it for triggering EMLSR operation.

Signed-off-by: Ramasamy Kaliappan <quic_rkaliapp@quicinc.com>
Signed-off-by: Rameshkumar Sundaram <quic_ramess@quicinc.com>
Link: https://patch.msgid.link/20250327051320.3253783-3-quic_ramess@quicinc.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 5349df596157..c305ebfa6e45 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -2488,6 +2488,7 @@ struct ieee80211_link_sta {
  * @max_amsdu_subframes: indicates the maximal number of MSDUs in a single
  *	A-MSDU. Taken from the Extended Capabilities element. 0 means
  *	unlimited.
+ * @eml_cap: EML capabilities of this MLO station
  * @cur: currently valid data as aggregated from the active links
  *	For non MLO STA it will point to the deflink data. For MLO STA
  *	ieee80211_sta_recalc_aggregates() must be called to update it.
@@ -2522,6 +2523,7 @@ struct ieee80211_sta {
 	bool mlo;
 	bool spp_amsdu;
 	u8 max_amsdu_subframes;
+	u16 eml_cap;
 
 	struct ieee80211_sta_aggregates *cur;
 
-- 
cgit v1.2.3


From 91ea0489dc97bfda72ed74f98ab66dc0ab4235c7 Mon Sep 17 00:00:00 2001
From: Rameshkumar Sundaram <quic_ramess@quicinc.com>
Date: Thu, 27 Mar 2025 10:43:19 +0530
Subject: wifi: ieee80211: Add helpers to fetch EMLSR delay and timeout values

Add helpers to get EMLSR transition delay, padding delay and transition
timeout values from EML capabilities field of Multi-link Element.

Signed-off-by: Rameshkumar Sundaram <quic_ramess@quicinc.com>
Link: https://patch.msgid.link/20250327051320.3253783-4-quic_ramess@quicinc.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/ieee80211.h | 74 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 74 insertions(+)

(limited to 'include')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index cbc3928aa504..15a87f522017 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -5618,6 +5618,80 @@ static inline bool ieee80211_tid_to_link_map_size_ok(const u8 *data, size_t len)
 	return len >= fixed + elem_len;
 }
 
+/**
+ * ieee80211_emlsr_pad_delay_in_us - Fetch the EMLSR Padding delay
+ *	in microseconds
+ * @eml_cap: EML capabilities field value from common info field of
+ *	the Multi-link element
+ * Return: the EMLSR Padding delay (in microseconds) encoded in the
+ *	EML Capabilities field
+ */
+
+static inline u32 ieee80211_emlsr_pad_delay_in_us(u16 eml_cap)
+{
+	/* IEEE Std 802.11be-2024 Table 9-417i—Encoding of the EMLSR
+	 * Padding Delay subfield.
+	 */
+	u32 pad_delay = u16_get_bits(eml_cap,
+				     IEEE80211_EML_CAP_EMLSR_PADDING_DELAY);
+
+	if (!pad_delay ||
+	    pad_delay > IEEE80211_EML_CAP_EMLSR_PADDING_DELAY_256US)
+		return 0;
+
+	return 32 * (1 << (pad_delay - 1));
+}
+
+/**
+ * ieee80211_emlsr_trans_delay_in_us - Fetch the EMLSR Transition
+ *	delay in microseconds
+ * @eml_cap: EML capabilities field value from common info field of
+ *	the Multi-link element
+ * Return: the EMLSR Transition delay (in microseconds) encoded in the
+ *	EML Capabilities field
+ */
+
+static inline u32 ieee80211_emlsr_trans_delay_in_us(u16 eml_cap)
+{
+	/* IEEE Std 802.11be-2024 Table 9-417j—Encoding of the EMLSR
+	 * Transition Delay subfield.
+	 */
+	u32 trans_delay =
+		u16_get_bits(eml_cap,
+			     IEEE80211_EML_CAP_EMLSR_TRANSITION_DELAY);
+
+	/* invalid values also just use 0 */
+	if (!trans_delay ||
+	    trans_delay > IEEE80211_EML_CAP_EMLSR_TRANSITION_DELAY_256US)
+		return 0;
+
+	return 16 * (1 << (trans_delay - 1));
+}
+
+/**
+ * ieee80211_eml_trans_timeout_in_us - Fetch the EMLSR Transition
+ *	timeout value in microseconds
+ * @eml_cap: EML capabilities field value from common info field of
+ *	the Multi-link element
+ * Return: the EMLSR Transition timeout (in microseconds) encoded in
+ *	the EML Capabilities field
+ */
+
+static inline u32 ieee80211_eml_trans_timeout_in_us(u16 eml_cap)
+{
+	/* IEEE Std 802.11be-2024 Table 9-417m—Encoding of the
+	 * Transition Timeout subfield.
+	 */
+	u8 timeout = u16_get_bits(eml_cap,
+				  IEEE80211_EML_CAP_TRANSITION_TIMEOUT);
+
+	/* invalid values also just use 0 */
+	if (!timeout || timeout > IEEE80211_EML_CAP_TRANSITION_TIMEOUT_128TU)
+		return 0;
+
+	return 128 * (1 << (timeout - 1));
+}
+
 #define for_each_mle_subelement(_elem, _data, _len)			\
 	if (ieee80211_mle_size_ok(_data, _len))				\
 		for_each_element(_elem,					\
-- 
cgit v1.2.3


From 37523c3c47b3f3cc4c7d2ff47d28ee9ec99317c1 Mon Sep 17 00:00:00 2001
From: Rameshkumar Sundaram <rameshkumar.sundaram@oss.qualcomm.com>
Date: Tue, 8 Apr 2025 11:44:59 -0700
Subject: wifi: nl80211: add link id of transmitted profile for MLO MBSSID

During non-transmitted (nontx) profile configuration, interface
index of the transmitted (tx) profile is used to retrieve the
wireless device (wdev) associated with it. With MLO, this 'wdev'
may be part of an MLD with more than one link, hence the interface
index alone is no longer sufficient to retrieve the correct
tx profile. Add a new attribute to configure link id of tx profile.

Signed-off-by: Rameshkumar Sundaram <rameshkumar.sundaram@oss.qualcomm.com>
Co-developed-by: Muna Sinada <muna.sinada@oss.qualcomm.com>
Signed-off-by: Muna Sinada <muna.sinada@oss.qualcomm.com>
Co-developed-by: Aloka Dixit <aloka.dixit@oss.qualcomm.com>
Signed-off-by: Aloka Dixit <aloka.dixit@oss.qualcomm.com>
Link: https://patch.msgid.link/20250408184501.3715887-2-aloka.dixit@oss.qualcomm.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h       | 2 ++
 include/uapi/linux/nl80211.h | 6 ++++++
 2 files changed, 8 insertions(+)

(limited to 'include')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 87cb66fba621..d1848dc8ec99 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -1263,11 +1263,13 @@ struct cfg80211_crypto_settings {
  * struct cfg80211_mbssid_config - AP settings for multi bssid
  *
  * @tx_wdev: pointer to the transmitted interface in the MBSSID set
+ * @tx_link_id: link ID of the transmitted profile in an MLD.
  * @index: index of this AP in the multi bssid group.
  * @ema: set to true if the beacons should be sent out in EMA mode.
  */
 struct cfg80211_mbssid_config {
 	struct wireless_dev *tx_wdev;
+	u8 tx_link_id;
 	u8 index;
 	bool ema;
 };
diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index ddcc4cda74af..e9ccf43fe3c6 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -8036,6 +8036,11 @@ enum nl80211_sar_specs_attrs {
  *	Setting this flag is permitted only if the driver advertises EMA support
  *	by setting wiphy->ema_max_profile_periodicity to non-zero.
  *
+ * @NL80211_MBSSID_CONFIG_ATTR_TX_LINK_ID: Link ID of the transmitted profile.
+ *	This parameter is mandatory when NL80211_ATTR_MBSSID_CONFIG attributes
+ *	are sent for a non-transmitted profile and if the transmitted profile
+ *	is part of an MLD. For all other cases this parameter is unnecessary.
+ *
  * @__NL80211_MBSSID_CONFIG_ATTR_LAST: Internal
  * @NL80211_MBSSID_CONFIG_ATTR_MAX: highest attribute
  */
@@ -8047,6 +8052,7 @@ enum nl80211_mbssid_config_attributes {
 	NL80211_MBSSID_CONFIG_ATTR_INDEX,
 	NL80211_MBSSID_CONFIG_ATTR_TX_IFINDEX,
 	NL80211_MBSSID_CONFIG_ATTR_EMA,
+	NL80211_MBSSID_CONFIG_ATTR_TX_LINK_ID,
 
 	/* keep last */
 	__NL80211_MBSSID_CONFIG_ATTR_LAST,
-- 
cgit v1.2.3


From f600832794c91d7021d7337104734246b02a2b86 Mon Sep 17 00:00:00 2001
From: Rameshkumar Sundaram <rameshkumar.sundaram@oss.qualcomm.com>
Date: Tue, 8 Apr 2025 11:45:00 -0700
Subject: wifi: mac80211: restructure tx profile retrieval for MLO MBSSID

For MBSSID, each vif (struct ieee80211_vif) stores another vif
pointer for the transmitting profile of MBSSID set. This won't
suffice for MLO as there may be multiple links, each of which can
be part of different MBSSID sets. Hence the information needs to
be stored per-link. Additionally, the transmitted profile itself
may be part of an MLD hence storing vif will not suffice either.
Fix MLO by storing an instance of struct ieee80211_bss_conf
for each link.

Modify following operations to reflect the above structure updates:
- channel switch completion
- BSS color change completion
- Removing nontransmitted links in ieee80211_stop_mbssid()
- drivers retrieving the transmitted link for beacon templates.

Signed-off-by: Rameshkumar Sundaram <rameshkumar.sundaram@oss.qualcomm.com>
Co-developed-by: Muna Sinada <muna.sinada@oss.qualcomm.com>
Signed-off-by: Muna Sinada <muna.sinada@oss.qualcomm.com>
Co-developed-by: Aloka Dixit <aloka.dixit@oss.qualcomm.com>
Signed-off-by: Aloka Dixit <aloka.dixit@oss.qualcomm.com>
Link: https://patch.msgid.link/20250408184501.3715887-3-aloka.dixit@oss.qualcomm.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index c305ebfa6e45..fdafc37d17cc 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -682,6 +682,9 @@ struct ieee80211_parsed_tpe {
  *	responder functionality.
  * @ftmr_params: configurable lci/civic parameter when enabling FTM responder.
  * @nontransmitted: this BSS is a nontransmitted BSS profile
+ * @tx_bss_conf: Pointer to the BSS configuration of transmitting interface
+ *	if MBSSID is enabled. This pointer is RCU-protected due to CSA finish
+ *	and BSS color change flows accessing it.
  * @transmitter_bssid: the address of transmitter AP
  * @bssid_index: index inside the multiple BSSID set
  * @bssid_indicator: 2^bssid_indicator is the maximum number of APs in set
@@ -804,6 +807,7 @@ struct ieee80211_bss_conf {
 	struct ieee80211_ftm_responder_params *ftmr_params;
 	/* Multiple BSSID data */
 	bool nontransmitted;
+	struct ieee80211_bss_conf __rcu *tx_bss_conf;
 	u8 transmitter_bssid[ETH_ALEN];
 	u8 bssid_index;
 	u8 bssid_indicator;
@@ -2023,7 +2027,6 @@ enum ieee80211_neg_ttlm_res {
  * @txq: the multicast data TX queue
  * @offload_flags: 802.3 -> 802.11 enapsulation offload flags, see
  *	&enum ieee80211_offload_flags.
- * @mbssid_tx_vif: Pointer to the transmitting interface if MBSSID is enabled.
  */
 struct ieee80211_vif {
 	enum nl80211_iftype type;
@@ -2052,8 +2055,6 @@ struct ieee80211_vif {
 	bool probe_req_reg;
 	bool rx_mcast_action_reg;
 
-	struct ieee80211_vif *mbssid_tx_vif;
-
 	/* must be last */
 	u8 drv_priv[] __aligned(sizeof(void *));
 };
-- 
cgit v1.2.3


From 52358dd63e348c3b6c488acc105be1aeda8fb923 Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Fri, 18 Apr 2025 11:04:01 +0200
Subject: net: phy: remove function stubs

All callers of these functions depend on PHYLIB or select it directly
or indirectly by selecting PHYLINK. Stubs make sense for optional
functionality, but that's not the case here.

MDIO_XGENE usually is selected by NET_XGENE which also selects PHYLIB.
Add a dependency to PHYLIB nevertheless, in order not to break
randconfig builds.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Link: https://patch.msgid.link/f7a69a1f-60e9-4ac0-8b7c-481e0cc850e7@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/phy.h | 37 -------------------------------------
 1 file changed, 37 deletions(-)

(limited to 'include')

diff --git a/include/linux/phy.h b/include/linux/phy.h
index 066a28a4b64b..3beaf225ee88 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -1753,7 +1753,6 @@ int phy_modify_paged(struct phy_device *phydev, int page, u32 regnum,
 struct phy_device *phy_device_create(struct mii_bus *bus, int addr, u32 phy_id,
 				     bool is_c45,
 				     struct phy_c45_device_ids *c45_ids);
-#if IS_ENABLED(CONFIG_PHYLIB)
 int fwnode_get_phy_id(struct fwnode_handle *fwnode, u32 *phy_id);
 struct mdio_device *fwnode_mdio_find_device(struct fwnode_handle *fwnode);
 struct phy_device *fwnode_phy_find_device(struct fwnode_handle *phy_fwnode);
@@ -1761,42 +1760,6 @@ struct fwnode_handle *fwnode_get_phy_node(const struct fwnode_handle *fwnode);
 struct phy_device *get_phy_device(struct mii_bus *bus, int addr, bool is_c45);
 int phy_device_register(struct phy_device *phy);
 void phy_device_free(struct phy_device *phydev);
-#else
-static inline int fwnode_get_phy_id(struct fwnode_handle *fwnode, u32 *phy_id)
-{
-	return 0;
-}
-static inline
-struct mdio_device *fwnode_mdio_find_device(struct fwnode_handle *fwnode)
-{
-	return 0;
-}
-
-static inline
-struct phy_device *fwnode_phy_find_device(struct fwnode_handle *phy_fwnode)
-{
-	return NULL;
-}
-
-static inline
-struct fwnode_handle *fwnode_get_phy_node(struct fwnode_handle *fwnode)
-{
-	return NULL;
-}
-
-static inline
-struct phy_device *get_phy_device(struct mii_bus *bus, int addr, bool is_c45)
-{
-	return NULL;
-}
-
-static inline int phy_device_register(struct phy_device *phy)
-{
-	return 0;
-}
-
-static inline void phy_device_free(struct phy_device *phydev) { }
-#endif /* CONFIG_PHYLIB */
 void phy_device_remove(struct phy_device *phydev);
 int phy_get_c45_ids(struct phy_device *phydev);
 int phy_init_hw(struct phy_device *phydev);
-- 
cgit v1.2.3


From 834d97843e3bca86f17cc517885f54f3433427b2 Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@amazon.com>
Date: Thu, 17 Apr 2025 17:03:53 -0700
Subject: ipv6: Protect fib6_link_table() with spinlock.

We will get rid of RTNL from RTM_NEWROUTE and SIOCADDRT.

If the request specifies a new table ID, fib6_new_table() is
called to create a new routing table.

Two concurrent requests could specify the same table ID, so we
need a lock to protect net->ipv6.fib_table_hash[h].

Let's add a spinlock to protect the hash bucket linkage.

Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Acked-by: Paolo Abeni <pabeni@redhat.com>
Link: https://patch.msgid.link/20250418000443.43734-13-kuniyu@amazon.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/net/netns/ipv6.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h
index 5f2cfd84570a..47dc70d8100a 100644
--- a/include/net/netns/ipv6.h
+++ b/include/net/netns/ipv6.h
@@ -72,6 +72,7 @@ struct netns_ipv6 {
 	struct rt6_statistics   *rt6_stats;
 	struct timer_list       ip6_fib_timer;
 	struct hlist_head       *fib_table_hash;
+	spinlock_t		fib_table_hash_lock;
 	struct fib6_table       *fib6_main_tbl;
 	struct list_head	fib6_walkers;
 	rwlock_t		fib6_walker_lock;
-- 
cgit v1.2.3


From accb46b56bc3bc99ee69ba18b06ca60266ad6fca Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@amazon.com>
Date: Thu, 17 Apr 2025 17:03:54 -0700
Subject: ipv6: Defer fib6_purge_rt() in fib6_add_rt2node() to fib6_add().

The next patch adds per-nexthop spinlock which protects nh->f6i_list.

When rt->nh is not NULL, fib6_add_rt2node() will be called under the lock.
fib6_add_rt2node() could call fib6_purge_rt() for another route, which
could hold another nexthop lock.

Then, deadlock could happen between two nexthops.

Let's defer fib6_purge_rt() after fib6_add_rt2node().

Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Acked-by: Paolo Abeni <pabeni@redhat.com>
Link: https://patch.msgid.link/20250418000443.43734-14-kuniyu@amazon.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/net/ip6_fib.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 7c87873ae211..88b0dd4d8e09 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -198,6 +198,7 @@ struct fib6_info {
 					fib6_destroying:1,
 					unused:4;
 
+	struct list_head		purge_link;
 	struct rcu_head			rcu;
 	struct nexthop			*nh;
 	struct fib6_nh			fib6_nh[];
-- 
cgit v1.2.3


From 081efd18326e353c6fbfdeff903a83edde953f72 Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@amazon.com>
Date: Thu, 17 Apr 2025 17:03:55 -0700
Subject: ipv6: Protect nh->f6i_list with spinlock and flag.

We will get rid of RTNL from RTM_NEWROUTE and SIOCADDRT.

Then, we may be going to add a route tied to a dying nexthop.

The nexthop itself is not freed during the RCU grace period, but
if we link a route after __remove_nexthop_fib() is called for the
nexthop, the route will be leaked.

To avoid the race between IPv6 route addition under RCU vs nexthop
deletion under RTNL, let's add a dead flag and protect it and
nh->f6i_list with a spinlock.

__remove_nexthop_fib() acquires the nexthop's spinlock and sets
nh->dead to true, then calls ip6_del_rt() for the linked routes one by one
without the spinlock because fib6_purge_rt() acquires it later.

While adding an IPv6 route, fib6_add() acquires the nexthop lock and
checks the dead flag just before inserting the route.

Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Link: https://patch.msgid.link/20250418000443.43734-15-kuniyu@amazon.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/net/nexthop.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/net/nexthop.h b/include/net/nexthop.h
index d9fb44e8b321..572e69cda476 100644
--- a/include/net/nexthop.h
+++ b/include/net/nexthop.h
@@ -152,6 +152,8 @@ struct nexthop {
 	u8			protocol;   /* app managing this nh */
 	u8			nh_flags;
 	bool			is_group;
+	bool			dead;
+	spinlock_t		lock;       /* protect dead and f6i_list */
 
 	refcount_t		refcnt;
 	struct rcu_head		rcu;
-- 
cgit v1.2.3


From 39144062ea335495d659b08c9e3133ab746a0b1b Mon Sep 17 00:00:00 2001
From: "Dr. David Alan Gilbert" <linux@treblig.org>
Date: Wed, 23 Apr 2025 00:51:47 +0100
Subject: rxrpc: Remove deadcode

Remove three functions that are no longer used.

rxrpc_get_txbuf() last use was removed by 2020's
commit 5e6ef4f1017c ("rxrpc: Make the I/O thread take over the call and
local processor work")

rxrpc_kernel_get_epoch() last use was removed by 2020's
commit 44746355ccb1 ("afs: Don't get epoch from a server because it may be
ambiguous")

rxrpc_kernel_set_max_life() last use was removed by 2023's
commit db099c625b13 ("rxrpc: Fix timeout of a call that hasn't yet been
granted a channel")

Both of the rxrpc_kernel_* functions were documented.  Remove that
documentation as well as the code.

Signed-off-by: Dr. David Alan Gilbert <linux@treblig.org>
Acked-by: David Howells <dhowells@redhat.com>
Link: https://patch.msgid.link/20250422235147.146460-1-linux@treblig.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/af_rxrpc.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'include')

diff --git a/include/net/af_rxrpc.h b/include/net/af_rxrpc.h
index f15341594cc8..0fb4c41c9bbf 100644
--- a/include/net/af_rxrpc.h
+++ b/include/net/af_rxrpc.h
@@ -88,9 +88,6 @@ int rxrpc_kernel_charge_accept(struct socket *sock, rxrpc_notify_rx_t notify_rx,
 			       unsigned int debug_id);
 void rxrpc_kernel_set_tx_length(struct socket *, struct rxrpc_call *, s64);
 bool rxrpc_kernel_check_life(const struct socket *, const struct rxrpc_call *);
-u32 rxrpc_kernel_get_epoch(struct socket *, struct rxrpc_call *);
-void rxrpc_kernel_set_max_life(struct socket *, struct rxrpc_call *,
-			       unsigned long);
 
 int rxrpc_sock_set_min_security_level(struct sock *sk, unsigned int val);
 int rxrpc_sock_set_security_keyring(struct sock *, struct key *);
-- 
cgit v1.2.3


From bc2550b4e195754fbb24aac1f012d3dd9e3b4edc Mon Sep 17 00:00:00 2001
From: Jeremy Harris <jgh@exim.org>
Date: Wed, 23 Apr 2025 13:43:33 +0100
Subject: tcp: fastopen: note that a child socket was created

tcp: fastopen: note that a child socket was created

This uses up the last bit in a field of tcp_sock.

Signed-off-by: Jeremy Harris <jgh@exim.org>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Neal Cardwell <ncardwell@google.com>
Link: https://patch.msgid.link/20250423124334.4916-2-jgh@exim.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/tcp.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 1669d95bb0f9..a8af71623ba7 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -385,7 +385,8 @@ struct tcp_sock {
 		syn_fastopen:1,	/* SYN includes Fast Open option */
 		syn_fastopen_exp:1,/* SYN includes Fast Open exp. option */
 		syn_fastopen_ch:1, /* Active TFO re-enabling probe */
-		syn_data_acked:1;/* data in SYN is acked by SYN-ACK */
+		syn_data_acked:1,/* data in SYN is acked by SYN-ACK */
+		syn_fastopen_child:1; /* created TFO passive child socket */
 
 	u8	keepalive_probes; /* num of allowed keep alive probes	*/
 	u32	tcp_tx_delay;	/* delay (in usec) added to TX packets */
-- 
cgit v1.2.3


From 2b13042d3636327eb50c8a0ee06f629d52d1b8fb Mon Sep 17 00:00:00 2001
From: Jeremy Harris <jgh@exim.org>
Date: Wed, 23 Apr 2025 13:43:34 +0100
Subject: tcp: fastopen: pass TFO child indication through getsockopt

tcp: fastopen: pass TFO child indication through getsockopt

Note that this uses up the last bit of a field in struct tcp_info

Signed-off-by: Jeremy Harris <jgh@exim.org>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Neal Cardwell <ncardwell@google.com>
Link: https://patch.msgid.link/20250423124334.4916-3-jgh@exim.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/uapi/linux/tcp.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
index dc8fdc80e16b..bdac8c42fa82 100644
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -184,6 +184,7 @@ enum tcp_fastopen_client_fail {
 #define TCPI_OPT_ECN_SEEN	16 /* we received at least one packet with ECT */
 #define TCPI_OPT_SYN_DATA	32 /* SYN-ACK acked data in SYN sent or rcvd */
 #define TCPI_OPT_USEC_TS	64 /* usec timestamps */
+#define TCPI_OPT_TFO_CHILD	128 /* child from a Fast Open option on SYN */
 
 /*
  * Sender's congestion state indicating normal or abnormal situations
-- 
cgit v1.2.3


From d57ee99831e336576359beb26e2b140511c99106 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Wed, 23 Apr 2025 17:08:08 +0200
Subject: net: ethernet: mtk_wed: annotate RCU release in attach()

There are some sparse warnings in wifi, and it seems that
it's actually possible to annotate a function pointer with
__releases(), making the sparse warnings go away. In a way
that also serves as documentation that rcu_read_unlock()
must be called in the attach method, so add that annotation.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Link: https://patch.msgid.link/20250423150811.456205-2-johannes@sipsolutions.net
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/soc/mediatek/mtk_wed.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/soc/mediatek/mtk_wed.h b/include/linux/soc/mediatek/mtk_wed.h
index a476648858a6..d8949a4ed0dc 100644
--- a/include/linux/soc/mediatek/mtk_wed.h
+++ b/include/linux/soc/mediatek/mtk_wed.h
@@ -192,7 +192,7 @@ struct mtk_wed_device {
 };
 
 struct mtk_wed_ops {
-	int (*attach)(struct mtk_wed_device *dev);
+	int (*attach)(struct mtk_wed_device *dev) __releases(RCU);
 	int (*tx_ring_setup)(struct mtk_wed_device *dev, int ring,
 			     void __iomem *regs, bool reset);
 	int (*rx_ring_setup)(struct mtk_wed_device *dev, int ring,
-- 
cgit v1.2.3


From 34dd0fecaa02d654c447d43a7e4c72f9b18b7033 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <hawk@kernel.org>
Date: Fri, 25 Apr 2025 16:55:31 +0200
Subject: net: sched: generalize check for no-queue qdisc on TX queue
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The "noqueue" qdisc can either be directly attached, or get default
attached if net_device priv_flags has IFF_NO_QUEUE. In both cases, the
allocated Qdisc structure gets its enqueue function pointer reset to
NULL by noqueue_init() via noqueue_qdisc_ops.

This is a common case for software virtual net_devices. For these devices
with no-queue, the transmission path in __dev_queue_xmit() will bypass
the qdisc layer, directly invoking the device driver's ndo_start_xmit (via
dev_hard_start_xmit).  In this mode the device driver is not allowed to
ask for packets to be queued (either via returning NETDEV_TX_BUSY or
stopping the TXQ).

The simplest and most reliable way to identify this no-queue case is by
checking if enqueue == NULL.

The vrf driver currently open-codes this check (!qdisc->enqueue). While
functionally correct, this low-level detail is better encapsulated in a
dedicated helper for clarity and long-term maintainability.

To make this behavior more explicit and reusable, this patch introduces a
new helper: qdisc_txq_has_no_queue(). The helper will also be used by the
veth driver in the next patch, which introduces optional qdisc-based
backpressure.

This is a non-functional change.

Reviewed-by: David Ahern <dsahern@kernel.org>
Reviewed-by: Toke Høiland-Jørgensen <toke@redhat.com>
Signed-off-by: Jesper Dangaard Brouer <hawk@kernel.org>
Link: https://patch.msgid.link/174559293172.827981.7583862632045264175.stgit@firesoul
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/sch_generic.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include')

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index d48c657191cd..b6c177f7141c 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -803,6 +803,14 @@ static inline bool qdisc_tx_changing(const struct net_device *dev)
 	return false;
 }
 
+/* "noqueue" qdisc identified by not having any enqueue, see noqueue_init() */
+static inline bool qdisc_txq_has_no_queue(const struct netdev_queue *txq)
+{
+	struct Qdisc *qdisc = rcu_access_pointer(txq->qdisc);
+
+	return qdisc->enqueue == NULL;
+}
+
 /* Is the device using the noop qdisc on all queues?  */
 static inline bool qdisc_tx_is_noop(const struct net_device *dev)
 {
-- 
cgit v1.2.3


From 0014af802193aa3547484b5db0f1a258bad28c81 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Tue, 8 Apr 2025 15:55:53 +0200
Subject: netfilter: nf_tables: export set count and backend name to userspace

nf_tables picks a suitable set backend implementation (bitmap, hash,
rbtree..) based on the userspace requirements.

Figuring out the chosen backend requires information about the set flags
and the kernel version.  Export this to userspace so nft can include this
information in '--debug=netlink' output.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nf_tables.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index 49c944e78463..7d6bc19a0153 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -394,6 +394,8 @@ enum nft_set_field_attributes {
  * @NFTA_SET_HANDLE: set handle (NLA_U64)
  * @NFTA_SET_EXPR: set expression (NLA_NESTED: nft_expr_attributes)
  * @NFTA_SET_EXPRESSIONS: list of expressions (NLA_NESTED: nft_list_attributes)
+ * @NFTA_SET_TYPE: set backend type (NLA_STRING)
+ * @NFTA_SET_COUNT: number of set elements (NLA_U32)
  */
 enum nft_set_attributes {
 	NFTA_SET_UNSPEC,
@@ -415,6 +417,8 @@ enum nft_set_attributes {
 	NFTA_SET_HANDLE,
 	NFTA_SET_EXPR,
 	NFTA_SET_EXPRESSIONS,
+	NFTA_SET_TYPE,
+	NFTA_SET_COUNT,
 	__NFTA_SET_MAX
 };
 #define NFTA_SET_MAX		(__NFTA_SET_MAX - 1)
-- 
cgit v1.2.3


From 32607a332cfea5a4b2a185f3e3d605a9bf4f8df0 Mon Sep 17 00:00:00 2001
From: Willem de Bruijn <willemb@google.com>
Date: Thu, 24 Apr 2025 10:35:18 -0400
Subject: ipv4: prefer multipath nexthop that matches source address

With multipath routes, try to ensure that packets leave on the device
that is associated with the source address.

Avoid the following tcpdump example:

    veth0 Out IP 10.1.0.2.38640 > 10.2.0.3.8000: Flags [S]
    veth1 Out IP 10.1.0.2.38648 > 10.2.0.3.8000: Flags [S]

Which can happen easily with the most straightforward setup:

    ip addr add 10.0.0.1/24 dev veth0
    ip addr add 10.1.0.1/24 dev veth1

    ip route add 10.2.0.3 nexthop via 10.0.0.2 dev veth0 \
    			  nexthop via 10.1.0.2 dev veth1

This is apparently considered WAI, based on the comment in
ip_route_output_key_hash_rcu:

    * 2. Moreover, we are allowed to send packets with saddr
    *    of another iface. --ANK

It may be ok for some uses of multipath, but not all. For instance,
when using two ISPs, a router may drop packets with unknown source.

The behavior occurs because tcp_v4_connect makes three route
lookups when establishing a connection:

1. ip_route_connect calls to select a source address, with saddr zero.
2. ip_route_connect calls again now that saddr and daddr are known.
3. ip_route_newports calls again after a source port is also chosen.

With a route with multiple nexthops, each lookup may make a different
choice depending on available entropy to fib_select_multipath. So it
is possible for 1 to select the saddr from the first entry, but 3 to
select the second entry. Leading to the above situation.

Address this by preferring a match that matches the flowi4 saddr. This
will make 2 and 3 make the same choice as 1. Continue to update the
backup choice until a choice that matches saddr is found.

Do this in fib_select_multipath itself, rather than passing an fl4_oif
constraint, to avoid changing non-multipath route selection. Commit
e6b45241c57a ("ipv4: reset flowi parameters on route connect") shows
how that may cause regressions.

Also read ipv4.sysctl_fib_multipath_use_neigh only once. No need to
refresh in the loop.

This does not happen in IPv6, which performs only one lookup.

Signed-off-by: Willem de Bruijn <willemb@google.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Ido Schimmel <idosch@nvidia.com>
Link: https://patch.msgid.link/20250424143549.669426-2-willemdebruijn.kernel@gmail.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/net/ip_fib.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index e3864b74e92a..48bb3cf41469 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -574,7 +574,8 @@ static inline u32 fib_multipath_hash_from_keys(const struct net *net,
 
 int fib_check_nh(struct net *net, struct fib_nh *nh, u32 table, u8 scope,
 		 struct netlink_ext_ack *extack);
-void fib_select_multipath(struct fib_result *res, int hash);
+void fib_select_multipath(struct fib_result *res, int hash,
+			  const struct flowi4 *fl4);
 void fib_select_path(struct net *net, struct fib_result *res,
 		     struct flowi4 *fl4, const struct sk_buff *skb);
 
-- 
cgit v1.2.3


From 65e9024643c7512ade3aedbb341e11d77ed7abc2 Mon Sep 17 00:00:00 2001
From: Willem de Bruijn <willemb@google.com>
Date: Thu, 24 Apr 2025 10:35:19 -0400
Subject: ip: load balance tcp connections to single dst addr and port

Load balance new TCP connections across nexthops also when they
connect to the same service at a single remote address and port.

This affects only port-based multipath hashing:
fib_multipath_hash_policy 1 or 3.

Local connections must choose both a source address and port when
connecting to a remote service, in ip_route_connect. This
"chicken-and-egg problem" (commit 2d7192d6cbab ("ipv4: Sanitize and
simplify ip_route_{connect,newports}()")) is resolved by first
selecting a source address, by looking up a route using the zero
wildcard source port and address.

As a result multiple connections to the same destination address and
port have no entropy in fib_multipath_hash.

This is not a problem when forwarding, as skb-based hashing has a
4-tuple. Nor when establishing UDP connections, as autobind there
selects a port before reaching ip_route_connect.

Load balance also TCP, by using a random port in fib_multipath_hash.
Port assignment in inet_hash_connect is not atomic with
ip_route_connect. Thus ports are unpredictable, effectively random.

Implementation details:

Do not actually pass a random fl4_sport, as that affects not only
hashing, but routing more broadly, and can match a source port based
policy route, which existing wildcard port 0 will not. Instead,
define a new wildcard flowi flag that is used only for hashing.

Selecting a random source is equivalent to just selecting a random
hash entirely. But for code clarity, follow the normal 4-tuple hash
process and only update this field.

fib_multipath_hash can be reached with zero sport from other code
paths, so explicitly pass this flowi flag, rather than trying to infer
this case in the function itself.

Signed-off-by: Willem de Bruijn <willemb@google.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Ido Schimmel <idosch@nvidia.com>
Link: https://patch.msgid.link/20250424143549.669426-3-willemdebruijn.kernel@gmail.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/net/flow.h  | 1 +
 include/net/route.h | 3 +++
 2 files changed, 4 insertions(+)

(limited to 'include')

diff --git a/include/net/flow.h b/include/net/flow.h
index 2a3f0c42f092..a1839c278d87 100644
--- a/include/net/flow.h
+++ b/include/net/flow.h
@@ -39,6 +39,7 @@ struct flowi_common {
 #define FLOWI_FLAG_ANYSRC		0x01
 #define FLOWI_FLAG_KNOWN_NH		0x02
 #define FLOWI_FLAG_L3MDEV_OIF		0x04
+#define FLOWI_FLAG_ANY_SPORT		0x08
 	__u32	flowic_secid;
 	kuid_t  flowic_uid;
 	__u32		flowic_multipath_hash;
diff --git a/include/net/route.h b/include/net/route.h
index c605fd5ec0c0..8e39aa822cf9 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -326,6 +326,9 @@ static inline void ip_route_connect_init(struct flowi4 *fl4, __be32 dst,
 	if (inet_test_bit(TRANSPARENT, sk))
 		flow_flags |= FLOWI_FLAG_ANYSRC;
 
+	if (IS_ENABLED(CONFIG_IP_ROUTE_MULTIPATH) && !sport)
+		flow_flags |= FLOWI_FLAG_ANY_SPORT;
+
 	flowi4_init_output(fl4, oif, READ_ONCE(sk->sk_mark), ip_sock_rt_tos(sk),
 			   ip_sock_rt_scope(sk), protocol, flow_flags, dst,
 			   src, dport, sport, sk->sk_uid);
-- 
cgit v1.2.3


From 144530c15ec7fa95b29812d86f4be527338ea204 Mon Sep 17 00:00:00 2001
From: Shannon Nelson <shannon.nelson@amd.com>
Date: Fri, 25 Apr 2025 13:46:16 -0700
Subject: pds_core: remove extra name description

Fix the kernel-doc complaint
include/linux/pds/pds_adminq.h:481: warning: Excess struct member 'name' description in 'pds_core_lif_getattr_comp'

Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Shannon Nelson <shannon.nelson@amd.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/pds/pds_adminq.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/pds/pds_adminq.h b/include/linux/pds/pds_adminq.h
index ddd111f04ca0..339156113fa5 100644
--- a/include/linux/pds/pds_adminq.h
+++ b/include/linux/pds/pds_adminq.h
@@ -463,7 +463,6 @@ struct pds_core_lif_getattr_cmd {
  * @rsvd:       Word boundary padding
  * @comp_index: Index in the descriptor ring for which this is the completion
  * @state:	LIF state (enum pds_core_lif_state)
- * @name:	LIF name string, 0 terminated
  * @features:	Features (enum pds_core_hw_features)
  * @rsvd2:      Word boundary padding
  * @color:	Color bit
-- 
cgit v1.2.3


From 7c4f4c4fa9b6fbb7e483bebd02f7b9cbc20ca5cc Mon Sep 17 00:00:00 2001
From: Shannon Nelson <shannon.nelson@amd.com>
Date: Fri, 25 Apr 2025 13:46:17 -0700
Subject: pds_core: smaller adminq poll starting interval

Shorten the adminq poll starting interval in order to notice
any transaction errors more quickly.

Signed-off-by: Shannon Nelson <shannon.nelson@amd.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/pds/pds_adminq.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/pds/pds_adminq.h b/include/linux/pds/pds_adminq.h
index 339156113fa5..40ff0ec2b879 100644
--- a/include/linux/pds/pds_adminq.h
+++ b/include/linux/pds/pds_adminq.h
@@ -4,7 +4,7 @@
 #ifndef _PDS_CORE_ADMINQ_H_
 #define _PDS_CORE_ADMINQ_H_
 
-#define PDSC_ADMINQ_MAX_POLL_INTERVAL	256
+#define PDSC_ADMINQ_MAX_POLL_INTERVAL	256000	/* usecs */
 
 enum pds_core_adminq_flags {
 	PDS_AQ_FLAG_FASTPOLL	= BIT(1),	/* completion poll at 1ms */
-- 
cgit v1.2.3


From 468d8b462ac64659caec53eff34f02963d5f52c8 Mon Sep 17 00:00:00 2001
From: Dave Ertman <david.m.ertman@intel.com>
Date: Tue, 15 Apr 2025 21:15:45 -0500
Subject: iidc/ice/irdma: Rename IDC header file

To prepare for the IDC upgrade to support different CORE
PCI drivers, rename header file from iidc.h to iidc_rdma.h
since this file's functionality is specifically for RDMA support.

Include net/dscp.h in irdma's osdep.h and use DSCP_MAX in type.h,
instead of the iidc header and its define.

Reviewed-by: Przemek Kitszel <przemyslaw.kitszel@intel.com>
Signed-off-by: Dave Ertman <david.m.ertman@intel.com>
Signed-off-by: Tatyana Nikolova <tatyana.e.nikolova@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
 include/linux/net/intel/iidc.h      | 109 ------------------------------------
 include/linux/net/intel/iidc_rdma.h | 109 ++++++++++++++++++++++++++++++++++++
 2 files changed, 109 insertions(+), 109 deletions(-)
 delete mode 100644 include/linux/net/intel/iidc.h
 create mode 100644 include/linux/net/intel/iidc_rdma.h

(limited to 'include')

diff --git a/include/linux/net/intel/iidc.h b/include/linux/net/intel/iidc.h
deleted file mode 100644
index 13274c3def66..000000000000
--- a/include/linux/net/intel/iidc.h
+++ /dev/null
@@ -1,109 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2021, Intel Corporation. */
-
-#ifndef _IIDC_H_
-#define _IIDC_H_
-
-#include <linux/auxiliary_bus.h>
-#include <linux/dcbnl.h>
-#include <linux/device.h>
-#include <linux/if_ether.h>
-#include <linux/kernel.h>
-#include <linux/netdevice.h>
-
-enum iidc_event_type {
-	IIDC_EVENT_BEFORE_MTU_CHANGE,
-	IIDC_EVENT_AFTER_MTU_CHANGE,
-	IIDC_EVENT_BEFORE_TC_CHANGE,
-	IIDC_EVENT_AFTER_TC_CHANGE,
-	IIDC_EVENT_CRIT_ERR,
-	IIDC_EVENT_NBITS		/* must be last */
-};
-
-enum iidc_reset_type {
-	IIDC_PFR,
-	IIDC_CORER,
-	IIDC_GLOBR,
-};
-
-enum iidc_rdma_protocol {
-	IIDC_RDMA_PROTOCOL_IWARP = BIT(0),
-	IIDC_RDMA_PROTOCOL_ROCEV2 = BIT(1),
-};
-
-#define IIDC_MAX_USER_PRIORITY		8
-#define IIDC_MAX_DSCP_MAPPING		64
-#define IIDC_DSCP_PFC_MODE		0x1
-
-/* Struct to hold per RDMA Qset info */
-struct iidc_rdma_qset_params {
-	/* Qset TEID returned to the RDMA driver in
-	 * ice_add_rdma_qset and used by RDMA driver
-	 * for calls to ice_del_rdma_qset
-	 */
-	u32 teid;	/* Qset TEID */
-	u16 qs_handle; /* RDMA driver provides this */
-	u16 vport_id; /* VSI index */
-	u8 tc; /* TC branch the Qset should belong to */
-};
-
-struct iidc_qos_info {
-	u64 tc_ctx;
-	u8 rel_bw;
-	u8 prio_type;
-	u8 egress_virt_up;
-	u8 ingress_virt_up;
-};
-
-/* Struct to pass QoS info */
-struct iidc_qos_params {
-	struct iidc_qos_info tc_info[IEEE_8021QAZ_MAX_TCS];
-	u8 up2tc[IIDC_MAX_USER_PRIORITY];
-	u8 vport_relative_bw;
-	u8 vport_priority_type;
-	u8 num_tc;
-	u8 pfc_mode;
-	u8 dscp_map[IIDC_MAX_DSCP_MAPPING];
-};
-
-struct iidc_event {
-	DECLARE_BITMAP(type, IIDC_EVENT_NBITS);
-	u32 reg;
-};
-
-struct ice_pf;
-
-int ice_add_rdma_qset(struct ice_pf *pf, struct iidc_rdma_qset_params *qset);
-int ice_del_rdma_qset(struct ice_pf *pf, struct iidc_rdma_qset_params *qset);
-int ice_rdma_request_reset(struct ice_pf *pf, enum iidc_reset_type reset_type);
-int ice_rdma_update_vsi_filter(struct ice_pf *pf, u16 vsi_id, bool enable);
-void ice_get_qos_params(struct ice_pf *pf, struct iidc_qos_params *qos);
-int ice_alloc_rdma_qvector(struct ice_pf *pf, struct msix_entry *entry);
-void ice_free_rdma_qvector(struct ice_pf *pf, struct msix_entry *entry);
-
-/* Structure representing auxiliary driver tailored information about the core
- * PCI dev, each auxiliary driver using the IIDC interface will have an
- * instance of this struct dedicated to it.
- */
-
-struct iidc_auxiliary_dev {
-	struct auxiliary_device adev;
-	struct ice_pf *pf;
-};
-
-/* structure representing the auxiliary driver. This struct is to be
- * allocated and populated by the auxiliary driver's owner. The core PCI
- * driver will access these ops by performing a container_of on the
- * auxiliary_device->dev.driver.
- */
-struct iidc_auxiliary_drv {
-	struct auxiliary_driver adrv;
-	/* This event_handler is meant to be a blocking call.  For instance,
-	 * when a BEFORE_MTU_CHANGE event comes in, the event_handler will not
-	 * return until the auxiliary driver is ready for the MTU change to
-	 * happen.
-	 */
-	void (*event_handler)(struct ice_pf *pf, struct iidc_event *event);
-};
-
-#endif /* _IIDC_H_*/
diff --git a/include/linux/net/intel/iidc_rdma.h b/include/linux/net/intel/iidc_rdma.h
new file mode 100644
index 000000000000..0cd75404e459
--- /dev/null
+++ b/include/linux/net/intel/iidc_rdma.h
@@ -0,0 +1,109 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (C) 2021, Intel Corporation. */
+
+#ifndef _IIDC_RDMA_H_
+#define _IIDC_RDMA_H_
+
+#include <linux/auxiliary_bus.h>
+#include <linux/dcbnl.h>
+#include <linux/device.h>
+#include <linux/if_ether.h>
+#include <linux/kernel.h>
+#include <linux/netdevice.h>
+
+enum iidc_event_type {
+	IIDC_EVENT_BEFORE_MTU_CHANGE,
+	IIDC_EVENT_AFTER_MTU_CHANGE,
+	IIDC_EVENT_BEFORE_TC_CHANGE,
+	IIDC_EVENT_AFTER_TC_CHANGE,
+	IIDC_EVENT_CRIT_ERR,
+	IIDC_EVENT_NBITS		/* must be last */
+};
+
+enum iidc_reset_type {
+	IIDC_PFR,
+	IIDC_CORER,
+	IIDC_GLOBR,
+};
+
+enum iidc_rdma_protocol {
+	IIDC_RDMA_PROTOCOL_IWARP = BIT(0),
+	IIDC_RDMA_PROTOCOL_ROCEV2 = BIT(1),
+};
+
+#define IIDC_MAX_USER_PRIORITY		8
+#define IIDC_MAX_DSCP_MAPPING		64
+#define IIDC_DSCP_PFC_MODE		0x1
+
+/* Struct to hold per RDMA Qset info */
+struct iidc_rdma_qset_params {
+	/* Qset TEID returned to the RDMA driver in
+	 * ice_add_rdma_qset and used by RDMA driver
+	 * for calls to ice_del_rdma_qset
+	 */
+	u32 teid;	/* Qset TEID */
+	u16 qs_handle; /* RDMA driver provides this */
+	u16 vport_id; /* VSI index */
+	u8 tc; /* TC branch the Qset should belong to */
+};
+
+struct iidc_qos_info {
+	u64 tc_ctx;
+	u8 rel_bw;
+	u8 prio_type;
+	u8 egress_virt_up;
+	u8 ingress_virt_up;
+};
+
+/* Struct to pass QoS info */
+struct iidc_qos_params {
+	struct iidc_qos_info tc_info[IEEE_8021QAZ_MAX_TCS];
+	u8 up2tc[IIDC_MAX_USER_PRIORITY];
+	u8 vport_relative_bw;
+	u8 vport_priority_type;
+	u8 num_tc;
+	u8 pfc_mode;
+	u8 dscp_map[IIDC_MAX_DSCP_MAPPING];
+};
+
+struct iidc_event {
+	DECLARE_BITMAP(type, IIDC_EVENT_NBITS);
+	u32 reg;
+};
+
+struct ice_pf;
+
+int ice_add_rdma_qset(struct ice_pf *pf, struct iidc_rdma_qset_params *qset);
+int ice_del_rdma_qset(struct ice_pf *pf, struct iidc_rdma_qset_params *qset);
+int ice_rdma_request_reset(struct ice_pf *pf, enum iidc_reset_type reset_type);
+int ice_rdma_update_vsi_filter(struct ice_pf *pf, u16 vsi_id, bool enable);
+void ice_get_qos_params(struct ice_pf *pf, struct iidc_qos_params *qos);
+int ice_alloc_rdma_qvector(struct ice_pf *pf, struct msix_entry *entry);
+void ice_free_rdma_qvector(struct ice_pf *pf, struct msix_entry *entry);
+
+/* Structure representing auxiliary driver tailored information about the core
+ * PCI dev, each auxiliary driver using the IIDC interface will have an
+ * instance of this struct dedicated to it.
+ */
+
+struct iidc_auxiliary_dev {
+	struct auxiliary_device adev;
+	struct ice_pf *pf;
+};
+
+/* structure representing the auxiliary driver. This struct is to be
+ * allocated and populated by the auxiliary driver's owner. The core PCI
+ * driver will access these ops by performing a container_of on the
+ * auxiliary_device->dev.driver.
+ */
+struct iidc_auxiliary_drv {
+	struct auxiliary_driver adrv;
+	/* This event_handler is meant to be a blocking call.  For instance,
+	 * when a BEFORE_MTU_CHANGE event comes in, the event_handler will not
+	 * return until the auxiliary driver is ready for the MTU change to
+	 * happen.
+	 */
+	void (*event_handler)(struct ice_pf *pf, struct iidc_event *event);
+};
+
+#endif /* _IIDC_RDMA_H_*/
-- 
cgit v1.2.3


From 97b5631aae6896369712d6b7131afbc95c753587 Mon Sep 17 00:00:00 2001
From: Dave Ertman <david.m.ertman@intel.com>
Date: Tue, 15 Apr 2025 21:15:46 -0500
Subject: iidc/ice/irdma: Rename to iidc_* convention

In preparation for supporting more than a single core PCI driver
for RDMA, homogenize naming to iidc_rdma_* and IIDC_RDMA_*
form.

Reviewed-by: Przemek Kitszel <przemyslaw.kitszel@intel.com>
Signed-off-by: Dave Ertman <david.m.ertman@intel.com>
Signed-off-by: Tatyana Nikolova <tatyana.e.nikolova@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
 include/linux/net/intel/iidc_rdma.h | 38 +++++++++++++++++++------------------
 1 file changed, 20 insertions(+), 18 deletions(-)

(limited to 'include')

diff --git a/include/linux/net/intel/iidc_rdma.h b/include/linux/net/intel/iidc_rdma.h
index 0cd75404e459..2b24a9912fa0 100644
--- a/include/linux/net/intel/iidc_rdma.h
+++ b/include/linux/net/intel/iidc_rdma.h
@@ -11,16 +11,16 @@
 #include <linux/kernel.h>
 #include <linux/netdevice.h>
 
-enum iidc_event_type {
-	IIDC_EVENT_BEFORE_MTU_CHANGE,
-	IIDC_EVENT_AFTER_MTU_CHANGE,
-	IIDC_EVENT_BEFORE_TC_CHANGE,
-	IIDC_EVENT_AFTER_TC_CHANGE,
-	IIDC_EVENT_CRIT_ERR,
-	IIDC_EVENT_NBITS		/* must be last */
+enum iidc_rdma_event_type {
+	IIDC_RDMA_EVENT_BEFORE_MTU_CHANGE,
+	IIDC_RDMA_EVENT_AFTER_MTU_CHANGE,
+	IIDC_RDMA_EVENT_BEFORE_TC_CHANGE,
+	IIDC_RDMA_EVENT_AFTER_TC_CHANGE,
+	IIDC_RDMA_EVENT_CRIT_ERR,
+	IIDC_RDMA_EVENT_NBITS		/* must be last */
 };
 
-enum iidc_reset_type {
+enum iidc_rdma_reset_type {
 	IIDC_PFR,
 	IIDC_CORER,
 	IIDC_GLOBR,
@@ -47,7 +47,7 @@ struct iidc_rdma_qset_params {
 	u8 tc; /* TC branch the Qset should belong to */
 };
 
-struct iidc_qos_info {
+struct iidc_rdma_qos_info {
 	u64 tc_ctx;
 	u8 rel_bw;
 	u8 prio_type;
@@ -56,8 +56,8 @@ struct iidc_qos_info {
 };
 
 /* Struct to pass QoS info */
-struct iidc_qos_params {
-	struct iidc_qos_info tc_info[IEEE_8021QAZ_MAX_TCS];
+struct iidc_rdma_qos_params {
+	struct iidc_rdma_qos_info tc_info[IEEE_8021QAZ_MAX_TCS];
 	u8 up2tc[IIDC_MAX_USER_PRIORITY];
 	u8 vport_relative_bw;
 	u8 vport_priority_type;
@@ -66,8 +66,8 @@ struct iidc_qos_params {
 	u8 dscp_map[IIDC_MAX_DSCP_MAPPING];
 };
 
-struct iidc_event {
-	DECLARE_BITMAP(type, IIDC_EVENT_NBITS);
+struct iidc_rdma_event {
+	DECLARE_BITMAP(type, IIDC_RDMA_EVENT_NBITS);
 	u32 reg;
 };
 
@@ -75,9 +75,11 @@ struct ice_pf;
 
 int ice_add_rdma_qset(struct ice_pf *pf, struct iidc_rdma_qset_params *qset);
 int ice_del_rdma_qset(struct ice_pf *pf, struct iidc_rdma_qset_params *qset);
-int ice_rdma_request_reset(struct ice_pf *pf, enum iidc_reset_type reset_type);
+int ice_rdma_request_reset(struct ice_pf *pf,
+			   enum iidc_rdma_reset_type reset_type);
 int ice_rdma_update_vsi_filter(struct ice_pf *pf, u16 vsi_id, bool enable);
-void ice_get_qos_params(struct ice_pf *pf, struct iidc_qos_params *qos);
+void ice_get_qos_params(struct ice_pf *pf,
+			struct iidc_rdma_qos_params *qos);
 int ice_alloc_rdma_qvector(struct ice_pf *pf, struct msix_entry *entry);
 void ice_free_rdma_qvector(struct ice_pf *pf, struct msix_entry *entry);
 
@@ -86,7 +88,7 @@ void ice_free_rdma_qvector(struct ice_pf *pf, struct msix_entry *entry);
  * instance of this struct dedicated to it.
  */
 
-struct iidc_auxiliary_dev {
+struct iidc_rdma_core_auxiliary_dev {
 	struct auxiliary_device adev;
 	struct ice_pf *pf;
 };
@@ -96,14 +98,14 @@ struct iidc_auxiliary_dev {
  * driver will access these ops by performing a container_of on the
  * auxiliary_device->dev.driver.
  */
-struct iidc_auxiliary_drv {
+struct iidc_rdma_core_auxiliary_drv {
 	struct auxiliary_driver adrv;
 	/* This event_handler is meant to be a blocking call.  For instance,
 	 * when a BEFORE_MTU_CHANGE event comes in, the event_handler will not
 	 * return until the auxiliary driver is ready for the MTU change to
 	 * happen.
 	 */
-	void (*event_handler)(struct ice_pf *pf, struct iidc_event *event);
+	void (*event_handler)(struct ice_pf *pf, struct iidc_rdma_event *event);
 };
 
 #endif /* _IIDC_RDMA_H_*/
-- 
cgit v1.2.3


From d9251a560ba67bbedd53b81aee32e1ad95f42000 Mon Sep 17 00:00:00 2001
From: Dave Ertman <david.m.ertman@intel.com>
Date: Tue, 15 Apr 2025 21:15:47 -0500
Subject: iidc/ice/irdma: Break iidc.h into two headers

In preparation for supporting more than a single core PCI driver
for RDMA, break the iidc_rdma.h header file into two more focused
headers.

Only the elements universal to all Intel drivers will remain in
the generic iidc_rdma.h header. Move the ice specific information
to an ice specific header file named iidc_rdma_ice.h.

Reviewed-by: Przemek Kitszel <przemyslaw.kitszel@intel.com>
Signed-off-by: Dave Ertman <david.m.ertman@intel.com>
Signed-off-by: Tatyana Nikolova <tatyana.e.nikolova@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
 include/linux/net/intel/iidc_rdma.h     | 14 +-------------
 include/linux/net/intel/iidc_rdma_ice.h | 19 +++++++++++++++++++
 2 files changed, 20 insertions(+), 13 deletions(-)
 create mode 100644 include/linux/net/intel/iidc_rdma_ice.h

(limited to 'include')

diff --git a/include/linux/net/intel/iidc_rdma.h b/include/linux/net/intel/iidc_rdma.h
index 2b24a9912fa0..1e8136395154 100644
--- a/include/linux/net/intel/iidc_rdma.h
+++ b/include/linux/net/intel/iidc_rdma.h
@@ -1,5 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2021, Intel Corporation. */
+/* Copyright (C) 2021-2025, Intel Corporation. */
 
 #ifndef _IIDC_RDMA_H_
 #define _IIDC_RDMA_H_
@@ -71,18 +71,6 @@ struct iidc_rdma_event {
 	u32 reg;
 };
 
-struct ice_pf;
-
-int ice_add_rdma_qset(struct ice_pf *pf, struct iidc_rdma_qset_params *qset);
-int ice_del_rdma_qset(struct ice_pf *pf, struct iidc_rdma_qset_params *qset);
-int ice_rdma_request_reset(struct ice_pf *pf,
-			   enum iidc_rdma_reset_type reset_type);
-int ice_rdma_update_vsi_filter(struct ice_pf *pf, u16 vsi_id, bool enable);
-void ice_get_qos_params(struct ice_pf *pf,
-			struct iidc_rdma_qos_params *qos);
-int ice_alloc_rdma_qvector(struct ice_pf *pf, struct msix_entry *entry);
-void ice_free_rdma_qvector(struct ice_pf *pf, struct msix_entry *entry);
-
 /* Structure representing auxiliary driver tailored information about the core
  * PCI dev, each auxiliary driver using the IIDC interface will have an
  * instance of this struct dedicated to it.
diff --git a/include/linux/net/intel/iidc_rdma_ice.h b/include/linux/net/intel/iidc_rdma_ice.h
new file mode 100644
index 000000000000..78d10003d776
--- /dev/null
+++ b/include/linux/net/intel/iidc_rdma_ice.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (C) 2021-2025, Intel Corporation. */
+
+#ifndef _IIDC_RDMA_ICE_H_
+#define _IIDC_RDMA_ICE_H_
+
+struct ice_pf;
+
+int ice_add_rdma_qset(struct ice_pf *pf, struct iidc_rdma_qset_params *qset);
+int ice_del_rdma_qset(struct ice_pf *pf, struct iidc_rdma_qset_params *qset);
+int ice_rdma_request_reset(struct ice_pf *pf,
+			   enum iidc_rdma_reset_type reset_type);
+int ice_rdma_update_vsi_filter(struct ice_pf *pf, u16 vsi_id, bool enable);
+void ice_get_qos_params(struct ice_pf *pf,
+			struct iidc_rdma_qos_params *qos);
+int ice_alloc_rdma_qvector(struct ice_pf *pf, struct msix_entry *entry);
+void ice_free_rdma_qvector(struct ice_pf *pf, struct msix_entry *entry);
+
+#endif /* _IIDC_RDMA_ICE_H_*/
-- 
cgit v1.2.3


From 8239b771b94b639556c1987185fd82b2a896c923 Mon Sep 17 00:00:00 2001
From: Tatyana Nikolova <tatyana.e.nikolova@intel.com>
Date: Tue, 15 Apr 2025 21:15:48 -0500
Subject: ice: Replace ice specific DSCP mapping num with a kernel define

Replace ice driver specific DSCP mapping number defines
ICE_DSCP_NUM_VAL and IIDC_MAX_DSCP_MAPPING with
an equivalent kernel define DSCP_MAX.

Reviewed-by: Przemek Kitszel <przemyslaw.kitszel@intel.com>
Signed-off-by: Tatyana Nikolova <tatyana.e.nikolova@intel.com>
Signed-off-by: Dave Ertman <david.m.ertman@intel.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
 include/linux/net/intel/iidc_rdma.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/net/intel/iidc_rdma.h b/include/linux/net/intel/iidc_rdma.h
index 1e8136395154..7f1910289534 100644
--- a/include/linux/net/intel/iidc_rdma.h
+++ b/include/linux/net/intel/iidc_rdma.h
@@ -10,6 +10,7 @@
 #include <linux/if_ether.h>
 #include <linux/kernel.h>
 #include <linux/netdevice.h>
+#include <net/dscp.h>
 
 enum iidc_rdma_event_type {
 	IIDC_RDMA_EVENT_BEFORE_MTU_CHANGE,
@@ -32,7 +33,6 @@ enum iidc_rdma_protocol {
 };
 
 #define IIDC_MAX_USER_PRIORITY		8
-#define IIDC_MAX_DSCP_MAPPING		64
 #define IIDC_DSCP_PFC_MODE		0x1
 
 /* Struct to hold per RDMA Qset info */
@@ -63,7 +63,7 @@ struct iidc_rdma_qos_params {
 	u8 vport_priority_type;
 	u8 num_tc;
 	u8 pfc_mode;
-	u8 dscp_map[IIDC_MAX_DSCP_MAPPING];
+	u8 dscp_map[DSCP_MAX];
 };
 
 struct iidc_rdma_event {
-- 
cgit v1.2.3


From a3e1c0ad835702555d90565584ab6f723adf7f94 Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Tue, 29 Apr 2025 08:04:46 +0200
Subject: net: phy: factor out provider part from mdio_bus.c

After 52358dd63e34 ("net: phy: remove function stubs") there's a
problem if CONFIG_MDIO_BUS is set, but CONFIG_PHYLIB is not.
mdiobus_scan() uses phylib functions like get_phy_device().
Bringing back the stub wouldn't make much sense, because it would
allow mdiobus_scan() to compile, but the function would be unusable.
The stub returned NULL, and we have the following in mdiobus_scan():

        phydev = get_phy_device(bus, addr, c45);
        if (IS_ERR(phydev))
                return phydev;

So calling mdiobus_scan() w/o CONFIG_PHYLIB would cause a crash later in
mdiobus_scan(). In general the PHYLIB functionality isn't optional here.
Consequently, MDIO bus providers depend on PHYLIB.
Therefore factor it out and build it together with the libphy core
modules. In addition make all MDIO bus providers under /drivers/net/mdio
depend on PHYLIB. Same applies to enetc MDIO bus provider. Note that
PHYLIB selects MDIO_DEVRES, therefore we can omit this here.

Fixes: 52358dd63e34 ("net: phy: remove function stubs")
Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202504270639.mT0lh2o1-lkp@intel.com/
Reviewed-by: Jacob Keller <jacob.e.keller@intel.com>
Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Link: https://patch.msgid.link/c74772a9-dab6-44bf-a657-389df89d85c2@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/phy.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/linux/phy.h b/include/linux/phy.h
index 3beaf225ee88..d62d292024bc 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -2062,6 +2062,7 @@ int __phy_hwtstamp_set(struct phy_device *phydev,
 		       struct netlink_ext_ack *extack);
 
 extern const struct bus_type mdio_bus_type;
+extern const struct class mdio_bus_class;
 
 struct mdio_board_info {
 	const char	*bus_id;
-- 
cgit v1.2.3


From 66d454e99d71857faf249486912e381ec83760b4 Mon Sep 17 00:00:00 2001
From: Jordan Rife <jordan@jrife.io>
Date: Fri, 2 May 2025 09:15:21 -0700
Subject: bpf: udp: Make sure iter->batch always contains a full bucket
 snapshot

Require that iter->batch always contains a full bucket snapshot. This
invariant is important to avoid skipping or repeating sockets during
iteration when combined with the next few patches. Before, there were
two cases where a call to bpf_iter_udp_batch may only capture part of a
bucket:

1. When bpf_iter_udp_realloc_batch() returns -ENOMEM [1].
2. When more sockets are added to the bucket while calling
   bpf_iter_udp_realloc_batch(), making the updated batch size
   insufficient [2].

In cases where the batch size only covers part of a bucket, it is
possible to forget which sockets were already visited, especially if we
have to process a bucket in more than two batches. This forces us to
choose between repeating or skipping sockets, so don't allow this:

1. Stop iteration and propagate -ENOMEM up to userspace if reallocation
   fails instead of continuing with a partial batch.
2. Try bpf_iter_udp_realloc_batch() with GFP_USER just as before, but if
   we still aren't able to capture the full bucket, call
   bpf_iter_udp_realloc_batch() again while holding the bucket lock to
   guarantee the bucket does not change. On the second attempt use
   GFP_NOWAIT since we hold onto the spin lock.

Introduce the udp_portaddr_for_each_entry_from macro and use it instead
of udp_portaddr_for_each_entry to make it possible to continue iteration
from an arbitrary socket. This is required for this patch in the
GFP_NOWAIT case to allow us to fill the rest of a batch starting from
the middle of a bucket and the later patch which skips sockets that were
already seen.

Testing all scenarios directly is a bit difficult, but I did some manual
testing to exercise the code paths where GFP_NOWAIT is used and where
ERR_PTR(err) is returned. I used the realloc test case included later
in this series to trigger a scenario where a realloc happens inside
bpf_iter_udp_batch and made a small code tweak to force the first
realloc attempt to allocate a too-small batch, thus requiring
another attempt with GFP_NOWAIT. Some printks showed both reallocs with
the tests passing:

Apr 25 23:16:24 crow kernel: go again GFP_USER
Apr 25 23:16:24 crow kernel: go again GFP_NOWAIT

With this setup, I also forced each of the bpf_iter_udp_realloc_batch
calls to return -ENOMEM to ensure that iteration ends and that the
read() in userspace fails.

[1]: https://lore.kernel.org/bpf/CABi4-ogUtMrH8-NVB6W8Xg_F_KDLq=yy-yu-tKr2udXE2Mu1Lg@mail.gmail.com/
[2]: https://lore.kernel.org/bpf/7ed28273-a716-4638-912d-f86f965e54bb@linux.dev/

Signed-off-by: Jordan Rife <jordan@jrife.io>
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 include/linux/udp.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include')

diff --git a/include/linux/udp.h b/include/linux/udp.h
index 895240177f4f..4e1a672af4c5 100644
--- a/include/linux/udp.h
+++ b/include/linux/udp.h
@@ -216,6 +216,9 @@ static inline void udp_allow_gso(struct sock *sk)
 #define udp_portaddr_for_each_entry(__sk, list) \
 	hlist_for_each_entry(__sk, list, __sk_common.skc_portaddr_node)
 
+#define udp_portaddr_for_each_entry_from(__sk) \
+	hlist_for_each_entry_from(__sk, __sk_common.skc_portaddr_node)
+
 #define udp_portaddr_for_each_entry_rcu(__sk, list) \
 	hlist_for_each_entry_rcu(__sk, list, __sk_common.skc_portaddr_node)
 
-- 
cgit v1.2.3


From ca732e990fc8222a2d6782ae750304719e212fe8 Mon Sep 17 00:00:00 2001
From: "Russell King (Oracle)" <rmk+kernel@armlinux.org.uk>
Date: Thu, 1 May 2025 12:45:11 +0100
Subject: net: stmmac: add get_interfaces() platform method

Add a get_interfaces() platform method to allow platforms to indicate
to phylink which interface modes they support - which then allows
phylink to validate on initialisation that the configured PHY interface
mode is actually supported.

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://patch.msgid.link/E1uASLn-0021Qd-Mi@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/stmmac.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h
index 8aed09d65b4a..537bced69c46 100644
--- a/include/linux/stmmac.h
+++ b/include/linux/stmmac.h
@@ -233,6 +233,8 @@ struct plat_stmmacenet_data {
 	u8 tx_sched_algorithm;
 	struct stmmac_rxq_cfg rx_queues_cfg[MTL_MAX_RX_QUEUES];
 	struct stmmac_txq_cfg tx_queues_cfg[MTL_MAX_TX_QUEUES];
+	void (*get_interfaces)(struct stmmac_priv *priv, void *bsp_priv,
+			       unsigned long *interfaces);
 	int (*set_clk_tx_rate)(void *priv, struct clk *clk_tx_i,
 			       phy_interface_t interface, int speed);
 	void (*fix_mac_speed)(void *priv, int speed, unsigned int mode);
-- 
cgit v1.2.3


From 9d165dc58055d98658941a33fef9e5da866af3e9 Mon Sep 17 00:00:00 2001
From: "Russell King (Oracle)" <rmk+kernel@armlinux.org.uk>
Date: Thu, 1 May 2025 12:45:27 +0100
Subject: net: stmmac: remove speed_mode_2500() method

Remove the speed_mode_2500() platform method which is no longer used
or necessary, being superseded by the more flexible get_interfaces()
method.

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Link: https://patch.msgid.link/E1uASM3-0021R3-2B@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/stmmac.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h
index 537bced69c46..26ddf95d23f9 100644
--- a/include/linux/stmmac.h
+++ b/include/linux/stmmac.h
@@ -241,7 +241,6 @@ struct plat_stmmacenet_data {
 	int (*fix_soc_reset)(void *priv, void __iomem *ioaddr);
 	int (*serdes_powerup)(struct net_device *ndev, void *priv);
 	void (*serdes_powerdown)(struct net_device *ndev, void *priv);
-	void (*speed_mode_2500)(struct net_device *ndev, void *priv);
 	int (*mac_finish)(struct net_device *ndev,
 			  void *priv,
 			  unsigned int mode,
-- 
cgit v1.2.3


From 320a66f84022028f1277bf568a5e8987eac6e797 Mon Sep 17 00:00:00 2001
From: "Dr. David Alan Gilbert" <linux@treblig.org>
Date: Thu, 1 May 2025 01:24:02 +0100
Subject: strparser: Remove unused __strp_unpause

The last use of __strp_unpause() was removed in 2022 by
commit 84c61fe1a75b ("tls: rx: do not use the standard strparser")

Remove it.

Signed-off-by: Dr. David Alan Gilbert <linux@treblig.org>
Reviewed-by: Simon Horman <horms@kernel.org>
Link: https://patch.msgid.link/20250501002402.308843-1-linux@treblig.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/strparser.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include')

diff --git a/include/net/strparser.h b/include/net/strparser.h
index 0a83010b3a64..0ed73e364faa 100644
--- a/include/net/strparser.h
+++ b/include/net/strparser.h
@@ -114,8 +114,6 @@ static inline void strp_pause(struct strparser *strp)
 
 /* May be called without holding lock for attached socket */
 void strp_unpause(struct strparser *strp);
-/* Must be called with process lock held (lock_sock) */
-void __strp_unpause(struct strparser *strp);
 
 static inline void save_strp_stats(struct strparser *strp,
 				   struct strp_aggr_stats *agg_stats)
-- 
cgit v1.2.3


From ac8f09b9210c48934c78fdc6bc167e660eaac928 Mon Sep 17 00:00:00 2001
From: "Dr. David Alan Gilbert" <linux@treblig.org>
Date: Fri, 2 May 2025 00:38:15 +0100
Subject: sctp: Remove unused sctp_assoc_del_peer and sctp_chunk_iif

sctp_assoc_del_peer() last use was removed in 2015 by
commit 73e6742027f5 ("sctp: Do not try to search for the transport twice")
which now uses rm_peer instead of del_peer.

sctp_chunk_iif() last use was removed in 2016 by
commit 1f45f78f8e51 ("sctp: allow GSO frags to access the chunk too")

Remove them.

Signed-off-by: Dr. David Alan Gilbert <linux@treblig.org>
Acked-by: Xin Long <lucien.xin@gmail.com>
Link: https://patch.msgid.link/20250501233815.99832-1-linux@treblig.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/sctp/sm.h      | 1 -
 include/net/sctp/structs.h | 2 --
 2 files changed, 3 deletions(-)

(limited to 'include')

diff --git a/include/net/sctp/sm.h b/include/net/sctp/sm.h
index 64c42bd56bb2..3bfd261a53cc 100644
--- a/include/net/sctp/sm.h
+++ b/include/net/sctp/sm.h
@@ -161,7 +161,6 @@ const struct sctp_sm_table_entry *sctp_sm_lookup_event(
 					enum sctp_event_type event_type,
 					enum sctp_state state,
 					union sctp_subtype event_subtype);
-int sctp_chunk_iif(const struct sctp_chunk *);
 struct sctp_association *sctp_make_temp_asoc(const struct sctp_endpoint *,
 					     struct sctp_chunk *,
 					     gfp_t gfp);
diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index dcd288fa1bb6..1ad7ce71d0a7 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -2152,8 +2152,6 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *,
 				     const union sctp_addr *address,
 				     const gfp_t gfp,
 				     const int peer_state);
-void sctp_assoc_del_peer(struct sctp_association *asoc,
-			 const union sctp_addr *addr);
 void sctp_assoc_rm_peer(struct sctp_association *asoc,
 			 struct sctp_transport *peer);
 void sctp_assoc_control_transport(struct sctp_association *asoc,
-- 
cgit v1.2.3


From 429ac6211494c12b668dac59811ea8a96db6d757 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@nvidia.com>
Date: Mon, 5 May 2025 13:45:11 +0200
Subject: devlink: define enum for attr types of dynamic attributes

Devlink param and health reporter fmsg use attributes with dynamic type
which is determined according to a different type. Currently used values
are NLA_*. The problem is, they are not part of UAPI. They may change
which would cause a break.

To make this future-safe, introduce an enum that shadows the NLA_* values
and is part of UAPI.

This also makes it possible to carry types that are unrelated to NLA_*
values.

Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
Signed-off-by: Jiri Pirko <jiri@nvidia.com>
Link: https://patch.msgid.link/20250505114513.53370-3-jiri@resnulli.us
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/uapi/linux/devlink.h | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h
index 9401aa343673..a5ee0f13740a 100644
--- a/include/uapi/linux/devlink.h
+++ b/include/uapi/linux/devlink.h
@@ -385,6 +385,21 @@ enum devlink_linecard_state {
 	DEVLINK_LINECARD_STATE_MAX = __DEVLINK_LINECARD_STATE_MAX - 1
 };
 
+/* Variable attribute type. */
+enum devlink_var_attr_type {
+	/* Following values relate to the internal NLA_* values */
+	DEVLINK_VAR_ATTR_TYPE_U8 = 1,
+	DEVLINK_VAR_ATTR_TYPE_U16,
+	DEVLINK_VAR_ATTR_TYPE_U32,
+	DEVLINK_VAR_ATTR_TYPE_U64,
+	DEVLINK_VAR_ATTR_TYPE_STRING,
+	DEVLINK_VAR_ATTR_TYPE_FLAG,
+	DEVLINK_VAR_ATTR_TYPE_NUL_STRING = 10,
+	DEVLINK_VAR_ATTR_TYPE_BINARY,
+	__DEVLINK_VAR_ATTR_TYPE_CUSTOM_BASE = 0x80,
+	/* Any possible custom types, unrelated to NLA_* values go below */
+};
+
 enum devlink_attr {
 	/* don't change the order or add anything between, this is ABI! */
 	DEVLINK_ATTR_UNSPEC,
-- 
cgit v1.2.3


From f9e78932eac650cf1385244482b85e65ccaa87cf Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@nvidia.com>
Date: Mon, 5 May 2025 13:45:12 +0200
Subject: devlink: avoid param type value translations

Assign DEVLINK_PARAM_TYPE_* enum values to DEVLINK_VAR_ATTR_TYPE_* to
ensure the same values are used internally and in UAPI. Benefit from
that by removing the value translations.

Signed-off-by: Jiri Pirko <jiri@nvidia.com>
Link: https://patch.msgid.link/20250505114513.53370-4-jiri@resnulli.us
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/devlink.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/net/devlink.h b/include/net/devlink.h
index b8783126c1ed..0091f23a40f7 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -420,11 +420,11 @@ typedef u64 devlink_resource_occ_get_t(void *priv);
 
 #define __DEVLINK_PARAM_MAX_STRING_VALUE 32
 enum devlink_param_type {
-	DEVLINK_PARAM_TYPE_U8,
-	DEVLINK_PARAM_TYPE_U16,
-	DEVLINK_PARAM_TYPE_U32,
-	DEVLINK_PARAM_TYPE_STRING,
-	DEVLINK_PARAM_TYPE_BOOL,
+	DEVLINK_PARAM_TYPE_U8 = DEVLINK_VAR_ATTR_TYPE_U8,
+	DEVLINK_PARAM_TYPE_U16 = DEVLINK_VAR_ATTR_TYPE_U16,
+	DEVLINK_PARAM_TYPE_U32 = DEVLINK_VAR_ATTR_TYPE_U32,
+	DEVLINK_PARAM_TYPE_STRING = DEVLINK_VAR_ATTR_TYPE_STRING,
+	DEVLINK_PARAM_TYPE_BOOL = DEVLINK_VAR_ATTR_TYPE_FLAG,
 };
 
 union devlink_param_value {
-- 
cgit v1.2.3


From 22c64f37e1d4e757b0073a72f1439c2c3509c5cb Mon Sep 17 00:00:00 2001
From: Mohan Kumar G <quic_mkumarg@quicinc.com>
Date: Mon, 5 May 2025 20:58:36 +0530
Subject: wifi: mac80211: Update MCS15 support in link_conf

As per IEEE 802.11be-2024 - 9.4.2.321, the EHT operation element
contains the MCS15 Disable subfield in bit 6 (mask 0x40), which is set
when MCS15 support is not enabled.

Get MCS15 support from EHT operation params and add it in link_conf
so that driver can use this value to know if EHT-MCS 15 reception
is enabled.

Co-developed-by: Dhanavandhana Kannan <quic_dhanavan1@quicinc.com>
Signed-off-by: Dhanavandhana Kannan <quic_dhanavan1@quicinc.com>
Signed-off-by: Mohan Kumar G <quic_mkumarg@quicinc.com>
Link: https://patch.msgid.link/20250505152836.3266829-1-quic_mkumarg@quicinc.com
[remove pointless !! for bool assignment]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/ieee80211.h | 1 +
 include/net/mac80211.h    | 3 +++
 2 files changed, 4 insertions(+)

(limited to 'include')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index 17f917cb4540..420c7f9aa6ee 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -2325,6 +2325,7 @@ struct ieee80211_eht_cap_elem {
 #define IEEE80211_EHT_OPER_EHT_DEF_PE_DURATION	                0x04
 #define IEEE80211_EHT_OPER_GROUP_ADDRESSED_BU_IND_LIMIT         0x08
 #define IEEE80211_EHT_OPER_GROUP_ADDRESSED_BU_IND_EXP_MASK      0x30
+#define IEEE80211_EHT_OPER_MCS15_DISABLE                        0x40
 
 /**
  * struct ieee80211_eht_operation - eht operation element
diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index fdafc37d17cc..82617579d910 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -744,6 +744,7 @@ struct ieee80211_parsed_tpe {
  * @eht_80mhz_full_bw_ul_mumimo: in AP-mode, does this BSS support the
  *	reception of an EHT TB PPDU on an RU that spans the entire PPDU
  *	bandwidth
+ * @eht_disable_mcs15: disable EHT-MCS 15 reception capability.
  * @bss_param_ch_cnt: in BSS-mode, the BSS params change count. This
  *	information is the latest known value. It can come from this link's
  *	beacon or from a beacon sent by another link.
@@ -852,6 +853,8 @@ struct ieee80211_bss_conf {
 	bool eht_su_beamformee;
 	bool eht_mu_beamformer;
 	bool eht_80mhz_full_bw_ul_mumimo;
+	bool eht_disable_mcs15;
+
 	u8 bss_param_ch_cnt;
 	u8 bss_param_ch_cnt_link_id;
 };
-- 
cgit v1.2.3


From 4701073c3debd16d7f534f3eb808bd9b50601c0c Mon Sep 17 00:00:00 2001
From: Wei Fang <wei.fang@nxp.com>
Date: Tue, 6 May 2025 16:07:22 +0800
Subject: net: enetc: add initial netc-lib driver to support NTMP

Some NETC functionality is controlled using control messages sent to the
hardware using BD ring interface with 32B descriptor similar to transmit
BD ring used on ENETC. This BD ring interface is referred to as command
BD ring. It is used to configure functionality where the underlying
resources may be shared between different entities or being too large to
configure using direct registers. Therefore, a messaging protocol called
NETC Table Management Protocol (NTMP) is provided for exchanging
configuration and management information between the software and the
hardware using the command BD ring interface.

The management protocol of LS1028A has been retroactively named NTMP
1.0, and its implementation is in enetc_cbdr.c and enetc_qos.c. However,
NTMP of i.MX95 has been upgraded to version 2.0, which is incompatible
with LS1028A, because the message formats have been changed. Therefore,
add the netc-lib driver to support NTMP 2.0 to operate various tables.
Note that, only MAC address filter table and RSS table are supported at
the moment. More tables will be supported in subsequent patches.

It is worth mentioning that the purpose of the netc-lib driver is to
provide some NTMP-based generic interfaces for ENETC and NETC Switch
drivers. Currently, it only supports the configurations of some tables.
Interfaces such as tc flower and debugfs will be added in the future.

Signed-off-by: Wei Fang <wei.fang@nxp.com>
Reviewed-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Link: https://patch.msgid.link/20250506080735.3444381-2-wei.fang@nxp.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/fsl/ntmp.h | 121 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 121 insertions(+)
 create mode 100644 include/linux/fsl/ntmp.h

(limited to 'include')

diff --git a/include/linux/fsl/ntmp.h b/include/linux/fsl/ntmp.h
new file mode 100644
index 000000000000..916dc4fe7de3
--- /dev/null
+++ b/include/linux/fsl/ntmp.h
@@ -0,0 +1,121 @@
+/* SPDX-License-Identifier: (GPL-2.0+ OR BSD-3-Clause) */
+/* Copyright 2025 NXP */
+#ifndef __NETC_NTMP_H
+#define __NETC_NTMP_H
+
+#include <linux/bitops.h>
+#include <linux/if_ether.h>
+
+struct maft_keye_data {
+	u8 mac_addr[ETH_ALEN];
+	__le16 resv;
+};
+
+struct maft_cfge_data {
+	__le16 si_bitmap;
+	__le16 resv;
+};
+
+struct netc_cbdr_regs {
+	void __iomem *pir;
+	void __iomem *cir;
+	void __iomem *mr;
+
+	void __iomem *bar0;
+	void __iomem *bar1;
+	void __iomem *lenr;
+};
+
+struct netc_tbl_vers {
+	u8 maft_ver;
+	u8 rsst_ver;
+};
+
+struct netc_cbdr {
+	struct device *dev;
+	struct netc_cbdr_regs regs;
+
+	int bd_num;
+	int next_to_use;
+	int next_to_clean;
+
+	int dma_size;
+	void *addr_base;
+	void *addr_base_align;
+	dma_addr_t dma_base;
+	dma_addr_t dma_base_align;
+
+	/* Serialize the order of command BD ring */
+	spinlock_t ring_lock;
+};
+
+struct ntmp_user {
+	int cbdr_num;	/* number of control BD ring */
+	struct device *dev;
+	struct netc_cbdr *ring;
+	struct netc_tbl_vers tbl;
+};
+
+struct maft_entry_data {
+	struct maft_keye_data keye;
+	struct maft_cfge_data cfge;
+};
+
+#if IS_ENABLED(CONFIG_NXP_NETC_LIB)
+int ntmp_init_cbdr(struct netc_cbdr *cbdr, struct device *dev,
+		   const struct netc_cbdr_regs *regs);
+void ntmp_free_cbdr(struct netc_cbdr *cbdr);
+
+/* NTMP APIs */
+int ntmp_maft_add_entry(struct ntmp_user *user, u32 entry_id,
+			struct maft_entry_data *maft);
+int ntmp_maft_query_entry(struct ntmp_user *user, u32 entry_id,
+			  struct maft_entry_data *maft);
+int ntmp_maft_delete_entry(struct ntmp_user *user, u32 entry_id);
+int ntmp_rsst_update_entry(struct ntmp_user *user, const u32 *table,
+			   int count);
+int ntmp_rsst_query_entry(struct ntmp_user *user,
+			  u32 *table, int count);
+#else
+static inline int ntmp_init_cbdr(struct netc_cbdr *cbdr, struct device *dev,
+				 const struct netc_cbdr_regs *regs)
+{
+	return 0;
+}
+
+static inline void ntmp_free_cbdr(struct netc_cbdr *cbdr)
+{
+}
+
+static inline int ntmp_maft_add_entry(struct ntmp_user *user, u32 entry_id,
+				      struct maft_entry_data *maft)
+{
+	return 0;
+}
+
+static inline int ntmp_maft_query_entry(struct ntmp_user *user, u32 entry_id,
+					struct maft_entry_data *maft)
+{
+	return 0;
+}
+
+static inline int ntmp_maft_delete_entry(struct ntmp_user *user, u32 entry_id)
+{
+	return 0;
+}
+
+static inline int ntmp_rsst_update_entry(struct ntmp_user *user,
+					 const u32 *table, int count)
+{
+	return 0;
+}
+
+static inline int ntmp_rsst_query_entry(struct ntmp_user *user,
+					u32 *table, int count)
+{
+	return 0;
+}
+
+#endif
+
+#endif
-- 
cgit v1.2.3


From c24a65b6a27c78d8540409800886b6622ea86ebf Mon Sep 17 00:00:00 2001
From: Dave Ertman <david.m.ertman@intel.com>
Date: Tue, 15 Apr 2025 21:15:49 -0500
Subject: iidc/ice/irdma: Update IDC to support multiple consumers

In preparation for supporting more than a single core PCI driver
for RDMA, move ice specific structs like qset_params, qos_info
and qos_params from iidc_rdma.h to iidc_rdma_ice.h.

Previously, the ice driver was just exporting its entire PF struct
to the auxiliary driver, but since each core driver will have its own
different PF struct, implement a universal struct that all core drivers
can provide to the auxiliary driver through the probe call.

Reviewed-by: Przemek Kitszel <przemyslaw.kitszel@intel.com>
Signed-off-by: Dave Ertman <david.m.ertman@intel.com>
Co-developed-by: Mustafa Ismail <mustafa.ismail@intel.com>
Signed-off-by: Mustafa Ismail <mustafa.ismail@intel.com>
Co-developed-by: Shiraz Saleem <shiraz.saleem@intel.com>
Signed-off-by: Shiraz Saleem <shiraz.saleem@intel.com>
Co-developed-by: Tatyana Nikolova <tatyana.e.nikolova@intel.com>
Signed-off-by: Tatyana Nikolova <tatyana.e.nikolova@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
 include/linux/net/intel/iidc_rdma.h     | 67 +++++++++-----------------------
 include/linux/net/intel/iidc_rdma_ice.h | 69 ++++++++++++++++++++++++++++-----
 2 files changed, 78 insertions(+), 58 deletions(-)

(limited to 'include')

diff --git a/include/linux/net/intel/iidc_rdma.h b/include/linux/net/intel/iidc_rdma.h
index 7f1910289534..8baad1082042 100644
--- a/include/linux/net/intel/iidc_rdma.h
+++ b/include/linux/net/intel/iidc_rdma.h
@@ -5,7 +5,6 @@
 #define _IIDC_RDMA_H_
 
 #include <linux/auxiliary_bus.h>
-#include <linux/dcbnl.h>
 #include <linux/device.h>
 #include <linux/if_ether.h>
 #include <linux/kernel.h>
@@ -17,14 +16,19 @@ enum iidc_rdma_event_type {
 	IIDC_RDMA_EVENT_AFTER_MTU_CHANGE,
 	IIDC_RDMA_EVENT_BEFORE_TC_CHANGE,
 	IIDC_RDMA_EVENT_AFTER_TC_CHANGE,
+	IIDC_RDMA_EVENT_WARN_RESET,
 	IIDC_RDMA_EVENT_CRIT_ERR,
 	IIDC_RDMA_EVENT_NBITS		/* must be last */
 };
 
+struct iidc_rdma_event {
+	DECLARE_BITMAP(type, IIDC_RDMA_EVENT_NBITS);
+	u32 reg;
+};
+
 enum iidc_rdma_reset_type {
-	IIDC_PFR,
-	IIDC_CORER,
-	IIDC_GLOBR,
+	IIDC_FUNC_RESET,
+	IIDC_DEV_RESET,
 };
 
 enum iidc_rdma_protocol {
@@ -32,53 +36,22 @@ enum iidc_rdma_protocol {
 	IIDC_RDMA_PROTOCOL_ROCEV2 = BIT(1),
 };
 
-#define IIDC_MAX_USER_PRIORITY		8
-#define IIDC_DSCP_PFC_MODE		0x1
-
-/* Struct to hold per RDMA Qset info */
-struct iidc_rdma_qset_params {
-	/* Qset TEID returned to the RDMA driver in
-	 * ice_add_rdma_qset and used by RDMA driver
-	 * for calls to ice_del_rdma_qset
-	 */
-	u32 teid;	/* Qset TEID */
-	u16 qs_handle; /* RDMA driver provides this */
-	u16 vport_id; /* VSI index */
-	u8 tc; /* TC branch the Qset should belong to */
-};
-
-struct iidc_rdma_qos_info {
-	u64 tc_ctx;
-	u8 rel_bw;
-	u8 prio_type;
-	u8 egress_virt_up;
-	u8 ingress_virt_up;
-};
-
-/* Struct to pass QoS info */
-struct iidc_rdma_qos_params {
-	struct iidc_rdma_qos_info tc_info[IEEE_8021QAZ_MAX_TCS];
-	u8 up2tc[IIDC_MAX_USER_PRIORITY];
-	u8 vport_relative_bw;
-	u8 vport_priority_type;
-	u8 num_tc;
-	u8 pfc_mode;
-	u8 dscp_map[DSCP_MAX];
-};
-
-struct iidc_rdma_event {
-	DECLARE_BITMAP(type, IIDC_RDMA_EVENT_NBITS);
-	u32 reg;
+/* Structure to be populated by core LAN PCI driver */
+struct iidc_rdma_core_dev_info {
+	struct pci_dev *pdev; /* PCI device of corresponding to main function */
+	struct auxiliary_device *adev;
+	/* Current active RDMA protocol */
+	enum iidc_rdma_protocol rdma_protocol;
+	void *iidc_priv; /* elements unique to each driver */
 };
 
 /* Structure representing auxiliary driver tailored information about the core
  * PCI dev, each auxiliary driver using the IIDC interface will have an
  * instance of this struct dedicated to it.
  */
-
 struct iidc_rdma_core_auxiliary_dev {
 	struct auxiliary_device adev;
-	struct ice_pf *pf;
+	struct iidc_rdma_core_dev_info *cdev_info;
 };
 
 /* structure representing the auxiliary driver. This struct is to be
@@ -88,12 +61,8 @@ struct iidc_rdma_core_auxiliary_dev {
  */
 struct iidc_rdma_core_auxiliary_drv {
 	struct auxiliary_driver adrv;
-	/* This event_handler is meant to be a blocking call.  For instance,
-	 * when a BEFORE_MTU_CHANGE event comes in, the event_handler will not
-	 * return until the auxiliary driver is ready for the MTU change to
-	 * happen.
-	 */
-	void (*event_handler)(struct ice_pf *pf, struct iidc_rdma_event *event);
+	void (*event_handler)(struct iidc_rdma_core_dev_info *cdev,
+			      struct iidc_rdma_event *event);
 };
 
 #endif /* _IIDC_RDMA_H_*/
diff --git a/include/linux/net/intel/iidc_rdma_ice.h b/include/linux/net/intel/iidc_rdma_ice.h
index 78d10003d776..b40eed0e13fe 100644
--- a/include/linux/net/intel/iidc_rdma_ice.h
+++ b/include/linux/net/intel/iidc_rdma_ice.h
@@ -4,16 +4,67 @@
 #ifndef _IIDC_RDMA_ICE_H_
 #define _IIDC_RDMA_ICE_H_
 
-struct ice_pf;
+#include <linux/dcbnl.h>
 
-int ice_add_rdma_qset(struct ice_pf *pf, struct iidc_rdma_qset_params *qset);
-int ice_del_rdma_qset(struct ice_pf *pf, struct iidc_rdma_qset_params *qset);
-int ice_rdma_request_reset(struct ice_pf *pf,
+#define IIDC_MAX_USER_PRIORITY         8
+#define IIDC_DSCP_PFC_MODE             0x1
+
+/**
+ * struct iidc_rdma_qset_params - Struct to hold per RDMA Qset info
+ * @teid: TEID of the Qset node
+ * @qs_handle: SW index of the Qset, RDMA provides this
+ * @vport_id: VSI index
+ * @tc: Traffic Class branch the QSet should belong to
+ */
+struct iidc_rdma_qset_params {
+	/* Qset TEID returned to the RDMA driver in
+	 * ice_add_rdma_qset and used by RDMA driver
+	 * for calls to ice_del_rdma_qset
+	 */
+	u32 teid;
+	u16 qs_handle;
+	u16 vport_id;
+	u8 tc;
+};
+
+struct iidc_rdma_qos_info {
+	u64 tc_ctx;
+	u8 rel_bw;
+	u8 prio_type;
+	u8 egress_virt_up;
+	u8 ingress_virt_up;
+};
+
+/* Struct to pass QoS info */
+struct iidc_rdma_qos_params {
+	struct iidc_rdma_qos_info tc_info[IEEE_8021QAZ_MAX_TCS];
+	u8 up2tc[IIDC_MAX_USER_PRIORITY];
+	u8 vport_relative_bw;
+	u8 vport_priority_type;
+	u8 num_tc;
+	u8 pfc_mode;
+	u8 dscp_map[DSCP_MAX];
+};
+
+struct iidc_rdma_priv_dev_info {
+	u8 pf_id;
+	u16 vport_id;
+	struct net_device *netdev;
+	struct iidc_rdma_qos_params qos_info;
+	u8 __iomem *hw_addr;
+};
+
+int ice_add_rdma_qset(struct iidc_rdma_core_dev_info *cdev,
+		      struct iidc_rdma_qset_params *qset);
+int ice_del_rdma_qset(struct iidc_rdma_core_dev_info *cdev,
+		      struct iidc_rdma_qset_params *qset);
+int ice_rdma_request_reset(struct iidc_rdma_core_dev_info *cdev,
 			   enum iidc_rdma_reset_type reset_type);
-int ice_rdma_update_vsi_filter(struct ice_pf *pf, u16 vsi_id, bool enable);
-void ice_get_qos_params(struct ice_pf *pf,
-			struct iidc_rdma_qos_params *qos);
-int ice_alloc_rdma_qvector(struct ice_pf *pf, struct msix_entry *entry);
-void ice_free_rdma_qvector(struct ice_pf *pf, struct msix_entry *entry);
+int ice_rdma_update_vsi_filter(struct iidc_rdma_core_dev_info *cdev, u16 vsi_id,
+			       bool enable);
+int ice_alloc_rdma_qvector(struct iidc_rdma_core_dev_info *cdev,
+			   struct msix_entry *entry);
+void ice_free_rdma_qvector(struct iidc_rdma_core_dev_info *cdev,
+			   struct msix_entry *entry);
 
 #endif /* _IIDC_RDMA_ICE_H_*/
-- 
cgit v1.2.3


From 1b2900db0119c02e6445bb61ec3fba982d10cc8d Mon Sep 17 00:00:00 2001
From: Gal Pressman <gal@nvidia.com>
Date: Thu, 8 May 2025 13:30:34 +0300
Subject: ethtool: Block setting of symmetric RSS when non-symmetric
 rx-flow-hash is requested

Symmetric RSS hash requires that:
* No other fields besides IP src/dst and/or L4 src/dst are set
* If src is set, dst must also be set

This restriction was only enforced when RXNFC was configured after
symmetric hash was enabled. In the opposite order of operations (RXNFC
then symmetric enablement) the check was not performed.

Perform the sanity check on set_rxfh as well, by iterating over all flow
types hash fields and making sure they are all symmetric.

Introduce a function that returns whether a flow type is hashable (not
spec only) and needs to be iterated over. To make sure that no one
forgets to update the list of hashable flow types when adding new flow
types, a static assert is added to draw the developer's attention.

The conversion of uapi #defines to enum is not ideal, but as Jakub
mentioned [1], we have precedent for that.

[1] https://lore.kernel.org/netdev/20250324073509.6571ade3@kernel.org/

Reviewed-by: Tariq Toukan <tariqt@nvidia.com>
Signed-off-by: Gal Pressman <gal@nvidia.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Link: https://patch.msgid.link/20250508103034.885536-1-gal@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/uapi/linux/ethtool.h | 134 ++++++++++++++++++++++---------------------
 1 file changed, 69 insertions(+), 65 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h
index 84833cca29fe..707c1844010c 100644
--- a/include/uapi/linux/ethtool.h
+++ b/include/uapi/linux/ethtool.h
@@ -2295,71 +2295,75 @@ static inline int ethtool_validate_duplex(__u8 duplex)
 #define	RXH_XFRM_SYM_OR_XOR	(1 << 1)
 #define	RXH_XFRM_NO_CHANGE	0xff
 
-/* L2-L4 network traffic flow types */
-#define	TCP_V4_FLOW	0x01	/* hash or spec (tcp_ip4_spec) */
-#define	UDP_V4_FLOW	0x02	/* hash or spec (udp_ip4_spec) */
-#define	SCTP_V4_FLOW	0x03	/* hash or spec (sctp_ip4_spec) */
-#define	AH_ESP_V4_FLOW	0x04	/* hash only */
-#define	TCP_V6_FLOW	0x05	/* hash or spec (tcp_ip6_spec; nfc only) */
-#define	UDP_V6_FLOW	0x06	/* hash or spec (udp_ip6_spec; nfc only) */
-#define	SCTP_V6_FLOW	0x07	/* hash or spec (sctp_ip6_spec; nfc only) */
-#define	AH_ESP_V6_FLOW	0x08	/* hash only */
-#define	AH_V4_FLOW	0x09	/* hash or spec (ah_ip4_spec) */
-#define	ESP_V4_FLOW	0x0a	/* hash or spec (esp_ip4_spec) */
-#define	AH_V6_FLOW	0x0b	/* hash or spec (ah_ip6_spec; nfc only) */
-#define	ESP_V6_FLOW	0x0c	/* hash or spec (esp_ip6_spec; nfc only) */
-#define	IPV4_USER_FLOW	0x0d	/* spec only (usr_ip4_spec) */
-#define	IP_USER_FLOW	IPV4_USER_FLOW
-#define	IPV6_USER_FLOW	0x0e	/* spec only (usr_ip6_spec; nfc only) */
-#define	IPV4_FLOW	0x10	/* hash only */
-#define	IPV6_FLOW	0x11	/* hash only */
-#define	ETHER_FLOW	0x12	/* spec only (ether_spec) */
-
-/* Used for GTP-U IPv4 and IPv6.
- * The format of GTP packets only includes
- * elements such as TEID and GTP version.
- * It is primarily intended for data communication of the UE.
- */
-#define GTPU_V4_FLOW 0x13	/* hash only */
-#define GTPU_V6_FLOW 0x14	/* hash only */
-
-/* Use for GTP-C IPv4 and v6.
- * The format of these GTP packets does not include TEID.
- * Primarily expected to be used for communication
- * to create sessions for UE data communication,
- * commonly referred to as CSR (Create Session Request).
- */
-#define GTPC_V4_FLOW 0x15	/* hash only */
-#define GTPC_V6_FLOW 0x16	/* hash only */
-
-/* Use for GTP-C IPv4 and v6.
- * Unlike GTPC_V4_FLOW, the format of these GTP packets includes TEID.
- * After session creation, it becomes this packet.
- * This is mainly used for requests to realize UE handover.
- */
-#define GTPC_TEID_V4_FLOW 0x17	/* hash only */
-#define GTPC_TEID_V6_FLOW 0x18	/* hash only */
-
-/* Use for GTP-U and extended headers for the PSC (PDU Session Container).
- * The format of these GTP packets includes TEID and QFI.
- * In 5G communication using UPF (User Plane Function),
- * data communication with this extended header is performed.
- */
-#define GTPU_EH_V4_FLOW 0x19	/* hash only */
-#define GTPU_EH_V6_FLOW 0x1a	/* hash only */
-
-/* Use for GTP-U IPv4 and v6 PSC (PDU Session Container) extended headers.
- * This differs from GTPU_EH_V(4|6)_FLOW in that it is distinguished by
- * UL/DL included in the PSC.
- * There are differences in the data included based on Downlink/Uplink,
- * and can be used to distinguish packets.
- * The functions described so far are useful when you want to
- * handle communication from the mobile network in UPF, PGW, etc.
- */
-#define GTPU_UL_V4_FLOW 0x1b	/* hash only */
-#define GTPU_UL_V6_FLOW 0x1c	/* hash only */
-#define GTPU_DL_V4_FLOW 0x1d	/* hash only */
-#define GTPU_DL_V6_FLOW 0x1e	/* hash only */
+enum {
+	/* L2-L4 network traffic flow types */
+	TCP_V4_FLOW	= 0x01,	/* hash or spec (tcp_ip4_spec) */
+	UDP_V4_FLOW	= 0x02,	/* hash or spec (udp_ip4_spec) */
+	SCTP_V4_FLOW	= 0x03,	/* hash or spec (sctp_ip4_spec) */
+	AH_ESP_V4_FLOW	= 0x04,	/* hash only */
+	TCP_V6_FLOW	= 0x05,	/* hash or spec (tcp_ip6_spec; nfc only) */
+	UDP_V6_FLOW	= 0x06,	/* hash or spec (udp_ip6_spec; nfc only) */
+	SCTP_V6_FLOW	= 0x07,	/* hash or spec (sctp_ip6_spec; nfc only) */
+	AH_ESP_V6_FLOW	= 0x08,	/* hash only */
+	AH_V4_FLOW	= 0x09,	/* hash or spec (ah_ip4_spec) */
+	ESP_V4_FLOW	= 0x0a,	/* hash or spec (esp_ip4_spec) */
+	AH_V6_FLOW	= 0x0b,	/* hash or spec (ah_ip6_spec; nfc only) */
+	ESP_V6_FLOW	= 0x0c,	/* hash or spec (esp_ip6_spec; nfc only) */
+	IPV4_USER_FLOW	= 0x0d,	/* spec only (usr_ip4_spec) */
+	IP_USER_FLOW	= IPV4_USER_FLOW,
+	IPV6_USER_FLOW	= 0x0e, /* spec only (usr_ip6_spec; nfc only) */
+	IPV4_FLOW	= 0x10, /* hash only */
+	IPV6_FLOW	= 0x11, /* hash only */
+	ETHER_FLOW	= 0x12, /* spec only (ether_spec) */
+
+	/* Used for GTP-U IPv4 and IPv6.
+	 * The format of GTP packets only includes
+	 * elements such as TEID and GTP version.
+	 * It is primarily intended for data communication of the UE.
+	 */
+	GTPU_V4_FLOW	= 0x13,	/* hash only */
+	GTPU_V6_FLOW	= 0x14,	/* hash only */
+
+	/* Use for GTP-C IPv4 and v6.
+	 * The format of these GTP packets does not include TEID.
+	 * Primarily expected to be used for communication
+	 * to create sessions for UE data communication,
+	 * commonly referred to as CSR (Create Session Request).
+	 */
+	GTPC_V4_FLOW	= 0x15,	/* hash only */
+	GTPC_V6_FLOW	= 0x16,	/* hash only */
+
+	/* Use for GTP-C IPv4 and v6.
+	 * Unlike GTPC_V4_FLOW, the format of these GTP packets includes TEID.
+	 * After session creation, it becomes this packet.
+	 * This is mainly used for requests to realize UE handover.
+	 */
+	GTPC_TEID_V4_FLOW	= 0x17,	/* hash only */
+	GTPC_TEID_V6_FLOW	= 0x18,	/* hash only */
+
+	/* Use for GTP-U and extended headers for the PSC (PDU Session Container).
+	 * The format of these GTP packets includes TEID and QFI.
+	 * In 5G communication using UPF (User Plane Function),
+	 * data communication with this extended header is performed.
+	 */
+	GTPU_EH_V4_FLOW	= 0x19,	/* hash only */
+	GTPU_EH_V6_FLOW	= 0x1a,	/* hash only */
+
+	/* Use for GTP-U IPv4 and v6 PSC (PDU Session Container) extended headers.
+	 * This differs from GTPU_EH_V(4|6)_FLOW in that it is distinguished by
+	 * UL/DL included in the PSC.
+	 * There are differences in the data included based on Downlink/Uplink,
+	 * and can be used to distinguish packets.
+	 * The functions described so far are useful when you want to
+	 * handle communication from the mobile network in UPF, PGW, etc.
+	 */
+	GTPU_UL_V4_FLOW	= 0x1b,	/* hash only */
+	GTPU_UL_V6_FLOW	= 0x1c,	/* hash only */
+	GTPU_DL_V4_FLOW	= 0x1d,	/* hash only */
+	GTPU_DL_V6_FLOW	= 0x1e,	/* hash only */
+
+	__FLOW_TYPE_COUNT,
+};
 
 /* Flag to enable additional fields in struct ethtool_rx_flow_spec */
 #define	FLOW_EXT	0x80000000
-- 
cgit v1.2.3


From 6c14058edfd01cdc0d3018b9069643b0da7c3e80 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Thu, 8 May 2025 12:52:36 +0300
Subject: net: dsa: convert to ndo_hwtstamp_get() and ndo_hwtstamp_set()

New timestamping API was introduced in commit 66f7223039c0 ("net: add
NDOs for configuring hardware timestamping") from kernel v6.6. It is
time to convert DSA to the new API, so that the ndo_eth_ioctl() path can
be removed completely.

Move the ds->ops->port_hwtstamp_get() and ds->ops->port_hwtstamp_set()
calls from dsa_user_ioctl() to dsa_user_hwtstamp_get() and
dsa_user_hwtstamp_set().

Due to the fact that the underlying ifreq type changes to
kernel_hwtstamp_config, the drivers and the Ocelot switchdev front-end,
all hooked up directly or indirectly, must also be converted all at once.

The conversion also updates the comment from dsa_port_supports_hwtstamp(),
which is no longer true because kernel_hwtstamp_config is kernel memory
and does not need copy_to_user(). I've deliberated whether it is
necessary to also update "err != -EOPNOTSUPP" to a more general "!err",
but all drivers now either return 0 or -EOPNOTSUPP.

The existing logic from the ocelot_ioctl() function, to avoid
configuring timestamping if the PHY supports the operation, is obsoleted
by more advanced core logic in dev_set_hwtstamp_phylib().

This is only a partial preparation for proper PHY timestamping support.
None of these switch drivers currently sets up PTP traps for PHY
timestamping, so setting dev->see_all_hwtstamp_requests is not yet
necessary and the conversion is relatively trivial.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Tested-by: Vladimir Oltean <vladimir.oltean@nxp.com> # felix, sja1105, mv88e6xxx
Reviewed-by: Vadim Fedorenko <vadim.fedorenko@linux.dev>
Link: https://patch.msgid.link/20250508095236.887789-1-vladimir.oltean@nxp.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/dsa.h         | 5 +++--
 include/soc/mscc/ocelot.h | 7 +++++--
 2 files changed, 8 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/net/dsa.h b/include/net/dsa.h
index a0a9481c52c2..55e2d97f247e 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -1131,9 +1131,10 @@ struct dsa_switch_ops {
 	 * PTP functionality
 	 */
 	int	(*port_hwtstamp_get)(struct dsa_switch *ds, int port,
-				     struct ifreq *ifr);
+				     struct kernel_hwtstamp_config *config);
 	int	(*port_hwtstamp_set)(struct dsa_switch *ds, int port,
-				     struct ifreq *ifr);
+				     struct kernel_hwtstamp_config *config,
+				     struct netlink_ext_ack *extack);
 	void	(*port_txtstamp)(struct dsa_switch *ds, int port,
 				 struct sk_buff *skb);
 	bool	(*port_rxtstamp)(struct dsa_switch *ds, int port,
diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h
index 6db7fc9dbaa4..48d6deb3efd7 100644
--- a/include/soc/mscc/ocelot.h
+++ b/include/soc/mscc/ocelot.h
@@ -1073,8 +1073,11 @@ int ocelot_vlan_prepare(struct ocelot *ocelot, int port, u16 vid, bool pvid,
 int ocelot_vlan_add(struct ocelot *ocelot, int port, u16 vid, bool pvid,
 		    bool untagged);
 int ocelot_vlan_del(struct ocelot *ocelot, int port, u16 vid);
-int ocelot_hwstamp_get(struct ocelot *ocelot, int port, struct ifreq *ifr);
-int ocelot_hwstamp_set(struct ocelot *ocelot, int port, struct ifreq *ifr);
+void ocelot_hwstamp_get(struct ocelot *ocelot, int port,
+			struct kernel_hwtstamp_config *cfg);
+int ocelot_hwstamp_set(struct ocelot *ocelot, int port,
+		       struct kernel_hwtstamp_config *cfg,
+		       struct netlink_ext_ack *extack);
 int ocelot_port_txtstamp_request(struct ocelot *ocelot, int port,
 				 struct sk_buff *skb,
 				 struct sk_buff **clone);
-- 
cgit v1.2.3


From a96876057b9e44f60d936f8e4887543555b0593c Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Fri, 9 May 2025 14:27:51 -0700
Subject: netlink: fix policy dump for int with validation callback

Recent devlink change added validation of an integer value
via NLA_POLICY_VALIDATE_FN, for sparse enums. Handle this
in policy dump. We can't extract any info out of the callback,
so report only the type.

Fixes: 429ac6211494 ("devlink: define enum for attr types of dynamic attributes")
Reported-by: syzbot+01eb26848144516e7f0a@syzkaller.appspotmail.com
Link: https://patch.msgid.link/20250509212751.1905149-1-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/netlink.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include')

diff --git a/include/net/netlink.h b/include/net/netlink.h
index 82e07e272290..90a560dc167a 100644
--- a/include/net/netlink.h
+++ b/include/net/netlink.h
@@ -321,7 +321,13 @@ enum nla_policy_validation {
  *    All other            Unused - but note that it's a union
  *
  * Meaning of `validate' field, use via NLA_POLICY_VALIDATE_FN:
+ *    NLA_U8, NLA_U16,
+ *    NLA_U32, NLA_U64,
+ *    NLA_S8, NLA_S16,
+ *    NLA_S32, NLA_S64,
+ *    NLA_MSECS,
  *    NLA_BINARY           Validation function called for the attribute.
+ *
  *    All other            Unused - but note that it's a union
  *
  * Example:
-- 
cgit v1.2.3


From 03e96b8c11d140fb4ead0b30c2d6e1a294b501ef Mon Sep 17 00:00:00 2001
From: Mina Almasry <almasrymina@google.com>
Date: Thu, 8 May 2025 00:48:21 +0000
Subject: netmem: add niov->type attribute to distinguish different net_iov
 types

Later patches in the series add TX net_iovs where there is no pp
associated, so we can't rely on niov->pp->mp_ops to tell what is the
type of the net_iov.

Add a type enum to the net_iov which tells us the net_iov type.

Signed-off-by: Mina Almasry <almasrymina@google.com>
Link: https://patch.msgid.link/20250508004830.4100853-2-almasrymina@google.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/net/netmem.h | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/netmem.h b/include/net/netmem.h
index c61d5b21e7b4..973fdbcfef38 100644
--- a/include/net/netmem.h
+++ b/include/net/netmem.h
@@ -20,8 +20,17 @@ DECLARE_STATIC_KEY_FALSE(page_pool_mem_providers);
  */
 #define NET_IOV 0x01UL
 
+enum net_iov_type {
+	NET_IOV_DMABUF,
+	NET_IOV_IOURING,
+
+	/* Force size to unsigned long to make the NET_IOV_ASSERTS below pass.
+	 */
+	NET_IOV_MAX = ULONG_MAX
+};
+
 struct net_iov {
-	unsigned long __unused_padding;
+	enum net_iov_type type;
 	unsigned long pp_magic;
 	struct page_pool *pp;
 	struct net_iov_area *owner;
-- 
cgit v1.2.3


From e9f3d61db5cb29b3f17f0dc40c3ec2cda2ee93e5 Mon Sep 17 00:00:00 2001
From: Mina Almasry <almasrymina@google.com>
Date: Thu, 8 May 2025 00:48:22 +0000
Subject: net: add get_netmem/put_netmem support

Currently net_iovs support only pp ref counts, and do not support a
page ref equivalent.

This is fine for the RX path as net_iovs are used exclusively with the
pp and only pp refcounting is needed there. The TX path however does not
use pp ref counts, thus, support for get_page/put_page equivalent is
needed for netmem.

Support get_netmem/put_netmem. Check the type of the netmem before
passing it to page or net_iov specific code to obtain a page ref
equivalent.

For dmabuf net_iovs, we obtain a ref on the underlying binding. This
ensures the entire binding doesn't disappear until all the net_iovs have
been put_netmem'ed. We do not need to track the refcount of individual
dmabuf net_iovs as we don't allocate/free them from a pool similar to
what the buddy allocator does for pages.

This code is written to be extensible by other net_iov implementers.
get_netmem/put_netmem will check the type of the netmem and route it to
the correct helper:

pages -> [get|put]_page()
dmabuf net_iovs -> net_devmem_[get|put]_net_iov()
new net_iovs ->	new helpers

Signed-off-by: Mina Almasry <almasrymina@google.com>
Acked-by: Stanislav Fomichev <sdf@fomichev.me>
Link: https://patch.msgid.link/20250508004830.4100853-3-almasrymina@google.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/linux/skbuff_ref.h | 4 ++--
 include/net/netmem.h       | 3 +++
 2 files changed, 5 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/skbuff_ref.h b/include/linux/skbuff_ref.h
index 0f3c58007488..9e49372ef1a0 100644
--- a/include/linux/skbuff_ref.h
+++ b/include/linux/skbuff_ref.h
@@ -17,7 +17,7 @@
  */
 static inline void __skb_frag_ref(skb_frag_t *frag)
 {
-	get_page(skb_frag_page(frag));
+	get_netmem(skb_frag_netmem(frag));
 }
 
 /**
@@ -40,7 +40,7 @@ static inline void skb_page_unref(netmem_ref netmem, bool recycle)
 	if (recycle && napi_pp_put_page(netmem))
 		return;
 #endif
-	put_page(netmem_to_page(netmem));
+	put_netmem(netmem);
 }
 
 /**
diff --git a/include/net/netmem.h b/include/net/netmem.h
index 973fdbcfef38..ecb6b29c93f6 100644
--- a/include/net/netmem.h
+++ b/include/net/netmem.h
@@ -273,4 +273,7 @@ static inline unsigned long netmem_get_dma_addr(netmem_ref netmem)
 	return __netmem_clear_lsb(netmem)->dma_addr;
 }
 
+void get_netmem(netmem_ref netmem);
+void put_netmem(netmem_ref netmem);
+
 #endif /* _NET_NETMEM_H */
-- 
cgit v1.2.3


From 8802087d20c0e1c26c4b4fe30e22264bf8285e51 Mon Sep 17 00:00:00 2001
From: Stanislav Fomichev <sdf@fomichev.me>
Date: Thu, 8 May 2025 00:48:23 +0000
Subject: net: devmem: TCP tx netlink api

Add bind-tx netlink call to attach dmabuf for TX; queue is not
required, only ifindex and dmabuf fd for attachment.

Signed-off-by: Stanislav Fomichev <sdf@fomichev.me>
Signed-off-by: Mina Almasry <almasrymina@google.com>
Link: https://patch.msgid.link/20250508004830.4100853-4-almasrymina@google.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/uapi/linux/netdev.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h
index 7600bf62dbdf..7eb9571786b8 100644
--- a/include/uapi/linux/netdev.h
+++ b/include/uapi/linux/netdev.h
@@ -219,6 +219,7 @@ enum {
 	NETDEV_CMD_QSTATS_GET,
 	NETDEV_CMD_BIND_RX,
 	NETDEV_CMD_NAPI_SET,
+	NETDEV_CMD_BIND_TX,
 
 	__NETDEV_CMD_MAX,
 	NETDEV_CMD_MAX = (__NETDEV_CMD_MAX - 1)
-- 
cgit v1.2.3


From bd61848900bff597764238f3a8ec67c815cd316e Mon Sep 17 00:00:00 2001
From: Mina Almasry <almasrymina@google.com>
Date: Thu, 8 May 2025 00:48:24 +0000
Subject: net: devmem: Implement TX path

Augment dmabuf binding to be able to handle TX. In addition to all the RX
binding, we also create tx_vec needed for the TX path.

Provide API for sendmsg to be able to send dmabufs bound to this device:

- Provide a new dmabuf_tx_cmsg which includes the dmabuf to send from.
- MSG_ZEROCOPY with SCM_DEVMEM_DMABUF cmsg indicates send from dma-buf.

Devmem is uncopyable, so piggyback off the existing MSG_ZEROCOPY
implementation, while disabling instances where MSG_ZEROCOPY falls back
to copying.

We additionally pipe the binding down to the new
zerocopy_fill_skb_from_devmem which fills a TX skb with net_iov netmems
instead of the traditional page netmems.

We also special case skb_frag_dma_map to return the dma-address of these
dmabuf net_iovs instead of attempting to map pages.

The TX path may release the dmabuf in a context where we cannot wait.
This happens when the user unbinds a TX dmabuf while there are still
references to its netmems in the TX path. In that case, the netmems will
be put_netmem'd from a context where we can't unmap the dmabuf. Resolve
this by making __net_devmem_dmabuf_binding_free schedule_work'd.

Based on work by Stanislav Fomichev <sdf@fomichev.me>. A lot of the meat
of the implementation came from devmem TCP RFC v1[1], which included the
TX path, but Stan did all the rebasing on top of netmem/net_iov.

Cc: Stanislav Fomichev <sdf@fomichev.me>
Signed-off-by: Kaiyuan Zhang <kaiyuanz@google.com>
Signed-off-by: Mina Almasry <almasrymina@google.com>
Acked-by: Stanislav Fomichev <sdf@fomichev.me>
Link: https://patch.msgid.link/20250508004830.4100853-5-almasrymina@google.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/linux/skbuff.h | 17 +++++++++++++----
 include/net/sock.h     |  1 +
 2 files changed, 14 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index f3e72be6f634..c7397b17bb08 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1707,13 +1707,16 @@ static inline void skb_set_end_offset(struct sk_buff *skb, unsigned int offset)
 extern const struct ubuf_info_ops msg_zerocopy_ubuf_ops;
 
 struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size,
-				       struct ubuf_info *uarg);
+				       struct ubuf_info *uarg, bool devmem);
 
 void msg_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref);
 
+struct net_devmem_dmabuf_binding;
+
 int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk,
 			    struct sk_buff *skb, struct iov_iter *from,
-			    size_t length);
+			    size_t length,
+			    struct net_devmem_dmabuf_binding *binding);
 
 int zerocopy_fill_skb_from_iter(struct sk_buff *skb,
 				struct iov_iter *from, size_t length);
@@ -1721,12 +1724,14 @@ int zerocopy_fill_skb_from_iter(struct sk_buff *skb,
 static inline int skb_zerocopy_iter_dgram(struct sk_buff *skb,
 					  struct msghdr *msg, int len)
 {
-	return __zerocopy_sg_from_iter(msg, skb->sk, skb, &msg->msg_iter, len);
+	return __zerocopy_sg_from_iter(msg, skb->sk, skb, &msg->msg_iter, len,
+				       NULL);
 }
 
 int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
 			     struct msghdr *msg, int len,
-			     struct ubuf_info *uarg);
+			     struct ubuf_info *uarg,
+			     struct net_devmem_dmabuf_binding *binding);
 
 /* Internal */
 #define skb_shinfo(SKB)	((struct skb_shared_info *)(skb_end_pointer(SKB)))
@@ -3697,6 +3702,10 @@ static inline dma_addr_t __skb_frag_dma_map(struct device *dev,
 					    size_t offset, size_t size,
 					    enum dma_data_direction dir)
 {
+	if (skb_frag_is_net_iov(frag)) {
+		return netmem_to_net_iov(frag->netmem)->dma_addr + offset +
+		       frag->offset;
+	}
 	return dma_map_page(dev, skb_frag_page(frag),
 			    skb_frag_off(frag) + offset, size, dir);
 }
diff --git a/include/net/sock.h b/include/net/sock.h
index f0fabb9fd28a..3e15d7105ad2 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1851,6 +1851,7 @@ struct sockcm_cookie {
 	u32 tsflags;
 	u32 ts_opt_id;
 	u32 priority;
+	u32 dmabuf_id;
 };
 
 static inline void sockcm_init(struct sockcm_cookie *sockc,
-- 
cgit v1.2.3


From 383faec0fd64b9bff15eb5f700f023ec35520a96 Mon Sep 17 00:00:00 2001
From: Mina Almasry <almasrymina@google.com>
Date: Thu, 8 May 2025 00:48:26 +0000
Subject: net: enable driver support for netmem TX

Drivers need to make sure not to pass netmem dma-addrs to the
dma-mapping API in order to support netmem TX.

Add netmem_dma_*() helpers that drivers can use to get special handling
of netmem dma-addrs.

Document in netmem.rst what drivers need to do to support netmem TX.

Signed-off-by: Mina Almasry <almasrymina@google.com>
Acked-by: Stanislav Fomichev <sdf@fomichev.me>
Link: https://patch.msgid.link/20250508004830.4100853-7-almasrymina@google.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/linux/netdevice.h |  2 ++
 include/net/netmem.h      | 20 ++++++++++++++++++++
 2 files changed, 22 insertions(+)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 773167508c82..32a1e41636a9 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1772,6 +1772,7 @@ enum netdev_reg_state {
  *	@lltx:		device supports lockless Tx. Deprecated for real HW
  *			drivers. Mainly used by logical interfaces, such as
  *			bonding and tunnels
+ *	@netmem_tx:	device support netmem_tx.
  *
  *	@name:	This is the first field of the "visible" part of this structure
  *		(i.e. as seen by users in the "Space.c" file).  It is the name
@@ -2087,6 +2088,7 @@ struct net_device {
 	struct_group(priv_flags_fast,
 		unsigned long		priv_flags:32;
 		unsigned long		lltx:1;
+		unsigned long		netmem_tx:1;
 	);
 	const struct net_device_ops *netdev_ops;
 	const struct header_ops *header_ops;
diff --git a/include/net/netmem.h b/include/net/netmem.h
index ecb6b29c93f6..386164fb9c18 100644
--- a/include/net/netmem.h
+++ b/include/net/netmem.h
@@ -8,6 +8,7 @@
 #ifndef _NET_NETMEM_H
 #define _NET_NETMEM_H
 
+#include <linux/dma-mapping.h>
 #include <linux/mm.h>
 #include <net/net_debug.h>
 
@@ -276,4 +277,23 @@ static inline unsigned long netmem_get_dma_addr(netmem_ref netmem)
 void get_netmem(netmem_ref netmem);
 void put_netmem(netmem_ref netmem);
 
+#define netmem_dma_unmap_addr_set(NETMEM, PTR, ADDR_NAME, VAL)   \
+	do {                                                     \
+		if (!netmem_is_net_iov(NETMEM))                  \
+			dma_unmap_addr_set(PTR, ADDR_NAME, VAL); \
+		else                                             \
+			dma_unmap_addr_set(PTR, ADDR_NAME, 0);   \
+	} while (0)
+
+static inline void netmem_dma_unmap_page_attrs(struct device *dev,
+					       dma_addr_t addr, size_t size,
+					       enum dma_data_direction dir,
+					       unsigned long attrs)
+{
+	if (!addr)
+		return;
+
+	dma_unmap_page_attrs(dev, addr, size, dir, attrs);
+}
+
 #endif /* _NET_NETMEM_H */
-- 
cgit v1.2.3


From dc75c3ced10c611f524e9e444303a0249ce32e43 Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Mon, 12 May 2025 22:20:59 +0200
Subject: net: phy: remove stub for mdiobus_register_board_info

The functionality of mdiobus_register_board_info() typically isn't
optional for the caller. Therefore remove the stub.

Note: Currently we have only one caller of mdiobus_register_board_info(),
in a DSA/PHYLINK context. Therefore CONFIG_MDIO_DEVICE is selected anyway.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Link: https://patch.msgid.link/410a2222-c4e8-45b0-9091-d49674caeb00@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/phy.h | 9 ---------
 1 file changed, 9 deletions(-)

(limited to 'include')

diff --git a/include/linux/phy.h b/include/linux/phy.h
index d62d292024bc..7c29d346d4b3 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -2071,17 +2071,8 @@ struct mdio_board_info {
 	const void	*platform_data;
 };
 
-#if IS_ENABLED(CONFIG_MDIO_DEVICE)
 int mdiobus_register_board_info(const struct mdio_board_info *info,
 				unsigned int n);
-#else
-static inline int mdiobus_register_board_info(const struct mdio_board_info *i,
-					      unsigned int n)
-{
-	return 0;
-}
-#endif
-
 
 /**
  * phy_module_driver() - Helper macro for registering PHY drivers
-- 
cgit v1.2.3


From c16608005ccb99fbde3a4cd96eab28e16f148abf Mon Sep 17 00:00:00 2001
From: Shay Drory <shayd@nvidia.com>
Date: Tue, 13 May 2025 11:19:22 +0300
Subject: net: Look for bonding slaves in the bond's network namespace

Update the for_each_netdev_in_bond_rcu macro to iterate through network
devices in the bond's network namespace instead of always using
init_net. This change is safe because:

1. **Bond-Slave Namespace Relationship**: A bond device and its slaves
   must reside in the same network namespace. The bond device's
   namespace is established at creation time and cannot change.

2. **Slave Movement Implications**: Any attempt to move a slave device
   to a different namespace automatically removes it from the bond, as
   per kernel networking stack rules.
   This maintains the invariant that slaves must exist in the same
   namespace as their bond.

This change is part of an effort to enable Link Aggregation (LAG) to
work properly inside custom network namespaces. Previously, the macro
would only find slave devices in the initial network namespace,
preventing proper bonding functionality in custom namespaces.

Signed-off-by: Shay Drory <shayd@nvidia.com>
Signed-off-by: Mark Bloch <mbloch@nvidia.com>
Link: https://patch.msgid.link/20250513081922.525716-1-mbloch@nvidia.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/linux/netdevice.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 32a1e41636a9..9e3a2d8452d6 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3268,7 +3268,7 @@ int call_netdevice_notifiers_info(unsigned long val,
 #define for_each_netdev_continue_rcu(net, d)		\
 	list_for_each_entry_continue_rcu(d, &(net)->dev_base_head, dev_list)
 #define for_each_netdev_in_bond_rcu(bond, slave)	\
-		for_each_netdev_rcu(&init_net, slave)	\
+		for_each_netdev_rcu(dev_net_rcu(bond), slave)	\
 			if (netdev_master_upper_dev_get_rcu(slave) == (bond))
 #define net_device_entry(lh)	list_entry(lh, struct net_device, dev_list)
 
-- 
cgit v1.2.3


From b9eef3391de028fdd88fd7a2f81a4834fc98c9ac Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Mon, 12 May 2025 11:27:26 +0200
Subject: xdp: Use nested-BH locking for system_page_pool
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

system_page_pool is a per-CPU variable and relies on disabled BH for its
locking. Without per-CPU locking in local_bh_disable() on PREEMPT_RT
this data structure requires explicit locking.

Make a struct with a page_pool member (original system_page_pool) and a
local_lock_t and use local_lock_nested_bh() for locking. This change
adds only lockdep coverage and does not alter the functional behaviour
for !PREEMPT_RT.

Cc: Andrew Lunn <andrew+netdev@lunn.ch>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Jesper Dangaard Brouer <hawk@kernel.org>
Cc: John Fastabend <john.fastabend@gmail.com>
Reviewed-by: Toke Høiland-Jørgensen <toke@redhat.com>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Link: https://patch.msgid.link/20250512092736.229935-6-bigeasy@linutronix.de
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/linux/netdevice.h | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 9e3a2d8452d6..73a97cf1bbce 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3503,7 +3503,12 @@ struct softnet_data {
 };
 
 DECLARE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
-DECLARE_PER_CPU(struct page_pool *, system_page_pool);
+
+struct page_pool_bh {
+	struct page_pool *pool;
+	local_lock_t bh_lock;
+};
+DECLARE_PER_CPU(struct page_pool_bh, system_page_pool);
 
 #ifndef CONFIG_PREEMPT_RT
 static inline int dev_recursion_level(void)
-- 
cgit v1.2.3


From 7fe70c06a182a140be9996b02256d907e114479a Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Mon, 12 May 2025 11:27:31 +0200
Subject: net/sched: act_mirred: Move the recursion counter to struct
 netdev_xmit

mirred_nest_level is a per-CPU variable and relies on disabled BH for its
locking. Without per-CPU locking in local_bh_disable() on PREEMPT_RT
this data structure requires explicit locking.

Move mirred_nest_level to struct netdev_xmit as u8, provide wrappers.

Cc: Jamal Hadi Salim <jhs@mojatatu.com>
Cc: Cong Wang <xiyou.wangcong@gmail.com>
Cc: Jiri Pirko <jiri@resnulli.us>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Reviewed-by: Juri Lelli <juri.lelli@redhat.com>
Link: https://patch.msgid.link/20250512092736.229935-11-bigeasy@linutronix.de
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/linux/netdevice_xmit.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include')

diff --git a/include/linux/netdevice_xmit.h b/include/linux/netdevice_xmit.h
index 38325e070296..848735b3a7c0 100644
--- a/include/linux/netdevice_xmit.h
+++ b/include/linux/netdevice_xmit.h
@@ -8,6 +8,9 @@ struct netdev_xmit {
 #ifdef CONFIG_NET_EGRESS
 	u8  skip_txqueue;
 #endif
+#if IS_ENABLED(CONFIG_NET_ACT_MIRRED)
+	u8 sched_mirred_nest;
+#endif
 };
 
 #endif
-- 
cgit v1.2.3


From c1269d3d12b88151ee4c109624b5022d53a11738 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 13 May 2025 19:39:09 +0000
Subject: tcp: add tcp_rcvbuf_grow() tracepoint

Provide a new tracepoint to better understand
tcp_rcv_space_adjust() (currently broken) behavior.

Call it only when tcp_rcv_space_adjust() has a chance
to make a change.

I chose to leave trace_tcp_rcv_space_adjust() as is,
because commit 6163849d289b ("net: introduce a new tracepoint
for tcp_rcv_space_adjust") intent was to get it called after
each data delivery to user space.

Tested:

Pair of hosts in the same rack. Ideally, sk->sk_rcvbuf should be kept small.

echo "4096 131072 33554432" >/proc/sys/net/ipv4/tcp_rmem
./netserver
perf record -C10 -e tcp:tcp_rcvbuf_grow sleep 30

<launch from client : netperf -H server -T,10>

Trace for a TS enabled TCP flow (with standard ms granularity)

perf script // We can see that sk_rcvbuf is growing very fast to tcp_mem[2]
  260.500397: tcp:tcp_rcvbuf_grow: time=291 rtt_us=274 copied=110592 inq=0 space=41080 ooo=0 scaling_ratio=230 rcvbuf=131072 ...
  260.501333: tcp:tcp_rcvbuf_grow: time=555 rtt_us=364 copied=333824 inq=0 space=110592 ooo=0 scaling_ratio=230 rcvbuf=1399144 ...
  260.501664: tcp:tcp_rcvbuf_grow: time=331 rtt_us=330 copied=798720 inq=0 space=333824 ooo=0 scaling_ratio=230 rcvbuf=4110551 ...
  260.502003: tcp:tcp_rcvbuf_grow: time=340 rtt_us=330 copied=1040384 inq=49152 space=798720 ooo=0 scaling_ratio=230 rcvbuf=7006410 ...
  260.502483: tcp:tcp_rcvbuf_grow: time=479 rtt_us=330 copied=2658304 inq=49152 space=1040384 ooo=0 scaling_ratio=230 rcvbuf=7006410 ...
  260.502899: tcp:tcp_rcvbuf_grow: time=416 rtt_us=413 copied=4026368 inq=147456 space=2658304 ooo=0 scaling_ratio=230 rcvbuf=24622616 ...
  260.504233: tcp:tcp_rcvbuf_grow: time=493 rtt_us=487 copied=4800512 inq=196608 space=4026368 ooo=0 scaling_ratio=230 rcvbuf=24622616 ...
  260.504792: tcp:tcp_rcvbuf_grow: time=559 rtt_us=551 copied=5672960 inq=49152 space=4800512 ooo=0 scaling_ratio=230 rcvbuf=24622616 ...
  260.506614: tcp:tcp_rcvbuf_grow: time=610 rtt_us=607 copied=6688768 inq=180224 space=5672960 ooo=0 scaling_ratio=230 rcvbuf=24622616 ...
  260.507280: tcp:tcp_rcvbuf_grow: time=666 rtt_us=656 copied=6868992 inq=49152 space=6688768 ooo=0 scaling_ratio=230 rcvbuf=24622616 ...
  260.507979: tcp:tcp_rcvbuf_grow: time=699 rtt_us=699 copied=7000064 inq=0 space=6868992 ooo=0 scaling_ratio=230 rcvbuf=24622616 ...
  260.508681: tcp:tcp_rcvbuf_grow: time=703 rtt_us=699 copied=7208960 inq=0 space=7000064 ooo=0 scaling_ratio=230 rcvbuf=24622616 ...
  260.509426: tcp:tcp_rcvbuf_grow: time=744 rtt_us=737 copied=7569408 inq=0 space=7208960 ooo=0 scaling_ratio=230 rcvbuf=24622616 ...
  260.510213: tcp:tcp_rcvbuf_grow: time=787 rtt_us=770 copied=7880704 inq=49152 space=7569408 ooo=0 scaling_ratio=230 rcvbuf=24622616 ...
  260.511013: tcp:tcp_rcvbuf_grow: time=801 rtt_us=798 copied=8339456 inq=0 space=7880704 ooo=0 scaling_ratio=230 rcvbuf=24622616 ...
  260.511860: tcp:tcp_rcvbuf_grow: time=847 rtt_us=824 copied=8601600 inq=49152 space=8339456 ooo=0 scaling_ratio=230 rcvbuf=24622616 ...
  260.512710: tcp:tcp_rcvbuf_grow: time=850 rtt_us=846 copied=8814592 inq=65536 space=8601600 ooo=0 scaling_ratio=230 rcvbuf=24622616 ...
  260.514428: tcp:tcp_rcvbuf_grow: time=871 rtt_us=865 copied=8855552 inq=49152 space=8814592 ooo=0 scaling_ratio=230 rcvbuf=24622616 ...
  260.515333: tcp:tcp_rcvbuf_grow: time=905 rtt_us=882 copied=9228288 inq=49152 space=8855552 ooo=0 scaling_ratio=230 rcvbuf=24622616 ...
  260.516237: tcp:tcp_rcvbuf_grow: time=905 rtt_us=896 copied=9371648 inq=49152 space=9228288 ooo=0 scaling_ratio=230 rcvbuf=24622616 ...
  260.517149: tcp:tcp_rcvbuf_grow: time=911 rtt_us=909 copied=9543680 inq=49152 space=9371648 ooo=0 scaling_ratio=230 rcvbuf=24622616 ...
  260.518070: tcp:tcp_rcvbuf_grow: time=921 rtt_us=921 copied=9793536 inq=0 space=9543680 ooo=0 scaling_ratio=230 rcvbuf=24622616 ...
  260.520895: tcp:tcp_rcvbuf_grow: time=948 rtt_us=947 copied=10203136 inq=114688 space=9793536 ooo=0 scaling_ratio=230 rcvbuf=24622616 ...
  260.521853: tcp:tcp_rcvbuf_grow: time=959 rtt_us=954 copied=10293248 inq=57344 space=10203136 ooo=0 scaling_ratio=230 rcvbuf=24691992 ...
  260.522818: tcp:tcp_rcvbuf_grow: time=964 rtt_us=959 copied=10330112 inq=0 space=10293248 ooo=0 scaling_ratio=230 rcvbuf=24691992 ...
  260.524760: tcp:tcp_rcvbuf_grow: time=979 rtt_us=969 copied=10633216 inq=49152 space=10330112 ooo=0 scaling_ratio=230 rcvbuf=24691992 ...
  260.526709: tcp:tcp_rcvbuf_grow: time=975 rtt_us=973 copied=12013568 inq=163840 space=10633216 ooo=0 scaling_ratio=230 rcvbuf=25136755 ...
  260.527694: tcp:tcp_rcvbuf_grow: time=985 rtt_us=976 copied=12025856 inq=32768 space=12013568 ooo=0 scaling_ratio=230 rcvbuf=33554432 ...
  260.530655: tcp:tcp_rcvbuf_grow: time=991 rtt_us=986 copied=12050432 inq=98304 space=12025856 ooo=0 scaling_ratio=230 rcvbuf=33554432 ...
  260.533626: tcp:tcp_rcvbuf_grow: time=993 rtt_us=989 copied=12124160 inq=0 space=12050432 ooo=0 scaling_ratio=230 rcvbuf=33554432 ...
  260.538606: tcp:tcp_rcvbuf_grow: time=1000 rtt_us=994 copied=12222464 inq=49152 space=12124160 ooo=0 scaling_ratio=230 rcvbuf=33554432 ...
  260.545605: tcp:tcp_rcvbuf_grow: time=1005 rtt_us=998 copied=12263424 inq=81920 space=12222464 ooo=0 scaling_ratio=230 rcvbuf=33554432 ...
  260.553626: tcp:tcp_rcvbuf_grow: time=1005 rtt_us=999 copied=12320768 inq=12288 space=12263424 ooo=0 scaling_ratio=230 rcvbuf=33554432 ...
  260.589749: tcp:tcp_rcvbuf_grow: time=1001 rtt_us=1000 copied=12398592 inq=16384 space=12320768 ooo=0 scaling_ratio=230 rcvbuf=33554432 ...
  260.806577: tcp:tcp_rcvbuf_grow: time=1010 rtt_us=1000 copied=12402688 inq=32768 space=12398592 ooo=0 scaling_ratio=230 rcvbuf=33554432 ...
  261.002386: tcp:tcp_rcvbuf_grow: time=1002 rtt_us=1000 copied=12419072 inq=98304 space=12402688 ooo=0 scaling_ratio=230 rcvbuf=33554432 ...
  261.803432: tcp:tcp_rcvbuf_grow: time=1013 rtt_us=1000 copied=12468224 inq=49152 space=12419072 ooo=0 scaling_ratio=230 rcvbuf=33554432 ...
  261.829533: tcp:tcp_rcvbuf_grow: time=1004 rtt_us=1000 copied=12615680 inq=0 space=12468224 ooo=0 scaling_ratio=230 rcvbuf=33554432 ...
  265.505435: tcp:tcp_rcvbuf_grow: time=1007 rtt_us=1000 copied=12632064 inq=32768 space=12615680 ooo=0 scaling_ratio=230 rcvbuf=33554432 ...

We also see rtt_us going gradually to 1000 usec, causing massive overshoot.

Trace for a usec TS enabled TCP flow (us granularity)

perf script // We can see that sk_rcvbuf is growing to a smaller value,
               thanks to tight rtt_us values.
 1509.273955: tcp:tcp_rcvbuf_grow: time=396 rtt_us=377 copied=110592 inq=0 space=41080 ooo=0 scaling_ratio=230 rcvbuf=131072 ...
 1509.274366: tcp:tcp_rcvbuf_grow: time=412 rtt_us=365 copied=129024 inq=0 space=110592 ooo=0 scaling_ratio=230 rcvbuf=1399144 ...
 1509.274738: tcp:tcp_rcvbuf_grow: time=372 rtt_us=355 copied=194560 inq=0 space=129024 ooo=0 scaling_ratio=230 rcvbuf=1399144 ...
 1509.275020: tcp:tcp_rcvbuf_grow: time=282 rtt_us=257 copied=401408 inq=0 space=194560 ooo=0 scaling_ratio=230 rcvbuf=1399144 ...
 1509.275190: tcp:tcp_rcvbuf_grow: time=170 rtt_us=144 copied=741376 inq=229376 space=401408 ooo=0 scaling_ratio=230 rcvbuf=3021625 ...
 1509.275300: tcp:tcp_rcvbuf_grow: time=110 rtt_us=110 copied=1146880 inq=65536 space=741376 ooo=0 scaling_ratio=230 rcvbuf=4642390 ...
 1509.275449: tcp:tcp_rcvbuf_grow: time=149 rtt_us=106 copied=1310720 inq=737280 space=1146880 ooo=0 scaling_ratio=230 rcvbuf=5498637 ...
 1509.275560: tcp:tcp_rcvbuf_grow: time=111 rtt_us=107 copied=1388544 inq=430080 space=1310720 ooo=0 scaling_ratio=230 rcvbuf=5498637 ...
 1509.275674: tcp:tcp_rcvbuf_grow: time=114 rtt_us=113 copied=1495040 inq=421888 space=1388544 ooo=0 scaling_ratio=230 rcvbuf=5498637 ...
 1509.275800: tcp:tcp_rcvbuf_grow: time=126 rtt_us=126 copied=1572864 inq=77824 space=1495040 ooo=0 scaling_ratio=230 rcvbuf=5498637 ...
 1509.275968: tcp:tcp_rcvbuf_grow: time=168 rtt_us=161 copied=1863680 inq=172032 space=1572864 ooo=0 scaling_ratio=230 rcvbuf=5498637 ...
 1509.276129: tcp:tcp_rcvbuf_grow: time=161 rtt_us=161 copied=1941504 inq=204800 space=1863680 ooo=0 scaling_ratio=230 rcvbuf=5782790 ...
 1509.276288: tcp:tcp_rcvbuf_grow: time=159 rtt_us=158 copied=1990656 inq=131072 space=1941504 ooo=0 scaling_ratio=230 rcvbuf=5782790 ...
 1509.276900: tcp:tcp_rcvbuf_grow: time=228 rtt_us=226 copied=2883584 inq=266240 space=1990656 ooo=0 scaling_ratio=230 rcvbuf=5782790 ...
 1509.277819: tcp:tcp_rcvbuf_grow: time=242 rtt_us=236 copied=3022848 inq=0 space=2883584 ooo=0 scaling_ratio=230 rcvbuf=12316197 ...
 1509.278072: tcp:tcp_rcvbuf_grow: time=253 rtt_us=247 copied=3055616 inq=49152 space=3022848 ooo=0 scaling_ratio=230 rcvbuf=12316197 ...
 1509.279560: tcp:tcp_rcvbuf_grow: time=268 rtt_us=264 copied=3133440 inq=180224 space=3055616 ooo=0 scaling_ratio=230 rcvbuf=12316197 ...
 1509.279833: tcp:tcp_rcvbuf_grow: time=274 rtt_us=270 copied=3424256 inq=0 space=3133440 ooo=0 scaling_ratio=230 rcvbuf=12316197 ...
 1509.282187: tcp:tcp_rcvbuf_grow: time=277 rtt_us=273 copied=3465216 inq=180224 space=3424256 ooo=0 scaling_ratio=230 rcvbuf=12316197 ...
 1509.284685: tcp:tcp_rcvbuf_grow: time=292 rtt_us=292 copied=3481600 inq=147456 space=3465216 ooo=0 scaling_ratio=230 rcvbuf=12316197 ...
 1509.284983: tcp:tcp_rcvbuf_grow: time=297 rtt_us=295 copied=3702784 inq=45056 space=3481600 ooo=0 scaling_ratio=230 rcvbuf=12316197 ...
 1509.285596: tcp:tcp_rcvbuf_grow: time=311 rtt_us=310 copied=3723264 inq=40960 space=3702784 ooo=0 scaling_ratio=230 rcvbuf=12316197 ...
 1509.285909: tcp:tcp_rcvbuf_grow: time=313 rtt_us=304 copied=3846144 inq=196608 space=3723264 ooo=0 scaling_ratio=230 rcvbuf=12316197 ...
 1509.291654: tcp:tcp_rcvbuf_grow: time=322 rtt_us=311 copied=3960832 inq=49152 space=3846144 ooo=0 scaling_ratio=230 rcvbuf=12316197 ...
 1509.291986: tcp:tcp_rcvbuf_grow: time=333 rtt_us=330 copied=4075520 inq=360448 space=3960832 ooo=0 scaling_ratio=230 rcvbuf=12316197 ...
 1509.292319: tcp:tcp_rcvbuf_grow: time=332 rtt_us=332 copied=4079616 inq=65536 space=4075520 ooo=0 scaling_ratio=230 rcvbuf=12316197 ...
 1509.292666: tcp:tcp_rcvbuf_grow: time=348 rtt_us=347 copied=4177920 inq=212992 space=4079616 ooo=0 scaling_ratio=230 rcvbuf=12316197 ...
 1509.293015: tcp:tcp_rcvbuf_grow: time=349 rtt_us=345 copied=4276224 inq=262144 space=4177920 ooo=0 scaling_ratio=230 rcvbuf=12316197 ...
 1509.293371: tcp:tcp_rcvbuf_grow: time=356 rtt_us=346 copied=4415488 inq=49152 space=4276224 ooo=0 scaling_ratio=230 rcvbuf=12316197 ...
 1509.515798: tcp:tcp_rcvbuf_grow: time=424 rtt_us=411 copied=4833280 inq=81920 space=4415488 ooo=0 scaling_ratio=230 rcvbuf=12316197 ...

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Wei Wang <weiwan@google.com>
Reviewed-by: Neal Cardwell <ncardwell@google.com>
Link: https://patch.msgid.link/20250513193919.1089692-2-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/trace/events/tcp.h | 73 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 73 insertions(+)

(limited to 'include')

diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h
index 53e878fa14d1..006c2116c8f6 100644
--- a/include/trace/events/tcp.h
+++ b/include/trace/events/tcp.h
@@ -213,6 +213,79 @@ DEFINE_EVENT(tcp_event_sk, tcp_rcv_space_adjust,
 	TP_ARGS(sk)
 );
 
+TRACE_EVENT(tcp_rcvbuf_grow,
+
+	TP_PROTO(struct sock *sk, int time),
+
+	TP_ARGS(sk, time),
+
+	TP_STRUCT__entry(
+		__field(int, time)
+		__field(__u32, rtt_us)
+		__field(__u32, copied)
+		__field(__u32, inq)
+		__field(__u32, space)
+		__field(__u32, ooo_space)
+		__field(__u32, rcvbuf)
+		__field(__u8, scaling_ratio)
+		__field(__u16, sport)
+		__field(__u16, dport)
+		__field(__u16, family)
+		__array(__u8, saddr, 4)
+		__array(__u8, daddr, 4)
+		__array(__u8, saddr_v6, 16)
+		__array(__u8, daddr_v6, 16)
+		__field(const void *, skaddr)
+		__field(__u64, sock_cookie)
+	),
+
+	TP_fast_assign(
+		struct inet_sock *inet = inet_sk(sk);
+		struct tcp_sock *tp = tcp_sk(sk);
+		__be32 *p32;
+
+		__entry->time = time;
+		__entry->rtt_us = tp->rcv_rtt_est.rtt_us >> 3;
+		__entry->copied = tp->copied_seq - tp->rcvq_space.seq;
+		__entry->inq = tp->rcv_nxt - tp->copied_seq;
+		__entry->space = tp->rcvq_space.space;
+		__entry->ooo_space = RB_EMPTY_ROOT(&tp->out_of_order_queue) ? 0 :
+				     TCP_SKB_CB(tp->ooo_last_skb)->end_seq -
+				     tp->rcv_nxt;
+
+		__entry->rcvbuf = sk->sk_rcvbuf;
+		__entry->scaling_ratio = tp->scaling_ratio;
+		__entry->sport = ntohs(inet->inet_sport);
+		__entry->dport = ntohs(inet->inet_dport);
+		__entry->family = sk->sk_family;
+
+		p32 = (__be32 *) __entry->saddr;
+		*p32 = inet->inet_saddr;
+
+		p32 = (__be32 *) __entry->daddr;
+		*p32 = inet->inet_daddr;
+
+		TP_STORE_ADDRS(__entry, inet->inet_saddr, inet->inet_daddr,
+			       sk->sk_v6_rcv_saddr, sk->sk_v6_daddr);
+
+		__entry->skaddr = sk;
+		__entry->sock_cookie = sock_gen_cookie(sk);
+	),
+
+	TP_printk("time=%u rtt_us=%u copied=%u inq=%u space=%u ooo=%u scaling_ratio=%u rcvbuf=%u "
+		  "family=%s sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 "
+		  "saddrv6=%pI6c daddrv6=%pI6c skaddr=%p sock_cookie=%llx",
+		  __entry->time, __entry->rtt_us, __entry->copied,
+		  __entry->inq, __entry->space, __entry->ooo_space,
+		  __entry->scaling_ratio, __entry->rcvbuf,
+		  show_family_name(__entry->family),
+		  __entry->sport, __entry->dport,
+		  __entry->saddr, __entry->daddr,
+		  __entry->saddr_v6, __entry->daddr_v6,
+		  __entry->skaddr,
+		  __entry->sock_cookie)
+);
+
 TRACE_EVENT(tcp_retransmit_synack,
 
 	TP_PROTO(const struct sock *sk, const struct request_sock *req),
-- 
cgit v1.2.3


From ea33537d82921e71f852ea2ed985acc562125efe Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 13 May 2025 19:39:12 +0000
Subject: tcp: add receive queue awareness in tcp_rcv_space_adjust()

If the application can not drain fast enough a TCP socket queue,
tcp_rcv_space_adjust() can overestimate tp->rcvq_space.space.

Then sk->sk_rcvbuf can grow and hit tcp_rmem[2] for no good reason.

Fix this by taking into account the number of available bytes.

Keeping sk->sk_rcvbuf at the right size allows better cache efficiency.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Wei Wang <weiwan@google.com>
Link: https://patch.msgid.link/20250513193919.1089692-5-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/tcp.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index a8af71623ba7..29f59d50dc73 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -340,7 +340,7 @@ struct tcp_sock {
 	} rcv_rtt_est;
 /* Receiver queue space */
 	struct {
-		u32	space;
+		int	space;
 		u32	seq;
 		u64	time;
 	} rcvq_space;
-- 
cgit v1.2.3


From 1119e5519dcdb7b3527f5d85accf9c7aa02b2b28 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Tue, 13 May 2025 15:17:52 -0700
Subject: net: sched: uapi: add more sanely named duplicate defines

The TCA_FLOWER_KEY_CFM enum has a UNSPEC and MAX with _OPT
in the name, but the real attributes don't. Add a MAX that
more reasonably matches the attrs.

The PAD in TCA_TAPRIO is the only attr which doesn't have
_ATTR in it, perhaps signifying that it's not a real attr?
If so interesting idea in abstract but it makes codegen painful.

Reviewed-by: Simon Horman <horms@kernel.org>
Link: https://patch.msgid.link/20250513221752.843102-1-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/uapi/linux/pkt_cls.h   | 1 +
 include/uapi/linux/pkt_sched.h | 1 +
 2 files changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h
index 490821364165..28d94b11d1aa 100644
--- a/include/uapi/linux/pkt_cls.h
+++ b/include/uapi/linux/pkt_cls.h
@@ -697,6 +697,7 @@ enum {
 };
 
 #define TCA_FLOWER_KEY_CFM_OPT_MAX (__TCA_FLOWER_KEY_CFM_OPT_MAX - 1)
+#define TCA_FLOWER_KEY_CFM_MAX	   (__TCA_FLOWER_KEY_CFM_OPT_MAX - 1)
 
 #define TCA_FLOWER_MASK_FLAGS_RANGE	(1 << 0) /* Range-based match */
 
diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
index 9ea874395717..3e41349f3fa2 100644
--- a/include/uapi/linux/pkt_sched.h
+++ b/include/uapi/linux/pkt_sched.h
@@ -1182,6 +1182,7 @@ enum {
 	TCA_TAPRIO_ATTR_SCHED_SINGLE_ENTRY, /* single entry */
 	TCA_TAPRIO_ATTR_SCHED_CLOCKID, /* s32 */
 	TCA_TAPRIO_PAD,
+	TCA_TAPRIO_ATTR_PAD = TCA_TAPRIO_PAD,
 	TCA_TAPRIO_ATTR_ADMIN_SCHED, /* The admin sched, only used in dump */
 	TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME, /* s64 */
 	TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME_EXTENSION, /* s64 */
-- 
cgit v1.2.3


From 9cd5ef0b8c04c46a15c8f5d002f02ea0d0477790 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 15 May 2025 10:03:54 +0000
Subject: net: rfs: add sock_rps_delete_flow() helper

RFS can exhibit lower performance for workloads using short-lived
flows and a small set of 4-tuple.

This is often the case for load-testers, using a pair of hosts,
if the server has a single listener port.

Typical use case :

Server : tcp_crr -T128 -F1000 -6 -U -l30 -R 14250
Client : tcp_crr -T128 -F1000 -6 -U -l30 -c -H server | grep local_throughput

This is because RFS global hash table contains stale information,
when the same RSS key is recycled for another socket and another cpu.

Make sure to undo the changes and go back to initial state when
a flow is disconnected.

Performance of the above test is increased by 22 %,
going from 372604 transactions per second to 457773.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Octavian Purdila <tavip@google.com>
Reviewed-by: Neal Cardwell <ncardwell@google.com>
Link: https://patch.msgid.link/20250515100354.3339920-1-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/rps.h | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

(limited to 'include')

diff --git a/include/net/rps.h b/include/net/rps.h
index 507f4aa5d39b..d8ab3a08bcc4 100644
--- a/include/net/rps.h
+++ b/include/net/rps.h
@@ -123,6 +123,30 @@ static inline void sock_rps_record_flow(const struct sock *sk)
 #endif
 }
 
+static inline void sock_rps_delete_flow(const struct sock *sk)
+{
+#ifdef CONFIG_RPS
+	struct rps_sock_flow_table *table;
+	u32 hash, index;
+
+	if (!static_branch_unlikely(&rfs_needed))
+		return;
+
+	hash = READ_ONCE(sk->sk_rxhash);
+	if (!hash)
+		return;
+
+	rcu_read_lock();
+	table = rcu_dereference(net_hotdata.rps_sock_flow_table);
+	if (table) {
+		index = hash & table->mask;
+		if (READ_ONCE(table->ents[index]) != RPS_NO_CPU)
+			WRITE_ONCE(table->ents[index], RPS_NO_CPU);
+	}
+	rcu_read_unlock();
+#endif
+}
+
 static inline u32 rps_input_queue_tail_incr(struct softnet_data *sd)
 {
 #ifdef CONFIG_RPS
-- 
cgit v1.2.3


From 7b151e4efdde7cc7cfaae66e497d12487a70c6e9 Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Wed, 14 May 2025 20:54:29 +0200
Subject: net: phy: fixed_phy: remove fixed_phy_register_with_gpiod

Since its introduction 6 yrs ago this function has never had a user.
So remove it.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Link: https://patch.msgid.link/ccbeef28-65ae-4e28-b1db-816c44338dee@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/phy_fixed.h | 14 --------------
 1 file changed, 14 deletions(-)

(limited to 'include')

diff --git a/include/linux/phy_fixed.h b/include/linux/phy_fixed.h
index 1acafd86ab13..3392c09b5d24 100644
--- a/include/linux/phy_fixed.h
+++ b/include/linux/phy_fixed.h
@@ -13,7 +13,6 @@ struct fixed_phy_status {
 };
 
 struct device_node;
-struct gpio_desc;
 struct net_device;
 
 #if IS_ENABLED(CONFIG_FIXED_PHY)
@@ -24,11 +23,6 @@ extern struct phy_device *fixed_phy_register(unsigned int irq,
 					     struct fixed_phy_status *status,
 					     struct device_node *np);
 
-extern struct phy_device *
-fixed_phy_register_with_gpiod(unsigned int irq,
-			      struct fixed_phy_status *status,
-			      struct gpio_desc *gpiod);
-
 extern void fixed_phy_unregister(struct phy_device *phydev);
 extern int fixed_phy_set_link_update(struct phy_device *phydev,
 			int (*link_update)(struct net_device *,
@@ -46,14 +40,6 @@ static inline struct phy_device *fixed_phy_register(unsigned int irq,
 	return ERR_PTR(-ENODEV);
 }
 
-static inline struct phy_device *
-fixed_phy_register_with_gpiod(unsigned int irq,
-			      struct fixed_phy_status *status,
-			      struct gpio_desc *gpiod)
-{
-	return ERR_PTR(-ENODEV);
-}
-
 static inline void fixed_phy_unregister(struct phy_device *phydev)
 {
 }
-- 
cgit v1.2.3


From a462903fa22541f212134fba81084315ad843e6e Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Fri, 16 May 2025 13:59:27 +0200
Subject: net: netlink: reduce extack cookie size

Seems like the extack cookie hasn't found any users outside
of wireless, which always uses nl_set_extack_cookie_u64().
Thus, allocating 20 bytes for it is pointless, reduce that
to 8 bytes, and add a BUILD_BUG_ON() to ensure it's enough
(obviously it is, for a u64, but in case it changes again.)

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Link: https://patch.msgid.link/20250516115927.38209-2-johannes@sipsolutions.net
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/netlink.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/netlink.h b/include/linux/netlink.h
index c3ae84a77e16..882e9c1b6c1d 100644
--- a/include/linux/netlink.h
+++ b/include/linux/netlink.h
@@ -63,7 +63,7 @@ netlink_kernel_create(struct net *net, int unit, struct netlink_kernel_cfg *cfg)
 }
 
 /* this can be increased when necessary - don't expose to userland */
-#define NETLINK_MAX_COOKIE_LEN	20
+#define NETLINK_MAX_COOKIE_LEN	8
 #define NETLINK_MAX_FMTMSG_LEN	80
 
 /**
@@ -212,6 +212,7 @@ static inline void nl_set_extack_cookie_u64(struct netlink_ext_ack *extack,
 {
 	if (!extack)
 		return;
+	BUILD_BUG_ON(sizeof(extack->cookie) < sizeof(cookie));
 	memcpy(extack->cookie, &cookie, sizeof(cookie));
 	extack->cookie_len = sizeof(cookie);
 }
-- 
cgit v1.2.3


From 84b21e61ebd64931d865ce3df49d930db8c9e2cd Mon Sep 17 00:00:00 2001
From: Gur Stavi <gur.stavi@huawei.com>
Date: Sun, 18 May 2025 13:00:54 +0300
Subject: queue_api: reduce risk of name collision over txq

Rename local variable in macros from txq to _txq.
When macro parameter get_desc is expanded it is likely to have a txq
token that refers to a different txq variable at the caller's site.

Signed-off-by: Gur Stavi <gur.stavi@huawei.com>
Reviewed-by: Jakub Kicinski <kuba@kernel.org>
Link: https://patch.msgid.link/95b60d218f004308486d92ed17c8cc6f28bac09d.1747559621.git.gur.stavi@huawei.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/netdev_queues.h | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/net/netdev_queues.h b/include/net/netdev_queues.h
index 069ff35a72de..ba2eaf39089b 100644
--- a/include/net/netdev_queues.h
+++ b/include/net/netdev_queues.h
@@ -288,27 +288,27 @@ netdev_txq_completed_mb(struct netdev_queue *dev_queue,
 
 #define netif_subqueue_try_stop(dev, idx, get_desc, start_thrs)		\
 	({								\
-		struct netdev_queue *txq;				\
+		struct netdev_queue *_txq;				\
 									\
-		txq = netdev_get_tx_queue(dev, idx);			\
-		netif_txq_try_stop(txq, get_desc, start_thrs);		\
+		_txq = netdev_get_tx_queue(dev, idx);			\
+		netif_txq_try_stop(_txq, get_desc, start_thrs);		\
 	})
 
 #define netif_subqueue_maybe_stop(dev, idx, get_desc, stop_thrs, start_thrs) \
 	({								\
-		struct netdev_queue *txq;				\
+		struct netdev_queue *_txq;				\
 									\
-		txq = netdev_get_tx_queue(dev, idx);			\
-		netif_txq_maybe_stop(txq, get_desc, stop_thrs, start_thrs); \
+		_txq = netdev_get_tx_queue(dev, idx);			\
+		netif_txq_maybe_stop(_txq, get_desc, stop_thrs, start_thrs); \
 	})
 
 #define netif_subqueue_completed_wake(dev, idx, pkts, bytes,		\
 				      get_desc, start_thrs)		\
 	({								\
-		struct netdev_queue *txq;				\
+		struct netdev_queue *_txq;				\
 									\
-		txq = netdev_get_tx_queue(dev, idx);			\
-		netif_txq_completed_wake(txq, pkts, bytes,		\
+		_txq = netdev_get_tx_queue(dev, idx);			\
+		netif_txq_completed_wake(_txq, pkts, bytes,		\
 					 get_desc, start_thrs);		\
 	})
 
-- 
cgit v1.2.3


From 31be641d74267d98317ef5a2b90e6200511cabb3 Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Thu, 15 May 2025 10:11:54 +0200
Subject: net: phy: make mdio consumer / device layer a separate module

After having factored out the provider part from mdio_bus.c, we can
make the mdio consumer / device layer a separate module. This also
allows removing the Kconfig symbol MDIO_DEVICE.
The module init / exit functions from mdio_bus.c no longer have to be
called from phy_device.c. The link order defined in
drivers/net/phy/Makefile ensures that init / exit functions are called
in the right order.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Link: https://patch.msgid.link/dba6b156-5748-44ce-b5e2-e8dc2fcee5a7@gmail.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/linux/phy.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/phy.h b/include/linux/phy.h
index 7c29d346d4b3..92a88b5ce356 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -2033,9 +2033,6 @@ int phy_ethtool_set_link_ksettings(struct net_device *ndev,
 				   const struct ethtool_link_ksettings *cmd);
 int phy_ethtool_nway_reset(struct net_device *ndev);
 
-int __init mdio_bus_init(void);
-void mdio_bus_exit(void);
-
 int phy_ethtool_get_strings(struct phy_device *phydev, u8 *data);
 int phy_ethtool_get_sset_count(struct phy_device *phydev);
 int phy_ethtool_get_stats(struct phy_device *phydev,
-- 
cgit v1.2.3


From 4c2bd7913f52b1e5c978edf56cdef39c30a1f603 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Sat, 17 May 2025 13:08:10 -0700
Subject: net: let lockdep compare instance locks

AFAIU always returning -1 from lockdep's compare function
basically disables checking of dependencies between given
locks. Try to be a little more precise about what guarantees
that instance locks won't deadlock.

Right now we only nest them under protection of rtnl_lock.
Mostly in unregister_netdevice_many() and dev_close_many().

Acked-by: Stanislav Fomichev <sdf@fomichev.me>
Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Link: https://patch.msgid.link/20250517200810.466531-1-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/netdev_lock.h | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/net/netdev_lock.h b/include/net/netdev_lock.h
index 2a753813f849..c345afecd4c5 100644
--- a/include/net/netdev_lock.h
+++ b/include/net/netdev_lock.h
@@ -99,16 +99,15 @@ static inline void netdev_unlock_ops_compat(struct net_device *dev)
 static inline int netdev_lock_cmp_fn(const struct lockdep_map *a,
 				     const struct lockdep_map *b)
 {
-	/* Only lower devices currently grab the instance lock, so no
-	 * real ordering issues can occur. In the near future, only
-	 * hardware devices will grab instance lock which also does not
-	 * involve any ordering. Suppress lockdep ordering warnings
-	 * until (if) we start grabbing instance lock on pure SW
-	 * devices (bond/team/veth/etc).
-	 */
 	if (a == b)
 		return 0;
-	return -1;
+
+	/* Allow locking multiple devices only under rtnl_lock,
+	 * the exact order doesn't matter.
+	 * Note that upper devices don't lock their ops, so nesting
+	 * mostly happens in batched device removal for now.
+	 */
+	return lockdep_rtnl_is_held() ? -1 : 1;
 }
 
 #define netdev_lockdep_set_classes(dev)				\
-- 
cgit v1.2.3


From 3f1716ee0f6c63795e6d225e3f5ec3825cd2bd57 Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Sat, 17 May 2025 22:34:32 +0200
Subject: net: phy: fixed_phy: remove irq argument from fixed_phy_add

All callers pass PHY_POLL, therefore remove irq argument from
fixed_phy_add().

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Reviewed-by: Florian Fainelli <florian.fainelli@broadcom.com>
Acked-by: Greg Ungerer <gerg@linux-m68k.org>
Link: https://patch.msgid.link/b3b9b3bc-c310-4a54-b376-c909c83575de@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/phy_fixed.h | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/phy_fixed.h b/include/linux/phy_fixed.h
index 3392c09b5d24..316bb4deda37 100644
--- a/include/linux/phy_fixed.h
+++ b/include/linux/phy_fixed.h
@@ -17,8 +17,7 @@ struct net_device;
 
 #if IS_ENABLED(CONFIG_FIXED_PHY)
 extern int fixed_phy_change_carrier(struct net_device *dev, bool new_carrier);
-extern int fixed_phy_add(unsigned int irq, int phy_id,
-			 struct fixed_phy_status *status);
+int fixed_phy_add(int phy_id, struct fixed_phy_status *status);
 extern struct phy_device *fixed_phy_register(unsigned int irq,
 					     struct fixed_phy_status *status,
 					     struct device_node *np);
@@ -28,7 +27,7 @@ extern int fixed_phy_set_link_update(struct phy_device *phydev,
 			int (*link_update)(struct net_device *,
 					   struct fixed_phy_status *));
 #else
-static inline int fixed_phy_add(unsigned int irq, int phy_id,
+static inline int fixed_phy_add(int phy_id,
 				struct fixed_phy_status *status)
 {
 	return -ENODEV;
-- 
cgit v1.2.3


From d23b4af5df3900fb0b4e1a05cb8119dd1c395519 Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Sat, 17 May 2025 22:35:56 +0200
Subject: net: phy: fixed_phy: remove irq argument from fixed_phy_register

All callers pass PHY_POLL, therefore remove irq argument from
fixed_phy_register().

Note: I keep the irq argument in fixed_phy_add_gpiod() for now,
for the case that somebody may want to use a GPIO interrupt in
the future, by e.g. adding a call to fwnode_irq_get() to
fixed_phy_get_gpiod().

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Reviewed-by: Florian Fainelli <florian.fainelli@broadcom.com>
Link: https://patch.msgid.link/31cdb232-a5e9-4997-a285-cb9a7d208124@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/phy_fixed.h | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/linux/phy_fixed.h b/include/linux/phy_fixed.h
index 316bb4deda37..634149a73c2a 100644
--- a/include/linux/phy_fixed.h
+++ b/include/linux/phy_fixed.h
@@ -18,9 +18,8 @@ struct net_device;
 #if IS_ENABLED(CONFIG_FIXED_PHY)
 extern int fixed_phy_change_carrier(struct net_device *dev, bool new_carrier);
 int fixed_phy_add(int phy_id, struct fixed_phy_status *status);
-extern struct phy_device *fixed_phy_register(unsigned int irq,
-					     struct fixed_phy_status *status,
-					     struct device_node *np);
+struct phy_device *fixed_phy_register(struct fixed_phy_status *status,
+				      struct device_node *np);
 
 extern void fixed_phy_unregister(struct phy_device *phydev);
 extern int fixed_phy_set_link_update(struct phy_device *phydev,
@@ -32,9 +31,9 @@ static inline int fixed_phy_add(int phy_id,
 {
 	return -ENODEV;
 }
-static inline struct phy_device *fixed_phy_register(unsigned int irq,
-						struct fixed_phy_status *status,
-						struct device_node *np)
+static inline struct phy_device *
+fixed_phy_register(struct fixed_phy_status *status,
+		   struct device_node *np)
 {
 	return ERR_PTR(-ENODEV);
 }
-- 
cgit v1.2.3


From 4ba1c5bb4811f560a86697311cb4e9741e047a5d Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Sat, 17 May 2025 22:37:29 +0200
Subject: net: phy: fixed_phy: constify status argument where possible

Constify the passed struct fixed_phy_status *status where possible.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Reviewed-by: Florian Fainelli <florian.fainelli@broadcom.com>
Link: https://patch.msgid.link/d1764b62-8538-408b-a4e3-b63715481a38@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/phy_fixed.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/phy_fixed.h b/include/linux/phy_fixed.h
index 634149a73c2a..5399b9e41e35 100644
--- a/include/linux/phy_fixed.h
+++ b/include/linux/phy_fixed.h
@@ -17,8 +17,8 @@ struct net_device;
 
 #if IS_ENABLED(CONFIG_FIXED_PHY)
 extern int fixed_phy_change_carrier(struct net_device *dev, bool new_carrier);
-int fixed_phy_add(int phy_id, struct fixed_phy_status *status);
-struct phy_device *fixed_phy_register(struct fixed_phy_status *status,
+int fixed_phy_add(int phy_id, const struct fixed_phy_status *status);
+struct phy_device *fixed_phy_register(const struct fixed_phy_status *status,
 				      struct device_node *np);
 
 extern void fixed_phy_unregister(struct phy_device *phydev);
@@ -27,12 +27,12 @@ extern int fixed_phy_set_link_update(struct phy_device *phydev,
 					   struct fixed_phy_status *));
 #else
 static inline int fixed_phy_add(int phy_id,
-				struct fixed_phy_status *status)
+				const struct fixed_phy_status *status)
 {
 	return -ENODEV;
 }
 static inline struct phy_device *
-fixed_phy_register(struct fixed_phy_status *status,
+fixed_phy_register(const struct fixed_phy_status *status,
 		   struct device_node *np)
 {
 	return ERR_PTR(-ENODEV);
-- 
cgit v1.2.3


From f0a56c17e64bb5e7cdb9295df2b5fc21e4949005 Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@amazon.com>
Date: Thu, 15 May 2025 19:27:18 -0700
Subject: inet: Remove rtnl_is_held arg of lwtunnel_valid_encap_type(_attr)?().

Commit f130a0cc1b4f ("inet: fix lwtunnel_valid_encap_type() lock
imbalance") added the rtnl_is_held argument as a temporary fix while
I'm converting nexthop and IPv6 routing table to per-netns RTNL or RCU.

Now all callers of lwtunnel_valid_encap_type() do not hold RTNL.

Let's remove the argument.

Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Link: https://patch.msgid.link/20250516022759.44392-3-kuniyu@amazon.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/lwtunnel.h | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/net/lwtunnel.h b/include/net/lwtunnel.h
index 39cd50300a18..c306ebe379a0 100644
--- a/include/net/lwtunnel.h
+++ b/include/net/lwtunnel.h
@@ -116,11 +116,9 @@ int lwtunnel_encap_add_ops(const struct lwtunnel_encap_ops *op,
 int lwtunnel_encap_del_ops(const struct lwtunnel_encap_ops *op,
 			   unsigned int num);
 int lwtunnel_valid_encap_type(u16 encap_type,
-			      struct netlink_ext_ack *extack,
-			      bool rtnl_is_held);
+			      struct netlink_ext_ack *extack);
 int lwtunnel_valid_encap_type_attr(struct nlattr *attr, int len,
-				   struct netlink_ext_ack *extack,
-				   bool rtnl_is_held);
+				   struct netlink_ext_ack *extack);
 int lwtunnel_build_state(struct net *net, u16 encap_type,
 			 struct nlattr *encap,
 			 unsigned int family, const void *cfg,
@@ -203,15 +201,14 @@ static inline int lwtunnel_encap_del_ops(const struct lwtunnel_encap_ops *op,
 }
 
 static inline int lwtunnel_valid_encap_type(u16 encap_type,
-					    struct netlink_ext_ack *extack,
-					    bool rtnl_is_held)
+					    struct netlink_ext_ack *extack)
 {
 	NL_SET_ERR_MSG(extack, "CONFIG_LWTUNNEL is not enabled in this kernel");
 	return -EOPNOTSUPP;
 }
+
 static inline int lwtunnel_valid_encap_type_attr(struct nlattr *attr, int len,
-						 struct netlink_ext_ack *extack,
-						 bool rtnl_is_held)
+						 struct netlink_ext_ack *extack)
 {
 	/* return 0 since we are not walking attr looking for
 	 * RTA_ENCAP_TYPE attribute on nexthops.
-- 
cgit v1.2.3


From d7500fbfb12067ee7313f13f4c58f771be3018ab Mon Sep 17 00:00:00 2001
From: Bert Karwatzki <spasswolf@web.de>
Date: Wed, 21 May 2025 00:34:29 +0200
Subject: wifi: check if socket flags are valid

Checking the SOCK_WIFI_STATUS flag bit in sk_flags may give wrong results
since sk_flags are part of a union and the union is used otherwise. Add
sk_requests_wifi_status() which checks if sk is non-NULL, sk is a full
socket (so flags are valid) and checks the flag bit.

Fixes: 76a853f86c97 ("wifi: free SKBTX_WIFI_STATUS skb tx_flags flag")
Suggested-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: Bert Karwatzki <spasswolf@web.de>
Reviewed-by: Jason Xing <kerneljasonxing@gmail.com>
Link: https://patch.msgid.link/20250520223430.6875-1-spasswolf@web.de
[edit commit message, fix indentation]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/sock.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include')

diff --git a/include/net/sock.h b/include/net/sock.h
index f0fabb9fd28a..75c12e14fc47 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -2821,6 +2821,12 @@ sk_is_refcounted(struct sock *sk)
 	return !sk_fullsock(sk) || !sock_flag(sk, SOCK_RCU_FREE);
 }
 
+static inline bool
+sk_requests_wifi_status(struct sock *sk)
+{
+	return sk && sk_fullsock(sk) && sock_flag(sk, SOCK_WIFI_STATUS);
+}
+
 /* Checks if this SKB belongs to an HW offloaded socket
  * and whether any SW fallbacks are required based on dev.
  * Check decrypted mark in case skb_orphan() cleared socket.
-- 
cgit v1.2.3


From b803c4a4f78834b31ebfbbcea350473333760559 Mon Sep 17 00:00:00 2001
From: Vincent Mailhol <mailhol.vincent@wanadoo.fr>
Date: Fri, 2 May 2025 02:12:10 +0900
Subject: can: dev: add struct data_bittiming_params to group FD parameters

This is a preparation patch for the introduction of CAN XL.

CAN FD and CAN XL use similar bittiming parameters. Add one level of
nesting for all the CAN FD parameters. Typically:

  priv->can.data_bittiming;

becomes:

  priv->can.fd.data_bittiming;

This way, the CAN XL equivalent (to be introduced later) would be:

  priv->can.xl.data_bittiming;

Add the new struct data_bittiming_params which contains all the data
bittiming parameters, including the TDC and the callback functions.

This done, update all the CAN FD drivers to make use of the new
layout.

Acked-by: Oliver Hartkopp <socketcan@hartkopp.net>
Signed-off-by: Vincent Mailhol <mailhol.vincent@wanadoo.fr>
Link: https://patch.msgid.link/20250501171213.2161572-2-mailhol.vincent@wanadoo.fr
[mkl: fix rcar_canfd]
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
---
 include/linux/can/dev.h | 28 ++++++++++++++++------------
 1 file changed, 16 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/include/linux/can/dev.h b/include/linux/can/dev.h
index 23492213ea35..492d23bec7be 100644
--- a/include/linux/can/dev.h
+++ b/include/linux/can/dev.h
@@ -38,6 +38,17 @@ enum can_termination_gpio {
 	CAN_TERMINATION_GPIO_MAX,
 };
 
+struct data_bittiming_params {
+	const struct can_bittiming_const *data_bittiming_const;
+	struct can_bittiming data_bittiming;
+	const struct can_tdc_const *tdc_const;
+	struct can_tdc tdc;
+	const u32 *data_bitrate_const;
+	unsigned int data_bitrate_const_cnt;
+	int (*do_set_data_bittiming)(struct net_device *dev);
+	int (*do_get_auto_tdcv)(const struct net_device *dev, u32 *tdcv);
+};
+
 /*
  * CAN common private data
  */
@@ -45,16 +56,11 @@ struct can_priv {
 	struct net_device *dev;
 	struct can_device_stats can_stats;
 
-	const struct can_bittiming_const *bittiming_const,
-		*data_bittiming_const;
-	struct can_bittiming bittiming, data_bittiming;
-	const struct can_tdc_const *tdc_const;
-	struct can_tdc tdc;
-
+	const struct can_bittiming_const *bittiming_const;
+	struct can_bittiming bittiming;
+	struct data_bittiming_params fd;
 	unsigned int bitrate_const_cnt;
 	const u32 *bitrate_const;
-	const u32 *data_bitrate_const;
-	unsigned int data_bitrate_const_cnt;
 	u32 bitrate_max;
 	struct can_clock clock;
 
@@ -77,14 +83,12 @@ struct can_priv {
 	struct delayed_work restart_work;
 
 	int (*do_set_bittiming)(struct net_device *dev);
-	int (*do_set_data_bittiming)(struct net_device *dev);
 	int (*do_set_mode)(struct net_device *dev, enum can_mode mode);
 	int (*do_set_termination)(struct net_device *dev, u16 term);
 	int (*do_get_state)(const struct net_device *dev,
 			    enum can_state *state);
 	int (*do_get_berr_counter)(const struct net_device *dev,
 				   struct can_berr_counter *bec);
-	int (*do_get_auto_tdcv)(const struct net_device *dev, u32 *tdcv);
 };
 
 static inline bool can_tdc_is_enabled(const struct can_priv *priv)
@@ -114,11 +118,11 @@ static inline bool can_tdc_is_enabled(const struct can_priv *priv)
  */
 static inline s32 can_get_relative_tdco(const struct can_priv *priv)
 {
-	const struct can_bittiming *dbt = &priv->data_bittiming;
+	const struct can_bittiming *dbt = &priv->fd.data_bittiming;
 	s32 sample_point_in_tc = (CAN_SYNC_SEG + dbt->prop_seg +
 				  dbt->phase_seg1) * dbt->brp;
 
-	return (s32)priv->tdc.tdco - sample_point_in_tc;
+	return (s32)priv->fd.tdc.tdco - sample_point_in_tc;
 }
 
 /* helper to define static CAN controller features at device creation time */
-- 
cgit v1.2.3


From 04425292a62c15d1fde714522beaf8f3c2ed1de9 Mon Sep 17 00:00:00 2001
From: Hsin-chen Chuang <chharry@chromium.org>
Date: Wed, 16 Apr 2025 09:53:35 +0000
Subject: Bluetooth: Introduce HCI Driver protocol

Although commit 75ddcd5ad40e ("Bluetooth: btusb: Configure altsetting
for HCI_USER_CHANNEL") has enabled the HCI_USER_CHANNEL user to send out
SCO data through USB Bluetooth chips, it's observed that with the patch
HFP is flaky on most of the existing USB Bluetooth controllers: Intel
chips sometimes send out no packet for Transparent codec; MTK chips may
generate SCO data with a wrong handle for CVSD codec; RTK could split
the data with a wrong packet size for Transparent codec; ... etc.

To address the issue above one needs to reset the altsetting back to
zero when there is no active SCO connection, which is the same as the
BlueZ behavior, and another benefit is the bus doesn't need to reserve
bandwidth when no SCO connection.

This patch adds the infrastructure that allows the user space program to
talk to Bluetooth drivers directly:
- Define the new packet type HCI_DRV_PKT which is specifically used for
  communication between the user space program and the Bluetooth drivers
- hci_send_frame intercepts the packets and invokes drivers' HCI Drv
  callbacks (so far only defined for btusb)
- 2 kinds of events to user space: Command Status and Command Complete,
  the former simply returns the status while the latter may contain
  additional response data.

Cc: chromeos-bluetooth-upstreaming@chromium.org
Fixes: b16b327edb4d ("Bluetooth: btusb: add sysfs attribute to control USB alt setting")
Signed-off-by: Hsin-chen Chuang <chharry@chromium.org>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 include/net/bluetooth/hci.h      |  1 +
 include/net/bluetooth/hci_core.h |  3 ++
 include/net/bluetooth/hci_drv.h  | 76 ++++++++++++++++++++++++++++++++++++++++
 include/net/bluetooth/hci_mon.h  |  2 ++
 4 files changed, 82 insertions(+)
 create mode 100644 include/net/bluetooth/hci_drv.h

(limited to 'include')

diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h
index 797992019f9e..2502febf4da7 100644
--- a/include/net/bluetooth/hci.h
+++ b/include/net/bluetooth/hci.h
@@ -494,6 +494,7 @@ enum {
 #define HCI_EVENT_PKT		0x04
 #define HCI_ISODATA_PKT		0x05
 #define HCI_DIAG_PKT		0xf0
+#define HCI_DRV_PKT		0xf1
 #define HCI_VENDOR_PKT		0xff
 
 /* HCI packet types */
diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h
index 54bfeeaa0995..57f6175fd1cd 100644
--- a/include/net/bluetooth/hci_core.h
+++ b/include/net/bluetooth/hci_core.h
@@ -31,6 +31,7 @@
 #include <linux/rculist.h>
 
 #include <net/bluetooth/hci.h>
+#include <net/bluetooth/hci_drv.h>
 #include <net/bluetooth/hci_sync.h>
 #include <net/bluetooth/hci_sock.h>
 #include <net/bluetooth/coredump.h>
@@ -613,6 +614,8 @@ struct hci_dev {
 	struct list_head	monitored_devices;
 	bool			advmon_pend_notify;
 
+	struct hci_drv		*hci_drv;
+
 #if IS_ENABLED(CONFIG_BT_LEDS)
 	struct led_trigger	*power_led;
 #endif
diff --git a/include/net/bluetooth/hci_drv.h b/include/net/bluetooth/hci_drv.h
new file mode 100644
index 000000000000..2f01c44f05ec
--- /dev/null
+++ b/include/net/bluetooth/hci_drv.h
@@ -0,0 +1,76 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2025 Google Corporation
+ */
+
+#ifndef __HCI_DRV_H
+#define __HCI_DRV_H
+
+#include <linux/types.h>
+
+#include <net/bluetooth/bluetooth.h>
+#include <net/bluetooth/hci.h>
+
+struct hci_drv_cmd_hdr {
+	__le16	opcode;
+	__le16	len;
+} __packed;
+
+struct hci_drv_ev_hdr {
+	__le16	opcode;
+	__le16	len;
+} __packed;
+
+#define HCI_DRV_EV_CMD_STATUS	0x0000
+struct hci_drv_ev_cmd_status {
+	__le16	opcode;
+	__u8	status;
+} __packed;
+
+#define HCI_DRV_EV_CMD_COMPLETE	0x0001
+struct hci_drv_ev_cmd_complete {
+	__le16	opcode;
+	__u8	status;
+	__u8	data[];
+} __packed;
+
+#define HCI_DRV_STATUS_SUCCESS			0x00
+#define HCI_DRV_STATUS_UNSPECIFIED_ERROR	0x01
+#define HCI_DRV_STATUS_UNKNOWN_COMMAND		0x02
+#define HCI_DRV_STATUS_INVALID_PARAMETERS	0x03
+
+#define HCI_DRV_MAX_DRIVER_NAME_LENGTH	32
+
+/* Common commands that make sense on all drivers start from 0x0000 */
+#define HCI_DRV_OP_READ_INFO	0x0000
+#define HCI_DRV_READ_INFO_SIZE	0
+struct hci_drv_rp_read_info {
+	__u8	driver_name[HCI_DRV_MAX_DRIVER_NAME_LENGTH];
+	__le16	num_supported_commands;
+	__le16	supported_commands[];
+} __packed;
+
+/* Driver specific OGF (Opcode Group Field)
+ * Commands in this group may have different meanings across different drivers.
+ */
+#define HCI_DRV_OGF_DRIVER_SPECIFIC	0x01
+
+int hci_drv_cmd_status(struct hci_dev *hdev, u16 cmd, u8 status);
+int hci_drv_cmd_complete(struct hci_dev *hdev, u16 cmd, u8 status, void *rp,
+			 size_t rp_len);
+int hci_drv_process_cmd(struct hci_dev *hdev, struct sk_buff *cmd_skb);
+
+struct hci_drv_handler {
+	int (*func)(struct hci_dev *hdev, void *data, u16 data_len);
+	size_t data_len;
+};
+
+struct hci_drv {
+	size_t common_handler_count;
+	const struct hci_drv_handler *common_handlers;
+
+	size_t specific_handler_count;
+	const struct hci_drv_handler *specific_handlers;
+};
+
+#endif /* __HCI_DRV_H */
diff --git a/include/net/bluetooth/hci_mon.h b/include/net/bluetooth/hci_mon.h
index 082f89531b88..bbd752494ef9 100644
--- a/include/net/bluetooth/hci_mon.h
+++ b/include/net/bluetooth/hci_mon.h
@@ -51,6 +51,8 @@ struct hci_mon_hdr {
 #define HCI_MON_CTRL_EVENT	17
 #define HCI_MON_ISO_TX_PKT	18
 #define HCI_MON_ISO_RX_PKT	19
+#define HCI_MON_DRV_TX_PKT	20
+#define HCI_MON_DRV_RX_PKT	21
 
 struct hci_mon_new_index {
 	__u8		type;
-- 
cgit v1.2.3


From dd0ccf858057b793beb3779be7576d92c93cf828 Mon Sep 17 00:00:00 2001
From: Pauli Virtanen <pav@iki.fi>
Date: Sun, 27 Apr 2025 14:27:25 +0300
Subject: Bluetooth: add support for SIOCETHTOOL ETHTOOL_GET_TS_INFO

Bluetooth needs some way for the user to get supported so_timestamping flags
for the different socket types.

Use SIOCETHTOOL API for this purpose. As hci_dev is not associated with
struct net_device, the existing implementation can't be reused, so we
add a small one here.

Add support (only) for ETHTOOL_GET_TS_INFO command. The API differs
slightly from netdev in that the result depends also on socket type.

Signed-off-by: Pauli Virtanen <pav@iki.fi>
Acked-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 include/net/bluetooth/bluetooth.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include')

diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h
index bbefde319f95..114299bd8b98 100644
--- a/include/net/bluetooth/bluetooth.h
+++ b/include/net/bluetooth/bluetooth.h
@@ -29,6 +29,7 @@
 #include <linux/poll.h>
 #include <net/sock.h>
 #include <linux/seq_file.h>
+#include <linux/ethtool.h>
 
 #define BT_SUBSYS_VERSION	2
 #define BT_SUBSYS_REVISION	22
@@ -448,6 +449,9 @@ void hci_req_cmd_complete(struct hci_dev *hdev, u16 opcode, u8 status,
 			  hci_req_complete_t *req_complete,
 			  hci_req_complete_skb_t *req_complete_skb);
 
+int hci_ethtool_ts_info(unsigned int index, int sk_proto,
+			struct kernel_ethtool_ts_info *ts_info);
+
 #define HCI_REQ_START	BIT(0)
 #define HCI_REQ_SKB	BIT(1)
 
-- 
cgit v1.2.3


From 23205562ffc8de20f57afdd984858cab29e77968 Mon Sep 17 00:00:00 2001
From: Pauli Virtanen <pav@iki.fi>
Date: Sat, 3 May 2025 17:08:21 +0300
Subject: Bluetooth: separate CIS_LINK and BIS_LINK link types

Use separate link type id for unicast and broadcast ISO connections.
These connection types are handled with separate HCI commands, socket
API is different, and hci_conn has union fields that are different in
the two cases, so they shall not be mixed up.

Currently most places attempt to distinguish ucast by
bacmp(&c->dst, BDADDR_ANY), but this is wrong as dst is set for bcast sink
hci_conn in iso_conn_ready(). Additionally checking sync_handle might be
OK, but depends on details of the bcast conn configuration flow.

To avoid complicating it, use separate link types.

Fixes: f764a6c2c1e4 ("Bluetooth: ISO: Add broadcast support")
Signed-off-by: Pauli Virtanen <pav@iki.fi>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 include/net/bluetooth/hci.h      |  3 ++-
 include/net/bluetooth/hci_core.h | 48 ++++++++++++++++++----------------------
 2 files changed, 23 insertions(+), 28 deletions(-)

(limited to 'include')

diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h
index 2502febf4da7..82cbd54443ac 100644
--- a/include/net/bluetooth/hci.h
+++ b/include/net/bluetooth/hci.h
@@ -558,7 +558,8 @@ enum {
 #define ESCO_LINK	0x02
 /* Low Energy links do not have defined link type. Use invented one */
 #define LE_LINK		0x80
-#define ISO_LINK	0x82
+#define CIS_LINK	0x82
+#define BIS_LINK	0x83
 #define INVALID_LINK	0xff
 
 /* LMP features */
diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h
index 57f6175fd1cd..2b261e74e2c4 100644
--- a/include/net/bluetooth/hci_core.h
+++ b/include/net/bluetooth/hci_core.h
@@ -999,7 +999,8 @@ static inline void hci_conn_hash_add(struct hci_dev *hdev, struct hci_conn *c)
 	case ESCO_LINK:
 		h->sco_num++;
 		break;
-	case ISO_LINK:
+	case CIS_LINK:
+	case BIS_LINK:
 		h->iso_num++;
 		break;
 	}
@@ -1025,7 +1026,8 @@ static inline void hci_conn_hash_del(struct hci_dev *hdev, struct hci_conn *c)
 	case ESCO_LINK:
 		h->sco_num--;
 		break;
-	case ISO_LINK:
+	case CIS_LINK:
+	case BIS_LINK:
 		h->iso_num--;
 		break;
 	}
@@ -1042,7 +1044,8 @@ static inline unsigned int hci_conn_num(struct hci_dev *hdev, __u8 type)
 	case SCO_LINK:
 	case ESCO_LINK:
 		return h->sco_num;
-	case ISO_LINK:
+	case CIS_LINK:
+	case BIS_LINK:
 		return h->iso_num;
 	default:
 		return 0;
@@ -1103,7 +1106,7 @@ static inline struct hci_conn *hci_conn_hash_lookup_bis(struct hci_dev *hdev,
 	rcu_read_lock();
 
 	list_for_each_entry_rcu(c, &h->list, list) {
-		if (bacmp(&c->dst, ba) || c->type != ISO_LINK)
+		if (bacmp(&c->dst, ba) || c->type != BIS_LINK)
 			continue;
 
 		if (c->iso_qos.bcast.bis == bis) {
@@ -1125,7 +1128,7 @@ hci_conn_hash_lookup_create_pa_sync(struct hci_dev *hdev)
 	rcu_read_lock();
 
 	list_for_each_entry_rcu(c, &h->list, list) {
-		if (c->type != ISO_LINK)
+		if (c->type != BIS_LINK)
 			continue;
 
 		if (!test_bit(HCI_CONN_CREATE_PA_SYNC, &c->flags))
@@ -1151,8 +1154,8 @@ hci_conn_hash_lookup_per_adv_bis(struct hci_dev *hdev,
 	rcu_read_lock();
 
 	list_for_each_entry_rcu(c, &h->list, list) {
-		if (bacmp(&c->dst, ba) || c->type != ISO_LINK ||
-			!test_bit(HCI_CONN_PER_ADV, &c->flags))
+		if (bacmp(&c->dst, ba) || c->type != BIS_LINK ||
+		    !test_bit(HCI_CONN_PER_ADV, &c->flags))
 			continue;
 
 		if (c->iso_qos.bcast.big == big &&
@@ -1241,7 +1244,7 @@ static inline struct hci_conn *hci_conn_hash_lookup_cis(struct hci_dev *hdev,
 	rcu_read_lock();
 
 	list_for_each_entry_rcu(c, &h->list, list) {
-		if (c->type != ISO_LINK || !bacmp(&c->dst, BDADDR_ANY))
+		if (c->type != CIS_LINK)
 			continue;
 
 		/* Match CIG ID if set */
@@ -1273,7 +1276,7 @@ static inline struct hci_conn *hci_conn_hash_lookup_cig(struct hci_dev *hdev,
 	rcu_read_lock();
 
 	list_for_each_entry_rcu(c, &h->list, list) {
-		if (c->type != ISO_LINK || !bacmp(&c->dst, BDADDR_ANY))
+		if (c->type != CIS_LINK)
 			continue;
 
 		if (handle == c->iso_qos.ucast.cig) {
@@ -1296,17 +1299,7 @@ static inline struct hci_conn *hci_conn_hash_lookup_big(struct hci_dev *hdev,
 	rcu_read_lock();
 
 	list_for_each_entry_rcu(c, &h->list, list) {
-		if (c->type != ISO_LINK)
-			continue;
-
-		/* An ISO_LINK hcon with BDADDR_ANY as destination
-		 * address is a Broadcast connection. A Broadcast
-		 * slave connection is associated with a PA train,
-		 * so the sync_handle can be used to differentiate
-		 * from unicast.
-		 */
-		if (bacmp(&c->dst, BDADDR_ANY) &&
-		    c->sync_handle == HCI_SYNC_HANDLE_INVALID)
+		if (c->type != BIS_LINK)
 			continue;
 
 		if (handle == c->iso_qos.bcast.big) {
@@ -1330,7 +1323,7 @@ hci_conn_hash_lookup_big_sync_pend(struct hci_dev *hdev,
 	rcu_read_lock();
 
 	list_for_each_entry_rcu(c, &h->list, list) {
-		if (c->type != ISO_LINK)
+		if (c->type != BIS_LINK)
 			continue;
 
 		if (handle == c->iso_qos.bcast.big && num_bis == c->num_bis) {
@@ -1353,8 +1346,8 @@ hci_conn_hash_lookup_big_state(struct hci_dev *hdev, __u8 handle,  __u16 state)
 	rcu_read_lock();
 
 	list_for_each_entry_rcu(c, &h->list, list) {
-		if (bacmp(&c->dst, BDADDR_ANY) || c->type != ISO_LINK ||
-			c->state != state)
+		if (c->type != BIS_LINK || bacmp(&c->dst, BDADDR_ANY) ||
+		    c->state != state)
 			continue;
 
 		if (handle == c->iso_qos.bcast.big) {
@@ -1377,8 +1370,8 @@ hci_conn_hash_lookup_pa_sync_big_handle(struct hci_dev *hdev, __u8 big)
 	rcu_read_lock();
 
 	list_for_each_entry_rcu(c, &h->list, list) {
-		if (c->type != ISO_LINK ||
-			!test_bit(HCI_CONN_PA_SYNC, &c->flags))
+		if (c->type != BIS_LINK ||
+		    !test_bit(HCI_CONN_PA_SYNC, &c->flags))
 			continue;
 
 		if (c->iso_qos.bcast.big == big) {
@@ -1400,7 +1393,7 @@ hci_conn_hash_lookup_pa_sync_handle(struct hci_dev *hdev, __u16 sync_handle)
 	rcu_read_lock();
 
 	list_for_each_entry_rcu(c, &h->list, list) {
-		if (c->type != ISO_LINK)
+		if (c->type != BIS_LINK)
 			continue;
 
 		/* Ignore the listen hcon, we are looking
@@ -2015,7 +2008,8 @@ static inline int hci_proto_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr,
 	case ESCO_LINK:
 		return sco_connect_ind(hdev, bdaddr, flags);
 
-	case ISO_LINK:
+	case CIS_LINK:
+	case BIS_LINK:
 		return iso_connect_ind(hdev, bdaddr, flags);
 
 	default:
-- 
cgit v1.2.3


From 8b8762eeec59b959fbca60afffe21265bce67168 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Tue, 20 May 2025 09:19:05 -0700
Subject: tools: ynl-gen: add makefile deps for neigh

Kory is reporting build issues after recent additions to YNL
if the system headers are old.

Link: https://lore.kernel.org/20250519164949.597d6e92@kmaincent-XPS-13-7390
Reported-by: Kory Maincent <kory.maincent@bootlin.com>
Fixes: 0939a418b3b0 ("tools: ynl: submsg: reverse parse / error reporting")
Tested-by: Kory Maincent <kory.maincent@bootlin.com>
Link: https://patch.msgid.link/20250520161916.413298-2-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/uapi/linux/neighbour.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/neighbour.h b/include/uapi/linux/neighbour.h
index 5e67a7eaf4a7..b851c36ad25d 100644
--- a/include/uapi/linux/neighbour.h
+++ b/include/uapi/linux/neighbour.h
@@ -1,6 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-#ifndef __LINUX_NEIGHBOUR_H
-#define __LINUX_NEIGHBOUR_H
+#ifndef _UAPI__LINUX_NEIGHBOUR_H
+#define _UAPI__LINUX_NEIGHBOUR_H
 
 #include <linux/types.h>
 #include <linux/netlink.h>
-- 
cgit v1.2.3


From a5bd029c733b8ae790d5873e2afeb88b58e3a151 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Mon, 19 May 2025 10:50:04 -0700
Subject: net: add skb_crc32c()

Add skb_crc32c(), which calculates the CRC32C of a sk_buff.  It will
replace __skb_checksum(), which unnecessarily supports arbitrary
checksums.  Compared to __skb_checksum(), skb_crc32c():

   - Uses the correct type for CRC32C values (u32, not __wsum).

   - Does not require the caller to provide a skb_checksum_ops struct.

   - Is faster because it does not use indirect calls and does not use
     the very slow crc32c_combine().

According to commit 2817a336d4d5 ("net: skb_checksum: allow custom
update/combine for walking skb") which added __skb_checksum(), the
original motivation for the abstraction layer was to avoid code
duplication for CRC32C and other checksums in the future.  However:

   - No additional checksums showed up after CRC32C.  __skb_checksum()
     is only used with the "regular" net checksum and CRC32C.

   - Indirect calls are expensive.  Commit 2544af0344ba ("net: avoid
     indirect calls in L4 checksum calculation") worked around this
     using the INDIRECT_CALL_1 macro. But that only avoided the indirect
     call for the net checksum, and at the cost of an extra branch.

   - The checksums use different types (__wsum and u32), causing casts
     to be needed.

   - It made the checksums of fragments be combined (rather than
     chained) for both checksums, despite this being highly
     counterproductive for CRC32C due to how slow crc32c_combine() is.
     This can clearly be seen in commit 4c2f24549644 ("sctp: linearize
     early if it's not GSO") which tried to work around this performance
     bug.  With a dedicated function for each checksum, we can instead
     just use the proper strategy for each checksum.

As shown by the following tables, the new function skb_crc32c() is
faster than __skb_checksum(), with the improvement varying greatly from
5% to 2500% depending on the case.  The largest improvements come from
fragmented packets, mainly due to eliminating the inefficient
crc32c_combine().  But linear packets are improved too, especially
shorter ones, mainly due to eliminating indirect calls.  These
benchmarks were done on AMD Zen 5.  On that CPU, Linux uses IBRS instead
of retpoline; an even greater improvement might be seen with retpoline:

    Linear sk_buffs

        Length in bytes    __skb_checksum cycles    skb_crc32c cycles
        ===============    =====================    =================
                     64                       43                   18
                    256                       94                   77
                   1420                      204                  161
                  16384                     1735                 1642

    Nonlinear sk_buffs (even split between head and one fragment)

        Length in bytes    __skb_checksum cycles    skb_crc32c cycles
        ===============    =====================    =================
                     64                      579                   22
                    256                      829                   77
                   1420                     1506                  194
                  16384                     4365                 1682

Signed-off-by: Eric Biggers <ebiggers@google.com>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Link: https://patch.msgid.link/20250519175012.36581-3-ebiggers@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/skbuff.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index c7397b17bb08..7ccc6356acac 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -4203,6 +4203,7 @@ __wsum __skb_checksum(const struct sk_buff *skb, int offset, int len,
 		      __wsum csum, const struct skb_checksum_ops *ops);
 __wsum skb_checksum(const struct sk_buff *skb, int offset, int len,
 		    __wsum csum);
+u32 skb_crc32c(const struct sk_buff *skb, int offset, int len, u32 crc);
 
 static inline void * __must_check
 __skb_header_pointer(const struct sk_buff *skb, int offset, int len,
-- 
cgit v1.2.3


From 99de9d4022e5004f95f425f798f0aa01e87949ff Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Mon, 19 May 2025 10:50:07 -0700
Subject: sctp: use skb_crc32c() instead of __skb_checksum()

Make sctp_compute_cksum() just use the new function skb_crc32c(),
instead of calling __skb_checksum() with a skb_checksum_ops struct that
does CRC32C.  This is faster and simpler.

Signed-off-by: Eric Biggers <ebiggers@google.com>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Link: https://patch.msgid.link/20250519175012.36581-6-ebiggers@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/sctp/checksum.h | 29 +++--------------------------
 1 file changed, 3 insertions(+), 26 deletions(-)

(limited to 'include')

diff --git a/include/net/sctp/checksum.h b/include/net/sctp/checksum.h
index 291465c25810..654d37ec0402 100644
--- a/include/net/sctp/checksum.h
+++ b/include/net/sctp/checksum.h
@@ -15,8 +15,6 @@
  *    Dinakaran Joseph
  *    Jon Grimm <jgrimm@us.ibm.com>
  *    Sridhar Samudrala <sri@us.ibm.com>
- *
- * Rewritten to use libcrc32c by:
  *    Vlad Yasevich <vladislav.yasevich@hp.com>
  */
 
@@ -25,39 +23,18 @@
 
 #include <linux/types.h>
 #include <linux/sctp.h>
-#include <linux/crc32c.h>
-#include <linux/crc32.h>
-
-static inline __wsum sctp_csum_update(const void *buff, int len, __wsum sum)
-{
-	return (__force __wsum)crc32c((__force __u32)sum, buff, len);
-}
-
-static inline __wsum sctp_csum_combine(__wsum csum, __wsum csum2,
-				       int offset, int len)
-{
-	return (__force __wsum)crc32c_combine((__force __u32)csum,
-					      (__force __u32)csum2, len);
-}
-
-static const struct skb_checksum_ops sctp_csum_ops = {
-	.update  = sctp_csum_update,
-	.combine = sctp_csum_combine,
-};
 
 static inline __le32 sctp_compute_cksum(const struct sk_buff *skb,
 					unsigned int offset)
 {
 	struct sctphdr *sh = (struct sctphdr *)(skb->data + offset);
 	__le32 old = sh->checksum;
-	__wsum new;
+	u32 new;
 
 	sh->checksum = 0;
-	new = ~__skb_checksum(skb, offset, skb->len - offset, ~(__wsum)0,
-			      &sctp_csum_ops);
+	new = ~skb_crc32c(skb, offset, skb->len - offset, ~0);
 	sh->checksum = old;
-
-	return cpu_to_le32((__force __u32)new);
+	return cpu_to_le32(new);
 }
 
 #endif /* __sctp_checksum_h__ */
-- 
cgit v1.2.3


From 70c96c7cb9f035d5b960021f2450afa6240e66b4 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Mon, 19 May 2025 10:50:08 -0700
Subject: net: fold __skb_checksum() into skb_checksum()

Now that the only remaining caller of __skb_checksum() is
skb_checksum(), fold __skb_checksum() into skb_checksum().  This makes
struct skb_checksum_ops unnecessary, so remove that too and simply do
the "regular" net checksum.  It also makes the wrapper functions
csum_partial_ext() and csum_block_add_ext() unnecessary, so remove those
too and just use the underlying functions.

Signed-off-by: Eric Biggers <ebiggers@google.com>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Link: https://patch.msgid.link/20250519175012.36581-7-ebiggers@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/skbuff.h |  9 ---------
 include/net/checksum.h | 12 ------------
 2 files changed, 21 deletions(-)

(limited to 'include')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 7ccc6356acac..018c07230513 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -4192,15 +4192,6 @@ static inline int memcpy_to_msg(struct msghdr *msg, void *data, int len)
 	return copy_to_iter(data, len, &msg->msg_iter) == len ? 0 : -EFAULT;
 }
 
-struct skb_checksum_ops {
-	__wsum (*update)(const void *mem, int len, __wsum wsum);
-	__wsum (*combine)(__wsum csum, __wsum csum2, int offset, int len);
-};
-
-extern const struct skb_checksum_ops *crc32c_csum_stub __read_mostly;
-
-__wsum __skb_checksum(const struct sk_buff *skb, int offset, int len,
-		      __wsum csum, const struct skb_checksum_ops *ops);
 __wsum skb_checksum(const struct sk_buff *skb, int offset, int len,
 		    __wsum csum);
 u32 skb_crc32c(const struct sk_buff *skb, int offset, int len, u32 crc);
diff --git a/include/net/checksum.h b/include/net/checksum.h
index 243f972267b8..e57986b173f8 100644
--- a/include/net/checksum.h
+++ b/include/net/checksum.h
@@ -98,12 +98,6 @@ csum_block_add(__wsum csum, __wsum csum2, int offset)
 	return csum_add(csum, csum_shift(csum2, offset));
 }
 
-static __always_inline __wsum
-csum_block_add_ext(__wsum csum, __wsum csum2, int offset, int len)
-{
-	return csum_block_add(csum, csum2, offset);
-}
-
 static __always_inline __wsum
 csum_block_sub(__wsum csum, __wsum csum2, int offset)
 {
@@ -115,12 +109,6 @@ static __always_inline __wsum csum_unfold(__sum16 n)
 	return (__force __wsum)n;
 }
 
-static __always_inline
-__wsum csum_partial_ext(const void *buff, int len, __wsum sum)
-{
-	return csum_partial(buff, len, sum);
-}
-
 #define CSUM_MANGLED_0 ((__force __sum16)0xffff)
 
 static __always_inline void csum_replace_by_diff(__sum16 *sum, __wsum diff)
-- 
cgit v1.2.3


From b82f72292ab4c65250bd734281464a6ab1ff4133 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Mon, 19 May 2025 10:50:09 -0700
Subject: lib/crc32: remove unused support for CRC32C combination

crc32c_combine() and crc32c_shift() are no longer used (except by the
KUnit test that tests them), and their current implementation is very
slow.  Remove them.

Signed-off-by: Eric Biggers <ebiggers@google.com>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Link: https://patch.msgid.link/20250519175012.36581-8-ebiggers@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/crc32.h | 23 -----------------------
 1 file changed, 23 deletions(-)

(limited to 'include')

diff --git a/include/linux/crc32.h b/include/linux/crc32.h
index 69c2e8bb3782..7f7d0be8a0ac 100644
--- a/include/linux/crc32.h
+++ b/include/linux/crc32.h
@@ -76,29 +76,6 @@ static inline u32 crc32_le_combine(u32 crc1, u32 crc2, size_t len2)
 	return crc32_le_shift(crc1, len2) ^ crc2;
 }
 
-u32 crc32c_shift(u32 crc, size_t len);
-
-/**
- * crc32c_combine - Combine two crc32c check values into one. For two sequences
- *		    of bytes, seq1 and seq2 with lengths len1 and len2, crc32c()
- *		    check values were calculated for each, crc1 and crc2.
- *
- * @crc1: crc32c of the first block
- * @crc2: crc32c of the second block
- * @len2: length of the second block
- *
- * Return: The crc32c() check value of seq1 and seq2 concatenated, requiring
- *	   only crc1, crc2, and len2. Note: If seq_full denotes the concatenated
- *	   memory area of seq1 with seq2, and crc_full the crc32c() value of
- *	   seq_full, then crc_full == crc32c_combine(crc1, crc2, len2) when
- *	   crc_full was seeded with the same initializer as crc1, and crc2 seed
- *	   was 0. See also crc_combine_test().
- */
-static inline u32 crc32c_combine(u32 crc1, u32 crc2, size_t len2)
-{
-	return crc32c_shift(crc1, len2) ^ crc2;
-}
-
 #define crc32(seed, data, length)  crc32_le(seed, (unsigned char const *)(data), length)
 
 /*
-- 
cgit v1.2.3


From ea6342d98928e243f2024fb97a9b4d42ee55dfba Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Mon, 19 May 2025 10:50:10 -0700
Subject: net: add skb_copy_and_crc32c_datagram_iter()

Since skb_copy_and_hash_datagram_iter() is used only with CRC32C, the
crypto_ahash abstraction provides no value.  Add
skb_copy_and_crc32c_datagram_iter() which just calls crc32c() directly.

This is faster and simpler.  It also doesn't have the weird dependency
issue where skb_copy_and_hash_datagram_iter() depends on
CONFIG_CRYPTO_HASH=y without that being expressed explicitly in the
kconfig (presumably because it was too heavyweight for NET to select).
The new function is conditional on the hidden boolean symbol NET_CRC32C,
which selects CRC32.  So it gets compiled only when something that
actually needs CRC32C packet checksums is enabled, it has no implicit
dependency, and it doesn't depend on the heavyweight crypto layer.

Signed-off-by: Eric Biggers <ebiggers@google.com>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Link: https://patch.msgid.link/20250519175012.36581-9-ebiggers@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/skbuff.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 018c07230513..510adf63c211 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -4137,6 +4137,8 @@ int skb_copy_and_csum_datagram_msg(struct sk_buff *skb, int hlen,
 int skb_copy_and_hash_datagram_iter(const struct sk_buff *skb, int offset,
 			   struct iov_iter *to, int len,
 			   struct ahash_request *hash);
+int skb_copy_and_crc32c_datagram_iter(const struct sk_buff *skb, int offset,
+				      struct iov_iter *to, int len, u32 *crcp);
 int skb_copy_datagram_from_iter(struct sk_buff *skb, int offset,
 				 struct iov_iter *from, int len);
 int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *frm);
-- 
cgit v1.2.3


From c93f75b2d755c35b596084ddd3feb3528284a53f Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Mon, 19 May 2025 10:50:12 -0700
Subject: net: remove skb_copy_and_hash_datagram_iter()

Now that skb_copy_and_hash_datagram_iter() is no longer used, remove it.

Signed-off-by: Eric Biggers <ebiggers@google.com>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Link: https://patch.msgid.link/20250519175012.36581-11-ebiggers@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/skbuff.h | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 510adf63c211..5520524c93bf 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -274,7 +274,6 @@
 			 SKB_DATA_ALIGN(sizeof(struct sk_buff)) +	\
 			 SKB_DATA_ALIGN(sizeof(struct skb_shared_info)))
 
-struct ahash_request;
 struct net_device;
 struct scatterlist;
 struct pipe_inode_info;
@@ -4134,9 +4133,6 @@ static inline int skb_copy_datagram_msg(const struct sk_buff *from, int offset,
 }
 int skb_copy_and_csum_datagram_msg(struct sk_buff *skb, int hlen,
 				   struct msghdr *msg);
-int skb_copy_and_hash_datagram_iter(const struct sk_buff *skb, int offset,
-			   struct iov_iter *to, int len,
-			   struct ahash_request *hash);
 int skb_copy_and_crc32c_datagram_iter(const struct sk_buff *skb, int offset,
 				      struct iov_iter *to, int len, u32 *crcp);
 int skb_copy_datagram_from_iter(struct sk_buff *skb, int offset,
-- 
cgit v1.2.3


From 31afd6bc55cc0093c3e5b0a368319e423d4de8ea Mon Sep 17 00:00:00 2001
From: Christian Marangi <ansuelsmth@gmail.com>
Date: Sat, 17 May 2025 22:13:45 +0200
Subject: net: phy: pass PHY driver to .match_phy_device OP

Pass PHY driver pointer to .match_phy_device OP in addition to phydev.
Having access to the PHY driver struct might be useful to check the
PHY ID the driver is being matched for, in case the PHY ID scanned in
the phydev is not consistent.

A scenario for this is a PHY that changes its PHY ID after firmware is
loaded. In such a case, the PHY ID stored in the PHY device struct is
not valid anymore and the PHY driver will manually scan the ID in the
match_phy_device function.

Having the PHY driver info is also useful for those PHY drivers that
implement multiple simple .match_phy_device OPs to match a specific MMD
PHY ID. With this extra info, if the parsing logic is the same, the
matching function can be generalized by using the phy_id in the PHY
driver instead of hardcoding it.

Rust wrapper callback is updated to align to the new match_phy_device
arguments.

Suggested-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Reviewed-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Signed-off-by: Christian Marangi <ansuelsmth@gmail.com>
Reviewed-by: Benno Lossin <lossin@kernel.org> # for Rust
Reviewed-by: FUJITA Tomonori <fujita.tomonori@gmail.com>
Link: https://patch.msgid.link/20250517201353.5137-2-ansuelsmth@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/phy.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/phy.h b/include/linux/phy.h
index 92a88b5ce356..10e66d45a8e8 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -990,7 +990,8 @@ struct phy_driver {
 	 * driver for the given phydev.	 If NULL, matching is based on
 	 * phy_id and phy_id_mask.
 	 */
-	int (*match_phy_device)(struct phy_device *phydev);
+	int (*match_phy_device)(struct phy_device *phydev,
+				const struct phy_driver *phydrv);
 
 	/**
 	 * @set_wol: Some devices (e.g. qnap TS-119P II) require PHY
-- 
cgit v1.2.3


From d6c45707ac84c2d9f274ece1cea4dddb97996bde Mon Sep 17 00:00:00 2001
From: Christian Marangi <ansuelsmth@gmail.com>
Date: Sat, 17 May 2025 22:13:48 +0200
Subject: net: phy: introduce genphy_match_phy_device()

Introduce a new API, genphy_match_phy_device(), to provide a way to
check whether a PHY driver matches a PHY device based on the info
stored in the PHY device struct.

The function generalizes the logic used in phy_bus_match() to decide
whether the C45 or the C22 ID should be used for matching.

This is useful for a custom .match_phy_device function that wants to
use the generic logic under some conditions (for example, when a PHY is
already set up and provides the correct PHY ID).

Reviewed-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Signed-off-by: Christian Marangi <ansuelsmth@gmail.com>
Link: https://patch.msgid.link/20250517201353.5137-5-ansuelsmth@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/phy.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include')

diff --git a/include/linux/phy.h b/include/linux/phy.h
index 10e66d45a8e8..32b9da274115 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -1868,6 +1868,9 @@ char *phy_attached_info_irq(struct phy_device *phydev)
 	__malloc;
 void phy_attached_info(struct phy_device *phydev);
 
+int genphy_match_phy_device(struct phy_device *phydev,
+			    const struct phy_driver *phydrv);
+
 /* Clause 22 PHY */
 int genphy_read_abilities(struct phy_device *phydev);
 int genphy_setup_forced(struct phy_device *phydev);
-- 
cgit v1.2.3


From 4ff4d86f6cceb6bea583bdb230e5439655778cce Mon Sep 17 00:00:00 2001
From: Kory Maincent <kory.maincent@bootlin.com>
Date: Mon, 19 May 2025 10:45:05 +0200
Subject: net: Add support for providing the PTP hardware source in tsinfo

Multi-PTP source support within a network topology has been merged,
but the hardware timestamp source is not yet exposed to users.
Currently, users only see the PTP index, which does not indicate
whether the timestamp comes from a PHY or a MAC.

Add support for reporting the hwtstamp source using a
hwtstamp-source field, alongside hwtstamp-phyindex, to describe
the origin of the hardware timestamp.

Remove HWTSTAMP_SOURCE_UNSPEC enum value as it is not used at all.

Signed-off-by: Kory Maincent <kory.maincent@bootlin.com>
Link: https://patch.msgid.link/20250519-feature_ptp_source-v4-1-5d10e19a0265@bootlin.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/linux/ethtool.h                        |  5 +++++
 include/linux/net_tstamp.h                     |  7 +------
 include/uapi/linux/ethtool_netlink_generated.h | 14 ++++++++++++++
 3 files changed, 20 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index 117718c24814..5e0dd333ad1f 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -19,6 +19,7 @@
 #include <linux/netlink.h>
 #include <linux/timer_types.h>
 #include <uapi/linux/ethtool.h>
+#include <uapi/linux/ethtool_netlink_generated.h>
 #include <uapi/linux/net_tstamp.h>
 
 #define ETHTOOL_MM_MAX_VERIFY_TIME_MS		128
@@ -830,6 +831,8 @@ struct ethtool_rxfh_param {
  * @so_timestamping: bit mask of the sum of the supported SO_TIMESTAMPING flags
  * @phc_index: device index of the associated PHC, or -1 if there is none
  * @phc_qualifier: qualifier of the associated PHC
+ * @phc_source: source device of the associated PHC
+ * @phc_phyindex: index of PHY device source of the associated PHC
  * @tx_types: bit mask of the supported hwtstamp_tx_types enumeration values
  * @rx_filters: bit mask of the supported hwtstamp_rx_filters enumeration values
  */
@@ -838,6 +841,8 @@ struct kernel_ethtool_ts_info {
 	u32 so_timestamping;
 	int phc_index;
 	enum hwtstamp_provider_qualifier phc_qualifier;
+	enum hwtstamp_source phc_source;
+	int phc_phyindex;
 	enum hwtstamp_tx_types tx_types;
 	enum hwtstamp_rx_filters rx_filters;
 };
diff --git a/include/linux/net_tstamp.h b/include/linux/net_tstamp.h
index ff0758e88ea1..f4936d9c2b3c 100644
--- a/include/linux/net_tstamp.h
+++ b/include/linux/net_tstamp.h
@@ -4,6 +4,7 @@
 #define _LINUX_NET_TIMESTAMPING_H_
 
 #include <uapi/linux/net_tstamp.h>
+#include <uapi/linux/ethtool_netlink_generated.h>
 
 #define SOF_TIMESTAMPING_SOFTWARE_MASK	(SOF_TIMESTAMPING_RX_SOFTWARE | \
 					 SOF_TIMESTAMPING_TX_SOFTWARE | \
@@ -13,12 +14,6 @@
 					 SOF_TIMESTAMPING_TX_HARDWARE | \
 					 SOF_TIMESTAMPING_RAW_HARDWARE)
 
-enum hwtstamp_source {
-	HWTSTAMP_SOURCE_UNSPEC,
-	HWTSTAMP_SOURCE_NETDEV,
-	HWTSTAMP_SOURCE_PHYLIB,
-};
-
 /**
  * struct hwtstamp_provider_desc - hwtstamp provider description
  *
diff --git a/include/uapi/linux/ethtool_netlink_generated.h b/include/uapi/linux/ethtool_netlink_generated.h
index 30c8dad6214e..9a02f579de22 100644
--- a/include/uapi/linux/ethtool_netlink_generated.h
+++ b/include/uapi/linux/ethtool_netlink_generated.h
@@ -37,6 +37,18 @@ enum ethtool_tcp_data_split {
 	ETHTOOL_TCP_DATA_SPLIT_ENABLED,
 };
 
+/**
+ * enum hwtstamp_source - Source of the hardware timestamp
+ * @HWTSTAMP_SOURCE_NETDEV: Hardware timestamp comes from a MAC or a device
+ *   which has MAC and PHY integrated
+ * @HWTSTAMP_SOURCE_PHYLIB: Hardware timestamp comes from one PHY device of the
+ *   network topology
+ */
+enum hwtstamp_source {
+	HWTSTAMP_SOURCE_NETDEV = 1,
+	HWTSTAMP_SOURCE_PHYLIB,
+};
+
 enum {
 	ETHTOOL_A_HEADER_UNSPEC,
 	ETHTOOL_A_HEADER_DEV_INDEX,
@@ -401,6 +413,8 @@ enum {
 	ETHTOOL_A_TSINFO_PHC_INDEX,
 	ETHTOOL_A_TSINFO_STATS,
 	ETHTOOL_A_TSINFO_HWTSTAMP_PROVIDER,
+	ETHTOOL_A_TSINFO_HWTSTAMP_SOURCE,
+	ETHTOOL_A_TSINFO_HWTSTAMP_PHYINDEX,
 
 	__ETHTOOL_A_TSINFO_CNT,
 	ETHTOOL_A_TSINFO_MAX = (__ETHTOOL_A_TSINFO_CNT - 1)
-- 
cgit v1.2.3


From 38b95d588f8fd07027ad8dbca3e1d2b5c13413ae Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@amazon.com>
Date: Mon, 19 May 2025 13:57:54 -0700
Subject: scm: Move scm_recv() from scm.h to scm.c.

scm_recv() has been placed in scm.h since the pre-git era for no
particular reason (I think), which makes the file really fragile.

For example, when you move SOCK_PASSCRED from include/linux/net.h to
enum sock_flags in include/net/sock.h, you will see a weird build
failure due to the terrible header dependencies.

To avoid the build failure in the future, let's move scm_recv(_unix())?
and its callees to scm.c.

Note that only scm_recv() needs to be exported for Bluetooth.

scm_send() should be moved to scm.c too, but I'll revisit later.

Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/scm.h | 121 ++----------------------------------------------------
 1 file changed, 4 insertions(+), 117 deletions(-)

(limited to 'include')

diff --git a/include/net/scm.h b/include/net/scm.h
index 22bb49589fde..84c4707e78a5 100644
--- a/include/net/scm.h
+++ b/include/net/scm.h
@@ -102,123 +102,10 @@ static __inline__ int scm_send(struct socket *sock, struct msghdr *msg,
 	return __scm_send(sock, msg, scm);
 }
 
-#ifdef CONFIG_SECURITY_NETWORK
-static inline void scm_passec(struct socket *sock, struct msghdr *msg, struct scm_cookie *scm)
-{
-	struct lsm_context ctx;
-	int err;
-
-	if (test_bit(SOCK_PASSSEC, &sock->flags)) {
-		err = security_secid_to_secctx(scm->secid, &ctx);
-
-		if (err >= 0) {
-			put_cmsg(msg, SOL_SOCKET, SCM_SECURITY, ctx.len,
-				 ctx.context);
-			security_release_secctx(&ctx);
-		}
-	}
-}
-
-static inline bool scm_has_secdata(struct socket *sock)
-{
-	return test_bit(SOCK_PASSSEC, &sock->flags);
-}
-#else
-static inline void scm_passec(struct socket *sock, struct msghdr *msg, struct scm_cookie *scm)
-{ }
-
-static inline bool scm_has_secdata(struct socket *sock)
-{
-	return false;
-}
-#endif /* CONFIG_SECURITY_NETWORK */
-
-static __inline__ void scm_pidfd_recv(struct msghdr *msg, struct scm_cookie *scm)
-{
-	struct file *pidfd_file = NULL;
-	int len, pidfd;
-
-	/* put_cmsg() doesn't return an error if CMSG is truncated,
-	 * that's why we need to opencode these checks here.
-	 */
-	if (msg->msg_flags & MSG_CMSG_COMPAT)
-		len = sizeof(struct compat_cmsghdr) + sizeof(int);
-	else
-		len = sizeof(struct cmsghdr) + sizeof(int);
-
-	if (msg->msg_controllen < len) {
-		msg->msg_flags |= MSG_CTRUNC;
-		return;
-	}
-
-	if (!scm->pid)
-		return;
-
-	pidfd = pidfd_prepare(scm->pid, 0, &pidfd_file);
-
-	if (put_cmsg(msg, SOL_SOCKET, SCM_PIDFD, sizeof(int), &pidfd)) {
-		if (pidfd_file) {
-			put_unused_fd(pidfd);
-			fput(pidfd_file);
-		}
-
-		return;
-	}
-
-	if (pidfd_file)
-		fd_install(pidfd, pidfd_file);
-}
-
-static inline bool __scm_recv_common(struct socket *sock, struct msghdr *msg,
-				     struct scm_cookie *scm, int flags)
-{
-	if (!msg->msg_control) {
-		if (test_bit(SOCK_PASSCRED, &sock->flags) ||
-		    test_bit(SOCK_PASSPIDFD, &sock->flags) ||
-		    scm->fp || scm_has_secdata(sock))
-			msg->msg_flags |= MSG_CTRUNC;
-		scm_destroy(scm);
-		return false;
-	}
-
-	if (test_bit(SOCK_PASSCRED, &sock->flags)) {
-		struct user_namespace *current_ns = current_user_ns();
-		struct ucred ucreds = {
-			.pid = scm->creds.pid,
-			.uid = from_kuid_munged(current_ns, scm->creds.uid),
-			.gid = from_kgid_munged(current_ns, scm->creds.gid),
-		};
-		put_cmsg(msg, SOL_SOCKET, SCM_CREDENTIALS, sizeof(ucreds), &ucreds);
-	}
-
-	scm_passec(sock, msg, scm);
-
-	if (scm->fp)
-		scm_detach_fds(msg, scm);
-
-	return true;
-}
-
-static inline void scm_recv(struct socket *sock, struct msghdr *msg,
-			    struct scm_cookie *scm, int flags)
-{
-	if (!__scm_recv_common(sock, msg, scm, flags))
-		return;
-
-	scm_destroy_cred(scm);
-}
-
-static inline void scm_recv_unix(struct socket *sock, struct msghdr *msg,
-				 struct scm_cookie *scm, int flags)
-{
-	if (!__scm_recv_common(sock, msg, scm, flags))
-		return;
-
-	if (test_bit(SOCK_PASSPIDFD, &sock->flags))
-		scm_pidfd_recv(msg, scm);
-
-	scm_destroy_cred(scm);
-}
+void scm_recv(struct socket *sock, struct msghdr *msg,
+	      struct scm_cookie *scm, int flags);
+void scm_recv_unix(struct socket *sock, struct msghdr *msg,
+		   struct scm_cookie *scm, int flags);
 
 static inline int scm_recv_one_fd(struct file *f, int __user *ufd,
 				  unsigned int flags)
-- 
cgit v1.2.3


From 7d8d93fdde50b86bbbf46a203c368ed320e729ab Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@amazon.com>
Date: Mon, 19 May 2025 13:57:56 -0700
Subject: net: Restrict SO_PASS{CRED,PIDFD,SEC} to AF_{UNIX,NETLINK,BLUETOOTH}.

SCM_CREDENTIALS and SCM_SECURITY can be recv()ed by calling
scm_recv() or scm_recv_unix(), and SCM_PIDFD is only used by
scm_recv_unix().

scm_recv() is called from AF_NETLINK and AF_BLUETOOTH.

scm_recv_unix() is literally called from AF_UNIX.

Let's restrict SO_PASSCRED and SO_PASSSEC to such sockets and
SO_PASSPIDFD to AF_UNIX only.

Later, SOCK_PASS{CRED,PIDFD,SEC} will be moved to struct sock
and united with another field.

Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/sock.h b/include/net/sock.h
index 35ca6b13c6d2..483522377955 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -2773,9 +2773,14 @@ static inline bool sk_is_udp(const struct sock *sk)
 	       sk->sk_protocol == IPPROTO_UDP;
 }
 
+static inline bool sk_is_unix(const struct sock *sk)
+{
+	return sk->sk_family == AF_UNIX;
+}
+
 static inline bool sk_is_stream_unix(const struct sock *sk)
 {
-	return sk->sk_family == AF_UNIX && sk->sk_type == SOCK_STREAM;
+	return sk_is_unix(sk) && sk->sk_type == SOCK_STREAM;
 }
 
 static inline bool sk_is_vsock(const struct sock *sk)
@@ -2783,6 +2788,13 @@ static inline bool sk_is_vsock(const struct sock *sk)
 	return sk->sk_family == AF_VSOCK;
 }
 
+static inline bool sk_may_scm_recv(const struct sock *sk)
+{
+	return (IS_ENABLED(CONFIG_UNIX) && sk->sk_family == AF_UNIX) ||
+		sk->sk_family == AF_NETLINK ||
+		(IS_ENABLED(CONFIG_BT) && sk->sk_family == AF_BLUETOOTH);
+}
+
 /**
  * sk_eat_skb - Release a skb if it is no longer needed
  * @sk: socket to eat this skb from
-- 
cgit v1.2.3


From 0e81cfd971dc4833c699dcd8924e54a5021bc4e8 Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@amazon.com>
Date: Mon, 19 May 2025 13:57:57 -0700
Subject: af_unix: Move SOCK_PASS{CRED,PIDFD,SEC} to struct sock.

As explained in the next patch, SO_PASSRIGHTS would have a problem
if we assigned a corresponding bit to socket->flags, so it must be
managed in struct sock.

Mixing socket->flags and sk->sk_flags for similar options will look
confusing, and sk->sk_flags does not have enough space on 32-bit systems.

Also, as mentioned in commit 16e572626961 ("af_unix: dont send
SCM_CREDENTIALS by default"), SOCK_PASSCRED and SOCK_PASSPIDFD handling
is known to be slow, and managing the flags in struct socket cannot
avoid that for embryo sockets.

Let's move SOCK_PASS{CRED,PIDFD,SEC} to struct sock.

While at it, other SOCK_XXX flags in net.h are grouped as enum.

Note that assign_bit() was atomic, so the writer side is moved down
after lock_sock() in setsockopt(), but the bit is only read once
in sendmsg() and recvmsg(), so lock_sock() is not needed there.

Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/net.h | 15 +++++++--------
 include/net/sock.h  | 16 +++++++++++++++-
 2 files changed, 22 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/linux/net.h b/include/linux/net.h
index 0ff950eecc6b..f8418d6e33e0 100644
--- a/include/linux/net.h
+++ b/include/linux/net.h
@@ -36,14 +36,13 @@ struct net;
  * in sock->flags, but moved into sk->sk_wq->flags to be RCU protected.
  * Eventually all flags will be in sk->sk_wq->flags.
  */
-#define SOCKWQ_ASYNC_NOSPACE	0
-#define SOCKWQ_ASYNC_WAITDATA	1
-#define SOCK_NOSPACE		2
-#define SOCK_PASSCRED		3
-#define SOCK_PASSSEC		4
-#define SOCK_SUPPORT_ZC		5
-#define SOCK_CUSTOM_SOCKOPT	6
-#define SOCK_PASSPIDFD		7
+enum socket_flags {
+	SOCKWQ_ASYNC_NOSPACE,
+	SOCKWQ_ASYNC_WAITDATA,
+	SOCK_NOSPACE,
+	SOCK_SUPPORT_ZC,
+	SOCK_CUSTOM_SOCKOPT,
+};
 
 #ifndef ARCH_HAS_SOCKET_TYPES
 /**
diff --git a/include/net/sock.h b/include/net/sock.h
index 483522377955..d90a71f66ab8 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -337,6 +337,11 @@ struct sk_filter;
   *	@sk_txtime_deadline_mode: set deadline mode for SO_TXTIME
   *	@sk_txtime_report_errors: set report errors mode for SO_TXTIME
   *	@sk_txtime_unused: unused txtime flags
+  *	@sk_scm_recv_flags: all flags used by scm_recv()
+  *	@sk_scm_credentials: flagged by SO_PASSCRED to recv SCM_CREDENTIALS
+  *	@sk_scm_security: flagged by SO_PASSSEC to recv SCM_SECURITY
+  *	@sk_scm_pidfd: flagged by SO_PASSPIDFD to recv SCM_PIDFD
+  *	@sk_scm_unused: unused flags for scm_recv()
   *	@ns_tracker: tracker for netns reference
   *	@sk_user_frags: xarray of pages the user is holding a reference on.
   *	@sk_owner: reference to the real owner of the socket that calls
@@ -523,7 +528,16 @@ struct sock {
 #endif
 	int			sk_disconnects;
 
-	u8			sk_txrehash;
+	union {
+		u8		sk_txrehash;
+		u8		sk_scm_recv_flags;
+		struct {
+			u8	sk_scm_credentials : 1,
+				sk_scm_security : 1,
+				sk_scm_pidfd : 1,
+				sk_scm_unused : 5;
+		};
+	};
 	u8			sk_clockid;
 	u8			sk_txtime_deadline_mode : 1,
 				sk_txtime_report_errors : 1,
-- 
cgit v1.2.3


From 77cbe1a6d8730a07f99f9263c2d5f2304cf5e830 Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@amazon.com>
Date: Mon, 19 May 2025 13:57:59 -0700
Subject: af_unix: Introduce SO_PASSRIGHTS.

As long as recvmsg() or recvmmsg() is used with cmsg, it is not
possible to avoid receiving file descriptors via SCM_RIGHTS.

This behaviour has occasionally been flagged as problematic, as
it can be (ab)used to trigger DoS during close(), for example, by
passing a FUSE-controlled fd or a hung NFS fd.

For instance, as noted on the uAPI Group page [0], an untrusted peer
could send a file descriptor pointing to a hung NFS mount and then
close it.  Once the receiver calls recvmsg() with msg_control, the
descriptor is automatically installed, and then the responsibility
for the final close() now falls on the receiver, which may result
in blocking the process for a long time.

Regarding this, systemd calls cmsg_close_all() [1] after each
recvmsg() to close() unwanted file descriptors sent via SCM_RIGHTS.

However, this cannot work around the issue at all, because the final
fput() may still occur on the receiver's side once sendmsg() with
SCM_RIGHTS succeeds.  Also, even filtering by LSM at recvmsg() does
not work for the same reason.

Thus, we need a better way to refuse SCM_RIGHTS at sendmsg().

Let's introduce SO_PASSRIGHTS to disable SCM_RIGHTS.

Note that this option is enabled by default for backward
compatibility.

Link: https://uapi-group.org/kernel-features/#disabling-reception-of-scm_rights-for-af_unix-sockets #[0]
Link: https://github.com/systemd/systemd/blob/v257.5/src/basic/fd-util.c#L612-L628 #[1]
Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h                | 4 +++-
 include/uapi/asm-generic/socket.h | 2 ++
 2 files changed, 5 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/sock.h b/include/net/sock.h
index d90a71f66ab8..92e7c1aae3cc 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -341,6 +341,7 @@ struct sk_filter;
   *	@sk_scm_credentials: flagged by SO_PASSCRED to recv SCM_CREDENTIALS
   *	@sk_scm_security: flagged by SO_PASSSEC to recv SCM_SECURITY
   *	@sk_scm_pidfd: flagged by SO_PASSPIDFD to recv SCM_PIDFD
+  *	@sk_scm_rights: flagged by SO_PASSRIGHTS to recv SCM_RIGHTS
   *	@sk_scm_unused: unused flags for scm_recv()
   *	@ns_tracker: tracker for netns reference
   *	@sk_user_frags: xarray of pages the user is holding a reference on.
@@ -535,7 +536,8 @@ struct sock {
 			u8	sk_scm_credentials : 1,
 				sk_scm_security : 1,
 				sk_scm_pidfd : 1,
-				sk_scm_unused : 5;
+				sk_scm_rights : 1,
+				sk_scm_unused : 4;
 		};
 	};
 	u8			sk_clockid;
diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h
index aa5016ff3d91..f333a0ac4ee4 100644
--- a/include/uapi/asm-generic/socket.h
+++ b/include/uapi/asm-generic/socket.h
@@ -145,6 +145,8 @@
 
 #define SO_RCVPRIORITY		82
 
+#define SO_PASSRIGHTS		83
+
 #if !defined(__KERNEL__)
 
 #if __BITS_PER_LONG == 64 || (defined(__x86_64__) && defined(__ILP32__))
-- 
cgit v1.2.3


From 9a119669fb1924cd9658c16da39a5a585e129e50 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Wed, 21 May 2025 11:38:48 +0200
Subject: netfilter: nf_tables: nft_fib: consistent l3mdev handling

fib has two modes:
1. Obtain output device according to source or destination address
2. Obtain the type of the address, e.g. local, unicast, multicast.

'fib daddr type' should return 'local' if the address is configured
in this netns or unicast otherwise.

'fib daddr . iif type' should return 'local' if the address is configured
on the input interface or unicast otherwise, i.e. more restrictive.

However, if the interface is part of a VRF, then 'fib daddr type'
returns unicast even if the address is configured on the incoming
interface.

This is broken for both ipv4 and ipv6.

In the ipv4 case, inet_dev_addr_type must only be used if the
'iif' or 'oif' (strict mode) was requested.

Else inet_addr_type_dev_table() needs to be used and the correct
dev argument must be passed as well so the correct fib (vrf) table
is used.

In the ipv6 case, the bug is similar: without strict mode, dev is NULL,
so .flowi6_l3mdev will be set to 0.

Add a new 'nft_fib_l3mdev_master_ifindex_rcu()' helper and use that
to init the .l3mdev structure member.

For ipv6, use it from nft_fib6_flowi_init() which gets called from
both the 'type' and the 'route' mode eval functions.

This provides consistent behaviour for all modes for both ipv4 and ipv6:
If strict matching is requested, the input respectively output device
of the netfilter hooks is used.

Otherwise, use skb->dev to obtain the l3mdev ifindex.

Without this, most type checks in updated nft_fib.sh selftest fail:

  FAIL: did not find veth0 . 10.9.9.1 . local in fibtype4
  FAIL: did not find veth0 . dead:1::1 . local in fibtype6
  FAIL: did not find veth0 . dead:9::1 . local in fibtype6
  FAIL: did not find tvrf . 10.0.1.1 . local in fibtype4
  FAIL: did not find tvrf . 10.9.9.1 . local in fibtype4
  FAIL: did not find tvrf . dead:1::1 . local in fibtype6
  FAIL: did not find tvrf . dead:9::1 . local in fibtype6
  FAIL: fib expression address types match (iif in vrf)

(fib erroneously returns 'unicast' for all of them, even
 though all of these addresses are local to the vrf).

Fixes: f6d0cbcf09c5 ("netfilter: nf_tables: add fib expression")
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nft_fib.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include')

diff --git a/include/net/netfilter/nft_fib.h b/include/net/netfilter/nft_fib.h
index 6e202ed5e63f..7370fba844ef 100644
--- a/include/net/netfilter/nft_fib.h
+++ b/include/net/netfilter/nft_fib.h
@@ -2,6 +2,7 @@
 #ifndef _NFT_FIB_H_
 #define _NFT_FIB_H_
 
+#include <net/l3mdev.h>
 #include <net/netfilter/nf_tables.h>
 
 struct nft_fib {
@@ -39,6 +40,14 @@ static inline bool nft_fib_can_skip(const struct nft_pktinfo *pkt)
 	return nft_fib_is_loopback(pkt->skb, indev);
 }
 
+static inline int nft_fib_l3mdev_master_ifindex_rcu(const struct nft_pktinfo *pkt,
+						    const struct net_device *iif)
+{
+	const struct net_device *dev = iif ? iif : pkt->skb->dev;
+
+	return l3mdev_master_ifindex_rcu(dev);
+}
+
 int nft_fib_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset);
 int nft_fib_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
 		 const struct nlattr * const tb[]);
-- 
cgit v1.2.3


From a1f1acb9c5db9b385c9b3eb1f27f897c06df49ae Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Mon, 12 May 2025 12:28:44 +0200
Subject: netfilter: nf_dup{4, 6}: Move duplication check to task_struct

nf_skb_duplicated is a per-CPU variable and relies on disabled BH for its
locking. Without per-CPU locking in local_bh_disable() on PREEMPT_RT
this data structure requires explicit locking.

Due to the recursion involved, the simplest change is to make it a
per-task variable.

Move the per-CPU variable nf_skb_duplicated to task_struct and name it
in_nf_duplicate. Add it to the existing bitfield so it doesn't use
additional memory.

Cc: Ingo Molnar <mingo@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Ben Segall <bsegall@google.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Valentin Schneider <vschneid@redhat.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter.h | 11 -----------
 include/linux/sched.h     |  1 +
 2 files changed, 1 insertion(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index 2b8aac2c70ad..892d12823ed4 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -497,17 +497,6 @@ struct nf_defrag_hook {
 extern const struct nf_defrag_hook __rcu *nf_defrag_v4_hook;
 extern const struct nf_defrag_hook __rcu *nf_defrag_v6_hook;
 
-/*
- * nf_skb_duplicated - TEE target has sent a packet
- *
- * When a xtables target sends a packet, the OUTPUT and POSTROUTING
- * hooks are traversed again, i.e. nft and xtables are invoked recursively.
- *
- * This is used by xtables TEE target to prevent the duplicated skb from
- * being duplicated again.
- */
-DECLARE_PER_CPU(bool, nf_skb_duplicated);
-
 /*
  * Contains bitmask of ctnetlink event subscribers, if any.
  * Can't be pernet due to NETLINK_LISTEN_ALL_NSID setsockopt flag.
diff --git a/include/linux/sched.h b/include/linux/sched.h
index f96ac1982893..52d9c52dc8f2 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1044,6 +1044,7 @@ struct task_struct {
 	/* delay due to memory thrashing */
 	unsigned                        in_thrashing:1;
 #endif
+	unsigned			in_nf_duplicate:1;
 #ifdef CONFIG_PREEMPT_RT
 	struct netdev_xmit		net_xmit;
 #endif
-- 
cgit v1.2.3


From f37ad91270397a6d053e8623bdb3cf79859691d2 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Mon, 12 May 2025 12:28:46 +0200
Subject: netfilter: nf_dup_netdev: Move the recursion counter struct
 netdev_xmit

nf_dup_skb_recursion is a per-CPU variable and relies on disabled BH for its
locking. Without per-CPU locking in local_bh_disable() on PREEMPT_RT
this data structure requires explicit locking.

Move nf_dup_skb_recursion to struct netdev_xmit, provide wrappers.

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netdevice_xmit.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include')

diff --git a/include/linux/netdevice_xmit.h b/include/linux/netdevice_xmit.h
index 848735b3a7c0..813a19122ebb 100644
--- a/include/linux/netdevice_xmit.h
+++ b/include/linux/netdevice_xmit.h
@@ -11,6 +11,9 @@ struct netdev_xmit {
 #if IS_ENABLED(CONFIG_NET_ACT_MIRRED)
 	u8 sched_mirred_nest;
 #endif
+#if IS_ENABLED(CONFIG_NF_DUP_NETDEV)
+	u8 nf_dup_skb_recursion;
+#endif
 };
 
 #endif
-- 
cgit v1.2.3


From 90869f43d06dfc836def2f53850a878f829e443e Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Thu, 22 May 2025 15:49:33 +0200
Subject: netfilter: conntrack: make nf_conntrack_id callable without a module
 dependency

While nf_conntrack_id() doesn't need any functionality from conntrack, it
does reside in nf_conntrack_core.c -- callers add a module
dependency on conntrack.

Followup patch will need to compute the conntrack id from nf_tables_trace.c
to include it in nf_trace messages emitted to userspace via netlink.

I don't want to introduce a module dependency between nf_tables and
conntrack for this.

Since trace is slowpath, the added indirection is ok.

One alternative is to move nf_conntrack_id to the netfilter/core.c,
but I don't see a compelling reason so far.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index 892d12823ed4..20947f2c685b 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -470,6 +470,7 @@ struct nf_ct_hook {
 	void (*attach)(struct sk_buff *nskb, const struct sk_buff *skb);
 	void (*set_closing)(struct nf_conntrack *nfct);
 	int (*confirm)(struct sk_buff *skb);
+	u32 (*get_id)(const struct nf_conntrack *nfct);
 };
 extern const struct nf_ct_hook __rcu *nf_ct_hook;
 
-- 
cgit v1.2.3


From 7e5c6aa67e6f6133c5a2c53852e1dd9af2c0c3fc Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Thu, 22 May 2025 15:49:34 +0200
Subject: netfilter: nf_tables: add packets conntrack state to debug trace info

Add the minimal relevant info needed for userspace ("nftables monitor
trace") to provide the conntrack view of the packet:

- state (new, related, established)
- direction (original, reply)
- status (e.g., if connection is subject to dnat)
- id (allows to query ctnetlink for remaining conntrack state info)

Example:
trace id a62 inet filter PRE_RAW packet: iif "enp0s3" ether [..]
  [..]
trace id a62 inet filter PRE_MANGLE conntrack: ct direction original ct state new ct id 32
trace id a62 inet filter PRE_MANGLE packet: [..]
 [..]
trace id a62 inet filter IN conntrack: ct direction original ct state new ct status dnat-done ct id 32
 [..]

In this case one can see that while NAT is active, the new connection
isn't subject to a translation.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nf_tables.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index 7d6bc19a0153..2beb30be2c5f 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -1841,6 +1841,10 @@ enum nft_xfrm_keys {
  * @NFTA_TRACE_MARK: nfmark (NLA_U32)
  * @NFTA_TRACE_NFPROTO: nf protocol processed (NLA_U32)
  * @NFTA_TRACE_POLICY: policy that decided fate of packet (NLA_U32)
+ * @NFTA_TRACE_CT_ID: conntrack id (NLA_U32)
+ * @NFTA_TRACE_CT_DIRECTION: packets direction (NLA_U8)
+ * @NFTA_TRACE_CT_STATUS: conntrack status (NLA_U32)
+ * @NFTA_TRACE_CT_STATE: packet state (new, established, ...) (NLA_U32)
  */
 enum nft_trace_attributes {
 	NFTA_TRACE_UNSPEC,
@@ -1861,6 +1865,10 @@ enum nft_trace_attributes {
 	NFTA_TRACE_NFPROTO,
 	NFTA_TRACE_POLICY,
 	NFTA_TRACE_PAD,
+	NFTA_TRACE_CT_ID,
+	NFTA_TRACE_CT_DIRECTION,
+	NFTA_TRACE_CT_STATUS,
+	NFTA_TRACE_CT_STATE,
 	__NFTA_TRACE_MAX
 };
 #define NFTA_TRACE_MAX (__NFTA_TRACE_MAX - 1)
-- 
cgit v1.2.3


From e225376d78fb2d85e99a2436a9e65765dc1ac234 Mon Sep 17 00:00:00 2001
From: Phil Sutter <phil@nwl.cc>
Date: Wed, 21 May 2025 22:44:23 +0200
Subject: netfilter: nf_tables: Introduce nft_hook_find_ops{,_rcu}()

Also a pretty dull wrapper around the hook->ops.dev comparison for now.
Will search the embedded nf_hook_ops list in future. The ugly cast to
eliminate the const qualifier will vanish then, too.

Since this future list will be RCU-protected, also introduce an _rcu()
variant here.

Signed-off-by: Phil Sutter <phil@nwl.cc>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include')

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 803d5f1601f9..df0b151743a2 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -1205,6 +1205,11 @@ struct nft_hook {
 	u8			ifnamelen;
 };
 
+struct nf_hook_ops *nft_hook_find_ops(const struct nft_hook *hook,
+				      const struct net_device *dev);
+struct nf_hook_ops *nft_hook_find_ops_rcu(const struct nft_hook *hook,
+					  const struct net_device *dev);
+
 /**
  *	struct nft_base_chain - nf_tables base chain
  *
-- 
cgit v1.2.3


From 73319a8ee18b9cf0b2dac87f8521595e0381ba0c Mon Sep 17 00:00:00 2001
From: Phil Sutter <phil@nwl.cc>
Date: Wed, 21 May 2025 22:44:26 +0200
Subject: netfilter: nf_tables: Have a list of nf_hook_ops in nft_hook

Supporting a 1:n relationship between nft_hook and nf_hook_ops is
convenient since a chain's or flowtable's nft_hooks may remain in place
despite matching interfaces disappearing. This stabilizes ruleset dumps
in that regard and opens the possibility to claim newly added interfaces
which match the spec. Also it prepares for wildcard interface specs
since these will potentially match multiple interfaces.

All spots dealing with hook registration are updated to handle a list of
multiple nf_hook_ops, but nft_netdev_hook_alloc() only adds a single
item for now to retain the old behaviour. The only expected functional
change here is how vanishing interfaces are handled: Instead of dropping
the respective nft_hook, only the matching nf_hook_ops are dropped.

To safely remove individual ops from the list in netdev handlers, an
rcu_head is added to struct nf_hook_ops so kfree_rcu() may be used.
There is at least nft_flowtable_find_dev() which may be iterating
through the list at the same time.

Signed-off-by: Phil Sutter <phil@nwl.cc>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter.h         | 3 +++
 include/net/netfilter/nf_tables.h | 2 +-
 2 files changed, 4 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index 20947f2c685b..5f896fcc074d 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -95,6 +95,9 @@ enum nf_hook_ops_type {
 };
 
 struct nf_hook_ops {
+	struct list_head	list;
+	struct rcu_head		rcu;
+
 	/* User fills in from here down. */
 	nf_hookfn		*hook;
 	struct net_device	*dev;
diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index df0b151743a2..5e49619ae49c 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -1199,7 +1199,7 @@ struct nft_stats {
 
 struct nft_hook {
 	struct list_head	list;
-	struct nf_hook_ops	ops;
+	struct list_head	ops_list;
 	struct rcu_head		rcu;
 	char			ifname[IFNAMSIZ];
 	u8			ifnamelen;
-- 
cgit v1.2.3


From 465b9ee0ee7bc268d7f261356afd6c4262e48d82 Mon Sep 17 00:00:00 2001
From: Phil Sutter <phil@nwl.cc>
Date: Wed, 21 May 2025 22:44:33 +0200
Subject: netfilter: nf_tables: Add notifications for hook changes

Notify user space if netdev hooks are updated due to netdev add/remove
events. Send minimal notification messages by introducing
NFT_MSG_NEWDEV/DELDEV message types describing a single device only.

Upon NETDEV_CHANGENAME, the callback has no information about the
interface's old name. To provide a clear message to user space, include
the hook's stored interface name in the notification.

Signed-off-by: Phil Sutter <phil@nwl.cc>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h        |  5 +++++
 include/uapi/linux/netfilter/nf_tables.h | 10 ++++++++++
 include/uapi/linux/netfilter/nfnetlink.h |  2 ++
 3 files changed, 17 insertions(+)

(limited to 'include')

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 5e49619ae49c..e4d8e451e935 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -1142,6 +1142,11 @@ int nft_set_catchall_validate(const struct nft_ctx *ctx, struct nft_set *set);
 int nf_tables_bind_chain(const struct nft_ctx *ctx, struct nft_chain *chain);
 void nf_tables_unbind_chain(const struct nft_ctx *ctx, struct nft_chain *chain);
 
+struct nft_hook;
+void nf_tables_chain_device_notify(const struct nft_chain *chain,
+				   const struct nft_hook *hook,
+				   const struct net_device *dev, int event);
+
 enum nft_chain_types {
 	NFT_CHAIN_T_DEFAULT = 0,
 	NFT_CHAIN_T_ROUTE,
diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index 2beb30be2c5f..518ba144544c 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -142,6 +142,8 @@ enum nf_tables_msg_types {
 	NFT_MSG_DESTROYOBJ,
 	NFT_MSG_DESTROYFLOWTABLE,
 	NFT_MSG_GETSETELEM_RESET,
+	NFT_MSG_NEWDEV,
+	NFT_MSG_DELDEV,
 	NFT_MSG_MAX,
 };
 
@@ -1784,10 +1786,18 @@ enum nft_synproxy_attributes {
  * enum nft_device_attributes - nf_tables device netlink attributes
  *
  * @NFTA_DEVICE_NAME: name of this device (NLA_STRING)
+ * @NFTA_DEVICE_TABLE: table containing the flowtable or chain hooking into the device (NLA_STRING)
+ * @NFTA_DEVICE_FLOWTABLE: flowtable hooking into the device (NLA_STRING)
+ * @NFTA_DEVICE_CHAIN: chain hooking into the device (NLA_STRING)
+ * @NFTA_DEVICE_SPEC: hook spec matching the device (NLA_STRING)
  */
 enum nft_devices_attributes {
 	NFTA_DEVICE_UNSPEC,
 	NFTA_DEVICE_NAME,
+	NFTA_DEVICE_TABLE,
+	NFTA_DEVICE_FLOWTABLE,
+	NFTA_DEVICE_CHAIN,
+	NFTA_DEVICE_SPEC,
 	__NFTA_DEVICE_MAX
 };
 #define NFTA_DEVICE_MAX		(__NFTA_DEVICE_MAX - 1)
diff --git a/include/uapi/linux/netfilter/nfnetlink.h b/include/uapi/linux/netfilter/nfnetlink.h
index 6cd58cd2a6f0..50d807af2649 100644
--- a/include/uapi/linux/netfilter/nfnetlink.h
+++ b/include/uapi/linux/netfilter/nfnetlink.h
@@ -25,6 +25,8 @@ enum nfnetlink_groups {
 #define NFNLGRP_ACCT_QUOTA		NFNLGRP_ACCT_QUOTA
 	NFNLGRP_NFTRACE,
 #define NFNLGRP_NFTRACE			NFNLGRP_NFTRACE
+	NFNLGRP_NFT_DEV,
+#define NFNLGRP_NFT_DEV			NFNLGRP_NFT_DEV
 	__NFNLGRP_MAX,
 };
 #define NFNLGRP_MAX	(__NFNLGRP_MAX - 1)
-- 
cgit v1.2.3


From 384492c48e6a88c9a7f0376d8e8ac7f557988e92 Mon Sep 17 00:00:00 2001
From: Stanislav Fomichev <stfomichev@gmail.com>
Date: Tue, 20 May 2025 13:30:42 -0700
Subject: net: devmem: support single IOV with sendmsg

sendmsg() with a single iov becomes ITER_UBUF, sendmsg() with multiple
iovs becomes ITER_IOVEC. iter_iov_len does not return the correct
value for UBUF, so teach it to treat UBUF differently.

Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Pavel Begunkov <asml.silence@gmail.com>
Cc: Mina Almasry <almasrymina@google.com>
Fixes: bd61848900bf ("net: devmem: Implement TX path")
Signed-off-by: Stanislav Fomichev <stfomichev@gmail.com>
Acked-by: Mina Almasry <almasrymina@google.com>
Reviewed-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/uio.h | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/uio.h b/include/linux/uio.h
index 49ece9e1888f..393d0622cc28 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -99,7 +99,13 @@ static inline const struct iovec *iter_iov(const struct iov_iter *iter)
 }
 
 #define iter_iov_addr(iter)	(iter_iov(iter)->iov_base + (iter)->iov_offset)
-#define iter_iov_len(iter)	(iter_iov(iter)->iov_len - (iter)->iov_offset)
+
+static inline size_t iter_iov_len(const struct iov_iter *i)
+{
+	if (i->iter_type == ITER_UBUF)
+		return i->count;
+	return iter_iov(i)->iov_len - i->iov_offset;
+}
 
 static inline enum iter_type iov_iter_type(const struct iov_iter *i)
 {
-- 
cgit v1.2.3


From e45b7196df60a4aef86c3998611c91fcc93d21f3 Mon Sep 17 00:00:00 2001
From: Qiu Yutan <qiu.yutan@zte.com.cn>
Date: Wed, 21 May 2025 10:14:08 +0800
Subject: net: neigh: use kfree_skb_reason() in neigh_resolve_output() and
 neigh_connected_output()

Replace kfree_skb() used in neigh_resolve_output() and
neigh_connected_output() with kfree_skb_reason().

The following new skb drop reason is added:
/* failed to fill the device hard header */
SKB_DROP_REASON_NEIGH_HH_FILLFAIL

Signed-off-by: Qiu Yutan <qiu.yutan@zte.com.cn>
Signed-off-by: Jiang Kun <jiang.kun2@zte.com.cn>
Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Reviewed-by: Xu Xin <xu.xin16@zte.com.cn>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dropreason-core.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include')

diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h
index bea77934a235..bcf9d7467e1a 100644
--- a/include/net/dropreason-core.h
+++ b/include/net/dropreason-core.h
@@ -62,6 +62,7 @@
 	FN(NEIGH_FAILED)		\
 	FN(NEIGH_QUEUEFULL)		\
 	FN(NEIGH_DEAD)			\
+	FN(NEIGH_HH_FILLFAIL)		\
 	FN(TC_EGRESS)			\
 	FN(SECURITY_HOOK)		\
 	FN(QDISC_DROP)			\
@@ -348,6 +349,8 @@ enum skb_drop_reason {
 	SKB_DROP_REASON_NEIGH_QUEUEFULL,
 	/** @SKB_DROP_REASON_NEIGH_DEAD: neigh entry is dead */
 	SKB_DROP_REASON_NEIGH_DEAD,
+	/** @SKB_DROP_REASON_NEIGH_HH_FILLFAIL: failed to fill the device hard header */
+	SKB_DROP_REASON_NEIGH_HH_FILLFAIL,
 	/** @SKB_DROP_REASON_TC_EGRESS: dropped in TC egress HOOK */
 	SKB_DROP_REASON_TC_EGRESS,
 	/** @SKB_DROP_REASON_SECURITY_HOOK: dropped due to security HOOK */
-- 
cgit v1.2.3


From 45ca7e9f0730ae36fc610e675b990e9cc9ca0714 Mon Sep 17 00:00:00 2001
From: Stefano Garzarella <sgarzare@redhat.com>
Date: Wed, 21 May 2025 14:17:05 +0200
Subject: vsock/virtio: fix `rx_bytes` accounting for stream sockets
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In `struct virtio_vsock_sock`, we maintain two counters:
- `rx_bytes`: used internally to track how many bytes have been read.
  This supports mechanisms like .stream_has_data() and sock_rcvlowat().
- `fwd_cnt`: used for the credit mechanism to inform available receive
  buffer space to the remote peer.

These counters are updated via virtio_transport_inc_rx_pkt() and
virtio_transport_dec_rx_pkt().

Since the beginning with commit 06a8fc78367d ("VSOCK: Introduce
virtio_vsock_common.ko"), we call virtio_transport_dec_rx_pkt() in
virtio_transport_stream_do_dequeue() only when we consume the entire
packet, so partial reads do not update `rx_bytes` and `fwd_cnt`.

This is fine for `fwd_cnt`, because we still have space used for the
entire packet, and we don't want to update the credit for the other
peer until we free the space of the entire packet. However, this
causes `rx_bytes` to be stale on partial reads.

Previously, this didn’t cause issues because `rx_bytes` was used only by
.stream_has_data(), and any unread portion of a packet implied data was
still available. However, since commit 93b808876682
("virtio/vsock: fix logic which reduces credit update messages"), we now
rely on `rx_bytes` to determine if a credit update should be sent when
the data in the RX queue drops below SO_RCVLOWAT value.

This patch fixes the accounting by updating `rx_bytes` with the number
of bytes actually read, even on partial reads, while leaving `fwd_cnt`
untouched until the packet is fully consumed. Also introduce a new
`buf_used` counter to check that the remote peer is honoring the given
credit; this was previously done via `rx_bytes`.

Fixes: 93b808876682 ("virtio/vsock: fix logic which reduces credit update messages")
Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
Link: https://patch.msgid.link/20250521121705.196379-1-sgarzare@redhat.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/linux/virtio_vsock.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
index 0387d64e2c66..36fb3edfa403 100644
--- a/include/linux/virtio_vsock.h
+++ b/include/linux/virtio_vsock.h
@@ -140,6 +140,7 @@ struct virtio_vsock_sock {
 	u32 last_fwd_cnt;
 	u32 rx_bytes;
 	u32 buf_alloc;
+	u32 buf_used;
 	struct sk_buff_head rx_queue;
 	u32 msg_count;
 };
-- 
cgit v1.2.3


From ed449ddbd867f2cc02d6890c231431f264a876eb Mon Sep 17 00:00:00 2001
From: Kees Cook <kees@kernel.org>
Date: Wed, 21 May 2025 13:46:09 -0700
Subject: net: core: Convert inet_addr_is_any() to sockaddr_storage

All the callers of inet_addr_is_any() have a sockaddr_storage-backed
sockaddr. Avoid casts and switch prototype to the actual object being
used.

Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com> # SCSI
Signed-off-by: Kees Cook <kees@kernel.org>
Link: https://patch.msgid.link/20250521204619.2301870-1-kees@kernel.org
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/linux/inet.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/inet.h b/include/linux/inet.h
index bd8276e96e60..9158772f3559 100644
--- a/include/linux/inet.h
+++ b/include/linux/inet.h
@@ -55,6 +55,6 @@ extern int in6_pton(const char *src, int srclen, u8 *dst, int delim, const char
 
 extern int inet_pton_with_scope(struct net *net, unsigned short af,
 		const char *src, const char *port, struct sockaddr_storage *addr);
-extern bool inet_addr_is_any(struct sockaddr *addr);
+bool inet_addr_is_any(struct sockaddr_storage *addr);
 
 #endif	/* _LINUX_INET_H */
-- 
cgit v1.2.3


From 161972650d6795ea00f8b72557cf3c3e593ed250 Mon Sep 17 00:00:00 2001
From: Kees Cook <kees@kernel.org>
Date: Wed, 21 May 2025 13:46:10 -0700
Subject: net: core: Switch netif_set_mac_address() to struct sockaddr_storage

In order to avoid passing around struct sockaddr that has a size the
compiler cannot reason about (nor track at runtime), convert
netif_set_mac_address() to take struct sockaddr_storage. This is just a
cast conversion, so there are no binary changes. Following patches
will make actual allocation changes.

Acked-by: Gustavo A. R. Silva <gustavoars@kernel.org>
Signed-off-by: Kees Cook <kees@kernel.org>
Link: https://patch.msgid.link/20250521204619.2301870-2-kees@kernel.org
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/linux/netdevice.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index ea9d335de130..47200a394a02 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -4212,7 +4212,7 @@ int netif_set_mtu(struct net_device *dev, int new_mtu);
 int dev_set_mtu(struct net_device *, int);
 int dev_pre_changeaddr_notify(struct net_device *dev, const char *addr,
 			      struct netlink_ext_ack *extack);
-int netif_set_mac_address(struct net_device *dev, struct sockaddr *sa,
+int netif_set_mac_address(struct net_device *dev, struct sockaddr_storage *ss,
 			  struct netlink_ext_ack *extack);
 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa,
 			struct netlink_ext_ack *extack);
-- 
cgit v1.2.3


From 9ca6804ab7c34f65fcf2e29333a39e7807c30b60 Mon Sep 17 00:00:00 2001
From: Kees Cook <kees@kernel.org>
Date: Wed, 21 May 2025 13:46:14 -0700
Subject: net: core: Convert dev_set_mac_address() to struct sockaddr_storage

All users of dev_set_mac_address() are now using a struct sockaddr_storage.
Convert the internal data type to struct sockaddr_storage, drop the casts,
and update pointer types.

Acked-by: Gustavo A. R. Silva <gustavoars@kernel.org>
Signed-off-by: Kees Cook <kees@kernel.org>
Link: https://patch.msgid.link/20250521204619.2301870-6-kees@kernel.org
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/linux/netdevice.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 47200a394a02..b4242b997373 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -4214,7 +4214,7 @@ int dev_pre_changeaddr_notify(struct net_device *dev, const char *addr,
 			      struct netlink_ext_ack *extack);
 int netif_set_mac_address(struct net_device *dev, struct sockaddr_storage *ss,
 			  struct netlink_ext_ack *extack);
-int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa,
+int dev_set_mac_address(struct net_device *dev, struct sockaddr_storage *ss,
 			struct netlink_ext_ack *extack);
 int dev_set_mac_address_user(struct net_device *dev, struct sockaddr *sa,
 			     struct netlink_ext_ack *extack);
-- 
cgit v1.2.3


From ae9fcd5a0f8ab7e12619e1c66312a03b842935c3 Mon Sep 17 00:00:00 2001
From: Kees Cook <kees@kernel.org>
Date: Wed, 21 May 2025 13:46:16 -0700
Subject: net: core: Convert dev_set_mac_address_user() to use struct
 sockaddr_storage

Convert callers of dev_set_mac_address_user() to use struct
sockaddr_storage. Add sanity checks on dev->addr_len usage.

Signed-off-by: Kees Cook <kees@kernel.org>
Acked-by: Gustavo A. R. Silva <gustavoars@kernel.org>
Link: https://patch.msgid.link/20250521204619.2301870-8-kees@kernel.org
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/linux/netdevice.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index b4242b997373..adb14db25798 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -4216,7 +4216,7 @@ int netif_set_mac_address(struct net_device *dev, struct sockaddr_storage *ss,
 			  struct netlink_ext_ack *extack);
 int dev_set_mac_address(struct net_device *dev, struct sockaddr_storage *ss,
 			struct netlink_ext_ack *extack);
-int dev_set_mac_address_user(struct net_device *dev, struct sockaddr *sa,
+int dev_set_mac_address_user(struct net_device *dev, struct sockaddr_storage *ss,
 			     struct netlink_ext_ack *extack);
 int dev_get_mac_address(struct sockaddr *sa, struct net *net, char *dev_name);
 int dev_get_port_parent_id(struct net_device *dev,
-- 
cgit v1.2.3


From ba3d7b93dbe3202bf8ead473d75885af773068bc Mon Sep 17 00:00:00 2001
From: Jordan Rife <jordan@jrife.io>
Date: Wed, 21 May 2025 23:27:06 +0200
Subject: wireguard: allowedips: add WGALLOWEDIP_F_REMOVE_ME flag

The current netlink API for WireGuard does not directly support removal
of allowed ips from a peer. A user can remove an allowed ip from a peer
in one of two ways:

1. By using the WGPEER_F_REPLACE_ALLOWEDIPS flag and providing a new
   list of allowed ips which omits the allowed ip that is to be removed.
2. By reassigning an allowed ip to a "dummy" peer then removing that
   peer with WGPEER_F_REMOVE_ME.

With the first approach, the driver completely rebuilds the allowed ip
list for a peer. If my current configuration is such that a peer has
allowed ips 192.168.0.2 and 192.168.0.3 and I want to remove 192.168.0.2
the actual transition looks like this.

[192.168.0.2, 192.168.0.3] <-- Initial state
[]                         <-- Step 1: Allowed ips removed for peer
[192.168.0.3]              <-- Step 2: Allowed ips added back for peer

This is true even if the allowed ip list is small and the update does
not need to be batched into multiple WG_CMD_SET_DEVICE requests, as the
removal and subsequent addition of ips is non-atomic within a single
request. Consequently, wg_allowedips_lookup_dst and
wg_allowedips_lookup_src may return NULL while reconfiguring a peer even
for packets bound for ips a user did not intend to remove, leading to
unintended interruptions in connectivity. This presents in userspace as
failed calls to sendto and sendmsg for UDP sockets. In my case, I ran
netperf while repeatedly reconfiguring the allowed ips for a peer with
wg.

/usr/local/bin/netperf -H 10.102.73.72 -l 10m -t UDP_STREAM -- -R 1 -m 1024
send_data: data send error: No route to host (errno 113)
netperf: send_omni: send_data failed: No route to host

While this may not be of particular concern for environments where peers
and allowed ips are mostly static, systems like Cilium manage peers and
allowed ips in a dynamic environment where peers (i.e. Kubernetes nodes)
and allowed ips (i.e. pods running on those nodes) can frequently
change making WGPEER_F_REPLACE_ALLOWEDIPS problematic.

The second approach avoids any possible connectivity interruptions
but is hacky and less direct, requiring the creation of a temporary
peer just to dispose of an allowed ip.

Introduce a new flag called WGALLOWEDIP_F_REMOVE_ME which in the same
way that WGPEER_F_REMOVE_ME allows a user to remove a single peer from
a WireGuard device's configuration allows a user to remove an ip from a
peer's set of allowed ips. This enables incremental updates to a
device's configuration without any connectivity blips or messy
workarounds.

A corresponding patch for wg extends the existing `wg set` interface to
leverage this feature.

$ wg set wg0 peer <PUBKEY> allowed-ips +192.168.88.0/24,-192.168.0.1/32

When '+' or '-' is prepended to any ip in the list, wg clears
WGPEER_F_REPLACE_ALLOWEDIPS and sets the WGALLOWEDIP_F_REMOVE_ME flag on
any ip prefixed with '-'.

Signed-off-by: Jordan Rife <jordan@jrife.io>
[Jason: minor style nits, fixes to selftest, bump of wireguard-tools version]
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Link: https://patch.msgid.link/20250521212707.1767879-5-Jason@zx2c4.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/uapi/linux/wireguard.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/wireguard.h b/include/uapi/linux/wireguard.h
index ae88be14c947..8c26391196d5 100644
--- a/include/uapi/linux/wireguard.h
+++ b/include/uapi/linux/wireguard.h
@@ -101,6 +101,10 @@
  *                    WGALLOWEDIP_A_FAMILY: NLA_U16
  *                    WGALLOWEDIP_A_IPADDR: struct in_addr or struct in6_addr
  *                    WGALLOWEDIP_A_CIDR_MASK: NLA_U8
+ *                    WGALLOWEDIP_A_FLAGS: NLA_U32, WGALLOWEDIP_F_REMOVE_ME if
+ *                                         the specified IP should be removed;
+ *                                         otherwise, this IP will be added if
+ *                                         it is not already present.
  *                0: NLA_NESTED
  *                    ...
  *                0: NLA_NESTED
@@ -184,11 +188,16 @@ enum wgpeer_attribute {
 };
 #define WGPEER_A_MAX (__WGPEER_A_LAST - 1)
 
+enum wgallowedip_flag {
+	WGALLOWEDIP_F_REMOVE_ME = 1U << 0,
+	__WGALLOWEDIP_F_ALL = WGALLOWEDIP_F_REMOVE_ME
+};
 enum wgallowedip_attribute {
 	WGALLOWEDIP_A_UNSPEC,
 	WGALLOWEDIP_A_FAMILY,
 	WGALLOWEDIP_A_IPADDR,
 	WGALLOWEDIP_A_CIDR_MASK,
+	WGALLOWEDIP_A_FLAGS,
 	__WGALLOWEDIP_A_LAST
 };
 #define WGALLOWEDIP_A_MAX (__WGALLOWEDIP_A_LAST - 1)
-- 
cgit v1.2.3


From 5ec40864aaecc4bd66fe67541d4a41091ed664a5 Mon Sep 17 00:00:00 2001
From: Michal Luczaj <mhal@rbox.co>
Date: Thu, 22 May 2025 01:18:22 +0200
Subject: vsock: Move lingering logic to af_vsock core

Lingering should be transport-independent in the long run. In preparation
for supporting other transports, as well as the linger on shutdown(), move
code to core.

Generalize by querying vsock_transport::unsent_bytes(), guard against the
callback being unimplemented. Do not pass sk_lingertime explicitly. Pull
SOCK_LINGER check into vsock_linger().

Flatten the function. Remove the nested block by inverting the condition:
return early on !timeout.

Suggested-by: Stefano Garzarella <sgarzare@redhat.com>
Reviewed-by: Stefano Garzarella <sgarzare@redhat.com>
Signed-off-by: Michal Luczaj <mhal@rbox.co>
Link: https://patch.msgid.link/20250522-vsock-linger-v6-2-2ad00b0e447e@rbox.co
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/net/af_vsock.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h
index 9e85424c8343..d56e6e135158 100644
--- a/include/net/af_vsock.h
+++ b/include/net/af_vsock.h
@@ -221,6 +221,7 @@ void vsock_for_each_connected_socket(struct vsock_transport *transport,
 				     void (*fn)(struct sock *sk));
 int vsock_assign_transport(struct vsock_sock *vsk, struct vsock_sock *psk);
 bool vsock_find_cid(unsigned int cid);
+void vsock_linger(struct sock *sk);
 
 /**** TAP ****/
 
-- 
cgit v1.2.3


From e9cb929670a1e98b592b30f03f06e9e20110f318 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Thu, 22 May 2025 13:21:47 +0200
Subject: net: phy: fix up const issues in to_mdio_device() and to_phy_device()

Both to_mdio_device() and to_phy_device() "throw away" the const pointer
attribute passed to them and return a non-const pointer, which is generally
not a good thing.  Fix this up by using container_of_const(), which was
designed for this very problem.

Cc: Alexander Lobakin <alobakin@pm.me>
Cc: Andrew Lunn <andrew@lunn.ch>
Cc: Heiner Kallweit <hkallweit1@gmail.com>
Cc: Russell King <linux@armlinux.org.uk>
Fixes: 7eab14de73a8 ("mdio, phy: fix -Wshadow warnings triggered by nested container_of()")
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Link: https://patch.msgid.link/2025052246-conduit-glory-8fc9@gregkh
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/mdio.h | 5 +----
 include/linux/phy.h  | 5 +----
 2 files changed, 2 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/linux/mdio.h b/include/linux/mdio.h
index 3c3deac57894..e43ff9f980a4 100644
--- a/include/linux/mdio.h
+++ b/include/linux/mdio.h
@@ -45,10 +45,7 @@ struct mdio_device {
 	unsigned int reset_deassert_delay;
 };
 
-static inline struct mdio_device *to_mdio_device(const struct device *dev)
-{
-	return container_of(dev, struct mdio_device, dev);
-}
+#define to_mdio_device(__dev)	container_of_const(__dev, struct mdio_device, dev)
 
 /* struct mdio_driver_common: Common to all MDIO drivers */
 struct mdio_driver_common {
diff --git a/include/linux/phy.h b/include/linux/phy.h
index 32b9da274115..e194dad1623d 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -744,10 +744,7 @@ struct phy_device {
 #define PHY_F_NO_IRQ		0x80000000
 #define PHY_F_RXC_ALWAYS_ON	0x40000000
 
-static inline struct phy_device *to_phy_device(const struct device *dev)
-{
-	return container_of(to_mdio_device(dev), struct phy_device, mdio);
-}
+#define to_phy_device(__dev)	container_of_const(to_mdio_device(__dev), struct phy_device, mdio)
 
 /**
  * struct phy_tdr_config - Configuration of a TDR raw test
-- 
cgit v1.2.3


From 33f1b3677a13dda60a2a59858f7916672e7f1546 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 26 May 2025 07:47:45 +0200
Subject: sctp: mark sctp_do_peeloff static

sctp_do_peeloff is only used inside of net/sctp/socket.c,
so mark it static.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Xin Long <lucien.xin@gmail.com>
Link: https://patch.msgid.link/20250526054745.2329201-1-hch@lst.de
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/sctp/sctp.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include')

diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h
index d8da764cf6de..e96d1bd087f6 100644
--- a/include/net/sctp/sctp.h
+++ b/include/net/sctp/sctp.h
@@ -364,8 +364,6 @@ sctp_assoc_to_state(const struct sctp_association *asoc)
 /* Look up the association by its id.  */
 struct sctp_association *sctp_id2assoc(struct sock *sk, sctp_assoc_t id);
 
-int sctp_do_peeloff(struct sock *sk, sctp_assoc_t id, struct socket **sockp);
-
 /* A macro to walk a list of skbs.  */
 #define sctp_skb_for_each(pos, head, tmp) \
 	skb_queue_walk_safe(head, pos, tmp)
-- 
cgit v1.2.3


From 290e5d3c49f687c1567bde634dc33d57b0674919 Mon Sep 17 00:00:00 2001
From: Haiyang Zhang <haiyangz@microsoft.com>
Date: Mon, 19 May 2025 09:20:36 -0700
Subject: net: mana: Add support for Multi Vports on Bare metal

To support Multi Vports on Bare metal, increase the device config response
version. Also skip the "register HW vport" and "register filter" steps when
the Bare metal hostmode is set.

Signed-off-by: Haiyang Zhang <haiyangz@microsoft.com>
Link: https://patch.msgid.link/1747671636-5810-1-git-send-email-haiyangz@microsoft.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/net/mana/mana.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h
index 0f78065de8fe..38238c1d00bf 100644
--- a/include/net/mana/mana.h
+++ b/include/net/mana/mana.h
@@ -408,6 +408,7 @@ struct mana_context {
 	struct gdma_dev *gdma_dev;
 
 	u16 num_ports;
+	u8 bm_hostmode;
 
 	struct mana_eq *eqs;
 	struct dentry *mana_eqs_debugfs;
@@ -557,7 +558,8 @@ struct mana_query_device_cfg_resp {
 	u64 pf_cap_flags4;
 
 	u16 max_num_vports;
-	u16 reserved;
+	u8 bm_hostmode; /* response v3: Bare Metal Host Mode */
+	u8 reserved;
 	u32 max_num_eqs;
 
 	/* response v2: */
-- 
cgit v1.2.3