From 80d2eefcb4c84aa9018b2a997ab3a4c567bc821a Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Mon, 25 Mar 2024 08:40:30 +0100 Subject: net: Use backlog-NAPI to clean up the defer_list. The defer_list is a per-CPU list which is used to free skbs outside of the socket lock and on the CPU on which they have been allocated. The list is processed during NAPI callbacks so ideally the list is cleaned up. Should the amount of skbs on the list exceed a certain water mark then the softirq is triggered remotely on the target CPU by invoking a remote function call. The raise of the softirqs via a remote function call leads to waking the ksoftirqd on PREEMPT_RT which is undesired. The backlog-NAPI threads already provide the infrastructure which can be utilized to perform the cleanup of the defer_list. The NAPI state is updated with the input_pkt_queue.lock acquired. It order not to break the state, it is needed to also wake the backlog-NAPI thread with the lock held. This requires to acquire the use the lock in rps_lock_irq*() if the backlog-NAPI threads are used even with RPS disabled. Move the logic of remotely starting softirqs to clean up the defer_list into kick_defer_list_purge(). Make sure a lock is held in rps_lock_irq*() if backlog-NAPI threads are used. Schedule backlog-NAPI for defer_list cleanup if backlog-NAPI is available. Acked-by: Jakub Kicinski Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Paolo Abeni --- include/linux/netdevice.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index cb37817d6382..e41d30ebaca6 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3287,6 +3287,7 @@ static inline void dev_xmit_recursion_dec(void) __this_cpu_dec(softnet_data.xmit.recursion); } +void kick_defer_list_purge(struct softnet_data *sd, unsigned int cpu); void __netif_schedule(struct Qdisc *q); void netif_schedule_queue(struct netdev_queue *txq); -- cgit v1.2.3 From 117aef12a7b1b797bce9f66b156c65eab850b5b5 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Wed, 27 Mar 2024 16:23:52 +0100 Subject: ip_tunnel: use a separate struct to store tunnel params in the kernel Unlike IPv6 tunnels which use purely-kernel __ip6_tnl_parm structure to store params inside the kernel, IPv4 tunnel code uses the same ip_tunnel_parm which is being used to talk with the userspace. This makes it difficult to alter or add any fields or use a different format for whatever data. Define struct ip_tunnel_parm_kern, a 1:1 copy of ip_tunnel_parm for now, and use it throughout the code. Define the pieces, where the copy user <-> kernel happens, as standalone functions, and copy the data there field-by-field, so that the kernel-side structure could be easily modified later on and the users wouldn't have to care about this. Reviewed-by: Simon Horman Signed-off-by: Alexander Lobakin Signed-off-by: David S. Miller --- include/linux/netdevice.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index e41d30ebaca6..55c7cf9404a4 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -59,7 +59,7 @@ struct ethtool_ops; struct kernel_hwtstamp_config; struct phy_device; struct dsa_port; -struct ip_tunnel_parm; +struct ip_tunnel_parm_kern; struct macsec_context; struct macsec_ops; struct netdev_name_node; @@ -1327,7 +1327,7 @@ struct netdev_net_notifier { * queue id bound to an AF_XDP socket. The flags field specifies if * only RX, only Tx, or both should be woken up using the flags * XDP_WAKEUP_RX and XDP_WAKEUP_TX. - * int (*ndo_tunnel_ctl)(struct net_device *dev, struct ip_tunnel_parm *p, + * int (*ndo_tunnel_ctl)(struct net_device *dev, struct ip_tunnel_parm_kern *p, * int cmd); * Add, change, delete or get information on an IPv4 tunnel. * struct net_device *(*ndo_get_peer_dev)(struct net_device *dev); @@ -1583,7 +1583,8 @@ struct net_device_ops { int (*ndo_xsk_wakeup)(struct net_device *dev, u32 queue_id, u32 flags); int (*ndo_tunnel_ctl)(struct net_device *dev, - struct ip_tunnel_parm *p, int cmd); + struct ip_tunnel_parm_kern *p, + int cmd); struct net_device * (*ndo_get_peer_dev)(struct net_device *dev); int (*ndo_fill_forward_path)(struct net_device_path_ctx *ctx, struct net_device_path *path); -- cgit v1.2.3 From b9495b564d91a0afe4125db64d2bc25c310bda9c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 29 Mar 2024 15:42:18 +0000 Subject: net: move kick_defer_list_purge() to net/core/dev.h kick_defer_list_purge() is defined in net/core/dev.c and used from net/core/skubff.c Because we need softnet_data, include from net/core/dev.h Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 55c7cf9404a4..15e70527b703 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3288,7 +3288,6 @@ static inline void dev_xmit_recursion_dec(void) __this_cpu_dec(softnet_data.xmit.recursion); } -void kick_defer_list_purge(struct softnet_data *sd, unsigned int cpu); void __netif_schedule(struct Qdisc *q); void netif_schedule_queue(struct netdev_queue *txq); -- cgit v1.2.3 From 2fe50a4d7225cf10748775e290361896637091a9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 29 Mar 2024 15:42:19 +0000 Subject: net: move dev_xmit_recursion() helpers to net/core/dev.h Move dev_xmit_recursion() and friends to net/core/dev.h They are only used from net/core/dev.c and net/core/filter.c. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 17 ----------------- 1 file changed, 17 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 15e70527b703..0cd9ee83f554 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3271,23 +3271,6 @@ static inline int dev_recursion_level(void) return this_cpu_read(softnet_data.xmit.recursion); } -#define XMIT_RECURSION_LIMIT 8 -static inline bool dev_xmit_recursion(void) -{ - return unlikely(__this_cpu_read(softnet_data.xmit.recursion) > - XMIT_RECURSION_LIMIT); -} - -static inline void dev_xmit_recursion_inc(void) -{ - __this_cpu_inc(softnet_data.xmit.recursion); -} - -static inline void dev_xmit_recursion_dec(void) -{ - __this_cpu_dec(softnet_data.xmit.recursion); -} - void __netif_schedule(struct Qdisc *q); void netif_schedule_queue(struct netdev_queue *txq); -- cgit v1.2.3 From a7ae7b0b2ea014ff3ed4be812c3efa1b1d86e153 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 29 Mar 2024 15:42:21 +0000 Subject: net: make softnet_data.dropped an atomic_t If under extreme cpu backlog pressure enqueue_to_backlog() has to drop a packet, it could do this without dirtying a cache line and potentially slowing down the target cpu. Move sd->dropped into a separate cache line, and make it atomic. In non pressure mode, this field is not touched, no need to consume valuable space in a hot cache line. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 0cd9ee83f554..be9f071a340d 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3237,10 +3237,11 @@ struct softnet_data { unsigned int input_queue_tail; #endif unsigned int received_rps; - unsigned int dropped; struct sk_buff_head input_pkt_queue; struct napi_struct backlog; + atomic_t dropped ____cacheline_aligned_in_smp; + /* Another possibly contended cache line */ spinlock_t defer_lock ____cacheline_aligned_in_smp; int defer_count; -- cgit v1.2.3 From 36b83ffcf209a2e6099dae1070df6a2001dfab27 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 29 Mar 2024 15:42:23 +0000 Subject: net: rps: change input_queue_tail_incr_save() input_queue_tail_incr_save() is incrementing the sd queue_tail and save it in the flow last_qtail. Two issues here : - no lock protects the write on last_qtail, we should use appropriate annotations. - We can perform this write after releasing the per-cpu backlog lock, to decrease this lock hold duration (move away the cache line miss) Also move input_queue_head_incr() and rps helpers to include/net/rps.h, while adding rps_ prefix to better reflect their role. v2: Fixed a build issue (Jakub and kernel build bots) Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 15 --------------- 1 file changed, 15 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index be9f071a340d..2c11c295cf64 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3250,21 +3250,6 @@ struct softnet_data { call_single_data_t defer_csd; }; -static inline void input_queue_head_incr(struct softnet_data *sd) -{ -#ifdef CONFIG_RPS - sd->input_queue_head++; -#endif -} - -static inline void input_queue_tail_incr_save(struct softnet_data *sd, - unsigned int *qtail) -{ -#ifdef CONFIG_RPS - *qtail = ++sd->input_queue_tail; -#endif -} - DECLARE_PER_CPU_ALIGNED(struct softnet_data, softnet_data); static inline int dev_recursion_level(void) -- cgit v1.2.3 From d3ae5f4632c107d3c2eeb97a60fecc6a6f9d6fbe Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 29 Mar 2024 15:42:25 +0000 Subject: net: rps: move received_rps field to a better location Commit 14d898f3c1b3 ("dev: Move received_rps counter next to RPS members in softnet data") was unfortunate: received_rps is dirtied by a cpu and never read by other cpus in fast path. Its presence in the hot RPS cache line (shared by many cpus) is hurting RPS/RFS performance. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 2c11c295cf64..7d12b5a9380f 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3204,6 +3204,7 @@ struct softnet_data { struct softnet_data *rps_ipi_list; #endif + unsigned int received_rps; bool in_net_rx_action; bool in_napi_threaded_poll; @@ -3236,7 +3237,6 @@ struct softnet_data { unsigned int cpu; unsigned int input_queue_tail; #endif - unsigned int received_rps; struct sk_buff_head input_pkt_queue; struct napi_struct backlog; -- cgit v1.2.3 From b1f81b9a535b48b2c9ca460720a2bc73fd2001de Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 28 Mar 2024 08:27:50 +0100 Subject: netdevice: add DEFINE_FREE() for dev_put For short netdev holds within a function there are still a lot of users of dev_put() rather than netdev_put(). Add DEFINE_FREE() to allow making those safer. Signed-off-by: Johannes Berg Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 7d12b5a9380f..0c198620ac93 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4097,6 +4097,8 @@ static inline void dev_put(struct net_device *dev) netdev_put(dev, NULL); } +DEFINE_FREE(dev_put, struct net_device *, if (_T) dev_put(_T)) + static inline void netdev_ref_replace(struct net_device *odev, struct net_device *ndev, netdevice_tracker *tracker, -- cgit v1.2.3 From 6916e461e7933d3d003441291c543938f2ccb371 Mon Sep 17 00:00:00 2001 From: Maxime Chevallier Date: Thu, 4 Apr 2024 11:29:51 +0200 Subject: net: phy: Introduce ethernet link topology representation Link topologies containing multiple network PHYs attached to the same net_device can be found when using a PHY as a media converter for use with an SFP connector, on which an SFP transceiver containing a PHY can be used. With the current model, the transceiver's PHY can't be used for operations such as cable testing, timestamping, macsec offload, etc. The reason being that most of the logic for these configuration, coming from either ethtool netlink or ioctls tend to use netdev->phydev, which in multi-phy systems will reference the PHY closest to the MAC. Introduce a numbering scheme allowing to enumerate PHY devices that belong to any netdev, which can in turn allow userspace to take more precise decisions with regard to each PHY's configuration. The numbering is maintained per-netdev, in a phy_device_list. The numbering works similarly to a netdevice's ifindex, with identifiers that are only recycled once INT_MAX has been reached. This prevents races that could occur between PHY listing and SFP transceiver removal/insertion. The identifiers are assigned at phy_attach time, as the numbering depends on the netdevice the phy is attached to. The PHY index can be re-used for PHYs that are persistent. Signed-off-by: Maxime Chevallier Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- include/linux/netdevice.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 0c198620ac93..d45f330d083d 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -40,7 +40,6 @@ #include #endif #include - #include #include #include @@ -52,6 +51,7 @@ #include #include #include +#include struct netpoll_info; struct device; @@ -1974,6 +1974,7 @@ enum netdev_reg_state { * @fcoe_ddp_xid: Max exchange id for FCoE LRO by ddp * * @priomap: XXX: need comments on this one + * @link_topo: Physical link topology tracking attached PHYs * @phydev: Physical device may attach itself * for hardware timestamping * @sfp_bus: attached &struct sfp_bus structure. @@ -2364,6 +2365,7 @@ struct net_device { #if IS_ENABLED(CONFIG_CGROUP_NET_PRIO) struct netprio_map __rcu *priomap; #endif + struct phy_link_topology *link_topo; struct phy_device *phydev; struct sfp_bus *sfp_bus; struct lock_class_key *qdisc_tx_busylock; -- cgit v1.2.3 From c661050f93d3fd37a33c06041bb18a89688de7d2 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Mon, 22 Apr 2024 05:38:56 -0700 Subject: net: create a dummy net_device allocator It is impossible to use init_dummy_netdev together with alloc_netdev() as the 'setup' argument. This is because alloc_netdev() initializes some fields in the net_device structure, and later init_dummy_netdev() memzero them all. This causes some problems as reported here: https://lore.kernel.org/all/20240322082336.49f110cc@kernel.org/ Split the init_dummy_netdev() function in two. Create a new function called init_dummy_netdev_core() that does not memzero the net_device structure. Then have init_dummy_netdev() memzero-ing and calling init_dummy_netdev_core(), keeping the old behaviour. init_dummy_netdev_core() is the new function that could be called as an argument for alloc_netdev(). Also, create a helper to allocate and initialize dummy net devices, leveraging init_dummy_netdev_core() as the setup argument. This function basically simplify the allocation of dummy devices, by allocating and initializing it. Freeing the device continue to be done through free_netdev() Suggested-by: Jakub Kicinski Signed-off-by: Breno Leitao Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- include/linux/netdevice.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index d45f330d083d..f849e7d110ed 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4519,6 +4519,9 @@ static inline void netif_addr_unlock_bh(struct net_device *dev) void ether_setup(struct net_device *dev); +/* Allocate dummy net_device */ +struct net_device *alloc_netdev_dummy(int sizeof_priv); + /* Support for loadable net-drivers */ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, unsigned char name_assign_type, -- cgit v1.2.3 From 0840556e5a3a331b6932ef17dd4bc94445df3297 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 29 Apr 2024 18:58:12 -0700 Subject: net: Protect dev->name by seqlock. We will convert ioctl(SIOCGARP) to RCU, and then we need to copy dev->name which is currently protected by rtnl_lock(). This patch does the following: 1) Add seqlock netdev_rename_lock to protect dev->name 2) Add netdev_copy_name() that copies dev->name to buffer under netdev_rename_lock 3) Use netdev_copy_name() in netdev_get_name() and drop devnet_rename_sem Suggested-by: Eric Dumazet Link: https://lore.kernel.org/netdev/CANn89iJEWs7AYSJqGCUABeVqOCTkErponfZdT5kV-iD=-SajnQ@mail.gmail.com/ Signed-off-by: Kuniyuki Iwashima Link: https://lore.kernel.org/r/20240430015813.71143-7-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index f849e7d110ed..41853424b41d 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3136,6 +3136,7 @@ struct net_device *netdev_get_by_name(struct net *net, const char *name, netdevice_tracker *tracker, gfp_t gfp); struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex); struct net_device *dev_get_by_napi_id(unsigned int napi_id); +void netdev_copy_name(struct net_device *dev, char *name); static inline int dev_hard_header(struct sk_buff *skb, struct net_device *dev, unsigned short type, -- cgit v1.2.3 From c1742dcb6bda5fd535fbaa2145f0a180bc329aa6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 2 May 2024 17:39:26 +0000 Subject: net: no longer acquire RTNL in threaded_show() dev->threaded can be read locklessly, if we add corresponding READ_ONCE()/WRITE_ONCE() annotations. Signed-off-by: Eric Dumazet Link: https://lore.kernel.org/r/20240502173926.2010646-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 41853424b41d..2814a15eed73 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2370,8 +2370,8 @@ struct net_device { struct sfp_bus *sfp_bus; struct lock_class_key *qdisc_tx_busylock; bool proto_down; + bool threaded; unsigned wol_enabled:1; - unsigned threaded:1; struct list_head net_notifier_list; -- cgit v1.2.3 From 087b24de5c825c53f15a9481b94f757223c20610 Mon Sep 17 00:00:00 2001 From: Mina Almasry Date: Wed, 1 May 2024 23:25:40 +0000 Subject: queue_api: define queue api This API enables the net stack to reset the queues used for devmem TCP. Signed-off-by: Mina Almasry Signed-off-by: Shailend Chand Signed-off-by: David S. Miller --- include/linux/netdevice.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 2814a15eed73..cf261fb89d73 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1957,6 +1957,7 @@ enum netdev_reg_state { * @sysfs_rx_queue_group: Space for optional per-rx queue attributes * @rtnl_link_ops: Rtnl_link_ops * @stat_ops: Optional ops for queue-aware statistics + * @queue_mgmt_ops: Optional ops for queue management * * @gso_max_size: Maximum size of generic segmentation offload * @tso_max_size: Device (as in HW) limit on the max TSO request size @@ -2340,6 +2341,8 @@ struct net_device { const struct netdev_stat_ops *stat_ops; + const struct netdev_queue_mgmt_ops *queue_mgmt_ops; + /* for setting kernel sock attribute on TCP connection setup */ #define GSO_MAX_SEGS 65535u #define GSO_LEGACY_MAX_SIZE 65536u -- cgit v1.2.3