From 0b5c21bbc01e92745ca1ca4f6fd87d878fa3ea5e Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 4 Apr 2022 11:38:47 +0200 Subject: net: ensure net_todo_list is processed quickly In [1], Will raised a potential issue that the cfg80211 code, which does (from a locking perspective) rtnl_lock() wiphy_lock() rtnl_unlock() might be susceptible to ABBA deadlocks, because rtnl_unlock() calls netdev_run_todo(), which might end up calling rtnl_lock() again, which could then deadlock (see the comment in the code added here for the scenario). Some back and forth and thinking ensued, but clearly this can't happen if the net_todo_list is empty at the rtnl_unlock() here. Clearly, the code here cannot actually put an entry on it, and all other users of rtnl_unlock() will empty it since that will always go through netdev_run_todo(), emptying the list. So the only other way to get there would be to add to the list and then unlock the RTNL without going through rtnl_unlock(), which is only possible through __rtnl_unlock(). However, this isn't exported and not used in many places, and none of them seem to be able to unregister before using it. Therefore, add a WARN_ON() in the code to ensure this invariant won't be broken, so that the cfg80211 (or any similar) code stays safe. [1] https://lore.kernel.org/r/Yjzpo3TfZxtKPMAG@google.com Signed-off-by: Johannes Berg Link: https://lore.kernel.org/r/20220404113847.0ee02e4a70da.Ic73d206e217db20fd22dcec14fe5442ca732804b@changeid Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 59e27a2b7bf0..b6a1e7f643da 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3894,7 +3894,8 @@ void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev); extern int netdev_budget; extern unsigned int netdev_budget_usecs; -/* Called by rtnetlink.c:rtnl_unlock() */ +/* Used by rtnetlink.c:__rtnl_unlock()/rtnl_unlock() */ +extern struct list_head net_todo_list; void netdev_run_todo(void); static inline void __dev_put(struct net_device *dev) { -- cgit v1.2.3 From a333215e10cb5d3b1e0685ca117f0e9452215485 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Tue, 5 Apr 2022 21:57:48 +0200 Subject: net: ethernet: mtk_eth_soc: implement flow offloading to WED devices This allows hardware flow offloading from Ethernet to WLAN on the MT7622 SoC. Co-developed-by: Lorenzo Bianconi Signed-off-by: Lorenzo Bianconi Signed-off-by: Felix Fietkau Signed-off-by: David S. Miller --- include/linux/netdevice.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index b6a1e7f643da..7b2a0b739684 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -862,6 +862,7 @@ enum net_device_path_type { DEV_PATH_BRIDGE, DEV_PATH_PPPOE, DEV_PATH_DSA, + DEV_PATH_MTK_WDMA, }; struct net_device_path { @@ -887,6 +888,12 @@ struct net_device_path { int port; u16 proto; } dsa; + struct { + u8 wdma_idx; + u8 queue; + u16 wcid; + u8 bss; + } mtk_wdma; }; }; -- cgit v1.2.3
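The new DEV_PATH_MTK_WDMA entry is meant to be filled in by a driver's .ndo_fill_forward_path() implementation. A minimal sketch of what that could look like follows; the function name and all field values are hypothetical (mt76 is the real consumer of this path type), shown only to make the union layout above concrete.

#include <linux/netdevice.h>

/* Hypothetical .ndo_fill_forward_path() step describing a hop that
 * terminates in MT7622 WDMA hardware. All values are invented.
 */
static int example_fill_forward_path(struct net_device_path_ctx *ctx,
				     struct net_device_path *path)
{
	path->type = DEV_PATH_MTK_WDMA;
	path->dev = ctx->dev;
	path->mtk_wdma.wdma_idx = 0;	/* which WDMA instance to use */
	path->mtk_wdma.queue = 0;	/* hardware queue within it */
	path->mtk_wdma.wcid = 1;	/* station (WCID) table index */
	path->mtk_wdma.bss = 0;		/* BSS index */

	return 0;
}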
From 6264f58ca0e54e41d63c2d00334a48bac28fbf30 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 6 Apr 2022 14:37:54 -0700 Subject: net: extract a few internals from netdevice.h There are a number of functions and static variables used under net/core/ but not from the outside. We currently dump most of them into netdevice.h. That's bad for many reasons: - netdevice.h is very cluttered, hard to figure out what the APIs are; - netdevice.h is very long; - we have to touch netdevice.h more which causes expensive incremental builds. Create a header under net/core/ and move some declarations. The new header is also a bit of a catch-all but that's fine, if we create more specific headers people will likely over-think where their declarations fit best. And end up putting them in netdevice.h, again. More work should be done on splitting netdevice.h into more targeted headers, but that'd be more time consuming so small steps. Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 72 ++--------------------------------------------- 1 file changed, 2 insertions(+), 70 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 7b2a0b739684..7e7b2a72e473 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -59,7 +59,8 @@ struct dsa_port; struct ip_tunnel_parm; struct macsec_context; struct macsec_ops; - +struct netdev_name_node; +struct sd_flow_limit; struct sfp_bus; /* 802.11 specific */ struct wireless_dev; @@ -1020,16 +1021,6 @@ struct dev_ifalias { struct devlink; struct tlsdev_ops; -struct netdev_name_node { - struct hlist_node hlist; - struct list_head list; - struct net_device *dev; - const char *name; -}; - -int netdev_name_node_alt_create(struct net_device *dev, const char *name); -int netdev_name_node_alt_destroy(struct net_device *dev, const char *name); - struct netdev_net_notifier { struct list_head list; struct notifier_block *nb; @@ -2975,7 +2966,6 @@ struct net_device *dev_get_by_index(struct net *net, int ifindex); struct net_device *__dev_get_by_index(struct net *net, int ifindex); struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex); struct net_device *dev_get_by_napi_id(unsigned int napi_id); -int netdev_get_name(struct net *net, char *name, int ifindex); int dev_restart(struct net_device *dev); @@ -3034,19 +3024,6 @@ static inline bool dev_has_header(const struct net_device *dev) return dev->header_ops && dev->header_ops->create; } -#ifdef CONFIG_NET_FLOW_LIMIT -#define FLOW_LIMIT_HISTORY (1 << 7) /* must be ^2 and !overflow buckets */ -struct sd_flow_limit { - u64 count; - unsigned int num_buckets; - unsigned int history_head; - u16 history[FLOW_LIMIT_HISTORY]; - u8 buckets[]; -}; - -extern int netdev_flow_limit_table_len; -#endif /* CONFIG_NET_FLOW_LIMIT */ - /* * Incoming packets are placed on per-CPU queues */ @@ -3770,7 +3747,6 @@ int dev_change_flags(struct net_device *dev, unsigned int flags, struct netlink_ext_ack *extack); void __dev_notify_flags(struct net_device *, unsigned int old_flags, unsigned int gchanges); -int dev_change_name(struct net_device *, const char *); int dev_set_alias(struct net_device *, const char *, size_t); int dev_get_alias(const struct net_device *, char *, size_t); int __dev_change_net_namespace(struct net_device *dev, struct net *net, @@ -3782,13 +3758,7 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, return __dev_change_net_namespace(dev, net, pat, 0); } int __dev_set_mtu(struct net_device *, int); -int dev_validate_mtu(struct net_device *dev, int mtu, - struct netlink_ext_ack *extack); -int dev_set_mtu_ext(struct net_device *dev, int mtu, - struct netlink_ext_ack *extack); int dev_set_mtu(struct net_device *, int); -int dev_change_tx_queue_len(struct net_device *, unsigned long); -void dev_set_group(struct net_device *, int); int 
dev_pre_changeaddr_notify(struct net_device *dev, const char *addr, struct netlink_ext_ack *extack); int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa, @@ -3796,24 +3766,13 @@ int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa, int dev_set_mac_address_user(struct net_device *dev, struct sockaddr *sa, struct netlink_ext_ack *extack); int dev_get_mac_address(struct sockaddr *sa, struct net *net, char *dev_name); -int dev_change_carrier(struct net_device *, bool new_carrier); -int dev_get_phys_port_id(struct net_device *dev, - struct netdev_phys_item_id *ppid); -int dev_get_phys_port_name(struct net_device *dev, - char *name, size_t len); int dev_get_port_parent_id(struct net_device *dev, struct netdev_phys_item_id *ppid, bool recurse); bool netdev_port_same_parent_id(struct net_device *a, struct net_device *b); -int dev_change_proto_down(struct net_device *dev, bool proto_down); -void dev_change_proto_down_reason(struct net_device *dev, unsigned long mask, - u32 value); struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev, bool *again); struct sk_buff *dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, struct netdev_queue *txq, int *ret); -typedef int (*bpf_op_t)(struct net_device *dev, struct netdev_bpf *bpf); -int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, - int fd, int expected_fd, u32 flags); int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog); u8 dev_xdp_prog_count(struct net_device *dev); u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode); @@ -3898,13 +3857,6 @@ static __always_inline int ____dev_forward_skb(struct net_device *dev, bool dev_nit_active(struct net_device *dev); void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev); -extern int netdev_budget; -extern unsigned int netdev_budget_usecs; - -/* Used by rtnetlink.c:__rtnl_unlock()/rtnl_unlock() */ -extern struct list_head net_todo_list; -void netdev_run_todo(void); - static inline void __dev_put(struct net_device *dev) { if (dev) { @@ -4021,10 +3973,7 @@ static inline void dev_replace_track(struct net_device *odev, * called netif_lowerlayer_*() because they represent the state of any * kind of lower layer not just hardware media. 
*/ - -void linkwatch_init_dev(struct net_device *dev); void linkwatch_fire_event(struct net_device *dev); -void linkwatch_forget_dev(struct net_device *dev); /** * netif_carrier_ok - test if carrier present @@ -4470,9 +4419,6 @@ int dev_addr_add(struct net_device *dev, const unsigned char *addr, unsigned char addr_type); int dev_addr_del(struct net_device *dev, const unsigned char *addr, unsigned char addr_type); -void dev_addr_flush(struct net_device *dev); -int dev_addr_init(struct net_device *dev); -void dev_addr_check(struct net_device *dev); /* Functions used for unicast addresses handling */ int dev_uc_add(struct net_device *dev, const unsigned char *addr); @@ -4562,7 +4508,6 @@ static inline void __dev_mc_unsync(struct net_device *dev, /* Functions used for secondary unicast and multicast support */ void dev_set_rx_mode(struct net_device *dev); -void __dev_set_rx_mode(struct net_device *dev); int dev_set_promiscuity(struct net_device *dev, int inc); int dev_set_allmulti(struct net_device *dev, int inc); void netdev_state_change(struct net_device *dev); @@ -4580,11 +4525,6 @@ void dev_fetch_sw_netstats(struct rtnl_link_stats64 *s, void dev_get_tstats64(struct net_device *dev, struct rtnl_link_stats64 *s); extern int netdev_max_backlog; -extern int netdev_tstamp_prequeue; -extern int netdev_unregister_timeout_secs; -extern int weight_p; -extern int dev_weight_rx_bias; -extern int dev_weight_tx_bias; extern int dev_rx_weight; extern int dev_tx_weight; extern int gro_normal_batch; @@ -4772,12 +4712,6 @@ static inline void netdev_rx_csum_fault(struct net_device *dev, void net_enable_timestamp(void); void net_disable_timestamp(void); -#ifdef CONFIG_PROC_FS -int __init dev_proc_init(void); -#else -#define dev_proc_init() 0 -#endif - static inline netdev_tx_t __netdev_start_xmit(const struct net_device_ops *ops, struct sk_buff *skb, struct net_device *dev, bool more) @@ -4813,8 +4747,6 @@ extern const struct kobj_ns_type_operations net_ns_type_operations; const char *netdev_drivername(const struct net_device *dev); -void linkwatch_run_queue(void); - static inline netdev_features_t netdev_intersect_features(netdev_features_t f1, netdev_features_t f2) { -- cgit v1.2.3 From 794c24e9921f32ded4422833a990ccf11dc3c00e Mon Sep 17 00:00:00 2001 From: Jeffrey Ji Date: Wed, 6 Apr 2022 17:26:00 +0000 Subject: net-core: rx_otherhost_dropped to core_stats Increment rx_otherhost_dropped counter when packet dropped due to mismatched dest MAC addr. An example when this drop can occur is when manually crafting raw packets that will be consumed by a user space application via a tap device. For testing purposes local traffic was generated using trafgen for the client and netcat to start a server Tested: Created 2 netns, sent 1 packet using trafgen from 1 to the other with "{eth(daddr=$INCORRECT_MAC...}", verified that iproute2 showed the counter was incremented. (Also had to modify iproute2 to show the stat, additional patch for that coming next.) 
Signed-off-by: Jeffrey Ji Reviewed-by: Brian Vazquez Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20220406172600.1141083-1-jeffreyjilinux@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 7e7b2a72e473..28ea4f8269d4 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -203,6 +203,7 @@ struct net_device_core_stats { local_t rx_dropped; local_t tx_dropped; local_t rx_nohandler; + local_t rx_otherhost_dropped; } __aligned(4 * sizeof(local_t)); #include @@ -3837,6 +3838,7 @@ static inline void dev_core_stats_##FIELD##_inc(struct net_device *dev) \ DEV_CORE_STATS_INC(rx_dropped) DEV_CORE_STATS_INC(tx_dropped) DEV_CORE_STATS_INC(rx_nohandler) +DEV_CORE_STATS_INC(rx_otherhost_dropped) static __always_inline int ____dev_forward_skb(struct net_device *dev, struct sk_buff *skb, -- cgit v1.2.3 From 1306d5362a591493a2d07f685ed2cc480dcda320 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Wed, 13 Apr 2022 13:51:56 +0300 Subject: net: add ndo_fdb_del_bulk Add a new netdev op called ndo_fdb_del_bulk; it will later be used for driver-specific bulk delete implementations dispatched from rtnetlink. The first user will be the bridge; we need it to signal to rtnetlink from the driver that we support the bulk delete operation (NLM_F_BULK). Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/linux/netdevice.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 28ea4f8269d4..a602f29365b0 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1260,6 +1260,10 @@ struct netdev_net_notifier { * struct net_device *dev, * const unsigned char *addr, u16 vid) * Deletes the FDB entry from dev coresponding to addr. + * int (*ndo_fdb_del_bulk)(struct ndmsg *ndm, struct nlattr *tb[], * struct net_device *dev, * u16 vid, * struct netlink_ext_ack *extack); * int (*ndo_fdb_dump)(struct sk_buff *skb, struct netlink_callback *cb, * struct net_device *dev, struct net_device *filter_dev, * int *idx) @@ -1510,6 +1514,11 @@ struct net_device_ops { struct net_device *dev, const unsigned char *addr, u16 vid); + int (*ndo_fdb_del_bulk)(struct ndmsg *ndm, + struct nlattr *tb[], + struct net_device *dev, + u16 vid, + struct netlink_ext_ack *extack); int (*ndo_fdb_dump)(struct sk_buff *skb, struct netlink_callback *cb, struct net_device *dev, -- cgit v1.2.3
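To make the new op concrete, here is a minimal, hedged sketch of a driver wiring it up. The handler name and body are invented (the bridge is the first real implementation); only the signature comes from the patch above. A real handler would flush every matching FDB entry in one pass instead of being called once per entry.

#include <linux/netdevice.h>
#include <linux/neighbour.h>

/* Hypothetical bulk-delete handler, invoked once for an NLM_F_BULK
 * request rather than once per FDB entry.
 */
static int example_fdb_del_bulk(struct ndmsg *ndm, struct nlattr *tb[],
				struct net_device *dev, u16 vid,
				struct netlink_ext_ack *extack)
{
	/* Delete all FDB entries on 'dev' matching vid/flags in one go. */
	return 0;
}

static const struct net_device_ops example_netdev_ops = {
	.ndo_fdb_del_bulk	= example_fdb_del_bulk,
};

Merely setting the op is what signals to rtnetlink that the driver supports NLM_F_BULK deletes; rtnetlink dispatches to it instead of looping over ndo_fdb_del.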
From 2f1e85b1aee459b7d0fd981839042c6a38ffaf0c Mon Sep 17 00:00:00 2001 From: Tonghao Zhang Date: Sat, 16 Apr 2022 00:40:45 +0800 Subject: net: sched: use queue_mapping to pick tx queue This patch fixes an issue: * If we install tc filters with act_skbedit in the clsact hook, it doesn't work, because netdev_core_pick_tx() overwrites queue_mapping. $ tc filter ... action skbedit queue_mapping 1 And this patch is useful: * We can use FQ + EDT to implement efficient policies. Tx queues are picked by xps, the ndo_select_queue of the netdev driver, or skb hash in netdev_core_pick_tx(). In fact, the netdev driver and skb hash are _not_ under our control. xps uses the CPUs map to select Tx queues, but in most cases we can't figure out which pod/container task_struct is running on a given cpu. We can use clsact filters to classify one pod/container's traffic to one Tx queue. Why? In container networking environments, there are two kinds of pod/container/net-namespace. For one kind (e.g. P1, P2), high throughput is key in these applications. But to avoid running out of network resources, the outbound traffic of these pods is limited, using or sharing dedicated Tx queues assigned an HTB/TBF/FQ Qdisc. For the other kind of pods (e.g. Pn), low latency of data access is key, and the traffic is not limited. Pods use or share other dedicated Tx queues assigned a FIFO Qdisc. This choice provides two benefits. First, contention on the HTB/FQ Qdisc lock is significantly reduced since fewer CPUs contend for the same queue. More importantly, Qdisc contention can be eliminated completely if each CPU has its own FIFO Qdisc for the second kind of pods. There must be a mechanism in place to classify traffic to different Tx queues based on pod/container. Note that clsact is outside of Qdisc while Qdisc can run a classifier to select a sub-queue under the lock. In general, recording the decision in the skb seems a little heavy-handed. This patch introduces a per-CPU variable, suggested by Eric. The xmit.skip_txqueue flag is first cleared in __dev_queue_xmit(). - A Tx Qdisc may install such skbedit actions; the xmit.skip_txqueue flag is then set in qdisc->enqueue() even though the tx queue has already been selected in netdev_tx_queue_mapping() or netdev_core_pick_tx(). Clearing that flag first in __dev_queue_xmit() is useful: - Avoid picking the Tx queue with netdev_tx_queue_mapping() in the next netdev in a chain such as: eth0 macvlan - eth0.3 vlan - eth0 ixgbe-phy. For example, eth0, a macvlan in the pod whose root Qdisc installs skbedit queue_mapping, sends packets to eth0.3, a vlan in the host. __dev_queue_xmit() of eth0.3 clears the flag and does not select the tx queue according to skb->queue_mapping, because there are no filters in the clsact or tx Qdisc of this netdev. The same action is taken in eth0, the ixgbe device in the host. - Avoid picking the Tx queue for the next packet. If we set xmit.skip_txqueue in the tx Qdisc (qdisc->enqueue()), the proper way to clear it is in __dev_queue_xmit() when processing the next packets. For performance reasons, use a static key. If the user does not enable CONFIG_NET_EGRESS, this code is not compiled in. +----+ +----+ +----+ | P1 | | P2 | | Pn | +----+ +----+ +----+ | | | +-----------+-----------+ | | clsact/skbedit | MQ v +-----------+-----------+ | q0 | q1 | qn v v v HTB/FQ HTB/FQ ... FIFO Cc: Jamal Hadi Salim Cc: Cong Wang Cc: Jiri Pirko Cc: "David S. Miller" Cc: Jakub Kicinski Cc: Jonathan Lemon Cc: Eric Dumazet Cc: Alexander Lobakin Cc: Paolo Abeni Cc: Talal Ahmad Cc: Kevin Hao Cc: Ilias Apalodimas Cc: Kees Cook Cc: Kumar Kartikeya Dwivedi Cc: Antoine Tenart Cc: Wei Wang Cc: Arnd Bergmann Suggested-by: Eric Dumazet Signed-off-by: Tonghao Zhang Acked-by: Jamal Hadi Salim Signed-off-by: Paolo Abeni --- include/linux/netdevice.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index a602f29365b0..7dccbfd1bf56 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3061,6 +3061,9 @@ struct softnet_data { struct { u16 recursion; u8 more; +#ifdef CONFIG_NET_EGRESS + u8 skip_txqueue; +#endif } xmit; #ifdef CONFIG_RPS /* input_queue_head should be written by cpu owning this struct, -- cgit v1.2.3
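The netdevice.h hunk above only adds the per-cpu field; the helpers that set and test it live in net/core and are not shown here. As a rough sketch, assuming the field is manipulated through the usual per-cpu accessors (the helper names below are illustrative, not the patch's):

#include <linux/netdevice.h>
#include <linux/percpu.h>

#ifdef CONFIG_NET_EGRESS
/* Set by the skbedit action at qdisc->enqueue() time... */
static inline void example_xmit_skip_txqueue(bool skip)
{
	__this_cpu_write(softnet_data.xmit.skip_txqueue, skip);
}

/* ...and consumed (then cleared) by __dev_queue_xmit(), which honors
 * skb->queue_mapping instead of calling netdev_core_pick_tx().
 */
static inline bool example_xmit_txqueue_skipped(void)
{
	return __this_cpu_read(softnet_data.xmit.skip_txqueue);
}
#endif /* CONFIG_NET_EGRESS */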
Miller" Cc: Jakub Kicinski Cc: Jonathan Lemon Cc: Eric Dumazet Cc: Alexander Lobakin Cc: Paolo Abeni Cc: Talal Ahmad Cc: Kevin Hao Cc: Ilias Apalodimas Cc: Kees Cook Cc: Kumar Kartikeya Dwivedi Cc: Antoine Tenart Cc: Wei Wang Cc: Arnd Bergmann Suggested-by: Eric Dumazet Signed-off-by: Tonghao Zhang Acked-by: Jamal Hadi Salim Signed-off-by: Paolo Abeni --- include/linux/netdevice.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index a602f29365b0..7dccbfd1bf56 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3061,6 +3061,9 @@ struct softnet_data { struct { u16 recursion; u8 more; +#ifdef CONFIG_NET_EGRESS + u8 skip_txqueue; +#endif } xmit; #ifdef CONFIG_RPS /* input_queue_head should be written by cpu owning this struct, -- cgit v1.2.3 From 68822bdf76f10c3dc80609d4e2cdc1e847429086 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 22 Apr 2022 13:12:37 -0700 Subject: net: generalize skb freeing deferral to per-cpu lists Logic added in commit f35f821935d8 ("tcp: defer skb freeing after socket lock is released") helped bulk TCP flows to move the cost of skbs frees outside of critical section where socket lock was held. But for RPC traffic, or hosts with RFS enabled, the solution is far from being ideal. For RPC traffic, recvmsg() has to return to user space right after skb payload has been consumed, meaning that BH handler has no chance to pick the skb before recvmsg() thread. This issue is more visible with BIG TCP, as more RPC fit one skb. For RFS, even if BH handler picks the skbs, they are still picked from the cpu on which user thread is running. Ideally, it is better to free the skbs (and associated page frags) on the cpu that originally allocated them. This patch removes the per socket anchor (sk->defer_list) and instead uses a per-cpu list, which will hold more skbs per round. This new per-cpu list is drained at the end of net_action_rx(), after incoming packets have been processed, to lower latencies. In normal conditions, skbs are added to the per-cpu list with no further action. In the (unlikely) cases where the cpu does not run net_action_rx() handler fast enough, we use an IPI to raise NET_RX_SOFTIRQ on the remote cpu. Also, we do not bother draining the per-cpu list from dev_cpu_dead() This is because skbs in this list have no requirement on how fast they should be freed. Note that we can add in the future a small per-cpu cache if we see any contention on sd->defer_lock. Tested on a pair of hosts with 100Gbit NIC, RFS enabled, and /proc/sys/net/ipv4/tcp_rmem[2] tuned to 16MB to work around page recycling strategy used by NIC driver (its page pool capacity being too small compared to number of skbs/pages held in sockets receive queues) Note that this tuning was only done to demonstrate worse conditions for skb freeing for this particular test. These conditions can happen in more general production workload. 10 runs of one TCP_STREAM flow Before: Average throughput: 49685 Mbit. 
Kernel profiles on cpu running user thread recvmsg() show high cost for skb freeing related functions (*) 57.81% [kernel] [k] copy_user_enhanced_fast_string (*) 12.87% [kernel] [k] skb_release_data (*) 4.25% [kernel] [k] __free_one_page (*) 3.57% [kernel] [k] __list_del_entry_valid 1.85% [kernel] [k] __netif_receive_skb_core 1.60% [kernel] [k] __skb_datagram_iter (*) 1.59% [kernel] [k] free_unref_page_commit (*) 1.16% [kernel] [k] __slab_free 1.16% [kernel] [k] _copy_to_iter (*) 1.01% [kernel] [k] kfree (*) 0.88% [kernel] [k] free_unref_page 0.57% [kernel] [k] ip6_rcv_core 0.55% [kernel] [k] ip6t_do_table 0.54% [kernel] [k] flush_smp_call_function_queue (*) 0.54% [kernel] [k] free_pcppages_bulk 0.51% [kernel] [k] llist_reverse_order 0.38% [kernel] [k] process_backlog (*) 0.38% [kernel] [k] free_pcp_prepare 0.37% [kernel] [k] tcp_recvmsg_locked (*) 0.37% [kernel] [k] __list_add_valid 0.34% [kernel] [k] sock_rfree 0.34% [kernel] [k] _raw_spin_lock_irq (*) 0.33% [kernel] [k] __page_cache_release 0.33% [kernel] [k] tcp_v6_rcv (*) 0.33% [kernel] [k] __put_page (*) 0.29% [kernel] [k] __mod_zone_page_state 0.27% [kernel] [k] _raw_spin_lock After patch: Average throughput: 73076 Mbit. Kernel profiles on cpu running user thread recvmsg() looks better: 81.35% [kernel] [k] copy_user_enhanced_fast_string 1.95% [kernel] [k] _copy_to_iter 1.95% [kernel] [k] __skb_datagram_iter 1.27% [kernel] [k] __netif_receive_skb_core 1.03% [kernel] [k] ip6t_do_table 0.60% [kernel] [k] sock_rfree 0.50% [kernel] [k] tcp_v6_rcv 0.47% [kernel] [k] ip6_rcv_core 0.45% [kernel] [k] read_tsc 0.44% [kernel] [k] _raw_spin_lock_irqsave 0.37% [kernel] [k] _raw_spin_lock 0.37% [kernel] [k] native_irq_return_iret 0.33% [kernel] [k] __inet6_lookup_established 0.31% [kernel] [k] ip6_protocol_deliver_rcu 0.29% [kernel] [k] tcp_rcv_established 0.29% [kernel] [k] llist_reverse_order v2: kdoc issue (kernel bots) do not defer if (alloc_cpu == smp_processor_id()) (Paolo) replace the sk_buff_head with a single-linked list (Jakub) add a READ_ONCE()/WRITE_ONCE() for the lockless read of sd->defer_list Signed-off-by: Eric Dumazet Acked-by: Paolo Abeni Link: https://lore.kernel.org/r/20220422201237.416238-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 7dccbfd1bf56..ac8a5f71220a 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3081,6 +3081,11 @@ struct softnet_data { struct sk_buff_head input_pkt_queue; struct napi_struct backlog; + /* Another possibly contended cache line */ + spinlock_t defer_lock ____cacheline_aligned_in_smp; + int defer_count; + struct sk_buff *defer_list; + call_single_data_t defer_csd; }; static inline void input_queue_head_incr(struct softnet_data *sd) -- cgit v1.2.3 From c526fd8f9f4f21cb83c0b1c9a1ee9c0ac9be9e2e Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 28 Apr 2022 11:58:46 +0100 Subject: net: inline dev_queue_xmit() Inline dev_queue_xmit() and dev_queue_xmit_accel(), they both are small proxy functions doing nothing but redirecting the control flow to __dev_queue_xmit(). Signed-off-by: Pavel Begunkov Signed-off-by: David S. 
Miller --- include/linux/netdevice.h | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index b75ca2d095ae..4aba92a4042a 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2940,10 +2940,20 @@ u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb, u16 dev_pick_tx_cpu_id(struct net_device *dev, struct sk_buff *skb, struct net_device *sb_dev); -int dev_queue_xmit(struct sk_buff *skb); -int dev_queue_xmit_accel(struct sk_buff *skb, struct net_device *sb_dev); +int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev); int __dev_direct_xmit(struct sk_buff *skb, u16 queue_id); +static inline int dev_queue_xmit(struct sk_buff *skb) +{ + return __dev_queue_xmit(skb, NULL); +} + +static inline int dev_queue_xmit_accel(struct sk_buff *skb, + struct net_device *sb_dev) +{ + return __dev_queue_xmit(skb, sb_dev); +} + static inline int dev_direct_xmit(struct sk_buff *skb, u16 queue_id) { int ret; -- cgit v1.2.3 From 58caed3dacb4354a25a1aa8d2febc3e9648ba1f4 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 2 May 2022 16:27:03 -0700 Subject: netdev: reshuffle netif_napi_add() APIs to allow dropping weight Most drivers should not have to worry about selecting the right weight for their NAPI instances and pass NAPI_POLL_WEIGHT. It'd be best if we didn't require the argument at all and selected the default internally. This change prepares the ground for such reshuffling, allowing for a smooth transition. The following API should remain after the next release cycle: netif_napi_add() netif_napi_add_weight() netif_napi_add_tx() netif_napi_add_tx_weight() Where the _weight() variants take an explicit weight argument. I opted for a _weight() suffix rather than a __ prefix, because we use __ in places to mean that caller needs to also issue a synchronize_net() call. Signed-off-by: Jakub Kicinski Link: https://lore.kernel.org/r/20220502232703.396351-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 50 +++++++++++++++++++++++++++++++---------------- 1 file changed, 33 insertions(+), 17 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 4aba92a4042a..eaf66e57d891 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2499,37 +2499,53 @@ static inline void *netdev_priv(const struct net_device *dev) */ #define NAPI_POLL_WEIGHT 64 +void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi, + int (*poll)(struct napi_struct *, int), int weight); + /** - * netif_napi_add - initialize a NAPI context - * @dev: network device - * @napi: NAPI context - * @poll: polling function - * @weight: default weight + * netif_napi_add() - initialize a NAPI context + * @dev: network device + * @napi: NAPI context + * @poll: polling function + * @weight: default weight * * netif_napi_add() must be used to initialize a NAPI context prior to calling * *any* of the other NAPI-related functions. 
*/ -void netif_napi_add(struct net_device *dev, struct napi_struct *napi, - int (*poll)(struct napi_struct *, int), int weight); +static inline void +netif_napi_add(struct net_device *dev, struct napi_struct *napi, + int (*poll)(struct napi_struct *, int), int weight) +{ + netif_napi_add_weight(dev, napi, poll, weight); +} + +static inline void +netif_napi_add_tx_weight(struct net_device *dev, + struct napi_struct *napi, + int (*poll)(struct napi_struct *, int), + int weight) +{ + set_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state); + netif_napi_add_weight(dev, napi, poll, weight); +} + +#define netif_tx_napi_add netif_napi_add_tx_weight /** - * netif_tx_napi_add - initialize a NAPI context - * @dev: network device - * @napi: NAPI context - * @poll: polling function - * @weight: default weight + * netif_napi_add_tx() - initialize a NAPI context to be used for Tx only + * @dev: network device + * @napi: NAPI context + * @poll: polling function * * This variant of netif_napi_add() should be used from drivers using NAPI * to exclusively poll a TX queue. * This will avoid we add it into napi_hash[], thus polluting this hash table. */ -static inline void netif_tx_napi_add(struct net_device *dev, +static inline void netif_napi_add_tx(struct net_device *dev, struct napi_struct *napi, - int (*poll)(struct napi_struct *, int), - int weight) + int (*poll)(struct napi_struct *, int)) { - set_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state); - netif_napi_add(dev, napi, poll, weight); + netif_napi_add_tx_weight(dev, napi, poll, NAPI_POLL_WEIGHT); } /** -- cgit v1.2.3 From 6df6398f7c8b481ce83f28143bc08a5231616deb Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 5 May 2022 19:51:31 -0700 Subject: net: add netif_inherit_tso_max() To make later patches smaller create a helper for inheriting the TSO limitations of a lower device. The TSO in the name is not an accident, subsequent patches will replace GSO with TSO in more names. Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/linux/netdevice.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index eaf66e57d891..006bb5c0e413 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4895,6 +4895,9 @@ static inline void netif_set_gro_max_size(struct net_device *dev, WRITE_ONCE(dev->gro_max_size, size); } +void netif_inherit_tso_max(struct net_device *to, + const struct net_device *from); + static inline void skb_gso_error_unwind(struct sk_buff *skb, __be16 protocol, int pulled_hlen, u16 mac_offset, int mac_len) -- cgit v1.2.3 From 14d7b8122fd591693a2388b98563707ba72c6780 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 5 May 2022 19:51:32 -0700 Subject: net: don't allow user space to lift the device limits Up until commit 46e6b992c250 ("rtnetlink: allow GSO maximums to be set on device creation") the gso_max_segs and gso_max_size of a device were not controlled from user space. The quoted commit added the ability to control them because of the following setup: netns A | netns B veth<->veth eth0 If eth0 has TSO limitations and user wants to efficiently forward traffic between eth0 and the veths they should copy the TSO limitations of eth0 onto the veths. This would happen automatically for macvlans or ipvlan but veth users are not so lucky (given the loose coupling). Unfortunately the commit in question allowed users to also override the limits on real HW devices. 
It may be useful to control the max GSO size and someone may be using that ability (not that I know of any user), so create a separate set of knobs to reliably record the TSO limitations. Validate the user requests. Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/linux/netdevice.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 006bb5c0e413..e12f7de6d6ae 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1917,8 +1917,10 @@ enum netdev_ml_priv_type { * @rtnl_link_ops: Rtnl_link_ops * * @gso_max_size: Maximum size of generic segmentation offload + * @tso_max_size: Device (as in HW) limit on the max TSO request size * @gso_max_segs: Maximum number of segments that can be passed to the * NIC for GSO + * @tso_max_segs: Device (as in HW) limit on the max TSO segment count * * @dcbnl_ops: Data Center Bridging netlink ops * @num_tc: Number of traffic classes in the net device @@ -2262,8 +2264,13 @@ struct net_device { /* for setting kernel sock attribute on TCP connection setup */ #define GSO_MAX_SIZE 65536 unsigned int gso_max_size; +#define TSO_LEGACY_MAX_SIZE 65536 +#define TSO_MAX_SIZE UINT_MAX + unsigned int tso_max_size; #define GSO_MAX_SEGS 65535 u16 gso_max_segs; +#define TSO_MAX_SEGS U16_MAX + u16 tso_max_segs; #ifdef CONFIG_DCB const struct dcbnl_rtnl_ops *dcbnl_ops; @@ -4895,6 +4902,8 @@ static inline void netif_set_gro_max_size(struct net_device *dev, WRITE_ONCE(dev->gro_max_size, size); } +void netif_set_tso_max_size(struct net_device *dev, unsigned int size); +void netif_set_tso_max_segs(struct net_device *dev, unsigned int segs); void netif_inherit_tso_max(struct net_device *to, const struct net_device *from); -- cgit v1.2.3 From 744d49daf8bd3b17b345c836f2e6f97d49fa6ae8 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 5 May 2022 19:51:34 -0700 Subject: net: move netif_set_gso_max helpers These are now internal to the core, no need to expose them. Signed-off-by: Jakub Kicinski Signed-off-by: David S. 
Miller --- include/linux/netdevice.h | 21 --------------------- 1 file changed, 21 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index e12f7de6d6ae..8cf0ac616cb9 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4881,27 +4881,6 @@ static inline bool netif_needs_gso(struct sk_buff *skb, (skb->ip_summed != CHECKSUM_UNNECESSARY))); } -static inline void netif_set_gso_max_size(struct net_device *dev, - unsigned int size) -{ - /* dev->gso_max_size is read locklessly from sk_setup_caps() */ - WRITE_ONCE(dev->gso_max_size, size); -} - -static inline void netif_set_gso_max_segs(struct net_device *dev, - unsigned int segs) -{ - /* dev->gso_max_segs is read locklessly from sk_setup_caps() */ - WRITE_ONCE(dev->gso_max_segs, segs); -} - -static inline void netif_set_gro_max_size(struct net_device *dev, - unsigned int size) -{ - /* This pairs with the READ_ONCE() in skb_gro_receive() */ - WRITE_ONCE(dev->gro_max_size, size); -} - void netif_set_tso_max_size(struct net_device *dev, unsigned int size); void netif_set_tso_max_segs(struct net_device *dev, unsigned int segs); void netif_inherit_tso_max(struct net_device *to, -- cgit v1.2.3 From ca4567f1e6f660f86fcd04f3563c0045b0d4772f Mon Sep 17 00:00:00 2001 From: Alaa Mohamed Date: Thu, 5 May 2022 17:09:57 +0200 Subject: rtnetlink: add extack support in fdb del handlers Add extack support to .ndo_fdb_del in netdevice.h and all related methods. Signed-off-by: Alaa Mohamed Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 8cf0ac616cb9..74c97a34921d 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1513,7 +1513,7 @@ struct net_device_ops { struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, - u16 vid); + u16 vid, struct netlink_ext_ack *extack); int (*ndo_fdb_del_bulk)(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, -- cgit v1.2.3 From 97dc7cd92ac67f6e05df418df1772ba4a7fbf693 Mon Sep 17 00:00:00 2001 From: Gerhard Engleder Date: Fri, 6 May 2022 22:01:40 +0200 Subject: ptp: Support late timestamp determination If a physical clock supports a free running cycle counter, then timestamps shall be based on this time too. For TX it is known in advance before the transmission if a timestamp based on the free running cycle counter is needed. For RX it is impossible to know which timestamp is needed before the packet is received and assigned to a socket. Support late timestamp determination by a network device. Therefore, an address/cookie is stored within the new netdev_data field of struct skb_shared_hwtstamps. This address/cookie is provided to a new network device function called ndo_get_tstamp(), which returns a timestamp based on the normal/adjustable time or based on the free running cycle counter. If function is not supported, then timestamp handling is not changed. This mechanism is intended for RX, but TX use is also possible. 
Signed-off-by: Gerhard Engleder Acked-by: Jonathan Lemon Acked-by: Richard Cochran Signed-off-by: Paolo Abeni --- include/linux/netdevice.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 74c97a34921d..51fe143091cf 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1356,6 +1356,12 @@ struct netdev_net_notifier { * The caller must be under RCU read context. * int (*ndo_fill_forward_path)(struct net_device_path_ctx *ctx, struct net_device_path *path); * Get the forwarding path to reach the real device from the HW destination address + * ktime_t (*ndo_get_tstamp)(struct net_device *dev, + * const struct skb_shared_hwtstamps *hwtstamps, + * bool cycles); + * Get hardware timestamp based on normal/adjustable time or free running + * cycle counter. This function is required if physical clock supports a + * free running cycle counter. */ struct net_device_ops { int (*ndo_init)(struct net_device *dev); @@ -1578,6 +1584,9 @@ struct net_device_ops { struct net_device * (*ndo_get_peer_dev)(struct net_device *dev); int (*ndo_fill_forward_path)(struct net_device_path_ctx *ctx, struct net_device_path *path); + ktime_t (*ndo_get_tstamp)(struct net_device *dev, + const struct skb_shared_hwtstamps *hwtstamps, + bool cycles); }; /** @@ -4761,6 +4770,18 @@ static inline void netdev_rx_csum_fault(struct net_device *dev, void net_enable_timestamp(void); void net_disable_timestamp(void); +static inline ktime_t netdev_get_tstamp(struct net_device *dev, + const struct skb_shared_hwtstamps *hwtstamps, + bool cycles) +{ + const struct net_device_ops *ops = dev->netdev_ops; + + if (ops->ndo_get_tstamp) + return ops->ndo_get_tstamp(dev, hwtstamps, cycles); + + return hwtstamps->hwtstamp; +} + static inline netdev_tx_t __netdev_start_xmit(const struct net_device_ops *ops, struct sk_buff *skb, struct net_device *dev, bool more) -- cgit v1.2.3 From 5b87be9e4978fe507c7e14c0bdff147d10d39aec Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 9 May 2022 20:57:37 -0700 Subject: net: add include/net/net_debug.h Remove from include/linux/netdevice.h helpers that send debug/info/warnings to syslog. We plan adding more helpers in following patches. v2: added two includes, and 'struct net_device' forward declaration to avoid compile errors (kernel bots) Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- include/linux/netdevice.h | 141 +--------------------------------------------- 1 file changed, 1 insertion(+), 140 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 51fe143091cf..536321691c72 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -50,6 +50,7 @@ #include #include #include +#include struct netpoll_info; struct device; @@ -5071,81 +5072,9 @@ static inline const char *netdev_reg_state(const struct net_device *dev) return " (unknown)"; } -__printf(3, 4) __cold -void netdev_printk(const char *level, const struct net_device *dev, - const char *format, ...); -__printf(2, 3) __cold -void netdev_emerg(const struct net_device *dev, const char *format, ...); -__printf(2, 3) __cold -void netdev_alert(const struct net_device *dev, const char *format, ...); -__printf(2, 3) __cold -void netdev_crit(const struct net_device *dev, const char *format, ...); -__printf(2, 3) __cold -void netdev_err(const struct net_device *dev, const char *format, ...); -__printf(2, 3) __cold -void netdev_warn(const struct net_device *dev, const char *format, ...); -__printf(2, 3) __cold -void netdev_notice(const struct net_device *dev, const char *format, ...); -__printf(2, 3) __cold -void netdev_info(const struct net_device *dev, const char *format, ...); - -#define netdev_level_once(level, dev, fmt, ...) \ -do { \ - static bool __section(".data.once") __print_once; \ - \ - if (!__print_once) { \ - __print_once = true; \ - netdev_printk(level, dev, fmt, ##__VA_ARGS__); \ - } \ -} while (0) - -#define netdev_emerg_once(dev, fmt, ...) \ - netdev_level_once(KERN_EMERG, dev, fmt, ##__VA_ARGS__) -#define netdev_alert_once(dev, fmt, ...) \ - netdev_level_once(KERN_ALERT, dev, fmt, ##__VA_ARGS__) -#define netdev_crit_once(dev, fmt, ...) \ - netdev_level_once(KERN_CRIT, dev, fmt, ##__VA_ARGS__) -#define netdev_err_once(dev, fmt, ...) \ - netdev_level_once(KERN_ERR, dev, fmt, ##__VA_ARGS__) -#define netdev_warn_once(dev, fmt, ...) \ - netdev_level_once(KERN_WARNING, dev, fmt, ##__VA_ARGS__) -#define netdev_notice_once(dev, fmt, ...) \ - netdev_level_once(KERN_NOTICE, dev, fmt, ##__VA_ARGS__) -#define netdev_info_once(dev, fmt, ...) \ - netdev_level_once(KERN_INFO, dev, fmt, ##__VA_ARGS__) - #define MODULE_ALIAS_NETDEV(device) \ MODULE_ALIAS("netdev-" device) -#if defined(CONFIG_DYNAMIC_DEBUG) || \ - (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) -#define netdev_dbg(__dev, format, args...) \ -do { \ - dynamic_netdev_dbg(__dev, format, ##args); \ -} while (0) -#elif defined(DEBUG) -#define netdev_dbg(__dev, format, args...) \ - netdev_printk(KERN_DEBUG, __dev, format, ##args) -#else -#define netdev_dbg(__dev, format, args...) \ -({ \ - if (0) \ - netdev_printk(KERN_DEBUG, __dev, format, ##args); \ -}) -#endif - -#if defined(VERBOSE_DEBUG) -#define netdev_vdbg netdev_dbg -#else - -#define netdev_vdbg(dev, format, args...) \ -({ \ - if (0) \ - netdev_printk(KERN_DEBUG, dev, format, ##args); \ - 0; \ -}) -#endif - /* * netdev_WARN() acts like dev_printk(), but with the key difference * of using a WARN/WARN_ON to get the message out, including the @@ -5159,74 +5088,6 @@ do { \ WARN_ONCE(1, "netdevice: %s%s: " format, netdev_name(dev), \ netdev_reg_state(dev), ##args) -/* netif printk helpers, similar to netdev_printk */ - -#define netif_printk(priv, type, level, dev, fmt, args...) 
\ -do { \ - if (netif_msg_##type(priv)) \ - netdev_printk(level, (dev), fmt, ##args); \ -} while (0) - -#define netif_level(level, priv, type, dev, fmt, args...) \ -do { \ - if (netif_msg_##type(priv)) \ - netdev_##level(dev, fmt, ##args); \ -} while (0) - -#define netif_emerg(priv, type, dev, fmt, args...) \ - netif_level(emerg, priv, type, dev, fmt, ##args) -#define netif_alert(priv, type, dev, fmt, args...) \ - netif_level(alert, priv, type, dev, fmt, ##args) -#define netif_crit(priv, type, dev, fmt, args...) \ - netif_level(crit, priv, type, dev, fmt, ##args) -#define netif_err(priv, type, dev, fmt, args...) \ - netif_level(err, priv, type, dev, fmt, ##args) -#define netif_warn(priv, type, dev, fmt, args...) \ - netif_level(warn, priv, type, dev, fmt, ##args) -#define netif_notice(priv, type, dev, fmt, args...) \ - netif_level(notice, priv, type, dev, fmt, ##args) -#define netif_info(priv, type, dev, fmt, args...) \ - netif_level(info, priv, type, dev, fmt, ##args) - -#if defined(CONFIG_DYNAMIC_DEBUG) || \ - (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) -#define netif_dbg(priv, type, netdev, format, args...) \ -do { \ - if (netif_msg_##type(priv)) \ - dynamic_netdev_dbg(netdev, format, ##args); \ -} while (0) -#elif defined(DEBUG) -#define netif_dbg(priv, type, dev, format, args...) \ - netif_printk(priv, type, KERN_DEBUG, dev, format, ##args) -#else -#define netif_dbg(priv, type, dev, format, args...) \ -({ \ - if (0) \ - netif_printk(priv, type, KERN_DEBUG, dev, format, ##args); \ - 0; \ -}) -#endif - -/* if @cond then downgrade to debug, else print at @level */ -#define netif_cond_dbg(priv, type, netdev, cond, level, fmt, args...) \ - do { \ - if (cond) \ - netif_dbg(priv, type, netdev, fmt, ##args); \ - else \ - netif_ ## level(priv, type, netdev, fmt, ##args); \ - } while (0) - -#if defined(VERBOSE_DEBUG) -#define netif_vdbg netif_dbg -#else -#define netif_vdbg(priv, type, dev, format, args...) \ -({ \ - if (0) \ - netif_printk(priv, type, KERN_DEBUG, dev, format, ##args); \ - 0; \ -}) -#endif - /* * The list of packet types we will receive (as opposed to discard) * and the routines to invoke. -- cgit v1.2.3 From 7c4e983c4f3cf94fcd879730c6caa877e0768a4d Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Fri, 13 May 2022 11:33:57 -0700 Subject: net: allow gso_max_size to exceed 65536 The code for gso_max_size was added originally to allow for debugging and workaround of buggy devices that couldn't support TSO with blocks 64K in size. The original reason for limiting it to 64K was because that was the existing limits of IPv4 and non-jumbogram IPv6 length fields. With the addition of Big TCP we can remove this limit and allow the value to potentially go up to UINT_MAX and instead be limited by the tso_max_size value. So in order to support this we need to go through and clean up the remaining users of the gso_max_size value so that the values will cap at 64K for non-TCPv6 flows. In addition we can clean up the GSO_MAX_SIZE value so that 64K becomes GSO_LEGACY_MAX_SIZE and UINT_MAX will now be the upper limit for GSO_MAX_SIZE. v6: (edumazet) fixed a compile error if CONFIG_IPV6=n, in a new sk_trim_gso_size() helper. netif_set_tso_max_size() caps the requested TSO size with GSO_MAX_SIZE. Signed-off-by: Alexander Duyck Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- include/linux/netdevice.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 536321691c72..ce780e352f43 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2272,7 +2272,9 @@ struct net_device { const struct rtnl_link_ops *rtnl_link_ops; /* for setting kernel sock attribute on TCP connection setup */ -#define GSO_MAX_SIZE 65536 +#define GSO_LEGACY_MAX_SIZE 65536u +#define GSO_MAX_SIZE UINT_MAX + unsigned int gso_max_size; #define TSO_LEGACY_MAX_SIZE 65536 #define TSO_MAX_SIZE UINT_MAX unsigned int tso_max_size; -- cgit v1.2.3 From 34b92e8d19da1e44070dc0ecc2747871469ac534 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 13 May 2022 11:33:58 -0700 Subject: net: limit GSO_MAX_SIZE to 524280 bytes Make sure we will not overflow shinfo->gso_segs. Minimal TCP MSS size is 8 bytes, and shinfo->gso_segs is a 16bit field. TCP_MIN_GSO_SIZE is currently defined in include/net/tcp.h; it seems cleaner not to bring tcp details into include/linux/netdevice.h. Signed-off-by: Eric Dumazet Acked-by: Alexander Duyck Signed-off-by: David S. Miller --- include/linux/netdevice.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index ce780e352f43..fd38847d0dc7 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2272,14 +2272,17 @@ struct net_device { const struct rtnl_link_ops *rtnl_link_ops; /* for setting kernel sock attribute on TCP connection setup */ +#define GSO_MAX_SEGS 65535u #define GSO_LEGACY_MAX_SIZE 65536u -#define GSO_MAX_SIZE UINT_MAX +/* TCP minimal MSS is 8 (TCP_MIN_GSO_SIZE), + * and shinfo->gso_segs is a 16bit field. + */ +#define GSO_MAX_SIZE (8 * GSO_MAX_SEGS) unsigned int gso_max_size; #define TSO_LEGACY_MAX_SIZE 65536 #define TSO_MAX_SIZE UINT_MAX unsigned int tso_max_size; -#define GSO_MAX_SEGS 65535 u16 gso_max_segs; #define TSO_MAX_SEGS U16_MAX u16 tso_max_segs; -- cgit v1.2.3 From 0fe79f28bfaf73b66b7b1562d2468f94aa03bd12 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Fri, 13 May 2022 11:34:03 -0700 Subject: net: allow gro_max_size to exceed 65536 Allow gro_max_size to exceed 65536. There weren't really any external limitations that prevented this other than the fact that IPv4 only supports a 16 bit length field. Since we have the option of adding a hop-by-hop header for IPv6 we can allow IPv6 to exceed this value, and for IPv4 and non-TCP flows we can cap things at 65536 via a constant rather than relying on gro_max_size. [edumazet] limit GRO_MAX_SIZE to (8 * 65535) to avoid overflows. Signed-off-by: Alexander Duyck Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index fd38847d0dc7..d57ce248004c 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2161,7 +2161,11 @@ struct net_device { struct bpf_prog __rcu *xdp_prog; unsigned long gro_flush_timeout; int napi_defer_hard_irqs; -#define GRO_MAX_SIZE 65536 +#define GRO_LEGACY_MAX_SIZE 65536u +/* TCP minimal MSS is 8 (TCP_MIN_GSO_SIZE), + * and shinfo->gso_segs is a 16bit field. + */ +#define GRO_MAX_SIZE (8 * 65535u) unsigned int gro_max_size; rx_handler_func_t __rcu *rx_handler; void __rcu *rx_handler_data; -- cgit v1.2.3
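Taken together with the earlier netif_set_tso_max_size()/netif_set_tso_max_segs() helpers, a driver now describes its hardware limits separately from the user-visible GSO/GRO knobs. A hedged sketch, with invented values, of what that might look like in a driver's probe path:

#include <linux/netdevice.h>

/* Hypothetical probe-time code: the device can segment up to 256 KB
 * into at most 64 descriptors, regardless of what GSO sizes user
 * space later configures. netif_set_tso_max_size() itself caps the
 * request at GSO_MAX_SIZE, per the v6 note in the patch above.
 */
static void example_describe_hw_tso_limits(struct net_device *dev)
{
	netif_set_tso_max_size(dev, 256 * 1024);
	netif_set_tso_max_segs(dev, 64);
}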
From 97e719a82b43c6c2bb5eebdb3c5d479a332ac2ac Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 15 May 2022 21:24:53 -0700 Subject: net: fix possible race in skb_attempt_defer_free() A cpu can observe sd->defer_count reaching 128, and call smp_call_function_single_async(). The problem is that the remote CPU can clear sd->defer_count before the IPI is run/acknowledged. Other cpus can queue more packets and also decide to call smp_call_function_single_async() while the pending IPI was not yet delivered. This is a common issue with smp_call_function_single_async(). Callers must ensure correct synchronization and serialization. I triggered this issue while experimenting with a smaller threshold. Performing the call to smp_call_function_single_async() under sd->defer_lock protection did not solve the problem. Commit 5a18ceca6350 ("smp: Allow smp_call_function_single_async() to insert locked csd") replaced an informative WARN_ON_ONCE() with a return of -EBUSY, which is often ignored. Testing for CSD_FLAG_LOCK presence is racy anyway. Fixes: 68822bdf76f1 ("net: generalize skb freeing deferral to per-cpu lists") Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index d57ce248004c..cbaf312e365b 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3136,6 +3136,7 @@ struct softnet_data { /* Another possibly contended cache line */ spinlock_t defer_lock ____cacheline_aligned_in_smp; int defer_count; + int defer_ipi_scheduled; struct sk_buff *defer_list; call_single_data_t defer_csd; }; -- cgit v1.2.3 From c304eddcecfe2513ff98ce3ae97d1c196d82ba08 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 19 May 2022 13:20:54 -0700 Subject: net: wrap the wireless pointers in struct net_device in an ifdef Most protocol-specific pointers in struct net_device are under a respective ifdef. Wireless is the notable exception. Since there's a sizable number of custom-built kernels for datacenter workloads which don't build wireless, it seems reasonable to put those pointers behind an ifdef as well. While at it, move the IPv4 and IPv6 pointers up; those are special for obvious reasons. Acked-by: Johannes Berg Acked-by: Stefan Schmidt # ieee802154 Acked-by: Sven Eckelmann Signed-off-by: Jakub Kicinski Signed-off-by: David S. 
Miller --- include/linux/netdevice.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 1ce91cb8074a..f615a66c89e9 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2119,6 +2119,8 @@ struct net_device { /* Protocol-specific pointers */ + struct in_device __rcu *ip_ptr; + struct inet6_dev __rcu *ip6_ptr; #if IS_ENABLED(CONFIG_VLAN_8021Q) struct vlan_info __rcu *vlan_info; #endif @@ -2131,16 +2133,18 @@ struct net_device { #if IS_ENABLED(CONFIG_ATALK) void *atalk_ptr; #endif - struct in_device __rcu *ip_ptr; #if IS_ENABLED(CONFIG_DECNET) struct dn_dev __rcu *dn_ptr; #endif - struct inet6_dev __rcu *ip6_ptr; #if IS_ENABLED(CONFIG_AX25) void *ax25_ptr; #endif +#if IS_ENABLED(CONFIG_CFG80211) struct wireless_dev *ieee80211_ptr; +#endif +#if IS_ENABLED(CONFIG_IEEE802154) || IS_ENABLED(CONFIG_6LOWPAN) struct wpan_dev *ieee802154_ptr; +#endif #if IS_ENABLED(CONFIG_MPLS_ROUTING) struct mpls_dev __rcu *mpls_ptr; #endif -- cgit v1.2.3
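One practical consequence of this last patch: code that dereferences dev->ieee80211_ptr or dev->ieee802154_ptr must now either depend on the respective Kconfig symbol or guard the access. A minimal sketch, with a hypothetical helper name:

#include <linux/netdevice.h>

/* Returns true if the netdev has a cfg80211 wireless_dev attached.
 * Compiles whether or not CONFIG_CFG80211 is set, since the field
 * only exists when the symbol is enabled.
 */
static inline bool example_dev_is_wireless(const struct net_device *dev)
{
#if IS_ENABLED(CONFIG_CFG80211)
	return dev->ieee80211_ptr != NULL;
#else
	return false;
#endif
}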