From 75ea27d0d62281c31ee259c872dfdeb072cf5e39 Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Thu, 7 Oct 2021 18:16:50 +0200 Subject: net: introduce a function to check if a netdev name is in use __dev_get_by_name is currently used to either retrieve a net device reference using its name or to check if a name is already used by a registered net device (per ns). In the later case there is no need to return a reference to a net device. Introduce a new helper, netdev_name_in_use, to check if a name is currently used by a registered net device without leaking a reference the corresponding net device. This helper uses netdev_name_node_lookup instead of __dev_get_by_name as we don't need the extra logic retrieving a reference to the corresponding net device. Signed-off-by: Antoine Tenart Signed-off-by: David S. Miller --- include/linux/netdevice.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index d79163208dfd..15f4a658e436 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2955,6 +2955,7 @@ struct net_device *__dev_get_by_flags(struct net *net, unsigned short flags, struct net_device *dev_get_by_name(struct net *net, const char *name); struct net_device *dev_get_by_name_rcu(struct net *net, const char *name); struct net_device *__dev_get_by_name(struct net *net, const char *name); +bool netdev_name_in_use(struct net *net, const char *name); int dev_alloc_name(struct net_device *dev, const char *name); int dev_open(struct net_device *dev, struct netlink_ext_ack *extack); void dev_close(struct net_device *dev); -- cgit v1.2.3 From 0199215216978b612d4e8be11e878b87bc643033 Mon Sep 17 00:00:00 2001 From: Juhee Kang Date: Sun, 10 Oct 2021 13:03:29 +0900 Subject: mlxsw: spectrum: use netif_is_macsec() instead of open code Open code which is dev->priv_flags & IFF_MACSEC has already defined as netif_is_macsec(). So use netif_is_macsec() instead of open code. This patch doesn't change logic. Signed-off-by: Juhee Kang Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 15f4a658e436..0723c1314ea2 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -5237,7 +5237,7 @@ static inline void netif_keep_dst(struct net_device *dev) static inline bool netif_reduces_vlan_mtu(struct net_device *dev) { /* TODO: reserve and use an additional IFF bit, if we get more users */ - return dev->priv_flags & IFF_MACSEC; + return netif_is_macsec(dev); } extern struct pernet_operations __net_initdata loopback_net_ops; -- cgit v1.2.3 From 40af35fdf79c77e19c597e47cc8fe9a8e200de30 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 12 Oct 2021 09:06:32 -0700 Subject: netdevice: demote the type of some dev_addr_set() helpers __dev_addr_set() and dev_addr_mod() and pretty low level, let the arguments be void, there's no chance for confusion in callers converted to use them. Keep u8 in dev_addr_set() because some of the callers are converted from a loop and we want to make sure assignments are not from an array of a different type. Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 0723c1314ea2..f33af341bfb2 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4643,7 +4643,7 @@ void __hw_addr_init(struct netdev_hw_addr_list *list); /* Functions used for device addresses handling */ static inline void -__dev_addr_set(struct net_device *dev, const u8 *addr, size_t len) +__dev_addr_set(struct net_device *dev, const void *addr, size_t len) { memcpy(dev->dev_addr, addr, len); } @@ -4655,7 +4655,7 @@ static inline void dev_addr_set(struct net_device *dev, const u8 *addr) static inline void dev_addr_mod(struct net_device *dev, unsigned int offset, - const u8 *addr, size_t len) + const void *addr, size_t len) { memcpy(&dev->dev_addr[offset], addr, len); } -- cgit v1.2.3 From 9974cb5c879048f5144d0660c4932d98176213c4 Mon Sep 17 00:00:00 2001 From: Chen Wandun Date: Wed, 13 Oct 2021 17:47:02 +0800 Subject: net: delete redundant function declaration The implement of function netdev_all_upper_get_next_dev_rcu has been removed in: commit f1170fd462c6 ("net: Remove all_adj_list and its references") so delete redundant declaration in header file. Fixes: f1170fd462c6 ("net: Remove all_adj_list and its references") Signed-off-by: Chen Wandun Link: https://lore.kernel.org/r/20211013094702.3931071-1-chenwandun@huawei.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index f33af341bfb2..173984414f38 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4801,8 +4801,6 @@ struct netdev_nested_priv { bool netdev_has_upper_dev(struct net_device *dev, struct net_device *upper_dev); struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev, struct list_head **iter); -struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev, - struct list_head **iter); #ifdef CONFIG_LOCKDEP static LIST_HEAD(net_unlink_list); -- cgit v1.2.3 From 42df6e1d221dddc0f2acf2be37e68d553ad65f96 Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Fri, 8 Oct 2021 22:06:03 +0200 Subject: netfilter: Introduce egress hook MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Support classifying packets with netfilter on egress to satisfy user requirements such as: * outbound security policies for containers (Laura) * filtering and mangling intra-node Direct Server Return (DSR) traffic on a load balancer (Laura) * filtering locally generated traffic coming in through AF_PACKET, such as local ARP traffic generated for clustering purposes or DHCP (Laura; the AF_PACKET plumbing is contained in a follow-up commit) * L2 filtering from ingress and egress for AVB (Audio Video Bridging) and gPTP with nftables (Pablo) * in the future: in-kernel NAT64/NAT46 (Pablo) The egress hook introduced herein complements the ingress hook added by commit e687ad60af09 ("netfilter: add netfilter ingress hook after handle_ing() under unique static key"). A patch for nftables to hook up egress rules from user space has been submitted separately, so users may immediately take advantage of the feature. Alternatively or in addition to netfilter, packets can be classified with traffic control (tc). On ingress, packets are classified first by tc, then by netfilter. On egress, the order is reversed for symmetry. Conceptually, tc and netfilter can be thought of as layers, with netfilter layered above tc. Traffic control is capable of redirecting packets to another interface (man 8 tc-mirred). E.g., an ingress packet may be redirected from the host namespace to a container via a veth connection: tc ingress (host) -> tc egress (veth host) -> tc ingress (veth container) In this case, netfilter egress classifying is not performed when leaving the host namespace! That's because the packet is still on the tc layer. If tc redirects the packet to a physical interface in the host namespace such that it leaves the system, the packet is never subjected to netfilter egress classifying. That is only logical since it hasn't passed through netfilter ingress classifying either. Packets can alternatively be redirected at the netfilter layer using nft fwd. Such a packet *is* subjected to netfilter egress classifying since it has reached the netfilter layer. Internally, the skb->nf_skip_egress flag controls whether netfilter is invoked on egress by __dev_queue_xmit(). Because __dev_queue_xmit() may be called recursively by tunnel drivers such as vxlan, the flag is reverted to false after sch_handle_egress(). This ensures that netfilter is applied both on the overlay and underlying network. Interaction between tc and netfilter is possible by setting and querying skb->mark. If netfilter egress classifying is not enabled on any interface, it is patched out of the data path by way of a static_key and doesn't make a performance difference that is discernible from noise: Before: 1537 1538 1538 1537 1538 1537 Mb/sec After: 1536 1534 1539 1539 1539 1540 Mb/sec Before + tc accept: 1418 1418 1418 1419 1419 1418 Mb/sec After + tc accept: 1419 1424 1418 1419 1422 1420 Mb/sec Before + tc drop: 1620 1619 1619 1619 1620 1620 Mb/sec After + tc drop: 1616 1624 1625 1624 1622 1619 Mb/sec When netfilter egress classifying is enabled on at least one interface, a minimal performance penalty is incurred for every egress packet, even if the interface it's transmitted over doesn't have any netfilter egress rules configured. That is caused by checking dev->nf_hooks_egress against NULL. Measurements were performed on a Core i7-3615QM. Commands to reproduce: ip link add dev foo type dummy ip link set dev foo up modprobe pktgen echo "add_device foo" > /proc/net/pktgen/kpktgend_3 samples/pktgen/pktgen_bench_xmit_mode_queue_xmit.sh -i foo -n 400000000 -m "11:11:11:11:11:11" -d 1.1.1.1 Accept all traffic with tc: tc qdisc add dev foo clsact tc filter add dev foo egress bpf da bytecode '1,6 0 0 0,' Drop all traffic with tc: tc qdisc add dev foo clsact tc filter add dev foo egress bpf da bytecode '1,6 0 0 2,' Apply this patch when measuring packet drops to avoid errors in dmesg: https://lore.kernel.org/netdev/a73dda33-57f4-95d8-ea51-ed483abd6a7a@iogearbox.net/ Signed-off-by: Lukas Wunner Cc: Laura García Liébana Cc: John Fastabend Cc: Daniel Borkmann Cc: Alexei Starovoitov Cc: Eric Dumazet Cc: Thomas Graf Signed-off-by: Pablo Neira Ayuso --- include/linux/netdevice.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index d79163208dfd..e9a48068f306 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1861,6 +1861,7 @@ enum netdev_ml_priv_type { * @xps_maps: XXX: need comments on this one * @miniq_egress: clsact qdisc specific data for * egress processing + * @nf_hooks_egress: netfilter hooks executed for egress packets * @qdisc_hash: qdisc hash table * @watchdog_timeo: Represents the timeout that is used by * the watchdog (see dev_watchdog()) @@ -2161,6 +2162,9 @@ struct net_device { #ifdef CONFIG_NET_CLS_ACT struct mini_Qdisc __rcu *miniq_egress; #endif +#ifdef CONFIG_NETFILTER_EGRESS + struct nf_hook_entries __rcu *nf_hooks_egress; +#endif #ifdef CONFIG_NET_SCHED DECLARE_HASHTABLE (qdisc_hash, 4); -- cgit v1.2.3 From 29cbcd85828372333aa87542c51f2b2b0fd4380c Mon Sep 17 00:00:00 2001 From: "Ahmed S. Darwish" Date: Sat, 16 Oct 2021 10:49:10 +0200 Subject: net: sched: Remove Qdisc::running sequence counter The Qdisc::running sequence counter has two uses: 1. Reliably reading qdisc's tc statistics while the qdisc is running (a seqcount read/retry loop at gnet_stats_add_basic()). 2. As a flag, indicating whether the qdisc in question is running (without any retry loops). For the first usage, the Qdisc::running sequence counter write section, qdisc_run_begin() => qdisc_run_end(), covers a much wider area than what is actually needed: the raw qdisc's bstats update. A u64_stats sync point was thus introduced (in previous commits) inside the bstats structure itself. A local u64_stats write section is then started and stopped for the bstats updates. Use that u64_stats sync point mechanism for the bstats read/retry loop at gnet_stats_add_basic(). For the second qdisc->running usage, a __QDISC_STATE_RUNNING bit flag, accessed with atomic bitops, is sufficient. Using a bit flag instead of a sequence counter at qdisc_run_begin/end() and qdisc_is_running() leads to the SMP barriers implicitly added through raw_read_seqcount() and write_seqcount_begin/end() getting removed. All call sites have been surveyed though, and no required ordering was identified. Now that the qdisc->running sequence counter is no longer used, remove it. Note, using u64_stats implies no sequence counter protection for 64-bit architectures. This can lead to the qdisc tc statistics "packets" vs. "bytes" values getting out of sync on rare occasions. The individual values will still be valid. Signed-off-by: Ahmed S. Darwish Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: David S. Miller --- include/linux/netdevice.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 173984414f38..f9cd6fea213f 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1916,7 +1916,6 @@ enum netdev_ml_priv_type { * @sfp_bus: attached &struct sfp_bus structure. * * @qdisc_tx_busylock: lockdep class annotating Qdisc->busylock spinlock - * @qdisc_running_key: lockdep class annotating Qdisc->running seqcount * * @proto_down: protocol port state information can be sent to the * switch driver and used to set the phys state of the @@ -2250,7 +2249,6 @@ struct net_device { struct phy_device *phydev; struct sfp_bus *sfp_bus; struct lock_class_key *qdisc_tx_busylock; - struct lock_class_key *qdisc_running_key; bool proto_down; unsigned wol_enabled:1; unsigned threaded:1; @@ -2360,13 +2358,11 @@ static inline void netdev_for_each_tx_queue(struct net_device *dev, #define netdev_lockdep_set_classes(dev) \ { \ static struct lock_class_key qdisc_tx_busylock_key; \ - static struct lock_class_key qdisc_running_key; \ static struct lock_class_key qdisc_xmit_lock_key; \ static struct lock_class_key dev_addr_list_lock_key; \ unsigned int i; \ \ (dev)->qdisc_tx_busylock = &qdisc_tx_busylock_key; \ - (dev)->qdisc_running_key = &qdisc_running_key; \ lockdep_set_class(&(dev)->addr_list_lock, \ &dev_addr_list_lock_key); \ for (i = 0; i < (dev)->num_tx_queues; i++) \ -- cgit v1.2.3 From 4721031c3559db8eae61df305f10c00099a7c1d0 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 15 Nov 2021 09:05:51 -0800 Subject: net: move gro definitions to include/net/gro.h include/linux/netdevice.h became too big, move gro stuff into include/net/gro.h Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 348 ---------------------------------------------- 1 file changed, 348 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 3ec42495a43a..d95c9839ce90 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2520,109 +2520,6 @@ static inline void netif_napi_del(struct napi_struct *napi) synchronize_net(); } -struct napi_gro_cb { - /* Virtual address of skb_shinfo(skb)->frags[0].page + offset. */ - void *frag0; - - /* Length of frag0. */ - unsigned int frag0_len; - - /* This indicates where we are processing relative to skb->data. */ - int data_offset; - - /* This is non-zero if the packet cannot be merged with the new skb. */ - u16 flush; - - /* Save the IP ID here and check when we get to the transport layer */ - u16 flush_id; - - /* Number of segments aggregated. */ - u16 count; - - /* Start offset for remote checksum offload */ - u16 gro_remcsum_start; - - /* jiffies when first packet was created/queued */ - unsigned long age; - - /* Used in ipv6_gro_receive() and foo-over-udp */ - u16 proto; - - /* This is non-zero if the packet may be of the same flow. */ - u8 same_flow:1; - - /* Used in tunnel GRO receive */ - u8 encap_mark:1; - - /* GRO checksum is valid */ - u8 csum_valid:1; - - /* Number of checksums via CHECKSUM_UNNECESSARY */ - u8 csum_cnt:3; - - /* Free the skb? */ - u8 free:2; -#define NAPI_GRO_FREE 1 -#define NAPI_GRO_FREE_STOLEN_HEAD 2 - - /* Used in foo-over-udp, set in udp[46]_gro_receive */ - u8 is_ipv6:1; - - /* Used in GRE, set in fou/gue_gro_receive */ - u8 is_fou:1; - - /* Used to determine if flush_id can be ignored */ - u8 is_atomic:1; - - /* Number of gro_receive callbacks this packet already went through */ - u8 recursion_counter:4; - - /* GRO is done by frag_list pointer chaining. */ - u8 is_flist:1; - - /* used to support CHECKSUM_COMPLETE for tunneling protocols */ - __wsum csum; - - /* used in skb_gro_receive() slow path */ - struct sk_buff *last; -}; - -#define NAPI_GRO_CB(skb) ((struct napi_gro_cb *)(skb)->cb) - -#define GRO_RECURSION_LIMIT 15 -static inline int gro_recursion_inc_test(struct sk_buff *skb) -{ - return ++NAPI_GRO_CB(skb)->recursion_counter == GRO_RECURSION_LIMIT; -} - -typedef struct sk_buff *(*gro_receive_t)(struct list_head *, struct sk_buff *); -static inline struct sk_buff *call_gro_receive(gro_receive_t cb, - struct list_head *head, - struct sk_buff *skb) -{ - if (unlikely(gro_recursion_inc_test(skb))) { - NAPI_GRO_CB(skb)->flush |= 1; - return NULL; - } - - return cb(head, skb); -} - -typedef struct sk_buff *(*gro_receive_sk_t)(struct sock *, struct list_head *, - struct sk_buff *); -static inline struct sk_buff *call_gro_receive_sk(gro_receive_sk_t cb, - struct sock *sk, - struct list_head *head, - struct sk_buff *skb) -{ - if (unlikely(gro_recursion_inc_test(skb))) { - NAPI_GRO_CB(skb)->flush |= 1; - return NULL; - } - - return cb(sk, head, skb); -} - struct packet_type { __be16 type; /* This is really htons(ether_type). */ bool ignore_outgoing; @@ -3008,251 +2905,6 @@ int dev_restart(struct net_device *dev); int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb); int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb); -static inline unsigned int skb_gro_offset(const struct sk_buff *skb) -{ - return NAPI_GRO_CB(skb)->data_offset; -} - -static inline unsigned int skb_gro_len(const struct sk_buff *skb) -{ - return skb->len - NAPI_GRO_CB(skb)->data_offset; -} - -static inline void skb_gro_pull(struct sk_buff *skb, unsigned int len) -{ - NAPI_GRO_CB(skb)->data_offset += len; -} - -static inline void *skb_gro_header_fast(struct sk_buff *skb, - unsigned int offset) -{ - return NAPI_GRO_CB(skb)->frag0 + offset; -} - -static inline int skb_gro_header_hard(struct sk_buff *skb, unsigned int hlen) -{ - return NAPI_GRO_CB(skb)->frag0_len < hlen; -} - -static inline void skb_gro_frag0_invalidate(struct sk_buff *skb) -{ - NAPI_GRO_CB(skb)->frag0 = NULL; - NAPI_GRO_CB(skb)->frag0_len = 0; -} - -static inline void *skb_gro_header_slow(struct sk_buff *skb, unsigned int hlen, - unsigned int offset) -{ - if (!pskb_may_pull(skb, hlen)) - return NULL; - - skb_gro_frag0_invalidate(skb); - return skb->data + offset; -} - -static inline void *skb_gro_network_header(struct sk_buff *skb) -{ - return (NAPI_GRO_CB(skb)->frag0 ?: skb->data) + - skb_network_offset(skb); -} - -static inline void skb_gro_postpull_rcsum(struct sk_buff *skb, - const void *start, unsigned int len) -{ - if (NAPI_GRO_CB(skb)->csum_valid) - NAPI_GRO_CB(skb)->csum = csum_sub(NAPI_GRO_CB(skb)->csum, - csum_partial(start, len, 0)); -} - -/* GRO checksum functions. These are logical equivalents of the normal - * checksum functions (in skbuff.h) except that they operate on the GRO - * offsets and fields in sk_buff. - */ - -__sum16 __skb_gro_checksum_complete(struct sk_buff *skb); - -static inline bool skb_at_gro_remcsum_start(struct sk_buff *skb) -{ - return (NAPI_GRO_CB(skb)->gro_remcsum_start == skb_gro_offset(skb)); -} - -static inline bool __skb_gro_checksum_validate_needed(struct sk_buff *skb, - bool zero_okay, - __sum16 check) -{ - return ((skb->ip_summed != CHECKSUM_PARTIAL || - skb_checksum_start_offset(skb) < - skb_gro_offset(skb)) && - !skb_at_gro_remcsum_start(skb) && - NAPI_GRO_CB(skb)->csum_cnt == 0 && - (!zero_okay || check)); -} - -static inline __sum16 __skb_gro_checksum_validate_complete(struct sk_buff *skb, - __wsum psum) -{ - if (NAPI_GRO_CB(skb)->csum_valid && - !csum_fold(csum_add(psum, NAPI_GRO_CB(skb)->csum))) - return 0; - - NAPI_GRO_CB(skb)->csum = psum; - - return __skb_gro_checksum_complete(skb); -} - -static inline void skb_gro_incr_csum_unnecessary(struct sk_buff *skb) -{ - if (NAPI_GRO_CB(skb)->csum_cnt > 0) { - /* Consume a checksum from CHECKSUM_UNNECESSARY */ - NAPI_GRO_CB(skb)->csum_cnt--; - } else { - /* Update skb for CHECKSUM_UNNECESSARY and csum_level when we - * verified a new top level checksum or an encapsulated one - * during GRO. This saves work if we fallback to normal path. - */ - __skb_incr_checksum_unnecessary(skb); - } -} - -#define __skb_gro_checksum_validate(skb, proto, zero_okay, check, \ - compute_pseudo) \ -({ \ - __sum16 __ret = 0; \ - if (__skb_gro_checksum_validate_needed(skb, zero_okay, check)) \ - __ret = __skb_gro_checksum_validate_complete(skb, \ - compute_pseudo(skb, proto)); \ - if (!__ret) \ - skb_gro_incr_csum_unnecessary(skb); \ - __ret; \ -}) - -#define skb_gro_checksum_validate(skb, proto, compute_pseudo) \ - __skb_gro_checksum_validate(skb, proto, false, 0, compute_pseudo) - -#define skb_gro_checksum_validate_zero_check(skb, proto, check, \ - compute_pseudo) \ - __skb_gro_checksum_validate(skb, proto, true, check, compute_pseudo) - -#define skb_gro_checksum_simple_validate(skb) \ - __skb_gro_checksum_validate(skb, 0, false, 0, null_compute_pseudo) - -static inline bool __skb_gro_checksum_convert_check(struct sk_buff *skb) -{ - return (NAPI_GRO_CB(skb)->csum_cnt == 0 && - !NAPI_GRO_CB(skb)->csum_valid); -} - -static inline void __skb_gro_checksum_convert(struct sk_buff *skb, - __wsum pseudo) -{ - NAPI_GRO_CB(skb)->csum = ~pseudo; - NAPI_GRO_CB(skb)->csum_valid = 1; -} - -#define skb_gro_checksum_try_convert(skb, proto, compute_pseudo) \ -do { \ - if (__skb_gro_checksum_convert_check(skb)) \ - __skb_gro_checksum_convert(skb, \ - compute_pseudo(skb, proto)); \ -} while (0) - -struct gro_remcsum { - int offset; - __wsum delta; -}; - -static inline void skb_gro_remcsum_init(struct gro_remcsum *grc) -{ - grc->offset = 0; - grc->delta = 0; -} - -static inline void *skb_gro_remcsum_process(struct sk_buff *skb, void *ptr, - unsigned int off, size_t hdrlen, - int start, int offset, - struct gro_remcsum *grc, - bool nopartial) -{ - __wsum delta; - size_t plen = hdrlen + max_t(size_t, offset + sizeof(u16), start); - - BUG_ON(!NAPI_GRO_CB(skb)->csum_valid); - - if (!nopartial) { - NAPI_GRO_CB(skb)->gro_remcsum_start = off + hdrlen + start; - return ptr; - } - - ptr = skb_gro_header_fast(skb, off); - if (skb_gro_header_hard(skb, off + plen)) { - ptr = skb_gro_header_slow(skb, off + plen, off); - if (!ptr) - return NULL; - } - - delta = remcsum_adjust(ptr + hdrlen, NAPI_GRO_CB(skb)->csum, - start, offset); - - /* Adjust skb->csum since we changed the packet */ - NAPI_GRO_CB(skb)->csum = csum_add(NAPI_GRO_CB(skb)->csum, delta); - - grc->offset = off + hdrlen + offset; - grc->delta = delta; - - return ptr; -} - -static inline void skb_gro_remcsum_cleanup(struct sk_buff *skb, - struct gro_remcsum *grc) -{ - void *ptr; - size_t plen = grc->offset + sizeof(u16); - - if (!grc->delta) - return; - - ptr = skb_gro_header_fast(skb, grc->offset); - if (skb_gro_header_hard(skb, grc->offset + sizeof(u16))) { - ptr = skb_gro_header_slow(skb, plen, grc->offset); - if (!ptr) - return; - } - - remcsum_unadjust((__sum16 *)ptr, grc->delta); -} - -#ifdef CONFIG_XFRM_OFFLOAD -static inline void skb_gro_flush_final(struct sk_buff *skb, struct sk_buff *pp, int flush) -{ - if (PTR_ERR(pp) != -EINPROGRESS) - NAPI_GRO_CB(skb)->flush |= flush; -} -static inline void skb_gro_flush_final_remcsum(struct sk_buff *skb, - struct sk_buff *pp, - int flush, - struct gro_remcsum *grc) -{ - if (PTR_ERR(pp) != -EINPROGRESS) { - NAPI_GRO_CB(skb)->flush |= flush; - skb_gro_remcsum_cleanup(skb, grc); - skb->remcsum_offload = 0; - } -} -#else -static inline void skb_gro_flush_final(struct sk_buff *skb, struct sk_buff *pp, int flush) -{ - NAPI_GRO_CB(skb)->flush |= flush; -} -static inline void skb_gro_flush_final_remcsum(struct sk_buff *skb, - struct sk_buff *pp, - int flush, - struct gro_remcsum *grc) -{ - NAPI_GRO_CB(skb)->flush |= flush; - skb_gro_remcsum_cleanup(skb, grc); - skb->remcsum_offload = 0; -} -#endif static inline int dev_hard_header(struct sk_buff *skb, struct net_device *dev, unsigned short type, -- cgit v1.2.3 From 0b935d7f8c07bf0a192712bdbf76dbf45ef8b115 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 15 Nov 2021 09:05:52 -0800 Subject: net: gro: move skb_gro_receive_list to udp_offload.c This helper is used once, no need to keep it in fat net/core/skbuff.c Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index d95c9839ce90..ce6ee1453dbc 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2903,7 +2903,6 @@ struct net_device *dev_get_by_napi_id(unsigned int napi_id); int netdev_get_name(struct net *net, char *name, int ifindex); int dev_restart(struct net_device *dev); int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb); -int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb); static inline int dev_hard_header(struct sk_buff *skb, struct net_device *dev, -- cgit v1.2.3 From e456a18a390b96f22b0de2acd4d0f49c72ed2280 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 15 Nov 2021 09:05:53 -0800 Subject: net: gro: move skb_gro_receive into net/core/gro.c net/core/gro.c will contain all core gro functions, to shrink net/core/skbuff.c and net/core/dev.c Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index ce6ee1453dbc..93d397db9ec4 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2902,7 +2902,6 @@ struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex); struct net_device *dev_get_by_napi_id(unsigned int napi_id); int netdev_get_name(struct net *net, char *name, int ifindex); int dev_restart(struct net_device *dev); -int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb); static inline int dev_hard_header(struct sk_buff *skb, struct net_device *dev, -- cgit v1.2.3 From 587652bbdd06ab38a4c1b85e40f933d2cf4a1147 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 15 Nov 2021 09:05:54 -0800 Subject: net: gro: populate net/core/gro.c Move gro code and data from net/core/dev.c to net/core/gro.c to ease maintenance. gro_normal_list() and gro_normal_one() are inlined because they are called from both files. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 93d397db9ec4..31a7e6b27681 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3657,6 +3657,7 @@ int netif_rx_ni(struct sk_buff *skb); int netif_rx_any_context(struct sk_buff *skb); int netif_receive_skb(struct sk_buff *skb); int netif_receive_skb_core(struct sk_buff *skb); +void netif_receive_skb_list_internal(struct list_head *head); void netif_receive_skb_list(struct list_head *head); gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb); void napi_gro_flush(struct napi_struct *napi, bool flush_old); -- cgit v1.2.3 From 7071732c26fe2cf141185ed16a8a85d02495ae8c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 15 Nov 2021 09:23:02 -0800 Subject: net: use .data.once section in netdev_level_once() Same rationale than prior patch : using the dedicated section avoid holes and pack all these bool values. Signed-off-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 31a7e6b27681..dd328364dfe9 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4942,7 +4942,7 @@ void netdev_info(const struct net_device *dev, const char *format, ...); #define netdev_level_once(level, dev, fmt, ...) \ do { \ - static bool __print_once __read_mostly; \ + static bool __section(".data.once") __print_once; \ \ if (!__print_once) { \ __print_once = true; \ -- cgit v1.2.3 From 8160fb43d55d26d64607fd32fe69185a5f5fe41f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 16 Nov 2021 19:29:21 -0800 Subject: net: use an atomic_long_t for queue->trans_timeout tx_timeout_show() assumed dev_watchdog() would stop all the queues, to fetch queue->trans_timeout under protection of the queue->_xmit_lock. As we want to no longer disrupt transmits, we use an atomic_long_t instead. Signed-off-by: Eric Dumazet Cc: david decotigny Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index dd328364dfe9..1d22483cf78c 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -592,7 +592,7 @@ struct netdev_queue { * Number of TX timeouts for this queue * (/sys/class/net/DEV/Q/trans_timeout) */ - unsigned long trans_timeout; + atomic_long_t trans_timeout; /* Subordinate device that the queue has been assigned to */ struct net_device *sb_dev; -- cgit v1.2.3 From 5337824f4dc4bb26f38fbbba4ffb425a92803f15 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 16 Nov 2021 19:29:22 -0800 Subject: net: annotate accesses to queue->trans_start In following patches, dev_watchdog() will no longer stop all queues. It will read queue->trans_start locklessly. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 1d22483cf78c..279409ef2b18 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4095,10 +4095,21 @@ static inline void __netif_tx_unlock_bh(struct netdev_queue *txq) spin_unlock_bh(&txq->_xmit_lock); } +/* + * txq->trans_start can be read locklessly from dev_watchdog() + */ static inline void txq_trans_update(struct netdev_queue *txq) { if (txq->xmit_lock_owner != -1) - txq->trans_start = jiffies; + WRITE_ONCE(txq->trans_start, jiffies); +} + +static inline void txq_trans_cond_update(struct netdev_queue *txq) +{ + unsigned long now = jiffies; + + if (READ_ONCE(txq->trans_start) != now) + WRITE_ONCE(txq->trans_start, now); } /* legacy drivers only, netdev_start_xmit() sets txq->trans_start */ @@ -4106,8 +4117,7 @@ static inline void netif_trans_update(struct net_device *dev) { struct netdev_queue *txq = netdev_get_tx_queue(dev, 0); - if (txq->trans_start != jiffies) - txq->trans_start = jiffies; + txq_trans_cond_update(txq); } /** -- cgit v1.2.3 From dab8fe320726b38a6b1dc6a7ca6e386c5f7779e8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 16 Nov 2021 19:29:23 -0800 Subject: net: do not inline netif_tx_lock()/netif_tx_unlock() These are not fast path, there is no point in inlining them. Also provide netif_freeze_queues()/netif_unfreeze_queues() so that we can use them from dev_watchdog() in the following patch. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 39 ++------------------------------------- 1 file changed, 2 insertions(+), 37 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 279409ef2b18..4f4a299e92de 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4126,27 +4126,7 @@ static inline void netif_trans_update(struct net_device *dev) * * Get network device transmit lock */ -static inline void netif_tx_lock(struct net_device *dev) -{ - unsigned int i; - int cpu; - - spin_lock(&dev->tx_global_lock); - cpu = smp_processor_id(); - for (i = 0; i < dev->num_tx_queues; i++) { - struct netdev_queue *txq = netdev_get_tx_queue(dev, i); - - /* We are the only thread of execution doing a - * freeze, but we have to grab the _xmit_lock in - * order to synchronize with threads which are in - * the ->hard_start_xmit() handler and already - * checked the frozen bit. - */ - __netif_tx_lock(txq, cpu); - set_bit(__QUEUE_STATE_FROZEN, &txq->state); - __netif_tx_unlock(txq); - } -} +void netif_tx_lock(struct net_device *dev); static inline void netif_tx_lock_bh(struct net_device *dev) { @@ -4154,22 +4134,7 @@ static inline void netif_tx_lock_bh(struct net_device *dev) netif_tx_lock(dev); } -static inline void netif_tx_unlock(struct net_device *dev) -{ - unsigned int i; - - for (i = 0; i < dev->num_tx_queues; i++) { - struct netdev_queue *txq = netdev_get_tx_queue(dev, i); - - /* No need to grab the _xmit_lock here. If the - * queue is not stopped for another reason, we - * force a schedule. - */ - clear_bit(__QUEUE_STATE_FROZEN, &txq->state); - netif_schedule_queue(txq); - } - spin_unlock(&dev->tx_global_lock); -} +void netif_tx_unlock(struct net_device *dev); static inline void netif_tx_unlock_bh(struct net_device *dev) { -- cgit v1.2.3 From adeef3e32146a8d2a73c399dc6f5d76a449131b1 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 19 Nov 2021 06:21:51 -0800 Subject: net: constify netdev->dev_addr Commit 406f42fa0d3c ("net-next: When a bond have a massive amount of VLANs...") introduced a rbtree for faster Ethernet address look up. We converted all users to make modifications via appropriate helpers, make netdev->dev_addr const. The update helpers need to upcast from the buffer to struct netdev_hw_addr. Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/linux/netdevice.h | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 4f4a299e92de..2462195784a9 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2117,7 +2117,7 @@ struct net_device { * Cache lines mostly used on receive path (including eth_type_trans()) */ /* Interface address info used in eth_type_trans() */ - unsigned char *dev_addr; + const unsigned char *dev_addr; struct netdev_rx_queue *_rx; unsigned int num_rx_queues; @@ -4268,10 +4268,13 @@ void __hw_addr_unsync_dev(struct netdev_hw_addr_list *list, void __hw_addr_init(struct netdev_hw_addr_list *list); /* Functions used for device addresses handling */ +void dev_addr_mod(struct net_device *dev, unsigned int offset, + const void *addr, size_t len); + static inline void __dev_addr_set(struct net_device *dev, const void *addr, size_t len) { - memcpy(dev->dev_addr, addr, len); + dev_addr_mod(dev, 0, addr, len); } static inline void dev_addr_set(struct net_device *dev, const u8 *addr) @@ -4279,13 +4282,6 @@ static inline void dev_addr_set(struct net_device *dev, const u8 *addr) __dev_addr_set(dev, addr, dev->addr_len); } -static inline void -dev_addr_mod(struct net_device *dev, unsigned int offset, - const void *addr, size_t len) -{ - memcpy(&dev->dev_addr[offset], addr, len); -} - int dev_addr_add(struct net_device *dev, const unsigned char *addr, unsigned char addr_type); int dev_addr_del(struct net_device *dev, const unsigned char *addr, -- cgit v1.2.3 From d07b26f5bbea9ade34dfd6abea7b3ca056c03cd1 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 19 Nov 2021 06:21:53 -0800 Subject: dev_addr: add a modification check netdev->dev_addr should only be modified via helpers, but someone may be casting off the const. Add a runtime check to catch abuses. Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/linux/netdevice.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 2462195784a9..cb7f2661d187 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1942,6 +1942,8 @@ enum netdev_ml_priv_type { * @unlink_list: As netif_addr_lock() can be called recursively, * keep a list of interfaces to be deleted. * + * @dev_addr_shadow: Copy of @dev_addr to catch direct writes. + * * FIXME: cleanup struct net_device such that network protocol info * moves out. */ @@ -2268,6 +2270,8 @@ struct net_device { /* protected by rtnl_lock */ struct bpf_xdp_entity xdp_state[__MAX_XDP_MODE]; + + u8 dev_addr_shadow[MAX_ADDR_LEN]; }; #define to_net_dev(d) container_of(d, struct net_device, dev) @@ -4288,6 +4292,7 @@ int dev_addr_del(struct net_device *dev, const unsigned char *addr, unsigned char addr_type); void dev_addr_flush(struct net_device *dev); int dev_addr_init(struct net_device *dev); +void dev_addr_check(struct net_device *dev); /* Functions used for unicast addresses handling */ int dev_uc_add(struct net_device *dev, const unsigned char *addr); -- cgit v1.2.3 From 4b66d2161b8125b6caa6971815e85631cf3cf36f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 19 Nov 2021 07:43:31 -0800 Subject: net: annotate accesses to dev->gso_max_size dev->gso_max_size is written under RTNL protection, or when the device is not yet visible, but is read locklessly. Add the READ_ONCE()/WRITE_ONCE() pairs, and use netif_set_gso_max_size() where we can to better document what is going on. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index cb7f2661d187..14eeb58ed197 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4731,7 +4731,8 @@ static inline bool netif_needs_gso(struct sk_buff *skb, static inline void netif_set_gso_max_size(struct net_device *dev, unsigned int size) { - dev->gso_max_size = size; + /* dev->gso_max_size is read locklessly from sk_setup_caps() */ + WRITE_ONCE(dev->gso_max_size, size); } static inline void skb_gso_error_unwind(struct sk_buff *skb, __be16 protocol, -- cgit v1.2.3 From 6d872df3e3b91532b142de9044e5b4984017a55f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 19 Nov 2021 07:43:32 -0800 Subject: net: annotate accesses to dev->gso_max_segs dev->gso_max_segs is written under RTNL protection, or when the device is not yet visible, but is read locklessly. Add netif_set_gso_max_segs() helper. Add the READ_ONCE()/WRITE_ONCE() pairs, and use netif_set_gso_max_segs() where we can to better document what is going on. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 14eeb58ed197..df049864661d 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4735,6 +4735,13 @@ static inline void netif_set_gso_max_size(struct net_device *dev, WRITE_ONCE(dev->gso_max_size, size); } +static inline void netif_set_gso_max_segs(struct net_device *dev, + unsigned int segs) +{ + /* dev->gso_max_segs is read locklessly from sk_setup_caps() */ + WRITE_ONCE(dev->gso_max_segs, segs); +} + static inline void skb_gso_error_unwind(struct sk_buff *skb, __be16 protocol, int pulled_hlen, u16 mac_offset, int mac_len) -- cgit v1.2.3 From 2106efda785b55a8957efed9a52dfa28ee0d7280 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 22 Nov 2021 17:24:47 -0800 Subject: net: remove .ndo_change_proto_down .ndo_change_proto_down was added seemingly to enable out-of-tree implementations. Over 2.5yrs later we still have no real users upstream. Hardwire the generic implementation for now, we can revert once real users materialize. (rocker is a test vehicle, not a user.) We need to drop the optimization on the sysfs side, because unlike ndos priv_flags will be changed at runtime, so we'd need READ_ONCE/WRITE_ONCE everywhere.. Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/linux/netdevice.h | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index df049864661d..db3bff1ae7fd 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1297,11 +1297,6 @@ struct netdev_net_notifier { * TX queue. * int (*ndo_get_iflink)(const struct net_device *dev); * Called to get the iflink value of this device. - * void (*ndo_change_proto_down)(struct net_device *dev, - * bool proto_down); - * This function is used to pass protocol port error state information - * to the switch driver. The switch driver can react to the proto_down - * by doing a phys down on the associated switch port. * int (*ndo_fill_metadata_dst)(struct net_device *dev, struct sk_buff *skb); * This function is used to get egress tunnel information for given skb. * This is useful for retrieving outer tunnel header parameters while @@ -1542,8 +1537,6 @@ struct net_device_ops { int queue_index, u32 maxrate); int (*ndo_get_iflink)(const struct net_device *dev); - int (*ndo_change_proto_down)(struct net_device *dev, - bool proto_down); int (*ndo_fill_metadata_dst)(struct net_device *dev, struct sk_buff *skb); void (*ndo_set_rx_headroom)(struct net_device *dev, @@ -1612,6 +1605,7 @@ struct net_device_ops { * @IFF_LIVE_RENAME_OK: rename is allowed while device is up and running * @IFF_TX_SKB_NO_LINEAR: device/driver is capable of xmitting frames with * skb_headlen(skb) == 0 (data starts from frag0) + * @IFF_CHANGE_PROTO_DOWN: device supports setting carrier via IFLA_PROTO_DOWN */ enum netdev_priv_flags { IFF_802_1Q_VLAN = 1<<0, @@ -1646,6 +1640,7 @@ enum netdev_priv_flags { IFF_L3MDEV_RX_HANDLER = 1<<29, IFF_LIVE_RENAME_OK = 1<<30, IFF_TX_SKB_NO_LINEAR = 1<<31, + IFF_CHANGE_PROTO_DOWN = BIT_ULL(32), }; #define IFF_802_1Q_VLAN IFF_802_1Q_VLAN @@ -1982,7 +1977,7 @@ struct net_device { /* Read-mostly cache-line for fast-path access */ unsigned int flags; - unsigned int priv_flags; + unsigned long long priv_flags; const struct net_device_ops *netdev_ops; int ifindex; unsigned short gflags; @@ -3735,7 +3730,6 @@ int dev_get_port_parent_id(struct net_device *dev, struct netdev_phys_item_id *ppid, bool recurse); bool netdev_port_same_parent_id(struct net_device *a, struct net_device *b); int dev_change_proto_down(struct net_device *dev, bool proto_down); -int dev_change_proto_down_generic(struct net_device *dev, bool proto_down); void dev_change_proto_down_reason(struct net_device *dev, unsigned long mask, u32 value); struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev, bool *again); -- cgit v1.2.3 From 7a10d8c810cfad3e79372d7d1c77899d86cd6662 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 30 Nov 2021 09:01:55 -0800 Subject: net: annotate data-races on txq->xmit_lock_owner syzbot found that __dev_queue_xmit() is reading txq->xmit_lock_owner without annotations. No serious issue there, let's document what is happening there. BUG: KCSAN: data-race in __dev_queue_xmit / __dev_queue_xmit write to 0xffff888139d09484 of 4 bytes by interrupt on cpu 0: __netif_tx_unlock include/linux/netdevice.h:4437 [inline] __dev_queue_xmit+0x948/0xf70 net/core/dev.c:4229 dev_queue_xmit_accel+0x19/0x20 net/core/dev.c:4265 macvlan_queue_xmit drivers/net/macvlan.c:543 [inline] macvlan_start_xmit+0x2b3/0x3d0 drivers/net/macvlan.c:567 __netdev_start_xmit include/linux/netdevice.h:4987 [inline] netdev_start_xmit include/linux/netdevice.h:5001 [inline] xmit_one+0x105/0x2f0 net/core/dev.c:3590 dev_hard_start_xmit+0x72/0x120 net/core/dev.c:3606 sch_direct_xmit+0x1b2/0x7c0 net/sched/sch_generic.c:342 __dev_xmit_skb+0x83d/0x1370 net/core/dev.c:3817 __dev_queue_xmit+0x590/0xf70 net/core/dev.c:4194 dev_queue_xmit+0x13/0x20 net/core/dev.c:4259 neigh_hh_output include/net/neighbour.h:511 [inline] neigh_output include/net/neighbour.h:525 [inline] ip6_finish_output2+0x995/0xbb0 net/ipv6/ip6_output.c:126 __ip6_finish_output net/ipv6/ip6_output.c:191 [inline] ip6_finish_output+0x444/0x4c0 net/ipv6/ip6_output.c:201 NF_HOOK_COND include/linux/netfilter.h:296 [inline] ip6_output+0x10e/0x210 net/ipv6/ip6_output.c:224 dst_output include/net/dst.h:450 [inline] NF_HOOK include/linux/netfilter.h:307 [inline] ndisc_send_skb+0x486/0x610 net/ipv6/ndisc.c:508 ndisc_send_rs+0x3b0/0x3e0 net/ipv6/ndisc.c:702 addrconf_rs_timer+0x370/0x540 net/ipv6/addrconf.c:3898 call_timer_fn+0x2e/0x240 kernel/time/timer.c:1421 expire_timers+0x116/0x240 kernel/time/timer.c:1466 __run_timers+0x368/0x410 kernel/time/timer.c:1734 run_timer_softirq+0x2e/0x60 kernel/time/timer.c:1747 __do_softirq+0x158/0x2de kernel/softirq.c:558 __irq_exit_rcu kernel/softirq.c:636 [inline] irq_exit_rcu+0x37/0x70 kernel/softirq.c:648 sysvec_apic_timer_interrupt+0x3e/0xb0 arch/x86/kernel/apic/apic.c:1097 asm_sysvec_apic_timer_interrupt+0x12/0x20 read to 0xffff888139d09484 of 4 bytes by interrupt on cpu 1: __dev_queue_xmit+0x5e3/0xf70 net/core/dev.c:4213 dev_queue_xmit_accel+0x19/0x20 net/core/dev.c:4265 macvlan_queue_xmit drivers/net/macvlan.c:543 [inline] macvlan_start_xmit+0x2b3/0x3d0 drivers/net/macvlan.c:567 __netdev_start_xmit include/linux/netdevice.h:4987 [inline] netdev_start_xmit include/linux/netdevice.h:5001 [inline] xmit_one+0x105/0x2f0 net/core/dev.c:3590 dev_hard_start_xmit+0x72/0x120 net/core/dev.c:3606 sch_direct_xmit+0x1b2/0x7c0 net/sched/sch_generic.c:342 __dev_xmit_skb+0x83d/0x1370 net/core/dev.c:3817 __dev_queue_xmit+0x590/0xf70 net/core/dev.c:4194 dev_queue_xmit+0x13/0x20 net/core/dev.c:4259 neigh_resolve_output+0x3db/0x410 net/core/neighbour.c:1523 neigh_output include/net/neighbour.h:527 [inline] ip6_finish_output2+0x9be/0xbb0 net/ipv6/ip6_output.c:126 __ip6_finish_output net/ipv6/ip6_output.c:191 [inline] ip6_finish_output+0x444/0x4c0 net/ipv6/ip6_output.c:201 NF_HOOK_COND include/linux/netfilter.h:296 [inline] ip6_output+0x10e/0x210 net/ipv6/ip6_output.c:224 dst_output include/net/dst.h:450 [inline] NF_HOOK include/linux/netfilter.h:307 [inline] ndisc_send_skb+0x486/0x610 net/ipv6/ndisc.c:508 ndisc_send_rs+0x3b0/0x3e0 net/ipv6/ndisc.c:702 addrconf_rs_timer+0x370/0x540 net/ipv6/addrconf.c:3898 call_timer_fn+0x2e/0x240 kernel/time/timer.c:1421 expire_timers+0x116/0x240 kernel/time/timer.c:1466 __run_timers+0x368/0x410 kernel/time/timer.c:1734 run_timer_softirq+0x2e/0x60 kernel/time/timer.c:1747 __do_softirq+0x158/0x2de kernel/softirq.c:558 __irq_exit_rcu kernel/softirq.c:636 [inline] irq_exit_rcu+0x37/0x70 kernel/softirq.c:648 sysvec_apic_timer_interrupt+0x8d/0xb0 arch/x86/kernel/apic/apic.c:1097 asm_sysvec_apic_timer_interrupt+0x12/0x20 kcsan_setup_watchpoint+0x94/0x420 kernel/kcsan/core.c:443 folio_test_anon include/linux/page-flags.h:581 [inline] PageAnon include/linux/page-flags.h:586 [inline] zap_pte_range+0x5ac/0x10e0 mm/memory.c:1347 zap_pmd_range mm/memory.c:1467 [inline] zap_pud_range mm/memory.c:1496 [inline] zap_p4d_range mm/memory.c:1517 [inline] unmap_page_range+0x2dc/0x3d0 mm/memory.c:1538 unmap_single_vma+0x157/0x210 mm/memory.c:1583 unmap_vmas+0xd0/0x180 mm/memory.c:1615 exit_mmap+0x23d/0x470 mm/mmap.c:3170 __mmput+0x27/0x1b0 kernel/fork.c:1113 mmput+0x3d/0x50 kernel/fork.c:1134 exit_mm+0xdb/0x170 kernel/exit.c:507 do_exit+0x608/0x17a0 kernel/exit.c:819 do_group_exit+0xce/0x180 kernel/exit.c:929 get_signal+0xfc3/0x1550 kernel/signal.c:2852 arch_do_signal_or_restart+0x8c/0x2e0 arch/x86/kernel/signal.c:868 handle_signal_work kernel/entry/common.c:148 [inline] exit_to_user_mode_loop kernel/entry/common.c:172 [inline] exit_to_user_mode_prepare+0x113/0x190 kernel/entry/common.c:207 __syscall_exit_to_user_mode_work kernel/entry/common.c:289 [inline] syscall_exit_to_user_mode+0x20/0x40 kernel/entry/common.c:300 do_syscall_64+0x50/0xd0 arch/x86/entry/common.c:86 entry_SYSCALL_64_after_hwframe+0x44/0xae value changed: 0x00000000 -> 0xffffffff Reported by Kernel Concurrency Sanitizer on: CPU: 1 PID: 28712 Comm: syz-executor.0 Tainted: G W 5.16.0-rc1-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Eric Dumazet Reported-by: syzbot Link: https://lore.kernel.org/r/20211130170155.2331929-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 3ec42495a43a..be5cb3360b94 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4404,7 +4404,8 @@ static inline u32 netif_msg_init(int debug_value, int default_msg_enable_bits) static inline void __netif_tx_lock(struct netdev_queue *txq, int cpu) { spin_lock(&txq->_xmit_lock); - txq->xmit_lock_owner = cpu; + /* Pairs with READ_ONCE() in __dev_queue_xmit() */ + WRITE_ONCE(txq->xmit_lock_owner, cpu); } static inline bool __netif_tx_acquire(struct netdev_queue *txq) @@ -4421,26 +4422,32 @@ static inline void __netif_tx_release(struct netdev_queue *txq) static inline void __netif_tx_lock_bh(struct netdev_queue *txq) { spin_lock_bh(&txq->_xmit_lock); - txq->xmit_lock_owner = smp_processor_id(); + /* Pairs with READ_ONCE() in __dev_queue_xmit() */ + WRITE_ONCE(txq->xmit_lock_owner, smp_processor_id()); } static inline bool __netif_tx_trylock(struct netdev_queue *txq) { bool ok = spin_trylock(&txq->_xmit_lock); - if (likely(ok)) - txq->xmit_lock_owner = smp_processor_id(); + + if (likely(ok)) { + /* Pairs with READ_ONCE() in __dev_queue_xmit() */ + WRITE_ONCE(txq->xmit_lock_owner, smp_processor_id()); + } return ok; } static inline void __netif_tx_unlock(struct netdev_queue *txq) { - txq->xmit_lock_owner = -1; + /* Pairs with READ_ONCE() in __dev_queue_xmit() */ + WRITE_ONCE(txq->xmit_lock_owner, -1); spin_unlock(&txq->_xmit_lock); } static inline void __netif_tx_unlock_bh(struct netdev_queue *txq) { - txq->xmit_lock_owner = -1; + /* Pairs with READ_ONCE() in __dev_queue_xmit() */ + WRITE_ONCE(txq->xmit_lock_owner, -1); spin_unlock_bh(&txq->_xmit_lock); } -- cgit v1.2.3 From 4d92b95ff2f95f13df9bad0b5a25a9f60e72758d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 4 Dec 2021 20:21:57 -0800 Subject: net: add net device refcount tracker infrastructure net device are refcounted. Over the years we had numerous bugs caused by imbalanced dev_hold() and dev_put() calls. The general idea is to be able to precisely pair each decrement with a corresponding prior increment. Both share a cookie, basically a pointer to private data storing stack traces. This patch adds dev_hold_track() and dev_put_track(). To use these helpers, each data structure owning a refcount should also use a "netdevice_tracker" to pair the hold and put. netdevice_tracker dev_tracker; ... dev_hold_track(dev, &dev_tracker, GFP_ATOMIC); ... dev_put_track(dev, &dev_tracker); Whenever a leak happens, we will get precise stack traces of the point dev_hold_track() happened, at device dismantle phase. We will also get a stack trace if too many dev_put_track() for the same netdevice_tracker are attempted. This is guarded by CONFIG_NET_DEV_REFCNT_TRACKER option. Signed-off-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 45 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 65117f01d5f2..143d60ed0047 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -48,6 +48,7 @@ #include #include #include +#include struct netpoll_info; struct device; @@ -300,6 +301,12 @@ enum netdev_state_t { }; +#ifdef CONFIG_NET_DEV_REFCNT_TRACKER +typedef struct ref_tracker *netdevice_tracker; +#else +typedef struct {} netdevice_tracker; +#endif + struct gro_list { struct list_head list; int count; @@ -1865,6 +1872,7 @@ enum netdev_ml_priv_type { * @proto_down_reason: reason a netdev interface is held down * @pcpu_refcnt: Number of references to this device * @dev_refcnt: Number of references to this device + * @refcnt_tracker: Tracker directory for tracked references to this device * @todo_list: Delayed register/unregister * @link_watch_list: XXX: need comments on this one * @@ -2178,6 +2186,7 @@ struct net_device { #else refcount_t dev_refcnt; #endif + struct ref_tracker_dir refcnt_tracker; struct list_head link_watch_list; @@ -3805,6 +3814,7 @@ void netdev_run_todo(void); * @dev: network device * * Release reference to device to allow it to be freed. + * Try using dev_put_track() instead. */ static inline void dev_put(struct net_device *dev) { @@ -3822,6 +3832,7 @@ static inline void dev_put(struct net_device *dev) * @dev: network device * * Hold reference to device to keep it from being freed. + * Try using dev_hold_track() instead. */ static inline void dev_hold(struct net_device *dev) { @@ -3834,6 +3845,40 @@ static inline void dev_hold(struct net_device *dev) } } +static inline void netdev_tracker_alloc(struct net_device *dev, + netdevice_tracker *tracker, gfp_t gfp) +{ +#ifdef CONFIG_NET_DEV_REFCNT_TRACKER + ref_tracker_alloc(&dev->refcnt_tracker, tracker, gfp); +#endif +} + +static inline void netdev_tracker_free(struct net_device *dev, + netdevice_tracker *tracker) +{ +#ifdef CONFIG_NET_DEV_REFCNT_TRACKER + ref_tracker_free(&dev->refcnt_tracker, tracker); +#endif +} + +static inline void dev_hold_track(struct net_device *dev, + netdevice_tracker *tracker, gfp_t gfp) +{ + if (dev) { + dev_hold(dev); + netdev_tracker_alloc(dev, tracker, gfp); + } +} + +static inline void dev_put_track(struct net_device *dev, + netdevice_tracker *tracker) +{ + if (dev) { + netdev_tracker_free(dev, tracker); + dev_put(dev); + } +} + /* Carrier loss detection, dial on demand. The functions netif_carrier_on * and _off may be called from IRQ context, but it is caller * who is responsible for serialization of these calls. -- cgit v1.2.3 From 80e8921b2b72c300ca56a01729004d30bedb82cd Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 4 Dec 2021 20:21:58 -0800 Subject: net: add net device refcount tracker to struct netdev_rx_queue This helps debugging net device refcount leaks. Signed-off-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 143d60ed0047..3d691fadd569 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -741,6 +741,8 @@ struct netdev_rx_queue { #endif struct kobject kobj; struct net_device *dev; + netdevice_tracker dev_tracker; + #ifdef CONFIG_XDP_SOCKETS struct xsk_buff_pool *pool; #endif -- cgit v1.2.3 From 0b688f24b7d611db3a02f3d4ab562d049c78a17d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 4 Dec 2021 20:21:59 -0800 Subject: net: add net device refcount tracker to struct netdev_queue This will help debugging pesky netdev reference leaks. Signed-off-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 3d691fadd569..b4f704337f65 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -586,6 +586,8 @@ struct netdev_queue { * read-mostly part */ struct net_device *dev; + netdevice_tracker dev_tracker; + struct Qdisc __rcu *qdisc; struct Qdisc *qdisc_sleeping; #ifdef CONFIG_SYSFS -- cgit v1.2.3 From 9038c320001dd07f60736018edf608ac5baca0ab Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 4 Dec 2021 20:22:03 -0800 Subject: net: dst: add net device refcount tracking to dst_entry We want to track all dev_hold()/dev_put() to ease leak hunting. Signed-off-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index b4f704337f65..afed3b10491b 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3883,6 +3883,23 @@ static inline void dev_put_track(struct net_device *dev, } } +static inline void dev_replace_track(struct net_device *odev, + struct net_device *ndev, + netdevice_tracker *tracker, + gfp_t gfp) +{ +#ifdef CONFIG_NET_DEV_REFCNT_TRACKER + if (odev) + ref_tracker_free(&odev->refcnt_tracker, tracker); +#endif + dev_hold(ndev); + dev_put(odev); +#ifdef CONFIG_NET_DEV_REFCNT_TRACKER + if (ndev) + ref_tracker_alloc(&ndev->refcnt_tracker, tracker, gfp); +#endif +} + /* Carrier loss detection, dial on demand. The functions netif_carrier_on * and _off may be called from IRQ context, but it is caller * who is responsible for serialization of these calls. -- cgit v1.2.3 From 63f13937cbe9b00982dfc8e578b1aec8e5037333 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 4 Dec 2021 20:22:14 -0800 Subject: net: linkwatch: add net device refcount tracker Add a netdevice_tracker inside struct net_device, to track the self reference when a device is in lweventlist. Signed-off-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index afed3b10491b..69dca1edd5a6 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1950,6 +1950,7 @@ enum netdev_ml_priv_type { * keep a list of interfaces to be deleted. * * @dev_addr_shadow: Copy of @dev_addr to catch direct writes. + * @linkwatch_dev_tracker: refcount tracker used by linkwatch. * * FIXME: cleanup struct net_device such that network protocol info * moves out. @@ -2280,6 +2281,7 @@ struct net_device { struct bpf_xdp_entity xdp_state[__MAX_XDP_MODE]; u8 dev_addr_shadow[MAX_ADDR_LEN]; + netdevice_tracker linkwatch_dev_tracker; }; #define to_net_dev(d) container_of(d, struct net_device, dev) -- cgit v1.2.3 From f12bf6f3f942b37de65eeea8be25903587fec930 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 6 Dec 2021 17:30:30 -0800 Subject: net: watchdog: add net device refcount tracker Add a netdevice_tracker inside struct net_device, to track the self reference when a device has an active watchdog timer. Signed-off-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 69dca1edd5a6..1a748ee9a421 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1951,6 +1951,7 @@ enum netdev_ml_priv_type { * * @dev_addr_shadow: Copy of @dev_addr to catch direct writes. * @linkwatch_dev_tracker: refcount tracker used by linkwatch. + * @watchdog_dev_tracker: refcount tracker used by watchdog. * * FIXME: cleanup struct net_device such that network protocol info * moves out. @@ -2282,6 +2283,7 @@ struct net_device { u8 dev_addr_shadow[MAX_ADDR_LEN]; netdevice_tracker linkwatch_dev_tracker; + netdevice_tracker watchdog_dev_tracker; }; #define to_net_dev(d) container_of(d, struct net_device, dev) -- cgit v1.2.3 From 9ba74e6c9e9d0c5c1e5792a7111fc7d1a0589cb8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 9 Dec 2021 23:44:21 -0800 Subject: net: add networking namespace refcount tracker We have 100+ syzbot reports about netns being dismantled too soon, still unresolved as of today. We think a missing get_net() or an extra put_net() is the root cause. In order to find the bug(s), and be able to spot future ones, this patch adds CONFIG_NET_NS_REFCNT_TRACKER and new helpers to precisely pair all put_net() with corresponding get_net(). To use these helpers, each data structure owning a refcount should also use a "netns_tracker" to pair the get and put. Signed-off-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 1a748ee9a421..235d5d082f1a 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -48,7 +48,7 @@ #include #include #include -#include +#include struct netpoll_info; struct device; @@ -300,13 +300,6 @@ enum netdev_state_t { __LINK_STATE_TESTING, }; - -#ifdef CONFIG_NET_DEV_REFCNT_TRACKER -typedef struct ref_tracker *netdevice_tracker; -#else -typedef struct {} netdevice_tracker; -#endif - struct gro_list { struct list_head list; int count; -- cgit v1.2.3 From 9280ac2e6f199cddcd746a9ba459136b8666287b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 14 Dec 2021 07:15:15 -0800 Subject: net: dev_replace_track() cleanup Use existing helpers (netdev_tracker_free() and netdev_tracker_alloc()) to remove ifdefery. Signed-off-by: Eric Dumazet Link: https://lore.kernel.org/r/20211214151515.312535-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 235d5d082f1a..c06e9dc1a317 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3885,16 +3885,14 @@ static inline void dev_replace_track(struct net_device *odev, netdevice_tracker *tracker, gfp_t gfp) { -#ifdef CONFIG_NET_DEV_REFCNT_TRACKER if (odev) - ref_tracker_free(&odev->refcnt_tracker, tracker); -#endif + netdev_tracker_free(odev, tracker); + dev_hold(ndev); dev_put(odev); -#ifdef CONFIG_NET_DEV_REFCNT_TRACKER + if (ndev) - ref_tracker_alloc(&ndev->refcnt_tracker, tracker, gfp); -#endif + netdev_tracker_alloc(ndev, tracker, gfp); } /* Carrier loss detection, dial on demand. The functions netif_carrier_on -- cgit v1.2.3 From f1d9268e061863ead77b07f5a6807d063e28a1c2 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 14 Dec 2021 07:09:33 -0800 Subject: net: add net device refcount tracker to struct packet_type Most notable changes are in af_packet, tipc ones are trivial. Signed-off-by: Eric Dumazet Cc: Jon Maloy Cc: Ying Xue Signed-off-by: David S. Miller --- include/linux/netdevice.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index c06e9dc1a317..a419718612c6 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2533,6 +2533,7 @@ struct packet_type { __be16 type; /* This is really htons(ether_type). */ bool ignore_outgoing; struct net_device *dev; /* NULL is wildcarded here */ + netdevice_tracker dev_tracker; int (*func) (struct sk_buff *, struct net_device *, struct packet_type *, -- cgit v1.2.3 From b62e3317b68d9c84301940ca8ca9c35a584111b2 Mon Sep 17 00:00:00 2001 From: Xiang wangx Date: Thu, 16 Dec 2021 23:19:16 +0800 Subject: net: fix typo in a comment The double 'as' in a comment is repeated, thus it should be removed. Signed-off-by: Xiang wangx Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index be5cb3360b94..6aadcc0ecb5b 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1937,7 +1937,7 @@ enum netdev_ml_priv_type { * @udp_tunnel_nic: UDP tunnel offload state * @xdp_state: stores info on attached XDP BPF programs * - * @nested_level: Used as as a parameter of spin_lock_nested() of + * @nested_level: Used as a parameter of spin_lock_nested() of * dev->addr_list_lock. * @unlink_list: As netif_addr_lock() can be called recursively, * keep a list of interfaces to be deleted. -- cgit v1.2.3 From 8cbfe939abe905280279e84a297b1cb34e0d0ec9 Mon Sep 17 00:00:00 2001 From: Baowen Zheng Date: Fri, 17 Dec 2021 19:16:22 +0100 Subject: flow_offload: allow user to offload tc action to net device Use flow_indr_dev_register/flow_indr_dev_setup_offload to offload tc action. We need to call tc_cleanup_flow_action to clean up tc action entry since in tc_setup_action, some actions may hold dev refcnt, especially the mirror action. Signed-off-by: Baowen Zheng Signed-off-by: Louis Peens Signed-off-by: Simon Horman Signed-off-by: David S. Miller --- include/linux/netdevice.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index a419718612c6..8b0bdeb4734e 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -920,6 +920,7 @@ enum tc_setup_type { TC_SETUP_QDISC_TBF, TC_SETUP_QDISC_FIFO, TC_SETUP_QDISC_HTB, + TC_SETUP_ACT, }; /* These structures hold the attributes of bpf state that are being passed -- cgit v1.2.3 From d6c6d0bb2cb3af91c9c1546af7cdf4770d0df443 Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Wed, 29 Dec 2021 12:36:20 +0100 Subject: net: remove references to CONFIG_IRDA in network header files Commit d64c2a76123f ("staging: irda: remove the irda network stack and drivers") removes the config IRDA. Remove the remaining references to this non-existing config in the network header files. Signed-off-by: Lukas Bulwahn Link: https://lore.kernel.org/r/20211229113620.19368-1-lukas.bulwahn@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 2684bdb6defa..6f99c8f51b60 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2098,7 +2098,7 @@ struct net_device { #if IS_ENABLED(CONFIG_TIPC) struct tipc_bearer __rcu *tipc_ptr; #endif -#if IS_ENABLED(CONFIG_IRDA) || IS_ENABLED(CONFIG_ATALK) +#if IS_ENABLED(CONFIG_ATALK) void *atalk_ptr; #endif struct in_device __rcu *ip_ptr; -- cgit v1.2.3 From eac1b93c14d645ef147b049ace0d5230df755548 Mon Sep 17 00:00:00 2001 From: Coco Li Date: Wed, 5 Jan 2022 02:48:38 -0800 Subject: gro: add ability to control gro max packet size Eric Dumazet suggested to allow users to modify max GRO packet size. We have seen GRO being disabled by users of appliances (such as wifi access points) because of claimed bufferbloat issues, or some work arounds in sch_cake, to split GRO/GSO packets. Instead of disabling GRO completely, one can chose to limit the maximum packet size of GRO packets, depending on their latency constraints. This patch adds a per device gro_max_size attribute that can be changed with ip link command. ip link set dev eth0 gro_max_size 16000 Suggested-by: Eric Dumazet Signed-off-by: Coco Li Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 6f99c8f51b60..3213c7227b59 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1942,6 +1942,8 @@ enum netdev_ml_priv_type { * dev->addr_list_lock. * @unlink_list: As netif_addr_lock() can be called recursively, * keep a list of interfaces to be deleted. + * @gro_max_size: Maximum size of aggregated packet in generic + * receive offload (GRO) * * @dev_addr_shadow: Copy of @dev_addr to catch direct writes. * @linkwatch_dev_tracker: refcount tracker used by linkwatch. @@ -2131,6 +2133,8 @@ struct net_device { struct bpf_prog __rcu *xdp_prog; unsigned long gro_flush_timeout; int napi_defer_hard_irqs; +#define GRO_MAX_SIZE 65536 + unsigned int gro_max_size; rx_handler_func_t __rcu *rx_handler; void __rcu *rx_handler_data; @@ -4806,6 +4810,13 @@ static inline void netif_set_gso_max_segs(struct net_device *dev, WRITE_ONCE(dev->gso_max_segs, segs); } +static inline void netif_set_gro_max_size(struct net_device *dev, + unsigned int size) +{ + /* This pairs with the READ_ONCE() in skb_gro_receive() */ + WRITE_ONCE(dev->gro_max_size, size); +} + static inline void skb_gso_error_unwind(struct sk_buff *skb, __be16 protocol, int pulled_hlen, u16 mac_offset, int mac_len) -- cgit v1.2.3 From 47934e06b65637c88a762d9c98329ae6e3238888 Mon Sep 17 00:00:00 2001 From: Congyu Liu Date: Tue, 18 Jan 2022 14:20:13 -0500 Subject: net: fix information leakage in /proc/net/ptype In one net namespace, after creating a packet socket without binding it to a device, users in other net namespaces can observe the new `packet_type` added by this packet socket by reading `/proc/net/ptype` file. This is minor information leakage as packet socket is namespace aware. Add a net pointer in `packet_type` to keep the net namespace of of corresponding packet socket. In `ptype_seq_show`, this net pointer must be checked when it is not NULL. Fixes: 2feb27dbe00c ("[NETNS]: Minor information leak via /proc/net/ptype file.") Signed-off-by: Congyu Liu Signed-off-by: David S. Miller --- include/linux/netdevice.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 3213c7227b59..e490b84732d1 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2548,6 +2548,7 @@ struct packet_type { struct net_device *); bool (*id_match)(struct packet_type *ptype, struct sock *sk); + struct net *af_packet_net; void *af_packet_priv; struct list_head list; }; -- cgit v1.2.3