From b4c2b9593a1c4c3a718370e34af28e817fd5e5c6 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Fri, 14 Jan 2022 22:09:49 +0530 Subject: net/netfilter: Add unstable CT lookup helpers for XDP and TC-BPF This change adds conntrack lookup helpers using the unstable kfunc call interface for the XDP and TC-BPF hooks. The primary usecase is implementing a synproxy in XDP, see Maxim's patchset [0]. Export get_net_ns_by_id as nf_conntrack_bpf.c needs to call it. This object is only built when CONFIG_DEBUG_INFO_BTF_MODULES is enabled. [0]: https://lore.kernel.org/bpf/20211019144655.3483197-1-maximmi@nvidia.com Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20220114163953.1455836-7-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/net/netfilter/nf_conntrack_bpf.h | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 include/net/netfilter/nf_conntrack_bpf.h (limited to 'include/net') diff --git a/include/net/netfilter/nf_conntrack_bpf.h b/include/net/netfilter/nf_conntrack_bpf.h new file mode 100644 index 000000000000..a473b56842c5 --- /dev/null +++ b/include/net/netfilter/nf_conntrack_bpf.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _NF_CONNTRACK_BPF_H +#define _NF_CONNTRACK_BPF_H + +#include +#include + +#if (IS_BUILTIN(CONFIG_NF_CONNTRACK) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) || \ + (IS_MODULE(CONFIG_NF_CONNTRACK) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES)) + +extern int register_nf_conntrack_bpf(void); + +#else + +static inline int register_nf_conntrack_bpf(void) +{ + return 0; +} + +#endif + +#endif /* _NF_CONNTRACK_BPF_H */ -- cgit v1.2.3 From 3368aa357f3ba133ae65fc26c04d24a1447a3903 Mon Sep 17 00:00:00 2001 From: Manish Mandlik Date: Tue, 11 Jan 2022 08:14:25 -0800 Subject: Bluetooth: msft: Handle MSFT Monitor Device Event Whenever the controller starts/stops monitoring a bt device, it sends MSFT Monitor Device event. Add handler to read this vendor event. Test performed: - Verified by logs that the MSFT Monitor Device event is received from the controller whenever it starts/stops monitoring a device. Signed-off-by: Manish Mandlik Reviewed-by: Miao-chen Chou Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci_core.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/net') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 586f69d084a2..639fb9f57ae7 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -258,6 +258,15 @@ struct adv_info { #define HCI_ADV_TX_POWER_NO_PREFERENCE 0x7F +struct monitored_device { + struct list_head list; + + bdaddr_t bdaddr; + __u8 addr_type; + __u16 handle; + bool notified; +}; + struct adv_pattern { struct list_head list; __u8 ad_type; @@ -591,6 +600,8 @@ struct hci_dev { struct delayed_work interleave_scan; + struct list_head monitored_devices; + #if IS_ENABLED(CONFIG_BT_LEDS) struct led_trigger *power_led; #endif -- cgit v1.2.3 From 8d7f167752c3e4c45a39e76ffa6f7209413d3fa6 Mon Sep 17 00:00:00 2001 From: Manish Mandlik Date: Tue, 11 Jan 2022 08:14:26 -0800 Subject: Bluetooth: mgmt: Add MGMT Adv Monitor Device Found/Lost events This patch introduces two new MGMT events for notifying the bluetoothd whenever the controller starts/stops monitoring a device. Test performed: - Verified by logs that the MSFT Monitor Device is received from the controller and the bluetoothd is notified whenever the controller starts/stops monitoring a device. Signed-off-by: Manish Mandlik Reviewed-by: Miao-chen Chou Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci_core.h | 3 +++ include/net/bluetooth/mgmt.h | 16 ++++++++++++++++ 2 files changed, 19 insertions(+) (limited to 'include/net') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 639fb9f57ae7..21eadb113a31 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -601,6 +601,7 @@ struct hci_dev { struct delayed_work interleave_scan; struct list_head monitored_devices; + bool advmon_pend_notify; #if IS_ENABLED(CONFIG_BT_LEDS) struct led_trigger *power_led; @@ -1858,6 +1859,8 @@ void mgmt_adv_monitor_removed(struct hci_dev *hdev, u16 handle); int mgmt_phy_configuration_changed(struct hci_dev *hdev, struct sock *skip); int mgmt_add_adv_patterns_monitor_complete(struct hci_dev *hdev, u8 status); int mgmt_remove_adv_monitor_complete(struct hci_dev *hdev, u8 status); +void mgmt_adv_monitor_device_lost(struct hci_dev *hdev, u16 handle, + bdaddr_t *bdaddr, u8 addr_type); u8 hci_le_conn_update(struct hci_conn *conn, u16 min, u16 max, u16 latency, u16 to_multiplier); diff --git a/include/net/bluetooth/mgmt.h b/include/net/bluetooth/mgmt.h index 107b25deae68..99266f7aebdc 100644 --- a/include/net/bluetooth/mgmt.h +++ b/include/net/bluetooth/mgmt.h @@ -1104,3 +1104,19 @@ struct mgmt_ev_controller_resume { #define MGMT_WAKE_REASON_NON_BT_WAKE 0x0 #define MGMT_WAKE_REASON_UNEXPECTED 0x1 #define MGMT_WAKE_REASON_REMOTE_WAKE 0x2 + +#define MGMT_EV_ADV_MONITOR_DEVICE_FOUND 0x002f +struct mgmt_ev_adv_monitor_device_found { + __le16 monitor_handle; + struct mgmt_addr_info addr; + __s8 rssi; + __le32 flags; + __le16 eir_len; + __u8 eir[0]; +} __packed; + +#define MGMT_EV_ADV_MONITOR_DEVICE_LOST 0x0030 +struct mgmt_ev_adv_monitor_device_lost { + __le16 monitor_handle; + struct mgmt_addr_info addr; +} __packed; -- cgit v1.2.3 From 48cec899e357cfb92d022a9c0df6bbe72a7f6951 Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Thu, 20 Jan 2022 14:34:40 +0200 Subject: tcp: Add a stub for sk_defer_free_flush() When compiling the kernel with CONFIG_INET disabled, the sk_defer_free_flush() should be defined as a nop. This resolves the following compilation error: ld: net/core/sock.o: in function `sk_defer_free_flush': ./include/net/tcp.h:1378: undefined reference to `__sk_defer_free_flush' Fixes: 79074a72d335 ("net: Flush deferred skb free on socket destroy") Reported-by: kernel test robot Reviewed-by: Tariq Toukan Signed-off-by: Gal Pressman Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20220120123440.9088-1-gal@nvidia.com Signed-off-by: Jakub Kicinski --- include/net/tcp.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/net') diff --git a/include/net/tcp.h b/include/net/tcp.h index 44e442bf23f9..b9fc978fb2ca 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1369,6 +1369,7 @@ static inline bool tcp_checksum_complete(struct sk_buff *skb) bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb); +#ifdef CONFIG_INET void __sk_defer_free_flush(struct sock *sk); static inline void sk_defer_free_flush(struct sock *sk) @@ -1377,6 +1378,9 @@ static inline void sk_defer_free_flush(struct sock *sk) return; __sk_defer_free_flush(sk); } +#else +static inline void sk_defer_free_flush(struct sock *sk) {} +#endif int tcp_filter(struct sock *sk, struct sk_buff *skb); void tcp_set_state(struct sock *sk, int state); -- cgit v1.2.3 From aafc2e3285c2d7a79b7ee15221c19fbeca7b1509 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 20 Jan 2022 09:41:12 -0800 Subject: ipv6: annotate accesses to fn->fn_sernum struct fib6_node's fn_sernum field can be read while other threads change it. Add READ_ONCE()/WRITE_ONCE() annotations. Do not change existing smp barriers in fib6_get_cookie_safe() and __fib6_update_sernum_upto_root() syzbot reported: BUG: KCSAN: data-race in fib6_clean_node / inet6_csk_route_socket write to 0xffff88813df62e2c of 4 bytes by task 1920 on cpu 1: fib6_clean_node+0xc2/0x260 net/ipv6/ip6_fib.c:2178 fib6_walk_continue+0x38e/0x430 net/ipv6/ip6_fib.c:2112 fib6_walk net/ipv6/ip6_fib.c:2160 [inline] fib6_clean_tree net/ipv6/ip6_fib.c:2240 [inline] __fib6_clean_all+0x1a9/0x2e0 net/ipv6/ip6_fib.c:2256 fib6_flush_trees+0x6c/0x80 net/ipv6/ip6_fib.c:2281 rt_genid_bump_ipv6 include/net/net_namespace.h:488 [inline] addrconf_dad_completed+0x57f/0x870 net/ipv6/addrconf.c:4230 addrconf_dad_work+0x908/0x1170 process_one_work+0x3f6/0x960 kernel/workqueue.c:2307 worker_thread+0x616/0xa70 kernel/workqueue.c:2454 kthread+0x1bf/0x1e0 kernel/kthread.c:359 ret_from_fork+0x1f/0x30 read to 0xffff88813df62e2c of 4 bytes by task 15701 on cpu 0: fib6_get_cookie_safe include/net/ip6_fib.h:285 [inline] rt6_get_cookie include/net/ip6_fib.h:306 [inline] ip6_dst_store include/net/ip6_route.h:234 [inline] inet6_csk_route_socket+0x352/0x3c0 net/ipv6/inet6_connection_sock.c:109 inet6_csk_xmit+0x91/0x1e0 net/ipv6/inet6_connection_sock.c:121 __tcp_transmit_skb+0x1323/0x1840 net/ipv4/tcp_output.c:1402 tcp_transmit_skb net/ipv4/tcp_output.c:1420 [inline] tcp_write_xmit+0x1450/0x4460 net/ipv4/tcp_output.c:2680 __tcp_push_pending_frames+0x68/0x1c0 net/ipv4/tcp_output.c:2864 tcp_push+0x2d9/0x2f0 net/ipv4/tcp.c:725 mptcp_push_release net/mptcp/protocol.c:1491 [inline] __mptcp_push_pending+0x46c/0x490 net/mptcp/protocol.c:1578 mptcp_sendmsg+0x9ec/0xa50 net/mptcp/protocol.c:1764 inet6_sendmsg+0x5f/0x80 net/ipv6/af_inet6.c:643 sock_sendmsg_nosec net/socket.c:705 [inline] sock_sendmsg net/socket.c:725 [inline] kernel_sendmsg+0x97/0xd0 net/socket.c:745 sock_no_sendpage+0x84/0xb0 net/core/sock.c:3086 inet_sendpage+0x9d/0xc0 net/ipv4/af_inet.c:834 kernel_sendpage+0x187/0x200 net/socket.c:3492 sock_sendpage+0x5a/0x70 net/socket.c:1007 pipe_to_sendpage+0x128/0x160 fs/splice.c:364 splice_from_pipe_feed fs/splice.c:418 [inline] __splice_from_pipe+0x207/0x500 fs/splice.c:562 splice_from_pipe fs/splice.c:597 [inline] generic_splice_sendpage+0x94/0xd0 fs/splice.c:746 do_splice_from fs/splice.c:767 [inline] direct_splice_actor+0x80/0xa0 fs/splice.c:936 splice_direct_to_actor+0x345/0x650 fs/splice.c:891 do_splice_direct+0x106/0x190 fs/splice.c:979 do_sendfile+0x675/0xc40 fs/read_write.c:1245 __do_sys_sendfile64 fs/read_write.c:1310 [inline] __se_sys_sendfile64 fs/read_write.c:1296 [inline] __x64_sys_sendfile64+0x102/0x140 fs/read_write.c:1296 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x44/0xd0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae value changed: 0x0000026f -> 0x00000271 Reported by Kernel Concurrency Sanitizer on: CPU: 0 PID: 15701 Comm: syz-executor.2 Not tainted 5.16.0-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 The Fixes tag I chose is probably arbitrary, I do not think we need to backport this patch to older kernels. Fixes: c5cff8561d2d ("ipv6: add rcu grace period before freeing fib6_node") Signed-off-by: Eric Dumazet Reported-by: syzbot Link: https://lore.kernel.org/r/20220120174112.1126644-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski --- include/net/ip6_fib.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index a9a4ccc0cdb5..40ae8f1b18e5 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -282,7 +282,7 @@ static inline bool fib6_get_cookie_safe(const struct fib6_info *f6i, fn = rcu_dereference(f6i->fib6_node); if (fn) { - *cookie = fn->fn_sernum; + *cookie = READ_ONCE(fn->fn_sernum); /* pairs with smp_wmb() in __fib6_update_sernum_upto_root() */ smp_rmb(); status = true; -- cgit v1.2.3 From 2e88d4ff03013937028f5397268b21e10cf68713 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Fri, 21 Jan 2022 11:09:45 +0100 Subject: xdp: introduce flags field in xdp_buff/xdp_frame Introduce flags field in xdp_frame and xdp_buffer data structures to define additional buffer features. At the moment the only supported buffer feature is frags bit (XDP_FLAGS_HAS_FRAGS). frags bit is used to specify if this is a linear buffer (XDP_FLAGS_HAS_FRAGS not set) or a frags frame (XDP_FLAGS_HAS_FRAGS set). In the latter case the driver is expected to initialize the skb_shared_info structure at the end of the first buffer to link together subsequent buffers belonging to the same frame. Acked-by: Toke Hoiland-Jorgensen Acked-by: John Fastabend Acked-by: Jesper Dangaard Brouer Signed-off-by: Lorenzo Bianconi Link: https://lore.kernel.org/r/e389f14f3a162c0a5bc6a2e1aa8dd01a90be117d.1642758637.git.lorenzo@kernel.org Signed-off-by: Alexei Starovoitov --- include/net/xdp.h | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) (limited to 'include/net') diff --git a/include/net/xdp.h b/include/net/xdp.h index 8f0812e4996d..485e9495a690 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -66,6 +66,10 @@ struct xdp_txq_info { struct net_device *dev; }; +enum xdp_buff_flags { + XDP_FLAGS_HAS_FRAGS = BIT(0), /* non-linear xdp buff */ +}; + struct xdp_buff { void *data; void *data_end; @@ -74,13 +78,30 @@ struct xdp_buff { struct xdp_rxq_info *rxq; struct xdp_txq_info *txq; u32 frame_sz; /* frame size to deduce data_hard_end/reserved tailroom*/ + u32 flags; /* supported values defined in xdp_buff_flags */ }; +static __always_inline bool xdp_buff_has_frags(struct xdp_buff *xdp) +{ + return !!(xdp->flags & XDP_FLAGS_HAS_FRAGS); +} + +static __always_inline void xdp_buff_set_frags_flag(struct xdp_buff *xdp) +{ + xdp->flags |= XDP_FLAGS_HAS_FRAGS; +} + +static __always_inline void xdp_buff_clear_frags_flag(struct xdp_buff *xdp) +{ + xdp->flags &= ~XDP_FLAGS_HAS_FRAGS; +} + static __always_inline void xdp_init_buff(struct xdp_buff *xdp, u32 frame_sz, struct xdp_rxq_info *rxq) { xdp->frame_sz = frame_sz; xdp->rxq = rxq; + xdp->flags = 0; } static __always_inline void @@ -122,8 +143,14 @@ struct xdp_frame { */ struct xdp_mem_info mem; struct net_device *dev_rx; /* used by cpumap */ + u32 flags; /* supported values defined in xdp_buff_flags */ }; +static __always_inline bool xdp_frame_has_frags(struct xdp_frame *frame) +{ + return !!(frame->flags & XDP_FLAGS_HAS_FRAGS); +} + #define XDP_BULK_QUEUE_SIZE 16 struct xdp_frame_bulk { int count; @@ -180,6 +207,7 @@ void xdp_convert_frame_to_buff(struct xdp_frame *frame, struct xdp_buff *xdp) xdp->data_end = frame->data + frame->len; xdp->data_meta = frame->data - frame->metasize; xdp->frame_sz = frame->frame_sz; + xdp->flags = frame->flags; } static inline @@ -206,6 +234,7 @@ int xdp_update_frame_from_buff(struct xdp_buff *xdp, xdp_frame->headroom = headroom - sizeof(*xdp_frame); xdp_frame->metasize = metasize; xdp_frame->frame_sz = xdp->frame_sz; + xdp_frame->flags = xdp->flags; return 0; } -- cgit v1.2.3 From d65a1906b31246492449eafe9cace188cb59e26c Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Fri, 21 Jan 2022 11:09:48 +0100 Subject: net: xdp: add xdp_update_skb_shared_info utility routine Introduce xdp_update_skb_shared_info routine to update frags array metadata in skb_shared_info data structure converting to a skb from a xdp_buff or xdp_frame. According to the current skb_shared_info architecture in xdp_frame/xdp_buff and to the xdp frags support, there is no need to run skb_add_rx_frag() and reset frags array converting the buffer to a skb since the frag array will be in the same position for xdp_buff/xdp_frame and for the skb, we just need to update memory metadata. Introduce XDP_FLAGS_PF_MEMALLOC flag in xdp_buff_flags in order to mark the xdp_buff or xdp_frame as under memory-pressure if pages of the frags array are under memory pressure. Doing so we can avoid looping over all fragments in xdp_update_skb_shared_info routine. The driver is expected to set the flag constructing the xdp_buffer using xdp_buff_set_frag_pfmemalloc utility routine. Rely on xdp_update_skb_shared_info in __xdp_build_skb_from_frame routine converting the non-linear xdp_frame to a skb after performing a XDP_REDIRECT. Acked-by: Toke Hoiland-Jorgensen Acked-by: John Fastabend Acked-by: Jesper Dangaard Brouer Signed-off-by: Lorenzo Bianconi Link: https://lore.kernel.org/r/bfd23fb8a8d7438724f7819c567cdf99ffd6226f.1642758637.git.lorenzo@kernel.org Signed-off-by: Alexei Starovoitov --- include/net/xdp.h | 33 ++++++++++++++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/xdp.h b/include/net/xdp.h index 485e9495a690..1f8641ec658e 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -67,7 +67,10 @@ struct xdp_txq_info { }; enum xdp_buff_flags { - XDP_FLAGS_HAS_FRAGS = BIT(0), /* non-linear xdp buff */ + XDP_FLAGS_HAS_FRAGS = BIT(0), /* non-linear xdp buff */ + XDP_FLAGS_FRAGS_PF_MEMALLOC = BIT(1), /* xdp paged memory is under + * pressure + */ }; struct xdp_buff { @@ -96,6 +99,16 @@ static __always_inline void xdp_buff_clear_frags_flag(struct xdp_buff *xdp) xdp->flags &= ~XDP_FLAGS_HAS_FRAGS; } +static __always_inline bool xdp_buff_is_frag_pfmemalloc(struct xdp_buff *xdp) +{ + return !!(xdp->flags & XDP_FLAGS_FRAGS_PF_MEMALLOC); +} + +static __always_inline void xdp_buff_set_frag_pfmemalloc(struct xdp_buff *xdp) +{ + xdp->flags |= XDP_FLAGS_FRAGS_PF_MEMALLOC; +} + static __always_inline void xdp_init_buff(struct xdp_buff *xdp, u32 frame_sz, struct xdp_rxq_info *rxq) { @@ -151,6 +164,11 @@ static __always_inline bool xdp_frame_has_frags(struct xdp_frame *frame) return !!(frame->flags & XDP_FLAGS_HAS_FRAGS); } +static __always_inline bool xdp_frame_is_frag_pfmemalloc(struct xdp_frame *frame) +{ + return !!(frame->flags & XDP_FLAGS_FRAGS_PF_MEMALLOC); +} + #define XDP_BULK_QUEUE_SIZE 16 struct xdp_frame_bulk { int count; @@ -186,6 +204,19 @@ static inline void xdp_scrub_frame(struct xdp_frame *frame) frame->dev_rx = NULL; } +static inline void +xdp_update_skb_shared_info(struct sk_buff *skb, u8 nr_frags, + unsigned int size, unsigned int truesize, + bool pfmemalloc) +{ + skb_shinfo(skb)->nr_frags = nr_frags; + + skb->len += size; + skb->data_len += size; + skb->truesize += truesize; + skb->pfmemalloc |= pfmemalloc; +} + /* Avoids inlining WARN macro in fast-path */ void xdp_warn(const char *msg, const char *func, const int line); #define XDP_WARN(msg) xdp_warn(msg, __func__, __LINE__) -- cgit v1.2.3 From 7c48cb0176c6d6d3b55029f7ff4ffa05faee6446 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Fri, 21 Jan 2022 11:09:50 +0100 Subject: xdp: add frags support to xdp_return_{buff/frame} Take into account if the received xdp_buff/xdp_frame is non-linear recycling/returning the frame memory to the allocator or into xdp_frame_bulk. Acked-by: Toke Hoiland-Jorgensen Acked-by: John Fastabend Signed-off-by: Lorenzo Bianconi Link: https://lore.kernel.org/r/a961069febc868508ce1bdf5e53a343eb4e57cb2.1642758637.git.lorenzo@kernel.org Signed-off-by: Alexei Starovoitov --- include/net/xdp.h | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/xdp.h b/include/net/xdp.h index 1f8641ec658e..8463dea8b4db 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -306,10 +306,24 @@ void __xdp_release_frame(void *data, struct xdp_mem_info *mem); static inline void xdp_release_frame(struct xdp_frame *xdpf) { struct xdp_mem_info *mem = &xdpf->mem; + struct skb_shared_info *sinfo; + int i; /* Curr only page_pool needs this */ - if (mem->type == MEM_TYPE_PAGE_POOL) - __xdp_release_frame(xdpf->data, mem); + if (mem->type != MEM_TYPE_PAGE_POOL) + return; + + if (likely(!xdp_frame_has_frags(xdpf))) + goto out; + + sinfo = xdp_get_shared_info_from_frame(xdpf); + for (i = 0; i < sinfo->nr_frags; i++) { + struct page *page = skb_frag_page(&sinfo->frags[i]); + + __xdp_release_frame(page_address(page), mem); + } +out: + __xdp_release_frame(xdpf->data, mem); } int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq, -- cgit v1.2.3 From 0165cc817075cf701e4289838f1d925ff1911b3e Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Fri, 21 Jan 2022 11:09:54 +0100 Subject: bpf: introduce bpf_xdp_get_buff_len helper Introduce bpf_xdp_get_buff_len helper in order to return the xdp buffer total size (linear and paged area) Acked-by: Toke Hoiland-Jorgensen Acked-by: John Fastabend Signed-off-by: Lorenzo Bianconi Link: https://lore.kernel.org/r/aac9ac3504c84026cf66a3c71b7c5ae89bc991be.1642758637.git.lorenzo@kernel.org Signed-off-by: Alexei Starovoitov --- include/net/xdp.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/net') diff --git a/include/net/xdp.h b/include/net/xdp.h index 8463dea8b4db..52b593321956 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -145,6 +145,20 @@ xdp_get_shared_info_from_buff(struct xdp_buff *xdp) return (struct skb_shared_info *)xdp_data_hard_end(xdp); } +static __always_inline unsigned int xdp_get_buff_len(struct xdp_buff *xdp) +{ + unsigned int len = xdp->data_end - xdp->data; + struct skb_shared_info *sinfo; + + if (likely(!xdp_buff_has_frags(xdp))) + goto out; + + sinfo = xdp_get_shared_info_from_buff(xdp); + len += sinfo->xdp_frags_size; +out: + return len; +} + struct xdp_frame { void *data; u16 len; -- cgit v1.2.3 From bf25146a5595269810b1f47d048f114c5ff9f544 Mon Sep 17 00:00:00 2001 From: Eelco Chaudron Date: Fri, 21 Jan 2022 11:09:55 +0100 Subject: bpf: add frags support to the bpf_xdp_adjust_tail() API This change adds support for tail growing and shrinking for XDP frags. When called on a non-linear packet with a grow request, it will work on the last fragment of the packet. So the maximum grow size is the last fragments tailroom, i.e. no new buffer will be allocated. A XDP frags capable driver is expected to set frag_size in xdp_rxq_info data structure to notify the XDP core the fragment size. frag_size set to 0 is interpreted by the XDP core as tail growing is not allowed. Introduce __xdp_rxq_info_reg utility routine to initialize frag_size field. When shrinking, it will work from the last fragment, all the way down to the base buffer depending on the shrinking size. It's important to mention that once you shrink down the fragment(s) are freed, so you can not grow again to the original size. Acked-by: Toke Hoiland-Jorgensen Acked-by: John Fastabend Acked-by: Jakub Kicinski Co-developed-by: Lorenzo Bianconi Signed-off-by: Lorenzo Bianconi Signed-off-by: Eelco Chaudron Link: https://lore.kernel.org/r/eabda3485dda4f2f158b477729337327e609461d.1642758637.git.lorenzo@kernel.org Signed-off-by: Alexei Starovoitov --- include/net/xdp.h | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/xdp.h b/include/net/xdp.h index 52b593321956..b7721c3e4d1f 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -60,6 +60,7 @@ struct xdp_rxq_info { u32 reg_state; struct xdp_mem_info mem; unsigned int napi_id; + u32 frag_size; } ____cacheline_aligned; /* perf critical, avoid false-sharing */ struct xdp_txq_info { @@ -304,6 +305,8 @@ struct xdp_frame *xdp_convert_buff_to_frame(struct xdp_buff *xdp) return xdp_frame; } +void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct, + struct xdp_buff *xdp); void xdp_return_frame(struct xdp_frame *xdpf); void xdp_return_frame_rx_napi(struct xdp_frame *xdpf); void xdp_return_buff(struct xdp_buff *xdp); @@ -340,8 +343,17 @@ out: __xdp_release_frame(xdpf->data, mem); } -int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq, - struct net_device *dev, u32 queue_index, unsigned int napi_id); +int __xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq, + struct net_device *dev, u32 queue_index, + unsigned int napi_id, u32 frag_size); +static inline int +xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq, + struct net_device *dev, u32 queue_index, + unsigned int napi_id) +{ + return __xdp_rxq_info_reg(xdp_rxq, dev, queue_index, napi_id, 0); +} + void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq); void xdp_rxq_info_unused(struct xdp_rxq_info *xdp_rxq); bool xdp_rxq_info_is_reg(struct xdp_rxq_info *xdp_rxq); -- cgit v1.2.3 From aa6034678e873db8bd5c5a4b73f8b88c469374d6 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Fri, 21 Jan 2022 16:25:18 +0800 Subject: bonding: use rcu_dereference_rtnl when get bonding active slave bond_option_active_slave_get_rcu() should not be used in rtnl_mutex as it use rcu_dereference(). Replace to rcu_dereference_rtnl() so we also can use this function in rtnl protected context. With this update, we can rmeove the rcu_read_lock/unlock in bonding .ndo_eth_ioctl and .get_ts_info. Reported-by: Vladimir Oltean Fixes: 94dd016ae538 ("bond: pass get_ts_info and SIOC[SG]HWTSTAMP ioctl to active device") Signed-off-by: Hangbin Liu Signed-off-by: David S. Miller --- include/net/bonding.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/bonding.h b/include/net/bonding.h index f6ae3a4baea4..83cfd2d70247 100644 --- a/include/net/bonding.h +++ b/include/net/bonding.h @@ -346,7 +346,7 @@ static inline bool bond_uses_primary(struct bonding *bond) static inline struct net_device *bond_option_active_slave_get_rcu(struct bonding *bond) { - struct slave *slave = rcu_dereference(bond->curr_active_slave); + struct slave *slave = rcu_dereference_rtnl(bond->curr_active_slave); return bond_uses_primary(bond) && slave ? slave->dev : NULL; } -- cgit v1.2.3 From d5ebaa7c5f6f688959e8d40840b2249ede63b8ed Mon Sep 17 00:00:00 2001 From: Soenke Huster Date: Sun, 23 Jan 2022 15:06:24 +0100 Subject: Bluetooth: hci_event: Ignore multiple conn complete events When one of the three connection complete events is received multiple times for the same handle, the device is registered multiple times which leads to memory corruptions. Therefore, consequent events for a single connection are ignored. The conn->state can hold different values, therefore HCI_CONN_HANDLE_UNSET is introduced to identify new connections. To make sure the events do not contain this or another invalid handle HCI_CONN_HANDLE_MAX and checks are introduced. Buglink: https://bugzilla.kernel.org/show_bug.cgi?id=215497 Signed-off-by: Soenke Huster Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci_core.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/net') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 21eadb113a31..f5caff1ddb29 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -303,6 +303,9 @@ struct adv_monitor { #define HCI_MAX_SHORT_NAME_LENGTH 10 +#define HCI_CONN_HANDLE_UNSET 0xffff +#define HCI_CONN_HANDLE_MAX 0x0eff + /* Min encryption key size to match with SMP */ #define HCI_MIN_ENC_KEY_SIZE 7 -- cgit v1.2.3 From d507204d3c5cc57d9a8bdf0a477615bb59ea1611 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 24 Jan 2022 12:24:52 -0800 Subject: tcp/dccp: add tw->tw_bslot We want to allow inet_twsk_kill() working even if netns has been dismantled/freed, to get rid of inet_twsk_purge(). This patch adds tw->tw_bslot to cache the bind bucket slot so that inet_twsk_kill() no longer needs to dereference twsk_net(tw) Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/inet_timewait_sock.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h index dfd919b3119e..c221fe2b77dd 100644 --- a/include/net/inet_timewait_sock.h +++ b/include/net/inet_timewait_sock.h @@ -72,6 +72,7 @@ struct inet_timewait_sock { tw_tos : 8; u32 tw_txhash; u32 tw_priority; + u32 tw_bslot; /* bind bucket slot */ struct timer_list tw_timer; struct inet_bind_bucket *tw_tb; }; -- cgit v1.2.3 From 27dd35e02235902933626469a1c9198612d72564 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 24 Jan 2022 12:24:53 -0800 Subject: tcp/dccp: no longer use twsk_net(tw) from tw_timer_handler() We will soon get rid of inet_twsk_purge(). This means that tw_timer_handler() might fire after a netns has been dismantled/freed. Instead of adding a function (and data structure) to find a netns from tw->tw_net_cookie, just update the SNMP counters a bit earlier, when the netns is known to be alive. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/inet_timewait_sock.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/net') diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h index c221fe2b77dd..b323db969b8b 100644 --- a/include/net/inet_timewait_sock.h +++ b/include/net/inet_timewait_sock.h @@ -65,10 +65,9 @@ struct inet_timewait_sock { /* these three are in inet_sock */ __be16 tw_sport; /* And these are ours. */ - unsigned int tw_kill : 1, - tw_transparent : 1, + unsigned int tw_transparent : 1, tw_flowlabel : 20, - tw_pad : 2, /* 2 bits hole */ + tw_pad : 3, /* 3 bits hole */ tw_tos : 8; u32 tw_txhash; u32 tw_priority; -- cgit v1.2.3 From 0dad4087a86a2cbe177404dc73f18ada26a2c390 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 24 Jan 2022 12:24:54 -0800 Subject: tcp/dccp: get rid of inet_twsk_purge() Prior patches in the series made sure tw_timer_handler() can be fired after netns has been dismantled/freed. We no longer have to scan a potentially big TCP ehash table at netns dismantle. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/inet_timewait_sock.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/net') diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h index b323db969b8b..463ae5d33eb0 100644 --- a/include/net/inet_timewait_sock.h +++ b/include/net/inet_timewait_sock.h @@ -110,8 +110,6 @@ static inline void inet_twsk_reschedule(struct inet_timewait_sock *tw, int timeo void inet_twsk_deschedule_put(struct inet_timewait_sock *tw); -void inet_twsk_purge(struct inet_hashinfo *hashinfo, int family); - static inline struct net *twsk_net(const struct inet_timewait_sock *twsk) { -- cgit v1.2.3 From a15c89c703d43490ea68ea4516553d4ea4f6b1e1 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 24 Jan 2022 12:24:55 -0800 Subject: ipv4: do not use per netns icmp sockets Back in linux-2.6.25 (commit 4a6ad7a141cb "[NETNS]: Make icmp_sk per namespace."), we added private per-cpu/per-netns ipv4 icmp sockets. This adds memory and cpu costs, which do not seem needed. Now typical servers have 256 or more cores, this adds considerable tax to netns users. icmp sockets are used from BH context, are not receiving packets, and do not store any persistent state but the 'struct net' pointer. icmp_xmit_lock() already makes sure to lock the chosen per-cpu socket. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/net') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 78557643526e..639a31638159 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -70,7 +70,6 @@ struct netns_ipv4 { struct hlist_head *fib_table_hash; struct sock *fibnl; - struct sock * __percpu *icmp_sk; struct sock *mc_autojoin_sk; struct inet_peer_base *peers; -- cgit v1.2.3 From 6a17b961ec19cd61ca646a6655ab93e8f6fe15c0 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 24 Jan 2022 12:24:56 -0800 Subject: ipv6: do not use per netns icmp sockets Back in linux-2.6.25 (commit 98c6d1b261e7 "[NETNS]: Make icmpv6_sk per namespace.", we added private per-cpu/per-netns ipv6 icmp sockets. This adds memory and cpu costs, which do not seem needed. Now typical servers have 256 or more cores, this adds considerable tax to netns users. icmp sockets are used from BH context, are not receiving packets, and do not store any persistent state but the 'struct net' pointer. icmpv6_xmit_lock() already makes sure to lock the chosen per-cpu socket. This patch has a considerable impact on the number of netns that the worker thread in cleanup_net() can dismantle per second, because ip6mr_sk_done() is no longer called, meaning we no longer acquire the rtnl mutex, competing with other threads adding new netns. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/netns/ipv6.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/net') diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h index a4b550380316..30cdfc4e1615 100644 --- a/include/net/netns/ipv6.h +++ b/include/net/netns/ipv6.h @@ -88,7 +88,6 @@ struct netns_ipv6 { struct fib6_table *fib6_local_tbl; struct fib_rules_ops *fib6_rules_ops; #endif - struct sock * __percpu *icmp_sk; struct sock *ndisc_sk; struct sock *tcp_sk; struct sock *igmp_sk; -- cgit v1.2.3 From 37ba017dcc3b1123206808979834655ddcf93251 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 24 Jan 2022 12:24:57 -0800 Subject: ipv4/tcp: do not use per netns ctl sockets TCP ipv4 uses per-cpu/per-netns ctl sockets in order to send RST and some ACK packets (on behalf of TIMEWAIT sockets). This adds memory and cpu costs, which do not seem needed. Now typical servers have 256 or more cores, this adds considerable tax to netns users. tcp sockets are used from BH context, are not receiving packets, and do not store any persistent state but the 'struct net' pointer in order to be able to use IPv4 output functions. Note that I attempted a related change in the past, that had to be hot-fixed in commit bdbbb8527b6f ("ipv4: tcp: get rid of ugly unicast_sock") This patch could very well surface old bugs, on layers not taking care of sk->sk_kern_sock properly. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/net') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 639a31638159..22b4c6df1d2b 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -73,7 +73,6 @@ struct netns_ipv4 { struct sock *mc_autojoin_sk; struct inet_peer_base *peers; - struct sock * __percpu *tcp_sk; struct fqdir *fqdir; u8 sysctl_icmp_echo_ignore_all; -- cgit v1.2.3 From c1aca3080e382886e2e58e809787441984a2f89b Mon Sep 17 00:00:00 2001 From: Yan Yan Date: Tue, 18 Jan 2022 16:00:13 -0800 Subject: xfrm: Check if_id in xfrm_migrate This patch enables distinguishing SAs and SPs based on if_id during the xfrm_migrate flow. This ensures support for xfrm interfaces throughout the SA/SP lifecycle. When there are multiple existing SPs with the same direction, the same xfrm_selector and different endpoint addresses, xfrm_migrate might fail with ENODATA. Specifically, the code path for performing xfrm_migrate is: Stage 1: find policy to migrate with xfrm_migrate_policy_find(sel, dir, type, net) Stage 2: find and update state(s) with xfrm_migrate_state_find(mp, net) Stage 3: update endpoint address(es) of template(s) with xfrm_policy_migrate(pol, m, num_migrate) Currently "Stage 1" always returns the first xfrm_policy that matches, and "Stage 3" looks for the xfrm_tmpl that matches the old endpoint address. Thus if there are multiple xfrm_policy with same selector, direction, type and net, "Stage 1" might rertun a wrong xfrm_policy and "Stage 3" will fail with ENODATA because it cannot find a xfrm_tmpl with the matching endpoint address. The fix is to allow userspace to pass an if_id and add if_id to the matching rule in Stage 1 and Stage 2 since if_id is a unique ID for xfrm_policy and xfrm_state. For compatibility, if_id will only be checked if the attribute is set. Tested with additions to Android's kernel unit test suite: https://android-review.googlesource.com/c/kernel/tests/+/1668886 Signed-off-by: Yan Yan Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index fdb41e8bb626..743dd1da506e 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1681,14 +1681,15 @@ int km_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, const struct xfrm_migrate *m, int num_bundles, const struct xfrm_kmaddress *k, const struct xfrm_encap_tmpl *encap); -struct xfrm_state *xfrm_migrate_state_find(struct xfrm_migrate *m, struct net *net); +struct xfrm_state *xfrm_migrate_state_find(struct xfrm_migrate *m, struct net *net, + u32 if_id); struct xfrm_state *xfrm_state_migrate(struct xfrm_state *x, struct xfrm_migrate *m, struct xfrm_encap_tmpl *encap); int xfrm_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, struct xfrm_migrate *m, int num_bundles, struct xfrm_kmaddress *k, struct net *net, - struct xfrm_encap_tmpl *encap); + struct xfrm_encap_tmpl *encap, u32 if_id); #endif int km_new_mapping(struct xfrm_state *x, xfrm_address_t *ipaddr, __be16 sport); -- cgit v1.2.3 From fbb8295248e1d6f576d444309fcf79356008eac1 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 26 Jan 2022 10:07:14 -0800 Subject: tcp: allocate tcp_death_row outside of struct netns_ipv4 I forgot tcp had per netns tracking of timewait sockets, and their sysctl to change the limit. After 0dad4087a86a ("tcp/dccp: get rid of inet_twsk_purge()"), whole struct net can be freed before last tw socket is freed. We need to allocate a separate struct inet_timewait_death_row object per netns. tw_count becomes a refcount and gains associated debugging infrastructure. BUG: KASAN: use-after-free in inet_twsk_kill+0x358/0x3c0 net/ipv4/inet_timewait_sock.c:46 Read of size 8 at addr ffff88807d5f9f40 by task kworker/1:7/3690 CPU: 1 PID: 3690 Comm: kworker/1:7 Not tainted 5.16.0-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Workqueue: events pwq_unbound_release_workfn Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0xcd/0x134 lib/dump_stack.c:106 print_address_description.constprop.0.cold+0x8d/0x336 mm/kasan/report.c:255 __kasan_report mm/kasan/report.c:442 [inline] kasan_report.cold+0x83/0xdf mm/kasan/report.c:459 inet_twsk_kill+0x358/0x3c0 net/ipv4/inet_timewait_sock.c:46 call_timer_fn+0x1a5/0x6b0 kernel/time/timer.c:1421 expire_timers kernel/time/timer.c:1466 [inline] __run_timers.part.0+0x67c/0xa30 kernel/time/timer.c:1734 __run_timers kernel/time/timer.c:1715 [inline] run_timer_softirq+0xb3/0x1d0 kernel/time/timer.c:1747 __do_softirq+0x29b/0x9c2 kernel/softirq.c:558 invoke_softirq kernel/softirq.c:432 [inline] __irq_exit_rcu+0x123/0x180 kernel/softirq.c:637 irq_exit_rcu+0x5/0x20 kernel/softirq.c:649 sysvec_apic_timer_interrupt+0x93/0xc0 arch/x86/kernel/apic/apic.c:1097 asm_sysvec_apic_timer_interrupt+0x12/0x20 arch/x86/include/asm/idtentry.h:638 RIP: 0010:lockdep_unregister_key+0x1c9/0x250 kernel/locking/lockdep.c:6328 Code: 00 00 00 48 89 ee e8 46 fd ff ff 4c 89 f7 e8 5e c9 ff ff e8 09 cc ff ff 9c 58 f6 c4 02 75 26 41 f7 c4 00 02 00 00 74 01 fb 5b <5d> 41 5c 41 5d 41 5e 41 5f e9 19 4a 08 00 0f 0b 5b 5d 41 5c 41 5d RSP: 0018:ffffc90004077cb8 EFLAGS: 00000206 RAX: 0000000000000046 RBX: ffff88807b61b498 RCX: 0000000000000001 RDX: dffffc0000000000 RSI: 0000000000000000 RDI: 0000000000000000 RBP: ffff888077027128 R08: 0000000000000001 R09: ffffffff8f1ea4fc R10: fffffbfff1ff93ee R11: 000000000000af1e R12: 0000000000000246 R13: 0000000000000000 R14: ffffffff8ffc89b8 R15: ffffffff90157fb0 wq_unregister_lockdep kernel/workqueue.c:3508 [inline] pwq_unbound_release_workfn+0x254/0x340 kernel/workqueue.c:3746 process_one_work+0x9ac/0x1650 kernel/workqueue.c:2307 worker_thread+0x657/0x1110 kernel/workqueue.c:2454 kthread+0x2e9/0x3a0 kernel/kthread.c:377 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:295 Allocated by task 3635: kasan_save_stack+0x1e/0x50 mm/kasan/common.c:38 kasan_set_track mm/kasan/common.c:46 [inline] set_alloc_info mm/kasan/common.c:437 [inline] __kasan_slab_alloc+0x90/0xc0 mm/kasan/common.c:470 kasan_slab_alloc include/linux/kasan.h:260 [inline] slab_post_alloc_hook mm/slab.h:732 [inline] slab_alloc_node mm/slub.c:3230 [inline] slab_alloc mm/slub.c:3238 [inline] kmem_cache_alloc+0x202/0x3a0 mm/slub.c:3243 kmem_cache_zalloc include/linux/slab.h:705 [inline] net_alloc net/core/net_namespace.c:407 [inline] copy_net_ns+0x125/0x760 net/core/net_namespace.c:462 create_new_namespaces+0x3f6/0xb20 kernel/nsproxy.c:110 unshare_nsproxy_namespaces+0xc1/0x1f0 kernel/nsproxy.c:226 ksys_unshare+0x445/0x920 kernel/fork.c:3048 __do_sys_unshare kernel/fork.c:3119 [inline] __se_sys_unshare kernel/fork.c:3117 [inline] __x64_sys_unshare+0x2d/0x40 kernel/fork.c:3117 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae The buggy address belongs to the object at ffff88807d5f9a80 which belongs to the cache net_namespace of size 6528 The buggy address is located 1216 bytes inside of 6528-byte region [ffff88807d5f9a80, ffff88807d5fb400) The buggy address belongs to the page: page:ffffea0001f57e00 refcount:1 mapcount:0 mapping:0000000000000000 index:0xffff88807d5f9a80 pfn:0x7d5f8 head:ffffea0001f57e00 order:3 compound_mapcount:0 compound_pincount:0 memcg:ffff888070023001 flags: 0xfff00000010200(slab|head|node=0|zone=1|lastcpupid=0x7ff) raw: 00fff00000010200 ffff888010dd4f48 ffffea0001404e08 ffff8880118fd000 raw: ffff88807d5f9a80 0000000000040002 00000001ffffffff ffff888070023001 page dumped because: kasan: bad access detected page_owner tracks the page as allocated page last allocated via order 3, migratetype Unmovable, gfp_mask 0xd20c0(__GFP_IO|__GFP_FS|__GFP_NOWARN|__GFP_NORETRY|__GFP_COMP|__GFP_NOMEMALLOC), pid 3634, ts 119694798460, free_ts 119693556950 prep_new_page mm/page_alloc.c:2434 [inline] get_page_from_freelist+0xa72/0x2f50 mm/page_alloc.c:4165 __alloc_pages+0x1b2/0x500 mm/page_alloc.c:5389 alloc_pages+0x1aa/0x310 mm/mempolicy.c:2271 alloc_slab_page mm/slub.c:1799 [inline] allocate_slab mm/slub.c:1944 [inline] new_slab+0x28a/0x3b0 mm/slub.c:2004 ___slab_alloc+0x87c/0xe90 mm/slub.c:3018 __slab_alloc.constprop.0+0x4d/0xa0 mm/slub.c:3105 slab_alloc_node mm/slub.c:3196 [inline] slab_alloc mm/slub.c:3238 [inline] kmem_cache_alloc+0x35c/0x3a0 mm/slub.c:3243 kmem_cache_zalloc include/linux/slab.h:705 [inline] net_alloc net/core/net_namespace.c:407 [inline] copy_net_ns+0x125/0x760 net/core/net_namespace.c:462 create_new_namespaces+0x3f6/0xb20 kernel/nsproxy.c:110 unshare_nsproxy_namespaces+0xc1/0x1f0 kernel/nsproxy.c:226 ksys_unshare+0x445/0x920 kernel/fork.c:3048 __do_sys_unshare kernel/fork.c:3119 [inline] __se_sys_unshare kernel/fork.c:3117 [inline] __x64_sys_unshare+0x2d/0x40 kernel/fork.c:3117 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae page last free stack trace: reset_page_owner include/linux/page_owner.h:24 [inline] free_pages_prepare mm/page_alloc.c:1352 [inline] free_pcp_prepare+0x374/0x870 mm/page_alloc.c:1404 free_unref_page_prepare mm/page_alloc.c:3325 [inline] free_unref_page+0x19/0x690 mm/page_alloc.c:3404 skb_free_head net/core/skbuff.c:655 [inline] skb_release_data+0x65d/0x790 net/core/skbuff.c:677 skb_release_all net/core/skbuff.c:742 [inline] __kfree_skb net/core/skbuff.c:756 [inline] consume_skb net/core/skbuff.c:914 [inline] consume_skb+0xc2/0x160 net/core/skbuff.c:908 skb_free_datagram+0x1b/0x1f0 net/core/datagram.c:325 netlink_recvmsg+0x636/0xea0 net/netlink/af_netlink.c:1998 sock_recvmsg_nosec net/socket.c:948 [inline] sock_recvmsg net/socket.c:966 [inline] sock_recvmsg net/socket.c:962 [inline] ____sys_recvmsg+0x2c4/0x600 net/socket.c:2632 ___sys_recvmsg+0x127/0x200 net/socket.c:2674 __sys_recvmsg+0xe2/0x1a0 net/socket.c:2704 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae Memory state around the buggy address: ffff88807d5f9e00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff88807d5f9e80: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb >ffff88807d5f9f00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ^ ffff88807d5f9f80: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff88807d5fa000: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb Fixes: 0dad4087a86a ("tcp/dccp: get rid of inet_twsk_purge()") Signed-off-by: Eric Dumazet Reported-by: syzbot Reported-by: Paolo Abeni Tested-by: Paolo Abeni Link: https://lore.kernel.org/r/20220126180714.845362-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski --- include/net/netns/ipv4.h | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'include/net') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 22b4c6df1d2b..94568e022001 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -31,18 +31,16 @@ struct ping_group_range { struct inet_hashinfo; struct inet_timewait_death_row { - atomic_t tw_count; - char tw_pad[L1_CACHE_BYTES - sizeof(atomic_t)]; + refcount_t tw_refcount; - struct inet_hashinfo *hashinfo; + struct inet_hashinfo *hashinfo ____cacheline_aligned_in_smp; int sysctl_max_tw_buckets; }; struct tcp_fastopen_context; struct netns_ipv4 { - /* Please keep tcp_death_row at first field in netns_ipv4 */ - struct inet_timewait_death_row tcp_death_row ____cacheline_aligned_in_smp; + struct inet_timewait_death_row *tcp_death_row; #ifdef CONFIG_SYSCTL struct ctl_table_header *forw_hdr; -- cgit v1.2.3 From a6d95c5a628a09be129f25d5663a7e9db8261f51 Mon Sep 17 00:00:00 2001 From: Jiri Bohac Date: Wed, 26 Jan 2022 16:00:18 +0100 Subject: Revert "xfrm: xfrm_state_mtu should return at least 1280 for ipv6" This reverts commit b515d2637276a3810d6595e10ab02c13bfd0b63a. Commit b515d2637276a3810d6595e10ab02c13bfd0b63a ("xfrm: xfrm_state_mtu should return at least 1280 for ipv6") in v5.14 breaks the TCP MSS calculation in ipsec transport mode, resulting complete stalls of TCP connections. This happens when the (P)MTU is 1280 or slighly larger. The desired formula for the MSS is: MSS = (MTU - ESP_overhead) - IP header - TCP header However, the above commit clamps the (MTU - ESP_overhead) to a minimum of 1280, turning the formula into MSS = max(MTU - ESP overhead, 1280) - IP header - TCP header With the (P)MTU near 1280, the calculated MSS is too large and the resulting TCP packets never make it to the destination because they are over the actual PMTU. The above commit also causes suboptimal double fragmentation in xfrm tunnel mode, as described in https://lore.kernel.org/netdev/20210429202529.codhwpc7w6kbudug@dwarf.suse.cz/ The original problem the above commit was trying to fix is now fixed by commit 6596a0229541270fb8d38d989f91b78838e5e9da ("xfrm: fix MTU regression"). Signed-off-by: Jiri Bohac Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/net') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 743dd1da506e..76aa6f11a540 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1568,7 +1568,6 @@ void xfrm_sad_getinfo(struct net *net, struct xfrmk_sadinfo *si); void xfrm_spd_getinfo(struct net *net, struct xfrmk_spdinfo *si); u32 xfrm_replay_seqhi(struct xfrm_state *x, __be32 net_seq); int xfrm_init_replay(struct xfrm_state *x); -u32 __xfrm_state_mtu(struct xfrm_state *x, int mtu); u32 xfrm_state_mtu(struct xfrm_state *x, int mtu); int __xfrm_init_state(struct xfrm_state *x, bool init_replay, bool offload); int xfrm_init_state(struct xfrm_state *x); -- cgit v1.2.3 From 2e9589ff809e9232f689acd51da73390e135146a Mon Sep 17 00:00:00 2001 From: xu xin Date: Wed, 26 Jan 2022 07:10:58 +0000 Subject: ipv4: Namespaceify min_adv_mss sysctl knob Different netns has different requirement on the setting of min_adv_mss sysctl which the advertised MSS will be never lower than. Enable min_adv_mss to be configured per network namespace. Signed-off-by: xu xin Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 94568e022001..f0687867b5cd 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -83,6 +83,7 @@ struct netns_ipv4 { u32 ip_rt_min_pmtu; int ip_rt_mtu_expires; + int ip_rt_min_advmss; struct local_ports ip_local_ports; -- cgit v1.2.3 From 36268983e90316b37000a005642af42234dabb36 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Wed, 26 Jan 2022 16:38:52 +0100 Subject: Revert "ipv6: Honor all IPv6 PIO Valid Lifetime values" This reverts commit b75326c201242de9495ff98e5d5cff41d7fc0d9d. This commit breaks Linux compatibility with USGv6 tests. The RFC this commit was based on is actually an expired draft: no published RFC currently allows the new behaviour it introduced. Without full IETF endorsement, the flash renumbering scenario this patch was supposed to enable is never going to work, as other IPv6 equipements on the same LAN will keep the 2 hours limit. Fixes: b75326c20124 ("ipv6: Honor all IPv6 PIO Valid Lifetime values") Signed-off-by: Guillaume Nault Signed-off-by: David S. Miller --- include/net/addrconf.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/net') diff --git a/include/net/addrconf.h b/include/net/addrconf.h index 78ea3e332688..e7ce719838b5 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -6,6 +6,8 @@ #define RTR_SOLICITATION_INTERVAL (4*HZ) #define RTR_SOLICITATION_MAX_INTERVAL (3600*HZ) /* 1 hour */ +#define MIN_VALID_LIFETIME (2*3600) /* 2 hours */ + #define TEMP_VALID_LIFETIME (7*86400) #define TEMP_PREFERRED_LIFETIME (86400) #define REGEN_MAX_RETRY (3) -- cgit v1.2.3 From 8b0fdcdc3a7d44aff907f0103f5ffb86b12bfe71 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 26 Jan 2022 11:10:59 -0800 Subject: net: remove bond_slave_has_mac_rcu() No caller since v3.16. Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/net/bonding.h | 14 -------------- 1 file changed, 14 deletions(-) (limited to 'include/net') diff --git a/include/net/bonding.h b/include/net/bonding.h index 83cfd2d70247..7dead855a72d 100644 --- a/include/net/bonding.h +++ b/include/net/bonding.h @@ -698,20 +698,6 @@ static inline struct slave *bond_slave_has_mac(struct bonding *bond, return NULL; } -/* Caller must hold rcu_read_lock() for read */ -static inline struct slave *bond_slave_has_mac_rcu(struct bonding *bond, - const u8 *mac) -{ - struct list_head *iter; - struct slave *tmp; - - bond_for_each_slave_rcu(bond, tmp, iter) - if (ether_addr_equal_64bits(mac, tmp->dev->dev_addr)) - return tmp; - - return NULL; -} - /* Caller must hold rcu_read_lock() for read */ static inline bool bond_slave_has_mac_rx(struct bonding *bond, const u8 *mac) { -- cgit v1.2.3 From 560e08eda7969c3ef0639ab05f718be03a49d387 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 26 Jan 2022 11:11:00 -0800 Subject: net: ax25: remove route refcount Nothing takes the refcount since v4.9. Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/net/ax25.h | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'include/net') diff --git a/include/net/ax25.h b/include/net/ax25.h index 526e49589197..cb628c5d7c5b 100644 --- a/include/net/ax25.h +++ b/include/net/ax25.h @@ -187,18 +187,12 @@ typedef struct { typedef struct ax25_route { struct ax25_route *next; - refcount_t refcount; ax25_address callsign; struct net_device *dev; ax25_digi *digipeat; char ip_mode; } ax25_route; -static inline void ax25_hold_route(ax25_route *ax25_rt) -{ - refcount_inc(&ax25_rt->refcount); -} - void __ax25_put_route(ax25_route *ax25_rt); extern rwlock_t ax25_route_lock; @@ -213,12 +207,6 @@ static inline void ax25_route_lock_unuse(void) read_unlock(&ax25_route_lock); } -static inline void ax25_put_route(ax25_route *ax25_rt) -{ - if (refcount_dec_and_test(&ax25_rt->refcount)) - __ax25_put_route(ax25_rt); -} - typedef struct { char slave; /* slave_mode? */ struct timer_list slave_timer; /* timeout timer */ -- cgit v1.2.3 From 937fca918aacf54f1c9cb00d16d4e999a0569be0 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 26 Jan 2022 11:11:05 -0800 Subject: udplite: remove udplite_csum_outgoing() Not used since v4.0. Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/net/udplite.h | 43 ------------------------------------------- 1 file changed, 43 deletions(-) (limited to 'include/net') diff --git a/include/net/udplite.h b/include/net/udplite.h index 9185e45b997f..a3c53110d30b 100644 --- a/include/net/udplite.h +++ b/include/net/udplite.h @@ -70,49 +70,6 @@ static inline int udplite_checksum_init(struct sk_buff *skb, struct udphdr *uh) return 0; } -/* Slow-path computation of checksum. Socket is locked. */ -static inline __wsum udplite_csum_outgoing(struct sock *sk, struct sk_buff *skb) -{ - const struct udp_sock *up = udp_sk(skb->sk); - int cscov = up->len; - __wsum csum = 0; - - if (up->pcflag & UDPLITE_SEND_CC) { - /* - * Sender has set `partial coverage' option on UDP-Lite socket. - * The special case "up->pcslen == 0" signifies full coverage. - */ - if (up->pcslen < up->len) { - if (0 < up->pcslen) - cscov = up->pcslen; - udp_hdr(skb)->len = htons(up->pcslen); - } - /* - * NOTE: Causes for the error case `up->pcslen > up->len': - * (i) Application error (will not be penalized). - * (ii) Payload too big for send buffer: data is split - * into several packets, each with its own header. - * In this case (e.g. last segment), coverage may - * exceed packet length. - * Since packets with coverage length > packet length are - * illegal, we fall back to the defaults here. - */ - } - - skb->ip_summed = CHECKSUM_NONE; /* no HW support for checksumming */ - - skb_queue_walk(&sk->sk_write_queue, skb) { - const int off = skb_transport_offset(skb); - const int len = skb->len - off; - - csum = skb_checksum(skb, off, (cscov > len)? len : cscov, csum); - - if ((cscov -= len) <= 0) - break; - } - return csum; -} - /* Fast-path computation of checksum. Socket may not be locked. */ static inline __wsum udplite_csum(struct sk_buff *skb) { -- cgit v1.2.3 From 98b608629746946ecc6cda70bf0fd047785a1197 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 26 Jan 2022 11:11:07 -0800 Subject: net: sched: remove psched_tdiff_bounded() Not used since v3.9. Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/net/pkt_sched.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/net') diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h index 9e7b21c0b3a6..44a35531952e 100644 --- a/include/net/pkt_sched.h +++ b/include/net/pkt_sched.h @@ -63,12 +63,6 @@ static inline psched_time_t psched_get_time(void) return PSCHED_NS2TICKS(ktime_get_ns()); } -static inline psched_tdiff_t -psched_tdiff_bounded(psched_time_t tv1, psched_time_t tv2, psched_time_t bound) -{ - return min(tv1 - tv2, bound); -} - struct qdisc_watchdog { u64 last_expires; struct hrtimer timer; -- cgit v1.2.3 From a459bc9a3a68f2975ee6661fb9c86126a5636b25 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 26 Jan 2022 11:11:08 -0800 Subject: net: sched: remove qdisc_qlen_cpu() Never used since it was added in v5.2. Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/net/sch_generic.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/net') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 472843eedbae..9bab396c1f3b 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -518,11 +518,6 @@ static inline void qdisc_cb_private_validate(const struct sk_buff *skb, int sz) BUILD_BUG_ON(sizeof(qcb->data) < sz); } -static inline int qdisc_qlen_cpu(const struct Qdisc *q) -{ - return this_cpu_ptr(q->cpu_qstats)->qlen; -} - static inline int qdisc_qlen(const struct Qdisc *q) { return q->q.qlen; -- cgit v1.2.3 From d1bc532e99becf104635ed4da6fefa306f452321 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Tue, 25 Jan 2022 17:04:43 +0100 Subject: i40e: xsk: Move tmp desc array from driver to pool Move desc_array from the driver to the pool. The reason behind this is that we can then reuse this array as a temporary storage for descriptors in all zero-copy drivers that use the batched interface. This will make it easier to add batching to more drivers. i40e is the only driver that has a batched Tx zero-copy implementation, so no need to touch any other driver. Signed-off-by: Magnus Karlsson Signed-off-by: Daniel Borkmann Reviewed-by: Alexander Lobakin Link: https://lore.kernel.org/bpf/20220125160446.78976-6-maciej.fijalkowski@intel.com --- include/net/xdp_sock_drv.h | 5 ++--- include/net/xsk_buff_pool.h | 1 + 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/net') diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h index 443d45951564..4aa031849668 100644 --- a/include/net/xdp_sock_drv.h +++ b/include/net/xdp_sock_drv.h @@ -13,7 +13,7 @@ void xsk_tx_completed(struct xsk_buff_pool *pool, u32 nb_entries); bool xsk_tx_peek_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc); -u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, struct xdp_desc *desc, u32 max); +u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, u32 max); void xsk_tx_release(struct xsk_buff_pool *pool); struct xsk_buff_pool *xsk_get_pool_from_qid(struct net_device *dev, u16 queue_id); @@ -142,8 +142,7 @@ static inline bool xsk_tx_peek_desc(struct xsk_buff_pool *pool, return false; } -static inline u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, struct xdp_desc *desc, - u32 max) +static inline u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, u32 max) { return 0; } diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h index ddeefc4a1040..5554ee75e7da 100644 --- a/include/net/xsk_buff_pool.h +++ b/include/net/xsk_buff_pool.h @@ -60,6 +60,7 @@ struct xsk_buff_pool { */ dma_addr_t *dma_pages; struct xdp_buff_xsk *heads; + struct xdp_desc *tx_descs; u64 chunk_mask; u64 addrs_cnt; u32 free_list_cnt; -- cgit v1.2.3 From 23f57406b82de51809d5812afd96f210f8b627f3 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 26 Jan 2022 17:10:22 -0800 Subject: ipv4: avoid using shared IP generator for connected sockets ip_select_ident_segs() has been very conservative about using the connected socket private generator only for packets with IP_DF set, claiming it was needed for some VJ compression implementations. As mentioned in this referenced document, this can be abused. (Ref: Off-Path TCP Exploits of the Mixed IPID Assignment) Before switching to pure random IPID generation and possibly hurt some workloads, lets use the private inet socket generator. Not only this will remove one vulnerability, this will also improve performance of TCP flows using pmtudisc==IP_PMTUDISC_DONT Fixes: 73f156a6e8c1 ("inetpeer: get rid of ip_id_count") Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Reported-by: Ray Che Cc: Willy Tarreau Signed-off-by: Jakub Kicinski --- include/net/ip.h | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) (limited to 'include/net') diff --git a/include/net/ip.h b/include/net/ip.h index 81e23a102a0d..b51bae43b0dd 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -525,19 +525,18 @@ static inline void ip_select_ident_segs(struct net *net, struct sk_buff *skb, { struct iphdr *iph = ip_hdr(skb); + /* We had many attacks based on IPID, use the private + * generator as much as we can. + */ + if (sk && inet_sk(sk)->inet_daddr) { + iph->id = htons(inet_sk(sk)->inet_id); + inet_sk(sk)->inet_id += segs; + return; + } if ((iph->frag_off & htons(IP_DF)) && !skb->ignore_df) { - /* This is only to work around buggy Windows95/2000 - * VJ compression implementations. If the ID field - * does not change, they drop every other packet in - * a TCP stream using header compression. - */ - if (sk && inet_sk(sk)->inet_daddr) { - iph->id = htons(inet_sk(sk)->inet_id); - inet_sk(sk)->inet_id += segs; - } else { - iph->id = 0; - } + iph->id = 0; } else { + /* Unfortunately we need the big hammer to get a suitable IPID */ __ip_select_ident(net, iph, segs); } } -- cgit v1.2.3 From 3c42b2019863b327caa233072c50739d4144dd16 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 26 Jan 2022 17:34:04 -0800 Subject: ipv4: remove sparse error in ip_neigh_gw4() ./include/net/route.h:373:48: warning: incorrect type in argument 2 (different base types) ./include/net/route.h:373:48: expected unsigned int [usertype] key ./include/net/route.h:373:48: got restricted __be32 [usertype] daddr Fixes: 5c9f7c1dfc2e ("ipv4: Add helpers for neigh lookup for nexthop") Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20220127013404.1279313-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski --- include/net/route.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/route.h b/include/net/route.h index 4c858dcf1aa8..25404fc2b483 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -370,7 +370,7 @@ static inline struct neighbour *ip_neigh_gw4(struct net_device *dev, { struct neighbour *neigh; - neigh = __ipv4_neigh_lookup_noref(dev, daddr); + neigh = __ipv4_neigh_lookup_noref(dev, (__force u32)daddr); if (unlikely(!neigh)) neigh = __neigh_create(&arp_tbl, &daddr, dev, false); -- cgit v1.2.3 From f37a4cc6bb0ba08c2d9fd7d18a1da87161cbb7f9 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 27 Jan 2022 00:36:28 +0000 Subject: udp6: pass flow in ip6_make_skb together with cork Another preparation patch. inet_cork_full already contains a field for iflow, so we can avoid passing a separate struct iflow6 into __ip6_append_data() and ip6_make_skb(), and use the flow stored in inet_cork_full. Make sure callers set cork->fl, i.e. we init it in ip6_append_data() and before calling ip6_make_skb(). Signed-off-by: Pavel Begunkov Reviewed-by: Willem de Bruijn Signed-off-by: Jakub Kicinski --- include/net/ipv6.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 3afcb128e064..5e0b56d66724 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -1020,7 +1020,7 @@ struct sk_buff *ip6_make_skb(struct sock *sk, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), void *from, int length, int transhdrlen, - struct ipcm6_cookie *ipc6, struct flowi6 *fl6, + struct ipcm6_cookie *ipc6, struct rt6_info *rt, unsigned int flags, struct inet_cork_full *cork); -- cgit v1.2.3 From 31ed2261e88fbd1eb62fc870ef2c6b2cf2951336 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 27 Jan 2022 00:36:31 +0000 Subject: ipv6: partially inline ipv6_fixup_options Inline a part of ipv6_fixup_options() to avoid extra overhead on function call if opt is NULL. Signed-off-by: Pavel Begunkov Reviewed-by: Willem de Bruijn Signed-off-by: Jakub Kicinski --- include/net/ipv6.h | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 5e0b56d66724..082f30256f59 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -437,8 +437,16 @@ struct ipv6_txoptions *ipv6_renew_options(struct sock *sk, struct ipv6_txoptions *opt, int newtype, struct ipv6_opt_hdr *newopt); -struct ipv6_txoptions *ipv6_fixup_options(struct ipv6_txoptions *opt_space, - struct ipv6_txoptions *opt); +struct ipv6_txoptions *__ipv6_fixup_options(struct ipv6_txoptions *opt_space, + struct ipv6_txoptions *opt); + +static inline struct ipv6_txoptions * +ipv6_fixup_options(struct ipv6_txoptions *opt_space, struct ipv6_txoptions *opt) +{ + if (!opt) + return NULL; + return __ipv6_fixup_options(opt_space, opt); +} bool ipv6_opt_accepted(const struct sock *sk, const struct sk_buff *skb, const struct inet6_skb_parm *opt); -- cgit v1.2.3 From c265a3a6690b093e542aceb4a7bf8bf6c577e42b Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Tue, 25 Jan 2022 13:25:40 +0100 Subject: net: mac802154: Explain the use of ieee802154_wake/stop_queue() It is not straightforward to the newcomer that a single skb can currently be sent at a time and that the internal process is to stop the queue when processing a frame before re-enabling it. Make this clear by documenting the ieee802154_wake/stop_queue() helpers. Signed-off-by: Miquel Raynal Acked-by: Alexander Aring Link: https://lore.kernel.org/r/20220125122540.855604-4-miquel.raynal@bootlin.com Signed-off-by: Stefan Schmidt --- include/net/mac802154.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/net') diff --git a/include/net/mac802154.h b/include/net/mac802154.h index d524ffb9eb25..2c3bbc6645ba 100644 --- a/include/net/mac802154.h +++ b/include/net/mac802154.h @@ -464,6 +464,12 @@ void ieee802154_rx_irqsafe(struct ieee802154_hw *hw, struct sk_buff *skb, * ieee802154_wake_queue - wake ieee802154 queue * @hw: pointer as obtained from ieee802154_alloc_hw(). * + * Tranceivers usually have either one transmit framebuffer or one framebuffer + * for both transmitting and receiving. Hence, the core currently only handles + * one frame at a time for each phy, which means we had to stop the queue to + * avoid new skb to come during the transmission. The queue then needs to be + * woken up after the operation. + * * Drivers should use this function instead of netif_wake_queue. */ void ieee802154_wake_queue(struct ieee802154_hw *hw); @@ -472,6 +478,12 @@ void ieee802154_wake_queue(struct ieee802154_hw *hw); * ieee802154_stop_queue - stop ieee802154 queue * @hw: pointer as obtained from ieee802154_alloc_hw(). * + * Tranceivers usually have either one transmit framebuffer or one framebuffer + * for both transmitting and receiving. Hence, the core currently only handles + * one frame at a time for each phy, which means we need to tell upper layers to + * stop giving us new skbs while we are busy with the transmitted one. The queue + * must then be stopped before transmitting. + * * Drivers should use this function instead of netif_stop_queue. */ void ieee802154_stop_queue(struct ieee802154_hw *hw); -- cgit v1.2.3 From d01ffb9eee4af165d83b08dd73ebdf9fe94a519b Mon Sep 17 00:00:00 2001 From: Duoming Zhou Date: Fri, 28 Jan 2022 12:47:16 +0800 Subject: ax25: add refcount in ax25_dev to avoid UAF bugs If we dereference ax25_dev after we call kfree(ax25_dev) in ax25_dev_device_down(), it will lead to concurrency UAF bugs. There are eight syscall functions suffer from UAF bugs, include ax25_bind(), ax25_release(), ax25_connect(), ax25_ioctl(), ax25_getname(), ax25_sendmsg(), ax25_getsockopt() and ax25_info_show(). One of the concurrency UAF can be shown as below: (USE) | (FREE) | ax25_device_event | ax25_dev_device_down ax25_bind | ... ... | kfree(ax25_dev) ax25_fillin_cb() | ... ax25_fillin_cb_from_dev() | ... | The root cause of UAF bugs is that kfree(ax25_dev) in ax25_dev_device_down() is not protected by any locks. When ax25_dev, which there are still pointers point to, is released, the concurrency UAF bug will happen. This patch introduces refcount into ax25_dev in order to guarantee that there are no pointers point to it when ax25_dev is released. Signed-off-by: Duoming Zhou Signed-off-by: David S. Miller --- include/net/ax25.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/net') diff --git a/include/net/ax25.h b/include/net/ax25.h index 526e49589197..50b417df6221 100644 --- a/include/net/ax25.h +++ b/include/net/ax25.h @@ -239,6 +239,7 @@ typedef struct ax25_dev { #if defined(CONFIG_AX25_DAMA_SLAVE) || defined(CONFIG_AX25_DAMA_MASTER) ax25_dama_info dama; #endif + refcount_t refcount; } ax25_dev; typedef struct ax25_cb { @@ -293,6 +294,15 @@ static __inline__ void ax25_cb_put(ax25_cb *ax25) } } +#define ax25_dev_hold(__ax25_dev) \ + refcount_inc(&((__ax25_dev)->refcount)) + +static __inline__ void ax25_dev_put(ax25_dev *ax25_dev) +{ + if (refcount_dec_and_test(&ax25_dev->refcount)) { + kfree(ax25_dev); + } +} static inline __be16 ax25_type_trans(struct sk_buff *skb, struct net_device *dev) { skb->dev = dev; -- cgit v1.2.3 From 4f0e30407ef6f2075b8e86d54be42e787087cd61 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 28 Jan 2022 08:06:54 -0800 Subject: ipv4: drop fragmentation code from ip_options_build() Since v2.5.44 and addition of ip_options_fragment() ip_options_build() does not render headers for fragments directly. @is_frag is always 0. Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/net/ip.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/ip.h b/include/net/ip.h index b51bae43b0dd..fdbab0c00fca 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -712,7 +712,7 @@ int ip_forward(struct sk_buff *skb); */ void ip_options_build(struct sk_buff *skb, struct ip_options *opt, - __be32 daddr, struct rtable *rt, int is_frag); + __be32 daddr, struct rtable *rt); int __ip_options_echo(struct net *net, struct ip_options *dopt, struct sk_buff *skb, const struct ip_options *sopt); -- cgit v1.2.3 From 47ed9442b2ecfcdc72889667236d6c59b6a3337e Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 28 Jan 2022 16:53:47 -0700 Subject: ipv4: Make ip_idents_reserve static ip_idents_reserve is only used in net/ipv4/route.c. Make it static and remove the export. Signed-off-by: David Ahern Cc: Eric Dumazet Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/ip.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/net') diff --git a/include/net/ip.h b/include/net/ip.h index fdbab0c00fca..3984f2c39c4b 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -517,7 +517,6 @@ void ip_dst_metrics_put(struct dst_entry *dst) kfree(p); } -u32 ip_idents_reserve(u32 hash, int segs); void __ip_select_ident(struct net *net, struct iphdr *iph, int segs); static inline void ip_select_ident_segs(struct net *net, struct sk_buff *skb, -- cgit v1.2.3 From e187013abeb4c2a7ec8a4bb978844c7e92a1a6ec Mon Sep 17 00:00:00 2001 From: Akhmat Karakotov Date: Mon, 31 Jan 2022 16:31:21 +0300 Subject: txhash: Make rethinking txhash behavior configurable via sysctl Add a per ns sysctl that controls the txhash rethink behavior: net.core.txrehash. When enabled, the same behavior is retained, when disabled, rethink is not performed. Sysctl is enabled by default. Signed-off-by: Akhmat Karakotov Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/netns/core.h | 1 + include/net/sock.h | 34 +++++++++++++++++++++------------- 2 files changed, 22 insertions(+), 13 deletions(-) (limited to 'include/net') diff --git a/include/net/netns/core.h b/include/net/netns/core.h index 552bc25b1933..388244e315e7 100644 --- a/include/net/netns/core.h +++ b/include/net/netns/core.h @@ -10,6 +10,7 @@ struct netns_core { struct ctl_table_header *sysctl_hdr; int sysctl_somaxconn; + u8 sysctl_txrehash; #ifdef CONFIG_PROC_FS struct prot_inuse __percpu *prot_inuse; diff --git a/include/net/sock.h b/include/net/sock.h index ff9b508d9c5f..0540e1b2aeb0 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -587,6 +587,18 @@ static inline bool sk_user_data_is_nocopy(const struct sock *sk) __tmp | SK_USER_DATA_NOCOPY); \ }) +static inline +struct net *sock_net(const struct sock *sk) +{ + return read_pnet(&sk->sk_net); +} + +static inline +void sock_net_set(struct sock *sk, struct net *net) +{ + write_pnet(&sk->sk_net, net); +} + /* * SK_CAN_REUSE and SK_NO_REUSE on a socket mean that the socket is OK * or not whether his port will be reused by someone else. SK_FORCE_REUSE @@ -2054,10 +2066,18 @@ static inline void sk_set_txhash(struct sock *sk) static inline bool sk_rethink_txhash(struct sock *sk) { - if (sk->sk_txhash) { + u8 rehash; + + if (!sk->sk_txhash) + return false; + + rehash = READ_ONCE(sock_net(sk)->core.sysctl_txrehash); + + if (rehash) { sk_set_txhash(sk); return true; } + return false; } @@ -2704,18 +2724,6 @@ static inline void sk_eat_skb(struct sock *sk, struct sk_buff *skb) __kfree_skb(skb); } -static inline -struct net *sock_net(const struct sock *sk) -{ - return read_pnet(&sk->sk_net); -} - -static inline -void sock_net_set(struct sock *sk, struct net *net) -{ - write_pnet(&sk->sk_net, net); -} - static inline bool skb_sk_is_prefetched(struct sk_buff *skb) { -- cgit v1.2.3 From 26859240e4ee701e0379f08634957adaff67e43a Mon Sep 17 00:00:00 2001 From: Akhmat Karakotov Date: Mon, 31 Jan 2022 16:31:22 +0300 Subject: txhash: Add socket option to control TX hash rethink behavior Add the SO_TXREHASH socket option to control hash rethink behavior per socket. When default mode is set, sockets disable rehash at initialization and use sysctl option when entering listen state. setsockopt() overrides default behavior. Signed-off-by: Akhmat Karakotov Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/sock.h | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) (limited to 'include/net') diff --git a/include/net/sock.h b/include/net/sock.h index 0540e1b2aeb0..d6c13f0fba40 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -316,6 +316,7 @@ struct sk_filter; * @sk_rcvtimeo: %SO_RCVTIMEO setting * @sk_sndtimeo: %SO_SNDTIMEO setting * @sk_txhash: computed flow hash for use on transmit + * @sk_txrehash: enable TX hash rethink * @sk_filter: socket filtering instructions * @sk_timer: sock cleanup timer * @sk_stamp: time stamp of last packet received @@ -491,6 +492,7 @@ struct sock { u32 sk_ack_backlog; u32 sk_max_ack_backlog; kuid_t sk_uid; + u8 sk_txrehash; #ifdef CONFIG_NET_RX_BUSY_POLL u8 sk_prefer_busy_poll; u16 sk_busy_poll_budget; @@ -2066,18 +2068,10 @@ static inline void sk_set_txhash(struct sock *sk) static inline bool sk_rethink_txhash(struct sock *sk) { - u8 rehash; - - if (!sk->sk_txhash) - return false; - - rehash = READ_ONCE(sock_net(sk)->core.sysctl_txrehash); - - if (rehash) { + if (sk->sk_txhash && sk->sk_txrehash == SOCK_TXREHASH_ENABLED) { sk_set_txhash(sk); return true; } - return false; } -- cgit v1.2.3 From 02b2a91c6f0d57df687e666b475849a54f295a12 Mon Sep 17 00:00:00 2001 From: David Girault Date: Tue, 1 Feb 2022 19:09:56 +0100 Subject: net: ieee802154: Provide a kdoc to the address structure Give this structure a header to better explain its content. Signed-off-by: David Girault [miquel.raynal@bootlin.com: Isolate this change from a bigger commit and reword the comment] Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/r/20220201180956.93581-1-miquel.raynal@bootlin.com Signed-off-by: Stefan Schmidt --- include/net/cfg802154.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/net') diff --git a/include/net/cfg802154.h b/include/net/cfg802154.h index 6ed07844eb24..833672d6fbe4 100644 --- a/include/net/cfg802154.h +++ b/include/net/cfg802154.h @@ -227,6 +227,16 @@ static inline void wpan_phy_net_set(struct wpan_phy *wpan_phy, struct net *net) write_pnet(&wpan_phy->_net, net); } +/** + * struct ieee802154_addr - IEEE802.15.4 device address + * @mode: Address mode from frame header. Can be one of: + * - @IEEE802154_ADDR_NONE + * - @IEEE802154_ADDR_SHORT + * - @IEEE802154_ADDR_LONG + * @pan_id: The PAN ID this address belongs to + * @short_addr: address if @mode is @IEEE802154_ADDR_SHORT + * @extended_addr: address if @mode is @IEEE802154_ADDR_LONG + */ struct ieee802154_addr { u8 mode; __le16 pan_id; -- cgit v1.2.3 From 295ab96f478d0fa56393e85406f19a867e26ce22 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Wed, 2 Feb 2022 01:03:20 +0100 Subject: net: dsa: provide switch operations for tracking the master state Certain drivers may need to send management traffic to the switch for things like register access, FDB dump, etc, to accelerate what their slow bus (SPI, I2C, MDIO) can already do. Ethernet is faster (especially in bulk transactions) but is also more unreliable, since the user may decide to bring the DSA master down (or not bring it up), therefore severing the link between the host and the attached switch. Drivers needing Ethernet-based register access already should have fallback logic to the slow bus if the Ethernet method fails, but that fallback may be based on a timeout, and the I/O to the switch may slow down to a halt if the master is down, because every Ethernet packet will have to time out. The driver also doesn't have the option to turn off Ethernet-based I/O momentarily, because it wouldn't know when to turn it back on. Which is where this change comes in. By tracking NETDEV_CHANGE, NETDEV_UP and NETDEV_GOING_DOWN events on the DSA master, we should know the exact interval of time during which this interface is reliably available for traffic. Provide this information to switches so they can use it as they wish. An helper is added dsa_port_master_is_operational() to check if a master port is operational. Signed-off-by: Vladimir Oltean Signed-off-by: Ansuel Smith Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/net/dsa.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include/net') diff --git a/include/net/dsa.h b/include/net/dsa.h index 57b3e4e7413b..43c4153ef53a 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -278,6 +278,10 @@ struct dsa_port { u8 devlink_port_setup:1; + /* Master state bits, valid only on CPU ports */ + u8 master_admin_up:1; + u8 master_oper_up:1; + u8 setup:1; struct device_node *dn; @@ -478,6 +482,12 @@ static inline bool dsa_port_is_unused(struct dsa_port *dp) return dp->type == DSA_PORT_TYPE_UNUSED; } +static inline bool dsa_port_master_is_operational(struct dsa_port *dp) +{ + return dsa_port_is_cpu(dp) && dp->master_admin_up && + dp->master_oper_up; +} + static inline bool dsa_is_unused_port(struct dsa_switch *ds, int p) { return dsa_to_port(ds, p)->type == DSA_PORT_TYPE_UNUSED; @@ -1036,6 +1046,13 @@ struct dsa_switch_ops { int (*tag_8021q_vlan_add)(struct dsa_switch *ds, int port, u16 vid, u16 flags); int (*tag_8021q_vlan_del)(struct dsa_switch *ds, int port, u16 vid); + + /* + * DSA master tracking operations + */ + void (*master_state_change)(struct dsa_switch *ds, + const struct net_device *master, + bool operational); }; #define DSA_DEVLINK_PARAM_DRIVER(_id, _name, _type, _cmodes) \ -- cgit v1.2.3 From 5903123f662ed18483f05cac3f9e800a074c29ff Mon Sep 17 00:00:00 2001 From: Akhmat Karakotov Date: Fri, 28 Jan 2022 22:26:21 +0300 Subject: tcp: Use BPF timeout setting for SYN ACK RTO When setting RTO through BPF program, some SYN ACK packets were unaffected and continued to use TCP_TIMEOUT_INIT constant. This patch adds timeout option to struct request_sock. Option is initialized with TCP_TIMEOUT_INIT and is reassigned through BPF using tcp_timeout_init call. SYN ACK retransmits now use newly added timeout option. Signed-off-by: Akhmat Karakotov Acked-by: Martin KaFai Lau v2: - Add timeout option to struct request_sock. Do not call tcp_timeout_init on every syn ack retransmit. v3: - Use unsigned long for min. Bound tcp_timeout_init to TCP_RTO_MAX. v4: - Refactor duplicate code by adding reqsk_timeout function. Signed-off-by: David S. Miller --- include/net/inet_connection_sock.h | 8 ++++++++ include/net/request_sock.h | 2 ++ include/net/tcp.h | 2 +- 3 files changed, 11 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index 4ad47d9f9d27..3908296d103f 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -285,6 +285,14 @@ static inline int inet_csk_reqsk_queue_is_full(const struct sock *sk) bool inet_csk_reqsk_queue_drop(struct sock *sk, struct request_sock *req); void inet_csk_reqsk_queue_drop_and_put(struct sock *sk, struct request_sock *req); +static inline unsigned long +reqsk_timeout(struct request_sock *req, unsigned long max_timeout) +{ + u64 timeout = (u64)req->timeout << req->num_timeout; + + return (unsigned long)min_t(u64, timeout, max_timeout); +} + static inline void inet_csk_prepare_for_destroy_sock(struct sock *sk) { /* The below has to be done to allow calling inet_csk_destroy_sock */ diff --git a/include/net/request_sock.h b/include/net/request_sock.h index 29e41ff3ec93..144c39db9898 100644 --- a/include/net/request_sock.h +++ b/include/net/request_sock.h @@ -70,6 +70,7 @@ struct request_sock { struct saved_syn *saved_syn; u32 secid; u32 peer_secid; + u32 timeout; }; static inline struct request_sock *inet_reqsk(const struct sock *sk) @@ -104,6 +105,7 @@ reqsk_alloc(const struct request_sock_ops *ops, struct sock *sk_listener, sk_node_init(&req_to_sk(req)->sk_node); sk_tx_queue_clear(req_to_sk(req)); req->saved_syn = NULL; + req->timeout = 0; req->num_timeout = 0; req->num_retrans = 0; req->sk = NULL; diff --git a/include/net/tcp.h b/include/net/tcp.h index b9fc978fb2ca..eff2487d972d 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -2358,7 +2358,7 @@ static inline u32 tcp_timeout_init(struct sock *sk) if (timeout <= 0) timeout = TCP_TIMEOUT_INIT; - return timeout; + return min_t(int, timeout, TCP_RTO_MAX); } static inline u32 tcp_rwnd_init_bpf(struct sock *sk) -- cgit v1.2.3 From 4a81f6da9cb2d1ef911131a6fd8bd15cb61fc772 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 1 Feb 2022 20:39:42 +0100 Subject: net, neigh: Do not trigger immediate probes on NUD_FAILED from neigh_managed_work syzkaller was able to trigger a deadlock for NTF_MANAGED entries [0]: kworker/0:16/14617 is trying to acquire lock: ffffffff8d4dd370 (&tbl->lock){++-.}-{2:2}, at: ___neigh_create+0x9e1/0x2990 net/core/neighbour.c:652 [...] but task is already holding lock: ffffffff8d4dd370 (&tbl->lock){++-.}-{2:2}, at: neigh_managed_work+0x35/0x250 net/core/neighbour.c:1572 The neighbor entry turned to NUD_FAILED state, where __neigh_event_send() triggered an immediate probe as per commit cd28ca0a3dd1 ("neigh: reduce arp latency") via neigh_probe() given table lock was held. One option to fix this situation is to defer the neigh_probe() back to the neigh_timer_handler() similarly as pre cd28ca0a3dd1. For the case of NTF_MANAGED, this deferral is acceptable given this only happens on actual failure state and regular / expected state is NUD_VALID with the entry already present. The fix adds a parameter to __neigh_event_send() in order to communicate whether immediate probe is allowed or disallowed. Existing call-sites of neigh_event_send() default as-is to immediate probe. However, the neigh_managed_work() disables it via use of neigh_event_send_probe(). [0] __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0xcd/0x134 lib/dump_stack.c:106 print_deadlock_bug kernel/locking/lockdep.c:2956 [inline] check_deadlock kernel/locking/lockdep.c:2999 [inline] validate_chain kernel/locking/lockdep.c:3788 [inline] __lock_acquire.cold+0x149/0x3ab kernel/locking/lockdep.c:5027 lock_acquire kernel/locking/lockdep.c:5639 [inline] lock_acquire+0x1ab/0x510 kernel/locking/lockdep.c:5604 __raw_write_lock_bh include/linux/rwlock_api_smp.h:202 [inline] _raw_write_lock_bh+0x2f/0x40 kernel/locking/spinlock.c:334 ___neigh_create+0x9e1/0x2990 net/core/neighbour.c:652 ip6_finish_output2+0x1070/0x14f0 net/ipv6/ip6_output.c:123 __ip6_finish_output net/ipv6/ip6_output.c:191 [inline] __ip6_finish_output+0x61e/0xe90 net/ipv6/ip6_output.c:170 ip6_finish_output+0x32/0x200 net/ipv6/ip6_output.c:201 NF_HOOK_COND include/linux/netfilter.h:296 [inline] ip6_output+0x1e4/0x530 net/ipv6/ip6_output.c:224 dst_output include/net/dst.h:451 [inline] NF_HOOK include/linux/netfilter.h:307 [inline] ndisc_send_skb+0xa99/0x17f0 net/ipv6/ndisc.c:508 ndisc_send_ns+0x3a9/0x840 net/ipv6/ndisc.c:650 ndisc_solicit+0x2cd/0x4f0 net/ipv6/ndisc.c:742 neigh_probe+0xc2/0x110 net/core/neighbour.c:1040 __neigh_event_send+0x37d/0x1570 net/core/neighbour.c:1201 neigh_event_send include/net/neighbour.h:470 [inline] neigh_managed_work+0x162/0x250 net/core/neighbour.c:1574 process_one_work+0x9ac/0x1650 kernel/workqueue.c:2307 worker_thread+0x657/0x1110 kernel/workqueue.c:2454 kthread+0x2e9/0x3a0 kernel/kthread.c:377 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:295 Fixes: 7482e3841d52 ("net, neigh: Add NTF_MANAGED flag for managed neighbor entries") Reported-by: syzbot+5239d0e1778a500d477a@syzkaller.appspotmail.com Signed-off-by: Daniel Borkmann Cc: Eric Dumazet Cc: Roopa Prabhu Tested-by: syzbot+5239d0e1778a500d477a@syzkaller.appspotmail.com Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20220201193942.5055-1-daniel@iogearbox.net Signed-off-by: Jakub Kicinski --- include/net/neighbour.h | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) (limited to 'include/net') diff --git a/include/net/neighbour.h b/include/net/neighbour.h index 937389e04c8e..87419f7f5421 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -350,7 +350,8 @@ static inline struct neighbour *neigh_create(struct neigh_table *tbl, return __neigh_create(tbl, pkey, dev, true); } void neigh_destroy(struct neighbour *neigh); -int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb); +int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb, + const bool immediate_ok); int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, u32 flags, u32 nlmsg_pid); void __neigh_set_probe_once(struct neighbour *neigh); @@ -460,17 +461,24 @@ static inline struct neighbour * neigh_clone(struct neighbour *neigh) #define neigh_hold(n) refcount_inc(&(n)->refcnt) -static inline int neigh_event_send(struct neighbour *neigh, struct sk_buff *skb) +static __always_inline int neigh_event_send_probe(struct neighbour *neigh, + struct sk_buff *skb, + const bool immediate_ok) { unsigned long now = jiffies; - + if (READ_ONCE(neigh->used) != now) WRITE_ONCE(neigh->used, now); - if (!(neigh->nud_state&(NUD_CONNECTED|NUD_DELAY|NUD_PROBE))) - return __neigh_event_send(neigh, skb); + if (!(neigh->nud_state & (NUD_CONNECTED | NUD_DELAY | NUD_PROBE))) + return __neigh_event_send(neigh, skb, immediate_ok); return 0; } +static inline int neigh_event_send(struct neighbour *neigh, struct sk_buff *skb) +{ + return neigh_event_send_probe(neigh, skb, true); +} + #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) static inline int neigh_hh_bridge(struct hh_cache *hh, struct sk_buff *skb) { -- cgit v1.2.3 From 52cc6ffc0ab2c61a76127b9347567fc97c15582f Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Mon, 31 Jan 2022 08:40:01 -0800 Subject: page_pool: Refactor page_pool to enable fragmenting after allocation This change is meant to permit a driver to perform "fragmenting" of the page from within the driver instead of the current model which requires pre-partitioning the page. The main motivation behind this is to support use cases where the page will be split up by the driver after DMA instead of before. With this change it becomes possible to start using page pool to replace some of the existing use cases where multiple references were being used for a single page, but the number needed was unknown as the size could be dynamic. For example, with this code it would be possible to do something like the following to handle allocation: page = page_pool_alloc_pages(); if (!page) return NULL; page_pool_fragment_page(page, DRIVER_PAGECNT_BIAS_MAX); rx_buf->page = page; rx_buf->pagecnt_bias = DRIVER_PAGECNT_BIAS_MAX; Then we would process a received buffer by handling it with: rx_buf->pagecnt_bias--; Once the page has been fully consumed we could then flush the remaining instances with: if (page_pool_defrag_page(page, rx_buf->pagecnt_bias)) continue; page_pool_put_defragged_page(pool, page -1, !!budget); The general idea is that we want to have the ability to allocate a page with excess fragment count and then trim off the unneeded fragments. Signed-off-by: Alexander Duyck Reviewed-by: Ilias Apalodimas Signed-off-by: David S. Miller --- include/net/page_pool.h | 82 +++++++++++++++++++++++++++++++------------------ 1 file changed, 52 insertions(+), 30 deletions(-) (limited to 'include/net') diff --git a/include/net/page_pool.h b/include/net/page_pool.h index 79a805542d0f..97c3c19872ff 100644 --- a/include/net/page_pool.h +++ b/include/net/page_pool.h @@ -201,21 +201,67 @@ static inline void page_pool_put_page_bulk(struct page_pool *pool, void **data, } #endif -void page_pool_put_page(struct page_pool *pool, struct page *page, - unsigned int dma_sync_size, bool allow_direct); +void page_pool_put_defragged_page(struct page_pool *pool, struct page *page, + unsigned int dma_sync_size, + bool allow_direct); -/* Same as above but will try to sync the entire area pool->max_len */ -static inline void page_pool_put_full_page(struct page_pool *pool, - struct page *page, bool allow_direct) +static inline void page_pool_fragment_page(struct page *page, long nr) +{ + atomic_long_set(&page->pp_frag_count, nr); +} + +static inline long page_pool_defrag_page(struct page *page, long nr) +{ + long ret; + + /* If nr == pp_frag_count then we have cleared all remaining + * references to the page. No need to actually overwrite it, instead + * we can leave this to be overwritten by the calling function. + * + * The main advantage to doing this is that an atomic_read is + * generally a much cheaper operation than an atomic update, + * especially when dealing with a page that may be partitioned + * into only 2 or 3 pieces. + */ + if (atomic_long_read(&page->pp_frag_count) == nr) + return 0; + + ret = atomic_long_sub_return(nr, &page->pp_frag_count); + WARN_ON(ret < 0); + return ret; +} + +static inline bool page_pool_is_last_frag(struct page_pool *pool, + struct page *page) +{ + /* If fragments aren't enabled or count is 0 we were the last user */ + return !(pool->p.flags & PP_FLAG_PAGE_FRAG) || + (page_pool_defrag_page(page, 1) == 0); +} + +static inline void page_pool_put_page(struct page_pool *pool, + struct page *page, + unsigned int dma_sync_size, + bool allow_direct) { /* When page_pool isn't compiled-in, net/core/xdp.c doesn't * allow registering MEM_TYPE_PAGE_POOL, but shield linker. */ #ifdef CONFIG_PAGE_POOL - page_pool_put_page(pool, page, -1, allow_direct); + if (!page_pool_is_last_frag(pool, page)) + return; + + page_pool_put_defragged_page(pool, page, dma_sync_size, allow_direct); #endif } +/* Same as above but will try to sync the entire area pool->max_len */ +static inline void page_pool_put_full_page(struct page_pool *pool, + struct page *page, bool allow_direct) +{ + page_pool_put_page(pool, page, -1, allow_direct); +} + /* Same as above but the caller must guarantee safe context. e.g NAPI */ static inline void page_pool_recycle_direct(struct page_pool *pool, struct page *page) @@ -243,30 +289,6 @@ static inline void page_pool_set_dma_addr(struct page *page, dma_addr_t addr) page->dma_addr_upper = upper_32_bits(addr); } -static inline void page_pool_set_frag_count(struct page *page, long nr) -{ - atomic_long_set(&page->pp_frag_count, nr); -} - -static inline long page_pool_atomic_sub_frag_count_return(struct page *page, - long nr) -{ - long ret; - - /* As suggested by Alexander, atomic_long_read() may cover up the - * reference count errors, so avoid calling atomic_long_read() in - * the cases of freeing or draining the page_frags, where we would - * not expect it to match or that are slowpath anyway. - */ - if (__builtin_constant_p(nr) && - atomic_long_read(&page->pp_frag_count) == nr) - return 0; - - ret = atomic_long_sub_return(nr, &page->pp_frag_count); - WARN_ON(ret < 0); - return ret; -} - static inline bool is_page_pool_compiled_in(void) { #ifdef CONFIG_PAGE_POOL -- cgit v1.2.3 From 7af4a361a62f59b44ec301f21090823365dd0244 Mon Sep 17 00:00:00 2001 From: Tobias Waldekranz Date: Thu, 3 Feb 2022 11:16:53 +0100 Subject: net: dsa: mv88e6xxx: Improve isolation of standalone ports Clear MapDA on standalone ports to bypass any ATU lookup that might point the packet in the wrong direction. This means that all packets are flooded using the PVT config. So make sure that standalone ports are only allowed to communicate with the local upstream port. Here is a scenario in which this is needed: CPU | .----. .---0---. | .--0--. | sw0 | | | sw1 | '-1-2-3-' | '-1-2-' '---' - sw0p1 and sw1p1 are bridged - sw0p2 and sw1p2 are in standalone mode - Learning must be enabled on sw0p3 in order for hardware forwarding to work properly between bridged ports 1. A packet with SA :aa comes in on sw1p2 1a. Egresses sw1p0 1b. Ingresses sw0p3, ATU adds an entry for :aa towards port 3 1c. Egresses sw0p0 2. A packet with DA :aa comes in on sw0p2 2a. If an ATU lookup is done at this point, the packet will be incorrectly forwarded towards sw0p3. With this change in place, the ATU is bypassed and the packet is forwarded in accordance with the PVT, which only contains the CPU port. Signed-off-by: Tobias Waldekranz Reviewed-by: Vladimir Oltean Signed-off-by: David S. Miller --- include/net/dsa.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/net') diff --git a/include/net/dsa.h b/include/net/dsa.h index 43c4153ef53a..6e5ef62a7dce 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -591,6 +591,18 @@ static inline bool dsa_is_upstream_port(struct dsa_switch *ds, int port) return port == dsa_upstream_port(ds, port); } +/* Return the local port used to reach the CPU port */ +static inline unsigned int dsa_switch_upstream_port(struct dsa_switch *ds) +{ + struct dsa_port *dp; + + dsa_switch_for_each_available_port(dp, ds) { + return dsa_upstream_port(ds, dp->index); + } + + return ds->num_ports; +} + /* Return true if @upstream_ds is an upstream switch of @downstream_ds, meaning * that the routing port from @downstream_ds to @upstream_ds is also the port * which @downstream_ds uses to reach its dedicated CPU. -- cgit v1.2.3 From d352b20f4174a6bd998992329b773ab513232880 Mon Sep 17 00:00:00 2001 From: Tobias Waldekranz Date: Thu, 3 Feb 2022 11:16:56 +0100 Subject: net: dsa: mv88e6xxx: Improve multichip isolation of standalone ports Given that standalone ports are now configured to bypass the ATU and forward all frames towards the upstream port, extend the ATU bypass to multichip systems. Load VID 0 (standalone) into the VTU with the policy bit set. Since VID 4095 (bridged) is already loaded, we now know that all VIDs in use are always available in all VTUs. Therefore, we can safely enable 802.1Q on DSA ports. Setting the DSA ports' VTU policy to TRAP means that all incoming frames on VID 0 will be classified as MGMT - as a result, the ATU is bypassed on all subsequent switches. With this isolation in place, we are able to support configurations that are simultaneously very quirky and very useful. Quirky because it involves looping cables between local switchports like in this example: CPU | .------. .---0---. | .----0----. | sw0 | | | sw1 | '-1-2-3-' | '-1-2-3-4-' $ @ '---' $ @ % % We have three physically looped pairs ($, @, and %). This is very useful because it allows us to run the kernel's kselftests for the bridge on mv88e6xxx hardware. Signed-off-by: Tobias Waldekranz Reviewed-by: Vladimir Oltean Signed-off-by: David S. Miller --- include/net/dsa.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/net') diff --git a/include/net/dsa.h b/include/net/dsa.h index 6e5ef62a7dce..ca8c14b547b4 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -591,6 +591,12 @@ static inline bool dsa_is_upstream_port(struct dsa_switch *ds, int port) return port == dsa_upstream_port(ds, port); } +/* Return true if this is a DSA port leading away from the CPU */ +static inline bool dsa_is_downstream_port(struct dsa_switch *ds, int port) +{ + return dsa_is_dsa_port(ds, port) && !dsa_is_upstream_port(ds, port); +} + /* Return the local port used to reach the CPU port */ static inline unsigned int dsa_switch_upstream_port(struct dsa_switch *ds) { -- cgit v1.2.3 From 87563a043cef044fed5db7967a75741cc16ad2b1 Mon Sep 17 00:00:00 2001 From: Duoming Zhou Date: Thu, 3 Feb 2022 23:08:11 +0800 Subject: ax25: fix reference count leaks of ax25_dev The previous commit d01ffb9eee4a ("ax25: add refcount in ax25_dev to avoid UAF bugs") introduces refcount into ax25_dev, but there are reference leak paths in ax25_ctl_ioctl(), ax25_fwd_ioctl(), ax25_rt_add(), ax25_rt_del() and ax25_rt_opt(). This patch uses ax25_dev_put() and adjusts the position of ax25_addr_ax25dev() to fix reference cout leaks of ax25_dev. Fixes: d01ffb9eee4a ("ax25: add refcount in ax25_dev to avoid UAF bugs") Signed-off-by: Duoming Zhou Reviewed-by: Dan Carpenter Link: https://lore.kernel.org/r/20220203150811.42256-1-duoming@zju.edu.cn Signed-off-by: Jakub Kicinski --- include/net/ax25.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'include/net') diff --git a/include/net/ax25.h b/include/net/ax25.h index 50b417df6221..8221af1811df 100644 --- a/include/net/ax25.h +++ b/include/net/ax25.h @@ -294,10 +294,12 @@ static __inline__ void ax25_cb_put(ax25_cb *ax25) } } -#define ax25_dev_hold(__ax25_dev) \ - refcount_inc(&((__ax25_dev)->refcount)) +static inline void ax25_dev_hold(ax25_dev *ax25_dev) +{ + refcount_inc(&ax25_dev->refcount); +} -static __inline__ void ax25_dev_put(ax25_dev *ax25_dev) +static inline void ax25_dev_put(ax25_dev *ax25_dev) { if (refcount_dec_and_test(&ax25_dev->refcount)) { kfree(ax25_dev); -- cgit v1.2.3 From bb62a765b1b5597d32a426096aa78d2a8eb6b091 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 20 Jan 2022 13:06:59 +0100 Subject: netfilter: conntrack: make all extensions 8-byte alignned All extensions except one need 8 byte alignment, so just make that the default. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_extend.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_conntrack_extend.h b/include/net/netfilter/nf_conntrack_extend.h index c7515d82ab06..705a4487f023 100644 --- a/include/net/netfilter/nf_conntrack_extend.h +++ b/include/net/netfilter/nf_conntrack_extend.h @@ -49,7 +49,7 @@ enum nf_ct_ext_id { struct nf_ct_ext { u8 offset[NF_CT_EXT_NUM]; u8 len; - char data[]; + char data[] __aligned(8); }; static inline bool __nf_ct_ext_exist(const struct nf_ct_ext *ext, u8 id) @@ -83,10 +83,7 @@ struct nf_ct_ext_type { void (*destroy)(struct nf_conn *ct); enum nf_ct_ext_id id; - - /* Length and min alignment. */ u8 len; - u8 align; }; int nf_ct_extend_register(const struct nf_ct_ext_type *type); -- cgit v1.2.3 From 5f31edc0676b55cd6baf5ba119d1881f3fbadee1 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 20 Jan 2022 13:07:00 +0100 Subject: netfilter: conntrack: move extension sizes into core No need to specify this in the registration modules, we already collect all sizes for build-time checks on the maximum combined size. After this change, all extensions except nat have no meaningful content in their nf_ct_ext_type struct definition. Next patch handles nat, this will then allow to remove the dynamic register api completely. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_extend.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_conntrack_extend.h b/include/net/netfilter/nf_conntrack_extend.h index 705a4487f023..87d818414092 100644 --- a/include/net/netfilter/nf_conntrack_extend.h +++ b/include/net/netfilter/nf_conntrack_extend.h @@ -83,7 +83,6 @@ struct nf_ct_ext_type { void (*destroy)(struct nf_conn *ct); enum nf_ct_ext_id id; - u8 len; }; int nf_ct_extend_register(const struct nf_ct_ext_type *type); -- cgit v1.2.3 From 1bc91a5ddf3eaea0e0ea957cccf3abdcfcace00e Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 20 Jan 2022 13:07:01 +0100 Subject: netfilter: conntrack: handle ->destroy hook via nat_ops instead The nat module already exposes a few functions to the conntrack core. Move the nat extension destroy hook to it. After this, no conntrack extension needs a destroy hook. 'struct nf_ct_ext_type' and the register/unregister api can be removed in a followup patch. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_extend.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_conntrack_extend.h b/include/net/netfilter/nf_conntrack_extend.h index 87d818414092..343f9194423a 100644 --- a/include/net/netfilter/nf_conntrack_extend.h +++ b/include/net/netfilter/nf_conntrack_extend.h @@ -79,9 +79,6 @@ void nf_ct_ext_destroy(struct nf_conn *ct); void *nf_ct_ext_add(struct nf_conn *ct, enum nf_ct_ext_id id, gfp_t gfp); struct nf_ct_ext_type { - /* Destroys relationships (can be NULL). */ - void (*destroy)(struct nf_conn *ct); - enum nf_ct_ext_id id; }; -- cgit v1.2.3 From 1015c3de23eedb8ac5a163165434484df44cfe00 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 20 Jan 2022 13:07:02 +0100 Subject: netfilter: conntrack: remove extension register api These no longer register/unregister a meaningful structure so remove it. Cc: Paul Blakey Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_acct.h | 1 - include/net/netfilter/nf_conntrack_ecache.h | 13 ------------- include/net/netfilter/nf_conntrack_extend.h | 9 --------- include/net/netfilter/nf_conntrack_labels.h | 3 --- include/net/netfilter/nf_conntrack_seqadj.h | 3 --- include/net/netfilter/nf_conntrack_timeout.h | 12 ------------ include/net/netfilter/nf_conntrack_timestamp.h | 13 ------------- 7 files changed, 54 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_conntrack_acct.h b/include/net/netfilter/nf_conntrack_acct.h index 7f44a771530e..4b2b7f8914ea 100644 --- a/include/net/netfilter/nf_conntrack_acct.h +++ b/include/net/netfilter/nf_conntrack_acct.h @@ -78,7 +78,6 @@ static inline void nf_ct_acct_update(struct nf_conn *ct, u32 dir, void nf_conntrack_acct_pernet_init(struct net *net); -int nf_conntrack_acct_init(void); void nf_conntrack_acct_fini(void); #endif /* _NF_CONNTRACK_ACCT_H */ diff --git a/include/net/netfilter/nf_conntrack_ecache.h b/include/net/netfilter/nf_conntrack_ecache.h index d932e22edcb4..16bcff809b18 100644 --- a/include/net/netfilter/nf_conntrack_ecache.h +++ b/include/net/netfilter/nf_conntrack_ecache.h @@ -166,9 +166,6 @@ void nf_conntrack_ecache_work(struct net *net, enum nf_ct_ecache_state state); void nf_conntrack_ecache_pernet_init(struct net *net); void nf_conntrack_ecache_pernet_fini(struct net *net); -int nf_conntrack_ecache_init(void); -void nf_conntrack_ecache_fini(void); - static inline bool nf_conntrack_ecache_dwork_pending(const struct net *net) { return net->ct.ecache_dwork_pending; @@ -194,16 +191,6 @@ static inline void nf_conntrack_ecache_pernet_init(struct net *net) static inline void nf_conntrack_ecache_pernet_fini(struct net *net) { } - -static inline int nf_conntrack_ecache_init(void) -{ - return 0; -} - -static inline void nf_conntrack_ecache_fini(void) -{ -} - static inline bool nf_conntrack_ecache_dwork_pending(const struct net *net) { return false; } #endif /* CONFIG_NF_CONNTRACK_EVENTS */ #endif /*_NF_CONNTRACK_ECACHE_H*/ diff --git a/include/net/netfilter/nf_conntrack_extend.h b/include/net/netfilter/nf_conntrack_extend.h index 343f9194423a..96635ad2acc7 100644 --- a/include/net/netfilter/nf_conntrack_extend.h +++ b/include/net/netfilter/nf_conntrack_extend.h @@ -72,16 +72,7 @@ static inline void *__nf_ct_ext_find(const struct nf_conn *ct, u8 id) #define nf_ct_ext_find(ext, id) \ ((id##_TYPE *)__nf_ct_ext_find((ext), (id))) -/* Destroy all relationships */ -void nf_ct_ext_destroy(struct nf_conn *ct); - /* Add this type, returns pointer to data or NULL. */ void *nf_ct_ext_add(struct nf_conn *ct, enum nf_ct_ext_id id, gfp_t gfp); -struct nf_ct_ext_type { - enum nf_ct_ext_id id; -}; - -int nf_ct_extend_register(const struct nf_ct_ext_type *type); -void nf_ct_extend_unregister(const struct nf_ct_ext_type *type); #endif /* _NF_CONNTRACK_EXTEND_H */ diff --git a/include/net/netfilter/nf_conntrack_labels.h b/include/net/netfilter/nf_conntrack_labels.h index ba916411c4e1..3c23298e68ca 100644 --- a/include/net/netfilter/nf_conntrack_labels.h +++ b/include/net/netfilter/nf_conntrack_labels.h @@ -45,12 +45,9 @@ int nf_connlabels_replace(struct nf_conn *ct, #ifdef CONFIG_NF_CONNTRACK_LABELS int nf_conntrack_labels_init(void); -void nf_conntrack_labels_fini(void); int nf_connlabels_get(struct net *net, unsigned int bit); void nf_connlabels_put(struct net *net); #else -static inline int nf_conntrack_labels_init(void) { return 0; } -static inline void nf_conntrack_labels_fini(void) {} static inline int nf_connlabels_get(struct net *net, unsigned int bit) { return 0; } static inline void nf_connlabels_put(struct net *net) {} #endif diff --git a/include/net/netfilter/nf_conntrack_seqadj.h b/include/net/netfilter/nf_conntrack_seqadj.h index 0a10b50537ae..883c414b768e 100644 --- a/include/net/netfilter/nf_conntrack_seqadj.h +++ b/include/net/netfilter/nf_conntrack_seqadj.h @@ -42,7 +42,4 @@ int nf_ct_seq_adjust(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, unsigned int protoff); s32 nf_ct_seq_offset(const struct nf_conn *ct, enum ip_conntrack_dir, u32 seq); -int nf_conntrack_seqadj_init(void); -void nf_conntrack_seqadj_fini(void); - #endif /* _NF_CONNTRACK_SEQADJ_H */ diff --git a/include/net/netfilter/nf_conntrack_timeout.h b/include/net/netfilter/nf_conntrack_timeout.h index 659b0ea25b4d..db507e4a65bb 100644 --- a/include/net/netfilter/nf_conntrack_timeout.h +++ b/include/net/netfilter/nf_conntrack_timeout.h @@ -89,23 +89,11 @@ static inline unsigned int *nf_ct_timeout_lookup(const struct nf_conn *ct) } #ifdef CONFIG_NF_CONNTRACK_TIMEOUT -int nf_conntrack_timeout_init(void); -void nf_conntrack_timeout_fini(void); void nf_ct_untimeout(struct net *net, struct nf_ct_timeout *timeout); int nf_ct_set_timeout(struct net *net, struct nf_conn *ct, u8 l3num, u8 l4num, const char *timeout_name); void nf_ct_destroy_timeout(struct nf_conn *ct); #else -static inline int nf_conntrack_timeout_init(void) -{ - return 0; -} - -static inline void nf_conntrack_timeout_fini(void) -{ - return; -} - static inline int nf_ct_set_timeout(struct net *net, struct nf_conn *ct, u8 l3num, u8 l4num, const char *timeout_name) diff --git a/include/net/netfilter/nf_conntrack_timestamp.h b/include/net/netfilter/nf_conntrack_timestamp.h index 820ea34b6029..57138d974a9f 100644 --- a/include/net/netfilter/nf_conntrack_timestamp.h +++ b/include/net/netfilter/nf_conntrack_timestamp.h @@ -40,21 +40,8 @@ struct nf_conn_tstamp *nf_ct_tstamp_ext_add(struct nf_conn *ct, gfp_t gfp) #ifdef CONFIG_NF_CONNTRACK_TIMESTAMP void nf_conntrack_tstamp_pernet_init(struct net *net); - -int nf_conntrack_tstamp_init(void); -void nf_conntrack_tstamp_fini(void); #else static inline void nf_conntrack_tstamp_pernet_init(struct net *net) {} - -static inline int nf_conntrack_tstamp_init(void) -{ - return 0; -} - -static inline void nf_conntrack_tstamp_fini(void) -{ - return; -} #endif /* CONFIG_NF_CONNTRACK_TIMESTAMP */ #endif /* _NF_CONNTRACK_TSTAMP_H */ -- cgit v1.2.3 From 7e367b06f16b3d22e884af55117601d2a73fca03 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 2 Feb 2022 10:49:39 +0200 Subject: cfg80211: fix -Wcast-qual warnings When enabling -Wcast-qual e.g. via W=3, we get a lot of warnings from this file, whenever it's included. Since the fixes are simple, just do that. Signed-off-by: Johannes Berg Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20220202104617.6a1d52213019.I92d82e7251cf712faa43fd09db3142327a3bce3d@changeid Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include/net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index d19e48f9fdc6..f6db085ff3df 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -2604,7 +2604,7 @@ const struct element *ieee80211_bss_get_elem(struct cfg80211_bss *bss, u8 id); */ static inline const u8 *ieee80211_bss_get_ie(struct cfg80211_bss *bss, u8 id) { - return (void *)ieee80211_bss_get_elem(bss, id); + return (const void *)ieee80211_bss_get_elem(bss, id); } @@ -5970,9 +5970,9 @@ cfg80211_find_ie_match(u8 eid, const u8 *ies, unsigned int len, (!match_len && match_offset))) return NULL; - return (void *)cfg80211_find_elem_match(eid, ies, len, - match, match_len, - match_offset ? + return (const void *)cfg80211_find_elem_match(eid, ies, len, + match, match_len, + match_offset ? match_offset - 2 : 0); } @@ -6099,7 +6099,7 @@ static inline const u8 * cfg80211_find_vendor_ie(unsigned int oui, int oui_type, const u8 *ies, unsigned int len) { - return (void *)cfg80211_find_vendor_elem(oui, oui_type, ies, len); + return (const void *)cfg80211_find_vendor_elem(oui, oui_type, ies, len); } /** -- cgit v1.2.3 From 5beb53d6ba4f39743d057e756db22bd8c079fc21 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 2 Feb 2022 10:49:40 +0200 Subject: ieee80211: radiotap: fix -Wcast-qual warnings When enabling -Wcast-qual e.g. via W=3, we get a lot of warnings from this file, whenever it's included. Since the fixes are simple, just do that. Signed-off-by: Johannes Berg Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20220202104617.cc733aeb1a18.I03396e1bf7a1af364cbd0916037f65d800035039@changeid Signed-off-by: Johannes Berg --- include/net/ieee80211_radiotap.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/ieee80211_radiotap.h b/include/net/ieee80211_radiotap.h index 11630351c978..598f53d2a3a0 100644 --- a/include/net/ieee80211_radiotap.h +++ b/include/net/ieee80211_radiotap.h @@ -1,6 +1,6 @@ /* * Copyright (c) 2017 Intel Deutschland GmbH - * Copyright (c) 2018-2019 Intel Corporation + * Copyright (c) 2018-2019, 2021 Intel Corporation * * Permission to use, copy, modify, and/or distribute this software for any * purpose with or without fee is hereby granted, provided that the above @@ -365,7 +365,7 @@ enum ieee80211_radiotap_zero_len_psdu_type { */ static inline u16 ieee80211_get_radiotap_len(const char *data) { - struct ieee80211_radiotap_header *hdr = (void *)data; + const struct ieee80211_radiotap_header *hdr = (const void *)data; return get_unaligned_le16(&hdr->it_len); } -- cgit v1.2.3 From ea5907db2a9ccf37fdb6d1e67bcb620c1fea10f8 Mon Sep 17 00:00:00 2001 From: Avraham Stern Date: Wed, 2 Feb 2022 10:49:47 +0200 Subject: mac80211: fix struct ieee80211_tx_info size The size of the status_driver_data field was not adjusted when the is_valid_ack_signal field was added. Since the size of struct ieee80211_tx_info is limited, replace the is_valid_ack_signal field with a flags field, and adjust the struct size accordingly. Signed-off-by: Avraham Stern Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20220202104617.0ff363d4fa56.I45792c0187034a6d0e1c99a7db741996ef7caba3@changeid Signed-off-by: Johannes Berg --- include/net/mac80211.h | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) (limited to 'include/net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index c50221d7e82c..bd6912d0292b 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -7,7 +7,7 @@ * Copyright 2007-2010 Johannes Berg * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright (C) 2015 - 2017 Intel Deutschland GmbH - * Copyright (C) 2018 - 2021 Intel Corporation + * Copyright (C) 2018 - 2022 Intel Corporation */ #ifndef MAC80211_H @@ -883,6 +883,17 @@ enum mac80211_tx_control_flags { IEEE80211_TX_CTRL_DONT_REORDER = BIT(8), }; +/** + * enum mac80211_tx_status_flags - flags to describe transmit status + * + * @IEEE80211_TX_STATUS_ACK_SIGNAL_VALID: ACK signal is valid + * + * These flags are used in tx_info->status.flags. + */ +enum mac80211_tx_status_flags { + IEEE80211_TX_STATUS_ACK_SIGNAL_VALID = BIT(0), +}; + /* * This definition is used as a mask to clear all temporary flags, which are * set by the tx handlers for each transmission attempt by the mac80211 stack. @@ -1046,7 +1057,7 @@ ieee80211_rate_get_vht_nss(const struct ieee80211_tx_rate *rate) * @status.antenna: (legacy, kept only for iwlegacy) * @status.tx_time: airtime consumed for transmission; note this is only * used for WMM AC, not for airtime fairness - * @status.is_valid_ack_signal: ACK signal is valid + * @status.flags: status flags, see &enum mac80211_tx_status_flags * @status.status_driver_data: driver use area * @ack: union part for pure ACK data * @ack.cookie: cookie for the ACK @@ -1099,8 +1110,8 @@ struct ieee80211_tx_info { u8 ampdu_len; u8 antenna; u16 tx_time; - bool is_valid_ack_signal; - void *status_driver_data[19 / sizeof(void *)]; + u8 flags; + void *status_driver_data[18 / sizeof(void *)]; } status; struct { struct ieee80211_tx_rate driver_rates[ -- cgit v1.2.3 From c78b8b20e34920231eda02fb40c7aca7d88be837 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 3 Feb 2022 15:12:40 -0800 Subject: net: don't include ndisc.h from ipv6.h Nothing in ipv6.h needs ndisc.h, drop it. Link: https://lore.kernel.org/r/20220203043457.2222388-1-kuba@kernel.org Acked-by: Jeremy Kerr Acked-by: Stefan Schmidt Link: https://lore.kernel.org/r/20220203231240.2297588-1-kuba@kernel.org Reviewed-by: David Ahern Signed-off-by: Jakub Kicinski --- include/net/ipv6.h | 1 - include/net/ipv6_frag.h | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 082f30256f59..cda1f205f391 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -15,7 +15,6 @@ #include #include #include -#include #include #include #include diff --git a/include/net/ipv6_frag.h b/include/net/ipv6_frag.h index 0a4779175a52..5052c66e22d2 100644 --- a/include/net/ipv6_frag.h +++ b/include/net/ipv6_frag.h @@ -1,6 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _IPV6_FRAG_H #define _IPV6_FRAG_H +#include #include #include #include -- cgit v1.2.3 From 35d39fecbc242150af5587506e58ec1f8541fb68 Mon Sep 17 00:00:00 2001 From: Paul Blakey Date: Thu, 3 Feb 2022 10:44:30 +0200 Subject: net/sched: Enable tc skb ext allocation on chain miss only when needed Currently tc skb extension is used to send miss info from tc to ovs datapath module, and driver to tc. For the tc to ovs miss it is currently always allocated even if it will not be used by ovs datapath (as it depends on a requested feature). Export the static key which is used by openvswitch module to guard this code path as well, so it will be skipped if ovs datapath doesn't need it. Enable this code path once ovs datapath needs it. Signed-off-by: Paul Blakey Reviewed-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/net') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 676cb8ea9e15..a3b57a93228a 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -1028,4 +1028,15 @@ struct tc_fifo_qopt_offload { }; }; +#ifdef CONFIG_NET_CLS_ACT +DECLARE_STATIC_KEY_FALSE(tc_skb_ext_tc); +void tc_skb_ext_tc_enable(void); +void tc_skb_ext_tc_disable(void); +#define tc_skb_ext_tc_enabled() static_branch_unlikely(&tc_skb_ext_tc) +#else /* CONFIG_NET_CLS_ACT */ +static inline void tc_skb_ext_tc_enable(void) { } +static inline void tc_skb_ext_tc_disable(void) { } +#define tc_skb_ext_tc_enabled() false +#endif + #endif -- cgit v1.2.3 From de5a1f3ce4c862150dc442530dba19e1d1dc6bc2 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 4 Feb 2022 12:28:37 +0100 Subject: net: gro: minor optimization for dev_gro_receive() While inspecting some perf report, I noticed that the compiler emits suboptimal code for the napi CB initialization, fetching and storing multiple times the memory for flags bitfield. This is with gcc 10.3.1, but I observed the same with older compiler versions. We can help the compiler to do a nicer work clearing several fields at once using an u32 alias. The generated code is quite smaller, with the same number of conditional. Before: objdump -t net/core/gro.o | grep " F .text" 0000000000000bb0 l F .text 0000000000000357 dev_gro_receive After: 0000000000000bb0 l F .text 000000000000033c dev_gro_receive v1 -> v2: - use struct_group (Alexander and Alex) RFC -> v1: - use __struct_group to delimit the zeroed area (Alexander) Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- include/net/gro.h | 52 ++++++++++++++++++++++++++++------------------------ 1 file changed, 28 insertions(+), 24 deletions(-) (limited to 'include/net') diff --git a/include/net/gro.h b/include/net/gro.h index 8f75802d50fd..a765fedda5c4 100644 --- a/include/net/gro.h +++ b/include/net/gro.h @@ -29,46 +29,50 @@ struct napi_gro_cb { /* Number of segments aggregated. */ u16 count; - /* Start offset for remote checksum offload */ - u16 gro_remcsum_start; + /* Used in ipv6_gro_receive() and foo-over-udp */ + u16 proto; /* jiffies when first packet was created/queued */ unsigned long age; - /* Used in ipv6_gro_receive() and foo-over-udp */ - u16 proto; + /* portion of the cb set to zero at every gro iteration */ + struct_group(zeroed, + + /* Start offset for remote checksum offload */ + u16 gro_remcsum_start; - /* This is non-zero if the packet may be of the same flow. */ - u8 same_flow:1; + /* This is non-zero if the packet may be of the same flow. */ + u8 same_flow:1; - /* Used in tunnel GRO receive */ - u8 encap_mark:1; + /* Used in tunnel GRO receive */ + u8 encap_mark:1; - /* GRO checksum is valid */ - u8 csum_valid:1; + /* GRO checksum is valid */ + u8 csum_valid:1; - /* Number of checksums via CHECKSUM_UNNECESSARY */ - u8 csum_cnt:3; + /* Number of checksums via CHECKSUM_UNNECESSARY */ + u8 csum_cnt:3; - /* Free the skb? */ - u8 free:2; + /* Free the skb? */ + u8 free:2; #define NAPI_GRO_FREE 1 #define NAPI_GRO_FREE_STOLEN_HEAD 2 - /* Used in foo-over-udp, set in udp[46]_gro_receive */ - u8 is_ipv6:1; + /* Used in foo-over-udp, set in udp[46]_gro_receive */ + u8 is_ipv6:1; - /* Used in GRE, set in fou/gue_gro_receive */ - u8 is_fou:1; + /* Used in GRE, set in fou/gue_gro_receive */ + u8 is_fou:1; - /* Used to determine if flush_id can be ignored */ - u8 is_atomic:1; + /* Used to determine if flush_id can be ignored */ + u8 is_atomic:1; - /* Number of gro_receive callbacks this packet already went through */ - u8 recursion_counter:4; + /* Number of gro_receive callbacks this packet already went through */ + u8 recursion_counter:4; - /* GRO is done by frag_list pointer chaining. */ - u8 is_flist:1; + /* GRO is done by frag_list pointer chaining. */ + u8 is_flist:1; + ); /* used to support CHECKSUM_COMPLETE for tunneling protocols */ __wsum csum; -- cgit v1.2.3 From 9c1be1935fb68b2413796cdc03d019b8cf35ab51 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 5 Feb 2022 09:01:25 -0800 Subject: net: initialize init_net earlier While testing a patch that will follow later ("net: add netns refcount tracker to struct nsproxy") I found that devtmpfs_init() was called before init_net was initialized. This is a bug, because devtmpfs_setup() calls ksys_unshare(CLONE_NEWNS); This has the effect of increasing init_net refcount, which will be later overwritten to 1, as part of setup_net(&init_net) We had too many prior patches [1] trying to work around the root cause. Really, make sure init_net is in BSS section, and that net_ns_init() is called earlier at boot time. Note that another patch ("vfs: add netns refcount tracker to struct fs_context") also will need net_ns_init() being called before vfs_caches_init() As a bonus, this patch saves around 4KB in .data section. [1] f8c46cb39079 ("netns: do not call pernet ops for not yet set up init_net namespace") b5082df8019a ("net: Initialise init_net.count to 1") 734b65417b24 ("net: Statically initialize init_net.dev_base_head") v2: fixed a build error reported by kernel build bots (CONFIG_NET=n) Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/net_namespace.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/net') diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 5b61c462e534..374cc7b260fc 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -513,4 +513,10 @@ static inline void fnhe_genid_bump(struct net *net) atomic_inc(&net->fnhe_genid); } +#ifdef CONFIG_NET +void net_ns_init(void); +#else +static inline void net_ns_init(void) {} +#endif + #endif /* __NET_NET_NAMESPACE_H */ -- cgit v1.2.3 From a410a0cf98854a698a519bfbeb604145da384c0e Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Fri, 4 Feb 2022 14:58:11 +0100 Subject: ipv6: Define dscp_t and stop taking ECN bits into account in fib6-rules MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Define a dscp_t type and its appropriate helpers that ensure ECN bits are not taken into account when handling DSCP. Use this new type to replace the tclass field of struct fib6_rule, so that fib6-rules don't get influenced by ECN bits anymore. Before this patch, fib6-rules didn't make any distinction between the DSCP and ECN bits. Therefore, rules specifying a DSCP (tos or dsfield options in iproute2) stopped working as soon a packets had at least one of its ECN bits set (as a work around one could create four rules for each DSCP value to match, one for each possible ECN value). After this patch fib6-rules only compare the DSCP bits. ECN doesn't influence the result anymore. Also, fib6-rules now must have the ECN bits cleared or they will be rejected. Signed-off-by: Guillaume Nault Acked-by: David Ahern Reviewed-by: Toke Høiland-Jørgensen Signed-off-by: Jakub Kicinski --- include/net/inet_dscp.h | 57 +++++++++++++++++++++++++++++++++++++++++++++++++ include/net/ipv6.h | 6 ++++++ 2 files changed, 63 insertions(+) create mode 100644 include/net/inet_dscp.h (limited to 'include/net') diff --git a/include/net/inet_dscp.h b/include/net/inet_dscp.h new file mode 100644 index 000000000000..72f250dffada --- /dev/null +++ b/include/net/inet_dscp.h @@ -0,0 +1,57 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * inet_dscp.h: helpers for handling differentiated services codepoints (DSCP) + * + * DSCP is defined in RFC 2474: + * + * 0 1 2 3 4 5 6 7 + * +---+---+---+---+---+---+---+---+ + * | DSCP | CU | + * +---+---+---+---+---+---+---+---+ + * + * DSCP: differentiated services codepoint + * CU: currently unused + * + * The whole DSCP + CU bits form the DS field. + * The DS field is also commonly called TOS or Traffic Class (for IPv6). + * + * Note: the CU bits are now used for Explicit Congestion Notification + * (RFC 3168). + */ + +#ifndef _INET_DSCP_H +#define _INET_DSCP_H + +#include + +/* Special type for storing DSCP values. + * + * A dscp_t variable stores a DS field with the CU (ECN) bits cleared. + * Using dscp_t allows to strictly separate DSCP and ECN bits, thus avoiding + * bugs where ECN bits are erroneously taken into account during FIB lookups + * or policy routing. + * + * Note: to get the real DSCP value contained in a dscp_t variable one would + * have to do a bit shift after calling inet_dscp_to_dsfield(). We could have + * a helper for that, but there's currently no users. + */ +typedef u8 __bitwise dscp_t; + +#define INET_DSCP_MASK 0xfc + +static inline dscp_t inet_dsfield_to_dscp(__u8 dsfield) +{ + return (__force dscp_t)(dsfield & INET_DSCP_MASK); +} + +static inline __u8 inet_dscp_to_dsfield(dscp_t dscp) +{ + return (__force __u8)dscp; +} + +static inline bool inet_validate_dscp(__u8 val) +{ + return !(val & ~INET_DSCP_MASK); +} + +#endif /* _INET_DSCP_H */ diff --git a/include/net/ipv6.h b/include/net/ipv6.h index cda1f205f391..f693784e1419 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -974,6 +975,11 @@ static inline u8 ip6_tclass(__be32 flowinfo) return ntohl(flowinfo & IPV6_TCLASS_MASK) >> IPV6_TCLASS_SHIFT; } +static inline dscp_t ip6_dscp(__be32 flowinfo) +{ + return inet_dsfield_to_dscp(ip6_tclass(flowinfo)); +} + static inline __be32 ip6_make_flowinfo(unsigned int tclass, __be32 flowlabel) { return htonl(tclass << IPV6_TCLASS_SHIFT) | flowlabel; -- cgit v1.2.3 From f55fbb6afb8d701e3185e31e73f5ea9503a66744 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Fri, 4 Feb 2022 14:58:16 +0100 Subject: ipv4: Reject routes specifying ECN bits in rtm_tos MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use the new dscp_t type to replace the fc_tos field of fib_config, to ensure IPv4 routes aren't influenced by ECN bits when configured with non-zero rtm_tos. Before this patch, IPv4 routes specifying an rtm_tos with some of the ECN bits set were accepted. However they wouldn't work (never match) as IPv4 normally clears the ECN bits with IPTOS_RT_MASK before doing a FIB lookup (although a few buggy code paths don't). After this patch, IPv4 routes specifying an rtm_tos with any ECN bit set is rejected. Note: IPv6 routes ignore rtm_tos altogether, any rtm_tos is accepted, but treated as if it were 0. Signed-off-by: Guillaume Nault Acked-by: David Ahern Reviewed-by: Toke Høiland-Jørgensen Signed-off-by: Jakub Kicinski --- include/net/ip_fib.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index c4297704bbcb..6a82bcb8813b 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -24,7 +25,7 @@ struct fib_config { u8 fc_dst_len; - u8 fc_tos; + dscp_t fc_dscp; u8 fc_protocol; u8 fc_scope; u8 fc_type; -- cgit v1.2.3 From 21a216a8fc630161e69eaf02cbebdd1816ff1a13 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 7 Feb 2022 20:50:28 -0800 Subject: ipv6/addrconf: allocate a per netns hash table Add a per netns hash table and a dedicated spinlock, first step to get rid of the global inet6_addr_lst[] one. Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Signed-off-by: Jakub Kicinski --- include/net/netns/ipv6.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/net') diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h index 30cdfc4e1615..755f12001c8b 100644 --- a/include/net/netns/ipv6.h +++ b/include/net/netns/ipv6.h @@ -92,6 +92,10 @@ struct netns_ipv6 { struct sock *tcp_sk; struct sock *igmp_sk; struct sock *mc_autojoin_sk; + + struct hlist_head *inet6_addr_lst; + spinlock_t addrconf_hash_lock; + #ifdef CONFIG_IPV6_MROUTE #ifndef CONFIG_IPV6_MROUTE_MULTIPLE_TABLES struct mr_table *mrt6; -- cgit v1.2.3 From 8805d13ff1b2bef6a7bb8a005d2441763286dd7a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 7 Feb 2022 20:50:29 -0800 Subject: ipv6/addrconf: use one delayed work per netns Next step for using per netns inet6_addr_lst is to have per netns work item to ultimately call addrconf_verify_rtnl() and addrconf_verify() with a new 'struct net*' argument. Everything is still using the global inet6_addr_lst[] table. Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Signed-off-by: Jakub Kicinski --- include/net/netns/ipv6.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h index 755f12001c8b..d145f1966682 100644 --- a/include/net/netns/ipv6.h +++ b/include/net/netns/ipv6.h @@ -95,6 +95,7 @@ struct netns_ipv6 { struct hlist_head *inet6_addr_lst; spinlock_t addrconf_hash_lock; + struct delayed_work addr_chk_work; #ifdef CONFIG_IPV6_MROUTE #ifndef CONFIG_IPV6_MROUTE_MULTIPLE_TABLES -- cgit v1.2.3 From c7d9a6751a5fcebc87feee9b999b40078ed9fb43 Mon Sep 17 00:00:00 2001 From: Luiz Angelo Daros de Luca Date: Tue, 8 Feb 2022 02:32:10 -0300 Subject: net: dsa: typo in comment Signed-off-by: Luiz Angelo Daros de Luca Reviewed-by: Andrew Lunn Link: https://lore.kernel.org/r/20220208053210.14831-1-luizluca@gmail.com Signed-off-by: Jakub Kicinski --- include/net/dsa.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/dsa.h b/include/net/dsa.h index ca8c14b547b4..fd1f62a6e0a8 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -1281,7 +1281,7 @@ module_exit(dsa_tag_driver_module_exit) /** * module_dsa_tag_drivers() - Helper macro for registering DSA tag * drivers - * @__ops_array: Array of tag driver strucutres + * @__ops_array: Array of tag driver structures * * Helper macro for DSA tag drivers which do not do anything special * in module init/exit. Each module may only use this macro once, and -- cgit v1.2.3 From 8dd8678e42b5a8bbd3e2c49cda76c743318c56b0 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sat, 5 Feb 2022 13:00:04 +0100 Subject: netfilter: ecache: don't use nf_conn spinlock For updating eache missed value we can use cmpxchg. This also avoids need to disable BH. kernel robot reported build failure on v1 because not all arches support cmpxchg for u16, so extend this to u32. This doesn't increase struct size, existing padding is used. Reported-by: kernel test robot Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_ecache.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_conntrack_ecache.h b/include/net/netfilter/nf_conntrack_ecache.h index 16bcff809b18..6c4c490a3e34 100644 --- a/include/net/netfilter/nf_conntrack_ecache.h +++ b/include/net/netfilter/nf_conntrack_ecache.h @@ -21,10 +21,10 @@ enum nf_ct_ecache_state { struct nf_conntrack_ecache { unsigned long cache; /* bitops want long */ - u16 missed; /* missed events */ u16 ctmask; /* bitmask of ct events to be delivered */ u16 expmask; /* bitmask of expect events to be delivered */ enum nf_ct_ecache_state state:8;/* ecache state */ + u32 missed; /* missed events */ u32 portid; /* netlink portid of destroyer */ }; -- cgit v1.2.3 From 7afa38831aee2ca2f41b22747ed8545e1887aaa9 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 8 Feb 2022 12:29:47 +0100 Subject: netfilter: cttimeout: use option structure Instead of two exported functions, export a single option structure. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_timeout.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_conntrack_timeout.h b/include/net/netfilter/nf_conntrack_timeout.h index db507e4a65bb..3ea94f6f3844 100644 --- a/include/net/netfilter/nf_conntrack_timeout.h +++ b/include/net/netfilter/nf_conntrack_timeout.h @@ -108,8 +108,12 @@ static inline void nf_ct_destroy_timeout(struct nf_conn *ct) #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ #ifdef CONFIG_NF_CONNTRACK_TIMEOUT -extern struct nf_ct_timeout *(*nf_ct_timeout_find_get_hook)(struct net *net, const char *name); -extern void (*nf_ct_timeout_put_hook)(struct nf_ct_timeout *timeout); +struct nf_ct_timeout_hooks { + struct nf_ct_timeout *(*timeout_find_get)(struct net *net, const char *name); + void (*timeout_put)(struct nf_ct_timeout *timeout); +}; + +extern const struct nf_ct_timeout_hooks *nf_ct_timeout_hook; #endif #endif /* _NF_CONNTRACK_TIMEOUT_H */ -- cgit v1.2.3 From 23f68d462984bfda47c7bf663dca347e8e3df549 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 7 Feb 2022 19:25:08 +0100 Subject: netfilter: nft_cmp: optimize comparison for 16-bytes Allow up to 16-byte comparisons with a new cmp fast version. Use two 64-bit words and calculate the mask representing the bits to be compared. Make sure the comparison is 64-bit aligned and avoid out-of-bound memory access on registers. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables_core.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/net') diff --git a/include/net/netfilter/nf_tables_core.h b/include/net/netfilter/nf_tables_core.h index b6fb1fdff9b2..0ea7c55cea4d 100644 --- a/include/net/netfilter/nf_tables_core.h +++ b/include/net/netfilter/nf_tables_core.h @@ -42,6 +42,14 @@ struct nft_cmp_fast_expr { bool inv; }; +struct nft_cmp16_fast_expr { + struct nft_data data; + struct nft_data mask; + u8 sreg; + u8 len; + bool inv; +}; + struct nft_immediate_expr { struct nft_data data; u8 dreg; @@ -59,6 +67,7 @@ static inline u32 nft_cmp_fast_mask(unsigned int len) } extern const struct nft_expr_ops nft_cmp_fast_ops; +extern const struct nft_expr_ops nft_cmp16_fast_ops; struct nft_payload { enum nft_payload_bases base:8; -- cgit v1.2.3 From cfc56f85e72f5b9c5c5be26dc2b16518d36a7868 Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Mon, 7 Feb 2022 18:13:18 +0100 Subject: net: do not keep the dst cache when uncloning an skb dst and its metadata When uncloning an skb dst and its associated metadata a new dst+metadata is allocated and the tunnel information from the old metadata is copied over there. The issue is the tunnel metadata has references to cached dst, which are copied along the way. When a dst+metadata refcount drops to 0 the metadata is freed including the cached dst entries. As they are also referenced in the initial dst+metadata, this ends up in UaFs. In practice the above did not happen because of another issue, the dst+metadata was never freed because its refcount never dropped to 0 (this will be fixed in a subsequent patch). Fix this by initializing the dst cache after copying the tunnel information from the old metadata to also unshare the dst cache. Fixes: d71785ffc7e7 ("net: add dst_cache to ovs vxlan lwtunnel") Cc: Paolo Abeni Reported-by: Vlad Buslov Tested-by: Vlad Buslov Signed-off-by: Antoine Tenart Acked-by: Paolo Abeni Signed-off-by: David S. Miller --- include/net/dst_metadata.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/net') diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h index 14efa0ded75d..b997e0c1e362 100644 --- a/include/net/dst_metadata.h +++ b/include/net/dst_metadata.h @@ -123,6 +123,19 @@ static inline struct metadata_dst *tun_dst_unclone(struct sk_buff *skb) memcpy(&new_md->u.tun_info, &md_dst->u.tun_info, sizeof(struct ip_tunnel_info) + md_size); +#ifdef CONFIG_DST_CACHE + /* Unclone the dst cache if there is one */ + if (new_md->u.tun_info.dst_cache.cache) { + int ret; + + ret = dst_cache_init(&new_md->u.tun_info.dst_cache, GFP_ATOMIC); + if (ret) { + metadata_dst_free(new_md); + return ERR_PTR(ret); + } + } +#endif + skb_dst_drop(skb); dst_hold(&new_md->dst); skb_dst_set(skb, &new_md->dst); -- cgit v1.2.3 From 9eeabdf17fa0ab75381045c867c370f4cc75a613 Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Mon, 7 Feb 2022 18:13:19 +0100 Subject: net: fix a memleak when uncloning an skb dst and its metadata When uncloning an skb dst and its associated metadata, a new dst+metadata is allocated and later replaces the old one in the skb. This is helpful to have a non-shared dst+metadata attached to a specific skb. The issue is the uncloned dst+metadata is initialized with a refcount of 1, which is increased to 2 before attaching it to the skb. When tun_dst_unclone returns, the dst+metadata is only referenced from a single place (the skb) while its refcount is 2. Its refcount will never drop to 0 (when the skb is consumed), leading to a memory leak. Fix this by removing the call to dst_hold in tun_dst_unclone, as the dst+metadata refcount is already 1. Fixes: fc4099f17240 ("openvswitch: Fix egress tunnel info.") Cc: Pravin B Shelar Reported-by: Vlad Buslov Tested-by: Vlad Buslov Signed-off-by: Antoine Tenart Signed-off-by: David S. Miller --- include/net/dst_metadata.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/net') diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h index b997e0c1e362..adab27ba1ecb 100644 --- a/include/net/dst_metadata.h +++ b/include/net/dst_metadata.h @@ -137,7 +137,6 @@ static inline struct metadata_dst *tun_dst_unclone(struct sk_buff *skb) #endif skb_dst_drop(skb); - dst_hold(&new_md->dst); skb_dst_set(skb, &new_md->dst); return new_md; } -- cgit v1.2.3 From 8069b22d656f6e1922352bff90ab78e6fab73779 Mon Sep 17 00:00:00 2001 From: Jeremy Kerr Date: Wed, 9 Feb 2022 12:05:55 +0800 Subject: mctp: Add helper for address match checking Currently, we have a couple of paths that check that an EID matches, or the match value is MCTP_ADDR_ANY. Rather than open coding this, add a little helper. Signed-off-by: Jeremy Kerr Signed-off-by: David S. Miller --- include/net/mctp.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/net') diff --git a/include/net/mctp.h b/include/net/mctp.h index 7e35ec79b909..706d329dd8e8 100644 --- a/include/net/mctp.h +++ b/include/net/mctp.h @@ -45,6 +45,11 @@ static inline bool mctp_address_ok(mctp_eid_t eid) return eid >= 8 && eid < 255; } +static inline bool mctp_address_matches(mctp_eid_t match, mctp_eid_t eid) +{ + return match == eid || match == MCTP_ADDR_ANY; +} + static inline struct mctp_hdr *mctp_hdr(struct sk_buff *skb) { return (struct mctp_hdr *)skb_network_header(skb); -- cgit v1.2.3 From 63ed1aab3d40aa61aaa66819bdce9377ac7f40fa Mon Sep 17 00:00:00 2001 From: Matt Johnston Date: Wed, 9 Feb 2022 12:05:57 +0800 Subject: mctp: Add SIOCMCTP{ALLOC,DROP}TAG ioctls for tag control This change adds a couple of new ioctls for mctp sockets: SIOCMCTPALLOCTAG and SIOCMCTPDROPTAG. These ioctls provide facilities for explicit allocation / release of tags, overriding the automatic allocate-on-send/release-on-reply and timeout behaviours. This allows userspace more control over messages that may not fit a simple request/response model. In order to indicate a pre-allocated tag to the sendmsg() syscall, we introduce a new flag to the struct sockaddr_mctp.smctp_tag value: MCTP_TAG_PREALLOC. Additional changes from Jeremy Kerr . Contains a fix that was: Reported-by: kernel test robot Signed-off-by: Matt Johnston Signed-off-by: Jeremy Kerr Signed-off-by: David S. Miller --- include/net/mctp.h | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/mctp.h b/include/net/mctp.h index 706d329dd8e8..e80a4baf8379 100644 --- a/include/net/mctp.h +++ b/include/net/mctp.h @@ -126,7 +126,7 @@ struct mctp_sock { */ struct mctp_sk_key { mctp_eid_t peer_addr; - mctp_eid_t local_addr; + mctp_eid_t local_addr; /* MCTP_ADDR_ANY for local owned tags */ __u8 tag; /* incoming tag match; invert TO for local */ /* we hold a ref to sk when set */ @@ -163,6 +163,12 @@ struct mctp_sk_key { */ unsigned long dev_flow_state; struct mctp_dev *dev; + + /* a tag allocated with SIOCMCTPALLOCTAG ioctl will not expire + * automatically on timeout or response, instead SIOCMCTPDROPTAG + * is used. + */ + bool manual_alloc; }; struct mctp_skb_cb { @@ -239,6 +245,9 @@ int mctp_local_output(struct sock *sk, struct mctp_route *rt, struct sk_buff *skb, mctp_eid_t daddr, u8 req_tag); void mctp_key_unref(struct mctp_sk_key *key); +struct mctp_sk_key *mctp_alloc_local_tag(struct mctp_sock *msk, + mctp_eid_t daddr, mctp_eid_t saddr, + bool manual, u8 *tagp); /* routing <--> device interface */ unsigned int mctp_default_net(struct net *net); -- cgit v1.2.3 From ede6c39c4f9068cbeb4036448c45fff5393e0432 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 9 Feb 2022 18:59:32 -0800 Subject: net: make net->dev_unreg_count atomic Having to acquire rtnl from netdev_run_todo() for every dismantled device is not desirable when/if rtnl is under stress. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/net_namespace.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 374cc7b260fc..c4f5601f6e32 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -63,7 +63,7 @@ struct net { */ spinlock_t rules_mod_lock; - unsigned int dev_unreg_count; + atomic_t dev_unreg_count; unsigned int dev_base_seq; /* protected by rtnl_mutex */ int ifindex; -- cgit v1.2.3 From f9496b7c1b48ce02cd17a3ee88b1e049c689a222 Mon Sep 17 00:00:00 2001 From: "D. Wythe" Date: Thu, 10 Feb 2022 17:11:38 +0800 Subject: net/smc: Add global configure for handshake limitation by netlink Although we can control SMC handshake limitation through socket options, which means that applications who need it must modify their code. It's quite troublesome for many existing applications. This patch modifies the global default value of SMC handshake limitation through netlink, providing a way to put constraint on handshake without modifies any code for applications. Suggested-by: Tony Lu Signed-off-by: D. Wythe Reviewed-by: Tony Lu Signed-off-by: David S. Miller --- include/net/netns/smc.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/net') diff --git a/include/net/netns/smc.h b/include/net/netns/smc.h index ea8a9cf2619b..47b166684fd8 100644 --- a/include/net/netns/smc.h +++ b/include/net/netns/smc.h @@ -12,5 +12,7 @@ struct netns_smc { /* protect fback_rsn */ struct mutex mutex_fback_rsn; struct smc_stats_rsn *fback_rsn; + + bool limit_smc_hs; /* constraint on handshake */ }; #endif -- cgit v1.2.3 From 2d4feb2c1ba728fe53d6f80705b2d7e5c5b666a5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 10 Feb 2022 13:42:28 -0800 Subject: ipv6: get rid of net->ipv6.rt6_stats->fib_rt_uncache This counter has never been visible, there is little point trying to maintain it. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 40ae8f1b18e5..32c83bb68879 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -367,9 +367,8 @@ struct rt6_statistics { __u32 fib_rt_cache; /* cached rt entries in exception table */ __u32 fib_discarded_routes; /* total number of routes delete */ - /* The following stats are not protected by any lock */ + /* The following stat is not protected by any lock */ atomic_t fib_rt_alloc; /* total number of routes alloced */ - atomic_t fib_rt_uncache; /* rt entries in uncached list */ }; #define RTN_TL_ROOT 0x0001 -- cgit v1.2.3 From 26394fc118d6115390bd5b3a0fb17096271da227 Mon Sep 17 00:00:00 2001 From: Ignat Korchagin Date: Fri, 11 Feb 2022 17:30:42 +0000 Subject: ipv6: mcast: use rcu-safe version of ipv6_get_lladdr() Some time ago 8965779d2c0e ("ipv6,mcast: always hold idev->lock before mca_lock") switched ipv6_get_lladdr() to __ipv6_get_lladdr(), which is rcu-unsafe version. That was OK, because idev->lock was held for these codepaths. In 88e2ca308094 ("mld: convert ifmcaddr6 to RCU") these external locks were removed, so we probably need to restore the original rcu-safe call. Otherwise, we occasionally get a machine crashed/stalled with the following in dmesg: [ 3405.966610][T230589] general protection fault, probably for non-canonical address 0xdead00000000008c: 0000 [#1] SMP NOPTI [ 3405.982083][T230589] CPU: 44 PID: 230589 Comm: kworker/44:3 Tainted: G O 5.15.19-cloudflare-2022.2.1 #1 [ 3405.998061][T230589] Hardware name: SUPA-COOL-SERV [ 3406.009552][T230589] Workqueue: mld mld_ifc_work [ 3406.017224][T230589] RIP: 0010:__ipv6_get_lladdr+0x34/0x60 [ 3406.025780][T230589] Code: 57 10 48 83 c7 08 48 89 e5 48 39 d7 74 3e 48 8d 82 38 ff ff ff eb 13 48 8b 90 d0 00 00 00 48 8d 82 38 ff ff ff 48 39 d7 74 22 <66> 83 78 32 20 77 1b 75 e4 89 ca 23 50 2c 75 dd 48 8b 50 08 48 8b [ 3406.055748][T230589] RSP: 0018:ffff94e4b3fc3d10 EFLAGS: 00010202 [ 3406.065617][T230589] RAX: dead00000000005a RBX: ffff94e4b3fc3d30 RCX: 0000000000000040 [ 3406.077477][T230589] RDX: dead000000000122 RSI: ffff94e4b3fc3d30 RDI: ffff8c3a31431008 [ 3406.089389][T230589] RBP: ffff94e4b3fc3d10 R08: 0000000000000000 R09: 0000000000000000 [ 3406.101445][T230589] R10: ffff8c3a31430000 R11: 000000000000000b R12: ffff8c2c37887100 [ 3406.113553][T230589] R13: ffff8c3a39537000 R14: 00000000000005dc R15: ffff8c3a31431000 [ 3406.125730][T230589] FS: 0000000000000000(0000) GS:ffff8c3b9fc80000(0000) knlGS:0000000000000000 [ 3406.138992][T230589] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 3406.149895][T230589] CR2: 00007f0dfea1db60 CR3: 000000387b5f2000 CR4: 0000000000350ee0 [ 3406.162421][T230589] Call Trace: [ 3406.170235][T230589] [ 3406.177736][T230589] mld_newpack+0xfe/0x1a0 [ 3406.186686][T230589] add_grhead+0x87/0xa0 [ 3406.195498][T230589] add_grec+0x485/0x4e0 [ 3406.204310][T230589] ? newidle_balance+0x126/0x3f0 [ 3406.214024][T230589] mld_ifc_work+0x15d/0x450 [ 3406.223279][T230589] process_one_work+0x1e6/0x380 [ 3406.232982][T230589] worker_thread+0x50/0x3a0 [ 3406.242371][T230589] ? rescuer_thread+0x360/0x360 [ 3406.252175][T230589] kthread+0x127/0x150 [ 3406.261197][T230589] ? set_kthread_struct+0x40/0x40 [ 3406.271287][T230589] ret_from_fork+0x22/0x30 [ 3406.280812][T230589] [ 3406.288937][T230589] Modules linked in: ... [last unloaded: kheaders] [ 3406.476714][T230589] ---[ end trace 3525a7655f2f3b9e ]--- Fixes: 88e2ca308094 ("mld: convert ifmcaddr6 to RCU") Reported-by: David Pinilla Caparros Signed-off-by: Ignat Korchagin Signed-off-by: David S. Miller --- include/net/addrconf.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/net') diff --git a/include/net/addrconf.h b/include/net/addrconf.h index e7ce719838b5..59940e230b78 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -109,8 +109,6 @@ struct inet6_ifaddr *ipv6_get_ifaddr(struct net *net, int ipv6_dev_get_saddr(struct net *net, const struct net_device *dev, const struct in6_addr *daddr, unsigned int srcprefs, struct in6_addr *saddr); -int __ipv6_get_lladdr(struct inet6_dev *idev, struct in6_addr *addr, - u32 banned_flags); int ipv6_get_lladdr(struct net_device *dev, struct in6_addr *addr, u32 banned_flags); bool inet_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2, -- cgit v1.2.3 From a2614140dc0f467a83aa3bb4b6ee2d6480a76202 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 11 Feb 2022 19:45:06 +0200 Subject: net: dsa: mv88e6xxx: flush switchdev FDB workqueue before removing VLAN mv88e6xxx is special among DSA drivers in that it requires the VTU to contain the VID of the FDB entry it modifies in mv88e6xxx_port_db_load_purge(), otherwise it will return -EOPNOTSUPP. Sometimes due to races this is not always satisfied even if external code does everything right (first deletes the FDB entries, then the VLAN), because DSA commits to hardware FDB entries asynchronously since commit c9eb3e0f8701 ("net: dsa: Add support for learning FDB through notification"). Therefore, the mv88e6xxx driver must close this race condition by itself, by asking DSA to flush the switchdev workqueue of any FDB deletions in progress, prior to exiting a VLAN. Fixes: c9eb3e0f8701 ("net: dsa: Add support for learning FDB through notification") Reported-by: Rafael Richter Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- include/net/dsa.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/dsa.h b/include/net/dsa.h index 57b3e4e7413b..85a5ba3772f5 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -1187,6 +1187,7 @@ void dsa_unregister_switch(struct dsa_switch *ds); int dsa_register_switch(struct dsa_switch *ds); void dsa_switch_shutdown(struct dsa_switch *ds); struct dsa_switch *dsa_switch_find(int tree_index, int sw_index); +void dsa_flush_workqueue(void); #ifdef CONFIG_PM_SLEEP int dsa_switch_suspend(struct dsa_switch *ds); int dsa_switch_resume(struct dsa_switch *ds); -- cgit v1.2.3 From 9ceaf6f76b203682bb6100e14b3d7da4c0bedde8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 14 Feb 2022 11:15:53 -0800 Subject: bonding: fix data-races around agg_select_timer syzbot reported that two threads might write over agg_select_timer at the same time. Make agg_select_timer atomic to fix the races. BUG: KCSAN: data-race in bond_3ad_initiate_agg_selection / bond_3ad_state_machine_handler read to 0xffff8881242aea90 of 4 bytes by task 1846 on cpu 1: bond_3ad_state_machine_handler+0x99/0x2810 drivers/net/bonding/bond_3ad.c:2317 process_one_work+0x3f6/0x960 kernel/workqueue.c:2307 worker_thread+0x616/0xa70 kernel/workqueue.c:2454 kthread+0x1bf/0x1e0 kernel/kthread.c:377 ret_from_fork+0x1f/0x30 write to 0xffff8881242aea90 of 4 bytes by task 25910 on cpu 0: bond_3ad_initiate_agg_selection+0x18/0x30 drivers/net/bonding/bond_3ad.c:1998 bond_open+0x658/0x6f0 drivers/net/bonding/bond_main.c:3967 __dev_open+0x274/0x3a0 net/core/dev.c:1407 dev_open+0x54/0x190 net/core/dev.c:1443 bond_enslave+0xcef/0x3000 drivers/net/bonding/bond_main.c:1937 do_set_master net/core/rtnetlink.c:2532 [inline] do_setlink+0x94f/0x2500 net/core/rtnetlink.c:2736 __rtnl_newlink net/core/rtnetlink.c:3414 [inline] rtnl_newlink+0xfeb/0x13e0 net/core/rtnetlink.c:3529 rtnetlink_rcv_msg+0x745/0x7e0 net/core/rtnetlink.c:5594 netlink_rcv_skb+0x14e/0x250 net/netlink/af_netlink.c:2494 rtnetlink_rcv+0x18/0x20 net/core/rtnetlink.c:5612 netlink_unicast_kernel net/netlink/af_netlink.c:1317 [inline] netlink_unicast+0x602/0x6d0 net/netlink/af_netlink.c:1343 netlink_sendmsg+0x728/0x850 net/netlink/af_netlink.c:1919 sock_sendmsg_nosec net/socket.c:705 [inline] sock_sendmsg net/socket.c:725 [inline] ____sys_sendmsg+0x39a/0x510 net/socket.c:2413 ___sys_sendmsg net/socket.c:2467 [inline] __sys_sendmsg+0x195/0x230 net/socket.c:2496 __do_sys_sendmsg net/socket.c:2505 [inline] __se_sys_sendmsg net/socket.c:2503 [inline] __x64_sys_sendmsg+0x42/0x50 net/socket.c:2503 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x44/0xd0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae value changed: 0x00000050 -> 0x0000004f Reported by Kernel Concurrency Sanitizer on: CPU: 0 PID: 25910 Comm: syz-executor.1 Tainted: G W 5.17.0-rc4-syzkaller-dirty #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Eric Dumazet Reported-by: syzbot Cc: Jay Vosburgh Cc: Veaceslav Falico Signed-off-by: David S. Miller --- include/net/bond_3ad.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/bond_3ad.h b/include/net/bond_3ad.h index 38785d48baff..184105d68294 100644 --- a/include/net/bond_3ad.h +++ b/include/net/bond_3ad.h @@ -262,7 +262,7 @@ struct ad_system { struct ad_bond_info { struct ad_system system; /* 802.3ad system structure */ struct bond_3ad_stats stats; - u32 agg_select_timer; /* Timer to select aggregator after all adapter's hand shakes */ + atomic_t agg_select_timer; /* Timer to select aggregator after all adapter's hand shakes */ u16 aggregator_identifier; }; -- cgit v1.2.3 From 8d23a54f5beea59b560855fb571e5d73d783e0b4 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Tue, 15 Feb 2022 19:02:12 +0200 Subject: net: bridge: switchdev: differentiate new VLANs from changed ones br_switchdev_port_vlan_add() currently emits a SWITCHDEV_PORT_OBJ_ADD event with a SWITCHDEV_OBJ_ID_PORT_VLAN for 2 distinct cases: - a struct net_bridge_vlan got created - an existing struct net_bridge_vlan was modified This makes it impossible for switchdev drivers to properly balance PORT_OBJ_ADD with PORT_OBJ_DEL events, so if we want to allow that to happen, we must provide a way for drivers to distinguish between a VLAN with changed flags and a new one. Annotate struct switchdev_obj_port_vlan with a "bool changed" that distinguishes the 2 cases above. Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- include/net/switchdev.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/net') diff --git a/include/net/switchdev.h b/include/net/switchdev.h index d353793dfeb5..92cc763991e9 100644 --- a/include/net/switchdev.h +++ b/include/net/switchdev.h @@ -81,6 +81,13 @@ struct switchdev_obj_port_vlan { struct switchdev_obj obj; u16 flags; u16 vid; + /* If set, the notifier signifies a change of one of the following + * flags for a VLAN that already exists: + * - BRIDGE_VLAN_INFO_PVID + * - BRIDGE_VLAN_INFO_UNTAGGED + * Entries with BRIDGE_VLAN_INFO_BRENTRY unset are not notified at all. + */ + bool changed; }; #define SWITCHDEV_OBJ_PORT_VLAN(OBJ) \ -- cgit v1.2.3 From c4076cdd21f8d68a96f1e7124bd8915c7e31a474 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Tue, 15 Feb 2022 19:02:16 +0200 Subject: net: switchdev: introduce switchdev_handle_port_obj_{add,del} for foreign interfaces The switchdev_handle_port_obj_add() helper is good for replicating a port object on the lower interfaces of @dev, if that object was emitted on a bridge, or on a bridge port that is a LAG. However, drivers that use this helper limit themselves to a box from which they can no longer intercept port objects notified on neighbor ports ("foreign interfaces"). One such driver is DSA, where software bridging with foreign interfaces such as standalone NICs or Wi-Fi APs is an important use case. There, a VLAN installed on a neighbor bridge port roughly corresponds to a forwarding VLAN installed on the DSA switch's CPU port. To support this use case while also making use of the benefits of the switchdev_handle_* replication helper for port objects, introduce a new variant of these functions that crawls through the neighbor ports of @dev, in search of potentially compatible switchdev ports that are interested in the event. The strategy is identical to switchdev_handle_fdb_event_to_device(): if @dev wasn't a switchdev interface, then go one step upper, and recursively call this function on the bridge that this port belongs to. At the next recursion step, __switchdev_handle_port_obj_add() will iterate through the bridge's lower interfaces. Among those, some will be switchdev interfaces, and one will be the original @dev that we came from. To prevent infinite recursion, we must suppress reentry into the original @dev, and just call the @add_cb for the switchdev_interfaces. It looks like this: br0 / | \ / | \ / | \ swp0 swp1 eth0 1. __switchdev_handle_port_obj_add(eth0) -> check_cb(eth0) returns false -> eth0 has no lower interfaces -> eth0's bridge is br0 -> switchdev_lower_dev_find(br0, check_cb, foreign_dev_check_cb)) finds br0 2. __switchdev_handle_port_obj_add(br0) -> check_cb(br0) returns false -> netdev_for_each_lower_dev -> check_cb(swp0) returns true, so we don't skip this interface 3. __switchdev_handle_port_obj_add(swp0) -> check_cb(swp0) returns true, so we call add_cb(swp0) (back to netdev_for_each_lower_dev from 2) -> check_cb(swp1) returns true, so we don't skip this interface 4. __switchdev_handle_port_obj_add(swp1) -> check_cb(swp1) returns true, so we call add_cb(swp1) (back to netdev_for_each_lower_dev from 2) -> check_cb(eth0) returns false, so we skip this interface to avoid infinite recursion Note: eth0 could have been a LAG, and we don't want to suppress the recursion through its lowers if those exist, so when check_cb() returns false, we still call switchdev_lower_dev_find() to estimate whether there's anything worth a recursion beneath that LAG. Using check_cb() and foreign_dev_check_cb(), switchdev_lower_dev_find() not only figures out whether the lowers of the LAG are switchdev, but also whether they actively offload the LAG or not (whether the LAG is "foreign" to the switchdev interface or not). The port_obj_info->orig_dev is preserved across recursive calls, so switchdev drivers still know on which device was this notification originally emitted. Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- include/net/switchdev.h | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) (limited to 'include/net') diff --git a/include/net/switchdev.h b/include/net/switchdev.h index 92cc763991e9..c32e1c8f79ec 100644 --- a/include/net/switchdev.h +++ b/include/net/switchdev.h @@ -324,11 +324,26 @@ int switchdev_handle_port_obj_add(struct net_device *dev, int (*add_cb)(struct net_device *dev, const void *ctx, const struct switchdev_obj *obj, struct netlink_ext_ack *extack)); +int switchdev_handle_port_obj_add_foreign(struct net_device *dev, + struct switchdev_notifier_port_obj_info *port_obj_info, + bool (*check_cb)(const struct net_device *dev), + bool (*foreign_dev_check_cb)(const struct net_device *dev, + const struct net_device *foreign_dev), + int (*add_cb)(struct net_device *dev, const void *ctx, + const struct switchdev_obj *obj, + struct netlink_ext_ack *extack)); int switchdev_handle_port_obj_del(struct net_device *dev, struct switchdev_notifier_port_obj_info *port_obj_info, bool (*check_cb)(const struct net_device *dev), int (*del_cb)(struct net_device *dev, const void *ctx, const struct switchdev_obj *obj)); +int switchdev_handle_port_obj_del_foreign(struct net_device *dev, + struct switchdev_notifier_port_obj_info *port_obj_info, + bool (*check_cb)(const struct net_device *dev), + bool (*foreign_dev_check_cb)(const struct net_device *dev, + const struct net_device *foreign_dev), + int (*del_cb)(struct net_device *dev, const void *ctx, + const struct switchdev_obj *obj)); int switchdev_handle_port_attr_set(struct net_device *dev, struct switchdev_notifier_port_attr_info *port_attr_info, @@ -447,6 +462,18 @@ switchdev_handle_port_obj_add(struct net_device *dev, return 0; } +static inline int switchdev_handle_port_obj_add_foreign(struct net_device *dev, + struct switchdev_notifier_port_obj_info *port_obj_info, + bool (*check_cb)(const struct net_device *dev), + bool (*foreign_dev_check_cb)(const struct net_device *dev, + const struct net_device *foreign_dev), + int (*add_cb)(struct net_device *dev, const void *ctx, + const struct switchdev_obj *obj, + struct netlink_ext_ack *extack)) +{ + return 0; +} + static inline int switchdev_handle_port_obj_del(struct net_device *dev, struct switchdev_notifier_port_obj_info *port_obj_info, @@ -457,6 +484,18 @@ switchdev_handle_port_obj_del(struct net_device *dev, return 0; } +static inline int +switchdev_handle_port_obj_del_foreign(struct net_device *dev, + struct switchdev_notifier_port_obj_info *port_obj_info, + bool (*check_cb)(const struct net_device *dev), + bool (*foreign_dev_check_cb)(const struct net_device *dev, + const struct net_device *foreign_dev), + int (*del_cb)(struct net_device *dev, const void *ctx, + const struct switchdev_obj *obj)) +{ + return 0; +} + static inline int switchdev_handle_port_attr_set(struct net_device *dev, struct switchdev_notifier_port_attr_info *port_attr_info, -- cgit v1.2.3 From 134ef2388e7f3271d13223decdb5e45b0f84489f Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Tue, 15 Feb 2022 19:02:17 +0200 Subject: net: dsa: add explicit support for host bridge VLANs Currently, DSA programs VLANs on shared (DSA and CPU) ports each time it does so on user ports. This is good for basic functionality but has several limitations: - the VLAN group which must reach the CPU may be radically different from the VLAN group that must be autonomously forwarded by the switch. In other words, the admin may want to isolate noisy stations and avoid traffic from them going to the control processor of the switch, where it would just waste useless cycles. The bridge already supports independent control of VLAN groups on bridge ports and on the bridge itself, and when VLAN-aware, it will drop packets in software anyway if their VID isn't added as a 'self' entry towards the bridge device. - Replaying host FDB entries may depend, for some drivers like mv88e6xxx, on replaying the host VLANs as well. The 2 VLAN groups are approximately the same in most regular cases, but there are corner cases when timing matters, and DSA's approximation of replicating VLANs on shared ports simply does not work. - If a user makes the bridge (implicitly the CPU port) join a VLAN by accident, there is no way for the CPU port to isolate itself from that noisy VLAN except by rebooting the system. This is because for each VLAN added on a user port, DSA will add it on shared ports too, but for each VLAN deletion on a user port, it will remain installed on shared ports, since DSA has no good indication of whether the VLAN is still in use or not. Now that the bridge driver emits well-balanced SWITCHDEV_OBJ_ID_PORT_VLAN addition and removal events, DSA has a simple and straightforward task of separating the bridge port VLANs (these have an orig_dev which is a DSA slave interface, or a LAG interface) from the host VLANs (these have an orig_dev which is a bridge interface), and to keep a simple reference count of each VID on each shared port. Forwarding VLANs must be installed on the bridge ports and on all DSA ports interconnecting them. We don't have a good view of the exact topology, so we simply install forwarding VLANs on all DSA ports, which is what has been done until now. Host VLANs must be installed primarily on the dedicated CPU port of each bridge port. More subtly, they must also be installed on upstream-facing and downstream-facing DSA ports that are connecting the bridge ports and the CPU. This ensures that the mv88e6xxx's problem (VID of host FDB entry may be absent from VTU) is still addressed even if that switch is in a cross-chip setup, and it has no local CPU port. Therefore: - user ports contain only bridge port (forwarding) VLANs, and no refcounting is necessary - DSA ports contain both forwarding and host VLANs. Refcounting is necessary among these 2 types. - CPU ports contain only host VLANs. Refcounting is also necessary. Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- include/net/dsa.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/net') diff --git a/include/net/dsa.h b/include/net/dsa.h index fd1f62a6e0a8..85cb9aed4c51 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -312,6 +312,10 @@ struct dsa_port { struct mutex addr_lists_lock; struct list_head fdbs; struct list_head mdbs; + + /* List of VLANs that CPU and DSA ports are members of. */ + struct mutex vlans_lock; + struct list_head vlans; }; /* TODO: ideally DSA ports would have a single dp->link_dp member, @@ -332,6 +336,12 @@ struct dsa_mac_addr { struct list_head list; }; +struct dsa_vlan { + u16 vid; + refcount_t refcount; + struct list_head list; +}; + struct dsa_switch { struct device *dev; -- cgit v1.2.3 From 5cd5a8a3e2fb11b1c8a09f062c44c1e228ef987a Mon Sep 17 00:00:00 2001 From: Ilan Peer Date: Mon, 14 Feb 2022 17:29:53 +0100 Subject: cfg80211: Add data structures to capture EHT capabilities And advertise EHT capabilities to user space when supported. Signed-off-by: Ilan Peer Link: https://lore.kernel.org/r/20220214173004.6fb70658529f.I2413a37c8f7d2d6d638038a3d95360a3fce0114d@changeid Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 63 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) (limited to 'include/net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index f6db085ff3df..7bec15f605be 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -360,6 +360,48 @@ struct ieee80211_sta_he_cap { u8 ppe_thres[IEEE80211_HE_PPE_THRES_MAX_LEN]; }; +/** + * struct ieee80211_eht_mcs_nss_supp - EHT max supported NSS per MCS + * + * See P802.11be_D1.3 Table 9-401k - "Subfields of the Supported EHT-MCS + * and NSS Set field" + * + * @only_20mhz: MCS/NSS support for 20 MHz-only STA. + * @bw._80: MCS/NSS support for BW <= 80 MHz + * @bw._160: MCS/NSS support for BW = 160 MHz + * @bw._320: MCS/NSS support for BW = 320 MHz + */ +struct ieee80211_eht_mcs_nss_supp { + union { + struct ieee80211_eht_mcs_nss_supp_20mhz_only only_20mhz; + struct { + struct ieee80211_eht_mcs_nss_supp_bw _80; + struct ieee80211_eht_mcs_nss_supp_bw _160; + struct ieee80211_eht_mcs_nss_supp_bw _320; + } __packed bw; + } __packed; +} __packed; + +#define IEEE80211_EHT_PPE_THRES_MAX_LEN 32 + +/** + * struct ieee80211_sta_eht_cap - STA's EHT capabilities + * + * This structure describes most essential parameters needed + * to describe 802.11be EHT capabilities for a STA. + * + * @has_eht: true iff EHT data is valid. + * @eht_cap_elem: Fixed portion of the eht capabilities element. + * @eht_mcs_nss_supp: The supported NSS/MCS combinations. + * @eht_ppe_thres: Holds the PPE Thresholds data. + */ +struct ieee80211_sta_eht_cap { + bool has_eht; + struct ieee80211_eht_cap_elem_fixed eht_cap_elem; + struct ieee80211_eht_mcs_nss_supp eht_mcs_nss_supp; + u8 eht_ppe_thres[IEEE80211_EHT_PPE_THRES_MAX_LEN]; +}; + /** * struct ieee80211_sband_iftype_data - sband data per interface type * @@ -379,6 +421,7 @@ struct ieee80211_sband_iftype_data { u16 types_mask; struct ieee80211_sta_he_cap he_cap; struct ieee80211_he_6ghz_capa he_6ghz_capa; + struct ieee80211_sta_eht_cap eht_cap; struct { const u8 *data; unsigned int len; @@ -561,6 +604,26 @@ ieee80211_get_he_6ghz_capa(const struct ieee80211_supported_band *sband, return data->he_6ghz_capa.capa; } +/** + * ieee80211_get_eht_iftype_cap - return ETH capabilities for an sband's iftype + * @sband: the sband to search for the iftype on + * @iftype: enum nl80211_iftype + * + * Return: pointer to the struct ieee80211_sta_eht_cap, or NULL is none found + */ +static inline const struct ieee80211_sta_eht_cap * +ieee80211_get_eht_iftype_cap(const struct ieee80211_supported_band *sband, + enum nl80211_iftype iftype) +{ + const struct ieee80211_sband_iftype_data *data = + ieee80211_get_sband_iftype_data(sband, iftype); + + if (data && data->eht_cap.has_eht) + return &data->eht_cap; + + return NULL; +} + /** * wiphy_read_of_freq_limits - read frequency limits from device tree * -- cgit v1.2.3 From 3743bec6120ae0748cb3fda6ff80a690117ef1f3 Mon Sep 17 00:00:00 2001 From: Jia Ding Date: Mon, 14 Feb 2022 17:29:54 +0100 Subject: cfg80211: Add support for EHT 320 MHz channel width Add 320 MHz support in the channel def and center frequency validation with compatible check. Signed-off-by: Jia Ding Co-authored-by: Karthikeyan Periyasamy Signed-off-by: Karthikeyan Periyasamy Co-authored-by: Muna Sinada Signed-off-by: Muna Sinada Co-authored-by: Veerendranath Jakkam Signed-off-by: Veerendranath Jakkam Link: https://lore.kernel.org/r/1640163883-12696-5-git-send-email-quic_vjakkam@quicinc.com Link: https://lore.kernel.org/r/20220214163009.175289-1-johannes@sipsolutions.net Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 7bec15f605be..a27f2d699dac 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -109,7 +109,11 @@ struct wiphy; * on this channel. * @IEEE80211_CHAN_16MHZ: 16 MHz bandwidth is permitted * on this channel. - * + * @IEEE80211_CHAN_NO_320MHZ: If the driver supports 320 MHz on the band, + * this flag indicates that a 320 MHz channel cannot use this + * channel as the control or any of the secondary channels. + * This may be due to the driver or due to regulatory bandwidth + * restrictions. */ enum ieee80211_channel_flags { IEEE80211_CHAN_DISABLED = 1<<0, @@ -131,6 +135,7 @@ enum ieee80211_channel_flags { IEEE80211_CHAN_4MHZ = 1<<16, IEEE80211_CHAN_8MHZ = 1<<17, IEEE80211_CHAN_16MHZ = 1<<18, + IEEE80211_CHAN_NO_320MHZ = 1<<19, }; #define IEEE80211_CHAN_NO_HT40 \ -- cgit v1.2.3 From cfb14110acf87b4db62e07ba08a80429f1749f40 Mon Sep 17 00:00:00 2001 From: Veerendranath Jakkam Date: Mon, 14 Feb 2022 17:29:55 +0100 Subject: nl80211: add EHT MCS support Add support for reporting and calculating EHT bitrates. Signed-off-by: Veerendranath Jakkam Link: https://lore.kernel.org/r/1640163883-12696-7-git-send-email-quic_vjakkam@quicinc.com Link: https://lore.kernel.org/r/20220214163009.175289-2-johannes@sipsolutions.net Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index a27f2d699dac..f35ffd81d213 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1595,6 +1595,7 @@ int cfg80211_check_station_change(struct wiphy *wiphy, * @RATE_INFO_FLAGS_HE_MCS: HE MCS information * @RATE_INFO_FLAGS_EDMG: 60GHz MCS in EDMG mode * @RATE_INFO_FLAGS_EXTENDED_SC_DMG: 60GHz extended SC MCS + * @RATE_INFO_FLAGS_EHT_MCS: EHT MCS information */ enum rate_info_flags { RATE_INFO_FLAGS_MCS = BIT(0), @@ -1604,6 +1605,7 @@ enum rate_info_flags { RATE_INFO_FLAGS_HE_MCS = BIT(4), RATE_INFO_FLAGS_EDMG = BIT(5), RATE_INFO_FLAGS_EXTENDED_SC_DMG = BIT(6), + RATE_INFO_FLAGS_EHT_MCS = BIT(7), }; /** @@ -1618,6 +1620,8 @@ enum rate_info_flags { * @RATE_INFO_BW_80: 80 MHz bandwidth * @RATE_INFO_BW_160: 160 MHz bandwidth * @RATE_INFO_BW_HE_RU: bandwidth determined by HE RU allocation + * @RATE_INFO_BW_320: 320 MHz bandwidth + * @RATE_INFO_BW_EHT_RU: bandwidth determined by EHT RU allocation */ enum rate_info_bw { RATE_INFO_BW_20 = 0, @@ -1627,6 +1631,8 @@ enum rate_info_bw { RATE_INFO_BW_80, RATE_INFO_BW_160, RATE_INFO_BW_HE_RU, + RATE_INFO_BW_320, + RATE_INFO_BW_EHT_RU, }; /** @@ -1644,6 +1650,9 @@ enum rate_info_bw { * @he_ru_alloc: HE RU allocation (from &enum nl80211_he_ru_alloc, * only valid if bw is %RATE_INFO_BW_HE_RU) * @n_bonded_ch: In case of EDMG the number of bonded channels (1-4) + * @eht_gi: EHT guard interval (from &enum nl80211_eht_gi) + * @eht_ru_alloc: EHT RU allocation (from &enum nl80211_eht_ru_alloc, + * only valid if bw is %RATE_INFO_BW_EHT_RU) */ struct rate_info { u8 flags; @@ -1655,6 +1664,8 @@ struct rate_info { u8 he_dcm; u8 he_ru_alloc; u8 n_bonded_ch; + u8 eht_gi; + u8 eht_ru_alloc; }; /** -- cgit v1.2.3 From 31846b657857e6a73d982604f36a34710d98902c Mon Sep 17 00:00:00 2001 From: Ilan Peer Date: Mon, 14 Feb 2022 17:29:57 +0100 Subject: cfg80211: add NO-EHT flag to regulatory This may be necessary in some cases, add a flag and propagate it, just like the NO-HE that already exists. Signed-off-by: Ilan Peer [split off from a combined 320/no-EHT patch] Link: https://lore.kernel.org/r/20220214173004.dbb85a7b86bb.Ifc1e2daac51c1cc5f895ccfb79faf5eaec3950ec@changeid Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index f35ffd81d213..5cfc483dece1 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -114,6 +114,7 @@ struct wiphy; * channel as the control or any of the secondary channels. * This may be due to the driver or due to regulatory bandwidth * restrictions. + * @IEEE80211_CHAN_NO_EHT: EHT operation is not permitted on this channel. */ enum ieee80211_channel_flags { IEEE80211_CHAN_DISABLED = 1<<0, @@ -136,6 +137,7 @@ enum ieee80211_channel_flags { IEEE80211_CHAN_8MHZ = 1<<17, IEEE80211_CHAN_16MHZ = 1<<18, IEEE80211_CHAN_NO_320MHZ = 1<<19, + IEEE80211_CHAN_NO_EHT = 1<<20, }; #define IEEE80211_CHAN_NO_HT40 \ -- cgit v1.2.3 From ea05fd3581d32a0f1098657005c7a9b763798fe8 Mon Sep 17 00:00:00 2001 From: Ilan Peer Date: Mon, 14 Feb 2022 17:29:58 +0100 Subject: cfg80211: Support configuration of station EHT capabilities Add attributes and some code bits to support userspace passing in EHT capabilities of stations. Signed-off-by: Ilan Peer Link: https://lore.kernel.org/r/20220214173004.ecf0b3ff9627.Icb4a5f2ec7b41d9008ac4cfc16c59baeb84793d3@changeid Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 5cfc483dece1..68713388b617 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1487,6 +1487,8 @@ struct sta_txpwr { * @airtime_weight: airtime scheduler weight for this station * @txpwr: transmit power for an associated station * @he_6ghz_capa: HE 6 GHz Band capabilities of station + * @eht_capa: EHT capabilities of station + * @eht_capa_len: the length of the EHT capabilities */ struct station_parameters { const u8 *supported_rates; @@ -1520,6 +1522,8 @@ struct station_parameters { u16 airtime_weight; struct sta_txpwr txpwr; const struct ieee80211_he_6ghz_capa *he_6ghz_capa; + const struct ieee80211_eht_cap_elem *eht_capa; + u8 eht_capa_len; }; /** -- cgit v1.2.3 From 5dca295dd76756c7918ad7113fc4a3cf8262ed43 Mon Sep 17 00:00:00 2001 From: Ilan Peer Date: Mon, 14 Feb 2022 17:30:00 +0100 Subject: mac80211: Add initial support for EHT and 320 MHz channels Add initial support for EHT and 320 MHz bandwidth in mac80211. As a new IEEE80211_STA_RX_BW_320 is added to enum ieee80211_sta_rx_bandwidth, update the drivers to avoid compilation warnings. Signed-off-by: Ilan Peer Link: https://lore.kernel.org/r/20220214173004.0f144cc0bba6.Iad18111264da87eed5fd7b017f0cc6e58c604e07@changeid Signed-off-by: Johannes Berg --- include/net/mac80211.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index bd6912d0292b..586d3c26c8ac 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -2005,6 +2005,7 @@ enum ieee80211_sta_state { * @IEEE80211_STA_RX_BW_80: station can receive up to 80 MHz * @IEEE80211_STA_RX_BW_160: station can receive up to 160 MHz * (including 80+80 MHz) + * @IEEE80211_STA_RX_BW_320: station can receive up to 320 MHz * * Implementation note: 20 must be zero to be initialized * correctly, the values must be sorted. @@ -2014,6 +2015,7 @@ enum ieee80211_sta_rx_bandwidth { IEEE80211_STA_RX_BW_40, IEEE80211_STA_RX_BW_80, IEEE80211_STA_RX_BW_160, + IEEE80211_STA_RX_BW_320, }; /** -- cgit v1.2.3 From a1de64078bf7a3fed856fef5334132ba1963b683 Mon Sep 17 00:00:00 2001 From: Ilan Peer Date: Mon, 14 Feb 2022 17:30:02 +0100 Subject: mac80211: Handle station association response with EHT When the association is an EHT association, parse the EHT element from the association response and update the station's EHT capabilities accordingly. Signed-off-by: Ilan Peer Link: https://lore.kernel.org/r/20220214173004.f33574718755.I21182234c5303d9423eabd5eb997e7cf75f8e0c8@changeid Signed-off-by: Johannes Berg --- include/net/mac80211.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 586d3c26c8ac..f118b8fe667a 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -636,6 +636,7 @@ struct ieee80211_fils_discovery { * @tx_pwr_env: transmit power envelope array of BSS. * @tx_pwr_env_num: number of @tx_pwr_env. * @pwr_reduction: power constraint of BSS. + * @eht_support: does this BSS support EHT */ struct ieee80211_bss_conf { const u8 *bssid; @@ -710,6 +711,7 @@ struct ieee80211_bss_conf { struct ieee80211_tx_pwr_env tx_pwr_env[IEEE80211_TPE_MAX_IE_COUNT]; u8 tx_pwr_env_num; u8 pwr_reduction; + bool eht_support; }; /** @@ -2071,6 +2073,7 @@ struct ieee80211_sta_txpwr { * @vht_cap: VHT capabilities of this STA; restricted to our own capabilities * @he_cap: HE capabilities of this STA * @he_6ghz_capa: on 6 GHz, holds the HE 6 GHz band capabilities + * @eht_cap: EHT capabilities of this STA * @max_rx_aggregation_subframes: maximal amount of frames in a single AMPDU * that this station is allowed to transmit to us. * Can be modified by driver. @@ -2111,6 +2114,7 @@ struct ieee80211_sta { struct ieee80211_sta_vht_cap vht_cap; struct ieee80211_sta_he_cap he_cap; struct ieee80211_he_6ghz_capa he_6ghz_capa; + struct ieee80211_sta_eht_cap eht_cap; u16 max_rx_aggregation_subframes; bool wme; u8 uapsd_queues; -- cgit v1.2.3 From 0b0dff5b3b98c5c7ce848151df9da0b3cdf0cc8b Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Tue, 15 Feb 2022 11:00:37 -0500 Subject: ipv6: per-netns exclusive flowlabel checks Ipv6 flowlabels historically require a reservation before use. Optionally in exclusive mode (e.g., user-private). Commit 59c820b2317f ("ipv6: elide flowlabel check if no exclusive leases exist") introduced a fastpath that avoids this check when no exclusive leases exist in the system, and thus any flowlabel use will be granted. That allows skipping the control operation to reserve a flowlabel entirely. Though with a warning if the fast path fails: This is an optimization. Robust applications still have to revert to requesting leases if the fast path fails due to an exclusive lease. Still, this is subtle. Better isolate network namespaces from each other. Flowlabels are per-netns. Also record per-netns whether exclusive leases are in use. Then behavior does not change based on activity in other netns. Changes v2 - wrap in IS_ENABLED(CONFIG_IPV6) to avoid breakage if disabled Fixes: 59c820b2317f ("ipv6: elide flowlabel check if no exclusive leases exist") Link: https://lore.kernel.org/netdev/MWHPR2201MB1072BCCCFCE779E4094837ACD0329@MWHPR2201MB1072.namprd22.prod.outlook.com/ Reported-by: Congyu Liu Signed-off-by: Willem de Bruijn Tested-by: Congyu Liu Link: https://lore.kernel.org/r/20220215160037.1976072-1-willemdebruijn.kernel@gmail.com Signed-off-by: Jakub Kicinski --- include/net/ipv6.h | 5 ++++- include/net/netns/ipv6.h | 3 ++- 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 3afcb128e064..92eec13d1693 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -393,17 +393,20 @@ static inline void txopt_put(struct ipv6_txoptions *opt) kfree_rcu(opt, rcu); } +#if IS_ENABLED(CONFIG_IPV6) struct ip6_flowlabel *__fl6_sock_lookup(struct sock *sk, __be32 label); extern struct static_key_false_deferred ipv6_flowlabel_exclusive; static inline struct ip6_flowlabel *fl6_sock_lookup(struct sock *sk, __be32 label) { - if (static_branch_unlikely(&ipv6_flowlabel_exclusive.key)) + if (static_branch_unlikely(&ipv6_flowlabel_exclusive.key) && + READ_ONCE(sock_net(sk)->ipv6.flowlabel_has_excl)) return __fl6_sock_lookup(sk, label) ? : ERR_PTR(-ENOENT); return NULL; } +#endif struct ipv6_txoptions *fl6_merge_options(struct ipv6_txoptions *opt_space, struct ip6_flowlabel *fl, diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h index a4b550380316..6bd7e5a85ce7 100644 --- a/include/net/netns/ipv6.h +++ b/include/net/netns/ipv6.h @@ -77,9 +77,10 @@ struct netns_ipv6 { spinlock_t fib6_gc_lock; unsigned int ip6_rt_gc_expire; unsigned long ip6_rt_last_gc; + unsigned char flowlabel_has_excl; #ifdef CONFIG_IPV6_MULTIPLE_TABLES - unsigned int fib6_rules_require_fldissect; bool fib6_has_custom_rules; + unsigned int fib6_rules_require_fldissect; #ifdef CONFIG_IPV6_SUBTREES unsigned int fib6_routes_require_src; #endif -- cgit v1.2.3 From f8e9ce4a6e85067d7d7cfa89167f5ce5f0ec2a8a Mon Sep 17 00:00:00 2001 From: Matthieu Baerts Date: Tue, 15 Feb 2022 18:11:24 -0800 Subject: mptcp: mptcp_parse_option is no longer exported Options parsing in now done from mptcp_incoming_options(). mptcp_parse_option() has been removed from mptcp.h when CONFIG_MPTCP is defined but not when it is not. Fixes: cfde141ea3fa ("mptcp: move option parsing into mptcp_incoming_options()") Acked-by: Paolo Abeni Signed-off-by: Matthieu Baerts Signed-off-by: Mat Martineau Signed-off-by: Jakub Kicinski --- include/net/mptcp.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/net') diff --git a/include/net/mptcp.h b/include/net/mptcp.h index a925349b4b89..0a3b0fb04a3b 100644 --- a/include/net/mptcp.h +++ b/include/net/mptcp.h @@ -217,12 +217,6 @@ static inline bool rsk_drop_req(const struct request_sock *req) return false; } -static inline void mptcp_parse_option(const struct sk_buff *skb, - const unsigned char *ptr, int opsize, - struct tcp_options_received *opt_rx) -{ -} - static inline bool mptcp_syn_options(struct sock *sk, const struct sk_buff *skb, unsigned int *size, struct mptcp_out_options *opts) -- cgit v1.2.3 From 5224f79096170bf7b92cc8fe42a12f44b91e5f62 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Mon, 14 Feb 2022 19:11:44 -0600 Subject: treewide: Replace zero-length arrays with flexible-array members MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is a regular need in the kernel to provide a way to declare having a dynamically sized set of trailing elements in a structure. Kernel code should always use “flexible array members”[1] for these cases. The older style of one-element or zero-length arrays should no longer be used[2]. This code was transformed with the help of Coccinelle: (next-20220214$ spatch --jobs $(getconf _NPROCESSORS_ONLN) --sp-file script.cocci --include-headers --dir . > output.patch) @@ identifier S, member, array; type T1, T2; @@ struct S { ... T1 member; T2 array[ - 0 ]; }; UAPI and wireless changes were intentionally excluded from this patch and will be sent out separately. [1] https://en.wikipedia.org/wiki/Flexible_array_member [2] https://www.kernel.org/doc/html/v5.16/process/deprecated.html#zero-length-and-one-element-arrays Link: https://github.com/KSPP/linux/issues/78 Reviewed-by: Kees Cook Signed-off-by: Gustavo A. R. Silva --- include/net/bluetooth/mgmt.h | 2 +- include/net/ioam6.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/bluetooth/mgmt.h b/include/net/bluetooth/mgmt.h index 107b25deae68..9607ec289fd0 100644 --- a/include/net/bluetooth/mgmt.h +++ b/include/net/bluetooth/mgmt.h @@ -696,7 +696,7 @@ struct mgmt_cp_set_blocked_keys { #define MGMT_READ_CONTROLLER_CAP_SIZE 0 struct mgmt_rp_read_controller_cap { __le16 cap_len; - __u8 cap[0]; + __u8 cap[]; } __packed; #define MGMT_OP_READ_EXP_FEATURES_INFO 0x0049 diff --git a/include/net/ioam6.h b/include/net/ioam6.h index 3f45ba37a2c6..781d2d8b2f29 100644 --- a/include/net/ioam6.h +++ b/include/net/ioam6.h @@ -35,7 +35,7 @@ struct ioam6_schema { int len; __be32 hdr; - u8 data[0]; + u8 data[]; }; struct ioam6_pernet_data { -- cgit v1.2.3 From d95d6320ba7a51d61c097ffc3bcafcf70283414e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 16 Feb 2022 09:32:17 -0800 Subject: ipv6: fix data-race in fib6_info_hw_flags_set / fib6_purge_rt Because fib6_info_hw_flags_set() is called without any synchronization, all accesses to gi6->offload, fi->trap and fi->offload_failed need some basic protection like READ_ONCE()/WRITE_ONCE(). BUG: KCSAN: data-race in fib6_info_hw_flags_set / fib6_purge_rt read to 0xffff8881087d5886 of 1 bytes by task 13953 on cpu 0: fib6_drop_pcpu_from net/ipv6/ip6_fib.c:1007 [inline] fib6_purge_rt+0x4f/0x580 net/ipv6/ip6_fib.c:1033 fib6_del_route net/ipv6/ip6_fib.c:1983 [inline] fib6_del+0x696/0x890 net/ipv6/ip6_fib.c:2028 __ip6_del_rt net/ipv6/route.c:3876 [inline] ip6_del_rt+0x83/0x140 net/ipv6/route.c:3891 __ipv6_dev_ac_dec+0x2b5/0x370 net/ipv6/anycast.c:374 ipv6_dev_ac_dec net/ipv6/anycast.c:387 [inline] __ipv6_sock_ac_close+0x141/0x200 net/ipv6/anycast.c:207 ipv6_sock_ac_close+0x79/0x90 net/ipv6/anycast.c:220 inet6_release+0x32/0x50 net/ipv6/af_inet6.c:476 __sock_release net/socket.c:650 [inline] sock_close+0x6c/0x150 net/socket.c:1318 __fput+0x295/0x520 fs/file_table.c:280 ____fput+0x11/0x20 fs/file_table.c:313 task_work_run+0x8e/0x110 kernel/task_work.c:164 tracehook_notify_resume include/linux/tracehook.h:189 [inline] exit_to_user_mode_loop kernel/entry/common.c:175 [inline] exit_to_user_mode_prepare+0x160/0x190 kernel/entry/common.c:207 __syscall_exit_to_user_mode_work kernel/entry/common.c:289 [inline] syscall_exit_to_user_mode+0x20/0x40 kernel/entry/common.c:300 do_syscall_64+0x50/0xd0 arch/x86/entry/common.c:86 entry_SYSCALL_64_after_hwframe+0x44/0xae write to 0xffff8881087d5886 of 1 bytes by task 1912 on cpu 1: fib6_info_hw_flags_set+0x155/0x3b0 net/ipv6/route.c:6230 nsim_fib6_rt_hw_flags_set drivers/net/netdevsim/fib.c:668 [inline] nsim_fib6_rt_add drivers/net/netdevsim/fib.c:691 [inline] nsim_fib6_rt_insert drivers/net/netdevsim/fib.c:756 [inline] nsim_fib6_event drivers/net/netdevsim/fib.c:853 [inline] nsim_fib_event drivers/net/netdevsim/fib.c:886 [inline] nsim_fib_event_work+0x284f/0x2cf0 drivers/net/netdevsim/fib.c:1477 process_one_work+0x3f6/0x960 kernel/workqueue.c:2307 worker_thread+0x616/0xa70 kernel/workqueue.c:2454 kthread+0x2c7/0x2e0 kernel/kthread.c:327 ret_from_fork+0x1f/0x30 value changed: 0x22 -> 0x2a Reported by Kernel Concurrency Sanitizer on: CPU: 1 PID: 1912 Comm: kworker/1:3 Not tainted 5.16.0-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Workqueue: events nsim_fib_event_work Fixes: 0c5fcf9e249e ("IPv6: Add "offload failed" indication to routes") Fixes: bb3c4ab93e44 ("ipv6: Add "offload" and "trap" indications to routes") Signed-off-by: Eric Dumazet Cc: Amit Cohen Cc: Ido Schimmel Reported-by: syzbot Link: https://lore.kernel.org/r/20220216173217.3792411-2-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski --- include/net/ip6_fib.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'include/net') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 40ae8f1b18e5..2048bc8748cb 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -190,14 +190,16 @@ struct fib6_info { u32 fib6_metric; u8 fib6_protocol; u8 fib6_type; + + u8 offload; + u8 trap; + u8 offload_failed; + u8 should_flush:1, dst_nocount:1, dst_nopolicy:1, fib6_destroying:1, - offload:1, - trap:1, - offload_failed:1, - unused:1; + unused:4; struct rcu_head rcu; struct nexthop *nh; -- cgit v1.2.3 From d2b1d186ce2eac6b15d31db3e2750ee8e02bbe81 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Wed, 16 Feb 2022 21:37:26 +0200 Subject: net: dsa: delete unused exported symbols for ethtool PHY stats Introduced in commit cf963573039a ("net: dsa: Allow providing PHY statistics from CPU port"), it appears these were never used. Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Link: https://lore.kernel.org/r/20220216193726.2926320-1-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- include/net/dsa.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/net') diff --git a/include/net/dsa.h b/include/net/dsa.h index 1456313a1faa..c8626dec970c 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -1257,9 +1257,6 @@ static inline bool dsa_slave_dev_check(const struct net_device *dev) #endif netdev_tx_t dsa_enqueue_skb(struct sk_buff *skb, struct net_device *dev); -int dsa_port_get_phy_strings(struct dsa_port *dp, uint8_t *data); -int dsa_port_get_ethtool_phy_stats(struct dsa_port *dp, uint64_t *data); -int dsa_port_get_phy_sset_count(struct dsa_port *dp); void dsa_port_phylink_mac_change(struct dsa_switch *ds, int port, bool up); struct dsa_tag_driver { -- cgit v1.2.3 From 8467fadc115cb08bb1cbc7885cb7b7ef1871cae4 Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Thu, 17 Feb 2022 10:07:55 +0200 Subject: net: gro: Fix a 'directive in macro's argument list' sparse warning Following the cited commit, sparse started complaining about: ../include/net/gro.h:58:1: warning: directive in macro's argument list ../include/net/gro.h:59:1: warning: directive in macro's argument list Fix that by moving the defines out of the struct_group() macro. Fixes: de5a1f3ce4c8 ("net: gro: minor optimization for dev_gro_receive()") Reviewed-by: Maxim Mikityanskiy Signed-off-by: Gal Pressman Acked-by: Alexander Lobakin Acked-by: Paolo Abeni Signed-off-by: David S. Miller --- include/net/gro.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/gro.h b/include/net/gro.h index a765fedda5c4..867656b0739c 100644 --- a/include/net/gro.h +++ b/include/net/gro.h @@ -35,6 +35,9 @@ struct napi_gro_cb { /* jiffies when first packet was created/queued */ unsigned long age; +/* Used in napi_gro_cb::free */ +#define NAPI_GRO_FREE 1 +#define NAPI_GRO_FREE_STOLEN_HEAD 2 /* portion of the cb set to zero at every gro iteration */ struct_group(zeroed, @@ -55,8 +58,6 @@ struct napi_gro_cb { /* Free the skb? */ u8 free:2; -#define NAPI_GRO_FREE 1 -#define NAPI_GRO_FREE_STOLEN_HEAD 2 /* Used in foo-over-udp, set in udp[46]_gro_receive */ u8 is_ipv6:1; -- cgit v1.2.3 From a1cdec57e03a1352e92fbbe7974039dda4efcec0 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 17 Feb 2022 09:05:02 -0800 Subject: net-timestamp: convert sk->sk_tskey to atomic_t UDP sendmsg() can be lockless, this is causing all kinds of data races. This patch converts sk->sk_tskey to remove one of these races. BUG: KCSAN: data-race in __ip_append_data / __ip_append_data read to 0xffff8881035d4b6c of 4 bytes by task 8877 on cpu 1: __ip_append_data+0x1c1/0x1de0 net/ipv4/ip_output.c:994 ip_make_skb+0x13f/0x2d0 net/ipv4/ip_output.c:1636 udp_sendmsg+0x12bd/0x14c0 net/ipv4/udp.c:1249 inet_sendmsg+0x5f/0x80 net/ipv4/af_inet.c:819 sock_sendmsg_nosec net/socket.c:705 [inline] sock_sendmsg net/socket.c:725 [inline] ____sys_sendmsg+0x39a/0x510 net/socket.c:2413 ___sys_sendmsg net/socket.c:2467 [inline] __sys_sendmmsg+0x267/0x4c0 net/socket.c:2553 __do_sys_sendmmsg net/socket.c:2582 [inline] __se_sys_sendmmsg net/socket.c:2579 [inline] __x64_sys_sendmmsg+0x53/0x60 net/socket.c:2579 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x44/0xd0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae write to 0xffff8881035d4b6c of 4 bytes by task 8880 on cpu 0: __ip_append_data+0x1d8/0x1de0 net/ipv4/ip_output.c:994 ip_make_skb+0x13f/0x2d0 net/ipv4/ip_output.c:1636 udp_sendmsg+0x12bd/0x14c0 net/ipv4/udp.c:1249 inet_sendmsg+0x5f/0x80 net/ipv4/af_inet.c:819 sock_sendmsg_nosec net/socket.c:705 [inline] sock_sendmsg net/socket.c:725 [inline] ____sys_sendmsg+0x39a/0x510 net/socket.c:2413 ___sys_sendmsg net/socket.c:2467 [inline] __sys_sendmmsg+0x267/0x4c0 net/socket.c:2553 __do_sys_sendmmsg net/socket.c:2582 [inline] __se_sys_sendmmsg net/socket.c:2579 [inline] __x64_sys_sendmmsg+0x53/0x60 net/socket.c:2579 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x44/0xd0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae value changed: 0x0000054d -> 0x0000054e Reported by Kernel Concurrency Sanitizer on: CPU: 0 PID: 8880 Comm: syz-executor.5 Not tainted 5.17.0-rc2-syzkaller-00167-gdcb85f85fa6f-dirty #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Fixes: 09c2d251b707 ("net-timestamp: add key to disambiguate concurrent datagrams") Signed-off-by: Eric Dumazet Cc: Willem de Bruijn Reported-by: syzbot Signed-off-by: David S. Miller --- include/net/sock.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/sock.h b/include/net/sock.h index ff9b508d9c5f..50aecd28b355 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -507,7 +507,7 @@ struct sock { #endif u16 sk_tsflags; u8 sk_shutdown; - u32 sk_tskey; + atomic_t sk_tskey; atomic_t sk_zckey; u8 sk_clockid; @@ -2667,7 +2667,7 @@ static inline void _sock_tx_timestamp(struct sock *sk, __u16 tsflags, __sock_tx_timestamp(tsflags, tx_flags); if (tsflags & SOF_TIMESTAMPING_OPT_ID && tskey && tsflags & SOF_TIMESTAMPING_TX_RECORD_MASK) - *tskey = sk->sk_tskey++; + *tskey = atomic_inc_return(&sk->sk_tskey) - 1; } if (unlikely(sock_flag(sk, SOCK_WIFI_STATUS))) *tx_flags |= SKBTX_WIFI_STATUS; -- cgit v1.2.3 From bde018222c6b084ac32933a9f933581dd83da18e Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Thu, 17 Feb 2022 18:30:35 +0000 Subject: net: dsa: add support for phylink mac_select_pcs() Add DSA support for the phylink mac_select_pcs() method so DSA drivers can return provide phylink with the appropriate PCS for the PHY interface mode. Signed-off-by: Russell King (Oracle) Signed-off-by: David S. Miller --- include/net/dsa.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/net') diff --git a/include/net/dsa.h b/include/net/dsa.h index c8626dec970c..bc6eef6af810 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -788,6 +788,9 @@ struct dsa_switch_ops { void (*phylink_validate)(struct dsa_switch *ds, int port, unsigned long *supported, struct phylink_link_state *state); + struct phylink_pcs *(*phylink_mac_select_pcs)(struct dsa_switch *ds, + int port, + phy_interface_t iface); int (*phylink_mac_link_state)(struct dsa_switch *ds, int port, struct phylink_link_state *state); void (*phylink_mac_config)(struct dsa_switch *ds, int port, -- cgit v1.2.3 From 47f0bd5032106469827cf56c8b45bb9101112105 Mon Sep 17 00:00:00 2001 From: Jacques de Laval Date: Thu, 17 Feb 2022 16:02:02 +0100 Subject: net: Add new protocol attribute to IP addresses This patch adds a new protocol attribute to IPv4 and IPv6 addresses. Inspiration was taken from the protocol attribute of routes. User space applications like iproute2 can set/get the protocol with the Netlink API. The attribute is stored as an 8-bit unsigned integer. The protocol attribute is set by kernel for these categories: - IPv4 and IPv6 loopback addresses - IPv6 addresses generated from router announcements - IPv6 link local addresses User space may pass custom protocols, not defined by the kernel. Grouping addresses on their origin is useful in scenarios where you want to distinguish between addresses based on who added them, e.g. kernel vs. user space. Tagging addresses with a string label is an existing feature that could be used as a solution. Unfortunately the max length of a label is 15 characters, and for compatibility reasons the label must be prefixed with the name of the device followed by a colon. Since device names also have a max length of 15 characters, only -1 characters is guaranteed to be available for any origin tag, which is not that much. A reference implementation of user space setting and getting protocols is available for iproute2: https://github.com/westermo/iproute2/commit/9a6ea18bd79f47f293e5edc7780f315ea42ff540 Signed-off-by: Jacques de Laval Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20220217150202.80802-1-Jacques.De.Laval@westermo.com Signed-off-by: Jakub Kicinski --- include/net/addrconf.h | 2 ++ include/net/if_inet6.h | 2 ++ 2 files changed, 4 insertions(+) (limited to 'include/net') diff --git a/include/net/addrconf.h b/include/net/addrconf.h index 59940e230b78..f7506f08e505 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -64,6 +64,8 @@ struct ifa6_config { const struct in6_addr *pfx; unsigned int plen; + u8 ifa_proto; + const struct in6_addr *peer_pfx; u32 rt_priority; diff --git a/include/net/if_inet6.h b/include/net/if_inet6.h index f026cf08a8e8..4cfdef6ca4f6 100644 --- a/include/net/if_inet6.h +++ b/include/net/if_inet6.h @@ -71,6 +71,8 @@ struct inet6_ifaddr { bool tokenized; + u8 ifa_proto; + struct rcu_head rcu; struct in6_addr peer_addr; }; -- cgit v1.2.3 From cb196b725936f6b776ad1d073f66fbe92aa798fa Mon Sep 17 00:00:00 2001 From: Jeremy Kerr Date: Fri, 18 Feb 2022 12:25:53 +0800 Subject: mctp: replace mctp_address_ok with more fine-grained helpers Currently, we have mctp_address_ok(), which checks if an EID is in the "valid" range of 8-254 inclusive. However, 0 and 255 may also be valid addresses, depending on context. 0 is the NULL EID, which may be set when physical addressing is used. 255 is valid as a destination address for broadcasts. This change renames mctp_address_ok to mctp_address_unicast, and adds similar helpers for broadcast and null EIDs, which will be used in an upcoming commit. Signed-off-by: Jeremy Kerr Signed-off-by: Jakub Kicinski --- include/net/mctp.h | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/mctp.h b/include/net/mctp.h index e80a4baf8379..d37268fe6825 100644 --- a/include/net/mctp.h +++ b/include/net/mctp.h @@ -40,11 +40,21 @@ struct mctp_hdr { #define MCTP_INITIAL_DEFAULT_NET 1 -static inline bool mctp_address_ok(mctp_eid_t eid) +static inline bool mctp_address_unicast(mctp_eid_t eid) { return eid >= 8 && eid < 255; } +static inline bool mctp_address_broadcast(mctp_eid_t eid) +{ + return eid == 255; +} + +static inline bool mctp_address_null(mctp_eid_t eid) +{ + return eid == 0; +} + static inline bool mctp_address_matches(mctp_eid_t match, mctp_eid_t eid) { return match == eid || match == MCTP_ADDR_ANY; -- cgit v1.2.3 From 5486f5bf790b5c664913076c3194b8f916a5c7ad Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 17 Feb 2022 14:35:49 +0100 Subject: net: Force inlining of checksum functions in net/checksum.h All functions defined as static inline in net/checksum.h are meant to be inlined for performance reason. But since commit ac7c3e4ff401 ("compiler: enable CONFIG_OPTIMIZE_INLINING forcibly") the compiler is allowed to uninline functions when it wants. Fair enough in the general case, but for tiny performance critical checksum helpers that's counter-productive. The problem mainly arises when selecting CONFIG_CC_OPTIMISE_FOR_SIZE, Those helpers being 'static inline' in header files you suddenly find them duplicated many times in the resulting vmlinux. Here is a typical exemple when building powerpc pmac32_defconfig with CONFIG_CC_OPTIMISE_FOR_SIZE. csum_sub() appears 4 times: c04a23cc : c04a23cc: 7c 84 20 f8 not r4,r4 c04a23d0: 7c 63 20 14 addc r3,r3,r4 c04a23d4: 7c 63 01 94 addze r3,r3 c04a23d8: 4e 80 00 20 blr ... c04a2ce8: 4b ff f6 e5 bl c04a23cc ... c04a2d2c: 4b ff f6 a1 bl c04a23cc ... c04a2d54: 4b ff f6 79 bl c04a23cc ... c04a754c : c04a754c: 7c 84 20 f8 not r4,r4 c04a7550: 7c 63 20 14 addc r3,r3,r4 c04a7554: 7c 63 01 94 addze r3,r3 c04a7558: 4e 80 00 20 blr ... c04ac930: 4b ff ac 1d bl c04a754c ... c04ad264: 4b ff a2 e9 bl c04a754c ... c04e3b08 : c04e3b08: 7c 84 20 f8 not r4,r4 c04e3b0c: 7c 63 20 14 addc r3,r3,r4 c04e3b10: 7c 63 01 94 addze r3,r3 c04e3b14: 4e 80 00 20 blr ... c04e5788: 4b ff e3 81 bl c04e3b08 ... c04e65c8: 4b ff d5 41 bl c04e3b08 ... c0512d34 : c0512d34: 7c 84 20 f8 not r4,r4 c0512d38: 7c 63 20 14 addc r3,r3,r4 c0512d3c: 7c 63 01 94 addze r3,r3 c0512d40: 4e 80 00 20 blr ... c0512dfc: 4b ff ff 39 bl c0512d34 ... c05138bc: 4b ff f4 79 bl c0512d34 ... Restore the expected behaviour by using __always_inline for all functions defined in net/checksum.h vmlinux size is even reduced by 256 bytes with this patch: text data bss dec hex filename 6980022 2515362 194384 9689768 93daa8 vmlinux.before 6979862 2515266 194384 9689512 93d9a8 vmlinux.now Fixes: ac7c3e4ff401 ("compiler: enable CONFIG_OPTIMIZE_INLINING forcibly") Cc: Masahiro Yamada Cc: Nick Desaulniers Cc: Andrew Morton Signed-off-by: Christophe Leroy Signed-off-by: David S. Miller --- include/net/checksum.h | 47 ++++++++++++++++++++++++----------------------- 1 file changed, 24 insertions(+), 23 deletions(-) (limited to 'include/net') diff --git a/include/net/checksum.h b/include/net/checksum.h index 5218041e5c8f..02d0c2d01014 100644 --- a/include/net/checksum.h +++ b/include/net/checksum.h @@ -22,7 +22,7 @@ #include #ifndef _HAVE_ARCH_COPY_AND_CSUM_FROM_USER -static inline +static __always_inline __wsum csum_and_copy_from_user (const void __user *src, void *dst, int len) { @@ -33,7 +33,7 @@ __wsum csum_and_copy_from_user (const void __user *src, void *dst, #endif #ifndef HAVE_CSUM_COPY_USER -static __inline__ __wsum csum_and_copy_to_user +static __always_inline __wsum csum_and_copy_to_user (const void *src, void __user *dst, int len) { __wsum sum = csum_partial(src, len, ~0U); @@ -45,7 +45,7 @@ static __inline__ __wsum csum_and_copy_to_user #endif #ifndef _HAVE_ARCH_CSUM_AND_COPY -static inline __wsum +static __always_inline __wsum csum_partial_copy_nocheck(const void *src, void *dst, int len) { memcpy(dst, src, len); @@ -54,7 +54,7 @@ csum_partial_copy_nocheck(const void *src, void *dst, int len) #endif #ifndef HAVE_ARCH_CSUM_ADD -static inline __wsum csum_add(__wsum csum, __wsum addend) +static __always_inline __wsum csum_add(__wsum csum, __wsum addend) { u32 res = (__force u32)csum; res += (__force u32)addend; @@ -62,12 +62,12 @@ static inline __wsum csum_add(__wsum csum, __wsum addend) } #endif -static inline __wsum csum_sub(__wsum csum, __wsum addend) +static __always_inline __wsum csum_sub(__wsum csum, __wsum addend) { return csum_add(csum, ~addend); } -static inline __sum16 csum16_add(__sum16 csum, __be16 addend) +static __always_inline __sum16 csum16_add(__sum16 csum, __be16 addend) { u16 res = (__force u16)csum; @@ -75,12 +75,12 @@ static inline __sum16 csum16_add(__sum16 csum, __be16 addend) return (__force __sum16)(res + (res < (__force u16)addend)); } -static inline __sum16 csum16_sub(__sum16 csum, __be16 addend) +static __always_inline __sum16 csum16_sub(__sum16 csum, __be16 addend) { return csum16_add(csum, ~addend); } -static inline __wsum csum_shift(__wsum sum, int offset) +static __always_inline __wsum csum_shift(__wsum sum, int offset) { /* rotate sum to align it with a 16b boundary */ if (offset & 1) @@ -88,42 +88,43 @@ static inline __wsum csum_shift(__wsum sum, int offset) return sum; } -static inline __wsum +static __always_inline __wsum csum_block_add(__wsum csum, __wsum csum2, int offset) { return csum_add(csum, csum_shift(csum2, offset)); } -static inline __wsum +static __always_inline __wsum csum_block_add_ext(__wsum csum, __wsum csum2, int offset, int len) { return csum_block_add(csum, csum2, offset); } -static inline __wsum +static __always_inline __wsum csum_block_sub(__wsum csum, __wsum csum2, int offset) { return csum_block_add(csum, ~csum2, offset); } -static inline __wsum csum_unfold(__sum16 n) +static __always_inline __wsum csum_unfold(__sum16 n) { return (__force __wsum)n; } -static inline __wsum csum_partial_ext(const void *buff, int len, __wsum sum) +static __always_inline +__wsum csum_partial_ext(const void *buff, int len, __wsum sum) { return csum_partial(buff, len, sum); } #define CSUM_MANGLED_0 ((__force __sum16)0xffff) -static inline void csum_replace_by_diff(__sum16 *sum, __wsum diff) +static __always_inline void csum_replace_by_diff(__sum16 *sum, __wsum diff) { *sum = csum_fold(csum_add(diff, ~csum_unfold(*sum))); } -static inline void csum_replace4(__sum16 *sum, __be32 from, __be32 to) +static __always_inline void csum_replace4(__sum16 *sum, __be32 from, __be32 to) { __wsum tmp = csum_sub(~csum_unfold(*sum), (__force __wsum)from); @@ -136,7 +137,7 @@ static inline void csum_replace4(__sum16 *sum, __be32 from, __be32 to) * m : old value of a 16bit field * m' : new value of a 16bit field */ -static inline void csum_replace2(__sum16 *sum, __be16 old, __be16 new) +static __always_inline void csum_replace2(__sum16 *sum, __be16 old, __be16 new) { *sum = ~csum16_add(csum16_sub(~(*sum), old), new); } @@ -150,16 +151,16 @@ void inet_proto_csum_replace16(__sum16 *sum, struct sk_buff *skb, void inet_proto_csum_replace_by_diff(__sum16 *sum, struct sk_buff *skb, __wsum diff, bool pseudohdr); -static inline void inet_proto_csum_replace2(__sum16 *sum, struct sk_buff *skb, - __be16 from, __be16 to, - bool pseudohdr) +static __always_inline +void inet_proto_csum_replace2(__sum16 *sum, struct sk_buff *skb, + __be16 from, __be16 to, bool pseudohdr) { inet_proto_csum_replace4(sum, skb, (__force __be32)from, (__force __be32)to, pseudohdr); } -static inline __wsum remcsum_adjust(void *ptr, __wsum csum, - int start, int offset) +static __always_inline __wsum remcsum_adjust(void *ptr, __wsum csum, + int start, int offset) { __sum16 *psum = (__sum16 *)(ptr + offset); __wsum delta; @@ -175,12 +176,12 @@ static inline __wsum remcsum_adjust(void *ptr, __wsum csum, return delta; } -static inline void remcsum_unadjust(__sum16 *psum, __wsum delta) +static __always_inline void remcsum_unadjust(__sum16 *psum, __wsum delta) { *psum = csum_fold(csum_sub(delta, (__force __wsum)*psum)); } -static inline __wsum wsum_negate(__wsum val) +static __always_inline __wsum wsum_negate(__wsum val) { return (__force __wsum)-((__force u32)val); } -- cgit v1.2.3 From ccfbf44d4c7fb7c64cf79b3f2a5ae522e5165878 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Sat, 19 Feb 2022 11:47:17 +0000 Subject: net: dsa: remove pcs_poll With drivers converted over to using phylink PCS, there is no need for the struct dsa_switch member "pcs_poll" to exist anymore - there is a flag in the struct phylink_pcs which indicates whether this PCS needs to be polled which supersedes this. Signed-off-by: Russell King (Oracle) Signed-off-by: David S. Miller --- include/net/dsa.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/net') diff --git a/include/net/dsa.h b/include/net/dsa.h index bc6eef6af810..f13de2d8aef3 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -391,11 +391,6 @@ struct dsa_switch { */ u32 vlan_filtering:1; - /* MAC PCS does not provide link state change interrupt, and requires - * polling. Flag passed on to PHYLINK. - */ - u32 pcs_poll:1; - /* For switches that only have the MRU configurable. To ensure the * configured MTU is not exceeded, normalization of MRU on all bridged * interfaces is needed. -- cgit v1.2.3 From b1a5983f56e371046dcf164f90bfaf704d2b89f6 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 17 Feb 2022 23:41:20 +0100 Subject: netfilter: nf_tables_offload: incorrect flow offload action array size immediate verdict expression needs to allocate one slot in the flow offload action array, however, immediate data expression does not need to do so. fwd and dup expression need to allocate one slot, this is missing. Add a new offload_action interface to report if this expression needs to allocate one slot in the flow offload action array. Fixes: be2861dc36d7 ("netfilter: nft_{fwd,dup}_netdev: add offload support") Reported-and-tested-by: Nick Gregory Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 2 +- include/net/netfilter/nf_tables_offload.h | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index eaf55da9a205..c4c0861deac1 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -905,9 +905,9 @@ struct nft_expr_ops { int (*offload)(struct nft_offload_ctx *ctx, struct nft_flow_rule *flow, const struct nft_expr *expr); + bool (*offload_action)(const struct nft_expr *expr); void (*offload_stats)(struct nft_expr *expr, const struct flow_stats *stats); - u32 offload_flags; const struct nft_expr_type *type; void *data; }; diff --git a/include/net/netfilter/nf_tables_offload.h b/include/net/netfilter/nf_tables_offload.h index f9d95ff82df8..797147843958 100644 --- a/include/net/netfilter/nf_tables_offload.h +++ b/include/net/netfilter/nf_tables_offload.h @@ -67,8 +67,6 @@ struct nft_flow_rule { struct flow_rule *rule; }; -#define NFT_OFFLOAD_F_ACTION (1 << 0) - void nft_flow_rule_set_addr_type(struct nft_flow_rule *flow, enum flow_dissector_key_id addr_type); -- cgit v1.2.3 From 7a26dc9e7b43f5a24c4b843713e728582adf1c38 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Sun, 20 Feb 2022 15:06:33 +0800 Subject: net: tcp: add skb drop reasons to tcp_add_backlog() Pass the address of drop_reason to tcp_add_backlog() to store the reasons for skb drops when fails. Following drop reasons are introduced: SKB_DROP_REASON_SOCKET_BACKLOG Reviewed-by: Mengen Sun Reviewed-by: Hao Peng Signed-off-by: Menglong Dong Reviewed-by: Eric Dumazet Reviewed-by: David Ahern Signed-off-by: David S. Miller --- include/net/tcp.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/tcp.h b/include/net/tcp.h index eff2487d972d..04f4650e0ff0 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1367,7 +1367,8 @@ static inline bool tcp_checksum_complete(struct sk_buff *skb) __skb_checksum_complete(skb); } -bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb); +bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb, + enum skb_drop_reason *reason); #ifdef CONFIG_INET void __sk_defer_free_flush(struct sock *sk); -- cgit v1.2.3 From 0c51e12e218f20b7d976158fdc18019627326f7a Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Sat, 19 Feb 2022 17:45:19 +0200 Subject: ipv4: Invalidate neighbour for broadcast address upon address addition In case user space sends a packet destined to a broadcast address when a matching broadcast route is not configured, the kernel will create a unicast neighbour entry that will never be resolved [1]. When the broadcast route is configured, the unicast neighbour entry will not be invalidated and continue to linger, resulting in packets being dropped. Solve this by invalidating unresolved neighbour entries for broadcast addresses after routes for these addresses are internally configured by the kernel. This allows the kernel to create a broadcast neighbour entry following the next route lookup. Another possible solution that is more generic but also more complex is to have the ARP code register a listener to the FIB notification chain and invalidate matching neighbour entries upon the addition of broadcast routes. It is also possible to wave off the issue as a user space problem, but it seems a bit excessive to expect user space to be that intimately familiar with the inner workings of the FIB/neighbour kernel code. [1] https://lore.kernel.org/netdev/55a04a8f-56f3-f73c-2aea-2195923f09d1@huawei.com/ Reported-by: Wang Hai Signed-off-by: Ido Schimmel Tested-by: Wang Hai Signed-off-by: David S. Miller --- include/net/arp.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/arp.h b/include/net/arp.h index 031374ac2f22..d7ef4ec71dfe 100644 --- a/include/net/arp.h +++ b/include/net/arp.h @@ -65,6 +65,7 @@ void arp_send(int type, int ptype, __be32 dest_ip, const unsigned char *src_hw, const unsigned char *th); int arp_mc_map(__be32 addr, u8 *haddr, struct net_device *dev, int dir); void arp_ifdown(struct net_device *dev); +int arp_invalidate(struct net_device *dev, __be32 ip, bool force); struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip, struct net_device *dev, __be32 src_ip, -- cgit v1.2.3 From 696c65444120cfe98ae704e86a59df8666fabf1d Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Mon, 21 Feb 2022 13:54:53 +0800 Subject: ipv6: separate ndisc_ns_create() from ndisc_send_ns() This patch separate NS message allocation steps from ndisc_send_ns(), so it could be used in other places, like bonding, to allocate and send IPv6 NS message. Also export ndisc_send_skb() and ndisc_ns_create() for later bonding usage. Signed-off-by: Hangbin Liu Signed-off-by: David S. Miller --- include/net/ndisc.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/net') diff --git a/include/net/ndisc.h b/include/net/ndisc.h index 53cb8de0e589..aac3a42de432 100644 --- a/include/net/ndisc.h +++ b/include/net/ndisc.h @@ -447,10 +447,15 @@ void ndisc_cleanup(void); int ndisc_rcv(struct sk_buff *skb); +struct sk_buff *ndisc_ns_create(struct net_device *dev, const struct in6_addr *solicit, + const struct in6_addr *saddr, u64 nonce); void ndisc_send_ns(struct net_device *dev, const struct in6_addr *solicit, const struct in6_addr *daddr, const struct in6_addr *saddr, u64 nonce); +void ndisc_send_skb(struct sk_buff *skb, const struct in6_addr *daddr, + const struct in6_addr *saddr); + void ndisc_send_rs(struct net_device *dev, const struct in6_addr *saddr, const struct in6_addr *daddr); void ndisc_send_na(struct net_device *dev, const struct in6_addr *daddr, -- cgit v1.2.3 From 841e95641e4cb695586b036f330d6d272fcd7173 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Mon, 21 Feb 2022 13:54:55 +0800 Subject: bonding: add extra field for bond_opt_value Adding an extra storage field for bond_opt_value so we can set large bytes of data for bonding options in future, e.g. IPv6 address. Define a new call bond_opt_initextra(). Also change the checking order of __bond_opt_init() and check values first. Signed-off-by: Hangbin Liu Signed-off-by: David S. Miller --- include/net/bond_options.h | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) (limited to 'include/net') diff --git a/include/net/bond_options.h b/include/net/bond_options.h index dd75c071f67e..286b29c6c451 100644 --- a/include/net/bond_options.h +++ b/include/net/bond_options.h @@ -71,14 +71,18 @@ enum { /* This structure is used for storing option values and for passing option * values when changing an option. The logic when used as an arg is as follows: - * - if string != NULL -> parse it, if the opt is RAW type then return it, else - * return the parse result - * - if string == NULL -> parse value + * - if value != ULLONG_MAX -> parse value + * - if string != NULL -> parse string + * - if the opt is RAW data and length less than maxlen, + * copy the data to extra storage */ + +#define BOND_OPT_EXTRA_MAXLEN 16 struct bond_opt_value { char *string; u64 value; u32 flags; + char extra[BOND_OPT_EXTRA_MAXLEN]; }; struct bonding; @@ -118,17 +122,22 @@ const struct bond_opt_value *bond_opt_get_val(unsigned int option, u64 val); * When value is ULLONG_MAX then string will be used. */ static inline void __bond_opt_init(struct bond_opt_value *optval, - char *string, u64 value) + char *string, u64 value, + void *extra, size_t extra_len) { memset(optval, 0, sizeof(*optval)); optval->value = ULLONG_MAX; - if (value == ULLONG_MAX) - optval->string = string; - else + if (value != ULLONG_MAX) optval->value = value; + else if (string) + optval->string = string; + else if (extra_len <= BOND_OPT_EXTRA_MAXLEN) + memcpy(optval->extra, extra, extra_len); } -#define bond_opt_initval(optval, value) __bond_opt_init(optval, NULL, value) -#define bond_opt_initstr(optval, str) __bond_opt_init(optval, str, ULLONG_MAX) +#define bond_opt_initval(optval, value) __bond_opt_init(optval, NULL, value, NULL, 0) +#define bond_opt_initstr(optval, str) __bond_opt_init(optval, str, ULLONG_MAX, NULL, 0) +#define bond_opt_initextra(optval, extra, extra_len) \ + __bond_opt_init(optval, NULL, ULLONG_MAX, extra, extra_len) void bond_option_arp_ip_targets_clear(struct bonding *bond); -- cgit v1.2.3 From 4e24be018eb9dbcefa4b01c07e298b147dc1a4d7 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Mon, 21 Feb 2022 13:54:56 +0800 Subject: bonding: add new parameter ns_targets Add a new bonding parameter ns_targets to store IPv6 address. Add required bond_ns_send/rcv functions first before adding IPv6 address option setting. Add two functions bond_send/rcv_validate so we can send/recv ARP and NS at the same time. Signed-off-by: Hangbin Liu Signed-off-by: David S. Miller --- include/net/bonding.h | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/bonding.h b/include/net/bonding.h index 7dead855a72d..f3b986f6b6e4 100644 --- a/include/net/bonding.h +++ b/include/net/bonding.h @@ -29,8 +29,11 @@ #include #include #include +#include +#include #define BOND_MAX_ARP_TARGETS 16 +#define BOND_MAX_NS_TARGETS BOND_MAX_ARP_TARGETS #define BOND_DEFAULT_MIIMON 100 @@ -146,6 +149,7 @@ struct bond_params { struct reciprocal_value reciprocal_packets_per_slave; u16 ad_actor_sys_prio; u16 ad_user_port_key; + struct in6_addr ns_targets[BOND_MAX_NS_TARGETS]; /* 2 bytes of padding : see ether_addr_equal_64bits() */ u8 ad_actor_system[ETH_ALEN + 2]; @@ -628,7 +632,7 @@ struct bond_net { struct class_attribute class_attr_bonding_masters; }; -int bond_arp_rcv(const struct sk_buff *skb, struct bonding *bond, struct slave *slave); +int bond_rcv_validate(const struct sk_buff *skb, struct bonding *bond, struct slave *slave); netdev_tx_t bond_dev_queue_xmit(struct bonding *bond, struct sk_buff *skb, struct net_device *slave_dev); int bond_create(struct net *net, const char *name); int bond_create_sysfs(struct bond_net *net); @@ -735,6 +739,19 @@ static inline int bond_get_targets_ip(__be32 *targets, __be32 ip) return -1; } +static inline int bond_get_targets_ip6(struct in6_addr *targets, struct in6_addr *ip) +{ + int i; + + for (i = 0; i < BOND_MAX_NS_TARGETS; i++) + if (ipv6_addr_equal(&targets[i], ip)) + return i; + else if (ipv6_addr_any(&targets[i])) + break; + + return -1; +} + /* exported from bond_main.c */ extern unsigned int bond_net_id; -- cgit v1.2.3 From 129e3c1bab24d27d0fa6e505a472345a92d7a2b0 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Mon, 21 Feb 2022 13:54:57 +0800 Subject: bonding: add new option ns_ip6_target This patch add a new bonding option ns_ip6_target, which correspond to the arp_ip_target. With this we set IPv6 targets and send IPv6 NS request to determine the health of the link. For other related options like the validation, we still use arp_validate, and will change to ns_validate later. Note: the sysfs configuration support was removed based on https://lore.kernel.org/netdev/8863.1645071997@famine Signed-off-by: Hangbin Liu Signed-off-by: David S. Miller --- include/net/bond_options.h | 4 ++++ include/net/bonding.h | 7 +++++++ 2 files changed, 11 insertions(+) (limited to 'include/net') diff --git a/include/net/bond_options.h b/include/net/bond_options.h index 286b29c6c451..61b49063791c 100644 --- a/include/net/bond_options.h +++ b/include/net/bond_options.h @@ -66,6 +66,7 @@ enum { BOND_OPT_PEER_NOTIF_DELAY, BOND_OPT_LACP_ACTIVE, BOND_OPT_MISSED_MAX, + BOND_OPT_NS_TARGETS, BOND_OPT_LAST }; @@ -140,5 +141,8 @@ static inline void __bond_opt_init(struct bond_opt_value *optval, __bond_opt_init(optval, NULL, ULLONG_MAX, extra, extra_len) void bond_option_arp_ip_targets_clear(struct bonding *bond); +#if IS_ENABLED(CONFIG_IPV6) +void bond_option_ns_ip6_targets_clear(struct bonding *bond); +#endif #endif /* _NET_BOND_OPTIONS_H */ diff --git a/include/net/bonding.h b/include/net/bonding.h index f3b986f6b6e4..d0dfe727e0b1 100644 --- a/include/net/bonding.h +++ b/include/net/bonding.h @@ -503,6 +503,13 @@ static inline int bond_is_ip_target_ok(__be32 addr) return !ipv4_is_lbcast(addr) && !ipv4_is_zeronet(addr); } +static inline int bond_is_ip6_target_ok(struct in6_addr *addr) +{ + return !ipv6_addr_any(addr) && + !ipv6_addr_loopback(addr) && + !ipv6_addr_is_multicast(addr); +} + /* Get the oldest arp which we've received on this slave for bond's * arp_targets. */ -- cgit v1.2.3 From d9b5ae5c1b241b91480aa30408be12fe91af834a Mon Sep 17 00:00:00 2001 From: Paul Blakey Date: Wed, 23 Feb 2022 18:34:16 +0200 Subject: openvswitch: Fix setting ipv6 fields causing hw csum failure Ipv6 ttl, label and tos fields are modified without first pulling/pushing the ipv6 header, which would have updated the hw csum (if available). This might cause csum validation when sending the packet to the stack, as can be seen in the trace below. Fix this by updating skb->csum if available. Trace resulted by ipv6 ttl dec and then sending packet to conntrack [actions: set(ipv6(hlimit=63)),ct(zone=99)]: [295241.900063] s_pf0vf2: hw csum failure [295241.923191] Call Trace: [295241.925728] [295241.927836] dump_stack+0x5c/0x80 [295241.931240] __skb_checksum_complete+0xac/0xc0 [295241.935778] nf_conntrack_tcp_packet+0x398/0xba0 [nf_conntrack] [295241.953030] nf_conntrack_in+0x498/0x5e0 [nf_conntrack] [295241.958344] __ovs_ct_lookup+0xac/0x860 [openvswitch] [295241.968532] ovs_ct_execute+0x4a7/0x7c0 [openvswitch] [295241.979167] do_execute_actions+0x54a/0xaa0 [openvswitch] [295242.001482] ovs_execute_actions+0x48/0x100 [openvswitch] [295242.006966] ovs_dp_process_packet+0x96/0x1d0 [openvswitch] [295242.012626] ovs_vport_receive+0x6c/0xc0 [openvswitch] [295242.028763] netdev_frame_hook+0xc0/0x180 [openvswitch] [295242.034074] __netif_receive_skb_core+0x2ca/0xcb0 [295242.047498] netif_receive_skb_internal+0x3e/0xc0 [295242.052291] napi_gro_receive+0xba/0xe0 [295242.056231] mlx5e_handle_rx_cqe_mpwrq_rep+0x12b/0x250 [mlx5_core] [295242.062513] mlx5e_poll_rx_cq+0xa0f/0xa30 [mlx5_core] [295242.067669] mlx5e_napi_poll+0xe1/0x6b0 [mlx5_core] [295242.077958] net_rx_action+0x149/0x3b0 [295242.086762] __do_softirq+0xd7/0x2d6 [295242.090427] irq_exit+0xf7/0x100 [295242.093748] do_IRQ+0x7f/0xd0 [295242.096806] common_interrupt+0xf/0xf [295242.100559] [295242.102750] RIP: 0033:0x7f9022e88cbd [295242.125246] RSP: 002b:00007f9022282b20 EFLAGS: 00000246 ORIG_RAX: ffffffffffffffda [295242.132900] RAX: 0000000000000005 RBX: 0000000000000010 RCX: 0000000000000000 [295242.140120] RDX: 00007f9022282ba8 RSI: 00007f9022282a30 RDI: 00007f9014005c30 [295242.147337] RBP: 00007f9014014d60 R08: 0000000000000020 R09: 00007f90254a8340 [295242.154557] R10: 00007f9022282a28 R11: 0000000000000246 R12: 0000000000000000 [295242.161775] R13: 00007f902308c000 R14: 000000000000002b R15: 00007f9022b71f40 Fixes: 3fdbd1ce11e5 ("openvswitch: add ipv6 'set' action") Signed-off-by: Paul Blakey Link: https://lore.kernel.org/r/20220223163416.24096-1-paulb@nvidia.com Signed-off-by: Jakub Kicinski --- include/net/checksum.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/net') diff --git a/include/net/checksum.h b/include/net/checksum.h index 02d0c2d01014..79c67f14c448 100644 --- a/include/net/checksum.h +++ b/include/net/checksum.h @@ -142,6 +142,11 @@ static __always_inline void csum_replace2(__sum16 *sum, __be16 old, __be16 new) *sum = ~csum16_add(csum16_sub(~(*sum), old), new); } +static inline void csum_replace(__wsum *csum, __wsum old, __wsum new) +{ + *csum = csum_add(csum_sub(*csum, old), new); +} + struct sk_buff; void inet_proto_csum_replace4(__sum16 *sum, struct sk_buff *skb, __be32 from, __be32 to, bool pseudohdr); -- cgit v1.2.3 From 29fb608396d6a62c1b85acc421ad7a4399085b9f Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Mon, 14 Feb 2022 17:59:38 -0800 Subject: Bluetooth: Fix bt_skb_sendmmsg not allocating partial chunks Since bt_skb_sendmmsg can be used with the likes of SOCK_STREAM it shall return the partial chunks it could allocate instead of freeing everything as otherwise it can cause problems like bellow. Fixes: 81be03e026dc ("Bluetooth: RFCOMM: Replace use of memcpy_from_msg with bt_skb_sendmmsg") Reported-by: Paul Menzel Link: https://lore.kernel.org/r/d7206e12-1b99-c3be-84f4-df22af427ef5@molgen.mpg.de BugLink: https://bugzilla.kernel.org/show_bug.cgi?id=215594 Signed-off-by: Luiz Augusto von Dentz Tested-by: Paul Menzel (Nokia N9 (MeeGo/Harmattan) Signed-off-by: Marcel Holtmann --- include/net/bluetooth/bluetooth.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h index 4b3d0b16c185..a647e5fabdbd 100644 --- a/include/net/bluetooth/bluetooth.h +++ b/include/net/bluetooth/bluetooth.h @@ -506,8 +506,7 @@ static inline struct sk_buff *bt_skb_sendmmsg(struct sock *sk, tmp = bt_skb_sendmsg(sk, msg, len, mtu, headroom, tailroom); if (IS_ERR(tmp)) { - kfree_skb(skb); - return tmp; + return skb; } len -= tmp->len; -- cgit v1.2.3 From a56a1138cbd85e4d565356199d60e1cb94e5a77a Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Thu, 17 Feb 2022 13:10:38 -0800 Subject: Bluetooth: hci_sync: Fix not using conn_timeout When using hci_le_create_conn_sync it shall wait for the conn_timeout since the connection complete may take longer than just 2 seconds. Also fix the masking of HCI_EV_LE_ENHANCED_CONN_COMPLETE and HCI_EV_LE_CONN_COMPLETE so they are never both set so we can predict which one the controller will use in case of HCI_OP_LE_CREATE_CONN. Fixes: 6cd29ec6ae5e3 ("Bluetooth: hci_sync: Wait for proper events when connecting LE") Signed-off-by: Luiz Augusto von Dentz Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci_core.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/net') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 586f69d084a2..e336e9c1dda4 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -1489,6 +1489,14 @@ void hci_conn_del_sysfs(struct hci_conn *conn); /* Extended advertising support */ #define ext_adv_capable(dev) (((dev)->le_features[1] & HCI_LE_EXT_ADV)) +/* BLUETOOTH CORE SPECIFICATION Version 5.3 | Vol 4, Part E page 1789: + * + * C24: Mandatory if the LE Controller supports Connection State and either + * LE Feature (LL Privacy) or LE Feature (Extended Advertising) is supported + */ +#define use_enhanced_conn_complete(dev) (ll_privacy_capable(dev) || \ + ext_adv_capable(dev)) + /* ----- HCI protocols ----- */ #define HCI_PROTO_DEFER 0x01 -- cgit v1.2.3 From 46a76724e4c93bb1cda8ee11276001a92d1f7987 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Wed, 23 Feb 2022 16:00:44 +0200 Subject: net: dsa: rename references to "lag" as "lag_dev" In preparation of converting struct net_device *dp->lag_dev into a struct dsa_lag *dp->lag, we need to rename, for consistency purposes, all occurrences of the "lag" variable in the DSA core to "lag_dev". Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: Jakub Kicinski --- include/net/dsa.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include/net') diff --git a/include/net/dsa.h b/include/net/dsa.h index f13de2d8aef3..ef7f446cbdf4 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -182,12 +182,12 @@ static inline struct net_device *dsa_lag_dev(struct dsa_switch_tree *dst, } static inline int dsa_lag_id(struct dsa_switch_tree *dst, - struct net_device *lag) + struct net_device *lag_dev) { unsigned int id; dsa_lags_foreach_id(id, dst) { - if (dsa_lag_dev(dst, id) == lag) + if (dsa_lag_dev(dst, id) == lag_dev) return id; } @@ -966,10 +966,10 @@ struct dsa_switch_ops { int (*crosschip_lag_change)(struct dsa_switch *ds, int sw_index, int port); int (*crosschip_lag_join)(struct dsa_switch *ds, int sw_index, - int port, struct net_device *lag, + int port, struct net_device *lag_dev, struct netdev_lag_upper_info *info); int (*crosschip_lag_leave)(struct dsa_switch *ds, int sw_index, - int port, struct net_device *lag); + int port, struct net_device *lag_dev); /* * PTP functionality @@ -1041,10 +1041,10 @@ struct dsa_switch_ops { */ int (*port_lag_change)(struct dsa_switch *ds, int port); int (*port_lag_join)(struct dsa_switch *ds, int port, - struct net_device *lag, + struct net_device *lag_dev, struct netdev_lag_upper_info *info); int (*port_lag_leave)(struct dsa_switch *ds, int port, - struct net_device *lag); + struct net_device *lag_dev); /* * HSR integration -- cgit v1.2.3 From 3d4a0a2a46ab8ff8897dfd6324edee5e8184d2c5 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Wed, 23 Feb 2022 16:00:47 +0200 Subject: net: dsa: make LAG IDs one-based The DSA LAG API will be changed to become more similar with the bridge data structures, where struct dsa_bridge holds an unsigned int num, which is generated by DSA and is one-based. We have a similar thing going with the DSA LAG, except that isn't stored anywhere, it is calculated dynamically by dsa_lag_id() by iterating through dst->lags. The idea of encoding an invalid (or not requested) LAG ID as zero for the purpose of simplifying checks in drivers means that the LAG IDs passed by DSA to drivers need to be one-based too. So back-and-forth conversion is needed when indexing the dst->lags array, as well as in drivers which assume a zero-based index. Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: Jakub Kicinski --- include/net/dsa.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'include/net') diff --git a/include/net/dsa.h b/include/net/dsa.h index ef7f446cbdf4..3ae93adda25d 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -163,9 +163,10 @@ struct dsa_switch_tree { unsigned int last_switch; }; +/* LAG IDs are one-based, the dst->lags array is zero-based */ #define dsa_lags_foreach_id(_id, _dst) \ - for ((_id) = 0; (_id) < (_dst)->lags_len; (_id)++) \ - if ((_dst)->lags[(_id)]) + for ((_id) = 1; (_id) <= (_dst)->lags_len; (_id)++) \ + if ((_dst)->lags[(_id) - 1]) #define dsa_lag_foreach_port(_dp, _dst, _lag) \ list_for_each_entry((_dp), &(_dst)->ports, list) \ @@ -178,7 +179,8 @@ struct dsa_switch_tree { static inline struct net_device *dsa_lag_dev(struct dsa_switch_tree *dst, unsigned int id) { - return dst->lags[id]; + /* DSA LAG IDs are one-based, dst->lags is zero-based */ + return dst->lags[id - 1]; } static inline int dsa_lag_id(struct dsa_switch_tree *dst, -- cgit v1.2.3 From dedd6a009f4191989bee83c1faf66728648a223f Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Wed, 23 Feb 2022 16:00:49 +0200 Subject: net: dsa: create a dsa_lag structure The main purpose of this change is to create a data structure for a LAG as seen by DSA. This is similar to what we have for bridging - we pass a copy of this structure by value to ->port_lag_join and ->port_lag_leave. For now we keep the lag_dev, id and a reference count in it. Future patches will add a list of FDB entries for the LAG (these also need to be refcounted to work properly). The LAG structure is created using dsa_port_lag_create() and destroyed using dsa_port_lag_destroy(), just like we have for bridging. Because now, the dsa_lag itself is refcounted, we can simplify dsa_lag_map() and dsa_lag_unmap(). These functions need to keep a LAG in the dst->lags array only as long as at least one port uses it. The refcounting logic inside those functions can be removed now - they are called only when we should perform the operation. dsa_lag_dev() is renamed to dsa_lag_by_id() and now returns the dsa_lag structure instead of the lag_dev net_device. dsa_lag_foreach_port() now takes the dsa_lag structure as argument. dst->lags holds an array of dsa_lag structures. dsa_lag_map() now also saves the dsa_lag->id value, so that linear walking of dst->lags in drivers using dsa_lag_id() is no longer necessary. They can just look at lag.id. dsa_port_lag_id_get() is a helper, similar to dsa_port_bridge_num_get(), which can be used by drivers to get the LAG ID assigned by DSA to a given port. Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: Jakub Kicinski --- include/net/dsa.h | 50 +++++++++++++++++++++++++++++++++++++------------- 1 file changed, 37 insertions(+), 13 deletions(-) (limited to 'include/net') diff --git a/include/net/dsa.h b/include/net/dsa.h index 3ae93adda25d..81ed34998416 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -116,6 +116,12 @@ struct dsa_netdevice_ops { #define MODULE_ALIAS_DSA_TAG_DRIVER(__proto) \ MODULE_ALIAS(DSA_TAG_DRIVER_ALIAS __stringify(__proto##_VALUE)) +struct dsa_lag { + struct net_device *dev; + unsigned int id; + refcount_t refcount; +}; + struct dsa_switch_tree { struct list_head list; @@ -134,7 +140,7 @@ struct dsa_switch_tree { /* Maps offloaded LAG netdevs to a zero-based linear ID for * drivers that need it. */ - struct net_device **lags; + struct dsa_lag **lags; /* Tagging protocol operations */ const struct dsa_device_ops *tag_ops; @@ -170,14 +176,14 @@ struct dsa_switch_tree { #define dsa_lag_foreach_port(_dp, _dst, _lag) \ list_for_each_entry((_dp), &(_dst)->ports, list) \ - if ((_dp)->lag_dev == (_lag)) + if (dsa_port_offloads_lag((_dp), (_lag))) #define dsa_hsr_foreach_port(_dp, _ds, _hsr) \ list_for_each_entry((_dp), &(_ds)->dst->ports, list) \ if ((_dp)->ds == (_ds) && (_dp)->hsr_dev == (_hsr)) -static inline struct net_device *dsa_lag_dev(struct dsa_switch_tree *dst, - unsigned int id) +static inline struct dsa_lag *dsa_lag_by_id(struct dsa_switch_tree *dst, + unsigned int id) { /* DSA LAG IDs are one-based, dst->lags is zero-based */ return dst->lags[id - 1]; @@ -189,8 +195,10 @@ static inline int dsa_lag_id(struct dsa_switch_tree *dst, unsigned int id; dsa_lags_foreach_id(id, dst) { - if (dsa_lag_dev(dst, id) == lag_dev) - return id; + struct dsa_lag *lag = dsa_lag_by_id(dst, id); + + if (lag->dev == lag_dev) + return lag->id; } return -ENODEV; @@ -293,7 +301,7 @@ struct dsa_port { struct devlink_port devlink_port; struct phylink *pl; struct phylink_config pl_config; - struct net_device *lag_dev; + struct dsa_lag *lag; struct net_device *hsr_dev; struct list_head list; @@ -643,14 +651,30 @@ static inline bool dsa_port_is_vlan_filtering(const struct dsa_port *dp) return dp->vlan_filtering; } +static inline unsigned int dsa_port_lag_id_get(struct dsa_port *dp) +{ + return dp->lag ? dp->lag->id : 0; +} + +static inline struct net_device *dsa_port_lag_dev_get(struct dsa_port *dp) +{ + return dp->lag ? dp->lag->dev : NULL; +} + +static inline bool dsa_port_offloads_lag(struct dsa_port *dp, + const struct dsa_lag *lag) +{ + return dsa_port_lag_dev_get(dp) == lag->dev; +} + static inline struct net_device *dsa_port_to_bridge_port(const struct dsa_port *dp) { if (!dp->bridge) return NULL; - if (dp->lag_dev) - return dp->lag_dev; + if (dp->lag) + return dp->lag->dev; else if (dp->hsr_dev) return dp->hsr_dev; @@ -968,10 +992,10 @@ struct dsa_switch_ops { int (*crosschip_lag_change)(struct dsa_switch *ds, int sw_index, int port); int (*crosschip_lag_join)(struct dsa_switch *ds, int sw_index, - int port, struct net_device *lag_dev, + int port, struct dsa_lag lag, struct netdev_lag_upper_info *info); int (*crosschip_lag_leave)(struct dsa_switch *ds, int sw_index, - int port, struct net_device *lag_dev); + int port, struct dsa_lag lag); /* * PTP functionality @@ -1043,10 +1067,10 @@ struct dsa_switch_ops { */ int (*port_lag_change)(struct dsa_switch *ds, int port); int (*port_lag_join)(struct dsa_switch *ds, int port, - struct net_device *lag_dev, + struct dsa_lag lag, struct netdev_lag_upper_info *info); int (*port_lag_leave)(struct dsa_switch *ds, int port, - struct net_device *lag_dev); + struct dsa_lag lag); /* * HSR integration -- cgit v1.2.3 From ec638740fce990ad2b9af43ead8088d6d6eb2145 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Wed, 23 Feb 2022 16:00:50 +0200 Subject: net: switchdev: remove lag_mod_cb from switchdev_handle_fdb_event_to_device When the switchdev_handle_fdb_event_to_device() event replication helper was created, my original thought was that FDB events on LAG interfaces should most likely be special-cased, not just replicated towards all switchdev ports beneath that LAG. So this replication helper currently does not recurse through switchdev lower interfaces of LAG bridge ports, but rather calls the lag_mod_cb() if that was provided. No switchdev driver uses this helper for FDB events on LAG interfaces yet, so that was an assumption which was yet to be tested. It is certainly usable for that purpose, as my RFC series shows: https://patchwork.kernel.org/project/netdevbpf/cover/20220210125201.2859463-1-vladimir.oltean@nxp.com/ however this approach is slightly convoluted because: - the switchdev driver gets a "dev" that isn't its own net device, but rather the LAG net device. It must call switchdev_lower_dev_find(dev) in order to get a handle of any of its own net devices (the ones that pass check_cb). - in order for FDB entries on LAG ports to be correctly refcounted per the number of switchdev ports beneath that LAG, we haven't escaped the need to iterate through the LAG's lower interfaces. Except that is now the responsibility of the switchdev driver, because the replication helper just stopped half-way. So, even though yes, FDB events on LAG bridge ports must be special-cased, in the end it's simpler to let switchdev_handle_fdb_* just iterate through the LAG port's switchdev lowers, and let the switchdev driver figure out that those physical ports are under a LAG. The switchdev_handle_fdb_event_to_device() helper takes a "foreign_dev_check" callback so it can figure out whether @dev can autonomously forward to @foreign_dev. DSA fills this method properly: if the LAG is offloaded by another port in the same tree as @dev, then it isn't foreign. If it is a software LAG, it is foreign - forwarding happens in software. Whether an interface is foreign or not decides whether the replication helper will go through the LAG's switchdev lowers or not. Since the lan966x doesn't properly fill this out, FDB events on software LAG uppers will get called. By changing lan966x_foreign_dev_check(), we can suppress them. Whereas DSA will now start receiving FDB events for its offloaded LAG uppers, so we need to return -EOPNOTSUPP, since we currently don't do the right thing for them. Cc: Horatiu Vultur Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: Jakub Kicinski --- include/net/switchdev.h | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'include/net') diff --git a/include/net/switchdev.h b/include/net/switchdev.h index c32e1c8f79ec..3e424d40fae3 100644 --- a/include/net/switchdev.h +++ b/include/net/switchdev.h @@ -313,10 +313,7 @@ int switchdev_handle_fdb_event_to_device(struct net_device *dev, unsigned long e const struct net_device *foreign_dev), int (*mod_cb)(struct net_device *dev, struct net_device *orig_dev, unsigned long event, const void *ctx, - const struct switchdev_notifier_fdb_info *fdb_info), - int (*lag_mod_cb)(struct net_device *dev, struct net_device *orig_dev, - unsigned long event, const void *ctx, - const struct switchdev_notifier_fdb_info *fdb_info)); + const struct switchdev_notifier_fdb_info *fdb_info)); int switchdev_handle_port_obj_add(struct net_device *dev, struct switchdev_notifier_port_obj_info *port_obj_info, @@ -443,10 +440,7 @@ switchdev_handle_fdb_event_to_device(struct net_device *dev, unsigned long event const struct net_device *foreign_dev), int (*mod_cb)(struct net_device *dev, struct net_device *orig_dev, unsigned long event, const void *ctx, - const struct switchdev_notifier_fdb_info *fdb_info), - int (*lag_mod_cb)(struct net_device *dev, struct net_device *orig_dev, - unsigned long event, const void *ctx, - const struct switchdev_notifier_fdb_info *fdb_info)) + const struct switchdev_notifier_fdb_info *fdb_info)) { return 0; } -- cgit v1.2.3 From e212fa7c54184b7b1f88990bd328b23b567cbf41 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Wed, 23 Feb 2022 16:00:53 +0200 Subject: net: dsa: support FDB events on offloaded LAG interfaces This change introduces support for installing static FDB entries towards a bridge port that is a LAG of multiple DSA switch ports, as well as support for filtering towards the CPU local FDB entries emitted for LAG interfaces that are bridge ports. Conceptually, host addresses on LAG ports are identical to what we do for plain bridge ports. Whereas FDB entries _towards_ a LAG can't simply be replicated towards all member ports like we do for multicast, or VLAN. Instead we need new driver API. Hardware usually considers a LAG to be a "logical port", and sets the entire LAG as the forwarding destination. The physical egress port selection within the LAG is made by hashing policy, as usual. To represent the logical port corresponding to the LAG, we pass by value a copy of the dsa_lag structure to all switches in the tree that have at least one port in that LAG. To illustrate why a refcounted list of FDB entries is needed in struct dsa_lag, it is enough to say that: - a LAG may be a bridge port and may therefore receive FDB events even while it isn't yet offloaded by any DSA interface - DSA interfaces may be removed from a LAG while that is a bridge port; we don't want FDB entries lingering around, but we don't want to remove entries that are still in use, either For all the cases below to work, the idea is to always keep an FDB entry on a LAG with a reference count equal to the DSA member ports. So: - if a port joins a LAG, it requests the bridge to replay the FDB, and the FDB entries get created, or their refcount gets bumped by one - if a port leaves a LAG, the FDB replay deletes or decrements refcount by one - if an FDB is installed towards a LAG with ports already present, that entry is created (if it doesn't exist) and its refcount is bumped by the amount of ports already present in the LAG echo "Adding FDB entry to bond with existing ports" ip link del bond0 ip link add bond0 type bond mode 802.3ad ip link set swp1 down && ip link set swp1 master bond0 && ip link set swp1 up ip link set swp2 down && ip link set swp2 master bond0 && ip link set swp2 up ip link del br0 ip link add br0 type bridge ip link set bond0 master br0 bridge fdb add dev bond0 00:01:02:03:04:05 master static ip link del br0 ip link del bond0 echo "Adding FDB entry to empty bond" ip link del bond0 ip link add bond0 type bond mode 802.3ad ip link del br0 ip link add br0 type bridge ip link set bond0 master br0 bridge fdb add dev bond0 00:01:02:03:04:05 master static ip link set swp1 down && ip link set swp1 master bond0 && ip link set swp1 up ip link set swp2 down && ip link set swp2 master bond0 && ip link set swp2 up ip link del br0 ip link del bond0 echo "Adding FDB entry to empty bond, then removing ports one by one" ip link del bond0 ip link add bond0 type bond mode 802.3ad ip link del br0 ip link add br0 type bridge ip link set bond0 master br0 bridge fdb add dev bond0 00:01:02:03:04:05 master static ip link set swp1 down && ip link set swp1 master bond0 && ip link set swp1 up ip link set swp2 down && ip link set swp2 master bond0 && ip link set swp2 up ip link set swp1 nomaster ip link set swp2 nomaster ip link del br0 ip link del bond0 Signed-off-by: Vladimir Oltean Signed-off-by: Jakub Kicinski --- include/net/dsa.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/net') diff --git a/include/net/dsa.h b/include/net/dsa.h index 81ed34998416..01faba89c987 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -119,6 +119,8 @@ struct dsa_netdevice_ops { struct dsa_lag { struct net_device *dev; unsigned int id; + struct mutex fdb_lock; + struct list_head fdbs; refcount_t refcount; }; @@ -944,6 +946,10 @@ struct dsa_switch_ops { const unsigned char *addr, u16 vid); int (*port_fdb_dump)(struct dsa_switch *ds, int port, dsa_fdb_dump_cb_t *cb, void *data); + int (*lag_fdb_add)(struct dsa_switch *ds, struct dsa_lag lag, + const unsigned char *addr, u16 vid); + int (*lag_fdb_del)(struct dsa_switch *ds, struct dsa_lag lag, + const unsigned char *addr, u16 vid); /* * Multicast database -- cgit v1.2.3 From 7bbb765b73496699a165d505ecdce962f903b422 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Wed, 23 Feb 2022 17:57:40 +0000 Subject: net/tcp: Merge TCP-MD5 inbound callbacks The functions do essentially the same work to verify TCP-MD5 sign. Code can be merged into one family-independent function in order to reduce copy'n'paste and generated code. Later with TCP-AO option added, this will allow to create one function that's responsible for segment verification, that will have all the different checks for MD5/AO/non-signed packets, which in turn will help to see checks for all corner-cases in one function, rather than spread around different families and functions. Cc: Eric Dumazet Cc: Hideaki YOSHIFUJI Signed-off-by: Dmitry Safonov Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20220223175740.452397-1-dima@arista.com Signed-off-by: Jakub Kicinski --- include/net/tcp.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/net') diff --git a/include/net/tcp.h b/include/net/tcp.h index 04f4650e0ff0..479a27777ad6 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1674,6 +1674,11 @@ tcp_md5_do_lookup(const struct sock *sk, int l3index, return NULL; return __tcp_md5_do_lookup(sk, l3index, addr, family); } +bool tcp_inbound_md5_hash(const struct sock *sk, const struct sk_buff *skb, + enum skb_drop_reason *reason, + const void *saddr, const void *daddr, + int family, int dif, int sdif); + #define tcp_twsk_md5_key(twsk) ((twsk)->tw_md5_key) #else @@ -1683,6 +1688,14 @@ tcp_md5_do_lookup(const struct sock *sk, int l3index, { return NULL; } +static inline bool tcp_inbound_md5_hash(const struct sock *sk, + const struct sk_buff *skb, + enum skb_drop_reason *reason, + const void *saddr, const void *daddr, + int family, int dif, int sdif) +{ + return false; +} #define tcp_twsk_md5_key(twsk) NULL #endif -- cgit v1.2.3 From c26933639b5402c174c65c01d33f145622784012 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 25 Feb 2022 11:22:22 +0200 Subject: net: dsa: request drivers to perform FDB isolation For DSA, to encourage drivers to perform FDB isolation simply means to track which bridge does each FDB and MDB entry belong to. It then becomes the driver responsibility to use something that makes the FDB entry from one bridge not match the FDB lookup of ports from other bridges. The top-level functions where the bridge is determined are: - dsa_port_fdb_{add,del} - dsa_port_host_fdb_{add,del} - dsa_port_mdb_{add,del} - dsa_port_host_mdb_{add,del} aka the pre-crosschip-notifier functions. Changing the API to pass a reference to a bridge is not superfluous, and looking at the passed bridge argument is not the same as having the driver look at dsa_to_port(ds, port)->bridge from the ->port_fdb_add() method. DSA installs FDB and MDB entries on shared (CPU and DSA) ports as well, and those do not have any dp->bridge information to retrieve, because they are not in any bridge - they are merely the pipes that serve the user ports that are in one or multiple bridges. The struct dsa_bridge associated with each FDB/MDB entry is encapsulated in a larger "struct dsa_db" database. Although only databases associated to bridges are notified for now, this API will be the starting point for implementing IFF_UNICAST_FLT in DSA. There, the idea is to install FDB entries on the CPU port which belong to the corresponding user port's port database. These are supposed to match only when the port is standalone. It is better to introduce the API in its expected final form than to introduce it for bridges first, then to have to change drivers which may have made one or more assumptions. Drivers can use the provided bridge.num, but they can also use a different numbering scheme that is more convenient. DSA must perform refcounting on the CPU and DSA ports by also taking into account the bridge number. So if two bridges request the same local address, DSA must notify the driver twice, once for each bridge. In fact, if the driver supports FDB isolation, DSA must perform refcounting per bridge, but if the driver doesn't, DSA must refcount host addresses across all bridges, otherwise it would be telling the driver to delete an FDB entry for a bridge and the driver would delete it for all bridges. So introduce a bool fdb_isolation in drivers which would make all bridge databases passed to the cross-chip notifier have the same number (0). This makes dsa_mac_addr_find() -> dsa_db_equal() say that all bridge databases are the same database - which is essentially the legacy behavior. Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- include/net/dsa.h | 42 ++++++++++++++++++++++++++++++++++++------ 1 file changed, 36 insertions(+), 6 deletions(-) (limited to 'include/net') diff --git a/include/net/dsa.h b/include/net/dsa.h index 01faba89c987..87c5f18eb381 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -341,11 +341,28 @@ struct dsa_link { struct list_head list; }; +enum dsa_db_type { + DSA_DB_PORT, + DSA_DB_LAG, + DSA_DB_BRIDGE, +}; + +struct dsa_db { + enum dsa_db_type type; + + union { + const struct dsa_port *dp; + struct dsa_lag lag; + struct dsa_bridge bridge; + }; +}; + struct dsa_mac_addr { unsigned char addr[ETH_ALEN]; u16 vid; refcount_t refcount; struct list_head list; + struct dsa_db db; }; struct dsa_vlan { @@ -409,6 +426,13 @@ struct dsa_switch { */ u32 mtu_enforcement_ingress:1; + /* Drivers that isolate the FDBs of multiple bridges must set this + * to true to receive the bridge as an argument in .port_fdb_{add,del} + * and .port_mdb_{add,del}. Otherwise, the bridge.num will always be + * passed as zero. + */ + u32 fdb_isolation:1; + /* Listener for switch fabric events */ struct notifier_block nb; @@ -941,23 +965,29 @@ struct dsa_switch_ops { * Forwarding database */ int (*port_fdb_add)(struct dsa_switch *ds, int port, - const unsigned char *addr, u16 vid); + const unsigned char *addr, u16 vid, + struct dsa_db db); int (*port_fdb_del)(struct dsa_switch *ds, int port, - const unsigned char *addr, u16 vid); + const unsigned char *addr, u16 vid, + struct dsa_db db); int (*port_fdb_dump)(struct dsa_switch *ds, int port, dsa_fdb_dump_cb_t *cb, void *data); int (*lag_fdb_add)(struct dsa_switch *ds, struct dsa_lag lag, - const unsigned char *addr, u16 vid); + const unsigned char *addr, u16 vid, + struct dsa_db db); int (*lag_fdb_del)(struct dsa_switch *ds, struct dsa_lag lag, - const unsigned char *addr, u16 vid); + const unsigned char *addr, u16 vid, + struct dsa_db db); /* * Multicast database */ int (*port_mdb_add)(struct dsa_switch *ds, int port, - const struct switchdev_obj_port_mdb *mdb); + const struct switchdev_obj_port_mdb *mdb, + struct dsa_db db); int (*port_mdb_del)(struct dsa_switch *ds, int port, - const struct switchdev_obj_port_mdb *mdb); + const struct switchdev_obj_port_mdb *mdb, + struct dsa_db db); /* * RXNFC */ -- cgit v1.2.3 From 06b9cce42634a50f2840777a66553b02320db5ef Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 25 Feb 2022 11:22:23 +0200 Subject: net: dsa: pass extack to .port_bridge_join driver methods As FDB isolation cannot be enforced between VLAN-aware bridges in lack of hardware assistance like extra FID bits, it seems plausible that many DSA switches cannot do it. Therefore, they need to reject configurations with multiple VLAN-aware bridges from the two code paths that can transition towards that state: - joining a VLAN-aware bridge - toggling VLAN awareness on an existing bridge The .port_vlan_filtering method already propagates the netlink extack to the driver, let's propagate it from .port_bridge_join too, to make sure that the driver can use the same function for both. Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- include/net/dsa.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/dsa.h b/include/net/dsa.h index 87c5f18eb381..cfedcfb86350 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -937,7 +937,8 @@ struct dsa_switch_ops { int (*set_ageing_time)(struct dsa_switch *ds, unsigned int msecs); int (*port_bridge_join)(struct dsa_switch *ds, int port, struct dsa_bridge bridge, - bool *tx_fwd_offload); + bool *tx_fwd_offload, + struct netlink_ext_ack *extack); void (*port_bridge_leave)(struct dsa_switch *ds, int port, struct dsa_bridge bridge); void (*port_stp_state_set)(struct dsa_switch *ds, int port, @@ -1021,7 +1022,8 @@ struct dsa_switch_ops { */ int (*crosschip_bridge_join)(struct dsa_switch *ds, int tree_index, int sw_index, int port, - struct dsa_bridge bridge); + struct dsa_bridge bridge, + struct netlink_ext_ack *extack); void (*crosschip_bridge_leave)(struct dsa_switch *ds, int tree_index, int sw_index, int port, struct dsa_bridge bridge); -- cgit v1.2.3 From b8cd5831c61cadee4493248babef8386f836f31c Mon Sep 17 00:00:00 2001 From: Jianbo Liu Date: Thu, 24 Feb 2022 10:29:07 +0000 Subject: net: flow_offload: add tc police action parameters The current police offload action entry is missing exceed/notexceed actions and parameters that can be configured by tc police action. Add the missing parameters as a pre-step for offloading police actions to hardware. Signed-off-by: Jianbo Liu Signed-off-by: Roi Dayan Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- include/net/flow_offload.h | 9 +++++++++ include/net/tc_act/tc_police.h | 30 ++++++++++++++++++++++++++++++ 2 files changed, 39 insertions(+) (limited to 'include/net') diff --git a/include/net/flow_offload.h b/include/net/flow_offload.h index 5b8c54eb7a6b..74f44d44abe3 100644 --- a/include/net/flow_offload.h +++ b/include/net/flow_offload.h @@ -148,6 +148,8 @@ enum flow_action_id { FLOW_ACTION_MPLS_MANGLE, FLOW_ACTION_GATE, FLOW_ACTION_PPPOE_PUSH, + FLOW_ACTION_JUMP, + FLOW_ACTION_PIPE, NUM_FLOW_ACTIONS, }; @@ -235,9 +237,16 @@ struct flow_action_entry { struct { /* FLOW_ACTION_POLICE */ u32 burst; u64 rate_bytes_ps; + u64 peakrate_bytes_ps; + u32 avrate; + u16 overhead; u64 burst_pkt; u64 rate_pkt_ps; u32 mtu; + struct { + enum flow_action_id act_id; + u32 extval; + } exceed, notexceed; } police; struct { /* FLOW_ACTION_CT */ int action; diff --git a/include/net/tc_act/tc_police.h b/include/net/tc_act/tc_police.h index 72649512dcdd..283bde711a42 100644 --- a/include/net/tc_act/tc_police.h +++ b/include/net/tc_act/tc_police.h @@ -159,4 +159,34 @@ static inline u32 tcf_police_tcfp_mtu(const struct tc_action *act) return params->tcfp_mtu; } +static inline u64 tcf_police_peakrate_bytes_ps(const struct tc_action *act) +{ + struct tcf_police *police = to_police(act); + struct tcf_police_params *params; + + params = rcu_dereference_protected(police->params, + lockdep_is_held(&police->tcf_lock)); + return params->peak.rate_bytes_ps; +} + +static inline u32 tcf_police_tcfp_ewma_rate(const struct tc_action *act) +{ + struct tcf_police *police = to_police(act); + struct tcf_police_params *params; + + params = rcu_dereference_protected(police->params, + lockdep_is_held(&police->tcf_lock)); + return params->tcfp_ewma_rate; +} + +static inline u16 tcf_police_rate_overhead(const struct tc_action *act) +{ + struct tcf_police *police = to_police(act); + struct tcf_police_params *params; + + params = rcu_dereference_protected(police->params, + lockdep_is_held(&police->tcf_lock)); + return params->rate.overhead; +} + #endif /* __NET_TC_POLICE_H */ -- cgit v1.2.3 From d97b4b105ce71f9f907f1511986cc1d224126772 Mon Sep 17 00:00:00 2001 From: Jianbo Liu Date: Thu, 24 Feb 2022 10:29:08 +0000 Subject: flow_offload: reject offload for all drivers with invalid police parameters As more police parameters are passed to flow_offload, driver can check them to make sure hardware handles packets in the way indicated by tc. The conform-exceed control should be drop/pipe or drop/ok. Besides, for drop/ok, the police should be the last action. As hardware can't configure peakrate/avrate/overhead, offload should not be supported if any of them is configured. Signed-off-by: Jianbo Liu Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- include/net/flow_offload.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/net') diff --git a/include/net/flow_offload.h b/include/net/flow_offload.h index 74f44d44abe3..92267d23083e 100644 --- a/include/net/flow_offload.h +++ b/include/net/flow_offload.h @@ -311,6 +311,12 @@ static inline bool flow_offload_has_one_action(const struct flow_action *action) return action->num_entries == 1; } +static inline bool flow_action_is_last_entry(const struct flow_action *action, + const struct flow_action_entry *entry) +{ + return entry == &action->entries[action->num_entries - 1]; +} + #define flow_action_for_each(__i, __act, __actions) \ for (__i = 0, __act = &(__actions)->entries[0]; \ __i < (__actions)->num_entries; \ -- cgit v1.2.3 From f9c4bb0b245cee35ef66f75bf409c9573d934cf9 Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Tue, 1 Mar 2022 05:04:36 +0000 Subject: vxlan: vni filtering support on collect metadata device MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch adds vnifiltering support to collect metadata device. Motivation: You can only use a single vxlan collect metadata device for a given vxlan udp port in the system today. The vxlan collect metadata device terminates all received vxlan packets. As shown in the below diagram, there are use-cases where you need to support multiple such vxlan devices in independent bridge domains. Each vxlan device must terminate the vni's it is configured for. Example usecase: In a service provider network a service provider typically supports multiple bridge domains with overlapping vlans. One bridge domain per customer. Vlans in each bridge domain are mapped to globally unique vxlan ranges assigned to each customer. vnifiltering support in collect metadata devices terminates only configured vnis. This is similar to vlan filtering in bridge driver. The vni filtering capability is provided by a new flag on collect metadata device. In the below pic: - customer1 is mapped to br1 bridge domain - customer2 is mapped to br2 bridge domain - customer1 vlan 10-11 is mapped to vni 1001-1002 - customer2 vlan 10-11 is mapped to vni 2001-2002 - br1 and br2 are vlan filtering bridges - vxlan1 and vxlan2 are collect metadata devices with vnifiltering enabled ┌──────────────────────────────────────────────────────────────────┐ │ switch │ │ │ │ ┌───────────┐ ┌───────────┐ │ │ │ │ │ │ │ │ │ br1 │ │ br2 │ │ │ └┬─────────┬┘ └──┬───────┬┘ │ │ vlans│ │ vlans │ │ │ │ 10,11│ │ 10,11│ │ │ │ │ vlanvnimap: │ vlanvnimap: │ │ │ 10-1001,11-1002 │ 10-2001,11-2002 │ │ │ │ │ │ │ │ ┌──────┴┐ ┌──┴─────────┐ ┌───┴────┐ │ │ │ │ swp1 │ │vxlan1 │ │ swp2 │ ┌┴─────────────┐ │ │ │ │ │ vnifilter:│ │ │ │vxlan2 │ │ │ └───┬───┘ │ 1001,1002│ └───┬────┘ │ vnifilter: │ │ │ │ └────────────┘ │ │ 2001,2002 │ │ │ │ │ └──────────────┘ │ │ │ │ │ └───────┼──────────────────────────────────┼───────────────────────┘ │ │ │ │ ┌─────┴───────┐ │ │ customer1 │ ┌─────┴──────┐ │ host/VM │ │customer2 │ └─────────────┘ │ host/VM │ └────────────┘ With this implementation, vxlan dst metadata device can be associated with range of vnis. struct vxlan_vni_node is introduced to represent a configured vni. We start with vni and its associated remote_ip in this structure. This structure can be extended to bring in other per vni attributes if there are usecases for it. A vni inherits an attribute from the base vxlan device if there is no per vni attributes defined. struct vxlan_dev gets a new rhashtable for vnis called vxlan_vni_group. vxlan_vnifilter.c implements the necessary netlink api, notifications and helper functions to process and manage lifecycle of vxlan_vni_node. This patch also adds new helper functions in vxlan_multicast.c to handle per vni remote_ip multicast groups which are part of vxlan_vni_group. Fix build problems: Reported-by: kernel test robot Signed-off-by: Roopa Prabhu Signed-off-by: David S. Miller --- include/net/vxlan.h | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/vxlan.h b/include/net/vxlan.h index 5a934bebe630..8eb961bb9589 100644 --- a/include/net/vxlan.h +++ b/include/net/vxlan.h @@ -232,6 +232,25 @@ struct vxlan_dev_node { struct vxlan_dev *vxlan; }; +struct vxlan_vni_node { + struct rhash_head vnode; + struct vxlan_dev_node hlist4; /* vni hash table for IPv4 socket */ +#if IS_ENABLED(CONFIG_IPV6) + struct vxlan_dev_node hlist6; /* vni hash table for IPv6 socket */ +#endif + struct list_head vlist; + __be32 vni; + union vxlan_addr remote_ip; /* default remote ip for this vni */ + + struct rcu_head rcu; +}; + +struct vxlan_vni_group { + struct rhashtable vni_hash; + struct list_head vni_list; + u32 num_vnis; +}; + /* Pseudo network device */ struct vxlan_dev { struct vxlan_dev_node hlist4; /* vni hash table for IPv4 socket */ @@ -254,6 +273,8 @@ struct vxlan_dev { struct vxlan_config cfg; + struct vxlan_vni_group __rcu *vnigrp; + struct hlist_head fdb_head[FDB_HASH_SIZE]; }; @@ -274,6 +295,7 @@ struct vxlan_dev { #define VXLAN_F_GPE 0x4000 #define VXLAN_F_IPV6_LINKLOCAL 0x8000 #define VXLAN_F_TTL_INHERIT 0x10000 +#define VXLAN_F_VNIFILTER 0x20000 /* Flags that are used in the receive path. These flags must match in * order for a socket to be shareable @@ -283,7 +305,8 @@ struct vxlan_dev { VXLAN_F_UDP_ZERO_CSUM6_RX | \ VXLAN_F_REMCSUM_RX | \ VXLAN_F_REMCSUM_NOPARTIAL | \ - VXLAN_F_COLLECT_METADATA) + VXLAN_F_COLLECT_METADATA | \ + VXLAN_F_VNIFILTER) /* Flags that can be set together with VXLAN_F_GPE. */ #define VXLAN_F_ALLOWED_GPE (VXLAN_F_GPE | \ @@ -292,7 +315,8 @@ struct vxlan_dev { VXLAN_F_UDP_ZERO_CSUM_TX | \ VXLAN_F_UDP_ZERO_CSUM6_TX | \ VXLAN_F_UDP_ZERO_CSUM6_RX | \ - VXLAN_F_COLLECT_METADATA) + VXLAN_F_COLLECT_METADATA | \ + VXLAN_F_VNIFILTER) struct net_device *vxlan_dev_create(struct net *net, const char *name, u8 name_assign_type, struct vxlan_config *conf); -- cgit v1.2.3 From 4095e0e1328a3cd9e3b30174d6cb0edb3824256d Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Tue, 1 Mar 2022 05:04:38 +0000 Subject: drivers: vxlan: vnifilter: per vni stats Add per-vni statistics for vni filter mode. Counting Rx/Tx bytes/packets/drops/errors at the appropriate places. This patch changes vxlan_vs_find_vni to also return the vxlan_vni_node in cases where the vni belongs to a vni filtering vxlan device Signed-off-by: Nikolay Aleksandrov Signed-off-by: Roopa Prabhu Signed-off-by: David S. Miller --- include/net/vxlan.h | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'include/net') diff --git a/include/net/vxlan.h b/include/net/vxlan.h index 8eb961bb9589..bca5b01af247 100644 --- a/include/net/vxlan.h +++ b/include/net/vxlan.h @@ -227,6 +227,31 @@ struct vxlan_config { enum ifla_vxlan_df df; }; +enum { + VXLAN_VNI_STATS_RX, + VXLAN_VNI_STATS_RX_DROPS, + VXLAN_VNI_STATS_RX_ERRORS, + VXLAN_VNI_STATS_TX, + VXLAN_VNI_STATS_TX_DROPS, + VXLAN_VNI_STATS_TX_ERRORS, +}; + +struct vxlan_vni_stats { + u64 rx_packets; + u64 rx_bytes; + u64 rx_drops; + u64 rx_errors; + u64 tx_packets; + u64 tx_bytes; + u64 tx_drops; + u64 tx_errors; +}; + +struct vxlan_vni_stats_pcpu { + struct vxlan_vni_stats stats; + struct u64_stats_sync syncp; +}; + struct vxlan_dev_node { struct hlist_node hlist; struct vxlan_dev *vxlan; @@ -241,6 +266,7 @@ struct vxlan_vni_node { struct list_head vlist; __be32 vni; union vxlan_addr remote_ip; /* default remote ip for this vni */ + struct vxlan_vni_stats_pcpu __percpu *stats; struct rcu_head rcu; }; -- cgit v1.2.3 From c3873070247d9e3c7a6b0cf9bf9b45e8018427b1 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 28 Feb 2022 06:22:22 +0100 Subject: netfilter: nf_queue: fix possible use-after-free Eric Dumazet says: The sock_hold() side seems suspect, because there is no guarantee that sk_refcnt is not already 0. On failure, we cannot queue the packet and need to indicate an error. The packet will be dropped by the caller. v2: split skb prefetch hunk into separate change Fixes: 271b72c7fa82c ("udp: RCU handling for Unicast packets.") Reported-by: Eric Dumazet Reviewed-by: Eric Dumazet Signed-off-by: Florian Westphal --- include/net/netfilter/nf_queue.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_queue.h b/include/net/netfilter/nf_queue.h index 9eed51e920e8..980daa6e1e3a 100644 --- a/include/net/netfilter/nf_queue.h +++ b/include/net/netfilter/nf_queue.h @@ -37,7 +37,7 @@ void nf_register_queue_handler(const struct nf_queue_handler *qh); void nf_unregister_queue_handler(void); void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict); -void nf_queue_entry_get_refs(struct nf_queue_entry *entry); +bool nf_queue_entry_get_refs(struct nf_queue_entry *entry); void nf_queue_entry_free(struct nf_queue_entry *entry); static inline void init_hashrandom(u32 *jhash_initval) -- cgit v1.2.3 From 462791bbfa350189e309a5a94541f6b63cd874e8 Mon Sep 17 00:00:00 2001 From: Dust Li Date: Tue, 1 Mar 2022 17:43:56 +0800 Subject: net/smc: add sysctl interface for SMC This patch add sysctl interface to support container environment for SMC as we talk in the mail list. Link: https://lore.kernel.org/netdev/20220224020253.GF5443@linux.alibaba.com Co-developed-by: Tony Lu Signed-off-by: Tony Lu Signed-off-by: Dust Li Signed-off-by: David S. Miller --- include/net/netns/smc.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/net') diff --git a/include/net/netns/smc.h b/include/net/netns/smc.h index 47b166684fd8..1682eae50579 100644 --- a/include/net/netns/smc.h +++ b/include/net/netns/smc.h @@ -14,5 +14,8 @@ struct netns_smc { struct smc_stats_rsn *fback_rsn; bool limit_smc_hs; /* constraint on handshake */ +#ifdef CONFIG_SYSCTL + struct ctl_table_header *smc_hdr; +#endif }; #endif -- cgit v1.2.3 From 12bbb0d163a90d81a2677cf7808d364697290207 Mon Sep 17 00:00:00 2001 From: Dust Li Date: Tue, 1 Mar 2022 17:43:58 +0800 Subject: net/smc: add sysctl for autocorking This add a new sysctl: net.smc.autocorking_size We can dynamically change the behaviour of autocorking by change the value of autocorking_size. Setting to 0 disables autocorking in SMC Signed-off-by: Dust Li Signed-off-by: David S. Miller --- include/net/netns/smc.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/netns/smc.h b/include/net/netns/smc.h index 1682eae50579..e5389eeaf8bd 100644 --- a/include/net/netns/smc.h +++ b/include/net/netns/smc.h @@ -17,5 +17,6 @@ struct netns_smc { #ifdef CONFIG_SYSCTL struct ctl_table_header *smc_hdr; #endif + unsigned int sysctl_autocorking_size; }; #endif -- cgit v1.2.3 From db6140e5e35a48405e669353bd54042c1d4c3841 Mon Sep 17 00:00:00 2001 From: Paul Blakey Date: Mon, 28 Feb 2022 11:23:49 +0200 Subject: net/sched: act_ct: Fix flow table lookup failure with no originating ifindex After cited commit optimizted hw insertion, flow table entries are populated with ifindex information which was intended to only be used for HW offload. This tuple ifindex is hashed in the flow table key, so it must be filled for lookup to be successful. But tuple ifindex is only relevant for the netfilter flowtables (nft), so it's not filled in act_ct flow table lookup, resulting in lookup failure, and no SW offload and no offload teardown for TCP connection FIN/RST packets. To fix this, add new tc ifindex field to tuple, which will only be used for offloading, not for lookup, as it will not be part of the tuple hash. Fixes: 9795ded7f924 ("net/sched: act_ct: Fill offloading tuple iifidx") Signed-off-by: Paul Blakey Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_flow_table.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index a3647fadf1cc..bd59e950f4d6 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -96,6 +96,7 @@ enum flow_offload_xmit_type { FLOW_OFFLOAD_XMIT_NEIGH, FLOW_OFFLOAD_XMIT_XFRM, FLOW_OFFLOAD_XMIT_DIRECT, + FLOW_OFFLOAD_XMIT_TC, }; #define NF_FLOW_TABLE_ENCAP_MAX 2 @@ -127,7 +128,7 @@ struct flow_offload_tuple { struct { } __hash; u8 dir:2, - xmit_type:2, + xmit_type:3, encap_num:2, in_vlan_ingress:2; u16 mtu; @@ -142,6 +143,9 @@ struct flow_offload_tuple { u8 h_source[ETH_ALEN]; u8 h_dest[ETH_ALEN]; } out; + struct { + u32 iifidx; + } tc; }; }; -- cgit v1.2.3 From 42f0c1934c7cb3e94c2fe8f5771245fa5631d0e7 Mon Sep 17 00:00:00 2001 From: Tao Chen Date: Tue, 1 Mar 2022 06:35:42 -0800 Subject: tcp: Remove the unused api Last tcp_write_queue_head() use was removed in commit 114f39feab36 ("tcp: restore autocorking"), so remove it. Signed-off-by: Tao Chen Link: https://lore.kernel.org/r/SYZP282MB33317DEE1253B37C0F57231E86029@SYZP282MB3331.AUSP282.PROD.OUTLOOK.COM Signed-off-by: Jakub Kicinski --- include/net/tcp.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/net') diff --git a/include/net/tcp.h b/include/net/tcp.h index 479a27777ad6..d486d7b6112d 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1831,11 +1831,6 @@ static inline struct sk_buff *tcp_rtx_queue_tail(const struct sock *sk) return skb_rb_last(&sk->tcp_rtx_queue); } -static inline struct sk_buff *tcp_write_queue_head(const struct sock *sk) -{ - return skb_peek(&sk->sk_write_queue); -} - static inline struct sk_buff *tcp_write_queue_tail(const struct sock *sk) { return skb_peek_tail(&sk->sk_write_queue); -- cgit v1.2.3 From 8610037e8106b48c79cfe0afb92b2b2466e51c3d Mon Sep 17 00:00:00 2001 From: Joe Damato Date: Tue, 1 Mar 2022 23:55:47 -0800 Subject: page_pool: Add allocation stats Add per-pool statistics counters for the allocation path of a page pool. These stats are incremented in softirq context, so no locking or per-cpu variables are needed. This code is disabled by default and a kernel config option is provided for users who wish to enable them. The statistics added are: - fast: successful fast path allocations - slow: slow path order-0 allocations - slow_high_order: slow path high order allocations - empty: ptr ring is empty, so a slow path allocation was forced. - refill: an allocation which triggered a refill of the cache - waive: pages obtained from the ptr ring that cannot be added to the cache due to a NUMA mismatch. Signed-off-by: Joe Damato Acked-by: Jesper Dangaard Brouer Reviewed-by: Ilias Apalodimas Signed-off-by: David S. Miller --- include/net/page_pool.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include/net') diff --git a/include/net/page_pool.h b/include/net/page_pool.h index 97c3c19872ff..1f27e8a48830 100644 --- a/include/net/page_pool.h +++ b/include/net/page_pool.h @@ -84,6 +84,19 @@ struct page_pool_params { void *init_arg; }; +#ifdef CONFIG_PAGE_POOL_STATS +struct page_pool_alloc_stats { + u64 fast; /* fast path allocations */ + u64 slow; /* slow-path order 0 allocations */ + u64 slow_high_order; /* slow-path high order allocations */ + u64 empty; /* failed refills due to empty ptr ring, forcing + * slow path allocation + */ + u64 refill; /* allocations via successful refill */ + u64 waive; /* failed refills due to numa zone mismatch */ +}; +#endif + struct page_pool { struct page_pool_params p; @@ -96,6 +109,11 @@ struct page_pool { unsigned int frag_offset; struct page *frag_page; long frag_users; + +#ifdef CONFIG_PAGE_POOL_STATS + /* these stats are incremented while in softirq context */ + struct page_pool_alloc_stats alloc_stats; +#endif u32 xdp_mem_id; /* -- cgit v1.2.3 From ad6fa1e1ab1b8164f1ba296b1b4dc556a483bcad Mon Sep 17 00:00:00 2001 From: Joe Damato Date: Tue, 1 Mar 2022 23:55:48 -0800 Subject: page_pool: Add recycle stats Add per-cpu stats tracking page pool recycling events: - cached: recycling placed page in the page pool cache - cache_full: page pool cache was full - ring: page placed into the ptr ring - ring_full: page released from page pool because the ptr ring was full - released_refcnt: page released (and not recycled) because refcnt > 1 Signed-off-by: Joe Damato Acked-by: Jesper Dangaard Brouer Reviewed-by: Ilias Apalodimas Signed-off-by: David S. Miller --- include/net/page_pool.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include/net') diff --git a/include/net/page_pool.h b/include/net/page_pool.h index 1f27e8a48830..298af95bbf96 100644 --- a/include/net/page_pool.h +++ b/include/net/page_pool.h @@ -95,6 +95,18 @@ struct page_pool_alloc_stats { u64 refill; /* allocations via successful refill */ u64 waive; /* failed refills due to numa zone mismatch */ }; + +struct page_pool_recycle_stats { + u64 cached; /* recycling placed page in the cache. */ + u64 cache_full; /* cache was full */ + u64 ring; /* recycling placed page back into ptr ring */ + u64 ring_full; /* page was released from page-pool because + * PTR ring was full. + */ + u64 released_refcnt; /* page released because of elevated + * refcnt + */ +}; #endif struct page_pool { @@ -144,6 +156,10 @@ struct page_pool { */ struct ptr_ring ring; +#ifdef CONFIG_PAGE_POOL_STATS + /* recycle stats are per-cpu to avoid locking */ + struct page_pool_recycle_stats __percpu *recycle_stats; +#endif atomic_t pages_state_release_cnt; /* A page_pool is strictly tied to a single RX-queue being -- cgit v1.2.3 From 6b95e3388b1ea0ca63500c5a6e39162dbf828433 Mon Sep 17 00:00:00 2001 From: Joe Damato Date: Tue, 1 Mar 2022 23:55:49 -0800 Subject: page_pool: Add function to batch and return stats Adds a function page_pool_get_stats which can be used by drivers to obtain stats for a specified page_pool. Signed-off-by: Joe Damato Acked-by: Jesper Dangaard Brouer Reviewed-by: Ilias Apalodimas Signed-off-by: David S. Miller --- include/net/page_pool.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include/net') diff --git a/include/net/page_pool.h b/include/net/page_pool.h index 298af95bbf96..ea5fb70e5101 100644 --- a/include/net/page_pool.h +++ b/include/net/page_pool.h @@ -107,6 +107,23 @@ struct page_pool_recycle_stats { * refcnt */ }; + +/* This struct wraps the above stats structs so users of the + * page_pool_get_stats API can pass a single argument when requesting the + * stats for the page pool. + */ +struct page_pool_stats { + struct page_pool_alloc_stats alloc_stats; + struct page_pool_recycle_stats recycle_stats; +}; + +/* + * Drivers that wish to harvest page pool stats and report them to users + * (perhaps via ethtool, debugfs, or another mechanism) can allocate a + * struct page_pool_stats call page_pool_get_stats to get stats for the specified pool. + */ +bool page_pool_get_stats(struct page_pool *pool, + struct page_pool_stats *stats); #endif struct page_pool { -- cgit v1.2.3 From f9cef64fa23f6c1ff177be5082113c5a94e34e5d Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Wed, 2 Mar 2022 21:14:13 +0200 Subject: net: dsa: felix: migrate host FDB and MDB entries when changing tag proto The "ocelot" and "ocelot-8021q" tagging protocols make use of different hardware resources, and host FDB entries have different destination ports in the switch analyzer module, practically speaking. So when the user requests a tagging protocol change, the driver must migrate all host FDB and MDB entries from the NPI port (in fact CPU port module) towards the same physical port, but this time used as a regular port. It is pointless for the felix driver to keep a copy of the host addresses, when we can create and export DSA helpers for walking through the addresses that it already needs to keep on the CPU port, for refcounting purposes. felix_classify_db() is moved up to avoid a forward declaration. We pass "bool change" because dp->fdbs and dp->mdbs are uninitialized lists when felix_setup() first calls felix_set_tag_protocol(), so we need to avoid calling dsa_port_walk_fdbs() during probe time. Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- include/net/dsa.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/net') diff --git a/include/net/dsa.h b/include/net/dsa.h index cfedcfb86350..71cc363dbbd4 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -1219,6 +1219,13 @@ struct dsa_switch_driver { struct net_device *dsa_dev_to_net_device(struct device *dev); +typedef int dsa_fdb_walk_cb_t(struct dsa_switch *ds, int port, + const unsigned char *addr, u16 vid, + struct dsa_db db); + +int dsa_port_walk_fdbs(struct dsa_switch *ds, int port, dsa_fdb_walk_cb_t cb); +int dsa_port_walk_mdbs(struct dsa_switch *ds, int port, dsa_fdb_walk_cb_t cb); + /* Keep inline for faster access in hot path */ static inline bool netdev_uses_dsa(const struct net_device *dev) { -- cgit v1.2.3 From 8672406eb5d77333ca14e9612e3166704b367c40 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 2 Mar 2022 11:55:57 -0800 Subject: net: ip: Handle delivery_time in ip defrag A latter patch will postpone the delivery_time clearing until the stack knows the skb is being delivered locally. That will allow other kernel forwarding path (e.g. ip[6]_forward) to keep the delivery_time also. An earlier attempt was to do skb_clear_delivery_time() in ip_local_deliver() and ip6_input(). The discussion [0] requested to move it one step later into ip_local_deliver_finish() and ip6_input_finish() so that the delivery_time can be kept for the ip_vs forwarding path also. To do that, this patch also needs to take care of the (rcv) timestamp usecase in ip_is_fragment(). It needs to expect delivery_time in the skb->tstamp, so it needs to save the mono_delivery_time bit in inet_frag_queue such that the delivery_time (if any) can be restored in the final defragmented skb. [Note that it will only happen when the locally generated skb is looping from egress to ingress over a virtual interface (e.g. veth, loopback...), skb->tstamp may have the delivery time before it is known that it will be delivered locally and received by another sk.] [0]: https://lore.kernel.org/netdev/ca728d81-80e8-3767-d5e-d44f6ad96e43@ssi.bg/ Signed-off-by: Martin KaFai Lau Signed-off-by: David S. Miller --- include/net/inet_frag.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/net') diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h index 63540be0fc34..911ad930867d 100644 --- a/include/net/inet_frag.h +++ b/include/net/inet_frag.h @@ -70,6 +70,7 @@ struct frag_v6_compare_key { * @stamp: timestamp of the last received fragment * @len: total length of the original datagram * @meat: length of received fragments so far + * @mono_delivery_time: stamp has a mono delivery time (EDT) * @flags: fragment queue flags * @max_size: maximum received fragment size * @fqdir: pointer to struct fqdir @@ -90,6 +91,7 @@ struct inet_frag_queue { ktime_t stamp; int len; int meat; + u8 mono_delivery_time; __u8 flags; u16 max_size; struct fqdir *fqdir; -- cgit v1.2.3 From 2d3916f3189172d5c69d33065c3c21119fe539fc Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 3 Mar 2022 09:37:28 -0800 Subject: ipv6: fix skb drops in igmp6_event_query() and igmp6_event_report() While investigating on why a synchronize_net() has been added recently in ipv6_mc_down(), I found that igmp6_event_query() and igmp6_event_report() might drop skbs in some cases. Discussion about removing synchronize_net() from ipv6_mc_down() will happen in a different thread. Fixes: f185de28d9ae ("mld: add new workqueues for process mld events") Signed-off-by: Eric Dumazet Cc: Taehee Yoo Cc: Cong Wang Cc: David Ahern Link: https://lore.kernel.org/r/20220303173728.937869-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski --- include/net/ndisc.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/ndisc.h b/include/net/ndisc.h index 53cb8de0e589..47ffb360ddfa 100644 --- a/include/net/ndisc.h +++ b/include/net/ndisc.h @@ -475,9 +475,9 @@ int igmp6_late_init(void); void igmp6_cleanup(void); void igmp6_late_cleanup(void); -int igmp6_event_query(struct sk_buff *skb); +void igmp6_event_query(struct sk_buff *skb); -int igmp6_event_report(struct sk_buff *skb); +void igmp6_event_report(struct sk_buff *skb); #ifdef CONFIG_SYSCTL -- cgit v1.2.3 From 3a0318140a6f8c3ab60f1f46c3f203923cb01882 Mon Sep 17 00:00:00 2001 From: Changcheng Deng Date: Fri, 21 Jan 2022 01:35:08 +0000 Subject: Bluetooth: mgmt: Replace zero-length array with flexible-array member There is a regular need in the kernel to provide a way to declare having a dynamically sized set of trailing elements in a structure. Kernel code should always use "flexible array members" for these cases. The older style of one-element or zero-length arrays should no longer be used. Reference: https://www.kernel.org/doc/html/latest/process/deprecated.html#zero-length-and-one-element-arrays Reported-by: Zeal Robot Signed-off-by: Changcheng Deng Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/mgmt.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/bluetooth/mgmt.h b/include/net/bluetooth/mgmt.h index 99266f7aebdc..3d26e6a3478b 100644 --- a/include/net/bluetooth/mgmt.h +++ b/include/net/bluetooth/mgmt.h @@ -1112,7 +1112,7 @@ struct mgmt_ev_adv_monitor_device_found { __s8 rssi; __le32 flags; __le16 eir_len; - __u8 eir[0]; + __u8 eir[]; } __packed; #define MGMT_EV_ADV_MONITOR_DEVICE_LOST 0x0030 -- cgit v1.2.3 From 9b392e0e0b6d026da5a62bb79a08f32e27af858e Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Thu, 3 Mar 2022 13:11:57 -0800 Subject: Bluetooth: Fix not checking for valid hdev on bt_dev_{info,warn,err,dbg} This fixes attemting to print hdev->name directly which causes them to print an error: kernel: read_version:367: (efault): sock 000000006a3008f2 Signed-off-by: Luiz Augusto von Dentz Signed-off-by: Marcel Holtmann --- include/net/bluetooth/bluetooth.h | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'include/net') diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h index a647e5fabdbd..2aa5e95808a5 100644 --- a/include/net/bluetooth/bluetooth.h +++ b/include/net/bluetooth/bluetooth.h @@ -204,19 +204,21 @@ void bt_err_ratelimited(const char *fmt, ...); #define BT_DBG(fmt, ...) pr_debug(fmt "\n", ##__VA_ARGS__) #endif +#define bt_dev_name(hdev) ((hdev) ? (hdev)->name : "null") + #define bt_dev_info(hdev, fmt, ...) \ - BT_INFO("%s: " fmt, (hdev)->name, ##__VA_ARGS__) + BT_INFO("%s: " fmt, bt_dev_name(hdev), ##__VA_ARGS__) #define bt_dev_warn(hdev, fmt, ...) \ - BT_WARN("%s: " fmt, (hdev)->name, ##__VA_ARGS__) + BT_WARN("%s: " fmt, bt_dev_name(hdev), ##__VA_ARGS__) #define bt_dev_err(hdev, fmt, ...) \ - BT_ERR("%s: " fmt, (hdev)->name, ##__VA_ARGS__) + BT_ERR("%s: " fmt, bt_dev_name(hdev), ##__VA_ARGS__) #define bt_dev_dbg(hdev, fmt, ...) \ - BT_DBG("%s: " fmt, (hdev)->name, ##__VA_ARGS__) + BT_DBG("%s: " fmt, bt_dev_name(hdev), ##__VA_ARGS__) #define bt_dev_warn_ratelimited(hdev, fmt, ...) \ - bt_warn_ratelimited("%s: " fmt, (hdev)->name, ##__VA_ARGS__) + bt_warn_ratelimited("%s: " fmt, bt_dev_name(hdev), ##__VA_ARGS__) #define bt_dev_err_ratelimited(hdev, fmt, ...) \ - bt_err_ratelimited("%s: " fmt, (hdev)->name, ##__VA_ARGS__) + bt_err_ratelimited("%s: " fmt, bt_dev_name(hdev), ##__VA_ARGS__) /* Connection and socket states */ enum { -- cgit v1.2.3 From cd87fecdedd7b54c61f5d7c2b7237b43126ac50a Mon Sep 17 00:00:00 2001 From: Luiz Angelo Daros de Luca Date: Wed, 2 Mar 2022 22:52:34 -0300 Subject: net: dsa: tag_rtl8_4: add rtl8_4t trailing variant MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Realtek switches supports the same tag both before ethertype or between payload and the CRC. Signed-off-by: Luiz Angelo Daros de Luca Reviewed-by: Alvin Šipraga Reviewed-by: Vladimir Oltean Signed-off-by: David S. Miller --- include/net/dsa.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/net') diff --git a/include/net/dsa.h b/include/net/dsa.h index 71cc363dbbd4..759479fe8573 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -52,6 +52,7 @@ struct phylink_link_state; #define DSA_TAG_PROTO_BRCM_LEGACY_VALUE 22 #define DSA_TAG_PROTO_SJA1110_VALUE 23 #define DSA_TAG_PROTO_RTL8_4_VALUE 24 +#define DSA_TAG_PROTO_RTL8_4T_VALUE 25 enum dsa_tag_protocol { DSA_TAG_PROTO_NONE = DSA_TAG_PROTO_NONE_VALUE, @@ -79,6 +80,7 @@ enum dsa_tag_protocol { DSA_TAG_PROTO_SEVILLE = DSA_TAG_PROTO_SEVILLE_VALUE, DSA_TAG_PROTO_SJA1110 = DSA_TAG_PROTO_SJA1110_VALUE, DSA_TAG_PROTO_RTL8_4 = DSA_TAG_PROTO_RTL8_4_VALUE, + DSA_TAG_PROTO_RTL8_4T = DSA_TAG_PROTO_RTL8_4T_VALUE, }; struct dsa_switch; -- cgit v1.2.3 From ebe48d368e97d007bfeb76fcb065d6cfc4c96645 Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Mon, 7 Mar 2022 13:11:39 +0100 Subject: esp: Fix possible buffer overflow in ESP transformation The maximum message size that can be send is bigger than the maximum site that skb_page_frag_refill can allocate. So it is possible to write beyond the allocated buffer. Fix this by doing a fallback to COW in that case. v2: Avoid get get_order() costs as suggested by Linus Torvalds. Fixes: cac2661c53f3 ("esp4: Avoid skb_cow_data whenever possible") Fixes: 03e2a30f6a27 ("esp6: Avoid skb_cow_data whenever possible") Reported-by: valis Signed-off-by: Steffen Klassert --- include/net/esp.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/net') diff --git a/include/net/esp.h b/include/net/esp.h index 9c5637d41d95..90cd02ff77ef 100644 --- a/include/net/esp.h +++ b/include/net/esp.h @@ -4,6 +4,8 @@ #include +#define ESP_SKB_FRAG_MAXSIZE (PAGE_SIZE << SKB_FRAG_PAGE_ORDER) + struct ip_esp_hdr; static inline struct ip_esp_hdr *ip_esp_hdr(const struct sk_buff *skb) -- cgit v1.2.3 From ee0a4dc9f317fb9a97f20037d219802ca8de939b Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 8 Mar 2022 17:28:38 +0100 Subject: Revert "netfilter: conntrack: tag conntracks picked up in local out hook" This was a prerequisite for the ill-fated "netfilter: nat: force port remap to prevent shadowing well-known ports". As this has been reverted, this change can be backed out too. Signed-off-by: Florian Westphal --- include/net/netfilter/nf_conntrack.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index 8731d5bcb47d..b08b70989d2c 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -97,7 +97,6 @@ struct nf_conn { unsigned long status; u16 cpu; - u16 local_origin:1; possible_net_t ct_net; #if IS_ENABLED(CONFIG_NF_NAT) -- cgit v1.2.3 From 7e580490ac9819dd55a36be2a9b3380d1391f91b Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Tue, 8 Mar 2022 11:15:15 +0200 Subject: net: dsa: felix: avoid early deletion of host FDB entries The Felix driver declares FDB isolation but puts all standalone ports in VID 0. This is mostly problem-free as discussed with Alvin here: https://patchwork.kernel.org/project/netdevbpf/cover/20220302191417.1288145-1-vladimir.oltean@nxp.com/#24763870 however there is one catch. DSA still thinks that FDB entries are installed on the CPU port as many times as there are user ports, and this is problematic when multiple user ports share the same MAC address. Consider the default case where all user ports inherit their MAC address from the DSA master, and then the user runs: ip link set swp0 address 00:01:02:03:04:05 The above will make dsa_slave_set_mac_address() call dsa_port_standalone_host_fdb_add() for 00:01:02:03:04:05 in port 0's standalone database, and dsa_port_standalone_host_fdb_del() for the old address of swp0, again in swp0's standalone database. Both the ->port_fdb_add() and ->port_fdb_del() will be propagated down to the felix driver, which will end up deleting the old MAC address from the CPU port. But this is still in use by other user ports, so we end up breaking unicast termination for them. There isn't a problem in the fact that DSA keeps track of host standalone addresses in the individual database of each user port: some drivers like sja1105 need this. There also isn't a problem in the fact that some drivers choose the same VID/FID for all standalone ports. It is just that the deletion of these host addresses must be delayed until they are known to not be in use any longer, and only the driver has this knowledge. Since DSA keeps these addresses in &cpu_dp->fdbs and &cpu_db->mdbs, it is just a matter of walking over those lists and see whether the same MAC address is present on the CPU port in the port db of another user port. I have considered reusing the generic dsa_port_walk_fdbs() and dsa_port_walk_mdbs() schemes for this, but locking makes it difficult. In the ->port_fdb_add() method and co, &dp->addr_lists_lock is held, but dsa_port_walk_fdbs() also acquires that lock. Also, even assuming that we introduce an unlocked variant of the address iterator, we'd still need some relatively complex data structures, and a void *ctx in the dsa_fdb_walk_cb_t which we don't currently pass, such that drivers are able to figure out, after iterating, whether the same MAC address is or isn't present in the port db of another port. All the above, plus the fact that I expect other drivers to follow the same model as felix where all standalone ports use the same FID, made me conclude that a generic method provided by DSA is necessary: dsa_fdb_present_in_other_db() and the mdb equivalent. Felix calls this from the ->port_fdb_del() handler for the CPU port, when the database was classified to either a port db, or a LAG db. For symmetry, we also call this from ->port_fdb_add(), because if the address was installed once, then installing it a second time serves no purpose: it's already in hardware in VID 0 and it affects all standalone ports. This change moves dsa_db_equal() from switch.c to dsa.c, since it now has one more caller. Fixes: 54c319846086 ("net: mscc: ocelot: enforce FDB isolation when VLAN-unaware") Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- include/net/dsa.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/net') diff --git a/include/net/dsa.h b/include/net/dsa.h index 759479fe8573..9d16505fc0e2 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -1227,6 +1227,12 @@ typedef int dsa_fdb_walk_cb_t(struct dsa_switch *ds, int port, int dsa_port_walk_fdbs(struct dsa_switch *ds, int port, dsa_fdb_walk_cb_t cb); int dsa_port_walk_mdbs(struct dsa_switch *ds, int port, dsa_fdb_walk_cb_t cb); +bool dsa_fdb_present_in_other_db(struct dsa_switch *ds, int port, + const unsigned char *addr, u16 vid, + struct dsa_db db); +bool dsa_mdb_present_in_other_db(struct dsa_switch *ds, int port, + const struct switchdev_obj_port_mdb *mdb, + struct dsa_db db); /* Keep inline for faster access in hot path */ static inline bool netdev_uses_dsa(const struct net_device *dev) -- cgit v1.2.3 From 1330b6ef3313fcec577d2b020c290dc8b9f11f1a Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 7 Mar 2022 16:44:21 -0800 Subject: skb: make drop reason booleanable We have a number of cases where function returns drop/no drop decision as a boolean. Now that we want to report the reason code as well we have to pass extra output arguments. We can make the reason code evaluate correctly as bool. I believe we're good to reorder the reasons as they are reported to user space as strings. Signed-off-by: Jakub Kicinski Reviewed-by: David Ahern Signed-off-by: David S. Miller --- include/net/tcp.h | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) (limited to 'include/net') diff --git a/include/net/tcp.h b/include/net/tcp.h index d486d7b6112d..ee8237b58e1d 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1674,10 +1674,11 @@ tcp_md5_do_lookup(const struct sock *sk, int l3index, return NULL; return __tcp_md5_do_lookup(sk, l3index, addr, family); } -bool tcp_inbound_md5_hash(const struct sock *sk, const struct sk_buff *skb, - enum skb_drop_reason *reason, - const void *saddr, const void *daddr, - int family, int dif, int sdif); + +enum skb_drop_reason +tcp_inbound_md5_hash(const struct sock *sk, const struct sk_buff *skb, + const void *saddr, const void *daddr, + int family, int dif, int sdif); #define tcp_twsk_md5_key(twsk) ((twsk)->tw_md5_key) @@ -1688,13 +1689,13 @@ tcp_md5_do_lookup(const struct sock *sk, int l3index, { return NULL; } -static inline bool tcp_inbound_md5_hash(const struct sock *sk, - const struct sk_buff *skb, - enum skb_drop_reason *reason, - const void *saddr, const void *daddr, - int family, int dif, int sdif) + +static inline enum skb_drop_reason +tcp_inbound_md5_hash(const struct sock *sk, const struct sk_buff *skb, + const void *saddr, const void *daddr, + int family, int dif, int sdif); { - return false; + return SKB_NOT_DROPPED_YET; } #define tcp_twsk_md5_key(twsk) NULL #endif -- cgit v1.2.3 From 24055bb87977e0c687b54ebf7bac8715f3636bc3 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Wed, 9 Mar 2022 14:20:12 +0200 Subject: net: tcp: fix shim definition of tcp_inbound_md5_hash When CONFIG_TCP_MD5SIG isn't enabled, there is a compilation bug due to the fact that the static inline definition of tcp_inbound_md5_hash() has an unexpected semicolon. Remove it. Fixes: 1330b6ef3313 ("skb: make drop reason booleanable") Signed-off-by: Vladimir Oltean Link: https://lore.kernel.org/r/20220309122012.668986-1-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- include/net/tcp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/tcp.h b/include/net/tcp.h index ee8237b58e1d..70ca4a5e330a 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1693,7 +1693,7 @@ tcp_md5_do_lookup(const struct sock *sk, int l3index, static inline enum skb_drop_reason tcp_inbound_md5_hash(const struct sock *sk, const struct sk_buff *skb, const void *saddr, const void *daddr, - int family, int dif, int sdif); + int family, int dif, int sdif) { return SKB_NOT_DROPPED_YET; } -- cgit v1.2.3 From 77f09e66f613e4801134c64d26a0be593588a42e Mon Sep 17 00:00:00 2001 From: Dimitris Michailidis Date: Tue, 8 Mar 2022 19:40:31 -0800 Subject: net/tls: Provide {__,}tls_driver_ctx() unconditionally Having the definitions of {__,}tls_driver_ctx() under an #if guard means code referencing them also needs to rely on the preprocessor. The protection doesn't appear needed so make the definitions unconditional. Fixes: db37bc177dae ("net/funeth: add the data path") Reported-by: Randy Dunlap Reported-by: kernel test robot Suggested-by: Jakub Kicinski Signed-off-by: Dimitris Michailidis Signed-off-by: Jakub Kicinski --- include/net/tls.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/net') diff --git a/include/net/tls.h b/include/net/tls.h index 526cb2c3b724..b6968a5b5538 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -626,7 +626,6 @@ tls_offload_ctx_rx(const struct tls_context *tls_ctx) return (struct tls_offload_context_rx *)tls_ctx->priv_ctx_rx; } -#if IS_ENABLED(CONFIG_TLS_DEVICE) static inline void *__tls_driver_ctx(struct tls_context *tls_ctx, enum tls_offload_ctx_dir direction) { @@ -641,7 +640,6 @@ tls_driver_ctx(const struct sock *sk, enum tls_offload_ctx_dir direction) { return __tls_driver_ctx(tls_get_ctx(sk), direction); } -#endif #define RESYNC_REQ BIT(0) #define RESYNC_REQ_ASYNC BIT(1) -- cgit v1.2.3 From 65466904b015f6eeb9225b51aeb29b01a1d4b59c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 8 Mar 2022 17:57:57 -0800 Subject: tcp: adjust TSO packet sizes based on min_rtt Back when tcp_tso_autosize() and TCP pacing were introduced, our focus was really to reduce burst sizes for long distance flows. The simple heuristic of using sk_pacing_rate/1024 has worked well, but can lead to too small packets for hosts in the same rack/cluster, when thousands of flows compete for the bottleneck. Neal Cardwell had the idea of making the TSO burst size a function of both sk_pacing_rate and tcp_min_rtt() Indeed, for local flows, sending bigger bursts is better to reduce cpu costs, as occasional losses can be repaired quite fast. This patch is based on Neal Cardwell implementation done more than two years ago. bbr is adjusting max_pacing_rate based on measured bandwidth, while cubic would over estimate max_pacing_rate. /proc/sys/net/ipv4/tcp_tso_rtt_log can be used to tune or disable this new feature, in logarithmic steps. Tested: 100Gbit NIC, two hosts in the same rack, 4K MTU. 600 flows rate-limited to 20000000 bytes per second. Before patch: (TSO sizes would be limited to 20000000/1024/4096 -> 4 segments per TSO) ~# echo 0 >/proc/sys/net/ipv4/tcp_tso_rtt_log ~# nstat -n;perf stat ./super_netperf 600 -H otrv6 -l 20 -- -K dctcp -q 20000000;nstat|egrep "TcpInSegs|TcpOutSegs|TcpRetransSegs|Delivered" 96005 Performance counter stats for './super_netperf 600 -H otrv6 -l 20 -- -K dctcp -q 20000000': 65,945.29 msec task-clock # 2.845 CPUs utilized 1,314,632 context-switches # 19935.279 M/sec 5,292 cpu-migrations # 80.249 M/sec 940,641 page-faults # 14264.023 M/sec 201,117,030,926 cycles # 3049769.216 GHz (83.45%) 17,699,435,405 stalled-cycles-frontend # 8.80% frontend cycles idle (83.48%) 136,584,015,071 stalled-cycles-backend # 67.91% backend cycles idle (83.44%) 53,809,530,436 instructions # 0.27 insn per cycle # 2.54 stalled cycles per insn (83.36%) 9,062,315,523 branches # 137422329.563 M/sec (83.22%) 153,008,621 branch-misses # 1.69% of all branches (83.32%) 23.182970846 seconds time elapsed TcpInSegs 15648792 0.0 TcpOutSegs 58659110 0.0 # Average of 3.7 4K segments per TSO packet TcpExtTCPDelivered 58654791 0.0 TcpExtTCPDeliveredCE 19 0.0 After patch: ~# echo 9 >/proc/sys/net/ipv4/tcp_tso_rtt_log ~# nstat -n;perf stat ./super_netperf 600 -H otrv6 -l 20 -- -K dctcp -q 20000000;nstat|egrep "TcpInSegs|TcpOutSegs|TcpRetransSegs|Delivered" 96046 Performance counter stats for './super_netperf 600 -H otrv6 -l 20 -- -K dctcp -q 20000000': 48,982.58 msec task-clock # 2.104 CPUs utilized 186,014 context-switches # 3797.599 M/sec 3,109 cpu-migrations # 63.472 M/sec 941,180 page-faults # 19214.814 M/sec 153,459,763,868 cycles # 3132982.807 GHz (83.56%) 12,069,861,356 stalled-cycles-frontend # 7.87% frontend cycles idle (83.32%) 120,485,917,953 stalled-cycles-backend # 78.51% backend cycles idle (83.24%) 36,803,672,106 instructions # 0.24 insn per cycle # 3.27 stalled cycles per insn (83.18%) 5,947,266,275 branches # 121417383.427 M/sec (83.64%) 87,984,616 branch-misses # 1.48% of all branches (83.43%) 23.281200256 seconds time elapsed TcpInSegs 1434706 0.0 TcpOutSegs 58883378 0.0 # Average of 41 4K segments per TSO packet TcpExtTCPDelivered 58878971 0.0 TcpExtTCPDeliveredCE 9664 0.0 Signed-off-by: Eric Dumazet Reviewed-by: Neal Cardwell Link: https://lore.kernel.org/r/20220309015757.2532973-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski --- include/net/netns/ipv4.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index f0687867b5cd..ce0cc4e8d8c7 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -127,6 +127,7 @@ struct netns_ipv4 { u8 sysctl_tcp_synack_retries; u8 sysctl_tcp_syncookies; u8 sysctl_tcp_migrate_req; + u8 sysctl_tcp_comp_sack_nr; int sysctl_tcp_reordering; u8 sysctl_tcp_retries1; u8 sysctl_tcp_retries2; @@ -160,9 +161,9 @@ struct netns_ipv4 { int sysctl_tcp_challenge_ack_limit; int sysctl_tcp_min_rtt_wlen; u8 sysctl_tcp_min_tso_segs; + u8 sysctl_tcp_tso_rtt_log; u8 sysctl_tcp_autocorking; u8 sysctl_tcp_reflect_tos; - u8 sysctl_tcp_comp_sack_nr; int sysctl_tcp_invalid_ratelimit; int sysctl_tcp_pacing_ss_ratio; int sysctl_tcp_pacing_ca_ratio; -- cgit v1.2.3 From 3af722cb735d212554027ec81e2aa2e6bf1ee34d Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 8 Mar 2022 17:12:10 +0100 Subject: powerpc/net: Implement powerpc specific csum_shift() to remove branch Today's implementation of csum_shift() leads to branching based on parity of 'offset' 000002f8 : 2f8: 70 a5 00 01 andi. r5,r5,1 2fc: 41 a2 00 08 beq 304 300: 54 84 c0 3e rotlwi r4,r4,24 304: 7c 63 20 14 addc r3,r3,r4 308: 7c 63 01 94 addze r3,r3 30c: 4e 80 00 20 blr Use first bit of 'offset' directly as input of the rotation instead of branching. 000002f8 : 2f8: 54 a5 1f 38 rlwinm r5,r5,3,28,28 2fc: 20 a5 00 20 subfic r5,r5,32 300: 5c 84 28 3e rotlw r4,r4,r5 304: 7c 63 20 14 addc r3,r3,r4 308: 7c 63 01 94 addze r3,r3 30c: 4e 80 00 20 blr And change to left shift instead of right shift to skip one more instruction. This has no impact on the final sum. 000002f8 : 2f8: 54 a5 1f 38 rlwinm r5,r5,3,28,28 2fc: 5c 84 28 3e rotlw r4,r4,r5 300: 7c 63 20 14 addc r3,r3,r4 304: 7c 63 01 94 addze r3,r3 308: 4e 80 00 20 blr Seems like only powerpc benefits from a branchless implementation. Other main architectures like ARM or X86 get better code with the generic implementation and its branch. Signed-off-by: Christophe Leroy Signed-off-by: David S. Miller --- include/net/checksum.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/net') diff --git a/include/net/checksum.h b/include/net/checksum.h index 79c67f14c448..6bc783b7a06c 100644 --- a/include/net/checksum.h +++ b/include/net/checksum.h @@ -80,6 +80,7 @@ static __always_inline __sum16 csum16_sub(__sum16 csum, __be16 addend) return csum16_add(csum, ~addend); } +#ifndef HAVE_ARCH_CSUM_SHIFT static __always_inline __wsum csum_shift(__wsum sum, int offset) { /* rotate sum to align it with a 16b boundary */ @@ -87,6 +88,7 @@ static __always_inline __wsum csum_shift(__wsum sum, int offset) return (__force __wsum)ror32((__force u32)sum, 8); return sum; } +#endif static __always_inline __wsum csum_block_add(__wsum csum, __wsum csum2, int offset) -- cgit v1.2.3 From 7d352ccf1e9935b5222ca84e8baeb07a0c8f94b9 Mon Sep 17 00:00:00 2001 From: Youghandhar Chintala Date: Tue, 8 Mar 2022 17:23:24 +0530 Subject: mac80211: Add support to trigger sta disconnect on hardware restart Currently in case of target hardware restart, we just reconfig and re-enable the security keys and enable the network queues to start data traffic back from where it was interrupted. Many ath10k wifi chipsets have sequence numbers for the data packets assigned by firmware and the mac sequence number will restart from zero after target hardware restart leading to mismatch in the sequence number expected by the remote peer vs the sequence number of the frame sent by the target firmware. This mismatch in sequence number will cause out-of-order packets on the remote peer and all the frames sent by the device are dropped until we reach the sequence number which was sent before we restarted the target hardware In order to fix this, we trigger a sta disconnect, in case of target hw restart. After this there will be a fresh connection and thereby avoiding the dropping of frames by remote peer. The right fix would be to pull the entire data path into the host which is not feasible or would need lots of complex changes and will still be inefficient. Tested on ath10k using WCN3990, QCA6174 Signed-off-by: Youghandhar Chintala Link: https://lore.kernel.org/r/20220308115325.5246-2-youghand@codeaurora.org Signed-off-by: Johannes Berg --- include/net/mac80211.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index f118b8fe667a..b8e8c82b53aa 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -6070,6 +6070,16 @@ void ieee80211_disconnect(struct ieee80211_vif *vif, bool reconnect); */ void ieee80211_resume_disconnect(struct ieee80211_vif *vif); +/** + * ieee80211_hw_restart_disconnect - disconnect from AP after + * hardware restart + * @vif: &struct ieee80211_vif pointer from the add_interface callback. + * + * Instructs mac80211 to disconnect from the AP after + * hardware restart. + */ +void ieee80211_hw_restart_disconnect(struct ieee80211_vif *vif); + /** * ieee80211_cqm_rssi_notify - inform a configured connection quality monitoring * rssi threshold triggered -- cgit v1.2.3 From 9af41cc33471ea1efa6f77e188f055cc77d0a5c5 Mon Sep 17 00:00:00 2001 From: Wojciech Drewek Date: Fri, 4 Mar 2022 17:40:43 +0100 Subject: gtp: Implement GTP echo response Adding GTP device through ip link creates the situation where there is no userspace daemon which would handle GTP messages (Echo Request for example). GTP-U instance which would not respond to echo requests would violate GTP specification. When GTP packet arrives with GTP_ECHO_REQ message type, GTP_ECHO_RSP is send to the sender. GTP_ECHO_RSP message should contain information element with GTPIE_RECOVERY tag and restart counter value. For GTPv1 restart counter is not used and should be equal to 0, for GTPv0 restart counter contains information provided from userspace(IFLA_GTP_RESTART_COUNT). Signed-off-by: Wojciech Drewek Suggested-by: Harald Welte Reviewed-by: Harald Welte Tested-by: Harald Welte Signed-off-by: Tony Nguyen --- include/net/gtp.h | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) (limited to 'include/net') diff --git a/include/net/gtp.h b/include/net/gtp.h index 0e16ebb2a82d..0e12c37f2958 100644 --- a/include/net/gtp.h +++ b/include/net/gtp.h @@ -7,8 +7,13 @@ #define GTP0_PORT 3386 #define GTP1U_PORT 2152 +/* GTP messages types */ +#define GTP_ECHO_REQ 1 /* Echo Request */ +#define GTP_ECHO_RSP 2 /* Echo Response */ #define GTP_TPDU 255 +#define GTPIE_RECOVERY 14 + struct gtp0_header { /* According to GSM TS 09.60. */ __u8 flags; __u8 type; @@ -27,6 +32,32 @@ struct gtp1_header { /* According to 3GPP TS 29.060. */ __be32 tid; } __attribute__ ((packed)); +struct gtp1_header_long { /* According to 3GPP TS 29.060. */ + __u8 flags; + __u8 type; + __be16 length; + __be32 tid; + __be16 seq; + __u8 npdu; + __u8 next; +} __packed; + +/* GTP Information Element */ +struct gtp_ie { + __u8 tag; + __u8 val; +} __packed; + +struct gtp0_packet { + struct gtp0_header gtp0_h; + struct gtp_ie ie; +} __packed; + +struct gtp1u_packet { + struct gtp1_header_long gtp1u_h; + struct gtp_ie ie; +} __packed; + #define GTP1_F_NPDU 0x01 #define GTP1_F_SEQ 0x02 #define GTP1_F_EXTHDR 0x04 -- cgit v1.2.3 From e3acda7ade0a36c5cbebc2b54d30b7f08a4ba29b Mon Sep 17 00:00:00 2001 From: Wojciech Drewek Date: Fri, 4 Mar 2022 17:40:45 +0100 Subject: net/sched: Allow flower to match on GTP options Options are as follows: PDU_TYPE:QFI and they refernce to the fields from the PDU Session Protocol. PDU Session data is conveyed in GTP-U Extension Header. GTP-U Extension Header is described in 3GPP TS 29.281. PDU Session Protocol is described in 3GPP TS 38.415. PDU_TYPE - indicates the type of the PDU Session Information (4 bits) QFI - QoS Flow Identifier (6 bits) # ip link add gtp_dev type gtp role sgsn # tc qdisc add dev gtp_dev ingress # tc filter add dev gtp_dev protocol ip parent ffff: \ flower \ enc_key_id 11 \ gtp_opts 1:8/ff:ff \ action mirred egress redirect dev eth0 Signed-off-by: Wojciech Drewek Signed-off-by: Tony Nguyen --- include/net/gtp.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/net') diff --git a/include/net/gtp.h b/include/net/gtp.h index 0e12c37f2958..23c2aaae8a42 100644 --- a/include/net/gtp.h +++ b/include/net/gtp.h @@ -58,6 +58,11 @@ struct gtp1u_packet { struct gtp_ie ie; } __packed; +struct gtp_pdu_session_info { /* According to 3GPP TS 38.415. */ + u8 pdu_type; + u8 qfi; +}; + #define GTP1_F_NPDU 0x01 #define GTP1_F_SEQ 0x02 #define GTP1_F_EXTHDR 0x04 -- cgit v1.2.3 From 81dd9849fa4911f76a14f354a048865894b9751e Mon Sep 17 00:00:00 2001 From: Wojciech Drewek Date: Fri, 4 Mar 2022 17:40:46 +0100 Subject: gtp: Add support for checking GTP device type Add a function that checks if a net device type is GTP. Signed-off-by: Wojciech Drewek Reviewed-by: Harald Welte Signed-off-by: Tony Nguyen --- include/net/gtp.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/net') diff --git a/include/net/gtp.h b/include/net/gtp.h index 23c2aaae8a42..c1d6169df331 100644 --- a/include/net/gtp.h +++ b/include/net/gtp.h @@ -63,6 +63,12 @@ struct gtp_pdu_session_info { /* According to 3GPP TS 38.415. */ u8 qfi; }; +static inline bool netif_is_gtp(const struct net_device *dev) +{ + return dev->rtnl_link_ops && + !strcmp(dev->rtnl_link_ops->kind, "gtp"); +} + #define GTP1_F_NPDU 0x01 #define GTP1_F_SEQ 0x02 #define GTP1_F_EXTHDR 0x04 -- cgit v1.2.3 From 8e6ed963763fe21429eabfc76c69ce2b0163a3dd Mon Sep 17 00:00:00 2001 From: Jiyong Park Date: Fri, 11 Mar 2022 11:00:16 +0900 Subject: vsock: each transport cycles only on its own sockets When iterating over sockets using vsock_for_each_connected_socket, make sure that a transport filters out sockets that don't belong to the transport. There actually was an issue caused by this; in a nested VM configuration, destroying the nested VM (which often involves the closing of /dev/vhost-vsock if there was h2g connections to the nested VM) kills not only the h2g connections, but also all existing g2h connections to the (outmost) host which are totally unrelated. Tested: Executed the following steps on Cuttlefish (Android running on a VM) [1]: (1) Enter into an `adb shell` session - to have a g2h connection inside the VM, (2) open and then close /dev/vhost-vsock by `exec 3< /dev/vhost-vsock && exec 3<&-`, (3) observe that the adb session is not reset. [1] https://android.googlesource.com/device/google/cuttlefish/ Fixes: c0cfa2d8a788 ("vsock: add multi-transports support") Reviewed-by: Stefano Garzarella Acked-by: Michael S. Tsirkin Signed-off-by: Jiyong Park Link: https://lore.kernel.org/r/20220311020017.1509316-1-jiyong@google.com Signed-off-by: Jakub Kicinski --- include/net/af_vsock.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h index ab207677e0a8..f742e50207fb 100644 --- a/include/net/af_vsock.h +++ b/include/net/af_vsock.h @@ -205,7 +205,8 @@ struct sock *vsock_find_bound_socket(struct sockaddr_vm *addr); struct sock *vsock_find_connected_socket(struct sockaddr_vm *src, struct sockaddr_vm *dst); void vsock_remove_sock(struct vsock_sock *vsk); -void vsock_for_each_connected_socket(void (*fn)(struct sock *sk)); +void vsock_for_each_connected_socket(struct vsock_transport *transport, + void (*fn)(struct sock *sk)); int vsock_assign_transport(struct vsock_sock *vsk, struct vsock_sock *psk); bool vsock_find_cid(unsigned int cid); -- cgit v1.2.3 From 625788b5844511cf4c30cffa7fa0bc3a69cebc82 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 10 Mar 2022 21:14:20 -0800 Subject: net: add per-cpu storage and net->core_stats Before adding yet another possibly contended atomic_long_t, it is time to add per-cpu storage for existing ones: dev->tx_dropped, dev->rx_dropped, and dev->rx_nohandler Because many devices do not have to increment such counters, allocate the per-cpu storage on demand, so that dev_get_stats() does not have to spend considerable time folding zero counters. Note that some drivers have abused these counters which were supposed to be only used by core networking stack. v4: should use per_cpu_ptr() in dev_get_stats() (Jakub) v3: added a READ_ONCE() in netdev_core_stats_alloc() (Paolo) v2: add a missing include (reported by kernel test robot ) Change in netdev_core_stats_alloc() (Jakub) Signed-off-by: Eric Dumazet Cc: jeffreyji Reviewed-by: Brian Vazquez Reviewed-by: Jakub Kicinski Acked-by: Paolo Abeni Link: https://lore.kernel.org/r/20220311051420.2608812-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski --- include/net/bonding.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/bonding.h b/include/net/bonding.h index d0dfe727e0b1..b14f4c0b4e9e 100644 --- a/include/net/bonding.h +++ b/include/net/bonding.h @@ -770,7 +770,7 @@ extern const struct sysfs_ops slave_sysfs_ops; static inline netdev_tx_t bond_tx_drop(struct net_device *dev, struct sk_buff *skb) { - atomic_long_inc(&dev->tx_dropped); + dev_core_stats_tx_dropped_inc(dev); dev_kfree_skb_any(skb); return NET_XMIT_DROP; } -- cgit v1.2.3 From d538eca85c2aa6ecb5036e6ff47efc93d9f294da Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 11 Mar 2022 23:15:18 +0200 Subject: net: dsa: report and change port default priority using dcbnl The port-based default QoS class is assigned to packets that lack a VLAN PCP (or the port is configured to not trust the VLAN PCP), an IP DSCP (or the port is configured to not trust IP DSCP), and packets on which no tc-skbedit action has matched. Similar to other drivers, this can be exposed to user space using the DCB Application Priority Table. IEEE 802.1Q-2018 specifies in Table D-8 - Sel field values that when the Selector is 1, the Protocol ID value of 0 denotes the "Default application priority. For use when application priority is not otherwise specified." The way in which the dcbnl integration in DSA has been designed has to do with its requirements. Andrew Lunn explains that SOHO switches are expected to come with some sort of pre-configured QoS profile, and that it is desirable for this to come pre-loaded into the DSA slave interfaces' DCB application priority table. In the dcbnl design, this is possible because calls to dcb_ieee_setapp() can be initiated by anyone including being self-initiated by this device driver. However, what makes this challenging to implement in DSA is that the DSA core manages the net_devices (effectively hiding them from drivers), while drivers manage the hardware. The DSA core has no knowledge of what individual drivers' QoS policies are. DSA could export to drivers a wrapper over dcb_ieee_setapp() and these could call that function to pre-populate the app priority table, however drivers don't have a good moment in time to do this. The dsa_switch_ops :: setup() method gets called before the net_devices are created (dsa_slave_create), and so is dsa_switch_ops :: port_setup(). What remains is dsa_switch_ops :: port_enable(), but this gets called upon each ndo_open. If we add app table entries on every open, we'd need to remove them on close, to avoid duplicate entry errors. But if we delete app priority entries on close, what we delete may not be the initial, driver pre-populated entries, but rather user-added entries. So it is clear that letting drivers choose the timing of the dcb_ieee_setapp() call is inappropriate. The alternative which was chosen is to introduce hardware-specific ops in dsa_switch_ops, and effectively hide dcbnl details from drivers as well. For pre-populating the application table, dsa_slave_dcbnl_init() will call ds->ops->port_get_default_prio() which is supposed to read from hardware. If the operation succeeds, DSA creates a default-prio app table entry. The method is called as soon as the slave_dev is registered, but before we release the rtnl_mutex. This is done such that user space sees the app table entries as soon as it sees the interface being registered. The fact that we populate slave_dev->dcbnl_ops with a non-NULL pointer changes behavior in dcb_doit() from net/dcb/dcbnl.c, which used to return -EOPNOTSUPP for any dcbnl operation where netdev->dcbnl_ops is NULL. Because there are still dcbnl-unaware DSA drivers even if they have dcbnl_ops populated, the way to restore the behavior is to make all dcbnl_ops return -EOPNOTSUPP on absence of the hardware-specific dsa_switch_ops method. The dcbnl framework absurdly allows there to be more than one app table entry for the same selector and protocol (in other words, more than one port-based default priority). In the iproute2 dcb program, there is a "replace" syntactical sugar command which performs an "add" and a "del" to hide this away. But we choose the largest configured priority when we call ds->ops->port_set_default_prio(), using __fls(). When there is no default-prio app table entry left, the port-default priority is restored to 0. Link: https://patchwork.kernel.org/project/netdevbpf/patch/20210113154139.1803705-2-olteanv@gmail.com/ Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- include/net/dsa.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/net') diff --git a/include/net/dsa.h b/include/net/dsa.h index 9d16505fc0e2..1220af73151b 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -892,6 +892,13 @@ struct dsa_switch_ops { int (*get_ts_info)(struct dsa_switch *ds, int port, struct ethtool_ts_info *ts); + /* + * DCB ops + */ + int (*port_get_default_prio)(struct dsa_switch *ds, int port); + int (*port_set_default_prio)(struct dsa_switch *ds, int port, + u8 prio); + /* * Suspend and resume */ -- cgit v1.2.3 From 47d75f7822064d024ec9207c0fc1777f983783b7 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 11 Mar 2022 23:15:19 +0200 Subject: net: dsa: report and change port dscp priority using dcbnl Similar to the port-based default priority, IEEE 802.1Q-2018 allows the Application Priority Table to define QoS classes (0 to 7) per IP DSCP value (0 to 63). In the absence of an app table entry for a packet with DSCP value X, QoS classification for that packet falls back to other methods (VLAN PCP or port-based default). The presence of an app table for DSCP value X with priority Y makes the hardware classify the packet to QoS class Y. As opposed to the default-prio where DSA exposes only a "set" in dsa_switch_ops (because the port-based default is the fallback, it always exists, either implicitly or explicitly), for DSCP priorities we expose an "add" and a "del". The addition of a DSCP entry means trusting that DSCP priority, the deletion means ignoring it. Drivers that already trust (at least some) DSCP values can describe their configuration in dsa_switch_ops :: port_get_dscp_prio(), which is called for each DSCP value from 0 to 63. Again, there can be more than one dcbnl app table entry for the same DSCP value, DSA chooses the one with the largest configured priority. Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- include/net/dsa.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/net') diff --git a/include/net/dsa.h b/include/net/dsa.h index 1220af73151b..9bfe984fcdbf 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -898,6 +898,11 @@ struct dsa_switch_ops { int (*port_get_default_prio)(struct dsa_switch *ds, int port); int (*port_set_default_prio)(struct dsa_switch *ds, int port, u8 prio); + int (*port_get_dscp_prio)(struct dsa_switch *ds, int port, u8 dscp); + int (*port_add_dscp_prio)(struct dsa_switch *ds, int port, u8 dscp, + u8 prio); + int (*port_del_dscp_prio)(struct dsa_switch *ds, int port, u8 dscp, + u8 prio); /* * Suspend and resume -- cgit v1.2.3 From 2b3171c6fe0af24b5506e061525e08917a2f744a Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Thu, 24 Feb 2022 12:54:58 +0100 Subject: mac80211: MBSSID beacon handling in AP mode Add new fields in struct beacon_data to store all MBSSID elements. Generate a beacon template which includes all MBSSID elements. Move CSA offset to reflect the MBSSID element length. Co-developed-by: Aloka Dixit Signed-off-by: Aloka Dixit Co-developed-by: John Crispin Signed-off-by: John Crispin Signed-off-by: Lorenzo Bianconi Tested-by: Money Wang Link: https://lore.kernel.org/r/5322db3c303f431adaf191ab31c45e151dde5465.1645702516.git.lorenzo@kernel.org [small cleanups] Signed-off-by: Johannes Berg --- include/net/mac80211.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index b8e8c82b53aa..382ebb862ea8 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -4948,12 +4948,14 @@ void ieee80211_report_low_ack(struct ieee80211_sta *sta, u32 num_packets); * @cntdwn_counter_offs: array of IEEE80211_MAX_CNTDWN_COUNTERS_NUM offsets * to countdown counters. This array can contain zero values which * should be ignored. + * @mbssid_off: position of the multiple bssid element */ struct ieee80211_mutable_offsets { u16 tim_offset; u16 tim_length; u16 cntdwn_counter_offs[IEEE80211_MAX_CNTDWN_COUNTERS_NUM]; + u16 mbssid_off; }; /** -- cgit v1.2.3 From 40867d74c374b235e14d839f3a77f26684feefe5 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Mon, 14 Mar 2022 14:45:51 -0600 Subject: net: Add l3mdev index to flow struct and avoid oif reset for port devices The fundamental premise of VRF and l3mdev core code is binding a socket to a device (l3mdev or netdev with an L3 domain) to indicate L3 scope. Legacy code resets flowi_oif to the l3mdev losing any original port device binding. Ben (among others) has demonstrated use cases where the original port device binding is important and needs to be retained. This patch handles that by adding a new entry to the common flow struct that can indicate the l3mdev index for later rule and table matching avoiding the need to reset flowi_oif. In addition to allowing more use cases that require port device binds, this patch brings a few datapath simplications: 1. l3mdev_fib_rule_match is only called when walking fib rules and always after l3mdev_update_flow. That allows an optimization to bail early for non-VRF type uses cases when flowi_l3mdev is not set. Also, only that index needs to be checked for the FIB table id. 2. l3mdev_update_flow can be called with flowi_oif set to a l3mdev (e.g., VRF) device. By resetting flowi_oif only for this case the FLOWI_FLAG_SKIP_NH_OIF flag is not longer needed and can be removed, removing several checks in the datapath. The flowi_iif path can be simplified to only be called if the it is not loopback (loopback can not be assigned to an L3 domain) and the l3mdev index is not already set. 3. Avoid another device lookup in the output path when the fib lookup returns a reject failure. Note: 2 functional tests for local traffic with reject fib rules are updated to reflect the new direct failure at FIB lookup time for ping rather than the failure on packet path. The current code fails like this: HINT: Fails since address on vrf device is out of device scope COMMAND: ip netns exec ns-A ping -c1 -w1 -I eth1 172.16.3.1 ping: Warning: source address might be selected on device other than: eth1 PING 172.16.3.1 (172.16.3.1) from 172.16.3.1 eth1: 56(84) bytes of data. --- 172.16.3.1 ping statistics --- 1 packets transmitted, 0 received, 100% packet loss, time 0ms where the test now directly fails: HINT: Fails since address on vrf device is out of device scope COMMAND: ip netns exec ns-A ping -c1 -w1 -I eth1 172.16.3.1 ping: connect: No route to host Signed-off-by: David Ahern Tested-by: Ben Greear Link: https://lore.kernel.org/r/20220314204551.16369-1-dsahern@kernel.org Signed-off-by: Jakub Kicinski --- include/net/flow.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/flow.h b/include/net/flow.h index 58beb16a49b8..987bd511d652 100644 --- a/include/net/flow.h +++ b/include/net/flow.h @@ -29,6 +29,7 @@ struct flowi_tunnel { struct flowi_common { int flowic_oif; int flowic_iif; + int flowic_l3mdev; __u32 flowic_mark; __u8 flowic_tos; __u8 flowic_scope; @@ -36,7 +37,6 @@ struct flowi_common { __u8 flowic_flags; #define FLOWI_FLAG_ANYSRC 0x01 #define FLOWI_FLAG_KNOWN_NH 0x02 -#define FLOWI_FLAG_SKIP_NH_OIF 0x04 __u32 flowic_secid; kuid_t flowic_uid; struct flowi_tunnel flowic_tun_key; @@ -70,6 +70,7 @@ struct flowi4 { struct flowi_common __fl_common; #define flowi4_oif __fl_common.flowic_oif #define flowi4_iif __fl_common.flowic_iif +#define flowi4_l3mdev __fl_common.flowic_l3mdev #define flowi4_mark __fl_common.flowic_mark #define flowi4_tos __fl_common.flowic_tos #define flowi4_scope __fl_common.flowic_scope @@ -102,6 +103,7 @@ static inline void flowi4_init_output(struct flowi4 *fl4, int oif, { fl4->flowi4_oif = oif; fl4->flowi4_iif = LOOPBACK_IFINDEX; + fl4->flowi4_l3mdev = 0; fl4->flowi4_mark = mark; fl4->flowi4_tos = tos; fl4->flowi4_scope = scope; @@ -132,6 +134,7 @@ struct flowi6 { struct flowi_common __fl_common; #define flowi6_oif __fl_common.flowic_oif #define flowi6_iif __fl_common.flowic_iif +#define flowi6_l3mdev __fl_common.flowic_l3mdev #define flowi6_mark __fl_common.flowic_mark #define flowi6_scope __fl_common.flowic_scope #define flowi6_proto __fl_common.flowic_proto @@ -177,6 +180,7 @@ struct flowi { } u; #define flowi_oif u.__fl_common.flowic_oif #define flowi_iif u.__fl_common.flowic_iif +#define flowi_l3mdev u.__fl_common.flowic_l3mdev #define flowi_mark u.__fl_common.flowic_mark #define flowi_tos u.__fl_common.flowic_tos #define flowi_scope u.__fl_common.flowic_scope -- cgit v1.2.3 From 0492d857636e1c52cd71594a723c4b26a7b31978 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 16 Mar 2022 11:19:43 +0100 Subject: netfilter: flowtable: Fix QinQ and pppoe support for inet table nf_flow_offload_inet_hook() does not check for 802.1q and PPPoE. Fetch inner ethertype from these encapsulation protocols. Fixes: 72efd585f714 ("netfilter: flowtable: add pppoe support") Fixes: 4cd91f7c290f ("netfilter: flowtable: add vlan support") Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_flow_table.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include/net') diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index bd59e950f4d6..64daafd1fc41 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -10,6 +10,8 @@ #include #include #include +#include +#include struct nf_flowtable; struct nf_flow_rule; @@ -317,4 +319,20 @@ int nf_flow_rule_route_ipv6(struct net *net, const struct flow_offload *flow, int nf_flow_table_offload_init(void); void nf_flow_table_offload_exit(void); +static inline __be16 nf_flow_pppoe_proto(const struct sk_buff *skb) +{ + __be16 proto; + + proto = *((__be16 *)(skb_mac_header(skb) + ETH_HLEN + + sizeof(struct pppoe_hdr))); + switch (proto) { + case htons(PPP_IP): + return htons(ETH_P_IP); + case htons(PPP_IPV6): + return htons(ETH_P_IPV6); + } + + return 0; +} + #endif /* _NF_FLOW_TABLE_H */ -- cgit v1.2.3 From 2cb7b4890d6e7f20560dc251e7f8d3cc68b0d554 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 14 Mar 2022 23:00:04 -0700 Subject: devlink: expose instance locking and add locked port registering It should be familiar and beneficial to expose devlink instance lock to the drivers. This way drivers can block devlink from calling them during critical sections without breakneck locking. Add port helpers, port splitting callbacks will be the first target. Use 'devl_' prefix for "explicitly locked" API. Initial RFC used '__devlink' but that's too much typing. devl_lock_is_held() is not defined without lockdep, which is the same behavior as lockdep_is_held() itself. Reviewed-by: Leon Romanovsky Signed-off-by: Jakub Kicinski --- include/net/devlink.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/net') diff --git a/include/net/devlink.h b/include/net/devlink.h index 8d5349d2fb68..9de0d091aee9 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -1479,6 +1479,17 @@ void *devlink_priv(struct devlink *devlink); struct devlink *priv_to_devlink(void *priv); struct device *devlink_to_dev(const struct devlink *devlink); +/* Devlink instance explicit locking */ +void devl_lock(struct devlink *devlink); +void devl_unlock(struct devlink *devlink); +void devl_assert_locked(struct devlink *devlink); +bool devl_lock_is_held(struct devlink *devlink); + +int devl_port_register(struct devlink *devlink, + struct devlink_port *devlink_port, + unsigned int port_index); +void devl_port_unregister(struct devlink_port *devlink_port); + struct ib_device; struct net *devlink_net(const struct devlink *devlink); -- cgit v1.2.3 From 706217c1ceb516c96283a1557a31fe003d0c8052 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 14 Mar 2022 23:00:09 -0700 Subject: devlink: pass devlink_port to port_split / port_unsplit callbacks Now that devlink ports are protected by the instance lock it seems natural to pass devlink_port as an argument to the port_split / port_unsplit callbacks. This should save the drivers from doing a lookup. In theory drivers may have supported unsplitting ports which were not registered prior to this change. Reviewed-by: Ido Schimmel Tested-by: Ido Schimmel Signed-off-by: Jakub Kicinski --- include/net/devlink.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/devlink.h b/include/net/devlink.h index 9de0d091aee9..fd89a17adea1 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -1197,9 +1197,9 @@ struct devlink_ops { struct netlink_ext_ack *extack); int (*port_type_set)(struct devlink_port *devlink_port, enum devlink_port_type port_type); - int (*port_split)(struct devlink *devlink, unsigned int port_index, + int (*port_split)(struct devlink *devlink, struct devlink_port *port, unsigned int count, struct netlink_ext_ack *extack); - int (*port_unsplit)(struct devlink *devlink, unsigned int port_index, + int (*port_unsplit)(struct devlink *devlink, struct devlink_port *port, struct netlink_ext_ack *extack); int (*sb_pool_get)(struct devlink *devlink, unsigned int sb_index, u16 pool_index, -- cgit v1.2.3 From ab95465cde2337108252cdf01f064abdc1a67f6c Mon Sep 17 00:00:00 2001 From: Maor Dickman Date: Tue, 15 Mar 2022 13:02:09 +0200 Subject: net/sched: add vlan push_eth and pop_eth action to the hardware IR Add vlan push_eth and pop_eth action to the hardware intermediate representation model which would subsequently allow it to be used by drivers for offload. Signed-off-by: Maor Dickman Reviewed-by: Roi Dayan Signed-off-by: Jakub Kicinski --- include/net/flow_offload.h | 6 ++++++ include/net/tc_act/tc_vlan.h | 10 ++++++++++ 2 files changed, 16 insertions(+) (limited to 'include/net') diff --git a/include/net/flow_offload.h b/include/net/flow_offload.h index 92267d23083e..021778a7e1af 100644 --- a/include/net/flow_offload.h +++ b/include/net/flow_offload.h @@ -150,6 +150,8 @@ enum flow_action_id { FLOW_ACTION_PPPOE_PUSH, FLOW_ACTION_JUMP, FLOW_ACTION_PIPE, + FLOW_ACTION_VLAN_PUSH_ETH, + FLOW_ACTION_VLAN_POP_ETH, NUM_FLOW_ACTIONS, }; @@ -211,6 +213,10 @@ struct flow_action_entry { __be16 proto; u8 prio; } vlan; + struct { /* FLOW_ACTION_VLAN_PUSH_ETH */ + unsigned char dst[ETH_ALEN]; + unsigned char src[ETH_ALEN]; + } vlan_push_eth; struct { /* FLOW_ACTION_MANGLE */ /* FLOW_ACTION_ADD */ enum flow_action_mangle_base htype; diff --git a/include/net/tc_act/tc_vlan.h b/include/net/tc_act/tc_vlan.h index f94b8bc26f9e..a97600f742de 100644 --- a/include/net/tc_act/tc_vlan.h +++ b/include/net/tc_act/tc_vlan.h @@ -78,4 +78,14 @@ static inline u8 tcf_vlan_push_prio(const struct tc_action *a) return tcfv_push_prio; } + +static inline void tcf_vlan_push_eth(unsigned char *src, unsigned char *dest, + const struct tc_action *a) +{ + rcu_read_lock(); + memcpy(dest, rcu_dereference(to_vlan(a)->vlan_p)->tcfv_push_dst, ETH_ALEN); + memcpy(dest, rcu_dereference(to_vlan(a)->vlan_p)->tcfv_push_src, ETH_ALEN); + rcu_read_unlock(); +} + #endif /* __NET_TC_VLAN_H */ -- cgit v1.2.3 From 5142239a22219921a7863cf00c9ab853c00689d8 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Fri, 11 Mar 2022 10:14:18 +0100 Subject: net: veth: Account total xdp_frame len running ndo_xdp_xmit Even if this is a theoretical issue since it is not possible to perform XDP_REDIRECT on a non-linear xdp_frame, veth driver does not account paged area in ndo_xdp_xmit function pointer. Introduce xdp_get_frame_len utility routine to get the xdp_frame full length and account total frame size running XDP_REDIRECT of a non-linear xdp frame into a veth device. Signed-off-by: Lorenzo Bianconi Signed-off-by: Daniel Borkmann Acked-by: Toke Hoiland-Jorgensen Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/54f9fd3bb65d190daf2c0bbae2f852ff16cfbaa0.1646989407.git.lorenzo@kernel.org --- include/net/xdp.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/net') diff --git a/include/net/xdp.h b/include/net/xdp.h index b7721c3e4d1f..04c852c7a77f 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -343,6 +343,20 @@ out: __xdp_release_frame(xdpf->data, mem); } +static __always_inline unsigned int xdp_get_frame_len(struct xdp_frame *xdpf) +{ + struct skb_shared_info *sinfo; + unsigned int len = xdpf->len; + + if (likely(!xdp_frame_has_frags(xdpf))) + goto out; + + sinfo = xdp_get_shared_info_from_frame(xdpf); + len += sinfo->xdp_frags_size; +out: + return len; +} + int __xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq, struct net_device *dev, u32 queue_index, unsigned int napi_id, u32 frag_size); -- cgit v1.2.3 From 87c167bb94ee3fd49569d4aa2038b9b8840d906f Mon Sep 17 00:00:00 2001 From: Tobias Waldekranz Date: Wed, 16 Mar 2022 16:08:46 +0100 Subject: net: bridge: mst: Notify switchdev drivers of MST mode changes Trigger a switchdev event whenever the bridge's MST mode is enabled/disabled. This allows constituent ports to either perform any required hardware config, or refuse the change if it not supported. Signed-off-by: Tobias Waldekranz Acked-by: Nikolay Aleksandrov Signed-off-by: Jakub Kicinski --- include/net/switchdev.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/net') diff --git a/include/net/switchdev.h b/include/net/switchdev.h index 3e424d40fae3..85dd004dc9ad 100644 --- a/include/net/switchdev.h +++ b/include/net/switchdev.h @@ -27,6 +27,7 @@ enum switchdev_attr_id { SWITCHDEV_ATTR_ID_BRIDGE_VLAN_PROTOCOL, SWITCHDEV_ATTR_ID_BRIDGE_MC_DISABLED, SWITCHDEV_ATTR_ID_BRIDGE_MROUTER, + SWITCHDEV_ATTR_ID_BRIDGE_MST, SWITCHDEV_ATTR_ID_MRP_PORT_ROLE, }; @@ -48,6 +49,7 @@ struct switchdev_attr { clock_t ageing_time; /* BRIDGE_AGEING_TIME */ bool vlan_filtering; /* BRIDGE_VLAN_FILTERING */ u16 vlan_protocol; /* BRIDGE_VLAN_PROTOCOL */ + bool mst; /* BRIDGE_MST */ bool mc_disabled; /* MC_DISABLED */ u8 mrp_port_role; /* MRP_PORT_ROLE */ } u; -- cgit v1.2.3 From 6284c723d9b9995cc27ab3c6368a9d95d67111ff Mon Sep 17 00:00:00 2001 From: Tobias Waldekranz Date: Wed, 16 Mar 2022 16:08:47 +0100 Subject: net: bridge: mst: Notify switchdev drivers of VLAN MSTI migrations Whenever a VLAN moves to a new MSTI, send a switchdev notification so that switchdevs can track a bridge's VID to MSTI mappings. Signed-off-by: Tobias Waldekranz Acked-by: Nikolay Aleksandrov Signed-off-by: Jakub Kicinski --- include/net/switchdev.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/net') diff --git a/include/net/switchdev.h b/include/net/switchdev.h index 85dd004dc9ad..53dfa0f7cf5b 100644 --- a/include/net/switchdev.h +++ b/include/net/switchdev.h @@ -29,6 +29,7 @@ enum switchdev_attr_id { SWITCHDEV_ATTR_ID_BRIDGE_MROUTER, SWITCHDEV_ATTR_ID_BRIDGE_MST, SWITCHDEV_ATTR_ID_MRP_PORT_ROLE, + SWITCHDEV_ATTR_ID_VLAN_MSTI, }; struct switchdev_brport_flags { @@ -36,6 +37,11 @@ struct switchdev_brport_flags { unsigned long mask; }; +struct switchdev_vlan_msti { + u16 vid; + u16 msti; +}; + struct switchdev_attr { struct net_device *orig_dev; enum switchdev_attr_id id; @@ -52,6 +58,7 @@ struct switchdev_attr { bool mst; /* BRIDGE_MST */ bool mc_disabled; /* MC_DISABLED */ u8 mrp_port_role; /* MRP_PORT_ROLE */ + struct switchdev_vlan_msti vlan_msti; /* VLAN_MSTI */ } u; }; -- cgit v1.2.3 From 7ae9147f4312903b97eae231c48571bbd95dc63f Mon Sep 17 00:00:00 2001 From: Tobias Waldekranz Date: Wed, 16 Mar 2022 16:08:48 +0100 Subject: net: bridge: mst: Notify switchdev drivers of MST state changes Generate a switchdev notification whenever an MST state changes. This notification is keyed by the VLANs MSTI rather than the VID, since multiple VLANs may share the same MST instance. Signed-off-by: Tobias Waldekranz Acked-by: Nikolay Aleksandrov Signed-off-by: Jakub Kicinski --- include/net/switchdev.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/net') diff --git a/include/net/switchdev.h b/include/net/switchdev.h index 53dfa0f7cf5b..aa0171d5786d 100644 --- a/include/net/switchdev.h +++ b/include/net/switchdev.h @@ -19,6 +19,7 @@ enum switchdev_attr_id { SWITCHDEV_ATTR_ID_UNDEFINED, SWITCHDEV_ATTR_ID_PORT_STP_STATE, + SWITCHDEV_ATTR_ID_PORT_MST_STATE, SWITCHDEV_ATTR_ID_PORT_BRIDGE_FLAGS, SWITCHDEV_ATTR_ID_PORT_PRE_BRIDGE_FLAGS, SWITCHDEV_ATTR_ID_PORT_MROUTER, @@ -32,6 +33,11 @@ enum switchdev_attr_id { SWITCHDEV_ATTR_ID_VLAN_MSTI, }; +struct switchdev_mst_state { + u16 msti; + u8 state; +}; + struct switchdev_brport_flags { unsigned long val; unsigned long mask; @@ -50,6 +56,7 @@ struct switchdev_attr { void (*complete)(struct net_device *dev, int err, void *priv); union { u8 stp_state; /* PORT_STP_STATE */ + struct switchdev_mst_state mst_state; /* PORT_MST_STATE */ struct switchdev_brport_flags brport_flags; /* PORT_BRIDGE_FLAGS */ bool mrouter; /* PORT_MROUTER */ clock_t ageing_time; /* BRIDGE_AGEING_TIME */ -- cgit v1.2.3 From 8e6598a7b0fa834a563392d30017eac1b0102fcd Mon Sep 17 00:00:00 2001 From: Tobias Waldekranz Date: Wed, 16 Mar 2022 16:08:53 +0100 Subject: net: dsa: Pass VLAN MSTI migration notifications to driver Add the usual trampoline functionality from the generic DSA layer down to the drivers for VLAN MSTI migrations. Signed-off-by: Tobias Waldekranz Reviewed-by: Vladimir Oltean Signed-off-by: Jakub Kicinski --- include/net/dsa.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/net') diff --git a/include/net/dsa.h b/include/net/dsa.h index 9bfe984fcdbf..644fda2293a2 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -976,6 +976,9 @@ struct dsa_switch_ops { struct netlink_ext_ack *extack); int (*port_vlan_del)(struct dsa_switch *ds, int port, const struct switchdev_obj_port_vlan *vlan); + int (*vlan_msti_set)(struct dsa_switch *ds, struct dsa_bridge bridge, + const struct switchdev_vlan_msti *msti); + /* * Forwarding database */ -- cgit v1.2.3 From 7414af30b7d80f117bed5ad6769e6ffb433573ba Mon Sep 17 00:00:00 2001 From: Tobias Waldekranz Date: Wed, 16 Mar 2022 16:08:54 +0100 Subject: net: dsa: Handle MST state changes Add the usual trampoline functionality from the generic DSA layer down to the drivers for MST state changes. When a state changes to disabled/blocking/listening, make sure to fast age any dynamic entries in the affected VLANs (those controlled by the MSTI in question). Signed-off-by: Tobias Waldekranz Reviewed-by: Vladimir Oltean Signed-off-by: Jakub Kicinski --- include/net/dsa.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/net') diff --git a/include/net/dsa.h b/include/net/dsa.h index 644fda2293a2..06cdefd3b9dd 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -957,7 +957,10 @@ struct dsa_switch_ops { struct dsa_bridge bridge); void (*port_stp_state_set)(struct dsa_switch *ds, int port, u8 state); + int (*port_mst_state_set)(struct dsa_switch *ds, int port, + const struct switchdev_mst_state *state); void (*port_fast_age)(struct dsa_switch *ds, int port); + int (*port_vlan_fast_age)(struct dsa_switch *ds, int port, u16 vid); int (*port_pre_bridge_flags)(struct dsa_switch *ds, int port, struct switchdev_brport_flags flags, struct netlink_ext_ack *extack); -- cgit v1.2.3 From 0148bb50b8fd51baf357de8b237c0c6011506540 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Wed, 16 Mar 2022 22:41:43 +0200 Subject: net: dsa: pass extack to dsa_switch_ops :: port_mirror_add() Drivers might have error messages to propagate to user space, most common being that they support a single mirror port. Propagate the netlink extack so that they can inform user space in a verbal way of their limitations. Signed-off-by: Vladimir Oltean Signed-off-by: Jakub Kicinski --- include/net/dsa.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/dsa.h b/include/net/dsa.h index 06cdefd3b9dd..934958fda962 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -1028,7 +1028,7 @@ struct dsa_switch_ops { struct flow_cls_offload *cls, bool ingress); int (*port_mirror_add)(struct dsa_switch *ds, int port, struct dsa_mall_mirror_tc_entry *mirror, - bool ingress); + bool ingress, struct netlink_ext_ack *extack); void (*port_mirror_del)(struct dsa_switch *ds, int port, struct dsa_mall_mirror_tc_entry *mirror); int (*port_policer_add)(struct dsa_switch *ds, int port, -- cgit v1.2.3 From b58b1f563ab78955d37e9e43e02790a85c66ac05 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Mon, 14 Mar 2022 11:38:22 +0100 Subject: xfrm: rework default policy structure This is a follow up of commit f8d858e607b2 ("xfrm: make user policy API complete"). The goal is to align userland API to the internal structures. Signed-off-by: Nicolas Dichtel Reviewed-by: Antony Antony Signed-off-by: Steffen Klassert --- include/net/netns/xfrm.h | 6 +----- include/net/xfrm.h | 48 ++++++++++++++++++------------------------------ 2 files changed, 19 insertions(+), 35 deletions(-) (limited to 'include/net') diff --git a/include/net/netns/xfrm.h b/include/net/netns/xfrm.h index 947733a639a6..bd7c3be4af5d 100644 --- a/include/net/netns/xfrm.h +++ b/include/net/netns/xfrm.h @@ -66,11 +66,7 @@ struct netns_xfrm { int sysctl_larval_drop; u32 sysctl_acq_expires; - u8 policy_default; -#define XFRM_POL_DEFAULT_IN 1 -#define XFRM_POL_DEFAULT_OUT 2 -#define XFRM_POL_DEFAULT_FWD 4 -#define XFRM_POL_DEFAULT_MASK 7 + u8 policy_default[XFRM_POLICY_MAX]; #ifdef CONFIG_SYSCTL struct ctl_table_header *sysctl_hdr; diff --git a/include/net/xfrm.h b/include/net/xfrm.h index fdb41e8bb626..1541ca0cb13c 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1081,25 +1081,18 @@ xfrm_state_addr_cmp(const struct xfrm_tmpl *tmpl, const struct xfrm_state *x, un } #ifdef CONFIG_XFRM -static inline bool -xfrm_default_allow(struct net *net, int dir) -{ - u8 def = net->xfrm.policy_default; - - switch (dir) { - case XFRM_POLICY_IN: - return def & XFRM_POL_DEFAULT_IN ? false : true; - case XFRM_POLICY_OUT: - return def & XFRM_POL_DEFAULT_OUT ? false : true; - case XFRM_POLICY_FWD: - return def & XFRM_POL_DEFAULT_FWD ? false : true; - } - return false; -} - int __xfrm_policy_check(struct sock *, int dir, struct sk_buff *skb, unsigned short family); +static inline bool __xfrm_check_nopolicy(struct net *net, struct sk_buff *skb, + int dir) +{ + if (!net->xfrm.policy_count[dir] && !secpath_exists(skb)) + return net->xfrm.policy_default[dir] == XFRM_USERPOLICY_ACCEPT; + + return false; +} + static inline int __xfrm_policy_check2(struct sock *sk, int dir, struct sk_buff *skb, unsigned int family, int reverse) @@ -1110,13 +1103,9 @@ static inline int __xfrm_policy_check2(struct sock *sk, int dir, if (sk && sk->sk_policy[XFRM_POLICY_IN]) return __xfrm_policy_check(sk, ndir, skb, family); - if (xfrm_default_allow(net, dir)) - return (!net->xfrm.policy_count[dir] && !secpath_exists(skb)) || - (skb_dst(skb) && (skb_dst(skb)->flags & DST_NOPOLICY)) || - __xfrm_policy_check(sk, ndir, skb, family); - else - return (skb_dst(skb) && (skb_dst(skb)->flags & DST_NOPOLICY)) || - __xfrm_policy_check(sk, ndir, skb, family); + return __xfrm_check_nopolicy(net, skb, dir) || + (skb_dst(skb) && (skb_dst(skb)->flags & DST_NOPOLICY)) || + __xfrm_policy_check(sk, ndir, skb, family); } static inline int xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, unsigned short family) @@ -1168,13 +1157,12 @@ static inline int xfrm_route_forward(struct sk_buff *skb, unsigned short family) { struct net *net = dev_net(skb->dev); - if (xfrm_default_allow(net, XFRM_POLICY_OUT)) - return !net->xfrm.policy_count[XFRM_POLICY_OUT] || - (skb_dst(skb)->flags & DST_NOXFRM) || - __xfrm_route_forward(skb, family); - else - return (skb_dst(skb)->flags & DST_NOXFRM) || - __xfrm_route_forward(skb, family); + if (!net->xfrm.policy_count[XFRM_POLICY_OUT] && + net->xfrm.policy_default[XFRM_POLICY_OUT] == XFRM_USERPOLICY_ACCEPT) + return true; + + return (skb_dst(skb)->flags & DST_NOXFRM) || + __xfrm_route_forward(skb, family); } static inline int xfrm4_route_forward(struct sk_buff *skb) -- cgit v1.2.3 From 0eaecfb2e4814d51ab172df3823e35d7c488b6d2 Mon Sep 17 00:00:00 2001 From: Ismael Ferreras Morezuelas Date: Mon, 7 Mar 2022 21:04:44 +0100 Subject: Bluetooth: hci_sync: Add a new quirk to skip HCI_FLT_CLEAR_ALL Some controllers have problems with being sent a command to clear all filtering. While the HCI code does not unconditionally send a clear-all anymore at BR/EDR setup (after the state machine refactor), there might be more ways of hitting these codepaths in the future as the kernel develops. Cc: stable@vger.kernel.org Cc: Hans de Goede Signed-off-by: Ismael Ferreras Morezuelas Reviewed-by: Hans de Goede Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/net') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 35c073d44ec5..5cb095b09a94 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -255,6 +255,16 @@ enum { * during the hdev->setup vendor callback. */ HCI_QUIRK_BROKEN_READ_TRANSMIT_POWER, + + /* When this quirk is set, HCI_OP_SET_EVENT_FLT requests with + * HCI_FLT_CLEAR_ALL are ignored and event filtering is + * completely avoided. A subset of the CSR controller + * clones struggle with this and instantly lock up. + * + * Note that devices using this must (separately) disable + * runtime suspend, because event filtering takes place there. + */ + HCI_QUIRK_BROKEN_FILTER_CLEAR_ALL, }; /* HCI device flags */ -- cgit v1.2.3 From da8912176fb0ff9fd60e14fa653108d96422b896 Mon Sep 17 00:00:00 2001 From: Gavin Li Date: Mon, 14 Mar 2022 15:42:52 -0700 Subject: Bluetooth: fix incorrect nonblock bitmask in bt_sock_wait_ready() Callers pass msg->msg_flags as flags, which contains MSG_DONTWAIT instead of O_NONBLOCK. Signed-off-by: Gavin Li Signed-off-by: Marcel Holtmann --- include/net/bluetooth/bluetooth.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h index 2aa5e95808a5..6b48d9e2aab9 100644 --- a/include/net/bluetooth/bluetooth.h +++ b/include/net/bluetooth/bluetooth.h @@ -345,7 +345,7 @@ int bt_sock_stream_recvmsg(struct socket *sock, struct msghdr *msg, __poll_t bt_sock_poll(struct file *file, struct socket *sock, poll_table *wait); int bt_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg); int bt_sock_wait_state(struct sock *sk, int state, unsigned long timeo); -int bt_sock_wait_ready(struct sock *sk, unsigned long flags); +int bt_sock_wait_ready(struct sock *sk, unsigned int msg_flags); void bt_accept_enqueue(struct sock *parent, struct sock *sk, bool bh); void bt_accept_unlink(struct sock *sk); -- cgit v1.2.3 From 31d0bb9763efad30377505f3467f958d1ebe1e3d Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Wed, 2 Mar 2022 22:02:55 +0100 Subject: netfilter: conntrack: Add and use nf_ct_set_auto_assign_helper_warned() The function sets the pernet boolean to avoid the spurious warning from nf_ct_lookup_helper() when assigning conntrack helpers via nftables. Fixes: 1a64edf54f55 ("netfilter: nft_ct: add helper set support") Signed-off-by: Phil Sutter Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_helper.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/netfilter/nf_conntrack_helper.h b/include/net/netfilter/nf_conntrack_helper.h index 37f0fbefb060..9939c366f720 100644 --- a/include/net/netfilter/nf_conntrack_helper.h +++ b/include/net/netfilter/nf_conntrack_helper.h @@ -177,4 +177,5 @@ void nf_nat_helper_unregister(struct nf_conntrack_nat_helper *nat); int nf_nat_helper_try_module_get(const char *name, u16 l3num, u8 protonum); void nf_nat_helper_put(struct nf_conntrack_helper *helper); +void nf_ct_set_auto_assign_helper_warned(struct net *net); #endif /*_NF_CONNTRACK_HELPER_H*/ -- cgit v1.2.3 From b2d306542ff935a4edf7a88ba8145c108193442a Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 14 Mar 2022 18:23:00 +0100 Subject: netfilter: nf_tables: do not reduce read-only expressions Skip register tracking for expressions that perform read-only operations on the registers. Define and use a cookie pointer NFT_REDUCE_READONLY to avoid defining stubs for these expressions. This patch re-enables register tracking which was disabled in ed5f85d42290 ("netfilter: nf_tables: disable register tracking"). Follow up patches add remaining register tracking for existing expressions. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/net') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index c4c0861deac1..edabfb9e97ce 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -1633,4 +1633,12 @@ static inline struct nftables_pernet *nft_pernet(const struct net *net) return net_generic(net, nf_tables_net_id); } +#define __NFT_REDUCE_READONLY 1UL +#define NFT_REDUCE_READONLY (void *)__NFT_REDUCE_READONLY + +static inline bool nft_reduce_is_readonly(const struct nft_expr *expr) +{ + return expr->ops->reduce == NFT_REDUCE_READONLY; +} + #endif /* _NET_NF_TABLES_H */ -- cgit v1.2.3 From 34cc9e52884a16c62acbfb309863fb60e4c24f55 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 14 Mar 2022 18:23:01 +0100 Subject: netfilter: nf_tables: cancel tracking for clobbered destination registers Output of expressions might be larger than one single register, this might clobber existing data. Reset tracking for all destination registers that required to store the expression output. This patch adds three new helper functions: - nft_reg_track_update: cancel previous register tracking and update it. - nft_reg_track_cancel: cancel any previous register tracking info. - __nft_reg_track_cancel: cancel only one single register tracking info. Partial register clobbering detection is also supported by checking the .num_reg field which describes the number of register that are used. This patch updates the following expressions: - meta_bridge - bitwise - byteorder - meta - payload to use these helper functions. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 14 ++++++++++++++ include/net/netfilter/nft_meta.h | 1 + 2 files changed, 15 insertions(+) (limited to 'include/net') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index edabfb9e97ce..20af9d3557b9 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -126,6 +126,7 @@ struct nft_regs_track { struct { const struct nft_expr *selector; const struct nft_expr *bitwise; + u8 num_reg; } regs[NFT_REG32_NUM]; const struct nft_expr *cur; @@ -1641,4 +1642,17 @@ static inline bool nft_reduce_is_readonly(const struct nft_expr *expr) return expr->ops->reduce == NFT_REDUCE_READONLY; } +void nft_reg_track_update(struct nft_regs_track *track, + const struct nft_expr *expr, u8 dreg, u8 len); +void nft_reg_track_cancel(struct nft_regs_track *track, u8 dreg, u8 len); +void __nft_reg_track_cancel(struct nft_regs_track *track, u8 dreg); + +static inline bool nft_reg_track_cmp(struct nft_regs_track *track, + const struct nft_expr *expr, u8 dreg) +{ + return track->regs[dreg].selector && + track->regs[dreg].selector->ops == expr->ops && + track->regs[dreg].num_reg == 0; +} + #endif /* _NET_NF_TABLES_H */ diff --git a/include/net/netfilter/nft_meta.h b/include/net/netfilter/nft_meta.h index 2dce55c736f4..246fd023dcf4 100644 --- a/include/net/netfilter/nft_meta.h +++ b/include/net/netfilter/nft_meta.h @@ -6,6 +6,7 @@ struct nft_meta { enum nft_meta_keys key:8; + u8 len; union { u8 dreg; u8 sreg; -- cgit v1.2.3 From aaa7b20bd4d637fd4ef0d72b6c828c061b9bc5f7 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 14 Mar 2022 18:23:04 +0100 Subject: netfilter: nft_meta: extend reduce support to bridge family its enough to export the meta get reduce helper and then call it from nft_meta_bridge too. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nft_meta.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/net') diff --git a/include/net/netfilter/nft_meta.h b/include/net/netfilter/nft_meta.h index 246fd023dcf4..9b51cc67de54 100644 --- a/include/net/netfilter/nft_meta.h +++ b/include/net/netfilter/nft_meta.h @@ -44,4 +44,6 @@ int nft_meta_set_validate(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nft_data **data); +bool nft_meta_get_reduce(struct nft_regs_track *track, + const struct nft_expr *expr); #endif -- cgit v1.2.3 From 3c1eb413a45b6c6327fed394705081ec6202b31a Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 14 Mar 2022 18:23:12 +0100 Subject: netfilter: nft_fib: add reduce support The fib expression stores to a register, so we can't add empty stub. Check that the register that is being written is in fact redundant. In most cases, this is expected to cancel tracking as re-use is unlikely. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nft_fib.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/net') diff --git a/include/net/netfilter/nft_fib.h b/include/net/netfilter/nft_fib.h index 237f3757637e..eed099eae672 100644 --- a/include/net/netfilter/nft_fib.h +++ b/include/net/netfilter/nft_fib.h @@ -37,4 +37,7 @@ void nft_fib6_eval(const struct nft_expr *expr, struct nft_regs *regs, void nft_fib_store_result(void *reg, const struct nft_fib *priv, const struct net_device *dev); + +bool nft_fib_reduce(struct nft_regs_track *track, + const struct nft_expr *expr); #endif -- cgit v1.2.3 From 8879b32a3a809a183e6e2998725a0f33c4e42d41 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 18 Mar 2022 12:23:41 -0700 Subject: devlink: add explicitly locked flavor of the rate node APIs We'll need an explicitly locked rate node API for netdevsim to switch eswitch mode setting to locked. Reviewed-by: Jiri Pirko Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/net/devlink.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/net') diff --git a/include/net/devlink.h b/include/net/devlink.h index fd89a17adea1..a30180c0988a 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -1490,6 +1490,10 @@ int devl_port_register(struct devlink *devlink, unsigned int port_index); void devl_port_unregister(struct devlink_port *devlink_port); +int devl_rate_leaf_create(struct devlink_port *port, void *priv); +void devl_rate_leaf_destroy(struct devlink_port *devlink_port); +void devl_rate_nodes_destroy(struct devlink *devlink); + struct ib_device; struct net *devlink_net(const struct devlink *devlink); -- cgit v1.2.3 From 054d5575cd6ed2792611a7cbb8c88663cc873780 Mon Sep 17 00:00:00 2001 From: Louis Peens Date: Wed, 23 Mar 2022 11:25:06 +0200 Subject: net/sched: fix incorrect vlan_push_eth dest field Seems like a potential copy-paste bug slipped in here, the second memcpy should of course be populating src and not dest. Fixes: ab95465cde23 ("net/sched: add vlan push_eth and pop_eth action to the hardware IR") Signed-off-by: Louis Peens Link: https://lore.kernel.org/r/20220323092506.21639-1-louis.peens@corigine.com Signed-off-by: Jakub Kicinski --- include/net/tc_act/tc_vlan.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/tc_act/tc_vlan.h b/include/net/tc_act/tc_vlan.h index a97600f742de..904eddfc1826 100644 --- a/include/net/tc_act/tc_vlan.h +++ b/include/net/tc_act/tc_vlan.h @@ -84,7 +84,7 @@ static inline void tcf_vlan_push_eth(unsigned char *src, unsigned char *dest, { rcu_read_lock(); memcpy(dest, rcu_dereference(to_vlan(a)->vlan_p)->tcfv_push_dst, ETH_ALEN); - memcpy(dest, rcu_dereference(to_vlan(a)->vlan_p)->tcfv_push_src, ETH_ALEN); + memcpy(src, rcu_dereference(to_vlan(a)->vlan_p)->tcfv_push_src, ETH_ALEN); rcu_read_unlock(); } -- cgit v1.2.3 From 4a9dda1c1da65beee994f0977a56a9a21c5db2a7 Mon Sep 17 00:00:00 2001 From: Matt Johnston Date: Fri, 1 Apr 2022 10:48:44 +0800 Subject: mctp: Use output netdev to allocate skb headroom Previously the skb was allocated with headroom MCTP_HEADER_MAXLEN, but that isn't sufficient if we are using devs that are not MCTP specific. This also adds a check that the smctp_halen provided to sendmsg for extended addressing is the correct size for the netdev. Fixes: 833ef3b91de6 ("mctp: Populate socket implementation") Reported-by: Matthew Rinaldi Signed-off-by: Matt Johnston Signed-off-by: David S. Miller --- include/net/mctp.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/net') diff --git a/include/net/mctp.h b/include/net/mctp.h index d37268fe6825..82800d521c3d 100644 --- a/include/net/mctp.h +++ b/include/net/mctp.h @@ -36,8 +36,6 @@ struct mctp_hdr { #define MCTP_HDR_TAG_SHIFT 0 #define MCTP_HDR_TAG_MASK GENMASK(2, 0) -#define MCTP_HEADER_MAXLEN 4 - #define MCTP_INITIAL_DEFAULT_NET 1 static inline bool mctp_address_unicast(mctp_eid_t eid) -- cgit v1.2.3