From b4c2b9593a1c4c3a718370e34af28e817fd5e5c6 Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Fri, 14 Jan 2022 22:09:49 +0530
Subject: net/netfilter: Add unstable CT lookup helpers for XDP and TC-BPF

This change adds conntrack lookup helpers using the unstable kfunc call
interface for the XDP and TC-BPF hooks. The primary usecase is
implementing a synproxy in XDP, see Maxim's patchset [0].

Export get_net_ns_by_id as nf_conntrack_bpf.c needs to call it.

This object is only built when CONFIG_DEBUG_INFO_BTF_MODULES is enabled.

  [0]: https://lore.kernel.org/bpf/20211019144655.3483197-1-maximmi@nvidia.com

Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20220114163953.1455836-7-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/net/netfilter/nf_conntrack_bpf.h | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)
 create mode 100644 include/net/netfilter/nf_conntrack_bpf.h

(limited to 'include/net')

diff --git a/include/net/netfilter/nf_conntrack_bpf.h b/include/net/netfilter/nf_conntrack_bpf.h
new file mode 100644
index 000000000000..a473b56842c5
--- /dev/null
+++ b/include/net/netfilter/nf_conntrack_bpf.h
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _NF_CONNTRACK_BPF_H
+#define _NF_CONNTRACK_BPF_H
+
+#include <linux/btf.h>
+#include <linux/kconfig.h>
+
+#if (IS_BUILTIN(CONFIG_NF_CONNTRACK) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) || \
+    (IS_MODULE(CONFIG_NF_CONNTRACK) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES))
+
+extern int register_nf_conntrack_bpf(void);
+
+#else
+
+static inline int register_nf_conntrack_bpf(void)
+{
+	return 0;
+}
+
+#endif
+
+#endif /* _NF_CONNTRACK_BPF_H */
-- 
cgit v1.2.3


From 3368aa357f3ba133ae65fc26c04d24a1447a3903 Mon Sep 17 00:00:00 2001
From: Manish Mandlik <mmandlik@google.com>
Date: Tue, 11 Jan 2022 08:14:25 -0800
Subject: Bluetooth: msft: Handle MSFT Monitor Device Event

Whenever the controller starts/stops monitoring a bt device, it sends
MSFT Monitor Device event. Add handler to read this vendor event.

Test performed:
- Verified by logs that the MSFT Monitor Device event is received from
  the controller whenever it starts/stops monitoring a device.

Signed-off-by: Manish Mandlik <mmandlik@google.com>
Reviewed-by: Miao-chen Chou <mcchou@google.com>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 include/net/bluetooth/hci_core.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'include/net')

diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h
index 586f69d084a2..639fb9f57ae7 100644
--- a/include/net/bluetooth/hci_core.h
+++ b/include/net/bluetooth/hci_core.h
@@ -258,6 +258,15 @@ struct adv_info {
 
 #define HCI_ADV_TX_POWER_NO_PREFERENCE 0x7F
 
+struct monitored_device {
+	struct list_head list;
+
+	bdaddr_t bdaddr;
+	__u8     addr_type;
+	__u16    handle;
+	bool     notified;
+};
+
 struct adv_pattern {
 	struct list_head list;
 	__u8 ad_type;
@@ -591,6 +600,8 @@ struct hci_dev {
 
 	struct delayed_work	interleave_scan;
 
+	struct list_head	monitored_devices;
+
 #if IS_ENABLED(CONFIG_BT_LEDS)
 	struct led_trigger	*power_led;
 #endif
-- 
cgit v1.2.3


From 8d7f167752c3e4c45a39e76ffa6f7209413d3fa6 Mon Sep 17 00:00:00 2001
From: Manish Mandlik <mmandlik@google.com>
Date: Tue, 11 Jan 2022 08:14:26 -0800
Subject: Bluetooth: mgmt: Add MGMT Adv Monitor Device Found/Lost events

This patch introduces two new MGMT events for notifying the bluetoothd
whenever the controller starts/stops monitoring a device.

Test performed:
- Verified by logs that the MSFT Monitor Device is received from the
  controller and the bluetoothd is notified whenever the controller
  starts/stops monitoring a device.

Signed-off-by: Manish Mandlik <mmandlik@google.com>
Reviewed-by: Miao-chen Chou <mcchou@google.com>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 include/net/bluetooth/hci_core.h |  3 +++
 include/net/bluetooth/mgmt.h     | 16 ++++++++++++++++
 2 files changed, 19 insertions(+)

(limited to 'include/net')

diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h
index 639fb9f57ae7..21eadb113a31 100644
--- a/include/net/bluetooth/hci_core.h
+++ b/include/net/bluetooth/hci_core.h
@@ -601,6 +601,7 @@ struct hci_dev {
 	struct delayed_work	interleave_scan;
 
 	struct list_head	monitored_devices;
+	bool			advmon_pend_notify;
 
 #if IS_ENABLED(CONFIG_BT_LEDS)
 	struct led_trigger	*power_led;
@@ -1858,6 +1859,8 @@ void mgmt_adv_monitor_removed(struct hci_dev *hdev, u16 handle);
 int mgmt_phy_configuration_changed(struct hci_dev *hdev, struct sock *skip);
 int mgmt_add_adv_patterns_monitor_complete(struct hci_dev *hdev, u8 status);
 int mgmt_remove_adv_monitor_complete(struct hci_dev *hdev, u8 status);
+void mgmt_adv_monitor_device_lost(struct hci_dev *hdev, u16 handle,
+				  bdaddr_t *bdaddr, u8 addr_type);
 
 u8 hci_le_conn_update(struct hci_conn *conn, u16 min, u16 max, u16 latency,
 		      u16 to_multiplier);
diff --git a/include/net/bluetooth/mgmt.h b/include/net/bluetooth/mgmt.h
index 107b25deae68..99266f7aebdc 100644
--- a/include/net/bluetooth/mgmt.h
+++ b/include/net/bluetooth/mgmt.h
@@ -1104,3 +1104,19 @@ struct mgmt_ev_controller_resume {
 #define MGMT_WAKE_REASON_NON_BT_WAKE		0x0
 #define MGMT_WAKE_REASON_UNEXPECTED		0x1
 #define MGMT_WAKE_REASON_REMOTE_WAKE		0x2
+
+#define MGMT_EV_ADV_MONITOR_DEVICE_FOUND	0x002f
+struct mgmt_ev_adv_monitor_device_found {
+	__le16 monitor_handle;
+	struct mgmt_addr_info addr;
+	__s8   rssi;
+	__le32 flags;
+	__le16 eir_len;
+	__u8   eir[0];
+} __packed;
+
+#define MGMT_EV_ADV_MONITOR_DEVICE_LOST		0x0030
+struct mgmt_ev_adv_monitor_device_lost {
+	__le16 monitor_handle;
+	struct mgmt_addr_info addr;
+} __packed;
-- 
cgit v1.2.3


From 2e88d4ff03013937028f5397268b21e10cf68713 Mon Sep 17 00:00:00 2001
From: Lorenzo Bianconi <lorenzo@kernel.org>
Date: Fri, 21 Jan 2022 11:09:45 +0100
Subject: xdp: introduce flags field in xdp_buff/xdp_frame

Introduce flags field in xdp_frame and xdp_buffer data structures
to define additional buffer features. At the moment the only
supported buffer feature is frags bit (XDP_FLAGS_HAS_FRAGS).
frags bit is used to specify if this is a linear buffer
(XDP_FLAGS_HAS_FRAGS not set) or a frags frame (XDP_FLAGS_HAS_FRAGS
set). In the latter case the driver is expected to initialize the
skb_shared_info structure at the end of the first buffer to link together
subsequent buffers belonging to the same frame.

Acked-by: Toke Hoiland-Jorgensen <toke@redhat.com>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Link: https://lore.kernel.org/r/e389f14f3a162c0a5bc6a2e1aa8dd01a90be117d.1642758637.git.lorenzo@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/net/xdp.h | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

(limited to 'include/net')

diff --git a/include/net/xdp.h b/include/net/xdp.h
index 8f0812e4996d..485e9495a690 100644
--- a/include/net/xdp.h
+++ b/include/net/xdp.h
@@ -66,6 +66,10 @@ struct xdp_txq_info {
 	struct net_device *dev;
 };
 
+enum xdp_buff_flags {
+	XDP_FLAGS_HAS_FRAGS	= BIT(0), /* non-linear xdp buff */
+};
+
 struct xdp_buff {
 	void *data;
 	void *data_end;
@@ -74,13 +78,30 @@ struct xdp_buff {
 	struct xdp_rxq_info *rxq;
 	struct xdp_txq_info *txq;
 	u32 frame_sz; /* frame size to deduce data_hard_end/reserved tailroom*/
+	u32 flags; /* supported values defined in xdp_buff_flags */
 };
 
+static __always_inline bool xdp_buff_has_frags(struct xdp_buff *xdp)
+{
+	return !!(xdp->flags & XDP_FLAGS_HAS_FRAGS);
+}
+
+static __always_inline void xdp_buff_set_frags_flag(struct xdp_buff *xdp)
+{
+	xdp->flags |= XDP_FLAGS_HAS_FRAGS;
+}
+
+static __always_inline void xdp_buff_clear_frags_flag(struct xdp_buff *xdp)
+{
+	xdp->flags &= ~XDP_FLAGS_HAS_FRAGS;
+}
+
 static __always_inline void
 xdp_init_buff(struct xdp_buff *xdp, u32 frame_sz, struct xdp_rxq_info *rxq)
 {
 	xdp->frame_sz = frame_sz;
 	xdp->rxq = rxq;
+	xdp->flags = 0;
 }
 
 static __always_inline void
@@ -122,8 +143,14 @@ struct xdp_frame {
 	 */
 	struct xdp_mem_info mem;
 	struct net_device *dev_rx; /* used by cpumap */
+	u32 flags; /* supported values defined in xdp_buff_flags */
 };
 
+static __always_inline bool xdp_frame_has_frags(struct xdp_frame *frame)
+{
+	return !!(frame->flags & XDP_FLAGS_HAS_FRAGS);
+}
+
 #define XDP_BULK_QUEUE_SIZE	16
 struct xdp_frame_bulk {
 	int count;
@@ -180,6 +207,7 @@ void xdp_convert_frame_to_buff(struct xdp_frame *frame, struct xdp_buff *xdp)
 	xdp->data_end = frame->data + frame->len;
 	xdp->data_meta = frame->data - frame->metasize;
 	xdp->frame_sz = frame->frame_sz;
+	xdp->flags = frame->flags;
 }
 
 static inline
@@ -206,6 +234,7 @@ int xdp_update_frame_from_buff(struct xdp_buff *xdp,
 	xdp_frame->headroom = headroom - sizeof(*xdp_frame);
 	xdp_frame->metasize = metasize;
 	xdp_frame->frame_sz = xdp->frame_sz;
+	xdp_frame->flags = xdp->flags;
 
 	return 0;
 }
-- 
cgit v1.2.3


From d65a1906b31246492449eafe9cace188cb59e26c Mon Sep 17 00:00:00 2001
From: Lorenzo Bianconi <lorenzo@kernel.org>
Date: Fri, 21 Jan 2022 11:09:48 +0100
Subject: net: xdp: add xdp_update_skb_shared_info utility routine

Introduce xdp_update_skb_shared_info routine to update frags array
metadata in skb_shared_info data structure converting to a skb from
a xdp_buff or xdp_frame.
According to the current skb_shared_info architecture in
xdp_frame/xdp_buff and to the xdp frags support, there is
no need to run skb_add_rx_frag() and reset frags array converting the buffer
to a skb since the frag array will be in the same position for xdp_buff/xdp_frame
and for the skb, we just need to update memory metadata.
Introduce XDP_FLAGS_PF_MEMALLOC flag in xdp_buff_flags in order to mark
the xdp_buff or xdp_frame as under memory-pressure if pages of the frags array
are under memory pressure. Doing so we can avoid looping over all fragments in
xdp_update_skb_shared_info routine. The driver is expected to set the
flag constructing the xdp_buffer using xdp_buff_set_frag_pfmemalloc
utility routine.
Rely on xdp_update_skb_shared_info in __xdp_build_skb_from_frame routine
converting the non-linear xdp_frame to a skb after performing a XDP_REDIRECT.

Acked-by: Toke Hoiland-Jorgensen <toke@redhat.com>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Link: https://lore.kernel.org/r/bfd23fb8a8d7438724f7819c567cdf99ffd6226f.1642758637.git.lorenzo@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/net/xdp.h | 33 ++++++++++++++++++++++++++++++++-
 1 file changed, 32 insertions(+), 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/xdp.h b/include/net/xdp.h
index 485e9495a690..1f8641ec658e 100644
--- a/include/net/xdp.h
+++ b/include/net/xdp.h
@@ -67,7 +67,10 @@ struct xdp_txq_info {
 };
 
 enum xdp_buff_flags {
-	XDP_FLAGS_HAS_FRAGS	= BIT(0), /* non-linear xdp buff */
+	XDP_FLAGS_HAS_FRAGS		= BIT(0), /* non-linear xdp buff */
+	XDP_FLAGS_FRAGS_PF_MEMALLOC	= BIT(1), /* xdp paged memory is under
+						   * pressure
+						   */
 };
 
 struct xdp_buff {
@@ -96,6 +99,16 @@ static __always_inline void xdp_buff_clear_frags_flag(struct xdp_buff *xdp)
 	xdp->flags &= ~XDP_FLAGS_HAS_FRAGS;
 }
 
+static __always_inline bool xdp_buff_is_frag_pfmemalloc(struct xdp_buff *xdp)
+{
+	return !!(xdp->flags & XDP_FLAGS_FRAGS_PF_MEMALLOC);
+}
+
+static __always_inline void xdp_buff_set_frag_pfmemalloc(struct xdp_buff *xdp)
+{
+	xdp->flags |= XDP_FLAGS_FRAGS_PF_MEMALLOC;
+}
+
 static __always_inline void
 xdp_init_buff(struct xdp_buff *xdp, u32 frame_sz, struct xdp_rxq_info *rxq)
 {
@@ -151,6 +164,11 @@ static __always_inline bool xdp_frame_has_frags(struct xdp_frame *frame)
 	return !!(frame->flags & XDP_FLAGS_HAS_FRAGS);
 }
 
+static __always_inline bool xdp_frame_is_frag_pfmemalloc(struct xdp_frame *frame)
+{
+	return !!(frame->flags & XDP_FLAGS_FRAGS_PF_MEMALLOC);
+}
+
 #define XDP_BULK_QUEUE_SIZE	16
 struct xdp_frame_bulk {
 	int count;
@@ -186,6 +204,19 @@ static inline void xdp_scrub_frame(struct xdp_frame *frame)
 	frame->dev_rx = NULL;
 }
 
+static inline void
+xdp_update_skb_shared_info(struct sk_buff *skb, u8 nr_frags,
+			   unsigned int size, unsigned int truesize,
+			   bool pfmemalloc)
+{
+	skb_shinfo(skb)->nr_frags = nr_frags;
+
+	skb->len += size;
+	skb->data_len += size;
+	skb->truesize += truesize;
+	skb->pfmemalloc |= pfmemalloc;
+}
+
 /* Avoids inlining WARN macro in fast-path */
 void xdp_warn(const char *msg, const char *func, const int line);
 #define XDP_WARN(msg) xdp_warn(msg, __func__, __LINE__)
-- 
cgit v1.2.3


From 7c48cb0176c6d6d3b55029f7ff4ffa05faee6446 Mon Sep 17 00:00:00 2001
From: Lorenzo Bianconi <lorenzo@kernel.org>
Date: Fri, 21 Jan 2022 11:09:50 +0100
Subject: xdp: add frags support to xdp_return_{buff/frame}

Take into account if the received xdp_buff/xdp_frame is non-linear
recycling/returning the frame memory to the allocator or into
xdp_frame_bulk.

Acked-by: Toke Hoiland-Jorgensen <toke@redhat.com>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Link: https://lore.kernel.org/r/a961069febc868508ce1bdf5e53a343eb4e57cb2.1642758637.git.lorenzo@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/net/xdp.h | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

(limited to 'include/net')

diff --git a/include/net/xdp.h b/include/net/xdp.h
index 1f8641ec658e..8463dea8b4db 100644
--- a/include/net/xdp.h
+++ b/include/net/xdp.h
@@ -306,10 +306,24 @@ void __xdp_release_frame(void *data, struct xdp_mem_info *mem);
 static inline void xdp_release_frame(struct xdp_frame *xdpf)
 {
 	struct xdp_mem_info *mem = &xdpf->mem;
+	struct skb_shared_info *sinfo;
+	int i;
 
 	/* Curr only page_pool needs this */
-	if (mem->type == MEM_TYPE_PAGE_POOL)
-		__xdp_release_frame(xdpf->data, mem);
+	if (mem->type != MEM_TYPE_PAGE_POOL)
+		return;
+
+	if (likely(!xdp_frame_has_frags(xdpf)))
+		goto out;
+
+	sinfo = xdp_get_shared_info_from_frame(xdpf);
+	for (i = 0; i < sinfo->nr_frags; i++) {
+		struct page *page = skb_frag_page(&sinfo->frags[i]);
+
+		__xdp_release_frame(page_address(page), mem);
+	}
+out:
+	__xdp_release_frame(xdpf->data, mem);
 }
 
 int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq,
-- 
cgit v1.2.3


From 0165cc817075cf701e4289838f1d925ff1911b3e Mon Sep 17 00:00:00 2001
From: Lorenzo Bianconi <lorenzo@kernel.org>
Date: Fri, 21 Jan 2022 11:09:54 +0100
Subject: bpf: introduce bpf_xdp_get_buff_len helper

Introduce bpf_xdp_get_buff_len helper in order to return the xdp buffer
total size (linear and paged area)

Acked-by: Toke Hoiland-Jorgensen <toke@redhat.com>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Link: https://lore.kernel.org/r/aac9ac3504c84026cf66a3c71b7c5ae89bc991be.1642758637.git.lorenzo@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/net/xdp.h | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

(limited to 'include/net')

diff --git a/include/net/xdp.h b/include/net/xdp.h
index 8463dea8b4db..52b593321956 100644
--- a/include/net/xdp.h
+++ b/include/net/xdp.h
@@ -145,6 +145,20 @@ xdp_get_shared_info_from_buff(struct xdp_buff *xdp)
 	return (struct skb_shared_info *)xdp_data_hard_end(xdp);
 }
 
+static __always_inline unsigned int xdp_get_buff_len(struct xdp_buff *xdp)
+{
+	unsigned int len = xdp->data_end - xdp->data;
+	struct skb_shared_info *sinfo;
+
+	if (likely(!xdp_buff_has_frags(xdp)))
+		goto out;
+
+	sinfo = xdp_get_shared_info_from_buff(xdp);
+	len += sinfo->xdp_frags_size;
+out:
+	return len;
+}
+
 struct xdp_frame {
 	void *data;
 	u16 len;
-- 
cgit v1.2.3


From bf25146a5595269810b1f47d048f114c5ff9f544 Mon Sep 17 00:00:00 2001
From: Eelco Chaudron <echaudro@redhat.com>
Date: Fri, 21 Jan 2022 11:09:55 +0100
Subject: bpf: add frags support to the bpf_xdp_adjust_tail() API

This change adds support for tail growing and shrinking for XDP frags.

When called on a non-linear packet with a grow request, it will work
on the last fragment of the packet. So the maximum grow size is the
last fragments tailroom, i.e. no new buffer will be allocated.
A XDP frags capable driver is expected to set frag_size in xdp_rxq_info
data structure to notify the XDP core the fragment size.
frag_size set to 0 is interpreted by the XDP core as tail growing is
not allowed.
Introduce __xdp_rxq_info_reg utility routine to initialize frag_size field.

When shrinking, it will work from the last fragment, all the way down to
the base buffer depending on the shrinking size. It's important to mention
that once you shrink down the fragment(s) are freed, so you can not grow
again to the original size.

Acked-by: Toke Hoiland-Jorgensen <toke@redhat.com>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Acked-by: Jakub Kicinski <kuba@kernel.org>
Co-developed-by: Lorenzo Bianconi <lorenzo@kernel.org>
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Signed-off-by: Eelco Chaudron <echaudro@redhat.com>
Link: https://lore.kernel.org/r/eabda3485dda4f2f158b477729337327e609461d.1642758637.git.lorenzo@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/net/xdp.h | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

(limited to 'include/net')

diff --git a/include/net/xdp.h b/include/net/xdp.h
index 52b593321956..b7721c3e4d1f 100644
--- a/include/net/xdp.h
+++ b/include/net/xdp.h
@@ -60,6 +60,7 @@ struct xdp_rxq_info {
 	u32 reg_state;
 	struct xdp_mem_info mem;
 	unsigned int napi_id;
+	u32 frag_size;
 } ____cacheline_aligned; /* perf critical, avoid false-sharing */
 
 struct xdp_txq_info {
@@ -304,6 +305,8 @@ struct xdp_frame *xdp_convert_buff_to_frame(struct xdp_buff *xdp)
 	return xdp_frame;
 }
 
+void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct,
+		  struct xdp_buff *xdp);
 void xdp_return_frame(struct xdp_frame *xdpf);
 void xdp_return_frame_rx_napi(struct xdp_frame *xdpf);
 void xdp_return_buff(struct xdp_buff *xdp);
@@ -340,8 +343,17 @@ out:
 	__xdp_release_frame(xdpf->data, mem);
 }
 
-int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq,
-		     struct net_device *dev, u32 queue_index, unsigned int napi_id);
+int __xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq,
+		       struct net_device *dev, u32 queue_index,
+		       unsigned int napi_id, u32 frag_size);
+static inline int
+xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq,
+		 struct net_device *dev, u32 queue_index,
+		 unsigned int napi_id)
+{
+	return __xdp_rxq_info_reg(xdp_rxq, dev, queue_index, napi_id, 0);
+}
+
 void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq);
 void xdp_rxq_info_unused(struct xdp_rxq_info *xdp_rxq);
 bool xdp_rxq_info_is_reg(struct xdp_rxq_info *xdp_rxq);
-- 
cgit v1.2.3


From d5ebaa7c5f6f688959e8d40840b2249ede63b8ed Mon Sep 17 00:00:00 2001
From: Soenke Huster <soenke.huster@eknoes.de>
Date: Sun, 23 Jan 2022 15:06:24 +0100
Subject: Bluetooth: hci_event: Ignore multiple conn complete events

When one of the three connection complete events is received multiple
times for the same handle, the device is registered multiple times which
leads to memory corruptions. Therefore, consequent events for a single
connection are ignored.

The conn->state can hold different values, therefore HCI_CONN_HANDLE_UNSET
is introduced to identify new connections. To make sure the events do not
contain this or another invalid handle HCI_CONN_HANDLE_MAX and checks
are introduced.

Buglink: https://bugzilla.kernel.org/show_bug.cgi?id=215497
Signed-off-by: Soenke Huster <soenke.huster@eknoes.de>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 include/net/bluetooth/hci_core.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/net')

diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h
index 21eadb113a31..f5caff1ddb29 100644
--- a/include/net/bluetooth/hci_core.h
+++ b/include/net/bluetooth/hci_core.h
@@ -303,6 +303,9 @@ struct adv_monitor {
 
 #define HCI_MAX_SHORT_NAME_LENGTH	10
 
+#define HCI_CONN_HANDLE_UNSET		0xffff
+#define HCI_CONN_HANDLE_MAX		0x0eff
+
 /* Min encryption key size to match with SMP */
 #define HCI_MIN_ENC_KEY_SIZE		7
 
-- 
cgit v1.2.3


From d507204d3c5cc57d9a8bdf0a477615bb59ea1611 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 24 Jan 2022 12:24:52 -0800
Subject: tcp/dccp: add tw->tw_bslot

We want to allow inet_twsk_kill() working even if netns
has been dismantled/freed, to get rid of inet_twsk_purge().

This patch adds tw->tw_bslot to cache the bind bucket slot
so that inet_twsk_kill() no longer needs to dereference twsk_net(tw)

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inet_timewait_sock.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/net')

diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h
index dfd919b3119e..c221fe2b77dd 100644
--- a/include/net/inet_timewait_sock.h
+++ b/include/net/inet_timewait_sock.h
@@ -72,6 +72,7 @@ struct inet_timewait_sock {
 				tw_tos		: 8;
 	u32			tw_txhash;
 	u32			tw_priority;
+	u32			tw_bslot; /* bind bucket slot */
 	struct timer_list	tw_timer;
 	struct inet_bind_bucket	*tw_tb;
 };
-- 
cgit v1.2.3


From 27dd35e02235902933626469a1c9198612d72564 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 24 Jan 2022 12:24:53 -0800
Subject: tcp/dccp: no longer use twsk_net(tw) from tw_timer_handler()

We will soon get rid of inet_twsk_purge().

This means that tw_timer_handler() might fire after
a netns has been dismantled/freed.

Instead of adding a function (and data structure) to find a netns
from tw->tw_net_cookie, just update the SNMP counters
a bit earlier, when the netns is known to be alive.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inet_timewait_sock.h | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'include/net')

diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h
index c221fe2b77dd..b323db969b8b 100644
--- a/include/net/inet_timewait_sock.h
+++ b/include/net/inet_timewait_sock.h
@@ -65,10 +65,9 @@ struct inet_timewait_sock {
 	/* these three are in inet_sock */
 	__be16			tw_sport;
 	/* And these are ours. */
-	unsigned int		tw_kill		: 1,
-				tw_transparent  : 1,
+	unsigned int		tw_transparent  : 1,
 				tw_flowlabel	: 20,
-				tw_pad		: 2,	/* 2 bits hole */
+				tw_pad		: 3,	/* 3 bits hole */
 				tw_tos		: 8;
 	u32			tw_txhash;
 	u32			tw_priority;
-- 
cgit v1.2.3


From 0dad4087a86a2cbe177404dc73f18ada26a2c390 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 24 Jan 2022 12:24:54 -0800
Subject: tcp/dccp: get rid of inet_twsk_purge()

Prior patches in the series made sure tw_timer_handler()
can be fired after netns has been dismantled/freed.

We no longer have to scan a potentially big TCP ehash
table at netns dismantle.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inet_timewait_sock.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/net')

diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h
index b323db969b8b..463ae5d33eb0 100644
--- a/include/net/inet_timewait_sock.h
+++ b/include/net/inet_timewait_sock.h
@@ -110,8 +110,6 @@ static inline void inet_twsk_reschedule(struct inet_timewait_sock *tw, int timeo
 
 void inet_twsk_deschedule_put(struct inet_timewait_sock *tw);
 
-void inet_twsk_purge(struct inet_hashinfo *hashinfo, int family);
-
 static inline
 struct net *twsk_net(const struct inet_timewait_sock *twsk)
 {
-- 
cgit v1.2.3


From a15c89c703d43490ea68ea4516553d4ea4f6b1e1 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 24 Jan 2022 12:24:55 -0800
Subject: ipv4: do not use per netns icmp sockets

Back in linux-2.6.25 (commit 4a6ad7a141cb "[NETNS]: Make icmp_sk per namespace."),
we added private per-cpu/per-netns ipv4 icmp sockets.

This adds memory and cpu costs, which do not seem needed.
Now typical servers have 256 or more cores, this adds considerable
tax to netns users.

icmp sockets are used from BH context, are not receiving packets,
and do not store any persistent state but the 'struct net' pointer.

icmp_xmit_lock() already makes sure to lock the chosen per-cpu
socket.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/netns/ipv4.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 78557643526e..639a31638159 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -70,7 +70,6 @@ struct netns_ipv4 {
 	struct hlist_head	*fib_table_hash;
 	struct sock		*fibnl;
 
-	struct sock  * __percpu	*icmp_sk;
 	struct sock		*mc_autojoin_sk;
 
 	struct inet_peer_base	*peers;
-- 
cgit v1.2.3


From 6a17b961ec19cd61ca646a6655ab93e8f6fe15c0 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 24 Jan 2022 12:24:56 -0800
Subject: ipv6: do not use per netns icmp sockets

Back in linux-2.6.25 (commit 98c6d1b261e7 "[NETNS]: Make icmpv6_sk per namespace.",
we added private per-cpu/per-netns ipv6 icmp sockets.

This adds memory and cpu costs, which do not seem needed.
Now typical servers have 256 or more cores, this adds considerable
tax to netns users.

icmp sockets are used from BH context, are not receiving packets,
and do not store any persistent state but the 'struct net' pointer.

icmpv6_xmit_lock() already makes sure to lock the chosen per-cpu
socket.

This patch has a considerable impact on the number of netns
that the worker thread in cleanup_net() can dismantle per second,
because ip6mr_sk_done() is no longer called, meaning we no longer
acquire the rtnl mutex, competing with other threads adding new netns.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/netns/ipv6.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h
index a4b550380316..30cdfc4e1615 100644
--- a/include/net/netns/ipv6.h
+++ b/include/net/netns/ipv6.h
@@ -88,7 +88,6 @@ struct netns_ipv6 {
 	struct fib6_table       *fib6_local_tbl;
 	struct fib_rules_ops    *fib6_rules_ops;
 #endif
-	struct sock * __percpu	*icmp_sk;
 	struct sock             *ndisc_sk;
 	struct sock             *tcp_sk;
 	struct sock             *igmp_sk;
-- 
cgit v1.2.3


From 37ba017dcc3b1123206808979834655ddcf93251 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 24 Jan 2022 12:24:57 -0800
Subject: ipv4/tcp: do not use per netns ctl sockets

TCP ipv4 uses per-cpu/per-netns ctl sockets in order to send
RST and some ACK packets (on behalf of TIMEWAIT sockets).

This adds memory and cpu costs, which do not seem needed.
Now typical servers have 256 or more cores, this adds considerable
tax to netns users.

tcp sockets are used from BH context, are not receiving packets,
and do not store any persistent state but the 'struct net' pointer
in order to be able to use IPv4 output functions.

Note that I attempted a related change in the past, that had
to be hot-fixed in commit bdbbb8527b6f ("ipv4: tcp: get rid of ugly unicast_sock")

This patch could very well surface old bugs, on layers not
taking care of sk->sk_kern_sock properly.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/netns/ipv4.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 639a31638159..22b4c6df1d2b 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -73,7 +73,6 @@ struct netns_ipv4 {
 	struct sock		*mc_autojoin_sk;
 
 	struct inet_peer_base	*peers;
-	struct sock  * __percpu	*tcp_sk;
 	struct fqdir		*fqdir;
 
 	u8 sysctl_icmp_echo_ignore_all;
-- 
cgit v1.2.3


From fbb8295248e1d6f576d444309fcf79356008eac1 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 26 Jan 2022 10:07:14 -0800
Subject: tcp: allocate tcp_death_row outside of struct netns_ipv4

I forgot tcp had per netns tracking of timewait sockets,
and their sysctl to change the limit.

After 0dad4087a86a ("tcp/dccp: get rid of inet_twsk_purge()"),
whole struct net can be freed before last tw socket is freed.

We need to allocate a separate struct inet_timewait_death_row
object per netns.

tw_count becomes a refcount and gains associated debugging infrastructure.

BUG: KASAN: use-after-free in inet_twsk_kill+0x358/0x3c0 net/ipv4/inet_timewait_sock.c:46
Read of size 8 at addr ffff88807d5f9f40 by task kworker/1:7/3690

CPU: 1 PID: 3690 Comm: kworker/1:7 Not tainted 5.16.0-syzkaller #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Workqueue: events pwq_unbound_release_workfn
Call Trace:
 <IRQ>
 __dump_stack lib/dump_stack.c:88 [inline]
 dump_stack_lvl+0xcd/0x134 lib/dump_stack.c:106
 print_address_description.constprop.0.cold+0x8d/0x336 mm/kasan/report.c:255
 __kasan_report mm/kasan/report.c:442 [inline]
 kasan_report.cold+0x83/0xdf mm/kasan/report.c:459
 inet_twsk_kill+0x358/0x3c0 net/ipv4/inet_timewait_sock.c:46
 call_timer_fn+0x1a5/0x6b0 kernel/time/timer.c:1421
 expire_timers kernel/time/timer.c:1466 [inline]
 __run_timers.part.0+0x67c/0xa30 kernel/time/timer.c:1734
 __run_timers kernel/time/timer.c:1715 [inline]
 run_timer_softirq+0xb3/0x1d0 kernel/time/timer.c:1747
 __do_softirq+0x29b/0x9c2 kernel/softirq.c:558
 invoke_softirq kernel/softirq.c:432 [inline]
 __irq_exit_rcu+0x123/0x180 kernel/softirq.c:637
 irq_exit_rcu+0x5/0x20 kernel/softirq.c:649
 sysvec_apic_timer_interrupt+0x93/0xc0 arch/x86/kernel/apic/apic.c:1097
 </IRQ>
 <TASK>
 asm_sysvec_apic_timer_interrupt+0x12/0x20 arch/x86/include/asm/idtentry.h:638
RIP: 0010:lockdep_unregister_key+0x1c9/0x250 kernel/locking/lockdep.c:6328
Code: 00 00 00 48 89 ee e8 46 fd ff ff 4c 89 f7 e8 5e c9 ff ff e8 09 cc ff ff 9c 58 f6 c4 02 75 26 41 f7 c4 00 02 00 00 74 01 fb 5b <5d> 41 5c 41 5d 41 5e 41 5f e9 19 4a 08 00 0f 0b 5b 5d 41 5c 41 5d
RSP: 0018:ffffc90004077cb8 EFLAGS: 00000206
RAX: 0000000000000046 RBX: ffff88807b61b498 RCX: 0000000000000001
RDX: dffffc0000000000 RSI: 0000000000000000 RDI: 0000000000000000
RBP: ffff888077027128 R08: 0000000000000001 R09: ffffffff8f1ea4fc
R10: fffffbfff1ff93ee R11: 000000000000af1e R12: 0000000000000246
R13: 0000000000000000 R14: ffffffff8ffc89b8 R15: ffffffff90157fb0
 wq_unregister_lockdep kernel/workqueue.c:3508 [inline]
 pwq_unbound_release_workfn+0x254/0x340 kernel/workqueue.c:3746
 process_one_work+0x9ac/0x1650 kernel/workqueue.c:2307
 worker_thread+0x657/0x1110 kernel/workqueue.c:2454
 kthread+0x2e9/0x3a0 kernel/kthread.c:377
 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:295
 </TASK>

Allocated by task 3635:
 kasan_save_stack+0x1e/0x50 mm/kasan/common.c:38
 kasan_set_track mm/kasan/common.c:46 [inline]
 set_alloc_info mm/kasan/common.c:437 [inline]
 __kasan_slab_alloc+0x90/0xc0 mm/kasan/common.c:470
 kasan_slab_alloc include/linux/kasan.h:260 [inline]
 slab_post_alloc_hook mm/slab.h:732 [inline]
 slab_alloc_node mm/slub.c:3230 [inline]
 slab_alloc mm/slub.c:3238 [inline]
 kmem_cache_alloc+0x202/0x3a0 mm/slub.c:3243
 kmem_cache_zalloc include/linux/slab.h:705 [inline]
 net_alloc net/core/net_namespace.c:407 [inline]
 copy_net_ns+0x125/0x760 net/core/net_namespace.c:462
 create_new_namespaces+0x3f6/0xb20 kernel/nsproxy.c:110
 unshare_nsproxy_namespaces+0xc1/0x1f0 kernel/nsproxy.c:226
 ksys_unshare+0x445/0x920 kernel/fork.c:3048
 __do_sys_unshare kernel/fork.c:3119 [inline]
 __se_sys_unshare kernel/fork.c:3117 [inline]
 __x64_sys_unshare+0x2d/0x40 kernel/fork.c:3117
 do_syscall_x64 arch/x86/entry/common.c:50 [inline]
 do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80
 entry_SYSCALL_64_after_hwframe+0x44/0xae

The buggy address belongs to the object at ffff88807d5f9a80
 which belongs to the cache net_namespace of size 6528
The buggy address is located 1216 bytes inside of
 6528-byte region [ffff88807d5f9a80, ffff88807d5fb400)
The buggy address belongs to the page:
page:ffffea0001f57e00 refcount:1 mapcount:0 mapping:0000000000000000 index:0xffff88807d5f9a80 pfn:0x7d5f8
head:ffffea0001f57e00 order:3 compound_mapcount:0 compound_pincount:0
memcg:ffff888070023001
flags: 0xfff00000010200(slab|head|node=0|zone=1|lastcpupid=0x7ff)
raw: 00fff00000010200 ffff888010dd4f48 ffffea0001404e08 ffff8880118fd000
raw: ffff88807d5f9a80 0000000000040002 00000001ffffffff ffff888070023001
page dumped because: kasan: bad access detected
page_owner tracks the page as allocated
page last allocated via order 3, migratetype Unmovable, gfp_mask 0xd20c0(__GFP_IO|__GFP_FS|__GFP_NOWARN|__GFP_NORETRY|__GFP_COMP|__GFP_NOMEMALLOC), pid 3634, ts 119694798460, free_ts 119693556950
 prep_new_page mm/page_alloc.c:2434 [inline]
 get_page_from_freelist+0xa72/0x2f50 mm/page_alloc.c:4165
 __alloc_pages+0x1b2/0x500 mm/page_alloc.c:5389
 alloc_pages+0x1aa/0x310 mm/mempolicy.c:2271
 alloc_slab_page mm/slub.c:1799 [inline]
 allocate_slab mm/slub.c:1944 [inline]
 new_slab+0x28a/0x3b0 mm/slub.c:2004
 ___slab_alloc+0x87c/0xe90 mm/slub.c:3018
 __slab_alloc.constprop.0+0x4d/0xa0 mm/slub.c:3105
 slab_alloc_node mm/slub.c:3196 [inline]
 slab_alloc mm/slub.c:3238 [inline]
 kmem_cache_alloc+0x35c/0x3a0 mm/slub.c:3243
 kmem_cache_zalloc include/linux/slab.h:705 [inline]
 net_alloc net/core/net_namespace.c:407 [inline]
 copy_net_ns+0x125/0x760 net/core/net_namespace.c:462
 create_new_namespaces+0x3f6/0xb20 kernel/nsproxy.c:110
 unshare_nsproxy_namespaces+0xc1/0x1f0 kernel/nsproxy.c:226
 ksys_unshare+0x445/0x920 kernel/fork.c:3048
 __do_sys_unshare kernel/fork.c:3119 [inline]
 __se_sys_unshare kernel/fork.c:3117 [inline]
 __x64_sys_unshare+0x2d/0x40 kernel/fork.c:3117
 do_syscall_x64 arch/x86/entry/common.c:50 [inline]
 do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80
 entry_SYSCALL_64_after_hwframe+0x44/0xae
page last free stack trace:
 reset_page_owner include/linux/page_owner.h:24 [inline]
 free_pages_prepare mm/page_alloc.c:1352 [inline]
 free_pcp_prepare+0x374/0x870 mm/page_alloc.c:1404
 free_unref_page_prepare mm/page_alloc.c:3325 [inline]
 free_unref_page+0x19/0x690 mm/page_alloc.c:3404
 skb_free_head net/core/skbuff.c:655 [inline]
 skb_release_data+0x65d/0x790 net/core/skbuff.c:677
 skb_release_all net/core/skbuff.c:742 [inline]
 __kfree_skb net/core/skbuff.c:756 [inline]
 consume_skb net/core/skbuff.c:914 [inline]
 consume_skb+0xc2/0x160 net/core/skbuff.c:908
 skb_free_datagram+0x1b/0x1f0 net/core/datagram.c:325
 netlink_recvmsg+0x636/0xea0 net/netlink/af_netlink.c:1998
 sock_recvmsg_nosec net/socket.c:948 [inline]
 sock_recvmsg net/socket.c:966 [inline]
 sock_recvmsg net/socket.c:962 [inline]
 ____sys_recvmsg+0x2c4/0x600 net/socket.c:2632
 ___sys_recvmsg+0x127/0x200 net/socket.c:2674
 __sys_recvmsg+0xe2/0x1a0 net/socket.c:2704
 do_syscall_x64 arch/x86/entry/common.c:50 [inline]
 do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80
 entry_SYSCALL_64_after_hwframe+0x44/0xae

Memory state around the buggy address:
 ffff88807d5f9e00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
 ffff88807d5f9e80: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
>ffff88807d5f9f00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
                                           ^
 ffff88807d5f9f80: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
 ffff88807d5fa000: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb

Fixes: 0dad4087a86a ("tcp/dccp: get rid of inet_twsk_purge()")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: syzbot <syzkaller@googlegroups.com>
Reported-by: Paolo Abeni <pabeni@redhat.com>
Tested-by: Paolo Abeni <pabeni@redhat.com>
Link: https://lore.kernel.org/r/20220126180714.845362-1-eric.dumazet@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/netns/ipv4.h | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

(limited to 'include/net')

diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 22b4c6df1d2b..94568e022001 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -31,18 +31,16 @@ struct ping_group_range {
 struct inet_hashinfo;
 
 struct inet_timewait_death_row {
-	atomic_t		tw_count;
-	char			tw_pad[L1_CACHE_BYTES - sizeof(atomic_t)];
+	refcount_t		tw_refcount;
 
-	struct inet_hashinfo 	*hashinfo;
+	struct inet_hashinfo 	*hashinfo ____cacheline_aligned_in_smp;
 	int			sysctl_max_tw_buckets;
 };
 
 struct tcp_fastopen_context;
 
 struct netns_ipv4 {
-	/* Please keep tcp_death_row at first field in netns_ipv4 */
-	struct inet_timewait_death_row tcp_death_row ____cacheline_aligned_in_smp;
+	struct inet_timewait_death_row *tcp_death_row;
 
 #ifdef CONFIG_SYSCTL
 	struct ctl_table_header	*forw_hdr;
-- 
cgit v1.2.3


From 2e9589ff809e9232f689acd51da73390e135146a Mon Sep 17 00:00:00 2001
From: xu xin <xu.xin16@zte.com.cn>
Date: Wed, 26 Jan 2022 07:10:58 +0000
Subject: ipv4: Namespaceify min_adv_mss sysctl knob

Different netns has different requirement on the setting of min_adv_mss
sysctl which the advertised MSS will be never lower than.

Enable min_adv_mss to be configured per network namespace.

Signed-off-by: xu xin <xu.xin16@zte.com.cn>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/netns/ipv4.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/net')

diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 94568e022001..f0687867b5cd 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -83,6 +83,7 @@ struct netns_ipv4 {
 
 	u32 ip_rt_min_pmtu;
 	int ip_rt_mtu_expires;
+	int ip_rt_min_advmss;
 
 	struct local_ports ip_local_ports;
 
-- 
cgit v1.2.3


From 8b0fdcdc3a7d44aff907f0103f5ffb86b12bfe71 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Wed, 26 Jan 2022 11:10:59 -0800
Subject: net: remove bond_slave_has_mac_rcu()

No caller since v3.16.

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/bonding.h | 14 --------------
 1 file changed, 14 deletions(-)

(limited to 'include/net')

diff --git a/include/net/bonding.h b/include/net/bonding.h
index 83cfd2d70247..7dead855a72d 100644
--- a/include/net/bonding.h
+++ b/include/net/bonding.h
@@ -698,20 +698,6 @@ static inline struct slave *bond_slave_has_mac(struct bonding *bond,
 	return NULL;
 }
 
-/* Caller must hold rcu_read_lock() for read */
-static inline struct slave *bond_slave_has_mac_rcu(struct bonding *bond,
-					       const u8 *mac)
-{
-	struct list_head *iter;
-	struct slave *tmp;
-
-	bond_for_each_slave_rcu(bond, tmp, iter)
-		if (ether_addr_equal_64bits(mac, tmp->dev->dev_addr))
-			return tmp;
-
-	return NULL;
-}
-
 /* Caller must hold rcu_read_lock() for read */
 static inline bool bond_slave_has_mac_rx(struct bonding *bond, const u8 *mac)
 {
-- 
cgit v1.2.3


From 560e08eda7969c3ef0639ab05f718be03a49d387 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Wed, 26 Jan 2022 11:11:00 -0800
Subject: net: ax25: remove route refcount

Nothing takes the refcount since v4.9.

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ax25.h | 12 ------------
 1 file changed, 12 deletions(-)

(limited to 'include/net')

diff --git a/include/net/ax25.h b/include/net/ax25.h
index 526e49589197..cb628c5d7c5b 100644
--- a/include/net/ax25.h
+++ b/include/net/ax25.h
@@ -187,18 +187,12 @@ typedef struct {
 
 typedef struct ax25_route {
 	struct ax25_route	*next;
-	refcount_t		refcount;
 	ax25_address		callsign;
 	struct net_device	*dev;
 	ax25_digi		*digipeat;
 	char			ip_mode;
 } ax25_route;
 
-static inline void ax25_hold_route(ax25_route *ax25_rt)
-{
-	refcount_inc(&ax25_rt->refcount);
-}
-
 void __ax25_put_route(ax25_route *ax25_rt);
 
 extern rwlock_t ax25_route_lock;
@@ -213,12 +207,6 @@ static inline void ax25_route_lock_unuse(void)
 	read_unlock(&ax25_route_lock);
 }
 
-static inline void ax25_put_route(ax25_route *ax25_rt)
-{
-	if (refcount_dec_and_test(&ax25_rt->refcount))
-		__ax25_put_route(ax25_rt);
-}
-
 typedef struct {
 	char			slave;			/* slave_mode?   */
 	struct timer_list	slave_timer;		/* timeout timer */
-- 
cgit v1.2.3


From 937fca918aacf54f1c9cb00d16d4e999a0569be0 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Wed, 26 Jan 2022 11:11:05 -0800
Subject: udplite: remove udplite_csum_outgoing()

Not used since v4.0.

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/udplite.h | 43 -------------------------------------------
 1 file changed, 43 deletions(-)

(limited to 'include/net')

diff --git a/include/net/udplite.h b/include/net/udplite.h
index 9185e45b997f..a3c53110d30b 100644
--- a/include/net/udplite.h
+++ b/include/net/udplite.h
@@ -70,49 +70,6 @@ static inline int udplite_checksum_init(struct sk_buff *skb, struct udphdr *uh)
 	return 0;
 }
 
-/* Slow-path computation of checksum. Socket is locked. */
-static inline __wsum udplite_csum_outgoing(struct sock *sk, struct sk_buff *skb)
-{
-	const struct udp_sock *up = udp_sk(skb->sk);
-	int cscov = up->len;
-	__wsum csum = 0;
-
-	if (up->pcflag & UDPLITE_SEND_CC) {
-		/*
-		 * Sender has set `partial coverage' option on UDP-Lite socket.
-		 * The special case "up->pcslen == 0" signifies full coverage.
-		 */
-		if (up->pcslen < up->len) {
-			if (0 < up->pcslen)
-				cscov = up->pcslen;
-			udp_hdr(skb)->len = htons(up->pcslen);
-		}
-		/*
-		 * NOTE: Causes for the error case  `up->pcslen > up->len':
-		 *        (i)  Application error (will not be penalized).
-		 *       (ii)  Payload too big for send buffer: data is split
-		 *             into several packets, each with its own header.
-		 *             In this case (e.g. last segment), coverage may
-		 *             exceed packet length.
-		 *       Since packets with coverage length > packet length are
-		 *       illegal, we fall back to the defaults here.
-		 */
-	}
-
-	skb->ip_summed = CHECKSUM_NONE;     /* no HW support for checksumming */
-
-	skb_queue_walk(&sk->sk_write_queue, skb) {
-		const int off = skb_transport_offset(skb);
-		const int len = skb->len - off;
-
-		csum = skb_checksum(skb, off, (cscov > len)? len : cscov, csum);
-
-		if ((cscov -= len) <= 0)
-			break;
-	}
-	return csum;
-}
-
 /* Fast-path computation of checksum. Socket may not be locked. */
 static inline __wsum udplite_csum(struct sk_buff *skb)
 {
-- 
cgit v1.2.3


From 98b608629746946ecc6cda70bf0fd047785a1197 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Wed, 26 Jan 2022 11:11:07 -0800
Subject: net: sched: remove psched_tdiff_bounded()

Not used since v3.9.

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/pkt_sched.h | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'include/net')

diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h
index 9e7b21c0b3a6..44a35531952e 100644
--- a/include/net/pkt_sched.h
+++ b/include/net/pkt_sched.h
@@ -63,12 +63,6 @@ static inline psched_time_t psched_get_time(void)
 	return PSCHED_NS2TICKS(ktime_get_ns());
 }
 
-static inline psched_tdiff_t
-psched_tdiff_bounded(psched_time_t tv1, psched_time_t tv2, psched_time_t bound)
-{
-	return min(tv1 - tv2, bound);
-}
-
 struct qdisc_watchdog {
 	u64		last_expires;
 	struct hrtimer	timer;
-- 
cgit v1.2.3


From a459bc9a3a68f2975ee6661fb9c86126a5636b25 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Wed, 26 Jan 2022 11:11:08 -0800
Subject: net: sched: remove qdisc_qlen_cpu()

Never used since it was added in v5.2.

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sch_generic.h | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'include/net')

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 472843eedbae..9bab396c1f3b 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -518,11 +518,6 @@ static inline void qdisc_cb_private_validate(const struct sk_buff *skb, int sz)
 	BUILD_BUG_ON(sizeof(qcb->data) < sz);
 }
 
-static inline int qdisc_qlen_cpu(const struct Qdisc *q)
-{
-	return this_cpu_ptr(q->cpu_qstats)->qlen;
-}
-
 static inline int qdisc_qlen(const struct Qdisc *q)
 {
 	return q->q.qlen;
-- 
cgit v1.2.3


From d1bc532e99becf104635ed4da6fefa306f452321 Mon Sep 17 00:00:00 2001
From: Magnus Karlsson <magnus.karlsson@intel.com>
Date: Tue, 25 Jan 2022 17:04:43 +0100
Subject: i40e: xsk: Move tmp desc array from driver to pool

Move desc_array from the driver to the pool. The reason behind this is
that we can then reuse this array as a temporary storage for descriptors
in all zero-copy drivers that use the batched interface. This will make
it easier to add batching to more drivers.

i40e is the only driver that has a batched Tx zero-copy
implementation, so no need to touch any other driver.

Signed-off-by: Magnus Karlsson <magnus.karlsson@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: Alexander Lobakin <alexandr.lobakin@intel.com>
Link: https://lore.kernel.org/bpf/20220125160446.78976-6-maciej.fijalkowski@intel.com
---
 include/net/xdp_sock_drv.h  | 5 ++---
 include/net/xsk_buff_pool.h | 1 +
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/net')

diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h
index 443d45951564..4aa031849668 100644
--- a/include/net/xdp_sock_drv.h
+++ b/include/net/xdp_sock_drv.h
@@ -13,7 +13,7 @@
 
 void xsk_tx_completed(struct xsk_buff_pool *pool, u32 nb_entries);
 bool xsk_tx_peek_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc);
-u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, struct xdp_desc *desc, u32 max);
+u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, u32 max);
 void xsk_tx_release(struct xsk_buff_pool *pool);
 struct xsk_buff_pool *xsk_get_pool_from_qid(struct net_device *dev,
 					    u16 queue_id);
@@ -142,8 +142,7 @@ static inline bool xsk_tx_peek_desc(struct xsk_buff_pool *pool,
 	return false;
 }
 
-static inline u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, struct xdp_desc *desc,
-						 u32 max)
+static inline u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, u32 max)
 {
 	return 0;
 }
diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h
index ddeefc4a1040..5554ee75e7da 100644
--- a/include/net/xsk_buff_pool.h
+++ b/include/net/xsk_buff_pool.h
@@ -60,6 +60,7 @@ struct xsk_buff_pool {
 	 */
 	dma_addr_t *dma_pages;
 	struct xdp_buff_xsk *heads;
+	struct xdp_desc *tx_descs;
 	u64 chunk_mask;
 	u64 addrs_cnt;
 	u32 free_list_cnt;
-- 
cgit v1.2.3


From f37a4cc6bb0ba08c2d9fd7d18a1da87161cbb7f9 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Thu, 27 Jan 2022 00:36:28 +0000
Subject: udp6: pass flow in ip6_make_skb together with cork

Another preparation patch. inet_cork_full already contains a field for
iflow, so we can avoid passing a separate struct iflow6 into
__ip6_append_data() and ip6_make_skb(), and use the flow stored in
inet_cork_full. Make sure callers set cork->fl, i.e. we init it in
ip6_append_data() and before calling ip6_make_skb().

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/ipv6.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 3afcb128e064..5e0b56d66724 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -1020,7 +1020,7 @@ struct sk_buff *ip6_make_skb(struct sock *sk,
 			     int getfrag(void *from, char *to, int offset,
 					 int len, int odd, struct sk_buff *skb),
 			     void *from, int length, int transhdrlen,
-			     struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
+			     struct ipcm6_cookie *ipc6,
 			     struct rt6_info *rt, unsigned int flags,
 			     struct inet_cork_full *cork);
 
-- 
cgit v1.2.3


From 31ed2261e88fbd1eb62fc870ef2c6b2cf2951336 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Thu, 27 Jan 2022 00:36:31 +0000
Subject: ipv6: partially inline ipv6_fixup_options

Inline a part of ipv6_fixup_options() to avoid extra overhead on
function call if opt is NULL.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/ipv6.h | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

(limited to 'include/net')

diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 5e0b56d66724..082f30256f59 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -437,8 +437,16 @@ struct ipv6_txoptions *ipv6_renew_options(struct sock *sk,
 					  struct ipv6_txoptions *opt,
 					  int newtype,
 					  struct ipv6_opt_hdr *newopt);
-struct ipv6_txoptions *ipv6_fixup_options(struct ipv6_txoptions *opt_space,
-					  struct ipv6_txoptions *opt);
+struct ipv6_txoptions *__ipv6_fixup_options(struct ipv6_txoptions *opt_space,
+					    struct ipv6_txoptions *opt);
+
+static inline struct ipv6_txoptions *
+ipv6_fixup_options(struct ipv6_txoptions *opt_space, struct ipv6_txoptions *opt)
+{
+	if (!opt)
+		return NULL;
+	return __ipv6_fixup_options(opt_space, opt);
+}
 
 bool ipv6_opt_accepted(const struct sock *sk, const struct sk_buff *skb,
 		       const struct inet6_skb_parm *opt);
-- 
cgit v1.2.3


From c265a3a6690b093e542aceb4a7bf8bf6c577e42b Mon Sep 17 00:00:00 2001
From: Miquel Raynal <miquel.raynal@bootlin.com>
Date: Tue, 25 Jan 2022 13:25:40 +0100
Subject: net: mac802154: Explain the use of ieee802154_wake/stop_queue()

It is not straightforward to the newcomer that a single skb can
currently be sent at a time and that the internal process is to stop the
queue when processing a frame before re-enabling it.

Make this clear by documenting the ieee802154_wake/stop_queue()
helpers.

Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Acked-by: Alexander Aring <aahringo@redhat.com>
Link: https://lore.kernel.org/r/20220125122540.855604-4-miquel.raynal@bootlin.com
Signed-off-by: Stefan Schmidt <stefan@datenfreihafen.org>
---
 include/net/mac802154.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'include/net')

diff --git a/include/net/mac802154.h b/include/net/mac802154.h
index d524ffb9eb25..2c3bbc6645ba 100644
--- a/include/net/mac802154.h
+++ b/include/net/mac802154.h
@@ -464,6 +464,12 @@ void ieee802154_rx_irqsafe(struct ieee802154_hw *hw, struct sk_buff *skb,
  * ieee802154_wake_queue - wake ieee802154 queue
  * @hw: pointer as obtained from ieee802154_alloc_hw().
  *
+ * Tranceivers usually have either one transmit framebuffer or one framebuffer
+ * for both transmitting and receiving. Hence, the core currently only handles
+ * one frame at a time for each phy, which means we had to stop the queue to
+ * avoid new skb to come during the transmission. The queue then needs to be
+ * woken up after the operation.
+ *
  * Drivers should use this function instead of netif_wake_queue.
  */
 void ieee802154_wake_queue(struct ieee802154_hw *hw);
@@ -472,6 +478,12 @@ void ieee802154_wake_queue(struct ieee802154_hw *hw);
  * ieee802154_stop_queue - stop ieee802154 queue
  * @hw: pointer as obtained from ieee802154_alloc_hw().
  *
+ * Tranceivers usually have either one transmit framebuffer or one framebuffer
+ * for both transmitting and receiving. Hence, the core currently only handles
+ * one frame at a time for each phy, which means we need to tell upper layers to
+ * stop giving us new skbs while we are busy with the transmitted one. The queue
+ * must then be stopped before transmitting.
+ *
  * Drivers should use this function instead of netif_stop_queue.
  */
 void ieee802154_stop_queue(struct ieee802154_hw *hw);
-- 
cgit v1.2.3


From 4f0e30407ef6f2075b8e86d54be42e787087cd61 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Fri, 28 Jan 2022 08:06:54 -0800
Subject: ipv4: drop fragmentation code from ip_options_build()

Since v2.5.44 and addition of ip_options_fragment()
ip_options_build() does not render headers for fragments
directly. @is_frag is always 0.

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/ip.h b/include/net/ip.h
index b51bae43b0dd..fdbab0c00fca 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -712,7 +712,7 @@ int ip_forward(struct sk_buff *skb);
  */
 
 void ip_options_build(struct sk_buff *skb, struct ip_options *opt,
-		      __be32 daddr, struct rtable *rt, int is_frag);
+		      __be32 daddr, struct rtable *rt);
 
 int __ip_options_echo(struct net *net, struct ip_options *dopt,
 		      struct sk_buff *skb, const struct ip_options *sopt);
-- 
cgit v1.2.3


From 47ed9442b2ecfcdc72889667236d6c59b6a3337e Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@kernel.org>
Date: Fri, 28 Jan 2022 16:53:47 -0700
Subject: ipv4: Make ip_idents_reserve static

ip_idents_reserve is only used in net/ipv4/route.c. Make it static
and remove the export.

Signed-off-by: David Ahern <dsahern@kernel.org>
Cc: Eric Dumazet <edumazet@google.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/ip.h b/include/net/ip.h
index fdbab0c00fca..3984f2c39c4b 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -517,7 +517,6 @@ void ip_dst_metrics_put(struct dst_entry *dst)
 		kfree(p);
 }
 
-u32 ip_idents_reserve(u32 hash, int segs);
 void __ip_select_ident(struct net *net, struct iphdr *iph, int segs);
 
 static inline void ip_select_ident_segs(struct net *net, struct sk_buff *skb,
-- 
cgit v1.2.3


From e187013abeb4c2a7ec8a4bb978844c7e92a1a6ec Mon Sep 17 00:00:00 2001
From: Akhmat Karakotov <hmukos@yandex-team.ru>
Date: Mon, 31 Jan 2022 16:31:21 +0300
Subject: txhash: Make rethinking txhash behavior configurable via sysctl

Add a per ns sysctl that controls the txhash rethink behavior:
net.core.txrehash. When enabled, the same behavior is retained,
when disabled, rethink is not performed. Sysctl is enabled by default.

Signed-off-by: Akhmat Karakotov <hmukos@yandex-team.ru>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/netns/core.h |  1 +
 include/net/sock.h       | 34 +++++++++++++++++++++-------------
 2 files changed, 22 insertions(+), 13 deletions(-)

(limited to 'include/net')

diff --git a/include/net/netns/core.h b/include/net/netns/core.h
index 552bc25b1933..388244e315e7 100644
--- a/include/net/netns/core.h
+++ b/include/net/netns/core.h
@@ -10,6 +10,7 @@ struct netns_core {
 	struct ctl_table_header	*sysctl_hdr;
 
 	int	sysctl_somaxconn;
+	u8	sysctl_txrehash;
 
 #ifdef CONFIG_PROC_FS
 	struct prot_inuse __percpu *prot_inuse;
diff --git a/include/net/sock.h b/include/net/sock.h
index ff9b508d9c5f..0540e1b2aeb0 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -587,6 +587,18 @@ static inline bool sk_user_data_is_nocopy(const struct sock *sk)
 			   __tmp | SK_USER_DATA_NOCOPY);		\
 })
 
+static inline
+struct net *sock_net(const struct sock *sk)
+{
+	return read_pnet(&sk->sk_net);
+}
+
+static inline
+void sock_net_set(struct sock *sk, struct net *net)
+{
+	write_pnet(&sk->sk_net, net);
+}
+
 /*
  * SK_CAN_REUSE and SK_NO_REUSE on a socket mean that the socket is OK
  * or not whether his port will be reused by someone else. SK_FORCE_REUSE
@@ -2054,10 +2066,18 @@ static inline void sk_set_txhash(struct sock *sk)
 
 static inline bool sk_rethink_txhash(struct sock *sk)
 {
-	if (sk->sk_txhash) {
+	u8 rehash;
+
+	if (!sk->sk_txhash)
+		return false;
+
+	rehash = READ_ONCE(sock_net(sk)->core.sysctl_txrehash);
+
+	if (rehash) {
 		sk_set_txhash(sk);
 		return true;
 	}
+
 	return false;
 }
 
@@ -2704,18 +2724,6 @@ static inline void sk_eat_skb(struct sock *sk, struct sk_buff *skb)
 	__kfree_skb(skb);
 }
 
-static inline
-struct net *sock_net(const struct sock *sk)
-{
-	return read_pnet(&sk->sk_net);
-}
-
-static inline
-void sock_net_set(struct sock *sk, struct net *net)
-{
-	write_pnet(&sk->sk_net, net);
-}
-
 static inline bool
 skb_sk_is_prefetched(struct sk_buff *skb)
 {
-- 
cgit v1.2.3


From 26859240e4ee701e0379f08634957adaff67e43a Mon Sep 17 00:00:00 2001
From: Akhmat Karakotov <hmukos@yandex-team.ru>
Date: Mon, 31 Jan 2022 16:31:22 +0300
Subject: txhash: Add socket option to control TX hash rethink behavior

Add the SO_TXREHASH socket option to control hash rethink behavior per socket.
When default mode is set, sockets disable rehash at initialization and use
sysctl option when entering listen state. setsockopt() overrides default
behavior.

Signed-off-by: Akhmat Karakotov <hmukos@yandex-team.ru>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

(limited to 'include/net')

diff --git a/include/net/sock.h b/include/net/sock.h
index 0540e1b2aeb0..d6c13f0fba40 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -316,6 +316,7 @@ struct sk_filter;
   *	@sk_rcvtimeo: %SO_RCVTIMEO setting
   *	@sk_sndtimeo: %SO_SNDTIMEO setting
   *	@sk_txhash: computed flow hash for use on transmit
+  *	@sk_txrehash: enable TX hash rethink
   *	@sk_filter: socket filtering instructions
   *	@sk_timer: sock cleanup timer
   *	@sk_stamp: time stamp of last packet received
@@ -491,6 +492,7 @@ struct sock {
 	u32			sk_ack_backlog;
 	u32			sk_max_ack_backlog;
 	kuid_t			sk_uid;
+	u8			sk_txrehash;
 #ifdef CONFIG_NET_RX_BUSY_POLL
 	u8			sk_prefer_busy_poll;
 	u16			sk_busy_poll_budget;
@@ -2066,18 +2068,10 @@ static inline void sk_set_txhash(struct sock *sk)
 
 static inline bool sk_rethink_txhash(struct sock *sk)
 {
-	u8 rehash;
-
-	if (!sk->sk_txhash)
-		return false;
-
-	rehash = READ_ONCE(sock_net(sk)->core.sysctl_txrehash);
-
-	if (rehash) {
+	if (sk->sk_txhash && sk->sk_txrehash == SOCK_TXREHASH_ENABLED) {
 		sk_set_txhash(sk);
 		return true;
 	}
-
 	return false;
 }
 
-- 
cgit v1.2.3


From 02b2a91c6f0d57df687e666b475849a54f295a12 Mon Sep 17 00:00:00 2001
From: David Girault <david.girault@qorvo.com>
Date: Tue, 1 Feb 2022 19:09:56 +0100
Subject: net: ieee802154: Provide a kdoc to the address structure

Give this structure a header to better explain its content.

Signed-off-by: David Girault <david.girault@qorvo.com>
[miquel.raynal@bootlin.com: Isolate this change from a bigger commit and
                            reword the comment]
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Link: https://lore.kernel.org/r/20220201180956.93581-1-miquel.raynal@bootlin.com
Signed-off-by: Stefan Schmidt <stefan@datenfreihafen.org>
---
 include/net/cfg802154.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include/net')

diff --git a/include/net/cfg802154.h b/include/net/cfg802154.h
index 6ed07844eb24..833672d6fbe4 100644
--- a/include/net/cfg802154.h
+++ b/include/net/cfg802154.h
@@ -227,6 +227,16 @@ static inline void wpan_phy_net_set(struct wpan_phy *wpan_phy, struct net *net)
 	write_pnet(&wpan_phy->_net, net);
 }
 
+/**
+ * struct ieee802154_addr - IEEE802.15.4 device address
+ * @mode: Address mode from frame header. Can be one of:
+ *        - @IEEE802154_ADDR_NONE
+ *        - @IEEE802154_ADDR_SHORT
+ *        - @IEEE802154_ADDR_LONG
+ * @pan_id: The PAN ID this address belongs to
+ * @short_addr: address if @mode is @IEEE802154_ADDR_SHORT
+ * @extended_addr: address if @mode is @IEEE802154_ADDR_LONG
+ */
 struct ieee802154_addr {
 	u8 mode;
 	__le16 pan_id;
-- 
cgit v1.2.3


From 295ab96f478d0fa56393e85406f19a867e26ce22 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Wed, 2 Feb 2022 01:03:20 +0100
Subject: net: dsa: provide switch operations for tracking the master state

Certain drivers may need to send management traffic to the switch for
things like register access, FDB dump, etc, to accelerate what their
slow bus (SPI, I2C, MDIO) can already do.

Ethernet is faster (especially in bulk transactions) but is also more
unreliable, since the user may decide to bring the DSA master down (or
not bring it up), therefore severing the link between the host and the
attached switch.

Drivers needing Ethernet-based register access already should have
fallback logic to the slow bus if the Ethernet method fails, but that
fallback may be based on a timeout, and the I/O to the switch may slow
down to a halt if the master is down, because every Ethernet packet will
have to time out. The driver also doesn't have the option to turn off
Ethernet-based I/O momentarily, because it wouldn't know when to turn it
back on.

Which is where this change comes in. By tracking NETDEV_CHANGE,
NETDEV_UP and NETDEV_GOING_DOWN events on the DSA master, we should know
the exact interval of time during which this interface is reliably
available for traffic. Provide this information to switches so they can
use it as they wish.

An helper is added dsa_port_master_is_operational() to check if a master
port is operational.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: Ansuel Smith <ansuelsmth@gmail.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dsa.h | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

(limited to 'include/net')

diff --git a/include/net/dsa.h b/include/net/dsa.h
index 57b3e4e7413b..43c4153ef53a 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -278,6 +278,10 @@ struct dsa_port {
 
 	u8			devlink_port_setup:1;
 
+	/* Master state bits, valid only on CPU ports */
+	u8			master_admin_up:1;
+	u8			master_oper_up:1;
+
 	u8			setup:1;
 
 	struct device_node	*dn;
@@ -478,6 +482,12 @@ static inline bool dsa_port_is_unused(struct dsa_port *dp)
 	return dp->type == DSA_PORT_TYPE_UNUSED;
 }
 
+static inline bool dsa_port_master_is_operational(struct dsa_port *dp)
+{
+	return dsa_port_is_cpu(dp) && dp->master_admin_up &&
+	       dp->master_oper_up;
+}
+
 static inline bool dsa_is_unused_port(struct dsa_switch *ds, int p)
 {
 	return dsa_to_port(ds, p)->type == DSA_PORT_TYPE_UNUSED;
@@ -1036,6 +1046,13 @@ struct dsa_switch_ops {
 	int	(*tag_8021q_vlan_add)(struct dsa_switch *ds, int port, u16 vid,
 				      u16 flags);
 	int	(*tag_8021q_vlan_del)(struct dsa_switch *ds, int port, u16 vid);
+
+	/*
+	 * DSA master tracking operations
+	 */
+	void	(*master_state_change)(struct dsa_switch *ds,
+				       const struct net_device *master,
+				       bool operational);
 };
 
 #define DSA_DEVLINK_PARAM_DRIVER(_id, _name, _type, _cmodes)		\
-- 
cgit v1.2.3


From 5903123f662ed18483f05cac3f9e800a074c29ff Mon Sep 17 00:00:00 2001
From: Akhmat Karakotov <hmukos@yandex-team.ru>
Date: Fri, 28 Jan 2022 22:26:21 +0300
Subject: tcp: Use BPF timeout setting for SYN ACK RTO

When setting RTO through BPF program, some SYN ACK packets were unaffected
and continued to use TCP_TIMEOUT_INIT constant. This patch adds timeout
option to struct request_sock. Option is initialized with TCP_TIMEOUT_INIT
and is reassigned through BPF using tcp_timeout_init call. SYN ACK
retransmits now use newly added timeout option.

Signed-off-by: Akhmat Karakotov <hmukos@yandex-team.ru>
Acked-by: Martin KaFai Lau <kafai@fb.com>

v2:
	- Add timeout option to struct request_sock. Do not call
	  tcp_timeout_init on every syn ack retransmit.

v3:
	- Use unsigned long for min. Bound tcp_timeout_init to TCP_RTO_MAX.

v4:
	- Refactor duplicate code by adding reqsk_timeout function.
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inet_connection_sock.h | 8 ++++++++
 include/net/request_sock.h         | 2 ++
 include/net/tcp.h                  | 2 +-
 3 files changed, 11 insertions(+), 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index 4ad47d9f9d27..3908296d103f 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -285,6 +285,14 @@ static inline int inet_csk_reqsk_queue_is_full(const struct sock *sk)
 bool inet_csk_reqsk_queue_drop(struct sock *sk, struct request_sock *req);
 void inet_csk_reqsk_queue_drop_and_put(struct sock *sk, struct request_sock *req);
 
+static inline unsigned long
+reqsk_timeout(struct request_sock *req, unsigned long max_timeout)
+{
+	u64 timeout = (u64)req->timeout << req->num_timeout;
+
+	return (unsigned long)min_t(u64, timeout, max_timeout);
+}
+
 static inline void inet_csk_prepare_for_destroy_sock(struct sock *sk)
 {
 	/* The below has to be done to allow calling inet_csk_destroy_sock */
diff --git a/include/net/request_sock.h b/include/net/request_sock.h
index 29e41ff3ec93..144c39db9898 100644
--- a/include/net/request_sock.h
+++ b/include/net/request_sock.h
@@ -70,6 +70,7 @@ struct request_sock {
 	struct saved_syn		*saved_syn;
 	u32				secid;
 	u32				peer_secid;
+	u32				timeout;
 };
 
 static inline struct request_sock *inet_reqsk(const struct sock *sk)
@@ -104,6 +105,7 @@ reqsk_alloc(const struct request_sock_ops *ops, struct sock *sk_listener,
 	sk_node_init(&req_to_sk(req)->sk_node);
 	sk_tx_queue_clear(req_to_sk(req));
 	req->saved_syn = NULL;
+	req->timeout = 0;
 	req->num_timeout = 0;
 	req->num_retrans = 0;
 	req->sk = NULL;
diff --git a/include/net/tcp.h b/include/net/tcp.h
index b9fc978fb2ca..eff2487d972d 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -2358,7 +2358,7 @@ static inline u32 tcp_timeout_init(struct sock *sk)
 
 	if (timeout <= 0)
 		timeout = TCP_TIMEOUT_INIT;
-	return timeout;
+	return min_t(int, timeout, TCP_RTO_MAX);
 }
 
 static inline u32 tcp_rwnd_init_bpf(struct sock *sk)
-- 
cgit v1.2.3


From 52cc6ffc0ab2c61a76127b9347567fc97c15582f Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexanderduyck@fb.com>
Date: Mon, 31 Jan 2022 08:40:01 -0800
Subject: page_pool: Refactor page_pool to enable fragmenting after allocation

This change is meant to permit a driver to perform "fragmenting" of the
page from within the driver instead of the current model which requires
pre-partitioning the page. The main motivation behind this is to support
use cases where the page will be split up by the driver after DMA instead
of before.

With this change it becomes possible to start using page pool to replace
some of the existing use cases where multiple references were being used
for a single page, but the number needed was unknown as the size could be
dynamic.

For example, with this code it would be possible to do something like
the following to handle allocation:
  page = page_pool_alloc_pages();
  if (!page)
    return NULL;
  page_pool_fragment_page(page, DRIVER_PAGECNT_BIAS_MAX);
  rx_buf->page = page;
  rx_buf->pagecnt_bias = DRIVER_PAGECNT_BIAS_MAX;

Then we would process a received buffer by handling it with:
  rx_buf->pagecnt_bias--;

Once the page has been fully consumed we could then flush the remaining
instances with:
  if (page_pool_defrag_page(page, rx_buf->pagecnt_bias))
    continue;
  page_pool_put_defragged_page(pool, page -1, !!budget);

The general idea is that we want to have the ability to allocate a page
with excess fragment count and then trim off the unneeded fragments.

Signed-off-by: Alexander Duyck <alexanderduyck@fb.com>
Reviewed-by: Ilias Apalodimas <ilias.apalodimas@linaro.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/page_pool.h | 82 +++++++++++++++++++++++++++++++------------------
 1 file changed, 52 insertions(+), 30 deletions(-)

(limited to 'include/net')

diff --git a/include/net/page_pool.h b/include/net/page_pool.h
index 79a805542d0f..97c3c19872ff 100644
--- a/include/net/page_pool.h
+++ b/include/net/page_pool.h
@@ -201,21 +201,67 @@ static inline void page_pool_put_page_bulk(struct page_pool *pool, void **data,
 }
 #endif
 
-void page_pool_put_page(struct page_pool *pool, struct page *page,
-			unsigned int dma_sync_size, bool allow_direct);
+void page_pool_put_defragged_page(struct page_pool *pool, struct page *page,
+				  unsigned int dma_sync_size,
+				  bool allow_direct);
 
-/* Same as above but will try to sync the entire area pool->max_len */
-static inline void page_pool_put_full_page(struct page_pool *pool,
-					   struct page *page, bool allow_direct)
+static inline void page_pool_fragment_page(struct page *page, long nr)
+{
+	atomic_long_set(&page->pp_frag_count, nr);
+}
+
+static inline long page_pool_defrag_page(struct page *page, long nr)
+{
+	long ret;
+
+	/* If nr == pp_frag_count then we have cleared all remaining
+	 * references to the page. No need to actually overwrite it, instead
+	 * we can leave this to be overwritten by the calling function.
+	 *
+	 * The main advantage to doing this is that an atomic_read is
+	 * generally a much cheaper operation than an atomic update,
+	 * especially when dealing with a page that may be partitioned
+	 * into only 2 or 3 pieces.
+	 */
+	if (atomic_long_read(&page->pp_frag_count) == nr)
+		return 0;
+
+	ret = atomic_long_sub_return(nr, &page->pp_frag_count);
+	WARN_ON(ret < 0);
+	return ret;
+}
+
+static inline bool page_pool_is_last_frag(struct page_pool *pool,
+					  struct page *page)
+{
+	/* If fragments aren't enabled or count is 0 we were the last user */
+	return !(pool->p.flags & PP_FLAG_PAGE_FRAG) ||
+	       (page_pool_defrag_page(page, 1) == 0);
+}
+
+static inline void page_pool_put_page(struct page_pool *pool,
+				      struct page *page,
+				      unsigned int dma_sync_size,
+				      bool allow_direct)
 {
 	/* When page_pool isn't compiled-in, net/core/xdp.c doesn't
 	 * allow registering MEM_TYPE_PAGE_POOL, but shield linker.
 	 */
 #ifdef CONFIG_PAGE_POOL
-	page_pool_put_page(pool, page, -1, allow_direct);
+	if (!page_pool_is_last_frag(pool, page))
+		return;
+
+	page_pool_put_defragged_page(pool, page, dma_sync_size, allow_direct);
 #endif
 }
 
+/* Same as above but will try to sync the entire area pool->max_len */
+static inline void page_pool_put_full_page(struct page_pool *pool,
+					   struct page *page, bool allow_direct)
+{
+	page_pool_put_page(pool, page, -1, allow_direct);
+}
+
 /* Same as above but the caller must guarantee safe context. e.g NAPI */
 static inline void page_pool_recycle_direct(struct page_pool *pool,
 					    struct page *page)
@@ -243,30 +289,6 @@ static inline void page_pool_set_dma_addr(struct page *page, dma_addr_t addr)
 		page->dma_addr_upper = upper_32_bits(addr);
 }
 
-static inline void page_pool_set_frag_count(struct page *page, long nr)
-{
-	atomic_long_set(&page->pp_frag_count, nr);
-}
-
-static inline long page_pool_atomic_sub_frag_count_return(struct page *page,
-							  long nr)
-{
-	long ret;
-
-	/* As suggested by Alexander, atomic_long_read() may cover up the
-	 * reference count errors, so avoid calling atomic_long_read() in
-	 * the cases of freeing or draining the page_frags, where we would
-	 * not expect it to match or that are slowpath anyway.
-	 */
-	if (__builtin_constant_p(nr) &&
-	    atomic_long_read(&page->pp_frag_count) == nr)
-		return 0;
-
-	ret = atomic_long_sub_return(nr, &page->pp_frag_count);
-	WARN_ON(ret < 0);
-	return ret;
-}
-
 static inline bool is_page_pool_compiled_in(void)
 {
 #ifdef CONFIG_PAGE_POOL
-- 
cgit v1.2.3


From 7af4a361a62f59b44ec301f21090823365dd0244 Mon Sep 17 00:00:00 2001
From: Tobias Waldekranz <tobias@waldekranz.com>
Date: Thu, 3 Feb 2022 11:16:53 +0100
Subject: net: dsa: mv88e6xxx: Improve isolation of standalone ports

Clear MapDA on standalone ports to bypass any ATU lookup that might
point the packet in the wrong direction. This means that all packets
are flooded using the PVT config. So make sure that standalone ports
are only allowed to communicate with the local upstream port.

Here is a scenario in which this is needed:

   CPU
    |     .----.
.---0---. | .--0--.
|  sw0  | | | sw1 |
'-1-2-3-' | '-1-2-'
      '---'

- sw0p1 and sw1p1 are bridged
- sw0p2 and sw1p2 are in standalone mode
- Learning must be enabled on sw0p3 in order for hardware forwarding
  to work properly between bridged ports

1. A packet with SA :aa comes in on sw1p2
   1a. Egresses sw1p0
   1b. Ingresses sw0p3, ATU adds an entry for :aa towards port 3
   1c. Egresses sw0p0

2. A packet with DA :aa comes in on sw0p2
   2a. If an ATU lookup is done at this point, the packet will be
       incorrectly forwarded towards sw0p3. With this change in place,
       the ATU is bypassed and the packet is forwarded in accordance
       with the PVT, which only contains the CPU port.

Signed-off-by: Tobias Waldekranz <tobias@waldekranz.com>
Reviewed-by: Vladimir Oltean <olteanv@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dsa.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'include/net')

diff --git a/include/net/dsa.h b/include/net/dsa.h
index 43c4153ef53a..6e5ef62a7dce 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -591,6 +591,18 @@ static inline bool dsa_is_upstream_port(struct dsa_switch *ds, int port)
 	return port == dsa_upstream_port(ds, port);
 }
 
+/* Return the local port used to reach the CPU port */
+static inline unsigned int dsa_switch_upstream_port(struct dsa_switch *ds)
+{
+	struct dsa_port *dp;
+
+	dsa_switch_for_each_available_port(dp, ds) {
+		return dsa_upstream_port(ds, dp->index);
+	}
+
+	return ds->num_ports;
+}
+
 /* Return true if @upstream_ds is an upstream switch of @downstream_ds, meaning
  * that the routing port from @downstream_ds to @upstream_ds is also the port
  * which @downstream_ds uses to reach its dedicated CPU.
-- 
cgit v1.2.3


From d352b20f4174a6bd998992329b773ab513232880 Mon Sep 17 00:00:00 2001
From: Tobias Waldekranz <tobias@waldekranz.com>
Date: Thu, 3 Feb 2022 11:16:56 +0100
Subject: net: dsa: mv88e6xxx: Improve multichip isolation of standalone ports

Given that standalone ports are now configured to bypass the ATU and
forward all frames towards the upstream port, extend the ATU bypass to
multichip systems.

Load VID 0 (standalone) into the VTU with the policy bit set. Since
VID 4095 (bridged) is already loaded, we now know that all VIDs in use
are always available in all VTUs. Therefore, we can safely enable
802.1Q on DSA ports.

Setting the DSA ports' VTU policy to TRAP means that all incoming
frames on VID 0 will be classified as MGMT - as a result, the ATU is
bypassed on all subsequent switches.

With this isolation in place, we are able to support configurations
that are simultaneously very quirky and very useful. Quirky because it
involves looping cables between local switchports like in this
example:

   CPU
    |     .------.
.---0---. | .----0----.
|  sw0  | | |   sw1   |
'-1-2-3-' | '-1-2-3-4-'
  $ @ '---'   $ @ % %

We have three physically looped pairs ($, @, and %).

This is very useful because it allows us to run the kernel's
kselftests for the bridge on mv88e6xxx hardware.

Signed-off-by: Tobias Waldekranz <tobias@waldekranz.com>
Reviewed-by: Vladimir Oltean <olteanv@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dsa.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/net')

diff --git a/include/net/dsa.h b/include/net/dsa.h
index 6e5ef62a7dce..ca8c14b547b4 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -591,6 +591,12 @@ static inline bool dsa_is_upstream_port(struct dsa_switch *ds, int port)
 	return port == dsa_upstream_port(ds, port);
 }
 
+/* Return true if this is a DSA port leading away from the CPU */
+static inline bool dsa_is_downstream_port(struct dsa_switch *ds, int port)
+{
+	return dsa_is_dsa_port(ds, port) && !dsa_is_upstream_port(ds, port);
+}
+
 /* Return the local port used to reach the CPU port */
 static inline unsigned int dsa_switch_upstream_port(struct dsa_switch *ds)
 {
-- 
cgit v1.2.3


From bb62a765b1b5597d32a426096aa78d2a8eb6b091 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Thu, 20 Jan 2022 13:06:59 +0100
Subject: netfilter: conntrack: make all extensions 8-byte alignned

All extensions except one need 8 byte alignment, so just make that the
default.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_conntrack_extend.h | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'include/net')

diff --git a/include/net/netfilter/nf_conntrack_extend.h b/include/net/netfilter/nf_conntrack_extend.h
index c7515d82ab06..705a4487f023 100644
--- a/include/net/netfilter/nf_conntrack_extend.h
+++ b/include/net/netfilter/nf_conntrack_extend.h
@@ -49,7 +49,7 @@ enum nf_ct_ext_id {
 struct nf_ct_ext {
 	u8 offset[NF_CT_EXT_NUM];
 	u8 len;
-	char data[];
+	char data[] __aligned(8);
 };
 
 static inline bool __nf_ct_ext_exist(const struct nf_ct_ext *ext, u8 id)
@@ -83,10 +83,7 @@ struct nf_ct_ext_type {
 	void (*destroy)(struct nf_conn *ct);
 
 	enum nf_ct_ext_id id;
-
-	/* Length and min alignment. */
 	u8 len;
-	u8 align;
 };
 
 int nf_ct_extend_register(const struct nf_ct_ext_type *type);
-- 
cgit v1.2.3


From 5f31edc0676b55cd6baf5ba119d1881f3fbadee1 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Thu, 20 Jan 2022 13:07:00 +0100
Subject: netfilter: conntrack: move extension sizes into core

No need to specify this in the registration modules, we already
collect all sizes for build-time checks on the maximum combined size.

After this change, all extensions except nat have no meaningful content
in their nf_ct_ext_type struct definition.

Next patch handles nat, this will then allow to remove the dynamic
register api completely.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_conntrack_extend.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/netfilter/nf_conntrack_extend.h b/include/net/netfilter/nf_conntrack_extend.h
index 705a4487f023..87d818414092 100644
--- a/include/net/netfilter/nf_conntrack_extend.h
+++ b/include/net/netfilter/nf_conntrack_extend.h
@@ -83,7 +83,6 @@ struct nf_ct_ext_type {
 	void (*destroy)(struct nf_conn *ct);
 
 	enum nf_ct_ext_id id;
-	u8 len;
 };
 
 int nf_ct_extend_register(const struct nf_ct_ext_type *type);
-- 
cgit v1.2.3


From 1bc91a5ddf3eaea0e0ea957cccf3abdcfcace00e Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Thu, 20 Jan 2022 13:07:01 +0100
Subject: netfilter: conntrack: handle ->destroy hook via nat_ops instead

The nat module already exposes a few functions to the conntrack core.
Move the nat extension destroy hook to it.

After this, no conntrack extension needs a destroy hook.
'struct nf_ct_ext_type' and the register/unregister api can be removed
in a followup patch.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_conntrack_extend.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'include/net')

diff --git a/include/net/netfilter/nf_conntrack_extend.h b/include/net/netfilter/nf_conntrack_extend.h
index 87d818414092..343f9194423a 100644
--- a/include/net/netfilter/nf_conntrack_extend.h
+++ b/include/net/netfilter/nf_conntrack_extend.h
@@ -79,9 +79,6 @@ void nf_ct_ext_destroy(struct nf_conn *ct);
 void *nf_ct_ext_add(struct nf_conn *ct, enum nf_ct_ext_id id, gfp_t gfp);
 
 struct nf_ct_ext_type {
-	/* Destroys relationships (can be NULL). */
-	void (*destroy)(struct nf_conn *ct);
-
 	enum nf_ct_ext_id id;
 };
 
-- 
cgit v1.2.3


From 1015c3de23eedb8ac5a163165434484df44cfe00 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Thu, 20 Jan 2022 13:07:02 +0100
Subject: netfilter: conntrack: remove extension register api

These no longer register/unregister a meaningful structure so remove it.

Cc: Paul Blakey <paulb@nvidia.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_conntrack_acct.h      |  1 -
 include/net/netfilter/nf_conntrack_ecache.h    | 13 -------------
 include/net/netfilter/nf_conntrack_extend.h    |  9 ---------
 include/net/netfilter/nf_conntrack_labels.h    |  3 ---
 include/net/netfilter/nf_conntrack_seqadj.h    |  3 ---
 include/net/netfilter/nf_conntrack_timeout.h   | 12 ------------
 include/net/netfilter/nf_conntrack_timestamp.h | 13 -------------
 7 files changed, 54 deletions(-)

(limited to 'include/net')

diff --git a/include/net/netfilter/nf_conntrack_acct.h b/include/net/netfilter/nf_conntrack_acct.h
index 7f44a771530e..4b2b7f8914ea 100644
--- a/include/net/netfilter/nf_conntrack_acct.h
+++ b/include/net/netfilter/nf_conntrack_acct.h
@@ -78,7 +78,6 @@ static inline void nf_ct_acct_update(struct nf_conn *ct, u32 dir,
 
 void nf_conntrack_acct_pernet_init(struct net *net);
 
-int nf_conntrack_acct_init(void);
 void nf_conntrack_acct_fini(void);
 
 #endif /* _NF_CONNTRACK_ACCT_H */
diff --git a/include/net/netfilter/nf_conntrack_ecache.h b/include/net/netfilter/nf_conntrack_ecache.h
index d932e22edcb4..16bcff809b18 100644
--- a/include/net/netfilter/nf_conntrack_ecache.h
+++ b/include/net/netfilter/nf_conntrack_ecache.h
@@ -166,9 +166,6 @@ void nf_conntrack_ecache_work(struct net *net, enum nf_ct_ecache_state state);
 void nf_conntrack_ecache_pernet_init(struct net *net);
 void nf_conntrack_ecache_pernet_fini(struct net *net);
 
-int nf_conntrack_ecache_init(void);
-void nf_conntrack_ecache_fini(void);
-
 static inline bool nf_conntrack_ecache_dwork_pending(const struct net *net)
 {
 	return net->ct.ecache_dwork_pending;
@@ -194,16 +191,6 @@ static inline void nf_conntrack_ecache_pernet_init(struct net *net)
 static inline void nf_conntrack_ecache_pernet_fini(struct net *net)
 {
 }
-
-static inline int nf_conntrack_ecache_init(void)
-{
-	return 0;
-}
-
-static inline void nf_conntrack_ecache_fini(void)
-{
-}
-
 static inline bool nf_conntrack_ecache_dwork_pending(const struct net *net) { return false; }
 #endif /* CONFIG_NF_CONNTRACK_EVENTS */
 #endif /*_NF_CONNTRACK_ECACHE_H*/
diff --git a/include/net/netfilter/nf_conntrack_extend.h b/include/net/netfilter/nf_conntrack_extend.h
index 343f9194423a..96635ad2acc7 100644
--- a/include/net/netfilter/nf_conntrack_extend.h
+++ b/include/net/netfilter/nf_conntrack_extend.h
@@ -72,16 +72,7 @@ static inline void *__nf_ct_ext_find(const struct nf_conn *ct, u8 id)
 #define nf_ct_ext_find(ext, id)	\
 	((id##_TYPE *)__nf_ct_ext_find((ext), (id)))
 
-/* Destroy all relationships */
-void nf_ct_ext_destroy(struct nf_conn *ct);
-
 /* Add this type, returns pointer to data or NULL. */
 void *nf_ct_ext_add(struct nf_conn *ct, enum nf_ct_ext_id id, gfp_t gfp);
 
-struct nf_ct_ext_type {
-	enum nf_ct_ext_id id;
-};
-
-int nf_ct_extend_register(const struct nf_ct_ext_type *type);
-void nf_ct_extend_unregister(const struct nf_ct_ext_type *type);
 #endif /* _NF_CONNTRACK_EXTEND_H */
diff --git a/include/net/netfilter/nf_conntrack_labels.h b/include/net/netfilter/nf_conntrack_labels.h
index ba916411c4e1..3c23298e68ca 100644
--- a/include/net/netfilter/nf_conntrack_labels.h
+++ b/include/net/netfilter/nf_conntrack_labels.h
@@ -45,12 +45,9 @@ int nf_connlabels_replace(struct nf_conn *ct,
 
 #ifdef CONFIG_NF_CONNTRACK_LABELS
 int nf_conntrack_labels_init(void);
-void nf_conntrack_labels_fini(void);
 int nf_connlabels_get(struct net *net, unsigned int bit);
 void nf_connlabels_put(struct net *net);
 #else
-static inline int nf_conntrack_labels_init(void) { return 0; }
-static inline void nf_conntrack_labels_fini(void) {}
 static inline int nf_connlabels_get(struct net *net, unsigned int bit) { return 0; }
 static inline void nf_connlabels_put(struct net *net) {}
 #endif
diff --git a/include/net/netfilter/nf_conntrack_seqadj.h b/include/net/netfilter/nf_conntrack_seqadj.h
index 0a10b50537ae..883c414b768e 100644
--- a/include/net/netfilter/nf_conntrack_seqadj.h
+++ b/include/net/netfilter/nf_conntrack_seqadj.h
@@ -42,7 +42,4 @@ int nf_ct_seq_adjust(struct sk_buff *skb, struct nf_conn *ct,
 		     enum ip_conntrack_info ctinfo, unsigned int protoff);
 s32 nf_ct_seq_offset(const struct nf_conn *ct, enum ip_conntrack_dir, u32 seq);
 
-int nf_conntrack_seqadj_init(void);
-void nf_conntrack_seqadj_fini(void);
-
 #endif /* _NF_CONNTRACK_SEQADJ_H */
diff --git a/include/net/netfilter/nf_conntrack_timeout.h b/include/net/netfilter/nf_conntrack_timeout.h
index 659b0ea25b4d..db507e4a65bb 100644
--- a/include/net/netfilter/nf_conntrack_timeout.h
+++ b/include/net/netfilter/nf_conntrack_timeout.h
@@ -89,23 +89,11 @@ static inline unsigned int *nf_ct_timeout_lookup(const struct nf_conn *ct)
 }
 
 #ifdef CONFIG_NF_CONNTRACK_TIMEOUT
-int nf_conntrack_timeout_init(void);
-void nf_conntrack_timeout_fini(void);
 void nf_ct_untimeout(struct net *net, struct nf_ct_timeout *timeout);
 int nf_ct_set_timeout(struct net *net, struct nf_conn *ct, u8 l3num, u8 l4num,
 		      const char *timeout_name);
 void nf_ct_destroy_timeout(struct nf_conn *ct);
 #else
-static inline int nf_conntrack_timeout_init(void)
-{
-        return 0;
-}
-
-static inline void nf_conntrack_timeout_fini(void)
-{
-        return;
-}
-
 static inline int nf_ct_set_timeout(struct net *net, struct nf_conn *ct,
 				    u8 l3num, u8 l4num,
 				    const char *timeout_name)
diff --git a/include/net/netfilter/nf_conntrack_timestamp.h b/include/net/netfilter/nf_conntrack_timestamp.h
index 820ea34b6029..57138d974a9f 100644
--- a/include/net/netfilter/nf_conntrack_timestamp.h
+++ b/include/net/netfilter/nf_conntrack_timestamp.h
@@ -40,21 +40,8 @@ struct nf_conn_tstamp *nf_ct_tstamp_ext_add(struct nf_conn *ct, gfp_t gfp)
 
 #ifdef CONFIG_NF_CONNTRACK_TIMESTAMP
 void nf_conntrack_tstamp_pernet_init(struct net *net);
-
-int nf_conntrack_tstamp_init(void);
-void nf_conntrack_tstamp_fini(void);
 #else
 static inline void nf_conntrack_tstamp_pernet_init(struct net *net) {}
-
-static inline int nf_conntrack_tstamp_init(void)
-{
-	return 0;
-}
-
-static inline void nf_conntrack_tstamp_fini(void)
-{
-	return;
-}
 #endif /* CONFIG_NF_CONNTRACK_TIMESTAMP */
 
 #endif /* _NF_CONNTRACK_TSTAMP_H */
-- 
cgit v1.2.3


From 7e367b06f16b3d22e884af55117601d2a73fca03 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Wed, 2 Feb 2022 10:49:39 +0200
Subject: cfg80211: fix -Wcast-qual warnings

When enabling -Wcast-qual e.g. via W=3, we get a lot of
warnings from this file, whenever it's included. Since
the fixes are simple, just do that.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
Link: https://lore.kernel.org/r/iwlwifi.20220202104617.6a1d52213019.I92d82e7251cf712faa43fd09db3142327a3bce3d@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'include/net')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index d19e48f9fdc6..f6db085ff3df 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -2604,7 +2604,7 @@ const struct element *ieee80211_bss_get_elem(struct cfg80211_bss *bss, u8 id);
  */
 static inline const u8 *ieee80211_bss_get_ie(struct cfg80211_bss *bss, u8 id)
 {
-	return (void *)ieee80211_bss_get_elem(bss, id);
+	return (const void *)ieee80211_bss_get_elem(bss, id);
 }
 
 
@@ -5970,9 +5970,9 @@ cfg80211_find_ie_match(u8 eid, const u8 *ies, unsigned int len,
 		    (!match_len && match_offset)))
 		return NULL;
 
-	return (void *)cfg80211_find_elem_match(eid, ies, len,
-						match, match_len,
-						match_offset ?
+	return (const void *)cfg80211_find_elem_match(eid, ies, len,
+						      match, match_len,
+						      match_offset ?
 							match_offset - 2 : 0);
 }
 
@@ -6099,7 +6099,7 @@ static inline const u8 *
 cfg80211_find_vendor_ie(unsigned int oui, int oui_type,
 			const u8 *ies, unsigned int len)
 {
-	return (void *)cfg80211_find_vendor_elem(oui, oui_type, ies, len);
+	return (const void *)cfg80211_find_vendor_elem(oui, oui_type, ies, len);
 }
 
 /**
-- 
cgit v1.2.3


From 5beb53d6ba4f39743d057e756db22bd8c079fc21 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Wed, 2 Feb 2022 10:49:40 +0200
Subject: ieee80211: radiotap: fix -Wcast-qual warnings

When enabling -Wcast-qual e.g. via W=3, we get a lot of
warnings from this file, whenever it's included. Since
the fixes are simple, just do that.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
Link: https://lore.kernel.org/r/iwlwifi.20220202104617.cc733aeb1a18.I03396e1bf7a1af364cbd0916037f65d800035039@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/ieee80211_radiotap.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/net')

diff --git a/include/net/ieee80211_radiotap.h b/include/net/ieee80211_radiotap.h
index 11630351c978..598f53d2a3a0 100644
--- a/include/net/ieee80211_radiotap.h
+++ b/include/net/ieee80211_radiotap.h
@@ -1,6 +1,6 @@
 /*
  * Copyright (c) 2017		Intel Deutschland GmbH
- * Copyright (c) 2018-2019	Intel Corporation
+ * Copyright (c) 2018-2019, 2021 Intel Corporation
  *
  * Permission to use, copy, modify, and/or distribute this software for any
  * purpose with or without fee is hereby granted, provided that the above
@@ -365,7 +365,7 @@ enum ieee80211_radiotap_zero_len_psdu_type {
  */
 static inline u16 ieee80211_get_radiotap_len(const char *data)
 {
-	struct ieee80211_radiotap_header *hdr = (void *)data;
+	const struct ieee80211_radiotap_header *hdr = (const void *)data;
 
 	return get_unaligned_le16(&hdr->it_len);
 }
-- 
cgit v1.2.3


From ea5907db2a9ccf37fdb6d1e67bcb620c1fea10f8 Mon Sep 17 00:00:00 2001
From: Avraham Stern <avraham.stern@intel.com>
Date: Wed, 2 Feb 2022 10:49:47 +0200
Subject: mac80211: fix struct ieee80211_tx_info size

The size of the status_driver_data field was not adjusted when
the is_valid_ack_signal field was added.
Since the size of struct ieee80211_tx_info is limited, replace
the is_valid_ack_signal field with a flags field, and adjust the
struct size accordingly.

Signed-off-by: Avraham Stern <avraham.stern@intel.com>
Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
Link: https://lore.kernel.org/r/iwlwifi.20220202104617.0ff363d4fa56.I45792c0187034a6d0e1c99a7db741996ef7caba3@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

(limited to 'include/net')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index c50221d7e82c..bd6912d0292b 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -7,7 +7,7 @@
  * Copyright 2007-2010	Johannes Berg <johannes@sipsolutions.net>
  * Copyright 2013-2014  Intel Mobile Communications GmbH
  * Copyright (C) 2015 - 2017 Intel Deutschland GmbH
- * Copyright (C) 2018 - 2021 Intel Corporation
+ * Copyright (C) 2018 - 2022 Intel Corporation
  */
 
 #ifndef MAC80211_H
@@ -883,6 +883,17 @@ enum mac80211_tx_control_flags {
 	IEEE80211_TX_CTRL_DONT_REORDER		= BIT(8),
 };
 
+/**
+ * enum mac80211_tx_status_flags - flags to describe transmit status
+ *
+ * @IEEE80211_TX_STATUS_ACK_SIGNAL_VALID: ACK signal is valid
+ *
+ * These flags are used in tx_info->status.flags.
+ */
+enum mac80211_tx_status_flags {
+	IEEE80211_TX_STATUS_ACK_SIGNAL_VALID = BIT(0),
+};
+
 /*
  * This definition is used as a mask to clear all temporary flags, which are
  * set by the tx handlers for each transmission attempt by the mac80211 stack.
@@ -1046,7 +1057,7 @@ ieee80211_rate_get_vht_nss(const struct ieee80211_tx_rate *rate)
  * @status.antenna: (legacy, kept only for iwlegacy)
  * @status.tx_time: airtime consumed for transmission; note this is only
  *	used for WMM AC, not for airtime fairness
- * @status.is_valid_ack_signal: ACK signal is valid
+ * @status.flags: status flags, see &enum mac80211_tx_status_flags
  * @status.status_driver_data: driver use area
  * @ack: union part for pure ACK data
  * @ack.cookie: cookie for the ACK
@@ -1099,8 +1110,8 @@ struct ieee80211_tx_info {
 			u8 ampdu_len;
 			u8 antenna;
 			u16 tx_time;
-			bool is_valid_ack_signal;
-			void *status_driver_data[19 / sizeof(void *)];
+			u8 flags;
+			void *status_driver_data[18 / sizeof(void *)];
 		} status;
 		struct {
 			struct ieee80211_tx_rate driver_rates[
-- 
cgit v1.2.3


From c78b8b20e34920231eda02fb40c7aca7d88be837 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Thu, 3 Feb 2022 15:12:40 -0800
Subject: net: don't include ndisc.h from ipv6.h

Nothing in ipv6.h needs ndisc.h, drop it.

Link: https://lore.kernel.org/r/20220203043457.2222388-1-kuba@kernel.org
Acked-by: Jeremy Kerr <jk@codeconstruct.com.au>
Acked-by: Stefan Schmidt <stefan@datenfreihafen.org>
Link: https://lore.kernel.org/r/20220203231240.2297588-1-kuba@kernel.org
Reviewed-by: David Ahern <dsahern@kernel.org>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/ipv6.h      | 1 -
 include/net/ipv6_frag.h | 1 +
 2 files changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 082f30256f59..cda1f205f391 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -15,7 +15,6 @@
 #include <linux/refcount.h>
 #include <linux/jump_label_ratelimit.h>
 #include <net/if_inet6.h>
-#include <net/ndisc.h>
 #include <net/flow.h>
 #include <net/flow_dissector.h>
 #include <net/snmp.h>
diff --git a/include/net/ipv6_frag.h b/include/net/ipv6_frag.h
index 0a4779175a52..5052c66e22d2 100644
--- a/include/net/ipv6_frag.h
+++ b/include/net/ipv6_frag.h
@@ -1,6 +1,7 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 #ifndef _IPV6_FRAG_H
 #define _IPV6_FRAG_H
+#include <linux/icmpv6.h>
 #include <linux/kernel.h>
 #include <net/addrconf.h>
 #include <net/ipv6.h>
-- 
cgit v1.2.3


From 35d39fecbc242150af5587506e58ec1f8541fb68 Mon Sep 17 00:00:00 2001
From: Paul Blakey <paulb@nvidia.com>
Date: Thu, 3 Feb 2022 10:44:30 +0200
Subject: net/sched: Enable tc skb ext allocation on chain miss only when
 needed

Currently tc skb extension is used to send miss info from
tc to ovs datapath module, and driver to tc. For the tc to ovs
miss it is currently always allocated even if it will not
be used by ovs datapath (as it depends on a requested feature).

Export the static key which is used by openvswitch module to
guard this code path as well, so it will be skipped if ovs
datapath doesn't need it. Enable this code path once
ovs datapath needs it.

Signed-off-by: Paul Blakey <paulb@nvidia.com>
Reviewed-by: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/pkt_cls.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'include/net')

diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index 676cb8ea9e15..a3b57a93228a 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -1028,4 +1028,15 @@ struct tc_fifo_qopt_offload {
 	};
 };
 
+#ifdef CONFIG_NET_CLS_ACT
+DECLARE_STATIC_KEY_FALSE(tc_skb_ext_tc);
+void tc_skb_ext_tc_enable(void);
+void tc_skb_ext_tc_disable(void);
+#define tc_skb_ext_tc_enabled() static_branch_unlikely(&tc_skb_ext_tc)
+#else /* CONFIG_NET_CLS_ACT */
+static inline void tc_skb_ext_tc_enable(void) { }
+static inline void tc_skb_ext_tc_disable(void) { }
+#define tc_skb_ext_tc_enabled() false
+#endif
+
 #endif
-- 
cgit v1.2.3


From de5a1f3ce4c862150dc442530dba19e1d1dc6bc2 Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Fri, 4 Feb 2022 12:28:37 +0100
Subject: net: gro: minor optimization for dev_gro_receive()

While inspecting some perf report, I noticed that the compiler
emits suboptimal code for the napi CB initialization, fetching
and storing multiple times the memory for flags bitfield.
This is with gcc 10.3.1, but I observed the same with older compiler
versions.

We can help the compiler to do a nicer work clearing several
fields at once using an u32 alias. The generated code is quite
smaller, with the same number of conditional.

Before:
objdump -t net/core/gro.o | grep " F .text"
0000000000000bb0 l     F .text	0000000000000357 dev_gro_receive

After:
0000000000000bb0 l     F .text	000000000000033c dev_gro_receive

v1  -> v2:
 - use struct_group (Alexander and Alex)

RFC -> v1:
 - use __struct_group to delimit the zeroed area (Alexander)

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/gro.h | 52 ++++++++++++++++++++++++++++------------------------
 1 file changed, 28 insertions(+), 24 deletions(-)

(limited to 'include/net')

diff --git a/include/net/gro.h b/include/net/gro.h
index 8f75802d50fd..a765fedda5c4 100644
--- a/include/net/gro.h
+++ b/include/net/gro.h
@@ -29,46 +29,50 @@ struct napi_gro_cb {
 	/* Number of segments aggregated. */
 	u16	count;
 
-	/* Start offset for remote checksum offload */
-	u16	gro_remcsum_start;
+	/* Used in ipv6_gro_receive() and foo-over-udp */
+	u16	proto;
 
 	/* jiffies when first packet was created/queued */
 	unsigned long age;
 
-	/* Used in ipv6_gro_receive() and foo-over-udp */
-	u16	proto;
+	/* portion of the cb set to zero at every gro iteration */
+	struct_group(zeroed,
+
+		/* Start offset for remote checksum offload */
+		u16	gro_remcsum_start;
 
-	/* This is non-zero if the packet may be of the same flow. */
-	u8	same_flow:1;
+		/* This is non-zero if the packet may be of the same flow. */
+		u8	same_flow:1;
 
-	/* Used in tunnel GRO receive */
-	u8	encap_mark:1;
+		/* Used in tunnel GRO receive */
+		u8	encap_mark:1;
 
-	/* GRO checksum is valid */
-	u8	csum_valid:1;
+		/* GRO checksum is valid */
+		u8	csum_valid:1;
 
-	/* Number of checksums via CHECKSUM_UNNECESSARY */
-	u8	csum_cnt:3;
+		/* Number of checksums via CHECKSUM_UNNECESSARY */
+		u8	csum_cnt:3;
 
-	/* Free the skb? */
-	u8	free:2;
+		/* Free the skb? */
+		u8	free:2;
 #define NAPI_GRO_FREE		  1
 #define NAPI_GRO_FREE_STOLEN_HEAD 2
 
-	/* Used in foo-over-udp, set in udp[46]_gro_receive */
-	u8	is_ipv6:1;
+		/* Used in foo-over-udp, set in udp[46]_gro_receive */
+		u8	is_ipv6:1;
 
-	/* Used in GRE, set in fou/gue_gro_receive */
-	u8	is_fou:1;
+		/* Used in GRE, set in fou/gue_gro_receive */
+		u8	is_fou:1;
 
-	/* Used to determine if flush_id can be ignored */
-	u8	is_atomic:1;
+		/* Used to determine if flush_id can be ignored */
+		u8	is_atomic:1;
 
-	/* Number of gro_receive callbacks this packet already went through */
-	u8 recursion_counter:4;
+		/* Number of gro_receive callbacks this packet already went through */
+		u8 recursion_counter:4;
 
-	/* GRO is done by frag_list pointer chaining. */
-	u8	is_flist:1;
+		/* GRO is done by frag_list pointer chaining. */
+		u8	is_flist:1;
+	);
 
 	/* used to support CHECKSUM_COMPLETE for tunneling protocols */
 	__wsum	csum;
-- 
cgit v1.2.3


From 9c1be1935fb68b2413796cdc03d019b8cf35ab51 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sat, 5 Feb 2022 09:01:25 -0800
Subject: net: initialize init_net earlier

While testing a patch that will follow later
("net: add netns refcount tracker to struct nsproxy")
I found that devtmpfs_init() was called before init_net
was initialized.

This is a bug, because devtmpfs_setup() calls
ksys_unshare(CLONE_NEWNS);

This has the effect of increasing init_net refcount,
which will be later overwritten to 1, as part of setup_net(&init_net)

We had too many prior patches [1] trying to work around the root cause.

Really, make sure init_net is in BSS section, and that net_ns_init()
is called earlier at boot time.

Note that another patch ("vfs: add netns refcount tracker
to struct fs_context") also will need net_ns_init() being called
before vfs_caches_init()

As a bonus, this patch saves around 4KB in .data section.

[1]

f8c46cb39079 ("netns: do not call pernet ops for not yet set up init_net namespace")
b5082df8019a ("net: Initialise init_net.count to 1")
734b65417b24 ("net: Statically initialize init_net.dev_base_head")

v2: fixed a build error reported by kernel build bots (CONFIG_NET=n)

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/net_namespace.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/net')

diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index 5b61c462e534..374cc7b260fc 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -513,4 +513,10 @@ static inline void fnhe_genid_bump(struct net *net)
 	atomic_inc(&net->fnhe_genid);
 }
 
+#ifdef CONFIG_NET
+void net_ns_init(void);
+#else
+static inline void net_ns_init(void) {}
+#endif
+
 #endif /* __NET_NET_NAMESPACE_H */
-- 
cgit v1.2.3


From a410a0cf98854a698a519bfbeb604145da384c0e Mon Sep 17 00:00:00 2001
From: Guillaume Nault <gnault@redhat.com>
Date: Fri, 4 Feb 2022 14:58:11 +0100
Subject: ipv6: Define dscp_t and stop taking ECN bits into account in
 fib6-rules
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Define a dscp_t type and its appropriate helpers that ensure ECN bits
are not taken into account when handling DSCP.

Use this new type to replace the tclass field of struct fib6_rule, so
that fib6-rules don't get influenced by ECN bits anymore.

Before this patch, fib6-rules didn't make any distinction between the
DSCP and ECN bits. Therefore, rules specifying a DSCP (tos or dsfield
options in iproute2) stopped working as soon a packets had at least one
of its ECN bits set (as a work around one could create four rules for
each DSCP value to match, one for each possible ECN value).

After this patch fib6-rules only compare the DSCP bits. ECN doesn't
influence the result anymore. Also, fib6-rules now must have the ECN
bits cleared or they will be rejected.

Signed-off-by: Guillaume Nault <gnault@redhat.com>
Acked-by: David Ahern <dsahern@kernel.org>
Reviewed-by: Toke Høiland-Jørgensen <toke@redhat.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/inet_dscp.h | 57 +++++++++++++++++++++++++++++++++++++++++++++++++
 include/net/ipv6.h      |  6 ++++++
 2 files changed, 63 insertions(+)
 create mode 100644 include/net/inet_dscp.h

(limited to 'include/net')

diff --git a/include/net/inet_dscp.h b/include/net/inet_dscp.h
new file mode 100644
index 000000000000..72f250dffada
--- /dev/null
+++ b/include/net/inet_dscp.h
@@ -0,0 +1,57 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * inet_dscp.h: helpers for handling differentiated services codepoints (DSCP)
+ *
+ * DSCP is defined in RFC 2474:
+ *
+ *        0   1   2   3   4   5   6   7
+ *      +---+---+---+---+---+---+---+---+
+ *      |         DSCP          |  CU   |
+ *      +---+---+---+---+---+---+---+---+
+ *
+ *        DSCP: differentiated services codepoint
+ *        CU:   currently unused
+ *
+ * The whole DSCP + CU bits form the DS field.
+ * The DS field is also commonly called TOS or Traffic Class (for IPv6).
+ *
+ * Note: the CU bits are now used for Explicit Congestion Notification
+ *       (RFC 3168).
+ */
+
+#ifndef _INET_DSCP_H
+#define _INET_DSCP_H
+
+#include <linux/types.h>
+
+/* Special type for storing DSCP values.
+ *
+ * A dscp_t variable stores a DS field with the CU (ECN) bits cleared.
+ * Using dscp_t allows to strictly separate DSCP and ECN bits, thus avoiding
+ * bugs where ECN bits are erroneously taken into account during FIB lookups
+ * or policy routing.
+ *
+ * Note: to get the real DSCP value contained in a dscp_t variable one would
+ * have to do a bit shift after calling inet_dscp_to_dsfield(). We could have
+ * a helper for that, but there's currently no users.
+ */
+typedef u8 __bitwise dscp_t;
+
+#define INET_DSCP_MASK 0xfc
+
+static inline dscp_t inet_dsfield_to_dscp(__u8 dsfield)
+{
+	return (__force dscp_t)(dsfield & INET_DSCP_MASK);
+}
+
+static inline __u8 inet_dscp_to_dsfield(dscp_t dscp)
+{
+	return (__force __u8)dscp;
+}
+
+static inline bool inet_validate_dscp(__u8 val)
+{
+	return !(val & ~INET_DSCP_MASK);
+}
+
+#endif /* _INET_DSCP_H */
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index cda1f205f391..f693784e1419 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -17,6 +17,7 @@
 #include <net/if_inet6.h>
 #include <net/flow.h>
 #include <net/flow_dissector.h>
+#include <net/inet_dscp.h>
 #include <net/snmp.h>
 #include <net/netns/hash.h>
 
@@ -974,6 +975,11 @@ static inline u8 ip6_tclass(__be32 flowinfo)
 	return ntohl(flowinfo & IPV6_TCLASS_MASK) >> IPV6_TCLASS_SHIFT;
 }
 
+static inline dscp_t ip6_dscp(__be32 flowinfo)
+{
+	return inet_dsfield_to_dscp(ip6_tclass(flowinfo));
+}
+
 static inline __be32 ip6_make_flowinfo(unsigned int tclass, __be32 flowlabel)
 {
 	return htonl(tclass << IPV6_TCLASS_SHIFT) | flowlabel;
-- 
cgit v1.2.3


From f55fbb6afb8d701e3185e31e73f5ea9503a66744 Mon Sep 17 00:00:00 2001
From: Guillaume Nault <gnault@redhat.com>
Date: Fri, 4 Feb 2022 14:58:16 +0100
Subject: ipv4: Reject routes specifying ECN bits in rtm_tos
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Use the new dscp_t type to replace the fc_tos field of fib_config, to
ensure IPv4 routes aren't influenced by ECN bits when configured with
non-zero rtm_tos.

Before this patch, IPv4 routes specifying an rtm_tos with some of the
ECN bits set were accepted. However they wouldn't work (never match) as
IPv4 normally clears the ECN bits with IPTOS_RT_MASK before doing a FIB
lookup (although a few buggy code paths don't).

After this patch, IPv4 routes specifying an rtm_tos with any ECN bit
set is rejected.

Note: IPv6 routes ignore rtm_tos altogether, any rtm_tos is accepted,
but treated as if it were 0.

Signed-off-by: Guillaume Nault <gnault@redhat.com>
Acked-by: David Ahern <dsahern@kernel.org>
Reviewed-by: Toke Høiland-Jørgensen <toke@redhat.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/ip_fib.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index c4297704bbcb..6a82bcb8813b 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -17,6 +17,7 @@
 #include <linux/rcupdate.h>
 #include <net/fib_notifier.h>
 #include <net/fib_rules.h>
+#include <net/inet_dscp.h>
 #include <net/inetpeer.h>
 #include <linux/percpu.h>
 #include <linux/notifier.h>
@@ -24,7 +25,7 @@
 
 struct fib_config {
 	u8			fc_dst_len;
-	u8			fc_tos;
+	dscp_t			fc_dscp;
 	u8			fc_protocol;
 	u8			fc_scope;
 	u8			fc_type;
-- 
cgit v1.2.3


From 21a216a8fc630161e69eaf02cbebdd1816ff1a13 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 7 Feb 2022 20:50:28 -0800
Subject: ipv6/addrconf: allocate a per netns hash table

Add a per netns hash table and a dedicated spinlock,
first step to get rid of the global inet6_addr_lst[] one.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/netns/ipv6.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/net')

diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h
index 30cdfc4e1615..755f12001c8b 100644
--- a/include/net/netns/ipv6.h
+++ b/include/net/netns/ipv6.h
@@ -92,6 +92,10 @@ struct netns_ipv6 {
 	struct sock             *tcp_sk;
 	struct sock             *igmp_sk;
 	struct sock		*mc_autojoin_sk;
+
+	struct hlist_head	*inet6_addr_lst;
+	spinlock_t		addrconf_hash_lock;
+
 #ifdef CONFIG_IPV6_MROUTE
 #ifndef CONFIG_IPV6_MROUTE_MULTIPLE_TABLES
 	struct mr_table		*mrt6;
-- 
cgit v1.2.3


From 8805d13ff1b2bef6a7bb8a005d2441763286dd7a Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 7 Feb 2022 20:50:29 -0800
Subject: ipv6/addrconf: use one delayed work per netns

Next step for using per netns inet6_addr_lst
is to have per netns work item to ultimately
call addrconf_verify_rtnl() and addrconf_verify()
with a new 'struct net*' argument.

Everything is still using the global inet6_addr_lst[] table.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/netns/ipv6.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/net')

diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h
index 755f12001c8b..d145f1966682 100644
--- a/include/net/netns/ipv6.h
+++ b/include/net/netns/ipv6.h
@@ -95,6 +95,7 @@ struct netns_ipv6 {
 
 	struct hlist_head	*inet6_addr_lst;
 	spinlock_t		addrconf_hash_lock;
+	struct delayed_work	addr_chk_work;
 
 #ifdef CONFIG_IPV6_MROUTE
 #ifndef CONFIG_IPV6_MROUTE_MULTIPLE_TABLES
-- 
cgit v1.2.3


From c7d9a6751a5fcebc87feee9b999b40078ed9fb43 Mon Sep 17 00:00:00 2001
From: Luiz Angelo Daros de Luca <luizluca@gmail.com>
Date: Tue, 8 Feb 2022 02:32:10 -0300
Subject: net: dsa: typo in comment

Signed-off-by: Luiz Angelo Daros de Luca <luizluca@gmail.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Link: https://lore.kernel.org/r/20220208053210.14831-1-luizluca@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/dsa.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/dsa.h b/include/net/dsa.h
index ca8c14b547b4..fd1f62a6e0a8 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -1281,7 +1281,7 @@ module_exit(dsa_tag_driver_module_exit)
 /**
  * module_dsa_tag_drivers() - Helper macro for registering DSA tag
  * drivers
- * @__ops_array: Array of tag driver strucutres
+ * @__ops_array: Array of tag driver structures
  *
  * Helper macro for DSA tag drivers which do not do anything special
  * in module init/exit. Each module may only use this macro once, and
-- 
cgit v1.2.3


From 8dd8678e42b5a8bbd3e2c49cda76c743318c56b0 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Sat, 5 Feb 2022 13:00:04 +0100
Subject: netfilter: ecache: don't use nf_conn spinlock

For updating eache missed value we can use cmpxchg.
This also avoids need to disable BH.

kernel robot reported build failure on v1 because not all arches support
cmpxchg for u16, so extend this to u32.

This doesn't increase struct size, existing padding is used.

Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_conntrack_ecache.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/netfilter/nf_conntrack_ecache.h b/include/net/netfilter/nf_conntrack_ecache.h
index 16bcff809b18..6c4c490a3e34 100644
--- a/include/net/netfilter/nf_conntrack_ecache.h
+++ b/include/net/netfilter/nf_conntrack_ecache.h
@@ -21,10 +21,10 @@ enum nf_ct_ecache_state {
 
 struct nf_conntrack_ecache {
 	unsigned long cache;		/* bitops want long */
-	u16 missed;			/* missed events */
 	u16 ctmask;			/* bitmask of ct events to be delivered */
 	u16 expmask;			/* bitmask of expect events to be delivered */
 	enum nf_ct_ecache_state state:8;/* ecache state */
+	u32 missed;			/* missed events */
 	u32 portid;			/* netlink portid of destroyer */
 };
 
-- 
cgit v1.2.3


From 7afa38831aee2ca2f41b22747ed8545e1887aaa9 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Tue, 8 Feb 2022 12:29:47 +0100
Subject: netfilter: cttimeout: use option structure

Instead of two exported functions, export a single option structure.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_conntrack_timeout.h | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'include/net')

diff --git a/include/net/netfilter/nf_conntrack_timeout.h b/include/net/netfilter/nf_conntrack_timeout.h
index db507e4a65bb..3ea94f6f3844 100644
--- a/include/net/netfilter/nf_conntrack_timeout.h
+++ b/include/net/netfilter/nf_conntrack_timeout.h
@@ -108,8 +108,12 @@ static inline void nf_ct_destroy_timeout(struct nf_conn *ct)
 #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
 
 #ifdef CONFIG_NF_CONNTRACK_TIMEOUT
-extern struct nf_ct_timeout *(*nf_ct_timeout_find_get_hook)(struct net *net, const char *name);
-extern void (*nf_ct_timeout_put_hook)(struct nf_ct_timeout *timeout);
+struct nf_ct_timeout_hooks {
+	struct nf_ct_timeout *(*timeout_find_get)(struct net *net, const char *name);
+	void (*timeout_put)(struct nf_ct_timeout *timeout);
+};
+
+extern const struct nf_ct_timeout_hooks *nf_ct_timeout_hook;
 #endif
 
 #endif /* _NF_CONNTRACK_TIMEOUT_H */
-- 
cgit v1.2.3


From 23f68d462984bfda47c7bf663dca347e8e3df549 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Mon, 7 Feb 2022 19:25:08 +0100
Subject: netfilter: nft_cmp: optimize comparison for 16-bytes

Allow up to 16-byte comparisons with a new cmp fast version. Use two
64-bit words and calculate the mask representing the bits to be
compared. Make sure the comparison is 64-bit aligned and avoid
out-of-bound memory access on registers.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables_core.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include/net')

diff --git a/include/net/netfilter/nf_tables_core.h b/include/net/netfilter/nf_tables_core.h
index b6fb1fdff9b2..0ea7c55cea4d 100644
--- a/include/net/netfilter/nf_tables_core.h
+++ b/include/net/netfilter/nf_tables_core.h
@@ -42,6 +42,14 @@ struct nft_cmp_fast_expr {
 	bool			inv;
 };
 
+struct nft_cmp16_fast_expr {
+	struct nft_data		data;
+	struct nft_data		mask;
+	u8			sreg;
+	u8			len;
+	bool			inv;
+};
+
 struct nft_immediate_expr {
 	struct nft_data		data;
 	u8			dreg;
@@ -59,6 +67,7 @@ static inline u32 nft_cmp_fast_mask(unsigned int len)
 }
 
 extern const struct nft_expr_ops nft_cmp_fast_ops;
+extern const struct nft_expr_ops nft_cmp16_fast_ops;
 
 struct nft_payload {
 	enum nft_payload_bases	base:8;
-- 
cgit v1.2.3


From 8069b22d656f6e1922352bff90ab78e6fab73779 Mon Sep 17 00:00:00 2001
From: Jeremy Kerr <jk@codeconstruct.com.au>
Date: Wed, 9 Feb 2022 12:05:55 +0800
Subject: mctp: Add helper for address match checking

Currently, we have a couple of paths that check that an EID matches, or
the match value is MCTP_ADDR_ANY.

Rather than open coding this, add a little helper.

Signed-off-by: Jeremy Kerr <jk@codeconstruct.com.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/mctp.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/net')

diff --git a/include/net/mctp.h b/include/net/mctp.h
index 7e35ec79b909..706d329dd8e8 100644
--- a/include/net/mctp.h
+++ b/include/net/mctp.h
@@ -45,6 +45,11 @@ static inline bool mctp_address_ok(mctp_eid_t eid)
 	return eid >= 8 && eid < 255;
 }
 
+static inline bool mctp_address_matches(mctp_eid_t match, mctp_eid_t eid)
+{
+	return match == eid || match == MCTP_ADDR_ANY;
+}
+
 static inline struct mctp_hdr *mctp_hdr(struct sk_buff *skb)
 {
 	return (struct mctp_hdr *)skb_network_header(skb);
-- 
cgit v1.2.3


From 63ed1aab3d40aa61aaa66819bdce9377ac7f40fa Mon Sep 17 00:00:00 2001
From: Matt Johnston <matt@codeconstruct.com.au>
Date: Wed, 9 Feb 2022 12:05:57 +0800
Subject: mctp: Add SIOCMCTP{ALLOC,DROP}TAG ioctls for tag control

This change adds a couple of new ioctls for mctp sockets:
SIOCMCTPALLOCTAG and SIOCMCTPDROPTAG.  These ioctls provide facilities
for explicit allocation / release of tags, overriding the automatic
allocate-on-send/release-on-reply and timeout behaviours. This allows
userspace more control over messages that may not fit a simple
request/response model.

In order to indicate a pre-allocated tag to the sendmsg() syscall, we
introduce a new flag to the struct sockaddr_mctp.smctp_tag value:
MCTP_TAG_PREALLOC.

Additional changes from Jeremy Kerr <jk@codeconstruct.com.au>.

Contains a fix that was:
Reported-by: kernel test robot <lkp@intel.com>

Signed-off-by: Matt Johnston <matt@codeconstruct.com.au>
Signed-off-by: Jeremy Kerr <jk@codeconstruct.com.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/mctp.h | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/mctp.h b/include/net/mctp.h
index 706d329dd8e8..e80a4baf8379 100644
--- a/include/net/mctp.h
+++ b/include/net/mctp.h
@@ -126,7 +126,7 @@ struct mctp_sock {
  */
 struct mctp_sk_key {
 	mctp_eid_t	peer_addr;
-	mctp_eid_t	local_addr;
+	mctp_eid_t	local_addr; /* MCTP_ADDR_ANY for local owned tags */
 	__u8		tag; /* incoming tag match; invert TO for local */
 
 	/* we hold a ref to sk when set */
@@ -163,6 +163,12 @@ struct mctp_sk_key {
 	 */
 	unsigned long	dev_flow_state;
 	struct mctp_dev	*dev;
+
+	/* a tag allocated with SIOCMCTPALLOCTAG ioctl will not expire
+	 * automatically on timeout or response, instead SIOCMCTPDROPTAG
+	 * is used.
+	 */
+	bool		manual_alloc;
 };
 
 struct mctp_skb_cb {
@@ -239,6 +245,9 @@ int mctp_local_output(struct sock *sk, struct mctp_route *rt,
 		      struct sk_buff *skb, mctp_eid_t daddr, u8 req_tag);
 
 void mctp_key_unref(struct mctp_sk_key *key);
+struct mctp_sk_key *mctp_alloc_local_tag(struct mctp_sock *msk,
+					 mctp_eid_t daddr, mctp_eid_t saddr,
+					 bool manual, u8 *tagp);
 
 /* routing <--> device interface */
 unsigned int mctp_default_net(struct net *net);
-- 
cgit v1.2.3


From 731cddce6dd110fb2cdee34eddb48599e7251517 Mon Sep 17 00:00:00 2001
From: Miquel Raynal <miquel.raynal@bootlin.com>
Date: Tue, 1 Feb 2022 19:06:27 +0100
Subject: net: mac802154: Convert the symbol duration into nanoseconds

Tdsym is often given in the spec as pretty small numbers in microseconds
and hence was reflected in the code as symbol_duration and was stored as
a u8. Actually, for UWB PHYs, the symbol duration is given in
nanoseconds and are as precise as picoseconds. In order to handle better
these PHYs, change the type of symbol_duration to u32 and store this
value in nanoseconds.

All the users of this variable are updated in a mechanical way.

Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Acked-by: Alexander Aring <aahringo@redhat.com>
Link: https://lore.kernel.org/r/20220201180629.93410-3-miquel.raynal@bootlin.com
Signed-off-by: Stefan Schmidt <stefan@datenfreihafen.org>
---
 include/net/cfg802154.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/net')

diff --git a/include/net/cfg802154.h b/include/net/cfg802154.h
index 833672d6fbe4..92d4f178caca 100644
--- a/include/net/cfg802154.h
+++ b/include/net/cfg802154.h
@@ -203,8 +203,8 @@ struct wpan_phy {
 
 	/* PHY depended MAC PIB values */
 
-	/* 802.15.4 acronym: Tdsym in usec */
-	u8 symbol_duration;
+	/* 802.15.4 acronym: Tdsym in nsec */
+	u32 symbol_duration;
 	/* lifs and sifs periods timing */
 	u16 lifs_period;
 	u16 sifs_period;
-- 
cgit v1.2.3


From 781830c800ddd19566846c634de09fadf37eafde Mon Sep 17 00:00:00 2001
From: Miquel Raynal <miquel.raynal@bootlin.com>
Date: Tue, 1 Feb 2022 19:06:28 +0100
Subject: net: mac802154: Set durations automatically

As depicted in the IEEE 802.15.4 specification, modulation/bands are
tight to a number of page/channels so we can for most of them derive the
durations automatically.

The two locations that must call this new helper to set the variou
symbol durations are:
- when manually requesting a channel change though the netlink interface
- at PHY creation, once the device driver has set the default
  page/channel

If an information is missing, the symbol duration is not touched, a
debug message is eventually printed. This keeps the compatibility with
the unconverted drivers for which it was too complicated for me to find
their precise information. If they initially provided a symbol duration,
it would be kept. If they don't, the symbol duration value is left
untouched.

Once the symbol duration derived, the lifs and sifs durations are
updated as well.

Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Acked-by: Alexander Aring <aahringo@redhat.com>
Link: https://lore.kernel.org/r/20220201180629.93410-4-miquel.raynal@bootlin.com
Signed-off-by: Stefan Schmidt <stefan@datenfreihafen.org>
---
 include/net/cfg802154.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/net')

diff --git a/include/net/cfg802154.h b/include/net/cfg802154.h
index 92d4f178caca..85f9e8417688 100644
--- a/include/net/cfg802154.h
+++ b/include/net/cfg802154.h
@@ -415,4 +415,6 @@ static inline const char *wpan_phy_name(struct wpan_phy *phy)
 	return dev_name(&phy->dev);
 }
 
+void ieee802154_configure_durations(struct wpan_phy *phy);
+
 #endif /* __NET_CFG802154_H */
-- 
cgit v1.2.3


From ede6c39c4f9068cbeb4036448c45fff5393e0432 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 9 Feb 2022 18:59:32 -0800
Subject: net: make net->dev_unreg_count atomic

Having to acquire rtnl from netdev_run_todo() for every dismantled
device is not desirable when/if rtnl is under stress.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/net_namespace.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index 374cc7b260fc..c4f5601f6e32 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -63,7 +63,7 @@ struct net {
 						 */
 	spinlock_t		rules_mod_lock;
 
-	unsigned int		dev_unreg_count;
+	atomic_t		dev_unreg_count;
 
 	unsigned int		dev_base_seq;	/* protected by rtnl_mutex */
 	int			ifindex;
-- 
cgit v1.2.3


From f9496b7c1b48ce02cd17a3ee88b1e049c689a222 Mon Sep 17 00:00:00 2001
From: "D. Wythe" <alibuda@linux.alibaba.com>
Date: Thu, 10 Feb 2022 17:11:38 +0800
Subject: net/smc: Add global configure for handshake limitation by netlink

Although we can control SMC handshake limitation through socket options,
which means that applications who need it must modify their code. It's
quite troublesome for many existing applications. This patch modifies
the global default value of SMC handshake limitation through netlink,
providing a way to put constraint on handshake without modifies any code
for applications.

Suggested-by: Tony Lu <tonylu@linux.alibaba.com>
Signed-off-by: D. Wythe <alibuda@linux.alibaba.com>
Reviewed-by: Tony Lu <tonylu@linux.alibaba.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/netns/smc.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/net')

diff --git a/include/net/netns/smc.h b/include/net/netns/smc.h
index ea8a9cf2619b..47b166684fd8 100644
--- a/include/net/netns/smc.h
+++ b/include/net/netns/smc.h
@@ -12,5 +12,7 @@ struct netns_smc {
 	/* protect fback_rsn */
 	struct mutex			mutex_fback_rsn;
 	struct smc_stats_rsn		*fback_rsn;
+
+	bool				limit_smc_hs;	/* constraint on handshake */
 };
 #endif
-- 
cgit v1.2.3


From 2d4feb2c1ba728fe53d6f80705b2d7e5c5b666a5 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 10 Feb 2022 13:42:28 -0800
Subject: ipv6: get rid of net->ipv6.rt6_stats->fib_rt_uncache

This counter has never been visible, there is little point
trying to maintain it.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_fib.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'include/net')

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 40ae8f1b18e5..32c83bb68879 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -367,9 +367,8 @@ struct rt6_statistics {
 	__u32		fib_rt_cache;		/* cached rt entries in exception table */
 	__u32		fib_discarded_routes;	/* total number of routes delete */
 
-	/* The following stats are not protected by any lock */
+	/* The following stat is not protected by any lock */
 	atomic_t	fib_rt_alloc;		/* total number of routes alloced */
-	atomic_t	fib_rt_uncache;		/* rt entries in uncached list */
 };
 
 #define RTN_TL_ROOT	0x0001
-- 
cgit v1.2.3


From 8d23a54f5beea59b560855fb571e5d73d783e0b4 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Tue, 15 Feb 2022 19:02:12 +0200
Subject: net: bridge: switchdev: differentiate new VLANs from changed ones

br_switchdev_port_vlan_add() currently emits a SWITCHDEV_PORT_OBJ_ADD
event with a SWITCHDEV_OBJ_ID_PORT_VLAN for 2 distinct cases:

- a struct net_bridge_vlan got created
- an existing struct net_bridge_vlan was modified

This makes it impossible for switchdev drivers to properly balance
PORT_OBJ_ADD with PORT_OBJ_DEL events, so if we want to allow that to
happen, we must provide a way for drivers to distinguish between a
VLAN with changed flags and a new one.

Annotate struct switchdev_obj_port_vlan with a "bool changed" that
distinguishes the 2 cases above.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/switchdev.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/net')

diff --git a/include/net/switchdev.h b/include/net/switchdev.h
index d353793dfeb5..92cc763991e9 100644
--- a/include/net/switchdev.h
+++ b/include/net/switchdev.h
@@ -81,6 +81,13 @@ struct switchdev_obj_port_vlan {
 	struct switchdev_obj obj;
 	u16 flags;
 	u16 vid;
+	/* If set, the notifier signifies a change of one of the following
+	 * flags for a VLAN that already exists:
+	 * - BRIDGE_VLAN_INFO_PVID
+	 * - BRIDGE_VLAN_INFO_UNTAGGED
+	 * Entries with BRIDGE_VLAN_INFO_BRENTRY unset are not notified at all.
+	 */
+	bool changed;
 };
 
 #define SWITCHDEV_OBJ_PORT_VLAN(OBJ) \
-- 
cgit v1.2.3


From c4076cdd21f8d68a96f1e7124bd8915c7e31a474 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Tue, 15 Feb 2022 19:02:16 +0200
Subject: net: switchdev: introduce switchdev_handle_port_obj_{add,del} for
 foreign interfaces

The switchdev_handle_port_obj_add() helper is good for replicating a
port object on the lower interfaces of @dev, if that object was emitted
on a bridge, or on a bridge port that is a LAG.

However, drivers that use this helper limit themselves to a box from
which they can no longer intercept port objects notified on neighbor
ports ("foreign interfaces").

One such driver is DSA, where software bridging with foreign interfaces
such as standalone NICs or Wi-Fi APs is an important use case. There, a
VLAN installed on a neighbor bridge port roughly corresponds to a
forwarding VLAN installed on the DSA switch's CPU port.

To support this use case while also making use of the benefits of the
switchdev_handle_* replication helper for port objects, introduce a new
variant of these functions that crawls through the neighbor ports of
@dev, in search of potentially compatible switchdev ports that are
interested in the event.

The strategy is identical to switchdev_handle_fdb_event_to_device():
if @dev wasn't a switchdev interface, then go one step upper, and
recursively call this function on the bridge that this port belongs to.
At the next recursion step, __switchdev_handle_port_obj_add() will
iterate through the bridge's lower interfaces. Among those, some will be
switchdev interfaces, and one will be the original @dev that we came
from. To prevent infinite recursion, we must suppress reentry into the
original @dev, and just call the @add_cb for the switchdev_interfaces.

It looks like this:

                br0
               / | \
              /  |  \
             /   |   \
           swp0 swp1 eth0

1. __switchdev_handle_port_obj_add(eth0)
   -> check_cb(eth0) returns false
   -> eth0 has no lower interfaces
   -> eth0's bridge is br0
   -> switchdev_lower_dev_find(br0, check_cb, foreign_dev_check_cb))
      finds br0

2. __switchdev_handle_port_obj_add(br0)
   -> check_cb(br0) returns false
   -> netdev_for_each_lower_dev
      -> check_cb(swp0) returns true, so we don't skip this interface

3. __switchdev_handle_port_obj_add(swp0)
   -> check_cb(swp0) returns true, so we call add_cb(swp0)

(back to netdev_for_each_lower_dev from 2)
      -> check_cb(swp1) returns true, so we don't skip this interface

4. __switchdev_handle_port_obj_add(swp1)
   -> check_cb(swp1) returns true, so we call add_cb(swp1)

(back to netdev_for_each_lower_dev from 2)
      -> check_cb(eth0) returns false, so we skip this interface to
         avoid infinite recursion

Note: eth0 could have been a LAG, and we don't want to suppress the
recursion through its lowers if those exist, so when check_cb() returns
false, we still call switchdev_lower_dev_find() to estimate whether
there's anything worth a recursion beneath that LAG. Using check_cb()
and foreign_dev_check_cb(), switchdev_lower_dev_find() not only figures
out whether the lowers of the LAG are switchdev, but also whether they
actively offload the LAG or not (whether the LAG is "foreign" to the
switchdev interface or not).

The port_obj_info->orig_dev is preserved across recursive calls, so
switchdev drivers still know on which device was this notification
originally emitted.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/switchdev.h | 39 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)

(limited to 'include/net')

diff --git a/include/net/switchdev.h b/include/net/switchdev.h
index 92cc763991e9..c32e1c8f79ec 100644
--- a/include/net/switchdev.h
+++ b/include/net/switchdev.h
@@ -324,11 +324,26 @@ int switchdev_handle_port_obj_add(struct net_device *dev,
 			int (*add_cb)(struct net_device *dev, const void *ctx,
 				      const struct switchdev_obj *obj,
 				      struct netlink_ext_ack *extack));
+int switchdev_handle_port_obj_add_foreign(struct net_device *dev,
+			struct switchdev_notifier_port_obj_info *port_obj_info,
+			bool (*check_cb)(const struct net_device *dev),
+			bool (*foreign_dev_check_cb)(const struct net_device *dev,
+						     const struct net_device *foreign_dev),
+			int (*add_cb)(struct net_device *dev, const void *ctx,
+				      const struct switchdev_obj *obj,
+				      struct netlink_ext_ack *extack));
 int switchdev_handle_port_obj_del(struct net_device *dev,
 			struct switchdev_notifier_port_obj_info *port_obj_info,
 			bool (*check_cb)(const struct net_device *dev),
 			int (*del_cb)(struct net_device *dev, const void *ctx,
 				      const struct switchdev_obj *obj));
+int switchdev_handle_port_obj_del_foreign(struct net_device *dev,
+			struct switchdev_notifier_port_obj_info *port_obj_info,
+			bool (*check_cb)(const struct net_device *dev),
+			bool (*foreign_dev_check_cb)(const struct net_device *dev,
+						     const struct net_device *foreign_dev),
+			int (*del_cb)(struct net_device *dev, const void *ctx,
+				      const struct switchdev_obj *obj));
 
 int switchdev_handle_port_attr_set(struct net_device *dev,
 			struct switchdev_notifier_port_attr_info *port_attr_info,
@@ -447,6 +462,18 @@ switchdev_handle_port_obj_add(struct net_device *dev,
 	return 0;
 }
 
+static inline int switchdev_handle_port_obj_add_foreign(struct net_device *dev,
+			struct switchdev_notifier_port_obj_info *port_obj_info,
+			bool (*check_cb)(const struct net_device *dev),
+			bool (*foreign_dev_check_cb)(const struct net_device *dev,
+						     const struct net_device *foreign_dev),
+			int (*add_cb)(struct net_device *dev, const void *ctx,
+				      const struct switchdev_obj *obj,
+				      struct netlink_ext_ack *extack))
+{
+	return 0;
+}
+
 static inline int
 switchdev_handle_port_obj_del(struct net_device *dev,
 			struct switchdev_notifier_port_obj_info *port_obj_info,
@@ -457,6 +484,18 @@ switchdev_handle_port_obj_del(struct net_device *dev,
 	return 0;
 }
 
+static inline int
+switchdev_handle_port_obj_del_foreign(struct net_device *dev,
+			struct switchdev_notifier_port_obj_info *port_obj_info,
+			bool (*check_cb)(const struct net_device *dev),
+			bool (*foreign_dev_check_cb)(const struct net_device *dev,
+						     const struct net_device *foreign_dev),
+			int (*del_cb)(struct net_device *dev, const void *ctx,
+				      const struct switchdev_obj *obj))
+{
+	return 0;
+}
+
 static inline int
 switchdev_handle_port_attr_set(struct net_device *dev,
 			struct switchdev_notifier_port_attr_info *port_attr_info,
-- 
cgit v1.2.3


From 134ef2388e7f3271d13223decdb5e45b0f84489f Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Tue, 15 Feb 2022 19:02:17 +0200
Subject: net: dsa: add explicit support for host bridge VLANs

Currently, DSA programs VLANs on shared (DSA and CPU) ports each time it
does so on user ports. This is good for basic functionality but has
several limitations:

- the VLAN group which must reach the CPU may be radically different
  from the VLAN group that must be autonomously forwarded by the switch.
  In other words, the admin may want to isolate noisy stations and avoid
  traffic from them going to the control processor of the switch, where
  it would just waste useless cycles. The bridge already supports
  independent control of VLAN groups on bridge ports and on the bridge
  itself, and when VLAN-aware, it will drop packets in software anyway
  if their VID isn't added as a 'self' entry towards the bridge device.

- Replaying host FDB entries may depend, for some drivers like mv88e6xxx,
  on replaying the host VLANs as well. The 2 VLAN groups are
  approximately the same in most regular cases, but there are corner
  cases when timing matters, and DSA's approximation of replicating
  VLANs on shared ports simply does not work.

- If a user makes the bridge (implicitly the CPU port) join a VLAN by
  accident, there is no way for the CPU port to isolate itself from that
  noisy VLAN except by rebooting the system. This is because for each
  VLAN added on a user port, DSA will add it on shared ports too, but
  for each VLAN deletion on a user port, it will remain installed on
  shared ports, since DSA has no good indication of whether the VLAN is
  still in use or not.

Now that the bridge driver emits well-balanced SWITCHDEV_OBJ_ID_PORT_VLAN
addition and removal events, DSA has a simple and straightforward task
of separating the bridge port VLANs (these have an orig_dev which is a
DSA slave interface, or a LAG interface) from the host VLANs (these have
an orig_dev which is a bridge interface), and to keep a simple reference
count of each VID on each shared port.

Forwarding VLANs must be installed on the bridge ports and on all DSA
ports interconnecting them. We don't have a good view of the exact
topology, so we simply install forwarding VLANs on all DSA ports, which
is what has been done until now.

Host VLANs must be installed primarily on the dedicated CPU port of each
bridge port. More subtly, they must also be installed on upstream-facing
and downstream-facing DSA ports that are connecting the bridge ports and
the CPU. This ensures that the mv88e6xxx's problem (VID of host FDB
entry may be absent from VTU) is still addressed even if that switch is
in a cross-chip setup, and it has no local CPU port.

Therefore:
- user ports contain only bridge port (forwarding) VLANs, and no
  refcounting is necessary
- DSA ports contain both forwarding and host VLANs. Refcounting is
  necessary among these 2 types.
- CPU ports contain only host VLANs. Refcounting is also necessary.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dsa.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include/net')

diff --git a/include/net/dsa.h b/include/net/dsa.h
index fd1f62a6e0a8..85cb9aed4c51 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -312,6 +312,10 @@ struct dsa_port {
 	struct mutex		addr_lists_lock;
 	struct list_head	fdbs;
 	struct list_head	mdbs;
+
+	/* List of VLANs that CPU and DSA ports are members of. */
+	struct mutex		vlans_lock;
+	struct list_head	vlans;
 };
 
 /* TODO: ideally DSA ports would have a single dp->link_dp member,
@@ -332,6 +336,12 @@ struct dsa_mac_addr {
 	struct list_head list;
 };
 
+struct dsa_vlan {
+	u16 vid;
+	refcount_t refcount;
+	struct list_head list;
+};
+
 struct dsa_switch {
 	struct device *dev;
 
-- 
cgit v1.2.3


From 5cd5a8a3e2fb11b1c8a09f062c44c1e228ef987a Mon Sep 17 00:00:00 2001
From: Ilan Peer <ilan.peer@intel.com>
Date: Mon, 14 Feb 2022 17:29:53 +0100
Subject: cfg80211: Add data structures to capture EHT capabilities

And advertise EHT capabilities to user space when supported.

Signed-off-by: Ilan Peer <ilan.peer@intel.com>
Link: https://lore.kernel.org/r/20220214173004.6fb70658529f.I2413a37c8f7d2d6d638038a3d95360a3fce0114d@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h | 63 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 63 insertions(+)

(limited to 'include/net')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index f6db085ff3df..7bec15f605be 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -360,6 +360,48 @@ struct ieee80211_sta_he_cap {
 	u8 ppe_thres[IEEE80211_HE_PPE_THRES_MAX_LEN];
 };
 
+/**
+ * struct ieee80211_eht_mcs_nss_supp - EHT max supported NSS per MCS
+ *
+ * See P802.11be_D1.3 Table 9-401k - "Subfields of the Supported EHT-MCS
+ * and NSS Set field"
+ *
+ * @only_20mhz: MCS/NSS support for 20 MHz-only STA.
+ * @bw._80: MCS/NSS support for BW <= 80 MHz
+ * @bw._160: MCS/NSS support for BW = 160 MHz
+ * @bw._320: MCS/NSS support for BW = 320 MHz
+ */
+struct ieee80211_eht_mcs_nss_supp {
+	union {
+		struct ieee80211_eht_mcs_nss_supp_20mhz_only only_20mhz;
+		struct {
+			struct ieee80211_eht_mcs_nss_supp_bw _80;
+			struct ieee80211_eht_mcs_nss_supp_bw _160;
+			struct ieee80211_eht_mcs_nss_supp_bw _320;
+		} __packed bw;
+	} __packed;
+} __packed;
+
+#define IEEE80211_EHT_PPE_THRES_MAX_LEN		32
+
+/**
+ * struct ieee80211_sta_eht_cap - STA's EHT capabilities
+ *
+ * This structure describes most essential parameters needed
+ * to describe 802.11be EHT capabilities for a STA.
+ *
+ * @has_eht: true iff EHT data is valid.
+ * @eht_cap_elem: Fixed portion of the eht capabilities element.
+ * @eht_mcs_nss_supp: The supported NSS/MCS combinations.
+ * @eht_ppe_thres: Holds the PPE Thresholds data.
+ */
+struct ieee80211_sta_eht_cap {
+	bool has_eht;
+	struct ieee80211_eht_cap_elem_fixed eht_cap_elem;
+	struct ieee80211_eht_mcs_nss_supp eht_mcs_nss_supp;
+	u8 eht_ppe_thres[IEEE80211_EHT_PPE_THRES_MAX_LEN];
+};
+
 /**
  * struct ieee80211_sband_iftype_data - sband data per interface type
  *
@@ -379,6 +421,7 @@ struct ieee80211_sband_iftype_data {
 	u16 types_mask;
 	struct ieee80211_sta_he_cap he_cap;
 	struct ieee80211_he_6ghz_capa he_6ghz_capa;
+	struct ieee80211_sta_eht_cap eht_cap;
 	struct {
 		const u8 *data;
 		unsigned int len;
@@ -561,6 +604,26 @@ ieee80211_get_he_6ghz_capa(const struct ieee80211_supported_band *sband,
 	return data->he_6ghz_capa.capa;
 }
 
+/**
+ * ieee80211_get_eht_iftype_cap - return ETH capabilities for an sband's iftype
+ * @sband: the sband to search for the iftype on
+ * @iftype: enum nl80211_iftype
+ *
+ * Return: pointer to the struct ieee80211_sta_eht_cap, or NULL is none found
+ */
+static inline const struct ieee80211_sta_eht_cap *
+ieee80211_get_eht_iftype_cap(const struct ieee80211_supported_band *sband,
+			     enum nl80211_iftype iftype)
+{
+	const struct ieee80211_sband_iftype_data *data =
+		ieee80211_get_sband_iftype_data(sband, iftype);
+
+	if (data && data->eht_cap.has_eht)
+		return &data->eht_cap;
+
+	return NULL;
+}
+
 /**
  * wiphy_read_of_freq_limits - read frequency limits from device tree
  *
-- 
cgit v1.2.3


From 3743bec6120ae0748cb3fda6ff80a690117ef1f3 Mon Sep 17 00:00:00 2001
From: Jia Ding <quic_jiad@quicinc.com>
Date: Mon, 14 Feb 2022 17:29:54 +0100
Subject: cfg80211: Add support for EHT 320 MHz channel width

Add 320 MHz support in the channel def and center frequency validation
with compatible check.

Signed-off-by: Jia Ding <quic_jiad@quicinc.com>
Co-authored-by: Karthikeyan Periyasamy <quic_periyasa@quicinc.com>
Signed-off-by: Karthikeyan Periyasamy <quic_periyasa@quicinc.com>
Co-authored-by: Muna Sinada <quic_msinada@quicinc.com>
Signed-off-by: Muna Sinada <quic_msinada@quicinc.com>
Co-authored-by: Veerendranath Jakkam <quic_vjakkam@quicinc.com>
Signed-off-by: Veerendranath Jakkam <quic_vjakkam@quicinc.com>
Link: https://lore.kernel.org/r/1640163883-12696-5-git-send-email-quic_vjakkam@quicinc.com
Link: https://lore.kernel.org/r/20220214163009.175289-1-johannes@sipsolutions.net
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 7bec15f605be..a27f2d699dac 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -109,7 +109,11 @@ struct wiphy;
  *	on this channel.
  * @IEEE80211_CHAN_16MHZ: 16 MHz bandwidth is permitted
  *	on this channel.
- *
+ * @IEEE80211_CHAN_NO_320MHZ: If the driver supports 320 MHz on the band,
+ *	this flag indicates that a 320 MHz channel cannot use this
+ *	channel as the control or any of the secondary channels.
+ *	This may be due to the driver or due to regulatory bandwidth
+ *	restrictions.
  */
 enum ieee80211_channel_flags {
 	IEEE80211_CHAN_DISABLED		= 1<<0,
@@ -131,6 +135,7 @@ enum ieee80211_channel_flags {
 	IEEE80211_CHAN_4MHZ		= 1<<16,
 	IEEE80211_CHAN_8MHZ		= 1<<17,
 	IEEE80211_CHAN_16MHZ		= 1<<18,
+	IEEE80211_CHAN_NO_320MHZ	= 1<<19,
 };
 
 #define IEEE80211_CHAN_NO_HT40 \
-- 
cgit v1.2.3


From cfb14110acf87b4db62e07ba08a80429f1749f40 Mon Sep 17 00:00:00 2001
From: Veerendranath Jakkam <quic_vjakkam@quicinc.com>
Date: Mon, 14 Feb 2022 17:29:55 +0100
Subject: nl80211: add EHT MCS support

Add support for reporting and calculating EHT bitrates.

Signed-off-by: Veerendranath Jakkam <quic_vjakkam@quicinc.com>
Link: https://lore.kernel.org/r/1640163883-12696-7-git-send-email-quic_vjakkam@quicinc.com
Link: https://lore.kernel.org/r/20220214163009.175289-2-johannes@sipsolutions.net
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'include/net')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index a27f2d699dac..f35ffd81d213 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -1595,6 +1595,7 @@ int cfg80211_check_station_change(struct wiphy *wiphy,
  * @RATE_INFO_FLAGS_HE_MCS: HE MCS information
  * @RATE_INFO_FLAGS_EDMG: 60GHz MCS in EDMG mode
  * @RATE_INFO_FLAGS_EXTENDED_SC_DMG: 60GHz extended SC MCS
+ * @RATE_INFO_FLAGS_EHT_MCS: EHT MCS information
  */
 enum rate_info_flags {
 	RATE_INFO_FLAGS_MCS			= BIT(0),
@@ -1604,6 +1605,7 @@ enum rate_info_flags {
 	RATE_INFO_FLAGS_HE_MCS			= BIT(4),
 	RATE_INFO_FLAGS_EDMG			= BIT(5),
 	RATE_INFO_FLAGS_EXTENDED_SC_DMG		= BIT(6),
+	RATE_INFO_FLAGS_EHT_MCS			= BIT(7),
 };
 
 /**
@@ -1618,6 +1620,8 @@ enum rate_info_flags {
  * @RATE_INFO_BW_80: 80 MHz bandwidth
  * @RATE_INFO_BW_160: 160 MHz bandwidth
  * @RATE_INFO_BW_HE_RU: bandwidth determined by HE RU allocation
+ * @RATE_INFO_BW_320: 320 MHz bandwidth
+ * @RATE_INFO_BW_EHT_RU: bandwidth determined by EHT RU allocation
  */
 enum rate_info_bw {
 	RATE_INFO_BW_20 = 0,
@@ -1627,6 +1631,8 @@ enum rate_info_bw {
 	RATE_INFO_BW_80,
 	RATE_INFO_BW_160,
 	RATE_INFO_BW_HE_RU,
+	RATE_INFO_BW_320,
+	RATE_INFO_BW_EHT_RU,
 };
 
 /**
@@ -1644,6 +1650,9 @@ enum rate_info_bw {
  * @he_ru_alloc: HE RU allocation (from &enum nl80211_he_ru_alloc,
  *	only valid if bw is %RATE_INFO_BW_HE_RU)
  * @n_bonded_ch: In case of EDMG the number of bonded channels (1-4)
+ * @eht_gi: EHT guard interval (from &enum nl80211_eht_gi)
+ * @eht_ru_alloc: EHT RU allocation (from &enum nl80211_eht_ru_alloc,
+ *	only valid if bw is %RATE_INFO_BW_EHT_RU)
  */
 struct rate_info {
 	u8 flags;
@@ -1655,6 +1664,8 @@ struct rate_info {
 	u8 he_dcm;
 	u8 he_ru_alloc;
 	u8 n_bonded_ch;
+	u8 eht_gi;
+	u8 eht_ru_alloc;
 };
 
 /**
-- 
cgit v1.2.3


From 31846b657857e6a73d982604f36a34710d98902c Mon Sep 17 00:00:00 2001
From: Ilan Peer <ilan.peer@intel.com>
Date: Mon, 14 Feb 2022 17:29:57 +0100
Subject: cfg80211: add NO-EHT flag to regulatory

This may be necessary in some cases, add a flag and propagate
it, just like the NO-HE that already exists.

Signed-off-by: Ilan Peer <ilan.peer@intel.com>
[split off from a combined 320/no-EHT patch]
Link: https://lore.kernel.org/r/20220214173004.dbb85a7b86bb.Ifc1e2daac51c1cc5f895ccfb79faf5eaec3950ec@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/net')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index f35ffd81d213..5cfc483dece1 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -114,6 +114,7 @@ struct wiphy;
  *	channel as the control or any of the secondary channels.
  *	This may be due to the driver or due to regulatory bandwidth
  *	restrictions.
+ * @IEEE80211_CHAN_NO_EHT: EHT operation is not permitted on this channel.
  */
 enum ieee80211_channel_flags {
 	IEEE80211_CHAN_DISABLED		= 1<<0,
@@ -136,6 +137,7 @@ enum ieee80211_channel_flags {
 	IEEE80211_CHAN_8MHZ		= 1<<17,
 	IEEE80211_CHAN_16MHZ		= 1<<18,
 	IEEE80211_CHAN_NO_320MHZ	= 1<<19,
+	IEEE80211_CHAN_NO_EHT		= 1<<20,
 };
 
 #define IEEE80211_CHAN_NO_HT40 \
-- 
cgit v1.2.3


From ea05fd3581d32a0f1098657005c7a9b763798fe8 Mon Sep 17 00:00:00 2001
From: Ilan Peer <ilan.peer@intel.com>
Date: Mon, 14 Feb 2022 17:29:58 +0100
Subject: cfg80211: Support configuration of station EHT capabilities

Add attributes and some code bits to support userspace passing
in EHT capabilities of stations.

Signed-off-by: Ilan Peer <ilan.peer@intel.com>
Link: https://lore.kernel.org/r/20220214173004.ecf0b3ff9627.Icb4a5f2ec7b41d9008ac4cfc16c59baeb84793d3@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/net')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 5cfc483dece1..68713388b617 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -1487,6 +1487,8 @@ struct sta_txpwr {
  * @airtime_weight: airtime scheduler weight for this station
  * @txpwr: transmit power for an associated station
  * @he_6ghz_capa: HE 6 GHz Band capabilities of station
+ * @eht_capa: EHT capabilities of station
+ * @eht_capa_len: the length of the EHT capabilities
  */
 struct station_parameters {
 	const u8 *supported_rates;
@@ -1520,6 +1522,8 @@ struct station_parameters {
 	u16 airtime_weight;
 	struct sta_txpwr txpwr;
 	const struct ieee80211_he_6ghz_capa *he_6ghz_capa;
+	const struct ieee80211_eht_cap_elem *eht_capa;
+	u8 eht_capa_len;
 };
 
 /**
-- 
cgit v1.2.3


From 5dca295dd76756c7918ad7113fc4a3cf8262ed43 Mon Sep 17 00:00:00 2001
From: Ilan Peer <ilan.peer@intel.com>
Date: Mon, 14 Feb 2022 17:30:00 +0100
Subject: mac80211: Add initial support for EHT and 320 MHz channels

Add initial support for EHT and 320 MHz bandwidth in mac80211.

As a new IEEE80211_STA_RX_BW_320 is added to
enum ieee80211_sta_rx_bandwidth, update the drivers to avoid
compilation warnings.

Signed-off-by: Ilan Peer <ilan.peer@intel.com>
Link: https://lore.kernel.org/r/20220214173004.0f144cc0bba6.Iad18111264da87eed5fd7b017f0cc6e58c604e07@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/net')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index bd6912d0292b..586d3c26c8ac 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -2005,6 +2005,7 @@ enum ieee80211_sta_state {
  * @IEEE80211_STA_RX_BW_80: station can receive up to 80 MHz
  * @IEEE80211_STA_RX_BW_160: station can receive up to 160 MHz
  *	(including 80+80 MHz)
+ * @IEEE80211_STA_RX_BW_320: station can receive up to 320 MHz
  *
  * Implementation note: 20 must be zero to be initialized
  *	correctly, the values must be sorted.
@@ -2014,6 +2015,7 @@ enum ieee80211_sta_rx_bandwidth {
 	IEEE80211_STA_RX_BW_40,
 	IEEE80211_STA_RX_BW_80,
 	IEEE80211_STA_RX_BW_160,
+	IEEE80211_STA_RX_BW_320,
 };
 
 /**
-- 
cgit v1.2.3


From a1de64078bf7a3fed856fef5334132ba1963b683 Mon Sep 17 00:00:00 2001
From: Ilan Peer <ilan.peer@intel.com>
Date: Mon, 14 Feb 2022 17:30:02 +0100
Subject: mac80211: Handle station association response with EHT

When the association is an EHT association, parse the EHT
element from the association response and update the
station's EHT capabilities accordingly.

Signed-off-by: Ilan Peer <ilan.peer@intel.com>
Link: https://lore.kernel.org/r/20220214173004.f33574718755.I21182234c5303d9423eabd5eb997e7cf75f8e0c8@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/net')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 586d3c26c8ac..f118b8fe667a 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -636,6 +636,7 @@ struct ieee80211_fils_discovery {
  * @tx_pwr_env: transmit power envelope array of BSS.
  * @tx_pwr_env_num: number of @tx_pwr_env.
  * @pwr_reduction: power constraint of BSS.
+ * @eht_support: does this BSS support EHT
  */
 struct ieee80211_bss_conf {
 	const u8 *bssid;
@@ -710,6 +711,7 @@ struct ieee80211_bss_conf {
 	struct ieee80211_tx_pwr_env tx_pwr_env[IEEE80211_TPE_MAX_IE_COUNT];
 	u8 tx_pwr_env_num;
 	u8 pwr_reduction;
+	bool eht_support;
 };
 
 /**
@@ -2071,6 +2073,7 @@ struct ieee80211_sta_txpwr {
  * @vht_cap: VHT capabilities of this STA; restricted to our own capabilities
  * @he_cap: HE capabilities of this STA
  * @he_6ghz_capa: on 6 GHz, holds the HE 6 GHz band capabilities
+ * @eht_cap: EHT capabilities of this STA
  * @max_rx_aggregation_subframes: maximal amount of frames in a single AMPDU
  *	that this station is allowed to transmit to us.
  *	Can be modified by driver.
@@ -2111,6 +2114,7 @@ struct ieee80211_sta {
 	struct ieee80211_sta_vht_cap vht_cap;
 	struct ieee80211_sta_he_cap he_cap;
 	struct ieee80211_he_6ghz_capa he_6ghz_capa;
+	struct ieee80211_sta_eht_cap eht_cap;
 	u16 max_rx_aggregation_subframes;
 	bool wme;
 	u8 uapsd_queues;
-- 
cgit v1.2.3


From f8e9ce4a6e85067d7d7cfa89167f5ce5f0ec2a8a Mon Sep 17 00:00:00 2001
From: Matthieu Baerts <matthieu.baerts@tessares.net>
Date: Tue, 15 Feb 2022 18:11:24 -0800
Subject: mptcp: mptcp_parse_option is no longer exported

Options parsing in now done from mptcp_incoming_options().

mptcp_parse_option() has been removed from mptcp.h when CONFIG_MPTCP is
defined but not when it is not.

Fixes: cfde141ea3fa ("mptcp: move option parsing into mptcp_incoming_options()")
Acked-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Matthieu Baerts <matthieu.baerts@tessares.net>
Signed-off-by: Mat Martineau <mathew.j.martineau@linux.intel.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/mptcp.h | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'include/net')

diff --git a/include/net/mptcp.h b/include/net/mptcp.h
index a925349b4b89..0a3b0fb04a3b 100644
--- a/include/net/mptcp.h
+++ b/include/net/mptcp.h
@@ -217,12 +217,6 @@ static inline bool rsk_drop_req(const struct request_sock *req)
 	return false;
 }
 
-static inline void mptcp_parse_option(const struct sk_buff *skb,
-				      const unsigned char *ptr, int opsize,
-				      struct tcp_options_received *opt_rx)
-{
-}
-
 static inline bool mptcp_syn_options(struct sock *sk, const struct sk_buff *skb,
 				     unsigned int *size,
 				     struct mptcp_out_options *opts)
-- 
cgit v1.2.3


From 5224f79096170bf7b92cc8fe42a12f44b91e5f62 Mon Sep 17 00:00:00 2001
From: "Gustavo A. R. Silva" <gustavoars@kernel.org>
Date: Mon, 14 Feb 2022 19:11:44 -0600
Subject: treewide: Replace zero-length arrays with flexible-array members
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

There is a regular need in the kernel to provide a way to declare
having a dynamically sized set of trailing elements in a structure.
Kernel code should always use “flexible array members”[1] for these
cases. The older style of one-element or zero-length arrays should
no longer be used[2].

This code was transformed with the help of Coccinelle:
(next-20220214$ spatch --jobs $(getconf _NPROCESSORS_ONLN) --sp-file script.cocci --include-headers --dir . > output.patch)

@@
identifier S, member, array;
type T1, T2;
@@

struct S {
  ...
  T1 member;
  T2 array[
- 0
  ];
};

UAPI and wireless changes were intentionally excluded from this patch
and will be sent out separately.

[1] https://en.wikipedia.org/wiki/Flexible_array_member
[2] https://www.kernel.org/doc/html/v5.16/process/deprecated.html#zero-length-and-one-element-arrays

Link: https://github.com/KSPP/linux/issues/78
Reviewed-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Gustavo A. R. Silva <gustavoars@kernel.org>
---
 include/net/bluetooth/mgmt.h | 2 +-
 include/net/ioam6.h          | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/net')

diff --git a/include/net/bluetooth/mgmt.h b/include/net/bluetooth/mgmt.h
index 107b25deae68..9607ec289fd0 100644
--- a/include/net/bluetooth/mgmt.h
+++ b/include/net/bluetooth/mgmt.h
@@ -696,7 +696,7 @@ struct mgmt_cp_set_blocked_keys {
 #define MGMT_READ_CONTROLLER_CAP_SIZE	0
 struct mgmt_rp_read_controller_cap {
 	__le16   cap_len;
-	__u8     cap[0];
+	__u8     cap[];
 } __packed;
 
 #define MGMT_OP_READ_EXP_FEATURES_INFO	0x0049
diff --git a/include/net/ioam6.h b/include/net/ioam6.h
index 3f45ba37a2c6..781d2d8b2f29 100644
--- a/include/net/ioam6.h
+++ b/include/net/ioam6.h
@@ -35,7 +35,7 @@ struct ioam6_schema {
 	int len;
 	__be32 hdr;
 
-	u8 data[0];
+	u8 data[];
 };
 
 struct ioam6_pernet_data {
-- 
cgit v1.2.3


From d2b1d186ce2eac6b15d31db3e2750ee8e02bbe81 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Wed, 16 Feb 2022 21:37:26 +0200
Subject: net: dsa: delete unused exported symbols for ethtool PHY stats

Introduced in commit cf963573039a ("net: dsa: Allow providing PHY
statistics from CPU port"), it appears these were never used.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Link: https://lore.kernel.org/r/20220216193726.2926320-1-vladimir.oltean@nxp.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/dsa.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'include/net')

diff --git a/include/net/dsa.h b/include/net/dsa.h
index 1456313a1faa..c8626dec970c 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -1257,9 +1257,6 @@ static inline bool dsa_slave_dev_check(const struct net_device *dev)
 #endif
 
 netdev_tx_t dsa_enqueue_skb(struct sk_buff *skb, struct net_device *dev);
-int dsa_port_get_phy_strings(struct dsa_port *dp, uint8_t *data);
-int dsa_port_get_ethtool_phy_stats(struct dsa_port *dp, uint64_t *data);
-int dsa_port_get_phy_sset_count(struct dsa_port *dp);
 void dsa_port_phylink_mac_change(struct dsa_switch *ds, int port, bool up);
 
 struct dsa_tag_driver {
-- 
cgit v1.2.3


From 8467fadc115cb08bb1cbc7885cb7b7ef1871cae4 Mon Sep 17 00:00:00 2001
From: Gal Pressman <gal@nvidia.com>
Date: Thu, 17 Feb 2022 10:07:55 +0200
Subject: net: gro: Fix a 'directive in macro's argument list' sparse warning

Following the cited commit, sparse started complaining about:
../include/net/gro.h:58:1: warning: directive in macro's argument list
../include/net/gro.h:59:1: warning: directive in macro's argument list

Fix that by moving the defines out of the struct_group() macro.

Fixes: de5a1f3ce4c8 ("net: gro: minor optimization for dev_gro_receive()")
Reviewed-by: Maxim Mikityanskiy <maximmi@nvidia.com>
Signed-off-by: Gal Pressman <gal@nvidia.com>
Acked-by: Alexander Lobakin <alexandr.lobakin@intel.com>
Acked-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/gro.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'include/net')

diff --git a/include/net/gro.h b/include/net/gro.h
index a765fedda5c4..867656b0739c 100644
--- a/include/net/gro.h
+++ b/include/net/gro.h
@@ -35,6 +35,9 @@ struct napi_gro_cb {
 	/* jiffies when first packet was created/queued */
 	unsigned long age;
 
+/* Used in napi_gro_cb::free */
+#define NAPI_GRO_FREE             1
+#define NAPI_GRO_FREE_STOLEN_HEAD 2
 	/* portion of the cb set to zero at every gro iteration */
 	struct_group(zeroed,
 
@@ -55,8 +58,6 @@ struct napi_gro_cb {
 
 		/* Free the skb? */
 		u8	free:2;
-#define NAPI_GRO_FREE		  1
-#define NAPI_GRO_FREE_STOLEN_HEAD 2
 
 		/* Used in foo-over-udp, set in udp[46]_gro_receive */
 		u8	is_ipv6:1;
-- 
cgit v1.2.3


From bde018222c6b084ac32933a9f933581dd83da18e Mon Sep 17 00:00:00 2001
From: "Russell King (Oracle)" <rmk+kernel@armlinux.org.uk>
Date: Thu, 17 Feb 2022 18:30:35 +0000
Subject: net: dsa: add support for phylink mac_select_pcs()

Add DSA support for the phylink mac_select_pcs() method so DSA drivers
can return provide phylink with the appropriate PCS for the PHY
interface mode.

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dsa.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/net')

diff --git a/include/net/dsa.h b/include/net/dsa.h
index c8626dec970c..bc6eef6af810 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -788,6 +788,9 @@ struct dsa_switch_ops {
 	void	(*phylink_validate)(struct dsa_switch *ds, int port,
 				    unsigned long *supported,
 				    struct phylink_link_state *state);
+	struct phylink_pcs *(*phylink_mac_select_pcs)(struct dsa_switch *ds,
+						      int port,
+						      phy_interface_t iface);
 	int	(*phylink_mac_link_state)(struct dsa_switch *ds, int port,
 					  struct phylink_link_state *state);
 	void	(*phylink_mac_config)(struct dsa_switch *ds, int port,
-- 
cgit v1.2.3


From 47f0bd5032106469827cf56c8b45bb9101112105 Mon Sep 17 00:00:00 2001
From: Jacques de Laval <Jacques.De.Laval@westermo.com>
Date: Thu, 17 Feb 2022 16:02:02 +0100
Subject: net: Add new protocol attribute to IP addresses

This patch adds a new protocol attribute to IPv4 and IPv6 addresses.
Inspiration was taken from the protocol attribute of routes. User space
applications like iproute2 can set/get the protocol with the Netlink API.

The attribute is stored as an 8-bit unsigned integer.

The protocol attribute is set by kernel for these categories:

- IPv4 and IPv6 loopback addresses
- IPv6 addresses generated from router announcements
- IPv6 link local addresses

User space may pass custom protocols, not defined by the kernel.

Grouping addresses on their origin is useful in scenarios where you want
to distinguish between addresses based on who added them, e.g. kernel
vs. user space.

Tagging addresses with a string label is an existing feature that could be
used as a solution. Unfortunately the max length of a label is
15 characters, and for compatibility reasons the label must be prefixed
with the name of the device followed by a colon. Since device names also
have a max length of 15 characters, only -1 characters is guaranteed to be
available for any origin tag, which is not that much.

A reference implementation of user space setting and getting protocols
is available for iproute2:

https://github.com/westermo/iproute2/commit/9a6ea18bd79f47f293e5edc7780f315ea42ff540

Signed-off-by: Jacques de Laval <Jacques.De.Laval@westermo.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Link: https://lore.kernel.org/r/20220217150202.80802-1-Jacques.De.Laval@westermo.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/addrconf.h | 2 ++
 include/net/if_inet6.h | 2 ++
 2 files changed, 4 insertions(+)

(limited to 'include/net')

diff --git a/include/net/addrconf.h b/include/net/addrconf.h
index 59940e230b78..f7506f08e505 100644
--- a/include/net/addrconf.h
+++ b/include/net/addrconf.h
@@ -64,6 +64,8 @@ struct ifa6_config {
 	const struct in6_addr	*pfx;
 	unsigned int		plen;
 
+	u8			ifa_proto;
+
 	const struct in6_addr	*peer_pfx;
 
 	u32			rt_priority;
diff --git a/include/net/if_inet6.h b/include/net/if_inet6.h
index f026cf08a8e8..4cfdef6ca4f6 100644
--- a/include/net/if_inet6.h
+++ b/include/net/if_inet6.h
@@ -71,6 +71,8 @@ struct inet6_ifaddr {
 
 	bool			tokenized;
 
+	u8			ifa_proto;
+
 	struct rcu_head		rcu;
 	struct in6_addr		peer_addr;
 };
-- 
cgit v1.2.3


From cb196b725936f6b776ad1d073f66fbe92aa798fa Mon Sep 17 00:00:00 2001
From: Jeremy Kerr <jk@codeconstruct.com.au>
Date: Fri, 18 Feb 2022 12:25:53 +0800
Subject: mctp: replace mctp_address_ok with more fine-grained helpers

Currently, we have mctp_address_ok(), which checks if an EID is in the
"valid" range of 8-254 inclusive. However, 0 and 255 may also be valid
addresses, depending on context. 0 is the NULL EID, which may be set
when physical addressing is used. 255 is valid as a destination address
for broadcasts.

This change renames mctp_address_ok to mctp_address_unicast, and adds
similar helpers for broadcast and null EIDs, which will be used in an
upcoming commit.

Signed-off-by: Jeremy Kerr <jk@codeconstruct.com.au>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/mctp.h | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/mctp.h b/include/net/mctp.h
index e80a4baf8379..d37268fe6825 100644
--- a/include/net/mctp.h
+++ b/include/net/mctp.h
@@ -40,11 +40,21 @@ struct mctp_hdr {
 
 #define MCTP_INITIAL_DEFAULT_NET	1
 
-static inline bool mctp_address_ok(mctp_eid_t eid)
+static inline bool mctp_address_unicast(mctp_eid_t eid)
 {
 	return eid >= 8 && eid < 255;
 }
 
+static inline bool mctp_address_broadcast(mctp_eid_t eid)
+{
+	return eid == 255;
+}
+
+static inline bool mctp_address_null(mctp_eid_t eid)
+{
+	return eid == 0;
+}
+
 static inline bool mctp_address_matches(mctp_eid_t match, mctp_eid_t eid)
 {
 	return match == eid || match == MCTP_ADDR_ANY;
-- 
cgit v1.2.3


From ccfbf44d4c7fb7c64cf79b3f2a5ae522e5165878 Mon Sep 17 00:00:00 2001
From: "Russell King (Oracle)" <rmk+kernel@armlinux.org.uk>
Date: Sat, 19 Feb 2022 11:47:17 +0000
Subject: net: dsa: remove pcs_poll

With drivers converted over to using phylink PCS, there is no need for
the struct dsa_switch member "pcs_poll" to exist anymore - there is a
flag in the struct phylink_pcs which indicates whether this PCS needs
to be polled which supersedes this.

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dsa.h | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'include/net')

diff --git a/include/net/dsa.h b/include/net/dsa.h
index bc6eef6af810..f13de2d8aef3 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -391,11 +391,6 @@ struct dsa_switch {
 	 */
 	u32			vlan_filtering:1;
 
-	/* MAC PCS does not provide link state change interrupt, and requires
-	 * polling. Flag passed on to PHYLINK.
-	 */
-	u32			pcs_poll:1;
-
 	/* For switches that only have the MRU configurable. To ensure the
 	 * configured MTU is not exceeded, normalization of MRU on all bridged
 	 * interfaces is needed.
-- 
cgit v1.2.3


From 7a26dc9e7b43f5a24c4b843713e728582adf1c38 Mon Sep 17 00:00:00 2001
From: Menglong Dong <imagedong@tencent.com>
Date: Sun, 20 Feb 2022 15:06:33 +0800
Subject: net: tcp: add skb drop reasons to tcp_add_backlog()

Pass the address of drop_reason to tcp_add_backlog() to store the
reasons for skb drops when fails. Following drop reasons are
introduced:

SKB_DROP_REASON_SOCKET_BACKLOG

Reviewed-by: Mengen Sun <mengensun@tencent.com>
Reviewed-by: Hao Peng <flyingpeng@tencent.com>
Signed-off-by: Menglong Dong <imagedong@tencent.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index eff2487d972d..04f4650e0ff0 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1367,7 +1367,8 @@ static inline bool tcp_checksum_complete(struct sk_buff *skb)
 		__skb_checksum_complete(skb);
 }
 
-bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb);
+bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
+		     enum skb_drop_reason *reason);
 
 #ifdef CONFIG_INET
 void __sk_defer_free_flush(struct sock *sk);
-- 
cgit v1.2.3


From 0c51e12e218f20b7d976158fdc18019627326f7a Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@nvidia.com>
Date: Sat, 19 Feb 2022 17:45:19 +0200
Subject: ipv4: Invalidate neighbour for broadcast address upon address
 addition

In case user space sends a packet destined to a broadcast address when a
matching broadcast route is not configured, the kernel will create a
unicast neighbour entry that will never be resolved [1].

When the broadcast route is configured, the unicast neighbour entry will
not be invalidated and continue to linger, resulting in packets being
dropped.

Solve this by invalidating unresolved neighbour entries for broadcast
addresses after routes for these addresses are internally configured by
the kernel. This allows the kernel to create a broadcast neighbour entry
following the next route lookup.

Another possible solution that is more generic but also more complex is
to have the ARP code register a listener to the FIB notification chain
and invalidate matching neighbour entries upon the addition of broadcast
routes.

It is also possible to wave off the issue as a user space problem, but
it seems a bit excessive to expect user space to be that intimately
familiar with the inner workings of the FIB/neighbour kernel code.

[1] https://lore.kernel.org/netdev/55a04a8f-56f3-f73c-2aea-2195923f09d1@huawei.com/

Reported-by: Wang Hai <wanghai38@huawei.com>
Signed-off-by: Ido Schimmel <idosch@nvidia.com>
Tested-by: Wang Hai <wanghai38@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/arp.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/net')

diff --git a/include/net/arp.h b/include/net/arp.h
index 031374ac2f22..d7ef4ec71dfe 100644
--- a/include/net/arp.h
+++ b/include/net/arp.h
@@ -65,6 +65,7 @@ void arp_send(int type, int ptype, __be32 dest_ip,
 	      const unsigned char *src_hw, const unsigned char *th);
 int arp_mc_map(__be32 addr, u8 *haddr, struct net_device *dev, int dir);
 void arp_ifdown(struct net_device *dev);
+int arp_invalidate(struct net_device *dev, __be32 ip, bool force);
 
 struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip,
 			   struct net_device *dev, __be32 src_ip,
-- 
cgit v1.2.3


From 696c65444120cfe98ae704e86a59df8666fabf1d Mon Sep 17 00:00:00 2001
From: Hangbin Liu <liuhangbin@gmail.com>
Date: Mon, 21 Feb 2022 13:54:53 +0800
Subject: ipv6: separate ndisc_ns_create() from ndisc_send_ns()

This patch separate NS message allocation steps from ndisc_send_ns(),
so it could be used in other places, like bonding, to allocate and
send IPv6 NS message.

Also export ndisc_send_skb() and ndisc_ns_create() for later bonding usage.

Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ndisc.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/net')

diff --git a/include/net/ndisc.h b/include/net/ndisc.h
index 53cb8de0e589..aac3a42de432 100644
--- a/include/net/ndisc.h
+++ b/include/net/ndisc.h
@@ -447,10 +447,15 @@ void ndisc_cleanup(void);
 
 int ndisc_rcv(struct sk_buff *skb);
 
+struct sk_buff *ndisc_ns_create(struct net_device *dev, const struct in6_addr *solicit,
+				const struct in6_addr *saddr, u64 nonce);
 void ndisc_send_ns(struct net_device *dev, const struct in6_addr *solicit,
 		   const struct in6_addr *daddr, const struct in6_addr *saddr,
 		   u64 nonce);
 
+void ndisc_send_skb(struct sk_buff *skb, const struct in6_addr *daddr,
+		    const struct in6_addr *saddr);
+
 void ndisc_send_rs(struct net_device *dev,
 		   const struct in6_addr *saddr, const struct in6_addr *daddr);
 void ndisc_send_na(struct net_device *dev, const struct in6_addr *daddr,
-- 
cgit v1.2.3


From 841e95641e4cb695586b036f330d6d272fcd7173 Mon Sep 17 00:00:00 2001
From: Hangbin Liu <liuhangbin@gmail.com>
Date: Mon, 21 Feb 2022 13:54:55 +0800
Subject: bonding: add extra field for bond_opt_value

Adding an extra storage field for bond_opt_value so we can set large
bytes of data for bonding options in future, e.g. IPv6 address.

Define a new call bond_opt_initextra(). Also change the checking order of
__bond_opt_init() and check values first.

Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/bond_options.h | 27 ++++++++++++++++++---------
 1 file changed, 18 insertions(+), 9 deletions(-)

(limited to 'include/net')

diff --git a/include/net/bond_options.h b/include/net/bond_options.h
index dd75c071f67e..286b29c6c451 100644
--- a/include/net/bond_options.h
+++ b/include/net/bond_options.h
@@ -71,14 +71,18 @@ enum {
 
 /* This structure is used for storing option values and for passing option
  * values when changing an option. The logic when used as an arg is as follows:
- * - if string != NULL -> parse it, if the opt is RAW type then return it, else
- *   return the parse result
- * - if string == NULL -> parse value
+ * - if value != ULLONG_MAX -> parse value
+ * - if string != NULL -> parse string
+ * - if the opt is RAW data and length less than maxlen,
+ *   copy the data to extra storage
  */
+
+#define BOND_OPT_EXTRA_MAXLEN 16
 struct bond_opt_value {
 	char *string;
 	u64 value;
 	u32 flags;
+	char extra[BOND_OPT_EXTRA_MAXLEN];
 };
 
 struct bonding;
@@ -118,17 +122,22 @@ const struct bond_opt_value *bond_opt_get_val(unsigned int option, u64 val);
  * When value is ULLONG_MAX then string will be used.
  */
 static inline void __bond_opt_init(struct bond_opt_value *optval,
-				   char *string, u64 value)
+				   char *string, u64 value,
+				   void *extra, size_t extra_len)
 {
 	memset(optval, 0, sizeof(*optval));
 	optval->value = ULLONG_MAX;
-	if (value == ULLONG_MAX)
-		optval->string = string;
-	else
+	if (value != ULLONG_MAX)
 		optval->value = value;
+	else if (string)
+		optval->string = string;
+	else if (extra_len <= BOND_OPT_EXTRA_MAXLEN)
+		memcpy(optval->extra, extra, extra_len);
 }
-#define bond_opt_initval(optval, value) __bond_opt_init(optval, NULL, value)
-#define bond_opt_initstr(optval, str) __bond_opt_init(optval, str, ULLONG_MAX)
+#define bond_opt_initval(optval, value) __bond_opt_init(optval, NULL, value, NULL, 0)
+#define bond_opt_initstr(optval, str) __bond_opt_init(optval, str, ULLONG_MAX, NULL, 0)
+#define bond_opt_initextra(optval, extra, extra_len) \
+	__bond_opt_init(optval, NULL, ULLONG_MAX, extra, extra_len)
 
 void bond_option_arp_ip_targets_clear(struct bonding *bond);
 
-- 
cgit v1.2.3


From 4e24be018eb9dbcefa4b01c07e298b147dc1a4d7 Mon Sep 17 00:00:00 2001
From: Hangbin Liu <liuhangbin@gmail.com>
Date: Mon, 21 Feb 2022 13:54:56 +0800
Subject: bonding: add new parameter ns_targets

Add a new bonding parameter ns_targets to store IPv6 address.
Add required bond_ns_send/rcv functions first before adding
IPv6 address option setting.

Add two functions bond_send/rcv_validate so we can send/recv
ARP and NS at the same time.

Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/bonding.h | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/bonding.h b/include/net/bonding.h
index 7dead855a72d..f3b986f6b6e4 100644
--- a/include/net/bonding.h
+++ b/include/net/bonding.h
@@ -29,8 +29,11 @@
 #include <net/bond_3ad.h>
 #include <net/bond_alb.h>
 #include <net/bond_options.h>
+#include <net/ipv6.h>
+#include <net/addrconf.h>
 
 #define BOND_MAX_ARP_TARGETS	16
+#define BOND_MAX_NS_TARGETS	BOND_MAX_ARP_TARGETS
 
 #define BOND_DEFAULT_MIIMON	100
 
@@ -146,6 +149,7 @@ struct bond_params {
 	struct reciprocal_value reciprocal_packets_per_slave;
 	u16 ad_actor_sys_prio;
 	u16 ad_user_port_key;
+	struct in6_addr ns_targets[BOND_MAX_NS_TARGETS];
 
 	/* 2 bytes of padding : see ether_addr_equal_64bits() */
 	u8 ad_actor_system[ETH_ALEN + 2];
@@ -628,7 +632,7 @@ struct bond_net {
 	struct class_attribute	class_attr_bonding_masters;
 };
 
-int bond_arp_rcv(const struct sk_buff *skb, struct bonding *bond, struct slave *slave);
+int bond_rcv_validate(const struct sk_buff *skb, struct bonding *bond, struct slave *slave);
 netdev_tx_t bond_dev_queue_xmit(struct bonding *bond, struct sk_buff *skb, struct net_device *slave_dev);
 int bond_create(struct net *net, const char *name);
 int bond_create_sysfs(struct bond_net *net);
@@ -735,6 +739,19 @@ static inline int bond_get_targets_ip(__be32 *targets, __be32 ip)
 	return -1;
 }
 
+static inline int bond_get_targets_ip6(struct in6_addr *targets, struct in6_addr *ip)
+{
+	int i;
+
+	for (i = 0; i < BOND_MAX_NS_TARGETS; i++)
+		if (ipv6_addr_equal(&targets[i], ip))
+			return i;
+		else if (ipv6_addr_any(&targets[i]))
+			break;
+
+	return -1;
+}
+
 /* exported from bond_main.c */
 extern unsigned int bond_net_id;
 
-- 
cgit v1.2.3


From 129e3c1bab24d27d0fa6e505a472345a92d7a2b0 Mon Sep 17 00:00:00 2001
From: Hangbin Liu <liuhangbin@gmail.com>
Date: Mon, 21 Feb 2022 13:54:57 +0800
Subject: bonding: add new option ns_ip6_target

This patch add a new bonding option ns_ip6_target, which correspond
to the arp_ip_target. With this we set IPv6 targets and send IPv6 NS
request to determine the health of the link.

For other related options like the validation, we still use
arp_validate, and will change to ns_validate later.

Note: the sysfs configuration support was removed based on
https://lore.kernel.org/netdev/8863.1645071997@famine

Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/bond_options.h | 4 ++++
 include/net/bonding.h      | 7 +++++++
 2 files changed, 11 insertions(+)

(limited to 'include/net')

diff --git a/include/net/bond_options.h b/include/net/bond_options.h
index 286b29c6c451..61b49063791c 100644
--- a/include/net/bond_options.h
+++ b/include/net/bond_options.h
@@ -66,6 +66,7 @@ enum {
 	BOND_OPT_PEER_NOTIF_DELAY,
 	BOND_OPT_LACP_ACTIVE,
 	BOND_OPT_MISSED_MAX,
+	BOND_OPT_NS_TARGETS,
 	BOND_OPT_LAST
 };
 
@@ -140,5 +141,8 @@ static inline void __bond_opt_init(struct bond_opt_value *optval,
 	__bond_opt_init(optval, NULL, ULLONG_MAX, extra, extra_len)
 
 void bond_option_arp_ip_targets_clear(struct bonding *bond);
+#if IS_ENABLED(CONFIG_IPV6)
+void bond_option_ns_ip6_targets_clear(struct bonding *bond);
+#endif
 
 #endif /* _NET_BOND_OPTIONS_H */
diff --git a/include/net/bonding.h b/include/net/bonding.h
index f3b986f6b6e4..d0dfe727e0b1 100644
--- a/include/net/bonding.h
+++ b/include/net/bonding.h
@@ -503,6 +503,13 @@ static inline int bond_is_ip_target_ok(__be32 addr)
 	return !ipv4_is_lbcast(addr) && !ipv4_is_zeronet(addr);
 }
 
+static inline int bond_is_ip6_target_ok(struct in6_addr *addr)
+{
+	return !ipv6_addr_any(addr) &&
+	       !ipv6_addr_loopback(addr) &&
+	       !ipv6_addr_is_multicast(addr);
+}
+
 /* Get the oldest arp which we've received on this slave for bond's
  * arp_targets.
  */
-- 
cgit v1.2.3


From 46a76724e4c93bb1cda8ee11276001a92d1f7987 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Wed, 23 Feb 2022 16:00:44 +0200
Subject: net: dsa: rename references to "lag" as "lag_dev"

In preparation of converting struct net_device *dp->lag_dev into a
struct dsa_lag *dp->lag, we need to rename, for consistency purposes,
all occurrences of the "lag" variable in the DSA core to "lag_dev".

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/dsa.h | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'include/net')

diff --git a/include/net/dsa.h b/include/net/dsa.h
index f13de2d8aef3..ef7f446cbdf4 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -182,12 +182,12 @@ static inline struct net_device *dsa_lag_dev(struct dsa_switch_tree *dst,
 }
 
 static inline int dsa_lag_id(struct dsa_switch_tree *dst,
-			     struct net_device *lag)
+			     struct net_device *lag_dev)
 {
 	unsigned int id;
 
 	dsa_lags_foreach_id(id, dst) {
-		if (dsa_lag_dev(dst, id) == lag)
+		if (dsa_lag_dev(dst, id) == lag_dev)
 			return id;
 	}
 
@@ -966,10 +966,10 @@ struct dsa_switch_ops {
 	int	(*crosschip_lag_change)(struct dsa_switch *ds, int sw_index,
 					int port);
 	int	(*crosschip_lag_join)(struct dsa_switch *ds, int sw_index,
-				      int port, struct net_device *lag,
+				      int port, struct net_device *lag_dev,
 				      struct netdev_lag_upper_info *info);
 	int	(*crosschip_lag_leave)(struct dsa_switch *ds, int sw_index,
-				       int port, struct net_device *lag);
+				       int port, struct net_device *lag_dev);
 
 	/*
 	 * PTP functionality
@@ -1041,10 +1041,10 @@ struct dsa_switch_ops {
 	 */
 	int	(*port_lag_change)(struct dsa_switch *ds, int port);
 	int	(*port_lag_join)(struct dsa_switch *ds, int port,
-				 struct net_device *lag,
+				 struct net_device *lag_dev,
 				 struct netdev_lag_upper_info *info);
 	int	(*port_lag_leave)(struct dsa_switch *ds, int port,
-				  struct net_device *lag);
+				  struct net_device *lag_dev);
 
 	/*
 	 * HSR integration
-- 
cgit v1.2.3


From 3d4a0a2a46ab8ff8897dfd6324edee5e8184d2c5 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Wed, 23 Feb 2022 16:00:47 +0200
Subject: net: dsa: make LAG IDs one-based

The DSA LAG API will be changed to become more similar with the bridge
data structures, where struct dsa_bridge holds an unsigned int num,
which is generated by DSA and is one-based. We have a similar thing
going with the DSA LAG, except that isn't stored anywhere, it is
calculated dynamically by dsa_lag_id() by iterating through dst->lags.

The idea of encoding an invalid (or not requested) LAG ID as zero for
the purpose of simplifying checks in drivers means that the LAG IDs
passed by DSA to drivers need to be one-based too. So back-and-forth
conversion is needed when indexing the dst->lags array, as well as in
drivers which assume a zero-based index.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/dsa.h | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'include/net')

diff --git a/include/net/dsa.h b/include/net/dsa.h
index ef7f446cbdf4..3ae93adda25d 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -163,9 +163,10 @@ struct dsa_switch_tree {
 	unsigned int last_switch;
 };
 
+/* LAG IDs are one-based, the dst->lags array is zero-based */
 #define dsa_lags_foreach_id(_id, _dst)				\
-	for ((_id) = 0; (_id) < (_dst)->lags_len; (_id)++)	\
-		if ((_dst)->lags[(_id)])
+	for ((_id) = 1; (_id) <= (_dst)->lags_len; (_id)++)	\
+		if ((_dst)->lags[(_id) - 1])
 
 #define dsa_lag_foreach_port(_dp, _dst, _lag)			\
 	list_for_each_entry((_dp), &(_dst)->ports, list)	\
@@ -178,7 +179,8 @@ struct dsa_switch_tree {
 static inline struct net_device *dsa_lag_dev(struct dsa_switch_tree *dst,
 					     unsigned int id)
 {
-	return dst->lags[id];
+	/* DSA LAG IDs are one-based, dst->lags is zero-based */
+	return dst->lags[id - 1];
 }
 
 static inline int dsa_lag_id(struct dsa_switch_tree *dst,
-- 
cgit v1.2.3


From dedd6a009f4191989bee83c1faf66728648a223f Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Wed, 23 Feb 2022 16:00:49 +0200
Subject: net: dsa: create a dsa_lag structure

The main purpose of this change is to create a data structure for a LAG
as seen by DSA. This is similar to what we have for bridging - we pass a
copy of this structure by value to ->port_lag_join and ->port_lag_leave.
For now we keep the lag_dev, id and a reference count in it. Future
patches will add a list of FDB entries for the LAG (these also need to
be refcounted to work properly).

The LAG structure is created using dsa_port_lag_create() and destroyed
using dsa_port_lag_destroy(), just like we have for bridging.

Because now, the dsa_lag itself is refcounted, we can simplify
dsa_lag_map() and dsa_lag_unmap(). These functions need to keep a LAG in
the dst->lags array only as long as at least one port uses it. The
refcounting logic inside those functions can be removed now - they are
called only when we should perform the operation.

dsa_lag_dev() is renamed to dsa_lag_by_id() and now returns the dsa_lag
structure instead of the lag_dev net_device.

dsa_lag_foreach_port() now takes the dsa_lag structure as argument.

dst->lags holds an array of dsa_lag structures.

dsa_lag_map() now also saves the dsa_lag->id value, so that linear
walking of dst->lags in drivers using dsa_lag_id() is no longer
necessary. They can just look at lag.id.

dsa_port_lag_id_get() is a helper, similar to dsa_port_bridge_num_get(),
which can be used by drivers to get the LAG ID assigned by DSA to a
given port.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/dsa.h | 50 +++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 37 insertions(+), 13 deletions(-)

(limited to 'include/net')

diff --git a/include/net/dsa.h b/include/net/dsa.h
index 3ae93adda25d..81ed34998416 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -116,6 +116,12 @@ struct dsa_netdevice_ops {
 #define MODULE_ALIAS_DSA_TAG_DRIVER(__proto)				\
 	MODULE_ALIAS(DSA_TAG_DRIVER_ALIAS __stringify(__proto##_VALUE))
 
+struct dsa_lag {
+	struct net_device *dev;
+	unsigned int id;
+	refcount_t refcount;
+};
+
 struct dsa_switch_tree {
 	struct list_head	list;
 
@@ -134,7 +140,7 @@ struct dsa_switch_tree {
 	/* Maps offloaded LAG netdevs to a zero-based linear ID for
 	 * drivers that need it.
 	 */
-	struct net_device **lags;
+	struct dsa_lag **lags;
 
 	/* Tagging protocol operations */
 	const struct dsa_device_ops *tag_ops;
@@ -170,14 +176,14 @@ struct dsa_switch_tree {
 
 #define dsa_lag_foreach_port(_dp, _dst, _lag)			\
 	list_for_each_entry((_dp), &(_dst)->ports, list)	\
-		if ((_dp)->lag_dev == (_lag))
+		if (dsa_port_offloads_lag((_dp), (_lag)))
 
 #define dsa_hsr_foreach_port(_dp, _ds, _hsr)			\
 	list_for_each_entry((_dp), &(_ds)->dst->ports, list)	\
 		if ((_dp)->ds == (_ds) && (_dp)->hsr_dev == (_hsr))
 
-static inline struct net_device *dsa_lag_dev(struct dsa_switch_tree *dst,
-					     unsigned int id)
+static inline struct dsa_lag *dsa_lag_by_id(struct dsa_switch_tree *dst,
+					    unsigned int id)
 {
 	/* DSA LAG IDs are one-based, dst->lags is zero-based */
 	return dst->lags[id - 1];
@@ -189,8 +195,10 @@ static inline int dsa_lag_id(struct dsa_switch_tree *dst,
 	unsigned int id;
 
 	dsa_lags_foreach_id(id, dst) {
-		if (dsa_lag_dev(dst, id) == lag_dev)
-			return id;
+		struct dsa_lag *lag = dsa_lag_by_id(dst, id);
+
+		if (lag->dev == lag_dev)
+			return lag->id;
 	}
 
 	return -ENODEV;
@@ -293,7 +301,7 @@ struct dsa_port {
 	struct devlink_port	devlink_port;
 	struct phylink		*pl;
 	struct phylink_config	pl_config;
-	struct net_device	*lag_dev;
+	struct dsa_lag		*lag;
 	struct net_device	*hsr_dev;
 
 	struct list_head list;
@@ -643,14 +651,30 @@ static inline bool dsa_port_is_vlan_filtering(const struct dsa_port *dp)
 		return dp->vlan_filtering;
 }
 
+static inline unsigned int dsa_port_lag_id_get(struct dsa_port *dp)
+{
+	return dp->lag ? dp->lag->id : 0;
+}
+
+static inline struct net_device *dsa_port_lag_dev_get(struct dsa_port *dp)
+{
+	return dp->lag ? dp->lag->dev : NULL;
+}
+
+static inline bool dsa_port_offloads_lag(struct dsa_port *dp,
+					 const struct dsa_lag *lag)
+{
+	return dsa_port_lag_dev_get(dp) == lag->dev;
+}
+
 static inline
 struct net_device *dsa_port_to_bridge_port(const struct dsa_port *dp)
 {
 	if (!dp->bridge)
 		return NULL;
 
-	if (dp->lag_dev)
-		return dp->lag_dev;
+	if (dp->lag)
+		return dp->lag->dev;
 	else if (dp->hsr_dev)
 		return dp->hsr_dev;
 
@@ -968,10 +992,10 @@ struct dsa_switch_ops {
 	int	(*crosschip_lag_change)(struct dsa_switch *ds, int sw_index,
 					int port);
 	int	(*crosschip_lag_join)(struct dsa_switch *ds, int sw_index,
-				      int port, struct net_device *lag_dev,
+				      int port, struct dsa_lag lag,
 				      struct netdev_lag_upper_info *info);
 	int	(*crosschip_lag_leave)(struct dsa_switch *ds, int sw_index,
-				       int port, struct net_device *lag_dev);
+				       int port, struct dsa_lag lag);
 
 	/*
 	 * PTP functionality
@@ -1043,10 +1067,10 @@ struct dsa_switch_ops {
 	 */
 	int	(*port_lag_change)(struct dsa_switch *ds, int port);
 	int	(*port_lag_join)(struct dsa_switch *ds, int port,
-				 struct net_device *lag_dev,
+				 struct dsa_lag lag,
 				 struct netdev_lag_upper_info *info);
 	int	(*port_lag_leave)(struct dsa_switch *ds, int port,
-				  struct net_device *lag_dev);
+				  struct dsa_lag lag);
 
 	/*
 	 * HSR integration
-- 
cgit v1.2.3


From ec638740fce990ad2b9af43ead8088d6d6eb2145 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Wed, 23 Feb 2022 16:00:50 +0200
Subject: net: switchdev: remove lag_mod_cb from
 switchdev_handle_fdb_event_to_device

When the switchdev_handle_fdb_event_to_device() event replication helper
was created, my original thought was that FDB events on LAG interfaces
should most likely be special-cased, not just replicated towards all
switchdev ports beneath that LAG. So this replication helper currently
does not recurse through switchdev lower interfaces of LAG bridge ports,
but rather calls the lag_mod_cb() if that was provided.

No switchdev driver uses this helper for FDB events on LAG interfaces
yet, so that was an assumption which was yet to be tested. It is
certainly usable for that purpose, as my RFC series shows:

https://patchwork.kernel.org/project/netdevbpf/cover/20220210125201.2859463-1-vladimir.oltean@nxp.com/

however this approach is slightly convoluted because:

- the switchdev driver gets a "dev" that isn't its own net device, but
  rather the LAG net device. It must call switchdev_lower_dev_find(dev)
  in order to get a handle of any of its own net devices (the ones that
  pass check_cb).

- in order for FDB entries on LAG ports to be correctly refcounted per
  the number of switchdev ports beneath that LAG, we haven't escaped the
  need to iterate through the LAG's lower interfaces. Except that is now
  the responsibility of the switchdev driver, because the replication
  helper just stopped half-way.

So, even though yes, FDB events on LAG bridge ports must be
special-cased, in the end it's simpler to let switchdev_handle_fdb_*
just iterate through the LAG port's switchdev lowers, and let the
switchdev driver figure out that those physical ports are under a LAG.

The switchdev_handle_fdb_event_to_device() helper takes a
"foreign_dev_check" callback so it can figure out whether @dev can
autonomously forward to @foreign_dev. DSA fills this method properly:
if the LAG is offloaded by another port in the same tree as @dev, then
it isn't foreign. If it is a software LAG, it is foreign - forwarding
happens in software.

Whether an interface is foreign or not decides whether the replication
helper will go through the LAG's switchdev lowers or not. Since the
lan966x doesn't properly fill this out, FDB events on software LAG
uppers will get called. By changing lan966x_foreign_dev_check(), we can
suppress them.

Whereas DSA will now start receiving FDB events for its offloaded LAG
uppers, so we need to return -EOPNOTSUPP, since we currently don't do
the right thing for them.

Cc: Horatiu Vultur <horatiu.vultur@microchip.com>
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/switchdev.h | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

(limited to 'include/net')

diff --git a/include/net/switchdev.h b/include/net/switchdev.h
index c32e1c8f79ec..3e424d40fae3 100644
--- a/include/net/switchdev.h
+++ b/include/net/switchdev.h
@@ -313,10 +313,7 @@ int switchdev_handle_fdb_event_to_device(struct net_device *dev, unsigned long e
 					     const struct net_device *foreign_dev),
 		int (*mod_cb)(struct net_device *dev, struct net_device *orig_dev,
 			      unsigned long event, const void *ctx,
-			      const struct switchdev_notifier_fdb_info *fdb_info),
-		int (*lag_mod_cb)(struct net_device *dev, struct net_device *orig_dev,
-				  unsigned long event, const void *ctx,
-				  const struct switchdev_notifier_fdb_info *fdb_info));
+			      const struct switchdev_notifier_fdb_info *fdb_info));
 
 int switchdev_handle_port_obj_add(struct net_device *dev,
 			struct switchdev_notifier_port_obj_info *port_obj_info,
@@ -443,10 +440,7 @@ switchdev_handle_fdb_event_to_device(struct net_device *dev, unsigned long event
 					     const struct net_device *foreign_dev),
 		int (*mod_cb)(struct net_device *dev, struct net_device *orig_dev,
 			      unsigned long event, const void *ctx,
-			      const struct switchdev_notifier_fdb_info *fdb_info),
-		int (*lag_mod_cb)(struct net_device *dev, struct net_device *orig_dev,
-				  unsigned long event, const void *ctx,
-				  const struct switchdev_notifier_fdb_info *fdb_info))
+			      const struct switchdev_notifier_fdb_info *fdb_info))
 {
 	return 0;
 }
-- 
cgit v1.2.3


From e212fa7c54184b7b1f88990bd328b23b567cbf41 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Wed, 23 Feb 2022 16:00:53 +0200
Subject: net: dsa: support FDB events on offloaded LAG interfaces

This change introduces support for installing static FDB entries towards
a bridge port that is a LAG of multiple DSA switch ports, as well as
support for filtering towards the CPU local FDB entries emitted for LAG
interfaces that are bridge ports.

Conceptually, host addresses on LAG ports are identical to what we do
for plain bridge ports. Whereas FDB entries _towards_ a LAG can't simply
be replicated towards all member ports like we do for multicast, or VLAN.
Instead we need new driver API. Hardware usually considers a LAG to be a
"logical port", and sets the entire LAG as the forwarding destination.
The physical egress port selection within the LAG is made by hashing
policy, as usual.

To represent the logical port corresponding to the LAG, we pass by value
a copy of the dsa_lag structure to all switches in the tree that have at
least one port in that LAG.

To illustrate why a refcounted list of FDB entries is needed in struct
dsa_lag, it is enough to say that:
- a LAG may be a bridge port and may therefore receive FDB events even
  while it isn't yet offloaded by any DSA interface
- DSA interfaces may be removed from a LAG while that is a bridge port;
  we don't want FDB entries lingering around, but we don't want to
  remove entries that are still in use, either

For all the cases below to work, the idea is to always keep an FDB entry
on a LAG with a reference count equal to the DSA member ports. So:
- if a port joins a LAG, it requests the bridge to replay the FDB, and
  the FDB entries get created, or their refcount gets bumped by one
- if a port leaves a LAG, the FDB replay deletes or decrements refcount
  by one
- if an FDB is installed towards a LAG with ports already present, that
  entry is created (if it doesn't exist) and its refcount is bumped by
  the amount of ports already present in the LAG

echo "Adding FDB entry to bond with existing ports"
ip link del bond0
ip link add bond0 type bond mode 802.3ad
ip link set swp1 down && ip link set swp1 master bond0 && ip link set swp1 up
ip link set swp2 down && ip link set swp2 master bond0 && ip link set swp2 up
ip link del br0
ip link add br0 type bridge
ip link set bond0 master br0
bridge fdb add dev bond0 00:01:02:03:04:05 master static

ip link del br0
ip link del bond0

echo "Adding FDB entry to empty bond"
ip link del bond0
ip link add bond0 type bond mode 802.3ad
ip link del br0
ip link add br0 type bridge
ip link set bond0 master br0
bridge fdb add dev bond0 00:01:02:03:04:05 master static
ip link set swp1 down && ip link set swp1 master bond0 && ip link set swp1 up
ip link set swp2 down && ip link set swp2 master bond0 && ip link set swp2 up

ip link del br0
ip link del bond0

echo "Adding FDB entry to empty bond, then removing ports one by one"
ip link del bond0
ip link add bond0 type bond mode 802.3ad
ip link del br0
ip link add br0 type bridge
ip link set bond0 master br0
bridge fdb add dev bond0 00:01:02:03:04:05 master static
ip link set swp1 down && ip link set swp1 master bond0 && ip link set swp1 up
ip link set swp2 down && ip link set swp2 master bond0 && ip link set swp2 up

ip link set swp1 nomaster
ip link set swp2 nomaster
ip link del br0
ip link del bond0

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/dsa.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/net')

diff --git a/include/net/dsa.h b/include/net/dsa.h
index 81ed34998416..01faba89c987 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -119,6 +119,8 @@ struct dsa_netdevice_ops {
 struct dsa_lag {
 	struct net_device *dev;
 	unsigned int id;
+	struct mutex fdb_lock;
+	struct list_head fdbs;
 	refcount_t refcount;
 };
 
@@ -944,6 +946,10 @@ struct dsa_switch_ops {
 				const unsigned char *addr, u16 vid);
 	int	(*port_fdb_dump)(struct dsa_switch *ds, int port,
 				 dsa_fdb_dump_cb_t *cb, void *data);
+	int	(*lag_fdb_add)(struct dsa_switch *ds, struct dsa_lag lag,
+			       const unsigned char *addr, u16 vid);
+	int	(*lag_fdb_del)(struct dsa_switch *ds, struct dsa_lag lag,
+			       const unsigned char *addr, u16 vid);
 
 	/*
 	 * Multicast database
-- 
cgit v1.2.3


From 7bbb765b73496699a165d505ecdce962f903b422 Mon Sep 17 00:00:00 2001
From: Dmitry Safonov <dima@arista.com>
Date: Wed, 23 Feb 2022 17:57:40 +0000
Subject: net/tcp: Merge TCP-MD5 inbound callbacks

The functions do essentially the same work to verify TCP-MD5 sign.
Code can be merged into one family-independent function in order to
reduce copy'n'paste and generated code.
Later with TCP-AO option added, this will allow to create one function
that's responsible for segment verification, that will have all the
different checks for MD5/AO/non-signed packets, which in turn will help
to see checks for all corner-cases in one function, rather than spread
around different families and functions.

Cc: Eric Dumazet <edumazet@google.com>
Cc: Hideaki YOSHIFUJI <yoshfuji@linux-ipv6.org>
Signed-off-by: Dmitry Safonov <dima@arista.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Link: https://lore.kernel.org/r/20220223175740.452397-1-dima@arista.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/tcp.h | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'include/net')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 04f4650e0ff0..479a27777ad6 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1674,6 +1674,11 @@ tcp_md5_do_lookup(const struct sock *sk, int l3index,
 		return NULL;
 	return __tcp_md5_do_lookup(sk, l3index, addr, family);
 }
+bool tcp_inbound_md5_hash(const struct sock *sk, const struct sk_buff *skb,
+			  enum skb_drop_reason *reason,
+			  const void *saddr, const void *daddr,
+			  int family, int dif, int sdif);
+
 
 #define tcp_twsk_md5_key(twsk)	((twsk)->tw_md5_key)
 #else
@@ -1683,6 +1688,14 @@ tcp_md5_do_lookup(const struct sock *sk, int l3index,
 {
 	return NULL;
 }
+static inline bool tcp_inbound_md5_hash(const struct sock *sk,
+					const struct sk_buff *skb,
+					enum skb_drop_reason *reason,
+					const void *saddr, const void *daddr,
+					int family, int dif, int sdif)
+{
+	return false;
+}
 #define tcp_twsk_md5_key(twsk)	NULL
 #endif
 
-- 
cgit v1.2.3


From c26933639b5402c174c65c01d33f145622784012 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Fri, 25 Feb 2022 11:22:22 +0200
Subject: net: dsa: request drivers to perform FDB isolation

For DSA, to encourage drivers to perform FDB isolation simply means to
track which bridge does each FDB and MDB entry belong to. It then
becomes the driver responsibility to use something that makes the FDB
entry from one bridge not match the FDB lookup of ports from other
bridges.

The top-level functions where the bridge is determined are:
- dsa_port_fdb_{add,del}
- dsa_port_host_fdb_{add,del}
- dsa_port_mdb_{add,del}
- dsa_port_host_mdb_{add,del}

aka the pre-crosschip-notifier functions.

Changing the API to pass a reference to a bridge is not superfluous, and
looking at the passed bridge argument is not the same as having the
driver look at dsa_to_port(ds, port)->bridge from the ->port_fdb_add()
method.

DSA installs FDB and MDB entries on shared (CPU and DSA) ports as well,
and those do not have any dp->bridge information to retrieve, because
they are not in any bridge - they are merely the pipes that serve the
user ports that are in one or multiple bridges.

The struct dsa_bridge associated with each FDB/MDB entry is encapsulated
in a larger "struct dsa_db" database. Although only databases associated
to bridges are notified for now, this API will be the starting point for
implementing IFF_UNICAST_FLT in DSA. There, the idea is to install FDB
entries on the CPU port which belong to the corresponding user port's
port database. These are supposed to match only when the port is
standalone.

It is better to introduce the API in its expected final form than to
introduce it for bridges first, then to have to change drivers which may
have made one or more assumptions.

Drivers can use the provided bridge.num, but they can also use a
different numbering scheme that is more convenient.

DSA must perform refcounting on the CPU and DSA ports by also taking
into account the bridge number. So if two bridges request the same local
address, DSA must notify the driver twice, once for each bridge.

In fact, if the driver supports FDB isolation, DSA must perform
refcounting per bridge, but if the driver doesn't, DSA must refcount
host addresses across all bridges, otherwise it would be telling the
driver to delete an FDB entry for a bridge and the driver would delete
it for all bridges. So introduce a bool fdb_isolation in drivers which
would make all bridge databases passed to the cross-chip notifier have
the same number (0). This makes dsa_mac_addr_find() -> dsa_db_equal()
say that all bridge databases are the same database - which is
essentially the legacy behavior.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dsa.h | 42 ++++++++++++++++++++++++++++++++++++------
 1 file changed, 36 insertions(+), 6 deletions(-)

(limited to 'include/net')

diff --git a/include/net/dsa.h b/include/net/dsa.h
index 01faba89c987..87c5f18eb381 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -341,11 +341,28 @@ struct dsa_link {
 	struct list_head list;
 };
 
+enum dsa_db_type {
+	DSA_DB_PORT,
+	DSA_DB_LAG,
+	DSA_DB_BRIDGE,
+};
+
+struct dsa_db {
+	enum dsa_db_type type;
+
+	union {
+		const struct dsa_port *dp;
+		struct dsa_lag lag;
+		struct dsa_bridge bridge;
+	};
+};
+
 struct dsa_mac_addr {
 	unsigned char addr[ETH_ALEN];
 	u16 vid;
 	refcount_t refcount;
 	struct list_head list;
+	struct dsa_db db;
 };
 
 struct dsa_vlan {
@@ -409,6 +426,13 @@ struct dsa_switch {
 	 */
 	u32			mtu_enforcement_ingress:1;
 
+	/* Drivers that isolate the FDBs of multiple bridges must set this
+	 * to true to receive the bridge as an argument in .port_fdb_{add,del}
+	 * and .port_mdb_{add,del}. Otherwise, the bridge.num will always be
+	 * passed as zero.
+	 */
+	u32			fdb_isolation:1;
+
 	/* Listener for switch fabric events */
 	struct notifier_block	nb;
 
@@ -941,23 +965,29 @@ struct dsa_switch_ops {
 	 * Forwarding database
 	 */
 	int	(*port_fdb_add)(struct dsa_switch *ds, int port,
-				const unsigned char *addr, u16 vid);
+				const unsigned char *addr, u16 vid,
+				struct dsa_db db);
 	int	(*port_fdb_del)(struct dsa_switch *ds, int port,
-				const unsigned char *addr, u16 vid);
+				const unsigned char *addr, u16 vid,
+				struct dsa_db db);
 	int	(*port_fdb_dump)(struct dsa_switch *ds, int port,
 				 dsa_fdb_dump_cb_t *cb, void *data);
 	int	(*lag_fdb_add)(struct dsa_switch *ds, struct dsa_lag lag,
-			       const unsigned char *addr, u16 vid);
+			       const unsigned char *addr, u16 vid,
+			       struct dsa_db db);
 	int	(*lag_fdb_del)(struct dsa_switch *ds, struct dsa_lag lag,
-			       const unsigned char *addr, u16 vid);
+			       const unsigned char *addr, u16 vid,
+			       struct dsa_db db);
 
 	/*
 	 * Multicast database
 	 */
 	int	(*port_mdb_add)(struct dsa_switch *ds, int port,
-				const struct switchdev_obj_port_mdb *mdb);
+				const struct switchdev_obj_port_mdb *mdb,
+				struct dsa_db db);
 	int	(*port_mdb_del)(struct dsa_switch *ds, int port,
-				const struct switchdev_obj_port_mdb *mdb);
+				const struct switchdev_obj_port_mdb *mdb,
+				struct dsa_db db);
 	/*
 	 * RXNFC
 	 */
-- 
cgit v1.2.3


From 06b9cce42634a50f2840777a66553b02320db5ef Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Fri, 25 Feb 2022 11:22:23 +0200
Subject: net: dsa: pass extack to .port_bridge_join driver methods

As FDB isolation cannot be enforced between VLAN-aware bridges in lack
of hardware assistance like extra FID bits, it seems plausible that many
DSA switches cannot do it. Therefore, they need to reject configurations
with multiple VLAN-aware bridges from the two code paths that can
transition towards that state:

- joining a VLAN-aware bridge
- toggling VLAN awareness on an existing bridge

The .port_vlan_filtering method already propagates the netlink extack to
the driver, let's propagate it from .port_bridge_join too, to make sure
that the driver can use the same function for both.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dsa.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'include/net')

diff --git a/include/net/dsa.h b/include/net/dsa.h
index 87c5f18eb381..cfedcfb86350 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -937,7 +937,8 @@ struct dsa_switch_ops {
 	int	(*set_ageing_time)(struct dsa_switch *ds, unsigned int msecs);
 	int	(*port_bridge_join)(struct dsa_switch *ds, int port,
 				    struct dsa_bridge bridge,
-				    bool *tx_fwd_offload);
+				    bool *tx_fwd_offload,
+				    struct netlink_ext_ack *extack);
 	void	(*port_bridge_leave)(struct dsa_switch *ds, int port,
 				     struct dsa_bridge bridge);
 	void	(*port_stp_state_set)(struct dsa_switch *ds, int port,
@@ -1021,7 +1022,8 @@ struct dsa_switch_ops {
 	 */
 	int	(*crosschip_bridge_join)(struct dsa_switch *ds, int tree_index,
 					 int sw_index, int port,
-					 struct dsa_bridge bridge);
+					 struct dsa_bridge bridge,
+					 struct netlink_ext_ack *extack);
 	void	(*crosschip_bridge_leave)(struct dsa_switch *ds, int tree_index,
 					  int sw_index, int port,
 					  struct dsa_bridge bridge);
-- 
cgit v1.2.3


From b8cd5831c61cadee4493248babef8386f836f31c Mon Sep 17 00:00:00 2001
From: Jianbo Liu <jianbol@nvidia.com>
Date: Thu, 24 Feb 2022 10:29:07 +0000
Subject: net: flow_offload: add tc police action parameters

The current police offload action entry is missing exceed/notexceed
actions and parameters that can be configured by tc police action.
Add the missing parameters as a pre-step for offloading police actions
to hardware.

Signed-off-by: Jianbo Liu <jianbol@nvidia.com>
Signed-off-by: Roi Dayan <roid@nvidia.com>
Reviewed-by: Ido Schimmel <idosch@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/flow_offload.h     |  9 +++++++++
 include/net/tc_act/tc_police.h | 30 ++++++++++++++++++++++++++++++
 2 files changed, 39 insertions(+)

(limited to 'include/net')

diff --git a/include/net/flow_offload.h b/include/net/flow_offload.h
index 5b8c54eb7a6b..74f44d44abe3 100644
--- a/include/net/flow_offload.h
+++ b/include/net/flow_offload.h
@@ -148,6 +148,8 @@ enum flow_action_id {
 	FLOW_ACTION_MPLS_MANGLE,
 	FLOW_ACTION_GATE,
 	FLOW_ACTION_PPPOE_PUSH,
+	FLOW_ACTION_JUMP,
+	FLOW_ACTION_PIPE,
 	NUM_FLOW_ACTIONS,
 };
 
@@ -235,9 +237,16 @@ struct flow_action_entry {
 		struct {				/* FLOW_ACTION_POLICE */
 			u32			burst;
 			u64			rate_bytes_ps;
+			u64			peakrate_bytes_ps;
+			u32			avrate;
+			u16			overhead;
 			u64			burst_pkt;
 			u64			rate_pkt_ps;
 			u32			mtu;
+			struct {
+				enum flow_action_id	act_id;
+				u32			extval;
+			} exceed, notexceed;
 		} police;
 		struct {				/* FLOW_ACTION_CT */
 			int action;
diff --git a/include/net/tc_act/tc_police.h b/include/net/tc_act/tc_police.h
index 72649512dcdd..283bde711a42 100644
--- a/include/net/tc_act/tc_police.h
+++ b/include/net/tc_act/tc_police.h
@@ -159,4 +159,34 @@ static inline u32 tcf_police_tcfp_mtu(const struct tc_action *act)
 	return params->tcfp_mtu;
 }
 
+static inline u64 tcf_police_peakrate_bytes_ps(const struct tc_action *act)
+{
+	struct tcf_police *police = to_police(act);
+	struct tcf_police_params *params;
+
+	params = rcu_dereference_protected(police->params,
+					   lockdep_is_held(&police->tcf_lock));
+	return params->peak.rate_bytes_ps;
+}
+
+static inline u32 tcf_police_tcfp_ewma_rate(const struct tc_action *act)
+{
+	struct tcf_police *police = to_police(act);
+	struct tcf_police_params *params;
+
+	params = rcu_dereference_protected(police->params,
+					   lockdep_is_held(&police->tcf_lock));
+	return params->tcfp_ewma_rate;
+}
+
+static inline u16 tcf_police_rate_overhead(const struct tc_action *act)
+{
+	struct tcf_police *police = to_police(act);
+	struct tcf_police_params *params;
+
+	params = rcu_dereference_protected(police->params,
+					   lockdep_is_held(&police->tcf_lock));
+	return params->rate.overhead;
+}
+
 #endif /* __NET_TC_POLICE_H */
-- 
cgit v1.2.3


From d97b4b105ce71f9f907f1511986cc1d224126772 Mon Sep 17 00:00:00 2001
From: Jianbo Liu <jianbol@nvidia.com>
Date: Thu, 24 Feb 2022 10:29:08 +0000
Subject: flow_offload: reject offload for all drivers with invalid police
 parameters

As more police parameters are passed to flow_offload, driver can check
them to make sure hardware handles packets in the way indicated by tc.
The conform-exceed control should be drop/pipe or drop/ok. Besides,
for drop/ok, the police should be the last action. As hardware can't
configure peakrate/avrate/overhead, offload should not be supported if
any of them is configured.

Signed-off-by: Jianbo Liu <jianbol@nvidia.com>
Reviewed-by: Ido Schimmel <idosch@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/flow_offload.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/net')

diff --git a/include/net/flow_offload.h b/include/net/flow_offload.h
index 74f44d44abe3..92267d23083e 100644
--- a/include/net/flow_offload.h
+++ b/include/net/flow_offload.h
@@ -311,6 +311,12 @@ static inline bool flow_offload_has_one_action(const struct flow_action *action)
 	return action->num_entries == 1;
 }
 
+static inline bool flow_action_is_last_entry(const struct flow_action *action,
+					     const struct flow_action_entry *entry)
+{
+	return entry == &action->entries[action->num_entries - 1];
+}
+
 #define flow_action_for_each(__i, __act, __actions)			\
         for (__i = 0, __act = &(__actions)->entries[0];			\
 	     __i < (__actions)->num_entries;				\
-- 
cgit v1.2.3


From f9c4bb0b245cee35ef66f75bf409c9573d934cf9 Mon Sep 17 00:00:00 2001
From: Roopa Prabhu <roopa@nvidia.com>
Date: Tue, 1 Mar 2022 05:04:36 +0000
Subject: vxlan: vni filtering support on collect metadata device
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch adds vnifiltering support to collect metadata device.

Motivation:
You can only use a single vxlan collect metadata device for a given
vxlan udp port in the system today. The vxlan collect metadata device
terminates all received vxlan packets. As shown in the below diagram,
there are use-cases where you need to support multiple such vxlan devices in
independent bridge domains. Each vxlan device must terminate the vni's
it is configured for.
Example usecase: In a service provider network a service provider
typically supports multiple bridge domains with overlapping vlans.
One bridge domain per customer. Vlans in each bridge domain are
mapped to globally unique vxlan ranges assigned to each customer.

vnifiltering support in collect metadata devices terminates only configured
vnis. This is similar to vlan filtering in bridge driver. The vni filtering
capability is provided by a new flag on collect metadata device.

In the below pic:
	- customer1 is mapped to br1 bridge domain
	- customer2 is mapped to br2 bridge domain
	- customer1 vlan 10-11 is mapped to vni 1001-1002
	- customer2 vlan 10-11 is mapped to vni 2001-2002
	- br1 and br2 are vlan filtering bridges
	- vxlan1 and vxlan2 are collect metadata devices with
	  vnifiltering enabled

┌──────────────────────────────────────────────────────────────────┐
│  switch                                                          │
│                                                                  │
│         ┌───────────┐                 ┌───────────┐              │
│         │           │                 │           │              │
│         │   br1     │                 │   br2     │              │
│         └┬─────────┬┘                 └──┬───────┬┘              │
│     vlans│         │               vlans │       │               │
│     10,11│         │                10,11│       │               │
│          │     vlanvnimap:               │    vlanvnimap:        │
│          │       10-1001,11-1002         │      10-2001,11-2002  │
│          │         │                     │       │               │
│   ┌──────┴┐     ┌──┴─────────┐       ┌───┴────┐  │               │
│   │ swp1  │     │vxlan1      │       │ swp2   │ ┌┴─────────────┐ │
│   │       │     │  vnifilter:│       │        │ │vxlan2        │ │
│   └───┬───┘     │   1001,1002│       └───┬────┘ │ vnifilter:   │ │
│       │         └────────────┘           │      │  2001,2002   │ │
│       │                                  │      └──────────────┘ │
│       │                                  │                       │
└───────┼──────────────────────────────────┼───────────────────────┘
        │                                  │
        │                                  │
  ┌─────┴───────┐                          │
  │  customer1  │                    ┌─────┴──────┐
  │ host/VM     │                    │customer2   │
  └─────────────┘                    │ host/VM    │
                                     └────────────┘

With this implementation, vxlan dst metadata device can
be associated with range of vnis.
struct vxlan_vni_node is introduced to represent
a configured vni. We start with vni and its
associated remote_ip in this structure. This
structure can be extended to bring in other
per vni attributes if there are usecases for it.
A vni inherits an attribute from the base vxlan device
if there is no per vni attributes defined.

struct vxlan_dev gets a new rhashtable for
vnis called vxlan_vni_group. vxlan_vnifilter.c
implements the necessary netlink api, notifications
and helper functions to process and manage lifecycle
of vxlan_vni_node.

This patch also adds new helper functions in vxlan_multicast.c
to handle per vni remote_ip multicast groups which are part
of vxlan_vni_group.

Fix build problems:
Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Roopa Prabhu <roopa@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/vxlan.h | 28 ++++++++++++++++++++++++++--
 1 file changed, 26 insertions(+), 2 deletions(-)

(limited to 'include/net')

diff --git a/include/net/vxlan.h b/include/net/vxlan.h
index 5a934bebe630..8eb961bb9589 100644
--- a/include/net/vxlan.h
+++ b/include/net/vxlan.h
@@ -232,6 +232,25 @@ struct vxlan_dev_node {
 	struct vxlan_dev *vxlan;
 };
 
+struct vxlan_vni_node {
+	struct rhash_head vnode;
+	struct vxlan_dev_node hlist4; /* vni hash table for IPv4 socket */
+#if IS_ENABLED(CONFIG_IPV6)
+	struct vxlan_dev_node hlist6; /* vni hash table for IPv6 socket */
+#endif
+	struct list_head vlist;
+	__be32 vni;
+	union vxlan_addr remote_ip; /* default remote ip for this vni */
+
+	struct rcu_head rcu;
+};
+
+struct vxlan_vni_group {
+	struct rhashtable	vni_hash;
+	struct list_head	vni_list;
+	u32			num_vnis;
+};
+
 /* Pseudo network device */
 struct vxlan_dev {
 	struct vxlan_dev_node hlist4;	/* vni hash table for IPv4 socket */
@@ -254,6 +273,8 @@ struct vxlan_dev {
 
 	struct vxlan_config	cfg;
 
+	struct vxlan_vni_group  __rcu *vnigrp;
+
 	struct hlist_head fdb_head[FDB_HASH_SIZE];
 };
 
@@ -274,6 +295,7 @@ struct vxlan_dev {
 #define VXLAN_F_GPE			0x4000
 #define VXLAN_F_IPV6_LINKLOCAL		0x8000
 #define VXLAN_F_TTL_INHERIT		0x10000
+#define VXLAN_F_VNIFILTER               0x20000
 
 /* Flags that are used in the receive path. These flags must match in
  * order for a socket to be shareable
@@ -283,7 +305,8 @@ struct vxlan_dev {
 					 VXLAN_F_UDP_ZERO_CSUM6_RX |	\
 					 VXLAN_F_REMCSUM_RX |		\
 					 VXLAN_F_REMCSUM_NOPARTIAL |	\
-					 VXLAN_F_COLLECT_METADATA)
+					 VXLAN_F_COLLECT_METADATA |	\
+					 VXLAN_F_VNIFILTER)
 
 /* Flags that can be set together with VXLAN_F_GPE. */
 #define VXLAN_F_ALLOWED_GPE		(VXLAN_F_GPE |			\
@@ -292,7 +315,8 @@ struct vxlan_dev {
 					 VXLAN_F_UDP_ZERO_CSUM_TX |	\
 					 VXLAN_F_UDP_ZERO_CSUM6_TX |	\
 					 VXLAN_F_UDP_ZERO_CSUM6_RX |	\
-					 VXLAN_F_COLLECT_METADATA)
+					 VXLAN_F_COLLECT_METADATA  |	\
+					 VXLAN_F_VNIFILTER)
 
 struct net_device *vxlan_dev_create(struct net *net, const char *name,
 				    u8 name_assign_type, struct vxlan_config *conf);
-- 
cgit v1.2.3


From 4095e0e1328a3cd9e3b30174d6cb0edb3824256d Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <nikolay@nvidia.com>
Date: Tue, 1 Mar 2022 05:04:38 +0000
Subject: drivers: vxlan: vnifilter: per vni stats

Add per-vni statistics for vni filter mode. Counting Rx/Tx
bytes/packets/drops/errors at the appropriate places.

This patch changes vxlan_vs_find_vni to also return the
vxlan_vni_node in cases where the vni belongs to a vni
filtering vxlan device

Signed-off-by: Nikolay Aleksandrov <nikolay@nvidia.com>
Signed-off-by: Roopa Prabhu <roopa@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/vxlan.h | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

(limited to 'include/net')

diff --git a/include/net/vxlan.h b/include/net/vxlan.h
index 8eb961bb9589..bca5b01af247 100644
--- a/include/net/vxlan.h
+++ b/include/net/vxlan.h
@@ -227,6 +227,31 @@ struct vxlan_config {
 	enum ifla_vxlan_df	df;
 };
 
+enum {
+	VXLAN_VNI_STATS_RX,
+	VXLAN_VNI_STATS_RX_DROPS,
+	VXLAN_VNI_STATS_RX_ERRORS,
+	VXLAN_VNI_STATS_TX,
+	VXLAN_VNI_STATS_TX_DROPS,
+	VXLAN_VNI_STATS_TX_ERRORS,
+};
+
+struct vxlan_vni_stats {
+	u64 rx_packets;
+	u64 rx_bytes;
+	u64 rx_drops;
+	u64 rx_errors;
+	u64 tx_packets;
+	u64 tx_bytes;
+	u64 tx_drops;
+	u64 tx_errors;
+};
+
+struct vxlan_vni_stats_pcpu {
+	struct vxlan_vni_stats stats;
+	struct u64_stats_sync syncp;
+};
+
 struct vxlan_dev_node {
 	struct hlist_node hlist;
 	struct vxlan_dev *vxlan;
@@ -241,6 +266,7 @@ struct vxlan_vni_node {
 	struct list_head vlist;
 	__be32 vni;
 	union vxlan_addr remote_ip; /* default remote ip for this vni */
+	struct vxlan_vni_stats_pcpu __percpu *stats;
 
 	struct rcu_head rcu;
 };
-- 
cgit v1.2.3


From 462791bbfa350189e309a5a94541f6b63cd874e8 Mon Sep 17 00:00:00 2001
From: Dust Li <dust.li@linux.alibaba.com>
Date: Tue, 1 Mar 2022 17:43:56 +0800
Subject: net/smc: add sysctl interface for SMC

This patch add sysctl interface to support container environment
for SMC as we talk in the mail list.

Link: https://lore.kernel.org/netdev/20220224020253.GF5443@linux.alibaba.com
Co-developed-by: Tony Lu <tonylu@linux.alibaba.com>
Signed-off-by: Tony Lu <tonylu@linux.alibaba.com>
Signed-off-by: Dust Li <dust.li@linux.alibaba.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/netns/smc.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/net')

diff --git a/include/net/netns/smc.h b/include/net/netns/smc.h
index 47b166684fd8..1682eae50579 100644
--- a/include/net/netns/smc.h
+++ b/include/net/netns/smc.h
@@ -14,5 +14,8 @@ struct netns_smc {
 	struct smc_stats_rsn		*fback_rsn;
 
 	bool				limit_smc_hs;	/* constraint on handshake */
+#ifdef CONFIG_SYSCTL
+	struct ctl_table_header		*smc_hdr;
+#endif
 };
 #endif
-- 
cgit v1.2.3


From 12bbb0d163a90d81a2677cf7808d364697290207 Mon Sep 17 00:00:00 2001
From: Dust Li <dust.li@linux.alibaba.com>
Date: Tue, 1 Mar 2022 17:43:58 +0800
Subject: net/smc: add sysctl for autocorking

This add a new sysctl: net.smc.autocorking_size

We can dynamically change the behaviour of autocorking
by change the value of autocorking_size.
Setting to 0 disables autocorking in SMC

Signed-off-by: Dust Li <dust.li@linux.alibaba.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/netns/smc.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/net')

diff --git a/include/net/netns/smc.h b/include/net/netns/smc.h
index 1682eae50579..e5389eeaf8bd 100644
--- a/include/net/netns/smc.h
+++ b/include/net/netns/smc.h
@@ -17,5 +17,6 @@ struct netns_smc {
 #ifdef CONFIG_SYSCTL
 	struct ctl_table_header		*smc_hdr;
 #endif
+	unsigned int			sysctl_autocorking_size;
 };
 #endif
-- 
cgit v1.2.3


From 42f0c1934c7cb3e94c2fe8f5771245fa5631d0e7 Mon Sep 17 00:00:00 2001
From: Tao Chen <chentao3@hotmail.com>
Date: Tue, 1 Mar 2022 06:35:42 -0800
Subject: tcp: Remove the unused api

Last tcp_write_queue_head() use was removed in commit
114f39feab36 ("tcp: restore autocorking"), so remove it.

Signed-off-by: Tao Chen <chentao3@hotmail.com>
Link: https://lore.kernel.org/r/SYZP282MB33317DEE1253B37C0F57231E86029@SYZP282MB3331.AUSP282.PROD.OUTLOOK.COM
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/tcp.h | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'include/net')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 479a27777ad6..d486d7b6112d 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1831,11 +1831,6 @@ static inline struct sk_buff *tcp_rtx_queue_tail(const struct sock *sk)
 	return skb_rb_last(&sk->tcp_rtx_queue);
 }
 
-static inline struct sk_buff *tcp_write_queue_head(const struct sock *sk)
-{
-	return skb_peek(&sk->sk_write_queue);
-}
-
 static inline struct sk_buff *tcp_write_queue_tail(const struct sock *sk)
 {
 	return skb_peek_tail(&sk->sk_write_queue);
-- 
cgit v1.2.3


From 8610037e8106b48c79cfe0afb92b2b2466e51c3d Mon Sep 17 00:00:00 2001
From: Joe Damato <jdamato@fastly.com>
Date: Tue, 1 Mar 2022 23:55:47 -0800
Subject: page_pool: Add allocation stats

Add per-pool statistics counters for the allocation path of a page pool.
These stats are incremented in softirq context, so no locking or per-cpu
variables are needed.

This code is disabled by default and a kernel config option is provided for
users who wish to enable them.

The statistics added are:
	- fast: successful fast path allocations
	- slow: slow path order-0 allocations
	- slow_high_order: slow path high order allocations
	- empty: ptr ring is empty, so a slow path allocation was forced.
	- refill: an allocation which triggered a refill of the cache
	- waive: pages obtained from the ptr ring that cannot be added to
	  the cache due to a NUMA mismatch.

Signed-off-by: Joe Damato <jdamato@fastly.com>
Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
Reviewed-by: Ilias Apalodimas <ilias.apalodimas@linaro.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/page_pool.h | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

(limited to 'include/net')

diff --git a/include/net/page_pool.h b/include/net/page_pool.h
index 97c3c19872ff..1f27e8a48830 100644
--- a/include/net/page_pool.h
+++ b/include/net/page_pool.h
@@ -84,6 +84,19 @@ struct page_pool_params {
 	void *init_arg;
 };
 
+#ifdef CONFIG_PAGE_POOL_STATS
+struct page_pool_alloc_stats {
+	u64 fast; /* fast path allocations */
+	u64 slow; /* slow-path order 0 allocations */
+	u64 slow_high_order; /* slow-path high order allocations */
+	u64 empty; /* failed refills due to empty ptr ring, forcing
+		    * slow path allocation
+		    */
+	u64 refill; /* allocations via successful refill */
+	u64 waive;  /* failed refills due to numa zone mismatch */
+};
+#endif
+
 struct page_pool {
 	struct page_pool_params p;
 
@@ -96,6 +109,11 @@ struct page_pool {
 	unsigned int frag_offset;
 	struct page *frag_page;
 	long frag_users;
+
+#ifdef CONFIG_PAGE_POOL_STATS
+	/* these stats are incremented while in softirq context */
+	struct page_pool_alloc_stats alloc_stats;
+#endif
 	u32 xdp_mem_id;
 
 	/*
-- 
cgit v1.2.3


From ad6fa1e1ab1b8164f1ba296b1b4dc556a483bcad Mon Sep 17 00:00:00 2001
From: Joe Damato <jdamato@fastly.com>
Date: Tue, 1 Mar 2022 23:55:48 -0800
Subject: page_pool: Add recycle stats

Add per-cpu stats tracking page pool recycling events:
	- cached: recycling placed page in the page pool cache
	- cache_full: page pool cache was full
	- ring: page placed into the ptr ring
	- ring_full: page released from page pool because the ptr ring was full
	- released_refcnt: page released (and not recycled) because refcnt > 1

Signed-off-by: Joe Damato <jdamato@fastly.com>
Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
Reviewed-by: Ilias Apalodimas <ilias.apalodimas@linaro.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/page_pool.h | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

(limited to 'include/net')

diff --git a/include/net/page_pool.h b/include/net/page_pool.h
index 1f27e8a48830..298af95bbf96 100644
--- a/include/net/page_pool.h
+++ b/include/net/page_pool.h
@@ -95,6 +95,18 @@ struct page_pool_alloc_stats {
 	u64 refill; /* allocations via successful refill */
 	u64 waive;  /* failed refills due to numa zone mismatch */
 };
+
+struct page_pool_recycle_stats {
+	u64 cached;	/* recycling placed page in the cache. */
+	u64 cache_full; /* cache was full */
+	u64 ring;	/* recycling placed page back into ptr ring */
+	u64 ring_full;	/* page was released from page-pool because
+			 * PTR ring was full.
+			 */
+	u64 released_refcnt; /* page released because of elevated
+			      * refcnt
+			      */
+};
 #endif
 
 struct page_pool {
@@ -144,6 +156,10 @@ struct page_pool {
 	 */
 	struct ptr_ring ring;
 
+#ifdef CONFIG_PAGE_POOL_STATS
+	/* recycle stats are per-cpu to avoid locking */
+	struct page_pool_recycle_stats __percpu *recycle_stats;
+#endif
 	atomic_t pages_state_release_cnt;
 
 	/* A page_pool is strictly tied to a single RX-queue being
-- 
cgit v1.2.3


From 6b95e3388b1ea0ca63500c5a6e39162dbf828433 Mon Sep 17 00:00:00 2001
From: Joe Damato <jdamato@fastly.com>
Date: Tue, 1 Mar 2022 23:55:49 -0800
Subject: page_pool: Add function to batch and return stats

Adds a function page_pool_get_stats which can be used by drivers to obtain
stats for a specified page_pool.

Signed-off-by: Joe Damato <jdamato@fastly.com>
Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
Reviewed-by: Ilias Apalodimas <ilias.apalodimas@linaro.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/page_pool.h | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

(limited to 'include/net')

diff --git a/include/net/page_pool.h b/include/net/page_pool.h
index 298af95bbf96..ea5fb70e5101 100644
--- a/include/net/page_pool.h
+++ b/include/net/page_pool.h
@@ -107,6 +107,23 @@ struct page_pool_recycle_stats {
 			      * refcnt
 			      */
 };
+
+/* This struct wraps the above stats structs so users of the
+ * page_pool_get_stats API can pass a single argument when requesting the
+ * stats for the page pool.
+ */
+struct page_pool_stats {
+	struct page_pool_alloc_stats alloc_stats;
+	struct page_pool_recycle_stats recycle_stats;
+};
+
+/*
+ * Drivers that wish to harvest page pool stats and report them to users
+ * (perhaps via ethtool, debugfs, or another mechanism) can allocate a
+ * struct page_pool_stats call page_pool_get_stats to get stats for the specified pool.
+ */
+bool page_pool_get_stats(struct page_pool *pool,
+			 struct page_pool_stats *stats);
 #endif
 
 struct page_pool {
-- 
cgit v1.2.3


From f9cef64fa23f6c1ff177be5082113c5a94e34e5d Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Wed, 2 Mar 2022 21:14:13 +0200
Subject: net: dsa: felix: migrate host FDB and MDB entries when changing tag
 proto

The "ocelot" and "ocelot-8021q" tagging protocols make use of different
hardware resources, and host FDB entries have different destination
ports in the switch analyzer module, practically speaking.

So when the user requests a tagging protocol change, the driver must
migrate all host FDB and MDB entries from the NPI port (in fact CPU port
module) towards the same physical port, but this time used as a regular
port.

It is pointless for the felix driver to keep a copy of the host
addresses, when we can create and export DSA helpers for walking through
the addresses that it already needs to keep on the CPU port, for
refcounting purposes.

felix_classify_db() is moved up to avoid a forward declaration.

We pass "bool change" because dp->fdbs and dp->mdbs are uninitialized
lists when felix_setup() first calls felix_set_tag_protocol(), so we
need to avoid calling dsa_port_walk_fdbs() during probe time.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dsa.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/net')

diff --git a/include/net/dsa.h b/include/net/dsa.h
index cfedcfb86350..71cc363dbbd4 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -1219,6 +1219,13 @@ struct dsa_switch_driver {
 
 struct net_device *dsa_dev_to_net_device(struct device *dev);
 
+typedef int dsa_fdb_walk_cb_t(struct dsa_switch *ds, int port,
+			      const unsigned char *addr, u16 vid,
+			      struct dsa_db db);
+
+int dsa_port_walk_fdbs(struct dsa_switch *ds, int port, dsa_fdb_walk_cb_t cb);
+int dsa_port_walk_mdbs(struct dsa_switch *ds, int port, dsa_fdb_walk_cb_t cb);
+
 /* Keep inline for faster access in hot path */
 static inline bool netdev_uses_dsa(const struct net_device *dev)
 {
-- 
cgit v1.2.3


From 8672406eb5d77333ca14e9612e3166704b367c40 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Wed, 2 Mar 2022 11:55:57 -0800
Subject: net: ip: Handle delivery_time in ip defrag

A latter patch will postpone the delivery_time clearing until the stack
knows the skb is being delivered locally.  That will allow other kernel
forwarding path (e.g. ip[6]_forward) to keep the delivery_time also.

An earlier attempt was to do skb_clear_delivery_time() in
ip_local_deliver() and ip6_input().  The discussion [0] requested
to move it one step later into ip_local_deliver_finish()
and ip6_input_finish() so that the delivery_time can be kept
for the ip_vs forwarding path also.

To do that, this patch also needs to take care of the (rcv) timestamp
usecase in ip_is_fragment().  It needs to expect delivery_time in
the skb->tstamp, so it needs to save the mono_delivery_time bit in
inet_frag_queue such that the delivery_time (if any) can be restored
in the final defragmented skb.

[Note that it will only happen when the locally generated skb is looping
 from egress to ingress over a virtual interface (e.g. veth, loopback...),
 skb->tstamp may have the delivery time before it is known that it will
 be delivered locally and received by another sk.]

[0]: https://lore.kernel.org/netdev/ca728d81-80e8-3767-d5e-d44f6ad96e43@ssi.bg/

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inet_frag.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/net')

diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
index 63540be0fc34..911ad930867d 100644
--- a/include/net/inet_frag.h
+++ b/include/net/inet_frag.h
@@ -70,6 +70,7 @@ struct frag_v6_compare_key {
  * @stamp: timestamp of the last received fragment
  * @len: total length of the original datagram
  * @meat: length of received fragments so far
+ * @mono_delivery_time: stamp has a mono delivery time (EDT)
  * @flags: fragment queue flags
  * @max_size: maximum received fragment size
  * @fqdir: pointer to struct fqdir
@@ -90,6 +91,7 @@ struct inet_frag_queue {
 	ktime_t			stamp;
 	int			len;
 	int			meat;
+	u8			mono_delivery_time;
 	__u8			flags;
 	u16			max_size;
 	struct fqdir		*fqdir;
-- 
cgit v1.2.3


From 3a0318140a6f8c3ab60f1f46c3f203923cb01882 Mon Sep 17 00:00:00 2001
From: Changcheng Deng <deng.changcheng@zte.com.cn>
Date: Fri, 21 Jan 2022 01:35:08 +0000
Subject: Bluetooth: mgmt: Replace zero-length array with flexible-array member

There is a regular need in the kernel to provide a way to declare having
a dynamically sized set of trailing elements in a structure. Kernel code
should always use "flexible array members" for these cases. The older
style of one-element or zero-length arrays should no longer be used.
Reference:
https://www.kernel.org/doc/html/latest/process/deprecated.html#zero-length-and-one-element-arrays

Reported-by: Zeal Robot <zealci@zte.com.cn>
Signed-off-by: Changcheng Deng <deng.changcheng@zte.com.cn>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 include/net/bluetooth/mgmt.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/bluetooth/mgmt.h b/include/net/bluetooth/mgmt.h
index 99266f7aebdc..3d26e6a3478b 100644
--- a/include/net/bluetooth/mgmt.h
+++ b/include/net/bluetooth/mgmt.h
@@ -1112,7 +1112,7 @@ struct mgmt_ev_adv_monitor_device_found {
 	__s8   rssi;
 	__le32 flags;
 	__le16 eir_len;
-	__u8   eir[0];
+	__u8   eir[];
 } __packed;
 
 #define MGMT_EV_ADV_MONITOR_DEVICE_LOST		0x0030
-- 
cgit v1.2.3


From 9b392e0e0b6d026da5a62bb79a08f32e27af858e Mon Sep 17 00:00:00 2001
From: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
Date: Thu, 3 Mar 2022 13:11:57 -0800
Subject: Bluetooth: Fix not checking for valid hdev on
 bt_dev_{info,warn,err,dbg}

This fixes attemting to print hdev->name directly which causes them to
print an error:

kernel: read_version:367: (efault): sock 000000006a3008f2

Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 include/net/bluetooth/bluetooth.h | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

(limited to 'include/net')

diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h
index a647e5fabdbd..2aa5e95808a5 100644
--- a/include/net/bluetooth/bluetooth.h
+++ b/include/net/bluetooth/bluetooth.h
@@ -204,19 +204,21 @@ void bt_err_ratelimited(const char *fmt, ...);
 #define BT_DBG(fmt, ...)	pr_debug(fmt "\n", ##__VA_ARGS__)
 #endif
 
+#define bt_dev_name(hdev) ((hdev) ? (hdev)->name : "null")
+
 #define bt_dev_info(hdev, fmt, ...)				\
-	BT_INFO("%s: " fmt, (hdev)->name, ##__VA_ARGS__)
+	BT_INFO("%s: " fmt, bt_dev_name(hdev), ##__VA_ARGS__)
 #define bt_dev_warn(hdev, fmt, ...)				\
-	BT_WARN("%s: " fmt, (hdev)->name, ##__VA_ARGS__)
+	BT_WARN("%s: " fmt, bt_dev_name(hdev), ##__VA_ARGS__)
 #define bt_dev_err(hdev, fmt, ...)				\
-	BT_ERR("%s: " fmt, (hdev)->name, ##__VA_ARGS__)
+	BT_ERR("%s: " fmt, bt_dev_name(hdev), ##__VA_ARGS__)
 #define bt_dev_dbg(hdev, fmt, ...)				\
-	BT_DBG("%s: " fmt, (hdev)->name, ##__VA_ARGS__)
+	BT_DBG("%s: " fmt, bt_dev_name(hdev), ##__VA_ARGS__)
 
 #define bt_dev_warn_ratelimited(hdev, fmt, ...)			\
-	bt_warn_ratelimited("%s: " fmt, (hdev)->name, ##__VA_ARGS__)
+	bt_warn_ratelimited("%s: " fmt, bt_dev_name(hdev), ##__VA_ARGS__)
 #define bt_dev_err_ratelimited(hdev, fmt, ...)			\
-	bt_err_ratelimited("%s: " fmt, (hdev)->name, ##__VA_ARGS__)
+	bt_err_ratelimited("%s: " fmt, bt_dev_name(hdev), ##__VA_ARGS__)
 
 /* Connection and socket states */
 enum {
-- 
cgit v1.2.3


From cd87fecdedd7b54c61f5d7c2b7237b43126ac50a Mon Sep 17 00:00:00 2001
From: Luiz Angelo Daros de Luca <luizluca@gmail.com>
Date: Wed, 2 Mar 2022 22:52:34 -0300
Subject: net: dsa: tag_rtl8_4: add rtl8_4t trailing variant
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Realtek switches supports the same tag both before ethertype or between
payload and the CRC.

Signed-off-by: Luiz Angelo Daros de Luca <luizluca@gmail.com>
Reviewed-by: Alvin Šipraga <alsi@bang-olufsen.dk>
Reviewed-by: Vladimir Oltean <olteanv@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dsa.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/net')

diff --git a/include/net/dsa.h b/include/net/dsa.h
index 71cc363dbbd4..759479fe8573 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -52,6 +52,7 @@ struct phylink_link_state;
 #define DSA_TAG_PROTO_BRCM_LEGACY_VALUE		22
 #define DSA_TAG_PROTO_SJA1110_VALUE		23
 #define DSA_TAG_PROTO_RTL8_4_VALUE		24
+#define DSA_TAG_PROTO_RTL8_4T_VALUE		25
 
 enum dsa_tag_protocol {
 	DSA_TAG_PROTO_NONE		= DSA_TAG_PROTO_NONE_VALUE,
@@ -79,6 +80,7 @@ enum dsa_tag_protocol {
 	DSA_TAG_PROTO_SEVILLE		= DSA_TAG_PROTO_SEVILLE_VALUE,
 	DSA_TAG_PROTO_SJA1110		= DSA_TAG_PROTO_SJA1110_VALUE,
 	DSA_TAG_PROTO_RTL8_4		= DSA_TAG_PROTO_RTL8_4_VALUE,
+	DSA_TAG_PROTO_RTL8_4T		= DSA_TAG_PROTO_RTL8_4T_VALUE,
 };
 
 struct dsa_switch;
-- 
cgit v1.2.3


From 7e580490ac9819dd55a36be2a9b3380d1391f91b Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Tue, 8 Mar 2022 11:15:15 +0200
Subject: net: dsa: felix: avoid early deletion of host FDB entries

The Felix driver declares FDB isolation but puts all standalone ports in
VID 0. This is mostly problem-free as discussed with Alvin here:
https://patchwork.kernel.org/project/netdevbpf/cover/20220302191417.1288145-1-vladimir.oltean@nxp.com/#24763870

however there is one catch. DSA still thinks that FDB entries are
installed on the CPU port as many times as there are user ports, and
this is problematic when multiple user ports share the same MAC address.

Consider the default case where all user ports inherit their MAC address
from the DSA master, and then the user runs:

ip link set swp0 address 00:01:02:03:04:05

The above will make dsa_slave_set_mac_address() call
dsa_port_standalone_host_fdb_add() for 00:01:02:03:04:05 in port 0's
standalone database, and dsa_port_standalone_host_fdb_del() for the old
address of swp0, again in swp0's standalone database.

Both the ->port_fdb_add() and ->port_fdb_del() will be propagated down
to the felix driver, which will end up deleting the old MAC address from
the CPU port. But this is still in use by other user ports, so we end up
breaking unicast termination for them.

There isn't a problem in the fact that DSA keeps track of host
standalone addresses in the individual database of each user port: some
drivers like sja1105 need this. There also isn't a problem in the fact
that some drivers choose the same VID/FID for all standalone ports.
It is just that the deletion of these host addresses must be delayed
until they are known to not be in use any longer, and only the driver
has this knowledge. Since DSA keeps these addresses in &cpu_dp->fdbs and
&cpu_db->mdbs, it is just a matter of walking over those lists and see
whether the same MAC address is present on the CPU port in the port db
of another user port.

I have considered reusing the generic dsa_port_walk_fdbs() and
dsa_port_walk_mdbs() schemes for this, but locking makes it difficult.
In the ->port_fdb_add() method and co, &dp->addr_lists_lock is held, but
dsa_port_walk_fdbs() also acquires that lock. Also, even assuming that
we introduce an unlocked variant of the address iterator, we'd still
need some relatively complex data structures, and a void *ctx in the
dsa_fdb_walk_cb_t which we don't currently pass, such that drivers are
able to figure out, after iterating, whether the same MAC address is or
isn't present in the port db of another port.

All the above, plus the fact that I expect other drivers to follow the
same model as felix where all standalone ports use the same FID, made me
conclude that a generic method provided by DSA is necessary:
dsa_fdb_present_in_other_db() and the mdb equivalent. Felix calls this
from the ->port_fdb_del() handler for the CPU port, when the database
was classified to either a port db, or a LAG db.

For symmetry, we also call this from ->port_fdb_add(), because if the
address was installed once, then installing it a second time serves no
purpose: it's already in hardware in VID 0 and it affects all standalone
ports.

This change moves dsa_db_equal() from switch.c to dsa.c, since it now
has one more caller.

Fixes: 54c319846086 ("net: mscc: ocelot: enforce FDB isolation when VLAN-unaware")
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dsa.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/net')

diff --git a/include/net/dsa.h b/include/net/dsa.h
index 759479fe8573..9d16505fc0e2 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -1227,6 +1227,12 @@ typedef int dsa_fdb_walk_cb_t(struct dsa_switch *ds, int port,
 
 int dsa_port_walk_fdbs(struct dsa_switch *ds, int port, dsa_fdb_walk_cb_t cb);
 int dsa_port_walk_mdbs(struct dsa_switch *ds, int port, dsa_fdb_walk_cb_t cb);
+bool dsa_fdb_present_in_other_db(struct dsa_switch *ds, int port,
+				 const unsigned char *addr, u16 vid,
+				 struct dsa_db db);
+bool dsa_mdb_present_in_other_db(struct dsa_switch *ds, int port,
+				 const struct switchdev_obj_port_mdb *mdb,
+				 struct dsa_db db);
 
 /* Keep inline for faster access in hot path */
 static inline bool netdev_uses_dsa(const struct net_device *dev)
-- 
cgit v1.2.3


From 1330b6ef3313fcec577d2b020c290dc8b9f11f1a Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Mon, 7 Mar 2022 16:44:21 -0800
Subject: skb: make drop reason booleanable

We have a number of cases where function returns drop/no drop
decision as a boolean. Now that we want to report the reason
code as well we have to pass extra output arguments.

We can make the reason code evaluate correctly as bool.

I believe we're good to reorder the reasons as they are
reported to user space as strings.

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Reviewed-by: David Ahern <dsahern@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

(limited to 'include/net')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index d486d7b6112d..ee8237b58e1d 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1674,10 +1674,11 @@ tcp_md5_do_lookup(const struct sock *sk, int l3index,
 		return NULL;
 	return __tcp_md5_do_lookup(sk, l3index, addr, family);
 }
-bool tcp_inbound_md5_hash(const struct sock *sk, const struct sk_buff *skb,
-			  enum skb_drop_reason *reason,
-			  const void *saddr, const void *daddr,
-			  int family, int dif, int sdif);
+
+enum skb_drop_reason
+tcp_inbound_md5_hash(const struct sock *sk, const struct sk_buff *skb,
+		     const void *saddr, const void *daddr,
+		     int family, int dif, int sdif);
 
 
 #define tcp_twsk_md5_key(twsk)	((twsk)->tw_md5_key)
@@ -1688,13 +1689,13 @@ tcp_md5_do_lookup(const struct sock *sk, int l3index,
 {
 	return NULL;
 }
-static inline bool tcp_inbound_md5_hash(const struct sock *sk,
-					const struct sk_buff *skb,
-					enum skb_drop_reason *reason,
-					const void *saddr, const void *daddr,
-					int family, int dif, int sdif)
+
+static inline enum skb_drop_reason
+tcp_inbound_md5_hash(const struct sock *sk, const struct sk_buff *skb,
+		     const void *saddr, const void *daddr,
+		     int family, int dif, int sdif);
 {
-	return false;
+	return SKB_NOT_DROPPED_YET;
 }
 #define tcp_twsk_md5_key(twsk)	NULL
 #endif
-- 
cgit v1.2.3


From 24055bb87977e0c687b54ebf7bac8715f3636bc3 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Wed, 9 Mar 2022 14:20:12 +0200
Subject: net: tcp: fix shim definition of tcp_inbound_md5_hash

When CONFIG_TCP_MD5SIG isn't enabled, there is a compilation bug due to
the fact that the static inline definition of tcp_inbound_md5_hash() has
an unexpected semicolon. Remove it.

Fixes: 1330b6ef3313 ("skb: make drop reason booleanable")
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Link: https://lore.kernel.org/r/20220309122012.668986-1-vladimir.oltean@nxp.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/tcp.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index ee8237b58e1d..70ca4a5e330a 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1693,7 +1693,7 @@ tcp_md5_do_lookup(const struct sock *sk, int l3index,
 static inline enum skb_drop_reason
 tcp_inbound_md5_hash(const struct sock *sk, const struct sk_buff *skb,
 		     const void *saddr, const void *daddr,
-		     int family, int dif, int sdif);
+		     int family, int dif, int sdif)
 {
 	return SKB_NOT_DROPPED_YET;
 }
-- 
cgit v1.2.3


From 77f09e66f613e4801134c64d26a0be593588a42e Mon Sep 17 00:00:00 2001
From: Dimitris Michailidis <d.michailidis@fungible.com>
Date: Tue, 8 Mar 2022 19:40:31 -0800
Subject: net/tls: Provide {__,}tls_driver_ctx() unconditionally

Having the definitions of {__,}tls_driver_ctx() under an #if
guard means code referencing them also needs to rely on the
preprocessor. The protection doesn't appear needed so make the
definitions unconditional.

Fixes: db37bc177dae ("net/funeth: add the data path")
Reported-by: Randy Dunlap <rdunlap@infradead.org>
Reported-by: kernel test robot <lkp@intel.com>
Suggested-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Dimitris Michailidis <dmichail@fungible.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/tls.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/net')

diff --git a/include/net/tls.h b/include/net/tls.h
index 526cb2c3b724..b6968a5b5538 100644
--- a/include/net/tls.h
+++ b/include/net/tls.h
@@ -626,7 +626,6 @@ tls_offload_ctx_rx(const struct tls_context *tls_ctx)
 	return (struct tls_offload_context_rx *)tls_ctx->priv_ctx_rx;
 }
 
-#if IS_ENABLED(CONFIG_TLS_DEVICE)
 static inline void *__tls_driver_ctx(struct tls_context *tls_ctx,
 				     enum tls_offload_ctx_dir direction)
 {
@@ -641,7 +640,6 @@ tls_driver_ctx(const struct sock *sk, enum tls_offload_ctx_dir direction)
 {
 	return __tls_driver_ctx(tls_get_ctx(sk), direction);
 }
-#endif
 
 #define RESYNC_REQ BIT(0)
 #define RESYNC_REQ_ASYNC BIT(1)
-- 
cgit v1.2.3


From 65466904b015f6eeb9225b51aeb29b01a1d4b59c Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 8 Mar 2022 17:57:57 -0800
Subject: tcp: adjust TSO packet sizes based on min_rtt

Back when tcp_tso_autosize() and TCP pacing were introduced,
our focus was really to reduce burst sizes for long distance
flows.

The simple heuristic of using sk_pacing_rate/1024 has worked
well, but can lead to too small packets for hosts in the same
rack/cluster, when thousands of flows compete for the bottleneck.

Neal Cardwell had the idea of making the TSO burst size
a function of both sk_pacing_rate and tcp_min_rtt()

Indeed, for local flows, sending bigger bursts is better
to reduce cpu costs, as occasional losses can be repaired
quite fast.

This patch is based on Neal Cardwell implementation
done more than two years ago.
bbr is adjusting max_pacing_rate based on measured bandwidth,
while cubic would over estimate max_pacing_rate.

/proc/sys/net/ipv4/tcp_tso_rtt_log can be used to tune or disable
this new feature, in logarithmic steps.

Tested:

100Gbit NIC, two hosts in the same rack, 4K MTU.
600 flows rate-limited to 20000000 bytes per second.

Before patch: (TSO sizes would be limited to 20000000/1024/4096 -> 4 segments per TSO)

~# echo 0 >/proc/sys/net/ipv4/tcp_tso_rtt_log
~# nstat -n;perf stat ./super_netperf 600 -H otrv6 -l 20 -- -K dctcp -q 20000000;nstat|egrep "TcpInSegs|TcpOutSegs|TcpRetransSegs|Delivered"
  96005

 Performance counter stats for './super_netperf 600 -H otrv6 -l 20 -- -K dctcp -q 20000000':

         65,945.29 msec task-clock                #    2.845 CPUs utilized
         1,314,632      context-switches          # 19935.279 M/sec
             5,292      cpu-migrations            #   80.249 M/sec
           940,641      page-faults               # 14264.023 M/sec
   201,117,030,926      cycles                    # 3049769.216 GHz                   (83.45%)
    17,699,435,405      stalled-cycles-frontend   #    8.80% frontend cycles idle     (83.48%)
   136,584,015,071      stalled-cycles-backend    #   67.91% backend cycles idle      (83.44%)
    53,809,530,436      instructions              #    0.27  insn per cycle
                                                  #    2.54  stalled cycles per insn  (83.36%)
     9,062,315,523      branches                  # 137422329.563 M/sec               (83.22%)
       153,008,621      branch-misses             #    1.69% of all branches          (83.32%)

      23.182970846 seconds time elapsed

TcpInSegs                       15648792           0.0
TcpOutSegs                      58659110           0.0  # Average of 3.7 4K segments per TSO packet
TcpExtTCPDelivered              58654791           0.0
TcpExtTCPDeliveredCE            19                 0.0

After patch:

~# echo 9 >/proc/sys/net/ipv4/tcp_tso_rtt_log
~# nstat -n;perf stat ./super_netperf 600 -H otrv6 -l 20 -- -K dctcp -q 20000000;nstat|egrep "TcpInSegs|TcpOutSegs|TcpRetransSegs|Delivered"
  96046

 Performance counter stats for './super_netperf 600 -H otrv6 -l 20 -- -K dctcp -q 20000000':

         48,982.58 msec task-clock                #    2.104 CPUs utilized
           186,014      context-switches          # 3797.599 M/sec
             3,109      cpu-migrations            #   63.472 M/sec
           941,180      page-faults               # 19214.814 M/sec
   153,459,763,868      cycles                    # 3132982.807 GHz                   (83.56%)
    12,069,861,356      stalled-cycles-frontend   #    7.87% frontend cycles idle     (83.32%)
   120,485,917,953      stalled-cycles-backend    #   78.51% backend cycles idle      (83.24%)
    36,803,672,106      instructions              #    0.24  insn per cycle
                                                  #    3.27  stalled cycles per insn  (83.18%)
     5,947,266,275      branches                  # 121417383.427 M/sec               (83.64%)
        87,984,616      branch-misses             #    1.48% of all branches          (83.43%)

      23.281200256 seconds time elapsed

TcpInSegs                       1434706            0.0
TcpOutSegs                      58883378           0.0  # Average of 41 4K segments per TSO packet
TcpExtTCPDelivered              58878971           0.0
TcpExtTCPDeliveredCE            9664               0.0

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Neal Cardwell <ncardwell@google.com>
Link: https://lore.kernel.org/r/20220309015757.2532973-1-eric.dumazet@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/netns/ipv4.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index f0687867b5cd..ce0cc4e8d8c7 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -127,6 +127,7 @@ struct netns_ipv4 {
 	u8 sysctl_tcp_synack_retries;
 	u8 sysctl_tcp_syncookies;
 	u8 sysctl_tcp_migrate_req;
+	u8 sysctl_tcp_comp_sack_nr;
 	int sysctl_tcp_reordering;
 	u8 sysctl_tcp_retries1;
 	u8 sysctl_tcp_retries2;
@@ -160,9 +161,9 @@ struct netns_ipv4 {
 	int sysctl_tcp_challenge_ack_limit;
 	int sysctl_tcp_min_rtt_wlen;
 	u8 sysctl_tcp_min_tso_segs;
+	u8 sysctl_tcp_tso_rtt_log;
 	u8 sysctl_tcp_autocorking;
 	u8 sysctl_tcp_reflect_tos;
-	u8 sysctl_tcp_comp_sack_nr;
 	int sysctl_tcp_invalid_ratelimit;
 	int sysctl_tcp_pacing_ss_ratio;
 	int sysctl_tcp_pacing_ca_ratio;
-- 
cgit v1.2.3


From 3af722cb735d212554027ec81e2aa2e6bf1ee34d Mon Sep 17 00:00:00 2001
From: Christophe Leroy <christophe.leroy@csgroup.eu>
Date: Tue, 8 Mar 2022 17:12:10 +0100
Subject: powerpc/net: Implement powerpc specific csum_shift() to remove branch

Today's implementation of csum_shift() leads to branching based on
parity of 'offset'

	000002f8 <csum_block_add>:
	     2f8:	70 a5 00 01 	andi.   r5,r5,1
	     2fc:	41 a2 00 08 	beq     304 <csum_block_add+0xc>
	     300:	54 84 c0 3e 	rotlwi  r4,r4,24
	     304:	7c 63 20 14 	addc    r3,r3,r4
	     308:	7c 63 01 94 	addze   r3,r3
	     30c:	4e 80 00 20 	blr

Use first bit of 'offset' directly as input of the rotation instead of
branching.

	000002f8 <csum_block_add>:
	     2f8:	54 a5 1f 38 	rlwinm  r5,r5,3,28,28
	     2fc:	20 a5 00 20 	subfic  r5,r5,32
	     300:	5c 84 28 3e 	rotlw   r4,r4,r5
	     304:	7c 63 20 14 	addc    r3,r3,r4
	     308:	7c 63 01 94 	addze   r3,r3
	     30c:	4e 80 00 20 	blr

And change to left shift instead of right shift to skip one more
instruction. This has no impact on the final sum.

	000002f8 <csum_block_add>:
	     2f8:	54 a5 1f 38 	rlwinm  r5,r5,3,28,28
	     2fc:	5c 84 28 3e 	rotlw   r4,r4,r5
	     300:	7c 63 20 14 	addc    r3,r3,r4
	     304:	7c 63 01 94 	addze   r3,r3
	     308:	4e 80 00 20 	blr

Seems like only powerpc benefits from a branchless implementation.
Other main architectures like ARM or X86 get better code with
the generic implementation and its branch.

Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/checksum.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/net')

diff --git a/include/net/checksum.h b/include/net/checksum.h
index 79c67f14c448..6bc783b7a06c 100644
--- a/include/net/checksum.h
+++ b/include/net/checksum.h
@@ -80,6 +80,7 @@ static __always_inline __sum16 csum16_sub(__sum16 csum, __be16 addend)
 	return csum16_add(csum, ~addend);
 }
 
+#ifndef HAVE_ARCH_CSUM_SHIFT
 static __always_inline __wsum csum_shift(__wsum sum, int offset)
 {
 	/* rotate sum to align it with a 16b boundary */
@@ -87,6 +88,7 @@ static __always_inline __wsum csum_shift(__wsum sum, int offset)
 		return (__force __wsum)ror32((__force u32)sum, 8);
 	return sum;
 }
+#endif
 
 static __always_inline __wsum
 csum_block_add(__wsum csum, __wsum csum2, int offset)
-- 
cgit v1.2.3


From 7d352ccf1e9935b5222ca84e8baeb07a0c8f94b9 Mon Sep 17 00:00:00 2001
From: Youghandhar Chintala <youghand@codeaurora.org>
Date: Tue, 8 Mar 2022 17:23:24 +0530
Subject: mac80211: Add support to trigger sta disconnect on hardware restart

Currently in case of target hardware restart, we just reconfig and
re-enable the security keys and enable the network queues to start
data traffic back from where it was interrupted.

Many ath10k wifi chipsets have sequence numbers for the data
packets assigned by firmware and the mac sequence number will
restart from zero after target hardware restart leading to mismatch
in the sequence number expected by the remote peer vs the sequence
number of the frame sent by the target firmware.

This mismatch in sequence number will cause out-of-order packets
on the remote peer and all the frames sent by the device are dropped
until we reach the sequence number which was sent before we restarted
the target hardware

In order to fix this, we trigger a sta disconnect, in case of target
hw restart. After this there will be a fresh connection and thereby
avoiding the dropping of frames by remote peer.

The right fix would be to pull the entire data path into the host
which is not feasible or would need lots of complex changes and
will still be inefficient.

Tested on ath10k using WCN3990, QCA6174

Signed-off-by: Youghandhar Chintala <youghand@codeaurora.org>
Link: https://lore.kernel.org/r/20220308115325.5246-2-youghand@codeaurora.org
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include/net')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index f118b8fe667a..b8e8c82b53aa 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -6070,6 +6070,16 @@ void ieee80211_disconnect(struct ieee80211_vif *vif, bool reconnect);
  */
 void ieee80211_resume_disconnect(struct ieee80211_vif *vif);
 
+/**
+ * ieee80211_hw_restart_disconnect - disconnect from AP after
+ * hardware restart
+ * @vif: &struct ieee80211_vif pointer from the add_interface callback.
+ *
+ * Instructs mac80211 to disconnect from the AP after
+ * hardware restart.
+ */
+void ieee80211_hw_restart_disconnect(struct ieee80211_vif *vif);
+
 /**
  * ieee80211_cqm_rssi_notify - inform a configured connection quality monitoring
  *	rssi threshold triggered
-- 
cgit v1.2.3


From 9af41cc33471ea1efa6f77e188f055cc77d0a5c5 Mon Sep 17 00:00:00 2001
From: Wojciech Drewek <wojciech.drewek@intel.com>
Date: Fri, 4 Mar 2022 17:40:43 +0100
Subject: gtp: Implement GTP echo response

Adding GTP device through ip link creates the situation where
there is no userspace daemon which would handle GTP messages
(Echo Request for example). GTP-U instance which would not respond
to echo requests would violate GTP specification.

When GTP packet arrives with GTP_ECHO_REQ message type,
GTP_ECHO_RSP is send to the sender. GTP_ECHO_RSP message
should contain information element with GTPIE_RECOVERY tag and
restart counter value. For GTPv1 restart counter is not used
and should be equal to 0, for GTPv0 restart counter contains
information provided from userspace(IFLA_GTP_RESTART_COUNT).

Signed-off-by: Wojciech Drewek <wojciech.drewek@intel.com>
Suggested-by: Harald Welte <laforge@gnumonks.org>
Reviewed-by: Harald Welte <laforge@gnumonks.org>
Tested-by: Harald Welte <laforge@gnumonks.org>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
 include/net/gtp.h | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)

(limited to 'include/net')

diff --git a/include/net/gtp.h b/include/net/gtp.h
index 0e16ebb2a82d..0e12c37f2958 100644
--- a/include/net/gtp.h
+++ b/include/net/gtp.h
@@ -7,8 +7,13 @@
 #define GTP0_PORT	3386
 #define GTP1U_PORT	2152
 
+/* GTP messages types */
+#define GTP_ECHO_REQ	1	/* Echo Request */
+#define GTP_ECHO_RSP	2	/* Echo Response */
 #define GTP_TPDU	255
 
+#define GTPIE_RECOVERY	14
+
 struct gtp0_header {	/* According to GSM TS 09.60. */
 	__u8	flags;
 	__u8	type;
@@ -27,6 +32,32 @@ struct gtp1_header {	/* According to 3GPP TS 29.060. */
 	__be32	tid;
 } __attribute__ ((packed));
 
+struct gtp1_header_long {	/* According to 3GPP TS 29.060. */
+	__u8	flags;
+	__u8	type;
+	__be16	length;
+	__be32	tid;
+	__be16	seq;
+	__u8	npdu;
+	__u8	next;
+} __packed;
+
+/* GTP Information Element */
+struct gtp_ie {
+	__u8	tag;
+	__u8	val;
+} __packed;
+
+struct gtp0_packet {
+	struct gtp0_header gtp0_h;
+	struct gtp_ie ie;
+} __packed;
+
+struct gtp1u_packet {
+	struct gtp1_header_long gtp1u_h;
+	struct gtp_ie ie;
+} __packed;
+
 #define GTP1_F_NPDU	0x01
 #define GTP1_F_SEQ	0x02
 #define GTP1_F_EXTHDR	0x04
-- 
cgit v1.2.3


From e3acda7ade0a36c5cbebc2b54d30b7f08a4ba29b Mon Sep 17 00:00:00 2001
From: Wojciech Drewek <wojciech.drewek@intel.com>
Date: Fri, 4 Mar 2022 17:40:45 +0100
Subject: net/sched: Allow flower to match on GTP options

Options are as follows: PDU_TYPE:QFI and they refernce to
the fields from the  PDU Session Protocol. PDU Session data
is conveyed in GTP-U Extension Header.

GTP-U Extension Header is described in 3GPP TS 29.281.
PDU Session Protocol is described in 3GPP TS 38.415.

PDU_TYPE -  indicates the type of the PDU Session Information (4 bits)
QFI      -  QoS Flow Identifier (6 bits)

  # ip link add gtp_dev type gtp role sgsn
  # tc qdisc add dev gtp_dev ingress
  # tc filter add dev gtp_dev protocol ip parent ffff: \
      flower \
        enc_key_id 11 \
        gtp_opts 1:8/ff:ff \
      action mirred egress redirect dev eth0

Signed-off-by: Wojciech Drewek <wojciech.drewek@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
 include/net/gtp.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/net')

diff --git a/include/net/gtp.h b/include/net/gtp.h
index 0e12c37f2958..23c2aaae8a42 100644
--- a/include/net/gtp.h
+++ b/include/net/gtp.h
@@ -58,6 +58,11 @@ struct gtp1u_packet {
 	struct gtp_ie ie;
 } __packed;
 
+struct gtp_pdu_session_info {	/* According to 3GPP TS 38.415. */
+	u8	pdu_type;
+	u8	qfi;
+};
+
 #define GTP1_F_NPDU	0x01
 #define GTP1_F_SEQ	0x02
 #define GTP1_F_EXTHDR	0x04
-- 
cgit v1.2.3


From 81dd9849fa4911f76a14f354a048865894b9751e Mon Sep 17 00:00:00 2001
From: Wojciech Drewek <wojciech.drewek@intel.com>
Date: Fri, 4 Mar 2022 17:40:46 +0100
Subject: gtp: Add support for checking GTP device type

Add a function that checks if a net device type is GTP.

Signed-off-by: Wojciech Drewek <wojciech.drewek@intel.com>
Reviewed-by: Harald Welte <laforge@gnumonks.org>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
 include/net/gtp.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/net')

diff --git a/include/net/gtp.h b/include/net/gtp.h
index 23c2aaae8a42..c1d6169df331 100644
--- a/include/net/gtp.h
+++ b/include/net/gtp.h
@@ -63,6 +63,12 @@ struct gtp_pdu_session_info {	/* According to 3GPP TS 38.415. */
 	u8	qfi;
 };
 
+static inline bool netif_is_gtp(const struct net_device *dev)
+{
+	return dev->rtnl_link_ops &&
+		!strcmp(dev->rtnl_link_ops->kind, "gtp");
+}
+
 #define GTP1_F_NPDU	0x01
 #define GTP1_F_SEQ	0x02
 #define GTP1_F_EXTHDR	0x04
-- 
cgit v1.2.3


From 625788b5844511cf4c30cffa7fa0bc3a69cebc82 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 10 Mar 2022 21:14:20 -0800
Subject: net: add per-cpu storage and net->core_stats

Before adding yet another possibly contended atomic_long_t,
it is time to add per-cpu storage for existing ones:
 dev->tx_dropped, dev->rx_dropped, and dev->rx_nohandler

Because many devices do not have to increment such counters,
allocate the per-cpu storage on demand, so that dev_get_stats()
does not have to spend considerable time folding zero counters.

Note that some drivers have abused these counters which
were supposed to be only used by core networking stack.

v4: should use per_cpu_ptr() in dev_get_stats() (Jakub)
v3: added a READ_ONCE() in netdev_core_stats_alloc() (Paolo)
v2: add a missing include (reported by kernel test robot <lkp@intel.com>)
    Change in netdev_core_stats_alloc() (Jakub)

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: jeffreyji <jeffreyji@google.com>
Reviewed-by: Brian Vazquez <brianvv@google.com>
Reviewed-by: Jakub Kicinski <kuba@kernel.org>
Acked-by: Paolo Abeni <pabeni@redhat.com>
Link: https://lore.kernel.org/r/20220311051420.2608812-1-eric.dumazet@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/bonding.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/bonding.h b/include/net/bonding.h
index d0dfe727e0b1..b14f4c0b4e9e 100644
--- a/include/net/bonding.h
+++ b/include/net/bonding.h
@@ -770,7 +770,7 @@ extern const struct sysfs_ops slave_sysfs_ops;
 
 static inline netdev_tx_t bond_tx_drop(struct net_device *dev, struct sk_buff *skb)
 {
-	atomic_long_inc(&dev->tx_dropped);
+	dev_core_stats_tx_dropped_inc(dev);
 	dev_kfree_skb_any(skb);
 	return NET_XMIT_DROP;
 }
-- 
cgit v1.2.3


From d538eca85c2aa6ecb5036e6ff47efc93d9f294da Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Fri, 11 Mar 2022 23:15:18 +0200
Subject: net: dsa: report and change port default priority using dcbnl

The port-based default QoS class is assigned to packets that lack a
VLAN PCP (or the port is configured to not trust the VLAN PCP),
an IP DSCP (or the port is configured to not trust IP DSCP), and packets
on which no tc-skbedit action has matched.

Similar to other drivers, this can be exposed to user space using the
DCB Application Priority Table. IEEE 802.1Q-2018 specifies in Table
D-8 - Sel field values that when the Selector is 1, the Protocol ID
value of 0 denotes the "Default application priority. For use when
application priority is not otherwise specified."

The way in which the dcbnl integration in DSA has been designed has to
do with its requirements. Andrew Lunn explains that SOHO switches are
expected to come with some sort of pre-configured QoS profile, and that
it is desirable for this to come pre-loaded into the DSA slave interfaces'
DCB application priority table.

In the dcbnl design, this is possible because calls to dcb_ieee_setapp()
can be initiated by anyone including being self-initiated by this device
driver.

However, what makes this challenging to implement in DSA is that the DSA
core manages the net_devices (effectively hiding them from drivers),
while drivers manage the hardware. The DSA core has no knowledge of what
individual drivers' QoS policies are. DSA could export to drivers a
wrapper over dcb_ieee_setapp() and these could call that function to
pre-populate the app priority table, however drivers don't have a good
moment in time to do this. The dsa_switch_ops :: setup() method gets
called before the net_devices are created (dsa_slave_create), and so is
dsa_switch_ops :: port_setup(). What remains is dsa_switch_ops ::
port_enable(), but this gets called upon each ndo_open. If we add app
table entries on every open, we'd need to remove them on close, to avoid
duplicate entry errors. But if we delete app priority entries on close,
what we delete may not be the initial, driver pre-populated entries, but
rather user-added entries.

So it is clear that letting drivers choose the timing of the
dcb_ieee_setapp() call is inappropriate. The alternative which was
chosen is to introduce hardware-specific ops in dsa_switch_ops, and
effectively hide dcbnl details from drivers as well. For pre-populating
the application table, dsa_slave_dcbnl_init() will call
ds->ops->port_get_default_prio() which is supposed to read from
hardware. If the operation succeeds, DSA creates a default-prio app
table entry. The method is called as soon as the slave_dev is
registered, but before we release the rtnl_mutex. This is done such that
user space sees the app table entries as soon as it sees the interface
being registered.

The fact that we populate slave_dev->dcbnl_ops with a non-NULL pointer
changes behavior in dcb_doit() from net/dcb/dcbnl.c, which used to
return -EOPNOTSUPP for any dcbnl operation where netdev->dcbnl_ops is
NULL. Because there are still dcbnl-unaware DSA drivers even if they
have dcbnl_ops populated, the way to restore the behavior is to make all
dcbnl_ops return -EOPNOTSUPP on absence of the hardware-specific
dsa_switch_ops method.

The dcbnl framework absurdly allows there to be more than one app table
entry for the same selector and protocol (in other words, more than one
port-based default priority). In the iproute2 dcb program, there is a
"replace" syntactical sugar command which performs an "add" and a "del"
to hide this away. But we choose the largest configured priority when we
call ds->ops->port_set_default_prio(), using __fls(). When there is no
default-prio app table entry left, the port-default priority is restored
to 0.

Link: https://patchwork.kernel.org/project/netdevbpf/patch/20210113154139.1803705-2-olteanv@gmail.com/
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dsa.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/net')

diff --git a/include/net/dsa.h b/include/net/dsa.h
index 9d16505fc0e2..1220af73151b 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -892,6 +892,13 @@ struct dsa_switch_ops {
 	int	(*get_ts_info)(struct dsa_switch *ds, int port,
 			       struct ethtool_ts_info *ts);
 
+	/*
+	 * DCB ops
+	 */
+	int	(*port_get_default_prio)(struct dsa_switch *ds, int port);
+	int	(*port_set_default_prio)(struct dsa_switch *ds, int port,
+					 u8 prio);
+
 	/*
 	 * Suspend and resume
 	 */
-- 
cgit v1.2.3


From 47d75f7822064d024ec9207c0fc1777f983783b7 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Fri, 11 Mar 2022 23:15:19 +0200
Subject: net: dsa: report and change port dscp priority using dcbnl

Similar to the port-based default priority, IEEE 802.1Q-2018 allows the
Application Priority Table to define QoS classes (0 to 7) per IP DSCP
value (0 to 63).

In the absence of an app table entry for a packet with DSCP value X,
QoS classification for that packet falls back to other methods (VLAN PCP
or port-based default). The presence of an app table for DSCP value X
with priority Y makes the hardware classify the packet to QoS class Y.

As opposed to the default-prio where DSA exposes only a "set" in
dsa_switch_ops (because the port-based default is the fallback, it
always exists, either implicitly or explicitly), for DSCP priorities we
expose an "add" and a "del". The addition of a DSCP entry means trusting
that DSCP priority, the deletion means ignoring it.

Drivers that already trust (at least some) DSCP values can describe
their configuration in dsa_switch_ops :: port_get_dscp_prio(), which is
called for each DSCP value from 0 to 63.

Again, there can be more than one dcbnl app table entry for the same
DSCP value, DSA chooses the one with the largest configured priority.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dsa.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/net')

diff --git a/include/net/dsa.h b/include/net/dsa.h
index 1220af73151b..9bfe984fcdbf 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -898,6 +898,11 @@ struct dsa_switch_ops {
 	int	(*port_get_default_prio)(struct dsa_switch *ds, int port);
 	int	(*port_set_default_prio)(struct dsa_switch *ds, int port,
 					 u8 prio);
+	int	(*port_get_dscp_prio)(struct dsa_switch *ds, int port, u8 dscp);
+	int	(*port_add_dscp_prio)(struct dsa_switch *ds, int port, u8 dscp,
+				      u8 prio);
+	int	(*port_del_dscp_prio)(struct dsa_switch *ds, int port, u8 dscp,
+				      u8 prio);
 
 	/*
 	 * Suspend and resume
-- 
cgit v1.2.3


From 2b3171c6fe0af24b5506e061525e08917a2f744a Mon Sep 17 00:00:00 2001
From: Lorenzo Bianconi <lorenzo@kernel.org>
Date: Thu, 24 Feb 2022 12:54:58 +0100
Subject: mac80211: MBSSID beacon handling in AP mode

Add new fields in struct beacon_data to store all MBSSID elements.
Generate a beacon template which includes all MBSSID elements.
Move CSA offset to reflect the MBSSID element length.

Co-developed-by: Aloka Dixit <alokad@codeaurora.org>
Signed-off-by: Aloka Dixit <alokad@codeaurora.org>
Co-developed-by: John Crispin <john@phrozen.org>
Signed-off-by: John Crispin <john@phrozen.org>
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Tested-by: Money Wang <money.wang@mediatek.com>
Link: https://lore.kernel.org/r/5322db3c303f431adaf191ab31c45e151dde5465.1645702516.git.lorenzo@kernel.org
[small cleanups]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/net')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index b8e8c82b53aa..382ebb862ea8 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -4948,12 +4948,14 @@ void ieee80211_report_low_ack(struct ieee80211_sta *sta, u32 num_packets);
  * @cntdwn_counter_offs: array of IEEE80211_MAX_CNTDWN_COUNTERS_NUM offsets
  *	to countdown counters.  This array can contain zero values which
  *	should be ignored.
+ * @mbssid_off: position of the multiple bssid element
  */
 struct ieee80211_mutable_offsets {
 	u16 tim_offset;
 	u16 tim_length;
 
 	u16 cntdwn_counter_offs[IEEE80211_MAX_CNTDWN_COUNTERS_NUM];
+	u16 mbssid_off;
 };
 
 /**
-- 
cgit v1.2.3


From 40867d74c374b235e14d839f3a77f26684feefe5 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@kernel.org>
Date: Mon, 14 Mar 2022 14:45:51 -0600
Subject: net: Add l3mdev index to flow struct and avoid oif reset for port
 devices

The fundamental premise of VRF and l3mdev core code is binding a socket
to a device (l3mdev or netdev with an L3 domain) to indicate L3 scope.
Legacy code resets flowi_oif to the l3mdev losing any original port
device binding. Ben (among others) has demonstrated use cases where the
original port device binding is important and needs to be retained.
This patch handles that by adding a new entry to the common flow struct
that can indicate the l3mdev index for later rule and table matching
avoiding the need to reset flowi_oif.

In addition to allowing more use cases that require port device binds,
this patch brings a few datapath simplications:

1. l3mdev_fib_rule_match is only called when walking fib rules and
   always after l3mdev_update_flow. That allows an optimization to bail
   early for non-VRF type uses cases when flowi_l3mdev is not set. Also,
   only that index needs to be checked for the FIB table id.

2. l3mdev_update_flow can be called with flowi_oif set to a l3mdev
   (e.g., VRF) device. By resetting flowi_oif only for this case the
   FLOWI_FLAG_SKIP_NH_OIF flag is not longer needed and can be removed,
   removing several checks in the datapath. The flowi_iif path can be
   simplified to only be called if the it is not loopback (loopback can
   not be assigned to an L3 domain) and the l3mdev index is not already
   set.

3. Avoid another device lookup in the output path when the fib lookup
   returns a reject failure.

Note: 2 functional tests for local traffic with reject fib rules are
updated to reflect the new direct failure at FIB lookup time for ping
rather than the failure on packet path. The current code fails like this:

    HINT: Fails since address on vrf device is out of device scope
    COMMAND: ip netns exec ns-A ping -c1 -w1 -I eth1 172.16.3.1
    ping: Warning: source address might be selected on device other than: eth1
    PING 172.16.3.1 (172.16.3.1) from 172.16.3.1 eth1: 56(84) bytes of data.

    --- 172.16.3.1 ping statistics ---
    1 packets transmitted, 0 received, 100% packet loss, time 0ms

where the test now directly fails:

    HINT: Fails since address on vrf device is out of device scope
    COMMAND: ip netns exec ns-A ping -c1 -w1 -I eth1 172.16.3.1
    ping: connect: No route to host

Signed-off-by: David Ahern <dsahern@kernel.org>
Tested-by: Ben Greear <greearb@candelatech.com>
Link: https://lore.kernel.org/r/20220314204551.16369-1-dsahern@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/flow.h | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/flow.h b/include/net/flow.h
index 58beb16a49b8..987bd511d652 100644
--- a/include/net/flow.h
+++ b/include/net/flow.h
@@ -29,6 +29,7 @@ struct flowi_tunnel {
 struct flowi_common {
 	int	flowic_oif;
 	int	flowic_iif;
+	int     flowic_l3mdev;
 	__u32	flowic_mark;
 	__u8	flowic_tos;
 	__u8	flowic_scope;
@@ -36,7 +37,6 @@ struct flowi_common {
 	__u8	flowic_flags;
 #define FLOWI_FLAG_ANYSRC		0x01
 #define FLOWI_FLAG_KNOWN_NH		0x02
-#define FLOWI_FLAG_SKIP_NH_OIF		0x04
 	__u32	flowic_secid;
 	kuid_t  flowic_uid;
 	struct flowi_tunnel flowic_tun_key;
@@ -70,6 +70,7 @@ struct flowi4 {
 	struct flowi_common	__fl_common;
 #define flowi4_oif		__fl_common.flowic_oif
 #define flowi4_iif		__fl_common.flowic_iif
+#define flowi4_l3mdev		__fl_common.flowic_l3mdev
 #define flowi4_mark		__fl_common.flowic_mark
 #define flowi4_tos		__fl_common.flowic_tos
 #define flowi4_scope		__fl_common.flowic_scope
@@ -102,6 +103,7 @@ static inline void flowi4_init_output(struct flowi4 *fl4, int oif,
 {
 	fl4->flowi4_oif = oif;
 	fl4->flowi4_iif = LOOPBACK_IFINDEX;
+	fl4->flowi4_l3mdev = 0;
 	fl4->flowi4_mark = mark;
 	fl4->flowi4_tos = tos;
 	fl4->flowi4_scope = scope;
@@ -132,6 +134,7 @@ struct flowi6 {
 	struct flowi_common	__fl_common;
 #define flowi6_oif		__fl_common.flowic_oif
 #define flowi6_iif		__fl_common.flowic_iif
+#define flowi6_l3mdev		__fl_common.flowic_l3mdev
 #define flowi6_mark		__fl_common.flowic_mark
 #define flowi6_scope		__fl_common.flowic_scope
 #define flowi6_proto		__fl_common.flowic_proto
@@ -177,6 +180,7 @@ struct flowi {
 	} u;
 #define flowi_oif	u.__fl_common.flowic_oif
 #define flowi_iif	u.__fl_common.flowic_iif
+#define flowi_l3mdev	u.__fl_common.flowic_l3mdev
 #define flowi_mark	u.__fl_common.flowic_mark
 #define flowi_tos	u.__fl_common.flowic_tos
 #define flowi_scope	u.__fl_common.flowic_scope
-- 
cgit v1.2.3


From 0492d857636e1c52cd71594a723c4b26a7b31978 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Wed, 16 Mar 2022 11:19:43 +0100
Subject: netfilter: flowtable: Fix QinQ and pppoe support for inet table

nf_flow_offload_inet_hook() does not check for 802.1q and PPPoE.
Fetch inner ethertype from these encapsulation protocols.

Fixes: 72efd585f714 ("netfilter: flowtable: add pppoe support")
Fixes: 4cd91f7c290f ("netfilter: flowtable: add vlan support")
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_flow_table.h | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

(limited to 'include/net')

diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h
index bd59e950f4d6..64daafd1fc41 100644
--- a/include/net/netfilter/nf_flow_table.h
+++ b/include/net/netfilter/nf_flow_table.h
@@ -10,6 +10,8 @@
 #include <linux/netfilter/nf_conntrack_tuple_common.h>
 #include <net/flow_offload.h>
 #include <net/dst.h>
+#include <linux/if_pppox.h>
+#include <linux/ppp_defs.h>
 
 struct nf_flowtable;
 struct nf_flow_rule;
@@ -317,4 +319,20 @@ int nf_flow_rule_route_ipv6(struct net *net, const struct flow_offload *flow,
 int nf_flow_table_offload_init(void);
 void nf_flow_table_offload_exit(void);
 
+static inline __be16 nf_flow_pppoe_proto(const struct sk_buff *skb)
+{
+	__be16 proto;
+
+	proto = *((__be16 *)(skb_mac_header(skb) + ETH_HLEN +
+			     sizeof(struct pppoe_hdr)));
+	switch (proto) {
+	case htons(PPP_IP):
+		return htons(ETH_P_IP);
+	case htons(PPP_IPV6):
+		return htons(ETH_P_IPV6);
+	}
+
+	return 0;
+}
+
 #endif /* _NF_FLOW_TABLE_H */
-- 
cgit v1.2.3


From 2cb7b4890d6e7f20560dc251e7f8d3cc68b0d554 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Mon, 14 Mar 2022 23:00:04 -0700
Subject: devlink: expose instance locking and add locked port registering

It should be familiar and beneficial to expose devlink instance
lock to the drivers. This way drivers can block devlink from
calling them during critical sections without breakneck locking.

Add port helpers, port splitting callbacks will be the first
target.

Use 'devl_' prefix for "explicitly locked" API. Initial RFC used
'__devlink' but that's too much typing.

devl_lock_is_held() is not defined without lockdep, which is
the same behavior as lockdep_is_held() itself.

Reviewed-by: Leon Romanovsky <leonro@nvidia.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/devlink.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'include/net')

diff --git a/include/net/devlink.h b/include/net/devlink.h
index 8d5349d2fb68..9de0d091aee9 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -1479,6 +1479,17 @@ void *devlink_priv(struct devlink *devlink);
 struct devlink *priv_to_devlink(void *priv);
 struct device *devlink_to_dev(const struct devlink *devlink);
 
+/* Devlink instance explicit locking */
+void devl_lock(struct devlink *devlink);
+void devl_unlock(struct devlink *devlink);
+void devl_assert_locked(struct devlink *devlink);
+bool devl_lock_is_held(struct devlink *devlink);
+
+int devl_port_register(struct devlink *devlink,
+		       struct devlink_port *devlink_port,
+		       unsigned int port_index);
+void devl_port_unregister(struct devlink_port *devlink_port);
+
 struct ib_device;
 
 struct net *devlink_net(const struct devlink *devlink);
-- 
cgit v1.2.3


From 706217c1ceb516c96283a1557a31fe003d0c8052 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Mon, 14 Mar 2022 23:00:09 -0700
Subject: devlink: pass devlink_port to port_split / port_unsplit callbacks

Now that devlink ports are protected by the instance lock
it seems natural to pass devlink_port as an argument to
the port_split / port_unsplit callbacks.

This should save the drivers from doing a lookup.

In theory drivers may have supported unsplitting ports
which were not registered prior to this change.

Reviewed-by: Ido Schimmel <idosch@nvidia.com>
Tested-by: Ido Schimmel <idosch@nvidia.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/devlink.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/net')

diff --git a/include/net/devlink.h b/include/net/devlink.h
index 9de0d091aee9..fd89a17adea1 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -1197,9 +1197,9 @@ struct devlink_ops {
 			 struct netlink_ext_ack *extack);
 	int (*port_type_set)(struct devlink_port *devlink_port,
 			     enum devlink_port_type port_type);
-	int (*port_split)(struct devlink *devlink, unsigned int port_index,
+	int (*port_split)(struct devlink *devlink, struct devlink_port *port,
 			  unsigned int count, struct netlink_ext_ack *extack);
-	int (*port_unsplit)(struct devlink *devlink, unsigned int port_index,
+	int (*port_unsplit)(struct devlink *devlink, struct devlink_port *port,
 			    struct netlink_ext_ack *extack);
 	int (*sb_pool_get)(struct devlink *devlink, unsigned int sb_index,
 			   u16 pool_index,
-- 
cgit v1.2.3


From ab95465cde2337108252cdf01f064abdc1a67f6c Mon Sep 17 00:00:00 2001
From: Maor Dickman <maord@nvidia.com>
Date: Tue, 15 Mar 2022 13:02:09 +0200
Subject: net/sched: add vlan push_eth and pop_eth action to the hardware IR

Add vlan push_eth and pop_eth action to the hardware intermediate
representation model which would subsequently allow it to be used
by drivers for offload.

Signed-off-by: Maor Dickman <maord@nvidia.com>
Reviewed-by: Roi Dayan <roid@nvidia.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/flow_offload.h   |  6 ++++++
 include/net/tc_act/tc_vlan.h | 10 ++++++++++
 2 files changed, 16 insertions(+)

(limited to 'include/net')

diff --git a/include/net/flow_offload.h b/include/net/flow_offload.h
index 92267d23083e..021778a7e1af 100644
--- a/include/net/flow_offload.h
+++ b/include/net/flow_offload.h
@@ -150,6 +150,8 @@ enum flow_action_id {
 	FLOW_ACTION_PPPOE_PUSH,
 	FLOW_ACTION_JUMP,
 	FLOW_ACTION_PIPE,
+	FLOW_ACTION_VLAN_PUSH_ETH,
+	FLOW_ACTION_VLAN_POP_ETH,
 	NUM_FLOW_ACTIONS,
 };
 
@@ -211,6 +213,10 @@ struct flow_action_entry {
 			__be16		proto;
 			u8		prio;
 		} vlan;
+		struct {				/* FLOW_ACTION_VLAN_PUSH_ETH */
+			unsigned char dst[ETH_ALEN];
+			unsigned char src[ETH_ALEN];
+		} vlan_push_eth;
 		struct {				/* FLOW_ACTION_MANGLE */
 							/* FLOW_ACTION_ADD */
 			enum flow_action_mangle_base htype;
diff --git a/include/net/tc_act/tc_vlan.h b/include/net/tc_act/tc_vlan.h
index f94b8bc26f9e..a97600f742de 100644
--- a/include/net/tc_act/tc_vlan.h
+++ b/include/net/tc_act/tc_vlan.h
@@ -78,4 +78,14 @@ static inline u8 tcf_vlan_push_prio(const struct tc_action *a)
 
 	return tcfv_push_prio;
 }
+
+static inline void tcf_vlan_push_eth(unsigned char *src, unsigned char *dest,
+				     const struct tc_action *a)
+{
+	rcu_read_lock();
+	memcpy(dest, rcu_dereference(to_vlan(a)->vlan_p)->tcfv_push_dst, ETH_ALEN);
+	memcpy(dest, rcu_dereference(to_vlan(a)->vlan_p)->tcfv_push_src, ETH_ALEN);
+	rcu_read_unlock();
+}
+
 #endif /* __NET_TC_VLAN_H */
-- 
cgit v1.2.3


From 5142239a22219921a7863cf00c9ab853c00689d8 Mon Sep 17 00:00:00 2001
From: Lorenzo Bianconi <lorenzo@kernel.org>
Date: Fri, 11 Mar 2022 10:14:18 +0100
Subject: net: veth: Account total xdp_frame len running ndo_xdp_xmit

Even if this is a theoretical issue since it is not possible to perform
XDP_REDIRECT on a non-linear xdp_frame, veth driver does not account
paged area in ndo_xdp_xmit function pointer.
Introduce xdp_get_frame_len utility routine to get the xdp_frame full
length and account total frame size running XDP_REDIRECT of a
non-linear xdp frame into a veth device.

Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Toke Hoiland-Jorgensen <toke@redhat.com>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Link: https://lore.kernel.org/bpf/54f9fd3bb65d190daf2c0bbae2f852ff16cfbaa0.1646989407.git.lorenzo@kernel.org
---
 include/net/xdp.h | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

(limited to 'include/net')

diff --git a/include/net/xdp.h b/include/net/xdp.h
index b7721c3e4d1f..04c852c7a77f 100644
--- a/include/net/xdp.h
+++ b/include/net/xdp.h
@@ -343,6 +343,20 @@ out:
 	__xdp_release_frame(xdpf->data, mem);
 }
 
+static __always_inline unsigned int xdp_get_frame_len(struct xdp_frame *xdpf)
+{
+	struct skb_shared_info *sinfo;
+	unsigned int len = xdpf->len;
+
+	if (likely(!xdp_frame_has_frags(xdpf)))
+		goto out;
+
+	sinfo = xdp_get_shared_info_from_frame(xdpf);
+	len += sinfo->xdp_frags_size;
+out:
+	return len;
+}
+
 int __xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq,
 		       struct net_device *dev, u32 queue_index,
 		       unsigned int napi_id, u32 frag_size);
-- 
cgit v1.2.3


From 87c167bb94ee3fd49569d4aa2038b9b8840d906f Mon Sep 17 00:00:00 2001
From: Tobias Waldekranz <tobias@waldekranz.com>
Date: Wed, 16 Mar 2022 16:08:46 +0100
Subject: net: bridge: mst: Notify switchdev drivers of MST mode changes

Trigger a switchdev event whenever the bridge's MST mode is
enabled/disabled. This allows constituent ports to either perform any
required hardware config, or refuse the change if it not supported.

Signed-off-by: Tobias Waldekranz <tobias@waldekranz.com>
Acked-by: Nikolay Aleksandrov <razor@blackwall.org>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/switchdev.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/net')

diff --git a/include/net/switchdev.h b/include/net/switchdev.h
index 3e424d40fae3..85dd004dc9ad 100644
--- a/include/net/switchdev.h
+++ b/include/net/switchdev.h
@@ -27,6 +27,7 @@ enum switchdev_attr_id {
 	SWITCHDEV_ATTR_ID_BRIDGE_VLAN_PROTOCOL,
 	SWITCHDEV_ATTR_ID_BRIDGE_MC_DISABLED,
 	SWITCHDEV_ATTR_ID_BRIDGE_MROUTER,
+	SWITCHDEV_ATTR_ID_BRIDGE_MST,
 	SWITCHDEV_ATTR_ID_MRP_PORT_ROLE,
 };
 
@@ -48,6 +49,7 @@ struct switchdev_attr {
 		clock_t ageing_time;			/* BRIDGE_AGEING_TIME */
 		bool vlan_filtering;			/* BRIDGE_VLAN_FILTERING */
 		u16 vlan_protocol;			/* BRIDGE_VLAN_PROTOCOL */
+		bool mst;				/* BRIDGE_MST */
 		bool mc_disabled;			/* MC_DISABLED */
 		u8 mrp_port_role;			/* MRP_PORT_ROLE */
 	} u;
-- 
cgit v1.2.3


From 6284c723d9b9995cc27ab3c6368a9d95d67111ff Mon Sep 17 00:00:00 2001
From: Tobias Waldekranz <tobias@waldekranz.com>
Date: Wed, 16 Mar 2022 16:08:47 +0100
Subject: net: bridge: mst: Notify switchdev drivers of VLAN MSTI migrations

Whenever a VLAN moves to a new MSTI, send a switchdev notification so
that switchdevs can track a bridge's VID to MSTI mappings.

Signed-off-by: Tobias Waldekranz <tobias@waldekranz.com>
Acked-by: Nikolay Aleksandrov <razor@blackwall.org>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/switchdev.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/net')

diff --git a/include/net/switchdev.h b/include/net/switchdev.h
index 85dd004dc9ad..53dfa0f7cf5b 100644
--- a/include/net/switchdev.h
+++ b/include/net/switchdev.h
@@ -29,6 +29,7 @@ enum switchdev_attr_id {
 	SWITCHDEV_ATTR_ID_BRIDGE_MROUTER,
 	SWITCHDEV_ATTR_ID_BRIDGE_MST,
 	SWITCHDEV_ATTR_ID_MRP_PORT_ROLE,
+	SWITCHDEV_ATTR_ID_VLAN_MSTI,
 };
 
 struct switchdev_brport_flags {
@@ -36,6 +37,11 @@ struct switchdev_brport_flags {
 	unsigned long mask;
 };
 
+struct switchdev_vlan_msti {
+	u16 vid;
+	u16 msti;
+};
+
 struct switchdev_attr {
 	struct net_device *orig_dev;
 	enum switchdev_attr_id id;
@@ -52,6 +58,7 @@ struct switchdev_attr {
 		bool mst;				/* BRIDGE_MST */
 		bool mc_disabled;			/* MC_DISABLED */
 		u8 mrp_port_role;			/* MRP_PORT_ROLE */
+		struct switchdev_vlan_msti vlan_msti;	/* VLAN_MSTI */
 	} u;
 };
 
-- 
cgit v1.2.3


From 7ae9147f4312903b97eae231c48571bbd95dc63f Mon Sep 17 00:00:00 2001
From: Tobias Waldekranz <tobias@waldekranz.com>
Date: Wed, 16 Mar 2022 16:08:48 +0100
Subject: net: bridge: mst: Notify switchdev drivers of MST state changes

Generate a switchdev notification whenever an MST state changes. This
notification is keyed by the VLANs MSTI rather than the VID, since
multiple VLANs may share the same MST instance.

Signed-off-by: Tobias Waldekranz <tobias@waldekranz.com>
Acked-by: Nikolay Aleksandrov <razor@blackwall.org>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/switchdev.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/net')

diff --git a/include/net/switchdev.h b/include/net/switchdev.h
index 53dfa0f7cf5b..aa0171d5786d 100644
--- a/include/net/switchdev.h
+++ b/include/net/switchdev.h
@@ -19,6 +19,7 @@
 enum switchdev_attr_id {
 	SWITCHDEV_ATTR_ID_UNDEFINED,
 	SWITCHDEV_ATTR_ID_PORT_STP_STATE,
+	SWITCHDEV_ATTR_ID_PORT_MST_STATE,
 	SWITCHDEV_ATTR_ID_PORT_BRIDGE_FLAGS,
 	SWITCHDEV_ATTR_ID_PORT_PRE_BRIDGE_FLAGS,
 	SWITCHDEV_ATTR_ID_PORT_MROUTER,
@@ -32,6 +33,11 @@ enum switchdev_attr_id {
 	SWITCHDEV_ATTR_ID_VLAN_MSTI,
 };
 
+struct switchdev_mst_state {
+	u16 msti;
+	u8 state;
+};
+
 struct switchdev_brport_flags {
 	unsigned long val;
 	unsigned long mask;
@@ -50,6 +56,7 @@ struct switchdev_attr {
 	void (*complete)(struct net_device *dev, int err, void *priv);
 	union {
 		u8 stp_state;				/* PORT_STP_STATE */
+		struct switchdev_mst_state mst_state;	/* PORT_MST_STATE */
 		struct switchdev_brport_flags brport_flags; /* PORT_BRIDGE_FLAGS */
 		bool mrouter;				/* PORT_MROUTER */
 		clock_t ageing_time;			/* BRIDGE_AGEING_TIME */
-- 
cgit v1.2.3


From 8e6598a7b0fa834a563392d30017eac1b0102fcd Mon Sep 17 00:00:00 2001
From: Tobias Waldekranz <tobias@waldekranz.com>
Date: Wed, 16 Mar 2022 16:08:53 +0100
Subject: net: dsa: Pass VLAN MSTI migration notifications to driver

Add the usual trampoline functionality from the generic DSA layer down
to the drivers for VLAN MSTI migrations.

Signed-off-by: Tobias Waldekranz <tobias@waldekranz.com>
Reviewed-by: Vladimir Oltean <olteanv@gmail.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/dsa.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/net')

diff --git a/include/net/dsa.h b/include/net/dsa.h
index 9bfe984fcdbf..644fda2293a2 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -976,6 +976,9 @@ struct dsa_switch_ops {
 				 struct netlink_ext_ack *extack);
 	int	(*port_vlan_del)(struct dsa_switch *ds, int port,
 				 const struct switchdev_obj_port_vlan *vlan);
+	int	(*vlan_msti_set)(struct dsa_switch *ds, struct dsa_bridge bridge,
+				 const struct switchdev_vlan_msti *msti);
+
 	/*
 	 * Forwarding database
 	 */
-- 
cgit v1.2.3


From 7414af30b7d80f117bed5ad6769e6ffb433573ba Mon Sep 17 00:00:00 2001
From: Tobias Waldekranz <tobias@waldekranz.com>
Date: Wed, 16 Mar 2022 16:08:54 +0100
Subject: net: dsa: Handle MST state changes

Add the usual trampoline functionality from the generic DSA layer down
to the drivers for MST state changes.

When a state changes to disabled/blocking/listening, make sure to fast
age any dynamic entries in the affected VLANs (those controlled by the
MSTI in question).

Signed-off-by: Tobias Waldekranz <tobias@waldekranz.com>
Reviewed-by: Vladimir Oltean <olteanv@gmail.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/dsa.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/net')

diff --git a/include/net/dsa.h b/include/net/dsa.h
index 644fda2293a2..06cdefd3b9dd 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -957,7 +957,10 @@ struct dsa_switch_ops {
 				     struct dsa_bridge bridge);
 	void	(*port_stp_state_set)(struct dsa_switch *ds, int port,
 				      u8 state);
+	int	(*port_mst_state_set)(struct dsa_switch *ds, int port,
+				      const struct switchdev_mst_state *state);
 	void	(*port_fast_age)(struct dsa_switch *ds, int port);
+	int	(*port_vlan_fast_age)(struct dsa_switch *ds, int port, u16 vid);
 	int	(*port_pre_bridge_flags)(struct dsa_switch *ds, int port,
 					 struct switchdev_brport_flags flags,
 					 struct netlink_ext_ack *extack);
-- 
cgit v1.2.3


From 0148bb50b8fd51baf357de8b237c0c6011506540 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Wed, 16 Mar 2022 22:41:43 +0200
Subject: net: dsa: pass extack to dsa_switch_ops :: port_mirror_add()

Drivers might have error messages to propagate to user space, most
common being that they support a single mirror port.

Propagate the netlink extack so that they can inform user space in a
verbal way of their limitations.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/dsa.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/dsa.h b/include/net/dsa.h
index 06cdefd3b9dd..934958fda962 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -1028,7 +1028,7 @@ struct dsa_switch_ops {
 				    struct flow_cls_offload *cls, bool ingress);
 	int	(*port_mirror_add)(struct dsa_switch *ds, int port,
 				   struct dsa_mall_mirror_tc_entry *mirror,
-				   bool ingress);
+				   bool ingress, struct netlink_ext_ack *extack);
 	void	(*port_mirror_del)(struct dsa_switch *ds, int port,
 				   struct dsa_mall_mirror_tc_entry *mirror);
 	int	(*port_policer_add)(struct dsa_switch *ds, int port,
-- 
cgit v1.2.3


From b58b1f563ab78955d37e9e43e02790a85c66ac05 Mon Sep 17 00:00:00 2001
From: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Date: Mon, 14 Mar 2022 11:38:22 +0100
Subject: xfrm: rework default policy structure

This is a follow up of commit f8d858e607b2 ("xfrm: make user policy API
complete"). The goal is to align userland API to the internal structures.

Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Reviewed-by:  Antony Antony <antony.antony@secunet.com>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 include/net/netns/xfrm.h |  6 +-----
 include/net/xfrm.h       | 48 ++++++++++++++++++------------------------------
 2 files changed, 19 insertions(+), 35 deletions(-)

(limited to 'include/net')

diff --git a/include/net/netns/xfrm.h b/include/net/netns/xfrm.h
index 947733a639a6..bd7c3be4af5d 100644
--- a/include/net/netns/xfrm.h
+++ b/include/net/netns/xfrm.h
@@ -66,11 +66,7 @@ struct netns_xfrm {
 	int			sysctl_larval_drop;
 	u32			sysctl_acq_expires;
 
-	u8			policy_default;
-#define XFRM_POL_DEFAULT_IN	1
-#define XFRM_POL_DEFAULT_OUT	2
-#define XFRM_POL_DEFAULT_FWD	4
-#define XFRM_POL_DEFAULT_MASK	7
+	u8			policy_default[XFRM_POLICY_MAX];
 
 #ifdef CONFIG_SYSCTL
 	struct ctl_table_header	*sysctl_hdr;
diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index fdb41e8bb626..1541ca0cb13c 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -1081,25 +1081,18 @@ xfrm_state_addr_cmp(const struct xfrm_tmpl *tmpl, const struct xfrm_state *x, un
 }
 
 #ifdef CONFIG_XFRM
-static inline bool
-xfrm_default_allow(struct net *net, int dir)
-{
-	u8 def = net->xfrm.policy_default;
-
-	switch (dir) {
-	case XFRM_POLICY_IN:
-		return def & XFRM_POL_DEFAULT_IN ? false : true;
-	case XFRM_POLICY_OUT:
-		return def & XFRM_POL_DEFAULT_OUT ? false : true;
-	case XFRM_POLICY_FWD:
-		return def & XFRM_POL_DEFAULT_FWD ? false : true;
-	}
-	return false;
-}
-
 int __xfrm_policy_check(struct sock *, int dir, struct sk_buff *skb,
 			unsigned short family);
 
+static inline bool __xfrm_check_nopolicy(struct net *net, struct sk_buff *skb,
+					 int dir)
+{
+	if (!net->xfrm.policy_count[dir] && !secpath_exists(skb))
+		return net->xfrm.policy_default[dir] == XFRM_USERPOLICY_ACCEPT;
+
+	return false;
+}
+
 static inline int __xfrm_policy_check2(struct sock *sk, int dir,
 				       struct sk_buff *skb,
 				       unsigned int family, int reverse)
@@ -1110,13 +1103,9 @@ static inline int __xfrm_policy_check2(struct sock *sk, int dir,
 	if (sk && sk->sk_policy[XFRM_POLICY_IN])
 		return __xfrm_policy_check(sk, ndir, skb, family);
 
-	if (xfrm_default_allow(net, dir))
-		return (!net->xfrm.policy_count[dir] && !secpath_exists(skb)) ||
-		       (skb_dst(skb) && (skb_dst(skb)->flags & DST_NOPOLICY)) ||
-		       __xfrm_policy_check(sk, ndir, skb, family);
-	else
-		return (skb_dst(skb) && (skb_dst(skb)->flags & DST_NOPOLICY)) ||
-		       __xfrm_policy_check(sk, ndir, skb, family);
+	return __xfrm_check_nopolicy(net, skb, dir) ||
+	       (skb_dst(skb) && (skb_dst(skb)->flags & DST_NOPOLICY)) ||
+	       __xfrm_policy_check(sk, ndir, skb, family);
 }
 
 static inline int xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, unsigned short family)
@@ -1168,13 +1157,12 @@ static inline int xfrm_route_forward(struct sk_buff *skb, unsigned short family)
 {
 	struct net *net = dev_net(skb->dev);
 
-	if (xfrm_default_allow(net, XFRM_POLICY_OUT))
-		return !net->xfrm.policy_count[XFRM_POLICY_OUT] ||
-			(skb_dst(skb)->flags & DST_NOXFRM) ||
-			__xfrm_route_forward(skb, family);
-	else
-		return (skb_dst(skb)->flags & DST_NOXFRM) ||
-			__xfrm_route_forward(skb, family);
+	if (!net->xfrm.policy_count[XFRM_POLICY_OUT] &&
+	    net->xfrm.policy_default[XFRM_POLICY_OUT] == XFRM_USERPOLICY_ACCEPT)
+		return true;
+
+	return (skb_dst(skb)->flags & DST_NOXFRM) ||
+	       __xfrm_route_forward(skb, family);
 }
 
 static inline int xfrm4_route_forward(struct sk_buff *skb)
-- 
cgit v1.2.3


From 0eaecfb2e4814d51ab172df3823e35d7c488b6d2 Mon Sep 17 00:00:00 2001
From: Ismael Ferreras Morezuelas <swyterzone@gmail.com>
Date: Mon, 7 Mar 2022 21:04:44 +0100
Subject: Bluetooth: hci_sync: Add a new quirk to skip HCI_FLT_CLEAR_ALL

Some controllers have problems with being sent a command to clear
all filtering. While the HCI code does not unconditionally
send a clear-all anymore at BR/EDR setup (after the state machine
refactor), there might be more ways of hitting these codepaths
in the future as the kernel develops.

Cc: stable@vger.kernel.org
Cc: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Ismael Ferreras Morezuelas <swyterzone@gmail.com>
Reviewed-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 include/net/bluetooth/hci.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include/net')

diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h
index 35c073d44ec5..5cb095b09a94 100644
--- a/include/net/bluetooth/hci.h
+++ b/include/net/bluetooth/hci.h
@@ -255,6 +255,16 @@ enum {
 	 * during the hdev->setup vendor callback.
 	 */
 	HCI_QUIRK_BROKEN_READ_TRANSMIT_POWER,
+
+	/* When this quirk is set, HCI_OP_SET_EVENT_FLT requests with
+	 * HCI_FLT_CLEAR_ALL are ignored and event filtering is
+	 * completely avoided. A subset of the CSR controller
+	 * clones struggle with this and instantly lock up.
+	 *
+	 * Note that devices using this must (separately) disable
+	 * runtime suspend, because event filtering takes place there.
+	 */
+	HCI_QUIRK_BROKEN_FILTER_CLEAR_ALL,
 };
 
 /* HCI device flags */
-- 
cgit v1.2.3


From da8912176fb0ff9fd60e14fa653108d96422b896 Mon Sep 17 00:00:00 2001
From: Gavin Li <gavin@matician.com>
Date: Mon, 14 Mar 2022 15:42:52 -0700
Subject: Bluetooth: fix incorrect nonblock bitmask in bt_sock_wait_ready()

Callers pass msg->msg_flags as flags, which contains MSG_DONTWAIT
instead of O_NONBLOCK.

Signed-off-by: Gavin Li <gavin@matician.com>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 include/net/bluetooth/bluetooth.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h
index 2aa5e95808a5..6b48d9e2aab9 100644
--- a/include/net/bluetooth/bluetooth.h
+++ b/include/net/bluetooth/bluetooth.h
@@ -345,7 +345,7 @@ int  bt_sock_stream_recvmsg(struct socket *sock, struct msghdr *msg,
 __poll_t bt_sock_poll(struct file *file, struct socket *sock, poll_table *wait);
 int  bt_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
 int  bt_sock_wait_state(struct sock *sk, int state, unsigned long timeo);
-int  bt_sock_wait_ready(struct sock *sk, unsigned long flags);
+int  bt_sock_wait_ready(struct sock *sk, unsigned int msg_flags);
 
 void bt_accept_enqueue(struct sock *parent, struct sock *sk, bool bh);
 void bt_accept_unlink(struct sock *sk);
-- 
cgit v1.2.3


From 31d0bb9763efad30377505f3467f958d1ebe1e3d Mon Sep 17 00:00:00 2001
From: Phil Sutter <phil@nwl.cc>
Date: Wed, 2 Mar 2022 22:02:55 +0100
Subject: netfilter: conntrack: Add and use
 nf_ct_set_auto_assign_helper_warned()

The function sets the pernet boolean to avoid the spurious warning from
nf_ct_lookup_helper() when assigning conntrack helpers via nftables.

Fixes: 1a64edf54f55 ("netfilter: nft_ct: add helper set support")
Signed-off-by: Phil Sutter <phil@nwl.cc>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_conntrack_helper.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/net')

diff --git a/include/net/netfilter/nf_conntrack_helper.h b/include/net/netfilter/nf_conntrack_helper.h
index 37f0fbefb060..9939c366f720 100644
--- a/include/net/netfilter/nf_conntrack_helper.h
+++ b/include/net/netfilter/nf_conntrack_helper.h
@@ -177,4 +177,5 @@ void nf_nat_helper_unregister(struct nf_conntrack_nat_helper *nat);
 int nf_nat_helper_try_module_get(const char *name, u16 l3num,
 				 u8 protonum);
 void nf_nat_helper_put(struct nf_conntrack_helper *helper);
+void nf_ct_set_auto_assign_helper_warned(struct net *net);
 #endif /*_NF_CONNTRACK_HELPER_H*/
-- 
cgit v1.2.3


From b2d306542ff935a4edf7a88ba8145c108193442a Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Mon, 14 Mar 2022 18:23:00 +0100
Subject: netfilter: nf_tables: do not reduce read-only expressions

Skip register tracking for expressions that perform read-only operations
on the registers. Define and use a cookie pointer NFT_REDUCE_READONLY to
avoid defining stubs for these expressions.

This patch re-enables register tracking which was disabled in ed5f85d42290
("netfilter: nf_tables: disable register tracking"). Follow up patches
add remaining register tracking for existing expressions.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include/net')

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index c4c0861deac1..edabfb9e97ce 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -1633,4 +1633,12 @@ static inline struct nftables_pernet *nft_pernet(const struct net *net)
 	return net_generic(net, nf_tables_net_id);
 }
 
+#define __NFT_REDUCE_READONLY	1UL
+#define NFT_REDUCE_READONLY	(void *)__NFT_REDUCE_READONLY
+
+static inline bool nft_reduce_is_readonly(const struct nft_expr *expr)
+{
+	return expr->ops->reduce == NFT_REDUCE_READONLY;
+}
+
 #endif /* _NET_NF_TABLES_H */
-- 
cgit v1.2.3


From 34cc9e52884a16c62acbfb309863fb60e4c24f55 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Mon, 14 Mar 2022 18:23:01 +0100
Subject: netfilter: nf_tables: cancel tracking for clobbered destination
 registers

Output of expressions might be larger than one single register, this might
clobber existing data. Reset tracking for all destination registers that
required to store the expression output.

This patch adds three new helper functions:

- nft_reg_track_update: cancel previous register tracking and update it.
- nft_reg_track_cancel: cancel any previous register tracking info.
- __nft_reg_track_cancel: cancel only one single register tracking info.

Partial register clobbering detection is also supported by checking the
.num_reg field which describes the number of register that are used.

This patch updates the following expressions:

- meta_bridge
- bitwise
- byteorder
- meta
- payload

to use these helper functions.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h | 14 ++++++++++++++
 include/net/netfilter/nft_meta.h  |  1 +
 2 files changed, 15 insertions(+)

(limited to 'include/net')

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index edabfb9e97ce..20af9d3557b9 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -126,6 +126,7 @@ struct nft_regs_track {
 	struct {
 		const struct nft_expr		*selector;
 		const struct nft_expr		*bitwise;
+		u8				num_reg;
 	} regs[NFT_REG32_NUM];
 
 	const struct nft_expr			*cur;
@@ -1641,4 +1642,17 @@ static inline bool nft_reduce_is_readonly(const struct nft_expr *expr)
 	return expr->ops->reduce == NFT_REDUCE_READONLY;
 }
 
+void nft_reg_track_update(struct nft_regs_track *track,
+			  const struct nft_expr *expr, u8 dreg, u8 len);
+void nft_reg_track_cancel(struct nft_regs_track *track, u8 dreg, u8 len);
+void __nft_reg_track_cancel(struct nft_regs_track *track, u8 dreg);
+
+static inline bool nft_reg_track_cmp(struct nft_regs_track *track,
+				     const struct nft_expr *expr, u8 dreg)
+{
+	return track->regs[dreg].selector &&
+	       track->regs[dreg].selector->ops == expr->ops &&
+	       track->regs[dreg].num_reg == 0;
+}
+
 #endif /* _NET_NF_TABLES_H */
diff --git a/include/net/netfilter/nft_meta.h b/include/net/netfilter/nft_meta.h
index 2dce55c736f4..246fd023dcf4 100644
--- a/include/net/netfilter/nft_meta.h
+++ b/include/net/netfilter/nft_meta.h
@@ -6,6 +6,7 @@
 
 struct nft_meta {
 	enum nft_meta_keys	key:8;
+	u8			len;
 	union {
 		u8		dreg;
 		u8		sreg;
-- 
cgit v1.2.3


From aaa7b20bd4d637fd4ef0d72b6c828c061b9bc5f7 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Mon, 14 Mar 2022 18:23:04 +0100
Subject: netfilter: nft_meta: extend reduce support to bridge family

its enough to export the meta get reduce helper and then call it
from nft_meta_bridge too.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nft_meta.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/net')

diff --git a/include/net/netfilter/nft_meta.h b/include/net/netfilter/nft_meta.h
index 246fd023dcf4..9b51cc67de54 100644
--- a/include/net/netfilter/nft_meta.h
+++ b/include/net/netfilter/nft_meta.h
@@ -44,4 +44,6 @@ int nft_meta_set_validate(const struct nft_ctx *ctx,
 			  const struct nft_expr *expr,
 			  const struct nft_data **data);
 
+bool nft_meta_get_reduce(struct nft_regs_track *track,
+			 const struct nft_expr *expr);
 #endif
-- 
cgit v1.2.3


From 3c1eb413a45b6c6327fed394705081ec6202b31a Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Mon, 14 Mar 2022 18:23:12 +0100
Subject: netfilter: nft_fib: add reduce support

The fib expression stores to a register, so we can't add empty stub.
Check that the register that is being written is in fact redundant.

In most cases, this is expected to cancel tracking as re-use is
unlikely.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nft_fib.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/net')

diff --git a/include/net/netfilter/nft_fib.h b/include/net/netfilter/nft_fib.h
index 237f3757637e..eed099eae672 100644
--- a/include/net/netfilter/nft_fib.h
+++ b/include/net/netfilter/nft_fib.h
@@ -37,4 +37,7 @@ void nft_fib6_eval(const struct nft_expr *expr, struct nft_regs *regs,
 
 void nft_fib_store_result(void *reg, const struct nft_fib *priv,
 			  const struct net_device *dev);
+
+bool nft_fib_reduce(struct nft_regs_track *track,
+		    const struct nft_expr *expr);
 #endif
-- 
cgit v1.2.3


From 8879b32a3a809a183e6e2998725a0f33c4e42d41 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Fri, 18 Mar 2022 12:23:41 -0700
Subject: devlink: add explicitly locked flavor of the rate node APIs

We'll need an explicitly locked rate node API for netdevsim
to switch eswitch mode setting to locked.

Reviewed-by: Jiri Pirko <jiri@nvidia.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/devlink.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/net')

diff --git a/include/net/devlink.h b/include/net/devlink.h
index fd89a17adea1..a30180c0988a 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -1490,6 +1490,10 @@ int devl_port_register(struct devlink *devlink,
 		       unsigned int port_index);
 void devl_port_unregister(struct devlink_port *devlink_port);
 
+int devl_rate_leaf_create(struct devlink_port *port, void *priv);
+void devl_rate_leaf_destroy(struct devlink_port *devlink_port);
+void devl_rate_nodes_destroy(struct devlink *devlink);
+
 struct ib_device;
 
 struct net *devlink_net(const struct devlink *devlink);
-- 
cgit v1.2.3


From 054d5575cd6ed2792611a7cbb8c88663cc873780 Mon Sep 17 00:00:00 2001
From: Louis Peens <louis.peens@corigine.com>
Date: Wed, 23 Mar 2022 11:25:06 +0200
Subject: net/sched: fix incorrect vlan_push_eth dest field

Seems like a potential copy-paste bug slipped in here,
the second memcpy should of course be populating src
and not dest.

Fixes: ab95465cde23 ("net/sched: add vlan push_eth and pop_eth action to the hardware IR")
Signed-off-by: Louis Peens <louis.peens@corigine.com>
Link: https://lore.kernel.org/r/20220323092506.21639-1-louis.peens@corigine.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/tc_act/tc_vlan.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/tc_act/tc_vlan.h b/include/net/tc_act/tc_vlan.h
index a97600f742de..904eddfc1826 100644
--- a/include/net/tc_act/tc_vlan.h
+++ b/include/net/tc_act/tc_vlan.h
@@ -84,7 +84,7 @@ static inline void tcf_vlan_push_eth(unsigned char *src, unsigned char *dest,
 {
 	rcu_read_lock();
 	memcpy(dest, rcu_dereference(to_vlan(a)->vlan_p)->tcfv_push_dst, ETH_ALEN);
-	memcpy(dest, rcu_dereference(to_vlan(a)->vlan_p)->tcfv_push_src, ETH_ALEN);
+	memcpy(src, rcu_dereference(to_vlan(a)->vlan_p)->tcfv_push_src, ETH_ALEN);
 	rcu_read_unlock();
 }
 
-- 
cgit v1.2.3


From 4a9dda1c1da65beee994f0977a56a9a21c5db2a7 Mon Sep 17 00:00:00 2001
From: Matt Johnston <matt@codeconstruct.com.au>
Date: Fri, 1 Apr 2022 10:48:44 +0800
Subject: mctp: Use output netdev to allocate skb headroom

Previously the skb was allocated with headroom MCTP_HEADER_MAXLEN,
but that isn't sufficient if we are using devs that are not MCTP
specific.

This also adds a check that the smctp_halen provided to sendmsg for
extended addressing is the correct size for the netdev.

Fixes: 833ef3b91de6 ("mctp: Populate socket implementation")
Reported-by: Matthew Rinaldi <mjrinal@g.clemson.edu>
Signed-off-by: Matt Johnston <matt@codeconstruct.com.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/mctp.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/net')

diff --git a/include/net/mctp.h b/include/net/mctp.h
index d37268fe6825..82800d521c3d 100644
--- a/include/net/mctp.h
+++ b/include/net/mctp.h
@@ -36,8 +36,6 @@ struct mctp_hdr {
 #define MCTP_HDR_TAG_SHIFT	0
 #define MCTP_HDR_TAG_MASK	GENMASK(2, 0)
 
-#define MCTP_HEADER_MAXLEN	4
-
 #define MCTP_INITIAL_DEFAULT_NET	1
 
 static inline bool mctp_address_unicast(mctp_eid_t eid)
-- 
cgit v1.2.3


From 40570375356c874b1578e05c1dcc3ff7c1322dbe Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 5 Apr 2022 16:35:38 -0700
Subject: tcp: add accessors to read/set tp->snd_cwnd

We had various bugs over the years with code
breaking the assumption that tp->snd_cwnd is greater
than zero.

Lately, syzbot reported the WARN_ON_ONCE(!tp->prior_cwnd) added
in commit 8b8a321ff72c ("tcp: fix zero cwnd in tcp_cwnd_reduction")
can trigger, and without a repro we would have to spend
considerable time finding the bug.

Instead of complaining too late, we want to catch where
and when tp->snd_cwnd is set to an illegal value.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Suggested-by: Yuchung Cheng <ycheng@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Acked-by: Yuchung Cheng <ycheng@google.com>
Link: https://lore.kernel.org/r/20220405233538.947344-1-eric.dumazet@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/tcp.h | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

(limited to 'include/net')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 70ca4a5e330a..54aa2f02bb63 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1207,9 +1207,20 @@ static inline unsigned int tcp_packets_in_flight(const struct tcp_sock *tp)
 
 #define TCP_INFINITE_SSTHRESH	0x7fffffff
 
+static inline u32 tcp_snd_cwnd(const struct tcp_sock *tp)
+{
+	return tp->snd_cwnd;
+}
+
+static inline void tcp_snd_cwnd_set(struct tcp_sock *tp, u32 val)
+{
+	WARN_ON_ONCE((int)val <= 0);
+	tp->snd_cwnd = val;
+}
+
 static inline bool tcp_in_slow_start(const struct tcp_sock *tp)
 {
-	return tp->snd_cwnd < tp->snd_ssthresh;
+	return tcp_snd_cwnd(tp) < tp->snd_ssthresh;
 }
 
 static inline bool tcp_in_initial_slowstart(const struct tcp_sock *tp)
@@ -1235,8 +1246,8 @@ static inline __u32 tcp_current_ssthresh(const struct sock *sk)
 		return tp->snd_ssthresh;
 	else
 		return max(tp->snd_ssthresh,
-			   ((tp->snd_cwnd >> 1) +
-			    (tp->snd_cwnd >> 2)));
+			   ((tcp_snd_cwnd(tp) >> 1) +
+			    (tcp_snd_cwnd(tp) >> 2)));
 }
 
 /* Use define here intentionally to get WARN_ON location shown at the caller */
@@ -1278,7 +1289,7 @@ static inline bool tcp_is_cwnd_limited(const struct sock *sk)
 
 	/* If in slow start, ensure cwnd grows to twice what was ACKed. */
 	if (tcp_in_slow_start(tp))
-		return tp->snd_cwnd < 2 * tp->max_packets_out;
+		return tcp_snd_cwnd(tp) < 2 * tp->max_packets_out;
 
 	return tp->is_cwnd_limited;
 }
-- 
cgit v1.2.3


From 51454ea42c1ab4e0c2828bb0d4d53957976980de Mon Sep 17 00:00:00 2001
From: Niels Dossche <dossche.niels@gmail.com>
Date: Mon, 4 Apr 2022 01:15:24 +0200
Subject: ipv6: fix locking issues with loops over idev->addr_list

idev->addr_list needs to be protected by idev->lock. However, it is not
always possible to do so while iterating and performing actions on
inet6_ifaddr instances. For example, multiple functions (like
addrconf_{join,leave}_anycast) eventually call down to other functions
that acquire the idev->lock. The current code temporarily unlocked the
idev->lock during the loops, which can cause race conditions. Moving the
locks up is also not an appropriate solution as the ordering of lock
acquisition will be inconsistent with for example mc_lock.

This solution adds an additional field to inet6_ifaddr that is used
to temporarily add the instances to a temporary list while holding
idev->lock. The temporary list can then be traversed without holding
idev->lock. This change was done in two places. In addrconf_ifdown, the
list_for_each_entry_safe variant of the list loop is also no longer
necessary as there is no deletion within that specific loop.

Suggested-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Niels Dossche <dossche.niels@gmail.com>
Acked-by: Paolo Abeni <pabeni@redhat.com>
Link: https://lore.kernel.org/r/20220403231523.45843-1-dossche.niels@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/if_inet6.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include/net')

diff --git a/include/net/if_inet6.h b/include/net/if_inet6.h
index 4cfdef6ca4f6..c8490729b4ae 100644
--- a/include/net/if_inet6.h
+++ b/include/net/if_inet6.h
@@ -64,6 +64,14 @@ struct inet6_ifaddr {
 
 	struct hlist_node	addr_lst;
 	struct list_head	if_list;
+	/*
+	 * Used to safely traverse idev->addr_list in process context
+	 * if the idev->lock needed to protect idev->addr_list cannot be held.
+	 * In that case, add the items to this list temporarily and iterate
+	 * without holding idev->lock.
+	 * See addrconf_ifdown and dev_forward_change.
+	 */
+	struct list_head	if_list_aux;
 
 	struct list_head	tmp_list;
 	struct inet6_ifaddr	*ifpub;
-- 
cgit v1.2.3


From 15fcdf6ae116d1e8da4ff76c6fd82514f5ea501b Mon Sep 17 00:00:00 2001
From: Ping Gan <jacky_gam_2001@163.com>
Date: Wed, 6 Apr 2022 09:09:56 +0800
Subject: tcp: Add tracepoint for tcp_set_ca_state

The congestion status of a tcp flow may be updated since there
is congestion between tcp sender and receiver. It makes sense to
add tracepoint for congestion status set function to summate cc
status duration and evaluate the performance of network
and congestion algorithm. the backgound of this patch is below.

Link: https://github.com/iovisor/bcc/pull/3899

Signed-off-by: Ping Gan <jacky_gam_2001@163.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Link: https://lore.kernel.org/r/20220406010956.19656-1-jacky_gam_2001@163.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/tcp.h | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

(limited to 'include/net')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 54aa2f02bb63..6d50a662bf89 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1139,15 +1139,6 @@ static inline bool tcp_ca_needs_ecn(const struct sock *sk)
 	return icsk->icsk_ca_ops->flags & TCP_CONG_NEEDS_ECN;
 }
 
-static inline void tcp_set_ca_state(struct sock *sk, const u8 ca_state)
-{
-	struct inet_connection_sock *icsk = inet_csk(sk);
-
-	if (icsk->icsk_ca_ops->set_state)
-		icsk->icsk_ca_ops->set_state(sk, ca_state);
-	icsk->icsk_ca_state = ca_state;
-}
-
 static inline void tcp_ca_event(struct sock *sk, const enum tcp_ca_event event)
 {
 	const struct inet_connection_sock *icsk = inet_csk(sk);
@@ -1156,6 +1147,9 @@ static inline void tcp_ca_event(struct sock *sk, const enum tcp_ca_event event)
 		icsk->icsk_ca_ops->cwnd_event(sk, event);
 }
 
+/* From tcp_cong.c */
+void tcp_set_ca_state(struct sock *sk, const u8 ca_state);
+
 /* From tcp_rate.c */
 void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb);
 void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb,
-- 
cgit v1.2.3


From 9027ce0b071a1bbd046682907fc2e23ca3592883 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Wed, 23 Mar 2022 14:22:01 +0100
Subject: netfilter: ecache: move to separate structure

This makes it easier for a followup patch to only expose ecache
related parts of nf_conntrack_net structure.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_conntrack.h | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'include/net')

diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
index b08b70989d2c..69e6c6a218be 100644
--- a/include/net/netfilter/nf_conntrack.h
+++ b/include/net/netfilter/nf_conntrack.h
@@ -43,6 +43,11 @@ union nf_conntrack_expect_proto {
 	/* insert expect proto private data here */
 };
 
+struct nf_conntrack_net_ecache {
+	struct delayed_work dwork;
+	struct netns_ct *ct_net;
+};
+
 struct nf_conntrack_net {
 	/* only used when new connection is allocated: */
 	atomic_t count;
@@ -58,8 +63,7 @@ struct nf_conntrack_net {
 	struct ctl_table_header	*sysctl_header;
 #endif
 #ifdef CONFIG_NF_CONNTRACK_EVENTS
-	struct delayed_work ecache_dwork;
-	struct netns_ct *ct_net;
+	struct nf_conntrack_net_ecache ecache;
 #endif
 };
 
-- 
cgit v1.2.3


From c3f6bb74137c68b515b7e2ff123a80611e801013 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Thu, 7 Apr 2022 20:38:16 -0700
Subject: tls: rx: don't store the record type in socket context

Original TLS implementation was handling one record at a time.
It stashed the type of the record inside tls context (per socket
structure) for convenience. When async crypto support was added
[1] the author had to use skb->cb to store the type per-message.

The use of skb->cb overlaps with strparser, however, so a hybrid
approach was taken where type is stored in context while parsing
(since we parse a message at a time) but once parsed its copied
to skb->cb.

Recently a workaround for sockmaps [2] exposed the previously
private struct _strp_msg and started a trend of adding user
fields directly in strparser's header. This is cleaner than
storing information about an skb in the context.

This change is not strictly necessary, but IMHO the ownership
of the context field is confusing. Information naturally
belongs to the skb.

[1] commit 94524d8fc965 ("net/tls: Add support for async decryption of tls records")
[2] commit b2c4618162ec ("bpf, sockmap: sk_skb data_end access incorrect when src_reg = dst_reg")

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/strparser.h |  3 +++
 include/net/tls.h       | 10 +++-------
 2 files changed, 6 insertions(+), 7 deletions(-)

(limited to 'include/net')

diff --git a/include/net/strparser.h b/include/net/strparser.h
index 732b7097d78e..c271543076cf 100644
--- a/include/net/strparser.h
+++ b/include/net/strparser.h
@@ -70,6 +70,9 @@ struct sk_skb_cb {
 	 * when dst_reg == src_reg.
 	 */
 	u64 temp_reg;
+	struct tls_msg {
+		u8 control;
+	} tls;
 };
 
 static inline struct strp_msg *strp_msg(struct sk_buff *skb)
diff --git a/include/net/tls.h b/include/net/tls.h
index b6968a5b5538..c3717cd1f1cd 100644
--- a/include/net/tls.h
+++ b/include/net/tls.h
@@ -117,11 +117,6 @@ struct tls_rec {
 	u8 aead_req_ctx[];
 };
 
-struct tls_msg {
-	struct strp_msg rxm;
-	u8 control;
-};
-
 struct tx_work {
 	struct delayed_work work;
 	struct sock *sk;
@@ -152,7 +147,6 @@ struct tls_sw_context_rx {
 	void (*saved_data_ready)(struct sock *sk);
 
 	struct sk_buff *recv_pkt;
-	u8 control;
 	u8 async_capable:1;
 	u8 decrypted:1;
 	atomic_t decrypt_pending;
@@ -411,7 +405,9 @@ void tls_free_partial_record(struct sock *sk, struct tls_context *ctx);
 
 static inline struct tls_msg *tls_msg(struct sk_buff *skb)
 {
-	return (struct tls_msg *)strp_msg(skb);
+	struct sk_skb_cb *scb = (struct sk_skb_cb *)skb->cb;
+
+	return &scb->tls;
 }
 
 static inline bool tls_is_partially_sent_record(struct tls_context *ctx)
-- 
cgit v1.2.3


From 7dc59c33d62c4520a119051d4486c214ef5caa23 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Thu, 7 Apr 2022 20:38:17 -0700
Subject: tls: rx: don't store the decryption status in socket context

Similar justification to previous change, the information
about decryption status belongs in the skb.

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/strparser.h | 1 +
 include/net/tls.h       | 1 -
 2 files changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/strparser.h b/include/net/strparser.h
index c271543076cf..a191486eb1e4 100644
--- a/include/net/strparser.h
+++ b/include/net/strparser.h
@@ -72,6 +72,7 @@ struct sk_skb_cb {
 	u64 temp_reg;
 	struct tls_msg {
 		u8 control;
+		u8 decrypted;
 	} tls;
 };
 
diff --git a/include/net/tls.h b/include/net/tls.h
index c3717cd1f1cd..f040edc97c50 100644
--- a/include/net/tls.h
+++ b/include/net/tls.h
@@ -148,7 +148,6 @@ struct tls_sw_context_rx {
 
 	struct sk_buff *recv_pkt;
 	u8 async_capable:1;
-	u8 decrypted:1;
 	atomic_t decrypt_pending;
 	/* protect crypto_wait with decrypt_pending*/
 	spinlock_t decrypt_compl_lock;
-- 
cgit v1.2.3


From a8340cc02beed4ffbb5e7b1b0eadca445323fc6a Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Thu, 7 Apr 2022 20:38:19 -0700
Subject: tls: rx: use a define for tag length

TLS 1.3 has to strip padding, and it starts out 16 bytes
from the end of the record. Make it clear this is because
of the auth tag.

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tls.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/net')

diff --git a/include/net/tls.h b/include/net/tls.h
index f040edc97c50..a01c264e5f15 100644
--- a/include/net/tls.h
+++ b/include/net/tls.h
@@ -64,6 +64,7 @@
 #define TLS_AAD_SPACE_SIZE		13
 
 #define MAX_IV_SIZE			16
+#define TLS_TAG_SIZE			16
 #define TLS_MAX_REC_SEQ_SIZE		8
 
 /* For CCM mode, the full 16-bytes of IV is made of '4' fields of given sizes.
-- 
cgit v1.2.3


From 2105f700b53c24aa48b65c15652acc386044d26a Mon Sep 17 00:00:00 2001
From: Vlad Buslov <vladbu@nvidia.com>
Date: Wed, 6 Apr 2022 14:22:41 +0300
Subject: net/sched: flower: fix parsing of ethertype following VLAN header

A tc flower filter matching TCA_FLOWER_KEY_VLAN_ETH_TYPE is expected to
match the L2 ethertype following the first VLAN header, as confirmed by
linked discussion with the maintainer. However, such rule also matches
packets that have additional second VLAN header, even though filter has
both eth_type and vlan_ethtype set to "ipv4". Looking at the code this
seems to be mostly an artifact of the way flower uses flow dissector.
First, even though looking at the uAPI eth_type and vlan_ethtype appear
like a distinct fields, in flower they are all mapped to the same
key->basic.n_proto. Second, flow dissector skips following VLAN header as
no keys for FLOW_DISSECTOR_KEY_CVLAN are set and eventually assigns the
value of n_proto to last parsed header. With these, such filters ignore any
headers present between first VLAN header and first "non magic"
header (ipv4 in this case) that doesn't result
FLOW_DISSECT_RET_PROTO_AGAIN.

Fix the issue by extending flow dissector VLAN key structure with new
'vlan_eth_type' field that matches first ethertype following previously
parsed VLAN header. Modify flower classifier to set the new
flow_dissector_key_vlan->vlan_eth_type with value obtained from
TCA_FLOWER_KEY_VLAN_ETH_TYPE/TCA_FLOWER_KEY_CVLAN_ETH_TYPE uAPIs.

Link: https://lore.kernel.org/all/Yjhgi48BpTGh6dig@nanopsycho/
Fixes: 9399ae9a6cb2 ("net_sched: flower: Add vlan support")
Fixes: d64efd0926ba ("net/sched: flower: Add supprt for matching on QinQ vlan headers")
Signed-off-by: Vlad Buslov <vladbu@nvidia.com>
Reviewed-by: Jiri Pirko <jiri@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/flow_dissector.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/net')

diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h
index aa33e1092e2c..9f65f1bfbd24 100644
--- a/include/net/flow_dissector.h
+++ b/include/net/flow_dissector.h
@@ -59,6 +59,8 @@ struct flow_dissector_key_vlan {
 		__be16	vlan_tci;
 	};
 	__be16	vlan_tpid;
+	__be16	vlan_eth_type;
+	u16	padding;
 };
 
 struct flow_dissector_mpls_lse {
-- 
cgit v1.2.3


From c2ccf84ecb715bb81dc7f51e69d680a95bf055ae Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@nvidia.com>
Date: Thu, 7 Apr 2022 10:35:22 +0300
Subject: net/sched: act_api: Add extack to offload_act_setup() callback

The callback is used by various actions to populate the flow action
structure prior to offload. Pass extack to this callback so that the
various actions will be able to report accurate error messages to user
space.

Signed-off-by: Ido Schimmel <idosch@nvidia.com>
Reviewed-by: Petr Machata <petrm@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/act_api.h | 3 ++-
 include/net/pkt_cls.h | 6 ++++--
 2 files changed, 6 insertions(+), 3 deletions(-)

(limited to 'include/net')

diff --git a/include/net/act_api.h b/include/net/act_api.h
index 3049cb69c025..9cf6870b526e 100644
--- a/include/net/act_api.h
+++ b/include/net/act_api.h
@@ -134,7 +134,8 @@ struct tc_action_ops {
 	(*get_psample_group)(const struct tc_action *a,
 			     tc_action_priv_destructor *destructor);
 	int     (*offload_act_setup)(struct tc_action *act, void *entry_data,
-				     u32 *index_inc, bool bind);
+				     u32 *index_inc, bool bind,
+				     struct netlink_ext_ack *extack);
 };
 
 struct tc_action_net {
diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index a3b57a93228a..8cf001aed858 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -547,10 +547,12 @@ tcf_match_indev(struct sk_buff *skb, int ifindex)
 }
 
 int tc_setup_offload_action(struct flow_action *flow_action,
-			    const struct tcf_exts *exts);
+			    const struct tcf_exts *exts,
+			    struct netlink_ext_ack *extack);
 void tc_cleanup_offload_action(struct flow_action *flow_action);
 int tc_setup_action(struct flow_action *flow_action,
-		    struct tc_action *actions[]);
+		    struct tc_action *actions[],
+		    struct netlink_ext_ack *extack);
 
 int tc_setup_cb_call(struct tcf_block *block, enum tc_setup_type type,
 		     void *type_data, bool err_stop, bool rtnl_held);
-- 
cgit v1.2.3


From 69642c2ab2f553fd8c4c121fcdf3b3054c727e86 Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@nvidia.com>
Date: Thu, 7 Apr 2022 10:35:23 +0300
Subject: net/sched: act_gact: Add extack messages for offload failure

For better error reporting to user space, add extack messages when gact
action offload fails.

Example:

 # echo 1 > /sys/kernel/tracing/events/netlink/netlink_extack/enable

 # tc filter add dev dummy0 ingress pref 1 proto all matchall skip_sw action continue
 Error: cls_matchall: Failed to setup flow action.
 We have an error talking to the kernel

 # cat /sys/kernel/tracing/trace_pipe
       tc-181     [002] b..1.   105.493450: netlink_extack: msg=act_gact: Offload of "continue" action is not supported
       tc-181     [002] .....   105.493466: netlink_extack: msg=cls_matchall: Failed to setup flow action

 # tc filter add dev dummy0 ingress pref 1 proto all matchall skip_sw action reclassify
 Error: cls_matchall: Failed to setup flow action.
 We have an error talking to the kernel

 # cat /sys/kernel/tracing/trace_pipe
       tc-183     [002] b..1.   124.126477: netlink_extack: msg=act_gact: Offload of "reclassify" action is not supported
       tc-183     [002] .....   124.126489: netlink_extack: msg=cls_matchall: Failed to setup flow action

 # tc filter add dev dummy0 ingress pref 1 proto all matchall skip_sw action pipe action drop
 Error: cls_matchall: Failed to setup flow action.
 We have an error talking to the kernel

 # cat /sys/kernel/tracing/trace_pipe
       tc-185     [002] b..1.   137.097791: netlink_extack: msg=act_gact: Offload of "pipe" action is not supported
       tc-185     [002] .....   137.097804: netlink_extack: msg=cls_matchall: Failed to setup flow action

Signed-off-by: Ido Schimmel <idosch@nvidia.com>
Reviewed-by: Petr Machata <petrm@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tc_act/tc_gact.h | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

(limited to 'include/net')

diff --git a/include/net/tc_act/tc_gact.h b/include/net/tc_act/tc_gact.h
index eb8f01c819e6..832efd40e023 100644
--- a/include/net/tc_act/tc_gact.h
+++ b/include/net/tc_act/tc_gact.h
@@ -59,4 +59,19 @@ static inline u32 tcf_gact_goto_chain_index(const struct tc_action *a)
 	return READ_ONCE(a->tcfa_action) & TC_ACT_EXT_VAL_MASK;
 }
 
+static inline bool is_tcf_gact_continue(const struct tc_action *a)
+{
+	return __is_tcf_gact_act(a, TC_ACT_UNSPEC, false);
+}
+
+static inline bool is_tcf_gact_reclassify(const struct tc_action *a)
+{
+	return __is_tcf_gact_act(a, TC_ACT_RECLASSIFY, false);
+}
+
+static inline bool is_tcf_gact_pipe(const struct tc_action *a)
+{
+	return __is_tcf_gact_act(a, TC_ACT_PIPE, false);
+}
+
 #endif /* __NET_TC_GACT_H */
-- 
cgit v1.2.3


From a9c64939b669b3c83cc54738d1986430e6beaff2 Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@nvidia.com>
Date: Thu, 7 Apr 2022 10:35:28 +0300
Subject: net/sched: act_skbedit: Add extack messages for offload failure

For better error reporting to user space, add extack messages when
skbedit action offload fails.

Example:

 # echo 1 > /sys/kernel/tracing/events/netlink/netlink_extack/enable

 # tc filter add dev dummy0 ingress pref 1 proto all matchall skip_sw action skbedit queue_mapping 1234
 Error: cls_matchall: Failed to setup flow action.
 We have an error talking to the kernel

 # cat /sys/kernel/tracing/trace_pipe
       tc-185     [002] b..1.    31.802414: netlink_extack: msg=act_skbedit: Offload not supported when "queue_mapping" option is used
       tc-185     [002] .....    31.802418: netlink_extack: msg=cls_matchall: Failed to setup flow action

 # tc filter add dev dummy0 ingress pref 1 proto all matchall skip_sw action skbedit inheritdsfield
 Error: cls_matchall: Failed to setup flow action.
 We have an error talking to the kernel

 # cat /sys/kernel/tracing/trace_pipe
       tc-187     [002] b..1.    45.985145: netlink_extack: msg=act_skbedit: Offload not supported when "inheritdsfield" option is used
       tc-187     [002] .....    45.985160: netlink_extack: msg=cls_matchall: Failed to setup flow action

Signed-off-by: Ido Schimmel <idosch@nvidia.com>
Reviewed-by: Petr Machata <petrm@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tc_act/tc_skbedit.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'include/net')

diff --git a/include/net/tc_act/tc_skbedit.h b/include/net/tc_act/tc_skbedit.h
index 00bfee70609e..cab8229b9bed 100644
--- a/include/net/tc_act/tc_skbedit.h
+++ b/include/net/tc_act/tc_skbedit.h
@@ -94,4 +94,16 @@ static inline u32 tcf_skbedit_priority(const struct tc_action *a)
 	return priority;
 }
 
+/* Return true iff action is queue_mapping */
+static inline bool is_tcf_skbedit_queue_mapping(const struct tc_action *a)
+{
+	return is_tcf_skbedit_with_flag(a, SKBEDIT_F_QUEUE_MAPPING);
+}
+
+/* Return true iff action is inheritdsfield */
+static inline bool is_tcf_skbedit_inheritdsfield(const struct tc_action *a)
+{
+	return is_tcf_skbedit_with_flag(a, SKBEDIT_F_INHERITDSFIELD);
+}
+
 #endif /* __NET_TC_SKBEDIT_H */
-- 
cgit v1.2.3


From 37943f047bfb88ba4dfc7a522563f57c86d088a0 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Fri, 8 Apr 2022 11:31:27 -0700
Subject: tls: rx: simplify async wait

Since we are protected from async completions by decrypt_compl_lock
we can drop the async_notify and reinit the completion before we
start waiting.

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tls.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/tls.h b/include/net/tls.h
index a01c264e5f15..6fe78361c8c8 100644
--- a/include/net/tls.h
+++ b/include/net/tls.h
@@ -152,7 +152,6 @@ struct tls_sw_context_rx {
 	atomic_t decrypt_pending;
 	/* protect crypto_wait with decrypt_pending*/
 	spinlock_t decrypt_compl_lock;
-	bool async_notify;
 };
 
 struct tls_record_info {
-- 
cgit v1.2.3


From c1b8a56755ee121f9fa374de9eec36c91a7da25f Mon Sep 17 00:00:00 2001
From: Menglong Dong <imagedong@tencent.com>
Date: Thu, 7 Apr 2022 14:20:49 +0800
Subject: net: sock: introduce sock_queue_rcv_skb_reason()

In order to report the reasons of skb drops in 'sock_queue_rcv_skb()',
introduce the function 'sock_queue_rcv_skb_reason()'.

As the return value of 'sock_queue_rcv_skb()' is used as the error code,
we can't make it as drop reason and have to pass extra output argument.
'sock_queue_rcv_skb()' is used in many places, so we can't change it
directly.

Introduce the new function 'sock_queue_rcv_skb_reason()' and make
'sock_queue_rcv_skb()' an inline call to it.

Reviewed-by: Hao Peng <flyingpeng@tencent.com>
Reviewed-by: Jiang Biao <benbjiang@tencent.com>
Signed-off-by: Menglong Dong <imagedong@tencent.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/sock.h b/include/net/sock.h
index c4b91fc19b9c..1a988e605f09 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -2392,7 +2392,14 @@ int __sk_queue_drop_skb(struct sock *sk, struct sk_buff_head *sk_queue,
 			void (*destructor)(struct sock *sk,
 					   struct sk_buff *skb));
 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);
-int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);
+
+int sock_queue_rcv_skb_reason(struct sock *sk, struct sk_buff *skb,
+			      enum skb_drop_reason *reason);
+
+static inline int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
+{
+	return sock_queue_rcv_skb_reason(sk, skb, NULL);
+}
 
 int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb);
 struct sk_buff *sock_dequeue_err_skb(struct sock *sk);
-- 
cgit v1.2.3


From b384c95a861eebf47e88695cf6a29f34e0b10b0f Mon Sep 17 00:00:00 2001
From: Menglong Dong <imagedong@tencent.com>
Date: Thu, 7 Apr 2022 14:20:52 +0800
Subject: net: icmp: add skb drop reasons to icmp protocol

Replace kfree_skb() used in icmp_rcv() and icmpv6_rcv() with
kfree_skb_reason().

In order to get the reasons of the skb drops after icmp message handle,
we change the return type of 'handler()' in 'struct icmp_control' from
'bool' to 'enum skb_drop_reason'. This may change its original
intention, as 'false' means failure, but 'SKB_NOT_DROPPED_YET' means
success now. Therefore, all 'handler' and the call of them need to be
handled. Following 'handler' functions are involved:

icmp_unreach()
icmp_redirect()
icmp_echo()
icmp_timestamp()
icmp_discard()

And following new drop reasons are added:

SKB_DROP_REASON_ICMP_CSUM
SKB_DROP_REASON_INVALID_PROTO

The reason 'INVALID_PROTO' is introduced for the case that the packet
doesn't follow rfc 1122 and is dropped. This is not a common case, and
I believe we can locate the problem from the data in the packet. For now,
this 'INVALID_PROTO' is used for the icmp broadcasts with wrong types.

Maybe there should be a document file for these reasons. For example,
list all the case that causes the 'UNHANDLED_PROTO' and 'INVALID_PROTO'
drop reason. Therefore, users can locate their problems according to the
document.

Reviewed-by: Hao Peng <flyingpeng@tencent.com>
Reviewed-by: Jiang Biao <benbjiang@tencent.com>
Signed-off-by: Menglong Dong <imagedong@tencent.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ping.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/ping.h b/include/net/ping.h
index 2fe78874318c..b68fbfdb606f 100644
--- a/include/net/ping.h
+++ b/include/net/ping.h
@@ -76,7 +76,7 @@ int  ping_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock,
 int  ping_common_sendmsg(int family, struct msghdr *msg, size_t len,
 			 void *user_icmph, size_t icmph_len);
 int  ping_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);
-bool ping_rcv(struct sk_buff *skb);
+enum skb_drop_reason ping_rcv(struct sk_buff *skb);
 
 #ifdef CONFIG_PROC_FS
 void *ping_seq_start(struct seq_file *seq, loff_t *pos, sa_family_t family);
-- 
cgit v1.2.3


From e5c95ca094cfe59a4013d711f87ff034cba30f80 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= <toke@toke.dk>
Date: Mon, 4 Apr 2022 23:01:08 +0200
Subject: mac80211: Improve confusing comment around tx_info clearing
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The comment above the ieee80211_tx_info_clear_status() helper was somewhat
confusing as to which fields it was or wasn't clearing. So replace it by
something that is hopefully more, well, clear.

Signed-off-by: Toke Høiland-Jørgensen <toke@toke.dk>
Link: https://lore.kernel.org/r/20220404210108.2684907-1-toke@toke.dk
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/net')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 382ebb862ea8..db992f71604d 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -1201,9 +1201,9 @@ static inline struct ieee80211_rx_status *IEEE80211_SKB_RXCB(struct sk_buff *skb
  * in the TX status but the rate control information (it does clear
  * the count since you need to fill that in anyway).
  *
- * NOTE: You can only use this function if you do NOT use
- *	 info->driver_data! Use info->rate_driver_data
- *	 instead if you need only the less space that allows.
+ * NOTE: While the rates array is kept intact, this will wipe all of the
+ *	 driver_data fields in info, so it's up to the driver to restore
+ *	 any fields it needs after calling this helper.
  */
 static inline void
 ieee80211_tx_info_clear_status(struct ieee80211_tx_info *info)
-- 
cgit v1.2.3


From 6d945a33f2b0aa24fc210dadaa0af3e8218e7002 Mon Sep 17 00:00:00 2001
From: Lorenzo Bianconi <lorenzo@kernel.org>
Date: Fri, 25 Mar 2022 11:42:41 +0100
Subject: mac80211: introduce BSS color collision detection

Add ieee80211_rx_check_bss_color_collision routine in order to introduce
BSS color collision detection in mac80211 if it is not supported in HW/FW
(e.g. for mt7915 chipset).
Add IEEE80211_HW_DETECTS_COLOR_COLLISION flag to let the driver notify
BSS color collision detection is supported in HW/FW. Set this for ath11k
which apparently didn't need this code.

Tested-by: Peter Chiu <Chui-Hao.Chiu@mediatek.com>
Co-developed-by: Ryder Lee <ryder.lee@mediatek.com>
Signed-off-by: Ryder Lee <ryder.lee@mediatek.com>
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Link: https://lore.kernel.org/r/a05eeeb1841a84560dc5aaec77894fcb69a54f27.1648204871.git.lorenzo@kernel.org
[clarify commit message a bit, move flag to mac80211]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/net')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index db992f71604d..0943b2626f4d 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -2434,6 +2434,9 @@ struct ieee80211_txq {
  *	usage and 802.11 frames with %RX_FLAG_ONLY_MONITOR set for monitor to
  *	the stack.
  *
+ * @IEEE80211_HW_DETECTS_COLOR_COLLISION: HW/driver has support for BSS color
+ *	collision detection and doesn't need it in software.
+ *
  * @NUM_IEEE80211_HW_FLAGS: number of hardware flags, used for sizing arrays
  */
 enum ieee80211_hw_flags {
@@ -2489,6 +2492,7 @@ enum ieee80211_hw_flags {
 	IEEE80211_HW_SUPPORTS_TX_ENCAP_OFFLOAD,
 	IEEE80211_HW_SUPPORTS_RX_DECAP_OFFLOAD,
 	IEEE80211_HW_SUPPORTS_CONC_MON_RX_DECAP,
+	IEEE80211_HW_DETECTS_COLOR_COLLISION,
 
 	/* keep last, obviously */
 	NUM_IEEE80211_HW_FLAGS
-- 
cgit v1.2.3


From 046d2e7c50e3087a32a85fd384c21f896dbccf83 Mon Sep 17 00:00:00 2001
From: Sriram R <quic_srirrama@quicinc.com>
Date: Mon, 4 Apr 2022 21:11:23 +0530
Subject: mac80211: prepare sta handling for MLO support

Currently in mac80211 each STA object is represented
using sta_info datastructure with the associated
STA specific information and drivers access ieee80211_sta
part of it.

With MLO (Multi Link Operation) support being added
in 802.11be standard, though the association is logically
with a single Multi Link capable STA, at the physical level
communication can happen via different advertised
links (uniquely identified by Channel, operating class,
BSSID) and hence the need to handle multiple link
STA parameters within a composite sta_info object
called the MLD STA. The different link STA part of
MLD STA are identified using the link address which can
be same or different as the MLD STA address and unique
link id based on the link vif.

To support extension of such a model, the sta_info
datastructure is modified to hold multiple link STA
objects with link specific params currently within
sta_info moved to this new structure. Similarly this is
done for ieee80211_sta as well which will be accessed
within mac80211 as well as by drivers, hence trivial
driver changes are expected to support this.

For current non MLO supported drivers, only one link STA
is present and link information is accessed via 'deflink'
member.

For MLO drivers, we still need to define the APIs etc. to
get the correct link ID and access the correct part of
the station info.

Currently in mac80211, all link STA info are accessed directly
via deflink. These will be updated to access via link pointers
indexed by link id with MLO support patches, with link id
being 0 for non MLO supported cases.

Except for couple of macro related changes, below spatch takes
care of updating mac80211 and driver code to access to the
link STA info via deflink.

  @ieee80211_sta@
  struct ieee80211_sta *s;
  struct sta_info *si;
  identifier var = {supp_rates, ht_cap, vht_cap, he_cap, he_6ghz_capa, eht_cap, rx_nss, bandwidth, txpwr};
  @@

  (
    s->
  -    var
  +    deflink.var
  |
   si->sta.
  -    var
  +    deflink.var
  )

  @sta_info@
  struct sta_info *si;
  identifier var = {gtk, pcpu_rx_stats, rx_stats, rx_stats_avg, status_stats, tx_stats, cur_max_bandwidth};
  @@

  (
    si->
  -    var
  +    deflink.var
  )

Signed-off-by: Sriram R <quic_srirrama@quicinc.com>
Link: https://lore.kernel.org/r/1649086883-13246-1-git-send-email-quic_srirrama@quicinc.com
[remove MLO-drivers notes from commit message, not clear yet; run spatch]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h | 80 ++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 58 insertions(+), 22 deletions(-)

(limited to 'include/net')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 0943b2626f4d..75880fc70700 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -2056,6 +2056,45 @@ struct ieee80211_sta_txpwr {
 	enum nl80211_tx_power_setting type;
 };
 
+#define MAX_STA_LINKS			15
+
+/**
+ * struct ieee80211_link_sta - station Link specific info
+ * All link specific info for a STA link for a non MLD STA(single)
+ * or a MLD STA(multiple entries) are stored here.
+ *
+ * @addr: MAC address of the Link STA. For non-MLO STA this is same as the addr
+ *	in ieee80211_sta. For MLO Link STA this addr can be same or different
+ *	from addr in ieee80211_sta (representing MLD STA addr)
+ * @supp_rates: Bitmap of supported rates
+ * @ht_cap: HT capabilities of this STA; restricted to our own capabilities
+ * @vht_cap: VHT capabilities of this STA; restricted to our own capabilities
+ * @he_cap: HE capabilities of this STA
+ * @he_6ghz_capa: on 6 GHz, holds the HE 6 GHz band capabilities
+ * @eht_cap: EHT capabilities of this STA
+ * @bandwidth: current bandwidth the station can receive with
+ * @rx_nss: in HT/VHT, the maximum number of spatial streams the
+ *	station can receive at the moment, changed by operating mode
+ *	notifications and capabilities. The value is only valid after
+ *	the station moves to associated state.
+ * @txpwr: the station tx power configuration
+ *
+ */
+struct ieee80211_link_sta {
+	u8 addr[ETH_ALEN];
+
+	u32 supp_rates[NUM_NL80211_BANDS];
+	struct ieee80211_sta_ht_cap ht_cap;
+	struct ieee80211_sta_vht_cap vht_cap;
+	struct ieee80211_sta_he_cap he_cap;
+	struct ieee80211_he_6ghz_capa he_6ghz_capa;
+	struct ieee80211_sta_eht_cap eht_cap;
+
+	u8 rx_nss;
+	enum ieee80211_sta_rx_bandwidth bandwidth;
+	struct ieee80211_sta_txpwr txpwr;
+};
+
 /**
  * struct ieee80211_sta - station table entry
  *
@@ -2065,15 +2104,11 @@ struct ieee80211_sta_txpwr {
  * either be protected by rcu_read_lock() explicitly or implicitly,
  * or you must take good care to not use such a pointer after a
  * call to your sta_remove callback that removed it.
+ * This also represents the MLD STA in case of MLO association
+ * and holds pointers to various link STA's
  *
  * @addr: MAC address
  * @aid: AID we assigned to the station if we're an AP
- * @supp_rates: Bitmap of supported rates (per band)
- * @ht_cap: HT capabilities of this STA; restricted to our own capabilities
- * @vht_cap: VHT capabilities of this STA; restricted to our own capabilities
- * @he_cap: HE capabilities of this STA
- * @he_6ghz_capa: on 6 GHz, holds the HE 6 GHz band capabilities
- * @eht_cap: EHT capabilities of this STA
  * @max_rx_aggregation_subframes: maximal amount of frames in a single AMPDU
  *	that this station is allowed to transmit to us.
  *	Can be modified by driver.
@@ -2085,11 +2120,6 @@ struct ieee80211_sta_txpwr {
  *	if wme is supported. The bits order is like in
  *	IEEE80211_WMM_IE_STA_QOSINFO_AC_*.
  * @max_sp: max Service Period. Only valid if wme is supported.
- * @bandwidth: current bandwidth the station can receive with
- * @rx_nss: in HT/VHT, the maximum number of spatial streams the
- *	station can receive at the moment, changed by operating mode
- *	notifications and capabilities. The value is only valid after
- *	the station moves to associated state.
  * @smps_mode: current SMPS mode (off, static or dynamic)
  * @rates: rate control selection table
  * @tdls: indicates whether the STA is a TDLS peer
@@ -2102,25 +2132,28 @@ struct ieee80211_sta_txpwr {
  * @support_p2p_ps: indicates whether the STA supports P2P PS mechanism or not.
  * @max_rc_amsdu_len: Maximum A-MSDU size in bytes recommended by rate control.
  * @max_tid_amsdu_len: Maximum A-MSDU size in bytes for this TID
- * @txpwr: the station tx power configuration
  * @txq: per-TID data TX queues (if driver uses the TXQ abstraction); note that
  *	the last entry (%IEEE80211_NUM_TIDS) is used for non-data frames
+ * @multi_link_sta: Identifies if this sta is a MLD STA
+ * @deflink: This holds the default link STA information, for non MLO STA all link
+ *	specific STA information is accessed through @deflink or through
+ *	link[0] which points to address of @deflink. For MLO Link STA
+ *	the first added link STA will point to deflink.
+ * @link: reference to Link Sta entries. For Non MLO STA, except 1st link,
+ *	i.e link[0] all links would be assigned to NULL by default and
+ *	would access link information via @deflink or link[0]. For MLO
+ *	STA, first link STA being added will point its link pointer to
+ *	@deflink address and remaining would be allocated and the address
+ *	would be assigned to link[link_id] where link_id is the id assigned
+ *	by the AP.
  */
 struct ieee80211_sta {
-	u32 supp_rates[NUM_NL80211_BANDS];
 	u8 addr[ETH_ALEN];
 	u16 aid;
-	struct ieee80211_sta_ht_cap ht_cap;
-	struct ieee80211_sta_vht_cap vht_cap;
-	struct ieee80211_sta_he_cap he_cap;
-	struct ieee80211_he_6ghz_capa he_6ghz_capa;
-	struct ieee80211_sta_eht_cap eht_cap;
 	u16 max_rx_aggregation_subframes;
 	bool wme;
 	u8 uapsd_queues;
 	u8 max_sp;
-	u8 rx_nss;
-	enum ieee80211_sta_rx_bandwidth bandwidth;
 	enum ieee80211_smps_mode smps_mode;
 	struct ieee80211_sta_rates __rcu *rates;
 	bool tdls;
@@ -2147,10 +2180,13 @@ struct ieee80211_sta {
 	bool support_p2p_ps;
 	u16 max_rc_amsdu_len;
 	u16 max_tid_amsdu_len[IEEE80211_NUM_TIDS];
-	struct ieee80211_sta_txpwr txpwr;
 
 	struct ieee80211_txq *txq[IEEE80211_NUM_TIDS + 1];
 
+	bool multi_link_sta;
+	struct ieee80211_link_sta deflink;
+	struct ieee80211_link_sta *link[MAX_STA_LINKS];
+
 	/* must be last */
 	u8 drv_priv[] __aligned(sizeof(void *));
 };
@@ -6371,7 +6407,7 @@ static inline int rate_supported(struct ieee80211_sta *sta,
 				 enum nl80211_band band,
 				 int index)
 {
-	return (sta == NULL || sta->supp_rates[band] & BIT(index));
+	return (sta == NULL || sta->deflink.supp_rates[band] & BIT(index));
 }
 
 static inline s8
-- 
cgit v1.2.3


From 888ade8f90d7dbbdc8552ae9b23d311f9e61ab0e Mon Sep 17 00:00:00 2001
From: Guillaume Nault <gnault@redhat.com>
Date: Fri, 8 Apr 2022 22:08:37 +0200
Subject: ipv4: Use dscp_t in struct fib_rt_info

Use the new dscp_t type to replace the tos field of struct fib_rt_info.
This ensures ECN bits are ignored and makes it compatible with the
fa_dscp field of struct fib_alias.

This also allows sparse to flag potential incorrect uses of DSCP and
ECN bits.

Signed-off-by: Guillaume Nault <gnault@redhat.com>
Reviewed-by: Ido Schimmel <idosch@nvidia.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/ip_fib.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 6a82bcb8813b..f08ba531ac08 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -212,7 +212,7 @@ struct fib_rt_info {
 	u32			tb_id;
 	__be32			dst;
 	int			dst_len;
-	u8			tos;
+	dscp_t			dscp;
 	u8			type;
 	u8			offload:1,
 				trap:1,
-- 
cgit v1.2.3


From 568a3f33b4273833f1b1e4a39d4a3410c4770c32 Mon Sep 17 00:00:00 2001
From: Guillaume Nault <gnault@redhat.com>
Date: Fri, 8 Apr 2022 22:08:40 +0200
Subject: ipv4: Use dscp_t in struct fib_entry_notifier_info

Use the new dscp_t type to replace the tos field of struct
fib_entry_notifier_info. This ensures ECN bits are ignored and makes it
compatible with the dscp field of struct fib_rt_info.

This also allows sparse to flag potential incorrect uses of DSCP and
ECN bits.

Signed-off-by: Guillaume Nault <gnault@redhat.com>
Reviewed-by: Ido Schimmel <idosch@nvidia.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/ip_fib.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index f08ba531ac08..a378eff827c7 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -225,7 +225,7 @@ struct fib_entry_notifier_info {
 	u32 dst;
 	int dst_len;
 	struct fib_info *fi;
-	u8 tos;
+	dscp_t dscp;
 	u8 type;
 	u32 tb_id;
 };
-- 
cgit v1.2.3


From ec095263a965720e1ca39db1d9c5cd47846c789b Mon Sep 17 00:00:00 2001
From: Oliver Hartkopp <socketcan@hartkopp.net>
Date: Mon, 11 Apr 2022 14:49:55 +0200
Subject: net: remove noblock parameter from recvmsg() entities

The internal recvmsg() functions have two parameters 'flags' and 'noblock'
that were merged inside skb_recv_datagram(). As a follow up patch to commit
f4b41f062c42 ("net: remove noblock parameter from skb_recv_datagram()")
this patch removes the separate 'noblock' parameter for recvmsg().

Analogue to the referenced patch for skb_recv_datagram() the 'flags' and
'noblock' parameters are unnecessarily split up with e.g.

err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
                           flags & ~MSG_DONTWAIT, &addr_len);

or in

err = INDIRECT_CALL_2(sk->sk_prot->recvmsg, tcp_recvmsg, udp_recvmsg,
                      sk, msg, size, flags & MSG_DONTWAIT,
                      flags & ~MSG_DONTWAIT, &addr_len);

instead of simply using only flags all the time and check for MSG_DONTWAIT
where needed (to preserve for the formerly separated no(n)block condition).

Signed-off-by: Oliver Hartkopp <socketcan@hartkopp.net>
Link: https://lore.kernel.org/r/20220411124955.154876-1-socketcan@hartkopp.net
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/net/ping.h      | 2 +-
 include/net/sctp/sctp.h | 2 +-
 include/net/sock.h      | 3 +--
 include/net/tcp.h       | 2 +-
 include/net/tls.h       | 2 +-
 include/net/udp.h       | 8 ++++----
 6 files changed, 9 insertions(+), 10 deletions(-)

(limited to 'include/net')

diff --git a/include/net/ping.h b/include/net/ping.h
index b68fbfdb606f..e4ff3911cbf5 100644
--- a/include/net/ping.h
+++ b/include/net/ping.h
@@ -71,7 +71,7 @@ void ping_err(struct sk_buff *skb, int offset, u32 info);
 int  ping_getfrag(void *from, char *to, int offset, int fraglen, int odd,
 		  struct sk_buff *);
 
-int  ping_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock,
+int  ping_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 		  int flags, int *addr_len);
 int  ping_common_sendmsg(int family, struct msghdr *msg, size_t len,
 			 void *user_icmph, size_t icmph_len);
diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h
index bf3716fe83e0..a04999ee99b0 100644
--- a/include/net/sctp/sctp.h
+++ b/include/net/sctp/sctp.h
@@ -103,7 +103,7 @@ void sctp_copy_sock(struct sock *newsk, struct sock *sk,
 		    struct sctp_association *asoc);
 extern struct percpu_counter sctp_sockets_allocated;
 int sctp_asconf_mgmt(struct sctp_sock *, struct sctp_sockaddr_entry *);
-struct sk_buff *sctp_skb_recv_datagram(struct sock *, int, int, int *);
+struct sk_buff *sctp_skb_recv_datagram(struct sock *, int, int *);
 
 typedef int (*sctp_callback_t)(struct sctp_endpoint *, struct sctp_transport *, void *);
 void sctp_transport_walk_start(struct rhashtable_iter *iter);
diff --git a/include/net/sock.h b/include/net/sock.h
index 1a988e605f09..a01d6c421aa2 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1202,8 +1202,7 @@ struct proto {
 	int			(*sendmsg)(struct sock *sk, struct msghdr *msg,
 					   size_t len);
 	int			(*recvmsg)(struct sock *sk, struct msghdr *msg,
-					   size_t len, int noblock, int flags,
-					   int *addr_len);
+					   size_t len, int flags, int *addr_len);
 	int			(*sendpage)(struct sock *sk, struct page *page,
 					int offset, size_t size, int flags);
 	int			(*bind)(struct sock *sk,
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 6d50a662bf89..679b1964d494 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -407,7 +407,7 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,
 		   unsigned int optlen);
 void tcp_set_keepalive(struct sock *sk, int val);
 void tcp_syn_ack_timeout(const struct request_sock *req);
-int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
+int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 		int flags, int *addr_len);
 int tcp_set_rcvlowat(struct sock *sk, int val);
 int tcp_set_window_clamp(struct sock *sk, int val);
diff --git a/include/net/tls.h b/include/net/tls.h
index 6fe78361c8c8..b59f0a63292b 100644
--- a/include/net/tls.h
+++ b/include/net/tls.h
@@ -371,7 +371,7 @@ void tls_sw_free_resources_rx(struct sock *sk);
 void tls_sw_release_resources_rx(struct sock *sk);
 void tls_sw_free_ctx_rx(struct tls_context *tls_ctx);
 int tls_sw_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
-		   int nonblock, int flags, int *addr_len);
+		   int flags, int *addr_len);
 bool tls_sw_sock_is_readable(struct sock *sk);
 ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos,
 			   struct pipe_inode_info *pipe,
diff --git a/include/net/udp.h b/include/net/udp.h
index f1c2a88c9005..b83a00330566 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -250,14 +250,14 @@ void udp_destruct_sock(struct sock *sk);
 void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len);
 int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb);
 void udp_skb_destructor(struct sock *sk, struct sk_buff *skb);
-struct sk_buff *__skb_recv_udp(struct sock *sk, unsigned int flags,
-			       int noblock, int *off, int *err);
+struct sk_buff *__skb_recv_udp(struct sock *sk, unsigned int flags, int *off,
+			       int *err);
 static inline struct sk_buff *skb_recv_udp(struct sock *sk, unsigned int flags,
-					   int noblock, int *err)
+					   int *err)
 {
 	int off = 0;
 
-	return __skb_recv_udp(sk, flags, noblock, &off, err);
+	return __skb_recv_udp(sk, flags, &off, err);
 }
 
 int udp_v4_early_demux(struct sk_buff *skb);
-- 
cgit v1.2.3


From 5bd8baab087dff657e05387aee802e70304cc813 Mon Sep 17 00:00:00 2001
From: Sabrina Dubroca <sd@queasysnail.net>
Date: Wed, 13 Apr 2022 10:10:50 +0200
Subject: esp: limit skb_page_frag_refill use to a single page

Commit ebe48d368e97 ("esp: Fix possible buffer overflow in ESP
transformation") tried to fix skb_page_frag_refill usage in ESP by
capping allocsize to 32k, but that doesn't completely solve the issue,
as skb_page_frag_refill may return a single page. If that happens, we
will write out of bounds, despite the check introduced in the previous
patch.

This patch forces COW in cases where we would end up calling
skb_page_frag_refill with a size larger than a page (first in
esp_output_head with tailen, then in esp_output_tail with
skb->data_len).

Fixes: cac2661c53f3 ("esp4: Avoid skb_cow_data whenever possible")
Fixes: 03e2a30f6a27 ("esp6: Avoid skb_cow_data whenever possible")
Signed-off-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 include/net/esp.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/net')

diff --git a/include/net/esp.h b/include/net/esp.h
index 90cd02ff77ef..9c5637d41d95 100644
--- a/include/net/esp.h
+++ b/include/net/esp.h
@@ -4,8 +4,6 @@
 
 #include <linux/skbuff.h>
 
-#define ESP_SKB_FRAG_MAXSIZE (PAGE_SIZE << SKB_FRAG_PAGE_ORDER)
-
 struct ip_esp_hdr;
 
 static inline struct ip_esp_hdr *ip_esp_hdr(const struct sk_buff *skb)
-- 
cgit v1.2.3


From 12dc5c2cb7b269c5a1c6d02844f40bfce942a7a6 Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <razor@blackwall.org>
Date: Wed, 13 Apr 2022 13:51:51 +0300
Subject: net: rtnetlink: add msg kind names

Add rtnl kind names instead of using raw values. We'll need to
check for DEL kind later to validate bulk flag support.

Signed-off-by: Nikolay Aleksandrov <razor@blackwall.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/rtnetlink.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/net')

diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h
index 9f48733bfd21..78712b51f3da 100644
--- a/include/net/rtnetlink.h
+++ b/include/net/rtnetlink.h
@@ -13,6 +13,13 @@ enum rtnl_link_flags {
 	RTNL_FLAG_DOIT_UNLOCKED = 1,
 };
 
+enum rtnl_kinds {
+	RTNL_KIND_NEW,
+	RTNL_KIND_DEL,
+	RTNL_KIND_GET,
+	RTNL_KIND_SET
+};
+
 void rtnl_register(int protocol, int msgtype,
 		   rtnl_doit_func, rtnl_dumpit_func, unsigned int flags);
 int rtnl_register_module(struct module *owner, int protocol, int msgtype,
-- 
cgit v1.2.3


From 2e9ea3e30f696fd438319c07836422bb0bbb4608 Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <razor@blackwall.org>
Date: Wed, 13 Apr 2022 13:51:52 +0300
Subject: net: rtnetlink: add helper to extract msg type's kind

Add a helper which extracts the msg type's kind using the kind mask (0x3).

Signed-off-by: Nikolay Aleksandrov <razor@blackwall.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/rtnetlink.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/net')

diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h
index 78712b51f3da..c51c5ff7f7e2 100644
--- a/include/net/rtnetlink.h
+++ b/include/net/rtnetlink.h
@@ -19,6 +19,12 @@ enum rtnl_kinds {
 	RTNL_KIND_GET,
 	RTNL_KIND_SET
 };
+#define RTNL_KIND_MASK 0x3
+
+static inline enum rtnl_kinds rtnl_msgtype_kind(int msgtype)
+{
+	return msgtype & RTNL_KIND_MASK;
+}
 
 void rtnl_register(int protocol, int msgtype,
 		   rtnl_doit_func, rtnl_dumpit_func, unsigned int flags);
-- 
cgit v1.2.3


From 0569e31f1bc2f50613ba4c219f3ecc0d1174d841 Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <razor@blackwall.org>
Date: Wed, 13 Apr 2022 13:51:53 +0300
Subject: net: rtnetlink: use BIT for flag values

Use BIT to define flag values.

Signed-off-by: Nikolay Aleksandrov <razor@blackwall.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/rtnetlink.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h
index c51c5ff7f7e2..0bf622409aaa 100644
--- a/include/net/rtnetlink.h
+++ b/include/net/rtnetlink.h
@@ -10,7 +10,7 @@ typedef int (*rtnl_doit_func)(struct sk_buff *, struct nlmsghdr *,
 typedef int (*rtnl_dumpit_func)(struct sk_buff *, struct netlink_callback *);
 
 enum rtnl_link_flags {
-	RTNL_FLAG_DOIT_UNLOCKED = 1,
+	RTNL_FLAG_DOIT_UNLOCKED = BIT(0),
 };
 
 enum rtnl_kinds {
-- 
cgit v1.2.3


From a6cec0bcd34264be8887791594be793b3f12719f Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <razor@blackwall.org>
Date: Wed, 13 Apr 2022 13:51:55 +0300
Subject: net: rtnetlink: add bulk delete support flag

Add a new rtnl flag (RTNL_FLAG_BULK_DEL_SUPPORTED) which is used to
verify that the delete operation allows bulk object deletion. Also emit
a warning if anyone tries to set it for non-delete kind.

Suggested-by: David Ahern <dsahern@kernel.org>
Signed-off-by: Nikolay Aleksandrov <razor@blackwall.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/rtnetlink.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h
index 0bf622409aaa..bf8bb3357825 100644
--- a/include/net/rtnetlink.h
+++ b/include/net/rtnetlink.h
@@ -10,7 +10,8 @@ typedef int (*rtnl_doit_func)(struct sk_buff *, struct nlmsghdr *,
 typedef int (*rtnl_dumpit_func)(struct sk_buff *, struct netlink_callback *);
 
 enum rtnl_link_flags {
-	RTNL_FLAG_DOIT_UNLOCKED = BIT(0),
+	RTNL_FLAG_DOIT_UNLOCKED		= BIT(0),
+	RTNL_FLAG_BULK_DEL_SUPPORTED	= BIT(1),
 };
 
 enum rtnl_kinds {
-- 
cgit v1.2.3


From f3c5264f452a5b0ac1de1f2f657efbabdea3c76a Mon Sep 17 00:00:00 2001
From: Lorenzo Bianconi <lorenzo@kernel.org>
Date: Tue, 12 Apr 2022 18:31:58 +0200
Subject: net: page_pool: introduce ethtool stats

Introduce page_pool APIs to report stats through ethtool and reduce
duplicated code in each driver.

Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Reviewed-by: Jakub Kicinski <kuba@kernel.org>
Reviewed-by: Ilias Apalodimas <ilias.apalodimas@linaro.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/page_pool.h | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

(limited to 'include/net')

diff --git a/include/net/page_pool.h b/include/net/page_pool.h
index ea5fb70e5101..813c93499f20 100644
--- a/include/net/page_pool.h
+++ b/include/net/page_pool.h
@@ -117,6 +117,10 @@ struct page_pool_stats {
 	struct page_pool_recycle_stats recycle_stats;
 };
 
+int page_pool_ethtool_stats_get_count(void);
+u8 *page_pool_ethtool_stats_get_strings(u8 *data);
+u64 *page_pool_ethtool_stats_get(u64 *data, void *stats);
+
 /*
  * Drivers that wish to harvest page pool stats and report them to users
  * (perhaps via ethtool, debugfs, or another mechanism) can allocate a
@@ -124,6 +128,23 @@ struct page_pool_stats {
  */
 bool page_pool_get_stats(struct page_pool *pool,
 			 struct page_pool_stats *stats);
+#else
+
+static inline int page_pool_ethtool_stats_get_count(void)
+{
+	return 0;
+}
+
+static inline u8 *page_pool_ethtool_stats_get_strings(u8 *data)
+{
+	return data;
+}
+
+static inline u64 *page_pool_ethtool_stats_get(u64 *data, void *stats)
+{
+	return data;
+}
+
 #endif
 
 struct page_pool {
-- 
cgit v1.2.3


From db53cd3d88dc328dea2e968c9c8d3b4294a8a674 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@kernel.org>
Date: Wed, 13 Apr 2022 11:43:20 -0600
Subject: net: Handle l3mdev in ip_tunnel_init_flow

Ido reported that the commit referenced in the Fixes tag broke
a gre use case with dummy devices. Add a check to ip_tunnel_init_flow
to see if the oif is an l3mdev port and if so set the oif to 0 to
avoid the oif comparison in fib_lookup_good_nhc.

Fixes: 40867d74c374 ("net: Add l3mdev index to flow struct and avoid oif reset for port devices")
Reported-by: Ido Schimmel <idosch@idosch.org>
Signed-off-by: David Ahern <dsahern@kernel.org>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/ip_tunnels.h | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

(limited to 'include/net')

diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
index 0219fe907b26..88dee57eac8a 100644
--- a/include/net/ip_tunnels.h
+++ b/include/net/ip_tunnels.h
@@ -243,11 +243,18 @@ static inline __be32 tunnel_id_to_key32(__be64 tun_id)
 static inline void ip_tunnel_init_flow(struct flowi4 *fl4,
 				       int proto,
 				       __be32 daddr, __be32 saddr,
-				       __be32 key, __u8 tos, int oif,
+				       __be32 key, __u8 tos,
+				       struct net *net, int oif,
 				       __u32 mark, __u32 tun_inner_hash)
 {
 	memset(fl4, 0, sizeof(*fl4));
-	fl4->flowi4_oif = oif;
+
+	if (oif) {
+		fl4->flowi4_l3mdev = l3mdev_master_upper_ifindex_by_index_rcu(net, oif);
+		/* Legacy VRF/l3mdev use case */
+		fl4->flowi4_oif = fl4->flowi4_l3mdev ? 0 : oif;
+	}
+
 	fl4->daddr = daddr;
 	fl4->saddr = saddr;
 	fl4->flowi4_tos = tos;
-- 
cgit v1.2.3


From 9cb7c013420f98fa6fd12fc6a5dc055170c108db Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 13 Apr 2022 11:13:33 -0700
Subject: ipv6: make ip6_rt_gc_expire an atomic_t

Reads and Writes to ip6_rt_gc_expire always have been racy,
as syzbot reported lately [1]

There is a possible risk of under-flow, leading
to unexpected high value passed to fib6_run_gc(),
although I have not observed this in the field.

Hosts hitting ip6_dst_gc() very hard are under pretty bad
state anyway.

[1]
BUG: KCSAN: data-race in ip6_dst_gc / ip6_dst_gc

read-write to 0xffff888102110744 of 4 bytes by task 13165 on cpu 1:
 ip6_dst_gc+0x1f3/0x220 net/ipv6/route.c:3311
 dst_alloc+0x9b/0x160 net/core/dst.c:86
 ip6_dst_alloc net/ipv6/route.c:344 [inline]
 icmp6_dst_alloc+0xb2/0x360 net/ipv6/route.c:3261
 mld_sendpack+0x2b9/0x580 net/ipv6/mcast.c:1807
 mld_send_cr net/ipv6/mcast.c:2119 [inline]
 mld_ifc_work+0x576/0x800 net/ipv6/mcast.c:2651
 process_one_work+0x3d3/0x720 kernel/workqueue.c:2289
 worker_thread+0x618/0xa70 kernel/workqueue.c:2436
 kthread+0x1a9/0x1e0 kernel/kthread.c:376
 ret_from_fork+0x1f/0x30

read-write to 0xffff888102110744 of 4 bytes by task 11607 on cpu 0:
 ip6_dst_gc+0x1f3/0x220 net/ipv6/route.c:3311
 dst_alloc+0x9b/0x160 net/core/dst.c:86
 ip6_dst_alloc net/ipv6/route.c:344 [inline]
 icmp6_dst_alloc+0xb2/0x360 net/ipv6/route.c:3261
 mld_sendpack+0x2b9/0x580 net/ipv6/mcast.c:1807
 mld_send_cr net/ipv6/mcast.c:2119 [inline]
 mld_ifc_work+0x576/0x800 net/ipv6/mcast.c:2651
 process_one_work+0x3d3/0x720 kernel/workqueue.c:2289
 worker_thread+0x618/0xa70 kernel/workqueue.c:2436
 kthread+0x1a9/0x1e0 kernel/kthread.c:376
 ret_from_fork+0x1f/0x30

value changed: 0x00000bb3 -> 0x00000ba9

Reported by Kernel Concurrency Sanitizer on:
CPU: 0 PID: 11607 Comm: kworker/0:21 Not tainted 5.18.0-rc1-syzkaller-00037-g42e7a03d3bad-dirty #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Workqueue: mld mld_ifc_work

Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: syzbot <syzkaller@googlegroups.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Link: https://lore.kernel.org/r/20220413181333.649424-1-eric.dumazet@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/netns/ipv6.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/net')

diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h
index 3d83b64471d3..b4af4837d80b 100644
--- a/include/net/netns/ipv6.h
+++ b/include/net/netns/ipv6.h
@@ -75,8 +75,8 @@ struct netns_ipv6 {
 	struct list_head	fib6_walkers;
 	rwlock_t		fib6_walker_lock;
 	spinlock_t		fib6_gc_lock;
-	unsigned int		 ip6_rt_gc_expire;
-	unsigned long		 ip6_rt_last_gc;
+	atomic_t		ip6_rt_gc_expire;
+	unsigned long		ip6_rt_last_gc;
 	unsigned char		flowlabel_has_excl;
 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
 	bool			fib6_has_custom_rules;
-- 
cgit v1.2.3


From c246f9b5fd617fe487f8b6f18851703f468501d6 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@nvidia.com>
Date: Mon, 18 Apr 2022 09:42:25 +0300
Subject: devlink: add support to create line card and expose to user

Extend the devlink API so the driver is going to be able to create and
destroy linecard instances. There can be multiple line cards per devlink
device. Expose this new type of object over devlink netlink API to the
userspace, with notifications.

Signed-off-by: Jiri Pirko <jiri@nvidia.com>
Signed-off-by: Ido Schimmel <idosch@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/devlink.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/net')

diff --git a/include/net/devlink.h b/include/net/devlink.h
index a30180c0988a..7aabdfb3bc6a 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -22,6 +22,7 @@
 #include <linux/firmware.h>
 
 struct devlink;
+struct devlink_linecard;
 
 struct devlink_port_phys_attrs {
 	u32 port_number; /* Same value as "split group".
@@ -1536,6 +1537,9 @@ void devlink_port_attrs_pci_sf_set(struct devlink_port *devlink_port,
 int devlink_rate_leaf_create(struct devlink_port *port, void *priv);
 void devlink_rate_leaf_destroy(struct devlink_port *devlink_port);
 void devlink_rate_nodes_destroy(struct devlink *devlink);
+struct devlink_linecard *devlink_linecard_create(struct devlink *devlink,
+						 unsigned int linecard_index);
+void devlink_linecard_destroy(struct devlink_linecard *linecard);
 int devlink_sb_register(struct devlink *devlink, unsigned int sb_index,
 			u32 size, u16 ingress_pools_count,
 			u16 egress_pools_count, u16 ingress_tc_count,
-- 
cgit v1.2.3


From fcdc8ce23a309c26a67fc613a741d9b21a248311 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@nvidia.com>
Date: Mon, 18 Apr 2022 09:42:26 +0300
Subject: devlink: implement line card provisioning

In order to be able to configure all needed stuff on a port/netdevice
of a line card without the line card being present, introduce line card
provisioning. Basically by setting a type, provisioning process will
start and driver is supposed to create a placeholder for instances
(ports/netdevices) for a line card type.

Allow the user to query the supported line card types over line card
get command. Then implement two netlink command SET to allow user to
set/unset the card type.

On the driver API side, add provision/unprovision ops and supported
types array to be advertised. Upon provision op call, the driver should
take care of creating the instances for the particular line card type.
Introduce provision_set/clear() functions to be called by the driver
once the provisioning/unprovisioning is done on its side. These helpers
are not to be called directly due to the async nature of provisioning.

Example:
$ devlink port # No ports are listed
$ devlink lc
pci/0000:01:00.0:
  lc 1 state unprovisioned
    supported_types:
       16x100G
  lc 2 state unprovisioned
    supported_types:
       16x100G
  lc 3 state unprovisioned
    supported_types:
       16x100G
  lc 4 state unprovisioned
    supported_types:
       16x100G
  lc 5 state unprovisioned
    supported_types:
       16x100G
  lc 6 state unprovisioned
    supported_types:
       16x100G
  lc 7 state unprovisioned
    supported_types:
       16x100G
  lc 8 state unprovisioned
    supported_types:
       16x100G

$ devlink lc set pci/0000:01:00.0 lc 8 type 16x100G
$ devlink lc show pci/0000:01:00.0 lc 8
pci/0000:01:00.0:
  lc 8 state active type 16x100G
    supported_types:
       16x100G
$ devlink port
pci/0000:01:00.0/0: type notset flavour cpu port 0 splittable false
pci/0000:01:00.0/53: type eth netdev enp1s0nl8p1 flavour physical lc 8 port 1 splittable true lanes 4
pci/0000:01:00.0/54: type eth netdev enp1s0nl8p2 flavour physical lc 8 port 2 splittable true lanes 4
pci/0000:01:00.0/55: type eth netdev enp1s0nl8p3 flavour physical lc 8 port 3 splittable true lanes 4
pci/0000:01:00.0/56: type eth netdev enp1s0nl8p4 flavour physical lc 8 port 4 splittable true lanes 4
pci/0000:01:00.0/57: type eth netdev enp1s0nl8p5 flavour physical lc 8 port 5 splittable true lanes 4
pci/0000:01:00.0/58: type eth netdev enp1s0nl8p6 flavour physical lc 8 port 6 splittable true lanes 4
pci/0000:01:00.0/59: type eth netdev enp1s0nl8p7 flavour physical lc 8 port 7 splittable true lanes 4
pci/0000:01:00.0/60: type eth netdev enp1s0nl8p8 flavour physical lc 8 port 8 splittable true lanes 4
pci/0000:01:00.0/61: type eth netdev enp1s0nl8p9 flavour physical lc 8 port 9 splittable true lanes 4
pci/0000:01:00.0/62: type eth netdev enp1s0nl8p10 flavour physical lc 8 port 10 splittable true lanes 4
pci/0000:01:00.0/63: type eth netdev enp1s0nl8p11 flavour physical lc 8 port 11 splittable true lanes 4
pci/0000:01:00.0/64: type eth netdev enp1s0nl8p12 flavour physical lc 8 port 12 splittable true lanes 4
pci/0000:01:00.0/125: type eth netdev enp1s0nl8p13 flavour physical lc 8 port 13 splittable true lanes 4
pci/0000:01:00.0/126: type eth netdev enp1s0nl8p14 flavour physical lc 8 port 14 splittable true lanes 4
pci/0000:01:00.0/127: type eth netdev enp1s0nl8p15 flavour physical lc 8 port 15 splittable true lanes 4
pci/0000:01:00.0/128: type eth netdev enp1s0nl8p16 flavour physical lc 8 port 16 splittable true lanes 4

$ devlink lc set pci/0000:01:00.0 lc 8 notype

Signed-off-by: Jiri Pirko <jiri@nvidia.com>
Signed-off-by: Ido Schimmel <idosch@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/devlink.h | 43 +++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 41 insertions(+), 2 deletions(-)

(limited to 'include/net')

diff --git a/include/net/devlink.h b/include/net/devlink.h
index 7aabdfb3bc6a..3e49d4ff498c 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -149,6 +149,40 @@ struct devlink_port_new_attrs {
 	   sfnum_valid:1;
 };
 
+/**
+ * struct devlink_linecard_ops - Linecard operations
+ * @provision: callback to provision the linecard slot with certain
+ *	       type of linecard. As a result of this operation,
+ *	       driver is expected to eventually (could be after
+ *	       the function call returns) call one of:
+ *	       devlink_linecard_provision_set()
+ *	       devlink_linecard_provision_fail()
+ * @unprovision: callback to unprovision the linecard slot. As a result
+ *		 of this operation, driver is expected to eventually
+ *		 (could be after the function call returns) call
+ *	         devlink_linecard_provision_clear()
+ *	         devlink_linecard_provision_fail()
+ * @same_provision: callback to ask the driver if linecard is already
+ *                  provisioned in the same way user asks this linecard to be
+ *                  provisioned.
+ * @types_count: callback to get number of supported types
+ * @types_get: callback to get next type in list
+ */
+struct devlink_linecard_ops {
+	int (*provision)(struct devlink_linecard *linecard, void *priv,
+			 const char *type, const void *type_priv,
+			 struct netlink_ext_ack *extack);
+	int (*unprovision)(struct devlink_linecard *linecard, void *priv,
+			   struct netlink_ext_ack *extack);
+	bool (*same_provision)(struct devlink_linecard *linecard, void *priv,
+			       const char *type, const void *type_priv);
+	unsigned int (*types_count)(struct devlink_linecard *linecard,
+				    void *priv);
+	void (*types_get)(struct devlink_linecard *linecard,
+			  void *priv, unsigned int index, const char **type,
+			  const void **type_priv);
+};
+
 struct devlink_sb_pool_info {
 	enum devlink_sb_pool_type pool_type;
 	u32 size;
@@ -1537,9 +1571,14 @@ void devlink_port_attrs_pci_sf_set(struct devlink_port *devlink_port,
 int devlink_rate_leaf_create(struct devlink_port *port, void *priv);
 void devlink_rate_leaf_destroy(struct devlink_port *devlink_port);
 void devlink_rate_nodes_destroy(struct devlink *devlink);
-struct devlink_linecard *devlink_linecard_create(struct devlink *devlink,
-						 unsigned int linecard_index);
+struct devlink_linecard *
+devlink_linecard_create(struct devlink *devlink, unsigned int linecard_index,
+			const struct devlink_linecard_ops *ops, void *priv);
 void devlink_linecard_destroy(struct devlink_linecard *linecard);
+void devlink_linecard_provision_set(struct devlink_linecard *linecard,
+				    const char *type);
+void devlink_linecard_provision_clear(struct devlink_linecard *linecard);
+void devlink_linecard_provision_fail(struct devlink_linecard *linecard);
 int devlink_sb_register(struct devlink *devlink, unsigned int sb_index,
 			u32 size, u16 ingress_pools_count,
 			u16 egress_pools_count, u16 ingress_tc_count,
-- 
cgit v1.2.3


From fc9f50d5b366cd9f35bdee22fe3f8d77833cb1d8 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@nvidia.com>
Date: Mon, 18 Apr 2022 09:42:27 +0300
Subject: devlink: implement line card active state

Allow driver to mark a line card as active. Expose this state to the
userspace over devlink netlink interface with proper notifications.
'active' state means that line card was plugged in after
being provisioned.

Signed-off-by: Jiri Pirko <jiri@nvidia.com>
Signed-off-by: Ido Schimmel <idosch@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/devlink.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/net')

diff --git a/include/net/devlink.h b/include/net/devlink.h
index 3e49d4ff498c..d8061a11fee6 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -1579,6 +1579,8 @@ void devlink_linecard_provision_set(struct devlink_linecard *linecard,
 				    const char *type);
 void devlink_linecard_provision_clear(struct devlink_linecard *linecard);
 void devlink_linecard_provision_fail(struct devlink_linecard *linecard);
+void devlink_linecard_activate(struct devlink_linecard *linecard);
+void devlink_linecard_deactivate(struct devlink_linecard *linecard);
 int devlink_sb_register(struct devlink *devlink, unsigned int sb_index,
 			u32 size, u16 ingress_pools_count,
 			u16 egress_pools_count, u16 ingress_tc_count,
-- 
cgit v1.2.3


From b837585985386e05478cb17868d23a6fa87c4f8d Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@nvidia.com>
Date: Mon, 18 Apr 2022 09:42:28 +0300
Subject: devlink: add port to line card relationship set

In order to properly inform user about relationship between port and
line card, introduce a driver API to set line card for a port. Use this
information to extend port devlink netlink message by line card index
and also include the line card index into phys_port_name and by that
into a netdevice name.

Signed-off-by: Jiri Pirko <jiri@nvidia.com>
Signed-off-by: Ido Schimmel <idosch@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/devlink.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/net')

diff --git a/include/net/devlink.h b/include/net/devlink.h
index d8061a11fee6..2a2a2a0c93f7 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -136,6 +136,7 @@ struct devlink_port {
 	struct mutex reporters_lock; /* Protects reporter_list */
 
 	struct devlink_rate *devlink_rate;
+	struct devlink_linecard *linecard;
 };
 
 struct devlink_port_new_attrs {
@@ -1571,6 +1572,8 @@ void devlink_port_attrs_pci_sf_set(struct devlink_port *devlink_port,
 int devlink_rate_leaf_create(struct devlink_port *port, void *priv);
 void devlink_rate_leaf_destroy(struct devlink_port *devlink_port);
 void devlink_rate_nodes_destroy(struct devlink *devlink);
+void devlink_port_linecard_set(struct devlink_port *devlink_port,
+			       struct devlink_linecard *linecard);
 struct devlink_linecard *
 devlink_linecard_create(struct devlink *devlink, unsigned int linecard_index,
 			const struct devlink_linecard_ops *ops, void *priv);
-- 
cgit v1.2.3


From 38a6f0865796e26fc38fff4858f681d9ae76fa0f Mon Sep 17 00:00:00 2001
From: Tonghao Zhang <xiangxia.m.yue@gmail.com>
Date: Sat, 16 Apr 2022 00:40:46 +0800
Subject: net: sched: support hash selecting tx queue

This patch allows users to pick queue_mapping, range
from A to B. Then we can load balance packets from A
to B tx queue. The range is an unsigned 16bit value
in decimal format.

$ tc filter ... action skbedit queue_mapping skbhash A B

"skbedit queue_mapping QUEUE_MAPPING" (from "man 8 tc-skbedit")
is enhanced with flags: SKBEDIT_F_TXQ_SKBHASH

  +----+      +----+      +----+
  | P1 |      | P2 |      | Pn |
  +----+      +----+      +----+
    |           |           |
    +-----------+-----------+
                |
                | clsact/skbedit
                |      MQ
                v
    +-----------+-----------+
    | q0        | qn        | qm
    v           v           v
  HTB/FQ       FIFO   ...  FIFO

For example:
If P1 sends out packets to different Pods on other host, and
we want distribute flows from qn - qm. Then we can use skb->hash
as hash.

setup commands:
$ NETDEV=eth0
$ ip netns add n1
$ ip link add ipv1 link $NETDEV type ipvlan mode l2
$ ip link set ipv1 netns n1
$ ip netns exec n1 ifconfig ipv1 2.2.2.100/24 up

$ tc qdisc add dev $NETDEV clsact
$ tc filter add dev $NETDEV egress protocol ip prio 1 \
        flower skip_hw src_ip 2.2.2.100 action skbedit queue_mapping skbhash 2 6
$ tc qdisc add dev $NETDEV handle 1: root mq
$ tc qdisc add dev $NETDEV parent 1:1 handle 2: htb
$ tc class add dev $NETDEV parent 2: classid 2:1 htb rate 100kbit
$ tc class add dev $NETDEV parent 2: classid 2:2 htb rate 200kbit
$ tc qdisc add dev $NETDEV parent 1:2 tbf rate 100mbit burst 100mb latency 1
$ tc qdisc add dev $NETDEV parent 1:3 pfifo
$ tc qdisc add dev $NETDEV parent 1:4 pfifo
$ tc qdisc add dev $NETDEV parent 1:5 pfifo
$ tc qdisc add dev $NETDEV parent 1:6 pfifo
$ tc qdisc add dev $NETDEV parent 1:7 pfifo

$ ip netns exec n1 iperf3 -c 2.2.2.1 -i 1 -t 10 -P 10

pick txqueue from 2 - 6:
$ ethtool -S $NETDEV | grep -i tx_queue_[0-9]_bytes
     tx_queue_0_bytes: 42
     tx_queue_1_bytes: 0
     tx_queue_2_bytes: 11442586444
     tx_queue_3_bytes: 7383615334
     tx_queue_4_bytes: 3981365579
     tx_queue_5_bytes: 3983235051
     tx_queue_6_bytes: 6706236461
     tx_queue_7_bytes: 42
     tx_queue_8_bytes: 0
     tx_queue_9_bytes: 0

txqueues 2 - 6 are mapped to classid 1:3 - 1:7
$ tc -s class show dev $NETDEV
...
class mq 1:3 root leaf 8002:
 Sent 11949133672 bytes 7929798 pkt (dropped 0, overlimits 0 requeues 0)
 backlog 0b 0p requeues 0
class mq 1:4 root leaf 8003:
 Sent 7710449050 bytes 5117279 pkt (dropped 0, overlimits 0 requeues 0)
 backlog 0b 0p requeues 0
class mq 1:5 root leaf 8004:
 Sent 4157648675 bytes 2758990 pkt (dropped 0, overlimits 0 requeues 0)
 backlog 0b 0p requeues 0
class mq 1:6 root leaf 8005:
 Sent 4159632195 bytes 2759990 pkt (dropped 0, overlimits 0 requeues 0)
 backlog 0b 0p requeues 0
class mq 1:7 root leaf 8006:
 Sent 7003169603 bytes 4646912 pkt (dropped 0, overlimits 0 requeues 0)
 backlog 0b 0p requeues 0
...

Cc: Jamal Hadi Salim <jhs@mojatatu.com>
Cc: Cong Wang <xiyou.wangcong@gmail.com>
Cc: Jiri Pirko <jiri@resnulli.us>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: Jonathan Lemon <jonathan.lemon@gmail.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Alexander Lobakin <alobakin@pm.me>
Cc: Paolo Abeni <pabeni@redhat.com>
Cc: Talal Ahmad <talalahmad@google.com>
Cc: Kevin Hao <haokexin@gmail.com>
Cc: Ilias Apalodimas <ilias.apalodimas@linaro.org>
Cc: Kees Cook <keescook@chromium.org>
Cc: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Cc: Antoine Tenart <atenart@kernel.org>
Cc: Wei Wang <weiwan@google.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Tonghao Zhang <xiangxia.m.yue@gmail.com>
Reviewed-by: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/net/tc_act/tc_skbedit.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/net')

diff --git a/include/net/tc_act/tc_skbedit.h b/include/net/tc_act/tc_skbedit.h
index cab8229b9bed..dc1079f28e13 100644
--- a/include/net/tc_act/tc_skbedit.h
+++ b/include/net/tc_act/tc_skbedit.h
@@ -17,6 +17,7 @@ struct tcf_skbedit_params {
 	u32 mark;
 	u32 mask;
 	u16 queue_mapping;
+	u16 mapping_mod;
 	u16 ptype;
 	struct rcu_head rcu;
 };
-- 
cgit v1.2.3


From 34951fcf26c59e78ae430fba1fce7c08b1871249 Mon Sep 17 00:00:00 2001
From: Boris Sukholitko <boris.sukholitko@broadcom.com>
Date: Tue, 19 Apr 2022 11:14:32 +0300
Subject: flow_dissector: Add number of vlan tags dissector

Our customers in the fiber telecom world have network configurations
where they would like to control their traffic according to the number
of tags appearing in the packet.

For example, TR247 GPON conformance test suite specification mostly
talks about untagged, single, double tagged packets and gives lax
guidelines on the vlan protocol vs. number of vlan tags.

This is different from the common IT networks where 802.1Q and 802.1ad
protocols are usually describe single and double tagged packet. GPON
configurations that we work with have arbitrary mix the above protocols
and number of vlan tags in the packet.

The goal is to make the following TC commands possible:

tc filter add dev eth1 ingress flower \
  num_of_vlans 1 vlan_prio 5 action drop

From our logs, we have redirect rules such that:

tc filter add dev $GPON ingress flower num_of_vlans $N \
     action mirred egress redirect dev $DEV

where N can range from 0 to 3 and $DEV is the function of $N.

Also there are rules setting skb mark based on the number of vlans:

tc filter add dev $GPON ingress flower num_of_vlans $N vlan_prio \
    $P action skbedit mark $M

This new dissector allows extracting the number of vlan tags existing in
the packet.

Signed-off-by: Boris Sukholitko <boris.sukholitko@broadcom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/flow_dissector.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include/net')

diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h
index 9f65f1bfbd24..a4c6057c7097 100644
--- a/include/net/flow_dissector.h
+++ b/include/net/flow_dissector.h
@@ -253,6 +253,14 @@ struct flow_dissector_key_hash {
 	u32 hash;
 };
 
+/**
+ * struct flow_dissector_key_num_of_vlans:
+ * @num_of_vlans: num_of_vlans value
+ */
+struct flow_dissector_key_num_of_vlans {
+	u8 num_of_vlans;
+};
+
 enum flow_dissector_key_id {
 	FLOW_DISSECTOR_KEY_CONTROL, /* struct flow_dissector_key_control */
 	FLOW_DISSECTOR_KEY_BASIC, /* struct flow_dissector_key_basic */
@@ -282,6 +290,7 @@ enum flow_dissector_key_id {
 	FLOW_DISSECTOR_KEY_META, /* struct flow_dissector_key_meta */
 	FLOW_DISSECTOR_KEY_CT, /* struct flow_dissector_key_ct */
 	FLOW_DISSECTOR_KEY_HASH, /* struct flow_dissector_key_hash */
+	FLOW_DISSECTOR_KEY_NUM_OF_VLANS, /* struct flow_dissector_key_num_of_vlans */
 
 	FLOW_DISSECTOR_KEY_MAX,
 };
-- 
cgit v1.2.3


From 67e1e2f4854bb2c0dd2b8440cf090016a0e1a091 Mon Sep 17 00:00:00 2001
From: Guillaume Nault <gnault@redhat.com>
Date: Thu, 21 Apr 2022 01:21:33 +0200
Subject: ipv4: Avoid using RTO_ONLINK with ip_route_connect().

Now that ip_rt_fix_tos() doesn't reset ->flowi4_scope unconditionally,
we don't have to rely on the RTO_ONLINK bit to properly set the scope
of a flowi4 structure. We can just set ->flowi4_scope explicitly and
avoid using RTO_ONLINK in ->flowi4_tos.

This patch converts callers of ip_route_connect(). Instead of setting
the tos parameter with RT_CONN_FLAGS(sk), as all callers do, we can:

  1- Drop the tos parameter from ip_route_connect(): its value was
     entirely based on sk, which is also passed as parameter.

  2- Set ->flowi4_scope depending on the SOCK_LOCALROUTE socket option
     instead of always initialising it with RT_SCOPE_UNIVERSE (let's
     define ip_sock_rt_scope() for this purpose).

  3- Avoid overloading ->flowi4_tos with RTO_ONLINK: since the scope is
     now properly initialised, we don't need to tell ip_rt_fix_tos() to
     adjust ->flowi4_scope for us. So let's define ip_sock_rt_tos(),
     which is the same as RT_CONN_FLAGS() but without the RTO_ONLINK
     bit overload.

Note:
  In the original ip_route_connect() code, __ip_route_output_key()
  might clear the RTO_ONLINK bit of fl4->flowi4_tos (because of
  ip_rt_fix_tos()). Therefore flowi4_update_output() had to reuse the
  original tos variable. Now that we don't set RTO_ONLINK any more,
  this is not a problem and we can use fl4->flowi4_tos in
  flowi4_update_output().

Signed-off-by: Guillaume Nault <gnault@redhat.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/route.h | 36 ++++++++++++++++++++++++------------
 1 file changed, 24 insertions(+), 12 deletions(-)

(limited to 'include/net')

diff --git a/include/net/route.h b/include/net/route.h
index 25404fc2b483..991a3985712d 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -43,6 +43,19 @@
 #define RT_CONN_FLAGS(sk)   (RT_TOS(inet_sk(sk)->tos) | sock_flag(sk, SOCK_LOCALROUTE))
 #define RT_CONN_FLAGS_TOS(sk,tos)   (RT_TOS(tos) | sock_flag(sk, SOCK_LOCALROUTE))
 
+static inline __u8 ip_sock_rt_scope(const struct sock *sk)
+{
+	if (sock_flag(sk, SOCK_LOCALROUTE))
+		return RT_SCOPE_LINK;
+
+	return RT_SCOPE_UNIVERSE;
+}
+
+static inline __u8 ip_sock_rt_tos(const struct sock *sk)
+{
+	return RT_TOS(inet_sk(sk)->tos);
+}
+
 struct ip_tunnel_info;
 struct fib_nh;
 struct fib_info;
@@ -289,39 +302,38 @@ static inline char rt_tos2priority(u8 tos)
  * ip_route_newports() calls.
  */
 
-static inline void ip_route_connect_init(struct flowi4 *fl4, __be32 dst, __be32 src,
-					 u32 tos, int oif, u8 protocol,
+static inline void ip_route_connect_init(struct flowi4 *fl4, __be32 dst,
+					 __be32 src, int oif, u8 protocol,
 					 __be16 sport, __be16 dport,
-					 struct sock *sk)
+					 const struct sock *sk)
 {
 	__u8 flow_flags = 0;
 
 	if (inet_sk(sk)->transparent)
 		flow_flags |= FLOWI_FLAG_ANYSRC;
 
-	flowi4_init_output(fl4, oif, sk->sk_mark, tos, RT_SCOPE_UNIVERSE,
-			   protocol, flow_flags, dst, src, dport, sport,
-			   sk->sk_uid);
+	flowi4_init_output(fl4, oif, sk->sk_mark, ip_sock_rt_tos(sk),
+			   ip_sock_rt_scope(sk), protocol, flow_flags, dst,
+			   src, dport, sport, sk->sk_uid);
 }
 
-static inline struct rtable *ip_route_connect(struct flowi4 *fl4,
-					      __be32 dst, __be32 src, u32 tos,
-					      int oif, u8 protocol,
+static inline struct rtable *ip_route_connect(struct flowi4 *fl4, __be32 dst,
+					      __be32 src, int oif, u8 protocol,
 					      __be16 sport, __be16 dport,
 					      struct sock *sk)
 {
 	struct net *net = sock_net(sk);
 	struct rtable *rt;
 
-	ip_route_connect_init(fl4, dst, src, tos, oif, protocol,
-			      sport, dport, sk);
+	ip_route_connect_init(fl4, dst, src, oif, protocol, sport, dport, sk);
 
 	if (!dst || !src) {
 		rt = __ip_route_output_key(net, fl4);
 		if (IS_ERR(rt))
 			return rt;
 		ip_rt_put(rt);
-		flowi4_update_output(fl4, oif, tos, fl4->daddr, fl4->saddr);
+		flowi4_update_output(fl4, oif, fl4->flowi4_tos, fl4->daddr,
+				     fl4->saddr);
 	}
 	security_sk_classify_flow(sk, flowi4_to_flowi_common(fl4));
 	return ip_route_output_flow(net, fl4, sk);
-- 
cgit v1.2.3


From b253a0680ceadc5d7b4acca7aa2d870326cad8ad Mon Sep 17 00:00:00 2001
From: Pengcheng Yang <yangpc@wangsu.com>
Date: Wed, 20 Apr 2022 10:34:41 +0800
Subject: tcp: ensure to use the most recently sent skb when filling the rate
 sample

If an ACK (s)acks multiple skbs, we favor the information
from the most recently sent skb by choosing the skb with
the highest prior_delivered count. But in the interval
between receiving ACKs, we send multiple skbs with the same
prior_delivered, because the tp->delivered only changes
when we receive an ACK.

We used RACK's solution, copying tcp_rack_sent_after() as
tcp_skb_sent_after() helper to determine "which packet was
sent last?". Later, we will use tcp_skb_sent_after() instead
in RACK.

Fixes: b9f64820fb22 ("tcp: track data delivery rate for a TCP connection")
Signed-off-by: Pengcheng Yang <yangpc@wangsu.com>
Cc: Paolo Abeni <pabeni@redhat.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Tested-by: Neal Cardwell <ncardwell@google.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Link: https://lore.kernel.org/r/1650422081-22153-1-git-send-email-yangpc@wangsu.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/tcp.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/net')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 70ca4a5e330a..be712fb9ddd7 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1042,6 +1042,7 @@ struct rate_sample {
 	int  losses;		/* number of packets marked lost upon ACK */
 	u32  acked_sacked;	/* number of packets newly (S)ACKed upon ACK */
 	u32  prior_in_flight;	/* in flight before this ACK */
+	u32  last_end_seq;	/* end_seq of most recently ACKed packet */
 	bool is_app_limited;	/* is sample from packet with bubble in pipe? */
 	bool is_retrans;	/* is sample from retransmission? */
 	bool is_ack_delayed;	/* is this (likely) a delayed ACK? */
@@ -1164,6 +1165,11 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost,
 		  bool is_sack_reneg, struct rate_sample *rs);
 void tcp_rate_check_app_limited(struct sock *sk);
 
+static inline bool tcp_skb_sent_after(u64 t1, u64 t2, u32 seq1, u32 seq2)
+{
+	return t1 > t2 || (t1 == t2 && after(seq1, seq2));
+}
+
 /* These functions determine how the current flow behaves in respect of SACK
  * handling. SACK is negotiated with the peer, and therefore it can vary
  * between different flows.
-- 
cgit v1.2.3


From 1e39e5a32ad7fdd82d6e071aa14ecd511eedc1f7 Mon Sep 17 00:00:00 2001
From: Geliang Tang <geliang.tang@suse.com>
Date: Fri, 22 Apr 2022 14:55:39 -0700
Subject: mptcp: infinite mapping sending

This patch adds the infinite mapping sending logic.

Add a new flag send_infinite_map in struct mptcp_subflow_context. Set
it true when a single contiguous subflow is in use and the
allow_infinite_fallback flag is true in mptcp_pm_mp_fail_received().

In mptcp_sendmsg_frag(), if this flag is true, call the new function
mptcp_update_infinite_map() to set the infinite mapping.

Add a new flag infinite_map in struct mptcp_ext, set it true in
mptcp_update_infinite_map(), and check this flag in a new helper
mptcp_check_infinite_map().

In mptcp_update_infinite_map(), set data_len to 0, and clear the
send_infinite_map flag, then do fallback.

In mptcp_established_options(), use the helper mptcp_check_infinite_map()
to let the infinite mapping DSS can be sent out in the fallback mode.

Suggested-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Geliang Tang <geliang.tang@suse.com>
Signed-off-by: Mat Martineau <mathew.j.martineau@linux.intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/mptcp.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/mptcp.h b/include/net/mptcp.h
index 0a3b0fb04a3b..8b1afd6f5cc4 100644
--- a/include/net/mptcp.h
+++ b/include/net/mptcp.h
@@ -35,7 +35,8 @@ struct mptcp_ext {
 			frozen:1,
 			reset_transient:1;
 	u8		reset_reason:4,
-			csum_reqd:1;
+			csum_reqd:1,
+			infinite_map:1;
 };
 
 #define MPTCP_RM_IDS_MAX	8
-- 
cgit v1.2.3


From 8d92e4fbcf0fb7ecb24223b7b1ce95b9beb4dfa2 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@nvidia.com>
Date: Mon, 25 Apr 2022 06:44:21 +0300
Subject: devlink: introduce line card devices support

Line card can contain one or more devices that makes sense to make
visible to the user. For example, this can be a gearbox with
flash memory, which could be updated.

Provide the driver possibility to attach such devices to a line card
and expose those to user.

Example:
$ devlink lc show pci/0000:01:00.0 lc 8
pci/0000:01:00.0:
  lc 8 state active type 16x100G
    supported_types:
      16x100G
    devices:
      device 0
      device 1
      device 2
      device 3

Signed-off-by: Jiri Pirko <jiri@nvidia.com>
Signed-off-by: Ido Schimmel <idosch@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/devlink.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/net')

diff --git a/include/net/devlink.h b/include/net/devlink.h
index 2a2a2a0c93f7..c84b52fb9ff0 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -1578,6 +1578,13 @@ struct devlink_linecard *
 devlink_linecard_create(struct devlink *devlink, unsigned int linecard_index,
 			const struct devlink_linecard_ops *ops, void *priv);
 void devlink_linecard_destroy(struct devlink_linecard *linecard);
+struct devlink_linecard_device;
+struct devlink_linecard_device *
+devlink_linecard_device_create(struct devlink_linecard *linecard,
+			       unsigned int device_index);
+void
+devlink_linecard_device_destroy(struct devlink_linecard *linecard,
+				struct devlink_linecard_device *linecard_device);
 void devlink_linecard_provision_set(struct devlink_linecard *linecard,
 				    const char *type);
 void devlink_linecard_provision_clear(struct devlink_linecard *linecard);
-- 
cgit v1.2.3


From 276910aecc6a4076f5fbfd8160ff70695d6c1eb5 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@nvidia.com>
Date: Mon, 25 Apr 2022 06:44:22 +0300
Subject: devlink: introduce line card info get message

Allow the driver to provide per line card info get op to fill-up info,
similar to the "devlink dev info".

Example:

$ devlink lc info pci/0000:01:00.0 lc 8
pci/0000:01:00.0:
  lc 8
    versions:
        fixed:
          hw.revision 0
        running:
          ini.version 4

Signed-off-by: Jiri Pirko <jiri@nvidia.com>
Signed-off-by: Ido Schimmel <idosch@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/devlink.h | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/devlink.h b/include/net/devlink.h
index c84b52fb9ff0..f96dcb376630 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -150,6 +150,8 @@ struct devlink_port_new_attrs {
 	   sfnum_valid:1;
 };
 
+struct devlink_info_req;
+
 /**
  * struct devlink_linecard_ops - Linecard operations
  * @provision: callback to provision the linecard slot with certain
@@ -168,6 +170,7 @@ struct devlink_port_new_attrs {
  *                  provisioned.
  * @types_count: callback to get number of supported types
  * @types_get: callback to get next type in list
+ * @info_get: callback to get linecard info
  */
 struct devlink_linecard_ops {
 	int (*provision)(struct devlink_linecard *linecard, void *priv,
@@ -182,6 +185,9 @@ struct devlink_linecard_ops {
 	void (*types_get)(struct devlink_linecard *linecard,
 			  void *priv, unsigned int index, const char **type,
 			  const void **type_priv);
+	int (*info_get)(struct devlink_linecard *linecard, void *priv,
+			struct devlink_info_req *req,
+			struct netlink_ext_ack *extack);
 };
 
 struct devlink_sb_pool_info {
@@ -628,7 +634,6 @@ struct devlink_flash_update_params {
 #define DEVLINK_SUPPORT_FLASH_UPDATE_OVERWRITE_MASK	BIT(1)
 
 struct devlink_region;
-struct devlink_info_req;
 
 /**
  * struct devlink_region_ops - Region operations
-- 
cgit v1.2.3


From 28b2d1f1ac41dda0b239830765d85c7ae4434182 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@nvidia.com>
Date: Mon, 25 Apr 2022 06:44:23 +0300
Subject: devlink: introduce line card device info infrastructure

Extend the line card info message with information (e.g., FW version)
about devices found on the line card.

Example:
$ devlink lc info pci/0000:01:00.0 lc 8
pci/0000:01:00.0:
  lc 8
    versions:
        fixed:
          hw.revision 0
        running:
          ini.version 4
    devices:
      device 0
        versions:
            running:
              fw 19.2010.1310
      device 1
        versions:
            running:
              fw 19.2010.1310
      device 2
        versions:
            running:
              fw 19.2010.1310
      device 3
        versions:
            running:
              fw 19.2010.1310

Signed-off-by: Jiri Pirko <jiri@nvidia.com>
Signed-off-by: Ido Schimmel <idosch@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/devlink.h | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'include/net')

diff --git a/include/net/devlink.h b/include/net/devlink.h
index f96dcb376630..062895973656 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -151,6 +151,7 @@ struct devlink_port_new_attrs {
 };
 
 struct devlink_info_req;
+struct devlink_linecard_device;
 
 /**
  * struct devlink_linecard_ops - Linecard operations
@@ -171,6 +172,7 @@ struct devlink_info_req;
  * @types_count: callback to get number of supported types
  * @types_get: callback to get next type in list
  * @info_get: callback to get linecard info
+ * @device_info_get: callback to get linecard device info
  */
 struct devlink_linecard_ops {
 	int (*provision)(struct devlink_linecard *linecard, void *priv,
@@ -188,6 +190,9 @@ struct devlink_linecard_ops {
 	int (*info_get)(struct devlink_linecard *linecard, void *priv,
 			struct devlink_info_req *req,
 			struct netlink_ext_ack *extack);
+	int (*device_info_get)(struct devlink_linecard_device *device,
+			       void *priv, struct devlink_info_req *req,
+			       struct netlink_ext_ack *extack);
 };
 
 struct devlink_sb_pool_info {
@@ -1583,10 +1588,9 @@ struct devlink_linecard *
 devlink_linecard_create(struct devlink *devlink, unsigned int linecard_index,
 			const struct devlink_linecard_ops *ops, void *priv);
 void devlink_linecard_destroy(struct devlink_linecard *linecard);
-struct devlink_linecard_device;
 struct devlink_linecard_device *
 devlink_linecard_device_create(struct devlink_linecard *linecard,
-			       unsigned int device_index);
+			       unsigned int device_index, void *priv);
 void
 devlink_linecard_device_destroy(struct devlink_linecard *linecard,
 				struct devlink_linecard_device *linecard_device);
-- 
cgit v1.2.3


From 31c417c948d7f6909cb63f0ac3298f3c38f8ce20 Mon Sep 17 00:00:00 2001
From: Peilin Ye <peilin.ye@bytedance.com>
Date: Thu, 21 Apr 2022 15:09:02 -0700
Subject: ip_gre, ip6_gre: Fix race condition on o_seqno in collect_md mode

As pointed out by Jakub Kicinski, currently using TUNNEL_SEQ in
collect_md mode is racy for [IP6]GRE[TAP] devices.  Consider the
following sequence of events:

1. An [IP6]GRE[TAP] device is created in collect_md mode using "ip link
   add ... external".  "ip" ignores "[o]seq" if "external" is specified,
   so TUNNEL_SEQ is off, and the device is marked as NETIF_F_LLTX (i.e.
   it uses lockless TX);
2. Someone sets TUNNEL_SEQ on outgoing skb's, using e.g.
   bpf_skb_set_tunnel_key() in an eBPF program attached to this device;
3. gre_fb_xmit() or __gre6_xmit() processes these skb's:

	gre_build_header(skb, tun_hlen,
			 flags, protocol,
			 tunnel_id_to_key32(tun_info->key.tun_id),
			 (flags & TUNNEL_SEQ) ? htonl(tunnel->o_seqno++)
					      : 0);   ^^^^^^^^^^^^^^^^^

Since we are not using the TX lock (&txq->_xmit_lock), multiple CPUs may
try to do this tunnel->o_seqno++ in parallel, which is racy.  Fix it by
making o_seqno atomic_t.

As mentioned by Eric Dumazet in commit b790e01aee74 ("ip_gre: lockless
xmit"), making o_seqno atomic_t increases "chance for packets being out
of order at receiver" when NETIF_F_LLTX is on.

Maybe a better fix would be:

1. Do not ignore "oseq" in external mode.  Users MUST specify "oseq" if
   they want the kernel to allow sequencing of outgoing packets;
2. Reject all outgoing TUNNEL_SEQ packets if the device was not created
   with "oseq".

Unfortunately, that would break userspace.

We could now make [IP6]GRE[TAP] devices always NETIF_F_LLTX, but let us
do it in separate patches to keep this fix minimal.

Suggested-by: Jakub Kicinski <kuba@kernel.org>
Fixes: 77a5196a804e ("gre: add sequence number for collect md mode.")
Signed-off-by: Peilin Ye <peilin.ye@bytedance.com>
Acked-by: William Tu <u9012063@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_tunnel.h | 2 +-
 include/net/ip_tunnels.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/net')

diff --git a/include/net/ip6_tunnel.h b/include/net/ip6_tunnel.h
index a38c4f1e4e5c..74b369bddf49 100644
--- a/include/net/ip6_tunnel.h
+++ b/include/net/ip6_tunnel.h
@@ -58,7 +58,7 @@ struct ip6_tnl {
 
 	/* These fields used only by GRE */
 	__u32 i_seqno;	/* The last seen seqno	*/
-	__u32 o_seqno;	/* The last output seqno */
+	atomic_t o_seqno;	/* The last output seqno */
 	int hlen;       /* tun_hlen + encap_hlen */
 	int tun_hlen;	/* Precalculated header length */
 	int encap_hlen; /* Encap header length (FOU,GUE) */
diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
index 88dee57eac8a..c24fa934221d 100644
--- a/include/net/ip_tunnels.h
+++ b/include/net/ip_tunnels.h
@@ -116,7 +116,7 @@ struct ip_tunnel {
 
 	/* These four fields used only by GRE */
 	u32		i_seqno;	/* The last seen seqno	*/
-	u32		o_seqno;	/* The last output seqno */
+	atomic_t	o_seqno;	/* The last output seqno */
 	int		tun_hlen;	/* Precalculated header length */
 
 	/* These four fields used only by ERSPAN */
-- 
cgit v1.2.3


From 4bfe744ff1644fbc0a991a2677dc874475dd6776 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sun, 24 Apr 2022 17:34:07 -0700
Subject: tcp: fix potential xmit stalls caused by TCP_NOTSENT_LOWAT

I had this bug sitting for too long in my pile, it is time to fix it.

Thanks to Doug Porter for reminding me of it!

We had various attempts in the past, including commit
0cbe6a8f089e ("tcp: remove SOCK_QUEUE_SHRUNK"),
but the issue is that TCP stack currently only generates
EPOLLOUT from input path, when tp->snd_una has advanced
and skb(s) cleaned from rtx queue.

If a flow has a big RTT, and/or receives SACKs, it is possible
that the notsent part (tp->write_seq - tp->snd_nxt) reaches 0
and no more data can be sent until tp->snd_una finally advances.

What is needed is to also check if POLLOUT needs to be generated
whenever tp->snd_nxt is advanced, from output path.

This bug triggers more often after an idle period, as
we do not receive ACK for at least one RTT. tcp_notsent_lowat
could be a fraction of what CWND and pacing rate would allow to
send during this RTT.

In a followup patch, I will remove the bogus call
to tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED)
from tcp_check_space(). Fact that we have decided to generate
an EPOLLOUT does not mean the application has immediately
refilled the transmit queue. This optimistic call
might have been the reason the bug seemed not too serious.

Tested:

200 ms rtt, 1% packet loss, 32 MB tcp_rmem[2] and tcp_wmem[2]

$ echo 500000 >/proc/sys/net/ipv4/tcp_notsent_lowat
$ cat bench_rr.sh
SUM=0
for i in {1..10}
do
 V=`netperf -H remote_host -l30 -t TCP_RR -- -r 10000000,10000 -o LOCAL_BYTES_SENT | egrep -v "MIGRATED|Bytes"`
 echo $V
 SUM=$(($SUM + $V))
done
echo SUM=$SUM

Before patch:
$ bench_rr.sh
130000000
80000000
140000000
140000000
140000000
140000000
130000000
40000000
90000000
110000000
SUM=1140000000

After patch:
$ bench_rr.sh
430000000
590000000
530000000
450000000
450000000
350000000
450000000
490000000
480000000
460000000
SUM=4680000000  # This is 410 % of the value before patch.

Fixes: c9bee3b7fdec ("tcp: TCP_NOTSENT_LOWAT socket option")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Doug Porter <dsp@fb.com>
Cc: Soheil Hassas Yeganeh <soheil@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/net')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index be712fb9ddd7..b99d9d9cbd99 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -620,6 +620,7 @@ void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req);
 void tcp_reset(struct sock *sk, struct sk_buff *skb);
 void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb);
 void tcp_fin(struct sock *sk);
+void tcp_check_space(struct sock *sk);
 
 /* tcp_timer.c */
 void tcp_init_xmit_timers(struct sock *);
-- 
cgit v1.2.3


From ba5a4fdd63ae0c575707030db0b634b160baddd7 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sun, 24 Apr 2022 13:35:09 -0700
Subject: tcp: make sure treq->af_specific is initialized

syzbot complained about a recent change in TCP stack,
hitting a NULL pointer [1]

tcp request sockets have an af_specific pointer, which
was used before the blamed change only for SYNACK generation
in non SYNCOOKIE mode.

tcp requests sockets momentarily created when third packet
coming from client in SYNCOOKIE mode were not using
treq->af_specific.

Make sure this field is populated, in the same way normal
TCP requests sockets do in tcp_conn_request().

[1]
TCP: request_sock_TCPv6: Possible SYN flooding on port 20002. Sending cookies.  Check SNMP counters.
general protection fault, probably for non-canonical address 0xdffffc0000000001: 0000 [#1] PREEMPT SMP KASAN
KASAN: null-ptr-deref in range [0x0000000000000008-0x000000000000000f]
CPU: 1 PID: 3695 Comm: syz-executor864 Not tainted 5.18.0-rc3-syzkaller-00224-g5fd1fe4807f9 #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
RIP: 0010:tcp_create_openreq_child+0xe16/0x16b0 net/ipv4/tcp_minisocks.c:534
Code: 48 c1 ea 03 80 3c 02 00 0f 85 e5 07 00 00 4c 8b b3 28 01 00 00 48 b8 00 00 00 00 00 fc ff df 49 8d 7e 08 48 89 fa 48 c1 ea 03 <80> 3c 02 00 0f 85 c9 07 00 00 48 8b 3c 24 48 89 de 41 ff 56 08 48
RSP: 0018:ffffc90000de0588 EFLAGS: 00010202
RAX: dffffc0000000000 RBX: ffff888076490330 RCX: 0000000000000100
RDX: 0000000000000001 RSI: ffffffff87d67ff0 RDI: 0000000000000008
RBP: ffff88806ee1c7f8 R08: 0000000000000000 R09: 0000000000000000
R10: ffffffff87d67f00 R11: 0000000000000000 R12: ffff88806ee1bfc0
R13: ffff88801b0e0368 R14: 0000000000000000 R15: 0000000000000000
FS:  00007f517fe58700(0000) GS:ffff8880b9d00000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007ffcead76960 CR3: 000000006f97b000 CR4: 00000000003506e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
 <IRQ>
 tcp_v6_syn_recv_sock+0x199/0x23b0 net/ipv6/tcp_ipv6.c:1267
 tcp_get_cookie_sock+0xc9/0x850 net/ipv4/syncookies.c:207
 cookie_v6_check+0x15c3/0x2340 net/ipv6/syncookies.c:258
 tcp_v6_cookie_check net/ipv6/tcp_ipv6.c:1131 [inline]
 tcp_v6_do_rcv+0x1148/0x13b0 net/ipv6/tcp_ipv6.c:1486
 tcp_v6_rcv+0x3305/0x3840 net/ipv6/tcp_ipv6.c:1725
 ip6_protocol_deliver_rcu+0x2e9/0x1900 net/ipv6/ip6_input.c:422
 ip6_input_finish+0x14c/0x2c0 net/ipv6/ip6_input.c:464
 NF_HOOK include/linux/netfilter.h:307 [inline]
 NF_HOOK include/linux/netfilter.h:301 [inline]
 ip6_input+0x9c/0xd0 net/ipv6/ip6_input.c:473
 dst_input include/net/dst.h:461 [inline]
 ip6_rcv_finish net/ipv6/ip6_input.c:76 [inline]
 NF_HOOK include/linux/netfilter.h:307 [inline]
 NF_HOOK include/linux/netfilter.h:301 [inline]
 ipv6_rcv+0x27f/0x3b0 net/ipv6/ip6_input.c:297
 __netif_receive_skb_one_core+0x114/0x180 net/core/dev.c:5405
 __netif_receive_skb+0x24/0x1b0 net/core/dev.c:5519
 process_backlog+0x3a0/0x7c0 net/core/dev.c:5847
 __napi_poll+0xb3/0x6e0 net/core/dev.c:6413
 napi_poll net/core/dev.c:6480 [inline]
 net_rx_action+0x8ec/0xc60 net/core/dev.c:6567
 __do_softirq+0x29b/0x9c2 kernel/softirq.c:558
 invoke_softirq kernel/softirq.c:432 [inline]
 __irq_exit_rcu+0x123/0x180 kernel/softirq.c:637
 irq_exit_rcu+0x5/0x20 kernel/softirq.c:649
 sysvec_apic_timer_interrupt+0x93/0xc0 arch/x86/kernel/apic/apic.c:1097

Fixes: 5b0b9e4c2c89 ("tcp: md5: incorrect tcp_header_len for incoming connections")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Francesco Ruggeri <fruggeri@arista.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/net')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index b99d9d9cbd99..cc1295037533 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -480,6 +480,7 @@ int __cookie_v4_check(const struct iphdr *iph, const struct tcphdr *th,
 		      u32 cookie);
 struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb);
 struct request_sock *cookie_tcp_reqsk_alloc(const struct request_sock_ops *ops,
+					    const struct tcp_request_sock_ops *af_ops,
 					    struct sock *sk, struct sk_buff *skb);
 #ifdef CONFIG_SYN_COOKIES
 
-- 
cgit v1.2.3


From 30ca44eb2480ede838c7c8c131a7d25589a98f0a Mon Sep 17 00:00:00 2001
From: Miquel Raynal <miquel.raynal@bootlin.com>
Date: Thu, 7 Apr 2022 12:08:57 +0200
Subject: net: mac802154: Create an offloaded transmission error helper

So far there is only a helper for successful transmissions, which led
device drivers to implement their own handling in case of
error. Unfortunately, we really need all the drivers to give the hand
back to the core once they are done in order to be able to build a
proper synchronous API. So let's create a _xmit_error() helper and take
this opportunity to fill the new device-global field storing Tx
statuses.

Suggested-by: Alexander Aring <alex.aring@gmail.com>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Acked-by: Alexander Aring <aahringo@redhat.com>
Link: https://lore.kernel.org/r/20220407100903.1695973-5-miquel.raynal@bootlin.com
Signed-off-by: Stefan Schmidt <stefan@datenfreihafen.org>
---
 include/net/mac802154.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include/net')

diff --git a/include/net/mac802154.h b/include/net/mac802154.h
index 2c3bbc6645ba..29f289d58720 100644
--- a/include/net/mac802154.h
+++ b/include/net/mac802154.h
@@ -498,4 +498,14 @@ void ieee802154_stop_queue(struct ieee802154_hw *hw);
 void ieee802154_xmit_complete(struct ieee802154_hw *hw, struct sk_buff *skb,
 			      bool ifs_handling);
 
+/**
+ * ieee802154_xmit_error - offloaded frame transmission failed
+ *
+ * @hw: pointer as obtained from ieee802154_alloc_hw().
+ * @skb: buffer for transmission
+ * @reason: error code
+ */
+void ieee802154_xmit_error(struct ieee802154_hw *hw, struct sk_buff *skb,
+			   int reason);
+
 #endif /* NET_MAC802154_H */
-- 
cgit v1.2.3


From 5a1b57c0dde9b93ba8261fa1c81f9aae2bac284a Mon Sep 17 00:00:00 2001
From: Miquel Raynal <miquel.raynal@bootlin.com>
Date: Thu, 7 Apr 2022 12:08:58 +0200
Subject: net: mac802154: Create an error helper for asynchronous offloading
 errors

A few drivers do the full transmit operation asynchronously, which means
that a bus error that happens when forwarding the packet to the
transmitter or a timeout happening when offloading the request to the
transmitter will not be reported immediately.

The solution in this case is to call this new helper to free the
necessary resources, restart the queue and always return the same
generic TRAC error code: IEEE802154_SYSTEM_ERROR.

Suggested-by: Alexander Aring <alex.aring@gmail.com>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Acked-by: Alexander Aring <aahringo@redhat.com>
Link: https://lore.kernel.org/r/20220407100903.1695973-6-miquel.raynal@bootlin.com
Signed-off-by: Stefan Schmidt <stefan@datenfreihafen.org>
---
 include/net/mac802154.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include/net')

diff --git a/include/net/mac802154.h b/include/net/mac802154.h
index 29f289d58720..bdac0ddbdcdb 100644
--- a/include/net/mac802154.h
+++ b/include/net/mac802154.h
@@ -508,4 +508,13 @@ void ieee802154_xmit_complete(struct ieee802154_hw *hw, struct sk_buff *skb,
 void ieee802154_xmit_error(struct ieee802154_hw *hw, struct sk_buff *skb,
 			   int reason);
 
+/**
+ * ieee802154_xmit_hw_error - frame could not be offloaded to the transmitter
+ *                            because of a hardware error (bus error, timeout, etc)
+ *
+ * @hw: pointer as obtained from ieee802154_alloc_hw().
+ * @skb: buffer for transmission
+ */
+void ieee802154_xmit_hw_error(struct ieee802154_hw *hw, struct sk_buff *skb);
+
 #endif /* NET_MAC802154_H */
-- 
cgit v1.2.3


From ba3beec2ec1d3b4fd8672ca6e781dac4b3267f6e Mon Sep 17 00:00:00 2001
From: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Date: Mon, 25 Apr 2022 17:37:45 +0200
Subject: xsk: Fix possible crash when multiple sockets are created

Fix a crash that happens if an Rx only socket is created first, then a
second socket is created that is Tx only and bound to the same umem as
the first socket and also the same netdev and queue_id together with the
XDP_SHARED_UMEM flag. In this specific case, the tx_descs array page
pool was not created by the first socket as it was an Rx only socket.
When the second socket is bound it needs this tx_descs array of this
shared page pool as it has a Tx component, but unfortunately it was
never allocated, leading to a crash. Note that this array is only used
for zero-copy drivers using the batched Tx APIs, currently only ice and
i40e.

[ 5511.150360] BUG: kernel NULL pointer dereference, address: 0000000000000008
[ 5511.158419] #PF: supervisor write access in kernel mode
[ 5511.164472] #PF: error_code(0x0002) - not-present page
[ 5511.170416] PGD 0 P4D 0
[ 5511.173347] Oops: 0002 [#1] PREEMPT SMP PTI
[ 5511.178186] CPU: 0 PID: 0 Comm: swapper/0 Tainted: G            E     5.18.0-rc1+ #97
[ 5511.187245] Hardware name: Intel Corp. GRANTLEY/GRANTLEY, BIOS GRRFCRB1.86B.0276.D07.1605190235 05/19/2016
[ 5511.198418] RIP: 0010:xsk_tx_peek_release_desc_batch+0x198/0x310
[ 5511.205375] Code: c0 83 c6 01 84 c2 74 6d 8d 46 ff 23 07 44 89 e1 48 83 c0 14 48 c1 e1 04 48 c1 e0 04 48 03 47 10 4c 01 c1 48 8b 50 08 48 8b 00 <48> 89 51 08 48 89 01 41 80 bd d7 00 00 00 00 75 82 48 8b 19 49 8b
[ 5511.227091] RSP: 0018:ffffc90000003dd0 EFLAGS: 00010246
[ 5511.233135] RAX: 0000000000000000 RBX: ffff88810c8da600 RCX: 0000000000000000
[ 5511.241384] RDX: 000000000000003c RSI: 0000000000000001 RDI: ffff888115f555c0
[ 5511.249634] RBP: ffffc90000003e08 R08: 0000000000000000 R09: ffff889092296b48
[ 5511.257886] R10: 0000ffffffffffff R11: ffff889092296800 R12: 0000000000000000
[ 5511.266138] R13: ffff88810c8db500 R14: 0000000000000040 R15: 0000000000000100
[ 5511.274387] FS:  0000000000000000(0000) GS:ffff88903f800000(0000) knlGS:0000000000000000
[ 5511.283746] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 5511.290389] CR2: 0000000000000008 CR3: 00000001046e2001 CR4: 00000000003706f0
[ 5511.298640] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 5511.306892] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[ 5511.315142] Call Trace:
[ 5511.317972]  <IRQ>
[ 5511.320301]  ice_xmit_zc+0x68/0x2f0 [ice]
[ 5511.324977]  ? ktime_get+0x38/0xa0
[ 5511.328913]  ice_napi_poll+0x7a/0x6a0 [ice]
[ 5511.333784]  __napi_poll+0x2c/0x160
[ 5511.337821]  net_rx_action+0xdd/0x200
[ 5511.342058]  __do_softirq+0xe6/0x2dd
[ 5511.346198]  irq_exit_rcu+0xb5/0x100
[ 5511.350339]  common_interrupt+0xa4/0xc0
[ 5511.354777]  </IRQ>
[ 5511.357201]  <TASK>
[ 5511.359625]  asm_common_interrupt+0x1e/0x40
[ 5511.364466] RIP: 0010:cpuidle_enter_state+0xd2/0x360
[ 5511.370211] Code: 49 89 c5 0f 1f 44 00 00 31 ff e8 e9 00 7b ff 45 84 ff 74 12 9c 58 f6 c4 02 0f 85 72 02 00 00 31 ff e8 02 0c 80 ff fb 45 85 f6 <0f> 88 11 01 00 00 49 63 c6 4c 2b 2c 24 48 8d 14 40 48 8d 14 90 49
[ 5511.391921] RSP: 0018:ffffffff82a03e60 EFLAGS: 00000202
[ 5511.397962] RAX: ffff88903f800000 RBX: 0000000000000001 RCX: 000000000000001f
[ 5511.406214] RDX: 0000000000000000 RSI: ffffffff823400b9 RDI: ffffffff8234c046
[ 5511.424646] RBP: ffff88810a384800 R08: 000005032a28c046 R09: 0000000000000008
[ 5511.443233] R10: 000000000000000b R11: 0000000000000006 R12: ffffffff82bcf700
[ 5511.461922] R13: 000005032a28c046 R14: 0000000000000001 R15: 0000000000000000
[ 5511.480300]  cpuidle_enter+0x29/0x40
[ 5511.494329]  do_idle+0x1c7/0x250
[ 5511.507610]  cpu_startup_entry+0x19/0x20
[ 5511.521394]  start_kernel+0x649/0x66e
[ 5511.534626]  secondary_startup_64_no_verify+0xc3/0xcb
[ 5511.549230]  </TASK>

Detect such case during bind() and allocate this memory region via newly
introduced xp_alloc_tx_descs(). Also, use kvcalloc instead of kcalloc as
for other buffer pool allocations, so that it matches the kvfree() from
xp_destroy().

Fixes: d1bc532e99be ("i40e: xsk: Move tmp desc array from driver to pool")
Signed-off-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Magnus Karlsson <magnus.karlsson@intel.com>
Link: https://lore.kernel.org/bpf/20220425153745.481322-1-maciej.fijalkowski@intel.com
---
 include/net/xsk_buff_pool.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/net')

diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h
index 5554ee75e7da..647722e847b4 100644
--- a/include/net/xsk_buff_pool.h
+++ b/include/net/xsk_buff_pool.h
@@ -97,6 +97,7 @@ int xp_assign_dev(struct xsk_buff_pool *pool, struct net_device *dev,
 		  u16 queue_id, u16 flags);
 int xp_assign_dev_shared(struct xsk_buff_pool *pool, struct xdp_umem *umem,
 			 struct net_device *dev, u16 queue_id);
+int xp_alloc_tx_descs(struct xsk_buff_pool *pool, struct xdp_sock *xs);
 void xp_destroy(struct xsk_buff_pool *pool);
 void xp_get_pool(struct xsk_buff_pool *pool);
 bool xp_put_pool(struct xsk_buff_pool *pool);
-- 
cgit v1.2.3


From c86cc5a3ec70f5644f1fa21610b943d0441bc1f7 Mon Sep 17 00:00:00 2001
From: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
Date: Fri, 22 Apr 2022 12:58:16 -0700
Subject: Bluetooth: hci_event: Fix checking for invalid handle on error status

Commit d5ebaa7c5f6f6 introduces checks for handle range
(e.g HCI_CONN_HANDLE_MAX) but controllers like Intel AX200 don't seem
to respect the valid range int case of error status:

> HCI Event: Connect Complete (0x03) plen 11
        Status: Page Timeout (0x04)
        Handle: 65535
        Address: 94:DB:56:XX:XX:XX (Sony Home Entertainment&
	Sound Products Inc)
        Link type: ACL (0x01)
        Encryption: Disabled (0x00)
[1644965.827560] Bluetooth: hci0: Ignoring HCI_Connection_Complete for invalid handle

Because of it is impossible to cleanup the connections properly since
the stack would attempt to cancel the connection which is no longer in
progress causing the following trace:

< HCI Command: Create Connection Cancel (0x01|0x0008) plen 6
        Address: 94:DB:56:XX:XX:XX (Sony Home Entertainment&
	Sound Products Inc)
= bluetoothd: src/profile.c:record_cb() Unable to get Hands-Free Voice
	gateway SDP record: Connection timed out
> HCI Event: Command Complete (0x0e) plen 10
      Create Connection Cancel (0x01|0x0008) ncmd 1
        Status: Unknown Connection Identifier (0x02)
        Address: 94:DB:56:XX:XX:XX (Sony Home Entertainment&
	Sound Products Inc)
< HCI Command: Create Connection Cancel (0x01|0x0008) plen 6
        Address: 94:DB:56:XX:XX:XX (Sony Home Entertainment&
	Sound Products Inc)

Fixes: d5ebaa7c5f6f6 ("Bluetooth: hci_event: Ignore multiple conn complete events")
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 include/net/bluetooth/hci.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/net')

diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h
index 5cb095b09a94..69ef31cea582 100644
--- a/include/net/bluetooth/hci.h
+++ b/include/net/bluetooth/hci.h
@@ -578,6 +578,7 @@ enum {
 #define HCI_ERROR_CONNECTION_TIMEOUT	0x08
 #define HCI_ERROR_REJ_LIMITED_RESOURCES	0x0d
 #define HCI_ERROR_REJ_BAD_ADDR		0x0f
+#define HCI_ERROR_INVALID_PARAMETERS	0x12
 #define HCI_ERROR_REMOTE_USER_TERM	0x13
 #define HCI_ERROR_REMOTE_LOW_RESOURCES	0x14
 #define HCI_ERROR_REMOTE_POWER_OFF	0x15
-- 
cgit v1.2.3


From 9b3628d79b46f06157affc56fdb218fdd4988321 Mon Sep 17 00:00:00 2001
From: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
Date: Fri, 22 Apr 2022 12:58:18 -0700
Subject: Bluetooth: hci_sync: Cleanup hci_conn if it cannot be aborted

This attempts to cleanup the hci_conn if it cannot be aborted as
otherwise it would likely result in having the controller and host
stack out of sync with respect to connection handle.

Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 include/net/bluetooth/hci_core.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h
index d5377740e99c..8abd08245326 100644
--- a/include/net/bluetooth/hci_core.h
+++ b/include/net/bluetooth/hci_core.h
@@ -1156,7 +1156,7 @@ int hci_conn_switch_role(struct hci_conn *conn, __u8 role);
 
 void hci_conn_enter_active_mode(struct hci_conn *conn, __u8 force_active);
 
-void hci_le_conn_failed(struct hci_conn *conn, u8 status);
+void hci_conn_failed(struct hci_conn *conn, u8 status);
 
 /*
  * hci_conn_get() and hci_conn_put() are used to control the life-time of an
-- 
cgit v1.2.3


From 68822bdf76f10c3dc80609d4e2cdc1e847429086 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 22 Apr 2022 13:12:37 -0700
Subject: net: generalize skb freeing deferral to per-cpu lists

Logic added in commit f35f821935d8 ("tcp: defer skb freeing after socket
lock is released") helped bulk TCP flows to move the cost of skbs
frees outside of critical section where socket lock was held.

But for RPC traffic, or hosts with RFS enabled, the solution is far from
being ideal.

For RPC traffic, recvmsg() has to return to user space right after
skb payload has been consumed, meaning that BH handler has no chance
to pick the skb before recvmsg() thread. This issue is more visible
with BIG TCP, as more RPC fit one skb.

For RFS, even if BH handler picks the skbs, they are still picked
from the cpu on which user thread is running.

Ideally, it is better to free the skbs (and associated page frags)
on the cpu that originally allocated them.

This patch removes the per socket anchor (sk->defer_list) and
instead uses a per-cpu list, which will hold more skbs per round.

This new per-cpu list is drained at the end of net_action_rx(),
after incoming packets have been processed, to lower latencies.

In normal conditions, skbs are added to the per-cpu list with
no further action. In the (unlikely) cases where the cpu does not
run net_action_rx() handler fast enough, we use an IPI to raise
NET_RX_SOFTIRQ on the remote cpu.

Also, we do not bother draining the per-cpu list from dev_cpu_dead()
This is because skbs in this list have no requirement on how fast
they should be freed.

Note that we can add in the future a small per-cpu cache
if we see any contention on sd->defer_lock.

Tested on a pair of hosts with 100Gbit NIC, RFS enabled,
and /proc/sys/net/ipv4/tcp_rmem[2] tuned to 16MB to work around
page recycling strategy used by NIC driver (its page pool capacity
being too small compared to number of skbs/pages held in sockets
receive queues)

Note that this tuning was only done to demonstrate worse
conditions for skb freeing for this particular test.
These conditions can happen in more general production workload.

10 runs of one TCP_STREAM flow

Before:
Average throughput: 49685 Mbit.

Kernel profiles on cpu running user thread recvmsg() show high cost for
skb freeing related functions (*)

    57.81%  [kernel]       [k] copy_user_enhanced_fast_string
(*) 12.87%  [kernel]       [k] skb_release_data
(*)  4.25%  [kernel]       [k] __free_one_page
(*)  3.57%  [kernel]       [k] __list_del_entry_valid
     1.85%  [kernel]       [k] __netif_receive_skb_core
     1.60%  [kernel]       [k] __skb_datagram_iter
(*)  1.59%  [kernel]       [k] free_unref_page_commit
(*)  1.16%  [kernel]       [k] __slab_free
     1.16%  [kernel]       [k] _copy_to_iter
(*)  1.01%  [kernel]       [k] kfree
(*)  0.88%  [kernel]       [k] free_unref_page
     0.57%  [kernel]       [k] ip6_rcv_core
     0.55%  [kernel]       [k] ip6t_do_table
     0.54%  [kernel]       [k] flush_smp_call_function_queue
(*)  0.54%  [kernel]       [k] free_pcppages_bulk
     0.51%  [kernel]       [k] llist_reverse_order
     0.38%  [kernel]       [k] process_backlog
(*)  0.38%  [kernel]       [k] free_pcp_prepare
     0.37%  [kernel]       [k] tcp_recvmsg_locked
(*)  0.37%  [kernel]       [k] __list_add_valid
     0.34%  [kernel]       [k] sock_rfree
     0.34%  [kernel]       [k] _raw_spin_lock_irq
(*)  0.33%  [kernel]       [k] __page_cache_release
     0.33%  [kernel]       [k] tcp_v6_rcv
(*)  0.33%  [kernel]       [k] __put_page
(*)  0.29%  [kernel]       [k] __mod_zone_page_state
     0.27%  [kernel]       [k] _raw_spin_lock

After patch:
Average throughput: 73076 Mbit.

Kernel profiles on cpu running user thread recvmsg() looks better:

    81.35%  [kernel]       [k] copy_user_enhanced_fast_string
     1.95%  [kernel]       [k] _copy_to_iter
     1.95%  [kernel]       [k] __skb_datagram_iter
     1.27%  [kernel]       [k] __netif_receive_skb_core
     1.03%  [kernel]       [k] ip6t_do_table
     0.60%  [kernel]       [k] sock_rfree
     0.50%  [kernel]       [k] tcp_v6_rcv
     0.47%  [kernel]       [k] ip6_rcv_core
     0.45%  [kernel]       [k] read_tsc
     0.44%  [kernel]       [k] _raw_spin_lock_irqsave
     0.37%  [kernel]       [k] _raw_spin_lock
     0.37%  [kernel]       [k] native_irq_return_iret
     0.33%  [kernel]       [k] __inet6_lookup_established
     0.31%  [kernel]       [k] ip6_protocol_deliver_rcu
     0.29%  [kernel]       [k] tcp_rcv_established
     0.29%  [kernel]       [k] llist_reverse_order

v2: kdoc issue (kernel bots)
    do not defer if (alloc_cpu == smp_processor_id()) (Paolo)
    replace the sk_buff_head with a single-linked list (Jakub)
    add a READ_ONCE()/WRITE_ONCE() for the lockless read of sd->defer_list

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Paolo Abeni <pabeni@redhat.com>
Link: https://lore.kernel.org/r/20220422201237.416238-1-eric.dumazet@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/sock.h |  2 --
 include/net/tcp.h  | 12 ------------
 2 files changed, 14 deletions(-)

(limited to 'include/net')

diff --git a/include/net/sock.h b/include/net/sock.h
index a01d6c421aa2..f9f8ecae0f8d 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -292,7 +292,6 @@ struct sk_filter;
   *	@sk_pacing_shift: scaling factor for TCP Small Queues
   *	@sk_lingertime: %SO_LINGER l_linger setting
   *	@sk_backlog: always used with the per-socket spinlock held
-  *	@defer_list: head of llist storing skbs to be freed
   *	@sk_callback_lock: used with the callbacks in the end of this struct
   *	@sk_error_queue: rarely used
   *	@sk_prot_creator: sk_prot of original sock creator (see ipv6_setsockopt,
@@ -417,7 +416,6 @@ struct sock {
 		struct sk_buff	*head;
 		struct sk_buff	*tail;
 	} sk_backlog;
-	struct llist_head defer_list;
 
 #define sk_rmem_alloc sk_backlog.rmem_alloc
 
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 679b1964d494..94a52ad1101c 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1375,18 +1375,6 @@ static inline bool tcp_checksum_complete(struct sk_buff *skb)
 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
 		     enum skb_drop_reason *reason);
 
-#ifdef CONFIG_INET
-void __sk_defer_free_flush(struct sock *sk);
-
-static inline void sk_defer_free_flush(struct sock *sk)
-{
-	if (llist_empty(&sk->defer_list))
-		return;
-	__sk_defer_free_flush(sk);
-}
-#else
-static inline void sk_defer_free_flush(struct sock *sk) {}
-#endif
 
 int tcp_filter(struct sock *sk, struct sk_buff *skb);
 void tcp_set_state(struct sock *sk, int state);
-- 
cgit v1.2.3


From 6fd1d51cfa253b5ee7dae18d7cf1df830e9b6137 Mon Sep 17 00:00:00 2001
From: Erin MacNeil <lnx.erin@gmail.com>
Date: Wed, 27 Apr 2022 16:02:37 -0400
Subject: net: SO_RCVMARK socket option for SO_MARK with recvmsg()

Adding a new socket option, SO_RCVMARK, to indicate that SO_MARK
should be included in the ancillary data returned by recvmsg().

Renamed the sock_recv_ts_and_drops() function to sock_recv_cmsgs().

Signed-off-by: Erin MacNeil <lnx.erin@gmail.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Acked-by: Marc Kleine-Budde <mkl@pengutronix.de>
Link: https://lore.kernel.org/r/20220427200259.2564-1-lnx.erin@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/sock.h | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

(limited to 'include/net')

diff --git a/include/net/sock.h b/include/net/sock.h
index f9f8ecae0f8d..663041b92c21 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -893,6 +893,7 @@ enum sock_flags {
 	SOCK_TXTIME,
 	SOCK_XDP, /* XDP is attached */
 	SOCK_TSTAMP_NEW, /* Indicates 64 bit timestamps always */
+	SOCK_RCVMARK, /* Receive SO_MARK  ancillary data with packet */
 };
 
 #define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE))
@@ -2647,20 +2648,21 @@ sock_recv_timestamp(struct msghdr *msg, struct sock *sk, struct sk_buff *skb)
 		__sock_recv_wifi_status(msg, sk, skb);
 }
 
-void __sock_recv_ts_and_drops(struct msghdr *msg, struct sock *sk,
-			      struct sk_buff *skb);
+void __sock_recv_cmsgs(struct msghdr *msg, struct sock *sk,
+		       struct sk_buff *skb);
 
 #define SK_DEFAULT_STAMP (-1L * NSEC_PER_SEC)
-static inline void sock_recv_ts_and_drops(struct msghdr *msg, struct sock *sk,
-					  struct sk_buff *skb)
+static inline void sock_recv_cmsgs(struct msghdr *msg, struct sock *sk,
+				   struct sk_buff *skb)
 {
-#define FLAGS_TS_OR_DROPS ((1UL << SOCK_RXQ_OVFL)			| \
-			   (1UL << SOCK_RCVTSTAMP))
+#define FLAGS_RECV_CMSGS ((1UL << SOCK_RXQ_OVFL)			| \
+			   (1UL << SOCK_RCVTSTAMP)			| \
+			   (1UL << SOCK_RCVMARK))
 #define TSFLAGS_ANY	  (SOF_TIMESTAMPING_SOFTWARE			| \
 			   SOF_TIMESTAMPING_RAW_HARDWARE)
 
-	if (sk->sk_flags & FLAGS_TS_OR_DROPS || sk->sk_tsflags & TSFLAGS_ANY)
-		__sock_recv_ts_and_drops(msg, sk, skb);
+	if (sk->sk_flags & FLAGS_RECV_CMSGS || sk->sk_tsflags & TSFLAGS_ANY)
+		__sock_recv_cmsgs(msg, sk, skb);
 	else if (unlikely(sock_flag(sk, SOCK_TIMESTAMP)))
 		sock_write_timestamp(sk, skb->tstamp);
 	else if (unlikely(sk->sk_stamp == SK_DEFAULT_STAMP))
-- 
cgit v1.2.3


From de32bc6aad09131a30b4a9a738e2bf2ba5a9a5aa Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Thu, 28 Apr 2022 11:58:44 +0100
Subject: net: inline sock_alloc_send_skb

sock_alloc_send_skb() is simple and just proxying to another function,
so we can inline it and cut associated overhead.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

(limited to 'include/net')

diff --git a/include/net/sock.h b/include/net/sock.h
index 663041b92c21..73063c88a249 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1823,11 +1823,17 @@ int sock_getsockopt(struct socket *sock, int level, int op,
 		    char __user *optval, int __user *optlen);
 int sock_gettstamp(struct socket *sock, void __user *userstamp,
 		   bool timeval, bool time32);
-struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
-				    int noblock, int *errcode);
 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
 				     unsigned long data_len, int noblock,
 				     int *errcode, int max_page_order);
+
+static inline struct sk_buff *sock_alloc_send_skb(struct sock *sk,
+						  unsigned long size,
+						  int noblock, int *errcode)
+{
+	return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
+}
+
 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority);
 void sock_kfree_s(struct sock *sk, void *mem, int size);
 void sock_kzfree_s(struct sock *sk, void *mem, int size);
-- 
cgit v1.2.3


From 34c9a0e71cbb316f360919353273b185c2780cd7 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Tue, 12 Apr 2022 22:09:59 +0200
Subject: cfg80211: remove cfg80211_get_chan_state()

We haven't used this function for years, since commit c781944b71f8
("cfg80211: Remove unused cfg80211_can_use_iftype_chan()") which
itself removed a function unused since commit 97dc94f1d933
("cfg80211: remove channel_switch combination check"), almost eight
years ago.

Also remove the now unused enum cfg80211_chan_mode and some struct
members that were only used for this function.

Link: https://lore.kernel.org/r/20220412220958.1a191dca19d7.Ide4448f02d0e2f1ca2992971421ffc1933a5370a@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'include/net')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 68713388b617..cd1212113901 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -5549,8 +5549,6 @@ static inline void wiphy_unlock(struct wiphy *wiphy)
  * @conn_owner_nlportid: (private) connection owner socket port ID
  * @disconnect_wk: (private) auto-disconnect work
  * @disconnect_bssid: (private) the BSSID to use for auto-disconnect
- * @ibss_fixed: (private) IBSS is using fixed BSSID
- * @ibss_dfs_possible: (private) IBSS may change to a DFS channel
  * @event_list: (private) list for internal event processing
  * @event_lock: (private) lock for event list
  * @owner_nlportid: (private) owner socket port ID
@@ -5599,9 +5597,6 @@ struct wireless_dev {
 	struct cfg80211_chan_def preset_chandef;
 	struct cfg80211_chan_def chandef;
 
-	bool ibss_fixed;
-	bool ibss_dfs_possible;
-
 	bool ps;
 	int ps_timeout;
 
-- 
cgit v1.2.3


From 36f8423597000bd7d5e48b7b306e1d0958e72359 Mon Sep 17 00:00:00 2001
From: Muna Sinada <quic_msinada@quicinc.com>
Date: Wed, 23 Mar 2022 15:46:35 -0700
Subject: cfg80211: support disabling EHT mode

Allow userspace to disable EHT mode during association.

Signed-off-by: Muna Sinada <quic_msinada@quicinc.com>
Signed-off-by: Aloka Dixit <quic_alokad@quicinc.com>
Link: https://lore.kernel.org/r/20220323224636.20211-1-quic_alokad@quicinc.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/net')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index cd1212113901..6a3e3f0a8615 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -2735,6 +2735,7 @@ struct cfg80211_auth_request {
  *	userspace if this flag is set. Only applicable for cfg80211_connect()
  *	request (connect callback).
  * @ASSOC_REQ_DISABLE_HE:  Disable HE
+ * @ASSOC_REQ_DISABLE_EHT:  Disable EHT
  */
 enum cfg80211_assoc_req_flags {
 	ASSOC_REQ_DISABLE_HT			= BIT(0),
@@ -2742,6 +2743,7 @@ enum cfg80211_assoc_req_flags {
 	ASSOC_REQ_USE_RRM			= BIT(2),
 	CONNECT_REQ_EXTERNAL_AUTH_SUPPORT	= BIT(3),
 	ASSOC_REQ_DISABLE_HE			= BIT(4),
+	ASSOC_REQ_DISABLE_EHT			= BIT(5),
 };
 
 /**
-- 
cgit v1.2.3


From b2d057560b8107c633b39aabe517ff9d93f285e3 Mon Sep 17 00:00:00 2001
From: Willy Tarreau <w@1wt.eu>
Date: Mon, 2 May 2022 10:46:08 +0200
Subject: secure_seq: use the 64 bits of the siphash for port offset
 calculation

SipHash replaced MD5 in secure_ipv{4,6}_port_ephemeral() via commit
7cd23e5300c1 ("secure_seq: use SipHash in place of MD5"), but the output
remained truncated to 32-bit only. In order to exploit more bits from the
hash, let's make the functions return the full 64-bit of siphash_3u32().
We also make sure the port offset calculation in __inet_hash_connect()
remains done on 32-bit to avoid the need for div_u64_rem() and an extra
cost on 32-bit systems.

Cc: Jason A. Donenfeld <Jason@zx2c4.com>
Cc: Moshe Kol <moshe.kol@mail.huji.ac.il>
Cc: Yossi Gilad <yossi.gilad@mail.huji.ac.il>
Cc: Amit Klein <aksecurity@gmail.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Willy Tarreau <w@1wt.eu>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/inet_hashtables.h | 2 +-
 include/net/secure_seq.h      | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/net')

diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index f72ec113ae56..98e1ec1a14f0 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -425,7 +425,7 @@ static inline void sk_rcv_saddr_set(struct sock *sk, __be32 addr)
 }
 
 int __inet_hash_connect(struct inet_timewait_death_row *death_row,
-			struct sock *sk, u32 port_offset,
+			struct sock *sk, u64 port_offset,
 			int (*check_established)(struct inet_timewait_death_row *,
 						 struct sock *, __u16,
 						 struct inet_timewait_sock **));
diff --git a/include/net/secure_seq.h b/include/net/secure_seq.h
index d7d2495f83c2..dac91aa38c5a 100644
--- a/include/net/secure_seq.h
+++ b/include/net/secure_seq.h
@@ -4,8 +4,8 @@
 
 #include <linux/types.h>
 
-u32 secure_ipv4_port_ephemeral(__be32 saddr, __be32 daddr, __be16 dport);
-u32 secure_ipv6_port_ephemeral(const __be32 *saddr, const __be32 *daddr,
+u64 secure_ipv4_port_ephemeral(__be32 saddr, __be32 daddr, __be16 dport);
+u64 secure_ipv6_port_ephemeral(const __be32 *saddr, const __be32 *daddr,
 			       __be16 dport);
 u32 secure_tcp_seq(__be32 saddr, __be32 daddr,
 		   __be16 sport, __be16 dport);
-- 
cgit v1.2.3


From c4a67a21a6d255ddcbaa076c0412aad73c7e0c02 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Wed, 4 May 2022 08:40:37 -0700
Subject: Revert "Merge branch 'mlxsw-line-card-model'"

This reverts commit 5e927a9f4b9f29d78a7c7d66ea717bb5c8bbad8e, reversing
changes made to cfc1d91a7d78cf9de25b043d81efcc16966d55b3.

The discussion is still ongoing so let's remove the uAPI
until the discussion settles.

Link: https://lore.kernel.org/all/20220425090021.32e9a98f@kernel.org/
Reviewed-by: Ido Schimmel <idosch@nvidia.com>
Link: https://lore.kernel.org/r/20220504154037.539442-1-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/devlink.h | 18 +-----------------
 1 file changed, 1 insertion(+), 17 deletions(-)

(limited to 'include/net')

diff --git a/include/net/devlink.h b/include/net/devlink.h
index 062895973656..2a2a2a0c93f7 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -150,9 +150,6 @@ struct devlink_port_new_attrs {
 	   sfnum_valid:1;
 };
 
-struct devlink_info_req;
-struct devlink_linecard_device;
-
 /**
  * struct devlink_linecard_ops - Linecard operations
  * @provision: callback to provision the linecard slot with certain
@@ -171,8 +168,6 @@ struct devlink_linecard_device;
  *                  provisioned.
  * @types_count: callback to get number of supported types
  * @types_get: callback to get next type in list
- * @info_get: callback to get linecard info
- * @device_info_get: callback to get linecard device info
  */
 struct devlink_linecard_ops {
 	int (*provision)(struct devlink_linecard *linecard, void *priv,
@@ -187,12 +182,6 @@ struct devlink_linecard_ops {
 	void (*types_get)(struct devlink_linecard *linecard,
 			  void *priv, unsigned int index, const char **type,
 			  const void **type_priv);
-	int (*info_get)(struct devlink_linecard *linecard, void *priv,
-			struct devlink_info_req *req,
-			struct netlink_ext_ack *extack);
-	int (*device_info_get)(struct devlink_linecard_device *device,
-			       void *priv, struct devlink_info_req *req,
-			       struct netlink_ext_ack *extack);
 };
 
 struct devlink_sb_pool_info {
@@ -639,6 +628,7 @@ struct devlink_flash_update_params {
 #define DEVLINK_SUPPORT_FLASH_UPDATE_OVERWRITE_MASK	BIT(1)
 
 struct devlink_region;
+struct devlink_info_req;
 
 /**
  * struct devlink_region_ops - Region operations
@@ -1588,12 +1578,6 @@ struct devlink_linecard *
 devlink_linecard_create(struct devlink *devlink, unsigned int linecard_index,
 			const struct devlink_linecard_ops *ops, void *priv);
 void devlink_linecard_destroy(struct devlink_linecard *linecard);
-struct devlink_linecard_device *
-devlink_linecard_device_create(struct devlink_linecard *linecard,
-			       unsigned int device_index, void *priv);
-void
-devlink_linecard_device_destroy(struct devlink_linecard *linecard,
-				struct devlink_linecard_device *linecard_device);
 void devlink_linecard_provision_set(struct devlink_linecard *linecard,
 				    const char *type);
 void devlink_linecard_provision_clear(struct devlink_linecard *linecard);
-- 
cgit v1.2.3


From ea66758c1795cef81dc59cd5240e9d0f5c5207cf Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Wed, 4 May 2022 14:54:06 -0700
Subject: tcp: allow MPTCP to update the announced window

The MPTCP RFC requires that the MPTCP-level receive window's
right edge never moves backward. Currently the MPTCP code
enforces such constraint while tracking the right edge, but it
does not reflects it on the wire, as MPTCP lacks a suitable hook
to update accordingly the TCP header.

This change modifies the existing mptcp_write_options() hook,
providing the current packet's TCP header to the MPTCP protocol,
so that the next patch could implement the above mentioned
constraint.

No functional changes intended.

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Mat Martineau <mathew.j.martineau@linux.intel.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/mptcp.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/mptcp.h b/include/net/mptcp.h
index 8b1afd6f5cc4..d4ec894ce67b 100644
--- a/include/net/mptcp.h
+++ b/include/net/mptcp.h
@@ -125,7 +125,7 @@ bool mptcp_established_options(struct sock *sk, struct sk_buff *skb,
 			       struct mptcp_out_options *opts);
 bool mptcp_incoming_options(struct sock *sk, struct sk_buff *skb);
 
-void mptcp_write_options(__be32 *ptr, const struct tcp_sock *tp,
+void mptcp_write_options(struct tcphdr *th, __be32 *ptr, struct tcp_sock *tp,
 			 struct mptcp_out_options *opts);
 
 void mptcp_diag_fill_info(struct mptcp_sock *msk, struct mptcp_info *info);
-- 
cgit v1.2.3


From b01a277a0520edd5c815af346f05ef2c747919b6 Mon Sep 17 00:00:00 2001
From: Leon Romanovsky <leonro@nvidia.com>
Date: Thu, 5 May 2022 13:06:38 +0300
Subject: xfrm: free not used XFRM_ESP_NO_TRAILER flag

After removal of Innova IPsec support from mlx5 driver, the last user
of this XFRM_ESP_NO_TRAILER was gone too. This means that we can safely
remove it as no other hardware is capable (or need) to remove ESP trailer.

Reviewed-by: Raed Salem <raeds@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
Acked-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 include/net/xfrm.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 6fb899ff5afc..b41278abeeaa 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -1006,7 +1006,7 @@ struct xfrm_offload {
 #define	CRYPTO_FALLBACK		8
 #define	XFRM_GSO_SEGMENT	16
 #define	XFRM_GRO		32
-#define	XFRM_ESP_NO_TRAILER	64
+/* 64 is free */
 #define	XFRM_DEV_RESUME		128
 #define	XFRM_XMIT		256
 
-- 
cgit v1.2.3


From a36708e646586f74d073199828ed878b223e988d Mon Sep 17 00:00:00 2001
From: Leon Romanovsky <leonro@nvidia.com>
Date: Thu, 5 May 2022 13:06:39 +0300
Subject: xfrm: delete not used number of external headers

num_exthdrs is set but never used, so delete it.

Reviewed-by: Raed Salem <raeds@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
Acked-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 include/net/xfrm.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index b41278abeeaa..4e097423116c 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -131,7 +131,6 @@ struct xfrm_state_offload {
 	netdevice_tracker	dev_tracker;
 	struct net_device	*real_dev;
 	unsigned long		offload_handle;
-	unsigned int		num_exthdrs;
 	u8			flags;
 };
 
-- 
cgit v1.2.3


From 87e0a94e60ea2e29be9dec6bc146fbc9861a4055 Mon Sep 17 00:00:00 2001
From: Leon Romanovsky <leonro@nvidia.com>
Date: Thu, 5 May 2022 13:06:40 +0300
Subject: xfrm: rename xfrm_state_offload struct to allow reuse

The struct xfrm_state_offload has all fields needed to hold information
for offloaded policies too. In order to do not create new struct with
same fields, let's rename existing one and reuse it later.

Reviewed-by: Raed Salem <raeds@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
Acked-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 include/net/xfrm.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'include/net')

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 4e097423116c..bb20278d689c 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -126,7 +126,7 @@ struct xfrm_state_walk {
 	struct xfrm_address_filter *filter;
 };
 
-struct xfrm_state_offload {
+struct xfrm_dev_offload {
 	struct net_device	*dev;
 	netdevice_tracker	dev_tracker;
 	struct net_device	*real_dev;
@@ -246,7 +246,7 @@ struct xfrm_state {
 	struct xfrm_lifetime_cur curlft;
 	struct hrtimer		mtimer;
 
-	struct xfrm_state_offload xso;
+	struct xfrm_dev_offload xso;
 
 	/* used to fix curlft->add_time when changing date */
 	long		saved_tmo;
@@ -1865,7 +1865,7 @@ bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x);
 
 static inline void xfrm_dev_state_advance_esn(struct xfrm_state *x)
 {
-	struct xfrm_state_offload *xso = &x->xso;
+	struct xfrm_dev_offload *xso = &x->xso;
 
 	if (xso->dev && xso->dev->xfrmdev_ops->xdo_dev_state_advance_esn)
 		xso->dev->xfrmdev_ops->xdo_dev_state_advance_esn(x);
@@ -1891,7 +1891,7 @@ static inline bool xfrm_dst_offload_ok(struct dst_entry *dst)
 
 static inline void xfrm_dev_state_delete(struct xfrm_state *x)
 {
-	struct xfrm_state_offload *xso = &x->xso;
+	struct xfrm_dev_offload *xso = &x->xso;
 
 	if (xso->dev)
 		xso->dev->xfrmdev_ops->xdo_dev_state_delete(x);
@@ -1899,7 +1899,7 @@ static inline void xfrm_dev_state_delete(struct xfrm_state *x)
 
 static inline void xfrm_dev_state_free(struct xfrm_state *x)
 {
-	struct xfrm_state_offload *xso = &x->xso;
+	struct xfrm_dev_offload *xso = &x->xso;
 	struct net_device *dev = xso->dev;
 
 	if (dev && dev->xfrmdev_ops) {
-- 
cgit v1.2.3


From 482db2f1dd211f73ad9d71e33ae15c1df6379982 Mon Sep 17 00:00:00 2001
From: Leon Romanovsky <leonro@nvidia.com>
Date: Thu, 5 May 2022 13:06:41 +0300
Subject: xfrm: store and rely on direction to construct offload flags

XFRM state doesn't need anything from flags except to understand
direction, so store it separately. For future patches, such change
will allow us to reuse xfrm_dev_offload for policy offload too, which
has three possible directions instead of two.

Reviewed-by: Raed Salem <raeds@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 include/net/xfrm.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/net')

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index bb20278d689c..45422f7be0c5 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -126,12 +126,18 @@ struct xfrm_state_walk {
 	struct xfrm_address_filter *filter;
 };
 
+enum {
+	XFRM_DEV_OFFLOAD_IN = 1,
+	XFRM_DEV_OFFLOAD_OUT,
+};
+
 struct xfrm_dev_offload {
 	struct net_device	*dev;
 	netdevice_tracker	dev_tracker;
 	struct net_device	*real_dev;
 	unsigned long		offload_handle;
 	u8			flags;
+	u8			dir : 2;
 };
 
 struct xfrm_mode {
-- 
cgit v1.2.3


From 254c4a824c7c6a53360bc4974710e4213b8b7f5d Mon Sep 17 00:00:00 2001
From: Leon Romanovsky <leonro@nvidia.com>
Date: Thu, 5 May 2022 13:06:45 +0300
Subject: xfrm: drop not needed flags variable in XFRM offload struct

After drivers were converted to rely on direction, the flags is not
used anymore and can be removed.

Reviewed-by: Raed Salem <raeds@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 include/net/xfrm.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 45422f7be0c5..736c349de8bf 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -136,7 +136,6 @@ struct xfrm_dev_offload {
 	netdevice_tracker	dev_tracker;
 	struct net_device	*real_dev;
 	unsigned long		offload_handle;
-	u8			flags;
 	u8			dir : 2;
 };
 
-- 
cgit v1.2.3


From fe5233b0ba0d2216d549f93e4540542b99b97642 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Thu, 5 May 2022 19:22:13 +0300
Subject: net: dsa: delete dsa_port_walk_{fdbs,mdbs}

All the users of these functions are gone, delete them before they gain
new ones.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/dsa.h | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'include/net')

diff --git a/include/net/dsa.h b/include/net/dsa.h
index 934958fda962..efd33956df37 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -1239,12 +1239,6 @@ struct dsa_switch_driver {
 
 struct net_device *dsa_dev_to_net_device(struct device *dev);
 
-typedef int dsa_fdb_walk_cb_t(struct dsa_switch *ds, int port,
-			      const unsigned char *addr, u16 vid,
-			      struct dsa_db db);
-
-int dsa_port_walk_fdbs(struct dsa_switch *ds, int port, dsa_fdb_walk_cb_t cb);
-int dsa_port_walk_mdbs(struct dsa_switch *ds, int port, dsa_fdb_walk_cb_t cb);
 bool dsa_fdb_present_in_other_db(struct dsa_switch *ds, int port,
 				 const unsigned char *addr, u16 vid,
 				 struct dsa_db db);
-- 
cgit v1.2.3


From 5b87be9e4978fe507c7e14c0bdff147d10d39aec Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 9 May 2022 20:57:37 -0700
Subject: net: add include/net/net_debug.h

Remove from include/linux/netdevice.h helpers
that send debug/info/warnings to syslog.

We plan adding more helpers in following patches.

v2: added two includes, and 'struct net_device' forward declaration
    to avoid compile errors (kernel bots)

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/net_debug.h | 151 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 151 insertions(+)
 create mode 100644 include/net/net_debug.h

(limited to 'include/net')

diff --git a/include/net/net_debug.h b/include/net/net_debug.h
new file mode 100644
index 000000000000..19cd2700c20e
--- /dev/null
+++ b/include/net/net_debug.h
@@ -0,0 +1,151 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_NET_DEBUG_H
+#define _LINUX_NET_DEBUG_H
+
+#include <linux/bug.h>
+#include <linux/kern_levels.h>
+
+struct net_device;
+
+__printf(3, 4) __cold
+void netdev_printk(const char *level, const struct net_device *dev,
+		   const char *format, ...);
+__printf(2, 3) __cold
+void netdev_emerg(const struct net_device *dev, const char *format, ...);
+__printf(2, 3) __cold
+void netdev_alert(const struct net_device *dev, const char *format, ...);
+__printf(2, 3) __cold
+void netdev_crit(const struct net_device *dev, const char *format, ...);
+__printf(2, 3) __cold
+void netdev_err(const struct net_device *dev, const char *format, ...);
+__printf(2, 3) __cold
+void netdev_warn(const struct net_device *dev, const char *format, ...);
+__printf(2, 3) __cold
+void netdev_notice(const struct net_device *dev, const char *format, ...);
+__printf(2, 3) __cold
+void netdev_info(const struct net_device *dev, const char *format, ...);
+
+#define netdev_level_once(level, dev, fmt, ...)			\
+do {								\
+	static bool __section(".data.once") __print_once;	\
+								\
+	if (!__print_once) {					\
+		__print_once = true;				\
+		netdev_printk(level, dev, fmt, ##__VA_ARGS__);	\
+	}							\
+} while (0)
+
+#define netdev_emerg_once(dev, fmt, ...) \
+	netdev_level_once(KERN_EMERG, dev, fmt, ##__VA_ARGS__)
+#define netdev_alert_once(dev, fmt, ...) \
+	netdev_level_once(KERN_ALERT, dev, fmt, ##__VA_ARGS__)
+#define netdev_crit_once(dev, fmt, ...) \
+	netdev_level_once(KERN_CRIT, dev, fmt, ##__VA_ARGS__)
+#define netdev_err_once(dev, fmt, ...) \
+	netdev_level_once(KERN_ERR, dev, fmt, ##__VA_ARGS__)
+#define netdev_warn_once(dev, fmt, ...) \
+	netdev_level_once(KERN_WARNING, dev, fmt, ##__VA_ARGS__)
+#define netdev_notice_once(dev, fmt, ...) \
+	netdev_level_once(KERN_NOTICE, dev, fmt, ##__VA_ARGS__)
+#define netdev_info_once(dev, fmt, ...) \
+	netdev_level_once(KERN_INFO, dev, fmt, ##__VA_ARGS__)
+
+#if defined(CONFIG_DYNAMIC_DEBUG) || \
+	(defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE))
+#define netdev_dbg(__dev, format, args...)			\
+do {								\
+	dynamic_netdev_dbg(__dev, format, ##args);		\
+} while (0)
+#elif defined(DEBUG)
+#define netdev_dbg(__dev, format, args...)			\
+	netdev_printk(KERN_DEBUG, __dev, format, ##args)
+#else
+#define netdev_dbg(__dev, format, args...)			\
+({								\
+	if (0)							\
+		netdev_printk(KERN_DEBUG, __dev, format, ##args); \
+})
+#endif
+
+#if defined(VERBOSE_DEBUG)
+#define netdev_vdbg	netdev_dbg
+#else
+
+#define netdev_vdbg(dev, format, args...)			\
+({								\
+	if (0)							\
+		netdev_printk(KERN_DEBUG, dev, format, ##args);	\
+	0;							\
+})
+#endif
+
+/* netif printk helpers, similar to netdev_printk */
+
+#define netif_printk(priv, type, level, dev, fmt, args...)	\
+do {					  			\
+	if (netif_msg_##type(priv))				\
+		netdev_printk(level, (dev), fmt, ##args);	\
+} while (0)
+
+#define netif_level(level, priv, type, dev, fmt, args...)	\
+do {								\
+	if (netif_msg_##type(priv))				\
+		netdev_##level(dev, fmt, ##args);		\
+} while (0)
+
+#define netif_emerg(priv, type, dev, fmt, args...)		\
+	netif_level(emerg, priv, type, dev, fmt, ##args)
+#define netif_alert(priv, type, dev, fmt, args...)		\
+	netif_level(alert, priv, type, dev, fmt, ##args)
+#define netif_crit(priv, type, dev, fmt, args...)		\
+	netif_level(crit, priv, type, dev, fmt, ##args)
+#define netif_err(priv, type, dev, fmt, args...)		\
+	netif_level(err, priv, type, dev, fmt, ##args)
+#define netif_warn(priv, type, dev, fmt, args...)		\
+	netif_level(warn, priv, type, dev, fmt, ##args)
+#define netif_notice(priv, type, dev, fmt, args...)		\
+	netif_level(notice, priv, type, dev, fmt, ##args)
+#define netif_info(priv, type, dev, fmt, args...)		\
+	netif_level(info, priv, type, dev, fmt, ##args)
+
+#if defined(CONFIG_DYNAMIC_DEBUG) || \
+	(defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE))
+#define netif_dbg(priv, type, netdev, format, args...)		\
+do {								\
+	if (netif_msg_##type(priv))				\
+		dynamic_netdev_dbg(netdev, format, ##args);	\
+} while (0)
+#elif defined(DEBUG)
+#define netif_dbg(priv, type, dev, format, args...)		\
+	netif_printk(priv, type, KERN_DEBUG, dev, format, ##args)
+#else
+#define netif_dbg(priv, type, dev, format, args...)			\
+({									\
+	if (0)								\
+		netif_printk(priv, type, KERN_DEBUG, dev, format, ##args); \
+	0;								\
+})
+#endif
+
+/* if @cond then downgrade to debug, else print at @level */
+#define netif_cond_dbg(priv, type, netdev, cond, level, fmt, args...)     \
+	do {                                                              \
+		if (cond)                                                 \
+			netif_dbg(priv, type, netdev, fmt, ##args);       \
+		else                                                      \
+			netif_ ## level(priv, type, netdev, fmt, ##args); \
+	} while (0)
+
+#if defined(VERBOSE_DEBUG)
+#define netif_vdbg	netif_dbg
+#else
+#define netif_vdbg(priv, type, dev, format, args...)		\
+({								\
+	if (0)							\
+		netif_printk(priv, type, KERN_DEBUG, dev, format, ##args); \
+	0;							\
+})
+#endif
+
+
+#endif	/* _LINUX_NET_DEBUG_H */
-- 
cgit v1.2.3


From d268c1f5cfc92eb5bb605f7365769aacd93be234 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 9 May 2022 20:57:38 -0700
Subject: net: add CONFIG_DEBUG_NET

This config option enables network debugging checks.

This patch adds DEBUG_NET_WARN_ON_ONCE(cond)
Note that this is not a replacement for WARN_ON_ONCE(cond)
as (cond) is not evaluated if CONFIG_DEBUG_NET is not set.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/net_debug.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/net')

diff --git a/include/net/net_debug.h b/include/net/net_debug.h
index 19cd2700c20e..1e74684cbbdb 100644
--- a/include/net/net_debug.h
+++ b/include/net/net_debug.h
@@ -148,4 +148,10 @@ do {								\
 #endif
 
 
+#if defined(CONFIG_DEBUG_NET)
+#define DEBUG_NET_WARN_ON_ONCE(cond) (void)WARN_ON_ONCE(cond)
+#else
+#define DEBUG_NET_WARN_ON_ONCE(cond) BUILD_BUG_ON_INVALID(cond)
+#endif
+
 #endif	/* _LINUX_NET_DEBUG_H */
-- 
cgit v1.2.3


From 8b796475fd7882663a870456466a4fb315cc1bd6 Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Tue, 10 May 2022 16:57:34 +0200
Subject: net/sched: act_pedit: really ensure the skb is writable

Currently pedit tries to ensure that the accessed skb offset
is writable via skb_unclone(). The action potentially allows
touching any skb bytes, so it may end-up modifying shared data.

The above causes some sporadic MPTCP self-test failures, due to
this code:

	tc -n $ns2 filter add dev ns2eth$i egress \
		protocol ip prio 1000 \
		handle 42 fw \
		action pedit munge offset 148 u8 invert \
		pipe csum tcp \
		index 100

The above modifies a data byte outside the skb head and the skb is
a cloned one, carrying a TCP output packet.

This change addresses the issue by keeping track of a rough
over-estimate highest skb offset accessed by the action and ensuring
such offset is really writable.

Note that this may cause performance regressions in some scenarios,
but hopefully pedit is not in the critical path.

Fixes: db2c24175d14 ("act_pedit: access skb->data safely")
Acked-by: Mat Martineau <mathew.j.martineau@linux.intel.com>
Tested-by: Geliang Tang <geliang.tang@suse.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Link: https://lore.kernel.org/r/1fcf78e6679d0a287dd61bb0f04730ce33b3255d.1652194627.git.pabeni@redhat.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/tc_act/tc_pedit.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/net')

diff --git a/include/net/tc_act/tc_pedit.h b/include/net/tc_act/tc_pedit.h
index 748cf87a4d7e..3e02709a1df6 100644
--- a/include/net/tc_act/tc_pedit.h
+++ b/include/net/tc_act/tc_pedit.h
@@ -14,6 +14,7 @@ struct tcf_pedit {
 	struct tc_action	common;
 	unsigned char		tcfp_nkeys;
 	unsigned char		tcfp_flags;
+	u32			tcfp_off_max_hint;
 	struct tc_pedit_key	*tcfp_keys;
 	struct tcf_pedit_key_ex	*tcfp_keys_ex;
 };
-- 
cgit v1.2.3


From 103a2f3255a95991252f8f13375c3a96a75011cd Mon Sep 17 00:00:00 2001
From: Itay Iellin <ieitayie@gmail.com>
Date: Sat, 7 May 2022 08:32:48 -0400
Subject: Bluetooth: Fix the creation of hdev->name

Set a size limit of 8 bytes of the written buffer to "hdev->name"
including the terminating null byte, as the size of "hdev->name" is 8
bytes. If an id value which is greater than 9999 is allocated,
then the "snprintf(hdev->name, sizeof(hdev->name), "hci%d", id)"
function call would lead to a truncation of the id value in decimal
notation.

Set an explicit maximum id parameter in the id allocation function call.
The id allocation function defines the maximum allocated id value as the
maximum id parameter value minus one. Therefore, HCI_MAX_ID is defined
as 10000.

Signed-off-by: Itay Iellin <ieitayie@gmail.com>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 include/net/bluetooth/hci_core.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/net')

diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h
index 8abd08245326..62d7b81b1cb7 100644
--- a/include/net/bluetooth/hci_core.h
+++ b/include/net/bluetooth/hci_core.h
@@ -36,6 +36,9 @@
 /* HCI priority */
 #define HCI_PRIO_MAX	7
 
+/* HCI maximum id value */
+#define HCI_MAX_ID 10000
+
 /* HCI Core structures */
 struct inquiry_data {
 	bdaddr_t	bdaddr;
-- 
cgit v1.2.3


From 465c3de42b5dcdf0fa8cf996a81de6ba0e8553a3 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Wed, 11 May 2022 12:50:16 +0300
Subject: net: dsa: introduce the dsa_cpu_ports() helper

Similar to dsa_user_ports() which retrieves a port mask of all user
ports, introduce dsa_cpu_ports() which retrieves the mask of all CPU
ports of a switch.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/dsa.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'include/net')

diff --git a/include/net/dsa.h b/include/net/dsa.h
index efd33956df37..76257a9f0e1b 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -590,6 +590,17 @@ static inline u32 dsa_user_ports(struct dsa_switch *ds)
 	return mask;
 }
 
+static inline u32 dsa_cpu_ports(struct dsa_switch *ds)
+{
+	struct dsa_port *cpu_dp;
+	u32 mask = 0;
+
+	dsa_switch_for_each_cpu_port(cpu_dp, ds)
+		mask |= BIT(cpu_dp->index);
+
+	return mask;
+}
+
 /* Return the local port used to reach an arbitrary switch device */
 static inline unsigned int dsa_routing_port(struct dsa_switch *ds, int device)
 {
-- 
cgit v1.2.3


From 72c3b0c7359a6f91dd03e08b839adccd9d4268a8 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Wed, 11 May 2022 12:50:17 +0300
Subject: net: dsa: felix: manage host flooding using a specific driver
 callback

At the time - commit 7569459a52c9 ("net: dsa: manage flooding on the CPU
ports") - not introducing a dedicated switch callback for host flooding
made sense, because for the only user, the felix driver, there was
nothing different to do for the CPU port than set the flood flags on the
CPU port just like on any other bridge port.

There are 2 reasons why this approach is not good enough, however.

(1) Other drivers, like sja1105, support configuring flooding as a
    function of {ingress port, egress port}, whereas the DSA
    ->port_bridge_flags() function only operates on an egress port.
    So with that driver we'd have useless host flooding from user ports
    which don't need it.

(2) Even with the felix driver, support for multiple CPU ports makes it
    difficult to piggyback on ->port_bridge_flags(). The way in which
    the felix driver is going to support host-filtered addresses with
    multiple CPU ports is that it will direct these addresses towards
    both CPU ports (in a sort of multicast fashion), then restrict the
    forwarding to only one of the two using the forwarding masks.
    Consequently, flooding will also be enabled towards both CPU ports.
    However, ->port_bridge_flags() gets passed the index of a single CPU
    port, and that leaves the flood settings out of sync between the 2
    CPU ports.

This is to say, it's better to have a specific driver method for host
flooding, which takes the user port as argument. This solves problem (1)
by allowing the driver to do different things for different user ports,
and problem (2) by abstracting the operation and letting the driver do
whatever, rather than explicitly making the DSA core point to the CPU
port it thinks needs to be touched.

This new method also creates a problem, which is that cross-chip setups
are not handled. However I don't have hardware right now where I can
test what is the proper thing to do, and there isn't hardware compatible
with multi-switch trees that supports host flooding. So it remains a
problem to be tackled in the future.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/dsa.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/net')

diff --git a/include/net/dsa.h b/include/net/dsa.h
index 76257a9f0e1b..cfb287b0d311 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -978,6 +978,8 @@ struct dsa_switch_ops {
 	int	(*port_bridge_flags)(struct dsa_switch *ds, int port,
 				     struct switchdev_brport_flags flags,
 				     struct netlink_ext_ack *extack);
+	void	(*port_set_host_flood)(struct dsa_switch *ds, int port,
+				       bool uc, bool mc);
 
 	/*
 	 * VLAN support
-- 
cgit v1.2.3


From bacf93b0561937695e9c6c1dc1d8ed10ca80eb81 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Wed, 11 May 2022 12:50:18 +0300
Subject: net: dsa: remove port argument from ->change_tag_protocol()

DSA has not supported (and probably will not support in the future
either) independent tagging protocols per CPU port.

Different switch drivers have different requirements, some may need to
replicate some settings for each CPU port, some may need to apply some
settings on a single CPU port, while some may have to configure some
global settings and then some per-CPU-port settings.

In any case, the current model where DSA calls ->change_tag_protocol for
each CPU port turns out to be impractical for drivers where there are
global things to be done. For example, felix calls dsa_tag_8021q_register(),
which makes no sense per CPU port, so it suppresses the second call.

Let drivers deal with replication towards all CPU ports, and remove the
CPU port argument from the function prototype.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Acked-by: Luiz Angelo Daros de Luca <luizluca@gmail.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/dsa.h | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/dsa.h b/include/net/dsa.h
index cfb287b0d311..14f07275852b 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -579,6 +579,10 @@ static inline bool dsa_is_user_port(struct dsa_switch *ds, int p)
 	dsa_switch_for_each_port((_dp), (_ds)) \
 		if (dsa_port_is_cpu((_dp)))
 
+#define dsa_switch_for_each_cpu_port_continue_reverse(_dp, _ds) \
+	dsa_switch_for_each_port_continue_reverse((_dp), (_ds)) \
+		if (dsa_port_is_cpu((_dp)))
+
 static inline u32 dsa_user_ports(struct dsa_switch *ds)
 {
 	struct dsa_port *dp;
@@ -803,7 +807,7 @@ struct dsa_switch_ops {
 	enum dsa_tag_protocol (*get_tag_protocol)(struct dsa_switch *ds,
 						  int port,
 						  enum dsa_tag_protocol mprot);
-	int	(*change_tag_protocol)(struct dsa_switch *ds, int port,
+	int	(*change_tag_protocol)(struct dsa_switch *ds,
 				       enum dsa_tag_protocol proto);
 	/*
 	 * Method for switch drivers to connect to the tagging protocol driver
-- 
cgit v1.2.3


From 8ea1eebb49a2dfee1dce621a638cc1626e542392 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Wed, 11 May 2022 17:05:52 -0700
Subject: net: inet: Remove count from inet_listen_hashbucket

After commit 0ee58dad5b06 ("net: tcp6: prefer listeners bound to an address")
and commit d9fbc7f6431f ("net: tcp: prefer listeners bound to an address"),
the count is no longer used.  This patch removes it.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/inet_hashtables.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index 98e1ec1a14f0..103fc7a30e60 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -111,7 +111,6 @@ struct inet_bind_hashbucket {
 #define LISTENING_NULLS_BASE (1U << 29)
 struct inet_listen_hashbucket {
 	spinlock_t		lock;
-	unsigned int		count;
 	union {
 		struct hlist_head	head;
 		struct hlist_nulls_head	nulls_head;
-- 
cgit v1.2.3


From cae3873c5b3a4fcd9706fb461ff4e91bdf1f0120 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Wed, 11 May 2022 17:06:05 -0700
Subject: net: inet: Retire port only listening_hash

The listen sk is currently stored in two hash tables,
listening_hash (hashed by port) and lhash2 (hashed by port and address).

After commit 0ee58dad5b06 ("net: tcp6: prefer listeners bound to an address")
and commit d9fbc7f6431f ("net: tcp: prefer listeners bound to an address"),
the TCP-SYN lookup fast path does not use listening_hash.

The commit 05c0b35709c5 ("tcp: seq_file: Replace listening_hash with lhash2")
also moved the seq_file (/proc/net/tcp) iteration usage from
listening_hash to lhash2.

There are still a few listening_hash usages left.
One of them is inet_reuseport_add_sock() which uses the listening_hash
to search a listen sk during the listen() system call.  This turns
out to be very slow on use cases that listen on many different
VIPs at a popular port (e.g. 443).  [ On top of the slowness in
adding to the tail in the IPv6 case ].  The latter patch has a
selftest to demonstrate this case.

This patch takes this chance to move all remaining listening_hash
usages to lhash2 and then retire listening_hash.

Since most changes need to be done together, it is hard to cut
the listening_hash to lhash2 switch into small patches.  The
changes in this patch is highlighted here for the review
purpose.

1. Because of the listening_hash removal, lhash2 can use the
   sk->sk_nulls_node instead of the icsk->icsk_listen_portaddr_node.
   This will also keep the sk_unhashed() check to work as is
   after stop adding sk to listening_hash.

   The union is removed from inet_listen_hashbucket because
   only nulls_head is needed.

2. icsk->icsk_listen_portaddr_node and its helpers are removed.

3. The current lhash2 users needs to iterate with sk_nulls_node
   instead of icsk_listen_portaddr_node.

   One case is in the inet[6]_lhash2_lookup().

   Another case is the seq_file iterator in tcp_ipv4.c.
   One thing to note is sk_nulls_next() is needed
   because the old inet_lhash2_for_each_icsk_continue()
   does a "next" first before iterating.

4. Move the remaining listening_hash usage to lhash2

   inet_reuseport_add_sock() which this series is
   trying to improve.

   inet_diag.c and mptcp_diag.c are the final two
   remaining use cases and is moved to lhash2 now also.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/inet_connection_sock.h |  2 --
 include/net/inet_hashtables.h      | 41 +-------------------------------------
 2 files changed, 1 insertion(+), 42 deletions(-)

(limited to 'include/net')

diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index 3908296d103f..85cd695e7fd1 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -66,7 +66,6 @@ struct inet_connection_sock_af_ops {
  * @icsk_ulp_ops	   Pluggable ULP control hook
  * @icsk_ulp_data	   ULP private data
  * @icsk_clean_acked	   Clean acked data hook
- * @icsk_listen_portaddr_node	hash to the portaddr listener hashtable
  * @icsk_ca_state:	   Congestion control state
  * @icsk_retransmits:	   Number of unrecovered [RTO] timeouts
  * @icsk_pending:	   Scheduled timer event
@@ -96,7 +95,6 @@ struct inet_connection_sock {
 	const struct tcp_ulp_ops  *icsk_ulp_ops;
 	void __rcu		  *icsk_ulp_data;
 	void (*icsk_clean_acked)(struct sock *sk, u32 acked_seq);
-	struct hlist_node         icsk_listen_portaddr_node;
 	unsigned int		  (*icsk_sync_mss)(struct sock *sk, u32 pmtu);
 	__u8			  icsk_ca_state:5,
 				  icsk_ca_initialized:1,
diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index 103fc7a30e60..1b8706719d4f 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -111,10 +111,7 @@ struct inet_bind_hashbucket {
 #define LISTENING_NULLS_BASE (1U << 29)
 struct inet_listen_hashbucket {
 	spinlock_t		lock;
-	union {
-		struct hlist_head	head;
-		struct hlist_nulls_head	nulls_head;
-	};
+	struct hlist_nulls_head	nulls_head;
 };
 
 /* This is for listening sockets, thus all sockets which possess wildcards. */
@@ -142,32 +139,8 @@ struct inet_hashinfo {
 	/* The 2nd listener table hashed by local port and address */
 	unsigned int			lhash2_mask;
 	struct inet_listen_hashbucket	*lhash2;
-
-	/* All the above members are written once at bootup and
-	 * never written again _or_ are predominantly read-access.
-	 *
-	 * Now align to a new cache line as all the following members
-	 * might be often dirty.
-	 */
-	/* All sockets in TCP_LISTEN state will be in listening_hash.
-	 * This is the only table where wildcard'd TCP sockets can
-	 * exist.  listening_hash is only hashed by local port number.
-	 * If lhash2 is initialized, the same socket will also be hashed
-	 * to lhash2 by port and address.
-	 */
-	struct inet_listen_hashbucket	listening_hash[INET_LHTABLE_SIZE]
-					____cacheline_aligned_in_smp;
 };
 
-#define inet_lhash2_for_each_icsk_continue(__icsk) \
-	hlist_for_each_entry_continue(__icsk, icsk_listen_portaddr_node)
-
-#define inet_lhash2_for_each_icsk(__icsk, list) \
-	hlist_for_each_entry(__icsk, list, icsk_listen_portaddr_node)
-
-#define inet_lhash2_for_each_icsk_rcu(__icsk, list) \
-	hlist_for_each_entry_rcu(__icsk, list, icsk_listen_portaddr_node)
-
 static inline struct inet_listen_hashbucket *
 inet_lhash2_bucket(struct inet_hashinfo *h, u32 hash)
 {
@@ -229,23 +202,11 @@ static inline u32 inet_bhashfn(const struct net *net, const __u16 lport,
 void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
 		    const unsigned short snum);
 
-/* These can have wildcards, don't try too hard. */
-static inline u32 inet_lhashfn(const struct net *net, const unsigned short num)
-{
-	return (num + net_hash_mix(net)) & (INET_LHTABLE_SIZE - 1);
-}
-
-static inline int inet_sk_listen_hashfn(const struct sock *sk)
-{
-	return inet_lhashfn(sock_net(sk), inet_sk(sk)->inet_num);
-}
-
 /* Caller must disable local BH processing. */
 int __inet_inherit_port(const struct sock *sk, struct sock *child);
 
 void inet_put_port(struct sock *sk);
 
-void inet_hashinfo_init(struct inet_hashinfo *h);
 void inet_hashinfo2_init(struct inet_hashinfo *h, const char *name,
 			 unsigned long numentries, int scale,
 			 unsigned long low_limit,
-- 
cgit v1.2.3


From 05abad857277dda198063017b00ba5b9fed2c0cb Mon Sep 17 00:00:00 2001
From: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
Date: Fri, 1 Apr 2022 16:38:23 -0700
Subject: Bluetooth: HCI: Add HCI_QUIRK_BROKEN_ENHANCED_SETUP_SYNC_CONN quirk

This adds HCI_QUIRK_BROKEN_ENHANCED_SETUP_SYNC_CONN quirk which can be
used to mark HCI_Enhanced_Setup_Synchronous_Connection as broken even
if its support command bit are set since some controller report it as
supported but the command don't work properly with some configurations
(e.g. BT_VOICE_TRANSPARENT/mSBC).

Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 include/net/bluetooth/hci.h      | 9 +++++++++
 include/net/bluetooth/hci_core.h | 8 ++++++--
 2 files changed, 15 insertions(+), 2 deletions(-)

(limited to 'include/net')

diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h
index 69ef31cea582..62a9bb022aed 100644
--- a/include/net/bluetooth/hci.h
+++ b/include/net/bluetooth/hci.h
@@ -265,6 +265,15 @@ enum {
 	 * runtime suspend, because event filtering takes place there.
 	 */
 	HCI_QUIRK_BROKEN_FILTER_CLEAR_ALL,
+
+	/*
+	 * When this quirk is set, disables the use of
+	 * HCI_OP_ENHANCED_SETUP_SYNC_CONN command to setup SCO connections.
+	 *
+	 * This quirk can be set before hci_register_dev is called or
+	 * during the hdev->setup vendor callback.
+	 */
+	HCI_QUIRK_BROKEN_ENHANCED_SETUP_SYNC_CONN,
 };
 
 /* HCI device flags */
diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h
index 62d7b81b1cb7..5a52a2018b56 100644
--- a/include/net/bluetooth/hci_core.h
+++ b/include/net/bluetooth/hci_core.h
@@ -1495,8 +1495,12 @@ void hci_conn_del_sysfs(struct hci_conn *conn);
 #define privacy_mode_capable(dev) (use_ll_privacy(dev) && \
 				   (hdev->commands[39] & 0x04))
 
-/* Use enhanced synchronous connection if command is supported */
-#define enhanced_sco_capable(dev) ((dev)->commands[29] & 0x08)
+/* Use enhanced synchronous connection if command is supported and its quirk
+ * has not been set.
+ */
+#define enhanced_sync_conn_capable(dev) \
+	(((dev)->commands[29] & 0x08) && \
+	 !test_bit(HCI_QUIRK_BROKEN_ENHANCED_SETUP_SYNC_CONN, &(dev)->quirks))
 
 /* Use ext scanning if set ext scan param and ext scan enable is supported */
 #define use_ext_scan(dev) (((dev)->commands[37] & 0x20) && \
-- 
cgit v1.2.3


From 4915d50e300e96929d2462041d6f6c6f061167fd Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 12 May 2022 09:56:01 -0700
Subject: inet: add READ_ONCE(sk->sk_bound_dev_if) in INET_MATCH()

INET_MATCH() runs without holding a lock on the socket.

We probably need to annotate most reads.

This patch makes INET_MATCH() an inline function
to ease our changes.

v2:

We remove the 32bit version of it, as modern compilers
should generate the same code really, no need to
try to be smarter.

Also make 'struct net *net' the first argument.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inet_hashtables.h | 35 ++++++++++++++++-------------------
 include/net/sock.h            |  3 ---
 2 files changed, 16 insertions(+), 22 deletions(-)

(limited to 'include/net')

diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index 1b8706719d4f..59d72024ad1d 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -255,7 +255,6 @@ static inline struct sock *inet_lookup_listener(struct net *net,
 	((__force __portpair)(((__u32)(__dport) << 16) | (__force __u32)(__be16)(__sport)))
 #endif
 
-#if (BITS_PER_LONG == 64)
 #ifdef __BIG_ENDIAN
 #define INET_ADDR_COOKIE(__name, __saddr, __daddr) \
 	const __addrpair __name = (__force __addrpair) ( \
@@ -267,24 +266,22 @@ static inline struct sock *inet_lookup_listener(struct net *net,
 				   (((__force __u64)(__be32)(__daddr)) << 32) | \
 				   ((__force __u64)(__be32)(__saddr)))
 #endif /* __BIG_ENDIAN */
-#define INET_MATCH(__sk, __net, __cookie, __saddr, __daddr, __ports, __dif, __sdif) \
-	(((__sk)->sk_portpair == (__ports))			&&	\
-	 ((__sk)->sk_addrpair == (__cookie))			&&	\
-	 (((__sk)->sk_bound_dev_if == (__dif))			||	\
-	  ((__sk)->sk_bound_dev_if == (__sdif)))		&&	\
-	 net_eq(sock_net(__sk), (__net)))
-#else /* 32-bit arch */
-#define INET_ADDR_COOKIE(__name, __saddr, __daddr) \
-	const int __name __deprecated __attribute__((unused))
-
-#define INET_MATCH(__sk, __net, __cookie, __saddr, __daddr, __ports, __dif, __sdif) \
-	(((__sk)->sk_portpair == (__ports))		&&		\
-	 ((__sk)->sk_daddr	== (__saddr))		&&		\
-	 ((__sk)->sk_rcv_saddr	== (__daddr))		&&		\
-	 (((__sk)->sk_bound_dev_if == (__dif))		||		\
-	  ((__sk)->sk_bound_dev_if == (__sdif)))	&&		\
-	 net_eq(sock_net(__sk), (__net)))
-#endif /* 64-bit arch */
+
+static inline bool INET_MATCH(struct net *net, const struct sock *sk,
+			      const __addrpair cookie, const __portpair ports,
+			      int dif, int sdif)
+{
+	int bound_dev_if;
+
+	if (!net_eq(sock_net(sk), net) ||
+	    sk->sk_portpair != ports ||
+	    sk->sk_addrpair != cookie)
+	        return false;
+
+	/* Paired with WRITE_ONCE() from sock_bindtoindex_locked() */
+	bound_dev_if = READ_ONCE(sk->sk_bound_dev_if);
+	return bound_dev_if == dif || bound_dev_if == sdif;
+}
 
 /* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so we need
  * not check it for lookups anymore, thanks Alexey. -DaveM
diff --git a/include/net/sock.h b/include/net/sock.h
index 73063c88a249..01edfde4257d 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -161,9 +161,6 @@ typedef __u64 __bitwise __addrpair;
  *	for struct sock and struct inet_timewait_sock.
  */
 struct sock_common {
-	/* skc_daddr and skc_rcv_saddr must be grouped on a 8 bytes aligned
-	 * address on 64bit arches : cf INET_MATCH()
-	 */
 	union {
 		__addrpair	skc_addrpair;
 		struct {
-- 
cgit v1.2.3


From 04c494e68a1340cb5c70d4704ac32d863dc64293 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 12 May 2022 14:14:56 -0700
Subject: Revert "tcp/dccp: get rid of inet_twsk_purge()"

This reverts commits:

0dad4087a86a2cbe177404dc73f18ada26a2c390 ("tcp/dccp: get rid of inet_twsk_purge()")
d507204d3c5cc57d9a8bdf0a477615bb59ea1611 ("tcp/dccp: add tw->tw_bslot")

As Leonard pointed out, a newly allocated netns can happen
to reuse a freed 'struct net'.

While TCP TW timers were covered by my patches, other things were not:

1) Lookups in rx path (INET_MATCH() and INET6_MATCH()), as they look
  at 4-tuple plus the 'struct net' pointer.

2) /proc/net/tcp[6] and inet_diag, same reason.

3) hashinfo->bhash[], same reason.

Fixing all this seems risky, lets instead revert.

In the future, we might have a per netns tcp hash table, or
a per netns list of timewait sockets...

Fixes: 0dad4087a86a ("tcp/dccp: get rid of inet_twsk_purge()")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Leonard Crestez <cdleonard@gmail.com>
Tested-by: Leonard Crestez <cdleonard@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inet_timewait_sock.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h
index 463ae5d33eb0..5b47545f22d3 100644
--- a/include/net/inet_timewait_sock.h
+++ b/include/net/inet_timewait_sock.h
@@ -71,7 +71,6 @@ struct inet_timewait_sock {
 				tw_tos		: 8;
 	u32			tw_txhash;
 	u32			tw_priority;
-	u32			tw_bslot; /* bind bucket slot */
 	struct timer_list	tw_timer;
 	struct inet_bind_bucket	*tw_tb;
 };
@@ -110,6 +109,8 @@ static inline void inet_twsk_reschedule(struct inet_timewait_sock *tw, int timeo
 
 void inet_twsk_deschedule_put(struct inet_timewait_sock *tw);
 
+void inet_twsk_purge(struct inet_hashinfo *hashinfo, int family);
+
 static inline
 struct net *twsk_net(const struct inet_timewait_sock *twsk)
 {
-- 
cgit v1.2.3


From 2ed3bf188b33630cf9d93b996ebf001847a00b5a Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Mon, 11 Apr 2022 13:01:16 +0200
Subject: netfilter: ecache: use dedicated list for event redelivery

This disentangles event redelivery and the percpu dying list.

Because entries are now stored on a dedicated list, all
entries are in NFCT_ECACHE_DESTROY_FAIL state and all entries
still have confirmed bit set -- the reference count is at least 1.

The 'struct net' back-pointer can be removed as well.

The pcpu dying list will be removed eventually, it has no functionality.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_conntrack.h        | 3 ++-
 include/net/netfilter/nf_conntrack_ecache.h | 2 --
 2 files changed, 2 insertions(+), 3 deletions(-)

(limited to 'include/net')

diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
index 69e6c6a218be..28672a944499 100644
--- a/include/net/netfilter/nf_conntrack.h
+++ b/include/net/netfilter/nf_conntrack.h
@@ -45,7 +45,8 @@ union nf_conntrack_expect_proto {
 
 struct nf_conntrack_net_ecache {
 	struct delayed_work dwork;
-	struct netns_ct *ct_net;
+	spinlock_t dying_lock;
+	struct hlist_nulls_head dying_list;
 };
 
 struct nf_conntrack_net {
diff --git a/include/net/netfilter/nf_conntrack_ecache.h b/include/net/netfilter/nf_conntrack_ecache.h
index 6c4c490a3e34..a6135b5030dd 100644
--- a/include/net/netfilter/nf_conntrack_ecache.h
+++ b/include/net/netfilter/nf_conntrack_ecache.h
@@ -14,7 +14,6 @@
 #include <net/netfilter/nf_conntrack_extend.h>
 
 enum nf_ct_ecache_state {
-	NFCT_ECACHE_UNKNOWN,		/* destroy event not sent */
 	NFCT_ECACHE_DESTROY_FAIL,	/* tried but failed to send destroy event */
 	NFCT_ECACHE_DESTROY_SENT,	/* sent destroy event after failure */
 };
@@ -23,7 +22,6 @@ struct nf_conntrack_ecache {
 	unsigned long cache;		/* bitops want long */
 	u16 ctmask;			/* bitmask of ct events to be delivered */
 	u16 expmask;			/* bitmask of expect events to be delivered */
-	enum nf_ct_ecache_state state:8;/* ecache state */
 	u32 missed;			/* missed events */
 	u32 portid;			/* netlink portid of destroyer */
 };
-- 
cgit v1.2.3


From 0d3cc504ba9cdcff76346306c37eb1ea01e60a86 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Mon, 11 Apr 2022 13:01:17 +0200
Subject: netfilter: conntrack: include ecache dying list in dumps

The new pernet dying list includes conntrack entries that await
delivery of the 'destroy' event via ctnetlink.

The old percpu dying list will be removed soon.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_conntrack_ecache.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/net')

diff --git a/include/net/netfilter/nf_conntrack_ecache.h b/include/net/netfilter/nf_conntrack_ecache.h
index a6135b5030dd..b57d73785e4d 100644
--- a/include/net/netfilter/nf_conntrack_ecache.h
+++ b/include/net/netfilter/nf_conntrack_ecache.h
@@ -164,6 +164,8 @@ void nf_conntrack_ecache_work(struct net *net, enum nf_ct_ecache_state state);
 void nf_conntrack_ecache_pernet_init(struct net *net);
 void nf_conntrack_ecache_pernet_fini(struct net *net);
 
+struct nf_conntrack_net_ecache *nf_conn_pernet_ecache(const struct net *net);
+
 static inline bool nf_conntrack_ecache_dwork_pending(const struct net *net)
 {
 	return net->ct.ecache_dwork_pending;
-- 
cgit v1.2.3


From 1397af5bfd7d32b0cf2adb70a78c9a9e8f11d912 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Mon, 11 Apr 2022 13:01:18 +0200
Subject: netfilter: conntrack: remove the percpu dying list

Its no longer needed. Entries that need event redelivery are placed
on the new pernet dying list.

The advantage is that there is no need to take additional spinlock on
conntrack removal unless event redelivery failed or the conntrack entry
was never added to the table in the first place (confirmed bit not set).

The IPS_CONFIRMED bit now needs to be set as soon as the entry has been
unlinked from the unconfirmed list, else the destroy function may
attempt to unlink it a second time.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netns/conntrack.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h
index 0294f3d473af..e985a3010b89 100644
--- a/include/net/netns/conntrack.h
+++ b/include/net/netns/conntrack.h
@@ -96,7 +96,6 @@ struct nf_ip_net {
 struct ct_pcpu {
 	spinlock_t		lock;
 	struct hlist_nulls_head unconfirmed;
-	struct hlist_nulls_head dying;
 };
 
 struct netns_ct {
-- 
cgit v1.2.3


From 78222bacfca97cb18505df1ba5f3591864498a7e Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Mon, 11 Apr 2022 13:01:19 +0200
Subject: netfilter: cttimeout: decouple unlink and free on netns destruction

Make it so netns pre_exit unlinks the objects from the pernet list, so
they cannot be found anymore.

netns core issues a synchronize_rcu() before calling the exit hooks so
any the time the exit hooks run unconfirmed nf_conn entries have been
free'd or they have been committed to the hashtable.

The exit hook still tags unconfirmed entries as dying, this can
now be removed in a followup change.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_conntrack_timeout.h | 8 --------
 1 file changed, 8 deletions(-)

(limited to 'include/net')

diff --git a/include/net/netfilter/nf_conntrack_timeout.h b/include/net/netfilter/nf_conntrack_timeout.h
index 3ea94f6f3844..fea258983d23 100644
--- a/include/net/netfilter/nf_conntrack_timeout.h
+++ b/include/net/netfilter/nf_conntrack_timeout.h
@@ -17,14 +17,6 @@ struct nf_ct_timeout {
 	char			data[];
 };
 
-struct ctnl_timeout {
-	struct list_head	head;
-	struct rcu_head		rcu_head;
-	refcount_t		refcnt;
-	char			name[CTNL_TIMEOUT_NAME_MAX];
-	struct nf_ct_timeout	timeout;
-};
-
 struct nf_conn_timeout {
 	struct nf_ct_timeout __rcu *timeout;
 };
-- 
cgit v1.2.3


From 17438b42ce14cb60ceda9ae62ad5dd022d55a216 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Mon, 11 Apr 2022 13:01:20 +0200
Subject: netfilter: remove nf_ct_unconfirmed_destroy helper

This helper tags connections not yet in the conntrack table as
dying.  These nf_conn entries will be dropped instead when the
core attempts to insert them from the input or postrouting
'confirm' hook.

After the previous change, the entries get unlinked from the
list earlier, so that by the time the actual exit hook runs,
new connections no longer have a timeout policy assigned.

Its enough to walk the hashtable instead.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_conntrack.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'include/net')

diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
index 28672a944499..f60212244b13 100644
--- a/include/net/netfilter/nf_conntrack.h
+++ b/include/net/netfilter/nf_conntrack.h
@@ -237,9 +237,6 @@ static inline bool nf_ct_kill(struct nf_conn *ct)
 	return nf_ct_delete(ct, 0, 0);
 }
 
-/* Set all unconfirmed conntrack as dying */
-void nf_ct_unconfirmed_destroy(struct net *);
-
 /* Iterate over all conntracks: if iter returns true, it's deleted. */
 void nf_ct_iterate_cleanup_net(struct net *net,
 			       int (*iter)(struct nf_conn *i, void *data),
-- 
cgit v1.2.3


From c56716c69ce1ac320432fb1ea5654196ba24d2f8 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Mon, 11 Apr 2022 13:01:21 +0200
Subject: netfilter: extensions: introduce extension genid count

Multiple netfilter extensions store pointers to external data
in their extension area struct.

Examples:
1. Timeout policies
2. Connection tracking helpers.

No references are taken for these.

When a helper or timeout policy is removed, the conntrack table gets
traversed and affected extensions are cleared.

Conntrack entries not yet in the hashtable are referenced via a special
list, the unconfirmed list.

On removal of a policy or connection tracking helper, the unconfirmed
list gets traversed an all entries are marked as dying, this prevents
them from getting committed to the table at insertion time: core checks
for dying bit, if set, the conntrack entry gets destroyed at confirm
time.

The disadvantage is that each new conntrack has to be added to the percpu
unconfirmed list, and each insertion needs to remove it from this list.
The list is only ever needed when a policy or helper is removed -- a rare
occurrence.

Add a generation ID count: Instead of adding to the list and then
traversing that list on policy/helper removal, increment a counter
that is stored in the extension area.

For unconfirmed conntracks, the extension has the genid valid at ct
allocation time.

Removal of a helper/policy etc. increments the counter.
At confirmation time, validate that ext->genid == global_id.

If the stored number is not the same, do not allow the conntrack
insertion, just like as if a confirmed-list traversal would have flagged
the entry as dying.

After insertion, the genid is no longer relevant (conntrack entries
are now reachable via the conntrack table iterators and is set to 0.

This allows removal of the percpu unconfirmed list.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_conntrack_extend.h | 31 +++++++++++++++--------------
 include/net/netfilter/nf_conntrack_labels.h | 10 +++++++++-
 2 files changed, 25 insertions(+), 16 deletions(-)

(limited to 'include/net')

diff --git a/include/net/netfilter/nf_conntrack_extend.h b/include/net/netfilter/nf_conntrack_extend.h
index 96635ad2acc7..0b247248b032 100644
--- a/include/net/netfilter/nf_conntrack_extend.h
+++ b/include/net/netfilter/nf_conntrack_extend.h
@@ -34,21 +34,11 @@ enum nf_ct_ext_id {
 	NF_CT_EXT_NUM,
 };
 
-#define NF_CT_EXT_HELPER_TYPE struct nf_conn_help
-#define NF_CT_EXT_NAT_TYPE struct nf_conn_nat
-#define NF_CT_EXT_SEQADJ_TYPE struct nf_conn_seqadj
-#define NF_CT_EXT_ACCT_TYPE struct nf_conn_acct
-#define NF_CT_EXT_ECACHE_TYPE struct nf_conntrack_ecache
-#define NF_CT_EXT_TSTAMP_TYPE struct nf_conn_tstamp
-#define NF_CT_EXT_TIMEOUT_TYPE struct nf_conn_timeout
-#define NF_CT_EXT_LABELS_TYPE struct nf_conn_labels
-#define NF_CT_EXT_SYNPROXY_TYPE struct nf_conn_synproxy
-#define NF_CT_EXT_ACT_CT_TYPE struct nf_conn_act_ct_ext
-
 /* Extensions: optional stuff which isn't permanently in struct. */
 struct nf_ct_ext {
 	u8 offset[NF_CT_EXT_NUM];
 	u8 len;
+	unsigned int gen_id;
 	char data[] __aligned(8);
 };
 
@@ -62,17 +52,28 @@ static inline bool nf_ct_ext_exist(const struct nf_conn *ct, u8 id)
 	return (ct->ext && __nf_ct_ext_exist(ct->ext, id));
 }
 
-static inline void *__nf_ct_ext_find(const struct nf_conn *ct, u8 id)
+void *__nf_ct_ext_find(const struct nf_ct_ext *ext, u8 id);
+
+static inline void *nf_ct_ext_find(const struct nf_conn *ct, u8 id)
 {
-	if (!nf_ct_ext_exist(ct, id))
+	struct nf_ct_ext *ext = ct->ext;
+
+	if (!ext || !__nf_ct_ext_exist(ext, id))
 		return NULL;
 
+	if (unlikely(ext->gen_id))
+		return __nf_ct_ext_find(ext, id);
+
 	return (void *)ct->ext + ct->ext->offset[id];
 }
-#define nf_ct_ext_find(ext, id)	\
-	((id##_TYPE *)__nf_ct_ext_find((ext), (id)))
 
 /* Add this type, returns pointer to data or NULL. */
 void *nf_ct_ext_add(struct nf_conn *ct, enum nf_ct_ext_id id, gfp_t gfp);
 
+/* ext genid.  if ext->id != ext_genid, extensions cannot be used
+ * anymore unless conntrack has CONFIRMED bit set.
+ */
+extern atomic_t nf_conntrack_ext_genid;
+void nf_ct_ext_bump_genid(void);
+
 #endif /* _NF_CONNTRACK_EXTEND_H */
diff --git a/include/net/netfilter/nf_conntrack_labels.h b/include/net/netfilter/nf_conntrack_labels.h
index 3c23298e68ca..66bab6c60d12 100644
--- a/include/net/netfilter/nf_conntrack_labels.h
+++ b/include/net/netfilter/nf_conntrack_labels.h
@@ -17,10 +17,18 @@ struct nf_conn_labels {
 	unsigned long bits[NF_CT_LABELS_MAX_SIZE / sizeof(long)];
 };
 
+/* Can't use nf_ct_ext_find(), flow dissector cannot use symbols
+ * exported by nf_conntrack module.
+ */
 static inline struct nf_conn_labels *nf_ct_labels_find(const struct nf_conn *ct)
 {
 #ifdef CONFIG_NF_CONNTRACK_LABELS
-	return nf_ct_ext_find(ct, NF_CT_EXT_LABELS);
+	struct nf_ct_ext *ext = ct->ext;
+
+	if (!ext || !__nf_ct_ext_exist(ext, NF_CT_EXT_LABELS))
+		return NULL;
+
+	return (void *)ct->ext + ct->ext->offset[NF_CT_EXT_LABELS];
 #else
 	return NULL;
 #endif
-- 
cgit v1.2.3


From 8a75a2c1741073f0c2d7bee146648e717527e048 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Mon, 11 Apr 2022 13:01:24 +0200
Subject: netfilter: conntrack: remove unconfirmed list

It has no function anymore and can be removed.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_conntrack.h | 1 -
 include/net/netns/conntrack.h        | 6 ------
 2 files changed, 7 deletions(-)

(limited to 'include/net')

diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
index f60212244b13..3ce9a5b42fe5 100644
--- a/include/net/netfilter/nf_conntrack.h
+++ b/include/net/netfilter/nf_conntrack.h
@@ -101,7 +101,6 @@ struct nf_conn {
 	/* Have we seen traffic both ways yet? (bitset) */
 	unsigned long status;
 
-	u16		cpu;
 	possible_net_t ct_net;
 
 #if IS_ENABLED(CONFIG_NF_NAT)
diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h
index e985a3010b89..a71cfd4e2f21 100644
--- a/include/net/netns/conntrack.h
+++ b/include/net/netns/conntrack.h
@@ -93,11 +93,6 @@ struct nf_ip_net {
 #endif
 };
 
-struct ct_pcpu {
-	spinlock_t		lock;
-	struct hlist_nulls_head unconfirmed;
-};
-
 struct netns_ct {
 #ifdef CONFIG_NF_CONNTRACK_EVENTS
 	bool ecache_dwork_pending;
@@ -109,7 +104,6 @@ struct netns_ct {
 	u8			sysctl_tstamp;
 	u8			sysctl_checksum;
 
-	struct ct_pcpu __percpu *pcpu_lists;
 	struct ip_conntrack_stat __percpu *stat;
 	struct nf_ct_event_notifier __rcu *nf_conntrack_event_cb;
 	struct nf_ip_net	nf_ct_proto;
-- 
cgit v1.2.3


From 8169ff584003c871a226719e998bb034231954d6 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Fri, 8 Apr 2022 13:10:19 +0200
Subject: netfilter: conntrack: add nf_ct_iter_data object for
 nf_ct_iterate_cleanup*()

This patch adds a structure to collect all the context data that is
passed to the cleanup iterator.

 struct nf_ct_iter_data {
       struct net *net;
       void *data;
       u32 portid;
       int report;
 };

There is a netns field that allows to clean up conntrack entries
specifically owned by the specified netns.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_conntrack.h | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

(limited to 'include/net')

diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
index 3ce9a5b42fe5..a32be8aa7ed2 100644
--- a/include/net/netfilter/nf_conntrack.h
+++ b/include/net/netfilter/nf_conntrack.h
@@ -236,10 +236,16 @@ static inline bool nf_ct_kill(struct nf_conn *ct)
 	return nf_ct_delete(ct, 0, 0);
 }
 
+struct nf_ct_iter_data {
+	struct net *net;
+	void *data;
+	u32 portid;
+	int report;
+};
+
 /* Iterate over all conntracks: if iter returns true, it's deleted. */
-void nf_ct_iterate_cleanup_net(struct net *net,
-			       int (*iter)(struct nf_conn *i, void *data),
-			       void *data, u32 portid, int report);
+void nf_ct_iterate_cleanup_net(int (*iter)(struct nf_conn *i, void *data),
+			       const struct nf_ct_iter_data *iter_data);
 
 /* also set unconfirmed conntracks as dying. Only use in module exit path. */
 void nf_ct_iterate_destroy(int (*iter)(struct nf_conn *i, void *data),
-- 
cgit v1.2.3


From 2794cdb0b97bfe62d25c996c8afe4832207e78bc Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Mon, 25 Apr 2022 15:15:41 +0200
Subject: netfilter: nfnetlink: allow to detect if ctnetlink listeners exist

At this time, every new conntrack gets the 'event cache extension'
enabled for it.

This is because the 'net.netfilter.nf_conntrack_events' sysctl defaults
to 1.

Changing the default to 0 means that commands that rely on the event
notification extension, e.g. 'conntrack -E' or conntrackd, stop working.

We COULD detect if there is a listener by means of
'nfnetlink_has_listeners()' and only add the extension if this is true.

The downside is a dependency from conntrack module to nfnetlink module.

This adds a different way: inc/dec a counter whenever a ctnetlink group
is being (un)subscribed and toggle a flag in struct net.

Next patches will take advantage of this and will only add the event
extension if the flag is set.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netns/conntrack.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/net')

diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h
index a71cfd4e2f21..0677cd3de034 100644
--- a/include/net/netns/conntrack.h
+++ b/include/net/netns/conntrack.h
@@ -95,6 +95,7 @@ struct nf_ip_net {
 
 struct netns_ct {
 #ifdef CONFIG_NF_CONNTRACK_EVENTS
+	bool ctnetlink_has_listener;
 	bool ecache_dwork_pending;
 #endif
 	u8			sysctl_log_invalid; /* Log invalid packets */
-- 
cgit v1.2.3


From b0a7ab4a776583b4344e8313638dc795f7589209 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Mon, 25 Apr 2022 15:15:42 +0200
Subject: netfilter: conntrack: un-inline nf_ct_ecache_ext_add

Only called when new ct is allocated or the extension isn't present.
This function will be extended, place this in the conntrack module
instead of inlining.

The callers already depend on nf_conntrack module.
Return value is changed to bool, noone used the returned pointer.

Make sure that the core drops the newly allocated conntrack
if the extension is requested but can't be added.
This makes it necessary to ifdef the section, as the stub
always returns false we'd drop every new conntrack if the
the ecache extension is disabled in kconfig.

Add from data path (xt_CT, nft_ct) is unchanged.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_conntrack_ecache.h | 30 +++++------------------------
 1 file changed, 5 insertions(+), 25 deletions(-)

(limited to 'include/net')

diff --git a/include/net/netfilter/nf_conntrack_ecache.h b/include/net/netfilter/nf_conntrack_ecache.h
index b57d73785e4d..2e3d58439e34 100644
--- a/include/net/netfilter/nf_conntrack_ecache.h
+++ b/include/net/netfilter/nf_conntrack_ecache.h
@@ -36,31 +36,6 @@ nf_ct_ecache_find(const struct nf_conn *ct)
 #endif
 }
 
-static inline struct nf_conntrack_ecache *
-nf_ct_ecache_ext_add(struct nf_conn *ct, u16 ctmask, u16 expmask, gfp_t gfp)
-{
-#ifdef CONFIG_NF_CONNTRACK_EVENTS
-	struct net *net = nf_ct_net(ct);
-	struct nf_conntrack_ecache *e;
-
-	if (!ctmask && !expmask && net->ct.sysctl_events) {
-		ctmask = ~0;
-		expmask = ~0;
-	}
-	if (!ctmask && !expmask)
-		return NULL;
-
-	e = nf_ct_ext_add(ct, NF_CT_EXT_ECACHE, gfp);
-	if (e) {
-		e->ctmask  = ctmask;
-		e->expmask = expmask;
-	}
-	return e;
-#else
-	return NULL;
-#endif
-}
-
 #ifdef CONFIG_NF_CONNTRACK_EVENTS
 
 /* This structure is passed to event handler */
@@ -89,6 +64,7 @@ void nf_ct_deliver_cached_events(struct nf_conn *ct);
 int nf_conntrack_eventmask_report(unsigned int eventmask, struct nf_conn *ct,
 				  u32 portid, int report);
 
+bool nf_ct_ecache_ext_add(struct nf_conn *ct, u16 ctmask, u16 expmask, gfp_t gfp);
 #else
 
 static inline void nf_ct_deliver_cached_events(const struct nf_conn *ct)
@@ -103,6 +79,10 @@ static inline int nf_conntrack_eventmask_report(unsigned int eventmask,
 	return 0;
 }
 
+static inline bool nf_ct_ecache_ext_add(struct nf_conn *ct, u16 ctmask, u16 expmask, gfp_t gfp)
+{
+	return false;
+}
 #endif
 
 static inline void
-- 
cgit v1.2.3


From 8edc813111001e9be3cce066d3d4091d2ef04a1d Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Mon, 25 Apr 2022 15:15:44 +0200
Subject: netfilter: prefer extension check to pointer check

The pointer check usually results in a 'false positive': its likely
that the ctnetlink module is loaded but no event monitoring is enabled.

After recent change to autodetect ctnetlink usage and only allocate
the ecache extension if a listener is active, check if the extension
is present on a given conntrack.

If its not there, there is nothing to report and calls to the
notification framework can be elided.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_conntrack_core.h   |  2 +-
 include/net/netfilter/nf_conntrack_ecache.h | 31 ++++++++++++++---------------
 2 files changed, 16 insertions(+), 17 deletions(-)

(limited to 'include/net')

diff --git a/include/net/netfilter/nf_conntrack_core.h b/include/net/netfilter/nf_conntrack_core.h
index 13807ea94cd2..6406cfee34c2 100644
--- a/include/net/netfilter/nf_conntrack_core.h
+++ b/include/net/netfilter/nf_conntrack_core.h
@@ -60,7 +60,7 @@ static inline int nf_conntrack_confirm(struct sk_buff *skb)
 	if (ct) {
 		if (!nf_ct_is_confirmed(ct))
 			ret = __nf_conntrack_confirm(skb);
-		if (likely(ret == NF_ACCEPT))
+		if (ret == NF_ACCEPT && nf_ct_ecache_exist(ct))
 			nf_ct_deliver_cached_events(ct);
 	}
 	return ret;
diff --git a/include/net/netfilter/nf_conntrack_ecache.h b/include/net/netfilter/nf_conntrack_ecache.h
index 2e3d58439e34..0c1dac318e02 100644
--- a/include/net/netfilter/nf_conntrack_ecache.h
+++ b/include/net/netfilter/nf_conntrack_ecache.h
@@ -36,6 +36,15 @@ nf_ct_ecache_find(const struct nf_conn *ct)
 #endif
 }
 
+static inline bool nf_ct_ecache_exist(const struct nf_conn *ct)
+{
+#ifdef CONFIG_NF_CONNTRACK_EVENTS
+	return nf_ct_ext_exist(ct, NF_CT_EXT_ECACHE);
+#else
+	return false;
+#endif
+}
+
 #ifdef CONFIG_NF_CONNTRACK_EVENTS
 
 /* This structure is passed to event handler */
@@ -108,30 +117,20 @@ nf_conntrack_event_report(enum ip_conntrack_events event, struct nf_conn *ct,
 			  u32 portid, int report)
 {
 #ifdef CONFIG_NF_CONNTRACK_EVENTS
-	const struct net *net = nf_ct_net(ct);
-
-	if (!rcu_access_pointer(net->ct.nf_conntrack_event_cb))
-		return 0;
-
-	return nf_conntrack_eventmask_report(1 << event, ct, portid, report);
-#else
-	return 0;
+	if (nf_ct_ecache_exist(ct))
+		return nf_conntrack_eventmask_report(1 << event, ct, portid, report);
 #endif
+	return 0;
 }
 
 static inline int
 nf_conntrack_event(enum ip_conntrack_events event, struct nf_conn *ct)
 {
 #ifdef CONFIG_NF_CONNTRACK_EVENTS
-	const struct net *net = nf_ct_net(ct);
-
-	if (!rcu_access_pointer(net->ct.nf_conntrack_event_cb))
-		return 0;
-
-	return nf_conntrack_eventmask_report(1 << event, ct, 0, 0);
-#else
-	return 0;
+	if (nf_ct_ecache_exist(ct))
+		return nf_conntrack_eventmask_report(1 << event, ct, 0, 0);
 #endif
+	return 0;
 }
 
 #ifdef CONFIG_NF_CONNTRACK_EVENTS
-- 
cgit v1.2.3


From 4f9bd53084d18c2f9f1ec68fa56587b99a2cef00 Mon Sep 17 00:00:00 2001
From: Kevin Mitchell <kevmitch@arista.com>
Date: Fri, 29 Apr 2022 20:40:27 -0700
Subject: netfilter: conntrack: skip verification of zero UDP checksum

The checksum is optional for UDP packets. However nf_reject would
previously require a valid checksum to elicit a response such as
ICMP_DEST_UNREACH.

Add some logic to nf_reject_verify_csum to determine if a UDP packet has
a zero checksum and should therefore not be verified.

Signed-off-by: Kevin Mitchell <kevmitch@arista.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_reject.h | 21 +++++++++++++++++----
 1 file changed, 17 insertions(+), 4 deletions(-)

(limited to 'include/net')

diff --git a/include/net/netfilter/nf_reject.h b/include/net/netfilter/nf_reject.h
index 9051c3a0c8e7..7c669792fb9c 100644
--- a/include/net/netfilter/nf_reject.h
+++ b/include/net/netfilter/nf_reject.h
@@ -5,12 +5,28 @@
 #include <linux/types.h>
 #include <uapi/linux/in.h>
 
-static inline bool nf_reject_verify_csum(__u8 proto)
+static inline bool nf_reject_verify_csum(struct sk_buff *skb, int dataoff,
+					  __u8 proto)
 {
 	/* Skip protocols that don't use 16-bit one's complement checksum
 	 * of the entire payload.
 	 */
 	switch (proto) {
+		/* Protocols with optional checksums. */
+		case IPPROTO_UDP: {
+			const struct udphdr *udp_hdr;
+			struct udphdr _udp_hdr;
+
+			udp_hdr = skb_header_pointer(skb, dataoff,
+						     sizeof(_udp_hdr),
+						     &_udp_hdr);
+			if (!udp_hdr || udp_hdr->check)
+				return true;
+
+			return false;
+		}
+		case IPPROTO_GRE:
+
 		/* Protocols with other integrity checks. */
 		case IPPROTO_AH:
 		case IPPROTO_ESP:
@@ -19,9 +35,6 @@ static inline bool nf_reject_verify_csum(__u8 proto)
 		/* Protocols with partial checksums. */
 		case IPPROTO_UDPLITE:
 		case IPPROTO_DCCP:
-
-		/* Protocols with optional checksums. */
-		case IPPROTO_GRE:
 			return false;
 	}
 	return true;
-- 
cgit v1.2.3


From 4273d3fa8aa594e37c295bef3e73073608644241 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Tue, 10 May 2022 13:37:59 +0200
Subject: mac80211: fix typo in documentation

This is called offload_flags, remove the extra 'a'.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 75880fc70700..7b74c695a83a 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -1701,7 +1701,7 @@ enum ieee80211_offload_flags {
  *	these need to be set (or cleared) when the interface is added
  *	or, if supported by the driver, the interface type is changed
  *	at runtime, mac80211 will never touch this field
- * @offloaad_flags: hardware offload capabilities/flags for this interface.
+ * @offload_flags: hardware offload capabilities/flags for this interface.
  *	These are initialized by mac80211 before calling .add_interface,
  *	.change_interface or .update_vif_offload and updated by the driver
  *	within these ops, based on supported features or runtime change
-- 
cgit v1.2.3


From f5bf586aadddd9803a1f16bc18621fd819377130 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Tue, 10 May 2022 16:38:19 +0200
Subject: mac80211: remove stray multi_sta_back_32bit docs

This field doesn't exist, remove the docs for it.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 7b74c695a83a..944b31f0eb0d 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -514,7 +514,6 @@ struct ieee80211_fils_discovery {
  * to that BSS) that can change during the lifetime of the BSS.
  *
  * @htc_trig_based_pkt_ext: default PE in 4us units, if BSS supports HE
- * @multi_sta_back_32bit: supports BA bitmap of 32-bits in Multi-STA BACK
  * @uora_exists: is the UORA element advertised by AP
  * @ack_enabled: indicates support to receive a multi-TID that solicits either
  *	ACK, BACK or both
-- 
cgit v1.2.3


From e6175a2ed1f18bf2f649625bf725e07adcfa6a28 Mon Sep 17 00:00:00 2001
From: Eyal Birger <eyal.birger@gmail.com>
Date: Fri, 13 May 2022 23:34:02 +0300
Subject: xfrm: fix "disable_policy" flag use when arriving from different
 devices

In IPv4 setting the "disable_policy" flag on a device means no policy
should be enforced for traffic originating from the device. This was
implemented by seting the DST_NOPOLICY flag in the dst based on the
originating device.

However, dsts are cached in nexthops regardless of the originating
devices, in which case, the DST_NOPOLICY flag value may be incorrect.

Consider the following setup:

                     +------------------------------+
                     | ROUTER                       |
  +-------------+    | +-----------------+          |
  | ipsec src   |----|-|ipsec0           |          |
  +-------------+    | |disable_policy=0 |   +----+ |
                     | +-----------------+   |eth1|-|-----
  +-------------+    | +-----------------+   +----+ |
  | noipsec src |----|-|eth0             |          |
  +-------------+    | |disable_policy=1 |          |
                     | +-----------------+          |
                     +------------------------------+

Where ROUTER has a default route towards eth1.

dst entries for traffic arriving from eth0 would have DST_NOPOLICY
and would be cached and therefore can be reused by traffic originating
from ipsec0, skipping policy check.

Fix by setting a IPSKB_NOPOLICY flag in IPCB and observing it instead
of the DST in IN/FWD IPv4 policy checks.

Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Reported-by: Shmulik Ladkani <shmulik.ladkani@gmail.com>
Signed-off-by: Eyal Birger <eyal.birger@gmail.com>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 include/net/ip.h   |  1 +
 include/net/xfrm.h | 14 +++++++++++++-
 2 files changed, 14 insertions(+), 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/ip.h b/include/net/ip.h
index 3984f2c39c4b..0161137914cf 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -56,6 +56,7 @@ struct inet_skb_parm {
 #define IPSKB_DOREDIRECT	BIT(5)
 #define IPSKB_FRAG_PMTU		BIT(6)
 #define IPSKB_L3SLAVE		BIT(7)
+#define IPSKB_NOPOLICY		BIT(8)
 
 	u16			frag_max_size;
 };
diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 6fb899ff5afc..d2efddce65d4 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -1093,6 +1093,18 @@ static inline bool __xfrm_check_nopolicy(struct net *net, struct sk_buff *skb,
 	return false;
 }
 
+static inline bool __xfrm_check_dev_nopolicy(struct sk_buff *skb,
+					     int dir, unsigned short family)
+{
+	if (dir != XFRM_POLICY_OUT && family == AF_INET) {
+		/* same dst may be used for traffic originating from
+		 * devices with different policy settings.
+		 */
+		return IPCB(skb)->flags & IPSKB_NOPOLICY;
+	}
+	return skb_dst(skb) && (skb_dst(skb)->flags & DST_NOPOLICY);
+}
+
 static inline int __xfrm_policy_check2(struct sock *sk, int dir,
 				       struct sk_buff *skb,
 				       unsigned int family, int reverse)
@@ -1104,7 +1116,7 @@ static inline int __xfrm_policy_check2(struct sock *sk, int dir,
 		return __xfrm_policy_check(sk, ndir, skb, family);
 
 	return __xfrm_check_nopolicy(net, skb, dir) ||
-	       (skb_dst(skb) && (skb_dst(skb)->flags & DST_NOPOLICY)) ||
+	       __xfrm_check_dev_nopolicy(skb, dir, family) ||
 	       __xfrm_policy_check(sk, ndir, skb, family);
 }
 
-- 
cgit v1.2.3


From 3d48cb74816d8468f0235ce9a867a2d7b9832693 Mon Sep 17 00:00:00 2001
From: Rameshkumar Sundaram <quic_ramess@quicinc.com>
Date: Wed, 13 Apr 2022 21:58:14 +0530
Subject: nl80211: Parse NL80211_ATTR_HE_BSS_COLOR as a part of
 nl80211_parse_beacon

NL80211_ATTR_HE_BSS_COLOR attribute can be included in both
NL80211_CMD_START_AP and NL80211_CMD_SET_BEACON commands.

Move he_bss_color from cfg80211_ap_settings to cfg80211_beacon_data
and parse NL80211_ATTR_HE_BSS_COLOR as a part of nl80211_parse_beacon()
to have bss color settings parsed for both start ap and set beacon
commands.
Add a new flag he_bss_color_valid to indicate whether
NL80211_ATTR_HE_BSS_COLOR attribute is included.

Signed-off-by: Rameshkumar Sundaram <quic_ramess@quicinc.com>
Link: https://lore.kernel.org/r/1649867295-7204-2-git-send-email-quic_ramess@quicinc.com
[fix build ...]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'include/net')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 6a3e3f0a8615..97a5804ccdcf 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -1183,6 +1183,9 @@ struct cfg80211_mbssid_elems {
  *	Token (measurement type 11)
  * @lci_len: LCI data length
  * @civicloc_len: Civic location data length
+ * @he_bss_color: BSS Color settings
+ * @he_bss_color_valid: indicates whether bss color
+	attribute is present in beacon data or not.
  */
 struct cfg80211_beacon_data {
 	const u8 *head, *tail;
@@ -1202,6 +1205,8 @@ struct cfg80211_beacon_data {
 	size_t probe_resp_len;
 	size_t lci_len;
 	size_t civicloc_len;
+	struct cfg80211_he_bss_color he_bss_color;
+	bool he_bss_color_valid;
 };
 
 struct mac_address {
@@ -1292,7 +1297,6 @@ struct cfg80211_unsol_bcast_probe_resp {
  * @sae_h2e_required: stations must support direct H2E technique in SAE
  * @flags: flags, as defined in enum cfg80211_ap_settings_flags
  * @he_obss_pd: OBSS Packet Detection settings
- * @he_bss_color: BSS Color settings
  * @he_oper: HE operation IE (or %NULL if HE isn't enabled)
  * @fils_discovery: FILS discovery transmission parameters
  * @unsol_bcast_probe_resp: Unsolicited broadcast probe response parameters
@@ -1326,7 +1330,6 @@ struct cfg80211_ap_settings {
 	bool twt_responder;
 	u32 flags;
 	struct ieee80211_he_obss_pd he_obss_pd;
-	struct cfg80211_he_bss_color he_bss_color;
 	struct cfg80211_fils_discovery fils_discovery;
 	struct cfg80211_unsol_bcast_probe_resp unsol_bcast_probe_resp;
 	struct cfg80211_mbssid_config mbssid_config;
-- 
cgit v1.2.3


From 44fa75f207d8a106bc75e6230db61e961fdbf8a8 Mon Sep 17 00:00:00 2001
From: Jonas Jelonek <jelonek.jonas@gmail.com>
Date: Mon, 9 May 2022 19:39:57 +0200
Subject: mac80211: extend current rate control tx status API

This patch adds the new struct ieee80211_rate_status and replaces
'struct rate_info *rate' in ieee80211_tx_status with pointer and length
annotation.

The struct ieee80211_rate_status allows to:
(1)	receive tx power status feedback for transmit power control (TPC)
	per packet or packet retry
(2)	dynamic mapping of wifi chip specific multi-rate retry (mrr)
	chains with different lengths
(3)	increase the limit of annotatable rate indices to support
	IEEE802.11ac rate sets and beyond

ieee80211_tx_info, control and status buffer, and ieee80211_tx_rate
cannot be used to achieve these goals due to fixed size limitations.

Our new struct contains a struct rate_info to annotate the rate that was
used, retry count of the rate and tx power. It is intended for all
information related to RC and TPC that needs to be passed from driver to
mac80211 and its RC/TPC algorithms like Minstrel_HT. It corresponds to
one stage in an mrr. Multiple subsequent instances of this struct can be
included in struct ieee80211_tx_status via a pointer and a length variable.
Those instances can be allocated on-stack. The former reference to a single
instance of struct rate_info is replaced with our new annotation.

An extension is introduced to struct ieee80211_hw. There are two new
members called 'tx_power_levels' and 'max_txpwr_levels_idx' acting as a
tx power level table. When a wifi device is registered, the driver shall
supply all supported power levels in this list. This allows to support
several quirks like differing power steps in power level ranges or
alike. TPC can use this for algorithm and thus be designed more abstract
instead of handling all possible step widths individually.

Further mandatory changes in status.c, mt76 and ath11k drivers due to the
removal of 'struct rate_info *rate' are also included.
status.c already uses the information in ieee80211_tx_status->rate in
radiotap, this is now changed to use ieee80211_rate_status->rate_idx.
mt76 driver already uses struct rate_info to pass the tx rate to status
path. The new members of the ieee80211_tx_status are set to NULL and 0
because the previously passed rate is not relevant to rate control and
accurate information is passed via tx_info->status.rates.
For ath11k, the txrate can be passed via this struct because ath11k uses
firmware RC and thus the information does not interfere with software RC.

Compile-Tested: current wireless-next tree with all flags on
Tested-on: Xiaomi 4A Gigabit (MediaTek MT7603E, MT7612E) with OpenWrt
		Linux 5.10.113

Signed-off-by: Jonas Jelonek <jelonek.jonas@gmail.com>
Link: https://lore.kernel.org/r/20220509173958.1398201-2-jelonek.jonas@gmail.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h | 33 +++++++++++++++++++++++++++++++--
 1 file changed, 31 insertions(+), 2 deletions(-)

(limited to 'include/net')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 944b31f0eb0d..ebadb2103968 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -1143,20 +1143,41 @@ ieee80211_info_get_tx_time_est(struct ieee80211_tx_info *info)
 	return info->tx_time_est << 2;
 }
 
+/***
+ * struct ieee80211_rate_status - mrr stage for status path
+ *
+ * This struct is used in struct ieee80211_tx_status to provide drivers a
+ * dynamic way to report about used rates and power levels per packet.
+ *
+ * @rate_idx The actual used rate.
+ * @try_count How often the rate was tried.
+ * @tx_power_idx An idx into the ieee80211_hw->tx_power_levels list of the
+ * 	corresponding wifi hardware. The idx shall point to the power level
+ * 	that was used when sending the packet.
+ */
+struct ieee80211_rate_status {
+	struct rate_info rate_idx;
+	u8 try_count;
+	u8 tx_power_idx;
+};
+
 /**
  * struct ieee80211_tx_status - extended tx status info for rate control
  *
  * @sta: Station that the packet was transmitted for
  * @info: Basic tx status information
  * @skb: Packet skb (can be NULL if not provided by the driver)
- * @rate: The TX rate that was used when sending the packet
+ * @rates: Mrr stages that were used when sending the packet
+ * @n_rates: Number of mrr stages (count of instances for @rates)
  * @free_list: list where processed skbs are stored to be free'd by the driver
  */
 struct ieee80211_tx_status {
 	struct ieee80211_sta *sta;
 	struct ieee80211_tx_info *info;
 	struct sk_buff *skb;
-	struct rate_info *rate;
+	struct ieee80211_rate_status *rates;
+	u8 n_rates;
+
 	struct list_head *free_list;
 };
 
@@ -2657,6 +2678,12 @@ enum ieee80211_hw_flags {
  *	refilling deficit of each TXQ.
  *
  * @max_mtu: the max mtu could be set.
+ *
+ * @tx_power_levels: a list of power levels supported by the wifi hardware.
+ * 	The power levels can be specified either as integer or fractions.
+ * 	The power level at idx 0 shall be the maximum positive power level.
+ *
+ * @max_txpwr_levels_idx: the maximum valid idx of 'tx_power_levels' list.
  */
 struct ieee80211_hw {
 	struct ieee80211_conf conf;
@@ -2695,6 +2722,8 @@ struct ieee80211_hw {
 	u8 tx_sk_pacing_shift;
 	u8 weight_multiplier;
 	u32 max_mtu;
+	const s8 *tx_power_levels;
+	u8 max_txpwr_levels_idx;
 };
 
 static inline bool _ieee80211_hw_check(struct ieee80211_hw *hw,
-- 
cgit v1.2.3


From 7c96d8ec96bb71aac54c9f872aaa65d7411ab864 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 13 May 2022 11:34:00 -0700
Subject: ipv6: add struct hop_jumbo_hdr definition

Following patches will need to add and remove local IPv6 jumbogram
options to enable BIG TCP.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Alexander Duyck <alexanderduyck@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ipv6.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'include/net')

diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 213612f1680c..63d019953c47 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -151,6 +151,17 @@ struct frag_hdr {
 	__be32	identification;
 };
 
+/*
+ * Jumbo payload option, as described in RFC 2675 2.
+ */
+struct hop_jumbo_hdr {
+	u8	nexthdr;
+	u8	hdrlen;
+	u8	tlv_type;	/* IPV6_TLV_JUMBO, 0xC2 */
+	u8	tlv_len;	/* 4 */
+	__be32	jumbo_payload_len;
+};
+
 #define	IP6_MF		0x0001
 #define	IP6_OFFSET	0xFFF8
 
-- 
cgit v1.2.3


From 09f3d1a3a52c696208008618a67e2c7c3fb16d41 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 13 May 2022 11:34:01 -0700
Subject: ipv6/gso: remove temporary HBH/jumbo header

ipv6 tcp and gro stacks will soon be able to build big TCP packets,
with an added temporary Hop By Hop header.

If GSO is involved for these large packets, we need to remove
the temporary HBH header before segmentation happens.

v2: perform HBH removal from ipv6_gso_segment() instead of
    skb_segment() (Alexander feedback)

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Alexander Duyck <alexanderduyck@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ipv6.h | 33 +++++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)

(limited to 'include/net')

diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 63d019953c47..b6df0314aa02 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -467,6 +467,39 @@ bool ipv6_opt_accepted(const struct sock *sk, const struct sk_buff *skb,
 struct ipv6_txoptions *ipv6_update_options(struct sock *sk,
 					   struct ipv6_txoptions *opt);
 
+/* This helper is specialized for BIG TCP needs.
+ * It assumes the hop_jumbo_hdr will immediately follow the IPV6 header.
+ * It assumes headers are already in skb->head.
+ * Returns 0, or IPPROTO_TCP if a BIG TCP packet is there.
+ */
+static inline int ipv6_has_hopopt_jumbo(const struct sk_buff *skb)
+{
+	const struct hop_jumbo_hdr *jhdr;
+	const struct ipv6hdr *nhdr;
+
+	if (likely(skb->len <= GRO_MAX_SIZE))
+		return 0;
+
+	if (skb->protocol != htons(ETH_P_IPV6))
+		return 0;
+
+	if (skb_network_offset(skb) +
+	    sizeof(struct ipv6hdr) +
+	    sizeof(struct hop_jumbo_hdr) > skb_headlen(skb))
+		return 0;
+
+	nhdr = ipv6_hdr(skb);
+
+	if (nhdr->nexthdr != NEXTHDR_HOP)
+		return 0;
+
+	jhdr = (const struct hop_jumbo_hdr *) (nhdr + 1);
+	if (jhdr->tlv_type != IPV6_TLV_JUMBO || jhdr->hdrlen != 0 ||
+	    jhdr->nexthdr != IPPROTO_TCP)
+		return 0;
+	return jhdr->nexthdr;
+}
+
 static inline bool ipv6_accept_ra(struct inet6_dev *idev)
 {
 	/* If forwarding is enabled, RA are not accepted unless the special
-- 
cgit v1.2.3


From 0fe79f28bfaf73b66b7b1562d2468f94aa03bd12 Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexanderduyck@fb.com>
Date: Fri, 13 May 2022 11:34:03 -0700
Subject: net: allow gro_max_size to exceed 65536

Allow the gro_max_size to exceed a value larger than 65536.

There weren't really any external limitations that prevented this other
than the fact that IPv4 only supports a 16 bit length field. Since we have
the option of adding a hop-by-hop header for IPv6 we can allow IPv6 to
exceed this value and for IPv4 and non-TCP flows we can cap things at 65536
via a constant rather than relying on gro_max_size.

[edumazet] limit GRO_MAX_SIZE to (8 * 65535) to avoid overflows.

Signed-off-by: Alexander Duyck <alexanderduyck@fb.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ipv6.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index b6df0314aa02..5b38bf1a586b 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -477,7 +477,7 @@ static inline int ipv6_has_hopopt_jumbo(const struct sk_buff *skb)
 	const struct hop_jumbo_hdr *jhdr;
 	const struct ipv6hdr *nhdr;
 
-	if (likely(skb->len <= GRO_MAX_SIZE))
+	if (likely(skb->len <= GRO_LEGACY_MAX_SIZE))
 		return 0;
 
 	if (skb->protocol != htons(ETH_P_IPV6))
-- 
cgit v1.2.3


From 4c971d2f3548e4f11b1460ac048f5307e4b39fdb Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 13 May 2022 11:55:41 -0700
Subject: net: annotate races around sk->sk_bound_dev_if

UDP sendmsg() is lockless, and reads sk->sk_bound_dev_if while
this field can be changed by another thread.

Adds minimal annotations to avoid KCSAN splats for UDP.
Following patches will add more annotations to potential lockless readers.

BUG: KCSAN: data-race in __ip6_datagram_connect / udpv6_sendmsg

write to 0xffff888136d47a94 of 4 bytes by task 7681 on cpu 0:
 __ip6_datagram_connect+0x6e2/0x930 net/ipv6/datagram.c:221
 ip6_datagram_connect+0x2a/0x40 net/ipv6/datagram.c:272
 inet_dgram_connect+0x107/0x190 net/ipv4/af_inet.c:576
 __sys_connect_file net/socket.c:1900 [inline]
 __sys_connect+0x197/0x1b0 net/socket.c:1917
 __do_sys_connect net/socket.c:1927 [inline]
 __se_sys_connect net/socket.c:1924 [inline]
 __x64_sys_connect+0x3d/0x50 net/socket.c:1924
 do_syscall_x64 arch/x86/entry/common.c:50 [inline]
 do_syscall_64+0x2b/0x50 arch/x86/entry/common.c:80
 entry_SYSCALL_64_after_hwframe+0x44/0xae

read to 0xffff888136d47a94 of 4 bytes by task 7670 on cpu 1:
 udpv6_sendmsg+0xc60/0x16e0 net/ipv6/udp.c:1436
 inet6_sendmsg+0x5f/0x80 net/ipv6/af_inet6.c:652
 sock_sendmsg_nosec net/socket.c:705 [inline]
 sock_sendmsg net/socket.c:725 [inline]
 ____sys_sendmsg+0x39a/0x510 net/socket.c:2413
 ___sys_sendmsg net/socket.c:2467 [inline]
 __sys_sendmmsg+0x267/0x4c0 net/socket.c:2553
 __do_sys_sendmmsg net/socket.c:2582 [inline]
 __se_sys_sendmmsg net/socket.c:2579 [inline]
 __x64_sys_sendmmsg+0x53/0x60 net/socket.c:2579
 do_syscall_x64 arch/x86/entry/common.c:50 [inline]
 do_syscall_64+0x2b/0x50 arch/x86/entry/common.c:80
 entry_SYSCALL_64_after_hwframe+0x44/0xae

value changed: 0x00000000 -> 0xffffff9b

Reported by Kernel Concurrency Sanitizer on:
CPU: 1 PID: 7670 Comm: syz-executor.3 Tainted: G        W         5.18.0-rc1-syzkaller-dirty #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011

I chose to not add Fixes: tag because race has minor consequences
and stable teams busy enough.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: syzbot <syzkaller@googlegroups.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip.h   | 2 +-
 include/net/sock.h | 5 +++--
 2 files changed, 4 insertions(+), 3 deletions(-)

(limited to 'include/net')

diff --git a/include/net/ip.h b/include/net/ip.h
index 3984f2c39c4b..8ad04f60b413 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -93,7 +93,7 @@ static inline void ipcm_init_sk(struct ipcm_cookie *ipcm,
 
 	ipcm->sockc.mark = inet->sk.sk_mark;
 	ipcm->sockc.tsflags = inet->sk.sk_tsflags;
-	ipcm->oif = inet->sk.sk_bound_dev_if;
+	ipcm->oif = READ_ONCE(inet->sk.sk_bound_dev_if);
 	ipcm->addr = inet->inet_saddr;
 }
 
diff --git a/include/net/sock.h b/include/net/sock.h
index 01edfde4257d..72ca97ccb460 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -2875,13 +2875,14 @@ static inline void sk_pacing_shift_update(struct sock *sk, int val)
  */
 static inline bool sk_dev_equal_l3scope(struct sock *sk, int dif)
 {
+	int bound_dev_if = READ_ONCE(sk->sk_bound_dev_if);
 	int mdif;
 
-	if (!sk->sk_bound_dev_if || sk->sk_bound_dev_if == dif)
+	if (!bound_dev_if || bound_dev_if == dif)
 		return true;
 
 	mdif = l3mdev_master_ifindex_by_index(sock_net(sk), dif);
-	if (mdif && mdif == sk->sk_bound_dev_if)
+	if (mdif && mdif == bound_dev_if)
 		return true;
 
 	return false;
-- 
cgit v1.2.3


From fdb5fd7f736ec7ae9fb36d2842ea6d9ebc4e7269 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 13 May 2022 11:55:43 -0700
Subject: tcp: sk->sk_bound_dev_if once in inet_request_bound_dev_if()

inet_request_bound_dev_if() reads sk->sk_bound_dev_if twice
while listener socket is not locked.

Another cpu could change this field under us.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inet_sock.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'include/net')

diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index 234d70ae5f4c..c1b5dcd6597c 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -116,14 +116,15 @@ static inline u32 inet_request_mark(const struct sock *sk, struct sk_buff *skb)
 static inline int inet_request_bound_dev_if(const struct sock *sk,
 					    struct sk_buff *skb)
 {
+	int bound_dev_if = READ_ONCE(sk->sk_bound_dev_if);
 #ifdef CONFIG_NET_L3_MASTER_DEV
 	struct net *net = sock_net(sk);
 
-	if (!sk->sk_bound_dev_if && net->ipv4.sysctl_tcp_l3mdev_accept)
+	if (!bound_dev_if && net->ipv4.sysctl_tcp_l3mdev_accept)
 		return l3mdev_master_ifindex_by_index(net, skb->skb_iif);
 #endif
 
-	return sk->sk_bound_dev_if;
+	return bound_dev_if;
 }
 
 static inline int inet_sk_bound_l3mdev(const struct sock *sk)
-- 
cgit v1.2.3


From 5d368f03280d3678433a7f119efe15dfbbb87bc8 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 13 May 2022 11:55:49 -0700
Subject: ipv6: add READ_ONCE(sk->sk_bound_dev_if) in INET6_MATCH()

INET6_MATCH() runs without holding a lock on the socket.

We probably need to annotate most reads.

This patch makes INET6_MATCH() an inline function
to ease our changes.

v2: inline function only defined if IS_ENABLED(CONFIG_IPV6)
    Change the name to inet6_match(), this is no longer a macro.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inet6_hashtables.h | 28 +++++++++++++++++++---------
 1 file changed, 19 insertions(+), 9 deletions(-)

(limited to 'include/net')

diff --git a/include/net/inet6_hashtables.h b/include/net/inet6_hashtables.h
index 81b965953036..f259e1ae14ba 100644
--- a/include/net/inet6_hashtables.h
+++ b/include/net/inet6_hashtables.h
@@ -103,15 +103,25 @@ struct sock *inet6_lookup(struct net *net, struct inet_hashinfo *hashinfo,
 			  const int dif);
 
 int inet6_hash(struct sock *sk);
-#endif /* IS_ENABLED(CONFIG_IPV6) */
 
-#define INET6_MATCH(__sk, __net, __saddr, __daddr, __ports, __dif, __sdif) \
-	(((__sk)->sk_portpair == (__ports))			&&	\
-	 ((__sk)->sk_family == AF_INET6)			&&	\
-	 ipv6_addr_equal(&(__sk)->sk_v6_daddr, (__saddr))		&&	\
-	 ipv6_addr_equal(&(__sk)->sk_v6_rcv_saddr, (__daddr))	&&	\
-	 (((__sk)->sk_bound_dev_if == (__dif))	||			\
-	  ((__sk)->sk_bound_dev_if == (__sdif)))		&&	\
-	 net_eq(sock_net(__sk), (__net)))
+static inline bool inet6_match(struct net *net, const struct sock *sk,
+			       const struct in6_addr *saddr,
+			       const struct in6_addr *daddr,
+			       const __portpair ports,
+			       const int dif, const int sdif)
+{
+	int bound_dev_if;
+
+	if (!net_eq(sock_net(sk), net) ||
+	    sk->sk_family != AF_INET6 ||
+	    sk->sk_portpair != ports ||
+	    !ipv6_addr_equal(&sk->sk_v6_daddr, saddr) ||
+	    !ipv6_addr_equal(&sk->sk_v6_rcv_saddr, daddr))
+		return false;
+
+	bound_dev_if = READ_ONCE(sk->sk_bound_dev_if);
+	return bound_dev_if == dif || bound_dev_if == sdif;
+}
+#endif /* IS_ENABLED(CONFIG_IPV6) */
 
 #endif /* _INET6_HASHTABLES_H */
-- 
cgit v1.2.3


From eda090c31fe923ab9463b884469744ec903ab0cc Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 13 May 2022 11:55:50 -0700
Subject: inet: rename INET_MATCH()

This is no longer a macro, but an inlined function.

INET_MATCH() -> inet_match()

Signed-off-by: Eric Dumazet <edumazet@google.com>
Suggested-by: Olivier Hartkopp <socketcan@hartkopp.net>
Suggested-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inet_hashtables.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index 59d72024ad1d..ebfa3df6f8dc 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -267,7 +267,7 @@ static inline struct sock *inet_lookup_listener(struct net *net,
 				   ((__force __u64)(__be32)(__saddr)))
 #endif /* __BIG_ENDIAN */
 
-static inline bool INET_MATCH(struct net *net, const struct sock *sk,
+static inline bool inet_match(struct net *net, const struct sock *sk,
 			      const __addrpair cookie, const __portpair ports,
 			      int dif, int sdif)
 {
-- 
cgit v1.2.3


From d265929930e2ffafc744c0ae05fb70acd53be1ee Mon Sep 17 00:00:00 2001
From: William Tu <u9012063@gmail.com>
Date: Wed, 4 May 2022 08:35:59 -0700
Subject: netfilter: nf_conncount: reduce unnecessary GC

Currently nf_conncount can trigger garbage collection (GC)
at multiple places. Each GC process takes a spin_lock_bh
to traverse the nf_conncount_list. We found that when testing
port scanning use two parallel nmap, because the number of
connection increase fast, the nf_conncount_count and its
subsequent call to __nf_conncount_add take too much time,
causing several CPU lockup. This happens when user set the
conntrack limit to +20,000, because the larger the limit,
the longer the list that GC has to traverse.

The patch mitigate the performance issue by avoiding unnecessary
GC with a timestamp. Whenever nf_conncount has done a GC,
a timestamp is updated, and beforce the next time GC is
triggered, we make sure it's more than a jiffies.
By doin this we can greatly reduce the CPU cycles and
avoid the softirq lockup.

To reproduce it in OVS,
$ ovs-appctl dpctl/ct-set-limits zone=1,limit=20000
$ ovs-appctl dpctl/ct-get-limits

At another machine, runs two nmap
$ nmap -p1- <IP>
$ nmap -p1- <IP>

Signed-off-by: William Tu <u9012063@gmail.com>
Co-authored-by: Yifeng Sun <pkusunyifeng@gmail.com>
Reported-by: Greg Rose <gvrose8192@gmail.com>
Suggested-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_conntrack_count.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/net')

diff --git a/include/net/netfilter/nf_conntrack_count.h b/include/net/netfilter/nf_conntrack_count.h
index 9645b47fa7e4..e227d997fc71 100644
--- a/include/net/netfilter/nf_conntrack_count.h
+++ b/include/net/netfilter/nf_conntrack_count.h
@@ -10,6 +10,7 @@ struct nf_conncount_data;
 
 struct nf_conncount_list {
 	spinlock_t list_lock;
+	u32 last_gc;		/* jiffies at most recent gc */
 	struct list_head head;	/* connections with the same filtering key */
 	unsigned int count;	/* length of list */
 };
-- 
cgit v1.2.3


From ee0e2f51e2115c2578d40e5a8ac33737984fe477 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Tue, 17 May 2022 12:43:46 +0200
Subject: cfg80211: fix kernel-doc for cfg80211_beacon_data

The kernel-doc comment is formatted badly, resulting
in a warning:

  include/net/cfg80211.h:1188: warning: bad line: [...]

Fix that.

Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 97a5804ccdcf..cc8a9880b9d6 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -1185,7 +1185,7 @@ struct cfg80211_mbssid_elems {
  * @civicloc_len: Civic location data length
  * @he_bss_color: BSS Color settings
  * @he_bss_color_valid: indicates whether bss color
-	attribute is present in beacon data or not.
+ *	attribute is present in beacon data or not.
  */
 struct cfg80211_beacon_data {
 	const u8 *head, *tail;
-- 
cgit v1.2.3


From c1318b39c7d36bd5139a9c71044ff2b2d3c6f9d8 Mon Sep 17 00:00:00 2001
From: Boris Pismenny <borisp@nvidia.com>
Date: Wed, 18 May 2022 12:27:31 +0300
Subject: tls: Add opt-in zerocopy mode of sendfile()

TLS device offload copies sendfile data to a bounce buffer before
transmitting. It allows to maintain the valid MAC on TLS records when
the file contents change and a part of TLS record has to be
retransmitted on TCP level.

In many common use cases (like serving static files over HTTPS) the file
contents are not changed on the fly. In many use cases breaking the
connection is totally acceptable if the file is changed during
transmission, because it would be received corrupted in any case.

This commit allows to optimize performance for such use cases to
providing a new optional mode of TLS sendfile(), in which the extra copy
is skipped. Removing this copy improves performance significantly, as
TLS and TCP sendfile perform the same operations, and the only overhead
is TLS header/trailer insertion.

The new mode can only be enabled with the new socket option named
TLS_TX_ZEROCOPY_SENDFILE on per-socket basis. It preserves backwards
compatibility with existing applications that rely on the copying
behavior.

The new mode is safe, meaning that unsolicited modifications of the file
being sent can't break integrity of the kernel. The worst thing that can
happen is sending a corrupted TLS record, which is in any case not
forbidden when using regular TCP sockets.

Sockets other than TLS device offload are not affected by the new socket
option. The actual status of zerocopy sendfile can be queried with
sock_diag.

Performance numbers in a single-core test with 24 HTTPS streams on
nginx, under 100% CPU load:

* non-zerocopy: 33.6 Gbit/s
* zerocopy: 79.92 Gbit/s

CPU: Intel(R) Xeon(R) Platinum 8380 CPU @ 2.30GHz

Signed-off-by: Boris Pismenny <borisp@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Signed-off-by: Maxim Mikityanskiy <maximmi@nvidia.com>
Reviewed-by: Jakub Kicinski <kuba@kernel.org>
Link: https://lore.kernel.org/r/20220518092731.1243494-1-maximmi@nvidia.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/net/tls.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/net')

diff --git a/include/net/tls.h b/include/net/tls.h
index b59f0a63292b..8017f1703447 100644
--- a/include/net/tls.h
+++ b/include/net/tls.h
@@ -238,6 +238,7 @@ struct tls_context {
 
 	u8 tx_conf:3;
 	u8 rx_conf:3;
+	u8 zerocopy_sendfile:1;
 
 	int (*push_pending_record)(struct sock *sk, int flags);
 	void (*sk_write_space)(struct sock *sk);
-- 
cgit v1.2.3


From 8f9ae5b3ae80f168a6224529e3787f4fb27f299a Mon Sep 17 00:00:00 2001
From: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
Date: Wed, 4 May 2022 17:06:28 -0700
Subject: Bluetooth: eir: Add helpers for managing service data

This adds helpers for accessing and appending service data (0x16) ad
type.

Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 include/net/bluetooth/hci.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/net')

diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h
index 62a9bb022aed..fe7935be7dc4 100644
--- a/include/net/bluetooth/hci.h
+++ b/include/net/bluetooth/hci.h
@@ -625,6 +625,7 @@ enum {
 #define EIR_SSP_RAND_R192	0x0F /* Simple Pairing Randomizer R-192 */
 #define EIR_DEVICE_ID		0x10 /* device ID */
 #define EIR_APPEARANCE		0x19 /* Device appearance */
+#define EIR_SERVICE_DATA	0x16 /* Service Data */
 #define EIR_LE_BDADDR		0x1B /* LE Bluetooth device address */
 #define EIR_LE_ROLE		0x1C /* LE role */
 #define EIR_SSP_HASH_C256	0x1D /* Simple Pairing Hash C-256 */
-- 
cgit v1.2.3


From 3bc253c2e652cf5f12cd8c00d80d8ec55d67d1a7 Mon Sep 17 00:00:00 2001
From: Geliang Tang <geliang.tang@suse.com>
Date: Thu, 19 May 2022 16:30:10 -0700
Subject: bpf: Add bpf_skc_to_mptcp_sock_proto

This patch implements a new struct bpf_func_proto, named
bpf_skc_to_mptcp_sock_proto. Define a new bpf_id BTF_SOCK_TYPE_MPTCP,
and a new helper bpf_skc_to_mptcp_sock(), which invokes another new
helper bpf_mptcp_sock_from_subflow() in net/mptcp/bpf.c to get struct
mptcp_sock from a given subflow socket.

v2: Emit BTF type, add func_id checks in verifier.c and bpf_trace.c,
remove build check for CONFIG_BPF_JIT
v5: Drop EXPORT_SYMBOL (Martin)

Co-developed-by: Nicolas Rybowski <nicolas.rybowski@tessares.net>
Co-developed-by: Matthieu Baerts <matthieu.baerts@tessares.net>
Signed-off-by: Nicolas Rybowski <nicolas.rybowski@tessares.net>
Signed-off-by: Matthieu Baerts <matthieu.baerts@tessares.net>
Signed-off-by: Geliang Tang <geliang.tang@suse.com>
Signed-off-by: Mat Martineau <mathew.j.martineau@linux.intel.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20220519233016.105670-2-mathew.j.martineau@linux.intel.com
---
 include/net/mptcp.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/net')

diff --git a/include/net/mptcp.h b/include/net/mptcp.h
index 8b1afd6f5cc4..2ba09de955c7 100644
--- a/include/net/mptcp.h
+++ b/include/net/mptcp.h
@@ -284,4 +284,10 @@ static inline int mptcpv6_init(void) { return 0; }
 static inline void mptcpv6_handle_mapped(struct sock *sk, bool mapped) { }
 #endif
 
+#if defined(CONFIG_MPTCP) && defined(CONFIG_BPF_SYSCALL)
+struct mptcp_sock *bpf_mptcp_sock_from_subflow(struct sock *sk);
+#else
+static inline struct mptcp_sock *bpf_mptcp_sock_from_subflow(struct sock *sk) { return NULL; }
+#endif
+
 #endif /* __NET_MPTCP_H */
-- 
cgit v1.2.3


From d5a42de8bdbe25081f07b801d8b35f4d75a791f4 Mon Sep 17 00:00:00 2001
From: Joanne Koong <joannelkoong@gmail.com>
Date: Thu, 19 May 2022 17:18:33 -0700
Subject: net: Add a second bind table hashed by port and address

We currently have one tcp bind table (bhash) which hashes by port
number only. In the socket bind path, we check for bind conflicts by
traversing the specified port's inet_bind2_bucket while holding the
bucket's spinlock (see inet_csk_get_port() and inet_csk_bind_conflict()).

In instances where there are tons of sockets hashed to the same port
at different addresses, checking for a bind conflict is time-intensive
and can cause softirq cpu lockups, as well as stops new tcp connections
since __inet_inherit_port() also contests for the spinlock.

This patch proposes adding a second bind table, bhash2, that hashes by
port and ip address. Searching the bhash2 table leads to significantly
faster conflict resolution and less time holding the spinlock.

Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Acked-by: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/inet_connection_sock.h |  3 ++
 include/net/inet_hashtables.h      | 68 +++++++++++++++++++++++++++++++++++++-
 include/net/sock.h                 | 14 ++++++++
 3 files changed, 84 insertions(+), 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index 85cd695e7fd1..077cd730ce2f 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -25,6 +25,7 @@
 #undef INET_CSK_CLEAR_TIMERS
 
 struct inet_bind_bucket;
+struct inet_bind2_bucket;
 struct tcp_congestion_ops;
 
 /*
@@ -57,6 +58,7 @@ struct inet_connection_sock_af_ops {
  *
  * @icsk_accept_queue:	   FIFO of established children
  * @icsk_bind_hash:	   Bind node
+ * @icsk_bind2_hash:	   Bind node in the bhash2 table
  * @icsk_timeout:	   Timeout
  * @icsk_retransmit_timer: Resend (no ack)
  * @icsk_rto:		   Retransmit timeout
@@ -83,6 +85,7 @@ struct inet_connection_sock {
 	struct inet_sock	  icsk_inet;
 	struct request_sock_queue icsk_accept_queue;
 	struct inet_bind_bucket	  *icsk_bind_hash;
+	struct inet_bind2_bucket  *icsk_bind2_hash;
 	unsigned long		  icsk_timeout;
  	struct timer_list	  icsk_retransmit_timer;
  	struct timer_list	  icsk_delack_timer;
diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index ebfa3df6f8dc..a0887b70967b 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -90,11 +90,32 @@ struct inet_bind_bucket {
 	struct hlist_head	owners;
 };
 
+struct inet_bind2_bucket {
+	possible_net_t		ib_net;
+	int			l3mdev;
+	unsigned short		port;
+	union {
+#if IS_ENABLED(CONFIG_IPV6)
+		struct in6_addr		v6_rcv_saddr;
+#endif
+		__be32			rcv_saddr;
+	};
+	/* Node in the inet2_bind_hashbucket chain */
+	struct hlist_node	node;
+	/* List of sockets hashed to this bucket */
+	struct hlist_head	owners;
+};
+
 static inline struct net *ib_net(struct inet_bind_bucket *ib)
 {
 	return read_pnet(&ib->ib_net);
 }
 
+static inline struct net *ib2_net(struct inet_bind2_bucket *ib)
+{
+	return read_pnet(&ib->ib_net);
+}
+
 #define inet_bind_bucket_for_each(tb, head) \
 	hlist_for_each_entry(tb, head, node)
 
@@ -103,6 +124,15 @@ struct inet_bind_hashbucket {
 	struct hlist_head	chain;
 };
 
+/* This is synchronized using the inet_bind_hashbucket's spinlock.
+ * Instead of having separate spinlocks, the inet_bind2_hashbucket can share
+ * the inet_bind_hashbucket's given that in every case where the bhash2 table
+ * is useful, a lookup in the bhash table also occurs.
+ */
+struct inet_bind2_hashbucket {
+	struct hlist_head	chain;
+};
+
 /* Sockets can be hashed in established or listening table.
  * We must use different 'nulls' end-of-chain value for all hash buckets :
  * A socket might transition from ESTABLISH to LISTEN state without
@@ -134,6 +164,12 @@ struct inet_hashinfo {
 	 */
 	struct kmem_cache		*bind_bucket_cachep;
 	struct inet_bind_hashbucket	*bhash;
+	/* The 2nd binding table hashed by port and address.
+	 * This is used primarily for expediting the resolution of bind
+	 * conflicts.
+	 */
+	struct kmem_cache		*bind2_bucket_cachep;
+	struct inet_bind2_hashbucket	*bhash2;
 	unsigned int			bhash_size;
 
 	/* The 2nd listener table hashed by local port and address */
@@ -193,6 +229,36 @@ inet_bind_bucket_create(struct kmem_cache *cachep, struct net *net,
 void inet_bind_bucket_destroy(struct kmem_cache *cachep,
 			      struct inet_bind_bucket *tb);
 
+static inline bool check_bind_bucket_match(struct inet_bind_bucket *tb,
+					   struct net *net,
+					   const unsigned short port,
+					   int l3mdev)
+{
+	return net_eq(ib_net(tb), net) && tb->port == port &&
+		tb->l3mdev == l3mdev;
+}
+
+struct inet_bind2_bucket *
+inet_bind2_bucket_create(struct kmem_cache *cachep, struct net *net,
+			 struct inet_bind2_hashbucket *head,
+			 const unsigned short port, int l3mdev,
+			 const struct sock *sk);
+
+void inet_bind2_bucket_destroy(struct kmem_cache *cachep,
+			       struct inet_bind2_bucket *tb);
+
+struct inet_bind2_bucket *
+inet_bind2_bucket_find(struct inet_hashinfo *hinfo, struct net *net,
+		       const unsigned short port, int l3mdev,
+		       struct sock *sk,
+		       struct inet_bind2_hashbucket **head);
+
+bool check_bind2_bucket_match_nulladdr(struct inet_bind2_bucket *tb,
+				       struct net *net,
+				       const unsigned short port,
+				       int l3mdev,
+				       const struct sock *sk);
+
 static inline u32 inet_bhashfn(const struct net *net, const __u16 lport,
 			       const u32 bhash_size)
 {
@@ -200,7 +266,7 @@ static inline u32 inet_bhashfn(const struct net *net, const __u16 lport,
 }
 
 void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
-		    const unsigned short snum);
+		    struct inet_bind2_bucket *tb2, const unsigned short snum);
 
 /* Caller must disable local BH processing. */
 int __inet_inherit_port(const struct sock *sk, struct sock *child);
diff --git a/include/net/sock.h b/include/net/sock.h
index 72ca97ccb460..c585ef6565d9 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -348,6 +348,7 @@ struct sk_filter;
   *	@sk_txtime_report_errors: set report errors mode for SO_TXTIME
   *	@sk_txtime_unused: unused txtime flags
   *	@ns_tracker: tracker for netns reference
+  *	@sk_bind2_node: bind node in the bhash2 table
   */
 struct sock {
 	/*
@@ -537,6 +538,7 @@ struct sock {
 #endif
 	struct rcu_head		sk_rcu;
 	netns_tracker		ns_tracker;
+	struct hlist_node	sk_bind2_node;
 };
 
 enum sk_pacing {
@@ -817,6 +819,16 @@ static inline void sk_add_bind_node(struct sock *sk,
 	hlist_add_head(&sk->sk_bind_node, list);
 }
 
+static inline void __sk_del_bind2_node(struct sock *sk)
+{
+	__hlist_del(&sk->sk_bind2_node);
+}
+
+static inline void sk_add_bind2_node(struct sock *sk, struct hlist_head *list)
+{
+	hlist_add_head(&sk->sk_bind2_node, list);
+}
+
 #define sk_for_each(__sk, list) \
 	hlist_for_each_entry(__sk, list, sk_node)
 #define sk_for_each_rcu(__sk, list) \
@@ -834,6 +846,8 @@ static inline void sk_add_bind_node(struct sock *sk,
 	hlist_for_each_entry_safe(__sk, tmp, list, sk_node)
 #define sk_for_each_bound(__sk, list) \
 	hlist_for_each_entry(__sk, list, sk_bind_node)
+#define sk_for_each_bound_bhash2(__sk, list) \
+	hlist_for_each_entry(__sk, list, sk_bind2_node)
 
 /**
  * sk_for_each_entry_offset_rcu - iterate over a list at a given struct offset
-- 
cgit v1.2.3


From c304eddcecfe2513ff98ce3ae97d1c196d82ba08 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Thu, 19 May 2022 13:20:54 -0700
Subject: net: wrap the wireless pointers in struct net_device in an ifdef

Most protocol-specific pointers in struct net_device are under
a respective ifdef. Wireless is the notable exception. Since
there's a sizable number of custom-built kernels for datacenter
workloads which don't build wireless it seems reasonable to
ifdefy those pointers as well.

While at it move IPv4 and IPv6 pointers up, those are special
for obvious reasons.

Acked-by: Johannes Berg <johannes@sipsolutions.net>
Acked-by: Stefan Schmidt <stefan@datenfreihafen.org> # ieee802154
Acked-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/cfg80211.h  | 2 ++
 include/net/cfg802154.h | 2 ++
 2 files changed, 4 insertions(+)

(limited to 'include/net')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index cc8a9880b9d6..6d02e12e4702 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -8006,7 +8006,9 @@ int cfg80211_register_netdevice(struct net_device *dev);
  */
 static inline void cfg80211_unregister_netdevice(struct net_device *dev)
 {
+#if IS_ENABLED(CONFIG_CFG80211)
 	cfg80211_unregister_wdev(dev->ieee80211_ptr);
+#endif
 }
 
 /**
diff --git a/include/net/cfg802154.h b/include/net/cfg802154.h
index 85f9e8417688..d8d8719315fd 100644
--- a/include/net/cfg802154.h
+++ b/include/net/cfg802154.h
@@ -373,6 +373,7 @@ struct wpan_dev {
 
 #define to_phy(_dev)	container_of(_dev, struct wpan_phy, dev)
 
+#if IS_ENABLED(CONFIG_IEEE802154) || IS_ENABLED(CONFIG_6LOWPAN)
 static inline int
 wpan_dev_hard_header(struct sk_buff *skb, struct net_device *dev,
 		     const struct ieee802154_addr *daddr,
@@ -383,6 +384,7 @@ wpan_dev_hard_header(struct sk_buff *skb, struct net_device *dev,
 
 	return wpan_dev->header_ops->create(skb, dev, daddr, saddr, len);
 }
+#endif
 
 struct wpan_phy *
 wpan_phy_new(const struct cfg802154_ops *ops, size_t priv_size);
-- 
cgit v1.2.3


From 4934609dda03ec90ca5052deecbe455b09a44e21 Mon Sep 17 00:00:00 2001
From: Taehee Yoo <ap420073@gmail.com>
Date: Mon, 23 May 2022 16:17:06 +0000
Subject: amt: fix typo in amt

AMT_MSG_TEARDOWM is defined,
But it should be AMT_MSG_TEARDOWN.

Fixes: b9022b53adad ("amt: add control plane of amt interface")
Signed-off-by: Taehee Yoo <ap420073@gmail.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/amt.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/amt.h b/include/net/amt.h
index 7a4db8b903ee..0e40c3d64fcf 100644
--- a/include/net/amt.h
+++ b/include/net/amt.h
@@ -15,7 +15,7 @@ enum amt_msg_type {
 	AMT_MSG_MEMBERSHIP_QUERY,
 	AMT_MSG_MEMBERSHIP_UPDATE,
 	AMT_MSG_MULTICAST_DATA,
-	AMT_MSG_TEARDOWM,
+	AMT_MSG_TEARDOWN,
 	__AMT_MSG_MAX,
 };
 
-- 
cgit v1.2.3


From a54ce3703613e41fe1d98060b62ec09a3984dc28 Mon Sep 17 00:00:00 2001
From: Vincent Ray <vray@kalrayinc.com>
Date: Wed, 25 May 2022 17:17:46 -0700
Subject: net: sched: fixed barrier to prevent skbuff sticking in qdisc backlog

In qdisc_run_begin(), smp_mb__before_atomic() used before test_bit()
does not provide any ordering guarantee as test_bit() is not an atomic
operation. This, added to the fact that the spin_trylock() call at
the beginning of qdisc_run_begin() does not guarantee acquire
semantics if it does not grab the lock, makes it possible for the
following statement :

if (test_bit(__QDISC_STATE_MISSED, &qdisc->state))

to be executed before an enqueue operation called before
qdisc_run_begin().

As a result the following race can happen :

           CPU 1                             CPU 2

      qdisc_run_begin()               qdisc_run_begin() /* true */
        set(MISSED)                            .
      /* returns false */                      .
          .                            /* sees MISSED = 1 */
          .                            /* so qdisc not empty */
          .                            __qdisc_run()
          .                                    .
          .                              pfifo_fast_dequeue()
 ----> /* may be done here */                  .
|         .                                clear(MISSED)
|         .                                    .
|         .                                smp_mb __after_atomic();
|         .                                    .
|         .                                /* recheck the queue */
|         .                                /* nothing => exit   */
|   enqueue(skb1)
|         .
|   qdisc_run_begin()
|         .
|     spin_trylock() /* fail */
|         .
|     smp_mb__before_atomic() /* not enough */
|         .
 ---- if (test_bit(MISSED))
        return false;   /* exit */

In the above scenario, CPU 1 and CPU 2 both try to grab the
qdisc->seqlock at the same time. Only CPU 2 succeeds and enters the
bypass code path, where it emits its skb then calls __qdisc_run().

CPU1 fails, sets MISSED and goes down the traditionnal enqueue() +
dequeue() code path. But when executing qdisc_run_begin() for the
second time, after enqueuing its skbuff, it sees the MISSED bit still
set (by itself) and consequently chooses to exit early without setting
it again nor trying to grab the spinlock again.

Meanwhile CPU2 has seen MISSED = 1, cleared it, checked the queue
and found it empty, so it returned.

At the end of the sequence, we end up with skb1 enqueued in the
backlog, both CPUs out of __dev_xmit_skb(), the MISSED bit not set,
and no __netif_schedule() called made. skb1 will now linger in the
qdisc until somebody later performs a full __qdisc_run(). Associated
to the bypass capacity of the qdisc, and the ability of the TCP layer
to avoid resending packets which it knows are still in the qdisc, this
can lead to serious traffic "holes" in a TCP connection.

We fix this by replacing the smp_mb__before_atomic() / test_bit() /
set_bit() / smp_mb__after_atomic() sequence inside qdisc_run_begin()
by a single test_and_set_bit() call, which is more concise and
enforces the needed memory barriers.

Fixes: 89837eb4b246 ("net: sched: add barrier to ensure correct ordering for lockless qdisc")
Signed-off-by: Vincent Ray <vray@kalrayinc.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Link: https://lore.kernel.org/r/20220526001746.2437669-1-eric.dumazet@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/sch_generic.h | 36 ++++++++----------------------------
 1 file changed, 8 insertions(+), 28 deletions(-)

(limited to 'include/net')

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 9bab396c1f3b..80973ce820f3 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -187,37 +187,17 @@ static inline bool qdisc_run_begin(struct Qdisc *qdisc)
 		if (spin_trylock(&qdisc->seqlock))
 			return true;
 
-		/* Paired with smp_mb__after_atomic() to make sure
-		 * STATE_MISSED checking is synchronized with clearing
-		 * in pfifo_fast_dequeue().
+		/* No need to insist if the MISSED flag was already set.
+		 * Note that test_and_set_bit() also gives us memory ordering
+		 * guarantees wrt potential earlier enqueue() and below
+		 * spin_trylock(), both of which are necessary to prevent races
 		 */
-		smp_mb__before_atomic();
-
-		/* If the MISSED flag is set, it means other thread has
-		 * set the MISSED flag before second spin_trylock(), so
-		 * we can return false here to avoid multi cpus doing
-		 * the set_bit() and second spin_trylock() concurrently.
-		 */
-		if (test_bit(__QDISC_STATE_MISSED, &qdisc->state))
+		if (test_and_set_bit(__QDISC_STATE_MISSED, &qdisc->state))
 			return false;
 
-		/* Set the MISSED flag before the second spin_trylock(),
-		 * if the second spin_trylock() return false, it means
-		 * other cpu holding the lock will do dequeuing for us
-		 * or it will see the MISSED flag set after releasing
-		 * lock and reschedule the net_tx_action() to do the
-		 * dequeuing.
-		 */
-		set_bit(__QDISC_STATE_MISSED, &qdisc->state);
-
-		/* spin_trylock() only has load-acquire semantic, so use
-		 * smp_mb__after_atomic() to ensure STATE_MISSED is set
-		 * before doing the second spin_trylock().
-		 */
-		smp_mb__after_atomic();
-
-		/* Retry again in case other CPU may not see the new flag
-		 * after it releases the lock at the end of qdisc_run_end().
+		/* Try to take the lock again to make sure that we will either
+		 * grab it or the CPU that still has it will see MISSED set
+		 * when testing it in qdisc_run_end()
 		 */
 		return spin_trylock(&qdisc->seqlock);
 	}
-- 
cgit v1.2.3


From 56b14ecec97f39118bf85c9ac2438c5a949509ed Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 20 May 2022 00:02:04 +0200
Subject: netfilter: conntrack: re-fetch conntrack after insertion

In case the conntrack is clashing, insertion can free skb->_nfct and
set skb->_nfct to the already-confirmed entry.

This wasn't found before because the conntrack entry and the extension
space used to free'd after an rcu grace period, plus the race needs
events enabled to trigger.

Reported-by: <syzbot+793a590957d9c1b96620@syzkaller.appspotmail.com>
Fixes: 71d8c47fc653 ("netfilter: conntrack: introduce clash resolution on insertion race")
Fixes: 2ad9d7747c10 ("netfilter: conntrack: free extension area immediately")
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_conntrack_core.h | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/netfilter/nf_conntrack_core.h b/include/net/netfilter/nf_conntrack_core.h
index 6406cfee34c2..37866c8386e2 100644
--- a/include/net/netfilter/nf_conntrack_core.h
+++ b/include/net/netfilter/nf_conntrack_core.h
@@ -58,8 +58,13 @@ static inline int nf_conntrack_confirm(struct sk_buff *skb)
 	int ret = NF_ACCEPT;
 
 	if (ct) {
-		if (!nf_ct_is_confirmed(ct))
+		if (!nf_ct_is_confirmed(ct)) {
 			ret = __nf_conntrack_confirm(skb);
+
+			if (ret == NF_ACCEPT)
+				ct = (struct nf_conn *)skb_nfct(skb);
+		}
+
 		if (ret == NF_ACCEPT && nf_ct_ecache_exist(ct))
 			nf_ct_deliver_cached_events(ct);
 	}
-- 
cgit v1.2.3


From 2e8728c955ce0624b958eee6e030a37aca3a5d86 Mon Sep 17 00:00:00 2001
From: Guoju Fang <gjfang@linux.alibaba.com>
Date: Sat, 28 May 2022 18:16:28 +0800
Subject: net: sched: add barrier to fix packet stuck problem for lockless
 qdisc

In qdisc_run_end(), the spin_unlock() only has store-release semantic,
which guarantees all earlier memory access are visible before it. But
the subsequent test_bit() has no barrier semantics so may be reordered
ahead of the spin_unlock(). The store-load reordering may cause a packet
stuck problem.

The concurrent operations can be described as below,
         CPU 0                      |          CPU 1
   qdisc_run_end()                  |     qdisc_run_begin()
          .                         |           .
 ----> /* may be reorderd here */   |           .
|         .                         |           .
|     spin_unlock()                 |         set_bit()
|         .                         |         smp_mb__after_atomic()
 ---- test_bit()                    |         spin_trylock()
          .                         |          .

Consider the following sequence of events:
    CPU 0 reorder test_bit() ahead and see MISSED = 0
    CPU 1 calls set_bit()
    CPU 1 calls spin_trylock() and return fail
    CPU 0 executes spin_unlock()

At the end of the sequence, CPU 0 calls spin_unlock() and does nothing
because it see MISSED = 0. The skb on CPU 1 has beed enqueued but no one
take it, until the next cpu pushing to the qdisc (if ever ...) will
notice and dequeue it.

This patch fix this by adding one explicit barrier. As spin_unlock() and
test_bit() ordering is a store-load ordering, a full memory barrier
smp_mb() is needed here.

Fixes: a90c57f2cedd ("net: sched: fix packet stuck problem for lockless qdisc")
Signed-off-by: Guoju Fang <gjfang@linux.alibaba.com>
Link: https://lore.kernel.org/r/20220528101628.120193-1-gjfang@linux.alibaba.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/sch_generic.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/net')

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 80973ce820f3..d6cf5116b5f9 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -209,6 +209,12 @@ static inline void qdisc_run_end(struct Qdisc *qdisc)
 	if (qdisc->flags & TCQ_F_NOLOCK) {
 		spin_unlock(&qdisc->seqlock);
 
+		/* spin_unlock() only has store-release semantic. The unlock
+		 * and test_bit() ordering is a store-load ordering, so a full
+		 * memory barrier is needed here.
+		 */
+		smp_mb();
+
 		if (unlikely(test_bit(__QDISC_STATE_MISSED,
 				      &qdisc->state)))
 			__netif_schedule(qdisc);
-- 
cgit v1.2.3


From c4caa500ffebf64795d1c0f6f9d6f179b502c6b7 Mon Sep 17 00:00:00 2001
From: Hangbin Liu <liuhangbin@gmail.com>
Date: Tue, 31 May 2022 14:37:27 +0800
Subject: bonding: guard ns_targets by CONFIG_IPV6

Guard ns_targets in struct bond_params by CONFIG_IPV6, which could save
256 bytes if IPv6 not configed. Also add this protection for function
bond_is_ip6_target_ok() and bond_get_targets_ip6().

Remove the IS_ENABLED() check for bond_opts[] as this will make
BOND_OPT_NS_TARGETS uninitialized if CONFIG_IPV6 not enabled. Add
a dummy bond_option_ns_ip6_targets_set() for this situation.

Fixes: 4e24be018eb9 ("bonding: add new parameter ns_targets")
Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
Acked-by: Jonathan Toppins <jtoppins@redhat.com>
Link: https://lore.kernel.org/r/20220531063727.224043-1-liuhangbin@gmail.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/net/bonding.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/net')

diff --git a/include/net/bonding.h b/include/net/bonding.h
index b14f4c0b4e9e..cb904d356e31 100644
--- a/include/net/bonding.h
+++ b/include/net/bonding.h
@@ -149,7 +149,9 @@ struct bond_params {
 	struct reciprocal_value reciprocal_packets_per_slave;
 	u16 ad_actor_sys_prio;
 	u16 ad_user_port_key;
+#if IS_ENABLED(CONFIG_IPV6)
 	struct in6_addr ns_targets[BOND_MAX_NS_TARGETS];
+#endif
 
 	/* 2 bytes of padding : see ether_addr_equal_64bits() */
 	u8 ad_actor_system[ETH_ALEN + 2];
@@ -503,12 +505,14 @@ static inline int bond_is_ip_target_ok(__be32 addr)
 	return !ipv4_is_lbcast(addr) && !ipv4_is_zeronet(addr);
 }
 
+#if IS_ENABLED(CONFIG_IPV6)
 static inline int bond_is_ip6_target_ok(struct in6_addr *addr)
 {
 	return !ipv6_addr_any(addr) &&
 	       !ipv6_addr_loopback(addr) &&
 	       !ipv6_addr_is_multicast(addr);
 }
+#endif
 
 /* Get the oldest arp which we've received on this slave for bond's
  * arp_targets.
@@ -746,6 +750,7 @@ static inline int bond_get_targets_ip(__be32 *targets, __be32 ip)
 	return -1;
 }
 
+#if IS_ENABLED(CONFIG_IPV6)
 static inline int bond_get_targets_ip6(struct in6_addr *targets, struct in6_addr *ip)
 {
 	int i;
@@ -758,6 +763,7 @@ static inline int bond_get_targets_ip6(struct in6_addr *targets, struct in6_addr
 
 	return -1;
 }
+#endif
 
 /* exported from bond_main.c */
 extern unsigned int bond_net_id;
-- 
cgit v1.2.3


From b6d9014a3335194590abdd2a2471ef5147a67645 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Mon, 30 May 2022 18:40:06 +0200
Subject: netfilter: nf_tables: delete flowtable hooks via transaction list

Remove inactive bool field in nft_hook object that was introduced in
abadb2f865d7 ("netfilter: nf_tables: delete devices from flowtable").
Move stale flowtable hooks to transaction list instead.

Deleting twice the same device does not result in ENOENT.

Fixes: abadb2f865d7 ("netfilter: nf_tables: delete devices from flowtable")
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 20af9d3557b9..279ae0fff7ad 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -1090,7 +1090,6 @@ struct nft_stats {
 
 struct nft_hook {
 	struct list_head	list;
-	bool			inactive;
 	struct nf_hook_ops	ops;
 	struct rcu_head		rcu;
 };
-- 
cgit v1.2.3


From 7d8a3a477b3e25ada8dc71d22048c2ea417209a0 Mon Sep 17 00:00:00 2001
From: Duoming Zhou <duoming@zju.edu.cn>
Date: Mon, 30 May 2022 23:21:58 +0800
Subject: ax25: Fix ax25 session cleanup problems

There are session cleanup problems in ax25_release() and
ax25_disconnect(). If we setup a session and then disconnect,
the disconnected session is still in "LISTENING" state that
is shown below.

Active AX.25 sockets
Dest       Source     Device  State        Vr/Vs    Send-Q  Recv-Q
DL9SAU-4   DL9SAU-3   ???     LISTENING    000/000  0       0
DL9SAU-3   DL9SAU-4   ???     LISTENING    000/000  0       0

The first reason is caused by del_timer_sync() in ax25_release().
The timers of ax25 are used for correct session cleanup. If we use
ax25_release() to close ax25 sessions and ax25_dev is not null,
the del_timer_sync() functions in ax25_release() will execute.
As a result, the sessions could not be cleaned up correctly,
because the timers have stopped.

In order to solve this problem, this patch adds a device_up flag
in ax25_dev in order to judge whether the device is up. If there
are sessions to be cleaned up, the del_timer_sync() in
ax25_release() will not execute. What's more, we add ax25_cb_del()
in ax25_kill_by_device(), because the timers have been stopped
and there are no functions that could delete ax25_cb if we do not
call ax25_release(). Finally, we reorder the position of
ax25_list_lock in ax25_cb_del() in order to synchronize among
different functions that call ax25_cb_del().

The second reason is caused by improper check in ax25_disconnect().
The incoming ax25 sessions which ax25->sk is null will close
heartbeat timer, because the check "if(!ax25->sk || ..)" is
satisfied. As a result, the session could not be cleaned up properly.

In order to solve this problem, this patch changes the improper
check to "if(ax25->sk && ..)" in ax25_disconnect().

What`s more, the ax25_disconnect() may be called twice, which is
not necessary. For example, ax25_kill_by_device() calls
ax25_disconnect() and sets ax25->state to AX25_STATE_0, but
ax25_release() calls ax25_disconnect() again.

In order to solve this problem, this patch add a check in
ax25_release(). If the flag of ax25->sk equals to SOCK_DEAD,
the ax25_disconnect() in ax25_release() should not be executed.

Fixes: 82e31755e55f ("ax25: Fix UAF bugs in ax25 timers")
Fixes: 8a367e74c012 ("ax25: Fix segfault after sock connection timeout")
Reported-and-tested-by: Thomas Osterried <thomas@osterried.de>
Signed-off-by: Duoming Zhou <duoming@zju.edu.cn>
Link: https://lore.kernel.org/r/20220530152158.108619-1-duoming@zju.edu.cn
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/net/ax25.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/net')

diff --git a/include/net/ax25.h b/include/net/ax25.h
index 0f9790c455bb..a427a05672e2 100644
--- a/include/net/ax25.h
+++ b/include/net/ax25.h
@@ -228,6 +228,7 @@ typedef struct ax25_dev {
 	ax25_dama_info		dama;
 #endif
 	refcount_t		refcount;
+	bool device_up;
 } ax25_dev;
 
 typedef struct ax25_cb {
-- 
cgit v1.2.3


From e1cff7002b716bd0b5f5f4afd4273c99aa8644be Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Sun, 5 Jun 2022 11:51:48 -0700
Subject: bluetooth: don't use bitmaps for random flag accesses

The bluetooth code uses our bitmap infrastructure for the two bits (!)
of connection setup flags, and in the process causes odd problems when
it converts between a bitmap and just the regular values of said bits.

It's completely pointless to do things like bitmap_to_arr32() to convert
a bitmap into a u32.  It shoudln't have been a bitmap in the first
place.  The reason to use bitmaps is if you have arbitrary number of
bits you want to manage (not two!), or if you rely on the atomicity
guarantees of the bitmap setting and clearing.

The code could use an "atomic_t" and use "atomic_or/andnot()" to set and
clear the bit values, but considering that it then copies the bitmaps
around with "bitmap_to_arr32()" and friends, there clearly cannot be a
lot of atomicity requirements.

So just use a regular integer.

In the process, this avoids the warnings about erroneous use of
bitmap_from_u64() which were triggered on 32-bit architectures when
conversion from a u64 would access two words (and, surprise, surprise,
only one word is needed - and indeed overkill - for a 2-bit bitmap).

That was always problematic, but the compiler seems to notice it and
warn about the invalid pattern only after commit 0a97953fd221 ("lib: add
bitmap_{from,to}_arr64") changed the exact implementation details of
'bitmap_from_u64()', as reported by Sudip Mukherjee and Stephen Rothwell.

Fixes: fe92ee6425a2 ("Bluetooth: hci_core: Rework hci_conn_params flags")
Link: https://lore.kernel.org/all/YpyJ9qTNHJzz0FHY@debian/
Link: https://lore.kernel.org/all/20220606080631.0c3014f2@canb.auug.org.au/
Link: https://lore.kernel.org/all/20220605162537.1604762-1-yury.norov@gmail.com/
Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Reported-by: Sudip Mukherjee <sudipm.mukherjee@gmail.com>
Reviewed-by: Yury Norov <yury.norov@gmail.com>
Cc: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
Cc: Marcel Holtmann <marcel@holtmann.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/net/bluetooth/hci_core.h | 17 +++++++----------
 1 file changed, 7 insertions(+), 10 deletions(-)

(limited to 'include/net')

diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h
index 5a52a2018b56..c0ea2a4892b1 100644
--- a/include/net/bluetooth/hci_core.h
+++ b/include/net/bluetooth/hci_core.h
@@ -155,21 +155,18 @@ struct bdaddr_list_with_irk {
 	u8 local_irk[16];
 };
 
+/* Bitmask of connection flags */
 enum hci_conn_flags {
-	HCI_CONN_FLAG_REMOTE_WAKEUP,
-	HCI_CONN_FLAG_DEVICE_PRIVACY,
-
-	__HCI_CONN_NUM_FLAGS,
+	HCI_CONN_FLAG_REMOTE_WAKEUP = 1,
+	HCI_CONN_FLAG_DEVICE_PRIVACY = 2,
 };
-
-/* Make sure number of flags doesn't exceed sizeof(current_flags) */
-static_assert(__HCI_CONN_NUM_FLAGS < 32);
+typedef u8 hci_conn_flags_t;
 
 struct bdaddr_list_with_flags {
 	struct list_head list;
 	bdaddr_t bdaddr;
 	u8 bdaddr_type;
-	DECLARE_BITMAP(flags, __HCI_CONN_NUM_FLAGS);
+	hci_conn_flags_t flags;
 };
 
 struct bt_uuid {
@@ -576,7 +573,7 @@ struct hci_dev {
 	struct rfkill		*rfkill;
 
 	DECLARE_BITMAP(dev_flags, __HCI_NUM_FLAGS);
-	DECLARE_BITMAP(conn_flags, __HCI_CONN_NUM_FLAGS);
+	hci_conn_flags_t	conn_flags;
 
 	__s8			adv_tx_power;
 	__u8			adv_data[HCI_MAX_EXT_AD_LENGTH];
@@ -775,7 +772,7 @@ struct hci_conn_params {
 
 	struct hci_conn *conn;
 	bool explicit_connect;
-	DECLARE_BITMAP(flags, __HCI_CONN_NUM_FLAGS);
+	hci_conn_flags_t flags;
 	u8  privacy_mode;
 };
 
-- 
cgit v1.2.3


From 3a41c64d9c1185a2f3a184015e2a9b78bfc99c71 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Mon, 6 Jun 2022 17:31:29 +0200
Subject: netfilter: nf_tables: bail out early if hardware offload is not
 supported

If user requests for NFT_CHAIN_HW_OFFLOAD, then check if either device
provides the .ndo_setup_tc interface or there is an indirect flow block
that has been registered. Otherwise, bail out early from the preparation
phase. Moreover, validate that family == NFPROTO_NETDEV and hook is
NF_NETDEV_INGRESS.

Fixes: c9626a2cbdb2 ("netfilter: nf_tables: add hardware offload support")
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/flow_offload.h                | 1 +
 include/net/netfilter/nf_tables_offload.h | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/flow_offload.h b/include/net/flow_offload.h
index 021778a7e1af..6484095a8c01 100644
--- a/include/net/flow_offload.h
+++ b/include/net/flow_offload.h
@@ -612,5 +612,6 @@ int flow_indr_dev_setup_offload(struct net_device *dev, struct Qdisc *sch,
 				enum tc_setup_type type, void *data,
 				struct flow_block_offload *bo,
 				void (*cleanup)(struct flow_block_cb *block_cb));
+bool flow_indr_dev_exists(void);
 
 #endif /* _NET_FLOW_OFFLOAD_H */
diff --git a/include/net/netfilter/nf_tables_offload.h b/include/net/netfilter/nf_tables_offload.h
index 797147843958..3568b6a2f5f0 100644
--- a/include/net/netfilter/nf_tables_offload.h
+++ b/include/net/netfilter/nf_tables_offload.h
@@ -92,7 +92,7 @@ int nft_flow_rule_offload_commit(struct net *net);
 	NFT_OFFLOAD_MATCH(__key, __base, __field, __len, __reg)		\
 	memset(&(__reg)->mask, 0xff, (__reg)->len);
 
-int nft_chain_offload_priority(struct nft_base_chain *basechain);
+bool nft_chain_offload_support(const struct nft_base_chain *basechain);
 
 int nft_offload_init(void);
 void nft_offload_exit(void);
-- 
cgit v1.2.3


From f93431c86b631bbca5614c66f966bf3ddb3c2803 Mon Sep 17 00:00:00 2001
From: Wang Yufen <wangyufen@huawei.com>
Date: Tue, 7 Jun 2022 20:00:27 +0800
Subject: ipv6: Fix signed integer overflow in __ip6_append_data

Resurrect ubsan overflow checks and ubsan report this warning,
fix it by change the variable [length] type to size_t.

UBSAN: signed-integer-overflow in net/ipv6/ip6_output.c:1489:19
2147479552 + 8567 cannot be represented in type 'int'
CPU: 0 PID: 253 Comm: err Not tainted 5.16.0+ #1
Hardware name: linux,dummy-virt (DT)
Call trace:
  dump_backtrace+0x214/0x230
  show_stack+0x30/0x78
  dump_stack_lvl+0xf8/0x118
  dump_stack+0x18/0x30
  ubsan_epilogue+0x18/0x60
  handle_overflow+0xd0/0xf0
  __ubsan_handle_add_overflow+0x34/0x44
  __ip6_append_data.isra.48+0x1598/0x1688
  ip6_append_data+0x128/0x260
  udpv6_sendmsg+0x680/0xdd0
  inet6_sendmsg+0x54/0x90
  sock_sendmsg+0x70/0x88
  ____sys_sendmsg+0xe8/0x368
  ___sys_sendmsg+0x98/0xe0
  __sys_sendmmsg+0xf4/0x3b8
  __arm64_sys_sendmmsg+0x34/0x48
  invoke_syscall+0x64/0x160
  el0_svc_common.constprop.4+0x124/0x300
  do_el0_svc+0x44/0xc8
  el0_svc+0x3c/0x1e8
  el0t_64_sync_handler+0x88/0xb0
  el0t_64_sync+0x16c/0x170

Changes since v1:
-Change the variable [length] type to unsigned, as Eric Dumazet suggested.
Changes since v2:
-Don't change exthdrlen type in ip6_make_skb, as Paolo Abeni suggested.
Changes since v3:
-Don't change ulen type in udpv6_sendmsg and l2tp_ip6_sendmsg, as
Jakub Kicinski suggested.

Reported-by: Hulk Robot <hulkci@huawei.com>
Signed-off-by: Wang Yufen <wangyufen@huawei.com>
Link: https://lore.kernel.org/r/20220607120028.845916-1-wangyufen@huawei.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/ipv6.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/net')

diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 5b38bf1a586b..de9dcc5652c4 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -1063,7 +1063,7 @@ int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr);
 int ip6_append_data(struct sock *sk,
 		    int getfrag(void *from, char *to, int offset, int len,
 				int odd, struct sk_buff *skb),
-		    void *from, int length, int transhdrlen,
+		    void *from, size_t length, int transhdrlen,
 		    struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
 		    struct rt6_info *rt, unsigned int flags);
 
@@ -1079,7 +1079,7 @@ struct sk_buff *__ip6_make_skb(struct sock *sk, struct sk_buff_head *queue,
 struct sk_buff *ip6_make_skb(struct sock *sk,
 			     int getfrag(void *from, char *to, int offset,
 					 int len, int odd, struct sk_buff *skb),
-			     void *from, int length, int transhdrlen,
+			     void *from, size_t length, int transhdrlen,
 			     struct ipcm6_cookie *ipc6,
 			     struct rt6_info *rt, unsigned int flags,
 			     struct inet_cork_full *cork);
-- 
cgit v1.2.3


From 593d1ebe00a45af5cb7bda1235c0790987c2a2b2 Mon Sep 17 00:00:00 2001
From: Joanne Koong <joannelkoong@gmail.com>
Date: Wed, 15 Jun 2022 12:32:13 -0700
Subject: Revert "net: Add a second bind table hashed by port and address"

This reverts:

commit d5a42de8bdbe ("net: Add a second bind table hashed by port and address")
commit 538aaf9b2383 ("selftests: Add test for timing a bind request to a port with a populated bhash entry")
Link: https://lore.kernel.org/netdev/20220520001834.2247810-1-kuba@kernel.org/

There are a few things that need to be fixed here:
* Updating bhash2 in cases where the socket's rcv saddr changes
* Adding bhash2 hashbucket locks

Links to syzbot reports:
https://lore.kernel.org/netdev/00000000000022208805e0df247a@google.com/
https://lore.kernel.org/netdev/0000000000003f33bc05dfaf44fe@google.com/

Fixes: d5a42de8bdbe ("net: Add a second bind table hashed by port and address")
Reported-by: syzbot+015d756bbd1f8b5c8f09@syzkaller.appspotmail.com
Reported-by: syzbot+98fd2d1422063b0f8c44@syzkaller.appspotmail.com
Reported-by: syzbot+0a847a982613c6438fba@syzkaller.appspotmail.com
Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
Link: https://lore.kernel.org/r/20220615193213.2419568-1-joannelkoong@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/inet_connection_sock.h |  3 --
 include/net/inet_hashtables.h      | 68 +-------------------------------------
 include/net/sock.h                 | 14 --------
 3 files changed, 1 insertion(+), 84 deletions(-)

(limited to 'include/net')

diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index 077cd730ce2f..85cd695e7fd1 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -25,7 +25,6 @@
 #undef INET_CSK_CLEAR_TIMERS
 
 struct inet_bind_bucket;
-struct inet_bind2_bucket;
 struct tcp_congestion_ops;
 
 /*
@@ -58,7 +57,6 @@ struct inet_connection_sock_af_ops {
  *
  * @icsk_accept_queue:	   FIFO of established children
  * @icsk_bind_hash:	   Bind node
- * @icsk_bind2_hash:	   Bind node in the bhash2 table
  * @icsk_timeout:	   Timeout
  * @icsk_retransmit_timer: Resend (no ack)
  * @icsk_rto:		   Retransmit timeout
@@ -85,7 +83,6 @@ struct inet_connection_sock {
 	struct inet_sock	  icsk_inet;
 	struct request_sock_queue icsk_accept_queue;
 	struct inet_bind_bucket	  *icsk_bind_hash;
-	struct inet_bind2_bucket  *icsk_bind2_hash;
 	unsigned long		  icsk_timeout;
  	struct timer_list	  icsk_retransmit_timer;
  	struct timer_list	  icsk_delack_timer;
diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index a0887b70967b..ebfa3df6f8dc 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -90,32 +90,11 @@ struct inet_bind_bucket {
 	struct hlist_head	owners;
 };
 
-struct inet_bind2_bucket {
-	possible_net_t		ib_net;
-	int			l3mdev;
-	unsigned short		port;
-	union {
-#if IS_ENABLED(CONFIG_IPV6)
-		struct in6_addr		v6_rcv_saddr;
-#endif
-		__be32			rcv_saddr;
-	};
-	/* Node in the inet2_bind_hashbucket chain */
-	struct hlist_node	node;
-	/* List of sockets hashed to this bucket */
-	struct hlist_head	owners;
-};
-
 static inline struct net *ib_net(struct inet_bind_bucket *ib)
 {
 	return read_pnet(&ib->ib_net);
 }
 
-static inline struct net *ib2_net(struct inet_bind2_bucket *ib)
-{
-	return read_pnet(&ib->ib_net);
-}
-
 #define inet_bind_bucket_for_each(tb, head) \
 	hlist_for_each_entry(tb, head, node)
 
@@ -124,15 +103,6 @@ struct inet_bind_hashbucket {
 	struct hlist_head	chain;
 };
 
-/* This is synchronized using the inet_bind_hashbucket's spinlock.
- * Instead of having separate spinlocks, the inet_bind2_hashbucket can share
- * the inet_bind_hashbucket's given that in every case where the bhash2 table
- * is useful, a lookup in the bhash table also occurs.
- */
-struct inet_bind2_hashbucket {
-	struct hlist_head	chain;
-};
-
 /* Sockets can be hashed in established or listening table.
  * We must use different 'nulls' end-of-chain value for all hash buckets :
  * A socket might transition from ESTABLISH to LISTEN state without
@@ -164,12 +134,6 @@ struct inet_hashinfo {
 	 */
 	struct kmem_cache		*bind_bucket_cachep;
 	struct inet_bind_hashbucket	*bhash;
-	/* The 2nd binding table hashed by port and address.
-	 * This is used primarily for expediting the resolution of bind
-	 * conflicts.
-	 */
-	struct kmem_cache		*bind2_bucket_cachep;
-	struct inet_bind2_hashbucket	*bhash2;
 	unsigned int			bhash_size;
 
 	/* The 2nd listener table hashed by local port and address */
@@ -229,36 +193,6 @@ inet_bind_bucket_create(struct kmem_cache *cachep, struct net *net,
 void inet_bind_bucket_destroy(struct kmem_cache *cachep,
 			      struct inet_bind_bucket *tb);
 
-static inline bool check_bind_bucket_match(struct inet_bind_bucket *tb,
-					   struct net *net,
-					   const unsigned short port,
-					   int l3mdev)
-{
-	return net_eq(ib_net(tb), net) && tb->port == port &&
-		tb->l3mdev == l3mdev;
-}
-
-struct inet_bind2_bucket *
-inet_bind2_bucket_create(struct kmem_cache *cachep, struct net *net,
-			 struct inet_bind2_hashbucket *head,
-			 const unsigned short port, int l3mdev,
-			 const struct sock *sk);
-
-void inet_bind2_bucket_destroy(struct kmem_cache *cachep,
-			       struct inet_bind2_bucket *tb);
-
-struct inet_bind2_bucket *
-inet_bind2_bucket_find(struct inet_hashinfo *hinfo, struct net *net,
-		       const unsigned short port, int l3mdev,
-		       struct sock *sk,
-		       struct inet_bind2_hashbucket **head);
-
-bool check_bind2_bucket_match_nulladdr(struct inet_bind2_bucket *tb,
-				       struct net *net,
-				       const unsigned short port,
-				       int l3mdev,
-				       const struct sock *sk);
-
 static inline u32 inet_bhashfn(const struct net *net, const __u16 lport,
 			       const u32 bhash_size)
 {
@@ -266,7 +200,7 @@ static inline u32 inet_bhashfn(const struct net *net, const __u16 lport,
 }
 
 void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
-		    struct inet_bind2_bucket *tb2, const unsigned short snum);
+		    const unsigned short snum);
 
 /* Caller must disable local BH processing. */
 int __inet_inherit_port(const struct sock *sk, struct sock *child);
diff --git a/include/net/sock.h b/include/net/sock.h
index c585ef6565d9..72ca97ccb460 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -348,7 +348,6 @@ struct sk_filter;
   *	@sk_txtime_report_errors: set report errors mode for SO_TXTIME
   *	@sk_txtime_unused: unused txtime flags
   *	@ns_tracker: tracker for netns reference
-  *	@sk_bind2_node: bind node in the bhash2 table
   */
 struct sock {
 	/*
@@ -538,7 +537,6 @@ struct sock {
 #endif
 	struct rcu_head		sk_rcu;
 	netns_tracker		ns_tracker;
-	struct hlist_node	sk_bind2_node;
 };
 
 enum sk_pacing {
@@ -819,16 +817,6 @@ static inline void sk_add_bind_node(struct sock *sk,
 	hlist_add_head(&sk->sk_bind_node, list);
 }
 
-static inline void __sk_del_bind2_node(struct sock *sk)
-{
-	__hlist_del(&sk->sk_bind2_node);
-}
-
-static inline void sk_add_bind2_node(struct sock *sk, struct hlist_head *list)
-{
-	hlist_add_head(&sk->sk_bind2_node, list);
-}
-
 #define sk_for_each(__sk, list) \
 	hlist_for_each_entry(__sk, list, sk_node)
 #define sk_for_each_rcu(__sk, list) \
@@ -846,8 +834,6 @@ static inline void sk_add_bind2_node(struct sock *sk, struct hlist_head *list)
 	hlist_for_each_entry_safe(__sk, tmp, list, sk_node)
 #define sk_for_each_bound(__sk, list) \
 	hlist_for_each_entry(__sk, list, sk_bind_node)
-#define sk_for_each_bound_bhash2(__sk, list) \
-	hlist_for_each_entry(__sk, list, sk_bind2_node)
 
 /**
  * sk_for_each_entry_offset_rcu - iterate over a list at a given struct offset
-- 
cgit v1.2.3


From e34a07c0ae3906f97eb18df50902e2a01c1015b6 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Mon, 20 Jun 2022 12:13:53 -0700
Subject: sock: redo the psock vs ULP protection check

Commit 8a59f9d1e3d4 ("sock: Introduce sk->sk_prot->psock_update_sk_prot()")
has moved the inet_csk_has_ulp(sk) check from sk_psock_init() to
the new tcp_bpf_update_proto() function. I'm guessing that this
was done to allow creating psocks for non-inet sockets.

Unfortunately the destruction path for psock includes the ULP
unwind, so we need to fail the sk_psock_init() itself.
Otherwise if ULP is already present we'll notice that later,
and call tcp_update_ulp() with the sk_proto of the ULP
itself, which will most likely result in the ULP looping
its callbacks.

Fixes: 8a59f9d1e3d4 ("sock: Introduce sk->sk_prot->psock_update_sk_prot()")
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Reviewed-by: John Fastabend <john.fastabend@gmail.com>
Reviewed-by: Jakub Sitnicki <jakub@cloudflare.com>
Tested-by: Jakub Sitnicki <jakub@cloudflare.com>
Link: https://lore.kernel.org/r/20220620191353.1184629-2-kuba@kernel.org
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/net/inet_sock.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/net')

diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index c1b5dcd6597c..daead5fb389a 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -253,6 +253,11 @@ struct inet_sock {
 #define IP_CMSG_CHECKSUM	BIT(7)
 #define IP_CMSG_RECVFRAGSIZE	BIT(8)
 
+static inline bool sk_is_inet(struct sock *sk)
+{
+	return sk->sk_family == AF_INET || sk->sk_family == AF_INET6;
+}
+
 /**
  * sk_to_full_sk - Access to a full socket
  * @sk: pointer to a socket
-- 
cgit v1.2.3


From e34b9ed96ce3b06c79bf884009b16961ca478f87 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Wed, 22 Jun 2022 16:43:57 +0200
Subject: netfilter: nf_tables: avoid skb access on nf_stolen

When verdict is NF_STOLEN, the skb might have been freed.

When tracing is enabled, this can result in a use-after-free:
1. access to skb->nf_trace
2. access to skb->mark
3. computation of trace id
4. dump of packet payload

To avoid 1, keep a cached copy of skb->nf_trace in the
trace state struct.
Refresh this copy whenever verdict is != STOLEN.

Avoid 2 by skipping skb->mark access if verdict is STOLEN.

3 is avoided by precomputing the trace id.

Only dump the packet when verdict is not "STOLEN".

Reported-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

(limited to 'include/net')

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 279ae0fff7ad..5c4e5a96a984 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -1338,24 +1338,28 @@ void nft_unregister_flowtable_type(struct nf_flowtable_type *type);
 /**
  *	struct nft_traceinfo - nft tracing information and state
  *
+ *	@trace: other struct members are initialised
+ *	@nf_trace: copy of skb->nf_trace before rule evaluation
+ *	@type: event type (enum nft_trace_types)
+ *	@skbid: hash of skb to be used as trace id
+ *	@packet_dumped: packet headers sent in a previous traceinfo message
  *	@pkt: pktinfo currently processed
  *	@basechain: base chain currently processed
  *	@chain: chain currently processed
  *	@rule:  rule that was evaluated
  *	@verdict: verdict given by rule
- *	@type: event type (enum nft_trace_types)
- *	@packet_dumped: packet headers sent in a previous traceinfo message
- *	@trace: other struct members are initialised
  */
 struct nft_traceinfo {
+	bool				trace;
+	bool				nf_trace;
+	bool				packet_dumped;
+	enum nft_trace_types		type:8;
+	u32				skbid;
 	const struct nft_pktinfo	*pkt;
 	const struct nft_base_chain	*basechain;
 	const struct nft_chain		*chain;
 	const struct nft_rule_dp	*rule;
 	const struct nft_verdict	*verdict;
-	enum nft_trace_types		type;
-	bool				packet_dumped;
-	bool				trace;
 };
 
 void nft_trace_init(struct nft_traceinfo *info, const struct nft_pktinfo *pkt,
-- 
cgit v1.2.3