From f4dca95fc0f6350918f2e6727e35b41f7f86fcce Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 23 May 2024 13:05:27 +0000
Subject: tcp: reduce accepted window in NEW_SYN_RECV state

Jason commit made checks against ACK sequence less strict
and can be exploited by attackers to establish spoofed flows
with less probes.

Innocent users might use tcp_rmem[1] == 1,000,000,000,
or something more reasonable.

An attacker can use a regular TCP connection to learn the server
initial tp->rcv_wnd, and use it to optimize the attack.

If we make sure that only the announced window (smaller than 65535)
is used for ACK validation, we force an attacker to use
65537 packets to complete the 3WHS (assuming server ISN is unknown)

Fixes: 378979e94e95 ("tcp: remove 64 KByte limit for initial tp->rcv_wnd value")
Link: https://datatracker.ietf.org/meeting/119/materials/slides-119-tcpm-ghost-acks-00
Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Reviewed-by: Jason Xing <kerneljasonxing@gmail.com>
Link: https://lore.kernel.org/r/20240523130528.60376-1-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/request_sock.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'include/net')
diff --git a/include/net/request_sock.h b/include/net/request_sock.h
index d88c0dfc2d46..ebcb8896bffc 100644
--- a/include/net/request_sock.h
+++ b/include/net/request_sock.h
@@ -285,4 +285,16 @@ static inline int reqsk_queue_len_young(const struct request_sock_queue *queue)
 	return atomic_read(&queue->young);
 }
 
+/* RFC 7323 2.3 Using the Window Scale Option
+ *  The window field (SEG.WND) of every outgoing segment, with the
+ *  exception of <SYN> segments, MUST be right-shifted by
+ *  Rcv.Wind.Shift bits.
+ *
+ * This means the SEG.WND carried in SYNACK can not exceed 65535.
+ * We use this property to harden TCP stack while in NEW_SYN_RECV state.
+ */
+static inline u32 tcp_synack_window(const struct request_sock *req)
+{
+	return min(req->rsk_rcv_wnd, 65535U);
+}
 #endif /* _REQUEST_SOCK_H */
-- 
cgit v1.2.3


From 266aa3b4812e97942a8ce5c7aafa7da059f7b5b8 Mon Sep 17 00:00:00 2001
From: Alexander Lobakin <aleksander.lobakin@intel.com>
Date: Fri, 24 May 2024 13:28:59 +0200
Subject: page_pool: fix &page_pool_params kdoc issues

After the tagged commit, @netdev got documented twice and the kdoc
script didn't notice that. Remove the second description added later
and move the initial one according to the field position.

After merging commit 5f8e4007c10d ("kernel-doc: fix
struct_group_tagged() parsing"), kdoc requires to describe struct
groups as well. &page_pool_params has 2 struct groups which
generated new warnings, describe them to resolve this.

Fixes: 403f11ac9ab7 ("page_pool: don't use driver-set flags field directly")
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Link: https://lore.kernel.org/r/20240524112859.2757403-1-aleksander.lobakin@intel.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/page_pool/types.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'include/net')

diff --git a/include/net/page_pool/types.h b/include/net/page_pool/types.h
index b088d131aeb0..7e8477057f3d 100644
--- a/include/net/page_pool/types.h
+++ b/include/net/page_pool/types.h
@@ -45,16 +45,17 @@ struct pp_alloc_cache {
 
 /**
  * struct page_pool_params - page pool parameters
+ * @fast:	params accessed frequently on hotpath
  * @order:	2^order pages on allocation
  * @pool_size:	size of the ptr_ring
  * @nid:	NUMA node id to allocate from pages from
  * @dev:	device, for DMA pre-mapping purposes
- * @netdev:	netdev this pool will serve (leave as NULL if none or multiple)
  * @napi:	NAPI which is the sole consumer of pages, otherwise NULL
  * @dma_dir:	DMA mapping direction
  * @max_len:	max DMA sync memory size for PP_FLAG_DMA_SYNC_DEV
  * @offset:	DMA sync address offset for PP_FLAG_DMA_SYNC_DEV
- * @netdev:	corresponding &net_device for Netlink introspection
+ * @slow:	params with slowpath access only (initialization and Netlink)
+ * @netdev:	netdev this pool will serve (leave as NULL if none or multiple)
  * @flags:	PP_FLAG_DMA_MAP, PP_FLAG_DMA_SYNC_DEV, PP_FLAG_SYSTEM_POOL
  */
 struct page_pool_params {
-- 
cgit v1.2.3


From 92f1655aa2b2294d0b49925f3b875a634bd3b59e Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 28 May 2024 11:43:53 +0000
Subject: net: fix __dst_negative_advice() race

__dst_negative_advice() does not enforce proper RCU rules when
sk->dst_cache must be cleared, leading to possible UAF.

RCU rules are that we must first clear sk->sk_dst_cache,
then call dst_release(old_dst).

Note that sk_dst_reset(sk) is implementing this protocol correctly,
while __dst_negative_advice() uses the wrong order.

Given that ip6_negative_advice() has special logic
against RTF_CACHE, this means each of the three ->negative_advice()
existing methods must perform the sk_dst_reset() themselves.

Note the check against NULL dst is centralized in
__dst_negative_advice(), there is no need to duplicate
it in various callbacks.

Many thanks to Clement Lecigne for tracking this issue.

This old bug became visible after the blamed commit, using UDP sockets.

Fixes: a87cb3e48ee8 ("net: Facility to report route quality of connected sockets")
Reported-by: Clement Lecigne <clecigne@google.com>
Diagnosed-by: Clement Lecigne <clecigne@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Tom Herbert <tom@herbertland.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Link: https://lore.kernel.org/r/20240528114353.1794151-1-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/dst_ops.h |  2 +-
 include/net/sock.h    | 13 +++----------
 2 files changed, 4 insertions(+), 11 deletions(-)

(limited to 'include/net')

diff --git a/include/net/dst_ops.h b/include/net/dst_ops.h
index 6d1c8541183d..3a9001a042a5 100644
--- a/include/net/dst_ops.h
+++ b/include/net/dst_ops.h
@@ -24,7 +24,7 @@ struct dst_ops {
 	void			(*destroy)(struct dst_entry *);
 	void			(*ifdown)(struct dst_entry *,
 					  struct net_device *dev);
-	struct dst_entry *	(*negative_advice)(struct dst_entry *);
+	void			(*negative_advice)(struct sock *sk, struct dst_entry *);
 	void			(*link_failure)(struct sk_buff *);
 	void			(*update_pmtu)(struct dst_entry *dst, struct sock *sk,
 					       struct sk_buff *skb, u32 mtu,
diff --git a/include/net/sock.h b/include/net/sock.h
index 5f4d0629348f..953c8dc4e259 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -2063,17 +2063,10 @@ sk_dst_get(const struct sock *sk)
 
 static inline void __dst_negative_advice(struct sock *sk)
 {
-	struct dst_entry *ndst, *dst = __sk_dst_get(sk);
+	struct dst_entry *dst = __sk_dst_get(sk);
 
-	if (dst && dst->ops->negative_advice) {
-		ndst = dst->ops->negative_advice(dst);
-
-		if (ndst != dst) {
-			rcu_assign_pointer(sk->sk_dst_cache, ndst);
-			sk_tx_queue_clear(sk);
-			WRITE_ONCE(sk->sk_dst_pending_confirm, 0);
-		}
-	}
+	if (dst && dst->ops->negative_advice)
+		dst->ops->negative_advice(sk, dst);
 }
 
 static inline void dst_negative_advice(struct sock *sk)
-- 
cgit v1.2.3


From 33700a0c9b562700c28d31360a5f04508f459a45 Mon Sep 17 00:00:00 2001
From: Dmitry Safonov <0x7f454c46@gmail.com>
Date: Wed, 29 May 2024 18:29:32 +0100
Subject: net/tcp: Don't consider TCP_CLOSE in TCP_AO_ESTABLISHED

TCP_CLOSE may or may not have current/rnext keys and should not be
considered "established". The fast-path for TCP_CLOSE is
SKB_DROP_REASON_TCP_CLOSE. This is what tcp_rcv_state_process() does
anyways. Add an early drop path to not spend any time verifying
segment signatures for sockets in TCP_CLOSE state.

Cc: stable@vger.kernel.org # v6.7
Fixes: 0a3a809089eb ("net/tcp: Verify inbound TCP-AO signed segments")
Signed-off-by: Dmitry Safonov <0x7f454c46@gmail.com>
Link: https://lore.kernel.org/r/20240529-tcp_ao-sk_state-v1-1-d69b5d323c52@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/tcp_ao.h | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'include/net')

diff --git a/include/net/tcp_ao.h b/include/net/tcp_ao.h
index 471e177362b4..5d8e9ed2c005 100644
--- a/include/net/tcp_ao.h
+++ b/include/net/tcp_ao.h
@@ -86,7 +86,8 @@ static inline int tcp_ao_sizeof_key(const struct tcp_ao_key *key)
 struct tcp_ao_info {
 	/* List of tcp_ao_key's */
 	struct hlist_head	head;
-	/* current_key and rnext_key aren't maintained on listen sockets.
+	/* current_key and rnext_key are maintained on sockets
+	 * in TCP_AO_ESTABLISHED states.
 	 * Their purpose is to cache keys on established connections,
 	 * saving needless lookups. Never dereference any of them from
 	 * listen sockets.
@@ -201,9 +202,9 @@ struct tcp6_ao_context {
 };
 
 struct tcp_sigpool;
+/* Established states are fast-path and there always is current_key/rnext_key */
 #define TCP_AO_ESTABLISHED (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2 | \
-			    TCPF_CLOSE | TCPF_CLOSE_WAIT | \
-			    TCPF_LAST_ACK | TCPF_CLOSING)
+			    TCPF_CLOSE_WAIT | TCPF_LAST_ACK | TCPF_CLOSING)
 
 int tcp_ao_transmit_skb(struct sock *sk, struct sk_buff *skb,
 			struct tcp_ao_key *key, struct tcphdr *th,
-- 
cgit v1.2.3


From 5b4b62a169e10401cca34a6e7ac39161986f5605 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Mon, 3 Jun 2024 11:48:26 -0700
Subject: rtnetlink: make the "split" NLM_DONE handling generic

Jaroslav reports Dell's OMSA Systems Management Data Engine
expects NLM_DONE in a separate recvmsg(), both for rtnl_dump_ifinfo()
and inet_dump_ifaddr(). We already added a similar fix previously in
commit 460b0d33cf10 ("inet: bring NLM_DONE out to a separate recv() again")

Instead of modifying all the dump handlers, and making them look
different than modern for_each_netdev_dump()-based dump handlers -
put the workaround in rtnetlink code. This will also help us move
the custom rtnl-locking from af_netlink in the future (in net-next).

Note that this change is not touching rtnl_dump_all(). rtnl_dump_all()
is different kettle of fish and a potential problem. We now mix families
in a single recvmsg(), but NLM_DONE is not coalesced.

Tested:

  ./cli.py --dbg-small-recv 4096 --spec netlink/specs/rt_addr.yaml \
           --dump getaddr --json '{"ifa-family": 2}'

  ./cli.py --dbg-small-recv 4096 --spec netlink/specs/rt_route.yaml \
           --dump getroute --json '{"rtm-family": 2}'

  ./cli.py --dbg-small-recv 4096 --spec netlink/specs/rt_link.yaml \
           --dump getlink

Fixes: 3e41af90767d ("rtnetlink: use xarray iterator to implement rtnl_dump_ifinfo()")
Fixes: cdb2f80f1c10 ("inet: use xa_array iterator to implement inet_dump_ifaddr()")
Reported-by: Jaroslav Pulchart <jaroslav.pulchart@gooddata.com>
Link: https://lore.kernel.org/all/CAK8fFZ7MKoFSEzMBDAOjoUt+vTZRRQgLDNXEOfdCCXSoXXKE0g@mail.gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/rtnetlink.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/net')

diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h
index 3bfb80bad173..b45d57b5968a 100644
--- a/include/net/rtnetlink.h
+++ b/include/net/rtnetlink.h
@@ -13,6 +13,7 @@ enum rtnl_link_flags {
 	RTNL_FLAG_DOIT_UNLOCKED		= BIT(0),
 	RTNL_FLAG_BULK_DEL_SUPPORTED	= BIT(1),
 	RTNL_FLAG_DUMP_UNLOCKED		= BIT(2),
+	RTNL_FLAG_DUMP_SPLIT_NLM_DONE	= BIT(3),	/* legacy behavior */
 };
 
 enum rtnl_kinds {
-- 
cgit v1.2.3