From 80eae5e569da8b2a6e474e87a9a8b08e64700749 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Tue, 16 Apr 2019 10:55:20 -0700
Subject: tcp: tcp_grow_window() needs to respect tcp_space()

[ Upstream commit 50ce163a72d817a99e8974222dcf2886d5deb1ae ]

For some reason, tcp_grow_window() correctly tests if enough room is
present before attempting to increase tp->rcv_ssthresh, but does not
prevent it from growing past tcp_space().

This causes hard-to-debug issues, like failing the
(__tcp_select_window(sk) >= tp->rcv_wnd) test in __tcp_ack_snd_check(),
causing ACK delays and possibly slow flows.

Depending on tcp_rmem[2], MTU, and the skb->len/skb->truesize ratio, we
can see the problem happening on "netperf -t TCP_RR -- -r 2000,2000"
after about 60 round trips, when the active side no longer sends
immediate acks.

This bug predates git history.

Signed-off-by: Eric Dumazet
Acked-by: Soheil Hassas Yeganeh
Acked-by: Neal Cardwell
Acked-by: Wei Wang
Signed-off-by: David S. Miller
Signed-off-by: Greg Kroah-Hartman
---
 net/ipv4/tcp_input.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index cd4f13dda49e..e238539c3497 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -389,11 +389,12 @@ static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb)
 static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
+	int room;
+
+	room = min_t(int, tp->window_clamp, tcp_space(sk)) - tp->rcv_ssthresh;
 
 	/* Check #1 */
-	if (tp->rcv_ssthresh < tp->window_clamp &&
-	    (int)tp->rcv_ssthresh < tcp_space(sk) &&
-	    !tcp_under_memory_pressure(sk)) {
+	if (room > 0 && !tcp_under_memory_pressure(sk)) {
 		int incr;
 
 		/* Check #2. Increase window, if skb with such overhead
@@ -406,8 +407,7 @@ static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb)
 
 		if (incr) {
 			incr = max_t(int, incr, 2 * skb->len);
-			tp->rcv_ssthresh = min(tp->rcv_ssthresh + incr,
-					       tp->window_clamp);
+			tp->rcv_ssthresh += min(room, incr);
 			inet_csk(sk)->icsk_ack.quick |= 1;
 		}
 	}
--
cgit v1.2.3
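
To make the arithmetic of the tcp_grow_window() fix above easier to follow,
here is a minimal standalone C sketch of the new "room" logic. It is plain
userspace code with made-up numbers; min_int() and grow_window() are
illustrative stand-ins, not the kernel's functions:

    /* Standalone sketch of the fixed tcp_grow_window() arithmetic.
     * All values are hypothetical stand-ins, not kernel state.
     */
    #include <stdio.h>

    static int min_int(int a, int b) { return a < b ? a : b; }

    /* rcv_ssthresh may now grow by at most `room`, so it can never
     * exceed min(window_clamp, tcp_space()). */
    static int grow_window(int rcv_ssthresh, int window_clamp,
                           int space, int incr)
    {
            int room = min_int(window_clamp, space) - rcv_ssthresh;

            if (room > 0)
                    rcv_ssthresh += min_int(room, incr);
            return rcv_ssthresh;
    }

    int main(void)
    {
            /* space (40000) is smaller than window_clamp (65535):
             * before the fix, ssthresh could be pushed past space. */
            int ssthresh = 39000;

            ssthresh = grow_window(ssthresh, 65535, 40000, 4000);
            printf("rcv_ssthresh = %d\n", ssthresh);  /* 40000, capped */
            return 0;
    }
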
From 418894c12113d82f4a133a99df77c52fefa7f6cb Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Thu, 5 Oct 2017 22:21:25 -0700
Subject: tcp: reduce tcp_fastretrans_alert() verbosity

commit 8ba6ddaaf86c4c6814774e4e4ef158b732bd9f9f upstream.

With the upcoming rb-tree implementation, the checks will trigger more
often, and this is expected.

Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
Cc: Amit Shah
Signed-off-by: Greg Kroah-Hartman
---
 net/ipv4/tcp_input.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index e238539c3497..80a0cdbabd40 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2837,9 +2837,9 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
 	bool do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) &&
 				    (tcp_fackets_out(tp) > tp->reordering));
 
-	if (WARN_ON(!tp->packets_out && tp->sacked_out))
+	if (!tp->packets_out && tp->sacked_out)
 		tp->sacked_out = 0;
-	if (WARN_ON(!tp->sacked_out && tp->fackets_out))
+	if (!tp->sacked_out && tp->fackets_out)
 		tp->fackets_out = 0;
 
 	/* Now state machine starts.
--
cgit v1.2.3

From cc1b58ccb78e0de51bcec1f2914d9296260668bd Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Sat, 15 Jun 2019 17:31:03 -0700
Subject: tcp: limit payload size of sacked skbs

commit 3b4929f65b0d8249f19a50245cd88ed1a2f78cff upstream.

Jonathan Looney reported that TCP can trigger the following crash
in tcp_shifted_skb():

	BUG_ON(tcp_skb_pcount(skb) < pcount);

This can happen if the remote peer has advertised the smallest
MSS that Linux TCP accepts: 48.

An skb can hold 17 fragments, and each fragment can hold 32KB
on x86, or 64KB on PowerPC.

This means that the 16-bit width of TCP_SKB_CB(skb)->tcp_gso_segs
can overflow.

Note that tcp_sendmsg() builds skbs with less than 64KB of payload,
so this problem needs SACK to be enabled. SACK blocks allow TCP to
coalesce multiple skbs in the retransmit queue, thus filling the 17
fragments to maximal capacity.

CVE-2019-11477 -- u16 overflow of TCP_SKB_CB(skb)->tcp_gso_segs

Backport notes, provided by Joao Martins:

v4.15 and later, since commit 737ff314563 ("tcp: use sequence distance
to detect reordering"), switched from packet-based FACK tracking to
sequence-based tracking. v4.14 and older still have the old logic,
hence tcp_shift_skb_data() needs to retain its original logic and keep
@fack_count in sync. In other words, we keep the increment of pcount
with tcp_skb_pcount(skb) so it can later be used to update fack_count.
To make this more explicit we track the pcount of the skb to be merged
in @next_pcount, which also avoids the repeated invocation of
tcp_skb_pcount(skb).

Fixes: 832d11c5cd07 ("tcp: Try to restore large SKBs while SACK processing")
Signed-off-by: Eric Dumazet
Reported-by: Jonathan Looney
Acked-by: Neal Cardwell
Reviewed-by: Tyler Hicks
Cc: Yuchung Cheng
Cc: Bruce Curtis
Cc: Jonathan Lemon
Signed-off-by: David S. Miller
Signed-off-by: Greg Kroah-Hartman
---
 net/ipv4/tcp_input.c | 28 ++++++++++++++++++++++------
 1 file changed, 22 insertions(+), 6 deletions(-)

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 80a0cdbabd40..e2e58bc42ba4 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1320,7 +1320,7 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
 		TCP_SKB_CB(skb)->seq += shifted;
 
 	tcp_skb_pcount_add(prev, pcount);
-	BUG_ON(tcp_skb_pcount(skb) < pcount);
+	WARN_ON_ONCE(tcp_skb_pcount(skb) < pcount);
 	tcp_skb_pcount_add(skb, -pcount);
 
 	/* When we're adding to gso_segs == 1, gso_size will be zero,
@@ -1387,6 +1387,21 @@ static int skb_can_shift(const struct sk_buff *skb)
 	return !skb_headlen(skb) && skb_is_nonlinear(skb);
 }
 
+int tcp_skb_shift(struct sk_buff *to, struct sk_buff *from,
+		  int pcount, int shiftlen)
+{
+	/* TCP min gso_size is 8 bytes (TCP_MIN_GSO_SIZE)
+	 * Since TCP_SKB_CB(skb)->tcp_gso_segs is 16 bits, we need
+	 * to make sure not storing more than 65535 * 8 bytes per skb,
+	 * even if current MSS is bigger.
+	 */
+	if (unlikely(to->len + shiftlen >= 65535 * TCP_MIN_GSO_SIZE))
+		return 0;
+	if (unlikely(tcp_skb_pcount(to) + pcount > 65535))
+		return 0;
+	return skb_shift(to, from, shiftlen);
+}
+
 /* Try collapsing SACK blocks spanning across multiple skbs to a single
  * skb.
  */
@@ -1398,6 +1413,7 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb,
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct sk_buff *prev;
 	int mss;
+	int next_pcount;
 	int pcount = 0;
 	int len;
 	int in_sack;
@@ -1495,7 +1511,7 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb,
 	if (!after(TCP_SKB_CB(skb)->seq + len, tp->snd_una))
 		goto fallback;
 
-	if (!skb_shift(prev, skb, len))
+	if (!tcp_skb_shift(prev, skb, pcount, len))
 		goto fallback;
 	if (!tcp_shifted_skb(sk, skb, state, pcount, len, mss, dup_sack))
 		goto out;
@@ -1514,11 +1530,11 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb,
 		goto out;
 
 	len = skb->len;
-	if (skb_shift(prev, skb, len)) {
-		pcount += tcp_skb_pcount(skb);
-		tcp_shifted_skb(sk, skb, state, tcp_skb_pcount(skb), len, mss, 0);
+	next_pcount = tcp_skb_pcount(skb);
+	if (tcp_skb_shift(prev, skb, next_pcount, len)) {
+		pcount += next_pcount;
+		tcp_shifted_skb(sk, skb, state, next_pcount, len, mss, 0);
 	}
-
 out:
 	state->fack_count += pcount;
 	return prev;
--
cgit v1.2.3
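
As a companion to the commit above, the following standalone C sketch walks
through the CVE-2019-11477 arithmetic: with the smallest accepted MSS of 48
and up to 40 bytes of TCP options, payload per segment can shrink to 8 bytes,
so a fully coalesced skb would need more segments than the u16 tcp_gso_segs
field can hold. The guard mirrors the new tcp_skb_shift(); everything else
(shift_allowed(), the numbers in main()) is an illustrative stand-in:

    /* Standalone sketch of the CVE-2019-11477 arithmetic; not kernel code. */
    #include <stdio.h>

    #define TCP_MIN_GSO_SIZE 8  /* minimal sensible payload per segment */

    /* Mirror of the new guard in tcp_skb_shift(): refuse any merge that
     * could push gso_segs past what a u16 can hold. */
    static int shift_allowed(unsigned int to_len, unsigned int shiftlen)
    {
            return to_len + shiftlen < 65535 * TCP_MIN_GSO_SIZE;
    }

    int main(void)
    {
            unsigned int len = 17 * 32768;  /* 17 frags * 32KB on x86 */
            unsigned int payload_mss = 8;   /* MSS 48 minus 40B of options */
            unsigned int segs = (len + payload_mss - 1) / payload_mss;
            unsigned short gso_segs = segs; /* the u16 field in TCP_SKB_CB */

            /* 69632 segments, but the u16 wraps around to 4096 */
            printf("segments: %u, as u16: %u\n", segs, (unsigned)gso_segs);
            /* The guard rejects the merge that would cause the wrap */
            printf("merge of 32KB more allowed? %d\n",
                   shift_allowed(len - 32768, 32768));  /* 0: rejected */
            return 0;
    }
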
From 6aab2d9c76c5307b7a067c91d0399e68ce9ef015 Mon Sep 17 00:00:00 2001
From: Neal Cardwell
Date: Mon, 9 Sep 2019 16:56:02 -0400
Subject: tcp: fix tcp_ecn_withdraw_cwr() to clear TCP_ECN_QUEUE_CWR

[ Upstream commit af38d07ed391b21f7405fa1f936ca9686787d6d2 ]

Fix tcp_ecn_withdraw_cwr() to clear the correct bit:
TCP_ECN_QUEUE_CWR.

Rationale: TCP_ECN_DEMAND_CWR is a bit that is purely about the
behavior of data receivers, deciding whether to reflect incoming IP
ECN CE marks as outgoing TCP th->ece marks. The TCP_ECN_QUEUE_CWR
bit is purely about the behavior of data senders, deciding whether
to send CWR. The tcp_ecn_withdraw_cwr() function is only called
from tcp_undo_cwnd_reduction() by data senders during an undo, so
it should zero the sender-side state, TCP_ECN_QUEUE_CWR. It does
not make sense to stop the reflection of incoming CE bits on
incoming data packets just because outgoing packets were spuriously
retransmitted.

The bug has been reproduced with packetdrill in a scenario with
RFC3168 ECN, with an incoming data packet carrying the CE bit and a
TCP timestamp value that causes cwnd undo. Before this fix, the IP
CE bit was ignored and not reflected in the TCP ECE header bit, and
the sender sent a TCP CWR ('W') bit on the next outgoing data
packet, even though the cwnd reduction had been undone. After this
fix, the sender properly reflects the CE bit and does not set the W
bit.

Note: the bug actually predates 2005 git history; this Fixes footer
is chosen to be the oldest SHA1 I have tested (from Sep 2007) for
which the patch applies cleanly (since before this commit the code
was in a .h file).

Fixes: bdf1ee5d3bd3 ("[TCP]: Move code from tcp_ecn.h to tcp*.c and tcp.h & remove it")
Signed-off-by: Neal Cardwell
Acked-by: Yuchung Cheng
Acked-by: Soheil Hassas Yeganeh
Cc: Eric Dumazet
Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
Signed-off-by: Greg Kroah-Hartman
---
 net/ipv4/tcp_input.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index e2e58bc42ba4..84ff36a6d4e3 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -247,7 +247,7 @@ static void tcp_ecn_accept_cwr(struct tcp_sock *tp, const struct sk_buff *skb)
 
 static void tcp_ecn_withdraw_cwr(struct tcp_sock *tp)
 {
-	tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
+	tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR;
 }
 
 static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb)
--
cgit v1.2.3
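
A small standalone sketch of the two ECN state bits the patch untangles
follows. The flag values are copied from include/net/tcp.h as an assumption
about the tree this backport targets; ecn_withdraw_cwr() is a userspace
stand-in for the fixed kernel function:

    /* Standalone sketch of the sender/receiver split of the ECN bits. */
    #include <stdio.h>

    #define TCP_ECN_OK         1
    #define TCP_ECN_QUEUE_CWR  2  /* sender side: owe a CWR mark */
    #define TCP_ECN_DEMAND_CWR 4  /* receiver side: keep echoing ECE */

    /* Sender undoes a spurious cwnd reduction: only sender-side ECN
     * state may be withdrawn; receiver-side CE reflection must survive. */
    static unsigned char ecn_withdraw_cwr(unsigned char ecn_flags)
    {
            return ecn_flags & ~TCP_ECN_QUEUE_CWR; /* was ~TCP_ECN_DEMAND_CWR */
    }

    int main(void)
    {
            unsigned char flags = TCP_ECN_OK | TCP_ECN_QUEUE_CWR |
                                  TCP_ECN_DEMAND_CWR;

            flags = ecn_withdraw_cwr(flags);
            printf("still demanding CWR (echoing ECE)? %s\n",
                   flags & TCP_ECN_DEMAND_CWR ? "yes" : "no"); /* yes */
            return 0;
    }
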
From ea3cc787e127f601c311ba0c8158349a949692e0 Mon Sep 17 00:00:00 2001
From: Pengcheng Yang
Date: Mon, 30 Dec 2019 17:54:41 +0800
Subject: tcp: fix "old stuff" D-SACK causing SACK to be treated as D-SACK

[ Upstream commit c9655008e7845bcfdaac10a1ed8554ec167aea88 ]

When we receive a D-SACK, where the sequence number satisfies:

	undo_marker <= start_seq < end_seq <= prior_snd_una

we consider this a valid D-SACK and tcp_is_sackblock_valid()
returns true, then this D-SACK is discarded as "old stuff", but
the variable first_sack_index is not marked as negative in
tcp_sacktag_write_queue().

If this D-SACK also carries a SACK that needs to be processed
(for example, the previous SACK segment was lost), this SACK will
be treated as a D-SACK in the following processing of
tcp_sacktag_write_queue(), which will eventually lead to incorrect
updates of undo_retrans and reordering.

Fixes: fd6dad616d4f ("[TCP]: Earlier SACK block verification & simplify access to them")
Signed-off-by: Pengcheng Yang
Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
Signed-off-by: Greg Kroah-Hartman
---
 net/ipv4/tcp_input.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 84ff36a6d4e3..4901d17a8e63 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1741,8 +1741,11 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
 		}
 
 		/* Ignore very old stuff early */
-		if (!after(sp[used_sacks].end_seq, prior_snd_una))
+		if (!after(sp[used_sacks].end_seq, prior_snd_una)) {
+			if (i == 0)
+				first_sack_index = -1;
 			continue;
+		}
 
 		used_sacks++;
 	}
--
cgit v1.2.3
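
The following standalone C sketch replays the scenario from the commit
message: the first SACK block is an old D-SACK that gets discarded, so
first_sack_index must be invalidated or a surviving genuine SACK block would
later be mistaken for a D-SACK. The loop is a simplified stand-in for
tcp_sacktag_write_queue(), with a plain integer compare instead of the
kernel's serial-number after():

    /* Standalone sketch of the first_sack_index fix; not kernel code. */
    #include <stdio.h>

    struct sack_block { unsigned int start_seq, end_seq; };

    static int after(unsigned int a, unsigned int b) { return a > b; }

    int main(void)
    {
            struct sack_block sp[2] = {
                    { 1000, 2000 }, /* D-SACK below prior_snd_una: "old stuff" */
                    { 5000, 6000 }, /* genuine SACK that must be processed */
            };
            unsigned int prior_snd_una = 3000;
            int first_sack_index = 0, used_sacks = 0, i;

            for (i = 0; i < 2; i++) {
                    if (!after(sp[i].end_seq, prior_snd_una)) {
                            if (i == 0)             /* the fix */
                                    first_sack_index = -1;
                            continue;
                    }
                    sp[used_sacks++] = sp[i];       /* keep valid blocks */
            }
            /* Without the fix, first_sack_index would still be 0 and the
             * surviving block sp[0] (5000:6000) would be taken for a D-SACK. */
            printf("used=%d first_sack_index=%d\n", used_sacks, first_sack_index);
            return 0;
    }
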
From 638b632ddec39db8600efa90c765e9f7fa11a2ef Mon Sep 17 00:00:00 2001
From: Marcelo Ricardo Leitner
Date: Sat, 1 Apr 2017 11:00:21 -0300
Subject: tcp: minimize false-positives on TCP/GRO check

commit 0b9aefea860063bb39e36bd7fe6c7087fed0ba87 upstream.

Markus Trippelsdorf reported that after commit dcb17d22e1c2 ("tcp: warn
on bogus MSS and try to amend it") the kernel started logging the
warning for a NIC driver that doesn't even support GRO.

It was diagnosed that it was possibly caused on connections that were
using TCP Timestamps but some packets lacked the Timestamps option. As
we reduce rcv_mss when timestamps are used, the lack of them would
cause the packets to be bigger than expected, although this is a valid
case.

As this warning is more of a hint, getting a clean-cut on the threshold
is probably not worth the execution time spent on it. This patch thus
alleviates the false-positives with 2 quick checks: by accounting for
the entire TCP option space and also checking against the interface MTU
if it's available.

These changes, especially the MTU one, might mask some real positives,
though if they are really happening, it's possible that sooner or later
they will be triggered anyway.

Reported-by: Markus Trippelsdorf
Cc: Eric Dumazet
Signed-off-by: Marcelo Ricardo Leitner
Signed-off-by: David S. Miller
Cc: Salvatore Bonaccorso
Signed-off-by: Greg Kroah-Hartman
---
 net/ipv4/tcp_input.c | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 4901d17a8e63..9644dcef5a2b 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -129,7 +129,8 @@ int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2;
 #define REXMIT_LOST	1 /* retransmit packets marked lost */
 #define REXMIT_NEW	2 /* FRTO-style transmit of unsent/new packets */
 
-static void tcp_gro_dev_warn(struct sock *sk, const struct sk_buff *skb)
+static void tcp_gro_dev_warn(struct sock *sk, const struct sk_buff *skb,
+			     unsigned int len)
 {
 	static bool __once __read_mostly;
 
@@ -140,8 +141,9 @@ static void tcp_gro_dev_warn(struct sock *sk, const struct sk_buff *skb)
 
 		rcu_read_lock();
 		dev = dev_get_by_index_rcu(sock_net(sk), skb->skb_iif);
-		pr_warn("%s: Driver has suspect GRO implementation, TCP performance may be compromised.\n",
-			dev ? dev->name : "Unknown driver");
+		if (!dev || len >= dev->mtu)
+			pr_warn("%s: Driver has suspect GRO implementation, TCP performance may be compromised.\n",
+				dev ? dev->name : "Unknown driver");
 		rcu_read_unlock();
 	}
 }
@@ -164,8 +166,10 @@ static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb)
 	if (len >= icsk->icsk_ack.rcv_mss) {
 		icsk->icsk_ack.rcv_mss = min_t(unsigned int, len,
 					       tcp_sk(sk)->advmss);
-		if (unlikely(icsk->icsk_ack.rcv_mss != len))
-			tcp_gro_dev_warn(sk, skb);
+		/* Account for possibly-removed options */
+		if (unlikely(len > icsk->icsk_ack.rcv_mss +
+				   MAX_TCP_OPTION_SPACE))
+			tcp_gro_dev_warn(sk, skb, len);
 	} else {
 		/* Otherwise, we make more careful check taking into account,
 		 * that SACKs block is variable.
--
cgit v1.2.3
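
To see why the two new checks suppress the false positives, here is a
standalone C sketch of the relaxed warning condition. MAX_TCP_OPTION_SPACE
matches the kernel constant; should_warn() and the sample numbers are
illustrative stand-ins:

    /* Standalone sketch of the relaxed GRO-warning condition. */
    #include <stdio.h>

    #define MAX_TCP_OPTION_SPACE 40

    static int should_warn(unsigned int len, unsigned int rcv_mss,
                           unsigned int dev_mtu /* 0 if unknown */)
    {
            if (len <= rcv_mss + MAX_TCP_OPTION_SPACE)
                    return 0;  /* possibly-removed options explain it */
            if (dev_mtu && len < dev_mtu)
                    return 0;  /* too small to be a GRO'd super-packet */
            return 1;
    }

    int main(void)
    {
            /* Timestamps usually on (rcv_mss 1448), one segment without them */
            printf("%d\n", should_warn(1460, 1448, 1500)); /* 0: no warning */
            /* A real GRO aggregate well past the MTU still warns */
            printf("%d\n", should_warn(16000, 1448, 1500)); /* 1 */
            return 0;
    }
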
From 3930b48a34abec508db506dd758ccb8dd2af921f Mon Sep 17 00:00:00 2001
From: Pengcheng Yang
Date: Tue, 14 Jan 2020 17:23:40 +0800
Subject: tcp: fix marked lost packets not being retransmitted

[ Upstream commit e176b1ba476cf36f723cfcc7a9e57f3cb47dec70 ]

When the packet pointed to by retransmit_skb_hint is unlinked by ACK,
retransmit_skb_hint will be set to NULL in tcp_clean_rtx_queue().
If packet loss is detected at this time, retransmit_skb_hint will be
set to point to the current packet loss in tcp_verify_retransmit_hint(),
then the packets that were previously marked lost but not retransmitted
due to the restriction of cwnd will be skipped and cannot be
retransmitted.

To fix this, when retransmit_skb_hint is NULL, retransmit_skb_hint can
be reset only after all marked lost packets are retransmitted
(retrans_out >= lost_out), otherwise we need to traverse from
tcp_rtx_queue_head in tcp_xmit_retransmit_queue().

Packetdrill to demonstrate:

// Disable RACK and set max_reordering to keep things simple
    0 `sysctl -q net.ipv4.tcp_recovery=0`
   +0 `sysctl -q net.ipv4.tcp_max_reordering=3`

// Establish a connection
   +0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
   +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
   +0 bind(3, ..., ...) = 0
   +0 listen(3, 1) = 0

  +.1 < S 0:0(0) win 32792
   +0 > S. 0:0(0) ack 1 <...>
 +.01 < . 1:1(0) ack 1 win 257
   +0 accept(3, ..., ...) = 4

// Send 8 data segments
   +0 write(4, ..., 8000) = 8000
   +0 > P. 1:8001(8000) ack 1

// Enter recovery and 1:3001 is marked lost
 +.01 < . 1:1(0) ack 1 win 257
   +0 < . 1:1(0) ack 1 win 257
   +0 < . 1:1(0) ack 1 win 257

// Retransmit 1:1001, now retransmit_skb_hint points to 1001:2001
   +0 > . 1:1001(1000) ack 1

// 1001:2001 was ACKed causing retransmit_skb_hint to be set to NULL
 +.01 < . 1:1(0) ack 2001 win 257
// Now retransmit_skb_hint points to 4001:5001 which is now marked lost

// BUG: 2001:3001 was not retransmitted
   +0 > . 2001:3001(1000) ack 1

Signed-off-by: Pengcheng Yang
Acked-by: Neal Cardwell
Tested-by: Neal Cardwell
Signed-off-by: David S. Miller
Signed-off-by: Greg Kroah-Hartman
---
 net/ipv4/tcp_input.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 9644dcef5a2b..52014c5312b9 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -923,9 +923,10 @@ static void tcp_update_reordering(struct sock *sk, const int metric,
 /* This must be called before lost_out is incremented */
 static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb)
 {
-	if (!tp->retransmit_skb_hint ||
-	    before(TCP_SKB_CB(skb)->seq,
-		   TCP_SKB_CB(tp->retransmit_skb_hint)->seq))
+	if ((!tp->retransmit_skb_hint && tp->retrans_out >= tp->lost_out) ||
+	    (tp->retransmit_skb_hint &&
+	     before(TCP_SKB_CB(skb)->seq,
+		    TCP_SKB_CB(tp->retransmit_skb_hint)->seq)))
 		tp->retransmit_skb_hint = skb;
 
 	if (!tp->lost_out ||
--
cgit v1.2.3
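
Finally, a standalone C sketch of the fixed hint condition in
tcp_verify_retransmit_hint(). The types and verify_hint() are userspace
stand-ins; the numbers replay the packetdrill above, where 2001:3001 is still
marked lost but not yet retransmitted:

    /* Standalone sketch of the fixed retransmit-hint condition. */
    #include <stdio.h>
    #include <stddef.h>

    struct skb { unsigned int seq; };

    static int before(unsigned int a, unsigned int b) { return a < b; }

    static const struct skb *verify_hint(const struct skb *hint,
                                         const struct skb *newly_lost,
                                         unsigned int retrans_out,
                                         unsigned int lost_out)
    {
            /* A NULL hint may only be re-armed at the newly lost skb once
             * everything already marked lost has been retransmitted. */
            if ((!hint && retrans_out >= lost_out) ||
                (hint && before(newly_lost->seq, hint->seq)))
                    return newly_lost;
            return hint;  /* keep NULL: forces a scan from rtx queue head */
    }

    int main(void)
    {
            struct skb lost = { .seq = 4001 };

            /* 2001:3001 is still marked lost but un-retransmitted
             * (retrans_out 1 < lost_out 3): the hint must stay NULL so
             * the next retransmit pass starts from the queue head. */
            const struct skb *hint = verify_hint(NULL, &lost, 1, 3);

            printf("hint re-armed? %s\n", hint ? "yes" : "no"); /* no */
            return 0;
    }
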