From 88f8598d0a302a08380eadefd09b9f5cb1c4c428 Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Wed, 16 Jan 2019 15:05:28 -0800
Subject: tcp: exit if nothing to retransmit on RTO timeout

Previously TCP only warns if its RTO timer fires and the
retransmission queue is empty, but it'll cause null pointer
reference later on. It's better to avoid such catastrophic failure
and simply exit with a warning.

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Neal Cardwell <ncardwell@google.com>
Reviewed-by: Soheil Hassas Yeganeh <soheil@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_timer.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'net/ipv4/tcp_timer.c')

diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 71a29e9c0620..e7d09e3705b8 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -443,10 +443,8 @@ void tcp_retransmit_timer(struct sock *sk)
 		 */
 		return;
 	}
-	if (!tp->packets_out)
-		goto out;
-
-	WARN_ON(tcp_rtx_queue_empty(sk));
+	if (!tp->packets_out || WARN_ON_ONCE(tcp_rtx_queue_empty(sk)))
+		return;
 
 	tp->tlp_high_seq = 0;
 
-- 
cgit v1.2.3


From 7ae189759cc48cf8b54beebff566e9fd2d4e7d7c Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Wed, 16 Jan 2019 15:05:30 -0800
Subject: tcp: always set retrans_stamp on recovery

Previously TCP socket's retrans_stamp is not set if the
retransmission has failed to send. As a result if a socket is
experiencing local issues to retransmit packets, determining when
to abort a socket is complicated w/o knowning the starting time of
the recovery since retrans_stamp may remain zero.

This complication causes sub-optimal behavior that TCP may use the
latest, instead of the first, retransmission time to compute the
elapsed time of a stalling connection due to local issues. Then TCP
may disrecard TCP retries settings and keep retrying until it finally
succeed: not a good idea when the local host is already strained.

The simple fix is to always timestamp the start of a recovery.
It's worth noting that retrans_stamp is also used to compare echo
timestamp values to detect spurious recovery. This patch does
not break that because retrans_stamp is still later than when the
original packet was sent.

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Neal Cardwell <ncardwell@google.com>
Reviewed-by: Soheil Hassas Yeganeh <soheil@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_timer.c | 23 +++--------------------
 1 file changed, 3 insertions(+), 20 deletions(-)

(limited to 'net/ipv4/tcp_timer.c')

diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index e7d09e3705b8..1e61f0bd6e24 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -22,28 +22,14 @@
 #include <linux/gfp.h>
 #include <net/tcp.h>
 
-static u32 tcp_retransmit_stamp(const struct sock *sk)
-{
-	u32 start_ts = tcp_sk(sk)->retrans_stamp;
-
-	if (unlikely(!start_ts)) {
-		struct sk_buff *head = tcp_rtx_queue_head(sk);
-
-		if (!head)
-			return 0;
-		start_ts = tcp_skb_timestamp(head);
-	}
-	return start_ts;
-}
-
 static u32 tcp_clamp_rto_to_user_timeout(const struct sock *sk)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	u32 elapsed, start_ts;
 	s32 remaining;
 
-	start_ts = tcp_retransmit_stamp(sk);
-	if (!icsk->icsk_user_timeout || !start_ts)
+	start_ts = tcp_sk(sk)->retrans_stamp;
+	if (!icsk->icsk_user_timeout)
 		return icsk->icsk_rto;
 	elapsed = tcp_time_stamp(tcp_sk(sk)) - start_ts;
 	remaining = icsk->icsk_user_timeout - elapsed;
@@ -197,10 +183,7 @@ static bool retransmits_timed_out(struct sock *sk,
 	if (!inet_csk(sk)->icsk_retransmits)
 		return false;
 
-	start_ts = tcp_retransmit_stamp(sk);
-	if (!start_ts)
-		return false;
-
+	start_ts = tcp_sk(sk)->retrans_stamp;
 	if (likely(timeout == 0)) {
 		linear_backoff_thresh = ilog2(TCP_RTO_MAX/rto_base);
 
-- 
cgit v1.2.3


From c7d13c8faa74f4e8ef191f88a252cefab6805b38 Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Wed, 16 Jan 2019 15:05:31 -0800
Subject: tcp: properly track retry time on passive Fast Open

This patch addresses a corner issue on timeout behavior of a
passive Fast Open socket.  A passive Fast Open server may write
and close the socket when it is re-trying SYN-ACK to complete
the handshake. After the handshake is completely, the server does
not properly stamp the recovery start time (tp->retrans_stamp is
0), and the socket may abort immediately on the very first FIN
timeout, instead of retying until it passes the system or user
specified limit.

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Neal Cardwell <ncardwell@google.com>
Reviewed-by: Soheil Hassas Yeganeh <soheil@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_timer.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'net/ipv4/tcp_timer.c')

diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 1e61f0bd6e24..074de38bafbd 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -378,6 +378,7 @@ static void tcp_fastopen_synack_timer(struct sock *sk)
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	int max_retries = icsk->icsk_syn_retries ? :
 	    sock_net(sk)->ipv4.sysctl_tcp_synack_retries + 1; /* add one more retry for fastopen */
+	struct tcp_sock *tp = tcp_sk(sk);
 	struct request_sock *req;
 
 	req = tcp_sk(sk)->fastopen_rsk;
@@ -395,6 +396,8 @@ static void tcp_fastopen_synack_timer(struct sock *sk)
 	inet_rtx_syn_ack(sk, req);
 	req->num_timeout++;
 	icsk->icsk_retransmits++;
+	if (!tp->retrans_stamp)
+		tp->retrans_stamp = tcp_time_stamp(tp);
 	inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
 			  TCP_TIMEOUT_INIT << req->num_timeout, TCP_RTO_MAX);
 }
-- 
cgit v1.2.3


From 01a523b071618abbc634d1958229fe3bd2dfa5fa Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Wed, 16 Jan 2019 15:05:32 -0800
Subject: tcp: create a helper to model exponential backoff

Create a helper to model TCP exponential backoff for the next patch.
This is pure refactor w no behavior change.

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Neal Cardwell <ncardwell@google.com>
Reviewed-by: Soheil Hassas Yeganeh <soheil@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_timer.c | 31 ++++++++++++++++++-------------
 1 file changed, 18 insertions(+), 13 deletions(-)

(limited to 'net/ipv4/tcp_timer.c')

diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 074de38bafbd..bcc2f5783e57 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -159,7 +159,20 @@ static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk)
 	tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
 }
 
-
+static unsigned int tcp_model_timeout(struct sock *sk,
+				      unsigned int boundary,
+				      unsigned int rto_base)
+{
+	unsigned int linear_backoff_thresh, timeout;
+
+	linear_backoff_thresh = ilog2(TCP_RTO_MAX / rto_base);
+	if (boundary <= linear_backoff_thresh)
+		timeout = ((2 << boundary) - 1) * rto_base;
+	else
+		timeout = ((2 << linear_backoff_thresh) - 1) * rto_base +
+			(boundary - linear_backoff_thresh) * TCP_RTO_MAX;
+	return jiffies_to_msecs(timeout);
+}
 /**
  *  retransmits_timed_out() - returns true if this connection has timed out
  *  @sk:       The current socket
@@ -177,23 +190,15 @@ static bool retransmits_timed_out(struct sock *sk,
 				  unsigned int boundary,
 				  unsigned int timeout)
 {
-	const unsigned int rto_base = TCP_RTO_MIN;
-	unsigned int linear_backoff_thresh, start_ts;
+	unsigned int start_ts;
 
 	if (!inet_csk(sk)->icsk_retransmits)
 		return false;
 
 	start_ts = tcp_sk(sk)->retrans_stamp;
-	if (likely(timeout == 0)) {
-		linear_backoff_thresh = ilog2(TCP_RTO_MAX/rto_base);
-
-		if (boundary <= linear_backoff_thresh)
-			timeout = ((2 << boundary) - 1) * rto_base;
-		else
-			timeout = ((2 << linear_backoff_thresh) - 1) * rto_base +
-				(boundary - linear_backoff_thresh) * TCP_RTO_MAX;
-		timeout = jiffies_to_msecs(timeout);
-	}
+	if (likely(timeout == 0))
+		timeout = tcp_model_timeout(sk, boundary, TCP_RTO_MIN);
+
 	return (s32)(tcp_time_stamp(tcp_sk(sk)) - start_ts - timeout) >= 0;
 }
 
-- 
cgit v1.2.3


From 9721e709fa68ef9b860c322b474cfbd1f8285b0f Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Wed, 16 Jan 2019 15:05:33 -0800
Subject: tcp: simplify window probe aborting on USER_TIMEOUT

Previously we use the next unsent skb's timestamp to determine
when to abort a socket stalling on window probes. This no longer
works as skb timestamp reflects the last instead of the first
transmission.

Instead we can estimate how long the socket has been stalling
with the probe count and the exponential backoff behavior.

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Neal Cardwell <ncardwell@google.com>
Reviewed-by: Soheil Hassas Yeganeh <soheil@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_timer.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'net/ipv4/tcp_timer.c')

diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index bcc2f5783e57..c36089aa3515 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -333,7 +333,6 @@ static void tcp_probe_timer(struct sock *sk)
 	struct sk_buff *skb = tcp_send_head(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
 	int max_probes;
-	u32 start_ts;
 
 	if (tp->packets_out || !skb) {
 		icsk->icsk_probes_out = 0;
@@ -348,12 +347,13 @@ static void tcp_probe_timer(struct sock *sk)
 	 * corresponding system limit. We also implement similar policy when
 	 * we use RTO to probe window in tcp_retransmit_timer().
 	 */
-	start_ts = tcp_skb_timestamp(skb);
-	if (!start_ts)
-		skb->skb_mstamp_ns = tp->tcp_clock_cache;
-	else if (icsk->icsk_user_timeout &&
-		 (s32)(tcp_time_stamp(tp) - start_ts) > icsk->icsk_user_timeout)
-		goto abort;
+	if (icsk->icsk_user_timeout) {
+		u32 elapsed = tcp_model_timeout(sk, icsk->icsk_probes_out,
+						tcp_probe0_base(sk));
+
+		if (elapsed >= icsk->icsk_user_timeout)
+			goto abort;
+	}
 
 	max_probes = sock_net(sk)->ipv4.sysctl_tcp_retries2;
 	if (sock_flag(sk, SOCK_DEAD)) {
-- 
cgit v1.2.3


From 590d2026d62418bb27de9ca87526e9131c1f48af Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Wed, 16 Jan 2019 15:05:34 -0800
Subject: tcp: retry more conservatively on local congestion

Previously when the sender fails to retransmit a data packet on
timeout due to congestion in the local host (e.g. throttling in
qdisc), it'll retry within an RTO up to 500ms.

In low-RTT networks such as data-centers, RTO is often far
below the default minimum 200ms (and the cap 500ms). Then local
host congestion could trigger a retry storm pouring gas to the
fire. Worse yet, the retry counter (icsk_retransmits) is not
properly updated so the aggressive retry may exceed the system
limit (15 rounds) until the packet finally slips through.

On such rare events, it's wise to retry more conservatively (500ms)
and update the stats properly to reflect these incidents and follow
the system limit. Note that this is consistent with the behavior
when a keep-alive probe is dropped due to local congestion.

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Neal Cardwell <ncardwell@google.com>
Reviewed-by: Soheil Hassas Yeganeh <soheil@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_timer.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

(limited to 'net/ipv4/tcp_timer.c')

diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index c36089aa3515..d7399a89469d 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -500,14 +500,13 @@ void tcp_retransmit_timer(struct sock *sk)
 
 	tcp_enter_loss(sk);
 
+	icsk->icsk_retransmits++;
 	if (tcp_retransmit_skb(sk, tcp_rtx_queue_head(sk), 1) > 0) {
 		/* Retransmission failed because of local congestion,
-		 * do not backoff.
+		 * Let senders fight for local resources conservatively.
 		 */
-		if (!icsk->icsk_retransmits)
-			icsk->icsk_retransmits = 1;
 		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
-					  min(icsk->icsk_rto, TCP_RESOURCE_PROBE_INTERVAL),
+					  TCP_RESOURCE_PROBE_INTERVAL,
 					  TCP_RTO_MAX);
 		goto out;
 	}
@@ -528,7 +527,6 @@ void tcp_retransmit_timer(struct sock *sk)
 	 * the 120 second clamps though!
 	 */
 	icsk->icsk_backoff++;
-	icsk->icsk_retransmits++;
 
 out_reset_timer:
 	/* If stream is thin, use linear timeouts. Since 'icsk_backoff' is
-- 
cgit v1.2.3