From 08abdffa1c16b9603ee9bb33f3226ff186d98fa1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?S=C3=A9bastien=20Barr=C3=A9?=
Date: Mon, 12 Jan 2015 10:30:40 +0100
Subject: tcp: avoid reducing cwnd when ACK+DSACK is received
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

With TLP, the peer may reply to a probe with an ACK+D-SACK, with ack
value set to tlp_high_seq. In the current code, such an ACK+DSACK will
be missed and only at the next, higher ACK will the TLP episode be
considered done. Since the DSACK is not present anymore, this will cost
a cwnd reduction.

This patch ensures that this scenario does not cause a cwnd reduction,
since receiving an ACK+DSACK indicates that both the initial segment
and the probe have been received by the peer.

The following packetdrill test, from Neal Cardwell, validates this
patch:

// Establish a connection.
0     socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+0    setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+0    bind(3, ..., ...) = 0
+0    listen(3, 1) = 0

+0    < S 0:0(0) win 32792
+0    > S. 0:0(0) ack 1
+.020 < . 1:1(0) ack 1 win 257
+0    accept(3, ..., ...) = 4

// Send 1 packet.
+0    write(4, ..., 1000) = 1000
+0    > P. 1:1001(1000) ack 1

// Loss probe retransmission.
// packets_out == 1 => schedule PTO in max(2*RTT, 1.5*RTT + 200ms)
// In this case, this means: 1.5*RTT + 200ms = 230ms
+.230 > P. 1:1001(1000) ack 1
+0    %{ assert tcpi_snd_cwnd == 10 }%

// Receiver ACKs at tlp_high_seq with a DSACK,
// indicating they received the original packet and probe.
+.020 < . 1:1(0) ack 1001 win 257
+0    %{ assert tcpi_snd_cwnd == 10 }%

// Send another packet.
+0    write(4, ..., 1000) = 1000
+0    > P. 1001:2001(1000) ack 1

// Receiver ACKs above tlp_high_seq, which should end the TLP episode
// if we haven't already. We should not reduce cwnd.
+.020 < . 1:1(0) ack 2001 win 257
+0    %{ assert tcpi_snd_cwnd == 10, tcpi_snd_cwnd }%

Credits:
-Gregory helped in finding that tcp_process_tlp_ack was where the cwnd
 got reduced in our MPTCP tests.
-Neal wrote the packetdrill test above
-Yuchung reworked the patch to make it more readable.

Cc: Gregory Detal
Cc: Nandita Dukkipati
Tested-by: Neal Cardwell
Reviewed-by: Yuchung Cheng
Reviewed-by: Eric Dumazet
Signed-off-by: Sébastien Barré
Acked-by: Eric Dumazet
Acked-by: Neal Cardwell
Signed-off-by: David S. Miller
---
 net/ipv4/tcp_input.c | 38 +++++++++++++++++++-------------------
 1 file changed, 19 insertions(+), 19 deletions(-)

(limited to 'net/ipv4/tcp_input.c')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 075ab4d5af5e..71fb37c70581 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3358,34 +3358,34 @@ static void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq)
 }
 
 /* This routine deals with acks during a TLP episode.
+ * We mark the end of a TLP episode on receiving TLP dupack or when
+ * ack is after tlp_high_seq.
  * Ref: loss detection algorithm in draft-dukkipati-tcpm-tcp-loss-probe.
  */
 static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag)
 {
        struct tcp_sock *tp = tcp_sk(sk);
-       bool is_tlp_dupack = (ack == tp->tlp_high_seq) &&
-                            !(flag & (FLAG_SND_UNA_ADVANCED |
-                                      FLAG_NOT_DUP | FLAG_DATA_SACKED));
 
-       /* Mark the end of TLP episode on receiving TLP dupack or when
-        * ack is after tlp_high_seq.
-        */
-       if (is_tlp_dupack) {
-               tp->tlp_high_seq = 0;
+       if (before(ack, tp->tlp_high_seq))
                return;
-       }
 
-       if (after(ack, tp->tlp_high_seq)) {
+       if (flag & FLAG_DSACKING_ACK) {
+               /* This DSACK means original and TLP probe arrived; no loss */
+               tp->tlp_high_seq = 0;
+       } else if (after(ack, tp->tlp_high_seq)) {
+               /* ACK advances: there was a loss, so reduce cwnd. Reset
+                * tlp_high_seq in tcp_init_cwnd_reduction()
+                */
+               tcp_init_cwnd_reduction(sk);
+               tcp_set_ca_state(sk, TCP_CA_CWR);
+               tcp_end_cwnd_reduction(sk);
+               tcp_try_keep_open(sk);
+               NET_INC_STATS_BH(sock_net(sk),
+                                LINUX_MIB_TCPLOSSPROBERECOVERY);
+       } else if (!(flag & (FLAG_SND_UNA_ADVANCED |
+                            FLAG_NOT_DUP | FLAG_DATA_SACKED))) {
+               /* Pure dupack: original and TLP probe arrived; no loss */
                tp->tlp_high_seq = 0;
-               /* Don't reduce cwnd if DSACK arrives for TLP retrans. */
-               if (!(flag & FLAG_DSACKING_ACK)) {
-                       tcp_init_cwnd_reduction(sk);
-                       tcp_set_ca_state(sk, TCP_CA_CWR);
-                       tcp_end_cwnd_reduction(sk);
-                       tcp_try_keep_open(sk);
-                       NET_INC_STATS_BH(sock_net(sk),
-                                        LINUX_MIB_TCPLOSSPROBERECOVERY);
-               }
        }
 }
-- 
cgit v1.2.3
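For readers tracing the new control flow, here is a minimal standalone sketch
(not kernel code) of the decision order the rewritten tcp_process_tlp_ack()
follows; the flag values, the seq_before()/seq_after() helpers and the verdict
enum are simplified stand-ins invented for illustration:

/*
 * Illustrative sketch of the TLP-ACK decision order above.
 * Flag bits and helpers are simplified stand-ins, not the kernel's.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define FLAG_SND_UNA_ADVANCED 0x01
#define FLAG_NOT_DUP          0x02
#define FLAG_DATA_SACKED      0x04
#define FLAG_DSACKING_ACK     0x08

static bool seq_before(uint32_t a, uint32_t b) { return (int32_t)(a - b) < 0; }
static bool seq_after(uint32_t a, uint32_t b)  { return seq_before(b, a); }

enum tlp_verdict { TLP_IGNORE, TLP_DONE_NO_LOSS, TLP_DONE_LOSS };

static enum tlp_verdict process_tlp_ack(uint32_t ack, uint32_t tlp_high_seq, int flag)
{
	if (seq_before(ack, tlp_high_seq))
		return TLP_IGNORE;		/* episode still in progress */
	if (flag & FLAG_DSACKING_ACK)
		return TLP_DONE_NO_LOSS;	/* DSACK: original and probe both arrived */
	if (seq_after(ack, tlp_high_seq))
		return TLP_DONE_LOSS;		/* ACK advanced past the probe: reduce cwnd */
	if (!(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP | FLAG_DATA_SACKED)))
		return TLP_DONE_NO_LOSS;	/* pure dupack at tlp_high_seq: no loss */
	return TLP_IGNORE;
}

int main(void)
{
	/* The packetdrill scenario: ACK at tlp_high_seq carrying a DSACK. */
	printf("ACK+DSACK at tlp_high_seq -> %d (expect %d, no cwnd reduction)\n",
	       process_tlp_ack(1001, 1001, FLAG_DSACKING_ACK), TLP_DONE_NO_LOSS);
	return 0;
}

Checking the DSACK case before the after() test is what lets an ACK+DSACK
arriving exactly at tlp_high_seq end the episode without a cwnd reduction,
which is the scenario the packetdrill test above exercises.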
From 932eb7638ad7d9145620178992044b5e87356969 Mon Sep 17 00:00:00 2001
From: Kenneth Klette Jonassen
Date: Thu, 29 Jan 2015 20:08:03 +0100
Subject: tcp: use SACK RTTs for CC

Current behavior only passes RTTs from sequentially acked data to CC.

If sender gets a combined ACK for segment 1 and SACK for segment 3, then
the computed RTT for CC is the time between sending segment 1 and
receiving SACK for segment 3.

Pass the minimum computed RTT from any acked data to CC, i.e. time
between sending segment 3 and receiving SACK for segment 3.

Signed-off-by: Kenneth Klette Jonassen
Signed-off-by: David S. Miller
---
 net/ipv4/tcp_input.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'net/ipv4/tcp_input.c')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 71fb37c70581..ed11931f340f 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3183,8 +3183,10 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 
                tp->fackets_out -= min(pkts_acked, tp->fackets_out);
 
-               if (ca_ops->pkts_acked)
-                       ca_ops->pkts_acked(sk, pkts_acked, ca_seq_rtt_us);
+               if (ca_ops->pkts_acked) {
+                       long rtt_us = min_t(ulong, ca_seq_rtt_us, sack_rtt_us);
+                       ca_ops->pkts_acked(sk, pkts_acked, rtt_us);
+               }
 
        } else if (skb && rtt_update && sack_rtt_us >= 0 &&
                   sack_rtt_us > skb_mstamp_us_delta(&now, &skb->skb_mstamp)) {
-- 
cgit v1.2.3
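A side note on the min_t(ulong, ...) expression in this hunk: both RTT
candidates are signed microsecond values where -1 means "no sample available",
and comparing them as unsigned makes -1 the largest possible value, so the
minimum always selects a valid (or the smaller) sample. A small illustrative
sketch under that assumption, with a function name of my own choosing:

/*
 * Why comparing as unsigned works: -1 becomes ULONG_MAX, so it never wins
 * the minimum unless both samples are missing.
 */
#include <stdio.h>

static long pick_cc_rtt_us(long ca_seq_rtt_us, long sack_rtt_us)
{
	unsigned long a = (unsigned long)ca_seq_rtt_us;
	unsigned long b = (unsigned long)sack_rtt_us;

	return (long)(a < b ? a : b);
}

int main(void)
{
	printf("%ld\n", pick_cc_rtt_us(30000, 12000)); /* both valid -> 12000 (SACK RTT) */
	printf("%ld\n", pick_cc_rtt_us(-1, 12000));    /* no cumulative-ACK RTT -> 12000 */
	printf("%ld\n", pick_cc_rtt_us(30000, -1));    /* no SACK RTT -> 30000 */
	printf("%ld\n", pick_cc_rtt_us(-1, -1));       /* neither available -> -1 */
	return 0;
}

With both samples valid, the SACK-derived RTT for segment 3 (the smaller one)
is what reaches CC, which is exactly the behavioral change the commit message
describes.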
From 843c2fdf7a12951815d981fa431d7e04d8259332 Mon Sep 17 00:00:00 2001
From: Florian Westphal
Date: Fri, 30 Jan 2015 20:45:20 +0100
Subject: net: dctcp: loosen requirement to assert ECT(0) during 3WHS

One deployment requirement of DCTCP is to be able to run in a DC setting
along with TCP traffic. As Glenn Judd's NSDI'15 paper "Attaining the
Promise and Avoiding the Pitfalls of TCP in the Datacenter" [1] (tba)
explains, one way to solve this on the switch side is to split DCTCP and
TCP traffic into two queues per switch port based on the DSCP: one queue
solely intended for DCTCP traffic and one for non-DCTCP traffic.

For the DCTCP queue, there's the marking threshold K as explained in
commit e3118e8359bb ("net: tcp: add DCTCP congestion control algorithm")
for RED marking ECT(0) packets with CE. For the non-DCTCP queue, there's
e.g. a classic tail-drop queue.

As already explained in e3118e8359bb, running DCTCP at scale when not
marking SYN/SYN-ACK packets with ECT(0) has severe consequences, as
non-ECT(0) packets traversing the RED-marking DCTCP queue will see a
severe reduction of connection probability. This is due to the DCTCP
queue being dominated by ECT(0) traffic and switches handling non-ECT
traffic in the RED-marking queue after passing K as drops, where K is
usually a low watermark in order to leave enough tailroom for bursts.

Splitting DCTCP traffic among several queues (ECN and non-ECN queue) is
considered a terrible idea in the network community as it splits single
flows across multiple network paths.

Therefore, commit e3118e8359bb implements this on Linux as ECT(0)-marked
traffic, as we argue that marking all packets of a DCTCP flow is the
only viable solution and also doesn't speak against the draft.

However, a DCTCP implementation has recently also hit FreeBSD's mainline
kernel [2]. In order to let it play well together with Linux' DCTCP, we
would need to loosen the requirement that ECT(0) has to be asserted
during the 3WHS, as this is not implemented in FreeBSD. This simplifies
the ECN test and lets DCTCP work together with FreeBSD.

Joint work with Daniel Borkmann.

[1] https://www.usenix.org/conference/nsdi15/technical-sessions/presentation/judd
[2] https://github.com/freebsd/freebsd/commit/8ad879445281027858a7fa706d13e458095b595f

Signed-off-by: Florian Westphal
Signed-off-by: Daniel Borkmann
Cc: Glenn Judd
Signed-off-by: David S. Miller
---
 net/ipv4/tcp_input.c | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)

(limited to 'net/ipv4/tcp_input.c')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index ed11931f340f..d3dfff78fa19 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5872,10 +5872,9 @@ static inline void pr_drop_req(struct request_sock *req, __u16 port, int family)
  * TCP ECN negotiation.
  *
  * Exception: tcp_ca wants ECN. This is required for DCTCP
- * congestion control; it requires setting ECT on all packets,
- * including SYN. We inverse the test in this case: If our
- * local socket wants ECN, but peer only set ece/cwr (but not
- * ECT in IP header) its probably a non-DCTCP aware sender.
+ * congestion control: Linux DCTCP asserts ECT on all packets,
+ * including SYN, which is most optimal solution; however,
+ * others, such as FreeBSD do not.
  */
 static void tcp_ecn_create_request(struct request_sock *req,
                                    const struct sk_buff *skb,
@@ -5885,18 +5884,15 @@ static void tcp_ecn_create_request(struct request_sock *req,
        const struct tcphdr *th = tcp_hdr(skb);
        const struct net *net = sock_net(listen_sk);
        bool th_ecn = th->ece && th->cwr;
-       bool ect, need_ecn, ecn_ok;
+       bool ect, ecn_ok;
 
        if (!th_ecn)
                return;
 
        ect = !INET_ECN_is_not_ect(TCP_SKB_CB(skb)->ip_dsfield);
-       need_ecn = tcp_ca_needs_ecn(listen_sk);
        ecn_ok = net->ipv4.sysctl_tcp_ecn || dst_feature(dst, RTAX_FEATURE_ECN);
 
-       if (!ect && !need_ecn && ecn_ok)
-               inet_rsk(req)->ecn_ok = 1;
-       else if (ect && need_ecn)
+       if ((!ect && ecn_ok) || tcp_ca_needs_ecn(listen_sk))
                inet_rsk(req)->ecn_ok = 1;
 }
-- 
cgit v1.2.3
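The relaxed check can be read as a small predicate over four booleans. The
sketch below is illustrative only; the parameters stand in for the kernel's
th_ecn, ect and ecn_ok locals and for tcp_ca_needs_ecn(), and the function
name is invented:

/*
 * Illustrative predicate for the relaxed SYN-time ECN decision.
 * th_ecn: SYN had ECE+CWR set; ect: ECT codepoint present in the IP header;
 * ecn_ok: ECN allowed by sysctl or route; ca_needs_ecn: e.g. a DCTCP listener.
 */
#include <stdbool.h>
#include <stdio.h>

static bool ecn_ok_for_request(bool th_ecn, bool ect, bool ecn_ok, bool ca_needs_ecn)
{
	if (!th_ecn)
		return false;
	/* Old code refused DCTCP peers that did not set ECT on the SYN
	 * (e.g. FreeBSD); a DCTCP listener now accepts them too. */
	return (!ect && ecn_ok) || ca_needs_ecn;
}

int main(void)
{
	/* FreeBSD-style DCTCP SYN: ECE+CWR set, but no ECT(0) on the SYN. */
	printf("DCTCP listener, non-ECT SYN: %d\n",
	       ecn_ok_for_request(true, false, false, true));	/* 1: accepted now */
	/* Classic ECN negotiation on a non-DCTCP listener with tcp_ecn=1. */
	printf("Plain listener, tcp_ecn=1:   %d\n",
	       ecn_ok_for_request(true, false, true, false));	/* 1 */
	return 0;
}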
From 032ee4236954eb214651cb9bfc1b38ffa8fd7a01 Mon Sep 17 00:00:00 2001
From: Neal Cardwell
Date: Fri, 6 Feb 2015 16:04:38 -0500
Subject: tcp: helpers to mitigate ACK loops by rate-limiting out-of-window dupacks

Helpers for mitigating ACK loops by rate-limiting dupacks sent in
response to incoming out-of-window packets.

This patch includes:

- rate-limiting logic
- sysctl to control how often we allow dupacks to out-of-window packets
- SNMP counter for cases where we rate-limited our dupack sending

The rate-limiting logic in this patch decides to not send dupacks in
response to out-of-window segments if (a) they are SYNs or pure ACKs
and (b) the remote endpoint is sending them faster than the configured
rate limit.

We rate-limit our responses rather than blocking them entirely or
resetting the connection, because legitimate connections can rely on
dupacks in response to some out-of-window segments. For example, zero
window probes are typically sent with a sequence number that is below
the current window, and ZWPs thus expect to elicit a dupack in response.

We allow dupacks in response to TCP segments with data, because these
may be spurious retransmissions for which the remote endpoint wants to
receive DSACKs. This is safe because segments with data can't
realistically be part of ACK loops, which by their nature consist of
each side sending pure/data-less ACKs to each other.

The dupack interval is controlled by a new sysctl knob,
tcp_invalid_ratelimit, given in milliseconds, in case an administrator
needs to dial this upward in the face of a high-rate DoS attack. The
name and units are chosen to be analogous to the existing knob for
ICMP, icmp_ratelimit.

The default value for tcp_invalid_ratelimit is 500ms, which allows at
most one such dupack per 500ms. This is chosen to be 2x faster than the
1-second minimum RTO interval allowed by RFC 6298 (section 2, rule 2.4).
We allow the extra 2x factor because network delay variations can cause
packets sent at 1-second intervals to be compressed and arrive much
closer together.

Reported-by: Avery Fay
Signed-off-by: Neal Cardwell
Signed-off-by: Yuchung Cheng
Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 net/ipv4/tcp_input.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net/ipv4/tcp_input.c')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index d3dfff78fa19..9401aa43b814 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -100,6 +100,7 @@ int sysctl_tcp_thin_dupack __read_mostly;
 
 int sysctl_tcp_moderate_rcvbuf __read_mostly = 1;
 int sysctl_tcp_early_retrans __read_mostly = 3;
+int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2;
 
 #define FLAG_DATA              0x01 /* Incoming frame contained data. */
 #define FLAG_WIN_UPDATE        0x02 /* Incoming ACK was a window update. */
-- 
cgit v1.2.3
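To make the described policy concrete, here is a rough userspace sketch of the
per-flow limiter's shape. It is not the kernel helper: the real logic sits
behind tcp_oow_rate_limited() (added alongside this sysctl) and additionally
restricts the check to SYNs and pure ACKs, which this sketch omits; the names
below are my own.

/*
 * Sketch of an out-of-window dupack rate limiter: remember when we last
 * sent one and suppress new ones that arrive within the configured interval.
 */
#include <stdbool.h>
#include <stdio.h>

#define HZ 1000
static const unsigned long tcp_invalid_ratelimit = HZ / 2;	/* 500 ms */

/* Returns true if this out-of-window dupack should be suppressed. */
static bool oow_rate_limited(unsigned long now, unsigned long *last_oow_ack_time)
{
	if (*last_oow_ack_time &&
	    now - *last_oow_ack_time < tcp_invalid_ratelimit)
		return true;		/* too soon since the last one: skip it */

	*last_oow_ack_time = now;	/* we are sending one now; remember when */
	return false;
}

int main(void)
{
	unsigned long last = 0;

	printf("%d\n", oow_rate_limited(1000, &last));	/* 0: first dupack is sent */
	printf("%d\n", oow_rate_limited(1200, &last));	/* 1: within 500 ms, skipped */
	printf("%d\n", oow_rate_limited(1600, &last));	/* 0: limit elapsed, sent again */
	return 0;
}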
From f2b2c582e82429270d5818fbabe653f4359d7024 Mon Sep 17 00:00:00 2001
From: Neal Cardwell
Date: Fri, 6 Feb 2015 16:04:40 -0500
Subject: tcp: mitigate ACK loops for connections as tcp_sock

Ensure that in state ESTABLISHED, where the connection is represented by
a tcp_sock, we rate limit dupacks in response to incoming packets (a)
with TCP timestamps that fail PAWS checks, or (b) with sequence numbers
or ACK numbers that are out of the acceptable window.

We do not send a dupack in response to out-of-window packets if it has
been less than sysctl_tcp_invalid_ratelimit (default 500ms) since we
last sent a dupack in response to an out-of-window packet.

There is already a similar (although global) rate-limiting mechanism for
"challenge ACKs". When deciding whether to send a challenge ACK, we
first consult the new per-connection rate limit, and then the global
rate limit.

Reported-by: Avery Fay
Signed-off-by: Neal Cardwell
Signed-off-by: Yuchung Cheng
Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 net/ipv4/tcp_input.c | 29 ++++++++++++++++++++++-------
 1 file changed, 22 insertions(+), 7 deletions(-)

(limited to 'net/ipv4/tcp_input.c')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 9401aa43b814..8fdd27b17306 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3322,13 +3322,22 @@ static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32
 }
 
 /* RFC 5961 7 [ACK Throttling] */
-static void tcp_send_challenge_ack(struct sock *sk)
+static void tcp_send_challenge_ack(struct sock *sk, const struct sk_buff *skb)
 {
        /* unprotected vars, we dont care of overwrites */
        static u32 challenge_timestamp;
        static unsigned int challenge_count;
-       u32 now = jiffies / HZ;
+       struct tcp_sock *tp = tcp_sk(sk);
+       u32 now;
+
+       /* First check our per-socket dupack rate limit. */
+       if (tcp_oow_rate_limited(sock_net(sk), skb,
+                                LINUX_MIB_TCPACKSKIPPEDCHALLENGE,
+                                &tp->last_oow_ack_time))
+               return;
 
+       /* Then check the check host-wide RFC 5961 rate limit. */
+       now = jiffies / HZ;
        if (now != challenge_timestamp) {
                challenge_timestamp = now;
                challenge_count = 0;
@@ -3424,7 +3433,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
        if (before(ack, prior_snd_una)) {
                /* RFC 5961 5.2 [Blind Data Injection Attack].[Mitigation] */
                if (before(ack, prior_snd_una - tp->max_window)) {
-                       tcp_send_challenge_ack(sk);
+                       tcp_send_challenge_ack(sk, skb);
                        return -1;
                }
                goto old_ack;
@@ -4993,7 +5002,10 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
            tcp_paws_discard(sk, skb)) {
                if (!th->rst) {
                        NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);
-                       tcp_send_dupack(sk, skb);
+                       if (!tcp_oow_rate_limited(sock_net(sk), skb,
+                                                 LINUX_MIB_TCPACKSKIPPEDPAWS,
+                                                 &tp->last_oow_ack_time))
+                               tcp_send_dupack(sk, skb);
                        goto discard;
                }
                /* Reset is accepted even if it did not pass PAWS. */
@@ -5010,7 +5022,10 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
                if (!th->rst) {
                        if (th->syn)
                                goto syn_challenge;
-                       tcp_send_dupack(sk, skb);
+                       if (!tcp_oow_rate_limited(sock_net(sk), skb,
+                                                 LINUX_MIB_TCPACKSKIPPEDSEQ,
+                                                 &tp->last_oow_ack_time))
+                               tcp_send_dupack(sk, skb);
                }
                goto discard;
        }
@@ -5026,7 +5041,7 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
                if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt)
                        tcp_reset(sk);
                else
-                       tcp_send_challenge_ack(sk);
+                       tcp_send_challenge_ack(sk, skb);
                goto discard;
        }
 
@@ -5040,7 +5055,7 @@ syn_challenge:
                if (syn_inerr)
                        TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
                NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNCHALLENGE);
-               tcp_send_challenge_ack(sk);
+               tcp_send_challenge_ack(sk, skb);
                goto discard;
        }
-- 
cgit v1.2.3
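A hedged sketch of the two-stage decision described above: the per-connection
limit is consulted first, then the host-wide RFC 5961 challenge-ACK limit. The
helper names, the tick-based clock and the limit of 100 are illustrative
stand-ins rather than the kernel's actual implementation.

/*
 * Sketch: per-connection dupack limit first, then the host-wide
 * challenge-ACK counter, as in the commit message above.
 */
#include <stdbool.h>
#include <stdio.h>

#define HZ 1000
static const unsigned long tcp_invalid_ratelimit = HZ / 2;	/* 500 ms */

/* Per-connection limit: same shape as the sketch after the previous commit. */
static bool per_socket_rate_limited(unsigned long now, unsigned long *last_oow_ack_time)
{
	if (*last_oow_ack_time && now - *last_oow_ack_time < tcp_invalid_ratelimit)
		return true;
	*last_oow_ack_time = now;
	return false;
}

/* Host-wide RFC 5961 limit: a simple per-second counter, as in the old code. */
static bool global_challenge_limit_hit(unsigned long now_sec)
{
	static unsigned long stamp;
	static unsigned int count;
	const unsigned int limit = 100;	/* stand-in for the host-wide sysctl */

	if (now_sec != stamp) {
		stamp = now_sec;
		count = 0;
	}
	return ++count > limit;
}

static bool should_send_challenge_ack(unsigned long now, unsigned long *last_oow)
{
	if (per_socket_rate_limited(now, last_oow))
		return false;				/* per-connection limit first */
	return !global_challenge_limit_hit(now / HZ);	/* then host-wide limit */
}

int main(void)
{
	unsigned long last = 0;

	printf("%d\n", should_send_challenge_ack(1000, &last));	/* 1: allowed */
	printf("%d\n", should_send_challenge_ack(1100, &last));	/* 0: per-socket limit */
	return 0;
}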