summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDavid S. Miller <davem@davemloft.net>2015-03-06 14:57:46 -0500
committerDavid S. Miller <davem@davemloft.net>2015-03-06 14:57:46 -0500
commitf0fdc80bd9ced4de3eb165c9c408e83713a71104 (patch)
treeedd39a71398f8639e3166dd64805399c5a8cec11
parentaaa4e70404c7b38a8792dc69af54afd7218b2ec0 (diff)
parentfab42760843734a82b6b2d1241ca44f375a686eb (diff)
Merge branch 'pmtu-probe'
Fan Du says: ==================== Improvements for TCP PMTU This patchset makes some improvements and enhancements to the current TCP PMTU as per RFC4821, with the aim of finding the optimal MSS size quickly and of adapting to route changes such as an enlarged path MTU. TCP PMTU can then be used to probe an effective PMTU in the absence of ICMP messages for tunnels (e.g. vxlan) across different networking stacks. Patch1/4: Set probe mss base to 1024 Bytes per RFC4821 Patch2/4: Do not double probe_size for each probing, use a simple binary search to gain maximum performance. mss for next probing. Patch3/4: Create a probe timer to detect enlarged path MTU. Patch4/4: Update ip-sysctl.txt for new sysctl knobs. Changelog: v5: - Zero probe_size before resetting search range. - Update ip-sysctl.txt for new sysctl knobs. v4: - Convert probe_size to mss, not directly from search_low/high - Clamp probe_threshold - Don't adjust search_high in blackhole probe, so drop original patch3 v3: - Update commit message for patch2 - Fix pseudo timer delta calculation in patch4 v2: - Introduce sysctl_tcp_probe_threshold to control when probing will stop, as suggested by John Heffner. - Add patch3 to shrink current mss value for search low boundary. - Drop canonical timer usage, implement a pseudo timer based on 32-bit jiffies tcp_time_stamp, as suggested by Eric Dumazet. ==================== Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r--Documentation/networking/ip-sysctl.txt10
-rw-r--r--include/net/inet_connection_sock.h2
-rw-r--r--include/net/netns/ipv4.h2
-rw-r--r--include/net/tcp.h8
-rw-r--r--net/ipv4/sysctl_net_ipv4.c14
-rw-r--r--net/ipv4/tcp_ipv4.c2
-rw-r--r--net/ipv4/tcp_output.c50
-rw-r--r--net/ipv4/tcp_timer.c1
8 files changed, 84 insertions, 5 deletions
diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 1b8c964b0d17..4412f695a62f 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -388,6 +388,16 @@ tcp_mtu_probing - INTEGER
1 - Disabled by default, enabled when an ICMP black hole detected
2 - Always enabled, use initial MSS of tcp_base_mss.
+tcp_probe_interval - INTEGER
+ Controls how often, in seconds, to start a TCP Packetization-Layer
+ Path MTU Discovery reprobe. The default is to reprobe every 600
+ seconds (10 minutes), as per RFC4821.
+
+tcp_probe_threshold - INTEGER
+ Controls when TCP Packetization-Layer Path MTU Discovery probing
+ stops: probing ends once the search range is narrower than this
+ many bytes. Default is 8 bytes.
+
tcp_no_metrics_save - BOOLEAN
By default, TCP saves various connection metrics in the route cache
when the connection closes, so that connections established in the
diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index 5976bdecf58b..b9a6b0a94cc6 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -126,6 +126,8 @@ struct inet_connection_sock {
/* Information on the current probe. */
int probe_size;
+
+ u32 probe_timestamp;
} icsk_mtup;
u32 icsk_ca_priv[16];
u32 icsk_user_timeout;
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 1085e12f940f..8f3a1a1a5a94 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -87,6 +87,8 @@ struct netns_ipv4 {
int sysctl_tcp_fwmark_accept;
int sysctl_tcp_mtu_probing;
int sysctl_tcp_base_mss;
+ int sysctl_tcp_probe_threshold;
+ u32 sysctl_tcp_probe_interval;
struct ping_group_range ping_group_range;
diff --git a/include/net/tcp.h b/include/net/tcp.h
index f87599d5af82..2e11e38205c2 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -65,7 +65,13 @@ void tcp_time_wait(struct sock *sk, int state, int timeo);
#define TCP_MIN_MSS 88U
/* The least MTU to use for probing */
-#define TCP_BASE_MSS 512
+#define TCP_BASE_MSS 1024
+
+/* probing interval, default to 10 minutes as per RFC4821 */
+#define TCP_PROBE_INTERVAL 600
+
+/* TCP MTU probing stops once the search range is narrower than this (bytes) */
+#define TCP_PROBE_THRESHOLD 8
/* After receiving this amount of duplicate ACKs fast retransmit starts. */
#define TCP_FASTRETRANS_THRESH 3
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index d151539da8e6..fdf899163d44 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -883,6 +883,20 @@ static struct ctl_table ipv4_net_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
+ {
+ .procname = "tcp_probe_threshold",
+ .data = &init_net.ipv4.sysctl_tcp_probe_threshold,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "tcp_probe_interval",
+ .data = &init_net.ipv4.sysctl_tcp_probe_interval,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
{ }
};
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 5a2dfed4783b..f0c6fc32bfa8 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2460,6 +2460,8 @@ static int __net_init tcp_sk_init(struct net *net)
}
net->ipv4.sysctl_tcp_ecn = 2;
net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
+ net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
+ net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
return 0;
fail:
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 8bbd86cd81c8..5a73ad5afaf7 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1354,6 +1354,8 @@ void tcp_mtup_init(struct sock *sk)
icsk->icsk_af_ops->net_header_len;
icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, net->ipv4.sysctl_tcp_base_mss);
icsk->icsk_mtup.probe_size = 0;
+ if (icsk->icsk_mtup.enabled)
+ icsk->icsk_mtup.probe_timestamp = tcp_time_stamp;
}
EXPORT_SYMBOL(tcp_mtup_init);
@@ -1828,6 +1830,31 @@ send_now:
return false;
}
+/* Restart the path MTU search once the reprobe interval has elapsed.
+ *
+ * The interval comes from the per-netns tcp_probe_interval sysctl and is
+ * scaled by HZ before being compared with the time that has passed since
+ * icsk_mtup.probe_timestamp.  When it has expired, probe_size is cleared,
+ * search_high is reset to mss_clamp plus the TCP and network header
+ * sizes, search_low is set to the MTU matching the current MSS, and the
+ * timestamp is refreshed.  Otherwise nothing is modified.
+ */
+static inline void tcp_mtu_check_reprobe(struct sock *sk)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct net *net = sock_net(sk);
+	u64 timeout;
+	u32 elapsed;
+
+	/* Scale in 64 bits: a 32-bit "interval * HZ" wraps for large sysctl
+	 * values and would make the reprobe fire far too often.
+	 */
+	timeout = (u64)net->ipv4.sysctl_tcp_probe_interval * HZ;
+
+	/* Unsigned on purpose: the old s32 delta was implicitly converted
+	 * to unsigned by the comparison anyway, so spell that out.
+	 */
+	elapsed = tcp_time_stamp - icsk->icsk_mtup.probe_timestamp;
+	if (unlikely(elapsed >= timeout)) {
+		int mss = tcp_current_mss(sk);
+
+		/* Update current search range */
+		icsk->icsk_mtup.probe_size = 0;
+		icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp +
+			sizeof(struct tcphdr) +
+			icsk->icsk_af_ops->net_header_len;
+		icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss);
+
+		/* Update probe time stamp */
+		icsk->icsk_mtup.probe_timestamp = tcp_time_stamp;
+	}
+}
+
/* Create a new MTU probe if we are ready.
* MTU probe is regularly attempting to increase the path MTU by
* deliberately sending larger packets. This discovers routing
@@ -1842,11 +1869,13 @@ static int tcp_mtu_probe(struct sock *sk)
struct tcp_sock *tp = tcp_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
struct sk_buff *skb, *nskb, *next;
+ struct net *net = sock_net(sk);
int len;
int probe_size;
int size_needed;
int copy;
int mss_now;
+ int interval;
/* Not currently probing/verifying,
* not in recovery,
@@ -1859,12 +1888,25 @@ static int tcp_mtu_probe(struct sock *sk)
tp->rx_opt.num_sacks || tp->rx_opt.dsack)
return -1;
- /* Very simple search strategy: just double the MSS. */
+ /* Use binary search for probe_size between tcp_mss_base,
+ * and current mss_clamp. if (search_high - search_low)
+ * smaller than a threshold, backoff from probing.
+ */
mss_now = tcp_current_mss(sk);
- probe_size = 2 * tp->mss_cache;
+ probe_size = tcp_mtu_to_mss(sk, (icsk->icsk_mtup.search_high +
+ icsk->icsk_mtup.search_low) >> 1);
size_needed = probe_size + (tp->reordering + 1) * tp->mss_cache;
- if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high)) {
- /* TODO: set timer for probe_converge_event */
+ interval = icsk->icsk_mtup.search_high - icsk->icsk_mtup.search_low;
+ /* If the reprobe timer expires while we are still actively
+ * probing, stick with the current probing process and do not
+ * reset the search range to its original values.
+ */
+ if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high) ||
+ interval < net->ipv4.sysctl_tcp_probe_threshold) {
+ /* Check whether enough time has elapsed for
+ * another round of probing.
+ */
+ tcp_mtu_check_reprobe(sk);
return -1;
}
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 0732b787904e..15505936511d 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -107,6 +107,7 @@ static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk)
if (net->ipv4.sysctl_tcp_mtu_probing) {
if (!icsk->icsk_mtup.enabled) {
icsk->icsk_mtup.enabled = 1;
+ icsk->icsk_mtup.probe_timestamp = tcp_time_stamp;
tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
} else {
struct net *net = sock_net(sk);