summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/networking/ip-sysctl.txt10
-rw-r--r--include/net/inet_connection_sock.h2
-rw-r--r--include/net/netns/ipv4.h2
-rw-r--r--include/net/tcp.h8
-rw-r--r--net/ipv4/sysctl_net_ipv4.c14
-rw-r--r--net/ipv4/tcp_ipv4.c2
-rw-r--r--net/ipv4/tcp_output.c50
-rw-r--r--net/ipv4/tcp_timer.c1
8 files changed, 84 insertions, 5 deletions
diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 1b8c964b0d17..4412f695a62f 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -388,6 +388,16 @@ tcp_mtu_probing - INTEGER
1 - Disabled by default, enabled when an ICMP black hole detected
2 - Always enabled, use initial MSS of tcp_base_mss.
+tcp_probe_interval - INTEGER
+ Controls how often to start TCP Packetization-Layer Path MTU
+ Discovery reprobe. The default is reprobing every 10 minutes as
+ per RFC4821.
+
+tcp_probe_threshold - INTEGER
+ Controls when TCP Packetization-Layer Path MTU Discovery probing
+ will stop in respect to the width of search range in bytes. Default
+ is 8 bytes.
+
tcp_no_metrics_save - BOOLEAN
By default, TCP saves various connection metrics in the route cache
when the connection closes, so that connections established in the
diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index 5976bdecf58b..b9a6b0a94cc6 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -126,6 +126,8 @@ struct inet_connection_sock {
/* Information on the current probe. */
int probe_size;
+
+ u32 probe_timestamp;
} icsk_mtup;
u32 icsk_ca_priv[16];
u32 icsk_user_timeout;
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 1085e12f940f..8f3a1a1a5a94 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -87,6 +87,8 @@ struct netns_ipv4 {
int sysctl_tcp_fwmark_accept;
int sysctl_tcp_mtu_probing;
int sysctl_tcp_base_mss;
+ int sysctl_tcp_probe_threshold;
+ u32 sysctl_tcp_probe_interval;
struct ping_group_range ping_group_range;
diff --git a/include/net/tcp.h b/include/net/tcp.h
index f87599d5af82..2e11e38205c2 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -65,7 +65,13 @@ void tcp_time_wait(struct sock *sk, int state, int timeo);
#define TCP_MIN_MSS 88U
/* The least MTU to use for probing */
-#define TCP_BASE_MSS 512
+#define TCP_BASE_MSS 1024
+
+/* probing interval, default to 10 minutes as per RFC4821 */
+#define TCP_PROBE_INTERVAL 600
+
+/* Specify interval when tcp mtu probing will stop */
+#define TCP_PROBE_THRESHOLD 8
/* After receiving this amount of duplicate ACKs fast retransmit starts. */
#define TCP_FASTRETRANS_THRESH 3
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index d151539da8e6..fdf899163d44 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -883,6 +883,20 @@ static struct ctl_table ipv4_net_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
+ {
+ .procname = "tcp_probe_threshold",
+ .data = &init_net.ipv4.sysctl_tcp_probe_threshold,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "tcp_probe_interval",
+ .data = &init_net.ipv4.sysctl_tcp_probe_interval,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
{ }
};
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 5a2dfed4783b..f0c6fc32bfa8 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2460,6 +2460,8 @@ static int __net_init tcp_sk_init(struct net *net)
}
net->ipv4.sysctl_tcp_ecn = 2;
net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
+ net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
+ net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
return 0;
fail:
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 8bbd86cd81c8..5a73ad5afaf7 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1354,6 +1354,8 @@ void tcp_mtup_init(struct sock *sk)
icsk->icsk_af_ops->net_header_len;
icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, net->ipv4.sysctl_tcp_base_mss);
icsk->icsk_mtup.probe_size = 0;
+ if (icsk->icsk_mtup.enabled)
+ icsk->icsk_mtup.probe_timestamp = tcp_time_stamp;
}
EXPORT_SYMBOL(tcp_mtup_init);
@@ -1828,6 +1830,31 @@ send_now:
return false;
}
+static inline void tcp_mtu_check_reprobe(struct sock *sk)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct net *net = sock_net(sk);
+ u32 interval;
+ s32 delta;
+
+ interval = net->ipv4.sysctl_tcp_probe_interval;
+ delta = tcp_time_stamp - icsk->icsk_mtup.probe_timestamp;
+ if (unlikely(delta >= interval * HZ)) {
+ int mss = tcp_current_mss(sk);
+
+ /* Update current search range */
+ icsk->icsk_mtup.probe_size = 0;
+ icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp +
+ sizeof(struct tcphdr) +
+ icsk->icsk_af_ops->net_header_len;
+ icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss);
+
+ /* Update probe time stamp */
+ icsk->icsk_mtup.probe_timestamp = tcp_time_stamp;
+ }
+}
+
/* Create a new MTU probe if we are ready.
* MTU probe is regularly attempting to increase the path MTU by
* deliberately sending larger packets. This discovers routing
@@ -1842,11 +1869,13 @@ static int tcp_mtu_probe(struct sock *sk)
struct tcp_sock *tp = tcp_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
struct sk_buff *skb, *nskb, *next;
+ struct net *net = sock_net(sk);
int len;
int probe_size;
int size_needed;
int copy;
int mss_now;
+ int interval;
/* Not currently probing/verifying,
* not in recovery,
@@ -1859,12 +1888,25 @@ static int tcp_mtu_probe(struct sock *sk)
tp->rx_opt.num_sacks || tp->rx_opt.dsack)
return -1;
- /* Very simple search strategy: just double the MSS. */
+ /* Use binary search for probe_size between tcp_mss_base,
+ * and current mss_clamp. if (search_high - search_low)
+ * smaller than a threshold, backoff from probing.
+ */
mss_now = tcp_current_mss(sk);
- probe_size = 2 * tp->mss_cache;
+ probe_size = tcp_mtu_to_mss(sk, (icsk->icsk_mtup.search_high +
+ icsk->icsk_mtup.search_low) >> 1);
size_needed = probe_size + (tp->reordering + 1) * tp->mss_cache;
- if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high)) {
- /* TODO: set timer for probe_converge_event */
+ interval = icsk->icsk_mtup.search_high - icsk->icsk_mtup.search_low;
+ /* When misfortune happens, we are reprobing actively,
+ * and then reprobe timer has expired. We stick with current
+ * probing process by not resetting search range to its orignal.
+ */
+ if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high) ||
+ interval < net->ipv4.sysctl_tcp_probe_threshold) {
+ /* Check whether enough time has elaplased for
+ * another round of probing.
+ */
+ tcp_mtu_check_reprobe(sk);
return -1;
}
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 0732b787904e..15505936511d 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -107,6 +107,7 @@ static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk)
if (net->ipv4.sysctl_tcp_mtu_probing) {
if (!icsk->icsk_mtup.enabled) {
icsk->icsk_mtup.enabled = 1;
+ icsk->icsk_mtup.probe_timestamp = tcp_time_stamp;
tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
} else {
struct net *net = sock_net(sk);