summaryrefslogtreecommitdiff
path: root/include
diff options
context:
space:
mode:
author David S. Miller <davem@davemloft.net> 2023-10-23 09:35:02 +0100
committer David S. Miller <davem@davemloft.net> 2023-10-23 09:35:02 +0100
commit bdf24b4bdfa59b124f9d0ff837f8d35a908da3b8 (patch)
tree be82eb8c966ea4a7efa7cd6ed0df9d8968ed9f5d /include
parent 35c1b273206346c4178928b1121675dc143e61d2 (diff)
parent a77a0f5c7f23a8a4981a2a3ff47baa91ceaf1f53 (diff)
Merge branch 'tcp-ts-usec-resolution'
Eric Dumazet says: ==================== tcp: add optional usec resolution to TCP TS As discussed in various public places in 2016, Google adopted usec resolution in RFC 7323 TS values, at Van Jacobson's suggestion. Goals were: 1) better observability of delays in networking stacks/fabrics. 2) better disambiguation of events based on TSval/ecr values. 3) building block for congestion control modules needing usec resolution. Back then we implemented a scheme based on private SYN options to safely negotiate the feature. For upstream submission, we chose to use a much simpler route attribute because this feature is probably going to be used in private networks. ip route add 10/8 ... features tcp_usec_ts References: https://www.ietf.org/proceedings/97/slides/slides-97-tcpm-tcp-options-for-low-latency-00.pdf https://datatracker.ietf.org/doc/draft-wang-tcpm-low-latency-opt/ First two patches are fixing old minor bugs and might be taken by stable teams (thanks to appropriate Fixes: tags) ==================== Acked-by: Neal Cardwell <ncardwell@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'include')
-rw-r--r-- include/linux/tcp.h 9
-rw-r--r-- include/net/inet_timewait_sock.h 3
-rw-r--r-- include/net/tcp.h 59
-rw-r--r-- include/uapi/linux/rtnetlink.h 18
-rw-r--r-- include/uapi/linux/tcp.h 1
5 files changed, 65 insertions, 25 deletions
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index e15452df9804..6df715b6e51d 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -152,6 +152,7 @@ struct tcp_request_sock {
u64 snt_synack; /* first SYNACK sent time */
bool tfo_listener;
bool is_mptcp;
+ s8 req_usec_ts;
#if IS_ENABLED(CONFIG_MPTCP)
bool drop_req;
#endif
@@ -257,7 +258,8 @@ struct tcp_sock {
u8 compressed_ack;
u8 dup_ack_counter:2,
tlp_retrans:1, /* TLP is a retransmission */
- unused:5;
+ tcp_usec_ts:1, /* TSval values in usec */
+ unused:4;
u32 chrono_start; /* Start time in jiffies of a TCP chrono */
u32 chrono_stat[3]; /* Time in jiffies for chrono_stat stats */
u8 chrono_type:2, /* current chronograph type */
@@ -576,4 +578,9 @@ void tcp_sock_set_quickack(struct sock *sk, int val);
int tcp_sock_set_syncnt(struct sock *sk, int val);
int tcp_sock_set_user_timeout(struct sock *sk, int val);
+static inline bool dst_tcp_usec_ts(const struct dst_entry *dst)
+{
+ return dst_feature(dst, RTAX_FEATURE_TCP_USEC_TS);
+}
+
#endif /* _LINUX_TCP_H */
diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h
index 4a8e578405cb..b14999ff55db 100644
--- a/include/net/inet_timewait_sock.h
+++ b/include/net/inet_timewait_sock.h
@@ -67,7 +67,8 @@ struct inet_timewait_sock {
/* And these are ours. */
unsigned int tw_transparent : 1,
tw_flowlabel : 20,
- tw_pad : 3, /* 3 bits hole */
+ tw_usec_ts : 1,
+ tw_pad : 2, /* 2 bits hole */
tw_tos : 8;
u32 tw_txhash;
u32 tw_priority;
diff --git a/include/net/tcp.h b/include/net/tcp.h
index bad304d173a5..39b731c900dd 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -166,7 +166,12 @@ static_assert((1 << ATO_BITS) > TCP_DELACK_MAX);
#define MAX_TCP_KEEPCNT 127
#define MAX_TCP_SYNCNT 127
-#define TCP_PAWS_24DAYS (60 * 60 * 24 * 24)
+/* Ensure that TCP PAWS checks are relaxed after ~2147 seconds
+ * to avoid overflows. This assumes a clock smaller than 1 Mhz.
+ * Default clock is 1 Khz, tcp_usec_ts uses 1 Mhz.
+ */
+#define TCP_PAWS_WRAP (INT_MAX / USEC_PER_SEC)
+
#define TCP_PAWS_MSL 60 /* Per-host timestamps are invalidated
* after this time. It should be equal
* (or greater than) TCP_TIMEWAIT_LEN
@@ -798,22 +803,31 @@ static inline u64 tcp_clock_us(void)
return div_u64(tcp_clock_ns(), NSEC_PER_USEC);
}
-/* This should only be used in contexts where tp->tcp_mstamp is up to date */
-static inline u32 tcp_time_stamp(const struct tcp_sock *tp)
+static inline u64 tcp_clock_ms(void)
{
- return div_u64(tp->tcp_mstamp, USEC_PER_SEC / TCP_TS_HZ);
+ return div_u64(tcp_clock_ns(), NSEC_PER_MSEC);
}
-/* Convert a nsec timestamp into TCP TSval timestamp (ms based currently) */
-static inline u32 tcp_ns_to_ts(u64 ns)
+/* TCP Timestamp included in TS option (RFC 1323) can either use ms
+ * or usec resolution. Each socket carries a flag to select one or other
+ * resolution, as the route attribute could change anytime.
+ * Each flow must stick to initial resolution.
+ */
+static inline u32 tcp_clock_ts(bool usec_ts)
{
- return div_u64(ns, NSEC_PER_SEC / TCP_TS_HZ);
+ return usec_ts ? tcp_clock_us() : tcp_clock_ms();
}
-/* Could use tcp_clock_us() / 1000, but this version uses a single divide */
-static inline u32 tcp_time_stamp_raw(void)
+static inline u32 tcp_time_stamp_ms(const struct tcp_sock *tp)
{
- return tcp_ns_to_ts(tcp_clock_ns());
+ return div_u64(tp->tcp_mstamp, USEC_PER_MSEC);
+}
+
+static inline u32 tcp_time_stamp_ts(const struct tcp_sock *tp)
+{
+ if (tp->tcp_usec_ts)
+ return tp->tcp_mstamp;
+ return tcp_time_stamp_ms(tp);
}
void tcp_mstamp_refresh(struct tcp_sock *tp);
@@ -823,17 +837,30 @@ static inline u32 tcp_stamp_us_delta(u64 t1, u64 t0)
return max_t(s64, t1 - t0, 0);
}
-static inline u32 tcp_skb_timestamp(const struct sk_buff *skb)
-{
- return tcp_ns_to_ts(skb->skb_mstamp_ns);
-}
-
/* provide the departure time in us unit */
static inline u64 tcp_skb_timestamp_us(const struct sk_buff *skb)
{
return div_u64(skb->skb_mstamp_ns, NSEC_PER_USEC);
}
+/* Provide skb TSval in usec or ms unit */
+static inline u32 tcp_skb_timestamp_ts(bool usec_ts, const struct sk_buff *skb)
+{
+ if (usec_ts)
+ return tcp_skb_timestamp_us(skb);
+
+ return div_u64(skb->skb_mstamp_ns, NSEC_PER_MSEC);
+}
+
+static inline u32 tcp_tw_tsval(const struct tcp_timewait_sock *tcptw)
+{
+ return tcp_clock_ts(tcptw->tw_sk.tw_usec_ts) + tcptw->tw_ts_offset;
+}
+
+static inline u32 tcp_rsk_tsval(const struct tcp_request_sock *treq)
+{
+ return tcp_clock_ts(treq->req_usec_ts) + treq->ts_off;
+}
#define tcp_flag_byte(th) (((u_int8_t *)th)[13])
@@ -1599,7 +1626,7 @@ static inline bool tcp_paws_check(const struct tcp_options_received *rx_opt,
if ((s32)(rx_opt->ts_recent - rx_opt->rcv_tsval) <= paws_win)
return true;
if (unlikely(!time_before32(ktime_get_seconds(),
- rx_opt->ts_recent_stamp + TCP_PAWS_24DAYS)))
+ rx_opt->ts_recent_stamp + TCP_PAWS_WRAP)))
return true;
/*
* Some OSes send SYN and SYNACK messages with tsval=0 tsecr=0,
diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h
index 51c13cf9c5ae..aa2482a0614a 100644
--- a/include/uapi/linux/rtnetlink.h
+++ b/include/uapi/linux/rtnetlink.h
@@ -502,13 +502,17 @@ enum {
#define RTAX_MAX (__RTAX_MAX - 1)
-#define RTAX_FEATURE_ECN (1 << 0)
-#define RTAX_FEATURE_SACK (1 << 1)
-#define RTAX_FEATURE_TIMESTAMP (1 << 2)
-#define RTAX_FEATURE_ALLFRAG (1 << 3)
-
-#define RTAX_FEATURE_MASK (RTAX_FEATURE_ECN | RTAX_FEATURE_SACK | \
- RTAX_FEATURE_TIMESTAMP | RTAX_FEATURE_ALLFRAG)
+#define RTAX_FEATURE_ECN (1 << 0)
+#define RTAX_FEATURE_SACK (1 << 1) /* unused */
+#define RTAX_FEATURE_TIMESTAMP (1 << 2) /* unused */
+#define RTAX_FEATURE_ALLFRAG (1 << 3)
+#define RTAX_FEATURE_TCP_USEC_TS (1 << 4)
+
+#define RTAX_FEATURE_MASK (RTAX_FEATURE_ECN | \
+ RTAX_FEATURE_SACK | \
+ RTAX_FEATURE_TIMESTAMP | \
+ RTAX_FEATURE_ALLFRAG | \
+ RTAX_FEATURE_TCP_USEC_TS)
struct rta_session {
__u8 proto;
diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
index d1d08da6331a..8aa3916e14f6 100644
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -170,6 +170,7 @@ enum tcp_fastopen_client_fail {
#define TCPI_OPT_ECN 8 /* ECN was negociated at TCP session init */
#define TCPI_OPT_ECN_SEEN 16 /* we received at least one packet with ECT */
#define TCPI_OPT_SYN_DATA 32 /* SYN-ACK acked data in SYN sent or rcvd */
+#define TCPI_OPT_USEC_TS 64 /* usec timestamps */
/*
* Sender's congestion state indicating normal or abnormal situations