diff options
| author | Jakub Kicinski <kuba@kernel.org> | 2025-11-20 17:44:26 -0800 |
|---|---|---|
| committer | Jakub Kicinski <kuba@kernel.org> | 2025-11-20 17:44:27 -0800 |
| commit | 4707191ca9d337599cdcfd8b296344c28eef9d03 (patch) | |
| tree | 7117223e56ad8b724e987ba02ef4e7ef0f00249d | |
| parent | 738cd803b9d418861438df0c7232e9ca6afad58e (diff) | |
| parent | ecfea98b7d0d56c5bf2df3fc02c5501afa5cef6f (diff) | |
Merge branch 'tcp-tcp_rcvbuf_grow-changes'
Eric Dumazet says:
====================
tcp: tcp_rcvbuf_grow() changes
First pach is minor and moves tcp_moderate_rcvbuf in appropriate group.
Second patch is another attempt to keep small sk->sk_rcvbuf for DC
(small RT) TCP flows for optimal performance.
====================
Link: https://patch.msgid.link/20251119084813.3684576-1-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
| -rw-r--r-- | Documentation/networking/ip-sysctl.rst | 10 | ||||
| -rw-r--r-- | Documentation/networking/net_cachelines/netns_ipv4_sysctl.rst | 3 | ||||
| -rw-r--r-- | include/net/netns/ipv4.h | 3 | ||||
| -rw-r--r-- | net/core/net_namespace.c | 11 | ||||
| -rw-r--r-- | net/ipv4/sysctl_net_ipv4.c | 9 | ||||
| -rw-r--r-- | net/ipv4/tcp_input.c | 18 | ||||
| -rw-r--r-- | net/ipv4/tcp_ipv4.c | 1 |
7 files changed, 42 insertions, 13 deletions
diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst index f4ad739a6b53..bc9a01606daf 100644 --- a/Documentation/networking/ip-sysctl.rst +++ b/Documentation/networking/ip-sysctl.rst @@ -673,6 +673,16 @@ tcp_moderate_rcvbuf - BOOLEAN Default: 1 (enabled) +tcp_rcvbuf_low_rtt - INTEGER + rcvbuf autotuning can over estimate final socket rcvbuf, which + can lead to cache trashing for high throughput flows. + + For small RTT flows (below tcp_rcvbuf_low_rtt usecs), we can relax + rcvbuf growth: Few additional ms to reach the final (and smaller) + rcvbuf is a good tradeoff. + + Default : 1000 (1 ms) + tcp_mtu_probing - INTEGER Controls TCP Packetization-Layer Path MTU Discovery. Takes three values: diff --git a/Documentation/networking/net_cachelines/netns_ipv4_sysctl.rst b/Documentation/networking/net_cachelines/netns_ipv4_sysctl.rst index 6e7b20afd2d4..beaf1880a19b 100644 --- a/Documentation/networking/net_cachelines/netns_ipv4_sysctl.rst +++ b/Documentation/networking/net_cachelines/netns_ipv4_sysctl.rst @@ -102,7 +102,8 @@ u8 sysctl_tcp_app_win u8 sysctl_tcp_frto tcp_enter_loss u8 sysctl_tcp_nometrics_save TCP_LAST_ACK/tcp_update_metrics u8 sysctl_tcp_no_ssthresh_metrics_save TCP_LAST_ACK/tcp_(update/init)_metrics -u8 sysctl_tcp_moderate_rcvbuf read_mostly read_mostly tcp_tso_should_defer(tx);tcp_rcv_space_adjust(rx) +u8 sysctl_tcp_moderate_rcvbuf read_mostly tcp_rcvbuf_grow() +u32 sysctl_tcp_rcvbuf_low_rtt read_mostly tcp_rcvbuf_grow() u8 sysctl_tcp_tso_win_divisor read_mostly tcp_tso_should_defer(tcp_write_xmit) u8 sysctl_tcp_workaround_signed_windows tcp_select_window int sysctl_tcp_limit_output_bytes read_mostly tcp_small_queue_check(tcp_write_xmit) diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index de9d36acc8e2..2dbd46fc4734 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -74,17 +74,18 @@ struct netns_ipv4 { /* TXRX readonly hotpath cache lines */ __cacheline_group_begin(netns_ipv4_read_txrx); - u8 sysctl_tcp_moderate_rcvbuf; __cacheline_group_end(netns_ipv4_read_txrx); /* RX readonly hotpath cache line */ __cacheline_group_begin(netns_ipv4_read_rx); + u8 sysctl_tcp_moderate_rcvbuf; u8 sysctl_ip_early_demux; u8 sysctl_tcp_early_demux; u8 sysctl_tcp_l3mdev_accept; /* 3 bytes hole, try to pack */ int sysctl_tcp_reordering; int sysctl_tcp_rmem[3]; + int sysctl_tcp_rcvbuf_low_rtt; __cacheline_group_end(netns_ipv4_read_rx); struct inet_timewait_death_row tcp_death_row; diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index adcfef55a66f..dfad7c03b809 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -1223,15 +1223,13 @@ static void __init netns_ipv4_struct_check(void) sysctl_tcp_wmem); CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx, sysctl_ip_fwd_use_pmtu); - CACHELINE_ASSERT_GROUP_SIZE(struct netns_ipv4, netns_ipv4_read_tx, 33); - - /* TXRX readonly hotpath cache lines */ - CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_txrx, - sysctl_tcp_moderate_rcvbuf); - CACHELINE_ASSERT_GROUP_SIZE(struct netns_ipv4, netns_ipv4_read_txrx, 1); /* RX readonly hotpath cache line */ CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx, + sysctl_tcp_moderate_rcvbuf); + CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx, + sysctl_tcp_rcvbuf_low_rtt); + CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx, sysctl_ip_early_demux); CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx, sysctl_tcp_early_demux); @@ -1241,7 +1239,6 @@ static void __init netns_ipv4_struct_check(void) sysctl_tcp_reordering); CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx, sysctl_tcp_rmem); - CACHELINE_ASSERT_GROUP_SIZE(struct netns_ipv4, netns_ipv4_read_rx, 22); } #endif diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 35367f8e2da3..a1a50a5c80dc 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -1343,6 +1343,15 @@ static struct ctl_table ipv4_net_table[] = { .proc_handler = proc_dou8vec_minmax, }, { + .procname = "tcp_rcvbuf_low_rtt", + .data = &init_net.ipv4.sysctl_tcp_rcvbuf_low_rtt, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_INT_MAX, + }, + { .procname = "tcp_tso_win_divisor", .data = &init_net.ipv4.sysctl_tcp_tso_win_divisor, .maxlen = sizeof(u8), diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 9df5d7515605..198f8a0d37be 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -896,6 +896,7 @@ void tcp_rcvbuf_grow(struct sock *sk, u32 newval) const struct net *net = sock_net(sk); struct tcp_sock *tp = tcp_sk(sk); u32 rcvwin, rcvbuf, cap, oldval; + u32 rtt_threshold, rtt_us; u64 grow; oldval = tp->rcvq_space.space; @@ -908,10 +909,19 @@ void tcp_rcvbuf_grow(struct sock *sk, u32 newval) /* DRS is always one RTT late. */ rcvwin = newval << 1; - /* slow start: allow the sender to double its rate. */ - grow = (u64)rcvwin * (newval - oldval); - do_div(grow, oldval); - rcvwin += grow << 1; + rtt_us = tp->rcv_rtt_est.rtt_us >> 3; + rtt_threshold = READ_ONCE(net->ipv4.sysctl_tcp_rcvbuf_low_rtt); + if (rtt_us < rtt_threshold) { + /* For small RTT, we set @grow to rcvwin * rtt_us/rtt_threshold. + * It might take few additional ms to reach 'line rate', + * but will avoid sk_rcvbuf inflation and poor cache use. + */ + grow = div_u64((u64)rcvwin * rtt_us, rtt_threshold); + } else { + /* slow start: allow the sender to double its rate. */ + grow = div_u64(((u64)rcvwin << 1) * (newval - oldval), oldval); + } + rcvwin += grow; if (!RB_EMPTY_ROOT(&tp->out_of_order_queue)) rcvwin += TCP_SKB_CB(tp->ooo_last_skb)->end_seq - tp->rcv_nxt; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 6fcaecb67284..e0bb8d9e2d9c 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -3566,6 +3566,7 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_adv_win_scale = 1; net->ipv4.sysctl_tcp_frto = 2; net->ipv4.sysctl_tcp_moderate_rcvbuf = 1; + net->ipv4.sysctl_tcp_rcvbuf_low_rtt = USEC_PER_MSEC; /* This limits the percentage of the congestion window which we * will allow a single TSO frame to consume. Building TSO frames * which are too large can cause TCP streams to be bursty. |
