Merge branch 'tcp-receive-side-improvements'

Eric Dumazet says:

====================
tcp: receive side improvements

We have set tcp_rmem[2] to 15 MB for about 8 years at Google,
but have had some issues with high-speed flows over very small RTTs.

TCP rx autotuning has a tendency to overestimate the RTT,
and thus tp->rcvq_space.space and sk->sk_rcvbuf.

This makes TCP receive queues much bigger than necessary,
to the point that CPU caches are evicted before the application
can copy the data, on CPUs using DDIO.

This series aims to fix this.

- The first patch adds a tcp_rcvbuf_grow() tracepoint, which was very
  convenient for studying the various issues fixed in this series.

- Seven patches fix receiver autotuning issues.

- Two patches fix sender-side issues.

- The final patch increases tcp_rmem[2] so that TCP throughput over
  WAN paths can meet modern needs.

Tested on a 200Gbit NIC; average maximum throughput of a single flow:

Before:
 73593 Mbit/s.

After:
 122514 Mbit/s.
====================

Link: https://patch.msgid.link/20250513193919.1089692-1-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Committed by Jakub Kicinski <kuba@kernel.org> on 2025-05-15 11:30:11 -07:00
commit 2da35e4b4d
7 changed files with 134 additions and 66 deletions


@@ -735,7 +735,7 @@ tcp_rmem - vector of 3 INTEGERs: min, default, max
net.core.rmem_max. Calling setsockopt() with SO_RCVBUF disables
automatic tuning of that socket's receive buffer size, in which
case this value is ignored.
Default: between 131072 and 6MB, depending on RAM size.
Default: between 131072 and 32MB, depending on RAM size.
tcp_sack - BOOLEAN
Enable select acknowledgments (SACKS).
@@ -1099,7 +1099,7 @@ tcp_limit_output_bytes - INTEGER
limits the number of bytes on qdisc or device to reduce artificial
RTT/cwnd and reduce bufferbloat.
Default: 1048576 (16 * 65536)
Default: 4194304 (4 MB)
tcp_challenge_ack_limit - INTEGER
Limits number of Challenge ACK sent per second, as recommended

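For context, the SO_RCVBUF escape hatch mentioned in the tcp_rmem text above still applies: a socket that sets its receive buffer explicitly is never autotuned, so the new 32MB ceiling only matters for sockets that leave the buffer alone. A minimal userspace sketch, not part of this series (the printed values assume default sysctls):

    #include <stdio.h>
    #include <sys/socket.h>
    #include <netinet/in.h>
    #include <unistd.h>

    int main(void)
    {
        int autotuned = socket(AF_INET, SOCK_STREAM, 0);
        int locked = socket(AF_INET, SOCK_STREAM, 0);
        int val = 64 * 1024;                /* arbitrary example request */
        socklen_t len = sizeof(val);

        /* An explicit SO_RCVBUF sets SOCK_RCVBUF_LOCK: autotuning is off
         * for this socket and tcp_rmem[2] no longer applies.  The kernel
         * stores twice the requested value (capped by net.core.rmem_max).
         */
        setsockopt(locked, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val));

        getsockopt(autotuned, SOL_SOCKET, SO_RCVBUF, &val, &len);
        printf("autotuned socket starts at tcp_rmem[1]: %d\n", val);

        getsockopt(locked, SOL_SOCKET, SO_RCVBUF, &val, &len);
        printf("SO_RCVBUF-locked socket: %d\n", val);

        close(autotuned);
        close(locked);
        return 0;
    }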

@@ -340,7 +340,7 @@ struct tcp_sock {
} rcv_rtt_est;
/* Receiver queue space */
struct {
u32 space;
int space;
u32 seq;
u64 time;
} rcvq_space;
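The switch from u32 to int is presumably needed because rcvq_space.space is now compared against a copied-byte count that can go negative once in-queue bytes are subtracted (see the tcp_rcv_space_adjust() hunk later in this commit); an unsigned field would make that comparison silently wrong. A tiny standalone illustration of the pitfall (userspace, not kernel code):

    #include <stdio.h>

    int main(void)
    {
        int copied = -1000;             /* copied -= inq can go negative */
        unsigned int old_space = 65536; /* rcvq_space.space as u32 (old) */
        int new_space = 65536;          /* rcvq_space.space as int (new) */

        /* With a u32 field, the negative 'copied' is converted to a huge
         * unsigned value and the early "nothing new copied" bail-out is
         * skipped by mistake.
         */
        printf("u32 space: %s\n", copied <= old_space ? "bail out" : "keep going");
        printf("int space: %s\n", copied <= new_space ? "bail out" : "keep going");
        return 0;
    }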


@@ -213,6 +213,79 @@ DEFINE_EVENT(tcp_event_sk, tcp_rcv_space_adjust,
TP_ARGS(sk)
);
TRACE_EVENT(tcp_rcvbuf_grow,
TP_PROTO(struct sock *sk, int time),
TP_ARGS(sk, time),
TP_STRUCT__entry(
__field(int, time)
__field(__u32, rtt_us)
__field(__u32, copied)
__field(__u32, inq)
__field(__u32, space)
__field(__u32, ooo_space)
__field(__u32, rcvbuf)
__field(__u8, scaling_ratio)
__field(__u16, sport)
__field(__u16, dport)
__field(__u16, family)
__array(__u8, saddr, 4)
__array(__u8, daddr, 4)
__array(__u8, saddr_v6, 16)
__array(__u8, daddr_v6, 16)
__field(const void *, skaddr)
__field(__u64, sock_cookie)
),
TP_fast_assign(
struct inet_sock *inet = inet_sk(sk);
struct tcp_sock *tp = tcp_sk(sk);
__be32 *p32;
__entry->time = time;
__entry->rtt_us = tp->rcv_rtt_est.rtt_us >> 3;
__entry->copied = tp->copied_seq - tp->rcvq_space.seq;
__entry->inq = tp->rcv_nxt - tp->copied_seq;
__entry->space = tp->rcvq_space.space;
__entry->ooo_space = RB_EMPTY_ROOT(&tp->out_of_order_queue) ? 0 :
TCP_SKB_CB(tp->ooo_last_skb)->end_seq -
tp->rcv_nxt;
__entry->rcvbuf = sk->sk_rcvbuf;
__entry->scaling_ratio = tp->scaling_ratio;
__entry->sport = ntohs(inet->inet_sport);
__entry->dport = ntohs(inet->inet_dport);
__entry->family = sk->sk_family;
p32 = (__be32 *) __entry->saddr;
*p32 = inet->inet_saddr;
p32 = (__be32 *) __entry->daddr;
*p32 = inet->inet_daddr;
TP_STORE_ADDRS(__entry, inet->inet_saddr, inet->inet_daddr,
sk->sk_v6_rcv_saddr, sk->sk_v6_daddr);
__entry->skaddr = sk;
__entry->sock_cookie = sock_gen_cookie(sk);
),
TP_printk("time=%u rtt_us=%u copied=%u inq=%u space=%u ooo=%u scaling_ratio=%u rcvbuf=%u "
"family=%s sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 "
"saddrv6=%pI6c daddrv6=%pI6c skaddr=%p sock_cookie=%llx",
__entry->time, __entry->rtt_us, __entry->copied,
__entry->inq, __entry->space, __entry->ooo_space,
__entry->scaling_ratio, __entry->rcvbuf,
show_family_name(__entry->family),
__entry->sport, __entry->dport,
__entry->saddr, __entry->daddr,
__entry->saddr_v6, __entry->daddr_v6,
__entry->skaddr,
__entry->sock_cookie)
);
TRACE_EVENT(tcp_retransmit_synack,
TP_PROTO(const struct sock *sk, const struct request_sock *req),

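One way to consume the new tracepoint is plain tracefs; the sketch below assumes tracefs is mounted at /sys/kernel/tracing and a kernel that carries this series (run as root):

    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
        char buf[4096];
        ssize_t n;
        int fd;

        /* Enable the tcp:tcp_rcvbuf_grow event added by this series. */
        fd = open("/sys/kernel/tracing/events/tcp/tcp_rcvbuf_grow/enable",
                  O_WRONLY);
        if (fd < 0 || write(fd, "1", 1) != 1) {
            perror("enable tcp_rcvbuf_grow");
            return 1;
        }
        close(fd);

        /* Stream events; each line follows the TP_printk() format above. */
        fd = open("/sys/kernel/tracing/trace_pipe", O_RDONLY);
        if (fd < 0) {
            perror("trace_pipe");
            return 1;
        }
        while ((n = read(fd, buf, sizeof(buf))) > 0)
            fwrite(buf, 1, n, stdout);
        close(fd);
        return 0;
    }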

@@ -5231,7 +5231,7 @@ void __init tcp_init(void)
/* Set per-socket limits to no more than 1/128 the pressure threshold */
limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7);
max_wshare = min(4UL*1024*1024, limit);
max_rshare = min(6UL*1024*1024, limit);
max_rshare = min(32UL*1024*1024, limit);
init_net.ipv4.sysctl_tcp_wmem[0] = PAGE_SIZE;
init_net.ipv4.sysctl_tcp_wmem[1] = 16*1024;
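The 32MB value is only a cap: as in the hunk above, the per-socket receive share is still scaled down on small machines, which is where the documentation's "between 131072 and 32MB, depending on RAM size" comes from. A rough userspace model (page size and free-page counts are assumed values, not kernel code):

    #include <stdio.h>

    #define MB (1024UL * 1024)

    /* Rough model of the sizing above: the per-socket receive share is
     * about 1/128 of low memory, now capped at 32MB instead of 6MB.
     */
    static unsigned long max_rshare(unsigned long free_pages, int page_shift)
    {
        unsigned long limit = free_pages << (page_shift - 7);

        return limit < 32 * MB ? limit : 32 * MB;
    }

    int main(void)
    {
        /* Assumed machine sizes, 4KB pages. */
        printf("1GB box : %lu MB\n", max_rshare(1UL << 18, 12) / MB);
        printf("16GB box: %lu MB\n", max_rshare(1UL << 22, 12) / MB);
        return 0;
    }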


@@ -664,10 +664,12 @@ EXPORT_IPV6_MOD(tcp_initialize_rcv_mss);
*/
static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep)
{
u32 new_sample = tp->rcv_rtt_est.rtt_us;
long m = sample;
u32 new_sample, old_sample = tp->rcv_rtt_est.rtt_us;
long m = sample << 3;
if (new_sample != 0) {
if (old_sample == 0 || m < old_sample) {
new_sample = m;
} else {
/* If we sample in larger samples in the non-timestamp
* case, we could grossly overestimate the RTT especially
* with chatty applications or bulk transfer apps which
@@ -678,17 +680,12 @@ static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep)
* else with timestamps disabled convergence takes too
* long.
*/
if (!win_dep) {
m -= (new_sample >> 3);
new_sample += m;
} else {
m <<= 3;
if (m < new_sample)
new_sample = m;
}
} else {
/* No previous measure. */
new_sample = m << 3;
if (win_dep)
return;
/* Do not use this sample if receive queue is not empty. */
if (tp->rcv_nxt != tp->copied_seq)
return;
new_sample = old_sample - (old_sample >> 3) + sample;
}
tp->rcv_rtt_est.rtt_us = new_sample;
@@ -712,7 +709,7 @@ static inline void tcp_rcv_rtt_measure(struct tcp_sock *tp)
tp->rcv_rtt_est.time = tp->tcp_mstamp;
}
static s32 tcp_rtt_tsopt_us(const struct tcp_sock *tp)
static s32 tcp_rtt_tsopt_us(const struct tcp_sock *tp, u32 min_delta)
{
u32 delta, delta_us;
@@ -722,7 +719,7 @@ static s32 tcp_rtt_tsopt_us(const struct tcp_sock *tp)
if (likely(delta < INT_MAX / (USEC_PER_SEC / TCP_TS_HZ))) {
if (!delta)
delta = 1;
delta = min_delta;
delta_us = delta * (USEC_PER_SEC / TCP_TS_HZ);
return delta_us;
}
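tcp_rtt_tsopt_us() works in TCP timestamp ticks (1 ms with TCP_TS_HZ = 1000). The new min_delta argument lets the receive autotuning path pass 0, so a same-tick sample comes back as 0 and is dropped by the 'delta > 0' test in the next hunk instead of being rounded up to a full millisecond, which used to inflate the RTT estimate on very low-latency paths; the ACK RTT caller further down keeps the old 1-tick floor. A standalone model of the conversion (userspace, not kernel code):

    #include <stdio.h>

    #define TCP_TS_HZ    1000U       /* TCP timestamp clock: 1ms ticks */
    #define USEC_PER_SEC 1000000U

    /* Model of tcp_rtt_tsopt_us(): convert a timestamp echo delta (in
     * ticks) to microseconds, rounding zero up to 'min_delta' ticks.
     */
    static long rtt_tsopt_us(unsigned int delta_ticks, unsigned int min_delta)
    {
        if (!delta_ticks)
            delta_ticks = min_delta;
        return (long)delta_ticks * (USEC_PER_SEC / TCP_TS_HZ);
    }

    int main(void)
    {
        /* Receive autotuning path (min_delta = 0): a same-tick sample is
         * now 0 and gets discarded, instead of being counted as 1ms.
         */
        printf("rcv path: %ld us\n", rtt_tsopt_us(0, 0));
        /* RTT measurement of ACKed data keeps the old 1-tick floor. */
        printf("ack path: %ld us\n", rtt_tsopt_us(0, 1));
        return 0;
    }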
@@ -740,13 +737,39 @@ static inline void tcp_rcv_rtt_measure_ts(struct sock *sk,
if (TCP_SKB_CB(skb)->end_seq -
TCP_SKB_CB(skb)->seq >= inet_csk(sk)->icsk_ack.rcv_mss) {
s32 delta = tcp_rtt_tsopt_us(tp);
s32 delta = tcp_rtt_tsopt_us(tp, 0);
if (delta >= 0)
if (delta > 0)
tcp_rcv_rtt_update(tp, delta, 0);
}
}
static void tcp_rcvbuf_grow(struct sock *sk)
{
const struct net *net = sock_net(sk);
struct tcp_sock *tp = tcp_sk(sk);
int rcvwin, rcvbuf, cap;
if (!READ_ONCE(net->ipv4.sysctl_tcp_moderate_rcvbuf) ||
(sk->sk_userlocks & SOCK_RCVBUF_LOCK))
return;
/* slow start: allow the sender to double its rate. */
rcvwin = tp->rcvq_space.space << 1;
if (!RB_EMPTY_ROOT(&tp->out_of_order_queue))
rcvwin += TCP_SKB_CB(tp->ooo_last_skb)->end_seq - tp->rcv_nxt;
cap = READ_ONCE(net->ipv4.sysctl_tcp_rmem[2]);
rcvbuf = min_t(u32, tcp_space_from_win(sk, rcvwin), cap);
if (rcvbuf > sk->sk_rcvbuf) {
WRITE_ONCE(sk->sk_rcvbuf, rcvbuf);
/* Make the window clamp follow along. */
WRITE_ONCE(tp->window_clamp,
tcp_win_from_space(sk, rcvbuf));
}
}
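Put differently, tcp_rcvbuf_grow() sizes sk_rcvbuf so the advertised window can double every RTT (plus any out-of-order backlog), converts window bytes to buffer bytes with the scaling_ratio estimate, and caps the result at tcp_rmem[2]. A rough userspace model of that arithmetic (the scaling_ratio of 128, i.e. ~50% payload per truesize byte, is just an assumed value):

    #include <stdio.h>

    /* Rough model of tcp_rcvbuf_grow(): 8 is TCP_RMEM_TO_WIN_SCALE, so
     * buffer = window * 256 / scaling_ratio, as in tcp_space_from_win().
     */
    static unsigned long grow_rcvbuf(unsigned long space, unsigned long ooo,
                                     unsigned int scaling_ratio,
                                     unsigned long rmem2, unsigned long rcvbuf)
    {
        unsigned long rcvwin = 2 * space + ooo; /* let the sender double its rate */
        unsigned long want = ((unsigned long long)rcvwin << 8) / scaling_ratio;

        if (want > rmem2)
            want = rmem2;                       /* tcp_rmem[2] cap */
        return want > rcvbuf ? want : rcvbuf;   /* sk_rcvbuf never shrinks here */
    }

    int main(void)
    {
        /* Assumed: 4MB delivered last RTT, no ooo data, 32MB cap,
         * current sk_rcvbuf of 1MB, scaling_ratio 128.
         */
        unsigned long rcvbuf = grow_rcvbuf(4UL << 20, 0, 128, 32UL << 20, 1UL << 20);

        printf("new sk_rcvbuf = %lu MB\n", rcvbuf >> 20);
        return 0;
    }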
/*
* This function should be called every time data is copied to user space.
* It calculates the appropriate TCP receive buffer space.
@@ -754,8 +777,7 @@ static inline void tcp_rcv_rtt_measure_ts(struct sock *sk,
void tcp_rcv_space_adjust(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
u32 copied;
int time;
int time, inq, copied;
trace_tcp_rcv_space_adjust(sk);
@@ -766,45 +788,18 @@ void tcp_rcv_space_adjust(struct sock *sk)
/* Number of bytes copied to user in last RTT */
copied = tp->copied_seq - tp->rcvq_space.seq;
/* Number of bytes in receive queue. */
inq = tp->rcv_nxt - tp->copied_seq;
copied -= inq;
if (copied <= tp->rcvq_space.space)
goto new_measure;
/* A bit of theory :
* copied = bytes received in previous RTT, our base window
* To cope with packet losses, we need a 2x factor
* To cope with slow start, and sender growing its cwin by 100 %
* every RTT, we need a 4x factor, because the ACK we are sending
* now is for the next RTT, not the current one :
* <prev RTT . ><current RTT .. ><next RTT .... >
*/
trace_tcp_rcvbuf_grow(sk, time);
if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf) &&
!(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
u64 rcvwin, grow;
int rcvbuf;
/* minimal window to cope with packet losses, assuming
* steady state. Add some cushion because of small variations.
*/
rcvwin = ((u64)copied << 1) + 16 * tp->advmss;
/* Accommodate for sender rate increase (eg. slow start) */
grow = rcvwin * (copied - tp->rcvq_space.space);
do_div(grow, tp->rcvq_space.space);
rcvwin += (grow << 1);
rcvbuf = min_t(u64, tcp_space_from_win(sk, rcvwin),
READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2]));
if (rcvbuf > sk->sk_rcvbuf) {
WRITE_ONCE(sk->sk_rcvbuf, rcvbuf);
/* Make the window clamp follow along. */
WRITE_ONCE(tp->window_clamp,
tcp_win_from_space(sk, rcvbuf));
}
}
tp->rcvq_space.space = copied;
tcp_rcvbuf_grow(sk);
new_measure:
tp->rcvq_space.seq = tp->copied_seq;
tp->rcvq_space.time = tp->tcp_mstamp;
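The rewritten tcp_rcv_space_adjust() therefore only credits bytes the application actually drained in the last RTT (received bytes minus what still sits in the receive queue), and delegates all buffer growth to tcp_rcvbuf_grow(). A small userspace model of that bookkeeping, ignoring the once-per-RTT time gate:

    #include <stdio.h>

    /* Model of the per-RTT accounting above: only bytes the application
     * drained (net of what still sits in the receive queue) can raise
     * rcvq_space.space.
     */
    struct rcvq_model {
        unsigned int copied_seq, rcv_nxt, space_seq;
        int space;
    };

    static void space_adjust(struct rcvq_model *m)
    {
        int copied = m->copied_seq - m->space_seq;
        int inq = m->rcv_nxt - m->copied_seq;

        copied -= inq;
        if (copied > m->space)
            m->space = copied;          /* then tcp_rcvbuf_grow() runs */
        m->space_seq = m->copied_seq;   /* new_measure */
    }

    int main(void)
    {
        struct rcvq_model m = { .copied_seq = 3U << 20, .rcv_nxt = 4U << 20,
                                .space_seq = 0, .space = 1 << 20 };

        space_adjust(&m);   /* 3MB copied, 1MB still queued -> 2MB */
        printf("rcvq_space.space = %d KB\n", m.space >> 10);
        return 0;
    }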
@@ -3226,7 +3221,7 @@ static bool tcp_ack_update_rtt(struct sock *sk, const int flag,
*/
if (seq_rtt_us < 0 && tp->rx_opt.saw_tstamp &&
tp->rx_opt.rcv_tsecr && flag & FLAG_ACKED)
seq_rtt_us = ca_rtt_us = tcp_rtt_tsopt_us(tp);
seq_rtt_us = ca_rtt_us = tcp_rtt_tsopt_us(tp, 1);
rs->rtt_us = ca_rtt_us; /* RTT of last (S)ACKed packet (or -1) */
if (seq_rtt_us < 0)
@@ -5173,6 +5168,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
skb_condense(skb);
skb_set_owner_r(skb, sk);
}
tcp_rcvbuf_grow(sk);
}
static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb,
@@ -6873,6 +6869,9 @@ tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
if (!tp->srtt_us)
tcp_synack_rtt_meas(sk, req);
if (tp->rx_opt.tstamp_ok)
tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
if (req) {
tcp_rcv_synrecv_state_fastopen(sk);
} else {
@@ -6898,9 +6897,6 @@ tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
tp->snd_wnd = ntohs(th->window) << tp->rx_opt.snd_wscale;
tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
if (tp->rx_opt.tstamp_ok)
tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
if (!inet_csk(sk)->icsk_ca_ops->cong_control)
tcp_update_pacing_rate(sk);
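These two hunks only move the timestamp correction of tp->advmss earlier in tcp_rcv_state_process(), presumably so that the buffer and window setup performed for this state already sees the reduced MSS; the correction itself is a fixed 12 bytes (TCPOLEN_TSTAMP_ALIGNED), for example:

    #include <stdio.h>

    #define TCPOLEN_TSTAMP_ALIGNED 12   /* timestamp option, padded to 12 bytes */

    int main(void)
    {
        int advmss = 1460;              /* assumed value: typical Ethernet MSS */

        /* With timestamps negotiated, every segment carries 12 option
         * bytes, so the payload the receiver should budget for is:
         */
        printf("effective mss = %d\n", advmss - TCPOLEN_TSTAMP_ALIGNED);
        return 0;
    }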


@@ -3495,8 +3495,8 @@ static int __net_init tcp_sk_init(struct net *net)
* which are too large can cause TCP streams to be bursty.
*/
net->ipv4.sysctl_tcp_tso_win_divisor = 3;
/* Default TSQ limit of 16 TSO segments */
net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
/* Default TSQ limit of 4 MB */
net->ipv4.sysctl_tcp_limit_output_bytes = 4 << 20;
/* rfc5961 challenge ack rate limiting, per net-ns, disabled by default. */
net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX;


@@ -2619,9 +2619,8 @@ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb,
limit = max_t(unsigned long,
2 * skb->truesize,
READ_ONCE(sk->sk_pacing_rate) >> READ_ONCE(sk->sk_pacing_shift));
if (sk->sk_pacing_status == SK_PACING_NONE)
limit = min_t(unsigned long, limit,
READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes));
limit = min_t(unsigned long, limit,
READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes));
limit <<= factor;
if (static_branch_unlikely(&tcp_tx_delay_enabled) &&
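Together with the tcp_sk_init() change above, this means the TSQ budget is now clamped by tcp_limit_output_bytes (4 MB by default) even for pacing-enabled sockets, instead of only when pacing is off. A rough userspace model with assumed numbers (a ~200Gbit pacing rate, the default sk_pacing_shift of 10, 64KB TSO skbs):

    #include <stdio.h>

    /* Rough model of the TSQ budget in tcp_small_queue_check() after this
     * series: the pacing-derived allowance is always clamped by
     * tcp_limit_output_bytes.
     */
    static unsigned long long tsq_limit(unsigned long long truesize,
                                        unsigned long long pacing_rate,
                                        int pacing_shift,
                                        unsigned long long limit_output_bytes,
                                        int factor)
    {
        unsigned long long limit = pacing_rate >> pacing_shift;

        if (limit < 2 * truesize)
            limit = 2 * truesize;
        if (limit > limit_output_bytes)
            limit = limit_output_bytes;     /* now applied even when pacing */
        return limit << factor;
    }

    int main(void)
    {
        /* ~200Gbit pacing rate in bytes/sec, default sk_pacing_shift=10,
         * 64KB TSO skbs, factor 0.
         */
        printf("budget = %llu KB\n",
               tsq_limit(65536, 25ULL << 30, 10, 4ULL << 20, 0) >> 10);
        return 0;
    }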