Merge branch 'tcp-receive-side-improvements'

Eric Dumazet says:

====================
tcp: receive side improvements

We have set tcp_rmem[2] to 15 MB for about 8 years at Google,
but have had some issues with high-speed flows over very small RTTs.

TCP rx autotuning has a tendency to overestimate the RTT,
and therefore tp->rcvq_space.space and sk->sk_rcvbuf.

This makes TCP receive queues much bigger than necessary,
to the point that, on CPUs using DDIO, cache lines are evicted
before the application can copy the data.
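
The inflated sk->sk_rcvbuf is easy to observe from user space: for a
socket that never called setsockopt(SO_RCVBUF), getsockopt(SO_RCVBUF)
reports the kernel's current autotuned value. A minimal sketch, not part
of this series, where fd is assumed to be a connected TCP socket in the
middle of a bulk transfer:

#include <stdio.h>
#include <sys/socket.h>

/* Print the current receive buffer size of a connected TCP socket.
 * For sockets left to autotuning this tracks sk->sk_rcvbuf, so polling
 * it during a transfer shows how far autotuning has grown the buffer.
 */
void print_rcvbuf(int fd)
{
	int rcvbuf = 0;
	socklen_t len = sizeof(rcvbuf);

	if (getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &rcvbuf, &len) == 0)
		printf("sk_rcvbuf = %d bytes\n", rcvbuf);
	else
		perror("getsockopt(SO_RCVBUF)");
}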

This series aims to fix this.

- The first patch adds the tcp_rcvbuf_grow tracepoint, which was very
  convenient for studying the various issues fixed in this series
  (a usage sketch follows this list).

- Seven patches fix receiver autotuning issues.

- Two patches fix sender-side issues.

- The final patch increases tcp_rmem[2] so that TCP throughput over
  WAN paths can meet modern needs.
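
For reference, the new tracepoint can be consumed through tracefs like
any other TCP trace event. A rough C sketch, not part of the series; it
assumes tracefs is mounted at /sys/kernel/tracing and that the process
has the privileges to write there (perf or bpftrace can attach to the
same event):

#include <stdio.h>

#define TRACEFS "/sys/kernel/tracing"

/* Enable the tcp:tcp_rcvbuf_grow event and stream trace_pipe. */
int main(void)
{
	FILE *f;
	char line[1024];

	f = fopen(TRACEFS "/events/tcp/tcp_rcvbuf_grow/enable", "w");
	if (!f) {
		perror("enable tcp_rcvbuf_grow");
		return 1;
	}
	fputs("1\n", f);
	fclose(f);

	f = fopen(TRACEFS "/trace_pipe", "r");
	if (!f) {
		perror("trace_pipe");
		return 1;
	}
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);	/* one line per rcvbuf growth event */
	fclose(f);
	return 0;
}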

Tested on a 200 Gbit NIC; average maximum throughput of a single flow:

Before:
 73593 Mbit.

After:
 122514 Mbit.
====================

Link: https://patch.msgid.link/20250513193919.1089692-1-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
commit 2da35e4b4d
Jakub Kicinski <kuba@kernel.org>, 2025-05-15 11:30:11 -07:00
7 changed files with 134 additions and 66 deletions

Documentation/networking/ip-sysctl.rst

@@ -735,7 +735,7 @@ tcp_rmem - vector of 3 INTEGERs: min, default, max
 	net.core.rmem_max. Calling setsockopt() with SO_RCVBUF disables
 	automatic tuning of that socket's receive buffer size, in which
 	case this value is ignored.
-	Default: between 131072 and 6MB, depending on RAM size.
+	Default: between 131072 and 32MB, depending on RAM size.
 
 tcp_sack - BOOLEAN
 	Enable select acknowledgments (SACKS).

@@ -1099,7 +1099,7 @@ tcp_limit_output_bytes - INTEGER
 	limits the number of bytes on qdisc or device to reduce artificial
 	RTT/cwnd and reduce bufferbloat.
-	Default: 1048576 (16 * 65536)
+	Default: 4194304 (4 MB)
 
 tcp_challenge_ack_limit - INTEGER
 	Limits number of Challenge ACK sent per second, as recommended

include/linux/tcp.h

@@ -340,7 +340,7 @@ struct tcp_sock {
 	} rcv_rtt_est;
 /* Receiver queue space */
 	struct {
-		u32	space;
+		int	space;
 		u32	seq;
 		u64	time;
 	} rcvq_space;

include/trace/events/tcp.h

@@ -213,6 +213,79 @@ DEFINE_EVENT(tcp_event_sk, tcp_rcv_space_adjust,
 	TP_ARGS(sk)
 );
 
+TRACE_EVENT(tcp_rcvbuf_grow,
+
+	TP_PROTO(struct sock *sk, int time),
+
+	TP_ARGS(sk, time),
+
+	TP_STRUCT__entry(
+		__field(int, time)
+		__field(__u32, rtt_us)
+		__field(__u32, copied)
+		__field(__u32, inq)
+		__field(__u32, space)
+		__field(__u32, ooo_space)
+		__field(__u32, rcvbuf)
+		__field(__u8, scaling_ratio)
+		__field(__u16, sport)
+		__field(__u16, dport)
+		__field(__u16, family)
+		__array(__u8, saddr, 4)
+		__array(__u8, daddr, 4)
+		__array(__u8, saddr_v6, 16)
+		__array(__u8, daddr_v6, 16)
+		__field(const void *, skaddr)
+		__field(__u64, sock_cookie)
+	),
+
+	TP_fast_assign(
+		struct inet_sock *inet = inet_sk(sk);
+		struct tcp_sock *tp = tcp_sk(sk);
+		__be32 *p32;
+
+		__entry->time = time;
+		__entry->rtt_us = tp->rcv_rtt_est.rtt_us >> 3;
+		__entry->copied = tp->copied_seq - tp->rcvq_space.seq;
+		__entry->inq = tp->rcv_nxt - tp->copied_seq;
+		__entry->space = tp->rcvq_space.space;
+		__entry->ooo_space = RB_EMPTY_ROOT(&tp->out_of_order_queue) ? 0 :
+				     TCP_SKB_CB(tp->ooo_last_skb)->end_seq -
+				     tp->rcv_nxt;
+
+		__entry->rcvbuf = sk->sk_rcvbuf;
+		__entry->scaling_ratio = tp->scaling_ratio;
+		__entry->sport = ntohs(inet->inet_sport);
+		__entry->dport = ntohs(inet->inet_dport);
+		__entry->family = sk->sk_family;
+
+		p32 = (__be32 *) __entry->saddr;
+		*p32 = inet->inet_saddr;
+
+		p32 = (__be32 *) __entry->daddr;
+		*p32 = inet->inet_daddr;
+
+		TP_STORE_ADDRS(__entry, inet->inet_saddr, inet->inet_daddr,
+			       sk->sk_v6_rcv_saddr, sk->sk_v6_daddr);
+
+		__entry->skaddr = sk;
+		__entry->sock_cookie = sock_gen_cookie(sk);
+	),
+
+	TP_printk("time=%u rtt_us=%u copied=%u inq=%u space=%u ooo=%u scaling_ratio=%u rcvbuf=%u "
+		  "family=%s sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 "
+		  "saddrv6=%pI6c daddrv6=%pI6c skaddr=%p sock_cookie=%llx",
+		  __entry->time, __entry->rtt_us, __entry->copied,
+		  __entry->inq, __entry->space, __entry->ooo_space,
+		  __entry->scaling_ratio, __entry->rcvbuf,
+		  show_family_name(__entry->family),
+		  __entry->sport, __entry->dport,
+		  __entry->saddr, __entry->daddr,
+		  __entry->saddr_v6, __entry->daddr_v6,
+		  __entry->skaddr,
+		  __entry->sock_cookie)
+);
+
 TRACE_EVENT(tcp_retransmit_synack,
 
 	TP_PROTO(const struct sock *sk, const struct request_sock *req),

net/ipv4/tcp.c

@@ -5231,7 +5231,7 @@ void __init tcp_init(void)
 	/* Set per-socket limits to no more than 1/128 the pressure threshold */
 	limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7);
 	max_wshare = min(4UL*1024*1024, limit);
-	max_rshare = min(6UL*1024*1024, limit);
+	max_rshare = min(32UL*1024*1024, limit);
 
 	init_net.ipv4.sysctl_tcp_wmem[0] = PAGE_SIZE;
 	init_net.ipv4.sysctl_tcp_wmem[1] = 16*1024;

net/ipv4/tcp_input.c

@@ -664,10 +664,12 @@ EXPORT_IPV6_MOD(tcp_initialize_rcv_mss);
  */
 static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep)
 {
-	u32 new_sample = tp->rcv_rtt_est.rtt_us;
-	long m = sample;
+	u32 new_sample, old_sample = tp->rcv_rtt_est.rtt_us;
+	long m = sample << 3;
 
-	if (new_sample != 0) {
+	if (old_sample == 0 || m < old_sample) {
+		new_sample = m;
+	} else {
 		/* If we sample in larger samples in the non-timestamp
 		 * case, we could grossly overestimate the RTT especially
 		 * with chatty applications or bulk transfer apps which

@@ -678,17 +680,12 @@ static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep)
 		 * else with timestamps disabled convergence takes too
 		 * long.
 		 */
-		if (!win_dep) {
-			m -= (new_sample >> 3);
-			new_sample += m;
-		} else {
-			m <<= 3;
-			if (m < new_sample)
-				new_sample = m;
-		}
-	} else {
-		/* No previous measure. */
-		new_sample = m << 3;
+		if (win_dep)
+			return;
+		/* Do not use this sample if receive queue is not empty. */
+		if (tp->rcv_nxt != tp->copied_seq)
+			return;
+		new_sample = old_sample - (old_sample >> 3) + sample;
 	}
 
 	tp->rcv_rtt_est.rtt_us = new_sample;

@@ -712,7 +709,7 @@ static inline void tcp_rcv_rtt_measure(struct tcp_sock *tp)
 	tp->rcv_rtt_est.time = tp->tcp_mstamp;
 }
 
-static s32 tcp_rtt_tsopt_us(const struct tcp_sock *tp)
+static s32 tcp_rtt_tsopt_us(const struct tcp_sock *tp, u32 min_delta)
 {
 	u32 delta, delta_us;
 
@@ -722,7 +719,7 @@ static s32 tcp_rtt_tsopt_us(const struct tcp_sock *tp)
 	if (likely(delta < INT_MAX / (USEC_PER_SEC / TCP_TS_HZ))) {
 		if (!delta)
-			delta = 1;
+			delta = min_delta;
 		delta_us = delta * (USEC_PER_SEC / TCP_TS_HZ);
 		return delta_us;
 	}

@@ -740,13 +737,39 @@ static inline void tcp_rcv_rtt_measure_ts(struct sock *sk,
 	if (TCP_SKB_CB(skb)->end_seq -
 	    TCP_SKB_CB(skb)->seq >= inet_csk(sk)->icsk_ack.rcv_mss) {
-		s32 delta = tcp_rtt_tsopt_us(tp);
+		s32 delta = tcp_rtt_tsopt_us(tp, 0);
 
-		if (delta >= 0)
+		if (delta > 0)
 			tcp_rcv_rtt_update(tp, delta, 0);
 	}
 }
 
+static void tcp_rcvbuf_grow(struct sock *sk)
+{
+	const struct net *net = sock_net(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+	int rcvwin, rcvbuf, cap;
+
+	if (!READ_ONCE(net->ipv4.sysctl_tcp_moderate_rcvbuf) ||
+	    (sk->sk_userlocks & SOCK_RCVBUF_LOCK))
+		return;
+
+	/* slow start: allow the sender to double its rate. */
+	rcvwin = tp->rcvq_space.space << 1;
+
+	if (!RB_EMPTY_ROOT(&tp->out_of_order_queue))
+		rcvwin += TCP_SKB_CB(tp->ooo_last_skb)->end_seq - tp->rcv_nxt;
+
+	cap = READ_ONCE(net->ipv4.sysctl_tcp_rmem[2]);
+
+	rcvbuf = min_t(u32, tcp_space_from_win(sk, rcvwin), cap);
+	if (rcvbuf > sk->sk_rcvbuf) {
+		WRITE_ONCE(sk->sk_rcvbuf, rcvbuf);
+
+		/* Make the window clamp follow along. */
+		WRITE_ONCE(tp->window_clamp,
+			   tcp_win_from_space(sk, rcvbuf));
+	}
+}
+
 /*
  * This function should be called every time data is copied to user space.
  * It calculates the appropriate TCP receive buffer space.

@@ -754,8 +777,7 @@ static inline void tcp_rcv_rtt_measure_ts(struct sock *sk,
 void tcp_rcv_space_adjust(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
-	u32 copied;
-	int time;
+	int time, inq, copied;
 
 	trace_tcp_rcv_space_adjust(sk);
 
@@ -766,45 +788,18 @@ void tcp_rcv_space_adjust(struct sock *sk)
 	/* Number of bytes copied to user in last RTT */
 	copied = tp->copied_seq - tp->rcvq_space.seq;
+
+	/* Number of bytes in receive queue. */
+	inq = tp->rcv_nxt - tp->copied_seq;
+	copied -= inq;
 	if (copied <= tp->rcvq_space.space)
 		goto new_measure;
 
-	/* A bit of theory :
-	 * copied = bytes received in previous RTT, our base window
-	 * To cope with packet losses, we need a 2x factor
-	 * To cope with slow start, and sender growing its cwin by 100 %
-	 * every RTT, we need a 4x factor, because the ACK we are sending
-	 * now is for the next RTT, not the current one :
-	 * <prev RTT . ><current RTT .. ><next RTT .... >
-	 */
-
-	if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf) &&
-	    !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
-		u64 rcvwin, grow;
-		int rcvbuf;
-
-		/* minimal window to cope with packet losses, assuming
-		 * steady state. Add some cushion because of small variations.
-		 */
-		rcvwin = ((u64)copied << 1) + 16 * tp->advmss;
-
-		/* Accommodate for sender rate increase (eg. slow start) */
-		grow = rcvwin * (copied - tp->rcvq_space.space);
-		do_div(grow, tp->rcvq_space.space);
-		rcvwin += (grow << 1);
-
-		rcvbuf = min_t(u64, tcp_space_from_win(sk, rcvwin),
-			       READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2]));
-		if (rcvbuf > sk->sk_rcvbuf) {
-			WRITE_ONCE(sk->sk_rcvbuf, rcvbuf);
-
-			/* Make the window clamp follow along. */
-			WRITE_ONCE(tp->window_clamp,
-				   tcp_win_from_space(sk, rcvbuf));
-		}
-	}
+	trace_tcp_rcvbuf_grow(sk, time);
+
 	tp->rcvq_space.space = copied;
+
+	tcp_rcvbuf_grow(sk);
 
 new_measure:
 	tp->rcvq_space.seq = tp->copied_seq;
 	tp->rcvq_space.time = tp->tcp_mstamp;

@@ -3226,7 +3221,7 @@ static bool tcp_ack_update_rtt(struct sock *sk, const int flag,
 	 */
 	if (seq_rtt_us < 0 && tp->rx_opt.saw_tstamp &&
 	    tp->rx_opt.rcv_tsecr && flag & FLAG_ACKED)
-		seq_rtt_us = ca_rtt_us = tcp_rtt_tsopt_us(tp);
+		seq_rtt_us = ca_rtt_us = tcp_rtt_tsopt_us(tp, 1);
 
 	rs->rtt_us = ca_rtt_us; /* RTT of last (S)ACKed packet (or -1) */
 	if (seq_rtt_us < 0)

@@ -5173,6 +5168,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
 		skb_condense(skb);
 		skb_set_owner_r(skb, sk);
 	}
+	tcp_rcvbuf_grow(sk);
 }
 
@@ -6873,6 +6869,9 @@ tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
 		if (!tp->srtt_us)
 			tcp_synack_rtt_meas(sk, req);
 
+		if (tp->rx_opt.tstamp_ok)
+			tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
+
 		if (req) {
 			tcp_rcv_synrecv_state_fastopen(sk);
 		} else {

@@ -6898,9 +6897,6 @@ tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
 		tp->snd_wnd = ntohs(th->window) << tp->rx_opt.snd_wscale;
 		tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
 
-		if (tp->rx_opt.tstamp_ok)
-			tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
-
 		if (!inet_csk(sk)->icsk_ca_ops->cong_control)
 			tcp_update_pacing_rate(sk);

net/ipv4/tcp_ipv4.c

@@ -3495,8 +3495,8 @@ static int __net_init tcp_sk_init(struct net *net)
 	 * which are too large can cause TCP streams to be bursty.
 	 */
 	net->ipv4.sysctl_tcp_tso_win_divisor = 3;
-	/* Default TSQ limit of 16 TSO segments */
-	net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
+	/* Default TSQ limit of 4 MB */
+	net->ipv4.sysctl_tcp_limit_output_bytes = 4 << 20;
 
 	/* rfc5961 challenge ack rate limiting, per net-ns, disabled by default. */
 	net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX;

net/ipv4/tcp_output.c

@@ -2619,9 +2619,8 @@ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb,
 	limit = max_t(unsigned long,
 		      2 * skb->truesize,
 		      READ_ONCE(sk->sk_pacing_rate) >> READ_ONCE(sk->sk_pacing_shift));
-	if (sk->sk_pacing_status == SK_PACING_NONE)
-		limit = min_t(unsigned long, limit,
-			      READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes));
+	limit = min_t(unsigned long, limit,
+		      READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes));
 	limit <<= factor;
 
 	if (static_branch_unlikely(&tcp_tx_delay_enabled) &&