mirror of https://github.com/torvalds/linux.git
Merge branch 'tcp-receive-side-improvements'
Eric Dumazet says:

====================
tcp: receive side improvements

We have set tcp_rmem[2] to 15 MB for about 8 years at Google, but had
some issues for high speed flows on very small RTT.

TCP rx autotuning has a tendency to overestimate the RTT, thus
tp->rcvq_space.space and sk->sk_rcvbuf. This makes TCP receive queues
much bigger than necessary, to a point cpu caches are evicted before
application can copy the data, on cpus using DDIO.

This series aims to fix this.

- First patch adds tcp_rcvbuf_grow() tracepoint, which was very
  convenient to study the various issues fixed in this series.

- Seven patches fix receiver autotune issues.

- Two patches fix sender side issues.

- Final patch increases tcp_rmem[2] so that TCP speed over WAN can
  meet modern needs.

Tested on a 200Gbit NIC, average max throughput of a single flow:

Before: 73593 Mbit.
After: 122514 Mbit.
====================

Link: https://patch.msgid.link/20250513193919.1089692-1-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
commit 2da35e4b4d
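To put rough numbers on the problem described above (illustrative arithmetic, not figures from the series): a 200 Gbit/s flow over a 100 usec path only needs a bandwidth-delay product of about 200e9 / 8 * 100e-6 = 2.5 MB of receive window, but if autotuning believes the RTT is 1 ms the same sizing logic asks for ~25 MB, far more than the slice of last-level cache DDIO can deposit packets into, so the data is already cold by the time the application copies it.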
@@ -735,7 +735,7 @@ tcp_rmem - vector of 3 INTEGERs: min, default, max
 	net.core.rmem_max. Calling setsockopt() with SO_RCVBUF disables
 	automatic tuning of that socket's receive buffer size, in which
 	case this value is ignored.
-	Default: between 131072 and 6MB, depending on RAM size.
+	Default: between 131072 and 32MB, depending on RAM size.

 tcp_sack - BOOLEAN
 	Enable select acknowledgments (SACKS).

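The setsockopt() escape hatch mentioned above is worth a concrete sketch, since the receive autotuning added later in this series checks the same lock bit (sk_userlocks & SOCK_RCVBUF_LOCK) before touching sk_rcvbuf. Minimal userspace example (standard sockets API, not part of this patch):

#include <sys/socket.h>

/* Pin the receive buffer to ~1 MB and opt this socket out of autotuning.
 * SO_RCVBUF sets SOCK_RCVBUF_LOCK, so tcp_rmem[2] no longer applies;
 * Linux stores roughly twice the requested value (see socket(7)).
 */
static int pin_rcvbuf(int fd)
{
	int one_mb = 1 << 20;

	return setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &one_mb, sizeof(one_mb));
}
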
@@ -1099,7 +1099,7 @@ tcp_limit_output_bytes - INTEGER
 	limits the number of bytes on qdisc or device to reduce artificial
 	RTT/cwnd and reduce bufferbloat.

-	Default: 1048576 (16 * 65536)
+	Default: 4194304 (4 MB)

 tcp_challenge_ack_limit - INTEGER
 	Limits number of Challenge ACK sent per second, as recommended

@@ -340,7 +340,7 @@ struct tcp_sock {
 	} rcv_rtt_est;
 /* Receiver queue space */
 	struct {
-		u32	space;
+		int	space;
 		u32	seq;
 		u64	time;
 	} rcvq_space;

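Why the field goes from u32 to int: further down in this diff, tcp_rcv_space_adjust() starts computing copied -= inq, which can go negative, and that signed value is then compared against rcvq_space.space. That is my reading of the change; the hunk itself does not spell it out. A standalone illustration (not kernel code) of the comparison pitfall an unsigned field would reintroduce:

#include <stdio.h>

int main(void)
{
	unsigned int space_unsigned = 100000;
	int space_signed = 100000;
	int copied = -3000;	/* bytes copied to user minus bytes still queued */

	/* -3000 is promoted to a huge unsigned value, so this prints 0 */
	printf("copied <= unsigned space: %d\n", copied <= space_unsigned);
	/* signed comparison behaves as intended and prints 1 */
	printf("copied <= signed space:   %d\n", copied <= space_signed);
	return 0;
}
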
@@ -213,6 +213,79 @@ DEFINE_EVENT(tcp_event_sk, tcp_rcv_space_adjust,
 	TP_ARGS(sk)
 );

+TRACE_EVENT(tcp_rcvbuf_grow,
+
+	TP_PROTO(struct sock *sk, int time),
+
+	TP_ARGS(sk, time),
+
+	TP_STRUCT__entry(
+		__field(int, time)
+		__field(__u32, rtt_us)
+		__field(__u32, copied)
+		__field(__u32, inq)
+		__field(__u32, space)
+		__field(__u32, ooo_space)
+		__field(__u32, rcvbuf)
+		__field(__u8, scaling_ratio)
+		__field(__u16, sport)
+		__field(__u16, dport)
+		__field(__u16, family)
+		__array(__u8, saddr, 4)
+		__array(__u8, daddr, 4)
+		__array(__u8, saddr_v6, 16)
+		__array(__u8, daddr_v6, 16)
+		__field(const void *, skaddr)
+		__field(__u64, sock_cookie)
+	),
+
+	TP_fast_assign(
+		struct inet_sock *inet = inet_sk(sk);
+		struct tcp_sock *tp = tcp_sk(sk);
+		__be32 *p32;
+
+		__entry->time = time;
+		__entry->rtt_us = tp->rcv_rtt_est.rtt_us >> 3;
+		__entry->copied = tp->copied_seq - tp->rcvq_space.seq;
+		__entry->inq = tp->rcv_nxt - tp->copied_seq;
+		__entry->space = tp->rcvq_space.space;
+		__entry->ooo_space = RB_EMPTY_ROOT(&tp->out_of_order_queue) ? 0 :
+				     TCP_SKB_CB(tp->ooo_last_skb)->end_seq -
+				     tp->rcv_nxt;
+
+		__entry->rcvbuf = sk->sk_rcvbuf;
+		__entry->scaling_ratio = tp->scaling_ratio;
+		__entry->sport = ntohs(inet->inet_sport);
+		__entry->dport = ntohs(inet->inet_dport);
+		__entry->family = sk->sk_family;
+
+		p32 = (__be32 *) __entry->saddr;
+		*p32 = inet->inet_saddr;
+
+		p32 = (__be32 *) __entry->daddr;
+		*p32 = inet->inet_daddr;
+
+		TP_STORE_ADDRS(__entry, inet->inet_saddr, inet->inet_daddr,
+			       sk->sk_v6_rcv_saddr, sk->sk_v6_daddr);
+
+		__entry->skaddr = sk;
+		__entry->sock_cookie = sock_gen_cookie(sk);
+	),
+
+	TP_printk("time=%u rtt_us=%u copied=%u inq=%u space=%u ooo=%u scaling_ratio=%u rcvbuf=%u "
+		  "family=%s sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 "
+		  "saddrv6=%pI6c daddrv6=%pI6c skaddr=%p sock_cookie=%llx",
+		  __entry->time, __entry->rtt_us, __entry->copied,
+		  __entry->inq, __entry->space, __entry->ooo_space,
+		  __entry->scaling_ratio, __entry->rcvbuf,
+		  show_family_name(__entry->family),
+		  __entry->sport, __entry->dport,
+		  __entry->saddr, __entry->daddr,
+		  __entry->saddr_v6, __entry->daddr_v6,
+		  __entry->skaddr,
+		  __entry->sock_cookie)
+);
+
 TRACE_EVENT(tcp_retransmit_synack,

 	TP_PROTO(const struct sock *sk, const struct request_sock *req),

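Usage note (standard tracefs conventions, nothing this patch adds): once built in, the event should appear alongside the existing TCP tracepoints, typically under /sys/kernel/tracing/events/tcp/tcp_rcvbuf_grow/, so the autotuner's decisions can be watched live with perf, bpftrace, or a plain read of trace_pipe. The time argument is the elapsed length of the measurement round in usec as computed by the caller in tcp_rcv_space_adjust(); rtt_us, copied, inq, space, rcvbuf and scaling_ratio capture the state the autotuner is acting on at that moment.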
@@ -5231,7 +5231,7 @@ void __init tcp_init(void)
 	/* Set per-socket limits to no more than 1/128 the pressure threshold */
 	limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7);
 	max_wshare = min(4UL*1024*1024, limit);
-	max_rshare = min(6UL*1024*1024, limit);
+	max_rshare = min(32UL*1024*1024, limit);

 	init_net.ipv4.sysctl_tcp_wmem[0] = PAGE_SIZE;
 	init_net.ipv4.sysctl_tcp_wmem[1] = 16*1024;

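Worked out, this cap lines up with the documentation hunk above: limit is nr_free_buffer_pages() << (PAGE_SHIFT - 7), i.e. roughly 1/128 of the pages available for buffers, and 32 MB * 128 = 4 GB, so machines with about 4 GB or more of such memory get the full 32 MB tcp_rmem[2] default while smaller machines keep a proportionally smaller one (rough arithmetic; nr_free_buffer_pages() is not exactly total RAM).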
@@ -664,10 +664,12 @@ EXPORT_IPV6_MOD(tcp_initialize_rcv_mss);
  */
 static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep)
 {
-	u32 new_sample = tp->rcv_rtt_est.rtt_us;
-	long m = sample;
+	u32 new_sample, old_sample = tp->rcv_rtt_est.rtt_us;
+	long m = sample << 3;

-	if (new_sample != 0) {
+	if (old_sample == 0 || m < old_sample) {
+		new_sample = m;
+	} else {
 		/* If we sample in larger samples in the non-timestamp
 		 * case, we could grossly overestimate the RTT especially
 		 * with chatty applications or bulk transfer apps which

@@ -678,17 +680,12 @@ static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep)
 		 * else with timestamps disabled convergence takes too
 		 * long.
 		 */
-		if (!win_dep) {
-			m -= (new_sample >> 3);
-			new_sample += m;
-		} else {
-			m <<= 3;
-			if (m < new_sample)
-				new_sample = m;
-		}
-	} else {
-		/* No previous measure. */
-		new_sample = m << 3;
+		if (win_dep)
+			return;
+		/* Do not use this sample if receive queue is not empty. */
+		if (tp->rcv_nxt != tp->copied_seq)
+			return;
+		new_sample = old_sample - (old_sample >> 3) + sample;
 	}

 	tp->rcv_rtt_est.rtt_us = new_sample;

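Read together, these two hunks make the receiver RTT estimator "always take a smaller sample immediately, and only smooth timestamp-based samples upwards when the receive queue is empty". A small userspace paraphrase of that logic (my reading of the diff; names invented, the stored value kept scaled by 8 as in the kernel):

#include <stdint.h>
#include <stdio.h>

static uint32_t rtt_scaled;	/* like tp->rcv_rtt_est.rtt_us, 0 = no sample yet */

static void rcv_rtt_update(uint32_t sample_us, int win_dep, int rcvq_empty)
{
	uint32_t old = rtt_scaled;
	long m = (long)sample_us << 3;

	if (old == 0 || m < old) {
		rtt_scaled = m;			/* always seek the minimum */
	} else if (!win_dep && rcvq_empty) {
		/* timestamp samples: EWMA with 1/8 weight on the new sample */
		rtt_scaled = old - (old >> 3) + sample_us;
	}
	/* larger samples are otherwise ignored (win_dep, or queue not empty) */
}

int main(void)
{
	rcv_rtt_update(1000, 1, 1);	/* first sample -> 1000 usec */
	rcv_rtt_update(4000, 1, 1);	/* larger win_dep sample -> ignored */
	rcv_rtt_update(500, 1, 1);	/* smaller -> 500 usec immediately */
	rcv_rtt_update(900, 0, 1);	/* EWMA: 4000 - 500 + 900 = 4400 -> 550 usec */
	printf("estimated rtt = %u usec\n", rtt_scaled >> 3);
	return 0;
}

Taking a smaller sample immediately lets an inflated rcv_rtt_est.rtt_us drop back in a single measurement instead of decaying slowly through the EWMA, which is the overestimation failure mode the cover letter describes.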
@@ -712,7 +709,7 @@ static inline void tcp_rcv_rtt_measure(struct tcp_sock *tp)
 	tp->rcv_rtt_est.time = tp->tcp_mstamp;
 }

-static s32 tcp_rtt_tsopt_us(const struct tcp_sock *tp)
+static s32 tcp_rtt_tsopt_us(const struct tcp_sock *tp, u32 min_delta)
 {
 	u32 delta, delta_us;

@@ -722,7 +719,7 @@ static s32 tcp_rtt_tsopt_us(const struct tcp_sock *tp)

 	if (likely(delta < INT_MAX / (USEC_PER_SEC / TCP_TS_HZ))) {
 		if (!delta)
-			delta = 1;
+			delta = min_delta;
 		delta_us = delta * (USEC_PER_SEC / TCP_TS_HZ);
 		return delta_us;
 	}

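The new min_delta parameter is what lets the two callers of tcp_rtt_tsopt_us() diverge: the receive-side caller in tcp_rcv_rtt_measure_ts() (next hunk) passes 0 and then requires delta > 0, so a timestamp echo arriving within the same timestamp tick no longer yields an RTT sample at all, while the ACK RTT path in tcp_ack_update_rtt() (further down) passes 1 and keeps the old round-up-to-one-tick behaviour. With TCP_TS_HZ at 1000, one tick is 1000 usec, so the old rounding turned every sub-millisecond receive-side RTT sample into a full millisecond — exactly the overestimation the cover letter complains about.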
@@ -740,13 +737,39 @@ static inline void tcp_rcv_rtt_measure_ts(struct sock *sk,

 	if (TCP_SKB_CB(skb)->end_seq -
 	    TCP_SKB_CB(skb)->seq >= inet_csk(sk)->icsk_ack.rcv_mss) {
-		s32 delta = tcp_rtt_tsopt_us(tp);
+		s32 delta = tcp_rtt_tsopt_us(tp, 0);

-		if (delta >= 0)
+		if (delta > 0)
 			tcp_rcv_rtt_update(tp, delta, 0);
 	}
 }

+static void tcp_rcvbuf_grow(struct sock *sk)
+{
+	const struct net *net = sock_net(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+	int rcvwin, rcvbuf, cap;
+
+	if (!READ_ONCE(net->ipv4.sysctl_tcp_moderate_rcvbuf) ||
+	    (sk->sk_userlocks & SOCK_RCVBUF_LOCK))
+		return;
+
+	/* slow start: allow the sender to double its rate. */
+	rcvwin = tp->rcvq_space.space << 1;
+
+	if (!RB_EMPTY_ROOT(&tp->out_of_order_queue))
+		rcvwin += TCP_SKB_CB(tp->ooo_last_skb)->end_seq - tp->rcv_nxt;
+
+	cap = READ_ONCE(net->ipv4.sysctl_tcp_rmem[2]);
+
+	rcvbuf = min_t(u32, tcp_space_from_win(sk, rcvwin), cap);
+	if (rcvbuf > sk->sk_rcvbuf) {
+		WRITE_ONCE(sk->sk_rcvbuf, rcvbuf);
+		/* Make the window clamp follow along. */
+		WRITE_ONCE(tp->window_clamp,
+			   tcp_win_from_space(sk, rcvbuf));
+	}
+}
 /*
  * This function should be called every time data is copied to user space.
  * It calculates the appropriate TCP receive buffer space.

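Compared with the sizing block it replaces in tcp_rcv_space_adjust() (next hunks), tcp_rcvbuf_grow() keeps only the "let the sender double its rate" term, adds whatever currently sits in the out-of-order queue, and converts the window to a buffer size via tcp_space_from_win(), which accounts for the measured scaling_ratio (payload versus skb truesize). It is also called from tcp_data_queue_ofo() later in this diff, so the buffer can keep growing while reordering holds data in the ooo queue. Rough example with made-up numbers: if the application drained 1 MB in the last round and payload is about half of truesize, rcvwin is 2 MB and sk_rcvbuf grows to roughly 4 MB, still capped by tcp_rmem[2].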
@@ -754,8 +777,7 @@ static inline void tcp_rcv_rtt_measure_ts(struct sock *sk,
 void tcp_rcv_space_adjust(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
-	u32 copied;
-	int time;
+	int time, inq, copied;

 	trace_tcp_rcv_space_adjust(sk);

@@ -766,45 +788,18 @@ void tcp_rcv_space_adjust(struct sock *sk)

 	/* Number of bytes copied to user in last RTT */
 	copied = tp->copied_seq - tp->rcvq_space.seq;
+	/* Number of bytes in receive queue. */
+	inq = tp->rcv_nxt - tp->copied_seq;
+	copied -= inq;
 	if (copied <= tp->rcvq_space.space)
 		goto new_measure;

-	/* A bit of theory :
-	 * copied = bytes received in previous RTT, our base window
-	 * To cope with packet losses, we need a 2x factor
-	 * To cope with slow start, and sender growing its cwin by 100 %
-	 * every RTT, we need a 4x factor, because the ACK we are sending
-	 * now is for the next RTT, not the current one :
-	 * <prev RTT . ><current RTT .. ><next RTT .... >
-	 */
+	trace_tcp_rcvbuf_grow(sk, time);

-	if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf) &&
-	    !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
-		u64 rcvwin, grow;
-		int rcvbuf;
-
-		/* minimal window to cope with packet losses, assuming
-		 * steady state. Add some cushion because of small variations.
-		 */
-		rcvwin = ((u64)copied << 1) + 16 * tp->advmss;
-
-		/* Accommodate for sender rate increase (eg. slow start) */
-		grow = rcvwin * (copied - tp->rcvq_space.space);
-		do_div(grow, tp->rcvq_space.space);
-		rcvwin += (grow << 1);
-
-		rcvbuf = min_t(u64, tcp_space_from_win(sk, rcvwin),
-			       READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2]));
-		if (rcvbuf > sk->sk_rcvbuf) {
-			WRITE_ONCE(sk->sk_rcvbuf, rcvbuf);
-
-			/* Make the window clamp follow along. */
-			WRITE_ONCE(tp->window_clamp,
-				   tcp_win_from_space(sk, rcvbuf));
-		}
-	}
 	tp->rcvq_space.space = copied;

+	tcp_rcvbuf_grow(sk);
+
 new_measure:
 	tp->rcvq_space.seq = tp->copied_seq;
 	tp->rcvq_space.time = tp->tcp_mstamp;

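Two things change in tcp_rcv_space_adjust(): copied now excludes bytes still sitting unread in the receive queue, so rcvq_space.space tracks what the application actually drained per round (if 3 MB arrived in the last RTT but 1 MB is still queued, copied is 2 MB), and the buffer-sizing policy itself moves into tcp_rcvbuf_grow() above, with the new tracepoint firing on every round that makes it past the early exit.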
@@ -3226,7 +3221,7 @@ static bool tcp_ack_update_rtt(struct sock *sk, const int flag,
 	 */
 	if (seq_rtt_us < 0 && tp->rx_opt.saw_tstamp &&
 	    tp->rx_opt.rcv_tsecr && flag & FLAG_ACKED)
-		seq_rtt_us = ca_rtt_us = tcp_rtt_tsopt_us(tp);
+		seq_rtt_us = ca_rtt_us = tcp_rtt_tsopt_us(tp, 1);

 	rs->rtt_us = ca_rtt_us; /* RTT of last (S)ACKed packet (or -1) */
 	if (seq_rtt_us < 0)

@@ -5173,6 +5168,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
 		skb_condense(skb);
 		skb_set_owner_r(skb, sk);
 	}
+	tcp_rcvbuf_grow(sk);
 }

 static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb,

@@ -6873,6 +6869,9 @@ tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
 		if (!tp->srtt_us)
 			tcp_synack_rtt_meas(sk, req);

+		if (tp->rx_opt.tstamp_ok)
+			tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
+
 		if (req) {
 			tcp_rcv_synrecv_state_fastopen(sk);
 		} else {

@@ -6898,9 +6897,6 @@ tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
 		tp->snd_wnd = ntohs(th->window) << tp->rx_opt.snd_wscale;
 		tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);

-		if (tp->rx_opt.tstamp_ok)
-			tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
-
 		if (!inet_csk(sk)->icsk_ca_ops->cong_control)
 			tcp_update_pacing_rate(sk);

@@ -3495,8 +3495,8 @@ static int __net_init tcp_sk_init(struct net *net)
 	 * which are too large can cause TCP streams to be bursty.
 	 */
 	net->ipv4.sysctl_tcp_tso_win_divisor = 3;
-	/* Default TSQ limit of 16 TSO segments */
-	net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
+	/* Default TSQ limit of 4 MB */
+	net->ipv4.sysctl_tcp_limit_output_bytes = 4 << 20;

 	/* rfc5961 challenge ack rate limiting, per net-ns, disabled by default. */
 	net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX;

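Spelled out in bytes, the old default was 16 * 65536 = 1048576, i.e. 1 MB (sixteen 64 KB TSO packets); the new one is 4 << 20 = 4194304 bytes, the same pair of numbers the tcp_limit_output_bytes documentation hunk above switches between.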
@@ -2619,9 +2619,8 @@ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb,
 	limit = max_t(unsigned long,
 		      2 * skb->truesize,
 		      READ_ONCE(sk->sk_pacing_rate) >> READ_ONCE(sk->sk_pacing_shift));
-	if (sk->sk_pacing_status == SK_PACING_NONE)
-		limit = min_t(unsigned long, limit,
-			      READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes));
+	limit = min_t(unsigned long, limit,
+		      READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes));
 	limit <<= factor;

 	if (static_branch_unlikely(&tcp_tx_delay_enabled) &&

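The sender-side behavioural change here: the tcp_limit_output_bytes clamp used to apply only when the flow was not being paced (SK_PACING_NONE); it now applies unconditionally, so even a paced flow can keep at most the (newly raised) 4 MB per socket queued on the qdisc and device. Rough arithmetic, assuming the default sk_pacing_shift of 10: a flow paced at 200 Gbit/s (25e9 bytes/s) would otherwise be allowed about 25e9 / 1024 ≈ 24 MB below TCP, which this clamp now cuts to 4 MB before the factor shift.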