From b46ab63181ff973ddce44ebc9ac24b269d42f481 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 14 Oct 2025 23:54:56 +0000 Subject: [PATCH] net: Introduce net.core.bypass_prot_mem sysctl. If a socket has sk->sk_bypass_prot_mem flagged, the socket opts out of the global protocol memory accounting. Let's control the flag by a new sysctl knob. The flag is written once during socket(2) and is inherited to child sockets. Tested with a script that creates local socket pairs and send()s a bunch of data without recv()ing. Setup: # mkdir /sys/fs/cgroup/test # echo $$ >> /sys/fs/cgroup/test/cgroup.procs # sysctl -q net.ipv4.tcp_mem="1000 1000 1000" # ulimit -n 524288 Without net.core.bypass_prot_mem, charged to tcp_mem & memcg # python3 pressure.py & # cat /sys/fs/cgroup/test/memory.stat | grep sock sock 22642688 <-------------------------------------- charged to memcg # cat /proc/net/sockstat| grep TCP TCP: inuse 2006 orphan 0 tw 0 alloc 2008 mem 5376 <-- charged to tcp_mem # ss -tn | head -n 5 State Recv-Q Send-Q Local Address:Port Peer Address:Port ESTAB 2000 0 127.0.0.1:34479 127.0.0.1:53188 ESTAB 2000 0 127.0.0.1:34479 127.0.0.1:49972 ESTAB 2000 0 127.0.0.1:34479 127.0.0.1:53868 ESTAB 2000 0 127.0.0.1:34479 127.0.0.1:53554 # nstat | grep Pressure || echo no pressure TcpExtTCPMemoryPressures 1 0.0 With net.core.bypass_prot_mem=1, charged to memcg only: # sysctl -q net.core.bypass_prot_mem=1 # python3 pressure.py & # cat /sys/fs/cgroup/test/memory.stat | grep sock sock 2757468160 <------------------------------------ charged to memcg # cat /proc/net/sockstat | grep TCP TCP: inuse 2006 orphan 0 tw 0 alloc 2008 mem 0 <- NOT charged to tcp_mem # ss -tn | head -n 5 State Recv-Q Send-Q Local Address:Port Peer Address:Port ESTAB 111000 0 127.0.0.1:36019 127.0.0.1:49026 ESTAB 110000 0 127.0.0.1:36019 127.0.0.1:45630 ESTAB 110000 0 127.0.0.1:36019 127.0.0.1:44870 ESTAB 111000 0 127.0.0.1:36019 127.0.0.1:45274 # nstat | grep Pressure || echo no pressure no pressure Signed-off-by: Kuniyuki Iwashima Signed-off-by: Martin KaFai Lau Reviewed-by: Shakeel Butt Reviewed-by: Eric Dumazet Acked-by: Roman Gushchin Link: https://patch.msgid.link/20251014235604.3057003-4-kuniyu@google.com --- Documentation/admin-guide/sysctl/net.rst | 8 ++++++++ include/net/netns/core.h | 1 + net/core/sock.c | 5 +++++ net/core/sysctl_net_core.c | 9 +++++++++ 4 files changed, 23 insertions(+) diff --git a/Documentation/admin-guide/sysctl/net.rst b/Documentation/admin-guide/sysctl/net.rst index 40749b3cd356..991773dcb9cf 100644 --- a/Documentation/admin-guide/sysctl/net.rst +++ b/Documentation/admin-guide/sysctl/net.rst @@ -212,6 +212,14 @@ mem_pcpu_rsv Per-cpu reserved forward alloc cache size in page units. Default 1MB per CPU. +bypass_prot_mem +--------------- + +Skip charging socket buffers to the global per-protocol memory +accounting controlled by net.ipv4.tcp_mem, net.ipv4.udp_mem, etc. + +Default: 0 (off) + rmem_default ------------ diff --git a/include/net/netns/core.h b/include/net/netns/core.h index cb9c3e4cd738..9ef3d70e5e9c 100644 --- a/include/net/netns/core.h +++ b/include/net/netns/core.h @@ -17,6 +17,7 @@ struct netns_core { int sysctl_optmem_max; u8 sysctl_txrehash; u8 sysctl_tstamp_allow_data; + u8 sysctl_bypass_prot_mem; #ifdef CONFIG_PROC_FS struct prot_inuse __percpu *prot_inuse; diff --git a/net/core/sock.c b/net/core/sock.c index 5bf208579c02..b78533fb9268 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2306,8 +2306,13 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority, * why we need sk_prot_creator -acme */ sk->sk_prot = sk->sk_prot_creator = prot; + + if (READ_ONCE(net->core.sysctl_bypass_prot_mem)) + sk->sk_bypass_prot_mem = 1; + sk->sk_kern_sock = kern; sock_lock_init(sk); + sk->sk_net_refcnt = kern ? 0 : 1; if (likely(sk->sk_net_refcnt)) { get_net_track(net, &sk->ns_tracker, priority); diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index f79137826d7f..8d4decb2606f 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -683,6 +683,15 @@ static struct ctl_table netns_core_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE }, + { + .procname = "bypass_prot_mem", + .data = &init_net.core.sysctl_bypass_prot_mem, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE + }, /* sysctl_core_net_init() will set the values after this * to readonly in network namespaces */