mirror of https://github.com/torvalds/linux.git
net: Introduce net.core.bypass_prot_mem sysctl.
If a socket has sk->sk_bypass_prot_mem flagged, the socket opts out
of the global protocol memory accounting.

Let's control the flag by a new sysctl knob.

The flag is written once during socket(2) and is inherited by child
sockets.

Tested with a script that creates local socket pairs and send()s a
bunch of data without recv()ing.

Setup:

  # mkdir /sys/fs/cgroup/test
  # echo $$ >> /sys/fs/cgroup/test/cgroup.procs
  # sysctl -q net.ipv4.tcp_mem="1000 1000 1000"
  # ulimit -n 524288

Without net.core.bypass_prot_mem, charged to tcp_mem & memcg

  # python3 pressure.py &
  # cat /sys/fs/cgroup/test/memory.stat | grep sock
  sock 22642688 <-------------------------------------- charged to memcg
  # cat /proc/net/sockstat| grep TCP
  TCP: inuse 2006 orphan 0 tw 0 alloc 2008 mem 5376 <-- charged to tcp_mem
  # ss -tn | head -n 5
  State Recv-Q Send-Q Local Address:Port  Peer Address:Port
  ESTAB 2000   0      127.0.0.1:34479     127.0.0.1:53188
  ESTAB 2000   0      127.0.0.1:34479     127.0.0.1:49972
  ESTAB 2000   0      127.0.0.1:34479     127.0.0.1:53868
  ESTAB 2000   0      127.0.0.1:34479     127.0.0.1:53554
  # nstat | grep Pressure || echo no pressure
  TcpExtTCPMemoryPressures 1 0.0

With net.core.bypass_prot_mem=1, charged to memcg only:

  # sysctl -q net.core.bypass_prot_mem=1
  # python3 pressure.py &
  # cat /sys/fs/cgroup/test/memory.stat | grep sock
  sock 2757468160 <------------------------------------ charged to memcg
  # cat /proc/net/sockstat | grep TCP
  TCP: inuse 2006 orphan 0 tw 0 alloc 2008 mem 0 <- NOT charged to tcp_mem
  # ss -tn | head -n 5
  State Recv-Q Send-Q Local Address:Port  Peer Address:Port
  ESTAB 111000 0      127.0.0.1:36019     127.0.0.1:49026
  ESTAB 110000 0      127.0.0.1:36019     127.0.0.1:45630
  ESTAB 110000 0      127.0.0.1:36019     127.0.0.1:44870
  ESTAB 111000 0      127.0.0.1:36019     127.0.0.1:45274
  # nstat | grep Pressure || echo no pressure
  no pressure

Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Reviewed-by: Shakeel Butt <shakeel.butt@linux.dev>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Acked-by: Roman Gushchin <roman.gushchin@linux.dev>
Link: https://patch.msgid.link/20251014235604.3057003-4-kuniyu@google.com
This commit is contained in:
parent
7c268eaeec
commit
b46ab63181
|
|
@ -212,6 +212,14 @@ mem_pcpu_rsv
|
||||||
|
|
||||||
Per-cpu reserved forward alloc cache size in page units. Default 1MB per CPU.
|
Per-cpu reserved forward alloc cache size in page units. Default 1MB per CPU.
|
||||||
|
|
||||||
|
bypass_prot_mem
|
||||||
|
---------------
|
||||||
|
|
||||||
|
Skip charging socket buffers to the global per-protocol memory
|
||||||
|
accounting controlled by net.ipv4.tcp_mem, net.ipv4.udp_mem, etc.
|
||||||
|
|
||||||
|
Default: 0 (off)
|
||||||
|
|
||||||
rmem_default
|
rmem_default
|
||||||
------------
|
------------
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -17,6 +17,7 @@ struct netns_core {
|
||||||
int sysctl_optmem_max;
|
int sysctl_optmem_max;
|
||||||
u8 sysctl_txrehash;
|
u8 sysctl_txrehash;
|
||||||
u8 sysctl_tstamp_allow_data;
|
u8 sysctl_tstamp_allow_data;
|
||||||
|
u8 sysctl_bypass_prot_mem;
|
||||||
|
|
||||||
#ifdef CONFIG_PROC_FS
|
#ifdef CONFIG_PROC_FS
|
||||||
struct prot_inuse __percpu *prot_inuse;
|
struct prot_inuse __percpu *prot_inuse;
|
||||||
|
|
|
||||||
|
|
@ -2306,8 +2306,13 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
|
||||||
* why we need sk_prot_creator -acme
|
* why we need sk_prot_creator -acme
|
||||||
*/
|
*/
|
||||||
sk->sk_prot = sk->sk_prot_creator = prot;
|
sk->sk_prot = sk->sk_prot_creator = prot;
|
||||||
|
|
||||||
|
if (READ_ONCE(net->core.sysctl_bypass_prot_mem))
|
||||||
|
sk->sk_bypass_prot_mem = 1;
|
||||||
|
|
||||||
sk->sk_kern_sock = kern;
|
sk->sk_kern_sock = kern;
|
||||||
sock_lock_init(sk);
|
sock_lock_init(sk);
|
||||||
|
|
||||||
sk->sk_net_refcnt = kern ? 0 : 1;
|
sk->sk_net_refcnt = kern ? 0 : 1;
|
||||||
if (likely(sk->sk_net_refcnt)) {
|
if (likely(sk->sk_net_refcnt)) {
|
||||||
get_net_track(net, &sk->ns_tracker, priority);
|
get_net_track(net, &sk->ns_tracker, priority);
|
||||||
|
|
|
||||||
|
|
@ -683,6 +683,15 @@ static struct ctl_table netns_core_table[] = {
|
||||||
.extra1 = SYSCTL_ZERO,
|
.extra1 = SYSCTL_ZERO,
|
||||||
.extra2 = SYSCTL_ONE
|
.extra2 = SYSCTL_ONE
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
.procname = "bypass_prot_mem",
|
||||||
|
.data = &init_net.core.sysctl_bypass_prot_mem,
|
||||||
|
.maxlen = sizeof(u8),
|
||||||
|
.mode = 0644,
|
||||||
|
.proc_handler = proc_dou8vec_minmax,
|
||||||
|
.extra1 = SYSCTL_ZERO,
|
||||||
|
.extra2 = SYSCTL_ONE
|
||||||
|
},
|
||||||
/* sysctl_core_net_init() will set the values after this
|
/* sysctl_core_net_init() will set the values after this
|
||||||
* to readonly in network namespaces
|
* to readonly in network namespaces
|
||||||
*/
|
*/
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue