Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

net: Allow opt-out from global protocol memory accounting.

Some protocols (e.g., TCP, UDP) implement memory accounting for socket
buffers and charge memory to per-protocol global counters pointed to by
sk->sk_proto->memory_allocated.

Sometimes, system processes do not want that limitation. For a similar
purpose, there is SO_RESERVE_MEM for sockets under memcg.

Also, by opting out of the per-protocol accounting, sockets under memcg
can avoid paying costs for two orthogonal memory accounting mechanisms.
A microbenchmark result is in the subsequent bpf patch.

Let's allow opt-out from the per-protocol memory accounting if
sk->sk_bypass_prot_mem is true.

sk->sk_bypass_prot_mem and sk->sk_prot are placed in the same cache
line, and sk_has_account() always fetches sk->sk_prot before accessing
sk->sk_bypass_prot_mem, so there is no extra cache miss for this patch.

The following patches will set sk->sk_bypass_prot_mem to true, and
then, the per-protocol memory accounting will be skipped.

Note that this does NOT disable memcg, but rather the per-protocol one.

Another option that avoids using the hole in struct sock_common is to
create sk_prot variants like tcp_prot_bypass, but this would complicate
SOCKMAP logic, tcp_bpf_prots, etc.

Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Reviewed-by: Shakeel Butt <shakeel.butt@linux.dev>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Acked-by: Roman Gushchin <roman.gushchin@linux.dev>
Link: https://patch.msgid.link/20251014235604.3057003-3-kuniyu@google.com

authored by

Kuniyuki Iwashima and committed by
Martin KaFai Lau
7c268eae 4a997d49

+48 -13
+3
include/net/proto_memory.h
··· 35 35 mem_cgroup_sk_under_memory_pressure(sk)) 36 36 return true; 37 37 38 + if (sk->sk_bypass_prot_mem) 39 + return false; 40 + 38 41 return !!READ_ONCE(*sk->sk_prot->memory_pressure); 39 42 } 40 43
+3
include/net/sock.h
··· 118 118 * @skc_reuseport: %SO_REUSEPORT setting 119 119 * @skc_ipv6only: socket is IPV6 only 120 120 * @skc_net_refcnt: socket is using net ref counting 121 + * @skc_bypass_prot_mem: bypass the per-protocol memory accounting for skb 121 122 * @skc_bound_dev_if: bound device index if != 0 122 123 * @skc_bind_node: bind hash linkage for various protocol lookup tables 123 124 * @skc_portaddr_node: second hash linkage for UDP/UDP-Lite protocol ··· 175 174 unsigned char skc_reuseport:1; 176 175 unsigned char skc_ipv6only:1; 177 176 unsigned char skc_net_refcnt:1; 177 + unsigned char skc_bypass_prot_mem:1; 178 178 int skc_bound_dev_if; 179 179 union { 180 180 struct hlist_node skc_bind_node; ··· 383 381 #define sk_reuseport __sk_common.skc_reuseport 384 382 #define sk_ipv6only __sk_common.skc_ipv6only 385 383 #define sk_net_refcnt __sk_common.skc_net_refcnt 384 + #define sk_bypass_prot_mem __sk_common.skc_bypass_prot_mem 386 385 #define sk_bound_dev_if __sk_common.skc_bound_dev_if 387 386 #define sk_bind_node __sk_common.skc_bind_node 388 387 #define sk_prot __sk_common.skc_prot
+3
include/net/tcp.h
··· 303 303 mem_cgroup_sk_under_memory_pressure(sk)) 304 304 return true; 305 305 306 + if (sk->sk_bypass_prot_mem) 307 + return false; 308 + 306 309 return READ_ONCE(tcp_memory_pressure); 307 310 } 308 311 /*
+25 -7
net/core/sock.c
··· 1046 1046 if (!charged) 1047 1047 return -ENOMEM; 1048 1048 1049 + if (sk->sk_bypass_prot_mem) 1050 + goto success; 1051 + 1049 1052 /* pre-charge to forward_alloc */ 1050 1053 sk_memory_allocated_add(sk, pages); 1051 1054 allocated = sk_memory_allocated(sk); 1055 + 1052 1056 /* If the system goes into memory pressure with this 1053 1057 * precharge, give up and return error. 1054 1058 */ ··· 1061 1057 mem_cgroup_sk_uncharge(sk, pages); 1062 1058 return -ENOMEM; 1063 1059 } 1060 + 1061 + success: 1064 1062 sk_forward_alloc_add(sk, pages << PAGE_SHIFT); 1065 1063 1066 1064 WRITE_ONCE(sk->sk_reserved_mem, ··· 3151 3145 if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation))) 3152 3146 return true; 3153 3147 3154 - sk_enter_memory_pressure(sk); 3148 + if (!sk->sk_bypass_prot_mem) 3149 + sk_enter_memory_pressure(sk); 3150 + 3155 3151 sk_stream_moderate_sndbuf(sk); 3152 + 3156 3153 return false; 3157 3154 } 3158 3155 EXPORT_SYMBOL(sk_page_frag_refill); ··· 3272 3263 { 3273 3264 bool memcg_enabled = false, charged = false; 3274 3265 struct proto *prot = sk->sk_prot; 3275 - long allocated; 3266 + long allocated = 0; 3276 3267 3277 - sk_memory_allocated_add(sk, amt); 3278 - allocated = sk_memory_allocated(sk); 3268 + if (!sk->sk_bypass_prot_mem) { 3269 + sk_memory_allocated_add(sk, amt); 3270 + allocated = sk_memory_allocated(sk); 3271 + } 3279 3272 3280 3273 if (mem_cgroup_sk_enabled(sk)) { 3281 3274 memcg_enabled = true; ··· 3285 3274 if (!charged) 3286 3275 goto suppress_allocation; 3287 3276 } 3277 + 3278 + if (!allocated) 3279 + return 1; 3288 3280 3289 3281 /* Under limit. 
*/ 3290 3282 if (allocated <= sk_prot_mem_limits(sk, 0)) { ··· 3367 3353 3368 3354 trace_sock_exceed_buf_limit(sk, prot, allocated, kind); 3369 3355 3370 - sk_memory_allocated_sub(sk, amt); 3356 + if (allocated) 3357 + sk_memory_allocated_sub(sk, amt); 3371 3358 3372 3359 if (charged) 3373 3360 mem_cgroup_sk_uncharge(sk, amt); ··· 3407 3392 */ 3408 3393 void __sk_mem_reduce_allocated(struct sock *sk, int amount) 3409 3394 { 3410 - sk_memory_allocated_sub(sk, amount); 3411 - 3412 3395 if (mem_cgroup_sk_enabled(sk)) 3413 3396 mem_cgroup_sk_uncharge(sk, amount); 3397 + 3398 + if (sk->sk_bypass_prot_mem) 3399 + return; 3400 + 3401 + sk_memory_allocated_sub(sk, amount); 3414 3402 3415 3403 if (sk_under_global_memory_pressure(sk) && 3416 3404 (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
+2 -1
net/ipv4/tcp.c
··· 928 928 } 929 929 __kfree_skb(skb); 930 930 } else { 931 - sk->sk_prot->enter_memory_pressure(sk); 931 + if (!sk->sk_bypass_prot_mem) 932 + tcp_enter_memory_pressure(sk); 932 933 sk_stream_moderate_sndbuf(sk); 933 934 } 934 935 return NULL;
+6 -1
net/ipv4/tcp_output.c
··· 3743 3743 delta = size - sk->sk_forward_alloc; 3744 3744 if (delta <= 0) 3745 3745 return; 3746 + 3746 3747 amt = sk_mem_pages(delta); 3747 3748 sk_forward_alloc_add(sk, amt << PAGE_SHIFT); 3748 - sk_memory_allocated_add(sk, amt); 3749 3749 3750 3750 if (mem_cgroup_sk_enabled(sk)) 3751 3751 mem_cgroup_sk_charge(sk, amt, gfp_memcg_charge() | __GFP_NOFAIL); 3752 + 3753 + if (sk->sk_bypass_prot_mem) 3754 + return; 3755 + 3756 + sk_memory_allocated_add(sk, amt); 3752 3757 } 3753 3758 3754 3759 /* Send a FIN. The caller locks the socket for us.
+4 -3
net/mptcp/protocol.c
··· 1065 1065 mptcp_for_each_subflow(msk, subflow) { 1066 1066 struct sock *ssk = mptcp_subflow_tcp_sock(subflow); 1067 1067 1068 - if (first) 1068 + if (first && !ssk->sk_bypass_prot_mem) { 1069 1069 tcp_enter_memory_pressure(ssk); 1070 - sk_stream_moderate_sndbuf(ssk); 1070 + first = false; 1071 + } 1071 1072 1072 - first = false; 1073 + sk_stream_moderate_sndbuf(ssk); 1073 1074 } 1074 1075 __mptcp_sync_sndbuf(sk); 1075 1076 }
+2 -1
net/tls/tls_device.c
··· 373 373 if (!offload_ctx->open_record) { 374 374 if (unlikely(!skb_page_frag_refill(prepend_size, pfrag, 375 375 sk->sk_allocation))) { 376 - READ_ONCE(sk->sk_prot)->enter_memory_pressure(sk); 376 + if (!sk->sk_bypass_prot_mem) 377 + READ_ONCE(sk->sk_prot)->enter_memory_pressure(sk); 377 378 sk_stream_moderate_sndbuf(sk); 378 379 return -ENOMEM; 379 380 }