
Merge tag 'for-netdev' of https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next

Martin KaFai Lau says:

====================
pull-request: bpf-next 2025-10-16

We've added 6 non-merge commits during the last 1 day(s) which contain
a total of 18 files changed, 577 insertions(+), 38 deletions(-).

The main changes are:

1) Bypass the global per-protocol memory accounting either by setting
a netns sysctl or using bpf_setsockopt in a bpf program,
from Kuniyuki Iwashima.

* tag 'for-netdev' of https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next:
selftests/bpf: Add test for sk->sk_bypass_prot_mem.
bpf: Introduce SK_BPF_BYPASS_PROT_MEM.
bpf: Support bpf_setsockopt() for BPF_CGROUP_INET_SOCK_CREATE.
net: Introduce net.core.bypass_prot_mem sysctl.
net: Allow opt-out from global protocol memory accounting.
tcp: Save lock_sock() for memcg in inet_csk_accept().
====================

Link: https://patch.msgid.link/20251016204539.773707-1-martin.lau@linux.dev
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
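
For context, SK_BPF_BYPASS_PROT_MEM is only usable from BPF_CGROUP_INET_SOCK_CREATE programs (see the net/core/filter.c hunk below), so the typical consumer is a small cgroup hook. A minimal sketch, not part of this pull request (the fallback #defines cover headers that predate this series; SOL_SOCKET is 1 on most architectures):

// SPDX-License-Identifier: GPL-2.0
/* Sketch: opt every socket created in this cgroup out of the global
 * per-protocol memory accounting. Mirrors the selftest program below.
 */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

#ifndef SOL_SOCKET
#define SOL_SOCKET 1			/* value on most architectures */
#endif
#ifndef SK_BPF_BYPASS_PROT_MEM
#define SK_BPF_BYPASS_PROT_MEM 1010	/* from the uapi hunk below */
#endif

SEC("cgroup/sock_create")
int bypass_prot_mem(struct bpf_sock *ctx)
{
	int val = 1;

	/* Flip sk->sk_bypass_prot_mem before the socket is first used. */
	if (bpf_setsockopt(ctx, SOL_SOCKET, SK_BPF_BYPASS_PROT_MEM,
			   &val, sizeof(val)))
		return 0;	/* reject the socket if the opt-out failed */

	return 1;		/* allow socket creation */
}

char _license[] SEC("license") = "GPL";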

18 files changed: +577 -38
Documentation/admin-guide/sysctl/net.rst (+8)
···
 
 Per-cpu reserved forward alloc cache size in page units. Default 1MB per CPU.
 
+bypass_prot_mem
+---------------
+
+Skip charging socket buffers to the global per-protocol memory
+accounting controlled by net.ipv4.tcp_mem, net.ipv4.udp_mem, etc.
+
+Default: 0 (off)
+
 rmem_default
 ------------
 
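
The sysctl is per network namespace (the new entry sits above the table's read-only-in-netns marker below), so one workload can be opted out without touching the rest of the host. A hedged userspace sketch of flipping it; only the proc path comes from this patch, the rest is plain POSIX I/O:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

/* Enable net.core.bypass_prot_mem in the current netns. Sockets
 * created afterwards inherit sk_bypass_prot_mem = 1 (see the
 * sk_alloc() hunk in net/core/sock.c below).
 */
static int enable_bypass_prot_mem(void)
{
	int fd = open("/proc/sys/net/core/bypass_prot_mem", O_WRONLY);

	if (fd < 0) {
		perror("open");
		return -1;
	}
	if (write(fd, "1", 1) != 1) {
		perror("write");
		close(fd);
		return -1;
	}
	return close(fd);
}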
include/net/netns/core.h (+1)
···
 	int	sysctl_optmem_max;
 	u8	sysctl_txrehash;
 	u8	sysctl_tstamp_allow_data;
+	u8	sysctl_bypass_prot_mem;
 
 #ifdef CONFIG_PROC_FS
 	struct prot_inuse __percpu *prot_inuse;
include/net/proto_memory.h (+3)
···
 	    mem_cgroup_sk_under_memory_pressure(sk))
 		return true;
 
+	if (sk->sk_bypass_prot_mem)
+		return false;
+
 	return !!READ_ONCE(*sk->sk_prot->memory_pressure);
 }
 
include/net/sock.h (+3)
···
  *	@skc_reuseport: %SO_REUSEPORT setting
  *	@skc_ipv6only: socket is IPV6 only
  *	@skc_net_refcnt: socket is using net ref counting
+ *	@skc_bypass_prot_mem: bypass the per-protocol memory accounting for skb
  *	@skc_bound_dev_if: bound device index if != 0
  *	@skc_bind_node: bind hash linkage for various protocol lookup tables
  *	@skc_portaddr_node: second hash linkage for UDP/UDP-Lite protocol
···
 	unsigned char		skc_reuseport:1;
 	unsigned char		skc_ipv6only:1;
 	unsigned char		skc_net_refcnt:1;
+	unsigned char		skc_bypass_prot_mem:1;
 	int			skc_bound_dev_if;
 	union {
 		struct hlist_node	skc_bind_node;
···
 #define sk_reuseport		__sk_common.skc_reuseport
 #define sk_ipv6only		__sk_common.skc_ipv6only
 #define sk_net_refcnt		__sk_common.skc_net_refcnt
+#define sk_bypass_prot_mem	__sk_common.skc_bypass_prot_mem
 #define sk_bound_dev_if		__sk_common.skc_bound_dev_if
 #define sk_bind_node		__sk_common.skc_bind_node
 #define sk_prot			__sk_common.skc_prot
include/net/tcp.h (+3)
···
 	    mem_cgroup_sk_under_memory_pressure(sk))
 		return true;
 
+	if (sk->sk_bypass_prot_mem)
+		return false;
+
 	return READ_ONCE(tcp_memory_pressure);
 }
 /*
include/uapi/linux/bpf.h (+2)
···
 	TCP_BPF_SYN_MAC         = 1007, /* Copy the MAC, IP[46], and TCP header */
 	TCP_BPF_SOCK_OPS_CB_FLAGS = 1008, /* Get or Set TCP sock ops flags */
 	SK_BPF_CB_FLAGS		= 1009, /* Get or set sock ops flags in socket */
+	SK_BPF_BYPASS_PROT_MEM	= 1010, /* Get or Set sk->sk_bypass_prot_mem */
+
 };
 
 enum {
net/core/filter.c (+85)
···
 	.arg5_type	= ARG_CONST_SIZE,
 };
 
+static int sk_bpf_set_get_bypass_prot_mem(struct sock *sk,
+					  char *optval, int optlen,
+					  bool getopt)
+{
+	int val;
+
+	if (optlen != sizeof(int))
+		return -EINVAL;
+
+	if (!sk_has_account(sk))
+		return -EOPNOTSUPP;
+
+	if (getopt) {
+		*(int *)optval = sk->sk_bypass_prot_mem;
+		return 0;
+	}
+
+	val = *(int *)optval;
+	if (val < 0 || val > 1)
+		return -EINVAL;
+
+	sk->sk_bypass_prot_mem = val;
+	return 0;
+}
+
+BPF_CALL_5(bpf_sock_create_setsockopt, struct sock *, sk, int, level,
+	   int, optname, char *, optval, int, optlen)
+{
+	if (level == SOL_SOCKET && optname == SK_BPF_BYPASS_PROT_MEM)
+		return sk_bpf_set_get_bypass_prot_mem(sk, optval, optlen, false);
+
+	return __bpf_setsockopt(sk, level, optname, optval, optlen);
+}
+
+static const struct bpf_func_proto bpf_sock_create_setsockopt_proto = {
+	.func		= bpf_sock_create_setsockopt,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_ANYTHING,
+	.arg3_type	= ARG_ANYTHING,
+	.arg4_type	= ARG_PTR_TO_MEM | MEM_RDONLY,
+	.arg5_type	= ARG_CONST_SIZE,
+};
+
+BPF_CALL_5(bpf_sock_create_getsockopt, struct sock *, sk, int, level,
+	   int, optname, char *, optval, int, optlen)
+{
+	if (level == SOL_SOCKET && optname == SK_BPF_BYPASS_PROT_MEM) {
+		int err = sk_bpf_set_get_bypass_prot_mem(sk, optval, optlen, true);
+
+		if (err)
+			memset(optval, 0, optlen);
+
+		return err;
+	}
+
+	return __bpf_getsockopt(sk, level, optname, optval, optlen);
+}
+
+static const struct bpf_func_proto bpf_sock_create_getsockopt_proto = {
+	.func		= bpf_sock_create_getsockopt,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_ANYTHING,
+	.arg3_type	= ARG_ANYTHING,
+	.arg4_type	= ARG_PTR_TO_UNINIT_MEM,
+	.arg5_type	= ARG_CONST_SIZE,
+};
+
 BPF_CALL_5(bpf_sock_ops_setsockopt, struct bpf_sock_ops_kern *, bpf_sock,
 	   int, level, int, optname, char *, optval, int, optlen)
 {
···
 		return &bpf_sk_storage_get_cg_sock_proto;
 	case BPF_FUNC_ktime_get_coarse_ns:
 		return &bpf_ktime_get_coarse_ns_proto;
+	case BPF_FUNC_setsockopt:
+		switch (prog->expected_attach_type) {
+		case BPF_CGROUP_INET_SOCK_CREATE:
+			return &bpf_sock_create_setsockopt_proto;
+		default:
+			return NULL;
+		}
+	case BPF_FUNC_getsockopt:
+		switch (prog->expected_attach_type) {
+		case BPF_CGROUP_INET_SOCK_CREATE:
+			return &bpf_sock_create_getsockopt_proto;
+		default:
+			return NULL;
+		}
 	default:
 		return bpf_base_func_proto(func_id, prog);
 	}
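
On the userspace side, nothing new is needed beyond attaching the cgroup program. A sketch with stock libbpf; the object, program, and cgroup names are placeholders:

#include <bpf/libbpf.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	struct bpf_object *obj;
	struct bpf_program *prog;
	struct bpf_link *link;
	int cg_fd;

	/* "bypass.bpf.o" / "bypass_prot_mem" are hypothetical names. */
	obj = bpf_object__open_file("bypass.bpf.o", NULL);
	if (!obj || bpf_object__load(obj))
		return 1;

	prog = bpf_object__find_program_by_name(obj, "bypass_prot_mem");
	cg_fd = open("/sys/fs/cgroup/mygroup", O_RDONLY | O_DIRECTORY);
	if (!prog || cg_fd < 0)
		return 1;

	/* Runs for every socket(2) call made from this cgroup. */
	link = bpf_program__attach_cgroup(prog, cg_fd);
	if (!link) {
		perror("attach_cgroup");
		return 1;
	}

	pause();	/* keep the link alive while this process runs */
	return 0;
}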
net/core/sock.c (+30 -7)
···
 	if (!charged)
 		return -ENOMEM;
 
+	if (sk->sk_bypass_prot_mem)
+		goto success;
+
 	/* pre-charge to forward_alloc */
 	sk_memory_allocated_add(sk, pages);
 	allocated = sk_memory_allocated(sk);
+
 	/* If the system goes into memory pressure with this
 	 * precharge, give up and return error.
 	 */
···
 		mem_cgroup_sk_uncharge(sk, pages);
 		return -ENOMEM;
 	}
+
+success:
 	sk_forward_alloc_add(sk, pages << PAGE_SHIFT);
 
 	WRITE_ONCE(sk->sk_reserved_mem,
···
 		 * why we need sk_prot_creator -acme
 		 */
 		sk->sk_prot = sk->sk_prot_creator = prot;
+
+		if (READ_ONCE(net->core.sysctl_bypass_prot_mem))
+			sk->sk_bypass_prot_mem = 1;
+
 		sk->sk_kern_sock = kern;
 		sock_lock_init(sk);
+
 		sk->sk_net_refcnt = kern ? 0 : 1;
 		if (likely(sk->sk_net_refcnt)) {
 			get_net_track(net, &sk->ns_tracker, priority);
···
 	if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
 		return true;
 
-	sk_enter_memory_pressure(sk);
+	if (!sk->sk_bypass_prot_mem)
+		sk_enter_memory_pressure(sk);
+
 	sk_stream_moderate_sndbuf(sk);
+
 	return false;
 }
 EXPORT_SYMBOL(sk_page_frag_refill);
···
 {
 	bool memcg_enabled = false, charged = false;
 	struct proto *prot = sk->sk_prot;
-	long allocated;
+	long allocated = 0;
 
-	sk_memory_allocated_add(sk, amt);
-	allocated = sk_memory_allocated(sk);
+	if (!sk->sk_bypass_prot_mem) {
+		sk_memory_allocated_add(sk, amt);
+		allocated = sk_memory_allocated(sk);
+	}
 
 	if (mem_cgroup_sk_enabled(sk)) {
 		memcg_enabled = true;
···
 		if (!charged)
 			goto suppress_allocation;
 	}
+
+	if (!allocated)
+		return 1;
 
 	/* Under limit. */
 	if (allocated <= sk_prot_mem_limits(sk, 0)) {
···
 
 	trace_sock_exceed_buf_limit(sk, prot, allocated, kind);
 
-	sk_memory_allocated_sub(sk, amt);
+	if (allocated)
+		sk_memory_allocated_sub(sk, amt);
 
 	if (charged)
 		mem_cgroup_sk_uncharge(sk, amt);
···
  */
 void __sk_mem_reduce_allocated(struct sock *sk, int amount)
 {
-	sk_memory_allocated_sub(sk, amount);
-
 	if (mem_cgroup_sk_enabled(sk))
 		mem_cgroup_sk_uncharge(sk, amount);
+
+	if (sk->sk_bypass_prot_mem)
+		return;
+
+	sk_memory_allocated_sub(sk, amount);
 
 	if (sk_under_global_memory_pressure(sk) &&
 	    (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
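
The heart of the series is the reworked __sk_mem_raise_allocated(): for an opted-out socket the global counter is never touched, so allocated stays 0, and that doubles as the marker that skips the tcp_mem/udp_mem limit checks while memcg charging still happens. A stand-alone, deliberately simplified model of that flow (stub state only, not the kernel function):

#include <stdbool.h>
#include <stdio.h>

static long global_allocated;	/* stands in for the protocol counter */

static int raise_allocated(bool bypass, bool memcg_ok, long amt, long limit)
{
	long allocated = 0;

	if (!bypass) {
		global_allocated += amt;
		allocated = global_allocated;
	}

	if (!memcg_ok)			/* memcg charge failed */
		goto suppress;

	if (!allocated)			/* bypassed: no global limit to apply */
		return 1;

	if (allocated <= limit)		/* under the global limit */
		return 1;

suppress:
	if (allocated)			/* only undo what was actually added */
		global_allocated -= amt;
	return 0;
}

int main(void)
{
	printf("%d\n", raise_allocated(true, true, 100, 10));	/* 1: bypassed */
	printf("%d\n", raise_allocated(false, true, 100, 10));	/* 0: over limit */
	return 0;
}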
net/core/sysctl_net_core.c (+9)
···
 		.extra1		= SYSCTL_ZERO,
 		.extra2		= SYSCTL_ONE
 	},
+	{
+		.procname	= "bypass_prot_mem",
+		.data		= &init_net.core.sysctl_bypass_prot_mem,
+		.maxlen		= sizeof(u8),
+		.mode		= 0644,
+		.proc_handler	= proc_dou8vec_minmax,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE
+	},
 	/* sysctl_core_net_init() will set the values after this
 	 * to readonly in network namespaces
 	 */
net/ipv4/af_inet.c (+22)
···
 
 void __inet_accept(struct socket *sock, struct socket *newsock, struct sock *newsk)
 {
+	/* TODO: use sk_clone_lock() in SCTP and remove protocol checks */
+	if (mem_cgroup_sockets_enabled &&
+	    (!IS_ENABLED(CONFIG_IP_SCTP) || sk_is_tcp(newsk))) {
+		gfp_t gfp = GFP_KERNEL | __GFP_NOFAIL;
+
+		mem_cgroup_sk_alloc(newsk);
+
+		if (mem_cgroup_from_sk(newsk)) {
+			int amt;
+
+			/* The socket has not been accepted yet, no need
+			 * to look at newsk->sk_wmem_queued.
+			 */
+			amt = sk_mem_pages(newsk->sk_forward_alloc +
+					   atomic_read(&newsk->sk_rmem_alloc));
+			if (amt)
+				mem_cgroup_sk_charge(newsk, amt, gfp);
+		}
+
+		kmem_cache_charge(newsk, gfp);
+	}
+
 	sock_rps_record_flow(newsk);
 	WARN_ON(!((1 << newsk->sk_state) &
 		  (TCPF_ESTABLISHED | TCPF_SYN_RECV |
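
The amt computation rounds the child socket's buffered bytes up to whole pages before charging the memcg. Assuming the usual sk_mem_pages() definition, (amt + PAGE_SIZE - 1) >> PAGE_SHIFT, with 4K pages:

#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1 << PAGE_SHIFT)

/* Same rounding as include/net/sock.h's sk_mem_pages() (assumed). */
static int sk_mem_pages(int amt)
{
	return (amt + PAGE_SIZE - 1) >> PAGE_SHIFT;
}

int main(void)
{
	printf("%d\n", sk_mem_pages(1));		/* 1: one byte still costs a page */
	printf("%d\n", sk_mem_pages(4096));		/* 1 */
	printf("%d\n", sk_mem_pages(4096 + 1500));	/* 2 */
	return 0;
}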
net/ipv4/inet_connection_sock.c (-25)
···
 
 	release_sock(sk);
 
-	if (mem_cgroup_sockets_enabled) {
-		gfp_t gfp = GFP_KERNEL | __GFP_NOFAIL;
-		int amt = 0;
-
-		/* atomically get the memory usage, set and charge the
-		 * newsk->sk_memcg.
-		 */
-		lock_sock(newsk);
-
-		mem_cgroup_sk_alloc(newsk);
-		if (mem_cgroup_from_sk(newsk)) {
-			/* The socket has not been accepted yet, no need
-			 * to look at newsk->sk_wmem_queued.
-			 */
-			amt = sk_mem_pages(newsk->sk_forward_alloc +
-					   atomic_read(&newsk->sk_rmem_alloc));
-		}
-
-		if (amt)
-			mem_cgroup_sk_charge(newsk, amt, gfp);
-		kmem_cache_charge(newsk, gfp);
-
-		release_sock(newsk);
-	}
-
 	if (req)
 		reqsk_put(req);
 
net/ipv4/tcp.c (+2 -1)
···
 		}
 		__kfree_skb(skb);
 	} else {
-		sk->sk_prot->enter_memory_pressure(sk);
+		if (!sk->sk_bypass_prot_mem)
+			tcp_enter_memory_pressure(sk);
 		sk_stream_moderate_sndbuf(sk);
 	}
 	return NULL;
net/ipv4/tcp_output.c (+6 -1)
···
 	delta = size - sk->sk_forward_alloc;
 	if (delta <= 0)
 		return;
+
 	amt = sk_mem_pages(delta);
 	sk_forward_alloc_add(sk, amt << PAGE_SHIFT);
-	sk_memory_allocated_add(sk, amt);
 
 	if (mem_cgroup_sk_enabled(sk))
 		mem_cgroup_sk_charge(sk, amt, gfp_memcg_charge() | __GFP_NOFAIL);
+
+	if (sk->sk_bypass_prot_mem)
+		return;
+
+	sk_memory_allocated_add(sk, amt);
 }
 
 /* Send a FIN. The caller locks the socket for us.
net/mptcp/protocol.c (+4 -3)
···
 	mptcp_for_each_subflow(msk, subflow) {
 		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
 
-		if (first)
+		if (first && !ssk->sk_bypass_prot_mem) {
 			tcp_enter_memory_pressure(ssk);
-		sk_stream_moderate_sndbuf(ssk);
+			first = false;
+		}
 
-		first = false;
+		sk_stream_moderate_sndbuf(ssk);
 	}
 	__mptcp_sync_sndbuf(sk);
 }
net/tls/tls_device.c (+2 -1)
···
 	if (!offload_ctx->open_record) {
 		if (unlikely(!skb_page_frag_refill(prepend_size, pfrag,
 						   sk->sk_allocation))) {
-			READ_ONCE(sk->sk_prot)->enter_memory_pressure(sk);
+			if (!sk->sk_bypass_prot_mem)
+				READ_ONCE(sk->sk_prot)->enter_memory_pressure(sk);
 			sk_stream_moderate_sndbuf(sk);
 			return -ENOMEM;
 		}
tools/include/uapi/linux/bpf.h (+1)
···
 	TCP_BPF_SYN_MAC         = 1007, /* Copy the MAC, IP[46], and TCP header */
 	TCP_BPF_SOCK_OPS_CB_FLAGS = 1008, /* Get or Set TCP sock ops flags */
 	SK_BPF_CB_FLAGS		= 1009, /* Get or set sock ops flags in socket */
+	SK_BPF_BYPASS_PROT_MEM	= 1010, /* Get or Set sk->sk_bypass_prot_mem */
 };
 
 enum {
tools/testing/selftests/bpf/prog_tests/sk_bypass_prot_mem.c (+292, new file)
···
// SPDX-License-Identifier: GPL-2.0
/* Copyright 2025 Google LLC */

#include <test_progs.h>
#include "sk_bypass_prot_mem.skel.h"
#include "network_helpers.h"

#define NR_PAGES	32
#define NR_SOCKETS	2
#define BUF_TOTAL	(NR_PAGES * 4096 / NR_SOCKETS)
#define BUF_SINGLE	1024
#define NR_SEND		(BUF_TOTAL / BUF_SINGLE)

struct test_case {
	char name[8];
	int family;
	int type;
	int (*create_sockets)(struct test_case *test_case, int sk[], int len);
	long (*get_memory_allocated)(struct test_case *test_case, struct sk_bypass_prot_mem *skel);
};

static int tcp_create_sockets(struct test_case *test_case, int sk[], int len)
{
	int server, i, err = 0;

	server = start_server(test_case->family, test_case->type, NULL, 0, 0);
	if (!ASSERT_GE(server, 0, "start_server_str"))
		return server;

	/* Keep for-loop so we can change NR_SOCKETS easily. */
	for (i = 0; i < len; i += 2) {
		sk[i] = connect_to_fd(server, 0);
		if (sk[i] < 0) {
			ASSERT_GE(sk[i], 0, "connect_to_fd");
			err = sk[i];
			break;
		}

		sk[i + 1] = accept(server, NULL, NULL);
		if (sk[i + 1] < 0) {
			ASSERT_GE(sk[i + 1], 0, "accept");
			err = sk[i + 1];
			break;
		}
	}

	close(server);

	return err;
}

static int udp_create_sockets(struct test_case *test_case, int sk[], int len)
{
	int i, j, err, rcvbuf = BUF_TOTAL;

	/* Keep for-loop so we can change NR_SOCKETS easily. */
	for (i = 0; i < len; i += 2) {
		sk[i] = start_server(test_case->family, test_case->type, NULL, 0, 0);
		if (sk[i] < 0) {
			ASSERT_GE(sk[i], 0, "start_server");
			return sk[i];
		}

		sk[i + 1] = connect_to_fd(sk[i], 0);
		if (sk[i + 1] < 0) {
			ASSERT_GE(sk[i + 1], 0, "connect_to_fd");
			return sk[i + 1];
		}

		err = connect_fd_to_fd(sk[i], sk[i + 1], 0);
		if (err) {
			ASSERT_EQ(err, 0, "connect_fd_to_fd");
			return err;
		}

		for (j = 0; j < 2; j++) {
			err = setsockopt(sk[i + j], SOL_SOCKET, SO_RCVBUF, &rcvbuf, sizeof(int));
			if (err) {
				ASSERT_EQ(err, 0, "setsockopt(SO_RCVBUF)");
				return err;
			}
		}
	}

	return 0;
}

static long get_memory_allocated(struct test_case *test_case,
				 bool *activated, long *memory_allocated)
{
	int sk;

	*activated = true;

	/* AF_INET and AF_INET6 share the same memory_allocated.
	 * tcp_init_sock() is called by AF_INET and AF_INET6,
	 * but udp_lib_init_sock() is inline.
	 */
	sk = socket(AF_INET, test_case->type, 0);
	if (!ASSERT_GE(sk, 0, "get_memory_allocated"))
		return -1;

	close(sk);

	return *memory_allocated;
}

static long tcp_get_memory_allocated(struct test_case *test_case, struct sk_bypass_prot_mem *skel)
{
	return get_memory_allocated(test_case,
				    &skel->bss->tcp_activated,
				    &skel->bss->tcp_memory_allocated);
}

static long udp_get_memory_allocated(struct test_case *test_case, struct sk_bypass_prot_mem *skel)
{
	return get_memory_allocated(test_case,
				    &skel->bss->udp_activated,
				    &skel->bss->udp_memory_allocated);
}

static int check_bypass(struct test_case *test_case,
			struct sk_bypass_prot_mem *skel, bool bypass)
{
	char buf[BUF_SINGLE] = {};
	long memory_allocated[2];
	int sk[NR_SOCKETS];
	int err, i, j;

	for (i = 0; i < ARRAY_SIZE(sk); i++)
		sk[i] = -1;

	err = test_case->create_sockets(test_case, sk, ARRAY_SIZE(sk));
	if (err)
		goto close;

	memory_allocated[0] = test_case->get_memory_allocated(test_case, skel);

	/* allocate pages >= NR_PAGES */
	for (i = 0; i < ARRAY_SIZE(sk); i++) {
		for (j = 0; j < NR_SEND; j++) {
			int bytes = send(sk[i], buf, sizeof(buf), 0);

			/* Avoid too noisy logs when something failed. */
			if (bytes != sizeof(buf)) {
				ASSERT_EQ(bytes, sizeof(buf), "send");
				if (bytes < 0) {
					err = bytes;
					goto drain;
				}
			}
		}
	}

	memory_allocated[1] = test_case->get_memory_allocated(test_case, skel);

	if (bypass)
		ASSERT_LE(memory_allocated[1], memory_allocated[0] + 10, "bypass");
	else
		ASSERT_GT(memory_allocated[1], memory_allocated[0] + NR_PAGES, "no bypass");

drain:
	if (test_case->type == SOCK_DGRAM) {
		/* UDP starts purging sk->sk_receive_queue after one RCU
		 * grace period, then udp_memory_allocated goes down,
		 * so drain the queue before close().
		 */
		for (i = 0; i < ARRAY_SIZE(sk); i++) {
			for (j = 0; j < NR_SEND; j++) {
				int bytes = recv(sk[i], buf, 1, MSG_DONTWAIT | MSG_TRUNC);

				if (bytes == sizeof(buf))
					continue;
				if (bytes != -1 || errno != EAGAIN)
					PRINT_FAIL("bytes: %d, errno: %s\n", bytes, strerror(errno));
				break;
			}
		}
	}

close:
	for (i = 0; i < ARRAY_SIZE(sk); i++) {
		if (sk[i] < 0)
			break;

		close(sk[i]);
	}

	return err;
}

static void run_test(struct test_case *test_case)
{
	struct sk_bypass_prot_mem *skel;
	struct nstoken *nstoken;
	int cgroup, err;

	skel = sk_bypass_prot_mem__open_and_load();
	if (!ASSERT_OK_PTR(skel, "open_and_load"))
		return;

	skel->bss->nr_cpus = libbpf_num_possible_cpus();

	err = sk_bypass_prot_mem__attach(skel);
	if (!ASSERT_OK(err, "attach"))
		goto destroy_skel;

	cgroup = test__join_cgroup("/sk_bypass_prot_mem");
	if (!ASSERT_GE(cgroup, 0, "join_cgroup"))
		goto destroy_skel;

	err = make_netns("sk_bypass_prot_mem");
	if (!ASSERT_EQ(err, 0, "make_netns"))
		goto close_cgroup;

	nstoken = open_netns("sk_bypass_prot_mem");
	if (!ASSERT_OK_PTR(nstoken, "open_netns"))
		goto remove_netns;

	err = check_bypass(test_case, skel, false);
	if (!ASSERT_EQ(err, 0, "test_bypass(false)"))
		goto close_netns;

	err = write_sysctl("/proc/sys/net/core/bypass_prot_mem", "1");
	if (!ASSERT_EQ(err, 0, "write_sysctl(1)"))
		goto close_netns;

	err = check_bypass(test_case, skel, true);
	if (!ASSERT_EQ(err, 0, "test_bypass(true by sysctl)"))
		goto close_netns;

	err = write_sysctl("/proc/sys/net/core/bypass_prot_mem", "0");
	if (!ASSERT_EQ(err, 0, "write_sysctl(0)"))
		goto close_netns;

	skel->links.sock_create = bpf_program__attach_cgroup(skel->progs.sock_create, cgroup);
	if (!ASSERT_OK_PTR(skel->links.sock_create, "attach_cgroup(sock_create)"))
		goto close_netns;

	err = check_bypass(test_case, skel, true);
	ASSERT_EQ(err, 0, "test_bypass(true by bpf)");

close_netns:
	close_netns(nstoken);
remove_netns:
	remove_netns("sk_bypass_prot_mem");
close_cgroup:
	close(cgroup);
destroy_skel:
	sk_bypass_prot_mem__destroy(skel);
}

static struct test_case test_cases[] = {
	{
		.name = "TCP ",
		.family = AF_INET,
		.type = SOCK_STREAM,
		.create_sockets = tcp_create_sockets,
		.get_memory_allocated = tcp_get_memory_allocated,
	},
	{
		.name = "UDP ",
		.family = AF_INET,
		.type = SOCK_DGRAM,
		.create_sockets = udp_create_sockets,
		.get_memory_allocated = udp_get_memory_allocated,
	},
	{
		.name = "TCPv6",
		.family = AF_INET6,
		.type = SOCK_STREAM,
		.create_sockets = tcp_create_sockets,
		.get_memory_allocated = tcp_get_memory_allocated,
	},
	{
		.name = "UDPv6",
		.family = AF_INET6,
		.type = SOCK_DGRAM,
		.create_sockets = udp_create_sockets,
		.get_memory_allocated = udp_get_memory_allocated,
	},
};

void serial_test_sk_bypass_prot_mem(void)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(test_cases); i++) {
		if (test__start_subtest(test_cases[i].name))
			run_test(&test_cases[i]);
	}
}
tools/testing/selftests/bpf/progs/sk_bypass_prot_mem.c (+104, new file)
···
// SPDX-License-Identifier: GPL-2.0
/* Copyright 2025 Google LLC */

#include "bpf_tracing_net.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include <errno.h>

extern int tcp_memory_per_cpu_fw_alloc __ksym;
extern int udp_memory_per_cpu_fw_alloc __ksym;

int nr_cpus;
bool tcp_activated, udp_activated;
long tcp_memory_allocated, udp_memory_allocated;

struct sk_prot {
	long *memory_allocated;
	int *memory_per_cpu_fw_alloc;
};

static int drain_memory_per_cpu_fw_alloc(__u32 i, struct sk_prot *sk_prot_ctx)
{
	int *memory_per_cpu_fw_alloc;

	memory_per_cpu_fw_alloc = bpf_per_cpu_ptr(sk_prot_ctx->memory_per_cpu_fw_alloc, i);
	if (memory_per_cpu_fw_alloc)
		*sk_prot_ctx->memory_allocated += *memory_per_cpu_fw_alloc;

	return 0;
}

static long get_memory_allocated(struct sock *_sk, int *memory_per_cpu_fw_alloc)
{
	struct sock *sk = bpf_core_cast(_sk, struct sock);
	struct sk_prot sk_prot_ctx;
	long memory_allocated;

	/* net_aligned_data.{tcp,udp}_memory_allocated was not available. */
	memory_allocated = sk->__sk_common.skc_prot->memory_allocated->counter;

	sk_prot_ctx.memory_allocated = &memory_allocated;
	sk_prot_ctx.memory_per_cpu_fw_alloc = memory_per_cpu_fw_alloc;

	bpf_loop(nr_cpus, drain_memory_per_cpu_fw_alloc, &sk_prot_ctx, 0);

	return memory_allocated;
}

static void fentry_init_sock(struct sock *sk, bool *activated,
			     long *memory_allocated, int *memory_per_cpu_fw_alloc)
{
	if (!*activated)
		return;

	*memory_allocated = get_memory_allocated(sk, memory_per_cpu_fw_alloc);
	*activated = false;
}

SEC("fentry/tcp_init_sock")
int BPF_PROG(fentry_tcp_init_sock, struct sock *sk)
{
	fentry_init_sock(sk, &tcp_activated,
			 &tcp_memory_allocated, &tcp_memory_per_cpu_fw_alloc);
	return 0;
}

SEC("fentry/udp_init_sock")
int BPF_PROG(fentry_udp_init_sock, struct sock *sk)
{
	fentry_init_sock(sk, &udp_activated,
			 &udp_memory_allocated, &udp_memory_per_cpu_fw_alloc);
	return 0;
}

SEC("cgroup/sock_create")
int sock_create(struct bpf_sock *ctx)
{
	int err, val = 1;

	err = bpf_setsockopt(ctx, SOL_SOCKET, SK_BPF_BYPASS_PROT_MEM,
			     &val, sizeof(val));
	if (err)
		goto err;

	val = 0;

	err = bpf_getsockopt(ctx, SOL_SOCKET, SK_BPF_BYPASS_PROT_MEM,
			     &val, sizeof(val));
	if (err)
		goto err;

	if (val != 1) {
		err = -EINVAL;
		goto err;
	}

	return 1;

err:
	bpf_set_retval(err);
	return 0;
}

char LICENSE[] SEC("license") = "GPL";
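
The bpf_loop() drain in get_memory_allocated() above exists because the protocol counter is split into one global atomic plus per-CPU forward-alloc caches holding signed deltas. A stand-alone model of the same summation (made-up values):

#include <stdio.h>

#define NR_CPUS 4

int main(void)
{
	long memory_allocated = 100;				/* global atomic counter */
	int per_cpu_fw_alloc[NR_CPUS] = { 3, -1, 0, 2 };	/* per-CPU cached deltas */

	/* True usage = global counter + every CPU's cached delta. */
	for (int i = 0; i < NR_CPUS; i++)
		memory_allocated += per_cpu_fw_alloc[i];

	printf("%ld\n", memory_allocated);	/* 104 */
	return 0;
}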