Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

bpf: tcp: Support tcp_congestion_ops in bpf

This patch makes "struct tcp_congestion_ops" the first user
of BPF STRUCT_OPS. It allows implementing a tcp_congestion_ops
in bpf.

The BPF implemented tcp_congestion_ops can be used like
regular kernel tcp-cc through sysctl and setsockopt. e.g.
[root@arch-fb-vm1 bpf]# sysctl -a | egrep congestion
net.ipv4.tcp_allowed_congestion_control = reno cubic bpf_cubic
net.ipv4.tcp_available_congestion_control = reno bic cubic bpf_cubic
net.ipv4.tcp_congestion_control = bpf_cubic

There have been attempts to move the TCP CC to user space
(e.g. CCP in TCP). The common arguments are a faster turnaround,
getting away from long-tail kernel versions in production, etc.,
which are legitimate points.

BPF has been a continuous effort to combine the upsides of both
kernel and userspace (e.g. XDP to gain the performance
advantage without bypassing the kernel). The recent BPF
advancements (in particular the BTF-aware verifier, BPF trampoline,
and BPF CO-RE) have made implementing kernel struct ops (e.g. tcp cc)
possible in BPF. It allows a faster turnaround for testing algorithms
in production while leveraging the existing (and continually growing)
BPF features/framework instead of building one specifically for
userspace TCP CC.

This patch allows write access to a few fields in tcp-sock
(in bpf_tcp_ca_btf_struct_access()).

The optional "get_info" is unsupported now. It can be added
later. One possible way is to output the info with a btf-id
to describe the content.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andriin@fb.com>
Acked-by: Yonghong Song <yhs@fb.com>
Link: https://lore.kernel.org/bpf/20200109003508.3856115-1-kafai@fb.com

authored by

Martin KaFai Lau and committed by
Alexei Starovoitov
0baf26b0 85d33df3

+261 -16
+2
include/linux/filter.h
··· 843 843 int bpf_prog_create_from_user(struct bpf_prog **pfp, struct sock_fprog *fprog, 844 844 bpf_aux_classic_check_t trans, bool save_orig); 845 845 void bpf_prog_destroy(struct bpf_prog *fp); 846 + const struct bpf_func_proto * 847 + bpf_base_func_proto(enum bpf_func_id func_id); 846 848 847 849 int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk); 848 850 int sk_attach_bpf(u32 ufd, struct sock *sk);
+2
include/net/tcp.h
··· 1007 1007 #define TCP_CONG_NON_RESTRICTED 0x1 1008 1008 /* Requires ECN/ECT set on all packets */ 1009 1009 #define TCP_CONG_NEEDS_ECN 0x2 1010 + #define TCP_CONG_MASK (TCP_CONG_NON_RESTRICTED | TCP_CONG_NEEDS_ECN) 1010 1011 1011 1012 union tcp_cc_info; 1012 1013 ··· 1102 1101 void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked); 1103 1102 extern struct tcp_congestion_ops tcp_reno; 1104 1103 1104 + struct tcp_congestion_ops *tcp_ca_find(const char *name); 1105 1105 struct tcp_congestion_ops *tcp_ca_find_key(u32 key); 1106 1106 u32 tcp_ca_get_key_by_name(struct net *net, const char *name, bool *ecn_ca); 1107 1107 #ifdef CONFIG_INET
+6 -1
kernel/bpf/bpf_struct_ops_types.h
··· 1 1 /* SPDX-License-Identifier: GPL-2.0 */ 2 2 /* internal file - do not include directly */ 3 3 4 - /* To be filled in a later patch */ 4 + #ifdef CONFIG_BPF_JIT 5 + #ifdef CONFIG_INET 6 + #include <net/tcp.h> 7 + BPF_STRUCT_OPS_TYPE(tcp_congestion_ops) 8 + #endif 9 + #endif
+1 -1
net/core/filter.c
··· 5935 5935 return false; 5936 5936 } 5937 5937 5938 - static const struct bpf_func_proto * 5938 + const struct bpf_func_proto * 5939 5939 bpf_base_func_proto(enum bpf_func_id func_id) 5940 5940 { 5941 5941 switch (func_id) {
+4
net/ipv4/Makefile
··· 65 65 66 66 obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \ 67 67 xfrm4_output.o xfrm4_protocol.o 68 + 69 + ifeq ($(CONFIG_BPF_JIT),y) 70 + obj-$(CONFIG_BPF_SYSCALL) += bpf_tcp_ca.o 71 + endif
+230
net/ipv4/bpf_tcp_ca.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* Copyright (c) 2019 Facebook */ 3 + 4 + #include <linux/types.h> 5 + #include <linux/bpf_verifier.h> 6 + #include <linux/bpf.h> 7 + #include <linux/btf.h> 8 + #include <linux/filter.h> 9 + #include <net/tcp.h> 10 + 11 + static u32 optional_ops[] = { 12 + offsetof(struct tcp_congestion_ops, init), 13 + offsetof(struct tcp_congestion_ops, release), 14 + offsetof(struct tcp_congestion_ops, set_state), 15 + offsetof(struct tcp_congestion_ops, cwnd_event), 16 + offsetof(struct tcp_congestion_ops, in_ack_event), 17 + offsetof(struct tcp_congestion_ops, pkts_acked), 18 + offsetof(struct tcp_congestion_ops, min_tso_segs), 19 + offsetof(struct tcp_congestion_ops, sndbuf_expand), 20 + offsetof(struct tcp_congestion_ops, cong_control), 21 + }; 22 + 23 + static u32 unsupported_ops[] = { 24 + offsetof(struct tcp_congestion_ops, get_info), 25 + }; 26 + 27 + static const struct btf_type *tcp_sock_type; 28 + static u32 tcp_sock_id, sock_id; 29 + 30 + static int bpf_tcp_ca_init(struct btf *btf) 31 + { 32 + s32 type_id; 33 + 34 + type_id = btf_find_by_name_kind(btf, "sock", BTF_KIND_STRUCT); 35 + if (type_id < 0) 36 + return -EINVAL; 37 + sock_id = type_id; 38 + 39 + type_id = btf_find_by_name_kind(btf, "tcp_sock", BTF_KIND_STRUCT); 40 + if (type_id < 0) 41 + return -EINVAL; 42 + tcp_sock_id = type_id; 43 + tcp_sock_type = btf_type_by_id(btf, tcp_sock_id); 44 + 45 + return 0; 46 + } 47 + 48 + static bool is_optional(u32 member_offset) 49 + { 50 + unsigned int i; 51 + 52 + for (i = 0; i < ARRAY_SIZE(optional_ops); i++) { 53 + if (member_offset == optional_ops[i]) 54 + return true; 55 + } 56 + 57 + return false; 58 + } 59 + 60 + static bool is_unsupported(u32 member_offset) 61 + { 62 + unsigned int i; 63 + 64 + for (i = 0; i < ARRAY_SIZE(unsupported_ops); i++) { 65 + if (member_offset == unsupported_ops[i]) 66 + return true; 67 + } 68 + 69 + return false; 70 + } 71 + 72 + extern struct btf *btf_vmlinux; 73 + 74 + static bool 
bpf_tcp_ca_is_valid_access(int off, int size, 75 + enum bpf_access_type type, 76 + const struct bpf_prog *prog, 77 + struct bpf_insn_access_aux *info) 78 + { 79 + if (off < 0 || off >= sizeof(__u64) * MAX_BPF_FUNC_ARGS) 80 + return false; 81 + if (type != BPF_READ) 82 + return false; 83 + if (off % size != 0) 84 + return false; 85 + 86 + if (!btf_ctx_access(off, size, type, prog, info)) 87 + return false; 88 + 89 + if (info->reg_type == PTR_TO_BTF_ID && info->btf_id == sock_id) 90 + /* promote it to tcp_sock */ 91 + info->btf_id = tcp_sock_id; 92 + 93 + return true; 94 + } 95 + 96 + static int bpf_tcp_ca_btf_struct_access(struct bpf_verifier_log *log, 97 + const struct btf_type *t, int off, 98 + int size, enum bpf_access_type atype, 99 + u32 *next_btf_id) 100 + { 101 + size_t end; 102 + 103 + if (atype == BPF_READ) 104 + return btf_struct_access(log, t, off, size, atype, next_btf_id); 105 + 106 + if (t != tcp_sock_type) { 107 + bpf_log(log, "only read is supported\n"); 108 + return -EACCES; 109 + } 110 + 111 + switch (off) { 112 + case bpf_ctx_range(struct inet_connection_sock, icsk_ca_priv): 113 + end = offsetofend(struct inet_connection_sock, icsk_ca_priv); 114 + break; 115 + case offsetof(struct inet_connection_sock, icsk_ack.pending): 116 + end = offsetofend(struct inet_connection_sock, 117 + icsk_ack.pending); 118 + break; 119 + case offsetof(struct tcp_sock, snd_cwnd): 120 + end = offsetofend(struct tcp_sock, snd_cwnd); 121 + break; 122 + case offsetof(struct tcp_sock, snd_cwnd_cnt): 123 + end = offsetofend(struct tcp_sock, snd_cwnd_cnt); 124 + break; 125 + case offsetof(struct tcp_sock, snd_ssthresh): 126 + end = offsetofend(struct tcp_sock, snd_ssthresh); 127 + break; 128 + case offsetof(struct tcp_sock, ecn_flags): 129 + end = offsetofend(struct tcp_sock, ecn_flags); 130 + break; 131 + default: 132 + bpf_log(log, "no write support to tcp_sock at off %d\n", off); 133 + return -EACCES; 134 + } 135 + 136 + if (off + size > end) { 137 + bpf_log(log, 138 + 
"write access at off %d with size %d beyond the member of tcp_sock ended at %zu\n", 139 + off, size, end); 140 + return -EACCES; 141 + } 142 + 143 + return NOT_INIT; 144 + } 145 + 146 + static const struct bpf_func_proto * 147 + bpf_tcp_ca_get_func_proto(enum bpf_func_id func_id, 148 + const struct bpf_prog *prog) 149 + { 150 + return bpf_base_func_proto(func_id); 151 + } 152 + 153 + static const struct bpf_verifier_ops bpf_tcp_ca_verifier_ops = { 154 + .get_func_proto = bpf_tcp_ca_get_func_proto, 155 + .is_valid_access = bpf_tcp_ca_is_valid_access, 156 + .btf_struct_access = bpf_tcp_ca_btf_struct_access, 157 + }; 158 + 159 + static int bpf_tcp_ca_init_member(const struct btf_type *t, 160 + const struct btf_member *member, 161 + void *kdata, const void *udata) 162 + { 163 + const struct tcp_congestion_ops *utcp_ca; 164 + struct tcp_congestion_ops *tcp_ca; 165 + size_t tcp_ca_name_len; 166 + int prog_fd; 167 + u32 moff; 168 + 169 + utcp_ca = (const struct tcp_congestion_ops *)udata; 170 + tcp_ca = (struct tcp_congestion_ops *)kdata; 171 + 172 + moff = btf_member_bit_offset(t, member) / 8; 173 + switch (moff) { 174 + case offsetof(struct tcp_congestion_ops, flags): 175 + if (utcp_ca->flags & ~TCP_CONG_MASK) 176 + return -EINVAL; 177 + tcp_ca->flags = utcp_ca->flags; 178 + return 1; 179 + case offsetof(struct tcp_congestion_ops, name): 180 + tcp_ca_name_len = strnlen(utcp_ca->name, sizeof(utcp_ca->name)); 181 + if (!tcp_ca_name_len || 182 + tcp_ca_name_len == sizeof(utcp_ca->name)) 183 + return -EINVAL; 184 + if (tcp_ca_find(utcp_ca->name)) 185 + return -EEXIST; 186 + memcpy(tcp_ca->name, utcp_ca->name, sizeof(tcp_ca->name)); 187 + return 1; 188 + } 189 + 190 + if (!btf_type_resolve_func_ptr(btf_vmlinux, member->type, NULL)) 191 + return 0; 192 + 193 + /* Ensure bpf_prog is provided for compulsory func ptr */ 194 + prog_fd = (int)(*(unsigned long *)(udata + moff)); 195 + if (!prog_fd && !is_optional(moff) && !is_unsupported(moff)) 196 + return -EINVAL; 197 + 198 + 
return 0; 199 + } 200 + 201 + static int bpf_tcp_ca_check_member(const struct btf_type *t, 202 + const struct btf_member *member) 203 + { 204 + if (is_unsupported(btf_member_bit_offset(t, member) / 8)) 205 + return -ENOTSUPP; 206 + return 0; 207 + } 208 + 209 + static int bpf_tcp_ca_reg(void *kdata) 210 + { 211 + return tcp_register_congestion_control(kdata); 212 + } 213 + 214 + static void bpf_tcp_ca_unreg(void *kdata) 215 + { 216 + tcp_unregister_congestion_control(kdata); 217 + } 218 + 219 + /* Avoid sparse warning. It is only used in bpf_struct_ops.c. */ 220 + extern struct bpf_struct_ops bpf_tcp_congestion_ops; 221 + 222 + struct bpf_struct_ops bpf_tcp_congestion_ops = { 223 + .verifier_ops = &bpf_tcp_ca_verifier_ops, 224 + .reg = bpf_tcp_ca_reg, 225 + .unreg = bpf_tcp_ca_unreg, 226 + .check_member = bpf_tcp_ca_check_member, 227 + .init_member = bpf_tcp_ca_init_member, 228 + .init = bpf_tcp_ca_init, 229 + .name = "tcp_congestion_ops", 230 + };
+8 -8
net/ipv4/tcp_cong.c
··· 21 21 static LIST_HEAD(tcp_cong_list); 22 22 23 23 /* Simple linear search, don't expect many entries! */ 24 - static struct tcp_congestion_ops *tcp_ca_find(const char *name) 24 + struct tcp_congestion_ops *tcp_ca_find(const char *name) 25 25 { 26 26 struct tcp_congestion_ops *e; 27 27 ··· 162 162 163 163 rcu_read_lock(); 164 164 ca = rcu_dereference(net->ipv4.tcp_congestion_control); 165 - if (unlikely(!try_module_get(ca->owner))) 165 + if (unlikely(!bpf_try_module_get(ca, ca->owner))) 166 166 ca = &tcp_reno; 167 167 icsk->icsk_ca_ops = ca; 168 168 rcu_read_unlock(); ··· 208 208 209 209 if (icsk->icsk_ca_ops->release) 210 210 icsk->icsk_ca_ops->release(sk); 211 - module_put(icsk->icsk_ca_ops->owner); 211 + bpf_module_put(icsk->icsk_ca_ops, icsk->icsk_ca_ops->owner); 212 212 } 213 213 214 214 /* Used by sysctl to change default congestion control */ ··· 222 222 ca = tcp_ca_find_autoload(net, name); 223 223 if (!ca) { 224 224 ret = -ENOENT; 225 - } else if (!try_module_get(ca->owner)) { 225 + } else if (!bpf_try_module_get(ca, ca->owner)) { 226 226 ret = -EBUSY; 227 227 } else { 228 228 prev = xchg(&net->ipv4.tcp_congestion_control, ca); 229 229 if (prev) 230 - module_put(prev->owner); 230 + bpf_module_put(prev, prev->owner); 231 231 232 232 ca->flags |= TCP_CONG_NON_RESTRICTED; 233 233 ret = 0; ··· 366 366 } else if (!load) { 367 367 const struct tcp_congestion_ops *old_ca = icsk->icsk_ca_ops; 368 368 369 - if (try_module_get(ca->owner)) { 369 + if (bpf_try_module_get(ca, ca->owner)) { 370 370 if (reinit) { 371 371 tcp_reinit_congestion_control(sk, ca); 372 372 } else { 373 373 icsk->icsk_ca_ops = ca; 374 - module_put(old_ca->owner); 374 + bpf_module_put(old_ca, old_ca->owner); 375 375 } 376 376 } else { 377 377 err = -EBUSY; 378 378 } 379 379 } else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) || cap_net_admin)) { 380 380 err = -EPERM; 381 - } else if (!try_module_get(ca->owner)) { 381 + } else if (!bpf_try_module_get(ca, ca->owner)) { 382 382 err = -EBUSY; 383 
383 } else { 384 384 tcp_reinit_congestion_control(sk, ca);
+4 -2
net/ipv4/tcp_ipv4.c
··· 2678 2678 int cpu; 2679 2679 2680 2680 if (net->ipv4.tcp_congestion_control) 2681 - module_put(net->ipv4.tcp_congestion_control->owner); 2681 + bpf_module_put(net->ipv4.tcp_congestion_control, 2682 + net->ipv4.tcp_congestion_control->owner); 2682 2683 2683 2684 for_each_possible_cpu(cpu) 2684 2685 inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu)); ··· 2786 2785 2787 2786 /* Reno is always built in */ 2788 2787 if (!net_eq(net, &init_net) && 2789 - try_module_get(init_net.ipv4.tcp_congestion_control->owner)) 2788 + bpf_try_module_get(init_net.ipv4.tcp_congestion_control, 2789 + init_net.ipv4.tcp_congestion_control->owner)) 2790 2790 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control; 2791 2791 else 2792 2792 net->ipv4.tcp_congestion_control = &tcp_reno;
+2 -2
net/ipv4/tcp_minisocks.c
··· 414 414 415 415 rcu_read_lock(); 416 416 ca = tcp_ca_find_key(ca_key); 417 - if (likely(ca && try_module_get(ca->owner))) { 417 + if (likely(ca && bpf_try_module_get(ca, ca->owner))) { 418 418 icsk->icsk_ca_dst_locked = tcp_ca_dst_locked(dst); 419 419 icsk->icsk_ca_ops = ca; 420 420 ca_got_dst = true; ··· 425 425 /* If no valid choice made yet, assign current system default ca. */ 426 426 if (!ca_got_dst && 427 427 (!icsk->icsk_ca_setsockopt || 428 - !try_module_get(icsk->icsk_ca_ops->owner))) 428 + !bpf_try_module_get(icsk->icsk_ca_ops, icsk->icsk_ca_ops->owner))) 429 429 tcp_assign_congestion_control(sk); 430 430 431 431 tcp_set_ca_state(sk, TCP_CA_Open);
+2 -2
net/ipv4/tcp_output.c
··· 3368 3368 3369 3369 rcu_read_lock(); 3370 3370 ca = tcp_ca_find_key(ca_key); 3371 - if (likely(ca && try_module_get(ca->owner))) { 3372 - module_put(icsk->icsk_ca_ops->owner); 3371 + if (likely(ca && bpf_try_module_get(ca, ca->owner))) { 3372 + bpf_module_put(icsk->icsk_ca_ops, icsk->icsk_ca_ops->owner); 3373 3373 icsk->icsk_ca_dst_locked = tcp_ca_dst_locked(dst); 3374 3374 icsk->icsk_ca_ops = ca; 3375 3375 }