Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

bpf: tcp: Allow bpf-tcp-cc to call bpf_(get|set)sockopt

This patch allows the bpf-tcp-cc to call bpf_setsockopt. One use
case is to allow a bpf-tcp-cc to switch to another cc during init().
For example, when the tcp flow is not ecn ready, the bpf_dctcp
can switch to another cc by calling setsockopt(TCP_CONGESTION).

During setsockopt(TCP_CONGESTION), the new tcp-cc's init() will be
called and this could cause a recursion but it is stopped by the
current trampoline's logic (in the prog->active counter).

While retiring a bpf-tcp-cc (e.g. in tcp_v[46]_destroy_sock()),
the tcp stack calls bpf-tcp-cc's release(). To avoid the retiring
bpf-tcp-cc making further changes to the sk, bpf_setsockopt is not
available to the bpf-tcp-cc's release(). This will avoid release()
making setsockopt() call that will potentially allocate new resources.

Although the bpf-tcp-cc already has a more powerful way to read tcp_sock
from the PTR_TO_BTF_ID, it is usually expected that bpf_getsockopt and
bpf_setsockopt are available together. Thus, bpf_getsockopt() is also
added to all tcp_congestion_ops except release().

When the old bpf-tcp-cc is calling setsockopt(TCP_CONGESTION)
to switch to a new cc, the old bpf-tcp-cc will be released by
bpf_struct_ops_put(). Thus, this patch also puts the bpf_struct_ops_map
after a rcu grace period because the trampoline's image cannot be freed
while the old bpf-tcp-cc is still running.

bpf-tcp-cc can only access icsk_ca_priv as SCALAR. All of the kernel's
tcp-cc implementations also access icsk_ca_priv as SCALAR. The size
of icsk_ca_priv has already been raised a few times to avoid
extra kmalloc and memory referencing. The only exception is the
kernel's tcp_cdg.c that stores a kmalloc()-ed pointer in icsk_ca_priv.
To avoid the old bpf-tcp-cc accidentally overriding this tcp_cdg's pointer
value stored in icsk_ca_priv after switching and without over-complicating
the bpf's verifier for this one exception in tcp_cdg, this patch does not
allow switching to tcp_cdg. If there is a need, bpf_tcp_cdg can be
implemented and then use the bpf_sk_storage as the extended storage.

The bpf_sk_setsockopt proto was only recently added and is used
in bpf-sockopt and bpf-iter-tcp, so the tcp_cdg limitation is imposed in
the same proto instead of adding a new proto specifically for bpf-tcp-cc.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20210824173007.3976921-1-kafai@fb.com

authored by

Martin KaFai Lau and committed by
Alexei Starovoitov
eb18b49e 7d789bd0

+65 -4
+21 -1
kernel/bpf/bpf_struct_ops.c
··· 28 28 29 29 struct bpf_struct_ops_map { 30 30 struct bpf_map map; 31 + struct rcu_head rcu; 31 32 const struct bpf_struct_ops *st_ops; 32 33 /* protect map_update */ 33 34 struct mutex lock; ··· 623 622 return refcount_inc_not_zero(&kvalue->refcnt); 624 623 } 625 624 625 + static void bpf_struct_ops_put_rcu(struct rcu_head *head) 626 + { 627 + struct bpf_struct_ops_map *st_map; 628 + 629 + st_map = container_of(head, struct bpf_struct_ops_map, rcu); 630 + bpf_map_put(&st_map->map); 631 + } 632 + 626 633 void bpf_struct_ops_put(const void *kdata) 627 634 { 628 635 struct bpf_struct_ops_value *kvalue; ··· 641 632 642 633 st_map = container_of(kvalue, struct bpf_struct_ops_map, 643 634 kvalue); 644 - bpf_map_put(&st_map->map); 635 + /* The struct_ops's function may switch to another struct_ops. 636 + * 637 + * For example, bpf_tcp_cc_x->init() may switch to 638 + * another tcp_cc_y by calling 639 + * setsockopt(TCP_CONGESTION, "tcp_cc_y"). 640 + * During the switch, bpf_struct_ops_put(tcp_cc_x) is called 641 + * and its map->refcnt may reach 0 which then free its 642 + * trampoline image while tcp_cc_x is still running. 643 + * 644 + * Thus, a rcu grace period is needed here. 645 + */ 646 + call_rcu(&st_map->rcu, bpf_struct_ops_put_rcu); 645 647 } 646 648 }
+6
net/core/filter.c
··· 5051 5051 BPF_CALL_5(bpf_sk_setsockopt, struct sock *, sk, int, level, 5052 5052 int, optname, char *, optval, int, optlen) 5053 5053 { 5054 + if (level == SOL_TCP && optname == TCP_CONGESTION) { 5055 + if (optlen >= sizeof("cdg") - 1 && 5056 + !strncmp("cdg", optval, optlen)) 5057 + return -ENOTSUPP; 5058 + } 5059 + 5054 5060 return _bpf_setsockopt(sk, level, optname, optval, optlen); 5055 5061 } 5056 5062
+38 -3
net/ipv4/bpf_tcp_ca.c
··· 10 10 #include <net/tcp.h> 11 11 #include <net/bpf_sk_storage.h> 12 12 13 + /* "extern" is to avoid sparse warning. It is only used in bpf_struct_ops.c. */ 14 + extern struct bpf_struct_ops bpf_tcp_congestion_ops; 15 + 13 16 static u32 optional_ops[] = { 14 17 offsetof(struct tcp_congestion_ops, init), 15 18 offsetof(struct tcp_congestion_ops, release), ··· 166 163 .arg2_type = ARG_ANYTHING, 167 164 }; 168 165 166 + static u32 prog_ops_moff(const struct bpf_prog *prog) 167 + { 168 + const struct btf_member *m; 169 + const struct btf_type *t; 170 + u32 midx; 171 + 172 + midx = prog->expected_attach_type; 173 + t = bpf_tcp_congestion_ops.type; 174 + m = &btf_type_member(t)[midx]; 175 + 176 + return btf_member_bit_offset(t, m) / 8; 177 + } 178 + 169 179 static const struct bpf_func_proto * 170 180 bpf_tcp_ca_get_func_proto(enum bpf_func_id func_id, 171 181 const struct bpf_prog *prog) ··· 190 174 return &bpf_sk_storage_get_proto; 191 175 case BPF_FUNC_sk_storage_delete: 192 176 return &bpf_sk_storage_delete_proto; 177 + case BPF_FUNC_setsockopt: 178 + /* Does not allow release() to call setsockopt. 179 + * release() is called when the current bpf-tcp-cc 180 + * is retiring. It is not allowed to call 181 + * setsockopt() to make further changes which 182 + * may potentially allocate new resources. 183 + */ 184 + if (prog_ops_moff(prog) != 185 + offsetof(struct tcp_congestion_ops, release)) 186 + return &bpf_sk_setsockopt_proto; 187 + return NULL; 188 + case BPF_FUNC_getsockopt: 189 + /* Since get/setsockopt is usually expected to 190 + * be available together, disable getsockopt for 191 + * release also to avoid usage surprise. 192 + * The bpf-tcp-cc already has a more powerful way 193 + * to read tcp_sock from the PTR_TO_BTF_ID. 
194 + */ 195 + if (prog_ops_moff(prog) != 196 + offsetof(struct tcp_congestion_ops, release)) 197 + return &bpf_sk_getsockopt_proto; 198 + return NULL; 193 199 default: 194 200 return bpf_base_func_proto(func_id); 195 201 } ··· 323 285 { 324 286 tcp_unregister_congestion_control(kdata); 325 287 } 326 - 327 - /* Avoid sparse warning. It is only used in bpf_struct_ops.c. */ 328 - extern struct bpf_struct_ops bpf_tcp_congestion_ops; 329 288 330 289 struct bpf_struct_ops bpf_tcp_congestion_ops = { 331 290 .verifier_ops = &bpf_tcp_ca_verifier_ops,