Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

bpf, netns: Keep attached programs in bpf_prog_array

Prepare for having multi-prog attachments for new netns attach types by
storing programs to run in a bpf_prog_array, which is well suited for
iterating over programs and running them in sequence.

After this change bpf(PROG_QUERY) may block to allocate memory in
bpf_prog_array_copy_to_user() for collected program IDs. This forces a
change in how we protect access to the attached program in the query
callback. Because bpf_prog_array_copy_to_user() can sleep, we switch from
an RCU read lock to holding a mutex that serializes updaters.

Because we allow only one BPF flow_dissector program to be attached to
netns at all times, the bpf_prog_array pointed by net->bpf.run_array is
always either detached (null) or one element long.

No functional changes intended.

Signed-off-by: Jakub Sitnicki <jakub@cloudflare.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Andrii Nakryiko <andriin@fb.com>
Link: https://lore.kernel.org/bpf/20200625141357.910330-3-jakub@cloudflare.com

Authored by Jakub Sitnicki, committed by Alexei Starovoitov
695c1214 3b701699

+96 -48
+4 -1
include/net/netns/bpf.h
··· 9 9 #include <linux/bpf-netns.h> 10 10 11 11 struct bpf_prog; 12 + struct bpf_prog_array; 12 13 13 14 struct netns_bpf { 14 - struct bpf_prog __rcu *progs[MAX_NETNS_BPF_ATTACH_TYPE]; 15 + /* Array of programs to run compiled from progs or links */ 16 + struct bpf_prog_array __rcu *run_array[MAX_NETNS_BPF_ATTACH_TYPE]; 17 + struct bpf_prog *progs[MAX_NETNS_BPF_ATTACH_TYPE]; 15 18 struct bpf_link *links[MAX_NETNS_BPF_ATTACH_TYPE]; 16 19 }; 17 20
+82 -38
kernel/bpf/net_namespace.c
··· 33 33 net_link->net = NULL; 34 34 } 35 35 36 + /* Must be called with netns_bpf_mutex held. */ 37 + static void netns_bpf_run_array_detach(struct net *net, 38 + enum netns_bpf_attach_type type) 39 + { 40 + struct bpf_prog_array *run_array; 41 + 42 + run_array = rcu_replace_pointer(net->bpf.run_array[type], NULL, 43 + lockdep_is_held(&netns_bpf_mutex)); 44 + bpf_prog_array_free(run_array); 45 + } 46 + 36 47 static void bpf_netns_link_release(struct bpf_link *link) 37 48 { 38 49 struct bpf_netns_link *net_link = ··· 65 54 if (!net) 66 55 goto out_unlock; 67 56 57 + netns_bpf_run_array_detach(net, type); 68 58 net->bpf.links[type] = NULL; 69 - RCU_INIT_POINTER(net->bpf.progs[type], NULL); 70 59 71 60 out_unlock: 72 61 mutex_unlock(&netns_bpf_mutex); ··· 87 76 struct bpf_netns_link *net_link = 88 77 container_of(link, struct bpf_netns_link, link); 89 78 enum netns_bpf_attach_type type = net_link->netns_type; 79 + struct bpf_prog_array *run_array; 90 80 struct net *net; 91 81 int ret = 0; 92 82 ··· 105 93 goto out_unlock; 106 94 } 107 95 96 + run_array = rcu_dereference_protected(net->bpf.run_array[type], 97 + lockdep_is_held(&netns_bpf_mutex)); 98 + WRITE_ONCE(run_array->items[0].prog, new_prog); 99 + 108 100 old_prog = xchg(&link->prog, new_prog); 109 - rcu_assign_pointer(net->bpf.progs[type], new_prog); 110 101 bpf_prog_put(old_prog); 111 102 112 103 out_unlock: ··· 157 142 .show_fdinfo = bpf_netns_link_show_fdinfo, 158 143 }; 159 144 145 + /* Must be called with netns_bpf_mutex held. */
146 + static int __netns_bpf_prog_query(const union bpf_attr *attr, 147 + union bpf_attr __user *uattr, 148 + struct net *net, 149 + enum netns_bpf_attach_type type) 150 + { 151 + __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids); 152 + struct bpf_prog_array *run_array; 153 + u32 prog_cnt = 0, flags = 0; 154 + 155 + run_array = rcu_dereference_protected(net->bpf.run_array[type], 156 + lockdep_is_held(&netns_bpf_mutex)); 157 + if (run_array) 158 + prog_cnt = bpf_prog_array_length(run_array); 159 + 160 + if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags))) 161 + return -EFAULT; 162 + if (copy_to_user(&uattr->query.prog_cnt, &prog_cnt, sizeof(prog_cnt))) 163 + return -EFAULT; 164 + if (!attr->query.prog_cnt || !prog_ids || !prog_cnt) 165 + return 0; 166 + 167 + return bpf_prog_array_copy_to_user(run_array, prog_ids, 168 + attr->query.prog_cnt); 169 + } 170 + 160 171 int netns_bpf_prog_query(const union bpf_attr *attr, 161 172 union bpf_attr __user *uattr) 162 173 { 163 - __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids); 164 - u32 prog_id, prog_cnt = 0, flags = 0; 165 174 enum netns_bpf_attach_type type; 166 - struct bpf_prog *attached; 167 175 struct net *net; 176 + int ret; 168 177 169 178 if (attr->query.query_flags) 170 179 return -EINVAL; ··· 201 162 if (IS_ERR(net)) 202 163 return PTR_ERR(net); 203 164 204 - rcu_read_lock(); 205 - attached = rcu_dereference(net->bpf.progs[type]); 206 - if (attached) { 207 - prog_cnt = 1; 208 - prog_id = attached->aux->id; 209 - } 210 - rcu_read_unlock(); 165 + mutex_lock(&netns_bpf_mutex); 166 + ret = __netns_bpf_prog_query(attr, uattr, net, type); 167 + mutex_unlock(&netns_bpf_mutex); 211 168 212 169 put_net(net); 213 - 214 - if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags))) 215 - return -EFAULT; 216 - if (copy_to_user(&uattr->query.prog_cnt, &prog_cnt, sizeof(prog_cnt))) 217 - return -EFAULT; 218 - 219 - if (!attr->query.prog_cnt || !prog_ids || !prog_cnt) 220 - return 0;
221 - 222 - if (copy_to_user(prog_ids, &prog_id, sizeof(u32))) 223 - return -EFAULT; 224 - 225 - return 0; 170 + return ret; 226 171 } 227 172 228 173 int netns_bpf_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog) 229 174 { 175 + struct bpf_prog_array *run_array; 230 176 enum netns_bpf_attach_type type; 231 177 struct bpf_prog *attached; 232 178 struct net *net; ··· 241 217 if (ret) 242 218 goto out_unlock; 243 219 244 - attached = rcu_dereference_protected(net->bpf.progs[type], 245 - lockdep_is_held(&netns_bpf_mutex)); 220 + attached = net->bpf.progs[type]; 246 221 if (attached == prog) { 247 222 /* The same program cannot be attached twice */ 248 223 ret = -EINVAL; 249 224 goto out_unlock; 250 225 } 251 - rcu_assign_pointer(net->bpf.progs[type], prog); 226 + 227 + run_array = rcu_dereference_protected(net->bpf.run_array[type], 228 + lockdep_is_held(&netns_bpf_mutex)); 229 + if (run_array) { 230 + WRITE_ONCE(run_array->items[0].prog, prog); 231 + } else { 232 + run_array = bpf_prog_array_alloc(1, GFP_KERNEL); 233 + if (!run_array) { 234 + ret = -ENOMEM; 235 + goto out_unlock; 236 + } 237 + run_array->items[0].prog = prog; 238 + rcu_assign_pointer(net->bpf.run_array[type], run_array); 239 + } 240 + 241 + net->bpf.progs[type] = prog; 252 242 if (attached) 253 243 bpf_prog_put(attached); 254 244 ··· 282 244 if (net->bpf.links[type]) 283 245 return -EINVAL; 284 246 285 - attached = rcu_dereference_protected(net->bpf.progs[type], 286 - lockdep_is_held(&netns_bpf_mutex)); 247 + attached = net->bpf.progs[type]; 287 248 if (!attached) 288 249 return -ENOENT; 289 - RCU_INIT_POINTER(net->bpf.progs[type], NULL); 250 + netns_bpf_run_array_detach(net, type); 251 + net->bpf.progs[type] = NULL; 290 252 bpf_prog_put(attached); 291 253 return 0; 292 254 } ··· 310 272 static int netns_bpf_link_attach(struct net *net, struct bpf_link *link, 311 273 enum netns_bpf_attach_type type) 312 274 { 313 - struct bpf_prog *prog; 275 + struct bpf_prog_array *run_array; 314 276 int err;
315 277 316 278 mutex_lock(&netns_bpf_mutex); ··· 321 283 goto out_unlock; 322 284 } 323 285 /* Links are not compatible with attaching prog directly */ 324 - prog = rcu_dereference_protected(net->bpf.progs[type], 325 - lockdep_is_held(&netns_bpf_mutex)); 326 - if (prog) { 286 + if (net->bpf.progs[type]) { 327 287 err = -EEXIST; 328 288 goto out_unlock; 329 289 } ··· 337 301 if (err) 338 302 goto out_unlock; 339 303 340 - rcu_assign_pointer(net->bpf.progs[type], link->prog); 304 + run_array = bpf_prog_array_alloc(1, GFP_KERNEL); 305 + if (!run_array) { 306 + err = -ENOMEM; 307 + goto out_unlock; 308 + } 309 + run_array->items[0].prog = link->prog; 310 + rcu_assign_pointer(net->bpf.run_array[type], run_array); 311 + 341 312 net->bpf.links[type] = link; 342 313 343 314 out_unlock: ··· 411 368 412 369 mutex_lock(&netns_bpf_mutex); 413 370 for (type = 0; type < MAX_NETNS_BPF_ATTACH_TYPE; type++) { 371 + netns_bpf_run_array_detach(net, type); 414 372 link = net->bpf.links[type]; 415 373 if (link) 416 374 bpf_netns_link_auto_detach(link); 417 - else 418 - __netns_bpf_prog_detach(net, type); 375 + else if (net->bpf.progs[type]) 376 + bpf_prog_put(net->bpf.progs[type]); 419 377 } 420 378 mutex_unlock(&netns_bpf_mutex); 421 379 }
+10 -9
net/core/flow_dissector.c
··· 86 86 for_each_net(ns) { 87 87 if (ns == &init_net) 88 88 continue; 89 - if (rcu_access_pointer(ns->bpf.progs[type])) 89 + if (rcu_access_pointer(ns->bpf.run_array[type])) 90 90 return -EEXIST; 91 91 } 92 92 } else { 93 93 /* Make sure root flow dissector is not attached 94 94 * when attaching to the non-root namespace. 95 95 */ 96 - if (rcu_access_pointer(init_net.bpf.progs[type])) 96 + if (rcu_access_pointer(init_net.bpf.run_array[type])) 97 97 return -EEXIST; 98 98 } 99 99 ··· 894 894 struct flow_dissector_key_addrs *key_addrs; 895 895 struct flow_dissector_key_tags *key_tags; 896 896 struct flow_dissector_key_vlan *key_vlan; 897 - struct bpf_prog *attached = NULL; 898 897 enum flow_dissect_ret fdret; 899 898 enum flow_dissector_key_id dissector_vlan = FLOW_DISSECTOR_KEY_MAX; 900 899 bool mpls_el = false; ··· 950 951 WARN_ON_ONCE(!net); 951 952 if (net) { 952 953 enum netns_bpf_attach_type type = NETNS_BPF_FLOW_DISSECTOR; 954 + struct bpf_prog_array *run_array; 953 955 954 956 rcu_read_lock(); 955 - attached = rcu_dereference(init_net.bpf.progs[type]); 957 + run_array = rcu_dereference(init_net.bpf.run_array[type]); 958 + if (!run_array) 959 + run_array = rcu_dereference(net->bpf.run_array[type]); 956 960 957 - if (!attached) 958 - attached = rcu_dereference(net->bpf.progs[type]); 959 - 960 - if (attached) { 961 + if (run_array) { 961 962 struct bpf_flow_keys flow_keys; 962 963 struct bpf_flow_dissector ctx = { 963 964 .flow_keys = &flow_keys, ··· 965 966 .data_end = data + hlen, 966 967 }; 967 968 __be16 n_proto = proto; 969 + struct bpf_prog *prog; 968 970 969 971 if (skb) { 970 972 ctx.skb = skb; ··· 976 976 n_proto = skb->protocol; 977 977 } 978 978 979 - ret = bpf_flow_dissect(attached, &ctx, n_proto, nhoff, 979 + prog = READ_ONCE(run_array->items[0].prog); 980 + ret = bpf_flow_dissect(prog, &ctx, n_proto, nhoff, 980 981 hlen, flags); 981 982 __skb_flow_bpf_to_target(&flow_keys, flow_dissector, 982 983 target_container);