Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

bpf: Introduce pinnable bpf_link abstraction

Introduce bpf_link abstraction, representing an attachment of BPF program to
a BPF hook point (e.g., tracepoint, perf event, etc). bpf_link encapsulates
ownership of the attached BPF program and reference counting of the link itself, when
referenced from multiple anonymous inodes, and ensures that the release
callback will be called from a process context, so that users can safely take
mutex locks and sleep.

Additionally, with this new abstraction it's now possible to generalize pinning
of a link object in BPF FS, allowing users to explicitly prevent BPF program
detachment on process exit by pinning the link in BPF FS, and letting it be
opened from an independent other process to keep working with it.

Convert two existing bpf_link-like objects (raw tracepoint and tracing BPF
program attachments) into utilizing bpf_link framework, making them pinnable
in BPF FS. More FD-based bpf_links will be added in follow up patches.

Signed-off-by: Andrii Nakryiko <andriin@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20200303043159.323675-2-andriin@fb.com

authored by

Andrii Nakryiko and committed by
Alexei Starovoitov
70ed506c 775a2be5

+237 -51
+13
include/linux/bpf.h
··· 1056 1056 int bpf_map_new_fd(struct bpf_map *map, int flags); 1057 1057 int bpf_prog_new_fd(struct bpf_prog *prog); 1058 1058 1059 + struct bpf_link; 1060 + 1061 + struct bpf_link_ops { 1062 + void (*release)(struct bpf_link *link); 1063 + }; 1064 + 1065 + void bpf_link_init(struct bpf_link *link, const struct bpf_link_ops *ops, 1066 + struct bpf_prog *prog); 1067 + void bpf_link_inc(struct bpf_link *link); 1068 + void bpf_link_put(struct bpf_link *link); 1069 + int bpf_link_new_fd(struct bpf_link *link); 1070 + struct bpf_link *bpf_link_get_from_fd(u32 ufd); 1071 + 1059 1072 int bpf_obj_pin_user(u32 ufd, const char __user *pathname); 1060 1073 int bpf_obj_get_user(const char __user *pathname, int flags); 1061 1074
+39 -5
kernel/bpf/inode.c
··· 25 25 BPF_TYPE_UNSPEC = 0, 26 26 BPF_TYPE_PROG, 27 27 BPF_TYPE_MAP, 28 + BPF_TYPE_LINK, 28 29 }; 29 30 30 31 static void *bpf_any_get(void *raw, enum bpf_type type) ··· 36 35 break; 37 36 case BPF_TYPE_MAP: 38 37 bpf_map_inc_with_uref(raw); 38 + break; 39 + case BPF_TYPE_LINK: 40 + bpf_link_inc(raw); 39 41 break; 40 42 default: 41 43 WARN_ON_ONCE(1); ··· 57 53 case BPF_TYPE_MAP: 58 54 bpf_map_put_with_uref(raw); 59 55 break; 56 + case BPF_TYPE_LINK: 57 + bpf_link_put(raw); 58 + break; 60 59 default: 61 60 WARN_ON_ONCE(1); 62 61 break; ··· 70 63 { 71 64 void *raw; 72 65 73 - *type = BPF_TYPE_MAP; 74 66 raw = bpf_map_get_with_uref(ufd); 75 - if (IS_ERR(raw)) { 76 - *type = BPF_TYPE_PROG; 77 - raw = bpf_prog_get(ufd); 67 + if (!IS_ERR(raw)) { 68 + *type = BPF_TYPE_MAP; 69 + return raw; 78 70 } 79 71 80 - return raw; 72 + raw = bpf_prog_get(ufd); 73 + if (!IS_ERR(raw)) { 74 + *type = BPF_TYPE_PROG; 75 + return raw; 76 + } 77 + 78 + raw = bpf_link_get_from_fd(ufd); 79 + if (!IS_ERR(raw)) { 80 + *type = BPF_TYPE_LINK; 81 + return raw; 82 + } 83 + 84 + return ERR_PTR(-EINVAL); 81 85 } 82 86 83 87 static const struct inode_operations bpf_dir_iops; 84 88 85 89 static const struct inode_operations bpf_prog_iops = { }; 86 90 static const struct inode_operations bpf_map_iops = { }; 91 + static const struct inode_operations bpf_link_iops = { }; 87 92 88 93 static struct inode *bpf_get_inode(struct super_block *sb, 89 94 const struct inode *dir, ··· 133 114 *type = BPF_TYPE_PROG; 134 115 else if (inode->i_op == &bpf_map_iops) 135 116 *type = BPF_TYPE_MAP; 117 + else if (inode->i_op == &bpf_link_iops) 118 + *type = BPF_TYPE_LINK; 136 119 else 137 120 return -EACCES; 138 121 ··· 356 335 &bpffs_map_fops : &bpffs_obj_fops); 357 336 } 358 337 338 + static int bpf_mklink(struct dentry *dentry, umode_t mode, void *arg) 339 + { 340 + return bpf_mkobj_ops(dentry, mode, arg, &bpf_link_iops, 341 + &bpffs_obj_fops); 342 + } 343 + 359 344 static struct dentry * 360 345 bpf_lookup(struct 
inode *dir, struct dentry *dentry, unsigned flags) 361 346 { ··· 437 410 break; 438 411 case BPF_TYPE_MAP: 439 412 ret = vfs_mkobj(dentry, mode, bpf_mkmap, raw); 413 + break; 414 + case BPF_TYPE_LINK: 415 + ret = vfs_mkobj(dentry, mode, bpf_mklink, raw); 440 416 break; 441 417 default: 442 418 ret = -EPERM; ··· 517 487 ret = bpf_prog_new_fd(raw); 518 488 else if (type == BPF_TYPE_MAP) 519 489 ret = bpf_map_new_fd(raw, f_flags); 490 + else if (type == BPF_TYPE_LINK) 491 + ret = bpf_link_new_fd(raw); 520 492 else 521 493 return -ENOENT; 522 494 ··· 535 503 return ERR_PTR(ret); 536 504 537 505 if (inode->i_op == &bpf_map_iops) 506 + return ERR_PTR(-EINVAL); 507 + if (inode->i_op == &bpf_link_iops) 538 508 return ERR_PTR(-EINVAL); 539 509 if (inode->i_op != &bpf_prog_iops) 540 510 return ERR_PTR(-EACCES);
+185 -46
kernel/bpf/syscall.c
··· 2173 2173 attr->file_flags); 2174 2174 } 2175 2175 2176 - static int bpf_tracing_prog_release(struct inode *inode, struct file *filp) 2177 - { 2178 - struct bpf_prog *prog = filp->private_data; 2176 + struct bpf_link { 2177 + atomic64_t refcnt; 2178 + const struct bpf_link_ops *ops; 2179 + struct bpf_prog *prog; 2180 + struct work_struct work; 2181 + }; 2179 2182 2180 - WARN_ON_ONCE(bpf_trampoline_unlink_prog(prog)); 2183 + void bpf_link_init(struct bpf_link *link, const struct bpf_link_ops *ops, 2184 + struct bpf_prog *prog) 2185 + { 2186 + atomic64_set(&link->refcnt, 1); 2187 + link->ops = ops; 2188 + link->prog = prog; 2189 + } 2190 + 2191 + void bpf_link_inc(struct bpf_link *link) 2192 + { 2193 + atomic64_inc(&link->refcnt); 2194 + } 2195 + 2196 + /* bpf_link_free is guaranteed to be called from process context */ 2197 + static void bpf_link_free(struct bpf_link *link) 2198 + { 2199 + struct bpf_prog *prog; 2200 + 2201 + /* remember prog locally, because release below will free link memory */ 2202 + prog = link->prog; 2203 + /* extra clean up and kfree of container link struct */ 2204 + link->ops->release(link); 2205 + /* no more accesing of link members after this point */ 2181 2206 bpf_prog_put(prog); 2207 + } 2208 + 2209 + static void bpf_link_put_deferred(struct work_struct *work) 2210 + { 2211 + struct bpf_link *link = container_of(work, struct bpf_link, work); 2212 + 2213 + bpf_link_free(link); 2214 + } 2215 + 2216 + /* bpf_link_put can be called from atomic context, but ensures that resources 2217 + * are freed from process context 2218 + */ 2219 + void bpf_link_put(struct bpf_link *link) 2220 + { 2221 + if (!atomic64_dec_and_test(&link->refcnt)) 2222 + return; 2223 + 2224 + if (in_atomic()) { 2225 + INIT_WORK(&link->work, bpf_link_put_deferred); 2226 + schedule_work(&link->work); 2227 + } else { 2228 + bpf_link_free(link); 2229 + } 2230 + } 2231 + 2232 + static int bpf_link_release(struct inode *inode, struct file *filp) 2233 + { 2234 + struct 
bpf_link *link = filp->private_data; 2235 + 2236 + bpf_link_put(link); 2182 2237 return 0; 2183 2238 } 2184 2239 2185 - static const struct file_operations bpf_tracing_prog_fops = { 2186 - .release = bpf_tracing_prog_release, 2240 + #ifdef CONFIG_PROC_FS 2241 + static const struct bpf_link_ops bpf_raw_tp_lops; 2242 + static const struct bpf_link_ops bpf_tracing_link_lops; 2243 + static const struct bpf_link_ops bpf_xdp_link_lops; 2244 + 2245 + static void bpf_link_show_fdinfo(struct seq_file *m, struct file *filp) 2246 + { 2247 + const struct bpf_link *link = filp->private_data; 2248 + const struct bpf_prog *prog = link->prog; 2249 + char prog_tag[sizeof(prog->tag) * 2 + 1] = { }; 2250 + const char *link_type; 2251 + 2252 + if (link->ops == &bpf_raw_tp_lops) 2253 + link_type = "raw_tracepoint"; 2254 + else if (link->ops == &bpf_tracing_link_lops) 2255 + link_type = "tracing"; 2256 + else 2257 + link_type = "unknown"; 2258 + 2259 + bin2hex(prog_tag, prog->tag, sizeof(prog->tag)); 2260 + seq_printf(m, 2261 + "link_type:\t%s\n" 2262 + "prog_tag:\t%s\n" 2263 + "prog_id:\t%u\n", 2264 + link_type, 2265 + prog_tag, 2266 + prog->aux->id); 2267 + } 2268 + #endif 2269 + 2270 + const struct file_operations bpf_link_fops = { 2271 + #ifdef CONFIG_PROC_FS 2272 + .show_fdinfo = bpf_link_show_fdinfo, 2273 + #endif 2274 + .release = bpf_link_release, 2187 2275 .read = bpf_dummy_read, 2188 2276 .write = bpf_dummy_write, 2189 2277 }; 2190 2278 2279 + int bpf_link_new_fd(struct bpf_link *link) 2280 + { 2281 + return anon_inode_getfd("bpf-link", &bpf_link_fops, link, O_CLOEXEC); 2282 + } 2283 + 2284 + struct bpf_link *bpf_link_get_from_fd(u32 ufd) 2285 + { 2286 + struct fd f = fdget(ufd); 2287 + struct bpf_link *link; 2288 + 2289 + if (!f.file) 2290 + return ERR_PTR(-EBADF); 2291 + if (f.file->f_op != &bpf_link_fops) { 2292 + fdput(f); 2293 + return ERR_PTR(-EINVAL); 2294 + } 2295 + 2296 + link = f.file->private_data; 2297 + bpf_link_inc(link); 2298 + fdput(f); 2299 + 2300 + return 
link; 2301 + } 2302 + 2303 + struct bpf_tracing_link { 2304 + struct bpf_link link; 2305 + }; 2306 + 2307 + static void bpf_tracing_link_release(struct bpf_link *link) 2308 + { 2309 + struct bpf_tracing_link *tr_link = 2310 + container_of(link, struct bpf_tracing_link, link); 2311 + 2312 + WARN_ON_ONCE(bpf_trampoline_unlink_prog(link->prog)); 2313 + kfree(tr_link); 2314 + } 2315 + 2316 + static const struct bpf_link_ops bpf_tracing_link_lops = { 2317 + .release = bpf_tracing_link_release, 2318 + }; 2319 + 2191 2320 static int bpf_tracing_prog_attach(struct bpf_prog *prog) 2192 2321 { 2193 - int tr_fd, err; 2322 + struct bpf_tracing_link *link; 2323 + int link_fd, err; 2194 2324 2195 2325 if (prog->expected_attach_type != BPF_TRACE_FENTRY && 2196 2326 prog->expected_attach_type != BPF_TRACE_FEXIT && ··· 2329 2199 goto out_put_prog; 2330 2200 } 2331 2201 2332 - err = bpf_trampoline_link_prog(prog); 2333 - if (err) 2334 - goto out_put_prog; 2335 - 2336 - tr_fd = anon_inode_getfd("bpf-tracing-prog", &bpf_tracing_prog_fops, 2337 - prog, O_CLOEXEC); 2338 - if (tr_fd < 0) { 2339 - WARN_ON_ONCE(bpf_trampoline_unlink_prog(prog)); 2340 - err = tr_fd; 2202 + link = kzalloc(sizeof(*link), GFP_USER); 2203 + if (!link) { 2204 + err = -ENOMEM; 2341 2205 goto out_put_prog; 2342 2206 } 2343 - return tr_fd; 2207 + bpf_link_init(&link->link, &bpf_tracing_link_lops, prog); 2344 2208 2209 + err = bpf_trampoline_link_prog(prog); 2210 + if (err) 2211 + goto out_free_link; 2212 + 2213 + link_fd = bpf_link_new_fd(&link->link); 2214 + if (link_fd < 0) { 2215 + WARN_ON_ONCE(bpf_trampoline_unlink_prog(prog)); 2216 + err = link_fd; 2217 + goto out_free_link; 2218 + } 2219 + return link_fd; 2220 + 2221 + out_free_link: 2222 + kfree(link); 2345 2223 out_put_prog: 2346 2224 bpf_prog_put(prog); 2347 2225 return err; 2348 2226 } 2349 2227 2350 - struct bpf_raw_tracepoint { 2228 + struct bpf_raw_tp_link { 2229 + struct bpf_link link; 2351 2230 struct bpf_raw_event_map *btp; 2352 - struct bpf_prog 
*prog; 2353 2231 }; 2354 2232 2355 - static int bpf_raw_tracepoint_release(struct inode *inode, struct file *filp) 2233 + static void bpf_raw_tp_link_release(struct bpf_link *link) 2356 2234 { 2357 - struct bpf_raw_tracepoint *raw_tp = filp->private_data; 2235 + struct bpf_raw_tp_link *raw_tp = 2236 + container_of(link, struct bpf_raw_tp_link, link); 2358 2237 2359 - if (raw_tp->prog) { 2360 - bpf_probe_unregister(raw_tp->btp, raw_tp->prog); 2361 - bpf_prog_put(raw_tp->prog); 2362 - } 2238 + bpf_probe_unregister(raw_tp->btp, raw_tp->link.prog); 2363 2239 bpf_put_raw_tracepoint(raw_tp->btp); 2364 2240 kfree(raw_tp); 2365 - return 0; 2366 2241 } 2367 2242 2368 - static const struct file_operations bpf_raw_tp_fops = { 2369 - .release = bpf_raw_tracepoint_release, 2370 - .read = bpf_dummy_read, 2371 - .write = bpf_dummy_write, 2243 + static const struct bpf_link_ops bpf_raw_tp_lops = { 2244 + .release = bpf_raw_tp_link_release, 2372 2245 }; 2373 2246 2374 2247 #define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.prog_fd 2375 2248 2376 2249 static int bpf_raw_tracepoint_open(const union bpf_attr *attr) 2377 2250 { 2378 - struct bpf_raw_tracepoint *raw_tp; 2251 + struct bpf_raw_tp_link *raw_tp; 2379 2252 struct bpf_raw_event_map *btp; 2380 2253 struct bpf_prog *prog; 2381 2254 const char *tp_name; 2382 2255 char buf[128]; 2383 - int tp_fd, err; 2256 + int link_fd, err; 2384 2257 2385 2258 if (CHECK_ATTR(BPF_RAW_TRACEPOINT_OPEN)) 2386 2259 return -EINVAL; ··· 2435 2302 err = -ENOMEM; 2436 2303 goto out_put_btp; 2437 2304 } 2305 + bpf_link_init(&raw_tp->link, &bpf_raw_tp_lops, prog); 2438 2306 raw_tp->btp = btp; 2439 - raw_tp->prog = prog; 2440 2307 2441 2308 err = bpf_probe_register(raw_tp->btp, prog); 2442 2309 if (err) 2443 2310 goto out_free_tp; 2444 2311 2445 - tp_fd = anon_inode_getfd("bpf-raw-tracepoint", &bpf_raw_tp_fops, raw_tp, 2446 - O_CLOEXEC); 2447 - if (tp_fd < 0) { 2312 + link_fd = bpf_link_new_fd(&raw_tp->link); 2313 + if (link_fd < 0) { 2448 2314 
bpf_probe_unregister(raw_tp->btp, prog); 2449 - err = tp_fd; 2315 + err = link_fd; 2450 2316 goto out_free_tp; 2451 2317 } 2452 - return tp_fd; 2318 + return link_fd; 2453 2319 2454 2320 out_free_tp: 2455 2321 kfree(raw_tp); ··· 3398 3266 if (err) 3399 3267 goto out; 3400 3268 3401 - if (file->f_op == &bpf_raw_tp_fops) { 3402 - struct bpf_raw_tracepoint *raw_tp = file->private_data; 3403 - struct bpf_raw_event_map *btp = raw_tp->btp; 3269 + if (file->f_op == &bpf_link_fops) { 3270 + struct bpf_link *link = file->private_data; 3404 3271 3405 - err = bpf_task_fd_query_copy(attr, uattr, 3406 - raw_tp->prog->aux->id, 3407 - BPF_FD_TYPE_RAW_TRACEPOINT, 3408 - btp->tp->name, 0, 0); 3409 - goto put_file; 3272 + if (link->ops == &bpf_raw_tp_lops) { 3273 + struct bpf_raw_tp_link *raw_tp = 3274 + container_of(link, struct bpf_raw_tp_link, link); 3275 + struct bpf_raw_event_map *btp = raw_tp->btp; 3276 + 3277 + err = bpf_task_fd_query_copy(attr, uattr, 3278 + raw_tp->link.prog->aux->id, 3279 + BPF_FD_TYPE_RAW_TRACEPOINT, 3280 + btp->tp->name, 0, 0); 3281 + goto put_file; 3282 + } 3283 + goto out_not_supp; 3410 3284 } 3411 3285 3412 3286 event = perf_get_event(file); ··· 3432 3294 goto put_file; 3433 3295 } 3434 3296 3297 + out_not_supp: 3435 3298 err = -ENOTSUPP; 3436 3299 put_file: 3437 3300 fput(file);