Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

bpf: Add kernel module with user mode driver that populates bpffs.

Add a kernel module with a user mode driver that populates bpffs with
BPF iterators.

$ mount bpffs /my/bpffs/ -t bpf
$ ls -la /my/bpffs/
total 4
drwxrwxrwt 2 root root 0 Jul 2 00:27 .
drwxr-xr-x 19 root root 4096 Jul 2 00:09 ..
-rw------- 1 root root 0 Jul 2 00:27 maps.debug
-rw------- 1 root root 0 Jul 2 00:27 progs.debug

The user mode driver will load BPF Type Format (BTF) data, create BPF maps,
populate BPF maps, load two BPF programs, attach them to BPF iterators, and
finally send two bpf_link IDs back to the kernel.
The kernel will pin the two bpf_links into the newly mounted bpffs instance under
the names "progs.debug" and "maps.debug". These two files become human readable.

$ cat /my/bpffs/progs.debug
id name attached
11 dump_bpf_map bpf_iter_bpf_map
12 dump_bpf_prog bpf_iter_bpf_prog
27 test_pkt_access
32 test_main test_pkt_access test_pkt_access
33 test_subprog1 test_pkt_access_subprog1 test_pkt_access
34 test_subprog2 test_pkt_access_subprog2 test_pkt_access
35 test_subprog3 test_pkt_access_subprog3 test_pkt_access
36 new_get_skb_len get_skb_len test_pkt_access
37 new_get_skb_ifindex get_skb_ifindex test_pkt_access
38 new_get_constant get_constant test_pkt_access

The BPF program dump_bpf_prog() in iterators.bpf.c prints this data about
all BPF programs currently loaded in the system. This information is unstable
and will change from kernel to kernel, as the ".debug" suffix conveys.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20200819042759.51280-4-alexei.starovoitov@gmail.com

authored by

Alexei Starovoitov and committed by
Daniel Borkmann
d71fa5c9 f0fdfefb

+390 -6
+2
init/Kconfig
··· 1710 1710 def_bool ARCH_WANT_DEFAULT_BPF_JIT || BPF_JIT_ALWAYS_ON 1711 1711 depends on HAVE_EBPF_JIT && BPF_JIT 1712 1712 1713 + source "kernel/bpf/preload/Kconfig" 1714 + 1713 1715 config USERFAULTFD 1714 1716 bool "Enable userfaultfd() system call" 1715 1717 depends on MMU
+1 -1
kernel/Makefile
··· 12 12 notifier.o ksysfs.o cred.o reboot.o \ 13 13 async.o range.o smpboot.o ucount.o regset.o 14 14 15 - obj-$(CONFIG_BPFILTER) += usermode_driver.o 15 + obj-$(CONFIG_USERMODE_DRIVER) += usermode_driver.o 16 16 obj-$(CONFIG_MODULES) += kmod.o 17 17 obj-$(CONFIG_MULTIUSER) += groups.o 18 18
+1
kernel/bpf/Makefile
··· 29 29 obj-$(CONFIG_BPF_SYSCALL) += bpf_struct_ops.o 30 30 obj-${CONFIG_BPF_LSM} += bpf_lsm.o 31 31 endif 32 + obj-$(CONFIG_BPF_PRELOAD) += preload/
+113 -3
kernel/bpf/inode.c
··· 20 20 #include <linux/filter.h> 21 21 #include <linux/bpf.h> 22 22 #include <linux/bpf_trace.h> 23 + #include "preload/bpf_preload.h" 23 24 24 25 enum bpf_type { 25 26 BPF_TYPE_UNSPEC = 0, ··· 370 369 bpf_lookup(struct inode *dir, struct dentry *dentry, unsigned flags) 371 370 { 372 371 /* Dots in names (e.g. "/sys/fs/bpf/foo.bar") are reserved for future 373 - * extensions. 372 + * extensions. That allows popoulate_bpffs() create special files. 374 373 */ 375 - if (strchr(dentry->d_name.name, '.')) 374 + if ((dir->i_mode & S_IALLUGO) && 375 + strchr(dentry->d_name.name, '.')) 376 376 return ERR_PTR(-EPERM); 377 377 378 378 return simple_lookup(dir, dentry, flags); ··· 410 408 .link = simple_link, 411 409 .unlink = simple_unlink, 412 410 }; 411 + 412 + /* pin iterator link into bpffs */ 413 + static int bpf_iter_link_pin_kernel(struct dentry *parent, 414 + const char *name, struct bpf_link *link) 415 + { 416 + umode_t mode = S_IFREG | S_IRUSR; 417 + struct dentry *dentry; 418 + int ret; 419 + 420 + inode_lock(parent->d_inode); 421 + dentry = lookup_one_len(name, parent, strlen(name)); 422 + if (IS_ERR(dentry)) { 423 + inode_unlock(parent->d_inode); 424 + return PTR_ERR(dentry); 425 + } 426 + ret = bpf_mkobj_ops(dentry, mode, link, &bpf_link_iops, 427 + &bpf_iter_fops); 428 + dput(dentry); 429 + inode_unlock(parent->d_inode); 430 + return ret; 431 + } 413 432 414 433 static int bpf_obj_do_pin(const char __user *pathname, void *raw, 415 434 enum bpf_type type) ··· 661 638 return 0; 662 639 } 663 640 641 + struct bpf_preload_ops *bpf_preload_ops; 642 + EXPORT_SYMBOL_GPL(bpf_preload_ops); 643 + 644 + static bool bpf_preload_mod_get(void) 645 + { 646 + /* If bpf_preload.ko wasn't loaded earlier then load it now. 647 + * When bpf_preload is built into vmlinux the module's __init 648 + * function will populate it. 
649 + */ 650 + if (!bpf_preload_ops) { 651 + request_module("bpf_preload"); 652 + if (!bpf_preload_ops) 653 + return false; 654 + } 655 + /* And grab the reference, so the module doesn't disappear while the 656 + * kernel is interacting with the kernel module and its UMD. 657 + */ 658 + if (!try_module_get(bpf_preload_ops->owner)) { 659 + pr_err("bpf_preload module get failed.\n"); 660 + return false; 661 + } 662 + return true; 663 + } 664 + 665 + static void bpf_preload_mod_put(void) 666 + { 667 + if (bpf_preload_ops) 668 + /* now user can "rmmod bpf_preload" if necessary */ 669 + module_put(bpf_preload_ops->owner); 670 + } 671 + 672 + static DEFINE_MUTEX(bpf_preload_lock); 673 + 674 + static int populate_bpffs(struct dentry *parent) 675 + { 676 + struct bpf_preload_info objs[BPF_PRELOAD_LINKS] = {}; 677 + struct bpf_link *links[BPF_PRELOAD_LINKS] = {}; 678 + int err = 0, i; 679 + 680 + /* grab the mutex to make sure the kernel interactions with bpf_preload 681 + * UMD are serialized 682 + */ 683 + mutex_lock(&bpf_preload_lock); 684 + 685 + /* if bpf_preload.ko wasn't built into vmlinux then load it */ 686 + if (!bpf_preload_mod_get()) 687 + goto out; 688 + 689 + if (!bpf_preload_ops->info.tgid) { 690 + /* preload() will start UMD that will load BPF iterator programs */ 691 + err = bpf_preload_ops->preload(objs); 692 + if (err) 693 + goto out_put; 694 + for (i = 0; i < BPF_PRELOAD_LINKS; i++) { 695 + links[i] = bpf_link_by_id(objs[i].link_id); 696 + if (IS_ERR(links[i])) { 697 + err = PTR_ERR(links[i]); 698 + goto out_put; 699 + } 700 + } 701 + for (i = 0; i < BPF_PRELOAD_LINKS; i++) { 702 + err = bpf_iter_link_pin_kernel(parent, 703 + objs[i].link_name, links[i]); 704 + if (err) 705 + goto out_put; 706 + /* do not unlink successfully pinned links even 707 + * if later link fails to pin 708 + */ 709 + links[i] = NULL; 710 + } 711 + /* finish() will tell UMD process to exit */ 712 + err = bpf_preload_ops->finish(); 713 + if (err) 714 + goto out_put; 715 + } 716 + 
out_put: 717 + bpf_preload_mod_put(); 718 + out: 719 + mutex_unlock(&bpf_preload_lock); 720 + for (i = 0; i < BPF_PRELOAD_LINKS && err; i++) 721 + if (!IS_ERR_OR_NULL(links[i])) 722 + bpf_link_put(links[i]); 723 + return err; 724 + } 725 + 664 726 static int bpf_fill_super(struct super_block *sb, struct fs_context *fc) 665 727 { 666 728 static const struct tree_descr bpf_rfiles[] = { { "" } }; ··· 762 654 inode = sb->s_root->d_inode; 763 655 inode->i_op = &bpf_dir_iops; 764 656 inode->i_mode &= ~S_IALLUGO; 657 + populate_bpffs(sb->s_root); 765 658 inode->i_mode |= S_ISVTX | opts->mode; 766 - 767 659 return 0; 768 660 } 769 661 ··· 812 704 static int __init bpf_init(void) 813 705 { 814 706 int ret; 707 + 708 + mutex_init(&bpf_preload_lock); 815 709 816 710 ret = sysfs_create_mount_point(fs_kobj, "bpf"); 817 711 if (ret)
+23
kernel/bpf/preload/Kconfig
··· 1 + # SPDX-License-Identifier: GPL-2.0-only 2 + config USERMODE_DRIVER 3 + bool 4 + default n 5 + 6 + menuconfig BPF_PRELOAD 7 + bool "Preload BPF file system with kernel specific program and map iterators" 8 + depends on BPF 9 + select USERMODE_DRIVER 10 + help 11 + This builds kernel module with several embedded BPF programs that are 12 + pinned into BPF FS mount point as human readable files that are 13 + useful in debugging and introspection of BPF programs and maps. 14 + 15 + if BPF_PRELOAD 16 + config BPF_PRELOAD_UMD 17 + tristate "bpf_preload kernel module with user mode driver" 18 + depends on CC_CAN_LINK 19 + depends on m || CC_CAN_LINK_STATIC 20 + default m 21 + help 22 + This builds bpf_preload kernel module with embedded user mode driver. 23 + endif
+23
kernel/bpf/preload/Makefile
··· 1 + # SPDX-License-Identifier: GPL-2.0 2 + 3 + LIBBPF_SRCS = $(srctree)/tools/lib/bpf/ 4 + LIBBPF_A = $(obj)/libbpf.a 5 + LIBBPF_OUT = $(abspath $(obj)) 6 + 7 + $(LIBBPF_A): 8 + $(Q)$(MAKE) -C $(LIBBPF_SRCS) OUTPUT=$(LIBBPF_OUT)/ $(LIBBPF_OUT)/libbpf.a 9 + 10 + userccflags += -I $(srctree)/tools/include/ -I $(srctree)/tools/include/uapi \ 11 + -I $(srctree)/tools/lib/ -Wno-unused-result 12 + 13 + userprogs := bpf_preload_umd 14 + 15 + bpf_preload_umd-objs := iterators/iterators.o 16 + bpf_preload_umd-userldlibs := $(LIBBPF_A) -lelf -lz 17 + 18 + $(obj)/bpf_preload_umd: $(LIBBPF_A) 19 + 20 + $(obj)/bpf_preload_umd_blob.o: $(obj)/bpf_preload_umd 21 + 22 + obj-$(CONFIG_BPF_PRELOAD_UMD) += bpf_preload.o 23 + bpf_preload-objs += bpf_preload_kern.o bpf_preload_umd_blob.o
+16
kernel/bpf/preload/bpf_preload.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 2 + #ifndef _BPF_PRELOAD_H 3 + #define _BPF_PRELOAD_H 4 + 5 + #include <linux/usermode_driver.h> 6 + #include "iterators/bpf_preload_common.h" 7 + 8 + struct bpf_preload_ops { 9 + struct umd_info info; 10 + int (*preload)(struct bpf_preload_info *); 11 + int (*finish)(void); 12 + struct module *owner; 13 + }; 14 + extern struct bpf_preload_ops *bpf_preload_ops; 15 + #define BPF_PRELOAD_LINKS 2 16 + #endif
+91
kernel/bpf/preload/bpf_preload_kern.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 3 + #include <linux/init.h> 4 + #include <linux/module.h> 5 + #include <linux/pid.h> 6 + #include <linux/fs.h> 7 + #include <linux/sched/signal.h> 8 + #include "bpf_preload.h" 9 + 10 + extern char bpf_preload_umd_start; 11 + extern char bpf_preload_umd_end; 12 + 13 + static int preload(struct bpf_preload_info *obj); 14 + static int finish(void); 15 + 16 + static struct bpf_preload_ops umd_ops = { 17 + .info.driver_name = "bpf_preload", 18 + .preload = preload, 19 + .finish = finish, 20 + .owner = THIS_MODULE, 21 + }; 22 + 23 + static int preload(struct bpf_preload_info *obj) 24 + { 25 + int magic = BPF_PRELOAD_START; 26 + loff_t pos = 0; 27 + int i, err; 28 + ssize_t n; 29 + 30 + err = fork_usermode_driver(&umd_ops.info); 31 + if (err) 32 + return err; 33 + 34 + /* send the start magic to let UMD proceed with loading BPF progs */ 35 + n = kernel_write(umd_ops.info.pipe_to_umh, 36 + &magic, sizeof(magic), &pos); 37 + if (n != sizeof(magic)) 38 + return -EPIPE; 39 + 40 + /* receive bpf_link IDs and names from UMD */ 41 + pos = 0; 42 + for (i = 0; i < BPF_PRELOAD_LINKS; i++) { 43 + n = kernel_read(umd_ops.info.pipe_from_umh, 44 + &obj[i], sizeof(*obj), &pos); 45 + if (n != sizeof(*obj)) 46 + return -EPIPE; 47 + } 48 + return 0; 49 + } 50 + 51 + static int finish(void) 52 + { 53 + int magic = BPF_PRELOAD_END; 54 + struct pid *tgid; 55 + loff_t pos = 0; 56 + ssize_t n; 57 + 58 + /* send the last magic to UMD. It will do a normal exit. 
*/ 59 + n = kernel_write(umd_ops.info.pipe_to_umh, 60 + &magic, sizeof(magic), &pos); 61 + if (n != sizeof(magic)) 62 + return -EPIPE; 63 + tgid = umd_ops.info.tgid; 64 + wait_event(tgid->wait_pidfd, thread_group_exited(tgid)); 65 + umd_ops.info.tgid = NULL; 66 + return 0; 67 + } 68 + 69 + static int __init load_umd(void) 70 + { 71 + int err; 72 + 73 + err = umd_load_blob(&umd_ops.info, &bpf_preload_umd_start, 74 + &bpf_preload_umd_end - &bpf_preload_umd_start); 75 + if (err) 76 + return err; 77 + bpf_preload_ops = &umd_ops; 78 + return err; 79 + } 80 + 81 + static void __exit fini_umd(void) 82 + { 83 + bpf_preload_ops = NULL; 84 + /* kill UMD in case it's still there due to earlier error */ 85 + kill_pid(umd_ops.info.tgid, SIGKILL, 1); 86 + umd_ops.info.tgid = NULL; 87 + umd_unload_blob(&umd_ops.info); 88 + } 89 + late_initcall(load_umd); 90 + module_exit(fini_umd); 91 + MODULE_LICENSE("GPL");
+7
kernel/bpf/preload/bpf_preload_umd_blob.S
··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 2 + .section .init.rodata, "a" 3 + .global bpf_preload_umd_start 4 + bpf_preload_umd_start: 5 + .incbin "kernel/bpf/preload/bpf_preload_umd" 6 + .global bpf_preload_umd_end 7 + bpf_preload_umd_end:
+13
kernel/bpf/preload/iterators/bpf_preload_common.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 2 + #ifndef _BPF_PRELOAD_COMMON_H 3 + #define _BPF_PRELOAD_COMMON_H 4 + 5 + #define BPF_PRELOAD_START 0x5555 6 + #define BPF_PRELOAD_END 0xAAAA 7 + 8 + struct bpf_preload_info { 9 + char link_name[16]; 10 + int link_id; 11 + }; 12 + 13 + #endif
+94
kernel/bpf/preload/iterators/iterators.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* Copyright (c) 2020 Facebook */ 3 + #include <argp.h> 4 + #include <stdio.h> 5 + #include <stdlib.h> 6 + #include <string.h> 7 + #include <unistd.h> 8 + #include <fcntl.h> 9 + #include <sys/resource.h> 10 + #include <bpf/libbpf.h> 11 + #include <bpf/bpf.h> 12 + #include <sys/mount.h> 13 + #include "iterators.skel.h" 14 + #include "bpf_preload_common.h" 15 + 16 + int to_kernel = -1; 17 + int from_kernel = 0; 18 + 19 + static int send_link_to_kernel(struct bpf_link *link, const char *link_name) 20 + { 21 + struct bpf_preload_info obj = {}; 22 + struct bpf_link_info info = {}; 23 + __u32 info_len = sizeof(info); 24 + int err; 25 + 26 + err = bpf_obj_get_info_by_fd(bpf_link__fd(link), &info, &info_len); 27 + if (err) 28 + return err; 29 + obj.link_id = info.id; 30 + if (strlen(link_name) >= sizeof(obj.link_name)) 31 + return -E2BIG; 32 + strcpy(obj.link_name, link_name); 33 + if (write(to_kernel, &obj, sizeof(obj)) != sizeof(obj)) 34 + return -EPIPE; 35 + return 0; 36 + } 37 + 38 + int main(int argc, char **argv) 39 + { 40 + struct rlimit rlim = { RLIM_INFINITY, RLIM_INFINITY }; 41 + struct iterators_bpf *skel; 42 + int err, magic; 43 + int debug_fd; 44 + 45 + debug_fd = open("/dev/console", O_WRONLY | O_NOCTTY | O_CLOEXEC); 46 + if (debug_fd < 0) 47 + return 1; 48 + to_kernel = dup(1); 49 + close(1); 50 + dup(debug_fd); 51 + /* now stdin and stderr point to /dev/console */ 52 + 53 + read(from_kernel, &magic, sizeof(magic)); 54 + if (magic != BPF_PRELOAD_START) { 55 + printf("bad start magic %d\n", magic); 56 + return 1; 57 + } 58 + setrlimit(RLIMIT_MEMLOCK, &rlim); 59 + /* libbpf opens BPF object and loads it into the kernel */ 60 + skel = iterators_bpf__open_and_load(); 61 + if (!skel) { 62 + /* iterators.skel.h is little endian. 63 + * libbpf doesn't support automatic little->big conversion 64 + * of BPF bytecode yet. 65 + * The program load will fail in such case. 
66 + */ 67 + printf("Failed load could be due to wrong endianness\n"); 68 + return 1; 69 + } 70 + err = iterators_bpf__attach(skel); 71 + if (err) 72 + goto cleanup; 73 + 74 + /* send two bpf_link IDs with names to the kernel */ 75 + err = send_link_to_kernel(skel->links.dump_bpf_map, "maps.debug"); 76 + if (err) 77 + goto cleanup; 78 + err = send_link_to_kernel(skel->links.dump_bpf_prog, "progs.debug"); 79 + if (err) 80 + goto cleanup; 81 + 82 + /* The kernel will proceed with pinnging the links in bpffs. 83 + * UMD will wait on read from pipe. 84 + */ 85 + read(from_kernel, &magic, sizeof(magic)); 86 + if (magic != BPF_PRELOAD_END) { 87 + printf("bad final magic %d\n", magic); 88 + err = -EINVAL; 89 + } 90 + cleanup: 91 + iterators_bpf__destroy(skel); 92 + 93 + return err != 0; 94 + }
+1
net/bpfilter/Kconfig
··· 2 2 menuconfig BPFILTER 3 3 bool "BPF based packet filtering framework (BPFILTER)" 4 4 depends on NET && BPF && INET 5 + select USERMODE_DRIVER 5 6 help 6 7 This builds experimental bpfilter framework that is aiming to 7 8 provide netfilter compatible functionality via BPF
+5 -2
tools/lib/bpf/Makefile
··· 1 1 # SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) 2 2 # Most of this file is copied from tools/lib/traceevent/Makefile 3 3 4 + RM ?= rm 5 + srctree = $(abs_srctree) 6 + 4 7 LIBBPF_VERSION := $(shell \ 5 8 grep -oE '^LIBBPF_([0-9.]+)' libbpf.map | \ 6 9 sort -rV | head -n1 | cut -d'_' -f2) ··· 191 188 @ln -sf $(@F) $(OUTPUT)libbpf.so.$(LIBBPF_MAJOR_VERSION) 192 189 193 190 $(OUTPUT)libbpf.a: $(BPF_IN_STATIC) 194 - $(QUIET_LINK)$(RM) $@; $(AR) rcs $@ $^ 191 + $(QUIET_LINK)$(RM) -f $@; $(AR) rcs $@ $^ 195 192 196 193 $(OUTPUT)libbpf.pc: 197 194 $(QUIET_GEN)sed -e "s|@PREFIX@|$(prefix)|" \ ··· 294 291 cscope -b -q -I $(srctree)/include -f cscope.out 295 292 296 293 tags: 297 - rm -f TAGS tags 294 + $(RM) -f TAGS tags 298 295 ls *.c *.h | xargs $(TAGS_PROG) -a 299 296 300 297 # Declare the contents of the .PHONY variable as phony. We keep that