Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next

Alexei Starovoitov says:

====================
pull-request: bpf-next 2020-05-14

The following pull-request contains BPF updates for your *net-next* tree.

The main changes are:

1) Merged tag 'perf-for-bpf-2020-05-06' from tip tree that includes CAP_PERFMON.

2) Support for narrow loads in bpf_sock_addr progs and additional
helpers in cg-skb progs, from Andrey.

3) BPF benchmark runner, from Andrii.

4) ARM and RISC-V JIT optimizations, from Luke.

5) BPF iterator infrastructure, from Yonghong.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>

+5253 -544
+9 -5
arch/arm/net/bpf_jit_32.c
··· 795 795 case BPF_RSH: 796 796 emit(ARM_LSR_I(rd, rd, val), ctx); 797 797 break; 798 + case BPF_ARSH: 799 + emit(ARM_ASR_I(rd, rd, val), ctx); 800 + break; 798 801 case BPF_NEG: 799 802 emit(ARM_RSB_I(rd, rd, val), ctx); 800 803 break; ··· 863 860 emit(ARM_SUBS_I(tmp2[0], rt, 32), ctx); 864 861 emit(ARM_MOV_SR(ARM_LR, rd[1], SRTYPE_LSR, rt), ctx); 865 862 emit(ARM_ORR_SR(ARM_LR, ARM_LR, rd[0], SRTYPE_ASL, ARM_IP), ctx); 866 - _emit(ARM_COND_MI, ARM_B(0), ctx); 867 - emit(ARM_ORR_SR(ARM_LR, ARM_LR, rd[0], SRTYPE_ASR, tmp2[0]), ctx); 863 + _emit(ARM_COND_PL, 864 + ARM_ORR_SR(ARM_LR, ARM_LR, rd[0], SRTYPE_ASR, tmp2[0]), ctx); 868 865 emit(ARM_MOV_SR(ARM_IP, rd[0], SRTYPE_ASR, rt), ctx); 869 866 870 867 arm_bpf_put_reg32(dst_lo, ARM_LR, ctx); ··· 1411 1408 case BPF_ALU | BPF_MUL | BPF_X: 1412 1409 case BPF_ALU | BPF_LSH | BPF_X: 1413 1410 case BPF_ALU | BPF_RSH | BPF_X: 1414 - case BPF_ALU | BPF_ARSH | BPF_K: 1415 1411 case BPF_ALU | BPF_ARSH | BPF_X: 1416 1412 case BPF_ALU64 | BPF_ADD | BPF_K: 1417 1413 case BPF_ALU64 | BPF_ADD | BPF_X: ··· 1467 1465 case BPF_ALU64 | BPF_MOD | BPF_K: 1468 1466 case BPF_ALU64 | BPF_MOD | BPF_X: 1469 1467 goto notyet; 1470 - /* dst = dst >> imm */ 1471 1468 /* dst = dst << imm */ 1472 - case BPF_ALU | BPF_RSH | BPF_K: 1469 + /* dst = dst >> imm */ 1470 + /* dst = dst >> imm (signed) */ 1473 1471 case BPF_ALU | BPF_LSH | BPF_K: 1472 + case BPF_ALU | BPF_RSH | BPF_K: 1473 + case BPF_ALU | BPF_ARSH | BPF_K: 1474 1474 if (unlikely(imm > 31)) 1475 1475 return -EINVAL; 1476 1476 if (imm)
+3
arch/arm/net/bpf_jit_32.h
··· 94 94 #define ARM_INST_LSR_I 0x01a00020 95 95 #define ARM_INST_LSR_R 0x01a00030 96 96 97 + #define ARM_INST_ASR_I 0x01a00040 98 + #define ARM_INST_ASR_R 0x01a00050 99 + 97 100 #define ARM_INST_MOV_R 0x01a00000 98 101 #define ARM_INST_MOVS_R 0x01b00000 99 102 #define ARM_INST_MOV_I 0x03a00000
+44 -20
arch/riscv/net/bpf_jit_comp64.c
··· 515 515 case BPF_ALU | BPF_LSH | BPF_X: 516 516 case BPF_ALU64 | BPF_LSH | BPF_X: 517 517 emit(is64 ? rv_sll(rd, rd, rs) : rv_sllw(rd, rd, rs), ctx); 518 - if (!is64) 518 + if (!is64 && !aux->verifier_zext) 519 519 emit_zext_32(rd, ctx); 520 520 break; 521 521 case BPF_ALU | BPF_RSH | BPF_X: ··· 542 542 543 543 /* dst = BSWAP##imm(dst) */ 544 544 case BPF_ALU | BPF_END | BPF_FROM_LE: 545 - { 546 - int shift = 64 - imm; 547 - 548 - emit(rv_slli(rd, rd, shift), ctx); 549 - emit(rv_srli(rd, rd, shift), ctx); 545 + switch (imm) { 546 + case 16: 547 + emit(rv_slli(rd, rd, 48), ctx); 548 + emit(rv_srli(rd, rd, 48), ctx); 549 + break; 550 + case 32: 551 + if (!aux->verifier_zext) 552 + emit_zext_32(rd, ctx); 553 + break; 554 + case 64: 555 + /* Do nothing */ 556 + break; 557 + } 550 558 break; 551 - } 559 + 552 560 case BPF_ALU | BPF_END | BPF_FROM_BE: 553 561 emit(rv_addi(RV_REG_T2, RV_REG_ZERO, 0), ctx); 554 562 ··· 700 692 case BPF_ALU | BPF_LSH | BPF_K: 701 693 case BPF_ALU64 | BPF_LSH | BPF_K: 702 694 emit(is64 ? rv_slli(rd, rd, imm) : rv_slliw(rd, rd, imm), ctx); 703 - if (!is64) 695 + if (!is64 && !aux->verifier_zext) 704 696 emit_zext_32(rd, ctx); 705 697 break; 706 698 case BPF_ALU | BPF_RSH | BPF_K: 707 699 case BPF_ALU64 | BPF_RSH | BPF_K: 708 700 emit(is64 ? rv_srli(rd, rd, imm) : rv_srliw(rd, rd, imm), ctx); 709 - if (!is64) 701 + if (!is64 && !aux->verifier_zext) 710 702 emit_zext_32(rd, ctx); 711 703 break; 712 704 case BPF_ALU | BPF_ARSH | BPF_K: 713 705 case BPF_ALU64 | BPF_ARSH | BPF_K: 714 706 emit(is64 ? 
rv_srai(rd, rd, imm) : rv_sraiw(rd, rd, imm), ctx); 715 - if (!is64) 707 + if (!is64 && !aux->verifier_zext) 716 708 emit_zext_32(rd, ctx); 717 709 break; 718 710 ··· 792 784 case BPF_JMP32 | BPF_JSGE | BPF_K: 793 785 case BPF_JMP | BPF_JSLE | BPF_K: 794 786 case BPF_JMP32 | BPF_JSLE | BPF_K: 795 - case BPF_JMP | BPF_JSET | BPF_K: 796 - case BPF_JMP32 | BPF_JSET | BPF_K: 797 787 rvoff = rv_offset(i, off, ctx); 798 788 s = ctx->ninsns; 799 - emit_imm(RV_REG_T1, imm, ctx); 789 + if (imm) { 790 + emit_imm(RV_REG_T1, imm, ctx); 791 + rs = RV_REG_T1; 792 + } else { 793 + /* If imm is 0, simply use zero register. */ 794 + rs = RV_REG_ZERO; 795 + } 800 796 if (!is64) { 801 797 if (is_signed_bpf_cond(BPF_OP(code))) 802 798 emit_sext_32_rd(&rd, ctx); ··· 811 799 812 800 /* Adjust for extra insns */ 813 801 rvoff -= (e - s) << 2; 802 + emit_branch(BPF_OP(code), rd, rs, rvoff, ctx); 803 + break; 814 804 815 - if (BPF_OP(code) == BPF_JSET) { 816 - /* Adjust for and */ 817 - rvoff -= 4; 818 - emit(rv_and(RV_REG_T1, rd, RV_REG_T1), ctx); 819 - emit_branch(BPF_JNE, RV_REG_T1, RV_REG_ZERO, rvoff, 820 - ctx); 805 + case BPF_JMP | BPF_JSET | BPF_K: 806 + case BPF_JMP32 | BPF_JSET | BPF_K: 807 + rvoff = rv_offset(i, off, ctx); 808 + s = ctx->ninsns; 809 + if (is_12b_int(imm)) { 810 + emit(rv_andi(RV_REG_T1, rd, imm), ctx); 821 811 } else { 822 - emit_branch(BPF_OP(code), rd, RV_REG_T1, rvoff, ctx); 812 + emit_imm(RV_REG_T1, imm, ctx); 813 + emit(rv_and(RV_REG_T1, rd, RV_REG_T1), ctx); 823 814 } 815 + /* For jset32, we should clear the upper 32 bits of t1, but 816 + * sign-extension is sufficient here and saves one instruction, 817 + * as t1 is used only in comparison against zero. 818 + */ 819 + if (!is64 && imm < 0) 820 + emit(rv_addiw(RV_REG_T1, RV_REG_T1, 0), ctx); 821 + e = ctx->ninsns; 822 + rvoff -= (e - s) << 2; 823 + emit_branch(BPF_JNE, RV_REG_T1, RV_REG_ZERO, rvoff, ctx); 824 824 break; 825 825 826 826 /* function call */
+2 -2
arch/x86/net/bpf_jit_comp32.c
··· 1475 1475 for (i = 0; i < insn_cnt; i++, insn++) { 1476 1476 const s32 imm32 = insn->imm; 1477 1477 const bool is64 = BPF_CLASS(insn->code) == BPF_ALU64; 1478 - const bool dstk = insn->dst_reg == BPF_REG_AX ? false : true; 1479 - const bool sstk = insn->src_reg == BPF_REG_AX ? false : true; 1478 + const bool dstk = insn->dst_reg != BPF_REG_AX; 1479 + const bool sstk = insn->src_reg != BPF_REG_AX; 1480 1480 const u8 code = insn->code; 1481 1481 const u8 *dst = bpf2ia32[insn->dst_reg]; 1482 1482 const u8 *src = bpf2ia32[insn->src_reg];
+19
fs/proc/proc_net.c
··· 98 98 .proc_release = seq_release_net, 99 99 }; 100 100 101 + int bpf_iter_init_seq_net(void *priv_data) 102 + { 103 + #ifdef CONFIG_NET_NS 104 + struct seq_net_private *p = priv_data; 105 + 106 + p->net = get_net(current->nsproxy->net_ns); 107 + #endif 108 + return 0; 109 + } 110 + 111 + void bpf_iter_fini_seq_net(void *priv_data) 112 + { 113 + #ifdef CONFIG_NET_NS 114 + struct seq_net_private *p = priv_data; 115 + 116 + put_net(p->net); 117 + #endif 118 + } 119 + 101 120 struct proc_dir_entry *proc_create_net_data(const char *name, umode_t mode, 102 121 struct proc_dir_entry *parent, const struct seq_operations *ops, 103 122 unsigned int state_size, void *data)
+46
include/linux/bpf.h
··· 31 31 struct btf; 32 32 struct btf_type; 33 33 struct exception_table_entry; 34 + struct seq_operations; 34 35 35 36 extern struct idr btf_idr; 36 37 extern spinlock_t btf_idr_lock; ··· 320 319 PTR_TO_TP_BUFFER, /* reg points to a writable raw tp's buffer */ 321 320 PTR_TO_XDP_SOCK, /* reg points to struct xdp_sock */ 322 321 PTR_TO_BTF_ID, /* reg points to kernel struct */ 322 + PTR_TO_BTF_ID_OR_NULL, /* reg points to kernel struct or NULL */ 323 323 }; 324 324 325 325 /* The information passed from prog-specific *_is_valid_access ··· 643 641 u16 reason; 644 642 }; 645 643 644 + /* reg_type info for ctx arguments */ 645 + struct bpf_ctx_arg_aux { 646 + u32 offset; 647 + enum bpf_reg_type reg_type; 648 + }; 649 + 646 650 struct bpf_prog_aux { 647 651 atomic64_t refcnt; 648 652 u32 used_map_cnt; ··· 660 652 u32 func_cnt; /* used by non-func prog as the number of func progs */ 661 653 u32 func_idx; /* 0 for non-func prog, the index in func array for func prog */ 662 654 u32 attach_btf_id; /* in-kernel BTF type id to attach to */ 655 + u32 ctx_arg_info_size; 656 + const struct bpf_ctx_arg_aux *ctx_arg_info; 663 657 struct bpf_prog *linked_prog; 664 658 bool verifier_zext; /* Zero extensions has been inserted by verifier. 
*/ 665 659 bool offload_requested; ··· 1031 1021 1032 1022 extern const struct file_operations bpf_map_fops; 1033 1023 extern const struct file_operations bpf_prog_fops; 1024 + extern const struct file_operations bpf_iter_fops; 1034 1025 1035 1026 #define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \ 1036 1027 extern const struct bpf_prog_ops _name ## _prog_ops; \ ··· 1091 1080 int generic_map_delete_batch(struct bpf_map *map, 1092 1081 const union bpf_attr *attr, 1093 1082 union bpf_attr __user *uattr); 1083 + struct bpf_map *bpf_map_get_curr_or_next(u32 *id); 1094 1084 1095 1085 extern int sysctl_unprivileged_bpf_disabled; 1096 1086 ··· 1137 1125 1138 1126 int bpf_obj_pin_user(u32 ufd, const char __user *pathname); 1139 1127 int bpf_obj_get_user(const char __user *pathname, int flags); 1128 + 1129 + #define BPF_ITER_FUNC_PREFIX "bpf_iter_" 1130 + #define DEFINE_BPF_ITER_FUNC(target, args...) \ 1131 + extern int bpf_iter_ ## target(args); \ 1132 + int __init bpf_iter_ ## target(args) { return 0; } 1133 + 1134 + typedef int (*bpf_iter_init_seq_priv_t)(void *private_data); 1135 + typedef void (*bpf_iter_fini_seq_priv_t)(void *private_data); 1136 + 1137 + #define BPF_ITER_CTX_ARG_MAX 2 1138 + struct bpf_iter_reg { 1139 + const char *target; 1140 + const struct seq_operations *seq_ops; 1141 + bpf_iter_init_seq_priv_t init_seq_private; 1142 + bpf_iter_fini_seq_priv_t fini_seq_private; 1143 + u32 seq_priv_size; 1144 + u32 ctx_arg_info_size; 1145 + struct bpf_ctx_arg_aux ctx_arg_info[BPF_ITER_CTX_ARG_MAX]; 1146 + }; 1147 + 1148 + struct bpf_iter_meta { 1149 + __bpf_md_ptr(struct seq_file *, seq); 1150 + u64 session_id; 1151 + u64 seq_num; 1152 + }; 1153 + 1154 + int bpf_iter_reg_target(const struct bpf_iter_reg *reg_info); 1155 + void bpf_iter_unreg_target(const struct bpf_iter_reg *reg_info); 1156 + bool bpf_iter_prog_supported(struct bpf_prog *prog); 1157 + int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog); 1158 + int 
bpf_iter_new_fd(struct bpf_link *link); 1159 + bool bpf_link_is_iter(struct bpf_link *link); 1160 + struct bpf_prog *bpf_iter_get_info(struct bpf_iter_meta *meta, bool in_stop); 1161 + int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx); 1140 1162 1141 1163 int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value); 1142 1164 int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value);
+1
include/linux/bpf_types.h
··· 124 124 #ifdef CONFIG_CGROUP_BPF 125 125 BPF_LINK_TYPE(BPF_LINK_TYPE_CGROUP, cgroup) 126 126 #endif 127 + BPF_LINK_TYPE(BPF_LINK_TYPE_ITER, iter)
+4
include/linux/capability.h
··· 251 251 extern bool capable_wrt_inode_uidgid(const struct inode *inode, int cap); 252 252 extern bool file_ns_capable(const struct file *file, struct user_namespace *ns, int cap); 253 253 extern bool ptracer_capable(struct task_struct *tsk, struct user_namespace *ns); 254 + static inline bool perfmon_capable(void) 255 + { 256 + return capable(CAP_PERFMON) || capable(CAP_SYS_ADMIN); 257 + } 254 258 255 259 /* audit system wants to get cap info from files as well */ 256 260 extern int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data *cpu_caps);
+2 -4
include/linux/filter.h
··· 545 545 unsigned int (*bpf_func)(const void *ctx, 546 546 const struct bpf_insn *insn); 547 547 /* Instructions for interpreter */ 548 - union { 549 - struct sock_filter insns[0]; 550 - struct bpf_insn insnsi[0]; 551 - }; 548 + struct sock_filter insns[0]; 549 + struct bpf_insn insnsi[]; 552 550 }; 553 551 554 552 struct sk_filter {
+3
include/linux/proc_fs.h
··· 105 105 void *data); 106 106 extern struct pid *tgid_pidfd_to_pid(const struct file *file); 107 107 108 + extern int bpf_iter_init_seq_net(void *priv_data); 109 + extern void bpf_iter_fini_seq_net(void *priv_data); 110 + 108 111 #ifdef CONFIG_PROC_PID_ARCH_STATUS 109 112 /* 110 113 * The architecture which selects CONFIG_PROC_PID_ARCH_STATUS must
+7 -1
include/net/inet_common.h
··· 35 35 int inet_listen(struct socket *sock, int backlog); 36 36 void inet_sock_destruct(struct sock *sk); 37 37 int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len); 38 + /* Don't allocate port at this moment, defer to connect. */ 39 + #define BIND_FORCE_ADDRESS_NO_PORT (1 << 0) 40 + /* Grab and release socket lock. */ 41 + #define BIND_WITH_LOCK (1 << 1) 42 + /* Called from BPF program. */ 43 + #define BIND_FROM_BPF (1 << 2) 38 44 int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, 39 - bool force_bind_address_no_port, bool with_lock); 45 + u32 flags); 40 46 int inet_getname(struct socket *sock, struct sockaddr *uaddr, 41 47 int peer); 42 48 int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
+7
include/net/ip6_fib.h
··· 544 544 return !!(f6i->fib6_metrics->metrics[RTAX_LOCK - 1] & (1 << metric)); 545 545 } 546 546 547 + #if IS_BUILTIN(CONFIG_IPV6) && defined(CONFIG_BPF_SYSCALL) 548 + struct bpf_iter__ipv6_route { 549 + __bpf_md_ptr(struct bpf_iter_meta *, meta); 550 + __bpf_md_ptr(struct fib6_info *, rt); 551 + }; 552 + #endif 553 + 547 554 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 548 555 static inline bool fib6_has_custom_rules(const struct net *net) 549 556 {
+1 -1
include/net/ipv6_stubs.h
··· 63 63 /* A stub used by bpf helpers. Similarly ugly as ipv6_stub */ 64 64 struct ipv6_bpf_stub { 65 65 int (*inet6_bind)(struct sock *sk, struct sockaddr *uaddr, int addr_len, 66 - bool force_bind_address_no_port, bool with_lock); 66 + u32 flags); 67 67 struct sock *(*udp6_lib_lookup)(struct net *net, 68 68 const struct in6_addr *saddr, __be16 sport, 69 69 const struct in6_addr *daddr, __be16 dport,
+2 -3
include/net/xdp_sock.h
··· 50 50 u32 headroom; 51 51 u32 chunk_size_nohr; 52 52 struct user_struct *user; 53 - unsigned long address; 54 53 refcount_t users; 55 54 struct work_struct work; 56 55 struct page **pgs; ··· 61 62 struct net_device *dev; 62 63 struct xdp_umem_fq_reuse *fq_reuse; 63 64 bool zc; 64 - spinlock_t xsk_list_lock; 65 - struct list_head xsk_list; 65 + spinlock_t xsk_tx_list_lock; 66 + struct list_head xsk_tx_list; 66 67 }; 67 68 68 69 /* Nodes are linked in the struct xdp_sock map_list field, and used to
+131 -42
include/uapi/linux/bpf.h
··· 116 116 BPF_LINK_GET_FD_BY_ID, 117 117 BPF_LINK_GET_NEXT_ID, 118 118 BPF_ENABLE_STATS, 119 + BPF_ITER_CREATE, 119 120 }; 120 121 121 122 enum bpf_map_type { ··· 219 218 BPF_TRACE_FEXIT, 220 219 BPF_MODIFY_RETURN, 221 220 BPF_LSM_MAC, 221 + BPF_TRACE_ITER, 222 222 __MAX_BPF_ATTACH_TYPE 223 223 }; 224 224 ··· 230 228 BPF_LINK_TYPE_RAW_TRACEPOINT = 1, 231 229 BPF_LINK_TYPE_TRACING = 2, 232 230 BPF_LINK_TYPE_CGROUP = 3, 231 + BPF_LINK_TYPE_ITER = 4, 233 232 234 233 MAX_BPF_LINK_TYPE, 235 234 }; ··· 615 612 __u32 type; 616 613 } enable_stats; 617 614 615 + struct { /* struct used by BPF_ITER_CREATE command */ 616 + __u32 link_fd; 617 + __u32 flags; 618 + } iter_create; 619 + 618 620 } __attribute__((aligned(8))); 619 621 620 622 /* The description below is an attempt at providing documentation to eBPF ··· 675 667 * For tracing programs, safely attempt to read *size* bytes from 676 668 * kernel space address *unsafe_ptr* and store the data in *dst*. 677 669 * 678 - * Generally, use bpf_probe_read_user() or bpf_probe_read_kernel() 679 - * instead. 670 + * Generally, use **bpf_probe_read_user**\ () or 671 + * **bpf_probe_read_kernel**\ () instead. 680 672 * Return 681 673 * 0 on success, or a negative error in case of failure. 682 674 * ··· 684 676 * Description 685 677 * Return the time elapsed since system boot, in nanoseconds. 686 678 * Does not include time the system was suspended. 687 - * See: clock_gettime(CLOCK_MONOTONIC) 679 + * See: **clock_gettime**\ (**CLOCK_MONOTONIC**) 688 680 * Return 689 681 * Current *ktime*. 690 682 * ··· 1543 1535 * int bpf_probe_read_str(void *dst, u32 size, const void *unsafe_ptr) 1544 1536 * Description 1545 1537 * Copy a NUL terminated string from an unsafe kernel address 1546 - * *unsafe_ptr* to *dst*. See bpf_probe_read_kernel_str() for 1538 + * *unsafe_ptr* to *dst*. See **bpf_probe_read_kernel_str**\ () for 1547 1539 * more details. 
1548 1540 * 1549 - * Generally, use bpf_probe_read_user_str() or bpf_probe_read_kernel_str() 1550 - * instead. 1541 + * Generally, use **bpf_probe_read_user_str**\ () or 1542 + * **bpf_probe_read_kernel_str**\ () instead. 1551 1543 * Return 1552 1544 * On success, the strictly positive length of the string, 1553 1545 * including the trailing NUL character. On error, a negative ··· 1575 1567 * 1576 1568 * u64 bpf_get_socket_cookie(struct bpf_sock_ops *ctx) 1577 1569 * Description 1578 - * Equivalent to bpf_get_socket_cookie() helper that accepts 1570 + * Equivalent to **bpf_get_socket_cookie**\ () helper that accepts 1579 1571 * *skb*, but gets socket from **struct bpf_sock_ops** context. 1580 1572 * Return 1581 1573 * A 8-byte long non-decreasing number. ··· 1604 1596 * The option value of length *optlen* is pointed by *optval*. 1605 1597 * 1606 1598 * *bpf_socket* should be one of the following: 1599 + * 1607 1600 * * **struct bpf_sock_ops** for **BPF_PROG_TYPE_SOCK_OPS**. 1608 1601 * * **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT** 1609 1602 * and **BPF_CGROUP_INET6_CONNECT**. ··· 1673 1664 * 1674 1665 * The lower two bits of *flags* are used as the return code if 1675 1666 * the map lookup fails. This is so that the return value can be 1676 - * one of the XDP program return codes up to XDP_TX, as chosen by 1677 - * the caller. Any higher bits in the *flags* argument must be 1667 + * one of the XDP program return codes up to **XDP_TX**, as chosen 1668 + * by the caller. Any higher bits in the *flags* argument must be 1678 1669 * unset. 1679 1670 * 1680 - * See also bpf_redirect(), which only supports redirecting to an 1681 - * ifindex, but doesn't require a map to do so. 1671 + * See also **bpf_redirect**\ (), which only supports redirecting 1672 + * to an ifindex, but doesn't require a map to do so. 1682 1673 * Return 1683 1674 * **XDP_REDIRECT** on success, or the value of the two lower bits 1684 1675 * of the *flags* argument on error. 
··· 1786 1777 * the time running for event since last normalization. The 1787 1778 * enabled and running times are accumulated since the perf event 1788 1779 * open. To achieve scaling factor between two invocations of an 1789 - * eBPF program, users can can use CPU id as the key (which is 1780 + * eBPF program, users can use CPU id as the key (which is 1790 1781 * typical for perf array usage model) to remember the previous 1791 1782 * value and do the calculation inside the eBPF program. 1792 1783 * Return ··· 1813 1804 * *opval* and of length *optlen*. 1814 1805 * 1815 1806 * *bpf_socket* should be one of the following: 1807 + * 1816 1808 * * **struct bpf_sock_ops** for **BPF_PROG_TYPE_SOCK_OPS**. 1817 1809 * * **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT** 1818 1810 * and **BPF_CGROUP_INET6_CONNECT**. ··· 1835 1825 * The first argument is the context *regs* on which the kprobe 1836 1826 * works. 1837 1827 * 1838 - * This helper works by setting setting the PC (program counter) 1828 + * This helper works by setting the PC (program counter) 1839 1829 * to an override function which is run in place of the original 1840 1830 * probed function. This means the probed function is not run at 1841 1831 * all. The replacement function just returns with the required ··· 2004 1994 * 2005 1995 * This helper works for IPv4 and IPv6, TCP and UDP sockets. The 2006 1996 * domain (*addr*\ **->sa_family**) must be **AF_INET** (or 2007 - * **AF_INET6**). Looking for a free port to bind to can be 2008 - * expensive, therefore binding to port is not permitted by the 2009 - * helper: *addr*\ **->sin_port** (or **sin6_port**, respectively) 2010 - * must be set to zero. 1997 + * **AF_INET6**). It's advised to pass zero port (**sin_port** 1998 + * or **sin6_port**) which triggers IP_BIND_ADDRESS_NO_PORT-like 1999 + * behavior and lets the kernel efficiently pick up an unused 2000 + * port as long as 4-tuple is unique. 
Passing non-zero port might 2001 + * lead to degraded performance. 2011 2002 * Return 2012 2003 * 0 on success, or a negative error in case of failure. 2013 2004 * ··· 2302 2291 * **bpf_rc_keydown**\ () again with the same values, or calling 2303 2292 * **bpf_rc_repeat**\ (). 2304 2293 * 2305 - * Some protocols include a toggle bit, in case the button was 2294 + * Some protocols include a toggle bit, in case the button was 2306 2295 * released and pressed again between consecutive scancodes. 2307 2296 * 2308 2297 * The *ctx* should point to the lirc sample as passed into ··· 2648 2637 * 2649 2638 * *th* points to the start of the TCP header, while *th_len* 2650 2639 * contains **sizeof**\ (**struct tcphdr**). 2651 - * 2652 2640 * Return 2653 2641 * 0 if *iph* and *th* are a valid SYN cookie ACK, or a negative 2654 2642 * error otherwise. ··· 2830 2820 * 2831 2821 * *th* points to the start of the TCP header, while *th_len* 2832 2822 * contains the length of the TCP header. 2833 - * 2834 2823 * Return 2835 2824 * On success, lower 32 bits hold the generated SYN cookie in 2836 2825 * followed by 16 bits which hold the MSS value for that cookie, ··· 2912 2903 * // size, after checking its boundaries. 2913 2904 * } 2914 2905 * 2915 - * In comparison, using **bpf_probe_read_user()** helper here 2906 + * In comparison, using **bpf_probe_read_user**\ () helper here 2916 2907 * instead to read the string would require to estimate the length 2917 2908 * at compile time, and would often result in copying more memory 2918 2909 * than necessary. ··· 2930 2921 * int bpf_probe_read_kernel_str(void *dst, u32 size, const void *unsafe_ptr) 2931 2922 * Description 2932 2923 * Copy a NUL terminated string from an unsafe kernel address *unsafe_ptr* 2933 - * to *dst*. Same semantics as with bpf_probe_read_user_str() apply. 2924 + * to *dst*. Same semantics as with **bpf_probe_read_user_str**\ () apply. 
2934 2925 * Return 2935 - * On success, the strictly positive length of the string, including 2926 + * On success, the strictly positive length of the string, including 2936 2927 * the trailing NUL character. On error, a negative value. 2937 2928 * 2938 2929 * int bpf_tcp_send_ack(void *tp, u32 rcv_nxt) 2939 2930 * Description 2940 - * Send out a tcp-ack. *tp* is the in-kernel struct tcp_sock. 2931 + * Send out a tcp-ack. *tp* is the in-kernel struct **tcp_sock**. 2941 2932 * *rcv_nxt* is the ack_seq to be sent out. 2942 2933 * Return 2943 2934 * 0 on success, or a negative error in case of failure. ··· 2965 2956 * int bpf_read_branch_records(struct bpf_perf_event_data *ctx, void *buf, u32 size, u64 flags) 2966 2957 * Description 2967 2958 * For an eBPF program attached to a perf event, retrieve the 2968 - * branch records (struct perf_branch_entry) associated to *ctx* 2969 - * and store it in the buffer pointed by *buf* up to size 2959 + * branch records (**struct perf_branch_entry**) associated to *ctx* 2960 + * and store it in the buffer pointed by *buf* up to size 2970 2961 * *size* bytes. 2971 2962 * Return 2972 2963 * On success, number of bytes written to *buf*. On error, a 2973 2964 * negative value. 2974 2965 * 2975 2966 * The *flags* can be set to **BPF_F_GET_BRANCH_RECORDS_SIZE** to 2976 - * instead return the number of bytes required to store all the 2967 + * instead return the number of bytes required to store all the 2977 2968 * branch entries. If this flag is set, *buf* may be NULL. 2978 2969 * 2979 2970 * **-EINVAL** if arguments invalid or **size** not a multiple 2980 - * of sizeof(struct perf_branch_entry). 2971 + * of **sizeof**\ (**struct perf_branch_entry**\ ). 2981 2972 * 2982 2973 * **-ENOENT** if architecture does not support branch records. 2983 2974 * ··· 2985 2976 * Description 2986 2977 * Returns 0 on success, values for *pid* and *tgid* as seen from the current 2987 2978 * *namespace* will be returned in *nsdata*. 
2988 - * 2989 - * On failure, the returned value is one of the following: 2979 + * Return 2980 + * 0 on success, or one of the following in case of failure: 2990 2981 * 2991 2982 * **-EINVAL** if dev and inum supplied don't match dev_t and inode number 2992 2983 * with nsfs of current task, or if dev conversion to dev_t lost high bits. ··· 3025 3016 * a global identifier that can be assumed unique. If *ctx* is 3026 3017 * NULL, then the helper returns the cookie for the initial 3027 3018 * network namespace. The cookie itself is very similar to that 3028 - * of bpf_get_socket_cookie() helper, but for network namespaces 3029 - * instead of sockets. 3019 + * of **bpf_get_socket_cookie**\ () helper, but for network 3020 + * namespaces instead of sockets. 3030 3021 * Return 3031 3022 * A 8-byte long opaque number. 3032 3023 * ··· 3061 3052 * 3062 3053 * The *flags* argument must be zero. 3063 3054 * Return 3064 - * 0 on success, or a negative errno in case of failure. 3055 + * 0 on success, or a negative error in case of failure: 3065 3056 * 3066 - * * **-EINVAL** Unsupported flags specified. 3067 - * * **-ENOENT** Socket is unavailable for assignment. 3068 - * * **-ENETUNREACH** Socket is unreachable (wrong netns). 3069 - * * **-EOPNOTSUPP** Unsupported operation, for example a 3070 - * call from outside of TC ingress. 3071 - * * **-ESOCKTNOSUPPORT** Socket type not supported (reuseport). 3057 + * **-EINVAL** if specified *flags* are not supported. 3058 + * 3059 + * **-ENOENT** if the socket is unavailable for assignment. 3060 + * 3061 + * **-ENETUNREACH** if the socket is unreachable (wrong netns). 3062 + * 3063 + * **-EOPNOTSUPP** if the operation is not supported, for example 3064 + * a call from outside of TC ingress. 3065 + * 3066 + * **-ESOCKTNOSUPPORT** if the socket type is not supported 3067 + * (reuseport). 3072 3068 * 3073 3069 * u64 bpf_ktime_get_boot_ns(void) 3074 3070 * Description 3075 3071 * Return the time elapsed since system boot, in nanoseconds. 
3076 3072 * Does include the time the system was suspended. 3077 - * See: clock_gettime(CLOCK_BOOTTIME) 3073 + * See: **clock_gettime**\ (**CLOCK_BOOTTIME**) 3078 3074 * Return 3079 3075 * Current *ktime*. 3076 + * 3077 + * int bpf_seq_printf(struct seq_file *m, const char *fmt, u32 fmt_size, const void *data, u32 data_len) 3078 + * Description 3079 + * **bpf_seq_printf**\ () uses seq_file **seq_printf**\ () to print 3080 + * out the format string. 3081 + * The *m* represents the seq_file. The *fmt* and *fmt_size* are for 3082 + * the format string itself. The *data* and *data_len* are format string 3083 + * arguments. The *data* are a **u64** array and corresponding format string 3084 + * values are stored in the array. For strings and pointers where pointees 3085 + * are accessed, only the pointer values are stored in the *data* array. 3086 + * The *data_len* is the size of *data* in bytes. 3087 + * 3088 + * Formats **%s**, **%p{i,I}{4,6}** requires to read kernel memory. 3089 + * Reading kernel memory may fail due to either invalid address or 3090 + * valid address but requiring a major memory fault. If reading kernel memory 3091 + * fails, the string for **%s** will be an empty string, and the ip 3092 + * address for **%p{i,I}{4,6}** will be 0. Not returning error to 3093 + * bpf program is consistent with what **bpf_trace_printk**\ () does for now. 3094 + * Return 3095 + * 0 on success, or a negative error in case of failure: 3096 + * 3097 + * **-EBUSY** if per-CPU memory copy buffer is busy, can try again 3098 + * by returning 1 from bpf program. 3099 + * 3100 + * **-EINVAL** if arguments are invalid, or if *fmt* is invalid/unsupported. 3101 + * 3102 + * **-E2BIG** if *fmt* contains too many format specifiers. 3103 + * 3104 + * **-EOVERFLOW** if an overflow happened: The same object will be tried again. 
3105 + * 3106 + * int bpf_seq_write(struct seq_file *m, const void *data, u32 len) 3107 + * Description 3108 + * **bpf_seq_write**\ () uses seq_file **seq_write**\ () to write the data. 3109 + * The *m* represents the seq_file. The *data* and *len* represent the 3110 + * data to write in bytes. 3111 + * Return 3112 + * 0 on success, or a negative error in case of failure: 3113 + * 3114 + * **-EOVERFLOW** if an overflow happened: The same object will be tried again. 3115 + * 3116 + * u64 bpf_sk_cgroup_id(struct bpf_sock *sk) 3117 + * Description 3118 + * Return the cgroup v2 id of the socket *sk*. 3119 + * 3120 + * *sk* must be a non-**NULL** pointer to a full socket, e.g. one 3121 + * returned from **bpf_sk_lookup_xxx**\ (), 3122 + * **bpf_sk_fullsock**\ (), etc. The format of returned id is 3123 + * same as in **bpf_skb_cgroup_id**\ (). 3124 + * 3125 + * This helper is available only if the kernel was compiled with 3126 + * the **CONFIG_SOCK_CGROUP_DATA** configuration option. 3127 + * Return 3128 + * The id is returned or 0 in case the id could not be retrieved. 3129 + * 3130 + * u64 bpf_sk_ancestor_cgroup_id(struct bpf_sock *sk, int ancestor_level) 3131 + * Description 3132 + * Return id of cgroup v2 that is ancestor of cgroup associated 3133 + * with the *sk* at the *ancestor_level*. The root cgroup is at 3134 + * *ancestor_level* zero and each step down the hierarchy 3135 + * increments the level. If *ancestor_level* == level of cgroup 3136 + * associated with *sk*, then return value will be same as that 3137 + * of **bpf_sk_cgroup_id**\ (). 3138 + * 3139 + * The helper is useful to implement policies based on cgroups 3140 + * that are upper in hierarchy than immediate cgroup associated 3141 + * with *sk*. 3142 + * 3143 + * The format of returned id and helper limitations are same as in 3144 + * **bpf_sk_cgroup_id**\ (). 3145 + * Return 3146 + * The id is returned or 0 in case the id could not be retrieved. 
3080 3147 */ 3081 3148 #define __BPF_FUNC_MAPPER(FN) \ 3082 3149 FN(unspec), \ ··· 3280 3195 FN(get_netns_cookie), \ 3281 3196 FN(get_current_ancestor_cgroup_id), \ 3282 3197 FN(sk_assign), \ 3283 - FN(ktime_get_boot_ns), 3198 + FN(ktime_get_boot_ns), \ 3199 + FN(seq_printf), \ 3200 + FN(seq_write), \ 3201 + FN(sk_cgroup_id), \ 3202 + FN(sk_ancestor_cgroup_id), 3284 3203 3285 3204 /* integer value in 'imm' field of BPF_CALL instruction selects which helper 3286 3205 * function eBPF program intends to call ··· 3762 3673 __u32 user_ip6[4]; /* Allows 1,2,4,8-byte read and 4,8-byte write. 3763 3674 * Stored in network byte order. 3764 3675 */ 3765 - __u32 user_port; /* Allows 4-byte read and write. 3676 + __u32 user_port; /* Allows 1,2,4-byte read and 4-byte write. 3766 3677 * Stored in network byte order 3767 3678 */ 3768 3679 __u32 family; /* Allows 4-byte read, but no write */
+7 -1
include/uapi/linux/capability.h
··· 367 367 368 368 #define CAP_AUDIT_READ 37 369 369 370 + /* 371 + * Allow system performance and observability privileged operations 372 + * using perf_events, i915_perf and other kernel subsystems 373 + */ 370 374 371 - #define CAP_LAST_CAP CAP_AUDIT_READ 375 + #define CAP_PERFMON 38 376 + 377 + #define CAP_LAST_CAP CAP_PERFMON 372 378 373 379 #define cap_valid(x) ((x) >= 0 && (x) <= CAP_LAST_CAP) 374 380
+1 -1
kernel/bpf/Makefile
··· 2 2 obj-y := core.o 3 3 CFLAGS_core.o += $(call cc-disable-warning, override-init) 4 4 5 - obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o 5 + obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o 6 6 obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o 7 7 obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o 8 8 obj-$(CONFIG_BPF_SYSCALL) += disasm.o
+539
kernel/bpf/bpf_iter.c
··· 1 + // SPDX-License-Identifier: GPL-2.0-only 2 + /* Copyright (c) 2020 Facebook */ 3 + 4 + #include <linux/fs.h> 5 + #include <linux/anon_inodes.h> 6 + #include <linux/filter.h> 7 + #include <linux/bpf.h> 8 + 9 + struct bpf_iter_target_info { 10 + struct list_head list; 11 + const struct bpf_iter_reg *reg_info; 12 + u32 btf_id; /* cached value */ 13 + }; 14 + 15 + struct bpf_iter_link { 16 + struct bpf_link link; 17 + struct bpf_iter_target_info *tinfo; 18 + }; 19 + 20 + struct bpf_iter_priv_data { 21 + struct bpf_iter_target_info *tinfo; 22 + struct bpf_prog *prog; 23 + u64 session_id; 24 + u64 seq_num; 25 + bool done_stop; 26 + u8 target_private[] __aligned(8); 27 + }; 28 + 29 + static struct list_head targets = LIST_HEAD_INIT(targets); 30 + static DEFINE_MUTEX(targets_mutex); 31 + 32 + /* protect bpf_iter_link changes */ 33 + static DEFINE_MUTEX(link_mutex); 34 + 35 + /* incremented on every opened seq_file */ 36 + static atomic64_t session_id; 37 + 38 + static int prepare_seq_file(struct file *file, struct bpf_iter_link *link); 39 + 40 + static void bpf_iter_inc_seq_num(struct seq_file *seq) 41 + { 42 + struct bpf_iter_priv_data *iter_priv; 43 + 44 + iter_priv = container_of(seq->private, struct bpf_iter_priv_data, 45 + target_private); 46 + iter_priv->seq_num++; 47 + } 48 + 49 + static void bpf_iter_dec_seq_num(struct seq_file *seq) 50 + { 51 + struct bpf_iter_priv_data *iter_priv; 52 + 53 + iter_priv = container_of(seq->private, struct bpf_iter_priv_data, 54 + target_private); 55 + iter_priv->seq_num--; 56 + } 57 + 58 + static void bpf_iter_done_stop(struct seq_file *seq) 59 + { 60 + struct bpf_iter_priv_data *iter_priv; 61 + 62 + iter_priv = container_of(seq->private, struct bpf_iter_priv_data, 63 + target_private); 64 + iter_priv->done_stop = true; 65 + } 66 + 67 + /* bpf_seq_read, a customized and simpler version for bpf iterator. 68 + * no_llseek is assumed for this file. 69 + * The following are differences from seq_read(): 70 + * . 
fixed buffer size (PAGE_SIZE) 71 + * . assuming no_llseek 72 + * . stop() may call bpf program, handling potential overflow there 73 + */ 74 + static ssize_t bpf_seq_read(struct file *file, char __user *buf, size_t size, 75 + loff_t *ppos) 76 + { 77 + struct seq_file *seq = file->private_data; 78 + size_t n, offs, copied = 0; 79 + int err = 0; 80 + void *p; 81 + 82 + mutex_lock(&seq->lock); 83 + 84 + if (!seq->buf) { 85 + seq->size = PAGE_SIZE; 86 + seq->buf = kmalloc(seq->size, GFP_KERNEL); 87 + if (!seq->buf) { 88 + err = -ENOMEM; 89 + goto done; 90 + } 91 + } 92 + 93 + if (seq->count) { 94 + n = min(seq->count, size); 95 + err = copy_to_user(buf, seq->buf + seq->from, n); 96 + if (err) { 97 + err = -EFAULT; 98 + goto done; 99 + } 100 + seq->count -= n; 101 + seq->from += n; 102 + copied = n; 103 + goto done; 104 + } 105 + 106 + seq->from = 0; 107 + p = seq->op->start(seq, &seq->index); 108 + if (!p) 109 + goto stop; 110 + if (IS_ERR(p)) { 111 + err = PTR_ERR(p); 112 + seq->op->stop(seq, p); 113 + seq->count = 0; 114 + goto done; 115 + } 116 + 117 + err = seq->op->show(seq, p); 118 + if (err > 0) { 119 + /* object is skipped, decrease seq_num, so next 120 + * valid object can reuse the same seq_num. 
121 + */ 122 + bpf_iter_dec_seq_num(seq); 123 + seq->count = 0; 124 + } else if (err < 0 || seq_has_overflowed(seq)) { 125 + if (!err) 126 + err = -E2BIG; 127 + seq->op->stop(seq, p); 128 + seq->count = 0; 129 + goto done; 130 + } 131 + 132 + while (1) { 133 + loff_t pos = seq->index; 134 + 135 + offs = seq->count; 136 + p = seq->op->next(seq, p, &seq->index); 137 + if (pos == seq->index) { 138 + pr_info_ratelimited("buggy seq_file .next function %ps " 139 + "did not updated position index\n", 140 + seq->op->next); 141 + seq->index++; 142 + } 143 + 144 + if (IS_ERR_OR_NULL(p)) 145 + break; 146 + 147 + /* got a valid next object, increase seq_num */ 148 + bpf_iter_inc_seq_num(seq); 149 + 150 + if (seq->count >= size) 151 + break; 152 + 153 + err = seq->op->show(seq, p); 154 + if (err > 0) { 155 + bpf_iter_dec_seq_num(seq); 156 + seq->count = offs; 157 + } else if (err < 0 || seq_has_overflowed(seq)) { 158 + seq->count = offs; 159 + if (offs == 0) { 160 + if (!err) 161 + err = -E2BIG; 162 + seq->op->stop(seq, p); 163 + goto done; 164 + } 165 + break; 166 + } 167 + } 168 + stop: 169 + offs = seq->count; 170 + /* bpf program called if !p */ 171 + seq->op->stop(seq, p); 172 + if (!p) { 173 + if (!seq_has_overflowed(seq)) { 174 + bpf_iter_done_stop(seq); 175 + } else { 176 + seq->count = offs; 177 + if (offs == 0) { 178 + err = -E2BIG; 179 + goto done; 180 + } 181 + } 182 + } 183 + 184 + n = min(seq->count, size); 185 + err = copy_to_user(buf, seq->buf, n); 186 + if (err) { 187 + err = -EFAULT; 188 + goto done; 189 + } 190 + copied = n; 191 + seq->count -= n; 192 + seq->from = n; 193 + done: 194 + if (!copied) 195 + copied = err; 196 + else 197 + *ppos += copied; 198 + mutex_unlock(&seq->lock); 199 + return copied; 200 + } 201 + 202 + static int iter_open(struct inode *inode, struct file *file) 203 + { 204 + struct bpf_iter_link *link = inode->i_private; 205 + 206 + return prepare_seq_file(file, link); 207 + } 208 + 209 + static int iter_release(struct inode *inode, 
struct file *file) 210 + { 211 + struct bpf_iter_priv_data *iter_priv; 212 + struct seq_file *seq; 213 + 214 + seq = file->private_data; 215 + if (!seq) 216 + return 0; 217 + 218 + iter_priv = container_of(seq->private, struct bpf_iter_priv_data, 219 + target_private); 220 + 221 + if (iter_priv->tinfo->reg_info->fini_seq_private) 222 + iter_priv->tinfo->reg_info->fini_seq_private(seq->private); 223 + 224 + bpf_prog_put(iter_priv->prog); 225 + seq->private = iter_priv; 226 + 227 + return seq_release_private(inode, file); 228 + } 229 + 230 + const struct file_operations bpf_iter_fops = { 231 + .open = iter_open, 232 + .llseek = no_llseek, 233 + .read = bpf_seq_read, 234 + .release = iter_release, 235 + }; 236 + 237 + /* The argument reg_info will be cached in bpf_iter_target_info. 238 + * The common practice is to declare target reg_info as 239 + * a const static variable and passed as an argument to 240 + * bpf_iter_reg_target(). 241 + */ 242 + int bpf_iter_reg_target(const struct bpf_iter_reg *reg_info) 243 + { 244 + struct bpf_iter_target_info *tinfo; 245 + 246 + tinfo = kmalloc(sizeof(*tinfo), GFP_KERNEL); 247 + if (!tinfo) 248 + return -ENOMEM; 249 + 250 + tinfo->reg_info = reg_info; 251 + INIT_LIST_HEAD(&tinfo->list); 252 + 253 + mutex_lock(&targets_mutex); 254 + list_add(&tinfo->list, &targets); 255 + mutex_unlock(&targets_mutex); 256 + 257 + return 0; 258 + } 259 + 260 + void bpf_iter_unreg_target(const struct bpf_iter_reg *reg_info) 261 + { 262 + struct bpf_iter_target_info *tinfo; 263 + bool found = false; 264 + 265 + mutex_lock(&targets_mutex); 266 + list_for_each_entry(tinfo, &targets, list) { 267 + if (reg_info == tinfo->reg_info) { 268 + list_del(&tinfo->list); 269 + kfree(tinfo); 270 + found = true; 271 + break; 272 + } 273 + } 274 + mutex_unlock(&targets_mutex); 275 + 276 + WARN_ON(found == false); 277 + } 278 + 279 + static void cache_btf_id(struct bpf_iter_target_info *tinfo, 280 + struct bpf_prog *prog) 281 + { 282 + tinfo->btf_id = 
prog->aux->attach_btf_id; 283 + } 284 + 285 + bool bpf_iter_prog_supported(struct bpf_prog *prog) 286 + { 287 + const char *attach_fname = prog->aux->attach_func_name; 288 + u32 prog_btf_id = prog->aux->attach_btf_id; 289 + const char *prefix = BPF_ITER_FUNC_PREFIX; 290 + struct bpf_iter_target_info *tinfo; 291 + int prefix_len = strlen(prefix); 292 + bool supported = false; 293 + 294 + if (strncmp(attach_fname, prefix, prefix_len)) 295 + return false; 296 + 297 + mutex_lock(&targets_mutex); 298 + list_for_each_entry(tinfo, &targets, list) { 299 + if (tinfo->btf_id && tinfo->btf_id == prog_btf_id) { 300 + supported = true; 301 + break; 302 + } 303 + if (!strcmp(attach_fname + prefix_len, tinfo->reg_info->target)) { 304 + cache_btf_id(tinfo, prog); 305 + supported = true; 306 + break; 307 + } 308 + } 309 + mutex_unlock(&targets_mutex); 310 + 311 + if (supported) { 312 + prog->aux->ctx_arg_info_size = tinfo->reg_info->ctx_arg_info_size; 313 + prog->aux->ctx_arg_info = tinfo->reg_info->ctx_arg_info; 314 + } 315 + 316 + return supported; 317 + } 318 + 319 + static void bpf_iter_link_release(struct bpf_link *link) 320 + { 321 + } 322 + 323 + static void bpf_iter_link_dealloc(struct bpf_link *link) 324 + { 325 + struct bpf_iter_link *iter_link = 326 + container_of(link, struct bpf_iter_link, link); 327 + 328 + kfree(iter_link); 329 + } 330 + 331 + static int bpf_iter_link_replace(struct bpf_link *link, 332 + struct bpf_prog *new_prog, 333 + struct bpf_prog *old_prog) 334 + { 335 + int ret = 0; 336 + 337 + mutex_lock(&link_mutex); 338 + if (old_prog && link->prog != old_prog) { 339 + ret = -EPERM; 340 + goto out_unlock; 341 + } 342 + 343 + if (link->prog->type != new_prog->type || 344 + link->prog->expected_attach_type != new_prog->expected_attach_type || 345 + link->prog->aux->attach_btf_id != new_prog->aux->attach_btf_id) { 346 + ret = -EINVAL; 347 + goto out_unlock; 348 + } 349 + 350 + old_prog = xchg(&link->prog, new_prog); 351 + bpf_prog_put(old_prog); 352 + 353 + 
out_unlock: 354 + mutex_unlock(&link_mutex); 355 + return ret; 356 + } 357 + 358 + static const struct bpf_link_ops bpf_iter_link_lops = { 359 + .release = bpf_iter_link_release, 360 + .dealloc = bpf_iter_link_dealloc, 361 + .update_prog = bpf_iter_link_replace, 362 + }; 363 + 364 + bool bpf_link_is_iter(struct bpf_link *link) 365 + { 366 + return link->ops == &bpf_iter_link_lops; 367 + } 368 + 369 + int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) 370 + { 371 + struct bpf_link_primer link_primer; 372 + struct bpf_iter_target_info *tinfo; 373 + struct bpf_iter_link *link; 374 + bool existed = false; 375 + u32 prog_btf_id; 376 + int err; 377 + 378 + if (attr->link_create.target_fd || attr->link_create.flags) 379 + return -EINVAL; 380 + 381 + prog_btf_id = prog->aux->attach_btf_id; 382 + mutex_lock(&targets_mutex); 383 + list_for_each_entry(tinfo, &targets, list) { 384 + if (tinfo->btf_id == prog_btf_id) { 385 + existed = true; 386 + break; 387 + } 388 + } 389 + mutex_unlock(&targets_mutex); 390 + if (!existed) 391 + return -ENOENT; 392 + 393 + link = kzalloc(sizeof(*link), GFP_USER | __GFP_NOWARN); 394 + if (!link) 395 + return -ENOMEM; 396 + 397 + bpf_link_init(&link->link, BPF_LINK_TYPE_ITER, &bpf_iter_link_lops, prog); 398 + link->tinfo = tinfo; 399 + 400 + err = bpf_link_prime(&link->link, &link_primer); 401 + if (err) { 402 + kfree(link); 403 + return err; 404 + } 405 + 406 + return bpf_link_settle(&link_primer); 407 + } 408 + 409 + static void init_seq_meta(struct bpf_iter_priv_data *priv_data, 410 + struct bpf_iter_target_info *tinfo, 411 + struct bpf_prog *prog) 412 + { 413 + priv_data->tinfo = tinfo; 414 + priv_data->prog = prog; 415 + priv_data->session_id = atomic64_inc_return(&session_id); 416 + priv_data->seq_num = 0; 417 + priv_data->done_stop = false; 418 + } 419 + 420 + static int prepare_seq_file(struct file *file, struct bpf_iter_link *link) 421 + { 422 + struct bpf_iter_priv_data *priv_data; 423 + struct 
bpf_iter_target_info *tinfo; 424 + struct bpf_prog *prog; 425 + u32 total_priv_dsize; 426 + struct seq_file *seq; 427 + int err = 0; 428 + 429 + mutex_lock(&link_mutex); 430 + prog = link->link.prog; 431 + bpf_prog_inc(prog); 432 + mutex_unlock(&link_mutex); 433 + 434 + tinfo = link->tinfo; 435 + total_priv_dsize = offsetof(struct bpf_iter_priv_data, target_private) + 436 + tinfo->reg_info->seq_priv_size; 437 + priv_data = __seq_open_private(file, tinfo->reg_info->seq_ops, 438 + total_priv_dsize); 439 + if (!priv_data) { 440 + err = -ENOMEM; 441 + goto release_prog; 442 + } 443 + 444 + if (tinfo->reg_info->init_seq_private) { 445 + err = tinfo->reg_info->init_seq_private(priv_data->target_private); 446 + if (err) 447 + goto release_seq_file; 448 + } 449 + 450 + init_seq_meta(priv_data, tinfo, prog); 451 + seq = file->private_data; 452 + seq->private = priv_data->target_private; 453 + 454 + return 0; 455 + 456 + release_seq_file: 457 + seq_release_private(file->f_inode, file); 458 + file->private_data = NULL; 459 + release_prog: 460 + bpf_prog_put(prog); 461 + return err; 462 + } 463 + 464 + int bpf_iter_new_fd(struct bpf_link *link) 465 + { 466 + struct file *file; 467 + unsigned int flags; 468 + int err, fd; 469 + 470 + if (link->ops != &bpf_iter_link_lops) 471 + return -EINVAL; 472 + 473 + flags = O_RDONLY | O_CLOEXEC; 474 + fd = get_unused_fd_flags(flags); 475 + if (fd < 0) 476 + return fd; 477 + 478 + file = anon_inode_getfile("bpf_iter", &bpf_iter_fops, NULL, flags); 479 + if (IS_ERR(file)) { 480 + err = PTR_ERR(file); 481 + goto free_fd; 482 + } 483 + 484 + err = prepare_seq_file(file, 485 + container_of(link, struct bpf_iter_link, link)); 486 + if (err) 487 + goto free_file; 488 + 489 + fd_install(fd, file); 490 + return fd; 491 + 492 + free_file: 493 + fput(file); 494 + free_fd: 495 + put_unused_fd(fd); 496 + return err; 497 + } 498 + 499 + struct bpf_prog *bpf_iter_get_info(struct bpf_iter_meta *meta, bool in_stop) 500 + { 501 + struct bpf_iter_priv_data 
*iter_priv; 502 + struct seq_file *seq; 503 + void *seq_priv; 504 + 505 + seq = meta->seq; 506 + if (seq->file->f_op != &bpf_iter_fops) 507 + return NULL; 508 + 509 + seq_priv = seq->private; 510 + iter_priv = container_of(seq_priv, struct bpf_iter_priv_data, 511 + target_private); 512 + 513 + if (in_stop && iter_priv->done_stop) 514 + return NULL; 515 + 516 + meta->session_id = iter_priv->session_id; 517 + meta->seq_num = iter_priv->seq_num; 518 + 519 + return iter_priv->prog; 520 + } 521 + 522 + int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx) 523 + { 524 + int ret; 525 + 526 + rcu_read_lock(); 527 + migrate_disable(); 528 + ret = BPF_PROG_RUN(prog, ctx); 529 + migrate_enable(); 530 + rcu_read_unlock(); 531 + 532 + /* bpf program can only return 0 or 1: 533 + * 0 : okay 534 + * 1 : retry the same object 535 + * The bpf_iter_run_prog() return value 536 + * will be seq_ops->show() return value. 537 + */ 538 + return ret == 0 ? 0 : -EAGAIN; 539 + }
+46 -1
kernel/bpf/btf.c
··· 3694 3694 struct bpf_verifier_log *log = info->log; 3695 3695 const struct btf_param *args; 3696 3696 u32 nr_args, arg; 3697 - int ret; 3697 + int i, ret; 3698 3698 3699 3699 if (off % 8) { 3700 3700 bpf_log(log, "func '%s' offset %d is not multiple of 8\n", ··· 3791 3791 3792 3792 /* this is a pointer to another type */ 3793 3793 info->reg_type = PTR_TO_BTF_ID; 3794 + for (i = 0; i < prog->aux->ctx_arg_info_size; i++) { 3795 + const struct bpf_ctx_arg_aux *ctx_arg_info = &prog->aux->ctx_arg_info[i]; 3796 + 3797 + if (ctx_arg_info->offset == off) { 3798 + info->reg_type = ctx_arg_info->reg_type; 3799 + break; 3800 + } 3801 + } 3794 3802 3795 3803 if (tgt_prog) { 3796 3804 ret = btf_translate_to_vmlinux(log, btf, t, tgt_prog->type, arg); ··· 3838 3830 const struct btf_type *mtype, *elem_type = NULL; 3839 3831 const struct btf_member *member; 3840 3832 const char *tname, *mname; 3833 + u32 vlen; 3841 3834 3842 3835 again: 3843 3836 tname = __btf_name_by_offset(btf_vmlinux, t->name_off); ··· 3847 3838 return -EINVAL; 3848 3839 } 3849 3840 3841 + vlen = btf_type_vlen(t); 3850 3842 if (off + size > t->size) { 3843 + /* If the last element is a variable size array, we may 3844 + * need to relax the rule. 3845 + */ 3846 + struct btf_array *array_elem; 3847 + 3848 + if (vlen == 0) 3849 + goto error; 3850 + 3851 + member = btf_type_member(t) + vlen - 1; 3852 + mtype = btf_type_skip_modifiers(btf_vmlinux, member->type, 3853 + NULL); 3854 + if (!btf_type_is_array(mtype)) 3855 + goto error; 3856 + 3857 + array_elem = (struct btf_array *)(mtype + 1); 3858 + if (array_elem->nelems != 0) 3859 + goto error; 3860 + 3861 + moff = btf_member_bit_offset(t, member) / 8; 3862 + if (off < moff) 3863 + goto error; 3864 + 3865 + /* Only allow structure for now, can be relaxed for 3866 + * other types later. 
3867 + */ 3868 + elem_type = btf_type_skip_modifiers(btf_vmlinux, 3869 + array_elem->type, NULL); 3870 + if (!btf_type_is_struct(elem_type)) 3871 + goto error; 3872 + 3873 + off = (off - moff) % elem_type->size; 3874 + return btf_struct_access(log, elem_type, off, size, atype, 3875 + next_btf_id); 3876 + 3877 + error: 3851 3878 bpf_log(log, "access beyond struct %s at off %u size %u\n", 3852 3879 tname, off, size); 3853 3880 return -EACCES;
+4 -1
kernel/bpf/inode.c
··· 358 358 359 359 static int bpf_mklink(struct dentry *dentry, umode_t mode, void *arg) 360 360 { 361 + struct bpf_link *link = arg; 362 + 361 363 return bpf_mkobj_ops(dentry, mode, arg, &bpf_link_iops, 362 - &bpffs_obj_fops); 364 + bpf_link_is_iter(link) ? 365 + &bpf_iter_fops : &bpffs_obj_fops); 363 366 } 364 367 365 368 static struct dentry *
+102
kernel/bpf/map_iter.c
··· 1 + // SPDX-License-Identifier: GPL-2.0-only 2 + /* Copyright (c) 2020 Facebook */ 3 + #include <linux/bpf.h> 4 + #include <linux/fs.h> 5 + #include <linux/filter.h> 6 + #include <linux/kernel.h> 7 + 8 + struct bpf_iter_seq_map_info { 9 + u32 mid; 10 + }; 11 + 12 + static void *bpf_map_seq_start(struct seq_file *seq, loff_t *pos) 13 + { 14 + struct bpf_iter_seq_map_info *info = seq->private; 15 + struct bpf_map *map; 16 + 17 + map = bpf_map_get_curr_or_next(&info->mid); 18 + if (!map) 19 + return NULL; 20 + 21 + ++*pos; 22 + return map; 23 + } 24 + 25 + static void *bpf_map_seq_next(struct seq_file *seq, void *v, loff_t *pos) 26 + { 27 + struct bpf_iter_seq_map_info *info = seq->private; 28 + struct bpf_map *map; 29 + 30 + ++*pos; 31 + ++info->mid; 32 + bpf_map_put((struct bpf_map *)v); 33 + map = bpf_map_get_curr_or_next(&info->mid); 34 + if (!map) 35 + return NULL; 36 + 37 + return map; 38 + } 39 + 40 + struct bpf_iter__bpf_map { 41 + __bpf_md_ptr(struct bpf_iter_meta *, meta); 42 + __bpf_md_ptr(struct bpf_map *, map); 43 + }; 44 + 45 + DEFINE_BPF_ITER_FUNC(bpf_map, struct bpf_iter_meta *meta, struct bpf_map *map) 46 + 47 + static int __bpf_map_seq_show(struct seq_file *seq, void *v, bool in_stop) 48 + { 49 + struct bpf_iter__bpf_map ctx; 50 + struct bpf_iter_meta meta; 51 + struct bpf_prog *prog; 52 + int ret = 0; 53 + 54 + ctx.meta = &meta; 55 + ctx.map = v; 56 + meta.seq = seq; 57 + prog = bpf_iter_get_info(&meta, in_stop); 58 + if (prog) 59 + ret = bpf_iter_run_prog(prog, &ctx); 60 + 61 + return ret; 62 + } 63 + 64 + static int bpf_map_seq_show(struct seq_file *seq, void *v) 65 + { 66 + return __bpf_map_seq_show(seq, v, false); 67 + } 68 + 69 + static void bpf_map_seq_stop(struct seq_file *seq, void *v) 70 + { 71 + if (!v) 72 + (void)__bpf_map_seq_show(seq, v, true); 73 + else 74 + bpf_map_put((struct bpf_map *)v); 75 + } 76 + 77 + static const struct seq_operations bpf_map_seq_ops = { 78 + .start = bpf_map_seq_start, 79 + .next = bpf_map_seq_next, 80 + 
.stop = bpf_map_seq_stop, 81 + .show = bpf_map_seq_show, 82 + }; 83 + 84 + static const struct bpf_iter_reg bpf_map_reg_info = { 85 + .target = "bpf_map", 86 + .seq_ops = &bpf_map_seq_ops, 87 + .init_seq_private = NULL, 88 + .fini_seq_private = NULL, 89 + .seq_priv_size = sizeof(struct bpf_iter_seq_map_info), 90 + .ctx_arg_info_size = 1, 91 + .ctx_arg_info = { 92 + { offsetof(struct bpf_iter__bpf_map, map), 93 + PTR_TO_BTF_ID_OR_NULL }, 94 + }, 95 + }; 96 + 97 + static int __init bpf_map_iter_init(void) 98 + { 99 + return bpf_iter_reg_target(&bpf_map_reg_info); 100 + } 101 + 102 + late_initcall(bpf_map_iter_init);
+1 -1
kernel/bpf/queue_stack_maps.c
··· 19 19 u32 head, tail; 20 20 u32 size; /* max_entries + 1 */ 21 21 22 - char elements[0] __aligned(8); 22 + char elements[] __aligned(8); 23 23 }; 24 24 25 25 static struct bpf_queue_stack *bpf_queue_stack(struct bpf_map *map)
+59
kernel/bpf/syscall.c
··· 2729 2729 case BPF_CGROUP_GETSOCKOPT: 2730 2730 case BPF_CGROUP_SETSOCKOPT: 2731 2731 return BPF_PROG_TYPE_CGROUP_SOCKOPT; 2732 + case BPF_TRACE_ITER: 2733 + return BPF_PROG_TYPE_TRACING; 2732 2734 default: 2733 2735 return BPF_PROG_TYPE_UNSPEC; 2734 2736 } ··· 2932 2930 err = put_user(next_id, &uattr->next_id); 2933 2931 2934 2932 return err; 2933 + } 2934 + 2935 + struct bpf_map *bpf_map_get_curr_or_next(u32 *id) 2936 + { 2937 + struct bpf_map *map; 2938 + 2939 + spin_lock_bh(&map_idr_lock); 2940 + again: 2941 + map = idr_get_next(&map_idr, id); 2942 + if (map) { 2943 + map = __bpf_map_inc_not_zero(map, false); 2944 + if (IS_ERR(map)) { 2945 + (*id)++; 2946 + goto again; 2947 + } 2948 + } 2949 + spin_unlock_bh(&map_idr_lock); 2950 + 2951 + return map; 2935 2952 } 2936 2953 2937 2954 #define BPF_PROG_GET_FD_BY_ID_LAST_FIELD prog_id ··· 3750 3729 return err; 3751 3730 } 3752 3731 3732 + static int tracing_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) 3733 + { 3734 + if (attr->link_create.attach_type == BPF_TRACE_ITER && 3735 + prog->expected_attach_type == BPF_TRACE_ITER) 3736 + return bpf_iter_link_attach(attr, prog); 3737 + 3738 + return -EINVAL; 3739 + } 3740 + 3753 3741 #define BPF_LINK_CREATE_LAST_FIELD link_create.flags 3754 3742 static int link_create(union bpf_attr *attr) 3755 3743 { ··· 3794 3764 case BPF_PROG_TYPE_CGROUP_SYSCTL: 3795 3765 case BPF_PROG_TYPE_CGROUP_SOCKOPT: 3796 3766 ret = cgroup_bpf_link_attach(attr, prog); 3767 + break; 3768 + case BPF_PROG_TYPE_TRACING: 3769 + ret = tracing_bpf_link_attach(attr, prog); 3797 3770 break; 3798 3771 default: 3799 3772 ret = -EINVAL; ··· 3960 3927 return -EINVAL; 3961 3928 } 3962 3929 3930 + #define BPF_ITER_CREATE_LAST_FIELD iter_create.flags 3931 + 3932 + static int bpf_iter_create(union bpf_attr *attr) 3933 + { 3934 + struct bpf_link *link; 3935 + int err; 3936 + 3937 + if (CHECK_ATTR(BPF_ITER_CREATE)) 3938 + return -EINVAL; 3939 + 3940 + if (attr->iter_create.flags) 3941 + return 
-EINVAL; 3942 + 3943 + link = bpf_link_get_from_fd(attr->iter_create.link_fd); 3944 + if (IS_ERR(link)) 3945 + return PTR_ERR(link); 3946 + 3947 + err = bpf_iter_new_fd(link); 3948 + bpf_link_put(link); 3949 + 3950 + return err; 3951 + } 3952 + 3963 3953 SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) 3964 3954 { 3965 3955 union bpf_attr attr; ··· 4109 4053 break; 4110 4054 case BPF_ENABLE_STATS: 4111 4055 err = bpf_enable_stats(&attr); 4056 + break; 4057 + case BPF_ITER_CREATE: 4058 + err = bpf_iter_create(&attr); 4112 4059 break; 4113 4060 default: 4114 4061 err = -EINVAL;
+353
kernel/bpf/task_iter.c
··· 1 + // SPDX-License-Identifier: GPL-2.0-only 2 + /* Copyright (c) 2020 Facebook */ 3 + 4 + #include <linux/init.h> 5 + #include <linux/namei.h> 6 + #include <linux/pid_namespace.h> 7 + #include <linux/fs.h> 8 + #include <linux/fdtable.h> 9 + #include <linux/filter.h> 10 + 11 + struct bpf_iter_seq_task_common { 12 + struct pid_namespace *ns; 13 + }; 14 + 15 + struct bpf_iter_seq_task_info { 16 + /* The first field must be struct bpf_iter_seq_task_common. 17 + * this is assumed by {init, fini}_seq_pidns() callback functions. 18 + */ 19 + struct bpf_iter_seq_task_common common; 20 + u32 tid; 21 + }; 22 + 23 + static struct task_struct *task_seq_get_next(struct pid_namespace *ns, 24 + u32 *tid) 25 + { 26 + struct task_struct *task = NULL; 27 + struct pid *pid; 28 + 29 + rcu_read_lock(); 30 + retry: 31 + pid = idr_get_next(&ns->idr, tid); 32 + if (pid) { 33 + task = get_pid_task(pid, PIDTYPE_PID); 34 + if (!task) { 35 + ++*tid; 36 + goto retry; 37 + } 38 + } 39 + rcu_read_unlock(); 40 + 41 + return task; 42 + } 43 + 44 + static void *task_seq_start(struct seq_file *seq, loff_t *pos) 45 + { 46 + struct bpf_iter_seq_task_info *info = seq->private; 47 + struct task_struct *task; 48 + 49 + task = task_seq_get_next(info->common.ns, &info->tid); 50 + if (!task) 51 + return NULL; 52 + 53 + ++*pos; 54 + return task; 55 + } 56 + 57 + static void *task_seq_next(struct seq_file *seq, void *v, loff_t *pos) 58 + { 59 + struct bpf_iter_seq_task_info *info = seq->private; 60 + struct task_struct *task; 61 + 62 + ++*pos; 63 + ++info->tid; 64 + put_task_struct((struct task_struct *)v); 65 + task = task_seq_get_next(info->common.ns, &info->tid); 66 + if (!task) 67 + return NULL; 68 + 69 + return task; 70 + } 71 + 72 + struct bpf_iter__task { 73 + __bpf_md_ptr(struct bpf_iter_meta *, meta); 74 + __bpf_md_ptr(struct task_struct *, task); 75 + }; 76 + 77 + DEFINE_BPF_ITER_FUNC(task, struct bpf_iter_meta *meta, struct task_struct *task) 78 + 79 + static int __task_seq_show(struct 
seq_file *seq, struct task_struct *task, 80 + bool in_stop) 81 + { 82 + struct bpf_iter_meta meta; 83 + struct bpf_iter__task ctx; 84 + struct bpf_prog *prog; 85 + 86 + meta.seq = seq; 87 + prog = bpf_iter_get_info(&meta, in_stop); 88 + if (!prog) 89 + return 0; 90 + 91 + meta.seq = seq; 92 + ctx.meta = &meta; 93 + ctx.task = task; 94 + return bpf_iter_run_prog(prog, &ctx); 95 + } 96 + 97 + static int task_seq_show(struct seq_file *seq, void *v) 98 + { 99 + return __task_seq_show(seq, v, false); 100 + } 101 + 102 + static void task_seq_stop(struct seq_file *seq, void *v) 103 + { 104 + if (!v) 105 + (void)__task_seq_show(seq, v, true); 106 + else 107 + put_task_struct((struct task_struct *)v); 108 + } 109 + 110 + static const struct seq_operations task_seq_ops = { 111 + .start = task_seq_start, 112 + .next = task_seq_next, 113 + .stop = task_seq_stop, 114 + .show = task_seq_show, 115 + }; 116 + 117 + struct bpf_iter_seq_task_file_info { 118 + /* The first field must be struct bpf_iter_seq_task_common. 119 + * this is assumed by {init, fini}_seq_pidns() callback functions. 120 + */ 121 + struct bpf_iter_seq_task_common common; 122 + struct task_struct *task; 123 + struct files_struct *files; 124 + u32 tid; 125 + u32 fd; 126 + }; 127 + 128 + static struct file * 129 + task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info, 130 + struct task_struct **task, struct files_struct **fstruct) 131 + { 132 + struct pid_namespace *ns = info->common.ns; 133 + u32 curr_tid = info->tid, max_fds; 134 + struct files_struct *curr_files; 135 + struct task_struct *curr_task; 136 + int curr_fd = info->fd; 137 + 138 + /* If this function returns a non-NULL file object, 139 + * it held a reference to the task/files_struct/file. 140 + * Otherwise, it does not hold any reference. 
141 + */ 142 + again: 143 + if (*task) { 144 + curr_task = *task; 145 + curr_files = *fstruct; 146 + curr_fd = info->fd; 147 + } else { 148 + curr_task = task_seq_get_next(ns, &curr_tid); 149 + if (!curr_task) 150 + return NULL; 151 + 152 + curr_files = get_files_struct(curr_task); 153 + if (!curr_files) { 154 + put_task_struct(curr_task); 155 + curr_tid = ++(info->tid); 156 + info->fd = 0; 157 + goto again; 158 + } 159 + 160 + /* set *fstruct, *task and info->tid */ 161 + *fstruct = curr_files; 162 + *task = curr_task; 163 + if (curr_tid == info->tid) { 164 + curr_fd = info->fd; 165 + } else { 166 + info->tid = curr_tid; 167 + curr_fd = 0; 168 + } 169 + } 170 + 171 + rcu_read_lock(); 172 + max_fds = files_fdtable(curr_files)->max_fds; 173 + for (; curr_fd < max_fds; curr_fd++) { 174 + struct file *f; 175 + 176 + f = fcheck_files(curr_files, curr_fd); 177 + if (!f) 178 + continue; 179 + 180 + /* set info->fd */ 181 + info->fd = curr_fd; 182 + get_file(f); 183 + rcu_read_unlock(); 184 + return f; 185 + } 186 + 187 + /* the current task is done, go to the next task */ 188 + rcu_read_unlock(); 189 + put_files_struct(curr_files); 190 + put_task_struct(curr_task); 191 + *task = NULL; 192 + *fstruct = NULL; 193 + info->fd = 0; 194 + curr_tid = ++(info->tid); 195 + goto again; 196 + } 197 + 198 + static void *task_file_seq_start(struct seq_file *seq, loff_t *pos) 199 + { 200 + struct bpf_iter_seq_task_file_info *info = seq->private; 201 + struct files_struct *files = NULL; 202 + struct task_struct *task = NULL; 203 + struct file *file; 204 + 205 + file = task_file_seq_get_next(info, &task, &files); 206 + if (!file) { 207 + info->files = NULL; 208 + info->task = NULL; 209 + return NULL; 210 + } 211 + 212 + ++*pos; 213 + info->task = task; 214 + info->files = files; 215 + 216 + return file; 217 + } 218 + 219 + static void *task_file_seq_next(struct seq_file *seq, void *v, loff_t *pos) 220 + { 221 + struct bpf_iter_seq_task_file_info *info = seq->private; 222 + struct 
files_struct *files = info->files; 223 + struct task_struct *task = info->task; 224 + struct file *file; 225 + 226 + ++*pos; 227 + ++info->fd; 228 + fput((struct file *)v); 229 + file = task_file_seq_get_next(info, &task, &files); 230 + if (!file) { 231 + info->files = NULL; 232 + info->task = NULL; 233 + return NULL; 234 + } 235 + 236 + info->task = task; 237 + info->files = files; 238 + 239 + return file; 240 + } 241 + 242 + struct bpf_iter__task_file { 243 + __bpf_md_ptr(struct bpf_iter_meta *, meta); 244 + __bpf_md_ptr(struct task_struct *, task); 245 + u32 fd __aligned(8); 246 + __bpf_md_ptr(struct file *, file); 247 + }; 248 + 249 + DEFINE_BPF_ITER_FUNC(task_file, struct bpf_iter_meta *meta, 250 + struct task_struct *task, u32 fd, 251 + struct file *file) 252 + 253 + static int __task_file_seq_show(struct seq_file *seq, struct file *file, 254 + bool in_stop) 255 + { 256 + struct bpf_iter_seq_task_file_info *info = seq->private; 257 + struct bpf_iter__task_file ctx; 258 + struct bpf_iter_meta meta; 259 + struct bpf_prog *prog; 260 + 261 + meta.seq = seq; 262 + prog = bpf_iter_get_info(&meta, in_stop); 263 + if (!prog) 264 + return 0; 265 + 266 + ctx.meta = &meta; 267 + ctx.task = info->task; 268 + ctx.fd = info->fd; 269 + ctx.file = file; 270 + return bpf_iter_run_prog(prog, &ctx); 271 + } 272 + 273 + static int task_file_seq_show(struct seq_file *seq, void *v) 274 + { 275 + return __task_file_seq_show(seq, v, false); 276 + } 277 + 278 + static void task_file_seq_stop(struct seq_file *seq, void *v) 279 + { 280 + struct bpf_iter_seq_task_file_info *info = seq->private; 281 + 282 + if (!v) { 283 + (void)__task_file_seq_show(seq, v, true); 284 + } else { 285 + fput((struct file *)v); 286 + put_files_struct(info->files); 287 + put_task_struct(info->task); 288 + info->files = NULL; 289 + info->task = NULL; 290 + } 291 + } 292 + 293 + static int init_seq_pidns(void *priv_data) 294 + { 295 + struct bpf_iter_seq_task_common *common = priv_data; 296 + 297 + common->ns 
= get_pid_ns(task_active_pid_ns(current)); 298 + return 0; 299 + } 300 + 301 + static void fini_seq_pidns(void *priv_data) 302 + { 303 + struct bpf_iter_seq_task_common *common = priv_data; 304 + 305 + put_pid_ns(common->ns); 306 + } 307 + 308 + static const struct seq_operations task_file_seq_ops = { 309 + .start = task_file_seq_start, 310 + .next = task_file_seq_next, 311 + .stop = task_file_seq_stop, 312 + .show = task_file_seq_show, 313 + }; 314 + 315 + static const struct bpf_iter_reg task_reg_info = { 316 + .target = "task", 317 + .seq_ops = &task_seq_ops, 318 + .init_seq_private = init_seq_pidns, 319 + .fini_seq_private = fini_seq_pidns, 320 + .seq_priv_size = sizeof(struct bpf_iter_seq_task_info), 321 + .ctx_arg_info_size = 1, 322 + .ctx_arg_info = { 323 + { offsetof(struct bpf_iter__task, task), 324 + PTR_TO_BTF_ID_OR_NULL }, 325 + }, 326 + }; 327 + 328 + static const struct bpf_iter_reg task_file_reg_info = { 329 + .target = "task_file", 330 + .seq_ops = &task_file_seq_ops, 331 + .init_seq_private = init_seq_pidns, 332 + .fini_seq_private = fini_seq_pidns, 333 + .seq_priv_size = sizeof(struct bpf_iter_seq_task_file_info), 334 + .ctx_arg_info_size = 2, 335 + .ctx_arg_info = { 336 + { offsetof(struct bpf_iter__task_file, task), 337 + PTR_TO_BTF_ID_OR_NULL }, 338 + { offsetof(struct bpf_iter__task_file, file), 339 + PTR_TO_BTF_ID_OR_NULL }, 340 + }, 341 + }; 342 + 343 + static int __init task_iter_init(void) 344 + { 345 + int ret; 346 + 347 + ret = bpf_iter_reg_target(&task_reg_info); 348 + if (ret) 349 + return ret; 350 + 351 + return bpf_iter_reg_target(&task_file_reg_info); 352 + } 353 + late_initcall(task_iter_init);
+37 -4
kernel/bpf/verifier.c
··· 398 398 return type == PTR_TO_MAP_VALUE_OR_NULL || 399 399 type == PTR_TO_SOCKET_OR_NULL || 400 400 type == PTR_TO_SOCK_COMMON_OR_NULL || 401 - type == PTR_TO_TCP_SOCK_OR_NULL; 401 + type == PTR_TO_TCP_SOCK_OR_NULL || 402 + type == PTR_TO_BTF_ID_OR_NULL; 402 403 } 403 404 404 405 static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg) ··· 484 483 [PTR_TO_TP_BUFFER] = "tp_buffer", 485 484 [PTR_TO_XDP_SOCK] = "xdp_sock", 486 485 [PTR_TO_BTF_ID] = "ptr_", 486 + [PTR_TO_BTF_ID_OR_NULL] = "ptr_or_null_", 487 487 }; 488 488 489 489 static char slot_type_char[] = { ··· 545 543 /* reg->off should be 0 for SCALAR_VALUE */ 546 544 verbose(env, "%lld", reg->var_off.value + reg->off); 547 545 } else { 548 - if (t == PTR_TO_BTF_ID) 546 + if (t == PTR_TO_BTF_ID || t == PTR_TO_BTF_ID_OR_NULL) 549 547 verbose(env, "%s", kernel_type_name(reg->btf_id)); 550 548 verbose(env, "(id=%d", reg->id); 551 549 if (reg_type_may_be_refcounted_or_null(t)) ··· 2141 2139 case PTR_TO_TCP_SOCK_OR_NULL: 2142 2140 case PTR_TO_XDP_SOCK: 2143 2141 case PTR_TO_BTF_ID: 2142 + case PTR_TO_BTF_ID_OR_NULL: 2144 2143 return true; 2145 2144 default: 2146 2145 return false; ··· 2662 2659 */ 2663 2660 *reg_type = info.reg_type; 2664 2661 2665 - if (*reg_type == PTR_TO_BTF_ID) 2662 + if (*reg_type == PTR_TO_BTF_ID || *reg_type == PTR_TO_BTF_ID_OR_NULL) 2666 2663 *btf_id = info.btf_id; 2667 2664 else 2668 2665 env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size; ··· 3246 3243 * a sub-register. 
3247 3244 */ 3248 3245 regs[value_regno].subreg_def = DEF_NOT_SUBREG; 3249 - if (reg_type == PTR_TO_BTF_ID) 3246 + if (reg_type == PTR_TO_BTF_ID || 3247 + reg_type == PTR_TO_BTF_ID_OR_NULL) 3250 3248 regs[value_regno].btf_id = btf_id; 3251 3249 } 3252 3250 regs[value_regno].type = reg_type; ··· 3494 3490 *stype = STACK_MISC; 3495 3491 goto mark; 3496 3492 } 3493 + 3494 + if (state->stack[spi].slot_type[0] == STACK_SPILL && 3495 + state->stack[spi].spilled_ptr.type == PTR_TO_BTF_ID) 3496 + goto mark; 3497 + 3497 3498 if (state->stack[spi].slot_type[0] == STACK_SPILL && 3498 3499 state->stack[spi].spilled_ptr.type == SCALAR_VALUE) { 3499 3500 __mark_reg_unknown(env, &state->stack[spi].spilled_ptr); ··· 6581 6572 reg->type = PTR_TO_SOCK_COMMON; 6582 6573 } else if (reg->type == PTR_TO_TCP_SOCK_OR_NULL) { 6583 6574 reg->type = PTR_TO_TCP_SOCK; 6575 + } else if (reg->type == PTR_TO_BTF_ID_OR_NULL) { 6576 + reg->type = PTR_TO_BTF_ID; 6584 6577 } 6585 6578 if (is_null) { 6586 6579 /* We don't need id and ref_obj_id from this point ··· 7111 7100 if (!env->prog->aux->attach_btf_id) 7112 7101 return 0; 7113 7102 range = tnum_const(0); 7103 + break; 7104 + case BPF_PROG_TYPE_TRACING: 7105 + if (env->prog->expected_attach_type != BPF_TRACE_ITER) 7106 + return 0; 7114 7107 break; 7115 7108 default: 7116 7109 return 0; ··· 8440 8425 case PTR_TO_TCP_SOCK_OR_NULL: 8441 8426 case PTR_TO_XDP_SOCK: 8442 8427 case PTR_TO_BTF_ID: 8428 + case PTR_TO_BTF_ID_OR_NULL: 8443 8429 return false; 8444 8430 default: 8445 8431 return true; ··· 10497 10481 struct bpf_prog *tgt_prog = prog->aux->linked_prog; 10498 10482 u32 btf_id = prog->aux->attach_btf_id; 10499 10483 const char prefix[] = "btf_trace_"; 10484 + struct btf_func_model fmodel; 10500 10485 int ret = 0, subprog = -1, i; 10501 10486 struct bpf_trampoline *tr; 10502 10487 const struct btf_type *t; ··· 10639 10622 prog->aux->attach_func_proto = t; 10640 10623 prog->aux->attach_btf_trace = true; 10641 10624 return 0; 10625 + case 
BPF_TRACE_ITER: 10626 + if (!btf_type_is_func(t)) { 10627 + verbose(env, "attach_btf_id %u is not a function\n", 10628 + btf_id); 10629 + return -EINVAL; 10630 + } 10631 + t = btf_type_by_id(btf, t->type); 10632 + if (!btf_type_is_func_proto(t)) 10633 + return -EINVAL; 10634 + prog->aux->attach_func_name = tname; 10635 + prog->aux->attach_func_proto = t; 10636 + if (!bpf_iter_prog_supported(prog)) 10637 + return -EINVAL; 10638 + ret = btf_distill_func_proto(&env->log, btf, t, 10639 + tname, &fmodel); 10640 + return ret; 10642 10641 default: 10643 10642 if (!prog_extension) 10644 10643 return -EINVAL;
+1 -1
kernel/sysctl.c
··· 201 201 202 202 #endif /* CONFIG_SYSCTL */ 203 203 204 - #ifdef CONFIG_BPF_SYSCALL 204 + #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_SYSCTL) 205 205 static int bpf_stats_handler(struct ctl_table *table, int write, 206 206 void __user *buffer, size_t *lenp, 207 207 loff_t *ppos)
+214
kernel/trace/bpf_trace.c
··· 457 457 return &bpf_trace_printk_proto; 458 458 } 459 459 460 + #define MAX_SEQ_PRINTF_VARARGS 12 461 + #define MAX_SEQ_PRINTF_MAX_MEMCPY 6 462 + #define MAX_SEQ_PRINTF_STR_LEN 128 463 + 464 + struct bpf_seq_printf_buf { 465 + char buf[MAX_SEQ_PRINTF_MAX_MEMCPY][MAX_SEQ_PRINTF_STR_LEN]; 466 + }; 467 + static DEFINE_PER_CPU(struct bpf_seq_printf_buf, bpf_seq_printf_buf); 468 + static DEFINE_PER_CPU(int, bpf_seq_printf_buf_used); 469 + 470 + BPF_CALL_5(bpf_seq_printf, struct seq_file *, m, char *, fmt, u32, fmt_size, 471 + const void *, data, u32, data_len) 472 + { 473 + int err = -EINVAL, fmt_cnt = 0, memcpy_cnt = 0; 474 + int i, buf_used, copy_size, num_args; 475 + u64 params[MAX_SEQ_PRINTF_VARARGS]; 476 + struct bpf_seq_printf_buf *bufs; 477 + const u64 *args = data; 478 + 479 + buf_used = this_cpu_inc_return(bpf_seq_printf_buf_used); 480 + if (WARN_ON_ONCE(buf_used > 1)) { 481 + err = -EBUSY; 482 + goto out; 483 + } 484 + 485 + bufs = this_cpu_ptr(&bpf_seq_printf_buf); 486 + 487 + /* 488 + * bpf_check()->check_func_arg()->check_stack_boundary() 489 + * guarantees that fmt points to bpf program stack, 490 + * fmt_size bytes of it were initialized and fmt_size > 0 491 + */ 492 + if (fmt[--fmt_size] != 0) 493 + goto out; 494 + 495 + if (data_len & 7) 496 + goto out; 497 + 498 + for (i = 0; i < fmt_size; i++) { 499 + if (fmt[i] == '%') { 500 + if (fmt[i + 1] == '%') 501 + i++; 502 + else if (!data || !data_len) 503 + goto out; 504 + } 505 + } 506 + 507 + num_args = data_len / 8; 508 + 509 + /* check format string for allowed specifiers */ 510 + for (i = 0; i < fmt_size; i++) { 511 + /* only printable ascii for now. 
*/ 512 + if ((!isprint(fmt[i]) && !isspace(fmt[i])) || !isascii(fmt[i])) { 513 + err = -EINVAL; 514 + goto out; 515 + } 516 + 517 + if (fmt[i] != '%') 518 + continue; 519 + 520 + if (fmt[i + 1] == '%') { 521 + i++; 522 + continue; 523 + } 524 + 525 + if (fmt_cnt >= MAX_SEQ_PRINTF_VARARGS) { 526 + err = -E2BIG; 527 + goto out; 528 + } 529 + 530 + if (fmt_cnt >= num_args) { 531 + err = -EINVAL; 532 + goto out; 533 + } 534 + 535 + /* fmt[i] != 0 && fmt[last] == 0, so we can access fmt[i + 1] */ 536 + i++; 537 + 538 + /* skip optional "[0 +-][num]" width formating field */ 539 + while (fmt[i] == '0' || fmt[i] == '+' || fmt[i] == '-' || 540 + fmt[i] == ' ') 541 + i++; 542 + if (fmt[i] >= '1' && fmt[i] <= '9') { 543 + i++; 544 + while (fmt[i] >= '0' && fmt[i] <= '9') 545 + i++; 546 + } 547 + 548 + if (fmt[i] == 's') { 549 + /* try our best to copy */ 550 + if (memcpy_cnt >= MAX_SEQ_PRINTF_MAX_MEMCPY) { 551 + err = -E2BIG; 552 + goto out; 553 + } 554 + 555 + err = strncpy_from_unsafe(bufs->buf[memcpy_cnt], 556 + (void *) (long) args[fmt_cnt], 557 + MAX_SEQ_PRINTF_STR_LEN); 558 + if (err < 0) 559 + bufs->buf[memcpy_cnt][0] = '\0'; 560 + params[fmt_cnt] = (u64)(long)bufs->buf[memcpy_cnt]; 561 + 562 + fmt_cnt++; 563 + memcpy_cnt++; 564 + continue; 565 + } 566 + 567 + if (fmt[i] == 'p') { 568 + if (fmt[i + 1] == 0 || 569 + fmt[i + 1] == 'K' || 570 + fmt[i + 1] == 'x') { 571 + /* just kernel pointers */ 572 + params[fmt_cnt] = args[fmt_cnt]; 573 + fmt_cnt++; 574 + continue; 575 + } 576 + 577 + /* only support "%pI4", "%pi4", "%pI6" and "%pi6". */ 578 + if (fmt[i + 1] != 'i' && fmt[i + 1] != 'I') { 579 + err = -EINVAL; 580 + goto out; 581 + } 582 + if (fmt[i + 2] != '4' && fmt[i + 2] != '6') { 583 + err = -EINVAL; 584 + goto out; 585 + } 586 + 587 + if (memcpy_cnt >= MAX_SEQ_PRINTF_MAX_MEMCPY) { 588 + err = -E2BIG; 589 + goto out; 590 + } 591 + 592 + 593 + copy_size = (fmt[i + 2] == '4') ? 
4 : 16; 594 + 595 + err = probe_kernel_read(bufs->buf[memcpy_cnt], 596 + (void *) (long) args[fmt_cnt], 597 + copy_size); 598 + if (err < 0) 599 + memset(bufs->buf[memcpy_cnt], 0, copy_size); 600 + params[fmt_cnt] = (u64)(long)bufs->buf[memcpy_cnt]; 601 + 602 + i += 2; 603 + fmt_cnt++; 604 + memcpy_cnt++; 605 + continue; 606 + } 607 + 608 + if (fmt[i] == 'l') { 609 + i++; 610 + if (fmt[i] == 'l') 611 + i++; 612 + } 613 + 614 + if (fmt[i] != 'i' && fmt[i] != 'd' && 615 + fmt[i] != 'u' && fmt[i] != 'x') { 616 + err = -EINVAL; 617 + goto out; 618 + } 619 + 620 + params[fmt_cnt] = args[fmt_cnt]; 621 + fmt_cnt++; 622 + } 623 + 624 + /* Maximumly we can have MAX_SEQ_PRINTF_VARARGS parameter, just give 625 + * all of them to seq_printf(). 626 + */ 627 + seq_printf(m, fmt, params[0], params[1], params[2], params[3], 628 + params[4], params[5], params[6], params[7], params[8], 629 + params[9], params[10], params[11]); 630 + 631 + err = seq_has_overflowed(m) ? -EOVERFLOW : 0; 632 + out: 633 + this_cpu_dec(bpf_seq_printf_buf_used); 634 + return err; 635 + } 636 + 637 + static int bpf_seq_printf_btf_ids[5]; 638 + static const struct bpf_func_proto bpf_seq_printf_proto = { 639 + .func = bpf_seq_printf, 640 + .gpl_only = true, 641 + .ret_type = RET_INTEGER, 642 + .arg1_type = ARG_PTR_TO_BTF_ID, 643 + .arg2_type = ARG_PTR_TO_MEM, 644 + .arg3_type = ARG_CONST_SIZE, 645 + .arg4_type = ARG_PTR_TO_MEM_OR_NULL, 646 + .arg5_type = ARG_CONST_SIZE_OR_ZERO, 647 + .btf_id = bpf_seq_printf_btf_ids, 648 + }; 649 + 650 + BPF_CALL_3(bpf_seq_write, struct seq_file *, m, const void *, data, u32, len) 651 + { 652 + return seq_write(m, data, len) ? 
-EOVERFLOW : 0; 653 + } 654 + 655 + static int bpf_seq_write_btf_ids[5]; 656 + static const struct bpf_func_proto bpf_seq_write_proto = { 657 + .func = bpf_seq_write, 658 + .gpl_only = true, 659 + .ret_type = RET_INTEGER, 660 + .arg1_type = ARG_PTR_TO_BTF_ID, 661 + .arg2_type = ARG_PTR_TO_MEM, 662 + .arg3_type = ARG_CONST_SIZE_OR_ZERO, 663 + .btf_id = bpf_seq_write_btf_ids, 664 + }; 665 + 460 666 static __always_inline int 461 667 get_map_perf_counter(struct bpf_map *map, u64 flags, 462 668 u64 *value, u64 *enabled, u64 *running) ··· 1432 1226 case BPF_FUNC_xdp_output: 1433 1227 return &bpf_xdp_output_proto; 1434 1228 #endif 1229 + case BPF_FUNC_seq_printf: 1230 + return prog->expected_attach_type == BPF_TRACE_ITER ? 1231 + &bpf_seq_printf_proto : 1232 + NULL; 1233 + case BPF_FUNC_seq_write: 1234 + return prog->expected_attach_type == BPF_TRACE_ITER ? 1235 + &bpf_seq_write_proto : 1236 + NULL; 1435 1237 default: 1436 1238 return raw_tp_prog_func_proto(func_id, prog); 1437 1239 }
+75 -26
net/core/filter.c
··· 4003 4003 }; 4004 4004 4005 4005 #ifdef CONFIG_SOCK_CGROUP_DATA 4006 + static inline u64 __bpf_sk_cgroup_id(struct sock *sk) 4007 + { 4008 + struct cgroup *cgrp; 4009 + 4010 + cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); 4011 + return cgroup_id(cgrp); 4012 + } 4013 + 4006 4014 BPF_CALL_1(bpf_skb_cgroup_id, const struct sk_buff *, skb) 4007 4015 { 4008 4016 struct sock *sk = skb_to_full_sk(skb); 4009 - struct cgroup *cgrp; 4010 4017 4011 4018 if (!sk || !sk_fullsock(sk)) 4012 4019 return 0; 4013 4020 4014 - cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); 4015 - return cgroup_id(cgrp); 4021 + return __bpf_sk_cgroup_id(sk); 4016 4022 } 4017 4023 4018 4024 static const struct bpf_func_proto bpf_skb_cgroup_id_proto = { ··· 4028 4022 .arg1_type = ARG_PTR_TO_CTX, 4029 4023 }; 4030 4024 4031 - BPF_CALL_2(bpf_skb_ancestor_cgroup_id, const struct sk_buff *, skb, int, 4032 - ancestor_level) 4025 + static inline u64 __bpf_sk_ancestor_cgroup_id(struct sock *sk, 4026 + int ancestor_level) 4033 4027 { 4034 - struct sock *sk = skb_to_full_sk(skb); 4035 4028 struct cgroup *ancestor; 4036 4029 struct cgroup *cgrp; 4037 - 4038 - if (!sk || !sk_fullsock(sk)) 4039 - return 0; 4040 4030 4041 4031 cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); 4042 4032 ancestor = cgroup_ancestor(cgrp, ancestor_level); ··· 4042 4040 return cgroup_id(ancestor); 4043 4041 } 4044 4042 4043 + BPF_CALL_2(bpf_skb_ancestor_cgroup_id, const struct sk_buff *, skb, int, 4044 + ancestor_level) 4045 + { 4046 + struct sock *sk = skb_to_full_sk(skb); 4047 + 4048 + if (!sk || !sk_fullsock(sk)) 4049 + return 0; 4050 + 4051 + return __bpf_sk_ancestor_cgroup_id(sk, ancestor_level); 4052 + } 4053 + 4045 4054 static const struct bpf_func_proto bpf_skb_ancestor_cgroup_id_proto = { 4046 4055 .func = bpf_skb_ancestor_cgroup_id, 4047 4056 .gpl_only = false, 4048 4057 .ret_type = RET_INTEGER, 4049 4058 .arg1_type = ARG_PTR_TO_CTX, 4059 + .arg2_type = ARG_ANYTHING, 4060 + }; 4061 + 4062 + BPF_CALL_1(bpf_sk_cgroup_id, struct sock *, sk) 
4063 + { 4064 + return __bpf_sk_cgroup_id(sk); 4065 + } 4066 + 4067 + static const struct bpf_func_proto bpf_sk_cgroup_id_proto = { 4068 + .func = bpf_sk_cgroup_id, 4069 + .gpl_only = false, 4070 + .ret_type = RET_INTEGER, 4071 + .arg1_type = ARG_PTR_TO_SOCKET, 4072 + }; 4073 + 4074 + BPF_CALL_2(bpf_sk_ancestor_cgroup_id, struct sock *, sk, int, ancestor_level) 4075 + { 4076 + return __bpf_sk_ancestor_cgroup_id(sk, ancestor_level); 4077 + } 4078 + 4079 + static const struct bpf_func_proto bpf_sk_ancestor_cgroup_id_proto = { 4080 + .func = bpf_sk_ancestor_cgroup_id, 4081 + .gpl_only = false, 4082 + .ret_type = RET_INTEGER, 4083 + .arg1_type = ARG_PTR_TO_SOCKET, 4050 4084 .arg2_type = ARG_ANYTHING, 4051 4085 }; 4052 4086 #endif ··· 4563 4525 { 4564 4526 #ifdef CONFIG_INET 4565 4527 struct sock *sk = ctx->sk; 4528 + u32 flags = BIND_FROM_BPF; 4566 4529 int err; 4567 4530 4568 - /* Binding to port can be expensive so it's prohibited in the helper. 4569 - * Only binding to IP is supported. 4570 - */ 4571 4531 err = -EINVAL; 4572 4532 if (addr_len < offsetofend(struct sockaddr, sa_family)) 4573 4533 return err; 4574 4534 if (addr->sa_family == AF_INET) { 4575 4535 if (addr_len < sizeof(struct sockaddr_in)) 4576 4536 return err; 4577 - if (((struct sockaddr_in *)addr)->sin_port != htons(0)) 4578 - return err; 4579 - return __inet_bind(sk, addr, addr_len, true, false); 4537 + if (((struct sockaddr_in *)addr)->sin_port == htons(0)) 4538 + flags |= BIND_FORCE_ADDRESS_NO_PORT; 4539 + return __inet_bind(sk, addr, addr_len, flags); 4580 4540 #if IS_ENABLED(CONFIG_IPV6) 4581 4541 } else if (addr->sa_family == AF_INET6) { 4582 4542 if (addr_len < SIN6_LEN_RFC2133) 4583 4543 return err; 4584 - if (((struct sockaddr_in6 *)addr)->sin6_port != htons(0)) 4585 - return err; 4544 + if (((struct sockaddr_in6 *)addr)->sin6_port == htons(0)) 4545 + flags |= BIND_FORCE_ADDRESS_NO_PORT; 4586 4546 /* ipv6_bpf_stub cannot be NULL, since it's called from 4587 4547 * bpf_cgroup_inet6_connect 
hook and ipv6 is already loaded 4588 4548 */ 4589 - return ipv6_bpf_stub->inet6_bind(sk, addr, addr_len, true, false); 4549 + return ipv6_bpf_stub->inet6_bind(sk, addr, addr_len, flags); 4590 4550 #endif /* CONFIG_IPV6 */ 4591 4551 } 4592 4552 #endif /* CONFIG_INET */ ··· 6195 6159 #ifdef CONFIG_SOCK_CGROUP_DATA 6196 6160 case BPF_FUNC_skb_cgroup_id: 6197 6161 return &bpf_skb_cgroup_id_proto; 6162 + case BPF_FUNC_skb_ancestor_cgroup_id: 6163 + return &bpf_skb_ancestor_cgroup_id_proto; 6164 + case BPF_FUNC_sk_cgroup_id: 6165 + return &bpf_sk_cgroup_id_proto; 6166 + case BPF_FUNC_sk_ancestor_cgroup_id: 6167 + return &bpf_sk_ancestor_cgroup_id_proto; 6198 6168 #endif 6199 6169 #ifdef CONFIG_INET 6170 + case BPF_FUNC_sk_lookup_tcp: 6171 + return &bpf_sk_lookup_tcp_proto; 6172 + case BPF_FUNC_sk_lookup_udp: 6173 + return &bpf_sk_lookup_udp_proto; 6174 + case BPF_FUNC_sk_release: 6175 + return &bpf_sk_release_proto; 6176 + case BPF_FUNC_skc_lookup_tcp: 6177 + return &bpf_skc_lookup_tcp_proto; 6200 6178 case BPF_FUNC_tcp_sock: 6201 6179 return &bpf_tcp_sock_proto; 6202 6180 case BPF_FUNC_get_listener_sock: ··· 7081 7031 case bpf_ctx_range(struct bpf_sock_addr, msg_src_ip4): 7082 7032 case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0], 7083 7033 msg_src_ip6[3]): 7034 + case bpf_ctx_range(struct bpf_sock_addr, user_port): 7084 7035 if (type == BPF_READ) { 7085 7036 bpf_ctx_record_field_size(info, size_default); 7086 7037 ··· 7111 7060 if (size != size_default) 7112 7061 return false; 7113 7062 } 7114 - break; 7115 - case bpf_ctx_range(struct bpf_sock_addr, user_port): 7116 - if (size != size_default) 7117 - return false; 7118 7063 break; 7119 7064 case offsetof(struct bpf_sock_addr, sk): 7120 7065 if (type != BPF_READ) ··· 8007 7960 struct bpf_insn *insn_buf, 8008 7961 struct bpf_prog *prog, u32 *target_size) 8009 7962 { 7963 + int off, port_size = sizeof_field(struct sockaddr_in6, sin6_port); 8010 7964 struct bpf_insn *insn = insn_buf; 8011 - int off; 8012 7965 
8013 7966 switch (si->off) { 8014 7967 case offsetof(struct bpf_sock_addr, user_family): ··· 8043 7996 offsetof(struct sockaddr_in6, sin6_port)); 8044 7997 BUILD_BUG_ON(sizeof_field(struct sockaddr_in, sin_port) != 8045 7998 sizeof_field(struct sockaddr_in6, sin6_port)); 8046 - SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD(struct bpf_sock_addr_kern, 8047 - struct sockaddr_in6, uaddr, 8048 - sin6_port, tmp_reg); 7999 + /* Account for sin6_port being smaller than user_port. */ 8000 + port_size = min(port_size, BPF_LDST_BYTES(si)); 8001 + SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF( 8002 + struct bpf_sock_addr_kern, struct sockaddr_in6, uaddr, 8003 + sin6_port, bytes_to_bpf_size(port_size), 0, tmp_reg); 8049 8004 break; 8050 8005 8051 8006 case offsetof(struct bpf_sock_addr, family):
+11 -9
net/ipv4/af_inet.c
··· 450 450 if (err) 451 451 return err; 452 452 453 - return __inet_bind(sk, uaddr, addr_len, false, true); 453 + return __inet_bind(sk, uaddr, addr_len, BIND_WITH_LOCK); 454 454 } 455 455 EXPORT_SYMBOL(inet_bind); 456 456 457 457 int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, 458 - bool force_bind_address_no_port, bool with_lock) 458 + u32 flags) 459 459 { 460 460 struct sockaddr_in *addr = (struct sockaddr_in *)uaddr; 461 461 struct inet_sock *inet = inet_sk(sk); ··· 506 506 * would be illegal to use them (multicast/broadcast) in 507 507 * which case the sending device address is used. 508 508 */ 509 - if (with_lock) 509 + if (flags & BIND_WITH_LOCK) 510 510 lock_sock(sk); 511 511 512 512 /* Check these errors (active socket, double bind). */ ··· 520 520 521 521 /* Make sure we are allowed to bind here. */ 522 522 if (snum || !(inet->bind_address_no_port || 523 - force_bind_address_no_port)) { 523 + (flags & BIND_FORCE_ADDRESS_NO_PORT))) { 524 524 if (sk->sk_prot->get_port(sk, snum)) { 525 525 inet->inet_saddr = inet->inet_rcv_saddr = 0; 526 526 err = -EADDRINUSE; 527 527 goto out_release_sock; 528 528 } 529 - err = BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk); 530 - if (err) { 531 - inet->inet_saddr = inet->inet_rcv_saddr = 0; 532 - goto out_release_sock; 529 + if (!(flags & BIND_FROM_BPF)) { 530 + err = BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk); 531 + if (err) { 532 + inet->inet_saddr = inet->inet_rcv_saddr = 0; 533 + goto out_release_sock; 534 + } 533 535 } 534 536 } 535 537 ··· 545 543 sk_dst_reset(sk); 546 544 err = 0; 547 545 out_release_sock: 548 - if (with_lock) 546 + if (flags & BIND_WITH_LOCK) 549 547 release_sock(sk); 550 548 out: 551 549 return err;
+12 -10
net/ipv6/af_inet6.c
··· 273 273 } 274 274 275 275 static int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, 276 - bool force_bind_address_no_port, bool with_lock) 276 + u32 flags) 277 277 { 278 278 struct sockaddr_in6 *addr = (struct sockaddr_in6 *)uaddr; 279 279 struct inet_sock *inet = inet_sk(sk); ··· 297 297 !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE)) 298 298 return -EACCES; 299 299 300 - if (with_lock) 300 + if (flags & BIND_WITH_LOCK) 301 301 lock_sock(sk); 302 302 303 303 /* Check these errors (active socket, double bind). */ ··· 400 400 401 401 /* Make sure we are allowed to bind here. */ 402 402 if (snum || !(inet->bind_address_no_port || 403 - force_bind_address_no_port)) { 403 + (flags & BIND_FORCE_ADDRESS_NO_PORT))) { 404 404 if (sk->sk_prot->get_port(sk, snum)) { 405 405 sk->sk_ipv6only = saved_ipv6only; 406 406 inet_reset_saddr(sk); 407 407 err = -EADDRINUSE; 408 408 goto out; 409 409 } 410 - err = BPF_CGROUP_RUN_PROG_INET6_POST_BIND(sk); 411 - if (err) { 412 - sk->sk_ipv6only = saved_ipv6only; 413 - inet_reset_saddr(sk); 414 - goto out; 410 + if (!(flags & BIND_FROM_BPF)) { 411 + err = BPF_CGROUP_RUN_PROG_INET6_POST_BIND(sk); 412 + if (err) { 413 + sk->sk_ipv6only = saved_ipv6only; 414 + inet_reset_saddr(sk); 415 + goto out; 416 + } 415 417 } 416 418 } 417 419 ··· 425 423 inet->inet_dport = 0; 426 424 inet->inet_daddr = 0; 427 425 out: 428 - if (with_lock) 426 + if (flags & BIND_WITH_LOCK) 429 427 release_sock(sk); 430 428 return err; 431 429 out_unlock: ··· 453 451 if (err) 454 452 return err; 455 453 456 - return __inet6_bind(sk, uaddr, addr_len, false, true); 454 + return __inet6_bind(sk, uaddr, addr_len, BIND_WITH_LOCK); 457 455 } 458 456 EXPORT_SYMBOL(inet6_bind); 459 457
+58 -2
net/ipv6/ip6_fib.c
··· 2467 2467 } 2468 2468 2469 2469 #ifdef CONFIG_PROC_FS 2470 - static int ipv6_route_seq_show(struct seq_file *seq, void *v) 2470 + static int ipv6_route_native_seq_show(struct seq_file *seq, void *v) 2471 2471 { 2472 2472 struct fib6_info *rt = v; 2473 2473 struct ipv6_route_iter *iter = seq->private; ··· 2625 2625 return w->node && !(w->state == FWS_U && w->node == w->root); 2626 2626 } 2627 2627 2628 - static void ipv6_route_seq_stop(struct seq_file *seq, void *v) 2628 + static void ipv6_route_native_seq_stop(struct seq_file *seq, void *v) 2629 2629 __releases(RCU_BH) 2630 2630 { 2631 2631 struct net *net = seq_file_net(seq); ··· 2636 2636 2637 2637 rcu_read_unlock_bh(); 2638 2638 } 2639 + 2640 + #if IS_BUILTIN(CONFIG_IPV6) && defined(CONFIG_BPF_SYSCALL) 2641 + static int ipv6_route_prog_seq_show(struct bpf_prog *prog, 2642 + struct bpf_iter_meta *meta, 2643 + void *v) 2644 + { 2645 + struct bpf_iter__ipv6_route ctx; 2646 + 2647 + ctx.meta = meta; 2648 + ctx.rt = v; 2649 + return bpf_iter_run_prog(prog, &ctx); 2650 + } 2651 + 2652 + static int ipv6_route_seq_show(struct seq_file *seq, void *v) 2653 + { 2654 + struct ipv6_route_iter *iter = seq->private; 2655 + struct bpf_iter_meta meta; 2656 + struct bpf_prog *prog; 2657 + int ret; 2658 + 2659 + meta.seq = seq; 2660 + prog = bpf_iter_get_info(&meta, false); 2661 + if (!prog) 2662 + return ipv6_route_native_seq_show(seq, v); 2663 + 2664 + ret = ipv6_route_prog_seq_show(prog, &meta, v); 2665 + iter->w.leaf = NULL; 2666 + 2667 + return ret; 2668 + } 2669 + 2670 + static void ipv6_route_seq_stop(struct seq_file *seq, void *v) 2671 + { 2672 + struct bpf_iter_meta meta; 2673 + struct bpf_prog *prog; 2674 + 2675 + if (!v) { 2676 + meta.seq = seq; 2677 + prog = bpf_iter_get_info(&meta, true); 2678 + if (prog) 2679 + (void)ipv6_route_prog_seq_show(prog, &meta, v); 2680 + } 2681 + 2682 + ipv6_route_native_seq_stop(seq, v); 2683 + } 2684 + #else 2685 + static int ipv6_route_seq_show(struct seq_file *seq, void *v) 2686 + 
{ 2687 + return ipv6_route_native_seq_show(seq, v); 2688 + } 2689 + 2690 + static void ipv6_route_seq_stop(struct seq_file *seq, void *v) 2691 + { 2692 + ipv6_route_native_seq_stop(seq, v); 2693 + } 2694 + #endif 2639 2695 2640 2696 const struct seq_operations ipv6_route_seq_ops = { 2641 2697 .start = ipv6_route_seq_start,
+42
net/ipv6/route.c
··· 6421 6421 #endif 6422 6422 } 6423 6423 6424 + #if IS_BUILTIN(CONFIG_IPV6) 6425 + #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 6426 + DEFINE_BPF_ITER_FUNC(ipv6_route, struct bpf_iter_meta *meta, struct fib6_info *rt) 6427 + 6428 + static const struct bpf_iter_reg ipv6_route_reg_info = { 6429 + .target = "ipv6_route", 6430 + .seq_ops = &ipv6_route_seq_ops, 6431 + .init_seq_private = bpf_iter_init_seq_net, 6432 + .fini_seq_private = bpf_iter_fini_seq_net, 6433 + .seq_priv_size = sizeof(struct ipv6_route_iter), 6434 + .ctx_arg_info_size = 1, 6435 + .ctx_arg_info = { 6436 + { offsetof(struct bpf_iter__ipv6_route, rt), 6437 + PTR_TO_BTF_ID_OR_NULL }, 6438 + }, 6439 + }; 6440 + 6441 + static int __init bpf_iter_register(void) 6442 + { 6443 + return bpf_iter_reg_target(&ipv6_route_reg_info); 6444 + } 6445 + 6446 + static void bpf_iter_unregister(void) 6447 + { 6448 + bpf_iter_unreg_target(&ipv6_route_reg_info); 6449 + } 6450 + #endif 6451 + #endif 6452 + 6424 6453 int __init ip6_route_init(void) 6425 6454 { 6426 6455 int ret; ··· 6512 6483 if (ret) 6513 6484 goto out_register_late_subsys; 6514 6485 6486 + #if IS_BUILTIN(CONFIG_IPV6) 6487 + #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 6488 + ret = bpf_iter_register(); 6489 + if (ret) 6490 + goto out_register_late_subsys; 6491 + #endif 6492 + #endif 6493 + 6515 6494 for_each_possible_cpu(cpu) { 6516 6495 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu); 6517 6496 ··· 6552 6515 6553 6516 void ip6_route_cleanup(void) 6554 6517 { 6518 + #if IS_BUILTIN(CONFIG_IPV6) 6519 + #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 6520 + bpf_iter_unregister(); 6521 + #endif 6522 + #endif 6555 6523 unregister_netdevice_notifier(&ip6_route_dev_notifier); 6556 6524 unregister_pernet_subsys(&ip6_route_net_late_ops); 6557 6525 fib6_rules_cleanup();
+90 -2
net/netlink/af_netlink.c
··· 2596 2596 return __netlink_seq_next(seq); 2597 2597 } 2598 2598 2599 - static void netlink_seq_stop(struct seq_file *seq, void *v) 2599 + static void netlink_native_seq_stop(struct seq_file *seq, void *v) 2600 2600 { 2601 2601 struct nl_seq_iter *iter = seq->private; 2602 2602 ··· 2607 2607 } 2608 2608 2609 2609 2610 - static int netlink_seq_show(struct seq_file *seq, void *v) 2610 + static int netlink_native_seq_show(struct seq_file *seq, void *v) 2611 2611 { 2612 2612 if (v == SEQ_START_TOKEN) { 2613 2613 seq_puts(seq, ··· 2633 2633 } 2634 2634 return 0; 2635 2635 } 2636 + 2637 + #ifdef CONFIG_BPF_SYSCALL 2638 + struct bpf_iter__netlink { 2639 + __bpf_md_ptr(struct bpf_iter_meta *, meta); 2640 + __bpf_md_ptr(struct netlink_sock *, sk); 2641 + }; 2642 + 2643 + DEFINE_BPF_ITER_FUNC(netlink, struct bpf_iter_meta *meta, struct netlink_sock *sk) 2644 + 2645 + static int netlink_prog_seq_show(struct bpf_prog *prog, 2646 + struct bpf_iter_meta *meta, 2647 + void *v) 2648 + { 2649 + struct bpf_iter__netlink ctx; 2650 + 2651 + meta->seq_num--; /* skip SEQ_START_TOKEN */ 2652 + ctx.meta = meta; 2653 + ctx.sk = nlk_sk((struct sock *)v); 2654 + return bpf_iter_run_prog(prog, &ctx); 2655 + } 2656 + 2657 + static int netlink_seq_show(struct seq_file *seq, void *v) 2658 + { 2659 + struct bpf_iter_meta meta; 2660 + struct bpf_prog *prog; 2661 + 2662 + meta.seq = seq; 2663 + prog = bpf_iter_get_info(&meta, false); 2664 + if (!prog) 2665 + return netlink_native_seq_show(seq, v); 2666 + 2667 + if (v != SEQ_START_TOKEN) 2668 + return netlink_prog_seq_show(prog, &meta, v); 2669 + 2670 + return 0; 2671 + } 2672 + 2673 + static void netlink_seq_stop(struct seq_file *seq, void *v) 2674 + { 2675 + struct bpf_iter_meta meta; 2676 + struct bpf_prog *prog; 2677 + 2678 + if (!v) { 2679 + meta.seq = seq; 2680 + prog = bpf_iter_get_info(&meta, true); 2681 + if (prog) 2682 + (void)netlink_prog_seq_show(prog, &meta, v); 2683 + } 2684 + 2685 + netlink_native_seq_stop(seq, v); 2686 + } 2687 + 
#else 2688 + static int netlink_seq_show(struct seq_file *seq, void *v) 2689 + { 2690 + return netlink_native_seq_show(seq, v); 2691 + } 2692 + 2693 + static void netlink_seq_stop(struct seq_file *seq, void *v) 2694 + { 2695 + netlink_native_seq_stop(seq, v); 2696 + } 2697 + #endif 2636 2698 2637 2699 static const struct seq_operations netlink_seq_ops = { 2638 2700 .start = netlink_seq_start, ··· 2802 2740 .automatic_shrinking = true, 2803 2741 }; 2804 2742 2743 + #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 2744 + static const struct bpf_iter_reg netlink_reg_info = { 2745 + .target = "netlink", 2746 + .seq_ops = &netlink_seq_ops, 2747 + .init_seq_private = bpf_iter_init_seq_net, 2748 + .fini_seq_private = bpf_iter_fini_seq_net, 2749 + .seq_priv_size = sizeof(struct nl_seq_iter), 2750 + .ctx_arg_info_size = 1, 2751 + .ctx_arg_info = { 2752 + { offsetof(struct bpf_iter__netlink, sk), 2753 + PTR_TO_BTF_ID_OR_NULL }, 2754 + }, 2755 + }; 2756 + 2757 + static int __init bpf_iter_register(void) 2758 + { 2759 + return bpf_iter_reg_target(&netlink_reg_info); 2760 + } 2761 + #endif 2762 + 2805 2763 static int __init netlink_proto_init(void) 2806 2764 { 2807 2765 int i; ··· 2829 2747 2830 2748 if (err != 0) 2831 2749 goto out; 2750 + 2751 + #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 2752 + err = bpf_iter_register(); 2753 + if (err) 2754 + goto out; 2755 + #endif 2832 2756 2833 2757 BUILD_BUG_ON(sizeof(struct netlink_skb_parms) > sizeof_field(struct sk_buff, cb)); 2834 2758
+10 -11
net/xdp/xdp_umem.c
··· 30 30 if (!xs->tx) 31 31 return; 32 32 33 - spin_lock_irqsave(&umem->xsk_list_lock, flags); 34 - list_add_rcu(&xs->list, &umem->xsk_list); 35 - spin_unlock_irqrestore(&umem->xsk_list_lock, flags); 33 + spin_lock_irqsave(&umem->xsk_tx_list_lock, flags); 34 + list_add_rcu(&xs->list, &umem->xsk_tx_list); 35 + spin_unlock_irqrestore(&umem->xsk_tx_list_lock, flags); 36 36 } 37 37 38 38 void xdp_del_sk_umem(struct xdp_umem *umem, struct xdp_sock *xs) ··· 42 42 if (!xs->tx) 43 43 return; 44 44 45 - spin_lock_irqsave(&umem->xsk_list_lock, flags); 45 + spin_lock_irqsave(&umem->xsk_tx_list_lock, flags); 46 46 list_del_rcu(&xs->list); 47 - spin_unlock_irqrestore(&umem->xsk_list_lock, flags); 47 + spin_unlock_irqrestore(&umem->xsk_tx_list_lock, flags); 48 48 } 49 49 50 50 /* The umem is stored both in the _rx struct and the _tx struct as we do ··· 279 279 } 280 280 } 281 281 282 - static int xdp_umem_pin_pages(struct xdp_umem *umem) 282 + static int xdp_umem_pin_pages(struct xdp_umem *umem, unsigned long address) 283 283 { 284 284 unsigned int gup_flags = FOLL_WRITE; 285 285 long npgs; ··· 291 291 return -ENOMEM; 292 292 293 293 down_read(&current->mm->mmap_sem); 294 - npgs = pin_user_pages(umem->address, umem->npgs, 294 + npgs = pin_user_pages(address, umem->npgs, 295 295 gup_flags | FOLL_LONGTERM, &umem->pgs[0], NULL); 296 296 up_read(&current->mm->mmap_sem); 297 297 ··· 385 385 if (headroom >= chunk_size - XDP_PACKET_HEADROOM) 386 386 return -EINVAL; 387 387 388 - umem->address = (unsigned long)addr; 389 388 umem->chunk_mask = unaligned_chunks ? 
XSK_UNALIGNED_BUF_ADDR_MASK 390 389 : ~((u64)chunk_size - 1); 391 390 umem->size = size; ··· 394 395 umem->pgs = NULL; 395 396 umem->user = NULL; 396 397 umem->flags = mr->flags; 397 - INIT_LIST_HEAD(&umem->xsk_list); 398 - spin_lock_init(&umem->xsk_list_lock); 398 + INIT_LIST_HEAD(&umem->xsk_tx_list); 399 + spin_lock_init(&umem->xsk_tx_list_lock); 399 400 400 401 refcount_set(&umem->users, 1); 401 402 ··· 403 404 if (err) 404 405 return err; 405 406 406 - err = xdp_umem_pin_pages(umem); 407 + err = xdp_umem_pin_pages(umem, (unsigned long)addr); 407 408 if (err) 408 409 goto out_account; 409 410
+4 -4
net/xdp/xsk.c
··· 75 75 return; 76 76 77 77 rcu_read_lock(); 78 - list_for_each_entry_rcu(xs, &umem->xsk_list, list) { 78 + list_for_each_entry_rcu(xs, &umem->xsk_tx_list, list) { 79 79 xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP; 80 80 } 81 81 rcu_read_unlock(); ··· 102 102 return; 103 103 104 104 rcu_read_lock(); 105 - list_for_each_entry_rcu(xs, &umem->xsk_list, list) { 105 + list_for_each_entry_rcu(xs, &umem->xsk_tx_list, list) { 106 106 xs->tx->ring->flags &= ~XDP_RING_NEED_WAKEUP; 107 107 } 108 108 rcu_read_unlock(); ··· 305 305 struct xdp_sock *xs; 306 306 307 307 rcu_read_lock(); 308 - list_for_each_entry_rcu(xs, &umem->xsk_list, list) { 308 + list_for_each_entry_rcu(xs, &umem->xsk_tx_list, list) { 309 309 __xskq_cons_release(xs->tx); 310 310 xs->sk.sk_write_space(&xs->sk); 311 311 } ··· 318 318 struct xdp_sock *xs; 319 319 320 320 rcu_read_lock(); 321 - list_for_each_entry_rcu(xs, &umem->xsk_list, list) { 321 + list_for_each_entry_rcu(xs, &umem->xsk_tx_list, list) { 322 322 if (!xskq_cons_peek_desc(xs->tx, desc, umem)) 323 323 continue; 324 324
+2 -2
net/xdp/xsk_queue.c
··· 9 9 10 10 #include "xsk_queue.h" 11 11 12 - void xskq_set_umem(struct xsk_queue *q, u64 size, u64 chunk_mask) 12 + void xskq_set_umem(struct xsk_queue *q, u64 umem_size, u64 chunk_mask) 13 13 { 14 14 if (!q) 15 15 return; 16 16 17 - q->size = size; 17 + q->umem_size = umem_size; 18 18 q->chunk_mask = chunk_mask; 19 19 } 20 20
+4 -4
net/xdp/xsk_queue.h
··· 30 30 31 31 struct xsk_queue { 32 32 u64 chunk_mask; 33 - u64 size; 33 + u64 umem_size; 34 34 u32 ring_mask; 35 35 u32 nentries; 36 36 u32 cached_prod; ··· 123 123 u64 base_addr = xsk_umem_extract_addr(addr); 124 124 125 125 addr = xsk_umem_add_offset_to_addr(addr); 126 - if (base_addr >= q->size || addr >= q->size || 126 + if (base_addr >= q->umem_size || addr >= q->umem_size || 127 127 xskq_cons_crosses_non_contig_pg(umem, addr, length)) { 128 128 q->invalid_descs++; 129 129 return false; ··· 134 134 135 135 static inline bool xskq_cons_is_valid_addr(struct xsk_queue *q, u64 addr) 136 136 { 137 - if (addr >= q->size) { 137 + if (addr >= q->umem_size) { 138 138 q->invalid_descs++; 139 139 return false; 140 140 } ··· 379 379 return q ? q->invalid_descs : 0; 380 380 } 381 381 382 - void xskq_set_umem(struct xsk_queue *q, u64 size, u64 chunk_mask); 382 + void xskq_set_umem(struct xsk_queue *q, u64 umem_size, u64 chunk_mask); 383 383 struct xsk_queue *xskq_create(u32 nentries, bool umem_queue); 384 384 void xskq_destroy(struct xsk_queue *q_ops); 385 385
+2 -2
samples/bpf/offwaketime_kern.c
··· 5 5 * License as published by the Free Software Foundation. 6 6 */ 7 7 #include <uapi/linux/bpf.h> 8 - #include <bpf/bpf_helpers.h> 9 - #include <bpf/bpf_tracing.h> 10 8 #include <uapi/linux/ptrace.h> 11 9 #include <uapi/linux/perf_event.h> 12 10 #include <linux/version.h> 13 11 #include <linux/sched.h> 12 + #include <bpf/bpf_helpers.h> 13 + #include <bpf/bpf_tracing.h> 14 14 15 15 #define _(P) ({typeof(P) val; bpf_probe_read(&val, sizeof(val), &P); val;}) 16 16
+2 -2
samples/bpf/sockex2_kern.c
··· 1 1 #include <uapi/linux/bpf.h> 2 - #include <bpf/bpf_helpers.h> 3 - #include "bpf_legacy.h" 4 2 #include <uapi/linux/in.h> 5 3 #include <uapi/linux/if.h> 6 4 #include <uapi/linux/if_ether.h> 7 5 #include <uapi/linux/ip.h> 8 6 #include <uapi/linux/ipv6.h> 9 7 #include <uapi/linux/if_tunnel.h> 8 + #include <bpf/bpf_helpers.h> 9 + #include "bpf_legacy.h" 10 10 #define IP_MF 0x2000 11 11 #define IP_OFFSET 0x1FFF 12 12
+2 -2
samples/bpf/sockex3_kern.c
··· 5 5 * License as published by the Free Software Foundation. 6 6 */ 7 7 #include <uapi/linux/bpf.h> 8 - #include <bpf/bpf_helpers.h> 9 - #include "bpf_legacy.h" 10 8 #include <uapi/linux/in.h> 11 9 #include <uapi/linux/if.h> 12 10 #include <uapi/linux/if_ether.h> ··· 12 14 #include <uapi/linux/ipv6.h> 13 15 #include <uapi/linux/if_tunnel.h> 14 16 #include <uapi/linux/mpls.h> 17 + #include <bpf/bpf_helpers.h> 18 + #include "bpf_legacy.h" 15 19 #define IP_MF 0x2000 16 20 #define IP_OFFSET 0x1FFF 17 21
+1 -1
samples/bpf/xdp_redirect_cpu_kern.c
··· 15 15 #include <bpf/bpf_helpers.h> 16 16 #include "hash_func01.h" 17 17 18 - #define MAX_CPUS 64 /* WARNING - sync with _user.c */ 18 + #define MAX_CPUS NR_CPUS 19 19 20 20 /* Special map type that can XDP_REDIRECT frames to another CPU */ 21 21 struct {
+16 -13
samples/bpf/xdp_redirect_cpu_user.c
··· 13 13 #include <unistd.h> 14 14 #include <locale.h> 15 15 #include <sys/resource.h> 16 + #include <sys/sysinfo.h> 16 17 #include <getopt.h> 17 18 #include <net/if.h> 18 19 #include <time.h> ··· 24 23 25 24 #include <arpa/inet.h> 26 25 #include <linux/if_link.h> 27 - 28 - #define MAX_CPUS 64 /* WARNING - sync with _kern.c */ 29 26 30 27 /* How many xdp_progs are defined in _kern.c */ 31 28 #define MAX_PROG 6 ··· 39 40 static __u32 prog_id; 40 41 41 42 static __u32 xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST; 43 + static int n_cpus; 42 44 static int cpu_map_fd; 43 45 static int rx_cnt_map_fd; 44 46 static int redirect_err_cnt_map_fd; ··· 170 170 struct record redir_err; 171 171 struct record kthread; 172 172 struct record exception; 173 - struct record enq[MAX_CPUS]; 173 + struct record enq[]; 174 174 }; 175 175 176 176 static bool map_collect_percpu(int fd, __u32 key, struct record *rec) ··· 225 225 static struct stats_record *alloc_stats_record(void) 226 226 { 227 227 struct stats_record *rec; 228 - int i; 228 + int i, size; 229 229 230 - rec = malloc(sizeof(*rec)); 231 - memset(rec, 0, sizeof(*rec)); 230 + size = sizeof(*rec) + n_cpus * sizeof(struct record); 231 + rec = malloc(size); 232 + memset(rec, 0, size); 232 233 if (!rec) { 233 234 fprintf(stderr, "Mem alloc error\n"); 234 235 exit(EXIT_FAIL_MEM); ··· 238 237 rec->redir_err.cpu = alloc_record_per_cpu(); 239 238 rec->kthread.cpu = alloc_record_per_cpu(); 240 239 rec->exception.cpu = alloc_record_per_cpu(); 241 - for (i = 0; i < MAX_CPUS; i++) 240 + for (i = 0; i < n_cpus; i++) 242 241 rec->enq[i].cpu = alloc_record_per_cpu(); 243 242 244 243 return rec; ··· 248 247 { 249 248 int i; 250 249 251 - for (i = 0; i < MAX_CPUS; i++) 250 + for (i = 0; i < n_cpus; i++) 252 251 free(r->enq[i].cpu); 253 252 free(r->exception.cpu); 254 253 free(r->kthread.cpu); ··· 351 350 } 352 351 353 352 /* cpumap enqueue stats */ 354 - for (to_cpu = 0; to_cpu < MAX_CPUS; to_cpu++) { 353 + for (to_cpu = 0; to_cpu < n_cpus; to_cpu++) 
{ 355 354 char *fmt = "%-15s %3d:%-3d %'-14.0f %'-11.0f %'-10.2f %s\n"; 356 355 char *fm2 = "%-15s %3s:%-3d %'-14.0f %'-11.0f %'-10.2f %s\n"; 357 356 char *errstr = ""; ··· 476 475 map_collect_percpu(fd, 1, &rec->redir_err); 477 476 478 477 fd = cpumap_enqueue_cnt_map_fd; 479 - for (i = 0; i < MAX_CPUS; i++) 478 + for (i = 0; i < n_cpus; i++) 480 479 map_collect_percpu(fd, i, &rec->enq[i]); 481 480 482 481 fd = cpumap_kthread_cnt_map_fd; ··· 550 549 */ 551 550 static void mark_cpus_unavailable(void) 552 551 { 553 - __u32 invalid_cpu = MAX_CPUS; 552 + __u32 invalid_cpu = n_cpus; 554 553 int ret, i; 555 554 556 - for (i = 0; i < MAX_CPUS; i++) { 555 + for (i = 0; i < n_cpus; i++) { 557 556 ret = bpf_map_update_elem(cpus_available_map_fd, &i, 558 557 &invalid_cpu, 0); 559 558 if (ret) { ··· 689 688 int prog_fd; 690 689 __u32 qsize; 691 690 691 + n_cpus = get_nprocs_conf(); 692 + 692 693 /* Notice: choosing he queue size is very important with the 693 694 * ixgbe driver, because it's driver page recycling trick is 694 695 * dependend on pages being returned quickly. The number of ··· 760 757 case 'c': 761 758 /* Add multiple CPUs */ 762 759 add_cpu = strtoul(optarg, NULL, 0); 763 - if (add_cpu >= MAX_CPUS) { 760 + if (add_cpu >= n_cpus) { 764 761 fprintf(stderr, 765 762 "--cpu nr too large for cpumap err(%d):%s\n", 766 763 errno, strerror(errno));
+8
scripts/bpf_helpers_doc.py
··· 318 318 of eBPF maps are used with a given helper function. 319 319 * *kernel/bpf/* directory contains other files in which additional helpers are 320 320 defined (for cgroups, sockmaps, etc.). 321 + * The bpftool utility can be used to probe the availability of helper functions 322 + on the system (as well as supported program and map types, and a number of 323 + other parameters). To do so, run **bpftool feature probe** (see 324 + **bpftool-feature**\ (8) for details). Add the **unprivileged** keyword to 325 + list features available to unprivileged users. 321 326 322 327 Compatibility between helper functions and program types can generally be found 323 328 in the files where helper functions are defined. Look for the **struct ··· 343 338 ======== 344 339 345 340 **bpf**\ (2), 341 + **bpftool**\ (8), 346 342 **cgroups**\ (7), 347 343 **ip**\ (8), 348 344 **perf_event_open**\ (2), ··· 420 414 'struct sk_reuseport_md', 421 415 'struct sockaddr', 422 416 'struct tcphdr', 417 + 'struct seq_file', 423 418 424 419 'struct __sk_buff', 425 420 'struct sk_msg_md', ··· 457 450 'struct sk_reuseport_md', 458 451 'struct sockaddr', 459 452 'struct tcphdr', 453 + 'struct seq_file', 460 454 } 461 455 mapped_types = { 462 456 'u8': '__u8',
+2 -2
security/selinux/include/classmap.h
··· 27 27 "audit_control", "setfcap" 28 28 29 29 #define COMMON_CAP2_PERMS "mac_override", "mac_admin", "syslog", \ 30 - "wake_alarm", "block_suspend", "audit_read" 30 + "wake_alarm", "block_suspend", "audit_read", "perfmon" 31 31 32 - #if CAP_LAST_CAP > CAP_AUDIT_READ 32 + #if CAP_LAST_CAP > CAP_PERFMON 33 33 #error New capability defined, please update COMMON_CAP2_PERMS. 34 34 #endif 35 35
+8 -3
tools/bpf/bpftool/Documentation/bpftool-btf.rst
··· 230 230 **bpf**\ (2), 231 231 **bpf-helpers**\ (7), 232 232 **bpftool**\ (8), 233 - **bpftool-map**\ (8), 234 - **bpftool-prog**\ (8), 233 + **bpftool-btf**\ (8), 235 234 **bpftool-cgroup**\ (8), 236 235 **bpftool-feature**\ (8), 236 + **bpftool-gen**\ (8), 237 + **bpftool-iter**\ (8), 238 + **bpftool-link**\ (8), 239 + **bpftool-map**\ (8), 237 240 **bpftool-net**\ (8), 238 - **bpftool-perf**\ (8) 241 + **bpftool-perf**\ (8), 242 + **bpftool-prog**\ (8), 243 + **bpftool-struct_ops**\ (8)
+8 -4
tools/bpf/bpftool/Documentation/bpftool-cgroup.rst
··· 20 20 CGROUP COMMANDS 21 21 =============== 22 22 23 - | **bpftool** **cgroup { show | list }** *CGROUP* [**effective**] 23 + | **bpftool** **cgroup** { **show** | **list** } *CGROUP* [**effective**] 24 24 | **bpftool** **cgroup tree** [*CGROUP_ROOT*] [**effective**] 25 25 | **bpftool** **cgroup attach** *CGROUP* *ATTACH_TYPE* *PROG* [*ATTACH_FLAGS*] 26 26 | **bpftool** **cgroup detach** *CGROUP* *ATTACH_TYPE* *PROG* ··· 160 160 **bpf**\ (2), 161 161 **bpf-helpers**\ (7), 162 162 **bpftool**\ (8), 163 - **bpftool-prog**\ (8), 164 - **bpftool-map**\ (8), 163 + **bpftool-btf**\ (8), 165 164 **bpftool-feature**\ (8), 165 + **bpftool-gen**\ (8), 166 + **bpftool-iter**\ (8), 167 + **bpftool-link**\ (8), 168 + **bpftool-map**\ (8), 166 169 **bpftool-net**\ (8), 167 170 **bpftool-perf**\ (8), 168 - **bpftool-btf**\ (8) 171 + **bpftool-prog**\ (8), 172 + **bpftool-struct_ops**\ (8)
+8 -4
tools/bpf/bpftool/Documentation/bpftool-feature.rst
··· 28 28 =========== 29 29 **bpftool feature probe** [**kernel**] [**full**] [**macros** [**prefix** *PREFIX*]] 30 30 Probe the running kernel and dump a number of eBPF-related 31 - parameters, such as availability of the **bpf()** system call, 31 + parameters, such as availability of the **bpf**\ () system call, 32 32 JIT status, eBPF program types availability, eBPF helper 33 33 functions availability, and more. 34 34 ··· 93 93 **bpf**\ (2), 94 94 **bpf-helpers**\ (7), 95 95 **bpftool**\ (8), 96 - **bpftool-prog**\ (8), 97 - **bpftool-map**\ (8), 96 + **bpftool-btf**\ (8), 98 97 **bpftool-cgroup**\ (8), 98 + **bpftool-gen**\ (8), 99 + **bpftool-iter**\ (8), 100 + **bpftool-link**\ (8), 101 + **bpftool-map**\ (8), 99 102 **bpftool-net**\ (8), 100 103 **bpftool-perf**\ (8), 101 - **bpftool-btf**\ (8) 104 + **bpftool-prog**\ (8), 105 + **bpftool-struct_ops**\ (8)
+12 -9
tools/bpf/bpftool/Documentation/bpftool-gen.rst
··· 14 14 15 15 *OPTIONS* := { { **-j** | **--json** } [{ **-p** | **--pretty** }] } 16 16 17 - *COMMAND* := { **skeleton | **help** } 17 + *COMMAND* := { **skeleton** | **help** } 18 18 19 19 GEN COMMANDS 20 20 ============= ··· 36 36 etc. Skeleton eliminates the need to lookup mentioned 37 37 components by name. Instead, if skeleton instantiation 38 38 succeeds, they are populated in skeleton structure as valid 39 - libbpf types (e.g., struct bpf_map pointer) and can be 39 + libbpf types (e.g., **struct bpf_map** pointer) and can be 40 40 passed to existing generic libbpf APIs. 41 41 42 42 In addition to simple and reliable access to maps and 43 - programs, skeleton provides a storage for BPF links (struct 44 - bpf_link) for each BPF program within BPF object. When 43 + programs, skeleton provides a storage for BPF links (**struct 44 + bpf_link**) for each BPF program within BPF object. When 45 45 requested, supported BPF programs will be automatically 46 46 attached and resulting BPF links stored for further use by 47 47 user in pre-allocated fields in skeleton struct. For BPF ··· 82 82 83 83 - **example__open** and **example__open_opts**. 84 84 These functions are used to instantiate skeleton. It 85 - corresponds to libbpf's **bpf_object__open()** API. 85 + corresponds to libbpf's **bpf_object__open**\ () API. 86 86 **_opts** variants accepts extra **bpf_object_open_opts** 87 87 options. 88 88 89 89 - **example__load**. 90 90 This function creates maps, loads and verifies BPF 91 91 programs, initializes global data maps. It corresponds to 92 - libppf's **bpf_object__load** API. 92 + libppf's **bpf_object__load**\ () API. 
93 93 94 94 - **example__open_and_load** combines **example__open** and 95 95 **example__load** invocations in one commonly used ··· 296 296 **bpf**\ (2), 297 297 **bpf-helpers**\ (7), 298 298 **bpftool**\ (8), 299 - **bpftool-map**\ (8), 300 - **bpftool-prog**\ (8), 299 + **bpftool-btf**\ (8), 301 300 **bpftool-cgroup**\ (8), 302 301 **bpftool-feature**\ (8), 302 + **bpftool-iter**\ (8), 303 + **bpftool-link**\ (8), 304 + **bpftool-map**\ (8), 303 305 **bpftool-net**\ (8), 304 306 **bpftool-perf**\ (8), 305 - **bpftool-btf**\ (8) 307 + **bpftool-prog**\ (8), 308 + **bpftool-struct_ops**\ (8)
+81
tools/bpf/bpftool/Documentation/bpftool-iter.rst
··· 1 + ============ 2 + bpftool-iter 3 + ============ 4 + ------------------------------------------------------------------------------- 5 + tool to create BPF iterators 6 + ------------------------------------------------------------------------------- 7 + 8 + :Manual section: 8 9 + 10 + SYNOPSIS 11 + ======== 12 + 13 + **bpftool** [*OPTIONS*] **iter** *COMMAND* 14 + 15 + *COMMANDS* := { **pin** | **help** } 16 + 17 + ITER COMMANDS 18 + =================== 19 + 20 + | **bpftool** **iter pin** *OBJ* *PATH* 21 + | **bpftool** **iter help** 22 + | 23 + | *OBJ* := /a/file/of/bpf_iter_target.o 24 + 25 + DESCRIPTION 26 + =========== 27 + **bpftool iter pin** *OBJ* *PATH* 28 + A bpf iterator combines a kernel iterating of 29 + particular kernel data (e.g., tasks, bpf_maps, etc.) 30 + and a bpf program called for each kernel data object 31 + (e.g., one task, one bpf_map, etc.). User space can 32 + *read* kernel iterator output through *read()* syscall. 33 + 34 + The *pin* command creates a bpf iterator from *OBJ*, 35 + and pin it to *PATH*. The *PATH* should be located 36 + in *bpffs* mount. It must not contain a dot 37 + character ('.'), which is reserved for future extensions 38 + of *bpffs*. 39 + 40 + User can then *cat PATH* to see the bpf iterator output. 41 + 42 + **bpftool iter help** 43 + Print short help message. 44 + 45 + OPTIONS 46 + ======= 47 + -h, --help 48 + Print short generic help message (similar to **bpftool help**). 49 + 50 + -V, --version 51 + Print version number (similar to **bpftool version**). 52 + 53 + -d, --debug 54 + Print all logs available, even debug-level information. This 55 + includes logs from libbpf as well as from the verifier, when 56 + attempting to load programs. 
57 + 58 + EXAMPLES 59 + ======== 60 + **# bpftool iter pin bpf_iter_netlink.o /sys/fs/bpf/my_netlink** 61 + 62 + :: 63 + 64 + Create a file-based bpf iterator from bpf_iter_netlink.o and pin it 65 + to /sys/fs/bpf/my_netlink 66 + 67 + SEE ALSO 68 + ======== 69 + **bpf**\ (2), 70 + **bpf-helpers**\ (7), 71 + **bpftool**\ (8), 72 + **bpftool-btf**\ (8), 73 + **bpftool-cgroup**\ (8), 74 + **bpftool-feature**\ (8), 75 + **bpftool-gen**\ (8), 76 + **bpftool-link**\ (8), 77 + **bpftool-map**\ (8), 78 + **bpftool-net**\ (8), 79 + **bpftool-perf**\ (8), 80 + **bpftool-prog**\ (8), 81 + **bpftool-struct_ops**\ (8)
+6 -3
tools/bpf/bpftool/Documentation/bpftool-link.rst
··· 109 109 **bpf**\ (2), 110 110 **bpf-helpers**\ (7), 111 111 **bpftool**\ (8), 112 - **bpftool-prog\ (8), 113 - **bpftool-map**\ (8), 112 + **bpftool-btf**\ (8), 114 113 **bpftool-cgroup**\ (8), 115 114 **bpftool-feature**\ (8), 115 + **bpftool-gen**\ (8), 116 + **bpftool-iter**\ (8), 117 + **bpftool-map**\ (8), 116 118 **bpftool-net**\ (8), 117 119 **bpftool-perf**\ (8), 118 - **bpftool-btf**\ (8) 120 + **bpftool-prog**\ (8), 121 + **bpftool-struct_ops**\ (8)
+24 -13
tools/bpf/bpftool/Documentation/bpftool-map.rst
··· 21 21 MAP COMMANDS 22 22 ============= 23 23 24 - | **bpftool** **map { show | list }** [*MAP*] 24 + | **bpftool** **map** { **show** | **list** } [*MAP*] 25 25 | **bpftool** **map create** *FILE* **type** *TYPE* **key** *KEY_SIZE* **value** *VALUE_SIZE* \ 26 26 | **entries** *MAX_ENTRIES* **name** *NAME* [**flags** *FLAGS*] [**dev** *NAME*] 27 27 | **bpftool** **map dump** *MAP* ··· 49 49 | | **lru_percpu_hash** | **lpm_trie** | **array_of_maps** | **hash_of_maps** 50 50 | | **devmap** | **devmap_hash** | **sockmap** | **cpumap** | **xskmap** | **sockhash** 51 51 | | **cgroup_storage** | **reuseport_sockarray** | **percpu_cgroup_storage** 52 - | | **queue** | **stack** } 52 + | | **queue** | **stack** | **sk_storage** | **struct_ops** } 53 53 54 54 DESCRIPTION 55 55 =========== ··· 66 66 Create a new map with given parameters and pin it to *bpffs* 67 67 as *FILE*. 68 68 69 + *FLAGS* should be an integer which is the combination of 70 + desired flags, e.g. 1024 for **BPF_F_MMAPABLE** (see bpf.h 71 + UAPI header for existing flags). 72 + 73 + Keyword **dev** expects a network interface name, and is used 74 + to request hardware offload for the map. 75 + 69 76 **bpftool map dump** *MAP* 70 77 Dump all entries in a given *MAP*. In case of **name**, 71 78 *MAP* may match several maps which will all be dumped. ··· 85 78 exists; **noexist** update only if entry doesn't exist. 86 79 87 80 If the **hex** keyword is provided in front of the bytes 88 - sequence, the bytes are parsed as hexadeximal values, even if 81 + sequence, the bytes are parsed as hexadecimal values, even if 89 82 no "0x" prefix is added. If the keyword is not provided, then 90 83 the bytes are parsed as decimal values, unless a "0x" prefix 91 84 (for hexadecimal) or a "0" prefix (for octal) is provided. ··· 107 100 extensions of *bpffs*. 108 101 109 102 **bpftool** **map event_pipe** *MAP* [**cpu** *N* **index** *M*] 110 - Read events from a BPF_MAP_TYPE_PERF_EVENT_ARRAY map. 
103 + Read events from a **BPF_MAP_TYPE_PERF_EVENT_ARRAY** map. 111 104 112 105 Install perf rings into a perf event array map and dump 113 - output of any bpf_perf_event_output() call in the kernel. 106 + output of any **bpf_perf_event_output**\ () call in the kernel. 114 107 By default read the number of CPUs on the system and 115 108 install perf ring for each CPU in the corresponding index 116 109 in the array. ··· 123 116 receiving events if it installed its rings earlier. 124 117 125 118 **bpftool map peek** *MAP* 126 - Peek next **value** in the queue or stack. 119 + Peek next value in the queue or stack. 127 120 128 121 **bpftool map push** *MAP* **value** *VALUE* 129 - Push **value** onto the stack. 122 + Push *VALUE* onto the stack. 130 123 131 124 **bpftool map pop** *MAP* 132 - Pop and print **value** from the stack. 125 + Pop and print value from the stack. 133 126 134 127 **bpftool map enqueue** *MAP* **value** *VALUE* 135 - Enqueue **value** into the queue. 128 + Enqueue *VALUE* into the queue. 136 129 137 130 **bpftool map dequeue** *MAP* 138 - Dequeue and print **value** from the queue. 131 + Dequeue and print value from the queue. 139 132 140 133 **bpftool map freeze** *MAP* 141 134 Freeze the map as read-only from user space. Entries from a 142 135 frozen map can not longer be updated or deleted with the 143 - **bpf\ ()** system call. This operation is not reversible, 136 + **bpf**\ () system call. This operation is not reversible, 144 137 and the map remains immutable from user space until its 145 138 destruction. However, read and write permissions for BPF 146 139 programs to the map remain unchanged. 
··· 276 269 **bpf**\ (2), 277 270 **bpf-helpers**\ (7), 278 271 **bpftool**\ (8), 279 - **bpftool-prog**\ (8), 272 + **bpftool-btf**\ (8), 280 273 **bpftool-cgroup**\ (8), 281 274 **bpftool-feature**\ (8), 275 + **bpftool-gen**\ (8), 276 + **bpftool-iter**\ (8), 277 + **bpftool-link**\ (8), 282 278 **bpftool-net**\ (8), 283 279 **bpftool-perf**\ (8), 284 - **bpftool-btf**\ (8) 280 + **bpftool-prog**\ (8), 281 + **bpftool-struct_ops**\ (8)
+8 -4
tools/bpf/bpftool/Documentation/bpftool-net.rst
··· 20 20 NET COMMANDS 21 21 ============ 22 22 23 - | **bpftool** **net { show | list }** [ **dev** *NAME* ] 23 + | **bpftool** **net** { **show** | **list** } [ **dev** *NAME* ] 24 24 | **bpftool** **net attach** *ATTACH_TYPE* *PROG* **dev** *NAME* [ **overwrite** ] 25 25 | **bpftool** **net detach** *ATTACH_TYPE* **dev** *NAME* 26 26 | **bpftool** **net help** ··· 194 194 **bpf**\ (2), 195 195 **bpf-helpers**\ (7), 196 196 **bpftool**\ (8), 197 - **bpftool-prog**\ (8), 198 - **bpftool-map**\ (8), 197 + **bpftool-btf**\ (8), 199 198 **bpftool-cgroup**\ (8), 200 199 **bpftool-feature**\ (8), 200 + **bpftool-gen**\ (8), 201 + **bpftool-iter**\ (8), 202 + **bpftool-link**\ (8), 203 + **bpftool-map**\ (8), 201 204 **bpftool-perf**\ (8), 202 - **bpftool-btf**\ (8) 205 + **bpftool-prog**\ (8), 206 + **bpftool-struct_ops**\ (8)
+8 -4
tools/bpf/bpftool/Documentation/bpftool-perf.rst
··· 20 20 PERF COMMANDS 21 21 ============= 22 22 23 - | **bpftool** **perf { show | list }** 23 + | **bpftool** **perf** { **show** | **list** } 24 24 | **bpftool** **perf help** 25 25 26 26 DESCRIPTION ··· 85 85 **bpf**\ (2), 86 86 **bpf-helpers**\ (7), 87 87 **bpftool**\ (8), 88 - **bpftool-prog**\ (8), 89 - **bpftool-map**\ (8), 88 + **bpftool-btf**\ (8), 90 89 **bpftool-cgroup**\ (8), 91 90 **bpftool-feature**\ (8), 91 + **bpftool-gen**\ (8), 92 + **bpftool-iter**\ (8), 93 + **bpftool-link**\ (8), 94 + **bpftool-map**\ (8), 92 95 **bpftool-net**\ (8), 93 - **bpftool-btf**\ (8) 96 + **bpftool-prog**\ (8), 97 + **bpftool-struct_ops**\ (8)
+14 -9
tools/bpf/bpftool/Documentation/bpftool-prog.rst
··· 21 21 PROG COMMANDS 22 22 ============= 23 23 24 - | **bpftool** **prog { show | list }** [*PROG*] 24 + | **bpftool** **prog** { **show** | **list** } [*PROG*] 25 25 | **bpftool** **prog dump xlated** *PROG* [{**file** *FILE* | **opcodes** | **visual** | **linum**}] 26 26 | **bpftool** **prog dump jited** *PROG* [{**file** *FILE* | **opcodes** | **linum**}] 27 27 | **bpftool** **prog pin** *PROG* *FILE* 28 - | **bpftool** **prog { load | loadall }** *OBJ* *PATH* [**type** *TYPE*] [**map** {**idx** *IDX* | **name** *NAME*} *MAP*] [**dev** *NAME*] [**pinmaps** *MAP_DIR*] 28 + | **bpftool** **prog** { **load** | **loadall** } *OBJ* *PATH* [**type** *TYPE*] [**map** {**idx** *IDX* | **name** *NAME*} *MAP*] [**dev** *NAME*] [**pinmaps** *MAP_DIR*] 29 29 | **bpftool** **prog attach** *PROG* *ATTACH_TYPE* [*MAP*] 30 30 | **bpftool** **prog detach** *PROG* *ATTACH_TYPE* [*MAP*] 31 31 | **bpftool** **prog tracelog** ··· 49 49 | *ATTACH_TYPE* := { 50 50 | **msg_verdict** | **stream_verdict** | **stream_parser** | **flow_dissector** 51 51 | } 52 - | *METRIC* := { 52 + | *METRICs* := { 53 53 | **cycles** | **instructions** | **l1d_loads** | **llc_misses** 54 54 | } 55 55 ··· 155 155 **bpftool prog tracelog** 156 156 Dump the trace pipe of the system to the console (stdout). 157 157 Hit <Ctrl+C> to stop printing. BPF programs can write to this 158 - trace pipe at runtime with the **bpf_trace_printk()** helper. 158 + trace pipe at runtime with the **bpf_trace_printk**\ () helper. 159 159 This should be used only for debugging purposes. For 160 160 streaming data from BPF programs to user space, one can use 161 161 perf events (see also **bpftool-map**\ (8)). ··· 195 195 196 196 **bpftool prog profile** *PROG* [**duration** *DURATION*] *METRICs* 197 197 Profile *METRICs* for bpf program *PROG* for *DURATION* 198 - seconds or until user hits Ctrl-C. *DURATION* is optional. 198 + seconds or until user hits <Ctrl+C>. *DURATION* is optional. 
199 199 If *DURATION* is not specified, the profiling will run up to 200 - UINT_MAX seconds. 200 + **UINT_MAX** seconds. 201 201 202 202 **bpftool prog help** 203 203 Print short help message. ··· 267 267 268 268 | 269 269 | **# bpftool prog dump xlated id 10 file /tmp/t** 270 - | **# ls -l /tmp/t** 270 + | **$ ls -l /tmp/t** 271 271 272 272 :: 273 273 ··· 325 325 | **# bpftool prog profile id 337 duration 10 cycles instructions llc_misses** 326 326 327 327 :: 328 + 328 329 51397 run_cnt 329 330 40176203 cycles (83.05%) 330 331 42518139 instructions # 1.06 insns per cycle (83.39%) ··· 336 335 **bpf**\ (2), 337 336 **bpf-helpers**\ (7), 338 337 **bpftool**\ (8), 339 - **bpftool-map**\ (8), 338 + **bpftool-btf**\ (8), 340 339 **bpftool-cgroup**\ (8), 341 340 **bpftool-feature**\ (8), 341 + **bpftool-gen**\ (8), 342 + **bpftool-iter**\ (8), 343 + **bpftool-link**\ (8), 344 + **bpftool-map**\ (8), 342 345 **bpftool-net**\ (8), 343 346 **bpftool-perf**\ (8), 344 - **bpftool-btf**\ (8) 347 + **bpftool-struct_ops**\ (8)
+6 -5
tools/bpf/bpftool/Documentation/bpftool-struct_ops.rst
··· 105 105 **bpf**\ (2), 106 106 **bpf-helpers**\ (7), 107 107 **bpftool**\ (8), 108 - **bpftool-prog**\ (8), 109 - **bpftool-map**\ (8), 108 + **bpftool-btf**\ (8), 110 109 **bpftool-cgroup**\ (8), 111 110 **bpftool-feature**\ (8), 111 + **bpftool-gen**\ (8), 112 + **bpftool-iter**\ (8), 113 + **bpftool-link**\ (8), 114 + **bpftool-map**\ (8), 112 115 **bpftool-net**\ (8), 113 116 **bpftool-perf**\ (8), 114 - **bpftool-btf**\ (8) 115 - **bpftool-gen**\ (8) 116 - 117 + **bpftool-prog**\ (8)
+7 -4
tools/bpf/bpftool/Documentation/bpftool.rst
··· 75 75 ======== 76 76 **bpf**\ (2), 77 77 **bpf-helpers**\ (7), 78 - **bpftool-prog**\ (8), 79 - **bpftool-map**\ (8), 78 + **bpftool-btf**\ (8), 80 79 **bpftool-cgroup**\ (8), 81 80 **bpftool-feature**\ (8), 81 + **bpftool-gen**\ (8), 82 + **bpftool-iter**\ (8), 83 + **bpftool-link**\ (8), 84 + **bpftool-map**\ (8), 82 85 **bpftool-net**\ (8), 83 86 **bpftool-perf**\ (8), 84 - **bpftool-btf**\ (8), 85 - **bpftool-gen**\ (8), 87 + **bpftool-prog**\ (8), 88 + **bpftool-struct_ops**\ (8)
+13
tools/bpf/bpftool/bash-completion/bpftool
··· 610 610 ;; 611 611 esac 612 612 ;; 613 + iter) 614 + case $command in 615 + pin) 616 + _filedir 617 + return 0 618 + ;; 619 + *) 620 + [[ $prev == $object ]] && \ 621 + COMPREPLY=( $( compgen -W 'pin help' \ 622 + -- "$cur" ) ) 623 + ;; 624 + esac 625 + ;; 613 626 map) 614 627 local MAP_TYPE='id pinned name' 615 628 case $command in
+2 -2
tools/bpf/bpftool/btf_dumper.c
··· 271 271 } 272 272 } 273 273 274 - static void btf_int128_shift(__u64 *print_num, u16 left_shift_bits, 275 - u16 right_shift_bits) 274 + static void btf_int128_shift(__u64 *print_num, __u16 left_shift_bits, 275 + __u16 right_shift_bits) 276 276 { 277 277 __u64 upper_num, lower_num; 278 278
+2 -2
tools/bpf/bpftool/cfg.c
··· 157 157 return false; 158 158 } 159 159 160 - static bool is_jmp_insn(u8 code) 160 + static bool is_jmp_insn(__u8 code) 161 161 { 162 162 return BPF_CLASS(code) == BPF_JMP || BPF_CLASS(code) == BPF_JMP32; 163 163 } ··· 176 176 177 177 for (; cur <= end; cur++) { 178 178 if (is_jmp_insn(cur->code)) { 179 - u8 opcode = BPF_OP(cur->code); 179 + __u8 opcode = BPF_OP(cur->code); 180 180 181 181 if (opcode == BPF_EXIT || opcode == BPF_CALL) 182 182 continue;
+88
tools/bpf/bpftool/iter.c
··· 1 + // SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) 2 + // Copyright (C) 2020 Facebook 3 + 4 + #define _GNU_SOURCE 5 + #include <linux/err.h> 6 + #include <bpf/libbpf.h> 7 + 8 + #include "main.h" 9 + 10 + static int do_pin(int argc, char **argv) 11 + { 12 + const char *objfile, *path; 13 + struct bpf_program *prog; 14 + struct bpf_object *obj; 15 + struct bpf_link *link; 16 + int err; 17 + 18 + if (!REQ_ARGS(2)) 19 + usage(); 20 + 21 + objfile = GET_ARG(); 22 + path = GET_ARG(); 23 + 24 + obj = bpf_object__open(objfile); 25 + if (IS_ERR(obj)) { 26 + p_err("can't open objfile %s", objfile); 27 + return -1; 28 + } 29 + 30 + err = bpf_object__load(obj); 31 + if (err) { 32 + p_err("can't load objfile %s", objfile); 33 + goto close_obj; 34 + } 35 + 36 + prog = bpf_program__next(NULL, obj); 37 + if (!prog) { 38 + p_err("can't find bpf program in objfile %s", objfile); 39 + goto close_obj; 40 + } 41 + 42 + link = bpf_program__attach_iter(prog, NULL); 43 + if (IS_ERR(link)) { 44 + err = PTR_ERR(link); 45 + p_err("attach_iter failed for program %s", 46 + bpf_program__name(prog)); 47 + goto close_obj; 48 + } 49 + 50 + err = mount_bpffs_for_pin(path); 51 + if (err) 52 + goto close_link; 53 + 54 + err = bpf_link__pin(link, path); 55 + if (err) { 56 + p_err("pin_iter failed for program %s to path %s", 57 + bpf_program__name(prog), path); 58 + goto close_link; 59 + } 60 + 61 + close_link: 62 + bpf_link__destroy(link); 63 + close_obj: 64 + bpf_object__close(obj); 65 + return err; 66 + } 67 + 68 + static int do_help(int argc, char **argv) 69 + { 70 + fprintf(stderr, 71 + "Usage: %s %s pin OBJ PATH\n" 72 + " %s %s help\n" 73 + "\n", 74 + bin_name, argv[-2], bin_name, argv[-2]); 75 + 76 + return 0; 77 + } 78 + 79 + static const struct cmd cmds[] = { 80 + { "help", do_help }, 81 + { "pin", do_pin }, 82 + { 0 } 83 + }; 84 + 85 + int do_iter(int argc, char **argv) 86 + { 87 + return cmd_select(cmds, argc, argv, do_help); 88 + }
+1
tools/bpf/bpftool/link.c
··· 16 16 [BPF_LINK_TYPE_RAW_TRACEPOINT] = "raw_tracepoint", 17 17 [BPF_LINK_TYPE_TRACING] = "tracing", 18 18 [BPF_LINK_TYPE_CGROUP] = "cgroup", 19 + [BPF_LINK_TYPE_ITER] = "iter", 19 20 }; 20 21 21 22 static int link_parse_fd(int *argc, char ***argv)
+2 -1
tools/bpf/bpftool/main.c
··· 59 59 " %s batch file FILE\n" 60 60 " %s version\n" 61 61 "\n" 62 - " OBJECT := { prog | map | link | cgroup | perf | net | feature | btf | gen | struct_ops }\n" 62 + " OBJECT := { prog | map | link | cgroup | perf | net | feature | btf | gen | struct_ops | iter }\n" 63 63 " " HELP_SPEC_OPTIONS "\n" 64 64 "", 65 65 bin_name, bin_name, bin_name); ··· 224 224 { "btf", do_btf }, 225 225 { "gen", do_gen }, 226 226 { "struct_ops", do_struct_ops }, 227 + { "iter", do_iter }, 227 228 { "version", do_version }, 228 229 { 0 } 229 230 };
+4
tools/bpf/bpftool/main.h
··· 18 18 19 19 #include "json_writer.h" 20 20 21 + /* Make sure we do not use kernel-only integer typedefs */ 22 + #pragma GCC poison u8 u16 u32 u64 s8 s16 s32 s64 23 + 21 24 #define ptr_to_u64(ptr) ((__u64)(unsigned long)(ptr)) 22 25 23 26 #define NEXT_ARG() ({ argc--; argv++; if (argc < 0) usage(); }) ··· 202 199 int do_btf(int argc, char **argv); 203 200 int do_gen(int argc, char **argv); 204 201 int do_struct_ops(int argc, char **argv); 202 + int do_iter(int argc, char **argv); 205 203 206 204 int parse_u32_arg(int *argc, char ***argv, __u32 *val, const char *what); 207 205 int prog_parse_fd(int *argc, char ***argv);
+2 -1
tools/bpf/bpftool/map.c
··· 1589 1589 " percpu_array | stack_trace | cgroup_array | lru_hash |\n" 1590 1590 " lru_percpu_hash | lpm_trie | array_of_maps | hash_of_maps |\n" 1591 1591 " devmap | devmap_hash | sockmap | cpumap | xskmap | sockhash |\n" 1592 - " cgroup_storage | reuseport_sockarray | percpu_cgroup_storage }\n" 1592 + " cgroup_storage | reuseport_sockarray | percpu_cgroup_storage |\n" 1593 + " queue | stack | sk_storage | struct_ops }\n" 1593 1594 " " HELP_SPEC_OPTIONS "\n" 1594 1595 "", 1595 1596 bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2],
+1 -1
tools/bpf/bpftool/map_perf_ring.c
··· 39 39 40 40 struct perf_event_sample { 41 41 struct perf_event_header header; 42 - u64 time; 42 + __u64 time; 43 43 __u32 size; 44 44 unsigned char data[]; 45 45 };
+1 -1
tools/bpf/bpftool/prog.c
··· 238 238 return fd; 239 239 } 240 240 241 - static void show_prog_maps(int fd, u32 num_maps) 241 + static void show_prog_maps(int fd, __u32 num_maps) 242 242 { 243 243 struct bpf_prog_info info = {}; 244 244 __u32 len = sizeof(info);
+2 -1
tools/bpf/runqslower/Makefile
··· 8 8 LIBBPF_SRC := $(abspath ../../lib/bpf) 9 9 BPFOBJ := $(OUTPUT)/libbpf.a 10 10 BPF_INCLUDE := $(OUTPUT) 11 - INCLUDES := -I$(OUTPUT) -I$(BPF_INCLUDE) -I$(abspath ../../lib) 11 + INCLUDES := -I$(OUTPUT) -I$(BPF_INCLUDE) -I$(abspath ../../lib) \ 12 + -I$(abspath ../../include/uapi) 12 13 CFLAGS := -g -Wall 13 14 14 15 # Try to detect best kernel BTF source
+131 -42
tools/include/uapi/linux/bpf.h
··· 116 116 BPF_LINK_GET_FD_BY_ID, 117 117 BPF_LINK_GET_NEXT_ID, 118 118 BPF_ENABLE_STATS, 119 + BPF_ITER_CREATE, 119 120 }; 120 121 121 122 enum bpf_map_type { ··· 219 218 BPF_TRACE_FEXIT, 220 219 BPF_MODIFY_RETURN, 221 220 BPF_LSM_MAC, 221 + BPF_TRACE_ITER, 222 222 __MAX_BPF_ATTACH_TYPE 223 223 }; 224 224 ··· 230 228 BPF_LINK_TYPE_RAW_TRACEPOINT = 1, 231 229 BPF_LINK_TYPE_TRACING = 2, 232 230 BPF_LINK_TYPE_CGROUP = 3, 231 + BPF_LINK_TYPE_ITER = 4, 233 232 234 233 MAX_BPF_LINK_TYPE, 235 234 }; ··· 615 612 __u32 type; 616 613 } enable_stats; 617 614 615 + struct { /* struct used by BPF_ITER_CREATE command */ 616 + __u32 link_fd; 617 + __u32 flags; 618 + } iter_create; 619 + 618 620 } __attribute__((aligned(8))); 619 621 620 622 /* The description below is an attempt at providing documentation to eBPF ··· 675 667 * For tracing programs, safely attempt to read *size* bytes from 676 668 * kernel space address *unsafe_ptr* and store the data in *dst*. 677 669 * 678 - * Generally, use bpf_probe_read_user() or bpf_probe_read_kernel() 679 - * instead. 670 + * Generally, use **bpf_probe_read_user**\ () or 671 + * **bpf_probe_read_kernel**\ () instead. 680 672 * Return 681 673 * 0 on success, or a negative error in case of failure. 682 674 * ··· 684 676 * Description 685 677 * Return the time elapsed since system boot, in nanoseconds. 686 678 * Does not include time the system was suspended. 687 - * See: clock_gettime(CLOCK_MONOTONIC) 679 + * See: **clock_gettime**\ (**CLOCK_MONOTONIC**) 688 680 * Return 689 681 * Current *ktime*. 690 682 * ··· 1543 1535 * int bpf_probe_read_str(void *dst, u32 size, const void *unsafe_ptr) 1544 1536 * Description 1545 1537 * Copy a NUL terminated string from an unsafe kernel address 1546 - * *unsafe_ptr* to *dst*. See bpf_probe_read_kernel_str() for 1538 + * *unsafe_ptr* to *dst*. See **bpf_probe_read_kernel_str**\ () for 1547 1539 * more details. 
1548 1540 * 1549 - * Generally, use bpf_probe_read_user_str() or bpf_probe_read_kernel_str() 1550 - * instead. 1541 + * Generally, use **bpf_probe_read_user_str**\ () or 1542 + * **bpf_probe_read_kernel_str**\ () instead. 1551 1543 * Return 1552 1544 * On success, the strictly positive length of the string, 1553 1545 * including the trailing NUL character. On error, a negative ··· 1575 1567 * 1576 1568 * u64 bpf_get_socket_cookie(struct bpf_sock_ops *ctx) 1577 1569 * Description 1578 - * Equivalent to bpf_get_socket_cookie() helper that accepts 1570 + * Equivalent to **bpf_get_socket_cookie**\ () helper that accepts 1579 1571 * *skb*, but gets socket from **struct bpf_sock_ops** context. 1580 1572 * Return 1581 1573 * A 8-byte long non-decreasing number. ··· 1604 1596 * The option value of length *optlen* is pointed by *optval*. 1605 1597 * 1606 1598 * *bpf_socket* should be one of the following: 1599 + * 1607 1600 * * **struct bpf_sock_ops** for **BPF_PROG_TYPE_SOCK_OPS**. 1608 1601 * * **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT** 1609 1602 * and **BPF_CGROUP_INET6_CONNECT**. ··· 1673 1664 * 1674 1665 * The lower two bits of *flags* are used as the return code if 1675 1666 * the map lookup fails. This is so that the return value can be 1676 - * one of the XDP program return codes up to XDP_TX, as chosen by 1677 - * the caller. Any higher bits in the *flags* argument must be 1667 + * one of the XDP program return codes up to **XDP_TX**, as chosen 1668 + * by the caller. Any higher bits in the *flags* argument must be 1678 1669 * unset. 1679 1670 * 1680 - * See also bpf_redirect(), which only supports redirecting to an 1681 - * ifindex, but doesn't require a map to do so. 1671 + * See also **bpf_redirect**\ (), which only supports redirecting 1672 + * to an ifindex, but doesn't require a map to do so. 1682 1673 * Return 1683 1674 * **XDP_REDIRECT** on success, or the value of the two lower bits 1684 1675 * of the *flags* argument on error. 
··· 1786 1777 * the time running for event since last normalization. The 1787 1778 * enabled and running times are accumulated since the perf event 1788 1779 * open. To achieve scaling factor between two invocations of an 1789 - * eBPF program, users can can use CPU id as the key (which is 1780 + * eBPF program, users can use CPU id as the key (which is 1790 1781 * typical for perf array usage model) to remember the previous 1791 1782 * value and do the calculation inside the eBPF program. 1792 1783 * Return ··· 1813 1804 * *opval* and of length *optlen*. 1814 1805 * 1815 1806 * *bpf_socket* should be one of the following: 1807 + * 1816 1808 * * **struct bpf_sock_ops** for **BPF_PROG_TYPE_SOCK_OPS**. 1817 1809 * * **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT** 1818 1810 * and **BPF_CGROUP_INET6_CONNECT**. ··· 1835 1825 * The first argument is the context *regs* on which the kprobe 1836 1826 * works. 1837 1827 * 1838 - * This helper works by setting setting the PC (program counter) 1828 + * This helper works by setting the PC (program counter) 1839 1829 * to an override function which is run in place of the original 1840 1830 * probed function. This means the probed function is not run at 1841 1831 * all. The replacement function just returns with the required ··· 2004 1994 * 2005 1995 * This helper works for IPv4 and IPv6, TCP and UDP sockets. The 2006 1996 * domain (*addr*\ **->sa_family**) must be **AF_INET** (or 2007 - * **AF_INET6**). Looking for a free port to bind to can be 2008 - * expensive, therefore binding to port is not permitted by the 2009 - * helper: *addr*\ **->sin_port** (or **sin6_port**, respectively) 2010 - * must be set to zero. 1997 + * **AF_INET6**). It's advised to pass zero port (**sin_port** 1998 + * or **sin6_port**) which triggers IP_BIND_ADDRESS_NO_PORT-like 1999 + * behavior and lets the kernel efficiently pick up an unused 2000 + * port as long as 4-tuple is unique. 
Passing non-zero port might 2001 + * lead to degraded performance. 2011 2002 * Return 2012 2003 * 0 on success, or a negative error in case of failure. 2013 2004 * ··· 2302 2291 * **bpf_rc_keydown**\ () again with the same values, or calling 2303 2292 * **bpf_rc_repeat**\ (). 2304 2293 * 2305 - * Some protocols include a toggle bit, in case the button was 2294 + * Some protocols include a toggle bit, in case the button was 2306 2295 * released and pressed again between consecutive scancodes. 2307 2296 * 2308 2297 * The *ctx* should point to the lirc sample as passed into ··· 2648 2637 * 2649 2638 * *th* points to the start of the TCP header, while *th_len* 2650 2639 * contains **sizeof**\ (**struct tcphdr**). 2651 - * 2652 2640 * Return 2653 2641 * 0 if *iph* and *th* are a valid SYN cookie ACK, or a negative 2654 2642 * error otherwise. ··· 2830 2820 * 2831 2821 * *th* points to the start of the TCP header, while *th_len* 2832 2822 * contains the length of the TCP header. 2833 - * 2834 2823 * Return 2835 2824 * On success, lower 32 bits hold the generated SYN cookie in 2836 2825 * followed by 16 bits which hold the MSS value for that cookie, ··· 2912 2903 * // size, after checking its boundaries. 2913 2904 * } 2914 2905 * 2915 - * In comparison, using **bpf_probe_read_user()** helper here 2906 + * In comparison, using **bpf_probe_read_user**\ () helper here 2916 2907 * instead to read the string would require to estimate the length 2917 2908 * at compile time, and would often result in copying more memory 2918 2909 * than necessary. ··· 2930 2921 * int bpf_probe_read_kernel_str(void *dst, u32 size, const void *unsafe_ptr) 2931 2922 * Description 2932 2923 * Copy a NUL terminated string from an unsafe kernel address *unsafe_ptr* 2933 - * to *dst*. Same semantics as with bpf_probe_read_user_str() apply. 2924 + * to *dst*. Same semantics as with **bpf_probe_read_user_str**\ () apply. 
2934 2925 * Return 2935 - * On success, the strictly positive length of the string, including 2926 + * On success, the strictly positive length of the string, including 2936 2927 * the trailing NUL character. On error, a negative value. 2937 2928 * 2938 2929 * int bpf_tcp_send_ack(void *tp, u32 rcv_nxt) 2939 2930 * Description 2940 - * Send out a tcp-ack. *tp* is the in-kernel struct tcp_sock. 2931 + * Send out a tcp-ack. *tp* is the in-kernel struct **tcp_sock**. 2941 2932 * *rcv_nxt* is the ack_seq to be sent out. 2942 2933 * Return 2943 2934 * 0 on success, or a negative error in case of failure. ··· 2965 2956 * int bpf_read_branch_records(struct bpf_perf_event_data *ctx, void *buf, u32 size, u64 flags) 2966 2957 * Description 2967 2958 * For an eBPF program attached to a perf event, retrieve the 2968 - * branch records (struct perf_branch_entry) associated to *ctx* 2969 - * and store it in the buffer pointed by *buf* up to size 2959 + * branch records (**struct perf_branch_entry**) associated to *ctx* 2960 + * and store it in the buffer pointed by *buf* up to size 2970 2961 * *size* bytes. 2971 2962 * Return 2972 2963 * On success, number of bytes written to *buf*. On error, a 2973 2964 * negative value. 2974 2965 * 2975 2966 * The *flags* can be set to **BPF_F_GET_BRANCH_RECORDS_SIZE** to 2976 - * instead return the number of bytes required to store all the 2967 + * instead return the number of bytes required to store all the 2977 2968 * branch entries. If this flag is set, *buf* may be NULL. 2978 2969 * 2979 2970 * **-EINVAL** if arguments invalid or **size** not a multiple 2980 - * of sizeof(struct perf_branch_entry). 2971 + * of **sizeof**\ (**struct perf_branch_entry**\ ). 2981 2972 * 2982 2973 * **-ENOENT** if architecture does not support branch records. 2983 2974 * ··· 2985 2976 * Description 2986 2977 * Returns 0 on success, values for *pid* and *tgid* as seen from the current 2987 2978 * *namespace* will be returned in *nsdata*. 
2988 - * 2989 - * On failure, the returned value is one of the following: 2979 + * Return 2980 + * 0 on success, or one of the following in case of failure: 2990 2981 * 2991 2982 * **-EINVAL** if dev and inum supplied don't match dev_t and inode number 2992 2983 * with nsfs of current task, or if dev conversion to dev_t lost high bits. ··· 3025 3016 * a global identifier that can be assumed unique. If *ctx* is 3026 3017 * NULL, then the helper returns the cookie for the initial 3027 3018 * network namespace. The cookie itself is very similar to that 3028 - * of bpf_get_socket_cookie() helper, but for network namespaces 3029 - * instead of sockets. 3019 + * of **bpf_get_socket_cookie**\ () helper, but for network 3020 + * namespaces instead of sockets. 3030 3021 * Return 3031 3022 * A 8-byte long opaque number. 3032 3023 * ··· 3061 3052 * 3062 3053 * The *flags* argument must be zero. 3063 3054 * Return 3064 - * 0 on success, or a negative errno in case of failure. 3055 + * 0 on success, or a negative error in case of failure: 3065 3056 * 3066 - * * **-EINVAL** Unsupported flags specified. 3067 - * * **-ENOENT** Socket is unavailable for assignment. 3068 - * * **-ENETUNREACH** Socket is unreachable (wrong netns). 3069 - * * **-EOPNOTSUPP** Unsupported operation, for example a 3070 - * call from outside of TC ingress. 3071 - * * **-ESOCKTNOSUPPORT** Socket type not supported (reuseport). 3057 + * **-EINVAL** if specified *flags* are not supported. 3058 + * 3059 + * **-ENOENT** if the socket is unavailable for assignment. 3060 + * 3061 + * **-ENETUNREACH** if the socket is unreachable (wrong netns). 3062 + * 3063 + * **-EOPNOTSUPP** if the operation is not supported, for example 3064 + * a call from outside of TC ingress. 3065 + * 3066 + * **-ESOCKTNOSUPPORT** if the socket type is not supported 3067 + * (reuseport). 3072 3068 * 3073 3069 * u64 bpf_ktime_get_boot_ns(void) 3074 3070 * Description 3075 3071 * Return the time elapsed since system boot, in nanoseconds. 
3076 3072 * Does include the time the system was suspended. 3077 - * See: clock_gettime(CLOCK_BOOTTIME) 3073 + * See: **clock_gettime**\ (**CLOCK_BOOTTIME**) 3078 3074 * Return 3079 3075 * Current *ktime*. 3076 + * 3077 + * int bpf_seq_printf(struct seq_file *m, const char *fmt, u32 fmt_size, const void *data, u32 data_len) 3078 + * Description 3079 + * **bpf_seq_printf**\ () uses seq_file **seq_printf**\ () to print 3080 + * out the format string. 3081 + * The *m* represents the seq_file. The *fmt* and *fmt_size* are for 3082 + * the format string itself. The *data* and *data_len* are format string 3083 + * arguments. The *data* are a **u64** array and corresponding format string 3084 + * values are stored in the array. For strings and pointers where pointees 3085 + * are accessed, only the pointer values are stored in the *data* array. 3086 + * The *data_len* is the size of *data* in bytes. 3087 + * 3088 + * Formats **%s**, **%p{i,I}{4,6}** requires to read kernel memory. 3089 + * Reading kernel memory may fail due to either invalid address or 3090 + * valid address but requiring a major memory fault. If reading kernel memory 3091 + * fails, the string for **%s** will be an empty string, and the ip 3092 + * address for **%p{i,I}{4,6}** will be 0. Not returning error to 3093 + * bpf program is consistent with what **bpf_trace_printk**\ () does for now. 3094 + * Return 3095 + * 0 on success, or a negative error in case of failure: 3096 + * 3097 + * **-EBUSY** if per-CPU memory copy buffer is busy, can try again 3098 + * by returning 1 from bpf program. 3099 + * 3100 + * **-EINVAL** if arguments are invalid, or if *fmt* is invalid/unsupported. 3101 + * 3102 + * **-E2BIG** if *fmt* contains too many format specifiers. 3103 + * 3104 + * **-EOVERFLOW** if an overflow happened: The same object will be tried again. 
3105 + * 3106 + * int bpf_seq_write(struct seq_file *m, const void *data, u32 len) 3107 + * Description 3108 + * **bpf_seq_write**\ () uses seq_file **seq_write**\ () to write the data. 3109 + * The *m* represents the seq_file. The *data* and *len* represent the 3110 + * data to write in bytes. 3111 + * Return 3112 + * 0 on success, or a negative error in case of failure: 3113 + * 3114 + * **-EOVERFLOW** if an overflow happened: The same object will be tried again. 3115 + * 3116 + * u64 bpf_sk_cgroup_id(struct bpf_sock *sk) 3117 + * Description 3118 + * Return the cgroup v2 id of the socket *sk*. 3119 + * 3120 + * *sk* must be a non-**NULL** pointer to a full socket, e.g. one 3121 + * returned from **bpf_sk_lookup_xxx**\ (), 3122 + * **bpf_sk_fullsock**\ (), etc. The format of returned id is 3123 + * same as in **bpf_skb_cgroup_id**\ (). 3124 + * 3125 + * This helper is available only if the kernel was compiled with 3126 + * the **CONFIG_SOCK_CGROUP_DATA** configuration option. 3127 + * Return 3128 + * The id is returned or 0 in case the id could not be retrieved. 3129 + * 3130 + * u64 bpf_sk_ancestor_cgroup_id(struct bpf_sock *sk, int ancestor_level) 3131 + * Description 3132 + * Return id of cgroup v2 that is ancestor of cgroup associated 3133 + * with the *sk* at the *ancestor_level*. The root cgroup is at 3134 + * *ancestor_level* zero and each step down the hierarchy 3135 + * increments the level. If *ancestor_level* == level of cgroup 3136 + * associated with *sk*, then return value will be same as that 3137 + * of **bpf_sk_cgroup_id**\ (). 3138 + * 3139 + * The helper is useful to implement policies based on cgroups 3140 + * that are upper in hierarchy than immediate cgroup associated 3141 + * with *sk*. 3142 + * 3143 + * The format of returned id and helper limitations are same as in 3144 + * **bpf_sk_cgroup_id**\ (). 3145 + * Return 3146 + * The id is returned or 0 in case the id could not be retrieved. 
3080 3147 */ 3081 3148 #define __BPF_FUNC_MAPPER(FN) \ 3082 3149 FN(unspec), \ ··· 3280 3195 FN(get_netns_cookie), \ 3281 3196 FN(get_current_ancestor_cgroup_id), \ 3282 3197 FN(sk_assign), \ 3283 - FN(ktime_get_boot_ns), 3198 + FN(ktime_get_boot_ns), \ 3199 + FN(seq_printf), \ 3200 + FN(seq_write), \ 3201 + FN(sk_cgroup_id), \ 3202 + FN(sk_ancestor_cgroup_id), 3284 3203 3285 3204 /* integer value in 'imm' field of BPF_CALL instruction selects which helper 3286 3205 * function eBPF program intends to call ··· 3762 3673 __u32 user_ip6[4]; /* Allows 1,2,4,8-byte read and 4,8-byte write. 3763 3674 * Stored in network byte order. 3764 3675 */ 3765 - __u32 user_port; /* Allows 4-byte read and write. 3676 + __u32 user_port; /* Allows 1,2,4-byte read and 4-byte write. 3766 3677 * Stored in network byte order 3767 3678 */ 3768 3679 __u32 family; /* Allows 4-byte read, but no write */
+10
tools/lib/bpf/bpf.c
··· 619 619 return sys_bpf(BPF_LINK_UPDATE, &attr, sizeof(attr)); 620 620 } 621 621 622 + int bpf_iter_create(int link_fd) 623 + { 624 + union bpf_attr attr; 625 + 626 + memset(&attr, 0, sizeof(attr)); 627 + attr.iter_create.link_fd = link_fd; 628 + 629 + return sys_bpf(BPF_ITER_CREATE, &attr, sizeof(attr)); 630 + } 631 + 622 632 int bpf_prog_query(int target_fd, enum bpf_attach_type type, __u32 query_flags, 623 633 __u32 *attach_flags, __u32 *prog_ids, __u32 *prog_cnt) 624 634 {
+2
tools/lib/bpf/bpf.h
··· 187 187 LIBBPF_API int bpf_link_update(int link_fd, int new_prog_fd, 188 188 const struct bpf_link_update_opts *opts); 189 189 190 + LIBBPF_API int bpf_iter_create(int link_fd); 191 + 190 192 struct bpf_prog_test_run_attr { 191 193 int prog_fd; 192 194 int repeat;
+14
tools/lib/bpf/bpf_helpers.h
··· 37 37 #endif 38 38 39 39 /* 40 + * Helper macro to manipulate data structures 41 + */ 42 + #ifndef offsetof 43 + #define offsetof(TYPE, MEMBER) ((size_t)&((TYPE *)0)->MEMBER) 44 + #endif 45 + #ifndef container_of 46 + #define container_of(ptr, type, member) \ 47 + ({ \ 48 + void *__mptr = (void *)(ptr); \ 49 + ((type *)(__mptr - offsetof(type, member))); \ 50 + }) 51 + #endif 52 + 53 + /* 40 54 * Helper structure used by eBPF C program 41 55 * to describe BPF map attributes to libbpf loader 42 56 */
+16
tools/lib/bpf/bpf_tracing.h
··· 413 413 } \ 414 414 static __always_inline typeof(name(0)) ____##name(struct pt_regs *ctx, ##args) 415 415 416 + /* 417 + * BPF_SEQ_PRINTF to wrap bpf_seq_printf to-be-printed values 418 + * in a structure. 419 + */ 420 + #define BPF_SEQ_PRINTF(seq, fmt, args...) \ 421 + ({ \ 422 + _Pragma("GCC diagnostic push") \ 423 + _Pragma("GCC diagnostic ignored \"-Wint-conversion\"") \ 424 + static const char ___fmt[] = fmt; \ 425 + unsigned long long ___param[] = { args }; \ 426 + _Pragma("GCC diagnostic pop") \ 427 + int ___ret = bpf_seq_printf(seq, ___fmt, sizeof(___fmt), \ 428 + ___param, sizeof(___param)); \ 429 + ___ret; \ 430 + }) 431 + 416 432 #endif
+82 -8
tools/lib/bpf/libbpf.c
··· 3237 3237 } 3238 3238 3239 3239 static int 3240 - bpf_object__probe_name(struct bpf_object *obj) 3240 + bpf_object__probe_loading(struct bpf_object *obj) 3241 3241 { 3242 3242 struct bpf_load_program_attr attr; 3243 3243 char *cp, errmsg[STRERR_BUFSIZE]; ··· 3257 3257 3258 3258 ret = bpf_load_program_xattr(&attr, NULL, 0); 3259 3259 if (ret < 0) { 3260 - cp = libbpf_strerror_r(errno, errmsg, sizeof(errmsg)); 3261 - pr_warn("Error in %s():%s(%d). Couldn't load basic 'r0 = 0' BPF program.\n", 3262 - __func__, cp, errno); 3263 - return -errno; 3260 + ret = errno; 3261 + cp = libbpf_strerror_r(ret, errmsg, sizeof(errmsg)); 3262 + pr_warn("Error in %s():%s(%d). Couldn't load trivial BPF " 3263 + "program. Make sure your kernel supports BPF " 3264 + "(CONFIG_BPF_SYSCALL=y) and/or that RLIMIT_MEMLOCK is " 3265 + "set to big enough value.\n", __func__, cp, ret); 3266 + return -ret; 3264 3267 } 3265 3268 close(ret); 3266 3269 3267 - /* now try the same program, but with the name */ 3270 + return 0; 3271 + } 3268 3272 3273 + static int 3274 + bpf_object__probe_name(struct bpf_object *obj) 3275 + { 3276 + struct bpf_load_program_attr attr; 3277 + struct bpf_insn insns[] = { 3278 + BPF_MOV64_IMM(BPF_REG_0, 0), 3279 + BPF_EXIT_INSN(), 3280 + }; 3281 + int ret; 3282 + 3283 + /* make sure loading with name works */ 3284 + 3285 + memset(&attr, 0, sizeof(attr)); 3286 + attr.prog_type = BPF_PROG_TYPE_SOCKET_FILTER; 3287 + attr.insns = insns; 3288 + attr.insns_cnt = ARRAY_SIZE(insns); 3289 + attr.license = "GPL"; 3269 3290 attr.name = "test"; 3270 3291 ret = bpf_load_program_xattr(&attr, NULL, 0); 3271 3292 if (ret >= 0) { ··· 5657 5636 5658 5637 obj->loaded = true; 5659 5638 5660 - err = bpf_object__probe_caps(obj); 5639 + err = bpf_object__probe_loading(obj); 5640 + err = err ? : bpf_object__probe_caps(obj); 5661 5641 err = err ? : bpf_object__resolve_externs(obj, obj->kconfig); 5662 5642 err = err ? : bpf_object__sanitize_and_load_btf(obj); 5663 5643 err = err ? 
: bpf_object__sanitize_maps(obj); ··· 6608 6586 struct bpf_program *prog); 6609 6587 static struct bpf_link *attach_lsm(const struct bpf_sec_def *sec, 6610 6588 struct bpf_program *prog); 6589 + static struct bpf_link *attach_iter(const struct bpf_sec_def *sec, 6590 + struct bpf_program *prog); 6611 6591 6612 6592 static const struct bpf_sec_def section_defs[] = { 6613 6593 BPF_PROG_SEC("socket", BPF_PROG_TYPE_SOCKET_FILTER), ··· 6653 6629 .is_attach_btf = true, 6654 6630 .expected_attach_type = BPF_LSM_MAC, 6655 6631 .attach_fn = attach_lsm), 6632 + SEC_DEF("iter/", TRACING, 6633 + .expected_attach_type = BPF_TRACE_ITER, 6634 + .is_attach_btf = true, 6635 + .attach_fn = attach_iter), 6656 6636 BPF_PROG_SEC("xdp", BPF_PROG_TYPE_XDP), 6657 6637 BPF_PROG_SEC("perf_event", BPF_PROG_TYPE_PERF_EVENT), 6658 6638 BPF_PROG_SEC("lwt_in", BPF_PROG_TYPE_LWT_IN), ··· 6919 6891 6920 6892 #define BTF_TRACE_PREFIX "btf_trace_" 6921 6893 #define BTF_LSM_PREFIX "bpf_lsm_" 6894 + #define BTF_ITER_PREFIX "bpf_iter_" 6922 6895 #define BTF_MAX_NAME_SIZE 128 6923 6896 6924 6897 static int find_btf_by_prefix_kind(const struct btf *btf, const char *prefix, ··· 6949 6920 BTF_KIND_TYPEDEF); 6950 6921 else if (attach_type == BPF_LSM_MAC) 6951 6922 err = find_btf_by_prefix_kind(btf, BTF_LSM_PREFIX, name, 6923 + BTF_KIND_FUNC); 6924 + else if (attach_type == BPF_TRACE_ITER) 6925 + err = find_btf_by_prefix_kind(btf, BTF_ITER_PREFIX, name, 6952 6926 BTF_KIND_FUNC); 6953 6927 else 6954 6928 err = btf__find_by_name_kind(btf, name, BTF_KIND_FUNC); ··· 7880 7848 return bpf_program__attach_lsm(prog); 7881 7849 } 7882 7850 7851 + static struct bpf_link *attach_iter(const struct bpf_sec_def *sec, 7852 + struct bpf_program *prog) 7853 + { 7854 + return bpf_program__attach_iter(prog, NULL); 7855 + } 7856 + 7883 7857 struct bpf_link * 7884 7858 bpf_program__attach_cgroup(struct bpf_program *prog, int cgroup_fd) 7885 7859 { ··· 7912 7874 link_fd = -errno; 7913 7875 free(link); 7914 7876 pr_warn("program 
'%s': failed to attach to cgroup: %s\n", 7877 + bpf_program__title(prog, false), 7878 + libbpf_strerror_r(link_fd, errmsg, sizeof(errmsg))); 7879 + return ERR_PTR(link_fd); 7880 + } 7881 + link->fd = link_fd; 7882 + return link; 7883 + } 7884 + 7885 + struct bpf_link * 7886 + bpf_program__attach_iter(struct bpf_program *prog, 7887 + const struct bpf_iter_attach_opts *opts) 7888 + { 7889 + char errmsg[STRERR_BUFSIZE]; 7890 + struct bpf_link *link; 7891 + int prog_fd, link_fd; 7892 + 7893 + if (!OPTS_VALID(opts, bpf_iter_attach_opts)) 7894 + return ERR_PTR(-EINVAL); 7895 + 7896 + prog_fd = bpf_program__fd(prog); 7897 + if (prog_fd < 0) { 7898 + pr_warn("program '%s': can't attach before loaded\n", 7899 + bpf_program__title(prog, false)); 7900 + return ERR_PTR(-EINVAL); 7901 + } 7902 + 7903 + link = calloc(1, sizeof(*link)); 7904 + if (!link) 7905 + return ERR_PTR(-ENOMEM); 7906 + link->detach = &bpf_link__detach_fd; 7907 + 7908 + link_fd = bpf_link_create(prog_fd, 0, BPF_TRACE_ITER, NULL); 7909 + if (link_fd < 0) { 7910 + link_fd = -errno; 7911 + free(link); 7912 + pr_warn("program '%s': failed to attach to iterator: %s\n", 7915 7913 bpf_program__title(prog, false), 7916 7914 libbpf_strerror_r(link_fd, errmsg, sizeof(errmsg))); 7917 7915 return ERR_PTR(link_fd); ··· 8374 8300 struct perf_sample_raw { 8375 8301 struct perf_event_header header; 8376 8302 uint32_t size; 8377 - char data[0]; 8303 + char data[]; 8378 8304 }; 8379 8305 8380 8306 struct perf_sample_lost {
+9
tools/lib/bpf/libbpf.h
··· 258 258 259 259 LIBBPF_API struct bpf_link *bpf_map__attach_struct_ops(struct bpf_map *map); 260 260 261 + struct bpf_iter_attach_opts { 262 + size_t sz; /* size of this struct for forward/backward compatibility */ 263 + }; 264 + #define bpf_iter_attach_opts__last_field sz 265 + 266 + LIBBPF_API struct bpf_link * 267 + bpf_program__attach_iter(struct bpf_program *prog, 268 + const struct bpf_iter_attach_opts *opts); 269 + 261 270 struct bpf_insn; 262 271 263 272 /*
+2
tools/lib/bpf/libbpf.map
··· 258 258 LIBBPF_0.0.9 { 259 259 global: 260 260 bpf_enable_stats; 261 + bpf_iter_create; 261 262 bpf_link_get_fd_by_id; 262 263 bpf_link_get_next_id; 264 + bpf_program__attach_iter; 263 265 } LIBBPF_0.0.8;
+1 -1
tools/lib/bpf/libbpf_internal.h
··· 153 153 __u32 sec_name_off; 154 154 __u32 num_info; 155 155 /* Followed by num_info * record_size number of bytes */ 156 - __u8 data[0]; 156 + __u8 data[]; 157 157 }; 158 158 159 159 /* The minimum bpf_func_info checked by the loader */
+4 -1
tools/perf/builtin-stat.c
··· 686 686 break; 687 687 } 688 688 } 689 - if (child_pid != -1) 689 + if (child_pid != -1) { 690 + if (timeout) 691 + kill(child_pid, SIGTERM); 690 692 wait4(child_pid, &status, 0, &stat_config.ru_data); 693 + } 691 694 692 695 if (workload_exec_errno) { 693 696 const char *emsg = str_error_r(workload_exec_errno, msg, sizeof(msg));
+20
tools/perf/util/annotate.c
··· 1821 1821 } 1822 1822 #endif // defined(HAVE_LIBBFD_SUPPORT) && defined(HAVE_LIBBPF_SUPPORT) 1823 1823 1824 + static int 1825 + symbol__disassemble_bpf_image(struct symbol *sym, 1826 + struct annotate_args *args) 1827 + { 1828 + struct annotation *notes = symbol__annotation(sym); 1829 + struct disasm_line *dl; 1830 + 1831 + args->offset = -1; 1832 + args->line = strdup("to be implemented"); 1833 + args->line_nr = 0; 1834 + dl = disasm_line__new(args); 1835 + if (dl) 1836 + annotation_line__add(&dl->al, &notes->src->source); 1837 + 1838 + free(args->line); 1839 + return 0; 1840 + } 1841 + 1824 1842 /* 1825 1843 * Possibly create a new version of line with tabs expanded. Returns the 1826 1844 * existing or new line, storage is updated if a new line is allocated. If ··· 1938 1920 1939 1921 if (dso->binary_type == DSO_BINARY_TYPE__BPF_PROG_INFO) { 1940 1922 return symbol__disassemble_bpf(sym, args); 1923 + } else if (dso->binary_type == DSO_BINARY_TYPE__BPF_IMAGE) { 1924 + return symbol__disassemble_bpf_image(sym, args); 1941 1925 } else if (dso__is_kcore(dso)) { 1942 1926 kce.kcore_filename = symfs_filename; 1943 1927 kce.addr = map__rip_2objdump(map, sym->start);
+93
tools/perf/util/bpf-event.c
··· 6 6 #include <bpf/libbpf.h> 7 7 #include <linux/btf.h> 8 8 #include <linux/err.h> 9 + #include <linux/string.h> 10 + #include <internal/lib.h> 11 + #include <symbol/kallsyms.h> 9 12 #include "bpf-event.h" 10 13 #include "debug.h" 11 14 #include "dso.h" ··· 293 290 return err ? -1 : 0; 294 291 } 295 292 293 + struct kallsyms_parse { 294 + union perf_event *event; 295 + perf_event__handler_t process; 296 + struct machine *machine; 297 + struct perf_tool *tool; 298 + }; 299 + 300 + static int 301 + process_bpf_image(char *name, u64 addr, struct kallsyms_parse *data) 302 + { 303 + struct machine *machine = data->machine; 304 + union perf_event *event = data->event; 305 + struct perf_record_ksymbol *ksymbol; 306 + int len; 307 + 308 + ksymbol = &event->ksymbol; 309 + 310 + *ksymbol = (struct perf_record_ksymbol) { 311 + .header = { 312 + .type = PERF_RECORD_KSYMBOL, 313 + .size = offsetof(struct perf_record_ksymbol, name), 314 + }, 315 + .addr = addr, 316 + .len = page_size, 317 + .ksym_type = PERF_RECORD_KSYMBOL_TYPE_BPF, 318 + .flags = 0, 319 + }; 320 + 321 + len = scnprintf(ksymbol->name, KSYM_NAME_LEN, "%s", name); 322 + ksymbol->header.size += PERF_ALIGN(len + 1, sizeof(u64)); 323 + memset((void *) event + event->header.size, 0, machine->id_hdr_size); 324 + event->header.size += machine->id_hdr_size; 325 + 326 + return perf_tool__process_synth_event(data->tool, event, machine, 327 + data->process); 328 + } 329 + 330 + static int 331 + kallsyms_process_symbol(void *data, const char *_name, 332 + char type __maybe_unused, u64 start) 333 + { 334 + char disp[KSYM_NAME_LEN]; 335 + char *module, *name; 336 + unsigned long id; 337 + int err = 0; 338 + 339 + module = strchr(_name, '\t'); 340 + if (!module) 341 + return 0; 342 + 343 + /* We are going after [bpf] module ... 
*/ 344 + if (strcmp(module + 1, "[bpf]")) 345 + return 0; 346 + 347 + name = memdup(_name, (module - _name) + 1); 348 + if (!name) 349 + return -ENOMEM; 350 + 351 + name[module - _name] = 0; 352 + 353 + /* .. and only for trampolines and dispatchers */ 354 + if ((sscanf(name, "bpf_trampoline_%lu", &id) == 1) || 355 + (sscanf(name, "bpf_dispatcher_%s", disp) == 1)) 356 + err = process_bpf_image(name, start, data); 357 + 358 + free(name); 359 + return err; 360 + } 361 + 296 362 int perf_event__synthesize_bpf_events(struct perf_session *session, 297 363 perf_event__handler_t process, 298 364 struct machine *machine, 299 365 struct record_opts *opts) 300 366 { 367 + const char *kallsyms_filename = "/proc/kallsyms"; 368 + struct kallsyms_parse arg; 301 369 union perf_event *event; 302 370 __u32 id = 0; 303 371 int err; ··· 377 303 event = malloc(sizeof(event->bpf) + KSYM_NAME_LEN + machine->id_hdr_size); 378 304 if (!event) 379 305 return -1; 306 + 307 + /* Synthesize all the bpf programs in system. */ 380 308 while (true) { 381 309 err = bpf_prog_get_next_id(id, &id); 382 310 if (err) { ··· 411 335 break; 412 336 } 413 337 } 338 + 339 + /* Synthesize all the bpf images - trampolines/dispatchers. */ 340 + if (symbol_conf.kallsyms_name != NULL) 341 + kallsyms_filename = symbol_conf.kallsyms_name; 342 + 343 + arg = (struct kallsyms_parse) { 344 + .event = event, 345 + .process = process, 346 + .machine = machine, 347 + .tool = session->tool, 348 + }; 349 + 350 + if (kallsyms__parse(kallsyms_filename, &arg, kallsyms_process_symbol)) { 351 + pr_err("%s: failed to synthesize bpf images: %s\n", 352 + __func__, strerror(errno)); 353 + } 354 + 414 355 free(event); 415 356 return err; 416 357 }
+1
tools/perf/util/dso.c
··· 191 191 case DSO_BINARY_TYPE__GUEST_KALLSYMS: 192 192 case DSO_BINARY_TYPE__JAVA_JIT: 193 193 case DSO_BINARY_TYPE__BPF_PROG_INFO: 194 + case DSO_BINARY_TYPE__BPF_IMAGE: 194 195 case DSO_BINARY_TYPE__NOT_FOUND: 195 196 ret = -1; 196 197 break;
+1
tools/perf/util/dso.h
··· 40 40 DSO_BINARY_TYPE__GUEST_KCORE, 41 41 DSO_BINARY_TYPE__OPENEMBEDDED_DEBUGINFO, 42 42 DSO_BINARY_TYPE__BPF_PROG_INFO, 43 + DSO_BINARY_TYPE__BPF_IMAGE, 43 44 DSO_BINARY_TYPE__NOT_FOUND, 44 45 }; 45 46
+12
tools/perf/util/machine.c
··· 736 736 return 0; 737 737 } 738 738 739 + static int is_bpf_image(const char *name) 740 + { 741 + return strncmp(name, "bpf_trampoline_", sizeof("bpf_trampoline_") - 1) || 742 + strncmp(name, "bpf_dispatcher_", sizeof("bpf_dispatcher_") - 1); 743 + } 744 + 739 745 static int machine__process_ksymbol_register(struct machine *machine, 740 746 union perf_event *event, 741 747 struct perf_sample *sample __maybe_unused) ··· 765 759 map->start = event->ksymbol.addr; 766 760 map->end = map->start + event->ksymbol.len; 767 761 maps__insert(&machine->kmaps, map); 762 + dso__set_loaded(dso); 763 + 764 + if (is_bpf_image(event->ksymbol.name)) { 765 + dso->binary_type = DSO_BINARY_TYPE__BPF_IMAGE; 766 + dso__set_long_name(dso, "", false); 767 + } 768 768 } 769 769 770 770 sym = symbol__new(map->map_ip(map, map->start),
+1
tools/perf/util/symbol.c
··· 1544 1544 return true; 1545 1545 1546 1546 case DSO_BINARY_TYPE__BPF_PROG_INFO: 1547 + case DSO_BINARY_TYPE__BPF_IMAGE: 1547 1548 case DSO_BINARY_TYPE__NOT_FOUND: 1548 1549 default: 1549 1550 return false;
+1
tools/testing/selftests/bpf/.gitignore
··· 38 38 /bpf_gcc 39 39 /tools 40 40 /runqslower 41 + /bench
+18 -1
tools/testing/selftests/bpf/Makefile
··· 77 77 # Compile but not part of 'make run_tests' 78 78 TEST_GEN_PROGS_EXTENDED = test_sock_addr test_skb_cgroup_id_user \ 79 79 flow_dissector_load test_flow_dissector test_tcp_check_syncookie_user \ 80 - test_lirc_mode2_user xdping test_cpp runqslower 80 + test_lirc_mode2_user xdping test_cpp runqslower bench 81 81 82 82 TEST_CUSTOM_PROGS = urandom_read 83 83 ··· 265 265 TRUNNER_BPF_SKELS := $$(patsubst %.c,$$(TRUNNER_OUTPUT)/%.skel.h, \ 266 266 $$(filter-out $(SKEL_BLACKLIST), \ 267 267 $$(TRUNNER_BPF_SRCS))) 268 + TEST_GEN_FILES += $$(TRUNNER_BPF_OBJS) 268 269 269 270 # Evaluate rules now with extra TRUNNER_XXX variables above already defined 270 271 $$(eval $$(call DEFINE_TEST_RUNNER_RULES,$1,$2)) ··· 355 354 TRUNNER_TESTS_DIR := prog_tests 356 355 TRUNNER_BPF_PROGS_DIR := progs 357 356 TRUNNER_EXTRA_SOURCES := test_progs.c cgroup_helpers.c trace_helpers.c \ 357 + network_helpers.c testing_helpers.c \ 358 358 flow_dissector_load.h 359 359 TRUNNER_EXTRA_FILES := $(OUTPUT)/urandom_read \ 360 360 $(wildcard progs/btf_dump_test_case_*.c) ··· 406 404 $(OUTPUT)/test_cpp: test_cpp.cpp $(OUTPUT)/test_core_extern.skel.h $(BPFOBJ) 407 405 $(call msg,CXX,,$@) 408 406 $(CXX) $(CFLAGS) $^ $(LDLIBS) -o $@ 407 + 408 + # Benchmark runner 409 + $(OUTPUT)/bench_%.o: benchs/bench_%.c bench.h 410 + $(call msg,CC,,$@) 411 + $(CC) $(CFLAGS) -c $(filter %.c,$^) $(LDLIBS) -o $@ 412 + $(OUTPUT)/bench_rename.o: $(OUTPUT)/test_overhead.skel.h 413 + $(OUTPUT)/bench_trigger.o: $(OUTPUT)/trigger_bench.skel.h 414 + $(OUTPUT)/bench.o: bench.h testing_helpers.h 415 + $(OUTPUT)/bench: LDLIBS += -lm 416 + $(OUTPUT)/bench: $(OUTPUT)/bench.o $(OUTPUT)/testing_helpers.o \ 417 + $(OUTPUT)/bench_count.o \ 418 + $(OUTPUT)/bench_rename.o \ 419 + $(OUTPUT)/bench_trigger.o 420 + $(call msg,BINARY,,$@) 421 + $(CC) $(LDFLAGS) -o $@ $(filter %.a %.o,$^) $(LDLIBS) 409 422 410 423 EXTRA_CLEAN := $(TEST_CUSTOM_PROGS) $(SCRATCH_DIR) \ 411 424 prog_tests/tests.h map_tests/tests.h verifier/tests.h \
+43
tools/testing/selftests/bpf/README.rst
··· 1 + ================== 2 + BPF Selftest Notes 3 + ================== 4 + 5 + Additional information about selftest failures are 6 + documented here. 7 + 8 + bpf_iter test failures with clang/llvm 10.0.0 9 + ============================================= 10 + 11 + With clang/llvm 10.0.0, the following two bpf_iter tests failed: 12 + * ``bpf_iter/ipv6_route`` 13 + * ``bpf_iter/netlink`` 14 + 15 + The symptom for ``bpf_iter/ipv6_route`` looks like 16 + 17 + .. code-block:: c 18 + 19 + 2: (79) r8 = *(u64 *)(r1 +8) 20 + ... 21 + 14: (bf) r2 = r8 22 + 15: (0f) r2 += r1 23 + ; BPF_SEQ_PRINTF(seq, "%pi6 %02x ", &rt->fib6_dst.addr, rt->fib6_dst.plen); 24 + 16: (7b) *(u64 *)(r8 +64) = r2 25 + only read is supported 26 + 27 + The symptom for ``bpf_iter/netlink`` looks like 28 + 29 + .. code-block:: c 30 + 31 + ; struct netlink_sock *nlk = ctx->sk; 32 + 2: (79) r7 = *(u64 *)(r1 +8) 33 + ... 34 + 15: (bf) r2 = r7 35 + 16: (0f) r2 += r1 36 + ; BPF_SEQ_PRINTF(seq, "%pK %-3d ", s, s->sk_protocol); 37 + 17: (7b) *(u64 *)(r7 +0) = r2 38 + only read is supported 39 + 40 + This is due to a llvm BPF backend bug. The fix 41 + https://reviews.llvm.org/D78466 42 + has been pushed to llvm 10.x release branch and will be 43 + available in 10.0.1. The fix is available in llvm 11.0.0 trunk.
+449
tools/testing/selftests/bpf/bench.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* Copyright (c) 2020 Facebook */ 3 + #define _GNU_SOURCE 4 + #include <argp.h> 5 + #include <linux/compiler.h> 6 + #include <sys/time.h> 7 + #include <sched.h> 8 + #include <fcntl.h> 9 + #include <pthread.h> 10 + #include <sys/sysinfo.h> 11 + #include <sys/resource.h> 12 + #include <signal.h> 13 + #include "bench.h" 14 + #include "testing_helpers.h" 15 + 16 + struct env env = { 17 + .warmup_sec = 1, 18 + .duration_sec = 5, 19 + .affinity = false, 20 + .consumer_cnt = 1, 21 + .producer_cnt = 1, 22 + }; 23 + 24 + static int libbpf_print_fn(enum libbpf_print_level level, 25 + const char *format, va_list args) 26 + { 27 + if (level == LIBBPF_DEBUG && !env.verbose) 28 + return 0; 29 + return vfprintf(stderr, format, args); 30 + } 31 + 32 + static int bump_memlock_rlimit(void) 33 + { 34 + struct rlimit rlim_new = { 35 + .rlim_cur = RLIM_INFINITY, 36 + .rlim_max = RLIM_INFINITY, 37 + }; 38 + 39 + return setrlimit(RLIMIT_MEMLOCK, &rlim_new); 40 + } 41 + 42 + void setup_libbpf() 43 + { 44 + int err; 45 + 46 + libbpf_set_print(libbpf_print_fn); 47 + 48 + err = bump_memlock_rlimit(); 49 + if (err) 50 + fprintf(stderr, "failed to increase RLIMIT_MEMLOCK: %d", err); 51 + } 52 + 53 + void hits_drops_report_progress(int iter, struct bench_res *res, long delta_ns) 54 + { 55 + double hits_per_sec, drops_per_sec; 56 + double hits_per_prod; 57 + 58 + hits_per_sec = res->hits / 1000000.0 / (delta_ns / 1000000000.0); 59 + hits_per_prod = hits_per_sec / env.producer_cnt; 60 + drops_per_sec = res->drops / 1000000.0 / (delta_ns / 1000000000.0); 61 + 62 + printf("Iter %3d (%7.3lfus): ", 63 + iter, (delta_ns - 1000000000) / 1000.0); 64 + 65 + printf("hits %8.3lfM/s (%7.3lfM/prod), drops %8.3lfM/s\n", 66 + hits_per_sec, hits_per_prod, drops_per_sec); 67 + } 68 + 69 + void hits_drops_report_final(struct bench_res res[], int res_cnt) 70 + { 71 + int i; 72 + double hits_mean = 0.0, drops_mean = 0.0; 73 + double hits_stddev = 0.0, drops_stddev = 
0.0; 74 + 75 + for (i = 0; i < res_cnt; i++) { 76 + hits_mean += res[i].hits / 1000000.0 / (0.0 + res_cnt); 77 + drops_mean += res[i].drops / 1000000.0 / (0.0 + res_cnt); 78 + } 79 + 80 + if (res_cnt > 1) { 81 + for (i = 0; i < res_cnt; i++) { 82 + hits_stddev += (hits_mean - res[i].hits / 1000000.0) * 83 + (hits_mean - res[i].hits / 1000000.0) / 84 + (res_cnt - 1.0); 85 + drops_stddev += (drops_mean - res[i].drops / 1000000.0) * 86 + (drops_mean - res[i].drops / 1000000.0) / 87 + (res_cnt - 1.0); 88 + } 89 + hits_stddev = sqrt(hits_stddev); 90 + drops_stddev = sqrt(drops_stddev); 91 + } 92 + printf("Summary: hits %8.3lf \u00B1 %5.3lfM/s (%7.3lfM/prod), ", 93 + hits_mean, hits_stddev, hits_mean / env.producer_cnt); 94 + printf("drops %8.3lf \u00B1 %5.3lfM/s\n", 95 + drops_mean, drops_stddev); 96 + } 97 + 98 + const char *argp_program_version = "benchmark"; 99 + const char *argp_program_bug_address = "<bpf@vger.kernel.org>"; 100 + const char argp_program_doc[] = 101 + "benchmark Generic benchmarking framework.\n" 102 + "\n" 103 + "This tool runs benchmarks.\n" 104 + "\n" 105 + "USAGE: benchmark <bench-name>\n" 106 + "\n" 107 + "EXAMPLES:\n" 108 + " # run 'count-local' benchmark with 1 producer and 1 consumer\n" 109 + " benchmark count-local\n" 110 + " # run 'count-local' with 16 producer and 8 consumer thread, pinned to CPUs\n" 111 + " benchmark -p16 -c8 -a count-local\n"; 112 + 113 + enum { 114 + ARG_PROD_AFFINITY_SET = 1000, 115 + ARG_CONS_AFFINITY_SET = 1001, 116 + }; 117 + 118 + static const struct argp_option opts[] = { 119 + { "list", 'l', NULL, 0, "List available benchmarks"}, 120 + { "duration", 'd', "SEC", 0, "Duration of benchmark, seconds"}, 121 + { "warmup", 'w', "SEC", 0, "Warm-up period, seconds"}, 122 + { "producers", 'p', "NUM", 0, "Number of producer threads"}, 123 + { "consumers", 'c', "NUM", 0, "Number of consumer threads"}, 124 + { "verbose", 'v', NULL, 0, "Verbose debug output"}, 125 + { "affinity", 'a', NULL, 0, "Set consumer/producer thread 
affinity"}, 126 + { "prod-affinity", ARG_PROD_AFFINITY_SET, "CPUSET", 0, 127 + "Set of CPUs for producer threads; implies --affinity"}, 128 + { "cons-affinity", ARG_CONS_AFFINITY_SET, "CPUSET", 0, 129 + "Set of CPUs for consumer threads; implies --affinity"}, 130 + {}, 131 + }; 132 + 133 + static error_t parse_arg(int key, char *arg, struct argp_state *state) 134 + { 135 + static int pos_args; 136 + 137 + switch (key) { 138 + case 'v': 139 + env.verbose = true; 140 + break; 141 + case 'l': 142 + env.list = true; 143 + break; 144 + case 'd': 145 + env.duration_sec = strtol(arg, NULL, 10); 146 + if (env.duration_sec <= 0) { 147 + fprintf(stderr, "Invalid duration: %s\n", arg); 148 + argp_usage(state); 149 + } 150 + break; 151 + case 'w': 152 + env.warmup_sec = strtol(arg, NULL, 10); 153 + if (env.warmup_sec <= 0) { 154 + fprintf(stderr, "Invalid warm-up duration: %s\n", arg); 155 + argp_usage(state); 156 + } 157 + break; 158 + case 'p': 159 + env.producer_cnt = strtol(arg, NULL, 10); 160 + if (env.producer_cnt <= 0) { 161 + fprintf(stderr, "Invalid producer count: %s\n", arg); 162 + argp_usage(state); 163 + } 164 + break; 165 + case 'c': 166 + env.consumer_cnt = strtol(arg, NULL, 10); 167 + if (env.consumer_cnt <= 0) { 168 + fprintf(stderr, "Invalid consumer count: %s\n", arg); 169 + argp_usage(state); 170 + } 171 + break; 172 + case 'a': 173 + env.affinity = true; 174 + break; 175 + case ARG_PROD_AFFINITY_SET: 176 + env.affinity = true; 177 + if (parse_num_list(arg, &env.prod_cpus.cpus, 178 + &env.prod_cpus.cpus_len)) { 179 + fprintf(stderr, "Invalid format of CPU set for producers."); 180 + argp_usage(state); 181 + } 182 + break; 183 + case ARG_CONS_AFFINITY_SET: 184 + env.affinity = true; 185 + if (parse_num_list(arg, &env.cons_cpus.cpus, 186 + &env.cons_cpus.cpus_len)) { 187 + fprintf(stderr, "Invalid format of CPU set for consumers."); 188 + argp_usage(state); 189 + } 190 + break; 191 + case ARGP_KEY_ARG: 192 + if (pos_args++) { 193 + fprintf(stderr, 194 + 
"Unrecognized positional argument: %s\n", arg); 195 + argp_usage(state); 196 + } 197 + env.bench_name = strdup(arg); 198 + break; 199 + default: 200 + return ARGP_ERR_UNKNOWN; 201 + } 202 + return 0; 203 + } 204 + 205 + static void parse_cmdline_args(int argc, char **argv) 206 + { 207 + static const struct argp argp = { 208 + .options = opts, 209 + .parser = parse_arg, 210 + .doc = argp_program_doc, 211 + }; 212 + if (argp_parse(&argp, argc, argv, 0, NULL, NULL)) 213 + exit(1); 214 + if (!env.list && !env.bench_name) { 215 + argp_help(&argp, stderr, ARGP_HELP_DOC, "bench"); 216 + exit(1); 217 + } 218 + } 219 + 220 + static void collect_measurements(long delta_ns); 221 + 222 + static __u64 last_time_ns; 223 + static void sigalarm_handler(int signo) 224 + { 225 + long new_time_ns = get_time_ns(); 226 + long delta_ns = new_time_ns - last_time_ns; 227 + 228 + collect_measurements(delta_ns); 229 + 230 + last_time_ns = new_time_ns; 231 + } 232 + 233 + /* set up periodic 1-second timer */ 234 + static void setup_timer() 235 + { 236 + static struct sigaction sigalarm_action = { 237 + .sa_handler = sigalarm_handler, 238 + }; 239 + struct itimerval timer_settings = {}; 240 + int err; 241 + 242 + last_time_ns = get_time_ns(); 243 + err = sigaction(SIGALRM, &sigalarm_action, NULL); 244 + if (err < 0) { 245 + fprintf(stderr, "failed to install SIGALRM handler: %d\n", -errno); 246 + exit(1); 247 + } 248 + timer_settings.it_interval.tv_sec = 1; 249 + timer_settings.it_value.tv_sec = 1; 250 + err = setitimer(ITIMER_REAL, &timer_settings, NULL); 251 + if (err < 0) { 252 + fprintf(stderr, "failed to arm interval timer: %d\n", -errno); 253 + exit(1); 254 + } 255 + } 256 + 257 + static void set_thread_affinity(pthread_t thread, int cpu) 258 + { 259 + cpu_set_t cpuset; 260 + 261 + CPU_ZERO(&cpuset); 262 + CPU_SET(cpu, &cpuset); 263 + if (pthread_setaffinity_np(thread, sizeof(cpuset), &cpuset)) { 264 + fprintf(stderr, "setting affinity to CPU #%d failed: %d\n", 265 + cpu, errno); 266 + 
exit(1); 267 + } 268 + } 269 + 270 + static int next_cpu(struct cpu_set *cpu_set) 271 + { 272 + if (cpu_set->cpus) { 273 + int i; 274 + 275 + /* find next available CPU */ 276 + for (i = cpu_set->next_cpu; i < cpu_set->cpus_len; i++) { 277 + if (cpu_set->cpus[i]) { 278 + cpu_set->next_cpu = i + 1; 279 + return i; 280 + } 281 + } 282 + fprintf(stderr, "Not enough CPUs specified, need CPU #%d or higher.\n", i); 283 + exit(1); 284 + } 285 + 286 + return cpu_set->next_cpu++; 287 + } 288 + 289 + static struct bench_state { 290 + int res_cnt; 291 + struct bench_res *results; 292 + pthread_t *consumers; 293 + pthread_t *producers; 294 + } state; 295 + 296 + const struct bench *bench = NULL; 297 + 298 + extern const struct bench bench_count_global; 299 + extern const struct bench bench_count_local; 300 + extern const struct bench bench_rename_base; 301 + extern const struct bench bench_rename_kprobe; 302 + extern const struct bench bench_rename_kretprobe; 303 + extern const struct bench bench_rename_rawtp; 304 + extern const struct bench bench_rename_fentry; 305 + extern const struct bench bench_rename_fexit; 306 + extern const struct bench bench_rename_fmodret; 307 + extern const struct bench bench_trig_base; 308 + extern const struct bench bench_trig_tp; 309 + extern const struct bench bench_trig_rawtp; 310 + extern const struct bench bench_trig_kprobe; 311 + extern const struct bench bench_trig_fentry; 312 + extern const struct bench bench_trig_fmodret; 313 + 314 + static const struct bench *benchs[] = { 315 + &bench_count_global, 316 + &bench_count_local, 317 + &bench_rename_base, 318 + &bench_rename_kprobe, 319 + &bench_rename_kretprobe, 320 + &bench_rename_rawtp, 321 + &bench_rename_fentry, 322 + &bench_rename_fexit, 323 + &bench_rename_fmodret, 324 + &bench_trig_base, 325 + &bench_trig_tp, 326 + &bench_trig_rawtp, 327 + &bench_trig_kprobe, 328 + &bench_trig_fentry, 329 + &bench_trig_fmodret, 330 + }; 331 + 332 + static void setup_benchmark() 333 + { 334 + int i, 
err; 335 + 336 + if (!env.bench_name) { 337 + fprintf(stderr, "benchmark name is not specified\n"); 338 + exit(1); 339 + } 340 + 341 + for (i = 0; i < ARRAY_SIZE(benchs); i++) { 342 + if (strcmp(benchs[i]->name, env.bench_name) == 0) { 343 + bench = benchs[i]; 344 + break; 345 + } 346 + } 347 + if (!bench) { 348 + fprintf(stderr, "benchmark '%s' not found\n", env.bench_name); 349 + exit(1); 350 + } 351 + 352 + printf("Setting up benchmark '%s'...\n", bench->name); 353 + 354 + state.producers = calloc(env.producer_cnt, sizeof(*state.producers)); 355 + state.consumers = calloc(env.consumer_cnt, sizeof(*state.consumers)); 356 + state.results = calloc(env.duration_sec + env.warmup_sec + 2, 357 + sizeof(*state.results)); 358 + if (!state.producers || !state.consumers || !state.results) 359 + exit(1); 360 + 361 + if (bench->validate) 362 + bench->validate(); 363 + if (bench->setup) 364 + bench->setup(); 365 + 366 + for (i = 0; i < env.consumer_cnt; i++) { 367 + err = pthread_create(&state.consumers[i], NULL, 368 + bench->consumer_thread, (void *)(long)i); 369 + if (err) { 370 + fprintf(stderr, "failed to create consumer thread #%d: %d\n", 371 + i, -errno); 372 + exit(1); 373 + } 374 + if (env.affinity) 375 + set_thread_affinity(state.consumers[i], 376 + next_cpu(&env.cons_cpus)); 377 + } 378 + 379 + /* unless explicit producer CPU list is specified, continue after 380 + * last consumer CPU 381 + */ 382 + if (!env.prod_cpus.cpus) 383 + env.prod_cpus.next_cpu = env.cons_cpus.next_cpu; 384 + 385 + for (i = 0; i < env.producer_cnt; i++) { 386 + err = pthread_create(&state.producers[i], NULL, 387 + bench->producer_thread, (void *)(long)i); 388 + if (err) { 389 + fprintf(stderr, "failed to create producer thread #%d: %d\n", 390 + i, -errno); 391 + exit(1); 392 + } 393 + if (env.affinity) 394 + set_thread_affinity(state.producers[i], 395 + next_cpu(&env.prod_cpus)); 396 + } 397 + 398 + printf("Benchmark '%s' started.\n", bench->name); 399 + } 400 + 401 + static pthread_mutex_t 
bench_done_mtx = PTHREAD_MUTEX_INITIALIZER; 402 + static pthread_cond_t bench_done = PTHREAD_COND_INITIALIZER; 403 + 404 + static void collect_measurements(long delta_ns) { 405 + int iter = state.res_cnt++; 406 + struct bench_res *res = &state.results[iter]; 407 + 408 + bench->measure(res); 409 + 410 + if (bench->report_progress) 411 + bench->report_progress(iter, res, delta_ns); 412 + 413 + if (iter == env.duration_sec + env.warmup_sec) { 414 + pthread_mutex_lock(&bench_done_mtx); 415 + pthread_cond_signal(&bench_done); 416 + pthread_mutex_unlock(&bench_done_mtx); 417 + } 418 + } 419 + 420 + int main(int argc, char **argv) 421 + { 422 + parse_cmdline_args(argc, argv); 423 + 424 + if (env.list) { 425 + int i; 426 + 427 + printf("Available benchmarks:\n"); 428 + for (i = 0; i < ARRAY_SIZE(benchs); i++) { 429 + printf("- %s\n", benchs[i]->name); 430 + } 431 + return 0; 432 + } 433 + 434 + setup_benchmark(); 435 + 436 + setup_timer(); 437 + 438 + pthread_mutex_lock(&bench_done_mtx); 439 + pthread_cond_wait(&bench_done, &bench_done_mtx); 440 + pthread_mutex_unlock(&bench_done_mtx); 441 + 442 + if (bench->report_final) 443 + /* skip first sample */ 444 + bench->report_final(state.results + env.warmup_sec, 445 + state.res_cnt - env.warmup_sec); 446 + 447 + return 0; 448 + } 449 +
+81
tools/testing/selftests/bpf/bench.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 2 + #pragma once 3 + #include <stdlib.h> 4 + #include <stdbool.h> 5 + #include <linux/err.h> 6 + #include <errno.h> 7 + #include <unistd.h> 8 + #include <bpf/bpf.h> 9 + #include <bpf/libbpf.h> 10 + #include <math.h> 11 + #include <time.h> 12 + #include <sys/syscall.h> 13 + 14 + struct cpu_set { 15 + bool *cpus; 16 + int cpus_len; 17 + int next_cpu; 18 + }; 19 + 20 + struct env { 21 + char *bench_name; 22 + int duration_sec; 23 + int warmup_sec; 24 + bool verbose; 25 + bool list; 26 + bool affinity; 27 + int consumer_cnt; 28 + int producer_cnt; 29 + struct cpu_set prod_cpus; 30 + struct cpu_set cons_cpus; 31 + }; 32 + 33 + struct bench_res { 34 + long hits; 35 + long drops; 36 + }; 37 + 38 + struct bench { 39 + const char *name; 40 + void (*validate)(); 41 + void (*setup)(); 42 + void *(*producer_thread)(void *ctx); 43 + void *(*consumer_thread)(void *ctx); 44 + void (*measure)(struct bench_res* res); 45 + void (*report_progress)(int iter, struct bench_res* res, long delta_ns); 46 + void (*report_final)(struct bench_res res[], int res_cnt); 47 + }; 48 + 49 + struct counter { 50 + long value; 51 + } __attribute__((aligned(128))); 52 + 53 + extern struct env env; 54 + extern const struct bench *bench; 55 + 56 + void setup_libbpf(); 57 + void hits_drops_report_progress(int iter, struct bench_res *res, long delta_ns); 58 + void hits_drops_report_final(struct bench_res res[], int res_cnt); 59 + 60 + static inline __u64 get_time_ns() { 61 + struct timespec t; 62 + 63 + clock_gettime(CLOCK_MONOTONIC, &t); 64 + 65 + return (u64)t.tv_sec * 1000000000 + t.tv_nsec; 66 + } 67 + 68 + static inline void atomic_inc(long *value) 69 + { 70 + (void)__atomic_add_fetch(value, 1, __ATOMIC_RELAXED); 71 + } 72 + 73 + static inline void atomic_add(long *value, long n) 74 + { 75 + (void)__atomic_add_fetch(value, n, __ATOMIC_RELAXED); 76 + } 77 + 78 + static inline long atomic_swap(long *value, long n) 79 + { 80 + return 
__atomic_exchange_n(value, n, __ATOMIC_RELAXED); 81 + }
+91
tools/testing/selftests/bpf/benchs/bench_count.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* Copyright (c) 2020 Facebook */ 3 + #include "bench.h" 4 + 5 + /* COUNT-GLOBAL benchmark */ 6 + 7 + static struct count_global_ctx { 8 + struct counter hits; 9 + } count_global_ctx; 10 + 11 + static void *count_global_producer(void *input) 12 + { 13 + struct count_global_ctx *ctx = &count_global_ctx; 14 + 15 + while (true) { 16 + atomic_inc(&ctx->hits.value); 17 + } 18 + return NULL; 19 + } 20 + 21 + static void *count_global_consumer(void *input) 22 + { 23 + return NULL; 24 + } 25 + 26 + static void count_global_measure(struct bench_res *res) 27 + { 28 + struct count_global_ctx *ctx = &count_global_ctx; 29 + 30 + res->hits = atomic_swap(&ctx->hits.value, 0); 31 + } 32 + 33 + /* COUNT-local benchmark */ 34 + 35 + static struct count_local_ctx { 36 + struct counter *hits; 37 + } count_local_ctx; 38 + 39 + static void count_local_setup() 40 + { 41 + struct count_local_ctx *ctx = &count_local_ctx; 42 + 43 + ctx->hits = calloc(env.consumer_cnt, sizeof(*ctx->hits)); 44 + if (!ctx->hits) 45 + exit(1); 46 + } 47 + 48 + static void *count_local_producer(void *input) 49 + { 50 + struct count_local_ctx *ctx = &count_local_ctx; 51 + int idx = (long)input; 52 + 53 + while (true) { 54 + atomic_inc(&ctx->hits[idx].value); 55 + } 56 + return NULL; 57 + } 58 + 59 + static void *count_local_consumer(void *input) 60 + { 61 + return NULL; 62 + } 63 + 64 + static void count_local_measure(struct bench_res *res) 65 + { 66 + struct count_local_ctx *ctx = &count_local_ctx; 67 + int i; 68 + 69 + for (i = 0; i < env.producer_cnt; i++) { 70 + res->hits += atomic_swap(&ctx->hits[i].value, 0); 71 + } 72 + } 73 + 74 + const struct bench bench_count_global = { 75 + .name = "count-global", 76 + .producer_thread = count_global_producer, 77 + .consumer_thread = count_global_consumer, 78 + .measure = count_global_measure, 79 + .report_progress = hits_drops_report_progress, 80 + .report_final = hits_drops_report_final, 81 + }; 82 + 83 + const struct 
bench bench_count_local = { 84 + .name = "count-local", 85 + .setup = count_local_setup, 86 + .producer_thread = count_local_producer, 87 + .consumer_thread = count_local_consumer, 88 + .measure = count_local_measure, 89 + .report_progress = hits_drops_report_progress, 90 + .report_final = hits_drops_report_final, 91 + };
+195
tools/testing/selftests/bpf/benchs/bench_rename.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* Copyright (c) 2020 Facebook */ 3 + #include <fcntl.h> 4 + #include "bench.h" 5 + #include "test_overhead.skel.h" 6 + 7 + /* BPF triggering benchmarks */ 8 + static struct ctx { 9 + struct test_overhead *skel; 10 + struct counter hits; 11 + int fd; 12 + } ctx; 13 + 14 + static void validate() 15 + { 16 + if (env.producer_cnt != 1) { 17 + fprintf(stderr, "benchmark doesn't support multi-producer!\n"); 18 + exit(1); 19 + } 20 + if (env.consumer_cnt != 1) { 21 + fprintf(stderr, "benchmark doesn't support multi-consumer!\n"); 22 + exit(1); 23 + } 24 + } 25 + 26 + static void *producer(void *input) 27 + { 28 + char buf[] = "test_overhead"; 29 + int err; 30 + 31 + while (true) { 32 + err = write(ctx.fd, buf, sizeof(buf)); 33 + if (err < 0) { 34 + fprintf(stderr, "write failed\n"); 35 + exit(1); 36 + } 37 + atomic_inc(&ctx.hits.value); 38 + } 39 + } 40 + 41 + static void measure(struct bench_res *res) 42 + { 43 + res->hits = atomic_swap(&ctx.hits.value, 0); 44 + } 45 + 46 + static void setup_ctx() 47 + { 48 + setup_libbpf(); 49 + 50 + ctx.skel = test_overhead__open_and_load(); 51 + if (!ctx.skel) { 52 + fprintf(stderr, "failed to open skeleton\n"); 53 + exit(1); 54 + } 55 + 56 + ctx.fd = open("/proc/self/comm", O_WRONLY|O_TRUNC); 57 + if (ctx.fd < 0) { 58 + fprintf(stderr, "failed to open /proc/self/comm: %d\n", -errno); 59 + exit(1); 60 + } 61 + } 62 + 63 + static void attach_bpf(struct bpf_program *prog) 64 + { 65 + struct bpf_link *link; 66 + 67 + link = bpf_program__attach(prog); 68 + if (IS_ERR(link)) { 69 + fprintf(stderr, "failed to attach program!\n"); 70 + exit(1); 71 + } 72 + } 73 + 74 + static void setup_base() 75 + { 76 + setup_ctx(); 77 + } 78 + 79 + static void setup_kprobe() 80 + { 81 + setup_ctx(); 82 + attach_bpf(ctx.skel->progs.prog1); 83 + } 84 + 85 + static void setup_kretprobe() 86 + { 87 + setup_ctx(); 88 + attach_bpf(ctx.skel->progs.prog2); 89 + } 90 + 91 + static void setup_rawtp() 92 + { 93 + 
setup_ctx(); 94 + attach_bpf(ctx.skel->progs.prog3); 95 + } 96 + 97 + static void setup_fentry() 98 + { 99 + setup_ctx(); 100 + attach_bpf(ctx.skel->progs.prog4); 101 + } 102 + 103 + static void setup_fexit() 104 + { 105 + setup_ctx(); 106 + attach_bpf(ctx.skel->progs.prog5); 107 + } 108 + 109 + static void setup_fmodret() 110 + { 111 + setup_ctx(); 112 + attach_bpf(ctx.skel->progs.prog6); 113 + } 114 + 115 + static void *consumer(void *input) 116 + { 117 + return NULL; 118 + } 119 + 120 + const struct bench bench_rename_base = { 121 + .name = "rename-base", 122 + .validate = validate, 123 + .setup = setup_base, 124 + .producer_thread = producer, 125 + .consumer_thread = consumer, 126 + .measure = measure, 127 + .report_progress = hits_drops_report_progress, 128 + .report_final = hits_drops_report_final, 129 + }; 130 + 131 + const struct bench bench_rename_kprobe = { 132 + .name = "rename-kprobe", 133 + .validate = validate, 134 + .setup = setup_kprobe, 135 + .producer_thread = producer, 136 + .consumer_thread = consumer, 137 + .measure = measure, 138 + .report_progress = hits_drops_report_progress, 139 + .report_final = hits_drops_report_final, 140 + }; 141 + 142 + const struct bench bench_rename_kretprobe = { 143 + .name = "rename-kretprobe", 144 + .validate = validate, 145 + .setup = setup_kretprobe, 146 + .producer_thread = producer, 147 + .consumer_thread = consumer, 148 + .measure = measure, 149 + .report_progress = hits_drops_report_progress, 150 + .report_final = hits_drops_report_final, 151 + }; 152 + 153 + const struct bench bench_rename_rawtp = { 154 + .name = "rename-rawtp", 155 + .validate = validate, 156 + .setup = setup_rawtp, 157 + .producer_thread = producer, 158 + .consumer_thread = consumer, 159 + .measure = measure, 160 + .report_progress = hits_drops_report_progress, 161 + .report_final = hits_drops_report_final, 162 + }; 163 + 164 + const struct bench bench_rename_fentry = { 165 + .name = "rename-fentry", 166 + .validate = validate, 167 + 
.setup = setup_fentry, 168 + .producer_thread = producer, 169 + .consumer_thread = consumer, 170 + .measure = measure, 171 + .report_progress = hits_drops_report_progress, 172 + .report_final = hits_drops_report_final, 173 + }; 174 + 175 + const struct bench bench_rename_fexit = { 176 + .name = "rename-fexit", 177 + .validate = validate, 178 + .setup = setup_fexit, 179 + .producer_thread = producer, 180 + .consumer_thread = consumer, 181 + .measure = measure, 182 + .report_progress = hits_drops_report_progress, 183 + .report_final = hits_drops_report_final, 184 + }; 185 + 186 + const struct bench bench_rename_fmodret = { 187 + .name = "rename-fmodret", 188 + .validate = validate, 189 + .setup = setup_fmodret, 190 + .producer_thread = producer, 191 + .consumer_thread = consumer, 192 + .measure = measure, 193 + .report_progress = hits_drops_report_progress, 194 + .report_final = hits_drops_report_final, 195 + };
+167
tools/testing/selftests/bpf/benchs/bench_trigger.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* Copyright (c) 2020 Facebook */ 3 + #include "bench.h" 4 + #include "trigger_bench.skel.h" 5 + 6 + /* BPF triggering benchmarks */ 7 + static struct trigger_ctx { 8 + struct trigger_bench *skel; 9 + } ctx; 10 + 11 + static struct counter base_hits; 12 + 13 + static void trigger_validate() 14 + { 15 + if (env.consumer_cnt != 1) { 16 + fprintf(stderr, "benchmark doesn't support multi-consumer!\n"); 17 + exit(1); 18 + } 19 + } 20 + 21 + static void *trigger_base_producer(void *input) 22 + { 23 + while (true) { 24 + (void)syscall(__NR_getpgid); 25 + atomic_inc(&base_hits.value); 26 + } 27 + return NULL; 28 + } 29 + 30 + static void trigger_base_measure(struct bench_res *res) 31 + { 32 + res->hits = atomic_swap(&base_hits.value, 0); 33 + } 34 + 35 + static void *trigger_producer(void *input) 36 + { 37 + while (true) 38 + (void)syscall(__NR_getpgid); 39 + return NULL; 40 + } 41 + 42 + static void trigger_measure(struct bench_res *res) 43 + { 44 + res->hits = atomic_swap(&ctx.skel->bss->hits, 0); 45 + } 46 + 47 + static void setup_ctx() 48 + { 49 + setup_libbpf(); 50 + 51 + ctx.skel = trigger_bench__open_and_load(); 52 + if (!ctx.skel) { 53 + fprintf(stderr, "failed to open skeleton\n"); 54 + exit(1); 55 + } 56 + } 57 + 58 + static void attach_bpf(struct bpf_program *prog) 59 + { 60 + struct bpf_link *link; 61 + 62 + link = bpf_program__attach(prog); 63 + if (IS_ERR(link)) { 64 + fprintf(stderr, "failed to attach program!\n"); 65 + exit(1); 66 + } 67 + } 68 + 69 + static void trigger_tp_setup() 70 + { 71 + setup_ctx(); 72 + attach_bpf(ctx.skel->progs.bench_trigger_tp); 73 + } 74 + 75 + static void trigger_rawtp_setup() 76 + { 77 + setup_ctx(); 78 + attach_bpf(ctx.skel->progs.bench_trigger_raw_tp); 79 + } 80 + 81 + static void trigger_kprobe_setup() 82 + { 83 + setup_ctx(); 84 + attach_bpf(ctx.skel->progs.bench_trigger_kprobe); 85 + } 86 + 87 + static void trigger_fentry_setup() 88 + { 89 + setup_ctx(); 90 + 
attach_bpf(ctx.skel->progs.bench_trigger_fentry); 91 + } 92 + 93 + static void trigger_fmodret_setup() 94 + { 95 + setup_ctx(); 96 + attach_bpf(ctx.skel->progs.bench_trigger_fmodret); 97 + } 98 + 99 + static void *trigger_consumer(void *input) 100 + { 101 + return NULL; 102 + } 103 + 104 + const struct bench bench_trig_base = { 105 + .name = "trig-base", 106 + .validate = trigger_validate, 107 + .producer_thread = trigger_base_producer, 108 + .consumer_thread = trigger_consumer, 109 + .measure = trigger_base_measure, 110 + .report_progress = hits_drops_report_progress, 111 + .report_final = hits_drops_report_final, 112 + }; 113 + 114 + const struct bench bench_trig_tp = { 115 + .name = "trig-tp", 116 + .validate = trigger_validate, 117 + .setup = trigger_tp_setup, 118 + .producer_thread = trigger_producer, 119 + .consumer_thread = trigger_consumer, 120 + .measure = trigger_measure, 121 + .report_progress = hits_drops_report_progress, 122 + .report_final = hits_drops_report_final, 123 + }; 124 + 125 + const struct bench bench_trig_rawtp = { 126 + .name = "trig-rawtp", 127 + .validate = trigger_validate, 128 + .setup = trigger_rawtp_setup, 129 + .producer_thread = trigger_producer, 130 + .consumer_thread = trigger_consumer, 131 + .measure = trigger_measure, 132 + .report_progress = hits_drops_report_progress, 133 + .report_final = hits_drops_report_final, 134 + }; 135 + 136 + const struct bench bench_trig_kprobe = { 137 + .name = "trig-kprobe", 138 + .validate = trigger_validate, 139 + .setup = trigger_kprobe_setup, 140 + .producer_thread = trigger_producer, 141 + .consumer_thread = trigger_consumer, 142 + .measure = trigger_measure, 143 + .report_progress = hits_drops_report_progress, 144 + .report_final = hits_drops_report_final, 145 + }; 146 + 147 + const struct bench bench_trig_fentry = { 148 + .name = "trig-fentry", 149 + .validate = trigger_validate, 150 + .setup = trigger_fentry_setup, 151 + .producer_thread = trigger_producer, 152 + .consumer_thread = 
trigger_consumer, 153 + .measure = trigger_measure, 154 + .report_progress = hits_drops_report_progress, 155 + .report_final = hits_drops_report_final, 156 + }; 157 + 158 + const struct bench bench_trig_fmodret = { 159 + .name = "trig-fmodret", 160 + .validate = trigger_validate, 161 + .setup = trigger_fmodret_setup, 162 + .producer_thread = trigger_producer, 163 + .consumer_thread = trigger_consumer, 164 + .measure = trigger_measure, 165 + .report_progress = hits_drops_report_progress, 166 + .report_final = hits_drops_report_final, 167 + };
+9
tools/testing/selftests/bpf/benchs/run_bench_rename.sh
··· 1 + #!/bin/bash 2 + 3 + set -eufo pipefail 4 + 5 + for i in base kprobe kretprobe rawtp fentry fexit fmodret 6 + do 7 + summary=$(sudo ./bench -w2 -d5 -a rename-$i | tail -n1 | cut -d'(' -f1 | cut -d' ' -f3-) 8 + printf "%-10s: %s\n" $i "$summary" 9 + done
+9
tools/testing/selftests/bpf/benchs/run_bench_trigger.sh
··· 1 + #!/bin/bash 2 + 3 + set -eufo pipefail 4 + 5 + for i in base tp rawtp kprobe fentry fmodret 6 + do 7 + summary=$(sudo ./bench -w2 -d5 -a trig-$i | tail -n1 | cut -d'(' -f1 | cut -d' ' -f3-) 8 + printf "%-10s: %s\n" $i "$summary" 9 + done
+158
tools/testing/selftests/bpf/network_helpers.c
··· 1 + // SPDX-License-Identifier: GPL-2.0-only 2 + #include <errno.h> 3 + #include <stdbool.h> 4 + #include <stdio.h> 5 + #include <string.h> 6 + #include <unistd.h> 7 + 8 + #include <sys/epoll.h> 9 + 10 + #include <linux/err.h> 11 + #include <linux/in.h> 12 + #include <linux/in6.h> 13 + 14 + #include "bpf_util.h" 15 + #include "network_helpers.h" 16 + 17 + #define clean_errno() (errno == 0 ? "None" : strerror(errno)) 18 + #define log_err(MSG, ...) fprintf(stderr, "(%s:%d: errno: %s) " MSG "\n", \ 19 + __FILE__, __LINE__, clean_errno(), ##__VA_ARGS__) 20 + 21 + struct ipv4_packet pkt_v4 = { 22 + .eth.h_proto = __bpf_constant_htons(ETH_P_IP), 23 + .iph.ihl = 5, 24 + .iph.protocol = IPPROTO_TCP, 25 + .iph.tot_len = __bpf_constant_htons(MAGIC_BYTES), 26 + .tcp.urg_ptr = 123, 27 + .tcp.doff = 5, 28 + }; 29 + 30 + struct ipv6_packet pkt_v6 = { 31 + .eth.h_proto = __bpf_constant_htons(ETH_P_IPV6), 32 + .iph.nexthdr = IPPROTO_TCP, 33 + .iph.payload_len = __bpf_constant_htons(MAGIC_BYTES), 34 + .tcp.urg_ptr = 123, 35 + .tcp.doff = 5, 36 + }; 37 + 38 + int start_server(int family, int type) 39 + { 40 + struct sockaddr_storage addr = {}; 41 + socklen_t len; 42 + int fd; 43 + 44 + if (family == AF_INET) { 45 + struct sockaddr_in *sin = (void *)&addr; 46 + 47 + sin->sin_family = AF_INET; 48 + len = sizeof(*sin); 49 + } else { 50 + struct sockaddr_in6 *sin6 = (void *)&addr; 51 + 52 + sin6->sin6_family = AF_INET6; 53 + len = sizeof(*sin6); 54 + } 55 + 56 + fd = socket(family, type | SOCK_NONBLOCK, 0); 57 + if (fd < 0) { 58 + log_err("Failed to create server socket"); 59 + return -1; 60 + } 61 + 62 + if (bind(fd, (const struct sockaddr *)&addr, len) < 0) { 63 + log_err("Failed to bind socket"); 64 + close(fd); 65 + return -1; 66 + } 67 + 68 + if (type == SOCK_STREAM) { 69 + if (listen(fd, 1) < 0) { 70 + log_err("Failed to listed on socket"); 71 + close(fd); 72 + return -1; 73 + } 74 + } 75 + 76 + return fd; 77 + } 78 + 79 + static const struct timeval timeo_sec = { .tv_sec = 3 
}; 80 + static const size_t timeo_optlen = sizeof(timeo_sec); 81 + 82 + int connect_to_fd(int family, int type, int server_fd) 83 + { 84 + int fd, save_errno; 85 + 86 + fd = socket(family, type, 0); 87 + if (fd < 0) { 88 + log_err("Failed to create client socket"); 89 + return -1; 90 + } 91 + 92 + if (connect_fd_to_fd(fd, server_fd) < 0 && errno != EINPROGRESS) { 93 + save_errno = errno; 94 + close(fd); 95 + errno = save_errno; 96 + return -1; 97 + } 98 + 99 + return fd; 100 + } 101 + 102 + int connect_fd_to_fd(int client_fd, int server_fd) 103 + { 104 + struct sockaddr_storage addr; 105 + socklen_t len = sizeof(addr); 106 + int save_errno; 107 + 108 + if (setsockopt(client_fd, SOL_SOCKET, SO_RCVTIMEO, &timeo_sec, 109 + timeo_optlen)) { 110 + log_err("Failed to set SO_RCVTIMEO"); 111 + return -1; 112 + } 113 + 114 + if (getsockname(server_fd, (struct sockaddr *)&addr, &len)) { 115 + log_err("Failed to get server addr"); 116 + return -1; 117 + } 118 + 119 + if (connect(client_fd, (const struct sockaddr *)&addr, len) < 0) { 120 + if (errno != EINPROGRESS) { 121 + save_errno = errno; 122 + log_err("Failed to connect to server"); 123 + errno = save_errno; 124 + } 125 + return -1; 126 + } 127 + 128 + return 0; 129 + } 130 + 131 + int connect_wait(int fd) 132 + { 133 + struct epoll_event ev = {}, events[2]; 134 + int timeout_ms = 1000; 135 + int efd, nfd; 136 + 137 + efd = epoll_create1(EPOLL_CLOEXEC); 138 + if (efd < 0) { 139 + log_err("Failed to open epoll fd"); 140 + return -1; 141 + } 142 + 143 + ev.events = EPOLLRDHUP | EPOLLOUT; 144 + ev.data.fd = fd; 145 + 146 + if (epoll_ctl(efd, EPOLL_CTL_ADD, fd, &ev) < 0) { 147 + log_err("Failed to register fd=%d on epoll fd=%d", fd, efd); 148 + close(efd); 149 + return -1; 150 + } 151 + 152 + nfd = epoll_wait(efd, events, ARRAY_SIZE(events), timeout_ms); 153 + if (nfd < 0) 154 + log_err("Failed to wait for I/O event on epoll fd=%d", efd); 155 + 156 + close(efd); 157 + return nfd; 158 + }
+41
tools/testing/selftests/bpf/network_helpers.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 2 + #ifndef __NETWORK_HELPERS_H 3 + #define __NETWORK_HELPERS_H 4 + #include <sys/socket.h> 5 + #include <sys/types.h> 6 + #include <linux/types.h> 7 + typedef __u16 __sum16; 8 + #include <linux/if_ether.h> 9 + #include <linux/if_packet.h> 10 + #include <linux/ip.h> 11 + #include <linux/ipv6.h> 12 + #include <netinet/tcp.h> 13 + #include <bpf/bpf_endian.h> 14 + 15 + #define MAGIC_VAL 0x1234 16 + #define NUM_ITER 100000 17 + #define VIP_NUM 5 18 + #define MAGIC_BYTES 123 19 + 20 + /* ipv4 test vector */ 21 + struct ipv4_packet { 22 + struct ethhdr eth; 23 + struct iphdr iph; 24 + struct tcphdr tcp; 25 + } __packed; 26 + extern struct ipv4_packet pkt_v4; 27 + 28 + /* ipv6 test vector */ 29 + struct ipv6_packet { 30 + struct ethhdr eth; 31 + struct ipv6hdr iph; 32 + struct tcphdr tcp; 33 + } __packed; 34 + extern struct ipv6_packet pkt_v6; 35 + 36 + int start_server(int family, int type); 37 + int connect_to_fd(int family, int type, int server_fd); 38 + int connect_fd_to_fd(int client_fd, int server_fd); 39 + int connect_wait(int client_fd); 40 + 41 + #endif
+409
tools/testing/selftests/bpf/prog_tests/bpf_iter.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* Copyright (c) 2020 Facebook */ 3 + #include <test_progs.h> 4 + #include "bpf_iter_ipv6_route.skel.h" 5 + #include "bpf_iter_netlink.skel.h" 6 + #include "bpf_iter_bpf_map.skel.h" 7 + #include "bpf_iter_task.skel.h" 8 + #include "bpf_iter_task_file.skel.h" 9 + #include "bpf_iter_test_kern1.skel.h" 10 + #include "bpf_iter_test_kern2.skel.h" 11 + #include "bpf_iter_test_kern3.skel.h" 12 + #include "bpf_iter_test_kern4.skel.h" 13 + 14 + static int duration; 15 + 16 + static void test_btf_id_or_null(void) 17 + { 18 + struct bpf_iter_test_kern3 *skel; 19 + 20 + skel = bpf_iter_test_kern3__open_and_load(); 21 + if (CHECK(skel, "bpf_iter_test_kern3__open_and_load", 22 + "skeleton open_and_load unexpectedly succeeded\n")) { 23 + bpf_iter_test_kern3__destroy(skel); 24 + return; 25 + } 26 + } 27 + 28 + static void do_dummy_read(struct bpf_program *prog) 29 + { 30 + struct bpf_link *link; 31 + char buf[16] = {}; 32 + int iter_fd, len; 33 + 34 + link = bpf_program__attach_iter(prog, NULL); 35 + if (CHECK(IS_ERR(link), "attach_iter", "attach_iter failed\n")) 36 + return; 37 + 38 + iter_fd = bpf_iter_create(bpf_link__fd(link)); 39 + if (CHECK(iter_fd < 0, "create_iter", "create_iter failed\n")) 40 + goto free_link; 41 + 42 + /* not check contents, but ensure read() ends without error */ 43 + while ((len = read(iter_fd, buf, sizeof(buf))) > 0) 44 + ; 45 + CHECK(len < 0, "read", "read failed: %s\n", strerror(errno)); 46 + 47 + close(iter_fd); 48 + 49 + free_link: 50 + bpf_link__destroy(link); 51 + } 52 + 53 + static void test_ipv6_route(void) 54 + { 55 + struct bpf_iter_ipv6_route *skel; 56 + 57 + skel = bpf_iter_ipv6_route__open_and_load(); 58 + if (CHECK(!skel, "bpf_iter_ipv6_route__open_and_load", 59 + "skeleton open_and_load failed\n")) 60 + return; 61 + 62 + do_dummy_read(skel->progs.dump_ipv6_route); 63 + 64 + bpf_iter_ipv6_route__destroy(skel); 65 + } 66 + 67 + static void test_netlink(void) 68 + { 69 + struct 
bpf_iter_netlink *skel; 70 + 71 + skel = bpf_iter_netlink__open_and_load(); 72 + if (CHECK(!skel, "bpf_iter_netlink__open_and_load", 73 + "skeleton open_and_load failed\n")) 74 + return; 75 + 76 + do_dummy_read(skel->progs.dump_netlink); 77 + 78 + bpf_iter_netlink__destroy(skel); 79 + } 80 + 81 + static void test_bpf_map(void) 82 + { 83 + struct bpf_iter_bpf_map *skel; 84 + 85 + skel = bpf_iter_bpf_map__open_and_load(); 86 + if (CHECK(!skel, "bpf_iter_bpf_map__open_and_load", 87 + "skeleton open_and_load failed\n")) 88 + return; 89 + 90 + do_dummy_read(skel->progs.dump_bpf_map); 91 + 92 + bpf_iter_bpf_map__destroy(skel); 93 + } 94 + 95 + static void test_task(void) 96 + { 97 + struct bpf_iter_task *skel; 98 + 99 + skel = bpf_iter_task__open_and_load(); 100 + if (CHECK(!skel, "bpf_iter_task__open_and_load", 101 + "skeleton open_and_load failed\n")) 102 + return; 103 + 104 + do_dummy_read(skel->progs.dump_task); 105 + 106 + bpf_iter_task__destroy(skel); 107 + } 108 + 109 + static void test_task_file(void) 110 + { 111 + struct bpf_iter_task_file *skel; 112 + 113 + skel = bpf_iter_task_file__open_and_load(); 114 + if (CHECK(!skel, "bpf_iter_task_file__open_and_load", 115 + "skeleton open_and_load failed\n")) 116 + return; 117 + 118 + do_dummy_read(skel->progs.dump_task_file); 119 + 120 + bpf_iter_task_file__destroy(skel); 121 + } 122 + 123 + /* The expected string is less than 16 bytes */ 124 + static int do_read_with_fd(int iter_fd, const char *expected, 125 + bool read_one_char) 126 + { 127 + int err = -1, len, read_buf_len, start; 128 + char buf[16] = {}; 129 + 130 + read_buf_len = read_one_char ? 1 : 16; 131 + start = 0; 132 + while ((len = read(iter_fd, buf + start, read_buf_len)) > 0) { 133 + start += len; 134 + if (CHECK(start >= 16, "read", "read len %d\n", len)) 135 + return -1; 136 + read_buf_len = read_one_char ? 
1 : 16 - start; 137 + } 138 + if (CHECK(len < 0, "read", "read failed: %s\n", strerror(errno))) 139 + return -1; 140 + 141 + err = strcmp(buf, expected); 142 + if (CHECK(err, "read", "incorrect read result: buf %s, expected %s\n", 143 + buf, expected)) 144 + return -1; 145 + 146 + return 0; 147 + } 148 + 149 + static void test_anon_iter(bool read_one_char) 150 + { 151 + struct bpf_iter_test_kern1 *skel; 152 + struct bpf_link *link; 153 + int iter_fd, err; 154 + 155 + skel = bpf_iter_test_kern1__open_and_load(); 156 + if (CHECK(!skel, "bpf_iter_test_kern1__open_and_load", 157 + "skeleton open_and_load failed\n")) 158 + return; 159 + 160 + err = bpf_iter_test_kern1__attach(skel); 161 + if (CHECK(err, "bpf_iter_test_kern1__attach", 162 + "skeleton attach failed\n")) { 163 + goto out; 164 + } 165 + 166 + link = skel->links.dump_task; 167 + iter_fd = bpf_iter_create(bpf_link__fd(link)); 168 + if (CHECK(iter_fd < 0, "create_iter", "create_iter failed\n")) 169 + goto out; 170 + 171 + do_read_with_fd(iter_fd, "abcd", read_one_char); 172 + close(iter_fd); 173 + 174 + out: 175 + bpf_iter_test_kern1__destroy(skel); 176 + } 177 + 178 + static int do_read(const char *path, const char *expected) 179 + { 180 + int err, iter_fd; 181 + 182 + iter_fd = open(path, O_RDONLY); 183 + if (CHECK(iter_fd < 0, "open", "open %s failed: %s\n", 184 + path, strerror(errno))) 185 + return -1; 186 + 187 + err = do_read_with_fd(iter_fd, expected, false); 188 + close(iter_fd); 189 + return err; 190 + } 191 + 192 + static void test_file_iter(void) 193 + { 194 + const char *path = "/sys/fs/bpf/bpf_iter_test1"; 195 + struct bpf_iter_test_kern1 *skel1; 196 + struct bpf_iter_test_kern2 *skel2; 197 + struct bpf_link *link; 198 + int err; 199 + 200 + skel1 = bpf_iter_test_kern1__open_and_load(); 201 + if (CHECK(!skel1, "bpf_iter_test_kern1__open_and_load", 202 + "skeleton open_and_load failed\n")) 203 + return; 204 + 205 + link = bpf_program__attach_iter(skel1->progs.dump_task, NULL); 206 + if 
(CHECK(IS_ERR(link), "attach_iter", "attach_iter failed\n")) 207 + goto out; 208 + 209 + /* unlink this path if it exists. */ 210 + unlink(path); 211 + 212 + err = bpf_link__pin(link, path); 213 + if (CHECK(err, "pin_iter", "pin_iter to %s failed: %d\n", path, err)) 214 + goto free_link; 215 + 216 + err = do_read(path, "abcd"); 217 + if (err) 218 + goto unlink_path; 219 + 220 + /* file based iterator seems working fine. Let us a link update 221 + * of the underlying link and `cat` the iterator again, its content 222 + * should change. 223 + */ 224 + skel2 = bpf_iter_test_kern2__open_and_load(); 225 + if (CHECK(!skel2, "bpf_iter_test_kern2__open_and_load", 226 + "skeleton open_and_load failed\n")) 227 + goto unlink_path; 228 + 229 + err = bpf_link__update_program(link, skel2->progs.dump_task); 230 + if (CHECK(err, "update_prog", "update_prog failed\n")) 231 + goto destroy_skel2; 232 + 233 + do_read(path, "ABCD"); 234 + 235 + destroy_skel2: 236 + bpf_iter_test_kern2__destroy(skel2); 237 + unlink_path: 238 + unlink(path); 239 + free_link: 240 + bpf_link__destroy(link); 241 + out: 242 + bpf_iter_test_kern1__destroy(skel1); 243 + } 244 + 245 + static void test_overflow(bool test_e2big_overflow, bool ret1) 246 + { 247 + __u32 map_info_len, total_read_len, expected_read_len; 248 + int err, iter_fd, map1_fd, map2_fd, len; 249 + struct bpf_map_info map_info = {}; 250 + struct bpf_iter_test_kern4 *skel; 251 + struct bpf_link *link; 252 + __u32 page_size; 253 + char *buf; 254 + 255 + skel = bpf_iter_test_kern4__open(); 256 + if (CHECK(!skel, "bpf_iter_test_kern4__open", 257 + "skeleton open failed\n")) 258 + return; 259 + 260 + /* create two maps: bpf program will only do bpf_seq_write 261 + * for these two maps. The goal is one map output almost 262 + * fills seq_file buffer and then the other will trigger 263 + * overflow and needs restart. 
264 + */ 265 + map1_fd = bpf_create_map(BPF_MAP_TYPE_ARRAY, 4, 8, 1, 0); 266 + if (CHECK(map1_fd < 0, "bpf_create_map", 267 + "map_creation failed: %s\n", strerror(errno))) 268 + goto out; 269 + map2_fd = bpf_create_map(BPF_MAP_TYPE_ARRAY, 4, 8, 1, 0); 270 + if (CHECK(map2_fd < 0, "bpf_create_map", 271 + "map_creation failed: %s\n", strerror(errno))) 272 + goto free_map1; 273 + 274 + /* bpf_seq_printf kernel buffer is one page, so one map 275 + * bpf_seq_write will mostly fill it, and the other map 276 + * will partially fill and then trigger overflow and need 277 + * bpf_seq_read restart. 278 + */ 279 + page_size = sysconf(_SC_PAGE_SIZE); 280 + 281 + if (test_e2big_overflow) { 282 + skel->rodata->print_len = (page_size + 8) / 8; 283 + expected_read_len = 2 * (page_size + 8); 284 + } else if (!ret1) { 285 + skel->rodata->print_len = (page_size - 8) / 8; 286 + expected_read_len = 2 * (page_size - 8); 287 + } else { 288 + skel->rodata->print_len = 1; 289 + expected_read_len = 2 * 8; 290 + } 291 + skel->rodata->ret1 = ret1; 292 + 293 + if (CHECK(bpf_iter_test_kern4__load(skel), 294 + "bpf_iter_test_kern4__load", "skeleton load failed\n")) 295 + goto free_map2; 296 + 297 + /* setup filtering map_id in bpf program */ 298 + map_info_len = sizeof(map_info); 299 + err = bpf_obj_get_info_by_fd(map1_fd, &map_info, &map_info_len); 300 + if (CHECK(err, "get_map_info", "get map info failed: %s\n", 301 + strerror(errno))) 302 + goto free_map2; 303 + skel->bss->map1_id = map_info.id; 304 + 305 + err = bpf_obj_get_info_by_fd(map2_fd, &map_info, &map_info_len); 306 + if (CHECK(err, "get_map_info", "get map info failed: %s\n", 307 + strerror(errno))) 308 + goto free_map2; 309 + skel->bss->map2_id = map_info.id; 310 + 311 + link = bpf_program__attach_iter(skel->progs.dump_bpf_map, NULL); 312 + if (CHECK(IS_ERR(link), "attach_iter", "attach_iter failed\n")) 313 + goto free_map2; 314 + 315 + iter_fd = bpf_iter_create(bpf_link__fd(link)); 316 + if (CHECK(iter_fd < 0, "create_iter", 
"create_iter failed\n")) 317 + goto free_link; 318 + 319 + buf = malloc(expected_read_len); 320 + if (!buf) 321 + goto close_iter; 322 + 323 + /* do read */ 324 + total_read_len = 0; 325 + if (test_e2big_overflow) { 326 + while ((len = read(iter_fd, buf, expected_read_len)) > 0) 327 + total_read_len += len; 328 + 329 + CHECK(len != -1 || errno != E2BIG, "read", 330 + "expected ret -1, errno E2BIG, but get ret %d, error %s\n", 331 + len, strerror(errno)); 332 + goto free_buf; 333 + } else if (!ret1) { 334 + while ((len = read(iter_fd, buf, expected_read_len)) > 0) 335 + total_read_len += len; 336 + 337 + if (CHECK(len < 0, "read", "read failed: %s\n", 338 + strerror(errno))) 339 + goto free_buf; 340 + } else { 341 + do { 342 + len = read(iter_fd, buf, expected_read_len); 343 + if (len > 0) 344 + total_read_len += len; 345 + } while (len > 0 || len == -EAGAIN); 346 + 347 + if (CHECK(len < 0, "read", "read failed: %s\n", 348 + strerror(errno))) 349 + goto free_buf; 350 + } 351 + 352 + if (CHECK(total_read_len != expected_read_len, "read", 353 + "total len %u, expected len %u\n", total_read_len, 354 + expected_read_len)) 355 + goto free_buf; 356 + 357 + if (CHECK(skel->bss->map1_accessed != 1, "map1_accessed", 358 + "expected 1 actual %d\n", skel->bss->map1_accessed)) 359 + goto free_buf; 360 + 361 + if (CHECK(skel->bss->map2_accessed != 2, "map2_accessed", 362 + "expected 2 actual %d\n", skel->bss->map2_accessed)) 363 + goto free_buf; 364 + 365 + CHECK(skel->bss->map2_seqnum1 != skel->bss->map2_seqnum2, 366 + "map2_seqnum", "two different seqnum %lld %lld\n", 367 + skel->bss->map2_seqnum1, skel->bss->map2_seqnum2); 368 + 369 + free_buf: 370 + free(buf); 371 + close_iter: 372 + close(iter_fd); 373 + free_link: 374 + bpf_link__destroy(link); 375 + free_map2: 376 + close(map2_fd); 377 + free_map1: 378 + close(map1_fd); 379 + out: 380 + bpf_iter_test_kern4__destroy(skel); 381 + } 382 + 383 + void test_bpf_iter(void) 384 + { 385 + if (test__start_subtest("btf_id_or_null")) 
386 + test_btf_id_or_null(); 387 + if (test__start_subtest("ipv6_route")) 388 + test_ipv6_route(); 389 + if (test__start_subtest("netlink")) 390 + test_netlink(); 391 + if (test__start_subtest("bpf_map")) 392 + test_bpf_map(); 393 + if (test__start_subtest("task")) 394 + test_task(); 395 + if (test__start_subtest("task_file")) 396 + test_task_file(); 397 + if (test__start_subtest("anon")) 398 + test_anon_iter(false); 399 + if (test__start_subtest("anon-read-one-char")) 400 + test_anon_iter(true); 401 + if (test__start_subtest("file")) 402 + test_file_iter(); 403 + if (test__start_subtest("overflow")) 404 + test_overflow(false, false); 405 + if (test__start_subtest("overflow-e2big")) 406 + test_overflow(true, false); 407 + if (test__start_subtest("prog-ret-1")) 408 + test_overflow(false, true); 409 + }
+95
tools/testing/selftests/bpf/prog_tests/cgroup_skb_sk_lookup.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + // Copyright (c) 2020 Facebook 3 + 4 + #include <test_progs.h> 5 + 6 + #include "network_helpers.h" 7 + #include "cgroup_skb_sk_lookup_kern.skel.h" 8 + 9 + static void run_lookup_test(__u16 *g_serv_port, int out_sk) 10 + { 11 + int serv_sk = -1, in_sk = -1, serv_in_sk = -1, err; 12 + struct sockaddr_in6 addr = {}; 13 + socklen_t addr_len = sizeof(addr); 14 + __u32 duration = 0; 15 + 16 + serv_sk = start_server(AF_INET6, SOCK_STREAM); 17 + if (CHECK(serv_sk < 0, "start_server", "failed to start server\n")) 18 + return; 19 + 20 + err = getsockname(serv_sk, (struct sockaddr *)&addr, &addr_len); 21 + if (CHECK(err, "getsockname", "errno %d\n", errno)) 22 + goto cleanup; 23 + 24 + *g_serv_port = addr.sin6_port; 25 + 26 + /* Client outside of test cgroup should fail to connect by timeout. */ 27 + err = connect_fd_to_fd(out_sk, serv_sk); 28 + if (CHECK(!err || errno != EINPROGRESS, "connect_fd_to_fd", 29 + "unexpected result err %d errno %d\n", err, errno)) 30 + goto cleanup; 31 + 32 + err = connect_wait(out_sk); 33 + if (CHECK(err, "connect_wait", "unexpected result %d\n", err)) 34 + goto cleanup; 35 + 36 + /* Client inside test cgroup should connect just fine. 
*/ 37 + in_sk = connect_to_fd(AF_INET6, SOCK_STREAM, serv_sk); 38 + if (CHECK(in_sk < 0, "connect_to_fd", "errno %d\n", errno)) 39 + goto cleanup; 40 + 41 + serv_in_sk = accept(serv_sk, NULL, NULL); 42 + if (CHECK(serv_in_sk < 0, "accept", "errno %d\n", errno)) 43 + goto cleanup; 44 + 45 + cleanup: 46 + close(serv_in_sk); 47 + close(in_sk); 48 + close(serv_sk); 49 + } 50 + 51 + static void run_cgroup_bpf_test(const char *cg_path, int out_sk) 52 + { 53 + struct cgroup_skb_sk_lookup_kern *skel; 54 + struct bpf_link *link; 55 + __u32 duration = 0; 56 + int cgfd = -1; 57 + 58 + skel = cgroup_skb_sk_lookup_kern__open_and_load(); 59 + if (CHECK(!skel, "skel_open_load", "open_load failed\n")) 60 + return; 61 + 62 + cgfd = test__join_cgroup(cg_path); 63 + if (CHECK(cgfd < 0, "cgroup_join", "cgroup setup failed\n")) 64 + goto cleanup; 65 + 66 + link = bpf_program__attach_cgroup(skel->progs.ingress_lookup, cgfd); 67 + if (CHECK(IS_ERR(link), "cgroup_attach", "err: %ld\n", PTR_ERR(link))) 68 + goto cleanup; 69 + 70 + run_lookup_test(&skel->bss->g_serv_port, out_sk); 71 + 72 + bpf_link__destroy(link); 73 + 74 + cleanup: 75 + close(cgfd); 76 + cgroup_skb_sk_lookup_kern__destroy(skel); 77 + } 78 + 79 + void test_cgroup_skb_sk_lookup(void) 80 + { 81 + const char *cg_path = "/foo"; 82 + int out_sk; 83 + 84 + /* Create a socket before joining testing cgroup so that its cgroup id 85 + * differs from that of testing cgroup. Moving selftests process to 86 + * testing cgroup won't change cgroup id of an already created socket. 87 + */ 88 + out_sk = socket(AF_INET6, SOCK_STREAM | SOCK_NONBLOCK, 0); 89 + if (CHECK_FAIL(out_sk < 0)) 90 + return; 91 + 92 + run_cgroup_bpf_test(cg_path, out_sk); 93 + 94 + close(out_sk); 95 + }
+115
tools/testing/selftests/bpf/prog_tests/connect_force_port.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + 3 + #include <test_progs.h> 4 + #include "cgroup_helpers.h" 5 + #include "network_helpers.h" 6 + 7 + static int verify_port(int family, int fd, int expected) 8 + { 9 + struct sockaddr_storage addr; 10 + socklen_t len = sizeof(addr); 11 + __u16 port; 12 + 13 + if (getsockname(fd, (struct sockaddr *)&addr, &len)) { 14 + log_err("Failed to get server addr"); 15 + return -1; 16 + } 17 + 18 + if (family == AF_INET) 19 + port = ((struct sockaddr_in *)&addr)->sin_port; 20 + else 21 + port = ((struct sockaddr_in6 *)&addr)->sin6_port; 22 + 23 + if (ntohs(port) != expected) { 24 + log_err("Unexpected port %d, expected %d", ntohs(port), 25 + expected); 26 + return -1; 27 + } 28 + 29 + return 0; 30 + } 31 + 32 + static int run_test(int cgroup_fd, int server_fd, int family, int type) 33 + { 34 + struct bpf_prog_load_attr attr = { 35 + .prog_type = BPF_PROG_TYPE_CGROUP_SOCK_ADDR, 36 + }; 37 + struct bpf_object *obj; 38 + int expected_port; 39 + int prog_fd; 40 + int err; 41 + int fd; 42 + 43 + if (family == AF_INET) { 44 + attr.file = "./connect_force_port4.o"; 45 + attr.expected_attach_type = BPF_CGROUP_INET4_CONNECT; 46 + expected_port = 22222; 47 + } else { 48 + attr.file = "./connect_force_port6.o"; 49 + attr.expected_attach_type = BPF_CGROUP_INET6_CONNECT; 50 + expected_port = 22223; 51 + } 52 + 53 + err = bpf_prog_load_xattr(&attr, &obj, &prog_fd); 54 + if (err) { 55 + log_err("Failed to load BPF object"); 56 + return -1; 57 + } 58 + 59 + err = bpf_prog_attach(prog_fd, cgroup_fd, attr.expected_attach_type, 60 + 0); 61 + if (err) { 62 + log_err("Failed to attach BPF program"); 63 + goto close_bpf_object; 64 + } 65 + 66 + fd = connect_to_fd(family, type, server_fd); 67 + if (fd < 0) { 68 + err = -1; 69 + goto close_bpf_object; 70 + } 71 + 72 + err = verify_port(family, fd, expected_port); 73 + 74 + close(fd); 75 + 76 + close_bpf_object: 77 + bpf_object__close(obj); 78 + return err; 79 + } 80 + 81 + void 
test_connect_force_port(void) 82 + { 83 + int server_fd, cgroup_fd; 84 + 85 + cgroup_fd = test__join_cgroup("/connect_force_port"); 86 + if (CHECK_FAIL(cgroup_fd < 0)) 87 + return; 88 + 89 + server_fd = start_server(AF_INET, SOCK_STREAM); 90 + if (CHECK_FAIL(server_fd < 0)) 91 + goto close_cgroup_fd; 92 + CHECK_FAIL(run_test(cgroup_fd, server_fd, AF_INET, SOCK_STREAM)); 93 + close(server_fd); 94 + 95 + server_fd = start_server(AF_INET6, SOCK_STREAM); 96 + if (CHECK_FAIL(server_fd < 0)) 97 + goto close_cgroup_fd; 98 + CHECK_FAIL(run_test(cgroup_fd, server_fd, AF_INET6, SOCK_STREAM)); 99 + close(server_fd); 100 + 101 + server_fd = start_server(AF_INET, SOCK_DGRAM); 102 + if (CHECK_FAIL(server_fd < 0)) 103 + goto close_cgroup_fd; 104 + CHECK_FAIL(run_test(cgroup_fd, server_fd, AF_INET, SOCK_DGRAM)); 105 + close(server_fd); 106 + 107 + server_fd = start_server(AF_INET6, SOCK_DGRAM); 108 + if (CHECK_FAIL(server_fd < 0)) 109 + goto close_cgroup_fd; 110 + CHECK_FAIL(run_test(cgroup_fd, server_fd, AF_INET6, SOCK_DGRAM)); 111 + close(server_fd); 112 + 113 + close_cgroup_fd: 114 + close(cgroup_fd); 115 + }
+1
tools/testing/selftests/bpf/prog_tests/fexit_bpf2bpf.c
··· 1 1 // SPDX-License-Identifier: GPL-2.0 2 2 /* Copyright (c) 2019 Facebook */ 3 3 #include <test_progs.h> 4 + #include <network_helpers.h> 4 5 5 6 static void test_fexit_bpf2bpf_common(const char *obj_file, 6 7 const char *target_obj_file,
+1
tools/testing/selftests/bpf/prog_tests/flow_dissector.c
··· 1 1 // SPDX-License-Identifier: GPL-2.0 2 2 #include <test_progs.h> 3 + #include <network_helpers.h> 3 4 #include <error.h> 4 5 #include <linux/if.h> 5 6 #include <linux/if_tun.h>
+1
tools/testing/selftests/bpf/prog_tests/flow_dissector_load_bytes.c
··· 1 1 // SPDX-License-Identifier: GPL-2.0 2 2 #include <test_progs.h> 3 + #include <network_helpers.h> 3 4 4 5 void test_flow_dissector_load_bytes(void) 5 6 {
+1
tools/testing/selftests/bpf/prog_tests/global_data.c
··· 1 1 // SPDX-License-Identifier: GPL-2.0 2 2 #include <test_progs.h> 3 + #include <network_helpers.h> 3 4 4 5 static void test_global_data_number(struct bpf_object *obj, __u32 duration) 5 6 {
+1
tools/testing/selftests/bpf/prog_tests/kfree_skb.c
··· 1 1 // SPDX-License-Identifier: GPL-2.0 2 2 #include <test_progs.h> 3 + #include <network_helpers.h> 3 4 4 5 struct meta { 5 6 int ifindex;
+1
tools/testing/selftests/bpf/prog_tests/l4lb_all.c
··· 1 1 // SPDX-License-Identifier: GPL-2.0 2 2 #include <test_progs.h> 3 + #include <network_helpers.h> 3 4 4 5 static void test_l4lb(const char *file) 5 6 {
+14
tools/testing/selftests/bpf/prog_tests/map_lock.c
··· 1 1 // SPDX-License-Identifier: GPL-2.0 2 2 #include <test_progs.h> 3 + #include <network_helpers.h> 4 + 5 + static void *spin_lock_thread(void *arg) 6 + { 7 + __u32 duration, retval; 8 + int err, prog_fd = *(u32 *) arg; 9 + 10 + err = bpf_prog_test_run(prog_fd, 10000, &pkt_v4, sizeof(pkt_v4), 11 + NULL, NULL, &retval, &duration); 12 + CHECK(err || retval, "", 13 + "err %d errno %d retval %d duration %d\n", 14 + err, errno, retval, duration); 15 + pthread_exit(arg); 16 + } 3 17 4 18 static void *parallel_map_access(void *arg) 5 19 {
+1
tools/testing/selftests/bpf/prog_tests/pkt_access.c
··· 1 1 // SPDX-License-Identifier: GPL-2.0 2 2 #include <test_progs.h> 3 + #include <network_helpers.h> 3 4 4 5 void test_pkt_access(void) 5 6 {
+1
tools/testing/selftests/bpf/prog_tests/pkt_md_access.c
··· 1 1 // SPDX-License-Identifier: GPL-2.0 2 2 #include <test_progs.h> 3 + #include <network_helpers.h> 3 4 4 5 void test_pkt_md_access(void) 5 6 {
+1
tools/testing/selftests/bpf/prog_tests/prog_run_xattr.c
··· 1 1 // SPDX-License-Identifier: GPL-2.0 2 2 #include <test_progs.h> 3 + #include <network_helpers.h> 3 4 4 5 void test_prog_run_xattr(void) 5 6 {
+1
tools/testing/selftests/bpf/prog_tests/queue_stack_map.c
··· 1 1 // SPDX-License-Identifier: GPL-2.0 2 2 #include <test_progs.h> 3 + #include <network_helpers.h> 3 4 4 5 enum { 5 6 QUEUE,
+1
tools/testing/selftests/bpf/prog_tests/signal_pending.c
··· 1 1 // SPDX-License-Identifier: GPL-2.0 2 2 #include <test_progs.h> 3 + #include <network_helpers.h> 3 4 4 5 static void sigalrm_handler(int s) {} 5 6 static struct sigaction sigalrm_action = {
+1
tools/testing/selftests/bpf/prog_tests/skb_ctx.c
··· 1 1 // SPDX-License-Identifier: GPL-2.0 2 2 #include <test_progs.h> 3 + #include <network_helpers.h> 3 4 4 5 void test_skb_ctx(void) 5 6 {
+14
tools/testing/selftests/bpf/prog_tests/spinlock.c
··· 1 1 // SPDX-License-Identifier: GPL-2.0 2 2 #include <test_progs.h> 3 + #include <network_helpers.h> 4 + 5 + static void *spin_lock_thread(void *arg) 6 + { 7 + __u32 duration, retval; 8 + int err, prog_fd = *(u32 *) arg; 9 + 10 + err = bpf_prog_test_run(prog_fd, 10000, &pkt_v4, sizeof(pkt_v4), 11 + NULL, NULL, &retval, &duration); 12 + CHECK(err || retval, "", 13 + "err %d errno %d retval %d duration %d\n", 14 + err, errno, retval, duration); 15 + pthread_exit(arg); 16 + } 3 17 4 18 void test_spinlock(void) 5 19 {
+4 -112
tools/testing/selftests/bpf/prog_tests/tcp_rtt.c
··· 1 1 // SPDX-License-Identifier: GPL-2.0 2 2 #include <test_progs.h> 3 3 #include "cgroup_helpers.h" 4 + #include "network_helpers.h" 4 5 5 6 struct tcp_rtt_storage { 6 7 __u32 invoked; ··· 88 87 return err; 89 88 } 90 89 91 - static int connect_to_server(int server_fd) 92 - { 93 - struct sockaddr_storage addr; 94 - socklen_t len = sizeof(addr); 95 - int fd; 96 - 97 - fd = socket(AF_INET, SOCK_STREAM, 0); 98 - if (fd < 0) { 99 - log_err("Failed to create client socket"); 100 - return -1; 101 - } 102 - 103 - if (getsockname(server_fd, (struct sockaddr *)&addr, &len)) { 104 - log_err("Failed to get server addr"); 105 - goto out; 106 - } 107 - 108 - if (connect(fd, (const struct sockaddr *)&addr, len) < 0) { 109 - log_err("Fail to connect to server"); 110 - goto out; 111 - } 112 - 113 - return fd; 114 - 115 - out: 116 - close(fd); 117 - return -1; 118 - } 119 90 120 91 static int run_test(int cgroup_fd, int server_fd) 121 92 { ··· 118 145 goto close_bpf_object; 119 146 } 120 147 121 - client_fd = connect_to_server(server_fd); 148 + client_fd = connect_to_fd(AF_INET, SOCK_STREAM, server_fd); 122 149 if (client_fd < 0) { 123 150 err = -1; 124 151 goto close_bpf_object; ··· 153 180 return err; 154 181 } 155 182 156 - static int start_server(void) 157 - { 158 - struct sockaddr_in addr = { 159 - .sin_family = AF_INET, 160 - .sin_addr.s_addr = htonl(INADDR_LOOPBACK), 161 - }; 162 - int fd; 163 - 164 - fd = socket(AF_INET, SOCK_STREAM | SOCK_NONBLOCK, 0); 165 - if (fd < 0) { 166 - log_err("Failed to create server socket"); 167 - return -1; 168 - } 169 - 170 - if (bind(fd, (const struct sockaddr *)&addr, sizeof(addr)) < 0) { 171 - log_err("Failed to bind socket"); 172 - close(fd); 173 - return -1; 174 - } 175 - 176 - return fd; 177 - } 178 - 179 - static pthread_mutex_t server_started_mtx = PTHREAD_MUTEX_INITIALIZER; 180 - static pthread_cond_t server_started = PTHREAD_COND_INITIALIZER; 181 - static volatile bool server_done = false; 182 - 183 - static void 
*server_thread(void *arg) 184 - { 185 - struct sockaddr_storage addr; 186 - socklen_t len = sizeof(addr); 187 - int fd = *(int *)arg; 188 - int client_fd; 189 - int err; 190 - 191 - err = listen(fd, 1); 192 - 193 - pthread_mutex_lock(&server_started_mtx); 194 - pthread_cond_signal(&server_started); 195 - pthread_mutex_unlock(&server_started_mtx); 196 - 197 - if (CHECK_FAIL(err < 0)) { 198 - perror("Failed to listed on socket"); 199 - return ERR_PTR(err); 200 - } 201 - 202 - while (true) { 203 - client_fd = accept(fd, (struct sockaddr *)&addr, &len); 204 - if (client_fd == -1 && errno == EAGAIN) { 205 - usleep(50); 206 - continue; 207 - } 208 - break; 209 - } 210 - if (CHECK_FAIL(client_fd < 0)) { 211 - perror("Failed to accept client"); 212 - return ERR_PTR(err); 213 - } 214 - 215 - while (!server_done) 216 - usleep(50); 217 - 218 - close(client_fd); 219 - 220 - return NULL; 221 - } 222 - 223 183 void test_tcp_rtt(void) 224 184 { 225 185 int server_fd, cgroup_fd; 226 - pthread_t tid; 227 - void *server_res; 228 186 229 187 cgroup_fd = test__join_cgroup("/tcp_rtt"); 230 188 if (CHECK_FAIL(cgroup_fd < 0)) 231 189 return; 232 190 233 - server_fd = start_server(); 191 + server_fd = start_server(AF_INET, SOCK_STREAM); 234 192 if (CHECK_FAIL(server_fd < 0)) 235 193 goto close_cgroup_fd; 236 194 237 - if (CHECK_FAIL(pthread_create(&tid, NULL, server_thread, 238 - (void *)&server_fd))) 239 - goto close_server_fd; 240 - 241 - pthread_mutex_lock(&server_started_mtx); 242 - pthread_cond_wait(&server_started, &server_started_mtx); 243 - pthread_mutex_unlock(&server_started_mtx); 244 - 245 195 CHECK_FAIL(run_test(cgroup_fd, server_fd)); 246 196 247 - server_done = true; 248 - CHECK_FAIL(pthread_join(tid, &server_res)); 249 - CHECK_FAIL(IS_ERR(server_res)); 250 - 251 - close_server_fd: 252 197 close(server_fd); 198 + 253 199 close_cgroup_fd: 254 200 close(cgroup_fd); 255 201 }
+13 -1
tools/testing/selftests/bpf/prog_tests/test_overhead.c
··· 61 61 const char *raw_tp_name = "raw_tp/task_rename"; 62 62 const char *fentry_name = "fentry/__set_task_comm"; 63 63 const char *fexit_name = "fexit/__set_task_comm"; 64 + const char *fmodret_name = "fmod_ret/__set_task_comm"; 64 65 const char *kprobe_func = "__set_task_comm"; 65 66 struct bpf_program *kprobe_prog, *kretprobe_prog, *raw_tp_prog; 66 - struct bpf_program *fentry_prog, *fexit_prog; 67 + struct bpf_program *fentry_prog, *fexit_prog, *fmodret_prog; 67 68 struct bpf_object *obj; 68 69 struct bpf_link *link; 69 70 int err, duration = 0; ··· 96 95 fexit_prog = bpf_object__find_program_by_title(obj, fexit_name); 97 96 if (CHECK(!fexit_prog, "find_probe", 98 97 "prog '%s' not found\n", fexit_name)) 98 + goto cleanup; 99 + fmodret_prog = bpf_object__find_program_by_title(obj, fmodret_name); 100 + if (CHECK(!fmodret_prog, "find_probe", 101 + "prog '%s' not found\n", fmodret_name)) 99 102 goto cleanup; 100 103 101 104 err = bpf_object__load(obj); ··· 146 141 if (CHECK(IS_ERR(link), "attach fexit", "err %ld\n", PTR_ERR(link))) 147 142 goto cleanup; 148 143 test_run("fexit"); 144 + bpf_link__destroy(link); 145 + 146 + /* attach fmod_ret */ 147 + link = bpf_program__attach_trace(fmodret_prog); 148 + if (CHECK(IS_ERR(link), "attach fmod_ret", "err %ld\n", PTR_ERR(link))) 149 + goto cleanup; 150 + test_run("fmod_ret"); 149 151 bpf_link__destroy(link); 150 152 cleanup: 151 153 prctl(PR_SET_NAME, comm, 0L, 0L, 0L);
+1
tools/testing/selftests/bpf/prog_tests/xdp.c
··· 1 1 // SPDX-License-Identifier: GPL-2.0 2 2 #include <test_progs.h> 3 + #include <network_helpers.h> 3 4 4 5 void test_xdp(void) 5 6 {
+1
tools/testing/selftests/bpf/prog_tests/xdp_adjust_tail.c
··· 1 1 // SPDX-License-Identifier: GPL-2.0 2 2 #include <test_progs.h> 3 + #include <network_helpers.h> 3 4 4 5 void test_xdp_adjust_tail(void) 5 6 {
+1
tools/testing/selftests/bpf/prog_tests/xdp_bpf2bpf.c
··· 1 1 // SPDX-License-Identifier: GPL-2.0 2 2 #include <test_progs.h> 3 + #include <network_helpers.h> 3 4 #include <net/if.h> 4 5 #include "test_xdp.skel.h" 5 6 #include "test_xdp_bpf2bpf.skel.h"
+1
tools/testing/selftests/bpf/prog_tests/xdp_noinline.c
··· 1 1 // SPDX-License-Identifier: GPL-2.0 2 2 #include <test_progs.h> 3 + #include <network_helpers.h> 3 4 4 5 void test_xdp_noinline(void) 5 6 {
+28
tools/testing/selftests/bpf/progs/bpf_iter_bpf_map.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* Copyright (c) 2020 Facebook */ 3 + #include "vmlinux.h" 4 + #include <bpf/bpf_helpers.h> 5 + #include <bpf/bpf_tracing.h> 6 + 7 + char _license[] SEC("license") = "GPL"; 8 + 9 + SEC("iter/bpf_map") 10 + int dump_bpf_map(struct bpf_iter__bpf_map *ctx) 11 + { 12 + struct seq_file *seq = ctx->meta->seq; 13 + __u64 seq_num = ctx->meta->seq_num; 14 + struct bpf_map *map = ctx->map; 15 + 16 + if (map == (void *)0) { 17 + BPF_SEQ_PRINTF(seq, " %%%%%% END %%%%%%\n"); 18 + return 0; 19 + } 20 + 21 + if (seq_num == 0) 22 + BPF_SEQ_PRINTF(seq, " id refcnt usercnt locked_vm\n"); 23 + 24 + BPF_SEQ_PRINTF(seq, "%8u %8ld %8ld %10lu\n", map->id, map->refcnt.counter, 25 + map->usercnt.counter, 26 + map->memory.user->locked_vm.counter); 27 + return 0; 28 + }
+62
tools/testing/selftests/bpf/progs/bpf_iter_ipv6_route.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* Copyright (c) 2020 Facebook */ 3 + #include "vmlinux.h" 4 + #include <bpf/bpf_helpers.h> 5 + #include <bpf/bpf_tracing.h> 6 + 7 + char _license[] SEC("license") = "GPL"; 8 + 9 + extern bool CONFIG_IPV6_SUBTREES __kconfig __weak; 10 + 11 + #define RTF_GATEWAY 0x0002 12 + #define IFNAMSIZ 16 13 + #define fib_nh_gw_family nh_common.nhc_gw_family 14 + #define fib_nh_gw6 nh_common.nhc_gw.ipv6 15 + #define fib_nh_dev nh_common.nhc_dev 16 + 17 + SEC("iter/ipv6_route") 18 + int dump_ipv6_route(struct bpf_iter__ipv6_route *ctx) 19 + { 20 + struct seq_file *seq = ctx->meta->seq; 21 + struct fib6_info *rt = ctx->rt; 22 + const struct net_device *dev; 23 + struct fib6_nh *fib6_nh; 24 + unsigned int flags; 25 + struct nexthop *nh; 26 + 27 + if (rt == (void *)0) 28 + return 0; 29 + 30 + fib6_nh = &rt->fib6_nh[0]; 31 + flags = rt->fib6_flags; 32 + 33 + /* FIXME: nexthop_is_multipath is not handled here. */ 34 + nh = rt->nh; 35 + if (rt->nh) 36 + fib6_nh = &nh->nh_info->fib6_nh; 37 + 38 + BPF_SEQ_PRINTF(seq, "%pi6 %02x ", &rt->fib6_dst.addr, rt->fib6_dst.plen); 39 + 40 + if (CONFIG_IPV6_SUBTREES) 41 + BPF_SEQ_PRINTF(seq, "%pi6 %02x ", &rt->fib6_src.addr, 42 + rt->fib6_src.plen); 43 + else 44 + BPF_SEQ_PRINTF(seq, "00000000000000000000000000000000 00 "); 45 + 46 + if (fib6_nh->fib_nh_gw_family) { 47 + flags |= RTF_GATEWAY; 48 + BPF_SEQ_PRINTF(seq, "%pi6 ", &fib6_nh->fib_nh_gw6); 49 + } else { 50 + BPF_SEQ_PRINTF(seq, "00000000000000000000000000000000 "); 51 + } 52 + 53 + dev = fib6_nh->fib_nh_dev; 54 + if (dev) 55 + BPF_SEQ_PRINTF(seq, "%08x %08x %08x %08x %8s\n", rt->fib6_metric, 56 + rt->fib6_ref.refs.counter, 0, flags, dev->name); 57 + else 58 + BPF_SEQ_PRINTF(seq, "%08x %08x %08x %08x\n", rt->fib6_metric, 59 + rt->fib6_ref.refs.counter, 0, flags); 60 + 61 + return 0; 62 + }
+66
tools/testing/selftests/bpf/progs/bpf_iter_netlink.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* Copyright (c) 2020 Facebook */ 3 + #include "vmlinux.h" 4 + #include <bpf/bpf_helpers.h> 5 + #include <bpf/bpf_tracing.h> 6 + 7 + char _license[] SEC("license") = "GPL"; 8 + 9 + #define sk_rmem_alloc sk_backlog.rmem_alloc 10 + #define sk_refcnt __sk_common.skc_refcnt 11 + 12 + static inline struct inode *SOCK_INODE(struct socket *socket) 13 + { 14 + return &container_of(socket, struct socket_alloc, socket)->vfs_inode; 15 + } 16 + 17 + SEC("iter/netlink") 18 + int dump_netlink(struct bpf_iter__netlink *ctx) 19 + { 20 + struct seq_file *seq = ctx->meta->seq; 21 + struct netlink_sock *nlk = ctx->sk; 22 + unsigned long group, ino; 23 + struct inode *inode; 24 + struct socket *sk; 25 + struct sock *s; 26 + 27 + if (nlk == (void *)0) 28 + return 0; 29 + 30 + if (ctx->meta->seq_num == 0) 31 + BPF_SEQ_PRINTF(seq, "sk Eth Pid Groups " 32 + "Rmem Wmem Dump Locks Drops " 33 + "Inode\n"); 34 + 35 + s = &nlk->sk; 36 + BPF_SEQ_PRINTF(seq, "%pK %-3d ", s, s->sk_protocol); 37 + 38 + if (!nlk->groups) { 39 + group = 0; 40 + } else { 41 + /* FIXME: temporary use bpf_probe_read here, needs 42 + * verifier support to do direct access. 43 + */ 44 + bpf_probe_read(&group, sizeof(group), &nlk->groups[0]); 45 + } 46 + BPF_SEQ_PRINTF(seq, "%-10u %08x %-8d %-8d %-5d %-8d ", 47 + nlk->portid, (u32)group, 48 + s->sk_rmem_alloc.counter, 49 + s->sk_wmem_alloc.refs.counter - 1, 50 + nlk->cb_running, s->sk_refcnt.refs.counter); 51 + 52 + sk = s->sk_socket; 53 + if (!sk) { 54 + ino = 0; 55 + } else { 56 + /* FIXME: container_of inside SOCK_INODE has a forced 57 + * type conversion, and direct access cannot be used 58 + * with current verifier. 59 + */ 60 + inode = SOCK_INODE(sk); 61 + bpf_probe_read(&ino, sizeof(ino), &inode->i_ino); 62 + } 63 + BPF_SEQ_PRINTF(seq, "%-8u %-8lu\n", s->sk_drops.counter, ino); 64 + 65 + return 0; 66 + }
+25
tools/testing/selftests/bpf/progs/bpf_iter_task.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* Copyright (c) 2020 Facebook */ 3 + #include "vmlinux.h" 4 + #include <bpf/bpf_helpers.h> 5 + #include <bpf/bpf_tracing.h> 6 + 7 + char _license[] SEC("license") = "GPL"; 8 + 9 + SEC("iter/task") 10 + int dump_task(struct bpf_iter__task *ctx) 11 + { 12 + struct seq_file *seq = ctx->meta->seq; 13 + struct task_struct *task = ctx->task; 14 + 15 + if (task == (void *)0) { 16 + BPF_SEQ_PRINTF(seq, " === END ===\n"); 17 + return 0; 18 + } 19 + 20 + if (ctx->meta->seq_num == 0) 21 + BPF_SEQ_PRINTF(seq, " tgid gid\n"); 22 + 23 + BPF_SEQ_PRINTF(seq, "%8d %8d\n", task->tgid, task->pid); 24 + return 0; 25 + }
+26
tools/testing/selftests/bpf/progs/bpf_iter_task_file.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* Copyright (c) 2020 Facebook */ 3 + #include "vmlinux.h" 4 + #include <bpf/bpf_helpers.h> 5 + #include <bpf/bpf_tracing.h> 6 + 7 + char _license[] SEC("license") = "GPL"; 8 + 9 + SEC("iter/task_file") 10 + int dump_task_file(struct bpf_iter__task_file *ctx) 11 + { 12 + struct seq_file *seq = ctx->meta->seq; 13 + struct task_struct *task = ctx->task; 14 + __u32 fd = ctx->fd; 15 + struct file *file = ctx->file; 16 + 17 + if (task == (void *)0 || file == (void *)0) 18 + return 0; 19 + 20 + if (ctx->meta->seq_num == 0) 21 + BPF_SEQ_PRINTF(seq, " tgid gid fd file\n"); 22 + 23 + BPF_SEQ_PRINTF(seq, "%8d %8d %8d %lx\n", task->tgid, task->pid, fd, 24 + (long)file->f_op); 25 + return 0; 26 + }
+4
tools/testing/selftests/bpf/progs/bpf_iter_test_kern1.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* Copyright (c) 2020 Facebook */ 3 + #define START_CHAR 'a' 4 + #include "bpf_iter_test_kern_common.h"
+4
tools/testing/selftests/bpf/progs/bpf_iter_test_kern2.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* Copyright (c) 2020 Facebook */ 3 + #define START_CHAR 'A' 4 + #include "bpf_iter_test_kern_common.h"
+18
tools/testing/selftests/bpf/progs/bpf_iter_test_kern3.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* Copyright (c) 2020 Facebook */ 3 + #include "vmlinux.h" 4 + #include <bpf/bpf_helpers.h> 5 + 6 + char _license[] SEC("license") = "GPL"; 7 + 8 + SEC("iter/task") 9 + int dump_task(struct bpf_iter__task *ctx) 10 + { 11 + struct seq_file *seq = ctx->meta->seq; 12 + struct task_struct *task = ctx->task; 13 + int tgid; 14 + 15 + tgid = task->tgid; 16 + bpf_seq_write(seq, &tgid, sizeof(tgid)); 17 + return 0; 18 + }
+52
tools/testing/selftests/bpf/progs/bpf_iter_test_kern4.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* Copyright (c) 2020 Facebook */ 3 + #include "vmlinux.h" 4 + #include <bpf/bpf_helpers.h> 5 + 6 + char _license[] SEC("license") = "GPL"; 7 + 8 + __u32 map1_id = 0, map2_id = 0; 9 + __u32 map1_accessed = 0, map2_accessed = 0; 10 + __u64 map1_seqnum = 0, map2_seqnum1 = 0, map2_seqnum2 = 0; 11 + 12 + static volatile const __u32 print_len; 13 + static volatile const __u32 ret1; 14 + 15 + SEC("iter/bpf_map") 16 + int dump_bpf_map(struct bpf_iter__bpf_map *ctx) 17 + { 18 + struct seq_file *seq = ctx->meta->seq; 19 + struct bpf_map *map = ctx->map; 20 + __u64 seq_num; 21 + int i, ret = 0; 22 + 23 + if (map == (void *)0) 24 + return 0; 25 + 26 + /* only dump map1_id and map2_id */ 27 + if (map->id != map1_id && map->id != map2_id) 28 + return 0; 29 + 30 + seq_num = ctx->meta->seq_num; 31 + if (map->id == map1_id) { 32 + map1_seqnum = seq_num; 33 + map1_accessed++; 34 + } 35 + 36 + if (map->id == map2_id) { 37 + if (map2_accessed == 0) { 38 + map2_seqnum1 = seq_num; 39 + if (ret1) 40 + ret = 1; 41 + } else { 42 + map2_seqnum2 = seq_num; 43 + } 44 + map2_accessed++; 45 + } 46 + 47 + /* fill seq_file buffer */ 48 + for (i = 0; i < print_len; i++) 49 + bpf_seq_write(seq, &seq_num, sizeof(seq_num)); 50 + 51 + return ret; 52 + }
+22
tools/testing/selftests/bpf/progs/bpf_iter_test_kern_common.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 2 + /* Copyright (c) 2020 Facebook */ 3 + #include "vmlinux.h" 4 + #include <bpf/bpf_helpers.h> 5 + 6 + char _license[] SEC("license") = "GPL"; 7 + int count = 0; 8 + 9 + SEC("iter/task") 10 + int dump_task(struct bpf_iter__task *ctx) 11 + { 12 + struct seq_file *seq = ctx->meta->seq; 13 + char c; 14 + 15 + if (count < 4) { 16 + c = START_CHAR + count; 17 + bpf_seq_write(seq, &c, sizeof(c)); 18 + count++; 19 + } 20 + 21 + return 0; 22 + }
+97
tools/testing/selftests/bpf/progs/cgroup_skb_sk_lookup_kern.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + // Copyright (c) 2020 Facebook 3 + 4 + #include <linux/bpf.h> 5 + #include <bpf/bpf_endian.h> 6 + #include <bpf/bpf_helpers.h> 7 + 8 + #include <linux/if_ether.h> 9 + #include <linux/in.h> 10 + #include <linux/in6.h> 11 + #include <linux/ipv6.h> 12 + #include <linux/tcp.h> 13 + 14 + #include <sys/types.h> 15 + #include <sys/socket.h> 16 + 17 + int _version SEC("version") = 1; 18 + char _license[] SEC("license") = "GPL"; 19 + 20 + __u16 g_serv_port = 0; 21 + 22 + static inline void set_ip(__u32 *dst, const struct in6_addr *src) 23 + { 24 + dst[0] = src->in6_u.u6_addr32[0]; 25 + dst[1] = src->in6_u.u6_addr32[1]; 26 + dst[2] = src->in6_u.u6_addr32[2]; 27 + dst[3] = src->in6_u.u6_addr32[3]; 28 + } 29 + 30 + static inline void set_tuple(struct bpf_sock_tuple *tuple, 31 + const struct ipv6hdr *ip6h, 32 + const struct tcphdr *tcph) 33 + { 34 + set_ip(tuple->ipv6.saddr, &ip6h->daddr); 35 + set_ip(tuple->ipv6.daddr, &ip6h->saddr); 36 + tuple->ipv6.sport = tcph->dest; 37 + tuple->ipv6.dport = tcph->source; 38 + } 39 + 40 + static inline int is_allowed_peer_cg(struct __sk_buff *skb, 41 + const struct ipv6hdr *ip6h, 42 + const struct tcphdr *tcph) 43 + { 44 + __u64 cgid, acgid, peer_cgid, peer_acgid; 45 + struct bpf_sock_tuple tuple; 46 + size_t tuple_len = sizeof(tuple.ipv6); 47 + struct bpf_sock *peer_sk; 48 + 49 + set_tuple(&tuple, ip6h, tcph); 50 + 51 + peer_sk = bpf_sk_lookup_tcp(skb, &tuple, tuple_len, 52 + BPF_F_CURRENT_NETNS, 0); 53 + if (!peer_sk) 54 + return 0; 55 + 56 + cgid = bpf_skb_cgroup_id(skb); 57 + peer_cgid = bpf_sk_cgroup_id(peer_sk); 58 + 59 + acgid = bpf_skb_ancestor_cgroup_id(skb, 2); 60 + peer_acgid = bpf_sk_ancestor_cgroup_id(peer_sk, 2); 61 + 62 + bpf_sk_release(peer_sk); 63 + 64 + return cgid && cgid == peer_cgid && acgid && acgid == peer_acgid; 65 + } 66 + 67 + SEC("cgroup_skb/ingress") 68 + int ingress_lookup(struct __sk_buff *skb) 69 + { 70 + __u32 serv_port_key = 0; 71 + struct ipv6hdr ip6h; 72 + 
struct tcphdr tcph; 73 + 74 + if (skb->protocol != bpf_htons(ETH_P_IPV6)) 75 + return 1; 76 + 77 + /* For SYN packets coming to listening socket skb->remote_port will be 78 + * zero, so IPv6/TCP headers are loaded to identify remote peer 79 + * instead. 80 + */ 81 + if (bpf_skb_load_bytes(skb, 0, &ip6h, sizeof(ip6h))) 82 + return 1; 83 + 84 + if (ip6h.nexthdr != IPPROTO_TCP) 85 + return 1; 86 + 87 + if (bpf_skb_load_bytes(skb, sizeof(ip6h), &tcph, sizeof(tcph))) 88 + return 1; 89 + 90 + if (!g_serv_port) 91 + return 0; 92 + 93 + if (tcph.dest != g_serv_port) 94 + return 1; 95 + 96 + return is_allowed_peer_cg(skb, &ip6h, &tcph); 97 + }
+28
tools/testing/selftests/bpf/progs/connect_force_port4.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + #include <string.h> 3 + 4 + #include <linux/bpf.h> 5 + #include <linux/in.h> 6 + #include <linux/in6.h> 7 + #include <sys/socket.h> 8 + 9 + #include <bpf/bpf_helpers.h> 10 + #include <bpf/bpf_endian.h> 11 + 12 + char _license[] SEC("license") = "GPL"; 13 + int _version SEC("version") = 1; 14 + 15 + SEC("cgroup/connect4") 16 + int _connect4(struct bpf_sock_addr *ctx) 17 + { 18 + struct sockaddr_in sa = {}; 19 + 20 + sa.sin_family = AF_INET; 21 + sa.sin_port = bpf_htons(22222); 22 + sa.sin_addr.s_addr = bpf_htonl(0x7f000001); /* 127.0.0.1 */ 23 + 24 + if (bpf_bind(ctx, (struct sockaddr *)&sa, sizeof(sa)) != 0) 25 + return 0; 26 + 27 + return 1; 28 + }
+28
tools/testing/selftests/bpf/progs/connect_force_port6.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + #include <string.h> 3 + 4 + #include <linux/bpf.h> 5 + #include <linux/in.h> 6 + #include <linux/in6.h> 7 + #include <sys/socket.h> 8 + 9 + #include <bpf/bpf_helpers.h> 10 + #include <bpf/bpf_endian.h> 11 + 12 + char _license[] SEC("license") = "GPL"; 13 + int _version SEC("version") = 1; 14 + 15 + SEC("cgroup/connect6") 16 + int _connect6(struct bpf_sock_addr *ctx) 17 + { 18 + struct sockaddr_in6 sa = {}; 19 + 20 + sa.sin6_family = AF_INET6; 21 + sa.sin6_port = bpf_htons(22223); 22 + sa.sin6_addr.s6_addr32[3] = bpf_htonl(1); /* ::1 */ 23 + 24 + if (bpf_bind(ctx, (struct sockaddr *)&sa, sizeof(sa)) != 0) 25 + return 0; 26 + 27 + return 1; 28 + }
+1 -1
tools/testing/selftests/bpf/progs/core_reloc_types.h
··· 379 379 struct core_reloc_arrays_substruct c[3]; 380 380 struct core_reloc_arrays_substruct d[1][2]; 381 381 /* equivalent to flexible array */ 382 - struct core_reloc_arrays_substruct f[0][2]; 382 + struct core_reloc_arrays_substruct f[][2]; 383 383 }; 384 384 385 385 struct core_reloc_arrays___fixed_arr {
+6
tools/testing/selftests/bpf/progs/test_overhead.c
··· 39 39 return !tsk; 40 40 } 41 41 42 + SEC("fmod_ret/__set_task_comm") 43 + int BPF_PROG(prog6, struct task_struct *tsk, const char *buf, bool exec) 44 + { 45 + return !tsk; 46 + } 47 + 42 48 char _license[] SEC("license") = "GPL";
+47
tools/testing/selftests/bpf/progs/trigger_bench.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + // Copyright (c) 2020 Facebook 3 + 4 + #include <linux/bpf.h> 5 + #include <asm/unistd.h> 6 + #include <bpf/bpf_helpers.h> 7 + #include <bpf/bpf_tracing.h> 8 + 9 + char _license[] SEC("license") = "GPL"; 10 + 11 + long hits = 0; 12 + 13 + SEC("tp/syscalls/sys_enter_getpgid") 14 + int bench_trigger_tp(void *ctx) 15 + { 16 + __sync_add_and_fetch(&hits, 1); 17 + return 0; 18 + } 19 + 20 + SEC("raw_tp/sys_enter") 21 + int BPF_PROG(bench_trigger_raw_tp, struct pt_regs *regs, long id) 22 + { 23 + if (id == __NR_getpgid) 24 + __sync_add_and_fetch(&hits, 1); 25 + return 0; 26 + } 27 + 28 + SEC("kprobe/__x64_sys_getpgid") 29 + int bench_trigger_kprobe(void *ctx) 30 + { 31 + __sync_add_and_fetch(&hits, 1); 32 + return 0; 33 + } 34 + 35 + SEC("fentry/__x64_sys_getpgid") 36 + int bench_trigger_fentry(void *ctx) 37 + { 38 + __sync_add_and_fetch(&hits, 1); 39 + return 0; 40 + } 41 + 42 + SEC("fmod_ret/__x64_sys_getpgid") 43 + int bench_trigger_fmodret(void *ctx) 44 + { 45 + __sync_add_and_fetch(&hits, 1); 46 + return -22; 47 + }
+4 -93
tools/testing/selftests/bpf/test_progs.c
··· 222 222 return fd; 223 223 } 224 224 225 - struct ipv4_packet pkt_v4 = { 226 - .eth.h_proto = __bpf_constant_htons(ETH_P_IP), 227 - .iph.ihl = 5, 228 - .iph.protocol = IPPROTO_TCP, 229 - .iph.tot_len = __bpf_constant_htons(MAGIC_BYTES), 230 - .tcp.urg_ptr = 123, 231 - .tcp.doff = 5, 232 - }; 233 - 234 - struct ipv6_packet pkt_v6 = { 235 - .eth.h_proto = __bpf_constant_htons(ETH_P_IPV6), 236 - .iph.nexthdr = IPPROTO_TCP, 237 - .iph.payload_len = __bpf_constant_htons(MAGIC_BYTES), 238 - .tcp.urg_ptr = 123, 239 - .tcp.doff = 5, 240 - }; 241 - 242 225 int bpf_find_map(const char *test, struct bpf_object *obj, const char *name) 243 226 { 244 227 struct bpf_map *map; ··· 341 358 return -1; 342 359 } 343 360 344 - void *spin_lock_thread(void *arg) 345 - { 346 - __u32 duration, retval; 347 - int err, prog_fd = *(u32 *) arg; 348 - 349 - err = bpf_prog_test_run(prog_fd, 10000, &pkt_v4, sizeof(pkt_v4), 350 - NULL, NULL, &retval, &duration); 351 - CHECK(err || retval, "", 352 - "err %d errno %d retval %d duration %d\n", 353 - err, errno, retval, duration); 354 - pthread_exit(arg); 355 - } 356 - 357 361 /* extern declarations for test funcs */ 358 362 #define DEFINE_TEST(name) extern void test_##name(void); 359 363 #include <prog_tests/tests.h> ··· 438 468 return -ENOMEM; 439 469 } 440 470 441 - int parse_num_list(const char *s, struct test_selector *sel) 442 - { 443 - int i, set_len = 0, new_len, num, start = 0, end = -1; 444 - bool *set = NULL, *tmp, parsing_end = false; 445 - char *next; 446 - 447 - while (s[0]) { 448 - errno = 0; 449 - num = strtol(s, &next, 10); 450 - if (errno) 451 - return -errno; 452 - 453 - if (parsing_end) 454 - end = num; 455 - else 456 - start = num; 457 - 458 - if (!parsing_end && *next == '-') { 459 - s = next + 1; 460 - parsing_end = true; 461 - continue; 462 - } else if (*next == ',') { 463 - parsing_end = false; 464 - s = next + 1; 465 - end = num; 466 - } else if (*next == '\0') { 467 - parsing_end = false; 468 - s = next; 469 - end = num; 
470 - } else { 471 - return -EINVAL; 472 - } 473 - 474 - if (start > end) 475 - return -EINVAL; 476 - 477 - if (end + 1 > set_len) { 478 - new_len = end + 1; 479 - tmp = realloc(set, new_len); 480 - if (!tmp) { 481 - free(set); 482 - return -ENOMEM; 483 - } 484 - for (i = set_len; i < start; i++) 485 - tmp[i] = false; 486 - set = tmp; 487 - set_len = new_len; 488 - } 489 - for (i = start; i <= end; i++) 490 - set[i] = true; 491 - } 492 - 493 - if (!set) 494 - return -EINVAL; 495 - 496 - sel->num_set = set; 497 - sel->num_set_len = set_len; 498 - 499 - return 0; 500 - } 501 - 502 471 extern int extra_prog_load_log_flags; 503 472 504 473 static error_t parse_arg(int key, char *arg, struct argp_state *state) ··· 451 542 if (subtest_str) { 452 543 *subtest_str = '\0'; 453 544 if (parse_num_list(subtest_str + 1, 454 - &env->subtest_selector)) { 545 + &env->subtest_selector.num_set, 546 + &env->subtest_selector.num_set_len)) { 455 547 fprintf(stderr, 456 548 "Failed to parse subtest numbers.\n"); 457 549 return -EINVAL; 458 550 } 459 551 } 460 - if (parse_num_list(arg, &env->test_selector)) { 552 + if (parse_num_list(arg, &env->test_selector.num_set, 553 + &env->test_selector.num_set_len)) { 461 554 fprintf(stderr, "Failed to parse test numbers.\n"); 462 555 return -EINVAL; 463 556 }
+1 -23
tools/testing/selftests/bpf/test_progs.h
··· 37 37 #include "bpf_util.h" 38 38 #include <bpf/bpf_endian.h> 39 39 #include "trace_helpers.h" 40 + #include "testing_helpers.h" 40 41 #include "flow_dissector_load.h" 41 42 42 43 enum verbosity { ··· 88 87 extern void test__fail(void); 89 88 extern int test__join_cgroup(const char *path); 90 89 91 - #define MAGIC_BYTES 123 92 - 93 - /* ipv4 test vector */ 94 - struct ipv4_packet { 95 - struct ethhdr eth; 96 - struct iphdr iph; 97 - struct tcphdr tcp; 98 - } __packed; 99 - extern struct ipv4_packet pkt_v4; 100 - 101 - /* ipv6 test vector */ 102 - struct ipv6_packet { 103 - struct ethhdr eth; 104 - struct ipv6hdr iph; 105 - struct tcphdr tcp; 106 - } __packed; 107 - extern struct ipv6_packet pkt_v6; 108 - 109 90 #define PRINT_FAIL(format...) \ 110 91 ({ \ 111 92 test__fail(); \ ··· 126 143 #define CHECK_ATTR(condition, tag, format...) \ 127 144 _CHECK(condition, tag, tattr.duration, format) 128 145 129 - #define MAGIC_VAL 0x1234 130 - #define NUM_ITER 100000 131 - #define VIP_NUM 5 132 - 133 146 static inline __u64 ptr_to_u64(const void *ptr) 134 147 { 135 148 return (__u64) (unsigned long) ptr; ··· 135 156 int compare_map_keys(int map1_fd, int map2_fd); 136 157 int compare_stack_ips(int smap_fd, int amap_fd, int stack_trace_len); 137 158 int extract_build_id(char *build_id, size_t size); 138 - void *spin_lock_thread(void *arg); 139 159 140 160 #ifdef __x86_64__ 141 161 #define SYS_NANOSLEEP_KPROBE_NAME "__x64_sys_nanosleep"
+28 -10
tools/testing/selftests/bpf/test_sock_addr.c
··· 677 677 uint8_t u4_addr8[4]; 678 678 uint16_t u4_addr16[2]; 679 679 uint32_t u4_addr32; 680 - } ip4; 680 + } ip4, port; 681 681 struct sockaddr_in addr4_rw; 682 682 683 683 if (inet_pton(AF_INET, SERV4_IP, (void *)&ip4) != 1) { 684 684 log_err("Invalid IPv4: %s", SERV4_IP); 685 685 return -1; 686 686 } 687 + 688 + port.u4_addr32 = htons(SERV4_PORT); 687 689 688 690 if (mk_sockaddr(AF_INET, SERV4_REWRITE_IP, SERV4_REWRITE_PORT, 689 691 (struct sockaddr *)&addr4_rw, sizeof(addr4_rw)) == -1) ··· 698 696 /* if (sk.family == AF_INET && */ 699 697 BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6, 700 698 offsetof(struct bpf_sock_addr, family)), 701 - BPF_JMP_IMM(BPF_JNE, BPF_REG_7, AF_INET, 24), 699 + BPF_JMP_IMM(BPF_JNE, BPF_REG_7, AF_INET, 32), 702 700 703 701 /* (sk.type == SOCK_DGRAM || sk.type == SOCK_STREAM) && */ 704 702 BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6, 705 703 offsetof(struct bpf_sock_addr, type)), 706 704 BPF_JMP_IMM(BPF_JNE, BPF_REG_7, SOCK_DGRAM, 1), 707 705 BPF_JMP_A(1), 708 - BPF_JMP_IMM(BPF_JNE, BPF_REG_7, SOCK_STREAM, 20), 706 + BPF_JMP_IMM(BPF_JNE, BPF_REG_7, SOCK_STREAM, 28), 709 707 710 708 /* 1st_byte_of_user_ip4 == expected && */ 711 709 BPF_LDX_MEM(BPF_B, BPF_REG_7, BPF_REG_6, 712 710 offsetof(struct bpf_sock_addr, user_ip4)), 713 - BPF_JMP_IMM(BPF_JNE, BPF_REG_7, ip4.u4_addr8[0], 18), 711 + BPF_JMP_IMM(BPF_JNE, BPF_REG_7, ip4.u4_addr8[0], 26), 714 712 715 713 /* 2nd_byte_of_user_ip4 == expected && */ 716 714 BPF_LDX_MEM(BPF_B, BPF_REG_7, BPF_REG_6, 717 715 offsetof(struct bpf_sock_addr, user_ip4) + 1), 718 - BPF_JMP_IMM(BPF_JNE, BPF_REG_7, ip4.u4_addr8[1], 16), 716 + BPF_JMP_IMM(BPF_JNE, BPF_REG_7, ip4.u4_addr8[1], 24), 719 717 720 718 /* 3rd_byte_of_user_ip4 == expected && */ 721 719 BPF_LDX_MEM(BPF_B, BPF_REG_7, BPF_REG_6, 722 720 offsetof(struct bpf_sock_addr, user_ip4) + 2), 723 - BPF_JMP_IMM(BPF_JNE, BPF_REG_7, ip4.u4_addr8[2], 14), 721 + BPF_JMP_IMM(BPF_JNE, BPF_REG_7, ip4.u4_addr8[2], 22), 724 722 725 723 /* 4th_byte_of_user_ip4 == 
expected && */ 726 724 BPF_LDX_MEM(BPF_B, BPF_REG_7, BPF_REG_6, 727 725 offsetof(struct bpf_sock_addr, user_ip4) + 3), 728 - BPF_JMP_IMM(BPF_JNE, BPF_REG_7, ip4.u4_addr8[3], 12), 726 + BPF_JMP_IMM(BPF_JNE, BPF_REG_7, ip4.u4_addr8[3], 20), 729 727 730 728 /* 1st_half_of_user_ip4 == expected && */ 731 729 BPF_LDX_MEM(BPF_H, BPF_REG_7, BPF_REG_6, 732 730 offsetof(struct bpf_sock_addr, user_ip4)), 733 - BPF_JMP_IMM(BPF_JNE, BPF_REG_7, ip4.u4_addr16[0], 10), 731 + BPF_JMP_IMM(BPF_JNE, BPF_REG_7, ip4.u4_addr16[0], 18), 734 732 735 733 /* 2nd_half_of_user_ip4 == expected && */ 736 734 BPF_LDX_MEM(BPF_H, BPF_REG_7, BPF_REG_6, 737 735 offsetof(struct bpf_sock_addr, user_ip4) + 2), 738 - BPF_JMP_IMM(BPF_JNE, BPF_REG_7, ip4.u4_addr16[1], 8), 736 + BPF_JMP_IMM(BPF_JNE, BPF_REG_7, ip4.u4_addr16[1], 16), 739 737 740 - /* whole_user_ip4 == expected) { */ 738 + /* whole_user_ip4 == expected && */ 741 739 BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6, 742 740 offsetof(struct bpf_sock_addr, user_ip4)), 743 741 BPF_LD_IMM64(BPF_REG_8, ip4.u4_addr32), /* See [2]. */ 742 + BPF_JMP_REG(BPF_JNE, BPF_REG_7, BPF_REG_8, 12), 743 + 744 + /* 1st_byte_of_user_port == expected && */ 745 + BPF_LDX_MEM(BPF_B, BPF_REG_7, BPF_REG_6, 746 + offsetof(struct bpf_sock_addr, user_port)), 747 + BPF_JMP_IMM(BPF_JNE, BPF_REG_7, port.u4_addr8[0], 10), 748 + 749 + /* 1st_half_of_user_port == expected && */ 750 + BPF_LDX_MEM(BPF_H, BPF_REG_7, BPF_REG_6, 751 + offsetof(struct bpf_sock_addr, user_port)), 752 + BPF_JMP_IMM(BPF_JNE, BPF_REG_7, port.u4_addr16[0], 8), 753 + 754 + /* user_port == expected) { */ 755 + BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6, 756 + offsetof(struct bpf_sock_addr, user_port)), 757 + BPF_LD_IMM64(BPF_REG_8, port.u4_addr32), /* See [2]. */ 744 758 BPF_JMP_REG(BPF_JNE, BPF_REG_7, BPF_REG_8, 4), 745 759 746 760 /* user_ip4 = addr4_rw.sin_addr */
+66
tools/testing/selftests/bpf/testing_helpers.c
// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
/* Copyright (C) 2020 Facebook, Inc. */
#include <stdbool.h>
#include <stdlib.h>
#include <errno.h>

/*
 * parse_num_list() - parse a test-number selector such as "1,3-5,8".
 * @s:           input string of comma-separated numbers and inclusive ranges
 * @num_set:     out: heap-allocated bool array, true for each selected
 *               number; ownership transfers to the caller, who must free() it
 * @num_set_len: out: array length (largest selected number + 1)
 *
 * Declared in testing_helpers.h.
 *
 * Returns 0 on success, -EINVAL on malformed input (empty string, stray
 * characters, reversed ranges, or negative numbers), -ENOMEM on allocation
 * failure, or -errno from strtol() overflow.
 */
int parse_num_list(const char *s, bool **num_set, int *num_set_len)
{
	int i, set_len = 0, new_len, num, start = 0, end = -1;
	bool *set = NULL, *tmp, parsing_end = false;
	char *next;

	while (s[0]) {
		errno = 0;
		num = strtol(s, &next, 10);
		if (errno) {
			free(set);
			return -errno;
		}
		/* Fix: a negative number would index set[] out of bounds
		 * (e.g. "-1" wrote to set[-1]); reject it instead. */
		if (num < 0) {
			free(set);
			return -EINVAL;
		}

		if (parsing_end)
			end = num;
		else
			start = num;

		if (!parsing_end && *next == '-') {
			s = next + 1;
			parsing_end = true;
			continue;
		} else if (*next == ',') {
			parsing_end = false;
			s = next + 1;
			end = num;
		} else if (*next == '\0') {
			parsing_end = false;
			s = next;
			end = num;
		} else {
			/* Fix: don't leak the partially built set on error. */
			free(set);
			return -EINVAL;
		}

		if (start > end) {
			free(set);
			return -EINVAL;
		}

		/* Grow the array to cover `end`, zero-filling the gap
		 * between the old length and `start`. */
		if (end + 1 > set_len) {
			new_len = end + 1;
			tmp = realloc(set, new_len);
			if (!tmp) {
				free(set);
				return -ENOMEM;
			}
			for (i = set_len; i < start; i++)
				tmp[i] = false;
			set = tmp;
			set_len = new_len;
		}
		for (i = start; i <= end; i++)
			set[i] = true;
	}

	/* Empty input selects nothing — treat as an error. */
	if (!set)
		return -EINVAL;

	*num_set = set;
	*num_set_len = set_len;

	return 0;
}
+5
tools/testing/selftests/bpf/testing_helpers.h
··· 1 + /* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ 2 + /* Copyright (C) 2020 Facebook, Inc. */ 3 + #include <stdbool.h> 4 + 5 + int parse_num_list(const char *s, bool **set, int *set_len);