Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next

Alexei Starovoitov says:

====================
pull-request: bpf-next 2020-10-12

The main changes are:

1) The BPF verifier improvements to track register allocation pattern, from Alexei and Yonghong.

2) libbpf relocation support for different size load/store, from Andrii.

3) bpf_redirect_peer() helper and support for inner map array with different max_entries, from Daniel.

4) BPF support for per-cpu variables, from Hao.

5) sockmap improvements, from John.
====================

Signed-off-by: Jakub Kicinski <kuba@kernel.org>

+4334 -767
+2 -2
Documentation/bpf/bpf_devel_QA.rst
··· 60 60 A: All patches that are Cc'ed to netdev are queued for review under netdev 61 61 patchwork project: 62 62 63 - http://patchwork.ozlabs.org/project/netdev/list/ 63 + https://patchwork.kernel.org/project/netdevbpf/list/ 64 64 65 65 Those patches which target BPF, are assigned to a 'bpf' delegate for 66 66 further processing from BPF maintainers. The current queue with 67 67 patches under review can be found at: 68 68 69 - https://patchwork.ozlabs.org/project/netdev/list/?delegate=77147 69 + https://patchwork.kernel.org/project/netdevbpf/list/?delegate=121173 70 70 71 71 Once the patches have been reviewed by the BPF community as a whole 72 72 and approved by the BPF maintainers, their status in patchwork will be
+1 -1
MAINTAINERS
··· 3263 3263 R: Martin KaFai Lau <kafai@fb.com> 3264 3264 R: Song Liu <songliubraving@fb.com> 3265 3265 R: Yonghong Song <yhs@fb.com> 3266 - R: Andrii Nakryiko <andriin@fb.com> 3266 + R: Andrii Nakryiko <andrii@kernel.org> 3267 3267 R: John Fastabend <john.fastabend@gmail.com> 3268 3268 R: KP Singh <kpsingh@chromium.org> 3269 3269 L: netdev@vger.kernel.org
+9
drivers/net/veth.c
··· 420 420 return smp_processor_id() % dev->real_num_rx_queues; 421 421 } 422 422 423 + static struct net_device *veth_peer_dev(struct net_device *dev) 424 + { 425 + struct veth_priv *priv = netdev_priv(dev); 426 + 427 + /* Callers must be under RCU read side. */ 428 + return rcu_dereference(priv->peer); 429 + } 430 + 423 431 static int veth_xdp_xmit(struct net_device *dev, int n, 424 432 struct xdp_frame **frames, 425 433 u32 flags, bool ndo_xmit) ··· 1232 1224 .ndo_set_rx_headroom = veth_set_rx_headroom, 1233 1225 .ndo_bpf = veth_xdp, 1234 1226 .ndo_xdp_xmit = veth_ndo_xdp_xmit, 1227 + .ndo_get_peer_dev = veth_peer_dev, 1235 1228 }; 1236 1229 1237 1230 #define VETH_FEATURES (NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HW_CSUM | \
+7 -1
include/linux/bpf.h
··· 82 82 void *(*map_fd_get_ptr)(struct bpf_map *map, struct file *map_file, 83 83 int fd); 84 84 void (*map_fd_put_ptr)(void *ptr); 85 - u32 (*map_gen_lookup)(struct bpf_map *map, struct bpf_insn *insn_buf); 85 + int (*map_gen_lookup)(struct bpf_map *map, struct bpf_insn *insn_buf); 86 86 u32 (*map_fd_sys_lookup_elem)(void *ptr); 87 87 void (*map_seq_show_elem)(struct bpf_map *map, void *key, 88 88 struct seq_file *m); ··· 293 293 ARG_PTR_TO_ALLOC_MEM_OR_NULL, /* pointer to dynamically allocated memory or NULL */ 294 294 ARG_CONST_ALLOC_SIZE_OR_ZERO, /* number of allocated bytes requested */ 295 295 ARG_PTR_TO_BTF_ID_SOCK_COMMON, /* pointer to in-kernel sock_common or bpf-mirrored bpf_sock */ 296 + ARG_PTR_TO_PERCPU_BTF_ID, /* pointer to in-kernel percpu type */ 296 297 __BPF_ARG_TYPE_MAX, 297 298 }; 298 299 ··· 308 307 RET_PTR_TO_SOCK_COMMON_OR_NULL, /* returns a pointer to a sock_common or NULL */ 309 308 RET_PTR_TO_ALLOC_MEM_OR_NULL, /* returns a pointer to dynamically allocated memory or NULL */ 310 309 RET_PTR_TO_BTF_ID_OR_NULL, /* returns a pointer to a btf_id or NULL */ 310 + RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL, /* returns a pointer to a valid memory or a btf_id or NULL */ 311 + RET_PTR_TO_MEM_OR_BTF_ID, /* returns a pointer to a valid memory or a btf_id */ 311 312 }; 312 313 313 314 /* eBPF function prototype used by verifier to allow BPF_CALLs from eBPF programs ··· 408 405 PTR_TO_RDONLY_BUF_OR_NULL, /* reg points to a readonly buffer or NULL */ 409 406 PTR_TO_RDWR_BUF, /* reg points to a read/write buffer */ 410 407 PTR_TO_RDWR_BUF_OR_NULL, /* reg points to a read/write buffer or NULL */ 408 + PTR_TO_PERCPU_BTF_ID, /* reg points to a percpu kernel variable */ 411 409 }; 412 410 413 411 /* The information passed from prog-specific *_is_valid_access ··· 1832 1828 extern const struct bpf_func_proto bpf_skc_to_udp6_sock_proto; 1833 1829 extern const struct bpf_func_proto bpf_copy_from_user_proto; 1834 1830 extern const struct bpf_func_proto 
bpf_snprintf_btf_proto; 1831 + extern const struct bpf_func_proto bpf_per_cpu_ptr_proto; 1832 + extern const struct bpf_func_proto bpf_this_cpu_ptr_proto; 1835 1833 1836 1834 const struct bpf_func_proto *bpf_tracing_func_proto( 1837 1835 enum bpf_func_id func_id, const struct bpf_prog *prog);
+7
include/linux/bpf_verifier.h
··· 308 308 u32 map_index; /* index into used_maps[] */ 309 309 u32 map_off; /* offset from value base address */ 310 310 }; 311 + struct { 312 + enum bpf_reg_type reg_type; /* type of pseudo_btf_id */ 313 + union { 314 + u32 btf_id; /* btf_id for struct typed var */ 315 + u32 mem_size; /* mem_size for non-struct typed var */ 316 + }; 317 + } btf_var; 311 318 }; 312 319 u64 map_key_state; /* constant (32 bit) key tracking for maps */ 313 320 int ctx_field_size; /* the ctx field size for load insn, maybe 0 */
+26
include/linux/btf.h
··· 110 110 i < btf_type_vlen(struct_type); \ 111 111 i++, member++) 112 112 113 + #define for_each_vsi(i, datasec_type, member) \ 114 + for (i = 0, member = btf_type_var_secinfo(datasec_type); \ 115 + i < btf_type_vlen(datasec_type); \ 116 + i++, member++) 117 + 113 118 static inline bool btf_type_is_ptr(const struct btf_type *t) 114 119 { 115 120 return BTF_INFO_KIND(t->info) == BTF_KIND_PTR; ··· 150 145 return BTF_INFO_KIND(t->info) == BTF_KIND_FUNC_PROTO; 151 146 } 152 147 148 + static inline bool btf_type_is_var(const struct btf_type *t) 149 + { 150 + return BTF_INFO_KIND(t->info) == BTF_KIND_VAR; 151 + } 152 + 153 + /* union is only a special case of struct: 154 + * all its offsetof(member) == 0 155 + */ 156 + static inline bool btf_type_is_struct(const struct btf_type *t) 157 + { 158 + u8 kind = BTF_INFO_KIND(t->info); 159 + 160 + return kind == BTF_KIND_STRUCT || kind == BTF_KIND_UNION; 161 + } 162 + 153 163 static inline u16 btf_type_vlen(const struct btf_type *t) 154 164 { 155 165 return BTF_INFO_VLEN(t->info); ··· 197 177 static inline const struct btf_member *btf_type_member(const struct btf_type *t) 198 178 { 199 179 return (const struct btf_member *)(t + 1); 180 + } 181 + 182 + static inline const struct btf_var_secinfo *btf_type_var_secinfo( 183 + const struct btf_type *t) 184 + { 185 + return (const struct btf_var_secinfo *)(t + 1); 200 186 } 201 187 202 188 #ifdef CONFIG_BPF_SYSCALL
+4
include/linux/netdevice.h
··· 1276 1276 * int (*ndo_tunnel_ctl)(struct net_device *dev, struct ip_tunnel_parm *p, 1277 1277 * int cmd); 1278 1278 * Add, change, delete or get information on an IPv4 tunnel. 1279 + * struct net_device *(*ndo_get_peer_dev)(struct net_device *dev); 1280 + * If a device is paired with a peer device, return the peer instance. 1281 + * The caller must be under RCU read context. 1279 1282 */ 1280 1283 struct net_device_ops { 1281 1284 int (*ndo_init)(struct net_device *dev); ··· 1486 1483 struct devlink_port * (*ndo_get_devlink_port)(struct net_device *dev); 1487 1484 int (*ndo_tunnel_ctl)(struct net_device *dev, 1488 1485 struct ip_tunnel_parm *p, int cmd); 1486 + struct net_device * (*ndo_get_peer_dev)(struct net_device *dev); 1489 1487 }; 1490 1488 1491 1489 /**
+2
include/linux/skmsg.h
··· 308 308 int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock); 309 309 void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock); 310 310 void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock); 311 + void sk_psock_start_verdict(struct sock *sk, struct sk_psock *psock); 312 + void sk_psock_stop_verdict(struct sock *sk, struct sk_psock *psock); 311 313 312 314 int sk_psock_msg_verdict(struct sock *sk, struct sk_psock *psock, 313 315 struct sk_msg *msg);
-33
include/net/tcp.h
··· 2228 2228 #endif /* CONFIG_NET_SOCK_MSG */ 2229 2229 2230 2230 #ifdef CONFIG_CGROUP_BPF 2231 - /* Copy the listen sk's HDR_OPT_CB flags to its child. 2232 - * 2233 - * During 3-Way-HandShake, the synack is usually sent from 2234 - * the listen sk with the HDR_OPT_CB flags set so that 2235 - * bpf-prog will be called to write the BPF hdr option. 2236 - * 2237 - * In fastopen, the child sk is used to send synack instead 2238 - * of the listen sk. Thus, inheriting the HDR_OPT_CB flags 2239 - * from the listen sk gives the bpf-prog a chance to write 2240 - * BPF hdr option in the synack pkt during fastopen. 2241 - * 2242 - * Both fastopen and non-fastopen child will inherit the 2243 - * HDR_OPT_CB flags to keep the bpf-prog having a consistent 2244 - * behavior when deciding to clear this cb flags (or not) 2245 - * during the PASSIVE_ESTABLISHED_CB. 2246 - * 2247 - * In the future, other cb flags could be inherited here also. 2248 - */ 2249 - static inline void bpf_skops_init_child(const struct sock *sk, 2250 - struct sock *child) 2251 - { 2252 - tcp_sk(child)->bpf_sock_ops_cb_flags = 2253 - tcp_sk(sk)->bpf_sock_ops_cb_flags & 2254 - (BPF_SOCK_OPS_PARSE_ALL_HDR_OPT_CB_FLAG | 2255 - BPF_SOCK_OPS_PARSE_UNKNOWN_HDR_OPT_CB_FLAG | 2256 - BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG); 2257 - } 2258 - 2259 2231 static inline void bpf_skops_init_skb(struct bpf_sock_ops_kern *skops, 2260 2232 struct sk_buff *skb, 2261 2233 unsigned int end_offset) ··· 2236 2264 skops->skb_data_end = skb->data + end_offset; 2237 2265 } 2238 2266 #else 2239 - static inline void bpf_skops_init_child(const struct sock *sk, 2240 - struct sock *child) 2241 - { 2242 - } 2243 - 2244 2267 static inline void bpf_skops_init_skb(struct bpf_sock_ops_kern *skops, 2245 2268 struct sk_buff *skb, 2246 2269 unsigned int end_offset)
+86 -13
include/uapi/linux/bpf.h
··· 356 356 #define BPF_F_SLEEPABLE (1U << 4) 357 357 358 358 /* When BPF ldimm64's insn[0].src_reg != 0 then this can have 359 - * two extensions: 359 + * the following extensions: 360 360 * 361 - * insn[0].src_reg: BPF_PSEUDO_MAP_FD BPF_PSEUDO_MAP_VALUE 362 - * insn[0].imm: map fd map fd 363 - * insn[1].imm: 0 offset into value 364 - * insn[0].off: 0 0 365 - * insn[1].off: 0 0 366 - * ldimm64 rewrite: address of map address of map[0]+offset 367 - * verifier type: CONST_PTR_TO_MAP PTR_TO_MAP_VALUE 361 + * insn[0].src_reg: BPF_PSEUDO_MAP_FD 362 + * insn[0].imm: map fd 363 + * insn[1].imm: 0 364 + * insn[0].off: 0 365 + * insn[1].off: 0 366 + * ldimm64 rewrite: address of map 367 + * verifier type: CONST_PTR_TO_MAP 368 368 */ 369 369 #define BPF_PSEUDO_MAP_FD 1 370 + /* insn[0].src_reg: BPF_PSEUDO_MAP_VALUE 371 + * insn[0].imm: map fd 372 + * insn[1].imm: offset into value 373 + * insn[0].off: 0 374 + * insn[1].off: 0 375 + * ldimm64 rewrite: address of map[0]+offset 376 + * verifier type: PTR_TO_MAP_VALUE 377 + */ 370 378 #define BPF_PSEUDO_MAP_VALUE 2 379 + /* insn[0].src_reg: BPF_PSEUDO_BTF_ID 380 + * insn[0].imm: kernel btd id of VAR 381 + * insn[1].imm: 0 382 + * insn[0].off: 0 383 + * insn[1].off: 0 384 + * ldimm64 rewrite: address of the kernel variable 385 + * verifier type: PTR_TO_BTF_ID or PTR_TO_MEM, depending on whether the var 386 + * is struct/union. 387 + */ 388 + #define BPF_PSEUDO_BTF_ID 3 371 389 372 390 /* when bpf_call->src_reg == BPF_PSEUDO_CALL, bpf_call->imm == pc-relative 373 391 * offset to another bpf function ··· 435 417 436 418 /* Share perf_event among processes */ 437 419 BPF_F_PRESERVE_ELEMS = (1U << 11), 420 + 421 + /* Create a map that is suitable to be an inner map with dynamic max entries */ 422 + BPF_F_INNER_MAP = (1U << 12), 438 423 }; 439 424 440 425 /* Flags for BPF_PROG_QUERY. 
*/ ··· 1701 1680 * **TCP_CONGESTION**, **TCP_BPF_IW**, 1702 1681 * **TCP_BPF_SNDCWND_CLAMP**, **TCP_SAVE_SYN**, 1703 1682 * **TCP_KEEPIDLE**, **TCP_KEEPINTVL**, **TCP_KEEPCNT**, 1704 - * **TCP_SYNCNT**, **TCP_USER_TIMEOUT**. 1683 + * **TCP_SYNCNT**, **TCP_USER_TIMEOUT**, **TCP_NOTSENT_LOWAT**. 1705 1684 * * **IPPROTO_IP**, which supports *optname* **IP_TOS**. 1706 1685 * * **IPPROTO_IPV6**, which supports *optname* **IPV6_TCLASS**. 1707 1686 * Return ··· 2256 2235 * Description 2257 2236 * This helper is used in programs implementing policies at the 2258 2237 * skb socket level. If the sk_buff *skb* is allowed to pass (i.e. 2259 - * if the verdeict eBPF program returns **SK_PASS**), redirect it 2238 + * if the verdict eBPF program returns **SK_PASS**), redirect it 2260 2239 * to the socket referenced by *map* (of type 2261 2240 * **BPF_MAP_TYPE_SOCKHASH**) using hash *key*. Both ingress and 2262 2241 * egress interfaces can be used for redirection. The ··· 3682 3661 * Redirect the packet to another net device of index *ifindex* 3683 3662 * and fill in L2 addresses from neighboring subsystem. This helper 3684 3663 * is somewhat similar to **bpf_redirect**\ (), except that it 3685 - * fills in e.g. MAC addresses based on the L3 information from 3686 - * the packet. This helper is supported for IPv4 and IPv6 protocols. 3664 + * populates L2 addresses as well, meaning, internally, the helper 3665 + * performs a FIB lookup based on the skb's networking header to 3666 + * get the address of the next hop and then relies on the neighbor 3667 + * lookup for the L2 address of the nexthop. 3668 + * 3687 3669 * The *flags* argument is reserved and must be 0. The helper is 3688 - * currently only supported for tc BPF program types. 3670 + * currently only supported for tc BPF program types, and enabled 3671 + * for IPv4 and IPv6 protocols. 3672 + * Return 3673 + * The helper returns **TC_ACT_REDIRECT** on success or 3674 + * **TC_ACT_SHOT** on error. 
3675 + * 3676 + * void *bpf_per_cpu_ptr(const void *percpu_ptr, u32 cpu) 3677 + * Description 3678 + * Take a pointer to a percpu ksym, *percpu_ptr*, and return a 3679 + * pointer to the percpu kernel variable on *cpu*. A ksym is an 3680 + * extern variable decorated with '__ksym'. For ksym, there is a 3681 + * global var (either static or global) defined of the same name 3682 + * in the kernel. The ksym is percpu if the global var is percpu. 3683 + * The returned pointer points to the global percpu var on *cpu*. 3684 + * 3685 + * bpf_per_cpu_ptr() has the same semantic as per_cpu_ptr() in the 3686 + * kernel, except that bpf_per_cpu_ptr() may return NULL. This 3687 + * happens if *cpu* is larger than nr_cpu_ids. The caller of 3688 + * bpf_per_cpu_ptr() must check the returned value. 3689 + * Return 3690 + * A pointer pointing to the kernel percpu variable on *cpu*, or 3691 + * NULL, if *cpu* is invalid. 3692 + * 3693 + * void *bpf_this_cpu_ptr(const void *percpu_ptr) 3694 + * Description 3695 + * Take a pointer to a percpu ksym, *percpu_ptr*, and return a 3696 + * pointer to the percpu kernel variable on this cpu. See the 3697 + * description of 'ksym' in **bpf_per_cpu_ptr**\ (). 3698 + * 3699 + * bpf_this_cpu_ptr() has the same semantic as this_cpu_ptr() in 3700 + * the kernel. Different from **bpf_per_cpu_ptr**\ (), it would 3701 + * never return NULL. 3702 + * Return 3703 + * A pointer pointing to the kernel percpu variable on this cpu. 3704 + * 3705 + * long bpf_redirect_peer(u32 ifindex, u64 flags) 3706 + * Description 3707 + * Redirect the packet to another net device of index *ifindex*. 3708 + * This helper is somewhat similar to **bpf_redirect**\ (), except 3709 + * that the redirection happens to the *ifindex*' peer device and 3710 + * the netns switch takes place from ingress to ingress without 3711 + * going through the CPU's backlog queue. 3712 + * 3713 + * The *flags* argument is reserved and must be 0. 
The helper is 3714 + * currently only supported for tc BPF program types at the ingress 3715 + * hook and for veth device types. The peer device must reside in a 3716 + * different network namespace. 3689 3717 * Return 3690 3718 * The helper returns **TC_ACT_REDIRECT** on success or 3691 3719 * **TC_ACT_SHOT** on error. ··· 3893 3823 FN(seq_printf_btf), \ 3894 3824 FN(skb_cgroup_classid), \ 3895 3825 FN(redirect_neigh), \ 3826 + FN(bpf_per_cpu_ptr), \ 3827 + FN(bpf_this_cpu_ptr), \ 3828 + FN(redirect_peer), \ 3896 3829 /* */ 3897 3830 3898 3831 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
+11 -6
kernel/bpf/arraymap.c
··· 16 16 17 17 #define ARRAY_CREATE_FLAG_MASK \ 18 18 (BPF_F_NUMA_NODE | BPF_F_MMAPABLE | BPF_F_ACCESS_MASK | \ 19 - BPF_F_PRESERVE_ELEMS) 19 + BPF_F_PRESERVE_ELEMS | BPF_F_INNER_MAP) 20 20 21 21 static void bpf_array_free_percpu(struct bpf_array *array) 22 22 { ··· 62 62 return -EINVAL; 63 63 64 64 if (attr->map_type != BPF_MAP_TYPE_ARRAY && 65 - attr->map_flags & BPF_F_MMAPABLE) 65 + attr->map_flags & (BPF_F_MMAPABLE | BPF_F_INNER_MAP)) 66 66 return -EINVAL; 67 67 68 68 if (attr->map_type != BPF_MAP_TYPE_PERF_EVENT_ARRAY && ··· 214 214 } 215 215 216 216 /* emit BPF instructions equivalent to C code of array_map_lookup_elem() */ 217 - static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) 217 + static int array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) 218 218 { 219 219 struct bpf_array *array = container_of(map, struct bpf_array, map); 220 220 struct bpf_insn *insn = insn_buf; ··· 222 222 const int ret = BPF_REG_0; 223 223 const int map_ptr = BPF_REG_1; 224 224 const int index = BPF_REG_2; 225 + 226 + if (map->map_flags & BPF_F_INNER_MAP) 227 + return -EOPNOTSUPP; 225 228 226 229 *insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value)); 227 230 *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0); ··· 499 496 static bool array_map_meta_equal(const struct bpf_map *meta0, 500 497 const struct bpf_map *meta1) 501 498 { 502 - return meta0->max_entries == meta1->max_entries && 503 - bpf_map_meta_equal(meta0, meta1); 499 + if (!bpf_map_meta_equal(meta0, meta1)) 500 + return false; 501 + return meta0->map_flags & BPF_F_INNER_MAP ? 
true : 502 + meta0->max_entries == meta1->max_entries; 504 503 } 505 504 506 505 struct bpf_iter_seq_array_map_info { ··· 1256 1251 return READ_ONCE(*inner_map); 1257 1252 } 1258 1253 1259 - static u32 array_of_map_gen_lookup(struct bpf_map *map, 1254 + static int array_of_map_gen_lookup(struct bpf_map *map, 1260 1255 struct bpf_insn *insn_buf) 1261 1256 { 1262 1257 struct bpf_array *array = container_of(map, struct bpf_array, map);
-25
kernel/bpf/btf.c
··· 188 188 i < btf_type_vlen(struct_type); \ 189 189 i++, member++) 190 190 191 - #define for_each_vsi(i, struct_type, member) \ 192 - for (i = 0, member = btf_type_var_secinfo(struct_type); \ 193 - i < btf_type_vlen(struct_type); \ 194 - i++, member++) 195 - 196 191 #define for_each_vsi_from(i, from, struct_type, member) \ 197 192 for (i = from, member = btf_type_var_secinfo(struct_type) + from; \ 198 193 i < btf_type_vlen(struct_type); \ ··· 435 440 return !t || btf_type_nosize(t); 436 441 } 437 442 438 - /* union is only a special case of struct: 439 - * all its offsetof(member) == 0 440 - */ 441 - static bool btf_type_is_struct(const struct btf_type *t) 442 - { 443 - u8 kind = BTF_INFO_KIND(t->info); 444 - 445 - return kind == BTF_KIND_STRUCT || kind == BTF_KIND_UNION; 446 - } 447 - 448 443 static bool __btf_type_is_struct(const struct btf_type *t) 449 444 { 450 445 return BTF_INFO_KIND(t->info) == BTF_KIND_STRUCT; ··· 443 458 static bool btf_type_is_array(const struct btf_type *t) 444 459 { 445 460 return BTF_INFO_KIND(t->info) == BTF_KIND_ARRAY; 446 - } 447 - 448 - static bool btf_type_is_var(const struct btf_type *t) 449 - { 450 - return BTF_INFO_KIND(t->info) == BTF_KIND_VAR; 451 461 } 452 462 453 463 static bool btf_type_is_datasec(const struct btf_type *t) ··· 591 611 static const struct btf_var *btf_type_var(const struct btf_type *t) 592 612 { 593 613 return (const struct btf_var *)(t + 1); 594 - } 595 - 596 - static const struct btf_var_secinfo *btf_type_var_secinfo(const struct btf_type *t) 597 - { 598 - return (const struct btf_var_secinfo *)(t + 1); 599 614 } 600 615 601 616 static const struct btf_kind_operations *btf_type_ops(const struct btf_type *t)
+3 -3
kernel/bpf/hashtab.c
··· 612 612 * bpf_prog 613 613 * __htab_map_lookup_elem 614 614 */ 615 - static u32 htab_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) 615 + static int htab_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) 616 616 { 617 617 struct bpf_insn *insn = insn_buf; 618 618 const int ret = BPF_REG_0; ··· 651 651 return __htab_lru_map_lookup_elem(map, key, false); 652 652 } 653 653 654 - static u32 htab_lru_map_gen_lookup(struct bpf_map *map, 654 + static int htab_lru_map_gen_lookup(struct bpf_map *map, 655 655 struct bpf_insn *insn_buf) 656 656 { 657 657 struct bpf_insn *insn = insn_buf; ··· 2070 2070 return READ_ONCE(*inner_map); 2071 2071 } 2072 2072 2073 - static u32 htab_of_map_gen_lookup(struct bpf_map *map, 2073 + static int htab_of_map_gen_lookup(struct bpf_map *map, 2074 2074 struct bpf_insn *insn_buf) 2075 2075 { 2076 2076 struct bpf_insn *insn = insn_buf;
+32
kernel/bpf/helpers.c
··· 623 623 .arg3_type = ARG_ANYTHING, 624 624 }; 625 625 626 + BPF_CALL_2(bpf_per_cpu_ptr, const void *, ptr, u32, cpu) 627 + { 628 + if (cpu >= nr_cpu_ids) 629 + return (unsigned long)NULL; 630 + 631 + return (unsigned long)per_cpu_ptr((const void __percpu *)ptr, cpu); 632 + } 633 + 634 + const struct bpf_func_proto bpf_per_cpu_ptr_proto = { 635 + .func = bpf_per_cpu_ptr, 636 + .gpl_only = false, 637 + .ret_type = RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL, 638 + .arg1_type = ARG_PTR_TO_PERCPU_BTF_ID, 639 + .arg2_type = ARG_ANYTHING, 640 + }; 641 + 642 + BPF_CALL_1(bpf_this_cpu_ptr, const void *, percpu_ptr) 643 + { 644 + return (unsigned long)this_cpu_ptr((const void __percpu *)percpu_ptr); 645 + } 646 + 647 + const struct bpf_func_proto bpf_this_cpu_ptr_proto = { 648 + .func = bpf_this_cpu_ptr, 649 + .gpl_only = false, 650 + .ret_type = RET_PTR_TO_MEM_OR_BTF_ID, 651 + .arg1_type = ARG_PTR_TO_PERCPU_BTF_ID, 652 + }; 653 + 626 654 const struct bpf_func_proto bpf_get_current_task_proto __weak; 627 655 const struct bpf_func_proto bpf_probe_read_user_proto __weak; 628 656 const struct bpf_func_proto bpf_probe_read_user_str_proto __weak; ··· 717 689 return &bpf_snprintf_btf_proto; 718 690 case BPF_FUNC_jiffies64: 719 691 return &bpf_jiffies64_proto; 692 + case BPF_FUNC_bpf_per_cpu_ptr: 693 + return &bpf_per_cpu_ptr_proto; 694 + case BPF_FUNC_bpf_this_cpu_ptr: 695 + return &bpf_this_cpu_ptr_proto; 720 696 default: 721 697 break; 722 698 }
+96 -5
kernel/bpf/percpu_freelist.c
··· 17 17 raw_spin_lock_init(&head->lock); 18 18 head->first = NULL; 19 19 } 20 + raw_spin_lock_init(&s->extralist.lock); 21 + s->extralist.first = NULL; 20 22 return 0; 21 23 } 22 24 ··· 42 40 raw_spin_unlock(&head->lock); 43 41 } 44 42 43 + static inline bool pcpu_freelist_try_push_extra(struct pcpu_freelist *s, 44 + struct pcpu_freelist_node *node) 45 + { 46 + if (!raw_spin_trylock(&s->extralist.lock)) 47 + return false; 48 + 49 + pcpu_freelist_push_node(&s->extralist, node); 50 + raw_spin_unlock(&s->extralist.lock); 51 + return true; 52 + } 53 + 54 + static inline void ___pcpu_freelist_push_nmi(struct pcpu_freelist *s, 55 + struct pcpu_freelist_node *node) 56 + { 57 + int cpu, orig_cpu; 58 + 59 + orig_cpu = cpu = raw_smp_processor_id(); 60 + while (1) { 61 + struct pcpu_freelist_head *head; 62 + 63 + head = per_cpu_ptr(s->freelist, cpu); 64 + if (raw_spin_trylock(&head->lock)) { 65 + pcpu_freelist_push_node(head, node); 66 + raw_spin_unlock(&head->lock); 67 + return; 68 + } 69 + cpu = cpumask_next(cpu, cpu_possible_mask); 70 + if (cpu >= nr_cpu_ids) 71 + cpu = 0; 72 + 73 + /* cannot lock any per cpu lock, try extralist */ 74 + if (cpu == orig_cpu && 75 + pcpu_freelist_try_push_extra(s, node)) 76 + return; 77 + } 78 + } 79 + 45 80 void __pcpu_freelist_push(struct pcpu_freelist *s, 46 81 struct pcpu_freelist_node *node) 47 82 { 48 - struct pcpu_freelist_head *head = this_cpu_ptr(s->freelist); 49 - 50 - ___pcpu_freelist_push(head, node); 83 + if (in_nmi()) 84 + ___pcpu_freelist_push_nmi(s, node); 85 + else 86 + ___pcpu_freelist_push(this_cpu_ptr(s->freelist), node); 51 87 } 52 88 53 89 void pcpu_freelist_push(struct pcpu_freelist *s, ··· 121 81 } 122 82 } 123 83 124 - struct pcpu_freelist_node *__pcpu_freelist_pop(struct pcpu_freelist *s) 84 + static struct pcpu_freelist_node *___pcpu_freelist_pop(struct pcpu_freelist *s) 125 85 { 126 86 struct pcpu_freelist_head *head; 127 87 struct pcpu_freelist_node *node; ··· 142 102 if (cpu >= nr_cpu_ids) 143 103 cpu = 0; 144 
104 if (cpu == orig_cpu) 145 - return NULL; 105 + break; 146 106 } 107 + 108 + /* per cpu lists are all empty, try extralist */ 109 + raw_spin_lock(&s->extralist.lock); 110 + node = s->extralist.first; 111 + if (node) 112 + s->extralist.first = node->next; 113 + raw_spin_unlock(&s->extralist.lock); 114 + return node; 115 + } 116 + 117 + static struct pcpu_freelist_node * 118 + ___pcpu_freelist_pop_nmi(struct pcpu_freelist *s) 119 + { 120 + struct pcpu_freelist_head *head; 121 + struct pcpu_freelist_node *node; 122 + int orig_cpu, cpu; 123 + 124 + orig_cpu = cpu = raw_smp_processor_id(); 125 + while (1) { 126 + head = per_cpu_ptr(s->freelist, cpu); 127 + if (raw_spin_trylock(&head->lock)) { 128 + node = head->first; 129 + if (node) { 130 + head->first = node->next; 131 + raw_spin_unlock(&head->lock); 132 + return node; 133 + } 134 + raw_spin_unlock(&head->lock); 135 + } 136 + cpu = cpumask_next(cpu, cpu_possible_mask); 137 + if (cpu >= nr_cpu_ids) 138 + cpu = 0; 139 + if (cpu == orig_cpu) 140 + break; 141 + } 142 + 143 + /* cannot pop from per cpu lists, try extralist */ 144 + if (!raw_spin_trylock(&s->extralist.lock)) 145 + return NULL; 146 + node = s->extralist.first; 147 + if (node) 148 + s->extralist.first = node->next; 149 + raw_spin_unlock(&s->extralist.lock); 150 + return node; 151 + } 152 + 153 + struct pcpu_freelist_node *__pcpu_freelist_pop(struct pcpu_freelist *s) 154 + { 155 + if (in_nmi()) 156 + return ___pcpu_freelist_pop_nmi(s); 157 + return ___pcpu_freelist_pop(s); 147 158 } 148 159 149 160 struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *s)
+1
kernel/bpf/percpu_freelist.h
··· 13 13 14 14 struct pcpu_freelist { 15 15 struct pcpu_freelist_head __percpu *freelist; 16 + struct pcpu_freelist_head extralist; 16 17 }; 17 18 18 19 struct pcpu_freelist_node {
+3 -1
kernel/bpf/syscall.c
··· 4323 4323 used_maps_old = prog->aux->used_maps; 4324 4324 4325 4325 for (i = 0; i < prog->aux->used_map_cnt; i++) 4326 - if (used_maps_old[i] == map) 4326 + if (used_maps_old[i] == map) { 4327 + bpf_map_put(map); 4327 4328 goto out_unlock; 4329 + } 4328 4330 4329 4331 used_maps_new = kmalloc_array(prog->aux->used_map_cnt + 1, 4330 4332 sizeof(used_maps_new[0]),
+251 -19
kernel/bpf/verifier.c
··· 238 238 u64 msize_max_value; 239 239 int ref_obj_id; 240 240 int func_id; 241 + u32 btf_id; 242 + u32 ret_btf_id; 241 243 }; 242 244 243 245 struct btf *btf_vmlinux; ··· 519 517 [PTR_TO_XDP_SOCK] = "xdp_sock", 520 518 [PTR_TO_BTF_ID] = "ptr_", 521 519 [PTR_TO_BTF_ID_OR_NULL] = "ptr_or_null_", 520 + [PTR_TO_PERCPU_BTF_ID] = "percpu_ptr_", 522 521 [PTR_TO_MEM] = "mem", 523 522 [PTR_TO_MEM_OR_NULL] = "mem_or_null", 524 523 [PTR_TO_RDONLY_BUF] = "rdonly_buf", ··· 586 583 /* reg->off should be 0 for SCALAR_VALUE */ 587 584 verbose(env, "%lld", reg->var_off.value + reg->off); 588 585 } else { 589 - if (t == PTR_TO_BTF_ID || t == PTR_TO_BTF_ID_OR_NULL) 586 + if (t == PTR_TO_BTF_ID || 587 + t == PTR_TO_BTF_ID_OR_NULL || 588 + t == PTR_TO_PERCPU_BTF_ID) 590 589 verbose(env, "%s", kernel_type_name(reg->btf_id)); 591 590 verbose(env, "(id=%d", reg->id); 592 591 if (reg_type_may_be_refcounted_or_null(t)) ··· 2209 2204 case PTR_TO_RDONLY_BUF_OR_NULL: 2210 2205 case PTR_TO_RDWR_BUF: 2211 2206 case PTR_TO_RDWR_BUF_OR_NULL: 2207 + case PTR_TO_PERCPU_BTF_ID: 2212 2208 return true; 2213 2209 default: 2214 2210 return false; ··· 2225 2219 static bool register_is_const(struct bpf_reg_state *reg) 2226 2220 { 2227 2221 return reg->type == SCALAR_VALUE && tnum_is_const(reg->var_off); 2222 + } 2223 + 2224 + static bool __is_scalar_unbounded(struct bpf_reg_state *reg) 2225 + { 2226 + return tnum_is_unknown(reg->var_off) && 2227 + reg->smin_value == S64_MIN && reg->smax_value == S64_MAX && 2228 + reg->umin_value == 0 && reg->umax_value == U64_MAX && 2229 + reg->s32_min_value == S32_MIN && reg->s32_max_value == S32_MAX && 2230 + reg->u32_min_value == 0 && reg->u32_max_value == U32_MAX; 2231 + } 2232 + 2233 + static bool register_is_bounded(struct bpf_reg_state *reg) 2234 + { 2235 + return reg->type == SCALAR_VALUE && !__is_scalar_unbounded(reg); 2228 2236 } 2229 2237 2230 2238 static bool __is_pointer_value(bool allow_ptr_leaks, ··· 2292 2272 if (value_regno >= 0) 2293 2273 reg = 
&cur->regs[value_regno]; 2294 2274 2295 - if (reg && size == BPF_REG_SIZE && register_is_const(reg) && 2275 + if (reg && size == BPF_REG_SIZE && register_is_bounded(reg) && 2296 2276 !register_is_null(reg) && env->bpf_capable) { 2297 2277 if (dst_reg != BPF_REG_FP) { 2298 2278 /* The backtracking logic can only recognize explicit ··· 2687 2667 case BPF_PROG_TYPE_CGROUP_SKB: 2688 2668 if (t == BPF_WRITE) 2689 2669 return false; 2690 - /* fallthrough */ 2670 + fallthrough; 2691 2671 2692 2672 /* Program types with direct read + write access go here! */ 2693 2673 case BPF_PROG_TYPE_SCHED_CLS: ··· 3998 3978 }, 3999 3979 }; 4000 3980 3981 + #ifdef CONFIG_NET 4001 3982 static const struct bpf_reg_types btf_id_sock_common_types = { 4002 3983 .types = { 4003 3984 PTR_TO_SOCK_COMMON, ··· 4009 3988 }, 4010 3989 .btf_id = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON], 4011 3990 }; 3991 + #endif 4012 3992 4013 3993 static const struct bpf_reg_types mem_types = { 4014 3994 .types = { ··· 4039 4017 static const struct bpf_reg_types const_map_ptr_types = { .types = { CONST_PTR_TO_MAP } }; 4040 4018 static const struct bpf_reg_types btf_ptr_types = { .types = { PTR_TO_BTF_ID } }; 4041 4019 static const struct bpf_reg_types spin_lock_types = { .types = { PTR_TO_MAP_VALUE } }; 4020 + static const struct bpf_reg_types percpu_btf_ptr_types = { .types = { PTR_TO_PERCPU_BTF_ID } }; 4042 4021 4043 4022 static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = { 4044 4023 [ARG_PTR_TO_MAP_KEY] = &map_key_value_types, ··· 4053 4030 [ARG_PTR_TO_CTX] = &context_types, 4054 4031 [ARG_PTR_TO_CTX_OR_NULL] = &context_types, 4055 4032 [ARG_PTR_TO_SOCK_COMMON] = &sock_types, 4033 + #ifdef CONFIG_NET 4056 4034 [ARG_PTR_TO_BTF_ID_SOCK_COMMON] = &btf_id_sock_common_types, 4035 + #endif 4057 4036 [ARG_PTR_TO_SOCKET] = &fullsock_types, 4058 4037 [ARG_PTR_TO_SOCKET_OR_NULL] = &fullsock_types, 4059 4038 [ARG_PTR_TO_BTF_ID] = &btf_ptr_types, ··· 4067 4042 [ARG_PTR_TO_ALLOC_MEM_OR_NULL] = 
&alloc_mem_types, 4068 4043 [ARG_PTR_TO_INT] = &int_ptr_types, 4069 4044 [ARG_PTR_TO_LONG] = &int_ptr_types, 4045 + [ARG_PTR_TO_PERCPU_BTF_ID] = &percpu_btf_ptr_types, 4070 4046 }; 4071 4047 4072 4048 static int check_reg_type(struct bpf_verifier_env *env, u32 regno, ··· 4231 4205 err = check_helper_mem_access(env, regno, 4232 4206 meta->map_ptr->value_size, false, 4233 4207 meta); 4208 + } else if (arg_type == ARG_PTR_TO_PERCPU_BTF_ID) { 4209 + if (!reg->btf_id) { 4210 + verbose(env, "Helper has invalid btf_id in R%d\n", regno); 4211 + return -EACCES; 4212 + } 4213 + meta->ret_btf_id = reg->btf_id; 4234 4214 } else if (arg_type == ARG_PTR_TO_SPIN_LOCK) { 4235 4215 if (meta->func_id == BPF_FUNC_spin_lock) { 4236 4216 if (process_spin_lock(env, regno, true)) ··· 5146 5114 regs[BPF_REG_0].type = PTR_TO_MEM_OR_NULL; 5147 5115 regs[BPF_REG_0].id = ++env->id_gen; 5148 5116 regs[BPF_REG_0].mem_size = meta.mem_size; 5117 + } else if (fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL || 5118 + fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID) { 5119 + const struct btf_type *t; 5120 + 5121 + mark_reg_known_zero(env, regs, BPF_REG_0); 5122 + t = btf_type_skip_modifiers(btf_vmlinux, meta.ret_btf_id, NULL); 5123 + if (!btf_type_is_struct(t)) { 5124 + u32 tsize; 5125 + const struct btf_type *ret; 5126 + const char *tname; 5127 + 5128 + /* resolve the type size of ksym. */ 5129 + ret = btf_resolve_size(btf_vmlinux, t, &tsize); 5130 + if (IS_ERR(ret)) { 5131 + tname = btf_name_by_offset(btf_vmlinux, t->name_off); 5132 + verbose(env, "unable to resolve the size of type '%s': %ld\n", 5133 + tname, PTR_ERR(ret)); 5134 + return -EINVAL; 5135 + } 5136 + regs[BPF_REG_0].type = 5137 + fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID ? 5138 + PTR_TO_MEM : PTR_TO_MEM_OR_NULL; 5139 + regs[BPF_REG_0].mem_size = tsize; 5140 + } else { 5141 + regs[BPF_REG_0].type = 5142 + fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID ? 
5143 + PTR_TO_BTF_ID : PTR_TO_BTF_ID_OR_NULL; 5144 + regs[BPF_REG_0].btf_id = meta.ret_btf_id; 5145 + } 5149 5146 } else if (fn->ret_type == RET_PTR_TO_BTF_ID_OR_NULL) { 5150 5147 int ret_btf_id; 5151 5148 ··· 5493 5432 /* smin_val represents the known value */ 5494 5433 if (known && smin_val == 0 && opcode == BPF_ADD) 5495 5434 break; 5496 - /* fall-through */ 5435 + fallthrough; 5497 5436 case PTR_TO_PACKET_END: 5498 5437 case PTR_TO_SOCKET: 5499 5438 case PTR_TO_SOCKET_OR_NULL: ··· 6450 6389 src_reg = NULL; 6451 6390 if (dst_reg->type != SCALAR_VALUE) 6452 6391 ptr_reg = dst_reg; 6392 + else 6393 + /* Make sure ID is cleared otherwise dst_reg min/max could be 6394 + * incorrectly propagated into other registers by find_equal_scalars() 6395 + */ 6396 + dst_reg->id = 0; 6453 6397 if (BPF_SRC(insn->code) == BPF_X) { 6454 6398 src_reg = &regs[insn->src_reg]; 6455 6399 if (src_reg->type != SCALAR_VALUE) { ··· 6588 6522 /* case: R1 = R2 6589 6523 * copy register state to dest reg 6590 6524 */ 6525 + if (src_reg->type == SCALAR_VALUE && !src_reg->id) 6526 + /* Assign src and dst registers the same ID 6527 + * that will be used by find_equal_scalars() 6528 + * to propagate min/max range. 
6529 + */ 6530 + src_reg->id = ++env->id_gen; 6591 6531 *dst_reg = *src_reg; 6592 6532 dst_reg->live |= REG_LIVE_WRITTEN; 6593 6533 dst_reg->subreg_def = DEF_NOT_SUBREG; ··· 6606 6534 return -EACCES; 6607 6535 } else if (src_reg->type == SCALAR_VALUE) { 6608 6536 *dst_reg = *src_reg; 6537 + /* Make sure ID is cleared otherwise 6538 + * dst_reg min/max could be incorrectly 6539 + * propagated into src_reg by find_equal_scalars() 6540 + */ 6541 + dst_reg->id = 0; 6609 6542 dst_reg->live |= REG_LIVE_WRITTEN; 6610 6543 dst_reg->subreg_def = env->insn_idx + 1; 6611 6544 } else { ··· 7399 7322 return true; 7400 7323 } 7401 7324 7325 + static void find_equal_scalars(struct bpf_verifier_state *vstate, 7326 + struct bpf_reg_state *known_reg) 7327 + { 7328 + struct bpf_func_state *state; 7329 + struct bpf_reg_state *reg; 7330 + int i, j; 7331 + 7332 + for (i = 0; i <= vstate->curframe; i++) { 7333 + state = vstate->frame[i]; 7334 + for (j = 0; j < MAX_BPF_REG; j++) { 7335 + reg = &state->regs[j]; 7336 + if (reg->type == SCALAR_VALUE && reg->id == known_reg->id) 7337 + *reg = *known_reg; 7338 + } 7339 + 7340 + bpf_for_each_spilled_reg(j, state, reg) { 7341 + if (!reg) 7342 + continue; 7343 + if (reg->type == SCALAR_VALUE && reg->id == known_reg->id) 7344 + *reg = *known_reg; 7345 + } 7346 + } 7347 + } 7348 + 7402 7349 static int check_cond_jmp_op(struct bpf_verifier_env *env, 7403 7350 struct bpf_insn *insn, int *insn_idx) 7404 7351 { ··· 7551 7450 reg_combine_min_max(&other_branch_regs[insn->src_reg], 7552 7451 &other_branch_regs[insn->dst_reg], 7553 7452 src_reg, dst_reg, opcode); 7453 + if (src_reg->id) { 7454 + find_equal_scalars(this_branch, src_reg); 7455 + find_equal_scalars(other_branch, &other_branch_regs[insn->src_reg]); 7456 + } 7457 + 7554 7458 } 7555 7459 } else if (dst_reg->type == SCALAR_VALUE) { 7556 7460 reg_set_min_max(&other_branch_regs[insn->dst_reg], 7557 7461 dst_reg, insn->imm, (u32)insn->imm, 7558 7462 opcode, is_jmp32); 7463 + } 7464 + 7465 + if 
(dst_reg->type == SCALAR_VALUE && dst_reg->id) { 7466 + find_equal_scalars(this_branch, dst_reg); 7467 + find_equal_scalars(other_branch, &other_branch_regs[insn->dst_reg]); 7559 7468 } 7560 7469 7561 7470 /* detect if R == 0 where R is returned from bpf_map_lookup_elem(). ··· 7599 7488 { 7600 7489 struct bpf_insn_aux_data *aux = cur_aux(env); 7601 7490 struct bpf_reg_state *regs = cur_regs(env); 7491 + struct bpf_reg_state *dst_reg; 7602 7492 struct bpf_map *map; 7603 7493 int err; 7604 7494 ··· 7616 7504 if (err) 7617 7505 return err; 7618 7506 7507 + dst_reg = &regs[insn->dst_reg]; 7619 7508 if (insn->src_reg == 0) { 7620 7509 u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm; 7621 7510 7622 - regs[insn->dst_reg].type = SCALAR_VALUE; 7511 + dst_reg->type = SCALAR_VALUE; 7623 7512 __mark_reg_known(&regs[insn->dst_reg], imm); 7513 + return 0; 7514 + } 7515 + 7516 + if (insn->src_reg == BPF_PSEUDO_BTF_ID) { 7517 + mark_reg_known_zero(env, regs, insn->dst_reg); 7518 + 7519 + dst_reg->type = aux->btf_var.reg_type; 7520 + switch (dst_reg->type) { 7521 + case PTR_TO_MEM: 7522 + dst_reg->mem_size = aux->btf_var.mem_size; 7523 + break; 7524 + case PTR_TO_BTF_ID: 7525 + case PTR_TO_PERCPU_BTF_ID: 7526 + dst_reg->btf_id = aux->btf_var.btf_id; 7527 + break; 7528 + default: 7529 + verbose(env, "bpf verifier is misconfigured\n"); 7530 + return -EFAULT; 7531 + } 7624 7532 return 0; 7625 7533 } 7626 7534 7627 7535 map = env->used_maps[aux->map_index]; 7628 7536 mark_reg_known_zero(env, regs, insn->dst_reg); 7629 - regs[insn->dst_reg].map_ptr = map; 7537 + dst_reg->map_ptr = map; 7630 7538 7631 7539 if (insn->src_reg == BPF_PSEUDO_MAP_VALUE) { 7632 - regs[insn->dst_reg].type = PTR_TO_MAP_VALUE; 7633 - regs[insn->dst_reg].off = aux->map_off; 7540 + dst_reg->type = PTR_TO_MAP_VALUE; 7541 + dst_reg->off = aux->map_off; 7634 7542 if (map_value_has_spin_lock(map)) 7635 - regs[insn->dst_reg].id = ++env->id_gen; 7543 + dst_reg->id = ++env->id_gen; 7636 7544 } else if 
(insn->src_reg == BPF_PSEUDO_MAP_FD) { 7637 - regs[insn->dst_reg].type = CONST_PTR_TO_MAP; 7545 + dst_reg->type = CONST_PTR_TO_MAP; 7638 7546 } else { 7639 7547 verbose(env, "bpf verifier is misconfigured\n"); 7640 7548 return -EINVAL; ··· 9556 9424 return 0; 9557 9425 } 9558 9426 9427 + /* replace pseudo btf_id with kernel symbol address */ 9428 + static int check_pseudo_btf_id(struct bpf_verifier_env *env, 9429 + struct bpf_insn *insn, 9430 + struct bpf_insn_aux_data *aux) 9431 + { 9432 + u32 datasec_id, type, id = insn->imm; 9433 + const struct btf_var_secinfo *vsi; 9434 + const struct btf_type *datasec; 9435 + const struct btf_type *t; 9436 + const char *sym_name; 9437 + bool percpu = false; 9438 + u64 addr; 9439 + int i; 9440 + 9441 + if (!btf_vmlinux) { 9442 + verbose(env, "kernel is missing BTF, make sure CONFIG_DEBUG_INFO_BTF=y is specified in Kconfig.\n"); 9443 + return -EINVAL; 9444 + } 9445 + 9446 + if (insn[1].imm != 0) { 9447 + verbose(env, "reserved field (insn[1].imm) is used in pseudo_btf_id ldimm64 insn.\n"); 9448 + return -EINVAL; 9449 + } 9450 + 9451 + t = btf_type_by_id(btf_vmlinux, id); 9452 + if (!t) { 9453 + verbose(env, "ldimm64 insn specifies invalid btf_id %d.\n", id); 9454 + return -ENOENT; 9455 + } 9456 + 9457 + if (!btf_type_is_var(t)) { 9458 + verbose(env, "pseudo btf_id %d in ldimm64 isn't KIND_VAR.\n", 9459 + id); 9460 + return -EINVAL; 9461 + } 9462 + 9463 + sym_name = btf_name_by_offset(btf_vmlinux, t->name_off); 9464 + addr = kallsyms_lookup_name(sym_name); 9465 + if (!addr) { 9466 + verbose(env, "ldimm64 failed to find the address for kernel symbol '%s'.\n", 9467 + sym_name); 9468 + return -ENOENT; 9469 + } 9470 + 9471 + datasec_id = btf_find_by_name_kind(btf_vmlinux, ".data..percpu", 9472 + BTF_KIND_DATASEC); 9473 + if (datasec_id > 0) { 9474 + datasec = btf_type_by_id(btf_vmlinux, datasec_id); 9475 + for_each_vsi(i, datasec, vsi) { 9476 + if (vsi->type == id) { 9477 + percpu = true; 9478 + break; 9479 + } 9480 + } 9481 + } 9482 
+ 9483 + insn[0].imm = (u32)addr; 9484 + insn[1].imm = addr >> 32; 9485 + 9486 + type = t->type; 9487 + t = btf_type_skip_modifiers(btf_vmlinux, type, NULL); 9488 + if (percpu) { 9489 + aux->btf_var.reg_type = PTR_TO_PERCPU_BTF_ID; 9490 + aux->btf_var.btf_id = type; 9491 + } else if (!btf_type_is_struct(t)) { 9492 + const struct btf_type *ret; 9493 + const char *tname; 9494 + u32 tsize; 9495 + 9496 + /* resolve the type size of ksym. */ 9497 + ret = btf_resolve_size(btf_vmlinux, t, &tsize); 9498 + if (IS_ERR(ret)) { 9499 + tname = btf_name_by_offset(btf_vmlinux, t->name_off); 9500 + verbose(env, "ldimm64 unable to resolve the size of type '%s': %ld\n", 9501 + tname, PTR_ERR(ret)); 9502 + return -EINVAL; 9503 + } 9504 + aux->btf_var.reg_type = PTR_TO_MEM; 9505 + aux->btf_var.mem_size = tsize; 9506 + } else { 9507 + aux->btf_var.reg_type = PTR_TO_BTF_ID; 9508 + aux->btf_var.btf_id = type; 9509 + } 9510 + return 0; 9511 + } 9512 + 9559 9513 static int check_map_prealloc(struct bpf_map *map) 9560 9514 { 9561 9515 return (map->map_type != BPF_MAP_TYPE_HASH && ··· 9752 9534 map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE); 9753 9535 } 9754 9536 9755 - /* look for pseudo eBPF instructions that access map FDs and 9756 - * replace them with actual map pointers 9537 + /* find and rewrite pseudo imm in ld_imm64 instructions: 9538 + * 9539 + * 1. if it accesses map FD, replace it with actual map pointer. 9540 + * 2. if it accesses btf_id of a VAR, replace it with pointer to the var. 9541 + * 9542 + * NOTE: btf_vmlinux is required for converting pseudo btf_id. 
9757 9543 */ 9758 - static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) 9544 + static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env) 9759 9545 { 9760 9546 struct bpf_insn *insn = env->prog->insnsi; 9761 9547 int insn_cnt = env->prog->len; ··· 9799 9577 if (insn[0].src_reg == 0) 9800 9578 /* valid generic load 64-bit imm */ 9801 9579 goto next_insn; 9580 + 9581 + if (insn[0].src_reg == BPF_PSEUDO_BTF_ID) { 9582 + aux = &env->insn_aux_data[i]; 9583 + err = check_pseudo_btf_id(env, insn, aux); 9584 + if (err) 9585 + return err; 9586 + goto next_insn; 9587 + } 9802 9588 9803 9589 /* In final convert_pseudo_ld_imm64() step, this is 9804 9590 * converted into regular 64-bit imm load insn. ··· 11049 10819 if (insn->imm == BPF_FUNC_map_lookup_elem && 11050 10820 ops->map_gen_lookup) { 11051 10821 cnt = ops->map_gen_lookup(map_ptr, insn_buf); 11052 - if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) { 10822 + if (cnt == -EOPNOTSUPP) 10823 + goto patch_map_ops_generic; 10824 + if (cnt <= 0 || cnt >= ARRAY_SIZE(insn_buf)) { 11053 10825 verbose(env, "bpf verifier is misconfigured\n"); 11054 10826 return -EINVAL; 11055 10827 } ··· 11081 10849 (int (*)(struct bpf_map *map, void *value))NULL)); 11082 10850 BUILD_BUG_ON(!__same_type(ops->map_peek_elem, 11083 10851 (int (*)(struct bpf_map *map, void *value))NULL)); 11084 - 10852 + patch_map_ops_generic: 11085 10853 switch (insn->imm) { 11086 10854 case BPF_FUNC_map_lookup_elem: 11087 10855 insn->imm = BPF_CAST_CALL(ops->map_lookup_elem) - ··· 11865 11633 if (is_priv) 11866 11634 env->test_state_freq = attr->prog_flags & BPF_F_TEST_STATE_FREQ; 11867 11635 11868 - ret = replace_map_fd_with_map_ptr(env); 11869 - if (ret < 0) 11870 - goto skip_full_check; 11871 - 11872 11636 if (bpf_prog_is_dev_bound(env->prog->aux)) { 11873 11637 ret = bpf_prog_offload_verifier_prep(env->prog); 11874 11638 if (ret) ··· 11888 11660 11889 11661 ret = check_attach_btf_id(env); 11890 11662 if (ret) 11663 + goto skip_full_check; 11664 
+ 11665 + ret = resolve_pseudo_ldimm64(env); 11666 + if (ret < 0) 11891 11667 goto skip_full_check; 11892 11668 11893 11669 ret = check_cfg(env);
+6
kernel/trace/bpf_trace.c
··· 1327 1327 return prog->aux->sleepable ? &bpf_copy_from_user_proto : NULL; 1328 1328 case BPF_FUNC_snprintf_btf: 1329 1329 return &bpf_snprintf_btf_proto; 1330 + case BPF_FUNC_bpf_per_cpu_ptr: 1331 + return &bpf_per_cpu_ptr_proto; 1332 + case BPF_FUNC_bpf_this_cpu_ptr: 1333 + return &bpf_this_cpu_ptr_proto; 1330 1334 default: 1331 1335 return NULL; 1332 1336 } ··· 1780 1776 }; 1781 1777 1782 1778 const struct bpf_prog_ops raw_tracepoint_prog_ops = { 1779 + #ifdef CONFIG_NET 1783 1780 .test_run = bpf_prog_test_run_raw_tp, 1781 + #endif 1784 1782 }; 1785 1783 1786 1784 const struct bpf_verifier_ops tracing_verifier_ops = {
+12 -3
net/core/dev.c
··· 4930 4930 4931 4931 static inline struct sk_buff * 4932 4932 sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, 4933 - struct net_device *orig_dev) 4933 + struct net_device *orig_dev, bool *another) 4934 4934 { 4935 4935 #ifdef CONFIG_NET_CLS_ACT 4936 4936 struct mini_Qdisc *miniq = rcu_dereference_bh(skb->dev->miniq_ingress); ··· 4974 4974 * redirecting to another netdev 4975 4975 */ 4976 4976 __skb_push(skb, skb->mac_len); 4977 - skb_do_redirect(skb); 4977 + if (skb_do_redirect(skb) == -EAGAIN) { 4978 + __skb_pull(skb, skb->mac_len); 4979 + *another = true; 4980 + break; 4981 + } 4978 4982 return NULL; 4979 4983 case TC_ACT_CONSUMED: 4980 4984 return NULL; ··· 5167 5163 skip_taps: 5168 5164 #ifdef CONFIG_NET_INGRESS 5169 5165 if (static_branch_unlikely(&ingress_needed_key)) { 5170 - skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev); 5166 + bool another = false; 5167 + 5168 + skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev, 5169 + &another); 5170 + if (another) 5171 + goto another_round; 5171 5172 if (!skb) 5172 5173 goto out; 5173 5174
+99 -8
net/core/filter.c
··· 76 76 #include <net/bpf_sk_storage.h> 77 77 #include <net/transp_v6.h> 78 78 #include <linux/btf_ids.h> 79 + #include <net/tls.h> 79 80 80 81 static const struct bpf_func_proto * 81 82 bpf_sk_base_func_proto(enum bpf_func_id func_id); ··· 2380 2379 2381 2380 /* Internal, non-exposed redirect flags. */ 2382 2381 enum { 2383 - BPF_F_NEIGH = (1ULL << 1), 2384 - #define BPF_F_REDIRECT_INTERNAL (BPF_F_NEIGH) 2382 + BPF_F_NEIGH = (1ULL << 1), 2383 + BPF_F_PEER = (1ULL << 2), 2384 + #define BPF_F_REDIRECT_INTERNAL (BPF_F_NEIGH | BPF_F_PEER) 2385 2385 }; 2386 2386 2387 2387 BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags) ··· 2431 2429 int skb_do_redirect(struct sk_buff *skb) 2432 2430 { 2433 2431 struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); 2432 + struct net *net = dev_net(skb->dev); 2434 2433 struct net_device *dev; 2435 2434 u32 flags = ri->flags; 2436 2435 2437 - dev = dev_get_by_index_rcu(dev_net(skb->dev), ri->tgt_index); 2436 + dev = dev_get_by_index_rcu(net, ri->tgt_index); 2438 2437 ri->tgt_index = 0; 2439 - if (unlikely(!dev)) { 2440 - kfree_skb(skb); 2441 - return -EINVAL; 2442 - } 2438 + ri->flags = 0; 2439 + if (unlikely(!dev)) 2440 + goto out_drop; 2441 + if (flags & BPF_F_PEER) { 2442 + const struct net_device_ops *ops = dev->netdev_ops; 2443 2443 2444 + if (unlikely(!ops->ndo_get_peer_dev || 2445 + !skb_at_tc_ingress(skb))) 2446 + goto out_drop; 2447 + dev = ops->ndo_get_peer_dev(dev); 2448 + if (unlikely(!dev || 2449 + !is_skb_forwardable(dev, skb) || 2450 + net_eq(net, dev_net(dev)))) 2451 + goto out_drop; 2452 + skb->dev = dev; 2453 + return -EAGAIN; 2454 + } 2444 2455 return flags & BPF_F_NEIGH ? 
2445 2456 __bpf_redirect_neigh(skb, dev) : 2446 2457 __bpf_redirect(skb, dev, flags); 2458 + out_drop: 2459 + kfree_skb(skb); 2460 + return -EINVAL; 2447 2461 } 2448 2462 2449 2463 BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags) ··· 2477 2459 2478 2460 static const struct bpf_func_proto bpf_redirect_proto = { 2479 2461 .func = bpf_redirect, 2462 + .gpl_only = false, 2463 + .ret_type = RET_INTEGER, 2464 + .arg1_type = ARG_ANYTHING, 2465 + .arg2_type = ARG_ANYTHING, 2466 + }; 2467 + 2468 + BPF_CALL_2(bpf_redirect_peer, u32, ifindex, u64, flags) 2469 + { 2470 + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); 2471 + 2472 + if (unlikely(flags)) 2473 + return TC_ACT_SHOT; 2474 + 2475 + ri->flags = BPF_F_PEER; 2476 + ri->tgt_index = ifindex; 2477 + 2478 + return TC_ACT_REDIRECT; 2479 + } 2480 + 2481 + static const struct bpf_func_proto bpf_redirect_peer_proto = { 2482 + .func = bpf_redirect_peer, 2480 2483 .gpl_only = false, 2481 2484 .ret_type = RET_INTEGER, 2482 2485 .arg1_type = ARG_ANYTHING, ··· 3517 3478 return skb->dev ? 
skb->dev->mtu + skb->dev->hard_header_len : 3518 3479 SKB_MAX_ALLOC; 3519 3480 } 3481 + 3482 + BPF_CALL_4(sk_skb_adjust_room, struct sk_buff *, skb, s32, len_diff, 3483 + u32, mode, u64, flags) 3484 + { 3485 + u32 len_diff_abs = abs(len_diff); 3486 + bool shrink = len_diff < 0; 3487 + int ret = 0; 3488 + 3489 + if (unlikely(flags || mode)) 3490 + return -EINVAL; 3491 + if (unlikely(len_diff_abs > 0xfffU)) 3492 + return -EFAULT; 3493 + 3494 + if (!shrink) { 3495 + ret = skb_cow(skb, len_diff); 3496 + if (unlikely(ret < 0)) 3497 + return ret; 3498 + __skb_push(skb, len_diff_abs); 3499 + memset(skb->data, 0, len_diff_abs); 3500 + } else { 3501 + if (unlikely(!pskb_may_pull(skb, len_diff_abs))) 3502 + return -ENOMEM; 3503 + __skb_pull(skb, len_diff_abs); 3504 + } 3505 + bpf_compute_data_end_sk_skb(skb); 3506 + if (tls_sw_has_ctx_rx(skb->sk)) { 3507 + struct strp_msg *rxm = strp_msg(skb); 3508 + 3509 + rxm->full_len += len_diff; 3510 + } 3511 + return ret; 3512 + } 3513 + 3514 + static const struct bpf_func_proto sk_skb_adjust_room_proto = { 3515 + .func = sk_skb_adjust_room, 3516 + .gpl_only = false, 3517 + .ret_type = RET_INTEGER, 3518 + .arg1_type = ARG_PTR_TO_CTX, 3519 + .arg2_type = ARG_ANYTHING, 3520 + .arg3_type = ARG_ANYTHING, 3521 + .arg4_type = ARG_ANYTHING, 3522 + }; 3520 3523 3521 3524 BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff, 3522 3525 u32, mode, u64, flags) ··· 4865 4784 else 4866 4785 icsk->icsk_user_timeout = val; 4867 4786 break; 4787 + case TCP_NOTSENT_LOWAT: 4788 + tp->notsent_lowat = val; 4789 + sk->sk_write_space(sk); 4790 + break; 4868 4791 default: 4869 4792 ret = -EINVAL; 4870 4793 } ··· 5234 5149 memcpy(params->smac, dev->dev_addr, ETH_ALEN); 5235 5150 params->h_vlan_TCI = 0; 5236 5151 params->h_vlan_proto = 0; 5237 - params->ifindex = dev->ifindex; 5238 5152 5239 5153 return 0; 5240 5154 } ··· 5330 5246 dev = nhc->nhc_dev; 5331 5247 5332 5248 params->rt_metric = res.fi->fib_priority; 5249 + params->ifindex = 
dev->ifindex; 5333 5250 5334 5251 /* xdp and cls_bpf programs are run in RCU-bh so 5335 5252 * rcu_read_lock_bh is not needed here ··· 5456 5371 5457 5372 dev = res.nh->fib_nh_dev; 5458 5373 params->rt_metric = res.f6i->fib6_metric; 5374 + params->ifindex = dev->ifindex; 5459 5375 5460 5376 /* xdp and cls_bpf programs are run in RCU-bh so rcu_read_lock_bh is 5461 5377 * not needed here. ··· 6831 6745 func == bpf_skb_change_tail || 6832 6746 func == sk_skb_change_tail || 6833 6747 func == bpf_skb_adjust_room || 6748 + func == sk_skb_adjust_room || 6834 6749 func == bpf_skb_pull_data || 6835 6750 func == sk_skb_pull_data || 6836 6751 func == bpf_clone_redirect || ··· 7092 7005 return &bpf_redirect_proto; 7093 7006 case BPF_FUNC_redirect_neigh: 7094 7007 return &bpf_redirect_neigh_proto; 7008 + case BPF_FUNC_redirect_peer: 7009 + return &bpf_redirect_peer_proto; 7095 7010 case BPF_FUNC_get_route_realm: 7096 7011 return &bpf_get_route_realm_proto; 7097 7012 case BPF_FUNC_get_hash_recalc: ··· 7307 7218 return &sk_skb_change_tail_proto; 7308 7219 case BPF_FUNC_skb_change_head: 7309 7220 return &sk_skb_change_head_proto; 7221 + case BPF_FUNC_skb_adjust_room: 7222 + return &sk_skb_adjust_room_proto; 7310 7223 case BPF_FUNC_get_socket_cookie: 7311 7224 return &bpf_get_socket_cookie_proto; 7312 7225 case BPF_FUNC_get_socket_uid:
+123 -38
net/core/skmsg.c
··· 433 433 static int sk_psock_handle_skb(struct sk_psock *psock, struct sk_buff *skb, 434 434 u32 off, u32 len, bool ingress) 435 435 { 436 - if (ingress) 437 - return sk_psock_skb_ingress(psock, skb); 438 - else 436 + if (!ingress) { 437 + if (!sock_writeable(psock->sk)) 438 + return -EAGAIN; 439 439 return skb_send_sock_locked(psock->sk, skb, off, len); 440 + } 441 + return sk_psock_skb_ingress(psock, skb); 440 442 } 441 443 442 444 static void sk_psock_backlog(struct work_struct *work) ··· 627 625 rcu_assign_sk_user_data(sk, NULL); 628 626 if (psock->progs.skb_parser) 629 627 sk_psock_stop_strp(sk, psock); 628 + else if (psock->progs.skb_verdict) 629 + sk_psock_stop_verdict(sk, psock); 630 630 write_unlock_bh(&sk->sk_callback_lock); 631 631 sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED); 632 632 ··· 686 682 static int sk_psock_bpf_run(struct sk_psock *psock, struct bpf_prog *prog, 687 683 struct sk_buff *skb) 688 684 { 689 - int ret; 690 - 691 - skb->sk = psock->sk; 692 685 bpf_compute_data_end_sk_skb(skb); 693 - ret = bpf_prog_run_pin_on_cpu(prog, skb); 694 - /* strparser clones the skb before handing it to a upper layer, 695 - * meaning skb_orphan has been called. We NULL sk on the way out 696 - * to ensure we don't trigger a BUG_ON() in skb/sk operations 697 - * later and because we are not charging the memory of this skb 698 - * to any socket yet. 699 - */ 700 - skb->sk = NULL; 701 - return ret; 686 + return bpf_prog_run_pin_on_cpu(prog, skb); 702 687 } 703 688 704 689 static struct sk_psock *sk_psock_from_strp(struct strparser *strp) ··· 702 709 { 703 710 struct sk_psock *psock_other; 704 711 struct sock *sk_other; 705 - bool ingress; 706 712 707 713 sk_other = tcp_skb_bpf_redirect_fetch(skb); 714 + /* This error is a buggy BPF program, it returned a redirect 715 + * return code, but then didn't set a redirect interface. 
716 + */ 708 717 if (unlikely(!sk_other)) { 709 718 kfree_skb(skb); 710 719 return; 711 720 } 712 721 psock_other = sk_psock(sk_other); 722 + /* This error indicates the socket is being torn down or had another 723 + * error that caused the pipe to break. We can't send a packet on 724 + * a socket that is in this state so we drop the skb. 725 + */ 713 726 if (!psock_other || sock_flag(sk_other, SOCK_DEAD) || 714 727 !sk_psock_test_state(psock_other, SK_PSOCK_TX_ENABLED)) { 715 728 kfree_skb(skb); 716 729 return; 717 730 } 718 731 719 - ingress = tcp_skb_bpf_ingress(skb); 720 - if ((!ingress && sock_writeable(sk_other)) || 721 - (ingress && 722 - atomic_read(&sk_other->sk_rmem_alloc) <= 723 - sk_other->sk_rcvbuf)) { 724 - if (!ingress) 725 - skb_set_owner_w(skb, sk_other); 726 - skb_queue_tail(&psock_other->ingress_skb, skb); 727 - schedule_work(&psock_other->work); 728 - } else { 729 - kfree_skb(skb); 730 - } 732 + skb_queue_tail(&psock_other->ingress_skb, skb); 733 + schedule_work(&psock_other->work); 731 734 } 732 735 733 - static void sk_psock_tls_verdict_apply(struct sk_buff *skb, int verdict) 736 + static void sk_psock_tls_verdict_apply(struct sk_buff *skb, struct sock *sk, int verdict) 734 737 { 735 738 switch (verdict) { 736 739 case __SK_REDIRECT: 740 + skb_set_owner_r(skb, sk); 737 741 sk_psock_skb_redirect(skb); 738 742 break; 739 743 case __SK_PASS: ··· 748 758 rcu_read_lock(); 749 759 prog = READ_ONCE(psock->progs.skb_verdict); 750 760 if (likely(prog)) { 761 + /* We skip full set_owner_r here because if we do a SK_PASS 762 + * or SK_DROP we can skip skb memory accounting and use the 763 + * TLS context. 
764 + */ 765 + skb->sk = psock->sk; 751 766 tcp_skb_bpf_redirect_clear(skb); 752 767 ret = sk_psock_bpf_run(psock, prog, skb); 753 768 ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb)); 769 + skb->sk = NULL; 754 770 } 755 - sk_psock_tls_verdict_apply(skb, ret); 771 + sk_psock_tls_verdict_apply(skb, psock->sk, ret); 756 772 rcu_read_unlock(); 757 773 return ret; 758 774 } ··· 767 771 static void sk_psock_verdict_apply(struct sk_psock *psock, 768 772 struct sk_buff *skb, int verdict) 769 773 { 774 + struct tcp_skb_cb *tcp; 770 775 struct sock *sk_other; 776 + int err = -EIO; 771 777 772 778 switch (verdict) { 773 779 case __SK_PASS: ··· 778 780 !sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) { 779 781 goto out_free; 780 782 } 781 - if (atomic_read(&sk_other->sk_rmem_alloc) <= 782 - sk_other->sk_rcvbuf) { 783 - struct tcp_skb_cb *tcp = TCP_SKB_CB(skb); 784 783 785 - tcp->bpf.flags |= BPF_F_INGRESS; 784 + tcp = TCP_SKB_CB(skb); 785 + tcp->bpf.flags |= BPF_F_INGRESS; 786 + 787 + /* If the queue is empty then we can submit directly 788 + * into the msg queue. If its not empty we have to 789 + * queue work otherwise we may get OOO data. Otherwise, 790 + * if sk_psock_skb_ingress errors will be handled by 791 + * retrying later from workqueue. 
792 + */ 793 + if (skb_queue_empty(&psock->ingress_skb)) { 794 + err = sk_psock_skb_ingress(psock, skb); 795 + } 796 + if (err < 0) { 786 797 skb_queue_tail(&psock->ingress_skb, skb); 787 798 schedule_work(&psock->work); 788 - break; 789 799 } 790 - goto out_free; 800 + break; 791 801 case __SK_REDIRECT: 792 802 sk_psock_skb_redirect(skb); 793 803 break; ··· 820 814 kfree_skb(skb); 821 815 goto out; 822 816 } 817 + skb_set_owner_r(skb, sk); 823 818 prog = READ_ONCE(psock->progs.skb_verdict); 824 819 if (likely(prog)) { 825 - skb_orphan(skb); 826 820 tcp_skb_bpf_redirect_clear(skb); 827 821 ret = sk_psock_bpf_run(psock, prog, skb); 828 822 ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb)); ··· 845 839 846 840 rcu_read_lock(); 847 841 prog = READ_ONCE(psock->progs.skb_parser); 848 - if (likely(prog)) 842 + if (likely(prog)) { 843 + skb->sk = psock->sk; 849 844 ret = sk_psock_bpf_run(psock, prog, skb); 845 + skb->sk = NULL; 846 + } 850 847 rcu_read_unlock(); 851 848 return ret; 852 849 } ··· 871 862 } 872 863 } 873 864 rcu_read_unlock(); 865 + } 866 + 867 + static int sk_psock_verdict_recv(read_descriptor_t *desc, struct sk_buff *skb, 868 + unsigned int offset, size_t orig_len) 869 + { 870 + struct sock *sk = (struct sock *)desc->arg.data; 871 + struct sk_psock *psock; 872 + struct bpf_prog *prog; 873 + int ret = __SK_DROP; 874 + int len = skb->len; 875 + 876 + /* clone here so sk_eat_skb() in tcp_read_sock does not drop our data */ 877 + skb = skb_clone(skb, GFP_ATOMIC); 878 + if (!skb) { 879 + desc->error = -ENOMEM; 880 + return 0; 881 + } 882 + 883 + rcu_read_lock(); 884 + psock = sk_psock(sk); 885 + if (unlikely(!psock)) { 886 + len = 0; 887 + kfree_skb(skb); 888 + goto out; 889 + } 890 + skb_set_owner_r(skb, sk); 891 + prog = READ_ONCE(psock->progs.skb_verdict); 892 + if (likely(prog)) { 893 + tcp_skb_bpf_redirect_clear(skb); 894 + ret = sk_psock_bpf_run(psock, prog, skb); 895 + ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb)); 896 + } 
897 + sk_psock_verdict_apply(psock, skb, ret); 898 + out: 899 + rcu_read_unlock(); 900 + return len; 901 + } 902 + 903 + static void sk_psock_verdict_data_ready(struct sock *sk) 904 + { 905 + struct socket *sock = sk->sk_socket; 906 + read_descriptor_t desc; 907 + 908 + if (unlikely(!sock || !sock->ops || !sock->ops->read_sock)) 909 + return; 910 + 911 + desc.arg.data = sk; 912 + desc.error = 0; 913 + desc.count = 1; 914 + 915 + sock->ops->read_sock(sk, &desc, sk_psock_verdict_recv); 874 916 } 875 917 876 918 static void sk_psock_write_space(struct sock *sk) ··· 953 893 return strp_init(&psock->parser.strp, sk, &cb); 954 894 } 955 895 896 + void sk_psock_start_verdict(struct sock *sk, struct sk_psock *psock) 897 + { 898 + struct sk_psock_parser *parser = &psock->parser; 899 + 900 + if (parser->enabled) 901 + return; 902 + 903 + parser->saved_data_ready = sk->sk_data_ready; 904 + sk->sk_data_ready = sk_psock_verdict_data_ready; 905 + sk->sk_write_space = sk_psock_write_space; 906 + parser->enabled = true; 907 + } 908 + 956 909 void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock) 957 910 { 958 911 struct sk_psock_parser *parser = &psock->parser; ··· 989 916 sk->sk_data_ready = parser->saved_data_ready; 990 917 parser->saved_data_ready = NULL; 991 918 strp_stop(&parser->strp); 919 + parser->enabled = false; 920 + } 921 + 922 + void sk_psock_stop_verdict(struct sock *sk, struct sk_psock *psock) 923 + { 924 + struct sk_psock_parser *parser = &psock->parser; 925 + 926 + if (!parser->enabled) 927 + return; 928 + 929 + sk->sk_data_ready = parser->saved_data_ready; 930 + parser->saved_data_ready = NULL; 992 931 parser->enabled = false; 993 932 }
+23 -14
net/core/sock_map.c
··· 148 148 static void sock_map_del_link(struct sock *sk, 149 149 struct sk_psock *psock, void *link_raw) 150 150 { 151 + bool strp_stop = false, verdict_stop = false; 151 152 struct sk_psock_link *link, *tmp; 152 - bool strp_stop = false; 153 153 154 154 spin_lock_bh(&psock->link_lock); 155 155 list_for_each_entry_safe(link, tmp, &psock->link, list) { ··· 159 159 map); 160 160 if (psock->parser.enabled && stab->progs.skb_parser) 161 161 strp_stop = true; 162 + if (psock->parser.enabled && stab->progs.skb_verdict) 163 + verdict_stop = true; 162 164 list_del(&link->list); 163 165 sk_psock_free_link(link); 164 166 } 165 167 } 166 168 spin_unlock_bh(&psock->link_lock); 167 - if (strp_stop) { 169 + if (strp_stop || verdict_stop) { 168 170 write_lock_bh(&sk->sk_callback_lock); 169 - sk_psock_stop_strp(sk, psock); 171 + if (strp_stop) 172 + sk_psock_stop_strp(sk, psock); 173 + else 174 + sk_psock_stop_verdict(sk, psock); 170 175 write_unlock_bh(&sk->sk_callback_lock); 171 176 } 172 177 } ··· 235 230 { 236 231 struct bpf_prog *msg_parser, *skb_parser, *skb_verdict; 237 232 struct sk_psock *psock; 238 - bool skb_progs; 239 233 int ret; 240 234 241 235 skb_verdict = READ_ONCE(progs->skb_verdict); 242 236 skb_parser = READ_ONCE(progs->skb_parser); 243 - skb_progs = skb_parser && skb_verdict; 244 - if (skb_progs) { 237 + if (skb_verdict) { 245 238 skb_verdict = bpf_prog_inc_not_zero(skb_verdict); 246 239 if (IS_ERR(skb_verdict)) 247 240 return PTR_ERR(skb_verdict); 241 + } 242 + if (skb_parser) { 248 243 skb_parser = bpf_prog_inc_not_zero(skb_parser); 249 244 if (IS_ERR(skb_parser)) { 250 245 bpf_prog_put(skb_verdict); ··· 269 264 270 265 if (psock) { 271 266 if ((msg_parser && READ_ONCE(psock->progs.msg_parser)) || 272 - (skb_progs && READ_ONCE(psock->progs.skb_parser))) { 267 + (skb_parser && READ_ONCE(psock->progs.skb_parser)) || 268 + (skb_verdict && READ_ONCE(psock->progs.skb_verdict))) { 273 269 sk_psock_put(sk, psock); 274 270 ret = -EBUSY; 275 271 goto out_progs; ··· 
291 285 goto out_drop; 292 286 293 287 write_lock_bh(&sk->sk_callback_lock); 294 - if (skb_progs && !psock->parser.enabled) { 288 + if (skb_parser && skb_verdict && !psock->parser.enabled) { 295 289 ret = sk_psock_init_strp(sk, psock); 296 - if (ret) { 297 - write_unlock_bh(&sk->sk_callback_lock); 298 - goto out_drop; 299 - } 290 + if (ret) 291 + goto out_unlock_drop; 300 292 psock_set_prog(&psock->progs.skb_verdict, skb_verdict); 301 293 psock_set_prog(&psock->progs.skb_parser, skb_parser); 302 294 sk_psock_start_strp(sk, psock); 295 + } else if (!skb_parser && skb_verdict && !psock->parser.enabled) { 296 + psock_set_prog(&psock->progs.skb_verdict, skb_verdict); 297 + sk_psock_start_verdict(sk,psock); 303 298 } 304 299 write_unlock_bh(&sk->sk_callback_lock); 305 300 return 0; 301 + out_unlock_drop: 302 + write_unlock_bh(&sk->sk_callback_lock); 306 303 out_drop: 307 304 sk_psock_put(sk, psock); 308 305 out_progs: 309 306 if (msg_parser) 310 307 bpf_prog_put(msg_parser); 311 308 out: 312 - if (skb_progs) { 309 + if (skb_verdict) 313 310 bpf_prog_put(skb_verdict); 311 + if (skb_parser) 314 312 bpf_prog_put(skb_parser); 315 - } 316 313 return ret; 317 314 } 318 315
-1
net/ipv4/tcp_minisocks.c
··· 548 548 newtp->fastopen_req = NULL; 549 549 RCU_INIT_POINTER(newtp->fastopen_rsk, NULL); 550 550 551 - bpf_skops_init_child(sk, newsk); 552 551 tcp_bpf_clone(sk, newsk); 553 552 554 553 __TCP_INC_STATS(sock_net(sk), TCP_MIB_PASSIVEOPENS);
-3
net/xdp/xsk_buff_pool.c
··· 3 3 #include <net/xsk_buff_pool.h> 4 4 #include <net/xdp_sock.h> 5 5 #include <net/xdp_sock_drv.h> 6 - #include <linux/dma-direct.h> 7 - #include <linux/dma-noncoherent.h> 8 - #include <linux/swiotlb.h> 9 6 10 7 #include "xsk_queue.h" 11 8 #include "xdp_umem.h"
+4
net/xdp/xsk_queue.h
··· 15 15 16 16 struct xdp_ring { 17 17 u32 producer ____cacheline_aligned_in_smp; 18 + /* Hinder the adjacent cache prefetcher to prefetch the consumer 19 + * pointer if the producer pointer is touched and vice versa. 20 + */ 21 + u32 pad ____cacheline_aligned_in_smp; 18 22 u32 consumer ____cacheline_aligned_in_smp; 19 23 u32 flags; 20 24 };
+1 -1
net/xdp/xskmap.c
··· 132 132 return 0; 133 133 } 134 134 135 - static u32 xsk_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) 135 + static int xsk_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) 136 136 { 137 137 const int ret = BPF_REG_0, mp = BPF_REG_1, index = BPF_REG_2; 138 138 struct bpf_insn *insn = insn_buf;
+12 -3
samples/bpf/Makefile
··· 98 98 per_socket_stats_example-objs := cookie_uid_helper_example.o 99 99 xdp_redirect-objs := xdp_redirect_user.o 100 100 xdp_redirect_map-objs := xdp_redirect_map_user.o 101 - xdp_redirect_cpu-objs := bpf_load.o xdp_redirect_cpu_user.o 102 - xdp_monitor-objs := bpf_load.o xdp_monitor_user.o 101 + xdp_redirect_cpu-objs := xdp_redirect_cpu_user.o 102 + xdp_monitor-objs := xdp_monitor_user.o 103 103 xdp_rxq_info-objs := xdp_rxq_info_user.o 104 104 syscall_tp-objs := syscall_tp_user.o 105 105 cpustat-objs := cpustat_user.o ··· 211 211 # make M=samples/bpf/ LLC=~/git/llvm/build/bin/llc CLANG=~/git/llvm/build/bin/clang 212 212 LLC ?= llc 213 213 CLANG ?= clang 214 + OPT ?= opt 215 + LLVM_DIS ?= llvm-dis 214 216 LLVM_OBJCOPY ?= llvm-objcopy 215 217 BTF_PAHOLE ?= pahole 216 218 ··· 305 303 # asm/sysreg.h - inline assembly used by it is incompatible with llvm. 306 304 # But, there is no easy way to fix it, so just exclude it since it is 307 305 # useless for BPF samples. 306 + # below we use long chain of commands, clang | opt | llvm-dis | llc, 307 + # to generate final object file. 'clang' compiles the source into IR 308 + # with native target, e.g., x64, arm64, etc. 'opt' does bpf CORE IR builtin 309 + # processing (llvm12) and IR optimizations. 'llvm-dis' converts 310 + # 'opt' output to IR, and finally 'llc' generates bpf byte code. 
308 311 $(obj)/%.o: $(src)/%.c 309 312 @echo " CLANG-bpf " $@ 310 313 $(Q)$(CLANG) $(NOSTDINC_FLAGS) $(LINUXINCLUDE) $(BPF_EXTRA_CFLAGS) \ ··· 321 314 -Wno-address-of-packed-member -Wno-tautological-compare \ 322 315 -Wno-unknown-warning-option $(CLANG_ARCH_ARGS) \ 323 316 -I$(srctree)/samples/bpf/ -include asm_goto_workaround.h \ 324 - -O2 -emit-llvm -c $< -o -| $(LLC) -march=bpf $(LLC_FLAGS) -filetype=obj -o $@ 317 + -O2 -emit-llvm -Xclang -disable-llvm-passes -c $< -o - | \ 318 + $(OPT) -O2 -mtriple=bpf-pc-linux | $(LLVM_DIS) | \ 319 + $(LLC) -march=bpf $(LLC_FLAGS) -filetype=obj -o $@ 325 320 ifeq ($(DWARF2BTF),y) 326 321 $(BTF_PAHOLE) -J $@ 327 322 endif
+2 -1
samples/bpf/hbm.c
··· 40 40 #include <errno.h> 41 41 #include <fcntl.h> 42 42 #include <linux/unistd.h> 43 + #include <linux/compiler.h> 43 44 44 45 #include <linux/bpf.h> 45 46 #include <bpf/bpf.h> ··· 484 483 "Option -%c requires an argument.\n\n", 485 484 optopt); 486 485 case 'h': 487 - fallthrough; 486 + __fallthrough; 488 487 default: 489 488 Usage(); 490 489 return 0;
+30 -30
samples/bpf/xdp_monitor_kern.c
··· 6 6 #include <uapi/linux/bpf.h> 7 7 #include <bpf/bpf_helpers.h> 8 8 9 - struct bpf_map_def SEC("maps") redirect_err_cnt = { 10 - .type = BPF_MAP_TYPE_PERCPU_ARRAY, 11 - .key_size = sizeof(u32), 12 - .value_size = sizeof(u64), 13 - .max_entries = 2, 9 + struct { 10 + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); 11 + __type(key, u32); 12 + __type(value, u64); 13 + __uint(max_entries, 2); 14 14 /* TODO: have entries for all possible errno's */ 15 - }; 15 + } redirect_err_cnt SEC(".maps"); 16 16 17 17 #define XDP_UNKNOWN XDP_REDIRECT + 1 18 - struct bpf_map_def SEC("maps") exception_cnt = { 19 - .type = BPF_MAP_TYPE_PERCPU_ARRAY, 20 - .key_size = sizeof(u32), 21 - .value_size = sizeof(u64), 22 - .max_entries = XDP_UNKNOWN + 1, 23 - }; 18 + struct { 19 + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); 20 + __type(key, u32); 21 + __type(value, u64); 22 + __uint(max_entries, XDP_UNKNOWN + 1); 23 + } exception_cnt SEC(".maps"); 24 24 25 25 /* Tracepoint format: /sys/kernel/debug/tracing/events/xdp/xdp_redirect/format 26 26 * Code in: kernel/include/trace/events/xdp.h ··· 129 129 }; 130 130 #define MAX_CPUS 64 131 131 132 - struct bpf_map_def SEC("maps") cpumap_enqueue_cnt = { 133 - .type = BPF_MAP_TYPE_PERCPU_ARRAY, 134 - .key_size = sizeof(u32), 135 - .value_size = sizeof(struct datarec), 136 - .max_entries = MAX_CPUS, 137 - }; 132 + struct { 133 + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); 134 + __type(key, u32); 135 + __type(value, struct datarec); 136 + __uint(max_entries, MAX_CPUS); 137 + } cpumap_enqueue_cnt SEC(".maps"); 138 138 139 - struct bpf_map_def SEC("maps") cpumap_kthread_cnt = { 140 - .type = BPF_MAP_TYPE_PERCPU_ARRAY, 141 - .key_size = sizeof(u32), 142 - .value_size = sizeof(struct datarec), 143 - .max_entries = 1, 144 - }; 139 + struct { 140 + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); 141 + __type(key, u32); 142 + __type(value, struct datarec); 143 + __uint(max_entries, 1); 144 + } cpumap_kthread_cnt SEC(".maps"); 145 145 146 146 /* Tracepoint: 
/sys/kernel/debug/tracing/events/xdp/xdp_cpumap_enqueue/format 147 147 * Code in: kernel/include/trace/events/xdp.h ··· 210 210 return 0; 211 211 } 212 212 213 - struct bpf_map_def SEC("maps") devmap_xmit_cnt = { 214 - .type = BPF_MAP_TYPE_PERCPU_ARRAY, 215 - .key_size = sizeof(u32), 216 - .value_size = sizeof(struct datarec), 217 - .max_entries = 1, 218 - }; 213 + struct { 214 + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); 215 + __type(key, u32); 216 + __type(value, struct datarec); 217 + __uint(max_entries, 1); 218 + } devmap_xmit_cnt SEC(".maps"); 219 219 220 220 /* Tracepoint: /sys/kernel/debug/tracing/events/xdp/xdp_devmap_xmit/format 221 221 * Code in: kernel/include/trace/events/xdp.h
+120 -39
samples/bpf/xdp_monitor_user.c
··· 26 26 #include <net/if.h> 27 27 #include <time.h> 28 28 29 + #include <signal.h> 29 30 #include <bpf/bpf.h> 30 - #include "bpf_load.h" 31 + #include <bpf/libbpf.h> 31 32 #include "bpf_util.h" 32 33 34 + enum map_type { 35 + REDIRECT_ERR_CNT, 36 + EXCEPTION_CNT, 37 + CPUMAP_ENQUEUE_CNT, 38 + CPUMAP_KTHREAD_CNT, 39 + DEVMAP_XMIT_CNT, 40 + }; 41 + 42 + static const char *const map_type_strings[] = { 43 + [REDIRECT_ERR_CNT] = "redirect_err_cnt", 44 + [EXCEPTION_CNT] = "exception_cnt", 45 + [CPUMAP_ENQUEUE_CNT] = "cpumap_enqueue_cnt", 46 + [CPUMAP_KTHREAD_CNT] = "cpumap_kthread_cnt", 47 + [DEVMAP_XMIT_CNT] = "devmap_xmit_cnt", 48 + }; 49 + 50 + #define NUM_MAP 5 51 + #define NUM_TP 8 52 + 53 + static int tp_cnt; 54 + static int map_cnt; 33 55 static int verbose = 1; 34 56 static bool debug = false; 57 + struct bpf_map *map_data[NUM_MAP] = {}; 58 + struct bpf_link *tp_links[NUM_TP] = {}; 59 + struct bpf_object *obj; 35 60 36 61 static const struct option long_options[] = { 37 62 {"help", no_argument, NULL, 'h' }, ··· 65 40 {"sec", required_argument, NULL, 's' }, 66 41 {0, 0, NULL, 0 } 67 42 }; 43 + 44 + static void int_exit(int sig) 45 + { 46 + /* Detach tracepoints */ 47 + while (tp_cnt) 48 + bpf_link__destroy(tp_links[--tp_cnt]); 49 + 50 + bpf_object__close(obj); 51 + exit(0); 52 + } 68 53 69 54 /* C standard specifies two constants, EXIT_SUCCESS(0) and EXIT_FAILURE(1) */ 70 55 #define EXIT_FAIL_MEM 5 ··· 518 483 * this can happen by someone running perf-record -e 519 484 */ 520 485 521 - fd = map_data[0].fd; /* map0: redirect_err_cnt */ 486 + fd = bpf_map__fd(map_data[REDIRECT_ERR_CNT]); 522 487 for (i = 0; i < REDIR_RES_MAX; i++) 523 488 map_collect_record_u64(fd, i, &rec->xdp_redirect[i]); 524 489 525 - fd = map_data[1].fd; /* map1: exception_cnt */ 490 + fd = bpf_map__fd(map_data[EXCEPTION_CNT]); 526 491 for (i = 0; i < XDP_ACTION_MAX; i++) { 527 492 map_collect_record_u64(fd, i, &rec->xdp_exception[i]); 528 493 } 529 494 530 - fd = map_data[2].fd; /* map2: 
cpumap_enqueue_cnt */ 495 + fd = bpf_map__fd(map_data[CPUMAP_ENQUEUE_CNT]); 531 496 for (i = 0; i < MAX_CPUS; i++) 532 497 map_collect_record(fd, i, &rec->xdp_cpumap_enqueue[i]); 533 498 534 - fd = map_data[3].fd; /* map3: cpumap_kthread_cnt */ 499 + fd = bpf_map__fd(map_data[CPUMAP_KTHREAD_CNT]); 535 500 map_collect_record(fd, 0, &rec->xdp_cpumap_kthread); 536 501 537 - fd = map_data[4].fd; /* map4: devmap_xmit_cnt */ 502 + fd = bpf_map__fd(map_data[DEVMAP_XMIT_CNT]); 538 503 map_collect_record(fd, 0, &rec->xdp_devmap_xmit); 539 504 540 505 return true; ··· 633 598 634 599 /* TODO Need more advanced stats on error types */ 635 600 if (verbose) { 636 - printf(" - Stats map0: %s\n", map_data[0].name); 637 - printf(" - Stats map1: %s\n", map_data[1].name); 601 + printf(" - Stats map0: %s\n", bpf_map__name(map_data[0])); 602 + printf(" - Stats map1: %s\n", bpf_map__name(map_data[1])); 638 603 printf("\n"); 639 604 } 640 605 fflush(stdout); ··· 653 618 654 619 static void print_bpf_prog_info(void) 655 620 { 656 - int i; 621 + struct bpf_program *prog; 622 + struct bpf_map *map; 623 + int i = 0; 657 624 658 625 /* Prog info */ 659 - printf("Loaded BPF prog have %d bpf program(s)\n", prog_cnt); 660 - for (i = 0; i < prog_cnt; i++) { 661 - printf(" - prog_fd[%d] = fd(%d)\n", i, prog_fd[i]); 626 + printf("Loaded BPF prog have %d bpf program(s)\n", tp_cnt); 627 + bpf_object__for_each_program(prog, obj) { 628 + printf(" - prog_fd[%d] = fd(%d)\n", i, bpf_program__fd(prog)); 629 + i++; 662 630 } 663 631 632 + i = 0; 664 633 /* Maps info */ 665 - printf("Loaded BPF prog have %d map(s)\n", map_data_count); 666 - for (i = 0; i < map_data_count; i++) { 667 - char *name = map_data[i].name; 668 - int fd = map_data[i].fd; 634 + printf("Loaded BPF prog have %d map(s)\n", map_cnt); 635 + bpf_object__for_each_map(map, obj) { 636 + const char *name = bpf_map__name(map); 637 + int fd = bpf_map__fd(map); 669 638 670 639 printf(" - map_data[%d] = fd(%d) name:%s\n", i, fd, name); 640 + i++; 
671 641 } 672 642 673 643 /* Event info */ 674 - printf("Searching for (max:%d) event file descriptor(s)\n", prog_cnt); 675 - for (i = 0; i < prog_cnt; i++) { 676 - if (event_fd[i] != -1) 677 - printf(" - event_fd[%d] = fd(%d)\n", i, event_fd[i]); 644 + printf("Searching for (max:%d) event file descriptor(s)\n", tp_cnt); 645 + for (i = 0; i < tp_cnt; i++) { 646 + int fd = bpf_link__fd(tp_links[i]); 647 + 648 + if (fd != -1) 649 + printf(" - event_fd[%d] = fd(%d)\n", i, fd); 678 650 } 679 651 } 680 652 681 653 int main(int argc, char **argv) 682 654 { 683 655 struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; 656 + struct bpf_program *prog; 684 657 int longindex = 0, opt; 685 - int ret = EXIT_SUCCESS; 686 - char bpf_obj_file[256]; 658 + int ret = EXIT_FAILURE; 659 + enum map_type type; 660 + char filename[256]; 687 661 688 662 /* Default settings: */ 689 663 bool errors_only = true; 690 664 int interval = 2; 691 - 692 - snprintf(bpf_obj_file, sizeof(bpf_obj_file), "%s_kern.o", argv[0]); 693 665 694 666 /* Parse commands line args */ 695 667 while ((opt = getopt_long(argc, argv, "hDSs:", ··· 714 672 case 'h': 715 673 default: 716 674 usage(argv); 717 - return EXIT_FAILURE; 675 + return ret; 718 676 } 719 677 } 720 678 679 + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); 721 680 if (setrlimit(RLIMIT_MEMLOCK, &r)) { 722 681 perror("setrlimit(RLIMIT_MEMLOCK)"); 723 - return EXIT_FAILURE; 682 + return ret; 724 683 } 725 684 726 - if (load_bpf_file(bpf_obj_file)) { 727 - printf("ERROR - bpf_log_buf: %s", bpf_log_buf); 728 - return EXIT_FAILURE; 685 + /* Remove tracepoint program when program is interrupted or killed */ 686 + signal(SIGINT, int_exit); 687 + signal(SIGTERM, int_exit); 688 + 689 + obj = bpf_object__open_file(filename, NULL); 690 + if (libbpf_get_error(obj)) { 691 + printf("ERROR: opening BPF object file failed\n"); 692 + obj = NULL; 693 + goto cleanup; 729 694 } 730 - if (!prog_fd[0]) { 731 - printf("ERROR - load_bpf_file: %s\n", 
strerror(errno)); 732 - return EXIT_FAILURE; 695 + 696 + /* load BPF program */ 697 + if (bpf_object__load(obj)) { 698 + printf("ERROR: loading BPF object file failed\n"); 699 + goto cleanup; 700 + } 701 + 702 + for (type = 0; type < NUM_MAP; type++) { 703 + map_data[type] = 704 + bpf_object__find_map_by_name(obj, map_type_strings[type]); 705 + 706 + if (libbpf_get_error(map_data[type])) { 707 + printf("ERROR: finding a map in obj file failed\n"); 708 + goto cleanup; 709 + } 710 + map_cnt++; 711 + } 712 + 713 + bpf_object__for_each_program(prog, obj) { 714 + tp_links[tp_cnt] = bpf_program__attach(prog); 715 + if (libbpf_get_error(tp_links[tp_cnt])) { 716 + printf("ERROR: bpf_program__attach failed\n"); 717 + tp_links[tp_cnt] = NULL; 718 + goto cleanup; 719 + } 720 + tp_cnt++; 733 721 } 734 722 735 723 if (debug) { 736 724 print_bpf_prog_info(); 737 725 } 738 726 739 - /* Unload/stop tracepoint event by closing fd's */ 727 + /* Unload/stop tracepoint event by closing bpf_link's */ 740 728 if (errors_only) { 741 - /* The prog_fd[i] and event_fd[i] depend on the 742 - * order the functions was defined in _kern.c 729 + /* The bpf_link[i] depend on the order of 730 + * the functions was defined in _kern.c 743 731 */ 744 - close(event_fd[2]); /* tracepoint/xdp/xdp_redirect */ 745 - close(prog_fd[2]); /* func: trace_xdp_redirect */ 746 - close(event_fd[3]); /* tracepoint/xdp/xdp_redirect_map */ 747 - close(prog_fd[3]); /* func: trace_xdp_redirect_map */ 732 + bpf_link__destroy(tp_links[2]); /* tracepoint/xdp/xdp_redirect */ 733 + tp_links[2] = NULL; 734 + 735 + bpf_link__destroy(tp_links[3]); /* tracepoint/xdp/xdp_redirect_map */ 736 + tp_links[3] = NULL; 748 737 } 749 738 750 739 stats_poll(interval, errors_only); 751 740 741 + ret = EXIT_SUCCESS; 742 + 743 + cleanup: 744 + /* Detach tracepoints */ 745 + while (tp_cnt) 746 + bpf_link__destroy(tp_links[--tp_cnt]); 747 + 748 + bpf_object__close(obj); 752 749 return ret; 753 750 }
+70 -79
samples/bpf/xdp_redirect_cpu_user.c
··· 37 37 38 38 static __u32 xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST; 39 39 static int n_cpus; 40 - static int cpu_map_fd; 41 - static int rx_cnt_map_fd; 42 - static int redirect_err_cnt_map_fd; 43 - static int cpumap_enqueue_cnt_map_fd; 44 - static int cpumap_kthread_cnt_map_fd; 45 - static int cpus_available_map_fd; 46 - static int cpus_count_map_fd; 47 - static int cpus_iterator_map_fd; 48 - static int exception_cnt_map_fd; 40 + 41 + enum map_type { 42 + CPU_MAP, 43 + RX_CNT, 44 + REDIRECT_ERR_CNT, 45 + CPUMAP_ENQUEUE_CNT, 46 + CPUMAP_KTHREAD_CNT, 47 + CPUS_AVAILABLE, 48 + CPUS_COUNT, 49 + CPUS_ITERATOR, 50 + EXCEPTION_CNT, 51 + }; 52 + 53 + static const char *const map_type_strings[] = { 54 + [CPU_MAP] = "cpu_map", 55 + [RX_CNT] = "rx_cnt", 56 + [REDIRECT_ERR_CNT] = "redirect_err_cnt", 57 + [CPUMAP_ENQUEUE_CNT] = "cpumap_enqueue_cnt", 58 + [CPUMAP_KTHREAD_CNT] = "cpumap_kthread_cnt", 59 + [CPUS_AVAILABLE] = "cpus_available", 60 + [CPUS_COUNT] = "cpus_count", 61 + [CPUS_ITERATOR] = "cpus_iterator", 62 + [EXCEPTION_CNT] = "exception_cnt", 63 + }; 49 64 50 65 #define NUM_TP 5 51 - struct bpf_link *tp_links[NUM_TP] = { 0 }; 66 + #define NUM_MAP 9 67 + struct bpf_link *tp_links[NUM_TP] = {}; 68 + static int map_fds[NUM_MAP]; 52 69 static int tp_cnt = 0; 53 70 54 71 /* Exit return codes */ ··· 544 527 { 545 528 int fd, i; 546 529 547 - fd = rx_cnt_map_fd; 530 + fd = map_fds[RX_CNT]; 548 531 map_collect_percpu(fd, 0, &rec->rx_cnt); 549 532 550 - fd = redirect_err_cnt_map_fd; 533 + fd = map_fds[REDIRECT_ERR_CNT]; 551 534 map_collect_percpu(fd, 1, &rec->redir_err); 552 535 553 - fd = cpumap_enqueue_cnt_map_fd; 536 + fd = map_fds[CPUMAP_ENQUEUE_CNT]; 554 537 for (i = 0; i < n_cpus; i++) 555 538 map_collect_percpu(fd, i, &rec->enq[i]); 556 539 557 - fd = cpumap_kthread_cnt_map_fd; 540 + fd = map_fds[CPUMAP_KTHREAD_CNT]; 558 541 map_collect_percpu(fd, 0, &rec->kthread); 559 542 560 - fd = exception_cnt_map_fd; 543 + fd = map_fds[EXCEPTION_CNT]; 561 544 
map_collect_percpu(fd, 0, &rec->exception); 562 545 } 563 546 ··· 582 565 /* Add a CPU entry to cpumap, as this allocate a cpu entry in 583 566 * the kernel for the cpu. 584 567 */ 585 - ret = bpf_map_update_elem(cpu_map_fd, &cpu, value, 0); 568 + ret = bpf_map_update_elem(map_fds[CPU_MAP], &cpu, value, 0); 586 569 if (ret) { 587 570 fprintf(stderr, "Create CPU entry failed (err:%d)\n", ret); 588 571 exit(EXIT_FAIL_BPF); ··· 591 574 /* Inform bpf_prog's that a new CPU is available to select 592 575 * from via some control maps. 593 576 */ 594 - ret = bpf_map_update_elem(cpus_available_map_fd, &avail_idx, &cpu, 0); 577 + ret = bpf_map_update_elem(map_fds[CPUS_AVAILABLE], &avail_idx, &cpu, 0); 595 578 if (ret) { 596 579 fprintf(stderr, "Add to avail CPUs failed\n"); 597 580 exit(EXIT_FAIL_BPF); 598 581 } 599 582 600 583 /* When not replacing/updating existing entry, bump the count */ 601 - ret = bpf_map_lookup_elem(cpus_count_map_fd, &key, &curr_cpus_count); 584 + ret = bpf_map_lookup_elem(map_fds[CPUS_COUNT], &key, &curr_cpus_count); 602 585 if (ret) { 603 586 fprintf(stderr, "Failed reading curr cpus_count\n"); 604 587 exit(EXIT_FAIL_BPF); 605 588 } 606 589 if (new) { 607 590 curr_cpus_count++; 608 - ret = bpf_map_update_elem(cpus_count_map_fd, &key, 591 + ret = bpf_map_update_elem(map_fds[CPUS_COUNT], &key, 609 592 &curr_cpus_count, 0); 610 593 if (ret) { 611 594 fprintf(stderr, "Failed write curr cpus_count\n"); ··· 629 612 int ret, i; 630 613 631 614 for (i = 0; i < n_cpus; i++) { 632 - ret = bpf_map_update_elem(cpus_available_map_fd, &i, 615 + ret = bpf_map_update_elem(map_fds[CPUS_AVAILABLE], &i, 633 616 &invalid_cpu, 0); 634 617 if (ret) { 635 618 fprintf(stderr, "Failed marking CPU unavailable\n"); ··· 682 665 free_stats_record(prev); 683 666 } 684 667 685 - static struct bpf_link * attach_tp(struct bpf_object *obj, 686 - const char *tp_category, 687 - const char* tp_name) 668 + static int init_tracepoints(struct bpf_object *obj) 688 669 { 689 670 struct 
bpf_program *prog; 690 - struct bpf_link *link; 691 - char sec_name[PATH_MAX]; 692 - int len; 693 671 694 - len = snprintf(sec_name, PATH_MAX, "tracepoint/%s/%s", 695 - tp_category, tp_name); 696 - if (len < 0) 697 - exit(EXIT_FAIL); 672 + bpf_object__for_each_program(prog, obj) { 673 + if (bpf_program__is_tracepoint(prog) != true) 674 + continue; 698 675 699 - prog = bpf_object__find_program_by_title(obj, sec_name); 700 - if (!prog) { 701 - fprintf(stderr, "ERR: finding progsec: %s\n", sec_name); 702 - exit(EXIT_FAIL_BPF); 676 + tp_links[tp_cnt] = bpf_program__attach(prog); 677 + if (libbpf_get_error(tp_links[tp_cnt])) { 678 + tp_links[tp_cnt] = NULL; 679 + return -EINVAL; 680 + } 681 + tp_cnt++; 703 682 } 704 683 705 - link = bpf_program__attach_tracepoint(prog, tp_category, tp_name); 706 - if (libbpf_get_error(link)) 707 - exit(EXIT_FAIL_BPF); 708 - 709 - return link; 710 - } 711 - 712 - static void init_tracepoints(struct bpf_object *obj) { 713 - tp_links[tp_cnt++] = attach_tp(obj, "xdp", "xdp_redirect_err"); 714 - tp_links[tp_cnt++] = attach_tp(obj, "xdp", "xdp_redirect_map_err"); 715 - tp_links[tp_cnt++] = attach_tp(obj, "xdp", "xdp_exception"); 716 - tp_links[tp_cnt++] = attach_tp(obj, "xdp", "xdp_cpumap_enqueue"); 717 - tp_links[tp_cnt++] = attach_tp(obj, "xdp", "xdp_cpumap_kthread"); 684 + return 0; 718 685 } 719 686 720 687 static int init_map_fds(struct bpf_object *obj) 721 688 { 722 - /* Maps updated by tracepoints */ 723 - redirect_err_cnt_map_fd = 724 - bpf_object__find_map_fd_by_name(obj, "redirect_err_cnt"); 725 - exception_cnt_map_fd = 726 - bpf_object__find_map_fd_by_name(obj, "exception_cnt"); 727 - cpumap_enqueue_cnt_map_fd = 728 - bpf_object__find_map_fd_by_name(obj, "cpumap_enqueue_cnt"); 729 - cpumap_kthread_cnt_map_fd = 730 - bpf_object__find_map_fd_by_name(obj, "cpumap_kthread_cnt"); 689 + enum map_type type; 731 690 732 - /* Maps used by XDP */ 733 - rx_cnt_map_fd = bpf_object__find_map_fd_by_name(obj, "rx_cnt"); 734 - cpu_map_fd = 
bpf_object__find_map_fd_by_name(obj, "cpu_map"); 735 - cpus_available_map_fd = 736 - bpf_object__find_map_fd_by_name(obj, "cpus_available"); 737 - cpus_count_map_fd = bpf_object__find_map_fd_by_name(obj, "cpus_count"); 738 - cpus_iterator_map_fd = 739 - bpf_object__find_map_fd_by_name(obj, "cpus_iterator"); 691 + for (type = 0; type < NUM_MAP; type++) { 692 + map_fds[type] = 693 + bpf_object__find_map_fd_by_name(obj, 694 + map_type_strings[type]); 740 695 741 - if (cpu_map_fd < 0 || rx_cnt_map_fd < 0 || 742 - redirect_err_cnt_map_fd < 0 || cpumap_enqueue_cnt_map_fd < 0 || 743 - cpumap_kthread_cnt_map_fd < 0 || cpus_available_map_fd < 0 || 744 - cpus_count_map_fd < 0 || cpus_iterator_map_fd < 0 || 745 - exception_cnt_map_fd < 0) 746 - return -ENOENT; 696 + if (map_fds[type] < 0) 697 + return -ENOENT; 698 + } 747 699 748 700 return 0; 749 701 } ··· 781 795 bool stress_mode = false; 782 796 struct bpf_program *prog; 783 797 struct bpf_object *obj; 798 + int err = EXIT_FAIL; 784 799 char filename[256]; 785 800 int added_cpus = 0; 786 801 int longindex = 0; 787 802 int interval = 2; 788 803 int add_cpu = -1; 789 - int opt, err; 790 - int prog_fd; 804 + int opt, prog_fd; 791 805 int *cpu, i; 792 806 __u32 qsize; 793 807 ··· 810 824 } 811 825 812 826 if (bpf_prog_load_xattr(&prog_load_attr, &obj, &prog_fd)) 813 - return EXIT_FAIL; 827 + return err; 814 828 815 829 if (prog_fd < 0) { 816 830 fprintf(stderr, "ERR: bpf_prog_load_xattr: %s\n", 817 831 strerror(errno)); 818 - return EXIT_FAIL; 832 + return err; 819 833 } 820 - init_tracepoints(obj); 834 + 835 + if (init_tracepoints(obj) < 0) { 836 + fprintf(stderr, "ERR: bpf_program__attach failed\n"); 837 + return err; 838 + } 839 + 821 840 if (init_map_fds(obj) < 0) { 822 841 fprintf(stderr, "bpf_object__find_map_fd_by_name failed\n"); 823 - return EXIT_FAIL; 842 + return err; 824 843 } 825 844 mark_cpus_unavailable(); 826 845 827 846 cpu = malloc(n_cpus * sizeof(int)); 828 847 if (!cpu) { 829 848 fprintf(stderr, "failed to 
allocate cpu array\n"); 830 - return EXIT_FAIL; 849 + return err; 831 850 } 832 851 memset(cpu, 0, n_cpus * sizeof(int)); 833 852 ··· 951 960 prog = bpf_object__find_program_by_title(obj, prog_name); 952 961 if (!prog) { 953 962 fprintf(stderr, "bpf_object__find_program_by_title failed\n"); 954 - err = EXIT_FAIL; 955 963 goto out; 956 964 } 957 965 958 966 prog_fd = bpf_program__fd(prog); 959 967 if (prog_fd < 0) { 960 968 fprintf(stderr, "bpf_program__fd failed\n"); 961 - err = EXIT_FAIL; 962 969 goto out; 963 970 } 964 971 ··· 975 986 976 987 stats_poll(interval, use_separators, prog_name, mprog_name, 977 988 &value, stress_mode); 989 + 990 + err = EXIT_OK; 978 991 out: 979 992 free(cpu); 980 993 return err;
+5 -7
samples/bpf/xdp_sample_pkts_kern.c
··· 5 5 #include <bpf/bpf_helpers.h> 6 6 7 7 #define SAMPLE_SIZE 64ul 8 - #define MAX_CPUS 128 9 8 10 - struct bpf_map_def SEC("maps") my_map = { 11 - .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY, 12 - .key_size = sizeof(int), 13 - .value_size = sizeof(u32), 14 - .max_entries = MAX_CPUS, 15 - }; 9 + struct { 10 + __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY); 11 + __uint(key_size, sizeof(int)); 12 + __uint(value_size, sizeof(u32)); 13 + } my_map SEC(".maps"); 16 14 17 15 SEC("xdp_sample") 18 16 int xdp_sample_prog(struct xdp_md *ctx)
-1
samples/bpf/xdp_sample_pkts_user.c
··· 18 18 19 19 #include "perf-sys.h" 20 20 21 - #define MAX_CPUS 128 22 21 static int if_idx; 23 22 static char *if_name; 24 23 static __u32 xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST;
+289 -63
samples/bpf/xdpsock_user.c
··· 11 11 #include <linux/if_xdp.h> 12 12 #include <linux/if_ether.h> 13 13 #include <linux/ip.h> 14 + #include <linux/limits.h> 14 15 #include <linux/udp.h> 15 16 #include <arpa/inet.h> 16 17 #include <locale.h> ··· 80 79 static u32 opt_pkt_fill_pattern = 0x12345678; 81 80 static bool opt_extra_stats; 82 81 static bool opt_quiet; 82 + static bool opt_app_stats; 83 + static const char *opt_irq_str = ""; 84 + static u32 irq_no; 85 + static int irqs_at_init = -1; 83 86 static int opt_poll; 84 87 static int opt_interval = 1; 85 88 static u32 opt_xdp_bind_flags = XDP_USE_NEED_WAKEUP; ··· 96 91 static u32 opt_num_xsks = 1; 97 92 static u32 prog_id; 98 93 99 - struct xsk_umem_info { 100 - struct xsk_ring_prod fq; 101 - struct xsk_ring_cons cq; 102 - struct xsk_umem *umem; 103 - void *buffer; 104 - }; 105 - 106 - struct xsk_socket_info { 107 - struct xsk_ring_cons rx; 108 - struct xsk_ring_prod tx; 109 - struct xsk_umem_info *umem; 110 - struct xsk_socket *xsk; 94 + struct xsk_ring_stats { 111 95 unsigned long rx_npkts; 112 96 unsigned long tx_npkts; 113 97 unsigned long rx_dropped_npkts; ··· 113 119 unsigned long prev_rx_full_npkts; 114 120 unsigned long prev_rx_fill_empty_npkts; 115 121 unsigned long prev_tx_empty_npkts; 122 + }; 123 + 124 + struct xsk_driver_stats { 125 + unsigned long intrs; 126 + unsigned long prev_intrs; 127 + }; 128 + 129 + struct xsk_app_stats { 130 + unsigned long rx_empty_polls; 131 + unsigned long fill_fail_polls; 132 + unsigned long copy_tx_sendtos; 133 + unsigned long tx_wakeup_sendtos; 134 + unsigned long opt_polls; 135 + unsigned long prev_rx_empty_polls; 136 + unsigned long prev_fill_fail_polls; 137 + unsigned long prev_copy_tx_sendtos; 138 + unsigned long prev_tx_wakeup_sendtos; 139 + unsigned long prev_opt_polls; 140 + }; 141 + 142 + struct xsk_umem_info { 143 + struct xsk_ring_prod fq; 144 + struct xsk_ring_cons cq; 145 + struct xsk_umem *umem; 146 + void *buffer; 147 + }; 148 + 149 + struct xsk_socket_info { 150 + struct xsk_ring_cons 
rx; 151 + struct xsk_ring_prod tx; 152 + struct xsk_umem_info *umem; 153 + struct xsk_socket *xsk; 154 + struct xsk_ring_stats ring_stats; 155 + struct xsk_app_stats app_stats; 156 + struct xsk_driver_stats drv_stats; 116 157 u32 outstanding_tx; 117 158 }; 118 159 ··· 202 173 return err; 203 174 204 175 if (optlen == sizeof(struct xdp_statistics)) { 205 - xsk->rx_dropped_npkts = stats.rx_dropped; 206 - xsk->rx_invalid_npkts = stats.rx_invalid_descs; 207 - xsk->tx_invalid_npkts = stats.tx_invalid_descs; 208 - xsk->rx_full_npkts = stats.rx_ring_full; 209 - xsk->rx_fill_empty_npkts = stats.rx_fill_ring_empty_descs; 210 - xsk->tx_empty_npkts = stats.tx_ring_empty_descs; 176 + xsk->ring_stats.rx_dropped_npkts = stats.rx_dropped; 177 + xsk->ring_stats.rx_invalid_npkts = stats.rx_invalid_descs; 178 + xsk->ring_stats.tx_invalid_npkts = stats.tx_invalid_descs; 179 + xsk->ring_stats.rx_full_npkts = stats.rx_ring_full; 180 + xsk->ring_stats.rx_fill_empty_npkts = stats.rx_fill_ring_empty_descs; 181 + xsk->ring_stats.tx_empty_npkts = stats.tx_ring_empty_descs; 211 182 return 0; 212 183 } 213 184 214 185 return -EINVAL; 186 + } 187 + 188 + static void dump_app_stats(long dt) 189 + { 190 + int i; 191 + 192 + for (i = 0; i < num_socks && xsks[i]; i++) { 193 + char *fmt = "%-18s %'-14.0f %'-14lu\n"; 194 + double rx_empty_polls_ps, fill_fail_polls_ps, copy_tx_sendtos_ps, 195 + tx_wakeup_sendtos_ps, opt_polls_ps; 196 + 197 + rx_empty_polls_ps = (xsks[i]->app_stats.rx_empty_polls - 198 + xsks[i]->app_stats.prev_rx_empty_polls) * 1000000000. / dt; 199 + fill_fail_polls_ps = (xsks[i]->app_stats.fill_fail_polls - 200 + xsks[i]->app_stats.prev_fill_fail_polls) * 1000000000. / dt; 201 + copy_tx_sendtos_ps = (xsks[i]->app_stats.copy_tx_sendtos - 202 + xsks[i]->app_stats.prev_copy_tx_sendtos) * 1000000000. / dt; 203 + tx_wakeup_sendtos_ps = (xsks[i]->app_stats.tx_wakeup_sendtos - 204 + xsks[i]->app_stats.prev_tx_wakeup_sendtos) 205 + * 1000000000. 
/ dt; 206 + opt_polls_ps = (xsks[i]->app_stats.opt_polls - 207 + xsks[i]->app_stats.prev_opt_polls) * 1000000000. / dt; 208 + 209 + printf("\n%-18s %-14s %-14s\n", "", "calls/s", "count"); 210 + printf(fmt, "rx empty polls", rx_empty_polls_ps, xsks[i]->app_stats.rx_empty_polls); 211 + printf(fmt, "fill fail polls", fill_fail_polls_ps, 212 + xsks[i]->app_stats.fill_fail_polls); 213 + printf(fmt, "copy tx sendtos", copy_tx_sendtos_ps, 214 + xsks[i]->app_stats.copy_tx_sendtos); 215 + printf(fmt, "tx wakeup sendtos", tx_wakeup_sendtos_ps, 216 + xsks[i]->app_stats.tx_wakeup_sendtos); 217 + printf(fmt, "opt polls", opt_polls_ps, xsks[i]->app_stats.opt_polls); 218 + 219 + xsks[i]->app_stats.prev_rx_empty_polls = xsks[i]->app_stats.rx_empty_polls; 220 + xsks[i]->app_stats.prev_fill_fail_polls = xsks[i]->app_stats.fill_fail_polls; 221 + xsks[i]->app_stats.prev_copy_tx_sendtos = xsks[i]->app_stats.copy_tx_sendtos; 222 + xsks[i]->app_stats.prev_tx_wakeup_sendtos = xsks[i]->app_stats.tx_wakeup_sendtos; 223 + xsks[i]->app_stats.prev_opt_polls = xsks[i]->app_stats.opt_polls; 224 + } 225 + } 226 + 227 + static bool get_interrupt_number(void) 228 + { 229 + FILE *f_int_proc; 230 + char line[4096]; 231 + bool found = false; 232 + 233 + f_int_proc = fopen("/proc/interrupts", "r"); 234 + if (f_int_proc == NULL) { 235 + printf("Failed to open /proc/interrupts.\n"); 236 + return found; 237 + } 238 + 239 + while (!feof(f_int_proc) && !found) { 240 + /* Make sure to read a full line at a time */ 241 + if (fgets(line, sizeof(line), f_int_proc) == NULL || 242 + line[strlen(line) - 1] != '\n') { 243 + printf("Error reading from interrupts file\n"); 244 + break; 245 + } 246 + 247 + /* Extract interrupt number from line */ 248 + if (strstr(line, opt_irq_str) != NULL) { 249 + irq_no = atoi(line); 250 + found = true; 251 + break; 252 + } 253 + } 254 + 255 + fclose(f_int_proc); 256 + 257 + return found; 258 + } 259 + 260 + static int get_irqs(void) 261 + { 262 + char count_path[PATH_MAX]; 263 + 
int total_intrs = -1; 264 + FILE *f_count_proc; 265 + char line[4096]; 266 + 267 + snprintf(count_path, sizeof(count_path), 268 + "/sys/kernel/irq/%i/per_cpu_count", irq_no); 269 + f_count_proc = fopen(count_path, "r"); 270 + if (f_count_proc == NULL) { 271 + printf("Failed to open %s\n", count_path); 272 + return total_intrs; 273 + } 274 + 275 + if (fgets(line, sizeof(line), f_count_proc) == NULL || 276 + line[strlen(line) - 1] != '\n') { 277 + printf("Error reading from %s\n", count_path); 278 + } else { 279 + static const char com[2] = ","; 280 + char *token; 281 + 282 + total_intrs = 0; 283 + token = strtok(line, com); 284 + while (token != NULL) { 285 + /* sum up interrupts across all cores */ 286 + total_intrs += atoi(token); 287 + token = strtok(NULL, com); 288 + } 289 + } 290 + 291 + fclose(f_count_proc); 292 + 293 + return total_intrs; 294 + } 295 + 296 + static void dump_driver_stats(long dt) 297 + { 298 + int i; 299 + 300 + for (i = 0; i < num_socks && xsks[i]; i++) { 301 + char *fmt = "%-18s %'-14.0f %'-14lu\n"; 302 + double intrs_ps; 303 + int n_ints = get_irqs(); 304 + 305 + if (n_ints < 0) { 306 + printf("error getting intr info for intr %i\n", irq_no); 307 + return; 308 + } 309 + xsks[i]->drv_stats.intrs = n_ints - irqs_at_init; 310 + 311 + intrs_ps = (xsks[i]->drv_stats.intrs - xsks[i]->drv_stats.prev_intrs) * 312 + 1000000000. 
/ dt; 313 + 314 + printf("\n%-18s %-14s %-14s\n", "", "intrs/s", "count"); 315 + printf(fmt, "irqs", intrs_ps, xsks[i]->drv_stats.intrs); 316 + 317 + xsks[i]->drv_stats.prev_intrs = xsks[i]->drv_stats.intrs; 318 + } 215 319 } 216 320 217 321 static void dump_stats(void) ··· 356 194 prev_time = now; 357 195 358 196 for (i = 0; i < num_socks && xsks[i]; i++) { 359 - char *fmt = "%-15s %'-11.0f %'-11lu\n"; 197 + char *fmt = "%-18s %'-14.0f %'-14lu\n"; 360 198 double rx_pps, tx_pps, dropped_pps, rx_invalid_pps, full_pps, fill_empty_pps, 361 199 tx_invalid_pps, tx_empty_pps; 362 200 363 - rx_pps = (xsks[i]->rx_npkts - xsks[i]->prev_rx_npkts) * 201 + rx_pps = (xsks[i]->ring_stats.rx_npkts - xsks[i]->ring_stats.prev_rx_npkts) * 364 202 1000000000. / dt; 365 - tx_pps = (xsks[i]->tx_npkts - xsks[i]->prev_tx_npkts) * 203 + tx_pps = (xsks[i]->ring_stats.tx_npkts - xsks[i]->ring_stats.prev_tx_npkts) * 366 204 1000000000. / dt; 367 205 368 206 printf("\n sock%d@", i); 369 207 print_benchmark(false); 370 208 printf("\n"); 371 209 372 - printf("%-15s %-11s %-11s %-11.2f\n", "", "pps", "pkts", 210 + printf("%-18s %-14s %-14s %-14.2f\n", "", "pps", "pkts", 373 211 dt / 1000000000.); 374 - printf(fmt, "rx", rx_pps, xsks[i]->rx_npkts); 375 - printf(fmt, "tx", tx_pps, xsks[i]->tx_npkts); 212 + printf(fmt, "rx", rx_pps, xsks[i]->ring_stats.rx_npkts); 213 + printf(fmt, "tx", tx_pps, xsks[i]->ring_stats.tx_npkts); 376 214 377 - xsks[i]->prev_rx_npkts = xsks[i]->rx_npkts; 378 - xsks[i]->prev_tx_npkts = xsks[i]->tx_npkts; 215 + xsks[i]->ring_stats.prev_rx_npkts = xsks[i]->ring_stats.rx_npkts; 216 + xsks[i]->ring_stats.prev_tx_npkts = xsks[i]->ring_stats.tx_npkts; 379 217 380 218 if (opt_extra_stats) { 381 219 if (!xsk_get_xdp_stats(xsk_socket__fd(xsks[i]->xsk), xsks[i])) { 382 - dropped_pps = (xsks[i]->rx_dropped_npkts - 383 - xsks[i]->prev_rx_dropped_npkts) * 1000000000. / dt; 384 - rx_invalid_pps = (xsks[i]->rx_invalid_npkts - 385 - xsks[i]->prev_rx_invalid_npkts) * 1000000000. 
/ dt; 386 - tx_invalid_pps = (xsks[i]->tx_invalid_npkts - 387 - xsks[i]->prev_tx_invalid_npkts) * 1000000000. / dt; 388 - full_pps = (xsks[i]->rx_full_npkts - 389 - xsks[i]->prev_rx_full_npkts) * 1000000000. / dt; 390 - fill_empty_pps = (xsks[i]->rx_fill_empty_npkts - 391 - xsks[i]->prev_rx_fill_empty_npkts) 392 - * 1000000000. / dt; 393 - tx_empty_pps = (xsks[i]->tx_empty_npkts - 394 - xsks[i]->prev_tx_empty_npkts) * 1000000000. / dt; 220 + dropped_pps = (xsks[i]->ring_stats.rx_dropped_npkts - 221 + xsks[i]->ring_stats.prev_rx_dropped_npkts) * 222 + 1000000000. / dt; 223 + rx_invalid_pps = (xsks[i]->ring_stats.rx_invalid_npkts - 224 + xsks[i]->ring_stats.prev_rx_invalid_npkts) * 225 + 1000000000. / dt; 226 + tx_invalid_pps = (xsks[i]->ring_stats.tx_invalid_npkts - 227 + xsks[i]->ring_stats.prev_tx_invalid_npkts) * 228 + 1000000000. / dt; 229 + full_pps = (xsks[i]->ring_stats.rx_full_npkts - 230 + xsks[i]->ring_stats.prev_rx_full_npkts) * 231 + 1000000000. / dt; 232 + fill_empty_pps = (xsks[i]->ring_stats.rx_fill_empty_npkts - 233 + xsks[i]->ring_stats.prev_rx_fill_empty_npkts) * 234 + 1000000000. / dt; 235 + tx_empty_pps = (xsks[i]->ring_stats.tx_empty_npkts - 236 + xsks[i]->ring_stats.prev_tx_empty_npkts) * 237 + 1000000000. 
/ dt; 395 238 396 239 printf(fmt, "rx dropped", dropped_pps, 397 - xsks[i]->rx_dropped_npkts); 240 + xsks[i]->ring_stats.rx_dropped_npkts); 398 241 printf(fmt, "rx invalid", rx_invalid_pps, 399 - xsks[i]->rx_invalid_npkts); 242 + xsks[i]->ring_stats.rx_invalid_npkts); 400 243 printf(fmt, "tx invalid", tx_invalid_pps, 401 - xsks[i]->tx_invalid_npkts); 244 + xsks[i]->ring_stats.tx_invalid_npkts); 402 245 printf(fmt, "rx queue full", full_pps, 403 - xsks[i]->rx_full_npkts); 246 + xsks[i]->ring_stats.rx_full_npkts); 404 247 printf(fmt, "fill ring empty", fill_empty_pps, 405 - xsks[i]->rx_fill_empty_npkts); 248 + xsks[i]->ring_stats.rx_fill_empty_npkts); 406 249 printf(fmt, "tx ring empty", tx_empty_pps, 407 - xsks[i]->tx_empty_npkts); 250 + xsks[i]->ring_stats.tx_empty_npkts); 408 251 409 - xsks[i]->prev_rx_dropped_npkts = xsks[i]->rx_dropped_npkts; 410 - xsks[i]->prev_rx_invalid_npkts = xsks[i]->rx_invalid_npkts; 411 - xsks[i]->prev_tx_invalid_npkts = xsks[i]->tx_invalid_npkts; 412 - xsks[i]->prev_rx_full_npkts = xsks[i]->rx_full_npkts; 413 - xsks[i]->prev_rx_fill_empty_npkts = xsks[i]->rx_fill_empty_npkts; 414 - xsks[i]->prev_tx_empty_npkts = xsks[i]->tx_empty_npkts; 252 + xsks[i]->ring_stats.prev_rx_dropped_npkts = 253 + xsks[i]->ring_stats.rx_dropped_npkts; 254 + xsks[i]->ring_stats.prev_rx_invalid_npkts = 255 + xsks[i]->ring_stats.rx_invalid_npkts; 256 + xsks[i]->ring_stats.prev_tx_invalid_npkts = 257 + xsks[i]->ring_stats.tx_invalid_npkts; 258 + xsks[i]->ring_stats.prev_rx_full_npkts = 259 + xsks[i]->ring_stats.rx_full_npkts; 260 + xsks[i]->ring_stats.prev_rx_fill_empty_npkts = 261 + xsks[i]->ring_stats.rx_fill_empty_npkts; 262 + xsks[i]->ring_stats.prev_tx_empty_npkts = 263 + xsks[i]->ring_stats.tx_empty_npkts; 415 264 } else { 416 265 printf("%-15s\n", "Error retrieving extra stats"); 417 266 } 418 267 } 419 268 } 269 + 270 + if (opt_app_stats) 271 + dump_app_stats(dt); 272 + if (irq_no) 273 + dump_driver_stats(dt); 420 274 } 421 275 422 276 static bool 
is_benchmark_done(void) ··· 871 693 if (ret) 872 694 exit_with_error(-ret); 873 695 696 + xsk->app_stats.rx_empty_polls = 0; 697 + xsk->app_stats.fill_fail_polls = 0; 698 + xsk->app_stats.copy_tx_sendtos = 0; 699 + xsk->app_stats.tx_wakeup_sendtos = 0; 700 + xsk->app_stats.opt_polls = 0; 701 + xsk->app_stats.prev_rx_empty_polls = 0; 702 + xsk->app_stats.prev_fill_fail_polls = 0; 703 + xsk->app_stats.prev_copy_tx_sendtos = 0; 704 + xsk->app_stats.prev_tx_wakeup_sendtos = 0; 705 + xsk->app_stats.prev_opt_polls = 0; 706 + 874 707 return xsk; 875 708 } 876 709 ··· 909 720 {"tx-pkt-pattern", required_argument, 0, 'P'}, 910 721 {"extra-stats", no_argument, 0, 'x'}, 911 722 {"quiet", no_argument, 0, 'Q'}, 723 + {"app-stats", no_argument, 0, 'a'}, 724 + {"irq-string", no_argument, 0, 'I'}, 912 725 {0, 0, 0, 0} 913 726 }; 914 727 ··· 947 756 " -P, --tx-pkt-pattern=nPacket fill pattern. Default: 0x%x\n" 948 757 " -x, --extra-stats Display extra statistics.\n" 949 758 " -Q, --quiet Do not display any stats.\n" 759 + " -a, --app-stats Display application (syscall) statistics.\n" 760 + " -I, --irq-string Display driver interrupt statistics for interface associated with irq-string.\n" 950 761 "\n"; 951 762 fprintf(stderr, str, prog, XSK_UMEM__DEFAULT_FRAME_SIZE, 952 763 opt_batch_size, MIN_PKT_SIZE, MIN_PKT_SIZE, ··· 964 771 opterr = 0; 965 772 966 773 for (;;) { 967 - c = getopt_long(argc, argv, "Frtli:q:pSNn:czf:muMd:b:C:s:P:xQ", 774 + c = getopt_long(argc, argv, "Frtli:q:pSNn:czf:muMd:b:C:s:P:xQaI:", 968 775 long_options, &option_index); 969 776 if (c == -1) 970 777 break; ··· 1051 858 case 'Q': 1052 859 opt_quiet = 1; 1053 860 break; 861 + case 'a': 862 + opt_app_stats = 1; 863 + break; 864 + case 'I': 865 + opt_irq_str = optarg; 866 + if (get_interrupt_number()) 867 + irqs_at_init = get_irqs(); 868 + if (irqs_at_init < 0) { 869 + fprintf(stderr, "ERROR: Failed to get irqs for %s\n", opt_irq_str); 870 + usage(basename(argv[0])); 871 + } 872 + 873 + break; 1054 874 default: 
1055 875 usage(basename(argv[0])); 1056 876 } ··· 1114 908 * is driven by the NAPI loop. So as an optimization, we do not have to call 1115 909 * sendto() all the time in zero-copy mode for l2fwd. 1116 910 */ 1117 - if (opt_xdp_bind_flags & XDP_COPY) 911 + if (opt_xdp_bind_flags & XDP_COPY) { 912 + xsk->app_stats.copy_tx_sendtos++; 1118 913 kick_tx(xsk); 914 + } 1119 915 1120 916 ndescs = (xsk->outstanding_tx > opt_batch_size) ? opt_batch_size : 1121 917 xsk->outstanding_tx; ··· 1132 924 while (ret != rcvd) { 1133 925 if (ret < 0) 1134 926 exit_with_error(-ret); 1135 - if (xsk_ring_prod__needs_wakeup(&umem->fq)) 927 + if (xsk_ring_prod__needs_wakeup(&umem->fq)) { 928 + xsk->app_stats.fill_fail_polls++; 1136 929 ret = poll(fds, num_socks, opt_timeout); 930 + } 1137 931 ret = xsk_ring_prod__reserve(&umem->fq, rcvd, &idx_fq); 1138 932 } 1139 933 ··· 1146 936 xsk_ring_prod__submit(&xsk->umem->fq, rcvd); 1147 937 xsk_ring_cons__release(&xsk->umem->cq, rcvd); 1148 938 xsk->outstanding_tx -= rcvd; 1149 - xsk->tx_npkts += rcvd; 939 + xsk->ring_stats.tx_npkts += rcvd; 1150 940 } 1151 941 } 1152 942 ··· 1159 949 if (!xsk->outstanding_tx) 1160 950 return; 1161 951 1162 - if (!opt_need_wakeup || xsk_ring_prod__needs_wakeup(&xsk->tx)) 952 + if (!opt_need_wakeup || xsk_ring_prod__needs_wakeup(&xsk->tx)) { 953 + xsk->app_stats.tx_wakeup_sendtos++; 1163 954 kick_tx(xsk); 955 + } 1164 956 1165 957 rcvd = xsk_ring_cons__peek(&xsk->umem->cq, batch_size, &idx); 1166 958 if (rcvd > 0) { 1167 959 xsk_ring_cons__release(&xsk->umem->cq, rcvd); 1168 960 xsk->outstanding_tx -= rcvd; 1169 - xsk->tx_npkts += rcvd; 961 + xsk->ring_stats.tx_npkts += rcvd; 1170 962 } 1171 963 } 1172 964 ··· 1180 968 1181 969 rcvd = xsk_ring_cons__peek(&xsk->rx, opt_batch_size, &idx_rx); 1182 970 if (!rcvd) { 1183 - if (xsk_ring_prod__needs_wakeup(&xsk->umem->fq)) 971 + if (xsk_ring_prod__needs_wakeup(&xsk->umem->fq)) { 972 + xsk->app_stats.rx_empty_polls++; 1184 973 ret = poll(fds, num_socks, opt_timeout); 974 + 
} 1185 975 return; 1186 976 } 1187 977 ··· 1191 977 while (ret != rcvd) { 1192 978 if (ret < 0) 1193 979 exit_with_error(-ret); 1194 - if (xsk_ring_prod__needs_wakeup(&xsk->umem->fq)) 980 + if (xsk_ring_prod__needs_wakeup(&xsk->umem->fq)) { 981 + xsk->app_stats.fill_fail_polls++; 1195 982 ret = poll(fds, num_socks, opt_timeout); 983 + } 1196 984 ret = xsk_ring_prod__reserve(&xsk->umem->fq, rcvd, &idx_fq); 1197 985 } 1198 986 ··· 1212 996 1213 997 xsk_ring_prod__submit(&xsk->umem->fq, rcvd); 1214 998 xsk_ring_cons__release(&xsk->rx, rcvd); 1215 - xsk->rx_npkts += rcvd; 999 + xsk->ring_stats.rx_npkts += rcvd; 1216 1000 } 1217 1001 1218 1002 static void rx_drop_all(void) ··· 1227 1011 1228 1012 for (;;) { 1229 1013 if (opt_poll) { 1014 + for (i = 0; i < num_socks; i++) 1015 + xsks[i]->app_stats.opt_polls++; 1230 1016 ret = poll(fds, num_socks, opt_timeout); 1231 1017 if (ret <= 0) 1232 1018 continue; ··· 1309 1091 int batch_size = get_batch_size(pkt_cnt); 1310 1092 1311 1093 if (opt_poll) { 1094 + for (i = 0; i < num_socks; i++) 1095 + xsks[i]->app_stats.opt_polls++; 1312 1096 ret = poll(fds, num_socks, opt_timeout); 1313 1097 if (ret <= 0) 1314 1098 continue; ··· 1342 1122 1343 1123 rcvd = xsk_ring_cons__peek(&xsk->rx, opt_batch_size, &idx_rx); 1344 1124 if (!rcvd) { 1345 - if (xsk_ring_prod__needs_wakeup(&xsk->umem->fq)) 1125 + if (xsk_ring_prod__needs_wakeup(&xsk->umem->fq)) { 1126 + xsk->app_stats.rx_empty_polls++; 1346 1127 ret = poll(fds, num_socks, opt_timeout); 1128 + } 1347 1129 return; 1348 1130 } 1349 1131 ··· 1354 1132 if (ret < 0) 1355 1133 exit_with_error(-ret); 1356 1134 complete_tx_l2fwd(xsk, fds); 1357 - if (xsk_ring_prod__needs_wakeup(&xsk->tx)) 1135 + if (xsk_ring_prod__needs_wakeup(&xsk->tx)) { 1136 + xsk->app_stats.tx_wakeup_sendtos++; 1358 1137 kick_tx(xsk); 1138 + } 1359 1139 ret = xsk_ring_prod__reserve(&xsk->tx, rcvd, &idx_tx); 1360 1140 } 1361 1141 ··· 1379 1155 xsk_ring_prod__submit(&xsk->tx, rcvd); 1380 1156 xsk_ring_cons__release(&xsk->rx, 
rcvd); 1381 1157 1382 - xsk->rx_npkts += rcvd; 1158 + xsk->ring_stats.rx_npkts += rcvd; 1383 1159 xsk->outstanding_tx += rcvd; 1384 1160 } 1385 1161 ··· 1395 1171 1396 1172 for (;;) { 1397 1173 if (opt_poll) { 1174 + for (i = 0; i < num_socks; i++) 1175 + xsks[i]->app_stats.opt_polls++; 1398 1176 ret = poll(fds, num_socks, opt_timeout); 1399 1177 if (ret <= 0) 1400 1178 continue;
+86 -13
tools/include/uapi/linux/bpf.h
··· 356 356 #define BPF_F_SLEEPABLE (1U << 4) 357 357 358 358 /* When BPF ldimm64's insn[0].src_reg != 0 then this can have 359 - * two extensions: 359 + * the following extensions: 360 360 * 361 - * insn[0].src_reg: BPF_PSEUDO_MAP_FD BPF_PSEUDO_MAP_VALUE 362 - * insn[0].imm: map fd map fd 363 - * insn[1].imm: 0 offset into value 364 - * insn[0].off: 0 0 365 - * insn[1].off: 0 0 366 - * ldimm64 rewrite: address of map address of map[0]+offset 367 - * verifier type: CONST_PTR_TO_MAP PTR_TO_MAP_VALUE 361 + * insn[0].src_reg: BPF_PSEUDO_MAP_FD 362 + * insn[0].imm: map fd 363 + * insn[1].imm: 0 364 + * insn[0].off: 0 365 + * insn[1].off: 0 366 + * ldimm64 rewrite: address of map 367 + * verifier type: CONST_PTR_TO_MAP 368 368 */ 369 369 #define BPF_PSEUDO_MAP_FD 1 370 + /* insn[0].src_reg: BPF_PSEUDO_MAP_VALUE 371 + * insn[0].imm: map fd 372 + * insn[1].imm: offset into value 373 + * insn[0].off: 0 374 + * insn[1].off: 0 375 + * ldimm64 rewrite: address of map[0]+offset 376 + * verifier type: PTR_TO_MAP_VALUE 377 + */ 370 378 #define BPF_PSEUDO_MAP_VALUE 2 379 + /* insn[0].src_reg: BPF_PSEUDO_BTF_ID 380 + * insn[0].imm: kernel btd id of VAR 381 + * insn[1].imm: 0 382 + * insn[0].off: 0 383 + * insn[1].off: 0 384 + * ldimm64 rewrite: address of the kernel variable 385 + * verifier type: PTR_TO_BTF_ID or PTR_TO_MEM, depending on whether the var 386 + * is struct/union. 387 + */ 388 + #define BPF_PSEUDO_BTF_ID 3 371 389 372 390 /* when bpf_call->src_reg == BPF_PSEUDO_CALL, bpf_call->imm == pc-relative 373 391 * offset to another bpf function ··· 435 417 436 418 /* Share perf_event among processes */ 437 419 BPF_F_PRESERVE_ELEMS = (1U << 11), 420 + 421 + /* Create a map that is suitable to be an inner map with dynamic max entries */ 422 + BPF_F_INNER_MAP = (1U << 12), 438 423 }; 439 424 440 425 /* Flags for BPF_PROG_QUERY. 
*/ ··· 1701 1680 * **TCP_CONGESTION**, **TCP_BPF_IW**, 1702 1681 * **TCP_BPF_SNDCWND_CLAMP**, **TCP_SAVE_SYN**, 1703 1682 * **TCP_KEEPIDLE**, **TCP_KEEPINTVL**, **TCP_KEEPCNT**, 1704 - * **TCP_SYNCNT**, **TCP_USER_TIMEOUT**. 1683 + * **TCP_SYNCNT**, **TCP_USER_TIMEOUT**, **TCP_NOTSENT_LOWAT**. 1705 1684 * * **IPPROTO_IP**, which supports *optname* **IP_TOS**. 1706 1685 * * **IPPROTO_IPV6**, which supports *optname* **IPV6_TCLASS**. 1707 1686 * Return ··· 2256 2235 * Description 2257 2236 * This helper is used in programs implementing policies at the 2258 2237 * skb socket level. If the sk_buff *skb* is allowed to pass (i.e. 2259 - * if the verdeict eBPF program returns **SK_PASS**), redirect it 2238 + * if the verdict eBPF program returns **SK_PASS**), redirect it 2260 2239 * to the socket referenced by *map* (of type 2261 2240 * **BPF_MAP_TYPE_SOCKHASH**) using hash *key*. Both ingress and 2262 2241 * egress interfaces can be used for redirection. The ··· 3682 3661 * Redirect the packet to another net device of index *ifindex* 3683 3662 * and fill in L2 addresses from neighboring subsystem. This helper 3684 3663 * is somewhat similar to **bpf_redirect**\ (), except that it 3685 - * fills in e.g. MAC addresses based on the L3 information from 3686 - * the packet. This helper is supported for IPv4 and IPv6 protocols. 3664 + * populates L2 addresses as well, meaning, internally, the helper 3665 + * performs a FIB lookup based on the skb's networking header to 3666 + * get the address of the next hop and then relies on the neighbor 3667 + * lookup for the L2 address of the nexthop. 3668 + * 3687 3669 * The *flags* argument is reserved and must be 0. The helper is 3688 - * currently only supported for tc BPF program types. 3670 + * currently only supported for tc BPF program types, and enabled 3671 + * for IPv4 and IPv6 protocols. 3672 + * Return 3673 + * The helper returns **TC_ACT_REDIRECT** on success or 3674 + * **TC_ACT_SHOT** on error. 
3675 + * 3676 + * void *bpf_per_cpu_ptr(const void *percpu_ptr, u32 cpu) 3677 + * Description 3678 + * Take a pointer to a percpu ksym, *percpu_ptr*, and return a 3679 + * pointer to the percpu kernel variable on *cpu*. A ksym is an 3680 + * extern variable decorated with '__ksym'. For ksym, there is a 3681 + * global var (either static or global) defined of the same name 3682 + * in the kernel. The ksym is percpu if the global var is percpu. 3683 + * The returned pointer points to the global percpu var on *cpu*. 3684 + * 3685 + * bpf_per_cpu_ptr() has the same semantic as per_cpu_ptr() in the 3686 + * kernel, except that bpf_per_cpu_ptr() may return NULL. This 3687 + * happens if *cpu* is larger than nr_cpu_ids. The caller of 3688 + * bpf_per_cpu_ptr() must check the returned value. 3689 + * Return 3690 + * A pointer pointing to the kernel percpu variable on *cpu*, or 3691 + * NULL, if *cpu* is invalid. 3692 + * 3693 + * void *bpf_this_cpu_ptr(const void *percpu_ptr) 3694 + * Description 3695 + * Take a pointer to a percpu ksym, *percpu_ptr*, and return a 3696 + * pointer to the percpu kernel variable on this cpu. See the 3697 + * description of 'ksym' in **bpf_per_cpu_ptr**\ (). 3698 + * 3699 + * bpf_this_cpu_ptr() has the same semantic as this_cpu_ptr() in 3700 + * the kernel. Different from **bpf_per_cpu_ptr**\ (), it would 3701 + * never return NULL. 3702 + * Return 3703 + * A pointer pointing to the kernel percpu variable on this cpu. 3704 + * 3705 + * long bpf_redirect_peer(u32 ifindex, u64 flags) 3706 + * Description 3707 + * Redirect the packet to another net device of index *ifindex*. 3708 + * This helper is somewhat similar to **bpf_redirect**\ (), except 3709 + * that the redirection happens to the *ifindex*' peer device and 3710 + * the netns switch takes place from ingress to ingress without 3711 + * going through the CPU's backlog queue. 3712 + * 3713 + * The *flags* argument is reserved and must be 0. 
The helper is 3714 + * currently only supported for tc BPF program types at the ingress 3715 + * hook and for veth device types. The peer device must reside in a 3716 + * different network namespace. 3689 3717 * Return 3690 3718 * The helper returns **TC_ACT_REDIRECT** on success or 3691 3719 * **TC_ACT_SHOT** on error. ··· 3893 3823 FN(seq_printf_btf), \ 3894 3824 FN(skb_cgroup_classid), \ 3895 3825 FN(redirect_neigh), \ 3826 + FN(bpf_per_cpu_ptr), \ 3827 + FN(bpf_this_cpu_ptr), \ 3828 + FN(redirect_peer), \ 3896 3829 /* */ 3897 3830 3898 3831 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
+290 -60
tools/lib/bpf/libbpf.c
··· 390 390 } kcfg; 391 391 struct { 392 392 unsigned long long addr; 393 + 394 + /* target btf_id of the corresponding kernel var. */ 395 + int vmlinux_btf_id; 396 + 397 + /* local btf_id of the ksym extern's type. */ 398 + __u32 type_id; 393 399 } ksym; 394 400 }; 395 401 }; ··· 2528 2522 { 2529 2523 bool need_vmlinux_btf = false; 2530 2524 struct bpf_program *prog; 2531 - int err; 2525 + int i, err; 2532 2526 2533 2527 /* CO-RE relocations need kernel BTF */ 2534 2528 if (obj->btf_ext && obj->btf_ext->core_relo_info.len) 2535 2529 need_vmlinux_btf = true; 2530 + 2531 + /* Support for typed ksyms needs kernel BTF */ 2532 + for (i = 0; i < obj->nr_extern; i++) { 2533 + const struct extern_desc *ext; 2534 + 2535 + ext = &obj->externs[i]; 2536 + if (ext->type == EXT_KSYM && ext->ksym.type_id) { 2537 + need_vmlinux_btf = true; 2538 + break; 2539 + } 2540 + } 2536 2541 2537 2542 bpf_object__for_each_program(prog, obj) { 2538 2543 if (!prog->load) ··· 3173 3156 return -ENOTSUP; 3174 3157 } 3175 3158 } else if (strcmp(sec_name, KSYMS_SEC) == 0) { 3176 - const struct btf_type *vt; 3177 - 3178 3159 ksym_sec = sec; 3179 3160 ext->type = EXT_KSYM; 3180 - 3181 - vt = skip_mods_and_typedefs(obj->btf, t->type, NULL); 3182 - if (!btf_is_void(vt)) { 3183 - pr_warn("extern (ksym) '%s' is not typeless (void)\n", ext_name); 3184 - return -ENOTSUP; 3185 - } 3161 + skip_mods_and_typedefs(obj->btf, t->type, 3162 + &ext->ksym.type_id); 3186 3163 } else { 3187 3164 pr_warn("unrecognized extern section '%s'\n", sec_name); 3188 3165 return -ENOTSUP; ··· 4203 4192 return 0; 4204 4193 } 4205 4194 4195 + static int init_map_slots(struct bpf_map *map) 4196 + { 4197 + const struct bpf_map *targ_map; 4198 + unsigned int i; 4199 + int fd, err; 4200 + 4201 + for (i = 0; i < map->init_slots_sz; i++) { 4202 + if (!map->init_slots[i]) 4203 + continue; 4204 + 4205 + targ_map = map->init_slots[i]; 4206 + fd = bpf_map__fd(targ_map); 4207 + err = bpf_map_update_elem(map->fd, &i, &fd, 0); 4208 + if (err) 
{ 4209 + err = -errno; 4210 + pr_warn("map '%s': failed to initialize slot [%d] to map '%s' fd=%d: %d\n", 4211 + map->name, i, targ_map->name, 4212 + fd, err); 4213 + return err; 4214 + } 4215 + pr_debug("map '%s': slot [%d] set to map '%s' fd=%d\n", 4216 + map->name, i, targ_map->name, fd); 4217 + } 4218 + 4219 + zfree(&map->init_slots); 4220 + map->init_slots_sz = 0; 4221 + 4222 + return 0; 4223 + } 4224 + 4206 4225 static int 4207 4226 bpf_object__create_maps(struct bpf_object *obj) 4208 4227 { ··· 4256 4215 if (map->fd >= 0) { 4257 4216 pr_debug("map '%s': skipping creation (preset fd=%d)\n", 4258 4217 map->name, map->fd); 4259 - continue; 4260 - } 4261 - 4262 - err = bpf_object__create_map(obj, map); 4263 - if (err) 4264 - goto err_out; 4265 - 4266 - pr_debug("map '%s': created successfully, fd=%d\n", map->name, 4267 - map->fd); 4268 - 4269 - if (bpf_map__is_internal(map)) { 4270 - err = bpf_object__populate_internal_map(obj, map); 4271 - if (err < 0) { 4272 - zclose(map->fd); 4218 + } else { 4219 + err = bpf_object__create_map(obj, map); 4220 + if (err) 4273 4221 goto err_out; 4274 - } 4275 - } 4276 4222 4277 - if (map->init_slots_sz) { 4278 - for (j = 0; j < map->init_slots_sz; j++) { 4279 - const struct bpf_map *targ_map; 4280 - int fd; 4223 + pr_debug("map '%s': created successfully, fd=%d\n", 4224 + map->name, map->fd); 4281 4225 4282 - if (!map->init_slots[j]) 4283 - continue; 4284 - 4285 - targ_map = map->init_slots[j]; 4286 - fd = bpf_map__fd(targ_map); 4287 - err = bpf_map_update_elem(map->fd, &j, &fd, 0); 4288 - if (err) { 4289 - err = -errno; 4290 - pr_warn("map '%s': failed to initialize slot [%d] to map '%s' fd=%d: %d\n", 4291 - map->name, j, targ_map->name, 4292 - fd, err); 4226 + if (bpf_map__is_internal(map)) { 4227 + err = bpf_object__populate_internal_map(obj, map); 4228 + if (err < 0) { 4229 + zclose(map->fd); 4293 4230 goto err_out; 4294 4231 } 4295 - pr_debug("map '%s': slot [%d] set to map '%s' fd=%d\n", 4296 - map->name, j, 
targ_map->name, fd); 4297 4232 } 4298 - zfree(&map->init_slots); 4299 - map->init_slots_sz = 0; 4233 + 4234 + if (map->init_slots_sz) { 4235 + err = init_map_slots(map); 4236 + if (err < 0) { 4237 + zclose(map->fd); 4238 + goto err_out; 4239 + } 4240 + } 4300 4241 } 4301 4242 4302 4243 if (map->pin_path && !map->pinned) { ··· 5040 5017 static int bpf_core_calc_field_relo(const struct bpf_program *prog, 5041 5018 const struct bpf_core_relo *relo, 5042 5019 const struct bpf_core_spec *spec, 5043 - __u32 *val, bool *validate) 5020 + __u32 *val, __u32 *field_sz, __u32 *type_id, 5021 + bool *validate) 5044 5022 { 5045 5023 const struct bpf_core_accessor *acc; 5046 5024 const struct btf_type *t; 5047 - __u32 byte_off, byte_sz, bit_off, bit_sz; 5025 + __u32 byte_off, byte_sz, bit_off, bit_sz, field_type_id; 5048 5026 const struct btf_member *m; 5049 5027 const struct btf_type *mt; 5050 5028 bool bitfield; 5051 5029 __s64 sz; 5030 + 5031 + *field_sz = 0; 5052 5032 5053 5033 if (relo->kind == BPF_FIELD_EXISTS) { 5054 5034 *val = spec ? 
1 : 0; ··· 5068 5042 if (!acc->name) { 5069 5043 if (relo->kind == BPF_FIELD_BYTE_OFFSET) { 5070 5044 *val = spec->bit_offset / 8; 5045 + /* remember field size for load/store mem size */ 5046 + sz = btf__resolve_size(spec->btf, acc->type_id); 5047 + if (sz < 0) 5048 + return -EINVAL; 5049 + *field_sz = sz; 5050 + *type_id = acc->type_id; 5071 5051 } else if (relo->kind == BPF_FIELD_BYTE_SIZE) { 5072 5052 sz = btf__resolve_size(spec->btf, acc->type_id); 5073 5053 if (sz < 0) ··· 5090 5058 } 5091 5059 5092 5060 m = btf_members(t) + acc->idx; 5093 - mt = skip_mods_and_typedefs(spec->btf, m->type, NULL); 5061 + mt = skip_mods_and_typedefs(spec->btf, m->type, &field_type_id); 5094 5062 bit_off = spec->bit_offset; 5095 5063 bit_sz = btf_member_bitfield_size(t, acc->idx); 5096 5064 ··· 5110 5078 byte_off = bit_off / 8 / byte_sz * byte_sz; 5111 5079 } 5112 5080 } else { 5113 - sz = btf__resolve_size(spec->btf, m->type); 5081 + sz = btf__resolve_size(spec->btf, field_type_id); 5114 5082 if (sz < 0) 5115 5083 return -EINVAL; 5116 5084 byte_sz = sz; ··· 5128 5096 switch (relo->kind) { 5129 5097 case BPF_FIELD_BYTE_OFFSET: 5130 5098 *val = byte_off; 5099 + if (!bitfield) { 5100 + *field_sz = byte_sz; 5101 + *type_id = field_type_id; 5102 + } 5131 5103 break; 5132 5104 case BPF_FIELD_BYTE_SIZE: 5133 5105 *val = byte_sz; ··· 5232 5196 bool poison; 5233 5197 /* some relocations can't be validated against orig_val */ 5234 5198 bool validate; 5199 + /* for field byte offset relocations or the forms: 5200 + * *(T *)(rX + <off>) = rY 5201 + * rX = *(T *)(rY + <off>), 5202 + * we remember original and resolved field size to adjust direct 5203 + * memory loads of pointers and integers; this is necessary for 32-bit 5204 + * host kernel architectures, but also allows to automatically 5205 + * relocate fields that were resized from, e.g., u32 to u64, etc. 
5206 + */ 5207 + bool fail_memsz_adjust; 5208 + __u32 orig_sz; 5209 + __u32 orig_type_id; 5210 + __u32 new_sz; 5211 + __u32 new_type_id; 5235 5212 }; 5236 5213 5237 5214 /* Calculate original and target relocation values, given local and target ··· 5266 5217 res->new_val = 0; 5267 5218 res->poison = false; 5268 5219 res->validate = true; 5220 + res->fail_memsz_adjust = false; 5221 + res->orig_sz = res->new_sz = 0; 5222 + res->orig_type_id = res->new_type_id = 0; 5269 5223 5270 5224 if (core_relo_is_field_based(relo->kind)) { 5271 - err = bpf_core_calc_field_relo(prog, relo, local_spec, &res->orig_val, &res->validate); 5272 - err = err ?: bpf_core_calc_field_relo(prog, relo, targ_spec, &res->new_val, NULL); 5225 + err = bpf_core_calc_field_relo(prog, relo, local_spec, 5226 + &res->orig_val, &res->orig_sz, 5227 + &res->orig_type_id, &res->validate); 5228 + err = err ?: bpf_core_calc_field_relo(prog, relo, targ_spec, 5229 + &res->new_val, &res->new_sz, 5230 + &res->new_type_id, NULL); 5231 + if (err) 5232 + goto done; 5233 + /* Validate if it's safe to adjust load/store memory size. 5234 + * Adjustments are performed only if original and new memory 5235 + * sizes differ. 5236 + */ 5237 + res->fail_memsz_adjust = false; 5238 + if (res->orig_sz != res->new_sz) { 5239 + const struct btf_type *orig_t, *new_t; 5240 + 5241 + orig_t = btf__type_by_id(local_spec->btf, res->orig_type_id); 5242 + new_t = btf__type_by_id(targ_spec->btf, res->new_type_id); 5243 + 5244 + /* There are two use cases in which it's safe to 5245 + * adjust load/store's mem size: 5246 + * - reading a 32-bit kernel pointer, while on BPF 5247 + * size pointers are always 64-bit; in this case 5248 + * it's safe to "downsize" instruction size due to 5249 + * pointer being treated as unsigned integer with 5250 + * zero-extended upper 32-bits; 5251 + * - reading unsigned integers, again due to 5252 + * zero-extension is preserving the value correctly. 
5253 + * 5254 + * In all other cases it's incorrect to attempt to 5255 + * load/store field because read value will be 5256 + * incorrect, so we poison relocated instruction. 5257 + */ 5258 + if (btf_is_ptr(orig_t) && btf_is_ptr(new_t)) 5259 + goto done; 5260 + if (btf_is_int(orig_t) && btf_is_int(new_t) && 5261 + btf_int_encoding(orig_t) != BTF_INT_SIGNED && 5262 + btf_int_encoding(new_t) != BTF_INT_SIGNED) 5263 + goto done; 5264 + 5265 + /* mark as invalid mem size adjustment, but this will 5266 + * only be checked for LDX/STX/ST insns 5267 + */ 5268 + res->fail_memsz_adjust = true; 5269 + } 5273 5270 } else if (core_relo_is_type_based(relo->kind)) { 5274 5271 err = bpf_core_calc_type_relo(relo, local_spec, &res->orig_val); 5275 5272 err = err ?: bpf_core_calc_type_relo(relo, targ_spec, &res->new_val); ··· 5324 5229 err = err ?: bpf_core_calc_enumval_relo(relo, targ_spec, &res->new_val); 5325 5230 } 5326 5231 5232 + done: 5327 5233 if (err == -EUCLEAN) { 5328 5234 /* EUCLEAN is used to signal instruction poisoning request */ 5329 5235 res->poison = true; ··· 5364 5268 return insn->code == (BPF_LD | BPF_IMM | BPF_DW); 5365 5269 } 5366 5270 5271 + static int insn_bpf_size_to_bytes(struct bpf_insn *insn) 5272 + { 5273 + switch (BPF_SIZE(insn->code)) { 5274 + case BPF_DW: return 8; 5275 + case BPF_W: return 4; 5276 + case BPF_H: return 2; 5277 + case BPF_B: return 1; 5278 + default: return -1; 5279 + } 5280 + } 5281 + 5282 + static int insn_bytes_to_bpf_size(__u32 sz) 5283 + { 5284 + switch (sz) { 5285 + case 8: return BPF_DW; 5286 + case 4: return BPF_W; 5287 + case 2: return BPF_H; 5288 + case 1: return BPF_B; 5289 + default: return -1; 5290 + } 5291 + } 5292 + 5367 5293 /* 5368 5294 * Patch relocatable BPF instruction. 5369 5295 * ··· 5395 5277 * spec, and is checked before patching instruction. If actual insn->imm value 5396 5278 * is wrong, bail out with error. 
5397 5279 * 5398 - * Currently three kinds of BPF instructions are supported: 5280 + * Currently supported classes of BPF instruction are: 5399 5281 * 1. rX = <imm> (assignment with immediate operand); 5400 5282 * 2. rX += <imm> (arithmetic operations with immediate operand); 5401 - * 3. rX = <imm64> (load with 64-bit immediate value). 5283 + * 3. rX = <imm64> (load with 64-bit immediate value); 5284 + * 4. rX = *(T *)(rY + <off>), where T is one of {u8, u16, u32, u64}; 5285 + * 5. *(T *)(rX + <off>) = rY, where T is one of {u8, u16, u32, u64}; 5286 + * 6. *(T *)(rX + <off>) = <imm>, where T is one of {u8, u16, u32, u64}. 5402 5287 */ 5403 5288 static int bpf_core_patch_insn(struct bpf_program *prog, 5404 5289 const struct bpf_core_relo *relo, ··· 5425 5304 class = BPF_CLASS(insn->code); 5426 5305 5427 5306 if (res->poison) { 5307 + poison: 5428 5308 /* poison second part of ldimm64 to avoid confusing error from 5429 5309 * verifier about "unknown opcode 00" 5430 5310 */ ··· 5468 5346 prog->name, relo_idx, insn_idx, new_val); 5469 5347 return -ERANGE; 5470 5348 } 5349 + if (res->fail_memsz_adjust) { 5350 + pr_warn("prog '%s': relo #%d: insn #%d (LDX/ST/STX) accesses field incorrectly. 
" 5351 + "Make sure you are accessing pointers, unsigned integers, or fields of matching type and size.\n", 5352 + prog->name, relo_idx, insn_idx); 5353 + goto poison; 5354 + } 5355 + 5471 5356 orig_val = insn->off; 5472 5357 insn->off = new_val; 5473 5358 pr_debug("prog '%s': relo #%d: patched insn #%d (LDX/ST/STX) off %u -> %u\n", 5474 5359 prog->name, relo_idx, insn_idx, orig_val, new_val); 5360 + 5361 + if (res->new_sz != res->orig_sz) { 5362 + int insn_bytes_sz, insn_bpf_sz; 5363 + 5364 + insn_bytes_sz = insn_bpf_size_to_bytes(insn); 5365 + if (insn_bytes_sz != res->orig_sz) { 5366 + pr_warn("prog '%s': relo #%d: insn #%d (LDX/ST/STX) unexpected mem size: got %d, exp %u\n", 5367 + prog->name, relo_idx, insn_idx, insn_bytes_sz, res->orig_sz); 5368 + return -EINVAL; 5369 + } 5370 + 5371 + insn_bpf_sz = insn_bytes_to_bpf_size(res->new_sz); 5372 + if (insn_bpf_sz < 0) { 5373 + pr_warn("prog '%s': relo #%d: insn #%d (LDX/ST/STX) invalid new mem size: %u\n", 5374 + prog->name, relo_idx, insn_idx, res->new_sz); 5375 + return -EINVAL; 5376 + } 5377 + 5378 + insn->code = BPF_MODE(insn->code) | insn_bpf_sz | BPF_CLASS(insn->code); 5379 + pr_debug("prog '%s': relo #%d: patched insn #%d (LDX/ST/STX) mem_sz %u -> %u\n", 5380 + prog->name, relo_idx, insn_idx, res->orig_sz, res->new_sz); 5381 + } 5475 5382 break; 5476 5383 case BPF_LD: { 5477 5384 __u64 imm; ··· 5842 5691 return 0; 5843 5692 5844 5693 if (targ_btf_path) 5845 - targ_btf = btf__parse_elf(targ_btf_path, NULL); 5694 + targ_btf = btf__parse(targ_btf_path, NULL); 5846 5695 else 5847 5696 targ_btf = obj->btf_vmlinux; 5848 5697 if (IS_ERR_OR_NULL(targ_btf)) { ··· 5893 5742 err = -EINVAL; 5894 5743 goto out; 5895 5744 } 5745 + /* no need to apply CO-RE relocation if the program is 5746 + * not going to be loaded 5747 + */ 5748 + if (!prog->load) 5749 + continue; 5896 5750 5897 5751 err = bpf_core_apply_relo(prog, rec, i, obj->btf, 5898 5752 targ_btf, cand_cache); ··· 5956 5800 insn[0].imm = 
obj->maps[obj->kconfig_map_idx].fd; 5957 5801 insn[1].imm = ext->kcfg.data_off; 5958 5802 } else /* EXT_KSYM */ { 5959 - insn[0].imm = (__u32)ext->ksym.addr; 5960 - insn[1].imm = ext->ksym.addr >> 32; 5803 + if (ext->ksym.type_id) { /* typed ksyms */ 5804 + insn[0].src_reg = BPF_PSEUDO_BTF_ID; 5805 + insn[0].imm = ext->ksym.vmlinux_btf_id; 5806 + } else { /* typeless ksyms */ 5807 + insn[0].imm = (__u32)ext->ksym.addr; 5808 + insn[1].imm = ext->ksym.addr >> 32; 5809 + } 5961 5810 } 5962 5811 relo->processed = true; 5963 5812 break; ··· 7094 6933 return err; 7095 6934 } 7096 6935 6936 + static int bpf_object__resolve_ksyms_btf_id(struct bpf_object *obj) 6937 + { 6938 + struct extern_desc *ext; 6939 + int i, id; 6940 + 6941 + for (i = 0; i < obj->nr_extern; i++) { 6942 + const struct btf_type *targ_var, *targ_type; 6943 + __u32 targ_type_id, local_type_id; 6944 + const char *targ_var_name; 6945 + int ret; 6946 + 6947 + ext = &obj->externs[i]; 6948 + if (ext->type != EXT_KSYM || !ext->ksym.type_id) 6949 + continue; 6950 + 6951 + id = btf__find_by_name_kind(obj->btf_vmlinux, ext->name, 6952 + BTF_KIND_VAR); 6953 + if (id <= 0) { 6954 + pr_warn("extern (ksym) '%s': failed to find BTF ID in vmlinux BTF.\n", 6955 + ext->name); 6956 + return -ESRCH; 6957 + } 6958 + 6959 + /* find local type_id */ 6960 + local_type_id = ext->ksym.type_id; 6961 + 6962 + /* find target type_id */ 6963 + targ_var = btf__type_by_id(obj->btf_vmlinux, id); 6964 + targ_var_name = btf__name_by_offset(obj->btf_vmlinux, 6965 + targ_var->name_off); 6966 + targ_type = skip_mods_and_typedefs(obj->btf_vmlinux, 6967 + targ_var->type, 6968 + &targ_type_id); 6969 + 6970 + ret = bpf_core_types_are_compat(obj->btf, local_type_id, 6971 + obj->btf_vmlinux, targ_type_id); 6972 + if (ret <= 0) { 6973 + const struct btf_type *local_type; 6974 + const char *targ_name, *local_name; 6975 + 6976 + local_type = btf__type_by_id(obj->btf, local_type_id); 6977 + local_name = btf__name_by_offset(obj->btf, 6978 + 
local_type->name_off); 6979 + targ_name = btf__name_by_offset(obj->btf_vmlinux, 6980 + targ_type->name_off); 6981 + 6982 + pr_warn("extern (ksym) '%s': incompatible types, expected [%d] %s %s, but kernel has [%d] %s %s\n", 6983 + ext->name, local_type_id, 6984 + btf_kind_str(local_type), local_name, targ_type_id, 6985 + btf_kind_str(targ_type), targ_name); 6986 + return -EINVAL; 6987 + } 6988 + 6989 + ext->is_set = true; 6990 + ext->ksym.vmlinux_btf_id = id; 6991 + pr_debug("extern (ksym) '%s': resolved to [%d] %s %s\n", 6992 + ext->name, id, btf_kind_str(targ_var), targ_var_name); 6993 + } 6994 + return 0; 6995 + } 6996 + 7097 6997 static int bpf_object__resolve_externs(struct bpf_object *obj, 7098 6998 const char *extra_kconfig) 7099 6999 { 7100 7000 bool need_config = false, need_kallsyms = false; 7001 + bool need_vmlinux_btf = false; 7101 7002 struct extern_desc *ext; 7102 7003 void *kcfg_data = NULL; 7103 7004 int err, i; ··· 7190 6967 strncmp(ext->name, "CONFIG_", 7) == 0) { 7191 6968 need_config = true; 7192 6969 } else if (ext->type == EXT_KSYM) { 7193 - need_kallsyms = true; 6970 + if (ext->ksym.type_id) 6971 + need_vmlinux_btf = true; 6972 + else 6973 + need_kallsyms = true; 7194 6974 } else { 7195 6975 pr_warn("unrecognized extern '%s'\n", ext->name); 7196 6976 return -EINVAL; ··· 7219 6993 } 7220 6994 if (need_kallsyms) { 7221 6995 err = bpf_object__read_kallsyms_file(obj); 6996 + if (err) 6997 + return -EINVAL; 6998 + } 6999 + if (need_vmlinux_btf) { 7000 + err = bpf_object__resolve_ksyms_btf_id(obj); 7222 7001 if (err) 7223 7002 return -EINVAL; 7224 7003 } ··· 7259 7028 } 7260 7029 7261 7030 err = bpf_object__probe_loading(obj); 7031 + err = err ? : bpf_object__load_vmlinux_btf(obj); 7262 7032 err = err ? : bpf_object__resolve_externs(obj, obj->kconfig); 7263 7033 err = err ? : bpf_object__sanitize_and_load_btf(obj); 7264 7034 err = err ? : bpf_object__sanitize_maps(obj); 7265 - err = err ? : bpf_object__load_vmlinux_btf(obj); 7266 7035 err = err ? 
: bpf_object__init_kern_struct_ops_maps(obj); 7267 7036 err = err ? : bpf_object__create_maps(obj); 7268 7037 err = err ? : bpf_object__relocate(obj, attr->target_btf_path); ··· 10584 10353 btf_id = libbpf_find_prog_btf_id(attach_func_name, 10585 10354 attach_prog_fd); 10586 10355 else 10587 - btf_id = __find_vmlinux_btf_id(prog->obj->btf_vmlinux, 10588 - attach_func_name, 10589 - prog->expected_attach_type); 10356 + btf_id = libbpf_find_vmlinux_btf_id(attach_func_name, 10357 + prog->expected_attach_type); 10590 10358 10591 10359 if (btf_id < 0) 10592 10360 return btf_id;
+6 -1
tools/lib/bpf/xsk.c
··· 705 705 struct xsk_ctx *ctx; 706 706 int err, ifindex; 707 707 708 - if (!umem || !xsk_ptr || !(rx || tx) || !fill || !comp) 708 + if (!umem || !xsk_ptr || !(rx || tx)) 709 709 return -EFAULT; 710 710 711 711 xsk = calloc(1, sizeof(*xsk)); ··· 735 735 736 736 ctx = xsk_get_ctx(umem, ifindex, queue_id); 737 737 if (!ctx) { 738 + if (!fill || !comp) { 739 + err = -EFAULT; 740 + goto out_socket; 741 + } 742 + 738 743 ctx = xsk_create_ctx(xsk, umem, ifindex, ifname, queue_id, 739 744 fill, comp); 740 745 if (!ctx) {
+38
tools/testing/selftests/bpf/README.rst
··· 7 7 Additional information about selftest failures are 8 8 documented here. 9 9 10 + profiler[23] test failures with clang/llvm <12.0.0 11 + ================================================== 12 + 13 + With clang/llvm <12.0.0, the profiler[23] test may fail. 14 + The symptom looks like 15 + 16 + .. code-block:: c 17 + 18 + // r9 is a pointer to map_value 19 + // r7 is a scalar 20 + 17: bf 96 00 00 00 00 00 00 r6 = r9 21 + 18: 0f 76 00 00 00 00 00 00 r6 += r7 22 + math between map_value pointer and register with unbounded min value is not allowed 23 + 24 + // the instructions below will not be seen in the verifier log 25 + 19: a5 07 01 00 01 01 00 00 if r7 < 257 goto +1 26 + 20: bf 96 00 00 00 00 00 00 r6 = r9 27 + // r6 is used here 28 + 29 + The verifier will reject such code with above error. 30 + At insn 18 the r7 is indeed unbounded. The later insn 19 checks the bounds and 31 + the insn 20 undoes map_value addition. It is currently impossible for the 32 + verifier to understand such speculative pointer arithmetic. 33 + Hence 34 + https://reviews.llvm.org/D85570 35 + addresses it on the compiler side. It was committed on llvm 12. 36 + 37 + The corresponding C code 38 + .. code-block:: c 39 + 40 + for (int i = 0; i < MAX_CGROUPS_PATH_DEPTH; i++) { 41 + filepart_length = bpf_probe_read_str(payload, ...); 42 + if (filepart_length <= MAX_PATH) { 43 + barrier_var(filepart_length); // workaround 44 + payload += filepart_length; 45 + } 46 + } 47 + 10 48 bpf_iter test failures with clang/llvm 10.0.0 11 49 ============================================= 12 50
+8 -8
tools/testing/selftests/bpf/prog_tests/align.c
··· 195 195 .prog_type = BPF_PROG_TYPE_SCHED_CLS, 196 196 .matches = { 197 197 {7, "R3_w=inv(id=0,umax_value=255,var_off=(0x0; 0xff))"}, 198 - {8, "R4_w=inv(id=0,umax_value=255,var_off=(0x0; 0xff))"}, 198 + {8, "R4_w=inv(id=1,umax_value=255,var_off=(0x0; 0xff))"}, 199 199 {9, "R4_w=inv(id=0,umax_value=255,var_off=(0x0; 0xff))"}, 200 - {10, "R4_w=inv(id=0,umax_value=255,var_off=(0x0; 0xff))"}, 200 + {10, "R4_w=inv(id=1,umax_value=255,var_off=(0x0; 0xff))"}, 201 201 {11, "R4_w=inv(id=0,umax_value=510,var_off=(0x0; 0x1fe))"}, 202 - {12, "R4_w=inv(id=0,umax_value=255,var_off=(0x0; 0xff))"}, 202 + {12, "R4_w=inv(id=1,umax_value=255,var_off=(0x0; 0xff))"}, 203 203 {13, "R4_w=inv(id=0,umax_value=1020,var_off=(0x0; 0x3fc))"}, 204 - {14, "R4_w=inv(id=0,umax_value=255,var_off=(0x0; 0xff))"}, 204 + {14, "R4_w=inv(id=1,umax_value=255,var_off=(0x0; 0xff))"}, 205 205 {15, "R4_w=inv(id=0,umax_value=2040,var_off=(0x0; 0x7f8))"}, 206 206 {16, "R4_w=inv(id=0,umax_value=4080,var_off=(0x0; 0xff0))"}, 207 207 }, ··· 518 518 * the total offset is 4-byte aligned and meets the 519 519 * load's requirements. 
520 520 */ 521 - {20, "R5=pkt(id=1,off=0,r=4,umin_value=2,umax_value=1034,var_off=(0x2; 0x7fc)"}, 521 + {20, "R5=pkt(id=2,off=0,r=4,umin_value=2,umax_value=1034,var_off=(0x2; 0x7fc)"}, 522 522 523 523 }, 524 524 }, ··· 561 561 /* Adding 14 makes R6 be (4n+2) */ 562 562 {11, "R6_w=inv(id=0,umin_value=14,umax_value=74,var_off=(0x2; 0x7c))"}, 563 563 /* Subtracting from packet pointer overflows ubounds */ 564 - {13, "R5_w=pkt(id=1,off=0,r=8,umin_value=18446744073709551542,umax_value=18446744073709551602,var_off=(0xffffffffffffff82; 0x7c)"}, 564 + {13, "R5_w=pkt(id=2,off=0,r=8,umin_value=18446744073709551542,umax_value=18446744073709551602,var_off=(0xffffffffffffff82; 0x7c)"}, 565 565 /* New unknown value in R7 is (4n), >= 76 */ 566 566 {15, "R7_w=inv(id=0,umin_value=76,umax_value=1096,var_off=(0x0; 0x7fc))"}, 567 567 /* Adding it to packet pointer gives nice bounds again */ 568 - {16, "R5_w=pkt(id=2,off=0,r=0,umin_value=2,umax_value=1082,var_off=(0x2; 0xfffffffc)"}, 568 + {16, "R5_w=pkt(id=3,off=0,r=0,umin_value=2,umax_value=1082,var_off=(0x2; 0xfffffffc)"}, 569 569 /* At the time the word size load is performed from R5, 570 570 * its total fixed offset is NET_IP_ALIGN + reg->off (0) 571 571 * which is 2. Then the variable offset is (4n+2), so 572 572 * the total offset is 4-byte aligned and meets the 573 573 * load's requirements. 574 574 */ 575 - {20, "R5=pkt(id=2,off=0,r=4,umin_value=2,umax_value=1082,var_off=(0x2; 0xfffffffc)"}, 575 + {20, "R5=pkt(id=3,off=0,r=4,umin_value=2,umax_value=1082,var_off=(0x2; 0xfffffffc)"}, 576 576 }, 577 577 }, 578 578 };
+29 -10
tools/testing/selftests/bpf/prog_tests/btf_map_in_map.c
··· 55 55 56 56 static void test_lookup_update(void) 57 57 { 58 - int err, key = 0, val, i; 58 + int map1_fd, map2_fd, map3_fd, map4_fd, map5_fd, map1_id, map2_id; 59 + int outer_arr_fd, outer_hash_fd, outer_arr_dyn_fd; 59 60 struct test_btf_map_in_map *skel; 60 - int outer_arr_fd, outer_hash_fd; 61 - int fd, map1_fd, map2_fd, map1_id, map2_id; 61 + int err, key = 0, val, i, fd; 62 62 63 63 skel = test_btf_map_in_map__open_and_load(); 64 64 if (CHECK(!skel, "skel_open", "failed to open&load skeleton\n")) ··· 70 70 71 71 map1_fd = bpf_map__fd(skel->maps.inner_map1); 72 72 map2_fd = bpf_map__fd(skel->maps.inner_map2); 73 + map3_fd = bpf_map__fd(skel->maps.inner_map3); 74 + map4_fd = bpf_map__fd(skel->maps.inner_map4); 75 + map5_fd = bpf_map__fd(skel->maps.inner_map5); 76 + outer_arr_dyn_fd = bpf_map__fd(skel->maps.outer_arr_dyn); 73 77 outer_arr_fd = bpf_map__fd(skel->maps.outer_arr); 74 78 outer_hash_fd = bpf_map__fd(skel->maps.outer_hash); 75 79 76 - /* inner1 = input, inner2 = input + 1 */ 77 - map1_fd = bpf_map__fd(skel->maps.inner_map1); 80 + /* inner1 = input, inner2 = input + 1, inner3 = input + 2 */ 78 81 bpf_map_update_elem(outer_arr_fd, &key, &map1_fd, 0); 79 - map2_fd = bpf_map__fd(skel->maps.inner_map2); 80 82 bpf_map_update_elem(outer_hash_fd, &key, &map2_fd, 0); 83 + bpf_map_update_elem(outer_arr_dyn_fd, &key, &map3_fd, 0); 81 84 skel->bss->input = 1; 82 85 usleep(1); 83 - 84 86 bpf_map_lookup_elem(map1_fd, &key, &val); 85 87 CHECK(val != 1, "inner1", "got %d != exp %d\n", val, 1); 86 88 bpf_map_lookup_elem(map2_fd, &key, &val); 87 89 CHECK(val != 2, "inner2", "got %d != exp %d\n", val, 2); 90 + bpf_map_lookup_elem(map3_fd, &key, &val); 91 + CHECK(val != 3, "inner3", "got %d != exp %d\n", val, 3); 88 92 89 - /* inner1 = input + 1, inner2 = input */ 93 + /* inner2 = input, inner1 = input + 1, inner4 = input + 2 */ 90 94 bpf_map_update_elem(outer_arr_fd, &key, &map2_fd, 0); 91 95 bpf_map_update_elem(outer_hash_fd, &key, &map1_fd, 0); 96 + 
bpf_map_update_elem(outer_arr_dyn_fd, &key, &map4_fd, 0); 92 97 skel->bss->input = 3; 93 98 usleep(1); 94 - 95 99 bpf_map_lookup_elem(map1_fd, &key, &val); 96 100 CHECK(val != 4, "inner1", "got %d != exp %d\n", val, 4); 97 101 bpf_map_lookup_elem(map2_fd, &key, &val); 98 102 CHECK(val != 3, "inner2", "got %d != exp %d\n", val, 3); 103 + bpf_map_lookup_elem(map4_fd, &key, &val); 104 + CHECK(val != 5, "inner4", "got %d != exp %d\n", val, 5); 105 + 106 + /* inner5 = input + 2 */ 107 + bpf_map_update_elem(outer_arr_dyn_fd, &key, &map5_fd, 0); 108 + skel->bss->input = 5; 109 + usleep(1); 110 + bpf_map_lookup_elem(map5_fd, &key, &val); 111 + CHECK(val != 7, "inner5", "got %d != exp %d\n", val, 7); 99 112 100 113 for (i = 0; i < 5; i++) { 101 114 val = i % 2 ? map1_fd : map2_fd; ··· 119 106 } 120 107 err = bpf_map_update_elem(outer_arr_fd, &key, &val, 0); 121 108 if (CHECK_FAIL(err)) { 122 - printf("failed to update hash_of_maps on iter #%d\n", i); 109 + printf("failed to update array_of_maps on iter #%d\n", i); 110 + goto cleanup; 111 + } 112 + val = i % 2 ? map4_fd : map5_fd; 113 + err = bpf_map_update_elem(outer_arr_dyn_fd, &key, &val, 0); 114 + if (CHECK_FAIL(err)) { 115 + printf("failed to update array_of_maps (dyn) on iter #%d\n", i); 123 116 goto cleanup; 124 117 } 125 118 }
+225
tools/testing/selftests/bpf/prog_tests/core_autosize.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* Copyright (c) 2020 Facebook */ 3 + 4 + #include <test_progs.h> 5 + #include <bpf/btf.h> 6 + 7 + /* real layout and sizes according to test's (32-bit) BTF 8 + * needs to be defined before skeleton is included */ 9 + struct test_struct___real { 10 + unsigned int ptr; /* can't use `void *`, it is always 8 byte in BPF target */ 11 + unsigned int val2; 12 + unsigned long long val1; 13 + unsigned short val3; 14 + unsigned char val4; 15 + unsigned char _pad; 16 + }; 17 + 18 + #include "test_core_autosize.skel.h" 19 + 20 + static int duration = 0; 21 + 22 + static struct { 23 + unsigned long long ptr_samesized; 24 + unsigned long long val1_samesized; 25 + unsigned long long val2_samesized; 26 + unsigned long long val3_samesized; 27 + unsigned long long val4_samesized; 28 + struct test_struct___real output_samesized; 29 + 30 + unsigned long long ptr_downsized; 31 + unsigned long long val1_downsized; 32 + unsigned long long val2_downsized; 33 + unsigned long long val3_downsized; 34 + unsigned long long val4_downsized; 35 + struct test_struct___real output_downsized; 36 + 37 + unsigned long long ptr_probed; 38 + unsigned long long val1_probed; 39 + unsigned long long val2_probed; 40 + unsigned long long val3_probed; 41 + unsigned long long val4_probed; 42 + 43 + unsigned long long ptr_signed; 44 + unsigned long long val1_signed; 45 + unsigned long long val2_signed; 46 + unsigned long long val3_signed; 47 + unsigned long long val4_signed; 48 + struct test_struct___real output_signed; 49 + } out; 50 + 51 + void test_core_autosize(void) 52 + { 53 + char btf_file[] = "/tmp/core_autosize.btf.XXXXXX"; 54 + int err, fd = -1, zero = 0; 55 + int char_id, short_id, int_id, long_long_id, void_ptr_id, id; 56 + struct test_core_autosize* skel = NULL; 57 + struct bpf_object_load_attr load_attr = {}; 58 + struct bpf_program *prog; 59 + struct bpf_map *bss_map; 60 + struct btf *btf = NULL; 61 + size_t written; 62 + const void *raw_data; 63 + 
__u32 raw_sz; 64 + FILE *f = NULL; 65 + 66 + btf = btf__new_empty(); 67 + if (!ASSERT_OK_PTR(btf, "empty_btf")) 68 + return; 69 + /* Emit the following struct with 32-bit pointer size: 70 + * 71 + * struct test_struct { 72 + * void *ptr; 73 + * unsigned long val2; 74 + * unsigned long long val1; 75 + * unsigned short val3; 76 + * unsigned char val4; 77 + * char: 8; 78 + * }; 79 + * 80 + * This struct is going to be used as the "kernel BTF" for this test. 81 + * It's equivalent memory-layout-wise to test_struct__real above. 82 + */ 83 + 84 + /* force 32-bit pointer size */ 85 + btf__set_pointer_size(btf, 4); 86 + 87 + char_id = btf__add_int(btf, "unsigned char", 1, 0); 88 + ASSERT_EQ(char_id, 1, "char_id"); 89 + short_id = btf__add_int(btf, "unsigned short", 2, 0); 90 + ASSERT_EQ(short_id, 2, "short_id"); 91 + /* "long unsigned int" of 4 byte size tells BTF that sizeof(void *) == 4 */ 92 + int_id = btf__add_int(btf, "long unsigned int", 4, 0); 93 + ASSERT_EQ(int_id, 3, "int_id"); 94 + long_long_id = btf__add_int(btf, "unsigned long long", 8, 0); 95 + ASSERT_EQ(long_long_id, 4, "long_long_id"); 96 + void_ptr_id = btf__add_ptr(btf, 0); 97 + ASSERT_EQ(void_ptr_id, 5, "void_ptr_id"); 98 + 99 + id = btf__add_struct(btf, "test_struct", 20 /* bytes */); 100 + ASSERT_EQ(id, 6, "struct_id"); 101 + err = btf__add_field(btf, "ptr", void_ptr_id, 0, 0); 102 + err = err ?: btf__add_field(btf, "val2", int_id, 32, 0); 103 + err = err ?: btf__add_field(btf, "val1", long_long_id, 64, 0); 104 + err = err ?: btf__add_field(btf, "val3", short_id, 128, 0); 105 + err = err ?: btf__add_field(btf, "val4", char_id, 144, 0); 106 + ASSERT_OK(err, "struct_fields"); 107 + 108 + fd = mkstemp(btf_file); 109 + if (CHECK(fd < 0, "btf_tmp", "failed to create file: %d\n", fd)) 110 + goto cleanup; 111 + f = fdopen(fd, "w"); 112 + if (!ASSERT_OK_PTR(f, "btf_fdopen")) 113 + goto cleanup; 114 + 115 + raw_data = btf__get_raw_data(btf, &raw_sz); 116 + if (!ASSERT_OK_PTR(raw_data, "raw_data")) 117 + goto 
cleanup; 118 + written = fwrite(raw_data, 1, raw_sz, f); 119 + if (CHECK(written != raw_sz, "btf_write", "written: %zu, errno: %d\n", written, errno)) 120 + goto cleanup; 121 + fflush(f); 122 + fclose(f); 123 + f = NULL; 124 + close(fd); 125 + fd = -1; 126 + 127 + /* open and load BPF program with custom BTF as the kernel BTF */ 128 + skel = test_core_autosize__open(); 129 + if (!ASSERT_OK_PTR(skel, "skel_open")) 130 + return; 131 + 132 + /* disable handle_signed() for now */ 133 + prog = bpf_object__find_program_by_name(skel->obj, "handle_signed"); 134 + if (!ASSERT_OK_PTR(prog, "prog_find")) 135 + goto cleanup; 136 + bpf_program__set_autoload(prog, false); 137 + 138 + load_attr.obj = skel->obj; 139 + load_attr.target_btf_path = btf_file; 140 + err = bpf_object__load_xattr(&load_attr); 141 + if (!ASSERT_OK(err, "prog_load")) 142 + goto cleanup; 143 + 144 + prog = bpf_object__find_program_by_name(skel->obj, "handle_samesize"); 145 + if (!ASSERT_OK_PTR(prog, "prog_find")) 146 + goto cleanup; 147 + skel->links.handle_samesize = bpf_program__attach(prog); 148 + if (!ASSERT_OK_PTR(skel->links.handle_samesize, "prog_attach")) 149 + goto cleanup; 150 + 151 + prog = bpf_object__find_program_by_name(skel->obj, "handle_downsize"); 152 + if (!ASSERT_OK_PTR(prog, "prog_find")) 153 + goto cleanup; 154 + skel->links.handle_downsize = bpf_program__attach(prog); 155 + if (!ASSERT_OK_PTR(skel->links.handle_downsize, "prog_attach")) 156 + goto cleanup; 157 + 158 + prog = bpf_object__find_program_by_name(skel->obj, "handle_probed"); 159 + if (!ASSERT_OK_PTR(prog, "prog_find")) 160 + goto cleanup; 161 + skel->links.handle_probed = bpf_program__attach(prog); 162 + if (!ASSERT_OK_PTR(skel->links.handle_probed, "prog_attach")) 163 + goto cleanup; 164 + 165 + usleep(1); 166 + 167 + bss_map = bpf_object__find_map_by_name(skel->obj, "test_cor.bss"); 168 + if (!ASSERT_OK_PTR(bss_map, "bss_map_find")) 169 + goto cleanup; 170 + 171 + err = bpf_map_lookup_elem(bpf_map__fd(bss_map), &zero, 
(void *)&out); 172 + if (!ASSERT_OK(err, "bss_lookup")) 173 + goto cleanup; 174 + 175 + ASSERT_EQ(out.ptr_samesized, 0x01020304, "ptr_samesized"); 176 + ASSERT_EQ(out.val1_samesized, 0x1020304050607080, "val1_samesized"); 177 + ASSERT_EQ(out.val2_samesized, 0x0a0b0c0d, "val2_samesized"); 178 + ASSERT_EQ(out.val3_samesized, 0xfeed, "val3_samesized"); 179 + ASSERT_EQ(out.val4_samesized, 0xb9, "val4_samesized"); 180 + ASSERT_EQ(out.output_samesized.ptr, 0x01020304, "ptr_samesized"); 181 + ASSERT_EQ(out.output_samesized.val1, 0x1020304050607080, "val1_samesized"); 182 + ASSERT_EQ(out.output_samesized.val2, 0x0a0b0c0d, "val2_samesized"); 183 + ASSERT_EQ(out.output_samesized.val3, 0xfeed, "val3_samesized"); 184 + ASSERT_EQ(out.output_samesized.val4, 0xb9, "val4_samesized"); 185 + 186 + ASSERT_EQ(out.ptr_downsized, 0x01020304, "ptr_downsized"); 187 + ASSERT_EQ(out.val1_downsized, 0x1020304050607080, "val1_downsized"); 188 + ASSERT_EQ(out.val2_downsized, 0x0a0b0c0d, "val2_downsized"); 189 + ASSERT_EQ(out.val3_downsized, 0xfeed, "val3_downsized"); 190 + ASSERT_EQ(out.val4_downsized, 0xb9, "val4_downsized"); 191 + ASSERT_EQ(out.output_downsized.ptr, 0x01020304, "ptr_downsized"); 192 + ASSERT_EQ(out.output_downsized.val1, 0x1020304050607080, "val1_downsized"); 193 + ASSERT_EQ(out.output_downsized.val2, 0x0a0b0c0d, "val2_downsized"); 194 + ASSERT_EQ(out.output_downsized.val3, 0xfeed, "val3_downsized"); 195 + ASSERT_EQ(out.output_downsized.val4, 0xb9, "val4_downsized"); 196 + 197 + ASSERT_EQ(out.ptr_probed, 0x01020304, "ptr_probed"); 198 + ASSERT_EQ(out.val1_probed, 0x1020304050607080, "val1_probed"); 199 + ASSERT_EQ(out.val2_probed, 0x0a0b0c0d, "val2_probed"); 200 + ASSERT_EQ(out.val3_probed, 0xfeed, "val3_probed"); 201 + ASSERT_EQ(out.val4_probed, 0xb9, "val4_probed"); 202 + 203 + test_core_autosize__destroy(skel); 204 + skel = NULL; 205 + 206 + /* now re-load with handle_signed() enabled, it should fail loading */ 207 + skel = test_core_autosize__open(); 208 + if 
(!ASSERT_OK_PTR(skel, "skel_open")) 209 + return; 210 + 211 + load_attr.obj = skel->obj; 212 + load_attr.target_btf_path = btf_file; 213 + err = bpf_object__load_xattr(&load_attr); 214 + if (!ASSERT_ERR(err, "bad_prog_load")) 215 + goto cleanup; 216 + 217 + cleanup: 218 + if (f) 219 + fclose(f); 220 + if (fd >= 0) 221 + close(fd); 222 + remove(btf_file); 223 + btf__free(btf); 224 + test_core_autosize__destroy(skel); 225 + }
+13 -25
tools/testing/selftests/bpf/prog_tests/ksyms.c
··· 7 7 8 8 static int duration; 9 9 10 - static __u64 kallsyms_find(const char *sym) 11 - { 12 - char type, name[500]; 13 - __u64 addr, res = 0; 14 - FILE *f; 15 - 16 - f = fopen("/proc/kallsyms", "r"); 17 - if (CHECK(!f, "kallsyms_fopen", "failed to open: %d\n", errno)) 18 - return 0; 19 - 20 - while (fscanf(f, "%llx %c %499s%*[^\n]\n", &addr, &type, name) > 0) { 21 - if (strcmp(name, sym) == 0) { 22 - res = addr; 23 - goto out; 24 - } 25 - } 26 - 27 - CHECK(false, "not_found", "symbol %s not found\n", sym); 28 - out: 29 - fclose(f); 30 - return res; 31 - } 32 - 33 10 void test_ksyms(void) 34 11 { 35 - __u64 per_cpu_start_addr = kallsyms_find("__per_cpu_start"); 36 - __u64 link_fops_addr = kallsyms_find("bpf_link_fops"); 37 12 const char *btf_path = "/sys/kernel/btf/vmlinux"; 38 13 struct test_ksyms *skel; 39 14 struct test_ksyms__data *data; 15 + __u64 link_fops_addr, per_cpu_start_addr; 40 16 struct stat st; 41 17 __u64 btf_size; 42 18 int err; 19 + 20 + err = kallsyms_find("bpf_link_fops", &link_fops_addr); 21 + if (CHECK(err == -EINVAL, "kallsyms_fopen", "failed to open: %d\n", errno)) 22 + return; 23 + if (CHECK(err == -ENOENT, "ksym_find", "symbol 'bpf_link_fops' not found\n")) 24 + return; 25 + 26 + err = kallsyms_find("__per_cpu_start", &per_cpu_start_addr); 27 + if (CHECK(err == -EINVAL, "kallsyms_fopen", "failed to open: %d\n", errno)) 28 + return; 29 + if (CHECK(err == -ENOENT, "ksym_find", "symbol 'per_cpu_start' not found\n")) 30 + return; 43 31 44 32 if (CHECK(stat(btf_path, &st), "stat_btf", "err %d\n", errno)) 45 33 return;
+88
tools/testing/selftests/bpf/prog_tests/ksyms_btf.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* Copyright (c) 2020 Google */ 3 + 4 + #include <test_progs.h> 5 + #include <bpf/libbpf.h> 6 + #include <bpf/btf.h> 7 + #include "test_ksyms_btf.skel.h" 8 + 9 + static int duration; 10 + 11 + void test_ksyms_btf(void) 12 + { 13 + __u64 runqueues_addr, bpf_prog_active_addr; 14 + __u32 this_rq_cpu; 15 + int this_bpf_prog_active; 16 + struct test_ksyms_btf *skel = NULL; 17 + struct test_ksyms_btf__data *data; 18 + struct btf *btf; 19 + int percpu_datasec; 20 + int err; 21 + 22 + err = kallsyms_find("runqueues", &runqueues_addr); 23 + if (CHECK(err == -EINVAL, "kallsyms_fopen", "failed to open: %d\n", errno)) 24 + return; 25 + if (CHECK(err == -ENOENT, "ksym_find", "symbol 'runqueues' not found\n")) 26 + return; 27 + 28 + err = kallsyms_find("bpf_prog_active", &bpf_prog_active_addr); 29 + if (CHECK(err == -EINVAL, "kallsyms_fopen", "failed to open: %d\n", errno)) 30 + return; 31 + if (CHECK(err == -ENOENT, "ksym_find", "symbol 'bpf_prog_active' not found\n")) 32 + return; 33 + 34 + btf = libbpf_find_kernel_btf(); 35 + if (CHECK(IS_ERR(btf), "btf_exists", "failed to load kernel BTF: %ld\n", 36 + PTR_ERR(btf))) 37 + return; 38 + 39 + percpu_datasec = btf__find_by_name_kind(btf, ".data..percpu", 40 + BTF_KIND_DATASEC); 41 + if (percpu_datasec < 0) { 42 + printf("%s:SKIP:no PERCPU DATASEC in kernel btf\n", 43 + __func__); 44 + test__skip(); 45 + goto cleanup; 46 + } 47 + 48 + skel = test_ksyms_btf__open_and_load(); 49 + if (CHECK(!skel, "skel_open", "failed to open and load skeleton\n")) 50 + goto cleanup; 51 + 52 + err = test_ksyms_btf__attach(skel); 53 + if (CHECK(err, "skel_attach", "skeleton attach failed: %d\n", err)) 54 + goto cleanup; 55 + 56 + /* trigger tracepoint */ 57 + usleep(1); 58 + 59 + data = skel->data; 60 + CHECK(data->out__runqueues_addr != runqueues_addr, "runqueues_addr", 61 + "got %llu, exp %llu\n", 62 + (unsigned long long)data->out__runqueues_addr, 63 + (unsigned long long)runqueues_addr); 64 + 
CHECK(data->out__bpf_prog_active_addr != bpf_prog_active_addr, "bpf_prog_active_addr", 65 + "got %llu, exp %llu\n", 66 + (unsigned long long)data->out__bpf_prog_active_addr, 67 + (unsigned long long)bpf_prog_active_addr); 68 + 69 + CHECK(data->out__rq_cpu == -1, "rq_cpu", 70 + "got %u, exp != -1\n", data->out__rq_cpu); 71 + CHECK(data->out__bpf_prog_active < 0, "bpf_prog_active", 72 + "got %d, exp >= 0\n", data->out__bpf_prog_active); 73 + CHECK(data->out__cpu_0_rq_cpu != 0, "cpu_rq(0)->cpu", 74 + "got %u, exp 0\n", data->out__cpu_0_rq_cpu); 75 + 76 + this_rq_cpu = data->out__this_rq_cpu; 77 + CHECK(this_rq_cpu != data->out__rq_cpu, "this_rq_cpu", 78 + "got %u, exp %u\n", this_rq_cpu, data->out__rq_cpu); 79 + 80 + this_bpf_prog_active = data->out__this_bpf_prog_active; 81 + CHECK(this_bpf_prog_active != data->out__bpf_prog_active, "this_bpf_prog_active", 82 + "got %d, exp %d\n", this_bpf_prog_active, 83 + data->out__bpf_prog_active); 84 + 85 + cleanup: 86 + btf__free(btf); 87 + test_ksyms_btf__destroy(skel); 88 + }
+48 -1
tools/testing/selftests/bpf/prog_tests/pinning.c
··· 37 37 struct stat statbuf = {}; 38 38 struct bpf_object *obj; 39 39 struct bpf_map *map; 40 - int err; 40 + int err, map_fd; 41 41 DECLARE_LIBBPF_OPTS(bpf_object_open_opts, opts, 42 42 .pin_root_path = custpath, 43 43 ); ··· 213 213 if (CHECK(err, "stat custpinpath", "err %d errno %d\n", err, errno)) 214 214 goto out; 215 215 216 + /* remove the custom pin path to re-test it with reuse fd below */ 217 + err = unlink(custpinpath); 218 + if (CHECK(err, "unlink custpinpath", "err %d errno %d\n", err, errno)) 219 + goto out; 220 + 221 + err = rmdir(custpath); 222 + if (CHECK(err, "rmdir custpindir", "err %d errno %d\n", err, errno)) 223 + goto out; 224 + 225 + bpf_object__close(obj); 226 + 227 + /* test pinning at custom path with reuse fd */ 228 + obj = bpf_object__open_file(file, NULL); 229 + err = libbpf_get_error(obj); 230 + if (CHECK(err, "default open", "err %d errno %d\n", err, errno)) { 231 + obj = NULL; 232 + goto out; 233 + } 234 + 235 + map_fd = bpf_create_map(BPF_MAP_TYPE_ARRAY, sizeof(__u32), 236 + sizeof(__u64), 1, 0); 237 + if (CHECK(map_fd < 0, "create pinmap manually", "fd %d\n", map_fd)) 238 + goto out; 239 + 240 + map = bpf_object__find_map_by_name(obj, "pinmap"); 241 + if (CHECK(!map, "find map", "NULL map")) 242 + goto close_map_fd; 243 + 244 + err = bpf_map__reuse_fd(map, map_fd); 245 + if (CHECK(err, "reuse pinmap fd", "err %d errno %d\n", err, errno)) 246 + goto close_map_fd; 247 + 248 + err = bpf_map__set_pin_path(map, custpinpath); 249 + if (CHECK(err, "set pin path", "err %d errno %d\n", err, errno)) 250 + goto close_map_fd; 251 + 252 + err = bpf_object__load(obj); 253 + if (CHECK(err, "custom load", "err %d errno %d\n", err, errno)) 254 + goto close_map_fd; 255 + 256 + /* check that pinmap was pinned at the custom path */ 257 + err = stat(custpinpath, &statbuf); 258 + if (CHECK(err, "stat custpinpath", "err %d errno %d\n", err, errno)) 259 + goto close_map_fd; 260 + 261 + close_map_fd: 262 + close(map_fd); 216 263 out: 217 264 
unlink(pinpath); 218 265 unlink(nopinpath);
+1 -1
tools/testing/selftests/bpf/prog_tests/sockmap_basic.c
··· 198 198 { 199 199 DECLARE_LIBBPF_OPTS(bpf_iter_attach_opts, opts); 200 200 int err, len, src_fd, iter_fd, duration = 0; 201 - union bpf_iter_link_info linfo = {0}; 201 + union bpf_iter_link_info linfo = {}; 202 202 __u32 i, num_sockets, num_elems; 203 203 struct bpf_iter_sockmap *skel; 204 204 __s64 *sock_fd = NULL;
+12
tools/testing/selftests/bpf/prog_tests/tcp_hdr_options.c
··· 264 264 265 265 static void check_hdr_and_close_fds(struct sk_fds *sk_fds) 266 266 { 267 + const __u32 expected_inherit_cb_flags = 268 + BPF_SOCK_OPS_PARSE_UNKNOWN_HDR_OPT_CB_FLAG | 269 + BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG | 270 + BPF_SOCK_OPS_STATE_CB_FLAG; 271 + 267 272 if (sk_fds_shutdown(sk_fds)) 273 + goto check_linum; 274 + 275 + if (CHECK(expected_inherit_cb_flags != skel->bss->inherit_cb_flags, 276 + "Unexpected inherit_cb_flags", "0x%x != 0x%x\n", 277 + skel->bss->inherit_cb_flags, expected_inherit_cb_flags)) 268 278 goto check_linum; 269 279 270 280 if (check_hdr_stg(&exp_passive_hdr_stg, sk_fds->passive_fd, ··· 330 320 331 321 memset(&skel->bss->active_estab_in, 0, optsize); 332 322 memset(&skel->bss->active_fin_in, 0, optsize); 323 + 324 + skel->bss->inherit_cb_flags = 0; 333 325 334 326 skel->data->test_kind = TCPOPT_EXP; 335 327 skel->data->test_magic = 0xeB9F;
+72
tools/testing/selftests/bpf/prog_tests/test_profiler.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* Copyright (c) 2020 Facebook */ 3 + #include <test_progs.h> 4 + #include "progs/profiler.h" 5 + #include "profiler1.skel.h" 6 + #include "profiler2.skel.h" 7 + #include "profiler3.skel.h" 8 + 9 + static int sanity_run(struct bpf_program *prog) 10 + { 11 + struct bpf_prog_test_run_attr test_attr = {}; 12 + __u64 args[] = {1, 2, 3}; 13 + __u32 duration = 0; 14 + int err, prog_fd; 15 + 16 + prog_fd = bpf_program__fd(prog); 17 + test_attr.prog_fd = prog_fd; 18 + test_attr.ctx_in = args; 19 + test_attr.ctx_size_in = sizeof(args); 20 + err = bpf_prog_test_run_xattr(&test_attr); 21 + if (CHECK(err || test_attr.retval, "test_run", 22 + "err %d errno %d retval %d duration %d\n", 23 + err, errno, test_attr.retval, duration)) 24 + return -1; 25 + return 0; 26 + } 27 + 28 + void test_test_profiler(void) 29 + { 30 + struct profiler1 *profiler1_skel = NULL; 31 + struct profiler2 *profiler2_skel = NULL; 32 + struct profiler3 *profiler3_skel = NULL; 33 + __u32 duration = 0; 34 + int err; 35 + 36 + profiler1_skel = profiler1__open_and_load(); 37 + if (CHECK(!profiler1_skel, "profiler1_skel_load", "profiler1 skeleton failed\n")) 38 + goto cleanup; 39 + 40 + err = profiler1__attach(profiler1_skel); 41 + if (CHECK(err, "profiler1_attach", "profiler1 attach failed: %d\n", err)) 42 + goto cleanup; 43 + 44 + if (sanity_run(profiler1_skel->progs.raw_tracepoint__sched_process_exec)) 45 + goto cleanup; 46 + 47 + profiler2_skel = profiler2__open_and_load(); 48 + if (CHECK(!profiler2_skel, "profiler2_skel_load", "profiler2 skeleton failed\n")) 49 + goto cleanup; 50 + 51 + err = profiler2__attach(profiler2_skel); 52 + if (CHECK(err, "profiler2_attach", "profiler2 attach failed: %d\n", err)) 53 + goto cleanup; 54 + 55 + if (sanity_run(profiler2_skel->progs.raw_tracepoint__sched_process_exec)) 56 + goto cleanup; 57 + 58 + profiler3_skel = profiler3__open_and_load(); 59 + if (CHECK(!profiler3_skel, "profiler3_skel_load", "profiler3 skeleton 
failed\n")) 60 + goto cleanup; 61 + 62 + err = profiler3__attach(profiler3_skel); 63 + if (CHECK(err, "profiler3_attach", "profiler3 attach failed: %d\n", err)) 64 + goto cleanup; 65 + 66 + if (sanity_run(profiler3_skel->progs.raw_tracepoint__sched_process_exec)) 67 + goto cleanup; 68 + cleanup: 69 + profiler1__destroy(profiler1_skel); 70 + profiler2__destroy(profiler2_skel); 71 + profiler3__destroy(profiler3_skel); 72 + }
+1 -1
tools/testing/selftests/bpf/prog_tests/xdp_noinline.c
··· 25 25 __u8 flags; 26 26 } real_def = {.dst = MAGIC_VAL}; 27 27 __u32 ch_key = 11, real_num = 3; 28 - __u32 duration, retval, size; 28 + __u32 duration = 0, retval, size; 29 29 int err, i; 30 30 __u64 bytes = 0, pkts = 0; 31 31 char buf[128];
+19
tools/testing/selftests/bpf/progs/connect4_prog.c
··· 23 23 #define TCP_CA_NAME_MAX 16 24 24 #endif 25 25 26 + #ifndef TCP_NOTSENT_LOWAT 27 + #define TCP_NOTSENT_LOWAT 25 28 + #endif 29 + 26 30 #ifndef IFNAMSIZ 27 31 #define IFNAMSIZ 16 28 32 #endif ··· 132 128 return 0; 133 129 } 134 130 131 + static __inline int set_notsent_lowat(struct bpf_sock_addr *ctx) 132 + { 133 + int lowat = 65535; 134 + 135 + if (ctx->type == SOCK_STREAM) { 136 + if (bpf_setsockopt(ctx, SOL_TCP, TCP_NOTSENT_LOWAT, &lowat, sizeof(lowat))) 137 + return 1; 138 + } 139 + 140 + return 0; 141 + } 142 + 135 143 SEC("cgroup/connect4") 136 144 int connect_v4_prog(struct bpf_sock_addr *ctx) 137 145 { ··· 162 146 return 0; 163 147 164 148 if (set_keepalive(ctx)) 149 + return 0; 150 + 151 + if (set_notsent_lowat(ctx)) 165 152 return 0; 166 153 167 154 if (ctx->type != SOCK_STREAM && ctx->type != SOCK_DGRAM)
+177
tools/testing/selftests/bpf/progs/profiler.h
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* Copyright (c) 2020 Facebook */ 3 + #pragma once 4 + 5 + #define TASK_COMM_LEN 16 6 + #define MAX_ANCESTORS 4 7 + #define MAX_PATH 256 8 + #define KILL_TARGET_LEN 64 9 + #define CTL_MAXNAME 10 10 + #define MAX_ARGS_LEN 4096 11 + #define MAX_FILENAME_LEN 512 12 + #define MAX_ENVIRON_LEN 8192 13 + #define MAX_PATH_DEPTH 32 14 + #define MAX_FILEPATH_LENGTH (MAX_PATH_DEPTH * MAX_PATH) 15 + #define MAX_CGROUPS_PATH_DEPTH 8 16 + 17 + #define MAX_METADATA_PAYLOAD_LEN TASK_COMM_LEN 18 + 19 + #define MAX_CGROUP_PAYLOAD_LEN \ 20 + (MAX_PATH * 2 + (MAX_PATH * MAX_CGROUPS_PATH_DEPTH)) 21 + 22 + #define MAX_CAP_PAYLOAD_LEN (MAX_METADATA_PAYLOAD_LEN + MAX_CGROUP_PAYLOAD_LEN) 23 + 24 + #define MAX_SYSCTL_PAYLOAD_LEN \ 25 + (MAX_METADATA_PAYLOAD_LEN + MAX_CGROUP_PAYLOAD_LEN + CTL_MAXNAME + MAX_PATH) 26 + 27 + #define MAX_KILL_PAYLOAD_LEN \ 28 + (MAX_METADATA_PAYLOAD_LEN + MAX_CGROUP_PAYLOAD_LEN + TASK_COMM_LEN + \ 29 + KILL_TARGET_LEN) 30 + 31 + #define MAX_EXEC_PAYLOAD_LEN \ 32 + (MAX_METADATA_PAYLOAD_LEN + MAX_CGROUP_PAYLOAD_LEN + MAX_FILENAME_LEN + \ 33 + MAX_ARGS_LEN + MAX_ENVIRON_LEN) 34 + 35 + #define MAX_FILEMOD_PAYLOAD_LEN \ 36 + (MAX_METADATA_PAYLOAD_LEN + MAX_CGROUP_PAYLOAD_LEN + MAX_FILEPATH_LENGTH + \ 37 + MAX_FILEPATH_LENGTH) 38 + 39 + enum data_type { 40 + INVALID_EVENT, 41 + EXEC_EVENT, 42 + FORK_EVENT, 43 + KILL_EVENT, 44 + SYSCTL_EVENT, 45 + FILEMOD_EVENT, 46 + MAX_DATA_TYPE_EVENT 47 + }; 48 + 49 + enum filemod_type { 50 + FMOD_OPEN, 51 + FMOD_LINK, 52 + FMOD_SYMLINK, 53 + }; 54 + 55 + struct ancestors_data_t { 56 + pid_t ancestor_pids[MAX_ANCESTORS]; 57 + uint32_t ancestor_exec_ids[MAX_ANCESTORS]; 58 + uint64_t ancestor_start_times[MAX_ANCESTORS]; 59 + uint32_t num_ancestors; 60 + }; 61 + 62 + struct var_metadata_t { 63 + enum data_type type; 64 + pid_t pid; 65 + uint32_t exec_id; 66 + uid_t uid; 67 + gid_t gid; 68 + uint64_t start_time; 69 + uint32_t cpu_id; 70 + uint64_t bpf_stats_num_perf_events; 71 + uint64_t 
bpf_stats_start_ktime_ns; 72 + uint8_t comm_length; 73 + }; 74 + 75 + struct cgroup_data_t { 76 + ino_t cgroup_root_inode; 77 + ino_t cgroup_proc_inode; 78 + uint64_t cgroup_root_mtime; 79 + uint64_t cgroup_proc_mtime; 80 + uint16_t cgroup_root_length; 81 + uint16_t cgroup_proc_length; 82 + uint16_t cgroup_full_length; 83 + int cgroup_full_path_root_pos; 84 + }; 85 + 86 + struct var_sysctl_data_t { 87 + struct var_metadata_t meta; 88 + struct cgroup_data_t cgroup_data; 89 + struct ancestors_data_t ancestors_info; 90 + uint8_t sysctl_val_length; 91 + uint16_t sysctl_path_length; 92 + char payload[MAX_SYSCTL_PAYLOAD_LEN]; 93 + }; 94 + 95 + struct var_kill_data_t { 96 + struct var_metadata_t meta; 97 + struct cgroup_data_t cgroup_data; 98 + struct ancestors_data_t ancestors_info; 99 + pid_t kill_target_pid; 100 + int kill_sig; 101 + uint32_t kill_count; 102 + uint64_t last_kill_time; 103 + uint8_t kill_target_name_length; 104 + uint8_t kill_target_cgroup_proc_length; 105 + char payload[MAX_KILL_PAYLOAD_LEN]; 106 + size_t payload_length; 107 + }; 108 + 109 + struct var_exec_data_t { 110 + struct var_metadata_t meta; 111 + struct cgroup_data_t cgroup_data; 112 + pid_t parent_pid; 113 + uint32_t parent_exec_id; 114 + uid_t parent_uid; 115 + uint64_t parent_start_time; 116 + uint16_t bin_path_length; 117 + uint16_t cmdline_length; 118 + uint16_t environment_length; 119 + char payload[MAX_EXEC_PAYLOAD_LEN]; 120 + }; 121 + 122 + struct var_fork_data_t { 123 + struct var_metadata_t meta; 124 + pid_t parent_pid; 125 + uint32_t parent_exec_id; 126 + uint64_t parent_start_time; 127 + char payload[MAX_METADATA_PAYLOAD_LEN]; 128 + }; 129 + 130 + struct var_filemod_data_t { 131 + struct var_metadata_t meta; 132 + struct cgroup_data_t cgroup_data; 133 + enum filemod_type fmod_type; 134 + unsigned int dst_flags; 135 + uint32_t src_device_id; 136 + uint32_t dst_device_id; 137 + ino_t src_inode; 138 + ino_t dst_inode; 139 + uint16_t src_filepath_length; 140 + uint16_t 
dst_filepath_length; 141 + char payload[MAX_FILEMOD_PAYLOAD_LEN]; 142 + }; 143 + 144 + struct profiler_config_struct { 145 + bool fetch_cgroups_from_bpf; 146 + ino_t cgroup_fs_inode; 147 + ino_t cgroup_login_session_inode; 148 + uint64_t kill_signals_mask; 149 + ino_t inode_filter; 150 + uint32_t stale_info_secs; 151 + bool use_variable_buffers; 152 + bool read_environ_from_exec; 153 + bool enable_cgroup_v1_resolver; 154 + }; 155 + 156 + struct bpf_func_stats_data { 157 + uint64_t time_elapsed_ns; 158 + uint64_t num_executions; 159 + uint64_t num_perf_events; 160 + }; 161 + 162 + struct bpf_func_stats_ctx { 163 + uint64_t start_time_ns; 164 + struct bpf_func_stats_data* bpf_func_stats_data_val; 165 + }; 166 + 167 + enum bpf_function_id { 168 + profiler_bpf_proc_sys_write, 169 + profiler_bpf_sched_process_exec, 170 + profiler_bpf_sched_process_exit, 171 + profiler_bpf_sys_enter_kill, 172 + profiler_bpf_do_filp_open_ret, 173 + profiler_bpf_sched_process_fork, 174 + profiler_bpf_vfs_link, 175 + profiler_bpf_vfs_symlink, 176 + profiler_bpf_max_function_id 177 + };
+969
tools/testing/selftests/bpf/progs/profiler.inc.h
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* Copyright (c) 2020 Facebook */ 3 + #include <vmlinux.h> 4 + #include <bpf/bpf_core_read.h> 5 + #include <bpf/bpf_helpers.h> 6 + #include <bpf/bpf_tracing.h> 7 + 8 + #include "profiler.h" 9 + 10 + #ifndef NULL 11 + #define NULL 0 12 + #endif 13 + 14 + #define O_WRONLY 00000001 15 + #define O_RDWR 00000002 16 + #define O_DIRECTORY 00200000 17 + #define __O_TMPFILE 020000000 18 + #define O_TMPFILE (__O_TMPFILE | O_DIRECTORY) 19 + #define MAX_ERRNO 4095 20 + #define S_IFMT 00170000 21 + #define S_IFSOCK 0140000 22 + #define S_IFLNK 0120000 23 + #define S_IFREG 0100000 24 + #define S_IFBLK 0060000 25 + #define S_IFDIR 0040000 26 + #define S_IFCHR 0020000 27 + #define S_IFIFO 0010000 28 + #define S_ISUID 0004000 29 + #define S_ISGID 0002000 30 + #define S_ISVTX 0001000 31 + #define S_ISLNK(m) (((m)&S_IFMT) == S_IFLNK) 32 + #define S_ISDIR(m) (((m)&S_IFMT) == S_IFDIR) 33 + #define S_ISCHR(m) (((m)&S_IFMT) == S_IFCHR) 34 + #define S_ISBLK(m) (((m)&S_IFMT) == S_IFBLK) 35 + #define S_ISFIFO(m) (((m)&S_IFMT) == S_IFIFO) 36 + #define S_ISSOCK(m) (((m)&S_IFMT) == S_IFSOCK) 37 + #define IS_ERR_VALUE(x) (unsigned long)(void*)(x) >= (unsigned long)-MAX_ERRNO 38 + 39 + #define KILL_DATA_ARRAY_SIZE 8 40 + 41 + struct var_kill_data_arr_t { 42 + struct var_kill_data_t array[KILL_DATA_ARRAY_SIZE]; 43 + }; 44 + 45 + union any_profiler_data_t { 46 + struct var_exec_data_t var_exec; 47 + struct var_kill_data_t var_kill; 48 + struct var_sysctl_data_t var_sysctl; 49 + struct var_filemod_data_t var_filemod; 50 + struct var_fork_data_t var_fork; 51 + struct var_kill_data_arr_t var_kill_data_arr; 52 + }; 53 + 54 + volatile struct profiler_config_struct bpf_config = {}; 55 + 56 + #define FETCH_CGROUPS_FROM_BPF (bpf_config.fetch_cgroups_from_bpf) 57 + #define CGROUP_FS_INODE (bpf_config.cgroup_fs_inode) 58 + #define CGROUP_LOGIN_SESSION_INODE \ 59 + (bpf_config.cgroup_login_session_inode) 60 + #define KILL_SIGNALS (bpf_config.kill_signals_mask) 
61 + #define STALE_INFO (bpf_config.stale_info_secs) 62 + #define INODE_FILTER (bpf_config.inode_filter) 63 + #define READ_ENVIRON_FROM_EXEC (bpf_config.read_environ_from_exec) 64 + #define ENABLE_CGROUP_V1_RESOLVER (bpf_config.enable_cgroup_v1_resolver) 65 + 66 + struct kernfs_iattrs___52 { 67 + struct iattr ia_iattr; 68 + }; 69 + 70 + struct kernfs_node___52 { 71 + union /* kernfs_node_id */ { 72 + struct { 73 + u32 ino; 74 + u32 generation; 75 + }; 76 + u64 id; 77 + } id; 78 + }; 79 + 80 + struct { 81 + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); 82 + __uint(max_entries, 1); 83 + __type(key, u32); 84 + __type(value, union any_profiler_data_t); 85 + } data_heap SEC(".maps"); 86 + 87 + struct { 88 + __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY); 89 + __uint(key_size, sizeof(int)); 90 + __uint(value_size, sizeof(int)); 91 + } events SEC(".maps"); 92 + 93 + struct { 94 + __uint(type, BPF_MAP_TYPE_HASH); 95 + __uint(max_entries, KILL_DATA_ARRAY_SIZE); 96 + __type(key, u32); 97 + __type(value, struct var_kill_data_arr_t); 98 + } var_tpid_to_data SEC(".maps"); 99 + 100 + struct { 101 + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); 102 + __uint(max_entries, profiler_bpf_max_function_id); 103 + __type(key, u32); 104 + __type(value, struct bpf_func_stats_data); 105 + } bpf_func_stats SEC(".maps"); 106 + 107 + struct { 108 + __uint(type, BPF_MAP_TYPE_HASH); 109 + __type(key, u32); 110 + __type(value, bool); 111 + __uint(max_entries, 16); 112 + } allowed_devices SEC(".maps"); 113 + 114 + struct { 115 + __uint(type, BPF_MAP_TYPE_HASH); 116 + __type(key, u64); 117 + __type(value, bool); 118 + __uint(max_entries, 1024); 119 + } allowed_file_inodes SEC(".maps"); 120 + 121 + struct { 122 + __uint(type, BPF_MAP_TYPE_HASH); 123 + __type(key, u64); 124 + __type(value, bool); 125 + __uint(max_entries, 1024); 126 + } allowed_directory_inodes SEC(".maps"); 127 + 128 + struct { 129 + __uint(type, BPF_MAP_TYPE_HASH); 130 + __type(key, u32); 131 + __type(value, bool); 132 + __uint(max_entries, 
16); 133 + } disallowed_exec_inodes SEC(".maps"); 134 + 135 + #ifndef ARRAY_SIZE 136 + #define ARRAY_SIZE(arr) (sizeof(arr) / sizeof(arr[0])) 137 + #endif 138 + 139 + static INLINE bool IS_ERR(const void* ptr) 140 + { 141 + return IS_ERR_VALUE((unsigned long)ptr); 142 + } 143 + 144 + static INLINE u32 get_userspace_pid() 145 + { 146 + return bpf_get_current_pid_tgid() >> 32; 147 + } 148 + 149 + static INLINE bool is_init_process(u32 tgid) 150 + { 151 + return tgid == 1 || tgid == 0; 152 + } 153 + 154 + static INLINE unsigned long 155 + probe_read_lim(void* dst, void* src, unsigned long len, unsigned long max) 156 + { 157 + len = len < max ? len : max; 158 + if (len > 1) { 159 + if (bpf_probe_read(dst, len, src)) 160 + return 0; 161 + } else if (len == 1) { 162 + if (bpf_probe_read(dst, 1, src)) 163 + return 0; 164 + } 165 + return len; 166 + } 167 + 168 + static INLINE int get_var_spid_index(struct var_kill_data_arr_t* arr_struct, 169 + int spid) 170 + { 171 + #ifdef UNROLL 172 + #pragma unroll 173 + #endif 174 + for (int i = 0; i < ARRAY_SIZE(arr_struct->array); i++) 175 + if (arr_struct->array[i].meta.pid == spid) 176 + return i; 177 + return -1; 178 + } 179 + 180 + static INLINE void populate_ancestors(struct task_struct* task, 181 + struct ancestors_data_t* ancestors_data) 182 + { 183 + struct task_struct* parent = task; 184 + u32 num_ancestors, ppid; 185 + 186 + ancestors_data->num_ancestors = 0; 187 + #ifdef UNROLL 188 + #pragma unroll 189 + #endif 190 + for (num_ancestors = 0; num_ancestors < MAX_ANCESTORS; num_ancestors++) { 191 + parent = BPF_CORE_READ(parent, real_parent); 192 + if (parent == NULL) 193 + break; 194 + ppid = BPF_CORE_READ(parent, tgid); 195 + if (is_init_process(ppid)) 196 + break; 197 + ancestors_data->ancestor_pids[num_ancestors] = ppid; 198 + ancestors_data->ancestor_exec_ids[num_ancestors] = 199 + BPF_CORE_READ(parent, self_exec_id); 200 + ancestors_data->ancestor_start_times[num_ancestors] = 201 + BPF_CORE_READ(parent, start_time); 
202 + ancestors_data->num_ancestors = num_ancestors; 203 + } 204 + } 205 + 206 + static INLINE void* read_full_cgroup_path(struct kernfs_node* cgroup_node, 207 + struct kernfs_node* cgroup_root_node, 208 + void* payload, 209 + int* root_pos) 210 + { 211 + void* payload_start = payload; 212 + size_t filepart_length; 213 + 214 + #ifdef UNROLL 215 + #pragma unroll 216 + #endif 217 + for (int i = 0; i < MAX_CGROUPS_PATH_DEPTH; i++) { 218 + filepart_length = 219 + bpf_probe_read_str(payload, MAX_PATH, BPF_CORE_READ(cgroup_node, name)); 220 + if (!cgroup_node) 221 + return payload; 222 + if (cgroup_node == cgroup_root_node) 223 + *root_pos = payload - payload_start; 224 + if (filepart_length <= MAX_PATH) { 225 + barrier_var(filepart_length); 226 + payload += filepart_length; 227 + } 228 + cgroup_node = BPF_CORE_READ(cgroup_node, parent); 229 + } 230 + return payload; 231 + } 232 + 233 + static ino_t get_inode_from_kernfs(struct kernfs_node* node) 234 + { 235 + struct kernfs_node___52* node52 = (void*)node; 236 + 237 + if (bpf_core_field_exists(node52->id.ino)) { 238 + barrier_var(node52); 239 + return BPF_CORE_READ(node52, id.ino); 240 + } else { 241 + barrier_var(node); 242 + return (u64)BPF_CORE_READ(node, id); 243 + } 244 + } 245 + 246 + int pids_cgrp_id = 1; 247 + 248 + static INLINE void* populate_cgroup_info(struct cgroup_data_t* cgroup_data, 249 + struct task_struct* task, 250 + void* payload) 251 + { 252 + struct kernfs_node* root_kernfs = 253 + BPF_CORE_READ(task, nsproxy, cgroup_ns, root_cset, dfl_cgrp, kn); 254 + struct kernfs_node* proc_kernfs = BPF_CORE_READ(task, cgroups, dfl_cgrp, kn); 255 + 256 + if (ENABLE_CGROUP_V1_RESOLVER) { 257 + #ifdef UNROLL 258 + #pragma unroll 259 + #endif 260 + for (int i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 261 + struct cgroup_subsys_state* subsys = 262 + BPF_CORE_READ(task, cgroups, subsys[i]); 263 + if (subsys != NULL) { 264 + int subsys_id = BPF_CORE_READ(subsys, ss, id); 265 + if (subsys_id == pids_cgrp_id) { 266 + 
proc_kernfs = BPF_CORE_READ(subsys, cgroup, kn); 267 + root_kernfs = BPF_CORE_READ(subsys, ss, root, kf_root, kn); 268 + break; 269 + } 270 + } 271 + } 272 + } 273 + 274 + cgroup_data->cgroup_root_inode = get_inode_from_kernfs(root_kernfs); 275 + cgroup_data->cgroup_proc_inode = get_inode_from_kernfs(proc_kernfs); 276 + 277 + if (bpf_core_field_exists(root_kernfs->iattr->ia_mtime)) { 278 + cgroup_data->cgroup_root_mtime = 279 + BPF_CORE_READ(root_kernfs, iattr, ia_mtime.tv_nsec); 280 + cgroup_data->cgroup_proc_mtime = 281 + BPF_CORE_READ(proc_kernfs, iattr, ia_mtime.tv_nsec); 282 + } else { 283 + struct kernfs_iattrs___52* root_iattr = 284 + (struct kernfs_iattrs___52*)BPF_CORE_READ(root_kernfs, iattr); 285 + cgroup_data->cgroup_root_mtime = 286 + BPF_CORE_READ(root_iattr, ia_iattr.ia_mtime.tv_nsec); 287 + 288 + struct kernfs_iattrs___52* proc_iattr = 289 + (struct kernfs_iattrs___52*)BPF_CORE_READ(proc_kernfs, iattr); 290 + cgroup_data->cgroup_proc_mtime = 291 + BPF_CORE_READ(proc_iattr, ia_iattr.ia_mtime.tv_nsec); 292 + } 293 + 294 + cgroup_data->cgroup_root_length = 0; 295 + cgroup_data->cgroup_proc_length = 0; 296 + cgroup_data->cgroup_full_length = 0; 297 + 298 + size_t cgroup_root_length = 299 + bpf_probe_read_str(payload, MAX_PATH, BPF_CORE_READ(root_kernfs, name)); 300 + barrier_var(cgroup_root_length); 301 + if (cgroup_root_length <= MAX_PATH) { 302 + barrier_var(cgroup_root_length); 303 + cgroup_data->cgroup_root_length = cgroup_root_length; 304 + payload += cgroup_root_length; 305 + } 306 + 307 + size_t cgroup_proc_length = 308 + bpf_probe_read_str(payload, MAX_PATH, BPF_CORE_READ(proc_kernfs, name)); 309 + barrier_var(cgroup_proc_length); 310 + if (cgroup_proc_length <= MAX_PATH) { 311 + barrier_var(cgroup_proc_length); 312 + cgroup_data->cgroup_proc_length = cgroup_proc_length; 313 + payload += cgroup_proc_length; 314 + } 315 + 316 + if (FETCH_CGROUPS_FROM_BPF) { 317 + cgroup_data->cgroup_full_path_root_pos = -1; 318 + void* payload_end_pos = 
read_full_cgroup_path(proc_kernfs, root_kernfs, payload, 319 + &cgroup_data->cgroup_full_path_root_pos); 320 + cgroup_data->cgroup_full_length = payload_end_pos - payload; 321 + payload = payload_end_pos; 322 + } 323 + 324 + return (void*)payload; 325 + } 326 + 327 + static INLINE void* populate_var_metadata(struct var_metadata_t* metadata, 328 + struct task_struct* task, 329 + u32 pid, void* payload) 330 + { 331 + u64 uid_gid = bpf_get_current_uid_gid(); 332 + 333 + metadata->uid = (u32)uid_gid; 334 + metadata->gid = uid_gid >> 32; 335 + metadata->pid = pid; 336 + metadata->exec_id = BPF_CORE_READ(task, self_exec_id); 337 + metadata->start_time = BPF_CORE_READ(task, start_time); 338 + metadata->comm_length = 0; 339 + 340 + size_t comm_length = bpf_core_read_str(payload, TASK_COMM_LEN, &task->comm); 341 + barrier_var(comm_length); 342 + if (comm_length <= TASK_COMM_LEN) { 343 + barrier_var(comm_length); 344 + metadata->comm_length = comm_length; 345 + payload += comm_length; 346 + } 347 + 348 + return (void*)payload; 349 + } 350 + 351 + static INLINE struct var_kill_data_t* 352 + get_var_kill_data(struct pt_regs* ctx, int spid, int tpid, int sig) 353 + { 354 + int zero = 0; 355 + struct var_kill_data_t* kill_data = bpf_map_lookup_elem(&data_heap, &zero); 356 + 357 + if (kill_data == NULL) 358 + return NULL; 359 + struct task_struct* task = (struct task_struct*)bpf_get_current_task(); 360 + 361 + void* payload = populate_var_metadata(&kill_data->meta, task, spid, kill_data->payload); 362 + payload = populate_cgroup_info(&kill_data->cgroup_data, task, payload); 363 + size_t payload_length = payload - (void*)kill_data->payload; 364 + kill_data->payload_length = payload_length; 365 + populate_ancestors(task, &kill_data->ancestors_info); 366 + kill_data->meta.type = KILL_EVENT; 367 + kill_data->kill_target_pid = tpid; 368 + kill_data->kill_sig = sig; 369 + kill_data->kill_count = 1; 370 + kill_data->last_kill_time = bpf_ktime_get_ns(); 371 + return kill_data; 372 + } 
373 + 374 + static INLINE int trace_var_sys_kill(void* ctx, int tpid, int sig) 375 + { 376 + if ((KILL_SIGNALS & (1ULL << sig)) == 0) 377 + return 0; 378 + 379 + u32 spid = get_userspace_pid(); 380 + struct var_kill_data_arr_t* arr_struct = bpf_map_lookup_elem(&var_tpid_to_data, &tpid); 381 + 382 + if (arr_struct == NULL) { 383 + struct var_kill_data_t* kill_data = get_var_kill_data(ctx, spid, tpid, sig); 384 + int zero = 0; 385 + 386 + if (kill_data == NULL) 387 + return 0; 388 + arr_struct = bpf_map_lookup_elem(&data_heap, &zero); 389 + if (arr_struct == NULL) 390 + return 0; 391 + bpf_probe_read(&arr_struct->array[0], sizeof(arr_struct->array[0]), kill_data); 392 + } else { 393 + int index = get_var_spid_index(arr_struct, spid); 394 + 395 + if (index == -1) { 396 + struct var_kill_data_t* kill_data = 397 + get_var_kill_data(ctx, spid, tpid, sig); 398 + if (kill_data == NULL) 399 + return 0; 400 + #ifdef UNROLL 401 + #pragma unroll 402 + #endif 403 + for (int i = 0; i < ARRAY_SIZE(arr_struct->array); i++) 404 + if (arr_struct->array[i].meta.pid == 0) { 405 + bpf_probe_read(&arr_struct->array[i], 406 + sizeof(arr_struct->array[i]), kill_data); 407 + bpf_map_update_elem(&var_tpid_to_data, &tpid, 408 + arr_struct, 0); 409 + 410 + return 0; 411 + } 412 + return 0; 413 + } 414 + 415 + struct var_kill_data_t* kill_data = &arr_struct->array[index]; 416 + 417 + u64 delta_sec = 418 + (bpf_ktime_get_ns() - kill_data->last_kill_time) / 1000000000; 419 + 420 + if (delta_sec < STALE_INFO) { 421 + kill_data->kill_count++; 422 + kill_data->last_kill_time = bpf_ktime_get_ns(); 423 + bpf_probe_read(&arr_struct->array[index], 424 + sizeof(arr_struct->array[index]), 425 + kill_data); 426 + } else { 427 + struct var_kill_data_t* kill_data = 428 + get_var_kill_data(ctx, spid, tpid, sig); 429 + if (kill_data == NULL) 430 + return 0; 431 + bpf_probe_read(&arr_struct->array[index], 432 + sizeof(arr_struct->array[index]), 433 + kill_data); 434 + } 435 + } 436 + 
bpf_map_update_elem(&var_tpid_to_data, &tpid, arr_struct, 0); 437 + return 0; 438 + } 439 + 440 + static INLINE void bpf_stats_enter(struct bpf_func_stats_ctx* bpf_stat_ctx, 441 + enum bpf_function_id func_id) 442 + { 443 + int func_id_key = func_id; 444 + 445 + bpf_stat_ctx->start_time_ns = bpf_ktime_get_ns(); 446 + bpf_stat_ctx->bpf_func_stats_data_val = 447 + bpf_map_lookup_elem(&bpf_func_stats, &func_id_key); 448 + if (bpf_stat_ctx->bpf_func_stats_data_val) 449 + bpf_stat_ctx->bpf_func_stats_data_val->num_executions++; 450 + } 451 + 452 + static INLINE void bpf_stats_exit(struct bpf_func_stats_ctx* bpf_stat_ctx) 453 + { 454 + if (bpf_stat_ctx->bpf_func_stats_data_val) 455 + bpf_stat_ctx->bpf_func_stats_data_val->time_elapsed_ns += 456 + bpf_ktime_get_ns() - bpf_stat_ctx->start_time_ns; 457 + } 458 + 459 + static INLINE void 460 + bpf_stats_pre_submit_var_perf_event(struct bpf_func_stats_ctx* bpf_stat_ctx, 461 + struct var_metadata_t* meta) 462 + { 463 + if (bpf_stat_ctx->bpf_func_stats_data_val) { 464 + bpf_stat_ctx->bpf_func_stats_data_val->num_perf_events++; 465 + meta->bpf_stats_num_perf_events = 466 + bpf_stat_ctx->bpf_func_stats_data_val->num_perf_events; 467 + } 468 + meta->bpf_stats_start_ktime_ns = bpf_stat_ctx->start_time_ns; 469 + meta->cpu_id = bpf_get_smp_processor_id(); 470 + } 471 + 472 + static INLINE size_t 473 + read_absolute_file_path_from_dentry(struct dentry* filp_dentry, void* payload) 474 + { 475 + size_t length = 0; 476 + size_t filepart_length; 477 + struct dentry* parent_dentry; 478 + 479 + #ifdef UNROLL 480 + #pragma unroll 481 + #endif 482 + for (int i = 0; i < MAX_PATH_DEPTH; i++) { 483 + filepart_length = bpf_probe_read_str(payload, MAX_PATH, 484 + BPF_CORE_READ(filp_dentry, d_name.name)); 485 + barrier_var(filepart_length); 486 + if (filepart_length > MAX_PATH) 487 + break; 488 + barrier_var(filepart_length); 489 + payload += filepart_length; 490 + length += filepart_length; 491 + 492 + parent_dentry = BPF_CORE_READ(filp_dentry, 
d_parent); 493 + if (filp_dentry == parent_dentry) 494 + break; 495 + filp_dentry = parent_dentry; 496 + } 497 + 498 + return length; 499 + } 500 + 501 + static INLINE bool 502 + is_ancestor_in_allowed_inodes(struct dentry* filp_dentry) 503 + { 504 + struct dentry* parent_dentry; 505 + #ifdef UNROLL 506 + #pragma unroll 507 + #endif 508 + for (int i = 0; i < MAX_PATH_DEPTH; i++) { 509 + u64 dir_ino = BPF_CORE_READ(filp_dentry, d_inode, i_ino); 510 + bool* allowed_dir = bpf_map_lookup_elem(&allowed_directory_inodes, &dir_ino); 511 + 512 + if (allowed_dir != NULL) 513 + return true; 514 + parent_dentry = BPF_CORE_READ(filp_dentry, d_parent); 515 + if (filp_dentry == parent_dentry) 516 + break; 517 + filp_dentry = parent_dentry; 518 + } 519 + return false; 520 + } 521 + 522 + static INLINE bool is_dentry_allowed_for_filemod(struct dentry* file_dentry, 523 + u32* device_id, 524 + u64* file_ino) 525 + { 526 + u32 dev_id = BPF_CORE_READ(file_dentry, d_sb, s_dev); 527 + *device_id = dev_id; 528 + bool* allowed_device = bpf_map_lookup_elem(&allowed_devices, &dev_id); 529 + 530 + if (allowed_device == NULL) 531 + return false; 532 + 533 + u64 ino = BPF_CORE_READ(file_dentry, d_inode, i_ino); 534 + *file_ino = ino; 535 + bool* allowed_file = bpf_map_lookup_elem(&allowed_file_inodes, &ino); 536 + 537 + if (allowed_file == NULL) 538 + if (!is_ancestor_in_allowed_inodes(BPF_CORE_READ(file_dentry, d_parent))) 539 + return false; 540 + return true; 541 + } 542 + 543 + SEC("kprobe/proc_sys_write") 544 + ssize_t BPF_KPROBE(kprobe__proc_sys_write, 545 + struct file* filp, const char* buf, 546 + size_t count, loff_t* ppos) 547 + { 548 + struct bpf_func_stats_ctx stats_ctx; 549 + bpf_stats_enter(&stats_ctx, profiler_bpf_proc_sys_write); 550 + 551 + u32 pid = get_userspace_pid(); 552 + int zero = 0; 553 + struct var_sysctl_data_t* sysctl_data = 554 + bpf_map_lookup_elem(&data_heap, &zero); 555 + if (!sysctl_data) 556 + goto out; 557 + 558 + struct task_struct* task = (struct 
task_struct*)bpf_get_current_task(); 559 + sysctl_data->meta.type = SYSCTL_EVENT; 560 + void* payload = populate_var_metadata(&sysctl_data->meta, task, pid, sysctl_data->payload); 561 + payload = populate_cgroup_info(&sysctl_data->cgroup_data, task, payload); 562 + 563 + populate_ancestors(task, &sysctl_data->ancestors_info); 564 + 565 + sysctl_data->sysctl_val_length = 0; 566 + sysctl_data->sysctl_path_length = 0; 567 + 568 + size_t sysctl_val_length = bpf_probe_read_str(payload, CTL_MAXNAME, buf); 569 + barrier_var(sysctl_val_length); 570 + if (sysctl_val_length <= CTL_MAXNAME) { 571 + barrier_var(sysctl_val_length); 572 + sysctl_data->sysctl_val_length = sysctl_val_length; 573 + payload += sysctl_val_length; 574 + } 575 + 576 + size_t sysctl_path_length = bpf_probe_read_str(payload, MAX_PATH, 577 + BPF_CORE_READ(filp, f_path.dentry, d_name.name)); 578 + barrier_var(sysctl_path_length); 579 + if (sysctl_path_length <= MAX_PATH) { 580 + barrier_var(sysctl_path_length); 581 + sysctl_data->sysctl_path_length = sysctl_path_length; 582 + payload += sysctl_path_length; 583 + } 584 + 585 + bpf_stats_pre_submit_var_perf_event(&stats_ctx, &sysctl_data->meta); 586 + unsigned long data_len = payload - (void*)sysctl_data; 587 + data_len = data_len > sizeof(struct var_sysctl_data_t) 588 + ? 
sizeof(struct var_sysctl_data_t) 589 + : data_len; 590 + bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, sysctl_data, data_len); 591 + out: 592 + bpf_stats_exit(&stats_ctx); 593 + return 0; 594 + } 595 + 596 + SEC("tracepoint/syscalls/sys_enter_kill") 597 + int tracepoint__syscalls__sys_enter_kill(struct trace_event_raw_sys_enter* ctx) 598 + { 599 + struct bpf_func_stats_ctx stats_ctx; 600 + 601 + bpf_stats_enter(&stats_ctx, profiler_bpf_sys_enter_kill); 602 + int pid = ctx->args[0]; 603 + int sig = ctx->args[1]; 604 + int ret = trace_var_sys_kill(ctx, pid, sig); 605 + bpf_stats_exit(&stats_ctx); 606 + return ret; 607 + }; 608 + 609 + SEC("raw_tracepoint/sched_process_exit") 610 + int raw_tracepoint__sched_process_exit(void* ctx) 611 + { 612 + int zero = 0; 613 + struct bpf_func_stats_ctx stats_ctx; 614 + bpf_stats_enter(&stats_ctx, profiler_bpf_sched_process_exit); 615 + 616 + u32 tpid = get_userspace_pid(); 617 + 618 + struct var_kill_data_arr_t* arr_struct = bpf_map_lookup_elem(&var_tpid_to_data, &tpid); 619 + struct var_kill_data_t* kill_data = bpf_map_lookup_elem(&data_heap, &zero); 620 + 621 + if (arr_struct == NULL || kill_data == NULL) 622 + goto out; 623 + 624 + struct task_struct* task = (struct task_struct*)bpf_get_current_task(); 625 + struct kernfs_node* proc_kernfs = BPF_CORE_READ(task, cgroups, dfl_cgrp, kn); 626 + 627 + #ifdef UNROLL 628 + #pragma unroll 629 + #endif 630 + for (int i = 0; i < ARRAY_SIZE(arr_struct->array); i++) { 631 + struct var_kill_data_t* past_kill_data = &arr_struct->array[i]; 632 + 633 + if (past_kill_data != NULL && past_kill_data->kill_target_pid == tpid) { 634 + bpf_probe_read(kill_data, sizeof(*past_kill_data), past_kill_data); 635 + void* payload = kill_data->payload; 636 + size_t offset = kill_data->payload_length; 637 + if (offset >= MAX_METADATA_PAYLOAD_LEN + MAX_CGROUP_PAYLOAD_LEN) 638 + return 0; 639 + payload += offset; 640 + 641 + kill_data->kill_target_name_length = 0; 642 + 
kill_data->kill_target_cgroup_proc_length = 0; 643 + 644 + size_t comm_length = bpf_core_read_str(payload, TASK_COMM_LEN, &task->comm); 645 + barrier_var(comm_length); 646 + if (comm_length <= TASK_COMM_LEN) { 647 + barrier_var(comm_length); 648 + kill_data->kill_target_name_length = comm_length; 649 + payload += comm_length; 650 + } 651 + 652 + size_t cgroup_proc_length = bpf_probe_read_str(payload, KILL_TARGET_LEN, 653 + BPF_CORE_READ(proc_kernfs, name)); 654 + barrier_var(cgroup_proc_length); 655 + if (cgroup_proc_length <= KILL_TARGET_LEN) { 656 + barrier_var(cgroup_proc_length); 657 + kill_data->kill_target_cgroup_proc_length = cgroup_proc_length; 658 + payload += cgroup_proc_length; 659 + } 660 + 661 + bpf_stats_pre_submit_var_perf_event(&stats_ctx, &kill_data->meta); 662 + unsigned long data_len = (void*)payload - (void*)kill_data; 663 + data_len = data_len > sizeof(struct var_kill_data_t) 664 + ? sizeof(struct var_kill_data_t) 665 + : data_len; 666 + bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, kill_data, data_len); 667 + } 668 + } 669 + bpf_map_delete_elem(&var_tpid_to_data, &tpid); 670 + out: 671 + bpf_stats_exit(&stats_ctx); 672 + return 0; 673 + } 674 + 675 + SEC("raw_tracepoint/sched_process_exec") 676 + int raw_tracepoint__sched_process_exec(struct bpf_raw_tracepoint_args* ctx) 677 + { 678 + struct bpf_func_stats_ctx stats_ctx; 679 + bpf_stats_enter(&stats_ctx, profiler_bpf_sched_process_exec); 680 + 681 + struct linux_binprm* bprm = (struct linux_binprm*)ctx->args[2]; 682 + u64 inode = BPF_CORE_READ(bprm, file, f_inode, i_ino); 683 + 684 + bool* should_filter_binprm = bpf_map_lookup_elem(&disallowed_exec_inodes, &inode); 685 + if (should_filter_binprm != NULL) 686 + goto out; 687 + 688 + int zero = 0; 689 + struct var_exec_data_t* proc_exec_data = bpf_map_lookup_elem(&data_heap, &zero); 690 + if (!proc_exec_data) 691 + goto out; 692 + 693 + if (INODE_FILTER && inode != INODE_FILTER) 694 + return 0; 695 + 696 + u32 pid = get_userspace_pid(); 
697 + struct task_struct* task = (struct task_struct*)bpf_get_current_task(); 698 + 699 + proc_exec_data->meta.type = EXEC_EVENT; 700 + proc_exec_data->bin_path_length = 0; 701 + proc_exec_data->cmdline_length = 0; 702 + proc_exec_data->environment_length = 0; 703 + void* payload = populate_var_metadata(&proc_exec_data->meta, task, pid, 704 + proc_exec_data->payload); 705 + payload = populate_cgroup_info(&proc_exec_data->cgroup_data, task, payload); 706 + 707 + struct task_struct* parent_task = BPF_CORE_READ(task, real_parent); 708 + proc_exec_data->parent_pid = BPF_CORE_READ(parent_task, tgid); 709 + proc_exec_data->parent_uid = BPF_CORE_READ(parent_task, real_cred, uid.val); 710 + proc_exec_data->parent_exec_id = BPF_CORE_READ(parent_task, self_exec_id); 711 + proc_exec_data->parent_start_time = BPF_CORE_READ(parent_task, start_time); 712 + 713 + const char* filename = BPF_CORE_READ(bprm, filename); 714 + size_t bin_path_length = bpf_probe_read_str(payload, MAX_FILENAME_LEN, filename); 715 + barrier_var(bin_path_length); 716 + if (bin_path_length <= MAX_FILENAME_LEN) { 717 + barrier_var(bin_path_length); 718 + proc_exec_data->bin_path_length = bin_path_length; 719 + payload += bin_path_length; 720 + } 721 + 722 + void* arg_start = (void*)BPF_CORE_READ(task, mm, arg_start); 723 + void* arg_end = (void*)BPF_CORE_READ(task, mm, arg_end); 724 + unsigned int cmdline_length = probe_read_lim(payload, arg_start, 725 + arg_end - arg_start, MAX_ARGS_LEN); 726 + 727 + if (cmdline_length <= MAX_ARGS_LEN) { 728 + barrier_var(cmdline_length); 729 + proc_exec_data->cmdline_length = cmdline_length; 730 + payload += cmdline_length; 731 + } 732 + 733 + if (READ_ENVIRON_FROM_EXEC) { 734 + void* env_start = (void*)BPF_CORE_READ(task, mm, env_start); 735 + void* env_end = (void*)BPF_CORE_READ(task, mm, env_end); 736 + unsigned long env_len = probe_read_lim(payload, env_start, 737 + env_end - env_start, MAX_ENVIRON_LEN); 738 + if (cmdline_length <= MAX_ENVIRON_LEN) { 739 + 
proc_exec_data->environment_length = env_len; 740 + payload += env_len; 741 + } 742 + } 743 + 744 + bpf_stats_pre_submit_var_perf_event(&stats_ctx, &proc_exec_data->meta); 745 + unsigned long data_len = payload - (void*)proc_exec_data; 746 + data_len = data_len > sizeof(struct var_exec_data_t) 747 + ? sizeof(struct var_exec_data_t) 748 + : data_len; 749 + bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, proc_exec_data, data_len); 750 + out: 751 + bpf_stats_exit(&stats_ctx); 752 + return 0; 753 + } 754 + 755 + SEC("kretprobe/do_filp_open") 756 + int kprobe_ret__do_filp_open(struct pt_regs* ctx) 757 + { 758 + struct bpf_func_stats_ctx stats_ctx; 759 + bpf_stats_enter(&stats_ctx, profiler_bpf_do_filp_open_ret); 760 + 761 + struct file* filp = (struct file*)PT_REGS_RC_CORE(ctx); 762 + 763 + if (filp == NULL || IS_ERR(filp)) 764 + goto out; 765 + unsigned int flags = BPF_CORE_READ(filp, f_flags); 766 + if ((flags & (O_RDWR | O_WRONLY)) == 0) 767 + goto out; 768 + if ((flags & O_TMPFILE) > 0) 769 + goto out; 770 + struct inode* file_inode = BPF_CORE_READ(filp, f_inode); 771 + umode_t mode = BPF_CORE_READ(file_inode, i_mode); 772 + if (S_ISDIR(mode) || S_ISCHR(mode) || S_ISBLK(mode) || S_ISFIFO(mode) || 773 + S_ISSOCK(mode)) 774 + goto out; 775 + 776 + struct dentry* filp_dentry = BPF_CORE_READ(filp, f_path.dentry); 777 + u32 device_id = 0; 778 + u64 file_ino = 0; 779 + if (!is_dentry_allowed_for_filemod(filp_dentry, &device_id, &file_ino)) 780 + goto out; 781 + 782 + int zero = 0; 783 + struct var_filemod_data_t* filemod_data = bpf_map_lookup_elem(&data_heap, &zero); 784 + if (!filemod_data) 785 + goto out; 786 + 787 + u32 pid = get_userspace_pid(); 788 + struct task_struct* task = (struct task_struct*)bpf_get_current_task(); 789 + 790 + filemod_data->meta.type = FILEMOD_EVENT; 791 + filemod_data->fmod_type = FMOD_OPEN; 792 + filemod_data->dst_flags = flags; 793 + filemod_data->src_inode = 0; 794 + filemod_data->dst_inode = file_ino; 795 + 
filemod_data->src_device_id = 0; 796 + filemod_data->dst_device_id = device_id; 797 + filemod_data->src_filepath_length = 0; 798 + filemod_data->dst_filepath_length = 0; 799 + 800 + void* payload = populate_var_metadata(&filemod_data->meta, task, pid, 801 + filemod_data->payload); 802 + payload = populate_cgroup_info(&filemod_data->cgroup_data, task, payload); 803 + 804 + size_t len = read_absolute_file_path_from_dentry(filp_dentry, payload); 805 + barrier_var(len); 806 + if (len <= MAX_FILEPATH_LENGTH) { 807 + barrier_var(len); 808 + payload += len; 809 + filemod_data->dst_filepath_length = len; 810 + } 811 + bpf_stats_pre_submit_var_perf_event(&stats_ctx, &filemod_data->meta); 812 + unsigned long data_len = payload - (void*)filemod_data; 813 + data_len = data_len > sizeof(*filemod_data) ? sizeof(*filemod_data) : data_len; 814 + bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, filemod_data, data_len); 815 + out: 816 + bpf_stats_exit(&stats_ctx); 817 + return 0; 818 + } 819 + 820 + SEC("kprobe/vfs_link") 821 + int BPF_KPROBE(kprobe__vfs_link, 822 + struct dentry* old_dentry, struct inode* dir, 823 + struct dentry* new_dentry, struct inode** delegated_inode) 824 + { 825 + struct bpf_func_stats_ctx stats_ctx; 826 + bpf_stats_enter(&stats_ctx, profiler_bpf_vfs_link); 827 + 828 + u32 src_device_id = 0; 829 + u64 src_file_ino = 0; 830 + u32 dst_device_id = 0; 831 + u64 dst_file_ino = 0; 832 + if (!is_dentry_allowed_for_filemod(old_dentry, &src_device_id, &src_file_ino) && 833 + !is_dentry_allowed_for_filemod(new_dentry, &dst_device_id, &dst_file_ino)) 834 + goto out; 835 + 836 + int zero = 0; 837 + struct var_filemod_data_t* filemod_data = bpf_map_lookup_elem(&data_heap, &zero); 838 + if (!filemod_data) 839 + goto out; 840 + 841 + u32 pid = get_userspace_pid(); 842 + struct task_struct* task = (struct task_struct*)bpf_get_current_task(); 843 + 844 + filemod_data->meta.type = FILEMOD_EVENT; 845 + filemod_data->fmod_type = FMOD_LINK; 846 + filemod_data->dst_flags = 
0; 847 + filemod_data->src_inode = src_file_ino; 848 + filemod_data->dst_inode = dst_file_ino; 849 + filemod_data->src_device_id = src_device_id; 850 + filemod_data->dst_device_id = dst_device_id; 851 + filemod_data->src_filepath_length = 0; 852 + filemod_data->dst_filepath_length = 0; 853 + 854 + void* payload = populate_var_metadata(&filemod_data->meta, task, pid, 855 + filemod_data->payload); 856 + payload = populate_cgroup_info(&filemod_data->cgroup_data, task, payload); 857 + 858 + size_t len = read_absolute_file_path_from_dentry(old_dentry, payload); 859 + barrier_var(len); 860 + if (len <= MAX_FILEPATH_LENGTH) { 861 + barrier_var(len); 862 + payload += len; 863 + filemod_data->src_filepath_length = len; 864 + } 865 + 866 + len = read_absolute_file_path_from_dentry(new_dentry, payload); 867 + barrier_var(len); 868 + if (len <= MAX_FILEPATH_LENGTH) { 869 + barrier_var(len); 870 + payload += len; 871 + filemod_data->dst_filepath_length = len; 872 + } 873 + 874 + bpf_stats_pre_submit_var_perf_event(&stats_ctx, &filemod_data->meta); 875 + unsigned long data_len = payload - (void*)filemod_data; 876 + data_len = data_len > sizeof(*filemod_data) ? 
sizeof(*filemod_data) : data_len; 877 + bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, filemod_data, data_len); 878 + out: 879 + bpf_stats_exit(&stats_ctx); 880 + return 0; 881 + } 882 + 883 + SEC("kprobe/vfs_symlink") 884 + int BPF_KPROBE(kprobe__vfs_symlink, struct inode* dir, struct dentry* dentry, 885 + const char* oldname) 886 + { 887 + struct bpf_func_stats_ctx stats_ctx; 888 + bpf_stats_enter(&stats_ctx, profiler_bpf_vfs_symlink); 889 + 890 + u32 dst_device_id = 0; 891 + u64 dst_file_ino = 0; 892 + if (!is_dentry_allowed_for_filemod(dentry, &dst_device_id, &dst_file_ino)) 893 + goto out; 894 + 895 + int zero = 0; 896 + struct var_filemod_data_t* filemod_data = bpf_map_lookup_elem(&data_heap, &zero); 897 + if (!filemod_data) 898 + goto out; 899 + 900 + u32 pid = get_userspace_pid(); 901 + struct task_struct* task = (struct task_struct*)bpf_get_current_task(); 902 + 903 + filemod_data->meta.type = FILEMOD_EVENT; 904 + filemod_data->fmod_type = FMOD_SYMLINK; 905 + filemod_data->dst_flags = 0; 906 + filemod_data->src_inode = 0; 907 + filemod_data->dst_inode = dst_file_ino; 908 + filemod_data->src_device_id = 0; 909 + filemod_data->dst_device_id = dst_device_id; 910 + filemod_data->src_filepath_length = 0; 911 + filemod_data->dst_filepath_length = 0; 912 + 913 + void* payload = populate_var_metadata(&filemod_data->meta, task, pid, 914 + filemod_data->payload); 915 + payload = populate_cgroup_info(&filemod_data->cgroup_data, task, payload); 916 + 917 + size_t len = bpf_probe_read_str(payload, MAX_FILEPATH_LENGTH, oldname); 918 + barrier_var(len); 919 + if (len <= MAX_FILEPATH_LENGTH) { 920 + barrier_var(len); 921 + payload += len; 922 + filemod_data->src_filepath_length = len; 923 + } 924 + len = read_absolute_file_path_from_dentry(dentry, payload); 925 + barrier_var(len); 926 + if (len <= MAX_FILEPATH_LENGTH) { 927 + barrier_var(len); 928 + payload += len; 929 + filemod_data->dst_filepath_length = len; 930 + } 931 + 
bpf_stats_pre_submit_var_perf_event(&stats_ctx, &filemod_data->meta); 932 + unsigned long data_len = payload - (void*)filemod_data; 933 + data_len = data_len > sizeof(*filemod_data) ? sizeof(*filemod_data) : data_len; 934 + bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, filemod_data, data_len); 935 + out: 936 + bpf_stats_exit(&stats_ctx); 937 + return 0; 938 + } 939 + 940 + SEC("raw_tracepoint/sched_process_fork") 941 + int raw_tracepoint__sched_process_fork(struct bpf_raw_tracepoint_args* ctx) 942 + { 943 + struct bpf_func_stats_ctx stats_ctx; 944 + bpf_stats_enter(&stats_ctx, profiler_bpf_sched_process_fork); 945 + 946 + int zero = 0; 947 + struct var_fork_data_t* fork_data = bpf_map_lookup_elem(&data_heap, &zero); 948 + if (!fork_data) 949 + goto out; 950 + 951 + struct task_struct* parent = (struct task_struct*)ctx->args[0]; 952 + struct task_struct* child = (struct task_struct*)ctx->args[1]; 953 + fork_data->meta.type = FORK_EVENT; 954 + 955 + void* payload = populate_var_metadata(&fork_data->meta, child, 956 + BPF_CORE_READ(child, pid), fork_data->payload); 957 + fork_data->parent_pid = BPF_CORE_READ(parent, pid); 958 + fork_data->parent_exec_id = BPF_CORE_READ(parent, self_exec_id); 959 + fork_data->parent_start_time = BPF_CORE_READ(parent, start_time); 960 + bpf_stats_pre_submit_var_perf_event(&stats_ctx, &fork_data->meta); 961 + 962 + unsigned long data_len = payload - (void*)fork_data; 963 + data_len = data_len > sizeof(*fork_data) ? sizeof(*fork_data) : data_len; 964 + bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, fork_data, data_len); 965 + out: 966 + bpf_stats_exit(&stats_ctx); 967 + return 0; 968 + } 969 + char _license[] SEC("license") = "GPL";
+6
tools/testing/selftests/bpf/progs/profiler1.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* Copyright (c) 2020 Facebook */ 3 + #define barrier_var(var) asm volatile("" : "=r"(var) : "0"(var)) 4 + #define UNROLL 5 + #define INLINE __always_inline 6 + #include "profiler.inc.h"
+6
tools/testing/selftests/bpf/progs/profiler2.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* Copyright (c) 2020 Facebook */ 3 + #define barrier_var(var) /**/ 4 + /* undef #define UNROLL */ 5 + #define INLINE /**/ 6 + #include "profiler.inc.h"
+6
tools/testing/selftests/bpf/progs/profiler3.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* Copyright (c) 2020 Facebook */ 3 + #define barrier_var(var) /**/ 4 + #define UNROLL 5 + #define INLINE __noinline 6 + #include "profiler.inc.h"
+43
tools/testing/selftests/bpf/progs/test_btf_map_in_map.c
··· 41 41 .values = { (void *)&inner_map1, 0, (void *)&inner_map2 }, 42 42 }; 43 43 44 + struct inner_map_sz3 { 45 + __uint(type, BPF_MAP_TYPE_ARRAY); 46 + __uint(map_flags, BPF_F_INNER_MAP); 47 + __uint(max_entries, 3); 48 + __type(key, int); 49 + __type(value, int); 50 + } inner_map3 SEC(".maps"), 51 + inner_map4 SEC(".maps"); 52 + 53 + struct inner_map_sz4 { 54 + __uint(type, BPF_MAP_TYPE_ARRAY); 55 + __uint(map_flags, BPF_F_INNER_MAP); 56 + __uint(max_entries, 5); 57 + __type(key, int); 58 + __type(value, int); 59 + } inner_map5 SEC(".maps"); 60 + 61 + struct outer_arr_dyn { 62 + __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS); 63 + __uint(max_entries, 3); 64 + __uint(key_size, sizeof(int)); 65 + __uint(value_size, sizeof(int)); 66 + __array(values, struct { 67 + __uint(type, BPF_MAP_TYPE_ARRAY); 68 + __uint(map_flags, BPF_F_INNER_MAP); 69 + __uint(max_entries, 1); 70 + __type(key, int); 71 + __type(value, int); 72 + }); 73 + } outer_arr_dyn SEC(".maps") = { 74 + .values = { 75 + [0] = (void *)&inner_map3, 76 + [1] = (void *)&inner_map4, 77 + [2] = (void *)&inner_map5, 78 + }, 79 + }; 80 + 44 81 struct outer_hash { 45 82 __uint(type, BPF_MAP_TYPE_HASH_OF_MAPS); 46 83 __uint(max_entries, 5); ··· 136 99 if (!inner_map) 137 100 return 1; 138 101 val = input + 1; 102 + bpf_map_update_elem(inner_map, &key, &val, 0); 103 + 104 + inner_map = bpf_map_lookup_elem(&outer_arr_dyn, &key); 105 + if (!inner_map) 106 + return 1; 107 + val = input + 2; 139 108 bpf_map_update_elem(inner_map, &key, &val, 0); 140 109 141 110 return 0;
+172
tools/testing/selftests/bpf/progs/test_core_autosize.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* Copyright (c) 2020 Facebook */ 3 + 4 + #include <linux/bpf.h> 5 + #include <stdint.h> 6 + #include <bpf/bpf_helpers.h> 7 + #include <bpf/bpf_core_read.h> 8 + 9 + char _license[] SEC("license") = "GPL"; 10 + 11 + /* fields of exactly the same size */ 12 + struct test_struct___samesize { 13 + void *ptr; 14 + unsigned long long val1; 15 + unsigned int val2; 16 + unsigned short val3; 17 + unsigned char val4; 18 + } __attribute((preserve_access_index)); 19 + 20 + /* unsigned fields that have to be downsized by libbpf */ 21 + struct test_struct___downsize { 22 + void *ptr; 23 + unsigned long val1; 24 + unsigned long val2; 25 + unsigned long val3; 26 + unsigned long val4; 27 + /* total sz: 40 */ 28 + } __attribute__((preserve_access_index)); 29 + 30 + /* fields with signed integers of wrong size, should be rejected */ 31 + struct test_struct___signed { 32 + void *ptr; 33 + long val1; 34 + long val2; 35 + long val3; 36 + long val4; 37 + } __attribute((preserve_access_index)); 38 + 39 + /* real layout and sizes according to test's (32-bit) BTF */ 40 + struct test_struct___real { 41 + unsigned int ptr; /* can't use `void *`, it is always 8 byte in BPF target */ 42 + unsigned int val2; 43 + unsigned long long val1; 44 + unsigned short val3; 45 + unsigned char val4; 46 + unsigned char _pad; 47 + /* total sz: 20 */ 48 + }; 49 + 50 + struct test_struct___real input = { 51 + .ptr = 0x01020304, 52 + .val1 = 0x1020304050607080, 53 + .val2 = 0x0a0b0c0d, 54 + .val3 = 0xfeed, 55 + .val4 = 0xb9, 56 + ._pad = 0xff, /* make sure no accidental zeros are present */ 57 + }; 58 + 59 + unsigned long long ptr_samesized = 0; 60 + unsigned long long val1_samesized = 0; 61 + unsigned long long val2_samesized = 0; 62 + unsigned long long val3_samesized = 0; 63 + unsigned long long val4_samesized = 0; 64 + struct test_struct___real output_samesized = {}; 65 + 66 + unsigned long long ptr_downsized = 0; 67 + unsigned long long val1_downsized = 0; 68 
+ unsigned long long val2_downsized = 0; 69 + unsigned long long val3_downsized = 0; 70 + unsigned long long val4_downsized = 0; 71 + struct test_struct___real output_downsized = {}; 72 + 73 + unsigned long long ptr_probed = 0; 74 + unsigned long long val1_probed = 0; 75 + unsigned long long val2_probed = 0; 76 + unsigned long long val3_probed = 0; 77 + unsigned long long val4_probed = 0; 78 + 79 + unsigned long long ptr_signed = 0; 80 + unsigned long long val1_signed = 0; 81 + unsigned long long val2_signed = 0; 82 + unsigned long long val3_signed = 0; 83 + unsigned long long val4_signed = 0; 84 + struct test_struct___real output_signed = {}; 85 + 86 + SEC("raw_tp/sys_exit") 87 + int handle_samesize(void *ctx) 88 + { 89 + struct test_struct___samesize *in = (void *)&input; 90 + struct test_struct___samesize *out = (void *)&output_samesized; 91 + 92 + ptr_samesized = (unsigned long long)in->ptr; 93 + val1_samesized = in->val1; 94 + val2_samesized = in->val2; 95 + val3_samesized = in->val3; 96 + val4_samesized = in->val4; 97 + 98 + out->ptr = in->ptr; 99 + out->val1 = in->val1; 100 + out->val2 = in->val2; 101 + out->val3 = in->val3; 102 + out->val4 = in->val4; 103 + 104 + return 0; 105 + } 106 + 107 + SEC("raw_tp/sys_exit") 108 + int handle_downsize(void *ctx) 109 + { 110 + struct test_struct___downsize *in = (void *)&input; 111 + struct test_struct___downsize *out = (void *)&output_downsized; 112 + 113 + ptr_downsized = (unsigned long long)in->ptr; 114 + val1_downsized = in->val1; 115 + val2_downsized = in->val2; 116 + val3_downsized = in->val3; 117 + val4_downsized = in->val4; 118 + 119 + out->ptr = in->ptr; 120 + out->val1 = in->val1; 121 + out->val2 = in->val2; 122 + out->val3 = in->val3; 123 + out->val4 = in->val4; 124 + 125 + return 0; 126 + } 127 + 128 + SEC("raw_tp/sys_enter") 129 + int handle_probed(void *ctx) 130 + { 131 + struct test_struct___downsize *in = (void *)&input; 132 + __u64 tmp; 133 + 134 + tmp = 0; 135 + bpf_core_read(&tmp, 
bpf_core_field_size(in->ptr), &in->ptr); 136 + ptr_probed = tmp; 137 + 138 + tmp = 0; 139 + bpf_core_read(&tmp, bpf_core_field_size(in->val1), &in->val1); 140 + val1_probed = tmp; 141 + 142 + tmp = 0; 143 + bpf_core_read(&tmp, bpf_core_field_size(in->val2), &in->val2); 144 + val2_probed = tmp; 145 + 146 + tmp = 0; 147 + bpf_core_read(&tmp, bpf_core_field_size(in->val3), &in->val3); 148 + val3_probed = tmp; 149 + 150 + tmp = 0; 151 + bpf_core_read(&tmp, bpf_core_field_size(in->val4), &in->val4); 152 + val4_probed = tmp; 153 + 154 + return 0; 155 + } 156 + 157 + SEC("raw_tp/sys_enter") 158 + int handle_signed(void *ctx) 159 + { 160 + struct test_struct___signed *in = (void *)&input; 161 + struct test_struct___signed *out = (void *)&output_signed; 162 + 163 + val2_signed = in->val2; 164 + val3_signed = in->val3; 165 + val4_signed = in->val4; 166 + 167 + out->val2= in->val2; 168 + out->val3= in->val3; 169 + out->val4= in->val4; 170 + 171 + return 0; 172 + }
+55
tools/testing/selftests/bpf/progs/test_ksyms_btf.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* Copyright (c) 2020 Google */ 3 + 4 + #include "vmlinux.h" 5 + 6 + #include <bpf/bpf_helpers.h> 7 + 8 + __u64 out__runqueues_addr = -1; 9 + __u64 out__bpf_prog_active_addr = -1; 10 + 11 + __u32 out__rq_cpu = -1; /* percpu struct fields */ 12 + int out__bpf_prog_active = -1; /* percpu int */ 13 + 14 + __u32 out__this_rq_cpu = -1; 15 + int out__this_bpf_prog_active = -1; 16 + 17 + __u32 out__cpu_0_rq_cpu = -1; /* cpu_rq(0)->cpu */ 18 + 19 + extern const struct rq runqueues __ksym; /* struct type global var. */ 20 + extern const int bpf_prog_active __ksym; /* int type global var. */ 21 + 22 + SEC("raw_tp/sys_enter") 23 + int handler(const void *ctx) 24 + { 25 + struct rq *rq; 26 + int *active; 27 + __u32 cpu; 28 + 29 + out__runqueues_addr = (__u64)&runqueues; 30 + out__bpf_prog_active_addr = (__u64)&bpf_prog_active; 31 + 32 + cpu = bpf_get_smp_processor_id(); 33 + 34 + /* test bpf_per_cpu_ptr() */ 35 + rq = (struct rq *)bpf_per_cpu_ptr(&runqueues, cpu); 36 + if (rq) 37 + out__rq_cpu = rq->cpu; 38 + active = (int *)bpf_per_cpu_ptr(&bpf_prog_active, cpu); 39 + if (active) 40 + out__bpf_prog_active = *active; 41 + 42 + rq = (struct rq *)bpf_per_cpu_ptr(&runqueues, 0); 43 + if (rq) /* should always be valid, but we can't spare the check. */ 44 + out__cpu_0_rq_cpu = rq->cpu; 45 + 46 + /* test bpf_this_cpu_ptr */ 47 + rq = (struct rq *)bpf_this_cpu_ptr(&runqueues); 48 + out__this_rq_cpu = rq->cpu; 49 + active = (int *)bpf_this_cpu_ptr(&bpf_prog_active); 50 + out__this_bpf_prog_active = *active; 51 + 52 + return 0; 53 + } 54 + 55 + char _license[] SEC("license") = "GPL";
+2 -2
tools/testing/selftests/bpf/progs/test_misc_tcp_hdr_options.c
··· 304 304 passive_lport_n = __bpf_htons(passive_lport_h); 305 305 bpf_setsockopt(skops, SOL_TCP, TCP_SAVE_SYN, 306 306 &true_val, sizeof(true_val)); 307 - set_hdr_cb_flags(skops); 307 + set_hdr_cb_flags(skops, 0); 308 308 break; 309 309 case BPF_SOCK_OPS_TCP_CONNECT_CB: 310 - set_hdr_cb_flags(skops); 310 + set_hdr_cb_flags(skops, 0); 311 311 break; 312 312 case BPF_SOCK_OPS_PARSE_HDR_OPT_CB: 313 313 return handle_parse_hdr(skops);
+25 -9
tools/testing/selftests/bpf/progs/test_sockmap_kern.h
··· 131 131 132 132 } 133 133 134 - SEC("sk_skb3") 135 - int bpf_prog3(struct __sk_buff *skb) 134 + static inline void bpf_write_pass(struct __sk_buff *skb, int offset) 136 135 { 137 - const int one = 1; 138 - int err, *f, ret = SK_PASS; 136 + int err = bpf_skb_pull_data(skb, 6 + offset); 139 137 void *data_end; 140 138 char *c; 141 139 142 - err = bpf_skb_pull_data(skb, 19); 143 140 if (err) 144 - goto tls_out; 141 + return; 145 142 146 143 c = (char *)(long)skb->data; 147 144 data_end = (void *)(long)skb->data_end; 148 145 149 - if (c + 18 < data_end) 150 - memcpy(&c[13], "PASS", 4); 146 + if (c + 5 + offset < data_end) 147 + memcpy(c + offset, "PASS", 4); 148 + } 149 + 150 + SEC("sk_skb3") 151 + int bpf_prog3(struct __sk_buff *skb) 152 + { 153 + int err, *f, ret = SK_PASS; 154 + const int one = 1; 155 + 151 156 f = bpf_map_lookup_elem(&sock_skb_opts, &one); 152 157 if (f && *f) { 153 158 __u64 flags = 0; 154 159 155 160 ret = 0; 156 161 flags = *f; 162 + 163 + err = bpf_skb_adjust_room(skb, -13, 0, 0); 164 + if (err) 165 + return SK_DROP; 166 + err = bpf_skb_adjust_room(skb, 4, 0, 0); 167 + if (err) 168 + return SK_DROP; 169 + bpf_write_pass(skb, 0); 157 170 #ifdef SOCKMAP 158 171 return bpf_sk_redirect_map(skb, &tls_sock_map, ret, flags); 159 172 #else 160 173 return bpf_sk_redirect_hash(skb, &tls_sock_map, &ret, flags); 161 174 #endif 162 175 } 163 - 164 176 f = bpf_map_lookup_elem(&sock_skb_opts, &one); 165 177 if (f && *f) 166 178 ret = SK_DROP; 179 + err = bpf_skb_adjust_room(skb, 4, 0, 0); 180 + if (err) 181 + return SK_DROP; 182 + bpf_write_pass(skb, 13); 167 183 tls_out: 168 184 return ret; 169 185 }
+22 -18
tools/testing/selftests/bpf/progs/test_tc_neigh.c
··· 13 13 #include <bpf/bpf_helpers.h> 14 14 #include <bpf/bpf_endian.h> 15 15 16 - #ifndef barrier_data 17 - # define barrier_data(ptr) asm volatile("": :"r"(ptr) :"memory") 18 - #endif 19 - 20 16 #ifndef ctx_ptr 21 17 # define ctx_ptr(field) (void *)(long)(field) 22 18 #endif 23 - 24 - #define dst_to_src_tmp 0xeeddddeeU 25 - #define src_to_dst_tmp 0xeeffffeeU 26 19 27 20 #define ip4_src 0xac100164 /* 172.16.1.100 */ 28 21 #define ip4_dst 0xac100264 /* 172.16.2.100 */ ··· 31 38 a.s6_addr32[2] == b.s6_addr32[2] && \ 32 39 a.s6_addr32[3] == b.s6_addr32[3]) 33 40 #endif 41 + 42 + enum { 43 + dev_src, 44 + dev_dst, 45 + }; 46 + 47 + struct bpf_map_def SEC("maps") ifindex_map = { 48 + .type = BPF_MAP_TYPE_ARRAY, 49 + .key_size = sizeof(int), 50 + .value_size = sizeof(int), 51 + .max_entries = 2, 52 + }; 34 53 35 54 static __always_inline bool is_remote_ep_v4(struct __sk_buff *skb, 36 55 __be32 addr) ··· 78 73 return v6_equal(ip6h->daddr, addr); 79 74 } 80 75 81 - SEC("chk_neigh") int tc_chk(struct __sk_buff *skb) 76 + static __always_inline int get_dev_ifindex(int which) 77 + { 78 + int *ifindex = bpf_map_lookup_elem(&ifindex_map, &which); 79 + 80 + return ifindex ? 
*ifindex : 0; 81 + } 82 + 83 + SEC("chk_egress") int tc_chk(struct __sk_buff *skb) 82 84 { 83 85 void *data_end = ctx_ptr(skb->data_end); 84 86 void *data = ctx_ptr(skb->data); ··· 99 87 100 88 SEC("dst_ingress") int tc_dst(struct __sk_buff *skb) 101 89 { 102 - int idx = dst_to_src_tmp; 103 90 __u8 zero[ETH_ALEN * 2]; 104 91 bool redirect = false; 105 92 ··· 114 103 if (!redirect) 115 104 return TC_ACT_OK; 116 105 117 - barrier_data(&idx); 118 - idx = bpf_ntohl(idx); 119 - 120 106 __builtin_memset(&zero, 0, sizeof(zero)); 121 107 if (bpf_skb_store_bytes(skb, 0, &zero, sizeof(zero), 0) < 0) 122 108 return TC_ACT_SHOT; 123 109 124 - return bpf_redirect_neigh(idx, 0); 110 + return bpf_redirect_neigh(get_dev_ifindex(dev_src), 0); 125 111 } 126 112 127 113 SEC("src_ingress") int tc_src(struct __sk_buff *skb) 128 114 { 129 - int idx = src_to_dst_tmp; 130 115 __u8 zero[ETH_ALEN * 2]; 131 116 bool redirect = false; 132 117 ··· 138 131 if (!redirect) 139 132 return TC_ACT_OK; 140 133 141 - barrier_data(&idx); 142 - idx = bpf_ntohl(idx); 143 - 144 134 __builtin_memset(&zero, 0, sizeof(zero)); 145 135 if (bpf_skb_store_bytes(skb, 0, &zero, sizeof(zero), 0) < 0) 146 136 return TC_ACT_SHOT; 147 137 148 - return bpf_redirect_neigh(idx, 0); 138 + return bpf_redirect_neigh(get_dev_ifindex(dev_dst), 0); 149 139 } 150 140 151 141 char __license[] SEC("license") = "GPL";
+45
tools/testing/selftests/bpf/progs/test_tc_peer.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + #include <stdint.h> 3 + #include <stdbool.h> 4 + 5 + #include <linux/bpf.h> 6 + #include <linux/stddef.h> 7 + #include <linux/pkt_cls.h> 8 + 9 + #include <bpf/bpf_helpers.h> 10 + 11 + enum { 12 + dev_src, 13 + dev_dst, 14 + }; 15 + 16 + struct bpf_map_def SEC("maps") ifindex_map = { 17 + .type = BPF_MAP_TYPE_ARRAY, 18 + .key_size = sizeof(int), 19 + .value_size = sizeof(int), 20 + .max_entries = 2, 21 + }; 22 + 23 + static __always_inline int get_dev_ifindex(int which) 24 + { 25 + int *ifindex = bpf_map_lookup_elem(&ifindex_map, &which); 26 + 27 + return ifindex ? *ifindex : 0; 28 + } 29 + 30 + SEC("chk_egress") int tc_chk(struct __sk_buff *skb) 31 + { 32 + return TC_ACT_SHOT; 33 + } 34 + 35 + SEC("dst_ingress") int tc_dst(struct __sk_buff *skb) 36 + { 37 + return bpf_redirect_peer(get_dev_ifindex(dev_src), 0); 38 + } 39 + 40 + SEC("src_ingress") int tc_src(struct __sk_buff *skb) 41 + { 42 + return bpf_redirect_peer(get_dev_ifindex(dev_dst), 0); 43 + } 44 + 45 + char __license[] SEC("license") = "GPL";
+5 -2
tools/testing/selftests/bpf/progs/test_tcp_hdr_options.c
··· 21 21 22 22 __u8 test_kind = TCPOPT_EXP; 23 23 __u16 test_magic = 0xeB9F; 24 + __u32 inherit_cb_flags = 0; 24 25 25 26 struct bpf_test_option passive_synack_out = {}; 26 27 struct bpf_test_option passive_fin_out = {}; ··· 468 467 struct tcphdr *th; 469 468 int err; 470 469 470 + inherit_cb_flags = skops->bpf_sock_ops_cb_flags; 471 + 471 472 err = load_option(skops, &passive_estab_in, true); 472 473 if (err == -ENOENT) { 473 474 /* saved_syn is not found. It was in syncookie mode. ··· 603 600 case BPF_SOCK_OPS_TCP_LISTEN_CB: 604 601 bpf_setsockopt(skops, SOL_TCP, TCP_SAVE_SYN, 605 602 &true_val, sizeof(true_val)); 606 - set_hdr_cb_flags(skops); 603 + set_hdr_cb_flags(skops, BPF_SOCK_OPS_STATE_CB_FLAG); 607 604 break; 608 605 case BPF_SOCK_OPS_TCP_CONNECT_CB: 609 - set_hdr_cb_flags(skops); 606 + set_hdr_cb_flags(skops, 0); 610 607 break; 611 608 case BPF_SOCK_OPS_PARSE_HDR_OPT_CB: 612 609 return handle_parse_hdr(skops);
+45 -36
tools/testing/selftests/bpf/test_sockmap.c
··· 86 86 int ktls; 87 87 int peek_flag; 88 88 int skb_use_parser; 89 + int txmsg_omit_skb_parser; 89 90 90 91 static const struct option long_options[] = { 91 92 {"help", no_argument, NULL, 'h' }, ··· 112 111 {"txmsg_redir_skb", no_argument, &txmsg_redir_skb, 1 }, 113 112 {"ktls", no_argument, &ktls, 1 }, 114 113 {"peek", no_argument, &peek_flag, 1 }, 114 + {"txmsg_omit_skb_parser", no_argument, &txmsg_omit_skb_parser, 1}, 115 115 {"whitelist", required_argument, NULL, 'n' }, 116 116 {"blacklist", required_argument, NULL, 'b' }, 117 117 {0, 0, NULL, 0 } ··· 177 175 txmsg_apply = txmsg_cork = 0; 178 176 txmsg_ingress = txmsg_redir_skb = 0; 179 177 txmsg_ktls_skb = txmsg_ktls_skb_drop = txmsg_ktls_skb_redir = 0; 178 + txmsg_omit_skb_parser = 0; 180 179 skb_use_parser = 0; 181 180 } 182 181 ··· 521 518 if (i == 0 && txmsg_ktls_skb) { 522 519 if (msg->msg_iov[i].iov_len < 4) 523 520 return -EIO; 524 - if (txmsg_ktls_skb_redir) { 525 - if (memcmp(&d[13], "PASS", 4) != 0) { 526 - fprintf(stderr, 527 - "detected redirect ktls_skb data error with skb ingress update @iov[%i]:%i \"%02x %02x %02x %02x\" != \"PASS\"\n", i, 0, d[13], d[14], d[15], d[16]); 528 - return -EIO; 529 - } 530 - d[13] = 0; 531 - d[14] = 1; 532 - d[15] = 2; 533 - d[16] = 3; 534 - j = 13; 535 - } else if (txmsg_ktls_skb) { 536 - if (memcmp(d, "PASS", 4) != 0) { 537 - fprintf(stderr, 538 - "detected ktls_skb data error with skb ingress update @iov[%i]:%i \"%02x %02x %02x %02x\" != \"PASS\"\n", i, 0, d[0], d[1], d[2], d[3]); 539 - return -EIO; 540 - } 541 - d[0] = 0; 542 - d[1] = 1; 543 - d[2] = 2; 544 - d[3] = 3; 521 + if (memcmp(d, "PASS", 4) != 0) { 522 + fprintf(stderr, 523 + "detected skb data error with skb ingress update @iov[%i]:%i \"%02x %02x %02x %02x\" != \"PASS\"\n", 524 + i, 0, d[0], d[1], d[2], d[3]); 525 + return -EIO; 545 526 } 527 + j = 4; /* advance index past PASS header */ 546 528 } 547 529 548 530 for (; j < msg->msg_iov[i].iov_len && size; j++) { ··· 915 927 goto run; 916 928 917 929 
/* Attach programs to sockmap */ 918 - err = bpf_prog_attach(prog_fd[0], map_fd[0], 919 - BPF_SK_SKB_STREAM_PARSER, 0); 920 - if (err) { 921 - fprintf(stderr, 922 - "ERROR: bpf_prog_attach (sockmap %i->%i): %d (%s)\n", 923 - prog_fd[0], map_fd[0], err, strerror(errno)); 924 - return err; 930 + if (!txmsg_omit_skb_parser) { 931 + err = bpf_prog_attach(prog_fd[0], map_fd[0], 932 + BPF_SK_SKB_STREAM_PARSER, 0); 933 + if (err) { 934 + fprintf(stderr, 935 + "ERROR: bpf_prog_attach (sockmap %i->%i): %d (%s)\n", 936 + prog_fd[0], map_fd[0], err, strerror(errno)); 937 + return err; 938 + } 925 939 } 926 940 927 941 err = bpf_prog_attach(prog_fd[1], map_fd[0], ··· 936 946 937 947 /* Attach programs to TLS sockmap */ 938 948 if (txmsg_ktls_skb) { 939 - err = bpf_prog_attach(prog_fd[0], map_fd[8], 940 - BPF_SK_SKB_STREAM_PARSER, 0); 941 - if (err) { 942 - fprintf(stderr, 943 - "ERROR: bpf_prog_attach (TLS sockmap %i->%i): %d (%s)\n", 944 - prog_fd[0], map_fd[8], err, strerror(errno)); 945 - return err; 949 + if (!txmsg_omit_skb_parser) { 950 + err = bpf_prog_attach(prog_fd[0], map_fd[8], 951 + BPF_SK_SKB_STREAM_PARSER, 0); 952 + if (err) { 953 + fprintf(stderr, 954 + "ERROR: bpf_prog_attach (TLS sockmap %i->%i): %d (%s)\n", 955 + prog_fd[0], map_fd[8], err, strerror(errno)); 956 + return err; 957 + } 946 958 } 947 959 948 960 err = bpf_prog_attach(prog_fd[2], map_fd[8], ··· 1472 1480 txmsg_ktls_skb_drop = 0; 1473 1481 txmsg_ktls_skb_redir = 1; 1474 1482 test_exec(cgrp, opt); 1483 + txmsg_ktls_skb_redir = 0; 1484 + 1485 + /* Tests that omit skb_parser */ 1486 + txmsg_omit_skb_parser = 1; 1487 + ktls = 0; 1488 + txmsg_ktls_skb = 0; 1489 + test_exec(cgrp, opt); 1490 + 1491 + txmsg_ktls_skb_drop = 1; 1492 + test_exec(cgrp, opt); 1493 + txmsg_ktls_skb_drop = 0; 1494 + 1495 + txmsg_ktls_skb_redir = 1; 1496 + test_exec(cgrp, opt); 1497 + 1498 + ktls = 1; 1499 + test_exec(cgrp, opt); 1500 + txmsg_omit_skb_parser = 0; 1475 1501 1476 1502 opt->data_test = data; 1477 1503 ktls = k; 1478 
1504 } 1479 - 1480 1505 1481 1506 /* Test cork with hung data. This tests poor usage patterns where 1482 1507 * cork can leave data on the ring if user program is buggy and
-168
tools/testing/selftests/bpf/test_tc_neigh.sh
··· 1 - #!/bin/bash 2 - # SPDX-License-Identifier: GPL-2.0 3 - # 4 - # This test sets up 3 netns (src <-> fwd <-> dst). There is no direct veth link 5 - # between src and dst. The netns fwd has veth links to each src and dst. The 6 - # client is in src and server in dst. The test installs a TC BPF program to each 7 - # host facing veth in fwd which calls into bpf_redirect_peer() to perform the 8 - # neigh addr population and redirect; it also installs a dropper prog on the 9 - # egress side to drop skbs if neigh addrs were not populated. 10 - 11 - if [[ $EUID -ne 0 ]]; then 12 - echo "This script must be run as root" 13 - echo "FAIL" 14 - exit 1 15 - fi 16 - 17 - # check that nc, dd, ping, ping6 and timeout are present 18 - command -v nc >/dev/null 2>&1 || \ 19 - { echo >&2 "nc is not available"; exit 1; } 20 - command -v dd >/dev/null 2>&1 || \ 21 - { echo >&2 "dd is not available"; exit 1; } 22 - command -v timeout >/dev/null 2>&1 || \ 23 - { echo >&2 "timeout is not available"; exit 1; } 24 - command -v ping >/dev/null 2>&1 || \ 25 - { echo >&2 "ping is not available"; exit 1; } 26 - command -v ping6 >/dev/null 2>&1 || \ 27 - { echo >&2 "ping6 is not available"; exit 1; } 28 - 29 - readonly GREEN='\033[0;92m' 30 - readonly RED='\033[0;31m' 31 - readonly NC='\033[0m' # No Color 32 - 33 - readonly PING_ARG="-c 3 -w 10 -q" 34 - 35 - readonly TIMEOUT=10 36 - 37 - readonly NS_SRC="ns-src-$(mktemp -u XXXXXX)" 38 - readonly NS_FWD="ns-fwd-$(mktemp -u XXXXXX)" 39 - readonly NS_DST="ns-dst-$(mktemp -u XXXXXX)" 40 - 41 - readonly IP4_SRC="172.16.1.100" 42 - readonly IP4_DST="172.16.2.100" 43 - 44 - readonly IP6_SRC="::1:dead:beef:cafe" 45 - readonly IP6_DST="::2:dead:beef:cafe" 46 - 47 - readonly IP4_SLL="169.254.0.1" 48 - readonly IP4_DLL="169.254.0.2" 49 - readonly IP4_NET="169.254.0.0" 50 - 51 - cleanup() 52 - { 53 - ip netns del ${NS_SRC} 54 - ip netns del ${NS_FWD} 55 - ip netns del ${NS_DST} 56 - } 57 - 58 - trap cleanup EXIT 59 - 60 - set -e 61 - 62 - ip netns add 
"${NS_SRC}" 63 - ip netns add "${NS_FWD}" 64 - ip netns add "${NS_DST}" 65 - 66 - ip link add veth_src type veth peer name veth_src_fwd 67 - ip link add veth_dst type veth peer name veth_dst_fwd 68 - 69 - ip link set veth_src netns ${NS_SRC} 70 - ip link set veth_src_fwd netns ${NS_FWD} 71 - 72 - ip link set veth_dst netns ${NS_DST} 73 - ip link set veth_dst_fwd netns ${NS_FWD} 74 - 75 - ip -netns ${NS_SRC} addr add ${IP4_SRC}/32 dev veth_src 76 - ip -netns ${NS_DST} addr add ${IP4_DST}/32 dev veth_dst 77 - 78 - # The fwd netns automatically get a v6 LL address / routes, but also needs v4 79 - # one in order to start ARP probing. IP4_NET route is added to the endpoints 80 - # so that the ARP processing will reply. 81 - 82 - ip -netns ${NS_FWD} addr add ${IP4_SLL}/32 dev veth_src_fwd 83 - ip -netns ${NS_FWD} addr add ${IP4_DLL}/32 dev veth_dst_fwd 84 - 85 - ip -netns ${NS_SRC} addr add ${IP6_SRC}/128 dev veth_src nodad 86 - ip -netns ${NS_DST} addr add ${IP6_DST}/128 dev veth_dst nodad 87 - 88 - ip -netns ${NS_SRC} link set dev veth_src up 89 - ip -netns ${NS_FWD} link set dev veth_src_fwd up 90 - 91 - ip -netns ${NS_DST} link set dev veth_dst up 92 - ip -netns ${NS_FWD} link set dev veth_dst_fwd up 93 - 94 - ip -netns ${NS_SRC} route add ${IP4_DST}/32 dev veth_src scope global 95 - ip -netns ${NS_SRC} route add ${IP4_NET}/16 dev veth_src scope global 96 - ip -netns ${NS_FWD} route add ${IP4_SRC}/32 dev veth_src_fwd scope global 97 - 98 - ip -netns ${NS_SRC} route add ${IP6_DST}/128 dev veth_src scope global 99 - ip -netns ${NS_FWD} route add ${IP6_SRC}/128 dev veth_src_fwd scope global 100 - 101 - ip -netns ${NS_DST} route add ${IP4_SRC}/32 dev veth_dst scope global 102 - ip -netns ${NS_DST} route add ${IP4_NET}/16 dev veth_dst scope global 103 - ip -netns ${NS_FWD} route add ${IP4_DST}/32 dev veth_dst_fwd scope global 104 - 105 - ip -netns ${NS_DST} route add ${IP6_SRC}/128 dev veth_dst scope global 106 - ip -netns ${NS_FWD} route add ${IP6_DST}/128 dev 
veth_dst_fwd scope global 107 - 108 - fmac_src=$(ip netns exec ${NS_FWD} cat /sys/class/net/veth_src_fwd/address) 109 - fmac_dst=$(ip netns exec ${NS_FWD} cat /sys/class/net/veth_dst_fwd/address) 110 - 111 - ip -netns ${NS_SRC} neigh add ${IP4_DST} dev veth_src lladdr $fmac_src 112 - ip -netns ${NS_DST} neigh add ${IP4_SRC} dev veth_dst lladdr $fmac_dst 113 - 114 - ip -netns ${NS_SRC} neigh add ${IP6_DST} dev veth_src lladdr $fmac_src 115 - ip -netns ${NS_DST} neigh add ${IP6_SRC} dev veth_dst lladdr $fmac_dst 116 - 117 - veth_dst=$(ip netns exec ${NS_FWD} cat /sys/class/net/veth_dst_fwd/ifindex | awk '{printf "%08x\n", $1}') 118 - veth_src=$(ip netns exec ${NS_FWD} cat /sys/class/net/veth_src_fwd/ifindex | awk '{printf "%08x\n", $1}') 119 - 120 - xxd -p < test_tc_neigh.o | sed "s/eeddddee/$veth_src/g" | xxd -r -p > test_tc_neigh.x.o 121 - xxd -p < test_tc_neigh.x.o | sed "s/eeffffee/$veth_dst/g" | xxd -r -p > test_tc_neigh.y.o 122 - 123 - ip netns exec ${NS_FWD} tc qdisc add dev veth_src_fwd clsact 124 - ip netns exec ${NS_FWD} tc filter add dev veth_src_fwd ingress bpf da obj test_tc_neigh.y.o sec src_ingress 125 - ip netns exec ${NS_FWD} tc filter add dev veth_src_fwd egress bpf da obj test_tc_neigh.y.o sec chk_neigh 126 - 127 - ip netns exec ${NS_FWD} tc qdisc add dev veth_dst_fwd clsact 128 - ip netns exec ${NS_FWD} tc filter add dev veth_dst_fwd ingress bpf da obj test_tc_neigh.y.o sec dst_ingress 129 - ip netns exec ${NS_FWD} tc filter add dev veth_dst_fwd egress bpf da obj test_tc_neigh.y.o sec chk_neigh 130 - 131 - rm -f test_tc_neigh.x.o test_tc_neigh.y.o 132 - 133 - ip netns exec ${NS_DST} bash -c "nc -4 -l -p 9004 &" 134 - ip netns exec ${NS_DST} bash -c "nc -6 -l -p 9006 &" 135 - 136 - set +e 137 - 138 - TEST="TCPv4 connectivity test" 139 - ip netns exec ${NS_SRC} bash -c "timeout ${TIMEOUT} dd if=/dev/zero bs=1000 count=100 > /dev/tcp/${IP4_DST}/9004" 140 - if [ $? 
-ne 0 ]; then 141 - echo -e "${TEST}: ${RED}FAIL${NC}" 142 - exit 1 143 - fi 144 - echo -e "${TEST}: ${GREEN}PASS${NC}" 145 - 146 - TEST="TCPv6 connectivity test" 147 - ip netns exec ${NS_SRC} bash -c "timeout ${TIMEOUT} dd if=/dev/zero bs=1000 count=100 > /dev/tcp/${IP6_DST}/9006" 148 - if [ $? -ne 0 ]; then 149 - echo -e "${TEST}: ${RED}FAIL${NC}" 150 - exit 1 151 - fi 152 - echo -e "${TEST}: ${GREEN}PASS${NC}" 153 - 154 - TEST="ICMPv4 connectivity test" 155 - ip netns exec ${NS_SRC} ping $PING_ARG ${IP4_DST} 156 - if [ $? -ne 0 ]; then 157 - echo -e "${TEST}: ${RED}FAIL${NC}" 158 - exit 1 159 - fi 160 - echo -e "${TEST}: ${GREEN}PASS${NC}" 161 - 162 - TEST="ICMPv6 connectivity test" 163 - ip netns exec ${NS_SRC} ping6 $PING_ARG ${IP6_DST} 164 - if [ $? -ne 0 ]; then 165 - echo -e "${TEST}: ${RED}FAIL${NC}" 166 - exit 1 167 - fi 168 - echo -e "${TEST}: ${GREEN}PASS${NC}"
+204
tools/testing/selftests/bpf/test_tc_redirect.sh
··· 1 + #!/bin/bash 2 + # SPDX-License-Identifier: GPL-2.0 3 + # 4 + # This test sets up 3 netns (src <-> fwd <-> dst). There is no direct veth link 5 + # between src and dst. The netns fwd has veth links to each src and dst. The 6 + # client is in src and server in dst. The test installs a TC BPF program to each 7 + # host facing veth in fwd which calls into i) bpf_redirect_neigh() to perform the 8 + # neigh addr population and redirect or ii) bpf_redirect_peer() for namespace 9 + # switch from ingress side; it also installs a checker prog on the egress side 10 + # to drop unexpected traffic. 11 + 12 + if [[ $EUID -ne 0 ]]; then 13 + echo "This script must be run as root" 14 + echo "FAIL" 15 + exit 1 16 + fi 17 + 18 + # check that needed tools are present 19 + command -v nc >/dev/null 2>&1 || \ 20 + { echo >&2 "nc is not available"; exit 1; } 21 + command -v dd >/dev/null 2>&1 || \ 22 + { echo >&2 "dd is not available"; exit 1; } 23 + command -v timeout >/dev/null 2>&1 || \ 24 + { echo >&2 "timeout is not available"; exit 1; } 25 + command -v ping >/dev/null 2>&1 || \ 26 + { echo >&2 "ping is not available"; exit 1; } 27 + command -v ping6 >/dev/null 2>&1 || \ 28 + { echo >&2 "ping6 is not available"; exit 1; } 29 + command -v perl >/dev/null 2>&1 || \ 30 + { echo >&2 "perl is not available"; exit 1; } 31 + command -v jq >/dev/null 2>&1 || \ 32 + { echo >&2 "jq is not available"; exit 1; } 33 + command -v bpftool >/dev/null 2>&1 || \ 34 + { echo >&2 "bpftool is not available"; exit 1; } 35 + 36 + readonly GREEN='\033[0;92m' 37 + readonly RED='\033[0;31m' 38 + readonly NC='\033[0m' # No Color 39 + 40 + readonly PING_ARG="-c 3 -w 10 -q" 41 + 42 + readonly TIMEOUT=10 43 + 44 + readonly NS_SRC="ns-src-$(mktemp -u XXXXXX)" 45 + readonly NS_FWD="ns-fwd-$(mktemp -u XXXXXX)" 46 + readonly NS_DST="ns-dst-$(mktemp -u XXXXXX)" 47 + 48 + readonly IP4_SRC="172.16.1.100" 49 + readonly IP4_DST="172.16.2.100" 50 + 51 + readonly IP6_SRC="::1:dead:beef:cafe" 52 + readonly 
IP6_DST="::2:dead:beef:cafe" 53 + 54 + readonly IP4_SLL="169.254.0.1" 55 + readonly IP4_DLL="169.254.0.2" 56 + readonly IP4_NET="169.254.0.0" 57 + 58 + netns_cleanup() 59 + { 60 + ip netns del ${NS_SRC} 61 + ip netns del ${NS_FWD} 62 + ip netns del ${NS_DST} 63 + } 64 + 65 + netns_setup() 66 + { 67 + ip netns add "${NS_SRC}" 68 + ip netns add "${NS_FWD}" 69 + ip netns add "${NS_DST}" 70 + 71 + ip link add veth_src type veth peer name veth_src_fwd 72 + ip link add veth_dst type veth peer name veth_dst_fwd 73 + 74 + ip link set veth_src netns ${NS_SRC} 75 + ip link set veth_src_fwd netns ${NS_FWD} 76 + 77 + ip link set veth_dst netns ${NS_DST} 78 + ip link set veth_dst_fwd netns ${NS_FWD} 79 + 80 + ip -netns ${NS_SRC} addr add ${IP4_SRC}/32 dev veth_src 81 + ip -netns ${NS_DST} addr add ${IP4_DST}/32 dev veth_dst 82 + 83 + # The fwd netns automatically get a v6 LL address / routes, but also 84 + # needs v4 one in order to start ARP probing. IP4_NET route is added 85 + # to the endpoints so that the ARP processing will reply. 
86 + 87 + ip -netns ${NS_FWD} addr add ${IP4_SLL}/32 dev veth_src_fwd 88 + ip -netns ${NS_FWD} addr add ${IP4_DLL}/32 dev veth_dst_fwd 89 + 90 + ip -netns ${NS_SRC} addr add ${IP6_SRC}/128 dev veth_src nodad 91 + ip -netns ${NS_DST} addr add ${IP6_DST}/128 dev veth_dst nodad 92 + 93 + ip -netns ${NS_SRC} link set dev veth_src up 94 + ip -netns ${NS_FWD} link set dev veth_src_fwd up 95 + 96 + ip -netns ${NS_DST} link set dev veth_dst up 97 + ip -netns ${NS_FWD} link set dev veth_dst_fwd up 98 + 99 + ip -netns ${NS_SRC} route add ${IP4_DST}/32 dev veth_src scope global 100 + ip -netns ${NS_SRC} route add ${IP4_NET}/16 dev veth_src scope global 101 + ip -netns ${NS_FWD} route add ${IP4_SRC}/32 dev veth_src_fwd scope global 102 + 103 + ip -netns ${NS_SRC} route add ${IP6_DST}/128 dev veth_src scope global 104 + ip -netns ${NS_FWD} route add ${IP6_SRC}/128 dev veth_src_fwd scope global 105 + 106 + ip -netns ${NS_DST} route add ${IP4_SRC}/32 dev veth_dst scope global 107 + ip -netns ${NS_DST} route add ${IP4_NET}/16 dev veth_dst scope global 108 + ip -netns ${NS_FWD} route add ${IP4_DST}/32 dev veth_dst_fwd scope global 109 + 110 + ip -netns ${NS_DST} route add ${IP6_SRC}/128 dev veth_dst scope global 111 + ip -netns ${NS_FWD} route add ${IP6_DST}/128 dev veth_dst_fwd scope global 112 + 113 + fmac_src=$(ip netns exec ${NS_FWD} cat /sys/class/net/veth_src_fwd/address) 114 + fmac_dst=$(ip netns exec ${NS_FWD} cat /sys/class/net/veth_dst_fwd/address) 115 + 116 + ip -netns ${NS_SRC} neigh add ${IP4_DST} dev veth_src lladdr $fmac_src 117 + ip -netns ${NS_DST} neigh add ${IP4_SRC} dev veth_dst lladdr $fmac_dst 118 + 119 + ip -netns ${NS_SRC} neigh add ${IP6_DST} dev veth_src lladdr $fmac_src 120 + ip -netns ${NS_DST} neigh add ${IP6_SRC} dev veth_dst lladdr $fmac_dst 121 + } 122 + 123 + netns_test_connectivity() 124 + { 125 + set +e 126 + 127 + ip netns exec ${NS_DST} bash -c "nc -4 -l -p 9004 &" 128 + ip netns exec ${NS_DST} bash -c "nc -6 -l -p 9006 &" 129 + 130 + 
TEST="TCPv4 connectivity test" 131 + ip netns exec ${NS_SRC} bash -c "timeout ${TIMEOUT} dd if=/dev/zero bs=1000 count=100 > /dev/tcp/${IP4_DST}/9004" 132 + if [ $? -ne 0 ]; then 133 + echo -e "${TEST}: ${RED}FAIL${NC}" 134 + exit 1 135 + fi 136 + echo -e "${TEST}: ${GREEN}PASS${NC}" 137 + 138 + TEST="TCPv6 connectivity test" 139 + ip netns exec ${NS_SRC} bash -c "timeout ${TIMEOUT} dd if=/dev/zero bs=1000 count=100 > /dev/tcp/${IP6_DST}/9006" 140 + if [ $? -ne 0 ]; then 141 + echo -e "${TEST}: ${RED}FAIL${NC}" 142 + exit 1 143 + fi 144 + echo -e "${TEST}: ${GREEN}PASS${NC}" 145 + 146 + TEST="ICMPv4 connectivity test" 147 + ip netns exec ${NS_SRC} ping $PING_ARG ${IP4_DST} 148 + if [ $? -ne 0 ]; then 149 + echo -e "${TEST}: ${RED}FAIL${NC}" 150 + exit 1 151 + fi 152 + echo -e "${TEST}: ${GREEN}PASS${NC}" 153 + 154 + TEST="ICMPv6 connectivity test" 155 + ip netns exec ${NS_SRC} ping6 $PING_ARG ${IP6_DST} 156 + if [ $? -ne 0 ]; then 157 + echo -e "${TEST}: ${RED}FAIL${NC}" 158 + exit 1 159 + fi 160 + echo -e "${TEST}: ${GREEN}PASS${NC}" 161 + 162 + set -e 163 + } 164 + 165 + hex_mem_str() 166 + { 167 + perl -e 'print join(" ", unpack("(H2)8", pack("L", @ARGV)))' $1 168 + } 169 + 170 + netns_setup_bpf() 171 + { 172 + local obj=$1 173 + 174 + ip netns exec ${NS_FWD} tc qdisc add dev veth_src_fwd clsact 175 + ip netns exec ${NS_FWD} tc filter add dev veth_src_fwd ingress bpf da obj $obj sec src_ingress 176 + ip netns exec ${NS_FWD} tc filter add dev veth_src_fwd egress bpf da obj $obj sec chk_egress 177 + 178 + ip netns exec ${NS_FWD} tc qdisc add dev veth_dst_fwd clsact 179 + ip netns exec ${NS_FWD} tc filter add dev veth_dst_fwd ingress bpf da obj $obj sec dst_ingress 180 + ip netns exec ${NS_FWD} tc filter add dev veth_dst_fwd egress bpf da obj $obj sec chk_egress 181 + 182 + veth_src=$(ip netns exec ${NS_FWD} cat /sys/class/net/veth_src_fwd/ifindex) 183 + veth_dst=$(ip netns exec ${NS_FWD} cat /sys/class/net/veth_dst_fwd/ifindex) 184 + 185 + progs=$(ip netns exec 
${NS_FWD} bpftool net --json | jq -r '.[] | .tc | map(.id) | .[]') 186 + for prog in $progs; do 187 + map=$(bpftool prog show id $prog --json | jq -r '.map_ids | .? | .[]') 188 + if [ ! -z "$map" ]; then 189 + bpftool map update id $map key hex $(hex_mem_str 0) value hex $(hex_mem_str $veth_src) 190 + bpftool map update id $map key hex $(hex_mem_str 1) value hex $(hex_mem_str $veth_dst) 191 + fi 192 + done 193 + } 194 + 195 + trap netns_cleanup EXIT 196 + set -e 197 + 198 + netns_setup 199 + netns_setup_bpf test_tc_neigh.o 200 + netns_test_connectivity 201 + netns_cleanup 202 + netns_setup 203 + netns_setup_bpf test_tc_peer.o 204 + netns_test_connectivity
+3 -2
tools/testing/selftests/bpf/test_tcp_hdr_options.h
··· 110 110 BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG)); 111 111 } 112 112 113 - static inline void set_hdr_cb_flags(struct bpf_sock_ops *skops) 113 + static inline void set_hdr_cb_flags(struct bpf_sock_ops *skops, __u32 extra) 114 114 { 115 115 bpf_sock_ops_cb_flags_set(skops, 116 116 skops->bpf_sock_ops_cb_flags | 117 117 BPF_SOCK_OPS_PARSE_UNKNOWN_HDR_OPT_CB_FLAG | 118 - BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG); 118 + BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG | 119 + extra); 119 120 } 120 121 static inline void 121 122 clear_parse_all_hdr_cb_flags(struct bpf_sock_ops *skops)
+27
tools/testing/selftests/bpf/trace_helpers.c
··· 90 90 return 0; 91 91 } 92 92 93 + /* open kallsyms and read symbol addresses on the fly. Without caching all symbols, 94 + * this is faster than load + find. 95 + */ 96 + int kallsyms_find(const char *sym, unsigned long long *addr) 97 + { 98 + char type, name[500]; 99 + unsigned long long value; 100 + int err = 0; 101 + FILE *f; 102 + 103 + f = fopen("/proc/kallsyms", "r"); 104 + if (!f) 105 + return -EINVAL; 106 + 107 + while (fscanf(f, "%llx %c %499s%*[^\n]\n", &value, &type, name) > 0) { 108 + if (strcmp(name, sym) == 0) { 109 + *addr = value; 110 + goto out; 111 + } 112 + } 113 + err = -ENOENT; 114 + 115 + out: 116 + fclose(f); 117 + return err; 118 + } 119 + 93 120 void read_trace_pipe(void) 94 121 { 95 122 int trace_fd;
+4
tools/testing/selftests/bpf/trace_helpers.h
··· 12 12 int load_kallsyms(void); 13 13 struct ksym *ksym_search(long key); 14 14 long ksym_get_addr(const char *name); 15 + 16 + /* open kallsyms and find addresses on the fly, faster than load + search. */ 17 + int kallsyms_find(const char *sym, unsigned long long *addr); 18 + 15 19 void read_trace_pipe(void); 16 20 17 21 #endif
+1 -1
tools/testing/selftests/bpf/verifier/basic.c
··· 2 2 "empty prog", 3 3 .insns = { 4 4 }, 5 - .errstr = "unknown opcode 00", 5 + .errstr = "last insn is not an exit or jmp", 6 6 .result = REJECT, 7 7 }, 8 8 {
+1 -1
tools/testing/selftests/bpf/verifier/direct_packet_access.c
··· 529 529 }, 530 530 .prog_type = BPF_PROG_TYPE_SCHED_CLS, 531 531 .result = REJECT, 532 - .errstr = "invalid access to packet, off=0 size=8, R5(id=1,off=0,r=0)", 532 + .errstr = "invalid access to packet, off=0 size=8, R5(id=2,off=0,r=0)", 533 533 .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, 534 534 }, 535 535 {
-8
tools/testing/selftests/bpf/verifier/ld_imm64.c
··· 51 51 .result = REJECT, 52 52 }, 53 53 { 54 - "test5 ld_imm64", 55 - .insns = { 56 - BPF_RAW_INSN(BPF_LD | BPF_IMM | BPF_DW, 0, 0, 0, 0), 57 - }, 58 - .errstr = "invalid bpf_ld_imm64 insn", 59 - .result = REJECT, 60 - }, 61 - { 62 54 "test6 ld_imm64", 63 55 .insns = { 64 56 BPF_RAW_INSN(BPF_LD | BPF_IMM | BPF_DW, 0, 0, 0, 0),
+243
tools/testing/selftests/bpf/verifier/regalloc.c
··· 1 + { 2 + "regalloc basic", 3 + .insns = { 4 + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), 5 + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), 6 + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), 7 + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), 8 + BPF_LD_MAP_FD(BPF_REG_1, 0), 9 + BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), 10 + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 8), 11 + BPF_MOV64_REG(BPF_REG_7, BPF_REG_0), 12 + BPF_EMIT_CALL(BPF_FUNC_get_prandom_u32), 13 + BPF_MOV64_REG(BPF_REG_2, BPF_REG_0), 14 + BPF_JMP_IMM(BPF_JSGT, BPF_REG_0, 20, 4), 15 + BPF_JMP_IMM(BPF_JSLT, BPF_REG_2, 0, 3), 16 + BPF_ALU64_REG(BPF_ADD, BPF_REG_7, BPF_REG_0), 17 + BPF_ALU64_REG(BPF_ADD, BPF_REG_7, BPF_REG_2), 18 + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_7, 0), 19 + BPF_EXIT_INSN(), 20 + }, 21 + .fixup_map_hash_48b = { 4 }, 22 + .result = ACCEPT, 23 + .prog_type = BPF_PROG_TYPE_TRACEPOINT, 24 + }, 25 + { 26 + "regalloc negative", 27 + .insns = { 28 + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), 29 + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), 30 + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), 31 + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), 32 + BPF_LD_MAP_FD(BPF_REG_1, 0), 33 + BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), 34 + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 8), 35 + BPF_MOV64_REG(BPF_REG_7, BPF_REG_0), 36 + BPF_EMIT_CALL(BPF_FUNC_get_prandom_u32), 37 + BPF_MOV64_REG(BPF_REG_2, BPF_REG_0), 38 + BPF_JMP_IMM(BPF_JSGT, BPF_REG_0, 24, 4), 39 + BPF_JMP_IMM(BPF_JSLT, BPF_REG_2, 0, 3), 40 + BPF_ALU64_REG(BPF_ADD, BPF_REG_7, BPF_REG_0), 41 + BPF_ALU64_REG(BPF_ADD, BPF_REG_7, BPF_REG_2), 42 + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_7, 0), 43 + BPF_EXIT_INSN(), 44 + }, 45 + .fixup_map_hash_48b = { 4 }, 46 + .result = REJECT, 47 + .errstr = "invalid access to map value, value_size=48 off=48 size=1", 48 + .prog_type = BPF_PROG_TYPE_TRACEPOINT, 49 + }, 50 + { 51 + "regalloc src_reg mark", 52 + .insns = { 53 + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), 54 + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), 55 + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), 56 + BPF_ALU64_IMM(BPF_ADD, 
BPF_REG_2, -8), 57 + BPF_LD_MAP_FD(BPF_REG_1, 0), 58 + BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), 59 + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 9), 60 + BPF_MOV64_REG(BPF_REG_7, BPF_REG_0), 61 + BPF_EMIT_CALL(BPF_FUNC_get_prandom_u32), 62 + BPF_MOV64_REG(BPF_REG_2, BPF_REG_0), 63 + BPF_JMP_IMM(BPF_JSGT, BPF_REG_0, 20, 5), 64 + BPF_MOV64_IMM(BPF_REG_3, 0), 65 + BPF_JMP_REG(BPF_JSGE, BPF_REG_3, BPF_REG_2, 3), 66 + BPF_ALU64_REG(BPF_ADD, BPF_REG_7, BPF_REG_0), 67 + BPF_ALU64_REG(BPF_ADD, BPF_REG_7, BPF_REG_2), 68 + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_7, 0), 69 + BPF_EXIT_INSN(), 70 + }, 71 + .fixup_map_hash_48b = { 4 }, 72 + .result = ACCEPT, 73 + .prog_type = BPF_PROG_TYPE_TRACEPOINT, 74 + }, 75 + { 76 + "regalloc src_reg negative", 77 + .insns = { 78 + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), 79 + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), 80 + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), 81 + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), 82 + BPF_LD_MAP_FD(BPF_REG_1, 0), 83 + BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), 84 + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 9), 85 + BPF_MOV64_REG(BPF_REG_7, BPF_REG_0), 86 + BPF_EMIT_CALL(BPF_FUNC_get_prandom_u32), 87 + BPF_MOV64_REG(BPF_REG_2, BPF_REG_0), 88 + BPF_JMP_IMM(BPF_JSGT, BPF_REG_0, 22, 5), 89 + BPF_MOV64_IMM(BPF_REG_3, 0), 90 + BPF_JMP_REG(BPF_JSGE, BPF_REG_3, BPF_REG_2, 3), 91 + BPF_ALU64_REG(BPF_ADD, BPF_REG_7, BPF_REG_0), 92 + BPF_ALU64_REG(BPF_ADD, BPF_REG_7, BPF_REG_2), 93 + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_7, 0), 94 + BPF_EXIT_INSN(), 95 + }, 96 + .fixup_map_hash_48b = { 4 }, 97 + .result = REJECT, 98 + .errstr = "invalid access to map value, value_size=48 off=44 size=8", 99 + .prog_type = BPF_PROG_TYPE_TRACEPOINT, 100 + }, 101 + { 102 + "regalloc and spill", 103 + .insns = { 104 + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), 105 + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), 106 + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), 107 + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), 108 + BPF_LD_MAP_FD(BPF_REG_1, 0), 109 + BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), 110 + 
BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 11), 111 + BPF_MOV64_REG(BPF_REG_7, BPF_REG_0), 112 + BPF_EMIT_CALL(BPF_FUNC_get_prandom_u32), 113 + BPF_MOV64_REG(BPF_REG_2, BPF_REG_0), 114 + BPF_JMP_IMM(BPF_JSGT, BPF_REG_0, 20, 7), 115 + /* r0 has upper bound that should propagate into r2 */ 116 + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_2, -8), /* spill r2 */ 117 + BPF_MOV64_IMM(BPF_REG_0, 0), 118 + BPF_MOV64_IMM(BPF_REG_2, 0), /* clear r0 and r2 */ 119 + BPF_LDX_MEM(BPF_DW, BPF_REG_3, BPF_REG_10, -8), /* fill r3 */ 120 + BPF_JMP_REG(BPF_JSGE, BPF_REG_0, BPF_REG_3, 2), 121 + /* r3 has lower and upper bounds */ 122 + BPF_ALU64_REG(BPF_ADD, BPF_REG_7, BPF_REG_3), 123 + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_7, 0), 124 + BPF_EXIT_INSN(), 125 + }, 126 + .fixup_map_hash_48b = { 4 }, 127 + .result = ACCEPT, 128 + .prog_type = BPF_PROG_TYPE_TRACEPOINT, 129 + }, 130 + { 131 + "regalloc and spill negative", 132 + .insns = { 133 + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), 134 + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), 135 + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), 136 + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), 137 + BPF_LD_MAP_FD(BPF_REG_1, 0), 138 + BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), 139 + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 11), 140 + BPF_MOV64_REG(BPF_REG_7, BPF_REG_0), 141 + BPF_EMIT_CALL(BPF_FUNC_get_prandom_u32), 142 + BPF_MOV64_REG(BPF_REG_2, BPF_REG_0), 143 + BPF_JMP_IMM(BPF_JSGT, BPF_REG_0, 48, 7), 144 + /* r0 has upper bound that should propagate into r2 */ 145 + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_2, -8), /* spill r2 */ 146 + BPF_MOV64_IMM(BPF_REG_0, 0), 147 + BPF_MOV64_IMM(BPF_REG_2, 0), /* clear r0 and r2 */ 148 + BPF_LDX_MEM(BPF_DW, BPF_REG_3, BPF_REG_10, -8), /* fill r3 */ 149 + BPF_JMP_REG(BPF_JSGE, BPF_REG_0, BPF_REG_3, 2), 150 + /* r3 has lower and upper bounds */ 151 + BPF_ALU64_REG(BPF_ADD, BPF_REG_7, BPF_REG_3), 152 + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_7, 0), 153 + BPF_EXIT_INSN(), 154 + }, 155 + .fixup_map_hash_48b = { 4 }, 156 + .result = REJECT, 157 + .errstr = 
"invalid access to map value, value_size=48 off=48 size=8", 158 + .prog_type = BPF_PROG_TYPE_TRACEPOINT, 159 + }, 160 + { 161 + "regalloc three regs", 162 + .insns = { 163 + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), 164 + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), 165 + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), 166 + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), 167 + BPF_LD_MAP_FD(BPF_REG_1, 0), 168 + BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), 169 + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 10), 170 + BPF_MOV64_REG(BPF_REG_7, BPF_REG_0), 171 + BPF_EMIT_CALL(BPF_FUNC_get_prandom_u32), 172 + BPF_MOV64_REG(BPF_REG_2, BPF_REG_0), 173 + BPF_MOV64_REG(BPF_REG_4, BPF_REG_2), 174 + BPF_JMP_IMM(BPF_JSGT, BPF_REG_0, 12, 5), 175 + BPF_JMP_IMM(BPF_JSLT, BPF_REG_2, 0, 4), 176 + BPF_ALU64_REG(BPF_ADD, BPF_REG_7, BPF_REG_0), 177 + BPF_ALU64_REG(BPF_ADD, BPF_REG_7, BPF_REG_2), 178 + BPF_ALU64_REG(BPF_ADD, BPF_REG_7, BPF_REG_4), 179 + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_7, 0), 180 + BPF_EXIT_INSN(), 181 + }, 182 + .fixup_map_hash_48b = { 4 }, 183 + .result = ACCEPT, 184 + .prog_type = BPF_PROG_TYPE_TRACEPOINT, 185 + }, 186 + { 187 + "regalloc after call", 188 + .insns = { 189 + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), 190 + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), 191 + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), 192 + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), 193 + BPF_LD_MAP_FD(BPF_REG_1, 0), 194 + BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), 195 + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 10), 196 + BPF_MOV64_REG(BPF_REG_7, BPF_REG_0), 197 + BPF_EMIT_CALL(BPF_FUNC_get_prandom_u32), 198 + BPF_MOV64_REG(BPF_REG_8, BPF_REG_0), 199 + BPF_MOV64_REG(BPF_REG_9, BPF_REG_0), 200 + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 6), 201 + BPF_JMP_IMM(BPF_JSGT, BPF_REG_8, 20, 4), 202 + BPF_JMP_IMM(BPF_JSLT, BPF_REG_9, 0, 3), 203 + BPF_ALU64_REG(BPF_ADD, BPF_REG_7, BPF_REG_8), 204 + BPF_ALU64_REG(BPF_ADD, BPF_REG_7, BPF_REG_9), 205 + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_7, 0), 206 + BPF_EXIT_INSN(), 207 + BPF_MOV64_IMM(BPF_REG_0, 0), 208 + 
BPF_EXIT_INSN(), 209 + }, 210 + .fixup_map_hash_48b = { 4 }, 211 + .result = ACCEPT, 212 + .prog_type = BPF_PROG_TYPE_TRACEPOINT, 213 + }, 214 + { 215 + "regalloc in callee", 216 + .insns = { 217 + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), 218 + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), 219 + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), 220 + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), 221 + BPF_LD_MAP_FD(BPF_REG_1, 0), 222 + BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), 223 + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 6), 224 + BPF_MOV64_REG(BPF_REG_7, BPF_REG_0), 225 + BPF_EMIT_CALL(BPF_FUNC_get_prandom_u32), 226 + BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), 227 + BPF_MOV64_REG(BPF_REG_2, BPF_REG_0), 228 + BPF_MOV64_REG(BPF_REG_3, BPF_REG_7), 229 + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 1), 230 + BPF_EXIT_INSN(), 231 + BPF_JMP_IMM(BPF_JSGT, BPF_REG_1, 20, 5), 232 + BPF_JMP_IMM(BPF_JSLT, BPF_REG_2, 0, 4), 233 + BPF_ALU64_REG(BPF_ADD, BPF_REG_3, BPF_REG_1), 234 + BPF_ALU64_REG(BPF_ADD, BPF_REG_3, BPF_REG_2), 235 + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_3, 0), 236 + BPF_EXIT_INSN(), 237 + BPF_MOV64_IMM(BPF_REG_0, 0), 238 + BPF_EXIT_INSN(), 239 + }, 240 + .fixup_map_hash_48b = { 4 }, 241 + .result = ACCEPT, 242 + .prog_type = BPF_PROG_TYPE_TRACEPOINT, 243 + },