Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next

Alexei Starovoitov says:

====================
pull-request: bpf-next 2019-06-19

The following pull-request contains BPF updates for your *net-next* tree.

The main changes are:

1) new SO_DETACH_REUSEPORT_BPF setsockopt, from Martin.

2) BTF based map definition, from Andrii.

3) support bpf_map_lookup_elem for xskmap, from Jonathan.

4) bounded loops and scalar precision logic in the verifier, from Alexei.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>

+4048 -860
+2
arch/alpha/include/uapi/asm/socket.h
··· 122 122 #define SO_RCVTIMEO_NEW 66 123 123 #define SO_SNDTIMEO_NEW 67 124 124 125 + #define SO_DETACH_REUSEPORT_BPF 68 126 + 125 127 #if !defined(__KERNEL__) 126 128 127 129 #if __BITS_PER_LONG == 64
+2
arch/mips/include/uapi/asm/socket.h
··· 133 133 #define SO_RCVTIMEO_NEW 66 134 134 #define SO_SNDTIMEO_NEW 67 135 135 136 + #define SO_DETACH_REUSEPORT_BPF 68 137 + 136 138 #if !defined(__KERNEL__) 137 139 138 140 #if __BITS_PER_LONG == 64
+2
arch/parisc/include/uapi/asm/socket.h
··· 114 114 #define SO_RCVTIMEO_NEW 0x4040 115 115 #define SO_SNDTIMEO_NEW 0x4041 116 116 117 + #define SO_DETACH_REUSEPORT_BPF 0x4042 118 + 117 119 #if !defined(__KERNEL__) 118 120 119 121 #if __BITS_PER_LONG == 64
+2
arch/sparc/include/uapi/asm/socket.h
··· 115 115 #define SO_RCVTIMEO_NEW 0x0044 116 116 #define SO_SNDTIMEO_NEW 0x0045 117 117 118 + #define SO_DETACH_REUSEPORT_BPF 0x0047 119 + 118 120 #if !defined(__KERNEL__) 119 121 120 122
+25
include/linux/bpf.h
··· 277 277 PTR_TO_TCP_SOCK, /* reg points to struct tcp_sock */ 278 278 PTR_TO_TCP_SOCK_OR_NULL, /* reg points to struct tcp_sock or NULL */ 279 279 PTR_TO_TP_BUFFER, /* reg points to a writable raw tp's buffer */ 280 + PTR_TO_XDP_SOCK, /* reg points to struct xdp_sock */ 280 281 }; 281 282 282 283 /* The information passed from prog-specific *_is_valid_access ··· 1099 1098 struct bpf_insn *insn_buf, 1100 1099 struct bpf_prog *prog, 1101 1100 u32 *target_size); 1101 + 1102 + bool bpf_xdp_sock_is_valid_access(int off, int size, enum bpf_access_type type, 1103 + struct bpf_insn_access_aux *info); 1104 + 1105 + u32 bpf_xdp_sock_convert_ctx_access(enum bpf_access_type type, 1106 + const struct bpf_insn *si, 1107 + struct bpf_insn *insn_buf, 1108 + struct bpf_prog *prog, 1109 + u32 *target_size); 1102 1110 #else 1103 1111 static inline bool bpf_tcp_sock_is_valid_access(int off, int size, 1104 1112 enum bpf_access_type type, ··· 1117 1107 } 1118 1108 1119 1109 static inline u32 bpf_tcp_sock_convert_ctx_access(enum bpf_access_type type, 1110 + const struct bpf_insn *si, 1111 + struct bpf_insn *insn_buf, 1112 + struct bpf_prog *prog, 1113 + u32 *target_size) 1114 + { 1115 + return 0; 1116 + } 1117 + static inline bool bpf_xdp_sock_is_valid_access(int off, int size, 1118 + enum bpf_access_type type, 1119 + struct bpf_insn_access_aux *info) 1120 + { 1121 + return false; 1122 + } 1123 + 1124 + static inline u32 bpf_xdp_sock_convert_ctx_access(enum bpf_access_type type, 1120 1125 const struct bpf_insn *si, 1121 1126 struct bpf_insn *insn_buf, 1122 1127 struct bpf_prog *prog,
+68 -1
include/linux/bpf_verifier.h
··· 136 136 */ 137 137 s32 subreg_def; 138 138 enum bpf_reg_liveness live; 139 + /* if (!precise && SCALAR_VALUE) min/max/tnum don't affect safety */ 140 + bool precise; 139 141 }; 140 142 141 143 enum bpf_stack_slot_type { ··· 189 187 struct bpf_stack_state *stack; 190 188 }; 191 189 190 + struct bpf_idx_pair { 191 + u32 prev_idx; 192 + u32 idx; 193 + }; 194 + 192 195 #define MAX_CALL_FRAMES 8 193 196 struct bpf_verifier_state { 194 197 /* call stack tracking */ 195 198 struct bpf_func_state *frame[MAX_CALL_FRAMES]; 199 + struct bpf_verifier_state *parent; 200 + /* 201 + * 'branches' field is the number of branches left to explore: 202 + * 0 - all possible paths from this state reached bpf_exit or 203 + * were safely pruned 204 + * 1 - at least one path is being explored. 205 + * This state hasn't reached bpf_exit 206 + * 2 - at least two paths are being explored. 207 + * This state is an immediate parent of two children. 208 + * One is fallthrough branch with branches==1 and another 209 + * state is pushed into stack (to be explored later) also with 210 + * branches==1. The parent of this state has branches==1. 211 + * The verifier state tree connected via 'parent' pointer looks like: 212 + * 1 213 + * 1 214 + * 2 -> 1 (first 'if' pushed into stack) 215 + * 1 216 + * 2 -> 1 (second 'if' pushed into stack) 217 + * 1 218 + * 1 219 + * 1 bpf_exit. 220 + * 221 + * Once do_check() reaches bpf_exit, it calls update_branch_counts() 222 + * and the verifier state tree will look: 223 + * 1 224 + * 1 225 + * 2 -> 1 (first 'if' pushed into stack) 226 + * 1 227 + * 1 -> 1 (second 'if' pushed into stack) 228 + * 0 229 + * 0 230 + * 0 bpf_exit. 231 + * After pop_stack() the do_check() will resume at second 'if'. 232 + * 233 + * If is_state_visited() sees a state with branches > 0 it means 234 + * there is a loop. If such state is exactly equal to the current state 235 + * it's an infinite loop. 
Note states_equal() checks for states 236 + * equvalency, so two states being 'states_equal' does not mean 237 + * infinite loop. The exact comparison is provided by 238 + * states_maybe_looping() function. It's a stronger pre-check and 239 + * much faster than states_equal(). 240 + * 241 + * This algorithm may not find all possible infinite loops or 242 + * loop iteration count may be too high. 243 + * In such cases BPF_COMPLEXITY_LIMIT_INSNS limit kicks in. 244 + */ 245 + u32 branches; 196 246 u32 insn_idx; 197 247 u32 curframe; 198 248 u32 active_spin_lock; 199 249 bool speculative; 250 + 251 + /* first and last insn idx of this verifier state */ 252 + u32 first_insn_idx; 253 + u32 last_insn_idx; 254 + /* jmp history recorded from first to last. 255 + * backtracking is using it to go from last to first. 256 + * For most states jmp_history_cnt is [0-3]. 257 + * For loops can go up to ~40. 258 + */ 259 + struct bpf_idx_pair *jmp_history; 260 + u32 jmp_history_cnt; 200 261 }; 201 262 202 263 #define bpf_get_spilled_reg(slot, frame) \ ··· 374 309 } cfg; 375 310 u32 subprog_cnt; 376 311 /* number of instructions analyzed by the verifier */ 377 - u32 insn_processed; 312 + u32 prev_insn_processed, insn_processed; 313 + /* number of jmps, calls, exits analyzed so far */ 314 + u32 prev_jmps_processed, jmps_processed; 378 315 /* total verification time */ 379 316 u64 verification_time; 380 317 /* maximum number of verifier states kept in 'branching' instructions */
+2
include/net/sock_reuseport.h
··· 35 35 struct sk_buff *skb, 36 36 int hdr_len); 37 37 extern int reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog); 38 + extern int reuseport_detach_prog(struct sock *sk); 39 + 38 40 int reuseport_get_id(struct sock_reuseport *reuse); 39 41 40 42 #endif /* _SOCK_REUSEPORT_H */
+2 -2
include/net/xdp_sock.h
··· 58 58 struct xdp_umem *umem; 59 59 struct list_head flush_node; 60 60 u16 queue_id; 61 - struct xsk_queue *tx ____cacheline_aligned_in_smp; 62 - struct list_head list; 63 61 bool zc; 64 62 /* Protects multiple processes in the control path */ 65 63 struct mutex mutex; 64 + struct xsk_queue *tx ____cacheline_aligned_in_smp; 65 + struct list_head list; 66 66 /* Mutual exclusion of NAPI TX thread and sendmsg error paths 67 67 * in the SKB destructor callback. 68 68 */
+2
include/uapi/asm-generic/socket.h
··· 117 117 #define SO_RCVTIMEO_NEW 66 118 118 #define SO_SNDTIMEO_NEW 67 119 119 120 + #define SO_DETACH_REUSEPORT_BPF 68 121 + 120 122 #if !defined(__KERNEL__) 121 123 122 124 #if __BITS_PER_LONG == 64 || (defined(__x86_64__) && defined(__ILP32__))
+6
include/uapi/linux/bpf.h
··· 3085 3085 }; 3086 3086 }; 3087 3087 3088 + struct bpf_xdp_sock { 3089 + __u32 queue_id; 3090 + }; 3091 + 3088 3092 #define XDP_PACKET_HEADROOM 256 3089 3093 3090 3094 /* User return codes for XDP prog type. ··· 3249 3245 __u32 msg_src_ip6[4]; /* Allows 1,2,4-byte read an 4-byte write. 3250 3246 * Stored in network byte order. 3251 3247 */ 3248 + __bpf_md_ptr(struct bpf_sock *, sk); 3252 3249 }; 3253 3250 3254 3251 /* User bpf_sock_ops struct to access socket values and specify request ops ··· 3301 3296 __u32 sk_txhash; 3302 3297 __u64 bytes_received; 3303 3298 __u64 bytes_acked; 3299 + __bpf_md_ptr(struct bpf_sock *, sk); 3304 3300 }; 3305 3301 3306 3302 /* Definitions for bpf_sock_ops_cb_flags */
+1
kernel/bpf/Makefile
··· 1 1 # SPDX-License-Identifier: GPL-2.0 2 2 obj-y := core.o 3 + CFLAGS_core.o += $(call cc-disable-warning, override-init) 3 4 4 5 obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o 5 6 obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o
+1 -1
kernel/bpf/devmap.c
··· 80 80 static struct bpf_map *dev_map_alloc(union bpf_attr *attr) 81 81 { 82 82 struct bpf_dtab *dtab; 83 - int err = -EINVAL; 84 83 u64 cost; 84 + int err; 85 85 86 86 if (!capable(CAP_NET_ADMIN)) 87 87 return ERR_PTR(-EPERM);
+723 -68
kernel/bpf/verifier.c
··· 326 326 { 327 327 return type == PTR_TO_SOCKET || 328 328 type == PTR_TO_SOCK_COMMON || 329 - type == PTR_TO_TCP_SOCK; 329 + type == PTR_TO_TCP_SOCK || 330 + type == PTR_TO_XDP_SOCK; 330 331 } 331 332 332 333 static bool reg_type_may_be_null(enum bpf_reg_type type) ··· 399 398 [PTR_TO_TCP_SOCK] = "tcp_sock", 400 399 [PTR_TO_TCP_SOCK_OR_NULL] = "tcp_sock_or_null", 401 400 [PTR_TO_TP_BUFFER] = "tp_buffer", 401 + [PTR_TO_XDP_SOCK] = "xdp_sock", 402 402 }; 403 403 404 404 static char slot_type_char[] = { ··· 447 445 verbose(env, " R%d", i); 448 446 print_liveness(env, reg->live); 449 447 verbose(env, "=%s", reg_type_str[t]); 448 + if (t == SCALAR_VALUE && reg->precise) 449 + verbose(env, "P"); 450 450 if ((t == SCALAR_VALUE || t == PTR_TO_STACK) && 451 451 tnum_is_const(reg->var_off)) { 452 452 /* reg->off should be 0 for SCALAR_VALUE */ 453 453 verbose(env, "%lld", reg->var_off.value + reg->off); 454 - if (t == PTR_TO_STACK) 455 - verbose(env, ",call_%d", func(env, reg)->callsite); 456 454 } else { 457 455 verbose(env, "(id=%d", reg->id); 458 456 if (reg_type_may_be_refcounted_or_null(t)) ··· 514 512 continue; 515 513 verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE); 516 514 print_liveness(env, state->stack[i].spilled_ptr.live); 517 - if (state->stack[i].slot_type[0] == STACK_SPILL) 518 - verbose(env, "=%s", 519 - reg_type_str[state->stack[i].spilled_ptr.type]); 520 - else 515 + if (state->stack[i].slot_type[0] == STACK_SPILL) { 516 + reg = &state->stack[i].spilled_ptr; 517 + t = reg->type; 518 + verbose(env, "=%s", reg_type_str[t]); 519 + if (t == SCALAR_VALUE && reg->precise) 520 + verbose(env, "P"); 521 + if (t == SCALAR_VALUE && tnum_is_const(reg->var_off)) 522 + verbose(env, "%lld", reg->var_off.value + reg->off); 523 + } else { 521 524 verbose(env, "=%s", types_buf); 525 + } 522 526 } 523 527 if (state->acquired_refs && state->refs[0].id) { 524 528 verbose(env, " refs=%d", state->refs[0].id); ··· 673 665 kfree(state); 674 666 } 675 667 668 + static void 
clear_jmp_history(struct bpf_verifier_state *state) 669 + { 670 + kfree(state->jmp_history); 671 + state->jmp_history = NULL; 672 + state->jmp_history_cnt = 0; 673 + } 674 + 676 675 static void free_verifier_state(struct bpf_verifier_state *state, 677 676 bool free_self) 678 677 { ··· 689 674 free_func_state(state->frame[i]); 690 675 state->frame[i] = NULL; 691 676 } 677 + clear_jmp_history(state); 692 678 if (free_self) 693 679 kfree(state); 694 680 } ··· 717 701 const struct bpf_verifier_state *src) 718 702 { 719 703 struct bpf_func_state *dst; 704 + u32 jmp_sz = sizeof(struct bpf_idx_pair) * src->jmp_history_cnt; 720 705 int i, err; 706 + 707 + if (dst_state->jmp_history_cnt < src->jmp_history_cnt) { 708 + kfree(dst_state->jmp_history); 709 + dst_state->jmp_history = kmalloc(jmp_sz, GFP_USER); 710 + if (!dst_state->jmp_history) 711 + return -ENOMEM; 712 + } 713 + memcpy(dst_state->jmp_history, src->jmp_history, jmp_sz); 714 + dst_state->jmp_history_cnt = src->jmp_history_cnt; 721 715 722 716 /* if dst has more stack frames then src frame, free them */ 723 717 for (i = src->curframe + 1; i <= dst_state->curframe; i++) { ··· 737 711 dst_state->speculative = src->speculative; 738 712 dst_state->curframe = src->curframe; 739 713 dst_state->active_spin_lock = src->active_spin_lock; 714 + dst_state->branches = src->branches; 715 + dst_state->parent = src->parent; 716 + dst_state->first_insn_idx = src->first_insn_idx; 717 + dst_state->last_insn_idx = src->last_insn_idx; 740 718 for (i = 0; i <= src->curframe; i++) { 741 719 dst = dst_state->frame[i]; 742 720 if (!dst) { ··· 754 724 return err; 755 725 } 756 726 return 0; 727 + } 728 + 729 + static void update_branch_counts(struct bpf_verifier_env *env, struct bpf_verifier_state *st) 730 + { 731 + while (st) { 732 + u32 br = --st->branches; 733 + 734 + /* WARN_ON(br > 1) technically makes sense here, 735 + * but see comment in push_stack(), hence: 736 + */ 737 + WARN_ONCE((int)br < 0, 738 + "BUG 
update_branch_counts:branches_to_explore=%d\n", 739 + br); 740 + if (br) 741 + break; 742 + st = st->parent; 743 + } 757 744 } 758 745 759 746 static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx, ··· 825 778 verbose(env, "The sequence of %d jumps is too complex.\n", 826 779 env->stack_size); 827 780 goto err; 781 + } 782 + if (elem->st.parent) { 783 + ++elem->st.parent->branches; 784 + /* WARN_ON(branches > 2) technically makes sense here, 785 + * but 786 + * 1. speculative states will bump 'branches' for non-branch 787 + * instructions 788 + * 2. is_state_visited() heuristics may decide not to create 789 + * a new state for a sequence of branches and all such current 790 + * and cloned states will be pointing to a single parent state 791 + * which might have large 'branches' count. 792 + */ 828 793 } 829 794 return &elem->st; 830 795 err: ··· 985 926 reg->smax_value = S64_MAX; 986 927 reg->umin_value = 0; 987 928 reg->umax_value = U64_MAX; 929 + 930 + /* constant backtracking is enabled for root only for now */ 931 + reg->precise = capable(CAP_SYS_ADMIN) ? false : true; 988 932 } 989 933 990 934 /* Mark a register as having a completely unknown (scalar) value. */ ··· 1399 1337 return 0; 1400 1338 } 1401 1339 1340 + /* for any branch, call, exit record the history of jmps in the given state */ 1341 + static int push_jmp_history(struct bpf_verifier_env *env, 1342 + struct bpf_verifier_state *cur) 1343 + { 1344 + u32 cnt = cur->jmp_history_cnt; 1345 + struct bpf_idx_pair *p; 1346 + 1347 + cnt++; 1348 + p = krealloc(cur->jmp_history, cnt * sizeof(*p), GFP_USER); 1349 + if (!p) 1350 + return -ENOMEM; 1351 + p[cnt - 1].idx = env->insn_idx; 1352 + p[cnt - 1].prev_idx = env->prev_insn_idx; 1353 + cur->jmp_history = p; 1354 + cur->jmp_history_cnt = cnt; 1355 + return 0; 1356 + } 1357 + 1358 + /* Backtrack one insn at a time. If idx is not at the top of recorded 1359 + * history then previous instruction came from straight line execution. 
1360 + */ 1361 + static int get_prev_insn_idx(struct bpf_verifier_state *st, int i, 1362 + u32 *history) 1363 + { 1364 + u32 cnt = *history; 1365 + 1366 + if (cnt && st->jmp_history[cnt - 1].idx == i) { 1367 + i = st->jmp_history[cnt - 1].prev_idx; 1368 + (*history)--; 1369 + } else { 1370 + i--; 1371 + } 1372 + return i; 1373 + } 1374 + 1375 + /* For given verifier state backtrack_insn() is called from the last insn to 1376 + * the first insn. Its purpose is to compute a bitmask of registers and 1377 + * stack slots that needs precision in the parent verifier state. 1378 + */ 1379 + static int backtrack_insn(struct bpf_verifier_env *env, int idx, 1380 + u32 *reg_mask, u64 *stack_mask) 1381 + { 1382 + const struct bpf_insn_cbs cbs = { 1383 + .cb_print = verbose, 1384 + .private_data = env, 1385 + }; 1386 + struct bpf_insn *insn = env->prog->insnsi + idx; 1387 + u8 class = BPF_CLASS(insn->code); 1388 + u8 opcode = BPF_OP(insn->code); 1389 + u8 mode = BPF_MODE(insn->code); 1390 + u32 dreg = 1u << insn->dst_reg; 1391 + u32 sreg = 1u << insn->src_reg; 1392 + u32 spi; 1393 + 1394 + if (insn->code == 0) 1395 + return 0; 1396 + if (env->log.level & BPF_LOG_LEVEL) { 1397 + verbose(env, "regs=%x stack=%llx before ", *reg_mask, *stack_mask); 1398 + verbose(env, "%d: ", idx); 1399 + print_bpf_insn(&cbs, insn, env->allow_ptr_leaks); 1400 + } 1401 + 1402 + if (class == BPF_ALU || class == BPF_ALU64) { 1403 + if (!(*reg_mask & dreg)) 1404 + return 0; 1405 + if (opcode == BPF_MOV) { 1406 + if (BPF_SRC(insn->code) == BPF_X) { 1407 + /* dreg = sreg 1408 + * dreg needs precision after this insn 1409 + * sreg needs precision before this insn 1410 + */ 1411 + *reg_mask &= ~dreg; 1412 + *reg_mask |= sreg; 1413 + } else { 1414 + /* dreg = K 1415 + * dreg needs precision after this insn. 1416 + * Corresponding register is already marked 1417 + * as precise=true in this verifier state. 
1418 + * No further markings in parent are necessary 1419 + */ 1420 + *reg_mask &= ~dreg; 1421 + } 1422 + } else { 1423 + if (BPF_SRC(insn->code) == BPF_X) { 1424 + /* dreg += sreg 1425 + * both dreg and sreg need precision 1426 + * before this insn 1427 + */ 1428 + *reg_mask |= sreg; 1429 + } /* else dreg += K 1430 + * dreg still needs precision before this insn 1431 + */ 1432 + } 1433 + } else if (class == BPF_LDX) { 1434 + if (!(*reg_mask & dreg)) 1435 + return 0; 1436 + *reg_mask &= ~dreg; 1437 + 1438 + /* scalars can only be spilled into stack w/o losing precision. 1439 + * Load from any other memory can be zero extended. 1440 + * The desire to keep that precision is already indicated 1441 + * by 'precise' mark in corresponding register of this state. 1442 + * No further tracking necessary. 1443 + */ 1444 + if (insn->src_reg != BPF_REG_FP) 1445 + return 0; 1446 + if (BPF_SIZE(insn->code) != BPF_DW) 1447 + return 0; 1448 + 1449 + /* dreg = *(u64 *)[fp - off] was a fill from the stack. 1450 + * that [fp - off] slot contains scalar that needs to be 1451 + * tracked with precision 1452 + */ 1453 + spi = (-insn->off - 1) / BPF_REG_SIZE; 1454 + if (spi >= 64) { 1455 + verbose(env, "BUG spi %d\n", spi); 1456 + WARN_ONCE(1, "verifier backtracking bug"); 1457 + return -EFAULT; 1458 + } 1459 + *stack_mask |= 1ull << spi; 1460 + } else if (class == BPF_STX) { 1461 + if (*reg_mask & dreg) 1462 + /* stx shouldn't be using _scalar_ dst_reg 1463 + * to access memory. It means backtracking 1464 + * encountered a case of pointer subtraction. 
1465 + */ 1466 + return -ENOTSUPP; 1467 + /* scalars can only be spilled into stack */ 1468 + if (insn->dst_reg != BPF_REG_FP) 1469 + return 0; 1470 + if (BPF_SIZE(insn->code) != BPF_DW) 1471 + return 0; 1472 + spi = (-insn->off - 1) / BPF_REG_SIZE; 1473 + if (spi >= 64) { 1474 + verbose(env, "BUG spi %d\n", spi); 1475 + WARN_ONCE(1, "verifier backtracking bug"); 1476 + return -EFAULT; 1477 + } 1478 + if (!(*stack_mask & (1ull << spi))) 1479 + return 0; 1480 + *stack_mask &= ~(1ull << spi); 1481 + *reg_mask |= sreg; 1482 + } else if (class == BPF_JMP || class == BPF_JMP32) { 1483 + if (opcode == BPF_CALL) { 1484 + if (insn->src_reg == BPF_PSEUDO_CALL) 1485 + return -ENOTSUPP; 1486 + /* regular helper call sets R0 */ 1487 + *reg_mask &= ~1; 1488 + if (*reg_mask & 0x3f) { 1489 + /* if backtracing was looking for registers R1-R5 1490 + * they should have been found already. 1491 + */ 1492 + verbose(env, "BUG regs %x\n", *reg_mask); 1493 + WARN_ONCE(1, "verifier backtracking bug"); 1494 + return -EFAULT; 1495 + } 1496 + } else if (opcode == BPF_EXIT) { 1497 + return -ENOTSUPP; 1498 + } 1499 + } else if (class == BPF_LD) { 1500 + if (!(*reg_mask & dreg)) 1501 + return 0; 1502 + *reg_mask &= ~dreg; 1503 + /* It's ld_imm64 or ld_abs or ld_ind. 1504 + * For ld_imm64 no further tracking of precision 1505 + * into parent is necessary 1506 + */ 1507 + if (mode == BPF_IND || mode == BPF_ABS) 1508 + /* to be analyzed */ 1509 + return -ENOTSUPP; 1510 + } else if (class == BPF_ST) { 1511 + if (*reg_mask & dreg) 1512 + /* likely pointer subtraction */ 1513 + return -ENOTSUPP; 1514 + } 1515 + return 0; 1516 + } 1517 + 1518 + /* the scalar precision tracking algorithm: 1519 + * . at the start all registers have precise=false. 1520 + * . scalar ranges are tracked as normal through alu and jmp insns. 1521 + * . once precise value of the scalar register is used in: 1522 + * . ptr + scalar alu 1523 + * . if (scalar cond K|scalar) 1524 + * . helper_call(.., scalar, ...) 
where ARG_CONST is expected 1525 + * backtrack through the verifier states and mark all registers and 1526 + * stack slots with spilled constants that these scalar regisers 1527 + * should be precise. 1528 + * . during state pruning two registers (or spilled stack slots) 1529 + * are equivalent if both are not precise. 1530 + * 1531 + * Note the verifier cannot simply walk register parentage chain, 1532 + * since many different registers and stack slots could have been 1533 + * used to compute single precise scalar. 1534 + * 1535 + * The approach of starting with precise=true for all registers and then 1536 + * backtrack to mark a register as not precise when the verifier detects 1537 + * that program doesn't care about specific value (e.g., when helper 1538 + * takes register as ARG_ANYTHING parameter) is not safe. 1539 + * 1540 + * It's ok to walk single parentage chain of the verifier states. 1541 + * It's possible that this backtracking will go all the way till 1st insn. 1542 + * All other branches will be explored for needing precision later. 1543 + * 1544 + * The backtracking needs to deal with cases like: 1545 + * R8=map_value(id=0,off=0,ks=4,vs=1952,imm=0) R9_w=map_value(id=0,off=40,ks=4,vs=1952,imm=0) 1546 + * r9 -= r8 1547 + * r5 = r9 1548 + * if r5 > 0x79f goto pc+7 1549 + * R5_w=inv(id=0,umax_value=1951,var_off=(0x0; 0x7ff)) 1550 + * r5 += 1 1551 + * ... 1552 + * call bpf_perf_event_output#25 1553 + * where .arg5_type = ARG_CONST_SIZE_OR_ZERO 1554 + * 1555 + * and this case: 1556 + * r6 = 1 1557 + * call foo // uses callee's r6 inside to compute r0 1558 + * r0 += r6 1559 + * if r0 == 0 goto 1560 + * 1561 + * to track above reg_mask/stack_mask needs to be independent for each frame. 1562 + * 1563 + * Also if parent's curframe > frame where backtracking started, 1564 + * the verifier need to mark registers in both frames, otherwise callees 1565 + * may incorrectly prune callers. 
This is similar to 1566 + * commit 7640ead93924 ("bpf: verifier: make sure callees don't prune with caller differences") 1567 + * 1568 + * For now backtracking falls back into conservative marking. 1569 + */ 1570 + static void mark_all_scalars_precise(struct bpf_verifier_env *env, 1571 + struct bpf_verifier_state *st) 1572 + { 1573 + struct bpf_func_state *func; 1574 + struct bpf_reg_state *reg; 1575 + int i, j; 1576 + 1577 + /* big hammer: mark all scalars precise in this path. 1578 + * pop_stack may still get !precise scalars. 1579 + */ 1580 + for (; st; st = st->parent) 1581 + for (i = 0; i <= st->curframe; i++) { 1582 + func = st->frame[i]; 1583 + for (j = 0; j < BPF_REG_FP; j++) { 1584 + reg = &func->regs[j]; 1585 + if (reg->type != SCALAR_VALUE) 1586 + continue; 1587 + reg->precise = true; 1588 + } 1589 + for (j = 0; j < func->allocated_stack / BPF_REG_SIZE; j++) { 1590 + if (func->stack[j].slot_type[0] != STACK_SPILL) 1591 + continue; 1592 + reg = &func->stack[j].spilled_ptr; 1593 + if (reg->type != SCALAR_VALUE) 1594 + continue; 1595 + reg->precise = true; 1596 + } 1597 + } 1598 + } 1599 + 1600 + static int mark_chain_precision(struct bpf_verifier_env *env, int regno) 1601 + { 1602 + struct bpf_verifier_state *st = env->cur_state; 1603 + int first_idx = st->first_insn_idx; 1604 + int last_idx = env->insn_idx; 1605 + struct bpf_func_state *func; 1606 + struct bpf_reg_state *reg; 1607 + u32 reg_mask = 1u << regno; 1608 + u64 stack_mask = 0; 1609 + bool skip_first = true; 1610 + int i, err; 1611 + 1612 + if (!env->allow_ptr_leaks) 1613 + /* backtracking is root only for now */ 1614 + return 0; 1615 + 1616 + func = st->frame[st->curframe]; 1617 + reg = &func->regs[regno]; 1618 + if (reg->type != SCALAR_VALUE) { 1619 + WARN_ONCE(1, "backtracing misuse"); 1620 + return -EFAULT; 1621 + } 1622 + if (reg->precise) 1623 + return 0; 1624 + func->regs[regno].precise = true; 1625 + 1626 + for (;;) { 1627 + DECLARE_BITMAP(mask, 64); 1628 + bool new_marks = false; 1629 + 
u32 history = st->jmp_history_cnt; 1630 + 1631 + if (env->log.level & BPF_LOG_LEVEL) 1632 + verbose(env, "last_idx %d first_idx %d\n", last_idx, first_idx); 1633 + for (i = last_idx;;) { 1634 + if (skip_first) { 1635 + err = 0; 1636 + skip_first = false; 1637 + } else { 1638 + err = backtrack_insn(env, i, &reg_mask, &stack_mask); 1639 + } 1640 + if (err == -ENOTSUPP) { 1641 + mark_all_scalars_precise(env, st); 1642 + return 0; 1643 + } else if (err) { 1644 + return err; 1645 + } 1646 + if (!reg_mask && !stack_mask) 1647 + /* Found assignment(s) into tracked register in this state. 1648 + * Since this state is already marked, just return. 1649 + * Nothing to be tracked further in the parent state. 1650 + */ 1651 + return 0; 1652 + if (i == first_idx) 1653 + break; 1654 + i = get_prev_insn_idx(st, i, &history); 1655 + if (i >= env->prog->len) { 1656 + /* This can happen if backtracking reached insn 0 1657 + * and there are still reg_mask or stack_mask 1658 + * to backtrack. 1659 + * It means the backtracking missed the spot where 1660 + * particular register was initialized with a constant. 1661 + */ 1662 + verbose(env, "BUG backtracking idx %d\n", i); 1663 + WARN_ONCE(1, "verifier backtracking bug"); 1664 + return -EFAULT; 1665 + } 1666 + } 1667 + st = st->parent; 1668 + if (!st) 1669 + break; 1670 + 1671 + func = st->frame[st->curframe]; 1672 + bitmap_from_u64(mask, reg_mask); 1673 + for_each_set_bit(i, mask, 32) { 1674 + reg = &func->regs[i]; 1675 + if (reg->type != SCALAR_VALUE) 1676 + continue; 1677 + if (!reg->precise) 1678 + new_marks = true; 1679 + reg->precise = true; 1680 + } 1681 + 1682 + bitmap_from_u64(mask, stack_mask); 1683 + for_each_set_bit(i, mask, 64) { 1684 + if (i >= func->allocated_stack / BPF_REG_SIZE) { 1685 + /* This can happen if backtracking 1686 + * is propagating stack precision where 1687 + * caller has larger stack frame 1688 + * than callee, but backtrack_insn() should 1689 + * have returned -ENOTSUPP. 
1690 + */ 1691 + verbose(env, "BUG spi %d stack_size %d\n", 1692 + i, func->allocated_stack); 1693 + WARN_ONCE(1, "verifier backtracking bug"); 1694 + return -EFAULT; 1695 + } 1696 + 1697 + if (func->stack[i].slot_type[0] != STACK_SPILL) 1698 + continue; 1699 + reg = &func->stack[i].spilled_ptr; 1700 + if (reg->type != SCALAR_VALUE) 1701 + continue; 1702 + if (!reg->precise) 1703 + new_marks = true; 1704 + reg->precise = true; 1705 + } 1706 + if (env->log.level & BPF_LOG_LEVEL) { 1707 + print_verifier_state(env, func); 1708 + verbose(env, "parent %s regs=%x stack=%llx marks\n", 1709 + new_marks ? "didn't have" : "already had", 1710 + reg_mask, stack_mask); 1711 + } 1712 + 1713 + if (!new_marks) 1714 + break; 1715 + 1716 + last_idx = st->last_insn_idx; 1717 + first_idx = st->first_insn_idx; 1718 + } 1719 + return 0; 1720 + } 1721 + 1722 + 1402 1723 static bool is_spillable_regtype(enum bpf_reg_type type) 1403 1724 { 1404 1725 switch (type) { ··· 1800 1355 case PTR_TO_SOCK_COMMON_OR_NULL: 1801 1356 case PTR_TO_TCP_SOCK: 1802 1357 case PTR_TO_TCP_SOCK_OR_NULL: 1358 + case PTR_TO_XDP_SOCK: 1803 1359 return true; 1804 1360 default: 1805 1361 return false; ··· 1813 1367 return reg->type == SCALAR_VALUE && tnum_equals_const(reg->var_off, 0); 1814 1368 } 1815 1369 1370 + static bool register_is_const(struct bpf_reg_state *reg) 1371 + { 1372 + return reg->type == SCALAR_VALUE && tnum_is_const(reg->var_off); 1373 + } 1374 + 1375 + static void save_register_state(struct bpf_func_state *state, 1376 + int spi, struct bpf_reg_state *reg) 1377 + { 1378 + int i; 1379 + 1380 + state->stack[spi].spilled_ptr = *reg; 1381 + state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN; 1382 + 1383 + for (i = 0; i < BPF_REG_SIZE; i++) 1384 + state->stack[spi].slot_type[i] = STACK_SPILL; 1385 + } 1386 + 1816 1387 /* check_stack_read/write functions track spill/fill of registers, 1817 1388 * stack boundary and alignment are checked in check_mem_access() 1818 1389 */ ··· 1839 1376 { 1840 1377 
struct bpf_func_state *cur; /* state of the current function */ 1841 1378 int i, slot = -off - 1, spi = slot / BPF_REG_SIZE, err; 1842 - enum bpf_reg_type type; 1379 + u32 dst_reg = env->prog->insnsi[insn_idx].dst_reg; 1380 + struct bpf_reg_state *reg = NULL; 1843 1381 1844 1382 err = realloc_func_state(state, round_up(slot + 1, BPF_REG_SIZE), 1845 1383 state->acquired_refs, true); ··· 1857 1393 } 1858 1394 1859 1395 cur = env->cur_state->frame[env->cur_state->curframe]; 1860 - if (value_regno >= 0 && 1861 - is_spillable_regtype((type = cur->regs[value_regno].type))) { 1396 + if (value_regno >= 0) 1397 + reg = &cur->regs[value_regno]; 1862 1398 1399 + if (reg && size == BPF_REG_SIZE && register_is_const(reg) && 1400 + !register_is_null(reg) && env->allow_ptr_leaks) { 1401 + if (dst_reg != BPF_REG_FP) { 1402 + /* The backtracking logic can only recognize explicit 1403 + * stack slot address like [fp - 8]. Other spill of 1404 + * scalar via different register has to be conervative. 1405 + * Backtrack from here and mark all registers as precise 1406 + * that contributed into 'reg' being a constant. 
1407 + */ 1408 + err = mark_chain_precision(env, value_regno); 1409 + if (err) 1410 + return err; 1411 + } 1412 + save_register_state(state, spi, reg); 1413 + } else if (reg && is_spillable_regtype(reg->type)) { 1863 1414 /* register containing pointer is being spilled into stack */ 1864 1415 if (size != BPF_REG_SIZE) { 1416 + verbose_linfo(env, insn_idx, "; "); 1865 1417 verbose(env, "invalid size of register spill\n"); 1866 1418 return -EACCES; 1867 1419 } 1868 1420 1869 - if (state != cur && type == PTR_TO_STACK) { 1421 + if (state != cur && reg->type == PTR_TO_STACK) { 1870 1422 verbose(env, "cannot spill pointers to stack into stack frame of the caller\n"); 1871 1423 return -EINVAL; 1872 1424 } 1873 1425 1874 - /* save register state */ 1875 - state->stack[spi].spilled_ptr = cur->regs[value_regno]; 1876 - state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN; 1426 + if (!env->allow_ptr_leaks) { 1427 + bool sanitize = false; 1877 1428 1878 - for (i = 0; i < BPF_REG_SIZE; i++) { 1879 - if (state->stack[spi].slot_type[i] == STACK_MISC && 1880 - !env->allow_ptr_leaks) { 1429 + if (state->stack[spi].slot_type[0] == STACK_SPILL && 1430 + register_is_const(&state->stack[spi].spilled_ptr)) 1431 + sanitize = true; 1432 + for (i = 0; i < BPF_REG_SIZE; i++) 1433 + if (state->stack[spi].slot_type[i] == STACK_MISC) { 1434 + sanitize = true; 1435 + break; 1436 + } 1437 + if (sanitize) { 1881 1438 int *poff = &env->insn_aux_data[insn_idx].sanitize_stack_off; 1882 1439 int soff = (-spi - 1) * BPF_REG_SIZE; 1883 1440 ··· 1921 1436 } 1922 1437 *poff = soff; 1923 1438 } 1924 - state->stack[spi].slot_type[i] = STACK_SPILL; 1925 1439 } 1440 + save_register_state(state, spi, reg); 1926 1441 } else { 1927 1442 u8 type = STACK_MISC; 1928 1443 ··· 1945 1460 state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN; 1946 1461 1947 1462 /* when we zero initialize stack slots mark them as such */ 1948 - if (value_regno >= 0 && 1949 - register_is_null(&cur->regs[value_regno])) 1463 + if 
(reg && register_is_null(reg)) { 1464 + /* backtracking doesn't work for STACK_ZERO yet. */ 1465 + err = mark_chain_precision(env, value_regno); 1466 + if (err) 1467 + return err; 1950 1468 type = STACK_ZERO; 1469 + } 1951 1470 1952 1471 /* Mark slots affected by this stack write. */ 1953 1472 for (i = 0; i < size; i++) ··· 1968 1479 struct bpf_verifier_state *vstate = env->cur_state; 1969 1480 struct bpf_func_state *state = vstate->frame[vstate->curframe]; 1970 1481 int i, slot = -off - 1, spi = slot / BPF_REG_SIZE; 1482 + struct bpf_reg_state *reg; 1971 1483 u8 *stype; 1972 1484 1973 1485 if (reg_state->allocated_stack <= slot) { ··· 1977 1487 return -EACCES; 1978 1488 } 1979 1489 stype = reg_state->stack[spi].slot_type; 1490 + reg = &reg_state->stack[spi].spilled_ptr; 1980 1491 1981 1492 if (stype[0] == STACK_SPILL) { 1982 1493 if (size != BPF_REG_SIZE) { 1983 - verbose(env, "invalid size of register spill\n"); 1984 - return -EACCES; 1494 + if (reg->type != SCALAR_VALUE) { 1495 + verbose_linfo(env, env->insn_idx, "; "); 1496 + verbose(env, "invalid size of register fill\n"); 1497 + return -EACCES; 1498 + } 1499 + if (value_regno >= 0) { 1500 + mark_reg_unknown(env, state->regs, value_regno); 1501 + state->regs[value_regno].live |= REG_LIVE_WRITTEN; 1502 + } 1503 + mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64); 1504 + return 0; 1985 1505 } 1986 1506 for (i = 1; i < BPF_REG_SIZE; i++) { 1987 1507 if (stype[(slot - i) % BPF_REG_SIZE] != STACK_SPILL) { ··· 2002 1502 2003 1503 if (value_regno >= 0) { 2004 1504 /* restore register state from stack */ 2005 - state->regs[value_regno] = reg_state->stack[spi].spilled_ptr; 1505 + state->regs[value_regno] = *reg; 2006 1506 /* mark reg as written since spilled pointer state likely 2007 1507 * has its liveness marks cleared by is_state_visited() 2008 1508 * which resets stack/reg liveness for state transitions 2009 1509 */ 2010 1510 state->regs[value_regno].live |= REG_LIVE_WRITTEN; 2011 1511 } 2012 - 
mark_reg_read(env, &reg_state->stack[spi].spilled_ptr, 2013 - reg_state->stack[spi].spilled_ptr.parent, 2014 - REG_LIVE_READ64); 2015 - return 0; 1512 + mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64); 2016 1513 } else { 2017 1514 int zeros = 0; 2018 1515 ··· 2024 1527 off, i, size); 2025 1528 return -EACCES; 2026 1529 } 2027 - mark_reg_read(env, &reg_state->stack[spi].spilled_ptr, 2028 - reg_state->stack[spi].spilled_ptr.parent, 2029 - REG_LIVE_READ64); 1530 + mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64); 2030 1531 if (value_regno >= 0) { 2031 1532 if (zeros == size) { 2032 1533 /* any size read into register is zero extended, 2033 1534 * so the whole register == const_zero 2034 1535 */ 2035 1536 __mark_reg_const_zero(&state->regs[value_regno]); 1537 + /* backtracking doesn't support STACK_ZERO yet, 1538 + * so mark it precise here, so that later 1539 + * backtracking can stop here. 1540 + * Backtracking may not need this if this register 1541 + * doesn't participate in pointer adjustment. 1542 + * Forward propagation of precise flag is not 1543 + * necessary either. This mark is only to stop 1544 + * backtracking. Any register that contributed 1545 + * to const 0 was marked precise before spill. 
1546 + */ 1547 + state->regs[value_regno].precise = true; 2036 1548 } else { 2037 1549 /* have read misc data from the stack */ 2038 1550 mark_reg_unknown(env, state->regs, value_regno); 2039 1551 } 2040 1552 state->regs[value_regno].live |= REG_LIVE_WRITTEN; 2041 1553 } 2042 - return 0; 2043 1554 } 1555 + return 0; 2044 1556 } 2045 1557 2046 1558 static int check_stack_access(struct bpf_verifier_env *env, ··· 2341 1835 case PTR_TO_TCP_SOCK: 2342 1836 valid = bpf_tcp_sock_is_valid_access(off, size, t, &info); 2343 1837 break; 1838 + case PTR_TO_XDP_SOCK: 1839 + valid = bpf_xdp_sock_is_valid_access(off, size, t, &info); 1840 + break; 2344 1841 default: 2345 1842 valid = false; 2346 1843 } ··· 2507 1998 break; 2508 1999 case PTR_TO_TCP_SOCK: 2509 2000 pointer_desc = "tcp_sock "; 2001 + break; 2002 + case PTR_TO_XDP_SOCK: 2003 + pointer_desc = "xdp_sock "; 2510 2004 break; 2511 2005 default: 2512 2006 break; ··· 2910 2398 { 2911 2399 struct bpf_reg_state *reg = reg_state(env, regno); 2912 2400 struct bpf_func_state *state = func(env, reg); 2913 - int err, min_off, max_off, i, slot, spi; 2401 + int err, min_off, max_off, i, j, slot, spi; 2914 2402 2915 2403 if (reg->type != PTR_TO_STACK) { 2916 2404 /* Allow zero-byte read from NULL, regardless of pointer type */ ··· 2998 2486 *stype = STACK_MISC; 2999 2487 goto mark; 3000 2488 } 2489 + if (state->stack[spi].slot_type[0] == STACK_SPILL && 2490 + state->stack[spi].spilled_ptr.type == SCALAR_VALUE) { 2491 + __mark_reg_unknown(&state->stack[spi].spilled_ptr); 2492 + for (j = 0; j < BPF_REG_SIZE; j++) 2493 + state->stack[spi].slot_type[j] = STACK_MISC; 2494 + goto mark; 2495 + } 2496 + 3001 2497 err: 3002 2498 if (tnum_is_const(reg->var_off)) { 3003 2499 verbose(env, "invalid indirect read from stack off %d+%d size %d\n", ··· 3357 2837 err = check_helper_mem_access(env, regno - 1, 3358 2838 reg->umax_value, 3359 2839 zero_size_allowed, meta); 2840 + if (!err) 2841 + err = mark_chain_precision(env, regno); 3360 2842 } else 
if (arg_type_is_int_ptr(arg_type)) { 3361 2843 int size = int_ptr_type_to_size(arg_type); 3362 2844 ··· 3419 2897 * appear. 3420 2898 */ 3421 2899 case BPF_MAP_TYPE_CPUMAP: 3422 - case BPF_MAP_TYPE_XSKMAP: 3423 2900 if (func_id != BPF_FUNC_redirect_map) 2901 + goto error; 2902 + break; 2903 + case BPF_MAP_TYPE_XSKMAP: 2904 + if (func_id != BPF_FUNC_redirect_map && 2905 + func_id != BPF_FUNC_map_lookup_elem) 3424 2906 goto error; 3425 2907 break; 3426 2908 case BPF_MAP_TYPE_ARRAY_OF_MAPS: ··· 4317 3791 case PTR_TO_SOCK_COMMON_OR_NULL: 4318 3792 case PTR_TO_TCP_SOCK: 4319 3793 case PTR_TO_TCP_SOCK_OR_NULL: 3794 + case PTR_TO_XDP_SOCK: 4320 3795 verbose(env, "R%d pointer arithmetic on %s prohibited\n", 4321 3796 dst, reg_type_str[ptr_reg->type]); 4322 3797 return -EACCES; ··· 4795 4268 struct bpf_reg_state *regs = state->regs, *dst_reg, *src_reg; 4796 4269 struct bpf_reg_state *ptr_reg = NULL, off_reg = {0}; 4797 4270 u8 opcode = BPF_OP(insn->code); 4271 + int err; 4798 4272 4799 4273 dst_reg = &regs[insn->dst_reg]; 4800 4274 src_reg = NULL; ··· 4822 4294 * This is legal, but we have to reverse our 4823 4295 * src/dest handling in computing the range 4824 4296 */ 4297 + err = mark_chain_precision(env, insn->dst_reg); 4298 + if (err) 4299 + return err; 4825 4300 return adjust_ptr_min_max_vals(env, insn, 4826 4301 src_reg, dst_reg); 4827 4302 } 4828 4303 } else if (ptr_reg) { 4829 4304 /* pointer += scalar */ 4305 + err = mark_chain_precision(env, insn->src_reg); 4306 + if (err) 4307 + return err; 4830 4308 return adjust_ptr_min_max_vals(env, insn, 4831 4309 dst_reg, src_reg); 4832 4310 } ··· 5564 5030 if (reg->map_ptr->inner_map_meta) { 5565 5031 reg->type = CONST_PTR_TO_MAP; 5566 5032 reg->map_ptr = reg->map_ptr->inner_map_meta; 5033 + } else if (reg->map_ptr->map_type == 5034 + BPF_MAP_TYPE_XSKMAP) { 5035 + reg->type = PTR_TO_XDP_SOCK; 5567 5036 } else { 5568 5037 reg->type = PTR_TO_MAP_VALUE; 5569 5038 } ··· 5738 5201 struct bpf_verifier_state *this_branch = 
env->cur_state; 5739 5202 struct bpf_verifier_state *other_branch; 5740 5203 struct bpf_reg_state *regs = this_branch->frame[this_branch->curframe]->regs; 5741 - struct bpf_reg_state *dst_reg, *other_branch_regs; 5204 + struct bpf_reg_state *dst_reg, *other_branch_regs, *src_reg = NULL; 5742 5205 u8 opcode = BPF_OP(insn->code); 5743 5206 bool is_jmp32; 5207 + int pred = -1; 5744 5208 int err; 5745 5209 5746 5210 /* Only conditional jumps are expected to reach here. */ ··· 5766 5228 insn->src_reg); 5767 5229 return -EACCES; 5768 5230 } 5231 + src_reg = &regs[insn->src_reg]; 5769 5232 } else { 5770 5233 if (insn->src_reg != BPF_REG_0) { 5771 5234 verbose(env, "BPF_JMP/JMP32 uses reserved fields\n"); ··· 5782 5243 dst_reg = &regs[insn->dst_reg]; 5783 5244 is_jmp32 = BPF_CLASS(insn->code) == BPF_JMP32; 5784 5245 5785 - if (BPF_SRC(insn->code) == BPF_K) { 5786 - int pred = is_branch_taken(dst_reg, insn->imm, opcode, 5787 - is_jmp32); 5788 - 5789 - if (pred == 1) { 5790 - /* only follow the goto, ignore fall-through */ 5791 - *insn_idx += insn->off; 5792 - return 0; 5793 - } else if (pred == 0) { 5794 - /* only follow fall-through branch, since 5795 - * that's where the program will go 5796 - */ 5797 - return 0; 5798 - } 5246 + if (BPF_SRC(insn->code) == BPF_K) 5247 + pred = is_branch_taken(dst_reg, insn->imm, 5248 + opcode, is_jmp32); 5249 + else if (src_reg->type == SCALAR_VALUE && 5250 + tnum_is_const(src_reg->var_off)) 5251 + pred = is_branch_taken(dst_reg, src_reg->var_off.value, 5252 + opcode, is_jmp32); 5253 + if (pred >= 0) { 5254 + err = mark_chain_precision(env, insn->dst_reg); 5255 + if (BPF_SRC(insn->code) == BPF_X && !err) 5256 + err = mark_chain_precision(env, insn->src_reg); 5257 + if (err) 5258 + return err; 5259 + } 5260 + if (pred == 1) { 5261 + /* only follow the goto, ignore fall-through */ 5262 + *insn_idx += insn->off; 5263 + return 0; 5264 + } else if (pred == 0) { 5265 + /* only follow fall-through branch, since 5266 + * that's where the program 
will go 5267 + */ 5268 + return 0; 5799 5269 } 5800 5270 5801 5271 other_branch = push_stack(env, *insn_idx + insn->off + 1, *insn_idx, ··· 6164 5616 * w - next instruction 6165 5617 * e - edge 6166 5618 */ 6167 - static int push_insn(int t, int w, int e, struct bpf_verifier_env *env) 5619 + static int push_insn(int t, int w, int e, struct bpf_verifier_env *env, 5620 + bool loop_ok) 6168 5621 { 6169 5622 int *insn_stack = env->cfg.insn_stack; 6170 5623 int *insn_state = env->cfg.insn_state; ··· 6195 5646 insn_stack[env->cfg.cur_stack++] = w; 6196 5647 return 1; 6197 5648 } else if ((insn_state[w] & 0xF0) == DISCOVERED) { 5649 + if (loop_ok && env->allow_ptr_leaks) 5650 + return 0; 6198 5651 verbose_linfo(env, t, "%d: ", t); 6199 5652 verbose_linfo(env, w, "%d: ", w); 6200 5653 verbose(env, "back-edge from insn %d to %d\n", t, w); ··· 6248 5697 if (opcode == BPF_EXIT) { 6249 5698 goto mark_explored; 6250 5699 } else if (opcode == BPF_CALL) { 6251 - ret = push_insn(t, t + 1, FALLTHROUGH, env); 5700 + ret = push_insn(t, t + 1, FALLTHROUGH, env, false); 6252 5701 if (ret == 1) 6253 5702 goto peek_stack; 6254 5703 else if (ret < 0) ··· 6257 5706 init_explored_state(env, t + 1); 6258 5707 if (insns[t].src_reg == BPF_PSEUDO_CALL) { 6259 5708 init_explored_state(env, t); 6260 - ret = push_insn(t, t + insns[t].imm + 1, BRANCH, env); 5709 + ret = push_insn(t, t + insns[t].imm + 1, BRANCH, 5710 + env, false); 6261 5711 if (ret == 1) 6262 5712 goto peek_stack; 6263 5713 else if (ret < 0) ··· 6271 5719 } 6272 5720 /* unconditional jump with single edge */ 6273 5721 ret = push_insn(t, t + insns[t].off + 1, 6274 - FALLTHROUGH, env); 5722 + FALLTHROUGH, env, true); 6275 5723 if (ret == 1) 6276 5724 goto peek_stack; 6277 5725 else if (ret < 0) 6278 5726 goto err_free; 5727 + /* unconditional jmp is not a good pruning point, 5728 + * but it's marked, since backtracking needs 5729 + * to record jmp history in is_state_visited(). 
5730 + */ 5731 + init_explored_state(env, t + insns[t].off + 1); 6279 5732 /* tell verifier to check for equivalent states 6280 5733 * after every call and jump 6281 5734 */ ··· 6289 5732 } else { 6290 5733 /* conditional jump with two edges */ 6291 5734 init_explored_state(env, t); 6292 - ret = push_insn(t, t + 1, FALLTHROUGH, env); 5735 + ret = push_insn(t, t + 1, FALLTHROUGH, env, true); 6293 5736 if (ret == 1) 6294 5737 goto peek_stack; 6295 5738 else if (ret < 0) 6296 5739 goto err_free; 6297 5740 6298 - ret = push_insn(t, t + insns[t].off + 1, BRANCH, env); 5741 + ret = push_insn(t, t + insns[t].off + 1, BRANCH, env, true); 6299 5742 if (ret == 1) 6300 5743 goto peek_stack; 6301 5744 else if (ret < 0) ··· 6305 5748 /* all other non-branch instructions with single 6306 5749 * fall-through edge 6307 5750 */ 6308 - ret = push_insn(t, t + 1, FALLTHROUGH, env); 5751 + ret = push_insn(t, t + 1, FALLTHROUGH, env, false); 6309 5752 if (ret == 1) 6310 5753 goto peek_stack; 6311 5754 else if (ret < 0) ··· 6738 6181 6739 6182 sl = *explored_state(env, insn); 6740 6183 while (sl) { 6184 + if (sl->state.branches) 6185 + goto next; 6741 6186 if (sl->state.insn_idx != insn || 6742 6187 sl->state.curframe != cur->curframe) 6743 6188 goto next; ··· 6781 6222 switch (rold->type) { 6782 6223 case SCALAR_VALUE: 6783 6224 if (rcur->type == SCALAR_VALUE) { 6225 + if (!rold->precise && !rcur->precise) 6226 + return true; 6784 6227 /* new val must satisfy old val knowledge */ 6785 6228 return range_within(rold, rcur) && 6786 6229 tnum_in(rold->var_off, rcur->var_off); ··· 6855 6294 case PTR_TO_SOCK_COMMON_OR_NULL: 6856 6295 case PTR_TO_TCP_SOCK: 6857 6296 case PTR_TO_TCP_SOCK_OR_NULL: 6297 + case PTR_TO_XDP_SOCK: 6858 6298 /* Only valid matches are exact, which memcmp() above 6859 6299 * would have accepted 6860 6300 */ ··· 7106 6544 return 0; 7107 6545 } 7108 6546 6547 + static bool states_maybe_looping(struct bpf_verifier_state *old, 6548 + struct bpf_verifier_state *cur) 6549 + { 
6550 + struct bpf_func_state *fold, *fcur; 6551 + int i, fr = cur->curframe; 6552 + 6553 + if (old->curframe != fr) 6554 + return false; 6555 + 6556 + fold = old->frame[fr]; 6557 + fcur = cur->frame[fr]; 6558 + for (i = 0; i < MAX_BPF_REG; i++) 6559 + if (memcmp(&fold->regs[i], &fcur->regs[i], 6560 + offsetof(struct bpf_reg_state, parent))) 6561 + return false; 6562 + return true; 6563 + } 6564 + 6565 + 7109 6566 static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) 7110 6567 { 7111 6568 struct bpf_verifier_state_list *new_sl; 7112 6569 struct bpf_verifier_state_list *sl, **pprev; 7113 6570 struct bpf_verifier_state *cur = env->cur_state, *new; 7114 6571 int i, j, err, states_cnt = 0; 6572 + bool add_new_state = false; 7115 6573 6574 + cur->last_insn_idx = env->prev_insn_idx; 7116 6575 if (!env->insn_aux_data[insn_idx].prune_point) 7117 6576 /* this 'insn_idx' instruction wasn't marked, so we will not 7118 6577 * be doing state search here 7119 6578 */ 7120 6579 return 0; 6580 + 6581 + /* bpf progs typically have pruning point every 4 instructions 6582 + * http://vger.kernel.org/bpfconf2019.html#session-1 6583 + * Do not add new state for future pruning if the verifier hasn't seen 6584 + * at least 2 jumps and at least 8 instructions. 6585 + * This heuristics helps decrease 'total_states' and 'peak_states' metric. 6586 + * In tests that amounts to up to 50% reduction into total verifier 6587 + * memory consumption and 20% verifier time speedup. 
6588 + */ 6589 + if (env->jmps_processed - env->prev_jmps_processed >= 2 && 6590 + env->insn_processed - env->prev_insn_processed >= 8) 6591 + add_new_state = true; 7121 6592 7122 6593 pprev = explored_state(env, insn_idx); 7123 6594 sl = *pprev; ··· 7161 6566 states_cnt++; 7162 6567 if (sl->state.insn_idx != insn_idx) 7163 6568 goto next; 6569 + if (sl->state.branches) { 6570 + if (states_maybe_looping(&sl->state, cur) && 6571 + states_equal(env, &sl->state, cur)) { 6572 + verbose_linfo(env, insn_idx, "; "); 6573 + verbose(env, "infinite loop detected at insn %d\n", insn_idx); 6574 + return -EINVAL; 6575 + } 6576 + /* if the verifier is processing a loop, avoid adding new state 6577 + * too often, since different loop iterations have distinct 6578 + * states and may not help future pruning. 6579 + * This threshold shouldn't be too low to make sure that 6580 + * a loop with large bound will be rejected quickly. 6581 + * The most abusive loop will be: 6582 + * r1 += 1 6583 + * if r1 < 1000000 goto pc-2 6584 + * 1M insn_procssed limit / 100 == 10k peak states. 6585 + * This threshold shouldn't be too high either, since states 6586 + * at the end of the loop are likely to be useful in pruning. 6587 + */ 6588 + if (env->jmps_processed - env->prev_jmps_processed < 20 && 6589 + env->insn_processed - env->prev_insn_processed < 100) 6590 + add_new_state = false; 6591 + goto miss; 6592 + } 7164 6593 if (states_equal(env, &sl->state, cur)) { 7165 6594 sl->hit_cnt++; 7166 6595 /* reached equivalent register/stack state, ··· 7202 6583 return err; 7203 6584 return 1; 7204 6585 } 7205 - sl->miss_cnt++; 6586 + miss: 6587 + /* when new state is not going to be added do not increase miss count. 6588 + * Otherwise several loop iterations will remove the state 6589 + * recorded earlier. The goal of these heuristics is to have 6590 + * states from some iterations of the loop (some in the beginning 6591 + * and some at the end) to help pruning. 
6592 + */ 6593 + if (add_new_state) 6594 + sl->miss_cnt++; 7206 6595 /* heuristic to determine whether this state is beneficial 7207 6596 * to keep checking from state equivalence point of view. 7208 6597 * Higher numbers increase max_states_per_insn and verification time, ··· 7222 6595 */ 7223 6596 *pprev = sl->next; 7224 6597 if (sl->state.frame[0]->regs[0].live & REG_LIVE_DONE) { 6598 + u32 br = sl->state.branches; 6599 + 6600 + WARN_ONCE(br, 6601 + "BUG live_done but branches_to_explore %d\n", 6602 + br); 7225 6603 free_verifier_state(&sl->state, false); 7226 6604 kfree(sl); 7227 6605 env->peak_states--; ··· 7250 6618 env->max_states_per_insn = states_cnt; 7251 6619 7252 6620 if (!env->allow_ptr_leaks && states_cnt > BPF_COMPLEXITY_LIMIT_STATES) 7253 - return 0; 6621 + return push_jmp_history(env, cur); 7254 6622 7255 - /* there were no equivalent states, remember current one. 7256 - * technically the current state is not proven to be safe yet, 6623 + if (!add_new_state) 6624 + return push_jmp_history(env, cur); 6625 + 6626 + /* There were no equivalent states, remember the current one. 6627 + * Technically the current state is not proven to be safe yet, 7257 6628 * but it will either reach outer most bpf_exit (which means it's safe) 7258 - * or it will be rejected. Since there are no loops, we won't be 6629 + * or it will be rejected. When there are no loops the verifier won't be 7259 6630 * seeing this tuple (frame[0].callsite, frame[1].callsite, .. insn_idx) 7260 - * again on the way to bpf_exit 6631 + * again on the way to bpf_exit. 6632 + * When looping the sl->state.branches will be > 0 and this state 6633 + * will not be considered for equivalence until branches == 0. 
7261 6634 */ 7262 6635 new_sl = kzalloc(sizeof(struct bpf_verifier_state_list), GFP_KERNEL); 7263 6636 if (!new_sl) 7264 6637 return -ENOMEM; 7265 6638 env->total_states++; 7266 6639 env->peak_states++; 6640 + env->prev_jmps_processed = env->jmps_processed; 6641 + env->prev_insn_processed = env->insn_processed; 7267 6642 7268 6643 /* add new state to the head of linked list */ 7269 6644 new = &new_sl->state; ··· 7281 6642 return err; 7282 6643 } 7283 6644 new->insn_idx = insn_idx; 6645 + WARN_ONCE(new->branches != 1, 6646 + "BUG is_state_visited:branches_to_explore=%d insn %d\n", new->branches, insn_idx); 6647 + 6648 + cur->parent = new; 6649 + cur->first_insn_idx = insn_idx; 6650 + clear_jmp_history(cur); 7284 6651 new_sl->next = *explored_state(env, insn_idx); 7285 6652 *explored_state(env, insn_idx) = new_sl; 7286 6653 /* connect new state to parentage chain. Current frame needs all ··· 7296 6651 * the state of the call instruction (with WRITTEN set), and r0 comes 7297 6652 * from callee with its full parentage chain, anyway. 7298 6653 */ 7299 - for (j = 0; j <= cur->curframe; j++) 7300 - for (i = j < cur->curframe ? BPF_REG_6 : 0; i < BPF_REG_FP; i++) 7301 - cur->frame[j]->regs[i].parent = &new->frame[j]->regs[i]; 7302 6654 /* clear write marks in current state: the writes we did are not writes 7303 6655 * our child did, so they don't screen off its reads from us. 7304 6656 * (There are no read marks in current state, because reads always mark 7305 6657 * their parent and current state never has children yet. Only 7306 6658 * explored_states can get read marks.) 7307 6659 */ 7308 - for (i = 0; i < BPF_REG_FP; i++) 7309 - cur->frame[cur->curframe]->regs[i].live = REG_LIVE_NONE; 6660 + for (j = 0; j <= cur->curframe; j++) { 6661 + for (i = j < cur->curframe ? 
BPF_REG_6 : 0; i < BPF_REG_FP; i++) 6662 + cur->frame[j]->regs[i].parent = &new->frame[j]->regs[i]; 6663 + for (i = 0; i < BPF_REG_FP; i++) 6664 + cur->frame[j]->regs[i].live = REG_LIVE_NONE; 6665 + } 7310 6666 7311 6667 /* all stack frames are accessible from callee, clear them all */ 7312 6668 for (j = 0; j <= cur->curframe; j++) { ··· 7334 6688 case PTR_TO_SOCK_COMMON_OR_NULL: 7335 6689 case PTR_TO_TCP_SOCK: 7336 6690 case PTR_TO_TCP_SOCK_OR_NULL: 6691 + case PTR_TO_XDP_SOCK: 7337 6692 return false; 7338 6693 default: 7339 6694 return true; ··· 7366 6719 struct bpf_reg_state *regs; 7367 6720 int insn_cnt = env->prog->len; 7368 6721 bool do_print_state = false; 6722 + int prev_insn_idx = -1; 7369 6723 7370 6724 env->prev_linfo = NULL; 7371 6725 ··· 7375 6727 return -ENOMEM; 7376 6728 state->curframe = 0; 7377 6729 state->speculative = false; 6730 + state->branches = 1; 7378 6731 state->frame[0] = kzalloc(sizeof(struct bpf_func_state), GFP_KERNEL); 7379 6732 if (!state->frame[0]) { 7380 6733 kfree(state); ··· 7392 6743 u8 class; 7393 6744 int err; 7394 6745 6746 + env->prev_insn_idx = prev_insn_idx; 7395 6747 if (env->insn_idx >= insn_cnt) { 7396 6748 verbose(env, "invalid insn idx %d insn_cnt %d\n", 7397 6749 env->insn_idx, insn_cnt); ··· 7465 6815 7466 6816 regs = cur_regs(env); 7467 6817 env->insn_aux_data[env->insn_idx].seen = true; 6818 + prev_insn_idx = env->insn_idx; 7468 6819 7469 6820 if (class == BPF_ALU || class == BPF_ALU64) { 7470 6821 err = check_alu_op(env, insn); ··· 7584 6933 } else if (class == BPF_JMP || class == BPF_JMP32) { 7585 6934 u8 opcode = BPF_OP(insn->code); 7586 6935 6936 + env->jmps_processed++; 7587 6937 if (opcode == BPF_CALL) { 7588 6938 if (BPF_SRC(insn->code) != BPF_K || 7589 6939 insn->off != 0 || ··· 7639 6987 7640 6988 if (state->curframe) { 7641 6989 /* exit from nested function */ 7642 - env->prev_insn_idx = env->insn_idx; 7643 6990 err = prepare_func_exit(env, &env->insn_idx); 7644 6991 if (err) 7645 6992 return err; ··· 
7669 7018 if (err) 7670 7019 return err; 7671 7020 process_bpf_exit: 7672 - err = pop_stack(env, &env->prev_insn_idx, 7021 + update_branch_counts(env, env->cur_state); 7022 + err = pop_stack(env, &prev_insn_idx, 7673 7023 &env->insn_idx); 7674 7024 if (err < 0) { 7675 7025 if (err != -ENOENT) ··· 8472 7820 break; 8473 7821 case PTR_TO_TCP_SOCK: 8474 7822 convert_ctx_access = bpf_tcp_sock_convert_ctx_access; 7823 + break; 7824 + case PTR_TO_XDP_SOCK: 7825 + convert_ctx_access = bpf_xdp_sock_convert_ctx_access; 8475 7826 break; 8476 7827 default: 8477 7828 continue;
+8 -1
kernel/bpf/xskmap.c
··· 17 17 18 18 static struct bpf_map *xsk_map_alloc(union bpf_attr *attr) 19 19 { 20 - int cpu, err = -EINVAL; 21 20 struct xsk_map *m; 21 + int cpu, err; 22 22 u64 cost; 23 23 24 24 if (!capable(CAP_NET_ADMIN)) ··· 152 152 153 153 static void *xsk_map_lookup_elem(struct bpf_map *map, void *key) 154 154 { 155 + WARN_ON_ONCE(!rcu_read_lock_held()); 156 + return __xsk_map_lookup_elem(map, *(u32 *)key); 157 + } 158 + 159 + static void *xsk_map_lookup_elem_sys_only(struct bpf_map *map, void *key) 160 + { 155 161 return ERR_PTR(-EOPNOTSUPP); 156 162 } 157 163 ··· 224 218 .map_free = xsk_map_free, 225 219 .map_get_next_key = xsk_map_get_next_key, 226 220 .map_lookup_elem = xsk_map_lookup_elem, 221 + .map_lookup_elem_sys_only = xsk_map_lookup_elem_sys_only, 227 222 .map_update_elem = xsk_map_update_elem, 228 223 .map_delete_elem = xsk_map_delete_elem, 229 224 .map_check_btf = map_check_no_btf,
+86
net/core/filter.c
··· 5695 5695 return INET_ECN_set_ce(skb); 5696 5696 } 5697 5697 5698 + bool bpf_xdp_sock_is_valid_access(int off, int size, enum bpf_access_type type, 5699 + struct bpf_insn_access_aux *info) 5700 + { 5701 + if (off < 0 || off >= offsetofend(struct bpf_xdp_sock, queue_id)) 5702 + return false; 5703 + 5704 + if (off % size != 0) 5705 + return false; 5706 + 5707 + switch (off) { 5708 + default: 5709 + return size == sizeof(__u32); 5710 + } 5711 + } 5712 + 5713 + u32 bpf_xdp_sock_convert_ctx_access(enum bpf_access_type type, 5714 + const struct bpf_insn *si, 5715 + struct bpf_insn *insn_buf, 5716 + struct bpf_prog *prog, u32 *target_size) 5717 + { 5718 + struct bpf_insn *insn = insn_buf; 5719 + 5720 + #define BPF_XDP_SOCK_GET(FIELD) \ 5721 + do { \ 5722 + BUILD_BUG_ON(FIELD_SIZEOF(struct xdp_sock, FIELD) > \ 5723 + FIELD_SIZEOF(struct bpf_xdp_sock, FIELD)); \ 5724 + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_sock, FIELD),\ 5725 + si->dst_reg, si->src_reg, \ 5726 + offsetof(struct xdp_sock, FIELD)); \ 5727 + } while (0) 5728 + 5729 + switch (si->off) { 5730 + case offsetof(struct bpf_xdp_sock, queue_id): 5731 + BPF_XDP_SOCK_GET(queue_id); 5732 + break; 5733 + } 5734 + 5735 + return insn - insn_buf; 5736 + } 5737 + 5698 5738 static const struct bpf_func_proto bpf_skb_ecn_set_ce_proto = { 5699 5739 .func = bpf_skb_ecn_set_ce, 5700 5740 .gpl_only = false, ··· 5937 5897 case BPF_FUNC_skc_lookup_tcp: 5938 5898 return &bpf_sock_addr_skc_lookup_tcp_proto; 5939 5899 #endif /* CONFIG_INET */ 5900 + case BPF_FUNC_sk_storage_get: 5901 + return &bpf_sk_storage_get_proto; 5902 + case BPF_FUNC_sk_storage_delete: 5903 + return &bpf_sk_storage_delete_proto; 5940 5904 default: 5941 5905 return bpf_base_func_proto(func_id); 5942 5906 } ··· 5978 5934 return &bpf_sk_storage_get_proto; 5979 5935 case BPF_FUNC_sk_storage_delete: 5980 5936 return &bpf_sk_storage_delete_proto; 5937 + #ifdef CONFIG_SOCK_CGROUP_DATA 5938 + case BPF_FUNC_skb_cgroup_id: 5939 + return 
&bpf_skb_cgroup_id_proto; 5940 + #endif 5981 5941 #ifdef CONFIG_INET 5982 5942 case BPF_FUNC_tcp_sock: 5983 5943 return &bpf_tcp_sock_proto; ··· 6162 6114 return &bpf_get_local_storage_proto; 6163 6115 case BPF_FUNC_perf_event_output: 6164 6116 return &bpf_sockopt_event_output_proto; 6117 + case BPF_FUNC_sk_storage_get: 6118 + return &bpf_sk_storage_get_proto; 6119 + case BPF_FUNC_sk_storage_delete: 6120 + return &bpf_sk_storage_delete_proto; 6121 + #ifdef CONFIG_INET 6122 + case BPF_FUNC_tcp_sock: 6123 + return &bpf_tcp_sock_proto; 6124 + #endif /* CONFIG_INET */ 6165 6125 default: 6166 6126 return bpf_base_func_proto(func_id); 6167 6127 } ··· 6857 6801 if (size != size_default) 6858 6802 return false; 6859 6803 break; 6804 + case offsetof(struct bpf_sock_addr, sk): 6805 + if (type != BPF_READ) 6806 + return false; 6807 + if (size != sizeof(__u64)) 6808 + return false; 6809 + info->reg_type = PTR_TO_SOCKET; 6810 + break; 6860 6811 default: 6861 6812 if (type == BPF_READ) { 6862 6813 if (size != size_default) ··· 6906 6843 bytes_acked): 6907 6844 if (size != sizeof(__u64)) 6908 6845 return false; 6846 + break; 6847 + case offsetof(struct bpf_sock_ops, sk): 6848 + if (size != sizeof(__u64)) 6849 + return false; 6850 + info->reg_type = PTR_TO_SOCKET_OR_NULL; 6909 6851 break; 6910 6852 default: 6911 6853 if (size != size_default) ··· 7819 7751 struct bpf_sock_addr_kern, struct in6_addr, t_ctx, 7820 7752 s6_addr32[0], BPF_SIZE(si->code), off, tmp_reg); 7821 7753 break; 7754 + case offsetof(struct bpf_sock_addr, sk): 7755 + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_addr_kern, sk), 7756 + si->dst_reg, si->src_reg, 7757 + offsetof(struct bpf_sock_addr_kern, sk)); 7758 + break; 7822 7759 } 7823 7760 7824 7761 return insn - insn_buf; ··· 8082 8009 case offsetof(struct bpf_sock_ops, sk_txhash): 8083 8010 SOCK_OPS_GET_OR_SET_FIELD(sk_txhash, sk_txhash, 8084 8011 struct sock, type); 8012 + break; 8013 + case offsetof(struct bpf_sock_ops, sk): 8014 + *insn++ = 
BPF_LDX_MEM(BPF_FIELD_SIZEOF( 8015 + struct bpf_sock_ops_kern, 8016 + is_fullsock), 8017 + si->dst_reg, si->src_reg, 8018 + offsetof(struct bpf_sock_ops_kern, 8019 + is_fullsock)); 8020 + *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1); 8021 + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( 8022 + struct bpf_sock_ops_kern, sk), 8023 + si->dst_reg, si->src_reg, 8024 + offsetof(struct bpf_sock_ops_kern, sk)); 8085 8025 break; 8086 8026 } 8087 8027 return insn - insn_buf;
+4
net/core/sock.c
··· 1039 1039 } 1040 1040 break; 1041 1041 1042 + case SO_DETACH_REUSEPORT_BPF: 1043 + ret = reuseport_detach_prog(sk); 1044 + break; 1045 + 1042 1046 case SO_DETACH_FILTER: 1043 1047 ret = sk_detach_filter(sk); 1044 1048 break;
+24
net/core/sock_reuseport.c
··· 332 332 return 0; 333 333 } 334 334 EXPORT_SYMBOL(reuseport_attach_prog); 335 + 336 + int reuseport_detach_prog(struct sock *sk) 337 + { 338 + struct sock_reuseport *reuse; 339 + struct bpf_prog *old_prog; 340 + 341 + if (!rcu_access_pointer(sk->sk_reuseport_cb)) 342 + return sk->sk_reuseport ? -ENOENT : -EINVAL; 343 + 344 + old_prog = NULL; 345 + spin_lock_bh(&reuseport_lock); 346 + reuse = rcu_dereference_protected(sk->sk_reuseport_cb, 347 + lockdep_is_held(&reuseport_lock)); 348 + rcu_swap_protected(reuse->prog, old_prog, 349 + lockdep_is_held(&reuseport_lock)); 350 + spin_unlock_bh(&reuseport_lock); 351 + 352 + if (!old_prog) 353 + return -ENOENT; 354 + 355 + sk_reuseport_prog_free(old_prog); 356 + return 0; 357 + } 358 + EXPORT_SYMBOL(reuseport_detach_prog);
+13 -10
samples/bpf/Makefile
··· 170 170 always += hbm_out_kern.o 171 171 172 172 KBUILD_HOSTCFLAGS += -I$(objtree)/usr/include 173 - KBUILD_HOSTCFLAGS += -I$(srctree)/tools/lib/ 173 + KBUILD_HOSTCFLAGS += -I$(srctree)/tools/lib/bpf/ 174 174 KBUILD_HOSTCFLAGS += -I$(srctree)/tools/testing/selftests/bpf/ 175 175 KBUILD_HOSTCFLAGS += -I$(srctree)/tools/lib/ -I$(srctree)/tools/include 176 176 KBUILD_HOSTCFLAGS += -I$(srctree)/tools/perf 177 177 178 178 HOSTCFLAGS_bpf_load.o += -I$(objtree)/usr/include -Wno-unused-variable 179 - HOSTCFLAGS_trace_helpers.o += -I$(srctree)/tools/lib/bpf/ 180 - 181 - HOSTCFLAGS_trace_output_user.o += -I$(srctree)/tools/lib/bpf/ 182 - HOSTCFLAGS_offwaketime_user.o += -I$(srctree)/tools/lib/bpf/ 183 - HOSTCFLAGS_spintest_user.o += -I$(srctree)/tools/lib/bpf/ 184 - HOSTCFLAGS_trace_event_user.o += -I$(srctree)/tools/lib/bpf/ 185 - HOSTCFLAGS_sampleip_user.o += -I$(srctree)/tools/lib/bpf/ 186 - HOSTCFLAGS_task_fd_query_user.o += -I$(srctree)/tools/lib/bpf/ 187 - HOSTCFLAGS_xdp_sample_pkts_user.o += -I$(srctree)/tools/lib/bpf/ 188 179 189 180 KBUILD_HOSTLDLIBS += $(LIBBPF) -lelf 190 181 HOSTLDLIBS_tracex4 += -lrt ··· 197 206 CLANG_ARCH_ARGS = -target $(ARCH) 198 207 endif 199 208 209 + # Don't evaluate probes and warnings if we need to run make recursively 210 + ifneq ($(src),) 211 + HDR_PROBE := $(shell echo "\#include <linux/types.h>\n struct list_head { int a; }; int main() { return 0; }" | \ 212 + $(HOSTCC) $(KBUILD_HOSTCFLAGS) -x c - -o /dev/null 2>/dev/null && \ 213 + echo okay) 214 + 215 + ifeq ($(HDR_PROBE),) 216 + $(warning WARNING: Detected possible issues with include path.) 217 + $(warning WARNING: Please install kernel headers locally (make headers_install).) 
218 + endif 219 + 200 220 BTF_LLC_PROBE := $(shell $(LLC) -march=bpf -mattr=help 2>&1 | grep dwarfris) 201 221 BTF_PAHOLE_PROBE := $(shell $(BTF_PAHOLE) --help 2>&1 | grep BTF) 202 222 BTF_OBJCOPY_PROBE := $(shell $(LLVM_OBJCOPY) --help 2>&1 | grep -i 'usage.*llvm') ··· 223 221 EXTRA_CFLAGS += -g 224 222 LLC_FLAGS += -mattr=dwarfris 225 223 DWARF2BTF = y 224 + endif 226 225 endif 227 226 endif 228 227
+1 -1
samples/bpf/fds_example.c
··· 14 14 15 15 #include <bpf/bpf.h> 16 16 17 - #include "bpf/libbpf.h" 17 + #include "libbpf.h" 18 18 #include "bpf_insn.h" 19 19 #include "sock_example.h" 20 20
+3 -3
samples/bpf/hbm.c
··· 50 50 #include "cgroup_helpers.h" 51 51 #include "hbm.h" 52 52 #include "bpf_util.h" 53 - #include "bpf/bpf.h" 54 - #include "bpf/libbpf.h" 53 + #include "bpf.h" 54 + #include "libbpf.h" 55 55 56 56 bool outFlag = true; 57 57 int minRate = 1000; /* cgroup rate limit in Mbps */ ··· 411 411 " -l also limit flows using loopback\n" 412 412 " -n <#> to create cgroup \"/hbm#\" and attach prog\n" 413 413 " Default is /hbm1\n" 414 - " --no_cn disable CN notifcations\n" 414 + " --no_cn disable CN notifications\n" 415 415 " -r <rate> Rate in Mbps\n" 416 416 " -s Update HBM stats\n" 417 417 " -t <time> Exit after specified seconds (default is 0)\n"
+1 -1
samples/bpf/ibumad_user.c
··· 25 25 26 26 #include "bpf_load.h" 27 27 #include "bpf_util.h" 28 - #include "bpf/libbpf.h" 28 + #include "libbpf.h" 29 29 30 30 static void dump_counts(int fd) 31 31 {
+1 -1
samples/bpf/sockex1_user.c
··· 3 3 #include <assert.h> 4 4 #include <linux/bpf.h> 5 5 #include <bpf/bpf.h> 6 - #include "bpf/libbpf.h" 6 + #include "libbpf.h" 7 7 #include "sock_example.h" 8 8 #include <unistd.h> 9 9 #include <arpa/inet.h>
+1 -1
samples/bpf/sockex2_user.c
··· 3 3 #include <assert.h> 4 4 #include <linux/bpf.h> 5 5 #include <bpf/bpf.h> 6 - #include "bpf/libbpf.h" 6 + #include "libbpf.h" 7 7 #include "sock_example.h" 8 8 #include <unistd.h> 9 9 #include <arpa/inet.h>
+2 -2
samples/bpf/xdp1_user.c
··· 15 15 #include <net/if.h> 16 16 17 17 #include "bpf_util.h" 18 - #include "bpf/bpf.h" 19 - #include "bpf/libbpf.h" 18 + #include "bpf.h" 19 + #include "libbpf.h" 20 20 21 21 static int ifindex; 22 22 static __u32 xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST;
+2 -2
samples/bpf/xdp_adjust_tail_user.c
··· 18 18 #include <netinet/ether.h> 19 19 #include <unistd.h> 20 20 #include <time.h> 21 - #include "bpf/bpf.h" 22 - #include "bpf/libbpf.h" 21 + #include "bpf.h" 22 + #include "libbpf.h" 23 23 24 24 #define STATS_INTERVAL_S 2U 25 25
+1 -1
samples/bpf/xdp_fwd_user.c
··· 24 24 #include <fcntl.h> 25 25 #include <libgen.h> 26 26 27 - #include "bpf/libbpf.h" 27 + #include "libbpf.h" 28 28 #include <bpf/bpf.h> 29 29 30 30
+1 -1
samples/bpf/xdp_redirect_cpu_user.c
··· 25 25 #define MAX_PROG 6 26 26 27 27 #include <bpf/bpf.h> 28 - #include "bpf/libbpf.h" 28 + #include "libbpf.h" 29 29 30 30 #include "bpf_util.h" 31 31
+1 -1
samples/bpf/xdp_redirect_map_user.c
··· 16 16 17 17 #include "bpf_util.h" 18 18 #include <bpf/bpf.h> 19 - #include "bpf/libbpf.h" 19 + #include "libbpf.h" 20 20 21 21 static int ifindex_in; 22 22 static int ifindex_out;
+1 -1
samples/bpf/xdp_redirect_user.c
··· 16 16 17 17 #include "bpf_util.h" 18 18 #include <bpf/bpf.h> 19 - #include "bpf/libbpf.h" 19 + #include "libbpf.h" 20 20 21 21 static int ifindex_in; 22 22 static int ifindex_out;
+1 -1
samples/bpf/xdp_router_ipv4_user.c
··· 21 21 #include <sys/ioctl.h> 22 22 #include <sys/syscall.h> 23 23 #include "bpf_util.h" 24 - #include "bpf/libbpf.h" 24 + #include "libbpf.h" 25 25 #include <sys/resource.h> 26 26 #include <libgen.h> 27 27
+2 -2
samples/bpf/xdp_rxq_info_user.c
··· 22 22 #include <arpa/inet.h> 23 23 #include <linux/if_link.h> 24 24 25 - #include "bpf/bpf.h" 26 - #include "bpf/libbpf.h" 25 + #include "bpf.h" 26 + #include "libbpf.h" 27 27 #include "bpf_util.h" 28 28 29 29 static int ifindex = -1;
+1 -1
samples/bpf/xdp_tx_iptunnel_user.c
··· 14 14 #include <netinet/ether.h> 15 15 #include <unistd.h> 16 16 #include <time.h> 17 - #include "bpf/libbpf.h" 17 + #include "libbpf.h" 18 18 #include <bpf/bpf.h> 19 19 #include "bpf_util.h" 20 20 #include "xdp_tx_iptunnel_common.h"
+2 -2
samples/bpf/xdpsock_user.c
··· 27 27 #include <time.h> 28 28 #include <unistd.h> 29 29 30 - #include "bpf/libbpf.h" 31 - #include "bpf/xsk.h" 30 + #include "libbpf.h" 31 + #include "xsk.h" 32 32 #include <bpf/bpf.h> 33 33 34 34 #ifndef SOL_XDP
+5 -48
tools/bpf/bpftool/common.c
··· 21 21 #include <sys/vfs.h> 22 22 23 23 #include <bpf.h> 24 + #include <libbpf.h> /* libbpf_num_possible_cpus */ 24 25 25 26 #include "main.h" 26 27 ··· 440 439 441 440 unsigned int get_possible_cpus(void) 442 441 { 443 - static unsigned int result; 444 - char buf[128]; 445 - long int n; 446 - char *ptr; 447 - int fd; 442 + int cpus = libbpf_num_possible_cpus(); 448 443 449 - if (result) 450 - return result; 451 - 452 - fd = open("/sys/devices/system/cpu/possible", O_RDONLY); 453 - if (fd < 0) { 454 - p_err("can't open sysfs possible cpus"); 444 + if (cpus < 0) { 445 + p_err("Can't get # of possible cpus: %s", strerror(-cpus)); 455 446 exit(-1); 456 447 } 457 - 458 - n = read(fd, buf, sizeof(buf)); 459 - if (n < 2) { 460 - p_err("can't read sysfs possible cpus"); 461 - exit(-1); 462 - } 463 - close(fd); 464 - 465 - if (n == sizeof(buf)) { 466 - p_err("read sysfs possible cpus overflow"); 467 - exit(-1); 468 - } 469 - 470 - ptr = buf; 471 - n = 0; 472 - while (*ptr && *ptr != '\n') { 473 - unsigned int a, b; 474 - 475 - if (sscanf(ptr, "%u-%u", &a, &b) == 2) { 476 - n += b - a + 1; 477 - 478 - ptr = strchr(ptr, '-') + 1; 479 - } else if (sscanf(ptr, "%u", &a) == 1) { 480 - n++; 481 - } else { 482 - assert(0); 483 - } 484 - 485 - while (isdigit(*ptr)) 486 - ptr++; 487 - if (*ptr == ',') 488 - ptr++; 489 - } 490 - 491 - result = n; 492 - 493 - return result; 448 + return cpus; 494 449 } 495 450 496 451 static char *
+147
tools/include/uapi/asm-generic/socket.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ 2 + #ifndef __ASM_GENERIC_SOCKET_H 3 + #define __ASM_GENERIC_SOCKET_H 4 + 5 + #include <linux/posix_types.h> 6 + #include <asm/sockios.h> 7 + 8 + /* For setsockopt(2) */ 9 + #define SOL_SOCKET 1 10 + 11 + #define SO_DEBUG 1 12 + #define SO_REUSEADDR 2 13 + #define SO_TYPE 3 14 + #define SO_ERROR 4 15 + #define SO_DONTROUTE 5 16 + #define SO_BROADCAST 6 17 + #define SO_SNDBUF 7 18 + #define SO_RCVBUF 8 19 + #define SO_SNDBUFFORCE 32 20 + #define SO_RCVBUFFORCE 33 21 + #define SO_KEEPALIVE 9 22 + #define SO_OOBINLINE 10 23 + #define SO_NO_CHECK 11 24 + #define SO_PRIORITY 12 25 + #define SO_LINGER 13 26 + #define SO_BSDCOMPAT 14 27 + #define SO_REUSEPORT 15 28 + #ifndef SO_PASSCRED /* powerpc only differs in these */ 29 + #define SO_PASSCRED 16 30 + #define SO_PEERCRED 17 31 + #define SO_RCVLOWAT 18 32 + #define SO_SNDLOWAT 19 33 + #define SO_RCVTIMEO_OLD 20 34 + #define SO_SNDTIMEO_OLD 21 35 + #endif 36 + 37 + /* Security levels - as per NRL IPv6 - don't actually do anything */ 38 + #define SO_SECURITY_AUTHENTICATION 22 39 + #define SO_SECURITY_ENCRYPTION_TRANSPORT 23 40 + #define SO_SECURITY_ENCRYPTION_NETWORK 24 41 + 42 + #define SO_BINDTODEVICE 25 43 + 44 + /* Socket filtering */ 45 + #define SO_ATTACH_FILTER 26 46 + #define SO_DETACH_FILTER 27 47 + #define SO_GET_FILTER SO_ATTACH_FILTER 48 + 49 + #define SO_PEERNAME 28 50 + 51 + #define SO_ACCEPTCONN 30 52 + 53 + #define SO_PEERSEC 31 54 + #define SO_PASSSEC 34 55 + 56 + #define SO_MARK 36 57 + 58 + #define SO_PROTOCOL 38 59 + #define SO_DOMAIN 39 60 + 61 + #define SO_RXQ_OVFL 40 62 + 63 + #define SO_WIFI_STATUS 41 64 + #define SCM_WIFI_STATUS SO_WIFI_STATUS 65 + #define SO_PEEK_OFF 42 66 + 67 + /* Instruct lower device to use last 4-bytes of skb data as FCS */ 68 + #define SO_NOFCS 43 69 + 70 + #define SO_LOCK_FILTER 44 71 + 72 + #define SO_SELECT_ERR_QUEUE 45 73 + 74 + #define SO_BUSY_POLL 46 75 + 76 + #define SO_MAX_PACING_RATE 47 77 + 78 
+ #define SO_BPF_EXTENSIONS 48 79 + 80 + #define SO_INCOMING_CPU 49 81 + 82 + #define SO_ATTACH_BPF 50 83 + #define SO_DETACH_BPF SO_DETACH_FILTER 84 + 85 + #define SO_ATTACH_REUSEPORT_CBPF 51 86 + #define SO_ATTACH_REUSEPORT_EBPF 52 87 + 88 + #define SO_CNX_ADVICE 53 89 + 90 + #define SCM_TIMESTAMPING_OPT_STATS 54 91 + 92 + #define SO_MEMINFO 55 93 + 94 + #define SO_INCOMING_NAPI_ID 56 95 + 96 + #define SO_COOKIE 57 97 + 98 + #define SCM_TIMESTAMPING_PKTINFO 58 99 + 100 + #define SO_PEERGROUPS 59 101 + 102 + #define SO_ZEROCOPY 60 103 + 104 + #define SO_TXTIME 61 105 + #define SCM_TXTIME SO_TXTIME 106 + 107 + #define SO_BINDTOIFINDEX 62 108 + 109 + #define SO_TIMESTAMP_OLD 29 110 + #define SO_TIMESTAMPNS_OLD 35 111 + #define SO_TIMESTAMPING_OLD 37 112 + 113 + #define SO_TIMESTAMP_NEW 63 114 + #define SO_TIMESTAMPNS_NEW 64 115 + #define SO_TIMESTAMPING_NEW 65 116 + 117 + #define SO_RCVTIMEO_NEW 66 118 + #define SO_SNDTIMEO_NEW 67 119 + 120 + #define SO_DETACH_REUSEPORT_BPF 68 121 + 122 + #if !defined(__KERNEL__) 123 + 124 + #if __BITS_PER_LONG == 64 || (defined(__x86_64__) && defined(__ILP32__)) 125 + /* on 64-bit and x32, avoid the ?: operator */ 126 + #define SO_TIMESTAMP SO_TIMESTAMP_OLD 127 + #define SO_TIMESTAMPNS SO_TIMESTAMPNS_OLD 128 + #define SO_TIMESTAMPING SO_TIMESTAMPING_OLD 129 + 130 + #define SO_RCVTIMEO SO_RCVTIMEO_OLD 131 + #define SO_SNDTIMEO SO_SNDTIMEO_OLD 132 + #else 133 + #define SO_TIMESTAMP (sizeof(time_t) == sizeof(__kernel_long_t) ? SO_TIMESTAMP_OLD : SO_TIMESTAMP_NEW) 134 + #define SO_TIMESTAMPNS (sizeof(time_t) == sizeof(__kernel_long_t) ? SO_TIMESTAMPNS_OLD : SO_TIMESTAMPNS_NEW) 135 + #define SO_TIMESTAMPING (sizeof(time_t) == sizeof(__kernel_long_t) ? SO_TIMESTAMPING_OLD : SO_TIMESTAMPING_NEW) 136 + 137 + #define SO_RCVTIMEO (sizeof(time_t) == sizeof(__kernel_long_t) ? SO_RCVTIMEO_OLD : SO_RCVTIMEO_NEW) 138 + #define SO_SNDTIMEO (sizeof(time_t) == sizeof(__kernel_long_t) ? 
SO_SNDTIMEO_OLD : SO_SNDTIMEO_NEW) 139 + #endif 140 + 141 + #define SCM_TIMESTAMP SO_TIMESTAMP 142 + #define SCM_TIMESTAMPNS SO_TIMESTAMPNS 143 + #define SCM_TIMESTAMPING SO_TIMESTAMPING 144 + 145 + #endif 146 + 147 + #endif /* __ASM_GENERIC_SOCKET_H */
+6
tools/include/uapi/linux/bpf.h
··· 3085 3085 }; 3086 3086 }; 3087 3087 3088 + struct bpf_xdp_sock { 3089 + __u32 queue_id; 3090 + }; 3091 + 3088 3092 #define XDP_PACKET_HEADROOM 256 3089 3093 3090 3094 /* User return codes for XDP prog type. ··· 3249 3245 __u32 msg_src_ip6[4]; /* Allows 1,2,4-byte read an 4-byte write. 3250 3246 * Stored in network byte order. 3251 3247 */ 3248 + __bpf_md_ptr(struct bpf_sock *, sk); 3252 3249 }; 3253 3250 3254 3251 /* User bpf_sock_ops struct to access socket values and specify request ops ··· 3301 3296 __u32 sk_txhash; 3302 3297 __u64 bytes_received; 3303 3298 __u64 bytes_acked; 3299 + __bpf_md_ptr(struct bpf_sock *, sk); 3304 3300 }; 3305 3301 3306 3302 /* Definitions for bpf_sock_ops_cb_flags */
+2 -5
tools/lib/bpf/bpf.c
··· 26 26 #include <memory.h> 27 27 #include <unistd.h> 28 28 #include <asm/unistd.h> 29 + #include <errno.h> 29 30 #include <linux/bpf.h> 30 31 #include "bpf.h" 31 32 #include "libbpf.h" 32 - #include <errno.h> 33 + #include "libbpf_internal.h" 33 34 34 35 /* 35 36 * When building perf, unistd.h is overridden. __NR_bpf is ··· 52 51 # else 53 52 # error __NR_bpf not defined. libbpf does not support your arch. 54 53 # endif 55 - #endif 56 - 57 - #ifndef min 58 - #define min(x, y) ((x) < (y) ? (x) : (y)) 59 54 #endif 60 55 61 56 static inline __u64 ptr_to_u64(const void *ptr)
+1 -4
tools/lib/bpf/bpf_prog_linfo.c
··· 6 6 #include <linux/err.h> 7 7 #include <linux/bpf.h> 8 8 #include "libbpf.h" 9 - 10 - #ifndef min 11 - #define min(x, y) ((x) < (y) ? (x) : (y)) 12 - #endif 9 + #include "libbpf_internal.h" 13 10 14 11 struct bpf_prog_linfo { 15 12 void *raw_linfo;
-3
tools/lib/bpf/btf.c
··· 16 16 #include "libbpf_internal.h" 17 17 #include "hashmap.h" 18 18 19 - #define max(a, b) ((a) > (b) ? (a) : (b)) 20 - #define min(a, b) ((a) < (b) ? (a) : (b)) 21 - 22 19 #define BTF_MAX_NR_TYPES 0x7fffffff 23 20 #define BTF_MAX_STR_OFFSET 0x7fffffff 24 21
+1
tools/lib/bpf/btf.h
··· 17 17 18 18 #define BTF_ELF_SEC ".BTF" 19 19 #define BTF_EXT_ELF_SEC ".BTF.ext" 20 + #define MAPS_ELF_SEC ".maps" 20 21 21 22 struct btf; 22 23 struct btf_ext;
-3
tools/lib/bpf/btf_dump.c
··· 18 18 #include "libbpf.h" 19 19 #include "libbpf_internal.h" 20 20 21 - #define min(x, y) ((x) < (y) ? (x) : (y)) 22 - #define max(x, y) ((x) < (y) ? (y) : (x)) 23 - 24 21 static const char PREFIXES[] = "\t\t\t\t\t\t\t\t\t\t\t\t\t"; 25 22 static const size_t PREFIX_CNT = sizeof(PREFIXES) - 1; 26 23
+689 -238
tools/lib/bpf/libbpf.c
··· 207 207 struct bpf_map { 208 208 int fd; 209 209 char *name; 210 - size_t offset; 210 + int sec_idx; 211 + size_t sec_offset; 211 212 int map_ifindex; 212 213 int inner_map_fd; 213 214 struct bpf_map_def def; ··· 235 234 size_t nr_programs; 236 235 struct bpf_map *maps; 237 236 size_t nr_maps; 237 + size_t maps_cap; 238 238 struct bpf_secdata sections; 239 239 240 240 bool loaded; ··· 262 260 } *reloc; 263 261 int nr_reloc; 264 262 int maps_shndx; 263 + int btf_maps_shndx; 265 264 int text_shndx; 266 265 int data_shndx; 267 266 int rodata_shndx; ··· 515 512 obj->efile.obj_buf = obj_buf; 516 513 obj->efile.obj_buf_sz = obj_buf_sz; 517 514 obj->efile.maps_shndx = -1; 515 + obj->efile.btf_maps_shndx = -1; 518 516 obj->efile.data_shndx = -1; 519 517 obj->efile.rodata_shndx = -1; 520 518 obj->efile.bss_shndx = -1; ··· 650 646 const struct bpf_map *a = _a; 651 647 const struct bpf_map *b = _b; 652 648 653 - return a->offset - b->offset; 649 + if (a->sec_idx != b->sec_idx) 650 + return a->sec_idx - b->sec_idx; 651 + return a->sec_offset - b->sec_offset; 654 652 } 655 653 656 654 static bool bpf_map_type__is_map_in_map(enum bpf_map_type type) ··· 769 763 return -ENOENT; 770 764 } 771 765 772 - static bool bpf_object__has_maps(const struct bpf_object *obj) 766 + static struct bpf_map *bpf_object__add_map(struct bpf_object *obj) 773 767 { 774 - return obj->efile.maps_shndx >= 0 || 775 - obj->efile.data_shndx >= 0 || 776 - obj->efile.rodata_shndx >= 0 || 777 - obj->efile.bss_shndx >= 0; 768 + struct bpf_map *new_maps; 769 + size_t new_cap; 770 + int i; 771 + 772 + if (obj->nr_maps < obj->maps_cap) 773 + return &obj->maps[obj->nr_maps++]; 774 + 775 + new_cap = max(4ul, obj->maps_cap * 3 / 2); 776 + new_maps = realloc(obj->maps, new_cap * sizeof(*obj->maps)); 777 + if (!new_maps) { 778 + pr_warning("alloc maps for object failed\n"); 779 + return ERR_PTR(-ENOMEM); 780 + } 781 + 782 + obj->maps_cap = new_cap; 783 + obj->maps = new_maps; 784 + 785 + /* zero out new maps */ 786 
+ memset(obj->maps + obj->nr_maps, 0, 787 + (obj->maps_cap - obj->nr_maps) * sizeof(*obj->maps)); 788 + /* 789 + * fill all fd with -1 so won't close incorrect fd (fd=0 is stdin) 790 + * when failure (zclose won't close negative fd)). 791 + */ 792 + for (i = obj->nr_maps; i < obj->maps_cap; i++) { 793 + obj->maps[i].fd = -1; 794 + obj->maps[i].inner_map_fd = -1; 795 + } 796 + 797 + return &obj->maps[obj->nr_maps++]; 778 798 } 779 799 780 800 static int 781 - bpf_object__init_internal_map(struct bpf_object *obj, struct bpf_map *map, 782 - enum libbpf_map_type type, Elf_Data *data, 783 - void **data_buff) 801 + bpf_object__init_internal_map(struct bpf_object *obj, enum libbpf_map_type type, 802 + int sec_idx, Elf_Data *data, void **data_buff) 784 803 { 785 - struct bpf_map_def *def = &map->def; 786 804 char map_name[BPF_OBJ_NAME_LEN]; 805 + struct bpf_map_def *def; 806 + struct bpf_map *map; 807 + 808 + map = bpf_object__add_map(obj); 809 + if (IS_ERR(map)) 810 + return PTR_ERR(map); 787 811 788 812 map->libbpf_type = type; 789 - map->offset = ~(typeof(map->offset))0; 813 + map->sec_idx = sec_idx; 814 + map->sec_offset = 0; 790 815 snprintf(map_name, sizeof(map_name), "%.8s%.7s", obj->name, 791 816 libbpf_type_to_btf_name[type]); 792 817 map->name = strdup(map_name); ··· 825 788 pr_warning("failed to alloc map name\n"); 826 789 return -ENOMEM; 827 790 } 791 + pr_debug("map '%s' (global data): at sec_idx %d, offset %zu.\n", 792 + map_name, map->sec_idx, map->sec_offset); 828 793 794 + def = &map->def; 829 795 def->type = BPF_MAP_TYPE_ARRAY; 830 796 def->key_size = sizeof(int); 831 797 def->value_size = data->d_size; ··· 848 808 return 0; 849 809 } 850 810 851 - static int bpf_object__init_maps(struct bpf_object *obj, int flags) 811 + static int bpf_object__init_global_data_maps(struct bpf_object *obj) 852 812 { 853 - int i, map_idx, map_def_sz = 0, nr_syms, nr_maps = 0, nr_maps_glob = 0; 854 - bool strict = !(flags & MAPS_RELAX_COMPAT); 813 + int err; 814 + 815 + if 
(!obj->caps.global_data) 816 + return 0; 817 + /* 818 + * Populate obj->maps with libbpf internal maps. 819 + */ 820 + if (obj->efile.data_shndx >= 0) { 821 + err = bpf_object__init_internal_map(obj, LIBBPF_MAP_DATA, 822 + obj->efile.data_shndx, 823 + obj->efile.data, 824 + &obj->sections.data); 825 + if (err) 826 + return err; 827 + } 828 + if (obj->efile.rodata_shndx >= 0) { 829 + err = bpf_object__init_internal_map(obj, LIBBPF_MAP_RODATA, 830 + obj->efile.rodata_shndx, 831 + obj->efile.rodata, 832 + &obj->sections.rodata); 833 + if (err) 834 + return err; 835 + } 836 + if (obj->efile.bss_shndx >= 0) { 837 + err = bpf_object__init_internal_map(obj, LIBBPF_MAP_BSS, 838 + obj->efile.bss_shndx, 839 + obj->efile.bss, NULL); 840 + if (err) 841 + return err; 842 + } 843 + return 0; 844 + } 845 + 846 + static int bpf_object__init_user_maps(struct bpf_object *obj, bool strict) 847 + { 855 848 Elf_Data *symbols = obj->efile.symbols; 849 + int i, map_def_sz = 0, nr_maps = 0, nr_syms; 856 850 Elf_Data *data = NULL; 857 - int ret = 0; 851 + Elf_Scn *scn; 852 + 853 + if (obj->efile.maps_shndx < 0) 854 + return 0; 858 855 859 856 if (!symbols) 860 857 return -EINVAL; 861 - nr_syms = symbols->d_size / sizeof(GElf_Sym); 862 858 863 - if (obj->efile.maps_shndx >= 0) { 864 - Elf_Scn *scn = elf_getscn(obj->efile.elf, 865 - obj->efile.maps_shndx); 866 - 867 - if (scn) 868 - data = elf_getdata(scn, NULL); 869 - if (!scn || !data) { 870 - pr_warning("failed to get Elf_Data from map section %d\n", 871 - obj->efile.maps_shndx); 872 - return -EINVAL; 873 - } 859 + scn = elf_getscn(obj->efile.elf, obj->efile.maps_shndx); 860 + if (scn) 861 + data = elf_getdata(scn, NULL); 862 + if (!scn || !data) { 863 + pr_warning("failed to get Elf_Data from map section %d\n", 864 + obj->efile.maps_shndx); 865 + return -EINVAL; 874 866 } 875 867 876 868 /* ··· 912 840 * 913 841 * TODO: Detect array of map and report error. 
914 842 */ 915 - if (obj->caps.global_data) { 916 - if (obj->efile.data_shndx >= 0) 917 - nr_maps_glob++; 918 - if (obj->efile.rodata_shndx >= 0) 919 - nr_maps_glob++; 920 - if (obj->efile.bss_shndx >= 0) 921 - nr_maps_glob++; 922 - } 923 - 924 - for (i = 0; data && i < nr_syms; i++) { 843 + nr_syms = symbols->d_size / sizeof(GElf_Sym); 844 + for (i = 0; i < nr_syms; i++) { 925 845 GElf_Sym sym; 926 846 927 847 if (!gelf_getsym(symbols, i, &sym)) ··· 922 858 continue; 923 859 nr_maps++; 924 860 } 925 - 926 - if (!nr_maps && !nr_maps_glob) 927 - return 0; 928 - 929 861 /* Assume equally sized map definitions */ 930 - if (data) { 931 - pr_debug("maps in %s: %d maps in %zd bytes\n", obj->path, 932 - nr_maps, data->d_size); 862 + pr_debug("maps in %s: %d maps in %zd bytes\n", 863 + obj->path, nr_maps, data->d_size); 933 864 934 - map_def_sz = data->d_size / nr_maps; 935 - if (!data->d_size || (data->d_size % nr_maps) != 0) { 936 - pr_warning("unable to determine map definition size " 937 - "section %s, %d maps in %zd bytes\n", 938 - obj->path, nr_maps, data->d_size); 939 - return -EINVAL; 940 - } 865 + map_def_sz = data->d_size / nr_maps; 866 + if (!data->d_size || (data->d_size % nr_maps) != 0) { 867 + pr_warning("unable to determine map definition size " 868 + "section %s, %d maps in %zd bytes\n", 869 + obj->path, nr_maps, data->d_size); 870 + return -EINVAL; 941 871 } 942 872 943 - nr_maps += nr_maps_glob; 944 - obj->maps = calloc(nr_maps, sizeof(obj->maps[0])); 945 - if (!obj->maps) { 946 - pr_warning("alloc maps for object failed\n"); 947 - return -ENOMEM; 948 - } 949 - obj->nr_maps = nr_maps; 950 - 951 - for (i = 0; i < nr_maps; i++) { 952 - /* 953 - * fill all fd with -1 so won't close incorrect 954 - * fd (fd=0 is stdin) when failure (zclose won't close 955 - * negative fd)). 956 - */ 957 - obj->maps[i].fd = -1; 958 - obj->maps[i].inner_map_fd = -1; 959 - } 960 - 961 - /* 962 - * Fill obj->maps using data in "maps" section. 
963 - */ 964 - for (i = 0, map_idx = 0; data && i < nr_syms; i++) { 873 + /* Fill obj->maps using data in "maps" section. */ 874 + for (i = 0; i < nr_syms; i++) { 965 875 GElf_Sym sym; 966 876 const char *map_name; 967 877 struct bpf_map_def *def; 878 + struct bpf_map *map; 968 879 969 880 if (!gelf_getsym(symbols, i, &sym)) 970 881 continue; 971 882 if (sym.st_shndx != obj->efile.maps_shndx) 972 883 continue; 973 884 974 - map_name = elf_strptr(obj->efile.elf, 975 - obj->efile.strtabidx, 885 + map = bpf_object__add_map(obj); 886 + if (IS_ERR(map)) 887 + return PTR_ERR(map); 888 + 889 + map_name = elf_strptr(obj->efile.elf, obj->efile.strtabidx, 976 890 sym.st_name); 977 891 if (!map_name) { 978 892 pr_warning("failed to get map #%d name sym string for obj %s\n", 979 - map_idx, obj->path); 893 + i, obj->path); 980 894 return -LIBBPF_ERRNO__FORMAT; 981 895 } 982 896 983 - obj->maps[map_idx].libbpf_type = LIBBPF_MAP_UNSPEC; 984 - obj->maps[map_idx].offset = sym.st_value; 897 + map->libbpf_type = LIBBPF_MAP_UNSPEC; 898 + map->sec_idx = sym.st_shndx; 899 + map->sec_offset = sym.st_value; 900 + pr_debug("map '%s' (legacy): at sec_idx %d, offset %zu.\n", 901 + map_name, map->sec_idx, map->sec_offset); 985 902 if (sym.st_value + map_def_sz > data->d_size) { 986 903 pr_warning("corrupted maps section in %s: last map \"%s\" too small\n", 987 904 obj->path, map_name); 988 905 return -EINVAL; 989 906 } 990 907 991 - obj->maps[map_idx].name = strdup(map_name); 992 - if (!obj->maps[map_idx].name) { 908 + map->name = strdup(map_name); 909 + if (!map->name) { 993 910 pr_warning("failed to alloc map name\n"); 994 911 return -ENOMEM; 995 912 } 996 - pr_debug("map %d is \"%s\"\n", map_idx, 997 - obj->maps[map_idx].name); 913 + pr_debug("map %d is \"%s\"\n", i, map->name); 998 914 def = (struct bpf_map_def *)(data->d_buf + sym.st_value); 999 915 /* 1000 916 * If the definition of the map in the object file fits in ··· 983 939 * calloc above. 
984 940 */ 985 941 if (map_def_sz <= sizeof(struct bpf_map_def)) { 986 - memcpy(&obj->maps[map_idx].def, def, map_def_sz); 942 + memcpy(&map->def, def, map_def_sz); 987 943 } else { 988 944 /* 989 945 * Here the map structure being read is bigger than what ··· 1003 959 return -EINVAL; 1004 960 } 1005 961 } 1006 - memcpy(&obj->maps[map_idx].def, def, 1007 - sizeof(struct bpf_map_def)); 962 + memcpy(&map->def, def, sizeof(struct bpf_map_def)); 1008 963 } 1009 - map_idx++; 964 + } 965 + return 0; 966 + } 967 + 968 + static const struct btf_type *skip_mods_and_typedefs(const struct btf *btf, 969 + __u32 id) 970 + { 971 + const struct btf_type *t = btf__type_by_id(btf, id); 972 + 973 + while (true) { 974 + switch (BTF_INFO_KIND(t->info)) { 975 + case BTF_KIND_VOLATILE: 976 + case BTF_KIND_CONST: 977 + case BTF_KIND_RESTRICT: 978 + case BTF_KIND_TYPEDEF: 979 + t = btf__type_by_id(btf, t->type); 980 + break; 981 + default: 982 + return t; 983 + } 984 + } 985 + } 986 + 987 + static bool get_map_field_int(const char *map_name, 988 + const struct btf *btf, 989 + const struct btf_type *def, 990 + const struct btf_member *m, 991 + const void *data, __u32 *res) { 992 + const struct btf_type *t = skip_mods_and_typedefs(btf, m->type); 993 + const char *name = btf__name_by_offset(btf, m->name_off); 994 + __u32 int_info = *(const __u32 *)(const void *)(t + 1); 995 + 996 + if (BTF_INFO_KIND(t->info) != BTF_KIND_INT) { 997 + pr_warning("map '%s': attr '%s': expected INT, got %u.\n", 998 + map_name, name, BTF_INFO_KIND(t->info)); 999 + return false; 1000 + } 1001 + if (t->size != 4 || BTF_INT_BITS(int_info) != 32 || 1002 + BTF_INT_OFFSET(int_info)) { 1003 + pr_warning("map '%s': attr '%s': expected 32-bit non-bitfield integer, " 1004 + "got %u-byte (%d-bit) one with bit offset %d.\n", 1005 + map_name, name, t->size, BTF_INT_BITS(int_info), 1006 + BTF_INT_OFFSET(int_info)); 1007 + return false; 1008 + } 1009 + if (BTF_INFO_KFLAG(def->info) && BTF_MEMBER_BITFIELD_SIZE(m->offset)) { 1010 
+ pr_warning("map '%s': attr '%s': bitfield is not supported.\n", 1011 + map_name, name); 1012 + return false; 1013 + } 1014 + if (m->offset % 32) { 1015 + pr_warning("map '%s': attr '%s': unaligned fields are not supported.\n", 1016 + map_name, name); 1017 + return false; 1010 1018 } 1011 1019 1012 - if (!obj->caps.global_data) 1013 - goto finalize; 1020 + *res = *(const __u32 *)(data + m->offset / 8); 1021 + return true; 1022 + } 1014 1023 1015 - /* 1016 - * Populate rest of obj->maps with libbpf internal maps. 1017 - */ 1018 - if (obj->efile.data_shndx >= 0) 1019 - ret = bpf_object__init_internal_map(obj, &obj->maps[map_idx++], 1020 - LIBBPF_MAP_DATA, 1021 - obj->efile.data, 1022 - &obj->sections.data); 1023 - if (!ret && obj->efile.rodata_shndx >= 0) 1024 - ret = bpf_object__init_internal_map(obj, &obj->maps[map_idx++], 1025 - LIBBPF_MAP_RODATA, 1026 - obj->efile.rodata, 1027 - &obj->sections.rodata); 1028 - if (!ret && obj->efile.bss_shndx >= 0) 1029 - ret = bpf_object__init_internal_map(obj, &obj->maps[map_idx++], 1030 - LIBBPF_MAP_BSS, 1031 - obj->efile.bss, NULL); 1032 - finalize: 1033 - if (!ret) 1024 + static int bpf_object__init_user_btf_map(struct bpf_object *obj, 1025 + const struct btf_type *sec, 1026 + int var_idx, int sec_idx, 1027 + const Elf_Data *data, bool strict) 1028 + { 1029 + const struct btf_type *var, *def, *t; 1030 + const struct btf_var_secinfo *vi; 1031 + const struct btf_var *var_extra; 1032 + const struct btf_member *m; 1033 + const void *def_data; 1034 + const char *map_name; 1035 + struct bpf_map *map; 1036 + int vlen, i; 1037 + 1038 + vi = (const struct btf_var_secinfo *)(const void *)(sec + 1) + var_idx; 1039 + var = btf__type_by_id(obj->btf, vi->type); 1040 + var_extra = (const void *)(var + 1); 1041 + map_name = btf__name_by_offset(obj->btf, var->name_off); 1042 + vlen = BTF_INFO_VLEN(var->info); 1043 + 1044 + if (map_name == NULL || map_name[0] == '\0') { 1045 + pr_warning("map #%d: empty name.\n", var_idx); 1046 + return 
-EINVAL; 1047 + } 1048 + if ((__u64)vi->offset + vi->size > data->d_size) { 1049 + pr_warning("map '%s' BTF data is corrupted.\n", map_name); 1050 + return -EINVAL; 1051 + } 1052 + if (BTF_INFO_KIND(var->info) != BTF_KIND_VAR) { 1053 + pr_warning("map '%s': unexpected var kind %u.\n", 1054 + map_name, BTF_INFO_KIND(var->info)); 1055 + return -EINVAL; 1056 + } 1057 + if (var_extra->linkage != BTF_VAR_GLOBAL_ALLOCATED && 1058 + var_extra->linkage != BTF_VAR_STATIC) { 1059 + pr_warning("map '%s': unsupported var linkage %u.\n", 1060 + map_name, var_extra->linkage); 1061 + return -EOPNOTSUPP; 1062 + } 1063 + 1064 + def = skip_mods_and_typedefs(obj->btf, var->type); 1065 + if (BTF_INFO_KIND(def->info) != BTF_KIND_STRUCT) { 1066 + pr_warning("map '%s': unexpected def kind %u.\n", 1067 + map_name, BTF_INFO_KIND(var->info)); 1068 + return -EINVAL; 1069 + } 1070 + if (def->size > vi->size) { 1071 + pr_warning("map '%s': invalid def size.\n", map_name); 1072 + return -EINVAL; 1073 + } 1074 + 1075 + map = bpf_object__add_map(obj); 1076 + if (IS_ERR(map)) 1077 + return PTR_ERR(map); 1078 + map->name = strdup(map_name); 1079 + if (!map->name) { 1080 + pr_warning("map '%s': failed to alloc map name.\n", map_name); 1081 + return -ENOMEM; 1082 + } 1083 + map->libbpf_type = LIBBPF_MAP_UNSPEC; 1084 + map->def.type = BPF_MAP_TYPE_UNSPEC; 1085 + map->sec_idx = sec_idx; 1086 + map->sec_offset = vi->offset; 1087 + pr_debug("map '%s': at sec_idx %d, offset %zu.\n", 1088 + map_name, map->sec_idx, map->sec_offset); 1089 + 1090 + def_data = data->d_buf + vi->offset; 1091 + vlen = BTF_INFO_VLEN(def->info); 1092 + m = (const void *)(def + 1); 1093 + for (i = 0; i < vlen; i++, m++) { 1094 + const char *name = btf__name_by_offset(obj->btf, m->name_off); 1095 + 1096 + if (!name) { 1097 + pr_warning("map '%s': invalid field #%d.\n", 1098 + map_name, i); 1099 + return -EINVAL; 1100 + } 1101 + if (strcmp(name, "type") == 0) { 1102 + if (!get_map_field_int(map_name, obj->btf, def, m, 1103 + 
def_data, &map->def.type)) 1104 + return -EINVAL; 1105 + pr_debug("map '%s': found type = %u.\n", 1106 + map_name, map->def.type); 1107 + } else if (strcmp(name, "max_entries") == 0) { 1108 + if (!get_map_field_int(map_name, obj->btf, def, m, 1109 + def_data, &map->def.max_entries)) 1110 + return -EINVAL; 1111 + pr_debug("map '%s': found max_entries = %u.\n", 1112 + map_name, map->def.max_entries); 1113 + } else if (strcmp(name, "map_flags") == 0) { 1114 + if (!get_map_field_int(map_name, obj->btf, def, m, 1115 + def_data, &map->def.map_flags)) 1116 + return -EINVAL; 1117 + pr_debug("map '%s': found map_flags = %u.\n", 1118 + map_name, map->def.map_flags); 1119 + } else if (strcmp(name, "key_size") == 0) { 1120 + __u32 sz; 1121 + 1122 + if (!get_map_field_int(map_name, obj->btf, def, m, 1123 + def_data, &sz)) 1124 + return -EINVAL; 1125 + pr_debug("map '%s': found key_size = %u.\n", 1126 + map_name, sz); 1127 + if (map->def.key_size && map->def.key_size != sz) { 1128 + pr_warning("map '%s': conflictling key size %u != %u.\n", 1129 + map_name, map->def.key_size, sz); 1130 + return -EINVAL; 1131 + } 1132 + map->def.key_size = sz; 1133 + } else if (strcmp(name, "key") == 0) { 1134 + __s64 sz; 1135 + 1136 + t = btf__type_by_id(obj->btf, m->type); 1137 + if (!t) { 1138 + pr_warning("map '%s': key type [%d] not found.\n", 1139 + map_name, m->type); 1140 + return -EINVAL; 1141 + } 1142 + if (BTF_INFO_KIND(t->info) != BTF_KIND_PTR) { 1143 + pr_warning("map '%s': key spec is not PTR: %u.\n", 1144 + map_name, BTF_INFO_KIND(t->info)); 1145 + return -EINVAL; 1146 + } 1147 + sz = btf__resolve_size(obj->btf, t->type); 1148 + if (sz < 0) { 1149 + pr_warning("map '%s': can't determine key size for type [%u]: %lld.\n", 1150 + map_name, t->type, sz); 1151 + return sz; 1152 + } 1153 + pr_debug("map '%s': found key [%u], sz = %lld.\n", 1154 + map_name, t->type, sz); 1155 + if (map->def.key_size && map->def.key_size != sz) { 1156 + pr_warning("map '%s': conflictling key size %u != 
%lld.\n", 1157 + map_name, map->def.key_size, sz); 1158 + return -EINVAL; 1159 + } 1160 + map->def.key_size = sz; 1161 + map->btf_key_type_id = t->type; 1162 + } else if (strcmp(name, "value_size") == 0) { 1163 + __u32 sz; 1164 + 1165 + if (!get_map_field_int(map_name, obj->btf, def, m, 1166 + def_data, &sz)) 1167 + return -EINVAL; 1168 + pr_debug("map '%s': found value_size = %u.\n", 1169 + map_name, sz); 1170 + if (map->def.value_size && map->def.value_size != sz) { 1171 + pr_warning("map '%s': conflictling value size %u != %u.\n", 1172 + map_name, map->def.value_size, sz); 1173 + return -EINVAL; 1174 + } 1175 + map->def.value_size = sz; 1176 + } else if (strcmp(name, "value") == 0) { 1177 + __s64 sz; 1178 + 1179 + t = btf__type_by_id(obj->btf, m->type); 1180 + if (!t) { 1181 + pr_warning("map '%s': value type [%d] not found.\n", 1182 + map_name, m->type); 1183 + return -EINVAL; 1184 + } 1185 + if (BTF_INFO_KIND(t->info) != BTF_KIND_PTR) { 1186 + pr_warning("map '%s': value spec is not PTR: %u.\n", 1187 + map_name, BTF_INFO_KIND(t->info)); 1188 + return -EINVAL; 1189 + } 1190 + sz = btf__resolve_size(obj->btf, t->type); 1191 + if (sz < 0) { 1192 + pr_warning("map '%s': can't determine value size for type [%u]: %lld.\n", 1193 + map_name, t->type, sz); 1194 + return sz; 1195 + } 1196 + pr_debug("map '%s': found value [%u], sz = %lld.\n", 1197 + map_name, t->type, sz); 1198 + if (map->def.value_size && map->def.value_size != sz) { 1199 + pr_warning("map '%s': conflictling value size %u != %lld.\n", 1200 + map_name, map->def.value_size, sz); 1201 + return -EINVAL; 1202 + } 1203 + map->def.value_size = sz; 1204 + map->btf_value_type_id = t->type; 1205 + } else { 1206 + if (strict) { 1207 + pr_warning("map '%s': unknown field '%s'.\n", 1208 + map_name, name); 1209 + return -ENOTSUP; 1210 + } 1211 + pr_debug("map '%s': ignoring unknown field '%s'.\n", 1212 + map_name, name); 1213 + } 1214 + } 1215 + 1216 + if (map->def.type == BPF_MAP_TYPE_UNSPEC) { 1217 + 
pr_warning("map '%s': map type isn't specified.\n", map_name); 1218 + return -EINVAL; 1219 + } 1220 + 1221 + return 0; 1222 + } 1223 + 1224 + static int bpf_object__init_user_btf_maps(struct bpf_object *obj, bool strict) 1225 + { 1226 + const struct btf_type *sec = NULL; 1227 + int nr_types, i, vlen, err; 1228 + const struct btf_type *t; 1229 + const char *name; 1230 + Elf_Data *data; 1231 + Elf_Scn *scn; 1232 + 1233 + if (obj->efile.btf_maps_shndx < 0) 1234 + return 0; 1235 + 1236 + scn = elf_getscn(obj->efile.elf, obj->efile.btf_maps_shndx); 1237 + if (scn) 1238 + data = elf_getdata(scn, NULL); 1239 + if (!scn || !data) { 1240 + pr_warning("failed to get Elf_Data from map section %d (%s)\n", 1241 + obj->efile.maps_shndx, MAPS_ELF_SEC); 1242 + return -EINVAL; 1243 + } 1244 + 1245 + nr_types = btf__get_nr_types(obj->btf); 1246 + for (i = 1; i <= nr_types; i++) { 1247 + t = btf__type_by_id(obj->btf, i); 1248 + if (BTF_INFO_KIND(t->info) != BTF_KIND_DATASEC) 1249 + continue; 1250 + name = btf__name_by_offset(obj->btf, t->name_off); 1251 + if (strcmp(name, MAPS_ELF_SEC) == 0) { 1252 + sec = t; 1253 + break; 1254 + } 1255 + } 1256 + 1257 + if (!sec) { 1258 + pr_warning("DATASEC '%s' not found.\n", MAPS_ELF_SEC); 1259 + return -ENOENT; 1260 + } 1261 + 1262 + vlen = BTF_INFO_VLEN(sec->info); 1263 + for (i = 0; i < vlen; i++) { 1264 + err = bpf_object__init_user_btf_map(obj, sec, i, 1265 + obj->efile.btf_maps_shndx, 1266 + data, strict); 1267 + if (err) 1268 + return err; 1269 + } 1270 + 1271 + return 0; 1272 + } 1273 + 1274 + static int bpf_object__init_maps(struct bpf_object *obj, int flags) 1275 + { 1276 + bool strict = !(flags & MAPS_RELAX_COMPAT); 1277 + int err; 1278 + 1279 + err = bpf_object__init_user_maps(obj, strict); 1280 + if (err) 1281 + return err; 1282 + 1283 + err = bpf_object__init_user_btf_maps(obj, strict); 1284 + if (err) 1285 + return err; 1286 + 1287 + err = bpf_object__init_global_data_maps(obj); 1288 + if (err) 1289 + return err; 1290 + 1291 + if 
(obj->nr_maps) { 1034 1292 qsort(obj->maps, obj->nr_maps, sizeof(obj->maps[0]), 1035 1293 compare_bpf_map); 1036 - return ret; 1294 + } 1295 + return 0; 1037 1296 } 1038 1297 1039 1298 static bool section_have_execinstr(struct bpf_object *obj, int idx) ··· 1425 1078 } 1426 1079 } 1427 1080 1081 + static bool bpf_object__is_btf_mandatory(const struct bpf_object *obj) 1082 + { 1083 + return obj->efile.btf_maps_shndx >= 0; 1084 + } 1085 + 1086 + static int bpf_object__init_btf(struct bpf_object *obj, 1087 + Elf_Data *btf_data, 1088 + Elf_Data *btf_ext_data) 1089 + { 1090 + bool btf_required = bpf_object__is_btf_mandatory(obj); 1091 + int err = 0; 1092 + 1093 + if (btf_data) { 1094 + obj->btf = btf__new(btf_data->d_buf, btf_data->d_size); 1095 + if (IS_ERR(obj->btf)) { 1096 + pr_warning("Error loading ELF section %s: %d.\n", 1097 + BTF_ELF_SEC, err); 1098 + goto out; 1099 + } 1100 + err = btf__finalize_data(obj, obj->btf); 1101 + if (err) { 1102 + pr_warning("Error finalizing %s: %d.\n", 1103 + BTF_ELF_SEC, err); 1104 + goto out; 1105 + } 1106 + } 1107 + if (btf_ext_data) { 1108 + if (!obj->btf) { 1109 + pr_debug("Ignore ELF section %s because its depending ELF section %s is not found.\n", 1110 + BTF_EXT_ELF_SEC, BTF_ELF_SEC); 1111 + goto out; 1112 + } 1113 + obj->btf_ext = btf_ext__new(btf_ext_data->d_buf, 1114 + btf_ext_data->d_size); 1115 + if (IS_ERR(obj->btf_ext)) { 1116 + pr_warning("Error loading ELF section %s: %ld. Ignored and continue.\n", 1117 + BTF_EXT_ELF_SEC, PTR_ERR(obj->btf_ext)); 1118 + obj->btf_ext = NULL; 1119 + goto out; 1120 + } 1121 + } 1122 + out: 1123 + if (err || IS_ERR(obj->btf)) { 1124 + if (btf_required) 1125 + err = err ? : PTR_ERR(obj->btf); 1126 + else 1127 + err = 0; 1128 + if (!IS_ERR_OR_NULL(obj->btf)) 1129 + btf__free(obj->btf); 1130 + obj->btf = NULL; 1131 + } 1132 + if (btf_required && !obj->btf) { 1133 + pr_warning("BTF is required, but is missing or corrupted.\n"); 1134 + return err == 0 ? 
-ENOENT : err; 1135 + } 1136 + return 0; 1137 + } 1138 + 1139 + static int bpf_object__sanitize_and_load_btf(struct bpf_object *obj) 1140 + { 1141 + int err = 0; 1142 + 1143 + if (!obj->btf) 1144 + return 0; 1145 + 1146 + bpf_object__sanitize_btf(obj); 1147 + bpf_object__sanitize_btf_ext(obj); 1148 + 1149 + err = btf__load(obj->btf); 1150 + if (err) { 1151 + pr_warning("Error loading %s into kernel: %d.\n", 1152 + BTF_ELF_SEC, err); 1153 + btf__free(obj->btf); 1154 + obj->btf = NULL; 1155 + if (bpf_object__is_btf_mandatory(obj)) 1156 + return err; 1157 + } 1158 + return 0; 1159 + } 1160 + 1428 1161 static int bpf_object__elf_collect(struct bpf_object *obj, int flags) 1429 1162 { 1430 1163 Elf *elf = obj->efile.elf; ··· 1529 1102 if (gelf_getshdr(scn, &sh) != &sh) { 1530 1103 pr_warning("failed to get section(%d) header from %s\n", 1531 1104 idx, obj->path); 1532 - err = -LIBBPF_ERRNO__FORMAT; 1533 - goto out; 1105 + return -LIBBPF_ERRNO__FORMAT; 1534 1106 } 1535 1107 1536 1108 name = elf_strptr(elf, ep->e_shstrndx, sh.sh_name); 1537 1109 if (!name) { 1538 1110 pr_warning("failed to get section(%d) name from %s\n", 1539 1111 idx, obj->path); 1540 - err = -LIBBPF_ERRNO__FORMAT; 1541 - goto out; 1112 + return -LIBBPF_ERRNO__FORMAT; 1542 1113 } 1543 1114 1544 1115 data = elf_getdata(scn, 0); 1545 1116 if (!data) { 1546 1117 pr_warning("failed to get section(%d) data from %s(%s)\n", 1547 1118 idx, name, obj->path); 1548 - err = -LIBBPF_ERRNO__FORMAT; 1549 - goto out; 1119 + return -LIBBPF_ERRNO__FORMAT; 1550 1120 } 1551 1121 pr_debug("section(%d) %s, size %ld, link %d, flags %lx, type=%d\n", 1552 1122 idx, name, (unsigned long)data->d_size, ··· 1554 1130 err = bpf_object__init_license(obj, 1555 1131 data->d_buf, 1556 1132 data->d_size); 1133 + if (err) 1134 + return err; 1557 1135 } else if (strcmp(name, "version") == 0) { 1558 1136 err = bpf_object__init_kversion(obj, 1559 1137 data->d_buf, 1560 1138 data->d_size); 1139 + if (err) 1140 + return err; 1561 1141 } else if 
(strcmp(name, "maps") == 0) { 1562 1142 obj->efile.maps_shndx = idx; 1143 + } else if (strcmp(name, MAPS_ELF_SEC) == 0) { 1144 + obj->efile.btf_maps_shndx = idx; 1563 1145 } else if (strcmp(name, BTF_ELF_SEC) == 0) { 1564 1146 btf_data = data; 1565 1147 } else if (strcmp(name, BTF_EXT_ELF_SEC) == 0) { ··· 1574 1144 if (obj->efile.symbols) { 1575 1145 pr_warning("bpf: multiple SYMTAB in %s\n", 1576 1146 obj->path); 1577 - err = -LIBBPF_ERRNO__FORMAT; 1578 - } else { 1579 - obj->efile.symbols = data; 1580 - obj->efile.strtabidx = sh.sh_link; 1147 + return -LIBBPF_ERRNO__FORMAT; 1581 1148 } 1149 + obj->efile.symbols = data; 1150 + obj->efile.strtabidx = sh.sh_link; 1582 1151 } else if (sh.sh_type == SHT_PROGBITS && data->d_size > 0) { 1583 1152 if (sh.sh_flags & SHF_EXECINSTR) { 1584 1153 if (strcmp(name, ".text") == 0) ··· 1591 1162 1592 1163 pr_warning("failed to alloc program %s (%s): %s", 1593 1164 name, obj->path, cp); 1165 + return err; 1594 1166 } 1595 1167 } else if (strcmp(name, ".data") == 0) { 1596 1168 obj->efile.data = data; ··· 1603 1173 pr_debug("skip section(%d) %s\n", idx, name); 1604 1174 } 1605 1175 } else if (sh.sh_type == SHT_REL) { 1176 + int nr_reloc = obj->efile.nr_reloc; 1606 1177 void *reloc = obj->efile.reloc; 1607 - int nr_reloc = obj->efile.nr_reloc + 1; 1608 1178 int sec = sh.sh_info; /* points to other section */ 1609 1179 1610 1180 /* Only do relo for section with exec instructions */ ··· 1614 1184 continue; 1615 1185 } 1616 1186 1617 - reloc = reallocarray(reloc, nr_reloc, 1187 + reloc = reallocarray(reloc, nr_reloc + 1, 1618 1188 sizeof(*obj->efile.reloc)); 1619 1189 if (!reloc) { 1620 1190 pr_warning("realloc failed\n"); 1621 - err = -ENOMEM; 1622 - } else { 1623 - int n = nr_reloc - 1; 1624 - 1625 - obj->efile.reloc = reloc; 1626 - obj->efile.nr_reloc = nr_reloc; 1627 - 1628 - obj->efile.reloc[n].shdr = sh; 1629 - obj->efile.reloc[n].data = data; 1191 + return -ENOMEM; 1630 1192 } 1193 + 1194 + obj->efile.reloc = reloc; 1195 + 
obj->efile.nr_reloc++; 1196 + 1197 + obj->efile.reloc[nr_reloc].shdr = sh; 1198 + obj->efile.reloc[nr_reloc].data = data; 1631 1199 } else if (sh.sh_type == SHT_NOBITS && strcmp(name, ".bss") == 0) { 1632 1200 obj->efile.bss = data; 1633 1201 obj->efile.bss_shndx = idx; 1634 1202 } else { 1635 1203 pr_debug("skip section(%d) %s\n", idx, name); 1636 1204 } 1637 - if (err) 1638 - goto out; 1639 1205 } 1640 1206 1641 1207 if (!obj->efile.strtabidx || obj->efile.strtabidx >= idx) { 1642 1208 pr_warning("Corrupted ELF file: index of strtab invalid\n"); 1643 1209 return -LIBBPF_ERRNO__FORMAT; 1644 1210 } 1645 - if (btf_data) { 1646 - obj->btf = btf__new(btf_data->d_buf, btf_data->d_size); 1647 - if (IS_ERR(obj->btf)) { 1648 - pr_warning("Error loading ELF section %s: %ld. Ignored and continue.\n", 1649 - BTF_ELF_SEC, PTR_ERR(obj->btf)); 1650 - obj->btf = NULL; 1651 - } else { 1652 - err = btf__finalize_data(obj, obj->btf); 1653 - if (!err) { 1654 - bpf_object__sanitize_btf(obj); 1655 - err = btf__load(obj->btf); 1656 - } 1657 - if (err) { 1658 - pr_warning("Error finalizing and loading %s into kernel: %d. Ignored and continue.\n", 1659 - BTF_ELF_SEC, err); 1660 - btf__free(obj->btf); 1661 - obj->btf = NULL; 1662 - err = 0; 1663 - } 1664 - } 1665 - } 1666 - if (btf_ext_data) { 1667 - if (!obj->btf) { 1668 - pr_debug("Ignore ELF section %s because its depending ELF section %s is not found.\n", 1669 - BTF_EXT_ELF_SEC, BTF_ELF_SEC); 1670 - } else { 1671 - obj->btf_ext = btf_ext__new(btf_ext_data->d_buf, 1672 - btf_ext_data->d_size); 1673 - if (IS_ERR(obj->btf_ext)) { 1674 - pr_warning("Error loading ELF section %s: %ld. 
Ignored and continue.\n", 1675 - BTF_EXT_ELF_SEC, 1676 - PTR_ERR(obj->btf_ext)); 1677 - obj->btf_ext = NULL; 1678 - } else { 1679 - bpf_object__sanitize_btf_ext(obj); 1680 - } 1681 - } 1682 - } 1683 - if (bpf_object__has_maps(obj)) { 1211 + err = bpf_object__init_btf(obj, btf_data, btf_ext_data); 1212 + if (!err) 1684 1213 err = bpf_object__init_maps(obj, flags); 1685 - if (err) 1686 - goto out; 1687 - } 1688 - err = bpf_object__init_prog_names(obj); 1689 - out: 1214 + if (!err) 1215 + err = bpf_object__sanitize_and_load_btf(obj); 1216 + if (!err) 1217 + err = bpf_object__init_prog_names(obj); 1690 1218 return err; 1691 1219 } 1692 1220 ··· 1663 1275 } 1664 1276 1665 1277 struct bpf_program * 1666 - bpf_object__find_program_by_title(struct bpf_object *obj, const char *title) 1278 + bpf_object__find_program_by_title(const struct bpf_object *obj, 1279 + const char *title) 1667 1280 { 1668 1281 struct bpf_program *pos; 1669 1282 ··· 1686 1297 static bool bpf_object__shndx_is_maps(const struct bpf_object *obj, 1687 1298 int shndx) 1688 1299 { 1689 - return shndx == obj->efile.maps_shndx; 1300 + return shndx == obj->efile.maps_shndx || 1301 + shndx == obj->efile.btf_maps_shndx; 1690 1302 } 1691 1303 1692 1304 static bool bpf_object__relo_in_known_section(const struct bpf_object *obj, ··· 1731 1341 prog->nr_reloc = nrels; 1732 1342 1733 1343 for (i = 0; i < nrels; i++) { 1734 - GElf_Sym sym; 1735 - GElf_Rel rel; 1736 - unsigned int insn_idx; 1737 - unsigned int shdr_idx; 1738 1344 struct bpf_insn *insns = prog->insns; 1739 1345 enum libbpf_map_type type; 1346 + unsigned int insn_idx; 1347 + unsigned int shdr_idx; 1740 1348 const char *name; 1741 1349 size_t map_idx; 1350 + GElf_Sym sym; 1351 + GElf_Rel rel; 1742 1352 1743 1353 if (!gelf_getrel(data, i, &rel)) { 1744 1354 pr_warning("relocation: failed to get %d reloc\n", i); ··· 1806 1416 if (maps[map_idx].libbpf_type != type) 1807 1417 continue; 1808 1418 if (type != LIBBPF_MAP_UNSPEC || 1809 - maps[map_idx].offset == 
sym.st_value) { 1810 - pr_debug("relocation: find map %zd (%s) for insn %u\n", 1811 - map_idx, maps[map_idx].name, insn_idx); 1419 + (maps[map_idx].sec_idx == sym.st_shndx && 1420 + maps[map_idx].sec_offset == sym.st_value)) { 1421 + pr_debug("relocation: found map %zd (%s, sec_idx %d, offset %zu) for insn %u\n", 1422 + map_idx, maps[map_idx].name, 1423 + maps[map_idx].sec_idx, 1424 + maps[map_idx].sec_offset, 1425 + insn_idx); 1812 1426 break; 1813 1427 } 1814 1428 } ··· 1832 1438 return 0; 1833 1439 } 1834 1440 1835 - static int bpf_map_find_btf_info(struct bpf_map *map, const struct btf *btf) 1441 + static int bpf_map_find_btf_info(struct bpf_object *obj, struct bpf_map *map) 1836 1442 { 1837 1443 struct bpf_map_def *def = &map->def; 1838 1444 __u32 key_type_id = 0, value_type_id = 0; 1839 1445 int ret; 1840 1446 1447 + /* if it's BTF-defined map, we don't need to search for type IDs */ 1448 + if (map->sec_idx == obj->efile.btf_maps_shndx) 1449 + return 0; 1450 + 1841 1451 if (!bpf_map__is_internal(map)) { 1842 - ret = btf__get_map_kv_tids(btf, map->name, def->key_size, 1452 + ret = btf__get_map_kv_tids(obj->btf, map->name, def->key_size, 1843 1453 def->value_size, &key_type_id, 1844 1454 &value_type_id); 1845 1455 } else { ··· 1851 1453 * LLVM annotates global data differently in BTF, that is, 1852 1454 * only as '.data', '.bss' or '.rodata'. 
1853 1455 */ 1854 - ret = btf__find_by_name(btf, 1456 + ret = btf__find_by_name(obj->btf, 1855 1457 libbpf_type_to_btf_name[map->libbpf_type]); 1856 1458 } 1857 1459 if (ret < 0) ··· 2138 1740 create_attr.key_size = def->key_size; 2139 1741 create_attr.value_size = def->value_size; 2140 1742 create_attr.max_entries = def->max_entries; 2141 - create_attr.btf_fd = -1; 1743 + create_attr.btf_fd = 0; 2142 1744 create_attr.btf_key_type_id = 0; 2143 1745 create_attr.btf_value_type_id = 0; 2144 1746 if (bpf_map_type__is_map_in_map(def->type) && 2145 1747 map->inner_map_fd >= 0) 2146 1748 create_attr.inner_map_fd = map->inner_map_fd; 2147 1749 2148 - if (obj->btf && !bpf_map_find_btf_info(map, obj->btf)) { 1750 + if (obj->btf && !bpf_map_find_btf_info(obj, map)) { 2149 1751 create_attr.btf_fd = btf__fd(obj->btf); 2150 1752 create_attr.btf_key_type_id = map->btf_key_type_id; 2151 1753 create_attr.btf_value_type_id = map->btf_value_type_id; 2152 1754 } 2153 1755 2154 1756 *pfd = bpf_create_map_xattr(&create_attr); 2155 - if (*pfd < 0 && create_attr.btf_fd >= 0) { 1757 + if (*pfd < 0 && (create_attr.btf_key_type_id || 1758 + create_attr.btf_value_type_id)) { 2156 1759 cp = libbpf_strerror_r(errno, errmsg, sizeof(errmsg)); 2157 1760 pr_warning("Error in bpf_create_map_xattr(%s):%s(%d). Retrying without BTF.\n", 2158 1761 map->name, cp, errno); 2159 - create_attr.btf_fd = -1; 1762 + create_attr.btf_fd = 0; 2160 1763 create_attr.btf_key_type_id = 0; 2161 1764 create_attr.btf_value_type_id = 0; 2162 1765 map->btf_key_type_id = 0; ··· 2448 2049 load_attr.license = license; 2449 2050 load_attr.kern_version = kern_version; 2450 2051 load_attr.prog_ifindex = prog->prog_ifindex; 2451 - load_attr.prog_btf_fd = prog->btf_fd; 2052 + load_attr.prog_btf_fd = prog->btf_fd >= 0 ? 
prog->btf_fd : 0; 2452 2053 load_attr.func_info = prog->func_info; 2453 2054 load_attr.func_info_rec_size = prog->func_info_rec_size; 2454 2055 load_attr.func_info_cnt = prog->func_info_cnt; ··· 2594 2195 return err; 2595 2196 } 2596 2197 2597 - static bool bpf_program__is_function_storage(struct bpf_program *prog, 2598 - struct bpf_object *obj) 2198 + static bool bpf_program__is_function_storage(const struct bpf_program *prog, 2199 + const struct bpf_object *obj) 2599 2200 { 2600 2201 return prog->idx == obj->efile.text_shndx && obj->has_pseudo_calls; 2601 2202 } ··· 3301 2902 return next; 3302 2903 } 3303 2904 3304 - const char *bpf_object__name(struct bpf_object *obj) 2905 + const char *bpf_object__name(const struct bpf_object *obj) 3305 2906 { 3306 2907 return obj ? obj->path : ERR_PTR(-EINVAL); 3307 2908 } 3308 2909 3309 - unsigned int bpf_object__kversion(struct bpf_object *obj) 2910 + unsigned int bpf_object__kversion(const struct bpf_object *obj) 3310 2911 { 3311 2912 return obj ? obj->kern_version : 0; 3312 2913 } 3313 2914 3314 - struct btf *bpf_object__btf(struct bpf_object *obj) 2915 + struct btf *bpf_object__btf(const struct bpf_object *obj) 3315 2916 { 3316 2917 return obj ? obj->btf : NULL; 3317 2918 } ··· 3332 2933 return 0; 3333 2934 } 3334 2935 3335 - void *bpf_object__priv(struct bpf_object *obj) 2936 + void *bpf_object__priv(const struct bpf_object *obj) 3336 2937 { 3337 2938 return obj ? 
obj->priv : ERR_PTR(-EINVAL); 3338 2939 } 3339 2940 3340 2941 static struct bpf_program * 3341 - __bpf_program__iter(struct bpf_program *p, struct bpf_object *obj, bool forward) 2942 + __bpf_program__iter(const struct bpf_program *p, const struct bpf_object *obj, 2943 + bool forward) 3342 2944 { 3343 2945 size_t nr_programs = obj->nr_programs; 3344 2946 ssize_t idx; ··· 3364 2964 } 3365 2965 3366 2966 struct bpf_program * 3367 - bpf_program__next(struct bpf_program *prev, struct bpf_object *obj) 2967 + bpf_program__next(struct bpf_program *prev, const struct bpf_object *obj) 3368 2968 { 3369 2969 struct bpf_program *prog = prev; 3370 2970 ··· 3376 2976 } 3377 2977 3378 2978 struct bpf_program * 3379 - bpf_program__prev(struct bpf_program *next, struct bpf_object *obj) 2979 + bpf_program__prev(struct bpf_program *next, const struct bpf_object *obj) 3380 2980 { 3381 2981 struct bpf_program *prog = next; 3382 2982 ··· 3398 2998 return 0; 3399 2999 } 3400 3000 3401 - void *bpf_program__priv(struct bpf_program *prog) 3001 + void *bpf_program__priv(const struct bpf_program *prog) 3402 3002 { 3403 3003 return prog ? 
prog->priv : ERR_PTR(-EINVAL); 3404 3004 } ··· 3408 3008 prog->prog_ifindex = ifindex; 3409 3009 } 3410 3010 3411 - const char *bpf_program__title(struct bpf_program *prog, bool needs_copy) 3011 + const char *bpf_program__title(const struct bpf_program *prog, bool needs_copy) 3412 3012 { 3413 3013 const char *title; 3414 3014 ··· 3424 3024 return title; 3425 3025 } 3426 3026 3427 - int bpf_program__fd(struct bpf_program *prog) 3027 + int bpf_program__fd(const struct bpf_program *prog) 3428 3028 { 3429 3029 return bpf_program__nth_fd(prog, 0); 3430 3030 } ··· 3457 3057 return 0; 3458 3058 } 3459 3059 3460 - int bpf_program__nth_fd(struct bpf_program *prog, int n) 3060 + int bpf_program__nth_fd(const struct bpf_program *prog, int n) 3461 3061 { 3462 3062 int fd; 3463 3063 ··· 3485 3085 prog->type = type; 3486 3086 } 3487 3087 3488 - static bool bpf_program__is_type(struct bpf_program *prog, 3088 + static bool bpf_program__is_type(const struct bpf_program *prog, 3489 3089 enum bpf_prog_type type) 3490 3090 { 3491 3091 return prog ? 
(prog->type == type) : false; 3492 3092 } 3493 3093 3494 - #define BPF_PROG_TYPE_FNS(NAME, TYPE) \ 3495 - int bpf_program__set_##NAME(struct bpf_program *prog) \ 3496 - { \ 3497 - if (!prog) \ 3498 - return -EINVAL; \ 3499 - bpf_program__set_type(prog, TYPE); \ 3500 - return 0; \ 3501 - } \ 3502 - \ 3503 - bool bpf_program__is_##NAME(struct bpf_program *prog) \ 3504 - { \ 3505 - return bpf_program__is_type(prog, TYPE); \ 3506 - } \ 3094 + #define BPF_PROG_TYPE_FNS(NAME, TYPE) \ 3095 + int bpf_program__set_##NAME(struct bpf_program *prog) \ 3096 + { \ 3097 + if (!prog) \ 3098 + return -EINVAL; \ 3099 + bpf_program__set_type(prog, TYPE); \ 3100 + return 0; \ 3101 + } \ 3102 + \ 3103 + bool bpf_program__is_##NAME(const struct bpf_program *prog) \ 3104 + { \ 3105 + return bpf_program__is_type(prog, TYPE); \ 3106 + } \ 3507 3107 3508 3108 BPF_PROG_TYPE_FNS(socket_filter, BPF_PROG_TYPE_SOCKET_FILTER); 3509 3109 BPF_PROG_TYPE_FNS(kprobe, BPF_PROG_TYPE_KPROBE); ··· 3702 3302 expected_attach_type); 3703 3303 } 3704 3304 3705 - int bpf_map__fd(struct bpf_map *map) 3305 + int bpf_map__fd(const struct bpf_map *map) 3706 3306 { 3707 3307 return map ? map->fd : -EINVAL; 3708 3308 } 3709 3309 3710 - const struct bpf_map_def *bpf_map__def(struct bpf_map *map) 3310 + const struct bpf_map_def *bpf_map__def(const struct bpf_map *map) 3711 3311 { 3712 3312 return map ? &map->def : ERR_PTR(-EINVAL); 3713 3313 } 3714 3314 3715 - const char *bpf_map__name(struct bpf_map *map) 3315 + const char *bpf_map__name(const struct bpf_map *map) 3716 3316 { 3717 3317 return map ? map->name : NULL; 3718 3318 } ··· 3743 3343 return 0; 3744 3344 } 3745 3345 3746 - void *bpf_map__priv(struct bpf_map *map) 3346 + void *bpf_map__priv(const struct bpf_map *map) 3747 3347 { 3748 3348 return map ? 
map->priv : ERR_PTR(-EINVAL); 3749 3349 } 3750 3350 3751 - bool bpf_map__is_offload_neutral(struct bpf_map *map) 3351 + bool bpf_map__is_offload_neutral(const struct bpf_map *map) 3752 3352 { 3753 3353 return map->def.type == BPF_MAP_TYPE_PERF_EVENT_ARRAY; 3754 3354 } 3755 3355 3756 - bool bpf_map__is_internal(struct bpf_map *map) 3356 + bool bpf_map__is_internal(const struct bpf_map *map) 3757 3357 { 3758 3358 return map->libbpf_type != LIBBPF_MAP_UNSPEC; 3759 3359 } ··· 3778 3378 } 3779 3379 3780 3380 static struct bpf_map * 3781 - __bpf_map__iter(struct bpf_map *m, struct bpf_object *obj, int i) 3381 + __bpf_map__iter(const struct bpf_map *m, const struct bpf_object *obj, int i) 3782 3382 { 3783 3383 ssize_t idx; 3784 3384 struct bpf_map *s, *e; ··· 3802 3402 } 3803 3403 3804 3404 struct bpf_map * 3805 - bpf_map__next(struct bpf_map *prev, struct bpf_object *obj) 3405 + bpf_map__next(const struct bpf_map *prev, const struct bpf_object *obj) 3806 3406 { 3807 3407 if (prev == NULL) 3808 3408 return obj->maps; ··· 3811 3411 } 3812 3412 3813 3413 struct bpf_map * 3814 - bpf_map__prev(struct bpf_map *next, struct bpf_object *obj) 3414 + bpf_map__prev(const struct bpf_map *next, const struct bpf_object *obj) 3815 3415 { 3816 3416 if (next == NULL) { 3817 3417 if (!obj->nr_maps) ··· 3823 3423 } 3824 3424 3825 3425 struct bpf_map * 3826 - bpf_object__find_map_by_name(struct bpf_object *obj, const char *name) 3426 + bpf_object__find_map_by_name(const struct bpf_object *obj, const char *name) 3827 3427 { 3828 3428 struct bpf_map *pos; 3829 3429 ··· 3835 3435 } 3836 3436 3837 3437 int 3838 - bpf_object__find_map_fd_by_name(struct bpf_object *obj, const char *name) 3438 + bpf_object__find_map_fd_by_name(const struct bpf_object *obj, const char *name) 3839 3439 { 3840 3440 return bpf_map__fd(bpf_object__find_map_by_name(obj, name)); 3841 3441 } ··· 3843 3443 struct bpf_map * 3844 3444 bpf_object__find_map_by_offset(struct bpf_object *obj, size_t offset) 3845 3445 { 3846 - 
int i; 3847 - 3848 - for (i = 0; i < obj->nr_maps; i++) { 3849 - if (obj->maps[i].offset == offset) 3850 - return &obj->maps[i]; 3851 - } 3852 - return ERR_PTR(-ENOENT); 3446 + return ERR_PTR(-ENOTSUP); 3853 3447 } 3854 3448 3855 3449 long libbpf_get_error(const void *ptr) ··· 4228 3834 bpf_prog_info_set_offset_u64(&info_linear->info, 4229 3835 desc->array_offset, addr); 4230 3836 } 3837 + } 3838 + 3839 + int libbpf_num_possible_cpus(void) 3840 + { 3841 + static const char *fcpu = "/sys/devices/system/cpu/possible"; 3842 + int len = 0, n = 0, il = 0, ir = 0; 3843 + unsigned int start = 0, end = 0; 3844 + static int cpus; 3845 + char buf[128]; 3846 + int error = 0; 3847 + int fd = -1; 3848 + 3849 + if (cpus > 0) 3850 + return cpus; 3851 + 3852 + fd = open(fcpu, O_RDONLY); 3853 + if (fd < 0) { 3854 + error = errno; 3855 + pr_warning("Failed to open file %s: %s\n", 3856 + fcpu, strerror(error)); 3857 + return -error; 3858 + } 3859 + len = read(fd, buf, sizeof(buf)); 3860 + close(fd); 3861 + if (len <= 0) { 3862 + error = len ? errno : EINVAL; 3863 + pr_warning("Failed to read # of possible cpus from %s: %s\n", 3864 + fcpu, strerror(error)); 3865 + return -error; 3866 + } 3867 + if (len == sizeof(buf)) { 3868 + pr_warning("File %s size overflow\n", fcpu); 3869 + return -EOVERFLOW; 3870 + } 3871 + buf[len] = '\0'; 3872 + 3873 + for (ir = 0, cpus = 0; ir <= len; ir++) { 3874 + /* Each sub string separated by ',' has format \d+-\d+ or \d+ */ 3875 + if (buf[ir] == ',' || buf[ir] == '\0') { 3876 + buf[ir] = '\0'; 3877 + n = sscanf(&buf[il], "%u-%u", &start, &end); 3878 + if (n <= 0) { 3879 + pr_warning("Failed to get # CPUs from %s\n", 3880 + &buf[il]); 3881 + return -EINVAL; 3882 + } else if (n == 1) { 3883 + end = start; 3884 + } 3885 + cpus += end - start + 1; 3886 + il = ir + 1; 3887 + } 3888 + } 3889 + if (cpus <= 0) { 3890 + pr_warning("Invalid #CPUs %d from %s\n", cpus, fcpu); 3891 + return -EINVAL; 3892 + } 3893 + return cpus; 4231 3894 }
+47 -31
tools/lib/bpf/libbpf.h
··· 98 98 LIBBPF_API int bpf_object__load(struct bpf_object *obj); 99 99 LIBBPF_API int bpf_object__load_xattr(struct bpf_object_load_attr *attr); 100 100 LIBBPF_API int bpf_object__unload(struct bpf_object *obj); 101 - LIBBPF_API const char *bpf_object__name(struct bpf_object *obj); 102 - LIBBPF_API unsigned int bpf_object__kversion(struct bpf_object *obj); 101 + LIBBPF_API const char *bpf_object__name(const struct bpf_object *obj); 102 + LIBBPF_API unsigned int bpf_object__kversion(const struct bpf_object *obj); 103 103 104 104 struct btf; 105 - LIBBPF_API struct btf *bpf_object__btf(struct bpf_object *obj); 105 + LIBBPF_API struct btf *bpf_object__btf(const struct bpf_object *obj); 106 106 LIBBPF_API int bpf_object__btf_fd(const struct bpf_object *obj); 107 107 108 108 LIBBPF_API struct bpf_program * 109 - bpf_object__find_program_by_title(struct bpf_object *obj, const char *title); 109 + bpf_object__find_program_by_title(const struct bpf_object *obj, 110 + const char *title); 110 111 111 112 LIBBPF_API struct bpf_object *bpf_object__next(struct bpf_object *prev); 112 113 #define bpf_object__for_each_safe(pos, tmp) \ ··· 119 118 typedef void (*bpf_object_clear_priv_t)(struct bpf_object *, void *); 120 119 LIBBPF_API int bpf_object__set_priv(struct bpf_object *obj, void *priv, 121 120 bpf_object_clear_priv_t clear_priv); 122 - LIBBPF_API void *bpf_object__priv(struct bpf_object *prog); 121 + LIBBPF_API void *bpf_object__priv(const struct bpf_object *prog); 123 122 124 123 LIBBPF_API int 125 124 libbpf_prog_type_by_name(const char *name, enum bpf_prog_type *prog_type, ··· 130 129 /* Accessors of bpf_program */ 131 130 struct bpf_program; 132 131 LIBBPF_API struct bpf_program *bpf_program__next(struct bpf_program *prog, 133 - struct bpf_object *obj); 132 + const struct bpf_object *obj); 134 133 135 134 #define bpf_object__for_each_program(pos, obj) \ 136 135 for ((pos) = bpf_program__next(NULL, (obj)); \ ··· 138 137 (pos) = bpf_program__next((pos), (obj))) 139 138 
140 139 LIBBPF_API struct bpf_program *bpf_program__prev(struct bpf_program *prog, 141 - struct bpf_object *obj); 140 + const struct bpf_object *obj); 142 141 143 - typedef void (*bpf_program_clear_priv_t)(struct bpf_program *, 144 - void *); 142 + typedef void (*bpf_program_clear_priv_t)(struct bpf_program *, void *); 145 143 146 144 LIBBPF_API int bpf_program__set_priv(struct bpf_program *prog, void *priv, 147 145 bpf_program_clear_priv_t clear_priv); 148 146 149 - LIBBPF_API void *bpf_program__priv(struct bpf_program *prog); 147 + LIBBPF_API void *bpf_program__priv(const struct bpf_program *prog); 150 148 LIBBPF_API void bpf_program__set_ifindex(struct bpf_program *prog, 151 149 __u32 ifindex); 152 150 153 - LIBBPF_API const char *bpf_program__title(struct bpf_program *prog, 151 + LIBBPF_API const char *bpf_program__title(const struct bpf_program *prog, 154 152 bool needs_copy); 155 153 156 154 LIBBPF_API int bpf_program__load(struct bpf_program *prog, char *license, 157 155 __u32 kern_version); 158 - LIBBPF_API int bpf_program__fd(struct bpf_program *prog); 156 + LIBBPF_API int bpf_program__fd(const struct bpf_program *prog); 159 157 LIBBPF_API int bpf_program__pin_instance(struct bpf_program *prog, 160 158 const char *path, 161 159 int instance); ··· 227 227 LIBBPF_API int bpf_program__set_prep(struct bpf_program *prog, int nr_instance, 228 228 bpf_program_prep_t prep); 229 229 230 - LIBBPF_API int bpf_program__nth_fd(struct bpf_program *prog, int n); 230 + LIBBPF_API int bpf_program__nth_fd(const struct bpf_program *prog, int n); 231 231 232 232 /* 233 233 * Adjust type of BPF program. Default is kprobe. 
··· 246 246 bpf_program__set_expected_attach_type(struct bpf_program *prog, 247 247 enum bpf_attach_type type); 248 248 249 - LIBBPF_API bool bpf_program__is_socket_filter(struct bpf_program *prog); 250 - LIBBPF_API bool bpf_program__is_tracepoint(struct bpf_program *prog); 251 - LIBBPF_API bool bpf_program__is_raw_tracepoint(struct bpf_program *prog); 252 - LIBBPF_API bool bpf_program__is_kprobe(struct bpf_program *prog); 253 - LIBBPF_API bool bpf_program__is_sched_cls(struct bpf_program *prog); 254 - LIBBPF_API bool bpf_program__is_sched_act(struct bpf_program *prog); 255 - LIBBPF_API bool bpf_program__is_xdp(struct bpf_program *prog); 256 - LIBBPF_API bool bpf_program__is_perf_event(struct bpf_program *prog); 249 + LIBBPF_API bool bpf_program__is_socket_filter(const struct bpf_program *prog); 250 + LIBBPF_API bool bpf_program__is_tracepoint(const struct bpf_program *prog); 251 + LIBBPF_API bool bpf_program__is_raw_tracepoint(const struct bpf_program *prog); 252 + LIBBPF_API bool bpf_program__is_kprobe(const struct bpf_program *prog); 253 + LIBBPF_API bool bpf_program__is_sched_cls(const struct bpf_program *prog); 254 + LIBBPF_API bool bpf_program__is_sched_act(const struct bpf_program *prog); 255 + LIBBPF_API bool bpf_program__is_xdp(const struct bpf_program *prog); 256 + LIBBPF_API bool bpf_program__is_perf_event(const struct bpf_program *prog); 257 257 258 258 /* 259 259 * No need for __attribute__((packed)), all members of 'bpf_map_def' ··· 275 275 */ 276 276 struct bpf_map; 277 277 LIBBPF_API struct bpf_map * 278 - bpf_object__find_map_by_name(struct bpf_object *obj, const char *name); 278 + bpf_object__find_map_by_name(const struct bpf_object *obj, const char *name); 279 279 280 280 LIBBPF_API int 281 - bpf_object__find_map_fd_by_name(struct bpf_object *obj, const char *name); 281 + bpf_object__find_map_fd_by_name(const struct bpf_object *obj, const char *name); 282 282 283 283 /* 284 284 * Get bpf_map through the offset of corresponding struct bpf_map_def 
··· 288 288 bpf_object__find_map_by_offset(struct bpf_object *obj, size_t offset); 289 289 290 290 LIBBPF_API struct bpf_map * 291 - bpf_map__next(struct bpf_map *map, struct bpf_object *obj); 291 + bpf_map__next(const struct bpf_map *map, const struct bpf_object *obj); 292 292 #define bpf_object__for_each_map(pos, obj) \ 293 293 for ((pos) = bpf_map__next(NULL, (obj)); \ 294 294 (pos) != NULL; \ ··· 296 296 #define bpf_map__for_each bpf_object__for_each_map 297 297 298 298 LIBBPF_API struct bpf_map * 299 - bpf_map__prev(struct bpf_map *map, struct bpf_object *obj); 299 + bpf_map__prev(const struct bpf_map *map, const struct bpf_object *obj); 300 300 301 - LIBBPF_API int bpf_map__fd(struct bpf_map *map); 302 - LIBBPF_API const struct bpf_map_def *bpf_map__def(struct bpf_map *map); 303 - LIBBPF_API const char *bpf_map__name(struct bpf_map *map); 301 + LIBBPF_API int bpf_map__fd(const struct bpf_map *map); 302 + LIBBPF_API const struct bpf_map_def *bpf_map__def(const struct bpf_map *map); 303 + LIBBPF_API const char *bpf_map__name(const struct bpf_map *map); 304 304 LIBBPF_API __u32 bpf_map__btf_key_type_id(const struct bpf_map *map); 305 305 LIBBPF_API __u32 bpf_map__btf_value_type_id(const struct bpf_map *map); 306 306 307 307 typedef void (*bpf_map_clear_priv_t)(struct bpf_map *, void *); 308 308 LIBBPF_API int bpf_map__set_priv(struct bpf_map *map, void *priv, 309 309 bpf_map_clear_priv_t clear_priv); 310 - LIBBPF_API void *bpf_map__priv(struct bpf_map *map); 310 + LIBBPF_API void *bpf_map__priv(const struct bpf_map *map); 311 311 LIBBPF_API int bpf_map__reuse_fd(struct bpf_map *map, int fd); 312 312 LIBBPF_API int bpf_map__resize(struct bpf_map *map, __u32 max_entries); 313 - LIBBPF_API bool bpf_map__is_offload_neutral(struct bpf_map *map); 314 - LIBBPF_API bool bpf_map__is_internal(struct bpf_map *map); 313 + LIBBPF_API bool bpf_map__is_offload_neutral(const struct bpf_map *map); 314 + LIBBPF_API bool bpf_map__is_internal(const struct bpf_map *map); 315 315 
LIBBPF_API void bpf_map__set_ifindex(struct bpf_map *map, __u32 ifindex); 316 316 LIBBPF_API int bpf_map__pin(struct bpf_map *map, const char *path); 317 317 LIBBPF_API int bpf_map__unpin(struct bpf_map *map, const char *path); ··· 453 453 454 454 LIBBPF_API void 455 455 bpf_program__bpil_offs_to_addr(struct bpf_prog_info_linear *info_linear); 456 + 457 + /* 458 + * A helper function to get the number of possible CPUs before looking up 459 + * per-CPU maps. Negative errno is returned on failure. 460 + * 461 + * Example usage: 462 + * 463 + * int ncpus = libbpf_num_possible_cpus(); 464 + * if (ncpus < 0) { 465 + * // error handling 466 + * } 467 + * long values[ncpus]; 468 + * bpf_map_lookup_elem(per_cpu_map_fd, key, values); 469 + * 470 + */ 471 + LIBBPF_API int libbpf_num_possible_cpus(void); 456 472 457 473 #ifdef __cplusplus 458 474 } /* extern "C" */
+1
tools/lib/bpf/libbpf.map
··· 172 172 btf_dump__new; 173 173 btf__parse_elf; 174 174 bpf_object__load_xattr; 175 + libbpf_num_possible_cpus; 175 176 } LIBBPF_0.0.3;
+7
tools/lib/bpf/libbpf_internal.h
··· 23 23 #define BTF_PARAM_ENC(name, type) (name), (type) 24 24 #define BTF_VAR_SECINFO_ENC(type, offset, size) (type), (offset), (size) 25 25 26 + #ifndef min 27 + # define min(x, y) ((x) < (y) ? (x) : (y)) 28 + #endif 29 + #ifndef max 30 + # define max(x, y) ((x) < (y) ? (y) : (x)) 31 + #endif 32 + 26 33 extern void libbpf_print(enum libbpf_print_level level, 27 34 const char *format, ...) 28 35 __attribute__((format(printf, 2, 3)));
+28 -75
tools/lib/bpf/xsk.c
··· 60 60 struct xsk_umem *umem; 61 61 struct xsk_socket_config config; 62 62 int fd; 63 - int xsks_map; 64 63 int ifindex; 65 64 int prog_fd; 66 - int qidconf_map_fd; 67 65 int xsks_map_fd; 68 66 __u32 queue_id; 69 67 char ifname[IFNAMSIZ]; ··· 263 265 /* This is the C-program: 264 266 * SEC("xdp_sock") int xdp_sock_prog(struct xdp_md *ctx) 265 267 * { 266 - * int *qidconf, index = ctx->rx_queue_index; 268 + * int index = ctx->rx_queue_index; 267 269 * 268 270 * // A set entry here means that the correspnding queue_id 269 271 * // has an active AF_XDP socket bound to it. 270 - * qidconf = bpf_map_lookup_elem(&qidconf_map, &index); 271 - * if (!qidconf) 272 - * return XDP_ABORTED; 273 - * 274 - * if (*qidconf) 272 + * if (bpf_map_lookup_elem(&xsks_map, &index)) 275 273 * return bpf_redirect_map(&xsks_map, index, 0); 276 274 * 277 275 * return XDP_PASS; ··· 280 286 BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_1, -4), 281 287 BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), 282 288 BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), 283 - BPF_LD_MAP_FD(BPF_REG_1, xsk->qidconf_map_fd), 289 + BPF_LD_MAP_FD(BPF_REG_1, xsk->xsks_map_fd), 284 290 BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), 285 291 BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), 286 - BPF_MOV32_IMM(BPF_REG_0, 0), 287 - /* if r1 == 0 goto +8 */ 288 - BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 8), 289 292 BPF_MOV32_IMM(BPF_REG_0, 2), 290 - /* r1 = *(u32 *)(r1 + 0) */ 291 - BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_1, 0), 292 293 /* if r1 == 0 goto +5 */ 293 294 BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 5), 294 295 /* r2 = *(u32 *)(r10 - 4) */ ··· 355 366 if (max_queues < 0) 356 367 return max_queues; 357 368 358 - fd = bpf_create_map_name(BPF_MAP_TYPE_ARRAY, "qidconf_map", 369 + fd = bpf_create_map_name(BPF_MAP_TYPE_XSKMAP, "xsks_map", 359 370 sizeof(int), sizeof(int), max_queues, 0); 360 371 if (fd < 0) 361 372 return fd; 362 - xsk->qidconf_map_fd = fd; 363 373 364 - fd = bpf_create_map_name(BPF_MAP_TYPE_XSKMAP, "xsks_map", 365 - sizeof(int), sizeof(int), max_queues, 
0); 366 - if (fd < 0) { 367 - close(xsk->qidconf_map_fd); 368 - return fd; 369 - } 370 374 xsk->xsks_map_fd = fd; 371 375 372 376 return 0; ··· 367 385 368 386 static void xsk_delete_bpf_maps(struct xsk_socket *xsk) 369 387 { 370 - close(xsk->qidconf_map_fd); 388 + bpf_map_delete_elem(xsk->xsks_map_fd, &xsk->queue_id); 371 389 close(xsk->xsks_map_fd); 372 - xsk->qidconf_map_fd = -1; 373 - xsk->xsks_map_fd = -1; 374 390 } 375 391 376 392 static int xsk_lookup_bpf_maps(struct xsk_socket *xsk) ··· 397 417 if (err) 398 418 goto out_map_ids; 399 419 400 - for (i = 0; i < prog_info.nr_map_ids; i++) { 401 - if (xsk->qidconf_map_fd != -1 && xsk->xsks_map_fd != -1) 402 - break; 420 + xsk->xsks_map_fd = -1; 403 421 422 + for (i = 0; i < prog_info.nr_map_ids; i++) { 404 423 fd = bpf_map_get_fd_by_id(map_ids[i]); 405 424 if (fd < 0) 406 425 continue; ··· 407 428 err = bpf_obj_get_info_by_fd(fd, &map_info, &map_len); 408 429 if (err) { 409 430 close(fd); 410 - continue; 411 - } 412 - 413 - if (!strcmp(map_info.name, "qidconf_map")) { 414 - xsk->qidconf_map_fd = fd; 415 431 continue; 416 432 } 417 433 ··· 419 445 } 420 446 421 447 err = 0; 422 - if (xsk->qidconf_map_fd < 0 || xsk->xsks_map_fd < 0) { 448 + if (xsk->xsks_map_fd == -1) 423 449 err = -ENOENT; 424 - xsk_delete_bpf_maps(xsk); 425 - } 426 450 427 451 out_map_ids: 428 452 free(map_ids); 429 453 return err; 430 454 } 431 455 432 - static void xsk_clear_bpf_maps(struct xsk_socket *xsk) 433 - { 434 - int qid = false; 435 - 436 - bpf_map_update_elem(xsk->qidconf_map_fd, &xsk->queue_id, &qid, 0); 437 - bpf_map_delete_elem(xsk->xsks_map_fd, &xsk->queue_id); 438 - } 439 - 440 456 static int xsk_set_bpf_maps(struct xsk_socket *xsk) 441 457 { 442 - int qid = true, fd = xsk->fd, err; 443 - 444 - err = bpf_map_update_elem(xsk->qidconf_map_fd, &xsk->queue_id, &qid, 0); 445 - if (err) 446 - goto out; 447 - 448 - err = bpf_map_update_elem(xsk->xsks_map_fd, &xsk->queue_id, &fd, 0); 449 - if (err) 450 - goto out; 451 - 452 - return 0; 
453 - out: 454 - xsk_clear_bpf_maps(xsk); 455 - return err; 458 + return bpf_map_update_elem(xsk->xsks_map_fd, &xsk->queue_id, 459 + &xsk->fd, 0); 456 460 } 457 461 458 462 static int xsk_setup_xdp_prog(struct xsk_socket *xsk) ··· 449 497 return err; 450 498 451 499 err = xsk_load_xdp_prog(xsk); 452 - if (err) 453 - goto out_maps; 500 + if (err) { 501 + xsk_delete_bpf_maps(xsk); 502 + return err; 503 + } 454 504 } else { 455 505 xsk->prog_fd = bpf_prog_get_fd_by_id(prog_id); 456 506 err = xsk_lookup_bpf_maps(xsk); 457 - if (err) 458 - goto out_load; 507 + if (err) { 508 + close(xsk->prog_fd); 509 + return err; 510 + } 459 511 } 460 512 461 513 err = xsk_set_bpf_maps(xsk); 462 - if (err) 463 - goto out_load; 514 + if (err) { 515 + xsk_delete_bpf_maps(xsk); 516 + close(xsk->prog_fd); 517 + return err; 518 + } 464 519 465 520 return 0; 466 - 467 - out_load: 468 - close(xsk->prog_fd); 469 - out_maps: 470 - xsk_delete_bpf_maps(xsk); 471 - return err; 472 521 } 473 522 474 523 int xsk_socket__create(struct xsk_socket **xsk_ptr, const char *ifname, ··· 596 643 goto out_mmap_tx; 597 644 } 598 645 599 - xsk->qidconf_map_fd = -1; 600 - xsk->xsks_map_fd = -1; 601 - 646 + xsk->prog_fd = -1; 602 647 if (!(xsk->config.libbpf_flags & XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD)) { 603 648 err = xsk_setup_xdp_prog(xsk); 604 649 if (err) ··· 659 708 if (!xsk) 660 709 return; 661 710 662 - xsk_clear_bpf_maps(xsk); 663 - xsk_delete_bpf_maps(xsk); 711 + if (xsk->prog_fd != -1) { 712 + xsk_delete_bpf_maps(xsk); 713 + close(xsk->prog_fd); 714 + } 664 715 665 716 optlen = sizeof(off); 666 717 err = getsockopt(xsk->fd, SOL_XDP, XDP_MMAP_OFFSETS, &off, &optlen);
+2 -1
tools/testing/selftests/bpf/Makefile
··· 280 280 ) > $(VERIFIER_TESTS_H)) 281 281 282 282 EXTRA_CLEAN := $(TEST_CUSTOM_PROGS) $(ALU32_BUILD_DIR) \ 283 - $(VERIFIER_TESTS_H) $(PROG_TESTS_H) $(MAP_TESTS_H) 283 + $(VERIFIER_TESTS_H) $(PROG_TESTS_H) $(MAP_TESTS_H) \ 284 + feature
+1
tools/testing/selftests/bpf/bpf_endian.h
··· 2 2 #ifndef __BPF_ENDIAN__ 3 3 #define __BPF_ENDIAN__ 4 4 5 + #include <linux/stddef.h> 5 6 #include <linux/swab.h> 6 7 7 8 /* LLVM's BPF target selects the endianness of the CPU
+2 -2
tools/testing/selftests/bpf/bpf_helpers.h
··· 31 31 (void *) BPF_FUNC_map_pop_elem; 32 32 static int (*bpf_map_peek_elem)(void *map, void *value) = 33 33 (void *) BPF_FUNC_map_peek_elem; 34 - static int (*bpf_probe_read)(void *dst, int size, void *unsafe_ptr) = 34 + static int (*bpf_probe_read)(void *dst, int size, const void *unsafe_ptr) = 35 35 (void *) BPF_FUNC_probe_read; 36 36 static unsigned long long (*bpf_ktime_get_ns)(void) = 37 37 (void *) BPF_FUNC_ktime_get_ns; ··· 62 62 (void *) BPF_FUNC_perf_event_output; 63 63 static int (*bpf_get_stackid)(void *ctx, void *map, int flags) = 64 64 (void *) BPF_FUNC_get_stackid; 65 - static int (*bpf_probe_write_user)(void *dst, void *src, int size) = 65 + static int (*bpf_probe_write_user)(void *dst, const void *src, int size) = 66 66 (void *) BPF_FUNC_probe_write_user; 67 67 static int (*bpf_current_task_under_cgroup)(void *map, int index) = 68 68 (void *) BPF_FUNC_current_task_under_cgroup;
+5 -32
tools/testing/selftests/bpf/bpf_util.h
··· 6 6 #include <stdlib.h> 7 7 #include <string.h> 8 8 #include <errno.h> 9 + #include <libbpf.h> /* libbpf_num_possible_cpus */ 9 10 10 11 static inline unsigned int bpf_num_possible_cpus(void) 11 12 { 12 - static const char *fcpu = "/sys/devices/system/cpu/possible"; 13 - unsigned int start, end, possible_cpus = 0; 14 - char buff[128]; 15 - FILE *fp; 16 - int len, n, i, j = 0; 13 + int possible_cpus = libbpf_num_possible_cpus(); 17 14 18 - fp = fopen(fcpu, "r"); 19 - if (!fp) { 20 - printf("Failed to open %s: '%s'!\n", fcpu, strerror(errno)); 15 + if (possible_cpus < 0) { 16 + printf("Failed to get # of possible cpus: '%s'!\n", 17 + strerror(-possible_cpus)); 21 18 exit(1); 22 19 } 23 - 24 - if (!fgets(buff, sizeof(buff), fp)) { 25 - printf("Failed to read %s!\n", fcpu); 26 - exit(1); 27 - } 28 - 29 - len = strlen(buff); 30 - for (i = 0; i <= len; i++) { 31 - if (buff[i] == ',' || buff[i] == '\0') { 32 - buff[i] = '\0'; 33 - n = sscanf(&buff[j], "%u-%u", &start, &end); 34 - if (n <= 0) { 35 - printf("Failed to retrieve # possible CPUs!\n"); 36 - exit(1); 37 - } else if (n == 1) { 38 - end = start; 39 - } 40 - possible_cpus += end - start + 1; 41 - j = i + 1; 42 - } 43 - } 44 - 45 - fclose(fp); 46 - 47 20 return possible_cpus; 48 21 } 49 22
+1 -1
tools/testing/selftests/bpf/cgroup_helpers.c
··· 47 47 char buf[PATH_MAX]; 48 48 char *c, *c2; 49 49 int fd, cfd; 50 - size_t len; 50 + ssize_t len; 51 51 52 52 snprintf(path, sizeof(path), "%s/cgroup.controllers", cgroup_path); 53 53 fd = open(path, O_RDONLY);
+56 -11
tools/testing/selftests/bpf/prog_tests/bpf_verif_scale.c
··· 5 5 const char *format, va_list args) 6 6 { 7 7 if (level != LIBBPF_DEBUG) 8 - return 0; 8 + return vfprintf(stderr, format, args); 9 9 10 10 if (!strstr(format, "verifier log")) 11 11 return 0; ··· 32 32 33 33 void test_bpf_verif_scale(void) 34 34 { 35 - const char *scale[] = { 36 - "./test_verif_scale1.o", "./test_verif_scale2.o", "./test_verif_scale3.o" 35 + const char *sched_cls[] = { 36 + "./test_verif_scale1.o", "./test_verif_scale2.o", "./test_verif_scale3.o", 37 37 }; 38 - const char *pyperf[] = { 39 - "./pyperf50.o", "./pyperf100.o", "./pyperf180.o" 38 + const char *raw_tp[] = { 39 + /* full unroll by llvm */ 40 + "./pyperf50.o", "./pyperf100.o", "./pyperf180.o", 41 + 42 + /* partial unroll. llvm will unroll loop ~150 times. 43 + * C loop count -> 600. 44 + * Asm loop count -> 4. 45 + * 16k insns in loop body. 46 + * Total of 5 such loops. Total program size ~82k insns. 47 + */ 48 + "./pyperf600.o", 49 + 50 + /* no unroll at all. 51 + * C loop count -> 600. 52 + * ASM loop count -> 600. 53 + * ~110 insns in loop body. 54 + * Total of 5 such loops. Total program size ~1500 insns. 55 + */ 56 + "./pyperf600_nounroll.o", 57 + 58 + "./loop1.o", "./loop2.o", 59 + 60 + /* partial unroll. 19k insn in a loop. 61 + * Total program size 20.8k insn. 62 + * ~350k processed_insns 63 + */ 64 + "./strobemeta.o", 65 + 66 + /* no unroll, tiny loops */ 67 + "./strobemeta_nounroll1.o", 68 + "./strobemeta_nounroll2.o", 69 + }; 70 + const char *cg_sysctl[] = { 71 + "./test_sysctl_loop1.o", "./test_sysctl_loop2.o", 40 72 }; 41 73 int err, i; 42 74 43 75 if (verifier_stats) 44 76 libbpf_set_print(libbpf_debug_print); 45 77 46 - for (i = 0; i < ARRAY_SIZE(scale); i++) { 47 - err = check_load(scale[i], BPF_PROG_TYPE_SCHED_CLS); 48 - printf("test_scale:%s:%s\n", scale[i], err ? "FAIL" : "OK"); 78 + err = check_load("./loop3.o", BPF_PROG_TYPE_RAW_TRACEPOINT); 79 + printf("test_scale:loop3:%s\n", err ? 
(error_cnt--, "OK") : "FAIL"); 80 + 81 + for (i = 0; i < ARRAY_SIZE(sched_cls); i++) { 82 + err = check_load(sched_cls[i], BPF_PROG_TYPE_SCHED_CLS); 83 + printf("test_scale:%s:%s\n", sched_cls[i], err ? "FAIL" : "OK"); 49 84 } 50 85 51 - for (i = 0; i < ARRAY_SIZE(pyperf); i++) { 52 - err = check_load(pyperf[i], BPF_PROG_TYPE_RAW_TRACEPOINT); 53 - printf("test_scale:%s:%s\n", pyperf[i], err ? "FAIL" : "OK"); 86 + for (i = 0; i < ARRAY_SIZE(raw_tp); i++) { 87 + err = check_load(raw_tp[i], BPF_PROG_TYPE_RAW_TRACEPOINT); 88 + printf("test_scale:%s:%s\n", raw_tp[i], err ? "FAIL" : "OK"); 54 89 } 90 + 91 + for (i = 0; i < ARRAY_SIZE(cg_sysctl); i++) { 92 + err = check_load(cg_sysctl[i], BPF_PROG_TYPE_CGROUP_SYSCTL); 93 + printf("test_scale:%s:%s\n", cg_sysctl[i], err ? "FAIL" : "OK"); 94 + } 95 + err = check_load("./test_xdp_loop.o", BPF_PROG_TYPE_XDP); 96 + printf("test_scale:test_xdp_loop:%s\n", err ? "FAIL" : "OK"); 97 + 98 + err = check_load("./test_seg6_loop.o", BPF_PROG_TYPE_LWT_SEG6LOCAL); 99 + printf("test_scale:test_seg6_loop:%s\n", err ? "FAIL" : "OK"); 55 100 }
+13 -5
tools/testing/selftests/bpf/progs/bpf_flow.c
··· 57 57 __be32 identification; 58 58 }; 59 59 60 - struct bpf_map_def SEC("maps") jmp_table = { 60 + struct { 61 + __u32 type; 62 + __u32 max_entries; 63 + __u32 key_size; 64 + __u32 value_size; 65 + } jmp_table SEC(".maps") = { 61 66 .type = BPF_MAP_TYPE_PROG_ARRAY, 67 + .max_entries = 8, 62 68 .key_size = sizeof(__u32), 63 69 .value_size = sizeof(__u32), 64 - .max_entries = 8 65 70 }; 66 71 67 - struct bpf_map_def SEC("maps") last_dissection = { 72 + struct { 73 + __u32 type; 74 + __u32 max_entries; 75 + __u32 *key; 76 + struct bpf_flow_keys *value; 77 + } last_dissection SEC(".maps") = { 68 78 .type = BPF_MAP_TYPE_ARRAY, 69 - .key_size = sizeof(__u32), 70 - .value_size = sizeof(struct bpf_flow_keys), 71 79 .max_entries = 1, 72 80 }; 73 81
+28
tools/testing/selftests/bpf/progs/loop1.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + // Copyright (c) 2019 Facebook 3 + #include <linux/sched.h> 4 + #include <linux/ptrace.h> 5 + #include <stdint.h> 6 + #include <stddef.h> 7 + #include <stdbool.h> 8 + #include <linux/bpf.h> 9 + #include "bpf_helpers.h" 10 + 11 + char _license[] SEC("license") = "GPL"; 12 + 13 + SEC("raw_tracepoint/kfree_skb") 14 + int nested_loops(volatile struct pt_regs* ctx) 15 + { 16 + int i, j, sum = 0, m; 17 + 18 + for (j = 0; j < 300; j++) 19 + for (i = 0; i < j; i++) { 20 + if (j & 1) 21 + m = ctx->rax; 22 + else 23 + m = j; 24 + sum += i * m; 25 + } 26 + 27 + return sum; 28 + }
+28
tools/testing/selftests/bpf/progs/loop2.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + // Copyright (c) 2019 Facebook 3 + #include <linux/sched.h> 4 + #include <linux/ptrace.h> 5 + #include <stdint.h> 6 + #include <stddef.h> 7 + #include <stdbool.h> 8 + #include <linux/bpf.h> 9 + #include "bpf_helpers.h" 10 + 11 + char _license[] SEC("license") = "GPL"; 12 + 13 + SEC("raw_tracepoint/consume_skb") 14 + int while_true(volatile struct pt_regs* ctx) 15 + { 16 + int i = 0; 17 + 18 + while (true) { 19 + if (ctx->rax & 1) 20 + i += 3; 21 + else 22 + i += 7; 23 + if (i > 40) 24 + break; 25 + } 26 + 27 + return i; 28 + }
+22
tools/testing/selftests/bpf/progs/loop3.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + // Copyright (c) 2019 Facebook 3 + #include <linux/sched.h> 4 + #include <linux/ptrace.h> 5 + #include <stdint.h> 6 + #include <stddef.h> 7 + #include <stdbool.h> 8 + #include <linux/bpf.h> 9 + #include "bpf_helpers.h" 10 + 11 + char _license[] SEC("license") = "GPL"; 12 + 13 + SEC("raw_tracepoint/consume_skb") 14 + int while_true(volatile struct pt_regs* ctx) 15 + { 16 + __u64 i = 0, sum = 0; 17 + do { 18 + i++; 19 + sum += ctx->rax; 20 + } while (i < 0x100000000ULL); 21 + return sum; 22 + }
+10 -12
tools/testing/selftests/bpf/progs/netcnt_prog.c
··· 10 10 #define REFRESH_TIME_NS 100000000 11 11 #define NS_PER_SEC 1000000000 12 12 13 - struct bpf_map_def SEC("maps") percpu_netcnt = { 13 + struct { 14 + __u32 type; 15 + struct bpf_cgroup_storage_key *key; 16 + struct percpu_net_cnt *value; 17 + } percpu_netcnt SEC(".maps") = { 14 18 .type = BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE, 15 - .key_size = sizeof(struct bpf_cgroup_storage_key), 16 - .value_size = sizeof(struct percpu_net_cnt), 17 19 }; 18 20 19 - BPF_ANNOTATE_KV_PAIR(percpu_netcnt, struct bpf_cgroup_storage_key, 20 - struct percpu_net_cnt); 21 - 22 - struct bpf_map_def SEC("maps") netcnt = { 21 + struct { 22 + __u32 type; 23 + struct bpf_cgroup_storage_key *key; 24 + struct net_cnt *value; 25 + } netcnt SEC(".maps") = { 23 26 .type = BPF_MAP_TYPE_CGROUP_STORAGE, 24 - .key_size = sizeof(struct bpf_cgroup_storage_key), 25 - .value_size = sizeof(struct net_cnt), 26 27 }; 27 - 28 - BPF_ANNOTATE_KV_PAIR(netcnt, struct bpf_cgroup_storage_key, 29 - struct net_cnt); 30 28 31 29 SEC("cgroup/skb") 32 30 int bpf_nextcnt(struct __sk_buff *skb)
+5 -1
tools/testing/selftests/bpf/progs/pyperf.h
··· 220 220 int32_t* symbol_counter = bpf_map_lookup_elem(&symbolmap, &sym); 221 221 if (symbol_counter == NULL) 222 222 return 0; 223 - #pragma unroll 223 + #ifdef NO_UNROLL 224 + #pragma clang loop unroll(disable) 225 + #else 226 + #pragma clang loop unroll(full) 227 + #endif 224 228 /* Unwind python stack */ 225 229 for (int i = 0; i < STACK_MAX_LEN; ++i) { 226 230 if (frame_ptr && get_frame_data(frame_ptr, pidData, &frame, &sym)) {
+9
tools/testing/selftests/bpf/progs/pyperf600.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + // Copyright (c) 2019 Facebook 3 + #define STACK_MAX_LEN 600 4 + /* clang will not unroll the loop 600 times. 5 + * Instead it will unroll it to the amount it deemed 6 + * appropriate, but the loop will still execute 600 times. 7 + * Total program size is around 90k insns 8 + */ 9 + #include "pyperf.h"
+8
tools/testing/selftests/bpf/progs/pyperf600_nounroll.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + // Copyright (c) 2019 Facebook 3 + #define STACK_MAX_LEN 600 4 + #define NO_UNROLL 5 + /* clang will not unroll at all. 6 + * Total program size is around 2k insns 7 + */ 8 + #include "pyperf.h"
-1
tools/testing/selftests/bpf/progs/sockmap_parse_prog.c
··· 1 1 #include <linux/bpf.h> 2 2 #include "bpf_helpers.h" 3 - #include "bpf_util.h" 4 3 #include "bpf_endian.h" 5 4 6 5 int _version SEC("version") = 1;
+1 -1
tools/testing/selftests/bpf/progs/sockmap_tcp_msg_prog.c
··· 1 1 #include <linux/bpf.h> 2 + 2 3 #include "bpf_helpers.h" 3 - #include "bpf_util.h" 4 4 #include "bpf_endian.h" 5 5 6 6 int _version SEC("version") = 1;
-1
tools/testing/selftests/bpf/progs/sockmap_verdict_prog.c
··· 1 1 #include <linux/bpf.h> 2 2 #include "bpf_helpers.h" 3 - #include "bpf_util.h" 4 3 #include "bpf_endian.h" 5 4 6 5 int _version SEC("version") = 1;
+10
tools/testing/selftests/bpf/progs/strobemeta.c
··· 1 + // SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) 2 + // Copyright (c) 2019 Facebook 3 + 4 + #define STROBE_MAX_INTS 2 5 + #define STROBE_MAX_STRS 25 6 + #define STROBE_MAX_MAPS 100 7 + #define STROBE_MAX_MAP_ENTRIES 20 8 + /* full unroll by llvm #undef NO_UNROLL */ 9 + #include "strobemeta.h" 10 +
+528
tools/testing/selftests/bpf/progs/strobemeta.h
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + // Copyright (c) 2019 Facebook 3 + 4 + #include <stdint.h> 5 + #include <stddef.h> 6 + #include <stdbool.h> 7 + #include <linux/bpf.h> 8 + #include <linux/ptrace.h> 9 + #include <linux/sched.h> 10 + #include <linux/types.h> 11 + #include "bpf_helpers.h" 12 + 13 + typedef uint32_t pid_t; 14 + struct task_struct {}; 15 + 16 + #define TASK_COMM_LEN 16 17 + #define PERF_MAX_STACK_DEPTH 127 18 + 19 + #define STROBE_TYPE_INVALID 0 20 + #define STROBE_TYPE_INT 1 21 + #define STROBE_TYPE_STR 2 22 + #define STROBE_TYPE_MAP 3 23 + 24 + #define STACK_TABLE_EPOCH_SHIFT 20 25 + #define STROBE_MAX_STR_LEN 1 26 + #define STROBE_MAX_CFGS 32 27 + #define STROBE_MAX_PAYLOAD \ 28 + (STROBE_MAX_STRS * STROBE_MAX_STR_LEN + \ 29 + STROBE_MAX_MAPS * (1 + STROBE_MAX_MAP_ENTRIES * 2) * STROBE_MAX_STR_LEN) 30 + 31 + struct strobe_value_header { 32 + /* 33 + * meaning depends on type: 34 + * 1. int: 0, if value not set, 1 otherwise 35 + * 2. str: 1 always, whether value is set or not is determined by ptr 36 + * 3. 
map: 1 always, pointer points to additional struct with number 37 + * of entries (up to STROBE_MAX_MAP_ENTRIES) 38 + */ 39 + uint16_t len; 40 + /* 41 + * _reserved might be used for some future fields/flags, but we always 42 + * want to keep strobe_value_header to be 8 bytes, so BPF can read 16 43 + * bytes in one go and get both header and value 44 + */ 45 + uint8_t _reserved[6]; 46 + }; 47 + 48 + /* 49 + * strobe_value_generic is used from BPF probe only, but needs to be a union 50 + * of strobe_value_int/strobe_value_str/strobe_value_map 51 + */ 52 + struct strobe_value_generic { 53 + struct strobe_value_header header; 54 + union { 55 + int64_t val; 56 + void *ptr; 57 + }; 58 + }; 59 + 60 + struct strobe_value_int { 61 + struct strobe_value_header header; 62 + int64_t value; 63 + }; 64 + 65 + struct strobe_value_str { 66 + struct strobe_value_header header; 67 + const char* value; 68 + }; 69 + 70 + struct strobe_value_map { 71 + struct strobe_value_header header; 72 + const struct strobe_map_raw* value; 73 + }; 74 + 75 + struct strobe_map_entry { 76 + const char* key; 77 + const char* val; 78 + }; 79 + 80 + /* 81 + * Map of C-string key/value pairs with fixed maximum capacity. Each map has 82 + * corresponding int64 ID, which application can use (or ignore) in whatever 83 + * way appropriate. Map is "write-only", there is no way to get data out of 84 + * map. Map is intended to be used to provide metadata for profilers and is 85 + * not to be used for internal in-app communication. All methods are 86 + * thread-safe. 
87 + */ 88 + struct strobe_map_raw { 89 + /* 90 + * general purpose unique ID that's up to application to decide 91 + * whether and how to use; for request metadata use case id is unique 92 + * request ID that's used to match metadata with stack traces on 93 + * Strobelight backend side 94 + */ 95 + int64_t id; 96 + /* number of used entries in map */ 97 + int64_t cnt; 98 + /* 99 + * having volatile doesn't change anything on BPF side, but clang 100 + * emits warnings for passing `volatile const char *` into 101 + * bpf_probe_read_str that expects just `const char *` 102 + */ 103 + const char* tag; 104 + /* 105 + * key/value entries, each consisting of 2 pointers to key and value 106 + * C strings 107 + */ 108 + struct strobe_map_entry entries[STROBE_MAX_MAP_ENTRIES]; 109 + }; 110 + 111 + /* Following values define supported values of TLS mode */ 112 + #define TLS_NOT_SET -1 113 + #define TLS_LOCAL_EXEC 0 114 + #define TLS_IMM_EXEC 1 115 + #define TLS_GENERAL_DYN 2 116 + 117 + /* 118 + * structure that universally represents TLS location (both for static 119 + * executables and shared libraries) 120 + */ 121 + struct strobe_value_loc { 122 + /* 123 + * tls_mode defines what TLS mode was used for particular metavariable: 124 + * - -1 (TLS_NOT_SET) - no metavariable; 125 + * - 0 (TLS_LOCAL_EXEC) - Local Executable mode; 126 + * - 1 (TLS_IMM_EXEC) - Immediate Executable mode; 127 + * - 2 (TLS_GENERAL_DYN) - General Dynamic mode; 128 + * Local Dynamic mode is not yet supported, because never seen in 129 + * practice. Mode defines how offset field is interpreted. See 130 + * calc_location() in below for details. 131 + */ 132 + int64_t tls_mode; 133 + /* 134 + * TLS_LOCAL_EXEC: offset from thread pointer (fs:0 for x86-64, 135 + * tpidr_el0 for aarch64). 
136 + * TLS_IMM_EXEC: absolute address of GOT entry containing offset 137 + * from thread pointer; 138 + * TLS_GENERAL_DYN: absolute addres of double GOT entry 139 + * containing tls_index_t struct; 140 + */ 141 + int64_t offset; 142 + }; 143 + 144 + struct strobemeta_cfg { 145 + int64_t req_meta_idx; 146 + struct strobe_value_loc int_locs[STROBE_MAX_INTS]; 147 + struct strobe_value_loc str_locs[STROBE_MAX_STRS]; 148 + struct strobe_value_loc map_locs[STROBE_MAX_MAPS]; 149 + }; 150 + 151 + struct strobe_map_descr { 152 + uint64_t id; 153 + int16_t tag_len; 154 + /* 155 + * cnt <0 - map value isn't set; 156 + * 0 - map has id set, but no key/value entries 157 + */ 158 + int16_t cnt; 159 + /* 160 + * both key_lens[i] and val_lens[i] should be >0 for present key/value 161 + * entry 162 + */ 163 + uint16_t key_lens[STROBE_MAX_MAP_ENTRIES]; 164 + uint16_t val_lens[STROBE_MAX_MAP_ENTRIES]; 165 + }; 166 + 167 + struct strobemeta_payload { 168 + /* req_id has valid request ID, if req_meta_valid == 1 */ 169 + int64_t req_id; 170 + uint8_t req_meta_valid; 171 + /* 172 + * mask has Nth bit set to 1, if Nth metavar was present and 173 + * successfully read 174 + */ 175 + uint64_t int_vals_set_mask; 176 + int64_t int_vals[STROBE_MAX_INTS]; 177 + /* len is >0 for present values */ 178 + uint16_t str_lens[STROBE_MAX_STRS]; 179 + /* if map_descrs[i].cnt == -1, metavar is not present/set */ 180 + struct strobe_map_descr map_descrs[STROBE_MAX_MAPS]; 181 + /* 182 + * payload has compactly packed values of str and map variables in the 183 + * form: strval1\0strval2\0map1key1\0map1val1\0map2key1\0map2val1\0 184 + * (and so on); str_lens[i], key_lens[i] and val_lens[i] determines 185 + * value length 186 + */ 187 + char payload[STROBE_MAX_PAYLOAD]; 188 + }; 189 + 190 + struct strobelight_bpf_sample { 191 + uint64_t ktime; 192 + char comm[TASK_COMM_LEN]; 193 + pid_t pid; 194 + int user_stack_id; 195 + int kernel_stack_id; 196 + int has_meta; 197 + struct strobemeta_payload metadata; 198 
+ /* 199 + * makes it possible to pass (<real payload size> + 1) as data size to 200 + * perf_submit() to avoid perf_submit's paranoia about passing zero as 201 + * size, as it deduces that <real payload size> might be 202 + * **theoretically** zero 203 + */ 204 + char dummy_safeguard; 205 + }; 206 + 207 + struct bpf_map_def SEC("maps") samples = { 208 + .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY, 209 + .key_size = sizeof(int), 210 + .value_size = sizeof(int), 211 + .max_entries = 32, 212 + }; 213 + 214 + struct bpf_map_def SEC("maps") stacks_0 = { 215 + .type = BPF_MAP_TYPE_STACK_TRACE, 216 + .key_size = sizeof(uint32_t), 217 + .value_size = sizeof(uint64_t) * PERF_MAX_STACK_DEPTH, 218 + .max_entries = 16, 219 + }; 220 + 221 + struct bpf_map_def SEC("maps") stacks_1 = { 222 + .type = BPF_MAP_TYPE_STACK_TRACE, 223 + .key_size = sizeof(uint32_t), 224 + .value_size = sizeof(uint64_t) * PERF_MAX_STACK_DEPTH, 225 + .max_entries = 16, 226 + }; 227 + 228 + struct bpf_map_def SEC("maps") sample_heap = { 229 + .type = BPF_MAP_TYPE_PERCPU_ARRAY, 230 + .key_size = sizeof(uint32_t), 231 + .value_size = sizeof(struct strobelight_bpf_sample), 232 + .max_entries = 1, 233 + }; 234 + 235 + struct bpf_map_def SEC("maps") strobemeta_cfgs = { 236 + .type = BPF_MAP_TYPE_PERCPU_ARRAY, 237 + .key_size = sizeof(pid_t), 238 + .value_size = sizeof(struct strobemeta_cfg), 239 + .max_entries = STROBE_MAX_CFGS, 240 + }; 241 + 242 + /* Type for the dtv. */ 243 + /* https://github.com/lattera/glibc/blob/master/nptl/sysdeps/x86_64/tls.h#L34 */ 244 + typedef union dtv { 245 + size_t counter; 246 + struct { 247 + void* val; 248 + bool is_static; 249 + } pointer; 250 + } dtv_t; 251 + 252 + /* Partial definition for tcbhead_t */ 253 + /* https://github.com/bminor/glibc/blob/master/sysdeps/x86_64/nptl/tls.h#L42 */ 254 + struct tcbhead { 255 + void* tcb; 256 + dtv_t* dtv; 257 + }; 258 + 259 + /* 260 + * TLS module/offset information for shared library case. 
261 + * For x86-64, this is mapped onto two entries in GOT. 262 + * For aarch64, this is pointed to by second GOT entry. 263 + */ 264 + struct tls_index { 265 + uint64_t module; 266 + uint64_t offset; 267 + }; 268 + 269 + static inline __attribute__((always_inline)) 270 + void *calc_location(struct strobe_value_loc *loc, void *tls_base) 271 + { 272 + /* 273 + * tls_mode value is: 274 + * - -1 (TLS_NOT_SET), if no metavar is present; 275 + * - 0 (TLS_LOCAL_EXEC), if metavar uses Local Executable mode of TLS 276 + * (offset from fs:0 for x86-64 or tpidr_el0 for aarch64); 277 + * - 1 (TLS_IMM_EXEC), if metavar uses Immediate Executable mode of TLS; 278 + * - 2 (TLS_GENERAL_DYN), if metavar uses General Dynamic mode of TLS; 279 + * This schema allows to use something like: 280 + * (tls_mode + 1) * (tls_base + offset) 281 + * to get NULL for "no metavar" location, or correct pointer for local 282 + * executable mode without doing extra ifs. 283 + */ 284 + if (loc->tls_mode <= TLS_LOCAL_EXEC) { 285 + /* static executable is simple, we just have offset from 286 + * tls_base */ 287 + void *addr = tls_base + loc->offset; 288 + /* multiply by (tls_mode + 1) to get NULL, if we have no 289 + * metavar in this slot */ 290 + return (void *)((loc->tls_mode + 1) * (int64_t)addr); 291 + } 292 + /* 293 + * Other modes are more complicated, we need to jump through few hoops. 
294 + * 295 + * For immediate executable mode (currently supported only for aarch64): 296 + * - loc->offset is pointing to a GOT entry containing fixed offset 297 + * relative to tls_base; 298 + * 299 + * For general dynamic mode: 300 + * - loc->offset is pointing to a beginning of double GOT entries; 301 + * - (for aarch64 only) second entry points to tls_index_t struct; 302 + * - (for x86-64 only) two GOT entries are already tls_index_t; 303 + * - tls_index_t->module is used to find start of TLS section in 304 + * which variable resides; 305 + * - tls_index_t->offset provides offset within that TLS section, 306 + * pointing to value of variable. 307 + */ 308 + struct tls_index tls_index; 309 + dtv_t *dtv; 310 + void *tls_ptr; 311 + 312 + bpf_probe_read(&tls_index, sizeof(struct tls_index), 313 + (void *)loc->offset); 314 + /* valid module index is always positive */ 315 + if (tls_index.module > 0) { 316 + /* dtv = ((struct tcbhead *)tls_base)->dtv[tls_index.module] */ 317 + bpf_probe_read(&dtv, sizeof(dtv), 318 + &((struct tcbhead *)tls_base)->dtv); 319 + dtv += tls_index.module; 320 + } else { 321 + dtv = NULL; 322 + } 323 + bpf_probe_read(&tls_ptr, sizeof(void *), dtv); 324 + /* if pointer has (void *)-1 value, then TLS wasn't initialized yet */ 325 + return tls_ptr && tls_ptr != (void *)-1 326 + ? 
tls_ptr + tls_index.offset 327 + : NULL; 328 + } 329 + 330 + static inline __attribute__((always_inline)) 331 + void read_int_var(struct strobemeta_cfg *cfg, size_t idx, void *tls_base, 332 + struct strobe_value_generic *value, 333 + struct strobemeta_payload *data) 334 + { 335 + void *location = calc_location(&cfg->int_locs[idx], tls_base); 336 + if (!location) 337 + return; 338 + 339 + bpf_probe_read(value, sizeof(struct strobe_value_generic), location); 340 + data->int_vals[idx] = value->val; 341 + if (value->header.len) 342 + data->int_vals_set_mask |= (1 << idx); 343 + } 344 + 345 + static inline __attribute__((always_inline)) 346 + uint64_t read_str_var(struct strobemeta_cfg* cfg, size_t idx, void *tls_base, 347 + struct strobe_value_generic *value, 348 + struct strobemeta_payload *data, void *payload) 349 + { 350 + void *location; 351 + uint32_t len; 352 + 353 + data->str_lens[idx] = 0; 354 + location = calc_location(&cfg->str_locs[idx], tls_base); 355 + if (!location) 356 + return 0; 357 + 358 + bpf_probe_read(value, sizeof(struct strobe_value_generic), location); 359 + len = bpf_probe_read_str(payload, STROBE_MAX_STR_LEN, value->ptr); 360 + /* 361 + * if bpf_probe_read_str returns error (<0), due to casting to 362 + * unsinged int, it will become big number, so next check is 363 + * sufficient to check for errors AND prove to BPF verifier, that 364 + * bpf_probe_read_str won't return anything bigger than 365 + * STROBE_MAX_STR_LEN 366 + */ 367 + if (len > STROBE_MAX_STR_LEN) 368 + return 0; 369 + 370 + data->str_lens[idx] = len; 371 + return len; 372 + } 373 + 374 + static inline __attribute__((always_inline)) 375 + void *read_map_var(struct strobemeta_cfg *cfg, size_t idx, void *tls_base, 376 + struct strobe_value_generic *value, 377 + struct strobemeta_payload* data, void *payload) 378 + { 379 + struct strobe_map_descr* descr = &data->map_descrs[idx]; 380 + struct strobe_map_raw map; 381 + void *location; 382 + uint32_t len; 383 + int i; 384 + 385 + 
descr->tag_len = 0; /* presume no tag is set */ 386 + descr->cnt = -1; /* presume no value is set */ 387 + 388 + location = calc_location(&cfg->map_locs[idx], tls_base); 389 + if (!location) 390 + return payload; 391 + 392 + bpf_probe_read(value, sizeof(struct strobe_value_generic), location); 393 + if (bpf_probe_read(&map, sizeof(struct strobe_map_raw), value->ptr)) 394 + return payload; 395 + 396 + descr->id = map.id; 397 + descr->cnt = map.cnt; 398 + if (cfg->req_meta_idx == idx) { 399 + data->req_id = map.id; 400 + data->req_meta_valid = 1; 401 + } 402 + 403 + len = bpf_probe_read_str(payload, STROBE_MAX_STR_LEN, map.tag); 404 + if (len <= STROBE_MAX_STR_LEN) { 405 + descr->tag_len = len; 406 + payload += len; 407 + } 408 + 409 + #ifdef NO_UNROLL 410 + #pragma clang loop unroll(disable) 411 + #else 412 + #pragma unroll 413 + #endif 414 + for (int i = 0; i < STROBE_MAX_MAP_ENTRIES && i < map.cnt; ++i) { 415 + descr->key_lens[i] = 0; 416 + len = bpf_probe_read_str(payload, STROBE_MAX_STR_LEN, 417 + map.entries[i].key); 418 + if (len <= STROBE_MAX_STR_LEN) { 419 + descr->key_lens[i] = len; 420 + payload += len; 421 + } 422 + descr->val_lens[i] = 0; 423 + len = bpf_probe_read_str(payload, STROBE_MAX_STR_LEN, 424 + map.entries[i].val); 425 + if (len <= STROBE_MAX_STR_LEN) { 426 + descr->val_lens[i] = len; 427 + payload += len; 428 + } 429 + } 430 + 431 + return payload; 432 + } 433 + 434 + /* 435 + * read_strobe_meta returns NULL, if no metadata was read; otherwise returns 436 + * pointer to *right after* payload ends 437 + */ 438 + static inline __attribute__((always_inline)) 439 + void *read_strobe_meta(struct task_struct* task, 440 + struct strobemeta_payload* data) { 441 + pid_t pid = bpf_get_current_pid_tgid() >> 32; 442 + struct strobe_value_generic value = {0}; 443 + struct strobemeta_cfg *cfg; 444 + void *tls_base, *payload; 445 + 446 + cfg = bpf_map_lookup_elem(&strobemeta_cfgs, &pid); 447 + if (!cfg) 448 + return NULL; 449 + 450 + data->int_vals_set_mask = 
0; 451 + data->req_meta_valid = 0; 452 + payload = data->payload; 453 + /* 454 + * we don't have struct task_struct definition, it should be: 455 + * tls_base = (void *)task->thread.fsbase; 456 + */ 457 + tls_base = (void *)task; 458 + 459 + #ifdef NO_UNROLL 460 + #pragma clang loop unroll(disable) 461 + #else 462 + #pragma unroll 463 + #endif 464 + for (int i = 0; i < STROBE_MAX_INTS; ++i) { 465 + read_int_var(cfg, i, tls_base, &value, data); 466 + } 467 + #ifdef NO_UNROLL 468 + #pragma clang loop unroll(disable) 469 + #else 470 + #pragma unroll 471 + #endif 472 + for (int i = 0; i < STROBE_MAX_STRS; ++i) { 473 + payload += read_str_var(cfg, i, tls_base, &value, data, payload); 474 + } 475 + #ifdef NO_UNROLL 476 + #pragma clang loop unroll(disable) 477 + #else 478 + #pragma unroll 479 + #endif 480 + for (int i = 0; i < STROBE_MAX_MAPS; ++i) { 481 + payload = read_map_var(cfg, i, tls_base, &value, data, payload); 482 + } 483 + /* 484 + * return pointer right after end of payload, so it's possible to 485 + * calculate exact amount of useful data that needs to be sent 486 + */ 487 + return payload; 488 + } 489 + 490 + SEC("raw_tracepoint/kfree_skb") 491 + int on_event(struct pt_regs *ctx) { 492 + pid_t pid = bpf_get_current_pid_tgid() >> 32; 493 + struct strobelight_bpf_sample* sample; 494 + struct task_struct *task; 495 + uint32_t zero = 0; 496 + uint64_t ktime_ns; 497 + void *sample_end; 498 + 499 + sample = bpf_map_lookup_elem(&sample_heap, &zero); 500 + if (!sample) 501 + return 0; /* this will never happen */ 502 + 503 + sample->pid = pid; 504 + bpf_get_current_comm(&sample->comm, TASK_COMM_LEN); 505 + ktime_ns = bpf_ktime_get_ns(); 506 + sample->ktime = ktime_ns; 507 + 508 + task = (struct task_struct *)bpf_get_current_task(); 509 + sample_end = read_strobe_meta(task, &sample->metadata); 510 + sample->has_meta = sample_end != NULL; 511 + sample_end = sample_end ? 
: &sample->metadata; 512 + 513 + if ((ktime_ns >> STACK_TABLE_EPOCH_SHIFT) & 1) { 514 + sample->kernel_stack_id = bpf_get_stackid(ctx, &stacks_1, 0); 515 + sample->user_stack_id = bpf_get_stackid(ctx, &stacks_1, BPF_F_USER_STACK); 516 + } else { 517 + sample->kernel_stack_id = bpf_get_stackid(ctx, &stacks_0, 0); 518 + sample->user_stack_id = bpf_get_stackid(ctx, &stacks_0, BPF_F_USER_STACK); 519 + } 520 + 521 + uint64_t sample_size = sample_end - (void *)sample; 522 + /* should always be true */ 523 + if (sample_size < sizeof(struct strobelight_bpf_sample)) 524 + bpf_perf_event_output(ctx, &samples, 0, sample, 1 + sample_size); 525 + return 0; 526 + } 527 + 528 + char _license[] SEC("license") = "GPL";
+9
tools/testing/selftests/bpf/progs/strobemeta_nounroll1.c
··· 1 + // SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) 2 + // Copyright (c) 2019 Facebook 3 + 4 + #define STROBE_MAX_INTS 2 5 + #define STROBE_MAX_STRS 25 6 + #define STROBE_MAX_MAPS 13 7 + #define STROBE_MAX_MAP_ENTRIES 20 8 + #define NO_UNROLL 9 + #include "strobemeta.h"
+9
tools/testing/selftests/bpf/progs/strobemeta_nounroll2.c
··· 1 + // SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) 2 + // Copyright (c) 2019 Facebook 3 + 4 + #define STROBE_MAX_INTS 2 5 + #define STROBE_MAX_STRS 25 6 + #define STROBE_MAX_MAPS 30 7 + #define STROBE_MAX_MAP_ENTRIES 20 8 + #define NO_UNROLL 9 + #include "strobemeta.h"
+73
tools/testing/selftests/bpf/progs/test_btf_newkv.c
··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 2 + /* Copyright (c) 2018 Facebook */ 3 + #include <linux/bpf.h> 4 + #include "bpf_helpers.h" 5 + 6 + int _version SEC("version") = 1; 7 + 8 + struct ipv_counts { 9 + unsigned int v4; 10 + unsigned int v6; 11 + }; 12 + 13 + /* just to validate we can handle maps in multiple sections */ 14 + struct bpf_map_def SEC("maps") btf_map_legacy = { 15 + .type = BPF_MAP_TYPE_ARRAY, 16 + .key_size = sizeof(int), 17 + .value_size = sizeof(long long), 18 + .max_entries = 4, 19 + }; 20 + 21 + BPF_ANNOTATE_KV_PAIR(btf_map_legacy, int, struct ipv_counts); 22 + 23 + struct { 24 + int *key; 25 + struct ipv_counts *value; 26 + unsigned int type; 27 + unsigned int max_entries; 28 + } btf_map SEC(".maps") = { 29 + .type = BPF_MAP_TYPE_ARRAY, 30 + .max_entries = 4, 31 + }; 32 + 33 + struct dummy_tracepoint_args { 34 + unsigned long long pad; 35 + struct sock *sock; 36 + }; 37 + 38 + __attribute__((noinline)) 39 + static int test_long_fname_2(struct dummy_tracepoint_args *arg) 40 + { 41 + struct ipv_counts *counts; 42 + int key = 0; 43 + 44 + if (!arg->sock) 45 + return 0; 46 + 47 + counts = bpf_map_lookup_elem(&btf_map, &key); 48 + if (!counts) 49 + return 0; 50 + 51 + counts->v6++; 52 + 53 + /* just verify we can reference both maps */ 54 + counts = bpf_map_lookup_elem(&btf_map_legacy, &key); 55 + if (!counts) 56 + return 0; 57 + 58 + return 0; 59 + } 60 + 61 + __attribute__((noinline)) 62 + static int test_long_fname_1(struct dummy_tracepoint_args *arg) 63 + { 64 + return test_long_fname_2(arg); 65 + } 66 + 67 + SEC("dummy_tracepoint") 68 + int _dummy_tracepoint(struct dummy_tracepoint_args *arg) 69 + { 70 + return test_long_fname_1(arg); 71 + } 72 + 73 + char _license[] SEC("license") = "GPL";
+19 -8
tools/testing/selftests/bpf/progs/test_get_stack_rawtp.c
··· 15 15 struct bpf_stack_build_id user_stack_buildid[MAX_STACK_RAWTP]; 16 16 }; 17 17 18 - struct bpf_map_def SEC("maps") perfmap = { 18 + struct { 19 + __u32 type; 20 + __u32 max_entries; 21 + __u32 key_size; 22 + __u32 value_size; 23 + } perfmap SEC(".maps") = { 19 24 .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY, 25 + .max_entries = 2, 20 26 .key_size = sizeof(int), 21 27 .value_size = sizeof(__u32), 22 - .max_entries = 2, 23 28 }; 24 29 25 - struct bpf_map_def SEC("maps") stackdata_map = { 30 + struct { 31 + __u32 type; 32 + __u32 max_entries; 33 + __u32 *key; 34 + struct stack_trace_t *value; 35 + } stackdata_map SEC(".maps") = { 26 36 .type = BPF_MAP_TYPE_PERCPU_ARRAY, 27 - .key_size = sizeof(__u32), 28 - .value_size = sizeof(struct stack_trace_t), 29 37 .max_entries = 1, 30 38 }; 31 39 ··· 55 47 * issue and avoid complicated C programming massaging. 56 48 * This is an acceptable workaround since there is one entry here. 57 49 */ 58 - struct bpf_map_def SEC("maps") rawdata_map = { 50 + struct { 51 + __u32 type; 52 + __u32 max_entries; 53 + __u32 *key; 54 + __u64 (*value)[2 * MAX_STACK_RAWTP]; 55 + } rawdata_map SEC(".maps") = { 59 56 .type = BPF_MAP_TYPE_PERCPU_ARRAY, 60 - .key_size = sizeof(__u32), 61 - .value_size = MAX_STACK_RAWTP * sizeof(__u64) * 2, 62 57 .max_entries = 1, 63 58 }; 64 59
+18 -9
tools/testing/selftests/bpf/progs/test_global_data.c
··· 7 7 8 8 #include "bpf_helpers.h" 9 9 10 - struct bpf_map_def SEC("maps") result_number = { 10 + struct { 11 + __u32 type; 12 + __u32 max_entries; 13 + __u32 *key; 14 + __u64 *value; 15 + } result_number SEC(".maps") = { 11 16 .type = BPF_MAP_TYPE_ARRAY, 12 - .key_size = sizeof(__u32), 13 - .value_size = sizeof(__u64), 14 17 .max_entries = 11, 15 18 }; 16 19 17 - struct bpf_map_def SEC("maps") result_string = { 20 + struct { 21 + __u32 type; 22 + __u32 max_entries; 23 + __u32 *key; 24 + const char (*value)[32]; 25 + } result_string SEC(".maps") = { 18 26 .type = BPF_MAP_TYPE_ARRAY, 19 - .key_size = sizeof(__u32), 20 - .value_size = 32, 21 27 .max_entries = 5, 22 28 }; 23 29 ··· 33 27 __u64 c; 34 28 }; 35 29 36 - struct bpf_map_def SEC("maps") result_struct = { 30 + struct { 31 + __u32 type; 32 + __u32 max_entries; 33 + __u32 *key; 34 + struct foo *value; 35 + } result_struct SEC(".maps") = { 37 36 .type = BPF_MAP_TYPE_ARRAY, 38 - .key_size = sizeof(__u32), 39 - .value_size = sizeof(struct foo), 40 37 .max_entries = 5, 41 38 }; 42 39
+30 -15
tools/testing/selftests/bpf/progs/test_l4lb.c
··· 169 169 unsigned short eth_proto; 170 170 }; 171 171 172 - struct bpf_map_def SEC("maps") vip_map = { 172 + struct { 173 + __u32 type; 174 + __u32 max_entries; 175 + struct vip *key; 176 + struct vip_meta *value; 177 + } vip_map SEC(".maps") = { 173 178 .type = BPF_MAP_TYPE_HASH, 174 - .key_size = sizeof(struct vip), 175 - .value_size = sizeof(struct vip_meta), 176 179 .max_entries = MAX_VIPS, 177 180 }; 178 181 179 - struct bpf_map_def SEC("maps") ch_rings = { 182 + struct { 183 + __u32 type; 184 + __u32 max_entries; 185 + __u32 *key; 186 + __u32 *value; 187 + } ch_rings SEC(".maps") = { 180 188 .type = BPF_MAP_TYPE_ARRAY, 181 - .key_size = sizeof(__u32), 182 - .value_size = sizeof(__u32), 183 189 .max_entries = CH_RINGS_SIZE, 184 190 }; 185 191 186 - struct bpf_map_def SEC("maps") reals = { 192 + struct { 193 + __u32 type; 194 + __u32 max_entries; 195 + __u32 *key; 196 + struct real_definition *value; 197 + } reals SEC(".maps") = { 187 198 .type = BPF_MAP_TYPE_ARRAY, 188 - .key_size = sizeof(__u32), 189 - .value_size = sizeof(struct real_definition), 190 199 .max_entries = MAX_REALS, 191 200 }; 192 201 193 - struct bpf_map_def SEC("maps") stats = { 202 + struct { 203 + __u32 type; 204 + __u32 max_entries; 205 + __u32 *key; 206 + struct vip_stats *value; 207 + } stats SEC(".maps") = { 194 208 .type = BPF_MAP_TYPE_PERCPU_ARRAY, 195 - .key_size = sizeof(__u32), 196 - .value_size = sizeof(struct vip_stats), 197 209 .max_entries = MAX_VIPS, 198 210 }; 199 211 200 - struct bpf_map_def SEC("maps") ctl_array = { 212 + struct { 213 + __u32 type; 214 + __u32 max_entries; 215 + __u32 *key; 216 + struct ctl_value *value; 217 + } ctl_array SEC(".maps") = { 201 218 .type = BPF_MAP_TYPE_ARRAY, 202 - .key_size = sizeof(__u32), 203 - .value_size = sizeof(struct ctl_value), 204 219 .max_entries = CTL_MAP_SIZE, 205 220 }; 206 221
+30 -15
tools/testing/selftests/bpf/progs/test_l4lb_noinline.c
··· 165 165 unsigned short eth_proto; 166 166 }; 167 167 168 - struct bpf_map_def SEC("maps") vip_map = { 168 + struct { 169 + __u32 type; 170 + __u32 max_entries; 171 + struct vip *key; 172 + struct vip_meta *value; 173 + } vip_map SEC(".maps") = { 169 174 .type = BPF_MAP_TYPE_HASH, 170 - .key_size = sizeof(struct vip), 171 - .value_size = sizeof(struct vip_meta), 172 175 .max_entries = MAX_VIPS, 173 176 }; 174 177 175 - struct bpf_map_def SEC("maps") ch_rings = { 178 + struct { 179 + __u32 type; 180 + __u32 max_entries; 181 + __u32 *key; 182 + __u32 *value; 183 + } ch_rings SEC(".maps") = { 176 184 .type = BPF_MAP_TYPE_ARRAY, 177 - .key_size = sizeof(__u32), 178 - .value_size = sizeof(__u32), 179 185 .max_entries = CH_RINGS_SIZE, 180 186 }; 181 187 182 - struct bpf_map_def SEC("maps") reals = { 188 + struct { 189 + __u32 type; 190 + __u32 max_entries; 191 + __u32 *key; 192 + struct real_definition *value; 193 + } reals SEC(".maps") = { 183 194 .type = BPF_MAP_TYPE_ARRAY, 184 - .key_size = sizeof(__u32), 185 - .value_size = sizeof(struct real_definition), 186 195 .max_entries = MAX_REALS, 187 196 }; 188 197 189 - struct bpf_map_def SEC("maps") stats = { 198 + struct { 199 + __u32 type; 200 + __u32 max_entries; 201 + __u32 *key; 202 + struct vip_stats *value; 203 + } stats SEC(".maps") = { 190 204 .type = BPF_MAP_TYPE_PERCPU_ARRAY, 191 - .key_size = sizeof(__u32), 192 - .value_size = sizeof(struct vip_stats), 193 205 .max_entries = MAX_VIPS, 194 206 }; 195 207 196 - struct bpf_map_def SEC("maps") ctl_array = { 208 + struct { 209 + __u32 type; 210 + __u32 max_entries; 211 + __u32 *key; 212 + struct ctl_value *value; 213 + } ctl_array SEC(".maps") = { 197 214 .type = BPF_MAP_TYPE_ARRAY, 198 - .key_size = sizeof(__u32), 199 - .value_size = sizeof(struct ctl_value), 200 215 .max_entries = CTL_MAP_SIZE, 201 216 }; 202 217
+12 -10
tools/testing/selftests/bpf/progs/test_map_lock.c
··· 11 11 int var[VAR_NUM]; 12 12 }; 13 13 14 - struct bpf_map_def SEC("maps") hash_map = { 14 + struct { 15 + __u32 type; 16 + __u32 max_entries; 17 + __u32 *key; 18 + struct hmap_elem *value; 19 + } hash_map SEC(".maps") = { 15 20 .type = BPF_MAP_TYPE_HASH, 16 - .key_size = sizeof(int), 17 - .value_size = sizeof(struct hmap_elem), 18 21 .max_entries = 1, 19 22 }; 20 - 21 - BPF_ANNOTATE_KV_PAIR(hash_map, int, struct hmap_elem); 22 23 23 24 struct array_elem { 24 25 struct bpf_spin_lock lock; 25 26 int var[VAR_NUM]; 26 27 }; 27 28 28 - struct bpf_map_def SEC("maps") array_map = { 29 + struct { 30 + __u32 type; 31 + __u32 max_entries; 32 + int *key; 33 + struct array_elem *value; 34 + } array_map SEC(".maps") = { 29 35 .type = BPF_MAP_TYPE_ARRAY, 30 - .key_size = sizeof(int), 31 - .value_size = sizeof(struct array_elem), 32 36 .max_entries = 1, 33 37 }; 34 - 35 - BPF_ANNOTATE_KV_PAIR(array_map, int, struct array_elem); 36 38 37 39 SEC("map_lock_demo") 38 40 int bpf_map_lock_test(struct __sk_buff *skb)
+261
tools/testing/selftests/bpf/progs/test_seg6_loop.c
··· 1 + #include <stddef.h> 2 + #include <inttypes.h> 3 + #include <errno.h> 4 + #include <linux/seg6_local.h> 5 + #include <linux/bpf.h> 6 + #include "bpf_helpers.h" 7 + #include "bpf_endian.h" 8 + 9 + /* Packet parsing state machine helpers. */ 10 + #define cursor_advance(_cursor, _len) \ 11 + ({ void *_tmp = _cursor; _cursor += _len; _tmp; }) 12 + 13 + #define SR6_FLAG_ALERT (1 << 4) 14 + 15 + #define htonll(x) ((bpf_htonl(1)) == 1 ? (x) : ((uint64_t)bpf_htonl((x) & \ 16 + 0xFFFFFFFF) << 32) | bpf_htonl((x) >> 32)) 17 + #define ntohll(x) ((bpf_ntohl(1)) == 1 ? (x) : ((uint64_t)bpf_ntohl((x) & \ 18 + 0xFFFFFFFF) << 32) | bpf_ntohl((x) >> 32)) 19 + #define BPF_PACKET_HEADER __attribute__((packed)) 20 + 21 + struct ip6_t { 22 + unsigned int ver:4; 23 + unsigned int priority:8; 24 + unsigned int flow_label:20; 25 + unsigned short payload_len; 26 + unsigned char next_header; 27 + unsigned char hop_limit; 28 + unsigned long long src_hi; 29 + unsigned long long src_lo; 30 + unsigned long long dst_hi; 31 + unsigned long long dst_lo; 32 + } BPF_PACKET_HEADER; 33 + 34 + struct ip6_addr_t { 35 + unsigned long long hi; 36 + unsigned long long lo; 37 + } BPF_PACKET_HEADER; 38 + 39 + struct ip6_srh_t { 40 + unsigned char nexthdr; 41 + unsigned char hdrlen; 42 + unsigned char type; 43 + unsigned char segments_left; 44 + unsigned char first_segment; 45 + unsigned char flags; 46 + unsigned short tag; 47 + 48 + struct ip6_addr_t segments[0]; 49 + } BPF_PACKET_HEADER; 50 + 51 + struct sr6_tlv_t { 52 + unsigned char type; 53 + unsigned char len; 54 + unsigned char value[0]; 55 + } BPF_PACKET_HEADER; 56 + 57 + static __attribute__((always_inline)) struct ip6_srh_t *get_srh(struct __sk_buff *skb) 58 + { 59 + void *cursor, *data_end; 60 + struct ip6_srh_t *srh; 61 + struct ip6_t *ip; 62 + uint8_t *ipver; 63 + 64 + data_end = (void *)(long)skb->data_end; 65 + cursor = (void *)(long)skb->data; 66 + ipver = (uint8_t *)cursor; 67 + 68 + if ((void *)ipver + sizeof(*ipver) > data_end) 69 + 
return NULL; 70 + 71 + if ((*ipver >> 4) != 6) 72 + return NULL; 73 + 74 + ip = cursor_advance(cursor, sizeof(*ip)); 75 + if ((void *)ip + sizeof(*ip) > data_end) 76 + return NULL; 77 + 78 + if (ip->next_header != 43) 79 + return NULL; 80 + 81 + srh = cursor_advance(cursor, sizeof(*srh)); 82 + if ((void *)srh + sizeof(*srh) > data_end) 83 + return NULL; 84 + 85 + if (srh->type != 4) 86 + return NULL; 87 + 88 + return srh; 89 + } 90 + 91 + static __attribute__((always_inline)) 92 + int update_tlv_pad(struct __sk_buff *skb, uint32_t new_pad, 93 + uint32_t old_pad, uint32_t pad_off) 94 + { 95 + int err; 96 + 97 + if (new_pad != old_pad) { 98 + err = bpf_lwt_seg6_adjust_srh(skb, pad_off, 99 + (int) new_pad - (int) old_pad); 100 + if (err) 101 + return err; 102 + } 103 + 104 + if (new_pad > 0) { 105 + char pad_tlv_buf[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 106 + 0, 0, 0}; 107 + struct sr6_tlv_t *pad_tlv = (struct sr6_tlv_t *) pad_tlv_buf; 108 + 109 + pad_tlv->type = SR6_TLV_PADDING; 110 + pad_tlv->len = new_pad - 2; 111 + 112 + err = bpf_lwt_seg6_store_bytes(skb, pad_off, 113 + (void *)pad_tlv_buf, new_pad); 114 + if (err) 115 + return err; 116 + } 117 + 118 + return 0; 119 + } 120 + 121 + static __attribute__((always_inline)) 122 + int is_valid_tlv_boundary(struct __sk_buff *skb, struct ip6_srh_t *srh, 123 + uint32_t *tlv_off, uint32_t *pad_size, 124 + uint32_t *pad_off) 125 + { 126 + uint32_t srh_off, cur_off; 127 + int offset_valid = 0; 128 + int err; 129 + 130 + srh_off = (char *)srh - (char *)(long)skb->data; 131 + // cur_off = end of segments, start of possible TLVs 132 + cur_off = srh_off + sizeof(*srh) + 133 + sizeof(struct ip6_addr_t) * (srh->first_segment + 1); 134 + 135 + *pad_off = 0; 136 + 137 + // we can only go as far as ~10 TLVs due to the BPF max stack size 138 + #pragma clang loop unroll(disable) 139 + for (int i = 0; i < 100; i++) { 140 + struct sr6_tlv_t tlv; 141 + 142 + if (cur_off == *tlv_off) 143 + offset_valid = 1; 144 + 145 + if (cur_off 
>= srh_off + ((srh->hdrlen + 1) << 3)) 146 + break; 147 + 148 + err = bpf_skb_load_bytes(skb, cur_off, &tlv, sizeof(tlv)); 149 + if (err) 150 + return err; 151 + 152 + if (tlv.type == SR6_TLV_PADDING) { 153 + *pad_size = tlv.len + sizeof(tlv); 154 + *pad_off = cur_off; 155 + 156 + if (*tlv_off == srh_off) { 157 + *tlv_off = cur_off; 158 + offset_valid = 1; 159 + } 160 + break; 161 + 162 + } else if (tlv.type == SR6_TLV_HMAC) { 163 + break; 164 + } 165 + 166 + cur_off += sizeof(tlv) + tlv.len; 167 + } // we reached the padding or HMAC TLVs, or the end of the SRH 168 + 169 + if (*pad_off == 0) 170 + *pad_off = cur_off; 171 + 172 + if (*tlv_off == -1) 173 + *tlv_off = cur_off; 174 + else if (!offset_valid) 175 + return -EINVAL; 176 + 177 + return 0; 178 + } 179 + 180 + static __attribute__((always_inline)) 181 + int add_tlv(struct __sk_buff *skb, struct ip6_srh_t *srh, uint32_t tlv_off, 182 + struct sr6_tlv_t *itlv, uint8_t tlv_size) 183 + { 184 + uint32_t srh_off = (char *)srh - (char *)(long)skb->data; 185 + uint8_t len_remaining, new_pad; 186 + uint32_t pad_off = 0; 187 + uint32_t pad_size = 0; 188 + uint32_t partial_srh_len; 189 + int err; 190 + 191 + if (tlv_off != -1) 192 + tlv_off += srh_off; 193 + 194 + if (itlv->type == SR6_TLV_PADDING || itlv->type == SR6_TLV_HMAC) 195 + return -EINVAL; 196 + 197 + err = is_valid_tlv_boundary(skb, srh, &tlv_off, &pad_size, &pad_off); 198 + if (err) 199 + return err; 200 + 201 + err = bpf_lwt_seg6_adjust_srh(skb, tlv_off, sizeof(*itlv) + itlv->len); 202 + if (err) 203 + return err; 204 + 205 + err = bpf_lwt_seg6_store_bytes(skb, tlv_off, (void *)itlv, tlv_size); 206 + if (err) 207 + return err; 208 + 209 + // the following can't be moved inside update_tlv_pad because the 210 + // bpf verifier has some issues with it 211 + pad_off += sizeof(*itlv) + itlv->len; 212 + partial_srh_len = pad_off - srh_off; 213 + len_remaining = partial_srh_len % 8; 214 + new_pad = 8 - len_remaining; 215 + 216 + if (new_pad == 1) // cannot pad for 
1 byte only 217 + new_pad = 9; 218 + else if (new_pad == 8) 219 + new_pad = 0; 220 + 221 + return update_tlv_pad(skb, new_pad, pad_size, pad_off); 222 + } 223 + 224 + // Add an Egress TLV fc00::4, add the flag A, 225 + // and apply End.X action to fc42::1 226 + SEC("lwt_seg6local") 227 + int __add_egr_x(struct __sk_buff *skb) 228 + { 229 + unsigned long long hi = 0xfc42000000000000; 230 + unsigned long long lo = 0x1; 231 + struct ip6_srh_t *srh = get_srh(skb); 232 + uint8_t new_flags = SR6_FLAG_ALERT; 233 + struct ip6_addr_t addr; 234 + int err, offset; 235 + 236 + if (srh == NULL) 237 + return BPF_DROP; 238 + 239 + uint8_t tlv[20] = {2, 18, 0, 0, 0xfd, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 240 + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x4}; 241 + 242 + err = add_tlv(skb, srh, (srh->hdrlen+1) << 3, 243 + (struct sr6_tlv_t *)&tlv, 20); 244 + if (err) 245 + return BPF_DROP; 246 + 247 + offset = sizeof(struct ip6_t) + offsetof(struct ip6_srh_t, flags); 248 + err = bpf_lwt_seg6_store_bytes(skb, offset, 249 + (void *)&new_flags, sizeof(new_flags)); 250 + if (err) 251 + return BPF_DROP; 252 + 253 + addr.lo = htonll(lo); 254 + addr.hi = htonll(hi); 255 + err = bpf_lwt_seg6_action(skb, SEG6_LOCAL_ACTION_END_X, 256 + (void *)&addr, sizeof(addr)); 257 + if (err) 258 + return BPF_DROP; 259 + return BPF_REDIRECT; 260 + } 261 + char __license[] SEC("license") = "GPL";
+31 -14
tools/testing/selftests/bpf/progs/test_select_reuseport_kern.c
··· 21 21 #define offsetof(TYPE, MEMBER) ((size_t) &((TYPE *)0)->MEMBER) 22 22 #endif 23 23 24 - struct bpf_map_def SEC("maps") outer_map = { 24 + struct { 25 + __u32 type; 26 + __u32 max_entries; 27 + __u32 key_size; 28 + __u32 value_size; 29 + } outer_map SEC(".maps") = { 25 30 .type = BPF_MAP_TYPE_ARRAY_OF_MAPS, 31 + .max_entries = 1, 26 32 .key_size = sizeof(__u32), 27 33 .value_size = sizeof(__u32), 28 - .max_entries = 1, 29 34 }; 30 35 31 - struct bpf_map_def SEC("maps") result_map = { 36 + struct { 37 + __u32 type; 38 + __u32 max_entries; 39 + __u32 *key; 40 + __u32 *value; 41 + } result_map SEC(".maps") = { 32 42 .type = BPF_MAP_TYPE_ARRAY, 33 - .key_size = sizeof(__u32), 34 - .value_size = sizeof(__u32), 35 43 .max_entries = NR_RESULTS, 36 44 }; 37 45 38 - struct bpf_map_def SEC("maps") tmp_index_ovr_map = { 46 + struct { 47 + __u32 type; 48 + __u32 max_entries; 49 + __u32 *key; 50 + int *value; 51 + } tmp_index_ovr_map SEC(".maps") = { 39 52 .type = BPF_MAP_TYPE_ARRAY, 40 - .key_size = sizeof(__u32), 41 - .value_size = sizeof(int), 42 53 .max_entries = 1, 43 54 }; 44 55 45 - struct bpf_map_def SEC("maps") linum_map = { 56 + struct { 57 + __u32 type; 58 + __u32 max_entries; 59 + __u32 *key; 60 + __u32 *value; 61 + } linum_map SEC(".maps") = { 46 62 .type = BPF_MAP_TYPE_ARRAY, 47 - .key_size = sizeof(__u32), 48 - .value_size = sizeof(__u32), 49 63 .max_entries = 1, 50 64 }; 51 65 52 - struct bpf_map_def SEC("maps") data_check_map = { 66 + struct { 67 + __u32 type; 68 + __u32 max_entries; 69 + __u32 *key; 70 + struct data_check *value; 71 + } data_check_map SEC(".maps") = { 53 72 .type = BPF_MAP_TYPE_ARRAY, 54 - .key_size = sizeof(__u32), 55 - .value_size = sizeof(struct data_check), 56 73 .max_entries = 1, 57 74 }; 58 75
+12 -10
tools/testing/selftests/bpf/progs/test_send_signal_kern.c
··· 4 4 #include <linux/version.h> 5 5 #include "bpf_helpers.h" 6 6 7 - struct bpf_map_def SEC("maps") info_map = { 7 + struct { 8 + __u32 type; 9 + __u32 max_entries; 10 + __u32 *key; 11 + __u64 *value; 12 + } info_map SEC(".maps") = { 8 13 .type = BPF_MAP_TYPE_ARRAY, 9 - .key_size = sizeof(__u32), 10 - .value_size = sizeof(__u64), 11 14 .max_entries = 1, 12 15 }; 13 16 14 - BPF_ANNOTATE_KV_PAIR(info_map, __u32, __u64); 15 - 16 - struct bpf_map_def SEC("maps") status_map = { 17 + struct { 18 + __u32 type; 19 + __u32 max_entries; 20 + __u32 *key; 21 + __u64 *value; 22 + } status_map SEC(".maps") = { 17 23 .type = BPF_MAP_TYPE_ARRAY, 18 - .key_size = sizeof(__u32), 19 - .value_size = sizeof(__u64), 20 24 .max_entries = 1, 21 25 }; 22 - 23 - BPF_ANNOTATE_KV_PAIR(status_map, __u32, __u64); 24 26 25 27 SEC("send_signal_demo") 26 28 int bpf_send_signal_test(void *ctx)
+36 -24
tools/testing/selftests/bpf/progs/test_sock_fields_kern.c
··· 27 27 __NR_BPF_LINUM_ARRAY_IDX, 28 28 }; 29 29 30 - struct bpf_map_def SEC("maps") addr_map = { 30 + struct { 31 + __u32 type; 32 + __u32 max_entries; 33 + __u32 *key; 34 + struct sockaddr_in6 *value; 35 + } addr_map SEC(".maps") = { 31 36 .type = BPF_MAP_TYPE_ARRAY, 32 - .key_size = sizeof(__u32), 33 - .value_size = sizeof(struct sockaddr_in6), 34 37 .max_entries = __NR_BPF_ADDR_ARRAY_IDX, 35 38 }; 36 39 37 - struct bpf_map_def SEC("maps") sock_result_map = { 40 + struct { 41 + __u32 type; 42 + __u32 max_entries; 43 + __u32 *key; 44 + struct bpf_sock *value; 45 + } sock_result_map SEC(".maps") = { 38 46 .type = BPF_MAP_TYPE_ARRAY, 39 - .key_size = sizeof(__u32), 40 - .value_size = sizeof(struct bpf_sock), 41 47 .max_entries = __NR_BPF_RESULT_ARRAY_IDX, 42 48 }; 43 49 44 - struct bpf_map_def SEC("maps") tcp_sock_result_map = { 50 + struct { 51 + __u32 type; 52 + __u32 max_entries; 53 + __u32 *key; 54 + struct bpf_tcp_sock *value; 55 + } tcp_sock_result_map SEC(".maps") = { 45 56 .type = BPF_MAP_TYPE_ARRAY, 46 - .key_size = sizeof(__u32), 47 - .value_size = sizeof(struct bpf_tcp_sock), 48 57 .max_entries = __NR_BPF_RESULT_ARRAY_IDX, 49 58 }; 50 59 51 - struct bpf_map_def SEC("maps") linum_map = { 60 + struct { 61 + __u32 type; 62 + __u32 max_entries; 63 + __u32 *key; 64 + __u32 *value; 65 + } linum_map SEC(".maps") = { 52 66 .type = BPF_MAP_TYPE_ARRAY, 53 - .key_size = sizeof(__u32), 54 - .value_size = sizeof(__u32), 55 67 .max_entries = __NR_BPF_LINUM_ARRAY_IDX, 56 68 }; 57 69 ··· 72 60 __u32 cnt; 73 61 }; 74 62 75 - struct bpf_map_def SEC("maps") sk_pkt_out_cnt = { 63 + struct { 64 + __u32 type; 65 + __u32 map_flags; 66 + int *key; 67 + struct bpf_spinlock_cnt *value; 68 + } sk_pkt_out_cnt SEC(".maps") = { 76 69 .type = BPF_MAP_TYPE_SK_STORAGE, 77 - .key_size = sizeof(int), 78 - .value_size = sizeof(struct bpf_spinlock_cnt), 79 - .max_entries = 0, 80 70 .map_flags = BPF_F_NO_PREALLOC, 81 71 }; 82 72 83 - BPF_ANNOTATE_KV_PAIR(sk_pkt_out_cnt, int, struct 
bpf_spinlock_cnt); 84 - 85 - struct bpf_map_def SEC("maps") sk_pkt_out_cnt10 = { 73 + struct { 74 + __u32 type; 75 + __u32 map_flags; 76 + int *key; 77 + struct bpf_spinlock_cnt *value; 78 + } sk_pkt_out_cnt10 SEC(".maps") = { 86 79 .type = BPF_MAP_TYPE_SK_STORAGE, 87 - .key_size = sizeof(int), 88 - .value_size = sizeof(struct bpf_spinlock_cnt), 89 - .max_entries = 0, 90 80 .map_flags = BPF_F_NO_PREALLOC, 91 81 }; 92 - 93 - BPF_ANNOTATE_KV_PAIR(sk_pkt_out_cnt10, int, struct bpf_spinlock_cnt); 94 82 95 83 static bool is_loopback6(__u32 *a6) 96 84 {
+17 -16
tools/testing/selftests/bpf/progs/test_spin_lock.c
··· 10 10 int test_padding; 11 11 }; 12 12 13 - struct bpf_map_def SEC("maps") hmap = { 13 + struct { 14 + __u32 type; 15 + __u32 max_entries; 16 + int *key; 17 + struct hmap_elem *value; 18 + } hmap SEC(".maps") = { 14 19 .type = BPF_MAP_TYPE_HASH, 15 - .key_size = sizeof(int), 16 - .value_size = sizeof(struct hmap_elem), 17 20 .max_entries = 1, 18 21 }; 19 - 20 - BPF_ANNOTATE_KV_PAIR(hmap, int, struct hmap_elem); 21 - 22 22 23 23 struct cls_elem { 24 24 struct bpf_spin_lock lock; 25 25 volatile int cnt; 26 26 }; 27 27 28 - struct bpf_map_def SEC("maps") cls_map = { 28 + struct { 29 + __u32 type; 30 + struct bpf_cgroup_storage_key *key; 31 + struct cls_elem *value; 32 + } cls_map SEC(".maps") = { 29 33 .type = BPF_MAP_TYPE_CGROUP_STORAGE, 30 - .key_size = sizeof(struct bpf_cgroup_storage_key), 31 - .value_size = sizeof(struct cls_elem), 32 34 }; 33 - 34 - BPF_ANNOTATE_KV_PAIR(cls_map, struct bpf_cgroup_storage_key, 35 - struct cls_elem); 36 35 37 36 struct bpf_vqueue { 38 37 struct bpf_spin_lock lock; ··· 41 42 unsigned int rate; 42 43 }; 43 44 44 - struct bpf_map_def SEC("maps") vqueue = { 45 + struct { 46 + __u32 type; 47 + __u32 max_entries; 48 + int *key; 49 + struct bpf_vqueue *value; 50 + } vqueue SEC(".maps") = { 45 51 .type = BPF_MAP_TYPE_ARRAY, 46 - .key_size = sizeof(int), 47 - .value_size = sizeof(struct bpf_vqueue), 48 52 .max_entries = 1, 49 53 }; 50 54 51 - BPF_ANNOTATE_KV_PAIR(vqueue, int, struct bpf_vqueue); 52 55 #define CREDIT_PER_NS(delta, rate) (((delta) * rate) >> 20) 53 56 54 57 SEC("spin_lock_demo")
+30 -14
tools/testing/selftests/bpf/progs/test_stacktrace_build_id.c
··· 8 8 #define PERF_MAX_STACK_DEPTH 127 9 9 #endif 10 10 11 - struct bpf_map_def SEC("maps") control_map = { 11 + struct { 12 + __u32 type; 13 + __u32 max_entries; 14 + __u32 *key; 15 + __u32 *value; 16 + } control_map SEC(".maps") = { 12 17 .type = BPF_MAP_TYPE_ARRAY, 13 - .key_size = sizeof(__u32), 14 - .value_size = sizeof(__u32), 15 18 .max_entries = 1, 16 19 }; 17 20 18 - struct bpf_map_def SEC("maps") stackid_hmap = { 21 + struct { 22 + __u32 type; 23 + __u32 max_entries; 24 + __u32 *key; 25 + __u32 *value; 26 + } stackid_hmap SEC(".maps") = { 19 27 .type = BPF_MAP_TYPE_HASH, 20 - .key_size = sizeof(__u32), 21 - .value_size = sizeof(__u32), 22 28 .max_entries = 16384, 23 29 }; 24 30 25 - struct bpf_map_def SEC("maps") stackmap = { 31 + typedef struct bpf_stack_build_id stack_trace_t[PERF_MAX_STACK_DEPTH]; 32 + 33 + struct { 34 + __u32 type; 35 + __u32 max_entries; 36 + __u32 map_flags; 37 + __u32 key_size; 38 + __u32 value_size; 39 + } stackmap SEC(".maps") = { 26 40 .type = BPF_MAP_TYPE_STACK_TRACE, 27 - .key_size = sizeof(__u32), 28 - .value_size = sizeof(struct bpf_stack_build_id) 29 - * PERF_MAX_STACK_DEPTH, 30 41 .max_entries = 128, 31 42 .map_flags = BPF_F_STACK_BUILD_ID, 43 + .key_size = sizeof(__u32), 44 + .value_size = sizeof(stack_trace_t), 32 45 }; 33 46 34 - struct bpf_map_def SEC("maps") stack_amap = { 47 + struct { 48 + __u32 type; 49 + __u32 max_entries; 50 + __u32 *key; 51 + /* there seems to be a bug in kernel not handling typedef properly */ 52 + struct bpf_stack_build_id (*value)[PERF_MAX_STACK_DEPTH]; 53 + } stack_amap SEC(".maps") = { 35 54 .type = BPF_MAP_TYPE_ARRAY, 36 - .key_size = sizeof(__u32), 37 - .value_size = sizeof(struct bpf_stack_build_id) 38 - * PERF_MAX_STACK_DEPTH, 39 55 .max_entries = 128, 40 56 }; 41 57
+28 -12
tools/testing/selftests/bpf/progs/test_stacktrace_map.c
··· 8 8 #define PERF_MAX_STACK_DEPTH 127 9 9 #endif 10 10 11 - struct bpf_map_def SEC("maps") control_map = { 11 + struct { 12 + __u32 type; 13 + __u32 max_entries; 14 + __u32 *key; 15 + __u32 *value; 16 + } control_map SEC(".maps") = { 12 17 .type = BPF_MAP_TYPE_ARRAY, 13 - .key_size = sizeof(__u32), 14 - .value_size = sizeof(__u32), 15 18 .max_entries = 1, 16 19 }; 17 20 18 - struct bpf_map_def SEC("maps") stackid_hmap = { 21 + struct { 22 + __u32 type; 23 + __u32 max_entries; 24 + __u32 *key; 25 + __u32 *value; 26 + } stackid_hmap SEC(".maps") = { 19 27 .type = BPF_MAP_TYPE_HASH, 20 - .key_size = sizeof(__u32), 21 - .value_size = sizeof(__u32), 22 28 .max_entries = 16384, 23 29 }; 24 30 25 - struct bpf_map_def SEC("maps") stackmap = { 31 + typedef __u64 stack_trace_t[PERF_MAX_STACK_DEPTH]; 32 + 33 + struct { 34 + __u32 type; 35 + __u32 max_entries; 36 + __u32 key_size; 37 + __u32 value_size; 38 + } stackmap SEC(".maps") = { 26 39 .type = BPF_MAP_TYPE_STACK_TRACE, 27 - .key_size = sizeof(__u32), 28 - .value_size = sizeof(__u64) * PERF_MAX_STACK_DEPTH, 29 40 .max_entries = 16384, 41 + .key_size = sizeof(__u32), 42 + .value_size = sizeof(stack_trace_t), 30 43 }; 31 44 32 - struct bpf_map_def SEC("maps") stack_amap = { 45 + struct { 46 + __u32 type; 47 + __u32 max_entries; 48 + __u32 *key; 49 + __u64 (*value)[PERF_MAX_STACK_DEPTH]; 50 + } stack_amap SEC(".maps") = { 33 51 .type = BPF_MAP_TYPE_ARRAY, 34 - .key_size = sizeof(__u32), 35 - .value_size = sizeof(__u64) * PERF_MAX_STACK_DEPTH, 36 52 .max_entries = 16384, 37 53 }; 38 54
+71
tools/testing/selftests/bpf/progs/test_sysctl_loop1.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + // Copyright (c) 2019 Facebook 3 + 4 + #include <stdint.h> 5 + #include <string.h> 6 + 7 + #include <linux/stddef.h> 8 + #include <linux/bpf.h> 9 + 10 + #include "bpf_helpers.h" 11 + 12 + #ifndef ARRAY_SIZE 13 + #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) 14 + #endif 15 + 16 + /* tcp_mem sysctl has only 3 ints, but this test is doing TCP_MEM_LOOPS */ 17 + #define TCP_MEM_LOOPS 28 /* because 30 doesn't fit into 512 bytes of stack */ 18 + #define MAX_ULONG_STR_LEN 7 19 + #define MAX_VALUE_STR_LEN (TCP_MEM_LOOPS * MAX_ULONG_STR_LEN) 20 + 21 + static __always_inline int is_tcp_mem(struct bpf_sysctl *ctx) 22 + { 23 + volatile char tcp_mem_name[] = "net/ipv4/tcp_mem/very_very_very_very_long_pointless_string"; 24 + unsigned char i; 25 + char name[64]; 26 + int ret; 27 + 28 + memset(name, 0, sizeof(name)); 29 + ret = bpf_sysctl_get_name(ctx, name, sizeof(name), 0); 30 + if (ret < 0 || ret != sizeof(tcp_mem_name) - 1) 31 + return 0; 32 + 33 + #pragma clang loop unroll(disable) 34 + for (i = 0; i < sizeof(tcp_mem_name); ++i) 35 + if (name[i] != tcp_mem_name[i]) 36 + return 0; 37 + 38 + return 1; 39 + } 40 + 41 + SEC("cgroup/sysctl") 42 + int sysctl_tcp_mem(struct bpf_sysctl *ctx) 43 + { 44 + unsigned long tcp_mem[TCP_MEM_LOOPS] = {}; 45 + char value[MAX_VALUE_STR_LEN]; 46 + unsigned char i, off = 0; 47 + int ret; 48 + 49 + if (ctx->write) 50 + return 0; 51 + 52 + if (!is_tcp_mem(ctx)) 53 + return 0; 54 + 55 + ret = bpf_sysctl_get_current_value(ctx, value, MAX_VALUE_STR_LEN); 56 + if (ret < 0 || ret >= MAX_VALUE_STR_LEN) 57 + return 0; 58 + 59 + #pragma clang loop unroll(disable) 60 + for (i = 0; i < ARRAY_SIZE(tcp_mem); ++i) { 61 + ret = bpf_strtoul(value + off, MAX_ULONG_STR_LEN, 0, 62 + tcp_mem + i); 63 + if (ret <= 0 || ret > MAX_ULONG_STR_LEN) 64 + return 0; 65 + off += ret & MAX_ULONG_STR_LEN; 66 + } 67 + 68 + return tcp_mem[0] < tcp_mem[1] && tcp_mem[1] < tcp_mem[2]; 69 + } 70 + 71 + char _license[] SEC("license") 
= "GPL";
+72
tools/testing/selftests/bpf/progs/test_sysctl_loop2.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + // Copyright (c) 2019 Facebook 3 + 4 + #include <stdint.h> 5 + #include <string.h> 6 + 7 + #include <linux/stddef.h> 8 + #include <linux/bpf.h> 9 + 10 + #include "bpf_helpers.h" 11 + 12 + #ifndef ARRAY_SIZE 13 + #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) 14 + #endif 15 + 16 + /* tcp_mem sysctl has only 3 ints, but this test is doing TCP_MEM_LOOPS */ 17 + #define TCP_MEM_LOOPS 20 /* because 30 doesn't fit into 512 bytes of stack */ 18 + #define MAX_ULONG_STR_LEN 7 19 + #define MAX_VALUE_STR_LEN (TCP_MEM_LOOPS * MAX_ULONG_STR_LEN) 20 + 21 + static __attribute__((noinline)) int is_tcp_mem(struct bpf_sysctl *ctx) 22 + { 23 + volatile char tcp_mem_name[] = "net/ipv4/tcp_mem/very_very_very_very_long_pointless_string_to_stress_byte_loop"; 24 + unsigned char i; 25 + char name[64]; 26 + int ret; 27 + 28 + memset(name, 0, sizeof(name)); 29 + ret = bpf_sysctl_get_name(ctx, name, sizeof(name), 0); 30 + if (ret < 0 || ret != sizeof(tcp_mem_name) - 1) 31 + return 0; 32 + 33 + #pragma clang loop unroll(disable) 34 + for (i = 0; i < sizeof(tcp_mem_name); ++i) 35 + if (name[i] != tcp_mem_name[i]) 36 + return 0; 37 + 38 + return 1; 39 + } 40 + 41 + 42 + SEC("cgroup/sysctl") 43 + int sysctl_tcp_mem(struct bpf_sysctl *ctx) 44 + { 45 + unsigned long tcp_mem[TCP_MEM_LOOPS] = {}; 46 + char value[MAX_VALUE_STR_LEN]; 47 + unsigned char i, off = 0; 48 + int ret; 49 + 50 + if (ctx->write) 51 + return 0; 52 + 53 + if (!is_tcp_mem(ctx)) 54 + return 0; 55 + 56 + ret = bpf_sysctl_get_current_value(ctx, value, MAX_VALUE_STR_LEN); 57 + if (ret < 0 || ret >= MAX_VALUE_STR_LEN) 58 + return 0; 59 + 60 + #pragma clang loop unroll(disable) 61 + for (i = 0; i < ARRAY_SIZE(tcp_mem); ++i) { 62 + ret = bpf_strtoul(value + off, MAX_ULONG_STR_LEN, 0, 63 + tcp_mem + i); 64 + if (ret <= 0 || ret > MAX_ULONG_STR_LEN) 65 + return 0; 66 + off += ret & MAX_ULONG_STR_LEN; 67 + } 68 + 69 + return tcp_mem[0] < tcp_mem[1] && tcp_mem[1] < tcp_mem[2]; 70 + } 71 + 
72 + char _license[] SEC("license") = "GPL";
+4 -1
tools/testing/selftests/bpf/progs/test_sysctl_prog.c
··· 8 8 #include <linux/bpf.h> 9 9 10 10 #include "bpf_helpers.h" 11 - #include "bpf_util.h" 12 11 13 12 /* Max supported length of a string with unsigned long in base 10 (pow2 - 1). */ 14 13 #define MAX_ULONG_STR_LEN 0xF 15 14 16 15 /* Max supported length of sysctl value string (pow2). */ 17 16 #define MAX_VALUE_STR_LEN 0x40 17 + 18 + #ifndef ARRAY_SIZE 19 + #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) 20 + #endif 18 21 19 22 static __always_inline int is_tcp_mem(struct bpf_sysctl *ctx) 20 23 {
+6 -3
tools/testing/selftests/bpf/progs/test_tcp_estats.c
··· 148 148 struct tcp_estats_conn_id conn_id; 149 149 }; 150 150 151 - struct bpf_map_def SEC("maps") ev_record_map = { 151 + struct { 152 + __u32 type; 153 + __u32 max_entries; 154 + __u32 *key; 155 + struct tcp_estats_basic_event *value; 156 + } ev_record_map SEC(".maps") = { 152 157 .type = BPF_MAP_TYPE_HASH, 153 - .key_size = sizeof(__u32), 154 - .value_size = sizeof(struct tcp_estats_basic_event), 155 158 .max_entries = 1024, 156 159 }; 157 160
+12 -6
tools/testing/selftests/bpf/progs/test_tcpbpf_kern.c
··· 14 14 #include "bpf_endian.h" 15 15 #include "test_tcpbpf.h" 16 16 17 - struct bpf_map_def SEC("maps") global_map = { 17 + struct { 18 + __u32 type; 19 + __u32 max_entries; 20 + __u32 *key; 21 + struct tcpbpf_globals *value; 22 + } global_map SEC(".maps") = { 18 23 .type = BPF_MAP_TYPE_ARRAY, 19 - .key_size = sizeof(__u32), 20 - .value_size = sizeof(struct tcpbpf_globals), 21 24 .max_entries = 4, 22 25 }; 23 26 24 - struct bpf_map_def SEC("maps") sockopt_results = { 27 + struct { 28 + __u32 type; 29 + __u32 max_entries; 30 + __u32 *key; 31 + int *value; 32 + } sockopt_results SEC(".maps") = { 25 33 .type = BPF_MAP_TYPE_ARRAY, 26 - .key_size = sizeof(__u32), 27 - .value_size = sizeof(int), 28 34 .max_entries = 2, 29 35 }; 30 36
+13 -5
tools/testing/selftests/bpf/progs/test_tcpnotify_kern.c
··· 14 14 #include "bpf_endian.h" 15 15 #include "test_tcpnotify.h" 16 16 17 - struct bpf_map_def SEC("maps") global_map = { 17 + struct { 18 + __u32 type; 19 + __u32 max_entries; 20 + __u32 *key; 21 + struct tcpnotify_globals *value; 22 + } global_map SEC(".maps") = { 18 23 .type = BPF_MAP_TYPE_ARRAY, 19 - .key_size = sizeof(__u32), 20 - .value_size = sizeof(struct tcpnotify_globals), 21 24 .max_entries = 4, 22 25 }; 23 26 24 - struct bpf_map_def SEC("maps") perf_event_map = { 27 + struct { 28 + __u32 type; 29 + __u32 max_entries; 30 + __u32 key_size; 31 + __u32 value_size; 32 + } perf_event_map SEC(".maps") = { 25 33 .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY, 34 + .max_entries = 2, 26 35 .key_size = sizeof(int), 27 36 .value_size = sizeof(__u32), 28 - .max_entries = 2, 29 37 }; 30 38 31 39 int _version SEC("version") = 1;
+12 -6
tools/testing/selftests/bpf/progs/test_xdp.c
··· 22 22 23 23 int _version SEC("version") = 1; 24 24 25 - struct bpf_map_def SEC("maps") rxcnt = { 25 + struct { 26 + __u32 type; 27 + __u32 max_entries; 28 + __u32 *key; 29 + __u64 *value; 30 + } rxcnt SEC(".maps") = { 26 31 .type = BPF_MAP_TYPE_PERCPU_ARRAY, 27 - .key_size = sizeof(__u32), 28 - .value_size = sizeof(__u64), 29 32 .max_entries = 256, 30 33 }; 31 34 32 - struct bpf_map_def SEC("maps") vip2tnl = { 35 + struct { 36 + __u32 type; 37 + __u32 max_entries; 38 + struct vip *key; 39 + struct iptnl_info *value; 40 + } vip2tnl SEC(".maps") = { 33 41 .type = BPF_MAP_TYPE_HASH, 34 - .key_size = sizeof(struct vip), 35 - .value_size = sizeof(struct iptnl_info), 36 42 .max_entries = MAX_IPTNL_ENTRIES, 37 43 }; 38 44
+231
tools/testing/selftests/bpf/progs/test_xdp_loop.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + // Copyright (c) 2019 Facebook 3 + #include <stddef.h> 4 + #include <string.h> 5 + #include <linux/bpf.h> 6 + #include <linux/if_ether.h> 7 + #include <linux/if_packet.h> 8 + #include <linux/ip.h> 9 + #include <linux/ipv6.h> 10 + #include <linux/in.h> 11 + #include <linux/udp.h> 12 + #include <linux/tcp.h> 13 + #include <linux/pkt_cls.h> 14 + #include <sys/socket.h> 15 + #include "bpf_helpers.h" 16 + #include "bpf_endian.h" 17 + #include "test_iptunnel_common.h" 18 + 19 + int _version SEC("version") = 1; 20 + 21 + struct bpf_map_def SEC("maps") rxcnt = { 22 + .type = BPF_MAP_TYPE_PERCPU_ARRAY, 23 + .key_size = sizeof(__u32), 24 + .value_size = sizeof(__u64), 25 + .max_entries = 256, 26 + }; 27 + 28 + struct bpf_map_def SEC("maps") vip2tnl = { 29 + .type = BPF_MAP_TYPE_HASH, 30 + .key_size = sizeof(struct vip), 31 + .value_size = sizeof(struct iptnl_info), 32 + .max_entries = MAX_IPTNL_ENTRIES, 33 + }; 34 + 35 + static __always_inline void count_tx(__u32 protocol) 36 + { 37 + __u64 *rxcnt_count; 38 + 39 + rxcnt_count = bpf_map_lookup_elem(&rxcnt, &protocol); 40 + if (rxcnt_count) 41 + *rxcnt_count += 1; 42 + } 43 + 44 + static __always_inline int get_dport(void *trans_data, void *data_end, 45 + __u8 protocol) 46 + { 47 + struct tcphdr *th; 48 + struct udphdr *uh; 49 + 50 + switch (protocol) { 51 + case IPPROTO_TCP: 52 + th = (struct tcphdr *)trans_data; 53 + if (th + 1 > data_end) 54 + return -1; 55 + return th->dest; 56 + case IPPROTO_UDP: 57 + uh = (struct udphdr *)trans_data; 58 + if (uh + 1 > data_end) 59 + return -1; 60 + return uh->dest; 61 + default: 62 + return 0; 63 + } 64 + } 65 + 66 + static __always_inline void set_ethhdr(struct ethhdr *new_eth, 67 + const struct ethhdr *old_eth, 68 + const struct iptnl_info *tnl, 69 + __be16 h_proto) 70 + { 71 + memcpy(new_eth->h_source, old_eth->h_dest, sizeof(new_eth->h_source)); 72 + memcpy(new_eth->h_dest, tnl->dmac, sizeof(new_eth->h_dest)); 73 + new_eth->h_proto = 
h_proto; 74 + } 75 + 76 + static __always_inline int handle_ipv4(struct xdp_md *xdp) 77 + { 78 + void *data_end = (void *)(long)xdp->data_end; 79 + void *data = (void *)(long)xdp->data; 80 + struct iptnl_info *tnl; 81 + struct ethhdr *new_eth; 82 + struct ethhdr *old_eth; 83 + struct iphdr *iph = data + sizeof(struct ethhdr); 84 + __u16 *next_iph; 85 + __u16 payload_len; 86 + struct vip vip = {}; 87 + int dport; 88 + __u32 csum = 0; 89 + int i; 90 + 91 + if (iph + 1 > data_end) 92 + return XDP_DROP; 93 + 94 + dport = get_dport(iph + 1, data_end, iph->protocol); 95 + if (dport == -1) 96 + return XDP_DROP; 97 + 98 + vip.protocol = iph->protocol; 99 + vip.family = AF_INET; 100 + vip.daddr.v4 = iph->daddr; 101 + vip.dport = dport; 102 + payload_len = bpf_ntohs(iph->tot_len); 103 + 104 + tnl = bpf_map_lookup_elem(&vip2tnl, &vip); 105 + /* It only does v4-in-v4 */ 106 + if (!tnl || tnl->family != AF_INET) 107 + return XDP_PASS; 108 + 109 + if (bpf_xdp_adjust_head(xdp, 0 - (int)sizeof(struct iphdr))) 110 + return XDP_DROP; 111 + 112 + data = (void *)(long)xdp->data; 113 + data_end = (void *)(long)xdp->data_end; 114 + 115 + new_eth = data; 116 + iph = data + sizeof(*new_eth); 117 + old_eth = data + sizeof(*iph); 118 + 119 + if (new_eth + 1 > data_end || 120 + old_eth + 1 > data_end || 121 + iph + 1 > data_end) 122 + return XDP_DROP; 123 + 124 + set_ethhdr(new_eth, old_eth, tnl, bpf_htons(ETH_P_IP)); 125 + 126 + iph->version = 4; 127 + iph->ihl = sizeof(*iph) >> 2; 128 + iph->frag_off = 0; 129 + iph->protocol = IPPROTO_IPIP; 130 + iph->check = 0; 131 + iph->tos = 0; 132 + iph->tot_len = bpf_htons(payload_len + sizeof(*iph)); 133 + iph->daddr = tnl->daddr.v4; 134 + iph->saddr = tnl->saddr.v4; 135 + iph->ttl = 8; 136 + 137 + next_iph = (__u16 *)iph; 138 + #pragma clang loop unroll(disable) 139 + for (i = 0; i < sizeof(*iph) >> 1; i++) 140 + csum += *next_iph++; 141 + 142 + iph->check = ~((csum & 0xffff) + (csum >> 16)); 143 + 144 + count_tx(vip.protocol); 145 + 146 + return 
XDP_TX; 147 + } 148 + 149 + static __always_inline int handle_ipv6(struct xdp_md *xdp) 150 + { 151 + void *data_end = (void *)(long)xdp->data_end; 152 + void *data = (void *)(long)xdp->data; 153 + struct iptnl_info *tnl; 154 + struct ethhdr *new_eth; 155 + struct ethhdr *old_eth; 156 + struct ipv6hdr *ip6h = data + sizeof(struct ethhdr); 157 + __u16 payload_len; 158 + struct vip vip = {}; 159 + int dport; 160 + 161 + if (ip6h + 1 > data_end) 162 + return XDP_DROP; 163 + 164 + dport = get_dport(ip6h + 1, data_end, ip6h->nexthdr); 165 + if (dport == -1) 166 + return XDP_DROP; 167 + 168 + vip.protocol = ip6h->nexthdr; 169 + vip.family = AF_INET6; 170 + memcpy(vip.daddr.v6, ip6h->daddr.s6_addr32, sizeof(vip.daddr)); 171 + vip.dport = dport; 172 + payload_len = ip6h->payload_len; 173 + 174 + tnl = bpf_map_lookup_elem(&vip2tnl, &vip); 175 + /* It only does v6-in-v6 */ 176 + if (!tnl || tnl->family != AF_INET6) 177 + return XDP_PASS; 178 + 179 + if (bpf_xdp_adjust_head(xdp, 0 - (int)sizeof(struct ipv6hdr))) 180 + return XDP_DROP; 181 + 182 + data = (void *)(long)xdp->data; 183 + data_end = (void *)(long)xdp->data_end; 184 + 185 + new_eth = data; 186 + ip6h = data + sizeof(*new_eth); 187 + old_eth = data + sizeof(*ip6h); 188 + 189 + if (new_eth + 1 > data_end || old_eth + 1 > data_end || 190 + ip6h + 1 > data_end) 191 + return XDP_DROP; 192 + 193 + set_ethhdr(new_eth, old_eth, tnl, bpf_htons(ETH_P_IPV6)); 194 + 195 + ip6h->version = 6; 196 + ip6h->priority = 0; 197 + memset(ip6h->flow_lbl, 0, sizeof(ip6h->flow_lbl)); 198 + ip6h->payload_len = bpf_htons(bpf_ntohs(payload_len) + sizeof(*ip6h)); 199 + ip6h->nexthdr = IPPROTO_IPV6; 200 + ip6h->hop_limit = 8; 201 + memcpy(ip6h->saddr.s6_addr32, tnl->saddr.v6, sizeof(tnl->saddr.v6)); 202 + memcpy(ip6h->daddr.s6_addr32, tnl->daddr.v6, sizeof(tnl->daddr.v6)); 203 + 204 + count_tx(vip.protocol); 205 + 206 + return XDP_TX; 207 + } 208 + 209 + SEC("xdp_tx_iptunnel") 210 + int _xdp_tx_iptunnel(struct xdp_md *xdp) 211 + { 212 + void 
*data_end = (void *)(long)xdp->data_end; 213 + void *data = (void *)(long)xdp->data; 214 + struct ethhdr *eth = data; 215 + __u16 h_proto; 216 + 217 + if (eth + 1 > data_end) 218 + return XDP_DROP; 219 + 220 + h_proto = eth->h_proto; 221 + 222 + if (h_proto == bpf_htons(ETH_P_IP)) 223 + return handle_ipv4(xdp); 224 + else if (h_proto == bpf_htons(ETH_P_IPV6)) 225 + 226 + return handle_ipv6(xdp); 227 + else 228 + return XDP_DROP; 229 + } 230 + 231 + char _license[] SEC("license") = "GPL";
+37 -23
tools/testing/selftests/bpf/progs/test_xdp_noinline.c
··· 163 163 __u64 v1; 164 164 }; 165 165 166 - struct bpf_map_def __attribute__ ((section("maps"), used)) vip_map = { 166 + struct { 167 + __u32 type; 168 + __u32 max_entries; 169 + struct vip_definition *key; 170 + struct vip_meta *value; 171 + } vip_map SEC(".maps") = { 167 172 .type = BPF_MAP_TYPE_HASH, 168 - .key_size = sizeof(struct vip_definition), 169 - .value_size = sizeof(struct vip_meta), 170 173 .max_entries = 512, 171 - .map_flags = 0, 172 174 }; 173 175 174 - struct bpf_map_def __attribute__ ((section("maps"), used)) lru_cache = { 176 + struct { 177 + __u32 type; 178 + __u32 max_entries; 179 + __u32 map_flags; 180 + struct flow_key *key; 181 + struct real_pos_lru *value; 182 + } lru_cache SEC(".maps") = { 175 183 .type = BPF_MAP_TYPE_LRU_HASH, 176 - .key_size = sizeof(struct flow_key), 177 - .value_size = sizeof(struct real_pos_lru), 178 184 .max_entries = 300, 179 185 .map_flags = 1U << 1, 180 186 }; 181 187 182 - struct bpf_map_def __attribute__ ((section("maps"), used)) ch_rings = { 188 + struct { 189 + __u32 type; 190 + __u32 max_entries; 191 + __u32 *key; 192 + __u32 *value; 193 + } ch_rings SEC(".maps") = { 183 194 .type = BPF_MAP_TYPE_ARRAY, 184 - .key_size = sizeof(__u32), 185 - .value_size = sizeof(__u32), 186 195 .max_entries = 12 * 655, 187 - .map_flags = 0, 188 196 }; 189 197 190 - struct bpf_map_def __attribute__ ((section("maps"), used)) reals = { 198 + struct { 199 + __u32 type; 200 + __u32 max_entries; 201 + __u32 *key; 202 + struct real_definition *value; 203 + } reals SEC(".maps") = { 191 204 .type = BPF_MAP_TYPE_ARRAY, 192 - .key_size = sizeof(__u32), 193 - .value_size = sizeof(struct real_definition), 194 205 .max_entries = 40, 195 - .map_flags = 0, 196 206 }; 197 207 198 - struct bpf_map_def __attribute__ ((section("maps"), used)) stats = { 208 + struct { 209 + __u32 type; 210 + __u32 max_entries; 211 + __u32 *key; 212 + struct lb_stats *value; 213 + } stats SEC(".maps") = { 199 214 .type = BPF_MAP_TYPE_PERCPU_ARRAY, 200 - 
.key_size = sizeof(__u32), 201 - .value_size = sizeof(struct lb_stats), 202 215 .max_entries = 515, 203 - .map_flags = 0, 204 216 }; 205 217 206 - struct bpf_map_def __attribute__ ((section("maps"), used)) ctl_array = { 218 + struct { 219 + __u32 type; 220 + __u32 max_entries; 221 + __u32 *key; 222 + struct ctl_value *value; 223 + } ctl_array SEC(".maps") = { 207 224 .type = BPF_MAP_TYPE_ARRAY, 208 - .key_size = sizeof(__u32), 209 - .value_size = sizeof(struct ctl_value), 210 225 .max_entries = 16, 211 - .map_flags = 0, 212 226 }; 213 227 214 228 struct eth_hdr {
+3 -7
tools/testing/selftests/bpf/test_btf.c
··· 4016 4016 }; 4017 4017 4018 4018 static struct btf_file_test file_tests[] = { 4019 - { 4020 - .file = "test_btf_haskv.o", 4021 - }, 4022 - { 4023 - .file = "test_btf_nokv.o", 4024 - .btf_kv_notfound = true, 4025 - }, 4019 + { .file = "test_btf_haskv.o", }, 4020 + { .file = "test_btf_newkv.o", }, 4021 + { .file = "test_btf_nokv.o", .btf_kv_notfound = true, }, 4026 4022 }; 4027 4023 4028 4024 static int do_test_file(unsigned int test_num)
+54
tools/testing/selftests/bpf/test_select_reuseport.c
··· 523 523 printf("OK\n"); 524 524 } 525 525 526 + static void test_detach_bpf(int type, sa_family_t family) 527 + { 528 + #ifdef SO_DETACH_REUSEPORT_BPF 529 + __u32 nr_run_before = 0, nr_run_after = 0, tmp, i; 530 + struct epoll_event ev; 531 + int cli_fd, err, nev; 532 + struct cmd cmd = {}; 533 + int optvalue = 0; 534 + 535 + printf("%s: ", __func__); 536 + err = setsockopt(sk_fds[0], SOL_SOCKET, SO_DETACH_REUSEPORT_BPF, 537 + &optvalue, sizeof(optvalue)); 538 + CHECK(err == -1, "setsockopt(SO_DETACH_REUSEPORT_BPF)", 539 + "err:%d errno:%d\n", err, errno); 540 + 541 + err = setsockopt(sk_fds[1], SOL_SOCKET, SO_DETACH_REUSEPORT_BPF, 542 + &optvalue, sizeof(optvalue)); 543 + CHECK(err == 0 || errno != ENOENT, "setsockopt(SO_DETACH_REUSEPORT_BPF)", 544 + "err:%d errno:%d\n", err, errno); 545 + 546 + for (i = 0; i < NR_RESULTS; i++) { 547 + err = bpf_map_lookup_elem(result_map, &i, &tmp); 548 + CHECK(err == -1, "lookup_elem(result_map)", 549 + "i:%u err:%d errno:%d\n", i, err, errno); 550 + nr_run_before += tmp; 551 + } 552 + 553 + cli_fd = send_data(type, family, &cmd, sizeof(cmd), PASS); 554 + nev = epoll_wait(epfd, &ev, 1, 5); 555 + CHECK(nev <= 0, "nev <= 0", 556 + "nev:%d expected:1 type:%d family:%d data:(0, 0)\n", 557 + nev, type, family); 558 + 559 + for (i = 0; i < NR_RESULTS; i++) { 560 + err = bpf_map_lookup_elem(result_map, &i, &tmp); 561 + CHECK(err == -1, "lookup_elem(result_map)", 562 + "i:%u err:%d errno:%d\n", i, err, errno); 563 + nr_run_after += tmp; 564 + } 565 + 566 + CHECK(nr_run_before != nr_run_after, 567 + "nr_run_before != nr_run_after", 568 + "nr_run_before:%u nr_run_after:%u\n", 569 + nr_run_before, nr_run_after); 570 + 571 + printf("OK\n"); 572 + close(cli_fd); 573 + #else 574 + printf("%s: SKIP\n", __func__); 575 + #endif 576 + } 577 + 526 578 static void prepare_sk_fds(int type, sa_family_t family, bool inany) 527 579 { 528 580 const int first = REUSEPORT_ARRAY_SIZE - 1; ··· 716 664 test_pass(type, family); 717 665 
test_syncookie(type, family); 718 666 test_pass_on_err(type, family); 667 + /* Must be the last test */ 668 + test_detach_bpf(type, family); 719 669 720 670 cleanup_per_test(); 721 671 printf("\n");
+9 -15
tools/testing/selftests/bpf/test_socket_cookie.c
··· 18 18 #define CG_PATH "/foo" 19 19 #define SOCKET_COOKIE_PROG "./socket_cookie_prog.o" 20 20 21 + struct socket_cookie { 22 + __u64 cookie_key; 23 + __u32 cookie_value; 24 + }; 25 + 21 26 static int start_server(void) 22 27 { 23 28 struct sockaddr_in6 addr; ··· 94 89 __u32 cookie_expected_value; 95 90 struct sockaddr_in6 addr; 96 91 socklen_t len = sizeof(addr); 97 - __u32 cookie_value; 98 - __u64 cookie_key; 92 + struct socket_cookie val; 99 93 int err = 0; 100 94 int map_fd; 101 95 ··· 105 101 106 102 map_fd = bpf_map__fd(map); 107 103 108 - err = bpf_map_get_next_key(map_fd, NULL, &cookie_key); 109 - if (err) { 110 - log_err("Can't get cookie key from map"); 111 - goto out; 112 - } 113 - 114 - err = bpf_map_lookup_elem(map_fd, &cookie_key, &cookie_value); 115 - if (err) { 116 - log_err("Can't get cookie value from map"); 117 - goto out; 118 - } 104 + err = bpf_map_lookup_elem(map_fd, &client_fd, &val); 119 105 120 106 err = getsockname(client_fd, (struct sockaddr *)&addr, &len); 121 107 if (err) { ··· 114 120 } 115 121 116 122 cookie_expected_value = (ntohs(addr.sin6_port) << 8) | 0xFF; 117 - if (cookie_value != cookie_expected_value) { 118 - log_err("Unexpected value in map: %x != %x", cookie_value, 123 + if (val.cookie_value != cookie_expected_value) { 124 + log_err("Unexpected value in map: %x != %x", val.cookie_value, 119 125 cookie_expected_value); 120 126 goto err; 121 127 }
+4 -7
tools/testing/selftests/bpf/test_verifier.c
··· 234 234 insn[i++] = BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_6, 235 235 -8 * (k % 64 + 1)); 236 236 } 237 - /* every jump adds 1 step to insn_processed, so to stay exactly 238 - * within 1m limit add MAX_TEST_INSNS - MAX_JMP_SEQ - 1 MOVs and 1 EXIT 237 + /* is_state_visited() doesn't allocate state for pruning for every jump. 238 + * Hence multiply jmps by 4 to accommodate that heuristic 239 239 */ 240 - while (i < MAX_TEST_INSNS - MAX_JMP_SEQ - 1) 240 + while (i < MAX_TEST_INSNS - MAX_JMP_SEQ * 4) 241 241 insn[i++] = BPF_ALU64_IMM(BPF_MOV, BPF_REG_0, 42); 242 242 insn[i] = BPF_EXIT_INSN(); 243 243 self->prog_len = i + 1; ··· 266 266 insn[i++] = BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_6, 267 267 -8 * (k % (64 - 4 * FUNC_NEST) + 1)); 268 268 } 269 - /* every jump adds 1 step to insn_processed, so to stay exactly 270 - * within 1m limit add MAX_TEST_INSNS - MAX_JMP_SEQ - 1 MOVs and 1 EXIT 271 - */ 272 - while (i < MAX_TEST_INSNS - MAX_JMP_SEQ - 1) 269 + while (i < MAX_TEST_INSNS - MAX_JMP_SEQ * 4) 273 270 insn[i++] = BPF_ALU64_IMM(BPF_MOV, BPF_REG_0, 42); 274 271 insn[i] = BPF_EXIT_INSN(); 275 272 self->prog_len = i + 1;
+13 -9
tools/testing/selftests/bpf/verifier/calls.c
··· 215 215 BPF_MOV64_IMM(BPF_REG_0, 3), 216 216 BPF_JMP_IMM(BPF_JA, 0, 0, -6), 217 217 }, 218 - .prog_type = BPF_PROG_TYPE_TRACEPOINT, 219 - .errstr = "back-edge from insn", 220 - .result = REJECT, 218 + .prog_type = BPF_PROG_TYPE_SOCKET_FILTER, 219 + .errstr_unpriv = "back-edge from insn", 220 + .result_unpriv = REJECT, 221 + .result = ACCEPT, 222 + .retval = 1, 221 223 }, 222 224 { 223 225 "calls: conditional call 4", ··· 252 250 BPF_MOV64_IMM(BPF_REG_0, 3), 253 251 BPF_EXIT_INSN(), 254 252 }, 255 - .prog_type = BPF_PROG_TYPE_TRACEPOINT, 256 - .errstr = "back-edge from insn", 257 - .result = REJECT, 253 + .prog_type = BPF_PROG_TYPE_SCHED_CLS, 254 + .result = ACCEPT, 255 + .retval = 1, 258 256 }, 259 257 { 260 258 "calls: conditional call 6", 261 259 .insns = { 260 + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), 261 + BPF_MOV64_REG(BPF_REG_1, BPF_REG_6), 262 262 BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 2), 263 - BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, -2), 263 + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, -3), 264 264 BPF_EXIT_INSN(), 265 265 BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, 266 266 offsetof(struct __sk_buff, mark)), 267 267 BPF_EXIT_INSN(), 268 268 }, 269 - .prog_type = BPF_PROG_TYPE_TRACEPOINT, 270 - .errstr = "back-edge from insn", 269 + .prog_type = BPF_PROG_TYPE_SCHED_CLS, 270 + .errstr = "infinite loop detected", 271 271 .result = REJECT, 272 272 }, 273 273 {
+7 -4
tools/testing/selftests/bpf/verifier/cfg.c
··· 41 41 BPF_JMP_IMM(BPF_JA, 0, 0, -1), 42 42 BPF_EXIT_INSN(), 43 43 }, 44 - .errstr = "back-edge", 44 + .errstr = "unreachable insn 1", 45 + .errstr_unpriv = "back-edge", 45 46 .result = REJECT, 46 47 }, 47 48 { ··· 54 53 BPF_JMP_IMM(BPF_JA, 0, 0, -4), 55 54 BPF_EXIT_INSN(), 56 55 }, 57 - .errstr = "back-edge", 56 + .errstr = "unreachable insn 4", 57 + .errstr_unpriv = "back-edge", 58 58 .result = REJECT, 59 59 }, 60 60 { 61 61 "conditional loop", 62 62 .insns = { 63 - BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), 63 + BPF_MOV64_REG(BPF_REG_0, BPF_REG_1), 64 64 BPF_MOV64_REG(BPF_REG_2, BPF_REG_0), 65 65 BPF_MOV64_REG(BPF_REG_3, BPF_REG_0), 66 66 BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, -3), 67 67 BPF_EXIT_INSN(), 68 68 }, 69 - .errstr = "back-edge", 69 + .errstr = "infinite loop detected", 70 + .errstr_unpriv = "back-edge", 70 71 .result = REJECT, 71 72 },
+2 -1
tools/testing/selftests/bpf/verifier/direct_packet_access.c
··· 511 511 offsetof(struct __sk_buff, data)), 512 512 BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, 513 513 offsetof(struct __sk_buff, data_end)), 514 - BPF_MOV64_IMM(BPF_REG_0, 0xffffffff), 514 + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, 515 + offsetof(struct __sk_buff, mark)), 515 516 BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -8), 516 517 BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_10, -8), 517 518 BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 0xffff),
+15 -13
tools/testing/selftests/bpf/verifier/helper_access_var_len.c
··· 29 29 { 30 30 "helper access to variable memory: stack, bitwise AND, zero included", 31 31 .insns = { 32 + BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, 8), 32 33 BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), 33 34 BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -64), 34 - BPF_MOV64_IMM(BPF_REG_2, 16), 35 35 BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_2, -128), 36 36 BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, -128), 37 37 BPF_ALU64_IMM(BPF_AND, BPF_REG_2, 64), ··· 46 46 { 47 47 "helper access to variable memory: stack, bitwise AND + JMP, wrong max", 48 48 .insns = { 49 + BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, 8), 49 50 BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), 50 51 BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -64), 51 - BPF_MOV64_IMM(BPF_REG_2, 16), 52 52 BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_2, -128), 53 53 BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, -128), 54 54 BPF_ALU64_IMM(BPF_AND, BPF_REG_2, 65), ··· 122 122 { 123 123 "helper access to variable memory: stack, JMP, bounds + offset", 124 124 .insns = { 125 + BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, 8), 125 126 BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), 126 127 BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -64), 127 - BPF_MOV64_IMM(BPF_REG_2, 16), 128 128 BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_2, -128), 129 129 BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, -128), 130 130 BPF_JMP_IMM(BPF_JGT, BPF_REG_2, 64, 5), ··· 143 143 { 144 144 "helper access to variable memory: stack, JMP, wrong max", 145 145 .insns = { 146 + BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, 8), 146 147 BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), 147 148 BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -64), 148 - BPF_MOV64_IMM(BPF_REG_2, 16), 149 149 BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_2, -128), 150 150 BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, -128), 151 151 BPF_JMP_IMM(BPF_JGT, BPF_REG_2, 65, 4), ··· 163 163 { 164 164 "helper access to variable memory: stack, JMP, no max check", 165 165 .insns = { 166 + BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, 8), 166 167 BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), 167 168 
BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -64), 168 - BPF_MOV64_IMM(BPF_REG_2, 16), 169 169 BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_2, -128), 170 170 BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, -128), 171 171 BPF_MOV64_IMM(BPF_REG_4, 0), ··· 183 183 { 184 184 "helper access to variable memory: stack, JMP, no min check", 185 185 .insns = { 186 + BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, 8), 186 187 BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), 187 188 BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -64), 188 - BPF_MOV64_IMM(BPF_REG_2, 16), 189 189 BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_2, -128), 190 190 BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, -128), 191 191 BPF_JMP_IMM(BPF_JGT, BPF_REG_2, 64, 3), ··· 201 201 { 202 202 "helper access to variable memory: stack, JMP (signed), no min check", 203 203 .insns = { 204 + BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, 8), 204 205 BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), 205 206 BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -64), 206 - BPF_MOV64_IMM(BPF_REG_2, 16), 207 207 BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_2, -128), 208 208 BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, -128), 209 209 BPF_JMP_IMM(BPF_JSGT, BPF_REG_2, 64, 3), ··· 244 244 { 245 245 "helper access to variable memory: map, JMP, wrong max", 246 246 .insns = { 247 + BPF_LDX_MEM(BPF_DW, BPF_REG_6, BPF_REG_1, 8), 247 248 BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), 248 249 BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), 249 250 BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), ··· 252 251 BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), 253 252 BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 10), 254 253 BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), 255 - BPF_MOV64_IMM(BPF_REG_2, sizeof(struct test_val)), 254 + BPF_MOV64_REG(BPF_REG_2, BPF_REG_6), 256 255 BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_2, -128), 257 256 BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_10, -128), 258 257 BPF_JMP_IMM(BPF_JSGT, BPF_REG_2, sizeof(struct test_val) + 1, 4), ··· 263 262 BPF_MOV64_IMM(BPF_REG_0, 0), 264 263 BPF_EXIT_INSN(), 265 264 }, 266 - .fixup_map_hash_48b = { 3 }, 265 + .fixup_map_hash_48b = { 4 
}, 267 266 .errstr = "invalid access to map value, value_size=48 off=0 size=49", 268 267 .result = REJECT, 269 268 .prog_type = BPF_PROG_TYPE_TRACEPOINT, ··· 297 296 { 298 297 "helper access to variable memory: map adjusted, JMP, wrong max", 299 298 .insns = { 299 + BPF_LDX_MEM(BPF_DW, BPF_REG_6, BPF_REG_1, 8), 300 300 BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), 301 301 BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), 302 302 BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), ··· 306 304 BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 11), 307 305 BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), 308 306 BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 20), 309 - BPF_MOV64_IMM(BPF_REG_2, sizeof(struct test_val)), 307 + BPF_MOV64_REG(BPF_REG_2, BPF_REG_6), 310 308 BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_2, -128), 311 309 BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_10, -128), 312 310 BPF_JMP_IMM(BPF_JSGT, BPF_REG_2, sizeof(struct test_val) - 19, 4), ··· 317 315 BPF_MOV64_IMM(BPF_REG_0, 0), 318 316 BPF_EXIT_INSN(), 319 317 }, 320 - .fixup_map_hash_48b = { 3 }, 318 + .fixup_map_hash_48b = { 4 }, 321 319 .errstr = "R1 min value is outside of the array range", 322 320 .result = REJECT, 323 321 .prog_type = BPF_PROG_TYPE_TRACEPOINT, ··· 339 337 { 340 338 "helper access to variable memory: size > 0 not allowed on NULL (ARG_PTR_TO_MEM_OR_NULL)", 341 339 .insns = { 340 + BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, 0), 342 341 BPF_MOV64_IMM(BPF_REG_1, 0), 343 - BPF_MOV64_IMM(BPF_REG_2, 1), 344 342 BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_2, -128), 345 343 BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_10, -128), 346 344 BPF_ALU64_IMM(BPF_AND, BPF_REG_2, 64), ··· 564 562 { 565 563 "helper access to variable memory: 8 bytes leak", 566 564 .insns = { 565 + BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, 8), 567 566 BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), 568 567 BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -64), 569 568 BPF_MOV64_IMM(BPF_REG_0, 0), ··· 575 572 BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -24), 576 573 BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -16), 577 574 
BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -8), 578 - BPF_MOV64_IMM(BPF_REG_2, 1), 579 575 BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_2, -128), 580 576 BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_10, -128), 581 577 BPF_ALU64_IMM(BPF_AND, BPF_REG_2, 63),
+161
tools/testing/selftests/bpf/verifier/loops1.c
··· 1 + { 2 + "bounded loop, count to 4", 3 + .insns = { 4 + BPF_MOV64_IMM(BPF_REG_0, 0), 5 + BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 1), 6 + BPF_JMP_IMM(BPF_JLT, BPF_REG_0, 4, -2), 7 + BPF_EXIT_INSN(), 8 + }, 9 + .result = ACCEPT, 10 + .prog_type = BPF_PROG_TYPE_TRACEPOINT, 11 + .retval = 4, 12 + }, 13 + { 14 + "bounded loop, count to 20", 15 + .insns = { 16 + BPF_MOV64_IMM(BPF_REG_0, 0), 17 + BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 3), 18 + BPF_JMP_IMM(BPF_JLT, BPF_REG_0, 20, -2), 19 + BPF_EXIT_INSN(), 20 + }, 21 + .result = ACCEPT, 22 + .prog_type = BPF_PROG_TYPE_TRACEPOINT, 23 + }, 24 + { 25 + "bounded loop, count from positive unknown to 4", 26 + .insns = { 27 + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_prandom_u32), 28 + BPF_JMP_IMM(BPF_JSLT, BPF_REG_0, 0, 2), 29 + BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 1), 30 + BPF_JMP_IMM(BPF_JLT, BPF_REG_0, 4, -2), 31 + BPF_EXIT_INSN(), 32 + }, 33 + .result = ACCEPT, 34 + .prog_type = BPF_PROG_TYPE_TRACEPOINT, 35 + .retval = 4, 36 + }, 37 + { 38 + "bounded loop, count from totally unknown to 4", 39 + .insns = { 40 + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_prandom_u32), 41 + BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 1), 42 + BPF_JMP_IMM(BPF_JLT, BPF_REG_0, 4, -2), 43 + BPF_EXIT_INSN(), 44 + }, 45 + .result = ACCEPT, 46 + .prog_type = BPF_PROG_TYPE_TRACEPOINT, 47 + }, 48 + { 49 + "bounded loop, count to 4 with equality", 50 + .insns = { 51 + BPF_MOV64_IMM(BPF_REG_0, 0), 52 + BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 1), 53 + BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 4, -2), 54 + BPF_EXIT_INSN(), 55 + }, 56 + .result = ACCEPT, 57 + .prog_type = BPF_PROG_TYPE_TRACEPOINT, 58 + }, 59 + { 60 + "bounded loop, start in the middle", 61 + .insns = { 62 + BPF_MOV64_IMM(BPF_REG_0, 0), 63 + BPF_JMP_A(1), 64 + BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 1), 65 + BPF_JMP_IMM(BPF_JLT, BPF_REG_0, 4, -2), 66 + BPF_EXIT_INSN(), 67 + }, 68 + .result = REJECT, 69 + .errstr = "back-edge", 70 + .prog_type = BPF_PROG_TYPE_TRACEPOINT, 71 + .retval = 4, 72 + }, 73 + { 74 + 
"bounded loop containing a forward jump", 75 + .insns = { 76 + BPF_MOV64_IMM(BPF_REG_0, 0), 77 + BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 1), 78 + BPF_JMP_REG(BPF_JEQ, BPF_REG_0, BPF_REG_0, 0), 79 + BPF_JMP_IMM(BPF_JLT, BPF_REG_0, 4, -3), 80 + BPF_EXIT_INSN(), 81 + }, 82 + .result = ACCEPT, 83 + .prog_type = BPF_PROG_TYPE_TRACEPOINT, 84 + .retval = 4, 85 + }, 86 + { 87 + "bounded loop that jumps out rather than in", 88 + .insns = { 89 + BPF_MOV64_IMM(BPF_REG_6, 0), 90 + BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, 1), 91 + BPF_JMP_IMM(BPF_JGT, BPF_REG_6, 10000, 2), 92 + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_prandom_u32), 93 + BPF_JMP_A(-4), 94 + BPF_EXIT_INSN(), 95 + }, 96 + .result = ACCEPT, 97 + .prog_type = BPF_PROG_TYPE_TRACEPOINT, 98 + }, 99 + { 100 + "infinite loop after a conditional jump", 101 + .insns = { 102 + BPF_MOV64_IMM(BPF_REG_0, 5), 103 + BPF_JMP_IMM(BPF_JLT, BPF_REG_0, 4, 2), 104 + BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 1), 105 + BPF_JMP_A(-2), 106 + BPF_EXIT_INSN(), 107 + }, 108 + .result = REJECT, 109 + .errstr = "program is too large", 110 + .prog_type = BPF_PROG_TYPE_TRACEPOINT, 111 + }, 112 + { 113 + "bounded recursion", 114 + .insns = { 115 + BPF_MOV64_IMM(BPF_REG_1, 0), 116 + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 1), 117 + BPF_EXIT_INSN(), 118 + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 1), 119 + BPF_MOV64_REG(BPF_REG_0, BPF_REG_1), 120 + BPF_JMP_IMM(BPF_JLT, BPF_REG_1, 4, 1), 121 + BPF_EXIT_INSN(), 122 + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, -5), 123 + BPF_EXIT_INSN(), 124 + }, 125 + .result = REJECT, 126 + .errstr = "back-edge", 127 + .prog_type = BPF_PROG_TYPE_TRACEPOINT, 128 + }, 129 + { 130 + "infinite loop in two jumps", 131 + .insns = { 132 + BPF_MOV64_IMM(BPF_REG_0, 0), 133 + BPF_JMP_A(0), 134 + BPF_JMP_IMM(BPF_JLT, BPF_REG_0, 4, -2), 135 + BPF_EXIT_INSN(), 136 + }, 137 + .result = REJECT, 138 + .errstr = "loop detected", 139 + .prog_type = BPF_PROG_TYPE_TRACEPOINT, 140 + }, 141 + { 142 + "infinite loop: three-jump trick", 143 + .insns = { 
144 + BPF_MOV64_IMM(BPF_REG_0, 0), 145 + BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 1), 146 + BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 1), 147 + BPF_JMP_IMM(BPF_JLT, BPF_REG_0, 2, 1), 148 + BPF_EXIT_INSN(), 149 + BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 1), 150 + BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 1), 151 + BPF_JMP_IMM(BPF_JLT, BPF_REG_0, 2, 1), 152 + BPF_EXIT_INSN(), 153 + BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 1), 154 + BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 1), 155 + BPF_JMP_IMM(BPF_JLT, BPF_REG_0, 2, -11), 156 + BPF_EXIT_INSN(), 157 + }, 158 + .result = REJECT, 159 + .errstr = "loop detected", 160 + .prog_type = BPF_PROG_TYPE_TRACEPOINT, 161 + },
-15
tools/testing/selftests/bpf/verifier/prevent_map_lookup.c
··· 29 29 .prog_type = BPF_PROG_TYPE_SOCK_OPS, 30 30 }, 31 31 { 32 - "prevent map lookup in xskmap", 33 - .insns = { 34 - BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), 35 - BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), 36 - BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), 37 - BPF_LD_MAP_FD(BPF_REG_1, 0), 38 - BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), 39 - BPF_EXIT_INSN(), 40 - }, 41 - .fixup_map_xskmap = { 3 }, 42 - .result = REJECT, 43 - .errstr = "cannot pass map_type 17 into func bpf_map_lookup_elem", 44 - .prog_type = BPF_PROG_TYPE_XDP, 45 - }, 46 - { 47 32 "prevent map lookup in stack trace", 48 33 .insns = { 49 34 BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+18
tools/testing/selftests/bpf/verifier/sock.c
··· 498 498 .result = REJECT, 499 499 .errstr = "cannot pass map_type 24 into func bpf_map_lookup_elem", 500 500 }, 501 + { 502 + "bpf_map_lookup_elem(xskmap, &key); xs->queue_id", 503 + .insns = { 504 + BPF_ST_MEM(BPF_W, BPF_REG_10, -8, 0), 505 + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), 506 + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), 507 + BPF_LD_MAP_FD(BPF_REG_1, 0), 508 + BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), 509 + BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1), 510 + BPF_EXIT_INSN(), 511 + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_0, offsetof(struct bpf_xdp_sock, queue_id)), 512 + BPF_MOV64_IMM(BPF_REG_0, 0), 513 + BPF_EXIT_INSN(), 514 + }, 515 + .fixup_map_xskmap = { 3 }, 516 + .prog_type = BPF_PROG_TYPE_XDP, 517 + .result = ACCEPT, 518 + },