Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next

Alexei Starovoitov says:

====================
pull-request: bpf-next 2021-04-01

The following pull-request contains BPF updates for your *net-next* tree.

We've added 68 non-merge commits during the last 7 day(s) which contain
a total of 70 files changed, 2944 insertions(+), 1139 deletions(-).

The main changes are:

1) UDP support for sockmap, from Cong.

2) Verifier merge conflict resolution fix, from Daniel.

3) xsk selftests enhancements, from Maciej.

4) Unstable helpers aka kernel func calling, from Martin.

5) Batched ops for LPM map, from Pedro.

6) Fix race in bpf_get_local_storage, from Yonghong.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>

+2975 -1170
+15
Documentation/bpf/bpf_design_QA.rst
··· 258 258 helpers, etc be added out of kernel module code? 259 259 260 260 A: NO. 261 + 262 + Q: Directly calling kernel function is an ABI? 263 + ---------------------------------------------- 264 + Q: Some kernel functions (e.g. tcp_slow_start) can be called 265 + by BPF programs. Do these kernel functions become an ABI? 266 + 267 + A: NO. 268 + 269 + The kernel function protos will change and the bpf programs will be 270 + rejected by the verifier. Also, for example, some of the bpf-callable 271 + kernel functions have already been used by other kernel tcp 272 + cc (congestion-control) implementations. If any of these kernel 273 + functions has changed, both the in-tree and out-of-tree kernel tcp cc 274 + implementations have to be changed. The same goes for the bpf 275 + programs and they have to be adjusted accordingly.
+5
arch/x86/net/bpf_jit_comp.c
··· 2346 2346 tmp : orig_prog); 2347 2347 return prog; 2348 2348 } 2349 + 2350 + bool bpf_jit_supports_kfunc_call(void) 2351 + { 2352 + return true; 2353 + }
+198
arch/x86/net/bpf_jit_comp32.c
··· 1390 1390 *pprog = prog; 1391 1391 } 1392 1392 1393 + static void emit_push_r32(const u8 src[], u8 **pprog) 1394 + { 1395 + u8 *prog = *pprog; 1396 + int cnt = 0; 1397 + 1398 + /* mov ecx,dword ptr [ebp+off] */ 1399 + EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_ECX), STACK_VAR(src_lo)); 1400 + /* push ecx */ 1401 + EMIT1(0x51); 1402 + 1403 + *pprog = prog; 1404 + } 1405 + 1393 1406 static u8 get_cond_jmp_opcode(const u8 op, bool is_cmp_lo) 1394 1407 { 1395 1408 u8 jmp_cond; ··· 1470 1457 } 1471 1458 1472 1459 return jmp_cond; 1460 + } 1461 + 1462 + /* i386 kernel compiles with "-mregparm=3". From gcc document: 1463 + * 1464 + * ==== snippet ==== 1465 + * regparm (number) 1466 + * On x86-32 targets, the regparm attribute causes the compiler 1467 + * to pass arguments number one to (number) if they are of integral 1468 + * type in registers EAX, EDX, and ECX instead of on the stack. 1469 + * Functions that take a variable number of arguments continue 1470 + * to be passed all of their arguments on the stack. 1471 + * ==== snippet ==== 1472 + * 1473 + * The first three args of a function will be considered for 1474 + * putting into the 32bit register EAX, EDX, and ECX. 1475 + * 1476 + * Two 32bit registers are used to pass a 64bit arg. 1477 + * 1478 + * For example, 1479 + * void foo(u32 a, u32 b, u32 c, u32 d): 1480 + * u32 a: EAX 1481 + * u32 b: EDX 1482 + * u32 c: ECX 1483 + * u32 d: stack 1484 + * 1485 + * void foo(u64 a, u32 b, u32 c): 1486 + * u64 a: EAX (lo32) EDX (hi32) 1487 + * u32 b: ECX 1488 + * u32 c: stack 1489 + * 1490 + * void foo(u32 a, u64 b, u32 c): 1491 + * u32 a: EAX 1492 + * u64 b: EDX (lo32) ECX (hi32) 1493 + * u32 c: stack 1494 + * 1495 + * void foo(u32 a, u32 b, u64 c): 1496 + * u32 a: EAX 1497 + * u32 b: EDX 1498 + * u64 c: stack 1499 + * 1500 + * The return value will be stored in the EAX (and EDX for 64bit value). 
1501 + * 1502 + * For example, 1503 + * u32 foo(u32 a, u32 b, u32 c): 1504 + * return value: EAX 1505 + * 1506 + * u64 foo(u32 a, u32 b, u32 c): 1507 + * return value: EAX (lo32) EDX (hi32) 1508 + * 1509 + * Notes: 1510 + * The verifier only accepts function having integer and pointers 1511 + * as its args and return value, so it does not have 1512 + * struct-by-value. 1513 + * 1514 + * emit_kfunc_call() finds out the btf_func_model by calling 1515 + * bpf_jit_find_kfunc_model(). A btf_func_model 1516 + * has the details about the number of args, size of each arg, 1517 + * and the size of the return value. 1518 + * 1519 + * It first decides how many args can be passed by EAX, EDX, and ECX. 1520 + * That will decide what args should be pushed to the stack: 1521 + * [first_stack_regno, last_stack_regno] are the bpf regnos 1522 + * that should be pushed to the stack. 1523 + * 1524 + * It will first push all args to the stack because the push 1525 + * will need to use ECX. Then, it moves 1526 + * [BPF_REG_1, first_stack_regno) to EAX, EDX, and ECX. 1527 + * 1528 + * When emitting a call (0xE8), it needs to figure out 1529 + * the jmp_offset relative to the jit-insn address immediately 1530 + * following the call (0xE8) instruction. At this point, it knows 1531 + * the end of the jit-insn address after completely translated the 1532 + * current (BPF_JMP | BPF_CALL) bpf-insn. It is passed as "end_addr" 1533 + * to the emit_kfunc_call(). Thus, it can learn the "immediate-follow-call" 1534 + * address by figuring out how many jit-insn is generated between 1535 + * the call (0xE8) and the end_addr: 1536 + * - 0-1 jit-insn (3 bytes each) to restore the esp pointer if there 1537 + * is arg pushed to the stack. 1538 + * - 0-2 jit-insns (3 bytes each) to handle the return value. 
1539 + */ 1540 + static int emit_kfunc_call(const struct bpf_prog *bpf_prog, u8 *end_addr, 1541 + const struct bpf_insn *insn, u8 **pprog) 1542 + { 1543 + const u8 arg_regs[] = { IA32_EAX, IA32_EDX, IA32_ECX }; 1544 + int i, cnt = 0, first_stack_regno, last_stack_regno; 1545 + int free_arg_regs = ARRAY_SIZE(arg_regs); 1546 + const struct btf_func_model *fm; 1547 + int bytes_in_stack = 0; 1548 + const u8 *cur_arg_reg; 1549 + u8 *prog = *pprog; 1550 + s64 jmp_offset; 1551 + 1552 + fm = bpf_jit_find_kfunc_model(bpf_prog, insn); 1553 + if (!fm) 1554 + return -EINVAL; 1555 + 1556 + first_stack_regno = BPF_REG_1; 1557 + for (i = 0; i < fm->nr_args; i++) { 1558 + int regs_needed = fm->arg_size[i] > sizeof(u32) ? 2 : 1; 1559 + 1560 + if (regs_needed > free_arg_regs) 1561 + break; 1562 + 1563 + free_arg_regs -= regs_needed; 1564 + first_stack_regno++; 1565 + } 1566 + 1567 + /* Push the args to the stack */ 1568 + last_stack_regno = BPF_REG_0 + fm->nr_args; 1569 + for (i = last_stack_regno; i >= first_stack_regno; i--) { 1570 + if (fm->arg_size[i - 1] > sizeof(u32)) { 1571 + emit_push_r64(bpf2ia32[i], &prog); 1572 + bytes_in_stack += 8; 1573 + } else { 1574 + emit_push_r32(bpf2ia32[i], &prog); 1575 + bytes_in_stack += 4; 1576 + } 1577 + } 1578 + 1579 + cur_arg_reg = &arg_regs[0]; 1580 + for (i = BPF_REG_1; i < first_stack_regno; i++) { 1581 + /* mov e[adc]x,dword ptr [ebp+off] */ 1582 + EMIT3(0x8B, add_2reg(0x40, IA32_EBP, *cur_arg_reg++), 1583 + STACK_VAR(bpf2ia32[i][0])); 1584 + if (fm->arg_size[i - 1] > sizeof(u32)) 1585 + /* mov e[adc]x,dword ptr [ebp+off] */ 1586 + EMIT3(0x8B, add_2reg(0x40, IA32_EBP, *cur_arg_reg++), 1587 + STACK_VAR(bpf2ia32[i][1])); 1588 + } 1589 + 1590 + if (bytes_in_stack) 1591 + /* add esp,"bytes_in_stack" */ 1592 + end_addr -= 3; 1593 + 1594 + /* mov dword ptr [ebp+off],edx */ 1595 + if (fm->ret_size > sizeof(u32)) 1596 + end_addr -= 3; 1597 + 1598 + /* mov dword ptr [ebp+off],eax */ 1599 + if (fm->ret_size) 1600 + end_addr -= 3; 1601 + 1602 + 
jmp_offset = (u8 *)__bpf_call_base + insn->imm - end_addr; 1603 + if (!is_simm32(jmp_offset)) { 1604 + pr_err("unsupported BPF kernel function jmp_offset:%lld\n", 1605 + jmp_offset); 1606 + return -EINVAL; 1607 + } 1608 + 1609 + EMIT1_off32(0xE8, jmp_offset); 1610 + 1611 + if (fm->ret_size) 1612 + /* mov dword ptr [ebp+off],eax */ 1613 + EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EAX), 1614 + STACK_VAR(bpf2ia32[BPF_REG_0][0])); 1615 + 1616 + if (fm->ret_size > sizeof(u32)) 1617 + /* mov dword ptr [ebp+off],edx */ 1618 + EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EDX), 1619 + STACK_VAR(bpf2ia32[BPF_REG_0][1])); 1620 + 1621 + if (bytes_in_stack) 1622 + /* add esp,"bytes_in_stack" */ 1623 + EMIT3(0x83, add_1reg(0xC0, IA32_ESP), bytes_in_stack); 1624 + 1625 + *pprog = prog; 1626 + 1627 + return 0; 1473 1628 } 1474 1629 1475 1630 static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, ··· 2068 1887 2069 1888 if (insn->src_reg == BPF_PSEUDO_CALL) 2070 1889 goto notyet; 1890 + 1891 + if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) { 1892 + int err; 1893 + 1894 + err = emit_kfunc_call(bpf_prog, 1895 + image + addrs[i], 1896 + insn, &prog); 1897 + 1898 + if (err) 1899 + return err; 1900 + break; 1901 + } 2071 1902 2072 1903 func = (u8 *) __bpf_call_base + imm32; 2073 1904 jmp_offset = func - (image + addrs[i]); ··· 2585 2392 bpf_jit_prog_release_other(prog, prog == orig_prog ? 2586 2393 tmp : orig_prog); 2587 2394 return prog; 2395 + } 2396 + 2397 + bool bpf_jit_supports_kfunc_call(void) 2398 + { 2399 + return true; 2588 2400 }
+12
drivers/net/veth.c
··· 218 218 } 219 219 } 220 220 221 + static void veth_get_channels(struct net_device *dev, 222 + struct ethtool_channels *channels) 223 + { 224 + channels->tx_count = dev->real_num_tx_queues; 225 + channels->rx_count = dev->real_num_rx_queues; 226 + channels->max_tx = dev->real_num_tx_queues; 227 + channels->max_rx = dev->real_num_rx_queues; 228 + channels->combined_count = min(dev->real_num_rx_queues, dev->real_num_tx_queues); 229 + channels->max_combined = min(dev->real_num_rx_queues, dev->real_num_tx_queues); 230 + } 231 + 221 232 static const struct ethtool_ops veth_ethtool_ops = { 222 233 .get_drvinfo = veth_get_drvinfo, 223 234 .get_link = ethtool_op_get_link, ··· 237 226 .get_ethtool_stats = veth_get_ethtool_stats, 238 227 .get_link_ksettings = veth_get_link_ksettings, 239 228 .get_ts_info = ethtool_op_get_ts_info, 229 + .get_channels = veth_get_channels, 240 230 }; 241 231 242 232 /* general routines */
+49 -8
include/linux/bpf-cgroup.h
··· 20 20 struct bpf_cgroup_storage; 21 21 struct ctl_table; 22 22 struct ctl_table_header; 23 + struct task_struct; 23 24 24 25 #ifdef CONFIG_CGROUP_BPF 25 26 26 27 extern struct static_key_false cgroup_bpf_enabled_key[MAX_BPF_ATTACH_TYPE]; 27 28 #define cgroup_bpf_enabled(type) static_branch_unlikely(&cgroup_bpf_enabled_key[type]) 28 29 29 - DECLARE_PER_CPU(struct bpf_cgroup_storage*, 30 - bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]); 30 + #define BPF_CGROUP_STORAGE_NEST_MAX 8 31 + 32 + struct bpf_cgroup_storage_info { 33 + struct task_struct *task; 34 + struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE]; 35 + }; 36 + 37 + /* For each cpu, permit maximum BPF_CGROUP_STORAGE_NEST_MAX number of tasks 38 + * to use bpf cgroup storage simultaneously. 39 + */ 40 + DECLARE_PER_CPU(struct bpf_cgroup_storage_info, 41 + bpf_cgroup_storage_info[BPF_CGROUP_STORAGE_NEST_MAX]); 31 42 32 43 #define for_each_cgroup_storage_type(stype) \ 33 44 for (stype = 0; stype < MAX_BPF_CGROUP_STORAGE_TYPE; stype++) ··· 172 161 return BPF_CGROUP_STORAGE_SHARED; 173 162 } 174 163 175 - static inline void bpf_cgroup_storage_set(struct bpf_cgroup_storage 176 - *storage[MAX_BPF_CGROUP_STORAGE_TYPE]) 164 + static inline int bpf_cgroup_storage_set(struct bpf_cgroup_storage 165 + *storage[MAX_BPF_CGROUP_STORAGE_TYPE]) 177 166 { 178 167 enum bpf_cgroup_storage_type stype; 168 + int i, err = 0; 179 169 180 - for_each_cgroup_storage_type(stype) 181 - this_cpu_write(bpf_cgroup_storage[stype], storage[stype]); 170 + preempt_disable(); 171 + for (i = 0; i < BPF_CGROUP_STORAGE_NEST_MAX; i++) { 172 + if (unlikely(this_cpu_read(bpf_cgroup_storage_info[i].task) != NULL)) 173 + continue; 174 + 175 + this_cpu_write(bpf_cgroup_storage_info[i].task, current); 176 + for_each_cgroup_storage_type(stype) 177 + this_cpu_write(bpf_cgroup_storage_info[i].storage[stype], 178 + storage[stype]); 179 + goto out; 180 + } 181 + err = -EBUSY; 182 + WARN_ON_ONCE(1); 183 + 184 + out: 185 + preempt_enable(); 186 
+ return err; 187 + } 188 + 189 + static inline void bpf_cgroup_storage_unset(void) 190 + { 191 + int i; 192 + 193 + for (i = 0; i < BPF_CGROUP_STORAGE_NEST_MAX; i++) { 194 + if (unlikely(this_cpu_read(bpf_cgroup_storage_info[i].task) != current)) 195 + continue; 196 + 197 + this_cpu_write(bpf_cgroup_storage_info[i].task, NULL); 198 + return; 199 + } 182 200 } 183 201 184 202 struct bpf_cgroup_storage * ··· 488 448 return -EINVAL; 489 449 } 490 450 491 - static inline void bpf_cgroup_storage_set( 492 - struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE]) {} 451 + static inline int bpf_cgroup_storage_set( 452 + struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE]) { return 0; } 453 + static inline void bpf_cgroup_storage_unset(void) {} 493 454 static inline int bpf_cgroup_storage_assign(struct bpf_prog_aux *aux, 494 455 struct bpf_map *map) { return 0; } 495 456 static inline struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(
+51 -7
include/linux/bpf.h
··· 56 56 u32 seq_priv_size; 57 57 }; 58 58 59 - /* map is generic key/value storage optionally accesible by eBPF programs */ 59 + /* map is generic key/value storage optionally accessible by eBPF programs */ 60 60 struct bpf_map_ops { 61 61 /* funcs callable from userspace (via syscall) */ 62 62 int (*map_alloc_check)(union bpf_attr *attr); ··· 427 427 PTR_TO_PERCPU_BTF_ID, /* reg points to a percpu kernel variable */ 428 428 PTR_TO_FUNC, /* reg points to a bpf program function */ 429 429 PTR_TO_MAP_KEY, /* reg points to a map element key */ 430 + __BPF_REG_TYPE_MAX, 430 431 }; 431 432 432 433 /* The information passed from prog-specific *_is_valid_access ··· 481 480 const struct btf_type *t, int off, int size, 482 481 enum bpf_access_type atype, 483 482 u32 *next_btf_id); 483 + bool (*check_kfunc_call)(u32 kfunc_btf_id); 484 484 }; 485 485 486 486 struct bpf_prog_offload_ops { ··· 798 796 struct module *module; 799 797 }; 800 798 799 + struct bpf_kfunc_desc_tab; 800 + 801 801 struct bpf_prog_aux { 802 802 atomic64_t refcnt; 803 803 u32 used_map_cnt; ··· 836 832 struct bpf_prog **func; 837 833 void *jit_data; /* JIT specific data. arch dependent */ 838 834 struct bpf_jit_poke_descriptor *poke_tab; 835 + struct bpf_kfunc_desc_tab *kfunc_tab; 839 836 u32 size_poke_tab; 840 837 struct bpf_ksym ksym; 841 838 const struct bpf_prog_ops *ops; ··· 1111 1106 /* BPF program asks to set CN on the packet. */ 1112 1107 #define BPF_RET_SET_CN (1 << 0) 1113 1108 1109 + /* For BPF_PROG_RUN_ARRAY_FLAGS and __BPF_PROG_RUN_ARRAY, 1110 + * if bpf_cgroup_storage_set() failed, the rest of programs 1111 + * will not execute. This should be a really rare scenario 1112 + * as it requires BPF_CGROUP_STORAGE_NEST_MAX number of 1113 + * preemptions all between bpf_cgroup_storage_set() and 1114 + * bpf_cgroup_storage_unset() on the same cpu. 
1115 + */ 1114 1116 #define BPF_PROG_RUN_ARRAY_FLAGS(array, ctx, func, ret_flags) \ 1115 1117 ({ \ 1116 1118 struct bpf_prog_array_item *_item; \ ··· 1130 1118 _array = rcu_dereference(array); \ 1131 1119 _item = &_array->items[0]; \ 1132 1120 while ((_prog = READ_ONCE(_item->prog))) { \ 1133 - bpf_cgroup_storage_set(_item->cgroup_storage); \ 1121 + if (unlikely(bpf_cgroup_storage_set(_item->cgroup_storage))) \ 1122 + break; \ 1134 1123 func_ret = func(_prog, ctx); \ 1135 1124 _ret &= (func_ret & 1); \ 1136 1125 *(ret_flags) |= (func_ret >> 1); \ 1126 + bpf_cgroup_storage_unset(); \ 1137 1127 _item++; \ 1138 1128 } \ 1139 1129 rcu_read_unlock(); \ ··· 1156 1142 goto _out; \ 1157 1143 _item = &_array->items[0]; \ 1158 1144 while ((_prog = READ_ONCE(_item->prog))) { \ 1159 - if (set_cg_storage) \ 1160 - bpf_cgroup_storage_set(_item->cgroup_storage); \ 1161 - _ret &= func(_prog, ctx); \ 1145 + if (!set_cg_storage) { \ 1146 + _ret &= func(_prog, ctx); \ 1147 + } else { \ 1148 + if (unlikely(bpf_cgroup_storage_set(_item->cgroup_storage))) \ 1149 + break; \ 1150 + _ret &= func(_prog, ctx); \ 1151 + bpf_cgroup_storage_unset(); \ 1152 + } \ 1162 1153 _item++; \ 1163 1154 } \ 1164 1155 _out: \ ··· 1532 1513 int bpf_prog_test_run_sk_lookup(struct bpf_prog *prog, 1533 1514 const union bpf_attr *kattr, 1534 1515 union bpf_attr __user *uattr); 1516 + bool bpf_prog_test_check_kfunc_call(u32 kfunc_id); 1535 1517 bool btf_ctx_access(int off, int size, enum bpf_access_type type, 1536 1518 const struct bpf_prog *prog, 1537 1519 struct bpf_insn_access_aux *info); ··· 1551 1531 struct btf_func_model *m); 1552 1532 1553 1533 struct bpf_reg_state; 1554 - int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog, 1555 - struct bpf_reg_state *regs); 1534 + int btf_check_subprog_arg_match(struct bpf_verifier_env *env, int subprog, 1535 + struct bpf_reg_state *regs); 1536 + int btf_check_kfunc_arg_match(struct bpf_verifier_env *env, 1537 + const struct btf *btf, u32 func_id, 
1538 + struct bpf_reg_state *regs); 1556 1539 int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog, 1557 1540 struct bpf_reg_state *reg); 1558 1541 int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *prog, ··· 1566 1543 1567 1544 const struct bpf_func_proto *bpf_base_func_proto(enum bpf_func_id func_id); 1568 1545 void bpf_task_storage_free(struct task_struct *task); 1546 + bool bpf_prog_has_kfunc_call(const struct bpf_prog *prog); 1547 + const struct btf_func_model * 1548 + bpf_jit_find_kfunc_model(const struct bpf_prog *prog, 1549 + const struct bpf_insn *insn); 1569 1550 #else /* !CONFIG_BPF_SYSCALL */ 1570 1551 static inline struct bpf_prog *bpf_prog_get(u32 ufd) 1571 1552 { ··· 1732 1705 return -ENOTSUPP; 1733 1706 } 1734 1707 1708 + static inline bool bpf_prog_test_check_kfunc_call(u32 kfunc_id) 1709 + { 1710 + return false; 1711 + } 1712 + 1735 1713 static inline void bpf_map_put(struct bpf_map *map) 1736 1714 { 1737 1715 } ··· 1754 1722 1755 1723 static inline void bpf_task_storage_free(struct task_struct *task) 1756 1724 { 1725 + } 1726 + 1727 + static inline bool bpf_prog_has_kfunc_call(const struct bpf_prog *prog) 1728 + { 1729 + return false; 1730 + } 1731 + 1732 + static inline const struct btf_func_model * 1733 + bpf_jit_find_kfunc_model(const struct bpf_prog *prog, 1734 + const struct bpf_insn *insn) 1735 + { 1736 + return NULL; 1757 1737 } 1758 1738 #endif /* CONFIG_BPF_SYSCALL */ 1759 1739
+6
include/linux/btf.h
··· 110 110 const struct btf_type * 111 111 btf_resolve_size(const struct btf *btf, const struct btf_type *type, 112 112 u32 *type_size); 113 + const char *btf_type_str(const struct btf_type *t); 113 114 114 115 #define for_each_member(i, struct_type, member) \ 115 116 for (i = 0, member = btf_type_member(struct_type); \ ··· 140 139 static inline bool btf_type_is_enum(const struct btf_type *t) 141 140 { 142 141 return BTF_INFO_KIND(t->info) == BTF_KIND_ENUM; 142 + } 143 + 144 + static inline bool btf_type_is_scalar(const struct btf_type *t) 145 + { 146 + return btf_type_is_int(t) || btf_type_is_enum(t); 143 147 } 144 148 145 149 static inline bool btf_type_is_typedef(const struct btf_type *t)
+2 -11
include/linux/filter.h
··· 877 877 void bpf_prog_fill_jited_linfo(struct bpf_prog *prog, 878 878 const u32 *insn_to_jit_off); 879 879 int bpf_prog_alloc_jited_linfo(struct bpf_prog *prog); 880 - void bpf_prog_free_jited_linfo(struct bpf_prog *prog); 881 - void bpf_prog_free_unused_jited_linfo(struct bpf_prog *prog); 880 + void bpf_prog_jit_attempt_done(struct bpf_prog *prog); 882 881 883 882 struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags); 884 883 struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flags); ··· 918 919 struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog); 919 920 void bpf_jit_compile(struct bpf_prog *prog); 920 921 bool bpf_jit_needs_zext(void); 922 + bool bpf_jit_supports_kfunc_call(void); 921 923 bool bpf_helper_changes_pkt_data(void *func); 922 924 923 925 static inline bool bpf_dump_raw_ok(const struct cred *cred) ··· 1245 1245 1246 1246 void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, 1247 1247 int k, unsigned int size); 1248 - 1249 - static inline void *bpf_load_pointer(const struct sk_buff *skb, int k, 1250 - unsigned int size, void *buffer) 1251 - { 1252 - if (k >= 0) 1253 - return skb_header_pointer(skb, k, size, buffer); 1254 - 1255 - return bpf_internal_load_pointer_neg_helper(skb, k, size); 1256 - } 1257 1248 1258 1249 static inline int bpf_tell_extensions(void) 1259 1250 {
+1
include/linux/skbuff.h
··· 3626 3626 unsigned int flags); 3627 3627 int skb_send_sock_locked(struct sock *sk, struct sk_buff *skb, int offset, 3628 3628 int len); 3629 + int skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, int len); 3629 3630 void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to); 3630 3631 unsigned int skb_zerocopy_headlen(const struct sk_buff *from); 3631 3632 int skb_zerocopy(struct sk_buff *to, struct sk_buff *from,
+58 -19
include/linux/skmsg.h
··· 58 58 struct bpf_prog *msg_parser; 59 59 struct bpf_prog *stream_parser; 60 60 struct bpf_prog *stream_verdict; 61 + struct bpf_prog *skb_verdict; 61 62 }; 62 63 63 64 enum sk_psock_state_bits { ··· 90 89 #endif 91 90 struct sk_buff_head ingress_skb; 92 91 struct list_head ingress_msg; 92 + spinlock_t ingress_lock; 93 93 unsigned long state; 94 94 struct list_head link; 95 95 spinlock_t link_lock; ··· 99 97 void (*saved_close)(struct sock *sk, long timeout); 100 98 void (*saved_write_space)(struct sock *sk); 101 99 void (*saved_data_ready)(struct sock *sk); 100 + int (*psock_update_sk_prot)(struct sock *sk, bool restore); 102 101 struct proto *sk_proto; 102 + struct mutex work_mutex; 103 103 struct sk_psock_work_state work_state; 104 104 struct work_struct work; 105 - union { 106 - struct rcu_head rcu; 107 - struct work_struct gc; 108 - }; 105 + struct rcu_work rwork; 109 106 }; 110 107 111 108 int sk_msg_alloc(struct sock *sk, struct sk_msg *msg, int len, ··· 125 124 struct sk_msg *msg, u32 bytes); 126 125 int sk_msg_memcopy_from_iter(struct sock *sk, struct iov_iter *from, 127 126 struct sk_msg *msg, u32 bytes); 127 + int sk_msg_wait_data(struct sock *sk, struct sk_psock *psock, int flags, 128 + long timeo, int *err); 129 + int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, 130 + int len, int flags); 128 131 129 132 static inline void sk_msg_check_to_free(struct sk_msg *msg, u32 i, u32 bytes) 130 133 { ··· 289 284 static inline void sk_psock_queue_msg(struct sk_psock *psock, 290 285 struct sk_msg *msg) 291 286 { 287 + spin_lock_bh(&psock->ingress_lock); 292 288 list_add_tail(&msg->list, &psock->ingress_msg); 289 + spin_unlock_bh(&psock->ingress_lock); 290 + } 291 + 292 + static inline struct sk_msg *sk_psock_dequeue_msg(struct sk_psock *psock) 293 + { 294 + struct sk_msg *msg; 295 + 296 + spin_lock_bh(&psock->ingress_lock); 297 + msg = list_first_entry_or_null(&psock->ingress_msg, struct sk_msg, list); 298 + if (msg) 299 + 
list_del(&msg->list); 300 + spin_unlock_bh(&psock->ingress_lock); 301 + return msg; 302 + } 303 + 304 + static inline struct sk_msg *sk_psock_peek_msg(struct sk_psock *psock) 305 + { 306 + struct sk_msg *msg; 307 + 308 + spin_lock_bh(&psock->ingress_lock); 309 + msg = list_first_entry_or_null(&psock->ingress_msg, struct sk_msg, list); 310 + spin_unlock_bh(&psock->ingress_lock); 311 + return msg; 312 + } 313 + 314 + static inline struct sk_msg *sk_psock_next_msg(struct sk_psock *psock, 315 + struct sk_msg *msg) 316 + { 317 + struct sk_msg *ret; 318 + 319 + spin_lock_bh(&psock->ingress_lock); 320 + if (list_is_last(&msg->list, &psock->ingress_msg)) 321 + ret = NULL; 322 + else 323 + ret = list_next_entry(msg, list); 324 + spin_unlock_bh(&psock->ingress_lock); 325 + return ret; 293 326 } 294 327 295 328 static inline bool sk_psock_queue_empty(const struct sk_psock *psock) 296 329 { 297 330 return psock ? list_empty(&psock->ingress_msg) : true; 331 + } 332 + 333 + static inline void kfree_sk_msg(struct sk_msg *msg) 334 + { 335 + if (msg->skb) 336 + consume_skb(msg->skb); 337 + kfree(msg); 298 338 } 299 339 300 340 static inline void sk_psock_report_error(struct sk_psock *psock, int err) ··· 351 301 } 352 302 353 303 struct sk_psock *sk_psock_init(struct sock *sk, int node); 304 + void sk_psock_stop(struct sk_psock *psock, bool wait); 354 305 355 306 #if IS_ENABLED(CONFIG_BPF_STREAM_PARSER) 356 307 int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock); ··· 400 349 } 401 350 } 402 351 403 - static inline void sk_psock_update_proto(struct sock *sk, 404 - struct sk_psock *psock, 405 - struct proto *ops) 406 - { 407 - /* Pairs with lockless read in sk_clone_lock() */ 408 - WRITE_ONCE(sk->sk_prot, ops); 409 - } 410 - 411 352 static inline void sk_psock_restore_proto(struct sock *sk, 412 353 struct sk_psock *psock) 413 354 { 414 355 sk->sk_prot->unhash = psock->saved_unhash; 415 - if (inet_csk_has_ulp(sk)) { 416 - tcp_update_ulp(sk, psock->sk_proto, 
psock->saved_write_space); 417 - } else { 418 - sk->sk_write_space = psock->saved_write_space; 419 - /* Pairs with lockless read in sk_clone_lock() */ 420 - WRITE_ONCE(sk->sk_prot, psock->sk_proto); 421 - } 356 + if (psock->psock_update_sk_prot) 357 + psock->psock_update_sk_prot(sk, true); 422 358 } 423 359 424 360 static inline void sk_psock_set_state(struct sk_psock *psock, ··· 480 442 psock_set_prog(&progs->msg_parser, NULL); 481 443 psock_set_prog(&progs->stream_parser, NULL); 482 444 psock_set_prog(&progs->stream_verdict, NULL); 445 + psock_set_prog(&progs->skb_verdict, NULL); 483 446 } 484 447 485 448 int sk_psock_tls_strp_read(struct sk_psock *psock, struct sk_buff *skb);
-1
include/net/bpf_sk_storage.h
··· 27 27 struct bpf_sk_storage_diag; 28 28 struct sk_buff; 29 29 struct nlattr; 30 - struct sock; 31 30 32 31 #ifdef CONFIG_BPF_SYSCALL 33 32 int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk);
+3
include/net/sock.h
··· 1184 1184 void (*unhash)(struct sock *sk); 1185 1185 void (*rehash)(struct sock *sk); 1186 1186 int (*get_port)(struct sock *sk, unsigned short snum); 1187 + #ifdef CONFIG_BPF_SYSCALL 1188 + int (*psock_update_sk_prot)(struct sock *sk, bool restore); 1189 + #endif 1187 1190 1188 1191 /* Keeping track of sockets in use */ 1189 1192 #ifdef CONFIG_PROC_FS
+1 -2
include/net/tcp.h
··· 2203 2203 2204 2204 #ifdef CONFIG_BPF_SYSCALL 2205 2205 struct proto *tcp_bpf_get_proto(struct sock *sk, struct sk_psock *psock); 2206 + int tcp_bpf_update_proto(struct sock *sk, bool restore); 2206 2207 void tcp_bpf_clone(const struct sock *sk, struct sock *newsk); 2207 2208 #endif /* CONFIG_BPF_SYSCALL */ 2208 2209 2209 2210 int tcp_bpf_sendmsg_redir(struct sock *sk, struct sk_msg *msg, u32 bytes, 2210 2211 int flags); 2211 - int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock, 2212 - struct msghdr *msg, int len, int flags); 2213 2212 #endif /* CONFIG_NET_SOCK_MSG */ 2214 2213 2215 2214 #if !defined(CONFIG_BPF_SYSCALL) || !defined(CONFIG_NET_SOCK_MSG)
+3
include/net/udp.h
··· 329 329 struct sk_buff *skb); 330 330 struct sock *udp6_lib_lookup_skb(const struct sk_buff *skb, 331 331 __be16 sport, __be16 dport); 332 + int udp_read_sock(struct sock *sk, read_descriptor_t *desc, 333 + sk_read_actor_t recv_actor); 332 334 333 335 /* UDP uses skb->dev_scratch to cache as much information as possible and avoid 334 336 * possibly multiple cache miss on dequeue() ··· 543 541 #ifdef CONFIG_BPF_SYSCALL 544 542 struct sk_psock; 545 543 struct proto *udp_bpf_get_proto(struct sock *sk, struct sk_psock *psock); 544 + int udp_bpf_update_proto(struct sock *sk, bool restore); 546 545 #endif 547 546 548 547 #endif /* _UDP_H */
+5
include/uapi/linux/bpf.h
··· 957 957 BPF_XDP_CPUMAP, 958 958 BPF_SK_LOOKUP, 959 959 BPF_XDP, 960 + BPF_SK_SKB_VERDICT, 960 961 __MAX_BPF_ATTACH_TYPE 961 962 }; 962 963 ··· 1118 1117 * offset to another bpf function 1119 1118 */ 1120 1119 #define BPF_PSEUDO_CALL 1 1120 + /* when bpf_call->src_reg == BPF_PSEUDO_KFUNC_CALL, 1121 + * bpf_call->imm == btf_id of a BTF_KIND_FUNC in the running kernel 1122 + */ 1123 + #define BPF_PSEUDO_KFUNC_CALL 2 1121 1124 1122 1125 /* flags for BPF_MAP_UPDATE_ELEM command */ 1123 1126 enum {
+158 -91
kernel/bpf/btf.c
··· 283 283 [BTF_KIND_FLOAT] = "FLOAT", 284 284 }; 285 285 286 - static const char *btf_type_str(const struct btf_type *t) 286 + const char *btf_type_str(const struct btf_type *t) 287 287 { 288 288 return btf_kind_str[BTF_INFO_KIND(t->info)]; 289 289 } ··· 789 789 790 790 while (btf_type_is_modifier(t) && 791 791 BTF_INFO_KIND(t->info) != BTF_KIND_TYPEDEF) { 792 - id = t->type; 793 792 t = btf_type_by_id(btf, t->type); 794 793 } 795 794 ··· 4376 4377 #undef BPF_LINK_TYPE 4377 4378 4378 4379 static const struct btf_member * 4379 - btf_get_prog_ctx_type(struct bpf_verifier_log *log, struct btf *btf, 4380 + btf_get_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf, 4380 4381 const struct btf_type *t, enum bpf_prog_type prog_type, 4381 4382 int arg) 4382 4383 { ··· 5361 5362 return btf_check_func_type_match(log, btf1, t1, btf2, t2); 5362 5363 } 5363 5364 5365 + static u32 *reg2btf_ids[__BPF_REG_TYPE_MAX] = { 5366 + #ifdef CONFIG_NET 5367 + [PTR_TO_SOCKET] = &btf_sock_ids[BTF_SOCK_TYPE_SOCK], 5368 + [PTR_TO_SOCK_COMMON] = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON], 5369 + [PTR_TO_TCP_SOCK] = &btf_sock_ids[BTF_SOCK_TYPE_TCP], 5370 + #endif 5371 + }; 5372 + 5373 + static int btf_check_func_arg_match(struct bpf_verifier_env *env, 5374 + const struct btf *btf, u32 func_id, 5375 + struct bpf_reg_state *regs, 5376 + bool ptr_to_mem_ok) 5377 + { 5378 + struct bpf_verifier_log *log = &env->log; 5379 + const char *func_name, *ref_tname; 5380 + const struct btf_type *t, *ref_t; 5381 + const struct btf_param *args; 5382 + u32 i, nargs, ref_id; 5383 + 5384 + t = btf_type_by_id(btf, func_id); 5385 + if (!t || !btf_type_is_func(t)) { 5386 + /* These checks were already done by the verifier while loading 5387 + * struct bpf_func_info or in add_kfunc_call(). 
5388 + */ 5389 + bpf_log(log, "BTF of func_id %u doesn't point to KIND_FUNC\n", 5390 + func_id); 5391 + return -EFAULT; 5392 + } 5393 + func_name = btf_name_by_offset(btf, t->name_off); 5394 + 5395 + t = btf_type_by_id(btf, t->type); 5396 + if (!t || !btf_type_is_func_proto(t)) { 5397 + bpf_log(log, "Invalid BTF of func %s\n", func_name); 5398 + return -EFAULT; 5399 + } 5400 + args = (const struct btf_param *)(t + 1); 5401 + nargs = btf_type_vlen(t); 5402 + if (nargs > MAX_BPF_FUNC_REG_ARGS) { 5403 + bpf_log(log, "Function %s has %d > %d args\n", func_name, nargs, 5404 + MAX_BPF_FUNC_REG_ARGS); 5405 + return -EINVAL; 5406 + } 5407 + 5408 + /* check that BTF function arguments match actual types that the 5409 + * verifier sees. 5410 + */ 5411 + for (i = 0; i < nargs; i++) { 5412 + u32 regno = i + 1; 5413 + struct bpf_reg_state *reg = &regs[regno]; 5414 + 5415 + t = btf_type_skip_modifiers(btf, args[i].type, NULL); 5416 + if (btf_type_is_scalar(t)) { 5417 + if (reg->type == SCALAR_VALUE) 5418 + continue; 5419 + bpf_log(log, "R%d is not a scalar\n", regno); 5420 + return -EINVAL; 5421 + } 5422 + 5423 + if (!btf_type_is_ptr(t)) { 5424 + bpf_log(log, "Unrecognized arg#%d type %s\n", 5425 + i, btf_type_str(t)); 5426 + return -EINVAL; 5427 + } 5428 + 5429 + ref_t = btf_type_skip_modifiers(btf, t->type, &ref_id); 5430 + ref_tname = btf_name_by_offset(btf, ref_t->name_off); 5431 + if (btf_is_kernel(btf)) { 5432 + const struct btf_type *reg_ref_t; 5433 + const struct btf *reg_btf; 5434 + const char *reg_ref_tname; 5435 + u32 reg_ref_id; 5436 + 5437 + if (!btf_type_is_struct(ref_t)) { 5438 + bpf_log(log, "kernel function %s args#%d pointer type %s %s is not supported\n", 5439 + func_name, i, btf_type_str(ref_t), 5440 + ref_tname); 5441 + return -EINVAL; 5442 + } 5443 + 5444 + if (reg->type == PTR_TO_BTF_ID) { 5445 + reg_btf = reg->btf; 5446 + reg_ref_id = reg->btf_id; 5447 + } else if (reg2btf_ids[reg->type]) { 5448 + reg_btf = btf_vmlinux; 5449 + reg_ref_id = 
*reg2btf_ids[reg->type]; 5450 + } else { 5451 + bpf_log(log, "kernel function %s args#%d expected pointer to %s %s but R%d is not a pointer to btf_id\n", 5452 + func_name, i, 5453 + btf_type_str(ref_t), ref_tname, regno); 5454 + return -EINVAL; 5455 + } 5456 + 5457 + reg_ref_t = btf_type_skip_modifiers(reg_btf, reg_ref_id, 5458 + &reg_ref_id); 5459 + reg_ref_tname = btf_name_by_offset(reg_btf, 5460 + reg_ref_t->name_off); 5461 + if (!btf_struct_ids_match(log, reg_btf, reg_ref_id, 5462 + reg->off, btf, ref_id)) { 5463 + bpf_log(log, "kernel function %s args#%d expected pointer to %s %s but R%d has a pointer to %s %s\n", 5464 + func_name, i, 5465 + btf_type_str(ref_t), ref_tname, 5466 + regno, btf_type_str(reg_ref_t), 5467 + reg_ref_tname); 5468 + return -EINVAL; 5469 + } 5470 + } else if (btf_get_prog_ctx_type(log, btf, t, 5471 + env->prog->type, i)) { 5472 + /* If function expects ctx type in BTF check that caller 5473 + * is passing PTR_TO_CTX. 5474 + */ 5475 + if (reg->type != PTR_TO_CTX) { 5476 + bpf_log(log, 5477 + "arg#%d expected pointer to ctx, but got %s\n", 5478 + i, btf_type_str(t)); 5479 + return -EINVAL; 5480 + } 5481 + if (check_ctx_reg(env, reg, regno)) 5482 + return -EINVAL; 5483 + } else if (ptr_to_mem_ok) { 5484 + const struct btf_type *resolve_ret; 5485 + u32 type_size; 5486 + 5487 + resolve_ret = btf_resolve_size(btf, ref_t, &type_size); 5488 + if (IS_ERR(resolve_ret)) { 5489 + bpf_log(log, 5490 + "arg#%d reference type('%s %s') size cannot be determined: %ld\n", 5491 + i, btf_type_str(ref_t), ref_tname, 5492 + PTR_ERR(resolve_ret)); 5493 + return -EINVAL; 5494 + } 5495 + 5496 + if (check_mem_reg(env, reg, regno, type_size)) 5497 + return -EINVAL; 5498 + } else { 5499 + return -EINVAL; 5500 + } 5501 + } 5502 + 5503 + return 0; 5504 + } 5505 + 5364 5506 /* Compare BTF of a function with given bpf_reg_state. 5365 5507 * Returns: 5366 5508 * EFAULT - there is a verifier bug. Abort verification. 
··· 5509 5369 * 0 - BTF matches with what bpf_reg_state expects. 5510 5370 * Only PTR_TO_CTX and SCALAR_VALUE states are recognized. 5511 5371 */ 5512 - int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog, 5513 - struct bpf_reg_state *regs) 5372 + int btf_check_subprog_arg_match(struct bpf_verifier_env *env, int subprog, 5373 + struct bpf_reg_state *regs) 5514 5374 { 5515 - struct bpf_verifier_log *log = &env->log; 5516 5375 struct bpf_prog *prog = env->prog; 5517 5376 struct btf *btf = prog->aux->btf; 5518 - const struct btf_param *args; 5519 - const struct btf_type *t, *ref_t; 5520 - u32 i, nargs, btf_id, type_size; 5521 - const char *tname; 5522 5377 bool is_global; 5378 + u32 btf_id; 5379 + int err; 5523 5380 5524 5381 if (!prog->aux->func_info) 5525 5382 return -EINVAL; ··· 5528 5391 if (prog->aux->func_info_aux[subprog].unreliable) 5529 5392 return -EINVAL; 5530 5393 5531 - t = btf_type_by_id(btf, btf_id); 5532 - if (!t || !btf_type_is_func(t)) { 5533 - /* These checks were already done by the verifier while loading 5534 - * struct bpf_func_info 5535 - */ 5536 - bpf_log(log, "BTF of func#%d doesn't point to KIND_FUNC\n", 5537 - subprog); 5538 - return -EFAULT; 5539 - } 5540 - tname = btf_name_by_offset(btf, t->name_off); 5541 - 5542 - t = btf_type_by_id(btf, t->type); 5543 - if (!t || !btf_type_is_func_proto(t)) { 5544 - bpf_log(log, "Invalid BTF of func %s\n", tname); 5545 - return -EFAULT; 5546 - } 5547 - args = (const struct btf_param *)(t + 1); 5548 - nargs = btf_type_vlen(t); 5549 - if (nargs > MAX_BPF_FUNC_REG_ARGS) { 5550 - bpf_log(log, "Function %s has %d > %d args\n", tname, nargs, 5551 - MAX_BPF_FUNC_REG_ARGS); 5552 - goto out; 5553 - } 5554 - 5555 5394 is_global = prog->aux->func_info_aux[subprog].linkage == BTF_FUNC_GLOBAL; 5556 - /* check that BTF function arguments match actual types that the 5557 - * verifier sees. 
5558 - */ 5559 - for (i = 0; i < nargs; i++) { 5560 - struct bpf_reg_state *reg = &regs[i + 1]; 5395 + err = btf_check_func_arg_match(env, btf, btf_id, regs, is_global); 5561 5396 5562 - t = btf_type_by_id(btf, args[i].type); 5563 - while (btf_type_is_modifier(t)) 5564 - t = btf_type_by_id(btf, t->type); 5565 - if (btf_type_is_int(t) || btf_type_is_enum(t)) { 5566 - if (reg->type == SCALAR_VALUE) 5567 - continue; 5568 - bpf_log(log, "R%d is not a scalar\n", i + 1); 5569 - goto out; 5570 - } 5571 - if (btf_type_is_ptr(t)) { 5572 - /* If function expects ctx type in BTF check that caller 5573 - * is passing PTR_TO_CTX. 5574 - */ 5575 - if (btf_get_prog_ctx_type(log, btf, t, prog->type, i)) { 5576 - if (reg->type != PTR_TO_CTX) { 5577 - bpf_log(log, 5578 - "arg#%d expected pointer to ctx, but got %s\n", 5579 - i, btf_kind_str[BTF_INFO_KIND(t->info)]); 5580 - goto out; 5581 - } 5582 - if (check_ctx_reg(env, reg, i + 1)) 5583 - goto out; 5584 - continue; 5585 - } 5586 - 5587 - if (!is_global) 5588 - goto out; 5589 - 5590 - t = btf_type_skip_modifiers(btf, t->type, NULL); 5591 - 5592 - ref_t = btf_resolve_size(btf, t, &type_size); 5593 - if (IS_ERR(ref_t)) { 5594 - bpf_log(log, 5595 - "arg#%d reference type('%s %s') size cannot be determined: %ld\n", 5596 - i, btf_type_str(t), btf_name_by_offset(btf, t->name_off), 5597 - PTR_ERR(ref_t)); 5598 - goto out; 5599 - } 5600 - 5601 - if (check_mem_reg(env, reg, i + 1, type_size)) 5602 - goto out; 5603 - 5604 - continue; 5605 - } 5606 - bpf_log(log, "Unrecognized arg#%d type %s\n", 5607 - i, btf_kind_str[BTF_INFO_KIND(t->info)]); 5608 - goto out; 5609 - } 5610 - return 0; 5611 - out: 5612 5397 /* Compiler optimizations can remove arguments from static functions 5613 5398 * or mismatched type can be passed into a global function. 5614 5399 * In such cases mark the function as unreliable from BTF point of view. 
5615 5400 */ 5616 - prog->aux->func_info_aux[subprog].unreliable = true; 5617 - return -EINVAL; 5401 + if (err) 5402 + prog->aux->func_info_aux[subprog].unreliable = true; 5403 + return err; 5404 + } 5405 + 5406 + int btf_check_kfunc_arg_match(struct bpf_verifier_env *env, 5407 + const struct btf *btf, u32 func_id, 5408 + struct bpf_reg_state *regs) 5409 + { 5410 + return btf_check_func_arg_match(env, btf, func_id, regs, false); 5618 5411 } 5619 5412 5620 5413 /* Convert BTF of a function into bpf_reg_state if possible
+24 -23
kernel/bpf/core.c
··· 143 143 if (!prog->aux->nr_linfo || !prog->jit_requested) 144 144 return 0; 145 145 146 - prog->aux->jited_linfo = kcalloc(prog->aux->nr_linfo, 147 - sizeof(*prog->aux->jited_linfo), 148 - GFP_KERNEL_ACCOUNT | __GFP_NOWARN); 146 + prog->aux->jited_linfo = kvcalloc(prog->aux->nr_linfo, 147 + sizeof(*prog->aux->jited_linfo), 148 + GFP_KERNEL_ACCOUNT | __GFP_NOWARN); 149 149 if (!prog->aux->jited_linfo) 150 150 return -ENOMEM; 151 151 152 152 return 0; 153 153 } 154 154 155 - void bpf_prog_free_jited_linfo(struct bpf_prog *prog) 155 + void bpf_prog_jit_attempt_done(struct bpf_prog *prog) 156 156 { 157 - kfree(prog->aux->jited_linfo); 158 - prog->aux->jited_linfo = NULL; 159 - } 157 + if (prog->aux->jited_linfo && 158 + (!prog->jited || !prog->aux->jited_linfo[0])) { 159 + kvfree(prog->aux->jited_linfo); 160 + prog->aux->jited_linfo = NULL; 161 + } 160 162 161 - void bpf_prog_free_unused_jited_linfo(struct bpf_prog *prog) 162 - { 163 - if (prog->aux->jited_linfo && !prog->aux->jited_linfo[0]) 164 - bpf_prog_free_jited_linfo(prog); 163 + kfree(prog->aux->kfunc_tab); 164 + prog->aux->kfunc_tab = NULL; 165 165 } 166 166 167 167 /* The jit engine is responsible to provide an array ··· 215 215 */ 216 216 jited_linfo[i] = prog->bpf_func + 217 217 insn_to_jit_off[linfo[i].insn_off - insn_start - 1]; 218 - } 219 - 220 - void bpf_prog_free_linfo(struct bpf_prog *prog) 221 - { 222 - bpf_prog_free_jited_linfo(prog); 223 - kvfree(prog->aux->linfo); 224 218 } 225 219 226 220 struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, ··· 1843 1849 /* In case of BPF to BPF calls, verifier did all the prep 1844 1850 * work with regards to JITing, etc. 
1845 1851 */ 1852 + bool jit_needed = false; 1853 + 1846 1854 if (fp->bpf_func) 1847 1855 goto finalize; 1856 + 1857 + if (IS_ENABLED(CONFIG_BPF_JIT_ALWAYS_ON) || 1858 + bpf_prog_has_kfunc_call(fp)) 1859 + jit_needed = true; 1848 1860 1849 1861 bpf_prog_select_func(fp); 1850 1862 ··· 1866 1866 return fp; 1867 1867 1868 1868 fp = bpf_int_jit_compile(fp); 1869 - if (!fp->jited) { 1870 - bpf_prog_free_jited_linfo(fp); 1871 - #ifdef CONFIG_BPF_JIT_ALWAYS_ON 1869 + bpf_prog_jit_attempt_done(fp); 1870 + if (!fp->jited && jit_needed) { 1872 1871 *err = -ENOTSUPP; 1873 1872 return fp; 1874 - #endif 1875 - } else { 1876 - bpf_prog_free_unused_jited_linfo(fp); 1877 1873 } 1878 1874 } else { 1879 1875 *err = bpf_prog_offload_compile(fp); ··· 2346 2350 * them using insn_is_zext. 2347 2351 */ 2348 2352 bool __weak bpf_jit_needs_zext(void) 2353 + { 2354 + return false; 2355 + } 2356 + 2357 + bool __weak bpf_jit_supports_kfunc_call(void) 2349 2358 { 2350 2359 return false; 2351 2360 }
+10 -3
kernel/bpf/disasm.c
··· 19 19 { 20 20 BUILD_BUG_ON(ARRAY_SIZE(func_id_str) != __BPF_FUNC_MAX_ID); 21 21 22 - if (insn->src_reg != BPF_PSEUDO_CALL && 22 + if (!insn->src_reg && 23 23 insn->imm >= 0 && insn->imm < __BPF_FUNC_MAX_ID && 24 24 func_id_str[insn->imm]) 25 25 return func_id_str[insn->imm]; 26 26 27 - if (cbs && cbs->cb_call) 28 - return cbs->cb_call(cbs->private_data, insn); 27 + if (cbs && cbs->cb_call) { 28 + const char *res; 29 + 30 + res = cbs->cb_call(cbs->private_data, insn); 31 + if (res) 32 + return res; 33 + } 29 34 30 35 if (insn->src_reg == BPF_PSEUDO_CALL) 31 36 snprintf(buff, len, "%+d", insn->imm); 37 + else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) 38 + snprintf(buff, len, "kernel-function"); 32 39 33 40 return buff; 34 41 }
+11 -4
kernel/bpf/helpers.c
··· 382 382 }; 383 383 384 384 #ifdef CONFIG_CGROUP_BPF 385 - DECLARE_PER_CPU(struct bpf_cgroup_storage*, 386 - bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]); 385 + DECLARE_PER_CPU(struct bpf_cgroup_storage_info, 386 + bpf_cgroup_storage_info[BPF_CGROUP_STORAGE_NEST_MAX]); 387 387 388 388 BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags) 389 389 { ··· 392 392 * verifier checks that its value is correct. 393 393 */ 394 394 enum bpf_cgroup_storage_type stype = cgroup_storage_type(map); 395 - struct bpf_cgroup_storage *storage; 395 + struct bpf_cgroup_storage *storage = NULL; 396 396 void *ptr; 397 + int i; 397 398 398 - storage = this_cpu_read(bpf_cgroup_storage[stype]); 399 + for (i = 0; i < BPF_CGROUP_STORAGE_NEST_MAX; i++) { 400 + if (unlikely(this_cpu_read(bpf_cgroup_storage_info[i].task) != current)) 401 + continue; 402 + 403 + storage = this_cpu_read(bpf_cgroup_storage_info[i].storage[stype]); 404 + break; 405 + } 399 406 400 407 if (stype == BPF_CGROUP_STORAGE_SHARED) 401 408 ptr = &READ_ONCE(storage->buf)->data[0];
+3 -2
kernel/bpf/local_storage.c
··· 9 9 #include <linux/slab.h> 10 10 #include <uapi/linux/btf.h> 11 11 12 - DEFINE_PER_CPU(struct bpf_cgroup_storage*, bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]); 13 - 14 12 #ifdef CONFIG_CGROUP_BPF 13 + 14 + DEFINE_PER_CPU(struct bpf_cgroup_storage_info, 15 + bpf_cgroup_storage_info[BPF_CGROUP_STORAGE_NEST_MAX]); 15 16 16 17 #include "../cgroup/cgroup-internal.h" 17 18
+3
kernel/bpf/lpm_trie.c
··· 726 726 .map_lookup_elem = trie_lookup_elem, 727 727 .map_update_elem = trie_update_elem, 728 728 .map_delete_elem = trie_delete_elem, 729 + .map_lookup_batch = generic_map_lookup_batch, 730 + .map_update_batch = generic_map_update_batch, 731 + .map_delete_batch = generic_map_delete_batch, 729 732 .map_check_btf = trie_check_btf, 730 733 .map_btf_name = "lpm_trie", 731 734 .map_btf_id = &trie_map_btf_id,
+4 -1
kernel/bpf/syscall.c
··· 1694 1694 { 1695 1695 bpf_prog_kallsyms_del_all(prog); 1696 1696 btf_put(prog->aux->btf); 1697 - bpf_prog_free_linfo(prog); 1697 + kvfree(prog->aux->jited_linfo); 1698 + kvfree(prog->aux->linfo); 1699 + kfree(prog->aux->kfunc_tab); 1698 1700 if (prog->aux->attach_btf) 1699 1701 btf_put(prog->aux->attach_btf); 1700 1702 ··· 2948 2946 return BPF_PROG_TYPE_SK_MSG; 2949 2947 case BPF_SK_SKB_STREAM_PARSER: 2950 2948 case BPF_SK_SKB_STREAM_VERDICT: 2949 + case BPF_SK_SKB_VERDICT: 2951 2950 return BPF_PROG_TYPE_SK_SKB; 2952 2951 case BPF_LIRC_MODE2: 2953 2952 return BPF_PROG_TYPE_LIRC_MODE2;
+348 -42
kernel/bpf/verifier.c
··· 234 234 insn->src_reg == BPF_PSEUDO_CALL; 235 235 } 236 236 237 + static bool bpf_pseudo_kfunc_call(const struct bpf_insn *insn) 238 + { 239 + return insn->code == (BPF_JMP | BPF_CALL) && 240 + insn->src_reg == BPF_PSEUDO_KFUNC_CALL; 241 + } 242 + 237 243 static bool bpf_pseudo_func(const struct bpf_insn *insn) 238 244 { 239 245 return insn->code == (BPF_LD | BPF_IMM | BPF_DW) && ··· 1560 1554 verbose(env, "too many subprograms\n"); 1561 1555 return -E2BIG; 1562 1556 } 1557 + /* determine subprog starts. The end is one before the next starts */ 1563 1558 env->subprog_info[env->subprog_cnt++].start = off; 1564 1559 sort(env->subprog_info, env->subprog_cnt, 1565 1560 sizeof(env->subprog_info[0]), cmp_subprogs, NULL); 1566 1561 return env->subprog_cnt - 1; 1567 1562 } 1568 1563 1569 - static int check_subprogs(struct bpf_verifier_env *env) 1564 + struct bpf_kfunc_desc { 1565 + struct btf_func_model func_model; 1566 + u32 func_id; 1567 + s32 imm; 1568 + }; 1569 + 1570 + #define MAX_KFUNC_DESCS 256 1571 + struct bpf_kfunc_desc_tab { 1572 + struct bpf_kfunc_desc descs[MAX_KFUNC_DESCS]; 1573 + u32 nr_descs; 1574 + }; 1575 + 1576 + static int kfunc_desc_cmp_by_id(const void *a, const void *b) 1570 1577 { 1571 - int i, ret, subprog_start, subprog_end, off, cur_subprog = 0; 1578 + const struct bpf_kfunc_desc *d0 = a; 1579 + const struct bpf_kfunc_desc *d1 = b; 1580 + 1581 + /* func_id is not greater than BTF_MAX_TYPE */ 1582 + return d0->func_id - d1->func_id; 1583 + } 1584 + 1585 + static const struct bpf_kfunc_desc * 1586 + find_kfunc_desc(const struct bpf_prog *prog, u32 func_id) 1587 + { 1588 + struct bpf_kfunc_desc desc = { 1589 + .func_id = func_id, 1590 + }; 1591 + struct bpf_kfunc_desc_tab *tab; 1592 + 1593 + tab = prog->aux->kfunc_tab; 1594 + return bsearch(&desc, tab->descs, tab->nr_descs, 1595 + sizeof(tab->descs[0]), kfunc_desc_cmp_by_id); 1596 + } 1597 + 1598 + static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id) 1599 + { 1600 + const struct 
btf_type *func, *func_proto; 1601 + struct bpf_kfunc_desc_tab *tab; 1602 + struct bpf_prog_aux *prog_aux; 1603 + struct bpf_kfunc_desc *desc; 1604 + const char *func_name; 1605 + unsigned long addr; 1606 + int err; 1607 + 1608 + prog_aux = env->prog->aux; 1609 + tab = prog_aux->kfunc_tab; 1610 + if (!tab) { 1611 + if (!btf_vmlinux) { 1612 + verbose(env, "calling kernel function is not supported without CONFIG_DEBUG_INFO_BTF\n"); 1613 + return -ENOTSUPP; 1614 + } 1615 + 1616 + if (!env->prog->jit_requested) { 1617 + verbose(env, "JIT is required for calling kernel function\n"); 1618 + return -ENOTSUPP; 1619 + } 1620 + 1621 + if (!bpf_jit_supports_kfunc_call()) { 1622 + verbose(env, "JIT does not support calling kernel function\n"); 1623 + return -ENOTSUPP; 1624 + } 1625 + 1626 + if (!env->prog->gpl_compatible) { 1627 + verbose(env, "cannot call kernel function from non-GPL compatible program\n"); 1628 + return -EINVAL; 1629 + } 1630 + 1631 + tab = kzalloc(sizeof(*tab), GFP_KERNEL); 1632 + if (!tab) 1633 + return -ENOMEM; 1634 + prog_aux->kfunc_tab = tab; 1635 + } 1636 + 1637 + if (find_kfunc_desc(env->prog, func_id)) 1638 + return 0; 1639 + 1640 + if (tab->nr_descs == MAX_KFUNC_DESCS) { 1641 + verbose(env, "too many different kernel function calls\n"); 1642 + return -E2BIG; 1643 + } 1644 + 1645 + func = btf_type_by_id(btf_vmlinux, func_id); 1646 + if (!func || !btf_type_is_func(func)) { 1647 + verbose(env, "kernel btf_id %u is not a function\n", 1648 + func_id); 1649 + return -EINVAL; 1650 + } 1651 + func_proto = btf_type_by_id(btf_vmlinux, func->type); 1652 + if (!func_proto || !btf_type_is_func_proto(func_proto)) { 1653 + verbose(env, "kernel function btf_id %u does not have a valid func_proto\n", 1654 + func_id); 1655 + return -EINVAL; 1656 + } 1657 + 1658 + func_name = btf_name_by_offset(btf_vmlinux, func->name_off); 1659 + addr = kallsyms_lookup_name(func_name); 1660 + if (!addr) { 1661 + verbose(env, "cannot find address for kernel function %s\n", 1662 + 
func_name); 1663 + return -EINVAL; 1664 + } 1665 + 1666 + desc = &tab->descs[tab->nr_descs++]; 1667 + desc->func_id = func_id; 1668 + desc->imm = BPF_CAST_CALL(addr) - __bpf_call_base; 1669 + err = btf_distill_func_proto(&env->log, btf_vmlinux, 1670 + func_proto, func_name, 1671 + &desc->func_model); 1672 + if (!err) 1673 + sort(tab->descs, tab->nr_descs, sizeof(tab->descs[0]), 1674 + kfunc_desc_cmp_by_id, NULL); 1675 + return err; 1676 + } 1677 + 1678 + static int kfunc_desc_cmp_by_imm(const void *a, const void *b) 1679 + { 1680 + const struct bpf_kfunc_desc *d0 = a; 1681 + const struct bpf_kfunc_desc *d1 = b; 1682 + 1683 + if (d0->imm > d1->imm) 1684 + return 1; 1685 + else if (d0->imm < d1->imm) 1686 + return -1; 1687 + return 0; 1688 + } 1689 + 1690 + static void sort_kfunc_descs_by_imm(struct bpf_prog *prog) 1691 + { 1692 + struct bpf_kfunc_desc_tab *tab; 1693 + 1694 + tab = prog->aux->kfunc_tab; 1695 + if (!tab) 1696 + return; 1697 + 1698 + sort(tab->descs, tab->nr_descs, sizeof(tab->descs[0]), 1699 + kfunc_desc_cmp_by_imm, NULL); 1700 + } 1701 + 1702 + bool bpf_prog_has_kfunc_call(const struct bpf_prog *prog) 1703 + { 1704 + return !!prog->aux->kfunc_tab; 1705 + } 1706 + 1707 + const struct btf_func_model * 1708 + bpf_jit_find_kfunc_model(const struct bpf_prog *prog, 1709 + const struct bpf_insn *insn) 1710 + { 1711 + const struct bpf_kfunc_desc desc = { 1712 + .imm = insn->imm, 1713 + }; 1714 + const struct bpf_kfunc_desc *res; 1715 + struct bpf_kfunc_desc_tab *tab; 1716 + 1717 + tab = prog->aux->kfunc_tab; 1718 + res = bsearch(&desc, tab->descs, tab->nr_descs, 1719 + sizeof(tab->descs[0]), kfunc_desc_cmp_by_imm); 1720 + 1721 + return res ? 
&res->func_model : NULL; 1722 + } 1723 + 1724 + static int add_subprog_and_kfunc(struct bpf_verifier_env *env) 1725 + { 1572 1726 struct bpf_subprog_info *subprog = env->subprog_info; 1573 1727 struct bpf_insn *insn = env->prog->insnsi; 1574 - int insn_cnt = env->prog->len; 1728 + int i, ret, insn_cnt = env->prog->len; 1575 1729 1576 1730 /* Add entry function. */ 1577 1731 ret = add_subprog(env, 0); 1578 - if (ret < 0) 1732 + if (ret) 1579 1733 return ret; 1580 1734 1581 - /* determine subprog starts. The end is one before the next starts */ 1582 - for (i = 0; i < insn_cnt; i++) { 1583 - if (bpf_pseudo_func(insn + i)) { 1584 - if (!env->bpf_capable) { 1585 - verbose(env, 1586 - "function pointers are allowed for CAP_BPF and CAP_SYS_ADMIN\n"); 1587 - return -EPERM; 1588 - } 1589 - ret = add_subprog(env, i + insn[i].imm + 1); 1590 - if (ret < 0) 1591 - return ret; 1592 - /* remember subprog */ 1593 - insn[i + 1].imm = ret; 1735 + for (i = 0; i < insn_cnt; i++, insn++) { 1736 + if (!bpf_pseudo_func(insn) && !bpf_pseudo_call(insn) && 1737 + !bpf_pseudo_kfunc_call(insn)) 1594 1738 continue; 1595 - } 1596 - if (!bpf_pseudo_call(insn + i)) 1597 - continue; 1739 + 1598 1740 if (!env->bpf_capable) { 1599 - verbose(env, 1600 - "function calls to other bpf functions are allowed for CAP_BPF and CAP_SYS_ADMIN\n"); 1741 + verbose(env, "loading/calling other bpf or kernel functions are allowed for CAP_BPF and CAP_SYS_ADMIN\n"); 1601 1742 return -EPERM; 1602 1743 } 1603 - ret = add_subprog(env, i + insn[i].imm + 1); 1744 + 1745 + if (bpf_pseudo_func(insn)) { 1746 + ret = add_subprog(env, i + insn->imm + 1); 1747 + if (ret >= 0) 1748 + /* remember subprog */ 1749 + insn[1].imm = ret; 1750 + } else if (bpf_pseudo_call(insn)) { 1751 + ret = add_subprog(env, i + insn->imm + 1); 1752 + } else { 1753 + ret = add_kfunc_call(env, insn->imm); 1754 + } 1755 + 1604 1756 if (ret < 0) 1605 1757 return ret; 1606 1758 } ··· 1771 1607 if (env->log.level & BPF_LOG_LEVEL2) 1772 1608 for (i = 0; i 
< env->subprog_cnt; i++) 1773 1609 verbose(env, "func#%d @%d\n", i, subprog[i].start); 1610 + 1611 + return 0; 1612 + } 1613 + 1614 + static int check_subprogs(struct bpf_verifier_env *env) 1615 + { 1616 + int i, subprog_start, subprog_end, off, cur_subprog = 0; 1617 + struct bpf_subprog_info *subprog = env->subprog_info; 1618 + struct bpf_insn *insn = env->prog->insnsi; 1619 + int insn_cnt = env->prog->len; 1774 1620 1775 1621 /* now check that all jumps are within the same subprog */ 1776 1622 subprog_start = subprog[cur_subprog].start; ··· 2090 1916 return i; 2091 1917 } 2092 1918 1919 + static const char *disasm_kfunc_name(void *data, const struct bpf_insn *insn) 1920 + { 1921 + const struct btf_type *func; 1922 + 1923 + if (insn->src_reg != BPF_PSEUDO_KFUNC_CALL) 1924 + return NULL; 1925 + 1926 + func = btf_type_by_id(btf_vmlinux, insn->imm); 1927 + return btf_name_by_offset(btf_vmlinux, func->name_off); 1928 + } 1929 + 2093 1930 /* For given verifier state backtrack_insn() is called from the last insn to 2094 1931 * the first insn. Its purpose is to compute a bitmask of registers and 2095 1932 * stack slots that needs precision in the parent verifier state. ··· 2109 1924 u32 *reg_mask, u64 *stack_mask) 2110 1925 { 2111 1926 const struct bpf_insn_cbs cbs = { 1927 + .cb_call = disasm_kfunc_name, 2112 1928 .cb_print = verbose, 2113 1929 .private_data = env, 2114 1930 }; ··· 5551 5365 func_info_aux = env->prog->aux->func_info_aux; 5552 5366 if (func_info_aux) 5553 5367 is_global = func_info_aux[subprog].linkage == BTF_FUNC_GLOBAL; 5554 - err = btf_check_func_arg_match(env, subprog, caller->regs); 5368 + err = btf_check_subprog_arg_match(env, subprog, caller->regs); 5555 5369 if (err == -EFAULT) 5556 5370 return err; 5557 5371 if (is_global) { ··· 6146 5960 return 0; 6147 5961 } 6148 5962 5963 + /* mark_btf_func_reg_size() is used when the reg size is determined by 5964 + * the BTF func_proto's return value size and argument. 
5965 + */ 5966 + static void mark_btf_func_reg_size(struct bpf_verifier_env *env, u32 regno, 5967 + size_t reg_size) 5968 + { 5969 + struct bpf_reg_state *reg = &cur_regs(env)[regno]; 5970 + 5971 + if (regno == BPF_REG_0) { 5972 + /* Function return value */ 5973 + reg->live |= REG_LIVE_WRITTEN; 5974 + reg->subreg_def = reg_size == sizeof(u64) ? 5975 + DEF_NOT_SUBREG : env->insn_idx + 1; 5976 + } else { 5977 + /* Function argument */ 5978 + if (reg_size == sizeof(u64)) { 5979 + mark_insn_zext(env, reg); 5980 + mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64); 5981 + } else { 5982 + mark_reg_read(env, reg, reg->parent, REG_LIVE_READ32); 5983 + } 5984 + } 5985 + } 5986 + 5987 + static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn) 5988 + { 5989 + const struct btf_type *t, *func, *func_proto, *ptr_type; 5990 + struct bpf_reg_state *regs = cur_regs(env); 5991 + const char *func_name, *ptr_type_name; 5992 + u32 i, nargs, func_id, ptr_type_id; 5993 + const struct btf_param *args; 5994 + int err; 5995 + 5996 + func_id = insn->imm; 5997 + func = btf_type_by_id(btf_vmlinux, func_id); 5998 + func_name = btf_name_by_offset(btf_vmlinux, func->name_off); 5999 + func_proto = btf_type_by_id(btf_vmlinux, func->type); 6000 + 6001 + if (!env->ops->check_kfunc_call || 6002 + !env->ops->check_kfunc_call(func_id)) { 6003 + verbose(env, "calling kernel function %s is not allowed\n", 6004 + func_name); 6005 + return -EACCES; 6006 + } 6007 + 6008 + /* Check the arguments */ 6009 + err = btf_check_kfunc_arg_match(env, btf_vmlinux, func_id, regs); 6010 + if (err) 6011 + return err; 6012 + 6013 + for (i = 0; i < CALLER_SAVED_REGS; i++) 6014 + mark_reg_not_init(env, regs, caller_saved[i]); 6015 + 6016 + /* Check return type */ 6017 + t = btf_type_skip_modifiers(btf_vmlinux, func_proto->type, NULL); 6018 + if (btf_type_is_scalar(t)) { 6019 + mark_reg_unknown(env, regs, BPF_REG_0); 6020 + mark_btf_func_reg_size(env, BPF_REG_0, t->size); 6021 + } else if 
(btf_type_is_ptr(t)) { 6022 + ptr_type = btf_type_skip_modifiers(btf_vmlinux, t->type, 6023 + &ptr_type_id); 6024 + if (!btf_type_is_struct(ptr_type)) { 6025 + ptr_type_name = btf_name_by_offset(btf_vmlinux, 6026 + ptr_type->name_off); 6027 + verbose(env, "kernel function %s returns pointer type %s %s is not supported\n", 6028 + func_name, btf_type_str(ptr_type), 6029 + ptr_type_name); 6030 + return -EINVAL; 6031 + } 6032 + mark_reg_known_zero(env, regs, BPF_REG_0); 6033 + regs[BPF_REG_0].btf = btf_vmlinux; 6034 + regs[BPF_REG_0].type = PTR_TO_BTF_ID; 6035 + regs[BPF_REG_0].btf_id = ptr_type_id; 6036 + mark_btf_func_reg_size(env, BPF_REG_0, sizeof(void *)); 6037 + } /* else { add_kfunc_call() ensures it is btf_type_is_void(t) } */ 6038 + 6039 + nargs = btf_type_vlen(func_proto); 6040 + args = (const struct btf_param *)(func_proto + 1); 6041 + for (i = 0; i < nargs; i++) { 6042 + u32 regno = i + 1; 6043 + 6044 + t = btf_type_skip_modifiers(btf_vmlinux, args[i].type, NULL); 6045 + if (btf_type_is_ptr(t)) 6046 + mark_btf_func_reg_size(env, regno, sizeof(void *)); 6047 + else 6048 + /* scalar. ensured by btf_check_kfunc_arg_match() */ 6049 + mark_btf_func_reg_size(env, regno, t->size); 6050 + } 6051 + 6052 + return 0; 6053 + } 6054 + 6149 6055 static bool signed_add_overflows(s64 a, s64 b) 6150 6056 { 6151 6057 /* Do the add in u64, where overflow is well-defined */ ··· 6340 6062 else 6341 6063 *ptr_limit = -off - 1; 6342 6064 return *ptr_limit >= max ? -ERANGE : 0; 6343 - case PTR_TO_MAP_KEY: 6344 - /* Currently, this code is not exercised as the only use 6345 - * is bpf_for_each_map_elem() helper which requires 6346 - * bpf_capble. The code has been tested manually for 6347 - * future use. 
6348 - */ 6349 - if (mask_to_left) { 6350 - *ptr_limit = ptr_reg->umax_value + ptr_reg->off; 6351 - } else { 6352 - off = ptr_reg->smin_value + ptr_reg->off; 6353 - *ptr_limit = ptr_reg->map_ptr->key_size - off; 6354 - } 6355 - return 0; 6356 6065 case PTR_TO_MAP_VALUE: 6357 6066 max = ptr_reg->map_ptr->value_size; 6358 6067 if (mask_to_left) { ··· 6546 6281 verbose(env, "R%d pointer arithmetic on %s prohibited\n", 6547 6282 dst, reg_type_str[ptr_reg->type]); 6548 6283 return -EACCES; 6549 - case PTR_TO_MAP_KEY: 6550 6284 case PTR_TO_MAP_VALUE: 6551 6285 if (!env->allow_ptr_leaks && !known && (smin_val < 0) != (smax_val < 0)) { 6552 6286 verbose(env, "R%d has unknown scalar with mixed signed bounds, pointer arithmetic with it prohibited for !root\n", ··· 10440 10176 10441 10177 if (env->log.level & BPF_LOG_LEVEL) { 10442 10178 const struct bpf_insn_cbs cbs = { 10179 + .cb_call = disasm_kfunc_name, 10443 10180 .cb_print = verbose, 10444 10181 .private_data = env, 10445 10182 }; ··· 10588 10323 if (BPF_SRC(insn->code) != BPF_K || 10589 10324 insn->off != 0 || 10590 10325 (insn->src_reg != BPF_REG_0 && 10591 - insn->src_reg != BPF_PSEUDO_CALL) || 10326 + insn->src_reg != BPF_PSEUDO_CALL && 10327 + insn->src_reg != BPF_PSEUDO_KFUNC_CALL) || 10592 10328 insn->dst_reg != BPF_REG_0 || 10593 10329 class == BPF_JMP32) { 10594 10330 verbose(env, "BPF_CALL uses reserved fields\n"); ··· 10604 10338 } 10605 10339 if (insn->src_reg == BPF_PSEUDO_CALL) 10606 10340 err = check_func_call(env, insn, &env->insn_idx); 10341 + else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) 10342 + err = check_kfunc_call(env, insn); 10607 10343 else 10608 10344 err = check_helper_call(env, insn, &env->insn_idx); 10609 10345 if (err) ··· 11916 11648 func[i]->aux->name[0] = 'F'; 11917 11649 func[i]->aux->stack_depth = env->subprog_info[i].stack_depth; 11918 11650 func[i]->jit_requested = 1; 11651 + func[i]->aux->kfunc_tab = prog->aux->kfunc_tab; 11919 11652 func[i]->aux->linfo = prog->aux->linfo; 11920 
11653 func[i]->aux->nr_linfo = prog->aux->nr_linfo; 11921 11654 func[i]->aux->jited_linfo = prog->aux->jited_linfo; ··· 12024 11755 prog->bpf_func = func[0]->bpf_func; 12025 11756 prog->aux->func = func; 12026 11757 prog->aux->func_cnt = env->subprog_cnt; 12027 - bpf_prog_free_unused_jited_linfo(prog); 11758 + bpf_prog_jit_attempt_done(prog); 12028 11759 return 0; 12029 11760 out_free: 12030 11761 for (i = 0; i < env->subprog_cnt; i++) { ··· 12047 11778 insn->off = 0; 12048 11779 insn->imm = env->insn_aux_data[i].call_imm; 12049 11780 } 12050 - bpf_prog_free_jited_linfo(prog); 11781 + bpf_prog_jit_attempt_done(prog); 12051 11782 return err; 12052 11783 } 12053 11784 ··· 12056 11787 #ifndef CONFIG_BPF_JIT_ALWAYS_ON 12057 11788 struct bpf_prog *prog = env->prog; 12058 11789 struct bpf_insn *insn = prog->insnsi; 11790 + bool has_kfunc_call = bpf_prog_has_kfunc_call(prog); 12059 11791 int i, depth; 12060 11792 #endif 12061 11793 int err = 0; ··· 12070 11800 return err; 12071 11801 } 12072 11802 #ifndef CONFIG_BPF_JIT_ALWAYS_ON 11803 + if (has_kfunc_call) { 11804 + verbose(env, "calling kernel functions are not allowed in non-JITed programs\n"); 11805 + return -EINVAL; 11806 + } 12073 11807 if (env->subprog_cnt > 1 && env->prog->aux->tail_call_reachable) { 12074 11808 /* When JIT fails the progs with bpf2bpf calls and tail_calls 12075 11809 * have to be rejected, since interpreter doesn't support them yet. ··· 12100 11826 err = 0; 12101 11827 #endif 12102 11828 return err; 11829 + } 11830 + 11831 + static int fixup_kfunc_call(struct bpf_verifier_env *env, 11832 + struct bpf_insn *insn) 11833 + { 11834 + const struct bpf_kfunc_desc *desc; 11835 + 11836 + /* insn->imm has the btf func_id. Replace it with 11837 + * an address (relative to __bpf_base_call). 
11838 + */ 11839 + desc = find_kfunc_desc(env->prog, insn->imm); 11840 + if (!desc) { 11841 + verbose(env, "verifier internal error: kernel function descriptor not found for func_id %u\n", 11842 + insn->imm); 11843 + return -EFAULT; 11844 + } 11845 + 11846 + insn->imm = desc->imm; 11847 + 11848 + return 0; 12103 11849 } 12104 11850 12105 11851 /* Do various post-verification rewrites in a single program pass. ··· 12257 11963 continue; 12258 11964 if (insn->src_reg == BPF_PSEUDO_CALL) 12259 11965 continue; 11966 + if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) { 11967 + ret = fixup_kfunc_call(env, insn); 11968 + if (ret) 11969 + return ret; 11970 + continue; 11971 + } 12260 11972 12261 11973 if (insn->imm == BPF_FUNC_get_route_realm) 12262 11974 prog->dst_needed = 1; ··· 12492 12192 } 12493 12193 } 12494 12194 12195 + sort_kfunc_descs_by_imm(env->prog); 12196 + 12495 12197 return 0; 12496 12198 } 12497 12199 ··· 12604 12302 /* 1st arg to a function */ 12605 12303 regs[BPF_REG_1].type = PTR_TO_CTX; 12606 12304 mark_reg_known_zero(env, regs, BPF_REG_1); 12607 - ret = btf_check_func_arg_match(env, subprog, regs); 12305 + ret = btf_check_subprog_arg_match(env, subprog, regs); 12608 12306 if (ret == -EFAULT) 12609 12307 /* unlikely verifier bug. abort. 12610 12308 * ret == 0 and ret < 0 are sadly acceptable for ··· 13197 12895 GFP_USER); 13198 12896 ret = -ENOMEM; 13199 12897 if (!env->explored_states) 12898 + goto skip_full_check; 12899 + 12900 + ret = add_subprog_and_kfunc(env); 12901 + if (ret < 0) 13200 12902 goto skip_full_check; 13201 12903 13202 12904 ret = check_subprogs(env);
+33 -1
net/bpf/test_run.c
··· 2 2 /* Copyright (c) 2017 Facebook 3 3 */ 4 4 #include <linux/bpf.h> 5 + #include <linux/btf_ids.h> 5 6 #include <linux/slab.h> 6 7 #include <linux/vmalloc.h> 7 8 #include <linux/etherdevice.h> ··· 107 106 108 107 bpf_test_timer_enter(&t); 109 108 do { 110 - bpf_cgroup_storage_set(storage); 109 + ret = bpf_cgroup_storage_set(storage); 110 + if (ret) 111 + break; 111 112 112 113 if (xdp) 113 114 *retval = bpf_prog_run_xdp(prog, ctx); 114 115 else 115 116 *retval = BPF_PROG_RUN(prog, ctx); 117 + 118 + bpf_cgroup_storage_unset(); 116 119 } while (bpf_test_timer_continue(&t, repeat, &ret, time)); 117 120 bpf_test_timer_leave(&t); 118 121 ··· 214 209 *b += 1; 215 210 return a + *b; 216 211 } 212 + 213 + u64 noinline bpf_kfunc_call_test1(struct sock *sk, u32 a, u64 b, u32 c, u64 d) 214 + { 215 + return a + b + c + d; 216 + } 217 + 218 + int noinline bpf_kfunc_call_test2(struct sock *sk, u32 a, u32 b) 219 + { 220 + return a + b; 221 + } 222 + 223 + struct sock * noinline bpf_kfunc_call_test3(struct sock *sk) 224 + { 225 + return sk; 226 + } 227 + 217 228 __diag_pop(); 218 229 219 230 ALLOW_ERROR_INJECTION(bpf_modify_return_test, ERRNO); 231 + 232 + BTF_SET_START(test_sk_kfunc_ids) 233 + BTF_ID(func, bpf_kfunc_call_test1) 234 + BTF_ID(func, bpf_kfunc_call_test2) 235 + BTF_ID(func, bpf_kfunc_call_test3) 236 + BTF_SET_END(test_sk_kfunc_ids) 237 + 238 + bool bpf_prog_test_check_kfunc_call(u32 kfunc_id) 239 + { 240 + return btf_id_set_contains(&test_sk_kfunc_ids, kfunc_id); 241 + } 220 242 221 243 static void *bpf_test_init(const union bpf_attr *kattr, u32 size, 222 244 u32 headroom, u32 tailroom)
+1
net/core/filter.c
··· 9813 9813 .convert_ctx_access = tc_cls_act_convert_ctx_access, 9814 9814 .gen_prologue = tc_cls_act_prologue, 9815 9815 .gen_ld_abs = bpf_gen_ld_abs, 9816 + .check_kfunc_call = bpf_prog_test_check_kfunc_call, 9816 9817 }; 9817 9818 9818 9819 const struct bpf_prog_ops tc_cls_act_prog_ops = {
+48 -7
net/core/skbuff.c
··· 2500 2500 } 2501 2501 EXPORT_SYMBOL_GPL(skb_splice_bits); 2502 2502 2503 - /* Send skb data on a socket. Socket must be locked. */ 2504 - int skb_send_sock_locked(struct sock *sk, struct sk_buff *skb, int offset, 2505 - int len) 2503 + static int sendmsg_unlocked(struct sock *sk, struct msghdr *msg, 2504 + struct kvec *vec, size_t num, size_t size) 2505 + { 2506 + struct socket *sock = sk->sk_socket; 2507 + 2508 + if (!sock) 2509 + return -EINVAL; 2510 + return kernel_sendmsg(sock, msg, vec, num, size); 2511 + } 2512 + 2513 + static int sendpage_unlocked(struct sock *sk, struct page *page, int offset, 2514 + size_t size, int flags) 2515 + { 2516 + struct socket *sock = sk->sk_socket; 2517 + 2518 + if (!sock) 2519 + return -EINVAL; 2520 + return kernel_sendpage(sock, page, offset, size, flags); 2521 + } 2522 + 2523 + typedef int (*sendmsg_func)(struct sock *sk, struct msghdr *msg, 2524 + struct kvec *vec, size_t num, size_t size); 2525 + typedef int (*sendpage_func)(struct sock *sk, struct page *page, int offset, 2526 + size_t size, int flags); 2527 + static int __skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, 2528 + int len, sendmsg_func sendmsg, sendpage_func sendpage) 2506 2529 { 2507 2530 unsigned int orig_len = len; 2508 2531 struct sk_buff *head = skb; ··· 2545 2522 memset(&msg, 0, sizeof(msg)); 2546 2523 msg.msg_flags = MSG_DONTWAIT; 2547 2524 2548 - ret = kernel_sendmsg_locked(sk, &msg, &kv, 1, slen); 2525 + ret = INDIRECT_CALL_2(sendmsg, kernel_sendmsg_locked, 2526 + sendmsg_unlocked, sk, &msg, &kv, 1, slen); 2549 2527 if (ret <= 0) 2550 2528 goto error; 2551 2529 ··· 2577 2553 slen = min_t(size_t, len, skb_frag_size(frag) - offset); 2578 2554 2579 2555 while (slen) { 2580 - ret = kernel_sendpage_locked(sk, skb_frag_page(frag), 2581 - skb_frag_off(frag) + offset, 2582 - slen, MSG_DONTWAIT); 2556 + ret = INDIRECT_CALL_2(sendpage, kernel_sendpage_locked, 2557 + sendpage_unlocked, sk, 2558 + skb_frag_page(frag), 2559 + skb_frag_off(frag) + 
offset, 2560 + slen, MSG_DONTWAIT); 2583 2561 if (ret <= 0) 2584 2562 goto error; 2585 2563 ··· 2613 2587 error: 2614 2588 return orig_len == len ? ret : orig_len - len; 2615 2589 } 2590 + 2591 + /* Send skb data on a socket. Socket must be locked. */ 2592 + int skb_send_sock_locked(struct sock *sk, struct sk_buff *skb, int offset, 2593 + int len) 2594 + { 2595 + return __skb_send_sock(sk, skb, offset, len, kernel_sendmsg_locked, 2596 + kernel_sendpage_locked); 2597 + } 2616 2598 EXPORT_SYMBOL_GPL(skb_send_sock_locked); 2599 + 2600 + /* Send skb data on a socket. Socket must be unlocked. */ 2601 + int skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, int len) 2602 + { 2603 + return __skb_send_sock(sk, skb, offset, len, sendmsg_unlocked, 2604 + sendpage_unlocked); 2605 + } 2617 2606 2618 2607 /** 2619 2608 * skb_store_bits - store bits from kernel buffer to skb
+143 -34
net/core/skmsg.c
··· 399 399 } 400 400 EXPORT_SYMBOL_GPL(sk_msg_memcopy_from_iter); 401 401 402 + int sk_msg_wait_data(struct sock *sk, struct sk_psock *psock, int flags, 403 + long timeo, int *err) 404 + { 405 + DEFINE_WAIT_FUNC(wait, woken_wake_function); 406 + int ret = 0; 407 + 408 + if (sk->sk_shutdown & RCV_SHUTDOWN) 409 + return 1; 410 + 411 + if (!timeo) 412 + return ret; 413 + 414 + add_wait_queue(sk_sleep(sk), &wait); 415 + sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); 416 + ret = sk_wait_event(sk, &timeo, 417 + !list_empty(&psock->ingress_msg) || 418 + !skb_queue_empty(&sk->sk_receive_queue), &wait); 419 + sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); 420 + remove_wait_queue(sk_sleep(sk), &wait); 421 + return ret; 422 + } 423 + EXPORT_SYMBOL_GPL(sk_msg_wait_data); 424 + 425 + /* Receive sk_msg from psock->ingress_msg to @msg. */ 426 + int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, 427 + int len, int flags) 428 + { 429 + struct iov_iter *iter = &msg->msg_iter; 430 + int peek = flags & MSG_PEEK; 431 + struct sk_msg *msg_rx; 432 + int i, copied = 0; 433 + 434 + msg_rx = sk_psock_peek_msg(psock); 435 + while (copied != len) { 436 + struct scatterlist *sge; 437 + 438 + if (unlikely(!msg_rx)) 439 + break; 440 + 441 + i = msg_rx->sg.start; 442 + do { 443 + struct page *page; 444 + int copy; 445 + 446 + sge = sk_msg_elem(msg_rx, i); 447 + copy = sge->length; 448 + page = sg_page(sge); 449 + if (copied + copy > len) 450 + copy = len - copied; 451 + copy = copy_page_to_iter(page, sge->offset, copy, iter); 452 + if (!copy) 453 + return copied ? 
copied : -EFAULT; 454 + 455 + copied += copy; 456 + if (likely(!peek)) { 457 + sge->offset += copy; 458 + sge->length -= copy; 459 + if (!msg_rx->skb) 460 + sk_mem_uncharge(sk, copy); 461 + msg_rx->sg.size -= copy; 462 + 463 + if (!sge->length) { 464 + sk_msg_iter_var_next(i); 465 + if (!msg_rx->skb) 466 + put_page(page); 467 + } 468 + } else { 469 + /* Lets not optimize peek case if copy_page_to_iter 470 + * didn't copy the entire length lets just break. 471 + */ 472 + if (copy != sge->length) 473 + return copied; 474 + sk_msg_iter_var_next(i); 475 + } 476 + 477 + if (copied == len) 478 + break; 479 + } while (i != msg_rx->sg.end); 480 + 481 + if (unlikely(peek)) { 482 + msg_rx = sk_psock_next_msg(psock, msg_rx); 483 + if (!msg_rx) 484 + break; 485 + continue; 486 + } 487 + 488 + msg_rx->sg.start = i; 489 + if (!sge->length && msg_rx->sg.start == msg_rx->sg.end) { 490 + msg_rx = sk_psock_dequeue_msg(psock); 491 + kfree_sk_msg(msg_rx); 492 + } 493 + msg_rx = sk_psock_peek_msg(psock); 494 + } 495 + 496 + return copied; 497 + } 498 + EXPORT_SYMBOL_GPL(sk_msg_recvmsg); 499 + 402 500 static struct sk_msg *sk_psock_create_ingress_msg(struct sock *sk, 403 501 struct sk_buff *skb) 404 502 { ··· 508 410 if (!sk_rmem_schedule(sk, skb, skb->truesize)) 509 411 return NULL; 510 412 511 - msg = kzalloc(sizeof(*msg), __GFP_NOWARN | GFP_ATOMIC); 413 + msg = kzalloc(sizeof(*msg), __GFP_NOWARN | GFP_KERNEL); 512 414 if (unlikely(!msg)) 513 415 return NULL; 514 416 ··· 595 497 if (!ingress) { 596 498 if (!sock_writeable(psock->sk)) 597 499 return -EAGAIN; 598 - return skb_send_sock_locked(psock->sk, skb, off, len); 500 + return skb_send_sock(psock->sk, skb, off, len); 599 501 } 600 502 return sk_psock_skb_ingress(psock, skb); 601 503 } ··· 609 511 u32 len, off; 610 512 int ret; 611 513 612 - /* Lock sock to avoid losing sk_socket during loop. 
*/ 613 - lock_sock(psock->sk); 514 + mutex_lock(&psock->work_mutex); 614 515 if (state->skb) { 615 516 skb = state->skb; 616 517 len = state->len; ··· 626 529 skb_bpf_redirect_clear(skb); 627 530 do { 628 531 ret = -EIO; 629 - if (likely(psock->sk->sk_socket)) 532 + if (!sock_flag(psock->sk, SOCK_DEAD)) 630 533 ret = sk_psock_handle_skb(psock, skb, off, 631 534 len, ingress); 632 535 if (ret <= 0) { ··· 650 553 kfree_skb(skb); 651 554 } 652 555 end: 653 - release_sock(psock->sk); 556 + mutex_unlock(&psock->work_mutex); 654 557 } 655 558 656 559 struct sk_psock *sk_psock_init(struct sock *sk, int node) ··· 659 562 struct proto *prot; 660 563 661 564 write_lock_bh(&sk->sk_callback_lock); 662 - 663 - if (inet_csk_has_ulp(sk)) { 664 - psock = ERR_PTR(-EINVAL); 665 - goto out; 666 - } 667 565 668 566 if (sk->sk_user_data) { 669 567 psock = ERR_PTR(-EBUSY); ··· 683 591 spin_lock_init(&psock->link_lock); 684 592 685 593 INIT_WORK(&psock->work, sk_psock_backlog); 594 + mutex_init(&psock->work_mutex); 686 595 INIT_LIST_HEAD(&psock->ingress_msg); 596 + spin_lock_init(&psock->ingress_lock); 687 597 skb_queue_head_init(&psock->ingress_skb); 688 598 689 599 sk_psock_set_state(psock, SK_PSOCK_TX_ENABLED); ··· 724 630 } 725 631 } 726 632 727 - static void sk_psock_zap_ingress(struct sk_psock *psock) 633 + static void __sk_psock_zap_ingress(struct sk_psock *psock) 728 634 { 729 635 struct sk_buff *skb; 730 636 731 - while ((skb = __skb_dequeue(&psock->ingress_skb)) != NULL) { 637 + while ((skb = skb_dequeue(&psock->ingress_skb)) != NULL) { 732 638 skb_bpf_redirect_clear(skb); 733 639 kfree_skb(skb); 734 640 } ··· 745 651 } 746 652 } 747 653 654 + void sk_psock_stop(struct sk_psock *psock, bool wait) 655 + { 656 + spin_lock_bh(&psock->ingress_lock); 657 + sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED); 658 + sk_psock_cork_free(psock); 659 + __sk_psock_zap_ingress(psock); 660 + spin_unlock_bh(&psock->ingress_lock); 661 + 662 + if (wait) 663 + cancel_work_sync(&psock->work); 664 + 
} 665 + 748 666 static void sk_psock_done_strp(struct sk_psock *psock); 749 667 750 - static void sk_psock_destroy_deferred(struct work_struct *gc) 668 + static void sk_psock_destroy(struct work_struct *work) 751 669 { 752 - struct sk_psock *psock = container_of(gc, struct sk_psock, gc); 753 - 670 + struct sk_psock *psock = container_of(to_rcu_work(work), 671 + struct sk_psock, rwork); 754 672 /* No sk_callback_lock since already detached. */ 755 673 756 674 sk_psock_done_strp(psock); 757 675 758 676 cancel_work_sync(&psock->work); 677 + mutex_destroy(&psock->work_mutex); 759 678 760 679 psock_progs_drop(&psock->progs); 761 680 762 681 sk_psock_link_destroy(psock); 763 682 sk_psock_cork_free(psock); 764 - sk_psock_zap_ingress(psock); 765 683 766 684 if (psock->sk_redir) 767 685 sock_put(psock->sk_redir); ··· 781 675 kfree(psock); 782 676 } 783 677 784 - static void sk_psock_destroy(struct rcu_head *rcu) 785 - { 786 - struct sk_psock *psock = container_of(rcu, struct sk_psock, rcu); 787 - 788 - INIT_WORK(&psock->gc, sk_psock_destroy_deferred); 789 - schedule_work(&psock->gc); 790 - } 791 - 792 678 void sk_psock_drop(struct sock *sk, struct sk_psock *psock) 793 679 { 794 - sk_psock_cork_free(psock); 795 - sk_psock_zap_ingress(psock); 680 + sk_psock_stop(psock, false); 796 681 797 682 write_lock_bh(&sk->sk_callback_lock); 798 683 sk_psock_restore_proto(sk, psock); 799 684 rcu_assign_sk_user_data(sk, NULL); 800 685 if (psock->progs.stream_parser) 801 686 sk_psock_stop_strp(sk, psock); 802 - else if (psock->progs.stream_verdict) 687 + else if (psock->progs.stream_verdict || psock->progs.skb_verdict) 803 688 sk_psock_stop_verdict(sk, psock); 804 689 write_unlock_bh(&sk->sk_callback_lock); 805 - sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED); 806 690 807 - call_rcu(&psock->rcu, sk_psock_destroy); 691 + INIT_RCU_WORK(&psock->rwork, sk_psock_destroy); 692 + queue_rcu_work(system_wq, &psock->rwork); 808 693 } 809 694 EXPORT_SYMBOL_GPL(sk_psock_drop); 810 695 ··· 864 767 * 
error that caused the pipe to break. We can't send a packet on 865 768 * a socket that is in this state so we drop the skb. 866 769 */ 867 - if (!psock_other || sock_flag(sk_other, SOCK_DEAD) || 868 - !sk_psock_test_state(psock_other, SK_PSOCK_TX_ENABLED)) { 770 + if (!psock_other || sock_flag(sk_other, SOCK_DEAD)) { 771 + kfree_skb(skb); 772 + return; 773 + } 774 + spin_lock_bh(&psock_other->ingress_lock); 775 + if (!sk_psock_test_state(psock_other, SK_PSOCK_TX_ENABLED)) { 776 + spin_unlock_bh(&psock_other->ingress_lock); 869 777 kfree_skb(skb); 870 778 return; 871 779 } 872 780 873 781 skb_queue_tail(&psock_other->ingress_skb, skb); 874 782 schedule_work(&psock_other->work); 783 + spin_unlock_bh(&psock_other->ingress_lock); 875 784 } 876 785 877 786 static void sk_psock_tls_verdict_apply(struct sk_buff *skb, struct sock *sk, int verdict) ··· 945 842 err = sk_psock_skb_ingress_self(psock, skb); 946 843 } 947 844 if (err < 0) { 948 - skb_queue_tail(&psock->ingress_skb, skb); 949 - schedule_work(&psock->work); 845 + spin_lock_bh(&psock->ingress_lock); 846 + if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) { 847 + skb_queue_tail(&psock->ingress_skb, skb); 848 + schedule_work(&psock->work); 849 + } 850 + spin_unlock_bh(&psock->ingress_lock); 950 851 } 951 852 break; 952 853 case __SK_REDIRECT: ··· 1117 1010 } 1118 1011 skb_set_owner_r(skb, sk); 1119 1012 prog = READ_ONCE(psock->progs.stream_verdict); 1013 + if (!prog) 1014 + prog = READ_ONCE(psock->progs.skb_verdict); 1120 1015 if (likely(prog)) { 1121 1016 skb_dst_drop(skb); 1122 1017 skb_bpf_redirect_clear(skb);
+55 -63
net/core/sock_map.c
··· 26 26 27 27 static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, 28 28 struct bpf_prog *old, u32 which); 29 + static struct sk_psock_progs *sock_map_progs(struct bpf_map *map); 29 30 30 31 static struct bpf_map *sock_map_alloc(union bpf_attr *attr) 31 32 { ··· 156 155 strp_stop = true; 157 156 if (psock->saved_data_ready && stab->progs.stream_verdict) 158 157 verdict_stop = true; 158 + if (psock->saved_data_ready && stab->progs.skb_verdict) 159 + verdict_stop = true; 159 160 list_del(&link->list); 160 161 sk_psock_free_link(link); 161 162 } ··· 185 182 186 183 static int sock_map_init_proto(struct sock *sk, struct sk_psock *psock) 187 184 { 188 - struct proto *prot; 189 - 190 - switch (sk->sk_type) { 191 - case SOCK_STREAM: 192 - prot = tcp_bpf_get_proto(sk, psock); 193 - break; 194 - 195 - case SOCK_DGRAM: 196 - prot = udp_bpf_get_proto(sk, psock); 197 - break; 198 - 199 - default: 185 + if (!sk->sk_prot->psock_update_sk_prot) 200 186 return -EINVAL; 201 - } 202 - 203 - if (IS_ERR(prot)) 204 - return PTR_ERR(prot); 205 - 206 - sk_psock_update_proto(sk, psock, prot); 207 - return 0; 187 + psock->psock_update_sk_prot = sk->sk_prot->psock_update_sk_prot; 188 + return sk->sk_prot->psock_update_sk_prot(sk, false); 208 189 } 209 190 210 191 static struct sk_psock *sock_map_psock_get_checked(struct sock *sk) ··· 211 224 return psock; 212 225 } 213 226 214 - static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs, 215 - struct sock *sk) 227 + static bool sock_map_redirect_allowed(const struct sock *sk); 228 + 229 + static int sock_map_link(struct bpf_map *map, struct sock *sk) 216 230 { 217 - struct bpf_prog *msg_parser, *stream_parser, *stream_verdict; 231 + struct sk_psock_progs *progs = sock_map_progs(map); 232 + struct bpf_prog *stream_verdict = NULL; 233 + struct bpf_prog *stream_parser = NULL; 234 + struct bpf_prog *skb_verdict = NULL; 235 + struct bpf_prog *msg_parser = NULL; 218 236 struct sk_psock *psock; 219 237 int 
ret; 238 + 239 + /* Only sockets we can redirect into/from in BPF need to hold 240 + * refs to parser/verdict progs and have their sk_data_ready 241 + * and sk_write_space callbacks overridden. 242 + */ 243 + if (!sock_map_redirect_allowed(sk)) 244 + goto no_progs; 220 245 221 246 stream_verdict = READ_ONCE(progs->stream_verdict); 222 247 if (stream_verdict) { ··· 255 256 } 256 257 } 257 258 259 + skb_verdict = READ_ONCE(progs->skb_verdict); 260 + if (skb_verdict) { 261 + skb_verdict = bpf_prog_inc_not_zero(skb_verdict); 262 + if (IS_ERR(skb_verdict)) { 263 + ret = PTR_ERR(skb_verdict); 264 + goto out_put_msg_parser; 265 + } 266 + } 267 + 268 + no_progs: 258 269 psock = sock_map_psock_get_checked(sk); 259 270 if (IS_ERR(psock)) { 260 271 ret = PTR_ERR(psock); ··· 274 265 if (psock) { 275 266 if ((msg_parser && READ_ONCE(psock->progs.msg_parser)) || 276 267 (stream_parser && READ_ONCE(psock->progs.stream_parser)) || 268 + (skb_verdict && READ_ONCE(psock->progs.skb_verdict)) || 269 + (skb_verdict && READ_ONCE(psock->progs.stream_verdict)) || 270 + (stream_verdict && READ_ONCE(psock->progs.skb_verdict)) || 277 271 (stream_verdict && READ_ONCE(psock->progs.stream_verdict))) { 278 272 sk_psock_put(sk, psock); 279 273 ret = -EBUSY; ··· 308 296 } else if (!stream_parser && stream_verdict && !psock->saved_data_ready) { 309 297 psock_set_prog(&psock->progs.stream_verdict, stream_verdict); 310 298 sk_psock_start_verdict(sk,psock); 299 + } else if (!stream_verdict && skb_verdict && !psock->saved_data_ready) { 300 + psock_set_prog(&psock->progs.skb_verdict, skb_verdict); 301 + sk_psock_start_verdict(sk, psock); 311 302 } 312 303 write_unlock_bh(&sk->sk_callback_lock); 313 304 return 0; ··· 319 304 out_drop: 320 305 sk_psock_put(sk, psock); 321 306 out_progs: 307 + if (skb_verdict) 308 + bpf_prog_put(skb_verdict); 309 + out_put_msg_parser: 322 310 if (msg_parser) 323 311 bpf_prog_put(msg_parser); 324 312 out_put_stream_parser: ··· 330 312 out_put_stream_verdict: 331 313 if 
(stream_verdict) 332 314 bpf_prog_put(stream_verdict); 333 - return ret; 334 - } 335 - 336 - static int sock_map_link_no_progs(struct bpf_map *map, struct sock *sk) 337 - { 338 - struct sk_psock *psock; 339 - int ret; 340 - 341 - psock = sock_map_psock_get_checked(sk); 342 - if (IS_ERR(psock)) 343 - return PTR_ERR(psock); 344 - 345 - if (!psock) { 346 - psock = sk_psock_init(sk, map->numa_node); 347 - if (IS_ERR(psock)) 348 - return PTR_ERR(psock); 349 - } 350 - 351 - ret = sock_map_init_proto(sk, psock); 352 - if (ret < 0) 353 - sk_psock_put(sk, psock); 354 315 return ret; 355 316 } 356 317 ··· 463 466 return 0; 464 467 } 465 468 466 - static bool sock_map_redirect_allowed(const struct sock *sk); 467 - 468 469 static int sock_map_update_common(struct bpf_map *map, u32 idx, 469 470 struct sock *sk, u64 flags) 470 471 { ··· 482 487 if (!link) 483 488 return -ENOMEM; 484 489 485 - /* Only sockets we can redirect into/from in BPF need to hold 486 - * refs to parser/verdict progs and have their sk_data_ready 487 - * and sk_write_space callbacks overridden. 
488 - */ 489 - if (sock_map_redirect_allowed(sk)) 490 - ret = sock_map_link(map, &stab->progs, sk); 491 - else 492 - ret = sock_map_link_no_progs(map, sk); 490 + ret = sock_map_link(map, sk); 493 491 if (ret < 0) 494 492 goto out_free; 495 493 ··· 535 547 536 548 static bool sock_map_redirect_allowed(const struct sock *sk) 537 549 { 538 - return sk_is_tcp(sk) && sk->sk_state != TCP_LISTEN; 550 + if (sk_is_tcp(sk)) 551 + return sk->sk_state != TCP_LISTEN; 552 + else 553 + return sk->sk_state == TCP_ESTABLISHED; 539 554 } 540 555 541 556 static bool sock_map_sk_is_suitable(const struct sock *sk) 542 557 { 543 - return sk_is_tcp(sk) || sk_is_udp(sk); 558 + return !!sk->sk_prot->psock_update_sk_prot; 544 559 } 545 560 546 561 static bool sock_map_sk_state_allowed(const struct sock *sk) ··· 990 999 if (!link) 991 1000 return -ENOMEM; 992 1001 993 - /* Only sockets we can redirect into/from in BPF need to hold 994 - * refs to parser/verdict progs and have their sk_data_ready 995 - * and sk_write_space callbacks overridden. 996 - */ 997 - if (sock_map_redirect_allowed(sk)) 998 - ret = sock_map_link(map, &htab->progs, sk); 999 - else 1000 - ret = sock_map_link_no_progs(map, sk); 1002 + ret = sock_map_link(map, sk); 1001 1003 if (ret < 0) 1002 1004 goto out_free; 1003 1005 ··· 1450 1466 break; 1451 1467 #endif 1452 1468 case BPF_SK_SKB_STREAM_VERDICT: 1469 + if (progs->skb_verdict) 1470 + return -EBUSY; 1453 1471 pprog = &progs->stream_verdict; 1472 + break; 1473 + case BPF_SK_SKB_VERDICT: 1474 + if (progs->stream_verdict) 1475 + return -EBUSY; 1476 + pprog = &progs->skb_verdict; 1454 1477 break; 1455 1478 default: 1456 1479 return -EOPNOTSUPP; ··· 1531 1540 saved_close = psock->saved_close; 1532 1541 sock_map_remove_links(sk, psock); 1533 1542 rcu_read_unlock(); 1543 + sk_psock_stop(psock, true); 1534 1544 release_sock(sk); 1535 1545 saved_close(sk, timeout); 1536 1546 }
+1
net/ipv4/af_inet.c
··· 1070 1070 .setsockopt = sock_common_setsockopt, 1071 1071 .getsockopt = sock_common_getsockopt, 1072 1072 .sendmsg = inet_sendmsg, 1073 + .read_sock = udp_read_sock, 1073 1074 .recvmsg = inet_recvmsg, 1074 1075 .mmap = sock_no_mmap, 1075 1076 .sendpage = inet_sendpage,
+43
net/ipv4/bpf_tcp_ca.c
··· 5 5 #include <linux/bpf_verifier.h> 6 6 #include <linux/bpf.h> 7 7 #include <linux/btf.h> 8 + #include <linux/btf_ids.h> 8 9 #include <linux/filter.h> 9 10 #include <net/tcp.h> 10 11 #include <net/bpf_sk_storage.h> ··· 179 178 } 180 179 } 181 180 181 + BTF_SET_START(bpf_tcp_ca_kfunc_ids) 182 + BTF_ID(func, tcp_reno_ssthresh) 183 + BTF_ID(func, tcp_reno_cong_avoid) 184 + BTF_ID(func, tcp_reno_undo_cwnd) 185 + BTF_ID(func, tcp_slow_start) 186 + BTF_ID(func, tcp_cong_avoid_ai) 187 + #ifdef CONFIG_DYNAMIC_FTRACE 188 + #if IS_BUILTIN(CONFIG_TCP_CONG_CUBIC) 189 + BTF_ID(func, cubictcp_init) 190 + BTF_ID(func, cubictcp_recalc_ssthresh) 191 + BTF_ID(func, cubictcp_cong_avoid) 192 + BTF_ID(func, cubictcp_state) 193 + BTF_ID(func, cubictcp_cwnd_event) 194 + BTF_ID(func, cubictcp_acked) 195 + #endif 196 + #if IS_BUILTIN(CONFIG_TCP_CONG_DCTCP) 197 + BTF_ID(func, dctcp_init) 198 + BTF_ID(func, dctcp_update_alpha) 199 + BTF_ID(func, dctcp_cwnd_event) 200 + BTF_ID(func, dctcp_ssthresh) 201 + BTF_ID(func, dctcp_cwnd_undo) 202 + BTF_ID(func, dctcp_state) 203 + #endif 204 + #if IS_BUILTIN(CONFIG_TCP_CONG_BBR) 205 + BTF_ID(func, bbr_init) 206 + BTF_ID(func, bbr_main) 207 + BTF_ID(func, bbr_sndbuf_expand) 208 + BTF_ID(func, bbr_undo_cwnd) 209 + BTF_ID(func, bbr_cwnd_event) 210 + BTF_ID(func, bbr_ssthresh) 211 + BTF_ID(func, bbr_min_tso_segs) 212 + BTF_ID(func, bbr_set_state) 213 + #endif 214 + #endif /* CONFIG_DYNAMIC_FTRACE */ 215 + BTF_SET_END(bpf_tcp_ca_kfunc_ids) 216 + 217 + static bool bpf_tcp_ca_check_kfunc_call(u32 kfunc_btf_id) 218 + { 219 + return btf_id_set_contains(&bpf_tcp_ca_kfunc_ids, kfunc_btf_id); 220 + } 221 + 182 222 static const struct bpf_verifier_ops bpf_tcp_ca_verifier_ops = { 183 223 .get_func_proto = bpf_tcp_ca_get_func_proto, 184 224 .is_valid_access = bpf_tcp_ca_is_valid_access, 185 225 .btf_struct_access = bpf_tcp_ca_btf_struct_access, 226 + .check_kfunc_call = bpf_tcp_ca_check_kfunc_call, 186 227 }; 187 228 188 229 static int 
bpf_tcp_ca_init_member(const struct btf_type *t,
+23 -107
net/ipv4/tcp_bpf.c
··· 10 10 #include <net/inet_common.h> 11 11 #include <net/tls.h> 12 12 13 - int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock, 14 - struct msghdr *msg, int len, int flags) 15 - { 16 - struct iov_iter *iter = &msg->msg_iter; 17 - int peek = flags & MSG_PEEK; 18 - struct sk_msg *msg_rx; 19 - int i, copied = 0; 20 - 21 - msg_rx = list_first_entry_or_null(&psock->ingress_msg, 22 - struct sk_msg, list); 23 - 24 - while (copied != len) { 25 - struct scatterlist *sge; 26 - 27 - if (unlikely(!msg_rx)) 28 - break; 29 - 30 - i = msg_rx->sg.start; 31 - do { 32 - struct page *page; 33 - int copy; 34 - 35 - sge = sk_msg_elem(msg_rx, i); 36 - copy = sge->length; 37 - page = sg_page(sge); 38 - if (copied + copy > len) 39 - copy = len - copied; 40 - copy = copy_page_to_iter(page, sge->offset, copy, iter); 41 - if (!copy) 42 - return copied ? copied : -EFAULT; 43 - 44 - copied += copy; 45 - if (likely(!peek)) { 46 - sge->offset += copy; 47 - sge->length -= copy; 48 - if (!msg_rx->skb) 49 - sk_mem_uncharge(sk, copy); 50 - msg_rx->sg.size -= copy; 51 - 52 - if (!sge->length) { 53 - sk_msg_iter_var_next(i); 54 - if (!msg_rx->skb) 55 - put_page(page); 56 - } 57 - } else { 58 - /* Lets not optimize peek case if copy_page_to_iter 59 - * didn't copy the entire length lets just break. 
60 - */ 61 - if (copy != sge->length) 62 - return copied; 63 - sk_msg_iter_var_next(i); 64 - } 65 - 66 - if (copied == len) 67 - break; 68 - } while (i != msg_rx->sg.end); 69 - 70 - if (unlikely(peek)) { 71 - if (msg_rx == list_last_entry(&psock->ingress_msg, 72 - struct sk_msg, list)) 73 - break; 74 - msg_rx = list_next_entry(msg_rx, list); 75 - continue; 76 - } 77 - 78 - msg_rx->sg.start = i; 79 - if (!sge->length && msg_rx->sg.start == msg_rx->sg.end) { 80 - list_del(&msg_rx->list); 81 - if (msg_rx->skb) 82 - consume_skb(msg_rx->skb); 83 - kfree(msg_rx); 84 - } 85 - msg_rx = list_first_entry_or_null(&psock->ingress_msg, 86 - struct sk_msg, list); 87 - } 88 - 89 - return copied; 90 - } 91 - EXPORT_SYMBOL_GPL(__tcp_bpf_recvmsg); 92 - 93 13 static int bpf_tcp_ingress(struct sock *sk, struct sk_psock *psock, 94 14 struct sk_msg *msg, u32 apply_bytes, int flags) 95 15 { ··· 163 243 return !empty; 164 244 } 165 245 166 - static int tcp_bpf_wait_data(struct sock *sk, struct sk_psock *psock, 167 - int flags, long timeo, int *err) 168 - { 169 - DEFINE_WAIT_FUNC(wait, woken_wake_function); 170 - int ret = 0; 171 - 172 - if (sk->sk_shutdown & RCV_SHUTDOWN) 173 - return 1; 174 - 175 - if (!timeo) 176 - return ret; 177 - 178 - add_wait_queue(sk_sleep(sk), &wait); 179 - sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); 180 - ret = sk_wait_event(sk, &timeo, 181 - !list_empty(&psock->ingress_msg) || 182 - !skb_queue_empty(&sk->sk_receive_queue), &wait); 183 - sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); 184 - remove_wait_queue(sk_sleep(sk), &wait); 185 - return ret; 186 - } 187 - 188 246 static int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, 189 247 int nonblock, int flags, int *addr_len) 190 248 { ··· 182 284 } 183 285 lock_sock(sk); 184 286 msg_bytes_ready: 185 - copied = __tcp_bpf_recvmsg(sk, psock, msg, len, flags); 287 + copied = sk_msg_recvmsg(sk, psock, msg, len, flags); 186 288 if (!copied) { 187 289 int data, err = 0; 188 290 long timeo; 189 291 190 292 timeo = 
sock_rcvtimeo(sk, nonblock); 191 - data = tcp_bpf_wait_data(sk, psock, flags, timeo, &err); 293 + data = sk_msg_wait_data(sk, psock, flags, timeo, &err); 192 294 if (data) { 193 295 if (!sk_psock_queue_empty(psock)) 194 296 goto msg_bytes_ready; ··· 499 601 ops->sendpage == tcp_sendpage ? 0 : -ENOTSUPP; 500 602 } 501 603 502 - struct proto *tcp_bpf_get_proto(struct sock *sk, struct sk_psock *psock) 604 + int tcp_bpf_update_proto(struct sock *sk, bool restore) 503 605 { 606 + struct sk_psock *psock = sk_psock(sk); 504 607 int family = sk->sk_family == AF_INET6 ? TCP_BPF_IPV6 : TCP_BPF_IPV4; 505 608 int config = psock->progs.msg_parser ? TCP_BPF_TX : TCP_BPF_BASE; 506 609 610 + if (restore) { 611 + if (inet_csk_has_ulp(sk)) { 612 + tcp_update_ulp(sk, psock->sk_proto, psock->saved_write_space); 613 + } else { 614 + sk->sk_write_space = psock->saved_write_space; 615 + /* Pairs with lockless read in sk_clone_lock() */ 616 + WRITE_ONCE(sk->sk_prot, psock->sk_proto); 617 + } 618 + return 0; 619 + } 620 + 621 + if (inet_csk_has_ulp(sk)) 622 + return -EINVAL; 623 + 507 624 if (sk->sk_family == AF_INET6) { 508 625 if (tcp_bpf_assert_proto_ops(psock->sk_proto)) 509 - return ERR_PTR(-EINVAL); 626 + return -EINVAL; 510 627 511 628 tcp_bpf_check_v6_needs_rebuild(psock->sk_proto); 512 629 } 513 630 514 - return &tcp_bpf_prots[family][config]; 631 + /* Pairs with lockless read in sk_clone_lock() */ 632 + WRITE_ONCE(sk->sk_prot, &tcp_bpf_prots[family][config]); 633 + return 0; 515 634 } 635 + EXPORT_SYMBOL_GPL(tcp_bpf_update_proto); 516 636 517 637 /* If a child got cloned from a listening socket that had tcp_bpf 518 638 * protocol callbacks installed, we need to restore the callbacks to
+12 -12
net/ipv4/tcp_cubic.c
··· 124 124 ca->sample_cnt = 0; 125 125 } 126 126 127 - static void bictcp_init(struct sock *sk) 127 + static void cubictcp_init(struct sock *sk) 128 128 { 129 129 struct bictcp *ca = inet_csk_ca(sk); 130 130 ··· 137 137 tcp_sk(sk)->snd_ssthresh = initial_ssthresh; 138 138 } 139 139 140 - static void bictcp_cwnd_event(struct sock *sk, enum tcp_ca_event event) 140 + static void cubictcp_cwnd_event(struct sock *sk, enum tcp_ca_event event) 141 141 { 142 142 if (event == CA_EVENT_TX_START) { 143 143 struct bictcp *ca = inet_csk_ca(sk); ··· 319 319 ca->cnt = max(ca->cnt, 2U); 320 320 } 321 321 322 - static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked) 322 + static void cubictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked) 323 323 { 324 324 struct tcp_sock *tp = tcp_sk(sk); 325 325 struct bictcp *ca = inet_csk_ca(sk); ··· 338 338 tcp_cong_avoid_ai(tp, ca->cnt, acked); 339 339 } 340 340 341 - static u32 bictcp_recalc_ssthresh(struct sock *sk) 341 + static u32 cubictcp_recalc_ssthresh(struct sock *sk) 342 342 { 343 343 const struct tcp_sock *tp = tcp_sk(sk); 344 344 struct bictcp *ca = inet_csk_ca(sk); ··· 355 355 return max((tp->snd_cwnd * beta) / BICTCP_BETA_SCALE, 2U); 356 356 } 357 357 358 - static void bictcp_state(struct sock *sk, u8 new_state) 358 + static void cubictcp_state(struct sock *sk, u8 new_state) 359 359 { 360 360 if (new_state == TCP_CA_Loss) { 361 361 bictcp_reset(inet_csk_ca(sk)); ··· 442 442 } 443 443 } 444 444 445 - static void bictcp_acked(struct sock *sk, const struct ack_sample *sample) 445 + static void cubictcp_acked(struct sock *sk, const struct ack_sample *sample) 446 446 { 447 447 const struct tcp_sock *tp = tcp_sk(sk); 448 448 struct bictcp *ca = inet_csk_ca(sk); ··· 471 471 } 472 472 473 473 static struct tcp_congestion_ops cubictcp __read_mostly = { 474 - .init = bictcp_init, 475 - .ssthresh = bictcp_recalc_ssthresh, 476 - .cong_avoid = bictcp_cong_avoid, 477 - .set_state = bictcp_state, 474 + .init = cubictcp_init, 475 + 
.ssthresh = cubictcp_recalc_ssthresh, 476 + .cong_avoid = cubictcp_cong_avoid, 477 + .set_state = cubictcp_state, 478 478 .undo_cwnd = tcp_reno_undo_cwnd, 479 - .cwnd_event = bictcp_cwnd_event, 480 - .pkts_acked = bictcp_acked, 479 + .cwnd_event = cubictcp_cwnd_event, 480 + .pkts_acked = cubictcp_acked, 481 481 .owner = THIS_MODULE, 482 482 .name = "cubic", 483 483 };
+3
net/ipv4/tcp_ipv4.c
··· 2806 2806 .hash = inet_hash, 2807 2807 .unhash = inet_unhash, 2808 2808 .get_port = inet_csk_get_port, 2809 + #ifdef CONFIG_BPF_SYSCALL 2810 + .psock_update_sk_prot = tcp_bpf_update_proto, 2811 + #endif 2809 2812 .enter_memory_pressure = tcp_enter_memory_pressure, 2810 2813 .leave_memory_pressure = tcp_leave_memory_pressure, 2811 2814 .stream_memory_free = tcp_stream_memory_free,
+32
net/ipv4/udp.c
··· 1782 1782 } 1783 1783 EXPORT_SYMBOL(__skb_recv_udp); 1784 1784 1785 + int udp_read_sock(struct sock *sk, read_descriptor_t *desc, 1786 + sk_read_actor_t recv_actor) 1787 + { 1788 + int copied = 0; 1789 + 1790 + while (1) { 1791 + struct sk_buff *skb; 1792 + int err, used; 1793 + 1794 + skb = skb_recv_udp(sk, 0, 1, &err); 1795 + if (!skb) 1796 + return err; 1797 + used = recv_actor(desc, skb, 0, skb->len); 1798 + if (used <= 0) { 1799 + if (!copied) 1800 + copied = used; 1801 + break; 1802 + } else if (used <= skb->len) { 1803 + copied += used; 1804 + } 1805 + 1806 + if (!desc->count) 1807 + break; 1808 + } 1809 + 1810 + return copied; 1811 + } 1812 + EXPORT_SYMBOL(udp_read_sock); 1813 + 1785 1814 /* 1786 1815 * This should be easy, if there is something there we 1787 1816 * return it, otherwise we block. ··· 2883 2854 .unhash = udp_lib_unhash, 2884 2855 .rehash = udp_v4_rehash, 2885 2856 .get_port = udp_v4_get_port, 2857 + #ifdef CONFIG_BPF_SYSCALL 2858 + .psock_update_sk_prot = udp_bpf_update_proto, 2859 + #endif 2886 2860 .memory_allocated = &udp_memory_allocated, 2887 2861 .sysctl_mem = sysctl_udp_mem, 2888 2862 .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_udp_wmem_min),
+76 -3
net/ipv4/udp_bpf.c
··· 4 4 #include <linux/skmsg.h> 5 5 #include <net/sock.h> 6 6 #include <net/udp.h> 7 + #include <net/inet_common.h> 8 + 9 + #include "udp_impl.h" 10 + 11 + static struct proto *udpv6_prot_saved __read_mostly; 12 + 13 + static int sk_udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, 14 + int noblock, int flags, int *addr_len) 15 + { 16 + #if IS_ENABLED(CONFIG_IPV6) 17 + if (sk->sk_family == AF_INET6) 18 + return udpv6_prot_saved->recvmsg(sk, msg, len, noblock, flags, 19 + addr_len); 20 + #endif 21 + return udp_prot.recvmsg(sk, msg, len, noblock, flags, addr_len); 22 + } 23 + 24 + static int udp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, 25 + int nonblock, int flags, int *addr_len) 26 + { 27 + struct sk_psock *psock; 28 + int copied, ret; 29 + 30 + if (unlikely(flags & MSG_ERRQUEUE)) 31 + return inet_recv_error(sk, msg, len, addr_len); 32 + 33 + psock = sk_psock_get(sk); 34 + if (unlikely(!psock)) 35 + return sk_udp_recvmsg(sk, msg, len, nonblock, flags, addr_len); 36 + 37 + lock_sock(sk); 38 + if (sk_psock_queue_empty(psock)) { 39 + ret = sk_udp_recvmsg(sk, msg, len, nonblock, flags, addr_len); 40 + goto out; 41 + } 42 + 43 + msg_bytes_ready: 44 + copied = sk_msg_recvmsg(sk, psock, msg, len, flags); 45 + if (!copied) { 46 + int data, err = 0; 47 + long timeo; 48 + 49 + timeo = sock_rcvtimeo(sk, nonblock); 50 + data = sk_msg_wait_data(sk, psock, flags, timeo, &err); 51 + if (data) { 52 + if (!sk_psock_queue_empty(psock)) 53 + goto msg_bytes_ready; 54 + ret = sk_udp_recvmsg(sk, msg, len, nonblock, flags, addr_len); 55 + goto out; 56 + } 57 + if (err) { 58 + ret = err; 59 + goto out; 60 + } 61 + copied = -EAGAIN; 62 + } 63 + ret = copied; 64 + out: 65 + release_sock(sk); 66 + sk_psock_put(sk, psock); 67 + return ret; 68 + } 7 69 8 70 enum { 9 71 UDP_BPF_IPV4, ··· 73 11 UDP_BPF_NUM_PROTS, 74 12 }; 75 13 76 - static struct proto *udpv6_prot_saved __read_mostly; 77 14 static DEFINE_SPINLOCK(udpv6_prot_lock); 78 15 static struct proto 
udp_bpf_prots[UDP_BPF_NUM_PROTS]; 79 16 ··· 81 20 *prot = *base; 82 21 prot->unhash = sock_map_unhash; 83 22 prot->close = sock_map_close; 23 + prot->recvmsg = udp_bpf_recvmsg; 84 24 } 85 25 86 26 static void udp_bpf_check_v6_needs_rebuild(struct proto *ops) ··· 103 41 } 104 42 core_initcall(udp_bpf_v4_build_proto); 105 43 106 - struct proto *udp_bpf_get_proto(struct sock *sk, struct sk_psock *psock) 44 + int udp_bpf_update_proto(struct sock *sk, bool restore) 107 45 { 108 46 int family = sk->sk_family == AF_INET ? UDP_BPF_IPV4 : UDP_BPF_IPV6; 47 + struct sk_psock *psock = sk_psock(sk); 48 + 49 + if (restore) { 50 + sk->sk_write_space = psock->saved_write_space; 51 + /* Pairs with lockless read in sk_clone_lock() */ 52 + WRITE_ONCE(sk->sk_prot, psock->sk_proto); 53 + return 0; 54 + } 109 55 110 56 if (sk->sk_family == AF_INET6) 111 57 udp_bpf_check_v6_needs_rebuild(psock->sk_proto); 112 58 113 - return &udp_bpf_prots[family]; 59 + /* Pairs with lockless read in sk_clone_lock() */ 60 + WRITE_ONCE(sk->sk_prot, &udp_bpf_prots[family]); 61 + return 0; 114 62 } 63 + EXPORT_SYMBOL_GPL(udp_bpf_update_proto);
+1
net/ipv6/af_inet6.c
··· 714 714 .getsockopt = sock_common_getsockopt, /* ok */ 715 715 .sendmsg = inet6_sendmsg, /* retpoline's sake */ 716 716 .recvmsg = inet6_recvmsg, /* retpoline's sake */ 717 + .read_sock = udp_read_sock, 717 718 .mmap = sock_no_mmap, 718 719 .sendpage = sock_no_sendpage, 719 720 .set_peek_off = sk_set_peek_off,
+3
net/ipv6/tcp_ipv6.c
··· 2139 2139 .hash = inet6_hash, 2140 2140 .unhash = inet_unhash, 2141 2141 .get_port = inet_csk_get_port, 2142 + #ifdef CONFIG_BPF_SYSCALL 2143 + .psock_update_sk_prot = tcp_bpf_update_proto, 2144 + #endif 2142 2145 .enter_memory_pressure = tcp_enter_memory_pressure, 2143 2146 .leave_memory_pressure = tcp_leave_memory_pressure, 2144 2147 .stream_memory_free = tcp_stream_memory_free,
+3
net/ipv6/udp.c
··· 1714 1714 .unhash = udp_lib_unhash, 1715 1715 .rehash = udp_v6_rehash, 1716 1716 .get_port = udp_v6_get_port, 1717 + #ifdef CONFIG_BPF_SYSCALL 1718 + .psock_update_sk_prot = udp_bpf_update_proto, 1719 + #endif 1717 1720 .memory_allocated = &udp_memory_allocated, 1718 1721 .sysctl_mem = sysctl_udp_mem, 1719 1722 .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_udp_wmem_min),
+2 -2
net/tls/tls_sw.c
··· 1789 1789 skb = tls_wait_data(sk, psock, flags, timeo, &err); 1790 1790 if (!skb) { 1791 1791 if (psock) { 1792 - int ret = __tcp_bpf_recvmsg(sk, psock, 1793 - msg, len, flags); 1792 + int ret = sk_msg_recvmsg(sk, psock, msg, len, 1793 + flags); 1794 1794 1795 1795 if (ret > 0) { 1796 1796 decrypted += ret;
-1
samples/bpf/sampleip_kern.c
··· 4 4 * modify it under the terms of version 2 of the GNU General Public 5 5 * License as published by the Free Software Foundation. 6 6 */ 7 - #include <linux/version.h> 8 7 #include <linux/ptrace.h> 9 8 #include <uapi/linux/bpf.h> 10 9 #include <uapi/linux/bpf_perf_event.h>
-1
samples/bpf/trace_event_kern.c
··· 5 5 * License as published by the Free Software Foundation. 6 6 */ 7 7 #include <linux/ptrace.h> 8 - #include <linux/version.h> 9 8 #include <uapi/linux/bpf.h> 10 9 #include <uapi/linux/bpf_perf_event.h> 11 10 #include <uapi/linux/perf_event.h>
+18 -45
samples/bpf/xdpsock_user.c
··· 96 96 static int opt_timeout = 1000; 97 97 static bool opt_need_wakeup = true; 98 98 static u32 opt_num_xsks = 1; 99 - static u32 prog_id; 100 99 static bool opt_busy_poll; 101 100 static bool opt_reduced_cap; 102 101 ··· 461 462 return NULL; 462 463 } 463 464 464 - static void remove_xdp_program(void) 465 - { 466 - u32 curr_prog_id = 0; 467 - int cmd = CLOSE_CONN; 468 - 469 - if (bpf_get_link_xdp_id(opt_ifindex, &curr_prog_id, opt_xdp_flags)) { 470 - printf("bpf_get_link_xdp_id failed\n"); 471 - exit(EXIT_FAILURE); 472 - } 473 - if (prog_id == curr_prog_id) 474 - bpf_set_link_xdp_fd(opt_ifindex, -1, opt_xdp_flags); 475 - else if (!curr_prog_id) 476 - printf("couldn't find a prog id on a given interface\n"); 477 - else 478 - printf("program on interface changed, not removing\n"); 479 - 480 - if (opt_reduced_cap) { 481 - if (write(sock, &cmd, sizeof(int)) < 0) { 482 - fprintf(stderr, "Error writing into stream socket: %s", strerror(errno)); 483 - exit(EXIT_FAILURE); 484 - } 485 - } 486 - } 487 - 488 465 static void int_exit(int sig) 489 466 { 490 467 benchmark_done = true; 491 - } 492 - 493 - static void xdpsock_cleanup(void) 494 - { 495 - struct xsk_umem *umem = xsks[0]->umem->umem; 496 - int i; 497 - 498 - dump_stats(); 499 - for (i = 0; i < num_socks; i++) 500 - xsk_socket__delete(xsks[i]->xsk); 501 - (void)xsk_umem__delete(umem); 502 - remove_xdp_program(); 503 468 } 504 469 505 470 static void __exit_with_error(int error, const char *file, const char *func, ··· 471 508 { 472 509 fprintf(stderr, "%s:%s:%i: errno: %d/\"%s\"\n", file, func, 473 510 line, error, strerror(error)); 474 - dump_stats(); 475 - remove_xdp_program(); 476 511 exit(EXIT_FAILURE); 477 512 } 478 513 479 - #define exit_with_error(error) __exit_with_error(error, __FILE__, __func__, \ 480 - __LINE__) 514 + #define exit_with_error(error) __exit_with_error(error, __FILE__, __func__, __LINE__) 515 + 516 + static void xdpsock_cleanup(void) 517 + { 518 + struct xsk_umem *umem = 
xsks[0]->umem->umem; 519 + int i, cmd = CLOSE_CONN; 520 + 521 + dump_stats(); 522 + for (i = 0; i < num_socks; i++) 523 + xsk_socket__delete(xsks[i]->xsk); 524 + (void)xsk_umem__delete(umem); 525 + 526 + if (opt_reduced_cap) { 527 + if (write(sock, &cmd, sizeof(int)) < 0) 528 + exit_with_error(errno); 529 + } 530 + } 531 + 481 532 static void swap_mac_addresses(void *data) 482 533 { 483 534 struct ether_header *eth = (struct ether_header *)data; ··· 854 877 txr = tx ? &xsk->tx : NULL; 855 878 ret = xsk_socket__create(&xsk->xsk, opt_if, opt_queue, umem->umem, 856 879 rxr, txr, &cfg); 857 - if (ret) 858 - exit_with_error(-ret); 859 - 860 - ret = bpf_get_link_xdp_id(opt_ifindex, &prog_id, opt_xdp_flags); 861 880 if (ret) 862 881 exit_with_error(-ret); 863 882
+1
tools/bpf/bpftool/common.c
··· 57 57 58 58 [BPF_SK_SKB_STREAM_PARSER] = "sk_skb_stream_parser", 59 59 [BPF_SK_SKB_STREAM_VERDICT] = "sk_skb_stream_verdict", 60 + [BPF_SK_SKB_VERDICT] = "sk_skb_verdict", 60 61 [BPF_SK_MSG_VERDICT] = "sk_msg_verdict", 61 62 [BPF_LIRC_MODE2] = "lirc_mode2", 62 63 [BPF_FLOW_DISSECTOR] = "flow_dissector",
+1
tools/bpf/bpftool/prog.c
··· 76 76 static const char * const attach_type_strings[] = { 77 77 [BPF_SK_SKB_STREAM_PARSER] = "stream_parser", 78 78 [BPF_SK_SKB_STREAM_VERDICT] = "stream_verdict", 79 + [BPF_SK_SKB_VERDICT] = "skb_verdict", 79 80 [BPF_SK_MSG_VERDICT] = "msg_verdict", 80 81 [BPF_FLOW_DISSECTOR] = "flow_dissector", 81 82 [__MAX_BPF_ATTACH_TYPE] = NULL,
+5 -6
tools/bpf/resolve_btfids/main.c
··· 115 115 116 116 static int verbose; 117 117 118 - int eprintf(int level, int var, const char *fmt, ...) 118 + static int eprintf(int level, int var, const char *fmt, ...) 119 119 { 120 120 va_list args; 121 - int ret; 121 + int ret = 0; 122 122 123 123 if (var >= level) { 124 124 va_start(args, fmt); ··· 385 385 static int symbols_collect(struct object *obj) 386 386 { 387 387 Elf_Scn *scn = NULL; 388 - int n, i, err = 0; 388 + int n, i; 389 389 GElf_Shdr sh; 390 390 char *name; 391 391 ··· 402 402 * Scan symbols and look for the ones starting with 403 403 * __BTF_ID__* over .BTF_ids section. 404 404 */ 405 - for (i = 0; !err && i < n; i++) { 406 - char *tmp, *prefix; 405 + for (i = 0; i < n; i++) { 406 + char *prefix; 407 407 struct btf_id *id; 408 408 GElf_Sym sym; 409 - int err = -1; 410 409 411 410 if (!gelf_getsym(obj->efile.symbols, i, &sym)) 412 411 return -1;
+5
tools/include/uapi/linux/bpf.h
··· 957 957 BPF_XDP_CPUMAP, 958 958 BPF_SK_LOOKUP, 959 959 BPF_XDP, 960 + BPF_SK_SKB_VERDICT, 960 961 __MAX_BPF_ATTACH_TYPE 961 962 }; 962 963 ··· 1118 1117 * offset to another bpf function 1119 1118 */ 1120 1119 #define BPF_PSEUDO_CALL 1 1120 + /* when bpf_call->src_reg == BPF_PSEUDO_KFUNC_CALL, 1121 + * bpf_call->imm == btf_id of a BTF_KIND_FUNC in the running kernel 1122 + */ 1123 + #define BPF_PSEUDO_KFUNC_CALL 2 1121 1124 1122 1125 /* flags for BPF_MAP_UPDATE_ELEM command */ 1123 1126 enum {
+303 -106
tools/lib/bpf/libbpf.c
··· 185 185 RELO_LD64, 186 186 RELO_CALL, 187 187 RELO_DATA, 188 - RELO_EXTERN, 188 + RELO_EXTERN_VAR, 189 + RELO_EXTERN_FUNC, 189 190 RELO_SUBPROG_ADDR, 190 191 }; 191 192 ··· 574 573 insn->off == 0; 575 574 } 576 575 577 - static bool is_ldimm64(struct bpf_insn *insn) 576 + static bool is_ldimm64_insn(struct bpf_insn *insn) 578 577 { 579 578 return insn->code == (BPF_LD | BPF_IMM | BPF_DW); 580 579 } 581 580 581 + static bool is_call_insn(const struct bpf_insn *insn) 582 + { 583 + return insn->code == (BPF_JMP | BPF_CALL); 584 + } 585 + 582 586 static bool insn_is_pseudo_func(struct bpf_insn *insn) 583 587 { 584 - return is_ldimm64(insn) && insn->src_reg == BPF_PSEUDO_FUNC; 588 + return is_ldimm64_insn(insn) && insn->src_reg == BPF_PSEUDO_FUNC; 585 589 } 586 590 587 591 static int ··· 1927 1921 return btf_is_func_proto(t) ? t : NULL; 1928 1922 } 1929 1923 1930 - static const char *btf_kind_str(const struct btf_type *t) 1924 + static const char *__btf_kind_str(__u16 kind) 1931 1925 { 1932 - switch (btf_kind(t)) { 1926 + switch (kind) { 1933 1927 case BTF_KIND_UNKN: return "void"; 1934 1928 case BTF_KIND_INT: return "int"; 1935 1929 case BTF_KIND_PTR: return "ptr"; ··· 1949 1943 case BTF_KIND_FLOAT: return "float"; 1950 1944 default: return "unknown"; 1951 1945 } 1946 + } 1947 + 1948 + static const char *btf_kind_str(const struct btf_type *t) 1949 + { 1950 + return __btf_kind_str(btf_kind(t)); 1951 + } 1952 + 1953 + static enum btf_func_linkage btf_func_linkage(const struct btf_type *t) 1954 + { 1955 + return (enum btf_func_linkage)BTF_INFO_VLEN(t->info); 1952 1956 } 1953 1957 1954 1958 /* ··· 3025 3009 static int find_extern_btf_id(const struct btf *btf, const char *ext_name) 3026 3010 { 3027 3011 const struct btf_type *t; 3028 - const char *var_name; 3012 + const char *tname; 3029 3013 int i, n; 3030 3014 3031 3015 if (!btf) ··· 3035 3019 for (i = 1; i <= n; i++) { 3036 3020 t = btf__type_by_id(btf, i); 3037 3021 3038 - if (!btf_is_var(t)) 3022 + if 
(!btf_is_var(t) && !btf_is_func(t)) 3039 3023 continue; 3040 3024 3041 - var_name = btf__name_by_offset(btf, t->name_off); 3042 - if (strcmp(var_name, ext_name)) 3025 + tname = btf__name_by_offset(btf, t->name_off); 3026 + if (strcmp(tname, ext_name)) 3043 3027 continue; 3044 3028 3045 - if (btf_var(t)->linkage != BTF_VAR_GLOBAL_EXTERN) 3029 + if (btf_is_var(t) && 3030 + btf_var(t)->linkage != BTF_VAR_GLOBAL_EXTERN) 3031 + return -EINVAL; 3032 + 3033 + if (btf_is_func(t) && btf_func_linkage(t) != BTF_FUNC_EXTERN) 3046 3034 return -EINVAL; 3047 3035 3048 3036 return i; ··· 3159 3139 return 0; 3160 3140 } 3161 3141 3142 + static int add_dummy_ksym_var(struct btf *btf) 3143 + { 3144 + int i, int_btf_id, sec_btf_id, dummy_var_btf_id; 3145 + const struct btf_var_secinfo *vs; 3146 + const struct btf_type *sec; 3147 + 3148 + sec_btf_id = btf__find_by_name_kind(btf, KSYMS_SEC, 3149 + BTF_KIND_DATASEC); 3150 + if (sec_btf_id < 0) 3151 + return 0; 3152 + 3153 + sec = btf__type_by_id(btf, sec_btf_id); 3154 + vs = btf_var_secinfos(sec); 3155 + for (i = 0; i < btf_vlen(sec); i++, vs++) { 3156 + const struct btf_type *vt; 3157 + 3158 + vt = btf__type_by_id(btf, vs->type); 3159 + if (btf_is_func(vt)) 3160 + break; 3161 + } 3162 + 3163 + /* No func in ksyms sec. No need to add dummy var. 
*/ 3164 + if (i == btf_vlen(sec)) 3165 + return 0; 3166 + 3167 + int_btf_id = find_int_btf_id(btf); 3168 + dummy_var_btf_id = btf__add_var(btf, 3169 + "dummy_ksym", 3170 + BTF_VAR_GLOBAL_ALLOCATED, 3171 + int_btf_id); 3172 + if (dummy_var_btf_id < 0) 3173 + pr_warn("cannot create a dummy_ksym var\n"); 3174 + 3175 + return dummy_var_btf_id; 3176 + } 3177 + 3162 3178 static int bpf_object__collect_externs(struct bpf_object *obj) 3163 3179 { 3164 3180 struct btf_type *sec, *kcfg_sec = NULL, *ksym_sec = NULL; 3165 3181 const struct btf_type *t; 3166 3182 struct extern_desc *ext; 3167 - int i, n, off; 3183 + int i, n, off, dummy_var_btf_id; 3168 3184 const char *ext_name, *sec_name; 3169 3185 Elf_Scn *scn; 3170 3186 GElf_Shdr sh; ··· 3211 3155 scn = elf_sec_by_idx(obj, obj->efile.symbols_shndx); 3212 3156 if (elf_sec_hdr(obj, scn, &sh)) 3213 3157 return -LIBBPF_ERRNO__FORMAT; 3158 + 3159 + dummy_var_btf_id = add_dummy_ksym_var(obj->btf); 3160 + if (dummy_var_btf_id < 0) 3161 + return dummy_var_btf_id; 3214 3162 3215 3163 n = sh.sh_size / sh.sh_entsize; 3216 3164 pr_debug("looking for externs among %d symbols...\n", n); ··· 3260 3200 sec_name = btf__name_by_offset(obj->btf, sec->name_off); 3261 3201 3262 3202 if (strcmp(sec_name, KCONFIG_SEC) == 0) { 3203 + if (btf_is_func(t)) { 3204 + pr_warn("extern function %s is unsupported under %s section\n", 3205 + ext->name, KCONFIG_SEC); 3206 + return -ENOTSUP; 3207 + } 3263 3208 kcfg_sec = sec; 3264 3209 ext->type = EXT_KCFG; 3265 3210 ext->kcfg.sz = btf__resolve_size(obj->btf, t->type); ··· 3286 3221 return -ENOTSUP; 3287 3222 } 3288 3223 } else if (strcmp(sec_name, KSYMS_SEC) == 0) { 3224 + if (btf_is_func(t) && ext->is_weak) { 3225 + pr_warn("extern weak function %s is unsupported\n", 3226 + ext->name); 3227 + return -ENOTSUP; 3228 + } 3289 3229 ksym_sec = sec; 3290 3230 ext->type = EXT_KSYM; 3291 3231 skip_mods_and_typedefs(obj->btf, t->type, ··· 3317 3247 * extern variables in DATASEC 3318 3248 */ 3319 3249 int int_btf_id 
= find_int_btf_id(obj->btf); 3250 + /* For extern function, a dummy_var added earlier 3251 + * will be used to replace the vs->type and 3252 + * its name string will be used to refill 3253 + * the missing param's name. 3254 + */ 3255 + const struct btf_type *dummy_var; 3320 3256 3257 + dummy_var = btf__type_by_id(obj->btf, dummy_var_btf_id); 3321 3258 for (i = 0; i < obj->nr_extern; i++) { 3322 3259 ext = &obj->externs[i]; 3323 3260 if (ext->type != EXT_KSYM) ··· 3343 3266 ext_name = btf__name_by_offset(obj->btf, vt->name_off); 3344 3267 ext = find_extern_by_name(obj, ext_name); 3345 3268 if (!ext) { 3346 - pr_warn("failed to find extern definition for BTF var '%s'\n", 3347 - ext_name); 3269 + pr_warn("failed to find extern definition for BTF %s '%s'\n", 3270 + btf_kind_str(vt), ext_name); 3348 3271 return -ESRCH; 3349 3272 } 3350 - btf_var(vt)->linkage = BTF_VAR_GLOBAL_ALLOCATED; 3351 - vt->type = int_btf_id; 3273 + if (btf_is_func(vt)) { 3274 + const struct btf_type *func_proto; 3275 + struct btf_param *param; 3276 + int j; 3277 + 3278 + func_proto = btf__type_by_id(obj->btf, 3279 + vt->type); 3280 + param = btf_params(func_proto); 3281 + /* Reuse the dummy_var string if the 3282 + * func proto does not have param name. 
3283 + */ 3284 + for (j = 0; j < btf_vlen(func_proto); j++) 3285 + if (param[j].type && !param[j].name_off) 3286 + param[j].name_off = 3287 + dummy_var->name_off; 3288 + vs->type = dummy_var_btf_id; 3289 + vt->info &= ~0xffff; 3290 + vt->info |= BTF_FUNC_GLOBAL; 3291 + } else { 3292 + btf_var(vt)->linkage = BTF_VAR_GLOBAL_ALLOCATED; 3293 + vt->type = int_btf_id; 3294 + } 3352 3295 vs->offset = off; 3353 3296 vs->size = sizeof(int); 3354 3297 } ··· 3500 3403 3501 3404 reloc_desc->processed = false; 3502 3405 3503 - /* sub-program call relocation */ 3504 - if (insn->code == (BPF_JMP | BPF_CALL)) { 3505 - if (insn->src_reg != BPF_PSEUDO_CALL) { 3506 - pr_warn("prog '%s': incorrect bpf_call opcode\n", prog->name); 3507 - return -LIBBPF_ERRNO__RELOC; 3508 - } 3509 - /* text_shndx can be 0, if no default "main" program exists */ 3510 - if (!shdr_idx || shdr_idx != obj->efile.text_shndx) { 3511 - sym_sec_name = elf_sec_name(obj, elf_sec_by_idx(obj, shdr_idx)); 3512 - pr_warn("prog '%s': bad call relo against '%s' in section '%s'\n", 3513 - prog->name, sym_name, sym_sec_name); 3514 - return -LIBBPF_ERRNO__RELOC; 3515 - } 3516 - if (sym->st_value % BPF_INSN_SZ) { 3517 - pr_warn("prog '%s': bad call relo against '%s' at offset %zu\n", 3518 - prog->name, sym_name, (size_t)sym->st_value); 3519 - return -LIBBPF_ERRNO__RELOC; 3520 - } 3521 - reloc_desc->type = RELO_CALL; 3522 - reloc_desc->insn_idx = insn_idx; 3523 - reloc_desc->sym_off = sym->st_value; 3524 - return 0; 3525 - } 3526 - 3527 - if (!is_ldimm64(insn)) { 3406 + if (!is_call_insn(insn) && !is_ldimm64_insn(insn)) { 3528 3407 pr_warn("prog '%s': invalid relo against '%s' for insns[%d].code 0x%x\n", 3529 3408 prog->name, sym_name, insn_idx, insn->code); 3530 3409 return -LIBBPF_ERRNO__RELOC; ··· 3523 3450 } 3524 3451 pr_debug("prog '%s': found extern #%d '%s' (sym %d) for insn #%u\n", 3525 3452 prog->name, i, ext->name, ext->sym_idx, insn_idx); 3526 - reloc_desc->type = RELO_EXTERN; 3453 + if (insn->code == (BPF_JMP | 
BPF_CALL)) 3454 + reloc_desc->type = RELO_EXTERN_FUNC; 3455 + else 3456 + reloc_desc->type = RELO_EXTERN_VAR; 3527 3457 reloc_desc->insn_idx = insn_idx; 3528 3458 reloc_desc->sym_off = i; /* sym_off stores extern index */ 3459 + return 0; 3460 + } 3461 + 3462 + /* sub-program call relocation */ 3463 + if (is_call_insn(insn)) { 3464 + if (insn->src_reg != BPF_PSEUDO_CALL) { 3465 + pr_warn("prog '%s': incorrect bpf_call opcode\n", prog->name); 3466 + return -LIBBPF_ERRNO__RELOC; 3467 + } 3468 + /* text_shndx can be 0, if no default "main" program exists */ 3469 + if (!shdr_idx || shdr_idx != obj->efile.text_shndx) { 3470 + sym_sec_name = elf_sec_name(obj, elf_sec_by_idx(obj, shdr_idx)); 3471 + pr_warn("prog '%s': bad call relo against '%s' in section '%s'\n", 3472 + prog->name, sym_name, sym_sec_name); 3473 + return -LIBBPF_ERRNO__RELOC; 3474 + } 3475 + if (sym->st_value % BPF_INSN_SZ) { 3476 + pr_warn("prog '%s': bad call relo against '%s' at offset %zu\n", 3477 + prog->name, sym_name, (size_t)sym->st_value); 3478 + return -LIBBPF_ERRNO__RELOC; 3479 + } 3480 + reloc_desc->type = RELO_CALL; 3481 + reloc_desc->insn_idx = insn_idx; 3482 + reloc_desc->sym_off = sym->st_value; 3529 3483 return 0; 3530 3484 } 3531 3485 ··· 5795 5695 /* poison second part of ldimm64 to avoid confusing error from 5796 5696 * verifier about "unknown opcode 00" 5797 5697 */ 5798 - if (is_ldimm64(insn)) 5698 + if (is_ldimm64_insn(insn)) 5799 5699 bpf_core_poison_insn(prog, relo_idx, insn_idx + 1, insn + 1); 5800 5700 bpf_core_poison_insn(prog, relo_idx, insn_idx, insn); 5801 5701 return 0; ··· 5871 5771 case BPF_LD: { 5872 5772 __u64 imm; 5873 5773 5874 - if (!is_ldimm64(insn) || 5774 + if (!is_ldimm64_insn(insn) || 5875 5775 insn[0].src_reg != 0 || insn[0].off != 0 || 5876 5776 insn_idx + 1 >= prog->insns_cnt || 5877 5777 insn[1].code != 0 || insn[1].dst_reg != 0 || ··· 6313 6213 insn[0].imm = obj->maps[relo->map_idx].fd; 6314 6214 relo->processed = true; 6315 6215 break; 6316 - case 
RELO_EXTERN: 6216 + case RELO_EXTERN_VAR: 6317 6217 ext = &obj->externs[relo->sym_off]; 6318 6218 if (ext->type == EXT_KCFG) { 6319 6219 insn[0].src_reg = BPF_PSEUDO_MAP_VALUE; ··· 6329 6229 insn[1].imm = ext->ksym.addr >> 32; 6330 6230 } 6331 6231 } 6232 + relo->processed = true; 6233 + break; 6234 + case RELO_EXTERN_FUNC: 6235 + ext = &obj->externs[relo->sym_off]; 6236 + insn[0].src_reg = BPF_PSEUDO_KFUNC_CALL; 6237 + insn[0].imm = ext->ksym.kernel_btf_id; 6332 6238 relo->processed = true; 6333 6239 break; 6334 6240 case RELO_SUBPROG_ADDR: ··· 7457 7351 { 7458 7352 char sym_type, sym_name[500]; 7459 7353 unsigned long long sym_addr; 7354 + const struct btf_type *t; 7460 7355 struct extern_desc *ext; 7461 7356 int ret, err = 0; 7462 7357 FILE *f; ··· 7484 7377 if (!ext || ext->type != EXT_KSYM) 7485 7378 continue; 7486 7379 7380 + t = btf__type_by_id(obj->btf, ext->btf_id); 7381 + if (!btf_is_var(t)) 7382 + continue; 7383 + 7487 7384 if (ext->is_set && ext->ksym.addr != sym_addr) { 7488 7385 pr_warn("extern (ksym) '%s' resolution is ambiguous: 0x%llx or 0x%llx\n", 7489 7386 sym_name, ext->ksym.addr, sym_addr); ··· 7506 7395 return err; 7507 7396 } 7508 7397 7398 + static int find_ksym_btf_id(struct bpf_object *obj, const char *ksym_name, 7399 + __u16 kind, struct btf **res_btf, 7400 + int *res_btf_fd) 7401 + { 7402 + int i, id, btf_fd, err; 7403 + struct btf *btf; 7404 + 7405 + btf = obj->btf_vmlinux; 7406 + btf_fd = 0; 7407 + id = btf__find_by_name_kind(btf, ksym_name, kind); 7408 + 7409 + if (id == -ENOENT) { 7410 + err = load_module_btfs(obj); 7411 + if (err) 7412 + return err; 7413 + 7414 + for (i = 0; i < obj->btf_module_cnt; i++) { 7415 + btf = obj->btf_modules[i].btf; 7416 + /* we assume module BTF FD is always >0 */ 7417 + btf_fd = obj->btf_modules[i].fd; 7418 + id = btf__find_by_name_kind(btf, ksym_name, kind); 7419 + if (id != -ENOENT) 7420 + break; 7421 + } 7422 + } 7423 + if (id <= 0) { 7424 + pr_warn("extern (%s ksym) '%s': failed to find BTF ID in 
kernel BTF(s).\n", 7425 + __btf_kind_str(kind), ksym_name); 7426 + return -ESRCH; 7427 + } 7428 + 7429 + *res_btf = btf; 7430 + *res_btf_fd = btf_fd; 7431 + return id; 7432 + } 7433 + 7434 + static int bpf_object__resolve_ksym_var_btf_id(struct bpf_object *obj, 7435 + struct extern_desc *ext) 7436 + { 7437 + const struct btf_type *targ_var, *targ_type; 7438 + __u32 targ_type_id, local_type_id; 7439 + const char *targ_var_name; 7440 + int id, btf_fd = 0, err; 7441 + struct btf *btf = NULL; 7442 + 7443 + id = find_ksym_btf_id(obj, ext->name, BTF_KIND_VAR, &btf, &btf_fd); 7444 + if (id < 0) 7445 + return id; 7446 + 7447 + /* find local type_id */ 7448 + local_type_id = ext->ksym.type_id; 7449 + 7450 + /* find target type_id */ 7451 + targ_var = btf__type_by_id(btf, id); 7452 + targ_var_name = btf__name_by_offset(btf, targ_var->name_off); 7453 + targ_type = skip_mods_and_typedefs(btf, targ_var->type, &targ_type_id); 7454 + 7455 + err = bpf_core_types_are_compat(obj->btf, local_type_id, 7456 + btf, targ_type_id); 7457 + if (err <= 0) { 7458 + const struct btf_type *local_type; 7459 + const char *targ_name, *local_name; 7460 + 7461 + local_type = btf__type_by_id(obj->btf, local_type_id); 7462 + local_name = btf__name_by_offset(obj->btf, local_type->name_off); 7463 + targ_name = btf__name_by_offset(btf, targ_type->name_off); 7464 + 7465 + pr_warn("extern (var ksym) '%s': incompatible types, expected [%d] %s %s, but kernel has [%d] %s %s\n", 7466 + ext->name, local_type_id, 7467 + btf_kind_str(local_type), local_name, targ_type_id, 7468 + btf_kind_str(targ_type), targ_name); 7469 + return -EINVAL; 7470 + } 7471 + 7472 + ext->is_set = true; 7473 + ext->ksym.kernel_btf_obj_fd = btf_fd; 7474 + ext->ksym.kernel_btf_id = id; 7475 + pr_debug("extern (var ksym) '%s': resolved to [%d] %s %s\n", 7476 + ext->name, id, btf_kind_str(targ_var), targ_var_name); 7477 + 7478 + return 0; 7479 + } 7480 + 7481 + static int bpf_object__resolve_ksym_func_btf_id(struct bpf_object *obj, 7482 + 
struct extern_desc *ext) 7483 + { 7484 + int local_func_proto_id, kfunc_proto_id, kfunc_id; 7485 + const struct btf_type *kern_func; 7486 + struct btf *kern_btf = NULL; 7487 + int ret, kern_btf_fd = 0; 7488 + 7489 + local_func_proto_id = ext->ksym.type_id; 7490 + 7491 + kfunc_id = find_ksym_btf_id(obj, ext->name, BTF_KIND_FUNC, 7492 + &kern_btf, &kern_btf_fd); 7493 + if (kfunc_id < 0) { 7494 + pr_warn("extern (func ksym) '%s': not found in kernel BTF\n", 7495 + ext->name); 7496 + return kfunc_id; 7497 + } 7498 + 7499 + if (kern_btf != obj->btf_vmlinux) { 7500 + pr_warn("extern (func ksym) '%s': function in kernel module is not supported\n", 7501 + ext->name); 7502 + return -ENOTSUP; 7503 + } 7504 + 7505 + kern_func = btf__type_by_id(kern_btf, kfunc_id); 7506 + kfunc_proto_id = kern_func->type; 7507 + 7508 + ret = bpf_core_types_are_compat(obj->btf, local_func_proto_id, 7509 + kern_btf, kfunc_proto_id); 7510 + if (ret <= 0) { 7511 + pr_warn("extern (func ksym) '%s': func_proto [%d] incompatible with kernel [%d]\n", 7512 + ext->name, local_func_proto_id, kfunc_proto_id); 7513 + return -EINVAL; 7514 + } 7515 + 7516 + ext->is_set = true; 7517 + ext->ksym.kernel_btf_obj_fd = kern_btf_fd; 7518 + ext->ksym.kernel_btf_id = kfunc_id; 7519 + pr_debug("extern (func ksym) '%s': resolved to kernel [%d]\n", 7520 + ext->name, kfunc_id); 7521 + 7522 + return 0; 7523 + } 7524 + 7509 7525 static int bpf_object__resolve_ksyms_btf_id(struct bpf_object *obj) 7510 7526 { 7527 + const struct btf_type *t; 7511 7528 struct extern_desc *ext; 7512 - struct btf *btf; 7513 - int i, j, id, btf_fd, err; 7529 + int i, err; 7514 7530 7515 7531 for (i = 0; i < obj->nr_extern; i++) { 7516 - const struct btf_type *targ_var, *targ_type; 7517 - __u32 targ_type_id, local_type_id; 7518 - const char *targ_var_name; 7519 - int ret; 7520 - 7521 7532 ext = &obj->externs[i]; 7522 7533 if (ext->type != EXT_KSYM || !ext->ksym.type_id) 7523 7534 continue; 7524 7535 7525 - btf = obj->btf_vmlinux; 7526 - btf_fd = 
0; 7527 - id = btf__find_by_name_kind(btf, ext->name, BTF_KIND_VAR); 7528 - if (id == -ENOENT) { 7529 - err = load_module_btfs(obj); 7530 - if (err) 7531 - return err; 7532 - 7533 - for (j = 0; j < obj->btf_module_cnt; j++) { 7534 - btf = obj->btf_modules[j].btf; 7535 - /* we assume module BTF FD is always >0 */ 7536 - btf_fd = obj->btf_modules[j].fd; 7537 - id = btf__find_by_name_kind(btf, ext->name, BTF_KIND_VAR); 7538 - if (id != -ENOENT) 7539 - break; 7540 - } 7541 - } 7542 - if (id <= 0) { 7543 - pr_warn("extern (ksym) '%s': failed to find BTF ID in kernel BTF(s).\n", 7544 - ext->name); 7545 - return -ESRCH; 7546 - } 7547 - 7548 - /* find local type_id */ 7549 - local_type_id = ext->ksym.type_id; 7550 - 7551 - /* find target type_id */ 7552 - targ_var = btf__type_by_id(btf, id); 7553 - targ_var_name = btf__name_by_offset(btf, targ_var->name_off); 7554 - targ_type = skip_mods_and_typedefs(btf, targ_var->type, &targ_type_id); 7555 - 7556 - ret = bpf_core_types_are_compat(obj->btf, local_type_id, 7557 - btf, targ_type_id); 7558 - if (ret <= 0) { 7559 - const struct btf_type *local_type; 7560 - const char *targ_name, *local_name; 7561 - 7562 - local_type = btf__type_by_id(obj->btf, local_type_id); 7563 - local_name = btf__name_by_offset(obj->btf, local_type->name_off); 7564 - targ_name = btf__name_by_offset(btf, targ_type->name_off); 7565 - 7566 - pr_warn("extern (ksym) '%s': incompatible types, expected [%d] %s %s, but kernel has [%d] %s %s\n", 7567 - ext->name, local_type_id, 7568 - btf_kind_str(local_type), local_name, targ_type_id, 7569 - btf_kind_str(targ_type), targ_name); 7570 - return -EINVAL; 7571 - } 7572 - 7573 - ext->is_set = true; 7574 - ext->ksym.kernel_btf_obj_fd = btf_fd; 7575 - ext->ksym.kernel_btf_id = id; 7576 - pr_debug("extern (ksym) '%s': resolved to [%d] %s %s\n", 7577 - ext->name, id, btf_kind_str(targ_var), targ_var_name); 7536 + t = btf__type_by_id(obj->btf, ext->btf_id); 7537 + if (btf_is_var(t)) 7538 + err = 
bpf_object__resolve_ksym_var_btf_id(obj, ext); 7539 + else 7540 + err = bpf_object__resolve_ksym_func_btf_id(obj, ext); 7541 + if (err) 7542 + return err; 7578 7543 } 7579 7544 return 0; 7580 7545 } ··· 8457 8270 return obj->btf ? btf__fd(obj->btf) : -1; 8458 8271 } 8459 8272 8273 + int bpf_object__set_kversion(struct bpf_object *obj, __u32 kern_version) 8274 + { 8275 + if (obj->loaded) 8276 + return -EINVAL; 8277 + 8278 + obj->kern_version = kern_version; 8279 + 8280 + return 0; 8281 + } 8282 + 8460 8283 int bpf_object__set_priv(struct bpf_object *obj, void *priv, 8461 8284 bpf_object_clear_priv_t clear_priv) 8462 8285 { ··· 8655 8458 return fd; 8656 8459 } 8657 8460 8658 - enum bpf_prog_type bpf_program__get_type(struct bpf_program *prog) 8461 + enum bpf_prog_type bpf_program__get_type(const struct bpf_program *prog) 8659 8462 { 8660 8463 return prog->type; 8661 8464 } ··· 8700 8503 BPF_PROG_TYPE_FNS(sk_lookup, BPF_PROG_TYPE_SK_LOOKUP); 8701 8504 8702 8505 enum bpf_attach_type 8703 - bpf_program__get_expected_attach_type(struct bpf_program *prog) 8506 + bpf_program__get_expected_attach_type(const struct bpf_program *prog) 8704 8507 { 8705 8508 return prog->expected_attach_type; 8706 8509 }
+3 -2
tools/lib/bpf/libbpf.h
··· 143 143 144 144 LIBBPF_API const char *bpf_object__name(const struct bpf_object *obj); 145 145 LIBBPF_API unsigned int bpf_object__kversion(const struct bpf_object *obj); 146 + LIBBPF_API int bpf_object__set_kversion(struct bpf_object *obj, __u32 kern_version); 146 147 147 148 struct btf; 148 149 LIBBPF_API struct btf *bpf_object__btf(const struct bpf_object *obj); ··· 362 361 LIBBPF_API int bpf_program__set_extension(struct bpf_program *prog); 363 362 LIBBPF_API int bpf_program__set_sk_lookup(struct bpf_program *prog); 364 363 365 - LIBBPF_API enum bpf_prog_type bpf_program__get_type(struct bpf_program *prog); 364 + LIBBPF_API enum bpf_prog_type bpf_program__get_type(const struct bpf_program *prog); 366 365 LIBBPF_API void bpf_program__set_type(struct bpf_program *prog, 367 366 enum bpf_prog_type type); 368 367 369 368 LIBBPF_API enum bpf_attach_type 370 - bpf_program__get_expected_attach_type(struct bpf_program *prog); 369 + bpf_program__get_expected_attach_type(const struct bpf_program *prog); 371 370 LIBBPF_API void 372 371 bpf_program__set_expected_attach_type(struct bpf_program *prog, 373 372 enum bpf_attach_type type);
+1
tools/lib/bpf/libbpf.map
··· 359 359 bpf_linker__finalize; 360 360 bpf_linker__free; 361 361 bpf_linker__new; 362 + bpf_object__set_kversion; 362 363 } LIBBPF_0.3.0;
+28 -9
tools/lib/bpf/linker.c
··· 94 94 int sec_sym_idx; 95 95 96 96 /* section's DATASEC variable info, emitted on BTF finalization */ 97 + bool has_btf; 97 98 int sec_var_cnt; 98 99 struct btf_var_secinfo *sec_vars; 99 100 ··· 1437 1436 continue; 1438 1437 dst_sec = &linker->secs[src_sec->dst_id]; 1439 1438 1439 + /* Mark section as having BTF regardless of the presence of 1440 + * variables. In some cases compiler might generate empty BTF 1441 + * with no variables information. E.g., when promoting local 1442 + * array/structure variable initial values and BPF object 1443 + * file otherwise has no read-only static variables in 1444 + * .rodata. We need to preserve such empty BTF and just set 1445 + * correct section size. 1446 + */ 1447 + dst_sec->has_btf = true; 1448 + 1440 1449 t = btf__type_by_id(obj->btf, src_sec->sec_type_id); 1441 1450 src_var = btf_var_secinfos(t); 1442 1451 n = btf_vlen(t); ··· 1728 1717 for (i = 1; i < linker->sec_cnt; i++) { 1729 1718 struct dst_sec *sec = &linker->secs[i]; 1730 1719 1731 - if (!sec->sec_var_cnt) 1720 + if (!sec->has_btf) 1732 1721 continue; 1733 1722 1734 1723 id = btf__add_datasec(btf, sec->sec_name, sec->sec_sz); ··· 1906 1895 struct dst_sec *sec = &linker->secs[i]; 1907 1896 1908 1897 sz = emit_btf_ext_data(linker, cur, sec->sec_name, &sec->func_info); 1909 - if (sz < 0) 1910 - return sz; 1898 + if (sz < 0) { 1899 + err = sz; 1900 + goto out; 1901 + } 1911 1902 1912 1903 cur += sz; 1913 1904 } ··· 1923 1910 struct dst_sec *sec = &linker->secs[i]; 1924 1911 1925 1912 sz = emit_btf_ext_data(linker, cur, sec->sec_name, &sec->line_info); 1926 - if (sz < 0) 1927 - return sz; 1913 + if (sz < 0) { 1914 + err = sz; 1915 + goto out; 1916 + } 1928 1917 1929 1918 cur += sz; 1930 1919 } ··· 1940 1925 struct dst_sec *sec = &linker->secs[i]; 1941 1926 1942 1927 sz = emit_btf_ext_data(linker, cur, sec->sec_name, &sec->core_relo_info); 1943 - if (sz < 0) 1944 - return sz; 1928 + if (sz < 0) { 1929 + err = sz; 1930 + goto out; 1931 + } 1945 1932 1946 1933 cur 
+= sz; 1947 1934 } ··· 1954 1937 if (err) { 1955 1938 linker->btf_ext = NULL; 1956 1939 pr_warn("failed to parse final .BTF.ext data: %d\n", err); 1957 - return err; 1940 + goto out; 1958 1941 } 1959 1942 1960 - return 0; 1943 + out: 1944 + free(data); 1945 + return err; 1961 1946 }
+217 -49
tools/lib/bpf/xsk.c
··· 28 28 #include <sys/mman.h> 29 29 #include <sys/socket.h> 30 30 #include <sys/types.h> 31 + #include <linux/if_link.h> 31 32 32 33 #include "bpf.h" 33 34 #include "libbpf.h" ··· 71 70 int ifindex; 72 71 struct list_head list; 73 72 int prog_fd; 73 + int link_fd; 74 74 int xsks_map_fd; 75 75 char ifname[IFNAMSIZ]; 76 + bool has_bpf_link; 76 77 }; 77 78 78 79 struct xsk_socket { ··· 412 409 static const int log_buf_size = 16 * 1024; 413 410 struct xsk_ctx *ctx = xsk->ctx; 414 411 char log_buf[log_buf_size]; 415 - int err, prog_fd; 412 + int prog_fd; 416 413 417 414 /* This is the fallback C-program: 418 415 * SEC("xdp_sock") int xdp_sock_prog(struct xdp_md *ctx) ··· 502 499 return prog_fd; 503 500 } 504 501 505 - err = bpf_set_link_xdp_fd(xsk->ctx->ifindex, prog_fd, 506 - xsk->config.xdp_flags); 502 + ctx->prog_fd = prog_fd; 503 + return 0; 504 + } 505 + 506 + static int xsk_create_bpf_link(struct xsk_socket *xsk) 507 + { 508 + DECLARE_LIBBPF_OPTS(bpf_link_create_opts, opts); 509 + struct xsk_ctx *ctx = xsk->ctx; 510 + __u32 prog_id = 0; 511 + int link_fd; 512 + int err; 513 + 514 + err = bpf_get_link_xdp_id(ctx->ifindex, &prog_id, xsk->config.xdp_flags); 507 515 if (err) { 508 - close(prog_fd); 516 + pr_warn("getting XDP prog id failed\n"); 509 517 return err; 510 518 } 511 519 512 - ctx->prog_fd = prog_fd; 520 + /* if there's a netlink-based XDP prog loaded on interface, bail out 521 + * and ask user to do the removal by himself 522 + */ 523 + if (prog_id) { 524 + pr_warn("Netlink-based XDP prog detected, please unload it in order to launch AF_XDP prog\n"); 525 + return -EINVAL; 526 + } 527 + 528 + opts.flags = xsk->config.xdp_flags & ~(XDP_FLAGS_UPDATE_IF_NOEXIST | XDP_FLAGS_REPLACE); 529 + 530 + link_fd = bpf_link_create(ctx->prog_fd, ctx->ifindex, BPF_XDP, &opts); 531 + if (link_fd < 0) { 532 + pr_warn("bpf_link_create failed: %s\n", strerror(errno)); 533 + return link_fd; 534 + } 535 + 536 + ctx->link_fd = link_fd; 513 537 return 0; 514 538 } 515 539 ··· 
655 625 close(fd); 656 626 } 657 627 658 - err = 0; 659 628 if (ctx->xsks_map_fd == -1) 660 629 err = -ENOENT; 661 630 ··· 669 640 670 641 return bpf_map_update_elem(ctx->xsks_map_fd, &ctx->queue_id, 671 642 &xsk->fd, 0); 643 + } 644 + 645 + static int xsk_link_lookup(int ifindex, __u32 *prog_id, int *link_fd) 646 + { 647 + struct bpf_link_info link_info; 648 + __u32 link_len; 649 + __u32 id = 0; 650 + int err; 651 + int fd; 652 + 653 + while (true) { 654 + err = bpf_link_get_next_id(id, &id); 655 + if (err) { 656 + if (errno == ENOENT) { 657 + err = 0; 658 + break; 659 + } 660 + pr_warn("can't get next link: %s\n", strerror(errno)); 661 + break; 662 + } 663 + 664 + fd = bpf_link_get_fd_by_id(id); 665 + if (fd < 0) { 666 + if (errno == ENOENT) 667 + continue; 668 + pr_warn("can't get link by id (%u): %s\n", id, strerror(errno)); 669 + err = -errno; 670 + break; 671 + } 672 + 673 + link_len = sizeof(struct bpf_link_info); 674 + memset(&link_info, 0, link_len); 675 + err = bpf_obj_get_info_by_fd(fd, &link_info, &link_len); 676 + if (err) { 677 + pr_warn("can't get link info: %s\n", strerror(errno)); 678 + close(fd); 679 + break; 680 + } 681 + if (link_info.type == BPF_LINK_TYPE_XDP) { 682 + if (link_info.xdp.ifindex == ifindex) { 683 + *link_fd = fd; 684 + if (prog_id) 685 + *prog_id = link_info.prog_id; 686 + break; 687 + } 688 + } 689 + close(fd); 690 + } 691 + 692 + return err; 693 + } 694 + 695 + static bool xsk_probe_bpf_link(void) 696 + { 697 + DECLARE_LIBBPF_OPTS(bpf_link_create_opts, opts, 698 + .flags = XDP_FLAGS_SKB_MODE); 699 + struct bpf_load_program_attr prog_attr; 700 + struct bpf_insn insns[2] = { 701 + BPF_MOV64_IMM(BPF_REG_0, XDP_PASS), 702 + BPF_EXIT_INSN() 703 + }; 704 + int prog_fd, link_fd = -1; 705 + int ifindex_lo = 1; 706 + bool ret = false; 707 + int err; 708 + 709 + err = xsk_link_lookup(ifindex_lo, NULL, &link_fd); 710 + if (err) 711 + return ret; 712 + 713 + if (link_fd >= 0) 714 + return true; 715 + 716 + memset(&prog_attr, 0, 
sizeof(prog_attr)); 717 + prog_attr.prog_type = BPF_PROG_TYPE_XDP; 718 + prog_attr.insns = insns; 719 + prog_attr.insns_cnt = ARRAY_SIZE(insns); 720 + prog_attr.license = "GPL"; 721 + 722 + prog_fd = bpf_load_program_xattr(&prog_attr, NULL, 0); 723 + if (prog_fd < 0) 724 + return ret; 725 + 726 + link_fd = bpf_link_create(prog_fd, ifindex_lo, BPF_XDP, &opts); 727 + close(prog_fd); 728 + 729 + if (link_fd >= 0) { 730 + ret = true; 731 + close(link_fd); 732 + } 733 + 734 + return ret; 672 735 } 673 736 674 737 static int xsk_create_xsk_struct(int ifindex, struct xsk_socket *xsk) ··· 784 663 ctx->ifname[IFNAMSIZ - 1] = 0; 785 664 786 665 xsk->ctx = ctx; 666 + xsk->ctx->has_bpf_link = xsk_probe_bpf_link(); 787 667 788 668 return 0; 789 669 } 790 670 791 - static int __xsk_setup_xdp_prog(struct xsk_socket *_xdp, 792 - int *xsks_map_fd) 671 + static int xsk_init_xdp_res(struct xsk_socket *xsk, 672 + int *xsks_map_fd) 673 + { 674 + struct xsk_ctx *ctx = xsk->ctx; 675 + int err; 676 + 677 + err = xsk_create_bpf_maps(xsk); 678 + if (err) 679 + return err; 680 + 681 + err = xsk_load_xdp_prog(xsk); 682 + if (err) 683 + goto err_load_xdp_prog; 684 + 685 + if (ctx->has_bpf_link) 686 + err = xsk_create_bpf_link(xsk); 687 + else 688 + err = bpf_set_link_xdp_fd(xsk->ctx->ifindex, ctx->prog_fd, 689 + xsk->config.xdp_flags); 690 + 691 + if (err) 692 + goto err_attach_xdp_prog; 693 + 694 + if (!xsk->rx) 695 + return err; 696 + 697 + err = xsk_set_bpf_maps(xsk); 698 + if (err) 699 + goto err_set_bpf_maps; 700 + 701 + return err; 702 + 703 + err_set_bpf_maps: 704 + if (ctx->has_bpf_link) 705 + close(ctx->link_fd); 706 + else 707 + bpf_set_link_xdp_fd(ctx->ifindex, -1, 0); 708 + err_attach_xdp_prog: 709 + close(ctx->prog_fd); 710 + err_load_xdp_prog: 711 + xsk_delete_bpf_maps(xsk); 712 + return err; 713 + } 714 + 715 + static int xsk_lookup_xdp_res(struct xsk_socket *xsk, int *xsks_map_fd, int prog_id) 716 + { 717 + struct xsk_ctx *ctx = xsk->ctx; 718 + int err; 719 + 720 + ctx->prog_fd 
= bpf_prog_get_fd_by_id(prog_id); 721 + if (ctx->prog_fd < 0) { 722 + err = -errno; 723 + goto err_prog_fd; 724 + } 725 + err = xsk_lookup_bpf_maps(xsk); 726 + if (err) 727 + goto err_lookup_maps; 728 + 729 + if (!xsk->rx) 730 + return err; 731 + 732 + err = xsk_set_bpf_maps(xsk); 733 + if (err) 734 + goto err_set_maps; 735 + 736 + return err; 737 + 738 + err_set_maps: 739 + close(ctx->xsks_map_fd); 740 + err_lookup_maps: 741 + close(ctx->prog_fd); 742 + err_prog_fd: 743 + if (ctx->has_bpf_link) 744 + close(ctx->link_fd); 745 + return err; 746 + } 747 + 748 + static int __xsk_setup_xdp_prog(struct xsk_socket *_xdp, int *xsks_map_fd) 793 749 { 794 750 struct xsk_socket *xsk = _xdp; 795 751 struct xsk_ctx *ctx = xsk->ctx; 796 752 __u32 prog_id = 0; 797 753 int err; 798 754 799 - err = bpf_get_link_xdp_id(ctx->ifindex, &prog_id, 800 - xsk->config.xdp_flags); 755 + if (ctx->has_bpf_link) 756 + err = xsk_link_lookup(ctx->ifindex, &prog_id, &ctx->link_fd); 757 + else 758 + err = bpf_get_link_xdp_id(ctx->ifindex, &prog_id, xsk->config.xdp_flags); 759 + 801 760 if (err) 802 761 return err; 803 762 804 - if (!prog_id) { 805 - err = xsk_create_bpf_maps(xsk); 806 - if (err) 807 - return err; 763 + err = !prog_id ? 
xsk_init_xdp_res(xsk, xsks_map_fd) : 764 + xsk_lookup_xdp_res(xsk, xsks_map_fd, prog_id); 808 765 809 - err = xsk_load_xdp_prog(xsk); 810 - if (err) { 811 - goto err_load_xdp_prog; 812 - } 813 - } else { 814 - ctx->prog_fd = bpf_prog_get_fd_by_id(prog_id); 815 - if (ctx->prog_fd < 0) 816 - return -errno; 817 - err = xsk_lookup_bpf_maps(xsk); 818 - if (err) { 819 - close(ctx->prog_fd); 820 - return err; 821 - } 822 - } 823 - 824 - if (xsk->rx) { 825 - err = xsk_set_bpf_maps(xsk); 826 - if (err) { 827 - if (!prog_id) { 828 - goto err_set_bpf_maps; 829 - } else { 830 - close(ctx->prog_fd); 831 - return err; 832 - } 833 - } 834 - } 835 - if (xsks_map_fd) 766 + if (!err && xsks_map_fd) 836 767 *xsks_map_fd = ctx->xsks_map_fd; 837 - 838 - return 0; 839 - 840 - err_set_bpf_maps: 841 - close(ctx->prog_fd); 842 - bpf_set_link_xdp_fd(ctx->ifindex, -1, 0); 843 - err_load_xdp_prog: 844 - xsk_delete_bpf_maps(xsk); 845 768 846 769 return err; 847 770 } ··· 1063 898 } 1064 899 } 1065 900 xsk->ctx = ctx; 901 + xsk->ctx->has_bpf_link = xsk_probe_bpf_link(); 1066 902 1067 903 if (rx) { 1068 904 err = setsockopt(xsk->fd, SOL_XDP, XDP_RX_RING, ··· 1220 1054 if (ctx->prog_fd != -1) { 1221 1055 xsk_delete_bpf_maps(xsk); 1222 1056 close(ctx->prog_fd); 1057 + if (ctx->has_bpf_link) 1058 + close(ctx->link_fd); 1223 1059 } 1224 1060 1225 1061 err = xsk_get_mmap_offsets(xsk->fd, &off);
+14
tools/testing/selftests/bpf/README.rst
··· 179 179 either crash when compiling these tests, or generate an incorrect BTF. 180 180 181 181 __ https://reviews.llvm.org/D83289 182 + 183 + Kernel function call test and Clang version 184 + =========================================== 185 + 186 + Some selftests (e.g. kfunc_call and bpf_tcp_ca) require a LLVM support 187 + to generate extern function in BTF. It was introduced in `Clang 13`__. 188 + 189 + Without it, the error from compiling bpf selftests looks like: 190 + 191 + .. code-block:: console 192 + 193 + libbpf: failed to find BTF for extern 'tcp_slow_start' [25] section: -2 194 + 195 + __ https://reviews.llvm.org/D93563
+2 -27
tools/testing/selftests/bpf/bpf_tcp_helpers.h
··· 187 187 typeof(y) __y = (y); \ 188 188 __x == 0 ? __y : ((__y == 0) ? __x : min(__x, __y)); }) 189 189 190 - static __always_inline __u32 tcp_slow_start(struct tcp_sock *tp, __u32 acked) 191 - { 192 - __u32 cwnd = min(tp->snd_cwnd + acked, tp->snd_ssthresh); 193 - 194 - acked -= cwnd - tp->snd_cwnd; 195 - tp->snd_cwnd = min(cwnd, tp->snd_cwnd_clamp); 196 - 197 - return acked; 198 - } 199 - 200 190 static __always_inline bool tcp_in_slow_start(const struct tcp_sock *tp) 201 191 { 202 192 return tp->snd_cwnd < tp->snd_ssthresh; ··· 203 213 return !!BPF_CORE_READ_BITFIELD(tp, is_cwnd_limited); 204 214 } 205 215 206 - static __always_inline void tcp_cong_avoid_ai(struct tcp_sock *tp, __u32 w, __u32 acked) 207 - { 208 - /* If credits accumulated at a higher w, apply them gently now. */ 209 - if (tp->snd_cwnd_cnt >= w) { 210 - tp->snd_cwnd_cnt = 0; 211 - tp->snd_cwnd++; 212 - } 213 - 214 - tp->snd_cwnd_cnt += acked; 215 - if (tp->snd_cwnd_cnt >= w) { 216 - __u32 delta = tp->snd_cwnd_cnt / w; 217 - 218 - tp->snd_cwnd_cnt -= delta * w; 219 - tp->snd_cwnd += delta; 220 - } 221 - tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_cwnd_clamp); 222 - } 216 + extern __u32 tcp_slow_start(struct tcp_sock *tp, __u32 acked) __ksym; 217 + extern void tcp_cong_avoid_ai(struct tcp_sock *tp, __u32 w, __u32 acked) __ksym; 223 218 224 219 #endif
+158
tools/testing/selftests/bpf/map_tests/lpm_trie_map_batch_ops.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + 3 + #include <arpa/inet.h> 4 + #include <linux/bpf.h> 5 + #include <netinet/in.h> 6 + #include <stdio.h> 7 + #include <errno.h> 8 + #include <string.h> 9 + #include <stdlib.h> 10 + 11 + #include <bpf/bpf.h> 12 + #include <bpf/libbpf.h> 13 + 14 + #include <test_maps.h> 15 + 16 + struct test_lpm_key { 17 + __u32 prefix; 18 + struct in_addr ipv4; 19 + }; 20 + 21 + static void map_batch_update(int map_fd, __u32 max_entries, 22 + struct test_lpm_key *keys, int *values) 23 + { 24 + __u32 i; 25 + int err; 26 + char buff[16] = { 0 }; 27 + DECLARE_LIBBPF_OPTS(bpf_map_batch_opts, opts, 28 + .elem_flags = 0, 29 + .flags = 0, 30 + ); 31 + 32 + for (i = 0; i < max_entries; i++) { 33 + keys[i].prefix = 32; 34 + snprintf(buff, 16, "192.168.1.%d", i + 1); 35 + inet_pton(AF_INET, buff, &keys[i].ipv4); 36 + values[i] = i + 1; 37 + } 38 + 39 + err = bpf_map_update_batch(map_fd, keys, values, &max_entries, &opts); 40 + CHECK(err, "bpf_map_update_batch()", "error:%s\n", strerror(errno)); 41 + } 42 + 43 + static void map_batch_verify(int *visited, __u32 max_entries, 44 + struct test_lpm_key *keys, int *values) 45 + { 46 + char buff[16] = { 0 }; 47 + int lower_byte = 0; 48 + __u32 i; 49 + 50 + memset(visited, 0, max_entries * sizeof(*visited)); 51 + for (i = 0; i < max_entries; i++) { 52 + inet_ntop(AF_INET, &keys[i].ipv4, buff, 32); 53 + CHECK(sscanf(buff, "192.168.1.%d", &lower_byte) == EOF, 54 + "sscanf()", "error: i %d\n", i); 55 + CHECK(lower_byte != values[i], "key/value checking", 56 + "error: i %d key %s value %d\n", i, buff, values[i]); 57 + visited[i] = 1; 58 + } 59 + for (i = 0; i < max_entries; i++) { 60 + CHECK(visited[i] != 1, "visited checking", 61 + "error: keys array at index %d missing\n", i); 62 + } 63 + } 64 + 65 + void test_lpm_trie_map_batch_ops(void) 66 + { 67 + struct bpf_create_map_attr xattr = { 68 + .name = "lpm_trie_map", 69 + .map_type = BPF_MAP_TYPE_LPM_TRIE, 70 + .key_size = sizeof(struct test_lpm_key), 71 + 
.value_size = sizeof(int), 72 + .map_flags = BPF_F_NO_PREALLOC, 73 + }; 74 + struct test_lpm_key *keys, key; 75 + int map_fd, *values, *visited; 76 + __u32 step, count, total, total_success; 77 + const __u32 max_entries = 10; 78 + __u64 batch = 0; 79 + int err; 80 + DECLARE_LIBBPF_OPTS(bpf_map_batch_opts, opts, 81 + .elem_flags = 0, 82 + .flags = 0, 83 + ); 84 + 85 + xattr.max_entries = max_entries; 86 + map_fd = bpf_create_map_xattr(&xattr); 87 + CHECK(map_fd == -1, "bpf_create_map_xattr()", "error:%s\n", 88 + strerror(errno)); 89 + 90 + keys = malloc(max_entries * sizeof(struct test_lpm_key)); 91 + values = malloc(max_entries * sizeof(int)); 92 + visited = malloc(max_entries * sizeof(int)); 93 + CHECK(!keys || !values || !visited, "malloc()", "error:%s\n", 94 + strerror(errno)); 95 + 96 + total_success = 0; 97 + for (step = 1; step < max_entries; step++) { 98 + map_batch_update(map_fd, max_entries, keys, values); 99 + map_batch_verify(visited, max_entries, keys, values); 100 + memset(keys, 0, max_entries * sizeof(*keys)); 101 + memset(values, 0, max_entries * sizeof(*values)); 102 + batch = 0; 103 + total = 0; 104 + /* iteratively lookup/delete elements with 'step' 105 + * elements each. 106 + */ 107 + count = step; 108 + while (true) { 109 + err = bpf_map_lookup_batch(map_fd, 110 + total ? 
&batch : NULL, &batch, 111 + keys + total, values + total, &count, &opts); 112 + 113 + CHECK((err && errno != ENOENT), "lookup with steps", 114 + "error: %s\n", strerror(errno)); 115 + 116 + total += count; 117 + if (err) 118 + break; 119 + } 120 + 121 + CHECK(total != max_entries, "lookup with steps", 122 + "total = %u, max_entries = %u\n", total, max_entries); 123 + 124 + map_batch_verify(visited, max_entries, keys, values); 125 + 126 + total = 0; 127 + count = step; 128 + while (total < max_entries) { 129 + if (max_entries - total < step) 130 + count = max_entries - total; 131 + err = bpf_map_delete_batch(map_fd, keys + total, &count, 132 + &opts); 133 + CHECK((err && errno != ENOENT), "delete batch", 134 + "error: %s\n", strerror(errno)); 135 + total += count; 136 + if (err) 137 + break; 138 + } 139 + CHECK(total != max_entries, "delete with steps", 140 + "total = %u, max_entries = %u\n", total, max_entries); 141 + 142 + /* check map is empty, errono == ENOENT */ 143 + err = bpf_map_get_next_key(map_fd, NULL, &key); 144 + CHECK(!err || errno != ENOENT, "bpf_map_get_next_key()", 145 + "error: %s\n", strerror(errno)); 146 + 147 + total_success++; 148 + } 149 + 150 + CHECK(total_success == 0, "check total_success", 151 + "unexpected failure\n"); 152 + 153 + printf("%s:PASS\n", __func__); 154 + 155 + free(keys); 156 + free(values); 157 + free(visited); 158 + }
+59
tools/testing/selftests/bpf/prog_tests/kfunc_call.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* Copyright (c) 2021 Facebook */ 3 + #include <test_progs.h> 4 + #include <network_helpers.h> 5 + #include "kfunc_call_test.skel.h" 6 + #include "kfunc_call_test_subprog.skel.h" 7 + 8 + static void test_main(void) 9 + { 10 + struct kfunc_call_test *skel; 11 + int prog_fd, retval, err; 12 + 13 + skel = kfunc_call_test__open_and_load(); 14 + if (!ASSERT_OK_PTR(skel, "skel")) 15 + return; 16 + 17 + prog_fd = bpf_program__fd(skel->progs.kfunc_call_test1); 18 + err = bpf_prog_test_run(prog_fd, 1, &pkt_v4, sizeof(pkt_v4), 19 + NULL, NULL, (__u32 *)&retval, NULL); 20 + ASSERT_OK(err, "bpf_prog_test_run(test1)"); 21 + ASSERT_EQ(retval, 12, "test1-retval"); 22 + 23 + prog_fd = bpf_program__fd(skel->progs.kfunc_call_test2); 24 + err = bpf_prog_test_run(prog_fd, 1, &pkt_v4, sizeof(pkt_v4), 25 + NULL, NULL, (__u32 *)&retval, NULL); 26 + ASSERT_OK(err, "bpf_prog_test_run(test2)"); 27 + ASSERT_EQ(retval, 3, "test2-retval"); 28 + 29 + kfunc_call_test__destroy(skel); 30 + } 31 + 32 + static void test_subprog(void) 33 + { 34 + struct kfunc_call_test_subprog *skel; 35 + int prog_fd, retval, err; 36 + 37 + skel = kfunc_call_test_subprog__open_and_load(); 38 + if (!ASSERT_OK_PTR(skel, "skel")) 39 + return; 40 + 41 + prog_fd = bpf_program__fd(skel->progs.kfunc_call_test1); 42 + err = bpf_prog_test_run(prog_fd, 1, &pkt_v4, sizeof(pkt_v4), 43 + NULL, NULL, (__u32 *)&retval, NULL); 44 + ASSERT_OK(err, "bpf_prog_test_run(test1)"); 45 + ASSERT_EQ(retval, 10, "test1-retval"); 46 + ASSERT_NEQ(skel->data->active_res, -1, "active_res"); 47 + ASSERT_EQ(skel->data->sk_state, BPF_TCP_CLOSE, "sk_state"); 48 + 49 + kfunc_call_test_subprog__destroy(skel); 50 + } 51 + 52 + void test_kfunc_call(void) 53 + { 54 + if (test__start_subtest("main")) 55 + test_main(); 56 + 57 + if (test__start_subtest("subprog")) 58 + test_subprog(); 59 + }
+40
tools/testing/selftests/bpf/prog_tests/sockmap_basic.c
··· 7 7 #include "test_skmsg_load_helpers.skel.h" 8 8 #include "test_sockmap_update.skel.h" 9 9 #include "test_sockmap_invalid_update.skel.h" 10 + #include "test_sockmap_skb_verdict_attach.skel.h" 10 11 #include "bpf_iter_sockmap.skel.h" 11 12 12 13 #define TCP_REPAIR 19 /* TCP sock is under repair right now */ ··· 282 281 bpf_iter_sockmap__destroy(skel); 283 282 } 284 283 284 + static void test_sockmap_skb_verdict_attach(enum bpf_attach_type first, 285 + enum bpf_attach_type second) 286 + { 287 + struct test_sockmap_skb_verdict_attach *skel; 288 + int err, map, verdict; 289 + 290 + skel = test_sockmap_skb_verdict_attach__open_and_load(); 291 + if (CHECK_FAIL(!skel)) { 292 + perror("test_sockmap_skb_verdict_attach__open_and_load"); 293 + return; 294 + } 295 + 296 + verdict = bpf_program__fd(skel->progs.prog_skb_verdict); 297 + map = bpf_map__fd(skel->maps.sock_map); 298 + 299 + err = bpf_prog_attach(verdict, map, first, 0); 300 + if (CHECK_FAIL(err)) { 301 + perror("bpf_prog_attach"); 302 + goto out; 303 + } 304 + 305 + err = bpf_prog_attach(verdict, map, second, 0); 306 + assert(err == -1 && errno == EBUSY); 307 + 308 + err = bpf_prog_detach2(verdict, map, first); 309 + if (CHECK_FAIL(err)) { 310 + perror("bpf_prog_detach2"); 311 + goto out; 312 + } 313 + out: 314 + test_sockmap_skb_verdict_attach__destroy(skel); 315 + } 316 + 285 317 void test_sockmap_basic(void) 286 318 { 287 319 if (test__start_subtest("sockmap create_update_free")) ··· 335 301 test_sockmap_copy(BPF_MAP_TYPE_SOCKMAP); 336 302 if (test__start_subtest("sockhash copy")) 337 303 test_sockmap_copy(BPF_MAP_TYPE_SOCKHASH); 304 + if (test__start_subtest("sockmap skb_verdict attach")) { 305 + test_sockmap_skb_verdict_attach(BPF_SK_SKB_VERDICT, 306 + BPF_SK_SKB_STREAM_VERDICT); 307 + test_sockmap_skb_verdict_attach(BPF_SK_SKB_STREAM_VERDICT, 308 + BPF_SK_SKB_VERDICT); 309 + } 338 310 }
+136
tools/testing/selftests/bpf/prog_tests/sockmap_listen.c
··· 1603 1603 } 1604 1604 } 1605 1605 1606 + static void udp_redir_to_connected(int family, int sotype, int sock_mapfd, 1607 + int verd_mapfd, enum redir_mode mode) 1608 + { 1609 + const char *log_prefix = redir_mode_str(mode); 1610 + struct sockaddr_storage addr; 1611 + int c0, c1, p0, p1; 1612 + unsigned int pass; 1613 + socklen_t len; 1614 + int err, n; 1615 + u64 value; 1616 + u32 key; 1617 + char b; 1618 + 1619 + zero_verdict_count(verd_mapfd); 1620 + 1621 + p0 = socket_loopback(family, sotype | SOCK_NONBLOCK); 1622 + if (p0 < 0) 1623 + return; 1624 + len = sizeof(addr); 1625 + err = xgetsockname(p0, sockaddr(&addr), &len); 1626 + if (err) 1627 + goto close_peer0; 1628 + 1629 + c0 = xsocket(family, sotype | SOCK_NONBLOCK, 0); 1630 + if (c0 < 0) 1631 + goto close_peer0; 1632 + err = xconnect(c0, sockaddr(&addr), len); 1633 + if (err) 1634 + goto close_cli0; 1635 + err = xgetsockname(c0, sockaddr(&addr), &len); 1636 + if (err) 1637 + goto close_cli0; 1638 + err = xconnect(p0, sockaddr(&addr), len); 1639 + if (err) 1640 + goto close_cli0; 1641 + 1642 + p1 = socket_loopback(family, sotype | SOCK_NONBLOCK); 1643 + if (p1 < 0) 1644 + goto close_cli0; 1645 + err = xgetsockname(p1, sockaddr(&addr), &len); 1646 + if (err) 1647 + goto close_cli0; 1648 + 1649 + c1 = xsocket(family, sotype | SOCK_NONBLOCK, 0); 1650 + if (c1 < 0) 1651 + goto close_peer1; 1652 + err = xconnect(c1, sockaddr(&addr), len); 1653 + if (err) 1654 + goto close_cli1; 1655 + err = xgetsockname(c1, sockaddr(&addr), &len); 1656 + if (err) 1657 + goto close_cli1; 1658 + err = xconnect(p1, sockaddr(&addr), len); 1659 + if (err) 1660 + goto close_cli1; 1661 + 1662 + key = 0; 1663 + value = p0; 1664 + err = xbpf_map_update_elem(sock_mapfd, &key, &value, BPF_NOEXIST); 1665 + if (err) 1666 + goto close_cli1; 1667 + 1668 + key = 1; 1669 + value = p1; 1670 + err = xbpf_map_update_elem(sock_mapfd, &key, &value, BPF_NOEXIST); 1671 + if (err) 1672 + goto close_cli1; 1673 + 1674 + n = write(c1, "a", 1); 1675 + if 
(n < 0) 1676 + FAIL_ERRNO("%s: write", log_prefix); 1677 + if (n == 0) 1678 + FAIL("%s: incomplete write", log_prefix); 1679 + if (n < 1) 1680 + goto close_cli1; 1681 + 1682 + key = SK_PASS; 1683 + err = xbpf_map_lookup_elem(verd_mapfd, &key, &pass); 1684 + if (err) 1685 + goto close_cli1; 1686 + if (pass != 1) 1687 + FAIL("%s: want pass count 1, have %d", log_prefix, pass); 1688 + 1689 + n = read(mode == REDIR_INGRESS ? p0 : c0, &b, 1); 1690 + if (n < 0) 1691 + FAIL_ERRNO("%s: read", log_prefix); 1692 + if (n == 0) 1693 + FAIL("%s: incomplete read", log_prefix); 1694 + 1695 + close_cli1: 1696 + xclose(c1); 1697 + close_peer1: 1698 + xclose(p1); 1699 + close_cli0: 1700 + xclose(c0); 1701 + close_peer0: 1702 + xclose(p0); 1703 + } 1704 + 1705 + static void udp_skb_redir_to_connected(struct test_sockmap_listen *skel, 1706 + struct bpf_map *inner_map, int family) 1707 + { 1708 + int verdict = bpf_program__fd(skel->progs.prog_skb_verdict); 1709 + int verdict_map = bpf_map__fd(skel->maps.verdict_map); 1710 + int sock_map = bpf_map__fd(inner_map); 1711 + int err; 1712 + 1713 + err = xbpf_prog_attach(verdict, sock_map, BPF_SK_SKB_VERDICT, 0); 1714 + if (err) 1715 + return; 1716 + 1717 + skel->bss->test_ingress = false; 1718 + udp_redir_to_connected(family, SOCK_DGRAM, sock_map, verdict_map, 1719 + REDIR_EGRESS); 1720 + skel->bss->test_ingress = true; 1721 + udp_redir_to_connected(family, SOCK_DGRAM, sock_map, verdict_map, 1722 + REDIR_INGRESS); 1723 + 1724 + xbpf_prog_detach2(verdict, sock_map, BPF_SK_SKB_VERDICT); 1725 + } 1726 + 1727 + static void test_udp_redir(struct test_sockmap_listen *skel, struct bpf_map *map, 1728 + int family) 1729 + { 1730 + const char *family_name, *map_name; 1731 + char s[MAX_TEST_NAME]; 1732 + 1733 + family_name = family_str(family); 1734 + map_name = map_type_str(map); 1735 + snprintf(s, sizeof(s), "%s %s %s", map_name, family_name, __func__); 1736 + if (!test__start_subtest(s)) 1737 + return; 1738 + udp_skb_redir_to_connected(skel, map, 
family); 1739 + } 1740 + 1606 1741 static void run_tests(struct test_sockmap_listen *skel, struct bpf_map *map, 1607 1742 int family) 1608 1743 { ··· 1746 1611 test_redir(skel, map, family, SOCK_STREAM); 1747 1612 test_reuseport(skel, map, family, SOCK_STREAM); 1748 1613 test_reuseport(skel, map, family, SOCK_DGRAM); 1614 + test_udp_redir(skel, map, family); 1749 1615 } 1750 1616 1751 1617 void test_sockmap_listen(void)
+4 -2
tools/testing/selftests/bpf/prog_tests/test_ima.c
··· 68 68 goto close_prog; 69 69 70 70 snprintf(cmd, sizeof(cmd), "./ima_setup.sh setup %s", measured_dir); 71 - if (CHECK_FAIL(system(cmd))) 71 + err = system(cmd); 72 + if (CHECK(err, "failed to run command", "%s, errno = %d\n", cmd, errno)) 72 73 goto close_clean; 73 74 74 75 err = run_measured_process(measured_dir, &skel->bss->monitored_pid); ··· 82 81 83 82 close_clean: 84 83 snprintf(cmd, sizeof(cmd), "./ima_setup.sh cleanup %s", measured_dir); 85 - CHECK_FAIL(system(cmd)); 84 + err = system(cmd); 85 + CHECK(err, "failed to run command", "%s, errno = %d\n", cmd, errno); 86 86 close_prog: 87 87 ima__destroy(skel); 88 88 }
+18 -18
tools/testing/selftests/bpf/progs/bpf_cubic.c
··· 174 174 * as long as it is used in one of the func ptr 175 175 * under SEC(".struct_ops"). 176 176 */ 177 - SEC("struct_ops/bictcp_init") 178 - void BPF_PROG(bictcp_init, struct sock *sk) 177 + SEC("struct_ops/bpf_cubic_init") 178 + void BPF_PROG(bpf_cubic_init, struct sock *sk) 179 179 { 180 180 struct bictcp *ca = inet_csk_ca(sk); 181 181 ··· 192 192 * The remaining tcp-cubic functions have an easier way. 193 193 */ 194 194 SEC("no-sec-prefix-bictcp_cwnd_event") 195 - void BPF_PROG(bictcp_cwnd_event, struct sock *sk, enum tcp_ca_event event) 195 + void BPF_PROG(bpf_cubic_cwnd_event, struct sock *sk, enum tcp_ca_event event) 196 196 { 197 197 if (event == CA_EVENT_TX_START) { 198 198 struct bictcp *ca = inet_csk_ca(sk); ··· 384 384 } 385 385 386 386 /* Or simply use the BPF_STRUCT_OPS to avoid the SEC boiler plate. */ 387 - void BPF_STRUCT_OPS(bictcp_cong_avoid, struct sock *sk, __u32 ack, __u32 acked) 387 + void BPF_STRUCT_OPS(bpf_cubic_cong_avoid, struct sock *sk, __u32 ack, __u32 acked) 388 388 { 389 389 struct tcp_sock *tp = tcp_sk(sk); 390 390 struct bictcp *ca = inet_csk_ca(sk); ··· 403 403 tcp_cong_avoid_ai(tp, ca->cnt, acked); 404 404 } 405 405 406 - __u32 BPF_STRUCT_OPS(bictcp_recalc_ssthresh, struct sock *sk) 406 + __u32 BPF_STRUCT_OPS(bpf_cubic_recalc_ssthresh, struct sock *sk) 407 407 { 408 408 const struct tcp_sock *tp = tcp_sk(sk); 409 409 struct bictcp *ca = inet_csk_ca(sk); ··· 420 420 return max((tp->snd_cwnd * beta) / BICTCP_BETA_SCALE, 2U); 421 421 } 422 422 423 - void BPF_STRUCT_OPS(bictcp_state, struct sock *sk, __u8 new_state) 423 + void BPF_STRUCT_OPS(bpf_cubic_state, struct sock *sk, __u8 new_state) 424 424 { 425 425 if (new_state == TCP_CA_Loss) { 426 426 bictcp_reset(inet_csk_ca(sk)); ··· 496 496 } 497 497 } 498 498 499 - void BPF_STRUCT_OPS(bictcp_acked, struct sock *sk, 499 + void BPF_STRUCT_OPS(bpf_cubic_acked, struct sock *sk, 500 500 const struct ack_sample *sample) 501 501 { 502 502 const struct tcp_sock *tp = tcp_sk(sk); ··· 
525 525 hystart_update(sk, delay); 526 526 } 527 527 528 - __u32 BPF_STRUCT_OPS(tcp_reno_undo_cwnd, struct sock *sk) 529 - { 530 - const struct tcp_sock *tp = tcp_sk(sk); 528 + extern __u32 tcp_reno_undo_cwnd(struct sock *sk) __ksym; 531 529 532 - return max(tp->snd_cwnd, tp->prior_cwnd); 530 + __u32 BPF_STRUCT_OPS(bpf_cubic_undo_cwnd, struct sock *sk) 531 + { 532 + return tcp_reno_undo_cwnd(sk); 533 533 } 534 534 535 535 SEC(".struct_ops") 536 536 struct tcp_congestion_ops cubic = { 537 - .init = (void *)bictcp_init, 538 - .ssthresh = (void *)bictcp_recalc_ssthresh, 539 - .cong_avoid = (void *)bictcp_cong_avoid, 540 - .set_state = (void *)bictcp_state, 541 - .undo_cwnd = (void *)tcp_reno_undo_cwnd, 542 - .cwnd_event = (void *)bictcp_cwnd_event, 543 - .pkts_acked = (void *)bictcp_acked, 537 + .init = (void *)bpf_cubic_init, 538 + .ssthresh = (void *)bpf_cubic_recalc_ssthresh, 539 + .cong_avoid = (void *)bpf_cubic_cong_avoid, 540 + .set_state = (void *)bpf_cubic_state, 541 + .undo_cwnd = (void *)bpf_cubic_undo_cwnd, 542 + .cwnd_event = (void *)bpf_cubic_cwnd_event, 543 + .pkts_acked = (void *)bpf_cubic_acked, 544 544 .name = "bpf_cubic", 545 545 };
+6 -16
tools/testing/selftests/bpf/progs/bpf_dctcp.c
··· 194 194 return max(tcp_sk(sk)->snd_cwnd, ca->loss_cwnd); 195 195 } 196 196 197 - SEC("struct_ops/tcp_reno_cong_avoid") 198 - void BPF_PROG(tcp_reno_cong_avoid, struct sock *sk, __u32 ack, __u32 acked) 197 + extern void tcp_reno_cong_avoid(struct sock *sk, __u32 ack, __u32 acked) __ksym; 198 + 199 + SEC("struct_ops/dctcp_reno_cong_avoid") 200 + void BPF_PROG(dctcp_cong_avoid, struct sock *sk, __u32 ack, __u32 acked) 199 201 { 200 - struct tcp_sock *tp = tcp_sk(sk); 201 - 202 - if (!tcp_is_cwnd_limited(sk)) 203 - return; 204 - 205 - /* In "safe" area, increase. */ 206 - if (tcp_in_slow_start(tp)) { 207 - acked = tcp_slow_start(tp, acked); 208 - if (!acked) 209 - return; 210 - } 211 - /* In dangerous area, increase slowly. */ 212 - tcp_cong_avoid_ai(tp, tp->snd_cwnd, acked); 202 + tcp_reno_cong_avoid(sk, ack, acked); 213 203 } 214 204 215 205 SEC(".struct_ops") ··· 216 226 .in_ack_event = (void *)dctcp_update_alpha, 217 227 .cwnd_event = (void *)dctcp_cwnd_event, 218 228 .ssthresh = (void *)dctcp_ssthresh, 219 - .cong_avoid = (void *)tcp_reno_cong_avoid, 229 + .cong_avoid = (void *)dctcp_cong_avoid, 220 230 .undo_cwnd = (void *)dctcp_cwnd_undo, 221 231 .set_state = (void *)dctcp_state, 222 232 .flags = TCP_CONG_NEEDS_ECN,
+47
tools/testing/selftests/bpf/progs/kfunc_call_test.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* Copyright (c) 2021 Facebook */ 3 + #include <linux/bpf.h> 4 + #include <bpf/bpf_helpers.h> 5 + #include "bpf_tcp_helpers.h" 6 + 7 + extern int bpf_kfunc_call_test2(struct sock *sk, __u32 a, __u32 b) __ksym; 8 + extern __u64 bpf_kfunc_call_test1(struct sock *sk, __u32 a, __u64 b, 9 + __u32 c, __u64 d) __ksym; 10 + 11 + SEC("classifier") 12 + int kfunc_call_test2(struct __sk_buff *skb) 13 + { 14 + struct bpf_sock *sk = skb->sk; 15 + 16 + if (!sk) 17 + return -1; 18 + 19 + sk = bpf_sk_fullsock(sk); 20 + if (!sk) 21 + return -1; 22 + 23 + return bpf_kfunc_call_test2((struct sock *)sk, 1, 2); 24 + } 25 + 26 + SEC("classifier") 27 + int kfunc_call_test1(struct __sk_buff *skb) 28 + { 29 + struct bpf_sock *sk = skb->sk; 30 + __u64 a = 1ULL << 32; 31 + __u32 ret; 32 + 33 + if (!sk) 34 + return -1; 35 + 36 + sk = bpf_sk_fullsock(sk); 37 + if (!sk) 38 + return -1; 39 + 40 + a = bpf_kfunc_call_test1((struct sock *)sk, 1, a | 2, 3, a | 4); 41 + ret = a >> 32; /* ret should be 2 */ 42 + ret += (__u32)a; /* ret should be 12 */ 43 + 44 + return ret; 45 + } 46 + 47 + char _license[] SEC("license") = "GPL";
+42
tools/testing/selftests/bpf/progs/kfunc_call_test_subprog.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* Copyright (c) 2021 Facebook */ 3 + #include <linux/bpf.h> 4 + #include <bpf/bpf_helpers.h> 5 + #include "bpf_tcp_helpers.h" 6 + 7 + extern const int bpf_prog_active __ksym; 8 + extern __u64 bpf_kfunc_call_test1(struct sock *sk, __u32 a, __u64 b, 9 + __u32 c, __u64 d) __ksym; 10 + extern struct sock *bpf_kfunc_call_test3(struct sock *sk) __ksym; 11 + int active_res = -1; 12 + int sk_state = -1; 13 + 14 + int __noinline f1(struct __sk_buff *skb) 15 + { 16 + struct bpf_sock *sk = skb->sk; 17 + int *active; 18 + 19 + if (!sk) 20 + return -1; 21 + 22 + sk = bpf_sk_fullsock(sk); 23 + if (!sk) 24 + return -1; 25 + 26 + active = (int *)bpf_per_cpu_ptr(&bpf_prog_active, 27 + bpf_get_smp_processor_id()); 28 + if (active) 29 + active_res = *active; 30 + 31 + sk_state = bpf_kfunc_call_test3((struct sock *)sk)->__sk_common.skc_state; 32 + 33 + return (__u32)bpf_kfunc_call_test1((struct sock *)sk, 1, 2, 3, 4); 34 + } 35 + 36 + SEC("classifier") 37 + int kfunc_call_test1(struct __sk_buff *skb) 38 + { 39 + return f1(skb); 40 + } 41 + 42 + char _license[] SEC("license") = "GPL";
+22
tools/testing/selftests/bpf/progs/test_sockmap_listen.c
··· 29 29 } verdict_map SEC(".maps"); 30 30 31 31 static volatile bool test_sockmap; /* toggled by user-space */ 32 + static volatile bool test_ingress; /* toggled by user-space */ 32 33 33 34 SEC("sk_skb/stream_parser") 34 35 int prog_stream_parser(struct __sk_buff *skb) ··· 48 47 verdict = bpf_sk_redirect_map(skb, &sock_map, zero, 0); 49 48 else 50 49 verdict = bpf_sk_redirect_hash(skb, &sock_hash, &zero, 0); 50 + 51 + count = bpf_map_lookup_elem(&verdict_map, &verdict); 52 + if (count) 53 + (*count)++; 54 + 55 + return verdict; 56 + } 57 + 58 + SEC("sk_skb/skb_verdict") 59 + int prog_skb_verdict(struct __sk_buff *skb) 60 + { 61 + unsigned int *count; 62 + __u32 zero = 0; 63 + int verdict; 64 + 65 + if (test_sockmap) 66 + verdict = bpf_sk_redirect_map(skb, &sock_map, zero, 67 + test_ingress ? BPF_F_INGRESS : 0); 68 + else 69 + verdict = bpf_sk_redirect_hash(skb, &sock_hash, &zero, 70 + test_ingress ? BPF_F_INGRESS : 0); 51 71 52 72 count = bpf_map_lookup_elem(&verdict_map, &verdict); 53 73 if (count)
+18
tools/testing/selftests/bpf/progs/test_sockmap_skb_verdict_attach.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + #include "vmlinux.h" 3 + #include <bpf/bpf_helpers.h> 4 + 5 + struct { 6 + __uint(type, BPF_MAP_TYPE_SOCKMAP); 7 + __uint(max_entries, 2); 8 + __type(key, __u32); 9 + __type(value, __u64); 10 + } sock_map SEC(".maps"); 11 + 12 + SEC("sk_skb/skb_verdict") 13 + int prog_skb_verdict(struct __sk_buff *skb) 14 + { 15 + return SK_DROP; 16 + } 17 + 18 + char _license[] SEC("license") = "GPL";
+2 -1
tools/testing/selftests/bpf/test_xsk.sh
··· 107 107 echo "setting up ${VETH0}: namespace: ${NS0}" 108 108 fi 109 109 ip netns add ${NS1} 110 - ip link add ${VETH0} type veth peer name ${VETH1} 110 + ip link add ${VETH0} numtxqueues 4 numrxqueues 4 type veth peer name ${VETH1} numtxqueues 4 numrxqueues 4 111 111 if [ -f /proc/net/if_inet6 ]; then 112 112 echo 1 > /proc/sys/net/ipv6/conf/${VETH0}/disable_ipv6 113 113 fi ··· 118 118 ip netns exec ${NS1} ip link set ${VETH1} mtu ${MTU} 119 119 ip link set ${VETH0} mtu ${MTU} 120 120 ip netns exec ${NS1} ip link set ${VETH1} up 121 + ip netns exec ${NS1} ip link set dev lo up 121 122 ip link set ${VETH0} up 122 123 } 123 124
+6 -6
tools/testing/selftests/bpf/verifier/calls.c
··· 19 19 BPF_MOV64_IMM(BPF_REG_0, 2), 20 20 BPF_EXIT_INSN(), 21 21 }, 22 - .errstr_unpriv = "function calls to other bpf functions are allowed for", 22 + .errstr_unpriv = "loading/calling other bpf or kernel functions are allowed for", 23 23 .result_unpriv = REJECT, 24 24 .result = ACCEPT, 25 25 .retval = 1, ··· 136 136 { 137 137 "calls: wrong src reg", 138 138 .insns = { 139 - BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 2, 0, 0), 139 + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 3, 0, 0), 140 140 BPF_MOV64_IMM(BPF_REG_0, 1), 141 141 BPF_EXIT_INSN(), 142 142 }, ··· 397 397 BPF_MOV64_IMM(BPF_REG_0, 1), 398 398 BPF_EXIT_INSN(), 399 399 }, 400 - .errstr_unpriv = "function calls to other bpf functions are allowed for", 400 + .errstr_unpriv = "loading/calling other bpf or kernel functions are allowed for", 401 401 .fixup_map_hash_48b = { 3 }, 402 402 .result_unpriv = REJECT, 403 403 .result = ACCEPT, ··· 1977 1977 BPF_EXIT_INSN(), 1978 1978 }, 1979 1979 .prog_type = BPF_PROG_TYPE_SOCKET_FILTER, 1980 - .errstr_unpriv = "function calls to other bpf functions are allowed for", 1980 + .errstr_unpriv = "loading/calling other bpf or kernel functions are allowed for", 1981 1981 .result_unpriv = REJECT, 1982 1982 .result = ACCEPT, 1983 1983 }, ··· 2003 2003 BPF_EXIT_INSN(), 2004 2004 }, 2005 2005 .prog_type = BPF_PROG_TYPE_SOCKET_FILTER, 2006 - .errstr_unpriv = "function calls to other bpf functions are allowed for", 2006 + .errstr_unpriv = "loading/calling other bpf or kernel functions are allowed for", 2007 2007 .errstr = "!read_ok", 2008 2008 .result = REJECT, 2009 2009 }, ··· 2028 2028 BPF_EXIT_INSN(), 2029 2029 }, 2030 2030 .prog_type = BPF_PROG_TYPE_SOCKET_FILTER, 2031 - .errstr_unpriv = "function calls to other bpf functions are allowed for", 2031 + .errstr_unpriv = "loading/calling other bpf or kernel functions are allowed for", 2032 2032 .errstr = "!read_ok", 2033 2033 .result = REJECT, 2034 2034 },
+5 -5
tools/testing/selftests/bpf/verifier/dead_code.c
··· 85 85 BPF_MOV64_IMM(BPF_REG_0, 12), 86 86 BPF_EXIT_INSN(), 87 87 }, 88 - .errstr_unpriv = "function calls to other bpf functions are allowed for", 88 + .errstr_unpriv = "loading/calling other bpf or kernel functions are allowed for", 89 89 .result_unpriv = REJECT, 90 90 .result = ACCEPT, 91 91 .retval = 7, ··· 103 103 BPF_MOV64_IMM(BPF_REG_0, 12), 104 104 BPF_EXIT_INSN(), 105 105 }, 106 - .errstr_unpriv = "function calls to other bpf functions are allowed for", 106 + .errstr_unpriv = "loading/calling other bpf or kernel functions are allowed for", 107 107 .result_unpriv = REJECT, 108 108 .result = ACCEPT, 109 109 .retval = 7, ··· 121 121 BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, -5), 122 122 BPF_EXIT_INSN(), 123 123 }, 124 - .errstr_unpriv = "function calls to other bpf functions are allowed for", 124 + .errstr_unpriv = "loading/calling other bpf or kernel functions are allowed for", 125 125 .result_unpriv = REJECT, 126 126 .result = ACCEPT, 127 127 .retval = 7, ··· 137 137 BPF_MOV64_REG(BPF_REG_0, BPF_REG_1), 138 138 BPF_EXIT_INSN(), 139 139 }, 140 - .errstr_unpriv = "function calls to other bpf functions are allowed for", 140 + .errstr_unpriv = "loading/calling other bpf or kernel functions are allowed for", 141 141 .result_unpriv = REJECT, 142 142 .result = ACCEPT, 143 143 .retval = 2, ··· 152 152 BPF_MOV64_REG(BPF_REG_0, BPF_REG_1), 153 153 BPF_EXIT_INSN(), 154 154 }, 155 - .errstr_unpriv = "function calls to other bpf functions are allowed for", 155 + .errstr_unpriv = "loading/calling other bpf or kernel functions are allowed for", 156 156 .result_unpriv = REJECT, 157 157 .result = ACCEPT, 158 158 .retval = 2,
+28 -11
tools/testing/selftests/bpf/vmtest.sh
··· 24 24 usage() 25 25 { 26 26 cat <<EOF 27 - Usage: $0 [-i] [-d <output_dir>] -- [<command>] 27 + Usage: $0 [-i] [-s] [-d <output_dir>] -- [<command>] 28 28 29 29 <command> is the command you would normally run when you are in 30 30 tools/testing/selftests/bpf. e.g: 31 31 32 32 $0 -- ./test_progs -t test_lsm 33 33 34 - If no command is specified, "${DEFAULT_COMMAND}" will be run by 35 - default. 34 + If no command is specified and a debug shell (-s) is not requested, 35 + "${DEFAULT_COMMAND}" will be run by default. 36 36 37 37 If you build your kernel using KBUILD_OUTPUT= or O= options, these 38 38 can be passed as environment variables to the script: ··· 49 49 -d) Update the output directory (default: ${OUTPUT_DIR}) 50 50 -j) Number of jobs for compilation, similar to -j in make 51 51 (default: ${NUM_COMPILE_JOBS}) 52 + -s) Instead of powering off the VM, start an interactive 53 + shell. If <command> is specified, the shell runs after 54 + the command finishes executing 52 55 EOF 53 56 } 54 57 ··· 152 149 local init_script_dir="${OUTPUT_DIR}/${MOUNT_DIR}/etc/rcS.d" 153 150 local init_script="${init_script_dir}/S50-startup" 154 151 local command="$1" 152 + local exit_command="$2" 155 153 156 154 mount_image 157 155 ··· 166 162 167 163 fi 168 164 169 - sudo bash -c "cat >${init_script}" <<EOF 170 - #!/bin/bash 165 + sudo bash -c "echo '#!/bin/bash' > ${init_script}" 171 166 167 + if [[ "${command}" != "" ]]; then 168 + sudo bash -c "cat >>${init_script}" <<EOF 172 169 # Have a default value in the exit status file 173 170 # incase the VM is forcefully stopped. 174 171 echo "130" > "/root/${EXIT_STATUS_FILE}" ··· 180 175 stdbuf -oL -eL ${command} 181 176 echo "\$?" 
> "/root/${EXIT_STATUS_FILE}" 182 177 } 2>&1 | tee "/root/${LOG_FILE}" 183 - poweroff -f 178 + # Ensure that the logs are written to disk 179 + sync 184 180 EOF 181 + fi 185 182 183 + sudo bash -c "echo ${exit_command} >> ${init_script}" 186 184 sudo chmod a+x "${init_script}" 187 185 unmount_image 188 186 } ··· 285 277 local kernel_bzimage="${kernel_checkout}/${X86_BZIMAGE}" 286 278 local command="${DEFAULT_COMMAND}" 287 279 local update_image="no" 280 + local exit_command="poweroff -f" 281 + local debug_shell="no" 288 282 289 - while getopts 'hkid:j:' opt; do 283 + while getopts 'hskid:j:' opt; do 290 284 case ${opt} in 291 285 i) 292 286 update_image="yes" ··· 298 288 ;; 299 289 j) 300 290 NUM_COMPILE_JOBS="$OPTARG" 291 + ;; 292 + s) 293 + command="" 294 + debug_shell="yes" 295 + exit_command="bash" 301 296 ;; 302 297 h) 303 298 usage ··· 322 307 done 323 308 shift $((OPTIND -1)) 324 309 325 - if [[ $# -eq 0 ]]; then 310 + if [[ $# -eq 0 && "${debug_shell}" == "no" ]]; then 326 311 echo "No command specified, will run ${DEFAULT_COMMAND} in the vm" 327 312 else 328 313 command="$@" ··· 370 355 fi 371 356 372 357 update_selftests "${kernel_checkout}" "${make_command}" 373 - update_init_script "${command}" 358 + update_init_script "${command}" "${exit_command}" 374 359 run_vm "${kernel_bzimage}" 375 - copy_logs 376 - echo "Logs saved in ${OUTPUT_DIR}/${LOG_FILE}" 360 + if [[ "${command}" != "" ]]; then 361 + copy_logs 362 + echo "Logs saved in ${OUTPUT_DIR}/${LOG_FILE}" 363 + fi 377 364 } 378 365 379 366 catch()
+322 -388
tools/testing/selftests/bpf/xdpxceiver.c
··· 41 41 * Reduce the size of the RX ring to a fraction of the fill ring size. 42 42 * iv. fill queue empty 43 43 * Do not populate the fill queue and then try to receive pkts. 44 + * f. bpf_link resource persistence 45 + * Configure sockets at indexes 0 and 1, run a traffic on queue ids 0, 46 + * then remove xsk sockets from queue 0 on both veth interfaces and 47 + * finally run a traffic on queues ids 1 44 48 * 45 - * Total tests: 10 49 + * Total tests: 12 46 50 * 47 51 * Flow: 48 52 * ----- ··· 97 93 #include "xdpxceiver.h" 98 94 #include "../kselftest.h" 99 95 96 + static const char *MAC1 = "\x00\x0A\x56\x9E\xEE\x62"; 97 + static const char *MAC2 = "\x00\x0A\x56\x9E\xEE\x61"; 98 + static const char *IP1 = "192.168.100.162"; 99 + static const char *IP2 = "192.168.100.161"; 100 + static const u16 UDP_PORT1 = 2020; 101 + static const u16 UDP_PORT2 = 2121; 102 + 100 103 static void __exit_with_error(int error, const char *file, const char *func, int line) 101 104 { 102 105 if (configured_mode == TEST_MODE_UNCONFIGURED) { ··· 119 108 #define exit_with_error(error) __exit_with_error(error, __FILE__, __func__, __LINE__) 120 109 121 110 #define print_ksft_result(void)\ 122 - (ksft_test_result_pass("PASS: %s %s %s%s%s\n", configured_mode ? "DRV" : "SKB",\ 111 + (ksft_test_result_pass("PASS: %s %s %s%s%s%s\n", configured_mode ? "DRV" : "SKB",\ 123 112 test_type == TEST_TYPE_POLL ? "POLL" : "NOPOLL",\ 124 113 test_type == TEST_TYPE_TEARDOWN ? "Socket Teardown" : "",\ 125 114 test_type == TEST_TYPE_BIDI ? "Bi-directional Sockets" : "",\ 126 - test_type == TEST_TYPE_STATS ? 
"Stats" : "")) 127 - 128 - static void pthread_init_mutex(void) 129 - { 130 - pthread_mutex_init(&sync_mutex, NULL); 131 - pthread_mutex_init(&sync_mutex_tx, NULL); 132 - pthread_cond_init(&signal_rx_condition, NULL); 133 - pthread_cond_init(&signal_tx_condition, NULL); 134 - } 135 - 136 - static void pthread_destroy_mutex(void) 137 - { 138 - pthread_mutex_destroy(&sync_mutex); 139 - pthread_mutex_destroy(&sync_mutex_tx); 140 - pthread_cond_destroy(&signal_rx_condition); 141 - pthread_cond_destroy(&signal_tx_condition); 142 - } 115 + test_type == TEST_TYPE_STATS ? "Stats" : "",\ 116 + test_type == TEST_TYPE_BPF_RES ? "BPF RES" : "")) 143 117 144 118 static void *memset32_htonl(void *dest, u32 val, u32 size) 145 119 { ··· 143 147 } 144 148 145 149 /* 146 - * This function code has been taken from 147 - * Linux kernel lib/checksum.c 148 - */ 149 - static inline unsigned short from32to16(unsigned int x) 150 - { 151 - /* add up 16-bit and 16-bit for 16+c bit */ 152 - x = (x & 0xffff) + (x >> 16); 153 - /* add up carry.. 
*/ 154 - x = (x & 0xffff) + (x >> 16); 155 - return x; 156 - } 157 - 158 - /* 159 150 * Fold a partial checksum 160 151 * This function code has been taken from 161 152 * Linux kernel include/asm-generic/checksum.h 162 153 */ 163 - static inline __u16 csum_fold(__u32 csum) 154 + static __u16 csum_fold(__u32 csum) 164 155 { 165 156 u32 sum = (__force u32)csum; 166 157 ··· 160 177 * This function code has been taken from 161 178 * Linux kernel lib/checksum.c 162 179 */ 163 - static inline u32 from64to32(u64 x) 180 + static u32 from64to32(u64 x) 164 181 { 165 182 /* add up 32-bit and 32-bit for 32+c bit */ 166 183 x = (x & 0xffffffff) + (x >> 32); ··· 169 186 return (u32)x; 170 187 } 171 188 172 - __u32 csum_tcpudp_nofold(__be32 saddr, __be32 daddr, __u32 len, __u8 proto, __u32 sum); 173 - 174 189 /* 175 190 * This function code has been taken from 176 191 * Linux kernel lib/checksum.c 177 192 */ 178 - __u32 csum_tcpudp_nofold(__be32 saddr, __be32 daddr, __u32 len, __u8 proto, __u32 sum) 193 + static __u32 csum_tcpudp_nofold(__be32 saddr, __be32 daddr, __u32 len, __u8 proto, __u32 sum) 179 194 { 180 195 unsigned long long s = (__force u32)sum; 181 196 ··· 191 210 * This function has been taken from 192 211 * Linux kernel include/asm-generic/checksum.h 193 212 */ 194 - static inline __u16 195 - csum_tcpudp_magic(__be32 saddr, __be32 daddr, __u32 len, __u8 proto, __u32 sum) 213 + static __u16 csum_tcpudp_magic(__be32 saddr, __be32 daddr, __u32 len, __u8 proto, __u32 sum) 196 214 { 197 215 return csum_fold(csum_tcpudp_nofold(saddr, daddr, len, proto, sum)); 198 216 } 199 217 200 - static inline u16 udp_csum(u32 saddr, u32 daddr, u32 len, u8 proto, u16 *udp_pkt) 218 + static u16 udp_csum(u32 saddr, u32 daddr, u32 len, u8 proto, u16 *udp_pkt) 201 219 { 202 220 u32 csum = 0; 203 221 u32 cnt = 0; ··· 251 271 memcpy(xsk_umem__get_data(umem->buffer, addr), pkt_data, PKT_SIZE); 252 272 } 253 273 254 - static void xsk_configure_umem(struct ifobject *data, void *buffer, u64 size) 
274 + static void xsk_configure_umem(struct ifobject *data, void *buffer, int idx) 255 275 { 256 - int ret; 257 276 struct xsk_umem_config cfg = { 258 277 .fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS, 259 278 .comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS, ··· 260 281 .frame_headroom = frame_headroom, 261 282 .flags = XSK_UMEM__DEFAULT_FLAGS 262 283 }; 284 + int size = num_frames * XSK_UMEM__DEFAULT_FRAME_SIZE; 285 + struct xsk_umem_info *umem; 286 + int ret; 263 287 264 - data->umem = calloc(1, sizeof(struct xsk_umem_info)); 265 - if (!data->umem) 288 + umem = calloc(1, sizeof(struct xsk_umem_info)); 289 + if (!umem) 266 290 exit_with_error(errno); 267 291 268 - ret = xsk_umem__create(&data->umem->umem, buffer, size, 269 - &data->umem->fq, &data->umem->cq, &cfg); 292 + ret = xsk_umem__create(&umem->umem, buffer, size, 293 + &umem->fq, &umem->cq, &cfg); 270 294 if (ret) 271 295 exit_with_error(ret); 272 296 273 - data->umem->buffer = buffer; 297 + umem->buffer = buffer; 298 + 299 + data->umem_arr[idx] = umem; 274 300 } 275 301 276 302 static void xsk_populate_fill_ring(struct xsk_umem_info *umem) ··· 291 307 xsk_ring_prod__submit(&umem->fq, XSK_RING_PROD__DEFAULT_NUM_DESCS); 292 308 } 293 309 294 - static int xsk_configure_socket(struct ifobject *ifobject) 310 + static int xsk_configure_socket(struct ifobject *ifobject, int idx) 295 311 { 296 312 struct xsk_socket_config cfg; 313 + struct xsk_socket_info *xsk; 297 314 struct xsk_ring_cons *rxr; 298 315 struct xsk_ring_prod *txr; 299 316 int ret; 300 317 301 - ifobject->xsk = calloc(1, sizeof(struct xsk_socket_info)); 302 - if (!ifobject->xsk) 318 + xsk = calloc(1, sizeof(struct xsk_socket_info)); 319 + if (!xsk) 303 320 exit_with_error(errno); 304 321 305 - ifobject->xsk->umem = ifobject->umem; 322 + xsk->umem = ifobject->umem; 306 323 cfg.rx_size = rxqsize; 307 324 cfg.tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS; 308 325 cfg.libbpf_flags = 0; ··· 311 326 cfg.bind_flags = xdp_bind_flags; 312 327 313 328 if 
(test_type != TEST_TYPE_BIDI) { 314 - rxr = (ifobject->fv.vector == rx) ? &ifobject->xsk->rx : NULL; 315 - txr = (ifobject->fv.vector == tx) ? &ifobject->xsk->tx : NULL; 329 + rxr = (ifobject->fv.vector == rx) ? &xsk->rx : NULL; 330 + txr = (ifobject->fv.vector == tx) ? &xsk->tx : NULL; 316 331 } else { 317 - rxr = &ifobject->xsk->rx; 318 - txr = &ifobject->xsk->tx; 332 + rxr = &xsk->rx; 333 + txr = &xsk->tx; 319 334 } 320 335 321 - ret = xsk_socket__create(&ifobject->xsk->xsk, ifobject->ifname, 322 - opt_queue, ifobject->umem->umem, rxr, txr, &cfg); 323 - 336 + ret = xsk_socket__create(&xsk->xsk, ifobject->ifname, idx, 337 + ifobject->umem->umem, rxr, txr, &cfg); 324 338 if (ret) 325 339 return 1; 340 + 341 + ifobject->xsk_arr[idx] = xsk; 326 342 327 343 return 0; 328 344 } ··· 350 364 ksft_print_msg(str, prog); 351 365 } 352 366 353 - static bool switch_namespace(int idx) 367 + static int switch_namespace(const char *nsname) 354 368 { 355 369 char fqns[26] = "/var/run/netns/"; 356 370 int nsfd; 357 371 358 - strncat(fqns, ifdict[idx]->nsname, sizeof(fqns) - strlen(fqns) - 1); 372 + if (!nsname || strlen(nsname) == 0) 373 + return -1; 374 + 375 + strncat(fqns, nsname, sizeof(fqns) - strlen(fqns) - 1); 359 376 nsfd = open(fqns, O_RDONLY); 360 377 361 378 if (nsfd == -1) ··· 367 378 if (setns(nsfd, 0) == -1) 368 379 exit_with_error(errno); 369 380 370 - return true; 371 - } 381 + print_verbose("NS switched: %s\n", nsname); 372 382 373 - static void *nsswitchthread(void *args) 374 - { 375 - struct targs *targs = args; 376 - 377 - targs->retptr = false; 378 - 379 - if (switch_namespace(targs->idx)) { 380 - ifdict[targs->idx]->ifindex = if_nametoindex(ifdict[targs->idx]->ifname); 381 - if (!ifdict[targs->idx]->ifindex) { 382 - ksft_test_result_fail("ERROR: [%s] interface \"%s\" does not exist\n", 383 - __func__, ifdict[targs->idx]->ifname); 384 - } else { 385 - print_verbose("Interface found: %s\n", ifdict[targs->idx]->ifname); 386 - targs->retptr = true; 387 - } 388 - 
} 389 - pthread_exit(NULL); 383 + return nsfd; 390 384 } 391 385 392 386 static int validate_interfaces(void) ··· 380 408 if (!strcmp(ifdict[i]->ifname, "")) { 381 409 ret = false; 382 410 ksft_test_result_fail("ERROR: interfaces: -i <int>,<ns> -i <int>,<ns>."); 383 - } 384 - if (strcmp(ifdict[i]->nsname, "")) { 385 - struct targs *targs; 386 - 387 - targs = malloc(sizeof(*targs)); 388 - if (!targs) 389 - exit_with_error(errno); 390 - 391 - targs->idx = i; 392 - if (pthread_create(&ns_thread, NULL, nsswitchthread, targs)) 393 - exit_with_error(errno); 394 - 395 - pthread_join(ns_thread, NULL); 396 - 397 - if (targs->retptr) 398 - print_verbose("NS switched: %s\n", ifdict[i]->nsname); 399 - 400 - free(targs); 401 - } else { 402 - ifdict[i]->ifindex = if_nametoindex(ifdict[i]->ifname); 403 - if (!ifdict[i]->ifindex) { 404 - ksft_test_result_fail 405 - ("ERROR: interface \"%s\" does not exist\n", ifdict[i]->ifname); 406 - ret = false; 407 - } else { 408 - print_verbose("Interface found: %s\n", ifdict[i]->ifname); 409 - } 410 411 } 411 412 } 412 413 return ret; ··· 392 447 opterr = 0; 393 448 394 449 for (;;) { 395 - c = getopt_long(argc, argv, "i:q:DC:v", long_options, &option_index); 450 + c = getopt_long(argc, argv, "i:DC:v", long_options, &option_index); 396 451 397 452 if (c == -1) 398 453 break; ··· 411 466 memcpy(ifdict[interface_index]->nsname, token, 412 467 MAX_INTERFACES_NAMESPACE_CHARS); 413 468 interface_index++; 414 - break; 415 - case 'q': 416 - opt_queue = atoi(optarg); 417 469 break; 418 470 case 'D': 419 471 debug_pkt_dump = 1; ··· 448 506 exit_with_error(errno); 449 507 } 450 508 451 - static inline void complete_tx_only(struct xsk_socket_info *xsk, int batch_size) 509 + static void complete_tx_only(struct xsk_socket_info *xsk, int batch_size) 452 510 { 453 511 unsigned int rcvd; 454 512 u32 idx; ··· 456 514 if (!xsk->outstanding_tx) 457 515 return; 458 516 459 - if (!NEED_WAKEUP || xsk_ring_prod__needs_wakeup(&xsk->tx)) 517 + if 
(xsk_ring_prod__needs_wakeup(&xsk->tx)) 460 518 kick_tx(xsk); 461 519 462 520 rcvd = xsk_ring_cons__peek(&xsk->umem->cq, batch_size, &idx); ··· 544 602 xsk_ring_prod__submit(&xsk->tx, batch_size); 545 603 if (!tx_invalid_test) { 546 604 xsk->outstanding_tx += batch_size; 547 - } else { 548 - if (!NEED_WAKEUP || xsk_ring_prod__needs_wakeup(&xsk->tx)) 549 - kick_tx(xsk); 605 + } else if (xsk_ring_prod__needs_wakeup(&xsk->tx)) { 606 + kick_tx(xsk); 550 607 } 551 608 *frameptr += batch_size; 552 609 *frameptr %= num_frames; 553 610 complete_tx_only(xsk, batch_size); 554 611 } 555 612 556 - static inline int get_batch_size(int pkt_cnt) 613 + static int get_batch_size(int pkt_cnt) 557 614 { 558 615 if (!opt_pkt_count) 559 616 return BATCH_SIZE; ··· 608 667 609 668 static void worker_pkt_dump(void) 610 669 { 611 - struct in_addr ipaddr; 670 + struct ethhdr *ethhdr; 671 + struct iphdr *iphdr; 672 + struct udphdr *udphdr; 673 + char s[128]; 674 + int payload; 675 + void *ptr; 612 676 613 677 fprintf(stdout, "---------------------------------------\n"); 614 678 for (int iter = 0; iter < num_frames - 1; iter++) { 679 + ptr = pkt_buf[iter]->payload; 680 + ethhdr = ptr; 681 + iphdr = ptr + sizeof(*ethhdr); 682 + udphdr = ptr + sizeof(*ethhdr) + sizeof(*iphdr); 683 + 615 684 /*extract L2 frame */ 616 685 fprintf(stdout, "DEBUG>> L2: dst mac: "); 617 686 for (int i = 0; i < ETH_ALEN; i++) 618 - fprintf(stdout, "%02X", ((struct ethhdr *) 619 - pkt_buf[iter]->payload)->h_dest[i]); 687 + fprintf(stdout, "%02X", ethhdr->h_dest[i]); 620 688 621 689 fprintf(stdout, "\nDEBUG>> L2: src mac: "); 622 690 for (int i = 0; i < ETH_ALEN; i++) 623 - fprintf(stdout, "%02X", ((struct ethhdr *) 624 - pkt_buf[iter]->payload)->h_source[i]); 691 + fprintf(stdout, "%02X", ethhdr->h_source[i]); 625 692 626 693 /*extract L3 frame */ 627 - fprintf(stdout, "\nDEBUG>> L3: ip_hdr->ihl: %02X\n", 628 - ((struct iphdr *)(pkt_buf[iter]->payload + sizeof(struct ethhdr)))->ihl); 629 - 630 - ipaddr.s_addr = 631 - 
((struct iphdr *)(pkt_buf[iter]->payload + sizeof(struct ethhdr)))->saddr; 632 - fprintf(stdout, "DEBUG>> L3: ip_hdr->saddr: %s\n", inet_ntoa(ipaddr)); 633 - 634 - ipaddr.s_addr = 635 - ((struct iphdr *)(pkt_buf[iter]->payload + sizeof(struct ethhdr)))->daddr; 636 - fprintf(stdout, "DEBUG>> L3: ip_hdr->daddr: %s\n", inet_ntoa(ipaddr)); 637 - 694 + fprintf(stdout, "\nDEBUG>> L3: ip_hdr->ihl: %02X\n", iphdr->ihl); 695 + fprintf(stdout, "DEBUG>> L3: ip_hdr->saddr: %s\n", 696 + inet_ntop(AF_INET, &iphdr->saddr, s, sizeof(s))); 697 + fprintf(stdout, "DEBUG>> L3: ip_hdr->daddr: %s\n", 698 + inet_ntop(AF_INET, &iphdr->daddr, s, sizeof(s))); 638 699 /*extract L4 frame */ 639 - fprintf(stdout, "DEBUG>> L4: udp_hdr->src: %d\n", 640 - ntohs(((struct udphdr *)(pkt_buf[iter]->payload + 641 - sizeof(struct ethhdr) + 642 - sizeof(struct iphdr)))->source)); 643 - 644 - fprintf(stdout, "DEBUG>> L4: udp_hdr->dst: %d\n", 645 - ntohs(((struct udphdr *)(pkt_buf[iter]->payload + 646 - sizeof(struct ethhdr) + 647 - sizeof(struct iphdr)))->dest)); 700 + fprintf(stdout, "DEBUG>> L4: udp_hdr->src: %d\n", ntohs(udphdr->source)); 701 + fprintf(stdout, "DEBUG>> L4: udp_hdr->dst: %d\n", ntohs(udphdr->dest)); 648 702 /*extract L5 frame */ 649 - int payload = *((uint32_t *)(pkt_buf[iter]->payload + PKT_HDR_SIZE)); 703 + payload = *((uint32_t *)(ptr + PKT_HDR_SIZE)); 650 704 651 705 if (payload == EOT) { 652 706 print_verbose("End-of-transmission frame received\n"); ··· 745 809 } 746 810 } 747 811 748 - static void thread_common_ops(struct ifobject *ifobject, void *bufs, pthread_mutex_t *mutexptr, 749 - atomic_int *spinningptr) 812 + static void thread_common_ops(struct ifobject *ifobject, void *bufs) 750 813 { 814 + int umem_sz = num_frames * XSK_UMEM__DEFAULT_FRAME_SIZE; 751 815 int ctr = 0; 752 816 int ret; 753 817 754 - xsk_configure_umem(ifobject, bufs, num_frames * XSK_UMEM__DEFAULT_FRAME_SIZE); 755 - ret = xsk_configure_socket(ifobject); 818 + ifobject->ns_fd = 
switch_namespace(ifobject->nsname); 819 + 820 + if (test_type == TEST_TYPE_BPF_RES) 821 + umem_sz *= 2; 822 + 823 + bufs = mmap(NULL, umem_sz, 824 + PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); 825 + if (bufs == MAP_FAILED) 826 + exit_with_error(errno); 827 + 828 + xsk_configure_umem(ifobject, bufs, 0); 829 + ifobject->umem = ifobject->umem_arr[0]; 830 + ret = xsk_configure_socket(ifobject, 0); 756 831 757 832 /* Retry Create Socket if it fails as xsk_socket__create() 758 833 * is asynchronous 759 - * 760 - * Essential to lock Mutex here to prevent Tx thread from 761 - * entering before Rx and causing a deadlock 762 834 */ 763 - pthread_mutex_lock(mutexptr); 764 835 while (ret && ctr < SOCK_RECONF_CTR) { 765 - atomic_store(spinningptr, 1); 766 - xsk_configure_umem(ifobject, bufs, num_frames * XSK_UMEM__DEFAULT_FRAME_SIZE); 767 - ret = xsk_configure_socket(ifobject); 836 + xsk_configure_umem(ifobject, bufs, 0); 837 + ifobject->umem = ifobject->umem_arr[0]; 838 + ret = xsk_configure_socket(ifobject, 0); 768 839 usleep(USLEEP_MAX); 769 840 ctr++; 770 841 } 771 - atomic_store(spinningptr, 0); 772 - pthread_mutex_unlock(mutexptr); 773 842 774 843 if (ctr >= SOCK_RECONF_CTR) 775 844 exit_with_error(ret); 845 + 846 + ifobject->umem = ifobject->umem_arr[0]; 847 + ifobject->xsk = ifobject->xsk_arr[0]; 848 + 849 + if (test_type == TEST_TYPE_BPF_RES) { 850 + xsk_configure_umem(ifobject, (u8 *)bufs + (umem_sz / 2), 1); 851 + ifobject->umem = ifobject->umem_arr[1]; 852 + ret = xsk_configure_socket(ifobject, 1); 853 + } 854 + 855 + ifobject->umem = ifobject->umem_arr[0]; 856 + ifobject->xsk = ifobject->xsk_arr[0]; 857 + print_verbose("Interface [%s] vector [%s]\n", 858 + ifobject->ifname, ifobject->fv.vector == tx ? 
"Tx" : "Rx"); 776 859 } 777 860 778 - static void *worker_testapp_validate(void *arg) 861 + static bool testapp_is_test_two_stepped(void) 862 + { 863 + return (test_type != TEST_TYPE_BIDI && test_type != TEST_TYPE_BPF_RES) || second_step; 864 + } 865 + 866 + static void testapp_cleanup_xsk_res(struct ifobject *ifobj) 867 + { 868 + if (testapp_is_test_two_stepped()) { 869 + xsk_socket__delete(ifobj->xsk->xsk); 870 + (void)xsk_umem__delete(ifobj->umem->umem); 871 + } 872 + } 873 + 874 + static void *worker_testapp_validate_tx(void *arg) 779 875 { 780 876 struct udphdr *udp_hdr = 781 877 (struct udphdr *)(pkt_data + sizeof(struct ethhdr) + sizeof(struct iphdr)); ··· 817 849 struct generic_data data; 818 850 void *bufs = NULL; 819 851 820 - pthread_attr_setstacksize(&attr, THREAD_STACK); 852 + if (!second_step) 853 + thread_common_ops(ifobject, bufs); 821 854 822 - if (!bidi_pass) { 823 - bufs = mmap(NULL, num_frames * XSK_UMEM__DEFAULT_FRAME_SIZE, 824 - PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); 825 - if (bufs == MAP_FAILED) 855 + for (int i = 0; i < num_frames; i++) { 856 + /*send EOT frame */ 857 + if (i == (num_frames - 1)) 858 + data.seqnum = -1; 859 + else 860 + data.seqnum = i; 861 + gen_udp_hdr(&data, ifobject, udp_hdr); 862 + gen_ip_hdr(ifobject, ip_hdr); 863 + gen_udp_csum(udp_hdr, ip_hdr); 864 + gen_eth_hdr(ifobject, eth_hdr); 865 + gen_eth_frame(ifobject->umem, i * XSK_UMEM__DEFAULT_FRAME_SIZE); 866 + } 867 + 868 + print_verbose("Sending %d packets on interface %s\n", 869 + (opt_pkt_count - 1), ifobject->ifname); 870 + tx_only_all(ifobject); 871 + 872 + testapp_cleanup_xsk_res(ifobject); 873 + pthread_exit(NULL); 874 + } 875 + 876 + static void *worker_testapp_validate_rx(void *arg) 877 + { 878 + struct ifobject *ifobject = (struct ifobject *)arg; 879 + struct pollfd fds[MAX_SOCKS] = { }; 880 + void *bufs = NULL; 881 + 882 + if (!second_step) 883 + thread_common_ops(ifobject, bufs); 884 + 885 + if (stat_test_type != 
STAT_TEST_RX_FILL_EMPTY) 886 + xsk_populate_fill_ring(ifobject->umem); 887 + 888 + TAILQ_INIT(&head); 889 + if (debug_pkt_dump) { 890 + pkt_buf = calloc(num_frames, sizeof(*pkt_buf)); 891 + if (!pkt_buf) 826 892 exit_with_error(errno); 827 - 828 - if (strcmp(ifobject->nsname, "")) 829 - switch_namespace(ifobject->ifdict_index); 830 893 } 831 894 832 - if (ifobject->fv.vector == tx) { 833 - int spinningrxctr = 0; 895 + fds[0].fd = xsk_socket__fd(ifobject->xsk->xsk); 896 + fds[0].events = POLLIN; 834 897 835 - if (!bidi_pass) 836 - thread_common_ops(ifobject, bufs, &sync_mutex_tx, &spinning_tx); 898 + pthread_barrier_wait(&barr); 837 899 838 - while (atomic_load(&spinning_rx) && spinningrxctr < SOCK_RECONF_CTR) { 839 - spinningrxctr++; 840 - usleep(USLEEP_MAX); 900 + while (1) { 901 + if (test_type != TEST_TYPE_STATS) { 902 + rx_pkt(ifobject->xsk, fds); 903 + worker_pkt_validate(); 904 + } else { 905 + worker_stats_validate(ifobject); 841 906 } 842 - 843 - print_verbose("Interface [%s] vector [Tx]\n", ifobject->ifname); 844 - for (int i = 0; i < num_frames; i++) { 845 - /*send EOT frame */ 846 - if (i == (num_frames - 1)) 847 - data.seqnum = -1; 848 - else 849 - data.seqnum = i; 850 - gen_udp_hdr(&data, ifobject, udp_hdr); 851 - gen_ip_hdr(ifobject, ip_hdr); 852 - gen_udp_csum(udp_hdr, ip_hdr); 853 - gen_eth_hdr(ifobject, eth_hdr); 854 - gen_eth_frame(ifobject->umem, i * XSK_UMEM__DEFAULT_FRAME_SIZE); 855 - } 856 - 857 - print_verbose("Sending %d packets on interface %s\n", 858 - (opt_pkt_count - 1), ifobject->ifname); 859 - tx_only_all(ifobject); 860 - } else if (ifobject->fv.vector == rx) { 861 - struct pollfd fds[MAX_SOCKS] = { }; 862 - int ret; 863 - 864 - if (!bidi_pass) 865 - thread_common_ops(ifobject, bufs, &sync_mutex_tx, &spinning_rx); 866 - 867 - print_verbose("Interface [%s] vector [Rx]\n", ifobject->ifname); 868 - if (stat_test_type != STAT_TEST_RX_FILL_EMPTY) 869 - xsk_populate_fill_ring(ifobject->umem); 870 - 871 - TAILQ_INIT(&head); 872 - if 
(debug_pkt_dump) { 873 - pkt_buf = calloc(num_frames, sizeof(*pkt_buf)); 874 - if (!pkt_buf) 875 - exit_with_error(errno); 876 - } 877 - 878 - fds[0].fd = xsk_socket__fd(ifobject->xsk->xsk); 879 - fds[0].events = POLLIN; 880 - 881 - pthread_mutex_lock(&sync_mutex); 882 - pthread_cond_signal(&signal_rx_condition); 883 - pthread_mutex_unlock(&sync_mutex); 884 - 885 - while (1) { 886 - if (test_type == TEST_TYPE_POLL) { 887 - ret = poll(fds, 1, POLL_TMOUT); 888 - if (ret <= 0) 889 - continue; 890 - } 891 - 892 - if (test_type != TEST_TYPE_STATS) { 893 - rx_pkt(ifobject->xsk, fds); 894 - worker_pkt_validate(); 895 - } else { 896 - worker_stats_validate(ifobject); 897 - } 898 - 899 - if (sigvar) 900 - break; 901 - } 902 - 903 - if (test_type != TEST_TYPE_STATS) 904 - print_verbose("Received %d packets on interface %s\n", 905 - pkt_counter, ifobject->ifname); 906 - 907 - if (test_type == TEST_TYPE_TEARDOWN) 908 - print_verbose("Destroying socket\n"); 907 + if (sigvar) 908 + break; 909 909 } 910 910 911 - if ((test_type != TEST_TYPE_BIDI) || bidi_pass) { 912 - xsk_socket__delete(ifobject->xsk->xsk); 913 - (void)xsk_umem__delete(ifobject->umem->umem); 914 - } 911 + print_verbose("Received %d packets on interface %s\n", 912 + pkt_counter, ifobject->ifname); 913 + 914 + if (test_type == TEST_TYPE_TEARDOWN) 915 + print_verbose("Destroying socket\n"); 916 + 917 + testapp_cleanup_xsk_res(ifobject); 915 918 pthread_exit(NULL); 916 919 } 917 920 918 921 static void testapp_validate(void) 919 922 { 920 - struct timespec max_wait = { 0, 0 }; 921 923 bool bidi = test_type == TEST_TYPE_BIDI; 924 + bool bpf = test_type == TEST_TYPE_BPF_RES; 922 925 923 - pthread_attr_init(&attr); 924 - pthread_attr_setstacksize(&attr, THREAD_STACK); 925 - 926 - if ((test_type == TEST_TYPE_BIDI) && bidi_pass) { 927 - pthread_init_mutex(); 928 - if (!switching_notify) { 929 - print_verbose("Switching Tx/Rx vectors\n"); 930 - switching_notify++; 931 - } 932 - } 933 - 934 - 
pthread_mutex_lock(&sync_mutex); 926 + if (pthread_barrier_init(&barr, NULL, 2)) 927 + exit_with_error(errno); 935 928 936 929 /*Spawn RX thread */ 937 - if (!bidi || !bidi_pass) { 938 - if (pthread_create(&t0, &attr, worker_testapp_validate, ifdict[1])) 939 - exit_with_error(errno); 940 - } else if (bidi && bidi_pass) { 941 - /*switch Tx/Rx vectors */ 942 - ifdict[0]->fv.vector = rx; 943 - if (pthread_create(&t0, &attr, worker_testapp_validate, ifdict[0])) 944 - exit_with_error(errno); 945 - } 930 + pthread_create(&t0, NULL, ifdict_rx->func_ptr, ifdict_rx); 946 931 947 - if (clock_gettime(CLOCK_REALTIME, &max_wait)) 932 + pthread_barrier_wait(&barr); 933 + if (pthread_barrier_destroy(&barr)) 948 934 exit_with_error(errno); 949 - max_wait.tv_sec += TMOUT_SEC; 950 - 951 - if (pthread_cond_timedwait(&signal_rx_condition, &sync_mutex, &max_wait) == ETIMEDOUT) 952 - exit_with_error(errno); 953 - 954 - pthread_mutex_unlock(&sync_mutex); 955 935 956 936 /*Spawn TX thread */ 957 - if (!bidi || !bidi_pass) { 958 - if (pthread_create(&t1, &attr, worker_testapp_validate, ifdict[0])) 959 - exit_with_error(errno); 960 - } else if (bidi && bidi_pass) { 961 - /*switch Tx/Rx vectors */ 962 - ifdict[1]->fv.vector = tx; 963 - if (pthread_create(&t1, &attr, worker_testapp_validate, ifdict[1])) 964 - exit_with_error(errno); 965 - } 937 + pthread_create(&t1, NULL, ifdict_tx->func_ptr, ifdict_tx); 966 938 967 939 pthread_join(t1, NULL); 968 940 pthread_join(t0, NULL); 969 941 970 - if (debug_pkt_dump) { 942 + if (debug_pkt_dump && test_type != TEST_TYPE_STATS) { 971 943 worker_pkt_dump(); 972 944 for (int iter = 0; iter < num_frames - 1; iter++) { 973 945 free(pkt_buf[iter]->payload); ··· 916 1008 free(pkt_buf); 917 1009 } 918 1010 919 - if (!(test_type == TEST_TYPE_TEARDOWN) && !bidi && !(test_type == TEST_TYPE_STATS)) 1011 + if (!(test_type == TEST_TYPE_TEARDOWN) && !bidi && !bpf && !(test_type == TEST_TYPE_STATS)) 920 1012 print_ksft_result(); 921 1013 } 922 1014 923 - static void 
testapp_sockets(void) 1015 + static void testapp_teardown(void) 924 1016 { 925 - for (int i = 0; i < ((test_type == TEST_TYPE_TEARDOWN) ? MAX_TEARDOWN_ITER : MAX_BIDI_ITER); 926 - i++) { 1017 + int i; 1018 + 1019 + for (i = 0; i < MAX_TEARDOWN_ITER; i++) { 927 1020 pkt_counter = 0; 928 1021 prev_pkt = -1; 929 1022 sigvar = 0; 930 1023 print_verbose("Creating socket\n"); 931 1024 testapp_validate(); 932 - test_type == TEST_TYPE_BIDI ? bidi_pass++ : bidi_pass; 1025 + } 1026 + 1027 + print_ksft_result(); 1028 + } 1029 + 1030 + static void swap_vectors(struct ifobject *ifobj1, struct ifobject *ifobj2) 1031 + { 1032 + void *(*tmp_func_ptr)(void *) = ifobj1->func_ptr; 1033 + enum fvector tmp_vector = ifobj1->fv.vector; 1034 + 1035 + ifobj1->func_ptr = ifobj2->func_ptr; 1036 + ifobj1->fv.vector = ifobj2->fv.vector; 1037 + 1038 + ifobj2->func_ptr = tmp_func_ptr; 1039 + ifobj2->fv.vector = tmp_vector; 1040 + 1041 + ifdict_tx = ifobj1; 1042 + ifdict_rx = ifobj2; 1043 + } 1044 + 1045 + static void testapp_bidi(void) 1046 + { 1047 + for (int i = 0; i < MAX_BIDI_ITER; i++) { 1048 + pkt_counter = 0; 1049 + prev_pkt = -1; 1050 + sigvar = 0; 1051 + print_verbose("Creating socket\n"); 1052 + testapp_validate(); 1053 + if (!second_step) { 1054 + print_verbose("Switching Tx/Rx vectors\n"); 1055 + swap_vectors(ifdict[1], ifdict[0]); 1056 + } 1057 + second_step = true; 1058 + } 1059 + 1060 + swap_vectors(ifdict[0], ifdict[1]); 1061 + 1062 + print_ksft_result(); 1063 + } 1064 + 1065 + static void swap_xsk_res(void) 1066 + { 1067 + xsk_socket__delete(ifdict_tx->xsk->xsk); 1068 + xsk_umem__delete(ifdict_tx->umem->umem); 1069 + xsk_socket__delete(ifdict_rx->xsk->xsk); 1070 + xsk_umem__delete(ifdict_rx->umem->umem); 1071 + ifdict_tx->umem = ifdict_tx->umem_arr[1]; 1072 + ifdict_tx->xsk = ifdict_tx->xsk_arr[1]; 1073 + ifdict_rx->umem = ifdict_rx->umem_arr[1]; 1074 + ifdict_rx->xsk = ifdict_rx->xsk_arr[1]; 1075 + } 1076 + 1077 + static void testapp_bpf_res(void) 1078 + { 1079 + int i; 1080 + 
1081 + for (i = 0; i < MAX_BPF_ITER; i++) { 1082 + pkt_counter = 0; 1083 + prev_pkt = -1; 1084 + sigvar = 0; 1085 + print_verbose("Creating socket\n"); 1086 + testapp_validate(); 1087 + if (!second_step) 1088 + swap_xsk_res(); 1089 + second_step = true; 933 1090 } 934 1091 935 1092 print_ksft_result(); ··· 1026 1053 print_ksft_result(); 1027 1054 } 1028 1055 1029 - static void init_iface_config(struct ifaceconfigobj *ifaceconfig) 1056 + static void init_iface(struct ifobject *ifobj, const char *dst_mac, 1057 + const char *src_mac, const char *dst_ip, 1058 + const char *src_ip, const u16 dst_port, 1059 + const u16 src_port, enum fvector vector) 1030 1060 { 1031 - /*Init interface0 */ 1032 - ifdict[0]->fv.vector = tx; 1033 - memcpy(ifdict[0]->dst_mac, ifaceconfig->dst_mac, ETH_ALEN); 1034 - memcpy(ifdict[0]->src_mac, ifaceconfig->src_mac, ETH_ALEN); 1035 - ifdict[0]->dst_ip = ifaceconfig->dst_ip.s_addr; 1036 - ifdict[0]->src_ip = ifaceconfig->src_ip.s_addr; 1037 - ifdict[0]->dst_port = ifaceconfig->dst_port; 1038 - ifdict[0]->src_port = ifaceconfig->src_port; 1061 + struct in_addr ip; 1039 1062 1040 - /*Init interface1 */ 1041 - ifdict[1]->fv.vector = rx; 1042 - memcpy(ifdict[1]->dst_mac, ifaceconfig->src_mac, ETH_ALEN); 1043 - memcpy(ifdict[1]->src_mac, ifaceconfig->dst_mac, ETH_ALEN); 1044 - ifdict[1]->dst_ip = ifaceconfig->src_ip.s_addr; 1045 - ifdict[1]->src_ip = ifaceconfig->dst_ip.s_addr; 1046 - ifdict[1]->dst_port = ifaceconfig->src_port; 1047 - ifdict[1]->src_port = ifaceconfig->dst_port; 1048 - } 1063 + memcpy(ifobj->dst_mac, dst_mac, ETH_ALEN); 1064 + memcpy(ifobj->src_mac, src_mac, ETH_ALEN); 1049 1065 1050 - static void *nsdisablemodethread(void *args) 1051 - { 1052 - struct targs *targs = args; 1066 + inet_aton(dst_ip, &ip); 1067 + ifobj->dst_ip = ip.s_addr; 1053 1068 1054 - targs->retptr = false; 1069 + inet_aton(src_ip, &ip); 1070 + ifobj->src_ip = ip.s_addr; 1055 1071 1056 - if (switch_namespace(targs->idx)) { 1057 - targs->retptr = 
bpf_set_link_xdp_fd(ifdict[targs->idx]->ifindex, -1, targs->flags); 1072 + ifobj->dst_port = dst_port; 1073 + ifobj->src_port = src_port; 1074 + 1075 + if (vector == tx) { 1076 + ifobj->fv.vector = tx; 1077 + ifobj->func_ptr = worker_testapp_validate_tx; 1078 + ifdict_tx = ifobj; 1058 1079 } else { 1059 - targs->retptr = errno; 1060 - print_verbose("Failed to switch namespace to %s\n", ifdict[targs->idx]->nsname); 1061 - } 1062 - 1063 - pthread_exit(NULL); 1064 - } 1065 - 1066 - static void disable_xdp_mode(int mode) 1067 - { 1068 - int err = 0; 1069 - __u32 flags = XDP_FLAGS_UPDATE_IF_NOEXIST | mode; 1070 - char *mode_str = mode & XDP_FLAGS_SKB_MODE ? "skb" : "drv"; 1071 - 1072 - for (int i = 0; i < MAX_INTERFACES; i++) { 1073 - if (strcmp(ifdict[i]->nsname, "")) { 1074 - struct targs *targs; 1075 - 1076 - targs = malloc(sizeof(*targs)); 1077 - memset(targs, 0, sizeof(*targs)); 1078 - if (!targs) 1079 - exit_with_error(errno); 1080 - 1081 - targs->idx = i; 1082 - targs->flags = flags; 1083 - if (pthread_create(&ns_thread, NULL, nsdisablemodethread, targs)) 1084 - exit_with_error(errno); 1085 - 1086 - pthread_join(ns_thread, NULL); 1087 - err = targs->retptr; 1088 - free(targs); 1089 - } else { 1090 - err = bpf_set_link_xdp_fd(ifdict[i]->ifindex, -1, flags); 1091 - } 1092 - 1093 - if (err) { 1094 - print_verbose("Failed to disable %s mode on interface %s\n", 1095 - mode_str, ifdict[i]->ifname); 1096 - exit_with_error(err); 1097 - } 1098 - 1099 - print_verbose("Disabled %s mode for interface: %s\n", mode_str, ifdict[i]->ifname); 1100 - configured_mode = mode & XDP_FLAGS_SKB_MODE ? 
TEST_MODE_DRV : TEST_MODE_SKB; 1080 + ifobj->fv.vector = rx; 1081 + ifobj->func_ptr = worker_testapp_validate_rx; 1082 + ifdict_rx = ifobj; 1101 1083 } 1102 1084 } 1103 1085 ··· 1063 1135 /* reset defaults after potential previous test */ 1064 1136 xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST; 1065 1137 pkt_counter = 0; 1066 - switching_notify = 0; 1067 - bidi_pass = 0; 1138 + second_step = 0; 1068 1139 prev_pkt = -1; 1069 - ifdict[0]->fv.vector = tx; 1070 - ifdict[1]->fv.vector = rx; 1071 1140 sigvar = 0; 1072 1141 stat_test_type = -1; 1073 1142 rxqsize = XSK_RING_CONS__DEFAULT_NUM_DESCS; 1074 1143 frame_headroom = XSK_UMEM__DEFAULT_FRAME_HEADROOM; 1075 1144 1145 + configured_mode = mode; 1146 + 1076 1147 switch (mode) { 1077 1148 case (TEST_MODE_SKB): 1078 - if (configured_mode == TEST_MODE_DRV) 1079 - disable_xdp_mode(XDP_FLAGS_DRV_MODE); 1080 1149 xdp_flags |= XDP_FLAGS_SKB_MODE; 1081 1150 break; 1082 1151 case (TEST_MODE_DRV): 1083 - if (configured_mode == TEST_MODE_SKB) 1084 - disable_xdp_mode(XDP_FLAGS_SKB_MODE); 1085 1152 xdp_flags |= XDP_FLAGS_DRV_MODE; 1086 1153 break; 1087 1154 default: 1088 1155 break; 1089 1156 } 1090 1157 1091 - pthread_init_mutex(); 1092 - 1093 - if (test_type == TEST_TYPE_STATS) 1158 + switch (test_type) { 1159 + case TEST_TYPE_STATS: 1094 1160 testapp_stats(); 1095 - else if ((test_type != TEST_TYPE_TEARDOWN) && (test_type != TEST_TYPE_BIDI)) 1161 + break; 1162 + case TEST_TYPE_TEARDOWN: 1163 + testapp_teardown(); 1164 + break; 1165 + case TEST_TYPE_BIDI: 1166 + testapp_bidi(); 1167 + break; 1168 + case TEST_TYPE_BPF_RES: 1169 + testapp_bpf_res(); 1170 + break; 1171 + default: 1096 1172 testapp_validate(); 1097 - else 1098 - testapp_sockets(); 1099 - 1100 - pthread_destroy_mutex(); 1173 + break; 1174 + } 1101 1175 } 1102 1176 1103 1177 int main(int argc, char **argv) 1104 1178 { 1105 1179 struct rlimit _rlim = { RLIM_INFINITY, RLIM_INFINITY }; 1180 + bool failure = false; 1181 + int i, j; 1106 1182 1107 1183 if 
(setrlimit(RLIMIT_MEMLOCK, &_rlim)) 1108 1184 exit_with_error(errno); 1109 - 1110 - const char *MAC1 = "\x00\x0A\x56\x9E\xEE\x62"; 1111 - const char *MAC2 = "\x00\x0A\x56\x9E\xEE\x61"; 1112 - const char *IP1 = "192.168.100.162"; 1113 - const char *IP2 = "192.168.100.161"; 1114 - u16 UDP_DST_PORT = 2020; 1115 - u16 UDP_SRC_PORT = 2121; 1116 - int i, j; 1117 - 1118 - ifaceconfig = malloc(sizeof(struct ifaceconfigobj)); 1119 - memcpy(ifaceconfig->dst_mac, MAC1, ETH_ALEN); 1120 - memcpy(ifaceconfig->src_mac, MAC2, ETH_ALEN); 1121 - inet_aton(IP1, &ifaceconfig->dst_ip); 1122 - inet_aton(IP2, &ifaceconfig->src_ip); 1123 - ifaceconfig->dst_port = UDP_DST_PORT; 1124 - ifaceconfig->src_port = UDP_SRC_PORT; 1125 1185 1126 1186 for (int i = 0; i < MAX_INTERFACES; i++) { 1127 1187 ifdict[i] = malloc(sizeof(struct ifobject)); ··· 1117 1201 exit_with_error(errno); 1118 1202 1119 1203 ifdict[i]->ifdict_index = i; 1204 + ifdict[i]->xsk_arr = calloc(2, sizeof(struct xsk_socket_info *)); 1205 + if (!ifdict[i]->xsk_arr) { 1206 + failure = true; 1207 + goto cleanup; 1208 + } 1209 + ifdict[i]->umem_arr = calloc(2, sizeof(struct xsk_umem_info *)); 1210 + if (!ifdict[i]->umem_arr) { 1211 + failure = true; 1212 + goto cleanup; 1213 + } 1120 1214 } 1121 1215 1122 1216 setlocale(LC_ALL, ""); ··· 1135 1209 1136 1210 num_frames = ++opt_pkt_count; 1137 1211 1138 - init_iface_config(ifaceconfig); 1139 - 1140 - disable_xdp_mode(XDP_FLAGS_DRV_MODE); 1212 + init_iface(ifdict[0], MAC1, MAC2, IP1, IP2, UDP_PORT1, UDP_PORT2, tx); 1213 + init_iface(ifdict[1], MAC2, MAC1, IP2, IP1, UDP_PORT2, UDP_PORT1, rx); 1141 1214 1142 1215 ksft_set_plan(TEST_MODE_MAX * TEST_TYPE_MAX); 1143 1216 ··· 1145 1220 run_pkt_test(i, j); 1146 1221 } 1147 1222 1148 - for (int i = 0; i < MAX_INTERFACES; i++) 1223 + cleanup: 1224 + for (int i = 0; i < MAX_INTERFACES; i++) { 1225 + if (ifdict[i]->ns_fd != -1) 1226 + close(ifdict[i]->ns_fd); 1227 + free(ifdict[i]->xsk_arr); 1228 + free(ifdict[i]->umem_arr); 1149 1229 
free(ifdict[i]); 1230 + } 1231 + 1232 + if (failure) 1233 + exit_with_error(errno); 1150 1234 1151 1235 ksft_exit_pass(); 1152 1236
+15 -34
tools/testing/selftests/bpf/xdpxceiver.h
··· 23 23 #define MAX_SOCKS 1 24 24 #define MAX_TEARDOWN_ITER 10 25 25 #define MAX_BIDI_ITER 2 26 + #define MAX_BPF_ITER 2 26 27 #define PKT_HDR_SIZE (sizeof(struct ethhdr) + sizeof(struct iphdr) + \ 27 28 sizeof(struct udphdr)) 28 29 #define MIN_PKT_SIZE 64 ··· 34 33 #define IP_PKT_TOS 0x9 35 34 #define UDP_PKT_SIZE (IP_PKT_SIZE - sizeof(struct iphdr)) 36 35 #define UDP_PKT_DATA_SIZE (UDP_PKT_SIZE - sizeof(struct udphdr)) 37 - #define TMOUT_SEC (3) 38 36 #define EOT (-1) 39 37 #define USLEEP_MAX 200000 40 - #define THREAD_STACK 60000000 41 38 #define SOCK_RECONF_CTR 10 42 39 #define BATCH_SIZE 64 43 40 #define POLL_TMOUT 1000 44 - #define NEED_WAKEUP true 45 41 #define DEFAULT_PKT_CNT 10000 46 42 #define RX_FULL_RXQSIZE 32 47 43 ··· 61 63 TEST_TYPE_TEARDOWN, 62 64 TEST_TYPE_BIDI, 63 65 TEST_TYPE_STATS, 66 + TEST_TYPE_BPF_RES, 64 67 TEST_TYPE_MAX 65 68 }; 66 69 ··· 76 77 static int configured_mode = TEST_MODE_UNCONFIGURED; 77 78 static u8 debug_pkt_dump; 78 79 static u32 num_frames; 79 - static u8 switching_notify; 80 - static u8 bidi_pass; 80 + static bool second_step; 81 81 static int test_type; 82 82 83 - static int opt_queue; 84 83 static int opt_pkt_count; 85 84 static u8 opt_verbose; 86 85 ··· 122 125 u32 seqnum; 123 126 }; 124 127 125 - struct ifaceconfigobj { 126 - u8 dst_mac[ETH_ALEN]; 127 - u8 src_mac[ETH_ALEN]; 128 - struct in_addr dst_ip; 129 - struct in_addr src_ip; 130 - u16 src_port; 131 - u16 dst_port; 132 - } *ifaceconfig; 133 - 134 128 struct ifobject { 135 - int ifindex; 136 - int ifdict_index; 137 129 char ifname[MAX_INTERFACE_NAME_CHARS]; 138 130 char nsname[MAX_INTERFACES_NAMESPACE_CHARS]; 139 - struct flow_vector fv; 140 131 struct xsk_socket_info *xsk; 132 + struct xsk_socket_info **xsk_arr; 133 + struct xsk_umem_info **umem_arr; 141 134 struct xsk_umem_info *umem; 142 - u8 dst_mac[ETH_ALEN]; 143 - u8 src_mac[ETH_ALEN]; 135 + void *(*func_ptr)(void *arg); 136 + struct flow_vector fv; 137 + int ns_fd; 138 + int ifdict_index; 144 139 u32 
dst_ip; 145 140 u32 src_ip; 146 141 u16 src_port; 147 142 u16 dst_port; 143 + u8 dst_mac[ETH_ALEN]; 144 + u8 src_mac[ETH_ALEN]; 148 145 }; 149 146 150 147 static struct ifobject *ifdict[MAX_INTERFACES]; 148 + static struct ifobject *ifdict_rx; 149 + static struct ifobject *ifdict_tx; 151 150 152 151 /*threads*/ 153 - atomic_int spinning_tx; 154 - atomic_int spinning_rx; 155 - pthread_mutex_t sync_mutex; 156 - pthread_mutex_t sync_mutex_tx; 157 - pthread_cond_t signal_rx_condition; 158 - pthread_cond_t signal_tx_condition; 159 - pthread_t t0, t1, ns_thread; 160 - pthread_attr_t attr; 161 - 162 - struct targs { 163 - u8 retptr; 164 - int idx; 165 - u32 flags; 166 - }; 152 + pthread_barrier_t barr; 153 + pthread_t t0, t1; 167 154 168 155 TAILQ_HEAD(head_s, pkt) head = TAILQ_HEAD_INITIALIZER(head); 169 156 struct head_s *head_p;