Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next

Alexei Starovoitov says:

====================
pull-request: bpf-next 2018-05-24

The following pull-request contains BPF updates for your *net-next* tree.

The main changes are:

1) Björn Töpel cleans up AF_XDP (removes rebind, explicit cache alignment from uapi, etc).

2) David Ahern adds mtu checks to bpf_ipv{4,6}_fib_lookup() helpers.

3) Jesper Dangaard Brouer adds bulking support to ndo_xdp_xmit.

4) Jiong Wang adds support for indirect and arithmetic shifts to the NFP JIT.

5) Martin KaFai Lau cleans up BTF uapi and makes the btf_header extensible.

6) Mathieu Xhonneux adds an End.BPF action to seg6local with BPF helpers that allow
a BPF program to edit/grow/shrink an SRH and apply generic SRv6 actions to a packet.

7) Sandipan Das adds support for bpf2bpf function calls in ppc64 JIT.

8) Yonghong Song adds BPF_TASK_FD_QUERY command for introspection of tracing events.

9) Other misc fixes from Gustavo A. R. Silva, Sirio Balmelli, John Fastabend, and Magnus Karlsson.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>

+5215 -814
+89 -21
arch/powerpc/net/bpf_jit_comp64.c
··· 167 167 168 168 static void bpf_jit_emit_func_call(u32 *image, struct codegen_context *ctx, u64 func) 169 169 { 170 + unsigned int i, ctx_idx = ctx->idx; 171 + 172 + /* Load function address into r12 */ 173 + PPC_LI64(12, func); 174 + 175 + /* For bpf-to-bpf function calls, the callee's address is unknown 176 + * until the last extra pass. As seen above, we use PPC_LI64() to 177 + * load the callee's address, but this may optimize the number of 178 + * instructions required based on the nature of the address. 179 + * 180 + * Since we don't want the number of instructions emitted to change, 181 + * we pad the optimized PPC_LI64() call with NOPs to guarantee that 182 + * we always have a five-instruction sequence, which is the maximum 183 + * that PPC_LI64() can emit. 184 + */ 185 + for (i = ctx->idx - ctx_idx; i < 5; i++) 186 + PPC_NOP(); 187 + 170 188 #ifdef PPC64_ELF_ABI_v1 171 - /* func points to the function descriptor */ 172 - PPC_LI64(b2p[TMP_REG_2], func); 173 - /* Load actual entry point from function descriptor */ 174 - PPC_BPF_LL(b2p[TMP_REG_1], b2p[TMP_REG_2], 0); 175 - /* ... and move it to LR */ 176 - PPC_MTLR(b2p[TMP_REG_1]); 177 189 /* 178 190 * Load TOC from function descriptor at offset 8. 179 191 * We can clobber r2 since we get called through a 180 192 * function pointer (so caller will save/restore r2) 181 193 * and since we don't use a TOC ourself. 
182 194 */ 183 - PPC_BPF_LL(2, b2p[TMP_REG_2], 8); 184 - #else 185 - /* We can clobber r12 */ 186 - PPC_FUNC_ADDR(12, func); 187 - PPC_MTLR(12); 195 + PPC_BPF_LL(2, 12, 8); 196 + /* Load actual entry point from function descriptor */ 197 + PPC_BPF_LL(12, 12, 0); 188 198 #endif 199 + 200 + PPC_MTLR(12); 189 201 PPC_BLRL(); 190 202 } 191 203 ··· 268 256 /* Assemble the body code between the prologue & epilogue */ 269 257 static int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, 270 258 struct codegen_context *ctx, 271 - u32 *addrs) 259 + u32 *addrs, bool extra_pass) 272 260 { 273 261 const struct bpf_insn *insn = fp->insnsi; 274 262 int flen = fp->len; ··· 724 712 break; 725 713 726 714 /* 727 - * Call kernel helper 715 + * Call kernel helper or bpf function 728 716 */ 729 717 case BPF_JMP | BPF_CALL: 730 718 ctx->seen |= SEEN_FUNC; 731 - func = (u8 *) __bpf_call_base + imm; 719 + 720 + /* bpf function call */ 721 + if (insn[i].src_reg == BPF_PSEUDO_CALL) 722 + if (!extra_pass) 723 + func = NULL; 724 + else if (fp->aux->func && off < fp->aux->func_cnt) 725 + /* use the subprog id from the off 726 + * field to lookup the callee address 727 + */ 728 + func = (u8 *) fp->aux->func[off]->bpf_func; 729 + else 730 + return -EINVAL; 731 + /* kernel helper call */ 732 + else 733 + func = (u8 *) __bpf_call_base + imm; 732 734 733 735 bpf_jit_emit_func_call(image, ctx, (u64)func); 734 736 ··· 890 864 return 0; 891 865 } 892 866 867 + struct powerpc64_jit_data { 868 + struct bpf_binary_header *header; 869 + u32 *addrs; 870 + u8 *image; 871 + u32 proglen; 872 + struct codegen_context ctx; 873 + }; 874 + 893 875 struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp) 894 876 { 895 877 u32 proglen; ··· 905 871 u8 *image = NULL; 906 872 u32 *code_base; 907 873 u32 *addrs; 874 + struct powerpc64_jit_data *jit_data; 908 875 struct codegen_context cgctx; 909 876 int pass; 910 877 int flen; ··· 913 878 struct bpf_prog *org_fp = fp; 914 879 struct bpf_prog *tmp_fp; 915 880 bool 
bpf_blinded = false; 881 + bool extra_pass = false; 916 882 917 883 if (!fp->jit_requested) 918 884 return org_fp; ··· 927 891 fp = tmp_fp; 928 892 } 929 893 894 + jit_data = fp->aux->jit_data; 895 + if (!jit_data) { 896 + jit_data = kzalloc(sizeof(*jit_data), GFP_KERNEL); 897 + if (!jit_data) { 898 + fp = org_fp; 899 + goto out; 900 + } 901 + fp->aux->jit_data = jit_data; 902 + } 903 + 930 904 flen = fp->len; 905 + addrs = jit_data->addrs; 906 + if (addrs) { 907 + cgctx = jit_data->ctx; 908 + image = jit_data->image; 909 + bpf_hdr = jit_data->header; 910 + proglen = jit_data->proglen; 911 + alloclen = proglen + FUNCTION_DESCR_SIZE; 912 + extra_pass = true; 913 + goto skip_init_ctx; 914 + } 915 + 931 916 addrs = kzalloc((flen+1) * sizeof(*addrs), GFP_KERNEL); 932 917 if (addrs == NULL) { 933 918 fp = org_fp; 934 - goto out; 919 + goto out_addrs; 935 920 } 936 921 937 922 memset(&cgctx, 0, sizeof(struct codegen_context)); ··· 961 904 cgctx.stack_size = round_up(fp->aux->stack_depth, 16); 962 905 963 906 /* Scouting faux-generate pass 0 */ 964 - if (bpf_jit_build_body(fp, 0, &cgctx, addrs)) { 907 + if (bpf_jit_build_body(fp, 0, &cgctx, addrs, false)) { 965 908 /* We hit something illegal or unsupported. */ 966 909 fp = org_fp; 967 - goto out; 910 + goto out_addrs; 968 911 } 969 912 970 913 /* ··· 982 925 bpf_jit_fill_ill_insns); 983 926 if (!bpf_hdr) { 984 927 fp = org_fp; 985 - goto out; 928 + goto out_addrs; 986 929 } 987 930 931 + skip_init_ctx: 988 932 code_base = (u32 *)(image + FUNCTION_DESCR_SIZE); 989 933 990 934 /* Code generation passes 1-2 */ ··· 993 935 /* Now build the prologue, body code & epilogue for real. 
*/ 994 936 cgctx.idx = 0; 995 937 bpf_jit_build_prologue(code_base, &cgctx); 996 - bpf_jit_build_body(fp, code_base, &cgctx, addrs); 938 + bpf_jit_build_body(fp, code_base, &cgctx, addrs, extra_pass); 997 939 bpf_jit_build_epilogue(code_base, &cgctx); 998 940 999 941 if (bpf_jit_enable > 1) ··· 1019 961 fp->jited_len = alloclen; 1020 962 1021 963 bpf_flush_icache(bpf_hdr, (u8 *)bpf_hdr + (bpf_hdr->pages * PAGE_SIZE)); 964 + if (!fp->is_func || extra_pass) { 965 + out_addrs: 966 + kfree(addrs); 967 + kfree(jit_data); 968 + fp->aux->jit_data = NULL; 969 + } else { 970 + jit_data->addrs = addrs; 971 + jit_data->ctx = cgctx; 972 + jit_data->proglen = proglen; 973 + jit_data->image = image; 974 + jit_data->header = bpf_hdr; 975 + } 1022 976 1023 977 out: 1024 - kfree(addrs); 1025 - 1026 978 if (bpf_blinded) 1027 979 bpf_jit_prog_release_other(fp, fp == org_fp ? tmp_fp : org_fp); 1028 980
+19 -7
drivers/net/ethernet/intel/i40e/i40e_txrx.c
··· 3664 3664 * @dev: netdev 3665 3665 * @xdp: XDP buffer 3666 3666 * 3667 - * Returns Zero if sent, else an error code 3667 + * Returns number of frames successfully sent. Frames that fail are 3668 + * free'ed via XDP return API. 3669 + * 3670 + * For error cases, a negative errno code is returned and no-frames 3671 + * are transmitted (caller must handle freeing frames). 3668 3672 **/ 3669 - int i40e_xdp_xmit(struct net_device *dev, struct xdp_frame *xdpf) 3673 + int i40e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames) 3670 3674 { 3671 3675 struct i40e_netdev_priv *np = netdev_priv(dev); 3672 3676 unsigned int queue_index = smp_processor_id(); 3673 3677 struct i40e_vsi *vsi = np->vsi; 3674 - int err; 3678 + int drops = 0; 3679 + int i; 3675 3680 3676 3681 if (test_bit(__I40E_VSI_DOWN, vsi->state)) 3677 3682 return -ENETDOWN; ··· 3684 3679 if (!i40e_enabled_xdp_vsi(vsi) || queue_index >= vsi->num_queue_pairs) 3685 3680 return -ENXIO; 3686 3681 3687 - err = i40e_xmit_xdp_ring(xdpf, vsi->xdp_rings[queue_index]); 3688 - if (err != I40E_XDP_TX) 3689 - return -ENOSPC; 3682 + for (i = 0; i < n; i++) { 3683 + struct xdp_frame *xdpf = frames[i]; 3684 + int err; 3690 3685 3691 - return 0; 3686 + err = i40e_xmit_xdp_ring(xdpf, vsi->xdp_rings[queue_index]); 3687 + if (err != I40E_XDP_TX) { 3688 + xdp_return_frame_rx_napi(xdpf); 3689 + drops++; 3690 + } 3691 + } 3692 + 3693 + return n - drops; 3692 3694 } 3693 3695 3694 3696 /**
+1 -1
drivers/net/ethernet/intel/i40e/i40e_txrx.h
··· 487 487 void i40e_detect_recover_hung(struct i40e_vsi *vsi); 488 488 int __i40e_maybe_stop_tx(struct i40e_ring *tx_ring, int size); 489 489 bool __i40e_chk_linearize(struct sk_buff *skb); 490 - int i40e_xdp_xmit(struct net_device *dev, struct xdp_frame *xdpf); 490 + int i40e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames); 491 491 void i40e_xdp_flush(struct net_device *dev); 492 492 493 493 /**
+15 -6
drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
··· 10022 10022 } 10023 10023 } 10024 10024 10025 - static int ixgbe_xdp_xmit(struct net_device *dev, struct xdp_frame *xdpf) 10025 + static int ixgbe_xdp_xmit(struct net_device *dev, int n, 10026 + struct xdp_frame **frames) 10026 10027 { 10027 10028 struct ixgbe_adapter *adapter = netdev_priv(dev); 10028 10029 struct ixgbe_ring *ring; 10029 - int err; 10030 + int drops = 0; 10031 + int i; 10030 10032 10031 10033 if (unlikely(test_bit(__IXGBE_DOWN, &adapter->state))) 10032 10034 return -ENETDOWN; ··· 10040 10038 if (unlikely(!ring)) 10041 10039 return -ENXIO; 10042 10040 10043 - err = ixgbe_xmit_xdp_ring(adapter, xdpf); 10044 - if (err != IXGBE_XDP_TX) 10045 - return -ENOSPC; 10041 + for (i = 0; i < n; i++) { 10042 + struct xdp_frame *xdpf = frames[i]; 10043 + int err; 10046 10044 10047 - return 0; 10045 + err = ixgbe_xmit_xdp_ring(adapter, xdpf); 10046 + if (err != IXGBE_XDP_TX) { 10047 + xdp_return_frame_rx_napi(xdpf); 10048 + drops++; 10049 + } 10050 + } 10051 + 10052 + return n - drops; 10048 10053 } 10049 10054 10050 10055 static void ixgbe_xdp_flush(struct net_device *dev)
+381 -29
drivers/net/ethernet/netronome/nfp/bpf/jit.c
··· 212 212 } 213 213 214 214 static void 215 + __emit_br_bit(struct nfp_prog *nfp_prog, u16 areg, u16 breg, u16 addr, u8 defer, 216 + bool set, bool src_lmextn) 217 + { 218 + u16 addr_lo, addr_hi; 219 + u64 insn; 220 + 221 + addr_lo = addr & (OP_BR_BIT_ADDR_LO >> __bf_shf(OP_BR_BIT_ADDR_LO)); 222 + addr_hi = addr != addr_lo; 223 + 224 + insn = OP_BR_BIT_BASE | 225 + FIELD_PREP(OP_BR_BIT_A_SRC, areg) | 226 + FIELD_PREP(OP_BR_BIT_B_SRC, breg) | 227 + FIELD_PREP(OP_BR_BIT_BV, set) | 228 + FIELD_PREP(OP_BR_BIT_DEFBR, defer) | 229 + FIELD_PREP(OP_BR_BIT_ADDR_LO, addr_lo) | 230 + FIELD_PREP(OP_BR_BIT_ADDR_HI, addr_hi) | 231 + FIELD_PREP(OP_BR_BIT_SRC_LMEXTN, src_lmextn); 232 + 233 + nfp_prog_push(nfp_prog, insn); 234 + } 235 + 236 + static void 237 + emit_br_bit_relo(struct nfp_prog *nfp_prog, swreg src, u8 bit, u16 addr, 238 + u8 defer, bool set, enum nfp_relo_type relo) 239 + { 240 + struct nfp_insn_re_regs reg; 241 + int err; 242 + 243 + /* NOTE: The bit to test is specified as an rotation amount, such that 244 + * the bit to test will be placed on the MSB of the result when 245 + * doing a rotate right. For bit X, we need right rotate X + 1. 
246 + */ 247 + bit += 1; 248 + 249 + err = swreg_to_restricted(reg_none(), src, reg_imm(bit), &reg, false); 250 + if (err) { 251 + nfp_prog->error = err; 252 + return; 253 + } 254 + 255 + __emit_br_bit(nfp_prog, reg.areg, reg.breg, addr, defer, set, 256 + reg.src_lmextn); 257 + 258 + nfp_prog->prog[nfp_prog->prog_len - 1] |= 259 + FIELD_PREP(OP_RELO_TYPE, relo); 260 + } 261 + 262 + static void 263 + emit_br_bset(struct nfp_prog *nfp_prog, swreg src, u8 bit, u16 addr, u8 defer) 264 + { 265 + emit_br_bit_relo(nfp_prog, src, bit, addr, defer, true, RELO_BR_REL); 266 + } 267 + 268 + static void 215 269 __emit_immed(struct nfp_prog *nfp_prog, u16 areg, u16 breg, u16 imm_hi, 216 270 enum immed_width width, bool invert, 217 271 enum immed_shift shift, bool wr_both, ··· 361 307 __emit_shf(nfp_prog, reg.dst, reg.dst_ab, sc, shift, 362 308 reg.areg, op, reg.breg, reg.i8, reg.swap, reg.wr_both, 363 309 reg.dst_lmextn, reg.src_lmextn); 310 + } 311 + 312 + static void 313 + emit_shf_indir(struct nfp_prog *nfp_prog, swreg dst, 314 + swreg lreg, enum shf_op op, swreg rreg, enum shf_sc sc) 315 + { 316 + if (sc == SHF_SC_R_ROT) { 317 + pr_err("indirect shift is not allowed on rotation\n"); 318 + nfp_prog->error = -EFAULT; 319 + return; 320 + } 321 + 322 + emit_shf(nfp_prog, dst, lreg, op, rreg, sc, 0); 364 323 } 365 324 366 325 static void ··· 1696 1629 return 0; 1697 1630 } 1698 1631 1632 + /* Pseudo code: 1633 + * if shift_amt >= 32 1634 + * dst_high = dst_low << shift_amt[4:0] 1635 + * dst_low = 0; 1636 + * else 1637 + * dst_high = (dst_high, dst_low) >> (32 - shift_amt) 1638 + * dst_low = dst_low << shift_amt 1639 + * 1640 + * The indirect shift will use the same logic at runtime. 
1641 + */ 1642 + static int __shl_imm64(struct nfp_prog *nfp_prog, u8 dst, u8 shift_amt) 1643 + { 1644 + if (shift_amt < 32) { 1645 + emit_shf(nfp_prog, reg_both(dst + 1), reg_a(dst + 1), 1646 + SHF_OP_NONE, reg_b(dst), SHF_SC_R_DSHF, 1647 + 32 - shift_amt); 1648 + emit_shf(nfp_prog, reg_both(dst), reg_none(), SHF_OP_NONE, 1649 + reg_b(dst), SHF_SC_L_SHF, shift_amt); 1650 + } else if (shift_amt == 32) { 1651 + wrp_reg_mov(nfp_prog, dst + 1, dst); 1652 + wrp_immed(nfp_prog, reg_both(dst), 0); 1653 + } else if (shift_amt > 32) { 1654 + emit_shf(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_NONE, 1655 + reg_b(dst), SHF_SC_L_SHF, shift_amt - 32); 1656 + wrp_immed(nfp_prog, reg_both(dst), 0); 1657 + } 1658 + 1659 + return 0; 1660 + } 1661 + 1699 1662 static int shl_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) 1700 1663 { 1701 1664 const struct bpf_insn *insn = &meta->insn; 1702 1665 u8 dst = insn->dst_reg * 2; 1703 1666 1704 - if (insn->imm < 32) { 1705 - emit_shf(nfp_prog, reg_both(dst + 1), 1706 - reg_a(dst + 1), SHF_OP_NONE, reg_b(dst), 1707 - SHF_SC_R_DSHF, 32 - insn->imm); 1708 - emit_shf(nfp_prog, reg_both(dst), 1709 - reg_none(), SHF_OP_NONE, reg_b(dst), 1710 - SHF_SC_L_SHF, insn->imm); 1711 - } else if (insn->imm == 32) { 1712 - wrp_reg_mov(nfp_prog, dst + 1, dst); 1713 - wrp_immed(nfp_prog, reg_both(dst), 0); 1714 - } else if (insn->imm > 32) { 1715 - emit_shf(nfp_prog, reg_both(dst + 1), 1716 - reg_none(), SHF_OP_NONE, reg_b(dst), 1717 - SHF_SC_L_SHF, insn->imm - 32); 1718 - wrp_immed(nfp_prog, reg_both(dst), 0); 1667 + return __shl_imm64(nfp_prog, dst, insn->imm); 1668 + } 1669 + 1670 + static void shl_reg64_lt32_high(struct nfp_prog *nfp_prog, u8 dst, u8 src) 1671 + { 1672 + emit_alu(nfp_prog, imm_both(nfp_prog), reg_imm(32), ALU_OP_SUB, 1673 + reg_b(src)); 1674 + emit_alu(nfp_prog, reg_none(), imm_a(nfp_prog), ALU_OP_OR, reg_imm(0)); 1675 + emit_shf_indir(nfp_prog, reg_both(dst + 1), reg_a(dst + 1), SHF_OP_NONE, 1676 + reg_b(dst), 
SHF_SC_R_DSHF); 1677 + } 1678 + 1679 + /* NOTE: for indirect left shift, HIGH part should be calculated first. */ 1680 + static void shl_reg64_lt32_low(struct nfp_prog *nfp_prog, u8 dst, u8 src) 1681 + { 1682 + emit_alu(nfp_prog, reg_none(), reg_a(src), ALU_OP_OR, reg_imm(0)); 1683 + emit_shf_indir(nfp_prog, reg_both(dst), reg_none(), SHF_OP_NONE, 1684 + reg_b(dst), SHF_SC_L_SHF); 1685 + } 1686 + 1687 + static void shl_reg64_lt32(struct nfp_prog *nfp_prog, u8 dst, u8 src) 1688 + { 1689 + shl_reg64_lt32_high(nfp_prog, dst, src); 1690 + shl_reg64_lt32_low(nfp_prog, dst, src); 1691 + } 1692 + 1693 + static void shl_reg64_ge32(struct nfp_prog *nfp_prog, u8 dst, u8 src) 1694 + { 1695 + emit_alu(nfp_prog, reg_none(), reg_a(src), ALU_OP_OR, reg_imm(0)); 1696 + emit_shf_indir(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_NONE, 1697 + reg_b(dst), SHF_SC_L_SHF); 1698 + wrp_immed(nfp_prog, reg_both(dst), 0); 1699 + } 1700 + 1701 + static int shl_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) 1702 + { 1703 + const struct bpf_insn *insn = &meta->insn; 1704 + u64 umin, umax; 1705 + u8 dst, src; 1706 + 1707 + dst = insn->dst_reg * 2; 1708 + umin = meta->umin; 1709 + umax = meta->umax; 1710 + if (umin == umax) 1711 + return __shl_imm64(nfp_prog, dst, umin); 1712 + 1713 + src = insn->src_reg * 2; 1714 + if (umax < 32) { 1715 + shl_reg64_lt32(nfp_prog, dst, src); 1716 + } else if (umin >= 32) { 1717 + shl_reg64_ge32(nfp_prog, dst, src); 1718 + } else { 1719 + /* Generate different instruction sequences depending on runtime 1720 + * value of shift amount. 1721 + */ 1722 + u16 label_ge32, label_end; 1723 + 1724 + label_ge32 = nfp_prog_current_offset(nfp_prog) + 7; 1725 + emit_br_bset(nfp_prog, reg_a(src), 5, label_ge32, 0); 1726 + 1727 + shl_reg64_lt32_high(nfp_prog, dst, src); 1728 + label_end = nfp_prog_current_offset(nfp_prog) + 6; 1729 + emit_br(nfp_prog, BR_UNC, label_end, 2); 1730 + /* shl_reg64_lt32_low packed in delay slot. 
*/ 1731 + shl_reg64_lt32_low(nfp_prog, dst, src); 1732 + 1733 + if (!nfp_prog_confirm_current_offset(nfp_prog, label_ge32)) 1734 + return -EINVAL; 1735 + shl_reg64_ge32(nfp_prog, dst, src); 1736 + 1737 + if (!nfp_prog_confirm_current_offset(nfp_prog, label_end)) 1738 + return -EINVAL; 1739 + } 1740 + 1741 + return 0; 1742 + } 1743 + 1744 + /* Pseudo code: 1745 + * if shift_amt >= 32 1746 + * dst_high = 0; 1747 + * dst_low = dst_high >> shift_amt[4:0] 1748 + * else 1749 + * dst_high = dst_high >> shift_amt 1750 + * dst_low = (dst_high, dst_low) >> shift_amt 1751 + * 1752 + * The indirect shift will use the same logic at runtime. 1753 + */ 1754 + static int __shr_imm64(struct nfp_prog *nfp_prog, u8 dst, u8 shift_amt) 1755 + { 1756 + if (shift_amt < 32) { 1757 + emit_shf(nfp_prog, reg_both(dst), reg_a(dst + 1), SHF_OP_NONE, 1758 + reg_b(dst), SHF_SC_R_DSHF, shift_amt); 1759 + emit_shf(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_NONE, 1760 + reg_b(dst + 1), SHF_SC_R_SHF, shift_amt); 1761 + } else if (shift_amt == 32) { 1762 + wrp_reg_mov(nfp_prog, dst, dst + 1); 1763 + wrp_immed(nfp_prog, reg_both(dst + 1), 0); 1764 + } else if (shift_amt > 32) { 1765 + emit_shf(nfp_prog, reg_both(dst), reg_none(), SHF_OP_NONE, 1766 + reg_b(dst + 1), SHF_SC_R_SHF, shift_amt - 32); 1767 + wrp_immed(nfp_prog, reg_both(dst + 1), 0); 1719 1768 } 1720 1769 1721 1770 return 0; ··· 1842 1659 const struct bpf_insn *insn = &meta->insn; 1843 1660 u8 dst = insn->dst_reg * 2; 1844 1661 1845 - if (insn->imm < 32) { 1846 - emit_shf(nfp_prog, reg_both(dst), 1847 - reg_a(dst + 1), SHF_OP_NONE, reg_b(dst), 1848 - SHF_SC_R_DSHF, insn->imm); 1849 - emit_shf(nfp_prog, reg_both(dst + 1), 1850 - reg_none(), SHF_OP_NONE, reg_b(dst + 1), 1851 - SHF_SC_R_SHF, insn->imm); 1852 - } else if (insn->imm == 32) { 1662 + return __shr_imm64(nfp_prog, dst, insn->imm); 1663 + } 1664 + 1665 + /* NOTE: for indirect right shift, LOW part should be calculated first. 
*/ 1666 + static void shr_reg64_lt32_high(struct nfp_prog *nfp_prog, u8 dst, u8 src) 1667 + { 1668 + emit_alu(nfp_prog, reg_none(), reg_a(src), ALU_OP_OR, reg_imm(0)); 1669 + emit_shf_indir(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_NONE, 1670 + reg_b(dst + 1), SHF_SC_R_SHF); 1671 + } 1672 + 1673 + static void shr_reg64_lt32_low(struct nfp_prog *nfp_prog, u8 dst, u8 src) 1674 + { 1675 + emit_alu(nfp_prog, reg_none(), reg_a(src), ALU_OP_OR, reg_imm(0)); 1676 + emit_shf_indir(nfp_prog, reg_both(dst), reg_a(dst + 1), SHF_OP_NONE, 1677 + reg_b(dst), SHF_SC_R_DSHF); 1678 + } 1679 + 1680 + static void shr_reg64_lt32(struct nfp_prog *nfp_prog, u8 dst, u8 src) 1681 + { 1682 + shr_reg64_lt32_low(nfp_prog, dst, src); 1683 + shr_reg64_lt32_high(nfp_prog, dst, src); 1684 + } 1685 + 1686 + static void shr_reg64_ge32(struct nfp_prog *nfp_prog, u8 dst, u8 src) 1687 + { 1688 + emit_alu(nfp_prog, reg_none(), reg_a(src), ALU_OP_OR, reg_imm(0)); 1689 + emit_shf_indir(nfp_prog, reg_both(dst), reg_none(), SHF_OP_NONE, 1690 + reg_b(dst + 1), SHF_SC_R_SHF); 1691 + wrp_immed(nfp_prog, reg_both(dst + 1), 0); 1692 + } 1693 + 1694 + static int shr_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) 1695 + { 1696 + const struct bpf_insn *insn = &meta->insn; 1697 + u64 umin, umax; 1698 + u8 dst, src; 1699 + 1700 + dst = insn->dst_reg * 2; 1701 + umin = meta->umin; 1702 + umax = meta->umax; 1703 + if (umin == umax) 1704 + return __shr_imm64(nfp_prog, dst, umin); 1705 + 1706 + src = insn->src_reg * 2; 1707 + if (umax < 32) { 1708 + shr_reg64_lt32(nfp_prog, dst, src); 1709 + } else if (umin >= 32) { 1710 + shr_reg64_ge32(nfp_prog, dst, src); 1711 + } else { 1712 + /* Generate different instruction sequences depending on runtime 1713 + * value of shift amount. 
1714 + */ 1715 + u16 label_ge32, label_end; 1716 + 1717 + label_ge32 = nfp_prog_current_offset(nfp_prog) + 6; 1718 + emit_br_bset(nfp_prog, reg_a(src), 5, label_ge32, 0); 1719 + shr_reg64_lt32_low(nfp_prog, dst, src); 1720 + label_end = nfp_prog_current_offset(nfp_prog) + 6; 1721 + emit_br(nfp_prog, BR_UNC, label_end, 2); 1722 + /* shr_reg64_lt32_high packed in delay slot. */ 1723 + shr_reg64_lt32_high(nfp_prog, dst, src); 1724 + 1725 + if (!nfp_prog_confirm_current_offset(nfp_prog, label_ge32)) 1726 + return -EINVAL; 1727 + shr_reg64_ge32(nfp_prog, dst, src); 1728 + 1729 + if (!nfp_prog_confirm_current_offset(nfp_prog, label_end)) 1730 + return -EINVAL; 1731 + } 1732 + 1733 + return 0; 1734 + } 1735 + 1736 + /* Code logic is the same as __shr_imm64 except ashr requires signedness bit 1737 + * told through PREV_ALU result. 1738 + */ 1739 + static int __ashr_imm64(struct nfp_prog *nfp_prog, u8 dst, u8 shift_amt) 1740 + { 1741 + if (shift_amt < 32) { 1742 + emit_shf(nfp_prog, reg_both(dst), reg_a(dst + 1), SHF_OP_NONE, 1743 + reg_b(dst), SHF_SC_R_DSHF, shift_amt); 1744 + /* Set signedness bit. */ 1745 + emit_alu(nfp_prog, reg_none(), reg_a(dst + 1), ALU_OP_OR, 1746 + reg_imm(0)); 1747 + emit_shf(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_ASHR, 1748 + reg_b(dst + 1), SHF_SC_R_SHF, shift_amt); 1749 + } else if (shift_amt == 32) { 1750 + /* NOTE: this also helps setting signedness bit. 
*/ 1853 1751 wrp_reg_mov(nfp_prog, dst, dst + 1); 1854 - wrp_immed(nfp_prog, reg_both(dst + 1), 0); 1855 - } else if (insn->imm > 32) { 1856 - emit_shf(nfp_prog, reg_both(dst), 1857 - reg_none(), SHF_OP_NONE, reg_b(dst + 1), 1858 - SHF_SC_R_SHF, insn->imm - 32); 1859 - wrp_immed(nfp_prog, reg_both(dst + 1), 0); 1752 + emit_shf(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_ASHR, 1753 + reg_b(dst + 1), SHF_SC_R_SHF, 31); 1754 + } else if (shift_amt > 32) { 1755 + emit_alu(nfp_prog, reg_none(), reg_a(dst + 1), ALU_OP_OR, 1756 + reg_imm(0)); 1757 + emit_shf(nfp_prog, reg_both(dst), reg_none(), SHF_OP_ASHR, 1758 + reg_b(dst + 1), SHF_SC_R_SHF, shift_amt - 32); 1759 + emit_shf(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_ASHR, 1760 + reg_b(dst + 1), SHF_SC_R_SHF, 31); 1761 + } 1762 + 1763 + return 0; 1764 + } 1765 + 1766 + static int ashr_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) 1767 + { 1768 + const struct bpf_insn *insn = &meta->insn; 1769 + u8 dst = insn->dst_reg * 2; 1770 + 1771 + return __ashr_imm64(nfp_prog, dst, insn->imm); 1772 + } 1773 + 1774 + static void ashr_reg64_lt32_high(struct nfp_prog *nfp_prog, u8 dst, u8 src) 1775 + { 1776 + /* NOTE: the first insn will set both indirect shift amount (source A) 1777 + * and signedness bit (MSB of result). 1778 + */ 1779 + emit_alu(nfp_prog, reg_none(), reg_a(src), ALU_OP_OR, reg_b(dst + 1)); 1780 + emit_shf_indir(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_ASHR, 1781 + reg_b(dst + 1), SHF_SC_R_SHF); 1782 + } 1783 + 1784 + static void ashr_reg64_lt32_low(struct nfp_prog *nfp_prog, u8 dst, u8 src) 1785 + { 1786 + /* NOTE: it is the same as logic shift because we don't need to shift in 1787 + * signedness bit when the shift amount is less than 32. 
1788 + */ 1789 + return shr_reg64_lt32_low(nfp_prog, dst, src); 1790 + } 1791 + 1792 + static void ashr_reg64_lt32(struct nfp_prog *nfp_prog, u8 dst, u8 src) 1793 + { 1794 + ashr_reg64_lt32_low(nfp_prog, dst, src); 1795 + ashr_reg64_lt32_high(nfp_prog, dst, src); 1796 + } 1797 + 1798 + static void ashr_reg64_ge32(struct nfp_prog *nfp_prog, u8 dst, u8 src) 1799 + { 1800 + emit_alu(nfp_prog, reg_none(), reg_a(src), ALU_OP_OR, reg_b(dst + 1)); 1801 + emit_shf_indir(nfp_prog, reg_both(dst), reg_none(), SHF_OP_ASHR, 1802 + reg_b(dst + 1), SHF_SC_R_SHF); 1803 + emit_shf(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_ASHR, 1804 + reg_b(dst + 1), SHF_SC_R_SHF, 31); 1805 + } 1806 + 1807 + /* Like ashr_imm64, but need to use indirect shift. */ 1808 + static int ashr_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) 1809 + { 1810 + const struct bpf_insn *insn = &meta->insn; 1811 + u64 umin, umax; 1812 + u8 dst, src; 1813 + 1814 + dst = insn->dst_reg * 2; 1815 + umin = meta->umin; 1816 + umax = meta->umax; 1817 + if (umin == umax) 1818 + return __ashr_imm64(nfp_prog, dst, umin); 1819 + 1820 + src = insn->src_reg * 2; 1821 + if (umax < 32) { 1822 + ashr_reg64_lt32(nfp_prog, dst, src); 1823 + } else if (umin >= 32) { 1824 + ashr_reg64_ge32(nfp_prog, dst, src); 1825 + } else { 1826 + u16 label_ge32, label_end; 1827 + 1828 + label_ge32 = nfp_prog_current_offset(nfp_prog) + 6; 1829 + emit_br_bset(nfp_prog, reg_a(src), 5, label_ge32, 0); 1830 + ashr_reg64_lt32_low(nfp_prog, dst, src); 1831 + label_end = nfp_prog_current_offset(nfp_prog) + 6; 1832 + emit_br(nfp_prog, BR_UNC, label_end, 2); 1833 + /* ashr_reg64_lt32_high packed in delay slot. 
*/ 1834 + ashr_reg64_lt32_high(nfp_prog, dst, src); 1835 + 1836 + if (!nfp_prog_confirm_current_offset(nfp_prog, label_ge32)) 1837 + return -EINVAL; 1838 + ashr_reg64_ge32(nfp_prog, dst, src); 1839 + 1840 + if (!nfp_prog_confirm_current_offset(nfp_prog, label_end)) 1841 + return -EINVAL; 1860 1842 } 1861 1843 1862 1844 return 0; ··· 2849 2501 [BPF_ALU64 | BPF_SUB | BPF_X] = sub_reg64, 2850 2502 [BPF_ALU64 | BPF_SUB | BPF_K] = sub_imm64, 2851 2503 [BPF_ALU64 | BPF_NEG] = neg_reg64, 2504 + [BPF_ALU64 | BPF_LSH | BPF_X] = shl_reg64, 2852 2505 [BPF_ALU64 | BPF_LSH | BPF_K] = shl_imm64, 2506 + [BPF_ALU64 | BPF_RSH | BPF_X] = shr_reg64, 2853 2507 [BPF_ALU64 | BPF_RSH | BPF_K] = shr_imm64, 2508 + [BPF_ALU64 | BPF_ARSH | BPF_X] = ashr_reg64, 2509 + [BPF_ALU64 | BPF_ARSH | BPF_K] = ashr_imm64, 2854 2510 [BPF_ALU | BPF_MOV | BPF_X] = mov_reg, 2855 2511 [BPF_ALU | BPF_MOV | BPF_K] = mov_imm, 2856 2512 [BPF_ALU | BPF_XOR | BPF_X] = xor_reg,
+28
drivers/net/ethernet/netronome/nfp/bpf/main.h
··· 263 263 * @func_id: function id for call instructions 264 264 * @arg1: arg1 for call instructions 265 265 * @arg2: arg2 for call instructions 266 + * @umin: copy of core verifier umin_value. 267 + * @umax: copy of core verifier umax_value. 266 268 * @off: index of first generated machine instruction (in nfp_prog.prog) 267 269 * @n: eBPF instruction number 268 270 * @flags: eBPF instruction extra optimization flags ··· 299 297 u32 func_id; 300 298 struct bpf_reg_state arg1; 301 299 struct nfp_bpf_reg_state arg2; 300 + }; 301 + /* We are interested in range info for some operands, 302 + * for example, the shift amount. 303 + */ 304 + struct { 305 + u64 umin; 306 + u64 umax; 302 307 }; 303 308 }; 304 309 unsigned int off; ··· 382 373 static inline bool is_mbpf_xadd(const struct nfp_insn_meta *meta) 383 374 { 384 375 return (meta->insn.code & ~BPF_SIZE_MASK) == (BPF_STX | BPF_XADD); 376 + } 377 + 378 + static inline bool is_mbpf_indir_shift(const struct nfp_insn_meta *meta) 379 + { 380 + u8 code = meta->insn.code; 381 + bool is_alu, is_shift; 382 + u8 opclass, opcode; 383 + 384 + opclass = BPF_CLASS(code); 385 + is_alu = opclass == BPF_ALU64 || opclass == BPF_ALU; 386 + if (!is_alu) 387 + return false; 388 + 389 + opcode = BPF_OP(code); 390 + is_shift = opcode == BPF_LSH || opcode == BPF_RSH || opcode == BPF_ARSH; 391 + if (!is_shift) 392 + return false; 393 + 394 + return BPF_SRC(code) == BPF_X; 385 395 } 386 396 387 397 /**
+2
drivers/net/ethernet/netronome/nfp/bpf/offload.c
··· 190 190 191 191 meta->insn = prog[i]; 192 192 meta->n = i; 193 + if (is_mbpf_indir_shift(meta)) 194 + meta->umin = U64_MAX; 193 195 194 196 list_add_tail(&meta->l, &nfp_prog->insns); 195 197 }
+8
drivers/net/ethernet/netronome/nfp/bpf/verifier.c
··· 551 551 if (is_mbpf_xadd(meta)) 552 552 return nfp_bpf_check_xadd(nfp_prog, meta, env); 553 553 554 + if (is_mbpf_indir_shift(meta)) { 555 + const struct bpf_reg_state *sreg = 556 + cur_regs(env) + meta->insn.src_reg; 557 + 558 + meta->umin = min(meta->umin, sreg->umin_value); 559 + meta->umax = max(meta->umax, sreg->umax_value); 560 + } 561 + 554 562 return 0; 555 563 } 556 564
+16 -2
drivers/net/ethernet/netronome/nfp/nfp_asm.h
··· 72 72 #define OP_BR_ADDR_LO 0x007ffc00000ULL 73 73 #define OP_BR_ADDR_HI 0x10000000000ULL 74 74 75 - #define nfp_is_br(_insn) \ 76 - (((_insn) & OP_BR_BASE_MASK) == OP_BR_BASE) 75 + #define OP_BR_BIT_BASE 0x0d000000000ULL 76 + #define OP_BR_BIT_BASE_MASK 0x0f800080300ULL 77 + #define OP_BR_BIT_A_SRC 0x000000000ffULL 78 + #define OP_BR_BIT_B_SRC 0x0000003fc00ULL 79 + #define OP_BR_BIT_BV 0x00000040000ULL 80 + #define OP_BR_BIT_SRC_LMEXTN 0x40000000000ULL 81 + #define OP_BR_BIT_DEFBR OP_BR_DEFBR 82 + #define OP_BR_BIT_ADDR_LO OP_BR_ADDR_LO 83 + #define OP_BR_BIT_ADDR_HI OP_BR_ADDR_HI 84 + 85 + static inline bool nfp_is_br(u64 insn) 86 + { 87 + return (insn & OP_BR_BASE_MASK) == OP_BR_BASE || 88 + (insn & OP_BR_BIT_BASE_MASK) == OP_BR_BIT_BASE; 89 + } 77 90 78 91 enum br_mask { 79 92 BR_BEQ = 0x00, ··· 174 161 SHF_OP_NONE = 0, 175 162 SHF_OP_AND = 2, 176 163 SHF_OP_OR = 5, 164 + SHF_OP_ASHR = 6, 177 165 }; 178 166 179 167 enum shf_sc {
+25 -14
drivers/net/tun.c
··· 70 70 #include <net/netns/generic.h> 71 71 #include <net/rtnetlink.h> 72 72 #include <net/sock.h> 73 + #include <net/xdp.h> 73 74 #include <linux/seq_file.h> 74 75 #include <linux/uio.h> 75 76 #include <linux/skb_array.h> ··· 1285 1284 .ndo_get_stats64 = tun_net_get_stats64, 1286 1285 }; 1287 1286 1288 - static int tun_xdp_xmit(struct net_device *dev, struct xdp_frame *frame) 1287 + static int tun_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames) 1289 1288 { 1290 1289 struct tun_struct *tun = netdev_priv(dev); 1291 1290 struct tun_file *tfile; 1292 1291 u32 numqueues; 1293 - int ret = 0; 1292 + int drops = 0; 1293 + int cnt = n; 1294 + int i; 1294 1295 1295 1296 rcu_read_lock(); 1296 1297 1297 1298 numqueues = READ_ONCE(tun->numqueues); 1298 1299 if (!numqueues) { 1299 - ret = -ENOSPC; 1300 - goto out; 1300 + rcu_read_unlock(); 1301 + return -ENXIO; /* Caller will free/return all frames */ 1301 1302 } 1302 1303 1303 1304 tfile = rcu_dereference(tun->tfiles[smp_processor_id() % 1304 1305 numqueues]); 1305 - /* Encode the XDP flag into lowest bit for consumer to differ 1306 - * XDP buffer from sk_buff. 1307 - */ 1308 - if (ptr_ring_produce(&tfile->tx_ring, tun_xdp_to_ptr(frame))) { 1309 - this_cpu_inc(tun->pcpu_stats->tx_dropped); 1310 - ret = -ENOSPC; 1311 - } 1312 1306 1313 - out: 1307 + spin_lock(&tfile->tx_ring.producer_lock); 1308 + for (i = 0; i < n; i++) { 1309 + struct xdp_frame *xdp = frames[i]; 1310 + /* Encode the XDP flag into lowest bit for consumer to differ 1311 + * XDP buffer from sk_buff. 
1312 + */ 1313 + void *frame = tun_xdp_to_ptr(xdp); 1314 + 1315 + if (__ptr_ring_produce(&tfile->tx_ring, frame)) { 1316 + this_cpu_inc(tun->pcpu_stats->tx_dropped); 1317 + xdp_return_frame_rx_napi(xdp); 1318 + drops++; 1319 + } 1320 + } 1321 + spin_unlock(&tfile->tx_ring.producer_lock); 1322 + 1314 1323 rcu_read_unlock(); 1315 - return ret; 1324 + return cnt - drops; 1316 1325 } 1317 1326 1318 1327 static int tun_xdp_tx(struct net_device *dev, struct xdp_buff *xdp) ··· 1332 1321 if (unlikely(!frame)) 1333 1322 return -EOVERFLOW; 1334 1323 1335 - return tun_xdp_xmit(dev, frame); 1324 + return tun_xdp_xmit(dev, 1, &frame); 1336 1325 } 1337 1326 1338 1327 static void tun_xdp_flush(struct net_device *dev)
+49 -17
drivers/net/virtio_net.c
··· 419 419 virtqueue_kick(sq->vq); 420 420 } 421 421 422 - static int __virtnet_xdp_xmit(struct virtnet_info *vi, 423 - struct xdp_frame *xdpf) 422 + static int __virtnet_xdp_xmit_one(struct virtnet_info *vi, 423 + struct send_queue *sq, 424 + struct xdp_frame *xdpf) 424 425 { 425 426 struct virtio_net_hdr_mrg_rxbuf *hdr; 426 - struct xdp_frame *xdpf_sent; 427 - struct send_queue *sq; 428 - unsigned int len; 429 - unsigned int qp; 430 427 int err; 431 - 432 - qp = vi->curr_queue_pairs - vi->xdp_queue_pairs + smp_processor_id(); 433 - sq = &vi->sq[qp]; 434 - 435 - /* Free up any pending old buffers before queueing new ones. */ 436 - while ((xdpf_sent = virtqueue_get_buf(sq->vq, &len)) != NULL) 437 - xdp_return_frame(xdpf_sent); 438 428 439 429 /* virtqueue want to use data area in-front of packet */ 440 430 if (unlikely(xdpf->metasize > 0)) ··· 449 459 return 0; 450 460 } 451 461 452 - static int virtnet_xdp_xmit(struct net_device *dev, struct xdp_frame *xdpf) 462 + static int __virtnet_xdp_tx_xmit(struct virtnet_info *vi, 463 + struct xdp_frame *xdpf) 464 + { 465 + struct xdp_frame *xdpf_sent; 466 + struct send_queue *sq; 467 + unsigned int len; 468 + unsigned int qp; 469 + 470 + qp = vi->curr_queue_pairs - vi->xdp_queue_pairs + smp_processor_id(); 471 + sq = &vi->sq[qp]; 472 + 473 + /* Free up any pending old buffers before queueing new ones. 
*/ 474 + while ((xdpf_sent = virtqueue_get_buf(sq->vq, &len)) != NULL) 475 + xdp_return_frame(xdpf_sent); 476 + 477 + return __virtnet_xdp_xmit_one(vi, sq, xdpf); 478 + } 479 + 480 + static int virtnet_xdp_xmit(struct net_device *dev, 481 + int n, struct xdp_frame **frames) 453 482 { 454 483 struct virtnet_info *vi = netdev_priv(dev); 455 484 struct receive_queue *rq = vi->rq; 485 + struct xdp_frame *xdpf_sent; 456 486 struct bpf_prog *xdp_prog; 487 + struct send_queue *sq; 488 + unsigned int len; 489 + unsigned int qp; 490 + int drops = 0; 491 + int err; 492 + int i; 493 + 494 + qp = vi->curr_queue_pairs - vi->xdp_queue_pairs + smp_processor_id(); 495 + sq = &vi->sq[qp]; 457 496 458 497 /* Only allow ndo_xdp_xmit if XDP is loaded on dev, as this 459 498 * indicate XDP resources have been successfully allocated. ··· 491 472 if (!xdp_prog) 492 473 return -ENXIO; 493 474 494 - return __virtnet_xdp_xmit(vi, xdpf); 475 + /* Free up any pending old buffers before queueing new ones. */ 476 + while ((xdpf_sent = virtqueue_get_buf(sq->vq, &len)) != NULL) 477 + xdp_return_frame(xdpf_sent); 478 + 479 + for (i = 0; i < n; i++) { 480 + struct xdp_frame *xdpf = frames[i]; 481 + 482 + err = __virtnet_xdp_xmit_one(vi, sq, xdpf); 483 + if (err) { 484 + xdp_return_frame_rx_napi(xdpf); 485 + drops++; 486 + } 487 + } 488 + return n - drops; 495 489 } 496 490 497 491 static unsigned int virtnet_get_headroom(struct virtnet_info *vi) ··· 648 616 xdpf = convert_to_xdp_frame(&xdp); 649 617 if (unlikely(!xdpf)) 650 618 goto err_xdp; 651 - err = __virtnet_xdp_xmit(vi, xdpf); 619 + err = __virtnet_xdp_tx_xmit(vi, xdpf); 652 620 if (unlikely(err)) { 653 621 trace_xdp_exception(vi->dev, xdp_prog, act); 654 622 goto err_xdp; ··· 811 779 xdpf = convert_to_xdp_frame(&xdp); 812 780 if (unlikely(!xdpf)) 813 781 goto err_xdp; 814 - err = __virtnet_xdp_xmit(vi, xdpf); 782 + err = __virtnet_xdp_tx_xmit(vi, xdpf); 815 783 if (unlikely(err)) { 816 784 trace_xdp_exception(vi->dev, xdp_prog, act); 817 785 
if (unlikely(xdp_page != page))
+19 -5
include/linux/bpf.h
··· 69 69 u32 pages; 70 70 u32 id; 71 71 int numa_node; 72 - u32 btf_key_id; 73 - u32 btf_value_id; 72 + u32 btf_key_type_id; 73 + u32 btf_value_type_id; 74 74 struct btf *btf; 75 75 bool unpriv_array; 76 76 /* 55 bytes hole */ ··· 463 463 int bpf_fd_htab_map_lookup_elem(struct bpf_map *map, void *key, u32 *value); 464 464 465 465 int bpf_get_file_flag(int flags); 466 + int bpf_check_uarg_tail_zero(void __user *uaddr, size_t expected_size, 467 + size_t actual_size); 466 468 467 469 /* memcpy that is used with 8-byte aligned pointers, power-of-8 size and 468 470 * forced to use 'long' read/writes to try to atomically copy long counters. ··· 487 485 void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth); 488 486 489 487 /* Map specifics */ 490 - struct net_device *__dev_map_lookup_elem(struct bpf_map *map, u32 key); 488 + struct xdp_buff; 489 + 490 + struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key); 491 491 void __dev_map_insert_ctx(struct bpf_map *map, u32 index); 492 492 void __dev_map_flush(struct bpf_map *map); 493 + int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, 494 + struct net_device *dev_rx); 493 495 494 496 struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key); 495 497 void __cpu_map_insert_ctx(struct bpf_map *map, u32 index); 496 498 void __cpu_map_flush(struct bpf_map *map); 497 - struct xdp_buff; 498 499 int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp, 499 500 struct net_device *dev_rx); 500 501 ··· 576 571 { 577 572 } 578 573 574 + struct xdp_buff; 575 + struct bpf_dtab_netdev; 576 + 577 + static inline 578 + int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, 579 + struct net_device *dev_rx) 580 + { 581 + return 0; 582 + } 583 + 579 584 static inline 580 585 struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key) 581 586 { ··· 600 585 { 601 586 } 602 587 603 - struct xdp_buff; 604 588 static inline int 
cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, 605 589 struct xdp_buff *xdp, 606 590 struct net_device *dev_rx)
+3 -2
include/linux/bpf_types.h
··· 9 9 BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SKB, cg_skb) 10 10 BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SOCK, cg_sock) 11 11 BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SOCK_ADDR, cg_sock_addr) 12 - BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_IN, lwt_inout) 13 - BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_OUT, lwt_inout) 12 + BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_IN, lwt_in) 13 + BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_OUT, lwt_out) 14 14 BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_XMIT, lwt_xmit) 15 + BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_SEG6LOCAL, lwt_seg6local) 15 16 BPF_PROG_TYPE(BPF_PROG_TYPE_SOCK_OPS, sock_ops) 16 17 BPF_PROG_TYPE(BPF_PROG_TYPE_SK_SKB, sk_skb) 17 18 BPF_PROG_TYPE(BPF_PROG_TYPE_SK_MSG, sk_msg)
+1
include/linux/filter.h
··· 517 517 bool sg_copy[MAX_SKB_FRAGS]; 518 518 __u32 flags; 519 519 struct sock *sk_redir; 520 + struct sock *sk; 520 521 struct sk_buff *skb; 521 522 struct list_head list; 522 523 };
+9 -5
include/linux/netdevice.h
··· 1185 1185 * This function is used to set or query state related to XDP on the 1186 1186 * netdevice and manage BPF offload. See definition of 1187 1187 * enum bpf_netdev_command for details. 1188 - * int (*ndo_xdp_xmit)(struct net_device *dev, struct xdp_frame *xdp); 1189 - * This function is used to submit a XDP packet for transmit on a 1190 - * netdevice. 1188 + * int (*ndo_xdp_xmit)(struct net_device *dev, int n, struct xdp_frame **xdp); 1189 + * This function is used to submit @n XDP packets for transmit on a 1190 + * netdevice. Returns number of frames successfully transmitted, frames 1191 + * that got dropped are freed/returned via xdp_return_frame(). 1192 + * Returns negative number, means general error invoking ndo, meaning 1193 + * no frames were xmit'ed and core-caller will free all frames. 1194 + * TODO: Consider add flag to allow sending flush operation. 1191 1195 * void (*ndo_xdp_flush)(struct net_device *dev); 1192 1196 * This function is used to inform the driver to flush a particular 1193 1197 * xdp tx queue. Must be called on same CPU as xdp_xmit. ··· 1379 1375 int needed_headroom); 1380 1376 int (*ndo_bpf)(struct net_device *dev, 1381 1377 struct netdev_bpf *bpf); 1382 - int (*ndo_xdp_xmit)(struct net_device *dev, 1383 - struct xdp_frame *xdp); 1378 + int (*ndo_xdp_xmit)(struct net_device *dev, int n, 1379 + struct xdp_frame **xdp); 1384 1380 void (*ndo_xdp_flush)(struct net_device *dev); 1385 1381 }; 1386 1382
+5
include/linux/perf_event.h
··· 868 868 extern void perf_event_free_task(struct task_struct *task); 869 869 extern void perf_event_delayed_put(struct task_struct *task); 870 870 extern struct file *perf_event_get(unsigned int fd); 871 + extern const struct perf_event *perf_get_event(struct file *file); 871 872 extern const struct perf_event_attr *perf_event_attrs(struct perf_event *event); 872 873 extern void perf_event_print_debug(void); 873 874 extern void perf_pmu_disable(struct pmu *pmu); ··· 1290 1289 static inline void perf_event_free_task(struct task_struct *task) { } 1291 1290 static inline void perf_event_delayed_put(struct task_struct *task) { } 1292 1291 static inline struct file *perf_event_get(unsigned int fd) { return ERR_PTR(-EINVAL); } 1292 + static inline const struct perf_event *perf_get_event(struct file *file) 1293 + { 1294 + return ERR_PTR(-EINVAL); 1295 + } 1293 1296 static inline const struct perf_event_attr *perf_event_attrs(struct perf_event *event) 1294 1297 { 1295 1298 return ERR_PTR(-EINVAL);
+17
include/linux/trace_events.h
··· 473 473 int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog); 474 474 int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *prog); 475 475 struct bpf_raw_event_map *bpf_find_raw_tracepoint(const char *name); 476 + int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id, 477 + u32 *fd_type, const char **buf, 478 + u64 *probe_offset, u64 *probe_addr); 476 479 #else 477 480 static inline unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx) 478 481 { ··· 506 503 static inline struct bpf_raw_event_map *bpf_find_raw_tracepoint(const char *name) 507 504 { 508 505 return NULL; 506 + } 507 + static inline int bpf_get_perf_event_info(const struct perf_event *event, 508 + u32 *prog_id, u32 *fd_type, 509 + const char **buf, u64 *probe_offset, 510 + u64 *probe_addr) 511 + { 512 + return -EOPNOTSUPP; 509 513 } 510 514 #endif 511 515 ··· 570 560 #ifdef CONFIG_KPROBE_EVENTS 571 561 extern int perf_kprobe_init(struct perf_event *event, bool is_retprobe); 572 562 extern void perf_kprobe_destroy(struct perf_event *event); 563 + extern int bpf_get_kprobe_info(const struct perf_event *event, 564 + u32 *fd_type, const char **symbol, 565 + u64 *probe_offset, u64 *probe_addr, 566 + bool perf_type_tracepoint); 573 567 #endif 574 568 #ifdef CONFIG_UPROBE_EVENTS 575 569 extern int perf_uprobe_init(struct perf_event *event, bool is_retprobe); 576 570 extern void perf_uprobe_destroy(struct perf_event *event); 571 + extern int bpf_get_uprobe_info(const struct perf_event *event, 572 + u32 *fd_type, const char **filename, 573 + u64 *probe_offset, bool perf_type_tracepoint); 577 574 #endif 578 575 extern int ftrace_profile_set_filter(struct perf_event *event, int event_id, 579 576 char *filter_str);
+2
include/net/addrconf.h
··· 236 236 struct flowi6 *fl6, int oif, 237 237 const struct sk_buff *skb, 238 238 int strict); 239 + u32 (*ip6_mtu_from_fib6)(struct fib6_info *f6i, struct in6_addr *daddr, 240 + struct in6_addr *saddr); 239 241 240 242 void (*udpv6_encap_enable)(void); 241 243 void (*ndisc_send_na)(struct net_device *dev, const struct in6_addr *daddr,
+6
include/net/ip6_fib.h
··· 412 412 return f6i->fib6_nh.nh_dev; 413 413 } 414 414 415 + static inline 416 + struct lwtunnel_state *fib6_info_nh_lwt(const struct fib6_info *f6i) 417 + { 418 + return f6i->fib6_nh.nh_lwtstate; 419 + } 420 + 415 421 void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info, 416 422 unsigned int flags); 417 423
+3
include/net/ip6_route.h
··· 294 294 return mtu; 295 295 } 296 296 297 + u32 ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr, 298 + struct in6_addr *saddr); 299 + 297 300 struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw, 298 301 struct net_device *dev, struct sk_buff *skb, 299 302 const void *daddr);
+2
include/net/ip_fib.h
··· 449 449 } 450 450 #endif 451 451 452 + u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr); 453 + 452 454 #endif /* _NET_FIB_H */
+3 -2
include/net/page_pool.h
··· 115 115 void __page_pool_put_page(struct page_pool *pool, 116 116 struct page *page, bool allow_direct); 117 117 118 - static inline void page_pool_put_page(struct page_pool *pool, struct page *page) 118 + static inline void page_pool_put_page(struct page_pool *pool, 119 + struct page *page, bool allow_direct) 119 120 { 120 121 /* When page_pool isn't compiled-in, net/core/xdp.c doesn't 121 122 * allow registering MEM_TYPE_PAGE_POOL, but shield linker. 122 123 */ 123 124 #ifdef CONFIG_PAGE_POOL 124 - __page_pool_put_page(pool, page, false); 125 + __page_pool_put_page(pool, page, allow_direct); 125 126 #endif 126 127 } 127 128 /* Very limited use-cases allow recycle direct */
+6 -1
include/net/seg6.h
··· 49 49 50 50 static inline struct seg6_pernet_data *seg6_pernet(struct net *net) 51 51 { 52 + #if IS_ENABLED(CONFIG_IPV6) 52 53 return net->ipv6.seg6_data; 54 + #else 55 + return NULL; 56 + #endif 53 57 } 54 58 55 59 extern int seg6_init(void); ··· 67 63 extern int seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh, 68 64 int proto); 69 65 extern int seg6_do_srh_inline(struct sk_buff *skb, struct ipv6_sr_hdr *osrh); 70 - 66 + extern int seg6_lookup_nexthop(struct sk_buff *skb, struct in6_addr *nhaddr, 67 + u32 tbl_id); 71 68 #endif
+32
include/net/seg6_local.h
··· 1 + /* 2 + * SR-IPv6 implementation 3 + * 4 + * Authors: 5 + * David Lebrun <david.lebrun@uclouvain.be> 6 + * eBPF support: Mathieu Xhonneux <m.xhonneux@gmail.com> 7 + * 8 + * 9 + * This program is free software; you can redistribute it and/or 10 + * modify it under the terms of the GNU General Public License 11 + * as published by the Free Software Foundation; either version 12 + * 2 of the License, or (at your option) any later version. 13 + */ 14 + 15 + #ifndef _NET_SEG6_LOCAL_H 16 + #define _NET_SEG6_LOCAL_H 17 + 18 + #include <linux/percpu.h> 19 + #include <linux/net.h> 20 + #include <linux/ipv6.h> 21 + 22 + extern int seg6_lookup_nexthop(struct sk_buff *skb, struct in6_addr *nhaddr, 23 + u32 tbl_id); 24 + 25 + struct seg6_bpf_srh_state { 26 + bool valid; 27 + u16 hdrlen; 28 + }; 29 + 30 + DECLARE_PER_CPU(struct seg6_bpf_srh_state, seg6_bpf_srh_states); 31 + 32 + #endif
+1
include/net/xdp.h
··· 104 104 } 105 105 106 106 void xdp_return_frame(struct xdp_frame *xdpf); 107 + void xdp_return_frame_rx_napi(struct xdp_frame *xdpf); 107 108 void xdp_return_buff(struct xdp_buff *xdp); 108 109 109 110 int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq,
+2 -11
include/net/xdp_sock.h
··· 1 - /* SPDX-License-Identifier: GPL-2.0 2 - * AF_XDP internal functions 1 + /* SPDX-License-Identifier: GPL-2.0 */ 2 + /* AF_XDP internal functions 3 3 * Copyright(c) 2018 Intel Corporation. 4 - * 5 - * This program is free software; you can redistribute it and/or modify it 6 - * under the terms and conditions of the GNU General Public License, 7 - * version 2, as published by the Free Software Foundation. 8 - * 9 - * This program is distributed in the hope it will be useful, but WITHOUT 10 - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for 12 - * more details. 13 4 */ 14 5 15 6 #ifndef _LINUX_XDP_SOCK_H
+49 -1
include/trace/events/xdp.h
··· 138 138 __entry->map_id, __entry->map_index) 139 139 ); 140 140 141 + #ifndef __DEVMAP_OBJ_TYPE 142 + #define __DEVMAP_OBJ_TYPE 143 + struct _bpf_dtab_netdev { 144 + struct net_device *dev; 145 + }; 146 + #endif /* __DEVMAP_OBJ_TYPE */ 147 + 141 148 #define devmap_ifindex(fwd, map) \ 142 149 (!fwd ? 0 : \ 143 150 (!map ? 0 : \ 144 151 ((map->map_type == BPF_MAP_TYPE_DEVMAP) ? \ 145 - ((struct net_device *)fwd)->ifindex : 0))) 152 + ((struct _bpf_dtab_netdev *)fwd)->dev->ifindex : 0))) 146 153 147 154 #define _trace_xdp_redirect_map(dev, xdp, fwd, map, idx) \ 148 155 trace_xdp_redirect_map(dev, xdp, devmap_ifindex(fwd, map), \ ··· 227 220 __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB), 228 221 __entry->processed, __entry->drops, 229 222 __entry->to_cpu) 223 + ); 224 + 225 + TRACE_EVENT(xdp_devmap_xmit, 226 + 227 + TP_PROTO(const struct bpf_map *map, u32 map_index, 228 + int sent, int drops, 229 + const struct net_device *from_dev, 230 + const struct net_device *to_dev, int err), 231 + 232 + TP_ARGS(map, map_index, sent, drops, from_dev, to_dev, err), 233 + 234 + TP_STRUCT__entry( 235 + __field(int, map_id) 236 + __field(u32, act) 237 + __field(u32, map_index) 238 + __field(int, drops) 239 + __field(int, sent) 240 + __field(int, from_ifindex) 241 + __field(int, to_ifindex) 242 + __field(int, err) 243 + ), 244 + 245 + TP_fast_assign( 246 + __entry->map_id = map->id; 247 + __entry->act = XDP_REDIRECT; 248 + __entry->map_index = map_index; 249 + __entry->drops = drops; 250 + __entry->sent = sent; 251 + __entry->from_ifindex = from_dev->ifindex; 252 + __entry->to_ifindex = to_dev->ifindex; 253 + __entry->err = err; 254 + ), 255 + 256 + TP_printk("ndo_xdp_xmit" 257 + " map_id=%d map_index=%d action=%s" 258 + " sent=%d drops=%d" 259 + " from_ifindex=%d to_ifindex=%d err=%d", 260 + __entry->map_id, __entry->map_index, 261 + __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB), 262 + __entry->sent, __entry->drops, 263 + __entry->from_ifindex, __entry->to_ifindex, 
__entry->err) 230 264 ); 231 265 232 266 #endif /* _TRACE_XDP_H */
+138 -5
include/uapi/linux/bpf.h
··· 97 97 BPF_RAW_TRACEPOINT_OPEN, 98 98 BPF_BTF_LOAD, 99 99 BPF_BTF_GET_FD_BY_ID, 100 + BPF_TASK_FD_QUERY, 100 101 }; 101 102 102 103 enum bpf_map_type { ··· 142 141 BPF_PROG_TYPE_SK_MSG, 143 142 BPF_PROG_TYPE_RAW_TRACEPOINT, 144 143 BPF_PROG_TYPE_CGROUP_SOCK_ADDR, 144 + BPF_PROG_TYPE_LWT_SEG6LOCAL, 145 145 }; 146 146 147 147 enum bpf_attach_type { ··· 286 284 char map_name[BPF_OBJ_NAME_LEN]; 287 285 __u32 map_ifindex; /* ifindex of netdev to create on */ 288 286 __u32 btf_fd; /* fd pointing to a BTF type data */ 289 - __u32 btf_key_id; /* BTF type_id of the key */ 290 - __u32 btf_value_id; /* BTF type_id of the value */ 287 + __u32 btf_key_type_id; /* BTF type_id of the key */ 288 + __u32 btf_value_type_id; /* BTF type_id of the value */ 291 289 }; 292 290 293 291 struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */ ··· 381 379 __u32 btf_log_size; 382 380 __u32 btf_log_level; 383 381 }; 382 + 383 + struct { 384 + __u32 pid; /* input: pid */ 385 + __u32 fd; /* input: fd */ 386 + __u32 flags; /* input: flags */ 387 + __u32 buf_len; /* input/output: buf len */ 388 + __aligned_u64 buf; /* input/output: 389 + * tp_name for tracepoint 390 + * symbol for kprobe 391 + * filename for uprobe 392 + */ 393 + __u32 prog_id; /* output: prod_id */ 394 + __u32 fd_type; /* output: BPF_FD_TYPE_* */ 395 + __u64 probe_offset; /* output: probe_offset */ 396 + __u64 probe_addr; /* output: probe_addr */ 397 + } task_fd_query; 384 398 } __attribute__((aligned(8))); 385 399 386 400 /* The description below is an attempt at providing documentation to eBPF ··· 1920 1902 * egress otherwise). This is the only flag supported for now. 1921 1903 * Return 1922 1904 * **SK_PASS** on success, or **SK_DROP** on error. 1905 + * 1906 + * int bpf_lwt_push_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len) 1907 + * Description 1908 + * Encapsulate the packet associated to *skb* within a Layer 3 1909 + * protocol header. 
This header is provided in the buffer at 1910 + * address *hdr*, with *len* its size in bytes. *type* indicates 1911 + * the protocol of the header and can be one of: 1912 + * 1913 + * **BPF_LWT_ENCAP_SEG6** 1914 + * IPv6 encapsulation with Segment Routing Header 1915 + * (**struct ipv6_sr_hdr**). *hdr* only contains the SRH, 1916 + * the IPv6 header is computed by the kernel. 1917 + * **BPF_LWT_ENCAP_SEG6_INLINE** 1918 + * Only works if *skb* contains an IPv6 packet. Insert a 1919 + * Segment Routing Header (**struct ipv6_sr_hdr**) inside 1920 + * the IPv6 header. 1921 + * 1922 + * A call to this helper is susceptible to change the underlaying 1923 + * packet buffer. Therefore, at load time, all checks on pointers 1924 + * previously done by the verifier are invalidated and must be 1925 + * performed again, if the helper is used in combination with 1926 + * direct packet access. 1927 + * Return 1928 + * 0 on success, or a negative error in case of failure. 1929 + * 1930 + * int bpf_lwt_seg6_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len) 1931 + * Description 1932 + * Store *len* bytes from address *from* into the packet 1933 + * associated to *skb*, at *offset*. Only the flags, tag and TLVs 1934 + * inside the outermost IPv6 Segment Routing Header can be 1935 + * modified through this helper. 1936 + * 1937 + * A call to this helper is susceptible to change the underlaying 1938 + * packet buffer. Therefore, at load time, all checks on pointers 1939 + * previously done by the verifier are invalidated and must be 1940 + * performed again, if the helper is used in combination with 1941 + * direct packet access. 1942 + * Return 1943 + * 0 on success, or a negative error in case of failure. 
1944 + * 1945 + * int bpf_lwt_seg6_adjust_srh(struct sk_buff *skb, u32 offset, s32 delta) 1946 + * Description 1947 + * Adjust the size allocated to TLVs in the outermost IPv6 1948 + * Segment Routing Header contained in the packet associated to 1949 + * *skb*, at position *offset* by *delta* bytes. Only offsets 1950 + * after the segments are accepted. *delta* can be as well 1951 + * positive (growing) as negative (shrinking). 1952 + * 1953 + * A call to this helper is susceptible to change the underlaying 1954 + * packet buffer. Therefore, at load time, all checks on pointers 1955 + * previously done by the verifier are invalidated and must be 1956 + * performed again, if the helper is used in combination with 1957 + * direct packet access. 1958 + * Return 1959 + * 0 on success, or a negative error in case of failure. 1960 + * 1961 + * int bpf_lwt_seg6_action(struct sk_buff *skb, u32 action, void *param, u32 param_len) 1962 + * Description 1963 + * Apply an IPv6 Segment Routing action of type *action* to the 1964 + * packet associated to *skb*. Each action takes a parameter 1965 + * contained at address *param*, and of length *param_len* bytes. 1966 + * *action* can be one of: 1967 + * 1968 + * **SEG6_LOCAL_ACTION_END_X** 1969 + * End.X action: Endpoint with Layer-3 cross-connect. 1970 + * Type of *param*: **struct in6_addr**. 1971 + * **SEG6_LOCAL_ACTION_END_T** 1972 + * End.T action: Endpoint with specific IPv6 table lookup. 1973 + * Type of *param*: **int**. 1974 + * **SEG6_LOCAL_ACTION_END_B6** 1975 + * End.B6 action: Endpoint bound to an SRv6 policy. 1976 + * Type of param: **struct ipv6_sr_hdr**. 1977 + * **SEG6_LOCAL_ACTION_END_B6_ENCAP** 1978 + * End.B6.Encap action: Endpoint bound to an SRv6 1979 + * encapsulation policy. 1980 + * Type of param: **struct ipv6_sr_hdr**. 1981 + * 1982 + * A call to this helper is susceptible to change the underlaying 1983 + * packet buffer. 
Therefore, at load time, all checks on pointers 1984 + * previously done by the verifier are invalidated and must be 1985 + * performed again, if the helper is used in combination with 1986 + * direct packet access. 1987 + * Return 1988 + * 0 on success, or a negative error in case of failure. 1923 1989 */ 1924 1990 #define __BPF_FUNC_MAPPER(FN) \ 1925 1991 FN(unspec), \ ··· 2078 1976 FN(fib_lookup), \ 2079 1977 FN(sock_hash_update), \ 2080 1978 FN(msg_redirect_hash), \ 2081 - FN(sk_redirect_hash), 1979 + FN(sk_redirect_hash), \ 1980 + FN(lwt_push_encap), \ 1981 + FN(lwt_seg6_store_bytes), \ 1982 + FN(lwt_seg6_adjust_srh), \ 1983 + FN(lwt_seg6_action), 2082 1984 2083 1985 /* integer value in 'imm' field of BPF_CALL instruction selects which helper 2084 1986 * function eBPF program intends to call ··· 2147 2041 enum bpf_hdr_start_off { 2148 2042 BPF_HDR_START_MAC, 2149 2043 BPF_HDR_START_NET, 2044 + }; 2045 + 2046 + /* Encapsulation type for BPF_FUNC_lwt_push_encap helper. */ 2047 + enum bpf_lwt_encap_mode { 2048 + BPF_LWT_ENCAP_SEG6, 2049 + BPF_LWT_ENCAP_SEG6_INLINE 2150 2050 }; 2151 2051 2152 2052 /* user accessible mirror of in-kernel sk_buff. 
··· 2288 2176 struct sk_msg_md { 2289 2177 void *data; 2290 2178 void *data_end; 2179 + 2180 + __u32 family; 2181 + __u32 remote_ip4; /* Stored in network byte order */ 2182 + __u32 local_ip4; /* Stored in network byte order */ 2183 + __u32 remote_ip6[4]; /* Stored in network byte order */ 2184 + __u32 local_ip6[4]; /* Stored in network byte order */ 2185 + __u32 remote_port; /* Stored in network byte order */ 2186 + __u32 local_port; /* stored in host byte order */ 2291 2187 }; 2292 2188 2293 2189 #define BPF_TAG_SIZE 8 ··· 2317 2197 __u32 gpl_compatible:1; 2318 2198 __u64 netns_dev; 2319 2199 __u64 netns_ino; 2200 + __u32 nr_jited_ksyms; 2201 + __u32 nr_jited_func_lens; 2202 + __aligned_u64 jited_ksyms; 2203 + __aligned_u64 jited_func_lens; 2320 2204 } __attribute__((aligned(8))); 2321 2205 2322 2206 struct bpf_map_info { ··· 2335 2211 __u64 netns_dev; 2336 2212 __u64 netns_ino; 2337 2213 __u32 btf_id; 2338 - __u32 btf_key_id; 2339 - __u32 btf_value_id; 2214 + __u32 btf_key_type_id; 2215 + __u32 btf_value_type_id; 2340 2216 } __attribute__((aligned(8))); 2341 2217 2342 2218 struct bpf_btf_info { ··· 2572 2448 __be16 h_vlan_TCI; 2573 2449 __u8 smac[6]; /* ETH_ALEN */ 2574 2450 __u8 dmac[6]; /* ETH_ALEN */ 2451 + }; 2452 + 2453 + enum bpf_task_fd_type { 2454 + BPF_FD_TYPE_RAW_TRACEPOINT, /* tp name */ 2455 + BPF_FD_TYPE_TRACEPOINT, /* tp name */ 2456 + BPF_FD_TYPE_KPROBE, /* (symbol + offset) or addr */ 2457 + BPF_FD_TYPE_KRETPROBE, /* (symbol + offset) or addr */ 2458 + BPF_FD_TYPE_UPROBE, /* filename + offset */ 2459 + BPF_FD_TYPE_URETPROBE, /* filename + offset */ 2575 2460 }; 2576 2461 2577 2462 #endif /* _UAPI__LINUX_BPF_H__ */
+11 -26
include/uapi/linux/btf.h
··· 12 12 __u16 magic; 13 13 __u8 version; 14 14 __u8 flags; 15 - 16 - __u32 parent_label; 17 - __u32 parent_name; 15 + __u32 hdr_len; 18 16 19 17 /* All offsets are in bytes relative to the end of this header */ 20 - __u32 label_off; /* offset of label section */ 21 - __u32 object_off; /* offset of data object section*/ 22 - __u32 func_off; /* offset of function section */ 23 18 __u32 type_off; /* offset of type section */ 19 + __u32 type_len; /* length of type section */ 24 20 __u32 str_off; /* offset of string section */ 25 21 __u32 str_len; /* length of string section */ 26 22 }; 27 23 28 24 /* Max # of type identifier */ 29 - #define BTF_MAX_TYPE 0x7fffffff 25 + #define BTF_MAX_TYPE 0x0000ffff 30 26 /* Max offset into the string section */ 31 - #define BTF_MAX_NAME_OFFSET 0x7fffffff 27 + #define BTF_MAX_NAME_OFFSET 0x0000ffff 32 28 /* Max # of struct/union/enum members or func args */ 33 29 #define BTF_MAX_VLEN 0xffff 34 - 35 - /* The type id is referring to a parent BTF */ 36 - #define BTF_TYPE_PARENT(id) (((id) >> 31) & 0x1) 37 - #define BTF_TYPE_ID(id) ((id) & BTF_MAX_TYPE) 38 - 39 - /* String is in the ELF string section */ 40 - #define BTF_STR_TBL_ELF_ID(ref) (((ref) >> 31) & 0x1) 41 - #define BTF_STR_OFFSET(ref) ((ref) & BTF_MAX_NAME_OFFSET) 42 30 43 31 struct btf_type { 44 32 __u32 name_off; 45 33 /* "info" bits arrangement 46 34 * bits 0-15: vlen (e.g. # of struct's members) 47 35 * bits 16-23: unused 48 - * bits 24-28: kind (e.g. int, ptr, array...etc) 49 - * bits 29-30: unused 50 - * bits 31: root 36 + * bits 24-27: kind (e.g. int, ptr, array...etc) 37 + * bits 28-31: unused 51 38 */ 52 39 __u32 info; 53 40 /* "size" is used by INT, ENUM, STRUCT and UNION. 
··· 49 62 }; 50 63 }; 51 64 52 - #define BTF_INFO_KIND(info) (((info) >> 24) & 0x1f) 53 - #define BTF_INFO_ISROOT(info) (!!(((info) >> 24) & 0x80)) 65 + #define BTF_INFO_KIND(info) (((info) >> 24) & 0x0f) 54 66 #define BTF_INFO_VLEN(info) ((info) & 0xffff) 55 67 56 68 #define BTF_KIND_UNKN 0 /* Unknown */ ··· 74 88 /* BTF_KIND_INT is followed by a u32 and the following 75 89 * is the 32 bits arrangement: 76 90 */ 77 - #define BTF_INT_ENCODING(VAL) (((VAL) & 0xff000000) >> 24) 91 + #define BTF_INT_ENCODING(VAL) (((VAL) & 0x0f000000) >> 24) 78 92 #define BTF_INT_OFFSET(VAL) (((VAL & 0x00ff0000)) >> 16) 79 93 #define BTF_INT_BITS(VAL) ((VAL) & 0x0000ffff) 80 94 81 95 /* Attributes stored in the BTF_INT_ENCODING */ 82 - #define BTF_INT_SIGNED 0x1 83 - #define BTF_INT_CHAR 0x2 84 - #define BTF_INT_BOOL 0x4 85 - #define BTF_INT_VARARGS 0x8 96 + #define BTF_INT_SIGNED (1 << 0) 97 + #define BTF_INT_CHAR (1 << 1) 98 + #define BTF_INT_BOOL (1 << 2) 86 99 87 100 /* BTF_KIND_ENUM is followed by multiple "struct btf_enum". 88 101 * The exact number of btf_enum is stored in the vlen (of the
+25 -34
include/uapi/linux/if_xdp.h
··· 1 - /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note 2 - * 1 + /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ 2 + /* 3 3 * if_xdp: XDP socket user-space interface 4 4 * Copyright(c) 2018 Intel Corporation. 5 - * 6 - * This program is free software; you can redistribute it and/or modify it 7 - * under the terms and conditions of the GNU General Public License, 8 - * version 2, as published by the Free Software Foundation. 9 - * 10 - * This program is distributed in the hope it will be useful, but WITHOUT 11 - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for 13 - * more details. 14 5 * 15 6 * Author(s): Björn Töpel <bjorn.topel@intel.com> 16 7 * Magnus Karlsson <magnus.karlsson@intel.com> ··· 17 26 18 27 struct sockaddr_xdp { 19 28 __u16 sxdp_family; 29 + __u16 sxdp_flags; 20 30 __u32 sxdp_ifindex; 21 31 __u32 sxdp_queue_id; 22 32 __u32 sxdp_shared_umem_fd; 23 - __u16 sxdp_flags; 33 + }; 34 + 35 + struct xdp_ring_offset { 36 + __u64 producer; 37 + __u64 consumer; 38 + __u64 desc; 39 + }; 40 + 41 + struct xdp_mmap_offsets { 42 + struct xdp_ring_offset rx; 43 + struct xdp_ring_offset tx; 44 + struct xdp_ring_offset fr; /* Fill */ 45 + struct xdp_ring_offset cr; /* Completion */ 24 46 }; 25 47 26 48 /* XDP socket options */ 27 - #define XDP_RX_RING 1 28 - #define XDP_TX_RING 2 29 - #define XDP_UMEM_REG 3 30 - #define XDP_UMEM_FILL_RING 4 31 - #define XDP_UMEM_COMPLETION_RING 5 32 - #define XDP_STATISTICS 6 49 + #define XDP_MMAP_OFFSETS 1 50 + #define XDP_RX_RING 2 51 + #define XDP_TX_RING 3 52 + #define XDP_UMEM_REG 4 53 + #define XDP_UMEM_FILL_RING 5 54 + #define XDP_UMEM_COMPLETION_RING 6 55 + #define XDP_STATISTICS 7 33 56 34 57 struct xdp_umem_reg { 35 58 __u64 addr; /* Start of packet data area */ ··· 64 59 #define XDP_UMEM_PGOFF_FILL_RING 0x100000000 65 60 #define XDP_UMEM_PGOFF_COMPLETION_RING 0x180000000 66 61 62 + /* Rx/Tx 
descriptor */ 67 63 struct xdp_desc { 68 64 __u32 idx; 69 65 __u32 len; ··· 73 67 __u8 padding[5]; 74 68 }; 75 69 76 - struct xdp_ring { 77 - __u32 producer __attribute__((aligned(64))); 78 - __u32 consumer __attribute__((aligned(64))); 79 - }; 80 - 81 - /* Used for the RX and TX queues for packets */ 82 - struct xdp_rxtx_ring { 83 - struct xdp_ring ptrs; 84 - struct xdp_desc desc[0] __attribute__((aligned(64))); 85 - }; 86 - 87 - /* Used for the fill and completion queues for buffers */ 88 - struct xdp_umem_ring { 89 - struct xdp_ring ptrs; 90 - __u32 desc[0] __attribute__((aligned(64))); 91 - }; 70 + /* UMEM descriptor is __u32 */ 92 71 93 72 #endif /* _LINUX_IF_XDP_H */
+12
include/uapi/linux/seg6_local.h
··· 25 25 SEG6_LOCAL_NH6, 26 26 SEG6_LOCAL_IIF, 27 27 SEG6_LOCAL_OIF, 28 + SEG6_LOCAL_BPF, 28 29 __SEG6_LOCAL_MAX, 29 30 }; 30 31 #define SEG6_LOCAL_MAX (__SEG6_LOCAL_MAX - 1) ··· 60 59 SEG6_LOCAL_ACTION_END_AS = 13, 61 60 /* forward to SR-unaware VNF with masquerading */ 62 61 SEG6_LOCAL_ACTION_END_AM = 14, 62 + /* custom BPF action */ 63 + SEG6_LOCAL_ACTION_END_BPF = 15, 63 64 64 65 __SEG6_LOCAL_ACTION_MAX, 65 66 }; 66 67 67 68 #define SEG6_LOCAL_ACTION_MAX (__SEG6_LOCAL_ACTION_MAX - 1) 69 + 70 + enum { 71 + SEG6_LOCAL_BPF_PROG_UNSPEC, 72 + SEG6_LOCAL_BPF_PROG, 73 + SEG6_LOCAL_BPF_PROG_NAME, 74 + __SEG6_LOCAL_BPF_PROG_MAX, 75 + }; 76 + 77 + #define SEG6_LOCAL_BPF_PROG_MAX (__SEG6_LOCAL_BPF_PROG_MAX - 1) 68 78 69 79 #endif
+1 -1
kernel/bpf/arraymap.c
··· 352 352 } 353 353 354 354 seq_printf(m, "%u: ", *(u32 *)key); 355 - btf_type_seq_show(map->btf, map->btf_value_id, value, m); 355 + btf_type_seq_show(map->btf, map->btf_value_type_id, value, m); 356 356 seq_puts(m, "\n"); 357 357 358 358 rcu_read_unlock();
+245 -89
kernel/bpf/btf.c
··· 12 12 #include <linux/uaccess.h> 13 13 #include <linux/kernel.h> 14 14 #include <linux/idr.h> 15 + #include <linux/sort.h> 15 16 #include <linux/bpf_verifier.h> 16 17 #include <linux/btf.h> 17 18 ··· 163 162 #define BITS_ROUNDUP_BYTES(bits) \ 164 163 (BITS_ROUNDDOWN_BYTES(bits) + !!BITS_PER_BYTE_MASKED(bits)) 165 164 165 + #define BTF_INFO_MASK 0x0f00ffff 166 + #define BTF_INT_MASK 0x0fffffff 167 + #define BTF_TYPE_ID_VALID(type_id) ((type_id) <= BTF_MAX_TYPE) 168 + #define BTF_STR_OFFSET_VALID(name_off) ((name_off) <= BTF_MAX_NAME_OFFSET) 169 + 166 170 /* 16MB for 64k structs and each has 16 members and 167 171 * a few MB spaces for the string section. 168 172 * The hard limit is S32_MAX. 169 173 */ 170 174 #define BTF_MAX_SIZE (16 * 1024 * 1024) 171 - /* 64k. We can raise it later. The hard limit is S32_MAX. */ 172 - #define BTF_MAX_NR_TYPES 65535 173 175 174 176 #define for_each_member(i, struct_type, member) \ 175 177 for (i = 0, member = btf_type_member(struct_type); \ ··· 188 184 static DEFINE_SPINLOCK(btf_idr_lock); 189 185 190 186 struct btf { 191 - union { 192 - struct btf_header *hdr; 193 - void *data; 194 - }; 187 + void *data; 195 188 struct btf_type **types; 196 189 u32 *resolved_ids; 197 190 u32 *resolved_sizes; 198 191 const char *strings; 199 192 void *nohdr_data; 193 + struct btf_header hdr; 200 194 u32 nr_types; 201 195 u32 types_size; 202 196 u32 data_size; ··· 229 227 }; 230 228 231 229 #define MAX_RESOLVE_DEPTH 32 230 + 231 + struct btf_sec_info { 232 + u32 off; 233 + u32 len; 234 + }; 232 235 233 236 struct btf_verifier_env { 234 237 struct btf *btf; ··· 386 379 return "CHAR"; 387 380 else if (encoding == BTF_INT_BOOL) 388 381 return "BOOL"; 389 - else if (encoding == BTF_INT_VARARGS) 390 - return "VARARGS"; 391 382 else 392 383 return "UNKN"; 393 384 } ··· 422 417 423 418 static bool btf_name_offset_valid(const struct btf *btf, u32 offset) 424 419 { 425 - return !BTF_STR_TBL_ELF_ID(offset) && 426 - BTF_STR_OFFSET(offset) < 
btf->hdr->str_len; 420 + return BTF_STR_OFFSET_VALID(offset) && 421 + offset < btf->hdr.str_len; 427 422 } 428 423 429 424 static const char *btf_name_by_offset(const struct btf *btf, u32 offset) 430 425 { 431 - if (!BTF_STR_OFFSET(offset)) 426 + if (!offset) 432 427 return "(anon)"; 433 - else if (BTF_STR_OFFSET(offset) < btf->hdr->str_len) 434 - return &btf->strings[BTF_STR_OFFSET(offset)]; 428 + else if (offset < btf->hdr.str_len) 429 + return &btf->strings[offset]; 435 430 else 436 431 return "(invalid-name-offset)"; 437 432 } ··· 442 437 return NULL; 443 438 444 439 return btf->types[type_id]; 440 + } 441 + 442 + /* 443 + * Regular int is not a bit field and it must be either 444 + * u8/u16/u32/u64. 445 + */ 446 + static bool btf_type_int_is_regular(const struct btf_type *t) 447 + { 448 + u16 nr_bits, nr_bytes; 449 + u32 int_data; 450 + 451 + int_data = btf_type_int(t); 452 + nr_bits = BTF_INT_BITS(int_data); 453 + nr_bytes = BITS_ROUNDUP_BYTES(nr_bits); 454 + if (BITS_PER_BYTE_MASKED(nr_bits) || 455 + BTF_INT_OFFSET(int_data) || 456 + (nr_bytes != sizeof(u8) && nr_bytes != sizeof(u16) && 457 + nr_bytes != sizeof(u32) && nr_bytes != sizeof(u64))) { 458 + return false; 459 + } 460 + 461 + return true; 445 462 } 446 463 447 464 __printf(2, 3) static void __btf_verifier_log(struct bpf_verifier_log *log, ··· 563 536 __btf_verifier_log(log, "\n"); 564 537 } 565 538 566 - static void btf_verifier_log_hdr(struct btf_verifier_env *env) 539 + static void btf_verifier_log_hdr(struct btf_verifier_env *env, 540 + u32 btf_data_size) 567 541 { 568 542 struct bpf_verifier_log *log = &env->log; 569 543 const struct btf *btf = env->btf; ··· 573 545 if (!bpf_verifier_log_needed(log)) 574 546 return; 575 547 576 - hdr = btf->hdr; 548 + hdr = &btf->hdr; 577 549 __btf_verifier_log(log, "magic: 0x%x\n", hdr->magic); 578 550 __btf_verifier_log(log, "version: %u\n", hdr->version); 579 551 __btf_verifier_log(log, "flags: 0x%x\n", hdr->flags); 580 - __btf_verifier_log(log, 
"parent_label: %u\n", hdr->parent_label); 581 - __btf_verifier_log(log, "parent_name: %u\n", hdr->parent_name); 582 - __btf_verifier_log(log, "label_off: %u\n", hdr->label_off); 583 - __btf_verifier_log(log, "object_off: %u\n", hdr->object_off); 584 - __btf_verifier_log(log, "func_off: %u\n", hdr->func_off); 552 + __btf_verifier_log(log, "hdr_len: %u\n", hdr->hdr_len); 585 553 __btf_verifier_log(log, "type_off: %u\n", hdr->type_off); 554 + __btf_verifier_log(log, "type_len: %u\n", hdr->type_len); 586 555 __btf_verifier_log(log, "str_off: %u\n", hdr->str_off); 587 556 __btf_verifier_log(log, "str_len: %u\n", hdr->str_len); 588 - __btf_verifier_log(log, "btf_total_size: %u\n", btf->data_size); 557 + __btf_verifier_log(log, "btf_total_size: %u\n", btf_data_size); 589 558 } 590 559 591 560 static int btf_add_type(struct btf_verifier_env *env, struct btf_type *t) ··· 599 574 struct btf_type **new_types; 600 575 u32 expand_by, new_size; 601 576 602 - if (btf->types_size == BTF_MAX_NR_TYPES) { 577 + if (btf->types_size == BTF_MAX_TYPE) { 603 578 btf_verifier_log(env, "Exceeded max num of types"); 604 579 return -E2BIG; 605 580 } 606 581 607 582 expand_by = max_t(u32, btf->types_size >> 2, 16); 608 - new_size = min_t(u32, BTF_MAX_NR_TYPES, 583 + new_size = min_t(u32, BTF_MAX_TYPE, 609 584 btf->types_size + expand_by); 610 585 611 586 new_types = kvzalloc(new_size * sizeof(*new_types), ··· 935 910 } 936 911 937 912 int_data = btf_type_int(t); 913 + if (int_data & ~BTF_INT_MASK) { 914 + btf_verifier_log_basic(env, t, "Invalid int_data:%x", 915 + int_data); 916 + return -EINVAL; 917 + } 918 + 938 919 nr_bits = BTF_INT_BITS(int_data) + BTF_INT_OFFSET(int_data); 939 920 940 921 if (nr_bits > BITS_PER_U64) { ··· 954 923 return -EINVAL; 955 924 } 956 925 926 + /* 927 + * Only one of the encoding bits is allowed and it 928 + * should be sufficient for the pretty print purpose (i.e. decoding). 929 + * Multiple bits can be allowed later if it is found 930 + * to be insufficient. 
931 + */ 957 932 encoding = BTF_INT_ENCODING(int_data); 958 933 if (encoding && 959 934 encoding != BTF_INT_SIGNED && 960 935 encoding != BTF_INT_CHAR && 961 - encoding != BTF_INT_BOOL && 962 - encoding != BTF_INT_VARARGS) { 936 + encoding != BTF_INT_BOOL) { 963 937 btf_verifier_log_type(env, t, "Unsupported encoding"); 964 938 return -ENOTSUPP; 965 939 } ··· 1138 1102 return -EINVAL; 1139 1103 } 1140 1104 1141 - if (BTF_TYPE_PARENT(t->type)) { 1105 + if (!BTF_TYPE_ID_VALID(t->type)) { 1142 1106 btf_verifier_log_type(env, t, "Invalid type_id"); 1143 1107 return -EINVAL; 1144 1108 } ··· 1342 1306 return -EINVAL; 1343 1307 } 1344 1308 1345 - /* We are a little forgiving on array->index_type since 1346 - * the kernel is not using it. 1309 + /* Array elem type and index type cannot be in type void, 1310 + * so !array->type and !array->index_type are not allowed. 1347 1311 */ 1348 - /* Array elem cannot be in type void, 1349 - * so !array->type is not allowed. 1350 - */ 1351 - if (!array->type || BTF_TYPE_PARENT(array->type)) { 1352 - btf_verifier_log_type(env, t, "Invalid type_id"); 1312 + if (!array->type || !BTF_TYPE_ID_VALID(array->type)) { 1313 + btf_verifier_log_type(env, t, "Invalid elem"); 1314 + return -EINVAL; 1315 + } 1316 + 1317 + if (!array->index_type || !BTF_TYPE_ID_VALID(array->index_type)) { 1318 + btf_verifier_log_type(env, t, "Invalid index"); 1353 1319 return -EINVAL; 1354 1320 } 1355 1321 ··· 1364 1326 const struct resolve_vertex *v) 1365 1327 { 1366 1328 const struct btf_array *array = btf_type_array(v->t); 1367 - const struct btf_type *elem_type; 1368 - u32 elem_type_id = array->type; 1329 + const struct btf_type *elem_type, *index_type; 1330 + u32 elem_type_id, index_type_id; 1369 1331 struct btf *btf = env->btf; 1370 1332 u32 elem_size; 1371 1333 1334 + /* Check array->index_type */ 1335 + index_type_id = array->index_type; 1336 + index_type = btf_type_by_id(btf, index_type_id); 1337 + if (btf_type_is_void_or_null(index_type)) { 1338 + 
btf_verifier_log_type(env, v->t, "Invalid index"); 1339 + return -EINVAL; 1340 + } 1341 + 1342 + if (!env_type_is_resolve_sink(env, index_type) && 1343 + !env_type_is_resolved(env, index_type_id)) 1344 + return env_stack_push(env, index_type, index_type_id); 1345 + 1346 + index_type = btf_type_id_size(btf, &index_type_id, NULL); 1347 + if (!index_type || !btf_type_is_int(index_type) || 1348 + !btf_type_int_is_regular(index_type)) { 1349 + btf_verifier_log_type(env, v->t, "Invalid index"); 1350 + return -EINVAL; 1351 + } 1352 + 1353 + /* Check array->type */ 1354 + elem_type_id = array->type; 1372 1355 elem_type = btf_type_by_id(btf, elem_type_id); 1373 1356 if (btf_type_is_void_or_null(elem_type)) { 1374 1357 btf_verifier_log_type(env, v->t, ··· 1407 1348 return -EINVAL; 1408 1349 } 1409 1350 1410 - if (btf_type_is_int(elem_type)) { 1411 - int int_type_data = btf_type_int(elem_type); 1412 - u16 nr_bits = BTF_INT_BITS(int_type_data); 1413 - u16 nr_bytes = BITS_ROUNDUP_BYTES(nr_bits); 1414 - 1415 - /* Put more restriction on array of int. The int cannot 1416 - * be a bit field and it must be either u8/u16/u32/u64. 
1417 - */ 1418 - if (BITS_PER_BYTE_MASKED(nr_bits) || 1419 - BTF_INT_OFFSET(int_type_data) || 1420 - (nr_bytes != sizeof(u8) && nr_bytes != sizeof(u16) && 1421 - nr_bytes != sizeof(u32) && nr_bytes != sizeof(u64))) { 1422 - btf_verifier_log_type(env, v->t, 1423 - "Invalid array of int"); 1424 - return -EINVAL; 1425 - } 1351 + if (btf_type_is_int(elem_type) && !btf_type_int_is_regular(elem_type)) { 1352 + btf_verifier_log_type(env, v->t, "Invalid array of int"); 1353 + return -EINVAL; 1426 1354 } 1427 1355 1428 1356 if (array->nelems && elem_size > U32_MAX / array->nelems) { ··· 1519 1473 } 1520 1474 1521 1475 /* A member cannot be in type void */ 1522 - if (!member->type || BTF_TYPE_PARENT(member->type)) { 1476 + if (!member->type || !BTF_TYPE_ID_VALID(member->type)) { 1523 1477 btf_verifier_log_member(env, t, member, 1524 1478 "Invalid type_id"); 1525 1479 return -EINVAL; ··· 1772 1726 } 1773 1727 meta_left -= sizeof(*t); 1774 1728 1729 + if (t->info & ~BTF_INFO_MASK) { 1730 + btf_verifier_log(env, "[%u] Invalid btf_info:%x", 1731 + env->log_type_id, t->info); 1732 + return -EINVAL; 1733 + } 1734 + 1775 1735 if (BTF_INFO_KIND(t->info) > BTF_KIND_MAX || 1776 1736 BTF_INFO_KIND(t->info) == BTF_KIND_UNKN) { 1777 1737 btf_verifier_log(env, "[%u] Invalid kind:%u", ··· 1806 1754 struct btf_header *hdr; 1807 1755 void *cur, *end; 1808 1756 1809 - hdr = btf->hdr; 1757 + hdr = &btf->hdr; 1810 1758 cur = btf->nohdr_data + hdr->type_off; 1811 - end = btf->nohdr_data + hdr->str_off; 1759 + end = btf->nohdr_data + hdr->type_len; 1812 1760 1813 1761 env->log_type_id = 1; 1814 1762 while (cur < end) { ··· 1918 1866 1919 1867 static int btf_parse_type_sec(struct btf_verifier_env *env) 1920 1868 { 1869 + const struct btf_header *hdr = &env->btf->hdr; 1921 1870 int err; 1871 + 1872 + /* Type section must align to 4 bytes */ 1873 + if (hdr->type_off & (sizeof(u32) - 1)) { 1874 + btf_verifier_log(env, "Unaligned type_off"); 1875 + return -EINVAL; 1876 + } 1877 + 1878 + if 
(!hdr->type_len) { 1879 + btf_verifier_log(env, "No type found"); 1880 + return -EINVAL; 1881 + } 1922 1882 1923 1883 err = btf_check_all_metas(env); 1924 1884 if (err) ··· 1945 1881 struct btf *btf = env->btf; 1946 1882 const char *start, *end; 1947 1883 1948 - hdr = btf->hdr; 1884 + hdr = &btf->hdr; 1949 1885 start = btf->nohdr_data + hdr->str_off; 1950 1886 end = start + hdr->str_len; 1887 + 1888 + if (end != btf->data + btf->data_size) { 1889 + btf_verifier_log(env, "String section is not at the end"); 1890 + return -EINVAL; 1891 + } 1951 1892 1952 1893 if (!hdr->str_len || hdr->str_len - 1 > BTF_MAX_NAME_OFFSET || 1953 1894 start[0] || end[-1]) { ··· 1965 1896 return 0; 1966 1897 } 1967 1898 1968 - static int btf_parse_hdr(struct btf_verifier_env *env) 1899 + static const size_t btf_sec_info_offset[] = { 1900 + offsetof(struct btf_header, type_off), 1901 + offsetof(struct btf_header, str_off), 1902 + }; 1903 + 1904 + static int btf_sec_info_cmp(const void *a, const void *b) 1905 + { 1906 + const struct btf_sec_info *x = a; 1907 + const struct btf_sec_info *y = b; 1908 + 1909 + return (int)(x->off - y->off) ? 
: (int)(x->len - y->len); 1910 + } 1911 + 1912 + static int btf_check_sec_info(struct btf_verifier_env *env, 1913 + u32 btf_data_size) 1914 + { 1915 + struct btf_sec_info secs[ARRAY_SIZE(btf_sec_info_offset)]; 1916 + u32 total, expected_total, i; 1917 + const struct btf_header *hdr; 1918 + const struct btf *btf; 1919 + 1920 + btf = env->btf; 1921 + hdr = &btf->hdr; 1922 + 1923 + /* Populate the secs from hdr */ 1924 + for (i = 0; i < ARRAY_SIZE(btf_sec_info_offset); i++) 1925 + secs[i] = *(struct btf_sec_info *)((void *)hdr + 1926 + btf_sec_info_offset[i]); 1927 + 1928 + sort(secs, ARRAY_SIZE(btf_sec_info_offset), 1929 + sizeof(struct btf_sec_info), btf_sec_info_cmp, NULL); 1930 + 1931 + /* Check for gaps and overlap among sections */ 1932 + total = 0; 1933 + expected_total = btf_data_size - hdr->hdr_len; 1934 + for (i = 0; i < ARRAY_SIZE(btf_sec_info_offset); i++) { 1935 + if (expected_total < secs[i].off) { 1936 + btf_verifier_log(env, "Invalid section offset"); 1937 + return -EINVAL; 1938 + } 1939 + if (total < secs[i].off) { 1940 + /* gap */ 1941 + btf_verifier_log(env, "Unsupported section found"); 1942 + return -EINVAL; 1943 + } 1944 + if (total > secs[i].off) { 1945 + btf_verifier_log(env, "Section overlap found"); 1946 + return -EINVAL; 1947 + } 1948 + if (expected_total - total < secs[i].len) { 1949 + btf_verifier_log(env, 1950 + "Total section length too long"); 1951 + return -EINVAL; 1952 + } 1953 + total += secs[i].len; 1954 + } 1955 + 1956 + /* There is data other than hdr and known sections */ 1957 + if (expected_total != total) { 1958 + btf_verifier_log(env, "Unsupported section found"); 1959 + return -EINVAL; 1960 + } 1961 + 1962 + return 0; 1963 + } 1964 + 1965 + static int btf_parse_hdr(struct btf_verifier_env *env, void __user *btf_data, 1966 + u32 btf_data_size) 1969 1967 { 1970 1968 const struct btf_header *hdr; 1971 - struct btf *btf = env->btf; 1972 - u32 meta_left; 1969 + u32 hdr_len, hdr_copy; 1970 + /* 1971 + * Minimal part of the "struct 
btf_header" that 1972 + * contains the hdr_len. 1973 + */ 1974 + struct btf_min_header { 1975 + u16 magic; 1976 + u8 version; 1977 + u8 flags; 1978 + u32 hdr_len; 1979 + } __user *min_hdr; 1980 + struct btf *btf; 1981 + int err; 1973 1982 1974 - if (btf->data_size < sizeof(*hdr)) { 1983 + btf = env->btf; 1984 + min_hdr = btf_data; 1985 + 1986 + if (btf_data_size < sizeof(*min_hdr)) { 1987 + btf_verifier_log(env, "hdr_len not found"); 1988 + return -EINVAL; 1989 + } 1990 + 1991 + if (get_user(hdr_len, &min_hdr->hdr_len)) 1992 + return -EFAULT; 1993 + 1994 + if (btf_data_size < hdr_len) { 1975 1995 btf_verifier_log(env, "btf_header not found"); 1976 1996 return -EINVAL; 1977 1997 } 1978 1998 1979 - btf_verifier_log_hdr(env); 1999 + err = bpf_check_uarg_tail_zero(btf_data, sizeof(btf->hdr), hdr_len); 2000 + if (err) { 2001 + if (err == -E2BIG) 2002 + btf_verifier_log(env, "Unsupported btf_header"); 2003 + return err; 2004 + } 1980 2005 1981 - hdr = btf->hdr; 2006 + hdr_copy = min_t(u32, hdr_len, sizeof(btf->hdr)); 2007 + if (copy_from_user(&btf->hdr, btf_data, hdr_copy)) 2008 + return -EFAULT; 2009 + 2010 + hdr = &btf->hdr; 2011 + 2012 + btf_verifier_log_hdr(env, btf_data_size); 2013 + 1982 2014 if (hdr->magic != BTF_MAGIC) { 1983 2015 btf_verifier_log(env, "Invalid magic"); 1984 2016 return -EINVAL; ··· 2095 1925 return -ENOTSUPP; 2096 1926 } 2097 1927 2098 - meta_left = btf->data_size - sizeof(*hdr); 2099 - if (!meta_left) { 1928 + if (btf_data_size == hdr->hdr_len) { 2100 1929 btf_verifier_log(env, "No data"); 2101 1930 return -EINVAL; 2102 1931 } 2103 1932 2104 - if (meta_left < hdr->type_off || hdr->str_off <= hdr->type_off || 2105 - /* Type section must align to 4 bytes */ 2106 - hdr->type_off & (sizeof(u32) - 1)) { 2107 - btf_verifier_log(env, "Invalid type_off"); 2108 - return -EINVAL; 2109 - } 2110 - 2111 - if (meta_left < hdr->str_off || 2112 - meta_left - hdr->str_off < hdr->str_len) { 2113 - btf_verifier_log(env, "Invalid str_off or str_len"); 2114 - 
return -EINVAL; 2115 - } 2116 - 2117 - btf->nohdr_data = btf->hdr + 1; 1933 + err = btf_check_sec_info(env, btf_data_size); 1934 + if (err) 1935 + return err; 2118 1936 2119 1937 return 0; 2120 1938 } ··· 2145 1987 err = -ENOMEM; 2146 1988 goto errout; 2147 1989 } 1990 + env->btf = btf; 1991 + 1992 + err = btf_parse_hdr(env, btf_data, btf_data_size); 1993 + if (err) 1994 + goto errout; 2148 1995 2149 1996 data = kvmalloc(btf_data_size, GFP_KERNEL | __GFP_NOWARN); 2150 1997 if (!data) { ··· 2159 1996 2160 1997 btf->data = data; 2161 1998 btf->data_size = btf_data_size; 1999 + btf->nohdr_data = btf->data + btf->hdr.hdr_len; 2162 2000 2163 2001 if (copy_from_user(data, btf_data, btf_data_size)) { 2164 2002 err = -EFAULT; 2165 2003 goto errout; 2166 2004 } 2167 - 2168 - env->btf = btf; 2169 - 2170 - err = btf_parse_hdr(env); 2171 - if (err) 2172 - goto errout; 2173 2005 2174 2006 err = btf_parse_str_sec(env); 2175 2007 if (err) ··· 2174 2016 if (err) 2175 2017 goto errout; 2176 2018 2177 - if (!err && log->level && bpf_verifier_log_full(log)) { 2019 + if (log->level && bpf_verifier_log_full(log)) { 2178 2020 err = -ENOSPC; 2179 2021 goto errout; 2180 2022 } 2181 2023 2182 - if (!err) { 2183 - btf_verifier_env_free(env); 2184 - refcount_set(&btf->refcnt, 1); 2185 - return btf; 2186 - } 2024 + btf_verifier_env_free(env); 2025 + refcount_set(&btf->refcnt, 1); 2026 + return btf; 2187 2027 2188 2028 errout: 2189 2029 btf_verifier_env_free(env);
+1 -1
kernel/bpf/cpumap.c
··· 578 578 err = __ptr_ring_produce(q, xdpf); 579 579 if (err) { 580 580 drops++; 581 - xdp_return_frame(xdpf); 581 + xdp_return_frame_rx_napi(xdpf); 582 582 } 583 583 processed++; 584 584 }
+123 -8
kernel/bpf/devmap.c
··· 48 48 * calls will fail at this point. 49 49 */ 50 50 #include <linux/bpf.h> 51 + #include <net/xdp.h> 51 52 #include <linux/filter.h> 53 + #include <trace/events/xdp.h> 52 54 53 55 #define DEV_CREATE_FLAG_MASK \ 54 56 (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) 55 57 58 + #define DEV_MAP_BULK_SIZE 16 59 + struct xdp_bulk_queue { 60 + struct xdp_frame *q[DEV_MAP_BULK_SIZE]; 61 + struct net_device *dev_rx; 62 + unsigned int count; 63 + }; 64 + 56 65 struct bpf_dtab_netdev { 57 - struct net_device *dev; 66 + struct net_device *dev; /* must be first member, due to tracepoint */ 58 67 struct bpf_dtab *dtab; 59 68 unsigned int bit; 69 + struct xdp_bulk_queue __percpu *bulkq; 60 70 struct rcu_head rcu; 61 71 }; 62 72 ··· 216 206 __set_bit(bit, bitmap); 217 207 } 218 208 209 + static int bq_xmit_all(struct bpf_dtab_netdev *obj, 210 + struct xdp_bulk_queue *bq) 211 + { 212 + struct net_device *dev = obj->dev; 213 + int sent = 0, drops = 0, err = 0; 214 + int i; 215 + 216 + if (unlikely(!bq->count)) 217 + return 0; 218 + 219 + for (i = 0; i < bq->count; i++) { 220 + struct xdp_frame *xdpf = bq->q[i]; 221 + 222 + prefetch(xdpf); 223 + } 224 + 225 + sent = dev->netdev_ops->ndo_xdp_xmit(dev, bq->count, bq->q); 226 + if (sent < 0) { 227 + err = sent; 228 + sent = 0; 229 + goto error; 230 + } 231 + drops = bq->count - sent; 232 + out: 233 + bq->count = 0; 234 + 235 + trace_xdp_devmap_xmit(&obj->dtab->map, obj->bit, 236 + sent, drops, bq->dev_rx, dev, err); 237 + bq->dev_rx = NULL; 238 + return 0; 239 + error: 240 + /* If ndo_xdp_xmit fails with an errno, no frames have been 241 + * xmit'ed and it's our responsibility to them free all. 
242 + */ 243 + for (i = 0; i < bq->count; i++) { 244 + struct xdp_frame *xdpf = bq->q[i]; 245 + 246 + /* RX path under NAPI protection, can return frames faster */ 247 + xdp_return_frame_rx_napi(xdpf); 248 + drops++; 249 + } 250 + goto out; 251 + } 252 + 219 253 /* __dev_map_flush is called from xdp_do_flush_map() which _must_ be signaled 220 254 * from the driver before returning from its napi->poll() routine. The poll() 221 255 * routine is called either from busy_poll context or net_rx_action signaled ··· 275 221 276 222 for_each_set_bit(bit, bitmap, map->max_entries) { 277 223 struct bpf_dtab_netdev *dev = READ_ONCE(dtab->netdev_map[bit]); 224 + struct xdp_bulk_queue *bq; 278 225 struct net_device *netdev; 279 226 280 227 /* This is possible if the dev entry is removed by user space ··· 285 230 continue; 286 231 287 232 __clear_bit(bit, bitmap); 233 + 234 + bq = this_cpu_ptr(dev->bulkq); 235 + bq_xmit_all(dev, bq); 288 236 netdev = dev->dev; 289 237 if (likely(netdev->netdev_ops->ndo_xdp_flush)) 290 238 netdev->netdev_ops->ndo_xdp_flush(netdev); ··· 298 240 * update happens in parallel here a dev_put wont happen until after reading the 299 241 * ifindex. 300 242 */ 301 - struct net_device *__dev_map_lookup_elem(struct bpf_map *map, u32 key) 243 + struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key) 302 244 { 303 245 struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); 304 - struct bpf_dtab_netdev *dev; 246 + struct bpf_dtab_netdev *obj; 305 247 306 248 if (key >= map->max_entries) 307 249 return NULL; 308 250 309 - dev = READ_ONCE(dtab->netdev_map[key]); 310 - return dev ? dev->dev : NULL; 251 + obj = READ_ONCE(dtab->netdev_map[key]); 252 + return obj; 253 + } 254 + 255 + /* Runs under RCU-read-side, plus in softirq under NAPI protection. 256 + * Thus, safe percpu variable access. 
257 + */ 258 + static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf, 259 + struct net_device *dev_rx) 260 + 261 + { 262 + struct xdp_bulk_queue *bq = this_cpu_ptr(obj->bulkq); 263 + 264 + if (unlikely(bq->count == DEV_MAP_BULK_SIZE)) 265 + bq_xmit_all(obj, bq); 266 + 267 + /* Ingress dev_rx will be the same for all xdp_frame's in 268 + * bulk_queue, because bq stored per-CPU and must be flushed 269 + * from net_device drivers NAPI func end. 270 + */ 271 + if (!bq->dev_rx) 272 + bq->dev_rx = dev_rx; 273 + 274 + bq->q[bq->count++] = xdpf; 275 + return 0; 276 + } 277 + 278 + int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, 279 + struct net_device *dev_rx) 280 + { 281 + struct net_device *dev = dst->dev; 282 + struct xdp_frame *xdpf; 283 + 284 + if (!dev->netdev_ops->ndo_xdp_xmit) 285 + return -EOPNOTSUPP; 286 + 287 + xdpf = convert_to_xdp_frame(xdp); 288 + if (unlikely(!xdpf)) 289 + return -EOVERFLOW; 290 + 291 + return bq_enqueue(dst, xdpf, dev_rx); 311 292 } 312 293 313 294 static void *dev_map_lookup_elem(struct bpf_map *map, void *key) 314 295 { 315 - struct net_device *dev = __dev_map_lookup_elem(map, *(u32 *)key); 296 + struct bpf_dtab_netdev *obj = __dev_map_lookup_elem(map, *(u32 *)key); 297 + struct net_device *dev = dev = obj ? obj->dev : NULL; 316 298 317 299 return dev ? 
&dev->ifindex : NULL; 318 300 } ··· 361 263 { 362 264 if (dev->dev->netdev_ops->ndo_xdp_flush) { 363 265 struct net_device *fl = dev->dev; 266 + struct xdp_bulk_queue *bq; 364 267 unsigned long *bitmap; 268 + 365 269 int cpu; 366 270 367 271 for_each_online_cpu(cpu) { 368 272 bitmap = per_cpu_ptr(dev->dtab->flush_needed, cpu); 369 273 __clear_bit(dev->bit, bitmap); 274 + 275 + bq = per_cpu_ptr(dev->bulkq, cpu); 276 + bq_xmit_all(dev, bq); 370 277 371 278 fl->netdev_ops->ndo_xdp_flush(dev->dev); 372 279 } ··· 384 281 385 282 dev = container_of(rcu, struct bpf_dtab_netdev, rcu); 386 283 dev_map_flush_old(dev); 284 + free_percpu(dev->bulkq); 387 285 dev_put(dev->dev); 388 286 kfree(dev); 389 287 } ··· 417 313 { 418 314 struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); 419 315 struct net *net = current->nsproxy->net_ns; 316 + gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN; 420 317 struct bpf_dtab_netdev *dev, *old_dev; 421 318 u32 i = *(u32 *)key; 422 319 u32 ifindex = *(u32 *)value; ··· 432 327 if (!ifindex) { 433 328 dev = NULL; 434 329 } else { 435 - dev = kmalloc_node(sizeof(*dev), GFP_ATOMIC | __GFP_NOWARN, 436 - map->numa_node); 330 + dev = kmalloc_node(sizeof(*dev), gfp, map->numa_node); 437 331 if (!dev) 438 332 return -ENOMEM; 439 333 334 + dev->bulkq = __alloc_percpu_gfp(sizeof(*dev->bulkq), 335 + sizeof(void *), gfp); 336 + if (!dev->bulkq) { 337 + kfree(dev); 338 + return -ENOMEM; 339 + } 340 + 440 341 dev->dev = dev_get_by_index(net, ifindex); 441 342 if (!dev->dev) { 343 + free_percpu(dev->bulkq); 442 344 kfree(dev); 443 345 return -EINVAL; 444 346 } ··· 517 405 518 406 static int __init dev_map_init(void) 519 407 { 408 + /* Assure tracepoint shadow struct _bpf_dtab_netdev is in sync */ 409 + BUILD_BUG_ON(offsetof(struct bpf_dtab_netdev, dev) != 410 + offsetof(struct _bpf_dtab_netdev, dev)); 520 411 register_netdevice_notifier(&dev_map_notifier); 521 412 return 0; 522 413 }
+2 -2
kernel/bpf/sockmap.c
··· 523 523 } 524 524 525 525 bpf_compute_data_pointers_sg(md); 526 + md->sk = sk; 526 527 rc = (*prog->bpf_func)(md, prog->insnsi); 527 528 psock->apply_bytes = md->apply_bytes; 528 529 ··· 1714 1713 struct smap_psock_map_entry *e = NULL; 1715 1714 struct smap_psock *psock; 1716 1715 bool new = false; 1717 - int err; 1716 + int err = 0; 1718 1717 1719 1718 /* 1. If sock map has BPF programs those will be inherited by the 1720 1719 * sock being added. If the sock is already attached to BPF programs ··· 1824 1823 write_unlock_bh(&sock->sk_callback_lock); 1825 1824 return err; 1826 1825 out_free: 1827 - kfree(e); 1828 1826 smap_release_sock(psock, sock); 1829 1827 out_progs: 1830 1828 if (parse && verdict) {
+226 -19
kernel/bpf/syscall.c
··· 18 18 #include <linux/vmalloc.h> 19 19 #include <linux/mmzone.h> 20 20 #include <linux/anon_inodes.h> 21 + #include <linux/fdtable.h> 21 22 #include <linux/file.h> 23 + #include <linux/fs.h> 22 24 #include <linux/license.h> 23 25 #include <linux/filter.h> 24 26 #include <linux/version.h> ··· 67 65 * copy_from_user() call. However, this is not a concern since this function is 68 66 * meant to be a future-proofing of bits. 69 67 */ 70 - static int check_uarg_tail_zero(void __user *uaddr, 71 - size_t expected_size, 72 - size_t actual_size) 68 + int bpf_check_uarg_tail_zero(void __user *uaddr, 69 + size_t expected_size, 70 + size_t actual_size) 73 71 { 74 72 unsigned char __user *addr; 75 73 unsigned char __user *end; ··· 424 422 return 0; 425 423 } 426 424 427 - #define BPF_MAP_CREATE_LAST_FIELD btf_value_id 425 + #define BPF_MAP_CREATE_LAST_FIELD btf_value_type_id 428 426 /* called via syscall */ 429 427 static int map_create(union bpf_attr *attr) 430 428 { ··· 459 457 atomic_set(&map->usercnt, 1); 460 458 461 459 if (bpf_map_support_seq_show(map) && 462 - (attr->btf_key_id || attr->btf_value_id)) { 460 + (attr->btf_key_type_id || attr->btf_value_type_id)) { 463 461 struct btf *btf; 464 462 465 - if (!attr->btf_key_id || !attr->btf_value_id) { 463 + if (!attr->btf_key_type_id || !attr->btf_value_type_id) { 466 464 err = -EINVAL; 467 465 goto free_map_nouncharge; 468 466 } ··· 473 471 goto free_map_nouncharge; 474 472 } 475 473 476 - err = map->ops->map_check_btf(map, btf, attr->btf_key_id, 477 - attr->btf_value_id); 474 + err = map->ops->map_check_btf(map, btf, attr->btf_key_type_id, 475 + attr->btf_value_type_id); 478 476 if (err) { 479 477 btf_put(btf); 480 478 goto free_map_nouncharge; 481 479 } 482 480 483 481 map->btf = btf; 484 - map->btf_key_id = attr->btf_key_id; 485 - map->btf_value_id = attr->btf_value_id; 482 + map->btf_key_type_id = attr->btf_key_type_id; 483 + map->btf_value_type_id = attr->btf_value_type_id; 486 484 } 487 485 488 486 err = 
security_bpf_map_alloc(map); ··· 1901 1899 u32 ulen; 1902 1900 int err; 1903 1901 1904 - err = check_uarg_tail_zero(uinfo, sizeof(info), info_len); 1902 + err = bpf_check_uarg_tail_zero(uinfo, sizeof(info), info_len); 1905 1903 if (err) 1906 1904 return err; 1907 1905 info_len = min_t(u32, sizeof(info), info_len); ··· 1935 1933 if (!capable(CAP_SYS_ADMIN)) { 1936 1934 info.jited_prog_len = 0; 1937 1935 info.xlated_prog_len = 0; 1936 + info.nr_jited_ksyms = 0; 1938 1937 goto done; 1939 1938 } 1940 1939 ··· 1972 1969 * for offload. 1973 1970 */ 1974 1971 ulen = info.jited_prog_len; 1975 - info.jited_prog_len = prog->jited_len; 1972 + if (prog->aux->func_cnt) { 1973 + u32 i; 1974 + 1975 + info.jited_prog_len = 0; 1976 + for (i = 0; i < prog->aux->func_cnt; i++) 1977 + info.jited_prog_len += prog->aux->func[i]->jited_len; 1978 + } else { 1979 + info.jited_prog_len = prog->jited_len; 1980 + } 1981 + 1976 1982 if (info.jited_prog_len && ulen) { 1977 1983 if (bpf_dump_raw_ok()) { 1978 1984 uinsns = u64_to_user_ptr(info.jited_prog_insns); 1979 1985 ulen = min_t(u32, info.jited_prog_len, ulen); 1980 - if (copy_to_user(uinsns, prog->bpf_func, ulen)) 1981 - return -EFAULT; 1986 + 1987 + /* for multi-function programs, copy the JITed 1988 + * instructions for all the functions 1989 + */ 1990 + if (prog->aux->func_cnt) { 1991 + u32 len, free, i; 1992 + u8 *img; 1993 + 1994 + free = ulen; 1995 + for (i = 0; i < prog->aux->func_cnt; i++) { 1996 + len = prog->aux->func[i]->jited_len; 1997 + len = min_t(u32, len, free); 1998 + img = (u8 *) prog->aux->func[i]->bpf_func; 1999 + if (copy_to_user(uinsns, img, len)) 2000 + return -EFAULT; 2001 + uinsns += len; 2002 + free -= len; 2003 + if (!free) 2004 + break; 2005 + } 2006 + } else { 2007 + if (copy_to_user(uinsns, prog->bpf_func, ulen)) 2008 + return -EFAULT; 2009 + } 1982 2010 } else { 1983 2011 info.jited_prog_insns = 0; 2012 + } 2013 + } 2014 + 2015 + ulen = info.nr_jited_ksyms; 2016 + info.nr_jited_ksyms = prog->aux->func_cnt; 
2017 + if (info.nr_jited_ksyms && ulen) { 2018 + if (bpf_dump_raw_ok()) { 2019 + u64 __user *user_ksyms; 2020 + ulong ksym_addr; 2021 + u32 i; 2022 + 2023 + /* copy the address of the kernel symbol 2024 + * corresponding to each function 2025 + */ 2026 + ulen = min_t(u32, info.nr_jited_ksyms, ulen); 2027 + user_ksyms = u64_to_user_ptr(info.jited_ksyms); 2028 + for (i = 0; i < ulen; i++) { 2029 + ksym_addr = (ulong) prog->aux->func[i]->bpf_func; 2030 + ksym_addr &= PAGE_MASK; 2031 + if (put_user((u64) ksym_addr, &user_ksyms[i])) 2032 + return -EFAULT; 2033 + } 2034 + } else { 2035 + info.jited_ksyms = 0; 2036 + } 2037 + } 2038 + 2039 + ulen = info.nr_jited_func_lens; 2040 + info.nr_jited_func_lens = prog->aux->func_cnt; 2041 + if (info.nr_jited_func_lens && ulen) { 2042 + if (bpf_dump_raw_ok()) { 2043 + u32 __user *user_lens; 2044 + u32 func_len, i; 2045 + 2046 + /* copy the JITed image lengths for each function */ 2047 + ulen = min_t(u32, info.nr_jited_func_lens, ulen); 2048 + user_lens = u64_to_user_ptr(info.jited_func_lens); 2049 + for (i = 0; i < ulen; i++) { 2050 + func_len = prog->aux->func[i]->jited_len; 2051 + if (put_user(func_len, &user_lens[i])) 2052 + return -EFAULT; 2053 + } 2054 + } else { 2055 + info.jited_func_lens = 0; 1984 2056 } 1985 2057 } 1986 2058 ··· 2076 1998 u32 info_len = attr->info.info_len; 2077 1999 int err; 2078 2000 2079 - err = check_uarg_tail_zero(uinfo, sizeof(info), info_len); 2001 + err = bpf_check_uarg_tail_zero(uinfo, sizeof(info), info_len); 2080 2002 if (err) 2081 2003 return err; 2082 2004 info_len = min_t(u32, sizeof(info), info_len); ··· 2091 2013 2092 2014 if (map->btf) { 2093 2015 info.btf_id = btf_id(map->btf); 2094 - info.btf_key_id = map->btf_key_id; 2095 - info.btf_value_id = map->btf_value_id; 2016 + info.btf_key_type_id = map->btf_key_type_id; 2017 + info.btf_value_type_id = map->btf_value_type_id; 2096 2018 } 2097 2019 2098 2020 if (bpf_map_is_dev_bound(map)) { ··· 2116 2038 u32 info_len = attr->info.info_len; 2117 
2039 int err; 2118 2040 2119 - err = check_uarg_tail_zero(uinfo, sizeof(*uinfo), info_len); 2041 + err = bpf_check_uarg_tail_zero(uinfo, sizeof(*uinfo), info_len); 2120 2042 if (err) 2121 2043 return err; 2122 2044 ··· 2180 2102 return btf_get_fd_by_id(attr->btf_id); 2181 2103 } 2182 2104 2105 + static int bpf_task_fd_query_copy(const union bpf_attr *attr, 2106 + union bpf_attr __user *uattr, 2107 + u32 prog_id, u32 fd_type, 2108 + const char *buf, u64 probe_offset, 2109 + u64 probe_addr) 2110 + { 2111 + char __user *ubuf = u64_to_user_ptr(attr->task_fd_query.buf); 2112 + u32 len = buf ? strlen(buf) : 0, input_len; 2113 + int err = 0; 2114 + 2115 + if (put_user(len, &uattr->task_fd_query.buf_len)) 2116 + return -EFAULT; 2117 + input_len = attr->task_fd_query.buf_len; 2118 + if (input_len && ubuf) { 2119 + if (!len) { 2120 + /* nothing to copy, just make ubuf NULL terminated */ 2121 + char zero = '\0'; 2122 + 2123 + if (put_user(zero, ubuf)) 2124 + return -EFAULT; 2125 + } else if (input_len >= len + 1) { 2126 + /* ubuf can hold the string with NULL terminator */ 2127 + if (copy_to_user(ubuf, buf, len + 1)) 2128 + return -EFAULT; 2129 + } else { 2130 + /* ubuf cannot hold the string with NULL terminator, 2131 + * do a partial copy with NULL terminator. 
2132 + */ 2133 + char zero = '\0'; 2134 + 2135 + err = -ENOSPC; 2136 + if (copy_to_user(ubuf, buf, input_len - 1)) 2137 + return -EFAULT; 2138 + if (put_user(zero, ubuf + input_len - 1)) 2139 + return -EFAULT; 2140 + } 2141 + } 2142 + 2143 + if (put_user(prog_id, &uattr->task_fd_query.prog_id) || 2144 + put_user(fd_type, &uattr->task_fd_query.fd_type) || 2145 + put_user(probe_offset, &uattr->task_fd_query.probe_offset) || 2146 + put_user(probe_addr, &uattr->task_fd_query.probe_addr)) 2147 + return -EFAULT; 2148 + 2149 + return err; 2150 + } 2151 + 2152 + #define BPF_TASK_FD_QUERY_LAST_FIELD task_fd_query.probe_addr 2153 + 2154 + static int bpf_task_fd_query(const union bpf_attr *attr, 2155 + union bpf_attr __user *uattr) 2156 + { 2157 + pid_t pid = attr->task_fd_query.pid; 2158 + u32 fd = attr->task_fd_query.fd; 2159 + const struct perf_event *event; 2160 + struct files_struct *files; 2161 + struct task_struct *task; 2162 + struct file *file; 2163 + int err; 2164 + 2165 + if (CHECK_ATTR(BPF_TASK_FD_QUERY)) 2166 + return -EINVAL; 2167 + 2168 + if (!capable(CAP_SYS_ADMIN)) 2169 + return -EPERM; 2170 + 2171 + if (attr->task_fd_query.flags != 0) 2172 + return -EINVAL; 2173 + 2174 + task = get_pid_task(find_vpid(pid), PIDTYPE_PID); 2175 + if (!task) 2176 + return -ENOENT; 2177 + 2178 + files = get_files_struct(task); 2179 + put_task_struct(task); 2180 + if (!files) 2181 + return -ENOENT; 2182 + 2183 + err = 0; 2184 + spin_lock(&files->file_lock); 2185 + file = fcheck_files(files, fd); 2186 + if (!file) 2187 + err = -EBADF; 2188 + else 2189 + get_file(file); 2190 + spin_unlock(&files->file_lock); 2191 + put_files_struct(files); 2192 + 2193 + if (err) 2194 + goto out; 2195 + 2196 + if (file->f_op == &bpf_raw_tp_fops) { 2197 + struct bpf_raw_tracepoint *raw_tp = file->private_data; 2198 + struct bpf_raw_event_map *btp = raw_tp->btp; 2199 + 2200 + err = bpf_task_fd_query_copy(attr, uattr, 2201 + raw_tp->prog->aux->id, 2202 + BPF_FD_TYPE_RAW_TRACEPOINT, 2203 + btp->tp->name, 
0, 0); 2204 + goto put_file; 2205 + } 2206 + 2207 + event = perf_get_event(file); 2208 + if (!IS_ERR(event)) { 2209 + u64 probe_offset, probe_addr; 2210 + u32 prog_id, fd_type; 2211 + const char *buf; 2212 + 2213 + err = bpf_get_perf_event_info(event, &prog_id, &fd_type, 2214 + &buf, &probe_offset, 2215 + &probe_addr); 2216 + if (!err) 2217 + err = bpf_task_fd_query_copy(attr, uattr, prog_id, 2218 + fd_type, buf, 2219 + probe_offset, 2220 + probe_addr); 2221 + goto put_file; 2222 + } 2223 + 2224 + err = -ENOTSUPP; 2225 + put_file: 2226 + fput(file); 2227 + out: 2228 + return err; 2229 + } 2230 + 2183 2231 SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) 2184 2232 { 2185 2233 union bpf_attr attr = {}; ··· 2314 2110 if (sysctl_unprivileged_bpf_disabled && !capable(CAP_SYS_ADMIN)) 2315 2111 return -EPERM; 2316 2112 2317 - err = check_uarg_tail_zero(uattr, sizeof(attr), size); 2113 + err = bpf_check_uarg_tail_zero(uattr, sizeof(attr), size); 2318 2114 if (err) 2319 2115 return err; 2320 2116 size = min_t(u32, size, sizeof(attr)); ··· 2391 2187 break; 2392 2188 case BPF_BTF_GET_FD_BY_ID: 2393 2189 err = bpf_btf_get_fd_by_id(&attr); 2190 + break; 2191 + case BPF_TASK_FD_QUERY: 2192 + err = bpf_task_fd_query(&attr, uattr); 2394 2193 break; 2395 2194 default: 2396 2195 err = -EINVAL;
+16 -7
kernel/bpf/verifier.c
··· 1262 1262 switch (env->prog->type) { 1263 1263 case BPF_PROG_TYPE_LWT_IN: 1264 1264 case BPF_PROG_TYPE_LWT_OUT: 1265 + case BPF_PROG_TYPE_LWT_SEG6LOCAL: 1265 1266 /* dst_input() and dst_output() can't write for now */ 1266 1267 if (t == BPF_WRITE) 1267 1268 return false; ··· 5384 5383 insn->src_reg != BPF_PSEUDO_CALL) 5385 5384 continue; 5386 5385 subprog = insn->off; 5387 - insn->off = 0; 5388 5386 insn->imm = (u64 (*)(u64, u64, u64, u64, u64)) 5389 5387 func[subprog]->bpf_func - 5390 5388 __bpf_call_base; 5391 5389 } 5390 + 5391 + /* we use the aux data to keep a list of the start addresses 5392 + * of the JITed images for each function in the program 5393 + * 5394 + * for some architectures, such as powerpc64, the imm field 5395 + * might not be large enough to hold the offset of the start 5396 + * address of the callee's JITed image from __bpf_call_base 5397 + * 5398 + * in such cases, we can lookup the start address of a callee 5399 + * by using its subprog id, available from the off field of 5400 + * the call instruction, as an index for this list 5401 + */ 5402 + func[i]->aux->func = func; 5403 + func[i]->aux->func_cnt = env->subprog_cnt; 5392 5404 } 5393 5405 for (i = 0; i < env->subprog_cnt; i++) { 5394 5406 old_bpf_func = func[i]->bpf_func; ··· 5427 5413 * later look the same as if they were interpreted only. 5428 5414 */ 5429 5415 for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) { 5430 - unsigned long addr; 5431 - 5432 5416 if (insn->code != (BPF_JMP | BPF_CALL) || 5433 5417 insn->src_reg != BPF_PSEUDO_CALL) 5434 5418 continue; 5435 5419 insn->off = env->insn_aux_data[i].call_imm; 5436 5420 subprog = find_subprog(env, i + insn->off + 1); 5437 - addr = (unsigned long)func[subprog]->bpf_func; 5438 - addr &= PAGE_MASK; 5439 - insn->imm = (u64 (*)(u64, u64, u64, u64, u64)) 5440 - addr - __bpf_call_base; 5421 + insn->imm = subprog; 5441 5422 } 5442 5423 5443 5424 prog->jited = 1;
-9
kernel/bpf/xskmap.c
··· 1 1 // SPDX-License-Identifier: GPL-2.0 2 2 /* XSKMAP used for AF_XDP sockets 3 3 * Copyright(c) 2018 Intel Corporation. 4 - * 5 - * This program is free software; you can redistribute it and/or modify it 6 - * under the terms and conditions of the GNU General Public License, 7 - * version 2, as published by the Free Software Foundation. 8 - * 9 - * This program is distributed in the hope it will be useful, but WITHOUT 10 - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for 12 - * more details. 13 4 */ 14 5 15 6 #include <linux/bpf.h>
+8
kernel/events/core.c
··· 11212 11212 return file; 11213 11213 } 11214 11214 11215 + const struct perf_event *perf_get_event(struct file *file) 11216 + { 11217 + if (file->f_op != &perf_fops) 11218 + return ERR_PTR(-EINVAL); 11219 + 11220 + return file->private_data; 11221 + } 11222 + 11215 11223 const struct perf_event_attr *perf_event_attrs(struct perf_event *event) 11216 11224 { 11217 11225 if (!event)
+48
kernel/trace/bpf_trace.c
··· 14 14 #include <linux/uaccess.h> 15 15 #include <linux/ctype.h> 16 16 #include <linux/kprobes.h> 17 + #include <linux/syscalls.h> 17 18 #include <linux/error-injection.h> 18 19 19 20 #include "trace_probe.h" ··· 1162 1161 mutex_lock(&bpf_event_mutex); 1163 1162 err = tracepoint_probe_unregister(btp->tp, (void *)btp->bpf_func, prog); 1164 1163 mutex_unlock(&bpf_event_mutex); 1164 + return err; 1165 + } 1166 + 1167 + int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id, 1168 + u32 *fd_type, const char **buf, 1169 + u64 *probe_offset, u64 *probe_addr) 1170 + { 1171 + bool is_tracepoint, is_syscall_tp; 1172 + struct bpf_prog *prog; 1173 + int flags, err = 0; 1174 + 1175 + prog = event->prog; 1176 + if (!prog) 1177 + return -ENOENT; 1178 + 1179 + /* not supporting BPF_PROG_TYPE_PERF_EVENT yet */ 1180 + if (prog->type == BPF_PROG_TYPE_PERF_EVENT) 1181 + return -EOPNOTSUPP; 1182 + 1183 + *prog_id = prog->aux->id; 1184 + flags = event->tp_event->flags; 1185 + is_tracepoint = flags & TRACE_EVENT_FL_TRACEPOINT; 1186 + is_syscall_tp = is_syscall_trace_event(event->tp_event); 1187 + 1188 + if (is_tracepoint || is_syscall_tp) { 1189 + *buf = is_tracepoint ? event->tp_event->tp->name 1190 + : event->tp_event->name; 1191 + *fd_type = BPF_FD_TYPE_TRACEPOINT; 1192 + *probe_offset = 0x0; 1193 + *probe_addr = 0x0; 1194 + } else { 1195 + /* kprobe/uprobe */ 1196 + err = -EOPNOTSUPP; 1197 + #ifdef CONFIG_KPROBE_EVENTS 1198 + if (flags & TRACE_EVENT_FL_KPROBE) 1199 + err = bpf_get_kprobe_info(event, fd_type, buf, 1200 + probe_offset, probe_addr, 1201 + event->attr.type == PERF_TYPE_TRACEPOINT); 1202 + #endif 1203 + #ifdef CONFIG_UPROBE_EVENTS 1204 + if (flags & TRACE_EVENT_FL_UPROBE) 1205 + err = bpf_get_uprobe_info(event, fd_type, buf, 1206 + probe_offset, 1207 + event->attr.type == PERF_TYPE_TRACEPOINT); 1208 + #endif 1209 + } 1210 + 1165 1211 return err; 1166 1212 }
+29
kernel/trace/trace_kprobe.c
··· 1287 1287 head, NULL); 1288 1288 } 1289 1289 NOKPROBE_SYMBOL(kretprobe_perf_func); 1290 + 1291 + int bpf_get_kprobe_info(const struct perf_event *event, u32 *fd_type, 1292 + const char **symbol, u64 *probe_offset, 1293 + u64 *probe_addr, bool perf_type_tracepoint) 1294 + { 1295 + const char *pevent = trace_event_name(event->tp_event); 1296 + const char *group = event->tp_event->class->system; 1297 + struct trace_kprobe *tk; 1298 + 1299 + if (perf_type_tracepoint) 1300 + tk = find_trace_kprobe(pevent, group); 1301 + else 1302 + tk = event->tp_event->data; 1303 + if (!tk) 1304 + return -EINVAL; 1305 + 1306 + *fd_type = trace_kprobe_is_return(tk) ? BPF_FD_TYPE_KRETPROBE 1307 + : BPF_FD_TYPE_KPROBE; 1308 + if (tk->symbol) { 1309 + *symbol = tk->symbol; 1310 + *probe_offset = tk->rp.kp.offset; 1311 + *probe_addr = 0; 1312 + } else { 1313 + *symbol = NULL; 1314 + *probe_offset = 0; 1315 + *probe_addr = (unsigned long)tk->rp.kp.addr; 1316 + } 1317 + return 0; 1318 + } 1290 1319 #endif /* CONFIG_PERF_EVENTS */ 1291 1320 1292 1321 /*
+22
kernel/trace/trace_uprobe.c
··· 1161 1161 { 1162 1162 __uprobe_perf_func(tu, func, regs, ucb, dsize); 1163 1163 } 1164 + 1165 + int bpf_get_uprobe_info(const struct perf_event *event, u32 *fd_type, 1166 + const char **filename, u64 *probe_offset, 1167 + bool perf_type_tracepoint) 1168 + { 1169 + const char *pevent = trace_event_name(event->tp_event); 1170 + const char *group = event->tp_event->class->system; 1171 + struct trace_uprobe *tu; 1172 + 1173 + if (perf_type_tracepoint) 1174 + tu = find_probe_event(pevent, group); 1175 + else 1176 + tu = event->tp_event->data; 1177 + if (!tu) 1178 + return -EINVAL; 1179 + 1180 + *fd_type = is_ret_probe(tu) ? BPF_FD_TYPE_URETPROBE 1181 + : BPF_FD_TYPE_UPROBE; 1182 + *filename = tu->filename; 1183 + *probe_offset = tu->offset; 1184 + return 0; 1185 + } 1164 1186 #endif /* CONFIG_PERF_EVENTS */ 1165 1187 1166 1188 static int
+491 -81
net/core/filter.c
··· 64 64 #include <net/ip_fib.h> 65 65 #include <net/flow.h> 66 66 #include <net/arp.h> 67 + #include <net/ipv6.h> 68 + #include <linux/seg6_local.h> 69 + #include <net/seg6.h> 70 + #include <net/seg6_local.h> 67 71 68 72 /** 69 73 * sk_filter_trim_cap - run a packet through a socket filter ··· 3046 3042 u32 index) 3047 3043 { 3048 3044 struct xdp_frame *xdpf; 3049 - int err; 3045 + int sent; 3050 3046 3051 3047 if (!dev->netdev_ops->ndo_xdp_xmit) { 3052 3048 return -EOPNOTSUPP; ··· 3056 3052 if (unlikely(!xdpf)) 3057 3053 return -EOVERFLOW; 3058 3054 3059 - err = dev->netdev_ops->ndo_xdp_xmit(dev, xdpf); 3060 - if (err) 3061 - return err; 3055 + sent = dev->netdev_ops->ndo_xdp_xmit(dev, 1, &xdpf); 3056 + if (sent <= 0) 3057 + return sent; 3062 3058 dev->netdev_ops->ndo_xdp_flush(dev); 3063 3059 return 0; 3064 3060 } ··· 3072 3068 3073 3069 switch (map->map_type) { 3074 3070 case BPF_MAP_TYPE_DEVMAP: { 3075 - struct net_device *dev = fwd; 3076 - struct xdp_frame *xdpf; 3071 + struct bpf_dtab_netdev *dst = fwd; 3077 3072 3078 - if (!dev->netdev_ops->ndo_xdp_xmit) 3079 - return -EOPNOTSUPP; 3080 - 3081 - xdpf = convert_to_xdp_frame(xdp); 3082 - if (unlikely(!xdpf)) 3083 - return -EOVERFLOW; 3084 - 3085 - /* TODO: move to inside map code instead, for bulk support 3086 - * err = dev_map_enqueue(dev, xdp); 3087 - */ 3088 - err = dev->netdev_ops->ndo_xdp_xmit(dev, xdpf); 3073 + err = dev_map_enqueue(dst, xdp, dev_rx); 3089 3074 if (err) 3090 3075 return err; 3091 3076 __dev_map_insert_ctx(map, index); ··· 3362 3369 .arg2_type = ARG_ANYTHING, 3363 3370 .arg3_type = ARG_ANYTHING, 3364 3371 }; 3365 - 3366 - bool bpf_helper_changes_pkt_data(void *func) 3367 - { 3368 - if (func == bpf_skb_vlan_push || 3369 - func == bpf_skb_vlan_pop || 3370 - func == bpf_skb_store_bytes || 3371 - func == bpf_skb_change_proto || 3372 - func == bpf_skb_change_head || 3373 - func == bpf_skb_change_tail || 3374 - func == bpf_skb_adjust_room || 3375 - func == bpf_skb_pull_data || 3376 - func == 
bpf_clone_redirect || 3377 - func == bpf_l3_csum_replace || 3378 - func == bpf_l4_csum_replace || 3379 - func == bpf_xdp_adjust_head || 3380 - func == bpf_xdp_adjust_meta || 3381 - func == bpf_msg_pull_data || 3382 - func == bpf_xdp_adjust_tail) 3383 - return true; 3384 - 3385 - return false; 3386 - } 3387 3372 3388 3373 static unsigned long bpf_skb_copy(void *dst_buff, const void *skb, 3389 3374 unsigned long off, unsigned long len) ··· 4067 4096 4068 4097 #if IS_ENABLED(CONFIG_INET) 4069 4098 static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params, 4070 - u32 flags) 4099 + u32 flags, bool check_mtu) 4071 4100 { 4072 4101 struct in_device *in_dev; 4073 4102 struct neighbour *neigh; ··· 4076 4105 struct fib_nh *nh; 4077 4106 struct flowi4 fl4; 4078 4107 int err; 4108 + u32 mtu; 4079 4109 4080 4110 dev = dev_get_by_index_rcu(net, params->ifindex); 4081 4111 if (unlikely(!dev)) ··· 4128 4156 if (res.fi->fib_nhs > 1) 4129 4157 fib_select_path(net, &res, &fl4, NULL); 4130 4158 4159 + if (check_mtu) { 4160 + mtu = ip_mtu_from_fib_result(&res, params->ipv4_dst); 4161 + if (params->tot_len > mtu) 4162 + return 0; 4163 + } 4164 + 4131 4165 nh = &res.fi->fib_nh[res.nh_sel]; 4132 4166 4133 4167 /* do not handle lwt encaps right now */ ··· 4162 4184 4163 4185 #if IS_ENABLED(CONFIG_IPV6) 4164 4186 static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params, 4165 - u32 flags) 4187 + u32 flags, bool check_mtu) 4166 4188 { 4167 4189 struct in6_addr *src = (struct in6_addr *) params->ipv6_src; 4168 4190 struct in6_addr *dst = (struct in6_addr *) params->ipv6_dst; ··· 4173 4195 struct flowi6 fl6; 4174 4196 int strict = 0; 4175 4197 int oif; 4198 + u32 mtu; 4176 4199 4177 4200 /* link local addresses are never forwarded */ 4178 4201 if (rt6_need_strict(dst) || rt6_need_strict(src)) ··· 4236 4257 fl6.flowi6_oif, NULL, 4237 4258 strict); 4238 4259 4260 + if (check_mtu) { 4261 + mtu = ipv6_stub->ip6_mtu_from_fib6(f6i, dst, src); 4262 + if 
(params->tot_len > mtu) 4263 + return 0; 4264 + } 4265 + 4239 4266 if (f6i->fib6_nh.nh_lwtstate) 4240 4267 return 0; 4241 4268 ··· 4274 4289 #if IS_ENABLED(CONFIG_INET) 4275 4290 case AF_INET: 4276 4291 return bpf_ipv4_fib_lookup(dev_net(ctx->rxq->dev), params, 4277 - flags); 4292 + flags, true); 4278 4293 #endif 4279 4294 #if IS_ENABLED(CONFIG_IPV6) 4280 4295 case AF_INET6: 4281 4296 return bpf_ipv6_fib_lookup(dev_net(ctx->rxq->dev), params, 4282 - flags); 4297 + flags, true); 4283 4298 #endif 4284 4299 } 4285 4300 return 0; ··· 4298 4313 BPF_CALL_4(bpf_skb_fib_lookup, struct sk_buff *, skb, 4299 4314 struct bpf_fib_lookup *, params, int, plen, u32, flags) 4300 4315 { 4316 + struct net *net = dev_net(skb->dev); 4317 + int index = 0; 4318 + 4301 4319 if (plen < sizeof(*params)) 4302 4320 return -EINVAL; 4303 4321 4304 4322 switch (params->family) { 4305 4323 #if IS_ENABLED(CONFIG_INET) 4306 4324 case AF_INET: 4307 - return bpf_ipv4_fib_lookup(dev_net(skb->dev), params, flags); 4325 + index = bpf_ipv4_fib_lookup(net, params, flags, false); 4326 + break; 4308 4327 #endif 4309 4328 #if IS_ENABLED(CONFIG_IPV6) 4310 4329 case AF_INET6: 4311 - return bpf_ipv6_fib_lookup(dev_net(skb->dev), params, flags); 4330 + index = bpf_ipv6_fib_lookup(net, params, flags, false); 4331 + break; 4312 4332 #endif 4313 4333 } 4314 - return -ENOTSUPP; 4334 + 4335 + if (index > 0) { 4336 + struct net_device *dev; 4337 + 4338 + dev = dev_get_by_index_rcu(net, index); 4339 + if (!is_skb_forwardable(dev, skb)) 4340 + index = 0; 4341 + } 4342 + 4343 + return index; 4315 4344 } 4316 4345 4317 4346 static const struct bpf_func_proto bpf_skb_fib_lookup_proto = { ··· 4337 4338 .arg3_type = ARG_CONST_SIZE, 4338 4339 .arg4_type = ARG_ANYTHING, 4339 4340 }; 4341 + 4342 + #if IS_ENABLED(CONFIG_IPV6_SEG6_BPF) 4343 + static int bpf_push_seg6_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len) 4344 + { 4345 + int err; 4346 + struct ipv6_sr_hdr *srh = (struct ipv6_sr_hdr *)hdr; 4347 + 4348 + if 
(!seg6_validate_srh(srh, len)) 4349 + return -EINVAL; 4350 + 4351 + switch (type) { 4352 + case BPF_LWT_ENCAP_SEG6_INLINE: 4353 + if (skb->protocol != htons(ETH_P_IPV6)) 4354 + return -EBADMSG; 4355 + 4356 + err = seg6_do_srh_inline(skb, srh); 4357 + break; 4358 + case BPF_LWT_ENCAP_SEG6: 4359 + skb_reset_inner_headers(skb); 4360 + skb->encapsulation = 1; 4361 + err = seg6_do_srh_encap(skb, srh, IPPROTO_IPV6); 4362 + break; 4363 + default: 4364 + return -EINVAL; 4365 + } 4366 + 4367 + bpf_compute_data_pointers(skb); 4368 + if (err) 4369 + return err; 4370 + 4371 + ipv6_hdr(skb)->payload_len = htons(skb->len - sizeof(struct ipv6hdr)); 4372 + skb_set_transport_header(skb, sizeof(struct ipv6hdr)); 4373 + 4374 + return seg6_lookup_nexthop(skb, NULL, 0); 4375 + } 4376 + #endif /* CONFIG_IPV6_SEG6_BPF */ 4377 + 4378 + BPF_CALL_4(bpf_lwt_push_encap, struct sk_buff *, skb, u32, type, void *, hdr, 4379 + u32, len) 4380 + { 4381 + switch (type) { 4382 + #if IS_ENABLED(CONFIG_IPV6_SEG6_BPF) 4383 + case BPF_LWT_ENCAP_SEG6: 4384 + case BPF_LWT_ENCAP_SEG6_INLINE: 4385 + return bpf_push_seg6_encap(skb, type, hdr, len); 4386 + #endif 4387 + default: 4388 + return -EINVAL; 4389 + } 4390 + } 4391 + 4392 + static const struct bpf_func_proto bpf_lwt_push_encap_proto = { 4393 + .func = bpf_lwt_push_encap, 4394 + .gpl_only = false, 4395 + .ret_type = RET_INTEGER, 4396 + .arg1_type = ARG_PTR_TO_CTX, 4397 + .arg2_type = ARG_ANYTHING, 4398 + .arg3_type = ARG_PTR_TO_MEM, 4399 + .arg4_type = ARG_CONST_SIZE 4400 + }; 4401 + 4402 + BPF_CALL_4(bpf_lwt_seg6_store_bytes, struct sk_buff *, skb, u32, offset, 4403 + const void *, from, u32, len) 4404 + { 4405 + #if IS_ENABLED(CONFIG_IPV6_SEG6_BPF) 4406 + struct seg6_bpf_srh_state *srh_state = 4407 + this_cpu_ptr(&seg6_bpf_srh_states); 4408 + void *srh_tlvs, *srh_end, *ptr; 4409 + struct ipv6_sr_hdr *srh; 4410 + int srhoff = 0; 4411 + 4412 + if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0) 4413 + return -EINVAL; 4414 + 4415 + srh = 
(struct ipv6_sr_hdr *)(skb->data + srhoff); 4416 + srh_tlvs = (void *)((char *)srh + ((srh->first_segment + 1) << 4)); 4417 + srh_end = (void *)((char *)srh + sizeof(*srh) + srh_state->hdrlen); 4418 + 4419 + ptr = skb->data + offset; 4420 + if (ptr >= srh_tlvs && ptr + len <= srh_end) 4421 + srh_state->valid = 0; 4422 + else if (ptr < (void *)&srh->flags || 4423 + ptr + len > (void *)&srh->segments) 4424 + return -EFAULT; 4425 + 4426 + if (unlikely(bpf_try_make_writable(skb, offset + len))) 4427 + return -EFAULT; 4428 + 4429 + memcpy(skb->data + offset, from, len); 4430 + return 0; 4431 + #else /* CONFIG_IPV6_SEG6_BPF */ 4432 + return -EOPNOTSUPP; 4433 + #endif 4434 + } 4435 + 4436 + static const struct bpf_func_proto bpf_lwt_seg6_store_bytes_proto = { 4437 + .func = bpf_lwt_seg6_store_bytes, 4438 + .gpl_only = false, 4439 + .ret_type = RET_INTEGER, 4440 + .arg1_type = ARG_PTR_TO_CTX, 4441 + .arg2_type = ARG_ANYTHING, 4442 + .arg3_type = ARG_PTR_TO_MEM, 4443 + .arg4_type = ARG_CONST_SIZE 4444 + }; 4445 + 4446 + BPF_CALL_4(bpf_lwt_seg6_action, struct sk_buff *, skb, 4447 + u32, action, void *, param, u32, param_len) 4448 + { 4449 + #if IS_ENABLED(CONFIG_IPV6_SEG6_BPF) 4450 + struct seg6_bpf_srh_state *srh_state = 4451 + this_cpu_ptr(&seg6_bpf_srh_states); 4452 + struct ipv6_sr_hdr *srh; 4453 + int srhoff = 0; 4454 + int err; 4455 + 4456 + if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0) 4457 + return -EINVAL; 4458 + srh = (struct ipv6_sr_hdr *)(skb->data + srhoff); 4459 + 4460 + if (!srh_state->valid) { 4461 + if (unlikely((srh_state->hdrlen & 7) != 0)) 4462 + return -EBADMSG; 4463 + 4464 + srh->hdrlen = (u8)(srh_state->hdrlen >> 3); 4465 + if (unlikely(!seg6_validate_srh(srh, (srh->hdrlen + 1) << 3))) 4466 + return -EBADMSG; 4467 + 4468 + srh_state->valid = 1; 4469 + } 4470 + 4471 + switch (action) { 4472 + case SEG6_LOCAL_ACTION_END_X: 4473 + if (param_len != sizeof(struct in6_addr)) 4474 + return -EINVAL; 4475 + return seg6_lookup_nexthop(skb, 
(struct in6_addr *)param, 0); 4476 + case SEG6_LOCAL_ACTION_END_T: 4477 + if (param_len != sizeof(int)) 4478 + return -EINVAL; 4479 + return seg6_lookup_nexthop(skb, NULL, *(int *)param); 4480 + case SEG6_LOCAL_ACTION_END_B6: 4481 + err = bpf_push_seg6_encap(skb, BPF_LWT_ENCAP_SEG6_INLINE, 4482 + param, param_len); 4483 + if (!err) 4484 + srh_state->hdrlen = 4485 + ((struct ipv6_sr_hdr *)param)->hdrlen << 3; 4486 + return err; 4487 + case SEG6_LOCAL_ACTION_END_B6_ENCAP: 4488 + err = bpf_push_seg6_encap(skb, BPF_LWT_ENCAP_SEG6, 4489 + param, param_len); 4490 + if (!err) 4491 + srh_state->hdrlen = 4492 + ((struct ipv6_sr_hdr *)param)->hdrlen << 3; 4493 + return err; 4494 + default: 4495 + return -EINVAL; 4496 + } 4497 + #else /* CONFIG_IPV6_SEG6_BPF */ 4498 + return -EOPNOTSUPP; 4499 + #endif 4500 + } 4501 + 4502 + static const struct bpf_func_proto bpf_lwt_seg6_action_proto = { 4503 + .func = bpf_lwt_seg6_action, 4504 + .gpl_only = false, 4505 + .ret_type = RET_INTEGER, 4506 + .arg1_type = ARG_PTR_TO_CTX, 4507 + .arg2_type = ARG_ANYTHING, 4508 + .arg3_type = ARG_PTR_TO_MEM, 4509 + .arg4_type = ARG_CONST_SIZE 4510 + }; 4511 + 4512 + BPF_CALL_3(bpf_lwt_seg6_adjust_srh, struct sk_buff *, skb, u32, offset, 4513 + s32, len) 4514 + { 4515 + #if IS_ENABLED(CONFIG_IPV6_SEG6_BPF) 4516 + struct seg6_bpf_srh_state *srh_state = 4517 + this_cpu_ptr(&seg6_bpf_srh_states); 4518 + void *srh_end, *srh_tlvs, *ptr; 4519 + struct ipv6_sr_hdr *srh; 4520 + struct ipv6hdr *hdr; 4521 + int srhoff = 0; 4522 + int ret; 4523 + 4524 + if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0) 4525 + return -EINVAL; 4526 + srh = (struct ipv6_sr_hdr *)(skb->data + srhoff); 4527 + 4528 + srh_tlvs = (void *)((unsigned char *)srh + sizeof(*srh) + 4529 + ((srh->first_segment + 1) << 4)); 4530 + srh_end = (void *)((unsigned char *)srh + sizeof(*srh) + 4531 + srh_state->hdrlen); 4532 + ptr = skb->data + offset; 4533 + 4534 + if (unlikely(ptr < srh_tlvs || ptr > srh_end)) 4535 + return -EFAULT; 
4536 + if (unlikely(len < 0 && (void *)((char *)ptr - len) > srh_end)) 4537 + return -EFAULT; 4538 + 4539 + if (len > 0) { 4540 + ret = skb_cow_head(skb, len); 4541 + if (unlikely(ret < 0)) 4542 + return ret; 4543 + 4544 + ret = bpf_skb_net_hdr_push(skb, offset, len); 4545 + } else { 4546 + ret = bpf_skb_net_hdr_pop(skb, offset, -1 * len); 4547 + } 4548 + 4549 + bpf_compute_data_pointers(skb); 4550 + if (unlikely(ret < 0)) 4551 + return ret; 4552 + 4553 + hdr = (struct ipv6hdr *)skb->data; 4554 + hdr->payload_len = htons(skb->len - sizeof(struct ipv6hdr)); 4555 + 4556 + srh_state->hdrlen += len; 4557 + srh_state->valid = 0; 4558 + return 0; 4559 + #else /* CONFIG_IPV6_SEG6_BPF */ 4560 + return -EOPNOTSUPP; 4561 + #endif 4562 + } 4563 + 4564 + static const struct bpf_func_proto bpf_lwt_seg6_adjust_srh_proto = { 4565 + .func = bpf_lwt_seg6_adjust_srh, 4566 + .gpl_only = false, 4567 + .ret_type = RET_INTEGER, 4568 + .arg1_type = ARG_PTR_TO_CTX, 4569 + .arg2_type = ARG_ANYTHING, 4570 + .arg3_type = ARG_ANYTHING, 4571 + }; 4572 + 4573 + bool bpf_helper_changes_pkt_data(void *func) 4574 + { 4575 + if (func == bpf_skb_vlan_push || 4576 + func == bpf_skb_vlan_pop || 4577 + func == bpf_skb_store_bytes || 4578 + func == bpf_skb_change_proto || 4579 + func == bpf_skb_change_head || 4580 + func == bpf_skb_change_tail || 4581 + func == bpf_skb_adjust_room || 4582 + func == bpf_skb_pull_data || 4583 + func == bpf_clone_redirect || 4584 + func == bpf_l3_csum_replace || 4585 + func == bpf_l4_csum_replace || 4586 + func == bpf_xdp_adjust_head || 4587 + func == bpf_xdp_adjust_meta || 4588 + func == bpf_msg_pull_data || 4589 + func == bpf_xdp_adjust_tail || 4590 + func == bpf_lwt_push_encap || 4591 + func == bpf_lwt_seg6_store_bytes || 4592 + func == bpf_lwt_seg6_adjust_srh || 4593 + func == bpf_lwt_seg6_action 4594 + ) 4595 + return true; 4596 + 4597 + return false; 4598 + } 4340 4599 4341 4600 static const struct bpf_func_proto * 4342 4601 bpf_base_func_proto(enum bpf_func_id 
func_id) ··· 4780 4523 } 4781 4524 4782 4525 static const struct bpf_func_proto * 4783 - lwt_inout_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) 4784 - { 4785 - switch (func_id) { 4786 - case BPF_FUNC_skb_load_bytes: 4787 - return &bpf_skb_load_bytes_proto; 4788 - case BPF_FUNC_skb_pull_data: 4789 - return &bpf_skb_pull_data_proto; 4790 - case BPF_FUNC_csum_diff: 4791 - return &bpf_csum_diff_proto; 4792 - case BPF_FUNC_get_cgroup_classid: 4793 - return &bpf_get_cgroup_classid_proto; 4794 - case BPF_FUNC_get_route_realm: 4795 - return &bpf_get_route_realm_proto; 4796 - case BPF_FUNC_get_hash_recalc: 4797 - return &bpf_get_hash_recalc_proto; 4798 - case BPF_FUNC_perf_event_output: 4799 - return &bpf_skb_event_output_proto; 4800 - case BPF_FUNC_get_smp_processor_id: 4801 - return &bpf_get_smp_processor_id_proto; 4802 - case BPF_FUNC_skb_under_cgroup: 4803 - return &bpf_skb_under_cgroup_proto; 4804 - default: 4805 - return bpf_base_func_proto(func_id); 4806 - } 4807 - } 4808 - 4809 - static const struct bpf_func_proto * 4810 4526 sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) 4811 4527 { 4812 4528 switch (func_id) { ··· 4845 4615 } 4846 4616 4847 4617 static const struct bpf_func_proto * 4618 + lwt_out_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) 4619 + { 4620 + switch (func_id) { 4621 + case BPF_FUNC_skb_load_bytes: 4622 + return &bpf_skb_load_bytes_proto; 4623 + case BPF_FUNC_skb_pull_data: 4624 + return &bpf_skb_pull_data_proto; 4625 + case BPF_FUNC_csum_diff: 4626 + return &bpf_csum_diff_proto; 4627 + case BPF_FUNC_get_cgroup_classid: 4628 + return &bpf_get_cgroup_classid_proto; 4629 + case BPF_FUNC_get_route_realm: 4630 + return &bpf_get_route_realm_proto; 4631 + case BPF_FUNC_get_hash_recalc: 4632 + return &bpf_get_hash_recalc_proto; 4633 + case BPF_FUNC_perf_event_output: 4634 + return &bpf_skb_event_output_proto; 4635 + case BPF_FUNC_get_smp_processor_id: 4636 + return 
&bpf_get_smp_processor_id_proto; 4637 + case BPF_FUNC_skb_under_cgroup: 4638 + return &bpf_skb_under_cgroup_proto; 4639 + default: 4640 + return bpf_base_func_proto(func_id); 4641 + } 4642 + } 4643 + 4644 + static const struct bpf_func_proto * 4645 + lwt_in_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) 4646 + { 4647 + switch (func_id) { 4648 + case BPF_FUNC_lwt_push_encap: 4649 + return &bpf_lwt_push_encap_proto; 4650 + default: 4651 + return lwt_out_func_proto(func_id, prog); 4652 + } 4653 + } 4654 + 4655 + static const struct bpf_func_proto * 4848 4656 lwt_xmit_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) 4849 4657 { 4850 4658 switch (func_id) { ··· 4913 4645 case BPF_FUNC_set_hash_invalid: 4914 4646 return &bpf_set_hash_invalid_proto; 4915 4647 default: 4916 - return lwt_inout_func_proto(func_id, prog); 4648 + return lwt_out_func_proto(func_id, prog); 4649 + } 4650 + } 4651 + 4652 + static const struct bpf_func_proto * 4653 + lwt_seg6local_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) 4654 + { 4655 + switch (func_id) { 4656 + case BPF_FUNC_lwt_seg6_store_bytes: 4657 + return &bpf_lwt_seg6_store_bytes_proto; 4658 + case BPF_FUNC_lwt_seg6_action: 4659 + return &bpf_lwt_seg6_action_proto; 4660 + case BPF_FUNC_lwt_seg6_adjust_srh: 4661 + return &bpf_lwt_seg6_adjust_srh_proto; 4662 + default: 4663 + return lwt_out_func_proto(func_id, prog); 4917 4664 } 4918 4665 } 4919 4666 ··· 5035 4752 5036 4753 return bpf_skb_is_valid_access(off, size, type, prog, info); 5037 4754 } 5038 - 5039 4755 5040 4756 /* Attach type specific accesses */ 5041 4757 static bool __sock_filter_check_attach_type(int off, ··· 5437 5155 switch (off) { 5438 5156 case offsetof(struct sk_msg_md, data): 5439 5157 info->reg_type = PTR_TO_PACKET; 5158 + if (size != sizeof(__u64)) 5159 + return false; 5440 5160 break; 5441 5161 case offsetof(struct sk_msg_md, data_end): 5442 5162 info->reg_type = PTR_TO_PACKET_END; 5163 + if (size != 
sizeof(__u64)) 5164 + return false; 5443 5165 break; 5166 + default: 5167 + if (size != sizeof(__u32)) 5168 + return false; 5444 5169 } 5445 5170 5446 5171 if (off < 0 || off >= sizeof(struct sk_msg_md)) 5447 5172 return false; 5448 5173 if (off % size != 0) 5449 - return false; 5450 - if (size != sizeof(__u64)) 5451 5174 return false; 5452 5175 5453 5176 return true; ··· 6129 5842 break; 6130 5843 6131 5844 case offsetof(struct bpf_sock_ops, local_ip4): 6132 - BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_rcv_saddr) != 4); 5845 + BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, 5846 + skc_rcv_saddr) != 4); 6133 5847 6134 5848 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( 6135 5849 struct bpf_sock_ops_kern, sk), ··· 6447 6159 struct bpf_prog *prog, u32 *target_size) 6448 6160 { 6449 6161 struct bpf_insn *insn = insn_buf; 6162 + int off; 6450 6163 6451 6164 switch (si->off) { 6452 6165 case offsetof(struct sk_msg_md, data): ··· 6459 6170 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg_buff, data_end), 6460 6171 si->dst_reg, si->src_reg, 6461 6172 offsetof(struct sk_msg_buff, data_end)); 6173 + break; 6174 + case offsetof(struct sk_msg_md, family): 6175 + BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_family) != 2); 6176 + 6177 + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( 6178 + struct sk_msg_buff, sk), 6179 + si->dst_reg, si->src_reg, 6180 + offsetof(struct sk_msg_buff, sk)); 6181 + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, 6182 + offsetof(struct sock_common, skc_family)); 6183 + break; 6184 + 6185 + case offsetof(struct sk_msg_md, remote_ip4): 6186 + BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_daddr) != 4); 6187 + 6188 + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( 6189 + struct sk_msg_buff, sk), 6190 + si->dst_reg, si->src_reg, 6191 + offsetof(struct sk_msg_buff, sk)); 6192 + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, 6193 + offsetof(struct sock_common, skc_daddr)); 6194 + break; 6195 + 6196 + case offsetof(struct sk_msg_md, local_ip4): 
6197 + BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, 6198 + skc_rcv_saddr) != 4); 6199 + 6200 + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( 6201 + struct sk_msg_buff, sk), 6202 + si->dst_reg, si->src_reg, 6203 + offsetof(struct sk_msg_buff, sk)); 6204 + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, 6205 + offsetof(struct sock_common, 6206 + skc_rcv_saddr)); 6207 + break; 6208 + 6209 + case offsetof(struct sk_msg_md, remote_ip6[0]) ... 6210 + offsetof(struct sk_msg_md, remote_ip6[3]): 6211 + #if IS_ENABLED(CONFIG_IPV6) 6212 + BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, 6213 + skc_v6_daddr.s6_addr32[0]) != 4); 6214 + 6215 + off = si->off; 6216 + off -= offsetof(struct sk_msg_md, remote_ip6[0]); 6217 + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( 6218 + struct sk_msg_buff, sk), 6219 + si->dst_reg, si->src_reg, 6220 + offsetof(struct sk_msg_buff, sk)); 6221 + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, 6222 + offsetof(struct sock_common, 6223 + skc_v6_daddr.s6_addr32[0]) + 6224 + off); 6225 + #else 6226 + *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); 6227 + #endif 6228 + break; 6229 + 6230 + case offsetof(struct sk_msg_md, local_ip6[0]) ... 
6231 + offsetof(struct sk_msg_md, local_ip6[3]): 6232 + #if IS_ENABLED(CONFIG_IPV6) 6233 + BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, 6234 + skc_v6_rcv_saddr.s6_addr32[0]) != 4); 6235 + 6236 + off = si->off; 6237 + off -= offsetof(struct sk_msg_md, local_ip6[0]); 6238 + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( 6239 + struct sk_msg_buff, sk), 6240 + si->dst_reg, si->src_reg, 6241 + offsetof(struct sk_msg_buff, sk)); 6242 + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, 6243 + offsetof(struct sock_common, 6244 + skc_v6_rcv_saddr.s6_addr32[0]) + 6245 + off); 6246 + #else 6247 + *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); 6248 + #endif 6249 + break; 6250 + 6251 + case offsetof(struct sk_msg_md, remote_port): 6252 + BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_dport) != 2); 6253 + 6254 + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( 6255 + struct sk_msg_buff, sk), 6256 + si->dst_reg, si->src_reg, 6257 + offsetof(struct sk_msg_buff, sk)); 6258 + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, 6259 + offsetof(struct sock_common, skc_dport)); 6260 + #ifndef __BIG_ENDIAN_BITFIELD 6261 + *insn++ = BPF_ALU32_IMM(BPF_LSH, si->dst_reg, 16); 6262 + #endif 6263 + break; 6264 + 6265 + case offsetof(struct sk_msg_md, local_port): 6266 + BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_num) != 2); 6267 + 6268 + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( 6269 + struct sk_msg_buff, sk), 6270 + si->dst_reg, si->src_reg, 6271 + offsetof(struct sk_msg_buff, sk)); 6272 + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, 6273 + offsetof(struct sock_common, skc_num)); 6462 6274 break; 6463 6275 } 6464 6276 ··· 6609 6219 .test_run = bpf_prog_test_run_skb, 6610 6220 }; 6611 6221 6612 - const struct bpf_verifier_ops lwt_inout_verifier_ops = { 6613 - .get_func_proto = lwt_inout_func_proto, 6222 + const struct bpf_verifier_ops lwt_in_verifier_ops = { 6223 + .get_func_proto = lwt_in_func_proto, 6614 6224 .is_valid_access = lwt_is_valid_access, 6615 6225 .convert_ctx_access = 
bpf_convert_ctx_access, 6616 6226 }; 6617 6227 6618 - const struct bpf_prog_ops lwt_inout_prog_ops = { 6228 + const struct bpf_prog_ops lwt_in_prog_ops = { 6229 + .test_run = bpf_prog_test_run_skb, 6230 + }; 6231 + 6232 + const struct bpf_verifier_ops lwt_out_verifier_ops = { 6233 + .get_func_proto = lwt_out_func_proto, 6234 + .is_valid_access = lwt_is_valid_access, 6235 + .convert_ctx_access = bpf_convert_ctx_access, 6236 + }; 6237 + 6238 + const struct bpf_prog_ops lwt_out_prog_ops = { 6619 6239 .test_run = bpf_prog_test_run_skb, 6620 6240 }; 6621 6241 ··· 6637 6237 }; 6638 6238 6639 6239 const struct bpf_prog_ops lwt_xmit_prog_ops = { 6240 + .test_run = bpf_prog_test_run_skb, 6241 + }; 6242 + 6243 + const struct bpf_verifier_ops lwt_seg6local_verifier_ops = { 6244 + .get_func_proto = lwt_seg6local_func_proto, 6245 + .is_valid_access = lwt_is_valid_access, 6246 + .convert_ctx_access = bpf_convert_ctx_access, 6247 + }; 6248 + 6249 + const struct bpf_prog_ops lwt_seg6local_prog_ops = { 6640 6250 .test_run = bpf_prog_test_run_skb, 6641 6251 }; 6642 6252
+16 -4
net/core/xdp.c
··· 308 308 } 309 309 EXPORT_SYMBOL_GPL(xdp_rxq_info_reg_mem_model); 310 310 311 - static void xdp_return(void *data, struct xdp_mem_info *mem) 311 + /* XDP RX runs under NAPI protection, and in different delivery error 312 + * scenarios (e.g. queue full), it is possible to return the xdp_frame 313 + * while still leveraging this protection. The @napi_direct boolian 314 + * is used for those calls sites. Thus, allowing for faster recycling 315 + * of xdp_frames/pages in those cases. 316 + */ 317 + static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct) 312 318 { 313 319 struct xdp_mem_allocator *xa; 314 320 struct page *page; ··· 326 320 xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params); 327 321 page = virt_to_head_page(data); 328 322 if (xa) 329 - page_pool_put_page(xa->page_pool, page); 323 + page_pool_put_page(xa->page_pool, page, napi_direct); 330 324 else 331 325 put_page(page); 332 326 rcu_read_unlock(); ··· 346 340 347 341 void xdp_return_frame(struct xdp_frame *xdpf) 348 342 { 349 - xdp_return(xdpf->data, &xdpf->mem); 343 + __xdp_return(xdpf->data, &xdpf->mem, false); 350 344 } 351 345 EXPORT_SYMBOL_GPL(xdp_return_frame); 352 346 347 + void xdp_return_frame_rx_napi(struct xdp_frame *xdpf) 348 + { 349 + __xdp_return(xdpf->data, &xdpf->mem, true); 350 + } 351 + EXPORT_SYMBOL_GPL(xdp_return_frame_rx_napi); 352 + 353 353 void xdp_return_buff(struct xdp_buff *xdp) 354 354 { 355 - xdp_return(xdp->data, &xdp->rxq->mem); 355 + __xdp_return(xdp->data, &xdp->rxq->mem, true); 356 356 } 357 357 EXPORT_SYMBOL_GPL(xdp_return_buff);
+31
net/ipv4/route.c
··· 1352 1352 return NULL; 1353 1353 } 1354 1354 1355 + /* MTU selection: 1356 + * 1. mtu on route is locked - use it 1357 + * 2. mtu from nexthop exception 1358 + * 3. mtu from egress device 1359 + */ 1360 + 1361 + u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr) 1362 + { 1363 + struct fib_info *fi = res->fi; 1364 + struct fib_nh *nh = &fi->fib_nh[res->nh_sel]; 1365 + struct net_device *dev = nh->nh_dev; 1366 + u32 mtu = 0; 1367 + 1368 + if (dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu || 1369 + fi->fib_metrics->metrics[RTAX_LOCK - 1] & (1 << RTAX_MTU)) 1370 + mtu = fi->fib_mtu; 1371 + 1372 + if (likely(!mtu)) { 1373 + struct fib_nh_exception *fnhe; 1374 + 1375 + fnhe = find_exception(nh, daddr); 1376 + if (fnhe && !time_after_eq(jiffies, fnhe->fnhe_expires)) 1377 + mtu = fnhe->fnhe_pmtu; 1378 + } 1379 + 1380 + if (likely(!mtu)) 1381 + mtu = min(READ_ONCE(dev->mtu), IP_MAX_MTU); 1382 + 1383 + return mtu - lwtunnel_headroom(nh->nh_lwtstate, mtu); 1384 + } 1385 + 1355 1386 static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe, 1356 1387 __be32 daddr, const bool do_cache) 1357 1388 {
+5
net/ipv6/Kconfig
··· 329 329 330 330 If unsure, say N. 331 331 332 + config IPV6_SEG6_BPF 333 + def_bool y 334 + depends on IPV6_SEG6_LWTUNNEL 335 + depends on IPV6 = y 336 + 332 337 endif # IPV6
+8
net/ipv6/addrconf_core.c
··· 161 161 return f6i; 162 162 } 163 163 164 + static u32 165 + eafnosupport_ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr, 166 + struct in6_addr *saddr) 167 + { 168 + return 0; 169 + } 170 + 164 171 const struct ipv6_stub *ipv6_stub __read_mostly = &(struct ipv6_stub) { 165 172 .ipv6_dst_lookup = eafnosupport_ipv6_dst_lookup, 166 173 .fib6_get_table = eafnosupport_fib6_get_table, 167 174 .fib6_table_lookup = eafnosupport_fib6_table_lookup, 168 175 .fib6_lookup = eafnosupport_fib6_lookup, 169 176 .fib6_multipath_select = eafnosupport_fib6_multipath_select, 177 + .ip6_mtu_from_fib6 = eafnosupport_ip6_mtu_from_fib6, 170 178 }; 171 179 EXPORT_SYMBOL_GPL(ipv6_stub); 172 180
+1
net/ipv6/af_inet6.c
··· 894 894 .fib6_table_lookup = fib6_table_lookup, 895 895 .fib6_lookup = fib6_lookup, 896 896 .fib6_multipath_select = fib6_multipath_select, 897 + .ip6_mtu_from_fib6 = ip6_mtu_from_fib6, 897 898 .udpv6_encap_enable = udpv6_encap_enable, 898 899 .ndisc_send_na = ndisc_send_na, 899 900 .nd_tbl = &nd_tbl,
+48
net/ipv6/route.c
··· 2604 2604 return mtu - lwtunnel_headroom(dst->lwtstate, mtu); 2605 2605 } 2606 2606 2607 + /* MTU selection: 2608 + * 1. mtu on route is locked - use it 2609 + * 2. mtu from nexthop exception 2610 + * 3. mtu from egress device 2611 + * 2612 + * based on ip6_dst_mtu_forward and exception logic of 2613 + * rt6_find_cached_rt; called with rcu_read_lock 2614 + */ 2615 + u32 ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr, 2616 + struct in6_addr *saddr) 2617 + { 2618 + struct rt6_exception_bucket *bucket; 2619 + struct rt6_exception *rt6_ex; 2620 + struct in6_addr *src_key; 2621 + struct inet6_dev *idev; 2622 + u32 mtu = 0; 2623 + 2624 + if (unlikely(fib6_metric_locked(f6i, RTAX_MTU))) { 2625 + mtu = f6i->fib6_pmtu; 2626 + if (mtu) 2627 + goto out; 2628 + } 2629 + 2630 + src_key = NULL; 2631 + #ifdef CONFIG_IPV6_SUBTREES 2632 + if (f6i->fib6_src.plen) 2633 + src_key = saddr; 2634 + #endif 2635 + 2636 + bucket = rcu_dereference(f6i->rt6i_exception_bucket); 2637 + rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key); 2638 + if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i)) 2639 + mtu = dst_metric_raw(&rt6_ex->rt6i->dst, RTAX_MTU); 2640 + 2641 + if (likely(!mtu)) { 2642 + struct net_device *dev = fib6_info_nh_dev(f6i); 2643 + 2644 + mtu = IPV6_MIN_MTU; 2645 + idev = __in6_dev_get(dev); 2646 + if (idev && idev->cnf.mtu6 > mtu) 2647 + mtu = idev->cnf.mtu6; 2648 + } 2649 + 2650 + mtu = min_t(unsigned int, mtu, IP6_MAX_MTU); 2651 + out: 2652 + return mtu - lwtunnel_headroom(fib6_info_nh_lwt(f6i), mtu); 2653 + } 2654 + 2607 2655 struct dst_entry *icmp6_dst_alloc(struct net_device *dev, 2608 2656 struct flowi6 *fl6) 2609 2657 {
+179 -11
net/ipv6/seg6_local.c
··· 1 1 /* 2 2 * SR-IPv6 implementation 3 3 * 4 - * Author: 4 + * Authors: 5 5 * David Lebrun <david.lebrun@uclouvain.be> 6 + * eBPF support: Mathieu Xhonneux <m.xhonneux@gmail.com> 6 7 * 7 8 * 8 9 * This program is free software; you can redistribute it and/or ··· 31 30 #ifdef CONFIG_IPV6_SEG6_HMAC 32 31 #include <net/seg6_hmac.h> 33 32 #endif 33 + #include <net/seg6_local.h> 34 34 #include <linux/etherdevice.h> 35 + #include <linux/bpf.h> 35 36 36 37 struct seg6_local_lwt; 37 38 ··· 44 41 int static_headroom; 45 42 }; 46 43 44 + struct bpf_lwt_prog { 45 + struct bpf_prog *prog; 46 + char *name; 47 + }; 48 + 47 49 struct seg6_local_lwt { 48 50 int action; 49 51 struct ipv6_sr_hdr *srh; ··· 57 49 struct in6_addr nh6; 58 50 int iif; 59 51 int oif; 52 + struct bpf_lwt_prog bpf; 60 53 61 54 int headroom; 62 55 struct seg6_action_desc *desc; ··· 149 140 *daddr = *addr; 150 141 } 151 142 152 - static void lookup_nexthop(struct sk_buff *skb, struct in6_addr *nhaddr, 153 - u32 tbl_id) 143 + int seg6_lookup_nexthop(struct sk_buff *skb, struct in6_addr *nhaddr, 144 + u32 tbl_id) 154 145 { 155 146 struct net *net = dev_net(skb->dev); 156 147 struct ipv6hdr *hdr = ipv6_hdr(skb); ··· 196 187 197 188 skb_dst_drop(skb); 198 189 skb_dst_set(skb, dst); 190 + return dst->error; 199 191 } 200 192 201 193 /* regular endpoint function */ ··· 210 200 211 201 advance_nextseg(srh, &ipv6_hdr(skb)->daddr); 212 202 213 - lookup_nexthop(skb, NULL, 0); 203 + seg6_lookup_nexthop(skb, NULL, 0); 214 204 215 205 return dst_input(skb); 216 206 ··· 230 220 231 221 advance_nextseg(srh, &ipv6_hdr(skb)->daddr); 232 222 233 - lookup_nexthop(skb, &slwt->nh6, 0); 223 + seg6_lookup_nexthop(skb, &slwt->nh6, 0); 234 224 235 225 return dst_input(skb); 236 226 ··· 249 239 250 240 advance_nextseg(srh, &ipv6_hdr(skb)->daddr); 251 241 252 - lookup_nexthop(skb, NULL, slwt->table); 242 + seg6_lookup_nexthop(skb, NULL, slwt->table); 253 243 254 244 return dst_input(skb); 255 245 ··· 341 331 if 
(!ipv6_addr_any(&slwt->nh6)) 342 332 nhaddr = &slwt->nh6; 343 333 344 - lookup_nexthop(skb, nhaddr, 0); 334 + seg6_lookup_nexthop(skb, nhaddr, 0); 345 335 346 336 return dst_input(skb); 347 337 drop: ··· 390 380 if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) 391 381 goto drop; 392 382 393 - lookup_nexthop(skb, NULL, slwt->table); 383 + seg6_lookup_nexthop(skb, NULL, slwt->table); 394 384 395 385 return dst_input(skb); 396 386 ··· 416 406 ipv6_hdr(skb)->payload_len = htons(skb->len - sizeof(struct ipv6hdr)); 417 407 skb_set_transport_header(skb, sizeof(struct ipv6hdr)); 418 408 419 - lookup_nexthop(skb, NULL, 0); 409 + seg6_lookup_nexthop(skb, NULL, 0); 420 410 421 411 return dst_input(skb); 422 412 ··· 448 438 ipv6_hdr(skb)->payload_len = htons(skb->len - sizeof(struct ipv6hdr)); 449 439 skb_set_transport_header(skb, sizeof(struct ipv6hdr)); 450 440 451 - lookup_nexthop(skb, NULL, 0); 441 + seg6_lookup_nexthop(skb, NULL, 0); 452 442 453 443 return dst_input(skb); 454 444 455 445 drop: 456 446 kfree_skb(skb); 457 447 return err; 448 + } 449 + 450 + DEFINE_PER_CPU(struct seg6_bpf_srh_state, seg6_bpf_srh_states); 451 + 452 + static int input_action_end_bpf(struct sk_buff *skb, 453 + struct seg6_local_lwt *slwt) 454 + { 455 + struct seg6_bpf_srh_state *srh_state = 456 + this_cpu_ptr(&seg6_bpf_srh_states); 457 + struct seg6_bpf_srh_state local_srh_state; 458 + struct ipv6_sr_hdr *srh; 459 + int srhoff = 0; 460 + int ret; 461 + 462 + srh = get_and_validate_srh(skb); 463 + if (!srh) 464 + goto drop; 465 + advance_nextseg(srh, &ipv6_hdr(skb)->daddr); 466 + 467 + /* preempt_disable is needed to protect the per-CPU buffer srh_state, 468 + * which is also accessed by the bpf_lwt_seg6_* helpers 469 + */ 470 + preempt_disable(); 471 + srh_state->hdrlen = srh->hdrlen << 3; 472 + srh_state->valid = 1; 473 + 474 + rcu_read_lock(); 475 + bpf_compute_data_pointers(skb); 476 + ret = bpf_prog_run_save_cb(slwt->bpf.prog, skb); 477 + rcu_read_unlock(); 478 + 479 + local_srh_state = 
*srh_state; 480 + preempt_enable(); 481 + 482 + switch (ret) { 483 + case BPF_OK: 484 + case BPF_REDIRECT: 485 + break; 486 + case BPF_DROP: 487 + goto drop; 488 + default: 489 + pr_warn_once("bpf-seg6local: Illegal return value %u\n", ret); 490 + goto drop; 491 + } 492 + 493 + if (unlikely((local_srh_state.hdrlen & 7) != 0)) 494 + goto drop; 495 + 496 + if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0) 497 + goto drop; 498 + srh = (struct ipv6_sr_hdr *)(skb->data + srhoff); 499 + srh->hdrlen = (u8)(local_srh_state.hdrlen >> 3); 500 + 501 + if (!local_srh_state.valid && 502 + unlikely(!seg6_validate_srh(srh, (srh->hdrlen + 1) << 3))) 503 + goto drop; 504 + 505 + if (ret != BPF_REDIRECT) 506 + seg6_lookup_nexthop(skb, NULL, 0); 507 + 508 + return dst_input(skb); 509 + 510 + drop: 511 + kfree_skb(skb); 512 + return -EINVAL; 458 513 } 459 514 460 515 static struct seg6_action_desc seg6_action_table[] = { ··· 568 493 .attrs = (1 << SEG6_LOCAL_SRH), 569 494 .input = input_action_end_b6_encap, 570 495 .static_headroom = sizeof(struct ipv6hdr), 571 - } 496 + }, 497 + { 498 + .action = SEG6_LOCAL_ACTION_END_BPF, 499 + .attrs = (1 << SEG6_LOCAL_BPF), 500 + .input = input_action_end_bpf, 501 + }, 502 + 572 503 }; 573 504 574 505 static struct seg6_action_desc *__get_action_desc(int action) ··· 619 538 .len = sizeof(struct in6_addr) }, 620 539 [SEG6_LOCAL_IIF] = { .type = NLA_U32 }, 621 540 [SEG6_LOCAL_OIF] = { .type = NLA_U32 }, 541 + [SEG6_LOCAL_BPF] = { .type = NLA_NESTED }, 622 542 }; 623 543 624 544 static int parse_nla_srh(struct nlattr **attrs, struct seg6_local_lwt *slwt) ··· 797 715 return 0; 798 716 } 799 717 718 + #define MAX_PROG_NAME 256 719 + static const struct nla_policy bpf_prog_policy[SEG6_LOCAL_BPF_PROG_MAX + 1] = { 720 + [SEG6_LOCAL_BPF_PROG] = { .type = NLA_U32, }, 721 + [SEG6_LOCAL_BPF_PROG_NAME] = { .type = NLA_NUL_STRING, 722 + .len = MAX_PROG_NAME }, 723 + }; 724 + 725 + static int parse_nla_bpf(struct nlattr **attrs, struct 
seg6_local_lwt *slwt) 726 + { 727 + struct nlattr *tb[SEG6_LOCAL_BPF_PROG_MAX + 1]; 728 + struct bpf_prog *p; 729 + int ret; 730 + u32 fd; 731 + 732 + ret = nla_parse_nested(tb, SEG6_LOCAL_BPF_PROG_MAX, 733 + attrs[SEG6_LOCAL_BPF], bpf_prog_policy, NULL); 734 + if (ret < 0) 735 + return ret; 736 + 737 + if (!tb[SEG6_LOCAL_BPF_PROG] || !tb[SEG6_LOCAL_BPF_PROG_NAME]) 738 + return -EINVAL; 739 + 740 + slwt->bpf.name = nla_memdup(tb[SEG6_LOCAL_BPF_PROG_NAME], GFP_KERNEL); 741 + if (!slwt->bpf.name) 742 + return -ENOMEM; 743 + 744 + fd = nla_get_u32(tb[SEG6_LOCAL_BPF_PROG]); 745 + p = bpf_prog_get_type(fd, BPF_PROG_TYPE_LWT_SEG6LOCAL); 746 + if (IS_ERR(p)) { 747 + kfree(slwt->bpf.name); 748 + return PTR_ERR(p); 749 + } 750 + 751 + slwt->bpf.prog = p; 752 + return 0; 753 + } 754 + 755 + static int put_nla_bpf(struct sk_buff *skb, struct seg6_local_lwt *slwt) 756 + { 757 + struct nlattr *nest; 758 + 759 + if (!slwt->bpf.prog) 760 + return 0; 761 + 762 + nest = nla_nest_start(skb, SEG6_LOCAL_BPF); 763 + if (!nest) 764 + return -EMSGSIZE; 765 + 766 + if (nla_put_u32(skb, SEG6_LOCAL_BPF_PROG, slwt->bpf.prog->aux->id)) 767 + return -EMSGSIZE; 768 + 769 + if (slwt->bpf.name && 770 + nla_put_string(skb, SEG6_LOCAL_BPF_PROG_NAME, slwt->bpf.name)) 771 + return -EMSGSIZE; 772 + 773 + return nla_nest_end(skb, nest); 774 + } 775 + 776 + static int cmp_nla_bpf(struct seg6_local_lwt *a, struct seg6_local_lwt *b) 777 + { 778 + if (!a->bpf.name && !b->bpf.name) 779 + return 0; 780 + 781 + if (!a->bpf.name || !b->bpf.name) 782 + return 1; 783 + 784 + return strcmp(a->bpf.name, b->bpf.name); 785 + } 786 + 800 787 struct seg6_action_param { 801 788 int (*parse)(struct nlattr **attrs, struct seg6_local_lwt *slwt); 802 789 int (*put)(struct sk_buff *skb, struct seg6_local_lwt *slwt); ··· 896 745 [SEG6_LOCAL_OIF] = { .parse = parse_nla_oif, 897 746 .put = put_nla_oif, 898 747 .cmp = cmp_nla_oif }, 748 + 749 + [SEG6_LOCAL_BPF] = { .parse = parse_nla_bpf, 750 + .put = put_nla_bpf, 751 + .cmp = 
cmp_nla_bpf }, 752 + 899 753 }; 900 754 901 755 static int parse_nla_action(struct nlattr **attrs, struct seg6_local_lwt *slwt) ··· 986 830 struct seg6_local_lwt *slwt = seg6_local_lwtunnel(lwt); 987 831 988 832 kfree(slwt->srh); 833 + 834 + if (slwt->desc->attrs & (1 << SEG6_LOCAL_BPF)) { 835 + kfree(slwt->bpf.name); 836 + bpf_prog_put(slwt->bpf.prog); 837 + } 838 + 839 + return; 989 840 } 990 841 991 842 static int seg6_local_fill_encap(struct sk_buff *skb, ··· 1044 881 1045 882 if (attrs & (1 << SEG6_LOCAL_OIF)) 1046 883 nlsize += nla_total_size(4); 884 + 885 + if (attrs & (1 << SEG6_LOCAL_BPF)) 886 + nlsize += nla_total_size(sizeof(struct nlattr)) + 887 + nla_total_size(MAX_PROG_NAME) + 888 + nla_total_size(4); 1047 889 1048 890 return nlsize; 1049 891 }
-1
net/xdp/Makefile
··· 1 1 obj-$(CONFIG_XDP_SOCKETS) += xsk.o xdp_umem.o xsk_queue.o 2 -
+42 -54
net/xdp/xdp_umem.c
··· 1 1 // SPDX-License-Identifier: GPL-2.0 2 2 /* XDP user-space packet buffer 3 3 * Copyright(c) 2018 Intel Corporation. 4 - * 5 - * This program is free software; you can redistribute it and/or modify it 6 - * under the terms and conditions of the GNU General Public License, 7 - * version 2, as published by the Free Software Foundation. 8 - * 9 - * This program is distributed in the hope it will be useful, but WITHOUT 10 - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for 12 - * more details. 13 4 */ 14 5 15 6 #include <linux/init.h> ··· 16 25 17 26 #define XDP_UMEM_MIN_FRAME_SIZE 2048 18 27 19 - int xdp_umem_create(struct xdp_umem **umem) 20 - { 21 - *umem = kzalloc(sizeof(**umem), GFP_KERNEL); 22 - 23 - if (!(*umem)) 24 - return -ENOMEM; 25 - 26 - return 0; 27 - } 28 - 29 28 static void xdp_umem_unpin_pages(struct xdp_umem *umem) 30 29 { 31 30 unsigned int i; 32 31 33 - if (umem->pgs) { 34 - for (i = 0; i < umem->npgs; i++) { 35 - struct page *page = umem->pgs[i]; 32 + for (i = 0; i < umem->npgs; i++) { 33 + struct page *page = umem->pgs[i]; 36 34 37 - set_page_dirty_lock(page); 38 - put_page(page); 39 - } 40 - 41 - kfree(umem->pgs); 42 - umem->pgs = NULL; 35 + set_page_dirty_lock(page); 36 + put_page(page); 43 37 } 38 + 39 + kfree(umem->pgs); 40 + umem->pgs = NULL; 44 41 } 45 42 46 43 static void xdp_umem_unaccount_pages(struct xdp_umem *umem) 47 44 { 48 - if (umem->user) { 49 - atomic_long_sub(umem->npgs, &umem->user->locked_vm); 50 - free_uid(umem->user); 51 - } 45 + atomic_long_sub(umem->npgs, &umem->user->locked_vm); 46 + free_uid(umem->user); 52 47 } 53 48 54 49 static void xdp_umem_release(struct xdp_umem *umem) ··· 52 75 umem->cq = NULL; 53 76 } 54 77 55 - if (umem->pgs) { 56 - xdp_umem_unpin_pages(umem); 78 + xdp_umem_unpin_pages(umem); 57 79 58 - task = get_pid_task(umem->pid, PIDTYPE_PID); 59 - put_pid(umem->pid); 60 - if (!task) 61 - goto out; 62 - 
mm = get_task_mm(task); 63 - put_task_struct(task); 64 - if (!mm) 65 - goto out; 80 + task = get_pid_task(umem->pid, PIDTYPE_PID); 81 + put_pid(umem->pid); 82 + if (!task) 83 + goto out; 84 + mm = get_task_mm(task); 85 + put_task_struct(task); 86 + if (!mm) 87 + goto out; 66 88 67 - mmput(mm); 68 - umem->pgs = NULL; 69 - } 70 - 89 + mmput(mm); 71 90 xdp_umem_unaccount_pages(umem); 72 91 out: 73 92 kfree(umem); ··· 78 105 79 106 void xdp_get_umem(struct xdp_umem *umem) 80 107 { 81 - atomic_inc(&umem->users); 108 + refcount_inc(&umem->users); 82 109 } 83 110 84 111 void xdp_put_umem(struct xdp_umem *umem) ··· 86 113 if (!umem) 87 114 return; 88 115 89 - if (atomic_dec_and_test(&umem->users)) { 116 + if (refcount_dec_and_test(&umem->users)) { 90 117 INIT_WORK(&umem->work, xdp_umem_release_deferred); 91 118 schedule_work(&umem->work); 92 119 } ··· 149 176 return 0; 150 177 } 151 178 152 - int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) 179 + static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) 153 180 { 154 181 u32 frame_size = mr->frame_size, frame_headroom = mr->frame_headroom; 155 182 u64 addr = mr->addr, size = mr->len; 156 183 unsigned int nframes, nfpp; 157 184 int size_chk, err; 158 - 159 - if (!umem) 160 - return -EINVAL; 161 185 162 186 if (frame_size < XDP_UMEM_MIN_FRAME_SIZE || frame_size > PAGE_SIZE) { 163 187 /* Strictly speaking we could support this, if: ··· 206 236 umem->frame_size_log2 = ilog2(frame_size); 207 237 umem->nfpp_mask = nfpp - 1; 208 238 umem->nfpplog2 = ilog2(nfpp); 209 - atomic_set(&umem->users, 1); 239 + refcount_set(&umem->users, 1); 210 240 211 241 err = xdp_umem_account_pages(umem); 212 242 if (err) ··· 224 254 return err; 225 255 } 226 256 257 + struct xdp_umem *xdp_umem_create(struct xdp_umem_reg *mr) 258 + { 259 + struct xdp_umem *umem; 260 + int err; 261 + 262 + umem = kzalloc(sizeof(*umem), GFP_KERNEL); 263 + if (!umem) 264 + return ERR_PTR(-ENOMEM); 265 + 266 + err = xdp_umem_reg(umem, mr); 
267 + if (err) { 268 + kfree(umem); 269 + return ERR_PTR(err); 270 + } 271 + 272 + return umem; 273 + } 274 + 227 275 bool xdp_umem_validate_queues(struct xdp_umem *umem) 228 276 { 229 - return (umem->fq && umem->cq); 277 + return umem->fq && umem->cq; 230 278 }
+4 -14
net/xdp/xdp_umem.h
··· 1 - /* SPDX-License-Identifier: GPL-2.0 2 - * XDP user-space packet buffer 1 + /* SPDX-License-Identifier: GPL-2.0 */ 2 + /* XDP user-space packet buffer 3 3 * Copyright(c) 2018 Intel Corporation. 4 - * 5 - * This program is free software; you can redistribute it and/or modify it 6 - * under the terms and conditions of the GNU General Public License, 7 - * version 2, as published by the Free Software Foundation. 8 - * 9 - * This program is distributed in the hope it will be useful, but WITHOUT 10 - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for 12 - * more details. 13 4 */ 14 5 15 6 #ifndef XDP_UMEM_H_ ··· 27 36 struct pid *pid; 28 37 unsigned long address; 29 38 size_t size; 30 - atomic_t users; 39 + refcount_t users; 31 40 struct work_struct work; 32 41 }; 33 42 ··· 50 59 } 51 60 52 61 bool xdp_umem_validate_queues(struct xdp_umem *umem); 53 - int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr); 54 62 void xdp_get_umem(struct xdp_umem *umem); 55 63 void xdp_put_umem(struct xdp_umem *umem); 56 - int xdp_umem_create(struct xdp_umem **umem); 64 + struct xdp_umem *xdp_umem_create(struct xdp_umem_reg *mr); 57 65 58 66 #endif /* XDP_UMEM_H_ */
+2 -11
net/xdp/xdp_umem_props.h
··· 1 - /* SPDX-License-Identifier: GPL-2.0 2 - * XDP user-space packet buffer 1 + /* SPDX-License-Identifier: GPL-2.0 */ 2 + /* XDP user-space packet buffer 3 3 * Copyright(c) 2018 Intel Corporation. 4 - * 5 - * This program is free software; you can redistribute it and/or modify it 6 - * under the terms and conditions of the GNU General Public License, 7 - * version 2, as published by the Free Software Foundation. 8 - * 9 - * This program is distributed in the hope it will be useful, but WITHOUT 10 - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for 12 - * more details. 13 4 */ 14 5 15 6 #ifndef XDP_UMEM_PROPS_H_
+85 -67
net/xdp/xsk.c
··· 5 5 * applications. 6 6 * Copyright(c) 2018 Intel Corporation. 7 7 * 8 - * This program is free software; you can redistribute it and/or modify it 9 - * under the terms and conditions of the GNU General Public License, 10 - * version 2, as published by the Free Software Foundation. 11 - * 12 - * This program is distributed in the hope it will be useful, but WITHOUT 13 - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 14 - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for 15 - * more details. 16 - * 17 8 * Author(s): Björn Töpel <bjorn.topel@intel.com> 18 9 * Magnus Karlsson <magnus.karlsson@intel.com> 19 10 */ ··· 142 151 goto out; 143 152 } 144 153 154 + if (xs->queue_id >= xs->dev->real_num_tx_queues) { 155 + err = -ENXIO; 156 + goto out; 157 + } 158 + 145 159 skb = sock_alloc_send_skb(sk, len, !need_wait, &err); 146 160 if (unlikely(!skb)) { 147 161 err = -EAGAIN; ··· 228 232 if (!q) 229 233 return -ENOMEM; 230 234 235 + /* Make sure queue is ready before it can be seen by others */ 236 + smp_wmb(); 231 237 *queue = q; 232 238 return 0; 233 - } 234 - 235 - static void __xsk_release(struct xdp_sock *xs) 236 - { 237 - /* Wait for driver to stop using the xdp socket. */ 238 - synchronize_net(); 239 - 240 - dev_put(xs->dev); 241 239 } 242 240 243 241 static int xsk_release(struct socket *sock) ··· 250 260 local_bh_enable(); 251 261 252 262 if (xs->dev) { 253 - __xsk_release(xs); 263 + /* Wait for driver to stop using the xdp socket. 
*/ 264 + synchronize_net(); 265 + dev_put(xs->dev); 254 266 xs->dev = NULL; 255 267 } 256 268 ··· 286 294 { 287 295 struct sockaddr_xdp *sxdp = (struct sockaddr_xdp *)addr; 288 296 struct sock *sk = sock->sk; 289 - struct net_device *dev, *dev_curr; 290 297 struct xdp_sock *xs = xdp_sk(sk); 291 - struct xdp_umem *old_umem = NULL; 298 + struct net_device *dev; 292 299 int err = 0; 293 300 294 301 if (addr_len < sizeof(struct sockaddr_xdp)) ··· 296 305 return -EINVAL; 297 306 298 307 mutex_lock(&xs->mutex); 299 - dev_curr = xs->dev; 308 + if (xs->dev) { 309 + err = -EBUSY; 310 + goto out_release; 311 + } 312 + 300 313 dev = dev_get_by_index(sock_net(sk), sxdp->sxdp_ifindex); 301 314 if (!dev) { 302 315 err = -ENODEV; ··· 312 317 goto out_unlock; 313 318 } 314 319 315 - if (sxdp->sxdp_queue_id >= dev->num_rx_queues) { 320 + if ((xs->rx && sxdp->sxdp_queue_id >= dev->real_num_rx_queues) || 321 + (xs->tx && sxdp->sxdp_queue_id >= dev->real_num_tx_queues)) { 316 322 err = -EINVAL; 317 323 goto out_unlock; 318 324 } ··· 348 352 } 349 353 350 354 xdp_get_umem(umem_xs->umem); 351 - old_umem = xs->umem; 352 355 xs->umem = umem_xs->umem; 353 356 sockfd_put(sock); 354 357 } else if (!xs->umem || !xdp_umem_validate_queues(xs->umem)) { ··· 357 362 /* This xsk has its own umem. */ 358 363 xskq_set_umem(xs->umem->fq, &xs->umem->props); 359 364 xskq_set_umem(xs->umem->cq, &xs->umem->props); 360 - } 361 - 362 - /* Rebind? 
*/ 363 - if (dev_curr && (dev_curr != dev || 364 - xs->queue_id != sxdp->sxdp_queue_id)) { 365 - __xsk_release(xs); 366 - if (old_umem) 367 - xdp_put_umem(old_umem); 368 365 } 369 366 370 367 xs->dev = dev; ··· 406 419 struct xdp_umem_reg mr; 407 420 struct xdp_umem *umem; 408 421 409 - if (xs->umem) 410 - return -EBUSY; 411 - 412 422 if (copy_from_user(&mr, optval, sizeof(mr))) 413 423 return -EFAULT; 414 424 415 425 mutex_lock(&xs->mutex); 416 - err = xdp_umem_create(&umem); 417 - 418 - err = xdp_umem_reg(umem, &mr); 419 - if (err) { 420 - kfree(umem); 426 + if (xs->umem) { 421 427 mutex_unlock(&xs->mutex); 422 - return err; 428 + return -EBUSY; 429 + } 430 + 431 + umem = xdp_umem_create(&mr); 432 + if (IS_ERR(umem)) { 433 + mutex_unlock(&xs->mutex); 434 + return PTR_ERR(umem); 423 435 } 424 436 425 437 /* Make sure umem is ready before it can be seen by others */ 426 438 smp_wmb(); 427 - 428 439 xs->umem = umem; 429 440 mutex_unlock(&xs->mutex); 430 441 return 0; ··· 433 448 struct xsk_queue **q; 434 449 int entries; 435 450 436 - if (!xs->umem) 437 - return -EINVAL; 438 - 439 451 if (copy_from_user(&entries, optval, sizeof(entries))) 440 452 return -EFAULT; 441 453 442 454 mutex_lock(&xs->mutex); 455 + if (!xs->umem) { 456 + mutex_unlock(&xs->mutex); 457 + return -EINVAL; 458 + } 459 + 443 460 q = (optname == XDP_UMEM_FILL_RING) ? 
&xs->umem->fq : 444 461 &xs->umem->cq; 445 462 err = xsk_init_queue(entries, q, true); ··· 491 504 492 505 return 0; 493 506 } 507 + case XDP_MMAP_OFFSETS: 508 + { 509 + struct xdp_mmap_offsets off; 510 + 511 + if (len < sizeof(off)) 512 + return -EINVAL; 513 + 514 + off.rx.producer = offsetof(struct xdp_rxtx_ring, ptrs.producer); 515 + off.rx.consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer); 516 + off.rx.desc = offsetof(struct xdp_rxtx_ring, desc); 517 + off.tx.producer = offsetof(struct xdp_rxtx_ring, ptrs.producer); 518 + off.tx.consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer); 519 + off.tx.desc = offsetof(struct xdp_rxtx_ring, desc); 520 + 521 + off.fr.producer = offsetof(struct xdp_umem_ring, ptrs.producer); 522 + off.fr.consumer = offsetof(struct xdp_umem_ring, ptrs.consumer); 523 + off.fr.desc = offsetof(struct xdp_umem_ring, desc); 524 + off.cr.producer = offsetof(struct xdp_umem_ring, ptrs.producer); 525 + off.cr.consumer = offsetof(struct xdp_umem_ring, ptrs.consumer); 526 + off.cr.desc = offsetof(struct xdp_umem_ring, desc); 527 + 528 + len = sizeof(off); 529 + if (copy_to_user(optval, &off, len)) 530 + return -EFAULT; 531 + if (put_user(len, optlen)) 532 + return -EFAULT; 533 + 534 + return 0; 535 + } 494 536 default: 495 537 break; 496 538 } ··· 534 518 unsigned long size = vma->vm_end - vma->vm_start; 535 519 struct xdp_sock *xs = xdp_sk(sock->sk); 536 520 struct xsk_queue *q = NULL; 521 + struct xdp_umem *umem; 537 522 unsigned long pfn; 538 523 struct page *qpg; 539 524 540 525 if (offset == XDP_PGOFF_RX_RING) { 541 - q = xs->rx; 526 + q = READ_ONCE(xs->rx); 542 527 } else if (offset == XDP_PGOFF_TX_RING) { 543 - q = xs->tx; 528 + q = READ_ONCE(xs->tx); 544 529 } else { 545 - if (!xs->umem) 530 + umem = READ_ONCE(xs->umem); 531 + if (!umem) 546 532 return -EINVAL; 547 533 548 534 if (offset == XDP_UMEM_PGOFF_FILL_RING) 549 - q = xs->umem->fq; 535 + q = READ_ONCE(umem->fq); 550 536 else if (offset == XDP_UMEM_PGOFF_COMPLETION_RING) 
551 - q = xs->umem->cq; 537 + q = READ_ONCE(umem->cq); 552 538 } 553 539 554 540 if (!q) ··· 572 554 }; 573 555 574 556 static const struct proto_ops xsk_proto_ops = { 575 - .family = PF_XDP, 576 - .owner = THIS_MODULE, 577 - .release = xsk_release, 578 - .bind = xsk_bind, 579 - .connect = sock_no_connect, 580 - .socketpair = sock_no_socketpair, 581 - .accept = sock_no_accept, 582 - .getname = sock_no_getname, 583 - .poll = xsk_poll, 584 - .ioctl = sock_no_ioctl, 585 - .listen = sock_no_listen, 586 - .shutdown = sock_no_shutdown, 587 - .setsockopt = xsk_setsockopt, 588 - .getsockopt = xsk_getsockopt, 589 - .sendmsg = xsk_sendmsg, 590 - .recvmsg = sock_no_recvmsg, 591 - .mmap = xsk_mmap, 592 - .sendpage = sock_no_sendpage, 557 + .family = PF_XDP, 558 + .owner = THIS_MODULE, 559 + .release = xsk_release, 560 + .bind = xsk_bind, 561 + .connect = sock_no_connect, 562 + .socketpair = sock_no_socketpair, 563 + .accept = sock_no_accept, 564 + .getname = sock_no_getname, 565 + .poll = xsk_poll, 566 + .ioctl = sock_no_ioctl, 567 + .listen = sock_no_listen, 568 + .shutdown = sock_no_shutdown, 569 + .setsockopt = xsk_setsockopt, 570 + .getsockopt = xsk_getsockopt, 571 + .sendmsg = xsk_sendmsg, 572 + .recvmsg = sock_no_recvmsg, 573 + .mmap = xsk_mmap, 574 + .sendpage = sock_no_sendpage, 593 575 }; 594 576 595 577 static void xsk_destruct(struct sock *sk)
+1 -11
net/xdp/xsk_queue.c
··· 1 1 // SPDX-License-Identifier: GPL-2.0 2 2 /* XDP user-space ring structure 3 3 * Copyright(c) 2018 Intel Corporation. 4 - * 5 - * This program is free software; you can redistribute it and/or modify it 6 - * under the terms and conditions of the GNU General Public License, 7 - * version 2, as published by the Free Software Foundation. 8 - * 9 - * This program is distributed in the hope it will be useful, but WITHOUT 10 - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for 12 - * more details. 13 4 */ 14 5 15 6 #include <linux/slab.h> ··· 22 31 23 32 static u32 xskq_rxtx_get_ring_size(struct xsk_queue *q) 24 33 { 25 - return (sizeof(struct xdp_ring) + 26 - q->nentries * sizeof(struct xdp_desc)); 34 + return sizeof(struct xdp_ring) + q->nentries * sizeof(struct xdp_desc); 27 35 } 28 36 29 37 struct xsk_queue *xskq_create(u32 nentries, bool umem_queue)
+21 -13
net/xdp/xsk_queue.h
··· 1 - /* SPDX-License-Identifier: GPL-2.0 2 - * XDP user-space ring structure 1 + /* SPDX-License-Identifier: GPL-2.0 */ 2 + /* XDP user-space ring structure 3 3 * Copyright(c) 2018 Intel Corporation. 4 - * 5 - * This program is free software; you can redistribute it and/or modify it 6 - * under the terms and conditions of the GNU General Public License, 7 - * version 2, as published by the Free Software Foundation. 8 - * 9 - * This program is distributed in the hope it will be useful, but WITHOUT 10 - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for 12 - * more details. 13 4 */ 14 5 15 6 #ifndef _LINUX_XSK_QUEUE_H ··· 12 21 #include "xdp_umem_props.h" 13 22 14 23 #define RX_BATCH_SIZE 16 24 + 25 + struct xdp_ring { 26 + u32 producer ____cacheline_aligned_in_smp; 27 + u32 consumer ____cacheline_aligned_in_smp; 28 + }; 29 + 30 + /* Used for the RX and TX queues for packets */ 31 + struct xdp_rxtx_ring { 32 + struct xdp_ring ptrs; 33 + struct xdp_desc desc[0] ____cacheline_aligned_in_smp; 34 + }; 35 + 36 + /* Used for the fill and completion queues for buffers */ 37 + struct xdp_umem_ring { 38 + struct xdp_ring ptrs; 39 + u32 desc[0] ____cacheline_aligned_in_smp; 40 + }; 15 41 16 42 struct xsk_queue { 17 43 struct xdp_umem_props umem_props; ··· 240 232 241 233 static inline bool xskq_full_desc(struct xsk_queue *q) 242 234 { 243 - return (xskq_nb_avail(q, q->nentries) == q->nentries); 235 + return xskq_nb_avail(q, q->nentries) == q->nentries; 244 236 } 245 237 246 238 static inline bool xskq_empty_desc(struct xsk_queue *q) 247 239 { 248 - return (xskq_nb_free(q, q->prod_tail, 1) == q->nentries); 240 + return xskq_nb_free(q, q->prod_tail, 1) == q->nentries; 249 241 } 250 242 251 243 void xskq_set_umem(struct xsk_queue *q, struct xdp_umem_props *umem_props);
+4
samples/bpf/Makefile
··· 51 51 hostprogs-y += xdp_adjust_tail 52 52 hostprogs-y += xdpsock 53 53 hostprogs-y += xdp_fwd 54 + hostprogs-y += task_fd_query 54 55 55 56 # Libbpf dependencies 56 57 LIBBPF = $(TOOLS_PATH)/lib/bpf/libbpf.a ··· 106 105 xdp_adjust_tail-objs := xdp_adjust_tail_user.o 107 106 xdpsock-objs := bpf_load.o xdpsock_user.o 108 107 xdp_fwd-objs := bpf_load.o xdp_fwd_user.o 108 + task_fd_query-objs := bpf_load.o task_fd_query_user.o $(TRACE_HELPERS) 109 109 110 110 # Tell kbuild to always build the programs 111 111 always := $(hostprogs-y) ··· 162 160 always += xdp_adjust_tail_kern.o 163 161 always += xdpsock_kern.o 164 162 always += xdp_fwd_kern.o 163 + always += task_fd_query_kern.o 165 164 166 165 HOSTCFLAGS += -I$(objtree)/usr/include 167 166 HOSTCFLAGS += -I$(srctree)/tools/lib/ ··· 178 175 HOSTCFLAGS_spintest_user.o += -I$(srctree)/tools/lib/bpf/ 179 176 HOSTCFLAGS_trace_event_user.o += -I$(srctree)/tools/lib/bpf/ 180 177 HOSTCFLAGS_sampleip_user.o += -I$(srctree)/tools/lib/bpf/ 178 + HOSTCFLAGS_task_fd_query_user.o += -I$(srctree)/tools/lib/bpf/ 181 179 182 180 HOST_LOADLIBES += $(LIBBPF) -lelf 183 181 HOSTLOADLIBES_tracex4 += -lrt
+19
samples/bpf/task_fd_query_kern.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + #include <linux/version.h> 3 + #include <linux/ptrace.h> 4 + #include <uapi/linux/bpf.h> 5 + #include "bpf_helpers.h" 6 + 7 + SEC("kprobe/blk_start_request") 8 + int bpf_prog1(struct pt_regs *ctx) 9 + { 10 + return 0; 11 + } 12 + 13 + SEC("kretprobe/blk_account_io_completion") 14 + int bpf_prog2(struct pt_regs *ctx) 15 + { 16 + return 0; 17 + } 18 + char _license[] SEC("license") = "GPL"; 19 + u32 _version SEC("version") = LINUX_VERSION_CODE;
+382
samples/bpf/task_fd_query_user.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + 3 + #include <stdio.h> 4 + #include <stdlib.h> 5 + #include <signal.h> 6 + #include <unistd.h> 7 + #include <stdbool.h> 8 + #include <string.h> 9 + #include <stdint.h> 10 + #include <fcntl.h> 11 + #include <linux/bpf.h> 12 + #include <sys/ioctl.h> 13 + #include <sys/resource.h> 14 + #include <sys/types.h> 15 + #include <sys/stat.h> 16 + 17 + #include "libbpf.h" 18 + #include "bpf_load.h" 19 + #include "bpf_util.h" 20 + #include "perf-sys.h" 21 + #include "trace_helpers.h" 22 + 23 + #define CHECK_PERROR_RET(condition) ({ \ 24 + int __ret = !!(condition); \ 25 + if (__ret) { \ 26 + printf("FAIL: %s:\n", __func__); \ 27 + perror(" "); \ 28 + return -1; \ 29 + } \ 30 + }) 31 + 32 + #define CHECK_AND_RET(condition) ({ \ 33 + int __ret = !!(condition); \ 34 + if (__ret) \ 35 + return -1; \ 36 + }) 37 + 38 + static __u64 ptr_to_u64(void *ptr) 39 + { 40 + return (__u64) (unsigned long) ptr; 41 + } 42 + 43 + #define PMU_TYPE_FILE "/sys/bus/event_source/devices/%s/type" 44 + static int bpf_find_probe_type(const char *event_type) 45 + { 46 + char buf[256]; 47 + int fd, ret; 48 + 49 + ret = snprintf(buf, sizeof(buf), PMU_TYPE_FILE, event_type); 50 + CHECK_PERROR_RET(ret < 0 || ret >= sizeof(buf)); 51 + 52 + fd = open(buf, O_RDONLY); 53 + CHECK_PERROR_RET(fd < 0); 54 + 55 + ret = read(fd, buf, sizeof(buf)); 56 + close(fd); 57 + CHECK_PERROR_RET(ret < 0 || ret >= sizeof(buf)); 58 + 59 + errno = 0; 60 + ret = (int)strtol(buf, NULL, 10); 61 + CHECK_PERROR_RET(errno); 62 + return ret; 63 + } 64 + 65 + #define PMU_RETPROBE_FILE "/sys/bus/event_source/devices/%s/format/retprobe" 66 + static int bpf_get_retprobe_bit(const char *event_type) 67 + { 68 + char buf[256]; 69 + int fd, ret; 70 + 71 + ret = snprintf(buf, sizeof(buf), PMU_RETPROBE_FILE, event_type); 72 + CHECK_PERROR_RET(ret < 0 || ret >= sizeof(buf)); 73 + 74 + fd = open(buf, O_RDONLY); 75 + CHECK_PERROR_RET(fd < 0); 76 + 77 + ret = read(fd, buf, sizeof(buf)); 78 + close(fd); 79 
+ CHECK_PERROR_RET(ret < 0 || ret >= sizeof(buf)); 80 + CHECK_PERROR_RET(strlen(buf) < strlen("config:")); 81 + 82 + errno = 0; 83 + ret = (int)strtol(buf + strlen("config:"), NULL, 10); 84 + CHECK_PERROR_RET(errno); 85 + return ret; 86 + } 87 + 88 + static int test_debug_fs_kprobe(int prog_fd_idx, const char *fn_name, 89 + __u32 expected_fd_type) 90 + { 91 + __u64 probe_offset, probe_addr; 92 + __u32 len, prog_id, fd_type; 93 + char buf[256]; 94 + int err; 95 + 96 + len = sizeof(buf); 97 + err = bpf_task_fd_query(getpid(), event_fd[prog_fd_idx], 0, buf, &len, 98 + &prog_id, &fd_type, &probe_offset, 99 + &probe_addr); 100 + if (err < 0) { 101 + printf("FAIL: %s, for event_fd idx %d, fn_name %s\n", 102 + __func__, prog_fd_idx, fn_name); 103 + perror(" :"); 104 + return -1; 105 + } 106 + if (strcmp(buf, fn_name) != 0 || 107 + fd_type != expected_fd_type || 108 + probe_offset != 0x0 || probe_addr != 0x0) { 109 + printf("FAIL: bpf_trace_event_query(event_fd[%d]):\n", 110 + prog_fd_idx); 111 + printf("buf: %s, fd_type: %u, probe_offset: 0x%llx," 112 + " probe_addr: 0x%llx\n", 113 + buf, fd_type, probe_offset, probe_addr); 114 + return -1; 115 + } 116 + return 0; 117 + } 118 + 119 + static int test_nondebug_fs_kuprobe_common(const char *event_type, 120 + const char *name, __u64 offset, __u64 addr, bool is_return, 121 + char *buf, __u32 *buf_len, __u32 *prog_id, __u32 *fd_type, 122 + __u64 *probe_offset, __u64 *probe_addr) 123 + { 124 + int is_return_bit = bpf_get_retprobe_bit(event_type); 125 + int type = bpf_find_probe_type(event_type); 126 + struct perf_event_attr attr = {}; 127 + int fd; 128 + 129 + if (type < 0 || is_return_bit < 0) { 130 + printf("FAIL: %s incorrect type (%d) or is_return_bit (%d)\n", 131 + __func__, type, is_return_bit); 132 + return -1; 133 + } 134 + 135 + attr.sample_period = 1; 136 + attr.wakeup_events = 1; 137 + if (is_return) 138 + attr.config |= 1 << is_return_bit; 139 + 140 + if (name) { 141 + attr.config1 = ptr_to_u64((void *)name); 142 + 
attr.config2 = offset; 143 + } else { 144 + attr.config1 = 0; 145 + attr.config2 = addr; 146 + } 147 + attr.size = sizeof(attr); 148 + attr.type = type; 149 + 150 + fd = sys_perf_event_open(&attr, -1, 0, -1, 0); 151 + CHECK_PERROR_RET(fd < 0); 152 + 153 + CHECK_PERROR_RET(ioctl(fd, PERF_EVENT_IOC_ENABLE, 0) < 0); 154 + CHECK_PERROR_RET(ioctl(fd, PERF_EVENT_IOC_SET_BPF, prog_fd[0]) < 0); 155 + CHECK_PERROR_RET(bpf_task_fd_query(getpid(), fd, 0, buf, buf_len, 156 + prog_id, fd_type, probe_offset, probe_addr) < 0); 157 + 158 + return 0; 159 + } 160 + 161 + static int test_nondebug_fs_probe(const char *event_type, const char *name, 162 + __u64 offset, __u64 addr, bool is_return, 163 + __u32 expected_fd_type, 164 + __u32 expected_ret_fd_type, 165 + char *buf, __u32 buf_len) 166 + { 167 + __u64 probe_offset, probe_addr; 168 + __u32 prog_id, fd_type; 169 + int err; 170 + 171 + err = test_nondebug_fs_kuprobe_common(event_type, name, 172 + offset, addr, is_return, 173 + buf, &buf_len, &prog_id, 174 + &fd_type, &probe_offset, 175 + &probe_addr); 176 + if (err < 0) { 177 + printf("FAIL: %s, " 178 + "for name %s, offset 0x%llx, addr 0x%llx, is_return %d\n", 179 + __func__, name ? 
name : "", offset, addr, is_return); 180 + perror(" :"); 181 + return -1; 182 + } 183 + if ((is_return && fd_type != expected_ret_fd_type) || 184 + (!is_return && fd_type != expected_fd_type)) { 185 + printf("FAIL: %s, incorrect fd_type %u\n", 186 + __func__, fd_type); 187 + return -1; 188 + } 189 + if (name) { 190 + if (strcmp(name, buf) != 0) { 191 + printf("FAIL: %s, incorrect buf %s\n", __func__, buf); 192 + return -1; 193 + } 194 + if (probe_offset != offset) { 195 + printf("FAIL: %s, incorrect probe_offset 0x%llx\n", 196 + __func__, probe_offset); 197 + return -1; 198 + } 199 + } else { 200 + if (buf_len != 0) { 201 + printf("FAIL: %s, incorrect buf %p\n", 202 + __func__, buf); 203 + return -1; 204 + } 205 + 206 + if (probe_addr != addr) { 207 + printf("FAIL: %s, incorrect probe_addr 0x%llx\n", 208 + __func__, probe_addr); 209 + return -1; 210 + } 211 + } 212 + return 0; 213 + } 214 + 215 + static int test_debug_fs_uprobe(char *binary_path, long offset, bool is_return) 216 + { 217 + const char *event_type = "uprobe"; 218 + struct perf_event_attr attr = {}; 219 + char buf[256], event_alias[256]; 220 + __u64 probe_offset, probe_addr; 221 + __u32 len, prog_id, fd_type; 222 + int err, res, kfd, efd; 223 + ssize_t bytes; 224 + 225 + snprintf(buf, sizeof(buf), "/sys/kernel/debug/tracing/%s_events", 226 + event_type); 227 + kfd = open(buf, O_WRONLY | O_APPEND, 0); 228 + CHECK_PERROR_RET(kfd < 0); 229 + 230 + res = snprintf(event_alias, sizeof(event_alias), "test_%d", getpid()); 231 + CHECK_PERROR_RET(res < 0 || res >= sizeof(event_alias)); 232 + 233 + res = snprintf(buf, sizeof(buf), "%c:%ss/%s %s:0x%lx", 234 + is_return ? 
'r' : 'p', event_type, event_alias, 235 + binary_path, offset); 236 + CHECK_PERROR_RET(res < 0 || res >= sizeof(buf)); 237 + CHECK_PERROR_RET(write(kfd, buf, strlen(buf)) < 0); 238 + 239 + close(kfd); 240 + kfd = -1; 241 + 242 + snprintf(buf, sizeof(buf), "/sys/kernel/debug/tracing/events/%ss/%s/id", 243 + event_type, event_alias); 244 + efd = open(buf, O_RDONLY, 0); 245 + CHECK_PERROR_RET(efd < 0); 246 + 247 + bytes = read(efd, buf, sizeof(buf)); 248 + CHECK_PERROR_RET(bytes <= 0 || bytes >= sizeof(buf)); 249 + close(efd); 250 + buf[bytes] = '\0'; 251 + 252 + attr.config = strtol(buf, NULL, 0); 253 + attr.type = PERF_TYPE_TRACEPOINT; 254 + attr.sample_period = 1; 255 + attr.wakeup_events = 1; 256 + kfd = sys_perf_event_open(&attr, -1, 0, -1, PERF_FLAG_FD_CLOEXEC); 257 + CHECK_PERROR_RET(kfd < 0); 258 + CHECK_PERROR_RET(ioctl(kfd, PERF_EVENT_IOC_SET_BPF, prog_fd[0]) < 0); 259 + CHECK_PERROR_RET(ioctl(kfd, PERF_EVENT_IOC_ENABLE, 0) < 0); 260 + 261 + len = sizeof(buf); 262 + err = bpf_task_fd_query(getpid(), kfd, 0, buf, &len, 263 + &prog_id, &fd_type, &probe_offset, 264 + &probe_addr); 265 + if (err < 0) { 266 + printf("FAIL: %s, binary_path %s\n", __func__, binary_path); 267 + perror(" :"); 268 + return -1; 269 + } 270 + if ((is_return && fd_type != BPF_FD_TYPE_URETPROBE) || 271 + (!is_return && fd_type != BPF_FD_TYPE_UPROBE)) { 272 + printf("FAIL: %s, incorrect fd_type %u\n", __func__, 273 + fd_type); 274 + return -1; 275 + } 276 + if (strcmp(binary_path, buf) != 0) { 277 + printf("FAIL: %s, incorrect buf %s\n", __func__, buf); 278 + return -1; 279 + } 280 + if (probe_offset != offset) { 281 + printf("FAIL: %s, incorrect probe_offset 0x%llx\n", __func__, 282 + probe_offset); 283 + return -1; 284 + } 285 + 286 + close(kfd); 287 + return 0; 288 + } 289 + 290 + int main(int argc, char **argv) 291 + { 292 + struct rlimit r = {1024*1024, RLIM_INFINITY}; 293 + extern char __executable_start; 294 + char filename[256], buf[256]; 295 + __u64 uprobe_file_offset; 296 + 297 + 
snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); 298 + if (setrlimit(RLIMIT_MEMLOCK, &r)) { 299 + perror("setrlimit(RLIMIT_MEMLOCK)"); 300 + return 1; 301 + } 302 + 303 + if (load_kallsyms()) { 304 + printf("failed to process /proc/kallsyms\n"); 305 + return 1; 306 + } 307 + 308 + if (load_bpf_file(filename)) { 309 + printf("%s", bpf_log_buf); 310 + return 1; 311 + } 312 + 313 + /* test two functions in the corresponding *_kern.c file */ 314 + CHECK_AND_RET(test_debug_fs_kprobe(0, "blk_start_request", 315 + BPF_FD_TYPE_KPROBE)); 316 + CHECK_AND_RET(test_debug_fs_kprobe(1, "blk_account_io_completion", 317 + BPF_FD_TYPE_KRETPROBE)); 318 + 319 + /* test nondebug fs kprobe */ 320 + CHECK_AND_RET(test_nondebug_fs_probe("kprobe", "bpf_check", 0x0, 0x0, 321 + false, BPF_FD_TYPE_KPROBE, 322 + BPF_FD_TYPE_KRETPROBE, 323 + buf, sizeof(buf))); 324 + #ifdef __x86_64__ 325 + /* set a kprobe on "bpf_check + 0x5", which is x64 specific */ 326 + CHECK_AND_RET(test_nondebug_fs_probe("kprobe", "bpf_check", 0x5, 0x0, 327 + false, BPF_FD_TYPE_KPROBE, 328 + BPF_FD_TYPE_KRETPROBE, 329 + buf, sizeof(buf))); 330 + #endif 331 + CHECK_AND_RET(test_nondebug_fs_probe("kprobe", "bpf_check", 0x0, 0x0, 332 + true, BPF_FD_TYPE_KPROBE, 333 + BPF_FD_TYPE_KRETPROBE, 334 + buf, sizeof(buf))); 335 + CHECK_AND_RET(test_nondebug_fs_probe("kprobe", NULL, 0x0, 336 + ksym_get_addr("bpf_check"), false, 337 + BPF_FD_TYPE_KPROBE, 338 + BPF_FD_TYPE_KRETPROBE, 339 + buf, sizeof(buf))); 340 + CHECK_AND_RET(test_nondebug_fs_probe("kprobe", NULL, 0x0, 341 + ksym_get_addr("bpf_check"), false, 342 + BPF_FD_TYPE_KPROBE, 343 + BPF_FD_TYPE_KRETPROBE, 344 + NULL, 0)); 345 + CHECK_AND_RET(test_nondebug_fs_probe("kprobe", NULL, 0x0, 346 + ksym_get_addr("bpf_check"), true, 347 + BPF_FD_TYPE_KPROBE, 348 + BPF_FD_TYPE_KRETPROBE, 349 + buf, sizeof(buf))); 350 + CHECK_AND_RET(test_nondebug_fs_probe("kprobe", NULL, 0x0, 351 + ksym_get_addr("bpf_check"), true, 352 + BPF_FD_TYPE_KPROBE, 353 + BPF_FD_TYPE_KRETPROBE, 354 
+ 0, 0)); 355 + 356 + /* test nondebug fs uprobe */ 357 + /* the calculation of uprobe file offset is based on gcc 7.3.1 on x64 358 + * and the default linker script, which defines __executable_start as 359 + * the start of the .text section. The calculation could be different 360 + * on different systems with different compilers. The right way is 361 + * to parse the ELF file. We took a shortcut here. 362 + */ 363 + uprobe_file_offset = (__u64)main - (__u64)&__executable_start; 364 + CHECK_AND_RET(test_nondebug_fs_probe("uprobe", (char *)argv[0], 365 + uprobe_file_offset, 0x0, false, 366 + BPF_FD_TYPE_UPROBE, 367 + BPF_FD_TYPE_URETPROBE, 368 + buf, sizeof(buf))); 369 + CHECK_AND_RET(test_nondebug_fs_probe("uprobe", (char *)argv[0], 370 + uprobe_file_offset, 0x0, true, 371 + BPF_FD_TYPE_UPROBE, 372 + BPF_FD_TYPE_URETPROBE, 373 + buf, sizeof(buf))); 374 + 375 + /* test debug fs uprobe */ 376 + CHECK_AND_RET(test_debug_fs_uprobe((char *)argv[0], uprobe_file_offset, 377 + false)); 378 + CHECK_AND_RET(test_debug_fs_uprobe((char *)argv[0], uprobe_file_offset, 379 + true)); 380 + 381 + return 0; 382 + }
+49
samples/bpf/xdp_monitor_kern.c
··· 125 125 u64 processed; 126 126 u64 dropped; 127 127 u64 info; 128 + u64 err; 128 129 }; 129 130 #define MAX_CPUS 64 130 131 ··· 208 207 rec->info++; 209 208 210 209 return 0; 210 + } 211 + 212 + struct bpf_map_def SEC("maps") devmap_xmit_cnt = { 213 + .type = BPF_MAP_TYPE_PERCPU_ARRAY, 214 + .key_size = sizeof(u32), 215 + .value_size = sizeof(struct datarec), 216 + .max_entries = 1, 217 + }; 218 + 219 + /* Tracepoint: /sys/kernel/debug/tracing/events/xdp/xdp_devmap_xmit/format 220 + * Code in: kernel/include/trace/events/xdp.h 221 + */ 222 + struct devmap_xmit_ctx { 223 + u64 __pad; // First 8 bytes are not accessible by bpf code 224 + int map_id; // offset:8; size:4; signed:1; 225 + u32 act; // offset:12; size:4; signed:0; 226 + u32 map_index; // offset:16; size:4; signed:0; 227 + int drops; // offset:20; size:4; signed:1; 228 + int sent; // offset:24; size:4; signed:1; 229 + int from_ifindex; // offset:28; size:4; signed:1; 230 + int to_ifindex; // offset:32; size:4; signed:1; 231 + int err; // offset:36; size:4; signed:1; 232 + }; 233 + 234 + SEC("tracepoint/xdp/xdp_devmap_xmit") 235 + int trace_xdp_devmap_xmit(struct devmap_xmit_ctx *ctx) 236 + { 237 + struct datarec *rec; 238 + u32 key = 0; 239 + 240 + rec = bpf_map_lookup_elem(&devmap_xmit_cnt, &key); 241 + if (!rec) 242 + return 0; 243 + rec->processed += ctx->sent; 244 + rec->dropped += ctx->drops; 245 + 246 + /* Record bulk events, then userspace can calc average bulk size */ 247 + rec->info += 1; 248 + 249 + /* Record error cases, where no frame were sent */ 250 + if (ctx->err) 251 + rec->err++; 252 + 253 + /* Catch API error of drv ndo_xdp_xmit sent more than count */ 254 + if (ctx->drops < 0) 255 + rec->err++; 256 + 257 + return 1; 211 258 }
+68 -1
samples/bpf/xdp_monitor_user.c
··· 117 117 __u64 processed; 118 118 __u64 dropped; 119 119 __u64 info; 120 + __u64 err; 120 121 }; 121 122 #define MAX_CPUS 64 122 123 ··· 142 141 struct record_u64 xdp_exception[XDP_ACTION_MAX]; 143 142 struct record xdp_cpumap_kthread; 144 143 struct record xdp_cpumap_enqueue[MAX_CPUS]; 144 + struct record xdp_devmap_xmit; 145 145 }; 146 146 147 147 static bool map_collect_record(int fd, __u32 key, struct record *rec) ··· 153 151 __u64 sum_processed = 0; 154 152 __u64 sum_dropped = 0; 155 153 __u64 sum_info = 0; 154 + __u64 sum_err = 0; 156 155 int i; 157 156 158 157 if ((bpf_map_lookup_elem(fd, &key, values)) != 0) { ··· 172 169 sum_dropped += values[i].dropped; 173 170 rec->cpu[i].info = values[i].info; 174 171 sum_info += values[i].info; 172 + rec->cpu[i].err = values[i].err; 173 + sum_err += values[i].err; 175 174 } 176 175 rec->total.processed = sum_processed; 177 176 rec->total.dropped = sum_dropped; 178 177 rec->total.info = sum_info; 178 + rec->total.err = sum_err; 179 179 return true; 180 180 } 181 181 ··· 274 268 275 269 if (period > 0) { 276 270 packets = r->info - p->info; 271 + pps = packets / period; 272 + } 273 + return pps; 274 + } 275 + 276 + static double calc_err(struct datarec *r, struct datarec *p, double period) 277 + { 278 + __u64 packets = 0; 279 + double pps = 0; 280 + 281 + if (period > 0) { 282 + packets = r->err - p->err; 277 283 pps = packets / period; 278 284 } 279 285 return pps; ··· 415 397 info = calc_info(r, p, t); 416 398 if (info > 0) 417 399 i_str = "sched"; 418 - if (pps > 0) 400 + if (pps > 0 || drop > 0) 419 401 printf(fmt1, "cpumap-kthread", 420 402 i, pps, drop, info, i_str); 421 403 } ··· 425 407 if (info > 0) 426 408 i_str = "sched-sum"; 427 409 printf(fmt2, "cpumap-kthread", "total", pps, drop, info, i_str); 410 + } 411 + 412 + /* devmap ndo_xdp_xmit stats */ 413 + { 414 + char *fmt1 = "%-15s %-7d %'-12.0f %'-12.0f %'-10.2f %s %s\n"; 415 + char *fmt2 = "%-15s %-7s %'-12.0f %'-12.0f %'-10.2f %s %s\n"; 416 + struct 
record *rec, *prev; 417 + double drop, info, err; 418 + char *i_str = ""; 419 + char *err_str = ""; 420 + 421 + rec = &stats_rec->xdp_devmap_xmit; 422 + prev = &stats_prev->xdp_devmap_xmit; 423 + t = calc_period(rec, prev); 424 + for (i = 0; i < nr_cpus; i++) { 425 + struct datarec *r = &rec->cpu[i]; 426 + struct datarec *p = &prev->cpu[i]; 427 + 428 + pps = calc_pps(r, p, t); 429 + drop = calc_drop(r, p, t); 430 + info = calc_info(r, p, t); 431 + err = calc_err(r, p, t); 432 + if (info > 0) { 433 + i_str = "bulk-average"; 434 + info = (pps+drop) / info; /* calc avg bulk */ 435 + } 436 + if (err > 0) 437 + err_str = "drv-err"; 438 + if (pps > 0 || drop > 0) 439 + printf(fmt1, "devmap-xmit", 440 + i, pps, drop, info, i_str, err_str); 441 + } 442 + pps = calc_pps(&rec->total, &prev->total, t); 443 + drop = calc_drop(&rec->total, &prev->total, t); 444 + info = calc_info(&rec->total, &prev->total, t); 445 + err = calc_err(&rec->total, &prev->total, t); 446 + if (info > 0) { 447 + i_str = "bulk-average"; 448 + info = (pps+drop) / info; /* calc avg bulk */ 449 + } 450 + if (err > 0) 451 + err_str = "drv-err"; 452 + printf(fmt2, "devmap-xmit", "total", pps, drop, 453 + info, i_str, err_str); 428 454 } 429 455 430 456 printf("\n"); ··· 498 436 499 437 fd = map_data[3].fd; /* map3: cpumap_kthread_cnt */ 500 438 map_collect_record(fd, 0, &rec->xdp_cpumap_kthread); 439 + 440 + fd = map_data[4].fd; /* map4: devmap_xmit_cnt */ 441 + map_collect_record(fd, 0, &rec->xdp_devmap_xmit); 501 442 502 443 return true; 503 444 } ··· 545 480 546 481 rec_sz = sizeof(struct datarec); 547 482 rec->xdp_cpumap_kthread.cpu = alloc_rec_per_cpu(rec_sz); 483 + rec->xdp_devmap_xmit.cpu = alloc_rec_per_cpu(rec_sz); 548 484 549 485 for (i = 0; i < MAX_CPUS; i++) 550 486 rec->xdp_cpumap_enqueue[i].cpu = alloc_rec_per_cpu(rec_sz); ··· 564 498 free(r->xdp_exception[i].cpu); 565 499 566 500 free(r->xdp_cpumap_kthread.cpu); 501 + free(r->xdp_devmap_xmit.cpu); 567 502 568 503 for (i = 0; i < MAX_CPUS; 
i++) 569 504 free(r->xdp_cpumap_enqueue[i].cpu);
+77 -58
samples/bpf/xdpsock_user.c
··· 1 1 // SPDX-License-Identifier: GPL-2.0 2 - /* Copyright(c) 2017 - 2018 Intel Corporation. 3 - * 4 - * This program is free software; you can redistribute it and/or modify it 5 - * under the terms and conditions of the GNU General Public License, 6 - * version 2, as published by the Free Software Foundation. 7 - * 8 - * This program is distributed in the hope it will be useful, but WITHOUT 9 - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 10 - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for 11 - * more details. 12 - */ 2 + /* Copyright(c) 2017 - 2018 Intel Corporation. */ 13 3 14 4 #include <assert.h> 15 5 #include <errno.h> ··· 79 89 u32 cached_cons; 80 90 u32 mask; 81 91 u32 size; 82 - struct xdp_umem_ring *ring; 92 + u32 *producer; 93 + u32 *consumer; 94 + u32 *ring; 95 + void *map; 83 96 }; 84 97 85 98 struct xdp_umem { ··· 97 104 u32 cached_cons; 98 105 u32 mask; 99 106 u32 size; 100 - struct xdp_rxtx_ring *ring; 107 + u32 *producer; 108 + u32 *consumer; 109 + struct xdp_desc *ring; 110 + void *map; 101 111 }; 102 112 103 113 struct xdpsock { ··· 161 165 return free_entries; 162 166 163 167 /* Refresh the local tail pointer */ 164 - q->cached_cons = q->ring->ptrs.consumer; 168 + q->cached_cons = *q->consumer; 165 169 166 170 return q->size - (q->cached_prod - q->cached_cons); 167 171 } ··· 174 178 return free_entries; 175 179 176 180 /* Refresh the local tail pointer */ 177 - q->cached_cons = q->ring->ptrs.consumer + q->size; 181 + q->cached_cons = *q->consumer + q->size; 178 182 return q->cached_cons - q->cached_prod; 179 183 } 180 184 ··· 183 187 u32 entries = q->cached_prod - q->cached_cons; 184 188 185 189 if (entries == 0) { 186 - q->cached_prod = q->ring->ptrs.producer; 190 + q->cached_prod = *q->producer; 187 191 entries = q->cached_prod - q->cached_cons; 188 192 } 189 193 ··· 195 199 u32 entries = q->cached_prod - q->cached_cons; 196 200 197 201 if (entries == 0) { 198 - q->cached_prod = 
q->ring->ptrs.producer; 202 + q->cached_prod = *q->producer; 199 203 entries = q->cached_prod - q->cached_cons; 200 204 } 201 205 ··· 214 218 for (i = 0; i < nb; i++) { 215 219 u32 idx = fq->cached_prod++ & fq->mask; 216 220 217 - fq->ring->desc[idx] = d[i].idx; 221 + fq->ring[idx] = d[i].idx; 218 222 } 219 223 220 224 u_smp_wmb(); 221 225 222 - fq->ring->ptrs.producer = fq->cached_prod; 226 + *fq->producer = fq->cached_prod; 223 227 224 228 return 0; 225 229 } ··· 235 239 for (i = 0; i < nb; i++) { 236 240 u32 idx = fq->cached_prod++ & fq->mask; 237 241 238 - fq->ring->desc[idx] = d[i]; 242 + fq->ring[idx] = d[i]; 239 243 } 240 244 241 245 u_smp_wmb(); 242 246 243 - fq->ring->ptrs.producer = fq->cached_prod; 247 + *fq->producer = fq->cached_prod; 244 248 245 249 return 0; 246 250 } ··· 254 258 255 259 for (i = 0; i < entries; i++) { 256 260 idx = cq->cached_cons++ & cq->mask; 257 - d[i] = cq->ring->desc[idx]; 261 + d[i] = cq->ring[idx]; 258 262 } 259 263 260 264 if (entries > 0) { 261 265 u_smp_wmb(); 262 266 263 - cq->ring->ptrs.consumer = cq->cached_cons; 267 + *cq->consumer = cq->cached_cons; 264 268 } 265 269 266 270 return entries; ··· 276 280 const struct xdp_desc *descs, 277 281 unsigned int ndescs) 278 282 { 279 - struct xdp_rxtx_ring *r = uq->ring; 283 + struct xdp_desc *r = uq->ring; 280 284 unsigned int i; 281 285 282 286 if (xq_nb_free(uq, ndescs) < ndescs) ··· 285 289 for (i = 0; i < ndescs; i++) { 286 290 u32 idx = uq->cached_prod++ & uq->mask; 287 291 288 - r->desc[idx].idx = descs[i].idx; 289 - r->desc[idx].len = descs[i].len; 290 - r->desc[idx].offset = descs[i].offset; 292 + r[idx].idx = descs[i].idx; 293 + r[idx].len = descs[i].len; 294 + r[idx].offset = descs[i].offset; 291 295 } 292 296 293 297 u_smp_wmb(); 294 298 295 - r->ptrs.producer = uq->cached_prod; 299 + *uq->producer = uq->cached_prod; 296 300 return 0; 297 301 } 298 302 299 303 static inline int xq_enq_tx_only(struct xdp_uqueue *uq, 300 304 __u32 idx, unsigned int ndescs) 301 305 { 
302 - struct xdp_rxtx_ring *q = uq->ring; 306 + struct xdp_desc *r = uq->ring; 303 307 unsigned int i; 304 308 305 309 if (xq_nb_free(uq, ndescs) < ndescs) ··· 308 312 for (i = 0; i < ndescs; i++) { 309 313 u32 idx = uq->cached_prod++ & uq->mask; 310 314 311 - q->desc[idx].idx = idx + i; 312 - q->desc[idx].len = sizeof(pkt_data) - 1; 313 - q->desc[idx].offset = 0; 315 + r[idx].idx = idx + i; 316 + r[idx].len = sizeof(pkt_data) - 1; 317 + r[idx].offset = 0; 314 318 } 315 319 316 320 u_smp_wmb(); 317 321 318 - q->ptrs.producer = uq->cached_prod; 322 + *uq->producer = uq->cached_prod; 319 323 return 0; 320 324 } 321 325 ··· 323 327 struct xdp_desc *descs, 324 328 int ndescs) 325 329 { 326 - struct xdp_rxtx_ring *r = uq->ring; 330 + struct xdp_desc *r = uq->ring; 327 331 unsigned int idx; 328 332 int i, entries; 329 333 ··· 333 337 334 338 for (i = 0; i < entries; i++) { 335 339 idx = uq->cached_cons++ & uq->mask; 336 - descs[i] = r->desc[idx]; 340 + descs[i] = r[idx]; 337 341 } 338 342 339 343 if (entries > 0) { 340 344 u_smp_wmb(); 341 345 342 - r->ptrs.consumer = uq->cached_cons; 346 + *uq->consumer = uq->cached_cons; 343 347 } 344 348 345 349 return entries; ··· 398 402 static struct xdp_umem *xdp_umem_configure(int sfd) 399 403 { 400 404 int fq_size = FQ_NUM_DESCS, cq_size = CQ_NUM_DESCS; 405 + struct xdp_mmap_offsets off; 401 406 struct xdp_umem_reg mr; 402 407 struct xdp_umem *umem; 408 + socklen_t optlen; 403 409 void *bufs; 404 410 405 411 umem = calloc(1, sizeof(*umem)); ··· 421 423 lassert(setsockopt(sfd, SOL_XDP, XDP_UMEM_COMPLETION_RING, &cq_size, 422 424 sizeof(int)) == 0); 423 425 424 - umem->fq.ring = mmap(0, sizeof(struct xdp_umem_ring) + 425 - FQ_NUM_DESCS * sizeof(u32), 426 - PROT_READ | PROT_WRITE, 427 - MAP_SHARED | MAP_POPULATE, sfd, 428 - XDP_UMEM_PGOFF_FILL_RING); 429 - lassert(umem->fq.ring != MAP_FAILED); 426 + optlen = sizeof(off); 427 + lassert(getsockopt(sfd, SOL_XDP, XDP_MMAP_OFFSETS, &off, 428 + &optlen) == 0); 429 + 430 + umem->fq.map = 
mmap(0, off.fr.desc + 431 + FQ_NUM_DESCS * sizeof(u32), 432 + PROT_READ | PROT_WRITE, 433 + MAP_SHARED | MAP_POPULATE, sfd, 434 + XDP_UMEM_PGOFF_FILL_RING); 435 + lassert(umem->fq.map != MAP_FAILED); 430 436 431 437 umem->fq.mask = FQ_NUM_DESCS - 1; 432 438 umem->fq.size = FQ_NUM_DESCS; 439 + umem->fq.producer = umem->fq.map + off.fr.producer; 440 + umem->fq.consumer = umem->fq.map + off.fr.consumer; 441 + umem->fq.ring = umem->fq.map + off.fr.desc; 433 442 434 - umem->cq.ring = mmap(0, sizeof(struct xdp_umem_ring) + 443 + umem->cq.map = mmap(0, off.cr.desc + 435 444 CQ_NUM_DESCS * sizeof(u32), 436 445 PROT_READ | PROT_WRITE, 437 446 MAP_SHARED | MAP_POPULATE, sfd, 438 447 XDP_UMEM_PGOFF_COMPLETION_RING); 439 - lassert(umem->cq.ring != MAP_FAILED); 448 + lassert(umem->cq.map != MAP_FAILED); 440 449 441 450 umem->cq.mask = CQ_NUM_DESCS - 1; 442 451 umem->cq.size = CQ_NUM_DESCS; 452 + umem->cq.producer = umem->cq.map + off.cr.producer; 453 + umem->cq.consumer = umem->cq.map + off.cr.consumer; 454 + umem->cq.ring = umem->cq.map + off.cr.desc; 443 455 444 456 umem->frames = (char (*)[FRAME_SIZE])bufs; 445 457 umem->fd = sfd; ··· 467 459 static struct xdpsock *xsk_configure(struct xdp_umem *umem) 468 460 { 469 461 struct sockaddr_xdp sxdp = {}; 462 + struct xdp_mmap_offsets off; 470 463 int sfd, ndescs = NUM_DESCS; 471 464 struct xdpsock *xsk; 472 465 bool shared = true; 466 + socklen_t optlen; 473 467 u32 i; 474 468 475 469 sfd = socket(PF_XDP, SOCK_RAW, 0); ··· 494 484 &ndescs, sizeof(int)) == 0); 495 485 lassert(setsockopt(sfd, SOL_XDP, XDP_TX_RING, 496 486 &ndescs, sizeof(int)) == 0); 487 + optlen = sizeof(off); 488 + lassert(getsockopt(sfd, SOL_XDP, XDP_MMAP_OFFSETS, &off, 489 + &optlen) == 0); 497 490 498 491 /* Rx */ 499 - xsk->rx.ring = mmap(NULL, 500 - sizeof(struct xdp_ring) + 501 - NUM_DESCS * sizeof(struct xdp_desc), 502 - PROT_READ | PROT_WRITE, 503 - MAP_SHARED | MAP_POPULATE, sfd, 504 - XDP_PGOFF_RX_RING); 505 - lassert(xsk->rx.ring != MAP_FAILED); 492 + 
xsk->rx.map = mmap(NULL, 493 + off.rx.desc + 494 + NUM_DESCS * sizeof(struct xdp_desc), 495 + PROT_READ | PROT_WRITE, 496 + MAP_SHARED | MAP_POPULATE, sfd, 497 + XDP_PGOFF_RX_RING); 498 + lassert(xsk->rx.map != MAP_FAILED); 506 499 507 500 if (!shared) { 508 501 for (i = 0; i < NUM_DESCS / 2; i++) ··· 514 501 } 515 502 516 503 /* Tx */ 517 - xsk->tx.ring = mmap(NULL, 518 - sizeof(struct xdp_ring) + 519 - NUM_DESCS * sizeof(struct xdp_desc), 520 - PROT_READ | PROT_WRITE, 521 - MAP_SHARED | MAP_POPULATE, sfd, 522 - XDP_PGOFF_TX_RING); 523 - lassert(xsk->tx.ring != MAP_FAILED); 504 + xsk->tx.map = mmap(NULL, 505 + off.tx.desc + 506 + NUM_DESCS * sizeof(struct xdp_desc), 507 + PROT_READ | PROT_WRITE, 508 + MAP_SHARED | MAP_POPULATE, sfd, 509 + XDP_PGOFF_TX_RING); 510 + lassert(xsk->tx.map != MAP_FAILED); 524 511 525 512 xsk->rx.mask = NUM_DESCS - 1; 526 513 xsk->rx.size = NUM_DESCS; 514 + xsk->rx.producer = xsk->rx.map + off.rx.producer; 515 + xsk->rx.consumer = xsk->rx.map + off.rx.consumer; 516 + xsk->rx.ring = xsk->rx.map + off.rx.desc; 527 517 528 518 xsk->tx.mask = NUM_DESCS - 1; 529 519 xsk->tx.size = NUM_DESCS; 520 + xsk->tx.producer = xsk->tx.map + off.tx.producer; 521 + xsk->tx.consumer = xsk->tx.map + off.tx.consumer; 522 + xsk->tx.ring = xsk->tx.map + off.tx.desc; 530 523 531 524 sxdp.sxdp_family = PF_XDP; 532 525 sxdp.sxdp_ifindex = opt_ifindex;
+4 -4
scripts/bpf_helpers_doc.py
··· 95 95 return capture.group(1) 96 96 97 97 def parse_desc(self): 98 - p = re.compile(' \* ?(?:\t| {6,8})Description$') 98 + p = re.compile(' \* ?(?:\t| {5,8})Description$') 99 99 capture = p.match(self.line) 100 100 if not capture: 101 101 # Helper can have empty description and we might be parsing another ··· 109 109 if self.line == ' *\n': 110 110 desc += '\n' 111 111 else: 112 - p = re.compile(' \* ?(?:\t| {6,8})(?:\t| {8})(.*)') 112 + p = re.compile(' \* ?(?:\t| {5,8})(?:\t| {8})(.*)') 113 113 capture = p.match(self.line) 114 114 if capture: 115 115 desc += capture.group(1) + '\n' ··· 118 118 return desc 119 119 120 120 def parse_ret(self): 121 - p = re.compile(' \* ?(?:\t| {6,8})Return$') 121 + p = re.compile(' \* ?(?:\t| {5,8})Return$') 122 122 capture = p.match(self.line) 123 123 if not capture: 124 124 # Helper can have empty retval and we might be parsing another ··· 132 132 if self.line == ' *\n': 133 133 ret += '\n' 134 134 else: 135 - p = re.compile(' \* ?(?:\t| {6,8})(?:\t| {8})(.*)') 135 + p = re.compile(' \* ?(?:\t| {5,8})(?:\t| {8})(.*)') 136 136 capture = p.match(self.line) 137 137 if capture: 138 138 ret += capture.group(1) + '\n'
+81
tools/bpf/bpftool/Documentation/bpftool-perf.rst
··· 1 + ================ 2 + bpftool-perf 3 + ================ 4 + ------------------------------------------------------------------------------- 5 + tool for inspection of perf related bpf prog attachments 6 + ------------------------------------------------------------------------------- 7 + 8 + :Manual section: 8 9 + 10 + SYNOPSIS 11 + ======== 12 + 13 + **bpftool** [*OPTIONS*] **perf** *COMMAND* 14 + 15 + *OPTIONS* := { [{ **-j** | **--json** }] [{ **-p** | **--pretty** }] } 16 + 17 + *COMMANDS* := 18 + { **show** | **list** | **help** } 19 + 20 + PERF COMMANDS 21 + ============= 22 + 23 + | **bpftool** **perf { show | list }** 24 + | **bpftool** **perf help** 25 + 26 + DESCRIPTION 27 + =========== 28 + **bpftool perf { show | list }** 29 + List all raw_tracepoint, tracepoint, kprobe attachment in the system. 30 + 31 + Output will start with process id and file descriptor in that process, 32 + followed by bpf program id, attachment information, and attachment point. 33 + The attachment point for raw_tracepoint/tracepoint is the trace probe name. 34 + The attachment point for k[ret]probe is either symbol name and offset, 35 + or a kernel virtual address. 36 + The attachment point for u[ret]probe is the file name and the file offset. 37 + 38 + **bpftool perf help** 39 + Print short help message. 40 + 41 + OPTIONS 42 + ======= 43 + -h, --help 44 + Print short generic help message (similar to **bpftool help**). 45 + 46 + -v, --version 47 + Print version number (similar to **bpftool version**). 48 + 49 + -j, --json 50 + Generate JSON output. For commands that cannot produce JSON, this 51 + option has no effect. 52 + 53 + -p, --pretty 54 + Generate human-readable JSON output. Implies **-j**. 
55 + 56 + EXAMPLES 57 + ======== 58 + 59 + | **# bpftool perf** 60 + 61 + :: 62 + 63 + pid 21711 fd 5: prog_id 5 kprobe func __x64_sys_write offset 0 64 + pid 21765 fd 5: prog_id 7 kretprobe func __x64_sys_nanosleep offset 0 65 + pid 21767 fd 5: prog_id 8 tracepoint sys_enter_nanosleep 66 + pid 21800 fd 5: prog_id 9 uprobe filename /home/yhs/a.out offset 1159 67 + 68 + | 69 + | **# bpftool -j perf** 70 + 71 + :: 72 + 73 + [{"pid":21711,"fd":5,"prog_id":5,"fd_type":"kprobe","func":"__x64_sys_write","offset":0}, \ 74 + {"pid":21765,"fd":5,"prog_id":7,"fd_type":"kretprobe","func":"__x64_sys_nanosleep","offset":0}, \ 75 + {"pid":21767,"fd":5,"prog_id":8,"fd_type":"tracepoint","tracepoint":"sys_enter_nanosleep"}, \ 76 + {"pid":21800,"fd":5,"prog_id":9,"fd_type":"uprobe","filename":"/home/yhs/a.out","offset":1159}] 77 + 78 + 79 + SEE ALSO 80 + ======== 81 + **bpftool**\ (8), **bpftool-prog**\ (8), **bpftool-map**\ (8)
+4 -1
tools/bpf/bpftool/Documentation/bpftool.rst
··· 16 16 17 17 **bpftool** **version** 18 18 19 - *OBJECT* := { **map** | **program** | **cgroup** } 19 + *OBJECT* := { **map** | **program** | **cgroup** | **perf** } 20 20 21 21 *OPTIONS* := { { **-V** | **--version** } | { **-h** | **--help** } 22 22 | { **-j** | **--json** } [{ **-p** | **--pretty** }] } ··· 29 29 | **load** | **help** } 30 30 31 31 *CGROUP-COMMANDS* := { **show** | **list** | **attach** | **detach** | **help** } 32 + 33 + *PERF-COMMANDS* := { **show** | **list** | **help** } 32 34 33 35 DESCRIPTION 34 36 =========== ··· 58 56 SEE ALSO 59 57 ======== 60 58 **bpftool-map**\ (8), **bpftool-prog**\ (8), **bpftool-cgroup**\ (8) 59 + **bpftool-perf**\ (8)
+9
tools/bpf/bpftool/bash-completion/bpftool
··· 448 448 ;; 449 449 esac 450 450 ;; 451 + perf) 452 + case $command in 453 + *) 454 + [[ $prev == $object ]] && \ 455 + COMPREPLY=( $( compgen -W 'help \ 456 + show list' -- "$cur" ) ) 457 + ;; 458 + esac 459 + ;; 451 460 esac 452 461 } && 453 462 complete -F _bpftool bpftool
+2 -1
tools/bpf/bpftool/main.c
··· 87 87 " %s batch file FILE\n" 88 88 " %s version\n" 89 89 "\n" 90 - " OBJECT := { prog | map | cgroup }\n" 90 + " OBJECT := { prog | map | cgroup | perf }\n" 91 91 " " HELP_SPEC_OPTIONS "\n" 92 92 "", 93 93 bin_name, bin_name, bin_name); ··· 216 216 { "prog", do_prog }, 217 217 { "map", do_map }, 218 218 { "cgroup", do_cgroup }, 219 + { "perf", do_perf }, 219 220 { "version", do_version }, 220 221 { 0 } 221 222 };
+1
tools/bpf/bpftool/main.h
··· 119 119 int do_map(int argc, char **arg); 120 120 int do_event_pipe(int argc, char **argv); 121 121 int do_cgroup(int argc, char **arg); 122 + int do_perf(int argc, char **arg); 122 123 123 124 int prog_parse_fd(int *argc, char ***argv); 124 125 int map_parse_fd_and_info(int *argc, char ***argv, void *info, __u32 *info_len);
+246
tools/bpf/bpftool/perf.c
··· 1 + // SPDX-License-Identifier: GPL-2.0+ 2 + // Copyright (C) 2018 Facebook 3 + // Author: Yonghong Song <yhs@fb.com> 4 + 5 + #define _GNU_SOURCE 6 + #include <ctype.h> 7 + #include <errno.h> 8 + #include <fcntl.h> 9 + #include <stdlib.h> 10 + #include <string.h> 11 + #include <sys/stat.h> 12 + #include <sys/types.h> 13 + #include <unistd.h> 14 + #include <ftw.h> 15 + 16 + #include <bpf.h> 17 + 18 + #include "main.h" 19 + 20 + /* 0: undecided, 1: supported, 2: not supported */ 21 + static int perf_query_supported; 22 + static bool has_perf_query_support(void) 23 + { 24 + __u64 probe_offset, probe_addr; 25 + __u32 len, prog_id, fd_type; 26 + char buf[256]; 27 + int fd; 28 + 29 + if (perf_query_supported) 30 + goto out; 31 + 32 + fd = open(bin_name, O_RDONLY); 33 + if (fd < 0) { 34 + p_err("perf_query_support: %s", strerror(errno)); 35 + goto out; 36 + } 37 + 38 + /* the following query will fail as no bpf attachment, 39 + * the expected errno is ENOTSUPP 40 + */ 41 + errno = 0; 42 + len = sizeof(buf); 43 + bpf_task_fd_query(getpid(), fd, 0, buf, &len, &prog_id, 44 + &fd_type, &probe_offset, &probe_addr); 45 + 46 + if (errno == 524 /* ENOTSUPP */) { 47 + perf_query_supported = 1; 48 + goto close_fd; 49 + } 50 + 51 + perf_query_supported = 2; 52 + p_err("perf_query_support: %s", strerror(errno)); 53 + fprintf(stderr, 54 + "HINT: non root or kernel doesn't support TASK_FD_QUERY\n"); 55 + 56 + close_fd: 57 + close(fd); 58 + out: 59 + return perf_query_supported == 1; 60 + } 61 + 62 + static void print_perf_json(int pid, int fd, __u32 prog_id, __u32 fd_type, 63 + char *buf, __u64 probe_offset, __u64 probe_addr) 64 + { 65 + jsonw_start_object(json_wtr); 66 + jsonw_int_field(json_wtr, "pid", pid); 67 + jsonw_int_field(json_wtr, "fd", fd); 68 + jsonw_uint_field(json_wtr, "prog_id", prog_id); 69 + switch (fd_type) { 70 + case BPF_FD_TYPE_RAW_TRACEPOINT: 71 + jsonw_string_field(json_wtr, "fd_type", "raw_tracepoint"); 72 + jsonw_string_field(json_wtr, "tracepoint", buf); 
73 + break; 74 + case BPF_FD_TYPE_TRACEPOINT: 75 + jsonw_string_field(json_wtr, "fd_type", "tracepoint"); 76 + jsonw_string_field(json_wtr, "tracepoint", buf); 77 + break; 78 + case BPF_FD_TYPE_KPROBE: 79 + jsonw_string_field(json_wtr, "fd_type", "kprobe"); 80 + if (buf[0] != '\0') { 81 + jsonw_string_field(json_wtr, "func", buf); 82 + jsonw_lluint_field(json_wtr, "offset", probe_offset); 83 + } else { 84 + jsonw_lluint_field(json_wtr, "addr", probe_addr); 85 + } 86 + break; 87 + case BPF_FD_TYPE_KRETPROBE: 88 + jsonw_string_field(json_wtr, "fd_type", "kretprobe"); 89 + if (buf[0] != '\0') { 90 + jsonw_string_field(json_wtr, "func", buf); 91 + jsonw_lluint_field(json_wtr, "offset", probe_offset); 92 + } else { 93 + jsonw_lluint_field(json_wtr, "addr", probe_addr); 94 + } 95 + break; 96 + case BPF_FD_TYPE_UPROBE: 97 + jsonw_string_field(json_wtr, "fd_type", "uprobe"); 98 + jsonw_string_field(json_wtr, "filename", buf); 99 + jsonw_lluint_field(json_wtr, "offset", probe_offset); 100 + break; 101 + case BPF_FD_TYPE_URETPROBE: 102 + jsonw_string_field(json_wtr, "fd_type", "uretprobe"); 103 + jsonw_string_field(json_wtr, "filename", buf); 104 + jsonw_lluint_field(json_wtr, "offset", probe_offset); 105 + break; 106 + } 107 + jsonw_end_object(json_wtr); 108 + } 109 + 110 + static void print_perf_plain(int pid, int fd, __u32 prog_id, __u32 fd_type, 111 + char *buf, __u64 probe_offset, __u64 probe_addr) 112 + { 113 + printf("pid %d fd %d: prog_id %u ", pid, fd, prog_id); 114 + switch (fd_type) { 115 + case BPF_FD_TYPE_RAW_TRACEPOINT: 116 + printf("raw_tracepoint %s\n", buf); 117 + break; 118 + case BPF_FD_TYPE_TRACEPOINT: 119 + printf("tracepoint %s\n", buf); 120 + break; 121 + case BPF_FD_TYPE_KPROBE: 122 + if (buf[0] != '\0') 123 + printf("kprobe func %s offset %llu\n", buf, 124 + probe_offset); 125 + else 126 + printf("kprobe addr %llu\n", probe_addr); 127 + break; 128 + case BPF_FD_TYPE_KRETPROBE: 129 + if (buf[0] != '\0') 130 + printf("kretprobe func %s offset %llu\n", 
buf, 131 + probe_offset); 132 + else 133 + printf("kretprobe addr %llu\n", probe_addr); 134 + break; 135 + case BPF_FD_TYPE_UPROBE: 136 + printf("uprobe filename %s offset %llu\n", buf, probe_offset); 137 + break; 138 + case BPF_FD_TYPE_URETPROBE: 139 + printf("uretprobe filename %s offset %llu\n", buf, 140 + probe_offset); 141 + break; 142 + } 143 + } 144 + 145 + static int show_proc(const char *fpath, const struct stat *sb, 146 + int tflag, struct FTW *ftwbuf) 147 + { 148 + __u64 probe_offset, probe_addr; 149 + __u32 len, prog_id, fd_type; 150 + int err, pid = 0, fd = 0; 151 + const char *pch; 152 + char buf[4096]; 153 + 154 + /* prefix always /proc */ 155 + pch = fpath + 5; 156 + if (*pch == '\0') 157 + return 0; 158 + 159 + /* pid should be all numbers */ 160 + pch++; 161 + while (isdigit(*pch)) { 162 + pid = pid * 10 + *pch - '0'; 163 + pch++; 164 + } 165 + if (*pch == '\0') 166 + return 0; 167 + if (*pch != '/') 168 + return FTW_SKIP_SUBTREE; 169 + 170 + /* check /proc/<pid>/fd directory */ 171 + pch++; 172 + if (strncmp(pch, "fd", 2)) 173 + return FTW_SKIP_SUBTREE; 174 + pch += 2; 175 + if (*pch == '\0') 176 + return 0; 177 + if (*pch != '/') 178 + return FTW_SKIP_SUBTREE; 179 + 180 + /* check /proc/<pid>/fd/<fd_num> */ 181 + pch++; 182 + while (isdigit(*pch)) { 183 + fd = fd * 10 + *pch - '0'; 184 + pch++; 185 + } 186 + if (*pch != '\0') 187 + return FTW_SKIP_SUBTREE; 188 + 189 + /* query (pid, fd) for potential perf events */ 190 + len = sizeof(buf); 191 + err = bpf_task_fd_query(pid, fd, 0, buf, &len, &prog_id, &fd_type, 192 + &probe_offset, &probe_addr); 193 + if (err < 0) 194 + return 0; 195 + 196 + if (json_output) 197 + print_perf_json(pid, fd, prog_id, fd_type, buf, probe_offset, 198 + probe_addr); 199 + else 200 + print_perf_plain(pid, fd, prog_id, fd_type, buf, probe_offset, 201 + probe_addr); 202 + 203 + return 0; 204 + } 205 + 206 + static int do_show(int argc, char **argv) 207 + { 208 + int flags = FTW_ACTIONRETVAL | FTW_PHYS; 209 + int err = 0, 
nopenfd = 16; 210 + 211 + if (!has_perf_query_support()) 212 + return -1; 213 + 214 + if (json_output) 215 + jsonw_start_array(json_wtr); 216 + if (nftw("/proc", show_proc, nopenfd, flags) == -1) { 217 + p_err("%s", strerror(errno)); 218 + err = -1; 219 + } 220 + if (json_output) 221 + jsonw_end_array(json_wtr); 222 + 223 + return err; 224 + } 225 + 226 + static int do_help(int argc, char **argv) 227 + { 228 + fprintf(stderr, 229 + "Usage: %s %s { show | list | help }\n" 230 + "", 231 + bin_name, argv[-2]); 232 + 233 + return 0; 234 + } 235 + 236 + static const struct cmd cmds[] = { 237 + { "show", do_show }, 238 + { "list", do_show }, 239 + { "help", do_help }, 240 + { 0 } 241 + }; 242 + 243 + int do_perf(int argc, char **argv) 244 + { 245 + return cmd_select(cmds, argc, argv, do_help); 246 + }
+96 -1
tools/bpf/bpftool/prog.c
··· 420 420 421 421 static int do_dump(int argc, char **argv) 422 422 { 423 + unsigned long *func_ksyms = NULL; 423 424 struct bpf_prog_info info = {}; 425 + unsigned int *func_lens = NULL; 426 + unsigned int nr_func_ksyms; 427 + unsigned int nr_func_lens; 424 428 struct dump_data dd = {}; 425 429 __u32 len = sizeof(info); 426 430 unsigned int buf_size; ··· 500 496 return -1; 501 497 } 502 498 499 + nr_func_ksyms = info.nr_jited_ksyms; 500 + if (nr_func_ksyms) { 501 + func_ksyms = malloc(nr_func_ksyms * sizeof(__u64)); 502 + if (!func_ksyms) { 503 + p_err("mem alloc failed"); 504 + close(fd); 505 + goto err_free; 506 + } 507 + } 508 + 509 + nr_func_lens = info.nr_jited_func_lens; 510 + if (nr_func_lens) { 511 + func_lens = malloc(nr_func_lens * sizeof(__u32)); 512 + if (!func_lens) { 513 + p_err("mem alloc failed"); 514 + close(fd); 515 + goto err_free; 516 + } 517 + } 518 + 503 519 memset(&info, 0, sizeof(info)); 504 520 505 521 *member_ptr = ptr_to_u64(buf); 506 522 *member_len = buf_size; 523 + info.jited_ksyms = ptr_to_u64(func_ksyms); 524 + info.nr_jited_ksyms = nr_func_ksyms; 525 + info.jited_func_lens = ptr_to_u64(func_lens); 526 + info.nr_jited_func_lens = nr_func_lens; 507 527 508 528 err = bpf_obj_get_info_by_fd(fd, &info, &len); 509 529 close(fd); ··· 538 510 539 511 if (*member_len > buf_size) { 540 512 p_err("too many instructions returned"); 513 + goto err_free; 514 + } 515 + 516 + if (info.nr_jited_ksyms > nr_func_ksyms) { 517 + p_err("too many addresses returned"); 518 + goto err_free; 519 + } 520 + 521 + if (info.nr_jited_func_lens > nr_func_lens) { 522 + p_err("too many values returned"); 541 523 goto err_free; 542 524 } 543 525 ··· 588 550 goto err_free; 589 551 } 590 552 591 - disasm_print_insn(buf, *member_len, opcodes, name); 553 + if (info.nr_jited_func_lens && info.jited_func_lens) { 554 + struct kernel_sym *sym = NULL; 555 + char sym_name[SYM_MAX_NAME]; 556 + unsigned char *img = buf; 557 + __u64 *ksyms = NULL; 558 + __u32 *lens; 559 + 
__u32 i; 560 + 561 + if (info.nr_jited_ksyms) { 562 + kernel_syms_load(&dd); 563 + ksyms = (__u64 *) info.jited_ksyms; 564 + } 565 + 566 + if (json_output) 567 + jsonw_start_array(json_wtr); 568 + 569 + lens = (__u32 *) info.jited_func_lens; 570 + for (i = 0; i < info.nr_jited_func_lens; i++) { 571 + if (ksyms) { 572 + sym = kernel_syms_search(&dd, ksyms[i]); 573 + if (sym) 574 + sprintf(sym_name, "%s", sym->name); 575 + else 576 + sprintf(sym_name, "0x%016llx", ksyms[i]); 577 + } else { 578 + strcpy(sym_name, "unknown"); 579 + } 580 + 581 + if (json_output) { 582 + jsonw_start_object(json_wtr); 583 + jsonw_name(json_wtr, "name"); 584 + jsonw_string(json_wtr, sym_name); 585 + jsonw_name(json_wtr, "insns"); 586 + } else { 587 + printf("%s:\n", sym_name); 588 + } 589 + 590 + disasm_print_insn(img, lens[i], opcodes, name); 591 + img += lens[i]; 592 + 593 + if (json_output) 594 + jsonw_end_object(json_wtr); 595 + else 596 + printf("\n"); 597 + } 598 + 599 + if (json_output) 600 + jsonw_end_array(json_wtr); 601 + } else { 602 + disasm_print_insn(buf, *member_len, opcodes, name); 603 + } 592 604 } else if (visual) { 593 605 if (json_output) 594 606 jsonw_null(json_wtr); ··· 646 558 dump_xlated_cfg(buf, *member_len); 647 559 } else { 648 560 kernel_syms_load(&dd); 561 + dd.nr_jited_ksyms = info.nr_jited_ksyms; 562 + dd.jited_ksyms = (__u64 *) info.jited_ksyms; 563 + 649 564 if (json_output) 650 565 dump_xlated_json(&dd, buf, *member_len, opcodes); 651 566 else ··· 657 566 } 658 567 659 568 free(buf); 569 + free(func_ksyms); 570 + free(func_lens); 660 571 return 0; 661 572 662 573 err_free: 663 574 free(buf); 575 + free(func_ksyms); 576 + free(func_lens); 664 577 return -1; 665 578 } 666 579
+11 -3
tools/bpf/bpftool/xlated_dumper.c
··· 102 102 free(dd->sym_mapping); 103 103 } 104 104 105 - static struct kernel_sym *kernel_syms_search(struct dump_data *dd, 106 - unsigned long key) 105 + struct kernel_sym *kernel_syms_search(struct dump_data *dd, 106 + unsigned long key) 107 107 { 108 108 struct kernel_sym sym = { 109 109 .address = key, ··· 174 174 unsigned long address, 175 175 const struct bpf_insn *insn) 176 176 { 177 - if (sym) 177 + if (!dd->nr_jited_ksyms) 178 + /* Do not show address for interpreted programs */ 179 + snprintf(dd->scratch_buff, sizeof(dd->scratch_buff), 180 + "%+d", insn->off); 181 + else if (sym) 178 182 snprintf(dd->scratch_buff, sizeof(dd->scratch_buff), 179 183 "%+d#%s", insn->off, sym->name); 180 184 else ··· 206 202 struct dump_data *dd = private_data; 207 203 unsigned long address = dd->address_call_base + insn->imm; 208 204 struct kernel_sym *sym; 205 + 206 + if (insn->src_reg == BPF_PSEUDO_CALL && 207 + (__u32) insn->imm < dd->nr_jited_ksyms) 208 + address = dd->jited_ksyms[insn->imm]; 209 209 210 210 sym = kernel_syms_search(dd, address); 211 211 if (insn->src_reg == BPF_PSEUDO_CALL)
+3
tools/bpf/bpftool/xlated_dumper.h
··· 49 49 unsigned long address_call_base; 50 50 struct kernel_sym *sym_mapping; 51 51 __u32 sym_count; 52 + __u64 *jited_ksyms; 53 + __u32 nr_jited_ksyms; 52 54 char scratch_buff[SYM_MAX_NAME + 8]; 53 55 }; 54 56 55 57 void kernel_syms_load(struct dump_data *dd); 56 58 void kernel_syms_destroy(struct dump_data *dd); 59 + struct kernel_sym *kernel_syms_search(struct dump_data *dd, unsigned long key); 57 60 void dump_xlated_json(struct dump_data *dd, void *buf, unsigned int len, 58 61 bool opcodes); 59 62 void dump_xlated_plain(struct dump_data *dd, void *buf, unsigned int len,
+138 -5
tools/include/uapi/linux/bpf.h
··· 97 97 BPF_RAW_TRACEPOINT_OPEN, 98 98 BPF_BTF_LOAD, 99 99 BPF_BTF_GET_FD_BY_ID, 100 + BPF_TASK_FD_QUERY, 100 101 }; 101 102 102 103 enum bpf_map_type { ··· 142 141 BPF_PROG_TYPE_SK_MSG, 143 142 BPF_PROG_TYPE_RAW_TRACEPOINT, 144 143 BPF_PROG_TYPE_CGROUP_SOCK_ADDR, 144 + BPF_PROG_TYPE_LWT_SEG6LOCAL, 145 145 }; 146 146 147 147 enum bpf_attach_type { ··· 286 284 char map_name[BPF_OBJ_NAME_LEN]; 287 285 __u32 map_ifindex; /* ifindex of netdev to create on */ 288 286 __u32 btf_fd; /* fd pointing to a BTF type data */ 289 - __u32 btf_key_id; /* BTF type_id of the key */ 290 - __u32 btf_value_id; /* BTF type_id of the value */ 287 + __u32 btf_key_type_id; /* BTF type_id of the key */ 288 + __u32 btf_value_type_id; /* BTF type_id of the value */ 291 289 }; 292 290 293 291 struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */ ··· 381 379 __u32 btf_log_size; 382 380 __u32 btf_log_level; 383 381 }; 382 + 383 + struct { 384 + __u32 pid; /* input: pid */ 385 + __u32 fd; /* input: fd */ 386 + __u32 flags; /* input: flags */ 387 + __u32 buf_len; /* input/output: buf len */ 388 + __aligned_u64 buf; /* input/output: 389 + * tp_name for tracepoint 390 + * symbol for kprobe 391 + * filename for uprobe 392 + */ 393 + __u32 prog_id; /* output: prod_id */ 394 + __u32 fd_type; /* output: BPF_FD_TYPE_* */ 395 + __u64 probe_offset; /* output: probe_offset */ 396 + __u64 probe_addr; /* output: probe_addr */ 397 + } task_fd_query; 384 398 } __attribute__((aligned(8))); 385 399 386 400 /* The description below is an attempt at providing documentation to eBPF ··· 1920 1902 * egress otherwise). This is the only flag supported for now. 1921 1903 * Return 1922 1904 * **SK_PASS** on success, or **SK_DROP** on error. 1905 + * 1906 + * int bpf_lwt_push_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len) 1907 + * Description 1908 + * Encapsulate the packet associated to *skb* within a Layer 3 1909 + * protocol header. 
This header is provided in the buffer at 1910 + * address *hdr*, with *len* its size in bytes. *type* indicates 1911 + * the protocol of the header and can be one of: 1912 + * 1913 + * **BPF_LWT_ENCAP_SEG6** 1914 + * IPv6 encapsulation with Segment Routing Header 1915 + * (**struct ipv6_sr_hdr**). *hdr* only contains the SRH, 1916 + * the IPv6 header is computed by the kernel. 1917 + * **BPF_LWT_ENCAP_SEG6_INLINE** 1918 + * Only works if *skb* contains an IPv6 packet. Insert a 1919 + * Segment Routing Header (**struct ipv6_sr_hdr**) inside 1920 + * the IPv6 header. 1921 + * 1922 + * A call to this helper is susceptible to change the underlaying 1923 + * packet buffer. Therefore, at load time, all checks on pointers 1924 + * previously done by the verifier are invalidated and must be 1925 + * performed again, if the helper is used in combination with 1926 + * direct packet access. 1927 + * Return 1928 + * 0 on success, or a negative error in case of failure. 1929 + * 1930 + * int bpf_lwt_seg6_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len) 1931 + * Description 1932 + * Store *len* bytes from address *from* into the packet 1933 + * associated to *skb*, at *offset*. Only the flags, tag and TLVs 1934 + * inside the outermost IPv6 Segment Routing Header can be 1935 + * modified through this helper. 1936 + * 1937 + * A call to this helper is susceptible to change the underlaying 1938 + * packet buffer. Therefore, at load time, all checks on pointers 1939 + * previously done by the verifier are invalidated and must be 1940 + * performed again, if the helper is used in combination with 1941 + * direct packet access. 1942 + * Return 1943 + * 0 on success, or a negative error in case of failure. 
1944 + * 1945 + * int bpf_lwt_seg6_adjust_srh(struct sk_buff *skb, u32 offset, s32 delta) 1946 + * Description 1947 + * Adjust the size allocated to TLVs in the outermost IPv6 1948 + * Segment Routing Header contained in the packet associated to 1949 + * *skb*, at position *offset* by *delta* bytes. Only offsets 1950 + * after the segments are accepted. *delta* can be as well 1951 + * positive (growing) as negative (shrinking). 1952 + * 1953 + * A call to this helper is susceptible to change the underlaying 1954 + * packet buffer. Therefore, at load time, all checks on pointers 1955 + * previously done by the verifier are invalidated and must be 1956 + * performed again, if the helper is used in combination with 1957 + * direct packet access. 1958 + * Return 1959 + * 0 on success, or a negative error in case of failure. 1960 + * 1961 + * int bpf_lwt_seg6_action(struct sk_buff *skb, u32 action, void *param, u32 param_len) 1962 + * Description 1963 + * Apply an IPv6 Segment Routing action of type *action* to the 1964 + * packet associated to *skb*. Each action takes a parameter 1965 + * contained at address *param*, and of length *param_len* bytes. 1966 + * *action* can be one of: 1967 + * 1968 + * **SEG6_LOCAL_ACTION_END_X** 1969 + * End.X action: Endpoint with Layer-3 cross-connect. 1970 + * Type of *param*: **struct in6_addr**. 1971 + * **SEG6_LOCAL_ACTION_END_T** 1972 + * End.T action: Endpoint with specific IPv6 table lookup. 1973 + * Type of *param*: **int**. 1974 + * **SEG6_LOCAL_ACTION_END_B6** 1975 + * End.B6 action: Endpoint bound to an SRv6 policy. 1976 + * Type of param: **struct ipv6_sr_hdr**. 1977 + * **SEG6_LOCAL_ACTION_END_B6_ENCAP** 1978 + * End.B6.Encap action: Endpoint bound to an SRv6 1979 + * encapsulation policy. 1980 + * Type of param: **struct ipv6_sr_hdr**. 1981 + * 1982 + * A call to this helper is susceptible to change the underlaying 1983 + * packet buffer. 
Therefore, at load time, all checks on pointers 1984 + * previously done by the verifier are invalidated and must be 1985 + * performed again, if the helper is used in combination with 1986 + * direct packet access. 1987 + * Return 1988 + * 0 on success, or a negative error in case of failure. 1923 1989 */ 1924 1990 #define __BPF_FUNC_MAPPER(FN) \ 1925 1991 FN(unspec), \ ··· 2078 1976 FN(fib_lookup), \ 2079 1977 FN(sock_hash_update), \ 2080 1978 FN(msg_redirect_hash), \ 2081 - FN(sk_redirect_hash), 1979 + FN(sk_redirect_hash), \ 1980 + FN(lwt_push_encap), \ 1981 + FN(lwt_seg6_store_bytes), \ 1982 + FN(lwt_seg6_adjust_srh), \ 1983 + FN(lwt_seg6_action), 2082 1984 2083 1985 /* integer value in 'imm' field of BPF_CALL instruction selects which helper 2084 1986 * function eBPF program intends to call ··· 2147 2041 enum bpf_hdr_start_off { 2148 2042 BPF_HDR_START_MAC, 2149 2043 BPF_HDR_START_NET, 2044 + }; 2045 + 2046 + /* Encapsulation type for BPF_FUNC_lwt_push_encap helper. */ 2047 + enum bpf_lwt_encap_mode { 2048 + BPF_LWT_ENCAP_SEG6, 2049 + BPF_LWT_ENCAP_SEG6_INLINE 2150 2050 }; 2151 2051 2152 2052 /* user accessible mirror of in-kernel sk_buff. 
··· 2288 2176 struct sk_msg_md { 2289 2177 void *data; 2290 2178 void *data_end; 2179 + 2180 + __u32 family; 2181 + __u32 remote_ip4; /* Stored in network byte order */ 2182 + __u32 local_ip4; /* Stored in network byte order */ 2183 + __u32 remote_ip6[4]; /* Stored in network byte order */ 2184 + __u32 local_ip6[4]; /* Stored in network byte order */ 2185 + __u32 remote_port; /* Stored in network byte order */ 2186 + __u32 local_port; /* stored in host byte order */ 2291 2187 }; 2292 2188 2293 2189 #define BPF_TAG_SIZE 8 ··· 2317 2197 __u32 gpl_compatible:1; 2318 2198 __u64 netns_dev; 2319 2199 __u64 netns_ino; 2200 + __u32 nr_jited_ksyms; 2201 + __u32 nr_jited_func_lens; 2202 + __aligned_u64 jited_ksyms; 2203 + __aligned_u64 jited_func_lens; 2320 2204 } __attribute__((aligned(8))); 2321 2205 2322 2206 struct bpf_map_info { ··· 2335 2211 __u64 netns_dev; 2336 2212 __u64 netns_ino; 2337 2213 __u32 btf_id; 2338 - __u32 btf_key_id; 2339 - __u32 btf_value_id; 2214 + __u32 btf_key_type_id; 2215 + __u32 btf_value_type_id; 2340 2216 } __attribute__((aligned(8))); 2341 2217 2342 2218 struct bpf_btf_info { ··· 2572 2448 __be16 h_vlan_TCI; 2573 2449 __u8 smac[6]; /* ETH_ALEN */ 2574 2450 __u8 dmac[6]; /* ETH_ALEN */ 2451 + }; 2452 + 2453 + enum bpf_task_fd_type { 2454 + BPF_FD_TYPE_RAW_TRACEPOINT, /* tp name */ 2455 + BPF_FD_TYPE_TRACEPOINT, /* tp name */ 2456 + BPF_FD_TYPE_KPROBE, /* (symbol + offset) or addr */ 2457 + BPF_FD_TYPE_KRETPROBE, /* (symbol + offset) or addr */ 2458 + BPF_FD_TYPE_UPROBE, /* filename + offset */ 2459 + BPF_FD_TYPE_URETPROBE, /* filename + offset */ 2575 2460 }; 2576 2461 2577 2462 #endif /* _UAPI__LINUX_BPF_H__ */
+11 -26
tools/include/uapi/linux/btf.h
··· 12 12 __u16 magic; 13 13 __u8 version; 14 14 __u8 flags; 15 - 16 - __u32 parent_label; 17 - __u32 parent_name; 15 + __u32 hdr_len; 18 16 19 17 /* All offsets are in bytes relative to the end of this header */ 20 - __u32 label_off; /* offset of label section */ 21 - __u32 object_off; /* offset of data object section*/ 22 - __u32 func_off; /* offset of function section */ 23 18 __u32 type_off; /* offset of type section */ 19 + __u32 type_len; /* length of type section */ 24 20 __u32 str_off; /* offset of string section */ 25 21 __u32 str_len; /* length of string section */ 26 22 }; 27 23 28 24 /* Max # of type identifier */ 29 - #define BTF_MAX_TYPE 0x7fffffff 25 + #define BTF_MAX_TYPE 0x0000ffff 30 26 /* Max offset into the string section */ 31 - #define BTF_MAX_NAME_OFFSET 0x7fffffff 27 + #define BTF_MAX_NAME_OFFSET 0x0000ffff 32 28 /* Max # of struct/union/enum members or func args */ 33 29 #define BTF_MAX_VLEN 0xffff 34 - 35 - /* The type id is referring to a parent BTF */ 36 - #define BTF_TYPE_PARENT(id) (((id) >> 31) & 0x1) 37 - #define BTF_TYPE_ID(id) ((id) & BTF_MAX_TYPE) 38 - 39 - /* String is in the ELF string section */ 40 - #define BTF_STR_TBL_ELF_ID(ref) (((ref) >> 31) & 0x1) 41 - #define BTF_STR_OFFSET(ref) ((ref) & BTF_MAX_NAME_OFFSET) 42 30 43 31 struct btf_type { 44 32 __u32 name_off; 45 33 /* "info" bits arrangement 46 34 * bits 0-15: vlen (e.g. # of struct's members) 47 35 * bits 16-23: unused 48 - * bits 24-28: kind (e.g. int, ptr, array...etc) 49 - * bits 29-30: unused 50 - * bits 31: root 36 + * bits 24-27: kind (e.g. int, ptr, array...etc) 37 + * bits 28-31: unused 51 38 */ 52 39 __u32 info; 53 40 /* "size" is used by INT, ENUM, STRUCT and UNION. 
··· 49 62 }; 50 63 }; 51 64 52 - #define BTF_INFO_KIND(info) (((info) >> 24) & 0x1f) 53 - #define BTF_INFO_ISROOT(info) (!!(((info) >> 24) & 0x80)) 65 + #define BTF_INFO_KIND(info) (((info) >> 24) & 0x0f) 54 66 #define BTF_INFO_VLEN(info) ((info) & 0xffff) 55 67 56 68 #define BTF_KIND_UNKN 0 /* Unknown */ ··· 74 88 /* BTF_KIND_INT is followed by a u32 and the following 75 89 * is the 32 bits arrangement: 76 90 */ 77 - #define BTF_INT_ENCODING(VAL) (((VAL) & 0xff000000) >> 24) 91 + #define BTF_INT_ENCODING(VAL) (((VAL) & 0x0f000000) >> 24) 78 92 #define BTF_INT_OFFSET(VAL) (((VAL & 0x00ff0000)) >> 16) 79 93 #define BTF_INT_BITS(VAL) ((VAL) & 0x0000ffff) 80 94 81 95 /* Attributes stored in the BTF_INT_ENCODING */ 82 - #define BTF_INT_SIGNED 0x1 83 - #define BTF_INT_CHAR 0x2 84 - #define BTF_INT_BOOL 0x4 85 - #define BTF_INT_VARARGS 0x8 96 + #define BTF_INT_SIGNED (1 << 0) 97 + #define BTF_INT_CHAR (1 << 1) 98 + #define BTF_INT_BOOL (1 << 2) 86 99 87 100 /* BTF_KIND_ENUM is followed by multiple "struct btf_enum". 88 101 * The exact number of btf_enum is stored in the vlen (of the
+25 -2
tools/lib/bpf/bpf.c
··· 89 89 min(name_len, BPF_OBJ_NAME_LEN - 1)); 90 90 attr.numa_node = create_attr->numa_node; 91 91 attr.btf_fd = create_attr->btf_fd; 92 - attr.btf_key_id = create_attr->btf_key_id; 93 - attr.btf_value_id = create_attr->btf_value_id; 92 + attr.btf_key_type_id = create_attr->btf_key_type_id; 93 + attr.btf_value_type_id = create_attr->btf_value_type_id; 94 94 attr.map_ifindex = create_attr->map_ifindex; 95 95 96 96 return sys_bpf(BPF_MAP_CREATE, &attr, sizeof(attr)); ··· 642 642 } 643 643 644 644 return fd; 645 + } 646 + 647 + int bpf_task_fd_query(int pid, int fd, __u32 flags, char *buf, __u32 *buf_len, 648 + __u32 *prog_id, __u32 *fd_type, __u64 *probe_offset, 649 + __u64 *probe_addr) 650 + { 651 + union bpf_attr attr = {}; 652 + int err; 653 + 654 + attr.task_fd_query.pid = pid; 655 + attr.task_fd_query.fd = fd; 656 + attr.task_fd_query.flags = flags; 657 + attr.task_fd_query.buf = ptr_to_u64(buf); 658 + attr.task_fd_query.buf_len = *buf_len; 659 + 660 + err = sys_bpf(BPF_TASK_FD_QUERY, &attr, sizeof(attr)); 661 + *buf_len = attr.task_fd_query.buf_len; 662 + *prog_id = attr.task_fd_query.prog_id; 663 + *fd_type = attr.task_fd_query.fd_type; 664 + *probe_offset = attr.task_fd_query.probe_offset; 665 + *probe_addr = attr.task_fd_query.probe_addr; 666 + 667 + return err; 645 668 }
+5 -2
tools/lib/bpf/bpf.h
··· 36 36 __u32 max_entries; 37 37 __u32 numa_node; 38 38 __u32 btf_fd; 39 - __u32 btf_key_id; 40 - __u32 btf_value_id; 39 + __u32 btf_key_type_id; 40 + __u32 btf_value_type_id; 41 41 __u32 map_ifindex; 42 42 }; 43 43 ··· 107 107 int bpf_raw_tracepoint_open(const char *name, int prog_fd); 108 108 int bpf_load_btf(void *btf, __u32 btf_size, char *log_buf, __u32 log_buf_size, 109 109 bool do_log); 110 + int bpf_task_fd_query(int pid, int fd, __u32 flags, char *buf, __u32 *buf_len, 111 + __u32 *prog_id, __u32 *fd_type, __u64 *probe_offset, 112 + __u64 *probe_addr); 110 113 #endif
+2 -3
tools/lib/bpf/btf.c
··· 35 35 36 36 static const char *btf_name_by_offset(const struct btf *btf, uint32_t offset) 37 37 { 38 - if (!BTF_STR_TBL_ELF_ID(offset) && 39 - BTF_STR_OFFSET(offset) < btf->hdr->str_len) 40 - return &btf->strings[BTF_STR_OFFSET(offset)]; 38 + if (offset < btf->hdr->str_len) 39 + return &btf->strings[offset]; 41 40 else 42 41 return NULL; 43 42 }
+22 -21
tools/lib/bpf/libbpf.c
··· 216 216 size_t offset; 217 217 int map_ifindex; 218 218 struct bpf_map_def def; 219 - uint32_t btf_key_id; 220 - uint32_t btf_value_id; 219 + uint32_t btf_key_type_id; 220 + uint32_t btf_value_type_id; 221 221 void *priv; 222 222 bpf_map_clear_priv_t clear_priv; 223 223 }; ··· 1042 1042 } 1043 1043 1044 1044 if (def->key_size != key_size) { 1045 - pr_warning("map:%s key_type:%s has BTF type_size:%ld != key_size:%u\n", 1046 - map->name, name, key_size, def->key_size); 1045 + pr_warning("map:%s key_type:%s has BTF type_size:%u != key_size:%u\n", 1046 + map->name, name, (unsigned int)key_size, def->key_size); 1047 1047 return -EINVAL; 1048 1048 } 1049 1049 ··· 1069 1069 } 1070 1070 1071 1071 if (def->value_size != value_size) { 1072 - pr_warning("map:%s value_type:%s has BTF type_size:%ld != value_size:%u\n", 1073 - map->name, name, value_size, def->value_size); 1072 + pr_warning("map:%s value_type:%s has BTF type_size:%u != value_size:%u\n", 1073 + map->name, name, (unsigned int)value_size, def->value_size); 1074 1074 return -EINVAL; 1075 1075 } 1076 1076 1077 - map->btf_key_id = key_id; 1078 - map->btf_value_id = value_id; 1077 + map->btf_key_type_id = key_id; 1078 + map->btf_value_type_id = value_id; 1079 1079 1080 1080 return 0; 1081 1081 } ··· 1100 1100 create_attr.value_size = def->value_size; 1101 1101 create_attr.max_entries = def->max_entries; 1102 1102 create_attr.btf_fd = 0; 1103 - create_attr.btf_key_id = 0; 1104 - create_attr.btf_value_id = 0; 1103 + create_attr.btf_key_type_id = 0; 1104 + create_attr.btf_value_type_id = 0; 1105 1105 1106 1106 if (obj->btf && !bpf_map_find_btf_info(map, obj->btf)) { 1107 1107 create_attr.btf_fd = btf__fd(obj->btf); 1108 - create_attr.btf_key_id = map->btf_key_id; 1109 - create_attr.btf_value_id = map->btf_value_id; 1108 + create_attr.btf_key_type_id = map->btf_key_type_id; 1109 + create_attr.btf_value_type_id = map->btf_value_type_id; 1110 1110 } 1111 1111 1112 1112 *pfd = bpf_create_map_xattr(&create_attr); 1113 - if 
(*pfd < 0 && create_attr.btf_key_id) { 1113 + if (*pfd < 0 && create_attr.btf_key_type_id) { 1114 1114 pr_warning("Error in bpf_create_map_xattr(%s):%s(%d). Retrying without BTF.\n", 1115 1115 map->name, strerror(errno), errno); 1116 1116 create_attr.btf_fd = 0; 1117 - create_attr.btf_key_id = 0; 1118 - create_attr.btf_value_id = 0; 1119 - map->btf_key_id = 0; 1120 - map->btf_value_id = 0; 1117 + create_attr.btf_key_type_id = 0; 1118 + create_attr.btf_value_type_id = 0; 1119 + map->btf_key_type_id = 0; 1120 + map->btf_value_type_id = 0; 1121 1121 *pfd = bpf_create_map_xattr(&create_attr); 1122 1122 } 1123 1123 ··· 1456 1456 case BPF_PROG_TYPE_LWT_IN: 1457 1457 case BPF_PROG_TYPE_LWT_OUT: 1458 1458 case BPF_PROG_TYPE_LWT_XMIT: 1459 + case BPF_PROG_TYPE_LWT_SEG6LOCAL: 1459 1460 case BPF_PROG_TYPE_SOCK_OPS: 1460 1461 case BPF_PROG_TYPE_SK_SKB: 1461 1462 case BPF_PROG_TYPE_CGROUP_DEVICE: ··· 2086 2085 return map ? map->name : NULL; 2087 2086 } 2088 2087 2089 - uint32_t bpf_map__btf_key_id(const struct bpf_map *map) 2088 + uint32_t bpf_map__btf_key_type_id(const struct bpf_map *map) 2090 2089 { 2091 - return map ? map->btf_key_id : 0; 2090 + return map ? map->btf_key_type_id : 0; 2092 2091 } 2093 2092 2094 - uint32_t bpf_map__btf_value_id(const struct bpf_map *map) 2093 + uint32_t bpf_map__btf_value_type_id(const struct bpf_map *map) 2095 2094 { 2096 - return map ? map->btf_value_id : 0; 2095 + return map ? map->btf_value_type_id : 0; 2097 2096 } 2098 2097 2099 2098 int bpf_map__set_priv(struct bpf_map *map, void *priv,
+2 -2
tools/lib/bpf/libbpf.h
··· 244 244 int bpf_map__fd(struct bpf_map *map); 245 245 const struct bpf_map_def *bpf_map__def(struct bpf_map *map); 246 246 const char *bpf_map__name(struct bpf_map *map); 247 - uint32_t bpf_map__btf_key_id(const struct bpf_map *map); 248 - uint32_t bpf_map__btf_value_id(const struct bpf_map *map); 247 + uint32_t bpf_map__btf_key_type_id(const struct bpf_map *map); 248 + uint32_t bpf_map__btf_value_type_id(const struct bpf_map *map); 249 249 250 250 typedef void (*bpf_map_clear_priv_t)(struct bpf_map *, void *); 251 251 int bpf_map__set_priv(struct bpf_map *map, void *priv,
+14 -2
tools/testing/selftests/bpf/Makefile
··· 33 33 sample_map_ret0.o test_tcpbpf_kern.o test_stacktrace_build_id.o \ 34 34 sockmap_tcp_msg_prog.o connect4_prog.o connect6_prog.o test_adjust_tail.o \ 35 35 test_btf_haskv.o test_btf_nokv.o test_sockmap_kern.o test_tunnel_kern.o \ 36 - test_get_stack_rawtp.o test_sockmap_kern.o test_sockhash_kern.o 36 + test_get_stack_rawtp.o test_sockmap_kern.o test_sockhash_kern.o \ 37 + test_lwt_seg6local.o 37 38 38 39 # Order correspond to 'make run_tests' order 39 40 TEST_PROGS := test_kmod.sh \ ··· 43 42 test_xdp_meta.sh \ 44 43 test_offload.py \ 45 44 test_sock_addr.sh \ 46 - test_tunnel.sh 45 + test_tunnel.sh \ 46 + test_lwt_seg6local.sh 47 47 48 48 # Compile but not part of 'make run_tests' 49 49 TEST_GEN_PROGS_EXTENDED = test_libbpf_open test_sock_addr ··· 86 84 CPU ?= generic 87 85 endif 88 86 87 + # Get Clang's default includes on this system, as opposed to those seen by 88 + # '-target bpf'. This fixes "missing" files on some architectures/distros, 89 + # such as asm/byteorder.h, asm/socket.h, asm/sockios.h, sys/cdefs.h etc. 90 + # 91 + # Use '-idirafter': Don't interfere with include mechanics except where the 92 + # build would have failed anyways. 93 + CLANG_SYS_INCLUDES := $(shell $(CLANG) -v -E - </dev/null 2>&1 \ 94 + | sed -n '/<...> search starts here:/,/End of search list./{ s| \(/.*\)|-idirafter \1|p }') 95 + 89 96 CLANG_FLAGS = -I. -I./include/uapi -I../../../include/uapi \ 97 + $(CLANG_SYS_INCLUDES) \ 90 98 -Wno-compare-distinct-pointer-types 91 99 92 100 $(OUTPUT)/test_l4lb_noinline.o: CLANG_FLAGS += -fno-inline
+12
tools/testing/selftests/bpf/bpf_helpers.h
··· 114 114 static int (*bpf_fib_lookup)(void *ctx, struct bpf_fib_lookup *params, 115 115 int plen, __u32 flags) = 116 116 (void *) BPF_FUNC_fib_lookup; 117 + static int (*bpf_lwt_push_encap)(void *ctx, unsigned int type, void *hdr, 118 + unsigned int len) = 119 + (void *) BPF_FUNC_lwt_push_encap; 120 + static int (*bpf_lwt_seg6_store_bytes)(void *ctx, unsigned int offset, 121 + void *from, unsigned int len) = 122 + (void *) BPF_FUNC_lwt_seg6_store_bytes; 123 + static int (*bpf_lwt_seg6_action)(void *ctx, unsigned int action, void *param, 124 + unsigned int param_len) = 125 + (void *) BPF_FUNC_lwt_seg6_action; 126 + static int (*bpf_lwt_seg6_adjust_srh)(void *ctx, unsigned int offset, 127 + unsigned int len) = 128 + (void *) BPF_FUNC_lwt_seg6_adjust_srh; 117 129 118 130 /* llvm builtin functions that eBPF C program may use to 119 131 * emit BPF_LD_ABS and BPF_LD_IND instructions
+431 -90
tools/testing/selftests/bpf/test_btf.c
··· 113 113 static struct btf_header hdr_tmpl = { 114 114 .magic = BTF_MAGIC, 115 115 .version = BTF_VERSION, 116 + .hdr_len = sizeof(struct btf_header), 116 117 }; 117 118 118 119 struct btf_raw_test { 119 120 const char *descr; 120 121 const char *str_sec; 121 122 const char *map_name; 123 + const char *err_str; 122 124 __u32 raw_types[MAX_NR_RAW_TYPES]; 123 125 __u32 str_sec_size; 124 126 enum bpf_map_type map_type; 125 127 __u32 key_size; 126 128 __u32 value_size; 127 - __u32 key_id; 128 - __u32 value_id; 129 + __u32 key_type_id; 130 + __u32 value_type_id; 129 131 __u32 max_entries; 130 132 bool btf_load_err; 131 133 bool map_create_err; 134 + int hdr_len_delta; 132 135 int type_off_delta; 133 136 int str_off_delta; 134 137 int str_len_delta; ··· 144 141 * }; 145 142 * 146 143 * struct A { 147 - * int m; 148 - * unsigned long long n; 144 + * unsigned long long m; 145 + * int n; 149 146 * char o; 150 147 * [3 bytes hole] 151 148 * int p[8]; ··· 166 163 BTF_TYPE_ARRAY_ENC(1, 1, 8), /* [4] */ 167 164 /* struct A { */ /* [5] */ 168 165 BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 6), 180), 169 - BTF_MEMBER_ENC(NAME_TBD, 1, 0), /* int m; */ 170 - BTF_MEMBER_ENC(NAME_TBD, 2, 32),/* unsigned long long n;*/ 166 + BTF_MEMBER_ENC(NAME_TBD, 2, 0), /* unsigned long long m;*/ 167 + BTF_MEMBER_ENC(NAME_TBD, 1, 64),/* int n; */ 171 168 BTF_MEMBER_ENC(NAME_TBD, 3, 96),/* char o; */ 172 169 BTF_MEMBER_ENC(NAME_TBD, 4, 128),/* int p[8] */ 173 170 BTF_MEMBER_ENC(NAME_TBD, 6, 384),/* int q[4][8] */ ··· 175 172 /* } */ 176 173 /* int[4][8] */ 177 174 BTF_TYPE_ARRAY_ENC(4, 1, 4), /* [6] */ 175 + /* enum E */ /* [7] */ 178 176 BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_ENUM, 0, 2), sizeof(int)), 179 177 BTF_ENUM_ENC(NAME_TBD, 0), 180 178 BTF_ENUM_ENC(NAME_TBD, 1), ··· 187 183 .map_name = "struct_test1_map", 188 184 .key_size = sizeof(int), 189 185 .value_size = 180, 190 - .key_id = 1, 191 - .value_id = 5, 186 + .key_type_id = 1, 187 + .value_type_id = 5, 192 188 
.max_entries = 4, 193 189 }, 194 190 ··· 242 238 .map_name = "struct_test2_map", 243 239 .key_size = sizeof(int), 244 240 .value_size = 68, 245 - .key_id = 1, 246 - .value_id = 3, 241 + .key_type_id = 1, 242 + .value_type_id = 3, 247 243 .max_entries = 4, 248 244 }, 249 245 ··· 262 258 /* struct A { */ /* [2] */ 263 259 BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 2), sizeof(int) * 2 - 1), 264 260 BTF_MEMBER_ENC(NAME_TBD, 1, 0), /* int m; */ 265 - BTF_MEMBER_ENC(NAME_TBD, 2, 32),/* int n; */ 261 + BTF_MEMBER_ENC(NAME_TBD, 1, 32),/* int n; */ 266 262 /* } */ 267 263 BTF_END_RAW, 268 264 }, ··· 272 268 .map_name = "size_check1_map", 273 269 .key_size = sizeof(int), 274 270 .value_size = 1, 275 - .key_id = 1, 276 - .value_id = 2, 271 + .key_type_id = 1, 272 + .value_type_id = 2, 277 273 .max_entries = 4, 278 274 .btf_load_err = true, 275 + .err_str = "Member exceeds struct_size", 279 276 }, 280 277 281 278 /* Test member exeeds the size of struct ··· 306 301 .map_name = "size_check2_map", 307 302 .key_size = sizeof(int), 308 303 .value_size = 1, 309 - .key_id = 1, 310 - .value_id = 3, 304 + .key_type_id = 1, 305 + .value_type_id = 3, 311 306 .max_entries = 4, 312 307 .btf_load_err = true, 313 - 308 + .err_str = "Member exceeds struct_size", 314 309 }, 315 310 316 311 /* Test member exeeds the size of struct ··· 340 335 .map_name = "size_check3_map", 341 336 .key_size = sizeof(int), 342 337 .value_size = 1, 343 - .key_id = 1, 344 - .value_id = 3, 338 + .key_type_id = 1, 339 + .value_type_id = 3, 345 340 .max_entries = 4, 346 341 .btf_load_err = true, 342 + .err_str = "Member exceeds struct_size", 347 343 }, 348 344 349 345 /* Test member exceeds the size of struct ··· 382 376 .map_name = "size_check4_map", 383 377 .key_size = sizeof(int), 384 378 .value_size = 1, 385 - .key_id = 1, 386 - .value_id = 3, 379 + .key_type_id = 1, 380 + .value_type_id = 3, 387 381 .max_entries = 4, 388 382 .btf_load_err = true, 383 + .err_str = "Member exceeds struct_size", 389 
384 }, 390 385 391 386 /* typedef const void * const_void_ptr; ··· 418 411 .map_name = "void_test1_map", 419 412 .key_size = sizeof(int), 420 413 .value_size = sizeof(void *), 421 - .key_id = 1, 422 - .value_id = 4, 414 + .key_type_id = 1, 415 + .value_type_id = 4, 423 416 .max_entries = 4, 424 417 }, 425 418 ··· 447 440 .map_name = "void_test2_map", 448 441 .key_size = sizeof(int), 449 442 .value_size = sizeof(void *), 450 - .key_id = 1, 451 - .value_id = 3, 443 + .key_type_id = 1, 444 + .value_type_id = 3, 452 445 .max_entries = 4, 453 446 .btf_load_err = true, 447 + .err_str = "Invalid member", 454 448 }, 455 449 456 450 /* typedef const void * const_void_ptr; ··· 466 458 BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_CONST, 0, 0), 0), 467 459 /* const void* */ /* [3] */ 468 460 BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_PTR, 0, 0), 2), 469 - /* typedef const void * const_void_ptr */ 461 + /* typedef const void * const_void_ptr */ /* [4] */ 470 462 BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_PTR, 0, 0), 3), 471 - /* const_void_ptr[4] */ /* [4] */ 463 + /* const_void_ptr[4] */ /* [5] */ 472 464 BTF_TYPE_ARRAY_ENC(3, 1, 4), 473 465 BTF_END_RAW, 474 466 }, ··· 478 470 .map_name = "void_test3_map", 479 471 .key_size = sizeof(int), 480 472 .value_size = sizeof(void *) * 4, 481 - .key_id = 1, 482 - .value_id = 4, 473 + .key_type_id = 1, 474 + .value_type_id = 4, 483 475 .max_entries = 4, 484 476 }, 485 477 ··· 501 493 .map_name = "void_test4_map", 502 494 .key_size = sizeof(int), 503 495 .value_size = sizeof(void *) * 4, 504 - .key_id = 1, 505 - .value_id = 3, 496 + .key_type_id = 1, 497 + .value_type_id = 3, 506 498 .max_entries = 4, 507 499 .btf_load_err = true, 500 + .err_str = "Invalid elem", 508 501 }, 509 502 510 503 /* Array_A <------------------+ ··· 532 523 .map_name = "loop_test1_map", 533 524 .key_size = sizeof(int), 534 525 .value_size = sizeof(sizeof(int) * 8), 535 - .key_id = 1, 536 - .value_id = 2, 526 + .key_type_id = 1, 527 + .value_type_id = 2, 537 528 
.max_entries = 4, 538 529 .btf_load_err = true, 530 + .err_str = "Loop detected", 539 531 }, 540 532 541 533 /* typedef is _before_ the BTF type of Array_A and Array_B ··· 561 551 BTF_TYPE_ARRAY_ENC(2, 1, 8), /* [3] */ 562 552 /* Array_B */ 563 553 BTF_TYPE_ARRAY_ENC(3, 1, 8), /* [4] */ 564 - 565 554 BTF_END_RAW, 566 555 }, 567 556 .str_sec = "\0int_array\0", ··· 569 560 .map_name = "loop_test2_map", 570 561 .key_size = sizeof(int), 571 562 .value_size = sizeof(sizeof(int) * 8), 572 - .key_id = 1, 573 - .value_id = 2, 563 + .key_type_id = 1, 564 + .value_type_id = 2, 574 565 .max_entries = 4, 575 566 .btf_load_err = true, 567 + .err_str = "Loop detected", 576 568 }, 577 569 578 570 /* Array_A <------------------+ ··· 592 582 BTF_TYPE_ARRAY_ENC(3, 1, 8), 593 583 /* Array_B */ /* [3] */ 594 584 BTF_TYPE_ARRAY_ENC(2, 1, 8), 595 - 596 585 BTF_END_RAW, 597 586 }, 598 587 .str_sec = "", ··· 600 591 .map_name = "loop_test3_map", 601 592 .key_size = sizeof(int), 602 593 .value_size = sizeof(sizeof(int) * 8), 603 - .key_id = 1, 604 - .value_id = 2, 594 + .key_type_id = 1, 595 + .value_type_id = 2, 605 596 .max_entries = 4, 606 597 .btf_load_err = true, 598 + .err_str = "Loop detected", 607 599 }, 608 600 609 601 /* typedef is _between_ the BTF type of Array_A and Array_B ··· 637 627 .map_name = "loop_test4_map", 638 628 .key_size = sizeof(int), 639 629 .value_size = sizeof(sizeof(int) * 8), 640 - .key_id = 1, 641 - .value_id = 2, 630 + .key_type_id = 1, 631 + .value_type_id = 2, 642 632 .max_entries = 4, 643 633 .btf_load_err = true, 634 + .err_str = "Loop detected", 644 635 }, 645 636 646 637 /* typedef struct B Struct_B ··· 679 668 .map_name = "loop_test5_map", 680 669 .key_size = sizeof(int), 681 670 .value_size = 8, 682 - .key_id = 1, 683 - .value_id = 2, 671 + .key_type_id = 1, 672 + .value_type_id = 2, 684 673 .max_entries = 4, 685 674 .btf_load_err = true, 675 + .err_str = "Loop detected", 686 676 }, 687 677 688 678 /* struct A { ··· 709 697 .map_name = 
"loop_test6_map", 710 698 .key_size = sizeof(int), 711 699 .value_size = 8, 712 - .key_id = 1, 713 - .value_id = 2, 700 + .key_type_id = 1, 701 + .value_type_id = 2, 714 702 .max_entries = 4, 715 703 .btf_load_err = true, 704 + .err_str = "Loop detected", 716 705 }, 717 706 718 707 { ··· 737 724 .map_name = "loop_test7_map", 738 725 .key_size = sizeof(int), 739 726 .value_size = sizeof(void *), 740 - .key_id = 1, 741 - .value_id = 2, 727 + .key_type_id = 1, 728 + .value_type_id = 2, 742 729 .max_entries = 4, 743 730 .btf_load_err = true, 731 + .err_str = "Loop detected", 744 732 }, 745 733 746 734 { ··· 773 759 .map_name = "loop_test8_map", 774 760 .key_size = sizeof(int), 775 761 .value_size = sizeof(void *), 776 - .key_id = 1, 777 - .value_id = 2, 762 + .key_type_id = 1, 763 + .value_type_id = 2, 778 764 .max_entries = 4, 779 765 .btf_load_err = true, 766 + .err_str = "Loop detected", 780 767 }, 781 768 782 769 { 783 - .descr = "type_off == str_off", 770 + .descr = "string section does not end with null", 771 + .raw_types = { 772 + /* int */ /* [1] */ 773 + BTF_TYPE_INT_ENC(NAME_TBD, BTF_INT_SIGNED, 0, 32, 4), 774 + BTF_END_RAW, 775 + }, 776 + .str_sec = "\0int", 777 + .str_sec_size = sizeof("\0int") - 1, 778 + .map_type = BPF_MAP_TYPE_ARRAY, 779 + .map_name = "hdr_test_map", 780 + .key_size = sizeof(int), 781 + .value_size = sizeof(int), 782 + .key_type_id = 1, 783 + .value_type_id = 1, 784 + .max_entries = 4, 785 + .btf_load_err = true, 786 + .err_str = "Invalid string section", 787 + }, 788 + 789 + { 790 + .descr = "empty string section", 791 + .raw_types = { 792 + /* int */ /* [1] */ 793 + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), 794 + BTF_END_RAW, 795 + }, 796 + .str_sec = "", 797 + .str_sec_size = 0, 798 + .map_type = BPF_MAP_TYPE_ARRAY, 799 + .map_name = "hdr_test_map", 800 + .key_size = sizeof(int), 801 + .value_size = sizeof(int), 802 + .key_type_id = 1, 803 + .value_type_id = 1, 804 + .max_entries = 4, 805 + .btf_load_err = true, 806 + .err_str = 
"Invalid string section", 807 + }, 808 + 809 + { 810 + .descr = "empty type section", 811 + .raw_types = { 812 + BTF_END_RAW, 813 + }, 814 + .str_sec = "\0int", 815 + .str_sec_size = sizeof("\0int"), 816 + .map_type = BPF_MAP_TYPE_ARRAY, 817 + .map_name = "hdr_test_map", 818 + .key_size = sizeof(int), 819 + .value_size = sizeof(int), 820 + .key_type_id = 1, 821 + .value_type_id = 1, 822 + .max_entries = 4, 823 + .btf_load_err = true, 824 + .err_str = "No type found", 825 + }, 826 + 827 + { 828 + .descr = "btf_header test. Longer hdr_len", 784 829 .raw_types = { 785 830 /* int */ /* [1] */ 786 831 BTF_TYPE_INT_ENC(NAME_TBD, BTF_INT_SIGNED, 0, 32, 4), ··· 851 778 .map_name = "hdr_test_map", 852 779 .key_size = sizeof(int), 853 780 .value_size = sizeof(int), 854 - .key_id = 1, 855 - .value_id = 1, 781 + .key_type_id = 1, 782 + .value_type_id = 1, 856 783 .max_entries = 4, 857 784 .btf_load_err = true, 858 - .type_off_delta = sizeof(struct btf_type) + sizeof(int) + sizeof("\0int"), 785 + .hdr_len_delta = 4, 786 + .err_str = "Unsupported btf_header", 859 787 }, 860 788 861 789 { 862 - .descr = "Unaligned type_off", 790 + .descr = "btf_header test. Gap between hdr and type", 863 791 .raw_types = { 864 792 /* int */ /* [1] */ 865 793 BTF_TYPE_INT_ENC(NAME_TBD, BTF_INT_SIGNED, 0, 32, 4), ··· 872 798 .map_name = "hdr_test_map", 873 799 .key_size = sizeof(int), 874 800 .value_size = sizeof(int), 875 - .key_id = 1, 876 - .value_id = 1, 801 + .key_type_id = 1, 802 + .value_type_id = 1, 877 803 .max_entries = 4, 878 804 .btf_load_err = true, 879 - .type_off_delta = 1, 805 + .type_off_delta = 4, 806 + .err_str = "Unsupported section found", 880 807 }, 881 808 882 809 { 883 - .descr = "str_off beyonds btf size", 810 + .descr = "btf_header test. 
Gap between type and str", 884 811 .raw_types = { 885 812 /* int */ /* [1] */ 886 813 BTF_TYPE_INT_ENC(NAME_TBD, BTF_INT_SIGNED, 0, 32, 4), ··· 893 818 .map_name = "hdr_test_map", 894 819 .key_size = sizeof(int), 895 820 .value_size = sizeof(int), 896 - .key_id = 1, 897 - .value_id = 1, 821 + .key_type_id = 1, 822 + .value_type_id = 1, 898 823 .max_entries = 4, 899 824 .btf_load_err = true, 900 - .str_off_delta = sizeof("\0int") + 1, 825 + .str_off_delta = 4, 826 + .err_str = "Unsupported section found", 901 827 }, 902 828 903 829 { 904 - .descr = "str_len beyonds btf size", 830 + .descr = "btf_header test. Overlap between type and str", 905 831 .raw_types = { 906 832 /* int */ /* [1] */ 907 833 BTF_TYPE_INT_ENC(NAME_TBD, BTF_INT_SIGNED, 0, 32, 4), ··· 914 838 .map_name = "hdr_test_map", 915 839 .key_size = sizeof(int), 916 840 .value_size = sizeof(int), 917 - .key_id = 1, 918 - .value_id = 1, 841 + .key_type_id = 1, 842 + .value_type_id = 1, 919 843 .max_entries = 4, 920 844 .btf_load_err = true, 921 - .str_len_delta = 1, 845 + .str_off_delta = -4, 846 + .err_str = "Section overlap found", 922 847 }, 923 848 924 849 { 925 - .descr = "String section does not end with null", 850 + .descr = "btf_header test. Larger BTF size", 926 851 .raw_types = { 927 852 /* int */ /* [1] */ 928 853 BTF_TYPE_INT_ENC(NAME_TBD, BTF_INT_SIGNED, 0, 32, 4), ··· 935 858 .map_name = "hdr_test_map", 936 859 .key_size = sizeof(int), 937 860 .value_size = sizeof(int), 938 - .key_id = 1, 939 - .value_id = 1, 861 + .key_type_id = 1, 862 + .value_type_id = 1, 940 863 .max_entries = 4, 941 864 .btf_load_err = true, 942 - .str_len_delta = -1, 865 + .str_len_delta = -4, 866 + .err_str = "Unsupported section found", 943 867 }, 944 868 945 869 { 946 - .descr = "Empty string section", 870 + .descr = "btf_header test. 
Smaller BTF size", 947 871 .raw_types = { 948 872 /* int */ /* [1] */ 949 873 BTF_TYPE_INT_ENC(NAME_TBD, BTF_INT_SIGNED, 0, 32, 4), ··· 956 878 .map_name = "hdr_test_map", 957 879 .key_size = sizeof(int), 958 880 .value_size = sizeof(int), 959 - .key_id = 1, 960 - .value_id = 1, 881 + .key_type_id = 1, 882 + .value_type_id = 1, 961 883 .max_entries = 4, 962 884 .btf_load_err = true, 963 - .str_len_delta = 0 - (int)sizeof("\0int"), 885 + .str_len_delta = 4, 886 + .err_str = "Total section length too long", 887 + }, 888 + 889 + { 890 + .descr = "array test. index_type/elem_type \"int\"", 891 + .raw_types = { 892 + /* int */ /* [1] */ 893 + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), 894 + /* int[16] */ /* [2] */ 895 + BTF_TYPE_ARRAY_ENC(1, 1, 16), 896 + BTF_END_RAW, 897 + }, 898 + .str_sec = "", 899 + .str_sec_size = sizeof(""), 900 + .map_type = BPF_MAP_TYPE_ARRAY, 901 + .map_name = "array_test_map", 902 + .key_size = sizeof(int), 903 + .value_size = sizeof(int), 904 + .key_type_id = 1, 905 + .value_type_id = 1, 906 + .max_entries = 4, 907 + }, 908 + 909 + { 910 + .descr = "array test. index_type/elem_type \"const int\"", 911 + .raw_types = { 912 + /* int */ /* [1] */ 913 + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), 914 + /* int[16] */ /* [2] */ 915 + BTF_TYPE_ARRAY_ENC(3, 3, 16), 916 + /* CONST type_id=1 */ /* [3] */ 917 + BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_CONST, 0, 0), 1), 918 + BTF_END_RAW, 919 + }, 920 + .str_sec = "", 921 + .str_sec_size = sizeof(""), 922 + .map_type = BPF_MAP_TYPE_ARRAY, 923 + .map_name = "array_test_map", 924 + .key_size = sizeof(int), 925 + .value_size = sizeof(int), 926 + .key_type_id = 1, 927 + .value_type_id = 1, 928 + .max_entries = 4, 929 + }, 930 + 931 + { 932 + .descr = "array test. 
index_type \"const int:31\"", 933 + .raw_types = { 934 + /* int */ /* [1] */ 935 + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), 936 + /* int:31 */ /* [2] */ 937 + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 31, 4), 938 + /* int[16] */ /* [3] */ 939 + BTF_TYPE_ARRAY_ENC(1, 4, 16), 940 + /* CONST type_id=2 */ /* [4] */ 941 + BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_CONST, 0, 0), 2), 942 + BTF_END_RAW, 943 + }, 944 + .str_sec = "", 945 + .str_sec_size = sizeof(""), 946 + .map_type = BPF_MAP_TYPE_ARRAY, 947 + .map_name = "array_test_map", 948 + .key_size = sizeof(int), 949 + .value_size = sizeof(int), 950 + .key_type_id = 1, 951 + .value_type_id = 1, 952 + .max_entries = 4, 953 + .btf_load_err = true, 954 + .err_str = "Invalid index", 955 + }, 956 + 957 + { 958 + .descr = "array test. elem_type \"const int:31\"", 959 + .raw_types = { 960 + /* int */ /* [1] */ 961 + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), 962 + /* int:31 */ /* [2] */ 963 + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 31, 4), 964 + /* int[16] */ /* [3] */ 965 + BTF_TYPE_ARRAY_ENC(4, 1, 16), 966 + /* CONST type_id=2 */ /* [4] */ 967 + BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_CONST, 0, 0), 2), 968 + BTF_END_RAW, 969 + }, 970 + .str_sec = "", 971 + .str_sec_size = sizeof(""), 972 + .map_type = BPF_MAP_TYPE_ARRAY, 973 + .map_name = "array_test_map", 974 + .key_size = sizeof(int), 975 + .value_size = sizeof(int), 976 + .key_type_id = 1, 977 + .value_type_id = 1, 978 + .max_entries = 4, 979 + .btf_load_err = true, 980 + .err_str = "Invalid array of int", 981 + }, 982 + 983 + { 984 + .descr = "array test. 
index_type \"void\"", 985 + .raw_types = { 986 + /* int */ /* [1] */ 987 + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), 988 + /* int[16] */ /* [2] */ 989 + BTF_TYPE_ARRAY_ENC(1, 0, 16), 990 + BTF_END_RAW, 991 + }, 992 + .str_sec = "", 993 + .str_sec_size = sizeof(""), 994 + .map_type = BPF_MAP_TYPE_ARRAY, 995 + .map_name = "array_test_map", 996 + .key_size = sizeof(int), 997 + .value_size = sizeof(int), 998 + .key_type_id = 1, 999 + .value_type_id = 1, 1000 + .max_entries = 4, 1001 + .btf_load_err = true, 1002 + .err_str = "Invalid index", 1003 + }, 1004 + 1005 + { 1006 + .descr = "array test. index_type \"const void\"", 1007 + .raw_types = { 1008 + /* int */ /* [1] */ 1009 + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), 1010 + /* int[16] */ /* [2] */ 1011 + BTF_TYPE_ARRAY_ENC(1, 3, 16), 1012 + /* CONST type_id=0 (void) */ /* [3] */ 1013 + BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_CONST, 0, 0), 0), 1014 + BTF_END_RAW, 1015 + }, 1016 + .str_sec = "", 1017 + .str_sec_size = sizeof(""), 1018 + .map_type = BPF_MAP_TYPE_ARRAY, 1019 + .map_name = "array_test_map", 1020 + .key_size = sizeof(int), 1021 + .value_size = sizeof(int), 1022 + .key_type_id = 1, 1023 + .value_type_id = 1, 1024 + .max_entries = 4, 1025 + .btf_load_err = true, 1026 + .err_str = "Invalid index", 1027 + }, 1028 + 1029 + { 1030 + .descr = "array test. 
elem_type \"const void\"", 1031 + .raw_types = { 1032 + /* int */ /* [1] */ 1033 + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), 1034 + /* int[16] */ /* [2] */ 1035 + BTF_TYPE_ARRAY_ENC(3, 1, 16), 1036 + /* CONST type_id=0 (void) */ /* [3] */ 1037 + BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_CONST, 0, 0), 0), 1038 + BTF_END_RAW, 1039 + }, 1040 + .str_sec = "", 1041 + .str_sec_size = sizeof(""), 1042 + .map_type = BPF_MAP_TYPE_ARRAY, 1043 + .map_name = "array_test_map", 1044 + .key_size = sizeof(int), 1045 + .value_size = sizeof(int), 1046 + .key_type_id = 1, 1047 + .value_type_id = 1, 1048 + .max_entries = 4, 1049 + .btf_load_err = true, 1050 + .err_str = "Invalid elem", 1051 + }, 1052 + 1053 + { 1054 + .descr = "array test. elem_type \"const void *\"", 1055 + .raw_types = { 1056 + /* int */ /* [1] */ 1057 + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), 1058 + /* const void *[16] */ /* [2] */ 1059 + BTF_TYPE_ARRAY_ENC(3, 1, 16), 1060 + /* CONST type_id=4 */ /* [3] */ 1061 + BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_CONST, 0, 0), 4), 1062 + /* void* */ /* [4] */ 1063 + BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_PTR, 0, 0), 0), 1064 + BTF_END_RAW, 1065 + }, 1066 + .str_sec = "", 1067 + .str_sec_size = sizeof(""), 1068 + .map_type = BPF_MAP_TYPE_ARRAY, 1069 + .map_name = "array_test_map", 1070 + .key_size = sizeof(int), 1071 + .value_size = sizeof(int), 1072 + .key_type_id = 1, 1073 + .value_type_id = 1, 1074 + .max_entries = 4, 1075 + }, 1076 + 1077 + { 1078 + .descr = "array test. 
index_type \"const void *\"", 1079 + .raw_types = { 1080 + /* int */ /* [1] */ 1081 + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), 1082 + /* const void *[16] */ /* [2] */ 1083 + BTF_TYPE_ARRAY_ENC(3, 3, 16), 1084 + /* CONST type_id=4 */ /* [3] */ 1085 + BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_CONST, 0, 0), 4), 1086 + /* void* */ /* [4] */ 1087 + BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_PTR, 0, 0), 0), 1088 + BTF_END_RAW, 1089 + }, 1090 + .str_sec = "", 1091 + .str_sec_size = sizeof(""), 1092 + .map_type = BPF_MAP_TYPE_ARRAY, 1093 + .map_name = "array_test_map", 1094 + .key_size = sizeof(int), 1095 + .value_size = sizeof(int), 1096 + .key_type_id = 1, 1097 + .value_type_id = 1, 1098 + .max_entries = 4, 1099 + .btf_load_err = true, 1100 + .err_str = "Invalid index", 1101 + }, 1102 + 1103 + { 1104 + .descr = "int test. invalid int_data", 1105 + .raw_types = { 1106 + BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_INT, 0, 0), 4), 1107 + 0x10000000, 1108 + BTF_END_RAW, 1109 + }, 1110 + .str_sec = "", 1111 + .str_sec_size = sizeof(""), 1112 + .map_type = BPF_MAP_TYPE_ARRAY, 1113 + .map_name = "array_test_map", 1114 + .key_size = sizeof(int), 1115 + .value_size = sizeof(int), 1116 + .key_type_id = 1, 1117 + .value_type_id = 1, 1118 + .max_entries = 4, 1119 + .btf_load_err = true, 1120 + .err_str = "Invalid int_data", 1121 + }, 1122 + 1123 + { 1124 + .descr = "invalid BTF_INFO", 1125 + .raw_types = { 1126 + /* int */ /* [1] */ 1127 + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), 1128 + BTF_TYPE_ENC(0, 0x10000000, 4), 1129 + BTF_END_RAW, 1130 + }, 1131 + .str_sec = "", 1132 + .str_sec_size = sizeof(""), 1133 + .map_type = BPF_MAP_TYPE_ARRAY, 1134 + .map_name = "array_test_map", 1135 + .key_size = sizeof(int), 1136 + .value_size = sizeof(int), 1137 + .key_type_id = 1, 1138 + .value_type_id = 1, 1139 + .max_entries = 4, 1140 + .btf_load_err = true, 1141 + .err_str = "Invalid btf_info", 964 1142 }, 965 1143 966 1144 }; /* struct btf_raw_test raw_tests[] */ ··· 1285 951 memcpy(raw_btf + 
offset, str, str_sec_size); 1286 952 1287 953 ret_hdr = (struct btf_header *)raw_btf; 954 + ret_hdr->type_len = type_sec_size; 1288 955 ret_hdr->str_off = type_sec_size; 1289 956 ret_hdr->str_len = str_sec_size; 1290 957 ··· 1316 981 1317 982 hdr = raw_btf; 1318 983 984 + hdr->hdr_len = (int)hdr->hdr_len + test->hdr_len_delta; 1319 985 hdr->type_off = (int)hdr->type_off + test->type_off_delta; 1320 986 hdr->str_off = (int)hdr->str_off + test->str_off_delta; 1321 987 hdr->str_len = (int)hdr->str_len + test->str_len_delta; ··· 1328 992 free(raw_btf); 1329 993 1330 994 err = ((btf_fd == -1) != test->btf_load_err); 1331 - CHECK(err, "btf_fd:%d test->btf_load_err:%u", 1332 - btf_fd, test->btf_load_err); 995 + if (CHECK(err, "btf_fd:%d test->btf_load_err:%u", 996 + btf_fd, test->btf_load_err) || 997 + CHECK(test->err_str && !strstr(btf_log_buf, test->err_str), 998 + "expected err_str:%s", test->err_str)) { 999 + err = -1; 1000 + goto done; 1001 + } 1333 1002 1334 1003 if (err || btf_fd == -1) 1335 1004 goto done; ··· 1345 1004 create_attr.value_size = test->value_size; 1346 1005 create_attr.max_entries = test->max_entries; 1347 1006 create_attr.btf_fd = btf_fd; 1348 - create_attr.btf_key_id = test->key_id; 1349 - create_attr.btf_value_id = test->value_id; 1007 + create_attr.btf_key_type_id = test->key_type_id; 1008 + create_attr.btf_value_type_id = test->value_type_id; 1350 1009 1351 1010 map_fd = bpf_create_map_xattr(&create_attr); 1352 1011 ··· 1608 1267 create_attr.value_size = sizeof(unsigned int); 1609 1268 create_attr.max_entries = 4; 1610 1269 create_attr.btf_fd = btf_fd[0]; 1611 - create_attr.btf_key_id = 1; 1612 - create_attr.btf_value_id = 2; 1270 + create_attr.btf_key_type_id = 1; 1271 + create_attr.btf_value_type_id = 2; 1613 1272 1614 1273 map_fd = bpf_create_map_xattr(&create_attr); 1615 1274 if (CHECK(map_fd == -1, "errno:%d", errno)) { ··· 1620 1279 info_len = sizeof(map_info); 1621 1280 err = bpf_obj_get_info_by_fd(map_fd, &map_info, &info_len); 1622 
1281 if (CHECK(err || map_info.btf_id != info[0].id || 1623 - map_info.btf_key_id != 1 || map_info.btf_value_id != 2, 1624 - "err:%d errno:%d info.id:%u btf_id:%u btf_key_id:%u btf_value_id:%u", 1625 - err, errno, info[0].id, map_info.btf_id, map_info.btf_key_id, 1626 - map_info.btf_value_id)) { 1282 + map_info.btf_key_type_id != 1 || map_info.btf_value_type_id != 2, 1283 + "err:%d errno:%d info.id:%u btf_id:%u btf_key_type_id:%u btf_value_type_id:%u", 1284 + err, errno, info[0].id, map_info.btf_id, map_info.btf_key_type_id, 1285 + map_info.btf_value_type_id)) { 1627 1286 err = -1; 1628 1287 goto done; 1629 1288 } ··· 1883 1542 goto done; 1884 1543 } 1885 1544 1886 - err = (bpf_map__btf_key_id(map) == 0 || bpf_map__btf_value_id(map) == 0) 1545 + err = (bpf_map__btf_key_type_id(map) == 0 || bpf_map__btf_value_type_id(map) == 0) 1887 1546 != test->btf_kv_notfound; 1888 - if (CHECK(err, "btf_key_id:%u btf_value_id:%u test->btf_kv_notfound:%u", 1889 - bpf_map__btf_key_id(map), bpf_map__btf_value_id(map), 1547 + if (CHECK(err, "btf_key_type_id:%u btf_value_type_id:%u test->btf_kv_notfound:%u", 1548 + bpf_map__btf_key_type_id(map), bpf_map__btf_value_type_id(map), 1890 1549 test->btf_kv_notfound)) 1891 1550 goto done; 1892 1551 ··· 1956 1615 /* 28 bits */ /* [7] */ 1957 1616 BTF_TYPE_INT_ENC(0, 0, 0, 28, 4), 1958 1617 /* uint8_t[8] */ /* [8] */ 1959 - BTF_TYPE_ARRAY_ENC(9, 3, 8), 1618 + BTF_TYPE_ARRAY_ENC(9, 1, 8), 1960 1619 /* typedef unsigned char uint8_t */ /* [9] */ 1961 1620 BTF_TYPEDEF_ENC(NAME_TBD, 1), 1962 1621 /* typedef unsigned short uint16_t */ /* [10] */ ··· 1995 1654 .map_name = "pprint_test", 1996 1655 .key_size = sizeof(unsigned int), 1997 1656 .value_size = sizeof(struct pprint_mapv), 1998 - .key_id = 3, /* unsigned int */ 1999 - .value_id = 16, /* struct pprint_mapv */ 1657 + .key_type_id = 3, /* unsigned int */ 1658 + .value_type_id = 16, /* struct pprint_mapv */ 2000 1659 .max_entries = 128 * 1024, 2001 1660 }; 2002 1661 ··· 2053 1712 
create_attr.value_size = test->value_size; 2054 1713 create_attr.max_entries = test->max_entries; 2055 1714 create_attr.btf_fd = btf_fd; 2056 - create_attr.btf_key_id = test->key_id; 2057 - create_attr.btf_value_id = test->value_id; 1715 + create_attr.btf_key_type_id = test->key_type_id; 1716 + create_attr.btf_value_type_id = test->value_type_id; 2058 1717 2059 1718 map_fd = bpf_create_map_xattr(&create_attr); 2060 1719 if (CHECK(map_fd == -1, "errno:%d", errno)) {
+437
tools/testing/selftests/bpf/test_lwt_seg6local.c
··· 1 + #include <stddef.h> 2 + #include <inttypes.h> 3 + #include <errno.h> 4 + #include <linux/seg6_local.h> 5 + #include <linux/bpf.h> 6 + #include "bpf_helpers.h" 7 + #include "bpf_endian.h" 8 + 9 + #define bpf_printk(fmt, ...) \ 10 + ({ \ 11 + char ____fmt[] = fmt; \ 12 + bpf_trace_printk(____fmt, sizeof(____fmt), \ 13 + ##__VA_ARGS__); \ 14 + }) 15 + 16 + /* Packet parsing state machine helpers. */ 17 + #define cursor_advance(_cursor, _len) \ 18 + ({ void *_tmp = _cursor; _cursor += _len; _tmp; }) 19 + 20 + #define SR6_FLAG_ALERT (1 << 4) 21 + 22 + #define htonll(x) ((bpf_htonl(1)) == 1 ? (x) : ((uint64_t)bpf_htonl((x) & \ 23 + 0xFFFFFFFF) << 32) | bpf_htonl((x) >> 32)) 24 + #define ntohll(x) ((bpf_ntohl(1)) == 1 ? (x) : ((uint64_t)bpf_ntohl((x) & \ 25 + 0xFFFFFFFF) << 32) | bpf_ntohl((x) >> 32)) 26 + #define BPF_PACKET_HEADER __attribute__((packed)) 27 + 28 + struct ip6_t { 29 + unsigned int ver:4; 30 + unsigned int priority:8; 31 + unsigned int flow_label:20; 32 + unsigned short payload_len; 33 + unsigned char next_header; 34 + unsigned char hop_limit; 35 + unsigned long long src_hi; 36 + unsigned long long src_lo; 37 + unsigned long long dst_hi; 38 + unsigned long long dst_lo; 39 + } BPF_PACKET_HEADER; 40 + 41 + struct ip6_addr_t { 42 + unsigned long long hi; 43 + unsigned long long lo; 44 + } BPF_PACKET_HEADER; 45 + 46 + struct ip6_srh_t { 47 + unsigned char nexthdr; 48 + unsigned char hdrlen; 49 + unsigned char type; 50 + unsigned char segments_left; 51 + unsigned char first_segment; 52 + unsigned char flags; 53 + unsigned short tag; 54 + 55 + struct ip6_addr_t segments[0]; 56 + } BPF_PACKET_HEADER; 57 + 58 + struct sr6_tlv_t { 59 + unsigned char type; 60 + unsigned char len; 61 + unsigned char value[0]; 62 + } BPF_PACKET_HEADER; 63 + 64 + __attribute__((always_inline)) struct ip6_srh_t *get_srh(struct __sk_buff *skb) 65 + { 66 + void *cursor, *data_end; 67 + struct ip6_srh_t *srh; 68 + struct ip6_t *ip; 69 + uint8_t *ipver; 70 + 71 + data_end = (void 
*)(long)skb->data_end; 72 + cursor = (void *)(long)skb->data; 73 + ipver = (uint8_t *)cursor; 74 + 75 + if ((void *)ipver + sizeof(*ipver) > data_end) 76 + return NULL; 77 + 78 + if ((*ipver >> 4) != 6) 79 + return NULL; 80 + 81 + ip = cursor_advance(cursor, sizeof(*ip)); 82 + if ((void *)ip + sizeof(*ip) > data_end) 83 + return NULL; 84 + 85 + if (ip->next_header != 43) 86 + return NULL; 87 + 88 + srh = cursor_advance(cursor, sizeof(*srh)); 89 + if ((void *)srh + sizeof(*srh) > data_end) 90 + return NULL; 91 + 92 + if (srh->type != 4) 93 + return NULL; 94 + 95 + return srh; 96 + } 97 + 98 + __attribute__((always_inline)) 99 + int update_tlv_pad(struct __sk_buff *skb, uint32_t new_pad, 100 + uint32_t old_pad, uint32_t pad_off) 101 + { 102 + int err; 103 + 104 + if (new_pad != old_pad) { 105 + err = bpf_lwt_seg6_adjust_srh(skb, pad_off, 106 + (int) new_pad - (int) old_pad); 107 + if (err) 108 + return err; 109 + } 110 + 111 + if (new_pad > 0) { 112 + char pad_tlv_buf[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 113 + 0, 0, 0}; 114 + struct sr6_tlv_t *pad_tlv = (struct sr6_tlv_t *) pad_tlv_buf; 115 + 116 + pad_tlv->type = SR6_TLV_PADDING; 117 + pad_tlv->len = new_pad - 2; 118 + 119 + err = bpf_lwt_seg6_store_bytes(skb, pad_off, 120 + (void *)pad_tlv_buf, new_pad); 121 + if (err) 122 + return err; 123 + } 124 + 125 + return 0; 126 + } 127 + 128 + __attribute__((always_inline)) 129 + int is_valid_tlv_boundary(struct __sk_buff *skb, struct ip6_srh_t *srh, 130 + uint32_t *tlv_off, uint32_t *pad_size, 131 + uint32_t *pad_off) 132 + { 133 + uint32_t srh_off, cur_off; 134 + int offset_valid = 0; 135 + int err; 136 + 137 + srh_off = (char *)srh - (char *)(long)skb->data; 138 + // cur_off = end of segments, start of possible TLVs 139 + cur_off = srh_off + sizeof(*srh) + 140 + sizeof(struct ip6_addr_t) * (srh->first_segment + 1); 141 + 142 + *pad_off = 0; 143 + 144 + // we can only go as far as ~10 TLVs due to the BPF max stack size 145 + #pragma clang loop unroll(full) 146 + 
for (int i = 0; i < 10; i++) { 147 + struct sr6_tlv_t tlv; 148 + 149 + if (cur_off == *tlv_off) 150 + offset_valid = 1; 151 + 152 + if (cur_off >= srh_off + ((srh->hdrlen + 1) << 3)) 153 + break; 154 + 155 + err = bpf_skb_load_bytes(skb, cur_off, &tlv, sizeof(tlv)); 156 + if (err) 157 + return err; 158 + 159 + if (tlv.type == SR6_TLV_PADDING) { 160 + *pad_size = tlv.len + sizeof(tlv); 161 + *pad_off = cur_off; 162 + 163 + if (*tlv_off == srh_off) { 164 + *tlv_off = cur_off; 165 + offset_valid = 1; 166 + } 167 + break; 168 + 169 + } else if (tlv.type == SR6_TLV_HMAC) { 170 + break; 171 + } 172 + 173 + cur_off += sizeof(tlv) + tlv.len; 174 + } // we reached the padding or HMAC TLVs, or the end of the SRH 175 + 176 + if (*pad_off == 0) 177 + *pad_off = cur_off; 178 + 179 + if (*tlv_off == -1) 180 + *tlv_off = cur_off; 181 + else if (!offset_valid) 182 + return -EINVAL; 183 + 184 + return 0; 185 + } 186 + 187 + __attribute__((always_inline)) 188 + int add_tlv(struct __sk_buff *skb, struct ip6_srh_t *srh, uint32_t tlv_off, 189 + struct sr6_tlv_t *itlv, uint8_t tlv_size) 190 + { 191 + uint32_t srh_off = (char *)srh - (char *)(long)skb->data; 192 + uint8_t len_remaining, new_pad; 193 + uint32_t pad_off = 0; 194 + uint32_t pad_size = 0; 195 + uint32_t partial_srh_len; 196 + int err; 197 + 198 + if (tlv_off != -1) 199 + tlv_off += srh_off; 200 + 201 + if (itlv->type == SR6_TLV_PADDING || itlv->type == SR6_TLV_HMAC) 202 + return -EINVAL; 203 + 204 + err = is_valid_tlv_boundary(skb, srh, &tlv_off, &pad_size, &pad_off); 205 + if (err) 206 + return err; 207 + 208 + err = bpf_lwt_seg6_adjust_srh(skb, tlv_off, sizeof(*itlv) + itlv->len); 209 + if (err) 210 + return err; 211 + 212 + err = bpf_lwt_seg6_store_bytes(skb, tlv_off, (void *)itlv, tlv_size); 213 + if (err) 214 + return err; 215 + 216 + // the following can't be moved inside update_tlv_pad because the 217 + // bpf verifier has some issues with it 218 + pad_off += sizeof(*itlv) + itlv->len; 219 + partial_srh_len = pad_off 
- srh_off; 220 + len_remaining = partial_srh_len % 8; 221 + new_pad = 8 - len_remaining; 222 + 223 + if (new_pad == 1) // cannot pad for 1 byte only 224 + new_pad = 9; 225 + else if (new_pad == 8) 226 + new_pad = 0; 227 + 228 + return update_tlv_pad(skb, new_pad, pad_size, pad_off); 229 + } 230 + 231 + __attribute__((always_inline)) 232 + int delete_tlv(struct __sk_buff *skb, struct ip6_srh_t *srh, 233 + uint32_t tlv_off) 234 + { 235 + uint32_t srh_off = (char *)srh - (char *)(long)skb->data; 236 + uint8_t len_remaining, new_pad; 237 + uint32_t partial_srh_len; 238 + uint32_t pad_off = 0; 239 + uint32_t pad_size = 0; 240 + struct sr6_tlv_t tlv; 241 + int err; 242 + 243 + tlv_off += srh_off; 244 + 245 + err = is_valid_tlv_boundary(skb, srh, &tlv_off, &pad_size, &pad_off); 246 + if (err) 247 + return err; 248 + 249 + err = bpf_skb_load_bytes(skb, tlv_off, &tlv, sizeof(tlv)); 250 + if (err) 251 + return err; 252 + 253 + err = bpf_lwt_seg6_adjust_srh(skb, tlv_off, -(sizeof(tlv) + tlv.len)); 254 + if (err) 255 + return err; 256 + 257 + pad_off -= sizeof(tlv) + tlv.len; 258 + partial_srh_len = pad_off - srh_off; 259 + len_remaining = partial_srh_len % 8; 260 + new_pad = 8 - len_remaining; 261 + if (new_pad == 1) // cannot pad for 1 byte only 262 + new_pad = 9; 263 + else if (new_pad == 8) 264 + new_pad = 0; 265 + 266 + return update_tlv_pad(skb, new_pad, pad_size, pad_off); 267 + } 268 + 269 + __attribute__((always_inline)) 270 + int has_egr_tlv(struct __sk_buff *skb, struct ip6_srh_t *srh) 271 + { 272 + int tlv_offset = sizeof(struct ip6_t) + sizeof(struct ip6_srh_t) + 273 + ((srh->first_segment + 1) << 4); 274 + struct sr6_tlv_t tlv; 275 + 276 + if (bpf_skb_load_bytes(skb, tlv_offset, &tlv, sizeof(struct sr6_tlv_t))) 277 + return 0; 278 + 279 + if (tlv.type == SR6_TLV_EGRESS && tlv.len == 18) { 280 + struct ip6_addr_t egr_addr; 281 + 282 + if (bpf_skb_load_bytes(skb, tlv_offset + 4, &egr_addr, 16)) 283 + return 0; 284 + 285 + // check if egress TLV value is correct 286 
+ if (ntohll(egr_addr.hi) == 0xfd00000000000000 && 287 + ntohll(egr_addr.lo) == 0x4) 288 + return 1; 289 + } 290 + 291 + return 0; 292 + } 293 + 294 + // This function will push a SRH with segments fd00::1, fd00::2, fd00::3, 295 + // fd00::4 296 + SEC("encap_srh") 297 + int __encap_srh(struct __sk_buff *skb) 298 + { 299 + unsigned long long hi = 0xfd00000000000000; 300 + struct ip6_addr_t *seg; 301 + struct ip6_srh_t *srh; 302 + char srh_buf[72]; // room for 4 segments 303 + int err; 304 + 305 + srh = (struct ip6_srh_t *)srh_buf; 306 + srh->nexthdr = 0; 307 + srh->hdrlen = 8; 308 + srh->type = 4; 309 + srh->segments_left = 3; 310 + srh->first_segment = 3; 311 + srh->flags = 0; 312 + srh->tag = 0; 313 + 314 + seg = (struct ip6_addr_t *)((char *)srh + sizeof(*srh)); 315 + 316 + #pragma clang loop unroll(full) 317 + for (unsigned long long lo = 0; lo < 4; lo++) { 318 + seg->lo = htonll(4 - lo); 319 + seg->hi = htonll(hi); 320 + seg = (struct ip6_addr_t *)((char *)seg + sizeof(*seg)); 321 + } 322 + 323 + err = bpf_lwt_push_encap(skb, 0, (void *)srh, sizeof(srh_buf)); 324 + if (err) 325 + return BPF_DROP; 326 + 327 + return BPF_REDIRECT; 328 + } 329 + 330 + // Add an Egress TLV fc00::4, add the flag A, 331 + // and apply End.X action to fc42::1 332 + SEC("add_egr_x") 333 + int __add_egr_x(struct __sk_buff *skb) 334 + { 335 + unsigned long long hi = 0xfc42000000000000; 336 + unsigned long long lo = 0x1; 337 + struct ip6_srh_t *srh = get_srh(skb); 338 + uint8_t new_flags = SR6_FLAG_ALERT; 339 + struct ip6_addr_t addr; 340 + int err, offset; 341 + 342 + if (srh == NULL) 343 + return BPF_DROP; 344 + 345 + uint8_t tlv[20] = {2, 18, 0, 0, 0xfd, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 346 + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x4}; 347 + 348 + err = add_tlv(skb, srh, (srh->hdrlen+1) << 3, 349 + (struct sr6_tlv_t *)&tlv, 20); 350 + if (err) 351 + return BPF_DROP; 352 + 353 + offset = sizeof(struct ip6_t) + offsetof(struct ip6_srh_t, flags); 354 + err = bpf_lwt_seg6_store_bytes(skb, 
offset, 355 + (void *)&new_flags, sizeof(new_flags)); 356 + if (err) 357 + return BPF_DROP; 358 + 359 + addr.lo = htonll(lo); 360 + addr.hi = htonll(hi); 361 + err = bpf_lwt_seg6_action(skb, SEG6_LOCAL_ACTION_END_X, 362 + (void *)&addr, sizeof(addr)); 363 + if (err) 364 + return BPF_DROP; 365 + return BPF_REDIRECT; 366 + } 367 + 368 + // Pop the Egress TLV, reset the flags, change the tag 2442 and finally do a 369 + // simple End action 370 + SEC("pop_egr") 371 + int __pop_egr(struct __sk_buff *skb) 372 + { 373 + struct ip6_srh_t *srh = get_srh(skb); 374 + uint16_t new_tag = bpf_htons(2442); 375 + uint8_t new_flags = 0; 376 + int err, offset; 377 + 378 + if (srh == NULL) 379 + return BPF_DROP; 380 + 381 + if (srh->flags != SR6_FLAG_ALERT) 382 + return BPF_DROP; 383 + 384 + if (srh->hdrlen != 11) // 4 segments + Egress TLV + Padding TLV 385 + return BPF_DROP; 386 + 387 + if (!has_egr_tlv(skb, srh)) 388 + return BPF_DROP; 389 + 390 + err = delete_tlv(skb, srh, 8 + (srh->first_segment + 1) * 16); 391 + if (err) 392 + return BPF_DROP; 393 + 394 + offset = sizeof(struct ip6_t) + offsetof(struct ip6_srh_t, flags); 395 + if (bpf_lwt_seg6_store_bytes(skb, offset, (void *)&new_flags, 396 + sizeof(new_flags))) 397 + return BPF_DROP; 398 + 399 + offset = sizeof(struct ip6_t) + offsetof(struct ip6_srh_t, tag); 400 + if (bpf_lwt_seg6_store_bytes(skb, offset, (void *)&new_tag, 401 + sizeof(new_tag))) 402 + return BPF_DROP; 403 + 404 + return BPF_OK; 405 + } 406 + 407 + // Inspect if the Egress TLV and flag have been removed, if the tag is correct, 408 + // then apply a End.T action to reach the last segment 409 + SEC("inspect_t") 410 + int __inspect_t(struct __sk_buff *skb) 411 + { 412 + struct ip6_srh_t *srh = get_srh(skb); 413 + int table = 117; 414 + int err; 415 + 416 + if (srh == NULL) 417 + return BPF_DROP; 418 + 419 + if (srh->flags != 0) 420 + return BPF_DROP; 421 + 422 + if (srh->tag != bpf_htons(2442)) 423 + return BPF_DROP; 424 + 425 + if (srh->hdrlen != 8) // 4 
segments 426 + return BPF_DROP; 427 + 428 + err = bpf_lwt_seg6_action(skb, SEG6_LOCAL_ACTION_END_T, 429 + (void *)&table, sizeof(table)); 430 + 431 + if (err) 432 + return BPF_DROP; 433 + 434 + return BPF_REDIRECT; 435 + } 436 + 437 + char __license[] SEC("license") = "GPL";
+140
tools/testing/selftests/bpf/test_lwt_seg6local.sh
#!/bin/bash
# Connects 6 network namespaces through veths.
# Each NS may have different IPv6 global scope addresses :
# NS1 ---- NS2 ---- NS3 ---- NS4 ---- NS5 ---- NS6
# fb00::1  fd00::1  fd00::2  fd00::3  fb00::6
#                   fc42::1           fd00::4
#
# All IPv6 packets going to fb00::/16 through NS2 will be encapsulated in a
# IPv6 header with a Segment Routing Header, with segments :
# 	fd00::1 -> fd00::2 -> fd00::3 -> fd00::4
#
# 3 fd00::/16 IPv6 addresses are binded to seg6local End.BPF actions :
# - fd00::1 : add a TLV, change the flags and apply a End.X action to fc42::1
# - fd00::2 : remove the TLV, change the flags, add a tag
# - fd00::3 : apply an End.T action to fd00::4, through routing table 117
#
# fd00::4 is a simple Segment Routing node decapsulating the inner IPv6 packet.
# Each End.BPF action will validate the operations applied on the SRH by the
# previous BPF program in the chain, otherwise the packet is dropped.
#
# An UDP datagram is sent from fb00::1 to fb00::6. The test succeeds if this
# datagram can be read on NS6 when binding to fb00::6.

TMP_FILE="/tmp/selftest_lwt_seg6local.txt"

cleanup()
{
	if [ "$?" = "0" ]; then
		echo "selftests: test_lwt_seg6local [PASS]";
	else
		echo "selftests: test_lwt_seg6local [FAILED]";
	fi

	set +e
	ip netns del ns1 2> /dev/null
	ip netns del ns2 2> /dev/null
	ip netns del ns3 2> /dev/null
	ip netns del ns4 2> /dev/null
	ip netns del ns5 2> /dev/null
	ip netns del ns6 2> /dev/null
	rm -f $TMP_FILE
}

set -e

ip netns add ns1
ip netns add ns2
ip netns add ns3
ip netns add ns4
ip netns add ns5
ip netns add ns6

# Fix: SIGKILL (9) cannot be trapped, so listing it was a no-op; trap
# EXIT (0), SIGINT (2), SIGQUIT (3) and SIGABRT (6) only.
trap cleanup 0 2 3 6

ip link add veth1 type veth peer name veth2
ip link add veth3 type veth peer name veth4
ip link add veth5 type veth peer name veth6
ip link add veth7 type veth peer name veth8
ip link add veth9 type veth peer name veth10

ip link set veth1 netns ns1
ip link set veth2 netns ns2
ip link set veth3 netns ns2
ip link set veth4 netns ns3
ip link set veth5 netns ns3
ip link set veth6 netns ns4
ip link set veth7 netns ns4
ip link set veth8 netns ns5
ip link set veth9 netns ns5
ip link set veth10 netns ns6

ip netns exec ns1 ip link set dev veth1 up
ip netns exec ns2 ip link set dev veth2 up
ip netns exec ns2 ip link set dev veth3 up
ip netns exec ns3 ip link set dev veth4 up
ip netns exec ns3 ip link set dev veth5 up
ip netns exec ns4 ip link set dev veth6 up
ip netns exec ns4 ip link set dev veth7 up
ip netns exec ns5 ip link set dev veth8 up
ip netns exec ns5 ip link set dev veth9 up
ip netns exec ns6 ip link set dev veth10 up
ip netns exec ns6 ip link set dev lo up

# All link scope addresses and routes required between veths
ip netns exec ns1 ip -6 addr add fb00::12/16 dev veth1 scope link
ip netns exec ns1 ip -6 route add fb00::21 dev veth1 scope link
ip netns exec ns2 ip -6 addr add fb00::21/16 dev veth2 scope link
ip netns exec ns2 ip -6 addr add fb00::34/16 dev veth3 scope link
ip netns exec ns2 ip -6 route add fb00::43 dev veth3 scope link
ip netns exec ns3 ip -6 route add fb00::65 dev veth5 scope link
ip netns exec ns3 ip -6 addr add fb00::43/16 dev veth4 scope link
ip netns exec ns3 ip -6 addr add fb00::56/16 dev veth5 scope link
ip netns exec ns4 ip -6 addr add fb00::65/16 dev veth6 scope link
ip netns exec ns4 ip -6 addr add fb00::78/16 dev veth7 scope link
ip netns exec ns4 ip -6 route add fb00::87 dev veth7 scope link
ip netns exec ns5 ip -6 addr add fb00::87/16 dev veth8 scope link
ip netns exec ns5 ip -6 addr add fb00::910/16 dev veth9 scope link
ip netns exec ns5 ip -6 route add fb00::109 dev veth9 scope link
ip netns exec ns5 ip -6 route add fb00::109 table 117 dev veth9 scope link
ip netns exec ns6 ip -6 addr add fb00::109/16 dev veth10 scope link

ip netns exec ns1 ip -6 addr add fb00::1/16 dev lo
ip netns exec ns1 ip -6 route add fb00::6 dev veth1 via fb00::21

ip netns exec ns2 ip -6 route add fb00::6 encap bpf in obj test_lwt_seg6local.o sec encap_srh dev veth2
ip netns exec ns2 ip -6 route add fd00::1 dev veth3 via fb00::43 scope link

ip netns exec ns3 ip -6 route add fc42::1 dev veth5 via fb00::65
ip netns exec ns3 ip -6 route add fd00::1 encap seg6local action End.BPF obj test_lwt_seg6local.o sec add_egr_x dev veth4

ip netns exec ns4 ip -6 route add fd00::2 encap seg6local action End.BPF obj test_lwt_seg6local.o sec pop_egr dev veth6
ip netns exec ns4 ip -6 addr add fc42::1 dev lo
ip netns exec ns4 ip -6 route add fd00::3 dev veth7 via fb00::87

ip netns exec ns5 ip -6 route add fd00::4 table 117 dev veth9 via fb00::109
ip netns exec ns5 ip -6 route add fd00::3 encap seg6local action End.BPF obj test_lwt_seg6local.o sec inspect_t dev veth8

ip netns exec ns6 ip -6 addr add fb00::6/16 dev lo
ip netns exec ns6 ip -6 addr add fd00::4/16 dev lo

ip netns exec ns1 sysctl net.ipv6.conf.all.forwarding=1 > /dev/null
ip netns exec ns2 sysctl net.ipv6.conf.all.forwarding=1 > /dev/null
ip netns exec ns3 sysctl net.ipv6.conf.all.forwarding=1 > /dev/null
ip netns exec ns4 sysctl net.ipv6.conf.all.forwarding=1 > /dev/null
ip netns exec ns5 sysctl net.ipv6.conf.all.forwarding=1 > /dev/null

ip netns exec ns6 sysctl net.ipv6.conf.all.seg6_enabled=1 > /dev/null
ip netns exec ns6 sysctl net.ipv6.conf.lo.seg6_enabled=1 > /dev/null
ip netns exec ns6 sysctl net.ipv6.conf.veth10.seg6_enabled=1 > /dev/null

ip netns exec ns6 nc -l -6 -u -d 7330 > $TMP_FILE &
ip netns exec ns1 bash -c "echo 'foobar' | nc -w0 -6 -u -p 2121 -s fb00::1 fb00::6 7330"
sleep 5 # wait enough time to ensure the UDP datagram arrived to the last segment
kill -INT $!

if [[ $(< $TMP_FILE) != "foobar" ]]; then
	exit 1
fi

exit 0
+158
tools/testing/selftests/bpf/test_progs.c
··· 1542 1542 bpf_object__close(obj); 1543 1543 } 1544 1544 1545 + static void test_task_fd_query_rawtp(void) 1546 + { 1547 + const char *file = "./test_get_stack_rawtp.o"; 1548 + __u64 probe_offset, probe_addr; 1549 + __u32 len, prog_id, fd_type; 1550 + struct bpf_object *obj; 1551 + int efd, err, prog_fd; 1552 + __u32 duration = 0; 1553 + char buf[256]; 1554 + 1555 + err = bpf_prog_load(file, BPF_PROG_TYPE_RAW_TRACEPOINT, &obj, &prog_fd); 1556 + if (CHECK(err, "prog_load raw tp", "err %d errno %d\n", err, errno)) 1557 + return; 1558 + 1559 + efd = bpf_raw_tracepoint_open("sys_enter", prog_fd); 1560 + if (CHECK(efd < 0, "raw_tp_open", "err %d errno %d\n", efd, errno)) 1561 + goto close_prog; 1562 + 1563 + /* query (getpid(), efd) */ 1564 + len = sizeof(buf); 1565 + err = bpf_task_fd_query(getpid(), efd, 0, buf, &len, &prog_id, 1566 + &fd_type, &probe_offset, &probe_addr); 1567 + if (CHECK(err < 0, "bpf_task_fd_query", "err %d errno %d\n", err, 1568 + errno)) 1569 + goto close_prog; 1570 + 1571 + err = fd_type == BPF_FD_TYPE_RAW_TRACEPOINT && 1572 + strcmp(buf, "sys_enter") == 0; 1573 + if (CHECK(!err, "check_results", "fd_type %d tp_name %s\n", 1574 + fd_type, buf)) 1575 + goto close_prog; 1576 + 1577 + /* test zero len */ 1578 + len = 0; 1579 + err = bpf_task_fd_query(getpid(), efd, 0, buf, &len, &prog_id, 1580 + &fd_type, &probe_offset, &probe_addr); 1581 + if (CHECK(err < 0, "bpf_task_fd_query (len = 0)", "err %d errno %d\n", 1582 + err, errno)) 1583 + goto close_prog; 1584 + err = fd_type == BPF_FD_TYPE_RAW_TRACEPOINT && 1585 + len == strlen("sys_enter"); 1586 + if (CHECK(!err, "check_results", "fd_type %d len %u\n", fd_type, len)) 1587 + goto close_prog; 1588 + 1589 + /* test empty buffer */ 1590 + len = sizeof(buf); 1591 + err = bpf_task_fd_query(getpid(), efd, 0, 0, &len, &prog_id, 1592 + &fd_type, &probe_offset, &probe_addr); 1593 + if (CHECK(err < 0, "bpf_task_fd_query (buf = 0)", "err %d errno %d\n", 1594 + err, errno)) 1595 + goto close_prog; 1596 + err 
= fd_type == BPF_FD_TYPE_RAW_TRACEPOINT && 1597 + len == strlen("sys_enter"); 1598 + if (CHECK(!err, "check_results", "fd_type %d len %u\n", fd_type, len)) 1599 + goto close_prog; 1600 + 1601 + /* test smaller buffer */ 1602 + len = 3; 1603 + err = bpf_task_fd_query(getpid(), efd, 0, buf, &len, &prog_id, 1604 + &fd_type, &probe_offset, &probe_addr); 1605 + if (CHECK(err >= 0 || errno != ENOSPC, "bpf_task_fd_query (len = 3)", 1606 + "err %d errno %d\n", err, errno)) 1607 + goto close_prog; 1608 + err = fd_type == BPF_FD_TYPE_RAW_TRACEPOINT && 1609 + len == strlen("sys_enter") && 1610 + strcmp(buf, "sy") == 0; 1611 + if (CHECK(!err, "check_results", "fd_type %d len %u\n", fd_type, len)) 1612 + goto close_prog; 1613 + 1614 + goto close_prog_noerr; 1615 + close_prog: 1616 + error_cnt++; 1617 + close_prog_noerr: 1618 + bpf_object__close(obj); 1619 + } 1620 + 1621 + static void test_task_fd_query_tp_core(const char *probe_name, 1622 + const char *tp_name) 1623 + { 1624 + const char *file = "./test_tracepoint.o"; 1625 + int err, bytes, efd, prog_fd, pmu_fd; 1626 + struct perf_event_attr attr = {}; 1627 + __u64 probe_offset, probe_addr; 1628 + __u32 len, prog_id, fd_type; 1629 + struct bpf_object *obj; 1630 + __u32 duration = 0; 1631 + char buf[256]; 1632 + 1633 + err = bpf_prog_load(file, BPF_PROG_TYPE_TRACEPOINT, &obj, &prog_fd); 1634 + if (CHECK(err, "bpf_prog_load", "err %d errno %d\n", err, errno)) 1635 + goto close_prog; 1636 + 1637 + snprintf(buf, sizeof(buf), 1638 + "/sys/kernel/debug/tracing/events/%s/id", probe_name); 1639 + efd = open(buf, O_RDONLY, 0); 1640 + if (CHECK(efd < 0, "open", "err %d errno %d\n", efd, errno)) 1641 + goto close_prog; 1642 + bytes = read(efd, buf, sizeof(buf)); 1643 + close(efd); 1644 + if (CHECK(bytes <= 0 || bytes >= sizeof(buf), "read", 1645 + "bytes %d errno %d\n", bytes, errno)) 1646 + goto close_prog; 1647 + 1648 + attr.config = strtol(buf, NULL, 0); 1649 + attr.type = PERF_TYPE_TRACEPOINT; 1650 + attr.sample_type = 
PERF_SAMPLE_RAW; 1651 + attr.sample_period = 1; 1652 + attr.wakeup_events = 1; 1653 + pmu_fd = syscall(__NR_perf_event_open, &attr, -1 /* pid */, 1654 + 0 /* cpu 0 */, -1 /* group id */, 1655 + 0 /* flags */); 1656 + if (CHECK(err, "perf_event_open", "err %d errno %d\n", err, errno)) 1657 + goto close_pmu; 1658 + 1659 + err = ioctl(pmu_fd, PERF_EVENT_IOC_ENABLE, 0); 1660 + if (CHECK(err, "perf_event_ioc_enable", "err %d errno %d\n", err, 1661 + errno)) 1662 + goto close_pmu; 1663 + 1664 + err = ioctl(pmu_fd, PERF_EVENT_IOC_SET_BPF, prog_fd); 1665 + if (CHECK(err, "perf_event_ioc_set_bpf", "err %d errno %d\n", err, 1666 + errno)) 1667 + goto close_pmu; 1668 + 1669 + /* query (getpid(), pmu_fd) */ 1670 + len = sizeof(buf); 1671 + err = bpf_task_fd_query(getpid(), pmu_fd, 0, buf, &len, &prog_id, 1672 + &fd_type, &probe_offset, &probe_addr); 1673 + if (CHECK(err < 0, "bpf_task_fd_query", "err %d errno %d\n", err, 1674 + errno)) 1675 + goto close_pmu; 1676 + 1677 + err = (fd_type == BPF_FD_TYPE_TRACEPOINT) && !strcmp(buf, tp_name); 1678 + if (CHECK(!err, "check_results", "fd_type %d tp_name %s\n", 1679 + fd_type, buf)) 1680 + goto close_pmu; 1681 + 1682 + close(pmu_fd); 1683 + goto close_prog_noerr; 1684 + 1685 + close_pmu: 1686 + close(pmu_fd); 1687 + close_prog: 1688 + error_cnt++; 1689 + close_prog_noerr: 1690 + bpf_object__close(obj); 1691 + } 1692 + 1693 + static void test_task_fd_query_tp(void) 1694 + { 1695 + test_task_fd_query_tp_core("sched/sched_switch", 1696 + "sched_switch"); 1697 + test_task_fd_query_tp_core("syscalls/sys_enter_read", 1698 + "sys_enter_read"); 1699 + } 1700 + 1545 1701 int main(void) 1546 1702 { 1547 1703 jit_enabled = is_jit_enabled(); ··· 1717 1561 test_stacktrace_build_id_nmi(); 1718 1562 test_stacktrace_map_raw_tp(); 1719 1563 test_get_stack_raw_tp(); 1564 + test_task_fd_query_rawtp(); 1565 + test_task_fd_query_tp(); 1720 1566 1721 1567 printf("Summary: %d PASSED, %d FAILED\n", pass_cnt, error_cnt); 1722 1568 return error_cnt ? 
EXIT_FAILURE : EXIT_SUCCESS;
+115
tools/testing/selftests/bpf/test_verifier.c
··· 1686 1686 .prog_type = BPF_PROG_TYPE_SK_SKB, 1687 1687 }, 1688 1688 { 1689 + "valid access family in SK_MSG", 1690 + .insns = { 1691 + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, 1692 + offsetof(struct sk_msg_md, family)), 1693 + BPF_EXIT_INSN(), 1694 + }, 1695 + .result = ACCEPT, 1696 + .prog_type = BPF_PROG_TYPE_SK_MSG, 1697 + }, 1698 + { 1699 + "valid access remote_ip4 in SK_MSG", 1700 + .insns = { 1701 + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, 1702 + offsetof(struct sk_msg_md, remote_ip4)), 1703 + BPF_EXIT_INSN(), 1704 + }, 1705 + .result = ACCEPT, 1706 + .prog_type = BPF_PROG_TYPE_SK_MSG, 1707 + }, 1708 + { 1709 + "valid access local_ip4 in SK_MSG", 1710 + .insns = { 1711 + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, 1712 + offsetof(struct sk_msg_md, local_ip4)), 1713 + BPF_EXIT_INSN(), 1714 + }, 1715 + .result = ACCEPT, 1716 + .prog_type = BPF_PROG_TYPE_SK_MSG, 1717 + }, 1718 + { 1719 + "valid access remote_port in SK_MSG", 1720 + .insns = { 1721 + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, 1722 + offsetof(struct sk_msg_md, remote_port)), 1723 + BPF_EXIT_INSN(), 1724 + }, 1725 + .result = ACCEPT, 1726 + .prog_type = BPF_PROG_TYPE_SK_MSG, 1727 + }, 1728 + { 1729 + "valid access local_port in SK_MSG", 1730 + .insns = { 1731 + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, 1732 + offsetof(struct sk_msg_md, local_port)), 1733 + BPF_EXIT_INSN(), 1734 + }, 1735 + .result = ACCEPT, 1736 + .prog_type = BPF_PROG_TYPE_SK_MSG, 1737 + }, 1738 + { 1739 + "valid access remote_ip6 in SK_MSG", 1740 + .insns = { 1741 + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, 1742 + offsetof(struct sk_msg_md, remote_ip6[0])), 1743 + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, 1744 + offsetof(struct sk_msg_md, remote_ip6[1])), 1745 + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, 1746 + offsetof(struct sk_msg_md, remote_ip6[2])), 1747 + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, 1748 + offsetof(struct sk_msg_md, remote_ip6[3])), 1749 + BPF_EXIT_INSN(), 1750 + }, 1751 + .result = ACCEPT, 1752 + .prog_type = 
BPF_PROG_TYPE_SK_SKB, 1753 + }, 1754 + { 1755 + "valid access local_ip6 in SK_MSG", 1756 + .insns = { 1757 + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, 1758 + offsetof(struct sk_msg_md, local_ip6[0])), 1759 + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, 1760 + offsetof(struct sk_msg_md, local_ip6[1])), 1761 + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, 1762 + offsetof(struct sk_msg_md, local_ip6[2])), 1763 + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, 1764 + offsetof(struct sk_msg_md, local_ip6[3])), 1765 + BPF_EXIT_INSN(), 1766 + }, 1767 + .result = ACCEPT, 1768 + .prog_type = BPF_PROG_TYPE_SK_SKB, 1769 + }, 1770 + { 1771 + "invalid 64B read of family in SK_MSG", 1772 + .insns = { 1773 + BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, 1774 + offsetof(struct sk_msg_md, family)), 1775 + BPF_EXIT_INSN(), 1776 + }, 1777 + .errstr = "invalid bpf_context access", 1778 + .result = REJECT, 1779 + .prog_type = BPF_PROG_TYPE_SK_MSG, 1780 + }, 1781 + { 1782 + "invalid read past end of SK_MSG", 1783 + .insns = { 1784 + BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, 1785 + offsetof(struct sk_msg_md, local_port) + 4), 1786 + BPF_EXIT_INSN(), 1787 + }, 1788 + .errstr = "R0 !read_ok", 1789 + .result = REJECT, 1790 + .prog_type = BPF_PROG_TYPE_SK_MSG, 1791 + }, 1792 + { 1793 + "invalid read offset in SK_MSG", 1794 + .insns = { 1795 + BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, 1796 + offsetof(struct sk_msg_md, family) + 1), 1797 + BPF_EXIT_INSN(), 1798 + }, 1799 + .errstr = "invalid bpf_context access", 1800 + .result = REJECT, 1801 + .prog_type = BPF_PROG_TYPE_SK_MSG, 1802 + }, 1803 + { 1689 1804 "direct packet read for SK_MSG", 1690 1805 .insns = { 1691 1806 BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1,
+12
tools/testing/selftests/bpf/trace_helpers.c
··· 72 72 return &syms[0]; 73 73 } 74 74 75 + long ksym_get_addr(const char *name) 76 + { 77 + int i; 78 + 79 + for (i = 0; i < sym_cnt; i++) { 80 + if (strcmp(syms[i].name, name) == 0) 81 + return syms[i].addr; 82 + } 83 + 84 + return 0; 85 + } 86 + 75 87 static int page_size; 76 88 static int page_cnt = 8; 77 89 static struct perf_event_mmap_page *header;
+1
tools/testing/selftests/bpf/trace_helpers.h
/* Load /proc/kallsyms into the in-memory symbol table. */
int load_kallsyms(void);
/* Find the symbol covering address 'key'. */
struct ksym *ksym_search(long key);
/* Return the address of the named symbol, or 0 if not found. */
long ksym_get_addr(const char *name);

/* Callback invoked for each record read from a perf ring buffer. */
typedef enum bpf_perf_event_ret (*perf_event_print_fn)(void *data, int size);