Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next

Alexei Starovoitov says:

====================
pull-request: bpf-next 2019-05-31

The following pull-request contains BPF updates for your *net-next* tree.

Lots of exciting new features in the first PR of this development cycle!
The main changes are:

1) misc verifier improvements, from Alexei.

2) bpftool can now convert btf to valid C, from Andrii.

3) verifier can insert explicit ZEXT insn when requested by 32-bit JITs.
This feature greatly improves BPF speed on 32-bit architectures. From Jiong.

4) cgroups will now auto-detach bpf programs. This fixes the issue of thousands
of bpf programs getting stuck in dying cgroups. From Roman.

5) new bpf_send_signal() helper, from Yonghong.

6) cgroup inet skb programs can signal CN to the stack, from Lawrence.

7) miscellaneous cleanups, from many developers.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>

+6433 -1016
+25 -5
Documentation/bpf/bpf_design_QA.rst
··· 172 172 CPU architectures and 32-bit HW accelerators. Can true 32-bit registers 173 173 be added to BPF in the future? 174 174 175 - A: NO. The first thing to improve performance on 32-bit archs is to teach 176 - LLVM to generate code that uses 32-bit subregisters. Then second step 177 - is to teach verifier to mark operations where zero-ing upper bits 178 - is unnecessary. Then JITs can take advantage of those markings and 179 - drastically reduce size of generated code and improve performance. 175 + A: NO. 176 + 177 + But some optimizations on zero-ing the upper 32 bits for BPF registers are 178 + available, and can be leveraged to improve the performance of JITed BPF 179 + programs for 32-bit architectures. 180 + 181 + Starting with version 7, LLVM is able to generate instructions that operate 182 + on 32-bit subregisters, provided the option -mattr=+alu32 is passed for 183 + compiling a program. Furthermore, the verifier can now mark the 184 + instructions for which zero-ing the upper bits of the destination register 185 + is required, and insert an explicit zero-extension (zext) instruction 186 + (a mov32 variant). This means that for architectures without zext hardware 187 + support, the JIT back-ends do not need to clear the upper bits for 188 + subregisters written by alu32 instructions or narrow loads. Instead, the 189 + back-ends simply need to support code generation for that mov32 variant, 190 + and to overwrite bpf_jit_needs_zext() to make it return "true" (in order to 191 + enable zext insertion in the verifier). 192 + 193 + Note that it is possible for a JIT back-end to have partial hardware 194 + support for zext. In that case, if verifier zext insertion is enabled, 195 + it could lead to the insertion of unnecessary zext instructions. 
Such 196 + instructions could be removed by creating a simple peephole inside the JIT 197 + back-end: if one instruction has hardware support for zext and if the next 198 + instruction is an explicit zext, then the latter can be skipped when doing 199 + the code generation. 180 200 181 201 Q: Does BPF have a stable ABI? 182 202 ------------------------------
+31 -11
arch/arm/net/bpf_jit_32.c
··· 736 736 737 737 /* ALU operation */ 738 738 emit_alu_r(rd[1], rs, true, false, op, ctx); 739 - emit_a32_mov_i(rd[0], 0, ctx); 739 + if (!ctx->prog->aux->verifier_zext) 740 + emit_a32_mov_i(rd[0], 0, ctx); 740 741 } 741 742 742 743 arm_bpf_put_reg64(dst, rd, ctx); ··· 759 758 struct jit_ctx *ctx) { 760 759 if (!is64) { 761 760 emit_a32_mov_r(dst_lo, src_lo, ctx); 762 - /* Zero out high 4 bytes */ 763 - emit_a32_mov_i(dst_hi, 0, ctx); 761 + if (!ctx->prog->aux->verifier_zext) 762 + /* Zero out high 4 bytes */ 763 + emit_a32_mov_i(dst_hi, 0, ctx); 764 764 } else if (__LINUX_ARM_ARCH__ < 6 && 765 765 ctx->cpu_architecture < CPU_ARCH_ARMv5TE) { 766 766 /* complete 8 byte move */ ··· 1062 1060 case BPF_B: 1063 1061 /* Load a Byte */ 1064 1062 emit(ARM_LDRB_I(rd[1], rm, off), ctx); 1065 - emit_a32_mov_i(rd[0], 0, ctx); 1063 + if (!ctx->prog->aux->verifier_zext) 1064 + emit_a32_mov_i(rd[0], 0, ctx); 1066 1065 break; 1067 1066 case BPF_H: 1068 1067 /* Load a HalfWord */ 1069 1068 emit(ARM_LDRH_I(rd[1], rm, off), ctx); 1070 - emit_a32_mov_i(rd[0], 0, ctx); 1069 + if (!ctx->prog->aux->verifier_zext) 1070 + emit_a32_mov_i(rd[0], 0, ctx); 1071 1071 break; 1072 1072 case BPF_W: 1073 1073 /* Load a Word */ 1074 1074 emit(ARM_LDR_I(rd[1], rm, off), ctx); 1075 - emit_a32_mov_i(rd[0], 0, ctx); 1075 + if (!ctx->prog->aux->verifier_zext) 1076 + emit_a32_mov_i(rd[0], 0, ctx); 1076 1077 break; 1077 1078 case BPF_DW: 1078 1079 /* Load a Double Word */ ··· 1364 1359 case BPF_ALU64 | BPF_MOV | BPF_X: 1365 1360 switch (BPF_SRC(code)) { 1366 1361 case BPF_X: 1362 + if (imm == 1) { 1363 + /* Special mov32 for zext */ 1364 + emit_a32_mov_i(dst_hi, 0, ctx); 1365 + break; 1366 + } 1367 1367 emit_a32_mov_r64(is64, dst, src, ctx); 1368 1368 break; 1369 1369 case BPF_K: ··· 1448 1438 } 1449 1439 emit_udivmod(rd_lo, rd_lo, rt, ctx, BPF_OP(code)); 1450 1440 arm_bpf_put_reg32(dst_lo, rd_lo, ctx); 1451 - emit_a32_mov_i(dst_hi, 0, ctx); 1441 + if (!ctx->prog->aux->verifier_zext) 1442 + 
emit_a32_mov_i(dst_hi, 0, ctx); 1452 1443 break; 1453 1444 case BPF_ALU64 | BPF_DIV | BPF_K: 1454 1445 case BPF_ALU64 | BPF_DIV | BPF_X: ··· 1464 1453 return -EINVAL; 1465 1454 if (imm) 1466 1455 emit_a32_alu_i(dst_lo, imm, ctx, BPF_OP(code)); 1467 - emit_a32_mov_i(dst_hi, 0, ctx); 1456 + if (!ctx->prog->aux->verifier_zext) 1457 + emit_a32_mov_i(dst_hi, 0, ctx); 1468 1458 break; 1469 1459 /* dst = dst << imm */ 1470 1460 case BPF_ALU64 | BPF_LSH | BPF_K: ··· 1500 1488 /* dst = ~dst */ 1501 1489 case BPF_ALU | BPF_NEG: 1502 1490 emit_a32_alu_i(dst_lo, 0, ctx, BPF_OP(code)); 1503 - emit_a32_mov_i(dst_hi, 0, ctx); 1491 + if (!ctx->prog->aux->verifier_zext) 1492 + emit_a32_mov_i(dst_hi, 0, ctx); 1504 1493 break; 1505 1494 /* dst = ~dst (64 bit) */ 1506 1495 case BPF_ALU64 | BPF_NEG: ··· 1557 1544 #else /* ARMv6+ */ 1558 1545 emit(ARM_UXTH(rd[1], rd[1]), ctx); 1559 1546 #endif 1560 - emit(ARM_EOR_R(rd[0], rd[0], rd[0]), ctx); 1547 + if (!ctx->prog->aux->verifier_zext) 1548 + emit(ARM_EOR_R(rd[0], rd[0], rd[0]), ctx); 1561 1549 break; 1562 1550 case 32: 1563 1551 /* zero-extend 32 bits into 64 bits */ 1564 - emit(ARM_EOR_R(rd[0], rd[0], rd[0]), ctx); 1552 + if (!ctx->prog->aux->verifier_zext) 1553 + emit(ARM_EOR_R(rd[0], rd[0], rd[0]), ctx); 1565 1554 break; 1566 1555 case 64: 1567 1556 /* nop */ ··· 1851 1836 void bpf_jit_compile(struct bpf_prog *prog) 1852 1837 { 1853 1838 /* Nothing to do here. We support Internal BPF. */ 1839 + } 1840 + 1841 + bool bpf_jit_needs_zext(void) 1842 + { 1843 + return true; 1854 1844 } 1855 1845 1856 1846 struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
+33 -3
arch/powerpc/net/bpf_jit_comp64.c
··· 504 504 case BPF_ALU | BPF_LSH | BPF_X: /* (u32) dst <<= (u32) src */ 505 505 /* slw clears top 32 bits */ 506 506 PPC_SLW(dst_reg, dst_reg, src_reg); 507 + /* skip zero extension move, but set address map. */ 508 + if (insn_is_zext(&insn[i + 1])) 509 + addrs[++i] = ctx->idx * 4; 507 510 break; 508 511 case BPF_ALU64 | BPF_LSH | BPF_X: /* dst <<= src; */ 509 512 PPC_SLD(dst_reg, dst_reg, src_reg); ··· 514 511 case BPF_ALU | BPF_LSH | BPF_K: /* (u32) dst <<== (u32) imm */ 515 512 /* with imm 0, we still need to clear top 32 bits */ 516 513 PPC_SLWI(dst_reg, dst_reg, imm); 514 + if (insn_is_zext(&insn[i + 1])) 515 + addrs[++i] = ctx->idx * 4; 517 516 break; 518 517 case BPF_ALU64 | BPF_LSH | BPF_K: /* dst <<== imm */ 519 518 if (imm != 0) ··· 523 518 break; 524 519 case BPF_ALU | BPF_RSH | BPF_X: /* (u32) dst >>= (u32) src */ 525 520 PPC_SRW(dst_reg, dst_reg, src_reg); 521 + if (insn_is_zext(&insn[i + 1])) 522 + addrs[++i] = ctx->idx * 4; 526 523 break; 527 524 case BPF_ALU64 | BPF_RSH | BPF_X: /* dst >>= src */ 528 525 PPC_SRD(dst_reg, dst_reg, src_reg); 529 526 break; 530 527 case BPF_ALU | BPF_RSH | BPF_K: /* (u32) dst >>= (u32) imm */ 531 528 PPC_SRWI(dst_reg, dst_reg, imm); 529 + if (insn_is_zext(&insn[i + 1])) 530 + addrs[++i] = ctx->idx * 4; 532 531 break; 533 532 case BPF_ALU64 | BPF_RSH | BPF_K: /* dst >>= imm */ 534 533 if (imm != 0) ··· 557 548 */ 558 549 case BPF_ALU | BPF_MOV | BPF_X: /* (u32) dst = src */ 559 550 case BPF_ALU64 | BPF_MOV | BPF_X: /* dst = src */ 551 + if (imm == 1) { 552 + /* special mov32 for zext */ 553 + PPC_RLWINM(dst_reg, dst_reg, 0, 0, 31); 554 + break; 555 + } 560 556 PPC_MR(dst_reg, src_reg); 561 557 goto bpf_alu32_trunc; 562 558 case BPF_ALU | BPF_MOV | BPF_K: /* (u32) dst = imm */ ··· 569 555 PPC_LI32(dst_reg, imm); 570 556 if (imm < 0) 571 557 goto bpf_alu32_trunc; 558 + else if (insn_is_zext(&insn[i + 1])) 559 + addrs[++i] = ctx->idx * 4; 572 560 break; 573 561 574 562 bpf_alu32_trunc: 575 563 /* Truncate to 32-bits */ 
576 - if (BPF_CLASS(code) == BPF_ALU) 564 + if (BPF_CLASS(code) == BPF_ALU && !fp->aux->verifier_zext) 577 565 PPC_RLWINM(dst_reg, dst_reg, 0, 0, 31); 578 566 break; 579 567 ··· 634 618 case 16: 635 619 /* zero-extend 16 bits into 64 bits */ 636 620 PPC_RLDICL(dst_reg, dst_reg, 0, 48); 621 + if (insn_is_zext(&insn[i + 1])) 622 + addrs[++i] = ctx->idx * 4; 637 623 break; 638 624 case 32: 639 - /* zero-extend 32 bits into 64 bits */ 640 - PPC_RLDICL(dst_reg, dst_reg, 0, 32); 625 + if (!fp->aux->verifier_zext) 626 + /* zero-extend 32 bits into 64 bits */ 627 + PPC_RLDICL(dst_reg, dst_reg, 0, 32); 641 628 break; 642 629 case 64: 643 630 /* nop */ ··· 717 698 /* dst = *(u8 *)(ul) (src + off) */ 718 699 case BPF_LDX | BPF_MEM | BPF_B: 719 700 PPC_LBZ(dst_reg, src_reg, off); 701 + if (insn_is_zext(&insn[i + 1])) 702 + addrs[++i] = ctx->idx * 4; 720 703 break; 721 704 /* dst = *(u16 *)(ul) (src + off) */ 722 705 case BPF_LDX | BPF_MEM | BPF_H: 723 706 PPC_LHZ(dst_reg, src_reg, off); 707 + if (insn_is_zext(&insn[i + 1])) 708 + addrs[++i] = ctx->idx * 4; 724 709 break; 725 710 /* dst = *(u32 *)(ul) (src + off) */ 726 711 case BPF_LDX | BPF_MEM | BPF_W: 727 712 PPC_LWZ(dst_reg, src_reg, off); 713 + if (insn_is_zext(&insn[i + 1])) 714 + addrs[++i] = ctx->idx * 4; 728 715 break; 729 716 /* dst = *(u64 *)(ul) (src + off) */ 730 717 case BPF_LDX | BPF_MEM | BPF_DW: ··· 1070 1045 u32 proglen; 1071 1046 struct codegen_context ctx; 1072 1047 }; 1048 + 1049 + bool bpf_jit_needs_zext(void) 1050 + { 1051 + return true; 1052 + } 1073 1053 1074 1054 struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp) 1075 1055 {
+30 -13
arch/riscv/net/bpf_jit_comp.c
··· 731 731 { 732 732 bool is64 = BPF_CLASS(insn->code) == BPF_ALU64 || 733 733 BPF_CLASS(insn->code) == BPF_JMP; 734 + struct bpf_prog_aux *aux = ctx->prog->aux; 734 735 int rvoff, i = insn - ctx->prog->insnsi; 735 736 u8 rd = -1, rs = -1, code = insn->code; 736 737 s16 off = insn->off; ··· 743 742 /* dst = src */ 744 743 case BPF_ALU | BPF_MOV | BPF_X: 745 744 case BPF_ALU64 | BPF_MOV | BPF_X: 745 + if (imm == 1) { 746 + /* Special mov32 for zext */ 747 + emit_zext_32(rd, ctx); 748 + break; 749 + } 746 750 emit(is64 ? rv_addi(rd, rs, 0) : rv_addiw(rd, rs, 0), ctx); 747 - if (!is64) 751 + if (!is64 && !aux->verifier_zext) 748 752 emit_zext_32(rd, ctx); 749 753 break; 750 754 ··· 777 771 case BPF_ALU | BPF_MUL | BPF_X: 778 772 case BPF_ALU64 | BPF_MUL | BPF_X: 779 773 emit(is64 ? rv_mul(rd, rd, rs) : rv_mulw(rd, rd, rs), ctx); 780 - if (!is64) 774 + if (!is64 && !aux->verifier_zext) 781 775 emit_zext_32(rd, ctx); 782 776 break; 783 777 case BPF_ALU | BPF_DIV | BPF_X: 784 778 case BPF_ALU64 | BPF_DIV | BPF_X: 785 779 emit(is64 ? rv_divu(rd, rd, rs) : rv_divuw(rd, rd, rs), ctx); 786 - if (!is64) 780 + if (!is64 && !aux->verifier_zext) 787 781 emit_zext_32(rd, ctx); 788 782 break; 789 783 case BPF_ALU | BPF_MOD | BPF_X: 790 784 case BPF_ALU64 | BPF_MOD | BPF_X: 791 785 emit(is64 ? rv_remu(rd, rd, rs) : rv_remuw(rd, rd, rs), ctx); 792 - if (!is64) 786 + if (!is64 && !aux->verifier_zext) 793 787 emit_zext_32(rd, ctx); 794 788 break; 795 789 case BPF_ALU | BPF_LSH | BPF_X: ··· 873 867 case BPF_ALU | BPF_MOV | BPF_K: 874 868 case BPF_ALU64 | BPF_MOV | BPF_K: 875 869 emit_imm(rd, imm, ctx); 876 - if (!is64) 870 + if (!is64 && !aux->verifier_zext) 877 871 emit_zext_32(rd, ctx); 878 872 break; 879 873 ··· 888 882 emit(is64 ? rv_add(rd, rd, RV_REG_T1) : 889 883 rv_addw(rd, rd, RV_REG_T1), ctx); 890 884 } 891 - if (!is64) 885 + if (!is64 && !aux->verifier_zext) 892 886 emit_zext_32(rd, ctx); 893 887 break; 894 888 case BPF_ALU | BPF_SUB | BPF_K: ··· 901 895 emit(is64 ? 
rv_sub(rd, rd, RV_REG_T1) : 902 896 rv_subw(rd, rd, RV_REG_T1), ctx); 903 897 } 904 - if (!is64) 898 + if (!is64 && !aux->verifier_zext) 905 899 emit_zext_32(rd, ctx); 906 900 break; 907 901 case BPF_ALU | BPF_AND | BPF_K: ··· 912 906 emit_imm(RV_REG_T1, imm, ctx); 913 907 emit(rv_and(rd, rd, RV_REG_T1), ctx); 914 908 } 915 - if (!is64) 909 + if (!is64 && !aux->verifier_zext) 916 910 emit_zext_32(rd, ctx); 917 911 break; 918 912 case BPF_ALU | BPF_OR | BPF_K: ··· 923 917 emit_imm(RV_REG_T1, imm, ctx); 924 918 emit(rv_or(rd, rd, RV_REG_T1), ctx); 925 919 } 926 - if (!is64) 920 + if (!is64 && !aux->verifier_zext) 927 921 emit_zext_32(rd, ctx); 928 922 break; 929 923 case BPF_ALU | BPF_XOR | BPF_K: ··· 934 928 emit_imm(RV_REG_T1, imm, ctx); 935 929 emit(rv_xor(rd, rd, RV_REG_T1), ctx); 936 930 } 937 - if (!is64) 931 + if (!is64 && !aux->verifier_zext) 938 932 emit_zext_32(rd, ctx); 939 933 break; 940 934 case BPF_ALU | BPF_MUL | BPF_K: ··· 942 936 emit_imm(RV_REG_T1, imm, ctx); 943 937 emit(is64 ? rv_mul(rd, rd, RV_REG_T1) : 944 938 rv_mulw(rd, rd, RV_REG_T1), ctx); 945 - if (!is64) 939 + if (!is64 && !aux->verifier_zext) 946 940 emit_zext_32(rd, ctx); 947 941 break; 948 942 case BPF_ALU | BPF_DIV | BPF_K: ··· 950 944 emit_imm(RV_REG_T1, imm, ctx); 951 945 emit(is64 ? rv_divu(rd, rd, RV_REG_T1) : 952 946 rv_divuw(rd, rd, RV_REG_T1), ctx); 953 - if (!is64) 947 + if (!is64 && !aux->verifier_zext) 954 948 emit_zext_32(rd, ctx); 955 949 break; 956 950 case BPF_ALU | BPF_MOD | BPF_K: ··· 958 952 emit_imm(RV_REG_T1, imm, ctx); 959 953 emit(is64 ? 
rv_remu(rd, rd, RV_REG_T1) : 960 954 rv_remuw(rd, rd, RV_REG_T1), ctx); 961 - if (!is64) 955 + if (!is64 && !aux->verifier_zext) 962 956 emit_zext_32(rd, ctx); 963 957 break; 964 958 case BPF_ALU | BPF_LSH | BPF_K: ··· 1245 1239 emit_imm(RV_REG_T1, off, ctx); 1246 1240 emit(rv_add(RV_REG_T1, RV_REG_T1, rs), ctx); 1247 1241 emit(rv_lbu(rd, 0, RV_REG_T1), ctx); 1242 + if (insn_is_zext(&insn[1])) 1243 + return 1; 1248 1244 break; 1249 1245 case BPF_LDX | BPF_MEM | BPF_H: 1250 1246 if (is_12b_int(off)) { ··· 1257 1249 emit_imm(RV_REG_T1, off, ctx); 1258 1250 emit(rv_add(RV_REG_T1, RV_REG_T1, rs), ctx); 1259 1251 emit(rv_lhu(rd, 0, RV_REG_T1), ctx); 1252 + if (insn_is_zext(&insn[1])) 1253 + return 1; 1260 1254 break; 1261 1255 case BPF_LDX | BPF_MEM | BPF_W: 1262 1256 if (is_12b_int(off)) { ··· 1269 1259 emit_imm(RV_REG_T1, off, ctx); 1270 1260 emit(rv_add(RV_REG_T1, RV_REG_T1, rs), ctx); 1271 1261 emit(rv_lwu(rd, 0, RV_REG_T1), ctx); 1262 + if (insn_is_zext(&insn[1])) 1263 + return 1; 1272 1264 break; 1273 1265 case BPF_LDX | BPF_MEM | BPF_DW: 1274 1266 if (is_12b_int(off)) { ··· 1513 1501 static void bpf_flush_icache(void *start, void *end) 1514 1502 { 1515 1503 flush_icache_range((unsigned long)start, (unsigned long)end); 1504 + } 1505 + 1506 + bool bpf_jit_needs_zext(void) 1507 + { 1508 + return true; 1516 1509 } 1517 1510 1518 1511 struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
+34 -7
arch/s390/net/bpf_jit_comp.c
··· 299 299 300 300 #define EMIT_ZERO(b1) \ 301 301 ({ \ 302 - /* llgfr %dst,%dst (zero extend to 64 bit) */ \ 303 - EMIT4(0xb9160000, b1, b1); \ 304 - REG_SET_SEEN(b1); \ 302 + if (!fp->aux->verifier_zext) { \ 303 + /* llgfr %dst,%dst (zero extend to 64 bit) */ \ 304 + EMIT4(0xb9160000, b1, b1); \ 305 + REG_SET_SEEN(b1); \ 306 + } \ 305 307 }) 306 308 307 309 /* ··· 522 520 case BPF_ALU | BPF_MOV | BPF_X: /* dst = (u32) src */ 523 521 /* llgfr %dst,%src */ 524 522 EMIT4(0xb9160000, dst_reg, src_reg); 523 + if (insn_is_zext(&insn[1])) 524 + insn_count = 2; 525 525 break; 526 526 case BPF_ALU64 | BPF_MOV | BPF_X: /* dst = src */ 527 527 /* lgr %dst,%src */ ··· 532 528 case BPF_ALU | BPF_MOV | BPF_K: /* dst = (u32) imm */ 533 529 /* llilf %dst,imm */ 534 530 EMIT6_IMM(0xc00f0000, dst_reg, imm); 531 + if (insn_is_zext(&insn[1])) 532 + insn_count = 2; 535 533 break; 536 534 case BPF_ALU64 | BPF_MOV | BPF_K: /* dst = imm */ 537 535 /* lgfi %dst,imm */ ··· 645 639 EMIT4(0xb9970000, REG_W0, src_reg); 646 640 /* llgfr %dst,%rc */ 647 641 EMIT4(0xb9160000, dst_reg, rc_reg); 642 + if (insn_is_zext(&insn[1])) 643 + insn_count = 2; 648 644 break; 649 645 } 650 646 case BPF_ALU64 | BPF_DIV | BPF_X: /* dst = dst / src */ ··· 684 676 EMIT_CONST_U32(imm)); 685 677 /* llgfr %dst,%rc */ 686 678 EMIT4(0xb9160000, dst_reg, rc_reg); 679 + if (insn_is_zext(&insn[1])) 680 + insn_count = 2; 687 681 break; 688 682 } 689 683 case BPF_ALU64 | BPF_DIV | BPF_K: /* dst = dst / imm */ ··· 874 864 case 16: /* dst = (u16) cpu_to_be16(dst) */ 875 865 /* llghr %dst,%dst */ 876 866 EMIT4(0xb9850000, dst_reg, dst_reg); 867 + if (insn_is_zext(&insn[1])) 868 + insn_count = 2; 877 869 break; 878 870 case 32: /* dst = (u32) cpu_to_be32(dst) */ 879 - /* llgfr %dst,%dst */ 880 - EMIT4(0xb9160000, dst_reg, dst_reg); 871 + if (!fp->aux->verifier_zext) 872 + /* llgfr %dst,%dst */ 873 + EMIT4(0xb9160000, dst_reg, dst_reg); 881 874 break; 882 875 case 64: /* dst = (u64) cpu_to_be64(dst) */ 883 876 break; ··· 895 
882 EMIT4_DISP(0x88000000, dst_reg, REG_0, 16); 896 883 /* llghr %dst,%dst */ 897 884 EMIT4(0xb9850000, dst_reg, dst_reg); 885 + if (insn_is_zext(&insn[1])) 886 + insn_count = 2; 898 887 break; 899 888 case 32: /* dst = (u32) cpu_to_le32(dst) */ 900 889 /* lrvr %dst,%dst */ 901 890 EMIT4(0xb91f0000, dst_reg, dst_reg); 902 - /* llgfr %dst,%dst */ 903 - EMIT4(0xb9160000, dst_reg, dst_reg); 891 + if (!fp->aux->verifier_zext) 892 + /* llgfr %dst,%dst */ 893 + EMIT4(0xb9160000, dst_reg, dst_reg); 904 894 break; 905 895 case 64: /* dst = (u64) cpu_to_le64(dst) */ 906 896 /* lrvgr %dst,%dst */ ··· 984 968 /* llgc %dst,0(off,%src) */ 985 969 EMIT6_DISP_LH(0xe3000000, 0x0090, dst_reg, src_reg, REG_0, off); 986 970 jit->seen |= SEEN_MEM; 971 + if (insn_is_zext(&insn[1])) 972 + insn_count = 2; 987 973 break; 988 974 case BPF_LDX | BPF_MEM | BPF_H: /* dst = *(u16 *)(ul) (src + off) */ 989 975 /* llgh %dst,0(off,%src) */ 990 976 EMIT6_DISP_LH(0xe3000000, 0x0091, dst_reg, src_reg, REG_0, off); 991 977 jit->seen |= SEEN_MEM; 978 + if (insn_is_zext(&insn[1])) 979 + insn_count = 2; 992 980 break; 993 981 case BPF_LDX | BPF_MEM | BPF_W: /* dst = *(u32 *)(ul) (src + off) */ 994 982 /* llgf %dst,off(%src) */ 995 983 jit->seen |= SEEN_MEM; 996 984 EMIT6_DISP_LH(0xe3000000, 0x0016, dst_reg, src_reg, REG_0, off); 985 + if (insn_is_zext(&insn[1])) 986 + insn_count = 2; 997 987 break; 998 988 case BPF_LDX | BPF_MEM | BPF_DW: /* dst = *(u64 *)(ul) (src + off) */ 999 989 /* lg %dst,0(off,%src) */ ··· 1302 1280 jit->size = jit->lit; 1303 1281 jit->size_prg = jit->prg; 1304 1282 return 0; 1283 + } 1284 + 1285 + bool bpf_jit_needs_zext(void) 1286 + { 1287 + return true; 1305 1288 } 1306 1289 1307 1290 /*
+27 -2
arch/sparc/net/bpf_jit_comp_64.c
··· 908 908 /* dst = src */ 909 909 case BPF_ALU | BPF_MOV | BPF_X: 910 910 emit_alu3_K(SRL, src, 0, dst, ctx); 911 + if (insn_is_zext(&insn[1])) 912 + return 1; 911 913 break; 912 914 case BPF_ALU64 | BPF_MOV | BPF_X: 913 915 emit_reg_move(src, dst, ctx); ··· 944 942 case BPF_ALU | BPF_DIV | BPF_X: 945 943 emit_write_y(G0, ctx); 946 944 emit_alu(DIV, src, dst, ctx); 945 + if (insn_is_zext(&insn[1])) 946 + return 1; 947 947 break; 948 948 case BPF_ALU64 | BPF_DIV | BPF_X: 949 949 emit_alu(UDIVX, src, dst, ctx); ··· 979 975 break; 980 976 case BPF_ALU | BPF_RSH | BPF_X: 981 977 emit_alu(SRL, src, dst, ctx); 978 + if (insn_is_zext(&insn[1])) 979 + return 1; 982 980 break; 983 981 case BPF_ALU64 | BPF_RSH | BPF_X: 984 982 emit_alu(SRLX, src, dst, ctx); ··· 1003 997 case 16: 1004 998 emit_alu_K(SLL, dst, 16, ctx); 1005 999 emit_alu_K(SRL, dst, 16, ctx); 1000 + if (insn_is_zext(&insn[1])) 1001 + return 1; 1006 1002 break; 1007 1003 case 32: 1008 - emit_alu_K(SRL, dst, 0, ctx); 1004 + if (!ctx->prog->aux->verifier_zext) 1005 + emit_alu_K(SRL, dst, 0, ctx); 1009 1006 break; 1010 1007 case 64: 1011 1008 /* nop */ ··· 1030 1021 emit_alu3_K(AND, dst, 0xff, dst, ctx); 1031 1022 emit_alu3_K(SLL, tmp, 8, tmp, ctx); 1032 1023 emit_alu(OR, tmp, dst, ctx); 1024 + if (insn_is_zext(&insn[1])) 1025 + return 1; 1033 1026 break; 1034 1027 1035 1028 case 32: ··· 1048 1037 emit_alu3_K(AND, dst, 0xff, dst, ctx); /* dst = dst & 0xff */ 1049 1038 emit_alu3_K(SLL, dst, 24, dst, ctx); /* dst = dst << 24 */ 1050 1039 emit_alu(OR, tmp, dst, ctx); /* dst = dst | tmp */ 1040 + if (insn_is_zext(&insn[1])) 1041 + return 1; 1051 1042 break; 1052 1043 1053 1044 case 64: ··· 1063 1050 /* dst = imm */ 1064 1051 case BPF_ALU | BPF_MOV | BPF_K: 1065 1052 emit_loadimm32(imm, dst, ctx); 1053 + if (insn_is_zext(&insn[1])) 1054 + return 1; 1066 1055 break; 1067 1056 case BPF_ALU64 | BPF_MOV | BPF_K: 1068 1057 emit_loadimm_sext(imm, dst, ctx); ··· 1147 1132 break; 1148 1133 case BPF_ALU | BPF_RSH | BPF_K: 
1149 1134 emit_alu_K(SRL, dst, imm, ctx); 1135 + if (insn_is_zext(&insn[1])) 1136 + return 1; 1150 1137 break; 1151 1138 case BPF_ALU64 | BPF_RSH | BPF_K: 1152 1139 emit_alu_K(SRLX, dst, imm, ctx); ··· 1161 1144 break; 1162 1145 1163 1146 do_alu32_trunc: 1164 - if (BPF_CLASS(code) == BPF_ALU) 1147 + if (BPF_CLASS(code) == BPF_ALU && 1148 + !ctx->prog->aux->verifier_zext) 1165 1149 emit_alu_K(SRL, dst, 0, ctx); 1166 1150 break; 1167 1151 ··· 1283 1265 rs2 = RS2(tmp); 1284 1266 } 1285 1267 emit(opcode | RS1(src) | rs2 | RD(dst), ctx); 1268 + if (opcode != LD64 && insn_is_zext(&insn[1])) 1269 + return 1; 1286 1270 break; 1287 1271 } 1288 1272 /* ST: *(size *)(dst + off) = imm */ ··· 1450 1430 /* We are guaranteed to have aligned memory. */ 1451 1431 for (ptr = area; size >= sizeof(u32); size -= sizeof(u32)) 1452 1432 *ptr++ = 0x91d02005; /* ta 5 */ 1433 + } 1434 + 1435 + bool bpf_jit_needs_zext(void) 1436 + { 1437 + return true; 1453 1438 } 1454 1439 1455 1440 struct sparc64_jit_data {
+56 -27
arch/x86/net/bpf_jit_comp32.c
··· 253 253 /* dst = src */ 254 254 static inline void emit_ia32_mov_r64(const bool is64, const u8 dst[], 255 255 const u8 src[], bool dstk, 256 - bool sstk, u8 **pprog) 256 + bool sstk, u8 **pprog, 257 + const struct bpf_prog_aux *aux) 257 258 { 258 259 emit_ia32_mov_r(dst_lo, src_lo, dstk, sstk, pprog); 259 260 if (is64) 260 261 /* complete 8 byte move */ 261 262 emit_ia32_mov_r(dst_hi, src_hi, dstk, sstk, pprog); 262 - else 263 + else if (!aux->verifier_zext) 263 264 /* zero out high 4 bytes */ 264 265 emit_ia32_mov_i(dst_hi, 0, dstk, pprog); 265 266 } ··· 314 313 } 315 314 316 315 static inline void emit_ia32_to_le_r64(const u8 dst[], s32 val, 317 - bool dstk, u8 **pprog) 316 + bool dstk, u8 **pprog, 317 + const struct bpf_prog_aux *aux) 318 318 { 319 319 u8 *prog = *pprog; 320 320 int cnt = 0; ··· 336 334 */ 337 335 EMIT2(0x0F, 0xB7); 338 336 EMIT1(add_2reg(0xC0, dreg_lo, dreg_lo)); 339 - /* xor dreg_hi,dreg_hi */ 340 - EMIT2(0x33, add_2reg(0xC0, dreg_hi, dreg_hi)); 337 + if (!aux->verifier_zext) 338 + /* xor dreg_hi,dreg_hi */ 339 + EMIT2(0x33, add_2reg(0xC0, dreg_hi, dreg_hi)); 341 340 break; 342 341 case 32: 343 - /* xor dreg_hi,dreg_hi */ 344 - EMIT2(0x33, add_2reg(0xC0, dreg_hi, dreg_hi)); 342 + if (!aux->verifier_zext) 343 + /* xor dreg_hi,dreg_hi */ 344 + EMIT2(0x33, add_2reg(0xC0, dreg_hi, dreg_hi)); 345 345 break; 346 346 case 64: 347 347 /* nop */ ··· 362 358 } 363 359 364 360 static inline void emit_ia32_to_be_r64(const u8 dst[], s32 val, 365 - bool dstk, u8 **pprog) 361 + bool dstk, u8 **pprog, 362 + const struct bpf_prog_aux *aux) 366 363 { 367 364 u8 *prog = *pprog; 368 365 int cnt = 0; ··· 385 380 EMIT2(0x0F, 0xB7); 386 381 EMIT1(add_2reg(0xC0, dreg_lo, dreg_lo)); 387 382 388 - /* xor dreg_hi,dreg_hi */ 389 - EMIT2(0x33, add_2reg(0xC0, dreg_hi, dreg_hi)); 383 + if (!aux->verifier_zext) 384 + /* xor dreg_hi,dreg_hi */ 385 + EMIT2(0x33, add_2reg(0xC0, dreg_hi, dreg_hi)); 390 386 break; 391 387 case 32: 392 388 /* Emit 'bswap eax' to swap lower 4 
bytes */ 393 389 EMIT1(0x0F); 394 390 EMIT1(add_1reg(0xC8, dreg_lo)); 395 391 396 - /* xor dreg_hi,dreg_hi */ 397 - EMIT2(0x33, add_2reg(0xC0, dreg_hi, dreg_hi)); 392 + if (!aux->verifier_zext) 393 + /* xor dreg_hi,dreg_hi */ 394 + EMIT2(0x33, add_2reg(0xC0, dreg_hi, dreg_hi)); 398 395 break; 399 396 case 64: 400 397 /* Emit 'bswap eax' to swap lower 4 bytes */ ··· 576 569 static inline void emit_ia32_alu_r64(const bool is64, const u8 op, 577 570 const u8 dst[], const u8 src[], 578 571 bool dstk, bool sstk, 579 - u8 **pprog) 572 + u8 **pprog, const struct bpf_prog_aux *aux) 580 573 { 581 574 u8 *prog = *pprog; 582 575 ··· 584 577 if (is64) 585 578 emit_ia32_alu_r(is64, true, op, dst_hi, src_hi, dstk, sstk, 586 579 &prog); 587 - else 580 + else if (!aux->verifier_zext) 588 581 emit_ia32_mov_i(dst_hi, 0, dstk, &prog); 589 582 *pprog = prog; 590 583 } ··· 675 668 /* ALU operation (64 bit) */ 676 669 static inline void emit_ia32_alu_i64(const bool is64, const u8 op, 677 670 const u8 dst[], const u32 val, 678 - bool dstk, u8 **pprog) 671 + bool dstk, u8 **pprog, 672 + const struct bpf_prog_aux *aux) 679 673 { 680 674 u8 *prog = *pprog; 681 675 u32 hi = 0; ··· 687 679 emit_ia32_alu_i(is64, false, op, dst_lo, val, dstk, &prog); 688 680 if (is64) 689 681 emit_ia32_alu_i(is64, true, op, dst_hi, hi, dstk, &prog); 690 - else 682 + else if (!aux->verifier_zext) 691 683 emit_ia32_mov_i(dst_hi, 0, dstk, &prog); 692 684 693 685 *pprog = prog; ··· 1721 1713 case BPF_ALU64 | BPF_MOV | BPF_X: 1722 1714 switch (BPF_SRC(code)) { 1723 1715 case BPF_X: 1724 - emit_ia32_mov_r64(is64, dst, src, dstk, 1725 - sstk, &prog); 1716 + if (imm32 == 1) { 1717 + /* Special mov32 for zext. 
*/ 1718 + emit_ia32_mov_i(dst_hi, 0, dstk, &prog); 1719 + break; 1720 + } 1721 + emit_ia32_mov_r64(is64, dst, src, dstk, sstk, 1722 + &prog, bpf_prog->aux); 1726 1723 break; 1727 1724 case BPF_K: 1728 1725 /* Sign-extend immediate value to dst reg */ ··· 1767 1754 switch (BPF_SRC(code)) { 1768 1755 case BPF_X: 1769 1756 emit_ia32_alu_r64(is64, BPF_OP(code), dst, 1770 - src, dstk, sstk, &prog); 1757 + src, dstk, sstk, &prog, 1758 + bpf_prog->aux); 1771 1759 break; 1772 1760 case BPF_K: 1773 1761 emit_ia32_alu_i64(is64, BPF_OP(code), dst, 1774 - imm32, dstk, &prog); 1762 + imm32, dstk, &prog, 1763 + bpf_prog->aux); 1775 1764 break; 1776 1765 } 1777 1766 break; ··· 1792 1777 false, &prog); 1793 1778 break; 1794 1779 } 1795 - emit_ia32_mov_i(dst_hi, 0, dstk, &prog); 1780 + if (!bpf_prog->aux->verifier_zext) 1781 + emit_ia32_mov_i(dst_hi, 0, dstk, &prog); 1796 1782 break; 1797 1783 case BPF_ALU | BPF_LSH | BPF_X: 1798 1784 case BPF_ALU | BPF_RSH | BPF_X: ··· 1813 1797 &prog); 1814 1798 break; 1815 1799 } 1816 - emit_ia32_mov_i(dst_hi, 0, dstk, &prog); 1800 + if (!bpf_prog->aux->verifier_zext) 1801 + emit_ia32_mov_i(dst_hi, 0, dstk, &prog); 1817 1802 break; 1818 1803 /* dst = dst / src(imm) */ 1819 1804 /* dst = dst % src(imm) */ ··· 1836 1819 &prog); 1837 1820 break; 1838 1821 } 1839 - emit_ia32_mov_i(dst_hi, 0, dstk, &prog); 1822 + if (!bpf_prog->aux->verifier_zext) 1823 + emit_ia32_mov_i(dst_hi, 0, dstk, &prog); 1840 1824 break; 1841 1825 case BPF_ALU64 | BPF_DIV | BPF_K: 1842 1826 case BPF_ALU64 | BPF_DIV | BPF_X: ··· 1854 1836 EMIT2_off32(0xC7, add_1reg(0xC0, IA32_ECX), imm32); 1855 1837 emit_ia32_shift_r(BPF_OP(code), dst_lo, IA32_ECX, dstk, 1856 1838 false, &prog); 1857 - emit_ia32_mov_i(dst_hi, 0, dstk, &prog); 1839 + if (!bpf_prog->aux->verifier_zext) 1840 + emit_ia32_mov_i(dst_hi, 0, dstk, &prog); 1858 1841 break; 1859 1842 /* dst = dst << imm */ 1860 1843 case BPF_ALU64 | BPF_LSH | BPF_K: ··· 1891 1872 case BPF_ALU | BPF_NEG: 1892 1873 emit_ia32_alu_i(is64, 
false, BPF_OP(code), 1893 1874 dst_lo, 0, dstk, &prog); 1894 - emit_ia32_mov_i(dst_hi, 0, dstk, &prog); 1875 + if (!bpf_prog->aux->verifier_zext) 1876 + emit_ia32_mov_i(dst_hi, 0, dstk, &prog); 1895 1877 break; 1896 1878 /* dst = ~dst (64 bit) */ 1897 1879 case BPF_ALU64 | BPF_NEG: ··· 1912 1892 break; 1913 1893 /* dst = htole(dst) */ 1914 1894 case BPF_ALU | BPF_END | BPF_FROM_LE: 1915 - emit_ia32_to_le_r64(dst, imm32, dstk, &prog); 1895 + emit_ia32_to_le_r64(dst, imm32, dstk, &prog, 1896 + bpf_prog->aux); 1916 1897 break; 1917 1898 /* dst = htobe(dst) */ 1918 1899 case BPF_ALU | BPF_END | BPF_FROM_BE: 1919 - emit_ia32_to_be_r64(dst, imm32, dstk, &prog); 1900 + emit_ia32_to_be_r64(dst, imm32, dstk, &prog, 1901 + bpf_prog->aux); 1920 1902 break; 1921 1903 /* dst = imm64 */ 1922 1904 case BPF_LD | BPF_IMM | BPF_DW: { ··· 2073 2051 case BPF_B: 2074 2052 case BPF_H: 2075 2053 case BPF_W: 2054 + if (!bpf_prog->aux->verifier_zext) 2055 + break; 2076 2056 if (dstk) { 2077 2057 EMIT3(0xC7, add_1reg(0x40, IA32_EBP), 2078 2058 STACK_VAR(dst_hi)); ··· 2497 2473 prog = temp; 2498 2474 } 2499 2475 return proglen; 2476 + } 2477 + 2478 + bool bpf_jit_needs_zext(void) 2479 + { 2480 + return true; 2500 2481 } 2501 2482 2502 2483 struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
+17 -13
drivers/media/rc/bpf-lirc.c
··· 8 8 #include <linux/bpf_lirc.h> 9 9 #include "rc-core-priv.h" 10 10 11 + #define lirc_rcu_dereference(p) \ 12 + rcu_dereference_protected(p, lockdep_is_held(&ir_raw_handler_lock)) 13 + 11 14 /* 12 15 * BPF interface for raw IR 13 16 */ ··· 139 136 140 137 static int lirc_bpf_attach(struct rc_dev *rcdev, struct bpf_prog *prog) 141 138 { 142 - struct bpf_prog_array __rcu *old_array; 139 + struct bpf_prog_array *old_array; 143 140 struct bpf_prog_array *new_array; 144 141 struct ir_raw_event_ctrl *raw; 145 142 int ret; ··· 157 154 goto unlock; 158 155 } 159 156 160 - if (raw->progs && bpf_prog_array_length(raw->progs) >= BPF_MAX_PROGS) { 157 + old_array = lirc_rcu_dereference(raw->progs); 158 + if (old_array && bpf_prog_array_length(old_array) >= BPF_MAX_PROGS) { 161 159 ret = -E2BIG; 162 160 goto unlock; 163 161 } 164 162 165 - old_array = raw->progs; 166 163 ret = bpf_prog_array_copy(old_array, NULL, prog, &new_array); 167 164 if (ret < 0) 168 165 goto unlock; ··· 177 174 178 175 static int lirc_bpf_detach(struct rc_dev *rcdev, struct bpf_prog *prog) 179 176 { 180 - struct bpf_prog_array __rcu *old_array; 177 + struct bpf_prog_array *old_array; 181 178 struct bpf_prog_array *new_array; 182 179 struct ir_raw_event_ctrl *raw; 183 180 int ret; ··· 195 192 goto unlock; 196 193 } 197 194 198 - old_array = raw->progs; 195 + old_array = lirc_rcu_dereference(raw->progs); 199 196 ret = bpf_prog_array_copy(old_array, prog, NULL, &new_array); 200 197 /* 201 198 * Do not use bpf_prog_array_delete_safe() as we would end up ··· 226 223 /* 227 224 * This should be called once the rc thread has been stopped, so there can be 228 225 * no concurrent bpf execution. 226 + * 227 + * Should be called with the ir_raw_handler_lock held. 
229 228 */ 230 229 void lirc_bpf_free(struct rc_dev *rcdev) 231 230 { 232 231 struct bpf_prog_array_item *item; 232 + struct bpf_prog_array *array; 233 233 234 - if (!rcdev->raw->progs) 234 + array = lirc_rcu_dereference(rcdev->raw->progs); 235 + if (!array) 235 236 return; 236 237 237 - item = rcu_dereference(rcdev->raw->progs)->items; 238 - while (item->prog) { 238 + for (item = array->items; item->prog; item++) 239 239 bpf_prog_put(item->prog); 240 - item++; 241 - } 242 240 243 - bpf_prog_array_free(rcdev->raw->progs); 241 + bpf_prog_array_free(array); 244 242 } 245 243 246 244 int lirc_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog) ··· 294 290 int lirc_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr) 295 291 { 296 292 __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids); 297 - struct bpf_prog_array __rcu *progs; 293 + struct bpf_prog_array *progs; 298 294 struct rc_dev *rcdev; 299 295 u32 cnt, flags = 0; 300 296 int ret; ··· 315 311 if (ret) 316 312 goto put; 317 313 318 - progs = rcdev->raw->progs; 314 + progs = lirc_rcu_dereference(rcdev->raw->progs); 319 315 cnt = progs ? bpf_prog_array_length(progs) : 0; 320 316 321 317 if (copy_to_user(&uattr->query.prog_cnt, &cnt, sizeof(cnt))) {
+67 -48
drivers/net/ethernet/netronome/nfp/bpf/jit.c
··· 623 623 } 624 624 625 625 static void 626 + wrp_zext(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta, u8 dst) 627 + { 628 + if (meta->flags & FLAG_INSN_DO_ZEXT) 629 + wrp_immed(nfp_prog, reg_both(dst + 1), 0); 630 + } 631 + 632 + static void 626 633 wrp_immed_relo(struct nfp_prog *nfp_prog, swreg dst, u32 imm, 627 634 enum nfp_relo_type relo) 628 635 { ··· 865 858 } 866 859 867 860 static int 868 - data_ld(struct nfp_prog *nfp_prog, swreg offset, u8 dst_gpr, int size) 861 + data_ld(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta, swreg offset, 862 + u8 dst_gpr, int size) 869 863 { 870 864 unsigned int i; 871 865 u16 shift, sz; ··· 889 881 wrp_mov(nfp_prog, reg_both(dst_gpr + i), reg_xfer(i)); 890 882 891 883 if (i < 2) 892 - wrp_immed(nfp_prog, reg_both(dst_gpr + 1), 0); 884 + wrp_zext(nfp_prog, meta, dst_gpr); 893 885 894 886 return 0; 895 887 } 896 888 897 889 static int 898 - data_ld_host_order(struct nfp_prog *nfp_prog, u8 dst_gpr, 899 - swreg lreg, swreg rreg, int size, enum cmd_mode mode) 890 + data_ld_host_order(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta, 891 + u8 dst_gpr, swreg lreg, swreg rreg, int size, 892 + enum cmd_mode mode) 900 893 { 901 894 unsigned int i; 902 895 u8 mask, sz; ··· 920 911 wrp_mov(nfp_prog, reg_both(dst_gpr + i), reg_xfer(i)); 921 912 922 913 if (i < 2) 923 - wrp_immed(nfp_prog, reg_both(dst_gpr + 1), 0); 914 + wrp_zext(nfp_prog, meta, dst_gpr); 924 915 925 916 return 0; 926 917 } 927 918 928 919 static int 929 - data_ld_host_order_addr32(struct nfp_prog *nfp_prog, u8 src_gpr, swreg offset, 930 - u8 dst_gpr, u8 size) 920 + data_ld_host_order_addr32(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta, 921 + u8 src_gpr, swreg offset, u8 dst_gpr, u8 size) 931 922 { 932 - return data_ld_host_order(nfp_prog, dst_gpr, reg_a(src_gpr), offset, 933 - size, CMD_MODE_32b); 923 + return data_ld_host_order(nfp_prog, meta, dst_gpr, reg_a(src_gpr), 924 + offset, size, CMD_MODE_32b); 934 925 } 935 926 936 927 static int 
937 - data_ld_host_order_addr40(struct nfp_prog *nfp_prog, u8 src_gpr, swreg offset, 938 - u8 dst_gpr, u8 size) 928 + data_ld_host_order_addr40(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta, 929 + u8 src_gpr, swreg offset, u8 dst_gpr, u8 size) 939 930 { 940 931 swreg rega, regb; 941 932 942 933 addr40_offset(nfp_prog, src_gpr, offset, &rega, &regb); 943 934 944 - return data_ld_host_order(nfp_prog, dst_gpr, rega, regb, 935 + return data_ld_host_order(nfp_prog, meta, dst_gpr, rega, regb, 945 936 size, CMD_MODE_40b_BA); 946 937 } 947 938 948 939 static int 949 - construct_data_ind_ld(struct nfp_prog *nfp_prog, u16 offset, u16 src, u8 size) 940 + construct_data_ind_ld(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta, 941 + u16 offset, u16 src, u8 size) 950 942 { 951 943 swreg tmp_reg; 952 944 ··· 963 953 emit_br_relo(nfp_prog, BR_BLO, BR_OFF_RELO, 0, RELO_BR_GO_ABORT); 964 954 965 955 /* Load data */ 966 - return data_ld(nfp_prog, imm_b(nfp_prog), 0, size); 956 + return data_ld(nfp_prog, meta, imm_b(nfp_prog), 0, size); 967 957 } 968 958 969 - static int construct_data_ld(struct nfp_prog *nfp_prog, u16 offset, u8 size) 959 + static int 960 + construct_data_ld(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta, 961 + u16 offset, u8 size) 970 962 { 971 963 swreg tmp_reg; 972 964 ··· 979 967 980 968 /* Load data */ 981 969 tmp_reg = re_load_imm_any(nfp_prog, offset, imm_b(nfp_prog)); 982 - return data_ld(nfp_prog, tmp_reg, 0, size); 970 + return data_ld(nfp_prog, meta, tmp_reg, 0, size); 983 971 } 984 972 985 973 static int ··· 1216 1204 } 1217 1205 1218 1206 if (clr_gpr && size < 8) 1219 - wrp_immed(nfp_prog, reg_both(gpr + 1), 0); 1207 + wrp_zext(nfp_prog, meta, gpr); 1220 1208 1221 1209 while (size) { 1222 1210 u32 slice_end; ··· 1317 1305 enum alu_op alu_op) 1318 1306 { 1319 1307 const struct bpf_insn *insn = &meta->insn; 1308 + u8 dst = insn->dst_reg * 2; 1320 1309 1321 - wrp_alu_imm(nfp_prog, insn->dst_reg * 2, alu_op, insn->imm); 1322 - 
wrp_immed(nfp_prog, reg_both(insn->dst_reg * 2 + 1), 0); 1310 + wrp_alu_imm(nfp_prog, dst, alu_op, insn->imm); 1311 + wrp_zext(nfp_prog, meta, dst); 1323 1312 1324 1313 return 0; 1325 1314 } ··· 1332 1319 u8 dst = meta->insn.dst_reg * 2, src = meta->insn.src_reg * 2; 1333 1320 1334 1321 emit_alu(nfp_prog, reg_both(dst), reg_a(dst), alu_op, reg_b(src)); 1335 - wrp_immed(nfp_prog, reg_both(meta->insn.dst_reg * 2 + 1), 0); 1322 + wrp_zext(nfp_prog, meta, dst); 1336 1323 1337 1324 return 0; 1338 1325 } ··· 2409 2396 u8 dst = meta->insn.dst_reg * 2; 2410 2397 2411 2398 emit_alu(nfp_prog, reg_both(dst), reg_imm(0), ALU_OP_SUB, reg_b(dst)); 2412 - wrp_immed(nfp_prog, reg_both(meta->insn.dst_reg * 2 + 1), 0); 2399 + wrp_zext(nfp_prog, meta, dst); 2413 2400 2414 2401 return 0; 2415 2402 } 2416 2403 2417 - static int __ashr_imm(struct nfp_prog *nfp_prog, u8 dst, u8 shift_amt) 2404 + static int 2405 + __ashr_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta, u8 dst, 2406 + u8 shift_amt) 2418 2407 { 2419 2408 if (shift_amt) { 2420 2409 /* Set signedness bit (MSB of result). 
*/ ··· 2425 2410 emit_shf(nfp_prog, reg_both(dst), reg_none(), SHF_OP_ASHR, 2426 2411 reg_b(dst), SHF_SC_R_SHF, shift_amt); 2427 2412 } 2428 - wrp_immed(nfp_prog, reg_both(dst + 1), 0); 2413 + wrp_zext(nfp_prog, meta, dst); 2429 2414 2430 2415 return 0; 2431 2416 } ··· 2440 2425 umin = meta->umin_src; 2441 2426 umax = meta->umax_src; 2442 2427 if (umin == umax) 2443 - return __ashr_imm(nfp_prog, dst, umin); 2428 + return __ashr_imm(nfp_prog, meta, dst, umin); 2444 2429 2445 2430 src = insn->src_reg * 2; 2446 2431 /* NOTE: the first insn will set both indirect shift amount (source A) ··· 2449 2434 emit_alu(nfp_prog, reg_none(), reg_a(src), ALU_OP_OR, reg_b(dst)); 2450 2435 emit_shf_indir(nfp_prog, reg_both(dst), reg_none(), SHF_OP_ASHR, 2451 2436 reg_b(dst), SHF_SC_R_SHF); 2452 - wrp_immed(nfp_prog, reg_both(dst + 1), 0); 2437 + wrp_zext(nfp_prog, meta, dst); 2453 2438 2454 2439 return 0; 2455 2440 } ··· 2459 2444 const struct bpf_insn *insn = &meta->insn; 2460 2445 u8 dst = insn->dst_reg * 2; 2461 2446 2462 - return __ashr_imm(nfp_prog, dst, insn->imm); 2447 + return __ashr_imm(nfp_prog, meta, dst, insn->imm); 2463 2448 } 2464 2449 2465 - static int __shr_imm(struct nfp_prog *nfp_prog, u8 dst, u8 shift_amt) 2450 + static int 2451 + __shr_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta, u8 dst, 2452 + u8 shift_amt) 2466 2453 { 2467 2454 if (shift_amt) 2468 2455 emit_shf(nfp_prog, reg_both(dst), reg_none(), SHF_OP_NONE, 2469 2456 reg_b(dst), SHF_SC_R_SHF, shift_amt); 2470 - wrp_immed(nfp_prog, reg_both(dst + 1), 0); 2457 + wrp_zext(nfp_prog, meta, dst); 2471 2458 return 0; 2472 2459 } 2473 2460 ··· 2478 2461 const struct bpf_insn *insn = &meta->insn; 2479 2462 u8 dst = insn->dst_reg * 2; 2480 2463 2481 - return __shr_imm(nfp_prog, dst, insn->imm); 2464 + return __shr_imm(nfp_prog, meta, dst, insn->imm); 2482 2465 } 2483 2466 2484 2467 static int shr_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) ··· 2491 2474 umin = meta->umin_src; 2492 2475 
umax = meta->umax_src; 2493 2476 if (umin == umax) 2494 - return __shr_imm(nfp_prog, dst, umin); 2477 + return __shr_imm(nfp_prog, meta, dst, umin); 2495 2478 2496 2479 src = insn->src_reg * 2; 2497 2480 emit_alu(nfp_prog, reg_none(), reg_a(src), ALU_OP_OR, reg_imm(0)); 2498 2481 emit_shf_indir(nfp_prog, reg_both(dst), reg_none(), SHF_OP_NONE, 2499 2482 reg_b(dst), SHF_SC_R_SHF); 2500 - wrp_immed(nfp_prog, reg_both(dst + 1), 0); 2483 + wrp_zext(nfp_prog, meta, dst); 2501 2484 return 0; 2502 2485 } 2503 2486 2504 - static int __shl_imm(struct nfp_prog *nfp_prog, u8 dst, u8 shift_amt) 2487 + static int 2488 + __shl_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta, u8 dst, 2489 + u8 shift_amt) 2505 2490 { 2506 2491 if (shift_amt) 2507 2492 emit_shf(nfp_prog, reg_both(dst), reg_none(), SHF_OP_NONE, 2508 2493 reg_b(dst), SHF_SC_L_SHF, shift_amt); 2509 - wrp_immed(nfp_prog, reg_both(dst + 1), 0); 2494 + wrp_zext(nfp_prog, meta, dst); 2510 2495 return 0; 2511 2496 } 2512 2497 ··· 2517 2498 const struct bpf_insn *insn = &meta->insn; 2518 2499 u8 dst = insn->dst_reg * 2; 2519 2500 2520 - return __shl_imm(nfp_prog, dst, insn->imm); 2501 + return __shl_imm(nfp_prog, meta, dst, insn->imm); 2521 2502 } 2522 2503 2523 2504 static int shl_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) ··· 2530 2511 umin = meta->umin_src; 2531 2512 umax = meta->umax_src; 2532 2513 if (umin == umax) 2533 - return __shl_imm(nfp_prog, dst, umin); 2514 + return __shl_imm(nfp_prog, meta, dst, umin); 2534 2515 2535 2516 src = insn->src_reg * 2; 2536 2517 shl_reg64_lt32_low(nfp_prog, dst, src); 2537 - wrp_immed(nfp_prog, reg_both(dst + 1), 0); 2518 + wrp_zext(nfp_prog, meta, dst); 2538 2519 return 0; 2539 2520 } 2540 2521 ··· 2596 2577 2597 2578 static int data_ld1(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) 2598 2579 { 2599 - return construct_data_ld(nfp_prog, meta->insn.imm, 1); 2580 + return construct_data_ld(nfp_prog, meta, meta->insn.imm, 1); 2600 2581 } 2601 2582 
2602 2583 static int data_ld2(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) 2603 2584 { 2604 - return construct_data_ld(nfp_prog, meta->insn.imm, 2); 2585 + return construct_data_ld(nfp_prog, meta, meta->insn.imm, 2); 2605 2586 } 2606 2587 2607 2588 static int data_ld4(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) 2608 2589 { 2609 - return construct_data_ld(nfp_prog, meta->insn.imm, 4); 2590 + return construct_data_ld(nfp_prog, meta, meta->insn.imm, 4); 2610 2591 } 2611 2592 2612 2593 static int data_ind_ld1(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) 2613 2594 { 2614 - return construct_data_ind_ld(nfp_prog, meta->insn.imm, 2595 + return construct_data_ind_ld(nfp_prog, meta, meta->insn.imm, 2615 2596 meta->insn.src_reg * 2, 1); 2616 2597 } 2617 2598 2618 2599 static int data_ind_ld2(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) 2619 2600 { 2620 - return construct_data_ind_ld(nfp_prog, meta->insn.imm, 2601 + return construct_data_ind_ld(nfp_prog, meta, meta->insn.imm, 2621 2602 meta->insn.src_reg * 2, 2); 2622 2603 } 2623 2604 2624 2605 static int data_ind_ld4(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) 2625 2606 { 2626 - return construct_data_ind_ld(nfp_prog, meta->insn.imm, 2607 + return construct_data_ind_ld(nfp_prog, meta, meta->insn.imm, 2627 2608 meta->insn.src_reg * 2, 4); 2628 2609 } 2629 2610 ··· 2701 2682 2702 2683 tmp_reg = re_load_imm_any(nfp_prog, meta->insn.off, imm_b(nfp_prog)); 2703 2684 2704 - return data_ld_host_order_addr32(nfp_prog, meta->insn.src_reg * 2, 2685 + return data_ld_host_order_addr32(nfp_prog, meta, meta->insn.src_reg * 2, 2705 2686 tmp_reg, meta->insn.dst_reg * 2, size); 2706 2687 } 2707 2688 ··· 2713 2694 2714 2695 tmp_reg = re_load_imm_any(nfp_prog, meta->insn.off, imm_b(nfp_prog)); 2715 2696 2716 - return data_ld_host_order_addr40(nfp_prog, meta->insn.src_reg * 2, 2697 + return data_ld_host_order_addr40(nfp_prog, meta, meta->insn.src_reg * 2, 2717 2698 tmp_reg, meta->insn.dst_reg * 
2, size); 2718 2699 } 2719 2700 ··· 2774 2755 wrp_reg_subpart(nfp_prog, dst_lo, src_lo, len_lo, off); 2775 2756 2776 2757 if (!len_mid) { 2777 - wrp_immed(nfp_prog, dst_hi, 0); 2758 + wrp_zext(nfp_prog, meta, dst_gpr); 2778 2759 return 0; 2779 2760 } 2780 2761 ··· 2782 2763 2783 2764 if (size <= REG_WIDTH) { 2784 2765 wrp_reg_or_subpart(nfp_prog, dst_lo, src_mid, len_mid, len_lo); 2785 - wrp_immed(nfp_prog, dst_hi, 0); 2766 + wrp_zext(nfp_prog, meta, dst_gpr); 2786 2767 } else { 2787 2768 swreg src_hi = reg_xfer(idx + 2); 2788 2769 ··· 2813 2794 2814 2795 if (size < REG_WIDTH) { 2815 2796 wrp_reg_subpart(nfp_prog, dst_lo, src_lo, size, 0); 2816 - wrp_immed(nfp_prog, dst_hi, 0); 2797 + wrp_zext(nfp_prog, meta, dst_gpr); 2817 2798 } else if (size == REG_WIDTH) { 2818 2799 wrp_mov(nfp_prog, dst_lo, src_lo); 2819 - wrp_immed(nfp_prog, dst_hi, 0); 2800 + wrp_zext(nfp_prog, meta, dst_gpr); 2820 2801 } else { 2821 2802 swreg src_hi = reg_xfer(idx + 1); 2822 2803
+2
drivers/net/ethernet/netronome/nfp/bpf/main.h
··· 238 238 #define FLAG_INSN_SKIP_PREC_DEPENDENT BIT(4) 239 239 /* Instruction is optimized by the verifier */ 240 240 #define FLAG_INSN_SKIP_VERIFIER_OPT BIT(5) 241 + /* Instruction needs to zero extend to high 32-bit */ 242 + #define FLAG_INSN_DO_ZEXT BIT(6) 241 243 242 244 #define FLAG_INSN_SKIP_MASK (FLAG_INSN_SKIP_NOOP | \ 243 245 FLAG_INSN_SKIP_PREC_DEPENDENT | \
+12
drivers/net/ethernet/netronome/nfp/bpf/verifier.c
··· 744 744 goto continue_subprog; 745 745 } 746 746 747 + static void nfp_bpf_insn_flag_zext(struct nfp_prog *nfp_prog, 748 + struct bpf_insn_aux_data *aux) 749 + { 750 + struct nfp_insn_meta *meta; 751 + 752 + list_for_each_entry(meta, &nfp_prog->insns, l) { 753 + if (aux[meta->n].zext_dst) 754 + meta->flags |= FLAG_INSN_DO_ZEXT; 755 + } 756 + } 757 + 747 758 int nfp_bpf_finalize(struct bpf_verifier_env *env) 748 759 { 749 760 struct bpf_subprog_info *info; ··· 795 784 return -EOPNOTSUPP; 796 785 } 797 786 787 + nfp_bpf_insn_flag_zext(nfp_prog, env->insn_aux_data); 798 788 return 0; 799 789 } 800 790
+10 -3
include/linux/bpf-cgroup.h
··· 6 6 #include <linux/errno.h> 7 7 #include <linux/jump_label.h> 8 8 #include <linux/percpu.h> 9 + #include <linux/percpu-refcount.h> 9 10 #include <linux/rbtree.h> 10 11 #include <uapi/linux/bpf.h> 11 12 ··· 72 71 u32 flags[MAX_BPF_ATTACH_TYPE]; 73 72 74 73 /* temp storage for effective prog array used by prog_attach/detach */ 75 - struct bpf_prog_array __rcu *inactive; 74 + struct bpf_prog_array *inactive; 75 + 76 + /* reference counter used to detach bpf programs after cgroup removal */ 77 + struct percpu_ref refcnt; 78 + 79 + /* cgroup_bpf is released using a work queue */ 80 + struct work_struct release_work; 76 81 }; 77 82 78 - void cgroup_bpf_put(struct cgroup *cgrp); 79 83 int cgroup_bpf_inherit(struct cgroup *cgrp); 84 + void cgroup_bpf_offline(struct cgroup *cgrp); 80 85 81 86 int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, 82 87 enum bpf_attach_type type, u32 flags); ··· 290 283 291 284 struct bpf_prog; 292 285 struct cgroup_bpf {}; 293 - static inline void cgroup_bpf_put(struct cgroup *cgrp) {} 294 286 static inline int cgroup_bpf_inherit(struct cgroup *cgrp) { return 0; } 287 + static inline void cgroup_bpf_offline(struct cgroup *cgrp) {} 295 288 296 289 static inline int cgroup_bpf_prog_attach(const union bpf_attr *attr, 297 290 enum bpf_prog_type ptype,
+68 -10
include/linux/bpf.h
··· 66 66 u64 imm, u32 *off); 67 67 }; 68 68 69 + struct bpf_map_memory { 70 + u32 pages; 71 + struct user_struct *user; 72 + }; 73 + 69 74 struct bpf_map { 70 75 /* The first two cachelines with read-mostly members of which some 71 76 * are also accessed in fast-path (e.g. ops, max_entries). ··· 91 86 u32 btf_key_type_id; 92 87 u32 btf_value_type_id; 93 88 struct btf *btf; 94 - u32 pages; 89 + struct bpf_map_memory memory; 95 90 bool unpriv_array; 96 91 bool frozen; /* write-once */ 97 92 /* 48 bytes hole */ ··· 99 94 /* The 3rd and 4th cacheline with misc members to avoid false sharing 100 95 * particularly with refcounting. 101 96 */ 102 - struct user_struct *user ____cacheline_aligned; 103 - atomic_t refcnt; 97 + atomic_t refcnt ____cacheline_aligned; 104 98 atomic_t usercnt; 105 99 struct work_struct work; 106 100 char name[BPF_OBJ_NAME_LEN]; ··· 374 370 u32 id; 375 371 u32 func_cnt; /* used by non-func prog as the number of func progs */ 376 372 u32 func_idx; /* 0 for non-func prog, the index in func array for func prog */ 373 + bool verifier_zext; /* Zero extensions has been inserted by verifier. */ 377 374 bool offload_requested; 378 375 struct bpf_prog **func; 379 376 void *jit_data; /* JIT specific data. 
arch dependent */ ··· 518 513 }; 519 514 520 515 struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags); 521 - void bpf_prog_array_free(struct bpf_prog_array __rcu *progs); 522 - int bpf_prog_array_length(struct bpf_prog_array __rcu *progs); 523 - int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs, 516 + void bpf_prog_array_free(struct bpf_prog_array *progs); 517 + int bpf_prog_array_length(struct bpf_prog_array *progs); 518 + int bpf_prog_array_copy_to_user(struct bpf_prog_array *progs, 524 519 __u32 __user *prog_ids, u32 cnt); 525 520 526 - void bpf_prog_array_delete_safe(struct bpf_prog_array __rcu *progs, 521 + void bpf_prog_array_delete_safe(struct bpf_prog_array *progs, 527 522 struct bpf_prog *old_prog); 528 - int bpf_prog_array_copy_info(struct bpf_prog_array __rcu *array, 523 + int bpf_prog_array_copy_info(struct bpf_prog_array *array, 529 524 u32 *prog_ids, u32 request_cnt, 530 525 u32 *prog_cnt); 531 - int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array, 526 + int bpf_prog_array_copy(struct bpf_prog_array *old_array, 532 527 struct bpf_prog *exclude_prog, 533 528 struct bpf_prog *include_prog, 534 529 struct bpf_prog_array **new_array); ··· 555 550 preempt_enable(); \ 556 551 _ret; \ 557 552 }) 553 + 554 + /* To be used by __cgroup_bpf_run_filter_skb for EGRESS BPF progs 555 + * so BPF programs can request cwr for TCP packets. 556 + * 557 + * Current cgroup skb programs can only return 0 or 1 (0 to drop the 558 + * packet. This macro changes the behavior so the low order bit 559 + * indicates whether the packet should be dropped (0) or not (1) 560 + * and the next bit is a congestion notification bit. 
This could be 561 + * used by TCP to call tcp_enter_cwr() 562 + * 563 + * Hence, new allowed return values of CGROUP EGRESS BPF programs are: 564 + * 0: drop packet 565 + * 1: keep packet 566 + * 2: drop packet and cn 567 + * 3: keep packet and cn 568 + * 569 + * This macro then converts it to one of the NET_XMIT or an error 570 + * code that is then interpreted as drop packet (and no cn): 571 + * 0: NET_XMIT_SUCCESS skb should be transmitted 572 + * 1: NET_XMIT_DROP skb should be dropped and cn 573 + * 2: NET_XMIT_CN skb should be transmitted and cn 574 + * 3: -EPERM skb should be dropped 575 + */ 576 + #define BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY(array, ctx, func) \ 577 + ({ \ 578 + struct bpf_prog_array_item *_item; \ 579 + struct bpf_prog *_prog; \ 580 + struct bpf_prog_array *_array; \ 581 + u32 ret; \ 582 + u32 _ret = 1; \ 583 + u32 _cn = 0; \ 584 + preempt_disable(); \ 585 + rcu_read_lock(); \ 586 + _array = rcu_dereference(array); \ 587 + _item = &_array->items[0]; \ 588 + while ((_prog = READ_ONCE(_item->prog))) { \ 589 + bpf_cgroup_storage_set(_item->cgroup_storage); \ 590 + ret = func(_prog, ctx); \ 591 + _ret &= (ret & 1); \ 592 + _cn |= (ret & 2); \ 593 + _item++; \ 594 + } \ 595 + rcu_read_unlock(); \ 596 + preempt_enable(); \ 597 + if (_ret) \ 598 + _ret = (_cn ? NET_XMIT_CN : NET_XMIT_SUCCESS); \ 599 + else \ 600 + _ret = (_cn ? 
NET_XMIT_DROP : -EPERM); \ 601 + _ret; \ 602 + }) 558 603 559 604 #define BPF_PROG_RUN_ARRAY(array, ctx, func) \ 560 605 __BPF_PROG_RUN_ARRAY(array, ctx, func, false) ··· 650 595 struct bpf_map * __must_check bpf_map_inc(struct bpf_map *map, bool uref); 651 596 void bpf_map_put_with_uref(struct bpf_map *map); 652 597 void bpf_map_put(struct bpf_map *map); 653 - int bpf_map_precharge_memlock(u32 pages); 654 598 int bpf_map_charge_memlock(struct bpf_map *map, u32 pages); 655 599 void bpf_map_uncharge_memlock(struct bpf_map *map, u32 pages); 600 + int bpf_map_charge_init(struct bpf_map_memory *mem, size_t size); 601 + void bpf_map_charge_finish(struct bpf_map_memory *mem); 602 + void bpf_map_charge_move(struct bpf_map_memory *dst, 603 + struct bpf_map_memory *src); 656 604 void *bpf_map_area_alloc(size_t size, int numa_node); 657 605 void bpf_map_area_free(void *base); 658 606 void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr);
+13 -3
include/linux/bpf_verifier.h
··· 36 36 */ 37 37 enum bpf_reg_liveness { 38 38 REG_LIVE_NONE = 0, /* reg hasn't been read or written this branch */ 39 - REG_LIVE_READ, /* reg was read, so we're sensitive to initial value */ 40 - REG_LIVE_WRITTEN, /* reg was written first, screening off later reads */ 41 - REG_LIVE_DONE = 4, /* liveness won't be updating this register anymore */ 39 + REG_LIVE_READ32 = 0x1, /* reg was read, so we're sensitive to initial value */ 40 + REG_LIVE_READ64 = 0x2, /* likewise, but full 64-bit content matters */ 41 + REG_LIVE_READ = REG_LIVE_READ32 | REG_LIVE_READ64, 42 + REG_LIVE_WRITTEN = 0x4, /* reg was written first, screening off later reads */ 43 + REG_LIVE_DONE = 0x8, /* liveness won't be updating this register anymore */ 42 44 }; 43 45 44 46 struct bpf_reg_state { ··· 133 131 * pointing to bpf_func_state. 134 132 */ 135 133 u32 frameno; 134 + /* Tracks subreg definition. The stored value is the insn_idx of the 135 + * writing insn. This is safe because subreg_def is used before any insn 136 + * patching which only happens after main verification finished. 137 + */ 138 + s32 subreg_def; 136 139 enum bpf_reg_liveness live; 137 140 }; 138 141 ··· 194 187 struct bpf_verifier_state { 195 188 /* call stack tracking */ 196 189 struct bpf_func_state *frame[MAX_CALL_FRAMES]; 190 + u32 insn_idx; 197 191 u32 curframe; 198 192 u32 active_spin_lock; 199 193 bool speculative; ··· 240 232 int ctx_field_size; /* the ctx field size for load insn, maybe 0 */ 241 233 int sanitize_stack_off; /* stack slot to be cleared */ 242 234 bool seen; /* this insn was processed by the verifier */ 235 + bool zext_dst; /* this insn zero extends dst reg */ 243 236 u8 alu_state; /* used in combination with alu_limit */ 237 + bool prune_point; 244 238 unsigned int orig_idx; /* original instruction index */ 245 239 }; 246 240
+18
include/linux/cgroup.h
··· 924 924 925 925 #endif /* !CONFIG_CGROUPS */ 926 926 927 + #ifdef CONFIG_CGROUP_BPF 928 + static inline void cgroup_bpf_get(struct cgroup *cgrp) 929 + { 930 + percpu_ref_get(&cgrp->bpf.refcnt); 931 + } 932 + 933 + static inline void cgroup_bpf_put(struct cgroup *cgrp) 934 + { 935 + percpu_ref_put(&cgrp->bpf.refcnt); 936 + } 937 + 938 + #else /* CONFIG_CGROUP_BPF */ 939 + 940 + static inline void cgroup_bpf_get(struct cgroup *cgrp) {} 941 + static inline void cgroup_bpf_put(struct cgroup *cgrp) {} 942 + 943 + #endif /* CONFIG_CGROUP_BPF */ 944 + 927 945 #endif /* _LINUX_CGROUP_H */
+17 -1
include/linux/filter.h
··· 160 160 .off = 0, \ 161 161 .imm = IMM }) 162 162 163 + /* Special form of mov32, used for doing explicit zero extension on dst. */ 164 + #define BPF_ZEXT_REG(DST) \ 165 + ((struct bpf_insn) { \ 166 + .code = BPF_ALU | BPF_MOV | BPF_X, \ 167 + .dst_reg = DST, \ 168 + .src_reg = DST, \ 169 + .off = 0, \ 170 + .imm = 1 }) 171 + 172 + static inline bool insn_is_zext(const struct bpf_insn *insn) 173 + { 174 + return insn->code == (BPF_ALU | BPF_MOV | BPF_X) && insn->imm == 1; 175 + } 176 + 163 177 /* BPF_LD_IMM64 macro encodes single 'load 64-bit immediate' insn */ 164 178 #define BPF_LD_IMM64(DST, IMM) \ 165 179 BPF_LD_IMM64_RAW(DST, 0, IMM) ··· 526 512 blinded:1, /* Was blinded */ 527 513 is_func:1, /* program is a bpf function */ 528 514 kprobe_override:1, /* Do we override a kprobe? */ 529 - has_callchain_buf:1; /* callchain buffer allocated? */ 515 + has_callchain_buf:1, /* callchain buffer allocated? */ 516 + enforce_expected_attach_type:1; /* Enforce expected_attach_type checking at attach time */ 530 517 enum bpf_prog_type type; /* Type of BPF program */ 531 518 enum bpf_attach_type expected_attach_type; /* For some prog types */ 532 519 u32 len; /* Number of filter blocks */ ··· 826 811 827 812 struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog); 828 813 void bpf_jit_compile(struct bpf_prog *prog); 814 + bool bpf_jit_needs_zext(void); 829 815 bool bpf_helper_changes_pkt_data(void *func); 830 816 831 817 static inline bool bpf_dump_raw_ok(void)
+34 -1
include/uapi/linux/bpf.h
··· 260 260 */ 261 261 #define BPF_F_ANY_ALIGNMENT (1U << 1) 262 262 263 + /* BPF_F_TEST_RND_HI32 is used in BPF_PROG_LOAD command for testing purpose. 264 + * Verifier does sub-register def/use analysis and identifies instructions whose 265 + * def only matters for low 32-bit, high 32-bit is never referenced later 266 + * through implicit zero extension. Therefore verifier notifies JIT back-ends 267 + * that it is safe to ignore clearing high 32-bit for these instructions. This 268 + * saves some back-ends a lot of code-gen. However such optimization is not 269 + * necessary on some arches, for example x86_64, arm64 etc, whose JIT back-ends 270 + * hence hasn't used verifier's analysis result. But, we really want to have a 271 + * way to be able to verify the correctness of the described optimization on 272 + * x86_64 on which testsuites are frequently exercised. 273 + * 274 + * So, this flag is introduced. Once it is set, verifier will randomize high 275 + * 32-bit for those instructions who has been identified as safe to ignore them. 276 + * Then, if verifier is not doing correct analysis, such randomization will 277 + * regress tests to expose bugs. 278 + */ 279 + #define BPF_F_TEST_RND_HI32 (1U << 2) 280 + 263 281 /* When BPF ldimm64's insn[0].src_reg != 0 then this can have 264 282 * two extensions: 265 283 * ··· 2690 2672 * 0 on success. 2691 2673 * 2692 2674 * **-ENOENT** if the bpf-local-storage cannot be found. 2675 + * 2676 + * int bpf_send_signal(u32 sig) 2677 + * Description 2678 + * Send signal *sig* to the current task. 2679 + * Return 2680 + * 0 on success or successfully queued. 2681 + * 2682 + * **-EBUSY** if work queue under nmi is full. 2683 + * 2684 + * **-EINVAL** if *sig* is invalid. 2685 + * 2686 + * **-EPERM** if no permission to send the *sig*. 2687 + * 2688 + * **-EAGAIN** if bpf program can try again. 
2693 2689 */ 2694 2690 #define __BPF_FUNC_MAPPER(FN) \ 2695 2691 FN(unspec), \ ··· 2814 2782 FN(strtol), \ 2815 2783 FN(strtoul), \ 2816 2784 FN(sk_storage_get), \ 2817 - FN(sk_storage_delete), 2785 + FN(sk_storage_delete), \ 2786 + FN(send_signal), 2818 2787 2819 2788 /* integer value in 'imm' field of BPF_CALL instruction selects which helper 2820 2789 * function eBPF program intends to call
+8 -10
kernel/bpf/arraymap.c
··· 83 83 u32 elem_size, index_mask, max_entries; 84 84 bool unpriv = !capable(CAP_SYS_ADMIN); 85 85 u64 cost, array_size, mask64; 86 + struct bpf_map_memory mem; 86 87 struct bpf_array *array; 87 88 88 89 elem_size = round_up(attr->value_size, 8); ··· 117 116 118 117 /* make sure there is no u32 overflow later in round_up() */ 119 118 cost = array_size; 120 - if (cost >= U32_MAX - PAGE_SIZE) 121 - return ERR_PTR(-ENOMEM); 122 - if (percpu) { 119 + if (percpu) 123 120 cost += (u64)attr->max_entries * elem_size * num_possible_cpus(); 124 - if (cost >= U32_MAX - PAGE_SIZE) 125 - return ERR_PTR(-ENOMEM); 126 - } 127 - cost = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; 128 121 129 - ret = bpf_map_precharge_memlock(cost); 122 + ret = bpf_map_charge_init(&mem, cost); 130 123 if (ret < 0) 131 124 return ERR_PTR(ret); 132 125 133 126 /* allocate all map elements and zero-initialize them */ 134 127 array = bpf_map_area_alloc(array_size, numa_node); 135 - if (!array) 128 + if (!array) { 129 + bpf_map_charge_finish(&mem); 136 130 return ERR_PTR(-ENOMEM); 131 + } 137 132 array->index_mask = index_mask; 138 133 array->map.unpriv_array = unpriv; 139 134 140 135 /* copy mandatory map attributes */ 141 136 bpf_map_init_from_attr(&array->map, attr); 142 - array->map.pages = cost; 137 + bpf_map_charge_move(&array->map.memory, &mem); 143 138 array->elem_size = elem_size; 144 139 145 140 if (percpu && bpf_array_alloc_percpu(array)) { 141 + bpf_map_charge_finish(&array->map.memory); 146 142 bpf_map_area_free(array); 147 143 return ERR_PTR(-ENOMEM); 148 144 }
+76 -22
kernel/bpf/cgroup.c
··· 22 22 DEFINE_STATIC_KEY_FALSE(cgroup_bpf_enabled_key); 23 23 EXPORT_SYMBOL(cgroup_bpf_enabled_key); 24 24 25 - /** 26 - * cgroup_bpf_put() - put references of all bpf programs 27 - * @cgrp: the cgroup to modify 28 - */ 29 - void cgroup_bpf_put(struct cgroup *cgrp) 25 + void cgroup_bpf_offline(struct cgroup *cgrp) 30 26 { 27 + cgroup_get(cgrp); 28 + percpu_ref_kill(&cgrp->bpf.refcnt); 29 + } 30 + 31 + /** 32 + * cgroup_bpf_release() - put references of all bpf programs and 33 + * release all cgroup bpf data 34 + * @work: work structure embedded into the cgroup to modify 35 + */ 36 + static void cgroup_bpf_release(struct work_struct *work) 37 + { 38 + struct cgroup *cgrp = container_of(work, struct cgroup, 39 + bpf.release_work); 31 40 enum bpf_cgroup_storage_type stype; 41 + struct bpf_prog_array *old_array; 32 42 unsigned int type; 33 43 34 44 for (type = 0; type < ARRAY_SIZE(cgrp->bpf.progs); type++) { ··· 55 45 kfree(pl); 56 46 static_branch_dec(&cgroup_bpf_enabled_key); 57 47 } 58 - bpf_prog_array_free(cgrp->bpf.effective[type]); 48 + old_array = rcu_dereference_protected( 49 + cgrp->bpf.effective[type], 50 + percpu_ref_is_dying(&cgrp->bpf.refcnt)); 51 + bpf_prog_array_free(old_array); 59 52 } 53 + 54 + percpu_ref_exit(&cgrp->bpf.refcnt); 55 + cgroup_put(cgrp); 56 + } 57 + 58 + /** 59 + * cgroup_bpf_release_fn() - callback used to schedule releasing 60 + * of bpf cgroup data 61 + * @ref: percpu ref counter structure 62 + */ 63 + static void cgroup_bpf_release_fn(struct percpu_ref *ref) 64 + { 65 + struct cgroup *cgrp = container_of(ref, struct cgroup, bpf.refcnt); 66 + 67 + INIT_WORK(&cgrp->bpf.release_work, cgroup_bpf_release); 68 + queue_work(system_wq, &cgrp->bpf.release_work); 60 69 } 61 70 62 71 /* count number of elements in the list. 
··· 130 101 */ 131 102 static int compute_effective_progs(struct cgroup *cgrp, 132 103 enum bpf_attach_type type, 133 - struct bpf_prog_array __rcu **array) 104 + struct bpf_prog_array **array) 134 105 { 135 106 enum bpf_cgroup_storage_type stype; 136 107 struct bpf_prog_array *progs; ··· 168 139 } 169 140 } while ((p = cgroup_parent(p))); 170 141 171 - rcu_assign_pointer(*array, progs); 142 + *array = progs; 172 143 return 0; 173 144 } 174 145 175 146 static void activate_effective_progs(struct cgroup *cgrp, 176 147 enum bpf_attach_type type, 177 - struct bpf_prog_array __rcu *array) 148 + struct bpf_prog_array *old_array) 178 149 { 179 - struct bpf_prog_array __rcu *old_array; 180 - 181 - old_array = xchg(&cgrp->bpf.effective[type], array); 150 + rcu_swap_protected(cgrp->bpf.effective[type], old_array, 151 + lockdep_is_held(&cgroup_mutex)); 182 152 /* free prog array after grace period, since __cgroup_bpf_run_*() 183 153 * might be still walking the array 184 154 */ ··· 194 166 * that array below is variable length 195 167 */ 196 168 #define NR ARRAY_SIZE(cgrp->bpf.effective) 197 - struct bpf_prog_array __rcu *arrays[NR] = {}; 198 - int i; 169 + struct bpf_prog_array *arrays[NR] = {}; 170 + int ret, i; 171 + 172 + ret = percpu_ref_init(&cgrp->bpf.refcnt, cgroup_bpf_release_fn, 0, 173 + GFP_KERNEL); 174 + if (ret) 175 + return ret; 199 176 200 177 for (i = 0; i < NR; i++) 201 178 INIT_LIST_HEAD(&cgrp->bpf.progs[i]); ··· 216 183 cleanup: 217 184 for (i = 0; i < NR; i++) 218 185 bpf_prog_array_free(arrays[i]); 186 + 187 + percpu_ref_exit(&cgrp->bpf.refcnt); 188 + 219 189 return -ENOMEM; 220 190 } 221 191 ··· 480 444 enum bpf_attach_type type = attr->query.attach_type; 481 445 struct list_head *progs = &cgrp->bpf.progs[type]; 482 446 u32 flags = cgrp->bpf.flags[type]; 447 + struct bpf_prog_array *effective; 483 448 int cnt, ret = 0, i; 484 449 450 + effective = rcu_dereference_protected(cgrp->bpf.effective[type], 451 + lockdep_is_held(&cgroup_mutex)); 452 + 485 453 
if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) 486 - cnt = bpf_prog_array_length(cgrp->bpf.effective[type]); 454 + cnt = bpf_prog_array_length(effective); 487 455 else 488 456 cnt = prog_list_length(progs); 489 457 ··· 504 464 } 505 465 506 466 if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) { 507 - return bpf_prog_array_copy_to_user(cgrp->bpf.effective[type], 508 - prog_ids, cnt); 467 + return bpf_prog_array_copy_to_user(effective, prog_ids, cnt); 509 468 } else { 510 469 struct bpf_prog_list *pl; 511 470 u32 id; ··· 587 548 * The program type passed in via @type must be suitable for network 588 549 * filtering. No further check is performed to assert that. 589 550 * 590 - * This function will return %-EPERM if any if an attached program was found 591 - * and if it returned != 1 during execution. In all other cases, 0 is returned. 551 + * For egress packets, this function can return: 552 + * NET_XMIT_SUCCESS (0) - continue with packet output 553 + * NET_XMIT_DROP (1) - drop packet and notify TCP to call cwr 554 + * NET_XMIT_CN (2) - continue with packet output and notify TCP 555 + * to call cwr 556 + * -EPERM - drop packet 557 + * 558 + * For ingress packets, this function will return -EPERM if any 559 + * attached program was found and if it returned != 1 during execution. 560 + * Otherwise 0 is returned. 592 561 */ 593 562 int __cgroup_bpf_run_filter_skb(struct sock *sk, 594 563 struct sk_buff *skb, ··· 622 575 /* compute pointers for the bpf prog */ 623 576 bpf_compute_and_save_data_end(skb, &saved_data_end); 624 577 625 - ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], skb, 626 - __bpf_prog_run_save_cb); 578 + if (type == BPF_CGROUP_INET_EGRESS) { 579 + ret = BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY( 580 + cgrp->bpf.effective[type], skb, __bpf_prog_run_save_cb); 581 + } else { 582 + ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], skb, 583 + __bpf_prog_run_save_cb); 584 + ret = (ret == 1 ? 
0 : -EPERM); 585 + } 627 586 bpf_restore_data_end(skb, saved_data_end); 628 587 __skb_pull(skb, offset); 629 588 skb->sk = save_sk; 630 - return ret == 1 ? 0 : -EPERM; 589 + 590 + return ret; 631 591 } 632 592 EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb); 633 593
+22 -24
kernel/bpf/core.c
··· 1795 1795 return &empty_prog_array.hdr; 1796 1796 } 1797 1797 1798 - void bpf_prog_array_free(struct bpf_prog_array __rcu *progs) 1798 + void bpf_prog_array_free(struct bpf_prog_array *progs) 1799 1799 { 1800 - if (!progs || 1801 - progs == (struct bpf_prog_array __rcu *)&empty_prog_array.hdr) 1800 + if (!progs || progs == &empty_prog_array.hdr) 1802 1801 return; 1803 1802 kfree_rcu(progs, rcu); 1804 1803 } 1805 1804 1806 - int bpf_prog_array_length(struct bpf_prog_array __rcu *array) 1805 + int bpf_prog_array_length(struct bpf_prog_array *array) 1807 1806 { 1808 1807 struct bpf_prog_array_item *item; 1809 1808 u32 cnt = 0; 1810 1809 1811 - rcu_read_lock(); 1812 - item = rcu_dereference(array)->items; 1813 - for (; item->prog; item++) 1810 + for (item = array->items; item->prog; item++) 1814 1811 if (item->prog != &dummy_bpf_prog.prog) 1815 1812 cnt++; 1816 - rcu_read_unlock(); 1817 1813 return cnt; 1818 1814 } 1819 1815 1820 1816 1821 - static bool bpf_prog_array_copy_core(struct bpf_prog_array __rcu *array, 1817 + static bool bpf_prog_array_copy_core(struct bpf_prog_array *array, 1822 1818 u32 *prog_ids, 1823 1819 u32 request_cnt) 1824 1820 { 1825 1821 struct bpf_prog_array_item *item; 1826 1822 int i = 0; 1827 1823 1828 - item = rcu_dereference_check(array, 1)->items; 1829 - for (; item->prog; item++) { 1824 + for (item = array->items; item->prog; item++) { 1830 1825 if (item->prog == &dummy_bpf_prog.prog) 1831 1826 continue; 1832 1827 prog_ids[i] = item->prog->aux->id; ··· 1834 1839 return !!(item->prog); 1835 1840 } 1836 1841 1837 - int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *array, 1842 + int bpf_prog_array_copy_to_user(struct bpf_prog_array *array, 1838 1843 __u32 __user *prog_ids, u32 cnt) 1839 1844 { 1840 1845 unsigned long err = 0; ··· 1845 1850 * cnt = bpf_prog_array_length(); 1846 1851 * if (cnt > 0) 1847 1852 * bpf_prog_array_copy_to_user(..., cnt); 1848 - * so below kcalloc doesn't need extra cnt > 0 check, but 1849 - * 
bpf_prog_array_length() releases rcu lock and 1850 - * prog array could have been swapped with empty or larger array, 1851 - * so always copy 'cnt' prog_ids to the user. 1852 - * In a rare race the user will see zero prog_ids 1853 + * so below kcalloc doesn't need extra cnt > 0 check. 1853 1854 */ 1854 1855 ids = kcalloc(cnt, sizeof(u32), GFP_USER | __GFP_NOWARN); 1855 1856 if (!ids) 1856 1857 return -ENOMEM; 1857 - rcu_read_lock(); 1858 1858 nospc = bpf_prog_array_copy_core(array, ids, cnt); 1859 - rcu_read_unlock(); 1860 1859 err = copy_to_user(prog_ids, ids, cnt * sizeof(u32)); 1861 1860 kfree(ids); 1862 1861 if (err) ··· 1860 1871 return 0; 1861 1872 } 1862 1873 1863 - void bpf_prog_array_delete_safe(struct bpf_prog_array __rcu *array, 1874 + void bpf_prog_array_delete_safe(struct bpf_prog_array *array, 1864 1875 struct bpf_prog *old_prog) 1865 1876 { 1866 - struct bpf_prog_array_item *item = array->items; 1877 + struct bpf_prog_array_item *item; 1867 1878 1868 - for (; item->prog; item++) 1879 + for (item = array->items; item->prog; item++) 1869 1880 if (item->prog == old_prog) { 1870 1881 WRITE_ONCE(item->prog, &dummy_bpf_prog.prog); 1871 1882 break; 1872 1883 } 1873 1884 } 1874 1885 1875 - int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array, 1886 + int bpf_prog_array_copy(struct bpf_prog_array *old_array, 1876 1887 struct bpf_prog *exclude_prog, 1877 1888 struct bpf_prog *include_prog, 1878 1889 struct bpf_prog_array **new_array) ··· 1936 1947 return 0; 1937 1948 } 1938 1949 1939 - int bpf_prog_array_copy_info(struct bpf_prog_array __rcu *array, 1950 + int bpf_prog_array_copy_info(struct bpf_prog_array *array, 1940 1951 u32 *prog_ids, u32 request_cnt, 1941 1952 u32 *prog_cnt) 1942 1953 { ··· 2075 2086 } 2076 2087 2077 2088 bool __weak bpf_helper_changes_pkt_data(void *func) 2089 + { 2090 + return false; 2091 + } 2092 + 2093 + /* Return TRUE if the JIT backend wants verifier to enable sub-register usage 2094 + * analysis code and wants explicit 
zero extension inserted by verifier. 2095 + * Otherwise, return FALSE. 2096 + */ 2097 + bool __weak bpf_jit_needs_zext(void) 2078 2098 { 2079 2099 return false; 2080 2100 }
+4 -5
kernel/bpf/cpumap.c
··· 106 106 /* make sure page count doesn't overflow */ 107 107 cost = (u64) cmap->map.max_entries * sizeof(struct bpf_cpu_map_entry *); 108 108 cost += cpu_map_bitmap_size(attr) * num_possible_cpus(); 109 - if (cost >= U32_MAX - PAGE_SIZE) 110 - goto free_cmap; 111 - cmap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; 112 109 113 110 /* Notice returns -EPERM on if map size is larger than memlock limit */ 114 - ret = bpf_map_precharge_memlock(cmap->map.pages); 111 + ret = bpf_map_charge_init(&cmap->map.memory, cost); 115 112 if (ret) { 116 113 err = ret; 117 114 goto free_cmap; ··· 118 121 cmap->flush_needed = __alloc_percpu(cpu_map_bitmap_size(attr), 119 122 __alignof__(unsigned long)); 120 123 if (!cmap->flush_needed) 121 - goto free_cmap; 124 + goto free_charge; 122 125 123 126 /* Alloc array for possible remote "destination" CPUs */ 124 127 cmap->cpu_map = bpf_map_area_alloc(cmap->map.max_entries * ··· 130 133 return &cmap->map; 131 134 free_percpu: 132 135 free_percpu(cmap->flush_needed); 136 + free_charge: 137 + bpf_map_charge_finish(&cmap->map.memory); 133 138 free_cmap: 134 139 kfree(cmap); 135 140 return ERR_PTR(err);
+6 -8
kernel/bpf/devmap.c
··· 108 108 /* make sure page count doesn't overflow */ 109 109 cost = (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *); 110 110 cost += dev_map_bitmap_size(attr) * num_possible_cpus(); 111 - if (cost >= U32_MAX - PAGE_SIZE) 112 - goto free_dtab; 113 111 114 - dtab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; 115 - 116 - /* if map size is larger than memlock limit, reject it early */ 117 - err = bpf_map_precharge_memlock(dtab->map.pages); 112 + /* if map size is larger than memlock limit, reject it */ 113 + err = bpf_map_charge_init(&dtab->map.memory, cost); 118 114 if (err) 119 115 goto free_dtab; 120 116 ··· 121 125 __alignof__(unsigned long), 122 126 GFP_KERNEL | __GFP_NOWARN); 123 127 if (!dtab->flush_needed) 124 - goto free_dtab; 128 + goto free_charge; 125 129 126 130 dtab->netdev_map = bpf_map_area_alloc(dtab->map.max_entries * 127 131 sizeof(struct bpf_dtab_netdev *), 128 132 dtab->map.numa_node); 129 133 if (!dtab->netdev_map) 130 - goto free_dtab; 134 + goto free_charge; 131 135 132 136 spin_lock(&dev_map_lock); 133 137 list_add_tail_rcu(&dtab->list, &dev_map_list); 134 138 spin_unlock(&dev_map_lock); 135 139 136 140 return &dtab->map; 141 + free_charge: 142 + bpf_map_charge_finish(&dtab->map.memory); 137 143 free_dtab: 138 144 free_percpu(dtab->flush_needed); 139 145 kfree(dtab);
+5 -9
kernel/bpf/hashtab.c
··· 360 360 else 361 361 cost += (u64) htab->elem_size * num_possible_cpus(); 362 362 363 - if (cost >= U32_MAX - PAGE_SIZE) 364 - /* make sure page count doesn't overflow */ 365 - goto free_htab; 366 - 367 - htab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; 368 - 369 - /* if map size is larger than memlock limit, reject it early */ 370 - err = bpf_map_precharge_memlock(htab->map.pages); 363 + /* if map size is larger than memlock limit, reject it */ 364 + err = bpf_map_charge_init(&htab->map.memory, cost); 371 365 if (err) 372 366 goto free_htab; 373 367 ··· 370 376 sizeof(struct bucket), 371 377 htab->map.numa_node); 372 378 if (!htab->buckets) 373 - goto free_htab; 379 + goto free_charge; 374 380 375 381 if (htab->map.map_flags & BPF_F_ZERO_SEED) 376 382 htab->hashrnd = 0; ··· 403 409 prealloc_destroy(htab); 404 410 free_buckets: 405 411 bpf_map_area_free(htab->buckets); 412 + free_charge: 413 + bpf_map_charge_finish(&htab->map.memory); 406 414 free_htab: 407 415 kfree(htab); 408 416 return ERR_PTR(err);
+10 -3
kernel/bpf/local_storage.c
··· 272 272 { 273 273 int numa_node = bpf_map_attr_numa_node(attr); 274 274 struct bpf_cgroup_storage_map *map; 275 + struct bpf_map_memory mem; 276 + int ret; 275 277 276 278 if (attr->key_size != sizeof(struct bpf_cgroup_storage_key)) 277 279 return ERR_PTR(-EINVAL); ··· 292 290 /* max_entries is not used and enforced to be 0 */ 293 291 return ERR_PTR(-EINVAL); 294 292 293 + ret = bpf_map_charge_init(&mem, sizeof(struct bpf_cgroup_storage_map)); 294 + if (ret < 0) 295 + return ERR_PTR(ret); 296 + 295 297 map = kmalloc_node(sizeof(struct bpf_cgroup_storage_map), 296 298 __GFP_ZERO | GFP_USER, numa_node); 297 - if (!map) 299 + if (!map) { 300 + bpf_map_charge_finish(&mem); 298 301 return ERR_PTR(-ENOMEM); 302 + } 299 303 300 - map->map.pages = round_up(sizeof(struct bpf_cgroup_storage_map), 301 - PAGE_SIZE) >> PAGE_SHIFT; 304 + bpf_map_charge_move(&map->map.memory, &mem); 302 305 303 306 /* copy mandatory map attributes */ 304 307 bpf_map_init_from_attr(&map->map, attr);
+1 -7
kernel/bpf/lpm_trie.c
··· 573 573 cost_per_node = sizeof(struct lpm_trie_node) + 574 574 attr->value_size + trie->data_size; 575 575 cost += (u64) attr->max_entries * cost_per_node; 576 - if (cost >= U32_MAX - PAGE_SIZE) { 577 - ret = -E2BIG; 578 - goto out_err; 579 - } 580 576 581 - trie->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; 582 - 583 - ret = bpf_map_precharge_memlock(trie->map.pages); 577 + ret = bpf_map_charge_init(&trie->map.memory, cost); 584 578 if (ret) 585 579 goto out_err; 586 580
+6 -7
kernel/bpf/queue_stack_maps.c
··· 67 67 static struct bpf_map *queue_stack_map_alloc(union bpf_attr *attr) 68 68 { 69 69 int ret, numa_node = bpf_map_attr_numa_node(attr); 70 + struct bpf_map_memory mem = {0}; 70 71 struct bpf_queue_stack *qs; 71 72 u64 size, queue_size, cost; 72 73 73 74 size = (u64) attr->max_entries + 1; 74 75 cost = queue_size = sizeof(*qs) + size * attr->value_size; 75 - if (cost >= U32_MAX - PAGE_SIZE) 76 - return ERR_PTR(-E2BIG); 77 76 78 - cost = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; 79 - 80 - ret = bpf_map_precharge_memlock(cost); 77 + ret = bpf_map_charge_init(&mem, cost); 81 78 if (ret < 0) 82 79 return ERR_PTR(ret); 83 80 84 81 qs = bpf_map_area_alloc(queue_size, numa_node); 85 - if (!qs) 82 + if (!qs) { 83 + bpf_map_charge_finish(&mem); 86 84 return ERR_PTR(-ENOMEM); 85 + } 87 86 88 87 memset(qs, 0, sizeof(*qs)); 89 88 90 89 bpf_map_init_from_attr(&qs->map, attr); 91 90 92 - qs->map.pages = cost; 91 + bpf_map_charge_move(&qs->map.memory, &mem); 93 92 qs->size = size; 94 93 95 94 raw_spin_lock_init(&qs->lock);
+7 -10
kernel/bpf/reuseport_array.c
··· 151 151 { 152 152 int err, numa_node = bpf_map_attr_numa_node(attr); 153 153 struct reuseport_array *array; 154 - u64 cost, array_size; 154 + struct bpf_map_memory mem; 155 + u64 array_size; 155 156 156 157 if (!capable(CAP_SYS_ADMIN)) 157 158 return ERR_PTR(-EPERM); ··· 160 159 array_size = sizeof(*array); 161 160 array_size += (u64)attr->max_entries * sizeof(struct sock *); 162 161 163 - /* make sure there is no u32 overflow later in round_up() */ 164 - cost = array_size; 165 - if (cost >= U32_MAX - PAGE_SIZE) 166 - return ERR_PTR(-ENOMEM); 167 - cost = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; 168 - 169 - err = bpf_map_precharge_memlock(cost); 162 + err = bpf_map_charge_init(&mem, array_size); 170 163 if (err) 171 164 return ERR_PTR(err); 172 165 173 166 /* allocate all map elements and zero-initialize them */ 174 167 array = bpf_map_area_alloc(array_size, numa_node); 175 - if (!array) 168 + if (!array) { 169 + bpf_map_charge_finish(&mem); 176 170 return ERR_PTR(-ENOMEM); 171 + } 177 172 178 173 /* copy mandatory map attributes */ 179 174 bpf_map_init_from_attr(&array->map, attr); 180 - array->map.pages = cost; 175 + bpf_map_charge_move(&array->map.memory, &mem); 181 176 182 177 return &array->map; 183 178 }
+13 -15
kernel/bpf/stackmap.c
··· 89 89 { 90 90 u32 value_size = attr->value_size; 91 91 struct bpf_stack_map *smap; 92 + struct bpf_map_memory mem; 92 93 u64 cost, n_buckets; 93 94 int err; 94 95 ··· 117 116 n_buckets = roundup_pow_of_two(attr->max_entries); 118 117 119 118 cost = n_buckets * sizeof(struct stack_map_bucket *) + sizeof(*smap); 120 - if (cost >= U32_MAX - PAGE_SIZE) 121 - return ERR_PTR(-E2BIG); 119 + cost += n_buckets * (value_size + sizeof(struct stack_map_bucket)); 120 + err = bpf_map_charge_init(&mem, cost); 121 + if (err) 122 + return ERR_PTR(err); 122 123 123 124 smap = bpf_map_area_alloc(cost, bpf_map_attr_numa_node(attr)); 124 - if (!smap) 125 + if (!smap) { 126 + bpf_map_charge_finish(&mem); 125 127 return ERR_PTR(-ENOMEM); 126 - 127 - err = -E2BIG; 128 - cost += n_buckets * (value_size + sizeof(struct stack_map_bucket)); 129 - if (cost >= U32_MAX - PAGE_SIZE) 130 - goto free_smap; 128 + } 131 129 132 130 bpf_map_init_from_attr(&smap->map, attr); 133 131 smap->map.value_size = value_size; 134 132 smap->n_buckets = n_buckets; 135 - smap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; 136 - 137 - err = bpf_map_precharge_memlock(smap->map.pages); 138 - if (err) 139 - goto free_smap; 140 133 141 134 err = get_callchain_buffers(sysctl_perf_event_max_stack); 142 135 if (err) 143 - goto free_smap; 136 + goto free_charge; 144 137 145 138 err = prealloc_elems_and_freelist(smap); 146 139 if (err) 147 140 goto put_buffers; 148 141 142 + bpf_map_charge_move(&smap->map.memory, &mem); 143 + 149 144 return &smap->map; 150 145 151 146 put_buffers: 152 147 put_callchain_buffers(); 153 - free_smap: 148 + free_charge: 149 + bpf_map_charge_finish(&mem); 154 150 bpf_map_area_free(smap); 155 151 return ERR_PTR(err); 156 152 }
+60 -43
kernel/bpf/syscall.c
··· 188 188 map->numa_node = bpf_map_attr_numa_node(attr); 189 189 } 190 190 191 - int bpf_map_precharge_memlock(u32 pages) 192 - { 193 - struct user_struct *user = get_current_user(); 194 - unsigned long memlock_limit, cur; 195 - 196 - memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; 197 - cur = atomic_long_read(&user->locked_vm); 198 - free_uid(user); 199 - if (cur + pages > memlock_limit) 200 - return -EPERM; 201 - return 0; 202 - } 203 - 204 191 static int bpf_charge_memlock(struct user_struct *user, u32 pages) 205 192 { 206 193 unsigned long memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; ··· 201 214 202 215 static void bpf_uncharge_memlock(struct user_struct *user, u32 pages) 203 216 { 204 - atomic_long_sub(pages, &user->locked_vm); 217 + if (user) 218 + atomic_long_sub(pages, &user->locked_vm); 205 219 } 206 220 207 - static int bpf_map_init_memlock(struct bpf_map *map) 221 + int bpf_map_charge_init(struct bpf_map_memory *mem, size_t size) 208 222 { 209 - struct user_struct *user = get_current_user(); 223 + u32 pages = round_up(size, PAGE_SIZE) >> PAGE_SHIFT; 224 + struct user_struct *user; 210 225 int ret; 211 226 212 - ret = bpf_charge_memlock(user, map->pages); 227 + if (size >= U32_MAX - PAGE_SIZE) 228 + return -E2BIG; 229 + 230 + user = get_current_user(); 231 + ret = bpf_charge_memlock(user, pages); 213 232 if (ret) { 214 233 free_uid(user); 215 234 return ret; 216 235 } 217 - map->user = user; 218 - return ret; 236 + 237 + mem->pages = pages; 238 + mem->user = user; 239 + 240 + return 0; 219 241 } 220 242 221 - static void bpf_map_release_memlock(struct bpf_map *map) 243 + void bpf_map_charge_finish(struct bpf_map_memory *mem) 222 244 { 223 - struct user_struct *user = map->user; 224 - bpf_uncharge_memlock(user, map->pages); 225 - free_uid(user); 245 + bpf_uncharge_memlock(mem->user, mem->pages); 246 + free_uid(mem->user); 247 + } 248 + 249 + void bpf_map_charge_move(struct bpf_map_memory *dst, 250 + struct bpf_map_memory *src) 251 + { 252 
+ *dst = *src; 253 + 254 + /* Make sure src will not be used for the redundant uncharging. */ 255 + memset(src, 0, sizeof(struct bpf_map_memory)); 226 256 } 227 257 228 258 int bpf_map_charge_memlock(struct bpf_map *map, u32 pages) 229 259 { 230 260 int ret; 231 261 232 - ret = bpf_charge_memlock(map->user, pages); 262 + ret = bpf_charge_memlock(map->memory.user, pages); 233 263 if (ret) 234 264 return ret; 235 - map->pages += pages; 265 + map->memory.pages += pages; 236 266 return ret; 237 267 } 238 268 239 269 void bpf_map_uncharge_memlock(struct bpf_map *map, u32 pages) 240 270 { 241 - bpf_uncharge_memlock(map->user, pages); 242 - map->pages -= pages; 271 + bpf_uncharge_memlock(map->memory.user, pages); 272 + map->memory.pages -= pages; 243 273 } 244 274 245 275 static int bpf_map_alloc_id(struct bpf_map *map) ··· 307 303 static void bpf_map_free_deferred(struct work_struct *work) 308 304 { 309 305 struct bpf_map *map = container_of(work, struct bpf_map, work); 306 + struct bpf_map_memory mem; 310 307 311 - bpf_map_release_memlock(map); 308 + bpf_map_charge_move(&mem, &map->memory); 312 309 security_bpf_map_free(map); 313 310 /* implementation dependent freeing */ 314 311 map->ops->map_free(map); 312 + bpf_map_charge_finish(&mem); 315 313 } 316 314 317 315 static void bpf_map_put_uref(struct bpf_map *map) ··· 401 395 map->value_size, 402 396 map->max_entries, 403 397 map->map_flags, 404 - map->pages * 1ULL << PAGE_SHIFT, 398 + map->memory.pages * 1ULL << PAGE_SHIFT, 405 399 map->id, 406 400 READ_ONCE(map->frozen)); 407 401 ··· 555 549 static int map_create(union bpf_attr *attr) 556 550 { 557 551 int numa_node = bpf_map_attr_numa_node(attr); 552 + struct bpf_map_memory mem; 558 553 struct bpf_map *map; 559 554 int f_flags; 560 555 int err; ··· 580 573 581 574 err = bpf_obj_name_cpy(map->name, attr->map_name); 582 575 if (err) 583 - goto free_map_nouncharge; 576 + goto free_map; 584 577 585 578 atomic_set(&map->refcnt, 1); 586 579 atomic_set(&map->usercnt, 1); ··· 
590 583 591 584 if (!attr->btf_value_type_id) { 592 585 err = -EINVAL; 593 - goto free_map_nouncharge; 586 + goto free_map; 594 587 } 595 588 596 589 btf = btf_get_by_fd(attr->btf_fd); 597 590 if (IS_ERR(btf)) { 598 591 err = PTR_ERR(btf); 599 - goto free_map_nouncharge; 592 + goto free_map; 600 593 } 601 594 602 595 err = map_check_btf(map, btf, attr->btf_key_type_id, 603 596 attr->btf_value_type_id); 604 597 if (err) { 605 598 btf_put(btf); 606 - goto free_map_nouncharge; 599 + goto free_map; 607 600 } 608 601 609 602 map->btf = btf; ··· 615 608 616 609 err = security_bpf_map_alloc(map); 617 610 if (err) 618 - goto free_map_nouncharge; 619 - 620 - err = bpf_map_init_memlock(map); 621 - if (err) 622 - goto free_map_sec; 611 + goto free_map; 623 612 624 613 err = bpf_map_alloc_id(map); 625 614 if (err) 626 - goto free_map; 615 + goto free_map_sec; 627 616 628 617 err = bpf_map_new_fd(map, f_flags); 629 618 if (err < 0) { ··· 635 632 636 633 return err; 637 634 638 - free_map: 639 - bpf_map_release_memlock(map); 640 635 free_map_sec: 641 636 security_bpf_map_free(map); 642 - free_map_nouncharge: 637 + free_map: 643 638 btf_put(map->btf); 639 + bpf_map_charge_move(&mem, &map->memory); 644 640 map->ops->map_free(map); 641 + bpf_map_charge_finish(&mem); 645 642 return err; 646 643 } 647 644 ··· 1588 1585 default: 1589 1586 return -EINVAL; 1590 1587 } 1588 + case BPF_PROG_TYPE_CGROUP_SKB: 1589 + switch (expected_attach_type) { 1590 + case BPF_CGROUP_INET_INGRESS: 1591 + case BPF_CGROUP_INET_EGRESS: 1592 + return 0; 1593 + default: 1594 + return -EINVAL; 1595 + } 1591 1596 default: 1592 1597 return 0; 1593 1598 } ··· 1615 1604 if (CHECK_ATTR(BPF_PROG_LOAD)) 1616 1605 return -EINVAL; 1617 1606 1618 - if (attr->prog_flags & ~(BPF_F_STRICT_ALIGNMENT | BPF_F_ANY_ALIGNMENT)) 1607 + if (attr->prog_flags & ~(BPF_F_STRICT_ALIGNMENT | 1608 + BPF_F_ANY_ALIGNMENT | 1609 + BPF_F_TEST_RND_HI32)) 1619 1610 return -EINVAL; 1620 1611 1621 1612 if 
(!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && ··· 1847 1834 case BPF_PROG_TYPE_CGROUP_SOCK: 1848 1835 case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: 1849 1836 return attach_type == prog->expected_attach_type ? 0 : -EINVAL; 1837 + case BPF_PROG_TYPE_CGROUP_SKB: 1838 + return prog->enforce_expected_attach_type && 1839 + prog->expected_attach_type != attach_type ? 1840 + -EINVAL : 0; 1850 1841 default: 1851 1842 return 0; 1852 1843 }
+345 -52
kernel/bpf/verifier.c
··· 176 176 struct bpf_verifier_stack_elem *next; 177 177 }; 178 178 179 - #define BPF_COMPLEXITY_LIMIT_STACK 1024 179 + #define BPF_COMPLEXITY_LIMIT_JMP_SEQ 8192 180 180 #define BPF_COMPLEXITY_LIMIT_STATES 64 181 181 182 182 #define BPF_MAP_PTR_UNPRIV 1UL ··· 782 782 if (err) 783 783 goto err; 784 784 elem->st.speculative |= speculative; 785 - if (env->stack_size > BPF_COMPLEXITY_LIMIT_STACK) { 786 - verbose(env, "BPF program is too complex\n"); 785 + if (env->stack_size > BPF_COMPLEXITY_LIMIT_JMP_SEQ) { 786 + verbose(env, "The sequence of %d jumps is too complex.\n", 787 + env->stack_size); 787 788 goto err; 788 789 } 789 790 return &elem->st; ··· 982 981 __mark_reg_not_init(regs + regno); 983 982 } 984 983 984 + #define DEF_NOT_SUBREG (0) 985 985 static void init_reg_state(struct bpf_verifier_env *env, 986 986 struct bpf_func_state *state) 987 987 { ··· 993 991 mark_reg_not_init(env, regs, i); 994 992 regs[i].live = REG_LIVE_NONE; 995 993 regs[i].parent = NULL; 994 + regs[i].subreg_def = DEF_NOT_SUBREG; 996 995 } 997 996 998 997 /* frame pointer */ ··· 1139 1136 */ 1140 1137 static int mark_reg_read(struct bpf_verifier_env *env, 1141 1138 const struct bpf_reg_state *state, 1142 - struct bpf_reg_state *parent) 1139 + struct bpf_reg_state *parent, u8 flag) 1143 1140 { 1144 1141 bool writes = parent == state->parent; /* Observe write marks */ 1145 1142 int cnt = 0; ··· 1154 1151 parent->var_off.value, parent->off); 1155 1152 return -EFAULT; 1156 1153 } 1157 - if (parent->live & REG_LIVE_READ) 1154 + /* The first condition is more likely to be true than the 1155 + * second, checked it first. 1156 + */ 1157 + if ((parent->live & REG_LIVE_READ) == flag || 1158 + parent->live & REG_LIVE_READ64) 1158 1159 /* The parentage chain never changes and 1159 1160 * this parent was already marked as LIVE_READ. 1160 1161 * There is no need to keep walking the chain again and 1161 1162 * keep re-marking all parents as LIVE_READ. 
1162 1163 * This case happens when the same register is read 1163 1164 * multiple times without writes into it in-between. 1165 + * Also, if parent has the stronger REG_LIVE_READ64 set, 1166 + * then no need to set the weak REG_LIVE_READ32. 1164 1167 */ 1165 1168 break; 1166 1169 /* ... then we depend on parent's value */ 1167 - parent->live |= REG_LIVE_READ; 1170 + parent->live |= flag; 1171 + /* REG_LIVE_READ64 overrides REG_LIVE_READ32. */ 1172 + if (flag == REG_LIVE_READ64) 1173 + parent->live &= ~REG_LIVE_READ32; 1168 1174 state = parent; 1169 1175 parent = state->parent; 1170 1176 writes = true; ··· 1185 1173 return 0; 1186 1174 } 1187 1175 1176 + /* This function is supposed to be used by the following 32-bit optimization 1177 + * code only. It returns TRUE if the source or destination register operates 1178 + * on 64-bit, otherwise return FALSE. 1179 + */ 1180 + static bool is_reg64(struct bpf_verifier_env *env, struct bpf_insn *insn, 1181 + u32 regno, struct bpf_reg_state *reg, enum reg_arg_type t) 1182 + { 1183 + u8 code, class, op; 1184 + 1185 + code = insn->code; 1186 + class = BPF_CLASS(code); 1187 + op = BPF_OP(code); 1188 + if (class == BPF_JMP) { 1189 + /* BPF_EXIT for "main" will reach here. Return TRUE 1190 + * conservatively. 1191 + */ 1192 + if (op == BPF_EXIT) 1193 + return true; 1194 + if (op == BPF_CALL) { 1195 + /* BPF to BPF call will reach here because of marking 1196 + * caller saved clobber with DST_OP_NO_MARK for which we 1197 + * don't care the register def because they are anyway 1198 + * marked as NOT_INIT already. 1199 + */ 1200 + if (insn->src_reg == BPF_PSEUDO_CALL) 1201 + return false; 1202 + /* Helper call will reach here because of arg type 1203 + * check, conservatively return TRUE. 1204 + */ 1205 + if (t == SRC_OP) 1206 + return true; 1207 + 1208 + return false; 1209 + } 1210 + } 1211 + 1212 + if (class == BPF_ALU64 || class == BPF_JMP || 1213 + /* BPF_END always use BPF_ALU class. 
*/ 1214 + (class == BPF_ALU && op == BPF_END && insn->imm == 64)) 1215 + return true; 1216 + 1217 + if (class == BPF_ALU || class == BPF_JMP32) 1218 + return false; 1219 + 1220 + if (class == BPF_LDX) { 1221 + if (t != SRC_OP) 1222 + return BPF_SIZE(code) == BPF_DW; 1223 + /* LDX source must be ptr. */ 1224 + return true; 1225 + } 1226 + 1227 + if (class == BPF_STX) { 1228 + if (reg->type != SCALAR_VALUE) 1229 + return true; 1230 + return BPF_SIZE(code) == BPF_DW; 1231 + } 1232 + 1233 + if (class == BPF_LD) { 1234 + u8 mode = BPF_MODE(code); 1235 + 1236 + /* LD_IMM64 */ 1237 + if (mode == BPF_IMM) 1238 + return true; 1239 + 1240 + /* Both LD_IND and LD_ABS return 32-bit data. */ 1241 + if (t != SRC_OP) 1242 + return false; 1243 + 1244 + /* Implicit ctx ptr. */ 1245 + if (regno == BPF_REG_6) 1246 + return true; 1247 + 1248 + /* Explicit source could be any width. */ 1249 + return true; 1250 + } 1251 + 1252 + if (class == BPF_ST) 1253 + /* The only source register for BPF_ST is a ptr. */ 1254 + return true; 1255 + 1256 + /* Conservatively return true at default. */ 1257 + return true; 1258 + } 1259 + 1260 + /* Return TRUE if INSN doesn't have explicit value define. */ 1261 + static bool insn_no_def(struct bpf_insn *insn) 1262 + { 1263 + u8 class = BPF_CLASS(insn->code); 1264 + 1265 + return (class == BPF_JMP || class == BPF_JMP32 || 1266 + class == BPF_STX || class == BPF_ST); 1267 + } 1268 + 1269 + /* Return TRUE if INSN has defined any 32-bit value explicitly. 
*/ 1270 + static bool insn_has_def32(struct bpf_verifier_env *env, struct bpf_insn *insn) 1271 + { 1272 + if (insn_no_def(insn)) 1273 + return false; 1274 + 1275 + return !is_reg64(env, insn, insn->dst_reg, NULL, DST_OP); 1276 + } 1277 + 1278 + static void mark_insn_zext(struct bpf_verifier_env *env, 1279 + struct bpf_reg_state *reg) 1280 + { 1281 + s32 def_idx = reg->subreg_def; 1282 + 1283 + if (def_idx == DEF_NOT_SUBREG) 1284 + return; 1285 + 1286 + env->insn_aux_data[def_idx - 1].zext_dst = true; 1287 + /* The dst will be zero extended, so won't be sub-register anymore. */ 1288 + reg->subreg_def = DEF_NOT_SUBREG; 1289 + } 1290 + 1188 1291 static int check_reg_arg(struct bpf_verifier_env *env, u32 regno, 1189 1292 enum reg_arg_type t) 1190 1293 { 1191 1294 struct bpf_verifier_state *vstate = env->cur_state; 1192 1295 struct bpf_func_state *state = vstate->frame[vstate->curframe]; 1296 + struct bpf_insn *insn = env->prog->insnsi + env->insn_idx; 1193 1297 struct bpf_reg_state *reg, *regs = state->regs; 1298 + bool rw64; 1194 1299 1195 1300 if (regno >= MAX_BPF_REG) { 1196 1301 verbose(env, "R%d is invalid\n", regno); ··· 1315 1186 } 1316 1187 1317 1188 reg = &regs[regno]; 1189 + rw64 = is_reg64(env, insn, regno, reg, t); 1318 1190 if (t == SRC_OP) { 1319 1191 /* check whether register used as source operand can be read */ 1320 1192 if (reg->type == NOT_INIT) { ··· 1326 1196 if (regno == BPF_REG_FP) 1327 1197 return 0; 1328 1198 1329 - return mark_reg_read(env, reg, reg->parent); 1199 + if (rw64) 1200 + mark_insn_zext(env, reg); 1201 + 1202 + return mark_reg_read(env, reg, reg->parent, 1203 + rw64 ? REG_LIVE_READ64 : REG_LIVE_READ32); 1330 1204 } else { 1331 1205 /* check whether register used as dest operand can be written to */ 1332 1206 if (regno == BPF_REG_FP) { ··· 1338 1204 return -EACCES; 1339 1205 } 1340 1206 reg->live |= REG_LIVE_WRITTEN; 1207 + reg->subreg_def = rw64 ? 
DEF_NOT_SUBREG : env->insn_idx + 1; 1341 1208 if (t == DST_OP) 1342 1209 mark_reg_unknown(env, regs, regno); 1343 1210 } ··· 1518 1383 state->regs[value_regno].live |= REG_LIVE_WRITTEN; 1519 1384 } 1520 1385 mark_reg_read(env, &reg_state->stack[spi].spilled_ptr, 1521 - reg_state->stack[spi].spilled_ptr.parent); 1386 + reg_state->stack[spi].spilled_ptr.parent, 1387 + REG_LIVE_READ64); 1522 1388 return 0; 1523 1389 } else { 1524 1390 int zeros = 0; ··· 1536 1400 return -EACCES; 1537 1401 } 1538 1402 mark_reg_read(env, &reg_state->stack[spi].spilled_ptr, 1539 - reg_state->stack[spi].spilled_ptr.parent); 1403 + reg_state->stack[spi].spilled_ptr.parent, 1404 + REG_LIVE_READ64); 1540 1405 if (value_regno >= 0) { 1541 1406 if (zeros == size) { 1542 1407 /* any size read into register is zero extended, ··· 2246 2109 value_regno); 2247 2110 if (reg_type_may_be_null(reg_type)) 2248 2111 regs[value_regno].id = ++env->id_gen; 2112 + /* A load of ctx field could have different 2113 + * actual load size with the one encoded in the 2114 + * insn. When the dst is PTR, it is for sure not 2115 + * a sub-register. 2116 + */ 2117 + regs[value_regno].subreg_def = DEF_NOT_SUBREG; 2249 2118 } 2250 2119 regs[value_regno].type = reg_type; 2251 2120 } ··· 2511 2368 * the whole slot to be marked as 'read' 2512 2369 */ 2513 2370 mark_reg_read(env, &state->stack[spi].spilled_ptr, 2514 - state->stack[spi].spilled_ptr.parent); 2371 + state->stack[spi].spilled_ptr.parent, 2372 + REG_LIVE_READ64); 2515 2373 } 2516 2374 return update_stack_depth(env, state, min_off); 2517 2375 } ··· 3476 3332 check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK); 3477 3333 } 3478 3334 3335 + /* helper call returns 64-bit value. 
*/ 3336 + regs[BPF_REG_0].subreg_def = DEF_NOT_SUBREG; 3337 + 3479 3338 /* update return register (already marked as written above) */ 3480 3339 if (fn->ret_type == RET_INTEGER) { 3481 3340 /* sets type to SCALAR_VALUE */ ··· 4410 4263 */ 4411 4264 *dst_reg = *src_reg; 4412 4265 dst_reg->live |= REG_LIVE_WRITTEN; 4266 + dst_reg->subreg_def = DEF_NOT_SUBREG; 4413 4267 } else { 4414 4268 /* R1 = (u32) R2 */ 4415 4269 if (is_pointer_value(env, insn->src_reg)) { ··· 4421 4273 } else if (src_reg->type == SCALAR_VALUE) { 4422 4274 *dst_reg = *src_reg; 4423 4275 dst_reg->live |= REG_LIVE_WRITTEN; 4276 + dst_reg->subreg_def = env->insn_idx + 1; 4424 4277 } else { 4425 4278 mark_reg_unknown(env, regs, 4426 4279 insn->dst_reg); ··· 5501 5352 * Already marked as written above. 5502 5353 */ 5503 5354 mark_reg_unknown(env, regs, BPF_REG_0); 5355 + /* ld_abs load up to 32-bit skb data. */ 5356 + regs[BPF_REG_0].subreg_def = env->insn_idx + 1; 5504 5357 return 0; 5505 5358 } 5506 5359 5507 5360 static int check_return_code(struct bpf_verifier_env *env) 5508 5361 { 5362 + struct tnum enforce_attach_type_range = tnum_unknown; 5509 5363 struct bpf_reg_state *reg; 5510 5364 struct tnum range = tnum_range(0, 1); 5511 5365 5512 5366 switch (env->prog->type) { 5513 5367 case BPF_PROG_TYPE_CGROUP_SKB: 5368 + if (env->prog->expected_attach_type == BPF_CGROUP_INET_EGRESS) { 5369 + range = tnum_range(0, 3); 5370 + enforce_attach_type_range = tnum_range(2, 3); 5371 + } 5514 5372 case BPF_PROG_TYPE_CGROUP_SOCK: 5515 5373 case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: 5516 5374 case BPF_PROG_TYPE_SOCK_OPS: ··· 5536 5380 } 5537 5381 5538 5382 if (!tnum_in(range, reg->var_off)) { 5383 + char tn_buf[48]; 5384 + 5539 5385 verbose(env, "At program exit the register R0 "); 5540 5386 if (!tnum_is_unknown(reg->var_off)) { 5541 - char tn_buf[48]; 5542 - 5543 5387 tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); 5544 5388 verbose(env, "has value %s", tn_buf); 5545 5389 } else { 5546 5390 verbose(env, "has unknown 
scalar value"); 5547 5391 } 5548 - verbose(env, " should have been 0 or 1\n"); 5392 + tnum_strn(tn_buf, sizeof(tn_buf), range); 5393 + verbose(env, " should have been %s\n", tn_buf); 5549 5394 return -EINVAL; 5550 5395 } 5396 + 5397 + if (!tnum_is_unknown(enforce_attach_type_range) && 5398 + tnum_in(enforce_attach_type_range, reg->var_off)) 5399 + env->prog->enforce_expected_attach_type = 1; 5551 5400 return 0; 5552 5401 } 5553 5402 ··· 5596 5435 BRANCH = 2, 5597 5436 }; 5598 5437 5599 - #define STATE_LIST_MARK ((struct bpf_verifier_state_list *) -1L) 5438 + static u32 state_htab_size(struct bpf_verifier_env *env) 5439 + { 5440 + return env->prog->len; 5441 + } 5442 + 5443 + static struct bpf_verifier_state_list **explored_state( 5444 + struct bpf_verifier_env *env, 5445 + int idx) 5446 + { 5447 + struct bpf_verifier_state *cur = env->cur_state; 5448 + struct bpf_func_state *state = cur->frame[cur->curframe]; 5449 + 5450 + return &env->explored_states[(idx ^ state->callsite) % state_htab_size(env)]; 5451 + } 5452 + 5453 + static void init_explored_state(struct bpf_verifier_env *env, int idx) 5454 + { 5455 + env->insn_aux_data[idx].prune_point = true; 5456 + } 5600 5457 5601 5458 /* t, w, e - match pseudo-code above: 5602 5459 * t - index of current instruction ··· 5640 5461 5641 5462 if (e == BRANCH) 5642 5463 /* mark branch target for state pruning */ 5643 - env->explored_states[w] = STATE_LIST_MARK; 5464 + init_explored_state(env, w); 5644 5465 5645 5466 if (insn_state[w] == 0) { 5646 5467 /* tree-edge */ ··· 5708 5529 else if (ret < 0) 5709 5530 goto err_free; 5710 5531 if (t + 1 < insn_cnt) 5711 - env->explored_states[t + 1] = STATE_LIST_MARK; 5532 + init_explored_state(env, t + 1); 5712 5533 if (insns[t].src_reg == BPF_PSEUDO_CALL) { 5713 - env->explored_states[t] = STATE_LIST_MARK; 5534 + init_explored_state(env, t); 5714 5535 ret = push_insn(t, t + insns[t].imm + 1, BRANCH, env); 5715 5536 if (ret == 1) 5716 5537 goto peek_stack; ··· 5733 5554 * after every 
call and jump 5734 5555 */ 5735 5556 if (t + 1 < insn_cnt) 5736 - env->explored_states[t + 1] = STATE_LIST_MARK; 5557 + init_explored_state(env, t + 1); 5737 5558 } else { 5738 5559 /* conditional jump with two edges */ 5739 - env->explored_states[t] = STATE_LIST_MARK; 5560 + init_explored_state(env, t); 5740 5561 ret = push_insn(t, t + 1, FALLTHROUGH, env); 5741 5562 if (ret == 1) 5742 5563 goto peek_stack; ··· 6184 6005 struct bpf_verifier_state_list *sl; 6185 6006 int i; 6186 6007 6187 - sl = env->explored_states[insn]; 6188 - if (!sl) 6189 - return; 6190 - 6191 - while (sl != STATE_LIST_MARK) { 6192 - if (sl->state.curframe != cur->curframe) 6008 + sl = *explored_state(env, insn); 6009 + while (sl) { 6010 + if (sl->state.insn_idx != insn || 6011 + sl->state.curframe != cur->curframe) 6193 6012 goto next; 6194 6013 for (i = 0; i <= cur->curframe; i++) 6195 6014 if (sl->state.frame[i]->callsite != cur->frame[i]->callsite) ··· 6469 6292 return true; 6470 6293 } 6471 6294 6295 + /* Return 0 if no propagation happened. Return negative error code if error 6296 + * happened. Otherwise, return the propagated bit. 6297 + */ 6472 6298 static int propagate_liveness_reg(struct bpf_verifier_env *env, 6473 6299 struct bpf_reg_state *reg, 6474 6300 struct bpf_reg_state *parent_reg) 6475 6301 { 6302 + u8 parent_flag = parent_reg->live & REG_LIVE_READ; 6303 + u8 flag = reg->live & REG_LIVE_READ; 6476 6304 int err; 6477 6305 6478 - if (parent_reg->live & REG_LIVE_READ || !(reg->live & REG_LIVE_READ)) 6306 + /* When comes here, read flags of PARENT_REG or REG could be any of 6307 + * REG_LIVE_READ64, REG_LIVE_READ32, REG_LIVE_NONE. There is no need 6308 + * of propagation if PARENT_REG has strongest REG_LIVE_READ64. 6309 + */ 6310 + if (parent_flag == REG_LIVE_READ64 || 6311 + /* Or if there is no read flag from REG. */ 6312 + !flag || 6313 + /* Or if the read flag from REG is the same as PARENT_REG. 
*/ 6314 + parent_flag == flag) 6479 6315 return 0; 6480 6316 6481 - err = mark_reg_read(env, reg, parent_reg); 6317 + err = mark_reg_read(env, reg, parent_reg, flag); 6482 6318 if (err) 6483 6319 return err; 6484 6320 6485 - return 0; 6321 + return flag; 6486 6322 } 6487 6323 6488 6324 /* A write screens off any subsequent reads; but write marks come from the ··· 6529 6339 for (i = frame < vstate->curframe ? BPF_REG_6 : 0; i < BPF_REG_FP; i++) { 6530 6340 err = propagate_liveness_reg(env, &state_reg[i], 6531 6341 &parent_reg[i]); 6532 - if (err) 6342 + if (err < 0) 6533 6343 return err; 6344 + if (err == REG_LIVE_READ64) 6345 + mark_insn_zext(env, &parent_reg[i]); 6534 6346 } 6535 6347 6536 6348 /* Propagate stack slots. */ ··· 6542 6350 state_reg = &state->stack[i].spilled_ptr; 6543 6351 err = propagate_liveness_reg(env, state_reg, 6544 6352 parent_reg); 6545 - if (err) 6353 + if (err < 0) 6546 6354 return err; 6547 6355 } 6548 6356 } 6549 - return err; 6357 + return 0; 6550 6358 } 6551 6359 6552 6360 static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) ··· 6556 6364 struct bpf_verifier_state *cur = env->cur_state, *new; 6557 6365 int i, j, err, states_cnt = 0; 6558 6366 6559 - pprev = &env->explored_states[insn_idx]; 6560 - sl = *pprev; 6561 - 6562 - if (!sl) 6367 + if (!env->insn_aux_data[insn_idx].prune_point) 6563 6368 /* this 'insn_idx' instruction wasn't marked, so we will not 6564 6369 * be doing state search here 6565 6370 */ 6566 6371 return 0; 6567 6372 6373 + pprev = explored_state(env, insn_idx); 6374 + sl = *pprev; 6375 + 6568 6376 clean_live_states(env, insn_idx, cur); 6569 6377 6570 - while (sl != STATE_LIST_MARK) { 6378 + while (sl) { 6379 + states_cnt++; 6380 + if (sl->state.insn_idx != insn_idx) 6381 + goto next; 6571 6382 if (states_equal(env, &sl->state, cur)) { 6572 6383 sl->hit_cnt++; 6573 6384 /* reached equivalent register/stack state, ··· 6588 6393 return err; 6589 6394 return 1; 6590 6395 } 6591 - states_cnt++; 6592 
6396 sl->miss_cnt++; 6593 6397 /* heuristic to determine whether this state is beneficial 6594 6398 * to keep checking from state equivalence point of view. ··· 6614 6420 sl = *pprev; 6615 6421 continue; 6616 6422 } 6423 + next: 6617 6424 pprev = &sl->next; 6618 6425 sl = *pprev; 6619 6426 } ··· 6646 6451 kfree(new_sl); 6647 6452 return err; 6648 6453 } 6649 - new_sl->next = env->explored_states[insn_idx]; 6650 - env->explored_states[insn_idx] = new_sl; 6454 + new->insn_idx = insn_idx; 6455 + new_sl->next = *explored_state(env, insn_idx); 6456 + *explored_state(env, insn_idx) = new_sl; 6651 6457 /* connect new state to parentage chain. Current frame needs all 6652 6458 * registers connected. Only r6 - r9 of the callers are alive (pushed 6653 6459 * to the stack implicitly by JITs) so in callers' frames connect just ··· 7326 7130 * insni[off, off + cnt). Adjust corresponding insn_aux_data by copying 7327 7131 * [0, off) and [off, end) to new locations, so the patched range stays zero 7328 7132 */ 7329 - static int adjust_insn_aux_data(struct bpf_verifier_env *env, u32 prog_len, 7330 - u32 off, u32 cnt) 7133 + static int adjust_insn_aux_data(struct bpf_verifier_env *env, 7134 + struct bpf_prog *new_prog, u32 off, u32 cnt) 7331 7135 { 7332 7136 struct bpf_insn_aux_data *new_data, *old_data = env->insn_aux_data; 7137 + struct bpf_insn *insn = new_prog->insnsi; 7138 + u32 prog_len; 7333 7139 int i; 7140 + 7141 + /* aux info at OFF always needs adjustment, no matter fast path 7142 + * (cnt == 1) is taken or not. There is no guarantee INSN at OFF is the 7143 + * original insn at old prog. 
7144 + */ 7145 + old_data[off].zext_dst = insn_has_def32(env, insn + off + cnt - 1); 7334 7146 7335 7147 if (cnt == 1) 7336 7148 return 0; 7149 + prog_len = new_prog->len; 7337 7150 new_data = vzalloc(array_size(prog_len, 7338 7151 sizeof(struct bpf_insn_aux_data))); 7339 7152 if (!new_data) ··· 7350 7145 memcpy(new_data, old_data, sizeof(struct bpf_insn_aux_data) * off); 7351 7146 memcpy(new_data + off + cnt - 1, old_data + off, 7352 7147 sizeof(struct bpf_insn_aux_data) * (prog_len - off - cnt + 1)); 7353 - for (i = off; i < off + cnt - 1; i++) 7148 + for (i = off; i < off + cnt - 1; i++) { 7354 7149 new_data[i].seen = true; 7150 + new_data[i].zext_dst = insn_has_def32(env, insn + i); 7151 + } 7355 7152 env->insn_aux_data = new_data; 7356 7153 vfree(old_data); 7357 7154 return 0; ··· 7386 7179 env->insn_aux_data[off].orig_idx); 7387 7180 return NULL; 7388 7181 } 7389 - if (adjust_insn_aux_data(env, new_prog->len, off, len)) 7182 + if (adjust_insn_aux_data(env, new_prog, off, len)) 7390 7183 return NULL; 7391 7184 adjust_subprog_starts(env, off, len); 7392 7185 return new_prog; ··· 7645 7438 return err; 7646 7439 insn_cnt--; 7647 7440 i--; 7441 + } 7442 + 7443 + return 0; 7444 + } 7445 + 7446 + static int opt_subreg_zext_lo32_rnd_hi32(struct bpf_verifier_env *env, 7447 + const union bpf_attr *attr) 7448 + { 7449 + struct bpf_insn *patch, zext_patch[2], rnd_hi32_patch[4]; 7450 + struct bpf_insn_aux_data *aux = env->insn_aux_data; 7451 + int i, patch_len, delta = 0, len = env->prog->len; 7452 + struct bpf_insn *insns = env->prog->insnsi; 7453 + struct bpf_prog *new_prog; 7454 + bool rnd_hi32; 7455 + 7456 + rnd_hi32 = attr->prog_flags & BPF_F_TEST_RND_HI32; 7457 + zext_patch[1] = BPF_ZEXT_REG(0); 7458 + rnd_hi32_patch[1] = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, 0); 7459 + rnd_hi32_patch[2] = BPF_ALU64_IMM(BPF_LSH, BPF_REG_AX, 32); 7460 + rnd_hi32_patch[3] = BPF_ALU64_REG(BPF_OR, 0, BPF_REG_AX); 7461 + for (i = 0; i < len; i++) { 7462 + int adj_idx = i + delta; 7463 + 
struct bpf_insn insn; 7464 + 7465 + insn = insns[adj_idx]; 7466 + if (!aux[adj_idx].zext_dst) { 7467 + u8 code, class; 7468 + u32 imm_rnd; 7469 + 7470 + if (!rnd_hi32) 7471 + continue; 7472 + 7473 + code = insn.code; 7474 + class = BPF_CLASS(code); 7475 + if (insn_no_def(&insn)) 7476 + continue; 7477 + 7478 + /* NOTE: arg "reg" (the fourth one) is only used for 7479 + * BPF_STX which has been ruled out in above 7480 + * check, it is safe to pass NULL here. 7481 + */ 7482 + if (is_reg64(env, &insn, insn.dst_reg, NULL, DST_OP)) { 7483 + if (class == BPF_LD && 7484 + BPF_MODE(code) == BPF_IMM) 7485 + i++; 7486 + continue; 7487 + } 7488 + 7489 + /* ctx load could be transformed into wider load. */ 7490 + if (class == BPF_LDX && 7491 + aux[adj_idx].ptr_type == PTR_TO_CTX) 7492 + continue; 7493 + 7494 + imm_rnd = get_random_int(); 7495 + rnd_hi32_patch[0] = insn; 7496 + rnd_hi32_patch[1].imm = imm_rnd; 7497 + rnd_hi32_patch[3].dst_reg = insn.dst_reg; 7498 + patch = rnd_hi32_patch; 7499 + patch_len = 4; 7500 + goto apply_patch_buffer; 7501 + } 7502 + 7503 + if (!bpf_jit_needs_zext()) 7504 + continue; 7505 + 7506 + zext_patch[0] = insn; 7507 + zext_patch[1].dst_reg = insn.dst_reg; 7508 + zext_patch[1].src_reg = insn.dst_reg; 7509 + patch = zext_patch; 7510 + patch_len = 2; 7511 + apply_patch_buffer: 7512 + new_prog = bpf_patch_insn_data(env, adj_idx, patch, patch_len); 7513 + if (!new_prog) 7514 + return -ENOMEM; 7515 + env->prog = new_prog; 7516 + insns = new_prog->insnsi; 7517 + aux = env->insn_aux_data; 7518 + delta += patch_len - 1; 7648 7519 } 7649 7520 7650 7521 return 0; ··· 8415 8130 if (!env->explored_states) 8416 8131 return; 8417 8132 8418 - for (i = 0; i < env->prog->len; i++) { 8133 + for (i = 0; i < state_htab_size(env); i++) { 8419 8134 sl = env->explored_states[i]; 8420 8135 8421 - if (sl) 8422 - while (sl != STATE_LIST_MARK) { 8423 - sln = sl->next; 8424 - free_verifier_state(&sl->state, false); 8425 - kfree(sl); 8426 - sl = sln; 8427 - } 8136 + while (sl) 
{ 8137 + sln = sl->next; 8138 + free_verifier_state(&sl->state, false); 8139 + kfree(sl); 8140 + sl = sln; 8141 + } 8428 8142 } 8429 8143 8430 8144 kvfree(env->explored_states); ··· 8523 8239 goto skip_full_check; 8524 8240 } 8525 8241 8526 - env->explored_states = kvcalloc(env->prog->len, 8242 + env->explored_states = kvcalloc(state_htab_size(env), 8527 8243 sizeof(struct bpf_verifier_state_list *), 8528 8244 GFP_USER); 8529 8245 ret = -ENOMEM; ··· 8577 8293 8578 8294 if (ret == 0) 8579 8295 ret = fixup_bpf_calls(env); 8296 + 8297 + /* do 32-bit optimization after insn patching has done so those patched 8298 + * insns could be handled correctly. 8299 + */ 8300 + if (ret == 0 && !bpf_prog_is_dev_bound(env->prog->aux)) { 8301 + ret = opt_subreg_zext_lo32_rnd_hi32(env, attr); 8302 + env->prog->aux->verifier_zext = bpf_jit_needs_zext() ? !ret 8303 + : false; 8304 + } 8580 8305 8581 8306 if (ret == 0) 8582 8307 ret = fixup_call_args(env);
+4 -6
kernel/bpf/xskmap.c
··· 37 37 38 38 cost = (u64)m->map.max_entries * sizeof(struct xdp_sock *); 39 39 cost += sizeof(struct list_head) * num_possible_cpus(); 40 - if (cost >= U32_MAX - PAGE_SIZE) 41 - goto free_m; 42 - 43 - m->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; 44 40 45 41 /* Notice returns -EPERM on if map size is larger than memlock limit */ 46 - err = bpf_map_precharge_memlock(m->map.pages); 42 + err = bpf_map_charge_init(&m->map.memory, cost); 47 43 if (err) 48 44 goto free_m; 49 45 ··· 47 51 48 52 m->flush_list = alloc_percpu(struct list_head); 49 53 if (!m->flush_list) 50 - goto free_m; 54 + goto free_charge; 51 55 52 56 for_each_possible_cpu(cpu) 53 57 INIT_LIST_HEAD(per_cpu_ptr(m->flush_list, cpu)); ··· 61 65 62 66 free_percpu: 63 67 free_percpu(m->flush_list); 68 + free_charge: 69 + bpf_map_charge_finish(&m->map.memory); 64 70 free_m: 65 71 kfree(m); 66 72 return ERR_PTR(err);
+8 -3
kernel/cgroup/cgroup.c
··· 4955 4955 if (cgrp->kn) 4956 4956 RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, 4957 4957 NULL); 4958 - 4959 - cgroup_bpf_put(cgrp); 4960 4958 } 4961 4959 4962 4960 mutex_unlock(&cgroup_mutex); ··· 5479 5481 spin_unlock_irq(&css_set_lock); 5480 5482 5481 5483 cgroup1_check_for_release(parent); 5484 + 5485 + cgroup_bpf_offline(cgrp); 5482 5486 5483 5487 /* put the base reference */ 5484 5488 percpu_ref_kill(&cgrp->self.refcnt); ··· 6221 6221 * Don't use cgroup_get_live(). 6222 6222 */ 6223 6223 cgroup_get(sock_cgroup_ptr(skcd)); 6224 + cgroup_bpf_get(sock_cgroup_ptr(skcd)); 6224 6225 return; 6225 6226 } 6226 6227 ··· 6233 6232 cset = task_css_set(current); 6234 6233 if (likely(cgroup_tryget(cset->dfl_cgrp))) { 6235 6234 skcd->val = (unsigned long)cset->dfl_cgrp; 6235 + cgroup_bpf_get(cset->dfl_cgrp); 6236 6236 break; 6237 6237 } 6238 6238 cpu_relax(); ··· 6244 6242 6245 6243 void cgroup_sk_free(struct sock_cgroup_data *skcd) 6246 6244 { 6247 - cgroup_put(sock_cgroup_ptr(skcd)); 6245 + struct cgroup *cgrp = sock_cgroup_ptr(skcd); 6246 + 6247 + cgroup_bpf_put(cgrp); 6248 + cgroup_put(cgrp); 6248 6249 } 6249 6250 6250 6251 #endif /* CONFIG_SOCK_CGROUP_DATA */
+88 -8
kernel/trace/bpf_trace.c
··· 19 19 #include "trace_probe.h" 20 20 #include "trace.h" 21 21 22 + #define bpf_event_rcu_dereference(p) \ 23 + rcu_dereference_protected(p, lockdep_is_held(&bpf_event_mutex)) 24 + 22 25 #ifdef CONFIG_MODULES 23 26 struct bpf_trace_module { 24 27 struct module *module; ··· 570 567 .arg3_type = ARG_ANYTHING, 571 568 }; 572 569 570 + struct send_signal_irq_work { 571 + struct irq_work irq_work; 572 + struct task_struct *task; 573 + u32 sig; 574 + }; 575 + 576 + static DEFINE_PER_CPU(struct send_signal_irq_work, send_signal_work); 577 + 578 + static void do_bpf_send_signal(struct irq_work *entry) 579 + { 580 + struct send_signal_irq_work *work; 581 + 582 + work = container_of(entry, struct send_signal_irq_work, irq_work); 583 + group_send_sig_info(work->sig, SEND_SIG_PRIV, work->task, PIDTYPE_TGID); 584 + } 585 + 586 + BPF_CALL_1(bpf_send_signal, u32, sig) 587 + { 588 + struct send_signal_irq_work *work = NULL; 589 + 590 + /* Similar to bpf_probe_write_user, task needs to be 591 + * in a sound condition and kernel memory access be 592 + * permitted in order to send signal to the current 593 + * task. 594 + */ 595 + if (unlikely(current->flags & (PF_KTHREAD | PF_EXITING))) 596 + return -EPERM; 597 + if (unlikely(uaccess_kernel())) 598 + return -EPERM; 599 + if (unlikely(!nmi_uaccess_okay())) 600 + return -EPERM; 601 + 602 + if (in_nmi()) { 603 + /* Do an early check on signal validity. Otherwise, 604 + * the error is lost in deferred irq_work. 605 + */ 606 + if (unlikely(!valid_signal(sig))) 607 + return -EINVAL; 608 + 609 + work = this_cpu_ptr(&send_signal_work); 610 + if (work->irq_work.flags & IRQ_WORK_BUSY) 611 + return -EBUSY; 612 + 613 + /* Add the current task, which is the target of sending signal, 614 + * to the irq_work. The current task may change when queued 615 + * irq works get executed. 
616 + */ 617 + work->task = current; 618 + work->sig = sig; 619 + irq_work_queue(&work->irq_work); 620 + return 0; 621 + } 622 + 623 + return group_send_sig_info(sig, SEND_SIG_PRIV, current, PIDTYPE_TGID); 624 + } 625 + 626 + static const struct bpf_func_proto bpf_send_signal_proto = { 627 + .func = bpf_send_signal, 628 + .gpl_only = false, 629 + .ret_type = RET_INTEGER, 630 + .arg1_type = ARG_ANYTHING, 631 + }; 632 + 573 633 static const struct bpf_func_proto * 574 634 tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) 575 635 { ··· 683 617 case BPF_FUNC_get_current_cgroup_id: 684 618 return &bpf_get_current_cgroup_id_proto; 685 619 #endif 620 + case BPF_FUNC_send_signal: 621 + return &bpf_send_signal_proto; 686 622 default: 687 623 return NULL; 688 624 } ··· 1102 1034 int perf_event_attach_bpf_prog(struct perf_event *event, 1103 1035 struct bpf_prog *prog) 1104 1036 { 1105 - struct bpf_prog_array __rcu *old_array; 1037 + struct bpf_prog_array *old_array; 1106 1038 struct bpf_prog_array *new_array; 1107 1039 int ret = -EEXIST; 1108 1040 ··· 1120 1052 if (event->prog) 1121 1053 goto unlock; 1122 1054 1123 - old_array = event->tp_event->prog_array; 1055 + old_array = bpf_event_rcu_dereference(event->tp_event->prog_array); 1124 1056 if (old_array && 1125 1057 bpf_prog_array_length(old_array) >= BPF_TRACE_MAX_PROGS) { 1126 1058 ret = -E2BIG; ··· 1143 1075 1144 1076 void perf_event_detach_bpf_prog(struct perf_event *event) 1145 1077 { 1146 - struct bpf_prog_array __rcu *old_array; 1078 + struct bpf_prog_array *old_array; 1147 1079 struct bpf_prog_array *new_array; 1148 1080 int ret; 1149 1081 ··· 1152 1084 if (!event->prog) 1153 1085 goto unlock; 1154 1086 1155 - old_array = event->tp_event->prog_array; 1087 + old_array = bpf_event_rcu_dereference(event->tp_event->prog_array); 1156 1088 ret = bpf_prog_array_copy(old_array, event->prog, NULL, &new_array); 1157 1089 if (ret == -ENOENT) 1158 1090 goto unlock; ··· 1174 1106 { 1175 1107 struct 
perf_event_query_bpf __user *uquery = info; 1176 1108 struct perf_event_query_bpf query = {}; 1109 + struct bpf_prog_array *progs; 1177 1110 u32 *ids, prog_cnt, ids_len; 1178 1111 int ret; 1179 1112 ··· 1199 1130 */ 1200 1131 1201 1132 mutex_lock(&bpf_event_mutex); 1202 - ret = bpf_prog_array_copy_info(event->tp_event->prog_array, 1203 - ids, 1204 - ids_len, 1205 - &prog_cnt); 1133 + progs = bpf_event_rcu_dereference(event->tp_event->prog_array); 1134 + ret = bpf_prog_array_copy_info(progs, ids, ids_len, &prog_cnt); 1206 1135 mutex_unlock(&bpf_event_mutex); 1207 1136 1208 1137 if (copy_to_user(&uquery->prog_cnt, &prog_cnt, sizeof(prog_cnt)) || ··· 1410 1343 return 0; 1411 1344 } 1412 1345 1346 + static int __init send_signal_irq_work_init(void) 1347 + { 1348 + int cpu; 1349 + struct send_signal_irq_work *work; 1350 + 1351 + for_each_possible_cpu(cpu) { 1352 + work = per_cpu_ptr(&send_signal_work, cpu); 1353 + init_irq_work(&work->irq_work, do_bpf_send_signal); 1354 + } 1355 + return 0; 1356 + } 1357 + 1413 1358 fs_initcall(bpf_event_init); 1359 + subsys_initcall(send_signal_irq_work_init); 1414 1360 #endif /* CONFIG_MODULES */
+10 -2
net/core/bpf_sk_storage.c
··· 627 627 unsigned int i; 628 628 u32 nbuckets; 629 629 u64 cost; 630 + int ret; 630 631 631 632 smap = kzalloc(sizeof(*smap), GFP_USER | __GFP_NOWARN); 632 633 if (!smap) ··· 636 635 637 636 smap->bucket_log = ilog2(roundup_pow_of_two(num_possible_cpus())); 638 637 nbuckets = 1U << smap->bucket_log; 638 + cost = sizeof(*smap->buckets) * nbuckets + sizeof(*smap); 639 + 640 + ret = bpf_map_charge_init(&smap->map.memory, cost); 641 + if (ret < 0) { 642 + kfree(smap); 643 + return ERR_PTR(ret); 644 + } 645 + 639 646 smap->buckets = kvcalloc(sizeof(*smap->buckets), nbuckets, 640 647 GFP_USER | __GFP_NOWARN); 641 648 if (!smap->buckets) { 649 + bpf_map_charge_finish(&smap->map.memory); 642 650 kfree(smap); 643 651 return ERR_PTR(-ENOMEM); 644 652 } 645 - cost = sizeof(*smap->buckets) * nbuckets + sizeof(*smap); 646 653 647 654 for (i = 0; i < nbuckets; i++) { 648 655 INIT_HLIST_HEAD(&smap->buckets[i].list); ··· 660 651 smap->elem_size = sizeof(struct bpf_sk_storage_elem) + attr->value_size; 661 652 smap->cache_idx = (unsigned int)atomic_inc_return(&cache_idx) % 662 653 BPF_SK_STORAGE_CACHE_SIZE; 663 - smap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; 664 654 665 655 return &smap->map; 666 656 }
+2 -7
net/core/sock_map.c
··· 44 44 45 45 /* Make sure page count doesn't overflow. */ 46 46 cost = (u64) stab->map.max_entries * sizeof(struct sock *); 47 - if (cost >= U32_MAX - PAGE_SIZE) { 48 - err = -EINVAL; 49 - goto free_stab; 50 - } 51 - 52 - stab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; 53 - err = bpf_map_precharge_memlock(stab->map.pages); 47 + err = bpf_map_charge_init(&stab->map.memory, cost); 54 48 if (err) 55 49 goto free_stab; 56 50 ··· 54 60 if (stab->sks) 55 61 return &stab->map; 56 62 err = -ENOMEM; 63 + bpf_map_charge_finish(&stab->map.memory); 57 64 free_stab: 58 65 kfree(stab); 59 66 return ERR_PTR(err);
+23 -11
net/ipv4/ip_output.c
··· 287 287 return ret; 288 288 } 289 289 290 - static int ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) 290 + static int __ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) 291 291 { 292 292 unsigned int mtu; 293 - int ret; 294 - 295 - ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb); 296 - if (ret) { 297 - kfree_skb(skb); 298 - return ret; 299 - } 300 293 301 294 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM) 302 295 /* Policy lookup after SNAT yielded a new policy */ ··· 308 315 return ip_finish_output2(net, sk, skb); 309 316 } 310 317 318 + static int ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) 319 + { 320 + int ret; 321 + 322 + ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb); 323 + switch (ret) { 324 + case NET_XMIT_SUCCESS: 325 + return __ip_finish_output(net, sk, skb); 326 + case NET_XMIT_CN: 327 + return __ip_finish_output(net, sk, skb) ? : ret; 328 + default: 329 + kfree_skb(skb); 330 + return ret; 331 + } 332 + } 333 + 311 334 static int ip_mc_finish_output(struct net *net, struct sock *sk, 312 335 struct sk_buff *skb) 313 336 { 314 337 int ret; 315 338 316 339 ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb); 317 - if (ret) { 340 + switch (ret) { 341 + case NET_XMIT_SUCCESS: 342 + return dev_loopback_xmit(net, sk, skb); 343 + case NET_XMIT_CN: 344 + return dev_loopback_xmit(net, sk, skb) ? : ret; 345 + default: 318 346 kfree_skb(skb); 319 347 return ret; 320 348 } 321 - 322 - return dev_loopback_xmit(net, sk, skb); 323 349 } 324 350 325 351 int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb)
+17 -9
net/ipv6/ip6_output.c
··· 128 128 return -EINVAL; 129 129 } 130 130 131 - static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) 131 + static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) 132 132 { 133 - int ret; 134 - 135 - ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb); 136 - if (ret) { 137 - kfree_skb(skb); 138 - return ret; 139 - } 140 - 141 133 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM) 142 134 /* Policy lookup after SNAT yielded a new policy */ 143 135 if (skb_dst(skb)->xfrm) { ··· 144 152 return ip6_fragment(net, sk, skb, ip6_finish_output2); 145 153 else 146 154 return ip6_finish_output2(net, sk, skb); 155 + } 156 + 157 + static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) 158 + { 159 + int ret; 160 + 161 + ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb); 162 + switch (ret) { 163 + case NET_XMIT_SUCCESS: 164 + return __ip6_finish_output(net, sk, skb); 165 + case NET_XMIT_CN: 166 + return __ip6_finish_output(net, sk, skb) ? : ret; 167 + default: 168 + kfree_skb(skb); 169 + return ret; 170 + } 147 171 } 148 172 149 173 int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
+1
samples/bpf/.gitignore
··· 1 1 cpustat 2 2 fds_example 3 3 hbm 4 + ibumad 4 5 lathist 5 6 lwt_len_hist 6 7 map_perf_test
-2
samples/bpf/Makefile
··· 26 26 hostprogs-y += test_overhead 27 27 hostprogs-y += test_cgrp2_array_pin 28 28 hostprogs-y += test_cgrp2_attach 29 - hostprogs-y += test_cgrp2_attach2 30 29 hostprogs-y += test_cgrp2_sock 31 30 hostprogs-y += test_cgrp2_sock2 32 31 hostprogs-y += xdp1 ··· 80 81 test_overhead-objs := bpf_load.o test_overhead_user.o 81 82 test_cgrp2_array_pin-objs := test_cgrp2_array_pin.o 82 83 test_cgrp2_attach-objs := test_cgrp2_attach.o 83 - test_cgrp2_attach2-objs := test_cgrp2_attach2.o $(CGROUP_HELPERS) 84 84 test_cgrp2_sock-objs := test_cgrp2_sock.o 85 85 test_cgrp2_sock2-objs := bpf_load.o test_cgrp2_sock2.o 86 86 xdp1-objs := xdp1_user.o
+4 -4
samples/bpf/bpf_load.c
··· 40 40 int prog_array_fd = -1; 41 41 42 42 struct bpf_map_data map_data[MAX_MAPS]; 43 - int map_data_count = 0; 43 + int map_data_count; 44 44 45 45 static int populate_prog_array(const char *event, int prog_fd) 46 46 { ··· 65 65 else 66 66 flags = O_WRONLY | O_APPEND; 67 67 68 - fd = open("/sys/kernel/debug/tracing/kprobe_events", flags); 68 + fd = open(DEBUGFS "kprobe_events", flags); 69 69 70 70 ret = write(fd, val, strlen(val)); 71 71 close(fd); ··· 490 490 491 491 /* Verify no newer features were requested */ 492 492 if (validate_zero) { 493 - addr = (unsigned char*) def + map_sz_copy; 494 - end = (unsigned char*) def + map_sz_elf; 493 + addr = (unsigned char *) def + map_sz_copy; 494 + end = (unsigned char *) def + map_sz_elf; 495 495 for (; addr < end; addr++) { 496 496 if (*addr != 0) { 497 497 free(sym);
+7 -3
samples/bpf/do_hbm_test.sh
··· 13 13 echo "egress or ingress bandwidht. It then uses iperf3 or netperf to create" 14 14 echo "loads. The output is the goodput in Mbps (unless -D was used)." 15 15 echo "" 16 - echo "USAGE: $name [out] [-b=<prog>|--bpf=<prog>] [-c=<cc>|--cc=<cc>] [-D]" 17 - echo " [-d=<delay>|--delay=<delay>] [--debug] [-E]" 16 + echo "USAGE: $name [out] [-b=<prog>|--bpf=<prog>] [-c=<cc>|--cc=<cc>]" 17 + echo " [-D] [-d=<delay>|--delay=<delay>] [--debug] [-E]" 18 18 echo " [-f=<#flows>|--flows=<#flows>] [-h] [-i=<id>|--id=<id >]" 19 - echo " [-l] [-N] [-p=<port>|--port=<port>] [-P]" 19 + echo " [-l] [-N] [--no_cn] [-p=<port>|--port=<port>] [-P]" 20 20 echo " [-q=<qdisc>] [-R] [-s=<server>|--server=<server]" 21 21 echo " [-S|--stats] -t=<time>|--time=<time>] [-w] [cubic|dctcp]" 22 22 echo " Where:" ··· 33 33 echo " -f or --flows number of concurrent flows (default=1)" 34 34 echo " -i or --id cgroup id (an integer, default is 1)" 35 35 echo " -N use netperf instead of iperf3" 36 + echo " --no_cn Do not return CN notifications" 36 37 echo " -l do not limit flows using loopback" 37 38 echo " -h Help" 38 39 echo " -p or --port iperf3 port (default is 5201)" ··· 115 114 ;; 116 115 -c=*|--cc=*) 117 116 cc="${i#*=}" 117 + ;; 118 + --no_cn) 119 + flags="$flags --no_cn" 118 120 ;; 119 121 --debug) 120 122 flags="$flags -d"
+48 -3
samples/bpf/hbm.c
··· 16 16 * -l Also limit flows doing loopback 17 17 * -n <#> To create cgroup \"/hbm#\" and attach prog 18 18 * Default is /hbm1 19 + * --no_cn Do not return cn notifications 19 20 * -r <rate> Rate limit in Mbps 20 21 * -s Get HBM stats (marked, dropped, etc.) 21 22 * -t <time> Exit after specified seconds (default is 0) ··· 43 42 44 43 #include <linux/bpf.h> 45 44 #include <bpf/bpf.h> 45 + #include <getopt.h> 46 46 47 47 #include "bpf_load.h" 48 48 #include "bpf_rlimit.h" ··· 61 59 bool loopback_flag; 62 60 bool debugFlag; 63 61 bool work_conserving_flag; 62 + bool no_cn_flag; 64 63 65 64 static void Usage(void); 66 65 static void read_trace_pipe2(void); ··· 188 185 qstats.rate = rate; 189 186 qstats.stats = stats_flag ? 1 : 0; 190 187 qstats.loopback = loopback_flag ? 1 : 0; 188 + qstats.no_cn = no_cn_flag ? 1 : 0; 191 189 if (bpf_map_update_elem(map_fd, &key, &qstats, BPF_ANY)) { 192 190 printf("ERROR: Could not update map element\n"); 193 191 goto err; ··· 316 312 double percent_pkts, percent_bytes; 317 313 char fname[100]; 318 314 FILE *fout; 315 + int k; 316 + static const char *returnValNames[] = { 317 + "DROP_PKT", 318 + "ALLOW_PKT", 319 + "DROP_PKT_CWR", 320 + "ALLOW_PKT_CWR" 321 + }; 322 + #define RET_VAL_COUNT 4 319 323 320 324 // Future support of ingress 321 325 // if (!outFlag) ··· 358 346 (qstats.bytes_total + 1); 359 347 fprintf(fout, "pkts_dropped_percent:%6.2f\n", percent_pkts); 360 348 fprintf(fout, "bytes_dropped_percent:%6.2f\n", percent_bytes); 349 + 350 + // ECN CE markings 351 + percent_pkts = (qstats.pkts_ecn_ce * 100.0) / 352 + (qstats.pkts_total + 1); 353 + fprintf(fout, "pkts_ecn_ce:%6.2f (%d)\n", percent_pkts, 354 + (int)qstats.pkts_ecn_ce); 355 + 356 + // Average cwnd 357 + fprintf(fout, "avg cwnd:%d\n", 358 + (int)(qstats.sum_cwnd / (qstats.sum_cwnd_cnt + 1))); 359 + // Average rtt 360 + fprintf(fout, "avg rtt:%d\n", 361 + (int)(qstats.sum_rtt / (qstats.pkts_total + 1))); 362 + // Average credit 363 + fprintf(fout, "avg credit:%d\n", 
364 + (int)(qstats.sum_credit / 365 + (1500 * ((int)qstats.pkts_total) + 1))); 366 + 367 + // Return values stats 368 + for (k = 0; k < RET_VAL_COUNT; k++) { 369 + percent_pkts = (qstats.returnValCount[k] * 100.0) / 370 + (qstats.pkts_total + 1); 371 + fprintf(fout, "%s:%6.2f (%d)\n", returnValNames[k], 372 + percent_pkts, (int)qstats.returnValCount[k]); 373 + } 361 374 fclose(fout); 362 375 } 363 376 ··· 403 366 { 404 367 printf("This program loads a cgroup skb BPF program to enforce\n" 405 368 "cgroup output (egress) bandwidth limits.\n\n" 406 - "USAGE: hbm [-o] [-d] [-l] [-n <id>] [-r <rate>] [-s]\n" 407 - " [-t <secs>] [-w] [-h] [prog]\n" 369 + "USAGE: hbm [-o] [-d] [-l] [-n <id>] [--no_cn] [-r <rate>]\n" 370 + " [-s] [-t <secs>] [-w] [-h] [prog]\n" 408 371 " Where:\n" 409 372 " -o indicates egress direction (default)\n" 410 373 " -d print BPF trace debug buffer\n" 411 374 " -l also limit flows using loopback\n" 412 375 " -n <#> to create cgroup \"/hbm#\" and attach prog\n" 413 376 " Default is /hbm1\n" 377 + " --no_cn disable CN notifcations\n" 414 378 " -r <rate> Rate in Mbps\n" 415 379 " -s Update HBM stats\n" 416 380 " -t <time> Exit after specified seconds (default is 0)\n" ··· 431 393 int k; 432 394 int cg_id = 1; 433 395 char *optstring = "iodln:r:st:wh"; 396 + struct option loptions[] = { 397 + {"no_cn", 0, NULL, 1}, 398 + {NULL, 0, NULL, 0} 399 + }; 434 400 435 - while ((k = getopt(argc, argv, optstring)) != -1) { 401 + while ((k = getopt_long(argc, argv, optstring, loptions, NULL)) != -1) { 436 402 switch (k) { 403 + case 1: 404 + no_cn_flag = true; 405 + break; 437 406 case'o': 438 407 break; 439 408 case 'd':
+8 -1
samples/bpf/hbm.h
··· 19 19 struct hbm_queue_stats { 20 20 unsigned long rate; /* in Mbps*/ 21 21 unsigned long stats:1, /* get HBM stats (marked, dropped,..) */ 22 - loopback:1; /* also limit flows using loopback */ 22 + loopback:1, /* also limit flows using loopback */ 23 + no_cn:1; /* do not use cn flags */ 23 24 unsigned long long pkts_marked; 24 25 unsigned long long bytes_marked; 25 26 unsigned long long pkts_dropped; ··· 29 28 unsigned long long bytes_total; 30 29 unsigned long long firstPacketTime; 31 30 unsigned long long lastPacketTime; 31 + unsigned long long pkts_ecn_ce; 32 + unsigned long long returnValCount[4]; 33 + unsigned long long sum_cwnd; 34 + unsigned long long sum_rtt; 35 + unsigned long long sum_cwnd_cnt; 36 + long long sum_credit; 32 37 };
+66 -11
samples/bpf/hbm_kern.h
··· 30 30 #define ALLOW_PKT 1 31 31 #define TCP_ECN_OK 1 32 32 33 - #define HBM_DEBUG 0 // Set to 1 to enable debugging 34 - #if HBM_DEBUG 35 - #define bpf_printk(fmt, ...) \ 36 - ({ \ 37 - char ____fmt[] = fmt; \ 38 - bpf_trace_printk(____fmt, sizeof(____fmt), \ 39 - ##__VA_ARGS__); \ 40 - }) 41 - #else 33 + #ifndef HBM_DEBUG // Define HBM_DEBUG to enable debugging 34 + #undef bpf_printk 42 35 #define bpf_printk(fmt, ...) 43 36 #endif 44 37 ··· 65 72 BPF_ANNOTATE_KV_PAIR(queue_stats, int, struct hbm_queue_stats); 66 73 67 74 struct hbm_pkt_info { 75 + int cwnd; 76 + int rtt; 68 77 bool is_ip; 69 78 bool is_tcp; 70 79 short ecn; 71 80 }; 81 + 82 + static int get_tcp_info(struct __sk_buff *skb, struct hbm_pkt_info *pkti) 83 + { 84 + struct bpf_sock *sk; 85 + struct bpf_tcp_sock *tp; 86 + 87 + sk = skb->sk; 88 + if (sk) { 89 + sk = bpf_sk_fullsock(sk); 90 + if (sk) { 91 + if (sk->protocol == IPPROTO_TCP) { 92 + tp = bpf_tcp_sock(sk); 93 + if (tp) { 94 + pkti->cwnd = tp->snd_cwnd; 95 + pkti->rtt = tp->srtt_us >> 3; 96 + return 0; 97 + } 98 + } 99 + } 100 + } 101 + return 1; 102 + } 72 103 73 104 static __always_inline void hbm_get_pkt_info(struct __sk_buff *skb, 74 105 struct hbm_pkt_info *pkti) ··· 100 83 struct iphdr iph; 101 84 struct ipv6hdr *ip6h; 102 85 86 + pkti->cwnd = 0; 87 + pkti->rtt = 0; 103 88 bpf_skb_load_bytes(skb, 0, &iph, 12); 104 89 if (iph.version == 6) { 105 90 ip6h = (struct ipv6hdr *)&iph; ··· 117 98 pkti->is_tcp = false; 118 99 pkti->ecn = 0; 119 100 } 101 + if (pkti->is_tcp) 102 + get_tcp_info(skb, pkti); 120 103 } 121 104 122 105 static __always_inline void hbm_init_vqueue(struct hbm_vqueue *qdp, int rate) ··· 133 112 int len, 134 113 unsigned long long curtime, 135 114 bool congestion_flag, 136 - bool drop_flag) 115 + bool drop_flag, 116 + bool cwr_flag, 117 + bool ecn_ce_flag, 118 + struct hbm_pkt_info *pkti, 119 + int credit) 137 120 { 121 + int rv = ALLOW_PKT; 122 + 138 123 if (qsp != NULL) { 139 124 // Following is needed for work 
conserving 140 125 __sync_add_and_fetch(&(qsp->bytes_total), len); ··· 150 123 qsp->firstPacketTime = curtime; 151 124 qsp->lastPacketTime = curtime; 152 125 __sync_add_and_fetch(&(qsp->pkts_total), 1); 153 - if (congestion_flag || drop_flag) { 126 + if (congestion_flag) { 154 127 __sync_add_and_fetch(&(qsp->pkts_marked), 1); 155 128 __sync_add_and_fetch(&(qsp->bytes_marked), len); 156 129 } ··· 159 132 __sync_add_and_fetch(&(qsp->bytes_dropped), 160 133 len); 161 134 } 135 + if (ecn_ce_flag) 136 + __sync_add_and_fetch(&(qsp->pkts_ecn_ce), 1); 137 + if (pkti->cwnd) { 138 + __sync_add_and_fetch(&(qsp->sum_cwnd), 139 + pkti->cwnd); 140 + __sync_add_and_fetch(&(qsp->sum_cwnd_cnt), 1); 141 + } 142 + if (pkti->rtt) 143 + __sync_add_and_fetch(&(qsp->sum_rtt), 144 + pkti->rtt); 145 + __sync_add_and_fetch(&(qsp->sum_credit), credit); 146 + 147 + if (drop_flag) 148 + rv = DROP_PKT; 149 + if (cwr_flag) 150 + rv |= 2; 151 + if (rv == DROP_PKT) 152 + __sync_add_and_fetch(&(qsp->returnValCount[0]), 153 + 1); 154 + else if (rv == ALLOW_PKT) 155 + __sync_add_and_fetch(&(qsp->returnValCount[1]), 156 + 1); 157 + else if (rv == 2) 158 + __sync_add_and_fetch(&(qsp->returnValCount[2]), 159 + 1); 160 + else if (rv == 3) 161 + __sync_add_and_fetch(&(qsp->returnValCount[3]), 162 + 1); 162 163 } 163 164 } 164 165 }
+35 -13
samples/bpf/hbm_out_kern.c
··· 62 62 unsigned int queue_index = 0; 63 63 unsigned long long curtime; 64 64 int credit; 65 - signed long long delta = 0, zero = 0; 65 + signed long long delta = 0, new_credit; 66 66 int max_credit = MAX_CREDIT; 67 67 bool congestion_flag = false; 68 68 bool drop_flag = false; 69 69 bool cwr_flag = false; 70 + bool ecn_ce_flag = false; 70 71 struct hbm_vqueue *qdp; 71 72 struct hbm_queue_stats *qsp = NULL; 72 73 int rv = ALLOW_PKT; ··· 100 99 */ 101 100 if (delta > 0) { 102 101 qdp->lasttime = curtime; 103 - credit += CREDIT_PER_NS(delta, qdp->rate); 104 - if (credit > MAX_CREDIT) 102 + new_credit = credit + CREDIT_PER_NS(delta, qdp->rate); 103 + if (new_credit > MAX_CREDIT) 105 104 credit = MAX_CREDIT; 105 + else 106 + credit = new_credit; 106 107 } 107 108 credit -= len; 108 109 qdp->credit = credit; ··· 122 119 // Set flags (drop, congestion, cwr) 123 120 // Dropping => we are congested, so ignore congestion flag 124 121 if (credit < -DROP_THRESH || 125 - (len > LARGE_PKT_THRESH && 126 - credit < -LARGE_PKT_DROP_THRESH)) { 127 - // Very congested, set drop flag 122 + (len > LARGE_PKT_THRESH && credit < -LARGE_PKT_DROP_THRESH)) { 123 + // Very congested, set drop packet 128 124 drop_flag = true; 125 + if (pkti.ecn) 126 + congestion_flag = true; 127 + else if (pkti.is_tcp) 128 + cwr_flag = true; 129 129 } else if (credit < 0) { 130 130 // Congested, set congestion flag 131 - if (pkti.ecn) { 131 + if (pkti.ecn || pkti.is_tcp) { 132 132 if (credit < -MARK_THRESH) 133 133 congestion_flag = true; 134 134 else ··· 142 136 } 143 137 144 138 if (congestion_flag) { 145 - if (!bpf_skb_ecn_set_ce(skb)) { 146 - if (len > LARGE_PKT_THRESH) { 139 + if (bpf_skb_ecn_set_ce(skb)) { 140 + ecn_ce_flag = true; 141 + } else { 142 + if (pkti.is_tcp) { 143 + unsigned int rand = bpf_get_prandom_u32(); 144 + 145 + if (-credit >= MARK_THRESH + 146 + (rand % MARK_REGION_SIZE)) { 147 + // Do congestion control 148 + cwr_flag = true; 149 + } 150 + } else if (len > LARGE_PKT_THRESH) { 147 
151 // Problem if too many small packets? 148 152 drop_flag = true; 149 153 } 150 154 } 151 155 } 152 156 153 - if (drop_flag) 154 - rv = DROP_PKT; 157 + if (qsp != NULL) 158 + if (qsp->no_cn) 159 + cwr_flag = false; 155 160 156 - hbm_update_stats(qsp, len, curtime, congestion_flag, drop_flag); 161 + hbm_update_stats(qsp, len, curtime, congestion_flag, drop_flag, 162 + cwr_flag, ecn_ce_flag, &pkti, credit); 157 163 158 - if (rv == DROP_PKT) 164 + if (drop_flag) { 159 165 __sync_add_and_fetch(&(qdp->credit), len); 166 + rv = DROP_PKT; 167 + } 160 168 169 + if (cwr_flag) 170 + rv |= 2; 161 171 return rv; 162 172 } 163 173 char _license[] SEC("license") = "GPL";
-7
samples/bpf/tcp_basertt_kern.c
··· 21 21 22 22 #define DEBUG 1 23 23 24 - #define bpf_printk(fmt, ...) \ 25 - ({ \ 26 - char ____fmt[] = fmt; \ 27 - bpf_trace_printk(____fmt, sizeof(____fmt), \ 28 - ##__VA_ARGS__); \ 29 - }) 30 - 31 24 SEC("sockops") 32 25 int bpf_basertt(struct bpf_sock_ops *skops) 33 26 {
-7
samples/bpf/tcp_bufs_kern.c
··· 22 22 23 23 #define DEBUG 1 24 24 25 - #define bpf_printk(fmt, ...) \ 26 - ({ \ 27 - char ____fmt[] = fmt; \ 28 - bpf_trace_printk(____fmt, sizeof(____fmt), \ 29 - ##__VA_ARGS__); \ 30 - }) 31 - 32 25 SEC("sockops") 33 26 int bpf_bufs(struct bpf_sock_ops *skops) 34 27 {
-7
samples/bpf/tcp_clamp_kern.c
··· 22 22 23 23 #define DEBUG 1 24 24 25 - #define bpf_printk(fmt, ...) \ 26 - ({ \ 27 - char ____fmt[] = fmt; \ 28 - bpf_trace_printk(____fmt, sizeof(____fmt), \ 29 - ##__VA_ARGS__); \ 30 - }) 31 - 32 25 SEC("sockops") 33 26 int bpf_clamp(struct bpf_sock_ops *skops) 34 27 {
-7
samples/bpf/tcp_cong_kern.c
··· 21 21 22 22 #define DEBUG 1 23 23 24 - #define bpf_printk(fmt, ...) \ 25 - ({ \ 26 - char ____fmt[] = fmt; \ 27 - bpf_trace_printk(____fmt, sizeof(____fmt), \ 28 - ##__VA_ARGS__); \ 29 - }) 30 - 31 24 SEC("sockops") 32 25 int bpf_cong(struct bpf_sock_ops *skops) 33 26 {
-7
samples/bpf/tcp_iw_kern.c
··· 22 22 23 23 #define DEBUG 1 24 24 25 - #define bpf_printk(fmt, ...) \ 26 - ({ \ 27 - char ____fmt[] = fmt; \ 28 - bpf_trace_printk(____fmt, sizeof(____fmt), \ 29 - ##__VA_ARGS__); \ 30 - }) 31 - 32 25 SEC("sockops") 33 26 int bpf_iw(struct bpf_sock_ops *skops) 34 27 {
-7
samples/bpf/tcp_rwnd_kern.c
··· 21 21 22 22 #define DEBUG 1 23 23 24 - #define bpf_printk(fmt, ...) \ 25 - ({ \ 26 - char ____fmt[] = fmt; \ 27 - bpf_trace_printk(____fmt, sizeof(____fmt), \ 28 - ##__VA_ARGS__); \ 29 - }) 30 - 31 24 SEC("sockops") 32 25 int bpf_rwnd(struct bpf_sock_ops *skops) 33 26 {
-7
samples/bpf/tcp_synrto_kern.c
··· 21 21 22 22 #define DEBUG 1 23 23 24 - #define bpf_printk(fmt, ...) \ 25 - ({ \ 26 - char ____fmt[] = fmt; \ 27 - bpf_trace_printk(____fmt, sizeof(____fmt), \ 28 - ##__VA_ARGS__); \ 29 - }) 30 - 31 24 SEC("sockops") 32 25 int bpf_synrto(struct bpf_sock_ops *skops) 33 26 {
-7
samples/bpf/tcp_tos_reflect_kern.c
··· 20 20 21 21 #define DEBUG 1 22 22 23 - #define bpf_printk(fmt, ...) \ 24 - ({ \ 25 - char ____fmt[] = fmt; \ 26 - bpf_trace_printk(____fmt, sizeof(____fmt), \ 27 - ##__VA_ARGS__); \ 28 - }) 29 - 30 23 SEC("sockops") 31 24 int bpf_basertt(struct bpf_sock_ops *skops) 32 25 {
+129 -17
samples/bpf/test_cgrp2_attach2.c tools/testing/selftests/bpf/test_cgroup_attach.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + 1 3 /* eBPF example program: 2 4 * 3 5 * - Creates arraymap in kernel with 4 bytes keys and 8 byte values ··· 27 25 #include <sys/resource.h> 28 26 #include <sys/time.h> 29 27 #include <unistd.h> 28 + #include <linux/filter.h> 30 29 31 30 #include <linux/bpf.h> 32 31 #include <bpf/bpf.h> 33 32 34 - #include "bpf_insn.h" 33 + #include "bpf_util.h" 35 34 #include "bpf_rlimit.h" 36 35 #include "cgroup_helpers.h" 37 36 38 37 #define FOO "/foo" 39 38 #define BAR "/foo/bar/" 40 - #define PING_CMD "ping -c1 -w1 127.0.0.1 > /dev/null" 39 + #define PING_CMD "ping -q -c1 -w1 127.0.0.1 > /dev/null" 41 40 42 41 char bpf_log_buf[BPF_LOG_BUF_SIZE]; 42 + 43 + #ifdef DEBUG 44 + #define debug(args...) printf(args) 45 + #else 46 + #define debug(args...) 47 + #endif 43 48 44 49 static int prog_load(int verdict) 45 50 { ··· 98 89 goto err; 99 90 } 100 91 101 - printf("Attached DROP prog. This ping in cgroup /foo should fail...\n"); 92 + debug("Attached DROP prog. This ping in cgroup /foo should fail...\n"); 102 93 assert(system(PING_CMD) != 0); 103 94 104 95 /* Create cgroup /foo/bar, get fd, and join it */ ··· 109 100 if (join_cgroup(BAR)) 110 101 goto err; 111 102 112 - printf("Attached DROP prog. This ping in cgroup /foo/bar should fail...\n"); 103 + debug("Attached DROP prog. This ping in cgroup /foo/bar should fail...\n"); 113 104 assert(system(PING_CMD) != 0); 114 105 115 106 if (bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS, ··· 118 109 goto err; 119 110 } 120 111 121 - printf("Attached PASS prog. This ping in cgroup /foo/bar should pass...\n"); 112 + debug("Attached PASS prog. 
This ping in cgroup /foo/bar should pass...\n"); 122 113 assert(system(PING_CMD) == 0); 123 114 124 115 if (bpf_prog_detach(bar, BPF_CGROUP_INET_EGRESS)) { ··· 126 117 goto err; 127 118 } 128 119 129 - printf("Detached PASS from /foo/bar while DROP is attached to /foo.\n" 120 + debug("Detached PASS from /foo/bar while DROP is attached to /foo.\n" 130 121 "This ping in cgroup /foo/bar should fail...\n"); 131 122 assert(system(PING_CMD) != 0); 132 123 ··· 141 132 goto err; 142 133 } 143 134 144 - printf("Attached PASS from /foo/bar and detached DROP from /foo.\n" 135 + debug("Attached PASS from /foo/bar and detached DROP from /foo.\n" 145 136 "This ping in cgroup /foo/bar should pass...\n"); 146 137 assert(system(PING_CMD) == 0); 147 138 ··· 208 199 close(bar); 209 200 cleanup_cgroup_environment(); 210 201 if (!rc) 211 - printf("### override:PASS\n"); 202 + printf("#override:PASS\n"); 212 203 else 213 - printf("### override:FAIL\n"); 204 + printf("#override:FAIL\n"); 214 205 return rc; 215 206 } 216 207 ··· 450 441 close(cg5); 451 442 cleanup_cgroup_environment(); 452 443 if (!rc) 453 - printf("### multi:PASS\n"); 444 + printf("#multi:PASS\n"); 454 445 else 455 - printf("### multi:FAIL\n"); 446 + printf("#multi:FAIL\n"); 456 447 return rc; 457 448 } 458 449 459 - int main(int argc, char **argv) 450 + static int test_autodetach(void) 460 451 { 461 - int rc = 0; 452 + __u32 prog_cnt = 4, attach_flags; 453 + int allow_prog[2] = {0}; 454 + __u32 prog_ids[2] = {0}; 455 + int cg = 0, i, rc = -1; 456 + void *ptr = NULL; 457 + int attempts; 462 458 463 - rc = test_foo_bar(); 464 - if (rc) 465 - return rc; 459 + for (i = 0; i < ARRAY_SIZE(allow_prog); i++) { 460 + allow_prog[i] = prog_load_cnt(1, 1 << i); 461 + if (!allow_prog[i]) 462 + goto err; 463 + } 466 464 467 - return test_multiprog(); 465 + if (setup_cgroup_environment()) 466 + goto err; 467 + 468 + /* create a cgroup, attach two programs and remember their ids */ 469 + cg = create_and_get_cgroup("/cg_autodetach"); 
470 + if (cg < 0) 471 + goto err; 472 + 473 + if (join_cgroup("/cg_autodetach")) 474 + goto err; 475 + 476 + for (i = 0; i < ARRAY_SIZE(allow_prog); i++) { 477 + if (bpf_prog_attach(allow_prog[i], cg, BPF_CGROUP_INET_EGRESS, 478 + BPF_F_ALLOW_MULTI)) { 479 + log_err("Attaching prog[%d] to cg:egress", i); 480 + goto err; 481 + } 482 + } 483 + 484 + /* make sure that programs are attached and run some traffic */ 485 + assert(bpf_prog_query(cg, BPF_CGROUP_INET_EGRESS, 0, &attach_flags, 486 + prog_ids, &prog_cnt) == 0); 487 + assert(system(PING_CMD) == 0); 488 + 489 + /* allocate some memory (4Mb) to pin the original cgroup */ 490 + ptr = malloc(4 * (1 << 20)); 491 + if (!ptr) 492 + goto err; 493 + 494 + /* close programs and cgroup fd */ 495 + for (i = 0; i < ARRAY_SIZE(allow_prog); i++) { 496 + close(allow_prog[i]); 497 + allow_prog[i] = 0; 498 + } 499 + 500 + close(cg); 501 + cg = 0; 502 + 503 + /* leave the cgroup and remove it. don't detach programs */ 504 + cleanup_cgroup_environment(); 505 + 506 + /* wait for the asynchronous auto-detachment. 507 + * wait for no more than 5 sec and give up. 
508 + */ 509 + for (i = 0; i < ARRAY_SIZE(prog_ids); i++) { 510 + for (attempts = 5; attempts >= 0; attempts--) { 511 + int fd = bpf_prog_get_fd_by_id(prog_ids[i]); 512 + 513 + if (fd < 0) 514 + break; 515 + 516 + /* don't leave the fd open */ 517 + close(fd); 518 + 519 + if (!attempts) 520 + goto err; 521 + 522 + sleep(1); 523 + } 524 + } 525 + 526 + rc = 0; 527 + err: 528 + for (i = 0; i < ARRAY_SIZE(allow_prog); i++) 529 + if (allow_prog[i] > 0) 530 + close(allow_prog[i]); 531 + if (cg) 532 + close(cg); 533 + free(ptr); 534 + cleanup_cgroup_environment(); 535 + if (!rc) 536 + printf("#autodetach:PASS\n"); 537 + else 538 + printf("#autodetach:FAIL\n"); 539 + return rc; 540 + } 541 + 542 + int main(void) 543 + { 544 + int (*tests[])(void) = { 545 + test_foo_bar, 546 + test_multiprog, 547 + test_autodetach, 548 + }; 549 + int errors = 0; 550 + int i; 551 + 552 + for (i = 0; i < ARRAY_SIZE(tests); i++) 553 + if (tests[i]()) 554 + errors++; 555 + 556 + if (errors) 557 + printf("test_cgroup_attach:FAIL\n"); 558 + else 559 + printf("test_cgroup_attach:PASS\n"); 560 + 561 + return errors ? EXIT_FAILURE : EXIT_SUCCESS; 468 562 }
-7
samples/bpf/xdp_sample_pkts_kern.c
··· 7 7 #define SAMPLE_SIZE 64ul 8 8 #define MAX_CPUS 128 9 9 10 - #define bpf_printk(fmt, ...) \ 11 - ({ \ 12 - char ____fmt[] = fmt; \ 13 - bpf_trace_printk(____fmt, sizeof(____fmt), \ 14 - ##__VA_ARGS__); \ 15 - }) 16 - 17 10 struct bpf_map_def SEC("maps") my_map = { 18 11 .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY, 19 12 .key_size = sizeof(int),
+24 -15
tools/bpf/bpftool/Documentation/bpftool-btf.rst
··· 19 19 BTF COMMANDS 20 20 ============= 21 21 22 - | **bpftool** **btf dump** *BTF_SRC* 22 + | **bpftool** **btf dump** *BTF_SRC* [**format** *FORMAT*] 23 23 | **bpftool** **btf help** 24 24 | 25 25 | *BTF_SRC* := { **id** *BTF_ID* | **prog** *PROG* | **map** *MAP* [{**key** | **value** | **kv** | **all**}] | **file** *FILE* } 26 + | *FORMAT* := { **raw** | **c** } 26 27 | *MAP* := { **id** *MAP_ID* | **pinned** *FILE* } 27 28 | *PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* } 28 29 ··· 32 31 **bpftool btf dump** *BTF_SRC* 33 32 Dump BTF entries from a given *BTF_SRC*. 34 33 35 - When **id** is specified, BTF object with that ID will be 36 - loaded and all its BTF types emitted. 34 + When **id** is specified, BTF object with that ID will be 35 + loaded and all its BTF types emitted. 37 36 38 - When **map** is provided, it's expected that map has 39 - associated BTF object with BTF types describing key and 40 - value. It's possible to select whether to dump only BTF 41 - type(s) associated with key (**key**), value (**value**), 42 - both key and value (**kv**), or all BTF types present in 43 - associated BTF object (**all**). If not specified, **kv** 44 - is assumed. 37 + When **map** is provided, it's expected that map has 38 + associated BTF object with BTF types describing key and 39 + value. It's possible to select whether to dump only BTF 40 + type(s) associated with key (**key**), value (**value**), 41 + both key and value (**kv**), or all BTF types present in 42 + associated BTF object (**all**). If not specified, **kv** 43 + is assumed. 45 44 46 - When **prog** is provided, it's expected that program has 47 - associated BTF object with BTF types. 45 + When **prog** is provided, it's expected that program has 46 + associated BTF object with BTF types. 48 47 49 - When specifying *FILE*, an ELF file is expected, containing 50 - .BTF section with well-defined BTF binary format data, 51 - typically produced by clang or pahole. 
48 + When specifying *FILE*, an ELF file is expected, containing 49 + .BTF section with well-defined BTF binary format data, 50 + typically produced by clang or pahole. 51 + 52 + **format** option can be used to override default (raw) 53 + output format. Raw (**raw**) or C-syntax (**c**) output 54 + formats are supported. 52 55 53 56 **bpftool btf help** 54 57 Print short help message. ··· 71 66 72 67 -p, --pretty 73 68 Generate human-readable JSON output. Implies **-j**. 69 + 70 + -d, --debug 71 + Print all logs available from libbpf, including debug-level 72 + information. 74 73 75 74 EXAMPLES 76 75 ========
+4
tools/bpf/bpftool/Documentation/bpftool-cgroup.rst
··· 113 113 -f, --bpffs 114 114 Show file names of pinned programs. 115 115 116 + -d, --debug 117 + Print all logs available from libbpf, including debug-level 118 + information. 119 + 116 120 EXAMPLES 117 121 ======== 118 122 |
+4
tools/bpf/bpftool/Documentation/bpftool-feature.rst
··· 73 73 -p, --pretty 74 74 Generate human-readable JSON output. Implies **-j**. 75 75 76 + -d, --debug 77 + Print all logs available from libbpf, including debug-level 78 + information. 79 + 76 80 SEE ALSO 77 81 ======== 78 82 **bpf**\ (2),
+4
tools/bpf/bpftool/Documentation/bpftool-map.rst
··· 152 152 Do not automatically attempt to mount any virtual file system 153 153 (such as tracefs or BPF virtual file system) when necessary. 154 154 155 + -d, --debug 156 + Print all logs available from libbpf, including debug-level 157 + information. 158 + 155 159 EXAMPLES 156 160 ======== 157 161 **# bpftool map show**
+4
tools/bpf/bpftool/Documentation/bpftool-net.rst
··· 65 65 -p, --pretty 66 66 Generate human-readable JSON output. Implies **-j**. 67 67 68 + -d, --debug 69 + Print all logs available from libbpf, including debug-level 70 + information. 71 + 68 72 EXAMPLES 69 73 ======== 70 74
+4
tools/bpf/bpftool/Documentation/bpftool-perf.rst
··· 53 53 -p, --pretty 54 54 Generate human-readable JSON output. Implies **-j**. 55 55 56 + -d, --debug 57 + Print all logs available from libbpf, including debug-level 58 + information. 59 + 56 60 EXAMPLES 57 61 ======== 58 62
+5
tools/bpf/bpftool/Documentation/bpftool-prog.rst
··· 174 174 Do not automatically attempt to mount any virtual file system 175 175 (such as tracefs or BPF virtual file system) when necessary. 176 176 177 + -d, --debug 178 + Print all logs available, even debug-level information. This 179 + includes logs from libbpf as well as from the verifier, when 180 + attempting to load programs. 181 + 177 182 EXAMPLES 178 183 ======== 179 184 **# bpftool prog show**
+4
tools/bpf/bpftool/Documentation/bpftool.rst
··· 66 66 Do not automatically attempt to mount any virtual file system 67 67 (such as tracefs or BPF virtual file system) when necessary. 68 68 69 + -d, --debug 70 + Print all logs available, even debug-level information. This 71 + includes logs from libbpf as well as from the verifier, when 72 + attempting to load programs. 69 73 70 74 SEE ALSO 71 75 ========
+27 -5
tools/bpf/bpftool/bash-completion/bpftool
··· 71 71 command sed -n 's/.*"tag": "\(.*\)",$/\1/p' )" -- "$cur" ) ) 72 72 } 73 73 74 + _bpftool_get_btf_ids() 75 + { 76 + COMPREPLY+=( $( compgen -W "$( bpftool -jp prog 2>&1 | \ 77 + command sed -n 's/.*"btf_id": \(.*\),\?$/\1/p' )" -- "$cur" ) ) 78 + } 79 + 74 80 _bpftool_get_obj_map_names() 75 81 { 76 82 local obj ··· 187 181 188 182 # Deal with options 189 183 if [[ ${words[cword]} == -* ]]; then 190 - local c='--version --json --pretty --bpffs --mapcompat' 184 + local c='--version --json --pretty --bpffs --mapcompat --debug' 191 185 COMPREPLY=( $( compgen -W "$c" -- "$cur" ) ) 192 186 return 0 193 187 fi ··· 641 635 map) 642 636 _bpftool_get_map_ids 643 637 ;; 638 + dump) 639 + _bpftool_get_btf_ids 640 + ;; 644 641 esac 645 642 return 0 646 643 ;; 644 + format) 645 + COMPREPLY=( $( compgen -W "c raw" -- "$cur" ) ) 646 + ;; 647 647 *) 648 - if [[ $cword == 6 ]] && [[ ${words[3]} == "map" ]]; then 649 - COMPREPLY+=( $( compgen -W 'key value kv all' -- \ 650 - "$cur" ) ) 651 - fi 648 + # emit extra options 649 + case ${words[3]} in 650 + id|file) 651 + _bpftool_once_attr 'format' 652 + ;; 653 + map|prog) 654 + if [[ ${words[3]} == "map" ]] && [[ $cword == 6 ]]; then 655 + COMPREPLY+=( $( compgen -W "key value kv all" -- "$cur" ) ) 656 + fi 657 + _bpftool_once_attr 'format' 658 + ;; 659 + *) 660 + ;; 661 + esac 652 662 return 0 653 663 ;; 654 664 esac
+67 -97
tools/bpf/bpftool/btf.c
··· 8 8 #include <stdio.h> 9 9 #include <string.h> 10 10 #include <unistd.h> 11 - #include <gelf.h> 12 11 #include <bpf.h> 12 + #include <libbpf.h> 13 13 #include <linux/btf.h> 14 14 15 15 #include "btf.h" ··· 340 340 return 0; 341 341 } 342 342 343 - static bool check_btf_endianness(GElf_Ehdr *ehdr) 343 + static void __printf(2, 0) btf_dump_printf(void *ctx, 344 + const char *fmt, va_list args) 344 345 { 345 - static unsigned int const endian = 1; 346 - 347 - switch (ehdr->e_ident[EI_DATA]) { 348 - case ELFDATA2LSB: 349 - return *(unsigned char const *)&endian == 1; 350 - case ELFDATA2MSB: 351 - return *(unsigned char const *)&endian == 0; 352 - default: 353 - return 0; 354 - } 346 + vfprintf(stdout, fmt, args); 355 347 } 356 348 357 - static int btf_load_from_elf(const char *path, struct btf **btf) 349 + static int dump_btf_c(const struct btf *btf, 350 + __u32 *root_type_ids, int root_type_cnt) 358 351 { 359 - int err = -1, fd = -1, idx = 0; 360 - Elf_Data *btf_data = NULL; 361 - Elf_Scn *scn = NULL; 362 - Elf *elf = NULL; 363 - GElf_Ehdr ehdr; 352 + struct btf_dump *d; 353 + int err = 0, i; 364 354 365 - if (elf_version(EV_CURRENT) == EV_NONE) { 366 - p_err("failed to init libelf for %s", path); 367 - return -1; 368 - } 355 + d = btf_dump__new(btf, NULL, NULL, btf_dump_printf); 356 + if (IS_ERR(d)) 357 + return PTR_ERR(d); 369 358 370 - fd = open(path, O_RDONLY); 371 - if (fd < 0) { 372 - p_err("failed to open %s: %s", path, strerror(errno)); 373 - return -1; 374 - } 375 - 376 - elf = elf_begin(fd, ELF_C_READ, NULL); 377 - if (!elf) { 378 - p_err("failed to open %s as ELF file", path); 379 - goto done; 380 - } 381 - if (!gelf_getehdr(elf, &ehdr)) { 382 - p_err("failed to get EHDR from %s", path); 383 - goto done; 384 - } 385 - if (!check_btf_endianness(&ehdr)) { 386 - p_err("non-native ELF endianness is not supported"); 387 - goto done; 388 - } 389 - if (!elf_rawdata(elf_getscn(elf, ehdr.e_shstrndx), NULL)) { 390 - p_err("failed to get e_shstrndx from %s\n", 
path); 391 - goto done; 392 - } 393 - 394 - while ((scn = elf_nextscn(elf, scn)) != NULL) { 395 - GElf_Shdr sh; 396 - char *name; 397 - 398 - idx++; 399 - if (gelf_getshdr(scn, &sh) != &sh) { 400 - p_err("failed to get section(%d) header from %s", 401 - idx, path); 402 - goto done; 403 - } 404 - name = elf_strptr(elf, ehdr.e_shstrndx, sh.sh_name); 405 - if (!name) { 406 - p_err("failed to get section(%d) name from %s", 407 - idx, path); 408 - goto done; 409 - } 410 - if (strcmp(name, BTF_ELF_SEC) == 0) { 411 - btf_data = elf_getdata(scn, 0); 412 - if (!btf_data) { 413 - p_err("failed to get section(%d, %s) data from %s", 414 - idx, name, path); 359 + if (root_type_cnt) { 360 + for (i = 0; i < root_type_cnt; i++) { 361 + err = btf_dump__dump_type(d, root_type_ids[i]); 362 + if (err) 415 363 goto done; 416 - } 417 - break; 364 + } 365 + } else { 366 + int cnt = btf__get_nr_types(btf); 367 + 368 + for (i = 1; i <= cnt; i++) { 369 + err = btf_dump__dump_type(d, i); 370 + if (err) 371 + goto done; 418 372 } 419 373 } 420 374 421 - if (!btf_data) { 422 - p_err("%s ELF section not found in %s", BTF_ELF_SEC, path); 423 - goto done; 424 - } 425 - 426 - *btf = btf__new(btf_data->d_buf, btf_data->d_size); 427 - if (IS_ERR(*btf)) { 428 - err = PTR_ERR(*btf); 429 - *btf = NULL; 430 - p_err("failed to load BTF data from %s: %s", 431 - path, strerror(err)); 432 - goto done; 433 - } 434 - 435 - err = 0; 436 375 done: 437 - if (err) { 438 - if (*btf) { 439 - btf__free(*btf); 440 - *btf = NULL; 441 - } 442 - } 443 - if (elf) 444 - elf_end(elf); 445 - close(fd); 376 + btf_dump__free(d); 446 377 return err; 447 378 } 448 379 ··· 382 451 struct btf *btf = NULL; 383 452 __u32 root_type_ids[2]; 384 453 int root_type_cnt = 0; 454 + bool dump_c = false; 385 455 __u32 btf_id = -1; 386 456 const char *src; 387 457 int fd = -1; ··· 454 522 } 455 523 NEXT_ARG(); 456 524 } else if (is_prefix(src, "file")) { 457 - err = btf_load_from_elf(*argv, &btf); 458 - if (err) 525 + btf = 
btf__parse_elf(*argv, NULL); 526 + if (IS_ERR(btf)) { 527 + err = PTR_ERR(btf); 528 + btf = NULL; 529 + p_err("failed to load BTF from %s: %s", 530 + *argv, strerror(err)); 459 531 goto done; 532 + } 460 533 NEXT_ARG(); 461 534 } else { 462 535 err = -1; 463 536 p_err("unrecognized BTF source specifier: '%s'", src); 464 537 goto done; 538 + } 539 + 540 + while (argc) { 541 + if (is_prefix(*argv, "format")) { 542 + NEXT_ARG(); 543 + if (argc < 1) { 544 + p_err("expecting value for 'format' option\n"); 545 + goto done; 546 + } 547 + if (strcmp(*argv, "c") == 0) { 548 + dump_c = true; 549 + } else if (strcmp(*argv, "raw") == 0) { 550 + dump_c = false; 551 + } else { 552 + p_err("unrecognized format specifier: '%s', possible values: raw, c", 553 + *argv); 554 + goto done; 555 + } 556 + NEXT_ARG(); 557 + } else { 558 + p_err("unrecognized option: '%s'", *argv); 559 + goto done; 560 + } 465 561 } 466 562 467 563 if (!btf) { ··· 505 545 } 506 546 } 507 547 508 - dump_btf_raw(btf, root_type_ids, root_type_cnt); 548 + if (dump_c) { 549 + if (json_output) { 550 + p_err("JSON output for C-syntax dump is not supported"); 551 + err = -ENOTSUP; 552 + goto done; 553 + } 554 + err = dump_btf_c(btf, root_type_ids, root_type_cnt); 555 + } else { 556 + err = dump_btf_raw(btf, root_type_ids, root_type_cnt); 557 + } 509 558 510 559 done: 511 560 close(fd); ··· 530 561 } 531 562 532 563 fprintf(stderr, 533 - "Usage: %s btf dump BTF_SRC\n" 564 + "Usage: %s btf dump BTF_SRC [format FORMAT]\n" 534 565 " %s btf help\n" 535 566 "\n" 536 567 " BTF_SRC := { id BTF_ID | prog PROG | map MAP [{key | value | kv | all}] | file FILE }\n" 568 + " FORMAT := { raw | c }\n" 537 569 " " HELP_SPEC_MAP "\n" 538 570 " " HELP_SPEC_PROGRAM "\n" 539 571 " " HELP_SPEC_OPTIONS "\n"
+15 -1
tools/bpf/bpftool/main.c
··· 10 10 #include <string.h> 11 11 12 12 #include <bpf.h> 13 + #include <libbpf.h> 13 14 14 15 #include "main.h" 15 16 ··· 26 25 bool json_output; 27 26 bool show_pinned; 28 27 bool block_mount; 28 + bool verifier_logs; 29 29 int bpf_flags; 30 30 struct pinned_obj_table prog_table; 31 31 struct pinned_obj_table map_table; ··· 77 75 printf("%s v%s\n", bin_name, BPFTOOL_VERSION); 78 76 } 79 77 return 0; 78 + } 79 + 80 + static int __printf(2, 0) 81 + print_all_levels(__maybe_unused enum libbpf_print_level level, 82 + const char *format, va_list args) 83 + { 84 + return vfprintf(stderr, format, args); 80 85 } 81 86 82 87 int cmd_select(const struct cmd *cmds, int argc, char **argv, ··· 326 317 { "bpffs", no_argument, NULL, 'f' }, 327 318 { "mapcompat", no_argument, NULL, 'm' }, 328 319 { "nomount", no_argument, NULL, 'n' }, 320 + { "debug", no_argument, NULL, 'd' }, 329 321 { 0 } 330 322 }; 331 323 int opt, ret; ··· 342 332 hash_init(map_table.table); 343 333 344 334 opterr = 0; 345 - while ((opt = getopt_long(argc, argv, "Vhpjfmn", 335 + while ((opt = getopt_long(argc, argv, "Vhpjfmnd", 346 336 options, NULL)) >= 0) { 347 337 switch (opt) { 348 338 case 'V': ··· 371 361 break; 372 362 case 'n': 373 363 block_mount = true; 364 + break; 365 + case 'd': 366 + libbpf_set_print(print_all_levels); 367 + verifier_logs = true; 374 368 break; 375 369 default: 376 370 p_err("unrecognized option '%s'", argv[optind - 1]);
+1
tools/bpf/bpftool/main.h
··· 91 91 extern bool json_output; 92 92 extern bool show_pinned; 93 93 extern bool block_mount; 94 + extern bool verifier_logs; 94 95 extern int bpf_flags; 95 96 extern struct pinned_obj_table prog_table; 96 97 extern struct pinned_obj_table map_table;
+17 -10
tools/bpf/bpftool/prog.c
··· 750 750 751 751 static int load_with_options(int argc, char **argv, bool first_prog_only) 752 752 { 753 - enum bpf_attach_type expected_attach_type; 754 - struct bpf_object_open_attr attr = { 755 - .prog_type = BPF_PROG_TYPE_UNSPEC, 753 + struct bpf_object_load_attr load_attr = { 0 }; 754 + struct bpf_object_open_attr open_attr = { 755 + .prog_type = BPF_PROG_TYPE_UNSPEC, 756 756 }; 757 + enum bpf_attach_type expected_attach_type; 757 758 struct map_replace *map_replace = NULL; 758 759 struct bpf_program *prog = NULL, *pos; 759 760 unsigned int old_map_fds = 0; ··· 768 767 769 768 if (!REQ_ARGS(2)) 770 769 return -1; 771 - attr.file = GET_ARG(); 770 + open_attr.file = GET_ARG(); 772 771 pinfile = GET_ARG(); 773 772 774 773 while (argc) { ··· 777 776 778 777 NEXT_ARG(); 779 778 780 - if (attr.prog_type != BPF_PROG_TYPE_UNSPEC) { 779 + if (open_attr.prog_type != BPF_PROG_TYPE_UNSPEC) { 781 780 p_err("program type already specified"); 782 781 goto err_free_reuse_maps; 783 782 } ··· 794 793 strcat(type, *argv); 795 794 strcat(type, "/"); 796 795 797 - err = libbpf_prog_type_by_name(type, &attr.prog_type, 796 + err = libbpf_prog_type_by_name(type, 797 + &open_attr.prog_type, 798 798 &expected_attach_type); 799 799 free(type); 800 800 if (err < 0) ··· 883 881 884 882 set_max_rlimit(); 885 883 886 - obj = __bpf_object__open_xattr(&attr, bpf_flags); 884 + obj = __bpf_object__open_xattr(&open_attr, bpf_flags); 887 885 if (IS_ERR_OR_NULL(obj)) { 888 886 p_err("failed to open object file"); 889 887 goto err_free_reuse_maps; 890 888 } 891 889 892 890 bpf_object__for_each_program(pos, obj) { 893 - enum bpf_prog_type prog_type = attr.prog_type; 891 + enum bpf_prog_type prog_type = open_attr.prog_type; 894 892 895 - if (attr.prog_type == BPF_PROG_TYPE_UNSPEC) { 893 + if (open_attr.prog_type == BPF_PROG_TYPE_UNSPEC) { 896 894 const char *sec_name = bpf_program__title(pos, false); 897 895 898 896 err = libbpf_prog_type_by_name(sec_name, &prog_type, ··· 962 960 goto 
err_close_obj; 963 961 } 964 962 965 - err = bpf_object__load(obj); 963 + load_attr.obj = obj; 964 + if (verifier_logs) 965 + /* log_level1 + log_level2 + stats, but not stable UAPI */ 966 + load_attr.log_level = 1 + 2 + 4; 967 + 968 + err = bpf_object__load_xattr(&load_attr); 966 969 if (err) { 967 970 p_err("failed to load object file"); 968 971 goto err_close_obj;
+1 -3
tools/bpf/bpftool/xlated_dumper.c
··· 31 31 if (!fp) 32 32 return; 33 33 34 - while (!feof(fp)) { 35 - if (!fgets(buff, sizeof(buff), fp)) 36 - break; 34 + while (fgets(buff, sizeof(buff), fp)) { 37 35 tmp = reallocarray(dd->sym_mapping, dd->sym_count + 1, 38 36 sizeof(*dd->sym_mapping)); 39 37 if (!tmp) {
+34 -1
tools/include/uapi/linux/bpf.h
··· 260 260 */ 261 261 #define BPF_F_ANY_ALIGNMENT (1U << 1) 262 262 263 + /* BPF_F_TEST_RND_HI32 is used in BPF_PROG_LOAD command for testing purpose. 264 + * Verifier does sub-register def/use analysis and identifies instructions whose 265 + * def only matters for low 32-bit, high 32-bit is never referenced later 266 + * through implicit zero extension. Therefore verifier notifies JIT back-ends 267 + * that it is safe to ignore clearing high 32-bit for these instructions. This 268 + * saves some back-ends a lot of code-gen. However such optimization is not 269 + * necessary on some arches, for example x86_64, arm64 etc, whose JIT back-ends 270 + * hence hasn't used verifier's analysis result. But, we really want to have a 271 + * way to be able to verify the correctness of the described optimization on 272 + * x86_64 on which testsuites are frequently exercised. 273 + * 274 + * So, this flag is introduced. Once it is set, verifier will randomize high 275 + * 32-bit for those instructions who has been identified as safe to ignore them. 276 + * Then, if verifier is not doing correct analysis, such randomization will 277 + * regress tests to expose bugs. 278 + */ 279 + #define BPF_F_TEST_RND_HI32 (1U << 2) 280 + 263 281 /* When BPF ldimm64's insn[0].src_reg != 0 then this can have 264 282 * two extensions: 265 283 * ··· 2690 2672 * 0 on success. 2691 2673 * 2692 2674 * **-ENOENT** if the bpf-local-storage cannot be found. 2675 + * 2676 + * int bpf_send_signal(u32 sig) 2677 + * Description 2678 + * Send signal *sig* to the current task. 2679 + * Return 2680 + * 0 on success or successfully queued. 2681 + * 2682 + * **-EBUSY** if work queue under nmi is full. 2683 + * 2684 + * **-EINVAL** if *sig* is invalid. 2685 + * 2686 + * **-EPERM** if no permission to send the *sig*. 2687 + * 2688 + * **-EAGAIN** if bpf program can try again. 
2693 2689 */ 2694 2690 #define __BPF_FUNC_MAPPER(FN) \ 2695 2691 FN(unspec), \ ··· 2814 2782 FN(strtol), \ 2815 2783 FN(strtoul), \ 2816 2784 FN(sk_storage_get), \ 2817 - FN(sk_storage_delete), 2785 + FN(sk_storage_delete), \ 2786 + FN(send_signal), 2818 2787 2819 2788 /* integer value in 'imm' field of BPF_CALL instruction selects which helper 2820 2789 * function eBPF program intends to call
+114
tools/include/uapi/linux/if_tun.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */ 2 + /* 3 + * Universal TUN/TAP device driver. 4 + * Copyright (C) 1999-2000 Maxim Krasnyansky <max_mk@yahoo.com> 5 + * 6 + * This program is free software; you can redistribute it and/or modify 7 + * it under the terms of the GNU General Public License as published by 8 + * the Free Software Foundation; either version 2 of the License, or 9 + * (at your option) any later version. 10 + * 11 + * This program is distributed in the hope that it will be useful, 12 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 + * GNU General Public License for more details. 15 + */ 16 + 17 + #ifndef _UAPI__IF_TUN_H 18 + #define _UAPI__IF_TUN_H 19 + 20 + #include <linux/types.h> 21 + #include <linux/if_ether.h> 22 + #include <linux/filter.h> 23 + 24 + /* Read queue size */ 25 + #define TUN_READQ_SIZE 500 26 + /* TUN device type flags: deprecated. Use IFF_TUN/IFF_TAP instead. 
*/ 27 + #define TUN_TUN_DEV IFF_TUN 28 + #define TUN_TAP_DEV IFF_TAP 29 + #define TUN_TYPE_MASK 0x000f 30 + 31 + /* Ioctl defines */ 32 + #define TUNSETNOCSUM _IOW('T', 200, int) 33 + #define TUNSETDEBUG _IOW('T', 201, int) 34 + #define TUNSETIFF _IOW('T', 202, int) 35 + #define TUNSETPERSIST _IOW('T', 203, int) 36 + #define TUNSETOWNER _IOW('T', 204, int) 37 + #define TUNSETLINK _IOW('T', 205, int) 38 + #define TUNSETGROUP _IOW('T', 206, int) 39 + #define TUNGETFEATURES _IOR('T', 207, unsigned int) 40 + #define TUNSETOFFLOAD _IOW('T', 208, unsigned int) 41 + #define TUNSETTXFILTER _IOW('T', 209, unsigned int) 42 + #define TUNGETIFF _IOR('T', 210, unsigned int) 43 + #define TUNGETSNDBUF _IOR('T', 211, int) 44 + #define TUNSETSNDBUF _IOW('T', 212, int) 45 + #define TUNATTACHFILTER _IOW('T', 213, struct sock_fprog) 46 + #define TUNDETACHFILTER _IOW('T', 214, struct sock_fprog) 47 + #define TUNGETVNETHDRSZ _IOR('T', 215, int) 48 + #define TUNSETVNETHDRSZ _IOW('T', 216, int) 49 + #define TUNSETQUEUE _IOW('T', 217, int) 50 + #define TUNSETIFINDEX _IOW('T', 218, unsigned int) 51 + #define TUNGETFILTER _IOR('T', 219, struct sock_fprog) 52 + #define TUNSETVNETLE _IOW('T', 220, int) 53 + #define TUNGETVNETLE _IOR('T', 221, int) 54 + /* The TUNSETVNETBE and TUNGETVNETBE ioctls are for cross-endian support on 55 + * little-endian hosts. Not all kernel configurations support them, but all 56 + * configurations that support SET also support GET. 
57 + */ 58 + #define TUNSETVNETBE _IOW('T', 222, int) 59 + #define TUNGETVNETBE _IOR('T', 223, int) 60 + #define TUNSETSTEERINGEBPF _IOR('T', 224, int) 61 + #define TUNSETFILTEREBPF _IOR('T', 225, int) 62 + #define TUNSETCARRIER _IOW('T', 226, int) 63 + #define TUNGETDEVNETNS _IO('T', 227) 64 + 65 + /* TUNSETIFF ifr flags */ 66 + #define IFF_TUN 0x0001 67 + #define IFF_TAP 0x0002 68 + #define IFF_NAPI 0x0010 69 + #define IFF_NAPI_FRAGS 0x0020 70 + #define IFF_NO_PI 0x1000 71 + /* This flag has no real effect */ 72 + #define IFF_ONE_QUEUE 0x2000 73 + #define IFF_VNET_HDR 0x4000 74 + #define IFF_TUN_EXCL 0x8000 75 + #define IFF_MULTI_QUEUE 0x0100 76 + #define IFF_ATTACH_QUEUE 0x0200 77 + #define IFF_DETACH_QUEUE 0x0400 78 + /* read-only flag */ 79 + #define IFF_PERSIST 0x0800 80 + #define IFF_NOFILTER 0x1000 81 + 82 + /* Socket options */ 83 + #define TUN_TX_TIMESTAMP 1 84 + 85 + /* Features for GSO (TUNSETOFFLOAD). */ 86 + #define TUN_F_CSUM 0x01 /* You can hand me unchecksummed packets. */ 87 + #define TUN_F_TSO4 0x02 /* I can handle TSO for IPv4 packets */ 88 + #define TUN_F_TSO6 0x04 /* I can handle TSO for IPv6 packets */ 89 + #define TUN_F_TSO_ECN 0x08 /* I can handle TSO with ECN bits. */ 90 + #define TUN_F_UFO 0x10 /* I can handle UFO packets */ 91 + 92 + /* Protocol info prepended to the packets (when IFF_NO_PI is not set) */ 93 + #define TUN_PKT_STRIP 0x0001 94 + struct tun_pi { 95 + __u16 flags; 96 + __be16 proto; 97 + }; 98 + 99 + /* 100 + * Filter spec (used for SETXXFILTER ioctls) 101 + * This stuff is applicable only to the TAP (Ethernet) devices. 102 + * If the count is zero the filter is disabled and the driver accepts 103 + * all packets (promisc mode). 104 + * If the filter is enabled in order to accept broadcast packets 105 + * broadcast addr must be explicitly included in the addr list. 
106 + */ 107 + #define TUN_FLT_ALLMULTI 0x0001 /* Accept all multicast packets */ 108 + struct tun_filter { 109 + __u16 flags; /* TUN_FLT_ flags see above */ 110 + __u16 count; /* Number of addresses */ 111 + __u8 addr[0][ETH_ALEN]; 112 + }; 113 + 114 + #endif /* _UAPI__IF_TUN_H */
+3 -1
tools/lib/bpf/Build
··· 1 - libbpf-y := libbpf.o bpf.o nlattr.o btf.o libbpf_errno.o str_error.o netlink.o bpf_prog_linfo.o libbpf_probes.o xsk.o 1 + libbpf-y := libbpf.o bpf.o nlattr.o btf.o libbpf_errno.o str_error.o \ 2 + netlink.o bpf_prog_linfo.o libbpf_probes.o xsk.o hashmap.o \ 3 + btf_dump.o
+11 -1
tools/lib/bpf/Makefile
··· 3 3 4 4 BPF_VERSION = 0 5 5 BPF_PATCHLEVEL = 0 6 - BPF_EXTRAVERSION = 3 6 + BPF_EXTRAVERSION = 4 7 7 8 8 MAKEFLAGS += --no-print-directory 9 9 ··· 204 204 "versioned symbols in $^ ($(VERSIONED_SYM_COUNT))." \ 205 205 "Please make sure all LIBBPF_API symbols are" \ 206 206 "versioned in $(VERSION_SCRIPT)." >&2; \ 207 + readelf -s --wide $(OUTPUT)libbpf-in.o | \ 208 + awk '/GLOBAL/ && /DEFAULT/ && !/UND/ {print $$8}'| \ 209 + sort -u > $(OUTPUT)libbpf_global_syms.tmp; \ 210 + readelf -s --wide $(OUTPUT)libbpf.so | \ 211 + grep -Eo '[^ ]+@LIBBPF_' | cut -d@ -f1 | \ 212 + sort -u > $(OUTPUT)libbpf_versioned_syms.tmp; \ 213 + diff -u $(OUTPUT)libbpf_global_syms.tmp \ 214 + $(OUTPUT)libbpf_versioned_syms.tmp; \ 215 + rm $(OUTPUT)libbpf_global_syms.tmp \ 216 + $(OUTPUT)libbpf_versioned_syms.tmp; \ 207 217 exit 1; \ 208 218 fi 209 219
+1
tools/lib/bpf/bpf.c
··· 256 256 if (load_attr->name) 257 257 memcpy(attr.prog_name, load_attr->name, 258 258 min(strlen(load_attr->name), BPF_OBJ_NAME_LEN - 1)); 259 + attr.prog_flags = load_attr->prog_flags; 259 260 260 261 fd = sys_bpf_prog_load(&attr, sizeof(attr)); 261 262 if (fd >= 0)
+1
tools/lib/bpf/bpf.h
··· 87 87 const void *line_info; 88 88 __u32 line_info_cnt; 89 89 __u32 log_level; 90 + __u32 prog_flags; 90 91 }; 91 92 92 93 /* Flags to direct loading requirements */
+214 -117
tools/lib/bpf/btf.c
··· 4 4 #include <stdio.h> 5 5 #include <stdlib.h> 6 6 #include <string.h> 7 + #include <fcntl.h> 7 8 #include <unistd.h> 8 9 #include <errno.h> 9 10 #include <linux/err.h> 10 11 #include <linux/btf.h> 12 + #include <gelf.h> 11 13 #include "btf.h" 12 14 #include "bpf.h" 13 15 #include "libbpf.h" 14 16 #include "libbpf_internal.h" 17 + #include "hashmap.h" 15 18 16 19 #define max(a, b) ((a) > (b) ? (a) : (b)) 17 20 #define min(a, b) ((a) < (b) ? (a) : (b)) ··· 417 414 return ERR_PTR(err); 418 415 } 419 416 417 + return btf; 418 + } 419 + 420 + static bool btf_check_endianness(const GElf_Ehdr *ehdr) 421 + { 422 + #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ 423 + return ehdr->e_ident[EI_DATA] == ELFDATA2LSB; 424 + #elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ 425 + return ehdr->e_ident[EI_DATA] == ELFDATA2MSB; 426 + #else 427 + # error "Unrecognized __BYTE_ORDER__" 428 + #endif 429 + } 430 + 431 + struct btf *btf__parse_elf(const char *path, struct btf_ext **btf_ext) 432 + { 433 + Elf_Data *btf_data = NULL, *btf_ext_data = NULL; 434 + int err = 0, fd = -1, idx = 0; 435 + struct btf *btf = NULL; 436 + Elf_Scn *scn = NULL; 437 + Elf *elf = NULL; 438 + GElf_Ehdr ehdr; 439 + 440 + if (elf_version(EV_CURRENT) == EV_NONE) { 441 + pr_warning("failed to init libelf for %s\n", path); 442 + return ERR_PTR(-LIBBPF_ERRNO__LIBELF); 443 + } 444 + 445 + fd = open(path, O_RDONLY); 446 + if (fd < 0) { 447 + err = -errno; 448 + pr_warning("failed to open %s: %s\n", path, strerror(errno)); 449 + return ERR_PTR(err); 450 + } 451 + 452 + err = -LIBBPF_ERRNO__FORMAT; 453 + 454 + elf = elf_begin(fd, ELF_C_READ, NULL); 455 + if (!elf) { 456 + pr_warning("failed to open %s as ELF file\n", path); 457 + goto done; 458 + } 459 + if (!gelf_getehdr(elf, &ehdr)) { 460 + pr_warning("failed to get EHDR from %s\n", path); 461 + goto done; 462 + } 463 + if (!btf_check_endianness(&ehdr)) { 464 + pr_warning("non-native ELF endianness is not supported\n"); 465 + goto done; 466 + } 467 + if 
(!elf_rawdata(elf_getscn(elf, ehdr.e_shstrndx), NULL)) { 468 + pr_warning("failed to get e_shstrndx from %s\n", path); 469 + goto done; 470 + } 471 + 472 + while ((scn = elf_nextscn(elf, scn)) != NULL) { 473 + GElf_Shdr sh; 474 + char *name; 475 + 476 + idx++; 477 + if (gelf_getshdr(scn, &sh) != &sh) { 478 + pr_warning("failed to get section(%d) header from %s\n", 479 + idx, path); 480 + goto done; 481 + } 482 + name = elf_strptr(elf, ehdr.e_shstrndx, sh.sh_name); 483 + if (!name) { 484 + pr_warning("failed to get section(%d) name from %s\n", 485 + idx, path); 486 + goto done; 487 + } 488 + if (strcmp(name, BTF_ELF_SEC) == 0) { 489 + btf_data = elf_getdata(scn, 0); 490 + if (!btf_data) { 491 + pr_warning("failed to get section(%d, %s) data from %s\n", 492 + idx, name, path); 493 + goto done; 494 + } 495 + continue; 496 + } else if (btf_ext && strcmp(name, BTF_EXT_ELF_SEC) == 0) { 497 + btf_ext_data = elf_getdata(scn, 0); 498 + if (!btf_ext_data) { 499 + pr_warning("failed to get section(%d, %s) data from %s\n", 500 + idx, name, path); 501 + goto done; 502 + } 503 + continue; 504 + } 505 + } 506 + 507 + err = 0; 508 + 509 + if (!btf_data) { 510 + err = -ENOENT; 511 + goto done; 512 + } 513 + btf = btf__new(btf_data->d_buf, btf_data->d_size); 514 + if (IS_ERR(btf)) 515 + goto done; 516 + 517 + if (btf_ext && btf_ext_data) { 518 + *btf_ext = btf_ext__new(btf_ext_data->d_buf, 519 + btf_ext_data->d_size); 520 + if (IS_ERR(*btf_ext)) 521 + goto done; 522 + } else if (btf_ext) { 523 + *btf_ext = NULL; 524 + } 525 + done: 526 + if (elf) 527 + elf_end(elf); 528 + close(fd); 529 + 530 + if (err) 531 + return ERR_PTR(err); 532 + /* 533 + * btf is always parsed before btf_ext, so no need to clean up 534 + * btf_ext, if btf loading failed 535 + */ 536 + if (IS_ERR(btf)) 537 + return btf; 538 + if (btf_ext && IS_ERR(*btf_ext)) { 539 + btf__free(btf); 540 + err = PTR_ERR(*btf_ext); 541 + return ERR_PTR(err); 542 + } 420 543 return btf; 421 544 } 422 545 ··· 1294 1165 return err; 
1295 1166 } 1296 1167 1297 - #define BTF_DEDUP_TABLE_DEFAULT_SIZE (1 << 14) 1298 - #define BTF_DEDUP_TABLE_MAX_SIZE_LOG 31 1299 1168 #define BTF_UNPROCESSED_ID ((__u32)-1) 1300 1169 #define BTF_IN_PROGRESS_ID ((__u32)-2) 1301 - 1302 - struct btf_dedup_node { 1303 - struct btf_dedup_node *next; 1304 - __u32 type_id; 1305 - }; 1306 1170 1307 1171 struct btf_dedup { 1308 1172 /* .BTF section to be deduped in-place */ ··· 1312 1190 * candidates, which is fine because we rely on subsequent 1313 1191 * btf_xxx_equal() checks to authoritatively verify type equality. 1314 1192 */ 1315 - struct btf_dedup_node **dedup_table; 1193 + struct hashmap *dedup_table; 1316 1194 /* Canonical types map */ 1317 1195 __u32 *map; 1318 1196 /* Hypothetical mapping, used during type graph equivalence checks */ ··· 1337 1215 __u32 cap; 1338 1216 }; 1339 1217 1340 - static inline __u32 hash_combine(__u32 h, __u32 value) 1218 + static long hash_combine(long h, long value) 1341 1219 { 1342 - /* 2^31 + 2^29 - 2^25 + 2^22 - 2^19 - 2^16 + 1 */ 1343 - #define GOLDEN_RATIO_PRIME 0x9e370001UL 1344 - return h * 37 + value * GOLDEN_RATIO_PRIME; 1345 - #undef GOLDEN_RATIO_PRIME 1220 + return h * 31 + value; 1346 1221 } 1347 1222 1348 - #define for_each_dedup_cand(d, hash, node) \ 1349 - for (node = d->dedup_table[hash & (d->opts.dedup_table_size - 1)]; \ 1350 - node; \ 1351 - node = node->next) 1223 + #define for_each_dedup_cand(d, node, hash) \ 1224 + hashmap__for_each_key_entry(d->dedup_table, node, (void *)hash) 1352 1225 1353 - static int btf_dedup_table_add(struct btf_dedup *d, __u32 hash, __u32 type_id) 1226 + static int btf_dedup_table_add(struct btf_dedup *d, long hash, __u32 type_id) 1354 1227 { 1355 - struct btf_dedup_node *node = malloc(sizeof(struct btf_dedup_node)); 1356 - int bucket = hash & (d->opts.dedup_table_size - 1); 1357 - 1358 - if (!node) 1359 - return -ENOMEM; 1360 - node->type_id = type_id; 1361 - node->next = d->dedup_table[bucket]; 1362 - d->dedup_table[bucket] = node; 1363 - 
return 0; 1228 + return hashmap__append(d->dedup_table, 1229 + (void *)hash, (void *)(long)type_id); 1364 1230 } 1365 1231 1366 1232 static int btf_dedup_hypot_map_add(struct btf_dedup *d, ··· 1377 1267 d->hypot_cnt = 0; 1378 1268 } 1379 1269 1380 - static void btf_dedup_table_free(struct btf_dedup *d) 1381 - { 1382 - struct btf_dedup_node *head, *tmp; 1383 - int i; 1384 - 1385 - if (!d->dedup_table) 1386 - return; 1387 - 1388 - for (i = 0; i < d->opts.dedup_table_size; i++) { 1389 - while (d->dedup_table[i]) { 1390 - tmp = d->dedup_table[i]; 1391 - d->dedup_table[i] = tmp->next; 1392 - free(tmp); 1393 - } 1394 - 1395 - head = d->dedup_table[i]; 1396 - while (head) { 1397 - tmp = head; 1398 - head = head->next; 1399 - free(tmp); 1400 - } 1401 - } 1402 - 1403 - free(d->dedup_table); 1404 - d->dedup_table = NULL; 1405 - } 1406 - 1407 1270 static void btf_dedup_free(struct btf_dedup *d) 1408 1271 { 1409 - btf_dedup_table_free(d); 1272 + hashmap__free(d->dedup_table); 1273 + d->dedup_table = NULL; 1410 1274 1411 1275 free(d->map); 1412 1276 d->map = NULL; ··· 1394 1310 free(d); 1395 1311 } 1396 1312 1397 - /* Find closest power of two >= to size, capped at 2^max_size_log */ 1398 - static __u32 roundup_pow2_max(__u32 size, int max_size_log) 1313 + static size_t btf_dedup_identity_hash_fn(const void *key, void *ctx) 1399 1314 { 1400 - int i; 1401 - 1402 - for (i = 0; i < max_size_log && (1U << i) < size; i++) 1403 - ; 1404 - return 1U << i; 1315 + return (size_t)key; 1405 1316 } 1406 1317 1318 + static size_t btf_dedup_collision_hash_fn(const void *key, void *ctx) 1319 + { 1320 + return 0; 1321 + } 1322 + 1323 + static bool btf_dedup_equal_fn(const void *k1, const void *k2, void *ctx) 1324 + { 1325 + return k1 == k2; 1326 + } 1407 1327 1408 1328 static struct btf_dedup *btf_dedup_new(struct btf *btf, struct btf_ext *btf_ext, 1409 1329 const struct btf_dedup_opts *opts) 1410 1330 { 1411 1331 struct btf_dedup *d = calloc(1, sizeof(struct btf_dedup)); 1332 + hashmap_hash_fn 
hash_fn = btf_dedup_identity_hash_fn; 1412 1333 int i, err = 0; 1413 - __u32 sz; 1414 1334 1415 1335 if (!d) 1416 1336 return ERR_PTR(-ENOMEM); 1417 1337 1418 1338 d->opts.dont_resolve_fwds = opts && opts->dont_resolve_fwds; 1419 - sz = opts && opts->dedup_table_size ? opts->dedup_table_size 1420 - : BTF_DEDUP_TABLE_DEFAULT_SIZE; 1421 - sz = roundup_pow2_max(sz, BTF_DEDUP_TABLE_MAX_SIZE_LOG); 1422 - d->opts.dedup_table_size = sz; 1339 + /* dedup_table_size is now used only to force collisions in tests */ 1340 + if (opts && opts->dedup_table_size == 1) 1341 + hash_fn = btf_dedup_collision_hash_fn; 1423 1342 1424 1343 d->btf = btf; 1425 1344 d->btf_ext = btf_ext; 1426 1345 1427 - d->dedup_table = calloc(d->opts.dedup_table_size, 1428 - sizeof(struct btf_dedup_node *)); 1429 - if (!d->dedup_table) { 1430 - err = -ENOMEM; 1346 + d->dedup_table = hashmap__new(hash_fn, btf_dedup_equal_fn, NULL); 1347 + if (IS_ERR(d->dedup_table)) { 1348 + err = PTR_ERR(d->dedup_table); 1349 + d->dedup_table = NULL; 1431 1350 goto done; 1432 1351 } 1433 1352 ··· 1749 1662 return err; 1750 1663 } 1751 1664 1752 - static __u32 btf_hash_common(struct btf_type *t) 1665 + static long btf_hash_common(struct btf_type *t) 1753 1666 { 1754 - __u32 h; 1667 + long h; 1755 1668 1756 1669 h = hash_combine(0, t->name_off); 1757 1670 h = hash_combine(h, t->info); ··· 1767 1680 } 1768 1681 1769 1682 /* Calculate type signature hash of INT. */ 1770 - static __u32 btf_hash_int(struct btf_type *t) 1683 + static long btf_hash_int(struct btf_type *t) 1771 1684 { 1772 1685 __u32 info = *(__u32 *)(t + 1); 1773 - __u32 h; 1686 + long h; 1774 1687 1775 1688 h = btf_hash_common(t); 1776 1689 h = hash_combine(h, info); ··· 1790 1703 } 1791 1704 1792 1705 /* Calculate type signature hash of ENUM. 
*/ 1793 - static __u32 btf_hash_enum(struct btf_type *t) 1706 + static long btf_hash_enum(struct btf_type *t) 1794 1707 { 1795 - __u32 h; 1708 + long h; 1796 1709 1797 1710 /* don't hash vlen and enum members to support enum fwd resolving */ 1798 1711 h = hash_combine(0, t->name_off); ··· 1844 1757 * as referenced type IDs equivalence is established separately during type 1845 1758 * graph equivalence check algorithm. 1846 1759 */ 1847 - static __u32 btf_hash_struct(struct btf_type *t) 1760 + static long btf_hash_struct(struct btf_type *t) 1848 1761 { 1849 1762 struct btf_member *member = (struct btf_member *)(t + 1); 1850 1763 __u32 vlen = BTF_INFO_VLEN(t->info); 1851 - __u32 h = btf_hash_common(t); 1764 + long h = btf_hash_common(t); 1852 1765 int i; 1853 1766 1854 1767 for (i = 0; i < vlen; i++) { ··· 1891 1804 * under assumption that they were already resolved to canonical type IDs and 1892 1805 * are not going to change. 1893 1806 */ 1894 - static __u32 btf_hash_array(struct btf_type *t) 1807 + static long btf_hash_array(struct btf_type *t) 1895 1808 { 1896 1809 struct btf_array *info = (struct btf_array *)(t + 1); 1897 - __u32 h = btf_hash_common(t); 1810 + long h = btf_hash_common(t); 1898 1811 1899 1812 h = hash_combine(h, info->type); 1900 1813 h = hash_combine(h, info->index_type); ··· 1945 1858 * under assumption that they were already resolved to canonical type IDs and 1946 1859 * are not going to change. 1947 1860 */ 1948 - static inline __u32 btf_hash_fnproto(struct btf_type *t) 1861 + static long btf_hash_fnproto(struct btf_type *t) 1949 1862 { 1950 1863 struct btf_param *member = (struct btf_param *)(t + 1); 1951 1864 __u16 vlen = BTF_INFO_VLEN(t->info); 1952 - __u32 h = btf_hash_common(t); 1865 + long h = btf_hash_common(t); 1953 1866 int i; 1954 1867 1955 1868 for (i = 0; i < vlen; i++) { ··· 1967 1880 * This function is called during reference types deduplication to compare 1968 1881 * FUNC_PROTO to potential canonical representative. 
1969 1882 */ 1970 - static inline bool btf_equal_fnproto(struct btf_type *t1, struct btf_type *t2) 1883 + static bool btf_equal_fnproto(struct btf_type *t1, struct btf_type *t2) 1971 1884 { 1972 1885 struct btf_param *m1, *m2; 1973 1886 __u16 vlen; ··· 1993 1906 * IDs. This check is performed during type graph equivalence check and 1994 1907 * referenced types equivalence is checked separately. 1995 1908 */ 1996 - static inline bool btf_compat_fnproto(struct btf_type *t1, struct btf_type *t2) 1909 + static bool btf_compat_fnproto(struct btf_type *t1, struct btf_type *t2) 1997 1910 { 1998 1911 struct btf_param *m1, *m2; 1999 1912 __u16 vlen; ··· 2024 1937 static int btf_dedup_prim_type(struct btf_dedup *d, __u32 type_id) 2025 1938 { 2026 1939 struct btf_type *t = d->btf->types[type_id]; 1940 + struct hashmap_entry *hash_entry; 2027 1941 struct btf_type *cand; 2028 - struct btf_dedup_node *cand_node; 2029 1942 /* if we don't find equivalent type, then we are canonical */ 2030 1943 __u32 new_id = type_id; 2031 - __u32 h; 1944 + __u32 cand_id; 1945 + long h; 2032 1946 2033 1947 switch (BTF_INFO_KIND(t->info)) { 2034 1948 case BTF_KIND_CONST: ··· 2048 1960 2049 1961 case BTF_KIND_INT: 2050 1962 h = btf_hash_int(t); 2051 - for_each_dedup_cand(d, h, cand_node) { 2052 - cand = d->btf->types[cand_node->type_id]; 1963 + for_each_dedup_cand(d, hash_entry, h) { 1964 + cand_id = (__u32)(long)hash_entry->value; 1965 + cand = d->btf->types[cand_id]; 2053 1966 if (btf_equal_int(t, cand)) { 2054 - new_id = cand_node->type_id; 1967 + new_id = cand_id; 2055 1968 break; 2056 1969 } 2057 1970 } ··· 2060 1971 2061 1972 case BTF_KIND_ENUM: 2062 1973 h = btf_hash_enum(t); 2063 - for_each_dedup_cand(d, h, cand_node) { 2064 - cand = d->btf->types[cand_node->type_id]; 1974 + for_each_dedup_cand(d, hash_entry, h) { 1975 + cand_id = (__u32)(long)hash_entry->value; 1976 + cand = d->btf->types[cand_id]; 2065 1977 if (btf_equal_enum(t, cand)) { 2066 - new_id = cand_node->type_id; 1978 + new_id = 
cand_id; 2067 1979 break; 2068 1980 } 2069 1981 if (d->opts.dont_resolve_fwds) ··· 2072 1982 if (btf_compat_enum(t, cand)) { 2073 1983 if (btf_is_enum_fwd(t)) { 2074 1984 /* resolve fwd to full enum */ 2075 - new_id = cand_node->type_id; 1985 + new_id = cand_id; 2076 1986 break; 2077 1987 } 2078 1988 /* resolve canonical enum fwd to full enum */ 2079 - d->map[cand_node->type_id] = type_id; 1989 + d->map[cand_id] = type_id; 2080 1990 } 2081 1991 } 2082 1992 break; 2083 1993 2084 1994 case BTF_KIND_FWD: 2085 1995 h = btf_hash_common(t); 2086 - for_each_dedup_cand(d, h, cand_node) { 2087 - cand = d->btf->types[cand_node->type_id]; 1996 + for_each_dedup_cand(d, hash_entry, h) { 1997 + cand_id = (__u32)(long)hash_entry->value; 1998 + cand = d->btf->types[cand_id]; 2088 1999 if (btf_equal_common(t, cand)) { 2089 - new_id = cand_node->type_id; 2000 + new_id = cand_id; 2090 2001 break; 2091 2002 } 2092 2003 } ··· 2488 2397 */ 2489 2398 static int btf_dedup_struct_type(struct btf_dedup *d, __u32 type_id) 2490 2399 { 2491 - struct btf_dedup_node *cand_node; 2492 2400 struct btf_type *cand_type, *t; 2401 + struct hashmap_entry *hash_entry; 2493 2402 /* if we don't find equivalent type, then we are canonical */ 2494 2403 __u32 new_id = type_id; 2495 2404 __u16 kind; 2496 - __u32 h; 2405 + long h; 2497 2406 2498 2407 /* already deduped or is in process of deduping (loop detected) */ 2499 2408 if (d->map[type_id] <= BTF_MAX_NR_TYPES) ··· 2506 2415 return 0; 2507 2416 2508 2417 h = btf_hash_struct(t); 2509 - for_each_dedup_cand(d, h, cand_node) { 2418 + for_each_dedup_cand(d, hash_entry, h) { 2419 + __u32 cand_id = (__u32)(long)hash_entry->value; 2510 2420 int eq; 2511 2421 2512 2422 /* ··· 2520 2428 * creating a loop (FWD -> STRUCT and STRUCT -> FWD), because 2521 2429 * FWD and compatible STRUCT/UNION are considered equivalent. 
2522 2430 */ 2523 - cand_type = d->btf->types[cand_node->type_id]; 2431 + cand_type = d->btf->types[cand_id]; 2524 2432 if (!btf_shallow_equal_struct(t, cand_type)) 2525 2433 continue; 2526 2434 2527 2435 btf_dedup_clear_hypot_map(d); 2528 - eq = btf_dedup_is_equiv(d, type_id, cand_node->type_id); 2436 + eq = btf_dedup_is_equiv(d, type_id, cand_id); 2529 2437 if (eq < 0) 2530 2438 return eq; 2531 2439 if (!eq) 2532 2440 continue; 2533 - new_id = cand_node->type_id; 2441 + new_id = cand_id; 2534 2442 btf_dedup_merge_hypot_map(d); 2535 2443 break; 2536 2444 } ··· 2580 2488 */ 2581 2489 static int btf_dedup_ref_type(struct btf_dedup *d, __u32 type_id) 2582 2490 { 2583 - struct btf_dedup_node *cand_node; 2491 + struct hashmap_entry *hash_entry; 2492 + __u32 new_id = type_id, cand_id; 2584 2493 struct btf_type *t, *cand; 2585 2494 /* if we don't find equivalent type, then we are representative type */ 2586 - __u32 new_id = type_id; 2587 2495 int ref_type_id; 2588 - __u32 h; 2496 + long h; 2589 2497 2590 2498 if (d->map[type_id] == BTF_IN_PROGRESS_ID) 2591 2499 return -ELOOP; ··· 2608 2516 t->type = ref_type_id; 2609 2517 2610 2518 h = btf_hash_common(t); 2611 - for_each_dedup_cand(d, h, cand_node) { 2612 - cand = d->btf->types[cand_node->type_id]; 2519 + for_each_dedup_cand(d, hash_entry, h) { 2520 + cand_id = (__u32)(long)hash_entry->value; 2521 + cand = d->btf->types[cand_id]; 2613 2522 if (btf_equal_common(t, cand)) { 2614 - new_id = cand_node->type_id; 2523 + new_id = cand_id; 2615 2524 break; 2616 2525 } 2617 2526 } ··· 2632 2539 info->index_type = ref_type_id; 2633 2540 2634 2541 h = btf_hash_array(t); 2635 - for_each_dedup_cand(d, h, cand_node) { 2636 - cand = d->btf->types[cand_node->type_id]; 2542 + for_each_dedup_cand(d, hash_entry, h) { 2543 + cand_id = (__u32)(long)hash_entry->value; 2544 + cand = d->btf->types[cand_id]; 2637 2545 if (btf_equal_array(t, cand)) { 2638 - new_id = cand_node->type_id; 2546 + new_id = cand_id; 2639 2547 break; 2640 2548 } 2641 
2549 } ··· 2664 2570 } 2665 2571 2666 2572 h = btf_hash_fnproto(t); 2667 - for_each_dedup_cand(d, h, cand_node) { 2668 - cand = d->btf->types[cand_node->type_id]; 2573 + for_each_dedup_cand(d, hash_entry, h) { 2574 + cand_id = (__u32)(long)hash_entry->value; 2575 + cand = d->btf->types[cand_id]; 2669 2576 if (btf_equal_fnproto(t, cand)) { 2670 - new_id = cand_node->type_id; 2577 + new_id = cand_id; 2671 2578 break; 2672 2579 } 2673 2580 } ··· 2695 2600 if (err < 0) 2696 2601 return err; 2697 2602 } 2698 - btf_dedup_table_free(d); 2603 + /* we won't need d->dedup_table anymore */ 2604 + hashmap__free(d->dedup_table); 2605 + d->dedup_table = NULL; 2699 2606 return 0; 2700 2607 } 2701 2608
+19
tools/lib/bpf/btf.h
··· 4 4 #ifndef __LIBBPF_BTF_H 5 5 #define __LIBBPF_BTF_H 6 6 7 + #include <stdarg.h> 7 8 #include <linux/types.h> 8 9 9 10 #ifdef __cplusplus ··· 60 59 61 60 LIBBPF_API void btf__free(struct btf *btf); 62 61 LIBBPF_API struct btf *btf__new(__u8 *data, __u32 size); 62 + LIBBPF_API struct btf *btf__parse_elf(const char *path, 63 + struct btf_ext **btf_ext); 63 64 LIBBPF_API int btf__finalize_data(struct bpf_object *obj, struct btf *btf); 64 65 LIBBPF_API int btf__load(struct btf *btf); 65 66 LIBBPF_API __s32 btf__find_by_name(const struct btf *btf, ··· 102 99 103 100 LIBBPF_API int btf__dedup(struct btf *btf, struct btf_ext *btf_ext, 104 101 const struct btf_dedup_opts *opts); 102 + 103 + struct btf_dump; 104 + 105 + struct btf_dump_opts { 106 + void *ctx; 107 + }; 108 + 109 + typedef void (*btf_dump_printf_fn_t)(void *ctx, const char *fmt, va_list args); 110 + 111 + LIBBPF_API struct btf_dump *btf_dump__new(const struct btf *btf, 112 + const struct btf_ext *btf_ext, 113 + const struct btf_dump_opts *opts, 114 + btf_dump_printf_fn_t printf_fn); 115 + LIBBPF_API void btf_dump__free(struct btf_dump *d); 116 + 117 + LIBBPF_API int btf_dump__dump_type(struct btf_dump *d, __u32 id); 105 118 106 119 #ifdef __cplusplus 107 120 } /* extern "C" */
+1336
tools/lib/bpf/btf_dump.c
··· 1 + // SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) 2 + 3 + /* 4 + * BTF-to-C type converter. 5 + * 6 + * Copyright (c) 2019 Facebook 7 + */ 8 + 9 + #include <stdbool.h> 10 + #include <stddef.h> 11 + #include <stdlib.h> 12 + #include <string.h> 13 + #include <errno.h> 14 + #include <linux/err.h> 15 + #include <linux/btf.h> 16 + #include "btf.h" 17 + #include "hashmap.h" 18 + #include "libbpf.h" 19 + #include "libbpf_internal.h" 20 + 21 + #define min(x, y) ((x) < (y) ? (x) : (y)) 22 + #define max(x, y) ((x) < (y) ? (y) : (x)) 23 + 24 + static const char PREFIXES[] = "\t\t\t\t\t\t\t\t\t\t\t\t\t"; 25 + static const size_t PREFIX_CNT = sizeof(PREFIXES) - 1; 26 + 27 + static const char *pfx(int lvl) 28 + { 29 + return lvl >= PREFIX_CNT ? PREFIXES : &PREFIXES[PREFIX_CNT - lvl]; 30 + } 31 + 32 + enum btf_dump_type_order_state { 33 + NOT_ORDERED, 34 + ORDERING, 35 + ORDERED, 36 + }; 37 + 38 + enum btf_dump_type_emit_state { 39 + NOT_EMITTED, 40 + EMITTING, 41 + EMITTED, 42 + }; 43 + 44 + /* per-type auxiliary state */ 45 + struct btf_dump_type_aux_state { 46 + /* topological sorting state */ 47 + enum btf_dump_type_order_state order_state: 2; 48 + /* emitting state used to determine the need for forward declaration */ 49 + enum btf_dump_type_emit_state emit_state: 2; 50 + /* whether forward declaration was already emitted */ 51 + __u8 fwd_emitted: 1; 52 + /* whether unique non-duplicate name was already assigned */ 53 + __u8 name_resolved: 1; 54 + }; 55 + 56 + struct btf_dump { 57 + const struct btf *btf; 58 + const struct btf_ext *btf_ext; 59 + btf_dump_printf_fn_t printf_fn; 60 + struct btf_dump_opts opts; 61 + 62 + /* per-type auxiliary state */ 63 + struct btf_dump_type_aux_state *type_states; 64 + /* per-type optional cached unique name, must be freed, if present */ 65 + const char **cached_names; 66 + 67 + /* topo-sorted list of dependent type definitions */ 68 + __u32 *emit_queue; 69 + int emit_queue_cap; 70 + int emit_queue_cnt; 71 + 72 + /* 73 + * stack 
of type declarations (e.g., chain of modifiers, arrays, 74 + * funcs, etc) 75 + */ 76 + __u32 *decl_stack; 77 + int decl_stack_cap; 78 + int decl_stack_cnt; 79 + 80 + /* maps struct/union/enum name to a number of name occurrences */ 81 + struct hashmap *type_names; 82 + /* 83 + * maps typedef identifiers and enum value names to a number of such 84 + * name occurrences 85 + */ 86 + struct hashmap *ident_names; 87 + }; 88 + 89 + static size_t str_hash_fn(const void *key, void *ctx) 90 + { 91 + const char *s = key; 92 + size_t h = 0; 93 + 94 + while (*s) { 95 + h = h * 31 + *s; 96 + s++; 97 + } 98 + return h; 99 + } 100 + 101 + static bool str_equal_fn(const void *a, const void *b, void *ctx) 102 + { 103 + return strcmp(a, b) == 0; 104 + } 105 + 106 + static __u16 btf_kind_of(const struct btf_type *t) 107 + { 108 + return BTF_INFO_KIND(t->info); 109 + } 110 + 111 + static __u16 btf_vlen_of(const struct btf_type *t) 112 + { 113 + return BTF_INFO_VLEN(t->info); 114 + } 115 + 116 + static bool btf_kflag_of(const struct btf_type *t) 117 + { 118 + return BTF_INFO_KFLAG(t->info); 119 + } 120 + 121 + static const char *btf_name_of(const struct btf_dump *d, __u32 name_off) 122 + { 123 + return btf__name_by_offset(d->btf, name_off); 124 + } 125 + 126 + static void btf_dump_printf(const struct btf_dump *d, const char *fmt, ...) 127 + { 128 + va_list args; 129 + 130 + va_start(args, fmt); 131 + d->printf_fn(d->opts.ctx, fmt, args); 132 + va_end(args); 133 + } 134 + 135 + struct btf_dump *btf_dump__new(const struct btf *btf, 136 + const struct btf_ext *btf_ext, 137 + const struct btf_dump_opts *opts, 138 + btf_dump_printf_fn_t printf_fn) 139 + { 140 + struct btf_dump *d; 141 + int err; 142 + 143 + d = calloc(1, sizeof(struct btf_dump)); 144 + if (!d) 145 + return ERR_PTR(-ENOMEM); 146 + 147 + d->btf = btf; 148 + d->btf_ext = btf_ext; 149 + d->printf_fn = printf_fn; 150 + d->opts.ctx = opts ? 
opts->ctx : NULL; 151 + 152 + d->type_names = hashmap__new(str_hash_fn, str_equal_fn, NULL); 153 + if (IS_ERR(d->type_names)) { 154 + err = PTR_ERR(d->type_names); 155 + d->type_names = NULL; 156 + btf_dump__free(d); 157 + return ERR_PTR(err); 158 + } 159 + d->ident_names = hashmap__new(str_hash_fn, str_equal_fn, NULL); 160 + if (IS_ERR(d->ident_names)) { 161 + err = PTR_ERR(d->ident_names); 162 + d->ident_names = NULL; 163 + btf_dump__free(d); 164 + return ERR_PTR(err); 165 + } 166 + 167 + return d; 168 + } 169 + 170 + void btf_dump__free(struct btf_dump *d) 171 + { 172 + int i, cnt; 173 + 174 + if (!d) 175 + return; 176 + 177 + free(d->type_states); 178 + if (d->cached_names) { 179 + /* any set cached name is owned by us and should be freed */ 180 + for (i = 0, cnt = btf__get_nr_types(d->btf); i <= cnt; i++) { 181 + if (d->cached_names[i]) 182 + free((void *)d->cached_names[i]); 183 + } 184 + } 185 + free(d->cached_names); 186 + free(d->emit_queue); 187 + free(d->decl_stack); 188 + hashmap__free(d->type_names); 189 + hashmap__free(d->ident_names); 190 + 191 + free(d); 192 + } 193 + 194 + static int btf_dump_order_type(struct btf_dump *d, __u32 id, bool through_ptr); 195 + static void btf_dump_emit_type(struct btf_dump *d, __u32 id, __u32 cont_id); 196 + 197 + /* 198 + * Dump BTF type in a compilable C syntax, including all the necessary 199 + * dependent types, necessary for compilation. If some of the dependent types 200 + * were already emitted as part of previous btf_dump__dump_type() invocation 201 + * for another type, they won't be emitted again. This API allows callers to 202 + * filter out BTF types according to user-defined criteria and emit only a 203 + * minimal subset of types necessary to compile everything. Full struct/union 204 + * definitions will still be emitted, even if the only usage is through 205 + * pointer and could be satisfied with just a forward declaration. 206 + * 207 + * Dumping is done in two high-level passes: 208 + * 1. 
Topologically sort type definitions to satisfy C rules of compilation. 209 + * 2. Emit type definitions in C syntax. 210 + * 211 + * Returns 0 on success; <0, otherwise. 212 + */ 213 + int btf_dump__dump_type(struct btf_dump *d, __u32 id) 214 + { 215 + int err, i; 216 + 217 + if (id > btf__get_nr_types(d->btf)) 218 + return -EINVAL; 219 + 220 + /* type states are lazily allocated, as they might not be needed */ 221 + if (!d->type_states) { 222 + d->type_states = calloc(1 + btf__get_nr_types(d->btf), 223 + sizeof(d->type_states[0])); 224 + if (!d->type_states) 225 + return -ENOMEM; 226 + d->cached_names = calloc(1 + btf__get_nr_types(d->btf), 227 + sizeof(d->cached_names[0])); 228 + if (!d->cached_names) 229 + return -ENOMEM; 230 + 231 + /* VOID is special */ 232 + d->type_states[0].order_state = ORDERED; 233 + d->type_states[0].emit_state = EMITTED; 234 + } 235 + 236 + d->emit_queue_cnt = 0; 237 + err = btf_dump_order_type(d, id, false); 238 + if (err < 0) 239 + return err; 240 + 241 + for (i = 0; i < d->emit_queue_cnt; i++) 242 + btf_dump_emit_type(d, d->emit_queue[i], 0 /*top-level*/); 243 + 244 + return 0; 245 + } 246 + 247 + static int btf_dump_add_emit_queue_id(struct btf_dump *d, __u32 id) 248 + { 249 + __u32 *new_queue; 250 + size_t new_cap; 251 + 252 + if (d->emit_queue_cnt >= d->emit_queue_cap) { 253 + new_cap = max(16, d->emit_queue_cap * 3 / 2); 254 + new_queue = realloc(d->emit_queue, 255 + new_cap * sizeof(new_queue[0])); 256 + if (!new_queue) 257 + return -ENOMEM; 258 + d->emit_queue = new_queue; 259 + d->emit_queue_cap = new_cap; 260 + } 261 + 262 + d->emit_queue[d->emit_queue_cnt++] = id; 263 + return 0; 264 + } 265 + 266 + /* 267 + * Determine order of emitting dependent types and specified type to satisfy 268 + * C compilation rules. This is done through topological sorting with an 269 + * additional complication which comes from C rules. 
The main idea for C is 270 + * that if some type is "embedded" into a struct/union, its size needs to be 271 + * known at the time of definition of containing type. E.g., for: 272 + * 273 + * struct A {}; 274 + * struct B { struct A x; } 275 + * 276 + * struct A *HAS* to be defined before struct B, because it's "embedded", 277 + * i.e., it is part of struct B layout. But in the following case: 278 + * 279 + * struct A; 280 + * struct B { struct A *x; } 281 + * struct A {}; 282 + * 283 + * it's enough to just have a forward declaration of struct A at the time of 284 + * struct B definition, as struct B has a pointer to struct A, so the size of 285 + * field x is known without knowing struct A size: it's sizeof(void *). 286 + * 287 + * Unfortunately, there are some trickier cases we need to handle, e.g.: 288 + * 289 + * struct A {}; // if this was forward-declaration: compilation error 290 + * struct B { 291 + * struct { // anonymous struct 292 + * struct A y; 293 + * } *x; 294 + * }; 295 + * 296 + * In this case, struct B's field x is a pointer, so its size is known 297 + * regardless of the size of (anonymous) struct it points to. But because this 298 + * struct is anonymous and thus defined inline inside struct B, *and* it 299 + * embeds struct A, compiler requires full definition of struct A to be known 300 + * before struct B can be defined. This creates a transitive dependency 301 + * between struct A and struct B. If struct A was forward-declared before 302 + * struct B definition and fully defined after struct B definition, that would 303 + * trigger compilation error. 304 + * 305 + * All this means that while we are doing topological sorting on BTF type 306 + * graph, we need to determine relationships between different types (graph 307 + * nodes): 308 + * - weak link (relationship) between X and Y, if Y *CAN* be 309 + * forward-declared at the point of X definition; 310 + * - strong link, if Y *HAS* to be fully-defined before X can be defined. 
311 + * 312 + * The rule is as follows. Given a chain of BTF types from X to Y, if there is 313 + * BTF_KIND_PTR type in the chain and at least one non-anonymous type 314 + * Z (excluding X, including Y), then link is weak. Otherwise, it's strong. 315 + * Weak/strong relationship is determined recursively during DFS traversal and 316 + * is returned as a result from btf_dump_order_type(). 317 + * 318 + * btf_dump_order_type() is trying to avoid unnecessary forward declarations, 319 + * but it is not guaranteeing that no extraneous forward declarations will be 320 + * emitted. 321 + * 322 + * To avoid extra work, algorithm marks some of BTF types as ORDERED, when 323 + * it's done with them, but not for all (e.g., VOLATILE, CONST, RESTRICT, 324 + * ARRAY, FUNC_PROTO), as weak/strong semantics for those depends on the 325 + * entire graph path, so depending where from one came to that BTF type, it 326 + * might cause weak or strong ordering. For types like STRUCT/UNION/INT/ENUM, 327 + * once they are processed, there is no need to do it again, so they are 328 + * marked as ORDERED. We can mark PTR as ORDERED as well, as it semi-forces 329 + * weak link, unless subsequent referenced STRUCT/UNION/ENUM is anonymous. But 330 + * in any case, once those are processed, no need to do it again, as the 331 + * result won't change. 332 + * 333 + * Returns: 334 + * - 1, if type is part of strong link (so there is strong topological 335 + * ordering requirements); 336 + * - 0, if type is part of weak link (so can be satisfied through forward 337 + * declaration); 338 + * - <0, on error (e.g., unsatisfiable type loop detected). 339 + */ 340 + static int btf_dump_order_type(struct btf_dump *d, __u32 id, bool through_ptr) 341 + { 342 + /* 343 + * Order state is used to detect strong link cycles, but only for BTF 344 + * kinds that are or could be an independent definition (i.e., 345 + * stand-alone fwd decl, enum, typedef, struct, union). 
Ptrs, arrays, 346 + * func_protos, modifiers are just means to get to these definitions. 347 + * Int/void don't need definitions, they are assumed to be always 348 + * properly defined. We also ignore datasec, var, and funcs for now. 349 + * So for all non-defining kinds, we never even set ordering state, 350 + * for defining kinds we set ORDERING and subsequently ORDERED if it 351 + * forms a strong link. 352 + */ 353 + struct btf_dump_type_aux_state *tstate = &d->type_states[id]; 354 + const struct btf_type *t; 355 + __u16 kind, vlen; 356 + int err, i; 357 + 358 + /* return true, letting typedefs know that it's ok to be emitted */ 359 + if (tstate->order_state == ORDERED) 360 + return 1; 361 + 362 + t = btf__type_by_id(d->btf, id); 363 + kind = btf_kind_of(t); 364 + 365 + if (tstate->order_state == ORDERING) { 366 + /* type loop, but resolvable through fwd declaration */ 367 + if ((kind == BTF_KIND_STRUCT || kind == BTF_KIND_UNION) && 368 + through_ptr && t->name_off != 0) 369 + return 0; 370 + pr_warning("unsatisfiable type cycle, id:[%u]\n", id); 371 + return -ELOOP; 372 + } 373 + 374 + switch (kind) { 375 + case BTF_KIND_INT: 376 + tstate->order_state = ORDERED; 377 + return 0; 378 + 379 + case BTF_KIND_PTR: 380 + err = btf_dump_order_type(d, t->type, true); 381 + tstate->order_state = ORDERED; 382 + return err; 383 + 384 + case BTF_KIND_ARRAY: { 385 + const struct btf_array *a = (void *)(t + 1); 386 + 387 + return btf_dump_order_type(d, a->type, through_ptr); 388 + } 389 + case BTF_KIND_STRUCT: 390 + case BTF_KIND_UNION: { 391 + const struct btf_member *m = (void *)(t + 1); 392 + /* 393 + * struct/union is part of strong link, only if it's embedded 394 + * (so no ptr in a path) or it's anonymous (so has to be 395 + * defined inline, even if declared through ptr) 396 + */ 397 + if (through_ptr && t->name_off != 0) 398 + return 0; 399 + 400 + tstate->order_state = ORDERING; 401 + 402 + vlen = btf_vlen_of(t); 403 + for (i = 0; i < vlen; i++, m++) { 404 + err = 
btf_dump_order_type(d, m->type, false); 405 + if (err < 0) 406 + return err; 407 + } 408 + 409 + if (t->name_off != 0) { 410 + err = btf_dump_add_emit_queue_id(d, id); 411 + if (err < 0) 412 + return err; 413 + } 414 + 415 + tstate->order_state = ORDERED; 416 + return 1; 417 + } 418 + case BTF_KIND_ENUM: 419 + case BTF_KIND_FWD: 420 + if (t->name_off != 0) { 421 + err = btf_dump_add_emit_queue_id(d, id); 422 + if (err) 423 + return err; 424 + } 425 + tstate->order_state = ORDERED; 426 + return 1; 427 + 428 + case BTF_KIND_TYPEDEF: { 429 + int is_strong; 430 + 431 + is_strong = btf_dump_order_type(d, t->type, through_ptr); 432 + if (is_strong < 0) 433 + return is_strong; 434 + 435 + /* typedef is similar to struct/union w.r.t. fwd-decls */ 436 + if (through_ptr && !is_strong) 437 + return 0; 438 + 439 + /* typedef is always a named definition */ 440 + err = btf_dump_add_emit_queue_id(d, id); 441 + if (err) 442 + return err; 443 + 444 + d->type_states[id].order_state = ORDERED; 445 + return 1; 446 + } 447 + case BTF_KIND_VOLATILE: 448 + case BTF_KIND_CONST: 449 + case BTF_KIND_RESTRICT: 450 + return btf_dump_order_type(d, t->type, through_ptr); 451 + 452 + case BTF_KIND_FUNC_PROTO: { 453 + const struct btf_param *p = (void *)(t + 1); 454 + bool is_strong; 455 + 456 + err = btf_dump_order_type(d, t->type, through_ptr); 457 + if (err < 0) 458 + return err; 459 + is_strong = err > 0; 460 + 461 + vlen = btf_vlen_of(t); 462 + for (i = 0; i < vlen; i++, p++) { 463 + err = btf_dump_order_type(d, p->type, through_ptr); 464 + if (err < 0) 465 + return err; 466 + if (err > 0) 467 + is_strong = true; 468 + } 469 + return is_strong; 470 + } 471 + case BTF_KIND_FUNC: 472 + case BTF_KIND_VAR: 473 + case BTF_KIND_DATASEC: 474 + d->type_states[id].order_state = ORDERED; 475 + return 0; 476 + 477 + default: 478 + return -EINVAL; 479 + } 480 + } 481 + 482 + static void btf_dump_emit_struct_fwd(struct btf_dump *d, __u32 id, 483 + const struct btf_type *t); 484 + static void 
btf_dump_emit_struct_def(struct btf_dump *d, __u32 id, 485 + const struct btf_type *t, int lvl); 486 + 487 + static void btf_dump_emit_enum_fwd(struct btf_dump *d, __u32 id, 488 + const struct btf_type *t); 489 + static void btf_dump_emit_enum_def(struct btf_dump *d, __u32 id, 490 + const struct btf_type *t, int lvl); 491 + 492 + static void btf_dump_emit_fwd_def(struct btf_dump *d, __u32 id, 493 + const struct btf_type *t); 494 + 495 + static void btf_dump_emit_typedef_def(struct btf_dump *d, __u32 id, 496 + const struct btf_type *t, int lvl); 497 + 498 + /* a local view into a shared stack */ 499 + struct id_stack { 500 + const __u32 *ids; 501 + int cnt; 502 + }; 503 + 504 + static void btf_dump_emit_type_decl(struct btf_dump *d, __u32 id, 505 + const char *fname, int lvl); 506 + static void btf_dump_emit_type_chain(struct btf_dump *d, 507 + struct id_stack *decl_stack, 508 + const char *fname, int lvl); 509 + 510 + static const char *btf_dump_type_name(struct btf_dump *d, __u32 id); 511 + static const char *btf_dump_ident_name(struct btf_dump *d, __u32 id); 512 + static size_t btf_dump_name_dups(struct btf_dump *d, struct hashmap *name_map, 513 + const char *orig_name); 514 + 515 + static bool btf_dump_is_blacklisted(struct btf_dump *d, __u32 id) 516 + { 517 + const struct btf_type *t = btf__type_by_id(d->btf, id); 518 + 519 + /* __builtin_va_list is a compiler built-in, which causes compilation 520 + * errors, when compiling w/ different compiler, then used to compile 521 + * original code (e.g., GCC to compile kernel, Clang to use generated 522 + * C header from BTF). As it is built-in, it should be already defined 523 + * properly internally in compiler. 524 + */ 525 + if (t->name_off == 0) 526 + return false; 527 + return strcmp(btf_name_of(d, t->name_off), "__builtin_va_list") == 0; 528 + } 529 + 530 + /* 531 + * Emit C-syntax definitions of types from chains of BTF types. 
532 + * 533 + * High-level handling of determining necessary forward declarations are handled 534 + * by btf_dump_emit_type() itself, but all nitty-gritty details of emitting type 535 + * declarations/definitions in C syntax are handled by a combo of 536 + * btf_dump_emit_type_decl()/btf_dump_emit_type_chain() w/ delegation to 537 + * corresponding btf_dump_emit_*_{def,fwd}() functions. 538 + * 539 + * We also keep track of "containing struct/union type ID" to determine when 540 + * we reference it from inside and thus can avoid emitting unnecessary forward 541 + * declaration. 542 + * 543 + * This algorithm is designed in such a way, that even if some error occurs 544 + * (either technical, e.g., out of memory, or logical, i.e., malformed BTF 545 + * that doesn't comply to C rules completely), algorithm will try to proceed 546 + * and produce as much meaningful output as possible. 547 + */ 548 + static void btf_dump_emit_type(struct btf_dump *d, __u32 id, __u32 cont_id) 549 + { 550 + struct btf_dump_type_aux_state *tstate = &d->type_states[id]; 551 + bool top_level_def = cont_id == 0; 552 + const struct btf_type *t; 553 + __u16 kind; 554 + 555 + if (tstate->emit_state == EMITTED) 556 + return; 557 + 558 + t = btf__type_by_id(d->btf, id); 559 + kind = btf_kind_of(t); 560 + 561 + if (top_level_def && t->name_off == 0) { 562 + pr_warning("unexpected nameless definition, id:[%u]\n", id); 563 + return; 564 + } 565 + 566 + if (tstate->emit_state == EMITTING) { 567 + if (tstate->fwd_emitted) 568 + return; 569 + 570 + switch (kind) { 571 + case BTF_KIND_STRUCT: 572 + case BTF_KIND_UNION: 573 + /* 574 + * if we are referencing a struct/union that we are 575 + * part of - then no need for fwd declaration 576 + */ 577 + if (id == cont_id) 578 + return; 579 + if (t->name_off == 0) { 580 + pr_warning("anonymous struct/union loop, id:[%u]\n", 581 + id); 582 + return; 583 + } 584 + btf_dump_emit_struct_fwd(d, id, t); 585 + btf_dump_printf(d, ";\n\n"); 586 + tstate->fwd_emitted = 
1; 587 + break; 588 + case BTF_KIND_TYPEDEF: 589 + /* 590 + * for typedef fwd_emitted means typedef definition 591 + * was emitted, but it can be used only for "weak" 592 + * references through pointer only, not for embedding 593 + */ 594 + if (!btf_dump_is_blacklisted(d, id)) { 595 + btf_dump_emit_typedef_def(d, id, t, 0); 596 + btf_dump_printf(d, ";\n\n"); 597 + }; 598 + tstate->fwd_emitted = 1; 599 + break; 600 + default: 601 + break; 602 + } 603 + 604 + return; 605 + } 606 + 607 + switch (kind) { 608 + case BTF_KIND_INT: 609 + tstate->emit_state = EMITTED; 610 + break; 611 + case BTF_KIND_ENUM: 612 + if (top_level_def) { 613 + btf_dump_emit_enum_def(d, id, t, 0); 614 + btf_dump_printf(d, ";\n\n"); 615 + } 616 + tstate->emit_state = EMITTED; 617 + break; 618 + case BTF_KIND_PTR: 619 + case BTF_KIND_VOLATILE: 620 + case BTF_KIND_CONST: 621 + case BTF_KIND_RESTRICT: 622 + btf_dump_emit_type(d, t->type, cont_id); 623 + break; 624 + case BTF_KIND_ARRAY: { 625 + const struct btf_array *a = (void *)(t + 1); 626 + 627 + btf_dump_emit_type(d, a->type, cont_id); 628 + break; 629 + } 630 + case BTF_KIND_FWD: 631 + btf_dump_emit_fwd_def(d, id, t); 632 + btf_dump_printf(d, ";\n\n"); 633 + tstate->emit_state = EMITTED; 634 + break; 635 + case BTF_KIND_TYPEDEF: 636 + tstate->emit_state = EMITTING; 637 + btf_dump_emit_type(d, t->type, id); 638 + /* 639 + * typedef can server as both definition and forward 640 + * declaration; at this stage someone depends on 641 + * typedef as a forward declaration (refers to it 642 + * through pointer), so unless we already did it, 643 + * emit typedef as a forward declaration 644 + */ 645 + if (!tstate->fwd_emitted && !btf_dump_is_blacklisted(d, id)) { 646 + btf_dump_emit_typedef_def(d, id, t, 0); 647 + btf_dump_printf(d, ";\n\n"); 648 + } 649 + tstate->emit_state = EMITTED; 650 + break; 651 + case BTF_KIND_STRUCT: 652 + case BTF_KIND_UNION: 653 + tstate->emit_state = EMITTING; 654 + /* if it's a top-level struct/union definition or 
struct/union 655 + * is anonymous, then in C we'll be emitting all fields and 656 + * their types (as opposed to just `struct X`), so we need to 657 + * make sure that all types, referenced from struct/union 658 + * members have necessary forward-declarations, where 659 + * applicable 660 + */ 661 + if (top_level_def || t->name_off == 0) { 662 + const struct btf_member *m = (void *)(t + 1); 663 + __u16 vlen = btf_vlen_of(t); 664 + int i, new_cont_id; 665 + 666 + new_cont_id = t->name_off == 0 ? cont_id : id; 667 + for (i = 0; i < vlen; i++, m++) 668 + btf_dump_emit_type(d, m->type, new_cont_id); 669 + } else if (!tstate->fwd_emitted && id != cont_id) { 670 + btf_dump_emit_struct_fwd(d, id, t); 671 + btf_dump_printf(d, ";\n\n"); 672 + tstate->fwd_emitted = 1; 673 + } 674 + 675 + if (top_level_def) { 676 + btf_dump_emit_struct_def(d, id, t, 0); 677 + btf_dump_printf(d, ";\n\n"); 678 + tstate->emit_state = EMITTED; 679 + } else { 680 + tstate->emit_state = NOT_EMITTED; 681 + } 682 + break; 683 + case BTF_KIND_FUNC_PROTO: { 684 + const struct btf_param *p = (void *)(t + 1); 685 + __u16 vlen = btf_vlen_of(t); 686 + int i; 687 + 688 + btf_dump_emit_type(d, t->type, cont_id); 689 + for (i = 0; i < vlen; i++, p++) 690 + btf_dump_emit_type(d, p->type, cont_id); 691 + 692 + break; 693 + } 694 + default: 695 + break; 696 + } 697 + } 698 + 699 + static int btf_align_of(const struct btf *btf, __u32 id) 700 + { 701 + const struct btf_type *t = btf__type_by_id(btf, id); 702 + __u16 kind = btf_kind_of(t); 703 + 704 + switch (kind) { 705 + case BTF_KIND_INT: 706 + case BTF_KIND_ENUM: 707 + return min(sizeof(void *), t->size); 708 + case BTF_KIND_PTR: 709 + return sizeof(void *); 710 + case BTF_KIND_TYPEDEF: 711 + case BTF_KIND_VOLATILE: 712 + case BTF_KIND_CONST: 713 + case BTF_KIND_RESTRICT: 714 + return btf_align_of(btf, t->type); 715 + case BTF_KIND_ARRAY: { 716 + const struct btf_array *a = (void *)(t + 1); 717 + 718 + return btf_align_of(btf, a->type); 719 + } 720 + case 
BTF_KIND_STRUCT: 721 + case BTF_KIND_UNION: { 722 + const struct btf_member *m = (void *)(t + 1); 723 + __u16 vlen = btf_vlen_of(t); 724 + int i, align = 1; 725 + 726 + for (i = 0; i < vlen; i++, m++) 727 + align = max(align, btf_align_of(btf, m->type)); 728 + 729 + return align; 730 + } 731 + default: 732 + pr_warning("unsupported BTF_KIND:%u\n", btf_kind_of(t)); 733 + return 1; 734 + } 735 + } 736 + 737 + static bool btf_is_struct_packed(const struct btf *btf, __u32 id, 738 + const struct btf_type *t) 739 + { 740 + const struct btf_member *m; 741 + int align, i, bit_sz; 742 + __u16 vlen; 743 + bool kflag; 744 + 745 + align = btf_align_of(btf, id); 746 + /* size of a non-packed struct has to be a multiple of its alignment*/ 747 + if (t->size % align) 748 + return true; 749 + 750 + m = (void *)(t + 1); 751 + kflag = btf_kflag_of(t); 752 + vlen = btf_vlen_of(t); 753 + /* all non-bitfield fields have to be naturally aligned */ 754 + for (i = 0; i < vlen; i++, m++) { 755 + align = btf_align_of(btf, m->type); 756 + bit_sz = kflag ? BTF_MEMBER_BITFIELD_SIZE(m->offset) : 0; 757 + if (bit_sz == 0 && m->offset % (8 * align) != 0) 758 + return true; 759 + } 760 + 761 + /* 762 + * if original struct was marked as packed, but its layout is 763 + * naturally aligned, we'll detect that it's not packed 764 + */ 765 + return false; 766 + } 767 + 768 + static int chip_away_bits(int total, int at_most) 769 + { 770 + return total % at_most ? 
: at_most; 771 + } 772 + 773 + static void btf_dump_emit_bit_padding(const struct btf_dump *d, 774 + int cur_off, int m_off, int m_bit_sz, 775 + int align, int lvl) 776 + { 777 + int off_diff = m_off - cur_off; 778 + int ptr_bits = sizeof(void *) * 8; 779 + 780 + if (off_diff <= 0) 781 + /* no gap */ 782 + return; 783 + if (m_bit_sz == 0 && off_diff < align * 8) 784 + /* natural padding will take care of a gap */ 785 + return; 786 + 787 + while (off_diff > 0) { 788 + const char *pad_type; 789 + int pad_bits; 790 + 791 + if (ptr_bits > 32 && off_diff > 32) { 792 + pad_type = "long"; 793 + pad_bits = chip_away_bits(off_diff, ptr_bits); 794 + } else if (off_diff > 16) { 795 + pad_type = "int"; 796 + pad_bits = chip_away_bits(off_diff, 32); 797 + } else if (off_diff > 8) { 798 + pad_type = "short"; 799 + pad_bits = chip_away_bits(off_diff, 16); 800 + } else { 801 + pad_type = "char"; 802 + pad_bits = chip_away_bits(off_diff, 8); 803 + } 804 + btf_dump_printf(d, "\n%s%s: %d;", pfx(lvl), pad_type, pad_bits); 805 + off_diff -= pad_bits; 806 + } 807 + } 808 + 809 + static void btf_dump_emit_struct_fwd(struct btf_dump *d, __u32 id, 810 + const struct btf_type *t) 811 + { 812 + btf_dump_printf(d, "%s %s", 813 + btf_kind_of(t) == BTF_KIND_STRUCT ? "struct" : "union", 814 + btf_dump_type_name(d, id)); 815 + } 816 + 817 + static void btf_dump_emit_struct_def(struct btf_dump *d, 818 + __u32 id, 819 + const struct btf_type *t, 820 + int lvl) 821 + { 822 + const struct btf_member *m = (void *)(t + 1); 823 + bool kflag = btf_kflag_of(t), is_struct; 824 + int align, i, packed, off = 0; 825 + __u16 vlen = btf_vlen_of(t); 826 + 827 + is_struct = btf_kind_of(t) == BTF_KIND_STRUCT; 828 + packed = is_struct ? btf_is_struct_packed(d->btf, id, t) : 0; 829 + align = packed ? 1 : btf_align_of(d->btf, id); 830 + 831 + btf_dump_printf(d, "%s%s%s {", 832 + is_struct ? "struct" : "union", 833 + t->name_off ? 
" " : "", 834 + btf_dump_type_name(d, id)); 835 + 836 + for (i = 0; i < vlen; i++, m++) { 837 + const char *fname; 838 + int m_off, m_sz; 839 + 840 + fname = btf_name_of(d, m->name_off); 841 + m_sz = kflag ? BTF_MEMBER_BITFIELD_SIZE(m->offset) : 0; 842 + m_off = kflag ? BTF_MEMBER_BIT_OFFSET(m->offset) : m->offset; 843 + align = packed ? 1 : btf_align_of(d->btf, m->type); 844 + 845 + btf_dump_emit_bit_padding(d, off, m_off, m_sz, align, lvl + 1); 846 + btf_dump_printf(d, "\n%s", pfx(lvl + 1)); 847 + btf_dump_emit_type_decl(d, m->type, fname, lvl + 1); 848 + 849 + if (m_sz) { 850 + btf_dump_printf(d, ": %d", m_sz); 851 + off = m_off + m_sz; 852 + } else { 853 + m_sz = max(0, btf__resolve_size(d->btf, m->type)); 854 + off = m_off + m_sz * 8; 855 + } 856 + btf_dump_printf(d, ";"); 857 + } 858 + 859 + if (vlen) 860 + btf_dump_printf(d, "\n"); 861 + btf_dump_printf(d, "%s}", pfx(lvl)); 862 + if (packed) 863 + btf_dump_printf(d, " __attribute__((packed))"); 864 + } 865 + 866 + static void btf_dump_emit_enum_fwd(struct btf_dump *d, __u32 id, 867 + const struct btf_type *t) 868 + { 869 + btf_dump_printf(d, "enum %s", btf_dump_type_name(d, id)); 870 + } 871 + 872 + static void btf_dump_emit_enum_def(struct btf_dump *d, __u32 id, 873 + const struct btf_type *t, 874 + int lvl) 875 + { 876 + const struct btf_enum *v = (void *)(t+1); 877 + __u16 vlen = btf_vlen_of(t); 878 + const char *name; 879 + size_t dup_cnt; 880 + int i; 881 + 882 + btf_dump_printf(d, "enum%s%s", 883 + t->name_off ? 
" " : "", 884 + btf_dump_type_name(d, id)); 885 + 886 + if (vlen) { 887 + btf_dump_printf(d, " {"); 888 + for (i = 0; i < vlen; i++, v++) { 889 + name = btf_name_of(d, v->name_off); 890 + /* enumerators share namespace with typedef idents */ 891 + dup_cnt = btf_dump_name_dups(d, d->ident_names, name); 892 + if (dup_cnt > 1) { 893 + btf_dump_printf(d, "\n%s%s___%zu = %d,", 894 + pfx(lvl + 1), name, dup_cnt, 895 + (__s32)v->val); 896 + } else { 897 + btf_dump_printf(d, "\n%s%s = %d,", 898 + pfx(lvl + 1), name, 899 + (__s32)v->val); 900 + } 901 + } 902 + btf_dump_printf(d, "\n%s}", pfx(lvl)); 903 + } 904 + } 905 + 906 + static void btf_dump_emit_fwd_def(struct btf_dump *d, __u32 id, 907 + const struct btf_type *t) 908 + { 909 + const char *name = btf_dump_type_name(d, id); 910 + 911 + if (btf_kflag_of(t)) 912 + btf_dump_printf(d, "union %s", name); 913 + else 914 + btf_dump_printf(d, "struct %s", name); 915 + } 916 + 917 + static void btf_dump_emit_typedef_def(struct btf_dump *d, __u32 id, 918 + const struct btf_type *t, int lvl) 919 + { 920 + const char *name = btf_dump_ident_name(d, id); 921 + 922 + btf_dump_printf(d, "typedef "); 923 + btf_dump_emit_type_decl(d, t->type, name, lvl); 924 + } 925 + 926 + static int btf_dump_push_decl_stack_id(struct btf_dump *d, __u32 id) 927 + { 928 + __u32 *new_stack; 929 + size_t new_cap; 930 + 931 + if (d->decl_stack_cnt >= d->decl_stack_cap) { 932 + new_cap = max(16, d->decl_stack_cap * 3 / 2); 933 + new_stack = realloc(d->decl_stack, 934 + new_cap * sizeof(new_stack[0])); 935 + if (!new_stack) 936 + return -ENOMEM; 937 + d->decl_stack = new_stack; 938 + d->decl_stack_cap = new_cap; 939 + } 940 + 941 + d->decl_stack[d->decl_stack_cnt++] = id; 942 + 943 + return 0; 944 + } 945 + 946 + /* 947 + * Emit type declaration (e.g., field type declaration in a struct or argument 948 + * declaration in function prototype) in correct C syntax. 
949 + * 950 + * For most types it's trivial, but there are few quirky type declaration 951 + * cases worth mentioning: 952 + * - function prototypes (especially nesting of function prototypes); 953 + * - arrays; 954 + * - const/volatile/restrict for pointers vs other types. 955 + * 956 + * For a good discussion of *PARSING* C syntax (as a human), see 957 + * Peter van der Linden's "Expert C Programming: Deep C Secrets", 958 + * Ch.3 "Unscrambling Declarations in C". 959 + * 960 + * It won't help with BTF to C conversion much, though, as it's an opposite 961 + * problem. So we came up with this algorithm in reverse to van der Linden's 962 + * parsing algorithm. It goes from structured BTF representation of type 963 + * declaration to a valid compilable C syntax. 964 + * 965 + * For instance, consider this C typedef: 966 + * typedef const int * const * arr[10] arr_t; 967 + * It will be represented in BTF with this chain of BTF types: 968 + * [typedef] -> [array] -> [ptr] -> [const] -> [ptr] -> [const] -> [int] 969 + * 970 + * Notice how [const] modifier always goes before type it modifies in BTF type 971 + * graph, but in C syntax, const/volatile/restrict modifiers are written to 972 + * the right of pointers, but to the left of other types. There are also other 973 + * quirks, like function pointers, arrays of them, functions returning other 974 + * functions, etc. 975 + * 976 + * We handle that by pushing all the types to a stack, until we hit "terminal" 977 + * type (int/enum/struct/union/fwd). Then depending on the kind of a type on 978 + * top of a stack, modifiers are handled differently. Array/function pointers 979 + * have also wildly different syntax and how nesting of them are done. See 980 + * code for authoritative definition. 981 + * 982 + * To avoid allocating new stack for each independent chain of BTF types, we 983 + * share one bigger stack, with each chain working only on its own local view 984 + * of a stack frame. 
Some care is required to "pop" stack frames after 985 + * processing type declaration chain. 986 + */ 987 + static void btf_dump_emit_type_decl(struct btf_dump *d, __u32 id, 988 + const char *fname, int lvl) 989 + { 990 + struct id_stack decl_stack; 991 + const struct btf_type *t; 992 + int err, stack_start; 993 + __u16 kind; 994 + 995 + stack_start = d->decl_stack_cnt; 996 + for (;;) { 997 + err = btf_dump_push_decl_stack_id(d, id); 998 + if (err < 0) { 999 + /* 1000 + * if we don't have enough memory for entire type decl 1001 + * chain, restore stack, emit warning, and try to 1002 + * proceed nevertheless 1003 + */ 1004 + pr_warning("not enough memory for decl stack:%d", err); 1005 + d->decl_stack_cnt = stack_start; 1006 + return; 1007 + } 1008 + 1009 + /* VOID */ 1010 + if (id == 0) 1011 + break; 1012 + 1013 + t = btf__type_by_id(d->btf, id); 1014 + kind = btf_kind_of(t); 1015 + switch (kind) { 1016 + case BTF_KIND_PTR: 1017 + case BTF_KIND_VOLATILE: 1018 + case BTF_KIND_CONST: 1019 + case BTF_KIND_RESTRICT: 1020 + case BTF_KIND_FUNC_PROTO: 1021 + id = t->type; 1022 + break; 1023 + case BTF_KIND_ARRAY: { 1024 + const struct btf_array *a = (void *)(t + 1); 1025 + 1026 + id = a->type; 1027 + break; 1028 + } 1029 + case BTF_KIND_INT: 1030 + case BTF_KIND_ENUM: 1031 + case BTF_KIND_FWD: 1032 + case BTF_KIND_STRUCT: 1033 + case BTF_KIND_UNION: 1034 + case BTF_KIND_TYPEDEF: 1035 + goto done; 1036 + default: 1037 + pr_warning("unexpected type in decl chain, kind:%u, id:[%u]\n", 1038 + kind, id); 1039 + goto done; 1040 + } 1041 + } 1042 + done: 1043 + /* 1044 + * We might be inside a chain of declarations (e.g., array of function 1045 + * pointers returning anonymous (so inlined) structs, having another 1046 + * array field). Each of those needs its own "stack frame" to handle 1047 + * emitting of declarations. Those stack frames are non-overlapping 1048 + * portions of shared btf_dump->decl_stack. 
To make it a bit nicer to 1049 + * handle this set of nested stacks, we create a view corresponding to 1050 + * our own "stack frame" and work with it as an independent stack. 1051 + * We'll need to clean up after emit_type_chain() returns, though. 1052 + */ 1053 + decl_stack.ids = d->decl_stack + stack_start; 1054 + decl_stack.cnt = d->decl_stack_cnt - stack_start; 1055 + btf_dump_emit_type_chain(d, &decl_stack, fname, lvl); 1056 + /* 1057 + * emit_type_chain() guarantees that it will pop its entire decl_stack 1058 + * frame before returning. But it works with a read-only view into 1059 + * decl_stack, so it doesn't actually pop anything from the 1060 + * perspective of shared btf_dump->decl_stack, per se. We need to 1061 + * reset decl_stack state to how it was before us to avoid it growing 1062 + * all the time. 1063 + */ 1064 + d->decl_stack_cnt = stack_start; 1065 + } 1066 + 1067 + static void btf_dump_emit_mods(struct btf_dump *d, struct id_stack *decl_stack) 1068 + { 1069 + const struct btf_type *t; 1070 + __u32 id; 1071 + 1072 + while (decl_stack->cnt) { 1073 + id = decl_stack->ids[decl_stack->cnt - 1]; 1074 + t = btf__type_by_id(d->btf, id); 1075 + 1076 + switch (btf_kind_of(t)) { 1077 + case BTF_KIND_VOLATILE: 1078 + btf_dump_printf(d, "volatile "); 1079 + break; 1080 + case BTF_KIND_CONST: 1081 + btf_dump_printf(d, "const "); 1082 + break; 1083 + case BTF_KIND_RESTRICT: 1084 + btf_dump_printf(d, "restrict "); 1085 + break; 1086 + default: 1087 + return; 1088 + } 1089 + decl_stack->cnt--; 1090 + } 1091 + } 1092 + 1093 + static bool btf_is_mod_kind(const struct btf *btf, __u32 id) 1094 + { 1095 + const struct btf_type *t = btf__type_by_id(btf, id); 1096 + 1097 + switch (btf_kind_of(t)) { 1098 + case BTF_KIND_VOLATILE: 1099 + case BTF_KIND_CONST: 1100 + case BTF_KIND_RESTRICT: 1101 + return true; 1102 + default: 1103 + return false; 1104 + } 1105 + } 1106 + 1107 + static void btf_dump_emit_name(const struct btf_dump *d, 1108 + const char *name, bool 
last_was_ptr) 1109 + { 1110 + bool separate = name[0] && !last_was_ptr; 1111 + 1112 + btf_dump_printf(d, "%s%s", separate ? " " : "", name); 1113 + } 1114 + 1115 + static void btf_dump_emit_type_chain(struct btf_dump *d, 1116 + struct id_stack *decls, 1117 + const char *fname, int lvl) 1118 + { 1119 + /* 1120 + * last_was_ptr is used to determine if we need to separate pointer 1121 + * asterisk (*) from previous part of type signature with space, so 1122 + * that we get `int ***`, instead of `int * * *`. We default to true 1123 + * for cases where we have single pointer in a chain. E.g., in ptr -> 1124 + * func_proto case. func_proto will start a new emit_type_chain call 1125 + * with just ptr, which should be emitted as (*) or (*<fname>), so we 1126 + * don't want to prepend space for that last pointer. 1127 + */ 1128 + bool last_was_ptr = true; 1129 + const struct btf_type *t; 1130 + const char *name; 1131 + __u16 kind; 1132 + __u32 id; 1133 + 1134 + while (decls->cnt) { 1135 + id = decls->ids[--decls->cnt]; 1136 + if (id == 0) { 1137 + /* VOID is a special snowflake */ 1138 + btf_dump_emit_mods(d, decls); 1139 + btf_dump_printf(d, "void"); 1140 + last_was_ptr = false; 1141 + continue; 1142 + } 1143 + 1144 + t = btf__type_by_id(d->btf, id); 1145 + kind = btf_kind_of(t); 1146 + 1147 + switch (kind) { 1148 + case BTF_KIND_INT: 1149 + btf_dump_emit_mods(d, decls); 1150 + name = btf_name_of(d, t->name_off); 1151 + btf_dump_printf(d, "%s", name); 1152 + break; 1153 + case BTF_KIND_STRUCT: 1154 + case BTF_KIND_UNION: 1155 + btf_dump_emit_mods(d, decls); 1156 + /* inline anonymous struct/union */ 1157 + if (t->name_off == 0) 1158 + btf_dump_emit_struct_def(d, id, t, lvl); 1159 + else 1160 + btf_dump_emit_struct_fwd(d, id, t); 1161 + break; 1162 + case BTF_KIND_ENUM: 1163 + btf_dump_emit_mods(d, decls); 1164 + /* inline anonymous enum */ 1165 + if (t->name_off == 0) 1166 + btf_dump_emit_enum_def(d, id, t, lvl); 1167 + else 1168 + btf_dump_emit_enum_fwd(d, id, t); 1169 + 
break; 1170 + case BTF_KIND_FWD: 1171 + btf_dump_emit_mods(d, decls); 1172 + btf_dump_emit_fwd_def(d, id, t); 1173 + break; 1174 + case BTF_KIND_TYPEDEF: 1175 + btf_dump_emit_mods(d, decls); 1176 + btf_dump_printf(d, "%s", btf_dump_ident_name(d, id)); 1177 + break; 1178 + case BTF_KIND_PTR: 1179 + btf_dump_printf(d, "%s", last_was_ptr ? "*" : " *"); 1180 + break; 1181 + case BTF_KIND_VOLATILE: 1182 + btf_dump_printf(d, " volatile"); 1183 + break; 1184 + case BTF_KIND_CONST: 1185 + btf_dump_printf(d, " const"); 1186 + break; 1187 + case BTF_KIND_RESTRICT: 1188 + btf_dump_printf(d, " restrict"); 1189 + break; 1190 + case BTF_KIND_ARRAY: { 1191 + const struct btf_array *a = (void *)(t + 1); 1192 + const struct btf_type *next_t; 1193 + __u32 next_id; 1194 + bool multidim; 1195 + /* 1196 + * GCC has a bug 1197 + * (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=8354) 1198 + * which causes it to emit extra const/volatile 1199 + * modifiers for an array, if array's element type has 1200 + * const/volatile modifiers. Clang doesn't do that. 1201 + * In general, it doesn't seem very meaningful to have 1202 + * a const/volatile modifier for array, so we are 1203 + * going to silently skip them here. 
1204 + */ 1205 + while (decls->cnt) { 1206 + next_id = decls->ids[decls->cnt - 1]; 1207 + if (btf_is_mod_kind(d->btf, next_id)) 1208 + decls->cnt--; 1209 + else 1210 + break; 1211 + } 1212 + 1213 + if (decls->cnt == 0) { 1214 + btf_dump_emit_name(d, fname, last_was_ptr); 1215 + btf_dump_printf(d, "[%u]", a->nelems); 1216 + return; 1217 + } 1218 + 1219 + next_t = btf__type_by_id(d->btf, next_id); 1220 + multidim = btf_kind_of(next_t) == BTF_KIND_ARRAY; 1221 + /* we need space if we have named non-pointer */ 1222 + if (fname[0] && !last_was_ptr) 1223 + btf_dump_printf(d, " "); 1224 + /* no parentheses for multi-dimensional array */ 1225 + if (!multidim) 1226 + btf_dump_printf(d, "("); 1227 + btf_dump_emit_type_chain(d, decls, fname, lvl); 1228 + if (!multidim) 1229 + btf_dump_printf(d, ")"); 1230 + btf_dump_printf(d, "[%u]", a->nelems); 1231 + return; 1232 + } 1233 + case BTF_KIND_FUNC_PROTO: { 1234 + const struct btf_param *p = (void *)(t + 1); 1235 + __u16 vlen = btf_vlen_of(t); 1236 + int i; 1237 + 1238 + btf_dump_emit_mods(d, decls); 1239 + if (decls->cnt) { 1240 + btf_dump_printf(d, " ("); 1241 + btf_dump_emit_type_chain(d, decls, fname, lvl); 1242 + btf_dump_printf(d, ")"); 1243 + } else { 1244 + btf_dump_emit_name(d, fname, last_was_ptr); 1245 + } 1246 + btf_dump_printf(d, "("); 1247 + /* 1248 + * Clang for BPF target generates func_proto with no 1249 + * args as a func_proto with a single void arg (e.g., 1250 + * `int (*f)(void)` vs just `int (*f)()`). We are 1251 + * going to pretend there are no args for such case. 
1252 + */ 1253 + if (vlen == 1 && p->type == 0) { 1254 + btf_dump_printf(d, ")"); 1255 + return; 1256 + } 1257 + 1258 + for (i = 0; i < vlen; i++, p++) { 1259 + if (i > 0) 1260 + btf_dump_printf(d, ", "); 1261 + 1262 + /* last arg of type void is vararg */ 1263 + if (i == vlen - 1 && p->type == 0) { 1264 + btf_dump_printf(d, "..."); 1265 + break; 1266 + } 1267 + 1268 + name = btf_name_of(d, p->name_off); 1269 + btf_dump_emit_type_decl(d, p->type, name, lvl); 1270 + } 1271 + 1272 + btf_dump_printf(d, ")"); 1273 + return; 1274 + } 1275 + default: 1276 + pr_warning("unexpected type in decl chain, kind:%u, id:[%u]\n", 1277 + kind, id); 1278 + return; 1279 + } 1280 + 1281 + last_was_ptr = kind == BTF_KIND_PTR; 1282 + } 1283 + 1284 + btf_dump_emit_name(d, fname, last_was_ptr); 1285 + } 1286 + 1287 + /* return number of duplicates (occurrences) of a given name */ 1288 + static size_t btf_dump_name_dups(struct btf_dump *d, struct hashmap *name_map, 1289 + const char *orig_name) 1290 + { 1291 + size_t dup_cnt = 0; 1292 + 1293 + hashmap__find(name_map, orig_name, (void **)&dup_cnt); 1294 + dup_cnt++; 1295 + hashmap__set(name_map, orig_name, (void *)dup_cnt, NULL, NULL); 1296 + 1297 + return dup_cnt; 1298 + } 1299 + 1300 + static const char *btf_dump_resolve_name(struct btf_dump *d, __u32 id, 1301 + struct hashmap *name_map) 1302 + { 1303 + struct btf_dump_type_aux_state *s = &d->type_states[id]; 1304 + const struct btf_type *t = btf__type_by_id(d->btf, id); 1305 + const char *orig_name = btf_name_of(d, t->name_off); 1306 + const char **cached_name = &d->cached_names[id]; 1307 + size_t dup_cnt; 1308 + 1309 + if (t->name_off == 0) 1310 + return ""; 1311 + 1312 + if (s->name_resolved) 1313 + return *cached_name ? 
*cached_name : orig_name; 1314 + 1315 + dup_cnt = btf_dump_name_dups(d, name_map, orig_name); 1316 + if (dup_cnt > 1) { 1317 + const size_t max_len = 256; 1318 + char new_name[max_len]; 1319 + 1320 + snprintf(new_name, max_len, "%s___%zu", orig_name, dup_cnt); 1321 + *cached_name = strdup(new_name); 1322 + } 1323 + 1324 + s->name_resolved = 1; 1325 + return *cached_name ? *cached_name : orig_name; 1326 + } 1327 + 1328 + static const char *btf_dump_type_name(struct btf_dump *d, __u32 id) 1329 + { 1330 + return btf_dump_resolve_name(d, id, d->type_names); 1331 + } 1332 + 1333 + static const char *btf_dump_ident_name(struct btf_dump *d, __u32 id) 1334 + { 1335 + return btf_dump_resolve_name(d, id, d->ident_names); 1336 + }
+229
tools/lib/bpf/hashmap.c
··· 1 + // SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) 2 + 3 + /* 4 + * Generic non-thread safe hash map implementation. 5 + * 6 + * Copyright (c) 2019 Facebook 7 + */ 8 + #include <stdint.h> 9 + #include <stdlib.h> 10 + #include <stdio.h> 11 + #include <errno.h> 12 + #include <linux/err.h> 13 + #include "hashmap.h" 14 + 15 + /* start with 4 buckets */ 16 + #define HASHMAP_MIN_CAP_BITS 2 17 + 18 + static void hashmap_add_entry(struct hashmap_entry **pprev, 19 + struct hashmap_entry *entry) 20 + { 21 + entry->next = *pprev; 22 + *pprev = entry; 23 + } 24 + 25 + static void hashmap_del_entry(struct hashmap_entry **pprev, 26 + struct hashmap_entry *entry) 27 + { 28 + *pprev = entry->next; 29 + entry->next = NULL; 30 + } 31 + 32 + void hashmap__init(struct hashmap *map, hashmap_hash_fn hash_fn, 33 + hashmap_equal_fn equal_fn, void *ctx) 34 + { 35 + map->hash_fn = hash_fn; 36 + map->equal_fn = equal_fn; 37 + map->ctx = ctx; 38 + 39 + map->buckets = NULL; 40 + map->cap = 0; 41 + map->cap_bits = 0; 42 + map->sz = 0; 43 + } 44 + 45 + struct hashmap *hashmap__new(hashmap_hash_fn hash_fn, 46 + hashmap_equal_fn equal_fn, 47 + void *ctx) 48 + { 49 + struct hashmap *map = malloc(sizeof(struct hashmap)); 50 + 51 + if (!map) 52 + return ERR_PTR(-ENOMEM); 53 + hashmap__init(map, hash_fn, equal_fn, ctx); 54 + return map; 55 + } 56 + 57 + void hashmap__clear(struct hashmap *map) 58 + { 59 + free(map->buckets); 60 + map->cap = map->cap_bits = map->sz = 0; 61 + } 62 + 63 + void hashmap__free(struct hashmap *map) 64 + { 65 + if (!map) 66 + return; 67 + 68 + hashmap__clear(map); 69 + free(map); 70 + } 71 + 72 + size_t hashmap__size(const struct hashmap *map) 73 + { 74 + return map->sz; 75 + } 76 + 77 + size_t hashmap__capacity(const struct hashmap *map) 78 + { 79 + return map->cap; 80 + } 81 + 82 + static bool hashmap_needs_to_grow(struct hashmap *map) 83 + { 84 + /* grow if empty or more than 75% filled */ 85 + return (map->cap == 0) || ((map->sz + 1) * 4 / 3 > map->cap); 86 + 
} 87 + 88 + static int hashmap_grow(struct hashmap *map) 89 + { 90 + struct hashmap_entry **new_buckets; 91 + struct hashmap_entry *cur, *tmp; 92 + size_t new_cap_bits, new_cap; 93 + size_t h; 94 + int bkt; 95 + 96 + new_cap_bits = map->cap_bits + 1; 97 + if (new_cap_bits < HASHMAP_MIN_CAP_BITS) 98 + new_cap_bits = HASHMAP_MIN_CAP_BITS; 99 + 100 + new_cap = 1UL << new_cap_bits; 101 + new_buckets = calloc(new_cap, sizeof(new_buckets[0])); 102 + if (!new_buckets) 103 + return -ENOMEM; 104 + 105 + hashmap__for_each_entry_safe(map, cur, tmp, bkt) { 106 + h = hash_bits(map->hash_fn(cur->key, map->ctx), new_cap_bits); 107 + hashmap_add_entry(&new_buckets[h], cur); 108 + } 109 + 110 + map->cap = new_cap; 111 + map->cap_bits = new_cap_bits; 112 + free(map->buckets); 113 + map->buckets = new_buckets; 114 + 115 + return 0; 116 + } 117 + 118 + static bool hashmap_find_entry(const struct hashmap *map, 119 + const void *key, size_t hash, 120 + struct hashmap_entry ***pprev, 121 + struct hashmap_entry **entry) 122 + { 123 + struct hashmap_entry *cur, **prev_ptr; 124 + 125 + if (!map->buckets) 126 + return false; 127 + 128 + for (prev_ptr = &map->buckets[hash], cur = *prev_ptr; 129 + cur; 130 + prev_ptr = &cur->next, cur = cur->next) { 131 + if (map->equal_fn(cur->key, key, map->ctx)) { 132 + if (pprev) 133 + *pprev = prev_ptr; 134 + *entry = cur; 135 + return true; 136 + } 137 + } 138 + 139 + return false; 140 + } 141 + 142 + int hashmap__insert(struct hashmap *map, const void *key, void *value, 143 + enum hashmap_insert_strategy strategy, 144 + const void **old_key, void **old_value) 145 + { 146 + struct hashmap_entry *entry; 147 + size_t h; 148 + int err; 149 + 150 + if (old_key) 151 + *old_key = NULL; 152 + if (old_value) 153 + *old_value = NULL; 154 + 155 + h = hash_bits(map->hash_fn(key, map->ctx), map->cap_bits); 156 + if (strategy != HASHMAP_APPEND && 157 + hashmap_find_entry(map, key, h, NULL, &entry)) { 158 + if (old_key) 159 + *old_key = entry->key; 160 + if 
(old_value) 161 + *old_value = entry->value; 162 + 163 + if (strategy == HASHMAP_SET || strategy == HASHMAP_UPDATE) { 164 + entry->key = key; 165 + entry->value = value; 166 + return 0; 167 + } else if (strategy == HASHMAP_ADD) { 168 + return -EEXIST; 169 + } 170 + } 171 + 172 + if (strategy == HASHMAP_UPDATE) 173 + return -ENOENT; 174 + 175 + if (hashmap_needs_to_grow(map)) { 176 + err = hashmap_grow(map); 177 + if (err) 178 + return err; 179 + h = hash_bits(map->hash_fn(key, map->ctx), map->cap_bits); 180 + } 181 + 182 + entry = malloc(sizeof(struct hashmap_entry)); 183 + if (!entry) 184 + return -ENOMEM; 185 + 186 + entry->key = key; 187 + entry->value = value; 188 + hashmap_add_entry(&map->buckets[h], entry); 189 + map->sz++; 190 + 191 + return 0; 192 + } 193 + 194 + bool hashmap__find(const struct hashmap *map, const void *key, void **value) 195 + { 196 + struct hashmap_entry *entry; 197 + size_t h; 198 + 199 + h = hash_bits(map->hash_fn(key, map->ctx), map->cap_bits); 200 + if (!hashmap_find_entry(map, key, h, NULL, &entry)) 201 + return false; 202 + 203 + if (value) 204 + *value = entry->value; 205 + return true; 206 + } 207 + 208 + bool hashmap__delete(struct hashmap *map, const void *key, 209 + const void **old_key, void **old_value) 210 + { 211 + struct hashmap_entry **pprev, *entry; 212 + size_t h; 213 + 214 + h = hash_bits(map->hash_fn(key, map->ctx), map->cap_bits); 215 + if (!hashmap_find_entry(map, key, h, &pprev, &entry)) 216 + return false; 217 + 218 + if (old_key) 219 + *old_key = entry->key; 220 + if (old_value) 221 + *old_value = entry->value; 222 + 223 + hashmap_del_entry(pprev, entry); 224 + free(entry); 225 + map->sz--; 226 + 227 + return true; 228 + } 229 +
+173
tools/lib/bpf/hashmap.h
··· 1 + /* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ 2 + 3 + /* 4 + * Generic non-thread safe hash map implementation. 5 + * 6 + * Copyright (c) 2019 Facebook 7 + */ 8 + #ifndef __LIBBPF_HASHMAP_H 9 + #define __LIBBPF_HASHMAP_H 10 + 11 + #include <stdbool.h> 12 + #include <stddef.h> 13 + #include "libbpf_internal.h" 14 + 15 + static inline size_t hash_bits(size_t h, int bits) 16 + { 17 + /* shuffle bits and return requested number of upper bits */ 18 + return (h * 11400714819323198485llu) >> (__WORDSIZE - bits); 19 + } 20 + 21 + typedef size_t (*hashmap_hash_fn)(const void *key, void *ctx); 22 + typedef bool (*hashmap_equal_fn)(const void *key1, const void *key2, void *ctx); 23 + 24 + struct hashmap_entry { 25 + const void *key; 26 + void *value; 27 + struct hashmap_entry *next; 28 + }; 29 + 30 + struct hashmap { 31 + hashmap_hash_fn hash_fn; 32 + hashmap_equal_fn equal_fn; 33 + void *ctx; 34 + 35 + struct hashmap_entry **buckets; 36 + size_t cap; 37 + size_t cap_bits; 38 + size_t sz; 39 + }; 40 + 41 + #define HASHMAP_INIT(hash_fn, equal_fn, ctx) { \ 42 + .hash_fn = (hash_fn), \ 43 + .equal_fn = (equal_fn), \ 44 + .ctx = (ctx), \ 45 + .buckets = NULL, \ 46 + .cap = 0, \ 47 + .cap_bits = 0, \ 48 + .sz = 0, \ 49 + } 50 + 51 + void hashmap__init(struct hashmap *map, hashmap_hash_fn hash_fn, 52 + hashmap_equal_fn equal_fn, void *ctx); 53 + struct hashmap *hashmap__new(hashmap_hash_fn hash_fn, 54 + hashmap_equal_fn equal_fn, 55 + void *ctx); 56 + void hashmap__clear(struct hashmap *map); 57 + void hashmap__free(struct hashmap *map); 58 + 59 + size_t hashmap__size(const struct hashmap *map); 60 + size_t hashmap__capacity(const struct hashmap *map); 61 + 62 + /* 63 + * Hashmap insertion strategy: 64 + * - HASHMAP_ADD - only add key/value if key doesn't exist yet; 65 + * - HASHMAP_SET - add key/value pair if key doesn't exist yet; otherwise, 66 + * update value; 67 + * - HASHMAP_UPDATE - update value, if key already exists; otherwise, do 68 + * nothing and 
return -ENOENT; 69 + * - HASHMAP_APPEND - always add key/value pair, even if key already exists. 70 + * This turns hashmap into a multimap by allowing multiple values to be 71 + * associated with the same key. Most useful read API for such hashmap is 72 + * hashmap__for_each_key_entry() iteration. If hashmap__find() is still 73 + * used, it will return last inserted key/value entry (first in a bucket 74 + * chain). 75 + */ 76 + enum hashmap_insert_strategy { 77 + HASHMAP_ADD, 78 + HASHMAP_SET, 79 + HASHMAP_UPDATE, 80 + HASHMAP_APPEND, 81 + }; 82 + 83 + /* 84 + * hashmap__insert() adds key/value entry w/ various semantics, depending on 85 + * provided strategy value. If a given key/value pair replaced already 86 + * existing key/value pair, both old key and old value will be returned 87 + * through old_key and old_value to allow calling code do proper memory 88 + * management. 89 + */ 90 + int hashmap__insert(struct hashmap *map, const void *key, void *value, 91 + enum hashmap_insert_strategy strategy, 92 + const void **old_key, void **old_value); 93 + 94 + static inline int hashmap__add(struct hashmap *map, 95 + const void *key, void *value) 96 + { 97 + return hashmap__insert(map, key, value, HASHMAP_ADD, NULL, NULL); 98 + } 99 + 100 + static inline int hashmap__set(struct hashmap *map, 101 + const void *key, void *value, 102 + const void **old_key, void **old_value) 103 + { 104 + return hashmap__insert(map, key, value, HASHMAP_SET, 105 + old_key, old_value); 106 + } 107 + 108 + static inline int hashmap__update(struct hashmap *map, 109 + const void *key, void *value, 110 + const void **old_key, void **old_value) 111 + { 112 + return hashmap__insert(map, key, value, HASHMAP_UPDATE, 113 + old_key, old_value); 114 + } 115 + 116 + static inline int hashmap__append(struct hashmap *map, 117 + const void *key, void *value) 118 + { 119 + return hashmap__insert(map, key, value, HASHMAP_APPEND, NULL, NULL); 120 + } 121 + 122 + bool hashmap__delete(struct hashmap *map, const 
void *key, 123 + const void **old_key, void **old_value); 124 + 125 + bool hashmap__find(const struct hashmap *map, const void *key, void **value); 126 + 127 + /* 128 + * hashmap__for_each_entry - iterate over all entries in hashmap 129 + * @map: hashmap to iterate 130 + * @cur: struct hashmap_entry * used as a loop cursor 131 + * @bkt: integer used as a bucket loop cursor 132 + */ 133 + #define hashmap__for_each_entry(map, cur, bkt) \ 134 + for (bkt = 0; bkt < map->cap; bkt++) \ 135 + for (cur = map->buckets[bkt]; cur; cur = cur->next) 136 + 137 + /* 138 + * hashmap__for_each_entry_safe - iterate over all entries in hashmap, safe 139 + * against removals 140 + * @map: hashmap to iterate 141 + * @cur: struct hashmap_entry * used as a loop cursor 142 + * @tmp: struct hashmap_entry * used as a temporary next cursor storage 143 + * @bkt: integer used as a bucket loop cursor 144 + */ 145 + #define hashmap__for_each_entry_safe(map, cur, tmp, bkt) \ 146 + for (bkt = 0; bkt < map->cap; bkt++) \ 147 + for (cur = map->buckets[bkt]; \ 148 + cur && ({tmp = cur->next; true; }); \ 149 + cur = tmp) 150 + 151 + /* 152 + * hashmap__for_each_key_entry - iterate over entries associated with given key 153 + * @map: hashmap to iterate 154 + * @cur: struct hashmap_entry * used as a loop cursor 155 + * @key: key to iterate entries for 156 + */ 157 + #define hashmap__for_each_key_entry(map, cur, _key) \ 158 + for (cur = ({ size_t bkt = hash_bits(map->hash_fn((_key), map->ctx),\ 159 + map->cap_bits); \ 160 + map->buckets ? map->buckets[bkt] : NULL; }); \ 161 + cur; \ 162 + cur = cur->next) \ 163 + if (map->equal_fn(cur->key, (_key), map->ctx)) 164 + 165 + #define hashmap__for_each_key_entry_safe(map, cur, tmp, _key) \ 166 + for (cur = ({ size_t bkt = hash_bits(map->hash_fn((_key), map->ctx),\ 167 + map->cap_bits); \ 168 + cur = map->buckets ? 
map->buckets[bkt] : NULL; }); \ 169 + cur && ({ tmp = cur->next; true; }); \ 170 + cur = tmp) \ 171 + if (map->equal_fn(cur->key, (_key), map->ctx)) 172 + 173 + #endif /* __LIBBPF_HASHMAP_H */
+80 -93
tools/lib/bpf/libbpf.c
··· 188 188 void *line_info; 189 189 __u32 line_info_rec_size; 190 190 __u32 line_info_cnt; 191 + __u32 prog_flags; 191 192 }; 192 193 193 194 enum libbpf_map_type { ··· 349 348 bpf_program__init(void *data, size_t size, char *section_name, int idx, 350 349 struct bpf_program *prog) 351 350 { 352 - if (size < sizeof(struct bpf_insn)) { 353 - pr_warning("corrupted section '%s'\n", section_name); 351 + const size_t bpf_insn_sz = sizeof(struct bpf_insn); 352 + 353 + if (size == 0 || size % bpf_insn_sz) { 354 + pr_warning("corrupted section '%s', size: %zu\n", 355 + section_name, size); 354 356 return -EINVAL; 355 357 } 356 358 ··· 379 375 section_name); 380 376 goto errout; 381 377 } 382 - prog->insns_cnt = size / sizeof(struct bpf_insn); 383 - memcpy(prog->insns, data, 384 - prog->insns_cnt * sizeof(struct bpf_insn)); 378 + prog->insns_cnt = size / bpf_insn_sz; 379 + memcpy(prog->insns, data, size); 385 380 prog->idx = idx; 386 381 prog->instances.fds = NULL; 387 382 prog->instances.nr = -1; ··· 497 494 498 495 strcpy(obj->path, path); 499 496 /* Using basename() GNU version which doesn't modify arg. */ 500 - strncpy(obj->name, basename((void *)path), 501 - sizeof(obj->name) - 1); 497 + strncpy(obj->name, basename((void *)path), sizeof(obj->name) - 1); 502 498 end = strchr(obj->name, '.'); 503 499 if (end) 504 500 *end = 0; 505 501 506 502 obj->efile.fd = -1; 507 503 /* 508 - * Caller of this function should also calls 504 + * Caller of this function should also call 509 505 * bpf_object__elf_finish() after data collection to return 510 506 * obj_buf to user. If not, we should duplicate the buffer to 511 507 * avoid user freeing them before elf finish. 
··· 564 562 } else { 565 563 obj->efile.fd = open(obj->path, O_RDONLY); 566 564 if (obj->efile.fd < 0) { 567 - char errmsg[STRERR_BUFSIZE]; 568 - char *cp = libbpf_strerror_r(errno, errmsg, 569 - sizeof(errmsg)); 565 + char errmsg[STRERR_BUFSIZE], *cp; 570 566 567 + err = -errno; 568 + cp = libbpf_strerror_r(err, errmsg, sizeof(errmsg)); 571 569 pr_warning("failed to open %s: %s\n", obj->path, cp); 572 - return -errno; 570 + return err; 573 571 } 574 572 575 573 obj->efile.elf = elf_begin(obj->efile.fd, 576 - LIBBPF_ELF_C_READ_MMAP, 577 - NULL); 574 + LIBBPF_ELF_C_READ_MMAP, NULL); 578 575 } 579 576 580 577 if (!obj->efile.elf) { 581 - pr_warning("failed to open %s as ELF file\n", 582 - obj->path); 578 + pr_warning("failed to open %s as ELF file\n", obj->path); 583 579 err = -LIBBPF_ERRNO__LIBELF; 584 580 goto errout; 585 581 } 586 582 587 583 if (!gelf_getehdr(obj->efile.elf, &obj->efile.ehdr)) { 588 - pr_warning("failed to get EHDR from %s\n", 589 - obj->path); 584 + pr_warning("failed to get EHDR from %s\n", obj->path); 590 585 err = -LIBBPF_ERRNO__FORMAT; 591 586 goto errout; 592 587 } 593 588 ep = &obj->efile.ehdr; 594 589 595 590 /* Old LLVM set e_machine to EM_NONE */ 596 - if ((ep->e_type != ET_REL) || (ep->e_machine && (ep->e_machine != EM_BPF))) { 597 - pr_warning("%s is not an eBPF object file\n", 598 - obj->path); 591 + if (ep->e_type != ET_REL || 592 + (ep->e_machine && ep->e_machine != EM_BPF)) { 593 + pr_warning("%s is not an eBPF object file\n", obj->path); 599 594 err = -LIBBPF_ERRNO__FORMAT; 600 595 goto errout; 601 596 } ··· 603 604 return err; 604 605 } 605 606 606 - static int 607 - bpf_object__check_endianness(struct bpf_object *obj) 607 + static int bpf_object__check_endianness(struct bpf_object *obj) 608 608 { 609 - static unsigned int const endian = 1; 610 - 611 - switch (obj->efile.ehdr.e_ident[EI_DATA]) { 612 - case ELFDATA2LSB: 613 - /* We are big endian, BPF obj is little endian. 
*/ 614 - if (*(unsigned char const *)&endian != 1) 615 - goto mismatch; 616 - break; 617 - 618 - case ELFDATA2MSB: 619 - /* We are little endian, BPF obj is big endian. */ 620 - if (*(unsigned char const *)&endian != 0) 621 - goto mismatch; 622 - break; 623 - default: 624 - return -LIBBPF_ERRNO__ENDIAN; 625 - } 626 - 627 - return 0; 628 - 629 - mismatch: 630 - pr_warning("Error: endianness mismatch.\n"); 609 + #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ 610 + if (obj->efile.ehdr.e_ident[EI_DATA] == ELFDATA2LSB) 611 + return 0; 612 + #elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ 613 + if (obj->efile.ehdr.e_ident[EI_DATA] == ELFDATA2MSB) 614 + return 0; 615 + #else 616 + # error "Unrecognized __BYTE_ORDER__" 617 + #endif 618 + pr_warning("endianness mismatch.\n"); 631 619 return -LIBBPF_ERRNO__ENDIAN; 632 620 } 633 621 634 622 static int 635 - bpf_object__init_license(struct bpf_object *obj, 636 - void *data, size_t size) 623 + bpf_object__init_license(struct bpf_object *obj, void *data, size_t size) 637 624 { 638 - memcpy(obj->license, data, 639 - min(size, sizeof(obj->license) - 1)); 625 + memcpy(obj->license, data, min(size, sizeof(obj->license) - 1)); 640 626 pr_debug("license of %s is %s\n", obj->path, obj->license); 641 627 return 0; 642 628 } 643 629 644 630 static int 645 - bpf_object__init_kversion(struct bpf_object *obj, 646 - void *data, size_t size) 631 + bpf_object__init_kversion(struct bpf_object *obj, void *data, size_t size) 647 632 { 648 633 __u32 kver; 649 634 ··· 637 654 } 638 655 memcpy(&kver, data, sizeof(kver)); 639 656 obj->kern_version = kver; 640 - pr_debug("kernel version of %s is %x\n", obj->path, 641 - obj->kern_version); 657 + pr_debug("kernel version of %s is %x\n", obj->path, obj->kern_version); 642 658 return 0; 643 659 } 644 660 ··· 793 811 def->key_size = sizeof(int); 794 812 def->value_size = data->d_size; 795 813 def->max_entries = 1; 796 - def->map_flags = type == LIBBPF_MAP_RODATA ? 
797 - BPF_F_RDONLY_PROG : 0; 814 + def->map_flags = type == LIBBPF_MAP_RODATA ? BPF_F_RDONLY_PROG : 0; 798 815 if (data_buff) { 799 816 *data_buff = malloc(data->d_size); 800 817 if (!*data_buff) { ··· 808 827 return 0; 809 828 } 810 829 811 - static int 812 - bpf_object__init_maps(struct bpf_object *obj, int flags) 830 + static int bpf_object__init_maps(struct bpf_object *obj, int flags) 813 831 { 814 832 int i, map_idx, map_def_sz = 0, nr_syms, nr_maps = 0, nr_maps_glob = 0; 815 833 bool strict = !(flags & MAPS_RELAX_COMPAT); ··· 910 930 map_name = elf_strptr(obj->efile.elf, 911 931 obj->efile.strtabidx, 912 932 sym.st_name); 933 + if (!map_name) { 934 + pr_warning("failed to get map #%d name sym string for obj %s\n", 935 + map_idx, obj->path); 936 + return -LIBBPF_ERRNO__FORMAT; 937 + } 913 938 914 939 obj->maps[map_idx].libbpf_type = LIBBPF_MAP_UNSPEC; 915 940 obj->maps[map_idx].offset = sym.st_value; ··· 1089 1104 1090 1105 /* Elf is corrupted/truncated, avoid calling elf_strptr. */ 1091 1106 if (!elf_rawdata(elf_getscn(elf, ep->e_shstrndx), NULL)) { 1092 - pr_warning("failed to get e_shstrndx from %s\n", 1093 - obj->path); 1107 + pr_warning("failed to get e_shstrndx from %s\n", obj->path); 1094 1108 return -LIBBPF_ERRNO__FORMAT; 1095 1109 } 1096 1110 ··· 1210 1226 1211 1227 if (!obj->efile.strtabidx || obj->efile.strtabidx >= idx) { 1212 1228 pr_warning("Corrupted ELF file: index of strtab invalid\n"); 1213 - return LIBBPF_ERRNO__FORMAT; 1229 + return -LIBBPF_ERRNO__FORMAT; 1214 1230 } 1215 1231 if (btf_data) { 1216 1232 obj->btf = btf__new(btf_data->d_buf, btf_data->d_size); ··· 1330 1346 size_t nr_maps = obj->nr_maps; 1331 1347 int i, nrels; 1332 1348 1333 - pr_debug("collecting relocating info for: '%s'\n", 1334 - prog->section_name); 1349 + pr_debug("collecting relocating info for: '%s'\n", prog->section_name); 1335 1350 nrels = shdr->sh_size / shdr->sh_entsize; 1336 1351 1337 1352 prog->reloc_desc = malloc(sizeof(*prog->reloc_desc) * nrels); ··· 1355 
1372 return -LIBBPF_ERRNO__FORMAT; 1356 1373 } 1357 1374 1358 - if (!gelf_getsym(symbols, 1359 - GELF_R_SYM(rel.r_info), 1360 - &sym)) { 1375 + if (!gelf_getsym(symbols, GELF_R_SYM(rel.r_info), &sym)) { 1361 1376 pr_warning("relocation: symbol %"PRIx64" not found\n", 1362 1377 GELF_R_SYM(rel.r_info)); 1363 1378 return -LIBBPF_ERRNO__FORMAT; ··· 1416 1435 if (maps[map_idx].libbpf_type != type) 1417 1436 continue; 1418 1437 if (type != LIBBPF_MAP_UNSPEC || 1419 - (type == LIBBPF_MAP_UNSPEC && 1420 - maps[map_idx].offset == sym.st_value)) { 1438 + maps[map_idx].offset == sym.st_value) { 1421 1439 pr_debug("relocation: find map %zd (%s) for insn %u\n", 1422 1440 map_idx, maps[map_idx].name, insn_idx); 1423 1441 break; ··· 1424 1444 } 1425 1445 1426 1446 if (map_idx >= nr_maps) { 1427 - pr_warning("bpf relocation: map_idx %d large than %d\n", 1447 + pr_warning("bpf relocation: map_idx %d larger than %d\n", 1428 1448 (int)map_idx, (int)nr_maps - 1); 1429 1449 return -LIBBPF_ERRNO__RELOC; 1430 1450 } ··· 1736 1756 create_attr.key_size = def->key_size; 1737 1757 create_attr.value_size = def->value_size; 1738 1758 create_attr.max_entries = def->max_entries; 1739 - create_attr.btf_fd = 0; 1759 + create_attr.btf_fd = -1; 1740 1760 create_attr.btf_key_type_id = 0; 1741 1761 create_attr.btf_value_type_id = 0; 1742 1762 if (bpf_map_type__is_map_in_map(def->type) && ··· 1750 1770 } 1751 1771 1752 1772 *pfd = bpf_create_map_xattr(&create_attr); 1753 - if (*pfd < 0 && create_attr.btf_key_type_id) { 1773 + if (*pfd < 0 && create_attr.btf_fd >= 0) { 1754 1774 cp = libbpf_strerror_r(errno, errmsg, sizeof(errmsg)); 1755 1775 pr_warning("Error in bpf_create_map_xattr(%s):%s(%d). 
Retrying without BTF.\n", 1756 1776 map->name, cp, errno); 1757 - create_attr.btf_fd = 0; 1777 + create_attr.btf_fd = -1; 1758 1778 create_attr.btf_key_type_id = 0; 1759 1779 create_attr.btf_value_type_id = 0; 1760 1780 map->btf_key_type_id = 0; ··· 1783 1803 } 1784 1804 } 1785 1805 1786 - pr_debug("create map %s: fd=%d\n", map->name, *pfd); 1806 + pr_debug("created map %s: fd=%d\n", map->name, *pfd); 1787 1807 } 1788 1808 1789 1809 return 0; ··· 1804 1824 if (btf_prog_info) { 1805 1825 /* 1806 1826 * Some info has already been found but has problem 1807 - * in the last btf_ext reloc. Must have to error 1808 - * out. 1827 + * in the last btf_ext reloc. Must have to error out. 1809 1828 */ 1810 1829 pr_warning("Error in relocating %s for sec %s.\n", 1811 1830 info_name, prog->section_name); 1812 1831 return err; 1813 1832 } 1814 1833 1815 - /* 1816 - * Have problem loading the very first info. Ignore 1817 - * the rest. 1818 - */ 1834 + /* Have problem loading the very first info. Ignore the rest. */ 1819 1835 pr_warning("Cannot find %s for main program sec %s. Ignore all %s.\n", 1820 1836 info_name, prog->section_name, info_name); 1821 1837 return 0; ··· 2015 2039 return -LIBBPF_ERRNO__RELOC; 2016 2040 } 2017 2041 2018 - err = bpf_program__collect_reloc(prog, 2019 - shdr, data, 2020 - obj); 2042 + err = bpf_program__collect_reloc(prog, shdr, data, obj); 2021 2043 if (err) 2022 2044 return err; 2023 2045 } ··· 2032 2058 char *log_buf; 2033 2059 int ret; 2034 2060 2061 + if (!insns || !insns_cnt) 2062 + return -EINVAL; 2063 + 2035 2064 memset(&load_attr, 0, sizeof(struct bpf_load_program_attr)); 2036 2065 load_attr.prog_type = prog->type; 2037 2066 load_attr.expected_attach_type = prog->expected_attach_type; ··· 2045 2068 load_attr.license = license; 2046 2069 load_attr.kern_version = kern_version; 2047 2070 load_attr.prog_ifindex = prog->prog_ifindex; 2048 - load_attr.prog_btf_fd = prog->btf_fd >= 0 ? 
prog->btf_fd : 0; 2071 + load_attr.prog_btf_fd = prog->btf_fd; 2049 2072 load_attr.func_info = prog->func_info; 2050 2073 load_attr.func_info_rec_size = prog->func_info_rec_size; 2051 2074 load_attr.func_info_cnt = prog->func_info_cnt; ··· 2053 2076 load_attr.line_info_rec_size = prog->line_info_rec_size; 2054 2077 load_attr.line_info_cnt = prog->line_info_cnt; 2055 2078 load_attr.log_level = prog->log_level; 2056 - if (!load_attr.insns || !load_attr.insns_cnt) 2057 - return -EINVAL; 2079 + load_attr.prog_flags = prog->prog_flags; 2058 2080 2059 2081 retry_load: 2060 2082 log_buf = malloc(log_buf_size); ··· 2198 2222 } 2199 2223 2200 2224 static int 2201 - bpf_object__load_progs(struct bpf_object *obj) 2225 + bpf_object__load_progs(struct bpf_object *obj, int log_level) 2202 2226 { 2203 2227 size_t i; 2204 2228 int err; ··· 2206 2230 for (i = 0; i < obj->nr_programs; i++) { 2207 2231 if (bpf_program__is_function_storage(&obj->programs[i], obj)) 2208 2232 continue; 2233 + obj->programs[i].log_level |= log_level; 2209 2234 err = bpf_program__load(&obj->programs[i], 2210 2235 obj->license, 2211 2236 obj->kern_version); ··· 2333 2356 snprintf(tmp_name, sizeof(tmp_name), "%lx-%lx", 2334 2357 (unsigned long)obj_buf, 2335 2358 (unsigned long)obj_buf_sz); 2336 - tmp_name[sizeof(tmp_name) - 1] = '\0'; 2337 2359 name = tmp_name; 2338 2360 } 2339 - pr_debug("loading object '%s' from buffer\n", 2340 - name); 2361 + pr_debug("loading object '%s' from buffer\n", name); 2341 2362 2342 2363 return __bpf_object__open(name, obj_buf, obj_buf_sz, true, true); 2343 2364 } ··· 2356 2381 return 0; 2357 2382 } 2358 2383 2359 - int bpf_object__load(struct bpf_object *obj) 2384 + int bpf_object__load_xattr(struct bpf_object_load_attr *attr) 2360 2385 { 2386 + struct bpf_object *obj; 2361 2387 int err; 2362 2388 2389 + if (!attr) 2390 + return -EINVAL; 2391 + obj = attr->obj; 2363 2392 if (!obj) 2364 2393 return -EINVAL; 2365 2394 ··· 2376 2397 2377 2398 
CHECK_ERR(bpf_object__create_maps(obj), err, out); 2378 2399 CHECK_ERR(bpf_object__relocate(obj), err, out); 2379 - CHECK_ERR(bpf_object__load_progs(obj), err, out); 2400 + CHECK_ERR(bpf_object__load_progs(obj, attr->log_level), err, out); 2380 2401 2381 2402 return 0; 2382 2403 out: 2383 2404 bpf_object__unload(obj); 2384 2405 pr_warning("failed to load object '%s'\n", obj->path); 2385 2406 return err; 2407 + } 2408 + 2409 + int bpf_object__load(struct bpf_object *obj) 2410 + { 2411 + struct bpf_object_load_attr attr = { 2412 + .obj = obj, 2413 + }; 2414 + 2415 + return bpf_object__load_xattr(&attr); 2386 2416 } 2387 2417 2388 2418 static int check_path(const char *path) ··· 3446 3458 3447 3459 long libbpf_get_error(const void *ptr) 3448 3460 { 3449 - if (IS_ERR(ptr)) 3450 - return PTR_ERR(ptr); 3451 - return 0; 3461 + return PTR_ERR_OR_ZERO(ptr); 3452 3462 } 3453 3463 3454 3464 int bpf_prog_load(const char *file, enum bpf_prog_type type, ··· 3507 3521 expected_attach_type); 3508 3522 3509 3523 prog->log_level = attr->log_level; 3524 + prog->prog_flags = attr->prog_flags; 3510 3525 if (!first_prog) 3511 3526 first_prog = prog; 3512 3527 }
+7
tools/lib/bpf/libbpf.h
··· 89 89 LIBBPF_API int bpf_object__pin(struct bpf_object *object, const char *path); 90 90 LIBBPF_API void bpf_object__close(struct bpf_object *object); 91 91 92 + struct bpf_object_load_attr { 93 + struct bpf_object *obj; 94 + int log_level; 95 + }; 96 + 92 97 /* Load/unload object into/from kernel */ 93 98 LIBBPF_API int bpf_object__load(struct bpf_object *obj); 99 + LIBBPF_API int bpf_object__load_xattr(struct bpf_object_load_attr *attr); 94 100 LIBBPF_API int bpf_object__unload(struct bpf_object *obj); 95 101 LIBBPF_API const char *bpf_object__name(struct bpf_object *obj); 96 102 LIBBPF_API unsigned int bpf_object__kversion(struct bpf_object *obj); ··· 326 320 enum bpf_attach_type expected_attach_type; 327 321 int ifindex; 328 322 int log_level; 323 + int prog_flags; 329 324 }; 330 325 331 326 LIBBPF_API int bpf_prog_load_xattr(const struct bpf_prog_load_attr *attr,
+9
tools/lib/bpf/libbpf.map
··· 164 164 bpf_map_freeze; 165 165 btf__finalize_data; 166 166 } LIBBPF_0.0.2; 167 + 168 + LIBBPF_0.0.4 { 169 + global: 170 + btf_dump__dump_type; 171 + btf_dump__free; 172 + btf_dump__new; 173 + btf__parse_elf; 174 + bpf_object__load_xattr; 175 + } LIBBPF_0.0.3;
+2
tools/lib/bpf/libbpf_internal.h
··· 9 9 #ifndef __LIBBPF_LIBBPF_INTERNAL_H 10 10 #define __LIBBPF_LIBBPF_INTERNAL_H 11 11 12 + #include "libbpf.h" 13 + 12 14 #define BTF_INFO_ENC(kind, kind_flag, vlen) \ 13 15 ((!!(kind_flag) << 31) | ((kind) << 24) | ((vlen) & BTF_MAX_VLEN)) 14 16 #define BTF_TYPE_ENC(name, info, size_or_type) (name), (info), (size_or_type)
+4
tools/testing/selftests/bpf/.gitignore
··· 22 22 get_cgroup_id_user 23 23 test_skb_cgroup_id_user 24 24 test_socket_cookie 25 + test_cgroup_attach 25 26 test_cgroup_storage 26 27 test_select_reuseport 27 28 test_flow_dissector ··· 36 35 alu32 37 36 libbpf.pc 38 37 libbpf.so.* 38 + test_hashmap 39 + test_btf_dump 40 + xdping
+11 -6
tools/testing/selftests/bpf/Makefile
··· 15 15 LLVM_OBJCOPY ?= llvm-objcopy 16 16 LLVM_READELF ?= llvm-readelf 17 17 BTF_PAHOLE ?= pahole 18 - CFLAGS += -Wall -O2 -I$(APIDIR) -I$(LIBDIR) -I$(BPFDIR) -I$(GENDIR) $(GENFLAGS) -I../../../include 18 + CFLAGS += -Wall -O2 -I$(APIDIR) -I$(LIBDIR) -I$(BPFDIR) -I$(GENDIR) $(GENFLAGS) -I../../../include \ 19 + -Dbpf_prog_load=bpf_prog_test_load \ 20 + -Dbpf_load_program=bpf_test_load_program 19 21 LDLIBS += -lcap -lelf -lrt -lpthread 20 22 21 23 # Order correspond to 'make run_tests' order ··· 25 23 test_align test_verifier_log test_dev_cgroup test_tcpbpf_user \ 26 24 test_sock test_btf test_sockmap test_lirc_mode2_user get_cgroup_id_user \ 27 25 test_socket_cookie test_cgroup_storage test_select_reuseport test_section_names \ 28 - test_netcnt test_tcpnotify_user test_sock_fields test_sysctl 26 + test_netcnt test_tcpnotify_user test_sock_fields test_sysctl test_hashmap \ 27 + test_btf_dump test_cgroup_attach xdping 29 28 30 29 BPF_OBJ_FILES = $(patsubst %.c,%.o, $(notdir $(wildcard progs/*.c))) 31 30 TEST_GEN_FILES = $(BPF_OBJ_FILES) ··· 57 54 test_lwt_ip_encap.sh \ 58 55 test_tcp_check_syncookie.sh \ 59 56 test_tc_tunnel.sh \ 60 - test_tc_edt.sh 57 + test_tc_edt.sh \ 58 + test_xdping.sh 61 59 62 60 TEST_PROGS_EXTENDED := with_addr.sh \ 63 61 with_tunnels.sh \ ··· 82 78 83 79 BPFOBJ := $(OUTPUT)/libbpf.a 84 80 85 - $(TEST_GEN_PROGS): $(BPFOBJ) 81 + $(TEST_GEN_PROGS): test_stub.o $(BPFOBJ) 86 82 87 - $(TEST_GEN_PROGS_EXTENDED): $(OUTPUT)/libbpf.a 83 + $(TEST_GEN_PROGS_EXTENDED): test_stub.o $(OUTPUT)/libbpf.a 88 84 89 85 $(OUTPUT)/test_dev_cgroup: cgroup_helpers.c 90 86 $(OUTPUT)/test_skb_cgroup_id_user: cgroup_helpers.c ··· 100 96 $(OUTPUT)/test_netcnt: cgroup_helpers.c 101 97 $(OUTPUT)/test_sock_fields: cgroup_helpers.c 102 98 $(OUTPUT)/test_sysctl: cgroup_helpers.c 99 + $(OUTPUT)/test_cgroup_attach: cgroup_helpers.c 103 100 104 101 .PHONY: force 105 102 ··· 181 176 $(ALU32_BUILD_DIR)/urandom_read 182 177 $(CC) $(TEST_PROGS_CFLAGS) $(CFLAGS) \ 183 178 -o 
$(ALU32_BUILD_DIR)/test_progs_32 \ 184 - test_progs.c trace_helpers.c prog_tests/*.c \ 179 + test_progs.c test_stub.c trace_helpers.c prog_tests/*.c \ 185 180 $(OUTPUT)/libbpf.a $(LDLIBS) 186 181 187 182 $(ALU32_BUILD_DIR)/test_progs_32: $(PROG_TESTS_H)
+9
tools/testing/selftests/bpf/bpf_helpers.h
··· 8 8 */ 9 9 #define SEC(NAME) __attribute__((section(NAME), used)) 10 10 11 + /* helper macro to print out debug messages */ 12 + #define bpf_printk(fmt, ...) \ 13 + ({ \ 14 + char ____fmt[] = fmt; \ 15 + bpf_trace_printk(____fmt, sizeof(____fmt), \ 16 + ##__VA_ARGS__); \ 17 + }) 18 + 11 19 /* helper functions called from eBPF programs written in C */ 12 20 static void *(*bpf_map_lookup_elem)(void *map, const void *key) = 13 21 (void *) BPF_FUNC_map_lookup_elem; ··· 224 216 (void *) BPF_FUNC_sk_storage_get; 225 217 static int (*bpf_sk_storage_delete)(void *map, struct bpf_sock *sk) = 226 218 (void *)BPF_FUNC_sk_storage_delete; 219 + static int (*bpf_send_signal)(unsigned sig) = (void *)BPF_FUNC_send_signal; 227 220 228 221 /* llvm builtin functions that eBPF C program may use to 229 222 * emit BPF_LD_ABS and BPF_LD_IND instructions
+57
tools/testing/selftests/bpf/cgroup_helpers.c
··· 34 34 CGROUP_WORK_DIR, path) 35 35 36 36 /** 37 + * enable_all_controllers() - Enable all available cgroup v2 controllers 38 + * 39 + * Enable all available cgroup v2 controllers in order to increase 40 + * the code coverage. 41 + * 42 + * If successful, 0 is returned. 43 + */ 44 + int enable_all_controllers(char *cgroup_path) 45 + { 46 + char path[PATH_MAX + 1]; 47 + char buf[PATH_MAX]; 48 + char *c, *c2; 49 + int fd, cfd; 50 + size_t len; 51 + 52 + snprintf(path, sizeof(path), "%s/cgroup.controllers", cgroup_path); 53 + fd = open(path, O_RDONLY); 54 + if (fd < 0) { 55 + log_err("Opening cgroup.controllers: %s", path); 56 + return 1; 57 + } 58 + 59 + len = read(fd, buf, sizeof(buf) - 1); 60 + if (len < 0) { 61 + close(fd); 62 + log_err("Reading cgroup.controllers: %s", path); 63 + return 1; 64 + } 65 + buf[len] = 0; 66 + close(fd); 67 + 68 + /* No controllers available? We're probably on cgroup v1. */ 69 + if (len == 0) 70 + return 0; 71 + 72 + snprintf(path, sizeof(path), "%s/cgroup.subtree_control", cgroup_path); 73 + cfd = open(path, O_RDWR); 74 + if (cfd < 0) { 75 + log_err("Opening cgroup.subtree_control: %s", path); 76 + return 1; 77 + } 78 + 79 + for (c = strtok_r(buf, " ", &c2); c; c = strtok_r(NULL, " ", &c2)) { 80 + if (dprintf(cfd, "+%s\n", c) <= 0) { 81 + log_err("Enabling controller %s: %s", c, path); 82 + close(cfd); 83 + return 1; 84 + } 85 + } 86 + close(cfd); 87 + return 0; 88 + } 89 + 90 + /** 37 91 * setup_cgroup_environment() - Setup the cgroup environment 38 92 * 39 93 * After calling this function, cleanup_cgroup_environment should be called ··· 124 70 log_err("mkdir cgroup work dir"); 125 71 return 1; 126 72 } 73 + 74 + if (enable_all_controllers(cgroup_workdir)) 75 + return 1; 127 76 128 77 return 0; 129 78 }
+19 -13
tools/testing/selftests/bpf/prog_tests/bpf_verif_scale.c
··· 12 12 return vfprintf(stderr, "%s", args); 13 13 } 14 14 15 - static int check_load(const char *file) 15 + static int check_load(const char *file, enum bpf_prog_type type) 16 16 { 17 17 struct bpf_prog_load_attr attr; 18 18 struct bpf_object *obj = NULL; ··· 20 20 21 21 memset(&attr, 0, sizeof(struct bpf_prog_load_attr)); 22 22 attr.file = file; 23 - attr.prog_type = BPF_PROG_TYPE_SCHED_CLS; 23 + attr.prog_type = type; 24 24 attr.log_level = 4; 25 + attr.prog_flags = BPF_F_TEST_RND_HI32; 25 26 err = bpf_prog_load_xattr(&attr, &obj, &prog_fd); 26 27 bpf_object__close(obj); 27 28 if (err) ··· 32 31 33 32 void test_bpf_verif_scale(void) 34 33 { 35 - const char *file1 = "./test_verif_scale1.o"; 36 - const char *file2 = "./test_verif_scale2.o"; 37 - const char *file3 = "./test_verif_scale3.o"; 38 - int err; 34 + const char *scale[] = { 35 + "./test_verif_scale1.o", "./test_verif_scale2.o", "./test_verif_scale3.o" 36 + }; 37 + const char *pyperf[] = { 38 + "./pyperf50.o", "./pyperf100.o", "./pyperf180.o" 39 + }; 40 + int err, i; 39 41 40 42 if (verifier_stats) 41 43 libbpf_set_print(libbpf_debug_print); 42 44 43 - err = check_load(file1); 44 - err |= check_load(file2); 45 - err |= check_load(file3); 46 - if (!err) 47 - printf("test_verif_scale:OK\n"); 48 - else 49 - printf("test_verif_scale:FAIL\n"); 45 + for (i = 0; i < ARRAY_SIZE(scale); i++) { 46 + err = check_load(scale[i], BPF_PROG_TYPE_SCHED_CLS); 47 + printf("test_scale:%s:%s\n", scale[i], err ? "FAIL" : "OK"); 48 + } 49 + 50 + for (i = 0; i < ARRAY_SIZE(pyperf); i++) { 51 + err = check_load(pyperf[i], BPF_PROG_TYPE_RAW_TRACEPOINT); 52 + printf("test_scale:%s:%s\n", pyperf[i], err ? "FAIL" : "OK"); 53 + } 50 54 }
+198
tools/testing/selftests/bpf/prog_tests/send_signal.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + #include <test_progs.h> 3 + 4 + static volatile int sigusr1_received = 0; 5 + 6 + static void sigusr1_handler(int signum) 7 + { 8 + sigusr1_received++; 9 + } 10 + 11 + static int test_send_signal_common(struct perf_event_attr *attr, 12 + int prog_type, 13 + const char *test_name) 14 + { 15 + int err = -1, pmu_fd, prog_fd, info_map_fd, status_map_fd; 16 + const char *file = "./test_send_signal_kern.o"; 17 + struct bpf_object *obj = NULL; 18 + int pipe_c2p[2], pipe_p2c[2]; 19 + __u32 key = 0, duration = 0; 20 + char buf[256]; 21 + pid_t pid; 22 + __u64 val; 23 + 24 + if (CHECK(pipe(pipe_c2p), test_name, 25 + "pipe pipe_c2p error: %s\n", strerror(errno))) 26 + goto no_fork_done; 27 + 28 + if (CHECK(pipe(pipe_p2c), test_name, 29 + "pipe pipe_p2c error: %s\n", strerror(errno))) { 30 + close(pipe_c2p[0]); 31 + close(pipe_c2p[1]); 32 + goto no_fork_done; 33 + } 34 + 35 + pid = fork(); 36 + if (CHECK(pid < 0, test_name, "fork error: %s\n", strerror(errno))) { 37 + close(pipe_c2p[0]); 38 + close(pipe_c2p[1]); 39 + close(pipe_p2c[0]); 40 + close(pipe_p2c[1]); 41 + goto no_fork_done; 42 + } 43 + 44 + if (pid == 0) { 45 + /* install signal handler and notify parent */ 46 + signal(SIGUSR1, sigusr1_handler); 47 + 48 + close(pipe_c2p[0]); /* close read */ 49 + close(pipe_p2c[1]); /* close write */ 50 + 51 + /* notify parent signal handler is installed */ 52 + write(pipe_c2p[1], buf, 1); 53 + 54 + /* make sure parent enabled bpf program to send_signal */ 55 + read(pipe_p2c[0], buf, 1); 56 + 57 + /* wait a little for signal handler */ 58 + sleep(1); 59 + 60 + if (sigusr1_received) 61 + write(pipe_c2p[1], "2", 1); 62 + else 63 + write(pipe_c2p[1], "0", 1); 64 + 65 + /* wait for parent notification and exit */ 66 + read(pipe_p2c[0], buf, 1); 67 + 68 + close(pipe_c2p[1]); 69 + close(pipe_p2c[0]); 70 + exit(0); 71 + } 72 + 73 + close(pipe_c2p[1]); /* close write */ 74 + close(pipe_p2c[0]); /* close read */ 75 + 76 + err = 
bpf_prog_load(file, prog_type, &obj, &prog_fd); 77 + if (CHECK(err < 0, test_name, "bpf_prog_load error: %s\n", 78 + strerror(errno))) 79 + goto prog_load_failure; 80 + 81 + pmu_fd = syscall(__NR_perf_event_open, attr, pid, -1, 82 + -1 /* group id */, 0 /* flags */); 83 + if (CHECK(pmu_fd < 0, test_name, "perf_event_open error: %s\n", 84 + strerror(errno))) { 85 + err = -1; 86 + goto close_prog; 87 + } 88 + 89 + err = ioctl(pmu_fd, PERF_EVENT_IOC_ENABLE, 0); 90 + if (CHECK(err < 0, test_name, "ioctl perf_event_ioc_enable error: %s\n", 91 + strerror(errno))) 92 + goto disable_pmu; 93 + 94 + err = ioctl(pmu_fd, PERF_EVENT_IOC_SET_BPF, prog_fd); 95 + if (CHECK(err < 0, test_name, "ioctl perf_event_ioc_set_bpf error: %s\n", 96 + strerror(errno))) 97 + goto disable_pmu; 98 + 99 + err = -1; 100 + info_map_fd = bpf_object__find_map_fd_by_name(obj, "info_map"); 101 + if (CHECK(info_map_fd < 0, test_name, "find map %s error\n", "info_map")) 102 + goto disable_pmu; 103 + 104 + status_map_fd = bpf_object__find_map_fd_by_name(obj, "status_map"); 105 + if (CHECK(status_map_fd < 0, test_name, "find map %s error\n", "status_map")) 106 + goto disable_pmu; 107 + 108 + /* wait until child signal handler installed */ 109 + read(pipe_c2p[0], buf, 1); 110 + 111 + /* trigger the bpf send_signal */ 112 + key = 0; 113 + val = (((__u64)(SIGUSR1)) << 32) | pid; 114 + bpf_map_update_elem(info_map_fd, &key, &val, 0); 115 + 116 + /* notify child that bpf program can send_signal now */ 117 + write(pipe_p2c[1], buf, 1); 118 + 119 + /* wait for result */ 120 + err = read(pipe_c2p[0], buf, 1); 121 + if (CHECK(err < 0, test_name, "reading pipe error: %s\n", strerror(errno))) 122 + goto disable_pmu; 123 + if (CHECK(err == 0, test_name, "reading pipe error: size 0\n")) { 124 + err = -1; 125 + goto disable_pmu; 126 + } 127 + 128 + err = CHECK(buf[0] != '2', test_name, "incorrect result\n"); 129 + 130 + /* notify child safe to exit */ 131 + write(pipe_p2c[1], buf, 1); 132 + 133 + disable_pmu: 134 + 
close(pmu_fd); 135 + close_prog: 136 + bpf_object__close(obj); 137 + prog_load_failure: 138 + close(pipe_c2p[0]); 139 + close(pipe_p2c[1]); 140 + wait(NULL); 141 + no_fork_done: 142 + return err; 143 + } 144 + 145 + static int test_send_signal_tracepoint(void) 146 + { 147 + const char *id_path = "/sys/kernel/debug/tracing/events/syscalls/sys_enter_nanosleep/id"; 148 + struct perf_event_attr attr = { 149 + .type = PERF_TYPE_TRACEPOINT, 150 + .sample_type = PERF_SAMPLE_RAW | PERF_SAMPLE_CALLCHAIN, 151 + .sample_period = 1, 152 + .wakeup_events = 1, 153 + }; 154 + __u32 duration = 0; 155 + int bytes, efd; 156 + char buf[256]; 157 + 158 + efd = open(id_path, O_RDONLY, 0); 159 + if (CHECK(efd < 0, "tracepoint", 160 + "open syscalls/sys_enter_nanosleep/id failure: %s\n", 161 + strerror(errno))) 162 + return -1; 163 + 164 + bytes = read(efd, buf, sizeof(buf)); 165 + close(efd); 166 + if (CHECK(bytes <= 0 || bytes >= sizeof(buf), "tracepoint", 167 + "read syscalls/sys_enter_nanosleep/id failure: %s\n", 168 + strerror(errno))) 169 + return -1; 170 + 171 + attr.config = strtol(buf, NULL, 0); 172 + 173 + return test_send_signal_common(&attr, BPF_PROG_TYPE_TRACEPOINT, "tracepoint"); 174 + } 175 + 176 + static int test_send_signal_nmi(void) 177 + { 178 + struct perf_event_attr attr = { 179 + .sample_freq = 50, 180 + .freq = 1, 181 + .type = PERF_TYPE_HARDWARE, 182 + .config = PERF_COUNT_HW_CPU_CYCLES, 183 + }; 184 + 185 + return test_send_signal_common(&attr, BPF_PROG_TYPE_PERF_EVENT, "perf_event"); 186 + } 187 + 188 + void test_send_signal(void) 189 + { 190 + int ret = 0; 191 + 192 + ret |= test_send_signal_tracepoint(); 193 + ret |= test_send_signal_nmi(); 194 + if (!ret) 195 + printf("test_send_signal:OK\n"); 196 + else 197 + printf("test_send_signal:FAIL\n"); 198 + }
+92
tools/testing/selftests/bpf/progs/btf_dump_test_case_bitfields.c
··· 1 + // SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) 2 + 3 + /* 4 + * BTF-to-C dumper tests for bitfield. 5 + * 6 + * Copyright (c) 2019 Facebook 7 + */ 8 + #include <stdbool.h> 9 + 10 + /* ----- START-EXPECTED-OUTPUT ----- */ 11 + /* 12 + *struct bitfields_only_mixed_types { 13 + * int a: 3; 14 + * long int b: 2; 15 + * _Bool c: 1; 16 + * enum { 17 + * A = 0, 18 + * B = 1, 19 + * } d: 1; 20 + * short e: 5; 21 + * int: 20; 22 + * unsigned int f: 30; 23 + *}; 24 + * 25 + */ 26 + /* ------ END-EXPECTED-OUTPUT ------ */ 27 + 28 + struct bitfields_only_mixed_types { 29 + int a: 3; 30 + long int b: 2; 31 + bool c: 1; /* it's really a _Bool type */ 32 + enum { 33 + A, /* A = 0, dumper is very explicit */ 34 + B, /* B = 1, same */ 35 + } d: 1; 36 + short e: 5; 37 + /* 20-bit padding here */ 38 + unsigned f: 30; /* this gets aligned on 4-byte boundary */ 39 + }; 40 + 41 + /* ----- START-EXPECTED-OUTPUT ----- */ 42 + /* 43 + *struct bitfield_mixed_with_others { 44 + * char: 4; 45 + * int a: 4; 46 + * short b; 47 + * long int c; 48 + * long int d: 8; 49 + * int e; 50 + * int f; 51 + *}; 52 + * 53 + */ 54 + /* ------ END-EXPECTED-OUTPUT ------ */ 55 + struct bitfield_mixed_with_others { 56 + long: 4; /* char is enough as a backing field */ 57 + int a: 4; 58 + /* 8-bit implicit padding */ 59 + short b; /* combined with previous bitfield */ 60 + /* 4 more bytes of implicit padding */ 61 + long c; 62 + long d: 8; 63 + /* 24 bits implicit padding */ 64 + int e; /* combined with previous bitfield */ 65 + int f; 66 + /* 4 bytes of padding */ 67 + }; 68 + 69 + /* ----- START-EXPECTED-OUTPUT ----- */ 70 + /* 71 + *struct bitfield_flushed { 72 + * int a: 4; 73 + * long: 60; 74 + * long int b: 16; 75 + *}; 76 + * 77 + */ 78 + /* ------ END-EXPECTED-OUTPUT ------ */ 79 + struct bitfield_flushed { 80 + int a: 4; 81 + long: 0; /* flush until next natural alignment boundary */ 82 + long b: 16; 83 + }; 84 + 85 + int f(struct { 86 + struct bitfields_only_mixed_types _1; 87 + struct 
bitfield_mixed_with_others _2; 88 + struct bitfield_flushed _3; 89 + } *_) 90 + { 91 + return 0; 92 + }
+35
tools/testing/selftests/bpf/progs/btf_dump_test_case_multidim.c
··· 1 + // SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) 2 + 3 + /* 4 + * BTF-to-C dumper test for multi-dimensional array output. 5 + * 6 + * Copyright (c) 2019 Facebook 7 + */ 8 + /* ----- START-EXPECTED-OUTPUT ----- */ 9 + typedef int arr_t[2]; 10 + 11 + typedef int multiarr_t[3][4][5]; 12 + 13 + typedef int *ptr_arr_t[6]; 14 + 15 + typedef int *ptr_multiarr_t[7][8][9][10]; 16 + 17 + typedef int * (*fn_ptr_arr_t[11])(); 18 + 19 + typedef int * (*fn_ptr_multiarr_t[12][13])(); 20 + 21 + struct root_struct { 22 + arr_t _1; 23 + multiarr_t _2; 24 + ptr_arr_t _3; 25 + ptr_multiarr_t _4; 26 + fn_ptr_arr_t _5; 27 + fn_ptr_multiarr_t _6; 28 + }; 29 + 30 + /* ------ END-EXPECTED-OUTPUT ------ */ 31 + 32 + int f(struct root_struct *s) 33 + { 34 + return 0; 35 + }
+73
tools/testing/selftests/bpf/progs/btf_dump_test_case_namespacing.c
··· 1 + // SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) 2 + 3 + /* 4 + * BTF-to-C dumper test validating no name versioning happens between 5 + * independent C namespaces (struct/union/enum vs typedef/enum values). 6 + * 7 + * Copyright (c) 2019 Facebook 8 + */ 9 + /* ----- START-EXPECTED-OUTPUT ----- */ 10 + struct S { 11 + int S; 12 + int U; 13 + }; 14 + 15 + typedef struct S S; 16 + 17 + union U { 18 + int S; 19 + int U; 20 + }; 21 + 22 + typedef union U U; 23 + 24 + enum E { 25 + V = 0, 26 + }; 27 + 28 + typedef enum E E; 29 + 30 + struct A {}; 31 + 32 + union B {}; 33 + 34 + enum C { 35 + A = 1, 36 + B = 2, 37 + C = 3, 38 + }; 39 + 40 + struct X {}; 41 + 42 + union Y {}; 43 + 44 + enum Z; 45 + 46 + typedef int X; 47 + 48 + typedef int Y; 49 + 50 + typedef int Z; 51 + 52 + /*------ END-EXPECTED-OUTPUT ------ */ 53 + 54 + int f(struct { 55 + struct S _1; 56 + S _2; 57 + union U _3; 58 + U _4; 59 + enum E _5; 60 + E _6; 61 + struct A a; 62 + union B b; 63 + enum C c; 64 + struct X x; 65 + union Y y; 66 + enum Z *z; 67 + X xx; 68 + Y yy; 69 + Z zz; 70 + } *_) 71 + { 72 + return 0; 73 + }
+63
tools/testing/selftests/bpf/progs/btf_dump_test_case_ordering.c
··· 1 + // SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) 2 + 3 + /* 4 + * BTF-to-C dumper test for topological sorting of dependent structs. 5 + * 6 + * Copyright (c) 2019 Facebook 7 + */ 8 + /* ----- START-EXPECTED-OUTPUT ----- */ 9 + struct s1 {}; 10 + 11 + struct s3; 12 + 13 + struct s4; 14 + 15 + struct s2 { 16 + struct s2 *s2; 17 + struct s3 *s3; 18 + struct s4 *s4; 19 + }; 20 + 21 + struct s3 { 22 + struct s1 s1; 23 + struct s2 s2; 24 + }; 25 + 26 + struct s4 { 27 + struct s1 s1; 28 + struct s3 s3; 29 + }; 30 + 31 + struct list_head { 32 + struct list_head *next; 33 + struct list_head *prev; 34 + }; 35 + 36 + struct hlist_node { 37 + struct hlist_node *next; 38 + struct hlist_node **pprev; 39 + }; 40 + 41 + struct hlist_head { 42 + struct hlist_node *first; 43 + }; 44 + 45 + struct callback_head { 46 + struct callback_head *next; 47 + void (*func)(struct callback_head *); 48 + }; 49 + 50 + struct root_struct { 51 + struct s4 s4; 52 + struct list_head l; 53 + struct hlist_node n; 54 + struct hlist_head h; 55 + struct callback_head cb; 56 + }; 57 + 58 + /*------ END-EXPECTED-OUTPUT ------ */ 59 + 60 + int f(struct root_struct *root) 61 + { 62 + return 0; 63 + }
+75
tools/testing/selftests/bpf/progs/btf_dump_test_case_packing.c
··· 1 + // SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) 2 + 3 + /* 4 + * BTF-to-C dumper tests for struct packing determination. 5 + * 6 + * Copyright (c) 2019 Facebook 7 + */ 8 + /* ----- START-EXPECTED-OUTPUT ----- */ 9 + struct packed_trailing_space { 10 + int a; 11 + short b; 12 + } __attribute__((packed)); 13 + 14 + struct non_packed_trailing_space { 15 + int a; 16 + short b; 17 + }; 18 + 19 + struct packed_fields { 20 + short a; 21 + int b; 22 + } __attribute__((packed)); 23 + 24 + struct non_packed_fields { 25 + short a; 26 + int b; 27 + }; 28 + 29 + struct nested_packed { 30 + char: 4; 31 + int a: 4; 32 + long int b; 33 + struct { 34 + char c; 35 + int d; 36 + } __attribute__((packed)) e; 37 + } __attribute__((packed)); 38 + 39 + union union_is_never_packed { 40 + int a: 4; 41 + char b; 42 + char c: 1; 43 + }; 44 + 45 + union union_does_not_need_packing { 46 + struct { 47 + long int a; 48 + int b; 49 + } __attribute__((packed)); 50 + int c; 51 + }; 52 + 53 + union jump_code_union { 54 + char code[5]; 55 + struct { 56 + char jump; 57 + int offset; 58 + } __attribute__((packed)); 59 + }; 60 + 61 + /*------ END-EXPECTED-OUTPUT ------ */ 62 + 63 + int f(struct { 64 + struct packed_trailing_space _1; 65 + struct non_packed_trailing_space _2; 66 + struct packed_fields _3; 67 + struct non_packed_fields _4; 68 + struct nested_packed _5; 69 + union union_is_never_packed _6; 70 + union union_does_not_need_packing _7; 71 + union jump_code_union _8; 72 + } *_) 73 + { 74 + return 0; 75 + }
+111
tools/testing/selftests/bpf/progs/btf_dump_test_case_padding.c
··· 1 + // SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) 2 + 3 + /* 4 + * BTF-to-C dumper tests for implicit and explicit padding between fields and 5 + * at the end of a struct. 6 + * 7 + * Copyright (c) 2019 Facebook 8 + */ 9 + /* ----- START-EXPECTED-OUTPUT ----- */ 10 + struct padded_implicitly { 11 + int a; 12 + long int b; 13 + char c; 14 + }; 15 + 16 + /* ------ END-EXPECTED-OUTPUT ------ */ 17 + 18 + /* ----- START-EXPECTED-OUTPUT ----- */ 19 + /* 20 + *struct padded_explicitly { 21 + * int a; 22 + * int: 32; 23 + * int b; 24 + *}; 25 + * 26 + */ 27 + /* ------ END-EXPECTED-OUTPUT ------ */ 28 + 29 + struct padded_explicitly { 30 + int a; 31 + int: 1; /* algo will explicitly pad with full 32 bits here */ 32 + int b; 33 + }; 34 + 35 + /* ----- START-EXPECTED-OUTPUT ----- */ 36 + /* 37 + *struct padded_a_lot { 38 + * int a; 39 + * long: 32; 40 + * long: 64; 41 + * long: 64; 42 + * int b; 43 + *}; 44 + * 45 + */ 46 + /* ------ END-EXPECTED-OUTPUT ------ */ 47 + 48 + struct padded_a_lot { 49 + int a; 50 + /* 32 bit of implicit padding here, which algo will make explicit */ 51 + long: 64; 52 + long: 64; 53 + int b; 54 + }; 55 + 56 + /* ----- START-EXPECTED-OUTPUT ----- */ 57 + /* 58 + *struct padded_cache_line { 59 + * int a; 60 + * long: 32; 61 + * long: 64; 62 + * long: 64; 63 + * long: 64; 64 + * int b; 65 + *}; 66 + * 67 + */ 68 + /* ------ END-EXPECTED-OUTPUT ------ */ 69 + 70 + struct padded_cache_line { 71 + int a; 72 + int b __attribute__((aligned(32))); 73 + }; 74 + 75 + /* ----- START-EXPECTED-OUTPUT ----- */ 76 + /* 77 + *struct zone_padding { 78 + * char x[0]; 79 + *}; 80 + * 81 + *struct zone { 82 + * int a; 83 + * short b; 84 + * short: 16; 85 + * struct zone_padding __pad__; 86 + *}; 87 + * 88 + */ 89 + /* ------ END-EXPECTED-OUTPUT ------ */ 90 + 91 + struct zone_padding { 92 + char x[0]; 93 + } __attribute__((__aligned__(8))); 94 + 95 + struct zone { 96 + int a; 97 + short b; 98 + short: 16; 99 + struct zone_padding __pad__; 100 + }; 101 + 
102 + int f(struct { 103 + struct padded_implicitly _1; 104 + struct padded_explicitly _2; 105 + struct padded_a_lot _3; 106 + struct padded_cache_line _4; 107 + struct zone _5; 108 + } *_) 109 + { 110 + return 0; 111 + }
+229
tools/testing/selftests/bpf/progs/btf_dump_test_case_syntax.c
··· 1 + // SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) 2 + 3 + /* 4 + * BTF-to-C dumper test for majority of C syntax quirks. 5 + * 6 + * Copyright (c) 2019 Facebook 7 + */ 8 + /* ----- START-EXPECTED-OUTPUT ----- */ 9 + enum e1 { 10 + A = 0, 11 + B = 1, 12 + }; 13 + 14 + enum e2 { 15 + C = 100, 16 + D = -100, 17 + E = 0, 18 + }; 19 + 20 + typedef enum e2 e2_t; 21 + 22 + typedef enum { 23 + F = 0, 24 + G = 1, 25 + H = 2, 26 + } e3_t; 27 + 28 + typedef int int_t; 29 + 30 + typedef volatile const int * volatile const crazy_ptr_t; 31 + 32 + typedef int *****we_need_to_go_deeper_ptr_t; 33 + 34 + typedef volatile const we_need_to_go_deeper_ptr_t * restrict * volatile * const * restrict volatile * restrict const * volatile const * restrict volatile const how_about_this_ptr_t; 35 + 36 + typedef int *ptr_arr_t[10]; 37 + 38 + typedef void (*fn_ptr1_t)(int); 39 + 40 + typedef void (*printf_fn_t)(const char *, ...); 41 + 42 + /* ------ END-EXPECTED-OUTPUT ------ */ 43 + /* 44 + * While previous function pointers are pretty trivial (C-syntax-level 45 + * trivial), the following are deciphered here for future generations: 46 + * 47 + * - `fn_ptr2_t`: function, taking anonymous struct as a first arg and pointer 48 + * to a function, that takes int and returns int, as a second arg; returning 49 + * a pointer to a const pointer to a char. Equivalent to: 50 + * typedef struct { int a; } s_t; 51 + * typedef int (*fn_t)(int); 52 + * typedef char * const * (*fn_ptr2_t)(s_t, fn_t); 53 + * 54 + * - `fn_complext_t`: pointer to a function returning struct and accepting 55 + * union and struct. All structs and enum are anonymous and defined inline. 56 + * 57 + * - `signal_t: pointer to a function accepting a pointer to a function as an 58 + * argument and returning pointer to a function as a result. 
Sane equivalent: 59 + * typedef void (*signal_handler_t)(int); 60 + * typedef signal_handler_t (*signal_ptr_t)(int, signal_handler_t); 61 + * 62 + * - fn_ptr_arr1_t: array of pointers to a function accepting pointer to 63 + * a pointer to an int and returning pointer to a char. Easy. 64 + * 65 + * - fn_ptr_arr2_t: array of const pointers to a function taking no arguments 66 + * and returning a const pointer to a function, that takes pointer to a 67 + * `int -> char *` function and returns pointer to a char. Equivalent: 68 + * typedef char * (*fn_input_t)(int); 69 + * typedef char * (*fn_output_outer_t)(fn_input_t); 70 + * typedef const fn_output_outer_t (* fn_output_inner_t)(); 71 + * typedef const fn_output_inner_t fn_ptr_arr2_t[5]; 72 + */ 73 + /* ----- START-EXPECTED-OUTPUT ----- */ 74 + typedef char * const * (*fn_ptr2_t)(struct { 75 + int a; 76 + }, int (*)(int)); 77 + 78 + typedef struct { 79 + int a; 80 + void (*b)(int, struct { 81 + int c; 82 + }, union { 83 + char d; 84 + int e[5]; 85 + }); 86 + } (*fn_complex_t)(union { 87 + void *f; 88 + char g[16]; 89 + }, struct { 90 + int h; 91 + }); 92 + 93 + typedef void (* (*signal_t)(int, void (*)(int)))(int); 94 + 95 + typedef char * (*fn_ptr_arr1_t[10])(int **); 96 + 97 + typedef char * (* const (* const fn_ptr_arr2_t[5])())(char * (*)(int)); 98 + 99 + struct struct_w_typedefs { 100 + int_t a; 101 + crazy_ptr_t b; 102 + we_need_to_go_deeper_ptr_t c; 103 + how_about_this_ptr_t d; 104 + ptr_arr_t e; 105 + fn_ptr1_t f; 106 + printf_fn_t g; 107 + fn_ptr2_t h; 108 + fn_complex_t i; 109 + signal_t j; 110 + fn_ptr_arr1_t k; 111 + fn_ptr_arr2_t l; 112 + }; 113 + 114 + typedef struct { 115 + int x; 116 + int y; 117 + int z; 118 + } anon_struct_t; 119 + 120 + struct struct_fwd; 121 + 122 + typedef struct struct_fwd struct_fwd_t; 123 + 124 + typedef struct struct_fwd *struct_fwd_ptr_t; 125 + 126 + union union_fwd; 127 + 128 + typedef union union_fwd union_fwd_t; 129 + 130 + typedef union union_fwd *union_fwd_ptr_t; 131 + 
132 + struct struct_empty {}; 133 + 134 + struct struct_simple { 135 + int a; 136 + char b; 137 + const int_t *p; 138 + struct struct_empty s; 139 + enum e2 e; 140 + enum { 141 + ANON_VAL1 = 1, 142 + ANON_VAL2 = 2, 143 + } f; 144 + int arr1[13]; 145 + enum e2 arr2[5]; 146 + }; 147 + 148 + union union_empty {}; 149 + 150 + union union_simple { 151 + void *ptr; 152 + int num; 153 + int_t num2; 154 + union union_empty u; 155 + }; 156 + 157 + struct struct_in_struct { 158 + struct struct_simple simple; 159 + union union_simple also_simple; 160 + struct { 161 + int a; 162 + } not_so_hard_as_well; 163 + union { 164 + int b; 165 + int c; 166 + } anon_union_is_good; 167 + struct { 168 + int d; 169 + int e; 170 + }; 171 + union { 172 + int f; 173 + int g; 174 + }; 175 + }; 176 + 177 + struct struct_with_embedded_stuff { 178 + int a; 179 + struct { 180 + int b; 181 + struct { 182 + struct struct_with_embedded_stuff *c; 183 + const char *d; 184 + } e; 185 + union { 186 + volatile long int f; 187 + void * restrict g; 188 + }; 189 + }; 190 + union { 191 + const int_t *h; 192 + void (*i)(char, int, void *); 193 + } j; 194 + enum { 195 + K = 100, 196 + L = 200, 197 + } m; 198 + char n[16]; 199 + struct { 200 + char o; 201 + int p; 202 + void (*q)(int); 203 + } r[5]; 204 + struct struct_in_struct s[10]; 205 + int t[11]; 206 + }; 207 + 208 + struct root_struct { 209 + enum e1 _1; 210 + enum e2 _2; 211 + e2_t _2_1; 212 + e3_t _2_2; 213 + struct struct_w_typedefs _3; 214 + anon_struct_t _7; 215 + struct struct_fwd *_8; 216 + struct_fwd_t *_9; 217 + struct_fwd_ptr_t _10; 218 + union union_fwd *_11; 219 + union_fwd_t *_12; 220 + union_fwd_ptr_t _13; 221 + struct struct_with_embedded_stuff _14; 222 + }; 223 + 224 + /* ------ END-EXPECTED-OUTPUT ------ */ 225 + 226 + int f(struct root_struct *s) 227 + { 228 + return 0; 229 + }
+268
tools/testing/selftests/bpf/progs/pyperf.h
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + // Copyright (c) 2019 Facebook 3 + #include <linux/sched.h> 4 + #include <linux/ptrace.h> 5 + #include <stdint.h> 6 + #include <stddef.h> 7 + #include <stdbool.h> 8 + #include <linux/bpf.h> 9 + #include "bpf_helpers.h" 10 + 11 + #define FUNCTION_NAME_LEN 64 12 + #define FILE_NAME_LEN 128 13 + #define TASK_COMM_LEN 16 14 + 15 + typedef struct { 16 + int PyThreadState_frame; 17 + int PyThreadState_thread; 18 + int PyFrameObject_back; 19 + int PyFrameObject_code; 20 + int PyFrameObject_lineno; 21 + int PyCodeObject_filename; 22 + int PyCodeObject_name; 23 + int String_data; 24 + int String_size; 25 + } OffsetConfig; 26 + 27 + typedef struct { 28 + uintptr_t current_state_addr; 29 + uintptr_t tls_key_addr; 30 + OffsetConfig offsets; 31 + bool use_tls; 32 + } PidData; 33 + 34 + typedef struct { 35 + uint32_t success; 36 + } Stats; 37 + 38 + typedef struct { 39 + char name[FUNCTION_NAME_LEN]; 40 + char file[FILE_NAME_LEN]; 41 + } Symbol; 42 + 43 + typedef struct { 44 + uint32_t pid; 45 + uint32_t tid; 46 + char comm[TASK_COMM_LEN]; 47 + int32_t kernel_stack_id; 48 + int32_t user_stack_id; 49 + bool thread_current; 50 + bool pthread_match; 51 + bool stack_complete; 52 + int16_t stack_len; 53 + int32_t stack[STACK_MAX_LEN]; 54 + 55 + int has_meta; 56 + int metadata; 57 + char dummy_safeguard; 58 + } Event; 59 + 60 + 61 + struct bpf_elf_map { 62 + __u32 type; 63 + __u32 size_key; 64 + __u32 size_value; 65 + __u32 max_elem; 66 + __u32 flags; 67 + }; 68 + 69 + typedef int pid_t; 70 + 71 + typedef struct { 72 + void* f_back; // PyFrameObject.f_back, previous frame 73 + void* f_code; // PyFrameObject.f_code, pointer to PyCodeObject 74 + void* co_filename; // PyCodeObject.co_filename 75 + void* co_name; // PyCodeObject.co_name 76 + } FrameData; 77 + 78 + static inline __attribute__((__always_inline__)) void* 79 + get_thread_state(void* tls_base, PidData* pidData) 80 + { 81 + void* thread_state; 82 + int key; 83 + 84 + 
bpf_probe_read(&key, sizeof(key), (void*)(long)pidData->tls_key_addr); 85 + bpf_probe_read(&thread_state, sizeof(thread_state), 86 + tls_base + 0x310 + key * 0x10 + 0x08); 87 + return thread_state; 88 + } 89 + 90 + static inline __attribute__((__always_inline__)) bool 91 + get_frame_data(void* frame_ptr, PidData* pidData, FrameData* frame, Symbol* symbol) 92 + { 93 + // read data from PyFrameObject 94 + bpf_probe_read(&frame->f_back, 95 + sizeof(frame->f_back), 96 + frame_ptr + pidData->offsets.PyFrameObject_back); 97 + bpf_probe_read(&frame->f_code, 98 + sizeof(frame->f_code), 99 + frame_ptr + pidData->offsets.PyFrameObject_code); 100 + 101 + // read data from PyCodeObject 102 + if (!frame->f_code) 103 + return false; 104 + bpf_probe_read(&frame->co_filename, 105 + sizeof(frame->co_filename), 106 + frame->f_code + pidData->offsets.PyCodeObject_filename); 107 + bpf_probe_read(&frame->co_name, 108 + sizeof(frame->co_name), 109 + frame->f_code + pidData->offsets.PyCodeObject_name); 110 + // read actual names into symbol 111 + if (frame->co_filename) 112 + bpf_probe_read_str(&symbol->file, 113 + sizeof(symbol->file), 114 + frame->co_filename + pidData->offsets.String_data); 115 + if (frame->co_name) 116 + bpf_probe_read_str(&symbol->name, 117 + sizeof(symbol->name), 118 + frame->co_name + pidData->offsets.String_data); 119 + return true; 120 + } 121 + 122 + struct bpf_elf_map SEC("maps") pidmap = { 123 + .type = BPF_MAP_TYPE_HASH, 124 + .size_key = sizeof(int), 125 + .size_value = sizeof(PidData), 126 + .max_elem = 1, 127 + }; 128 + 129 + struct bpf_elf_map SEC("maps") eventmap = { 130 + .type = BPF_MAP_TYPE_HASH, 131 + .size_key = sizeof(int), 132 + .size_value = sizeof(Event), 133 + .max_elem = 1, 134 + }; 135 + 136 + struct bpf_elf_map SEC("maps") symbolmap = { 137 + .type = BPF_MAP_TYPE_HASH, 138 + .size_key = sizeof(Symbol), 139 + .size_value = sizeof(int), 140 + .max_elem = 1, 141 + }; 142 + 143 + struct bpf_elf_map SEC("maps") statsmap = { 144 + .type = 
BPF_MAP_TYPE_ARRAY, 145 + .size_key = sizeof(Stats), 146 + .size_value = sizeof(int), 147 + .max_elem = 1, 148 + }; 149 + 150 + struct bpf_elf_map SEC("maps") perfmap = { 151 + .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY, 152 + .size_key = sizeof(int), 153 + .size_value = sizeof(int), 154 + .max_elem = 32, 155 + }; 156 + 157 + struct bpf_elf_map SEC("maps") stackmap = { 158 + .type = BPF_MAP_TYPE_STACK_TRACE, 159 + .size_key = sizeof(int), 160 + .size_value = sizeof(long long) * 127, 161 + .max_elem = 1000, 162 + }; 163 + 164 + static inline __attribute__((__always_inline__)) int __on_event(struct pt_regs *ctx) 165 + { 166 + uint64_t pid_tgid = bpf_get_current_pid_tgid(); 167 + pid_t pid = (pid_t)(pid_tgid >> 32); 168 + PidData* pidData = bpf_map_lookup_elem(&pidmap, &pid); 169 + if (!pidData) 170 + return 0; 171 + 172 + int zero = 0; 173 + Event* event = bpf_map_lookup_elem(&eventmap, &zero); 174 + if (!event) 175 + return 0; 176 + 177 + event->pid = pid; 178 + 179 + event->tid = (pid_t)pid_tgid; 180 + bpf_get_current_comm(&event->comm, sizeof(event->comm)); 181 + 182 + event->user_stack_id = bpf_get_stackid(ctx, &stackmap, BPF_F_USER_STACK); 183 + event->kernel_stack_id = bpf_get_stackid(ctx, &stackmap, 0); 184 + 185 + void* thread_state_current = (void*)0; 186 + bpf_probe_read(&thread_state_current, 187 + sizeof(thread_state_current), 188 + (void*)(long)pidData->current_state_addr); 189 + 190 + struct task_struct* task = (struct task_struct*)bpf_get_current_task(); 191 + void* tls_base = (void*)task; 192 + 193 + void* thread_state = pidData->use_tls ? 
get_thread_state(tls_base, pidData) 194 + : thread_state_current; 195 + event->thread_current = thread_state == thread_state_current; 196 + 197 + if (pidData->use_tls) { 198 + uint64_t pthread_created; 199 + uint64_t pthread_self; 200 + bpf_probe_read(&pthread_self, sizeof(pthread_self), tls_base + 0x10); 201 + 202 + bpf_probe_read(&pthread_created, 203 + sizeof(pthread_created), 204 + thread_state + pidData->offsets.PyThreadState_thread); 205 + event->pthread_match = pthread_created == pthread_self; 206 + } else { 207 + event->pthread_match = 1; 208 + } 209 + 210 + if (event->pthread_match || !pidData->use_tls) { 211 + void* frame_ptr; 212 + FrameData frame; 213 + Symbol sym = {}; 214 + int cur_cpu = bpf_get_smp_processor_id(); 215 + 216 + bpf_probe_read(&frame_ptr, 217 + sizeof(frame_ptr), 218 + thread_state + pidData->offsets.PyThreadState_frame); 219 + 220 + int32_t* symbol_counter = bpf_map_lookup_elem(&symbolmap, &sym); 221 + if (symbol_counter == NULL) 222 + return 0; 223 + #pragma unroll 224 + /* Unwind python stack */ 225 + for (int i = 0; i < STACK_MAX_LEN; ++i) { 226 + if (frame_ptr && get_frame_data(frame_ptr, pidData, &frame, &sym)) { 227 + int32_t new_symbol_id = *symbol_counter * 64 + cur_cpu; 228 + int32_t *symbol_id = bpf_map_lookup_elem(&symbolmap, &sym); 229 + if (!symbol_id) { 230 + bpf_map_update_elem(&symbolmap, &sym, &zero, 0); 231 + symbol_id = bpf_map_lookup_elem(&symbolmap, &sym); 232 + if (!symbol_id) 233 + return 0; 234 + } 235 + if (*symbol_id == new_symbol_id) 236 + (*symbol_counter)++; 237 + event->stack[i] = *symbol_id; 238 + event->stack_len = i + 1; 239 + frame_ptr = frame.f_back; 240 + } 241 + } 242 + event->stack_complete = frame_ptr == NULL; 243 + } else { 244 + event->stack_complete = 1; 245 + } 246 + 247 + Stats* stats = bpf_map_lookup_elem(&statsmap, &zero); 248 + if (stats) 249 + stats->success++; 250 + 251 + event->has_meta = 0; 252 + bpf_perf_event_output(ctx, &perfmap, 0, event, offsetof(Event, metadata)); 253 + return 0; 
254 + } 255 + 256 + SEC("raw_tracepoint/kfree_skb") 257 + int on_event(struct pt_regs* ctx) 258 + { 259 + int i, ret = 0; 260 + ret |= __on_event(ctx); 261 + ret |= __on_event(ctx); 262 + ret |= __on_event(ctx); 263 + ret |= __on_event(ctx); 264 + ret |= __on_event(ctx); 265 + return ret; 266 + } 267 + 268 + char _license[] SEC("license") = "GPL";
+4
tools/testing/selftests/bpf/progs/pyperf100.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + // Copyright (c) 2019 Facebook 3 + #define STACK_MAX_LEN 100 4 + #include "pyperf.h"
+4
tools/testing/selftests/bpf/progs/pyperf180.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + // Copyright (c) 2019 Facebook 3 + #define STACK_MAX_LEN 180 4 + #include "pyperf.h"
+4
tools/testing/selftests/bpf/progs/pyperf50.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + // Copyright (c) 2019 Facebook 3 + #define STACK_MAX_LEN 50 4 + #include "pyperf.h"
-7
tools/testing/selftests/bpf/progs/sockmap_parse_prog.c
··· 5 5 6 6 int _version SEC("version") = 1; 7 7 8 - #define bpf_printk(fmt, ...) \ 9 - ({ \ 10 - char ____fmt[] = fmt; \ 11 - bpf_trace_printk(____fmt, sizeof(____fmt), \ 12 - ##__VA_ARGS__); \ 13 - }) 14 - 15 8 SEC("sk_skb1") 16 9 int bpf_prog1(struct __sk_buff *skb) 17 10 {
-7
tools/testing/selftests/bpf/progs/sockmap_tcp_msg_prog.c
··· 5 5 6 6 int _version SEC("version") = 1; 7 7 8 - #define bpf_printk(fmt, ...) \ 9 - ({ \ 10 - char ____fmt[] = fmt; \ 11 - bpf_trace_printk(____fmt, sizeof(____fmt), \ 12 - ##__VA_ARGS__); \ 13 - }) 14 - 15 8 SEC("sk_msg1") 16 9 int bpf_prog1(struct sk_msg_md *msg) 17 10 {
-7
tools/testing/selftests/bpf/progs/sockmap_verdict_prog.c
··· 5 5 6 6 int _version SEC("version") = 1; 7 7 8 - #define bpf_printk(fmt, ...) \ 9 - ({ \ 10 - char ____fmt[] = fmt; \ 11 - bpf_trace_printk(____fmt, sizeof(____fmt), \ 12 - ##__VA_ARGS__); \ 13 - }) 14 - 15 8 struct bpf_map_def SEC("maps") sock_map_rx = { 16 9 .type = BPF_MAP_TYPE_SOCKMAP, 17 10 .key_size = sizeof(int),
-7
tools/testing/selftests/bpf/progs/test_lwt_seg6local.c
··· 6 6 #include "bpf_helpers.h" 7 7 #include "bpf_endian.h" 8 8 9 - #define bpf_printk(fmt, ...) \ 10 - ({ \ 11 - char ____fmt[] = fmt; \ 12 - bpf_trace_printk(____fmt, sizeof(____fmt), \ 13 - ##__VA_ARGS__); \ 14 - }) 15 - 16 9 /* Packet parsing state machine helpers. */ 17 10 #define cursor_advance(_cursor, _len) \ 18 11 ({ void *_tmp = _cursor; _cursor += _len; _tmp; })
+51
tools/testing/selftests/bpf/progs/test_send_signal_kern.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + // Copyright (c) 2019 Facebook 3 + #include <linux/bpf.h> 4 + #include <linux/version.h> 5 + #include "bpf_helpers.h" 6 + 7 + struct bpf_map_def SEC("maps") info_map = { 8 + .type = BPF_MAP_TYPE_ARRAY, 9 + .key_size = sizeof(__u32), 10 + .value_size = sizeof(__u64), 11 + .max_entries = 1, 12 + }; 13 + 14 + BPF_ANNOTATE_KV_PAIR(info_map, __u32, __u64); 15 + 16 + struct bpf_map_def SEC("maps") status_map = { 17 + .type = BPF_MAP_TYPE_ARRAY, 18 + .key_size = sizeof(__u32), 19 + .value_size = sizeof(__u64), 20 + .max_entries = 1, 21 + }; 22 + 23 + BPF_ANNOTATE_KV_PAIR(status_map, __u32, __u64); 24 + 25 + SEC("send_signal_demo") 26 + int bpf_send_signal_test(void *ctx) 27 + { 28 + __u64 *info_val, *status_val; 29 + __u32 key = 0, pid, sig; 30 + int ret; 31 + 32 + status_val = bpf_map_lookup_elem(&status_map, &key); 33 + if (!status_val || *status_val != 0) 34 + return 0; 35 + 36 + info_val = bpf_map_lookup_elem(&info_map, &key); 37 + if (!info_val || *info_val == 0) 38 + return 0; 39 + 40 + sig = *info_val >> 32; 41 + pid = *info_val & 0xffffFFFF; 42 + 43 + if ((bpf_get_current_pid_tgid() >> 32) == pid) { 44 + ret = bpf_send_signal(sig); 45 + if (ret == 0) 46 + *status_val = 1; 47 + } 48 + 49 + return 0; 50 + } 51 + char __license[] SEC("license") = "GPL";
-7
tools/testing/selftests/bpf/progs/test_xdp_noinline.c
··· 15 15 #include <linux/udp.h> 16 16 #include "bpf_helpers.h" 17 17 18 - #define bpf_printk(fmt, ...) \ 19 - ({ \ 20 - char ____fmt[] = fmt; \ 21 - bpf_trace_printk(____fmt, sizeof(____fmt), \ 22 - ##__VA_ARGS__); \ 23 - }) 24 - 25 18 static __u32 rol32(__u32 word, unsigned int shift) 26 19 { 27 20 return (word << shift) | (word >> ((-shift) & 31));
+184
tools/testing/selftests/bpf/progs/xdping_kern.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved. */ 3 + 4 + #define KBUILD_MODNAME "foo" 5 + #include <stddef.h> 6 + #include <string.h> 7 + #include <linux/bpf.h> 8 + #include <linux/icmp.h> 9 + #include <linux/in.h> 10 + #include <linux/if_ether.h> 11 + #include <linux/if_packet.h> 12 + #include <linux/if_vlan.h> 13 + #include <linux/ip.h> 14 + 15 + #include "bpf_helpers.h" 16 + #include "bpf_endian.h" 17 + 18 + #include "xdping.h" 19 + 20 + struct bpf_map_def SEC("maps") ping_map = { 21 + .type = BPF_MAP_TYPE_HASH, 22 + .key_size = sizeof(__u32), 23 + .value_size = sizeof(struct pinginfo), 24 + .max_entries = 256, 25 + }; 26 + 27 + static __always_inline void swap_src_dst_mac(void *data) 28 + { 29 + unsigned short *p = data; 30 + unsigned short dst[3]; 31 + 32 + dst[0] = p[0]; 33 + dst[1] = p[1]; 34 + dst[2] = p[2]; 35 + p[0] = p[3]; 36 + p[1] = p[4]; 37 + p[2] = p[5]; 38 + p[3] = dst[0]; 39 + p[4] = dst[1]; 40 + p[5] = dst[2]; 41 + } 42 + 43 + static __always_inline __u16 csum_fold_helper(__wsum sum) 44 + { 45 + sum = (sum & 0xffff) + (sum >> 16); 46 + return ~((sum & 0xffff) + (sum >> 16)); 47 + } 48 + 49 + static __always_inline __u16 ipv4_csum(void *data_start, int data_size) 50 + { 51 + __wsum sum; 52 + 53 + sum = bpf_csum_diff(0, 0, data_start, data_size, 0); 54 + return csum_fold_helper(sum); 55 + } 56 + 57 + #define ICMP_ECHO_LEN 64 58 + 59 + static __always_inline int icmp_check(struct xdp_md *ctx, int type) 60 + { 61 + void *data_end = (void *)(long)ctx->data_end; 62 + void *data = (void *)(long)ctx->data; 63 + struct ethhdr *eth = data; 64 + struct icmphdr *icmph; 65 + struct iphdr *iph; 66 + 67 + if (data + sizeof(*eth) + sizeof(*iph) + ICMP_ECHO_LEN > data_end) 68 + return XDP_PASS; 69 + 70 + if (eth->h_proto != bpf_htons(ETH_P_IP)) 71 + return XDP_PASS; 72 + 73 + iph = data + sizeof(*eth); 74 + 75 + if (iph->protocol != IPPROTO_ICMP) 76 + return XDP_PASS; 77 + 78 + if 
(bpf_ntohs(iph->tot_len) - sizeof(*iph) != ICMP_ECHO_LEN) 79 + return XDP_PASS; 80 + 81 + icmph = data + sizeof(*eth) + sizeof(*iph); 82 + 83 + if (icmph->type != type) 84 + return XDP_PASS; 85 + 86 + return XDP_TX; 87 + } 88 + 89 + SEC("xdpclient") 90 + int xdping_client(struct xdp_md *ctx) 91 + { 92 + void *data_end = (void *)(long)ctx->data_end; 93 + void *data = (void *)(long)ctx->data; 94 + struct pinginfo *pinginfo = NULL; 95 + struct ethhdr *eth = data; 96 + struct icmphdr *icmph; 97 + struct iphdr *iph; 98 + __u64 recvtime; 99 + __be32 raddr; 100 + __be16 seq; 101 + int ret; 102 + __u8 i; 103 + 104 + ret = icmp_check(ctx, ICMP_ECHOREPLY); 105 + 106 + if (ret != XDP_TX) 107 + return ret; 108 + 109 + iph = data + sizeof(*eth); 110 + icmph = data + sizeof(*eth) + sizeof(*iph); 111 + raddr = iph->saddr; 112 + 113 + /* Record time reply received. */ 114 + recvtime = bpf_ktime_get_ns(); 115 + pinginfo = bpf_map_lookup_elem(&ping_map, &raddr); 116 + if (!pinginfo || pinginfo->seq != icmph->un.echo.sequence) 117 + return XDP_PASS; 118 + 119 + if (pinginfo->start) { 120 + #pragma clang loop unroll(full) 121 + for (i = 0; i < XDPING_MAX_COUNT; i++) { 122 + if (pinginfo->times[i] == 0) 123 + break; 124 + } 125 + /* verifier is fussy here... */ 126 + if (i < XDPING_MAX_COUNT) { 127 + pinginfo->times[i] = recvtime - 128 + pinginfo->start; 129 + pinginfo->start = 0; 130 + i++; 131 + } 132 + /* No more space for values? */ 133 + if (i == pinginfo->count || i == XDPING_MAX_COUNT) 134 + return XDP_PASS; 135 + } 136 + 137 + /* Now convert reply back into echo request. 
*/ 138 + swap_src_dst_mac(data); 139 + iph->saddr = iph->daddr; 140 + iph->daddr = raddr; 141 + icmph->type = ICMP_ECHO; 142 + seq = bpf_htons(bpf_ntohs(icmph->un.echo.sequence) + 1); 143 + icmph->un.echo.sequence = seq; 144 + icmph->checksum = 0; 145 + icmph->checksum = ipv4_csum(icmph, ICMP_ECHO_LEN); 146 + 147 + pinginfo->seq = seq; 148 + pinginfo->start = bpf_ktime_get_ns(); 149 + 150 + return XDP_TX; 151 + } 152 + 153 + SEC("xdpserver") 154 + int xdping_server(struct xdp_md *ctx) 155 + { 156 + void *data_end = (void *)(long)ctx->data_end; 157 + void *data = (void *)(long)ctx->data; 158 + struct ethhdr *eth = data; 159 + struct icmphdr *icmph; 160 + struct iphdr *iph; 161 + __be32 raddr; 162 + int ret; 163 + 164 + ret = icmp_check(ctx, ICMP_ECHO); 165 + 166 + if (ret != XDP_TX) 167 + return ret; 168 + 169 + iph = data + sizeof(*eth); 170 + icmph = data + sizeof(*eth) + sizeof(*iph); 171 + raddr = iph->saddr; 172 + 173 + /* Now convert request into echo reply. */ 174 + swap_src_dst_mac(data); 175 + iph->saddr = iph->daddr; 176 + iph->daddr = raddr; 177 + icmph->type = ICMP_ECHOREPLY; 178 + icmph->checksum = 0; 179 + icmph->checksum = ipv4_csum(icmph, ICMP_ECHO_LEN); 180 + 181 + return XDP_TX; 182 + } 183 + 184 + char _license[] SEC("license") = "GPL";
+13 -58
tools/testing/selftests/bpf/test_btf.c
··· 4025 4025 }, 4026 4026 }; 4027 4027 4028 - static int file_has_btf_elf(const char *fn, bool *has_btf_ext) 4029 - { 4030 - Elf_Scn *scn = NULL; 4031 - GElf_Ehdr ehdr; 4032 - int ret = 0; 4033 - int elf_fd; 4034 - Elf *elf; 4035 - 4036 - if (CHECK(elf_version(EV_CURRENT) == EV_NONE, 4037 - "elf_version(EV_CURRENT) == EV_NONE")) 4038 - return -1; 4039 - 4040 - elf_fd = open(fn, O_RDONLY); 4041 - if (CHECK(elf_fd == -1, "open(%s): errno:%d", fn, errno)) 4042 - return -1; 4043 - 4044 - elf = elf_begin(elf_fd, ELF_C_READ, NULL); 4045 - if (CHECK(!elf, "elf_begin(%s): %s", fn, elf_errmsg(elf_errno()))) { 4046 - ret = -1; 4047 - goto done; 4048 - } 4049 - 4050 - if (CHECK(!gelf_getehdr(elf, &ehdr), "!gelf_getehdr(%s)", fn)) { 4051 - ret = -1; 4052 - goto done; 4053 - } 4054 - 4055 - while ((scn = elf_nextscn(elf, scn))) { 4056 - const char *sh_name; 4057 - GElf_Shdr sh; 4058 - 4059 - if (CHECK(gelf_getshdr(scn, &sh) != &sh, 4060 - "file:%s gelf_getshdr != &sh", fn)) { 4061 - ret = -1; 4062 - goto done; 4063 - } 4064 - 4065 - sh_name = elf_strptr(elf, ehdr.e_shstrndx, sh.sh_name); 4066 - if (!strcmp(sh_name, BTF_ELF_SEC)) 4067 - ret = 1; 4068 - if (!strcmp(sh_name, BTF_EXT_ELF_SEC)) 4069 - *has_btf_ext = true; 4070 - } 4071 - 4072 - done: 4073 - close(elf_fd); 4074 - elf_end(elf); 4075 - return ret; 4076 - } 4077 - 4078 4028 static int do_test_file(unsigned int test_num) 4079 4029 { 4080 4030 const struct btf_file_test *test = &file_tests[test_num - 1]; 4081 4031 const char *expected_fnames[] = {"_dummy_tracepoint", 4082 4032 "test_long_fname_1", 4083 4033 "test_long_fname_2"}; 4034 + struct btf_ext *btf_ext = NULL; 4084 4035 struct bpf_prog_info info = {}; 4085 4036 struct bpf_object *obj = NULL; 4086 4037 struct bpf_func_info *finfo; ··· 4046 4095 fprintf(stderr, "BTF libbpf test[%u] (%s): ", test_num, 4047 4096 test->file); 4048 4097 4049 - err = file_has_btf_elf(test->file, &has_btf_ext); 4050 - if (err == -1) 4051 - return err; 4052 - 4053 - if (err == 0) { 4054 - 
fprintf(stderr, "SKIP. No ELF %s found", BTF_ELF_SEC); 4055 - skip_cnt++; 4056 - return 0; 4098 + btf = btf__parse_elf(test->file, &btf_ext); 4099 + if (IS_ERR(btf)) { 4100 + if (PTR_ERR(btf) == -ENOENT) { 4101 + fprintf(stderr, "SKIP. No ELF %s found", BTF_ELF_SEC); 4102 + skip_cnt++; 4103 + return 0; 4104 + } 4105 + return PTR_ERR(btf); 4057 4106 } 4107 + btf__free(btf); 4108 + 4109 + has_btf_ext = btf_ext != NULL; 4110 + btf_ext__free(btf_ext); 4058 4111 4059 4112 obj = bpf_object__open(test->file); 4060 4113 if (CHECK(IS_ERR(obj), "obj: %ld", PTR_ERR(obj)))
+143
tools/testing/selftests/bpf/test_btf_dump.c
··· 1 + #include <stdio.h> 2 + #include <stdlib.h> 3 + #include <string.h> 4 + #include <unistd.h> 5 + #include <errno.h> 6 + #include <linux/err.h> 7 + #include <btf.h> 8 + 9 + #define CHECK(condition, format...) ({ \ 10 + int __ret = !!(condition); \ 11 + if (__ret) { \ 12 + fprintf(stderr, "%s:%d:FAIL ", __func__, __LINE__); \ 13 + fprintf(stderr, format); \ 14 + } \ 15 + __ret; \ 16 + }) 17 + 18 + void btf_dump_printf(void *ctx, const char *fmt, va_list args) 19 + { 20 + vfprintf(ctx, fmt, args); 21 + } 22 + 23 + struct btf_dump_test_case { 24 + const char *name; 25 + struct btf_dump_opts opts; 26 + } btf_dump_test_cases[] = { 27 + {.name = "btf_dump_test_case_syntax", .opts = {}}, 28 + {.name = "btf_dump_test_case_ordering", .opts = {}}, 29 + {.name = "btf_dump_test_case_padding", .opts = {}}, 30 + {.name = "btf_dump_test_case_packing", .opts = {}}, 31 + {.name = "btf_dump_test_case_bitfields", .opts = {}}, 32 + {.name = "btf_dump_test_case_multidim", .opts = {}}, 33 + {.name = "btf_dump_test_case_namespacing", .opts = {}}, 34 + }; 35 + 36 + static int btf_dump_all_types(const struct btf *btf, 37 + const struct btf_dump_opts *opts) 38 + { 39 + size_t type_cnt = btf__get_nr_types(btf); 40 + struct btf_dump *d; 41 + int err = 0, id; 42 + 43 + d = btf_dump__new(btf, NULL, opts, btf_dump_printf); 44 + if (IS_ERR(d)) 45 + return PTR_ERR(d); 46 + 47 + for (id = 1; id <= type_cnt; id++) { 48 + err = btf_dump__dump_type(d, id); 49 + if (err) 50 + goto done; 51 + } 52 + 53 + done: 54 + btf_dump__free(d); 55 + return err; 56 + } 57 + 58 + int test_btf_dump_case(int n, struct btf_dump_test_case *test_case) 59 + { 60 + char test_file[256], out_file[256], diff_cmd[1024]; 61 + struct btf *btf = NULL; 62 + int err = 0, fd = -1; 63 + FILE *f = NULL; 64 + 65 + fprintf(stderr, "Test case #%d (%s): ", n, test_case->name); 66 + 67 + snprintf(test_file, sizeof(test_file), "%s.o", test_case->name); 68 + 69 + btf = btf__parse_elf(test_file, NULL); 70 + if (CHECK(IS_ERR(btf), 71 + 
"failed to load test BTF: %ld\n", PTR_ERR(btf))) { 72 + err = -PTR_ERR(btf); 73 + btf = NULL; 74 + goto done; 75 + } 76 + 77 + snprintf(out_file, sizeof(out_file), 78 + "/tmp/%s.output.XXXXXX", test_case->name); 79 + fd = mkstemp(out_file); 80 + if (CHECK(fd < 0, "failed to create temp output file: %d\n", fd)) { 81 + err = fd; 82 + goto done; 83 + } 84 + f = fdopen(fd, "w"); 85 + if (CHECK(f == NULL, "failed to open temp output file: %s(%d)\n", 86 + strerror(errno), errno)) { 87 + close(fd); 88 + goto done; 89 + } 90 + 91 + test_case->opts.ctx = f; 92 + err = btf_dump_all_types(btf, &test_case->opts); 93 + fclose(f); 94 + close(fd); 95 + if (CHECK(err, "failure during C dumping: %d\n", err)) { 96 + goto done; 97 + } 98 + 99 + snprintf(test_file, sizeof(test_file), "progs/%s.c", test_case->name); 100 + /* 101 + * Diff test output and expected test output, contained between 102 + * START-EXPECTED-OUTPUT and END-EXPECTED-OUTPUT lines in test case. 103 + * For expected output lines, everything before '*' is stripped out. 104 + * Also lines containing comment start and comment end markers are 105 + * ignored. 
106 + */ 107 + snprintf(diff_cmd, sizeof(diff_cmd), 108 + "awk '/START-EXPECTED-OUTPUT/{out=1;next} " 109 + "/END-EXPECTED-OUTPUT/{out=0} " 110 + "/\\/\\*|\\*\\//{next} " /* ignore comment start/end lines */ 111 + "out {sub(/^[ \\t]*\\*/, \"\"); print}' '%s' | diff -u - '%s'", 112 + test_file, out_file); 113 + err = system(diff_cmd); 114 + if (CHECK(err, 115 + "differing test output, output=%s, err=%d, diff cmd:\n%s\n", 116 + out_file, err, diff_cmd)) 117 + goto done; 118 + 119 + remove(out_file); 120 + fprintf(stderr, "OK\n"); 121 + 122 + done: 123 + btf__free(btf); 124 + return err; 125 + } 126 + 127 + int main() { 128 + int test_case_cnt, i, err, failed = 0; 129 + 130 + test_case_cnt = sizeof(btf_dump_test_cases) / 131 + sizeof(btf_dump_test_cases[0]); 132 + 133 + for (i = 0; i < test_case_cnt; i++) { 134 + err = test_btf_dump_case(i, &btf_dump_test_cases[i]); 135 + if (err) 136 + failed++; 137 + } 138 + 139 + fprintf(stderr, "%d tests succeeded, %d tests failed.\n", 140 + test_case_cnt - failed, failed); 141 + 142 + return failed; 143 + }
+382
tools/testing/selftests/bpf/test_hashmap.c
··· 1 + // SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) 2 + 3 + /* 4 + * Tests for libbpf's hashmap. 5 + * 6 + * Copyright (c) 2019 Facebook 7 + */ 8 + #include <stdio.h> 9 + #include <errno.h> 10 + #include <linux/err.h> 11 + #include "hashmap.h" 12 + 13 + #define CHECK(condition, format...) ({ \ 14 + int __ret = !!(condition); \ 15 + if (__ret) { \ 16 + fprintf(stderr, "%s:%d:FAIL ", __func__, __LINE__); \ 17 + fprintf(stderr, format); \ 18 + } \ 19 + __ret; \ 20 + }) 21 + 22 + size_t hash_fn(const void *k, void *ctx) 23 + { 24 + return (long)k; 25 + } 26 + 27 + bool equal_fn(const void *a, const void *b, void *ctx) 28 + { 29 + return (long)a == (long)b; 30 + } 31 + 32 + static inline size_t next_pow_2(size_t n) 33 + { 34 + size_t r = 1; 35 + 36 + while (r < n) 37 + r <<= 1; 38 + return r; 39 + } 40 + 41 + static inline size_t exp_cap(size_t sz) 42 + { 43 + size_t r = next_pow_2(sz); 44 + 45 + if (sz * 4 / 3 > r) 46 + r <<= 1; 47 + return r; 48 + } 49 + 50 + #define ELEM_CNT 62 51 + 52 + int test_hashmap_generic(void) 53 + { 54 + struct hashmap_entry *entry, *tmp; 55 + int err, bkt, found_cnt, i; 56 + long long found_msk; 57 + struct hashmap *map; 58 + 59 + fprintf(stderr, "%s: ", __func__); 60 + 61 + map = hashmap__new(hash_fn, equal_fn, NULL); 62 + if (CHECK(IS_ERR(map), "failed to create map: %ld\n", PTR_ERR(map))) 63 + return 1; 64 + 65 + for (i = 0; i < ELEM_CNT; i++) { 66 + const void *oldk, *k = (const void *)(long)i; 67 + void *oldv, *v = (void *)(long)(1024 + i); 68 + 69 + err = hashmap__update(map, k, v, &oldk, &oldv); 70 + if (CHECK(err != -ENOENT, "unexpected result: %d\n", err)) 71 + return 1; 72 + 73 + if (i % 2) { 74 + err = hashmap__add(map, k, v); 75 + } else { 76 + err = hashmap__set(map, k, v, &oldk, &oldv); 77 + if (CHECK(oldk != NULL || oldv != NULL, 78 + "unexpected k/v: %p=%p\n", oldk, oldv)) 79 + return 1; 80 + } 81 + 82 + if (CHECK(err, "failed to add k/v %ld = %ld: %d\n", 83 + (long)k, (long)v, err)) 84 + return 1; 85 + 86 + if 
(CHECK(!hashmap__find(map, k, &oldv), 87 + "failed to find key %ld\n", (long)k)) 88 + return 1; 89 + if (CHECK(oldv != v, "found value is wrong: %ld\n", (long)oldv)) 90 + return 1; 91 + } 92 + 93 + if (CHECK(hashmap__size(map) != ELEM_CNT, 94 + "invalid map size: %zu\n", hashmap__size(map))) 95 + return 1; 96 + if (CHECK(hashmap__capacity(map) != exp_cap(hashmap__size(map)), 97 + "unexpected map capacity: %zu\n", hashmap__capacity(map))) 98 + return 1; 99 + 100 + found_msk = 0; 101 + hashmap__for_each_entry(map, entry, bkt) { 102 + long k = (long)entry->key; 103 + long v = (long)entry->value; 104 + 105 + found_msk |= 1ULL << k; 106 + if (CHECK(v - k != 1024, "invalid k/v pair: %ld = %ld\n", k, v)) 107 + return 1; 108 + } 109 + if (CHECK(found_msk != (1ULL << ELEM_CNT) - 1, 110 + "not all keys iterated: %llx\n", found_msk)) 111 + return 1; 112 + 113 + for (i = 0; i < ELEM_CNT; i++) { 114 + const void *oldk, *k = (const void *)(long)i; 115 + void *oldv, *v = (void *)(long)(256 + i); 116 + 117 + err = hashmap__add(map, k, v); 118 + if (CHECK(err != -EEXIST, "unexpected add result: %d\n", err)) 119 + return 1; 120 + 121 + if (i % 2) 122 + err = hashmap__update(map, k, v, &oldk, &oldv); 123 + else 124 + err = hashmap__set(map, k, v, &oldk, &oldv); 125 + 126 + if (CHECK(err, "failed to update k/v %ld = %ld: %d\n", 127 + (long)k, (long)v, err)) 128 + return 1; 129 + if (CHECK(!hashmap__find(map, k, &oldv), 130 + "failed to find key %ld\n", (long)k)) 131 + return 1; 132 + if (CHECK(oldv != v, "found value is wrong: %ld\n", (long)oldv)) 133 + return 1; 134 + } 135 + 136 + if (CHECK(hashmap__size(map) != ELEM_CNT, 137 + "invalid updated map size: %zu\n", hashmap__size(map))) 138 + return 1; 139 + if (CHECK(hashmap__capacity(map) != exp_cap(hashmap__size(map)), 140 + "unexpected map capacity: %zu\n", hashmap__capacity(map))) 141 + return 1; 142 + 143 + found_msk = 0; 144 + hashmap__for_each_entry_safe(map, entry, tmp, bkt) { 145 + long k = (long)entry->key; 146 + long v = 
(long)entry->value; 147 + 148 + found_msk |= 1ULL << k; 149 + if (CHECK(v - k != 256, 150 + "invalid updated k/v pair: %ld = %ld\n", k, v)) 151 + return 1; 152 + } 153 + if (CHECK(found_msk != (1ULL << ELEM_CNT) - 1, 154 + "not all keys iterated after update: %llx\n", found_msk)) 155 + return 1; 156 + 157 + found_cnt = 0; 158 + hashmap__for_each_key_entry(map, entry, (void *)0) { 159 + found_cnt++; 160 + } 161 + if (CHECK(!found_cnt, "didn't find any entries for key 0\n")) 162 + return 1; 163 + 164 + found_msk = 0; 165 + found_cnt = 0; 166 + hashmap__for_each_key_entry_safe(map, entry, tmp, (void *)0) { 167 + const void *oldk, *k; 168 + void *oldv, *v; 169 + 170 + k = entry->key; 171 + v = entry->value; 172 + 173 + found_cnt++; 174 + found_msk |= 1ULL << (long)k; 175 + 176 + if (CHECK(!hashmap__delete(map, k, &oldk, &oldv), 177 + "failed to delete k/v %ld = %ld\n", 178 + (long)k, (long)v)) 179 + return 1; 180 + if (CHECK(oldk != k || oldv != v, 181 + "invalid deleted k/v: expected %ld = %ld, got %ld = %ld\n", 182 + (long)k, (long)v, (long)oldk, (long)oldv)) 183 + return 1; 184 + if (CHECK(hashmap__delete(map, k, &oldk, &oldv), 185 + "unexpectedly deleted k/v %ld = %ld\n", 186 + (long)oldk, (long)oldv)) 187 + return 1; 188 + } 189 + 190 + if (CHECK(!found_cnt || !found_msk, 191 + "didn't delete any key entries\n")) 192 + return 1; 193 + if (CHECK(hashmap__size(map) != ELEM_CNT - found_cnt, 194 + "invalid updated map size (already deleted: %d): %zu\n", 195 + found_cnt, hashmap__size(map))) 196 + return 1; 197 + if (CHECK(hashmap__capacity(map) != exp_cap(hashmap__size(map)), 198 + "unexpected map capacity: %zu\n", hashmap__capacity(map))) 199 + return 1; 200 + 201 + hashmap__for_each_entry_safe(map, entry, tmp, bkt) { 202 + const void *oldk, *k; 203 + void *oldv, *v; 204 + 205 + k = entry->key; 206 + v = entry->value; 207 + 208 + found_cnt++; 209 + found_msk |= 1ULL << (long)k; 210 + 211 + if (CHECK(!hashmap__delete(map, k, &oldk, &oldv), 212 + "failed to delete k/v 
%ld = %ld\n", 213 + (long)k, (long)v)) 214 + return 1; 215 + if (CHECK(oldk != k || oldv != v, 216 + "invalid old k/v: expect %ld = %ld, got %ld = %ld\n", 217 + (long)k, (long)v, (long)oldk, (long)oldv)) 218 + return 1; 219 + if (CHECK(hashmap__delete(map, k, &oldk, &oldv), 220 + "unexpectedly deleted k/v %ld = %ld\n", 221 + (long)k, (long)v)) 222 + return 1; 223 + } 224 + 225 + if (CHECK(found_cnt != ELEM_CNT || found_msk != (1ULL << ELEM_CNT) - 1, 226 + "not all keys were deleted: found_cnt:%d, found_msk:%llx\n", 227 + found_cnt, found_msk)) 228 + return 1; 229 + if (CHECK(hashmap__size(map) != 0, 230 + "invalid updated map size (already deleted: %d): %zu\n", 231 + found_cnt, hashmap__size(map))) 232 + return 1; 233 + 234 + found_cnt = 0; 235 + hashmap__for_each_entry(map, entry, bkt) { 236 + CHECK(false, "unexpected map entries left: %ld = %ld\n", 237 + (long)entry->key, (long)entry->value); 238 + return 1; 239 + } 240 + 241 + hashmap__free(map); 242 + hashmap__for_each_entry(map, entry, bkt) { 243 + CHECK(false, "unexpected map entries left: %ld = %ld\n", 244 + (long)entry->key, (long)entry->value); 245 + return 1; 246 + } 247 + 248 + fprintf(stderr, "OK\n"); 249 + return 0; 250 + } 251 + 252 + size_t collision_hash_fn(const void *k, void *ctx) 253 + { 254 + return 0; 255 + } 256 + 257 + int test_hashmap_multimap(void) 258 + { 259 + void *k1 = (void *)0, *k2 = (void *)1; 260 + struct hashmap_entry *entry; 261 + struct hashmap *map; 262 + long found_msk; 263 + int err, bkt; 264 + 265 + fprintf(stderr, "%s: ", __func__); 266 + 267 + /* force collisions */ 268 + map = hashmap__new(collision_hash_fn, equal_fn, NULL); 269 + if (CHECK(IS_ERR(map), "failed to create map: %ld\n", PTR_ERR(map))) 270 + return 1; 271 + 272 + 273 + /* set up multimap: 274 + * [0] -> 1, 2, 4; 275 + * [1] -> 8, 16, 32; 276 + */ 277 + err = hashmap__append(map, k1, (void *)1); 278 + if (CHECK(err, "failed to add k/v: %d\n", err)) 279 + return 1; 280 + err = hashmap__append(map, k1, (void 
*)2); 281 + if (CHECK(err, "failed to add k/v: %d\n", err)) 282 + return 1; 283 + err = hashmap__append(map, k1, (void *)4); 284 + if (CHECK(err, "failed to add k/v: %d\n", err)) 285 + return 1; 286 + 287 + err = hashmap__append(map, k2, (void *)8); 288 + if (CHECK(err, "failed to add k/v: %d\n", err)) 289 + return 1; 290 + err = hashmap__append(map, k2, (void *)16); 291 + if (CHECK(err, "failed to add k/v: %d\n", err)) 292 + return 1; 293 + err = hashmap__append(map, k2, (void *)32); 294 + if (CHECK(err, "failed to add k/v: %d\n", err)) 295 + return 1; 296 + 297 + if (CHECK(hashmap__size(map) != 6, 298 + "invalid map size: %zu\n", hashmap__size(map))) 299 + return 1; 300 + 301 + /* verify global iteration still works and sees all values */ 302 + found_msk = 0; 303 + hashmap__for_each_entry(map, entry, bkt) { 304 + found_msk |= (long)entry->value; 305 + } 306 + if (CHECK(found_msk != (1 << 6) - 1, 307 + "not all keys iterated: %lx\n", found_msk)) 308 + return 1; 309 + 310 + /* iterate values for key 1 */ 311 + found_msk = 0; 312 + hashmap__for_each_key_entry(map, entry, k1) { 313 + found_msk |= (long)entry->value; 314 + } 315 + if (CHECK(found_msk != (1 | 2 | 4), 316 + "invalid k1 values: %lx\n", found_msk)) 317 + return 1; 318 + 319 + /* iterate values for key 2 */ 320 + found_msk = 0; 321 + hashmap__for_each_key_entry(map, entry, k2) { 322 + found_msk |= (long)entry->value; 323 + } 324 + if (CHECK(found_msk != (8 | 16 | 32), 325 + "invalid k2 values: %lx\n", found_msk)) 326 + return 1; 327 + 328 + fprintf(stderr, "OK\n"); 329 + return 0; 330 + } 331 + 332 + int test_hashmap_empty() 333 + { 334 + struct hashmap_entry *entry; 335 + int bkt; 336 + struct hashmap *map; 337 + void *k = (void *)0; 338 + 339 + fprintf(stderr, "%s: ", __func__); 340 + 341 + /* force collisions */ 342 + map = hashmap__new(hash_fn, equal_fn, NULL); 343 + if (CHECK(IS_ERR(map), "failed to create map: %ld\n", PTR_ERR(map))) 344 + return 1; 345 + 346 + if (CHECK(hashmap__size(map) != 0, 347 + 
"invalid map size: %zu\n", hashmap__size(map))) 348 + return 1; 349 + if (CHECK(hashmap__capacity(map) != 0, 350 + "invalid map capacity: %zu\n", hashmap__capacity(map))) 351 + return 1; 352 + if (CHECK(hashmap__find(map, k, NULL), "unexpected find\n")) 353 + return 1; 354 + if (CHECK(hashmap__delete(map, k, NULL, NULL), "unexpected delete\n")) 355 + return 1; 356 + 357 + hashmap__for_each_entry(map, entry, bkt) { 358 + CHECK(false, "unexpected iterated entry\n"); 359 + return 1; 360 + } 361 + hashmap__for_each_key_entry(map, entry, k) { 362 + CHECK(false, "unexpected key entry\n"); 363 + return 1; 364 + } 365 + 366 + fprintf(stderr, "OK\n"); 367 + return 0; 368 + } 369 + 370 + int main(int argc, char **argv) 371 + { 372 + bool failed = false; 373 + 374 + if (test_hashmap_generic()) 375 + failed = true; 376 + if (test_hashmap_multimap()) 377 + failed = true; 378 + if (test_hashmap_empty()) 379 + failed = true; 380 + 381 + return failed; 382 + }
+1
tools/testing/selftests/bpf/test_sock_addr.c
··· 745 745 attr.file = path; 746 746 attr.prog_type = BPF_PROG_TYPE_CGROUP_SOCK_ADDR; 747 747 attr.expected_attach_type = test->expected_attach_type; 748 + attr.prog_flags = BPF_F_TEST_RND_HI32; 748 749 749 750 if (bpf_prog_load_xattr(&attr, &obj, &prog_fd)) { 750 751 if (test->expected_result != LOAD_REJECT)
+1
tools/testing/selftests/bpf/test_sock_fields.c
··· 414 414 struct bpf_prog_load_attr attr = { 415 415 .file = "test_sock_fields_kern.o", 416 416 .prog_type = BPF_PROG_TYPE_CGROUP_SKB, 417 + .prog_flags = BPF_F_TEST_RND_HI32, 417 418 }; 418 419 int cgroup_fd, egress_fd, ingress_fd, err; 419 420 struct bpf_program *ingress_prog;
+1
tools/testing/selftests/bpf/test_socket_cookie.c
··· 148 148 memset(&attr, 0, sizeof(attr)); 149 149 attr.file = SOCKET_COOKIE_PROG; 150 150 attr.prog_type = BPF_PROG_TYPE_UNSPEC; 151 + attr.prog_flags = BPF_F_TEST_RND_HI32; 151 152 152 153 err = bpf_prog_load_xattr(&attr, &pobj, &prog_fd); 153 154 if (err) {
-7
tools/testing/selftests/bpf/test_sockmap_kern.h
··· 28 28 * are established and verdicts are decided. 29 29 */ 30 30 31 - #define bpf_printk(fmt, ...) \ 32 - ({ \ 33 - char ____fmt[] = fmt; \ 34 - bpf_trace_printk(____fmt, sizeof(____fmt), \ 35 - ##__VA_ARGS__); \ 36 - }) 37 - 38 31 struct bpf_map_def SEC("maps") sock_map = { 39 32 .type = TEST_MAP_TYPE, 40 33 .key_size = sizeof(int),
+40
tools/testing/selftests/bpf/test_stub.c
··· 1 + // SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) 2 + /* Copyright (C) 2019 Netronome Systems, Inc. */ 3 + 4 + #include <bpf/bpf.h> 5 + #include <bpf/libbpf.h> 6 + #include <string.h> 7 + 8 + int bpf_prog_test_load(const char *file, enum bpf_prog_type type, 9 + struct bpf_object **pobj, int *prog_fd) 10 + { 11 + struct bpf_prog_load_attr attr; 12 + 13 + memset(&attr, 0, sizeof(struct bpf_prog_load_attr)); 14 + attr.file = file; 15 + attr.prog_type = type; 16 + attr.expected_attach_type = 0; 17 + attr.prog_flags = BPF_F_TEST_RND_HI32; 18 + 19 + return bpf_prog_load_xattr(&attr, pobj, prog_fd); 20 + } 21 + 22 + int bpf_test_load_program(enum bpf_prog_type type, const struct bpf_insn *insns, 23 + size_t insns_cnt, const char *license, 24 + __u32 kern_version, char *log_buf, 25 + size_t log_buf_sz) 26 + { 27 + struct bpf_load_program_attr load_attr; 28 + 29 + memset(&load_attr, 0, sizeof(struct bpf_load_program_attr)); 30 + load_attr.prog_type = type; 31 + load_attr.expected_attach_type = 0; 32 + load_attr.name = NULL; 33 + load_attr.insns = insns; 34 + load_attr.insns_cnt = insns_cnt; 35 + load_attr.license = license; 36 + load_attr.kern_version = kern_version; 37 + load_attr.prog_flags = BPF_F_TEST_RND_HI32; 38 + 39 + return bpf_load_program_xattr(&load_attr, log_buf, log_buf_sz); 40 + }
+32
tools/testing/selftests/bpf/test_tunnel.sh
··· 696 696 697 697 bpf_tunnel_test() 698 698 { 699 + local errors=0 700 + 699 701 echo "Testing GRE tunnel..." 700 702 test_gre 703 + errors=$(( $errors + $? )) 704 + 701 705 echo "Testing IP6GRE tunnel..." 702 706 test_ip6gre 707 + errors=$(( $errors + $? )) 708 + 703 709 echo "Testing IP6GRETAP tunnel..." 704 710 test_ip6gretap 711 + errors=$(( $errors + $? )) 712 + 705 713 echo "Testing ERSPAN tunnel..." 706 714 test_erspan v2 715 + errors=$(( $errors + $? )) 716 + 707 717 echo "Testing IP6ERSPAN tunnel..." 708 718 test_ip6erspan v2 719 + errors=$(( $errors + $? )) 720 + 709 721 echo "Testing VXLAN tunnel..." 710 722 test_vxlan 723 + errors=$(( $errors + $? )) 724 + 711 725 echo "Testing IP6VXLAN tunnel..." 712 726 test_ip6vxlan 727 + errors=$(( $errors + $? )) 728 + 713 729 echo "Testing GENEVE tunnel..." 714 730 test_geneve 731 + errors=$(( $errors + $? )) 732 + 715 733 echo "Testing IP6GENEVE tunnel..." 716 734 test_ip6geneve 735 + errors=$(( $errors + $? )) 736 + 717 737 echo "Testing IPIP tunnel..." 718 738 test_ipip 739 + errors=$(( $errors + $? )) 740 + 719 741 echo "Testing IPIP6 tunnel..." 720 742 test_ipip6 743 + errors=$(( $errors + $? )) 744 + 721 745 echo "Testing IPSec tunnel..." 722 746 test_xfrm_tunnel 747 + errors=$(( $errors + $? )) 748 + 749 + return $errors 723 750 } 724 751 725 752 trap cleanup 0 3 6 ··· 755 728 cleanup 756 729 bpf_tunnel_test 757 730 731 + if [ $? -ne 0 ]; then 732 + echo -e "$(basename $0): ${RED}FAIL${NC}" 733 + exit 1 734 + fi 735 + echo -e "$(basename $0): ${GREEN}PASS${NC}" 758 736 exit 0
+37 -25
tools/testing/selftests/bpf/test_verifier.c
··· 138 138 loop: 139 139 for (j = 0; j < PUSH_CNT; j++) { 140 140 insn[i++] = BPF_LD_ABS(BPF_B, 0); 141 - insn[i] = BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0x34, len - i - 2); 141 + /* jump to error label */ 142 + insn[i] = BPF_JMP32_IMM(BPF_JNE, BPF_REG_0, 0x34, len - i - 3); 142 143 i++; 143 144 insn[i++] = BPF_MOV64_REG(BPF_REG_1, BPF_REG_6); 144 145 insn[i++] = BPF_MOV64_IMM(BPF_REG_2, 1); 145 146 insn[i++] = BPF_MOV64_IMM(BPF_REG_3, 2); 146 147 insn[i++] = BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, 147 148 BPF_FUNC_skb_vlan_push), 148 - insn[i] = BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, len - i - 2); 149 + insn[i] = BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, len - i - 3); 149 150 i++; 150 151 } 151 152 152 153 for (j = 0; j < PUSH_CNT; j++) { 153 154 insn[i++] = BPF_LD_ABS(BPF_B, 0); 154 - insn[i] = BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0x34, len - i - 2); 155 + insn[i] = BPF_JMP32_IMM(BPF_JNE, BPF_REG_0, 0x34, len - i - 3); 155 156 i++; 156 157 insn[i++] = BPF_MOV64_REG(BPF_REG_1, BPF_REG_6); 157 158 insn[i++] = BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, 158 159 BPF_FUNC_skb_vlan_pop), 159 - insn[i] = BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, len - i - 2); 160 + insn[i] = BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, len - i - 3); 160 161 i++; 161 162 } 162 163 if (++k < 5) 163 164 goto loop; 164 165 165 - for (; i < len - 1; i++) 166 - insn[i] = BPF_ALU32_IMM(BPF_MOV, BPF_REG_0, 0xbef); 166 + for (; i < len - 3; i++) 167 + insn[i] = BPF_ALU64_IMM(BPF_MOV, BPF_REG_0, 0xbef); 168 + insn[len - 3] = BPF_JMP_A(1); 169 + /* error label */ 170 + insn[len - 2] = BPF_MOV32_IMM(BPF_REG_0, 0); 167 171 insn[len - 1] = BPF_EXIT_INSN(); 168 172 self->prog_len = len; 169 173 } ··· 175 171 static void bpf_fill_jump_around_ld_abs(struct bpf_test *self) 176 172 { 177 173 struct bpf_insn *insn = self->fill_insns; 178 - /* jump range is limited to 16 bit. every ld_abs is replaced by 6 insns */ 179 - unsigned int len = (1 << 15) / 6; 174 + /* jump range is limited to 16 bit. 
every ld_abs is replaced by 6 insns, 175 + * but on arches like arm, ppc etc, there will be one BPF_ZEXT inserted 176 + * to extend the error value of the inlined ld_abs sequence which then 177 + * contains 7 insns. so, set the dividend to 7 so the testcase could 178 + * work on all arches. 179 + */ 180 + unsigned int len = (1 << 15) / 7; 180 181 int i = 0; 181 182 182 183 insn[i++] = BPF_MOV64_REG(BPF_REG_6, BPF_REG_1); ··· 219 210 self->retval = (uint32_t)res; 220 211 } 221 212 222 - /* test the sequence of 1k jumps */ 213 + #define MAX_JMP_SEQ 8192 214 + 215 + /* test the sequence of 8k jumps */ 223 216 static void bpf_fill_scale1(struct bpf_test *self) 224 217 { 225 218 struct bpf_insn *insn = self->fill_insns; 226 219 int i = 0, k = 0; 227 220 228 221 insn[i++] = BPF_MOV64_REG(BPF_REG_6, BPF_REG_1); 229 - /* test to check that the sequence of 1024 jumps is acceptable */ 230 - while (k++ < 1024) { 222 + /* test to check that the long sequence of jumps is acceptable */ 223 + while (k++ < MAX_JMP_SEQ) { 231 224 insn[i++] = BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, 232 225 BPF_FUNC_get_prandom_u32); 233 - insn[i++] = BPF_JMP_IMM(BPF_JGT, BPF_REG_0, bpf_semi_rand_get(), 2); 226 + insn[i++] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, bpf_semi_rand_get(), 2); 234 227 insn[i++] = BPF_MOV64_REG(BPF_REG_1, BPF_REG_10); 235 228 insn[i++] = BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_6, 236 229 -8 * (k % 64 + 1)); 237 230 } 238 - /* every jump adds 1024 steps to insn_processed, so to stay exactly 239 - * within 1m limit add MAX_TEST_INSNS - 1025 MOVs and 1 EXIT 231 + /* every jump adds 1 step to insn_processed, so to stay exactly 232 + * within 1m limit add MAX_TEST_INSNS - MAX_JMP_SEQ - 1 MOVs and 1 EXIT 240 233 */ 241 - while (i < MAX_TEST_INSNS - 1025) 242 - insn[i++] = BPF_ALU32_IMM(BPF_MOV, BPF_REG_0, 42); 234 + while (i < MAX_TEST_INSNS - MAX_JMP_SEQ - 1) 235 + insn[i++] = BPF_ALU64_IMM(BPF_MOV, BPF_REG_0, 42); 243 236 insn[i] = BPF_EXIT_INSN(); 244 237 self->prog_len = i + 1; 245 
238 self->retval = 42; 246 239 } 247 240 248 - /* test the sequence of 1k jumps in inner most function (function depth 8)*/ 241 + /* test the sequence of 8k jumps in inner most function (function depth 8)*/ 249 242 static void bpf_fill_scale2(struct bpf_test *self) 250 243 { 251 244 struct bpf_insn *insn = self->fill_insns; ··· 259 248 insn[i++] = BPF_EXIT_INSN(); 260 249 } 261 250 insn[i++] = BPF_MOV64_REG(BPF_REG_6, BPF_REG_1); 262 - /* test to check that the sequence of 1024 jumps is acceptable */ 263 - while (k++ < 1024) { 251 + /* test to check that the long sequence of jumps is acceptable */ 252 + k = 0; 253 + while (k++ < MAX_JMP_SEQ) { 264 254 insn[i++] = BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, 265 255 BPF_FUNC_get_prandom_u32); 266 - insn[i++] = BPF_JMP_IMM(BPF_JGT, BPF_REG_0, bpf_semi_rand_get(), 2); 256 + insn[i++] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, bpf_semi_rand_get(), 2); 267 257 insn[i++] = BPF_MOV64_REG(BPF_REG_1, BPF_REG_10); 268 258 insn[i++] = BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_6, 269 259 -8 * (k % (64 - 4 * FUNC_NEST) + 1)); 270 260 } 271 - /* every jump adds 1024 steps to insn_processed, so to stay exactly 272 - * within 1m limit add MAX_TEST_INSNS - 1025 MOVs and 1 EXIT 261 + /* every jump adds 1 step to insn_processed, so to stay exactly 262 + * within 1m limit add MAX_TEST_INSNS - MAX_JMP_SEQ - 1 MOVs and 1 EXIT 273 263 */ 274 - while (i < MAX_TEST_INSNS - 1025) 275 - insn[i++] = BPF_ALU32_IMM(BPF_MOV, BPF_REG_0, 42); 264 + while (i < MAX_TEST_INSNS - MAX_JMP_SEQ - 1) 265 + insn[i++] = BPF_ALU64_IMM(BPF_MOV, BPF_REG_0, 42); 276 266 insn[i] = BPF_EXIT_INSN(); 277 267 self->prog_len = i + 1; 278 268 self->retval = 42; ··· 882 870 if (fixup_skips != skips) 883 871 return; 884 872 885 - pflags = 0; 873 + pflags = BPF_F_TEST_RND_HI32; 886 874 if (test->flags & F_LOAD_WITH_STRICT_ALIGNMENT) 887 875 pflags |= BPF_F_STRICT_ALIGNMENT; 888 876 if (test->flags & F_NEEDS_EFFICIENT_UNALIGNED_ACCESS)
+99
tools/testing/selftests/bpf/test_xdping.sh
··· 1 + #!/bin/bash 2 + # SPDX-License-Identifier: GPL-2.0 3 + 4 + # xdping tests 5 + # Here we setup and teardown configuration required to run 6 + # xdping, exercising its options. 7 + # 8 + # Setup is similar to test_tunnel tests but without the tunnel. 9 + # 10 + # Topology: 11 + # --------- 12 + # root namespace | tc_ns0 namespace 13 + # | 14 + # ---------- | ---------- 15 + # | veth1 | --------- | veth0 | 16 + # ---------- peer ---------- 17 + # 18 + # Device Configuration 19 + # -------------------- 20 + # Root namespace with BPF 21 + # Device names and addresses: 22 + # veth1 IP: 10.1.1.200 23 + # xdp added to veth1, xdpings originate from here. 24 + # 25 + # Namespace tc_ns0 with BPF 26 + # Device names and addresses: 27 + # veth0 IPv4: 10.1.1.100 28 + # For some tests xdping run in server mode here. 29 + # 30 + 31 + readonly TARGET_IP="10.1.1.100" 32 + readonly TARGET_NS="xdp_ns0" 33 + 34 + readonly LOCAL_IP="10.1.1.200" 35 + 36 + setup() 37 + { 38 + ip netns add $TARGET_NS 39 + ip link add veth0 type veth peer name veth1 40 + ip link set veth0 netns $TARGET_NS 41 + ip netns exec $TARGET_NS ip addr add ${TARGET_IP}/24 dev veth0 42 + ip addr add ${LOCAL_IP}/24 dev veth1 43 + ip netns exec $TARGET_NS ip link set veth0 up 44 + ip link set veth1 up 45 + } 46 + 47 + cleanup() 48 + { 49 + set +e 50 + ip netns delete $TARGET_NS 2>/dev/null 51 + ip link del veth1 2>/dev/null 52 + if [[ $server_pid -ne 0 ]]; then 53 + kill -TERM $server_pid 54 + fi 55 + } 56 + 57 + test() 58 + { 59 + client_args="$1" 60 + server_args="$2" 61 + 62 + echo "Test client args '$client_args'; server args '$server_args'" 63 + 64 + server_pid=0 65 + if [[ -n "$server_args" ]]; then 66 + ip netns exec $TARGET_NS ./xdping $server_args & 67 + server_pid=$! 
68 + sleep 10 69 + fi 70 + ./xdping $client_args $TARGET_IP 71 + 72 + if [[ $server_pid -ne 0 ]]; then 73 + kill -TERM $server_pid 74 + server_pid=0 75 + fi 76 + 77 + echo "Test client args '$client_args'; server args '$server_args': PASS" 78 + } 79 + 80 + set -e 81 + 82 + server_pid=0 83 + 84 + trap cleanup EXIT 85 + 86 + setup 87 + 88 + for server_args in "" "-I veth0 -s -S" ; do 89 + # client in skb mode 90 + client_args="-I veth1 -S" 91 + test "$client_args" "$server_args" 92 + 93 + # client with count of 10 RTT measurements. 94 + client_args="-I veth1 -S -c 10" 95 + test "$client_args" "$server_args" 96 + done 97 + 98 + echo "OK. All tests passed" 99 + exit 0
+1 -3
tools/testing/selftests/bpf/trace_helpers.c
··· 30 30 if (!f) 31 31 return -ENOENT; 32 32 33 - while (!feof(f)) { 34 - if (!fgets(buf, sizeof(buf), f)) 35 - break; 33 + while (fgets(buf, sizeof(buf), f)) { 36 34 if (sscanf(buf, "%p %c %s", &addr, &symbol, func) != 3) 37 35 break; 38 36 if (!addr)
+258
tools/testing/selftests/bpf/xdping.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved. */ 3 + 4 + #include <linux/bpf.h> 5 + #include <linux/if_link.h> 6 + #include <arpa/inet.h> 7 + #include <assert.h> 8 + #include <errno.h> 9 + #include <signal.h> 10 + #include <stdio.h> 11 + #include <stdlib.h> 12 + #include <string.h> 13 + #include <unistd.h> 14 + #include <libgen.h> 15 + #include <sys/resource.h> 16 + #include <net/if.h> 17 + #include <sys/types.h> 18 + #include <sys/socket.h> 19 + #include <netdb.h> 20 + 21 + #include "bpf/bpf.h" 22 + #include "bpf/libbpf.h" 23 + 24 + #include "xdping.h" 25 + 26 + static int ifindex; 27 + static __u32 xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST; 28 + 29 + static void cleanup(int sig) 30 + { 31 + bpf_set_link_xdp_fd(ifindex, -1, xdp_flags); 32 + if (sig) 33 + exit(1); 34 + } 35 + 36 + static int get_stats(int fd, __u16 count, __u32 raddr) 37 + { 38 + struct pinginfo pinginfo = { 0 }; 39 + char inaddrbuf[INET_ADDRSTRLEN]; 40 + struct in_addr inaddr; 41 + __u16 i; 42 + 43 + inaddr.s_addr = raddr; 44 + 45 + printf("\nXDP RTT data:\n"); 46 + 47 + if (bpf_map_lookup_elem(fd, &raddr, &pinginfo)) { 48 + perror("bpf_map_lookup elem: "); 49 + return 1; 50 + } 51 + 52 + for (i = 0; i < count; i++) { 53 + if (pinginfo.times[i] == 0) 54 + break; 55 + 56 + printf("64 bytes from %s: icmp_seq=%d ttl=64 time=%#.5f ms\n", 57 + inet_ntop(AF_INET, &inaddr, inaddrbuf, 58 + sizeof(inaddrbuf)), 59 + count + i + 1, 60 + (double)pinginfo.times[i]/1000000); 61 + } 62 + 63 + if (i < count) { 64 + fprintf(stderr, "Expected %d samples, got %d.\n", count, i); 65 + return 1; 66 + } 67 + 68 + bpf_map_delete_elem(fd, &raddr); 69 + 70 + return 0; 71 + } 72 + 73 + static void show_usage(const char *prog) 74 + { 75 + fprintf(stderr, 76 + "usage: %s [OPTS] -I interface destination\n\n" 77 + "OPTS:\n" 78 + " -c count Stop after sending count requests\n" 79 + " (default %d, max %d)\n" 80 + " -I interface interface name\n" 81 + " -N 
Run in driver mode\n" 82 + " -s Server mode\n" 83 + " -S Run in skb mode\n", 84 + prog, XDPING_DEFAULT_COUNT, XDPING_MAX_COUNT); 85 + } 86 + 87 + int main(int argc, char **argv) 88 + { 89 + __u32 mode_flags = XDP_FLAGS_DRV_MODE | XDP_FLAGS_SKB_MODE; 90 + struct addrinfo *a, hints = { .ai_family = AF_INET }; 91 + struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; 92 + __u16 count = XDPING_DEFAULT_COUNT; 93 + struct pinginfo pinginfo = { 0 }; 94 + const char *optstr = "c:I:NsS"; 95 + struct bpf_program *main_prog; 96 + int prog_fd = -1, map_fd = -1; 97 + struct sockaddr_in rin; 98 + struct bpf_object *obj; 99 + struct bpf_map *map; 100 + char *ifname = NULL; 101 + char filename[256]; 102 + int opt, ret = 1; 103 + __u32 raddr = 0; 104 + int server = 0; 105 + char cmd[256]; 106 + 107 + while ((opt = getopt(argc, argv, optstr)) != -1) { 108 + switch (opt) { 109 + case 'c': 110 + count = atoi(optarg); 111 + if (count < 1 || count > XDPING_MAX_COUNT) { 112 + fprintf(stderr, 113 + "min count is 1, max count is %d\n", 114 + XDPING_MAX_COUNT); 115 + return 1; 116 + } 117 + break; 118 + case 'I': 119 + ifname = optarg; 120 + ifindex = if_nametoindex(ifname); 121 + if (!ifindex) { 122 + fprintf(stderr, "Could not get interface %s\n", 123 + ifname); 124 + return 1; 125 + } 126 + break; 127 + case 'N': 128 + xdp_flags |= XDP_FLAGS_DRV_MODE; 129 + break; 130 + case 's': 131 + /* use server program */ 132 + server = 1; 133 + break; 134 + case 'S': 135 + xdp_flags |= XDP_FLAGS_SKB_MODE; 136 + break; 137 + default: 138 + show_usage(basename(argv[0])); 139 + return 1; 140 + } 141 + } 142 + 143 + if (!ifname) { 144 + show_usage(basename(argv[0])); 145 + return 1; 146 + } 147 + if (!server && optind == argc) { 148 + show_usage(basename(argv[0])); 149 + return 1; 150 + } 151 + 152 + if ((xdp_flags & mode_flags) == mode_flags) { 153 + fprintf(stderr, "-N or -S can be specified, not both.\n"); 154 + show_usage(basename(argv[0])); 155 + return 1; 156 + } 157 + 158 + if (!server) { 159 + /* 
Only supports IPv4; see hints initiailization above. */ 160 + if (getaddrinfo(argv[optind], NULL, &hints, &a) || !a) { 161 + fprintf(stderr, "Could not resolve %s\n", argv[optind]); 162 + return 1; 163 + } 164 + memcpy(&rin, a->ai_addr, sizeof(rin)); 165 + raddr = rin.sin_addr.s_addr; 166 + freeaddrinfo(a); 167 + } 168 + 169 + if (setrlimit(RLIMIT_MEMLOCK, &r)) { 170 + perror("setrlimit(RLIMIT_MEMLOCK)"); 171 + return 1; 172 + } 173 + 174 + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); 175 + 176 + if (bpf_prog_load(filename, BPF_PROG_TYPE_XDP, &obj, &prog_fd)) { 177 + fprintf(stderr, "load of %s failed\n", filename); 178 + return 1; 179 + } 180 + 181 + main_prog = bpf_object__find_program_by_title(obj, 182 + server ? "xdpserver" : 183 + "xdpclient"); 184 + if (main_prog) 185 + prog_fd = bpf_program__fd(main_prog); 186 + if (!main_prog || prog_fd < 0) { 187 + fprintf(stderr, "could not find xdping program"); 188 + return 1; 189 + } 190 + 191 + map = bpf_map__next(NULL, obj); 192 + if (map) 193 + map_fd = bpf_map__fd(map); 194 + if (!map || map_fd < 0) { 195 + fprintf(stderr, "Could not find ping map"); 196 + goto done; 197 + } 198 + 199 + signal(SIGINT, cleanup); 200 + signal(SIGTERM, cleanup); 201 + 202 + printf("Setting up XDP for %s, please wait...\n", ifname); 203 + 204 + printf("XDP setup disrupts network connectivity, hit Ctrl+C to quit\n"); 205 + 206 + if (bpf_set_link_xdp_fd(ifindex, prog_fd, xdp_flags) < 0) { 207 + fprintf(stderr, "Link set xdp fd failed for %s\n", ifname); 208 + goto done; 209 + } 210 + 211 + if (server) { 212 + close(prog_fd); 213 + close(map_fd); 214 + printf("Running server on %s; press Ctrl+C to exit...\n", 215 + ifname); 216 + do { } while (1); 217 + } 218 + 219 + /* Start xdping-ing from last regular ping reply, e.g. for a count 220 + * of 10 ICMP requests, we start xdping-ing using reply with seq number 221 + * 10. 
The reason the last "real" ping RTT is much higher is that 222 + * the ping program sees the ICMP reply associated with the last 223 + * XDP-generated packet, so ping doesn't get a reply until XDP is done. 224 + */ 225 + pinginfo.seq = htons(count); 226 + pinginfo.count = count; 227 + 228 + if (bpf_map_update_elem(map_fd, &raddr, &pinginfo, BPF_ANY)) { 229 + fprintf(stderr, "could not communicate with BPF map: %s\n", 230 + strerror(errno)); 231 + cleanup(0); 232 + goto done; 233 + } 234 + 235 + /* We need to wait for XDP setup to complete. */ 236 + sleep(10); 237 + 238 + snprintf(cmd, sizeof(cmd), "ping -c %d -I %s %s", 239 + count, ifname, argv[optind]); 240 + 241 + printf("\nNormal ping RTT data\n"); 242 + printf("[Ignore final RTT; it is distorted by XDP using the reply]\n"); 243 + 244 + ret = system(cmd); 245 + 246 + if (!ret) 247 + ret = get_stats(map_fd, count, raddr); 248 + 249 + cleanup(0); 250 + 251 + done: 252 + if (prog_fd > 0) 253 + close(prog_fd); 254 + if (map_fd > 0) 255 + close(map_fd); 256 + 257 + return ret; 258 + }
+13
tools/testing/selftests/bpf/xdping.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 2 + /* Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved. */ 3 + 4 + #define XDPING_MAX_COUNT 10 5 + #define XDPING_DEFAULT_COUNT 4 6 + 7 + struct pinginfo { 8 + __u64 start; 9 + __be16 seq; 10 + __u16 count; 11 + __u32 pad; 12 + __u64 times[XDPING_MAX_COUNT]; 13 + };