Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next

Alexei Starovoitov says:

====================
pull-request: bpf-next 2018-01-26

The following pull-request contains BPF updates for your *net-next* tree.

The main changes are:

1) A number of extensions to tcp-bpf, from Lawrence.
- direct R or R/W access to many tcp_sock fields via bpf_sock_ops
- passing up to 3 arguments to bpf_sock_ops functions
- tcp_sock field bpf_sock_ops_cb_flags for controlling callbacks
- optionally calling bpf_sock_ops program when RTO fires
- optionally calling bpf_sock_ops program when packet is retransmitted
- optionally calling bpf_sock_ops program when TCP state changes
- access to tclass and sk_txhash
- new selftest

2) div/mod exception handling, from Daniel.
One of the ugly leftovers from the early eBPF days is that div/mod
operations based on registers have a hard-coded src_reg == 0 test
in the interpreter as well as in JIT code generators that would
return from the BPF program with exit code 0. This was basically
adopted from cBPF interpreter for historical reasons.
There are multiple reasons why this is very suboptimal and prone
to bugs. To name one: the return code mapping for such abnormal
program exit of 0 does not always match with a suitable program
type's exit code mapping. For example, '0' in tc means action 'ok'
where the packet gets passed further up the stack, which is just
undesirable for such cases (e.g. when implementing policy) and
also does not match with other program types.
After considering _four_ different ways to address the problem,
we adopt the same behavior as on some major archs like ARMv8:
X div 0 results in 0, and X mod 0 results in X. The aarch64 and
aarch32 ISAs do not generate any traps or otherwise abort
program execution for unsigned divides.
Of the available options, this seems the most suitable,
also since major archs have similar schemes in place.
Given this is all in the realm of undefined behavior, we
still have the option to adapt later if deemed necessary.

3) sockmap sample refactoring, from John.

4) lpm map get_next_key fixes, from Yonghong.

5) test cleanups, from Alexei and Prashant.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>

+1972 -381
+1 -1
Documentation/networking/filter.txt
··· 1134 1134 mask and value; no bit should ever be 1 in both. For example, if a byte is read 1135 1135 into a register from memory, the register's top 56 bits are known zero, while 1136 1136 the low 8 are unknown - which is represented as the tnum (0x0; 0xff). If we 1137 - then OR this with 0x40, we get (0x40; 0xcf), then if we add 1 we get (0x0; 1137 + then OR this with 0x40, we get (0x40; 0xbf), then if we add 1 we get (0x0; 1138 1138 0x1ff), because of potential carries. 1139 1139 Besides arithmetic, the register state can also be updated by conditional 1140 1140 branches. For instance, if a SCALAR_VALUE is compared > 8, in the 'true' branch
-8
arch/arm/net/bpf_jit_32.c
··· 363 363 static inline void emit_udivmod(u8 rd, u8 rm, u8 rn, struct jit_ctx *ctx, u8 op) 364 364 { 365 365 const u8 *tmp = bpf2a32[TMP_REG_1]; 366 - s32 jmp_offset; 367 366 368 - /* checks if divisor is zero or not. If it is, then 369 - * exit directly. 370 - */ 371 - emit(ARM_CMP_I(rn, 0), ctx); 372 - _emit(ARM_COND_EQ, ARM_MOV_I(ARM_R0, 0), ctx); 373 - jmp_offset = epilogue_offset(ctx); 374 - _emit(ARM_COND_EQ, ARM_B(jmp_offset), ctx); 375 367 #if __LINUX_ARM_ARCH__ == 7 376 368 if (elf_hwcap & HWCAP_IDIVA) { 377 369 if (op == BPF_DIV)
-13
arch/arm64/net/bpf_jit_comp.c
··· 390 390 case BPF_ALU64 | BPF_DIV | BPF_X: 391 391 case BPF_ALU | BPF_MOD | BPF_X: 392 392 case BPF_ALU64 | BPF_MOD | BPF_X: 393 - { 394 - const u8 r0 = bpf2a64[BPF_REG_0]; 395 - 396 - /* if (src == 0) return 0 */ 397 - jmp_offset = 3; /* skip ahead to else path */ 398 - check_imm19(jmp_offset); 399 - emit(A64_CBNZ(is64, src, jmp_offset), ctx); 400 - emit(A64_MOVZ(1, r0, 0, 0), ctx); 401 - jmp_offset = epilogue_offset(ctx); 402 - check_imm26(jmp_offset); 403 - emit(A64_B(jmp_offset), ctx); 404 - /* else */ 405 393 switch (BPF_OP(code)) { 406 394 case BPF_DIV: 407 395 emit(A64_UDIV(is64, dst, dst, src), ctx); ··· 401 413 break; 402 414 } 403 415 break; 404 - } 405 416 case BPF_ALU | BPF_LSH | BPF_X: 406 417 case BPF_ALU64 | BPF_LSH | BPF_X: 407 418 emit(A64_LSLV(is64, dst, dst, src), ctx);
+4 -25
arch/mips/net/ebpf_jit.c
··· 741 741 break; 742 742 case BPF_ALU | BPF_DIV | BPF_K: /* ALU_IMM */ 743 743 case BPF_ALU | BPF_MOD | BPF_K: /* ALU_IMM */ 744 + if (insn->imm == 0) 745 + return -EINVAL; 744 746 dst = ebpf_to_mips_reg(ctx, insn, dst_reg); 745 747 if (dst < 0) 746 748 return dst; 747 - if (insn->imm == 0) { /* Div by zero */ 748 - b_off = b_imm(exit_idx, ctx); 749 - if (is_bad_offset(b_off)) 750 - return -E2BIG; 751 - emit_instr(ctx, beq, MIPS_R_ZERO, MIPS_R_ZERO, b_off); 752 - emit_instr(ctx, addu, MIPS_R_V0, MIPS_R_ZERO, MIPS_R_ZERO); 753 - } 754 749 td = get_reg_val_type(ctx, this_idx, insn->dst_reg); 755 750 if (td == REG_64BIT || td == REG_32BIT_ZERO_EX) 756 751 /* sign extend */ ··· 765 770 break; 766 771 case BPF_ALU64 | BPF_DIV | BPF_K: /* ALU_IMM */ 767 772 case BPF_ALU64 | BPF_MOD | BPF_K: /* ALU_IMM */ 773 + if (insn->imm == 0) 774 + return -EINVAL; 768 775 dst = ebpf_to_mips_reg(ctx, insn, dst_reg); 769 776 if (dst < 0) 770 777 return dst; 771 - if (insn->imm == 0) { /* Div by zero */ 772 - b_off = b_imm(exit_idx, ctx); 773 - if (is_bad_offset(b_off)) 774 - return -E2BIG; 775 - emit_instr(ctx, beq, MIPS_R_ZERO, MIPS_R_ZERO, b_off); 776 - emit_instr(ctx, addu, MIPS_R_V0, MIPS_R_ZERO, MIPS_R_ZERO); 777 - } 778 778 if (get_reg_val_type(ctx, this_idx, insn->dst_reg) == REG_32BIT) 779 779 emit_instr(ctx, dinsu, dst, MIPS_R_ZERO, 32, 32); 780 - 781 780 if (insn->imm == 1) { 782 781 /* div by 1 is a nop, mod by 1 is zero */ 783 782 if (bpf_op == BPF_MOD) ··· 849 860 break; 850 861 case BPF_DIV: 851 862 case BPF_MOD: 852 - b_off = b_imm(exit_idx, ctx); 853 - if (is_bad_offset(b_off)) 854 - return -E2BIG; 855 - emit_instr(ctx, beq, src, MIPS_R_ZERO, b_off); 856 - emit_instr(ctx, movz, MIPS_R_V0, MIPS_R_ZERO, src); 857 863 emit_instr(ctx, ddivu, dst, src); 858 864 if (bpf_op == BPF_DIV) 859 865 emit_instr(ctx, mflo, dst); ··· 927 943 break; 928 944 case BPF_DIV: 929 945 case BPF_MOD: 930 - b_off = b_imm(exit_idx, ctx); 931 - if (is_bad_offset(b_off)) 932 - return -E2BIG; 933 
- emit_instr(ctx, beq, src, MIPS_R_ZERO, b_off); 934 - emit_instr(ctx, movz, MIPS_R_V0, MIPS_R_ZERO, src); 935 946 emit_instr(ctx, divu, dst, src); 936 947 if (bpf_op == BPF_DIV) 937 948 emit_instr(ctx, mflo, dst);
-8
arch/powerpc/net/bpf_jit_comp64.c
··· 381 381 goto bpf_alu32_trunc; 382 382 case BPF_ALU | BPF_DIV | BPF_X: /* (u32) dst /= (u32) src */ 383 383 case BPF_ALU | BPF_MOD | BPF_X: /* (u32) dst %= (u32) src */ 384 - PPC_CMPWI(src_reg, 0); 385 - PPC_BCC_SHORT(COND_NE, (ctx->idx * 4) + 12); 386 - PPC_LI(b2p[BPF_REG_0], 0); 387 - PPC_JMP(exit_addr); 388 384 if (BPF_OP(code) == BPF_MOD) { 389 385 PPC_DIVWU(b2p[TMP_REG_1], dst_reg, src_reg); 390 386 PPC_MULW(b2p[TMP_REG_1], src_reg, ··· 391 395 goto bpf_alu32_trunc; 392 396 case BPF_ALU64 | BPF_DIV | BPF_X: /* dst /= src */ 393 397 case BPF_ALU64 | BPF_MOD | BPF_X: /* dst %= src */ 394 - PPC_CMPDI(src_reg, 0); 395 - PPC_BCC_SHORT(COND_NE, (ctx->idx * 4) + 12); 396 - PPC_LI(b2p[BPF_REG_0], 0); 397 - PPC_JMP(exit_addr); 398 398 if (BPF_OP(code) == BPF_MOD) { 399 399 PPC_DIVD(b2p[TMP_REG_1], dst_reg, src_reg); 400 400 PPC_MULD(b2p[TMP_REG_1], src_reg,
-10
arch/s390/net/bpf_jit_comp.c
··· 610 610 { 611 611 int rc_reg = BPF_OP(insn->code) == BPF_DIV ? REG_W1 : REG_W0; 612 612 613 - jit->seen |= SEEN_RET0; 614 - /* ltr %src,%src (if src == 0 goto fail) */ 615 - EMIT2(0x1200, src_reg, src_reg); 616 - /* jz <ret0> */ 617 - EMIT4_PCREL(0xa7840000, jit->ret0_ip - jit->prg); 618 613 /* lhi %w0,0 */ 619 614 EMIT4_IMM(0xa7080000, REG_W0, 0); 620 615 /* lr %w1,%dst */ ··· 625 630 { 626 631 int rc_reg = BPF_OP(insn->code) == BPF_DIV ? REG_W1 : REG_W0; 627 632 628 - jit->seen |= SEEN_RET0; 629 - /* ltgr %src,%src (if src == 0 goto fail) */ 630 - EMIT4(0xb9020000, src_reg, src_reg); 631 - /* jz <ret0> */ 632 - EMIT4_PCREL(0xa7840000, jit->ret0_ip - jit->prg); 633 633 /* lghi %w0,0 */ 634 634 EMIT4_IMM(0xa7090000, REG_W0, 0); 635 635 /* lgr %w1,%dst */
-18
arch/sparc/net/bpf_jit_comp_64.c
··· 967 967 emit_alu(MULX, src, dst, ctx); 968 968 break; 969 969 case BPF_ALU | BPF_DIV | BPF_X: 970 - emit_cmp(src, G0, ctx); 971 - emit_branch(BE|ANNUL, ctx->idx, ctx->epilogue_offset, ctx); 972 - emit_loadimm(0, bpf2sparc[BPF_REG_0], ctx); 973 - 974 970 emit_write_y(G0, ctx); 975 971 emit_alu(DIV, src, dst, ctx); 976 972 break; 977 - 978 973 case BPF_ALU64 | BPF_DIV | BPF_X: 979 - emit_cmp(src, G0, ctx); 980 - emit_branch(BE|ANNUL, ctx->idx, ctx->epilogue_offset, ctx); 981 - emit_loadimm(0, bpf2sparc[BPF_REG_0], ctx); 982 - 983 974 emit_alu(UDIVX, src, dst, ctx); 984 975 break; 985 - 986 976 case BPF_ALU | BPF_MOD | BPF_X: { 987 977 const u8 tmp = bpf2sparc[TMP_REG_1]; 988 978 989 979 ctx->tmp_1_used = true; 990 - 991 - emit_cmp(src, G0, ctx); 992 - emit_branch(BE|ANNUL, ctx->idx, ctx->epilogue_offset, ctx); 993 - emit_loadimm(0, bpf2sparc[BPF_REG_0], ctx); 994 980 995 981 emit_write_y(G0, ctx); 996 982 emit_alu3(DIV, dst, src, tmp, ctx); ··· 988 1002 const u8 tmp = bpf2sparc[TMP_REG_1]; 989 1003 990 1004 ctx->tmp_1_used = true; 991 - 992 - emit_cmp(src, G0, ctx); 993 - emit_branch(BE|ANNUL, ctx->idx, ctx->epilogue_offset, ctx); 994 - emit_loadimm(0, bpf2sparc[BPF_REG_0], ctx); 995 1005 996 1006 emit_alu3(UDIVX, dst, src, tmp, ctx); 997 1007 emit_alu3(MULX, tmp, src, tmp, ctx);
-20
arch/x86/net/bpf_jit_comp.c
··· 568 568 */ 569 569 EMIT2(0x31, 0xd2); 570 570 571 - if (BPF_SRC(insn->code) == BPF_X) { 572 - /* if (src_reg == 0) return 0 */ 573 - 574 - /* cmp r11, 0 */ 575 - EMIT4(0x49, 0x83, 0xFB, 0x00); 576 - 577 - /* jne .+9 (skip over pop, pop, xor and jmp) */ 578 - EMIT2(X86_JNE, 1 + 1 + 2 + 5); 579 - EMIT1(0x5A); /* pop rdx */ 580 - EMIT1(0x58); /* pop rax */ 581 - EMIT2(0x31, 0xc0); /* xor eax, eax */ 582 - 583 - /* jmp cleanup_addr 584 - * addrs[i] - 11, because there are 11 bytes 585 - * after this insn: div, mov, pop, pop, mov 586 - */ 587 - jmp_offset = ctx->cleanup_addr - (addrs[i] - 11); 588 - EMIT1_off32(0xE9, jmp_offset); 589 - } 590 - 591 571 if (BPF_CLASS(insn->code) == BPF_ALU64) 592 572 /* div r11 */ 593 573 EMIT3(0x49, 0xF7, 0xF3);
+12
include/linux/filter.h
··· 688 688 struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err); 689 689 void bpf_prog_free(struct bpf_prog *fp); 690 690 691 + bool bpf_opcode_in_insntable(u8 code); 692 + 691 693 struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags); 692 694 struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, 693 695 gfp_t gfp_extra_flags); ··· 1005 1003 struct sock *sk; 1006 1004 u32 op; 1007 1005 union { 1006 + u32 args[4]; 1008 1007 u32 reply; 1009 1008 u32 replylong[4]; 1010 1009 }; 1011 1010 u32 is_fullsock; 1011 + u64 temp; /* temp and everything after is not 1012 + * initialized to 0 before calling 1013 + * the BPF program. New fields that 1014 + * should be initialized to 0 should 1015 + * be inserted before temp. 1016 + * temp is scratch storage used by 1017 + * sock_ops_convert_ctx_access 1018 + * as temporary storage of a register. 1019 + */ 1012 1020 }; 1013 1021 1014 1022 #endif /* __LINUX_FILTER_H__ */
+11
include/linux/tcp.h
··· 335 335 336 336 int linger2; 337 337 338 + 339 + /* Sock_ops bpf program related variables */ 340 + #ifdef CONFIG_BPF 341 + u8 bpf_sock_ops_cb_flags; /* Control calling BPF programs 342 + * values defined in uapi/linux/tcp.h 343 + */ 344 + #define BPF_SOCK_OPS_TEST_FLAG(TP, ARG) (TP->bpf_sock_ops_cb_flags & ARG) 345 + #else 346 + #define BPF_SOCK_OPS_TEST_FLAG(TP, ARG) 0 347 + #endif 348 + 338 349 /* Receiver side RTT estimation */ 339 350 struct { 340 351 u32 rtt_us;
+36 -6
include/net/tcp.h
··· 2006 2006 * program loaded). 2007 2007 */ 2008 2008 #ifdef CONFIG_BPF 2009 - static inline int tcp_call_bpf(struct sock *sk, int op) 2009 + static inline int tcp_call_bpf(struct sock *sk, int op, u32 nargs, u32 *args) 2010 2010 { 2011 2011 struct bpf_sock_ops_kern sock_ops; 2012 2012 int ret; 2013 2013 2014 - memset(&sock_ops, 0, sizeof(sock_ops)); 2014 + memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp)); 2015 2015 if (sk_fullsock(sk)) { 2016 2016 sock_ops.is_fullsock = 1; 2017 2017 sock_owned_by_me(sk); ··· 2019 2019 2020 2020 sock_ops.sk = sk; 2021 2021 sock_ops.op = op; 2022 + if (nargs > 0) 2023 + memcpy(sock_ops.args, args, nargs * sizeof(*args)); 2022 2024 2023 2025 ret = BPF_CGROUP_RUN_PROG_SOCK_OPS(&sock_ops); 2024 2026 if (ret == 0) ··· 2029 2027 ret = -1; 2030 2028 return ret; 2031 2029 } 2030 + 2031 + static inline int tcp_call_bpf_2arg(struct sock *sk, int op, u32 arg1, u32 arg2) 2032 + { 2033 + u32 args[2] = {arg1, arg2}; 2034 + 2035 + return tcp_call_bpf(sk, op, 2, args); 2036 + } 2037 + 2038 + static inline int tcp_call_bpf_3arg(struct sock *sk, int op, u32 arg1, u32 arg2, 2039 + u32 arg3) 2040 + { 2041 + u32 args[3] = {arg1, arg2, arg3}; 2042 + 2043 + return tcp_call_bpf(sk, op, 3, args); 2044 + } 2045 + 2032 2046 #else 2033 - static inline int tcp_call_bpf(struct sock *sk, int op) 2047 + static inline int tcp_call_bpf(struct sock *sk, int op, u32 nargs, u32 *args) 2034 2048 { 2035 2049 return -EPERM; 2036 2050 } 2051 + 2052 + static inline int tcp_call_bpf_2arg(struct sock *sk, int op, u32 arg1, u32 arg2) 2053 + { 2054 + return -EPERM; 2055 + } 2056 + 2057 + static inline int tcp_call_bpf_3arg(struct sock *sk, int op, u32 arg1, u32 arg2, 2058 + u32 arg3) 2059 + { 2060 + return -EPERM; 2061 + } 2062 + 2037 2063 #endif 2038 2064 2039 2065 static inline u32 tcp_timeout_init(struct sock *sk) 2040 2066 { 2041 2067 int timeout; 2042 2068 2043 - timeout = tcp_call_bpf(sk, BPF_SOCK_OPS_TIMEOUT_INIT); 2069 + timeout = tcp_call_bpf(sk, 
BPF_SOCK_OPS_TIMEOUT_INIT, 0, NULL); 2044 2070 2045 2071 if (timeout <= 0) 2046 2072 timeout = TCP_TIMEOUT_INIT; ··· 2079 2049 { 2080 2050 int rwnd; 2081 2051 2082 - rwnd = tcp_call_bpf(sk, BPF_SOCK_OPS_RWND_INIT); 2052 + rwnd = tcp_call_bpf(sk, BPF_SOCK_OPS_RWND_INIT, 0, NULL); 2083 2053 2084 2054 if (rwnd < 0) 2085 2055 rwnd = 0; ··· 2088 2058 2089 2059 static inline bool tcp_bpf_ca_needs_ecn(struct sock *sk) 2090 2060 { 2091 - return (tcp_call_bpf(sk, BPF_SOCK_OPS_NEEDS_ECN) == 1); 2061 + return (tcp_call_bpf(sk, BPF_SOCK_OPS_NEEDS_ECN, 0, NULL) == 1); 2092 2062 } 2093 2063 2094 2064 #if IS_ENABLED(CONFIG_SMC)
+81 -3
include/uapi/linux/bpf.h
··· 642 642 * @optlen: length of optval in bytes 643 643 * Return: 0 or negative error 644 644 * 645 + * int bpf_sock_ops_cb_flags_set(bpf_sock_ops, flags) 646 + * Set callback flags for sock_ops 647 + * @bpf_sock_ops: pointer to bpf_sock_ops_kern struct 648 + * @flags: flags value 649 + * Return: 0 for no error 650 + * -EINVAL if there is no full tcp socket 651 + * bits in flags that are not supported by current kernel 652 + * 645 653 * int bpf_skb_adjust_room(skb, len_diff, mode, flags) 646 654 * Grow or shrink room in sk_buff. 647 655 * @skb: pointer to skb ··· 756 748 FN(perf_event_read_value), \ 757 749 FN(perf_prog_read_value), \ 758 750 FN(getsockopt), \ 759 - FN(override_return), 751 + FN(override_return), \ 752 + FN(sock_ops_cb_flags_set), 760 753 761 754 /* integer value in 'imm' field of BPF_CALL instruction selects which helper 762 755 * function eBPF program intends to call ··· 961 952 struct bpf_sock_ops { 962 953 __u32 op; 963 954 union { 964 - __u32 reply; 965 - __u32 replylong[4]; 955 + __u32 args[4]; /* Optionally passed to bpf program */ 956 + __u32 reply; /* Returned by bpf program */ 957 + __u32 replylong[4]; /* Optionally returned by bpf prog */ 966 958 }; 967 959 __u32 family; 968 960 __u32 remote_ip4; /* Stored in network byte order */ ··· 978 968 */ 979 969 __u32 snd_cwnd; 980 970 __u32 srtt_us; /* Averaged RTT << 3 in usecs */ 971 + __u32 bpf_sock_ops_cb_flags; /* flags defined in uapi/linux/tcp.h */ 972 + __u32 state; 973 + __u32 rtt_min; 974 + __u32 snd_ssthresh; 975 + __u32 rcv_nxt; 976 + __u32 snd_nxt; 977 + __u32 snd_una; 978 + __u32 mss_cache; 979 + __u32 ecn_flags; 980 + __u32 rate_delivered; 981 + __u32 rate_interval_us; 982 + __u32 packets_out; 983 + __u32 retrans_out; 984 + __u32 total_retrans; 985 + __u32 segs_in; 986 + __u32 data_segs_in; 987 + __u32 segs_out; 988 + __u32 data_segs_out; 989 + __u32 lost_out; 990 + __u32 sacked_out; 991 + __u32 sk_txhash; 992 + __u64 bytes_received; 993 + __u64 bytes_acked; 981 994 }; 995 + 996 
+ /* Definitions for bpf_sock_ops_cb_flags */ 997 + #define BPF_SOCK_OPS_RTO_CB_FLAG (1<<0) 998 + #define BPF_SOCK_OPS_RETRANS_CB_FLAG (1<<1) 999 + #define BPF_SOCK_OPS_STATE_CB_FLAG (1<<2) 1000 + #define BPF_SOCK_OPS_ALL_CB_FLAGS 0x7 /* Mask of all currently 1001 + * supported cb flags 1002 + */ 982 1003 983 1004 /* List of known BPF sock_ops operators. 984 1005 * New entries can only be added at the end ··· 1044 1003 * a congestion threshold. RTTs above 1045 1004 * this indicate congestion 1046 1005 */ 1006 + BPF_SOCK_OPS_RTO_CB, /* Called when an RTO has triggered. 1007 + * Arg1: value of icsk_retransmits 1008 + * Arg2: value of icsk_rto 1009 + * Arg3: whether RTO has expired 1010 + */ 1011 + BPF_SOCK_OPS_RETRANS_CB, /* Called when skb is retransmitted. 1012 + * Arg1: sequence number of 1st byte 1013 + * Arg2: # segments 1014 + * Arg3: return value of 1015 + * tcp_transmit_skb (0 => success) 1016 + */ 1017 + BPF_SOCK_OPS_STATE_CB, /* Called when TCP changes state. 1018 + * Arg1: old_state 1019 + * Arg2: new_state 1020 + */ 1021 + }; 1022 + 1023 + /* List of TCP states. There is a build check in net/ipv4/tcp.c to detect 1024 + * changes between the TCP and BPF versions. Ideally this should never happen. 1025 + * If it does, we need to add code to convert them before calling 1026 + * the BPF sock_ops function. 1027 + */ 1028 + enum { 1029 + BPF_TCP_ESTABLISHED = 1, 1030 + BPF_TCP_SYN_SENT, 1031 + BPF_TCP_SYN_RECV, 1032 + BPF_TCP_FIN_WAIT1, 1033 + BPF_TCP_FIN_WAIT2, 1034 + BPF_TCP_TIME_WAIT, 1035 + BPF_TCP_CLOSE, 1036 + BPF_TCP_CLOSE_WAIT, 1037 + BPF_TCP_LAST_ACK, 1038 + BPF_TCP_LISTEN, 1039 + BPF_TCP_CLOSING, /* Now a valid state */ 1040 + BPF_TCP_NEW_SYN_RECV, 1041 + 1042 + BPF_TCP_MAX_STATES /* Leave at the end! */ 1047 1043 }; 1048 1044 1049 1045 #define TCP_BPF_IW 1001 /* Set TCP initial congestion window */
+145 -113
kernel/bpf/core.c
··· 782 782 } 783 783 EXPORT_SYMBOL_GPL(__bpf_call_base); 784 784 785 + /* All UAPI available opcodes. */ 786 + #define BPF_INSN_MAP(INSN_2, INSN_3) \ 787 + /* 32 bit ALU operations. */ \ 788 + /* Register based. */ \ 789 + INSN_3(ALU, ADD, X), \ 790 + INSN_3(ALU, SUB, X), \ 791 + INSN_3(ALU, AND, X), \ 792 + INSN_3(ALU, OR, X), \ 793 + INSN_3(ALU, LSH, X), \ 794 + INSN_3(ALU, RSH, X), \ 795 + INSN_3(ALU, XOR, X), \ 796 + INSN_3(ALU, MUL, X), \ 797 + INSN_3(ALU, MOV, X), \ 798 + INSN_3(ALU, DIV, X), \ 799 + INSN_3(ALU, MOD, X), \ 800 + INSN_2(ALU, NEG), \ 801 + INSN_3(ALU, END, TO_BE), \ 802 + INSN_3(ALU, END, TO_LE), \ 803 + /* Immediate based. */ \ 804 + INSN_3(ALU, ADD, K), \ 805 + INSN_3(ALU, SUB, K), \ 806 + INSN_3(ALU, AND, K), \ 807 + INSN_3(ALU, OR, K), \ 808 + INSN_3(ALU, LSH, K), \ 809 + INSN_3(ALU, RSH, K), \ 810 + INSN_3(ALU, XOR, K), \ 811 + INSN_3(ALU, MUL, K), \ 812 + INSN_3(ALU, MOV, K), \ 813 + INSN_3(ALU, DIV, K), \ 814 + INSN_3(ALU, MOD, K), \ 815 + /* 64 bit ALU operations. */ \ 816 + /* Register based. */ \ 817 + INSN_3(ALU64, ADD, X), \ 818 + INSN_3(ALU64, SUB, X), \ 819 + INSN_3(ALU64, AND, X), \ 820 + INSN_3(ALU64, OR, X), \ 821 + INSN_3(ALU64, LSH, X), \ 822 + INSN_3(ALU64, RSH, X), \ 823 + INSN_3(ALU64, XOR, X), \ 824 + INSN_3(ALU64, MUL, X), \ 825 + INSN_3(ALU64, MOV, X), \ 826 + INSN_3(ALU64, ARSH, X), \ 827 + INSN_3(ALU64, DIV, X), \ 828 + INSN_3(ALU64, MOD, X), \ 829 + INSN_2(ALU64, NEG), \ 830 + /* Immediate based. */ \ 831 + INSN_3(ALU64, ADD, K), \ 832 + INSN_3(ALU64, SUB, K), \ 833 + INSN_3(ALU64, AND, K), \ 834 + INSN_3(ALU64, OR, K), \ 835 + INSN_3(ALU64, LSH, K), \ 836 + INSN_3(ALU64, RSH, K), \ 837 + INSN_3(ALU64, XOR, K), \ 838 + INSN_3(ALU64, MUL, K), \ 839 + INSN_3(ALU64, MOV, K), \ 840 + INSN_3(ALU64, ARSH, K), \ 841 + INSN_3(ALU64, DIV, K), \ 842 + INSN_3(ALU64, MOD, K), \ 843 + /* Call instruction. */ \ 844 + INSN_2(JMP, CALL), \ 845 + /* Exit instruction. */ \ 846 + INSN_2(JMP, EXIT), \ 847 + /* Jump instructions. 
*/ \ 848 + /* Register based. */ \ 849 + INSN_3(JMP, JEQ, X), \ 850 + INSN_3(JMP, JNE, X), \ 851 + INSN_3(JMP, JGT, X), \ 852 + INSN_3(JMP, JLT, X), \ 853 + INSN_3(JMP, JGE, X), \ 854 + INSN_3(JMP, JLE, X), \ 855 + INSN_3(JMP, JSGT, X), \ 856 + INSN_3(JMP, JSLT, X), \ 857 + INSN_3(JMP, JSGE, X), \ 858 + INSN_3(JMP, JSLE, X), \ 859 + INSN_3(JMP, JSET, X), \ 860 + /* Immediate based. */ \ 861 + INSN_3(JMP, JEQ, K), \ 862 + INSN_3(JMP, JNE, K), \ 863 + INSN_3(JMP, JGT, K), \ 864 + INSN_3(JMP, JLT, K), \ 865 + INSN_3(JMP, JGE, K), \ 866 + INSN_3(JMP, JLE, K), \ 867 + INSN_3(JMP, JSGT, K), \ 868 + INSN_3(JMP, JSLT, K), \ 869 + INSN_3(JMP, JSGE, K), \ 870 + INSN_3(JMP, JSLE, K), \ 871 + INSN_3(JMP, JSET, K), \ 872 + INSN_2(JMP, JA), \ 873 + /* Store instructions. */ \ 874 + /* Register based. */ \ 875 + INSN_3(STX, MEM, B), \ 876 + INSN_3(STX, MEM, H), \ 877 + INSN_3(STX, MEM, W), \ 878 + INSN_3(STX, MEM, DW), \ 879 + INSN_3(STX, XADD, W), \ 880 + INSN_3(STX, XADD, DW), \ 881 + /* Immediate based. */ \ 882 + INSN_3(ST, MEM, B), \ 883 + INSN_3(ST, MEM, H), \ 884 + INSN_3(ST, MEM, W), \ 885 + INSN_3(ST, MEM, DW), \ 886 + /* Load instructions. */ \ 887 + /* Register based. */ \ 888 + INSN_3(LDX, MEM, B), \ 889 + INSN_3(LDX, MEM, H), \ 890 + INSN_3(LDX, MEM, W), \ 891 + INSN_3(LDX, MEM, DW), \ 892 + /* Immediate based. */ \ 893 + INSN_3(LD, IMM, DW), \ 894 + /* Misc (old cBPF carry-over). */ \ 895 + INSN_3(LD, ABS, B), \ 896 + INSN_3(LD, ABS, H), \ 897 + INSN_3(LD, ABS, W), \ 898 + INSN_3(LD, IND, B), \ 899 + INSN_3(LD, IND, H), \ 900 + INSN_3(LD, IND, W) 901 + 902 + bool bpf_opcode_in_insntable(u8 code) 903 + { 904 + #define BPF_INSN_2_TBL(x, y) [BPF_##x | BPF_##y] = true 905 + #define BPF_INSN_3_TBL(x, y, z) [BPF_##x | BPF_##y | BPF_##z] = true 906 + static const bool public_insntable[256] = { 907 + [0 ... 255] = false, 908 + /* Now overwrite non-defaults ... 
*/ 909 + BPF_INSN_MAP(BPF_INSN_2_TBL, BPF_INSN_3_TBL), 910 + }; 911 + #undef BPF_INSN_3_TBL 912 + #undef BPF_INSN_2_TBL 913 + return public_insntable[code]; 914 + } 915 + 785 916 #ifndef CONFIG_BPF_JIT_ALWAYS_ON 786 917 /** 787 918 * __bpf_prog_run - run eBPF program on a given context ··· 924 793 static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u64 *stack) 925 794 { 926 795 u64 tmp; 796 + #define BPF_INSN_2_LBL(x, y) [BPF_##x | BPF_##y] = &&x##_##y 797 + #define BPF_INSN_3_LBL(x, y, z) [BPF_##x | BPF_##y | BPF_##z] = &&x##_##y##_##z 927 798 static const void *jumptable[256] = { 928 799 [0 ... 255] = &&default_label, 929 800 /* Now overwrite non-defaults ... */ 930 - /* 32 bit ALU operations */ 931 - [BPF_ALU | BPF_ADD | BPF_X] = &&ALU_ADD_X, 932 - [BPF_ALU | BPF_ADD | BPF_K] = &&ALU_ADD_K, 933 - [BPF_ALU | BPF_SUB | BPF_X] = &&ALU_SUB_X, 934 - [BPF_ALU | BPF_SUB | BPF_K] = &&ALU_SUB_K, 935 - [BPF_ALU | BPF_AND | BPF_X] = &&ALU_AND_X, 936 - [BPF_ALU | BPF_AND | BPF_K] = &&ALU_AND_K, 937 - [BPF_ALU | BPF_OR | BPF_X] = &&ALU_OR_X, 938 - [BPF_ALU | BPF_OR | BPF_K] = &&ALU_OR_K, 939 - [BPF_ALU | BPF_LSH | BPF_X] = &&ALU_LSH_X, 940 - [BPF_ALU | BPF_LSH | BPF_K] = &&ALU_LSH_K, 941 - [BPF_ALU | BPF_RSH | BPF_X] = &&ALU_RSH_X, 942 - [BPF_ALU | BPF_RSH | BPF_K] = &&ALU_RSH_K, 943 - [BPF_ALU | BPF_XOR | BPF_X] = &&ALU_XOR_X, 944 - [BPF_ALU | BPF_XOR | BPF_K] = &&ALU_XOR_K, 945 - [BPF_ALU | BPF_MUL | BPF_X] = &&ALU_MUL_X, 946 - [BPF_ALU | BPF_MUL | BPF_K] = &&ALU_MUL_K, 947 - [BPF_ALU | BPF_MOV | BPF_X] = &&ALU_MOV_X, 948 - [BPF_ALU | BPF_MOV | BPF_K] = &&ALU_MOV_K, 949 - [BPF_ALU | BPF_DIV | BPF_X] = &&ALU_DIV_X, 950 - [BPF_ALU | BPF_DIV | BPF_K] = &&ALU_DIV_K, 951 - [BPF_ALU | BPF_MOD | BPF_X] = &&ALU_MOD_X, 952 - [BPF_ALU | BPF_MOD | BPF_K] = &&ALU_MOD_K, 953 - [BPF_ALU | BPF_NEG] = &&ALU_NEG, 954 - [BPF_ALU | BPF_END | BPF_TO_BE] = &&ALU_END_TO_BE, 955 - [BPF_ALU | BPF_END | BPF_TO_LE] = &&ALU_END_TO_LE, 956 - /* 64 bit ALU operations */ 957 - [BPF_ALU64 
| BPF_ADD | BPF_X] = &&ALU64_ADD_X, 958 - [BPF_ALU64 | BPF_ADD | BPF_K] = &&ALU64_ADD_K, 959 - [BPF_ALU64 | BPF_SUB | BPF_X] = &&ALU64_SUB_X, 960 - [BPF_ALU64 | BPF_SUB | BPF_K] = &&ALU64_SUB_K, 961 - [BPF_ALU64 | BPF_AND | BPF_X] = &&ALU64_AND_X, 962 - [BPF_ALU64 | BPF_AND | BPF_K] = &&ALU64_AND_K, 963 - [BPF_ALU64 | BPF_OR | BPF_X] = &&ALU64_OR_X, 964 - [BPF_ALU64 | BPF_OR | BPF_K] = &&ALU64_OR_K, 965 - [BPF_ALU64 | BPF_LSH | BPF_X] = &&ALU64_LSH_X, 966 - [BPF_ALU64 | BPF_LSH | BPF_K] = &&ALU64_LSH_K, 967 - [BPF_ALU64 | BPF_RSH | BPF_X] = &&ALU64_RSH_X, 968 - [BPF_ALU64 | BPF_RSH | BPF_K] = &&ALU64_RSH_K, 969 - [BPF_ALU64 | BPF_XOR | BPF_X] = &&ALU64_XOR_X, 970 - [BPF_ALU64 | BPF_XOR | BPF_K] = &&ALU64_XOR_K, 971 - [BPF_ALU64 | BPF_MUL | BPF_X] = &&ALU64_MUL_X, 972 - [BPF_ALU64 | BPF_MUL | BPF_K] = &&ALU64_MUL_K, 973 - [BPF_ALU64 | BPF_MOV | BPF_X] = &&ALU64_MOV_X, 974 - [BPF_ALU64 | BPF_MOV | BPF_K] = &&ALU64_MOV_K, 975 - [BPF_ALU64 | BPF_ARSH | BPF_X] = &&ALU64_ARSH_X, 976 - [BPF_ALU64 | BPF_ARSH | BPF_K] = &&ALU64_ARSH_K, 977 - [BPF_ALU64 | BPF_DIV | BPF_X] = &&ALU64_DIV_X, 978 - [BPF_ALU64 | BPF_DIV | BPF_K] = &&ALU64_DIV_K, 979 - [BPF_ALU64 | BPF_MOD | BPF_X] = &&ALU64_MOD_X, 980 - [BPF_ALU64 | BPF_MOD | BPF_K] = &&ALU64_MOD_K, 981 - [BPF_ALU64 | BPF_NEG] = &&ALU64_NEG, 982 - /* Call instruction */ 983 - [BPF_JMP | BPF_CALL] = &&JMP_CALL, 801 + BPF_INSN_MAP(BPF_INSN_2_LBL, BPF_INSN_3_LBL), 802 + /* Non-UAPI available opcodes. 
*/ 984 803 [BPF_JMP | BPF_CALL_ARGS] = &&JMP_CALL_ARGS, 985 804 [BPF_JMP | BPF_TAIL_CALL] = &&JMP_TAIL_CALL, 986 - /* Jumps */ 987 - [BPF_JMP | BPF_JA] = &&JMP_JA, 988 - [BPF_JMP | BPF_JEQ | BPF_X] = &&JMP_JEQ_X, 989 - [BPF_JMP | BPF_JEQ | BPF_K] = &&JMP_JEQ_K, 990 - [BPF_JMP | BPF_JNE | BPF_X] = &&JMP_JNE_X, 991 - [BPF_JMP | BPF_JNE | BPF_K] = &&JMP_JNE_K, 992 - [BPF_JMP | BPF_JGT | BPF_X] = &&JMP_JGT_X, 993 - [BPF_JMP | BPF_JGT | BPF_K] = &&JMP_JGT_K, 994 - [BPF_JMP | BPF_JLT | BPF_X] = &&JMP_JLT_X, 995 - [BPF_JMP | BPF_JLT | BPF_K] = &&JMP_JLT_K, 996 - [BPF_JMP | BPF_JGE | BPF_X] = &&JMP_JGE_X, 997 - [BPF_JMP | BPF_JGE | BPF_K] = &&JMP_JGE_K, 998 - [BPF_JMP | BPF_JLE | BPF_X] = &&JMP_JLE_X, 999 - [BPF_JMP | BPF_JLE | BPF_K] = &&JMP_JLE_K, 1000 - [BPF_JMP | BPF_JSGT | BPF_X] = &&JMP_JSGT_X, 1001 - [BPF_JMP | BPF_JSGT | BPF_K] = &&JMP_JSGT_K, 1002 - [BPF_JMP | BPF_JSLT | BPF_X] = &&JMP_JSLT_X, 1003 - [BPF_JMP | BPF_JSLT | BPF_K] = &&JMP_JSLT_K, 1004 - [BPF_JMP | BPF_JSGE | BPF_X] = &&JMP_JSGE_X, 1005 - [BPF_JMP | BPF_JSGE | BPF_K] = &&JMP_JSGE_K, 1006 - [BPF_JMP | BPF_JSLE | BPF_X] = &&JMP_JSLE_X, 1007 - [BPF_JMP | BPF_JSLE | BPF_K] = &&JMP_JSLE_K, 1008 - [BPF_JMP | BPF_JSET | BPF_X] = &&JMP_JSET_X, 1009 - [BPF_JMP | BPF_JSET | BPF_K] = &&JMP_JSET_K, 1010 - /* Program return */ 1011 - [BPF_JMP | BPF_EXIT] = &&JMP_EXIT, 1012 - /* Store instructions */ 1013 - [BPF_STX | BPF_MEM | BPF_B] = &&STX_MEM_B, 1014 - [BPF_STX | BPF_MEM | BPF_H] = &&STX_MEM_H, 1015 - [BPF_STX | BPF_MEM | BPF_W] = &&STX_MEM_W, 1016 - [BPF_STX | BPF_MEM | BPF_DW] = &&STX_MEM_DW, 1017 - [BPF_STX | BPF_XADD | BPF_W] = &&STX_XADD_W, 1018 - [BPF_STX | BPF_XADD | BPF_DW] = &&STX_XADD_DW, 1019 - [BPF_ST | BPF_MEM | BPF_B] = &&ST_MEM_B, 1020 - [BPF_ST | BPF_MEM | BPF_H] = &&ST_MEM_H, 1021 - [BPF_ST | BPF_MEM | BPF_W] = &&ST_MEM_W, 1022 - [BPF_ST | BPF_MEM | BPF_DW] = &&ST_MEM_DW, 1023 - /* Load instructions */ 1024 - [BPF_LDX | BPF_MEM | BPF_B] = &&LDX_MEM_B, 1025 - [BPF_LDX | BPF_MEM | BPF_H] = 
&&LDX_MEM_H, 1026 - [BPF_LDX | BPF_MEM | BPF_W] = &&LDX_MEM_W, 1027 - [BPF_LDX | BPF_MEM | BPF_DW] = &&LDX_MEM_DW, 1028 - [BPF_LD | BPF_ABS | BPF_W] = &&LD_ABS_W, 1029 - [BPF_LD | BPF_ABS | BPF_H] = &&LD_ABS_H, 1030 - [BPF_LD | BPF_ABS | BPF_B] = &&LD_ABS_B, 1031 - [BPF_LD | BPF_IND | BPF_W] = &&LD_IND_W, 1032 - [BPF_LD | BPF_IND | BPF_H] = &&LD_IND_H, 1033 - [BPF_LD | BPF_IND | BPF_B] = &&LD_IND_B, 1034 - [BPF_LD | BPF_IMM | BPF_DW] = &&LD_IMM_DW, 1035 805 }; 806 + #undef BPF_INSN_3_LBL 807 + #undef BPF_INSN_2_LBL 1036 808 u32 tail_call_cnt = 0; 1037 809 void *ptr; 1038 810 int off; ··· 999 965 (*(s64 *) &DST) >>= IMM; 1000 966 CONT; 1001 967 ALU64_MOD_X: 1002 - if (unlikely(SRC == 0)) 1003 - return 0; 1004 968 div64_u64_rem(DST, SRC, &tmp); 1005 969 DST = tmp; 1006 970 CONT; 1007 971 ALU_MOD_X: 1008 - if (unlikely((u32)SRC == 0)) 1009 - return 0; 1010 972 tmp = (u32) DST; 1011 973 DST = do_div(tmp, (u32) SRC); 1012 974 CONT; ··· 1015 985 DST = do_div(tmp, (u32) IMM); 1016 986 CONT; 1017 987 ALU64_DIV_X: 1018 - if (unlikely(SRC == 0)) 1019 - return 0; 1020 988 DST = div64_u64(DST, SRC); 1021 989 CONT; 1022 990 ALU_DIV_X: 1023 - if (unlikely((u32)SRC == 0)) 1024 - return 0; 1025 991 tmp = (u32) DST; 1026 992 do_div(tmp, (u32) SRC); 1027 993 DST = (u32) tmp; ··· 1328 1302 goto load_byte; 1329 1303 1330 1304 default_label: 1331 - /* If we ever reach this, we have a bug somewhere. */ 1332 - WARN_RATELIMIT(1, "unknown opcode %02x\n", insn->code); 1305 + /* If we ever reach this, we have a bug somewhere. Die hard here 1306 + * instead of just returning 0; we could be somewhere in a subprog, 1307 + * so execution could continue otherwise which we do /not/ want. 1308 + * 1309 + * Note, verifier whitelists all opcodes in bpf_opcode_in_insntable(). 1310 + */ 1311 + pr_warn("BPF interpreter: unknown opcode %02x\n", insn->code); 1312 + BUG_ON(1); 1333 1313 return 0; 1334 1314 } 1335 1315 STACK_FRAME_NON_STANDARD(___bpf_prog_run); /* jump table */
+12 -16
kernel/bpf/lpm_trie.c
··· 593 593 594 594 static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key) 595 595 { 596 + struct lpm_trie_node *node, *next_node = NULL, *parent, *search_root; 596 597 struct lpm_trie *trie = container_of(map, struct lpm_trie, map); 597 598 struct bpf_lpm_trie_key *key = _key, *next_key = _next_key; 598 - struct lpm_trie_node *node, *next_node = NULL, *parent; 599 599 struct lpm_trie_node **node_stack = NULL; 600 - struct lpm_trie_node __rcu **root; 601 600 int err = 0, stack_ptr = -1; 602 601 unsigned int next_bit; 603 602 size_t matchlen; ··· 613 614 */ 614 615 615 616 /* Empty trie */ 616 - if (!rcu_dereference(trie->root)) 617 + search_root = rcu_dereference(trie->root); 618 + if (!search_root) 617 619 return -ENOENT; 618 620 619 621 /* For invalid key, find the leftmost node in the trie */ 620 - if (!key || key->prefixlen > trie->max_prefixlen) { 621 - root = &trie->root; 622 + if (!key || key->prefixlen > trie->max_prefixlen) 622 623 goto find_leftmost; 623 - } 624 624 625 625 node_stack = kmalloc(trie->max_prefixlen * sizeof(struct lpm_trie_node *), 626 - GFP_USER | __GFP_NOWARN); 626 + GFP_ATOMIC | __GFP_NOWARN); 627 627 if (!node_stack) 628 628 return -ENOMEM; 629 629 630 630 /* Try to find the exact node for the given key */ 631 - for (node = rcu_dereference(trie->root); node;) { 631 + for (node = search_root; node;) { 632 632 node_stack[++stack_ptr] = node; 633 633 matchlen = longest_prefix_match(trie, node, key); 634 634 if (node->prefixlen != matchlen || ··· 638 640 node = rcu_dereference(node->child[next_bit]); 639 641 } 640 642 if (!node || node->prefixlen != key->prefixlen || 641 - (node->flags & LPM_TREE_NODE_FLAG_IM)) { 642 - root = &trie->root; 643 + (node->flags & LPM_TREE_NODE_FLAG_IM)) 643 644 goto find_leftmost; 644 - } 645 645 646 646 /* The node with the exactly-matching key has been found, 647 647 * find the first node in postorder after the matched node. 
··· 647 651 node = node_stack[stack_ptr]; 648 652 while (stack_ptr > 0) { 649 653 parent = node_stack[stack_ptr - 1]; 650 - if (rcu_dereference(parent->child[0]) == node && 651 - rcu_dereference(parent->child[1])) { 652 - root = &parent->child[1]; 653 - goto find_leftmost; 654 + if (rcu_dereference(parent->child[0]) == node) { 655 + search_root = rcu_dereference(parent->child[1]); 656 + if (search_root) 657 + goto find_leftmost; 654 658 } 655 659 if (!(parent->flags & LPM_TREE_NODE_FLAG_IM)) { 656 660 next_node = parent; ··· 669 673 /* Find the leftmost non-intermediate node, all intermediate nodes 670 674 * have exact two children, so this function will never return NULL. 671 675 */ 672 - for (node = rcu_dereference(*root); node;) { 676 + for (node = search_root; node;) { 673 677 if (!(node->flags & LPM_TREE_NODE_FLAG_IM)) 674 678 next_node = node; 675 679 node = rcu_dereference(node->child[0]);
+1 -4
kernel/bpf/syscall.c
··· 709 709 err = bpf_percpu_hash_update(map, key, value, attr->flags); 710 710 } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { 711 711 err = bpf_percpu_array_update(map, key, value, attr->flags); 712 - } else if (map->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || 713 - map->map_type == BPF_MAP_TYPE_PROG_ARRAY || 714 - map->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || 715 - map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS) { 712 + } else if (IS_FD_ARRAY(map)) { 716 713 rcu_read_lock(); 717 714 err = bpf_fd_array_map_update_elem(map, f.file, key, value, 718 715 attr->flags);
+49 -13
kernel/bpf/verifier.c
··· 4981 4981 next_insn: 4982 4982 insn++; 4983 4983 i++; 4984 + continue; 4985 + } 4986 + 4987 + /* Basic sanity check before we invest more work here. */ 4988 + if (!bpf_opcode_in_insntable(insn->code)) { 4989 + verbose(env, "unknown opcode %02x\n", insn->code); 4990 + return -EINVAL; 4984 4991 } 4985 4992 } 4986 4993 ··· 5071 5064 return new_prog; 5072 5065 } 5073 5066 5074 - /* The verifier does more data flow analysis than llvm and will not explore 5075 - * branches that are dead at run time. Malicious programs can have dead code 5076 - * too. Therefore replace all dead at-run-time code with nops. 5067 + /* The verifier does more data flow analysis than llvm and will not 5068 + * explore branches that are dead at run time. Malicious programs can 5069 + * have dead code too. Therefore replace all dead at-run-time code 5070 + * with 'ja -1'. 5071 + * 5072 + * Just nops are not optimal, e.g. if they would sit at the end of the 5073 + * program and through another bug we would manage to jump there, then 5074 + * we'd execute beyond program memory otherwise. Returning exception 5075 + * code also wouldn't work since we can have subprogs where the dead 5076 + * code could be located. 
5077 5077 */ 5078 5078 static void sanitize_dead_code(struct bpf_verifier_env *env) 5079 5079 { 5080 5080 struct bpf_insn_aux_data *aux_data = env->insn_aux_data; 5081 - struct bpf_insn nop = BPF_MOV64_REG(BPF_REG_0, BPF_REG_0); 5081 + struct bpf_insn trap = BPF_JMP_IMM(BPF_JA, 0, 0, -1); 5082 5082 struct bpf_insn *insn = env->prog->insnsi; 5083 5083 const int insn_cnt = env->prog->len; 5084 5084 int i; ··· 5093 5079 for (i = 0; i < insn_cnt; i++) { 5094 5080 if (aux_data[i].seen) 5095 5081 continue; 5096 - memcpy(insn + i, &nop, sizeof(nop)); 5082 + memcpy(insn + i, &trap, sizeof(trap)); 5097 5083 } 5098 5084 } 5099 5085 ··· 5400 5386 int i, cnt, delta = 0; 5401 5387 5402 5388 for (i = 0; i < insn_cnt; i++, insn++) { 5403 - if (insn->code == (BPF_ALU | BPF_MOD | BPF_X) || 5389 + if (insn->code == (BPF_ALU64 | BPF_MOD | BPF_X) || 5390 + insn->code == (BPF_ALU64 | BPF_DIV | BPF_X) || 5391 + insn->code == (BPF_ALU | BPF_MOD | BPF_X) || 5404 5392 insn->code == (BPF_ALU | BPF_DIV | BPF_X)) { 5405 - /* due to JIT bugs clear upper 32-bits of src register 5406 - * before div/mod operation 5407 - */ 5408 - insn_buf[0] = BPF_MOV32_REG(insn->src_reg, insn->src_reg); 5409 - insn_buf[1] = *insn; 5410 - cnt = 2; 5411 - new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); 5393 + bool is64 = BPF_CLASS(insn->code) == BPF_ALU64; 5394 + struct bpf_insn mask_and_div[] = { 5395 + BPF_MOV32_REG(insn->src_reg, insn->src_reg), 5396 + /* Rx div 0 -> 0 */ 5397 + BPF_JMP_IMM(BPF_JNE, insn->src_reg, 0, 2), 5398 + BPF_ALU32_REG(BPF_XOR, insn->dst_reg, insn->dst_reg), 5399 + BPF_JMP_IMM(BPF_JA, 0, 0, 1), 5400 + *insn, 5401 + }; 5402 + struct bpf_insn mask_and_mod[] = { 5403 + BPF_MOV32_REG(insn->src_reg, insn->src_reg), 5404 + /* Rx mod 0 -> Rx */ 5405 + BPF_JMP_IMM(BPF_JEQ, insn->src_reg, 0, 1), 5406 + *insn, 5407 + }; 5408 + struct bpf_insn *patchlet; 5409 + 5410 + if (insn->code == (BPF_ALU64 | BPF_DIV | BPF_X) || 5411 + insn->code == (BPF_ALU | BPF_DIV | BPF_X)) { 5412 + patchlet 
= mask_and_div + (is64 ? 1 : 0); 5413 + cnt = ARRAY_SIZE(mask_and_div) - (is64 ? 1 : 0); 5414 + } else { 5415 + patchlet = mask_and_mod + (is64 ? 1 : 0); 5416 + cnt = ARRAY_SIZE(mask_and_mod) - (is64 ? 1 : 0); 5417 + } 5418 + 5419 + new_prog = bpf_patch_insn_data(env, i + delta, patchlet, cnt); 5412 5420 if (!new_prog) 5413 5421 return -ENOMEM; 5414 5422
+6 -2
lib/test_bpf.c
··· 2003 2003 { { 4, 0 }, { 5, 10 } } 2004 2004 }, 2005 2005 { 2006 - "INT: DIV by zero", 2006 + /* This one doesn't go through verifier, but is just raw insn 2007 + * as opposed to cBPF tests from here. Thus div by 0 tests are 2008 + * done in test_verifier in BPF kselftests. 2009 + */ 2010 + "INT: DIV by -1", 2007 2011 .u.insns_int = { 2008 2012 BPF_ALU64_REG(BPF_MOV, R6, R1), 2009 - BPF_ALU64_IMM(BPF_MOV, R7, 0), 2013 + BPF_ALU64_IMM(BPF_MOV, R7, -1), 2010 2014 BPF_LD_ABS(BPF_B, 3), 2011 2015 BPF_ALU32_REG(BPF_DIV, R0, R7), 2012 2016 BPF_EXIT_INSN(),
+282 -27
net/core/filter.c
··· 401 401 /* Classic BPF expects A and X to be reset first. These need 402 402 * to be guaranteed to be the first two instructions. 403 403 */ 404 - *new_insn++ = BPF_ALU64_REG(BPF_XOR, BPF_REG_A, BPF_REG_A); 405 - *new_insn++ = BPF_ALU64_REG(BPF_XOR, BPF_REG_X, BPF_REG_X); 404 + *new_insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_A); 405 + *new_insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_X, BPF_REG_X); 406 406 407 407 /* All programs must keep CTX in callee saved BPF_REG_CTX. 408 408 * In eBPF case it's done by the compiler, here we need to ··· 459 459 break; 460 460 461 461 if (fp->code == (BPF_ALU | BPF_DIV | BPF_X) || 462 - fp->code == (BPF_ALU | BPF_MOD | BPF_X)) 462 + fp->code == (BPF_ALU | BPF_MOD | BPF_X)) { 463 463 *insn++ = BPF_MOV32_REG(BPF_REG_X, BPF_REG_X); 464 + /* Error with exception code on div/mod by 0. 465 + * For cBPF programs, this was always return 0. 466 + */ 467 + *insn++ = BPF_JMP_IMM(BPF_JNE, BPF_REG_X, 0, 2); 468 + *insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_A); 469 + *insn++ = BPF_EXIT_INSN(); 470 + } 464 471 465 472 *insn = BPF_RAW_INSN(fp->code, BPF_REG_A, BPF_REG_X, 0, fp->k); 466 473 break; ··· 3239 3232 ret = -EINVAL; 3240 3233 } 3241 3234 #ifdef CONFIG_INET 3235 + #if IS_ENABLED(CONFIG_IPV6) 3236 + } else if (level == SOL_IPV6) { 3237 + if (optlen != sizeof(int) || sk->sk_family != AF_INET6) 3238 + return -EINVAL; 3239 + 3240 + val = *((int *)optval); 3241 + /* Only some options are supported */ 3242 + switch (optname) { 3243 + case IPV6_TCLASS: 3244 + if (val < -1 || val > 0xff) { 3245 + ret = -EINVAL; 3246 + } else { 3247 + struct ipv6_pinfo *np = inet6_sk(sk); 3248 + 3249 + if (val == -1) 3250 + val = 0; 3251 + np->tclass = val; 3252 + } 3253 + break; 3254 + default: 3255 + ret = -EINVAL; 3256 + } 3257 + #endif 3242 3258 } else if (level == SOL_TCP && 3243 3259 sk->sk_prot->setsockopt == tcp_setsockopt) { 3244 3260 if (optname == TCP_CONGESTION) { ··· 3271 3241 strncpy(name, optval, min_t(long, optlen, 3272 3242 
TCP_CA_NAME_MAX-1)); 3273 3243 name[TCP_CA_NAME_MAX-1] = 0; 3274 - ret = tcp_set_congestion_control(sk, name, false, reinit); 3244 + ret = tcp_set_congestion_control(sk, name, false, 3245 + reinit); 3275 3246 } else { 3276 3247 struct tcp_sock *tp = tcp_sk(sk); 3277 3248 ··· 3338 3307 } else { 3339 3308 goto err_clear; 3340 3309 } 3310 + #if IS_ENABLED(CONFIG_IPV6) 3311 + } else if (level == SOL_IPV6) { 3312 + struct ipv6_pinfo *np = inet6_sk(sk); 3313 + 3314 + if (optlen != sizeof(int) || sk->sk_family != AF_INET6) 3315 + goto err_clear; 3316 + 3317 + /* Only some options are supported */ 3318 + switch (optname) { 3319 + case IPV6_TCLASS: 3320 + *((int *)optval) = (int)np->tclass; 3321 + break; 3322 + default: 3323 + goto err_clear; 3324 + } 3325 + #endif 3341 3326 } else { 3342 3327 goto err_clear; 3343 3328 } ··· 3373 3326 .arg3_type = ARG_ANYTHING, 3374 3327 .arg4_type = ARG_PTR_TO_UNINIT_MEM, 3375 3328 .arg5_type = ARG_CONST_SIZE, 3329 + }; 3330 + 3331 + BPF_CALL_2(bpf_sock_ops_cb_flags_set, struct bpf_sock_ops_kern *, bpf_sock, 3332 + int, argval) 3333 + { 3334 + struct sock *sk = bpf_sock->sk; 3335 + int val = argval & BPF_SOCK_OPS_ALL_CB_FLAGS; 3336 + 3337 + if (!sk_fullsock(sk)) 3338 + return -EINVAL; 3339 + 3340 + #ifdef CONFIG_INET 3341 + if (val) 3342 + tcp_sk(sk)->bpf_sock_ops_cb_flags = val; 3343 + 3344 + return argval & (~BPF_SOCK_OPS_ALL_CB_FLAGS); 3345 + #else 3346 + return -EINVAL; 3347 + #endif 3348 + } 3349 + 3350 + static const struct bpf_func_proto bpf_sock_ops_cb_flags_set_proto = { 3351 + .func = bpf_sock_ops_cb_flags_set, 3352 + .gpl_only = false, 3353 + .ret_type = RET_INTEGER, 3354 + .arg1_type = ARG_PTR_TO_CTX, 3355 + .arg2_type = ARG_ANYTHING, 3376 3356 }; 3377 3357 3378 3358 static const struct bpf_func_proto * ··· 3584 3510 return &bpf_setsockopt_proto; 3585 3511 case BPF_FUNC_getsockopt: 3586 3512 return &bpf_getsockopt_proto; 3513 + case BPF_FUNC_sock_ops_cb_flags_set: 3514 + return &bpf_sock_ops_cb_flags_set_proto; 3587 3515 case 
BPF_FUNC_sock_map_update: 3588 3516 return &bpf_sock_map_update_proto; 3589 3517 default: ··· 3902 3826 } 3903 3827 EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action); 3904 3828 3905 - static bool __is_valid_sock_ops_access(int off, int size) 3906 - { 3907 - if (off < 0 || off >= sizeof(struct bpf_sock_ops)) 3908 - return false; 3909 - /* The verifier guarantees that size > 0. */ 3910 - if (off % size != 0) 3911 - return false; 3912 - if (size != sizeof(__u32)) 3913 - return false; 3914 - 3915 - return true; 3916 - } 3917 - 3918 3829 static bool sock_ops_is_valid_access(int off, int size, 3919 3830 enum bpf_access_type type, 3920 3831 struct bpf_insn_access_aux *info) 3921 3832 { 3833 + const int size_default = sizeof(__u32); 3834 + 3835 + if (off < 0 || off >= sizeof(struct bpf_sock_ops)) 3836 + return false; 3837 + 3838 + /* The verifier guarantees that size > 0. */ 3839 + if (off % size != 0) 3840 + return false; 3841 + 3922 3842 if (type == BPF_WRITE) { 3923 3843 switch (off) { 3924 - case offsetof(struct bpf_sock_ops, op) ... 3925 - offsetof(struct bpf_sock_ops, replylong[3]): 3844 + case offsetof(struct bpf_sock_ops, reply): 3845 + case offsetof(struct bpf_sock_ops, sk_txhash): 3846 + if (size != size_default) 3847 + return false; 3926 3848 break; 3927 3849 default: 3928 3850 return false; 3929 3851 } 3852 + } else { 3853 + switch (off) { 3854 + case bpf_ctx_range_till(struct bpf_sock_ops, bytes_received, 3855 + bytes_acked): 3856 + if (size != sizeof(__u64)) 3857 + return false; 3858 + break; 3859 + default: 3860 + if (size != size_default) 3861 + return false; 3862 + break; 3863 + } 3930 3864 } 3931 3865 3932 - return __is_valid_sock_ops_access(off, size); 3866 + return true; 3933 3867 } 3934 3868 3935 3869 static int sk_skb_prologue(struct bpf_insn *insn_buf, bool direct_write, ··· 4556 4470 is_fullsock)); 4557 4471 break; 4558 4472 4559 - /* Helper macro for adding read access to tcp_sock fields. 
*/ 4560 - #define SOCK_OPS_GET_TCP32(FIELD_NAME) \ 4473 + case offsetof(struct bpf_sock_ops, state): 4474 + BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_state) != 1); 4475 + 4476 + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( 4477 + struct bpf_sock_ops_kern, sk), 4478 + si->dst_reg, si->src_reg, 4479 + offsetof(struct bpf_sock_ops_kern, sk)); 4480 + *insn++ = BPF_LDX_MEM(BPF_B, si->dst_reg, si->dst_reg, 4481 + offsetof(struct sock_common, skc_state)); 4482 + break; 4483 + 4484 + case offsetof(struct bpf_sock_ops, rtt_min): 4485 + BUILD_BUG_ON(FIELD_SIZEOF(struct tcp_sock, rtt_min) != 4486 + sizeof(struct minmax)); 4487 + BUILD_BUG_ON(sizeof(struct minmax) < 4488 + sizeof(struct minmax_sample)); 4489 + 4490 + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( 4491 + struct bpf_sock_ops_kern, sk), 4492 + si->dst_reg, si->src_reg, 4493 + offsetof(struct bpf_sock_ops_kern, sk)); 4494 + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, 4495 + offsetof(struct tcp_sock, rtt_min) + 4496 + FIELD_SIZEOF(struct minmax_sample, t)); 4497 + break; 4498 + 4499 + /* Helper macro for adding read access to tcp_sock or sock fields. */ 4500 + #define SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ) \ 4561 4501 do { \ 4562 - BUILD_BUG_ON(FIELD_SIZEOF(struct tcp_sock, FIELD_NAME) != 4); \ 4502 + BUILD_BUG_ON(FIELD_SIZEOF(OBJ, OBJ_FIELD) > \ 4503 + FIELD_SIZEOF(struct bpf_sock_ops, BPF_FIELD)); \ 4563 4504 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ 4564 4505 struct bpf_sock_ops_kern, \ 4565 4506 is_fullsock), \ ··· 4598 4485 struct bpf_sock_ops_kern, sk),\ 4599 4486 si->dst_reg, si->src_reg, \ 4600 4487 offsetof(struct bpf_sock_ops_kern, sk));\ 4601 - *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, \ 4602 - offsetof(struct tcp_sock, FIELD_NAME)); \ 4488 + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(OBJ, \ 4489 + OBJ_FIELD), \ 4490 + si->dst_reg, si->dst_reg, \ 4491 + offsetof(OBJ, OBJ_FIELD)); \ 4492 + } while (0) 4493 + 4494 + /* Helper macro for adding write access to tcp_sock or sock fields. 
4495 + * The macro is called with two registers, dst_reg which contains a pointer 4496 + * to ctx (context) and src_reg which contains the value that should be 4497 + * stored. However, we need an additional register since we cannot overwrite 4498 + * dst_reg because it may be used later in the program. 4499 + * Instead we "borrow" one of the other register. We first save its value 4500 + * into a new (temp) field in bpf_sock_ops_kern, use it, and then restore 4501 + * it at the end of the macro. 4502 + */ 4503 + #define SOCK_OPS_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ) \ 4504 + do { \ 4505 + int reg = BPF_REG_9; \ 4506 + BUILD_BUG_ON(FIELD_SIZEOF(OBJ, OBJ_FIELD) > \ 4507 + FIELD_SIZEOF(struct bpf_sock_ops, BPF_FIELD)); \ 4508 + if (si->dst_reg == reg || si->src_reg == reg) \ 4509 + reg--; \ 4510 + if (si->dst_reg == reg || si->src_reg == reg) \ 4511 + reg--; \ 4512 + *insn++ = BPF_STX_MEM(BPF_DW, si->dst_reg, reg, \ 4513 + offsetof(struct bpf_sock_ops_kern, \ 4514 + temp)); \ 4515 + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ 4516 + struct bpf_sock_ops_kern, \ 4517 + is_fullsock), \ 4518 + reg, si->dst_reg, \ 4519 + offsetof(struct bpf_sock_ops_kern, \ 4520 + is_fullsock)); \ 4521 + *insn++ = BPF_JMP_IMM(BPF_JEQ, reg, 0, 2); \ 4522 + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ 4523 + struct bpf_sock_ops_kern, sk),\ 4524 + reg, si->dst_reg, \ 4525 + offsetof(struct bpf_sock_ops_kern, sk));\ 4526 + *insn++ = BPF_STX_MEM(BPF_FIELD_SIZEOF(OBJ, OBJ_FIELD), \ 4527 + reg, si->src_reg, \ 4528 + offsetof(OBJ, OBJ_FIELD)); \ 4529 + *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->dst_reg, \ 4530 + offsetof(struct bpf_sock_ops_kern, \ 4531 + temp)); \ 4532 + } while (0) 4533 + 4534 + #define SOCK_OPS_GET_OR_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ, TYPE) \ 4535 + do { \ 4536 + if (TYPE == BPF_WRITE) \ 4537 + SOCK_OPS_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ); \ 4538 + else \ 4539 + SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ); \ 4603 4540 } while (0) 4604 4541 4605 4542 case offsetof(struct 
bpf_sock_ops, snd_cwnd): 4606 - SOCK_OPS_GET_TCP32(snd_cwnd); 4543 + SOCK_OPS_GET_FIELD(snd_cwnd, snd_cwnd, struct tcp_sock); 4607 4544 break; 4608 4545 4609 4546 case offsetof(struct bpf_sock_ops, srtt_us): 4610 - SOCK_OPS_GET_TCP32(srtt_us); 4547 + SOCK_OPS_GET_FIELD(srtt_us, srtt_us, struct tcp_sock); 4611 4548 break; 4549 + 4550 + case offsetof(struct bpf_sock_ops, bpf_sock_ops_cb_flags): 4551 + SOCK_OPS_GET_FIELD(bpf_sock_ops_cb_flags, bpf_sock_ops_cb_flags, 4552 + struct tcp_sock); 4553 + break; 4554 + 4555 + case offsetof(struct bpf_sock_ops, snd_ssthresh): 4556 + SOCK_OPS_GET_FIELD(snd_ssthresh, snd_ssthresh, struct tcp_sock); 4557 + break; 4558 + 4559 + case offsetof(struct bpf_sock_ops, rcv_nxt): 4560 + SOCK_OPS_GET_FIELD(rcv_nxt, rcv_nxt, struct tcp_sock); 4561 + break; 4562 + 4563 + case offsetof(struct bpf_sock_ops, snd_nxt): 4564 + SOCK_OPS_GET_FIELD(snd_nxt, snd_nxt, struct tcp_sock); 4565 + break; 4566 + 4567 + case offsetof(struct bpf_sock_ops, snd_una): 4568 + SOCK_OPS_GET_FIELD(snd_una, snd_una, struct tcp_sock); 4569 + break; 4570 + 4571 + case offsetof(struct bpf_sock_ops, mss_cache): 4572 + SOCK_OPS_GET_FIELD(mss_cache, mss_cache, struct tcp_sock); 4573 + break; 4574 + 4575 + case offsetof(struct bpf_sock_ops, ecn_flags): 4576 + SOCK_OPS_GET_FIELD(ecn_flags, ecn_flags, struct tcp_sock); 4577 + break; 4578 + 4579 + case offsetof(struct bpf_sock_ops, rate_delivered): 4580 + SOCK_OPS_GET_FIELD(rate_delivered, rate_delivered, 4581 + struct tcp_sock); 4582 + break; 4583 + 4584 + case offsetof(struct bpf_sock_ops, rate_interval_us): 4585 + SOCK_OPS_GET_FIELD(rate_interval_us, rate_interval_us, 4586 + struct tcp_sock); 4587 + break; 4588 + 4589 + case offsetof(struct bpf_sock_ops, packets_out): 4590 + SOCK_OPS_GET_FIELD(packets_out, packets_out, struct tcp_sock); 4591 + break; 4592 + 4593 + case offsetof(struct bpf_sock_ops, retrans_out): 4594 + SOCK_OPS_GET_FIELD(retrans_out, retrans_out, struct tcp_sock); 4595 + break; 4596 + 4597 + case 
offsetof(struct bpf_sock_ops, total_retrans): 4598 + SOCK_OPS_GET_FIELD(total_retrans, total_retrans, 4599 + struct tcp_sock); 4600 + break; 4601 + 4602 + case offsetof(struct bpf_sock_ops, segs_in): 4603 + SOCK_OPS_GET_FIELD(segs_in, segs_in, struct tcp_sock); 4604 + break; 4605 + 4606 + case offsetof(struct bpf_sock_ops, data_segs_in): 4607 + SOCK_OPS_GET_FIELD(data_segs_in, data_segs_in, struct tcp_sock); 4608 + break; 4609 + 4610 + case offsetof(struct bpf_sock_ops, segs_out): 4611 + SOCK_OPS_GET_FIELD(segs_out, segs_out, struct tcp_sock); 4612 + break; 4613 + 4614 + case offsetof(struct bpf_sock_ops, data_segs_out): 4615 + SOCK_OPS_GET_FIELD(data_segs_out, data_segs_out, 4616 + struct tcp_sock); 4617 + break; 4618 + 4619 + case offsetof(struct bpf_sock_ops, lost_out): 4620 + SOCK_OPS_GET_FIELD(lost_out, lost_out, struct tcp_sock); 4621 + break; 4622 + 4623 + case offsetof(struct bpf_sock_ops, sacked_out): 4624 + SOCK_OPS_GET_FIELD(sacked_out, sacked_out, struct tcp_sock); 4625 + break; 4626 + 4627 + case offsetof(struct bpf_sock_ops, sk_txhash): 4628 + SOCK_OPS_GET_OR_SET_FIELD(sk_txhash, sk_txhash, 4629 + struct sock, type); 4630 + break; 4631 + 4632 + case offsetof(struct bpf_sock_ops, bytes_received): 4633 + SOCK_OPS_GET_FIELD(bytes_received, bytes_received, 4634 + struct tcp_sock); 4635 + break; 4636 + 4637 + case offsetof(struct bpf_sock_ops, bytes_acked): 4638 + SOCK_OPS_GET_FIELD(bytes_acked, bytes_acked, struct tcp_sock); 4639 + break; 4640 + 4612 4641 } 4613 4642 return insn - insn_buf; 4614 4643 }
+25 -1
net/ipv4/tcp.c
··· 463 463 tcp_mtup_init(sk); 464 464 icsk->icsk_af_ops->rebuild_header(sk); 465 465 tcp_init_metrics(sk); 466 - tcp_call_bpf(sk, bpf_op); 466 + tcp_call_bpf(sk, bpf_op, 0, NULL); 467 467 tcp_init_congestion_control(sk); 468 468 tcp_init_buffer_space(sk); 469 469 } ··· 2041 2041 void tcp_set_state(struct sock *sk, int state) 2042 2042 { 2043 2043 int oldstate = sk->sk_state; 2044 + 2045 + /* We defined a new enum for TCP states that are exported in BPF 2046 + * so as not force the internal TCP states to be frozen. The 2047 + * following checks will detect if an internal state value ever 2048 + * differs from the BPF value. If this ever happens, then we will 2049 + * need to remap the internal value to the BPF value before calling 2050 + * tcp_call_bpf_2arg. 2051 + */ 2052 + BUILD_BUG_ON((int)BPF_TCP_ESTABLISHED != (int)TCP_ESTABLISHED); 2053 + BUILD_BUG_ON((int)BPF_TCP_SYN_SENT != (int)TCP_SYN_SENT); 2054 + BUILD_BUG_ON((int)BPF_TCP_SYN_RECV != (int)TCP_SYN_RECV); 2055 + BUILD_BUG_ON((int)BPF_TCP_FIN_WAIT1 != (int)TCP_FIN_WAIT1); 2056 + BUILD_BUG_ON((int)BPF_TCP_FIN_WAIT2 != (int)TCP_FIN_WAIT2); 2057 + BUILD_BUG_ON((int)BPF_TCP_TIME_WAIT != (int)TCP_TIME_WAIT); 2058 + BUILD_BUG_ON((int)BPF_TCP_CLOSE != (int)TCP_CLOSE); 2059 + BUILD_BUG_ON((int)BPF_TCP_CLOSE_WAIT != (int)TCP_CLOSE_WAIT); 2060 + BUILD_BUG_ON((int)BPF_TCP_LAST_ACK != (int)TCP_LAST_ACK); 2061 + BUILD_BUG_ON((int)BPF_TCP_LISTEN != (int)TCP_LISTEN); 2062 + BUILD_BUG_ON((int)BPF_TCP_CLOSING != (int)TCP_CLOSING); 2063 + BUILD_BUG_ON((int)BPF_TCP_NEW_SYN_RECV != (int)TCP_NEW_SYN_RECV); 2064 + BUILD_BUG_ON((int)BPF_TCP_MAX_STATES != (int)TCP_MAX_STATES); 2065 + 2066 + if (BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), BPF_SOCK_OPS_STATE_CB_FLAG)) 2067 + tcp_call_bpf_2arg(sk, BPF_SOCK_OPS_STATE_CB, oldstate, state); 2044 2068 2045 2069 switch (state) { 2046 2070 case TCP_ESTABLISHED:
+1 -1
net/ipv4/tcp_nv.c
··· 146 146 * within a datacenter, where we have reasonable estimates of 147 147 * RTTs 148 148 */ 149 - base_rtt = tcp_call_bpf(sk, BPF_SOCK_OPS_BASE_RTT); 149 + base_rtt = tcp_call_bpf(sk, BPF_SOCK_OPS_BASE_RTT, 0, NULL); 150 150 if (base_rtt > 0) { 151 151 ca->nv_base_rtt = base_rtt; 152 152 ca->nv_lower_bound_rtt = (base_rtt * 205) >> 8; /* 80% */
+5 -1
net/ipv4/tcp_output.c
··· 2905 2905 err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); 2906 2906 } 2907 2907 2908 + if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_RETRANS_CB_FLAG)) 2909 + tcp_call_bpf_3arg(sk, BPF_SOCK_OPS_RETRANS_CB, 2910 + TCP_SKB_CB(skb)->seq, segs, err); 2911 + 2908 2912 if (likely(!err)) { 2909 2913 TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS; 2910 2914 trace_tcp_retransmit_skb(sk, skb); ··· 3473 3469 struct sk_buff *buff; 3474 3470 int err; 3475 3471 3476 - tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_CONNECT_CB); 3472 + tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_CONNECT_CB, 0, NULL); 3477 3473 3478 3474 if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk)) 3479 3475 return -EHOSTUNREACH; /* Routing failure or similar. */
+7
net/ipv4/tcp_timer.c
··· 213 213 icsk->icsk_user_timeout); 214 214 } 215 215 tcp_fastopen_active_detect_blackhole(sk, expired); 216 + 217 + if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_RTO_CB_FLAG)) 218 + tcp_call_bpf_3arg(sk, BPF_SOCK_OPS_RTO_CB, 219 + icsk->icsk_retransmits, 220 + icsk->icsk_rto, (int)expired); 221 + 216 222 if (expired) { 217 223 /* Has it gone just too far? */ 218 224 tcp_write_err(sk); 219 225 return 1; 220 226 } 227 + 221 228 return 0; 222 229 } 223 230
+4 -1
samples/bpf/Makefile
··· 201 201 endif 202 202 203 203 # Trick to allow make to be run from this directory 204 - all: 204 + all: $(LIBBPF) 205 205 $(MAKE) -C ../../ $(CURDIR)/ 206 206 207 207 clean: 208 208 $(MAKE) -C ../../ M=$(CURDIR) clean 209 209 @rm -f *~ 210 + 211 + $(LIBBPF): FORCE 212 + $(MAKE) -C $(dir $@) $(notdir $@) 210 213 211 214 $(obj)/syscall_nrs.s: $(src)/syscall_nrs.c 212 215 $(call if_changed_dep,cc_s_c)
+340 -52
samples/sockmap/sockmap_user.c
··· 23 23 #include <stdbool.h> 24 24 #include <signal.h> 25 25 #include <fcntl.h> 26 + #include <sys/wait.h> 27 + #include <time.h> 26 28 27 29 #include <sys/time.h> 30 + #include <sys/resource.h> 28 31 #include <sys/types.h> 29 32 30 33 #include <linux/netlink.h> ··· 37 34 #include <linux/if_link.h> 38 35 #include <assert.h> 39 36 #include <libgen.h> 37 + 38 + #include <getopt.h> 40 39 41 40 #include "../bpf/bpf_load.h" 42 41 #include "../bpf/bpf_util.h" ··· 51 46 #define S1_PORT 10000 52 47 #define S2_PORT 10001 53 48 54 - static int sockmap_test_sockets(int rate, int dot) 49 + /* global sockets */ 50 + int s1, s2, c1, c2, p1, p2; 51 + 52 + static const struct option long_options[] = { 53 + {"help", no_argument, NULL, 'h' }, 54 + {"cgroup", required_argument, NULL, 'c' }, 55 + {"rate", required_argument, NULL, 'r' }, 56 + {"verbose", no_argument, NULL, 'v' }, 57 + {"iov_count", required_argument, NULL, 'i' }, 58 + {"length", required_argument, NULL, 'l' }, 59 + {"test", required_argument, NULL, 't' }, 60 + {0, 0, NULL, 0 } 61 + }; 62 + 63 + static void usage(char *argv[]) 55 64 { 56 - int i, sc, err, max_fd, one = 1; 57 - int s1, s2, c1, c2, p1, p2; 65 + int i; 66 + 67 + printf(" Usage: %s --cgroup <cgroup_path>\n", argv[0]); 68 + printf(" options:\n"); 69 + for (i = 0; long_options[i].name != 0; i++) { 70 + printf(" --%-12s", long_options[i].name); 71 + if (long_options[i].flag != NULL) 72 + printf(" flag (internal value:%d)\n", 73 + *long_options[i].flag); 74 + else 75 + printf(" -%c\n", long_options[i].val); 76 + } 77 + printf("\n"); 78 + } 79 + 80 + static int sockmap_init_sockets(void) 81 + { 82 + int i, err, one = 1; 58 83 struct sockaddr_in addr; 59 - struct timeval timeout; 60 - char buf[1024] = {0}; 61 84 int *fds[4] = {&s1, &s2, &c1, &c2}; 62 - fd_set w; 63 85 64 86 s1 = s2 = p1 = p2 = c1 = c2 = 0; 65 87 ··· 95 63 *fds[i] = socket(AF_INET, SOCK_STREAM, 0); 96 64 if (*fds[i] < 0) { 97 65 perror("socket s1 failed()"); 98 - err = *fds[i]; 99 - goto out; 66 
+ return errno; 100 67 } 101 68 } 102 69 ··· 105 74 (char *)&one, sizeof(one)); 106 75 if (err) { 107 76 perror("setsockopt failed()"); 108 - goto out; 77 + return errno; 109 78 } 110 79 } 111 80 112 81 /* Non-blocking sockets */ 113 - for (i = 0; i < 4; i++) { 82 + for (i = 0; i < 2; i++) { 114 83 err = ioctl(*fds[i], FIONBIO, (char *)&one); 115 84 if (err < 0) { 116 85 perror("ioctl s1 failed()"); 117 - goto out; 86 + return errno; 118 87 } 119 88 } 120 89 ··· 127 96 err = bind(s1, (struct sockaddr *)&addr, sizeof(addr)); 128 97 if (err < 0) { 129 98 perror("bind s1 failed()\n"); 130 - goto out; 99 + return errno; 131 100 } 132 101 133 102 addr.sin_port = htons(S2_PORT); 134 103 err = bind(s2, (struct sockaddr *)&addr, sizeof(addr)); 135 104 if (err < 0) { 136 105 perror("bind s2 failed()\n"); 137 - goto out; 106 + return errno; 138 107 } 139 108 140 109 /* Listen server sockets */ ··· 142 111 err = listen(s1, 32); 143 112 if (err < 0) { 144 113 perror("listen s1 failed()\n"); 145 - goto out; 114 + return errno; 146 115 } 147 116 148 117 addr.sin_port = htons(S2_PORT); 149 118 err = listen(s2, 32); 150 119 if (err < 0) { 151 120 perror("listen s1 failed()\n"); 152 - goto out; 121 + return errno; 153 122 } 154 123 155 124 /* Initiate Connect */ ··· 157 126 err = connect(c1, (struct sockaddr *)&addr, sizeof(addr)); 158 127 if (err < 0 && errno != EINPROGRESS) { 159 128 perror("connect c1 failed()\n"); 160 - goto out; 129 + return errno; 161 130 } 162 131 163 132 addr.sin_port = htons(S2_PORT); 164 133 err = connect(c2, (struct sockaddr *)&addr, sizeof(addr)); 165 134 if (err < 0 && errno != EINPROGRESS) { 166 135 perror("connect c2 failed()\n"); 167 - goto out; 136 + return errno; 137 + } else if (err < 0) { 138 + err = 0; 168 139 } 169 140 170 141 /* Accept Connecrtions */ 171 142 p1 = accept(s1, NULL, NULL); 172 143 if (p1 < 0) { 173 144 perror("accept s1 failed()\n"); 174 - goto out; 145 + return errno; 175 146 } 176 147 177 148 p2 = accept(s2, NULL, NULL); 178 
149 if (p2 < 0) { 179 150 perror("accept s1 failed()\n"); 180 - goto out; 151 + return errno; 181 152 } 182 - 183 - max_fd = p2; 184 - timeout.tv_sec = 10; 185 - timeout.tv_usec = 0; 186 153 187 154 printf("connected sockets: c1 <-> p1, c2 <-> p2\n"); 188 155 printf("cgroups binding: c1(%i) <-> s1(%i) - - - c2(%i) <-> s2(%i)\n", 189 156 c1, s1, c2, s2); 157 + return 0; 158 + } 159 + 160 + struct msg_stats { 161 + size_t bytes_sent; 162 + size_t bytes_recvd; 163 + struct timespec start; 164 + struct timespec end; 165 + }; 166 + 167 + static int msg_loop(int fd, int iov_count, int iov_length, int cnt, 168 + struct msg_stats *s, bool tx) 169 + { 170 + struct msghdr msg = {0}; 171 + int err, i, flags = MSG_NOSIGNAL; 172 + struct iovec *iov; 173 + 174 + iov = calloc(iov_count, sizeof(struct iovec)); 175 + if (!iov) 176 + return errno; 177 + 178 + for (i = 0; i < iov_count; i++) { 179 + char *d = calloc(iov_length, sizeof(char)); 180 + 181 + if (!d) { 182 + fprintf(stderr, "iov_count %i/%i OOM\n", i, iov_count); 183 + goto out_errno; 184 + } 185 + iov[i].iov_base = d; 186 + iov[i].iov_len = iov_length; 187 + } 188 + 189 + msg.msg_iov = iov; 190 + msg.msg_iovlen = iov_count; 191 + 192 + if (tx) { 193 + clock_gettime(CLOCK_MONOTONIC, &s->start); 194 + for (i = 0; i < cnt; i++) { 195 + int sent = sendmsg(fd, &msg, flags); 196 + 197 + if (sent < 0) { 198 + perror("send loop error:"); 199 + goto out_errno; 200 + } 201 + s->bytes_sent += sent; 202 + } 203 + clock_gettime(CLOCK_MONOTONIC, &s->end); 204 + } else { 205 + int slct, recv, max_fd = fd; 206 + struct timeval timeout; 207 + float total_bytes; 208 + fd_set w; 209 + 210 + total_bytes = (float)iov_count * (float)iov_length * (float)cnt; 211 + err = clock_gettime(CLOCK_MONOTONIC, &s->start); 212 + if (err < 0) 213 + perror("recv start time: "); 214 + while (s->bytes_recvd < total_bytes) { 215 + timeout.tv_sec = 1; 216 + timeout.tv_usec = 0; 217 + 218 + /* FD sets */ 219 + FD_ZERO(&w); 220 + FD_SET(fd, &w); 221 + 222 + slct 
= select(max_fd + 1, &w, NULL, NULL, &timeout); 223 + if (slct == -1) { 224 + perror("select()"); 225 + clock_gettime(CLOCK_MONOTONIC, &s->end); 226 + goto out_errno; 227 + } else if (!slct) { 228 + fprintf(stderr, "unexpected timeout\n"); 229 + errno = -EIO; 230 + clock_gettime(CLOCK_MONOTONIC, &s->end); 231 + goto out_errno; 232 + } 233 + 234 + recv = recvmsg(fd, &msg, flags); 235 + if (recv < 0) { 236 + if (errno != EWOULDBLOCK) { 237 + clock_gettime(CLOCK_MONOTONIC, &s->end); 238 + perror("recv failed()\n"); 239 + goto out_errno; 240 + } 241 + } 242 + 243 + s->bytes_recvd += recv; 244 + } 245 + clock_gettime(CLOCK_MONOTONIC, &s->end); 246 + } 247 + 248 + for (i = 0; i < iov_count; i++) 249 + free(iov[i].iov_base); 250 + free(iov); 251 + return 0; 252 + out_errno: 253 + for (i = 0; i < iov_count; i++) 254 + free(iov[i].iov_base); 255 + free(iov); 256 + return errno; 257 + } 258 + 259 + static float giga = 1000000000; 260 + 261 + static inline float sentBps(struct msg_stats s) 262 + { 263 + return s.bytes_sent / (s.end.tv_sec - s.start.tv_sec); 264 + } 265 + 266 + static inline float recvdBps(struct msg_stats s) 267 + { 268 + return s.bytes_recvd / (s.end.tv_sec - s.start.tv_sec); 269 + } 270 + 271 + static int sendmsg_test(int iov_count, int iov_buf, int cnt, 272 + int verbose, bool base) 273 + { 274 + float sent_Bps = 0, recvd_Bps = 0; 275 + int rx_fd, txpid, rxpid, err = 0; 276 + struct msg_stats s = {0}; 277 + int status; 278 + 279 + errno = 0; 280 + 281 + if (base) 282 + rx_fd = p1; 283 + else 284 + rx_fd = p2; 285 + 286 + rxpid = fork(); 287 + if (rxpid == 0) { 288 + err = msg_loop(rx_fd, iov_count, iov_buf, cnt, &s, false); 289 + if (err) 290 + fprintf(stderr, 291 + "msg_loop_rx: iov_count %i iov_buf %i cnt %i err %i\n", 292 + iov_count, iov_buf, cnt, err); 293 + shutdown(p2, SHUT_RDWR); 294 + shutdown(p1, SHUT_RDWR); 295 + if (s.end.tv_sec - s.start.tv_sec) { 296 + sent_Bps = sentBps(s); 297 + recvd_Bps = recvdBps(s); 298 + } 299 + fprintf(stdout, 300 + 
"rx_sendmsg: TX: %zuB %fB/s %fGB/s RX: %zuB %fB/s %fGB/s\n", 301 + s.bytes_sent, sent_Bps, sent_Bps/giga, 302 + s.bytes_recvd, recvd_Bps, recvd_Bps/giga); 303 + exit(1); 304 + } else if (rxpid == -1) { 305 + perror("msg_loop_rx: "); 306 + return errno; 307 + } 308 + 309 + txpid = fork(); 310 + if (txpid == 0) { 311 + err = msg_loop(c1, iov_count, iov_buf, cnt, &s, true); 312 + if (err) 313 + fprintf(stderr, 314 + "msg_loop_tx: iov_count %i iov_buf %i cnt %i err %i\n", 315 + iov_count, iov_buf, cnt, err); 316 + shutdown(c1, SHUT_RDWR); 317 + if (s.end.tv_sec - s.start.tv_sec) { 318 + sent_Bps = sentBps(s); 319 + recvd_Bps = recvdBps(s); 320 + } 321 + fprintf(stdout, 322 + "tx_sendmsg: TX: %zuB %fB/s %f GB/s RX: %zuB %fB/s %fGB/s\n", 323 + s.bytes_sent, sent_Bps, sent_Bps/giga, 324 + s.bytes_recvd, recvd_Bps, recvd_Bps/giga); 325 + exit(1); 326 + } else if (txpid == -1) { 327 + perror("msg_loop_tx: "); 328 + return errno; 329 + } 330 + 331 + assert(waitpid(rxpid, &status, 0) == rxpid); 332 + assert(waitpid(txpid, &status, 0) == txpid); 333 + return err; 334 + } 335 + 336 + static int forever_ping_pong(int rate, int verbose) 337 + { 338 + struct timeval timeout; 339 + char buf[1024] = {0}; 340 + int sc; 341 + 342 + timeout.tv_sec = 10; 343 + timeout.tv_usec = 0; 190 344 191 345 /* Ping/Pong data from client to server */ 192 346 sc = send(c1, buf, sizeof(buf), 0); 193 347 if (sc < 0) { 194 348 perror("send failed()\n"); 195 - goto out; 349 + return sc; 196 350 } 197 351 198 352 do { 199 - int s, rc, i; 353 + int s, rc, i, max_fd = p2; 354 + fd_set w; 200 355 201 356 /* FD sets */ 202 357 FD_ZERO(&w); ··· 410 193 if (rc < 0) { 411 194 if (errno != EWOULDBLOCK) { 412 195 perror("recv failed()\n"); 413 - break; 196 + return rc; 414 197 } 415 198 } 416 199 ··· 422 205 sc = send(i, buf, rc, 0); 423 206 if (sc < 0) { 424 207 perror("send failed()\n"); 425 - break; 208 + return sc; 426 209 } 427 210 } 428 - sleep(rate); 429 - if (dot) { 211 + 212 + if (rate) 213 + 
sleep(rate); 214 + 215 + if (verbose) { 430 216 printf("."); 431 217 fflush(stdout); 432 218 433 219 } 434 220 } while (running); 435 221 436 - out: 437 - close(s1); 438 - close(s2); 439 - close(p1); 440 - close(p2); 441 - close(c1); 442 - close(c2); 443 - return err; 222 + return 0; 444 223 } 224 + 225 + enum { 226 + PING_PONG, 227 + SENDMSG, 228 + BASE, 229 + }; 445 230 446 231 int main(int argc, char **argv) 447 232 { 448 - int rate = 1, dot = 1; 233 + int iov_count = 1, length = 1024, rate = 1, verbose = 0; 234 + struct rlimit r = {10 * 1024 * 1024, RLIM_INFINITY}; 235 + int opt, longindex, err, cg_fd = 0; 236 + int test = PING_PONG; 449 237 char filename[256]; 450 - int err, cg_fd; 451 - char *cg_path; 452 238 453 - cg_path = argv[argc - 1]; 239 + while ((opt = getopt_long(argc, argv, "hvc:r:i:l:t:", 240 + long_options, &longindex)) != -1) { 241 + switch (opt) { 242 + /* Cgroup configuration */ 243 + case 'c': 244 + cg_fd = open(optarg, O_DIRECTORY, O_RDONLY); 245 + if (cg_fd < 0) { 246 + fprintf(stderr, 247 + "ERROR: (%i) open cg path failed: %s\n", 248 + cg_fd, optarg); 249 + return cg_fd; 250 + } 251 + break; 252 + case 'r': 253 + rate = atoi(optarg); 254 + break; 255 + case 'v': 256 + verbose = 1; 257 + break; 258 + case 'i': 259 + iov_count = atoi(optarg); 260 + break; 261 + case 'l': 262 + length = atoi(optarg); 263 + break; 264 + case 't': 265 + if (strcmp(optarg, "ping") == 0) { 266 + test = PING_PONG; 267 + } else if (strcmp(optarg, "sendmsg") == 0) { 268 + test = SENDMSG; 269 + } else if (strcmp(optarg, "base") == 0) { 270 + test = BASE; 271 + } else { 272 + usage(argv); 273 + return -1; 274 + } 275 + break; 276 + case 'h': 277 + default: 278 + usage(argv); 279 + return -1; 280 + } 281 + } 282 + 283 + if (!cg_fd) { 284 + fprintf(stderr, "%s requires cgroup option: --cgroup <path>\n", 285 + argv[0]); 286 + return -1; 287 + } 288 + 289 + if (setrlimit(RLIMIT_MEMLOCK, &r)) { 290 + perror("setrlimit(RLIMIT_MEMLOCK)"); 291 + return 1; 292 + } 293 + 454 
294 snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); 455 295 456 296 running = 1; ··· 515 241 /* catch SIGINT */ 516 242 signal(SIGINT, running_handler); 517 243 244 + /* If base test skip BPF setup */ 245 + if (test == BASE) 246 + goto run; 247 + 518 248 if (load_bpf_file(filename)) { 519 249 fprintf(stderr, "load_bpf_file: (%s) %s\n", 520 250 filename, strerror(errno)); 521 251 return 1; 522 - } 523 - 524 - /* Cgroup configuration */ 525 - cg_fd = open(cg_path, O_DIRECTORY, O_RDONLY); 526 - if (cg_fd < 0) { 527 - fprintf(stderr, "ERROR: (%i) open cg path failed: %s\n", 528 - cg_fd, cg_path); 529 - return cg_fd; 530 252 } 531 253 532 254 /* Attach programs to sockmap */ ··· 550 280 return err; 551 281 } 552 282 553 - err = sockmap_test_sockets(rate, dot); 283 + run: 284 + err = sockmap_init_sockets(); 554 285 if (err) { 555 286 fprintf(stderr, "ERROR: test socket failed: %d\n", err); 556 - return err; 287 + goto out; 557 288 } 558 - return 0; 289 + 290 + if (test == PING_PONG) 291 + err = forever_ping_pong(rate, verbose); 292 + else if (test == SENDMSG) 293 + err = sendmsg_test(iov_count, length, rate, verbose, false); 294 + else if (test == BASE) 295 + err = sendmsg_test(iov_count, length, rate, verbose, true); 296 + else 297 + fprintf(stderr, "unknown test\n"); 298 + out: 299 + close(s1); 300 + close(s2); 301 + close(p1); 302 + close(p2); 303 + close(c1); 304 + close(c2); 305 + close(cg_fd); 306 + return err; 559 307 } 560 308 561 309 void running_handler(int a)
+82 -4
tools/include/uapi/linux/bpf.h
··· 17 17 #define BPF_ALU64 0x07 /* alu mode in double word width */ 18 18 19 19 /* ld/ldx fields */ 20 - #define BPF_DW 0x18 /* double word */ 20 + #define BPF_DW 0x18 /* double word (64-bit) */ 21 21 #define BPF_XADD 0xc0 /* exclusive add */ 22 22 23 23 /* alu/jmp fields */ ··· 642 642 * @optlen: length of optval in bytes 643 643 * Return: 0 or negative error 644 644 * 645 + * int bpf_sock_ops_cb_flags_set(bpf_sock_ops, flags) 646 + * Set callback flags for sock_ops 647 + * @bpf_sock_ops: pointer to bpf_sock_ops_kern struct 648 + * @flags: flags value 649 + * Return: 0 for no error 650 + * -EINVAL if there is no full tcp socket 651 + * bits in flags that are not supported by current kernel 652 + * 645 653 * int bpf_skb_adjust_room(skb, len_diff, mode, flags) 646 654 * Grow or shrink room in sk_buff. 647 655 * @skb: pointer to skb ··· 756 748 FN(perf_event_read_value), \ 757 749 FN(perf_prog_read_value), \ 758 750 FN(getsockopt), \ 759 - FN(override_return), 751 + FN(override_return), \ 752 + FN(sock_ops_cb_flags_set), 760 753 761 754 /* integer value in 'imm' field of BPF_CALL instruction selects which helper 762 755 * function eBPF program intends to call ··· 961 952 struct bpf_sock_ops { 962 953 __u32 op; 963 954 union { 964 - __u32 reply; 965 - __u32 replylong[4]; 955 + __u32 args[4]; /* Optionally passed to bpf program */ 956 + __u32 reply; /* Returned by bpf program */ 957 + __u32 replylong[4]; /* Optionally returned by bpf prog */ 966 958 }; 967 959 __u32 family; 968 960 __u32 remote_ip4; /* Stored in network byte order */ ··· 978 968 */ 979 969 __u32 snd_cwnd; 980 970 __u32 srtt_us; /* Averaged RTT << 3 in usecs */ 971 + __u32 bpf_sock_ops_cb_flags; /* flags defined in uapi/linux/tcp.h */ 972 + __u32 state; 973 + __u32 rtt_min; 974 + __u32 snd_ssthresh; 975 + __u32 rcv_nxt; 976 + __u32 snd_nxt; 977 + __u32 snd_una; 978 + __u32 mss_cache; 979 + __u32 ecn_flags; 980 + __u32 rate_delivered; 981 + __u32 rate_interval_us; 982 + __u32 packets_out; 983 + __u32 
retrans_out; 984 + __u32 total_retrans; 985 + __u32 segs_in; 986 + __u32 data_segs_in; 987 + __u32 segs_out; 988 + __u32 data_segs_out; 989 + __u32 lost_out; 990 + __u32 sacked_out; 991 + __u32 sk_txhash; 992 + __u64 bytes_received; 993 + __u64 bytes_acked; 981 994 }; 995 + 996 + /* Definitions for bpf_sock_ops_cb_flags */ 997 + #define BPF_SOCK_OPS_RTO_CB_FLAG (1<<0) 998 + #define BPF_SOCK_OPS_RETRANS_CB_FLAG (1<<1) 999 + #define BPF_SOCK_OPS_STATE_CB_FLAG (1<<2) 1000 + #define BPF_SOCK_OPS_ALL_CB_FLAGS 0x7 /* Mask of all currently 1001 + * supported cb flags 1002 + */ 982 1003 983 1004 /* List of known BPF sock_ops operators. 984 1005 * New entries can only be added at the end ··· 1044 1003 * a congestion threshold. RTTs above 1045 1004 * this indicate congestion 1046 1005 */ 1006 + BPF_SOCK_OPS_RTO_CB, /* Called when an RTO has triggered. 1007 + * Arg1: value of icsk_retransmits 1008 + * Arg2: value of icsk_rto 1009 + * Arg3: whether RTO has expired 1010 + */ 1011 + BPF_SOCK_OPS_RETRANS_CB, /* Called when skb is retransmitted. 1012 + * Arg1: sequence number of 1st byte 1013 + * Arg2: # segments 1014 + * Arg3: return value of 1015 + * tcp_transmit_skb (0 => success) 1016 + */ 1017 + BPF_SOCK_OPS_STATE_CB, /* Called when TCP changes state. 1018 + * Arg1: old_state 1019 + * Arg2: new_state 1020 + */ 1021 + }; 1022 + 1023 + /* List of TCP states. There is a build check in net/ipv4/tcp.c to detect 1024 + * changes between the TCP and BPF versions. Ideally this should never happen. 1025 + * If it does, we need to add code to convert them before calling 1026 + * the BPF sock_ops function. 
1027 + */ 1028 + enum { 1029 + BPF_TCP_ESTABLISHED = 1, 1030 + BPF_TCP_SYN_SENT, 1031 + BPF_TCP_SYN_RECV, 1032 + BPF_TCP_FIN_WAIT1, 1033 + BPF_TCP_FIN_WAIT2, 1034 + BPF_TCP_TIME_WAIT, 1035 + BPF_TCP_CLOSE, 1036 + BPF_TCP_CLOSE_WAIT, 1037 + BPF_TCP_LAST_ACK, 1038 + BPF_TCP_LISTEN, 1039 + BPF_TCP_CLOSING, /* Now a valid state */ 1040 + BPF_TCP_NEW_SYN_RECV, 1041 + 1042 + BPF_TCP_MAX_STATES /* Leave at the end! */ 1047 1043 }; 1048 1044 1049 1045 #define TCP_BPF_IW 1001 /* Set TCP initial congestion window */
+3 -3
tools/testing/selftests/bpf/Makefile
··· 11 11 endif 12 12 13 13 CFLAGS += -Wall -O2 -I$(APIDIR) -I$(LIBDIR) -I$(GENDIR) $(GENFLAGS) -I../../../include 14 - LDLIBS += -lcap -lelf -lrt 14 + LDLIBS += -lcap -lelf -lrt -lpthread 15 15 16 16 TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test_progs \ 17 - test_align test_verifier_log test_dev_cgroup 17 + test_align test_verifier_log test_dev_cgroup test_tcpbpf_user 18 18 19 19 TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o test_obj_id.o \ 20 20 test_pkt_md_access.o test_xdp_redirect.o test_xdp_meta.o sockmap_parse_prog.o \ 21 21 sockmap_verdict_prog.o dev_cgroup.o sample_ret0.o test_tracepoint.o \ 22 22 test_l4lb_noinline.o test_xdp_noinline.o test_stacktrace_map.o \ 23 - sample_map_ret0.o 23 + sample_map_ret0.o test_tcpbpf_kern.o 24 24 25 25 TEST_PROGS := test_kmod.sh test_xdp_redirect.sh test_xdp_meta.sh \ 26 26 test_offload.py
+2
tools/testing/selftests/bpf/bpf_helpers.h
··· 71 71 static int (*bpf_getsockopt)(void *ctx, int level, int optname, void *optval, 72 72 int optlen) = 73 73 (void *) BPF_FUNC_getsockopt; 74 + static int (*bpf_sock_ops_cb_flags_set)(void *ctx, int flags) = 75 + (void *) BPF_FUNC_sock_ops_cb_flags_set; 74 76 static int (*bpf_sk_redirect_map)(void *ctx, void *map, int key, int flags) = 75 77 (void *) BPF_FUNC_sk_redirect_map; 76 78 static int (*bpf_sock_map_update)(void *map, void *key, void *value,
+51
tools/testing/selftests/bpf/tcp_client.py
··· 1 + #!/usr/bin/env python2 2 + # 3 + # SPDX-License-Identifier: GPL-2.0 4 + # 5 + 6 + import sys, os, os.path, getopt 7 + import socket, time 8 + import subprocess 9 + import select 10 + 11 + def read(sock, n): 12 + buf = '' 13 + while len(buf) < n: 14 + rem = n - len(buf) 15 + try: s = sock.recv(rem) 16 + except (socket.error), e: return '' 17 + buf += s 18 + return buf 19 + 20 + def send(sock, s): 21 + total = len(s) 22 + count = 0 23 + while count < total: 24 + try: n = sock.send(s) 25 + except (socket.error), e: n = 0 26 + if n == 0: 27 + return count; 28 + count += n 29 + return count 30 + 31 + 32 + serverPort = int(sys.argv[1]) 33 + HostName = socket.gethostname() 34 + 35 + # create active socket 36 + sock = socket.socket(socket.AF_INET6, socket.SOCK_STREAM) 37 + try: 38 + sock.connect((HostName, serverPort)) 39 + except socket.error as e: 40 + sys.exit(1) 41 + 42 + buf = '' 43 + n = 0 44 + while n < 1000: 45 + buf += '+' 46 + n += 1 47 + 48 + sock.settimeout(1); 49 + n = send(sock, buf) 50 + n = read(sock, 500) 51 + sys.exit(0)
+83
tools/testing/selftests/bpf/tcp_server.py
··· 1 + #!/usr/bin/env python2 2 + # 3 + # SPDX-License-Identifier: GPL-2.0 4 + # 5 + 6 + import sys, os, os.path, getopt 7 + import socket, time 8 + import subprocess 9 + import select 10 + 11 + def read(sock, n): 12 + buf = '' 13 + while len(buf) < n: 14 + rem = n - len(buf) 15 + try: s = sock.recv(rem) 16 + except (socket.error), e: return '' 17 + buf += s 18 + return buf 19 + 20 + def send(sock, s): 21 + total = len(s) 22 + count = 0 23 + while count < total: 24 + try: n = sock.send(s) 25 + except (socket.error), e: n = 0 26 + if n == 0: 27 + return count; 28 + count += n 29 + return count 30 + 31 + 32 + SERVER_PORT = 12877 33 + MAX_PORTS = 2 34 + 35 + serverPort = SERVER_PORT 36 + serverSocket = None 37 + 38 + HostName = socket.gethostname() 39 + 40 + # create passive socket 41 + serverSocket = socket.socket(socket.AF_INET6, socket.SOCK_STREAM) 42 + host = socket.gethostname() 43 + 44 + try: serverSocket.bind((host, 0)) 45 + except socket.error as msg: 46 + print 'bind fails: ', msg 47 + 48 + sn = serverSocket.getsockname() 49 + serverPort = sn[1] 50 + 51 + cmdStr = ("./tcp_client.py %d &") % (serverPort) 52 + os.system(cmdStr) 53 + 54 + buf = '' 55 + n = 0 56 + while n < 500: 57 + buf += '.' 58 + n += 1 59 + 60 + serverSocket.listen(MAX_PORTS) 61 + readList = [serverSocket] 62 + 63 + while True: 64 + readyRead, readyWrite, inError = \ 65 + select.select(readList, [], [], 2) 66 + 67 + if len(readyRead) > 0: 68 + waitCount = 0 69 + for sock in readyRead: 70 + if sock == serverSocket: 71 + (clientSocket, address) = serverSocket.accept() 72 + address = str(address[0]) 73 + readList.append(clientSocket) 74 + else: 75 + sock.settimeout(1); 76 + s = read(sock, 1000) 77 + n = send(sock, buf) 78 + sock.close() 79 + serverSocket.close() 80 + sys.exit(0) 81 + else: 82 + print 'Select timeout!' 83 + sys.exit(1)
+23 -7
tools/testing/selftests/bpf/test_align.c
··· 446 446 .insns = { 447 447 PREP_PKT_POINTERS, 448 448 BPF_MOV64_IMM(BPF_REG_0, 0), 449 - /* ptr & const => unknown & const */ 450 - BPF_MOV64_REG(BPF_REG_5, BPF_REG_2), 451 - BPF_ALU64_IMM(BPF_AND, BPF_REG_5, 0x40), 452 - /* ptr << const => unknown << const */ 453 - BPF_MOV64_REG(BPF_REG_5, BPF_REG_2), 449 + /* (ptr - ptr) << 2 */ 450 + BPF_MOV64_REG(BPF_REG_5, BPF_REG_3), 451 + BPF_ALU64_REG(BPF_SUB, BPF_REG_5, BPF_REG_2), 454 452 BPF_ALU64_IMM(BPF_LSH, BPF_REG_5, 2), 455 453 /* We have a (4n) value. Let's make a packet offset 456 454 * out of it. First add 14, to make it a (4n+2) ··· 471 473 .prog_type = BPF_PROG_TYPE_SCHED_CLS, 472 474 .result = REJECT, 473 475 .matches = { 474 - {4, "R5_w=pkt(id=0,off=0,r=0,imm=0)"}, 475 - /* R5 bitwise operator &= on pointer prohibited */ 476 + {4, "R5_w=pkt_end(id=0,off=0,imm=0)"}, 477 + /* (ptr - ptr) << 2 == unknown, (4n) */ 478 + {6, "R5_w=inv(id=0,smax_value=9223372036854775804,umax_value=18446744073709551612,var_off=(0x0; 0xfffffffffffffffc))"}, 479 + /* (4n) + 14 == (4n+2). We blow our bounds, because 480 + * the add could overflow. 481 + */ 482 + {7, "R5=inv(id=0,var_off=(0x2; 0xfffffffffffffffc))"}, 483 + /* Checked s>=0 */ 484 + {9, "R5=inv(id=0,umin_value=2,umax_value=9223372036854775806,var_off=(0x2; 0x7ffffffffffffffc))"}, 485 + /* packet pointer + nonnegative (4n+2) */ 486 + {11, "R6_w=pkt(id=1,off=0,r=0,umin_value=2,umax_value=9223372036854775806,var_off=(0x2; 0x7ffffffffffffffc))"}, 487 + {13, "R4=pkt(id=1,off=4,r=0,umin_value=2,umax_value=9223372036854775806,var_off=(0x2; 0x7ffffffffffffffc))"}, 488 + /* NET_IP_ALIGN + (4n+2) == (4n), alignment is fine. 489 + * We checked the bounds, but it might have been able 490 + * to overflow if the packet pointer started in the 491 + * upper half of the address space. 492 + * So we did not get a 'range' on R6, and the access 493 + * attempt will fail. 
494 + */ 495 + {15, "R6=pkt(id=1,off=0,r=0,umin_value=2,umax_value=9223372036854775806,var_off=(0x2; 0x7ffffffffffffffc))"}, 476 496 } 477 497 }, 478 498 {
+1 -1
tools/testing/selftests/bpf/test_dev_cgroup.c
··· 21 21 22 22 #define DEV_CGROUP_PROG "./dev_cgroup.o" 23 23 24 - #define TEST_CGROUP "test-bpf-based-device-cgroup/" 24 + #define TEST_CGROUP "/test-bpf-based-device-cgroup/" 25 25 26 26 int main(int argc, char **argv) 27 27 {
+95
tools/testing/selftests/bpf/test_lpm_map.c
··· 14 14 #include <errno.h> 15 15 #include <inttypes.h> 16 16 #include <linux/bpf.h> 17 + #include <pthread.h> 17 18 #include <stdio.h> 18 19 #include <stdlib.h> 19 20 #include <string.h> ··· 642 641 close(map_fd); 643 642 } 644 643 644 + #define MAX_TEST_KEYS 4 645 + struct lpm_mt_test_info { 646 + int cmd; /* 0: update, 1: delete, 2: lookup, 3: get_next_key */ 647 + int iter; 648 + int map_fd; 649 + struct { 650 + __u32 prefixlen; 651 + __u32 data; 652 + } key[MAX_TEST_KEYS]; 653 + }; 654 + 655 + static void *lpm_test_command(void *arg) 656 + { 657 + int i, j, ret, iter, key_size; 658 + struct lpm_mt_test_info *info = arg; 659 + struct bpf_lpm_trie_key *key_p; 660 + 661 + key_size = sizeof(struct bpf_lpm_trie_key) + sizeof(__u32); 662 + key_p = alloca(key_size); 663 + for (iter = 0; iter < info->iter; iter++) 664 + for (i = 0; i < MAX_TEST_KEYS; i++) { 665 + /* first half of iterations in forward order, 666 + * and second half in backward order. 667 + */ 668 + j = (iter < (info->iter / 2)) ? 
i : MAX_TEST_KEYS - i - 1; 669 + key_p->prefixlen = info->key[j].prefixlen; 670 + memcpy(key_p->data, &info->key[j].data, sizeof(__u32)); 671 + if (info->cmd == 0) { 672 + __u32 value = j; 673 + /* update must succeed */ 674 + assert(bpf_map_update_elem(info->map_fd, key_p, &value, 0) == 0); 675 + } else if (info->cmd == 1) { 676 + ret = bpf_map_delete_elem(info->map_fd, key_p); 677 + assert(ret == 0 || errno == ENOENT); 678 + } else if (info->cmd == 2) { 679 + __u32 value; 680 + ret = bpf_map_lookup_elem(info->map_fd, key_p, &value); 681 + assert(ret == 0 || errno == ENOENT); 682 + } else { 683 + struct bpf_lpm_trie_key *next_key_p = alloca(key_size); 684 + ret = bpf_map_get_next_key(info->map_fd, key_p, next_key_p); 685 + assert(ret == 0 || errno == ENOENT || errno == ENOMEM); 686 + } 687 + } 688 + 689 + // Pass successful exit info back to the main thread 690 + pthread_exit((void *)info); 691 + } 692 + 693 + static void setup_lpm_mt_test_info(struct lpm_mt_test_info *info, int map_fd) 694 + { 695 + info->iter = 2000; 696 + info->map_fd = map_fd; 697 + info->key[0].prefixlen = 16; 698 + inet_pton(AF_INET, "192.168.0.0", &info->key[0].data); 699 + info->key[1].prefixlen = 24; 700 + inet_pton(AF_INET, "192.168.0.0", &info->key[1].data); 701 + info->key[2].prefixlen = 24; 702 + inet_pton(AF_INET, "192.168.128.0", &info->key[2].data); 703 + info->key[3].prefixlen = 24; 704 + inet_pton(AF_INET, "192.168.1.0", &info->key[3].data); 705 + } 706 + 707 + static void test_lpm_multi_thread(void) 708 + { 709 + struct lpm_mt_test_info info[4]; 710 + size_t key_size, value_size; 711 + pthread_t thread_id[4]; 712 + int i, map_fd; 713 + void *ret; 714 + 715 + /* create a trie */ 716 + value_size = sizeof(__u32); 717 + key_size = sizeof(struct bpf_lpm_trie_key) + value_size; 718 + map_fd = bpf_create_map(BPF_MAP_TYPE_LPM_TRIE, key_size, value_size, 719 + 100, BPF_F_NO_PREALLOC); 720 + 721 + /* create 4 threads to test update, delete, lookup and get_next_key */ 722 + 
setup_lpm_mt_test_info(&info[0], map_fd); 723 + for (i = 0; i < 4; i++) { 724 + if (i != 0) 725 + memcpy(&info[i], &info[0], sizeof(info[i])); 726 + info[i].cmd = i; 727 + assert(pthread_create(&thread_id[i], NULL, &lpm_test_command, &info[i]) == 0); 728 + } 729 + 730 + for (i = 0; i < 4; i++) 731 + assert(pthread_join(thread_id[i], &ret) == 0 && ret == (void *)&info[i]); 732 + 733 + close(map_fd); 734 + } 735 + 645 736 int main(void) 646 737 { 647 738 struct rlimit limit = { RLIM_INFINITY, RLIM_INFINITY }; ··· 759 666 test_lpm_delete(); 760 667 761 668 test_lpm_get_next_key(); 669 + 670 + test_lpm_multi_thread(); 762 671 763 672 printf("test_lpm: OK\n"); 764 673 return 0;
+22 -10
tools/testing/selftests/bpf/test_maps.c
··· 242 242 243 243 static void test_hashmap_walk(int task, void *data) 244 244 { 245 - int fd, i, max_entries = 100000; 245 + int fd, i, max_entries = 1000; 246 246 long long key, value, next_key; 247 247 bool next_key_valid = true; 248 248 ··· 463 463 #define SOCKMAP_VERDICT_PROG "./sockmap_verdict_prog.o" 464 464 static void test_sockmap(int tasks, void *data) 465 465 { 466 - int one = 1, map_fd_rx, map_fd_tx, map_fd_break, s, sc, rc; 466 + int one = 1, map_fd_rx = 0, map_fd_tx = 0, map_fd_break, s, sc, rc; 467 467 struct bpf_map *bpf_map_rx, *bpf_map_tx, *bpf_map_break; 468 468 int ports[] = {50200, 50201, 50202, 50204}; 469 469 int err, i, fd, udp, sfd[6] = {0xdeadbeef}; ··· 868 868 goto out_sockmap; 869 869 } 870 870 871 - /* Test map close sockets */ 872 - for (i = 0; i < 6; i++) 871 + /* Test map close sockets and empty maps */ 872 + for (i = 0; i < 6; i++) { 873 + bpf_map_delete_elem(map_fd_tx, &i); 874 + bpf_map_delete_elem(map_fd_rx, &i); 873 875 close(sfd[i]); 876 + } 874 877 close(fd); 875 878 close(map_fd_rx); 876 879 bpf_object__close(obj); ··· 884 881 printf("Failed to create sockmap '%i:%s'!\n", i, strerror(errno)); 885 882 exit(1); 886 883 out_sockmap: 887 - for (i = 0; i < 6; i++) 884 + for (i = 0; i < 6; i++) { 885 + if (map_fd_tx) 886 + bpf_map_delete_elem(map_fd_tx, &i); 887 + if (map_fd_rx) 888 + bpf_map_delete_elem(map_fd_rx, &i); 888 889 close(sfd[i]); 890 + } 889 891 close(fd); 890 892 exit(1); 891 893 } ··· 939 931 close(fd); 940 932 } 941 933 942 - static void run_parallel(int tasks, void (*fn)(int task, void *data), 943 - void *data) 934 + #define run_parallel(N, FN, DATA) \ 935 + printf("Fork %d tasks to '" #FN "'\n", N); \ 936 + __run_parallel(N, FN, DATA) 937 + 938 + static void __run_parallel(int tasks, void (*fn)(int task, void *data), 939 + void *data) 944 940 { 945 941 pid_t pid[tasks]; 946 942 int i; ··· 984 972 #define DO_UPDATE 1 985 973 #define DO_DELETE 0 986 974 987 - static void do_work(int fn, void *data) 975 + static 
void test_update_delete(int fn, void *data) 988 976 { 989 977 int do_update = ((int *)data)[1]; 990 978 int fd = ((int *)data)[0]; ··· 1024 1012 */ 1025 1013 data[0] = fd; 1026 1014 data[1] = DO_UPDATE; 1027 - run_parallel(TASKS, do_work, data); 1015 + run_parallel(TASKS, test_update_delete, data); 1028 1016 1029 1017 /* Check that key=0 is already there. */ 1030 1018 assert(bpf_map_update_elem(fd, &key, &value, BPF_NOEXIST) == -1 && ··· 1047 1035 1048 1036 /* Now let's delete all elemenets in parallel. */ 1049 1037 data[1] = DO_DELETE; 1050 - run_parallel(TASKS, do_work, data); 1038 + run_parallel(TASKS, test_update_delete, data); 1051 1039 1052 1040 /* Nothing should be left. */ 1053 1041 key = -1;
+16
tools/testing/selftests/bpf/test_tcpbpf.h
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + 3 + #ifndef _TEST_TCPBPF_H 4 + #define _TEST_TCPBPF_H 5 + 6 + struct tcpbpf_globals { 7 + __u32 event_map; 8 + __u32 total_retrans; 9 + __u32 data_segs_in; 10 + __u32 data_segs_out; 11 + __u32 bad_cb_test_rv; 12 + __u32 good_cb_test_rv; 13 + __u64 bytes_received; 14 + __u64 bytes_acked; 15 + }; 16 + #endif
+115
tools/testing/selftests/bpf/test_tcpbpf_kern.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + #include <stddef.h> 3 + #include <string.h> 4 + #include <linux/bpf.h> 5 + #include <linux/if_ether.h> 6 + #include <linux/if_packet.h> 7 + #include <linux/ip.h> 8 + #include <linux/in6.h> 9 + #include <linux/types.h> 10 + #include <linux/socket.h> 11 + #include <linux/tcp.h> 12 + #include <netinet/in.h> 13 + #include "bpf_helpers.h" 14 + #include "bpf_endian.h" 15 + #include "test_tcpbpf.h" 16 + 17 + struct bpf_map_def SEC("maps") global_map = { 18 + .type = BPF_MAP_TYPE_ARRAY, 19 + .key_size = sizeof(__u32), 20 + .value_size = sizeof(struct tcpbpf_globals), 21 + .max_entries = 2, 22 + }; 23 + 24 + static inline void update_event_map(int event) 25 + { 26 + __u32 key = 0; 27 + struct tcpbpf_globals g, *gp; 28 + 29 + gp = bpf_map_lookup_elem(&global_map, &key); 30 + if (gp == NULL) { 31 + struct tcpbpf_globals g = {0}; 32 + 33 + g.event_map |= (1 << event); 34 + bpf_map_update_elem(&global_map, &key, &g, 35 + BPF_ANY); 36 + } else { 37 + g = *gp; 38 + g.event_map |= (1 << event); 39 + bpf_map_update_elem(&global_map, &key, &g, 40 + BPF_ANY); 41 + } 42 + } 43 + 44 + int _version SEC("version") = 1; 45 + 46 + SEC("sockops") 47 + int bpf_testcb(struct bpf_sock_ops *skops) 48 + { 49 + int rv = -1; 50 + int bad_call_rv = 0; 51 + int good_call_rv = 0; 52 + int op; 53 + int v = 0; 54 + 55 + op = (int) skops->op; 56 + 57 + update_event_map(op); 58 + 59 + switch (op) { 60 + case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB: 61 + /* Test failure to set largest cb flag (assumes not defined) */ 62 + bad_call_rv = bpf_sock_ops_cb_flags_set(skops, 0x80); 63 + /* Set callback */ 64 + good_call_rv = bpf_sock_ops_cb_flags_set(skops, 65 + BPF_SOCK_OPS_STATE_CB_FLAG); 66 + /* Update results */ 67 + { 68 + __u32 key = 0; 69 + struct tcpbpf_globals g, *gp; 70 + 71 + gp = bpf_map_lookup_elem(&global_map, &key); 72 + if (!gp) 73 + break; 74 + g = *gp; 75 + g.bad_cb_test_rv = bad_call_rv; 76 + g.good_cb_test_rv = good_call_rv; 77 + 
bpf_map_update_elem(&global_map, &key, &g, 78 + BPF_ANY); 79 + } 80 + break; 81 + case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB: 82 + skops->sk_txhash = 0x12345f; 83 + v = 0xff; 84 + rv = bpf_setsockopt(skops, SOL_IPV6, IPV6_TCLASS, &v, 85 + sizeof(v)); 86 + break; 87 + case BPF_SOCK_OPS_RTO_CB: 88 + break; 89 + case BPF_SOCK_OPS_RETRANS_CB: 90 + break; 91 + case BPF_SOCK_OPS_STATE_CB: 92 + if (skops->args[1] == BPF_TCP_CLOSE) { 93 + __u32 key = 0; 94 + struct tcpbpf_globals g, *gp; 95 + 96 + gp = bpf_map_lookup_elem(&global_map, &key); 97 + if (!gp) 98 + break; 99 + g = *gp; 100 + g.total_retrans = skops->total_retrans; 101 + g.data_segs_in = skops->data_segs_in; 102 + g.data_segs_out = skops->data_segs_out; 103 + g.bytes_received = skops->bytes_received; 104 + g.bytes_acked = skops->bytes_acked; 105 + bpf_map_update_elem(&global_map, &key, &g, 106 + BPF_ANY); 107 + } 108 + break; 109 + default: 110 + rv = -1; 111 + } 112 + skops->reply = rv; 113 + return 1; 114 + } 115 + char _license[] SEC("license") = "GPL";
+126
tools/testing/selftests/bpf/test_tcpbpf_user.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + #include <stdio.h> 3 + #include <stdlib.h> 4 + #include <stdio.h> 5 + #include <unistd.h> 6 + #include <errno.h> 7 + #include <signal.h> 8 + #include <string.h> 9 + #include <assert.h> 10 + #include <linux/perf_event.h> 11 + #include <linux/ptrace.h> 12 + #include <linux/bpf.h> 13 + #include <sys/ioctl.h> 14 + #include <sys/types.h> 15 + #include <sys/stat.h> 16 + #include <fcntl.h> 17 + #include <bpf/bpf.h> 18 + #include <bpf/libbpf.h> 19 + #include "bpf_util.h" 20 + #include <linux/perf_event.h> 21 + #include "test_tcpbpf.h" 22 + 23 + static int bpf_find_map(const char *test, struct bpf_object *obj, 24 + const char *name) 25 + { 26 + struct bpf_map *map; 27 + 28 + map = bpf_object__find_map_by_name(obj, name); 29 + if (!map) { 30 + printf("%s:FAIL:map '%s' not found\n", test, name); 31 + return -1; 32 + } 33 + return bpf_map__fd(map); 34 + } 35 + 36 + #define SYSTEM(CMD) \ 37 + do { \ 38 + if (system(CMD)) { \ 39 + printf("system(%s) FAILS!\n", CMD); \ 40 + } \ 41 + } while (0) 42 + 43 + int main(int argc, char **argv) 44 + { 45 + const char *file = "test_tcpbpf_kern.o"; 46 + struct tcpbpf_globals g = {0}; 47 + int cg_fd, prog_fd, map_fd; 48 + bool debug_flag = false; 49 + int error = EXIT_FAILURE; 50 + struct bpf_object *obj; 51 + char cmd[100], *dir; 52 + struct stat buffer; 53 + __u32 key = 0; 54 + int pid; 55 + int rv; 56 + 57 + if (argc > 1 && strcmp(argv[1], "-d") == 0) 58 + debug_flag = true; 59 + 60 + dir = "/tmp/cgroupv2/foo"; 61 + 62 + if (stat(dir, &buffer) != 0) { 63 + SYSTEM("mkdir -p /tmp/cgroupv2"); 64 + SYSTEM("mount -t cgroup2 none /tmp/cgroupv2"); 65 + SYSTEM("mkdir -p /tmp/cgroupv2/foo"); 66 + } 67 + pid = (int) getpid(); 68 + sprintf(cmd, "echo %d >> /tmp/cgroupv2/foo/cgroup.procs", pid); 69 + SYSTEM(cmd); 70 + 71 + cg_fd = open(dir, O_DIRECTORY, O_RDONLY); 72 + if (bpf_prog_load(file, BPF_PROG_TYPE_SOCK_OPS, &obj, &prog_fd)) { 73 + printf("FAILED: load_bpf_file failed for: %s\n", file); 74 + 
goto err; 75 + } 76 + 77 + rv = bpf_prog_attach(prog_fd, cg_fd, BPF_CGROUP_SOCK_OPS, 0); 78 + if (rv) { 79 + printf("FAILED: bpf_prog_attach: %d (%s)\n", 80 + error, strerror(errno)); 81 + goto err; 82 + } 83 + 84 + SYSTEM("./tcp_server.py"); 85 + 86 + map_fd = bpf_find_map(__func__, obj, "global_map"); 87 + if (map_fd < 0) 88 + goto err; 89 + 90 + rv = bpf_map_lookup_elem(map_fd, &key, &g); 91 + if (rv != 0) { 92 + printf("FAILED: bpf_map_lookup_elem returns %d\n", rv); 93 + goto err; 94 + } 95 + 96 + if (g.bytes_received != 501 || g.bytes_acked != 1002 || 97 + g.data_segs_in != 1 || g.data_segs_out != 1 || 98 + (g.event_map ^ 0x47e) != 0 || g.bad_cb_test_rv != 0x80 || 99 + g.good_cb_test_rv != 0) { 100 + printf("FAILED: Wrong stats\n"); 101 + if (debug_flag) { 102 + printf("\n"); 103 + printf("bytes_received: %d (expecting 501)\n", 104 + (int)g.bytes_received); 105 + printf("bytes_acked: %d (expecting 1002)\n", 106 + (int)g.bytes_acked); 107 + printf("data_segs_in: %d (expecting 1)\n", 108 + g.data_segs_in); 109 + printf("data_segs_out: %d (expecting 1)\n", 110 + g.data_segs_out); 111 + printf("event_map: 0x%x (at least 0x47e)\n", 112 + g.event_map); 113 + printf("bad_cb_test_rv: 0x%x (expecting 0x80)\n", 114 + g.bad_cb_test_rv); 115 + printf("good_cb_test_rv:0x%x (expecting 0)\n", 116 + g.good_cb_test_rv); 117 + } 118 + goto err; 119 + } 120 + printf("PASSED!\n"); 121 + error = 0; 122 + err: 123 + bpf_prog_detach(cg_fd, BPF_CGROUP_SOCK_OPS); 124 + return error; 125 + 126 + }
+331 -13
tools/testing/selftests/bpf/test_verifier.c
··· 21 21 #include <stddef.h> 22 22 #include <stdbool.h> 23 23 #include <sched.h> 24 + #include <limits.h> 24 25 25 26 #include <sys/capability.h> 26 27 #include <sys/resource.h> ··· 112 111 BPF_EXIT_INSN(), 113 112 }, 114 113 .result = ACCEPT, 115 - .retval = 0, 114 + .retval = 42, 116 115 }, 117 116 { 118 117 "DIV32 by 0, zero check 2", ··· 124 123 BPF_EXIT_INSN(), 125 124 }, 126 125 .result = ACCEPT, 127 - .retval = 0, 126 + .retval = 42, 128 127 }, 129 128 { 130 129 "DIV64 by 0, zero check", ··· 136 135 BPF_EXIT_INSN(), 137 136 }, 138 137 .result = ACCEPT, 139 - .retval = 0, 138 + .retval = 42, 140 139 }, 141 140 { 142 141 "MOD32 by 0, zero check 1", ··· 148 147 BPF_EXIT_INSN(), 149 148 }, 150 149 .result = ACCEPT, 151 - .retval = 0, 150 + .retval = 42, 152 151 }, 153 152 { 154 153 "MOD32 by 0, zero check 2", ··· 160 159 BPF_EXIT_INSN(), 161 160 }, 162 161 .result = ACCEPT, 163 - .retval = 0, 162 + .retval = 42, 164 163 }, 165 164 { 166 165 "MOD64 by 0, zero check", ··· 172 171 BPF_EXIT_INSN(), 173 172 }, 174 173 .result = ACCEPT, 174 + .retval = 42, 175 + }, 176 + { 177 + "DIV32 by 0, zero check ok, cls", 178 + .insns = { 179 + BPF_MOV32_IMM(BPF_REG_0, 42), 180 + BPF_MOV32_IMM(BPF_REG_1, 2), 181 + BPF_MOV32_IMM(BPF_REG_2, 16), 182 + BPF_ALU32_REG(BPF_DIV, BPF_REG_2, BPF_REG_1), 183 + BPF_MOV64_REG(BPF_REG_0, BPF_REG_2), 184 + BPF_EXIT_INSN(), 185 + }, 186 + .prog_type = BPF_PROG_TYPE_SCHED_CLS, 187 + .result = ACCEPT, 188 + .retval = 8, 189 + }, 190 + { 191 + "DIV32 by 0, zero check 1, cls", 192 + .insns = { 193 + BPF_MOV32_IMM(BPF_REG_1, 0), 194 + BPF_MOV32_IMM(BPF_REG_0, 1), 195 + BPF_ALU32_REG(BPF_DIV, BPF_REG_0, BPF_REG_1), 196 + BPF_EXIT_INSN(), 197 + }, 198 + .prog_type = BPF_PROG_TYPE_SCHED_CLS, 199 + .result = ACCEPT, 175 200 .retval = 0, 201 + }, 202 + { 203 + "DIV32 by 0, zero check 2, cls", 204 + .insns = { 205 + BPF_LD_IMM64(BPF_REG_1, 0xffffffff00000000LL), 206 + BPF_MOV32_IMM(BPF_REG_0, 1), 207 + BPF_ALU32_REG(BPF_DIV, BPF_REG_0, BPF_REG_1), 208 
+ BPF_EXIT_INSN(), 209 + }, 210 + .prog_type = BPF_PROG_TYPE_SCHED_CLS, 211 + .result = ACCEPT, 212 + .retval = 0, 213 + }, 214 + { 215 + "DIV64 by 0, zero check, cls", 216 + .insns = { 217 + BPF_MOV32_IMM(BPF_REG_1, 0), 218 + BPF_MOV32_IMM(BPF_REG_0, 1), 219 + BPF_ALU64_REG(BPF_DIV, BPF_REG_0, BPF_REG_1), 220 + BPF_EXIT_INSN(), 221 + }, 222 + .prog_type = BPF_PROG_TYPE_SCHED_CLS, 223 + .result = ACCEPT, 224 + .retval = 0, 225 + }, 226 + { 227 + "MOD32 by 0, zero check ok, cls", 228 + .insns = { 229 + BPF_MOV32_IMM(BPF_REG_0, 42), 230 + BPF_MOV32_IMM(BPF_REG_1, 3), 231 + BPF_MOV32_IMM(BPF_REG_2, 5), 232 + BPF_ALU32_REG(BPF_MOD, BPF_REG_2, BPF_REG_1), 233 + BPF_MOV64_REG(BPF_REG_0, BPF_REG_2), 234 + BPF_EXIT_INSN(), 235 + }, 236 + .prog_type = BPF_PROG_TYPE_SCHED_CLS, 237 + .result = ACCEPT, 238 + .retval = 2, 239 + }, 240 + { 241 + "MOD32 by 0, zero check 1, cls", 242 + .insns = { 243 + BPF_MOV32_IMM(BPF_REG_1, 0), 244 + BPF_MOV32_IMM(BPF_REG_0, 1), 245 + BPF_ALU32_REG(BPF_MOD, BPF_REG_0, BPF_REG_1), 246 + BPF_EXIT_INSN(), 247 + }, 248 + .prog_type = BPF_PROG_TYPE_SCHED_CLS, 249 + .result = ACCEPT, 250 + .retval = 1, 251 + }, 252 + { 253 + "MOD32 by 0, zero check 2, cls", 254 + .insns = { 255 + BPF_LD_IMM64(BPF_REG_1, 0xffffffff00000000LL), 256 + BPF_MOV32_IMM(BPF_REG_0, 1), 257 + BPF_ALU32_REG(BPF_MOD, BPF_REG_0, BPF_REG_1), 258 + BPF_EXIT_INSN(), 259 + }, 260 + .prog_type = BPF_PROG_TYPE_SCHED_CLS, 261 + .result = ACCEPT, 262 + .retval = 1, 263 + }, 264 + { 265 + "MOD64 by 0, zero check 1, cls", 266 + .insns = { 267 + BPF_MOV32_IMM(BPF_REG_1, 0), 268 + BPF_MOV32_IMM(BPF_REG_0, 2), 269 + BPF_ALU64_REG(BPF_MOD, BPF_REG_0, BPF_REG_1), 270 + BPF_EXIT_INSN(), 271 + }, 272 + .prog_type = BPF_PROG_TYPE_SCHED_CLS, 273 + .result = ACCEPT, 274 + .retval = 2, 275 + }, 276 + { 277 + "MOD64 by 0, zero check 2, cls", 278 + .insns = { 279 + BPF_MOV32_IMM(BPF_REG_1, 0), 280 + BPF_MOV32_IMM(BPF_REG_0, -1), 281 + BPF_ALU64_REG(BPF_MOD, BPF_REG_0, BPF_REG_1), 282 + BPF_EXIT_INSN(), 
283 + }, 284 + .prog_type = BPF_PROG_TYPE_SCHED_CLS, 285 + .result = ACCEPT, 286 + .retval = -1, 287 + }, 288 + /* Just make sure that JITs used udiv/umod as otherwise we get 289 + * an exception from INT_MIN/-1 overflow similarly as with div 290 + * by zero. 291 + */ 292 + { 293 + "DIV32 overflow, check 1", 294 + .insns = { 295 + BPF_MOV32_IMM(BPF_REG_1, -1), 296 + BPF_MOV32_IMM(BPF_REG_0, INT_MIN), 297 + BPF_ALU32_REG(BPF_DIV, BPF_REG_0, BPF_REG_1), 298 + BPF_EXIT_INSN(), 299 + }, 300 + .prog_type = BPF_PROG_TYPE_SCHED_CLS, 301 + .result = ACCEPT, 302 + .retval = 0, 303 + }, 304 + { 305 + "DIV32 overflow, check 2", 306 + .insns = { 307 + BPF_MOV32_IMM(BPF_REG_0, INT_MIN), 308 + BPF_ALU32_IMM(BPF_DIV, BPF_REG_0, -1), 309 + BPF_EXIT_INSN(), 310 + }, 311 + .prog_type = BPF_PROG_TYPE_SCHED_CLS, 312 + .result = ACCEPT, 313 + .retval = 0, 314 + }, 315 + { 316 + "DIV64 overflow, check 1", 317 + .insns = { 318 + BPF_MOV64_IMM(BPF_REG_1, -1), 319 + BPF_LD_IMM64(BPF_REG_0, LLONG_MIN), 320 + BPF_ALU64_REG(BPF_DIV, BPF_REG_0, BPF_REG_1), 321 + BPF_EXIT_INSN(), 322 + }, 323 + .prog_type = BPF_PROG_TYPE_SCHED_CLS, 324 + .result = ACCEPT, 325 + .retval = 0, 326 + }, 327 + { 328 + "DIV64 overflow, check 2", 329 + .insns = { 330 + BPF_LD_IMM64(BPF_REG_0, LLONG_MIN), 331 + BPF_ALU64_IMM(BPF_DIV, BPF_REG_0, -1), 332 + BPF_EXIT_INSN(), 333 + }, 334 + .prog_type = BPF_PROG_TYPE_SCHED_CLS, 335 + .result = ACCEPT, 336 + .retval = 0, 337 + }, 338 + { 339 + "MOD32 overflow, check 1", 340 + .insns = { 341 + BPF_MOV32_IMM(BPF_REG_1, -1), 342 + BPF_MOV32_IMM(BPF_REG_0, INT_MIN), 343 + BPF_ALU32_REG(BPF_MOD, BPF_REG_0, BPF_REG_1), 344 + BPF_EXIT_INSN(), 345 + }, 346 + .prog_type = BPF_PROG_TYPE_SCHED_CLS, 347 + .result = ACCEPT, 348 + .retval = INT_MIN, 349 + }, 350 + { 351 + "MOD32 overflow, check 2", 352 + .insns = { 353 + BPF_MOV32_IMM(BPF_REG_0, INT_MIN), 354 + BPF_ALU32_IMM(BPF_MOD, BPF_REG_0, -1), 355 + BPF_EXIT_INSN(), 356 + }, 357 + .prog_type = BPF_PROG_TYPE_SCHED_CLS, 358 + .result 
= ACCEPT, 359 + .retval = INT_MIN, 360 + }, 361 + { 362 + "MOD64 overflow, check 1", 363 + .insns = { 364 + BPF_MOV64_IMM(BPF_REG_1, -1), 365 + BPF_LD_IMM64(BPF_REG_2, LLONG_MIN), 366 + BPF_MOV64_REG(BPF_REG_3, BPF_REG_2), 367 + BPF_ALU64_REG(BPF_MOD, BPF_REG_2, BPF_REG_1), 368 + BPF_MOV32_IMM(BPF_REG_0, 0), 369 + BPF_JMP_REG(BPF_JNE, BPF_REG_3, BPF_REG_2, 1), 370 + BPF_MOV32_IMM(BPF_REG_0, 1), 371 + BPF_EXIT_INSN(), 372 + }, 373 + .prog_type = BPF_PROG_TYPE_SCHED_CLS, 374 + .result = ACCEPT, 375 + .retval = 1, 376 + }, 377 + { 378 + "MOD64 overflow, check 2", 379 + .insns = { 380 + BPF_LD_IMM64(BPF_REG_2, LLONG_MIN), 381 + BPF_MOV64_REG(BPF_REG_3, BPF_REG_2), 382 + BPF_ALU64_IMM(BPF_MOD, BPF_REG_2, -1), 383 + BPF_MOV32_IMM(BPF_REG_0, 0), 384 + BPF_JMP_REG(BPF_JNE, BPF_REG_3, BPF_REG_2, 1), 385 + BPF_MOV32_IMM(BPF_REG_0, 1), 386 + BPF_EXIT_INSN(), 387 + }, 388 + .prog_type = BPF_PROG_TYPE_SCHED_CLS, 389 + .result = ACCEPT, 390 + .retval = 1, 391 + }, 392 + { 393 + "xor32 zero extend check", 394 + .insns = { 395 + BPF_MOV32_IMM(BPF_REG_2, -1), 396 + BPF_ALU64_IMM(BPF_LSH, BPF_REG_2, 32), 397 + BPF_ALU64_IMM(BPF_OR, BPF_REG_2, 0xffff), 398 + BPF_ALU32_REG(BPF_XOR, BPF_REG_2, BPF_REG_2), 399 + BPF_MOV32_IMM(BPF_REG_0, 2), 400 + BPF_JMP_IMM(BPF_JNE, BPF_REG_2, 0, 1), 401 + BPF_MOV32_IMM(BPF_REG_0, 1), 402 + BPF_EXIT_INSN(), 403 + }, 404 + .prog_type = BPF_PROG_TYPE_SCHED_CLS, 405 + .result = ACCEPT, 406 + .retval = 1, 176 407 }, 177 408 { 178 409 "empty prog", 179 410 .insns = { 180 411 }, 181 - .errstr = "last insn is not an exit or jmp", 412 + .errstr = "unknown opcode 00", 182 413 .result = REJECT, 183 414 }, 184 415 { ··· 607 374 BPF_EXIT_INSN(), 608 375 }, 609 376 .result = REJECT, 610 - .errstr = "BPF_ARSH not supported for 32 bit ALU", 377 + .errstr = "unknown opcode c4", 611 378 }, 612 379 { 613 380 "arsh32 on reg", ··· 618 385 BPF_EXIT_INSN(), 619 386 }, 620 387 .result = REJECT, 621 - .errstr = "BPF_ARSH not supported for 32 bit ALU", 388 + .errstr = "unknown 
opcode cc", 622 389 }, 623 390 { 624 391 "arsh64 on imm", ··· 734 501 BPF_RAW_INSN(BPF_JMP | BPF_CALL | BPF_X, 0, 0, 0, 0), 735 502 BPF_EXIT_INSN(), 736 503 }, 737 - .errstr = "BPF_CALL uses reserved", 504 + .errstr = "unknown opcode 8d", 738 505 .result = REJECT, 739 506 }, 740 507 { ··· 924 691 BPF_RAW_INSN(0, 0, 0, 0, 0), 925 692 BPF_EXIT_INSN(), 926 693 }, 927 - .errstr = "invalid BPF_LD_IMM", 694 + .errstr = "unknown opcode 00", 928 695 .result = REJECT, 929 696 }, 930 697 { ··· 942 709 BPF_RAW_INSN(-1, 0, 0, 0, 0), 943 710 BPF_EXIT_INSN(), 944 711 }, 945 - .errstr = "invalid BPF_ALU opcode f0", 712 + .errstr = "unknown opcode ff", 946 713 .result = REJECT, 947 714 }, 948 715 { ··· 951 718 BPF_RAW_INSN(-1, -1, -1, -1, -1), 952 719 BPF_EXIT_INSN(), 953 720 }, 954 - .errstr = "invalid BPF_ALU opcode f0", 721 + .errstr = "unknown opcode ff", 955 722 .result = REJECT, 956 723 }, 957 724 { ··· 7776 7543 }, 7777 7544 BPF_EXIT_INSN(), 7778 7545 }, 7779 - .errstr = "BPF_END uses reserved fields", 7546 + .errstr = "unknown opcode d7", 7780 7547 .result = REJECT, 7781 7548 }, 7782 7549 { ··· 8999 8766 BPF_EXIT_INSN(), 9000 8767 }, 9001 8768 .result = ACCEPT, 8769 + .retval = 1, 9002 8770 }, 9003 8771 { 9004 8772 "check deducing bounds from const, 3", ··· 9195 8961 .result_unpriv = REJECT, 9196 8962 .result = ACCEPT, 9197 8963 .retval = 1, 8964 + }, 8965 + { 8966 + "calls: div by 0 in subprog", 8967 + .insns = { 8968 + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), 8969 + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 8), 8970 + BPF_MOV64_REG(BPF_REG_1, BPF_REG_6), 8971 + BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_1, 8972 + offsetof(struct __sk_buff, data_end)), 8973 + BPF_MOV64_REG(BPF_REG_2, BPF_REG_0), 8974 + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, 8), 8975 + BPF_JMP_REG(BPF_JGT, BPF_REG_2, BPF_REG_1, 1), 8976 + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_0, 0), 8977 + BPF_MOV64_IMM(BPF_REG_0, 1), 8978 + BPF_EXIT_INSN(), 8979 + BPF_MOV32_IMM(BPF_REG_2, 0), 8980 + BPF_MOV32_IMM(BPF_REG_3, 1), 8981 + 
BPF_ALU32_REG(BPF_DIV, BPF_REG_3, BPF_REG_2), 8982 + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, 8983 + offsetof(struct __sk_buff, data)), 8984 + BPF_EXIT_INSN(), 8985 + }, 8986 + .prog_type = BPF_PROG_TYPE_SCHED_CLS, 8987 + .result = ACCEPT, 8988 + .retval = 1, 8989 + }, 8990 + { 8991 + "calls: multiple ret types in subprog 1", 8992 + .insns = { 8993 + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), 8994 + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 8), 8995 + BPF_MOV64_REG(BPF_REG_1, BPF_REG_6), 8996 + BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_1, 8997 + offsetof(struct __sk_buff, data_end)), 8998 + BPF_MOV64_REG(BPF_REG_2, BPF_REG_0), 8999 + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, 8), 9000 + BPF_JMP_REG(BPF_JGT, BPF_REG_2, BPF_REG_1, 1), 9001 + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_0, 0), 9002 + BPF_MOV64_IMM(BPF_REG_0, 1), 9003 + BPF_EXIT_INSN(), 9004 + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, 9005 + offsetof(struct __sk_buff, data)), 9006 + BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1), 9007 + BPF_MOV32_IMM(BPF_REG_0, 42), 9008 + BPF_EXIT_INSN(), 9009 + }, 9010 + .prog_type = BPF_PROG_TYPE_SCHED_CLS, 9011 + .result = REJECT, 9012 + .errstr = "R0 invalid mem access 'inv'", 9013 + }, 9014 + { 9015 + "calls: multiple ret types in subprog 2", 9016 + .insns = { 9017 + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), 9018 + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 8), 9019 + BPF_MOV64_REG(BPF_REG_1, BPF_REG_6), 9020 + BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_1, 9021 + offsetof(struct __sk_buff, data_end)), 9022 + BPF_MOV64_REG(BPF_REG_2, BPF_REG_0), 9023 + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, 8), 9024 + BPF_JMP_REG(BPF_JGT, BPF_REG_2, BPF_REG_1, 1), 9025 + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_0, 0), 9026 + BPF_MOV64_IMM(BPF_REG_0, 1), 9027 + BPF_EXIT_INSN(), 9028 + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, 9029 + offsetof(struct __sk_buff, data)), 9030 + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), 9031 + BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 9), 9032 + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), 9033 + BPF_MOV64_REG(BPF_REG_2, 
BPF_REG_10), 9034 + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), 9035 + BPF_LD_MAP_FD(BPF_REG_1, 0), 9036 + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, 9037 + BPF_FUNC_map_lookup_elem), 9038 + BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1), 9039 + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_6, 9040 + offsetof(struct __sk_buff, data)), 9041 + BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 64), 9042 + BPF_EXIT_INSN(), 9043 + }, 9044 + .prog_type = BPF_PROG_TYPE_SCHED_CLS, 9045 + .fixup_map1 = { 16 }, 9046 + .result = REJECT, 9047 + .errstr = "R0 min value is outside of the array range", 9198 9048 }, 9199 9049 { 9200 9050 "calls: overlapping caller/callee",