Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf

Daniel Borkmann says:

====================
pull-request: bpf 2021-09-28

The following pull-request contains BPF updates for your *net* tree.

We've added 10 non-merge commits during the last 14 day(s) which contain
a total of 11 files changed, 139 insertions(+), 53 deletions(-).

The main changes are:

1) Fix MIPS JIT jump code emission for too large offsets, from Piotr Krysiuk.

2) Fix x86 JIT atomic/fetch emission when dst reg maps to rax, from Johan Almbladh.

3) Fix cgroup_sk_alloc corner case when called from interrupt, from Daniel Borkmann.

4) Fix segfault in libbpf's linker for objects without BTF, from Kumar Kartikeya Dwivedi.

5) Fix bpf_jit_charge_modmem for applications with CAP_BPF, from Lorenz Bauer.

6) Fix return value handling for struct_ops BPF programs, from Hou Tao.

7) Various fixes to BPF selftests, from Jiri Benc.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>

+139 -53
+2
MAINTAINERS
··· 3384 3384 F: Documentation/userspace-api/ebpf/ 3385 3385 F: arch/*/net/* 3386 3386 F: include/linux/bpf* 3387 + F: include/linux/btf* 3387 3388 F: include/linux/filter.h 3388 3389 F: include/trace/events/xdp.h 3389 3390 F: include/uapi/linux/bpf* 3391 + F: include/uapi/linux/btf* 3390 3392 F: include/uapi/linux/filter.h 3391 3393 F: kernel/bpf/ 3392 3394 F: kernel/trace/bpf_trace.c
+43 -14
arch/mips/net/bpf_jit.c
··· 662 662 ((int)K < 0 ? ((int)K >= SKF_LL_OFF ? func##_negative : func) : \ 663 663 func##_positive) 664 664 665 + static bool is_bad_offset(int b_off) 666 + { 667 + return b_off > 0x1ffff || b_off < -0x20000; 668 + } 669 + 665 670 static int build_body(struct jit_ctx *ctx) 666 671 { 667 672 const struct bpf_prog *prog = ctx->skf; ··· 733 728 /* Load return register on DS for failures */ 734 729 emit_reg_move(r_ret, r_zero, ctx); 735 730 /* Return with error */ 736 - emit_b(b_imm(prog->len, ctx), ctx); 731 + b_off = b_imm(prog->len, ctx); 732 + if (is_bad_offset(b_off)) 733 + return -E2BIG; 734 + emit_b(b_off, ctx); 737 735 emit_nop(ctx); 738 736 break; 739 737 case BPF_LD | BPF_W | BPF_IND: ··· 783 775 emit_jalr(MIPS_R_RA, r_s0, ctx); 784 776 emit_reg_move(MIPS_R_A0, r_skb, ctx); /* delay slot */ 785 777 /* Check the error value */ 786 - emit_bcond(MIPS_COND_NE, r_ret, 0, 787 - b_imm(prog->len, ctx), ctx); 778 + b_off = b_imm(prog->len, ctx); 779 + if (is_bad_offset(b_off)) 780 + return -E2BIG; 781 + emit_bcond(MIPS_COND_NE, r_ret, 0, b_off, ctx); 788 782 emit_reg_move(r_ret, r_zero, ctx); 789 783 /* We are good */ 790 784 /* X <- P[1:K] & 0xf */ ··· 865 855 /* A /= X */ 866 856 ctx->flags |= SEEN_X | SEEN_A; 867 857 /* Check if r_X is zero */ 868 - emit_bcond(MIPS_COND_EQ, r_X, r_zero, 869 - b_imm(prog->len, ctx), ctx); 858 + b_off = b_imm(prog->len, ctx); 859 + if (is_bad_offset(b_off)) 860 + return -E2BIG; 861 + emit_bcond(MIPS_COND_EQ, r_X, r_zero, b_off, ctx); 870 862 emit_load_imm(r_ret, 0, ctx); /* delay slot */ 871 863 emit_div(r_A, r_X, ctx); 872 864 break; ··· 876 864 /* A %= X */ 877 865 ctx->flags |= SEEN_X | SEEN_A; 878 866 /* Check if r_X is zero */ 879 - emit_bcond(MIPS_COND_EQ, r_X, r_zero, 880 - b_imm(prog->len, ctx), ctx); 867 + b_off = b_imm(prog->len, ctx); 868 + if (is_bad_offset(b_off)) 869 + return -E2BIG; 870 + emit_bcond(MIPS_COND_EQ, r_X, r_zero, b_off, ctx); 881 871 emit_load_imm(r_ret, 0, ctx); /* delay slot */ 882 872 emit_mod(r_A, 
r_X, ctx); 883 873 break; ··· 940 926 break; 941 927 case BPF_JMP | BPF_JA: 942 928 /* pc += K */ 943 - emit_b(b_imm(i + k + 1, ctx), ctx); 929 + b_off = b_imm(i + k + 1, ctx); 930 + if (is_bad_offset(b_off)) 931 + return -E2BIG; 932 + emit_b(b_off, ctx); 944 933 emit_nop(ctx); 945 934 break; 946 935 case BPF_JMP | BPF_JEQ | BPF_K: ··· 1073 1056 break; 1074 1057 case BPF_RET | BPF_A: 1075 1058 ctx->flags |= SEEN_A; 1076 - if (i != prog->len - 1) 1059 + if (i != prog->len - 1) { 1077 1060 /* 1078 1061 * If this is not the last instruction 1079 1062 * then jump to the epilogue 1080 1063 */ 1081 - emit_b(b_imm(prog->len, ctx), ctx); 1064 + b_off = b_imm(prog->len, ctx); 1065 + if (is_bad_offset(b_off)) 1066 + return -E2BIG; 1067 + emit_b(b_off, ctx); 1068 + } 1082 1069 emit_reg_move(r_ret, r_A, ctx); /* delay slot */ 1083 1070 break; 1084 1071 case BPF_RET | BPF_K: ··· 1096 1075 * If this is not the last instruction 1097 1076 * then jump to the epilogue 1098 1077 */ 1099 - emit_b(b_imm(prog->len, ctx), ctx); 1078 + b_off = b_imm(prog->len, ctx); 1079 + if (is_bad_offset(b_off)) 1080 + return -E2BIG; 1081 + emit_b(b_off, ctx); 1100 1082 emit_nop(ctx); 1101 1083 } 1102 1084 break; ··· 1157 1133 /* Load *dev pointer */ 1158 1134 emit_load_ptr(r_s0, r_skb, off, ctx); 1159 1135 /* error (0) in the delay slot */ 1160 - emit_bcond(MIPS_COND_EQ, r_s0, r_zero, 1161 - b_imm(prog->len, ctx), ctx); 1136 + b_off = b_imm(prog->len, ctx); 1137 + if (is_bad_offset(b_off)) 1138 + return -E2BIG; 1139 + emit_bcond(MIPS_COND_EQ, r_s0, r_zero, b_off, ctx); 1162 1140 emit_reg_move(r_ret, r_zero, ctx); 1163 1141 if (code == (BPF_ANC | SKF_AD_IFINDEX)) { 1164 1142 BUILD_BUG_ON(sizeof_field(struct net_device, ifindex) != 4); ··· 1270 1244 1271 1245 /* Generate the actual JIT code */ 1272 1246 build_prologue(&ctx); 1273 - build_body(&ctx); 1247 + if (build_body(&ctx)) { 1248 + module_memfree(ctx.target); 1249 + goto out; 1250 + } 1274 1251 build_epilogue(&ctx); 1275 1252 1276 1253 /* Update 
the icache */
+48 -18
arch/x86/net/bpf_jit_comp.c
··· 1341 1341 if (insn->imm == (BPF_AND | BPF_FETCH) || 1342 1342 insn->imm == (BPF_OR | BPF_FETCH) || 1343 1343 insn->imm == (BPF_XOR | BPF_FETCH)) { 1344 - u8 *branch_target; 1345 1344 bool is64 = BPF_SIZE(insn->code) == BPF_DW; 1346 1345 u32 real_src_reg = src_reg; 1346 + u32 real_dst_reg = dst_reg; 1347 + u8 *branch_target; 1347 1348 1348 1349 /* 1349 1350 * Can't be implemented with a single x86 insn. ··· 1355 1354 emit_mov_reg(&prog, true, BPF_REG_AX, BPF_REG_0); 1356 1355 if (src_reg == BPF_REG_0) 1357 1356 real_src_reg = BPF_REG_AX; 1357 + if (dst_reg == BPF_REG_0) 1358 + real_dst_reg = BPF_REG_AX; 1358 1359 1359 1360 branch_target = prog; 1360 1361 /* Load old value */ 1361 1362 emit_ldx(&prog, BPF_SIZE(insn->code), 1362 - BPF_REG_0, dst_reg, insn->off); 1363 + BPF_REG_0, real_dst_reg, insn->off); 1363 1364 /* 1364 1365 * Perform the (commutative) operation locally, 1365 1366 * put the result in the AUX_REG. ··· 1372 1369 add_2reg(0xC0, AUX_REG, real_src_reg)); 1373 1370 /* Attempt to swap in new value */ 1374 1371 err = emit_atomic(&prog, BPF_CMPXCHG, 1375 - dst_reg, AUX_REG, insn->off, 1372 + real_dst_reg, AUX_REG, 1373 + insn->off, 1376 1374 BPF_SIZE(insn->code)); 1377 1375 if (WARN_ON(err)) 1378 1376 return err; ··· 1387 1383 /* Restore R0 after clobbering RAX */ 1388 1384 emit_mov_reg(&prog, true, BPF_REG_0, BPF_REG_AX); 1389 1385 break; 1390 - 1391 1386 } 1392 1387 1393 1388 err = emit_atomic(&prog, insn->imm, dst_reg, src_reg, 1394 - insn->off, BPF_SIZE(insn->code)); 1389 + insn->off, BPF_SIZE(insn->code)); 1395 1390 if (err) 1396 1391 return err; 1397 1392 break; ··· 1747 1744 } 1748 1745 1749 1746 static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog, 1750 - struct bpf_prog *p, int stack_size, bool mod_ret) 1747 + struct bpf_prog *p, int stack_size, bool save_ret) 1751 1748 { 1752 1749 u8 *prog = *pprog; 1753 1750 u8 *jmp_insn; ··· 1780 1777 if (emit_call(&prog, p->bpf_func, prog)) 1781 1778 return -EINVAL; 1782 1779 1783 - /* 
BPF_TRAMP_MODIFY_RETURN trampolines can modify the return 1780 + /* 1781 + * BPF_TRAMP_MODIFY_RETURN trampolines can modify the return 1784 1782 * of the previous call which is then passed on the stack to 1785 1783 * the next BPF program. 1784 + * 1785 + * BPF_TRAMP_FENTRY trampoline may need to return the return 1786 + * value of BPF_PROG_TYPE_STRUCT_OPS prog. 1786 1787 */ 1787 - if (mod_ret) 1788 + if (save_ret) 1788 1789 emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -8); 1789 1790 1790 1791 /* replace 2 nops with JE insn, since jmp target is known */ ··· 1835 1828 } 1836 1829 1837 1830 static int invoke_bpf(const struct btf_func_model *m, u8 **pprog, 1838 - struct bpf_tramp_progs *tp, int stack_size) 1831 + struct bpf_tramp_progs *tp, int stack_size, 1832 + bool save_ret) 1839 1833 { 1840 1834 int i; 1841 1835 u8 *prog = *pprog; 1842 1836 1843 1837 for (i = 0; i < tp->nr_progs; i++) { 1844 - if (invoke_bpf_prog(m, &prog, tp->progs[i], stack_size, false)) 1838 + if (invoke_bpf_prog(m, &prog, tp->progs[i], stack_size, 1839 + save_ret)) 1845 1840 return -EINVAL; 1846 1841 } 1847 1842 *pprog = prog; ··· 1884 1875 1885 1876 *pprog = prog; 1886 1877 return 0; 1878 + } 1879 + 1880 + static bool is_valid_bpf_tramp_flags(unsigned int flags) 1881 + { 1882 + if ((flags & BPF_TRAMP_F_RESTORE_REGS) && 1883 + (flags & BPF_TRAMP_F_SKIP_FRAME)) 1884 + return false; 1885 + 1886 + /* 1887 + * BPF_TRAMP_F_RET_FENTRY_RET is only used by bpf_struct_ops, 1888 + * and it must be used alone. 1889 + */ 1890 + if ((flags & BPF_TRAMP_F_RET_FENTRY_RET) && 1891 + (flags & ~BPF_TRAMP_F_RET_FENTRY_RET)) 1892 + return false; 1893 + 1894 + return true; 1887 1895 } 1888 1896 1889 1897 /* Example: ··· 1975 1949 struct bpf_tramp_progs *fmod_ret = &tprogs[BPF_TRAMP_MODIFY_RETURN]; 1976 1950 u8 **branches = NULL; 1977 1951 u8 *prog; 1952 + bool save_ret; 1978 1953 1979 1954 /* x86-64 supports up to 6 arguments. 
7+ can be added in the future */ 1980 1955 if (nr_args > 6) 1981 1956 return -ENOTSUPP; 1982 1957 1983 - if ((flags & BPF_TRAMP_F_RESTORE_REGS) && 1984 - (flags & BPF_TRAMP_F_SKIP_FRAME)) 1958 + if (!is_valid_bpf_tramp_flags(flags)) 1985 1959 return -EINVAL; 1986 1960 1987 - if (flags & BPF_TRAMP_F_CALL_ORIG) 1988 - stack_size += 8; /* room for return value of orig_call */ 1961 + /* room for return value of orig_call or fentry prog */ 1962 + save_ret = flags & (BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_RET_FENTRY_RET); 1963 + if (save_ret) 1964 + stack_size += 8; 1989 1965 1990 1966 if (flags & BPF_TRAMP_F_IP_ARG) 1991 1967 stack_size += 8; /* room for IP address argument */ ··· 2033 2005 } 2034 2006 2035 2007 if (fentry->nr_progs) 2036 - if (invoke_bpf(m, &prog, fentry, stack_size)) 2008 + if (invoke_bpf(m, &prog, fentry, stack_size, 2009 + flags & BPF_TRAMP_F_RET_FENTRY_RET)) 2037 2010 return -EINVAL; 2038 2011 2039 2012 if (fmod_ret->nr_progs) { ··· 2081 2052 } 2082 2053 2083 2054 if (fexit->nr_progs) 2084 - if (invoke_bpf(m, &prog, fexit, stack_size)) { 2055 + if (invoke_bpf(m, &prog, fexit, stack_size, false)) { 2085 2056 ret = -EINVAL; 2086 2057 goto cleanup; 2087 2058 } ··· 2101 2072 ret = -EINVAL; 2102 2073 goto cleanup; 2103 2074 } 2104 - /* restore original return value back into RAX */ 2105 - emit_ldx(&prog, BPF_DW, BPF_REG_0, BPF_REG_FP, -8); 2106 2075 } 2076 + /* restore return value of orig_call or fentry prog back into RAX */ 2077 + if (save_ret) 2078 + emit_ldx(&prog, BPF_DW, BPF_REG_0, BPF_REG_FP, -8); 2107 2079 2108 2080 EMIT1(0x5B); /* pop rbx */ 2109 2081 EMIT1(0xC9); /* leave */
+2 -1
include/linux/bpf.h
··· 578 578 * programs only. Should not be used with normal calls and indirect calls. 579 579 */ 580 580 #define BPF_TRAMP_F_SKIP_FRAME BIT(2) 581 - 582 581 /* Store IP address of the caller on the trampoline stack, 583 582 * so it's available for trampoline's programs. 584 583 */ 585 584 #define BPF_TRAMP_F_IP_ARG BIT(3) 585 + /* Return the return value of fentry prog. Only used by bpf_struct_ops. */ 586 + #define BPF_TRAMP_F_RET_FENTRY_RET BIT(4) 586 587 587 588 /* Each call __bpf_prog_enter + call bpf_func + call __bpf_prog_exit is ~50 588 589 * bytes on x86. Pick a number to fit into BPF_IMAGE_SIZE / 2
+5 -2
kernel/bpf/bpf_struct_ops.c
··· 368 368 const struct btf_type *mtype, *ptype; 369 369 struct bpf_prog *prog; 370 370 u32 moff; 371 + u32 flags; 371 372 372 373 moff = btf_member_bit_offset(t, member) / 8; 373 374 ptype = btf_type_resolve_ptr(btf_vmlinux, member->type, NULL); ··· 432 431 433 432 tprogs[BPF_TRAMP_FENTRY].progs[0] = prog; 434 433 tprogs[BPF_TRAMP_FENTRY].nr_progs = 1; 434 + flags = st_ops->func_models[i].ret_size > 0 ? 435 + BPF_TRAMP_F_RET_FENTRY_RET : 0; 435 436 err = arch_prepare_bpf_trampoline(NULL, image, 436 437 st_map->image + PAGE_SIZE, 437 - &st_ops->func_models[i], 0, 438 - tprogs, NULL); 438 + &st_ops->func_models[i], 439 + flags, tprogs, NULL); 439 440 if (err < 0) 440 441 goto reset_unlock; 441 442
+1 -1
kernel/bpf/core.c
··· 827 827 { 828 828 if (atomic_long_add_return(pages, &bpf_jit_current) > 829 829 (bpf_jit_limit >> PAGE_SHIFT)) { 830 - if (!capable(CAP_SYS_ADMIN)) { 830 + if (!bpf_capable()) { 831 831 atomic_long_sub(pages, &bpf_jit_current); 832 832 return -EPERM; 833 833 }
+12 -5
kernel/cgroup/cgroup.c
··· 6574 6574 6575 6575 void cgroup_sk_alloc(struct sock_cgroup_data *skcd) 6576 6576 { 6577 - /* Don't associate the sock with unrelated interrupted task's cgroup. */ 6578 - if (in_interrupt()) 6579 - return; 6577 + struct cgroup *cgroup; 6580 6578 6581 6579 rcu_read_lock(); 6580 + /* Don't associate the sock with unrelated interrupted task's cgroup. */ 6581 + if (in_interrupt()) { 6582 + cgroup = &cgrp_dfl_root.cgrp; 6583 + cgroup_get(cgroup); 6584 + goto out; 6585 + } 6586 + 6582 6587 while (true) { 6583 6588 struct css_set *cset; 6584 6589 6585 6590 cset = task_css_set(current); 6586 6591 if (likely(cgroup_tryget(cset->dfl_cgrp))) { 6587 - skcd->cgroup = cset->dfl_cgrp; 6588 - cgroup_bpf_get(cset->dfl_cgrp); 6592 + cgroup = cset->dfl_cgrp; 6589 6593 break; 6590 6594 } 6591 6595 cpu_relax(); 6592 6596 } 6597 + out: 6598 + skcd->cgroup = cgroup; 6599 + cgroup_bpf_get(cgroup); 6593 6600 rcu_read_unlock(); 6594 6601 } 6595 6602
+9 -5
net/bpf/test_run.c
··· 552 552 __skb->gso_segs = skb_shinfo(skb)->gso_segs; 553 553 } 554 554 555 + static struct proto bpf_dummy_proto = { 556 + .name = "bpf_dummy", 557 + .owner = THIS_MODULE, 558 + .obj_size = sizeof(struct sock), 559 + }; 560 + 555 561 int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, 556 562 union bpf_attr __user *uattr) 557 563 { ··· 602 596 break; 603 597 } 604 598 605 - sk = kzalloc(sizeof(struct sock), GFP_USER); 599 + sk = sk_alloc(net, AF_UNSPEC, GFP_USER, &bpf_dummy_proto, 1); 606 600 if (!sk) { 607 601 kfree(data); 608 602 kfree(ctx); 609 603 return -ENOMEM; 610 604 } 611 - sock_net_set(sk, net); 612 605 sock_init_data(NULL, sk); 613 606 614 607 skb = build_skb(data, 0); 615 608 if (!skb) { 616 609 kfree(data); 617 610 kfree(ctx); 618 - kfree(sk); 611 + sk_free(sk); 619 612 return -ENOMEM; 620 613 } 621 614 skb->sk = sk; ··· 687 682 if (dev && dev != net->loopback_dev) 688 683 dev_put(dev); 689 684 kfree_skb(skb); 690 - bpf_sk_storage_free(sk); 691 - kfree(sk); 685 + sk_free(sk); 692 686 kfree(ctx); 693 687 return ret; 694 688 }
+7 -1
tools/lib/bpf/linker.c
··· 1649 1649 static int find_glob_sym_btf(struct src_obj *obj, Elf64_Sym *sym, const char *sym_name, 1650 1650 int *out_btf_sec_id, int *out_btf_id) 1651 1651 { 1652 - int i, j, n = btf__get_nr_types(obj->btf), m, btf_id = 0; 1652 + int i, j, n, m, btf_id = 0; 1653 1653 const struct btf_type *t; 1654 1654 const struct btf_var_secinfo *vi; 1655 1655 const char *name; 1656 1656 1657 + if (!obj->btf) { 1658 + pr_warn("failed to find BTF info for object '%s'\n", obj->filename); 1659 + return -EINVAL; 1660 + } 1661 + 1662 + n = btf__get_nr_types(obj->btf); 1657 1663 for (i = 1; i <= n; i++) { 1658 1664 t = btf__type_by_id(obj->btf, i); 1659 1665
+2 -1
tools/testing/selftests/bpf/Makefile
··· 375 375 $(TRUNNER_BPF_PROGS_DIR)/%.c \ 376 376 $(TRUNNER_BPF_PROGS_DIR)/*.h \ 377 377 $$(INCLUDE_DIR)/vmlinux.h \ 378 - $(wildcard $(BPFDIR)/bpf_*.h) | $(TRUNNER_OUTPUT) 378 + $(wildcard $(BPFDIR)/bpf_*.h) \ 379 + | $(TRUNNER_OUTPUT) $$(BPFOBJ) 379 380 $$(call $(TRUNNER_BPF_BUILD_RULE),$$<,$$@, \ 380 381 $(TRUNNER_BPF_CFLAGS)) 381 382
+8 -5
tools/testing/selftests/bpf/test_lwt_ip_encap.sh
··· 112 112 ip netns add "${NS2}" 113 113 ip netns add "${NS3}" 114 114 115 + # rp_filter gets confused by what these tests are doing, so disable it 116 + ip netns exec ${NS1} sysctl -wq net.ipv4.conf.all.rp_filter=0 117 + ip netns exec ${NS2} sysctl -wq net.ipv4.conf.all.rp_filter=0 118 + ip netns exec ${NS3} sysctl -wq net.ipv4.conf.all.rp_filter=0 119 + ip netns exec ${NS1} sysctl -wq net.ipv4.conf.default.rp_filter=0 120 + ip netns exec ${NS2} sysctl -wq net.ipv4.conf.default.rp_filter=0 121 + ip netns exec ${NS3} sysctl -wq net.ipv4.conf.default.rp_filter=0 122 + 115 123 ip link add veth1 type veth peer name veth2 116 124 ip link add veth3 type veth peer name veth4 117 125 ip link add veth5 type veth peer name veth6 ··· 243 235 ip -netns ${NS3} -6 addr add ${IPv6_GRE} nodad dev gre6_dev 244 236 ip -netns ${NS1} -6 route add ${IPv6_GRE}/128 dev veth5 via ${IPv6_6} ${VRF} 245 237 ip -netns ${NS2} -6 route add ${IPv6_GRE}/128 dev veth7 via ${IPv6_8} ${VRF} 246 - 247 - # rp_filter gets confused by what these tests are doing, so disable it 248 - ip netns exec ${NS1} sysctl -wq net.ipv4.conf.all.rp_filter=0 249 - ip netns exec ${NS2} sysctl -wq net.ipv4.conf.all.rp_filter=0 250 - ip netns exec ${NS3} sysctl -wq net.ipv4.conf.all.rp_filter=0 251 238 252 239 TMPFILE=$(mktemp /tmp/test_lwt_ip_encap.XXXXXX) 253 240