Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf

Daniel Borkmann says:

====================
pull-request: bpf 2018-06-16

The following pull-request contains BPF updates for your *net* tree.

The main changes are:

1) Fix a panic in devmap handling in generic XDP where return type
of __devmap_lookup_elem() got changed recently but generic XDP
code missed the related update, from Toshiaki.

2) Fix a freeze when BPF progs are loaded that include BPF to BPF
calls when JIT is enabled where we would later bail out via error
path w/o dropping kallsyms, and another one to silence syzkaller
splats from locking prog read-only, from Daniel.

3) Fix a bug in test_offloads.py BPF selftest which must not assume
that the underlying system has no BPF progs loaded prior to the test,
and one in bpftool to fix accuracy of program load time, from Jakub.

4) Fix a bug in bpftool's probe for availability of the bpf(2)
BPF_TASK_FD_QUERY subcommand, from Yonghong.

5) Fix a regression in AF_XDP's XDP_SKB receive path where queue
id check got erroneously removed, from Björn.

6) Fix missing state cleanup in BPF's xfrm tunnel test, from William.

7) Check tunnel type more accurately in BPF's tunnel collect metadata
kselftest, from Jian.

8) Fix missing Kconfig fragments for BPF kselftests, from Anders.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>

+194 -71
+12
include/linux/bpf.h
··· 488 488 489 489 /* Map specifics */ 490 490 struct xdp_buff; 491 + struct sk_buff; 491 492 492 493 struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key); 493 494 void __dev_map_insert_ctx(struct bpf_map *map, u32 index); 494 495 void __dev_map_flush(struct bpf_map *map); 495 496 int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, 496 497 struct net_device *dev_rx); 498 + int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb, 499 + struct bpf_prog *xdp_prog); 497 500 498 501 struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key); 499 502 void __cpu_map_insert_ctx(struct bpf_map *map, u32 index); ··· 585 582 static inline 586 583 int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, 587 584 struct net_device *dev_rx) 585 + { 586 + return 0; 587 + } 588 + 589 + struct sk_buff; 590 + 591 + static inline int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, 592 + struct sk_buff *skb, 593 + struct bpf_prog *xdp_prog) 588 594 { 589 595 return 0; 590 596 }
+57 -22
include/linux/filter.h
··· 19 19 #include <linux/cryptohash.h> 20 20 #include <linux/set_memory.h> 21 21 #include <linux/kallsyms.h> 22 + #include <linux/if_vlan.h> 22 23 23 24 #include <net/sch_generic.h> 24 25 ··· 470 469 }; 471 470 472 471 struct bpf_binary_header { 473 - unsigned int pages; 472 + u16 pages; 473 + u16 locked:1; 474 474 u8 image[]; 475 475 }; 476 476 ··· 673 671 674 672 #define bpf_classic_proglen(fprog) (fprog->len * sizeof(fprog->filter[0])) 675 673 676 - #ifdef CONFIG_ARCH_HAS_SET_MEMORY 677 674 static inline void bpf_prog_lock_ro(struct bpf_prog *fp) 678 675 { 676 + #ifdef CONFIG_ARCH_HAS_SET_MEMORY 679 677 fp->locked = 1; 680 - WARN_ON_ONCE(set_memory_ro((unsigned long)fp, fp->pages)); 678 + if (set_memory_ro((unsigned long)fp, fp->pages)) 679 + fp->locked = 0; 680 + #endif 681 681 } 682 682 683 683 static inline void bpf_prog_unlock_ro(struct bpf_prog *fp) 684 684 { 685 + #ifdef CONFIG_ARCH_HAS_SET_MEMORY 685 686 if (fp->locked) { 686 687 WARN_ON_ONCE(set_memory_rw((unsigned long)fp, fp->pages)); 687 688 /* In case set_memory_rw() fails, we want to be the first ··· 692 687 */ 693 688 fp->locked = 0; 694 689 } 690 + #endif 695 691 } 696 692 697 693 static inline void bpf_jit_binary_lock_ro(struct bpf_binary_header *hdr) 698 694 { 699 - WARN_ON_ONCE(set_memory_ro((unsigned long)hdr, hdr->pages)); 695 + #ifdef CONFIG_ARCH_HAS_SET_MEMORY 696 + hdr->locked = 1; 697 + if (set_memory_ro((unsigned long)hdr, hdr->pages)) 698 + hdr->locked = 0; 699 + #endif 700 700 } 701 701 702 702 static inline void bpf_jit_binary_unlock_ro(struct bpf_binary_header *hdr) 703 703 { 704 - WARN_ON_ONCE(set_memory_rw((unsigned long)hdr, hdr->pages)); 704 + #ifdef CONFIG_ARCH_HAS_SET_MEMORY 705 + if (hdr->locked) { 706 + WARN_ON_ONCE(set_memory_rw((unsigned long)hdr, hdr->pages)); 707 + /* In case set_memory_rw() fails, we want to be the first 708 + * to crash here instead of some random place later on. 
709 + */ 710 + hdr->locked = 0; 711 + } 712 + #endif 705 713 } 706 - #else 707 - static inline void bpf_prog_lock_ro(struct bpf_prog *fp) 708 - { 709 - } 710 - 711 - static inline void bpf_prog_unlock_ro(struct bpf_prog *fp) 712 - { 713 - } 714 - 715 - static inline void bpf_jit_binary_lock_ro(struct bpf_binary_header *hdr) 716 - { 717 - } 718 - 719 - static inline void bpf_jit_binary_unlock_ro(struct bpf_binary_header *hdr) 720 - { 721 - } 722 - #endif /* CONFIG_ARCH_HAS_SET_MEMORY */ 723 714 724 715 static inline struct bpf_binary_header * 725 716 bpf_jit_binary_hdr(const struct bpf_prog *fp) ··· 725 724 726 725 return (void *)addr; 727 726 } 727 + 728 + #ifdef CONFIG_ARCH_HAS_SET_MEMORY 729 + static inline int bpf_prog_check_pages_ro_single(const struct bpf_prog *fp) 730 + { 731 + if (!fp->locked) 732 + return -ENOLCK; 733 + if (fp->jited) { 734 + const struct bpf_binary_header *hdr = bpf_jit_binary_hdr(fp); 735 + 736 + if (!hdr->locked) 737 + return -ENOLCK; 738 + } 739 + 740 + return 0; 741 + } 742 + #endif 728 743 729 744 int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap); 730 745 static inline int sk_filter(struct sock *sk, struct sk_buff *skb) ··· 802 785 803 786 struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, 804 787 const struct bpf_insn *patch, u32 len); 788 + 789 + static inline int __xdp_generic_ok_fwd_dev(struct sk_buff *skb, 790 + struct net_device *fwd) 791 + { 792 + unsigned int len; 793 + 794 + if (unlikely(!(fwd->flags & IFF_UP))) 795 + return -ENETDOWN; 796 + 797 + len = fwd->mtu + fwd->hard_header_len + VLAN_HLEN; 798 + if (skb->len > len) 799 + return -EMSGSIZE; 800 + 801 + return 0; 802 + } 805 803 806 804 /* The pair of xdp_do_redirect and xdp_do_flush_map MUST be called in the 807 805 * same cpu context. 
Further for best results no more than a single map ··· 992 960 { 993 961 } 994 962 #endif /* CONFIG_BPF_JIT */ 963 + 964 + void bpf_prog_kallsyms_del_subprogs(struct bpf_prog *fp); 965 + void bpf_prog_kallsyms_del_all(struct bpf_prog *fp); 995 966 996 967 #define BPF_ANC BIT(15) 997 968
+61 -6
kernel/bpf/core.c
··· 350 350 return prog_adj; 351 351 } 352 352 353 + void bpf_prog_kallsyms_del_subprogs(struct bpf_prog *fp) 354 + { 355 + int i; 356 + 357 + for (i = 0; i < fp->aux->func_cnt; i++) 358 + bpf_prog_kallsyms_del(fp->aux->func[i]); 359 + } 360 + 361 + void bpf_prog_kallsyms_del_all(struct bpf_prog *fp) 362 + { 363 + bpf_prog_kallsyms_del_subprogs(fp); 364 + bpf_prog_kallsyms_del(fp); 365 + } 366 + 353 367 #ifdef CONFIG_BPF_JIT 354 368 /* All BPF JIT sysctl knobs here. */ 355 369 int bpf_jit_enable __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_ALWAYS_ON); ··· 598 584 bpf_fill_ill_insns(hdr, size); 599 585 600 586 hdr->pages = size / PAGE_SIZE; 587 + hdr->locked = 0; 588 + 601 589 hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)), 602 590 PAGE_SIZE - sizeof(*hdr)); 603 591 start = (get_random_int() % hole) & ~(alignment - 1); ··· 1450 1434 return 0; 1451 1435 } 1452 1436 1437 + static int bpf_prog_check_pages_ro_locked(const struct bpf_prog *fp) 1438 + { 1439 + #ifdef CONFIG_ARCH_HAS_SET_MEMORY 1440 + int i, err; 1441 + 1442 + for (i = 0; i < fp->aux->func_cnt; i++) { 1443 + err = bpf_prog_check_pages_ro_single(fp->aux->func[i]); 1444 + if (err) 1445 + return err; 1446 + } 1447 + 1448 + return bpf_prog_check_pages_ro_single(fp); 1449 + #endif 1450 + return 0; 1451 + } 1452 + 1453 + static void bpf_prog_select_func(struct bpf_prog *fp) 1454 + { 1455 + #ifndef CONFIG_BPF_JIT_ALWAYS_ON 1456 + u32 stack_depth = max_t(u32, fp->aux->stack_depth, 1); 1457 + 1458 + fp->bpf_func = interpreters[(round_up(stack_depth, 32) / 32) - 1]; 1459 + #else 1460 + fp->bpf_func = __bpf_prog_ret0_warn; 1461 + #endif 1462 + } 1463 + 1453 1464 /** 1454 1465 * bpf_prog_select_runtime - select exec runtime for BPF program 1455 1466 * @fp: bpf_prog populated with internal BPF program ··· 1487 1444 */ 1488 1445 struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) 1489 1446 { 1490 - #ifndef CONFIG_BPF_JIT_ALWAYS_ON 1491 - u32 stack_depth = max_t(u32, fp->aux->stack_depth, 1); 
1447 + /* In case of BPF to BPF calls, verifier did all the prep 1448 + * work with regards to JITing, etc. 1449 + */ 1450 + if (fp->bpf_func) 1451 + goto finalize; 1492 1452 1493 - fp->bpf_func = interpreters[(round_up(stack_depth, 32) / 32) - 1]; 1494 - #else 1495 - fp->bpf_func = __bpf_prog_ret0_warn; 1496 - #endif 1453 + bpf_prog_select_func(fp); 1497 1454 1498 1455 /* eBPF JITs can rewrite the program in case constant 1499 1456 * blinding is active. However, in case of error during ··· 1514 1471 if (*err) 1515 1472 return fp; 1516 1473 } 1474 + 1475 + finalize: 1517 1476 bpf_prog_lock_ro(fp); 1518 1477 1519 1478 /* The tail call compatibility check can only be done at ··· 1524 1479 * all eBPF JITs might immediately support all features. 1525 1480 */ 1526 1481 *err = bpf_check_tail_call(fp); 1482 + if (*err) 1483 + return fp; 1527 1484 1485 + /* Checkpoint: at this point onwards any cBPF -> eBPF or 1486 + * native eBPF program is read-only. If we failed to change 1487 + * the page attributes (e.g. allocation failure from 1488 + * splitting large pages), then reject the whole program 1489 + * in order to guarantee not ending up with any W+X pages 1490 + * from BPF side in kernel. 1491 + */ 1492 + *err = bpf_prog_check_pages_ro_locked(fp); 1528 1493 return fp; 1529 1494 } 1530 1495 EXPORT_SYMBOL_GPL(bpf_prog_select_runtime);
+14
kernel/bpf/devmap.c
··· 345 345 return bq_enqueue(dst, xdpf, dev_rx); 346 346 } 347 347 348 + int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb, 349 + struct bpf_prog *xdp_prog) 350 + { 351 + int err; 352 + 353 + err = __xdp_generic_ok_fwd_dev(skb, dst->dev); 354 + if (unlikely(err)) 355 + return err; 356 + skb->dev = dst->dev; 357 + generic_xdp_tx(skb, xdp_prog); 358 + 359 + return 0; 360 + } 361 + 348 362 static void *dev_map_lookup_elem(struct bpf_map *map, void *key) 349 363 { 350 364 struct bpf_dtab_netdev *obj = __dev_map_lookup_elem(map, *(u32 *)key);
+3 -9
kernel/bpf/syscall.c
··· 1034 1034 static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock) 1035 1035 { 1036 1036 if (atomic_dec_and_test(&prog->aux->refcnt)) { 1037 - int i; 1038 - 1039 1037 /* bpf_prog_free_id() must be called first */ 1040 1038 bpf_prog_free_id(prog, do_idr_lock); 1041 - 1042 - for (i = 0; i < prog->aux->func_cnt; i++) 1043 - bpf_prog_kallsyms_del(prog->aux->func[i]); 1044 - bpf_prog_kallsyms_del(prog); 1039 + bpf_prog_kallsyms_del_all(prog); 1045 1040 1046 1041 call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu); 1047 1042 } ··· 1353 1358 if (err < 0) 1354 1359 goto free_used_maps; 1355 1360 1356 - /* eBPF program is ready to be JITed */ 1357 - if (!prog->bpf_func) 1358 - prog = bpf_prog_select_runtime(prog, &err); 1361 + prog = bpf_prog_select_runtime(prog, &err); 1359 1362 if (err < 0) 1360 1363 goto free_used_maps; 1361 1364 ··· 1377 1384 return err; 1378 1385 1379 1386 free_used_maps: 1387 + bpf_prog_kallsyms_del_subprogs(prog); 1380 1388 free_used_maps(prog->aux); 1381 1389 free_prog: 1382 1390 bpf_prog_uncharge_memlock(prog);
+4 -17
net/core/filter.c
··· 3214 3214 } 3215 3215 EXPORT_SYMBOL_GPL(xdp_do_redirect); 3216 3216 3217 - static int __xdp_generic_ok_fwd_dev(struct sk_buff *skb, struct net_device *fwd) 3218 - { 3219 - unsigned int len; 3220 - 3221 - if (unlikely(!(fwd->flags & IFF_UP))) 3222 - return -ENETDOWN; 3223 - 3224 - len = fwd->mtu + fwd->hard_header_len + VLAN_HLEN; 3225 - if (skb->len > len) 3226 - return -EMSGSIZE; 3227 - 3228 - return 0; 3229 - } 3230 - 3231 3217 static int xdp_do_generic_redirect_map(struct net_device *dev, 3232 3218 struct sk_buff *skb, 3233 3219 struct xdp_buff *xdp, ··· 3242 3256 } 3243 3257 3244 3258 if (map->map_type == BPF_MAP_TYPE_DEVMAP) { 3245 - if (unlikely((err = __xdp_generic_ok_fwd_dev(skb, fwd)))) 3259 + struct bpf_dtab_netdev *dst = fwd; 3260 + 3261 + err = dev_map_generic_redirect(dst, skb, xdp_prog); 3262 + if (unlikely(err)) 3246 3263 goto err; 3247 - skb->dev = fwd; 3248 - generic_xdp_tx(skb, xdp_prog); 3249 3264 } else if (map->map_type == BPF_MAP_TYPE_XSKMAP) { 3250 3265 struct xdp_sock *xs = fwd; 3251 3266
+3
net/xdp/xsk.c
··· 118 118 u64 addr; 119 119 int err; 120 120 121 + if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index) 122 + return -EINVAL; 123 + 121 124 if (!xskq_peek_addr(xs->umem->fq, &addr) || 122 125 len > xs->umem->chunk_size_nohr) { 123 126 xs->rx_dropped++;
+3 -2
tools/bpf/bpftool/perf.c
··· 29 29 if (perf_query_supported) 30 30 goto out; 31 31 32 - fd = open(bin_name, O_RDONLY); 32 + fd = open("/", O_RDONLY); 33 33 if (fd < 0) { 34 - p_err("perf_query_support: %s", strerror(errno)); 34 + p_err("perf_query_support: cannot open directory \"/\" (%s)", 35 + strerror(errno)); 35 36 goto out; 36 37 } 37 38
+3 -1
tools/bpf/bpftool/prog.c
··· 90 90 } 91 91 92 92 wallclock_secs = (real_time_ts.tv_sec - boot_time_ts.tv_sec) + 93 - nsecs / 1000000000; 93 + (real_time_ts.tv_nsec - boot_time_ts.tv_nsec + nsecs) / 94 + 1000000000; 95 + 94 96 95 97 if (!localtime_r(&wallclock_secs, &load_tm)) { 96 98 snprintf(buf, size, "%llu", nsecs / 1000000000);
+10
tools/testing/selftests/bpf/config
··· 7 7 CONFIG_NETDEVSIM=m 8 8 CONFIG_NET_CLS_ACT=y 9 9 CONFIG_NET_SCH_INGRESS=y 10 + CONFIG_NET_IPIP=y 11 + CONFIG_IPV6=y 12 + CONFIG_NET_IPGRE_DEMUX=y 13 + CONFIG_NET_IPGRE=y 14 + CONFIG_IPV6_GRE=y 15 + CONFIG_CRYPTO_USER_API_HASH=m 16 + CONFIG_CRYPTO_HMAC=m 17 + CONFIG_CRYPTO_SHA256=m 18 + CONFIG_VXLAN=y 19 + CONFIG_GENEVE=y
+10 -2
tools/testing/selftests/bpf/test_offload.py
··· 163 163 164 164 def bpftool_prog_list(expected=None, ns=""): 165 165 _, progs = bpftool("prog show", JSON=True, ns=ns, fail=True) 166 + # Remove the base progs 167 + for p in base_progs: 168 + if p in progs: 169 + progs.remove(p) 166 170 if expected is not None: 167 171 if len(progs) != expected: 168 172 fail(True, "%d BPF programs loaded, expected %d" % ··· 175 171 176 172 def bpftool_map_list(expected=None, ns=""): 177 173 _, maps = bpftool("map show", JSON=True, ns=ns, fail=True) 174 + # Remove the base maps 175 + for m in base_maps: 176 + if m in maps: 177 + maps.remove(m) 178 178 if expected is not None: 179 179 if len(maps) != expected: 180 180 fail(True, "%d BPF maps loaded, expected %d" % ··· 593 585 # Check tools 594 586 ret, progs = bpftool("prog", fail=False) 595 587 skip(ret != 0, "bpftool not installed") 596 - # Check no BPF programs are loaded 597 - skip(len(progs) != 0, "BPF programs already loaded on the system") 588 + base_progs = progs 589 + _, base_maps = bpftool("map") 598 590 599 591 # Check netdevsim 600 592 ret, out = cmd("modprobe netdevsim", fail=False)
+14 -12
tools/testing/selftests/bpf/test_tunnel.sh
··· 608 608 test_xfrm_tunnel() 609 609 { 610 610 config_device 611 - #tcpdump -nei veth1 ip & 612 - output=$(mktemp) 613 - cat /sys/kernel/debug/tracing/trace_pipe | tee $output & 614 - setup_xfrm_tunnel 611 + > /sys/kernel/debug/tracing/trace 612 + setup_xfrm_tunnel 615 613 tc qdisc add dev veth1 clsact 616 614 tc filter add dev veth1 proto ip ingress bpf da obj test_tunnel_kern.o \ 617 615 sec xfrm_get_state 618 616 ip netns exec at_ns0 ping $PING_ARG 10.1.1.200 619 617 sleep 1 620 - grep "reqid 1" $output 618 + grep "reqid 1" /sys/kernel/debug/tracing/trace 621 619 check_err $? 622 - grep "spi 0x1" $output 620 + grep "spi 0x1" /sys/kernel/debug/tracing/trace 623 621 check_err $? 624 - grep "remote ip 0xac100164" $output 622 + grep "remote ip 0xac100164" /sys/kernel/debug/tracing/trace 625 623 check_err $? 626 624 cleanup 627 625 628 626 if [ $ret -ne 0 ]; then 629 - echo -e ${RED}"FAIL: xfrm tunnel"${NC} 630 - return 1 631 - fi 632 - echo -e ${GREEN}"PASS: xfrm tunnel"${NC} 627 + echo -e ${RED}"FAIL: xfrm tunnel"${NC} 628 + return 1 629 + fi 630 + echo -e ${GREEN}"PASS: xfrm tunnel"${NC} 633 631 } 634 632 635 633 attach_bpf() ··· 655 657 ip link del ip6geneve11 2> /dev/null 656 658 ip link del erspan11 2> /dev/null 657 659 ip link del ip6erspan11 2> /dev/null 660 + ip xfrm policy delete dir out src 10.1.1.200/32 dst 10.1.1.100/32 2> /dev/null 661 + ip xfrm policy delete dir in src 10.1.1.100/32 dst 10.1.1.200/32 2> /dev/null 662 + ip xfrm state delete src 172.16.1.100 dst 172.16.1.200 proto esp spi 0x1 2> /dev/null 663 + ip xfrm state delete src 172.16.1.200 dst 172.16.1.100 proto esp spi 0x2 2> /dev/null 658 664 } 659 665 660 666 cleanup_exit() ··· 670 668 671 669 check() 672 670 { 673 - ip link help $1 2>&1 | grep -q "^Usage:" 671 + ip link help 2>&1 | grep -q "\s$1\s" 674 672 if [ $? -ne 0 ];then 675 673 echo "SKIP $1: iproute2 not support" 676 674 cleanup