Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf

Daniel Borkmann says:

====================
pull-request: bpf 2018-06-16

The following pull-request contains BPF updates for your *net* tree.

The main changes are:

1) Fix a panic in devmap handling in generic XDP where return type
of __devmap_lookup_elem() got changed recently but generic XDP
code missed the related update, from Toshiaki.

2) Fix a freeze when BPF progs are loaded that include BPF to BPF
calls when JIT is enabled where we would later bail out via error
path w/o dropping kallsyms, and another one to silence syzkaller
splats from locking prog read-only, from Daniel.

3) Fix a bug in test_offloads.py BPF selftest which must not assume
that the underlying system has no BPF progs loaded prior to the test,
and one in bpftool to fix accuracy of program load time, from Jakub.

4) Fix a bug in bpftool's probe for availability of the bpf(2)
BPF_TASK_FD_QUERY subcommand, from Yonghong.

5) Fix a regression in AF_XDP's XDP_SKB receive path where queue
id check got erroneously removed, from Björn.

6) Fix missing state cleanup in BPF's xfrm tunnel test, from William.

7) Check tunnel type more accurately in BPF's tunnel collect metadata
kselftest, from Jian.

8) Fix missing Kconfig fragments for BPF kselftests, from Anders.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>

+194 -71
+12
include/linux/bpf.h
··· 488 488 489 489 /* Map specifics */ 490 490 struct xdp_buff; 491 + struct sk_buff; 491 492 492 493 struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key); 493 494 void __dev_map_insert_ctx(struct bpf_map *map, u32 index); 494 495 void __dev_map_flush(struct bpf_map *map); 495 496 int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, 496 497 struct net_device *dev_rx); 498 + int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb, 499 + struct bpf_prog *xdp_prog); 497 500 498 501 struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key); 499 502 void __cpu_map_insert_ctx(struct bpf_map *map, u32 index); ··· 585 582 static inline 586 583 int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, 587 584 struct net_device *dev_rx) 585 + { 586 + return 0; 587 + } 588 + 589 + struct sk_buff; 590 + 591 + static inline int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, 592 + struct sk_buff *skb, 593 + struct bpf_prog *xdp_prog) 588 594 { 589 595 return 0; 590 596 }
+57 -22
include/linux/filter.h
··· 19 19 #include <linux/cryptohash.h> 20 20 #include <linux/set_memory.h> 21 21 #include <linux/kallsyms.h> 22 + #include <linux/if_vlan.h> 22 23 23 24 #include <net/sch_generic.h> 24 25 ··· 470 469 }; 471 470 472 471 struct bpf_binary_header { 473 - unsigned int pages; 472 + u16 pages; 473 + u16 locked:1; 474 474 u8 image[]; 475 475 }; 476 476 ··· 673 671 674 672 #define bpf_classic_proglen(fprog) (fprog->len * sizeof(fprog->filter[0])) 675 673 676 - #ifdef CONFIG_ARCH_HAS_SET_MEMORY 677 674 static inline void bpf_prog_lock_ro(struct bpf_prog *fp) 678 675 { 676 + #ifdef CONFIG_ARCH_HAS_SET_MEMORY 679 677 fp->locked = 1; 680 - WARN_ON_ONCE(set_memory_ro((unsigned long)fp, fp->pages)); 678 + if (set_memory_ro((unsigned long)fp, fp->pages)) 679 + fp->locked = 0; 680 + #endif 681 681 } 682 682 683 683 static inline void bpf_prog_unlock_ro(struct bpf_prog *fp) 684 684 { 685 + #ifdef CONFIG_ARCH_HAS_SET_MEMORY 685 686 if (fp->locked) { 686 687 WARN_ON_ONCE(set_memory_rw((unsigned long)fp, fp->pages)); 687 688 /* In case set_memory_rw() fails, we want to be the first ··· 692 687 */ 693 688 fp->locked = 0; 694 689 } 690 + #endif 695 691 } 696 692 697 693 static inline void bpf_jit_binary_lock_ro(struct bpf_binary_header *hdr) 698 694 { 699 - WARN_ON_ONCE(set_memory_ro((unsigned long)hdr, hdr->pages)); 695 + #ifdef CONFIG_ARCH_HAS_SET_MEMORY 696 + hdr->locked = 1; 697 + if (set_memory_ro((unsigned long)hdr, hdr->pages)) 698 + hdr->locked = 0; 699 + #endif 700 700 } 701 701 702 702 static inline void bpf_jit_binary_unlock_ro(struct bpf_binary_header *hdr) 703 703 { 704 - WARN_ON_ONCE(set_memory_rw((unsigned long)hdr, hdr->pages)); 704 + #ifdef CONFIG_ARCH_HAS_SET_MEMORY 705 + if (hdr->locked) { 706 + WARN_ON_ONCE(set_memory_rw((unsigned long)hdr, hdr->pages)); 707 + /* In case set_memory_rw() fails, we want to be the first 708 + * to crash here instead of some random place later on. 
709 + */ 710 + hdr->locked = 0; 711 + } 712 + #endif 705 713 } 706 - #else 707 - static inline void bpf_prog_lock_ro(struct bpf_prog *fp) 708 - { 709 - } 710 - 711 - static inline void bpf_prog_unlock_ro(struct bpf_prog *fp) 712 - { 713 - } 714 - 715 - static inline void bpf_jit_binary_lock_ro(struct bpf_binary_header *hdr) 716 - { 717 - } 718 - 719 - static inline void bpf_jit_binary_unlock_ro(struct bpf_binary_header *hdr) 720 - { 721 - } 722 - #endif /* CONFIG_ARCH_HAS_SET_MEMORY */ 723 714 724 715 static inline struct bpf_binary_header * 725 716 bpf_jit_binary_hdr(const struct bpf_prog *fp) ··· 725 724 726 725 return (void *)addr; 727 726 } 727 + 728 + #ifdef CONFIG_ARCH_HAS_SET_MEMORY 729 + static inline int bpf_prog_check_pages_ro_single(const struct bpf_prog *fp) 730 + { 731 + if (!fp->locked) 732 + return -ENOLCK; 733 + if (fp->jited) { 734 + const struct bpf_binary_header *hdr = bpf_jit_binary_hdr(fp); 735 + 736 + if (!hdr->locked) 737 + return -ENOLCK; 738 + } 739 + 740 + return 0; 741 + } 742 + #endif 728 743 729 744 int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap); 730 745 static inline int sk_filter(struct sock *sk, struct sk_buff *skb) ··· 802 785 803 786 struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, 804 787 const struct bpf_insn *patch, u32 len); 788 + 789 + static inline int __xdp_generic_ok_fwd_dev(struct sk_buff *skb, 790 + struct net_device *fwd) 791 + { 792 + unsigned int len; 793 + 794 + if (unlikely(!(fwd->flags & IFF_UP))) 795 + return -ENETDOWN; 796 + 797 + len = fwd->mtu + fwd->hard_header_len + VLAN_HLEN; 798 + if (skb->len > len) 799 + return -EMSGSIZE; 800 + 801 + return 0; 802 + } 805 803 806 804 /* The pair of xdp_do_redirect and xdp_do_flush_map MUST be called in the 807 805 * same cpu context. 
Further for best results no more than a single map ··· 992 960 { 993 961 } 994 962 #endif /* CONFIG_BPF_JIT */ 963 + 964 + void bpf_prog_kallsyms_del_subprogs(struct bpf_prog *fp); 965 + void bpf_prog_kallsyms_del_all(struct bpf_prog *fp); 995 966 996 967 #define BPF_ANC BIT(15) 997 968
+61 -6
kernel/bpf/core.c
··· 350 350 return prog_adj; 351 351 } 352 352 353 + void bpf_prog_kallsyms_del_subprogs(struct bpf_prog *fp) 354 + { 355 + int i; 356 + 357 + for (i = 0; i < fp->aux->func_cnt; i++) 358 + bpf_prog_kallsyms_del(fp->aux->func[i]); 359 + } 360 + 361 + void bpf_prog_kallsyms_del_all(struct bpf_prog *fp) 362 + { 363 + bpf_prog_kallsyms_del_subprogs(fp); 364 + bpf_prog_kallsyms_del(fp); 365 + } 366 + 353 367 #ifdef CONFIG_BPF_JIT 354 368 /* All BPF JIT sysctl knobs here. */ 355 369 int bpf_jit_enable __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_ALWAYS_ON); ··· 598 584 bpf_fill_ill_insns(hdr, size); 599 585 600 586 hdr->pages = size / PAGE_SIZE; 587 + hdr->locked = 0; 588 + 601 589 hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)), 602 590 PAGE_SIZE - sizeof(*hdr)); 603 591 start = (get_random_int() % hole) & ~(alignment - 1); ··· 1450 1434 return 0; 1451 1435 } 1452 1436 1437 + static int bpf_prog_check_pages_ro_locked(const struct bpf_prog *fp) 1438 + { 1439 + #ifdef CONFIG_ARCH_HAS_SET_MEMORY 1440 + int i, err; 1441 + 1442 + for (i = 0; i < fp->aux->func_cnt; i++) { 1443 + err = bpf_prog_check_pages_ro_single(fp->aux->func[i]); 1444 + if (err) 1445 + return err; 1446 + } 1447 + 1448 + return bpf_prog_check_pages_ro_single(fp); 1449 + #endif 1450 + return 0; 1451 + } 1452 + 1453 + static void bpf_prog_select_func(struct bpf_prog *fp) 1454 + { 1455 + #ifndef CONFIG_BPF_JIT_ALWAYS_ON 1456 + u32 stack_depth = max_t(u32, fp->aux->stack_depth, 1); 1457 + 1458 + fp->bpf_func = interpreters[(round_up(stack_depth, 32) / 32) - 1]; 1459 + #else 1460 + fp->bpf_func = __bpf_prog_ret0_warn; 1461 + #endif 1462 + } 1463 + 1453 1464 /** 1454 1465 * bpf_prog_select_runtime - select exec runtime for BPF program 1455 1466 * @fp: bpf_prog populated with internal BPF program ··· 1487 1444 */ 1488 1445 struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) 1489 1446 { 1490 - #ifndef CONFIG_BPF_JIT_ALWAYS_ON 1491 - u32 stack_depth = max_t(u32, fp->aux->stack_depth, 1); 
1447 + /* In case of BPF to BPF calls, verifier did all the prep 1448 + * work with regards to JITing, etc. 1449 + */ 1450 + if (fp->bpf_func) 1451 + goto finalize; 1492 1452 1493 - fp->bpf_func = interpreters[(round_up(stack_depth, 32) / 32) - 1]; 1494 - #else 1495 - fp->bpf_func = __bpf_prog_ret0_warn; 1496 - #endif 1453 + bpf_prog_select_func(fp); 1497 1454 1498 1455 /* eBPF JITs can rewrite the program in case constant 1499 1456 * blinding is active. However, in case of error during ··· 1514 1471 if (*err) 1515 1472 return fp; 1516 1473 } 1474 + 1475 + finalize: 1517 1476 bpf_prog_lock_ro(fp); 1518 1477 1519 1478 /* The tail call compatibility check can only be done at ··· 1524 1479 * all eBPF JITs might immediately support all features. 1525 1480 */ 1526 1481 *err = bpf_check_tail_call(fp); 1482 + if (*err) 1483 + return fp; 1527 1484 1485 + /* Checkpoint: at this point onwards any cBPF -> eBPF or 1486 + * native eBPF program is read-only. If we failed to change 1487 + * the page attributes (e.g. allocation failure from 1488 + * splitting large pages), then reject the whole program 1489 + * in order to guarantee not ending up with any W+X pages 1490 + * from BPF side in kernel. 1491 + */ 1492 + *err = bpf_prog_check_pages_ro_locked(fp); 1528 1493 return fp; 1529 1494 } 1530 1495 EXPORT_SYMBOL_GPL(bpf_prog_select_runtime);
+14
kernel/bpf/devmap.c
··· 345 345 return bq_enqueue(dst, xdpf, dev_rx); 346 346 } 347 347 348 + int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb, 349 + struct bpf_prog *xdp_prog) 350 + { 351 + int err; 352 + 353 + err = __xdp_generic_ok_fwd_dev(skb, dst->dev); 354 + if (unlikely(err)) 355 + return err; 356 + skb->dev = dst->dev; 357 + generic_xdp_tx(skb, xdp_prog); 358 + 359 + return 0; 360 + } 361 + 348 362 static void *dev_map_lookup_elem(struct bpf_map *map, void *key) 349 363 { 350 364 struct bpf_dtab_netdev *obj = __dev_map_lookup_elem(map, *(u32 *)key);
+3 -9
kernel/bpf/syscall.c
··· 1034 1034 static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock) 1035 1035 { 1036 1036 if (atomic_dec_and_test(&prog->aux->refcnt)) { 1037 - int i; 1038 - 1039 1037 /* bpf_prog_free_id() must be called first */ 1040 1038 bpf_prog_free_id(prog, do_idr_lock); 1041 - 1042 - for (i = 0; i < prog->aux->func_cnt; i++) 1043 - bpf_prog_kallsyms_del(prog->aux->func[i]); 1044 - bpf_prog_kallsyms_del(prog); 1039 + bpf_prog_kallsyms_del_all(prog); 1045 1040 1046 1041 call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu); 1047 1042 } ··· 1353 1358 if (err < 0) 1354 1359 goto free_used_maps; 1355 1360 1356 - /* eBPF program is ready to be JITed */ 1357 - if (!prog->bpf_func) 1358 - prog = bpf_prog_select_runtime(prog, &err); 1361 + prog = bpf_prog_select_runtime(prog, &err); 1359 1362 if (err < 0) 1360 1363 goto free_used_maps; 1361 1364 ··· 1377 1384 return err; 1378 1385 1379 1386 free_used_maps: 1387 + bpf_prog_kallsyms_del_subprogs(prog); 1380 1388 free_used_maps(prog->aux); 1381 1389 free_prog: 1382 1390 bpf_prog_uncharge_memlock(prog);
+4 -17
net/core/filter.c
··· 3214 3214 } 3215 3215 EXPORT_SYMBOL_GPL(xdp_do_redirect); 3216 3216 3217 - static int __xdp_generic_ok_fwd_dev(struct sk_buff *skb, struct net_device *fwd) 3218 - { 3219 - unsigned int len; 3220 - 3221 - if (unlikely(!(fwd->flags & IFF_UP))) 3222 - return -ENETDOWN; 3223 - 3224 - len = fwd->mtu + fwd->hard_header_len + VLAN_HLEN; 3225 - if (skb->len > len) 3226 - return -EMSGSIZE; 3227 - 3228 - return 0; 3229 - } 3230 - 3231 3217 static int xdp_do_generic_redirect_map(struct net_device *dev, 3232 3218 struct sk_buff *skb, 3233 3219 struct xdp_buff *xdp, ··· 3242 3256 } 3243 3257 3244 3258 if (map->map_type == BPF_MAP_TYPE_DEVMAP) { 3245 - if (unlikely((err = __xdp_generic_ok_fwd_dev(skb, fwd)))) 3259 + struct bpf_dtab_netdev *dst = fwd; 3260 + 3261 + err = dev_map_generic_redirect(dst, skb, xdp_prog); 3262 + if (unlikely(err)) 3246 3263 goto err; 3247 - skb->dev = fwd; 3248 - generic_xdp_tx(skb, xdp_prog); 3249 3264 } else if (map->map_type == BPF_MAP_TYPE_XSKMAP) { 3250 3265 struct xdp_sock *xs = fwd; 3251 3266
+3
net/xdp/xsk.c
··· 118 118 u64 addr; 119 119 int err; 120 120 121 + if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index) 122 + return -EINVAL; 123 + 121 124 if (!xskq_peek_addr(xs->umem->fq, &addr) || 122 125 len > xs->umem->chunk_size_nohr) { 123 126 xs->rx_dropped++;
+3 -2
tools/bpf/bpftool/perf.c
··· 29 29 if (perf_query_supported) 30 30 goto out; 31 31 32 - fd = open(bin_name, O_RDONLY); 32 + fd = open("/", O_RDONLY); 33 33 if (fd < 0) { 34 - p_err("perf_query_support: %s", strerror(errno)); 34 + p_err("perf_query_support: cannot open directory \"/\" (%s)", 35 + strerror(errno)); 35 36 goto out; 36 37 } 37 38
+3 -1
tools/bpf/bpftool/prog.c
··· 90 90 } 91 91 92 92 wallclock_secs = (real_time_ts.tv_sec - boot_time_ts.tv_sec) + 93 - nsecs / 1000000000; 93 + (real_time_ts.tv_nsec - boot_time_ts.tv_nsec + nsecs) / 94 + 1000000000; 95 + 94 96 95 97 if (!localtime_r(&wallclock_secs, &load_tm)) { 96 98 snprintf(buf, size, "%llu", nsecs / 1000000000);
+10
tools/testing/selftests/bpf/config
··· 7 7 CONFIG_NETDEVSIM=m 8 8 CONFIG_NET_CLS_ACT=y 9 9 CONFIG_NET_SCH_INGRESS=y 10 + CONFIG_NET_IPIP=y 11 + CONFIG_IPV6=y 12 + CONFIG_NET_IPGRE_DEMUX=y 13 + CONFIG_NET_IPGRE=y 14 + CONFIG_IPV6_GRE=y 15 + CONFIG_CRYPTO_USER_API_HASH=m 16 + CONFIG_CRYPTO_HMAC=m 17 + CONFIG_CRYPTO_SHA256=m 18 + CONFIG_VXLAN=y 19 + CONFIG_GENEVE=y
+10 -2
tools/testing/selftests/bpf/test_offload.py
··· 163 163 164 164 def bpftool_prog_list(expected=None, ns=""): 165 165 _, progs = bpftool("prog show", JSON=True, ns=ns, fail=True) 166 + # Remove the base progs 167 + for p in base_progs: 168 + if p in progs: 169 + progs.remove(p) 166 170 if expected is not None: 167 171 if len(progs) != expected: 168 172 fail(True, "%d BPF programs loaded, expected %d" % ··· 175 171 176 172 def bpftool_map_list(expected=None, ns=""): 177 173 _, maps = bpftool("map show", JSON=True, ns=ns, fail=True) 174 + # Remove the base maps 175 + for m in base_maps: 176 + if m in maps: 177 + maps.remove(m) 178 178 if expected is not None: 179 179 if len(maps) != expected: 180 180 fail(True, "%d BPF maps loaded, expected %d" % ··· 593 585 # Check tools 594 586 ret, progs = bpftool("prog", fail=False) 595 587 skip(ret != 0, "bpftool not installed") 596 - # Check no BPF programs are loaded 597 - skip(len(progs) != 0, "BPF programs already loaded on the system") 588 + base_progs = progs 589 + _, base_maps = bpftool("map") 598 590 599 591 # Check netdevsim 600 592 ret, out = cmd("modprobe netdevsim", fail=False)
+14 -12
tools/testing/selftests/bpf/test_tunnel.sh
··· 608 608 test_xfrm_tunnel() 609 609 { 610 610 config_device 611 - #tcpdump -nei veth1 ip & 612 - output=$(mktemp) 613 - cat /sys/kernel/debug/tracing/trace_pipe | tee $output & 614 - setup_xfrm_tunnel 611 + > /sys/kernel/debug/tracing/trace 612 + setup_xfrm_tunnel 615 613 tc qdisc add dev veth1 clsact 616 614 tc filter add dev veth1 proto ip ingress bpf da obj test_tunnel_kern.o \ 617 615 sec xfrm_get_state 618 616 ip netns exec at_ns0 ping $PING_ARG 10.1.1.200 619 617 sleep 1 620 - grep "reqid 1" $output 618 + grep "reqid 1" /sys/kernel/debug/tracing/trace 621 619 check_err $? 622 - grep "spi 0x1" $output 620 + grep "spi 0x1" /sys/kernel/debug/tracing/trace 623 621 check_err $? 624 - grep "remote ip 0xac100164" $output 622 + grep "remote ip 0xac100164" /sys/kernel/debug/tracing/trace 625 623 check_err $? 626 624 cleanup 627 625 628 626 if [ $ret -ne 0 ]; then 629 - echo -e ${RED}"FAIL: xfrm tunnel"${NC} 630 - return 1 631 - fi 632 - echo -e ${GREEN}"PASS: xfrm tunnel"${NC} 627 + echo -e ${RED}"FAIL: xfrm tunnel"${NC} 628 + return 1 629 + fi 630 + echo -e ${GREEN}"PASS: xfrm tunnel"${NC} 633 631 } 634 632 635 633 attach_bpf() ··· 655 657 ip link del ip6geneve11 2> /dev/null 656 658 ip link del erspan11 2> /dev/null 657 659 ip link del ip6erspan11 2> /dev/null 660 + ip xfrm policy delete dir out src 10.1.1.200/32 dst 10.1.1.100/32 2> /dev/null 661 + ip xfrm policy delete dir in src 10.1.1.100/32 dst 10.1.1.200/32 2> /dev/null 662 + ip xfrm state delete src 172.16.1.100 dst 172.16.1.200 proto esp spi 0x1 2> /dev/null 663 + ip xfrm state delete src 172.16.1.200 dst 172.16.1.100 proto esp spi 0x2 2> /dev/null 658 664 } 659 665 660 666 cleanup_exit() ··· 670 668 671 669 check() 672 670 { 673 - ip link help $1 2>&1 | grep -q "^Usage:" 671 + ip link help 2>&1 | grep -q "\s$1\s" 674 672 if [ $? -ne 0 ];then 675 673 echo "SKIP $1: iproute2 not support" 676 674 cleanup