Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf

Daniel Borkmann says:

====================
pull-request: bpf 2018-10-27

The following pull-request contains BPF updates for your *net* tree.

The main changes are:

1) Fix TOCTOU (time-of-check-to-time-of-use) race in BTF header validation, from Martin and Wenwen.

2) Fix devmap interface comparison in the notifier call, which was
neglecting netns, from Taehee.

3) Several fixes in various places, for example, correcting direct
packet access and helper function availability, from Daniel.

4) Fix BPF kselftest config fragment to include af_xdp and sockmap,
from Naresh.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>

+134 -52
+8
Documentation/sysctl/net.txt
··· 92 92 0 - disable JIT kallsyms export (default value) 93 93 1 - enable JIT kallsyms export for privileged users only 94 94 95 + bpf_jit_limit 96 + ------------- 97 + 98 + This enforces a global limit for memory allocations to the BPF JIT 99 + compiler in order to reject unprivileged JIT requests once it has 100 + been surpassed. bpf_jit_limit contains the value of the global limit 101 + in bytes. 102 + 95 103 dev_weight 96 104 -------------- 97 105
+1
include/linux/filter.h
··· 854 854 extern int bpf_jit_enable; 855 855 extern int bpf_jit_harden; 856 856 extern int bpf_jit_kallsyms; 857 + extern int bpf_jit_limit; 857 858 858 859 typedef void (*bpf_jit_fill_hole_t)(void *area, unsigned int size); 859 860
+25 -33
kernel/bpf/btf.c
··· 2067 2067 return 0; 2068 2068 } 2069 2069 2070 - static int btf_parse_hdr(struct btf_verifier_env *env, void __user *btf_data, 2071 - u32 btf_data_size) 2070 + static int btf_parse_hdr(struct btf_verifier_env *env) 2072 2071 { 2072 + u32 hdr_len, hdr_copy, btf_data_size; 2073 2073 const struct btf_header *hdr; 2074 - u32 hdr_len, hdr_copy; 2075 - /* 2076 - * Minimal part of the "struct btf_header" that 2077 - * contains the hdr_len. 2078 - */ 2079 - struct btf_min_header { 2080 - u16 magic; 2081 - u8 version; 2082 - u8 flags; 2083 - u32 hdr_len; 2084 - } __user *min_hdr; 2085 2074 struct btf *btf; 2086 2075 int err; 2087 2076 2088 2077 btf = env->btf; 2089 - min_hdr = btf_data; 2078 + btf_data_size = btf->data_size; 2090 2079 2091 - if (btf_data_size < sizeof(*min_hdr)) { 2080 + if (btf_data_size < 2081 + offsetof(struct btf_header, hdr_len) + sizeof(hdr->hdr_len)) { 2092 2082 btf_verifier_log(env, "hdr_len not found"); 2093 2083 return -EINVAL; 2094 2084 } 2095 2085 2096 - if (get_user(hdr_len, &min_hdr->hdr_len)) 2097 - return -EFAULT; 2098 - 2086 + hdr = btf->data; 2087 + hdr_len = hdr->hdr_len; 2099 2088 if (btf_data_size < hdr_len) { 2100 2089 btf_verifier_log(env, "btf_header not found"); 2101 2090 return -EINVAL; 2102 2091 } 2103 2092 2104 - err = bpf_check_uarg_tail_zero(btf_data, sizeof(btf->hdr), hdr_len); 2105 - if (err) { 2106 - if (err == -E2BIG) 2107 - btf_verifier_log(env, "Unsupported btf_header"); 2108 - return err; 2093 + /* Ensure the unsupported header fields are zero */ 2094 + if (hdr_len > sizeof(btf->hdr)) { 2095 + u8 *expected_zero = btf->data + sizeof(btf->hdr); 2096 + u8 *end = btf->data + hdr_len; 2097 + 2098 + for (; expected_zero < end; expected_zero++) { 2099 + if (*expected_zero) { 2100 + btf_verifier_log(env, "Unsupported btf_header"); 2101 + return -E2BIG; 2102 + } 2103 + } 2109 2104 } 2110 2105 2111 2106 hdr_copy = min_t(u32, hdr_len, sizeof(btf->hdr)); 2112 - if (copy_from_user(&btf->hdr, btf_data, hdr_copy)) 2113 - return 
-EFAULT; 2107 + memcpy(&btf->hdr, btf->data, hdr_copy); 2114 2108 2115 2109 hdr = &btf->hdr; 2116 - 2117 - if (hdr->hdr_len != hdr_len) 2118 - return -EINVAL; 2119 2110 2120 2111 btf_verifier_log_hdr(env, btf_data_size); 2121 2112 ··· 2177 2186 } 2178 2187 env->btf = btf; 2179 2188 2180 - err = btf_parse_hdr(env, btf_data, btf_data_size); 2181 - if (err) 2182 - goto errout; 2183 - 2184 2189 data = kvmalloc(btf_data_size, GFP_KERNEL | __GFP_NOWARN); 2185 2190 if (!data) { 2186 2191 err = -ENOMEM; ··· 2185 2198 2186 2199 btf->data = data; 2187 2200 btf->data_size = btf_data_size; 2188 - btf->nohdr_data = btf->data + btf->hdr.hdr_len; 2189 2201 2190 2202 if (copy_from_user(data, btf_data, btf_data_size)) { 2191 2203 err = -EFAULT; 2192 2204 goto errout; 2193 2205 } 2206 + 2207 + err = btf_parse_hdr(env); 2208 + if (err) 2209 + goto errout; 2210 + 2211 + btf->nohdr_data = btf->data + btf->hdr.hdr_len; 2194 2212 2195 2213 err = btf_parse_str_sec(env); 2196 2214 if (err)
+47 -4
kernel/bpf/core.c
··· 365 365 } 366 366 367 367 #ifdef CONFIG_BPF_JIT 368 + # define BPF_JIT_LIMIT_DEFAULT (PAGE_SIZE * 40000) 369 + 368 370 /* All BPF JIT sysctl knobs here. */ 369 371 int bpf_jit_enable __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_ALWAYS_ON); 370 372 int bpf_jit_harden __read_mostly; 371 373 int bpf_jit_kallsyms __read_mostly; 374 + int bpf_jit_limit __read_mostly = BPF_JIT_LIMIT_DEFAULT; 372 375 373 376 static __always_inline void 374 377 bpf_get_prog_addr_region(const struct bpf_prog *prog, ··· 580 577 return ret; 581 578 } 582 579 580 + static atomic_long_t bpf_jit_current; 581 + 582 + #if defined(MODULES_VADDR) 583 + static int __init bpf_jit_charge_init(void) 584 + { 585 + /* Only used as heuristic here to derive limit. */ 586 + bpf_jit_limit = min_t(u64, round_up((MODULES_END - MODULES_VADDR) >> 2, 587 + PAGE_SIZE), INT_MAX); 588 + return 0; 589 + } 590 + pure_initcall(bpf_jit_charge_init); 591 + #endif 592 + 593 + static int bpf_jit_charge_modmem(u32 pages) 594 + { 595 + if (atomic_long_add_return(pages, &bpf_jit_current) > 596 + (bpf_jit_limit >> PAGE_SHIFT)) { 597 + if (!capable(CAP_SYS_ADMIN)) { 598 + atomic_long_sub(pages, &bpf_jit_current); 599 + return -EPERM; 600 + } 601 + } 602 + 603 + return 0; 604 + } 605 + 606 + static void bpf_jit_uncharge_modmem(u32 pages) 607 + { 608 + atomic_long_sub(pages, &bpf_jit_current); 609 + } 610 + 583 611 struct bpf_binary_header * 584 612 bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, 585 613 unsigned int alignment, 586 614 bpf_jit_fill_hole_t bpf_fill_ill_insns) 587 615 { 588 616 struct bpf_binary_header *hdr; 589 - unsigned int size, hole, start; 617 + u32 size, hole, start, pages; 590 618 591 619 /* Most of BPF filters are really small, but if some of them 592 620 * fill a page, allow at least 128 extra bytes to insert a 593 621 * random section of illegal instructions. 
594 622 */ 595 623 size = round_up(proglen + sizeof(*hdr) + 128, PAGE_SIZE); 596 - hdr = module_alloc(size); 597 - if (hdr == NULL) 624 + pages = size / PAGE_SIZE; 625 + 626 + if (bpf_jit_charge_modmem(pages)) 598 627 return NULL; 628 + hdr = module_alloc(size); 629 + if (!hdr) { 630 + bpf_jit_uncharge_modmem(pages); 631 + return NULL; 632 + } 599 633 600 634 /* Fill space with illegal/arch-dep instructions. */ 601 635 bpf_fill_ill_insns(hdr, size); 602 636 603 - hdr->pages = size / PAGE_SIZE; 637 + hdr->pages = pages; 604 638 hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)), 605 639 PAGE_SIZE - sizeof(*hdr)); 606 640 start = (get_random_int() % hole) & ~(alignment - 1); ··· 650 610 651 611 void bpf_jit_binary_free(struct bpf_binary_header *hdr) 652 612 { 613 + u32 pages = hdr->pages; 614 + 653 615 module_memfree(hdr); 616 + bpf_jit_uncharge_modmem(pages); 654 617 } 655 618 656 619 /* This symbol is only overridden by archs that have different
+1 -2
kernel/bpf/devmap.c
··· 512 512 struct bpf_dtab_netdev *dev, *odev; 513 513 514 514 dev = READ_ONCE(dtab->netdev_map[i]); 515 - if (!dev || 516 - dev->dev->ifindex != netdev->ifindex) 515 + if (!dev || netdev != dev->dev) 517 516 continue; 518 517 odev = cmpxchg(&dtab->netdev_map[i], dev, NULL); 519 518 if (dev == odev)
-2
kernel/bpf/helpers.c
··· 99 99 const struct bpf_func_proto bpf_map_pop_elem_proto = { 100 100 .func = bpf_map_pop_elem, 101 101 .gpl_only = false, 102 - .pkt_access = true, 103 102 .ret_type = RET_INTEGER, 104 103 .arg1_type = ARG_CONST_MAP_PTR, 105 104 .arg2_type = ARG_PTR_TO_UNINIT_MAP_VALUE, ··· 112 113 const struct bpf_func_proto bpf_map_peek_elem_proto = { 113 114 .func = bpf_map_pop_elem, 114 115 .gpl_only = false, 115 - .pkt_access = true, 116 116 .ret_type = RET_INTEGER, 117 117 .arg1_type = ARG_CONST_MAP_PTR, 118 118 .arg2_type = ARG_PTR_TO_UNINIT_MAP_VALUE,
+2
kernel/bpf/queue_stack_maps.c
··· 122 122 raw_spin_lock_irqsave(&qs->lock, flags); 123 123 124 124 if (queue_stack_map_is_empty(qs)) { 125 + memset(value, 0, qs->map.value_size); 125 126 err = -ENOENT; 126 127 goto out; 127 128 } ··· 152 151 raw_spin_lock_irqsave(&qs->lock, flags); 153 152 154 153 if (queue_stack_map_is_empty(qs)) { 154 + memset(value, 0, qs->map.value_size); 155 155 err = -ENOENT; 156 156 goto out; 157 157 }
+10 -3
kernel/bpf/verifier.c
··· 1387 1387 enum bpf_access_type t) 1388 1388 { 1389 1389 switch (env->prog->type) { 1390 + /* Program types only with direct read access go here! */ 1390 1391 case BPF_PROG_TYPE_LWT_IN: 1391 1392 case BPF_PROG_TYPE_LWT_OUT: 1392 1393 case BPF_PROG_TYPE_LWT_SEG6LOCAL: 1393 1394 case BPF_PROG_TYPE_SK_REUSEPORT: 1394 - /* dst_input() and dst_output() can't write for now */ 1395 + case BPF_PROG_TYPE_FLOW_DISSECTOR: 1396 + case BPF_PROG_TYPE_CGROUP_SKB: 1395 1397 if (t == BPF_WRITE) 1396 1398 return false; 1397 1399 /* fallthrough */ 1400 + 1401 + /* Program types with direct read + write access go here! */ 1398 1402 case BPF_PROG_TYPE_SCHED_CLS: 1399 1403 case BPF_PROG_TYPE_SCHED_ACT: 1400 1404 case BPF_PROG_TYPE_XDP: 1401 1405 case BPF_PROG_TYPE_LWT_XMIT: 1402 1406 case BPF_PROG_TYPE_SK_SKB: 1403 1407 case BPF_PROG_TYPE_SK_MSG: 1404 - case BPF_PROG_TYPE_FLOW_DISSECTOR: 1405 1408 if (meta) 1406 1409 return meta->pkt_access; 1407 1410 ··· 5709 5706 bool is_narrower_load; 5710 5707 u32 target_size; 5711 5708 5712 - if (ops->gen_prologue) { 5709 + if (ops->gen_prologue || env->seen_direct_write) { 5710 + if (!ops->gen_prologue) { 5711 + verbose(env, "bpf verifier is misconfigured\n"); 5712 + return -EINVAL; 5713 + } 5713 5714 cnt = ops->gen_prologue(insn_buf, env->seen_direct_write, 5714 5715 env->prog); 5715 5716 if (cnt >= ARRAY_SIZE(insn_buf)) {
+17 -4
net/core/filter.c
··· 5264 5264 return &bpf_msg_pull_data_proto; 5265 5265 case BPF_FUNC_msg_push_data: 5266 5266 return &bpf_msg_push_data_proto; 5267 - case BPF_FUNC_get_local_storage: 5268 - return &bpf_get_local_storage_proto; 5269 5267 default: 5270 5268 return bpf_base_func_proto(func_id); 5271 5269 } ··· 5294 5296 return &bpf_sk_redirect_map_proto; 5295 5297 case BPF_FUNC_sk_redirect_hash: 5296 5298 return &bpf_sk_redirect_hash_proto; 5297 - case BPF_FUNC_get_local_storage: 5298 - return &bpf_get_local_storage_proto; 5299 5299 #ifdef CONFIG_INET 5300 5300 case BPF_FUNC_sk_lookup_tcp: 5301 5301 return &bpf_sk_lookup_tcp_proto; ··· 5492 5496 case bpf_ctx_range(struct __sk_buff, data_meta): 5493 5497 case bpf_ctx_range(struct __sk_buff, flow_keys): 5494 5498 return false; 5499 + case bpf_ctx_range(struct __sk_buff, data): 5500 + case bpf_ctx_range(struct __sk_buff, data_end): 5501 + if (!capable(CAP_SYS_ADMIN)) 5502 + return false; 5503 + break; 5495 5504 } 5505 + 5496 5506 if (type == BPF_WRITE) { 5497 5507 switch (off) { 5498 5508 case bpf_ctx_range(struct __sk_buff, mark): ··· 5638 5636 return false; 5639 5637 return __sock_filter_check_attach_type(off, type, 5640 5638 prog->expected_attach_type); 5639 + } 5640 + 5641 + static int bpf_noop_prologue(struct bpf_insn *insn_buf, bool direct_write, 5642 + const struct bpf_prog *prog) 5643 + { 5644 + /* Neither direct read nor direct write requires any preliminary 5645 + * action. 
5646 + */ 5647 + return 0; 5641 5648 } 5642 5649 5643 5650 static int bpf_unclone_prologue(struct bpf_insn *insn_buf, bool direct_write, ··· 7215 7204 .get_func_proto = xdp_func_proto, 7216 7205 .is_valid_access = xdp_is_valid_access, 7217 7206 .convert_ctx_access = xdp_convert_ctx_access, 7207 + .gen_prologue = bpf_noop_prologue, 7218 7208 }; 7219 7209 7220 7210 const struct bpf_prog_ops xdp_prog_ops = { ··· 7314 7302 .get_func_proto = sk_msg_func_proto, 7315 7303 .is_valid_access = sk_msg_is_valid_access, 7316 7304 .convert_ctx_access = sk_msg_convert_ctx_access, 7305 + .gen_prologue = bpf_noop_prologue, 7317 7306 }; 7318 7307 7319 7308 const struct bpf_prog_ops sk_msg_prog_ops = {
+8 -2
net/core/sysctl_net_core.c
··· 279 279 return ret; 280 280 } 281 281 282 - # ifdef CONFIG_HAVE_EBPF_JIT 283 282 static int 284 283 proc_dointvec_minmax_bpf_restricted(struct ctl_table *table, int write, 285 284 void __user *buffer, size_t *lenp, ··· 289 290 290 291 return proc_dointvec_minmax(table, write, buffer, lenp, ppos); 291 292 } 292 - # endif 293 293 #endif 294 294 295 295 static struct ctl_table net_core_table[] = { ··· 395 397 .extra2 = &one, 396 398 }, 397 399 # endif 400 + { 401 + .procname = "bpf_jit_limit", 402 + .data = &bpf_jit_limit, 403 + .maxlen = sizeof(int), 404 + .mode = 0600, 405 + .proc_handler = proc_dointvec_minmax_bpf_restricted, 406 + .extra1 = &one, 407 + }, 398 408 #endif 399 409 { 400 410 .procname = "netdev_tstamp_prequeue",
+2
tools/testing/selftests/bpf/config
··· 20 20 CONFIG_GENEVE=y 21 21 CONFIG_NET_CLS_FLOWER=m 22 22 CONFIG_LWTUNNEL=y 23 + CONFIG_BPF_STREAM_PARSER=y 24 + CONFIG_XDP_SOCKETS=y
+13 -2
tools/testing/selftests/bpf/test_verifier.c
··· 4891 4891 BPF_EXIT_INSN(), 4892 4892 }, 4893 4893 .result = ACCEPT, 4894 + .result_unpriv = REJECT, 4895 + .errstr_unpriv = "invalid bpf_context access off=76 size=4", 4894 4896 .prog_type = BPF_PROG_TYPE_CGROUP_SKB, 4895 4897 }, 4896 4898 { ··· 5148 5146 .fixup_cgroup_storage = { 1 }, 5149 5147 .result = REJECT, 5150 5148 .errstr = "get_local_storage() doesn't support non-zero flags", 5149 + .errstr_unpriv = "R2 leaks addr into helper function", 5151 5150 .prog_type = BPF_PROG_TYPE_CGROUP_SKB, 5152 5151 }, 5153 5152 { ··· 5264 5261 .fixup_percpu_cgroup_storage = { 1 }, 5265 5262 .result = REJECT, 5266 5263 .errstr = "get_local_storage() doesn't support non-zero flags", 5264 + .errstr_unpriv = "R2 leaks addr into helper function", 5267 5265 .prog_type = BPF_PROG_TYPE_CGROUP_SKB, 5268 5266 }, 5269 5267 { ··· 14054 14050 fclose(fd); 14055 14051 } 14056 14052 14053 + static bool test_as_unpriv(struct bpf_test *test) 14054 + { 14055 + return !test->prog_type || 14056 + test->prog_type == BPF_PROG_TYPE_SOCKET_FILTER || 14057 + test->prog_type == BPF_PROG_TYPE_CGROUP_SKB; 14058 + } 14059 + 14057 14060 static int do_test(bool unpriv, unsigned int from, unsigned int to) 14058 14061 { 14059 14062 int i, passes = 0, errors = 0, skips = 0; ··· 14071 14060 /* Program types that are not supported by non-root we 14072 14061 * skip right away. 14073 14062 */ 14074 - if (!test->prog_type && unpriv_disabled) { 14063 + if (test_as_unpriv(test) && unpriv_disabled) { 14075 14064 printf("#%d/u %s SKIP\n", i, test->descr); 14076 14065 skips++; 14077 - } else if (!test->prog_type) { 14066 + } else if (test_as_unpriv(test)) { 14078 14067 if (!unpriv) 14079 14068 set_admin(false); 14080 14069 printf("#%d/u %s ", i, test->descr);