Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf

Daniel Borkmann says:

====================
pull-request: bpf 2018-10-27

The following pull-request contains BPF updates for your *net* tree.

The main changes are:

1) Fix TOCTOU (time-of-check-to-time-of-use) race in BTF header validation, from Martin and Wenwen.

2) Fix devmap interface comparison in the notifier call, which was
neglecting netns, from Taehee.

3) Several fixes in various places, for example, correcting direct
packet access and helper function availability, from Daniel.

4) Fix BPF kselftest config fragment to include af_xdp and sockmap,
from Naresh.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>

+134 -52
+8
Documentation/sysctl/net.txt
··· 92 92 0 - disable JIT kallsyms export (default value) 93 93 1 - enable JIT kallsyms export for privileged users only 94 94 95 + bpf_jit_limit 96 + ------------- 97 + 98 + This enforces a global limit for memory allocations to the BPF JIT 99 + compiler in order to reject unprivileged JIT requests once it has 100 + been surpassed. bpf_jit_limit contains the value of the global limit 101 + in bytes. 102 + 95 103 dev_weight 96 104 -------------- 97 105
+1
include/linux/filter.h
··· 854 854 extern int bpf_jit_enable; 855 855 extern int bpf_jit_harden; 856 856 extern int bpf_jit_kallsyms; 857 + extern int bpf_jit_limit; 857 858 858 859 typedef void (*bpf_jit_fill_hole_t)(void *area, unsigned int size); 859 860
+25 -33
kernel/bpf/btf.c
··· 2067 2067 return 0; 2068 2068 } 2069 2069 2070 - static int btf_parse_hdr(struct btf_verifier_env *env, void __user *btf_data, 2071 - u32 btf_data_size) 2070 + static int btf_parse_hdr(struct btf_verifier_env *env) 2072 2071 { 2072 + u32 hdr_len, hdr_copy, btf_data_size; 2073 2073 const struct btf_header *hdr; 2074 - u32 hdr_len, hdr_copy; 2075 - /* 2076 - * Minimal part of the "struct btf_header" that 2077 - * contains the hdr_len. 2078 - */ 2079 - struct btf_min_header { 2080 - u16 magic; 2081 - u8 version; 2082 - u8 flags; 2083 - u32 hdr_len; 2084 - } __user *min_hdr; 2085 2074 struct btf *btf; 2086 2075 int err; 2087 2076 2088 2077 btf = env->btf; 2089 - min_hdr = btf_data; 2078 + btf_data_size = btf->data_size; 2090 2079 2091 - if (btf_data_size < sizeof(*min_hdr)) { 2080 + if (btf_data_size < 2081 + offsetof(struct btf_header, hdr_len) + sizeof(hdr->hdr_len)) { 2092 2082 btf_verifier_log(env, "hdr_len not found"); 2093 2083 return -EINVAL; 2094 2084 } 2095 2085 2096 - if (get_user(hdr_len, &min_hdr->hdr_len)) 2097 - return -EFAULT; 2098 - 2086 + hdr = btf->data; 2087 + hdr_len = hdr->hdr_len; 2099 2088 if (btf_data_size < hdr_len) { 2100 2089 btf_verifier_log(env, "btf_header not found"); 2101 2090 return -EINVAL; 2102 2091 } 2103 2092 2104 - err = bpf_check_uarg_tail_zero(btf_data, sizeof(btf->hdr), hdr_len); 2105 - if (err) { 2106 - if (err == -E2BIG) 2107 - btf_verifier_log(env, "Unsupported btf_header"); 2108 - return err; 2093 + /* Ensure the unsupported header fields are zero */ 2094 + if (hdr_len > sizeof(btf->hdr)) { 2095 + u8 *expected_zero = btf->data + sizeof(btf->hdr); 2096 + u8 *end = btf->data + hdr_len; 2097 + 2098 + for (; expected_zero < end; expected_zero++) { 2099 + if (*expected_zero) { 2100 + btf_verifier_log(env, "Unsupported btf_header"); 2101 + return -E2BIG; 2102 + } 2103 + } 2109 2104 } 2110 2105 2111 2106 hdr_copy = min_t(u32, hdr_len, sizeof(btf->hdr)); 2112 - if (copy_from_user(&btf->hdr, btf_data, hdr_copy)) 2113 - return 
-EFAULT; 2107 + memcpy(&btf->hdr, btf->data, hdr_copy); 2114 2108 2115 2109 hdr = &btf->hdr; 2116 - 2117 - if (hdr->hdr_len != hdr_len) 2118 - return -EINVAL; 2119 2110 2120 2111 btf_verifier_log_hdr(env, btf_data_size); 2121 2112 ··· 2177 2186 } 2178 2187 env->btf = btf; 2179 2188 2180 - err = btf_parse_hdr(env, btf_data, btf_data_size); 2181 - if (err) 2182 - goto errout; 2183 - 2184 2189 data = kvmalloc(btf_data_size, GFP_KERNEL | __GFP_NOWARN); 2185 2190 if (!data) { 2186 2191 err = -ENOMEM; ··· 2185 2198 2186 2199 btf->data = data; 2187 2200 btf->data_size = btf_data_size; 2188 - btf->nohdr_data = btf->data + btf->hdr.hdr_len; 2189 2201 2190 2202 if (copy_from_user(data, btf_data, btf_data_size)) { 2191 2203 err = -EFAULT; 2192 2204 goto errout; 2193 2205 } 2206 + 2207 + err = btf_parse_hdr(env); 2208 + if (err) 2209 + goto errout; 2210 + 2211 + btf->nohdr_data = btf->data + btf->hdr.hdr_len; 2194 2212 2195 2213 err = btf_parse_str_sec(env); 2196 2214 if (err)
+47 -4
kernel/bpf/core.c
··· 365 365 } 366 366 367 367 #ifdef CONFIG_BPF_JIT 368 + # define BPF_JIT_LIMIT_DEFAULT (PAGE_SIZE * 40000) 369 + 368 370 /* All BPF JIT sysctl knobs here. */ 369 371 int bpf_jit_enable __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_ALWAYS_ON); 370 372 int bpf_jit_harden __read_mostly; 371 373 int bpf_jit_kallsyms __read_mostly; 374 + int bpf_jit_limit __read_mostly = BPF_JIT_LIMIT_DEFAULT; 372 375 373 376 static __always_inline void 374 377 bpf_get_prog_addr_region(const struct bpf_prog *prog, ··· 580 577 return ret; 581 578 } 582 579 580 + static atomic_long_t bpf_jit_current; 581 + 582 + #if defined(MODULES_VADDR) 583 + static int __init bpf_jit_charge_init(void) 584 + { 585 + /* Only used as heuristic here to derive limit. */ 586 + bpf_jit_limit = min_t(u64, round_up((MODULES_END - MODULES_VADDR) >> 2, 587 + PAGE_SIZE), INT_MAX); 588 + return 0; 589 + } 590 + pure_initcall(bpf_jit_charge_init); 591 + #endif 592 + 593 + static int bpf_jit_charge_modmem(u32 pages) 594 + { 595 + if (atomic_long_add_return(pages, &bpf_jit_current) > 596 + (bpf_jit_limit >> PAGE_SHIFT)) { 597 + if (!capable(CAP_SYS_ADMIN)) { 598 + atomic_long_sub(pages, &bpf_jit_current); 599 + return -EPERM; 600 + } 601 + } 602 + 603 + return 0; 604 + } 605 + 606 + static void bpf_jit_uncharge_modmem(u32 pages) 607 + { 608 + atomic_long_sub(pages, &bpf_jit_current); 609 + } 610 + 583 611 struct bpf_binary_header * 584 612 bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, 585 613 unsigned int alignment, 586 614 bpf_jit_fill_hole_t bpf_fill_ill_insns) 587 615 { 588 616 struct bpf_binary_header *hdr; 589 - unsigned int size, hole, start; 617 + u32 size, hole, start, pages; 590 618 591 619 /* Most of BPF filters are really small, but if some of them 592 620 * fill a page, allow at least 128 extra bytes to insert a 593 621 * random section of illegal instructions. 
594 622 */ 595 623 size = round_up(proglen + sizeof(*hdr) + 128, PAGE_SIZE); 596 - hdr = module_alloc(size); 597 - if (hdr == NULL) 624 + pages = size / PAGE_SIZE; 625 + 626 + if (bpf_jit_charge_modmem(pages)) 598 627 return NULL; 628 + hdr = module_alloc(size); 629 + if (!hdr) { 630 + bpf_jit_uncharge_modmem(pages); 631 + return NULL; 632 + } 599 633 600 634 /* Fill space with illegal/arch-dep instructions. */ 601 635 bpf_fill_ill_insns(hdr, size); 602 636 603 - hdr->pages = size / PAGE_SIZE; 637 + hdr->pages = pages; 604 638 hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)), 605 639 PAGE_SIZE - sizeof(*hdr)); 606 640 start = (get_random_int() % hole) & ~(alignment - 1); ··· 650 610 651 611 void bpf_jit_binary_free(struct bpf_binary_header *hdr) 652 612 { 613 + u32 pages = hdr->pages; 614 + 653 615 module_memfree(hdr); 616 + bpf_jit_uncharge_modmem(pages); 654 617 } 655 618 656 619 /* This symbol is only overridden by archs that have different
+1 -2
kernel/bpf/devmap.c
··· 512 512 struct bpf_dtab_netdev *dev, *odev; 513 513 514 514 dev = READ_ONCE(dtab->netdev_map[i]); 515 - if (!dev || 516 - dev->dev->ifindex != netdev->ifindex) 515 + if (!dev || netdev != dev->dev) 517 516 continue; 518 517 odev = cmpxchg(&dtab->netdev_map[i], dev, NULL); 519 518 if (dev == odev)
-2
kernel/bpf/helpers.c
··· 99 99 const struct bpf_func_proto bpf_map_pop_elem_proto = { 100 100 .func = bpf_map_pop_elem, 101 101 .gpl_only = false, 102 - .pkt_access = true, 103 102 .ret_type = RET_INTEGER, 104 103 .arg1_type = ARG_CONST_MAP_PTR, 105 104 .arg2_type = ARG_PTR_TO_UNINIT_MAP_VALUE, ··· 112 113 const struct bpf_func_proto bpf_map_peek_elem_proto = { 113 114 .func = bpf_map_pop_elem, 114 115 .gpl_only = false, 115 - .pkt_access = true, 116 116 .ret_type = RET_INTEGER, 117 117 .arg1_type = ARG_CONST_MAP_PTR, 118 118 .arg2_type = ARG_PTR_TO_UNINIT_MAP_VALUE,
+2
kernel/bpf/queue_stack_maps.c
··· 122 122 raw_spin_lock_irqsave(&qs->lock, flags); 123 123 124 124 if (queue_stack_map_is_empty(qs)) { 125 + memset(value, 0, qs->map.value_size); 125 126 err = -ENOENT; 126 127 goto out; 127 128 } ··· 152 151 raw_spin_lock_irqsave(&qs->lock, flags); 153 152 154 153 if (queue_stack_map_is_empty(qs)) { 154 + memset(value, 0, qs->map.value_size); 155 155 err = -ENOENT; 156 156 goto out; 157 157 }
+10 -3
kernel/bpf/verifier.c
··· 1387 1387 enum bpf_access_type t) 1388 1388 { 1389 1389 switch (env->prog->type) { 1390 + /* Program types only with direct read access go here! */ 1390 1391 case BPF_PROG_TYPE_LWT_IN: 1391 1392 case BPF_PROG_TYPE_LWT_OUT: 1392 1393 case BPF_PROG_TYPE_LWT_SEG6LOCAL: 1393 1394 case BPF_PROG_TYPE_SK_REUSEPORT: 1394 - /* dst_input() and dst_output() can't write for now */ 1395 + case BPF_PROG_TYPE_FLOW_DISSECTOR: 1396 + case BPF_PROG_TYPE_CGROUP_SKB: 1395 1397 if (t == BPF_WRITE) 1396 1398 return false; 1397 1399 /* fallthrough */ 1400 + 1401 + /* Program types with direct read + write access go here! */ 1398 1402 case BPF_PROG_TYPE_SCHED_CLS: 1399 1403 case BPF_PROG_TYPE_SCHED_ACT: 1400 1404 case BPF_PROG_TYPE_XDP: 1401 1405 case BPF_PROG_TYPE_LWT_XMIT: 1402 1406 case BPF_PROG_TYPE_SK_SKB: 1403 1407 case BPF_PROG_TYPE_SK_MSG: 1404 - case BPF_PROG_TYPE_FLOW_DISSECTOR: 1405 1408 if (meta) 1406 1409 return meta->pkt_access; 1407 1410 ··· 5709 5706 bool is_narrower_load; 5710 5707 u32 target_size; 5711 5708 5712 - if (ops->gen_prologue) { 5709 + if (ops->gen_prologue || env->seen_direct_write) { 5710 + if (!ops->gen_prologue) { 5711 + verbose(env, "bpf verifier is misconfigured\n"); 5712 + return -EINVAL; 5713 + } 5713 5714 cnt = ops->gen_prologue(insn_buf, env->seen_direct_write, 5714 5715 env->prog); 5715 5716 if (cnt >= ARRAY_SIZE(insn_buf)) {
+17 -4
net/core/filter.c
··· 5264 5264 return &bpf_msg_pull_data_proto; 5265 5265 case BPF_FUNC_msg_push_data: 5266 5266 return &bpf_msg_push_data_proto; 5267 - case BPF_FUNC_get_local_storage: 5268 - return &bpf_get_local_storage_proto; 5269 5267 default: 5270 5268 return bpf_base_func_proto(func_id); 5271 5269 } ··· 5294 5296 return &bpf_sk_redirect_map_proto; 5295 5297 case BPF_FUNC_sk_redirect_hash: 5296 5298 return &bpf_sk_redirect_hash_proto; 5297 - case BPF_FUNC_get_local_storage: 5298 - return &bpf_get_local_storage_proto; 5299 5299 #ifdef CONFIG_INET 5300 5300 case BPF_FUNC_sk_lookup_tcp: 5301 5301 return &bpf_sk_lookup_tcp_proto; ··· 5492 5496 case bpf_ctx_range(struct __sk_buff, data_meta): 5493 5497 case bpf_ctx_range(struct __sk_buff, flow_keys): 5494 5498 return false; 5499 + case bpf_ctx_range(struct __sk_buff, data): 5500 + case bpf_ctx_range(struct __sk_buff, data_end): 5501 + if (!capable(CAP_SYS_ADMIN)) 5502 + return false; 5503 + break; 5495 5504 } 5505 + 5496 5506 if (type == BPF_WRITE) { 5497 5507 switch (off) { 5498 5508 case bpf_ctx_range(struct __sk_buff, mark): ··· 5638 5636 return false; 5639 5637 return __sock_filter_check_attach_type(off, type, 5640 5638 prog->expected_attach_type); 5639 + } 5640 + 5641 + static int bpf_noop_prologue(struct bpf_insn *insn_buf, bool direct_write, 5642 + const struct bpf_prog *prog) 5643 + { 5644 + /* Neither direct read nor direct write requires any preliminary 5645 + * action. 
5646 + */ 5647 + return 0; 5641 5648 } 5642 5649 5643 5650 static int bpf_unclone_prologue(struct bpf_insn *insn_buf, bool direct_write, ··· 7215 7204 .get_func_proto = xdp_func_proto, 7216 7205 .is_valid_access = xdp_is_valid_access, 7217 7206 .convert_ctx_access = xdp_convert_ctx_access, 7207 + .gen_prologue = bpf_noop_prologue, 7218 7208 }; 7219 7209 7220 7210 const struct bpf_prog_ops xdp_prog_ops = { ··· 7314 7302 .get_func_proto = sk_msg_func_proto, 7315 7303 .is_valid_access = sk_msg_is_valid_access, 7316 7304 .convert_ctx_access = sk_msg_convert_ctx_access, 7305 + .gen_prologue = bpf_noop_prologue, 7317 7306 }; 7318 7307 7319 7308 const struct bpf_prog_ops sk_msg_prog_ops = {
+8 -2
net/core/sysctl_net_core.c
··· 279 279 return ret; 280 280 } 281 281 282 - # ifdef CONFIG_HAVE_EBPF_JIT 283 282 static int 284 283 proc_dointvec_minmax_bpf_restricted(struct ctl_table *table, int write, 285 284 void __user *buffer, size_t *lenp, ··· 289 290 290 291 return proc_dointvec_minmax(table, write, buffer, lenp, ppos); 291 292 } 292 - # endif 293 293 #endif 294 294 295 295 static struct ctl_table net_core_table[] = { ··· 395 397 .extra2 = &one, 396 398 }, 397 399 # endif 400 + { 401 + .procname = "bpf_jit_limit", 402 + .data = &bpf_jit_limit, 403 + .maxlen = sizeof(int), 404 + .mode = 0600, 405 + .proc_handler = proc_dointvec_minmax_bpf_restricted, 406 + .extra1 = &one, 407 + }, 398 408 #endif 399 409 { 400 410 .procname = "netdev_tstamp_prequeue",
+2
tools/testing/selftests/bpf/config
··· 20 20 CONFIG_GENEVE=y 21 21 CONFIG_NET_CLS_FLOWER=m 22 22 CONFIG_LWTUNNEL=y 23 + CONFIG_BPF_STREAM_PARSER=y 24 + CONFIG_XDP_SOCKETS=y
+13 -2
tools/testing/selftests/bpf/test_verifier.c
··· 4891 4891 BPF_EXIT_INSN(), 4892 4892 }, 4893 4893 .result = ACCEPT, 4894 + .result_unpriv = REJECT, 4895 + .errstr_unpriv = "invalid bpf_context access off=76 size=4", 4894 4896 .prog_type = BPF_PROG_TYPE_CGROUP_SKB, 4895 4897 }, 4896 4898 { ··· 5148 5146 .fixup_cgroup_storage = { 1 }, 5149 5147 .result = REJECT, 5150 5148 .errstr = "get_local_storage() doesn't support non-zero flags", 5149 + .errstr_unpriv = "R2 leaks addr into helper function", 5151 5150 .prog_type = BPF_PROG_TYPE_CGROUP_SKB, 5152 5151 }, 5153 5152 { ··· 5264 5261 .fixup_percpu_cgroup_storage = { 1 }, 5265 5262 .result = REJECT, 5266 5263 .errstr = "get_local_storage() doesn't support non-zero flags", 5264 + .errstr_unpriv = "R2 leaks addr into helper function", 5267 5265 .prog_type = BPF_PROG_TYPE_CGROUP_SKB, 5268 5266 }, 5269 5267 { ··· 14054 14050 fclose(fd); 14055 14051 } 14056 14052 14053 + static bool test_as_unpriv(struct bpf_test *test) 14054 + { 14055 + return !test->prog_type || 14056 + test->prog_type == BPF_PROG_TYPE_SOCKET_FILTER || 14057 + test->prog_type == BPF_PROG_TYPE_CGROUP_SKB; 14058 + } 14059 + 14057 14060 static int do_test(bool unpriv, unsigned int from, unsigned int to) 14058 14061 { 14059 14062 int i, passes = 0, errors = 0, skips = 0; ··· 14071 14060 /* Program types that are not supported by non-root we 14072 14061 * skip right away. 14073 14062 */ 14074 - if (!test->prog_type && unpriv_disabled) { 14063 + if (test_as_unpriv(test) && unpriv_disabled) { 14075 14064 printf("#%d/u %s SKIP\n", i, test->descr); 14076 14065 skips++; 14077 - } else if (!test->prog_type) { 14066 + } else if (test_as_unpriv(test)) { 14078 14067 if (!unpriv) 14079 14068 set_admin(false); 14080 14069 printf("#%d/u %s ", i, test->descr);