Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

bpf: Add bpf_loop helper

This patch adds the kernel-side and API changes for a new helper
function, bpf_loop:

long bpf_loop(u32 nr_loops, void *callback_fn, void *callback_ctx,
u64 flags);

where long (*callback_fn)(u32 index, void *ctx);

bpf_loop invokes the "callback_fn" **nr_loops** times or until the
callback_fn returns 1. The callback_fn can only return 0 or 1, and
this is enforced by the verifier. The callback_fn index is zero-indexed.

A few things to please note:
~ The "u64 flags" parameter is currently unused but is included in
case a future use case for it arises.
~ In the kernel-side implementation of bpf_loop (kernel/bpf/bpf_iter.c),
bpf_callback_t is used as the callback function cast.
~ A program can have nested bpf_loop calls but the program must
still adhere to the verifier constraint of its stack depth (the stack depth
cannot exceed MAX_BPF_STACK)
~ Recursive callback_fns do not pass the verifier, due to the call stack
for these being too deep.
~ The next patch will include the tests and benchmark

Signed-off-by: Joanne Koong <joannekoong@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20211130030622.4131246-2-joannekoong@fb.com

authored by

Joanne Koong and committed by
Alexei Starovoitov
e6f2dd0f 88691e9e

+142 -34
+1
include/linux/bpf.h
··· 2164 2164 extern const struct bpf_func_proto bpf_sk_getsockopt_proto; 2165 2165 extern const struct bpf_func_proto bpf_kallsyms_lookup_name_proto; 2166 2166 extern const struct bpf_func_proto bpf_find_vma_proto; 2167 + extern const struct bpf_func_proto bpf_loop_proto; 2167 2168 2168 2169 const struct bpf_func_proto *tracing_prog_func_proto( 2169 2170 enum bpf_func_id func_id, const struct bpf_prog *prog);
+25
include/uapi/linux/bpf.h
··· 4957 4957 * **-ENOENT** if *task->mm* is NULL, or no vma contains *addr*. 4958 4958 * **-EBUSY** if failed to try lock mmap_lock. 4959 4959 * **-EINVAL** for invalid **flags**. 4960 + * 4961 + * long bpf_loop(u32 nr_loops, void *callback_fn, void *callback_ctx, u64 flags) 4962 + * Description 4963 + * For **nr_loops**, call **callback_fn** function 4964 + * with **callback_ctx** as the context parameter. 4965 + * The **callback_fn** should be a static function and 4966 + * the **callback_ctx** should be a pointer to the stack. 4967 + * The **flags** is used to control certain aspects of the helper. 4968 + * Currently, the **flags** must be 0. Currently, nr_loops is 4969 + * limited to 1 << 23 (~8 million) loops. 4970 + * 4971 + * long (\*callback_fn)(u32 index, void \*ctx); 4972 + * 4973 + * where **index** is the current index in the loop. The index 4974 + * is zero-indexed. 4975 + * 4976 + * If **callback_fn** returns 0, the helper will continue to the next 4977 + * loop. If return value is 1, the helper will skip the rest of 4978 + * the loops and return. Other return values are not used now, 4979 + * and will be rejected by the verifier. 4980 + * 4981 + * Return 4982 + * The number of loops performed, **-EINVAL** for invalid **flags**, 4983 + * **-E2BIG** if **nr_loops** exceeds the maximum number of loops. 4960 4984 */ 4961 4985 #define __BPF_FUNC_MAPPER(FN) \ 4962 4986 FN(unspec), \ ··· 5164 5140 FN(skc_to_unix_sock), \ 5165 5141 FN(kallsyms_lookup_name), \ 5166 5142 FN(find_vma), \ 5143 + FN(loop), \ 5167 5144 /* */ 5168 5145 5169 5146 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
+35
kernel/bpf/bpf_iter.c
··· 714 714 .arg3_type = ARG_PTR_TO_STACK_OR_NULL, 715 715 .arg4_type = ARG_ANYTHING, 716 716 }; 717 + 718 + /* maximum number of loops */ 719 + #define MAX_LOOPS BIT(23) 720 + 721 + BPF_CALL_4(bpf_loop, u32, nr_loops, void *, callback_fn, void *, callback_ctx, 722 + u64, flags) 723 + { 724 + bpf_callback_t callback = (bpf_callback_t)callback_fn; 725 + u64 ret; 726 + u32 i; 727 + 728 + if (flags) 729 + return -EINVAL; 730 + if (nr_loops > MAX_LOOPS) 731 + return -E2BIG; 732 + 733 + for (i = 0; i < nr_loops; i++) { 734 + ret = callback((u64)i, (u64)(long)callback_ctx, 0, 0, 0); 735 + /* return value: 0 - continue, 1 - stop and return */ 736 + if (ret) 737 + return i + 1; 738 + } 739 + 740 + return i; 741 + } 742 + 743 + const struct bpf_func_proto bpf_loop_proto = { 744 + .func = bpf_loop, 745 + .gpl_only = false, 746 + .ret_type = RET_INTEGER, 747 + .arg1_type = ARG_ANYTHING, 748 + .arg2_type = ARG_PTR_TO_FUNC, 749 + .arg3_type = ARG_PTR_TO_STACK_OR_NULL, 750 + .arg4_type = ARG_ANYTHING, 751 + };
+2
kernel/bpf/helpers.c
··· 1378 1378 return &bpf_ringbuf_query_proto; 1379 1379 case BPF_FUNC_for_each_map_elem: 1380 1380 return &bpf_for_each_map_elem_proto; 1381 + case BPF_FUNC_loop: 1382 + return &bpf_loop_proto; 1381 1383 default: 1382 1384 break; 1383 1385 }
+54 -34
kernel/bpf/verifier.c
··· 6085 6085 return 0; 6086 6086 } 6087 6087 6088 + static int set_loop_callback_state(struct bpf_verifier_env *env, 6089 + struct bpf_func_state *caller, 6090 + struct bpf_func_state *callee, 6091 + int insn_idx) 6092 + { 6093 + /* bpf_loop(u32 nr_loops, void *callback_fn, void *callback_ctx, 6094 + * u64 flags); 6095 + * callback_fn(u32 index, void *callback_ctx); 6096 + */ 6097 + callee->regs[BPF_REG_1].type = SCALAR_VALUE; 6098 + callee->regs[BPF_REG_2] = caller->regs[BPF_REG_3]; 6099 + 6100 + /* unused */ 6101 + __mark_reg_not_init(env, &callee->regs[BPF_REG_3]); 6102 + __mark_reg_not_init(env, &callee->regs[BPF_REG_4]); 6103 + __mark_reg_not_init(env, &callee->regs[BPF_REG_5]); 6104 + 6105 + callee->in_callback_fn = true; 6106 + return 0; 6107 + } 6108 + 6088 6109 static int set_timer_callback_state(struct bpf_verifier_env *env, 6089 6110 struct bpf_func_state *caller, 6090 6111 struct bpf_func_state *callee, ··· 6479 6458 return err; 6480 6459 } 6481 6460 6482 - if (func_id == BPF_FUNC_tail_call) { 6483 - err = check_reference_leak(env); 6484 - if (err) { 6485 - verbose(env, "tail_call would lead to reference leak\n"); 6486 - return err; 6487 - } 6488 - } else if (is_release_function(func_id)) { 6461 + if (is_release_function(func_id)) { 6489 6462 err = release_reference(env, meta.ref_obj_id); 6490 6463 if (err) { 6491 6464 verbose(env, "func %s#%d reference has not been acquired before\n", ··· 6490 6475 6491 6476 regs = cur_regs(env); 6492 6477 6493 - /* check that flags argument in get_local_storage(map, flags) is 0, 6494 - * this is required because get_local_storage() can't return an error. 
6495 - */ 6496 - if (func_id == BPF_FUNC_get_local_storage && 6497 - !register_is_null(&regs[BPF_REG_2])) { 6498 - verbose(env, "get_local_storage() doesn't support non-zero flags\n"); 6499 - return -EINVAL; 6500 - } 6501 - 6502 - if (func_id == BPF_FUNC_for_each_map_elem) { 6478 + switch (func_id) { 6479 + case BPF_FUNC_tail_call: 6480 + err = check_reference_leak(env); 6481 + if (err) { 6482 + verbose(env, "tail_call would lead to reference leak\n"); 6483 + return err; 6484 + } 6485 + break; 6486 + case BPF_FUNC_get_local_storage: 6487 + /* check that flags argument in get_local_storage(map, flags) is 0, 6488 + * this is required because get_local_storage() can't return an error. 6489 + */ 6490 + if (!register_is_null(&regs[BPF_REG_2])) { 6491 + verbose(env, "get_local_storage() doesn't support non-zero flags\n"); 6492 + return -EINVAL; 6493 + } 6494 + break; 6495 + case BPF_FUNC_for_each_map_elem: 6503 6496 err = __check_func_call(env, insn, insn_idx_p, meta.subprogno, 6504 6497 set_map_elem_callback_state); 6505 - if (err < 0) 6506 - return -EINVAL; 6507 - } 6508 - 6509 - if (func_id == BPF_FUNC_timer_set_callback) { 6498 + break; 6499 + case BPF_FUNC_timer_set_callback: 6510 6500 err = __check_func_call(env, insn, insn_idx_p, meta.subprogno, 6511 6501 set_timer_callback_state); 6512 - if (err < 0) 6513 - return -EINVAL; 6514 - } 6515 - 6516 - if (func_id == BPF_FUNC_find_vma) { 6502 + break; 6503 + case BPF_FUNC_find_vma: 6517 6504 err = __check_func_call(env, insn, insn_idx_p, meta.subprogno, 6518 6505 set_find_vma_callback_state); 6519 - if (err < 0) 6520 - return -EINVAL; 6506 + break; 6507 + case BPF_FUNC_snprintf: 6508 + err = check_bpf_snprintf_call(env, regs); 6509 + break; 6510 + case BPF_FUNC_loop: 6511 + err = __check_func_call(env, insn, insn_idx_p, meta.subprogno, 6512 + set_loop_callback_state); 6513 + break; 6521 6514 } 6522 6515 6523 - if (func_id == BPF_FUNC_snprintf) { 6524 - err = check_bpf_snprintf_call(env, regs); 6525 - if (err < 0) 6526 - 
return err; 6527 - } 6516 + if (err) 6517 + return err; 6528 6518 6529 6519 /* reset caller saved regs */ 6530 6520 for (i = 0; i < CALLER_SAVED_REGS; i++) {
+25
tools/include/uapi/linux/bpf.h
··· 4957 4957 * **-ENOENT** if *task->mm* is NULL, or no vma contains *addr*. 4958 4958 * **-EBUSY** if failed to try lock mmap_lock. 4959 4959 * **-EINVAL** for invalid **flags**. 4960 + * 4961 + * long bpf_loop(u32 nr_loops, void *callback_fn, void *callback_ctx, u64 flags) 4962 + * Description 4963 + * For **nr_loops**, call **callback_fn** function 4964 + * with **callback_ctx** as the context parameter. 4965 + * The **callback_fn** should be a static function and 4966 + * the **callback_ctx** should be a pointer to the stack. 4967 + * The **flags** is used to control certain aspects of the helper. 4968 + * Currently, the **flags** must be 0. Currently, nr_loops is 4969 + * limited to 1 << 23 (~8 million) loops. 4970 + * 4971 + * long (\*callback_fn)(u32 index, void \*ctx); 4972 + * 4973 + * where **index** is the current index in the loop. The index 4974 + * is zero-indexed. 4975 + * 4976 + * If **callback_fn** returns 0, the helper will continue to the next 4977 + * loop. If return value is 1, the helper will skip the rest of 4978 + * the loops and return. Other return values are not used now, 4979 + * and will be rejected by the verifier. 4980 + * 4981 + * Return 4982 + * The number of loops performed, **-EINVAL** for invalid **flags**, 4983 + * **-E2BIG** if **nr_loops** exceeds the maximum number of loops. 4960 4984 */ 4961 4985 #define __BPF_FUNC_MAPPER(FN) \ 4962 4986 FN(unspec), \ ··· 5164 5140 FN(skc_to_unix_sock), \ 5165 5141 FN(kallsyms_lookup_name), \ 5166 5142 FN(find_vma), \ 5143 + FN(loop), \ 5167 5144 /* */ 5168 5145 5169 5146 /* integer value in 'imm' field of BPF_CALL instruction selects which helper