Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

bpf: Inline calls to bpf_loop when callback is known

Calls to `bpf_loop` are replaced with direct loops to avoid
indirection. E.g. the following:

bpf_loop(10, foo, NULL, 0);

It is replaced by the equivalent of the following:

for (int i = 0; i < 10; ++i)
foo(i, NULL);

This transformation can be applied when:
- callback is known and does not change during program execution;
- flags passed to `bpf_loop` are always zero.

Inlining logic works as follows:

- During execution simulation, the function `update_loop_inline_state`
tracks the following information for each `bpf_loop` call
instruction:
- is callback known and constant?
- are flags constant and zero?
- Function `optimize_bpf_loop` increases stack depth for functions
where `bpf_loop` calls can be inlined and invokes `inline_bpf_loop`
to apply the inlining. The additional stack space is used to spill
registers R6, R7 and R8. These registers are used as the loop counter,
the maximum loop bound, and the callback context parameter;

Measurements using `benchs/run_bench_bpf_loop.sh` inside QEMU / KVM on
i7-4710HQ CPU show a drop in latency from 14 ns/op to 2 ns/op.

Signed-off-by: Eduard Zingerman <eddyz87@gmail.com>
Acked-by: Song Liu <songliubraving@fb.com>
Link: https://lore.kernel.org/r/20220620235344.569325-4-eddyz87@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>

authored by

Eduard Zingerman and committed by
Alexei Starovoitov
1ade2371 7a42008c

+195 -9
+3
include/linux/bpf.h
··· 1286 1286 #define BPF_COMPLEXITY_LIMIT_INSNS 1000000 /* yes. 1M insns */ 1287 1287 #define MAX_TAIL_CALL_CNT 33 1288 1288 1289 + /* Maximum number of loops for bpf_loop */ 1290 + #define BPF_MAX_LOOPS BIT(23) 1291 + 1289 1292 #define BPF_F_ACCESS_MASK (BPF_F_RDONLY | \ 1290 1293 BPF_F_RDONLY_PROG | \ 1291 1294 BPF_F_WRONLY | \
+12
include/linux/bpf_verifier.h
··· 344 344 int miss_cnt, hit_cnt; 345 345 }; 346 346 347 + struct bpf_loop_inline_state { 348 + int initialized:1; /* set to true upon first entry */ 349 + int fit_for_inline:1; /* true if callback function is the same 350 + * at each call and flags are always zero 351 + */ 352 + u32 callback_subprogno; /* valid when fit_for_inline is true */ 353 + }; 354 + 347 355 /* Possible states for alu_state member. */ 348 356 #define BPF_ALU_SANITIZE_SRC (1U << 0) 349 357 #define BPF_ALU_SANITIZE_DST (1U << 1) ··· 381 373 u32 mem_size; /* mem_size for non-struct typed var */ 382 374 }; 383 375 } btf_var; 376 + /* if instruction is a call to bpf_loop this field tracks 377 + * the state of the relevant registers to make decision about inlining 378 + */ 379 + struct bpf_loop_inline_state loop_inline_state; 384 380 }; 385 381 u64 map_key_state; /* constant (32 bit) key tracking for maps */ 386 382 int ctx_field_size; /* the ctx field size for load insn, maybe 0 */
+5 -4
kernel/bpf/bpf_iter.c
··· 723 723 .arg4_type = ARG_ANYTHING, 724 724 }; 725 725 726 - /* maximum number of loops */ 727 - #define MAX_LOOPS BIT(23) 728 - 729 726 BPF_CALL_4(bpf_loop, u32, nr_loops, void *, callback_fn, void *, callback_ctx, 730 727 u64, flags) 731 728 { ··· 730 733 u64 ret; 731 734 u32 i; 732 735 736 + /* Note: these safety checks are also verified when bpf_loop 737 + * is inlined, be careful to modify this code in sync. See 738 + * function verifier.c:inline_bpf_loop. 739 + */ 733 740 if (flags) 734 741 return -EINVAL; 735 - if (nr_loops > MAX_LOOPS) 742 + if (nr_loops > BPF_MAX_LOOPS) 736 743 return -E2BIG; 737 744 738 745 for (i = 0; i < nr_loops; i++) {
+175 -5
kernel/bpf/verifier.c
··· 7124 7124 return -ENOTSUPP; 7125 7125 } 7126 7126 7127 + static struct bpf_insn_aux_data *cur_aux(struct bpf_verifier_env *env) 7128 + { 7129 + return &env->insn_aux_data[env->insn_idx]; 7130 + } 7131 + 7132 + static bool loop_flag_is_zero(struct bpf_verifier_env *env) 7133 + { 7134 + struct bpf_reg_state *regs = cur_regs(env); 7135 + struct bpf_reg_state *reg = &regs[BPF_REG_4]; 7136 + bool reg_is_null = register_is_null(reg); 7137 + 7138 + if (reg_is_null) 7139 + mark_chain_precision(env, BPF_REG_4); 7140 + 7141 + return reg_is_null; 7142 + } 7143 + 7144 + static void update_loop_inline_state(struct bpf_verifier_env *env, u32 subprogno) 7145 + { 7146 + struct bpf_loop_inline_state *state = &cur_aux(env)->loop_inline_state; 7147 + 7148 + if (!state->initialized) { 7149 + state->initialized = 1; 7150 + state->fit_for_inline = loop_flag_is_zero(env); 7151 + state->callback_subprogno = subprogno; 7152 + return; 7153 + } 7154 + 7155 + if (!state->fit_for_inline) 7156 + return; 7157 + 7158 + state->fit_for_inline = (loop_flag_is_zero(env) && 7159 + state->callback_subprogno == subprogno); 7160 + } 7161 + 7127 7162 static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn, 7128 7163 int *insn_idx_p) 7129 7164 { ··· 7311 7276 err = check_bpf_snprintf_call(env, regs); 7312 7277 break; 7313 7278 case BPF_FUNC_loop: 7279 + update_loop_inline_state(env, meta.subprogno); 7314 7280 err = __check_func_call(env, insn, insn_idx_p, meta.subprogno, 7315 7281 set_loop_callback_state); 7316 7282 break; ··· 7716 7680 } 7717 7681 7718 7682 return true; 7719 - } 7720 - 7721 - static struct bpf_insn_aux_data *cur_aux(struct bpf_verifier_env *env) 7722 - { 7723 - return &env->insn_aux_data[env->insn_idx]; 7724 7683 } 7725 7684 7726 7685 enum { ··· 14346 14315 return 0; 14347 14316 } 14348 14317 14318 + static struct bpf_prog *inline_bpf_loop(struct bpf_verifier_env *env, 14319 + int position, 14320 + s32 stack_base, 14321 + u32 callback_subprogno, 14322 + u32 
*cnt) 14323 + { 14324 + s32 r6_offset = stack_base + 0 * BPF_REG_SIZE; 14325 + s32 r7_offset = stack_base + 1 * BPF_REG_SIZE; 14326 + s32 r8_offset = stack_base + 2 * BPF_REG_SIZE; 14327 + int reg_loop_max = BPF_REG_6; 14328 + int reg_loop_cnt = BPF_REG_7; 14329 + int reg_loop_ctx = BPF_REG_8; 14330 + 14331 + struct bpf_prog *new_prog; 14332 + u32 callback_start; 14333 + u32 call_insn_offset; 14334 + s32 callback_offset; 14335 + 14336 + /* This represents an inlined version of bpf_iter.c:bpf_loop, 14337 + * be careful to modify this code in sync. 14338 + */ 14339 + struct bpf_insn insn_buf[] = { 14340 + /* Return error and jump to the end of the patch if 14341 + * expected number of iterations is too big. 14342 + */ 14343 + BPF_JMP_IMM(BPF_JLE, BPF_REG_1, BPF_MAX_LOOPS, 2), 14344 + BPF_MOV32_IMM(BPF_REG_0, -E2BIG), 14345 + BPF_JMP_IMM(BPF_JA, 0, 0, 16), 14346 + /* spill R6, R7, R8 to use these as loop vars */ 14347 + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_6, r6_offset), 14348 + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_7, r7_offset), 14349 + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_8, r8_offset), 14350 + /* initialize loop vars */ 14351 + BPF_MOV64_REG(reg_loop_max, BPF_REG_1), 14352 + BPF_MOV32_IMM(reg_loop_cnt, 0), 14353 + BPF_MOV64_REG(reg_loop_ctx, BPF_REG_3), 14354 + /* loop header, 14355 + * if reg_loop_cnt >= reg_loop_max skip the loop body 14356 + */ 14357 + BPF_JMP_REG(BPF_JGE, reg_loop_cnt, reg_loop_max, 5), 14358 + /* callback call, 14359 + * correct callback offset would be set after patching 14360 + */ 14361 + BPF_MOV64_REG(BPF_REG_1, reg_loop_cnt), 14362 + BPF_MOV64_REG(BPF_REG_2, reg_loop_ctx), 14363 + BPF_CALL_REL(0), 14364 + /* increment loop counter */ 14365 + BPF_ALU64_IMM(BPF_ADD, reg_loop_cnt, 1), 14366 + /* jump to loop header if callback returned 0 */ 14367 + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, -6), 14368 + /* return value of bpf_loop, 14369 + * set R0 to the number of iterations 14370 + */ 14371 + BPF_MOV64_REG(BPF_REG_0, reg_loop_cnt), 14372 + 
/* restore original values of R6, R7, R8 */ 14373 + BPF_LDX_MEM(BPF_DW, BPF_REG_6, BPF_REG_10, r6_offset), 14374 + BPF_LDX_MEM(BPF_DW, BPF_REG_7, BPF_REG_10, r7_offset), 14375 + BPF_LDX_MEM(BPF_DW, BPF_REG_8, BPF_REG_10, r8_offset), 14376 + }; 14377 + 14378 + *cnt = ARRAY_SIZE(insn_buf); 14379 + new_prog = bpf_patch_insn_data(env, position, insn_buf, *cnt); 14380 + if (!new_prog) 14381 + return new_prog; 14382 + 14383 + /* callback start is known only after patching */ 14384 + callback_start = env->subprog_info[callback_subprogno].start; 14385 + /* Note: insn_buf[12] is an offset of BPF_CALL_REL instruction */ 14386 + call_insn_offset = position + 12; 14387 + callback_offset = callback_start - call_insn_offset - 1; 14388 + env->prog->insnsi[call_insn_offset].imm = callback_offset; 14389 + 14390 + return new_prog; 14391 + } 14392 + 14393 + static bool is_bpf_loop_call(struct bpf_insn *insn) 14394 + { 14395 + return insn->code == (BPF_JMP | BPF_CALL) && 14396 + insn->src_reg == 0 && 14397 + insn->imm == BPF_FUNC_loop; 14398 + } 14399 + 14400 + /* For all sub-programs in the program (including main) check 14401 + * insn_aux_data to see if there are bpf_loop calls that require 14402 + * inlining. If such calls are found the calls are replaced with a 14403 + * sequence of instructions produced by `inline_bpf_loop` function and 14404 + * subprog stack_depth is increased by the size of 3 registers. 14405 + * This stack space is used to spill values of the R6, R7, R8. These 14406 + * registers are used to store the loop bound, counter and context 14407 + * variables. 
14408 + */ 14409 + static int optimize_bpf_loop(struct bpf_verifier_env *env) 14410 + { 14411 + struct bpf_subprog_info *subprogs = env->subprog_info; 14412 + int i, cur_subprog = 0, cnt, delta = 0; 14413 + struct bpf_insn *insn = env->prog->insnsi; 14414 + int insn_cnt = env->prog->len; 14415 + u16 stack_depth = subprogs[cur_subprog].stack_depth; 14416 + u16 stack_depth_roundup = round_up(stack_depth, 8) - stack_depth; 14417 + u16 stack_depth_extra = 0; 14418 + 14419 + for (i = 0; i < insn_cnt; i++, insn++) { 14420 + struct bpf_loop_inline_state *inline_state = 14421 + &env->insn_aux_data[i + delta].loop_inline_state; 14422 + 14423 + if (is_bpf_loop_call(insn) && inline_state->fit_for_inline) { 14424 + struct bpf_prog *new_prog; 14425 + 14426 + stack_depth_extra = BPF_REG_SIZE * 3 + stack_depth_roundup; 14427 + new_prog = inline_bpf_loop(env, 14428 + i + delta, 14429 + -(stack_depth + stack_depth_extra), 14430 + inline_state->callback_subprogno, 14431 + &cnt); 14432 + if (!new_prog) 14433 + return -ENOMEM; 14434 + 14435 + delta += cnt - 1; 14436 + env->prog = new_prog; 14437 + insn = new_prog->insnsi + i + delta; 14438 + } 14439 + 14440 + if (subprogs[cur_subprog + 1].start == i + delta + 1) { 14441 + subprogs[cur_subprog].stack_depth += stack_depth_extra; 14442 + cur_subprog++; 14443 + stack_depth = subprogs[cur_subprog].stack_depth; 14444 + stack_depth_roundup = round_up(stack_depth, 8) - stack_depth; 14445 + stack_depth_extra = 0; 14446 + } 14447 + } 14448 + 14449 + env->prog->aux->stack_depth = env->subprog_info[0].stack_depth; 14450 + 14451 + return 0; 14452 + } 14453 + 14349 14454 static void free_states(struct bpf_verifier_env *env) 14350 14455 { 14351 14456 struct bpf_verifier_state_list *sl, *sln; ··· 15219 15052 ret = check_max_stack_depth(env); 15220 15053 15221 15054 /* instruction rewrites happen after this point */ 15055 + if (ret == 0) 15056 + ret = optimize_bpf_loop(env); 15057 + 15222 15058 if (is_priv) { 15223 15059 if (ret == 0) 15224 15060 
opt_hard_wire_dead_code_branches(env);