Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

bpf: Add verifier support for timed may_goto

Implement support in the verifier for replacing may_goto implementation
from a counter-based approach to one which samples time on the local CPU
to have a bigger loop bound.

We implement it by maintaining 16 bytes per stack frame, using 8
bytes for maintaining the count for amortizing time sampling, and 8
bytes for the starting timestamp. To minimize overhead, we need to avoid
spilling and filling of registers around this sequence, so we push this
cost into the time sampling function 'arch_bpf_timed_may_goto'. This is
a JIT-specific wrapper around bpf_check_timed_may_goto which returns us
the count to store into the stack through BPF_REG_AX. All caller-saved
registers (r0-r5) are guaranteed to remain untouched.

The loop can be broken by returning count as 0, otherwise we dispatch
into the function when the count drops to 0, and the runtime chooses to
refresh it (by returning count as BPF_MAX_TIMED_LOOPS) or returning 0
and aborting the loop on next iteration.

Since the check for 0 is done right after loading the count from the
stack, all subsequent cond_break sequences should immediately break as
well, whether in the same loop or in subsequent loops in the program.

We pass in the stack_depth of the count (and thus the timestamp, by
adding 8 to it) to the arch_bpf_timed_may_goto call so that it can be
passed in to bpf_check_timed_may_goto as an argument after r1 is saved,
by adding the offset to r10/fp. This adjustment will be arch specific,
and the next patch will introduce support for x86.

Note that depending on loop complexity, time spent in the loop can be
more than the current limit (250 ms), but imposing an upper bound on
program runtime is an orthogonal problem which will be addressed when
program cancellations are supported.

The current time afforded by cond_break may not be enough for cases
where BPF programs want to implement locking algorithms inline, and use
cond_break as a promise to the verifier that they will eventually
terminate.

Below are some benchmarking numbers on the time taken per-iteration for
an empty loop that counts the number of iterations until cond_break
fires. For comparison, we compare it against bpf_for/bpf_repeat which is
another way to achieve the same number of spins (BPF_MAX_LOOPS). The
hardware used for benchmarking was a Sapphire Rapids Intel server with
performance governor enabled, mitigations were enabled.

+-----------------------------+--------------+--------------+------------------+
| Loop type | Iterations | Time (ms) | Time/iter (ns) |
+-----------------------------+--------------+--------------+------------------+
| may_goto | 8388608 | 3 | 0.36 |
| timed_may_goto (count=65535)| 589674932 | 250 | 0.42 |
| bpf_for | 8388608 | 10 | 1.19 |
+-----------------------------+--------------+--------------+------------------+

This gives a good approximation at low overhead while staying close to
the current implementation.

Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20250304003239.2390751-2-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>

authored by

Kumar Kartikeya Dwivedi and committed by
Alexei Starovoitov
e723608b 2941e215

+96 -8
+1
include/linux/bpf.h
··· 1987 1987 */ 1988 1988 enum { 1989 1989 BPF_MAX_LOOPS = 8 * 1024 * 1024, 1990 + BPF_MAX_TIMED_LOOPS = 0xffff, 1990 1991 }; 1991 1992 1992 1993 #define BPF_F_ACCESS_MASK (BPF_F_RDONLY | \
+8
include/linux/filter.h
··· 669 669 struct u64_stats_sync syncp; 670 670 } __aligned(2 * sizeof(u64)); 671 671 672 + struct bpf_timed_may_goto { 673 + u64 count; 674 + u64 timestamp; 675 + }; 676 + 672 677 struct sk_filter { 673 678 refcount_t refcnt; 674 679 struct rcu_head rcu; ··· 1135 1130 bool bpf_jit_supports_arena(void); 1136 1131 bool bpf_jit_supports_insn(struct bpf_insn *insn, bool in_arena); 1137 1132 bool bpf_jit_supports_private_stack(void); 1133 + bool bpf_jit_supports_timed_may_goto(void); 1138 1134 u64 bpf_arch_uaddress_limit(void); 1139 1135 void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie); 1136 + u64 arch_bpf_timed_may_goto(void); 1137 + u64 bpf_check_timed_may_goto(struct bpf_timed_may_goto *); 1140 1138 bool bpf_helper_changes_pkt_data(enum bpf_func_id func_id); 1141 1139 1142 1140 static inline bool bpf_dump_raw_ok(const struct cred *cred)
+26
kernel/bpf/core.c
··· 3069 3069 { 3070 3070 } 3071 3071 3072 + bool __weak bpf_jit_supports_timed_may_goto(void) 3073 + { 3074 + return false; 3075 + } 3076 + 3077 + u64 __weak arch_bpf_timed_may_goto(void) 3078 + { 3079 + return 0; 3080 + } 3081 + 3082 + u64 bpf_check_timed_may_goto(struct bpf_timed_may_goto *p) 3083 + { 3084 + u64 time = ktime_get_mono_fast_ns(); 3085 + 3086 + /* Populate the timestamp for this stack frame, and refresh count. */ 3087 + if (!p->timestamp) { 3088 + p->timestamp = time; 3089 + return BPF_MAX_TIMED_LOOPS; 3090 + } 3091 + /* Check if we've exhausted our time slice, and zero count. */ 3092 + if (time - p->timestamp >= (NSEC_PER_SEC / 4)) 3093 + return 0; 3094 + /* Refresh the count for the stack frame. */ 3095 + return BPF_MAX_TIMED_LOOPS; 3096 + } 3097 + 3072 3098 /* for configs without MMU or 32-bit */ 3073 3099 __weak const struct bpf_map_ops arena_map_ops; 3074 3100 __weak u64 bpf_arena_get_user_vm_start(struct bpf_arena *arena)
+61 -8
kernel/bpf/verifier.c
··· 21572 21572 goto next_insn; 21573 21573 } 21574 21574 21575 - if (is_may_goto_insn(insn)) { 21575 + if (is_may_goto_insn(insn) && bpf_jit_supports_timed_may_goto()) { 21576 + int stack_off_cnt = -stack_depth - 16; 21577 + 21578 + /* 21579 + * Two 8 byte slots, depth-16 stores the count, and 21580 + * depth-8 stores the start timestamp of the loop. 21581 + * 21582 + * The starting value of count is BPF_MAX_TIMED_LOOPS 21583 + * (0xffff). Every iteration loads it and subs it by 1, 21584 + * until the value becomes 0 in AX (thus, 1 in stack), 21585 + * after which we call arch_bpf_timed_may_goto, which 21586 + * either sets AX to 0xffff to keep looping, or to 0 21587 + * upon timeout. AX is then stored into the stack. In 21588 + * the next iteration, we either see 0 and break out, or 21589 + * continue iterating until the next time value is 0 21590 + * after subtraction, rinse and repeat. 21591 + */ 21592 + stack_depth_extra = 16; 21593 + insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_AX, BPF_REG_10, stack_off_cnt); 21594 + if (insn->off >= 0) 21595 + insn_buf[1] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_AX, 0, insn->off + 5); 21596 + else 21597 + insn_buf[1] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_AX, 0, insn->off - 1); 21598 + insn_buf[2] = BPF_ALU64_IMM(BPF_SUB, BPF_REG_AX, 1); 21599 + insn_buf[3] = BPF_JMP_IMM(BPF_JNE, BPF_REG_AX, 0, 2); 21600 + /* 21601 + * AX is used as an argument to pass in stack_off_cnt 21602 + * (to add to r10/fp), and also as the return value of 21603 + * the call to arch_bpf_timed_may_goto. 
21604 + */ 21605 + insn_buf[4] = BPF_MOV64_IMM(BPF_REG_AX, stack_off_cnt); 21606 + insn_buf[5] = BPF_EMIT_CALL(arch_bpf_timed_may_goto); 21607 + insn_buf[6] = BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_AX, stack_off_cnt); 21608 + cnt = 7; 21609 + 21610 + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); 21611 + if (!new_prog) 21612 + return -ENOMEM; 21613 + 21614 + delta += cnt - 1; 21615 + env->prog = prog = new_prog; 21616 + insn = new_prog->insnsi + i + delta; 21617 + goto next_insn; 21618 + } else if (is_may_goto_insn(insn)) { 21576 21619 int stack_off = -stack_depth - 8; 21577 21620 21578 21621 stack_depth_extra = 8; ··· 22156 22113 22157 22114 env->prog->aux->stack_depth = subprogs[0].stack_depth; 22158 22115 for (i = 0; i < env->subprog_cnt; i++) { 22116 + int delta = bpf_jit_supports_timed_may_goto() ? 2 : 1; 22159 22117 int subprog_start = subprogs[i].start; 22160 22118 int stack_slots = subprogs[i].stack_extra / 8; 22119 + int slots = delta, cnt = 0; 22161 22120 22162 22121 if (!stack_slots) 22163 22122 continue; 22164 - if (stack_slots > 1) { 22123 + /* We need two slots in case timed may_goto is supported. 
*/ 22124 + if (stack_slots > slots) { 22165 22125 verbose(env, "verifier bug: stack_slots supports may_goto only\n"); 22166 22126 return -EFAULT; 22167 22127 } 22168 22128 22169 - /* Add ST insn to subprog prologue to init extra stack */ 22170 - insn_buf[0] = BPF_ST_MEM(BPF_DW, BPF_REG_FP, 22171 - -subprogs[i].stack_depth, BPF_MAX_LOOPS); 22129 + stack_depth = subprogs[i].stack_depth; 22130 + if (bpf_jit_supports_timed_may_goto()) { 22131 + insn_buf[cnt++] = BPF_ST_MEM(BPF_DW, BPF_REG_FP, -stack_depth, 22132 + BPF_MAX_TIMED_LOOPS); 22133 + insn_buf[cnt++] = BPF_ST_MEM(BPF_DW, BPF_REG_FP, -stack_depth + 8, 0); 22134 + } else { 22135 + /* Add ST insn to subprog prologue to init extra stack */ 22136 + insn_buf[cnt++] = BPF_ST_MEM(BPF_DW, BPF_REG_FP, -stack_depth, 22137 + BPF_MAX_LOOPS); 22138 + } 22172 22139 /* Copy first actual insn to preserve it */ 22173 - insn_buf[1] = env->prog->insnsi[subprog_start]; 22140 + insn_buf[cnt++] = env->prog->insnsi[subprog_start]; 22174 22141 22175 - new_prog = bpf_patch_insn_data(env, subprog_start, insn_buf, 2); 22142 + new_prog = bpf_patch_insn_data(env, subprog_start, insn_buf, cnt); 22176 22143 if (!new_prog) 22177 22144 return -ENOMEM; 22178 22145 env->prog = prog = new_prog; ··· 22192 22139 * to insn after BPF_ST that inits may_goto count. 22193 22140 * Adjustment will succeed because bpf_patch_insn_data() didn't fail. 22194 22141 */ 22195 - WARN_ON(adjust_jmp_off(env->prog, subprog_start, 1)); 22142 + WARN_ON(adjust_jmp_off(env->prog, subprog_start, delta)); 22196 22143 } 22197 22144 22198 22145 /* Since poke tab is now finalized, publish aux to tracker. */