Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

bpf: Fix accesses to uninit stack slots

Privileged programs are supposed to be able to read uninitialized stack
memory (ever since 6715df8d5) but, before this patch, these accesses
were permitted inconsistently. In particular, accesses were permitted
above state->allocated_stack, but not below it. In other words, if the
stack was already "large enough", the access was permitted, but
otherwise the access was rejected instead of being allowed to "grow the
stack". This undesired rejection was happening in two places:
- in check_stack_slot_within_bounds()
- in check_stack_range_initialized()
This patch arranges for these accesses to be permitted. A bunch of tests
that were relying on the old rejection had to change; all of them were
changed to also run unprivileged, in which case the old behavior
persists. One test couldn't be updated - global_func16 - because it
can't run unprivileged for other reasons.

This patch also fixes the tracking of the stack size for variable-offset
reads. This second fix is bundled in the same commit as the first one
because they're inter-related. Before this patch, writes to the stack
using registers containing a variable offset (as opposed to registers
with fixed, known values) were not properly contributing to the
function's needed stack size. As a result, it was possible for a program
to verify, but then to attempt to read out-of-bounds data at runtime
because a too small stack had been allocated for it.

Each function tracks the size of the stack it needs in
bpf_subprog_info.stack_depth, which is maintained by
update_stack_depth(). For regular memory accesses, check_mem_access()
was calling update_stack_depth() but it was passing in only the fixed
part of the offset register, ignoring the variable offset. This was
incorrect; the minimum possible value of that register should be used
instead.

This tracking is now fixed by centralizing the tracking of stack size in
grow_stack_state(), and by lifting the calls to grow_stack_state() to
check_stack_access_within_bounds() as suggested by Andrii. The code is
now simpler and more convincingly tracks the correct maximum stack size.
check_stack_range_initialized() can now rely on enough stack having been
allocated for the access; this helps with the fix for the first issue.

A few tests were changed to also check the stack depth computation. The
one that fails without this patch is verifier_var_off:stack_write_priv_vs_unpriv.

Fixes: 01f810ace9ed3 ("bpf: Allow variable-offset stack access")
Reported-by: Hao Sun <sunhao.th@gmail.com>
Signed-off-by: Andrei Matei <andreimatei1@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20231208032519.260451-3-andreimatei1@gmail.com

Closes: https://lore.kernel.org/bpf/CABWLsev9g8UP_c3a=1qbuZUi20tGoUXoU07FPf-5FLvhOKOY+Q@mail.gmail.com/

authored by

Andrei Matei and committed by
Andrii Nakryiko
6b4a64ba 92e1567e

+92 -72
+26 -39
kernel/bpf/verifier.c
··· 1259 1259 return 0; 1260 1260 } 1261 1261 1262 - static int grow_stack_state(struct bpf_func_state *state, int size) 1262 + /* Possibly update state->allocated_stack to be at least size bytes. Also 1263 + * possibly update the function's high-water mark in its bpf_subprog_info. 1264 + */ 1265 + static int grow_stack_state(struct bpf_verifier_env *env, struct bpf_func_state *state, int size) 1263 1266 { 1264 1267 size_t old_n = state->allocated_stack / BPF_REG_SIZE, n = size / BPF_REG_SIZE; 1265 1268 ··· 1274 1271 return -ENOMEM; 1275 1272 1276 1273 state->allocated_stack = size; 1274 + 1275 + /* update known max for given subprogram */ 1276 + if (env->subprog_info[state->subprogno].stack_depth < size) 1277 + env->subprog_info[state->subprogno].stack_depth = size; 1278 + 1277 1279 return 0; 1278 1280 } 1279 1281 ··· 4448 4440 struct bpf_reg_state *reg = NULL; 4449 4441 int insn_flags = insn_stack_access_flags(state->frameno, spi); 4450 4442 4451 - err = grow_stack_state(state, round_up(slot + 1, BPF_REG_SIZE)); 4452 - if (err) 4453 - return err; 4454 4443 /* caller checked that off % size == 0 and -MAX_BPF_STACK <= off < 0, 4455 4444 * so it's aligned access and [off, off + size) are within stack limits 4456 4445 */ ··· 4599 4594 if ((value_reg && register_is_null(value_reg)) || 4600 4595 (!value_reg && is_bpf_st_mem(insn) && insn->imm == 0)) 4601 4596 writing_zero = true; 4602 - 4603 - err = grow_stack_state(state, round_up(-min_off, BPF_REG_SIZE)); 4604 - if (err) 4605 - return err; 4606 4597 4607 4598 for (i = min_off; i < max_off; i++) { 4608 4599 int spi; ··· 5775 5774 strict); 5776 5775 } 5777 5776 5778 - static int update_stack_depth(struct bpf_verifier_env *env, 5779 - const struct bpf_func_state *func, 5780 - int off) 5781 - { 5782 - u16 stack = env->subprog_info[func->subprogno].stack_depth; 5783 - 5784 - if (stack >= -off) 5785 - return 0; 5786 - 5787 - /* update known max for given subprogram */ 5788 - env->subprog_info[func->subprogno].stack_depth = 
-off; 5789 - return 0; 5790 - } 5791 - 5792 5777 /* starting from main bpf function walk all instructions of the function 5793 5778 * and recursively walk all callees that given function can call. 5794 5779 * Ignore jump and exit insns. ··· 6564 6577 * The minimum valid offset is -MAX_BPF_STACK for writes, and 6565 6578 * -state->allocated_stack for reads. 6566 6579 */ 6567 - static int check_stack_slot_within_bounds(s64 off, 6568 - struct bpf_func_state *state, 6569 - enum bpf_access_type t) 6580 + static int check_stack_slot_within_bounds(struct bpf_verifier_env *env, 6581 + s64 off, 6582 + struct bpf_func_state *state, 6583 + enum bpf_access_type t) 6570 6584 { 6571 6585 int min_valid_off; 6572 6586 6573 - if (t == BPF_WRITE) 6587 + if (t == BPF_WRITE || env->allow_uninit_stack) 6574 6588 min_valid_off = -MAX_BPF_STACK; 6575 6589 else 6576 6590 min_valid_off = -state->allocated_stack; ··· 6620 6632 max_off = reg->smax_value + off + access_size; 6621 6633 } 6622 6634 6623 - err = check_stack_slot_within_bounds(min_off, state, type); 6635 + err = check_stack_slot_within_bounds(env, min_off, state, type); 6624 6636 if (!err && max_off > 0) 6625 6637 err = -EINVAL; /* out of stack access into non-negative offsets */ 6626 6638 ··· 6635 6647 verbose(env, "invalid variable-offset%s stack R%d var_off=%s off=%d size=%d\n", 6636 6648 err_extra, regno, tn_buf, off, access_size); 6637 6649 } 6650 + return err; 6638 6651 } 6639 - return err; 6652 + 6653 + return grow_stack_state(env, state, round_up(-min_off, BPF_REG_SIZE)); 6640 6654 } 6641 6655 6642 6656 /* check whether memory at (regno + off) is accessible for t = (read | write) ··· 6653 6663 { 6654 6664 struct bpf_reg_state *regs = cur_regs(env); 6655 6665 struct bpf_reg_state *reg = regs + regno; 6656 - struct bpf_func_state *state; 6657 6666 int size, err = 0; 6658 6667 6659 6668 size = bpf_size_to_bytes(bpf_size); ··· 6792 6803 } else if (reg->type == PTR_TO_STACK) { 6793 6804 /* Basic bounds checks. 
*/ 6794 6805 err = check_stack_access_within_bounds(env, regno, off, size, ACCESS_DIRECT, t); 6795 - if (err) 6796 - return err; 6797 - 6798 - state = func(env, reg); 6799 - err = update_stack_depth(env, state, off); 6800 6806 if (err) 6801 6807 return err; 6802 6808 ··· 6988 7004 6989 7005 /* When register 'regno' is used to read the stack (either directly or through 6990 7006 * a helper function) make sure that it's within stack boundary and, depending 6991 - * on the access type, that all elements of the stack are initialized. 7007 + * on the access type and privileges, that all elements of the stack are 7008 + * initialized. 6992 7009 * 6993 7010 * 'off' includes 'regno->off', but not its dynamic part (if any). 6994 7011 * ··· 7097 7112 7098 7113 slot = -i - 1; 7099 7114 spi = slot / BPF_REG_SIZE; 7100 - if (state->allocated_stack <= slot) 7101 - goto err; 7115 + if (state->allocated_stack <= slot) { 7116 + verbose(env, "verifier bug: allocated_stack too small"); 7117 + return -EFAULT; 7118 + } 7119 + 7102 7120 stype = &state->stack[spi].slot_type[slot % BPF_REG_SIZE]; 7103 7121 if (*stype == STACK_MISC) 7104 7122 goto mark; ··· 7125 7137 goto mark; 7126 7138 } 7127 7139 7128 - err: 7129 7140 if (tnum_is_const(reg->var_off)) { 7130 7141 verbose(env, "invalid%s read from stack R%d off %d+%d size %d\n", 7131 7142 err_extra, regno, min_off, i - min_off, access_size); ··· 7149 7162 * helper may write to the entire memory range. 7150 7163 */ 7151 7164 } 7152 - return update_stack_depth(env, state, min_off); 7165 + return 0; 7153 7166 } 7154 7167 7155 7168 static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
+1 -1
tools/testing/selftests/bpf/progs/iters.c
··· 846 846 "call %[bpf_iter_num_next];" 847 847 "if r0 == 0 goto 2f;" 848 848 "if r6 != 42 goto 3f;" 849 - "r7 = -32;" 849 + "r7 = -33;" 850 850 "call %[bpf_get_prandom_u32];" 851 851 "r6 = r0;" 852 852 "goto 1b;\n"
+1 -1
tools/testing/selftests/bpf/progs/test_global_func16.c
··· 13 13 } 14 14 15 15 SEC("cgroup_skb/ingress") 16 - __failure __msg("invalid indirect read from stack") 16 + __success 17 17 int global_func16(struct __sk_buff *skb) 18 18 { 19 19 int array[10];
+4 -4
tools/testing/selftests/bpf/progs/verifier_basic_stack.c
··· 27 27 28 28 SEC("socket") 29 29 __description("uninitialized stack1") 30 - __failure __msg("invalid indirect read from stack") 31 - __failure_unpriv 30 + __success __log_level(4) __msg("stack depth 8") 31 + __failure_unpriv __msg_unpriv("invalid indirect read from stack") 32 32 __naked void uninitialized_stack1(void) 33 33 { 34 34 asm volatile (" \ ··· 45 45 46 46 SEC("socket") 47 47 __description("uninitialized stack2") 48 - __failure __msg("invalid read from stack") 49 - __failure_unpriv 48 + __success __log_level(4) __msg("stack depth 8") 49 + __failure_unpriv __msg_unpriv("invalid read from stack") 50 50 __naked void uninitialized_stack2(void) 51 51 { 52 52 asm volatile (" \
+3 -2
tools/testing/selftests/bpf/progs/verifier_int_ptr.c
··· 5 5 #include <bpf/bpf_helpers.h> 6 6 #include "bpf_misc.h" 7 7 8 - SEC("cgroup/sysctl") 8 + SEC("socket") 9 9 __description("ARG_PTR_TO_LONG uninitialized") 10 - __failure __msg("invalid indirect read from stack R4 off -16+0 size 8") 10 + __success 11 + __failure_unpriv __msg_unpriv("invalid indirect read from stack R4 off -16+0 size 8") 11 12 __naked void arg_ptr_to_long_uninitialized(void) 12 13 { 13 14 asm volatile (" \
+3 -2
tools/testing/selftests/bpf/progs/verifier_raw_stack.c
··· 5 5 #include <bpf/bpf_helpers.h> 6 6 #include "bpf_misc.h" 7 7 8 - SEC("tc") 8 + SEC("socket") 9 9 __description("raw_stack: no skb_load_bytes") 10 - __failure __msg("invalid read from stack R6 off=-8 size=8") 10 + __success 11 + __failure_unpriv __msg_unpriv("invalid read from stack R6 off=-8 size=8") 11 12 __naked void stack_no_skb_load_bytes(void) 12 13 { 13 14 asm volatile (" \
+51 -11
tools/testing/selftests/bpf/progs/verifier_var_off.c
··· 59 59 " ::: __clobber_all); 60 60 } 61 61 62 - SEC("lwt_in") 62 + SEC("cgroup/skb") 63 63 __description("variable-offset stack read, uninitialized") 64 - __failure __msg("invalid variable-offset read from stack R2") 64 + __success 65 + __failure_unpriv __msg_unpriv("R2 variable stack access prohibited for !root") 65 66 __naked void variable_offset_stack_read_uninitialized(void) 66 67 { 67 68 asm volatile (" \ ··· 84 83 85 84 SEC("socket") 86 85 __description("variable-offset stack write, priv vs unpriv") 87 - __success __failure_unpriv 86 + __success 87 + /* Check that the maximum stack depth is correctly maintained according to the 88 + * maximum possible variable offset. 89 + */ 90 + __log_level(4) __msg("stack depth 16") 91 + __failure_unpriv 88 92 /* Variable stack access is rejected for unprivileged. 89 93 */ 90 94 __msg_unpriv("R2 variable stack access prohibited for !root") 91 95 __retval(0) 92 96 __naked void stack_write_priv_vs_unpriv(void) 97 + { 98 + asm volatile (" \ 99 + /* Get an unknown value */ \ 100 + r2 = *(u32*)(r1 + 0); \ 101 + /* Make it small and 8-byte aligned */ \ 102 + r2 &= 8; \ 103 + r2 -= 16; \ 104 + /* Add it to fp. We now have either fp-8 or \ 105 + * fp-16, but we don't know which \ 106 + */ \ 107 + r2 += r10; \ 108 + /* Dereference it for a stack write */ \ 109 + r0 = 0; \ 110 + *(u64*)(r2 + 0) = r0; \ 111 + exit; \ 112 + " ::: __clobber_all); 113 + } 114 + 115 + /* Similar to the previous test, but this time also perform a read from the 116 + * address written to with a variable offset. The read is allowed, showing that, 117 + * after a variable-offset write, a priviledged program can read the slots that 118 + * were in the range of that write (even if the verifier doesn't actually know if 119 + * the slot being read was really written to or not. 120 + * 121 + * Despite this test being mostly a superset, the previous test is also kept for 122 + * the sake of it checking the stack depth in the case where there is no read. 
123 + */ 124 + SEC("socket") 125 + __description("variable-offset stack write followed by read") 126 + __success 127 + /* Check that the maximum stack depth is correctly maintained according to the 128 + * maximum possible variable offset. 129 + */ 130 + __log_level(4) __msg("stack depth 16") 131 + __failure_unpriv 132 + __msg_unpriv("R2 variable stack access prohibited for !root") 133 + __retval(0) 134 + __naked void stack_write_followed_by_read(void) 93 135 { 94 136 asm volatile (" \ 95 137 /* Get an unknown value */ \ ··· 147 103 /* Dereference it for a stack write */ \ 148 104 r0 = 0; \ 149 105 *(u64*)(r2 + 0) = r0; \ 150 - /* Now read from the address we just wrote. This shows\ 151 - * that, after a variable-offset write, a priviledged\ 152 - * program can read the slots that were in the range of\ 153 - * that write (even if the verifier doesn't actually know\ 154 - * if the slot being read was really written to or not.\ 155 - */ \ 106 + /* Now read from the address we just wrote. */ \ 156 107 r3 = *(u64*)(r2 + 0); \ 157 108 r0 = 0; \ 158 109 exit; \ ··· 321 282 : __clobber_all); 322 283 } 323 284 324 - SEC("lwt_in") 285 + SEC("cgroup/skb") 325 286 __description("indirect variable-offset stack access, min_off < min_initialized") 326 - __failure __msg("invalid indirect read from stack R2 var_off") 287 + __success 288 + __failure_unpriv __msg_unpriv("R2 variable stack access prohibited for !root") 327 289 __naked void access_min_off_min_initialized(void) 328 290 { 329 291 asm volatile (" \
-11
tools/testing/selftests/bpf/verifier/atomic_cmpxchg.c
··· 84 84 .errstr = "!read_ok", 85 85 }, 86 86 { 87 - "Can't use cmpxchg on uninit memory", 88 - .insns = { 89 - BPF_MOV64_IMM(BPF_REG_0, 3), 90 - BPF_MOV64_IMM(BPF_REG_2, 4), 91 - BPF_ATOMIC_OP(BPF_DW, BPF_CMPXCHG, BPF_REG_10, BPF_REG_2, -8), 92 - BPF_EXIT_INSN(), 93 - }, 94 - .result = REJECT, 95 - .errstr = "invalid read from stack", 96 - }, 97 - { 98 87 "BPF_W cmpxchg should zero top 32 bits", 99 88 .insns = { 100 89 /* r0 = U64_MAX; */
+3 -1
tools/testing/selftests/bpf/verifier/calls.c
··· 1505 1505 .prog_type = BPF_PROG_TYPE_XDP, 1506 1506 .fixup_map_hash_8b = { 23 }, 1507 1507 .result = REJECT, 1508 - .errstr = "invalid read from stack R7 off=-16 size=8", 1508 + .errstr = "R0 invalid mem access 'scalar'", 1509 + .result_unpriv = REJECT, 1510 + .errstr_unpriv = "invalid read from stack R7 off=-16 size=8", 1509 1511 }, 1510 1512 { 1511 1513 "calls: two calls that receive map_value via arg=ptr_stack_of_caller. test1",