Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

bpf: teach verifier to recognize zero initialized stack

programs with function calls are often passing various
pointers via stack. When all calls are inlined llvm
flattens stack accesses and optimizes away extra branches.
When functions are not inlined, it becomes the job of
the verifier to recognize a zero-initialized stack to avoid
exploring paths that the program will not take.
The following program would fail otherwise:

ptr = &buffer_on_stack;
*ptr = 0;
...
func_call(.., ptr, ...) {
if (..)
*ptr = bpf_map_lookup();
}
...
if (*ptr != 0) {
// Access (*ptr)->field is valid.
// Without stack_zero tracking such (*ptr)->field access
// will be rejected
}

Since stack slots are no longer uniformly invalid | spill | misc,
add liveness marking to all slots, but do it in 8-byte chunks.
So if nothing was read or written in the [fp-16, fp-9] range,
it will be marked as LIVE_NONE.
If any byte in that range was read, it will be marked LIVE_READ
and the stacksafe() check will perform byte-by-byte verification.
If all bytes in the range were written, the slot will be
marked as LIVE_WRITTEN.
This significantly speeds up state equality comparison
and reduces total number of states processed.

before after
bpf_lb-DLB_L3.o 2051 2003
bpf_lb-DLB_L4.o 3287 3164
bpf_lb-DUNKNOWN.o 1080 1080
bpf_lxc-DDROP_ALL.o 24980 12361
bpf_lxc-DUNKNOWN.o 34308 16605
bpf_netdev.o 15404 10962
bpf_overlay.o 7191 6679

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>

authored by

Alexei Starovoitov and committed by
Daniel Borkmann
cc2b14d5 a7ff3eca

+103 -29
+2 -1
include/linux/bpf_verifier.h
··· 91 91 enum bpf_stack_slot_type { 92 92 STACK_INVALID, /* nothing was stored in this stack slot */ 93 93 STACK_SPILL, /* register spilled into stack */ 94 - STACK_MISC /* BPF program wrote some data into this slot */ 94 + STACK_MISC, /* BPF program wrote some data into this slot */ 95 + STACK_ZERO, /* BPF program wrote constant zero */ 95 96 }; 96 97 97 98 #define BPF_REG_SIZE 8 /* size of eBPF register in bytes */
+101 -28
kernel/bpf/verifier.c
··· 311 311 verbose(env, "=%s", 312 312 reg_type_str[state->stack[i].spilled_ptr.type]); 313 313 } 314 + if (state->stack[i].slot_type[0] == STACK_ZERO) 315 + verbose(env, " fp%d=0", (-i - 1) * BPF_REG_SIZE); 314 316 } 315 317 verbose(env, "\n"); 316 318 } ··· 522 520 static void __mark_reg_known_zero(struct bpf_reg_state *reg) 523 521 { 524 522 __mark_reg_known(reg, 0); 523 + } 524 + 525 + static void __mark_reg_const_zero(struct bpf_reg_state *reg) 526 + { 527 + __mark_reg_known(reg, 0); 528 + reg->off = 0; 529 + reg->type = SCALAR_VALUE; 525 530 } 526 531 527 532 static void mark_reg_known_zero(struct bpf_verifier_env *env, ··· 946 937 } 947 938 } 948 939 940 + /* Does this register contain a constant zero? */ 941 + static bool register_is_null(struct bpf_reg_state *reg) 942 + { 943 + return reg->type == SCALAR_VALUE && tnum_equals_const(reg->var_off, 0); 944 + } 945 + 949 946 /* check_stack_read/write functions track spill/fill of registers, 950 947 * stack boundary and alignment are checked in check_mem_access() 951 948 */ ··· 999 984 for (i = 0; i < BPF_REG_SIZE; i++) 1000 985 state->stack[spi].slot_type[i] = STACK_SPILL; 1001 986 } else { 987 + u8 type = STACK_MISC; 988 + 1002 989 /* regular write of data into stack */ 1003 990 state->stack[spi].spilled_ptr = (struct bpf_reg_state) {}; 1004 991 992 + /* only mark the slot as written if all 8 bytes were written 993 + * otherwise read propagation may incorrectly stop too soon 994 + * when stack slots are partially written. 
995 + * This heuristic means that read propagation will be 996 + * conservative, since it will add reg_live_read marks 997 + * to stack slots all the way to first state when programs 998 + * writes+reads less than 8 bytes 999 + */ 1000 + if (size == BPF_REG_SIZE) 1001 + state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN; 1002 + 1003 + /* when we zero initialize stack slots mark them as such */ 1004 + if (value_regno >= 0 && 1005 + register_is_null(&cur->regs[value_regno])) 1006 + type = STACK_ZERO; 1007 + 1005 1008 for (i = 0; i < size; i++) 1006 1009 state->stack[spi].slot_type[(slot - i) % BPF_REG_SIZE] = 1007 - STACK_MISC; 1010 + type; 1008 1011 } 1009 1012 return 0; 1010 1013 } ··· 1063 1030 bool writes = parent == state->parent; /* Observe write marks */ 1064 1031 1065 1032 while (parent) { 1033 + if (parent->frame[frameno]->allocated_stack <= slot * BPF_REG_SIZE) 1034 + /* since LIVE_WRITTEN mark is only done for full 8-byte 1035 + * write the read marks are conservative and parent 1036 + * state may not even have the stack allocated. In such case 1037 + * end the propagation, since the loop reached beginning 1038 + * of the function 1039 + */ 1040 + break; 1066 1041 /* if read wasn't screened by an earlier write ... 
*/ 1067 1042 if (writes && state->frame[frameno]->stack[slot].spilled_ptr.live & REG_LIVE_WRITTEN) 1068 1043 break; ··· 1118 1077 * which resets stack/reg liveness for state transitions 1119 1078 */ 1120 1079 state->regs[value_regno].live |= REG_LIVE_WRITTEN; 1121 - mark_stack_slot_read(env, vstate, vstate->parent, spi, 1122 - reg_state->frameno); 1123 1080 } 1081 + mark_stack_slot_read(env, vstate, vstate->parent, spi, 1082 + reg_state->frameno); 1124 1083 return 0; 1125 1084 } else { 1085 + int zeros = 0; 1086 + 1126 1087 for (i = 0; i < size; i++) { 1127 - if (stype[(slot - i) % BPF_REG_SIZE] != STACK_MISC) { 1128 - verbose(env, "invalid read from stack off %d+%d size %d\n", 1129 - off, i, size); 1130 - return -EACCES; 1088 + if (stype[(slot - i) % BPF_REG_SIZE] == STACK_MISC) 1089 + continue; 1090 + if (stype[(slot - i) % BPF_REG_SIZE] == STACK_ZERO) { 1091 + zeros++; 1092 + continue; 1131 1093 } 1094 + verbose(env, "invalid read from stack off %d+%d size %d\n", 1095 + off, i, size); 1096 + return -EACCES; 1132 1097 } 1133 - if (value_regno >= 0) 1134 - /* have read misc data from the stack */ 1135 - mark_reg_unknown(env, state->regs, value_regno); 1098 + mark_stack_slot_read(env, vstate, vstate->parent, spi, 1099 + reg_state->frameno); 1100 + if (value_regno >= 0) { 1101 + if (zeros == size) { 1102 + /* any size read into register is zero extended, 1103 + * so the whole register == const_zero 1104 + */ 1105 + __mark_reg_const_zero(&state->regs[value_regno]); 1106 + } else { 1107 + /* have read misc data from the stack */ 1108 + mark_reg_unknown(env, state->regs, value_regno); 1109 + } 1110 + state->regs[value_regno].live |= REG_LIVE_WRITTEN; 1111 + } 1136 1112 return 0; 1137 1113 } 1138 1114 } ··· 1636 1578 BPF_SIZE(insn->code), BPF_WRITE, -1); 1637 1579 } 1638 1580 1639 - /* Does this register contain a constant zero? 
*/ 1640 - static bool register_is_null(struct bpf_reg_state *reg) 1641 - { 1642 - return reg->type == SCALAR_VALUE && tnum_equals_const(reg->var_off, 0); 1643 - } 1644 - 1645 1581 /* when register 'regno' is passed into function that will read 'access_size' 1646 1582 * bytes from that pointer, make sure that it's within stack boundary 1647 1583 * and all elements of stack are initialized. ··· 1685 1633 } 1686 1634 1687 1635 for (i = 0; i < access_size; i++) { 1636 + u8 *stype; 1637 + 1688 1638 slot = -(off + i) - 1; 1689 1639 spi = slot / BPF_REG_SIZE; 1690 - if (state->allocated_stack <= slot || 1691 - state->stack[spi].slot_type[slot % BPF_REG_SIZE] != 1692 - STACK_MISC) { 1693 - verbose(env, "invalid indirect read from stack off %d+%d size %d\n", 1694 - off, i, access_size); 1695 - return -EACCES; 1640 + if (state->allocated_stack <= slot) 1641 + goto err; 1642 + stype = &state->stack[spi].slot_type[slot % BPF_REG_SIZE]; 1643 + if (*stype == STACK_MISC) 1644 + goto mark; 1645 + if (*stype == STACK_ZERO) { 1646 + /* helper can write anything into the stack */ 1647 + *stype = STACK_MISC; 1648 + goto mark; 1696 1649 } 1650 + err: 1651 + verbose(env, "invalid indirect read from stack off %d+%d size %d\n", 1652 + off, i, access_size); 1653 + return -EACCES; 1654 + mark: 1655 + /* reading any byte out of 8-byte 'spill_slot' will cause 1656 + * the whole slot to be marked as 'read' 1657 + */ 1658 + mark_stack_slot_read(env, env->cur_state, env->cur_state->parent, 1659 + spi, state->frameno); 1697 1660 } 1698 1661 return update_stack_depth(env, state, off); 1699 1662 } ··· 4089 4022 for (i = 0; i < old->allocated_stack; i++) { 4090 4023 spi = i / BPF_REG_SIZE; 4091 4024 4025 + if (!(old->stack[spi].spilled_ptr.live & REG_LIVE_READ)) 4026 + /* explored state didn't use this */ 4027 + return true; 4028 + 4092 4029 if (old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_INVALID) 4030 + continue; 4031 + /* if old state was safe with misc data in the stack 4032 + * it will 
be safe with zero-initialized stack. 4033 + * The opposite is not true 4034 + */ 4035 + if (old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_MISC && 4036 + cur->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_ZERO) 4093 4037 continue; 4094 4038 if (old->stack[spi].slot_type[i % BPF_REG_SIZE] != 4095 4039 cur->stack[spi].slot_type[i % BPF_REG_SIZE]) ··· 4242 4164 parent = vparent->frame[frame]; 4243 4165 for (i = 0; i < state->allocated_stack / BPF_REG_SIZE && 4244 4166 i < parent->allocated_stack / BPF_REG_SIZE; i++) { 4245 - if (parent->stack[i].slot_type[0] != STACK_SPILL) 4246 - continue; 4247 - if (state->stack[i].slot_type[0] != STACK_SPILL) 4248 - continue; 4249 4167 if (parent->stack[i].spilled_ptr.live & REG_LIVE_READ) 4250 4168 continue; 4251 4169 if (state->stack[i].spilled_ptr.live & REG_LIVE_READ) ··· 4321 4247 struct bpf_func_state *frame = cur->frame[j]; 4322 4248 4323 4249 for (i = 0; i < frame->allocated_stack / BPF_REG_SIZE; i++) 4324 - if (frame->stack[i].slot_type[0] == STACK_SPILL) 4325 - frame->stack[i].spilled_ptr.live = REG_LIVE_NONE; 4250 + frame->stack[i].spilled_ptr.live = REG_LIVE_NONE; 4326 4251 } 4327 4252 return 0; 4328 4253 }