Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

bpf: Fix prog_array_map_poke_run map poke update

Lee pointed out an issue found by syzkaller [0] hitting a BUG in the prog
array map poke update in the prog_array_map_poke_run function, due to an
error value returned from the bpf_arch_text_poke function.

There's a race window where bpf_arch_text_poke can fail due to missing
bpf program kallsym symbols, which is accounted for with the check for
-EINVAL in that BUG_ON call.

The problem is that in such a case we won't update the tail call jump,
which causes an imbalance for the next tail call update check, which
will then fail with -EBUSY in bpf_arch_text_poke.

I'm hitting the following race during the program load:

CPU 0 CPU 1

bpf_prog_load
bpf_check
do_misc_fixups
prog_array_map_poke_track

map_update_elem
bpf_fd_array_map_update_elem
prog_array_map_poke_run

bpf_arch_text_poke returns -EINVAL

bpf_prog_kallsyms_add

After bpf_arch_text_poke (CPU 1) fails to update the tail call jump, the next
poke update fails on expected jump instruction check in bpf_arch_text_poke
with -EBUSY and triggers the BUG_ON in prog_array_map_poke_run.

Similar race exists on the program unload.

Fix this by moving the update to a new bpf_arch_poke_desc_update function,
which makes sure we call __bpf_arch_text_poke that skips the bpf address check.

Each architecture has a slightly different approach wrt looking up the bpf
address in bpf_arch_text_poke, so instead of splitting the function or adding
a new 'checkip' argument as in the previous version, it seems best to move the
whole map_poke_run update into arch-specific code.

[0] https://syzkaller.appspot.com/bug?extid=97a4fe20470e9bc30810

Fixes: ebf7d1f508a7 ("bpf, x64: rework pro/epilogue and tailcall handling in JIT")
Reported-by: syzbot+97a4fe20470e9bc30810@syzkaller.appspotmail.com
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Yonghong Song <yonghong.song@linux.dev>
Cc: Lee Jones <lee@kernel.org>
Cc: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Link: https://lore.kernel.org/bpf/20231206083041.1306660-2-jolsa@kernel.org

authored by

Jiri Olsa and committed by
Daniel Borkmann
4b7de801 e4d008d4

+59 -48
+46
arch/x86/net/bpf_jit_comp.c
··· 3025 3025 #endif 3026 3026 WARN(1, "verification of programs using bpf_throw should have failed\n"); 3027 3027 } 3028 + 3029 + void bpf_arch_poke_desc_update(struct bpf_jit_poke_descriptor *poke, 3030 + struct bpf_prog *new, struct bpf_prog *old) 3031 + { 3032 + u8 *old_addr, *new_addr, *old_bypass_addr; 3033 + int ret; 3034 + 3035 + old_bypass_addr = old ? NULL : poke->bypass_addr; 3036 + old_addr = old ? (u8 *)old->bpf_func + poke->adj_off : NULL; 3037 + new_addr = new ? (u8 *)new->bpf_func + poke->adj_off : NULL; 3038 + 3039 + /* 3040 + * On program loading or teardown, the program's kallsym entry 3041 + * might not be in place, so we use __bpf_arch_text_poke to skip 3042 + * the kallsyms check. 3043 + */ 3044 + if (new) { 3045 + ret = __bpf_arch_text_poke(poke->tailcall_target, 3046 + BPF_MOD_JUMP, 3047 + old_addr, new_addr); 3048 + BUG_ON(ret < 0); 3049 + if (!old) { 3050 + ret = __bpf_arch_text_poke(poke->tailcall_bypass, 3051 + BPF_MOD_JUMP, 3052 + poke->bypass_addr, 3053 + NULL); 3054 + BUG_ON(ret < 0); 3055 + } 3056 + } else { 3057 + ret = __bpf_arch_text_poke(poke->tailcall_bypass, 3058 + BPF_MOD_JUMP, 3059 + old_bypass_addr, 3060 + poke->bypass_addr); 3061 + BUG_ON(ret < 0); 3062 + /* let other CPUs finish the execution of program 3063 + * so that it will not possible to expose them 3064 + * to invalid nop, stack unwind, nop state 3065 + */ 3066 + if (!ret) 3067 + synchronize_rcu(); 3068 + ret = __bpf_arch_text_poke(poke->tailcall_target, 3069 + BPF_MOD_JUMP, 3070 + old_addr, NULL); 3071 + BUG_ON(ret < 0); 3072 + } 3073 + }
+3
include/linux/bpf.h
··· 3175 3175 int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, 3176 3176 void *addr1, void *addr2); 3177 3177 3178 + void bpf_arch_poke_desc_update(struct bpf_jit_poke_descriptor *poke, 3179 + struct bpf_prog *new, struct bpf_prog *old); 3180 + 3178 3181 void *bpf_arch_text_copy(void *dst, void *src, size_t len); 3179 3182 int bpf_arch_text_invalidate(void *dst, size_t len); 3180 3183
+10 -48
kernel/bpf/arraymap.c
··· 1012 1012 mutex_unlock(&aux->poke_mutex); 1013 1013 } 1014 1014 1015 + void __weak bpf_arch_poke_desc_update(struct bpf_jit_poke_descriptor *poke, 1016 + struct bpf_prog *new, struct bpf_prog *old) 1017 + { 1018 + WARN_ON_ONCE(1); 1019 + } 1020 + 1015 1021 static void prog_array_map_poke_run(struct bpf_map *map, u32 key, 1016 1022 struct bpf_prog *old, 1017 1023 struct bpf_prog *new) 1018 1024 { 1019 - u8 *old_addr, *new_addr, *old_bypass_addr; 1020 1025 struct prog_poke_elem *elem; 1021 1026 struct bpf_array_aux *aux; 1022 1027 ··· 1030 1025 1031 1026 list_for_each_entry(elem, &aux->poke_progs, list) { 1032 1027 struct bpf_jit_poke_descriptor *poke; 1033 - int i, ret; 1028 + int i; 1034 1029 1035 1030 for (i = 0; i < elem->aux->size_poke_tab; i++) { 1036 1031 poke = &elem->aux->poke_tab[i]; ··· 1049 1044 * activated, so tail call updates can arrive from here 1050 1045 * while JIT is still finishing its final fixup for 1051 1046 * non-activated poke entries. 1052 - * 3) On program teardown, the program's kallsym entry gets 1053 - * removed out of RCU callback, but we can only untrack 1054 - * from sleepable context, therefore bpf_arch_text_poke() 1055 - * might not see that this is in BPF text section and 1056 - * bails out with -EINVAL. As these are unreachable since 1057 - * RCU grace period already passed, we simply skip them. 1058 - * 4) Also programs reaching refcount of zero while patching 1047 + * 3) Also programs reaching refcount of zero while patching 1059 1048 * is in progress is okay since we're protected under 1060 1049 * poke_mutex and untrack the programs before the JIT 1061 - * buffer is freed. When we're still in the middle of 1062 - * patching and suddenly kallsyms entry of the program 1063 - * gets evicted, we just skip the rest which is fine due 1064 - * to point 3). 1065 - * 5) Any other error happening below from bpf_arch_text_poke() 1066 - * is a unexpected bug. 1050 + * buffer is freed. 
1067 1051 */ 1068 1052 if (!READ_ONCE(poke->tailcall_target_stable)) 1069 1053 continue; ··· 1062 1068 poke->tail_call.key != key) 1063 1069 continue; 1064 1070 1065 - old_bypass_addr = old ? NULL : poke->bypass_addr; 1066 - old_addr = old ? (u8 *)old->bpf_func + poke->adj_off : NULL; 1067 - new_addr = new ? (u8 *)new->bpf_func + poke->adj_off : NULL; 1068 - 1069 - if (new) { 1070 - ret = bpf_arch_text_poke(poke->tailcall_target, 1071 - BPF_MOD_JUMP, 1072 - old_addr, new_addr); 1073 - BUG_ON(ret < 0 && ret != -EINVAL); 1074 - if (!old) { 1075 - ret = bpf_arch_text_poke(poke->tailcall_bypass, 1076 - BPF_MOD_JUMP, 1077 - poke->bypass_addr, 1078 - NULL); 1079 - BUG_ON(ret < 0 && ret != -EINVAL); 1080 - } 1081 - } else { 1082 - ret = bpf_arch_text_poke(poke->tailcall_bypass, 1083 - BPF_MOD_JUMP, 1084 - old_bypass_addr, 1085 - poke->bypass_addr); 1086 - BUG_ON(ret < 0 && ret != -EINVAL); 1087 - /* let other CPUs finish the execution of program 1088 - * so that it will not possible to expose them 1089 - * to invalid nop, stack unwind, nop state 1090 - */ 1091 - if (!ret) 1092 - synchronize_rcu(); 1093 - ret = bpf_arch_text_poke(poke->tailcall_target, 1094 - BPF_MOD_JUMP, 1095 - old_addr, NULL); 1096 - BUG_ON(ret < 0 && ret != -EINVAL); 1097 - } 1071 + bpf_arch_poke_desc_update(poke, new, old); 1098 1072 } 1099 1073 } 1100 1074 }