Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

bpf: fix redirect to map under tail calls

Commits 109980b894e9 ("bpf: don't select potentially stale ri->map
from buggy xdp progs") and 7c3001313396 ("bpf: fix ri->map_owner
pointer on bpf_prog_realloc") tried to mitigate that buggy programs
using bpf_redirect_map() helper call do not leave stale maps behind.
Idea was to add a map_owner cookie into the per CPU struct redirect_info
which was set to prog->aux by the prog making the helper call as a
proof that the map is not stale since the prog is implicitly holding
a reference to it. This owner cookie could later on get compared with
the program calling into BPF whether they match and therefore the
redirect could proceed with processing the map safely.

In (obvious) hindsight, this approach breaks down when tail calls are
involved since the original caller's prog->aux pointer does not have
to match the one from one of the progs out of the tail call chain,
and therefore the xdp buffer will be dropped instead of redirected.
A way around that would be to fix the issue differently (which also
allows to remove related work in fast path at the same time): once
the lifetime of a redirect map has come to its end we use its map
free callback where we need to wait on synchronize_rcu() for current
outstanding xdp buffers and remove such a map pointer from the
redirect info if found to be present. At that time no program is
using this map anymore so we simply invalidate the map pointers to
NULL iff they previously pointed to that instance while making sure
that the redirect path only reads out the map once.

Fixes: 97f91a7cf04f ("bpf: add bpf_redirect_map helper routine")
Fixes: 109980b894e9 ("bpf: don't select potentially stale ri->map from buggy xdp progs")
Reported-by: Sebastiano Miano <sebastiano.miano@polito.it>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>

authored by

Daniel Borkmann and committed by
Alexei Starovoitov
f6069b9a a85da34e

+38 -63
+2 -1
include/linux/filter.h
··· 543 543 u32 flags; 544 544 struct bpf_map *map; 545 545 struct bpf_map *map_to_flush; 546 - unsigned long map_owner; 547 546 u32 kern_flags; 548 547 }; 549 548 ··· 779 780 780 781 struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, 781 782 const struct bpf_insn *patch, u32 len); 783 + 784 + void bpf_clear_redirect_map(struct bpf_map *map); 782 785 783 786 static inline bool xdp_return_frame_no_direct(void) 784 787 {
+2 -3
include/trace/events/xdp.h
··· 147 147 148 148 #define devmap_ifindex(fwd, map) \ 149 149 (!fwd ? 0 : \ 150 - (!map ? 0 : \ 151 - ((map->map_type == BPF_MAP_TYPE_DEVMAP) ? \ 152 - ((struct _bpf_dtab_netdev *)fwd)->dev->ifindex : 0))) 150 + ((map->map_type == BPF_MAP_TYPE_DEVMAP) ? \ 151 + ((struct _bpf_dtab_netdev *)fwd)->dev->ifindex : 0)) 153 152 154 153 #define _trace_xdp_redirect_map(dev, xdp, fwd, map, idx) \ 155 154 trace_xdp_redirect_map(dev, xdp, devmap_ifindex(fwd, map), \
+2
kernel/bpf/cpumap.c
··· 479 479 * It does __not__ ensure pending flush operations (if any) are 480 480 * complete. 481 481 */ 482 + 483 + bpf_clear_redirect_map(map); 482 484 synchronize_rcu(); 483 485 484 486 /* To ensure all pending flush operations have completed wait for flush
+1
kernel/bpf/devmap.c
··· 161 161 list_del_rcu(&dtab->list); 162 162 spin_unlock(&dev_map_lock); 163 163 164 + bpf_clear_redirect_map(map); 164 165 synchronize_rcu(); 165 166 166 167 /* To ensure all pending flush operations have completed wait for flush
-21
kernel/bpf/verifier.c
··· 5844 5844 goto patch_call_imm; 5845 5845 } 5846 5846 5847 - if (insn->imm == BPF_FUNC_redirect_map) { 5848 - /* Note, we cannot use prog directly as imm as subsequent 5849 - * rewrites would still change the prog pointer. The only 5850 - * stable address we can use is aux, which also works with 5851 - * prog clones during blinding. 5852 - */ 5853 - u64 addr = (unsigned long)prog->aux; 5854 - struct bpf_insn r4_ld[] = { 5855 - BPF_LD_IMM64(BPF_REG_4, addr), 5856 - *insn, 5857 - }; 5858 - cnt = ARRAY_SIZE(r4_ld); 5859 - 5860 - new_prog = bpf_patch_insn_data(env, i + delta, r4_ld, cnt); 5861 - if (!new_prog) 5862 - return -ENOMEM; 5863 - 5864 - delta += cnt - 1; 5865 - env->prog = prog = new_prog; 5866 - insn = new_prog->insnsi + i + delta; 5867 - } 5868 5847 patch_call_imm: 5869 5848 fn = env->ops->get_func_proto(insn->imm, env->prog); 5870 5849 /* all functions that have prototype and verifier allowed
+1
kernel/bpf/xskmap.c
··· 75 75 struct xsk_map *m = container_of(map, struct xsk_map, map); 76 76 int i; 77 77 78 + bpf_clear_redirect_map(map); 78 79 synchronize_net(); 79 80 80 81 for (i = 0; i < map->max_entries; i++) {
+30 -38
net/core/filter.c
··· 3246 3246 } 3247 3247 } 3248 3248 3249 - static inline bool xdp_map_invalid(const struct bpf_prog *xdp_prog, 3250 - unsigned long aux) 3249 + void bpf_clear_redirect_map(struct bpf_map *map) 3251 3250 { 3252 - return (unsigned long)xdp_prog->aux != aux; 3251 + struct bpf_redirect_info *ri; 3252 + int cpu; 3253 + 3254 + for_each_possible_cpu(cpu) { 3255 + ri = per_cpu_ptr(&bpf_redirect_info, cpu); 3256 + /* Avoid polluting remote cacheline due to writes if 3257 + * not needed. Once we pass this test, we need the 3258 + * cmpxchg() to make sure it hasn't been changed in 3259 + * the meantime by remote CPU. 3260 + */ 3261 + if (unlikely(READ_ONCE(ri->map) == map)) 3262 + cmpxchg(&ri->map, map, NULL); 3263 + } 3253 3264 } 3254 3265 3255 3266 static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp, 3256 - struct bpf_prog *xdp_prog) 3267 + struct bpf_prog *xdp_prog, struct bpf_map *map) 3257 3268 { 3258 3269 struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); 3259 - unsigned long map_owner = ri->map_owner; 3260 - struct bpf_map *map = ri->map; 3261 3270 u32 index = ri->ifindex; 3262 3271 void *fwd = NULL; 3263 3272 int err; 3264 3273 3265 3274 ri->ifindex = 0; 3266 - ri->map = NULL; 3267 - ri->map_owner = 0; 3268 - 3269 - if (unlikely(xdp_map_invalid(xdp_prog, map_owner))) { 3270 - err = -EFAULT; 3271 - map = NULL; 3272 - goto err; 3273 - } 3275 + WRITE_ONCE(ri->map, NULL); 3274 3276 3275 3277 fwd = __xdp_map_lookup_elem(map, index); 3276 3278 if (!fwd) { ··· 3298 3296 struct bpf_prog *xdp_prog) 3299 3297 { 3300 3298 struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); 3299 + struct bpf_map *map = READ_ONCE(ri->map); 3301 3300 struct net_device *fwd; 3302 3301 u32 index = ri->ifindex; 3303 3302 int err; 3304 3303 3305 - if (ri->map) 3306 - return xdp_do_redirect_map(dev, xdp, xdp_prog); 3304 + if (map) 3305 + return xdp_do_redirect_map(dev, xdp, xdp_prog, map); 3307 3306 3308 3307 fwd = dev_get_by_index_rcu(dev_net(dev), 
index); 3309 3308 ri->ifindex = 0; ··· 3328 3325 static int xdp_do_generic_redirect_map(struct net_device *dev, 3329 3326 struct sk_buff *skb, 3330 3327 struct xdp_buff *xdp, 3331 - struct bpf_prog *xdp_prog) 3328 + struct bpf_prog *xdp_prog, 3329 + struct bpf_map *map) 3332 3330 { 3333 3331 struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); 3334 - unsigned long map_owner = ri->map_owner; 3335 - struct bpf_map *map = ri->map; 3336 3332 u32 index = ri->ifindex; 3337 3333 void *fwd = NULL; 3338 3334 int err = 0; 3339 3335 3340 3336 ri->ifindex = 0; 3341 - ri->map = NULL; 3342 - ri->map_owner = 0; 3337 + WRITE_ONCE(ri->map, NULL); 3343 3338 3344 - if (unlikely(xdp_map_invalid(xdp_prog, map_owner))) { 3345 - err = -EFAULT; 3346 - map = NULL; 3347 - goto err; 3348 - } 3349 3339 fwd = __xdp_map_lookup_elem(map, index); 3350 3340 if (unlikely(!fwd)) { 3351 3341 err = -EINVAL; ··· 3375 3379 struct xdp_buff *xdp, struct bpf_prog *xdp_prog) 3376 3380 { 3377 3381 struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); 3382 + struct bpf_map *map = READ_ONCE(ri->map); 3378 3383 u32 index = ri->ifindex; 3379 3384 struct net_device *fwd; 3380 3385 int err = 0; 3381 3386 3382 - if (ri->map) 3383 - return xdp_do_generic_redirect_map(dev, skb, xdp, xdp_prog); 3384 - 3387 + if (map) 3388 + return xdp_do_generic_redirect_map(dev, skb, xdp, xdp_prog, 3389 + map); 3385 3390 ri->ifindex = 0; 3386 3391 fwd = dev_get_by_index_rcu(dev_net(dev), index); 3387 3392 if (unlikely(!fwd)) { ··· 3413 3416 3414 3417 ri->ifindex = ifindex; 3415 3418 ri->flags = flags; 3416 - ri->map = NULL; 3417 - ri->map_owner = 0; 3419 + WRITE_ONCE(ri->map, NULL); 3418 3420 3419 3421 return XDP_REDIRECT; 3420 3422 } ··· 3426 3430 .arg2_type = ARG_ANYTHING, 3427 3431 }; 3428 3432 3429 - BPF_CALL_4(bpf_xdp_redirect_map, struct bpf_map *, map, u32, ifindex, u64, flags, 3430 - unsigned long, map_owner) 3433 + BPF_CALL_3(bpf_xdp_redirect_map, struct bpf_map *, map, u32, ifindex, 3434 + u64, flags) 
3431 3435 { 3432 3436 struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); 3433 3437 ··· 3436 3440 3437 3441 ri->ifindex = ifindex; 3438 3442 ri->flags = flags; 3439 - ri->map = map; 3440 - ri->map_owner = map_owner; 3443 + WRITE_ONCE(ri->map, map); 3441 3444 3442 3445 return XDP_REDIRECT; 3443 3446 } 3444 3447 3445 - /* Note, arg4 is hidden from users and populated by the verifier 3446 - * with the right pointer. 3447 - */ 3448 3448 static const struct bpf_func_proto bpf_xdp_redirect_map_proto = { 3449 3449 .func = bpf_xdp_redirect_map, 3450 3450 .gpl_only = false,