Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

bpf: support deferring bpf_link dealloc to after RCU grace period

BPF link for some program types is passed as a "context" which can be
used by those BPF programs to look up additional information. E.g., for
multi-kprobes and multi-uprobes, link is used to fetch BPF cookie values.

Because of this runtime dependency, when bpf_link refcnt drops to zero
there could still be active BPF programs running and accessing link data.

This patch adds generic support to defer bpf_link dealloc callback to
after RCU GP, if requested. This is done by exposing two different
deallocation callbacks, one synchronous and one deferred. If deferred
one is provided, bpf_link_free() will schedule dealloc_deferred()
callback to happen after RCU GP.

BPF is using two flavors of RCU: "classic" non-sleepable one and RCU
tasks trace one. The latter is used when sleepable BPF programs are
used. bpf_link_free() accommodates that by checking underlying BPF
program's sleepable flag, and goes either through normal RCU GP only for
non-sleepable, or through RCU tasks trace GP *and* then normal RCU GP
(taking into account rcu_trace_implies_rcu_gp() optimization), if BPF
program is sleepable.

We use this for multi-kprobe and multi-uprobe links, which dereference
link during program run. We also preventively switch raw_tp link to use
deferred dealloc callback, as upcoming changes in bpf-next tree expose
raw_tp link data (specifically, cookie value) to BPF program at runtime
as well.

Fixes: 0dcac2725406 ("bpf: Add multi kprobe link")
Fixes: 89ae89f53d20 ("bpf: Add multi uprobe link")
Reported-by: syzbot+981935d9485a560bfbcb@syzkaller.appspotmail.com
Reported-by: syzbot+2cb5a6c573e98db598cc@syzkaller.appspotmail.com
Reported-by: syzbot+62d8b26793e8a2bd0516@syzkaller.appspotmail.com
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Link: https://lore.kernel.org/r/20240328052426.3042617-2-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>

authored by

Andrii Nakryiko and committed by
Alexei Starovoitov
1a80dbcb e9c856ca

+49 -6
+15 -1
include/linux/bpf.h
··· 1574 1574 enum bpf_link_type type; 1575 1575 const struct bpf_link_ops *ops; 1576 1576 struct bpf_prog *prog; 1577 - struct work_struct work; 1577 + /* rcu is used before freeing, work can be used to schedule that 1578 + * RCU-based freeing before that, so they never overlap 1579 + */ 1580 + union { 1581 + struct rcu_head rcu; 1582 + struct work_struct work; 1583 + }; 1578 1584 }; 1579 1585 1580 1586 struct bpf_link_ops { 1581 1587 void (*release)(struct bpf_link *link); 1588 + /* deallocate link resources callback, called without RCU grace period 1589 + * waiting 1590 + */ 1582 1591 void (*dealloc)(struct bpf_link *link); 1592 + /* deallocate link resources callback, called after RCU grace period; 1593 + * if underlying BPF program is sleepable we go through tasks trace 1594 + * RCU GP and then "classic" RCU GP 1595 + */ 1596 + void (*dealloc_deferred)(struct bpf_link *link); 1583 1597 int (*detach)(struct bpf_link *link); 1584 1598 int (*update_prog)(struct bpf_link *link, struct bpf_prog *new_prog, 1585 1599 struct bpf_prog *old_prog);
+32 -3
kernel/bpf/syscall.c
···
3024 3024 	atomic64_inc(&link->refcnt);
3025 3025 }
3026 3026
3027 + static void bpf_link_defer_dealloc_rcu_gp(struct rcu_head *rcu)
3028 + {
3029 + 	struct bpf_link *link = container_of(rcu, struct bpf_link, rcu);
3030 +
3031 + 	/* free bpf_link and its containing memory */
3032 + 	link->ops->dealloc_deferred(link);
3033 + }
3034 +
3035 + static void bpf_link_defer_dealloc_mult_rcu_gp(struct rcu_head *rcu)
3036 + {
3037 + 	if (rcu_trace_implies_rcu_gp())
3038 + 		bpf_link_defer_dealloc_rcu_gp(rcu);
3039 + 	else
3040 + 		call_rcu(rcu, bpf_link_defer_dealloc_rcu_gp);
3041 + }
3042 +
3027 3043 /* bpf_link_free is guaranteed to be called from process context */
3028 3044 static void bpf_link_free(struct bpf_link *link)
3029 3045 {
3046 + 	bool sleepable = false;
3047 +
3030 3048 	bpf_link_free_id(link->id);
3031 3049 	if (link->prog) {
3050 + 		sleepable = link->prog->sleepable;
3032 3051 		/* detach BPF program, clean up used resources */
3033 3052 		link->ops->release(link);
3034 3053 		bpf_prog_put(link->prog);
3035 3054 	}
3036 - 	/* free bpf_link and its containing memory */
3037 - 	link->ops->dealloc(link);
3055 + 	if (link->ops->dealloc_deferred) {
3056 + 		/* schedule BPF link deallocation after an RCU grace period:
3057 + 		 * tasks trace GP, then "classic" GP, if the BPF program is
3058 + 		 * sleepable; link must not be touched once this is queued
3059 + 		 */
3060 + 		if (sleepable)
3061 + 			call_rcu_tasks_trace(&link->rcu, bpf_link_defer_dealloc_mult_rcu_gp);
3062 + 		else
3063 + 			call_rcu(&link->rcu, bpf_link_defer_dealloc_rcu_gp);
3064 + 	} else if (link->ops->dealloc) {
3065 + 		link->ops->dealloc(link);
3066 + 	}
3038 3067 }
3039 3068
3040 3069 static void bpf_link_put_deferred(struct work_struct *work)
···
3573 3544
3574 3545 static const struct bpf_link_ops bpf_raw_tp_link_lops = {
3575 3546 	.release = bpf_raw_tp_link_release,
3576 - 	.dealloc = bpf_raw_tp_link_dealloc,
3547 + 	.dealloc_deferred = bpf_raw_tp_link_dealloc,
3577 3548 	.show_fdinfo = bpf_raw_tp_link_show_fdinfo,
3578 3549 	.fill_link_info = bpf_raw_tp_link_fill_link_info,
3579 3550 };
+2 -2
kernel/trace/bpf_trace.c
···
/* kprobe-multi link ops: the dealloc callback moves from .dealloc to
 * .dealloc_deferred, the slot bpf_link_free() schedules via call_rcu() or
 * call_rcu_tasks_trace() instead of calling it directly
 */
2728 2728
2729 2729 static const struct bpf_link_ops bpf_kprobe_multi_link_lops = {
2730 2730 	.release = bpf_kprobe_multi_link_release,
2731 - 	.dealloc = bpf_kprobe_multi_link_dealloc,
2731 + 	.dealloc_deferred = bpf_kprobe_multi_link_dealloc,
2732 2732 	.fill_link_info = bpf_kprobe_multi_link_fill_link_info,
2733 2733 };
2734 2734
···
/* uprobe-multi link ops: same change as for the kprobe-multi table; the
 * callback function itself is unchanged, only the slot it is registered in
 */
3242 3242
3243 3243 static const struct bpf_link_ops bpf_uprobe_multi_link_lops = {
3244 3244 	.release = bpf_uprobe_multi_link_release,
3245 - 	.dealloc = bpf_uprobe_multi_link_dealloc,
3245 + 	.dealloc_deferred = bpf_uprobe_multi_link_dealloc,
3246 3246 	.fill_link_info = bpf_uprobe_multi_link_fill_link_info,
3247 3247 };
3248 3248