Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

bpf: wire up sleepable bpf_get_stack() and bpf_get_task_stack() helpers

Add sleepable implementations of bpf_get_stack() and
bpf_get_task_stack() helpers and allow them to be used from sleepable
BPF program (e.g., sleepable uprobes).

Note, capturing the stack trace IPs itself is not sleepable (that would
need to be a separate project); only build ID fetching is sleepable and
thus more reliable, as it will wait for data to be paged in, if
necessary. For that we make use of the sleepable build_id_parse()
implementation.

Now that build ID related internals in kernel/bpf/stackmap.c can be used
both in sleepable and non-sleepable contexts, we need to add additional
rcu_read_lock()/rcu_read_unlock() protection around fetching
perf_callchain_entry, but with the refactoring in previous commit it's
now pretty straightforward. We make sure to do rcu_read_unlock (in
sleepable mode only) right before stack_map_get_build_id_offset() call
which can sleep. By that time we don't have any more use of
perf_callchain_entry.

Note, bpf_get_task_stack() will fail for user mode if task != current.
And for kernel mode build IDs are irrelevant. So in that sense adding a
sleepable bpf_get_task_stack() implementation is a no-op. It feels right
to wire this up for symmetry and completeness, but I'm open to just
dropping it until we support the `user && crosstask` condition.

Reviewed-by: Eduard Zingerman <eddyz87@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20240829174232.3133883-10-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>

authored by

Andrii Nakryiko and committed by
Alexei Starovoitov
d4dd9775 4f4c4fc0

+77 -20
+2
include/linux/bpf.h
··· 3200 3200 extern const struct bpf_func_proto bpf_get_current_comm_proto; 3201 3201 extern const struct bpf_func_proto bpf_get_stackid_proto; 3202 3202 extern const struct bpf_func_proto bpf_get_stack_proto; 3203 + extern const struct bpf_func_proto bpf_get_stack_sleepable_proto; 3203 3204 extern const struct bpf_func_proto bpf_get_task_stack_proto; 3205 + extern const struct bpf_func_proto bpf_get_task_stack_sleepable_proto; 3204 3206 extern const struct bpf_func_proto bpf_get_stackid_proto_pe; 3205 3207 extern const struct bpf_func_proto bpf_get_stack_proto_pe; 3206 3208 extern const struct bpf_func_proto bpf_sock_map_update_proto;
+72 -18
kernel/bpf/stackmap.c
··· 124 124 return ERR_PTR(err); 125 125 } 126 126 127 + static int fetch_build_id(struct vm_area_struct *vma, unsigned char *build_id, bool may_fault) 128 + { 129 + return may_fault ? build_id_parse(vma, build_id, NULL) 130 + : build_id_parse_nofault(vma, build_id, NULL); 131 + } 132 + 127 133 /* 128 134 * Expects all id_offs[i].ip values to be set to correct initial IPs. 129 135 * They will be subsequently: ··· 141 135 * BPF_STACK_BUILD_ID_IP. 142 136 */ 143 137 static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, 144 - u32 trace_nr, bool user) 138 + u32 trace_nr, bool user, bool may_fault) 145 139 { 146 140 int i; 147 141 struct mmap_unlock_irq_work *work = NULL; ··· 172 166 goto build_id_valid; 173 167 } 174 168 vma = find_vma(current->mm, ip); 175 - if (!vma || build_id_parse_nofault(vma, id_offs[i].build_id, NULL)) { 169 + if (!vma || fetch_build_id(vma, id_offs[i].build_id, may_fault)) { 176 170 /* per entry fall back to ips */ 177 171 id_offs[i].status = BPF_STACK_BUILD_ID_IP; 178 172 memset(id_offs[i].build_id, 0, BUILD_ID_SIZE_MAX); ··· 263 257 id_offs = (struct bpf_stack_build_id *)new_bucket->data; 264 258 for (i = 0; i < trace_nr; i++) 265 259 id_offs[i].ip = ips[i]; 266 - stack_map_get_build_id_offset(id_offs, trace_nr, user); 260 + stack_map_get_build_id_offset(id_offs, trace_nr, user, false /* !may_fault */); 267 261 trace_len = trace_nr * sizeof(struct bpf_stack_build_id); 268 262 if (hash_matches && bucket->nr == trace_nr && 269 263 memcmp(bucket->data, new_bucket->data, trace_len) == 0) { ··· 404 398 405 399 static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task, 406 400 struct perf_callchain_entry *trace_in, 407 - void *buf, u32 size, u64 flags) 401 + void *buf, u32 size, u64 flags, bool may_fault) 408 402 { 409 403 u32 trace_nr, copy_len, elem_size, num_elem, max_depth; 410 404 bool user_build_id = flags & BPF_F_USER_BUILD_ID; ··· 422 416 if (kernel && user_build_id) 423 417 goto clear; 424 418 425 - 
elem_size = (user && user_build_id) ? sizeof(struct bpf_stack_build_id) 426 - : sizeof(u64); 419 + elem_size = user_build_id ? sizeof(struct bpf_stack_build_id) : sizeof(u64); 427 420 if (unlikely(size % elem_size)) 428 421 goto clear; 429 422 ··· 443 438 if (sysctl_perf_event_max_stack < max_depth) 444 439 max_depth = sysctl_perf_event_max_stack; 445 440 441 + if (may_fault) 442 + rcu_read_lock(); /* need RCU for perf's callchain below */ 443 + 446 444 if (trace_in) 447 445 trace = trace_in; 448 446 else if (kernel && task) ··· 453 445 else 454 446 trace = get_perf_callchain(regs, 0, kernel, user, max_depth, 455 447 crosstask, false); 456 - if (unlikely(!trace)) 457 - goto err_fault; 458 448 459 - if (trace->nr < skip) 449 + if (unlikely(!trace) || trace->nr < skip) { 450 + if (may_fault) 451 + rcu_read_unlock(); 460 452 goto err_fault; 453 + } 461 454 462 455 trace_nr = trace->nr - skip; 463 456 trace_nr = (trace_nr <= num_elem) ? trace_nr : num_elem; 464 457 copy_len = trace_nr * elem_size; 465 458 466 459 ips = trace->ip + skip; 467 - if (user && user_build_id) { 460 + if (user_build_id) { 468 461 struct bpf_stack_build_id *id_offs = buf; 469 462 u32 i; 470 463 471 464 for (i = 0; i < trace_nr; i++) 472 465 id_offs[i].ip = ips[i]; 473 - stack_map_get_build_id_offset(buf, trace_nr, user); 474 466 } else { 475 467 memcpy(buf, ips, copy_len); 476 468 } 469 + 470 + /* trace/ips should not be dereferenced after this point */ 471 + if (may_fault) 472 + rcu_read_unlock(); 473 + 474 + if (user_build_id) 475 + stack_map_get_build_id_offset(buf, trace_nr, user, may_fault); 477 476 478 477 if (size > copy_len) 479 478 memset(buf + copy_len, 0, size - copy_len); ··· 496 481 BPF_CALL_4(bpf_get_stack, struct pt_regs *, regs, void *, buf, u32, size, 497 482 u64, flags) 498 483 { 499 - return __bpf_get_stack(regs, NULL, NULL, buf, size, flags); 484 + return __bpf_get_stack(regs, NULL, NULL, buf, size, flags, false /* !may_fault */); 500 485 } 501 486 502 487 const struct 
bpf_func_proto bpf_get_stack_proto = { ··· 509 494 .arg4_type = ARG_ANYTHING, 510 495 }; 511 496 512 - BPF_CALL_4(bpf_get_task_stack, struct task_struct *, task, void *, buf, 513 - u32, size, u64, flags) 497 + BPF_CALL_4(bpf_get_stack_sleepable, struct pt_regs *, regs, void *, buf, u32, size, 498 + u64, flags) 499 + { 500 + return __bpf_get_stack(regs, NULL, NULL, buf, size, flags, true /* may_fault */); 501 + } 502 + 503 + const struct bpf_func_proto bpf_get_stack_sleepable_proto = { 504 + .func = bpf_get_stack_sleepable, 505 + .gpl_only = true, 506 + .ret_type = RET_INTEGER, 507 + .arg1_type = ARG_PTR_TO_CTX, 508 + .arg2_type = ARG_PTR_TO_UNINIT_MEM, 509 + .arg3_type = ARG_CONST_SIZE_OR_ZERO, 510 + .arg4_type = ARG_ANYTHING, 511 + }; 512 + 513 + static long __bpf_get_task_stack(struct task_struct *task, void *buf, u32 size, 514 + u64 flags, bool may_fault) 514 515 { 515 516 struct pt_regs *regs; 516 517 long res = -EINVAL; ··· 536 505 537 506 regs = task_pt_regs(task); 538 507 if (regs) 539 - res = __bpf_get_stack(regs, task, NULL, buf, size, flags); 508 + res = __bpf_get_stack(regs, task, NULL, buf, size, flags, may_fault); 540 509 put_task_stack(task); 541 510 542 511 return res; 543 512 } 544 513 514 + BPF_CALL_4(bpf_get_task_stack, struct task_struct *, task, void *, buf, 515 + u32, size, u64, flags) 516 + { 517 + return __bpf_get_task_stack(task, buf, size, flags, false /* !may_fault */); 518 + } 519 + 545 520 const struct bpf_func_proto bpf_get_task_stack_proto = { 546 521 .func = bpf_get_task_stack, 522 + .gpl_only = false, 523 + .ret_type = RET_INTEGER, 524 + .arg1_type = ARG_PTR_TO_BTF_ID, 525 + .arg1_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK], 526 + .arg2_type = ARG_PTR_TO_UNINIT_MEM, 527 + .arg3_type = ARG_CONST_SIZE_OR_ZERO, 528 + .arg4_type = ARG_ANYTHING, 529 + }; 530 + 531 + BPF_CALL_4(bpf_get_task_stack_sleepable, struct task_struct *, task, void *, buf, 532 + u32, size, u64, flags) 533 + { 534 + return __bpf_get_task_stack(task, buf, size, 
flags, true /* may_fault */); 535 + } 536 + 537 + const struct bpf_func_proto bpf_get_task_stack_sleepable_proto = { 538 + .func = bpf_get_task_stack_sleepable, 547 539 .gpl_only = false, 548 540 .ret_type = RET_INTEGER, 549 541 .arg1_type = ARG_PTR_TO_BTF_ID, ··· 587 533 __u64 nr_kernel; 588 534 589 535 if (!(event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)) 590 - return __bpf_get_stack(regs, NULL, NULL, buf, size, flags); 536 + return __bpf_get_stack(regs, NULL, NULL, buf, size, flags, false /* !may_fault */); 591 537 592 538 if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK | 593 539 BPF_F_USER_BUILD_ID))) ··· 607 553 __u64 nr = trace->nr; 608 554 609 555 trace->nr = nr_kernel; 610 - err = __bpf_get_stack(regs, NULL, trace, buf, size, flags); 556 + err = __bpf_get_stack(regs, NULL, trace, buf, size, flags, false /* !may_fault */); 611 557 612 558 /* restore nr */ 613 559 trace->nr = nr; ··· 619 565 goto clear; 620 566 621 567 flags = (flags & ~BPF_F_SKIP_FIELD_MASK) | skip; 622 - err = __bpf_get_stack(regs, NULL, trace, buf, size, flags); 568 + err = __bpf_get_stack(regs, NULL, trace, buf, size, flags, false /* !may_fault */); 623 569 } 624 570 return err; 625 571
+3 -2
kernel/trace/bpf_trace.c
··· 1507 1507 case BPF_FUNC_jiffies64: 1508 1508 return &bpf_jiffies64_proto; 1509 1509 case BPF_FUNC_get_task_stack: 1510 - return &bpf_get_task_stack_proto; 1510 + return prog->sleepable ? &bpf_get_task_stack_sleepable_proto 1511 + : &bpf_get_task_stack_proto; 1511 1512 case BPF_FUNC_copy_from_user: 1512 1513 return &bpf_copy_from_user_proto; 1513 1514 case BPF_FUNC_copy_from_user_task: ··· 1564 1563 case BPF_FUNC_get_stackid: 1565 1564 return &bpf_get_stackid_proto; 1566 1565 case BPF_FUNC_get_stack: 1567 - return &bpf_get_stack_proto; 1566 + return prog->sleepable ? &bpf_get_stack_sleepable_proto : &bpf_get_stack_proto; 1568 1567 #ifdef CONFIG_BPF_KPROBE_OVERRIDE 1569 1568 case BPF_FUNC_override_return: 1570 1569 return &bpf_override_return_proto;