Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

selftests/bpf: add multi-uprobe benchmarks

Add multi-uprobe and multi-uretprobe benchmarks to the bench tool.
Multi- and classic uprobes/uretprobes have different low-level
triggering code paths, so it's sometimes important to be able to
benchmark both flavors of uprobes/uretprobes.

Sample examples from my dev machine below. Single-threaded performance
almost doesn't differ, but with more parallel CPUs triggering the same
uprobe/uretprobe the difference grows. This might be due to [0], but
given the code is slightly different, there could be other sources of
slowdown.

Note, all these numbers will change due to ongoing work to improve
uprobe/uretprobe scalability (e.g., [1]), but having a benchmark like this
is useful for measurements and debugging nevertheless.

#!/bin/bash
set -eufo pipefail
for p in 1 8 16 32; do
    for i in uprobe-nop uretprobe-nop uprobe-multi-nop uretprobe-multi-nop; do
        summary=$(sudo ./bench -w1 -d3 -p$p -a trig-$i | tail -n1)
        total=$(echo "$summary" | cut -d'(' -f1 | cut -d' ' -f3-)
        percpu=$(echo "$summary" | cut -d'(' -f2 | cut -d')' -f1 | cut -d'/' -f1)
        printf "%-21s (%2d cpus): %s (%s/s/cpu)\n" $i $p "$total" "$percpu"
    done
    echo
done

uprobe-nop ( 1 cpus): 1.020 ± 0.005M/s ( 1.020M/s/cpu)
uretprobe-nop ( 1 cpus): 0.515 ± 0.009M/s ( 0.515M/s/cpu)
uprobe-multi-nop ( 1 cpus): 1.036 ± 0.004M/s ( 1.036M/s/cpu)
uretprobe-multi-nop ( 1 cpus): 0.512 ± 0.005M/s ( 0.512M/s/cpu)

uprobe-nop ( 8 cpus): 3.481 ± 0.030M/s ( 0.435M/s/cpu)
uretprobe-nop ( 8 cpus): 2.222 ± 0.008M/s ( 0.278M/s/cpu)
uprobe-multi-nop ( 8 cpus): 3.769 ± 0.094M/s ( 0.471M/s/cpu)
uretprobe-multi-nop ( 8 cpus): 2.482 ± 0.007M/s ( 0.310M/s/cpu)

uprobe-nop (16 cpus): 2.968 ± 0.011M/s ( 0.185M/s/cpu)
uretprobe-nop (16 cpus): 1.870 ± 0.002M/s ( 0.117M/s/cpu)
uprobe-multi-nop (16 cpus): 3.541 ± 0.037M/s ( 0.221M/s/cpu)
uretprobe-multi-nop (16 cpus): 2.123 ± 0.026M/s ( 0.133M/s/cpu)

uprobe-nop (32 cpus): 2.524 ± 0.026M/s ( 0.079M/s/cpu)
uretprobe-nop (32 cpus): 1.572 ± 0.003M/s ( 0.049M/s/cpu)
uprobe-multi-nop (32 cpus): 2.717 ± 0.003M/s ( 0.085M/s/cpu)
uretprobe-multi-nop (32 cpus): 1.687 ± 0.007M/s ( 0.053M/s/cpu)

[0] https://lore.kernel.org/linux-trace-kernel/20240805202803.1813090-1-andrii@kernel.org/
[1] https://lore.kernel.org/linux-trace-kernel/20240731214256.3588718-1-andrii@kernel.org/

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Link: https://lore.kernel.org/r/20240806042935.3867862-1-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>

authored by

Andrii Nakryiko and committed by
Alexei Starovoitov
f727b13d 4e9e0760

+85 -15
+12
tools/testing/selftests/bpf/bench.c
··· 520 520 extern const struct bench bench_trig_uretprobe_push; 521 521 extern const struct bench bench_trig_uprobe_ret; 522 522 extern const struct bench bench_trig_uretprobe_ret; 523 + extern const struct bench bench_trig_uprobe_multi_nop; 524 + extern const struct bench bench_trig_uretprobe_multi_nop; 525 + extern const struct bench bench_trig_uprobe_multi_push; 526 + extern const struct bench bench_trig_uretprobe_multi_push; 527 + extern const struct bench bench_trig_uprobe_multi_ret; 528 + extern const struct bench bench_trig_uretprobe_multi_ret; 523 529 524 530 extern const struct bench bench_rb_libbpf; 525 531 extern const struct bench bench_rb_custom; ··· 580 574 &bench_trig_uretprobe_push, 581 575 &bench_trig_uprobe_ret, 582 576 &bench_trig_uretprobe_ret, 577 + &bench_trig_uprobe_multi_nop, 578 + &bench_trig_uretprobe_multi_nop, 579 + &bench_trig_uprobe_multi_push, 580 + &bench_trig_uretprobe_multi_push, 581 + &bench_trig_uprobe_multi_ret, 582 + &bench_trig_uretprobe_multi_ret, 583 583 /* ringbuf/perfbuf benchmarks */ 584 584 &bench_rb_libbpf, 585 585 &bench_rb_custom,
+66 -15
tools/testing/selftests/bpf/benchs/bench_trigger.c
··· 332 332 return NULL; 333 333 } 334 334 335 - static void usetup(bool use_retprobe, void *target_addr) 335 + static void usetup(bool use_retprobe, bool use_multi, void *target_addr) 336 336 { 337 337 size_t uprobe_offset; 338 338 struct bpf_link *link; ··· 346 346 exit(1); 347 347 } 348 348 349 - bpf_program__set_autoload(ctx.skel->progs.bench_trigger_uprobe, true); 349 + if (use_multi) 350 + bpf_program__set_autoload(ctx.skel->progs.bench_trigger_uprobe_multi, true); 351 + else 352 + bpf_program__set_autoload(ctx.skel->progs.bench_trigger_uprobe, true); 350 353 351 354 err = trigger_bench__load(ctx.skel); 352 355 if (err) { ··· 358 355 } 359 356 360 357 uprobe_offset = get_uprobe_offset(target_addr); 361 - link = bpf_program__attach_uprobe(ctx.skel->progs.bench_trigger_uprobe, 362 - use_retprobe, 363 - -1 /* all PIDs */, 364 - "/proc/self/exe", 365 - uprobe_offset); 358 + if (use_multi) { 359 + LIBBPF_OPTS(bpf_uprobe_multi_opts, opts, 360 + .retprobe = use_retprobe, 361 + .cnt = 1, 362 + .offsets = &uprobe_offset, 363 + ); 364 + link = bpf_program__attach_uprobe_multi( 365 + ctx.skel->progs.bench_trigger_uprobe_multi, 366 + -1 /* all PIDs */, "/proc/self/exe", NULL, &opts); 367 + ctx.skel->links.bench_trigger_uprobe_multi = link; 368 + } else { 369 + link = bpf_program__attach_uprobe(ctx.skel->progs.bench_trigger_uprobe, 370 + use_retprobe, 371 + -1 /* all PIDs */, 372 + "/proc/self/exe", 373 + uprobe_offset); 374 + ctx.skel->links.bench_trigger_uprobe = link; 375 + } 366 376 if (!link) { 367 - fprintf(stderr, "failed to attach uprobe!\n"); 377 + fprintf(stderr, "failed to attach %s!\n", use_multi ? 
"multi-uprobe" : "uprobe"); 368 378 exit(1); 369 379 } 370 - ctx.skel->links.bench_trigger_uprobe = link; 371 380 } 372 381 373 382 static void usermode_count_setup(void) ··· 389 374 390 375 static void uprobe_nop_setup(void) 391 376 { 392 - usetup(false, &uprobe_target_nop); 377 + usetup(false, false /* !use_multi */, &uprobe_target_nop); 393 378 } 394 379 395 380 static void uretprobe_nop_setup(void) 396 381 { 397 - usetup(true, &uprobe_target_nop); 382 + usetup(true, false /* !use_multi */, &uprobe_target_nop); 398 383 } 399 384 400 385 static void uprobe_push_setup(void) 401 386 { 402 - usetup(false, &uprobe_target_push); 387 + usetup(false, false /* !use_multi */, &uprobe_target_push); 403 388 } 404 389 405 390 static void uretprobe_push_setup(void) 406 391 { 407 - usetup(true, &uprobe_target_push); 392 + usetup(true, false /* !use_multi */, &uprobe_target_push); 408 393 } 409 394 410 395 static void uprobe_ret_setup(void) 411 396 { 412 - usetup(false, &uprobe_target_ret); 397 + usetup(false, false /* !use_multi */, &uprobe_target_ret); 413 398 } 414 399 415 400 static void uretprobe_ret_setup(void) 416 401 { 417 - usetup(true, &uprobe_target_ret); 402 + usetup(true, false /* !use_multi */, &uprobe_target_ret); 403 + } 404 + 405 + static void uprobe_multi_nop_setup(void) 406 + { 407 + usetup(false, true /* use_multi */, &uprobe_target_nop); 408 + } 409 + 410 + static void uretprobe_multi_nop_setup(void) 411 + { 412 + usetup(true, true /* use_multi */, &uprobe_target_nop); 413 + } 414 + 415 + static void uprobe_multi_push_setup(void) 416 + { 417 + usetup(false, true /* use_multi */, &uprobe_target_push); 418 + } 419 + 420 + static void uretprobe_multi_push_setup(void) 421 + { 422 + usetup(true, true /* use_multi */, &uprobe_target_push); 423 + } 424 + 425 + static void uprobe_multi_ret_setup(void) 426 + { 427 + usetup(false, true /* use_multi */, &uprobe_target_ret); 428 + } 429 + 430 + static void uretprobe_multi_ret_setup(void) 431 + { 432 + usetup(true, true 
/* use_multi */, &uprobe_target_ret); 418 433 } 419 434 420 435 const struct bench bench_trig_syscall_count = { ··· 499 454 BENCH_TRIG_USERMODE(uretprobe_nop, nop, "uretprobe-nop"); 500 455 BENCH_TRIG_USERMODE(uretprobe_push, push, "uretprobe-push"); 501 456 BENCH_TRIG_USERMODE(uretprobe_ret, ret, "uretprobe-ret"); 457 + BENCH_TRIG_USERMODE(uprobe_multi_nop, nop, "uprobe-multi-nop"); 458 + BENCH_TRIG_USERMODE(uprobe_multi_push, push, "uprobe-multi-push"); 459 + BENCH_TRIG_USERMODE(uprobe_multi_ret, ret, "uprobe-multi-ret"); 460 + BENCH_TRIG_USERMODE(uretprobe_multi_nop, nop, "uretprobe-multi-nop"); 461 + BENCH_TRIG_USERMODE(uretprobe_multi_push, push, "uretprobe-multi-push"); 462 + BENCH_TRIG_USERMODE(uretprobe_multi_ret, ret, "uretprobe-multi-ret");
+7
tools/testing/selftests/bpf/progs/trigger_bench.c
··· 32 32 return 0; 33 33 } 34 34 35 + SEC("?uprobe.multi") 36 + int bench_trigger_uprobe_multi(void *ctx) 37 + { 38 + inc_counter(); 39 + return 0; 40 + } 41 + 35 42 const volatile int batch_iters = 0; 36 43 37 44 SEC("?raw_tp")