perf stat: Introduce 'bperf' to share hardware PMCs with BPF

+11

tools/perf/Documentation/perf-stat.txt

··· 93 93 94 94 1.102235068 seconds time elapsed 95 95 96 + --bpf-counters:: 97 + Use BPF programs to aggregate readings from perf_events. This 98 + allows multiple perf-stat sessions that are counting the same metric (cycles, 99 + instructions, etc.) to share hardware counters. 100 + 101 + --bpf-attr-map:: 102 + With option "--bpf-counters", different perf-stat sessions share 103 + information about shared BPF programs and maps via a pinned hashmap. 104 + Use "--bpf-attr-map" to specify the path of this pinned hashmap. 105 + The default path is /sys/fs/bpf/perf_attr_map. 106 + 96 107 ifdef::HAVE_LIBPFM[] 97 108 --pfm-events events:: 98 109 Select a PMU event using libpfm4 syntax (see http://perfmon2.sf.net)

+1

tools/perf/Makefile.perf

··· 1007 1007 SKEL_OUT := $(abspath $(OUTPUT)util/bpf_skel) 1008 1008 SKEL_TMP_OUT := $(abspath $(SKEL_OUT)/.tmp) 1009 1009 SKELETONS := $(SKEL_OUT)/bpf_prog_profiler.skel.h 1010 + SKELETONS += $(SKEL_OUT)/bperf_leader.skel.h $(SKEL_OUT)/bperf_follower.skel.h 1010 1011 1011 1012 ifdef BUILD_BPF_SKEL 1012 1013 BPFTOOL := $(SKEL_TMP_OUT)/bootstrap/bpftool

+10

tools/perf/builtin-stat.c

··· 792 792 } 793 793 794 794 evlist__for_each_cpu (evsel_list, i, cpu) { 795 + /* 796 + * bperf calls evsel__open_per_cpu() in bperf__load(), so 797 + * no need to call it again here. 798 + */ 799 + if (target.use_bpf) 800 + break; 795 801 affinity__set(&affinity, cpu); 796 802 797 803 evlist__for_each_entry(evsel_list, counter) { ··· 1152 1146 #ifdef HAVE_BPF_SKEL 1153 1147 OPT_STRING('b', "bpf-prog", &target.bpf_str, "bpf-prog-id", 1154 1148 "stat events on existing bpf program id"), 1149 + OPT_BOOLEAN(0, "bpf-counters", &target.use_bpf, 1150 + "use bpf program to count events"), 1151 + OPT_STRING(0, "bpf-attr-map", &target.attr_map, "attr-map-path", 1152 + "path to perf_event_attr map"), 1155 1153 #endif 1156 1154 OPT_BOOLEAN('a', "all-cpus", &target.system_wide, 1157 1155 "system-wide collection from all CPUs"),

+514 -5

tools/perf/util/bpf_counter.c

··· 5 5 #include <assert.h> 6 6 #include <limits.h> 7 7 #include <unistd.h> 8 + #include <sys/file.h> 8 9 #include <sys/time.h> 9 10 #include <sys/resource.h> 10 11 #include <linux/err.h> ··· 13 12 #include <bpf/bpf.h> 14 13 #include <bpf/btf.h> 15 14 #include <bpf/libbpf.h> 15 + #include <api/fs/fs.h> 16 16 17 17 #include "bpf_counter.h" 18 18 #include "counts.h" 19 19 #include "debug.h" 20 20 #include "evsel.h" 21 + #include "evlist.h" 21 22 #include "target.h" 23 + #include "cpumap.h" 24 + #include "thread_map.h" 22 25 23 26 #include "bpf_skel/bpf_prog_profiler.skel.h" 27 + #include "bpf_skel/bperf_u.h" 28 + #include "bpf_skel/bperf_leader.skel.h" 29 + #include "bpf_skel/bperf_follower.skel.h" 30 + 31 + /* 32 + * bperf uses a hashmap, the attr_map, to track all the leader programs. 33 + * The hashmap is pinned in bpffs. flock() on this file is used to ensure 34 + * no concurrent access to the attr_map. The key of attr_map is struct 35 + * perf_event_attr, and the value is struct perf_event_attr_map_entry. 36 + * 37 + * struct perf_event_attr_map_entry contains two __u32 IDs, bpf_link of the 38 + * leader prog, and the diff_map. Each perf-stat session holds a reference 39 + * to the bpf_link to make sure the leader prog is attached to sched_switch 40 + * tracepoint. 41 + * 42 + * Since the hashmap only contains IDs of the bpf_link and diff_map, it 43 + * does not hold any references to the leader program. Once all perf-stat 44 + * sessions of these events exit, the leader prog, its maps, and the 45 + * perf_events will be freed. 
46 + */ 47 + struct perf_event_attr_map_entry { 48 + __u32 link_id; 49 + __u32 diff_map_id; 50 + }; 51 + 52 + #define DEFAULT_ATTR_MAP_PATH "fs/bpf/perf_attr_map" 53 + #define ATTR_MAP_SIZE 16 24 54 25 55 static inline void *u64_to_ptr(__u64 ptr) 26 56 { ··· 306 274 .install_pe = bpf_program_profiler__install_pe, 307 275 }; 308 276 277 + static __u32 bpf_link_get_id(int fd) 278 + { 279 + struct bpf_link_info link_info = {0}; 280 + __u32 link_info_len = sizeof(link_info); 281 + 282 + bpf_obj_get_info_by_fd(fd, &link_info, &link_info_len); 283 + return link_info.id; 284 + } 285 + 286 + static __u32 bpf_link_get_prog_id(int fd) 287 + { 288 + struct bpf_link_info link_info = {0}; 289 + __u32 link_info_len = sizeof(link_info); 290 + 291 + bpf_obj_get_info_by_fd(fd, &link_info, &link_info_len); 292 + return link_info.prog_id; 293 + } 294 + 295 + static __u32 bpf_map_get_id(int fd) 296 + { 297 + struct bpf_map_info map_info = {0}; 298 + __u32 map_info_len = sizeof(map_info); 299 + 300 + bpf_obj_get_info_by_fd(fd, &map_info, &map_info_len); 301 + return map_info.id; 302 + } 303 + 304 + static int bperf_lock_attr_map(struct target *target) 305 + { 306 + char path[PATH_MAX]; 307 + int map_fd, err; 308 + 309 + if (target->attr_map) { 310 + scnprintf(path, PATH_MAX, "%s", target->attr_map); 311 + } else { 312 + scnprintf(path, PATH_MAX, "%s/%s", sysfs__mountpoint(), 313 + DEFAULT_ATTR_MAP_PATH); 314 + } 315 + 316 + if (access(path, F_OK)) { 317 + map_fd = bpf_create_map(BPF_MAP_TYPE_HASH, 318 + sizeof(struct perf_event_attr), 319 + sizeof(struct perf_event_attr_map_entry), 320 + ATTR_MAP_SIZE, 0); 321 + if (map_fd < 0) 322 + return -1; 323 + 324 + err = bpf_obj_pin(map_fd, path); 325 + if (err) { 326 + /* someone pinned the map in parallel? 
*/ 327 + close(map_fd); 328 + map_fd = bpf_obj_get(path); 329 + if (map_fd < 0) 330 + return -1; 331 + } 332 + } else { 333 + map_fd = bpf_obj_get(path); 334 + if (map_fd < 0) 335 + return -1; 336 + } 337 + 338 + err = flock(map_fd, LOCK_EX); 339 + if (err) { 340 + close(map_fd); 341 + return -1; 342 + } 343 + return map_fd; 344 + } 345 + 346 + /* trigger the leader program on a cpu */ 347 + static int bperf_trigger_reading(int prog_fd, int cpu) 348 + { 349 + DECLARE_LIBBPF_OPTS(bpf_test_run_opts, opts, 350 + .ctx_in = NULL, 351 + .ctx_size_in = 0, 352 + .flags = BPF_F_TEST_RUN_ON_CPU, 353 + .cpu = cpu, 354 + .retval = 0, 355 + ); 356 + 357 + return bpf_prog_test_run_opts(prog_fd, &opts); 358 + } 359 + 360 + static int bperf_check_target(struct evsel *evsel, 361 + struct target *target, 362 + enum bperf_filter_type *filter_type, 363 + __u32 *filter_entry_cnt) 364 + { 365 + if (evsel->leader->core.nr_members > 1) { 366 + pr_err("bpf managed perf events do not yet support groups.\n"); 367 + return -1; 368 + } 369 + 370 + /* determine filter type based on target */ 371 + if (target->system_wide) { 372 + *filter_type = BPERF_FILTER_GLOBAL; 373 + *filter_entry_cnt = 1; 374 + } else if (target->cpu_list) { 375 + *filter_type = BPERF_FILTER_CPU; 376 + *filter_entry_cnt = perf_cpu_map__nr(evsel__cpus(evsel)); 377 + } else if (target->tid) { 378 + *filter_type = BPERF_FILTER_PID; 379 + *filter_entry_cnt = perf_thread_map__nr(evsel->core.threads); 380 + } else if (target->pid || evsel->evlist->workload.pid != -1) { 381 + *filter_type = BPERF_FILTER_TGID; 382 + *filter_entry_cnt = perf_thread_map__nr(evsel->core.threads); 383 + } else { 384 + pr_err("bpf managed perf events do not yet support these targets.\n"); 385 + return -1; 386 + } 387 + 388 + return 0; 389 + } 390 + 391 + static struct perf_cpu_map *all_cpu_map; 392 + 393 + static int bperf_reload_leader_program(struct evsel *evsel, int attr_map_fd, 394 + struct perf_event_attr_map_entry *entry) 395 + { 396 + struct 
bperf_leader_bpf *skel = bperf_leader_bpf__open(); 397 + int link_fd, diff_map_fd, err; 398 + struct bpf_link *link = NULL; 399 + 400 + if (!skel) { 401 + pr_err("Failed to open leader skeleton\n"); 402 + return -1; 403 + } 404 + 405 + bpf_map__resize(skel->maps.events, libbpf_num_possible_cpus()); 406 + err = bperf_leader_bpf__load(skel); 407 + if (err) { 408 + pr_err("Failed to load leader skeleton\n"); 409 + goto out; 410 + } 411 + 412 + err = -1; 413 + link = bpf_program__attach(skel->progs.on_switch); 414 + if (!link) { 415 + pr_err("Failed to attach leader program\n"); 416 + goto out; 417 + } 418 + 419 + link_fd = bpf_link__fd(link); 420 + diff_map_fd = bpf_map__fd(skel->maps.diff_readings); 421 + entry->link_id = bpf_link_get_id(link_fd); 422 + entry->diff_map_id = bpf_map_get_id(diff_map_fd); 423 + err = bpf_map_update_elem(attr_map_fd, &evsel->core.attr, entry, BPF_ANY); 424 + assert(err == 0); 425 + 426 + evsel->bperf_leader_link_fd = bpf_link_get_fd_by_id(entry->link_id); 427 + assert(evsel->bperf_leader_link_fd >= 0); 428 + 429 + /* 430 + * save leader_skel for install_pe, which is called within 431 + * following evsel__open_per_cpu call 432 + */ 433 + evsel->leader_skel = skel; 434 + evsel__open_per_cpu(evsel, all_cpu_map, -1); 435 + 436 + out: 437 + bperf_leader_bpf__destroy(skel); 438 + bpf_link__destroy(link); 439 + return err; 440 + } 441 + 442 + static int bperf__load(struct evsel *evsel, struct target *target) 443 + { 444 + struct perf_event_attr_map_entry entry = {0xffffffff, 0xffffffff}; 445 + int attr_map_fd, diff_map_fd = -1, err; 446 + enum bperf_filter_type filter_type; 447 + __u32 filter_entry_cnt, i; 448 + 449 + if (bperf_check_target(evsel, target, &filter_type, &filter_entry_cnt)) 450 + return -1; 451 + 452 + if (!all_cpu_map) { 453 + all_cpu_map = perf_cpu_map__new(NULL); 454 + if (!all_cpu_map) 455 + return -1; 456 + } 457 + 458 + evsel->bperf_leader_prog_fd = -1; 459 + evsel->bperf_leader_link_fd = -1; 460 + 461 + /* 462 + * Step 1: 
hold a fd on the leader program and the bpf_link, if 463 + * the program is not already gone, reload the program. 464 + * Use flock() to ensure exclusive access to the perf_event_attr 465 + * map. 466 + */ 467 + attr_map_fd = bperf_lock_attr_map(target); 468 + if (attr_map_fd < 0) { 469 + pr_err("Failed to lock perf_event_attr map\n"); 470 + return -1; 471 + } 472 + 473 + err = bpf_map_lookup_elem(attr_map_fd, &evsel->core.attr, &entry); 474 + if (err) { 475 + err = bpf_map_update_elem(attr_map_fd, &evsel->core.attr, &entry, BPF_ANY); 476 + if (err) 477 + goto out; 478 + } 479 + 480 + evsel->bperf_leader_link_fd = bpf_link_get_fd_by_id(entry.link_id); 481 + if (evsel->bperf_leader_link_fd < 0 && 482 + bperf_reload_leader_program(evsel, attr_map_fd, &entry)) 483 + goto out; 484 + 485 + /* 486 + * The bpf_link holds reference to the leader program, and the 487 + * leader program holds reference to the maps. Therefore, if 488 + * link_id is valid, diff_map_id should also be valid. 489 + */ 490 + evsel->bperf_leader_prog_fd = bpf_prog_get_fd_by_id( 491 + bpf_link_get_prog_id(evsel->bperf_leader_link_fd)); 492 + assert(evsel->bperf_leader_prog_fd >= 0); 493 + 494 + diff_map_fd = bpf_map_get_fd_by_id(entry.diff_map_id); 495 + assert(diff_map_fd >= 0); 496 + 497 + /* 498 + * bperf uses BPF_PROG_TEST_RUN to get accurate reading. 
Check 499 + * whether the kernel support it 500 + */ 501 + err = bperf_trigger_reading(evsel->bperf_leader_prog_fd, 0); 502 + if (err) { 503 + pr_err("The kernel does not support test_run for raw_tp BPF programs.\n" 504 + "Therefore, --use-bpf might show inaccurate readings\n"); 505 + goto out; 506 + } 507 + 508 + /* Step 2: load the follower skeleton */ 509 + evsel->follower_skel = bperf_follower_bpf__open(); 510 + if (!evsel->follower_skel) { 511 + pr_err("Failed to open follower skeleton\n"); 512 + goto out; 513 + } 514 + 515 + /* attach fexit program to the leader program */ 516 + bpf_program__set_attach_target(evsel->follower_skel->progs.fexit_XXX, 517 + evsel->bperf_leader_prog_fd, "on_switch"); 518 + 519 + /* connect to leader diff_reading map */ 520 + bpf_map__reuse_fd(evsel->follower_skel->maps.diff_readings, diff_map_fd); 521 + 522 + /* set up reading map */ 523 + bpf_map__set_max_entries(evsel->follower_skel->maps.accum_readings, 524 + filter_entry_cnt); 525 + /* set up follower filter based on target */ 526 + bpf_map__set_max_entries(evsel->follower_skel->maps.filter, 527 + filter_entry_cnt); 528 + err = bperf_follower_bpf__load(evsel->follower_skel); 529 + if (err) { 530 + pr_err("Failed to load follower skeleton\n"); 531 + bperf_follower_bpf__destroy(evsel->follower_skel); 532 + evsel->follower_skel = NULL; 533 + goto out; 534 + } 535 + 536 + for (i = 0; i < filter_entry_cnt; i++) { 537 + int filter_map_fd; 538 + __u32 key; 539 + 540 + if (filter_type == BPERF_FILTER_PID || 541 + filter_type == BPERF_FILTER_TGID) 542 + key = evsel->core.threads->map[i].pid; 543 + else if (filter_type == BPERF_FILTER_CPU) 544 + key = evsel->core.cpus->map[i]; 545 + else 546 + break; 547 + 548 + filter_map_fd = bpf_map__fd(evsel->follower_skel->maps.filter); 549 + bpf_map_update_elem(filter_map_fd, &key, &i, BPF_ANY); 550 + } 551 + 552 + evsel->follower_skel->bss->type = filter_type; 553 + 554 + err = bperf_follower_bpf__attach(evsel->follower_skel); 555 + 556 + out: 
557 + if (err && evsel->bperf_leader_link_fd >= 0) 558 + close(evsel->bperf_leader_link_fd); 559 + if (err && evsel->bperf_leader_prog_fd >= 0) 560 + close(evsel->bperf_leader_prog_fd); 561 + if (diff_map_fd >= 0) 562 + close(diff_map_fd); 563 + 564 + flock(attr_map_fd, LOCK_UN); 565 + close(attr_map_fd); 566 + 567 + return err; 568 + } 569 + 570 + static int bperf__install_pe(struct evsel *evsel, int cpu, int fd) 571 + { 572 + struct bperf_leader_bpf *skel = evsel->leader_skel; 573 + 574 + return bpf_map_update_elem(bpf_map__fd(skel->maps.events), 575 + &cpu, &fd, BPF_ANY); 576 + } 577 + 578 + /* 579 + * trigger the leader prog on each cpu, so the accum_reading map could get 580 + * the latest readings. 581 + */ 582 + static int bperf_sync_counters(struct evsel *evsel) 583 + { 584 + int num_cpu, i, cpu; 585 + 586 + num_cpu = all_cpu_map->nr; 587 + for (i = 0; i < num_cpu; i++) { 588 + cpu = all_cpu_map->map[i]; 589 + bperf_trigger_reading(evsel->bperf_leader_prog_fd, cpu); 590 + } 591 + return 0; 592 + } 593 + 594 + static int bperf__enable(struct evsel *evsel) 595 + { 596 + evsel->follower_skel->bss->enabled = 1; 597 + return 0; 598 + } 599 + 600 + static int bperf__read(struct evsel *evsel) 601 + { 602 + struct bperf_follower_bpf *skel = evsel->follower_skel; 603 + __u32 num_cpu_bpf = cpu__max_cpu(); 604 + struct bpf_perf_event_value values[num_cpu_bpf]; 605 + int reading_map_fd, err = 0; 606 + __u32 i, j, num_cpu; 607 + 608 + bperf_sync_counters(evsel); 609 + reading_map_fd = bpf_map__fd(skel->maps.accum_readings); 610 + 611 + for (i = 0; i < bpf_map__max_entries(skel->maps.accum_readings); i++) { 612 + __u32 cpu; 613 + 614 + err = bpf_map_lookup_elem(reading_map_fd, &i, values); 615 + if (err) 616 + goto out; 617 + switch (evsel->follower_skel->bss->type) { 618 + case BPERF_FILTER_GLOBAL: 619 + assert(i == 0); 620 + 621 + num_cpu = all_cpu_map->nr; 622 + for (j = 0; j < num_cpu; j++) { 623 + cpu = all_cpu_map->map[j]; 624 + perf_counts(evsel->counts, cpu, 
0)->val = values[cpu].counter; 625 + perf_counts(evsel->counts, cpu, 0)->ena = values[cpu].enabled; 626 + perf_counts(evsel->counts, cpu, 0)->run = values[cpu].running; 627 + } 628 + break; 629 + case BPERF_FILTER_CPU: 630 + cpu = evsel->core.cpus->map[i]; 631 + perf_counts(evsel->counts, i, 0)->val = values[cpu].counter; 632 + perf_counts(evsel->counts, i, 0)->ena = values[cpu].enabled; 633 + perf_counts(evsel->counts, i, 0)->run = values[cpu].running; 634 + break; 635 + case BPERF_FILTER_PID: 636 + case BPERF_FILTER_TGID: 637 + perf_counts(evsel->counts, 0, i)->val = 0; 638 + perf_counts(evsel->counts, 0, i)->ena = 0; 639 + perf_counts(evsel->counts, 0, i)->run = 0; 640 + 641 + for (cpu = 0; cpu < num_cpu_bpf; cpu++) { 642 + perf_counts(evsel->counts, 0, i)->val += values[cpu].counter; 643 + perf_counts(evsel->counts, 0, i)->ena += values[cpu].enabled; 644 + perf_counts(evsel->counts, 0, i)->run += values[cpu].running; 645 + } 646 + break; 647 + default: 648 + break; 649 + } 650 + } 651 + out: 652 + return err; 653 + } 654 + 655 + static int bperf__destroy(struct evsel *evsel) 656 + { 657 + bperf_follower_bpf__destroy(evsel->follower_skel); 658 + close(evsel->bperf_leader_prog_fd); 659 + close(evsel->bperf_leader_link_fd); 660 + return 0; 661 + } 662 + 663 + /* 664 + * bperf: share hardware PMCs with BPF 665 + * 666 + * perf uses performance monitoring counters (PMC) to monitor system 667 + * performance. The PMCs are limited hardware resources. For example, 668 + * Intel CPUs have 3x fixed PMCs and 4x programmable PMCs per cpu. 669 + * 670 + * Modern data center systems use these PMCs in many different ways: 671 + * system level monitoring, (maybe nested) container level monitoring, per 672 + * process monitoring, profiling (in sample mode), etc. In some cases, 673 + * there are more active perf_events than available hardware PMCs. To allow 674 + * all perf_events to have a chance to run, it is necessary to do expensive 675 + * time multiplexing of events. 
676 + * 677 + * On the other hand, many monitoring tools count the common metrics 678 + * (cycles, instructions). It is a waste to have multiple tools create 679 + * multiple perf_events of "cycles" and occupy multiple PMCs. 680 + * 681 + * bperf tries to reduce such wastes by allowing multiple perf_events of 682 + * "cycles" or "instructions" (at different scopes) to share PMUs. Instead 683 + * of having each perf-stat session to read its own perf_events, bperf uses 684 + * BPF programs to read the perf_events and aggregate readings to BPF maps. 685 + * Then, the perf-stat session(s) reads the values from these BPF maps. 686 + * 687 + * || 688 + * shared progs and maps <- || -> per session progs and maps 689 + * || 690 + * --------------- || 691 + * | perf_events | || 692 + * --------------- fexit || ----------------- 693 + * | --------||----> | follower prog | 694 + * --------------- / || --- ----------------- 695 + * cs -> | leader prog |/ ||/ | | 696 + * --> --------------- /|| -------------- ------------------ 697 + * / | | / || | filter map | | accum_readings | 698 + * / ------------ ------------ || -------------- ------------------ 699 + * | | prev map | | diff map | || | 700 + * | ------------ ------------ || | 701 + * \ || | 702 + * = \ ==================================================== | ============ 703 + * \ / user space 704 + * \ / 705 + * \ / 706 + * BPF_PROG_TEST_RUN BPF_MAP_LOOKUP_ELEM 707 + * \ / 708 + * \ / 709 + * \------ perf-stat ----------------------/ 710 + * 711 + * The figure above shows the architecture of bperf. Note that the figure 712 + * is divided into 3 regions: shared progs and maps (top left), per session 713 + * progs and maps (top right), and user space (bottom). 714 + * 715 + * The leader prog is triggered on each context switch (cs). The leader 716 + * prog reads perf_events and stores the difference (current_reading - 717 + * previous_reading) to the diff map. For the same metric, e.g. 
"cycles", 718 + * multiple perf-stat sessions share the same leader prog. 719 + * 720 + * Each perf-stat session creates a follower prog as fexit program to the 721 + * leader prog. It is possible to attach up to BPF_MAX_TRAMP_PROGS (38) 722 + * follower progs to the same leader prog. The follower prog checks current 723 + * task and processor ID to decide whether to add the value from the diff 724 + * map to its accumulated reading map (accum_readings). 725 + * 726 + * Finally, perf-stat user space reads the value from accum_reading map. 727 + * 728 + * Besides context switch, it is also necessary to trigger the leader prog 729 + * before perf-stat reads the value. Otherwise, the accum_reading map may 730 + * not have the latest reading from the perf_events. This is achieved by 731 + * triggering the event via sys_bpf(BPF_PROG_TEST_RUN) to each CPU. 732 + * 733 + * Comment before the definition of struct perf_event_attr_map_entry 734 + * describes how different sessions of perf-stat share information about 735 + * the leader prog. 
736 + */ 737 + 738 + struct bpf_counter_ops bperf_ops = { 739 + .load = bperf__load, 740 + .enable = bperf__enable, 741 + .read = bperf__read, 742 + .install_pe = bperf__install_pe, 743 + .destroy = bperf__destroy, 744 + }; 745 + 746 + static inline bool bpf_counter_skip(struct evsel *evsel) 747 + { 748 + return list_empty(&evsel->bpf_counter_list) && 749 + evsel->follower_skel == NULL; 750 + } 751 + 309 752 int bpf_counter__install_pe(struct evsel *evsel, int cpu, int fd) 310 753 { 311 - if (list_empty(&evsel->bpf_counter_list)) 754 + if (bpf_counter_skip(evsel)) 312 755 return 0; 313 756 return evsel->bpf_counter_ops->install_pe(evsel, cpu, fd); 314 757 } 315 758 316 759 int bpf_counter__load(struct evsel *evsel, struct target *target) 317 760 { 318 - if (target__has_bpf(target)) 761 + if (target->bpf_str) 319 762 evsel->bpf_counter_ops = &bpf_program_profiler_ops; 763 + else if (target->use_bpf) 764 + evsel->bpf_counter_ops = &bperf_ops; 320 765 321 766 if (evsel->bpf_counter_ops) 322 767 return evsel->bpf_counter_ops->load(evsel, target); ··· 802 293 803 294 int bpf_counter__enable(struct evsel *evsel) 804 295 { 805 - if (list_empty(&evsel->bpf_counter_list)) 296 + if (bpf_counter_skip(evsel)) 806 297 return 0; 807 298 return evsel->bpf_counter_ops->enable(evsel); 808 299 } 809 300 810 301 int bpf_counter__read(struct evsel *evsel) 811 302 { 812 - if (list_empty(&evsel->bpf_counter_list)) 303 + if (bpf_counter_skip(evsel)) 813 304 return -EAGAIN; 814 305 return evsel->bpf_counter_ops->read(evsel); 815 306 } 816 307 817 308 void bpf_counter__destroy(struct evsel *evsel) 818 309 { 819 - if (list_empty(&evsel->bpf_counter_list)) 310 + if (bpf_counter_skip(evsel)) 820 311 return; 821 312 evsel->bpf_counter_ops->destroy(evsel); 822 313 evsel->bpf_counter_ops = NULL;

+14

tools/perf/util/bpf_skel/bperf.h

··· 1 + // SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) 2 + // Copyright (c) 2021 Facebook 3 + 4 + #ifndef __BPERF_STAT_H 5 + #define __BPERF_STAT_H 6 + 7 + typedef struct { 8 + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); 9 + __uint(key_size, sizeof(__u32)); 10 + __uint(value_size, sizeof(struct bpf_perf_event_value)); 11 + __uint(max_entries, 1); 12 + } reading_map; 13 + 14 + #endif /* __BPERF_STAT_H */

+69

tools/perf/util/bpf_skel/bperf_follower.bpf.c

··· 1 + // SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) 2 + // Copyright (c) 2021 Facebook 3 + #include <linux/bpf.h> 4 + #include <linux/perf_event.h> 5 + #include <bpf/bpf_helpers.h> 6 + #include <bpf/bpf_tracing.h> 7 + #include "bperf.h" 8 + #include "bperf_u.h" 9 + 10 + reading_map diff_readings SEC(".maps"); 11 + reading_map accum_readings SEC(".maps"); 12 + 13 + struct { 14 + __uint(type, BPF_MAP_TYPE_HASH); 15 + __uint(key_size, sizeof(__u32)); 16 + __uint(value_size, sizeof(__u32)); 17 + } filter SEC(".maps"); 18 + 19 + enum bperf_filter_type type = 0; 20 + int enabled = 0; 21 + 22 + SEC("fexit/XXX") 23 + int BPF_PROG(fexit_XXX) 24 + { 25 + struct bpf_perf_event_value *diff_val, *accum_val; 26 + __u32 filter_key, zero = 0; 27 + __u32 *accum_key; 28 + 29 + if (!enabled) 30 + return 0; 31 + 32 + switch (type) { 33 + case BPERF_FILTER_GLOBAL: 34 + accum_key = &zero; 35 + goto do_add; 36 + case BPERF_FILTER_CPU: 37 + filter_key = bpf_get_smp_processor_id(); 38 + break; 39 + case BPERF_FILTER_PID: 40 + filter_key = bpf_get_current_pid_tgid() & 0xffffffff; 41 + break; 42 + case BPERF_FILTER_TGID: 43 + filter_key = bpf_get_current_pid_tgid() >> 32; 44 + break; 45 + default: 46 + return 0; 47 + } 48 + 49 + accum_key = bpf_map_lookup_elem(&filter, &filter_key); 50 + if (!accum_key) 51 + return 0; 52 + 53 + do_add: 54 + diff_val = bpf_map_lookup_elem(&diff_readings, &zero); 55 + if (!diff_val) 56 + return 0; 57 + 58 + accum_val = bpf_map_lookup_elem(&accum_readings, accum_key); 59 + if (!accum_val) 60 + return 0; 61 + 62 + accum_val->counter += diff_val->counter; 63 + accum_val->enabled += diff_val->enabled; 64 + accum_val->running += diff_val->running; 65 + 66 + return 0; 67 + } 68 + 69 + char LICENSE[] SEC("license") = "Dual BSD/GPL";

+46

tools/perf/util/bpf_skel/bperf_leader.bpf.c

··· 1 + // SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) 2 + // Copyright (c) 2021 Facebook 3 + #include <linux/bpf.h> 4 + #include <linux/perf_event.h> 5 + #include <bpf/bpf_helpers.h> 6 + #include <bpf/bpf_tracing.h> 7 + #include "bperf.h" 8 + 9 + struct { 10 + __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY); 11 + __uint(key_size, sizeof(__u32)); 12 + __uint(value_size, sizeof(int)); 13 + __uint(map_flags, BPF_F_PRESERVE_ELEMS); 14 + } events SEC(".maps"); 15 + 16 + reading_map prev_readings SEC(".maps"); 17 + reading_map diff_readings SEC(".maps"); 18 + 19 + SEC("raw_tp/sched_switch") 20 + int BPF_PROG(on_switch) 21 + { 22 + struct bpf_perf_event_value val, *prev_val, *diff_val; 23 + __u32 key = bpf_get_smp_processor_id(); 24 + __u32 zero = 0; 25 + long err; 26 + 27 + prev_val = bpf_map_lookup_elem(&prev_readings, &zero); 28 + if (!prev_val) 29 + return 0; 30 + 31 + diff_val = bpf_map_lookup_elem(&diff_readings, &zero); 32 + if (!diff_val) 33 + return 0; 34 + 35 + err = bpf_perf_event_read_value(&events, key, &val, sizeof(val)); 36 + if (err) 37 + return 0; 38 + 39 + diff_val->counter = val.counter - prev_val->counter; 40 + diff_val->enabled = val.enabled - prev_val->enabled; 41 + diff_val->running = val.running - prev_val->running; 42 + *prev_val = val; 43 + return 0; 44 + } 45 + 46 + char LICENSE[] SEC("license") = "Dual BSD/GPL";

+14

tools/perf/util/bpf_skel/bperf_u.h

// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
// Copyright (c) 2021 Facebook

#ifndef __BPERF_STAT_U_H
#define __BPERF_STAT_U_H

/*
 * Filter modes shared between the follower BPF program and user space.
 * Values start at 1, so a zero-initialized type matches none of them.
 */
enum bperf_filter_type {
	BPERF_FILTER_GLOBAL = 1,	/* no filtering */
	BPERF_FILTER_CPU    = 2,	/* filter by cpu */
	BPERF_FILTER_PID    = 3,	/* filter by thread id */
	BPERF_FILTER_TGID   = 4,	/* filter by thread group id */
};

#endif /* __BPERF_STAT_U_H */

+19 -1

tools/perf/util/evsel.h

··· 20 20 struct bpf_counter_ops; 21 21 struct target; 22 22 struct hashmap; 23 + struct bperf_leader_bpf; 24 + struct bperf_follower_bpf; 23 25 24 26 typedef int (evsel__sb_cb_t)(union perf_event *event, void *data); 25 27 ··· 132 130 * See also evsel__has_callchain(). 133 131 */ 134 132 __u64 synth_sample_type; 135 - struct list_head bpf_counter_list; 133 + 134 + /* 135 + * bpf_counter_ops serves two use cases: 136 + * 1. perf-stat -b counting events used by BPF programs 137 + * 2. perf-stat --bpf-counters use BPF programs to aggregate counts 138 + */ 136 139 struct bpf_counter_ops *bpf_counter_ops; 140 + 141 + /* for perf-stat -b */ 142 + struct list_head bpf_counter_list; 143 + 144 + /* for perf-stat --bpf-counters */ 145 + int bperf_leader_prog_fd; 146 + int bperf_leader_link_fd; 147 + union { 148 + struct bperf_leader_bpf *leader_skel; 149 + struct bperf_follower_bpf *follower_skel; 150 + }; 137 151 }; 138 152 139 153 struct perf_missing_features {

+3 -1

tools/perf/util/target.h

··· 16 16 bool uses_mmap; 17 17 bool default_per_cpu; 18 18 bool per_thread; 19 + bool use_bpf; 20 + const char *attr_map; 19 21 }; 20 22 21 23 enum target_errno { ··· 68 66 69 67 static inline bool target__has_bpf(struct target *target) 70 68 { 71 - return target->bpf_str; 69 + return target->bpf_str || target->use_bpf; 72 70 } 73 71 74 72 static inline bool target__none(struct target *target)