Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

perf evsel: Refactor tool events

Tool events unnecessarily open a dummy perf event, which is useless
even with `perf record` (which will still open its own dummy event).
Change the behavior of tool events so that:

- duration_time - call `rdclock` on open and then report the count as
a delta since the start in evsel__read_counter. This moves code out
of builtin-stat making it more general purpose.

- user_time/system_time - open the fd as either `/proc/pid/stat` or
`/proc/stat` for cases like system wide. evsel__read_counter will
read the appropriate field out of the procfs file. These values
were previously supplied by wait4, if the procfs read fails then
the wait4 values are used, assuming the process/thread terminated.
Reading user_time and system_time this way allows interval mode,
per-PID and per-CPU modes to be supported, although there are
restrictions given what the files provide (e.g. per-PID can't be
combined with per-CPU).

Opening any of the tool events for `perf record` is changed to return
invalid.

Signed-off-by: Ian Rogers <irogers@google.com>
Tested-by: Weilin Wang <weilin.wang@intel.com>
Cc: Ravi Bangoria <ravi.bangoria@amd.com>
Cc: James Clark <james.clark@arm.com>
Cc: Dmitrii Dolgov <9erthalion6@gmail.com>
Cc: Ze Gao <zegao2021@gmail.com>
Cc: Song Liu <song@kernel.org>
Cc: Leo Yan <leo.yan@linux.dev>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
Link: https://lore.kernel.org/r/20240503232849.17752-1-irogers@google.com

Authored by Ian Rogers and committed by Namhyung Kim
6828d692 658a8805

+276 -44
+32 -41
tools/perf/builtin-stat.c
··· 284 284 process_synthesized_event, NULL); 285 285 } 286 286 287 - static int read_single_counter(struct evsel *counter, int cpu_map_idx, 288 - int thread, struct timespec *rs) 287 + static int read_single_counter(struct evsel *counter, int cpu_map_idx, int thread) 289 288 { 290 - switch(counter->tool_event) { 291 - case PERF_TOOL_DURATION_TIME: { 292 - u64 val = rs->tv_nsec + rs->tv_sec*1000000000ULL; 293 - struct perf_counts_values *count = 294 - perf_counts(counter->counts, cpu_map_idx, thread); 295 - count->ena = count->run = val; 296 - count->val = val; 297 - return 0; 298 - } 299 - case PERF_TOOL_USER_TIME: 300 - case PERF_TOOL_SYSTEM_TIME: { 301 - u64 val; 302 - struct perf_counts_values *count = 303 - perf_counts(counter->counts, cpu_map_idx, thread); 304 - if (counter->tool_event == PERF_TOOL_USER_TIME) 305 - val = ru_stats.ru_utime_usec_stat.mean; 306 - else 307 - val = ru_stats.ru_stime_usec_stat.mean; 308 - count->ena = count->run = val; 309 - count->val = val; 310 - return 0; 311 - } 312 - default: 313 - case PERF_TOOL_NONE: 314 - return evsel__read_counter(counter, cpu_map_idx, thread); 315 - case PERF_TOOL_MAX: 316 - /* This should never be reached */ 317 - return 0; 289 + int err = evsel__read_counter(counter, cpu_map_idx, thread); 290 + 291 + /* 292 + * Reading user and system time will fail when the process 293 + * terminates. Use the wait4 values in that case. 
294 + */ 295 + if (err && cpu_map_idx == 0 && 296 + (counter->tool_event == PERF_TOOL_USER_TIME || 297 + counter->tool_event == PERF_TOOL_SYSTEM_TIME)) { 298 + u64 val, *start_time; 299 + struct perf_counts_values *count = 300 + perf_counts(counter->counts, cpu_map_idx, thread); 301 + 302 + start_time = xyarray__entry(counter->start_times, cpu_map_idx, thread); 303 + if (counter->tool_event == PERF_TOOL_USER_TIME) 304 + val = ru_stats.ru_utime_usec_stat.mean; 305 + else 306 + val = ru_stats.ru_stime_usec_stat.mean; 307 + count->ena = count->run = *start_time + val; 308 + count->val = val; 309 + return 0; 318 310 } 311 + return err; 319 312 } 320 313 321 314 /* 322 315 * Read out the results of a single counter: 323 316 * do not aggregate counts across CPUs in system-wide mode 324 317 */ 325 - static int read_counter_cpu(struct evsel *counter, struct timespec *rs, int cpu_map_idx) 318 + static int read_counter_cpu(struct evsel *counter, int cpu_map_idx) 326 319 { 327 320 int nthreads = perf_thread_map__nr(evsel_list->core.threads); 328 321 int thread; ··· 333 340 * (via evsel__read_counter()) and sets their count->loaded. 
334 341 */ 335 342 if (!perf_counts__is_loaded(counter->counts, cpu_map_idx, thread) && 336 - read_single_counter(counter, cpu_map_idx, thread, rs)) { 343 + read_single_counter(counter, cpu_map_idx, thread)) { 337 344 counter->counts->scaled = -1; 338 345 perf_counts(counter->counts, cpu_map_idx, thread)->ena = 0; 339 346 perf_counts(counter->counts, cpu_map_idx, thread)->run = 0; ··· 362 369 return 0; 363 370 } 364 371 365 - static int read_affinity_counters(struct timespec *rs) 372 + static int read_affinity_counters(void) 366 373 { 367 374 struct evlist_cpu_iterator evlist_cpu_itr; 368 375 struct affinity saved_affinity, *affinity; ··· 383 390 if (evsel__is_bpf(counter)) 384 391 continue; 385 392 386 - if (!counter->err) { 387 - counter->err = read_counter_cpu(counter, rs, 388 - evlist_cpu_itr.cpu_map_idx); 389 - } 393 + if (!counter->err) 394 + counter->err = read_counter_cpu(counter, evlist_cpu_itr.cpu_map_idx); 390 395 } 391 396 if (affinity) 392 397 affinity__cleanup(&saved_affinity); ··· 408 417 return 0; 409 418 } 410 419 411 - static int read_counters(struct timespec *rs) 420 + static int read_counters(void) 412 421 { 413 422 if (!stat_config.stop_read_counter) { 414 423 if (read_bpf_map_counters() || 415 - read_affinity_counters(rs)) 424 + read_affinity_counters()) 416 425 return -1; 417 426 } 418 427 return 0; ··· 443 452 444 453 evlist__reset_aggr_stats(evsel_list); 445 454 446 - if (read_counters(&rs) == 0) 455 + if (read_counters() == 0) 447 456 process_counters(); 448 457 449 458 if (STAT_RECORD) { ··· 931 940 * avoid arbitrary skew, we must read all counters before closing any 932 941 * group leaders. 933 942 */ 934 - if (read_counters(&(struct timespec) { .tv_nsec = t1-t0 }) == 0) 943 + if (read_counters() == 0) 935 944 process_counters(); 936 945 937 946 /*
+224 -2
tools/perf/util/evsel.c
··· 10 10 #include <errno.h> 11 11 #include <inttypes.h> 12 12 #include <linux/bitops.h> 13 + #include <api/io.h> 13 14 #include <api/fs/fs.h> 14 15 #include <api/fs/tracing_path.h> 15 16 #include <linux/hw_breakpoint.h> ··· 31 30 #include "counts.h" 32 31 #include "event.h" 33 32 #include "evsel.h" 33 + #include "time-utils.h" 34 34 #include "util/env.h" 35 35 #include "util/evsel_config.h" 36 36 #include "util/evsel_fprintf.h" ··· 1495 1493 evsel->per_pkg_mask = NULL; 1496 1494 zfree(&evsel->metric_events); 1497 1495 perf_evsel__object.fini(evsel); 1496 + if (evsel->tool_event == PERF_TOOL_SYSTEM_TIME || 1497 + evsel->tool_event == PERF_TOOL_USER_TIME) 1498 + xyarray__delete(evsel->start_times); 1498 1499 } 1499 1500 1500 1501 void evsel__delete(struct evsel *evsel) ··· 1611 1606 return evsel__process_group_data(leader, cpu_map_idx, thread, data); 1612 1607 } 1613 1608 1609 + static bool read_until_char(struct io *io, char e) 1610 + { 1611 + char c; 1612 + 1613 + do { 1614 + c = io__get_char(io); 1615 + if (c == -1) 1616 + return false; 1617 + } while (c != e); 1618 + return true; 1619 + } 1620 + 1621 + static int read_stat_field(int fd, struct perf_cpu cpu, int field, __u64 *val) 1622 + { 1623 + char buf[256]; 1624 + struct io io; 1625 + int i; 1626 + 1627 + io__init(&io, fd, buf, sizeof(buf)); 1628 + 1629 + /* Skip lines to relevant CPU. */ 1630 + for (i = -1; i < cpu.cpu; i++) { 1631 + if (!read_until_char(&io, '\n')) 1632 + return -EINVAL; 1633 + } 1634 + /* Skip to "cpu". */ 1635 + if (io__get_char(&io) != 'c') return -EINVAL; 1636 + if (io__get_char(&io) != 'p') return -EINVAL; 1637 + if (io__get_char(&io) != 'u') return -EINVAL; 1638 + 1639 + /* Skip N of cpuN. 
*/ 1640 + if (!read_until_char(&io, ' ')) 1641 + return -EINVAL; 1642 + 1643 + i = 1; 1644 + while (true) { 1645 + if (io__get_dec(&io, val) != ' ') 1646 + break; 1647 + if (field == i) 1648 + return 0; 1649 + i++; 1650 + } 1651 + return -EINVAL; 1652 + } 1653 + 1654 + static int read_pid_stat_field(int fd, int field, __u64 *val) 1655 + { 1656 + char buf[256]; 1657 + struct io io; 1658 + int c, i; 1659 + 1660 + io__init(&io, fd, buf, sizeof(buf)); 1661 + if (io__get_dec(&io, val) != ' ') 1662 + return -EINVAL; 1663 + if (field == 1) 1664 + return 0; 1665 + 1666 + /* Skip comm. */ 1667 + if (io__get_char(&io) != '(' || !read_until_char(&io, ')')) 1668 + return -EINVAL; 1669 + if (field == 2) 1670 + return -EINVAL; /* String can't be returned. */ 1671 + 1672 + /* Skip state */ 1673 + if (io__get_char(&io) != ' ' || io__get_char(&io) == -1) 1674 + return -EINVAL; 1675 + if (field == 3) 1676 + return -EINVAL; /* String can't be returned. */ 1677 + 1678 + /* Loop over numeric fields*/ 1679 + if (io__get_char(&io) != ' ') 1680 + return -EINVAL; 1681 + 1682 + i = 4; 1683 + while (true) { 1684 + c = io__get_dec(&io, val); 1685 + if (c == -1) 1686 + return -EINVAL; 1687 + if (c == -2) { 1688 + /* Assume a -ve was read */ 1689 + c = io__get_dec(&io, val); 1690 + *val *= -1; 1691 + } 1692 + if (c != ' ') 1693 + return -EINVAL; 1694 + if (field == i) 1695 + return 0; 1696 + i++; 1697 + } 1698 + return -EINVAL; 1699 + } 1700 + 1701 + static int evsel__read_tool(struct evsel *evsel, int cpu_map_idx, int thread) 1702 + { 1703 + __u64 *start_time, cur_time, delta_start; 1704 + int fd, err = 0; 1705 + struct perf_counts_values *count; 1706 + bool adjust = false; 1707 + 1708 + count = perf_counts(evsel->counts, cpu_map_idx, thread); 1709 + 1710 + switch (evsel->tool_event) { 1711 + case PERF_TOOL_DURATION_TIME: 1712 + /* 1713 + * Pretend duration_time is only on the first CPU and thread, or 1714 + * else aggregation will scale duration_time by the number of 1715 + * CPUs/threads. 
1716 + */ 1717 + start_time = &evsel->start_time; 1718 + if (cpu_map_idx == 0 && thread == 0) 1719 + cur_time = rdclock(); 1720 + else 1721 + cur_time = *start_time; 1722 + break; 1723 + case PERF_TOOL_USER_TIME: 1724 + case PERF_TOOL_SYSTEM_TIME: { 1725 + bool system = evsel->tool_event == PERF_TOOL_SYSTEM_TIME; 1726 + 1727 + start_time = xyarray__entry(evsel->start_times, cpu_map_idx, thread); 1728 + fd = FD(evsel, cpu_map_idx, thread); 1729 + lseek(fd, SEEK_SET, 0); 1730 + if (evsel->pid_stat) { 1731 + /* The event exists solely on 1 CPU. */ 1732 + if (cpu_map_idx == 0) 1733 + err = read_pid_stat_field(fd, system ? 15 : 14, &cur_time); 1734 + else 1735 + cur_time = 0; 1736 + } else { 1737 + /* The event is for all threads. */ 1738 + if (thread == 0) { 1739 + struct perf_cpu cpu = perf_cpu_map__cpu(evsel->core.cpus, 1740 + cpu_map_idx); 1741 + 1742 + err = read_stat_field(fd, cpu, system ? 3 : 1, &cur_time); 1743 + } else { 1744 + cur_time = 0; 1745 + } 1746 + } 1747 + adjust = true; 1748 + break; 1749 + } 1750 + case PERF_TOOL_NONE: 1751 + case PERF_TOOL_MAX: 1752 + default: 1753 + err = -EINVAL; 1754 + } 1755 + if (err) 1756 + return err; 1757 + 1758 + delta_start = cur_time - *start_time; 1759 + if (adjust) { 1760 + __u64 ticks_per_sec = sysconf(_SC_CLK_TCK); 1761 + 1762 + delta_start *= 1000000000 / ticks_per_sec; 1763 + } 1764 + count->val = delta_start; 1765 + count->ena = count->run = delta_start; 1766 + count->lost = 0; 1767 + return 0; 1768 + } 1769 + 1614 1770 int evsel__read_counter(struct evsel *evsel, int cpu_map_idx, int thread) 1615 1771 { 1616 - u64 read_format = evsel->core.attr.read_format; 1772 + if (evsel__is_tool(evsel)) 1773 + return evsel__read_tool(evsel, cpu_map_idx, thread); 1617 1774 1618 - if (read_format & PERF_FORMAT_GROUP) 1775 + if (evsel->core.attr.read_format & PERF_FORMAT_GROUP) 1619 1776 return evsel__read_group(evsel, cpu_map_idx, thread); 1620 1777 1621 1778 return evsel__read_one(evsel, cpu_map_idx, thread); ··· 1996 1829 
perf_evsel__alloc_fd(&evsel->core, perf_cpu_map__nr(cpus), nthreads) < 0) 1997 1830 return -ENOMEM; 1998 1831 1832 + if ((evsel->tool_event == PERF_TOOL_SYSTEM_TIME || 1833 + evsel->tool_event == PERF_TOOL_USER_TIME) && 1834 + !evsel->start_times) { 1835 + evsel->start_times = xyarray__new(perf_cpu_map__nr(cpus), nthreads, sizeof(__u64)); 1836 + if (!evsel->start_times) 1837 + return -ENOMEM; 1838 + } 1839 + 1999 1840 evsel->open_flags = PERF_FLAG_FD_CLOEXEC; 2000 1841 if (evsel->cgrp) 2001 1842 evsel->open_flags |= PERF_FLAG_PID_CGROUP; ··· 2186 2011 int pid = -1, err, old_errno; 2187 2012 enum rlimit_action set_rlimit = NO_CHANGE; 2188 2013 2014 + if (evsel->tool_event == PERF_TOOL_DURATION_TIME) { 2015 + if (evsel->core.attr.sample_period) /* no sampling */ 2016 + return -EINVAL; 2017 + evsel->start_time = rdclock(); 2018 + return 0; 2019 + } 2020 + 2189 2021 err = __evsel__prepare_open(evsel, cpus, threads); 2190 2022 if (err) 2191 2023 return err; ··· 2224 2042 2225 2043 if (!evsel->cgrp && !evsel->core.system_wide) 2226 2044 pid = perf_thread_map__pid(threads, thread); 2045 + 2046 + if (evsel->tool_event == PERF_TOOL_USER_TIME || 2047 + evsel->tool_event == PERF_TOOL_SYSTEM_TIME) { 2048 + bool system = evsel->tool_event == PERF_TOOL_SYSTEM_TIME; 2049 + __u64 *start_time = NULL; 2050 + 2051 + if (evsel->core.attr.sample_period) { 2052 + /* no sampling */ 2053 + err = -EINVAL; 2054 + goto out_close; 2055 + } 2056 + if (pid > -1) { 2057 + char buf[64]; 2058 + 2059 + snprintf(buf, sizeof(buf), "/proc/%d/stat", pid); 2060 + fd = open(buf, O_RDONLY); 2061 + evsel->pid_stat = true; 2062 + } else { 2063 + fd = open("/proc/stat", O_RDONLY); 2064 + } 2065 + FD(evsel, idx, thread) = fd; 2066 + if (fd < 0) { 2067 + err = -errno; 2068 + goto out_close; 2069 + } 2070 + start_time = xyarray__entry(evsel->start_times, idx, thread); 2071 + if (pid > -1) { 2072 + err = read_pid_stat_field(fd, system ? 
15 : 14, 2073 + start_time); 2074 + } else { 2075 + struct perf_cpu cpu; 2076 + 2077 + cpu = perf_cpu_map__cpu(evsel->core.cpus, idx); 2078 + err = read_stat_field(fd, cpu, system ? 3 : 1, 2079 + start_time); 2080 + } 2081 + if (err) 2082 + goto out_close; 2083 + continue; 2084 + } 2227 2085 2228 2086 group_fd = get_group_fd(evsel, idx, thread); 2229 2087
+14
tools/perf/util/evsel.h
··· 170 170 171 171 /* for missing_features */ 172 172 struct perf_pmu *pmu; 173 + 174 + /* For tool events */ 175 + /* Beginning time subtracted when the counter is read. */ 176 + union { 177 + /* duration_time is a single global time. */ 178 + __u64 start_time; 179 + /* 180 + * user_time and system_time read an initial value potentially 181 + * per-CPU or per-pid. 182 + */ 183 + struct xyarray *start_times; 184 + }; 185 + /* Is the tool's fd for /proc/pid/stat or /proc/stat. */ 186 + bool pid_stat; 173 187 }; 174 188 175 189 struct perf_missing_features {
+6 -1
tools/perf/util/parse-events.c
··· 305 305 .type = PERF_TYPE_SOFTWARE, 306 306 .config = PERF_COUNT_SW_DUMMY, 307 307 }; 308 + const char *cpu_list = NULL; 308 309 310 + if (tool_event == PERF_TOOL_DURATION_TIME) { 311 + /* Duration time is gathered globally, pretend it is only on CPU0. */ 312 + cpu_list = "0"; 313 + } 309 314 evsel = __add_event(list, idx, &attr, /*init_attr=*/true, /*name=*/NULL, 310 315 /*metric_id=*/NULL, /*pmu=*/NULL, 311 316 /*config_terms=*/NULL, /*auto_merge_stats=*/false, 312 - /*cpu_list=*/"0"); 317 + cpu_list); 313 318 if (!evsel) 314 319 return -ENOMEM; 315 320 evsel->tool_event = tool_event;