Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

perf stat: Enable iostat mode for x86 platforms

This functionality is based on recently introduced sysfs attributes for
Intel® Xeon® Scalable processor family (code name Skylake-SP):

Commit bb42b3d39781d7fc ("perf/x86/intel/uncore: Expose an Uncore unit to IIO PMON mapping")

This mode is intended to provide four I/O performance metrics, in MB, for each
PCIe root port:

- Inbound Read: I/O devices below root port read from the host memory
- Inbound Write: I/O devices below root port write to the host memory
- Outbound Read: CPU reads from I/O devices below root port
- Outbound Write: CPU writes to I/O devices below root port

Each metric requires only one uncore event which increments at every 4B
transfer in corresponding direction. The formulas to compute metrics
are generic:
#EventCount * 4B / (1024 * 1024)

Acked-by: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Alexander Antonov <alexander.antonov@linux.intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Alexey V Bayduraev <alexey.v.bayduraev@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Ian Rogers <irogers@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: https://lore.kernel.org/r/20210419094147.15909-4-alexander.antonov@linux.intel.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

authored by

Alexander Antonov and committed by
Arnaldo Carvalho de Melo
f9ed693e 19776d3c

+466 -1
+88
tools/perf/Documentation/perf-iostat.txt
··· 1 + perf-iostat(1) 2 + =============== 3 + 4 + NAME 5 + ---- 6 + perf-iostat - Show I/O performance metrics 7 + 8 + SYNOPSIS 9 + -------- 10 + [verse] 11 + 'perf iostat' list 12 + 'perf iostat' <ports> -- <command> [<options>] 13 + 14 + DESCRIPTION 15 + ----------- 16 + Mode is intended to provide four I/O performance metrics per each PCIe root port: 17 + 18 + - Inbound Read - I/O devices below root port read from the host memory, in MB 19 + 20 + - Inbound Write - I/O devices below root port write to the host memory, in MB 21 + 22 + - Outbound Read - CPU reads from I/O devices below root port, in MB 23 + 24 + - Outbound Write - CPU writes to I/O devices below root port, in MB 25 + 26 + OPTIONS 27 + ------- 28 + <command>...:: 29 + Any command you can specify in a shell. 30 + 31 + list:: 32 + List all PCIe root ports. 33 + 34 + <ports>:: 35 + Select the root ports for monitoring. Comma-separated list is supported. 36 + 37 + EXAMPLES 38 + -------- 39 + 40 + 1. List all PCIe root ports (example for 2-S platform): 41 + 42 + $ perf iostat list 43 + S0-uncore_iio_0<0000:00> 44 + S1-uncore_iio_0<0000:80> 45 + S0-uncore_iio_1<0000:17> 46 + S1-uncore_iio_1<0000:85> 47 + S0-uncore_iio_2<0000:3a> 48 + S1-uncore_iio_2<0000:ae> 49 + S0-uncore_iio_3<0000:5d> 50 + S1-uncore_iio_3<0000:d7> 51 + 52 + 2. Collect metrics for all PCIe root ports: 53 + 54 + $ perf iostat -- dd if=/dev/zero of=/dev/nvme0n1 bs=1M oflag=direct 55 + 357708+0 records in 56 + 357707+0 records out 57 + 375083606016 bytes (375 GB, 349 GiB) copied, 215.974 s, 1.7 GB/s 58 + 59 + Performance counter stats for 'system wide': 60 + 61 + port Inbound Read(MB) Inbound Write(MB) Outbound Read(MB) Outbound Write(MB) 62 + 0000:00 1 0 2 3 63 + 0000:80 0 0 0 0 64 + 0000:17 352552 43 0 21 65 + 0000:85 0 0 0 0 66 + 0000:3a 3 0 0 0 67 + 0000:ae 0 0 0 0 68 + 0000:5d 0 0 0 0 69 + 0000:d7 0 0 0 0 70 + 71 + 3. 
Collect metrics for comma-separated list of PCIe root ports: 72 + 73 + $ perf iostat 0000:17,0:3a -- dd if=/dev/zero of=/dev/nvme0n1 bs=1M oflag=direct 74 + 357708+0 records in 75 + 357707+0 records out 76 + 375083606016 bytes (375 GB, 349 GiB) copied, 197.08 s, 1.9 GB/s 77 + 78 + Performance counter stats for 'system wide': 79 + 80 + port Inbound Read(MB) Inbound Write(MB) Outbound Read(MB) Outbound Write(MB) 81 + 0000:17 358559 44 0 22 82 + 0000:3a 3 2 0 0 83 + 84 + 197.081983474 seconds time elapsed 85 + 86 + SEE ALSO 87 + -------- 88 + linkperf:perf-stat[1]
+4 -1
tools/perf/Makefile.perf
··· 283 283 284 284 SCRIPT_SH += perf-archive.sh 285 285 SCRIPT_SH += perf-with-kcore.sh 286 + SCRIPT_SH += perf-iostat.sh 286 287 287 288 grep-libs = $(filter -l%,$(1)) 288 289 strip-libs = $(filter-out -l%,$(1)) ··· 949 948 $(INSTALL) $(OUTPUT)perf-archive -t '$(DESTDIR_SQ)$(perfexec_instdir_SQ)' 950 949 $(call QUIET_INSTALL, perf-with-kcore) \ 951 950 $(INSTALL) $(OUTPUT)perf-with-kcore -t '$(DESTDIR_SQ)$(perfexec_instdir_SQ)' 951 + $(call QUIET_INSTALL, perf-iostat) \ 952 + $(INSTALL) $(OUTPUT)perf-iostat -t '$(DESTDIR_SQ)$(perfexec_instdir_SQ)' 952 953 ifndef NO_LIBAUDIT 953 954 $(call QUIET_INSTALL, strace/groups) \ 954 955 $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(STRACE_GROUPS_INSTDIR_SQ)'; \ ··· 1045 1042 $(call QUIET_CLEAN, bpf-skel) $(RM) -r $(SKEL_TMP_OUT) $(SKELETONS) 1046 1043 1047 1044 clean:: $(LIBTRACEEVENT)-clean $(LIBAPI)-clean $(LIBBPF)-clean $(LIBSUBCMD)-clean $(LIBPERF)-clean fixdep-clean python-clean bpf-skel-clean 1048 - $(call QUIET_CLEAN, core-objs) $(RM) $(LIBPERF_A) $(OUTPUT)perf-archive $(OUTPUT)perf-with-kcore $(LANG_BINDINGS) 1045 + $(call QUIET_CLEAN, core-objs) $(RM) $(LIBPERF_A) $(OUTPUT)perf-archive $(OUTPUT)perf-with-kcore $(OUTPUT)perf-iostat $(LANG_BINDINGS) 1049 1046 $(Q)find $(if $(OUTPUT),$(OUTPUT),.) -name '*.o' -delete -o -name '\.*.cmd' -delete -o -name '\.*.d' -delete 1050 1047 $(Q)$(RM) $(OUTPUT).config-detected 1051 1048 $(call QUIET_CLEAN, core-progs) $(RM) $(ALL_PROGRAMS) perf perf-read-vdso32 perf-read-vdsox32 $(OUTPUT)pmu-events/jevents $(OUTPUT)$(LIBJVMTI).so
+1
tools/perf/arch/x86/util/Build
··· 9 9 perf-y += evlist.o 10 10 perf-y += mem-events.o 11 11 perf-y += evsel.o 12 + perf-y += iostat.o 12 13 13 14 perf-$(CONFIG_DWARF) += dwarf-regs.o 14 15 perf-$(CONFIG_BPF_PROLOGUE) += dwarf-regs.o
+360
tools/perf/arch/x86/util/iostat.c
··· 27 27 #include "util/counts.h" 28 28 #include "path.h" 29 29 30 + #ifndef MAX_PATH 31 + #define MAX_PATH 1024 32 + #endif 33 + 34 + #define UNCORE_IIO_PMU_PATH "devices/uncore_iio_%d" 35 + #define SYSFS_UNCORE_PMU_PATH "%s/"UNCORE_IIO_PMU_PATH 36 + #define PLATFORM_MAPPING_PATH UNCORE_IIO_PMU_PATH"/die%d" 37 + 38 + /* 39 + * Each metric requiries one IIO event which increments at every 4B transfer 40 + * in corresponding direction. The formulas to compute metrics are generic: 41 + * #EventCount * 4B / (1024 * 1024) 42 + */ 43 + static const char * const iostat_metrics[] = { 44 + "Inbound Read(MB)", 45 + "Inbound Write(MB)", 46 + "Outbound Read(MB)", 47 + "Outbound Write(MB)", 48 + }; 49 + 50 + static inline int iostat_metrics_count(void) 51 + { 52 + return sizeof(iostat_metrics) / sizeof(char *); 53 + } 54 + 55 + static const char *iostat_metric_by_idx(int idx) 56 + { 57 + return *(iostat_metrics + idx % iostat_metrics_count()); 58 + } 59 + 30 60 struct iio_root_port { 31 61 u32 domain; 32 62 u8 bus; ··· 69 39 struct iio_root_port **rps; 70 40 int nr_entries; 71 41 }; 42 + 43 + static struct iio_root_ports_list *root_ports; 72 44 73 45 static void iio_root_port_show(FILE *output, 74 46 const struct iio_root_port * const rp) ··· 139 107 list->rps = tmp_buf; 140 108 } 141 109 return 0; 110 + } 111 + 112 + static int iio_mapping(u8 pmu_idx, struct iio_root_ports_list * const list) 113 + { 114 + char *buf; 115 + char path[MAX_PATH]; 116 + u32 domain; 117 + u8 bus; 118 + struct iio_root_port *rp; 119 + size_t size; 120 + int ret; 121 + 122 + for (int die = 0; die < cpu__max_node(); die++) { 123 + scnprintf(path, MAX_PATH, PLATFORM_MAPPING_PATH, pmu_idx, die); 124 + if (sysfs__read_str(path, &buf, &size) < 0) { 125 + if (pmu_idx) 126 + goto out; 127 + pr_err("Mode iostat is not supported\n"); 128 + return -1; 129 + } 130 + ret = sscanf(buf, "%04x:%02hhx", &domain, &bus); 131 + free(buf); 132 + if (ret != 2) { 133 + pr_err("Invalid mapping data: iio_%d; die%d\n", 134 
+ pmu_idx, die); 135 + return -1; 136 + } 137 + rp = iio_root_port_new(domain, bus, die, pmu_idx); 138 + if (!rp || iio_root_ports_list_insert(list, rp)) { 139 + free(rp); 140 + return -ENOMEM; 141 + } 142 + } 143 + out: 144 + return 0; 145 + } 146 + 147 + static u8 iio_pmu_count(void) 148 + { 149 + u8 pmu_idx = 0; 150 + char path[MAX_PATH]; 151 + const char *sysfs = sysfs__mountpoint(); 152 + 153 + if (sysfs) { 154 + for (;; pmu_idx++) { 155 + snprintf(path, sizeof(path), SYSFS_UNCORE_PMU_PATH, 156 + sysfs, pmu_idx); 157 + if (access(path, F_OK) != 0) 158 + break; 159 + } 160 + } 161 + return pmu_idx; 162 + } 163 + 164 + static int iio_root_ports_scan(struct iio_root_ports_list **list) 165 + { 166 + int ret = -ENOMEM; 167 + struct iio_root_ports_list *tmp_list; 168 + u8 pmu_count = iio_pmu_count(); 169 + 170 + if (!pmu_count) { 171 + pr_err("Unsupported uncore pmu configuration\n"); 172 + return -1; 173 + } 174 + 175 + tmp_list = calloc(1, sizeof(*tmp_list)); 176 + if (!tmp_list) 177 + goto err; 178 + 179 + for (u8 pmu_idx = 0; pmu_idx < pmu_count; pmu_idx++) { 180 + ret = iio_mapping(pmu_idx, tmp_list); 181 + if (ret) 182 + break; 183 + } 184 + err: 185 + if (!ret) 186 + *list = tmp_list; 187 + else 188 + iio_root_ports_list_free(tmp_list); 189 + 190 + return ret; 191 + } 192 + 193 + static int iio_root_port_parse_str(u32 *domain, u8 *bus, char *str) 194 + { 195 + int ret; 196 + regex_t regex; 197 + /* 198 + * Expected format domain:bus: 199 + * Valid domain range [0:ffff] 200 + * Valid bus range [0:ff] 201 + * Example: 0000:af, 0:3d, 01:7 202 + */ 203 + regcomp(&regex, "^([a-f0-9A-F]{1,}):([a-f0-9A-F]{1,2})", REG_EXTENDED); 204 + ret = regexec(&regex, str, 0, NULL, 0); 205 + if (ret || sscanf(str, "%08x:%02hhx", domain, bus) != 2) 206 + pr_warning("Unrecognized root port format: %s\n" 207 + "Please use the following format:\n" 208 + "\t [domain]:[bus]\n" 209 + "\t for example: 0000:3d\n", str); 210 + 211 + regfree(&regex); 212 + return ret; 213 + } 214 + 215 + 
static int iio_root_ports_list_filter(struct iio_root_ports_list **list, 216 + const char *filter) 217 + { 218 + char *tok, *tmp, *filter_copy = NULL; 219 + struct iio_root_port *rp; 220 + u32 domain; 221 + u8 bus; 222 + int ret = -ENOMEM; 223 + struct iio_root_ports_list *tmp_list = calloc(1, sizeof(*tmp_list)); 224 + 225 + if (!tmp_list) 226 + goto err; 227 + 228 + filter_copy = strdup(filter); 229 + if (!filter_copy) 230 + goto err; 231 + 232 + for (tok = strtok_r(filter_copy, ",", &tmp); tok; 233 + tok = strtok_r(NULL, ",", &tmp)) { 234 + if (!iio_root_port_parse_str(&domain, &bus, tok)) { 235 + rp = iio_root_port_find_by_notation(*list, domain, bus); 236 + if (rp) { 237 + (*list)->rps[rp->idx] = NULL; 238 + ret = iio_root_ports_list_insert(tmp_list, rp); 239 + if (ret) { 240 + free(rp); 241 + goto err; 242 + } 243 + } else if (!iio_root_port_find_by_notation(tmp_list, 244 + domain, bus)) 245 + pr_warning("Root port %04x:%02x were not found\n", 246 + domain, bus); 247 + } 248 + } 249 + 250 + if (tmp_list->nr_entries == 0) { 251 + pr_err("Requested root ports were not found\n"); 252 + ret = -EINVAL; 253 + } 254 + err: 255 + iio_root_ports_list_free(*list); 256 + if (ret) 257 + iio_root_ports_list_free(tmp_list); 258 + else 259 + *list = tmp_list; 260 + 261 + free(filter_copy); 262 + return ret; 263 + } 264 + 265 + static int iostat_event_group(struct evlist *evl, 266 + struct iio_root_ports_list *list) 267 + { 268 + int ret; 269 + int idx; 270 + const char *iostat_cmd_template = 271 + "{uncore_iio_%x/event=0x83,umask=0x04,ch_mask=0xF,fc_mask=0x07/,\ 272 + uncore_iio_%x/event=0x83,umask=0x01,ch_mask=0xF,fc_mask=0x07/,\ 273 + uncore_iio_%x/event=0xc0,umask=0x04,ch_mask=0xF,fc_mask=0x07/,\ 274 + uncore_iio_%x/event=0xc0,umask=0x01,ch_mask=0xF,fc_mask=0x07/}"; 275 + const int len_template = strlen(iostat_cmd_template) + 1; 276 + struct evsel *evsel = NULL; 277 + int metrics_count = iostat_metrics_count(); 278 + char *iostat_cmd = calloc(len_template, 1); 279 + 280 + 
if (!iostat_cmd) 281 + return -ENOMEM; 282 + 283 + for (idx = 0; idx < list->nr_entries; idx++) { 284 + sprintf(iostat_cmd, iostat_cmd_template, 285 + list->rps[idx]->pmu_idx, list->rps[idx]->pmu_idx, 286 + list->rps[idx]->pmu_idx, list->rps[idx]->pmu_idx); 287 + ret = parse_events(evl, iostat_cmd, NULL); 288 + if (ret) 289 + goto err; 290 + } 291 + 292 + evlist__for_each_entry(evl, evsel) { 293 + evsel->priv = list->rps[evsel->idx / metrics_count]; 294 + } 295 + list->nr_entries = 0; 296 + err: 297 + iio_root_ports_list_free(list); 298 + free(iostat_cmd); 299 + return ret; 300 + } 301 + 302 + int iostat_prepare(struct evlist *evlist, struct perf_stat_config *config) 303 + { 304 + if (evlist->core.nr_entries > 0) { 305 + pr_warning("The -e and -M options are not supported." 306 + "All chosen events/metrics will be dropped\n"); 307 + evlist__delete(evlist); 308 + evlist = evlist__new(); 309 + if (!evlist) 310 + return -ENOMEM; 311 + } 312 + 313 + config->metric_only = true; 314 + config->aggr_mode = AGGR_GLOBAL; 315 + 316 + return iostat_event_group(evlist, root_ports); 317 + } 318 + 319 + int iostat_parse(const struct option *opt, const char *str, 320 + int unset __maybe_unused) 321 + { 322 + int ret; 323 + struct perf_stat_config *config = (struct perf_stat_config *)opt->data; 324 + 325 + ret = iio_root_ports_scan(&root_ports); 326 + if (!ret) { 327 + config->iostat_run = true; 328 + if (!str) 329 + iostat_mode = IOSTAT_RUN; 330 + else if (!strcmp(str, "list")) 331 + iostat_mode = IOSTAT_LIST; 332 + else { 333 + iostat_mode = IOSTAT_RUN; 334 + ret = iio_root_ports_list_filter(&root_ports, str); 335 + } 336 + } 337 + return ret; 338 + } 339 + 340 + void iostat_list(struct evlist *evlist, struct perf_stat_config *config) 341 + { 342 + struct evsel *evsel; 343 + struct iio_root_port *rp = NULL; 344 + 345 + evlist__for_each_entry(evlist, evsel) { 346 + if (rp != evsel->priv) { 347 + rp = evsel->priv; 348 + iio_root_port_show(config->output, rp); 349 + } 350 + } 351 + 
} 352 + 353 + void iostat_release(struct evlist *evlist) 354 + { 355 + struct evsel *evsel; 356 + struct iio_root_port *rp = NULL; 357 + 358 + evlist__for_each_entry(evlist, evsel) { 359 + if (rp != evsel->priv) { 360 + rp = evsel->priv; 361 + free(evsel->priv); 362 + } 363 + } 364 + } 365 + 366 + void iostat_prefix(struct evlist *evlist, 367 + struct perf_stat_config *config, 368 + char *prefix, struct timespec *ts) 369 + { 370 + struct iio_root_port *rp = evlist->selected->priv; 371 + 372 + if (rp) { 373 + if (ts) 374 + sprintf(prefix, "%6lu.%09lu%s%04x:%02x%s", 375 + ts->tv_sec, ts->tv_nsec, 376 + config->csv_sep, rp->domain, rp->bus, 377 + config->csv_sep); 378 + else 379 + sprintf(prefix, "%04x:%02x%s", rp->domain, rp->bus, 380 + config->csv_sep); 381 + } 382 + } 383 + 384 + void iostat_print_header_prefix(struct perf_stat_config *config) 385 + { 386 + if (config->csv_output) 387 + fputs("port,", config->output); 388 + else if (config->interval) 389 + fprintf(config->output, "# time port "); 390 + else 391 + fprintf(config->output, " port "); 392 + } 393 + 394 + void iostat_print_metric(struct perf_stat_config *config, struct evsel *evsel, 395 + struct perf_stat_output_ctx *out) 396 + { 397 + double iostat_value = 0; 398 + u64 prev_count_val = 0; 399 + const char *iostat_metric = iostat_metric_by_idx(evsel->idx); 400 + u8 die = ((struct iio_root_port *)evsel->priv)->die; 401 + struct perf_counts_values *count = perf_counts(evsel->counts, die, 0); 402 + 403 + if (count->run && count->ena) { 404 + if (evsel->prev_raw_counts && !out->force_header) { 405 + struct perf_counts_values *prev_count = 406 + perf_counts(evsel->prev_raw_counts, die, 0); 407 + 408 + prev_count_val = prev_count->val; 409 + prev_count->val = count->val; 410 + } 411 + iostat_value = (count->val - prev_count_val) / 412 + ((double) count->run / count->ena); 413 + } 414 + out->print_metric(config, out->ctx, NULL, "%8.0f", iostat_metric, 415 + iostat_value / (256 * 1024)); 416 + } 417 + 418 + 
/*
 * Print all counters of @evlist, one output row per root port.
 *
 * The row prefix is rebuilt with iostat_prefix() whenever the root port
 * attached to the next event differs from the one of the currently selected
 * event; every event is then handed to @print_cnt_cb together with the
 * current prefix.
 */
void iostat_print_counters(struct evlist *evlist,
			   struct perf_stat_config *config, struct timespec *ts,
			   char *prefix, iostat_print_counter_t print_cnt_cb)
{
	struct evsel *evsel = evlist__first(evlist);
	void *current_port;

	/* Open the first row with the prefix of the first event. */
	evlist__set_selected(evlist, evsel);
	iostat_prefix(evlist, config, prefix, ts);
	fprintf(config->output, "%s", prefix);

	evlist__for_each_entry(evlist, evsel) {
		current_port = evlist->selected->priv;
		if (current_port && current_port != evsel->priv) {
			/* Root port changed: start a new row. */
			evlist__set_selected(evlist, evsel);
			iostat_prefix(evlist, config, prefix, ts);
			fprintf(config->output, "\n%s", prefix);
		}
		print_cnt_cb(config, evsel, prefix);
	}

	fputc('\n', config->output);
}
+1
tools/perf/command-list.txt
··· 14 14 perf-evlist mainporcelain common 15 15 perf-ftrace mainporcelain common 16 16 perf-inject mainporcelain common 17 + perf-iostat mainporcelain common 17 18 perf-kallsyms mainporcelain common 18 19 perf-kmem mainporcelain common 19 20 perf-kvm mainporcelain common
+12
tools/perf/perf-iostat.sh
#!/bin/bash
# SPDX-License-Identifier: GPL-2.0
# perf iostat
# Alexander Antonov <alexander.antonov@linux.intel.com>
#
# Thin wrapper that turns "perf iostat ..." into "perf stat --iostat...".
#
# When the first argument is "list" or looks like a root port filter
# (domain:bus[,...]) it is attached to the option as "--iostat=<arg>";
# otherwise --iostat is passed without a value. All remaining arguments are
# forwarded unchanged.

if [[ "$1" == "list" ]] || [[ "$1" =~ ([a-f0-9A-F]{1,}):([a-f0-9A-F]{1,2})(,)? ]]; then
	# "$@" instead of unquoted $*: keeps every argument of the user's
	# command as one word (no re-splitting on spaces, no glob expansion).
	perf stat "--iostat=$1" "${@:2}"
else
	perf stat --iostat "$@"
fi