Merge tag 'perf-core-for-mingo-4.14-20170901' of git://git.kernel.org/pub/scm/linux/kernel/git/acme/linux into perf/urgent

Pull perf/core improvements and fixes from Arnaldo Carvalho de Melo:

- Support syscall name glob matching in 'perf trace' (Arnaldo Carvalho de Melo)

e.g.:

# perf trace -e pkey_*
32.784 (0.006 ms): pkey/16018 pkey_alloc(init_val: DISABLE_WRITE) = -1 EINVAL Invalid argument
32.795 (0.004 ms): pkey/16018 pkey_mprotect(start: 0x7f380d0a6000, len: 4096, prot: READ|WRITE, pkey: -1) = 0
32.801 (0.002 ms): pkey/16018 pkey_free(pkey: -1 ) = -1 EINVAL Invalid argument
^C#

- Do not auto merge counts for explicitly specified events in
'perf stat' (Arnaldo Carvalho de Melo)

- Fix syntax in documentation of .perfconfig intel-pt option (Jack Henschel)

- Calculate the average cycles of iterations for loops detected by the
branch history support in 'perf report' (Jin Yao)

- Support PERF_SAMPLE_PHYS_ADDR as a sort key "phys_daddr" in the 'script', 'mem',
'top' and 'report' tools. Also add a test entry for it in 'perf test' (Kan Liang)

- Fix 'Object code reading' 'perf test' entry in PowerPC (Ravi Bangoria)

- Remove some duplicate Power9 vendor events (described in JSON
files) (Sukadev Bhattiprolu)

- Add help entry in the TUI annotate browser about cycling through hottest
instructions with TAB/shift+TAB (Arnaldo Carvalho de Melo)

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>

+366 -266
+3 -1
tools/include/uapi/linux/perf_event.h
··· 139 139 PERF_SAMPLE_IDENTIFIER = 1U << 16, 140 140 PERF_SAMPLE_TRANSACTION = 1U << 17, 141 141 PERF_SAMPLE_REGS_INTR = 1U << 18, 142 + PERF_SAMPLE_PHYS_ADDR = 1U << 19, 142 143 143 - PERF_SAMPLE_MAX = 1U << 19, /* non-ABI */ 144 + PERF_SAMPLE_MAX = 1U << 20, /* non-ABI */ 144 145 }; 145 146 146 147 /* ··· 815 814 * { u64 transaction; } && PERF_SAMPLE_TRANSACTION 816 815 * { u64 abi; # enum perf_sample_regs_abi 817 816 * u64 regs[weight(mask)]; } && PERF_SAMPLE_REGS_INTR 817 + * { u64 phys_addr;} && PERF_SAMPLE_PHYS_ADDR 818 818 * }; 819 819 */ 820 820 PERF_RECORD_SAMPLE = 9,
+1 -1
tools/perf/Documentation/intel-pt.txt
··· 873 873 874 874 $ cat ~/.perfconfig 875 875 [intel-pt] 876 - mispred-all 876 + mispred-all = on 877 877 878 878 $ perf record -e intel_pt//u ./sort 3000 879 879 Bubble sorting array of 3000 elements
+4
tools/perf/Documentation/perf-mem.txt
··· 59 59 --ldload:: 60 60 Specify desired latency for loads event. 61 61 62 + -p:: 63 + --phys-data:: 64 + Record/Report sample physical addresses 65 + 62 66 SEE ALSO 63 67 -------- 64 68 linkperf:perf-record[1], linkperf:perf-report[1]
+4 -1
tools/perf/Documentation/perf-record.txt
··· 249 249 250 250 -d:: 251 251 --data:: 252 - Record the sample addresses. 252 + Record the sample virtual addresses. 253 + 254 + --phys-data:: 255 + Record the sample physical addresses. 253 256 254 257 -T:: 255 258 --timestamp::
+1
tools/perf/Documentation/perf-report.txt
··· 137 137 - mem: type of memory access for the data at the time of the sample 138 138 - snoop: type of snoop (if any) for the data at the time of the sample 139 139 - dcacheline: the cacheline the data address is on at the time of the sample 140 + - phys_daddr: physical address of data being executed on at the time of sample 140 141 141 142 And the default sort keys are changed to local_weight, mem, sym, dso, 142 143 symbol_daddr, dso_daddr, snoop, tlb, locked, see '--mem-mode'.
+1 -1
tools/perf/Documentation/perf-script.txt
··· 117 117 Comma separated list of fields to print. Options are: 118 118 comm, tid, pid, time, cpu, event, trace, ip, sym, dso, addr, symoff, 119 119 srcline, period, iregs, brstack, brstacksym, flags, bpf-output, brstackinsn, brstackoff, 120 - callindent, insn, insnlen, synth. 120 + callindent, insn, insnlen, synth, phys_addr. 121 121 Field list can be prepended with the type, trace, sw or hw, 122 122 to indicate to which event type the field list applies. 123 123 e.g., -F sw:comm,tid,time,ip,sym and -F trace:time,cpu,trace
+1 -1
tools/perf/Documentation/perf-trace.txt
··· 37 37 --expr:: 38 38 --event:: 39 39 List of syscalls and other perf events (tracepoints, HW cache events, 40 - etc) to show. 40 + etc) to show. Globbing is supported, e.g.: "epoll_*", "*msg*", etc. 41 41 See 'perf list' for a complete list of events. 42 42 Prefixing with ! shows all syscalls but the ones specified. You may 43 43 need to escape it.
+72 -27
tools/perf/builtin-mem.c
··· 23 23 bool hide_unresolved; 24 24 bool dump_raw; 25 25 bool force; 26 + bool phys_addr; 26 27 int operation; 27 28 const char *cpu_list; 28 29 DECLARE_BITMAP(cpu_bitmap, MAX_NR_CPUS); ··· 102 101 103 102 rec_argv[i++] = "-d"; 104 103 104 + if (mem->phys_addr) 105 + rec_argv[i++] = "--phys-data"; 106 + 105 107 for (j = 0; j < PERF_MEM_EVENTS__MAX; j++) { 106 108 if (!perf_mem_events[j].record) 107 109 continue; ··· 165 161 if (al.map != NULL) 166 162 al.map->dso->hit = 1; 167 163 168 - if (symbol_conf.field_sep) { 169 - fmt = "%d%s%d%s0x%"PRIx64"%s0x%"PRIx64"%s%"PRIu64 170 - "%s0x%"PRIx64"%s%s:%s\n"; 171 - } else { 172 - fmt = "%5d%s%5d%s0x%016"PRIx64"%s0x016%"PRIx64 173 - "%s%5"PRIu64"%s0x%06"PRIx64"%s%s:%s\n"; 174 - symbol_conf.field_sep = " "; 175 - } 164 + if (mem->phys_addr) { 165 + if (symbol_conf.field_sep) { 166 + fmt = "%d%s%d%s0x%"PRIx64"%s0x%"PRIx64"%s0x%016"PRIx64 167 + "%s%"PRIu64"%s0x%"PRIx64"%s%s:%s\n"; 168 + } else { 169 + fmt = "%5d%s%5d%s0x%016"PRIx64"%s0x016%"PRIx64 170 + "%s0x%016"PRIx64"%s%5"PRIu64"%s0x%06"PRIx64 171 + "%s%s:%s\n"; 172 + symbol_conf.field_sep = " "; 173 + } 176 174 177 - printf(fmt, 178 - sample->pid, 179 - symbol_conf.field_sep, 180 - sample->tid, 181 - symbol_conf.field_sep, 182 - sample->ip, 183 - symbol_conf.field_sep, 184 - sample->addr, 185 - symbol_conf.field_sep, 186 - sample->weight, 187 - symbol_conf.field_sep, 188 - sample->data_src, 189 - symbol_conf.field_sep, 190 - al.map ? (al.map->dso ? al.map->dso->long_name : "???") : "???", 191 - al.sym ? al.sym->name : "???"); 175 + printf(fmt, 176 + sample->pid, 177 + symbol_conf.field_sep, 178 + sample->tid, 179 + symbol_conf.field_sep, 180 + sample->ip, 181 + symbol_conf.field_sep, 182 + sample->addr, 183 + symbol_conf.field_sep, 184 + sample->phys_addr, 185 + symbol_conf.field_sep, 186 + sample->weight, 187 + symbol_conf.field_sep, 188 + sample->data_src, 189 + symbol_conf.field_sep, 190 + al.map ? (al.map->dso ? al.map->dso->long_name : "???") : "???", 191 + al.sym ? 
al.sym->name : "???"); 192 + } else { 193 + if (symbol_conf.field_sep) { 194 + fmt = "%d%s%d%s0x%"PRIx64"%s0x%"PRIx64"%s%"PRIu64 195 + "%s0x%"PRIx64"%s%s:%s\n"; 196 + } else { 197 + fmt = "%5d%s%5d%s0x%016"PRIx64"%s0x016%"PRIx64 198 + "%s%5"PRIu64"%s0x%06"PRIx64"%s%s:%s\n"; 199 + symbol_conf.field_sep = " "; 200 + } 201 + 202 + printf(fmt, 203 + sample->pid, 204 + symbol_conf.field_sep, 205 + sample->tid, 206 + symbol_conf.field_sep, 207 + sample->ip, 208 + symbol_conf.field_sep, 209 + sample->addr, 210 + symbol_conf.field_sep, 211 + sample->weight, 212 + symbol_conf.field_sep, 213 + sample->data_src, 214 + symbol_conf.field_sep, 215 + al.map ? (al.map->dso ? al.map->dso->long_name : "???") : "???", 216 + al.sym ? al.sym->name : "???"); 217 + } 192 218 out_put: 193 219 addr_location__put(&al); 194 220 return 0; ··· 258 224 if (ret < 0) 259 225 goto out_delete; 260 226 261 - printf("# PID, TID, IP, ADDR, LOCAL WEIGHT, DSRC, SYMBOL\n"); 227 + if (mem->phys_addr) 228 + printf("# PID, TID, IP, ADDR, PHYS ADDR, LOCAL WEIGHT, DSRC, SYMBOL\n"); 229 + else 230 + printf("# PID, TID, IP, ADDR, LOCAL WEIGHT, DSRC, SYMBOL\n"); 262 231 263 232 ret = perf_session__process_events(session); 264 233 ··· 291 254 * there is no weight (cost) associated with stores, so don't print 292 255 * the column 293 256 */ 294 - if (!(mem->operation & MEM_OPERATION_LOAD)) 295 - rep_argv[i++] = "--sort=mem,sym,dso,symbol_daddr," 296 - "dso_daddr,tlb,locked"; 257 + if (!(mem->operation & MEM_OPERATION_LOAD)) { 258 + if (mem->phys_addr) 259 + rep_argv[i++] = "--sort=mem,sym,dso,symbol_daddr," 260 + "dso_daddr,tlb,locked,phys_daddr"; 261 + else 262 + rep_argv[i++] = "--sort=mem,sym,dso,symbol_daddr," 263 + "dso_daddr,tlb,locked"; 264 + } else if (mem->phys_addr) 265 + rep_argv[i++] = "--sort=local_weight,mem,sym,dso,symbol_daddr," 266 + "dso_daddr,snoop,tlb,locked,phys_daddr"; 297 267 298 268 for (j = 1; j < argc; j++, i++) 299 269 rep_argv[i] = argv[j]; ··· 417 373 "separator for columns, no spaces 
will be added" 418 374 " between columns '.' is reserved."), 419 375 OPT_BOOLEAN('f', "force", &mem.force, "don't complain, do it"), 376 + OPT_BOOLEAN('p', "phys-data", &mem.phys_addr, "Record/Report sample physical addresses"), 420 377 OPT_END() 421 378 }; 422 379 const char *const mem_subcommands[] = { "record", "report", NULL };
+2
tools/perf/builtin-record.c
··· 1604 1604 OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat, 1605 1605 "per thread counts"), 1606 1606 OPT_BOOLEAN('d', "data", &record.opts.sample_address, "Record the sample addresses"), 1607 + OPT_BOOLEAN(0, "phys-data", &record.opts.sample_phys_addr, 1608 + "Record the sample physical addresses"), 1607 1609 OPT_BOOLEAN(0, "sample-cpu", &record.opts.sample_cpu, "Record the sample cpu"), 1608 1610 OPT_BOOLEAN_SET('T', "timestamp", &record.opts.sample_time, 1609 1611 &record.opts.sample_time_set,
+13 -2
tools/perf/builtin-script.c
··· 87 87 PERF_OUTPUT_BRSTACKINSN = 1U << 23, 88 88 PERF_OUTPUT_BRSTACKOFF = 1U << 24, 89 89 PERF_OUTPUT_SYNTH = 1U << 25, 90 + PERF_OUTPUT_PHYS_ADDR = 1U << 26, 90 91 }; 91 92 92 93 struct output_option { ··· 120 119 {.str = "brstackinsn", .field = PERF_OUTPUT_BRSTACKINSN}, 121 120 {.str = "brstackoff", .field = PERF_OUTPUT_BRSTACKOFF}, 122 121 {.str = "synth", .field = PERF_OUTPUT_SYNTH}, 122 + {.str = "phys_addr", .field = PERF_OUTPUT_PHYS_ADDR}, 123 123 }; 124 124 125 125 enum { ··· 177 175 PERF_OUTPUT_EVNAME | PERF_OUTPUT_IP | 178 176 PERF_OUTPUT_SYM | PERF_OUTPUT_DSO | 179 177 PERF_OUTPUT_PERIOD | PERF_OUTPUT_ADDR | 180 - PERF_OUTPUT_DATA_SRC | PERF_OUTPUT_WEIGHT, 178 + PERF_OUTPUT_DATA_SRC | PERF_OUTPUT_WEIGHT | 179 + PERF_OUTPUT_PHYS_ADDR, 181 180 182 181 .invalid_fields = PERF_OUTPUT_TRACE | PERF_OUTPUT_BPF_OUTPUT, 183 182 }, ··· 383 380 if (PRINT_FIELD(IREGS) && 384 381 perf_evsel__check_stype(evsel, PERF_SAMPLE_REGS_INTR, "IREGS", 385 382 PERF_OUTPUT_IREGS)) 383 + return -EINVAL; 384 + 385 + if (PRINT_FIELD(PHYS_ADDR) && 386 + perf_evsel__check_stype(evsel, PERF_SAMPLE_PHYS_ADDR, "PHYS_ADDR", 387 + PERF_OUTPUT_PHYS_ADDR)) 386 388 return -EINVAL; 387 389 388 390 return 0; ··· 1454 1446 if (perf_evsel__is_bpf_output(evsel) && PRINT_FIELD(BPF_OUTPUT)) 1455 1447 print_sample_bpf_output(sample); 1456 1448 print_insn(sample, attr, thread, machine); 1449 + 1450 + if (PRINT_FIELD(PHYS_ADDR)) 1451 + printf("%16" PRIx64, sample->phys_addr); 1457 1452 printf("\n"); 1458 1453 } 1459 1454 ··· 2740 2729 "Valid types: hw,sw,trace,raw,synth. " 2741 2730 "Fields: comm,tid,pid,time,cpu,event,trace,ip,sym,dso," 2742 2731 "addr,symoff,period,iregs,brstack,brstacksym,flags," 2743 - "bpf-output,callindent,insn,insnlen,brstackinsn,synth", 2732 + "bpf-output,callindent,insn,insnlen,brstackinsn,synth,phys_addr", 2744 2733 parse_output_fields), 2745 2734 OPT_BOOLEAN('a', "all-cpus", &system_wide, 2746 2735 "system-wide collection from all CPUs"),
+1 -1
tools/perf/builtin-stat.c
··· 1257 1257 if (counter->merged_stat) 1258 1258 return false; 1259 1259 cb(counter, data, true); 1260 - if (!no_merge) 1260 + if (!no_merge && counter->auto_merge_stats) 1261 1261 collect_all_aliases(counter, cb, data); 1262 1262 return true; 1263 1263 }
+35 -4
tools/perf/builtin-trace.c
··· 1261 1261 static int trace__validate_ev_qualifier(struct trace *trace) 1262 1262 { 1263 1263 int err = 0, i; 1264 + size_t nr_allocated; 1264 1265 struct str_node *pos; 1265 1266 1266 1267 trace->ev_qualifier_ids.nr = strlist__nr_entries(trace->ev_qualifier); ··· 1275 1274 goto out; 1276 1275 } 1277 1276 1277 + nr_allocated = trace->ev_qualifier_ids.nr; 1278 1278 i = 0; 1279 1279 1280 1280 strlist__for_each_entry(pos, trace->ev_qualifier) { 1281 1281 const char *sc = pos->s; 1282 - int id = syscalltbl__id(trace->sctbl, sc); 1282 + int id = syscalltbl__id(trace->sctbl, sc), match_next = -1; 1283 1283 1284 1284 if (id < 0) { 1285 + id = syscalltbl__strglobmatch_first(trace->sctbl, sc, &match_next); 1286 + if (id >= 0) 1287 + goto matches; 1288 + 1285 1289 if (err == 0) { 1286 1290 fputs("Error:\tInvalid syscall ", trace->output); 1287 1291 err = -EINVAL; ··· 1296 1290 1297 1291 fputs(sc, trace->output); 1298 1292 } 1299 - 1293 + matches: 1300 1294 trace->ev_qualifier_ids.entries[i++] = id; 1295 + if (match_next == -1) 1296 + continue; 1297 + 1298 + while (1) { 1299 + id = syscalltbl__strglobmatch_next(trace->sctbl, sc, &match_next); 1300 + if (id < 0) 1301 + break; 1302 + if (nr_allocated == trace->ev_qualifier_ids.nr) { 1303 + void *entries; 1304 + 1305 + nr_allocated += 8; 1306 + entries = realloc(trace->ev_qualifier_ids.entries, 1307 + nr_allocated * sizeof(trace->ev_qualifier_ids.entries[0])); 1308 + if (entries == NULL) { 1309 + err = -ENOMEM; 1310 + fputs("\nError:\t Not enough memory for parsing\n", trace->output); 1311 + goto out_free; 1312 + } 1313 + trace->ev_qualifier_ids.entries = entries; 1314 + } 1315 + trace->ev_qualifier_ids.nr++; 1316 + trace->ev_qualifier_ids.entries[i++] = id; 1317 + } 1301 1318 } 1302 1319 1303 1320 if (err < 0) { 1304 1321 fputs("\nHint:\ttry 'perf list syscalls:sys_enter_*'" 1305 1322 "\nHint:\tand: 'man syscalls'\n", trace->output); 1323 + out_free: 1306 1324 zfree(&trace->ev_qualifier_ids.entries); 1307 1325 
trace->ev_qualifier_ids.nr = 0; 1308 1326 } ··· 2844 2814 struct trace *trace = (struct trace *)opt->value; 2845 2815 const char *s = str; 2846 2816 char *sep = NULL, *lists[2] = { NULL, NULL, }; 2847 - int len = strlen(str) + 1, err = -1, list; 2817 + int len = strlen(str) + 1, err = -1, list, idx; 2848 2818 char *strace_groups_dir = system_path(STRACE_GROUPS_DIR); 2849 2819 char group_name[PATH_MAX]; 2850 2820 ··· 2861 2831 *sep = '\0'; 2862 2832 2863 2833 list = 0; 2864 - if (syscalltbl__id(trace->sctbl, s) >= 0) { 2834 + if (syscalltbl__id(trace->sctbl, s) >= 0 || 2835 + syscalltbl__strglobmatch_first(trace->sctbl, s, &idx) >= 0) { 2865 2836 list = 1; 2866 2837 } else { 2867 2838 path__join(group_name, sizeof(group_name), strace_groups_dir, s);
+1
tools/perf/perf.h
··· 43 43 bool no_samples; 44 44 bool raw_samples; 45 45 bool sample_address; 46 + bool sample_phys_addr; 46 47 bool sample_weight; 47 48 bool sample_time; 48 49 bool sample_time_set;
-5
tools/perf/pmu-events/arch/powerpc/power9/frontend.json
··· 80 80 "BriefDescription": "Load Missed L1, counted at execution time (can be greater than loads finished). LMQ merges are not included in this count. i.e. if a load instruction misses on an address that is already allocated on the LMQ, this event will not increment for that load). Note that this count is per slice, so if a load spans multiple slices this event will increment multiple times for a single load." 81 81 }, 82 82 {, 83 - "EventCode": "0x400F0", 84 - "EventName": "PM_LD_MISS_L1", 85 - "BriefDescription": "Load Missed L1, counted at execution time (can be greater than loads finished). LMQ merges are not included in this count. i.e. if a load instruction misses on an address that is already allocated on the LMQ, this event will not increment for that load). Note that this count is per slice, so if a load spans multiple slices this event will increment multiple times for a single load." 86 - }, 87 - {, 88 83 "EventCode": "0x2E01A", 89 84 "EventName": "PM_CMPLU_STALL_LSU_FLUSH_NEXT", 90 85 "BriefDescription": "Completion stall of one cycle because the LSU requested to flush the next iop in the sequence. It takes 1 cycle for the ISU to process this request before the LSU instruction is allowed to complete"
-120
tools/perf/pmu-events/arch/powerpc/power9/other.json
··· 605 605 "BriefDescription": "RC retries on PB for any load from core (excludes DCBFs)" 606 606 }, 607 607 {, 608 - "EventCode": "0x3689E", 609 - "EventName": "PM_L2_RTY_LD", 610 - "BriefDescription": "RC retries on PB for any load from core (excludes DCBFs)" 611 - }, 612 - {, 613 608 "EventCode": "0xE08C", 614 609 "EventName": "PM_LSU0_ERAT_HIT", 615 610 "BriefDescription": "Primary ERAT hit. There is no secondary ERAT" ··· 706 711 }, 707 712 {, 708 713 "EventCode": "0x368B4", 709 - "EventName": "PM_L3_RD0_BUSY", 710 - "BriefDescription": "Lifetime, sample of RD machine 0 valid" 711 - }, 712 - {, 713 - "EventCode": "0x468B4", 714 714 "EventName": "PM_L3_RD0_BUSY", 715 715 "BriefDescription": "Lifetime, sample of RD machine 0 valid" 716 716 }, ··· 840 850 "BriefDescription": "RC mach 0 Busy. Used by PMU to sample ave RC lifetime (mach0 used as sample point)" 841 851 }, 842 852 {, 843 - "EventCode": "0x2608C", 844 - "EventName": "PM_RC0_BUSY", 845 - "BriefDescription": "RC mach 0 Busy. Used by PMU to sample ave RC lifetime (mach0 used as sample point)" 846 - }, 847 - {, 848 853 "EventCode": "0x36082", 849 854 "EventName": "PM_L2_LD_DISP", 850 855 "BriefDescription": "All successful I-or-D side load dispatches for this thread (excludes i_l2mru_tch_reqs)." 851 - }, 852 - {, 853 - "EventCode": "0x1609E", 854 - "EventName": "PM_L2_LD_DISP", 855 - "BriefDescription": "All successful D side load dispatches for this thread (L2 miss + L2 hits)" 856 856 }, 857 857 {, 858 858 "EventCode": "0xF8B0", ··· 1020 1040 "BriefDescription": "L3 castouts in Mepf state for this thread" 1021 1041 }, 1022 1042 {, 1023 - "EventCode": "0x168A0", 1024 - "EventName": "PM_L3_CO_MEPF", 1025 - "BriefDescription": "L3 CO of line in Mep state (includes casthrough to memory). 
The Mepf state indicates that a line was brought in to satisfy an L3 prefetch request" 1026 - }, 1027 - {, 1028 1043 "EventCode": "0x460A2", 1029 1044 "EventName": "PM_L3_LAT_CI_HIT", 1030 1045 "BriefDescription": "L3 Lateral Castins Hit" ··· 1125 1150 "BriefDescription": "RC retries on PB for any store from core (excludes DCBFs)" 1126 1151 }, 1127 1152 {, 1128 - "EventCode": "0x4689E", 1129 - "EventName": "PM_L2_RTY_ST", 1130 - "BriefDescription": "RC retries on PB for any store from core (excludes DCBFs)" 1131 - }, 1132 - {, 1133 1153 "EventCode": "0x24040", 1134 1154 "EventName": "PM_INST_FROM_L2_MEPF", 1135 1155 "BriefDescription": "The processor's Instruction cache was reloaded from local core's L2 hit without dispatch conflicts on Mepf state. due to an instruction fetch (not prefetch)" ··· 1221 1251 }, 1222 1252 {, 1223 1253 "EventCode": "0x3608C", 1224 - "EventName": "PM_CO0_BUSY", 1225 - "BriefDescription": "CO mach 0 Busy. Used by PMU to sample ave CO lifetime (mach0 used as sample point)" 1226 - }, 1227 - {, 1228 - "EventCode": "0x4608C", 1229 1254 "EventName": "PM_CO0_BUSY", 1230 1255 "BriefDescription": "CO mach 0 Busy. Used by PMU to sample ave CO lifetime (mach0 used as sample point)" 1231 1256 }, ··· 1360 1395 "BriefDescription": "A Page Table Entry was loaded into the TLB from the local chip's Memory due to a instruction side request" 1361 1396 }, 1362 1397 {, 1363 - "EventCode": "0x40006", 1364 - "EventName": "PM_ISLB_MISS", 1365 - "BriefDescription": "Number of ISLB misses for this thread" 1366 - }, 1367 - {, 1368 1398 "EventCode": "0xD8A8", 1369 1399 "EventName": "PM_ISLB_MISS", 1370 1400 "BriefDescription": "Instruction SLB miss - Total of all segment sizes" ··· 1473 1513 "EventCode": "0x36080", 1474 1514 "EventName": "PM_L2_INST", 1475 1515 "BriefDescription": "All successful I-side dispatches for this thread (excludes i_l2mru_tch reqs)." 
1476 - }, 1477 - {, 1478 - "EventCode": "0x3609E", 1479 - "EventName": "PM_L2_INST", 1480 - "BriefDescription": "All successful I-side dispatches that were an L2 miss for this thread (excludes i_l2mru_tch reqs)" 1481 1516 }, 1482 1517 {, 1483 1518 "EventCode": "0x3504C", ··· 1645 1690 "BriefDescription": "All successful I-or-D side load dispatches for this thread that were L2 hits (excludes i_l2mru_tch_reqs)" 1646 1691 }, 1647 1692 {, 1648 - "EventCode": "0x2609E", 1649 - "EventName": "PM_L2_LD_HIT", 1650 - "BriefDescription": "All successful D side load dispatches for this thread that were L2 hits for this thread" 1651 - }, 1652 - {, 1653 1693 "EventCode": "0x168AC", 1654 1694 "EventName": "PM_L3_CI_USAGE", 1655 1695 "BriefDescription": "Rotating sample of 16 CI or CO actives" ··· 1745 1795 "BriefDescription": "Rotating sample of 8 WI valid" 1746 1796 }, 1747 1797 {, 1748 - "EventCode": "0x260B6", 1749 - "EventName": "PM_L3_WI0_BUSY", 1750 - "BriefDescription": "Rotating sample of 8 WI valid (duplicate)" 1751 - }, 1752 - {, 1753 1798 "EventCode": "0x368AC", 1754 - "EventName": "PM_L3_CO0_BUSY", 1755 - "BriefDescription": "Lifetime, sample of CO machine 0 valid" 1756 - }, 1757 - {, 1758 - "EventCode": "0x468AC", 1759 1799 "EventName": "PM_L3_CO0_BUSY", 1760 1800 "BriefDescription": "Lifetime, sample of CO machine 0 valid" 1761 1801 }, ··· 1776 1836 }, 1777 1837 {, 1778 1838 "EventCode": "0x160AE", 1779 - "EventName": "PM_L3_P0_PF_RTY", 1780 - "BriefDescription": "L3 PF received retry port 0, every retry counted" 1781 - }, 1782 - {, 1783 - "EventCode": "0x260AE", 1784 1839 "EventName": "PM_L3_P0_PF_RTY", 1785 1840 "BriefDescription": "L3 PF received retry port 0, every retry counted" 1786 1841 }, ··· 1830 1895 "BriefDescription": "Lifetime, sample of snooper machine 0 valid" 1831 1896 }, 1832 1897 {, 1833 - "EventCode": "0x460AC", 1834 - "EventName": "PM_L3_SN0_BUSY", 1835 - "BriefDescription": "Lifetime, sample of snooper machine 0 valid" 1836 - }, 1837 - {, 1838 
1898 "EventCode": "0x3005C", 1839 1899 "EventName": "PM_BFU_BUSY", 1840 1900 "BriefDescription": "Cycles in which all 4 Binary Floating Point units are busy. The BFU is running at capacity" ··· 1861 1931 }, 1862 1932 {, 1863 1933 "EventCode": "0x360B4", 1864 - "EventName": "PM_L3_PF0_BUSY", 1865 - "BriefDescription": "Lifetime, sample of PF machine 0 valid" 1866 - }, 1867 - {, 1868 - "EventCode": "0x460B4", 1869 1934 "EventName": "PM_L3_PF0_BUSY", 1870 1935 "BriefDescription": "Lifetime, sample of PF machine 0 valid" 1871 1936 }, ··· 2010 2085 "BriefDescription": "L3 CO received retry port 1 (memory only), every retry counted" 2011 2086 }, 2012 2087 {, 2013 - "EventCode": "0x468AE", 2014 - "EventName": "PM_L3_P1_CO_RTY", 2015 - "BriefDescription": "L3 CO received retry port 3 (memory only), every retry counted" 2016 - }, 2017 - {, 2018 2088 "EventCode": "0xC0AC", 2019 2089 "EventName": "PM_LSU_FLUSH_EMSH", 2020 2090 "BriefDescription": "An ERAT miss was detected after a set-p hit. Erat tracker indicates fail due to tlbmiss and the instruction gets flushed because the instruction was working on the wrong address" ··· 2115 2195 "BriefDescription": "SNP dispatched for a write and was M (true M); for DMA cacheinj this will pulse if rty/push is required (won't pulse if cacheinj is accepted)" 2116 2196 }, 2117 2197 {, 2118 - "EventCode": "0x46886", 2119 - "EventName": "PM_L2_SN_M_WR_DONE", 2120 - "BriefDescription": "SNP dispatched for a write and was M (true M); for DMA cacheinj this will pulse if rty/push is required (won't pulse if cacheinj is accepted)" 2121 - }, 2122 - {, 2123 2198 "EventCode": "0x489C", 2124 2199 "EventName": "PM_BR_CORECT_PRED_TAKEN_CMPL", 2125 2200 "BriefDescription": "Conditional Branch Completed in which the HW correctly predicted the direction as taken. Counted at completion time" ··· 2205 2290 "BriefDescription": "SN mach 0 Busy. 
Used by PMU to sample ave SN lifetime (mach0 used as sample point)" 2206 2291 }, 2207 2292 {, 2208 - "EventCode": "0x26090", 2209 - "EventName": "PM_SN0_BUSY", 2210 - "BriefDescription": "SN mach 0 Busy. Used by PMU to sample ave SN lifetime (mach0 used as sample point)" 2211 - }, 2212 - {, 2213 2293 "EventCode": "0x360AE", 2214 - "EventName": "PM_L3_P0_CO_RTY", 2215 - "BriefDescription": "L3 CO received retry port 0 (memory only), every retry counted" 2216 - }, 2217 - {, 2218 - "EventCode": "0x460AE", 2219 2294 "EventName": "PM_L3_P0_CO_RTY", 2220 2295 "BriefDescription": "L3 CO received retry port 0 (memory only), every retry counted" 2221 2296 }, ··· 2245 2340 "BriefDescription": "L3 PF received retry port 1, every retry counted" 2246 2341 }, 2247 2342 {, 2248 - "EventCode": "0x268AE", 2249 - "EventName": "PM_L3_P1_PF_RTY", 2250 - "BriefDescription": "L3 PF received retry port 3, every retry counted" 2251 - }, 2252 - {, 2253 2343 "EventCode": "0x46082", 2254 2344 "EventName": "PM_L2_ST_DISP", 2255 2345 "BriefDescription": "All successful D-side store dispatches for this thread " 2256 - }, 2257 - {, 2258 - "EventCode": "0x1689E", 2259 - "EventName": "PM_L2_ST_DISP", 2260 - "BriefDescription": "All successful D-side store dispatches for this thread (L2 miss + L2 hits)" 2261 - }, 2262 - {, 2263 - "EventCode": "0x36880", 2264 - "EventName": "PM_L2_INST_MISS", 2265 - "BriefDescription": "All successful I-side dispatches that were an L2 miss for this thread (excludes i_l2mru_tch reqs)" 2266 2346 }, 2267 2347 {, 2268 2348 "EventCode": "0x4609E", ··· 2320 2430 "BriefDescription": "# PPC Dispatched" 2321 2431 }, 2322 2432 {, 2323 - "EventCode": "0x300F2", 2324 - "EventName": "PM_INST_DISP", 2325 - "BriefDescription": "# PPC Dispatched" 2326 - }, 2327 - {, 2328 2433 "EventCode": "0x4E05E", 2329 2434 "EventName": "PM_TM_OUTER_TBEGIN_DISP", 2330 2435 "BriefDescription": "Number of outer tbegin instructions dispatched. 
The dispatch unit determines whether the tbegin instruction is outer or nested. This is a speculative count, which includes flushed instructions" ··· 2343 2458 "EventCode": "0x46882", 2344 2459 "EventName": "PM_L2_ST_HIT", 2345 2460 "BriefDescription": "All successful D-side store dispatches for this thread that were L2 hits" 2346 - }, 2347 - {, 2348 - "EventCode": "0x2689E", 2349 - "EventName": "PM_L2_ST_HIT", 2350 - "BriefDescription": "All successful D-side store dispatches that were L2 hits for this thread" 2351 2461 }, 2352 2462 {, 2353 2463 "EventCode": "0x360A8",
-5
tools/perf/pmu-events/arch/powerpc/power9/pipeline.json
··· 420 420 "BriefDescription": "Final Pump Scope (Group) ended up larger than Initial Pump Scope (Chip) for an instruction fetch" 421 421 }, 422 422 {, 423 - "EventCode": "0x10016", 424 - "EventName": "PM_DSLB_MISS", 425 - "BriefDescription": "Data SLB Miss - Total of all segment sizes" 426 - }, 427 - {, 428 423 "EventCode": "0xD0A8", 429 424 "EventName": "PM_DSLB_MISS", 430 425 "BriefDescription": "Data SLB Miss - Total of all segment sizes"
-5
tools/perf/pmu-events/arch/powerpc/power9/pmc.json
··· 5 5 "BriefDescription": "Branches that are not strongly biased" 6 6 }, 7 7 {, 8 - "EventCode": "0x40036", 9 - "EventName": "PM_BR_2PATH", 10 - "BriefDescription": "Branches that are not strongly biased" 11 - }, 12 - {, 13 8 "EventCode": "0x40056", 14 9 "EventName": "PM_MEM_LOC_THRESH_LSU_HIGH", 15 10 "BriefDescription": "Local memory above threshold for LSU medium"
+5
tools/perf/tests/code-reading.c
··· 237 237 238 238 thread__find_addr_map(thread, cpumode, MAP__FUNCTION, addr, &al); 239 239 if (!al.map || !al.map->dso) { 240 + if (cpumode == PERF_RECORD_MISC_HYPERVISOR) { 241 + pr_debug("Hypervisor address can not be resolved - skipping\n"); 242 + return 0; 243 + } 244 + 240 245 pr_debug("thread__find_addr_map failed\n"); 241 246 return -1; 242 247 }
+5 -1
tools/perf/tests/sample-parsing.c
··· 141 141 } 142 142 } 143 143 144 + if (type & PERF_SAMPLE_PHYS_ADDR) 145 + COMP(phys_addr); 146 + 144 147 return true; 145 148 } 146 149 ··· 209 206 .mask = sample_regs, 210 207 .regs = regs, 211 208 }, 209 + .phys_addr = 113, 212 210 }; 213 211 struct sample_read_value values[] = {{1, 5}, {9, 3}, {2, 7}, {6, 4},}; 214 212 struct perf_sample sample_out; ··· 309 305 * were added. Please actually update the test rather than just change 310 306 * the condition below. 311 307 */ 312 - if (PERF_SAMPLE_MAX > PERF_SAMPLE_REGS_INTR << 1) { 308 + if (PERF_SAMPLE_MAX > PERF_SAMPLE_PHYS_ADDR << 1) { 313 309 pr_debug("sample format has changed, some new PERF_SAMPLE_ bit was introduced - test needs updating\n"); 314 310 return -1; 315 311 }
+2 -1
tools/perf/ui/browsers/annotate.c
··· 829 829 "q/ESC/CTRL+C Exit\n\n" 830 830 "ENTER Go to target\n" 831 831 "ESC Exit\n" 832 - "H Cycle thru hottest instructions\n" 832 + "H Go to hottest instruction\n" 833 + "TAB/shift+TAB Cycle thru hottest instructions\n" 833 834 "j Toggle showing jump to target arrows\n" 834 835 "J Toggle showing number of jump sources on targets\n" 835 836 "n Search next string\n"
+2 -6
tools/perf/ui/browsers/hists.c
··· 931 931 browser->show_dso); 932 932 933 933 if (symbol_conf.show_branchflag_count) { 934 - if (need_percent) 935 - callchain_list_counts__printf_value(node, chain, NULL, 936 - buf, sizeof(buf)); 937 - else 938 - callchain_list_counts__printf_value(NULL, chain, NULL, 939 - buf, sizeof(buf)); 934 + callchain_list_counts__printf_value(chain, NULL, 935 + buf, sizeof(buf)); 940 936 941 937 if (asprintf(&alloc_str2, "%s%s", str, buf) < 0) 942 938 str = "Not enough memory!";
+3 -7
tools/perf/ui/stdio/hist.c
··· 124 124 str = callchain_list__sym_name(chain, bf, sizeof(bf), false); 125 125 126 126 if (symbol_conf.show_branchflag_count) { 127 - if (!period) 128 - callchain_list_counts__printf_value(node, chain, NULL, 129 - buf, sizeof(buf)); 130 - else 131 - callchain_list_counts__printf_value(NULL, chain, NULL, 132 - buf, sizeof(buf)); 127 + callchain_list_counts__printf_value(chain, NULL, 128 + buf, sizeof(buf)); 133 129 134 130 if (asprintf(&alloc_str, "%s%s", str, buf) < 0) 135 131 str = "Not enough memory!"; ··· 309 313 310 314 if (symbol_conf.show_branchflag_count) 311 315 ret += callchain_list_counts__printf_value( 312 - NULL, chain, fp, NULL, 0); 316 + chain, fp, NULL, 0); 313 317 ret += fprintf(fp, "\n"); 314 318 315 319 if (++entries_printed == callchain_param.print_limit)
+23 -26
tools/perf/util/callchain.c
··· 588 588 call->cycles_count = 589 589 cursor_node->branch_flags.cycles; 590 590 call->iter_count = cursor_node->nr_loop_iter; 591 - call->samples_count = cursor_node->samples; 591 + call->iter_cycles = cursor_node->iter_cycles; 592 592 } 593 593 } 594 594 ··· 722 722 cnode->cycles_count += 723 723 node->branch_flags.cycles; 724 724 cnode->iter_count += node->nr_loop_iter; 725 - cnode->samples_count += node->samples; 725 + cnode->iter_cycles += node->iter_cycles; 726 726 } 727 727 } 728 728 ··· 998 998 int callchain_cursor_append(struct callchain_cursor *cursor, 999 999 u64 ip, struct map *map, struct symbol *sym, 1000 1000 bool branch, struct branch_flags *flags, 1001 - int nr_loop_iter, int samples, u64 branch_from) 1001 + int nr_loop_iter, u64 iter_cycles, u64 branch_from) 1002 1002 { 1003 1003 struct callchain_cursor_node *node = *cursor->last; 1004 1004 ··· 1016 1016 node->sym = sym; 1017 1017 node->branch = branch; 1018 1018 node->nr_loop_iter = nr_loop_iter; 1019 - node->samples = samples; 1019 + node->iter_cycles = iter_cycles; 1020 1020 1021 1021 if (flags) 1022 1022 memcpy(&node->branch_flags, flags, ··· 1306 1306 static int branch_from_str(char *bf, int bfsize, 1307 1307 u64 branch_count, 1308 1308 u64 cycles_count, u64 iter_count, 1309 - u64 samples_count) 1309 + u64 iter_cycles) 1310 1310 { 1311 1311 int printed = 0, i = 0; 1312 1312 u64 cycles; ··· 1318 1318 bf + printed, bfsize - printed); 1319 1319 } 1320 1320 1321 - if (iter_count && samples_count) { 1322 - printed += count_pri64_printf(i++, "iterations", 1323 - iter_count / samples_count, 1321 + if (iter_count) { 1322 + printed += count_pri64_printf(i++, "iter", 1323 + iter_count, 1324 + bf + printed, bfsize - printed); 1325 + 1326 + printed += count_pri64_printf(i++, "avg_cycles", 1327 + iter_cycles / iter_count, 1324 1328 bf + printed, bfsize - printed); 1325 1329 } 1326 1330 ··· 1337 1333 static int counts_str_build(char *bf, int bfsize, 1338 1334 u64 branch_count, u64 predicted_count, 1339 
1335 u64 abort_count, u64 cycles_count, 1340 - u64 iter_count, u64 samples_count, 1336 + u64 iter_count, u64 iter_cycles, 1341 1337 struct branch_type_stat *brtype_stat) 1342 1338 { 1343 1339 int printed; ··· 1350 1346 predicted_count, abort_count, brtype_stat); 1351 1347 } else { 1352 1348 printed = branch_from_str(bf, bfsize, branch_count, 1353 - cycles_count, iter_count, samples_count); 1349 + cycles_count, iter_count, iter_cycles); 1354 1350 } 1355 1351 1356 1352 if (!printed) ··· 1362 1358 static int callchain_counts_printf(FILE *fp, char *bf, int bfsize, 1363 1359 u64 branch_count, u64 predicted_count, 1364 1360 u64 abort_count, u64 cycles_count, 1365 - u64 iter_count, u64 samples_count, 1361 + u64 iter_count, u64 iter_cycles, 1366 1362 struct branch_type_stat *brtype_stat) 1367 1363 { 1368 1364 char str[256]; 1369 1365 1370 1366 counts_str_build(str, sizeof(str), branch_count, 1371 1367 predicted_count, abort_count, cycles_count, 1372 - iter_count, samples_count, brtype_stat); 1368 + iter_count, iter_cycles, brtype_stat); 1373 1369 1374 1370 if (fp) 1375 1371 return fprintf(fp, "%s", str); ··· 1377 1373 return scnprintf(bf, bfsize, "%s", str); 1378 1374 } 1379 1375 1380 - int callchain_list_counts__printf_value(struct callchain_node *node, 1381 - struct callchain_list *clist, 1376 + int callchain_list_counts__printf_value(struct callchain_list *clist, 1382 1377 FILE *fp, char *bf, int bfsize) 1383 1378 { 1384 1379 u64 branch_count, predicted_count; 1385 1380 u64 abort_count, cycles_count; 1386 - u64 iter_count = 0, samples_count = 0; 1381 + u64 iter_count, iter_cycles; 1387 1382 1388 1383 branch_count = clist->branch_count; 1389 1384 predicted_count = clist->predicted_count; 1390 1385 abort_count = clist->abort_count; 1391 1386 cycles_count = clist->cycles_count; 1392 - 1393 - if (node) { 1394 - struct callchain_list *call; 1395 - 1396 - list_for_each_entry(call, &node->val, list) { 1397 - iter_count += call->iter_count; 1398 - samples_count += 
call->samples_count; 1399 - } 1400 - } 1387 + iter_count = clist->iter_count; 1388 + iter_cycles = clist->iter_cycles; 1401 1389 1402 1390 return callchain_counts_printf(fp, bf, bfsize, branch_count, 1403 1391 predicted_count, abort_count, 1404 - cycles_count, iter_count, samples_count, 1392 + cycles_count, iter_count, iter_cycles, 1405 1393 &clist->brtype_stat); 1406 1394 } 1407 1395 ··· 1519 1523 1520 1524 rc = callchain_cursor_append(dst, node->ip, node->map, node->sym, 1521 1525 node->branch, &node->branch_flags, 1522 - node->nr_loop_iter, node->samples, 1526 + node->nr_loop_iter, 1527 + node->iter_cycles, 1523 1528 node->branch_from); 1524 1529 if (rc) 1525 1530 break;
+4 -5
tools/perf/util/callchain.h
··· 119 119 u64 abort_count; 120 120 u64 cycles_count; 121 121 u64 iter_count; 122 - u64 samples_count; 122 + u64 iter_cycles; 123 123 struct branch_type_stat brtype_stat; 124 124 char *srcline; 125 125 struct list_head list; ··· 139 139 struct branch_flags branch_flags; 140 140 u64 branch_from; 141 141 int nr_loop_iter; 142 - int samples; 142 + u64 iter_cycles; 143 143 struct callchain_cursor_node *next; 144 144 }; 145 145 ··· 201 201 int callchain_cursor_append(struct callchain_cursor *cursor, u64 ip, 202 202 struct map *map, struct symbol *sym, 203 203 bool branch, struct branch_flags *flags, 204 - int nr_loop_iter, int samples, u64 branch_from); 204 + int nr_loop_iter, u64 iter_cycles, u64 branch_from); 205 205 206 206 /* Close a cursor writing session. Initialize for the reader */ 207 207 static inline void callchain_cursor_commit(struct callchain_cursor *cursor) ··· 282 282 int callchain_node__fprintf_value(struct callchain_node *node, 283 283 FILE *fp, u64 total); 284 284 285 - int callchain_list_counts__printf_value(struct callchain_node *node, 286 - struct callchain_list *clist, 285 + int callchain_list_counts__printf_value(struct callchain_list *clist, 287 286 FILE *fp, char *bf, int bfsize); 288 287 289 288 void free_callchain(struct callchain_root *root);
+1
tools/perf/util/event.h
··· 200 200 u32 cpu; 201 201 u32 raw_size; 202 202 u64 data_src; 203 + u64 phys_addr; 203 204 u32 flags; 204 205 u16 insn_len; 205 206 u8 cpumode;
+18 -1
tools/perf/util/evsel.c
··· 955 955 if (opts->sample_address) 956 956 perf_evsel__set_sample_bit(evsel, DATA_SRC); 957 957 958 + if (opts->sample_phys_addr) 959 + perf_evsel__set_sample_bit(evsel, PHYS_ADDR); 960 + 958 961 if (opts->no_buffering) { 959 962 attr->watermark = 0; 960 963 attr->wakeup_events = 1; ··· 1467 1464 bit_name(PERIOD), bit_name(STREAM_ID), bit_name(RAW), 1468 1465 bit_name(BRANCH_STACK), bit_name(REGS_USER), bit_name(STACK_USER), 1469 1466 bit_name(IDENTIFIER), bit_name(REGS_INTR), bit_name(DATA_SRC), 1470 - bit_name(WEIGHT), 1467 + bit_name(WEIGHT), bit_name(PHYS_ADDR), 1471 1468 { .name = NULL, } 1472 1469 }; 1473 1470 #undef bit_name ··· 2209 2206 } 2210 2207 } 2211 2208 2209 + data->phys_addr = 0; 2210 + if (type & PERF_SAMPLE_PHYS_ADDR) { 2211 + data->phys_addr = *array; 2212 + array++; 2213 + } 2214 + 2212 2215 return 0; 2213 2216 } 2214 2217 ··· 2319 2310 result += sizeof(u64); 2320 2311 } 2321 2312 } 2313 + 2314 + if (type & PERF_SAMPLE_PHYS_ADDR) 2315 + result += sizeof(u64); 2322 2316 2323 2317 return result; 2324 2318 } ··· 2510 2498 } else { 2511 2499 *array++ = 0; 2512 2500 } 2501 + } 2502 + 2503 + if (type & PERF_SAMPLE_PHYS_ADDR) { 2504 + *array = sample->phys_addr; 2505 + array++; 2513 2506 } 2514 2507 2515 2508 return 0;
+1
tools/perf/util/evsel.h
··· 131 131 bool cmdline_group_boundary; 132 132 struct list_head config_terms; 133 133 int bpf_fd; 134 + bool auto_merge_stats; 134 135 bool merged_stat; 135 136 const char * metric_expr; 136 137 const char * metric_name;
+4
tools/perf/util/hist.c
··· 167 167 symlen = unresolved_col_width + 4 + 2; 168 168 hists__set_unres_dso_col_len(hists, HISTC_MEM_DADDR_DSO); 169 169 } 170 + 171 + hists__new_col_len(hists, HISTC_MEM_PHYS_DADDR, 172 + unresolved_col_width + 4 + 2); 173 + 170 174 } else { 171 175 symlen = unresolved_col_width + 4 + 2; 172 176 hists__new_col_len(hists, HISTC_MEM_DADDR_SYMBOL, symlen);
+1
tools/perf/util/hist.h
··· 47 47 HISTC_GLOBAL_WEIGHT, 48 48 HISTC_MEM_DADDR_SYMBOL, 49 49 HISTC_MEM_DADDR_DSO, 50 + HISTC_MEM_PHYS_DADDR, 50 51 HISTC_MEM_LOCKED, 51 52 HISTC_MEM_TLB, 52 53 HISTC_MEM_LVL,
+59 -37
tools/perf/util/machine.c
··· 1635 1635 ams->al_addr = al.addr; 1636 1636 ams->sym = al.sym; 1637 1637 ams->map = al.map; 1638 + ams->phys_addr = 0; 1638 1639 } 1639 1640 1640 1641 static void ip__resolve_data(struct thread *thread, 1641 - u8 m, struct addr_map_symbol *ams, u64 addr) 1642 + u8 m, struct addr_map_symbol *ams, 1643 + u64 addr, u64 phys_addr) 1642 1644 { 1643 1645 struct addr_location al; 1644 1646 ··· 1660 1658 ams->al_addr = al.addr; 1661 1659 ams->sym = al.sym; 1662 1660 ams->map = al.map; 1661 + ams->phys_addr = phys_addr; 1663 1662 } 1664 1663 1665 1664 struct mem_info *sample__resolve_mem(struct perf_sample *sample, ··· 1672 1669 return NULL; 1673 1670 1674 1671 ip__resolve_ams(al->thread, &mi->iaddr, sample->ip); 1675 - ip__resolve_data(al->thread, al->cpumode, &mi->daddr, sample->addr); 1672 + ip__resolve_data(al->thread, al->cpumode, &mi->daddr, 1673 + sample->addr, sample->phys_addr); 1676 1674 mi->data_src.val = sample->data_src; 1677 1675 1678 1676 return mi; 1679 1677 } 1678 + 1679 + struct iterations { 1680 + int nr_loop_iter; 1681 + u64 cycles; 1682 + }; 1680 1683 1681 1684 static int add_callchain_ip(struct thread *thread, 1682 1685 struct callchain_cursor *cursor, ··· 1692 1683 u64 ip, 1693 1684 bool branch, 1694 1685 struct branch_flags *flags, 1695 - int nr_loop_iter, 1696 - int samples, 1686 + struct iterations *iter, 1697 1687 u64 branch_from) 1698 1688 { 1699 1689 struct addr_location al; 1690 + int nr_loop_iter = 0; 1691 + u64 iter_cycles = 0; 1700 1692 1701 1693 al.filtered = 0; 1702 1694 al.sym = NULL; ··· 1747 1737 1748 1738 if (symbol_conf.hide_unresolved && al.sym == NULL) 1749 1739 return 0; 1740 + 1741 + if (iter) { 1742 + nr_loop_iter = iter->nr_loop_iter; 1743 + iter_cycles = iter->cycles; 1744 + } 1745 + 1750 1746 return callchain_cursor_append(cursor, al.addr, al.map, al.sym, 1751 - branch, flags, nr_loop_iter, samples, 1752 - branch_from); 1747 + branch, flags, nr_loop_iter, 1748 + iter_cycles, branch_from); 1753 1749 } 1754 1750 1755 1751 
struct branch_info *sample__resolve_bstack(struct perf_sample *sample, ··· 1776 1760 return bi; 1777 1761 } 1778 1762 1763 + static void save_iterations(struct iterations *iter, 1764 + struct branch_entry *be, int nr) 1765 + { 1766 + int i; 1767 + 1768 + iter->nr_loop_iter = nr; 1769 + iter->cycles = 0; 1770 + 1771 + for (i = 0; i < nr; i++) 1772 + iter->cycles += be[i].flags.cycles; 1773 + } 1774 + 1779 1775 #define CHASHSZ 127 1780 1776 #define CHASHBITS 7 1781 1777 #define NO_ENTRY 0xff ··· 1795 1767 #define PERF_MAX_BRANCH_DEPTH 127 1796 1768 1797 1769 /* Remove loops. */ 1798 - static int remove_loops(struct branch_entry *l, int nr) 1770 + static int remove_loops(struct branch_entry *l, int nr, 1771 + struct iterations *iter) 1799 1772 { 1800 1773 int i, j, off; 1801 1774 unsigned char chash[CHASHSZ]; ··· 1821 1792 break; 1822 1793 } 1823 1794 if (is_loop) { 1824 - memmove(l + i, l + i + off, 1825 - (nr - (i + off)) * sizeof(*l)); 1795 + j = nr - (i + off); 1796 + if (j > 0) { 1797 + save_iterations(iter + i + off, 1798 + l + i, off); 1799 + 1800 + memmove(iter + i, iter + i + off, 1801 + j * sizeof(*iter)); 1802 + 1803 + memmove(l + i, l + i + off, 1804 + j * sizeof(*l)); 1805 + } 1806 + 1826 1807 nr -= off; 1827 1808 } 1828 1809 } ··· 1922 1883 1923 1884 err = add_callchain_ip(thread, cursor, parent, 1924 1885 root_al, &cpumode, ip, 1925 - branch, flags, 0, 0, 1886 + branch, flags, NULL, 1926 1887 branch_from); 1927 1888 if (err) 1928 1889 return (err < 0) ? err : 0; ··· 1948 1909 int i, j, err, nr_entries; 1949 1910 int skip_idx = -1; 1950 1911 int first_call = 0; 1951 - int nr_loop_iter; 1952 1912 1953 1913 if (chain) 1954 1914 chain_nr = chain->nr; ··· 1980 1942 if (branch && callchain_param.branch_callstack) { 1981 1943 int nr = min(max_stack, (int)branch->nr); 1982 1944 struct branch_entry be[nr]; 1945 + struct iterations iter[nr]; 1983 1946 1984 1947 if (branch->nr > PERF_MAX_BRANCH_DEPTH) { 1985 1948 pr_warning("corrupted branch chain. 
skipping...\n"); ··· 2011 1972 be[i] = branch->entries[branch->nr - i - 1]; 2012 1973 } 2013 1974 2014 - nr_loop_iter = nr; 2015 - nr = remove_loops(be, nr); 2016 - 2017 - /* 2018 - * Get the number of iterations. 2019 - * It's only approximation, but good enough in practice. 2020 - */ 2021 - if (nr_loop_iter > nr) 2022 - nr_loop_iter = nr_loop_iter - nr + 1; 2023 - else 2024 - nr_loop_iter = 0; 1975 + memset(iter, 0, sizeof(struct iterations) * nr); 1976 + nr = remove_loops(be, nr, iter); 2025 1977 2026 1978 for (i = 0; i < nr; i++) { 2027 - if (i == nr - 1) 2028 - err = add_callchain_ip(thread, cursor, parent, 2029 - root_al, 2030 - NULL, be[i].to, 2031 - true, &be[i].flags, 2032 - nr_loop_iter, 1, 2033 - be[i].from); 2034 - else 2035 - err = add_callchain_ip(thread, cursor, parent, 2036 - root_al, 2037 - NULL, be[i].to, 2038 - true, &be[i].flags, 2039 - 0, 0, be[i].from); 1979 + err = add_callchain_ip(thread, cursor, parent, 1980 + root_al, 1981 + NULL, be[i].to, 1982 + true, &be[i].flags, 1983 + NULL, be[i].from); 2040 1984 2041 1985 if (!err) 2042 1986 err = add_callchain_ip(thread, cursor, parent, root_al, 2043 1987 NULL, be[i].from, 2044 1988 true, &be[i].flags, 2045 - 0, 0, 0); 1989 + &iter[i], 0); 2046 1990 if (err == -EINVAL) 2047 1991 break; 2048 1992 if (err) ··· 2059 2037 2060 2038 err = add_callchain_ip(thread, cursor, parent, 2061 2039 root_al, &cpumode, ip, 2062 - false, NULL, 0, 0, 0); 2040 + false, NULL, NULL, 0); 2063 2041 2064 2042 if (err) 2065 2043 return (err < 0) ? err : 0;
+16 -8
tools/perf/util/parse-events.c
··· 310 310 __add_event(struct list_head *list, int *idx, 311 311 struct perf_event_attr *attr, 312 312 char *name, struct cpu_map *cpus, 313 - struct list_head *config_terms) 313 + struct list_head *config_terms, bool auto_merge_stats) 314 314 { 315 315 struct perf_evsel *evsel; 316 316 ··· 324 324 evsel->cpus = cpu_map__get(cpus); 325 325 evsel->own_cpus = cpu_map__get(cpus); 326 326 evsel->system_wide = !!cpus; 327 + evsel->auto_merge_stats = auto_merge_stats; 327 328 328 329 if (name) 329 330 evsel->name = strdup(name); ··· 340 339 struct perf_event_attr *attr, char *name, 341 340 struct list_head *config_terms) 342 341 { 343 - return __add_event(list, idx, attr, name, NULL, config_terms) ? 0 : -ENOMEM; 342 + return __add_event(list, idx, attr, name, NULL, config_terms, false) ? 0 : -ENOMEM; 344 343 } 345 344 346 345 static int parse_aliases(char *str, const char *names[][PERF_EVSEL__MAX_ALIASES], int size) ··· 1210 1209 get_config_name(head_config), &config_terms); 1211 1210 } 1212 1211 1213 - int parse_events_add_pmu(struct parse_events_state *parse_state, 1212 + static int __parse_events_add_pmu(struct parse_events_state *parse_state, 1214 1213 struct list_head *list, char *name, 1215 - struct list_head *head_config) 1214 + struct list_head *head_config, bool auto_merge_stats) 1216 1215 { 1217 1216 struct perf_event_attr attr; 1218 1217 struct perf_pmu_info info; ··· 1233 1232 1234 1233 if (!head_config) { 1235 1234 attr.type = pmu->type; 1236 - evsel = __add_event(list, &parse_state->idx, &attr, NULL, pmu->cpus, NULL); 1235 + evsel = __add_event(list, &parse_state->idx, &attr, NULL, pmu->cpus, NULL, auto_merge_stats); 1237 1236 return evsel ? 
0 : -ENOMEM; 1238 1237 } 1239 1238 ··· 1255 1254 1256 1255 evsel = __add_event(list, &parse_state->idx, &attr, 1257 1256 get_config_name(head_config), pmu->cpus, 1258 - &config_terms); 1257 + &config_terms, auto_merge_stats); 1259 1258 if (evsel) { 1260 1259 evsel->unit = info.unit; 1261 1260 evsel->scale = info.scale; ··· 1266 1265 } 1267 1266 1268 1267 return evsel ? 0 : -ENOMEM; 1268 + } 1269 + 1270 + int parse_events_add_pmu(struct parse_events_state *parse_state, 1271 + struct list_head *list, char *name, 1272 + struct list_head *head_config) 1273 + { 1274 + return __parse_events_add_pmu(parse_state, list, name, head_config, false); 1269 1275 } 1270 1276 1271 1277 int parse_events_multi_pmu_add(struct parse_events_state *parse_state, ··· 1304 1296 return -1; 1305 1297 list_add_tail(&term->list, head); 1306 1298 1307 - if (!parse_events_add_pmu(parse_state, list, 1308 - pmu->name, head)) { 1299 + if (!__parse_events_add_pmu(parse_state, list, 1300 + pmu->name, head, true)) { 1309 1301 pr_debug("%s -> %s/%s/\n", str, 1310 1302 pmu->name, alias->str); 1311 1303 ok++;
+3
tools/perf/util/session.c
··· 1120 1120 if (sample_type & PERF_SAMPLE_DATA_SRC) 1121 1121 printf(" . data_src: 0x%"PRIx64"\n", sample->data_src); 1122 1122 1123 + if (sample_type & PERF_SAMPLE_PHYS_ADDR) 1124 + printf(" .. phys_addr: 0x%"PRIx64"\n", sample->phys_addr); 1125 + 1123 1126 if (sample_type & PERF_SAMPLE_TRANSACTION) 1124 1127 printf("... transaction: %" PRIx64 "\n", sample->transaction); 1125 1128
+42
tools/perf/util/sort.c
··· 1316 1316 }; 1317 1317 1318 1318 static int64_t 1319 + sort__phys_daddr_cmp(struct hist_entry *left, struct hist_entry *right) 1320 + { 1321 + uint64_t l = 0, r = 0; 1322 + 1323 + if (left->mem_info) 1324 + l = left->mem_info->daddr.phys_addr; 1325 + if (right->mem_info) 1326 + r = right->mem_info->daddr.phys_addr; 1327 + 1328 + return (int64_t)(r - l); 1329 + } 1330 + 1331 + static int hist_entry__phys_daddr_snprintf(struct hist_entry *he, char *bf, 1332 + size_t size, unsigned int width) 1333 + { 1334 + uint64_t addr = 0; 1335 + size_t ret = 0; 1336 + size_t len = BITS_PER_LONG / 4; 1337 + 1338 + addr = he->mem_info->daddr.phys_addr; 1339 + 1340 + ret += repsep_snprintf(bf + ret, size - ret, "[%c] ", he->level); 1341 + 1342 + ret += repsep_snprintf(bf + ret, size - ret, "%-#.*llx", len, addr); 1343 + 1344 + ret += repsep_snprintf(bf + ret, size - ret, "%-*s", width - ret, ""); 1345 + 1346 + if (ret > width) 1347 + bf[width] = '\0'; 1348 + 1349 + return width; 1350 + } 1351 + 1352 + struct sort_entry sort_mem_phys_daddr = { 1353 + .se_header = "Data Physical Address", 1354 + .se_cmp = sort__phys_daddr_cmp, 1355 + .se_snprintf = hist_entry__phys_daddr_snprintf, 1356 + .se_width_idx = HISTC_MEM_PHYS_DADDR, 1357 + }; 1358 + 1359 + static int64_t 1319 1360 sort__abort_cmp(struct hist_entry *left, struct hist_entry *right) 1320 1361 { 1321 1362 if (!left->branch_info || !right->branch_info) ··· 1588 1547 DIM(SORT_MEM_LVL, "mem", sort_mem_lvl), 1589 1548 DIM(SORT_MEM_SNOOP, "snoop", sort_mem_snoop), 1590 1549 DIM(SORT_MEM_DCACHELINE, "dcacheline", sort_mem_dcacheline), 1550 + DIM(SORT_MEM_PHYS_DADDR, "phys_daddr", sort_mem_phys_daddr), 1591 1551 }; 1592 1552 1593 1553 #undef DIM
+1
tools/perf/util/sort.h
··· 245 245 SORT_MEM_SNOOP, 246 246 SORT_MEM_DCACHELINE, 247 247 SORT_MEM_IADDR_SYMBOL, 248 + SORT_MEM_PHYS_DADDR, 248 249 }; 249 250 250 251 /*
+1
tools/perf/util/symbol.h
··· 186 186 struct symbol *sym; 187 187 u64 addr; 188 188 u64 al_addr; 189 + u64 phys_addr; 189 190 }; 190 191 191 192 struct branch_info {
+33
tools/perf/util/syscalltbl.c
··· 19 19 #ifdef HAVE_SYSCALL_TABLE 20 20 #include <linux/compiler.h> 21 21 #include <string.h> 22 + #include "string2.h" 22 23 #include "util.h" 23 24 24 25 #if defined(__x86_64__) ··· 106 105 return sc ? sc->id : -1; 107 106 } 108 107 108 + int syscalltbl__strglobmatch_next(struct syscalltbl *tbl, const char *syscall_glob, int *idx) 109 + { 110 + int i; 111 + struct syscall *syscalls = tbl->syscalls.entries; 112 + 113 + for (i = *idx + 1; i < tbl->syscalls.nr_entries; ++i) { 114 + if (strglobmatch(syscalls[i].name, syscall_glob)) { 115 + *idx = i; 116 + return syscalls[i].id; 117 + } 118 + } 119 + 120 + return -1; 121 + } 122 + 123 + int syscalltbl__strglobmatch_first(struct syscalltbl *tbl, const char *syscall_glob, int *idx) 124 + { 125 + *idx = -1; 126 + return syscalltbl__strglobmatch_next(tbl, syscall_glob, idx); 127 + } 128 + 109 129 #else /* HAVE_SYSCALL_TABLE */ 110 130 111 131 #include <libaudit.h> ··· 152 130 int syscalltbl__id(struct syscalltbl *tbl, const char *name) 153 131 { 154 132 return audit_name_to_syscall(name, tbl->audit_machine); 133 + } 134 + 135 + int syscalltbl__strglobmatch_next(struct syscalltbl *tbl __maybe_unused, 136 + const char *syscall_glob __maybe_unused, int *idx __maybe_unused) 137 + { 138 + return -1; 139 + } 140 + 141 + int syscalltbl__strglobmatch_first(struct syscalltbl *tbl, const char *syscall_glob, int *idx) 142 + { 143 + return syscalltbl__strglobmatch_next(tbl, syscall_glob, idx); 155 144 } 156 145 #endif /* HAVE_SYSCALL_TABLE */
+3
tools/perf/util/syscalltbl.h
··· 17 17 const char *syscalltbl__name(const struct syscalltbl *tbl, int id); 18 18 int syscalltbl__id(struct syscalltbl *tbl, const char *name); 19 19 20 + int syscalltbl__strglobmatch_first(struct syscalltbl *tbl, const char *syscall_glob, int *idx); 21 + int syscalltbl__strglobmatch_next(struct syscalltbl *tbl, const char *syscall_glob, int *idx); 22 + 20 23 #endif /* __PERF_SYSCALLTBL_H */