Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

perf stat: Fix wrong skipping for per-die aggregation

Uncore becomes die-scope on Xeon Cascade Lake-AP, and perf already
supports --per-die aggregation.

One issue was found in check_per_pkg() for uncore events running on an AP
system. On Cascade Lake-AP, we have:

S0-D0
S0-D1
S1-D0
S1-D1

But in check_per_pkg(), S0-D1 and S1-D1 are skipped because the mask
bits for S0 and S1 have already been set for S0-D0 and S1-D0. The function
doesn't check die_id, so the counts for S0-D1 and S1-D1 are set to zero.
That's not correct.

root@lkp-csl-2ap4 ~# ./perf stat -a -I 1000 -e llc_misses.mem_read --per-die -- sleep 5
1.001460963 S0-D0 1 1317376 Bytes llc_misses.mem_read
1.001460963 S0-D1 1 998016 Bytes llc_misses.mem_read
1.001460963 S1-D0 1 970496 Bytes llc_misses.mem_read
1.001460963 S1-D1 1 1291264 Bytes llc_misses.mem_read
2.003488021 S0-D0 1 1082048 Bytes llc_misses.mem_read
2.003488021 S0-D1 1 1919040 Bytes llc_misses.mem_read
2.003488021 S1-D0 1 890752 Bytes llc_misses.mem_read
2.003488021 S1-D1 1 2380800 Bytes llc_misses.mem_read
3.005613270 S0-D0 1 1126080 Bytes llc_misses.mem_read
3.005613270 S0-D1 1 2898176 Bytes llc_misses.mem_read
3.005613270 S1-D0 1 870912 Bytes llc_misses.mem_read
3.005613270 S1-D1 1 3388608 Bytes llc_misses.mem_read
4.007627598 S0-D0 1 1124608 Bytes llc_misses.mem_read
4.007627598 S0-D1 1 3884416 Bytes llc_misses.mem_read
4.007627598 S1-D0 1 921088 Bytes llc_misses.mem_read
4.007627598 S1-D1 1 4451840 Bytes llc_misses.mem_read
5.001479927 S0-D0 1 963328 Bytes llc_misses.mem_read
5.001479927 S0-D1 1 4831936 Bytes llc_misses.mem_read
5.001479927 S1-D0 1 895104 Bytes llc_misses.mem_read
5.001479927 S1-D1 1 5496640 Bytes llc_misses.mem_read

From the above output, we can see that S0-D1 and S1-D1 don't report the
interval values; they continue to grow. That's because check_per_pkg()
wrongly decides to use zero counts for S0-D1 and S1-D1.

So in check_per_pkg(), we should use hashmap(socket,die) to decide whether
the cpu counts need to be skipped. Considering only the socket is not enough.

Now with this patch,

root@lkp-csl-2ap4 ~# ./perf stat -a -I 1000 -e llc_misses.mem_read --per-die -- sleep 5
1.001586691 S0-D0 1 1229440 Bytes llc_misses.mem_read
1.001586691 S0-D1 1 976832 Bytes llc_misses.mem_read
1.001586691 S1-D0 1 938304 Bytes llc_misses.mem_read
1.001586691 S1-D1 1 1227328 Bytes llc_misses.mem_read
2.003776312 S0-D0 1 1586752 Bytes llc_misses.mem_read
2.003776312 S0-D1 1 875392 Bytes llc_misses.mem_read
2.003776312 S1-D0 1 855616 Bytes llc_misses.mem_read
2.003776312 S1-D1 1 949376 Bytes llc_misses.mem_read
3.006512788 S0-D0 1 1338880 Bytes llc_misses.mem_read
3.006512788 S0-D1 1 920064 Bytes llc_misses.mem_read
3.006512788 S1-D0 1 877184 Bytes llc_misses.mem_read
3.006512788 S1-D1 1 1020736 Bytes llc_misses.mem_read
4.008895291 S0-D0 1 926592 Bytes llc_misses.mem_read
4.008895291 S0-D1 1 906368 Bytes llc_misses.mem_read
4.008895291 S1-D0 1 892224 Bytes llc_misses.mem_read
4.008895291 S1-D1 1 987712 Bytes llc_misses.mem_read
5.001590993 S0-D0 1 962624 Bytes llc_misses.mem_read
5.001590993 S0-D1 1 912512 Bytes llc_misses.mem_read
5.001590993 S1-D0 1 891200 Bytes llc_misses.mem_read
5.001590993 S1-D1 1 978432 Bytes llc_misses.mem_read

On a no-die system, die_id is 0, so the key is effectively hashmap(socket,0)
and the original behavior is not changed.

Reported-by: Ying Huang <ying.huang@intel.com>
Signed-off-by: Jin Yao <yao.jin@linux.intel.com>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Jin Yao <yao.jin@intel.com>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ying Huang <ying.huang@intel.com>
Link: http://lore.kernel.org/lkml/20210128013417.25597-1-yao.jin@linux.intel.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

authored by

Jin Yao and committed by
Arnaldo Carvalho de Melo
034f7ee1 33dc525f

+59 -11
+17 -1
tools/perf/util/evsel.c
··· 46 46 #include "string2.h" 47 47 #include "memswap.h" 48 48 #include "util.h" 49 + #include "hashmap.h" 49 50 #include "../perf-sys.h" 50 51 #include "util/parse-branch-options.h" 51 52 #include <internal/xyarray.h> ··· 1391 1390 zfree(&evsel->group_name); 1392 1391 zfree(&evsel->name); 1393 1392 zfree(&evsel->pmu_name); 1394 - zfree(&evsel->per_pkg_mask); 1393 + evsel__zero_per_pkg(evsel); 1394 + hashmap__free(evsel->per_pkg_mask); 1395 + evsel->per_pkg_mask = NULL; 1395 1396 zfree(&evsel->metric_events); 1396 1397 perf_evsel__object.fini(evsel); 1397 1398 } ··· 2783 2780 return -ENOMEM; 2784 2781 2785 2782 return store_evsel_ids(evsel, evlist); 2783 + } 2784 + 2785 + void evsel__zero_per_pkg(struct evsel *evsel) 2786 + { 2787 + struct hashmap_entry *cur; 2788 + size_t bkt; 2789 + 2790 + if (evsel->per_pkg_mask) { 2791 + hashmap__for_each_entry(evsel->per_pkg_mask, cur, bkt) 2792 + free((char *)cur->key); 2793 + 2794 + hashmap__clear(evsel->per_pkg_mask); 2795 + } 2786 2796 }
+3 -1
tools/perf/util/evsel.h
··· 19 19 union perf_event; 20 20 struct bpf_counter_ops; 21 21 struct target; 22 + struct hashmap; 22 23 23 24 typedef int (evsel__sb_cb_t)(union perf_event *event, void *data); 24 25 ··· 113 112 bool merged_stat; 114 113 bool reset_group; 115 114 bool errored; 116 - unsigned long *per_pkg_mask; 115 + struct hashmap *per_pkg_mask; 117 116 struct evsel *leader; 118 117 struct list_head config_terms; 119 118 int err; ··· 434 433 435 434 int evsel__store_ids(struct evsel *evsel, struct evlist *evlist); 436 435 436 + void evsel__zero_per_pkg(struct evsel *evsel); 437 437 #endif /* __PERF_EVSEL_H */
+1
tools/perf/util/python-ext-sources
··· 36 36 util/units.c 37 37 util/affinity.c 38 38 util/rwsem.c 39 + util/hashmap.c
+38 -9
tools/perf/util/stat.c
··· 13 13 #include "evlist.h" 14 14 #include "evsel.h" 15 15 #include "thread_map.h" 16 + #include "hashmap.h" 16 17 #include <linux/zalloc.h> 17 18 18 19 void update_stats(struct stats *stats, u64 val) ··· 278 277 } 279 278 } 280 279 281 - static void zero_per_pkg(struct evsel *counter) 280 + static size_t pkg_id_hash(const void *__key, void *ctx __maybe_unused) 282 281 { 283 - if (counter->per_pkg_mask) 284 - memset(counter->per_pkg_mask, 0, cpu__max_cpu()); 282 + uint64_t *key = (uint64_t *) __key; 283 + 284 + return *key & 0xffffffff; 285 + } 286 + 287 + static bool pkg_id_equal(const void *__key1, const void *__key2, 288 + void *ctx __maybe_unused) 289 + { 290 + uint64_t *key1 = (uint64_t *) __key1; 291 + uint64_t *key2 = (uint64_t *) __key2; 292 + 293 + return *key1 == *key2; 285 294 } 286 295 287 296 static int check_per_pkg(struct evsel *counter, 288 297 struct perf_counts_values *vals, int cpu, bool *skip) 289 298 { 290 - unsigned long *mask = counter->per_pkg_mask; 299 + struct hashmap *mask = counter->per_pkg_mask; 291 300 struct perf_cpu_map *cpus = evsel__cpus(counter); 292 - int s; 301 + int s, d, ret = 0; 302 + uint64_t *key; 293 303 294 304 *skip = false; 295 305 ··· 311 299 return 0; 312 300 313 301 if (!mask) { 314 - mask = zalloc(cpu__max_cpu()); 302 + mask = hashmap__new(pkg_id_hash, pkg_id_equal, NULL); 315 303 if (!mask) 316 304 return -ENOMEM; 317 305 ··· 333 321 if (s < 0) 334 322 return -1; 335 323 336 - *skip = test_and_set_bit(s, mask) == 1; 337 - return 0; 324 + /* 325 + * On multi-die system, die_id > 0. On no-die system, die_id = 0. 326 + * We use hashmap(socket, die) to check the used socket+die pair. 
327 + */ 328 + d = cpu_map__get_die(cpus, cpu, NULL).die; 329 + if (d < 0) 330 + return -1; 331 + 332 + key = malloc(sizeof(*key)); 333 + if (!key) 334 + return -ENOMEM; 335 + 336 + *key = (uint64_t)d << 32 | s; 337 + if (hashmap__find(mask, (void *)key, NULL)) 338 + *skip = true; 339 + else 340 + ret = hashmap__add(mask, (void *)key, (void *)1); 341 + 342 + return ret; 338 343 } 339 344 340 345 static int ··· 451 422 } 452 423 453 424 if (counter->per_pkg) 454 - zero_per_pkg(counter); 425 + evsel__zero_per_pkg(counter); 455 426 456 427 ret = process_counter_maps(config, counter); 457 428 if (ret)