Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

perf stat: Support per-cluster aggregation

Some platforms have 'cluster' topology and CPUs in the cluster will
share resources like L3 Cache Tag (for HiSilicon Kunpeng SoC) or L2
cache (for Intel Jacobsville). Currently parsing and building cluster
topology have been supported since [1].

perf stat has already supported aggregation for other topologies like
die or socket, etc. It'll be useful to aggregate per-cluster to find
problems like L3T bandwidth contention.

This patch adds support for the "--per-cluster" option for per-cluster
aggregation. It also updates the docs and the related tests. The output
will be like:

[root@localhost tmp]# perf stat -a -e LLC-load --per-cluster -- sleep 5

Performance counter stats for 'system wide':

S56-D0-CLS158 4 1,321,521,570 LLC-load
S56-D0-CLS594 4 794,211,453 LLC-load
S56-D0-CLS1030 4 41,623 LLC-load
S56-D0-CLS1466 4 41,646 LLC-load
S56-D0-CLS1902 4 16,863 LLC-load
S56-D0-CLS2338 4 15,721 LLC-load
S56-D0-CLS2774 4 22,671 LLC-load
[...]

On a legacy system without clusters or without cluster support, the
output will look like:
[root@localhost perf]# perf stat -a -e cycles --per-cluster -- sleep 1

Performance counter stats for 'system wide':

S56-D0-CLS0 64 18,011,485 cycles
S7182-D0-CLS0 64 16,548,835 cycles

Note that this patch doesn't mix the cluster information in the outputs
of --per-core to avoid breaking any tools/scripts using it.

Note that perf recently supports "--per-cache" aggregation, but it's not
the same as per-cluster aggregation, although cluster CPUs may share some
cache resources. For example, on my machine all clusters within a die
share the same L3 cache:
$ cat /sys/devices/system/cpu/cpu0/cache/index3/shared_cpu_list
0-31
$ cat /sys/devices/system/cpu/cpu0/topology/cluster_cpus_list
0-3

[1] commit c5e22feffdd7 ("topology: Represent clusters of CPUs within a die")

Tested-by: Jie Zhan <zhanjie9@hisilicon.com>
Reviewed-by: Tim Chen <tim.c.chen@linux.intel.com>
Reviewed-by: Ian Rogers <irogers@google.com>
Signed-off-by: Yicong Yang <yangyicong@hisilicon.com>
Cc: james.clark@arm.com
Cc: 21cnbao@gmail.com
Cc: prime.zeng@hisilicon.com
Cc: Jonathan.Cameron@huawei.com
Cc: fanghao11@huawei.com
Cc: linuxarm@huawei.com
Cc: tim.c.chen@intel.com
Cc: linux-arm-kernel@lists.infradead.org
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
Link: https://lore.kernel.org/r/20240208024026.2691-1-yangyicong@huawei.com

authored by

Yicong Yang and committed by
Namhyung Kim
cbc917a1 9a440bb2

+154 -9
+11
tools/perf/Documentation/perf-stat.txt
··· 308 308 die number and the number of online processors on that die. This is 309 309 useful to gauge the amount of aggregation. 310 310 311 + --per-cluster:: 312 + Aggregate counts per processor cluster for system-wide mode measurement. This 313 + is a useful mode to detect imbalance between clusters. To enable this mode, 314 + use --per-cluster in addition to -a. (system-wide). The output includes the 315 + cluster number and the number of online processors on that cluster. This is 316 + useful to gauge the amount of aggregation. The information of cluster ID and 317 + related CPUs can be gotten from /sys/devices/system/cpu/cpuX/topology/cluster_{id, cpus}. 318 + 311 319 --per-cache:: 312 320 Aggregate counts per cache instance for system-wide mode measurements. By 313 321 default, the aggregation happens for the cache level at the highest index ··· 403 395 404 396 --per-die:: 405 397 Aggregate counts per processor die for system-wide mode measurements. 398 + 399 + --per-cluster:: 400 + Aggregate counts perf processor cluster for system-wide mode measurements. 406 401 407 402 --per-cache:: 408 403 Aggregate counts per cache instance for system-wide mode measurements. By
+49 -3
tools/perf/builtin-stat.c
··· 1238 1238 "aggregate counts per processor socket", AGGR_SOCKET), 1239 1239 OPT_SET_UINT(0, "per-die", &stat_config.aggr_mode, 1240 1240 "aggregate counts per processor die", AGGR_DIE), 1241 + OPT_SET_UINT(0, "per-cluster", &stat_config.aggr_mode, 1242 + "aggregate counts per processor cluster", AGGR_CLUSTER), 1241 1243 OPT_CALLBACK_OPTARG(0, "per-cache", &stat_config.aggr_mode, &stat_config.aggr_level, 1242 1244 "cache level", "aggregate count at this cache level (Default: LLC)", 1243 1245 parse_cache_level), ··· 1430 1428 static const char *const aggr_mode__string[] = { 1431 1429 [AGGR_CORE] = "core", 1432 1430 [AGGR_CACHE] = "cache", 1431 + [AGGR_CLUSTER] = "cluster", 1433 1432 [AGGR_DIE] = "die", 1434 1433 [AGGR_GLOBAL] = "global", 1435 1434 [AGGR_NODE] = "node", ··· 1456 1453 struct perf_cpu cpu) 1457 1454 { 1458 1455 return aggr_cpu_id__cache(cpu, /*data=*/NULL); 1456 + } 1457 + 1458 + static struct aggr_cpu_id perf_stat__get_cluster(struct perf_stat_config *config __maybe_unused, 1459 + struct perf_cpu cpu) 1460 + { 1461 + return aggr_cpu_id__cluster(cpu, /*data=*/NULL); 1459 1462 } 1460 1463 1461 1464 static struct aggr_cpu_id perf_stat__get_core(struct perf_stat_config *config __maybe_unused, ··· 1516 1507 return perf_stat__get_aggr(config, perf_stat__get_die, cpu); 1517 1508 } 1518 1509 1510 + static struct aggr_cpu_id perf_stat__get_cluster_cached(struct perf_stat_config *config, 1511 + struct perf_cpu cpu) 1512 + { 1513 + return perf_stat__get_aggr(config, perf_stat__get_cluster, cpu); 1514 + } 1515 + 1519 1516 static struct aggr_cpu_id perf_stat__get_cache_id_cached(struct perf_stat_config *config, 1520 1517 struct perf_cpu cpu) 1521 1518 { ··· 1559 1544 return aggr_cpu_id__socket; 1560 1545 case AGGR_DIE: 1561 1546 return aggr_cpu_id__die; 1547 + case AGGR_CLUSTER: 1548 + return aggr_cpu_id__cluster; 1562 1549 case AGGR_CACHE: 1563 1550 return aggr_cpu_id__cache; 1564 1551 case AGGR_CORE: ··· 1586 1569 return perf_stat__get_socket_cached; 1587 1570 
case AGGR_DIE: 1588 1571 return perf_stat__get_die_cached; 1572 + case AGGR_CLUSTER: 1573 + return perf_stat__get_cluster_cached; 1589 1574 case AGGR_CACHE: 1590 1575 return perf_stat__get_cache_id_cached; 1591 1576 case AGGR_CORE: ··· 1756 1737 return id; 1757 1738 } 1758 1739 1740 + static struct aggr_cpu_id perf_env__get_cluster_aggr_by_cpu(struct perf_cpu cpu, 1741 + void *data) 1742 + { 1743 + struct perf_env *env = data; 1744 + struct aggr_cpu_id id = aggr_cpu_id__empty(); 1745 + 1746 + if (cpu.cpu != -1) { 1747 + id.socket = env->cpu[cpu.cpu].socket_id; 1748 + id.die = env->cpu[cpu.cpu].die_id; 1749 + id.cluster = env->cpu[cpu.cpu].cluster_id; 1750 + } 1751 + 1752 + return id; 1753 + } 1754 + 1759 1755 static struct aggr_cpu_id perf_env__get_core_aggr_by_cpu(struct perf_cpu cpu, void *data) 1760 1756 { 1761 1757 struct perf_env *env = data; ··· 1778 1744 1779 1745 if (cpu.cpu != -1) { 1780 1746 /* 1781 - * core_id is relative to socket and die, 1782 - * we need a global id. So we set 1783 - * socket, die id and core id 1747 + * core_id is relative to socket, die and cluster, we need a 1748 + * global id. So we set socket, die id, cluster id and core id. 
1784 1749 */ 1785 1750 id.socket = env->cpu[cpu.cpu].socket_id; 1786 1751 id.die = env->cpu[cpu.cpu].die_id; 1752 + id.cluster = env->cpu[cpu.cpu].cluster_id; 1787 1753 id.core = env->cpu[cpu.cpu].core_id; 1788 1754 } 1789 1755 ··· 1839 1805 return perf_env__get_die_aggr_by_cpu(cpu, &perf_stat.session->header.env); 1840 1806 } 1841 1807 1808 + static struct aggr_cpu_id perf_stat__get_cluster_file(struct perf_stat_config *config __maybe_unused, 1809 + struct perf_cpu cpu) 1810 + { 1811 + return perf_env__get_cluster_aggr_by_cpu(cpu, &perf_stat.session->header.env); 1812 + } 1813 + 1842 1814 static struct aggr_cpu_id perf_stat__get_cache_file(struct perf_stat_config *config __maybe_unused, 1843 1815 struct perf_cpu cpu) 1844 1816 { ··· 1882 1842 return perf_env__get_socket_aggr_by_cpu; 1883 1843 case AGGR_DIE: 1884 1844 return perf_env__get_die_aggr_by_cpu; 1845 + case AGGR_CLUSTER: 1846 + return perf_env__get_cluster_aggr_by_cpu; 1885 1847 case AGGR_CACHE: 1886 1848 return perf_env__get_cache_aggr_by_cpu; 1887 1849 case AGGR_CORE: ··· 1909 1867 return perf_stat__get_socket_file; 1910 1868 case AGGR_DIE: 1911 1869 return perf_stat__get_die_file; 1870 + case AGGR_CLUSTER: 1871 + return perf_stat__get_cluster_file; 1912 1872 case AGGR_CACHE: 1913 1873 return perf_stat__get_cache_file; 1914 1874 case AGGR_CORE: ··· 2442 2398 "aggregate counts per processor socket", AGGR_SOCKET), 2443 2399 OPT_SET_UINT(0, "per-die", &perf_stat.aggr_mode, 2444 2400 "aggregate counts per processor die", AGGR_DIE), 2401 + OPT_SET_UINT(0, "per-cluster", &perf_stat.aggr_mode, 2402 + "aggregate counts perf processor cluster", AGGR_CLUSTER), 2445 2403 OPT_CALLBACK_OPTARG(0, "per-cache", &perf_stat.aggr_mode, &perf_stat.aggr_level, 2446 2404 "cache level", 2447 2405 "aggregate count at this cache level (Default: LLC)",
+3 -1
tools/perf/tests/shell/lib/perf_json_output_lint.py
··· 15 15 ap.add_argument('--per-core', action='store_true') 16 16 ap.add_argument('--per-thread', action='store_true') 17 17 ap.add_argument('--per-cache', action='store_true') 18 + ap.add_argument('--per-cluster', action='store_true') 18 19 ap.add_argument('--per-die', action='store_true') 19 20 ap.add_argument('--per-node', action='store_true') 20 21 ap.add_argument('--per-socket', action='store_true') ··· 50 49 'cgroup': lambda x: True, 51 50 'cpu': lambda x: isint(x), 52 51 'cache': lambda x: True, 52 + 'cluster': lambda x: True, 53 53 'die': lambda x: True, 54 54 'event': lambda x: True, 55 55 'event-runtime': lambda x: isfloat(x), ··· 90 88 expected_items = 7 91 89 elif args.interval or args.per_thread or args.system_wide_no_aggr: 92 90 expected_items = 8 93 - elif args.per_core or args.per_socket or args.per_node or args.per_die or args.per_cache: 91 + elif args.per_core or args.per_socket or args.per_node or args.per_die or args.per_cluster or args.per_cache: 94 92 expected_items = 9 95 93 else: 96 94 # If no option is specified, don't check the number of items.
+12
tools/perf/tests/shell/lib/stat_output.sh
··· 97 97 echo "[Success]" 98 98 } 99 99 100 + check_per_cluster() 101 + { 102 + echo -n "Checking $1 output: per cluster " 103 + if ParanoidAndNotRoot 0 104 + then 105 + echo "[Skip] paranoid and not root" 106 + return 107 + fi 108 + perf stat --per-cluster -a $2 true 109 + echo "[Success]" 110 + } 111 + 100 112 check_per_die() 101 113 { 102 114 echo -n "Checking $1 output: per die "
+2
tools/perf/tests/shell/stat+csv_output.sh
··· 42 42 ;; "--per-socket") exp=8 43 43 ;; "--per-node") exp=8 44 44 ;; "--per-die") exp=8 45 + ;; "--per-cluster") exp=8 45 46 ;; "--per-cache") exp=8 46 47 esac 47 48 ··· 80 79 check_system_wide_no_aggr "CSV" "$perf_cmd" 81 80 check_per_core "CSV" "$perf_cmd" 82 81 check_per_cache_instance "CSV" "$perf_cmd" 82 + check_per_cluster "CSV" "$perf_cmd" 83 83 check_per_die "CSV" "$perf_cmd" 84 84 check_per_socket "CSV" "$perf_cmd" 85 85 else
+13
tools/perf/tests/shell/stat+json_output.sh
··· 122 122 echo "[Success]" 123 123 } 124 124 125 + check_per_cluster() 126 + { 127 + echo -n "Checking json output: per cluster " 128 + if ParanoidAndNotRoot 0 129 + then 130 + echo "[Skip] paranoia and not root" 131 + return 132 + fi 133 + perf stat -j --per-cluster -a true 2>&1 | $PYTHON $pythonchecker --per-cluster 134 + echo "[Success]" 135 + } 136 + 125 137 check_per_die() 126 138 { 127 139 echo -n "Checking json output: per die " ··· 212 200 check_system_wide_no_aggr 213 201 check_per_core 214 202 check_per_cache_instance 203 + check_per_cluster 215 204 check_per_die 216 205 check_per_socket 217 206 else
+2
tools/perf/tests/shell/stat+std_output.sh
··· 40 40 ;; "--per-node") prefix=3 41 41 ;; "--per-die") prefix=3 42 42 ;; "--per-cache") prefix=3 43 + ;; "--per-cluster") prefix=3 43 44 esac 44 45 45 46 while read line ··· 100 99 check_system_wide_no_aggr "STD" "$perf_cmd" 101 100 check_per_core "STD" "$perf_cmd" 102 101 check_per_cache_instance "STD" "$perf_cmd" 102 + check_per_cluster "STD" "$perf_cmd" 103 103 check_per_die "STD" "$perf_cmd" 104 104 check_per_socket "STD" "$perf_cmd" 105 105 else
+31 -2
tools/perf/util/cpumap.c
··· 222 222 return a->socket - b->socket; 223 223 else if (a->die != b->die) 224 224 return a->die - b->die; 225 + else if (a->cluster != b->cluster) 226 + return a->cluster - b->cluster; 225 227 else if (a->cache_lvl != b->cache_lvl) 226 228 return a->cache_lvl - b->cache_lvl; 227 229 else if (a->cache != b->cache) ··· 311 309 return id; 312 310 } 313 311 312 + int cpu__get_cluster_id(struct perf_cpu cpu) 313 + { 314 + int value, ret = cpu__get_topology_int(cpu.cpu, "cluster_id", &value); 315 + 316 + return ret ?: value; 317 + } 318 + 319 + struct aggr_cpu_id aggr_cpu_id__cluster(struct perf_cpu cpu, void *data) 320 + { 321 + int cluster = cpu__get_cluster_id(cpu); 322 + struct aggr_cpu_id id; 323 + 324 + /* There is no cluster_id on legacy system. */ 325 + if (cluster == -1) 326 + cluster = 0; 327 + 328 + id = aggr_cpu_id__die(cpu, data); 329 + if (aggr_cpu_id__is_empty(&id)) 330 + return id; 331 + 332 + id.cluster = cluster; 333 + return id; 334 + } 335 + 314 336 int cpu__get_core_id(struct perf_cpu cpu) 315 337 { 316 338 int value, ret = cpu__get_topology_int(cpu.cpu, "core_id", &value); ··· 346 320 struct aggr_cpu_id id; 347 321 int core = cpu__get_core_id(cpu); 348 322 349 - /* aggr_cpu_id__die returns a struct with socket and die set. */ 350 - id = aggr_cpu_id__die(cpu, data); 323 + /* aggr_cpu_id__die returns a struct with socket die, and cluster set. 
*/ 324 + id = aggr_cpu_id__cluster(cpu, data); 351 325 if (aggr_cpu_id__is_empty(&id)) 352 326 return id; 353 327 ··· 709 683 a->node == b->node && 710 684 a->socket == b->socket && 711 685 a->die == b->die && 686 + a->cluster == b->cluster && 712 687 a->cache_lvl == b->cache_lvl && 713 688 a->cache == b->cache && 714 689 a->core == b->core && ··· 722 695 a->node == -1 && 723 696 a->socket == -1 && 724 697 a->die == -1 && 698 + a->cluster == -1 && 725 699 a->cache_lvl == -1 && 726 700 a->cache == -1 && 727 701 a->core == -1 && ··· 736 708 .node = -1, 737 709 .socket = -1, 738 710 .die = -1, 711 + .cluster = -1, 739 712 .cache_lvl = -1, 740 713 .cache = -1, 741 714 .core = -1,
+16 -3
tools/perf/util/cpumap.h
··· 20 20 int socket; 21 21 /** The die id as read from /sys/devices/system/cpu/cpuX/topology/die_id. */ 22 22 int die; 23 + /** The cluster id as read from /sys/devices/system/cpu/cpuX/topology/cluster_id */ 24 + int cluster; 23 25 /** The cache level as read from /sys/devices/system/cpu/cpuX/cache/indexY/level */ 24 26 int cache_lvl; 25 27 /** ··· 89 87 */ 90 88 int cpu__get_die_id(struct perf_cpu cpu); 91 89 /** 90 + * cpu__get_cluster_id - Returns the cluster id as read from 91 + * /sys/devices/system/cpu/cpuX/topology/cluster_id for the given CPU 92 + */ 93 + int cpu__get_cluster_id(struct perf_cpu cpu); 94 + /** 92 95 * cpu__get_core_id - Returns the core id as read from 93 96 * /sys/devices/system/cpu/cpuX/topology/core_id for the given CPU. 94 97 */ ··· 134 127 */ 135 128 struct aggr_cpu_id aggr_cpu_id__die(struct perf_cpu cpu, void *data); 136 129 /** 137 - * aggr_cpu_id__core - Create an aggr_cpu_id with the core, die and socket 138 - * populated with the core, die and socket for cpu. The function signature is 139 - * compatible with aggr_cpu_id_get_t. 130 + * aggr_cpu_id__cluster - Create an aggr_cpu_id with cluster, die and socket 131 + * populated with the cluster, die and socket for cpu. The function signature 132 + * is compatible with aggr_cpu_id_get_t. 133 + */ 134 + struct aggr_cpu_id aggr_cpu_id__cluster(struct perf_cpu cpu, void *data); 135 + /** 136 + * aggr_cpu_id__core - Create an aggr_cpu_id with the core, cluster, die and 137 + * socket populated with the core, die and socket for cpu. The function 138 + * signature is compatible with aggr_cpu_id_get_t. 140 139 */ 141 140 struct aggr_cpu_id aggr_cpu_id__core(struct perf_cpu cpu, void *data); 142 141 /**
+1
tools/perf/util/env.h
··· 12 12 struct cpu_topology_map { 13 13 int socket_id; 14 14 int die_id; 15 + int cluster_id; 15 16 int core_id; 16 17 }; 17 18
+13
tools/perf/util/stat-display.c
··· 201 201 snprintf(buf, sizeof(buf), "S%d-D%d-L%d-ID%d", 202 202 id.socket, id.die, id.cache_lvl, id.cache); 203 203 break; 204 + case AGGR_CLUSTER: 205 + snprintf(buf, sizeof(buf), "S%d-D%d-CLS%d", id.socket, id.die, id.cluster); 206 + break; 204 207 case AGGR_DIE: 205 208 snprintf(buf, sizeof(buf), "S%d-D%d", id.socket, id.die); 206 209 break; ··· 254 251 fprintf(config->output, "S%d-D%d-L%d-ID%d%s%d%s", 255 252 id.socket, id.die, id.cache_lvl, id.cache, sep, aggr_nr, sep); 256 253 break; 254 + case AGGR_CLUSTER: 255 + fprintf(config->output, "S%d-D%d-CLS%d%s%d%s", 256 + id.socket, id.die, id.cluster, sep, aggr_nr, sep); 257 + break; 257 258 case AGGR_DIE: 258 259 fprintf(output, "S%d-D%d%s%d%s", 259 260 id.socket, id.die, sep, aggr_nr, sep); ··· 306 299 case AGGR_CACHE: 307 300 fprintf(output, "\"cache\" : \"S%d-D%d-L%d-ID%d\", \"aggregate-number\" : %d, ", 308 301 id.socket, id.die, id.cache_lvl, id.cache, aggr_nr); 302 + break; 303 + case AGGR_CLUSTER: 304 + fprintf(output, "\"cluster\" : \"S%d-D%d-CLS%d\", \"aggregate-number\" : %d, ", 305 + id.socket, id.die, id.cluster, aggr_nr); 309 306 break; 310 307 case AGGR_DIE: 311 308 fprintf(output, "\"die\" : \"S%d-D%d\", \"aggregate-number\" : %d, ", ··· 1259 1248 case AGGR_NODE: 1260 1249 case AGGR_SOCKET: 1261 1250 case AGGR_DIE: 1251 + case AGGR_CLUSTER: 1262 1252 case AGGR_CACHE: 1263 1253 case AGGR_CORE: 1264 1254 fprintf(output, "#%*s %-*s cpus", ··· 1562 1550 switch (config->aggr_mode) { 1563 1551 case AGGR_CORE: 1564 1552 case AGGR_CACHE: 1553 + case AGGR_CLUSTER: 1565 1554 case AGGR_DIE: 1566 1555 case AGGR_SOCKET: 1567 1556 case AGGR_NODE:
+1
tools/perf/util/stat.h
··· 48 48 AGGR_GLOBAL, 49 49 AGGR_SOCKET, 50 50 AGGR_DIE, 51 + AGGR_CLUSTER, 51 52 AGGR_CACHE, 52 53 AGGR_CORE, 53 54 AGGR_THREAD,