Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

perf stat: Support per-cluster aggregation

Some platforms have 'cluster' topology and CPUs in the cluster will
share resources like L3 Cache Tag (for HiSilicon Kunpeng SoC) or L2
cache (for Intel Jacobsville). Currently parsing and building cluster
topology have been supported since [1].

perf stat has already supported aggregation for other topologies like
die or socket, etc. It'll be useful to aggregate per-cluster to find
problems like L3T bandwidth contention.

This patch adds support for the "--per-cluster" option for per-cluster
aggregation. It also updates the docs and the related tests. The output
will be like:

[root@localhost tmp]# perf stat -a -e LLC-load --per-cluster -- sleep 5

Performance counter stats for 'system wide':

S56-D0-CLS158 4 1,321,521,570 LLC-load
S56-D0-CLS594 4 794,211,453 LLC-load
S56-D0-CLS1030 4 41,623 LLC-load
S56-D0-CLS1466 4 41,646 LLC-load
S56-D0-CLS1902 4 16,863 LLC-load
S56-D0-CLS2338 4 15,721 LLC-load
S56-D0-CLS2774 4 22,671 LLC-load
[...]

On a legacy system without clusters or without cluster support, the
output will look like:
[root@localhost perf]# perf stat -a -e cycles --per-cluster -- sleep 1

Performance counter stats for 'system wide':

S56-D0-CLS0 64 18,011,485 cycles
S7182-D0-CLS0 64 16,548,835 cycles

Note that this patch doesn't mix the cluster information in the outputs
of --per-core to avoid breaking any tools/scripts using it.

Note that perf recently supports "--per-cache" aggregation, but it's not
the same as per-cluster aggregation, although cluster CPUs may share some
cache resources. For example, on my machine all clusters within a die
share the same L3 cache:
$ cat /sys/devices/system/cpu/cpu0/cache/index3/shared_cpu_list
0-31
$ cat /sys/devices/system/cpu/cpu0/topology/cluster_cpus_list
0-3

[1] commit c5e22feffdd7 ("topology: Represent clusters of CPUs within a die")

Tested-by: Jie Zhan <zhanjie9@hisilicon.com>
Reviewed-by: Tim Chen <tim.c.chen@linux.intel.com>
Reviewed-by: Ian Rogers <irogers@google.com>
Signed-off-by: Yicong Yang <yangyicong@hisilicon.com>
Cc: james.clark@arm.com
Cc: 21cnbao@gmail.com
Cc: prime.zeng@hisilicon.com
Cc: Jonathan.Cameron@huawei.com
Cc: fanghao11@huawei.com
Cc: linuxarm@huawei.com
Cc: tim.c.chen@intel.com
Cc: linux-arm-kernel@lists.infradead.org
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
Link: https://lore.kernel.org/r/20240208024026.2691-1-yangyicong@huawei.com

authored by

Yicong Yang and committed by
Namhyung Kim
cbc917a1 9a440bb2

+154 -9
+11
tools/perf/Documentation/perf-stat.txt
··· 308 308 die number and the number of online processors on that die. This is 309 309 useful to gauge the amount of aggregation. 310 310 311 + --per-cluster:: 312 + Aggregate counts per processor cluster for system-wide mode measurement. This 313 + is a useful mode to detect imbalance between clusters. To enable this mode, 314 + use --per-cluster in addition to -a. (system-wide). The output includes the 315 + cluster number and the number of online processors on that cluster. This is 316 + useful to gauge the amount of aggregation. The information of cluster ID and 317 + related CPUs can be gotten from /sys/devices/system/cpu/cpuX/topology/cluster_{id, cpus}. 318 + 311 319 --per-cache:: 312 320 Aggregate counts per cache instance for system-wide mode measurements. By 313 321 default, the aggregation happens for the cache level at the highest index ··· 403 395 404 396 --per-die:: 405 397 Aggregate counts per processor die for system-wide mode measurements. 398 + 399 + --per-cluster:: 400 + Aggregate counts perf processor cluster for system-wide mode measurements. 406 401 407 402 --per-cache:: 408 403 Aggregate counts per cache instance for system-wide mode measurements. By
+49 -3
tools/perf/builtin-stat.c
··· 1238 1238 "aggregate counts per processor socket", AGGR_SOCKET), 1239 1239 OPT_SET_UINT(0, "per-die", &stat_config.aggr_mode, 1240 1240 "aggregate counts per processor die", AGGR_DIE), 1241 + OPT_SET_UINT(0, "per-cluster", &stat_config.aggr_mode, 1242 + "aggregate counts per processor cluster", AGGR_CLUSTER), 1241 1243 OPT_CALLBACK_OPTARG(0, "per-cache", &stat_config.aggr_mode, &stat_config.aggr_level, 1242 1244 "cache level", "aggregate count at this cache level (Default: LLC)", 1243 1245 parse_cache_level), ··· 1430 1428 static const char *const aggr_mode__string[] = { 1431 1429 [AGGR_CORE] = "core", 1432 1430 [AGGR_CACHE] = "cache", 1431 + [AGGR_CLUSTER] = "cluster", 1433 1432 [AGGR_DIE] = "die", 1434 1433 [AGGR_GLOBAL] = "global", 1435 1434 [AGGR_NODE] = "node", ··· 1456 1453 struct perf_cpu cpu) 1457 1454 { 1458 1455 return aggr_cpu_id__cache(cpu, /*data=*/NULL); 1456 + } 1457 + 1458 + static struct aggr_cpu_id perf_stat__get_cluster(struct perf_stat_config *config __maybe_unused, 1459 + struct perf_cpu cpu) 1460 + { 1461 + return aggr_cpu_id__cluster(cpu, /*data=*/NULL); 1459 1462 } 1460 1463 1461 1464 static struct aggr_cpu_id perf_stat__get_core(struct perf_stat_config *config __maybe_unused, ··· 1516 1507 return perf_stat__get_aggr(config, perf_stat__get_die, cpu); 1517 1508 } 1518 1509 1510 + static struct aggr_cpu_id perf_stat__get_cluster_cached(struct perf_stat_config *config, 1511 + struct perf_cpu cpu) 1512 + { 1513 + return perf_stat__get_aggr(config, perf_stat__get_cluster, cpu); 1514 + } 1515 + 1519 1516 static struct aggr_cpu_id perf_stat__get_cache_id_cached(struct perf_stat_config *config, 1520 1517 struct perf_cpu cpu) 1521 1518 { ··· 1559 1544 return aggr_cpu_id__socket; 1560 1545 case AGGR_DIE: 1561 1546 return aggr_cpu_id__die; 1547 + case AGGR_CLUSTER: 1548 + return aggr_cpu_id__cluster; 1562 1549 case AGGR_CACHE: 1563 1550 return aggr_cpu_id__cache; 1564 1551 case AGGR_CORE: ··· 1586 1569 return perf_stat__get_socket_cached; 1587 1570 
case AGGR_DIE: 1588 1571 return perf_stat__get_die_cached; 1572 + case AGGR_CLUSTER: 1573 + return perf_stat__get_cluster_cached; 1589 1574 case AGGR_CACHE: 1590 1575 return perf_stat__get_cache_id_cached; 1591 1576 case AGGR_CORE: ··· 1756 1737 return id; 1757 1738 } 1758 1739 1740 + static struct aggr_cpu_id perf_env__get_cluster_aggr_by_cpu(struct perf_cpu cpu, 1741 + void *data) 1742 + { 1743 + struct perf_env *env = data; 1744 + struct aggr_cpu_id id = aggr_cpu_id__empty(); 1745 + 1746 + if (cpu.cpu != -1) { 1747 + id.socket = env->cpu[cpu.cpu].socket_id; 1748 + id.die = env->cpu[cpu.cpu].die_id; 1749 + id.cluster = env->cpu[cpu.cpu].cluster_id; 1750 + } 1751 + 1752 + return id; 1753 + } 1754 + 1759 1755 static struct aggr_cpu_id perf_env__get_core_aggr_by_cpu(struct perf_cpu cpu, void *data) 1760 1756 { 1761 1757 struct perf_env *env = data; ··· 1778 1744 1779 1745 if (cpu.cpu != -1) { 1780 1746 /* 1781 - * core_id is relative to socket and die, 1782 - * we need a global id. So we set 1783 - * socket, die id and core id 1747 + * core_id is relative to socket, die and cluster, we need a 1748 + * global id. So we set socket, die id, cluster id and core id. 
1784 1749 */ 1785 1750 id.socket = env->cpu[cpu.cpu].socket_id; 1786 1751 id.die = env->cpu[cpu.cpu].die_id; 1752 + id.cluster = env->cpu[cpu.cpu].cluster_id; 1787 1753 id.core = env->cpu[cpu.cpu].core_id; 1788 1754 } 1789 1755 ··· 1839 1805 return perf_env__get_die_aggr_by_cpu(cpu, &perf_stat.session->header.env); 1840 1806 } 1841 1807 1808 + static struct aggr_cpu_id perf_stat__get_cluster_file(struct perf_stat_config *config __maybe_unused, 1809 + struct perf_cpu cpu) 1810 + { 1811 + return perf_env__get_cluster_aggr_by_cpu(cpu, &perf_stat.session->header.env); 1812 + } 1813 + 1842 1814 static struct aggr_cpu_id perf_stat__get_cache_file(struct perf_stat_config *config __maybe_unused, 1843 1815 struct perf_cpu cpu) 1844 1816 { ··· 1882 1842 return perf_env__get_socket_aggr_by_cpu; 1883 1843 case AGGR_DIE: 1884 1844 return perf_env__get_die_aggr_by_cpu; 1845 + case AGGR_CLUSTER: 1846 + return perf_env__get_cluster_aggr_by_cpu; 1885 1847 case AGGR_CACHE: 1886 1848 return perf_env__get_cache_aggr_by_cpu; 1887 1849 case AGGR_CORE: ··· 1909 1867 return perf_stat__get_socket_file; 1910 1868 case AGGR_DIE: 1911 1869 return perf_stat__get_die_file; 1870 + case AGGR_CLUSTER: 1871 + return perf_stat__get_cluster_file; 1912 1872 case AGGR_CACHE: 1913 1873 return perf_stat__get_cache_file; 1914 1874 case AGGR_CORE: ··· 2442 2398 "aggregate counts per processor socket", AGGR_SOCKET), 2443 2399 OPT_SET_UINT(0, "per-die", &perf_stat.aggr_mode, 2444 2400 "aggregate counts per processor die", AGGR_DIE), 2401 + OPT_SET_UINT(0, "per-cluster", &perf_stat.aggr_mode, 2402 + "aggregate counts perf processor cluster", AGGR_CLUSTER), 2445 2403 OPT_CALLBACK_OPTARG(0, "per-cache", &perf_stat.aggr_mode, &perf_stat.aggr_level, 2446 2404 "cache level", 2447 2405 "aggregate count at this cache level (Default: LLC)",
+3 -1
tools/perf/tests/shell/lib/perf_json_output_lint.py
··· 15 15 ap.add_argument('--per-core', action='store_true') 16 16 ap.add_argument('--per-thread', action='store_true') 17 17 ap.add_argument('--per-cache', action='store_true') 18 + ap.add_argument('--per-cluster', action='store_true') 18 19 ap.add_argument('--per-die', action='store_true') 19 20 ap.add_argument('--per-node', action='store_true') 20 21 ap.add_argument('--per-socket', action='store_true') ··· 50 49 'cgroup': lambda x: True, 51 50 'cpu': lambda x: isint(x), 52 51 'cache': lambda x: True, 52 + 'cluster': lambda x: True, 53 53 'die': lambda x: True, 54 54 'event': lambda x: True, 55 55 'event-runtime': lambda x: isfloat(x), ··· 90 88 expected_items = 7 91 89 elif args.interval or args.per_thread or args.system_wide_no_aggr: 92 90 expected_items = 8 93 - elif args.per_core or args.per_socket or args.per_node or args.per_die or args.per_cache: 91 + elif args.per_core or args.per_socket or args.per_node or args.per_die or args.per_cluster or args.per_cache: 94 92 expected_items = 9 95 93 else: 96 94 # If no option is specified, don't check the number of items.
+12
tools/perf/tests/shell/lib/stat_output.sh
··· 97 97 echo "[Success]" 98 98 } 99 99 100 + check_per_cluster() 101 + { 102 + echo -n "Checking $1 output: per cluster " 103 + if ParanoidAndNotRoot 0 104 + then 105 + echo "[Skip] paranoid and not root" 106 + return 107 + fi 108 + perf stat --per-cluster -a $2 true 109 + echo "[Success]" 110 + } 111 + 100 112 check_per_die() 101 113 { 102 114 echo -n "Checking $1 output: per die "
+2
tools/perf/tests/shell/stat+csv_output.sh
··· 42 42 ;; "--per-socket") exp=8 43 43 ;; "--per-node") exp=8 44 44 ;; "--per-die") exp=8 45 + ;; "--per-cluster") exp=8 45 46 ;; "--per-cache") exp=8 46 47 esac 47 48 ··· 80 79 check_system_wide_no_aggr "CSV" "$perf_cmd" 81 80 check_per_core "CSV" "$perf_cmd" 82 81 check_per_cache_instance "CSV" "$perf_cmd" 82 + check_per_cluster "CSV" "$perf_cmd" 83 83 check_per_die "CSV" "$perf_cmd" 84 84 check_per_socket "CSV" "$perf_cmd" 85 85 else
+13
tools/perf/tests/shell/stat+json_output.sh
··· 122 122 echo "[Success]" 123 123 } 124 124 125 + check_per_cluster() 126 + { 127 + echo -n "Checking json output: per cluster " 128 + if ParanoidAndNotRoot 0 129 + then 130 + echo "[Skip] paranoia and not root" 131 + return 132 + fi 133 + perf stat -j --per-cluster -a true 2>&1 | $PYTHON $pythonchecker --per-cluster 134 + echo "[Success]" 135 + } 136 + 125 137 check_per_die() 126 138 { 127 139 echo -n "Checking json output: per die " ··· 212 200 check_system_wide_no_aggr 213 201 check_per_core 214 202 check_per_cache_instance 203 + check_per_cluster 215 204 check_per_die 216 205 check_per_socket 217 206 else
+2
tools/perf/tests/shell/stat+std_output.sh
··· 40 40 ;; "--per-node") prefix=3 41 41 ;; "--per-die") prefix=3 42 42 ;; "--per-cache") prefix=3 43 + ;; "--per-cluster") prefix=3 43 44 esac 44 45 45 46 while read line ··· 100 99 check_system_wide_no_aggr "STD" "$perf_cmd" 101 100 check_per_core "STD" "$perf_cmd" 102 101 check_per_cache_instance "STD" "$perf_cmd" 102 + check_per_cluster "STD" "$perf_cmd" 103 103 check_per_die "STD" "$perf_cmd" 104 104 check_per_socket "STD" "$perf_cmd" 105 105 else
+31 -2
tools/perf/util/cpumap.c
··· 222 222 return a->socket - b->socket; 223 223 else if (a->die != b->die) 224 224 return a->die - b->die; 225 + else if (a->cluster != b->cluster) 226 + return a->cluster - b->cluster; 225 227 else if (a->cache_lvl != b->cache_lvl) 226 228 return a->cache_lvl - b->cache_lvl; 227 229 else if (a->cache != b->cache) ··· 311 309 return id; 312 310 } 313 311 312 + int cpu__get_cluster_id(struct perf_cpu cpu) 313 + { 314 + int value, ret = cpu__get_topology_int(cpu.cpu, "cluster_id", &value); 315 + 316 + return ret ?: value; 317 + } 318 + 319 + struct aggr_cpu_id aggr_cpu_id__cluster(struct perf_cpu cpu, void *data) 320 + { 321 + int cluster = cpu__get_cluster_id(cpu); 322 + struct aggr_cpu_id id; 323 + 324 + /* There is no cluster_id on legacy system. */ 325 + if (cluster == -1) 326 + cluster = 0; 327 + 328 + id = aggr_cpu_id__die(cpu, data); 329 + if (aggr_cpu_id__is_empty(&id)) 330 + return id; 331 + 332 + id.cluster = cluster; 333 + return id; 334 + } 335 + 314 336 int cpu__get_core_id(struct perf_cpu cpu) 315 337 { 316 338 int value, ret = cpu__get_topology_int(cpu.cpu, "core_id", &value); ··· 346 320 struct aggr_cpu_id id; 347 321 int core = cpu__get_core_id(cpu); 348 322 349 - /* aggr_cpu_id__die returns a struct with socket and die set. */ 350 - id = aggr_cpu_id__die(cpu, data); 323 + /* aggr_cpu_id__die returns a struct with socket die, and cluster set. 
*/ 324 + id = aggr_cpu_id__cluster(cpu, data); 351 325 if (aggr_cpu_id__is_empty(&id)) 352 326 return id; 353 327 ··· 709 683 a->node == b->node && 710 684 a->socket == b->socket && 711 685 a->die == b->die && 686 + a->cluster == b->cluster && 712 687 a->cache_lvl == b->cache_lvl && 713 688 a->cache == b->cache && 714 689 a->core == b->core && ··· 722 695 a->node == -1 && 723 696 a->socket == -1 && 724 697 a->die == -1 && 698 + a->cluster == -1 && 725 699 a->cache_lvl == -1 && 726 700 a->cache == -1 && 727 701 a->core == -1 && ··· 736 708 .node = -1, 737 709 .socket = -1, 738 710 .die = -1, 711 + .cluster = -1, 739 712 .cache_lvl = -1, 740 713 .cache = -1, 741 714 .core = -1,
+16 -3
tools/perf/util/cpumap.h
··· 20 20 int socket; 21 21 /** The die id as read from /sys/devices/system/cpu/cpuX/topology/die_id. */ 22 22 int die; 23 + /** The cluster id as read from /sys/devices/system/cpu/cpuX/topology/cluster_id */ 24 + int cluster; 23 25 /** The cache level as read from /sys/devices/system/cpu/cpuX/cache/indexY/level */ 24 26 int cache_lvl; 25 27 /** ··· 89 87 */ 90 88 int cpu__get_die_id(struct perf_cpu cpu); 91 89 /** 90 + * cpu__get_cluster_id - Returns the cluster id as read from 91 + * /sys/devices/system/cpu/cpuX/topology/cluster_id for the given CPU 92 + */ 93 + int cpu__get_cluster_id(struct perf_cpu cpu); 94 + /** 92 95 * cpu__get_core_id - Returns the core id as read from 93 96 * /sys/devices/system/cpu/cpuX/topology/core_id for the given CPU. 94 97 */ ··· 134 127 */ 135 128 struct aggr_cpu_id aggr_cpu_id__die(struct perf_cpu cpu, void *data); 136 129 /** 137 - * aggr_cpu_id__core - Create an aggr_cpu_id with the core, die and socket 138 - * populated with the core, die and socket for cpu. The function signature is 139 - * compatible with aggr_cpu_id_get_t. 130 + * aggr_cpu_id__cluster - Create an aggr_cpu_id with cluster, die and socket 131 + * populated with the cluster, die and socket for cpu. The function signature 132 + * is compatible with aggr_cpu_id_get_t. 133 + */ 134 + struct aggr_cpu_id aggr_cpu_id__cluster(struct perf_cpu cpu, void *data); 135 + /** 136 + * aggr_cpu_id__core - Create an aggr_cpu_id with the core, cluster, die and 137 + * socket populated with the core, die and socket for cpu. The function 138 + * signature is compatible with aggr_cpu_id_get_t. 140 139 */ 141 140 struct aggr_cpu_id aggr_cpu_id__core(struct perf_cpu cpu, void *data); 142 141 /**
+1
tools/perf/util/env.h
··· 12 12 struct cpu_topology_map { 13 13 int socket_id; 14 14 int die_id; 15 + int cluster_id; 15 16 int core_id; 16 17 }; 17 18
+13
tools/perf/util/stat-display.c
··· 201 201 snprintf(buf, sizeof(buf), "S%d-D%d-L%d-ID%d", 202 202 id.socket, id.die, id.cache_lvl, id.cache); 203 203 break; 204 + case AGGR_CLUSTER: 205 + snprintf(buf, sizeof(buf), "S%d-D%d-CLS%d", id.socket, id.die, id.cluster); 206 + break; 204 207 case AGGR_DIE: 205 208 snprintf(buf, sizeof(buf), "S%d-D%d", id.socket, id.die); 206 209 break; ··· 254 251 fprintf(config->output, "S%d-D%d-L%d-ID%d%s%d%s", 255 252 id.socket, id.die, id.cache_lvl, id.cache, sep, aggr_nr, sep); 256 253 break; 254 + case AGGR_CLUSTER: 255 + fprintf(config->output, "S%d-D%d-CLS%d%s%d%s", 256 + id.socket, id.die, id.cluster, sep, aggr_nr, sep); 257 + break; 257 258 case AGGR_DIE: 258 259 fprintf(output, "S%d-D%d%s%d%s", 259 260 id.socket, id.die, sep, aggr_nr, sep); ··· 306 299 case AGGR_CACHE: 307 300 fprintf(output, "\"cache\" : \"S%d-D%d-L%d-ID%d\", \"aggregate-number\" : %d, ", 308 301 id.socket, id.die, id.cache_lvl, id.cache, aggr_nr); 302 + break; 303 + case AGGR_CLUSTER: 304 + fprintf(output, "\"cluster\" : \"S%d-D%d-CLS%d\", \"aggregate-number\" : %d, ", 305 + id.socket, id.die, id.cluster, aggr_nr); 309 306 break; 310 307 case AGGR_DIE: 311 308 fprintf(output, "\"die\" : \"S%d-D%d\", \"aggregate-number\" : %d, ", ··· 1259 1248 case AGGR_NODE: 1260 1249 case AGGR_SOCKET: 1261 1250 case AGGR_DIE: 1251 + case AGGR_CLUSTER: 1262 1252 case AGGR_CACHE: 1263 1253 case AGGR_CORE: 1264 1254 fprintf(output, "#%*s %-*s cpus", ··· 1562 1550 switch (config->aggr_mode) { 1563 1551 case AGGR_CORE: 1564 1552 case AGGR_CACHE: 1553 + case AGGR_CLUSTER: 1565 1554 case AGGR_DIE: 1566 1555 case AGGR_SOCKET: 1567 1556 case AGGR_NODE:
+1
tools/perf/util/stat.h
··· 48 48 AGGR_GLOBAL, 49 49 AGGR_SOCKET, 50 50 AGGR_DIE, 51 + AGGR_CLUSTER, 51 52 AGGR_CACHE, 52 53 AGGR_CORE, 53 54 AGGR_THREAD,