perf vendor events intel: Update sapphirerapids events/metrics

+1 -1

tools/perf/pmu-events/arch/x86/mapfile.csv

··· 23 23 GenuineIntel-6-1[AEF],v3,nehalemep,core 24 24 GenuineIntel-6-2E,v3,nehalemex,core 25 25 GenuineIntel-6-2A,v19,sandybridge,core 26 - GenuineIntel-6-(8F|CF),v1.12,sapphirerapids,core 26 + GenuineIntel-6-(8F|CF),v1.13,sapphirerapids,core 27 27 GenuineIntel-6-AF,v1.00,sierraforest,core 28 28 GenuineIntel-6-(37|4A|4C|4D|5A),v15,silvermont,core 29 29 GenuineIntel-6-(4E|5E|8E|9E|A5|A6),v55,skylake,core

+4 -2

tools/perf/pmu-events/arch/x86/sapphirerapids/memory.json

··· 32 32 "UMask": "0x3" 33 33 }, 34 34 { 35 - "BriefDescription": "MEMORY_ACTIVITY.STALLS_L2_MISS", 35 + "BriefDescription": "Execution stalls while L2 cache miss demand cacheable load request is outstanding.", 36 36 "CounterMask": "5", 37 37 "EventCode": "0x47", 38 38 "EventName": "MEMORY_ACTIVITY.STALLS_L2_MISS", 39 + "PublicDescription": "Execution stalls while L2 cache miss demand cacheable load request is outstanding (will not count for uncacheable demand requests e.g. bus lock).", 39 40 "SampleAfterValue": "1000003", 40 41 "UMask": "0x5" 41 42 }, 42 43 { 43 - "BriefDescription": "MEMORY_ACTIVITY.STALLS_L3_MISS", 44 + "BriefDescription": "Execution stalls while L3 cache miss demand cacheable load request is outstanding.", 44 45 "CounterMask": "9", 45 46 "EventCode": "0x47", 46 47 "EventName": "MEMORY_ACTIVITY.STALLS_L3_MISS", 48 + "PublicDescription": "Execution stalls while L3 cache miss demand cacheable load request is outstanding (will not count for uncacheable demand requests e.g. bus lock).", 47 49 "SampleAfterValue": "1000003", 48 50 "UMask": "0x9" 49 51 },

+1009 -740

tools/perf/pmu-events/arch/x86/sapphirerapids/spr-metrics.json

··· 29 29 }, 30 30 { 31 31 "BriefDescription": "Uncore frequency per die [GHZ]", 32 - "MetricExpr": "tma_info_socket_clks / #num_dies / duration_time / 1e9", 32 + "MetricExpr": "tma_info_system_socket_clks / #num_dies / duration_time / 1e9", 33 33 "MetricGroup": "SoC", 34 34 "MetricName": "UNCORE_FREQ" 35 + }, 36 + { 37 + "BriefDescription": "Cycles per instruction retired; indicating how much time each executed instruction took; in units of cycles.", 38 + "MetricExpr": "CPU_CLK_UNHALTED.THREAD / INST_RETIRED.ANY", 39 + "MetricName": "cpi", 40 + "ScaleUnit": "1per_instr" 41 + }, 42 + { 43 + "BriefDescription": "CPU operating frequency (in GHz)", 44 + "MetricExpr": "CPU_CLK_UNHALTED.THREAD / CPU_CLK_UNHALTED.REF_TSC * #SYSTEM_TSC_FREQ / 1e9", 45 + "MetricName": "cpu_operating_frequency", 46 + "ScaleUnit": "1GHz" 47 + }, 48 + { 49 + "BriefDescription": "Percentage of time spent in the active CPU power state C0", 50 + "MetricExpr": "tma_info_system_cpu_utilization", 51 + "MetricName": "cpu_utilization", 52 + "ScaleUnit": "100%" 53 + }, 54 + { 55 + "BriefDescription": "Ratio of number of completed page walks (for 2 megabyte page sizes) caused by demand data loads to the total number of completed instructions", 56 + "MetricExpr": "DTLB_LOAD_MISSES.WALK_COMPLETED_2M_4M / INST_RETIRED.ANY", 57 + "MetricName": "dtlb_2nd_level_2mb_large_page_load_mpi", 58 + "PublicDescription": "Ratio of number of completed page walks (for 2 megabyte page sizes) caused by demand data loads to the total number of completed instructions. 
This implies it missed in the Data Translation Lookaside Buffer (DTLB) and further levels of TLB.", 59 + "ScaleUnit": "1per_instr" 60 + }, 61 + { 62 + "BriefDescription": "Ratio of number of completed page walks (for all page sizes) caused by demand data loads to the total number of completed instructions", 63 + "MetricExpr": "DTLB_LOAD_MISSES.WALK_COMPLETED / INST_RETIRED.ANY", 64 + "MetricName": "dtlb_2nd_level_load_mpi", 65 + "PublicDescription": "Ratio of number of completed page walks (for all page sizes) caused by demand data loads to the total number of completed instructions. This implies it missed in the DTLB and further levels of TLB.", 66 + "ScaleUnit": "1per_instr" 67 + }, 68 + { 69 + "BriefDescription": "Ratio of number of completed page walks (for all page sizes) caused by demand data stores to the total number of completed instructions", 70 + "MetricExpr": "DTLB_STORE_MISSES.WALK_COMPLETED / INST_RETIRED.ANY", 71 + "MetricName": "dtlb_2nd_level_store_mpi", 72 + "PublicDescription": "Ratio of number of completed page walks (for all page sizes) caused by demand data stores to the total number of completed instructions. 
This implies it missed in the DTLB and further levels of TLB.", 73 + "ScaleUnit": "1per_instr" 74 + }, 75 + { 76 + "BriefDescription": "Bandwidth of IO reads that are initiated by end device controllers that are requesting memory from the CPU.", 77 + "MetricExpr": "UNC_CHA_TOR_INSERTS.IO_PCIRDCUR * 64 / 1e6 / duration_time", 78 + "MetricName": "io_bandwidth_read", 79 + "ScaleUnit": "1MB/s" 80 + }, 81 + { 82 + "BriefDescription": "Bandwidth of IO writes that are initiated by end device controllers that are writing memory to the CPU.", 83 + "MetricExpr": "(UNC_CHA_TOR_INSERTS.IO_ITOM + UNC_CHA_TOR_INSERTS.IO_ITOMCACHENEAR) * 64 / 1e6 / duration_time", 84 + "MetricName": "io_bandwidth_write", 85 + "ScaleUnit": "1MB/s" 86 + }, 87 + { 88 + "BriefDescription": "Ratio of number of completed page walks (for 2 megabyte and 4 megabyte page sizes) caused by a code fetch to the total number of completed instructions", 89 + "MetricExpr": "ITLB_MISSES.WALK_COMPLETED_2M_4M / INST_RETIRED.ANY", 90 + "MetricName": "itlb_2nd_level_large_page_mpi", 91 + "PublicDescription": "Ratio of number of completed page walks (for 2 megabyte and 4 megabyte page sizes) caused by a code fetch to the total number of completed instructions. This implies it missed in the Instruction Translation Lookaside Buffer (ITLB) and further levels of TLB.", 92 + "ScaleUnit": "1per_instr" 93 + }, 94 + { 95 + "BriefDescription": "Ratio of number of completed page walks (for all page sizes) caused by a code fetch to the total number of completed instructions", 96 + "MetricExpr": "ITLB_MISSES.WALK_COMPLETED / INST_RETIRED.ANY", 97 + "MetricName": "itlb_2nd_level_mpi", 98 + "PublicDescription": "Ratio of number of completed page walks (for all page sizes) caused by a code fetch to the total number of completed instructions. 
This implies it missed in the ITLB (Instruction TLB) and further levels of TLB.", 99 + "ScaleUnit": "1per_instr" 100 + }, 101 + { 102 + "BriefDescription": "Ratio of number of code read requests missing in L1 instruction cache (includes prefetches) to the total number of completed instructions", 103 + "MetricExpr": "L2_RQSTS.ALL_CODE_RD / INST_RETIRED.ANY", 104 + "MetricName": "l1_i_code_read_misses_with_prefetches_per_instr", 105 + "ScaleUnit": "1per_instr" 106 + }, 107 + { 108 + "BriefDescription": "Ratio of number of demand load requests hitting in L1 data cache to the total number of completed instructions", 109 + "MetricExpr": "MEM_LOAD_RETIRED.L1_HIT / INST_RETIRED.ANY", 110 + "MetricName": "l1d_demand_data_read_hits_per_instr", 111 + "ScaleUnit": "1per_instr" 112 + }, 113 + { 114 + "BriefDescription": "Ratio of number of requests missing L1 data cache (includes data+rfo w/ prefetches) to the total number of completed instructions", 115 + "MetricExpr": "L1D.REPLACEMENT / INST_RETIRED.ANY", 116 + "MetricName": "l1d_mpi", 117 + "ScaleUnit": "1per_instr" 118 + }, 119 + { 120 + "BriefDescription": "Ratio of number of code read requests missing L2 cache to the total number of completed instructions", 121 + "MetricExpr": "L2_RQSTS.CODE_RD_MISS / INST_RETIRED.ANY", 122 + "MetricName": "l2_demand_code_mpi", 123 + "ScaleUnit": "1per_instr" 124 + }, 125 + { 126 + "BriefDescription": "Ratio of number of completed demand load requests hitting in L2 cache to the total number of completed instructions", 127 + "MetricExpr": "MEM_LOAD_RETIRED.L2_HIT / INST_RETIRED.ANY", 128 + "MetricName": "l2_demand_data_read_hits_per_instr", 129 + "ScaleUnit": "1per_instr" 130 + }, 131 + { 132 + "BriefDescription": "Ratio of number of completed data read requests missing L2 cache to the total number of completed instructions", 133 + "MetricExpr": "MEM_LOAD_RETIRED.L2_MISS / INST_RETIRED.ANY", 134 + "MetricName": "l2_demand_data_read_mpi", 135 + "ScaleUnit": "1per_instr" 136 + }, 137 + { 138 
+ "BriefDescription": "Ratio of number of requests missing L2 cache (includes code+data+rfo w/ prefetches) to the total number of completed instructions", 139 + "MetricExpr": "L2_LINES_IN.ALL / INST_RETIRED.ANY", 140 + "MetricName": "l2_mpi", 141 + "ScaleUnit": "1per_instr" 142 + }, 143 + { 144 + "BriefDescription": "Ratio of number of code read requests missing last level core cache (includes demand w/ prefetches) to the total number of completed instructions", 145 + "MetricExpr": "UNC_CHA_TOR_INSERTS.IA_MISS_CRD / INST_RETIRED.ANY", 146 + "MetricName": "llc_code_read_mpi_demand_plus_prefetch", 147 + "ScaleUnit": "1per_instr" 148 + }, 149 + { 150 + "BriefDescription": "Ratio of number of data read requests missing last level core cache (includes demand w/ prefetches) to the total number of completed instructions", 151 + "MetricExpr": "(UNC_CHA_TOR_INSERTS.IA_MISS_LLCPREFDATA + UNC_CHA_TOR_INSERTS.IA_MISS_DRD + UNC_CHA_TOR_INSERTS.IA_MISS_DRD_PREF) / INST_RETIRED.ANY", 152 + "MetricName": "llc_data_read_mpi_demand_plus_prefetch", 153 + "ScaleUnit": "1per_instr" 154 + }, 155 + { 156 + "BriefDescription": "Average latency of a last level cache (LLC) demand data read miss (read memory access) in nano seconds", 157 + "MetricExpr": "1e9 * (UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD / UNC_CHA_TOR_INSERTS.IA_MISS_DRD) / (UNC_CHA_CLOCKTICKS / (source_count(UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD) * #num_packages)) * duration_time", 158 + "MetricName": "llc_demand_data_read_miss_latency", 159 + "ScaleUnit": "1ns" 160 + }, 161 + { 162 + "BriefDescription": "Average latency of a last level cache (LLC) demand data read miss (read memory access) addressed to local memory in nano seconds", 163 + "MetricExpr": "1e9 * (UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_LOCAL / UNC_CHA_TOR_INSERTS.IA_MISS_DRD_LOCAL) / (UNC_CHA_CLOCKTICKS / (source_count(UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_LOCAL) * #num_packages)) * duration_time", 164 + "MetricName": "llc_demand_data_read_miss_latency_for_local_requests", 165 + 
"ScaleUnit": "1ns" 166 + }, 167 + { 168 + "BriefDescription": "Average latency of a last level cache (LLC) demand data read miss (read memory access) addressed to remote memory in nano seconds", 169 + "MetricExpr": "1e9 * (UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE / UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE) / (UNC_CHA_CLOCKTICKS / (source_count(UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE) * #num_packages)) * duration_time", 170 + "MetricName": "llc_demand_data_read_miss_latency_for_remote_requests", 171 + "ScaleUnit": "1ns" 172 + }, 173 + { 174 + "BriefDescription": "Average latency of a last level cache (LLC) demand data read miss (read memory access) addressed to DRAM in nano seconds", 175 + "MetricExpr": "1e9 * (UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_DDR / UNC_CHA_TOR_INSERTS.IA_MISS_DRD_DDR) / (UNC_CHA_CLOCKTICKS / (source_count(UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_DDR) * #num_packages)) * duration_time", 176 + "MetricName": "llc_demand_data_read_miss_to_dram_latency", 177 + "ScaleUnit": "1ns" 178 + }, 179 + { 180 + "BriefDescription": "Average latency of a last level cache (LLC) demand data read miss (read memory access) addressed to Intel(R) Optane(TM) Persistent Memory(PMEM) in nano seconds", 181 + "MetricExpr": "1e9 * (UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_PMM / UNC_CHA_TOR_INSERTS.IA_MISS_DRD_PMM) / (UNC_CHA_CLOCKTICKS / (source_count(UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_PMM) * #num_packages)) * duration_time", 182 + "MetricName": "llc_demand_data_read_miss_to_pmem_latency", 183 + "ScaleUnit": "1ns" 184 + }, 185 + { 186 + "BriefDescription": "Bandwidth (MB/sec) of read requests that miss the last level cache (LLC) and go to local memory.", 187 + "MetricExpr": "UNC_CHA_REQUESTS.READS_LOCAL * 64 / 1e6 / duration_time", 188 + "MetricName": "llc_miss_local_memory_bandwidth_read", 189 + "ScaleUnit": "1MB/s" 190 + }, 191 + { 192 + "BriefDescription": "Bandwidth (MB/sec) of write requests that miss the last level cache (LLC) and go to local memory.", 193 + "MetricExpr": 
"UNC_CHA_REQUESTS.WRITES_LOCAL * 64 / 1e6 / duration_time", 194 + "MetricName": "llc_miss_local_memory_bandwidth_write", 195 + "ScaleUnit": "1MB/s" 196 + }, 197 + { 198 + "BriefDescription": "Bandwidth (MB/sec) of read requests that miss the last level cache (LLC) and go to remote memory.", 199 + "MetricExpr": "UNC_CHA_REQUESTS.READS_REMOTE * 64 / 1e6 / duration_time", 200 + "MetricName": "llc_miss_remote_memory_bandwidth_read", 201 + "ScaleUnit": "1MB/s" 202 + }, 203 + { 204 + "BriefDescription": "Bandwidth (MB/sec) of write requests that miss the last level cache (LLC) and go to remote memory.", 205 + "MetricExpr": "UNC_CHA_REQUESTS.WRITES_REMOTE * 64 / 1e6 / duration_time", 206 + "MetricName": "llc_miss_remote_memory_bandwidth_write", 207 + "ScaleUnit": "1MB/s" 208 + }, 209 + { 210 + "BriefDescription": "The ratio of number of completed memory load instructions to the total number completed instructions", 211 + "MetricExpr": "MEM_INST_RETIRED.ALL_LOADS / INST_RETIRED.ANY", 212 + "MetricName": "loads_per_instr", 213 + "ScaleUnit": "1per_instr" 214 + }, 215 + { 216 + "BriefDescription": "DDR memory read bandwidth (MB/sec)", 217 + "MetricExpr": "UNC_M_CAS_COUNT.RD * 64 / 1e6 / duration_time", 218 + "MetricName": "memory_bandwidth_read", 219 + "ScaleUnit": "1MB/s" 220 + }, 221 + { 222 + "BriefDescription": "DDR memory bandwidth (MB/sec)", 223 + "MetricExpr": "(UNC_M_CAS_COUNT.RD + UNC_M_CAS_COUNT.WR) * 64 / 1e6 / duration_time", 224 + "MetricName": "memory_bandwidth_total", 225 + "ScaleUnit": "1MB/s" 226 + }, 227 + { 228 + "BriefDescription": "DDR memory write bandwidth (MB/sec)", 229 + "MetricExpr": "UNC_M_CAS_COUNT.WR * 64 / 1e6 / duration_time", 230 + "MetricName": "memory_bandwidth_write", 231 + "ScaleUnit": "1MB/s" 232 + }, 233 + { 234 + "BriefDescription": "Memory write bandwidth (MB/sec) caused by directory updates; includes DDR and Intel(R) Optane(TM) Persistent Memory(PMEM).", 235 + "MetricExpr": "(UNC_CHA_DIR_UPDATE.HA + UNC_CHA_DIR_UPDATE.TOR + 
UNC_M2M_DIRECTORY_UPDATE.ANY) * 64 / 1e6 / duration_time", 236 + "MetricName": "memory_extra_write_bw_due_to_directory_updates", 237 + "ScaleUnit": "1MB/s" 238 + }, 239 + { 240 + "BriefDescription": "Memory reads that miss the last level cache (LLC) addressed to local DRAM as a percentage of total memory read accesses, does not include LLC prefetches.", 241 + "MetricExpr": "(UNC_CHA_TOR_INSERTS.IA_MISS_DRD_LOCAL + UNC_CHA_TOR_INSERTS.IA_MISS_DRD_PREF_LOCAL) / (UNC_CHA_TOR_INSERTS.IA_MISS_DRD_LOCAL + UNC_CHA_TOR_INSERTS.IA_MISS_DRD_PREF_LOCAL + UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE + UNC_CHA_TOR_INSERTS.IA_MISS_DRD_PREF_REMOTE)", 242 + "MetricName": "numa_reads_addressed_to_local_dram", 243 + "ScaleUnit": "100%" 244 + }, 245 + { 246 + "BriefDescription": "Memory reads that miss the last level cache (LLC) addressed to remote DRAM as a percentage of total memory read accesses, does not include LLC prefetches.", 247 + "MetricExpr": "(UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE + UNC_CHA_TOR_INSERTS.IA_MISS_DRD_PREF_REMOTE) / (UNC_CHA_TOR_INSERTS.IA_MISS_DRD_LOCAL + UNC_CHA_TOR_INSERTS.IA_MISS_DRD_PREF_LOCAL + UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE + UNC_CHA_TOR_INSERTS.IA_MISS_DRD_PREF_REMOTE)", 248 + "MetricName": "numa_reads_addressed_to_remote_dram", 249 + "ScaleUnit": "100%" 250 + }, 251 + { 252 + "BriefDescription": "Uops delivered from decoded instruction cache (decoded stream buffer or DSB) as a percent of total uops delivered to Instruction Decode Queue", 253 + "MetricExpr": "IDQ.DSB_UOPS / (IDQ.DSB_UOPS + IDQ.MITE_UOPS + IDQ.MS_UOPS + LSD.UOPS)", 254 + "MetricName": "percent_uops_delivered_from_decoded_icache", 255 + "ScaleUnit": "100%" 256 + }, 257 + { 258 + "BriefDescription": "Uops delivered from legacy decode pipeline (Micro-instruction Translation Engine or MITE) as a percent of total uops delivered to Instruction Decode Queue", 259 + "MetricExpr": "IDQ.MITE_UOPS / (IDQ.DSB_UOPS + IDQ.MITE_UOPS + IDQ.MS_UOPS + LSD.UOPS)", 260 + "MetricName": 
"percent_uops_delivered_from_legacy_decode_pipeline", 261 + "ScaleUnit": "100%" 262 + }, 263 + { 264 + "BriefDescription": "Uops delivered from microcode sequencer (MS) as a percent of total uops delivered to Instruction Decode Queue", 265 + "MetricExpr": "IDQ.MS_UOPS / (IDQ.DSB_UOPS + IDQ.MITE_UOPS + IDQ.MS_UOPS + LSD.UOPS)", 266 + "MetricName": "percent_uops_delivered_from_microcode_sequencer", 267 + "ScaleUnit": "100%" 268 + }, 269 + { 270 + "BriefDescription": "Intel(R) Optane(TM) Persistent Memory(PMEM) memory read bandwidth (MB/sec)", 271 + "MetricExpr": "UNC_M_PMM_RPQ_INSERTS * 64 / 1e6 / duration_time", 272 + "MetricName": "pmem_memory_bandwidth_read", 273 + "ScaleUnit": "1MB/s" 274 + }, 275 + { 276 + "BriefDescription": "Intel(R) Optane(TM) Persistent Memory(PMEM) memory bandwidth (MB/sec)", 277 + "MetricExpr": "(UNC_M_PMM_RPQ_INSERTS + UNC_M_PMM_WPQ_INSERTS) * 64 / 1e6 / duration_time", 278 + "MetricName": "pmem_memory_bandwidth_total", 279 + "ScaleUnit": "1MB/s" 280 + }, 281 + { 282 + "BriefDescription": "Intel(R) Optane(TM) Persistent Memory(PMEM) memory write bandwidth (MB/sec)", 283 + "MetricExpr": "UNC_M_PMM_WPQ_INSERTS * 64 / 1e6 / duration_time", 284 + "MetricName": "pmem_memory_bandwidth_write", 285 + "ScaleUnit": "1MB/s" 35 286 }, 36 287 { 37 288 "BriefDescription": "Percentage of cycles spent in System Management Interrupts.", ··· 300 49 "ScaleUnit": "1SMI#" 301 50 }, 302 51 { 52 + "BriefDescription": "The ratio of number of completed memory store instructions to the total number completed instructions", 53 + "MetricExpr": "MEM_INST_RETIRED.ALL_STORES / INST_RETIRED.ANY", 54 + "MetricName": "stores_per_instr", 55 + "ScaleUnit": "1per_instr" 56 + }, 57 + { 303 58 "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution ports for ALU operations.", 304 - "MetricExpr": "(UOPS_DISPATCHED.PORT_0 + UOPS_DISPATCHED.PORT_1 + UOPS_DISPATCHED.PORT_5_11 + UOPS_DISPATCHED.PORT_6) / (5 * tma_info_core_clks)", 59 + 
"MetricExpr": "(UOPS_DISPATCHED.PORT_0 + UOPS_DISPATCHED.PORT_1 + UOPS_DISPATCHED.PORT_5_11 + UOPS_DISPATCHED.PORT_6) / (5 * tma_info_core_core_clks)", 305 60 "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group", 306 61 "MetricName": "tma_alu_op_utilization", 307 62 "MetricThreshold": "tma_alu_op_utilization > 0.6", ··· 315 58 }, 316 59 { 317 60 "BriefDescription": "This metric estimates fraction of cycles where the Advanced Matrix Extensions (AMX) execution engine was busy with tile (arithmetic) operations", 318 - "MetricExpr": "EXE.AMX_BUSY / tma_info_core_clks", 61 + "MetricExpr": "EXE.AMX_BUSY / tma_info_core_core_clks", 319 62 "MetricGroup": "Compute;HPC;Server;TopdownL5;tma_L5_group;tma_ports_utilized_0_group", 320 63 "MetricName": "tma_amx_busy", 321 64 "MetricThreshold": "tma_amx_busy > 0.5 & (tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)))", ··· 323 66 }, 324 67 { 325 68 "BriefDescription": "This metric estimates fraction of slots the CPU retired uops delivered by the Microcode_Sequencer as a result of Assists", 326 - "MetricExpr": "100 * cpu@ASSISTS.ANY\\,umask\\=0x1B@ / tma_info_slots", 69 + "MetricExpr": "100 * cpu@ASSISTS.ANY\\,umask\\=0x1B@ / tma_info_thread_slots", 327 70 "MetricGroup": "TopdownL4;tma_L4_group;tma_microcode_sequencer_group", 328 71 "MetricName": "tma_assists", 329 72 "MetricThreshold": "tma_assists > 0.1 & (tma_microcode_sequencer > 0.05 & tma_heavy_operations > 0.1)", ··· 332 75 }, 333 76 { 334 77 "BriefDescription": "This metric estimates fraction of slots the CPU retired uops as a result of handing SSE to AVX* or AVX* to SSE transition Assists.", 335 - "MetricExpr": "63 * ASSISTS.SSE_AVX_MIX / tma_info_slots", 78 + "MetricExpr": "63 * ASSISTS.SSE_AVX_MIX / tma_info_thread_slots", 336 79 "MetricGroup": "HPC;TopdownL5;tma_L5_group;tma_assists_group", 337 80 "MetricName": "tma_avx_assists", 338 81 "MetricThreshold": "tma_avx_assists > 0.1", ··· 340 83 }, 
341 84 { 342 85 "BriefDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend", 343 - "MetricExpr": "topdown\\-be\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 0 * tma_info_slots", 86 + "MetricExpr": "topdown\\-be\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 0 * tma_info_thread_slots", 344 87 "MetricGroup": "TmaL1;TopdownL1;tma_L1_group", 345 88 "MetricName": "tma_backend_bound", 346 89 "MetricThreshold": "tma_backend_bound > 0.2", ··· 360 103 }, 361 104 { 362 105 "BriefDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction", 363 - "MetricExpr": "topdown\\-br\\-mispredict / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 0 * tma_info_slots", 106 + "MetricExpr": "topdown\\-br\\-mispredict / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 0 * tma_info_thread_slots", 364 107 "MetricGroup": "BadSpec;BrMispredicts;TmaL2;TopdownL2;tma_L2_group;tma_bad_speculation_group;tma_issueBM", 365 108 "MetricName": "tma_branch_mispredicts", 366 109 "MetricThreshold": "tma_branch_mispredicts > 0.1 & tma_bad_speculation > 0.15", 367 110 "MetricgroupNoGroup": "TopdownL2", 368 - "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction. These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path. Sample with: TOPDOWN.BR_MISPREDICT_SLOTS. 
Related metrics: tma_info_branch_misprediction_cost, tma_info_mispredictions, tma_mispredicts_resteers", 111 + "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction. These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path. Sample with: TOPDOWN.BR_MISPREDICT_SLOTS. Related metrics: tma_info_bad_spec_branch_misprediction_cost, tma_info_bottleneck_mispredictions, tma_mispredicts_resteers", 369 112 "ScaleUnit": "100%" 370 113 }, 371 114 { 372 115 "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers", 373 - "MetricExpr": "INT_MISC.CLEAR_RESTEER_CYCLES / tma_info_clks + tma_unknown_branches", 116 + "MetricExpr": "INT_MISC.CLEAR_RESTEER_CYCLES / tma_info_thread_clks + tma_unknown_branches", 374 117 "MetricGroup": "FetchLat;TopdownL3;tma_L3_group;tma_fetch_latency_group", 375 118 "MetricName": "tma_branch_resteers", 376 119 "MetricThreshold": "tma_branch_resteers > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", ··· 388 131 }, 389 132 { 390 133 "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Machine Clears", 391 - "MetricExpr": "(1 - tma_branch_mispredicts / tma_bad_speculation) * INT_MISC.CLEAR_RESTEER_CYCLES / tma_info_clks", 134 + "MetricExpr": "(1 - tma_branch_mispredicts / tma_bad_speculation) * INT_MISC.CLEAR_RESTEER_CYCLES / tma_info_thread_clks", 392 135 "MetricGroup": "BadSpec;MachineClears;TopdownL4;tma_L4_group;tma_branch_resteers_group;tma_issueMC", 393 136 "MetricName": "tma_clears_resteers", 394 137 "MetricThreshold": "tma_clears_resteers > 0.05 & (tma_branch_resteers > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15))", ··· 398 141 { 399 142 "BriefDescription": "This metric estimates fraction of cycles while the 
memory subsystem was handling synchronizations due to contested accesses", 400 143 "MetricConstraint": "NO_GROUP_EVENTS", 401 - "MetricExpr": "(76 * tma_info_average_frequency * (MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD * (OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM / (OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM + OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD))) + 75.5 * tma_info_average_frequency * MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_clks", 144 + "MetricExpr": "(76 * tma_info_system_average_frequency * (MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD * (OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM / (OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM + OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD))) + 75.5 * tma_info_system_average_frequency * MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", 402 145 "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_l3_bound_group", 403 146 "MetricName": "tma_contested_accesses", 404 147 "MetricThreshold": "tma_contested_accesses > 0.05 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", ··· 418 161 { 419 162 "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to data-sharing accesses", 420 163 "MetricConstraint": "NO_GROUP_EVENTS", 421 - "MetricExpr": "75.5 * tma_info_average_frequency * (MEM_LOAD_L3_HIT_RETIRED.XSNP_NO_FWD + MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD * (1 - OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM / (OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM + OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD))) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_clks", 164 + "MetricExpr": "75.5 * tma_info_system_average_frequency * (MEM_LOAD_L3_HIT_RETIRED.XSNP_NO_FWD + MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD * (1 - OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM / (OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM + 
OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD))) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", 422 165 "MetricGroup": "Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_l3_bound_group", 423 166 "MetricName": "tma_data_sharing", 424 167 "MetricThreshold": "tma_data_sharing > 0.05 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", ··· 427 170 }, 428 171 { 429 172 "BriefDescription": "This metric represents fraction of cycles where decoder-0 was the only active decoder", 430 - "MetricExpr": "(cpu@INST_DECODED.DECODERS\\,cmask\\=1@ - cpu@INST_DECODED.DECODERS\\,cmask\\=2@) / tma_info_core_clks / 2", 173 + "MetricExpr": "(cpu@INST_DECODED.DECODERS\\,cmask\\=1@ - cpu@INST_DECODED.DECODERS\\,cmask\\=2@) / tma_info_core_core_clks / 2", 431 174 "MetricGroup": "DSBmiss;FetchBW;TopdownL4;tma_L4_group;tma_issueD0;tma_mite_group", 432 175 "MetricName": "tma_decoder0_alone", 433 - "MetricThreshold": "tma_decoder0_alone > 0.1 & (tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 6 > 0.35))", 176 + "MetricThreshold": "tma_decoder0_alone > 0.1 & (tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 6 > 0.35))", 434 177 "PublicDescription": "This metric represents fraction of cycles where decoder-0 was the only active decoder. 
Related metrics: tma_few_uops_instructions", 435 178 "ScaleUnit": "100%" 436 179 }, 437 180 { 438 181 "BriefDescription": "This metric represents fraction of cycles where the Divider unit was active", 439 - "MetricExpr": "ARITH.DIV_ACTIVE / tma_info_clks", 182 + "MetricExpr": "ARITH.DIV_ACTIVE / tma_info_thread_clks", 440 183 "MetricGroup": "TopdownL3;tma_L3_group;tma_core_bound_group", 441 184 "MetricName": "tma_divider", 442 185 "MetricThreshold": "tma_divider > 0.2 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)", ··· 446 189 { 447 190 "BriefDescription": "This metric estimates how often the CPU was stalled on accesses to external memory (DRAM) by loads", 448 191 "MetricConstraint": "NO_GROUP_EVENTS", 449 - "MetricExpr": "(MEMORY_ACTIVITY.STALLS_L3_MISS / tma_info_clks - tma_pmm_bound if #has_pmem > 0 else MEMORY_ACTIVITY.STALLS_L3_MISS / tma_info_clks)", 192 + "MetricExpr": "(MEMORY_ACTIVITY.STALLS_L3_MISS / tma_info_thread_clks - tma_pmm_bound if #has_pmem > 0 else MEMORY_ACTIVITY.STALLS_L3_MISS / tma_info_thread_clks)", 450 193 "MetricGroup": "MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", 451 194 "MetricName": "tma_dram_bound", 452 195 "MetricThreshold": "tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", ··· 455 198 }, 456 199 { 457 200 "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline", 458 - "MetricExpr": "(IDQ.DSB_CYCLES_ANY - IDQ.DSB_CYCLES_OK) / tma_info_core_clks / 2", 201 + "MetricExpr": "(IDQ.DSB_CYCLES_ANY - IDQ.DSB_CYCLES_OK) / tma_info_core_core_clks / 2", 459 202 "MetricGroup": "DSB;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group", 460 203 "MetricName": "tma_dsb", 461 - "MetricThreshold": "tma_dsb > 0.15 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 6 > 0.35)", 204 + "MetricThreshold": "tma_dsb > 0.15 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & 
tma_info_thread_ipc / 6 > 0.35)", 462 205 "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline. For example; inefficient utilization of the DSB cache structure or bank conflict when reading from it; are categorized here.", 463 206 "ScaleUnit": "100%" 464 207 }, 465 208 { 466 209 "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines", 467 - "MetricExpr": "DSB2MITE_SWITCHES.PENALTY_CYCLES / tma_info_clks", 210 + "MetricExpr": "DSB2MITE_SWITCHES.PENALTY_CYCLES / tma_info_thread_clks", 468 211 "MetricGroup": "DSBmiss;FetchLat;TopdownL3;tma_L3_group;tma_fetch_latency_group;tma_issueFB", 469 212 "MetricName": "tma_dsb_switches", 470 213 "MetricThreshold": "tma_dsb_switches > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", 471 - "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines. The DSB (decoded i-cache) is a Uop Cache where the front-end directly delivers Uops (micro operations) avoiding heavy x86 decoding. The DSB pipeline has shorter latency and delivered higher bandwidth than the MITE (legacy instruction decode pipeline). Switching between the two pipelines can cause penalties hence this metric measures the exposed penalty. Sample with: FRONTEND_RETIRED.DSB_MISS_PS. Related metrics: tma_fetch_bandwidth, tma_info_dsb_coverage, tma_info_dsb_misses, tma_info_iptb, tma_lcp", 214 + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines. The DSB (decoded i-cache) is a Uop Cache where the front-end directly delivers Uops (micro operations) avoiding heavy x86 decoding. The DSB pipeline has shorter latency and delivered higher bandwidth than the MITE (legacy instruction decode pipeline). 
Switching between the two pipelines can cause penalties hence this metric measures the exposed penalty. Sample with: FRONTEND_RETIRED.DSB_MISS_PS. Related metrics: tma_fetch_bandwidth, tma_info_botlnk_l2_dsb_misses, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb, tma_lcp", 472 215 "ScaleUnit": "100%" 473 216 }, 474 217 { 475 218 "BriefDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses", 476 - "MetricExpr": "min(7 * cpu@DTLB_LOAD_MISSES.STLB_HIT\\,cmask\\=1@ + DTLB_LOAD_MISSES.WALK_ACTIVE, max(CYCLE_ACTIVITY.CYCLES_MEM_ANY - MEMORY_ACTIVITY.CYCLES_L1D_MISS, 0)) / tma_info_clks", 219 + "MetricExpr": "min(7 * cpu@DTLB_LOAD_MISSES.STLB_HIT\\,cmask\\=1@ + DTLB_LOAD_MISSES.WALK_ACTIVE, max(CYCLE_ACTIVITY.CYCLES_MEM_ANY - MEMORY_ACTIVITY.CYCLES_L1D_MISS, 0)) / tma_info_thread_clks", 477 220 "MetricGroup": "MemoryTLB;TopdownL4;tma_L4_group;tma_issueTLB;tma_l1_bound_group", 478 221 "MetricName": "tma_dtlb_load", 479 222 "MetricThreshold": "tma_dtlb_load > 0.1 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", 480 - "PublicDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses. TLBs (Translation Look-aside Buffers) are processor caches for recently used entries out of the Page Tables that are used to map virtual- to physical-addresses by the operating system. This metric approximates the potential delay of demand loads missing the first-level data TLB (assuming worst case scenario with back to back misses to different pages). This includes hitting in the second-level TLB (STLB) as well as performing a hardware page walk on an STLB miss. Sample with: MEM_INST_RETIRED.STLB_MISS_LOADS_PS. Related metrics: tma_dtlb_store, tma_info_memory_data_tlbs", 223 + "PublicDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses. 
TLBs (Translation Look-aside Buffers) are processor caches for recently used entries out of the Page Tables that are used to map virtual- to physical-addresses by the operating system. This metric approximates the potential delay of demand loads missing the first-level data TLB (assuming worst case scenario with back to back misses to different pages). This includes hitting in the second-level TLB (STLB) as well as performing a hardware page walk on an STLB miss. Sample with: MEM_INST_RETIRED.STLB_MISS_LOADS_PS. Related metrics: tma_dtlb_store, tma_info_bottleneck_memory_data_tlbs", 481 224 "ScaleUnit": "100%" 482 225 }, 483 226 { 484 227 "BriefDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses", 485 - "MetricExpr": "(7 * cpu@DTLB_STORE_MISSES.STLB_HIT\\,cmask\\=1@ + DTLB_STORE_MISSES.WALK_ACTIVE) / tma_info_core_clks", 228 + "MetricExpr": "(7 * cpu@DTLB_STORE_MISSES.STLB_HIT\\,cmask\\=1@ + DTLB_STORE_MISSES.WALK_ACTIVE) / tma_info_core_core_clks", 486 229 "MetricGroup": "MemoryTLB;TopdownL4;tma_L4_group;tma_issueTLB;tma_store_bound_group", 487 230 "MetricName": "tma_dtlb_store", 488 231 "MetricThreshold": "tma_dtlb_store > 0.05 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", 489 - "PublicDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses. As with ordinary data caching; focus on improving data locality and reducing working-set size to reduce DTLB overhead. Additionally; consider using profile-guided optimization (PGO) to collocate frequently-used data on the same page. Try using larger page sizes for large amounts of frequently-used data. Sample with: MEM_INST_RETIRED.STLB_MISS_STORES_PS. Related metrics: tma_dtlb_load, tma_info_memory_data_tlbs", 232 + "PublicDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses. 
As with ordinary data caching; focus on improving data locality and reducing working-set size to reduce DTLB overhead. Additionally; consider using profile-guided optimization (PGO) to collocate frequently-used data on the same page. Try using larger page sizes for large amounts of frequently-used data. Sample with: MEM_INST_RETIRED.STLB_MISS_STORES_PS. Related metrics: tma_dtlb_load, tma_info_bottleneck_memory_data_tlbs", 490 233 "ScaleUnit": "100%" 491 234 }, 492 235 { 493 236 "BriefDescription": "This metric roughly estimates how often CPU was handling synchronizations due to False Sharing", 494 - "MetricExpr": "80 * tma_info_average_frequency * OCR.DEMAND_RFO.L3_HIT.SNOOP_HITM / tma_info_clks", 237 + "MetricExpr": "80 * tma_info_system_average_frequency * OCR.DEMAND_RFO.L3_HIT.SNOOP_HITM / tma_info_thread_clks", 495 238 "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_store_bound_group", 496 239 "MetricName": "tma_false_sharing", 497 240 "MetricThreshold": "tma_false_sharing > 0.05 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", ··· 500 243 }, 501 244 { 502 245 "BriefDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed", 503 - "MetricExpr": "L1D_PEND_MISS.FB_FULL / tma_info_clks", 246 + "MetricExpr": "L1D_PEND_MISS.FB_FULL / tma_info_thread_clks", 504 247 "MetricGroup": "MemoryBW;TopdownL4;tma_L4_group;tma_issueBW;tma_issueSL;tma_issueSmSt;tma_l1_bound_group", 505 248 "MetricName": "tma_fb_full", 506 249 "MetricThreshold": "tma_fb_full > 0.3", 507 - "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). 
Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory). Related metrics: tma_info_dram_bw_use, tma_info_memory_bandwidth, tma_mem_bandwidth, tma_sq_full, tma_store_latency, tma_streaming_stores", 250 + "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory). Related metrics: tma_info_bottleneck_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full, tma_store_latency, tma_streaming_stores", 508 251 "ScaleUnit": "100%" 509 252 }, 510 253 { ··· 512 255 "MetricExpr": "max(0, tma_frontend_bound - tma_fetch_latency)", 513 256 "MetricGroup": "FetchBW;Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group;tma_issueFB", 514 257 "MetricName": "tma_fetch_bandwidth", 515 - "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 6 > 0.35", 258 + "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 6 > 0.35", 516 259 "MetricgroupNoGroup": "TopdownL2", 517 - "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Sample with: FRONTEND_RETIRED.LATENCY_GE_2_BUBBLES_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_2_PS. 
Related metrics: tma_dsb_switches, tma_info_dsb_coverage, tma_info_dsb_misses, tma_info_iptb, tma_lcp", 260 + "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Sample with: FRONTEND_RETIRED.LATENCY_GE_2_BUBBLES_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_2_PS. Related metrics: tma_dsb_switches, tma_info_botlnk_l2_dsb_misses, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb, tma_lcp", 518 261 "ScaleUnit": "100%" 519 262 }, 520 263 { 521 264 "BriefDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues", 522 - "MetricExpr": "topdown\\-fetch\\-lat / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) - INT_MISC.UOP_DROPPING / tma_info_slots", 265 + "MetricExpr": "topdown\\-fetch\\-lat / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) - INT_MISC.UOP_DROPPING / tma_info_thread_slots", 523 266 "MetricGroup": "Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group", 524 267 "MetricName": "tma_fetch_latency", 525 268 "MetricThreshold": "tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15", ··· 538 281 }, 539 282 { 540 283 "BriefDescription": "This metric approximates arithmetic floating-point (FP) matrix uops fraction the CPU has retired (aggregated across all supported FP datatypes in AMX engine)", 541 - "MetricExpr": "cpu@AMX_OPS_RETIRED.BF16\\,cmask\\=1@ / (tma_retiring * tma_info_slots)", 284 + "MetricExpr": "cpu@AMX_OPS_RETIRED.BF16\\,cmask\\=1@ / (tma_retiring * tma_info_thread_slots)", 542 285 "MetricGroup": "Compute;Flops;HPC;Pipeline;Server;TopdownL4;tma_L4_group;tma_fp_arith_group", 543 286 
"MetricName": "tma_fp_amx", 544 287 "MetricThreshold": "tma_fp_amx > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6)", ··· 557 300 }, 558 301 { 559 302 "BriefDescription": "This metric roughly estimates fraction of slots the CPU retired uops as a result of handing Floating Point (FP) Assists", 560 - "MetricExpr": "30 * ASSISTS.FP / tma_info_slots", 303 + "MetricExpr": "30 * ASSISTS.FP / tma_info_thread_slots", 561 304 "MetricGroup": "HPC;TopdownL5;tma_L5_group;tma_assists_group", 562 305 "MetricName": "tma_fp_assists", 563 306 "MetricThreshold": "tma_fp_assists > 0.1", ··· 566 309 }, 567 310 { 568 311 "BriefDescription": "This metric approximates arithmetic floating-point (FP) scalar uops fraction the CPU has retired", 569 - "MetricExpr": "(cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ + FP_ARITH_INST_RETIRED2.SCALAR) / (tma_retiring * tma_info_slots)", 312 + "MetricExpr": "(cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ + FP_ARITH_INST_RETIRED2.SCALAR) / (tma_retiring * tma_info_thread_slots)", 570 313 "MetricGroup": "Compute;Flops;TopdownL4;tma_L4_group;tma_fp_arith_group;tma_issue2P", 571 314 "MetricName": "tma_fp_scalar", 572 315 "MetricThreshold": "tma_fp_scalar > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6)", ··· 575 318 }, 576 319 { 577 320 "BriefDescription": "This metric approximates arithmetic floating-point (FP) vector uops fraction the CPU has retired aggregated across all vector widths", 578 - "MetricExpr": "(cpu@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE\\,umask\\=0x3c@ + FP_ARITH_INST_RETIRED2.VECTOR) / (tma_retiring * tma_info_slots)", 321 + "MetricExpr": "(cpu@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE\\,umask\\=0x3c@ + FP_ARITH_INST_RETIRED2.VECTOR) / (tma_retiring * tma_info_thread_slots)", 579 322 "MetricGroup": "Compute;Flops;TopdownL4;tma_L4_group;tma_fp_arith_group;tma_issue2P", 580 323 "MetricName": "tma_fp_vector", 581 324 "MetricThreshold": "tma_fp_vector > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations 
> 0.6)", ··· 584 327 }, 585 328 { 586 329 "BriefDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 128-bit wide vectors", 587 - "MetricExpr": "(FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED2.128B_PACKED_HALF) / (tma_retiring * tma_info_slots)", 330 + "MetricExpr": "(FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED2.128B_PACKED_HALF) / (tma_retiring * tma_info_thread_slots)", 588 331 "MetricGroup": "Compute;Flops;TopdownL5;tma_L5_group;tma_fp_vector_group;tma_issue2P", 589 332 "MetricName": "tma_fp_vector_128b", 590 333 "MetricThreshold": "tma_fp_vector_128b > 0.1 & (tma_fp_vector > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6))", ··· 593 336 }, 594 337 { 595 338 "BriefDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 256-bit wide vectors", 596 - "MetricExpr": "(FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED2.256B_PACKED_HALF) / (tma_retiring * tma_info_slots)", 339 + "MetricExpr": "(FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED2.256B_PACKED_HALF) / (tma_retiring * tma_info_thread_slots)", 597 340 "MetricGroup": "Compute;Flops;TopdownL5;tma_L5_group;tma_fp_vector_group;tma_issue2P", 598 341 "MetricName": "tma_fp_vector_256b", 599 342 "MetricThreshold": "tma_fp_vector_256b > 0.1 & (tma_fp_vector > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6))", ··· 602 345 }, 603 346 { 604 347 "BriefDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 512-bit wide vectors", 605 - "MetricExpr": "(FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE + FP_ARITH_INST_RETIRED2.512B_PACKED_HALF) / (tma_retiring * tma_info_slots)", 348 + "MetricExpr": 
"(FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE + FP_ARITH_INST_RETIRED2.512B_PACKED_HALF) / (tma_retiring * tma_info_thread_slots)", 606 349 "MetricGroup": "Compute;Flops;TopdownL5;tma_L5_group;tma_fp_vector_group;tma_issue2P", 607 350 "MetricName": "tma_fp_vector_512b", 608 351 "MetricThreshold": "tma_fp_vector_512b > 0.1 & (tma_fp_vector > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6))", ··· 611 354 }, 612 355 { 613 356 "BriefDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend", 614 - "MetricExpr": "topdown\\-fe\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) - INT_MISC.UOP_DROPPING / tma_info_slots", 357 + "MetricExpr": "topdown\\-fe\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) - INT_MISC.UOP_DROPPING / tma_info_thread_slots", 615 358 "MetricGroup": "PGO;TmaL1;TopdownL1;tma_L1_group", 616 359 "MetricName": "tma_frontend_bound", 617 360 "MetricThreshold": "tma_frontend_bound > 0.15", ··· 621 364 }, 622 365 { 623 366 "BriefDescription": "This metric represents fraction of slots where the CPU was retiring fused instructions -- where one uop can represent multiple contiguous instructions", 624 - "MetricExpr": "tma_light_operations * INST_RETIRED.MACRO_FUSED / (tma_retiring * tma_info_slots)", 367 + "MetricExpr": "tma_light_operations * INST_RETIRED.MACRO_FUSED / (tma_retiring * tma_info_thread_slots)", 625 368 "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group", 626 369 "MetricName": "tma_fused_instructions", 627 370 "MetricThreshold": "tma_fused_instructions > 0.1 & tma_light_operations > 0.6", ··· 630 373 }, 631 374 { 632 375 "BriefDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences", 633 - 
"MetricExpr": "topdown\\-heavy\\-ops / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 0 * tma_info_slots", 376 + "MetricExpr": "topdown\\-heavy\\-ops / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 0 * tma_info_thread_slots", 634 377 "MetricGroup": "Retire;TmaL2;TopdownL2;tma_L2_group;tma_retiring_group", 635 378 "MetricName": "tma_heavy_operations", 636 379 "MetricThreshold": "tma_heavy_operations > 0.1", ··· 640 383 }, 641 384 { 642 385 "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses", 643 - "MetricExpr": "ICACHE_DATA.STALLS / tma_info_clks", 386 + "MetricExpr": "ICACHE_DATA.STALLS / tma_info_thread_clks", 644 387 "MetricGroup": "BigFoot;FetchLat;IcMiss;TopdownL3;tma_L3_group;tma_fetch_latency_group", 645 388 "MetricName": "tma_icache_misses", 646 389 "MetricThreshold": "tma_icache_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", ··· 648 391 "ScaleUnit": "100%" 649 392 }, 650 393 { 651 - "BriefDescription": "Measured Average Frequency for unhalted processors [GHz]", 652 - "MetricExpr": "tma_info_turbo_utilization * TSC / 1e9 / duration_time", 653 - "MetricGroup": "Power;Summary", 654 - "MetricName": "tma_info_average_frequency" 655 - }, 656 - { 657 - "BriefDescription": "Total pipeline cost of instruction fetch related bottlenecks by large code footprint programs (i-side cache; TLB and BTB misses)", 658 - "MetricConstraint": "NO_GROUP_EVENTS", 659 - "MetricExpr": "100 * tma_fetch_latency * (tma_itlb_misses + tma_icache_misses + tma_unknown_branches) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)", 660 - "MetricGroup": "BigFoot;Fed;Frontend;IcMiss;MemoryTLB;tma_issueBC", 661 - "MetricName": "tma_info_big_code", 662 - "MetricThreshold": "tma_info_big_code > 20", 663 - "PublicDescription": "Total pipeline cost of instruction 
fetch related bottlenecks by large code footprint programs (i-side cache; TLB and BTB misses). Related metrics: tma_info_branching_overhead" 664 - }, 665 - { 666 - "BriefDescription": "Branch instructions per taken branch.", 667 - "MetricExpr": "BR_INST_RETIRED.ALL_BRANCHES / BR_INST_RETIRED.NEAR_TAKEN", 668 - "MetricGroup": "Branches;Fed;PGO", 669 - "MetricName": "tma_info_bptkbranch" 670 - }, 671 - { 672 394 "BriefDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear)", 673 395 "MetricConstraint": "NO_GROUP_EVENTS", 674 - "MetricExpr": "(tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) * tma_info_slots / BR_MISP_RETIRED.ALL_BRANCHES", 396 + "MetricExpr": "(tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) * tma_info_thread_slots / BR_MISP_RETIRED.ALL_BRANCHES", 675 397 "MetricGroup": "Bad;BrMispredicts;tma_issueBM", 676 - "MetricName": "tma_info_branch_misprediction_cost", 677 - "PublicDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear). Related metrics: tma_branch_mispredicts, tma_info_mispredictions, tma_mispredicts_resteers" 398 + "MetricName": "tma_info_bad_spec_branch_misprediction_cost", 399 + "PublicDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear). 
Related metrics: tma_branch_mispredicts, tma_info_bottleneck_mispredictions, tma_mispredicts_resteers" 678 400 }, 679 401 { 680 - "BriefDescription": "Total pipeline cost of branch related instructions (used for program control-flow including function calls)", 681 - "MetricExpr": "100 * ((BR_INST_RETIRED.COND + 3 * BR_INST_RETIRED.NEAR_CALL + (BR_INST_RETIRED.NEAR_TAKEN - BR_INST_RETIRED.COND_TAKEN - 2 * BR_INST_RETIRED.NEAR_CALL)) / tma_info_slots)", 682 - "MetricGroup": "Ret;tma_issueBC", 683 - "MetricName": "tma_info_branching_overhead", 684 - "MetricThreshold": "tma_info_branching_overhead > 10", 685 - "PublicDescription": "Total pipeline cost of branch related instructions (used for program control-flow including function calls). Related metrics: tma_info_big_code" 402 + "BriefDescription": "Instructions per retired mispredicts for conditional non-taken branches (lower number means higher occurrence rate).", 403 + "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.COND_NTAKEN", 404 + "MetricGroup": "Bad;BrMispredicts", 405 + "MetricName": "tma_info_bad_spec_ipmisp_cond_ntaken", 406 + "MetricThreshold": "tma_info_bad_spec_ipmisp_cond_ntaken < 200" 686 407 }, 687 408 { 688 - "BriefDescription": "Fraction of branches that are CALL or RET", 689 - "MetricExpr": "(BR_INST_RETIRED.NEAR_CALL + BR_INST_RETIRED.NEAR_RETURN) / BR_INST_RETIRED.ALL_BRANCHES", 690 - "MetricGroup": "Bad;Branches", 691 - "MetricName": "tma_info_callret" 409 + "BriefDescription": "Instructions per retired mispredicts for conditional taken branches (lower number means higher occurrence rate).", 410 + "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.COND_TAKEN", 411 + "MetricGroup": "Bad;BrMispredicts", 412 + "MetricName": "tma_info_bad_spec_ipmisp_cond_taken", 413 + "MetricThreshold": "tma_info_bad_spec_ipmisp_cond_taken < 200" 692 414 }, 693 415 { 694 - "BriefDescription": "Per-Logical Processor actual clocks when the Logical Processor is active.", 695 - "MetricExpr": 
"CPU_CLK_UNHALTED.THREAD", 696 - "MetricGroup": "Pipeline", 697 - "MetricName": "tma_info_clks" 416 + "BriefDescription": "Instructions per retired mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate).", 417 + "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.INDIRECT", 418 + "MetricGroup": "Bad;BrMispredicts", 419 + "MetricName": "tma_info_bad_spec_ipmisp_indirect", 420 + "MetricThreshold": "tma_info_bad_spec_ipmisp_indirect < 1e3" 698 421 }, 699 422 { 700 - "BriefDescription": "STLB (2nd level TLB) code speculative misses per kilo instruction (misses of any page-size that complete the page walk)", 701 - "MetricExpr": "1e3 * ITLB_MISSES.WALK_COMPLETED / INST_RETIRED.ANY", 702 - "MetricGroup": "Fed;MemoryTLB", 703 - "MetricName": "tma_info_code_stlb_mpki" 423 + "BriefDescription": "Instructions per retired mispredicts for return branches (lower number means higher occurrence rate).", 424 + "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.RET", 425 + "MetricGroup": "Bad;BrMispredicts", 426 + "MetricName": "tma_info_bad_spec_ipmisp_ret", 427 + "MetricThreshold": "tma_info_bad_spec_ipmisp_ret < 500" 704 428 }, 705 429 { 706 - "BriefDescription": "Fraction of branches that are non-taken conditionals", 707 - "MetricExpr": "BR_INST_RETIRED.COND_NTAKEN / BR_INST_RETIRED.ALL_BRANCHES", 708 - "MetricGroup": "Bad;Branches;CodeGen;PGO", 709 - "MetricName": "tma_info_cond_nt" 710 - }, 711 - { 712 - "BriefDescription": "Fraction of branches that are taken conditionals", 713 - "MetricExpr": "BR_INST_RETIRED.COND_TAKEN / BR_INST_RETIRED.ALL_BRANCHES", 714 - "MetricGroup": "Bad;Branches;CodeGen;PGO", 715 - "MetricName": "tma_info_cond_tk" 430 + "BriefDescription": "Number of Instructions per non-speculative Branch Misprediction (JEClear) (lower number means higher occurrence rate)", 431 + "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.ALL_BRANCHES", 432 + "MetricGroup": "Bad;BadSpec;BrMispredicts", 433 + "MetricName": 
"tma_info_bad_spec_ipmispredict", 434 + "MetricThreshold": "tma_info_bad_spec_ipmispredict < 200" 716 435 }, 717 436 { 718 437 "BriefDescription": "Probability of Core Bound bottleneck hidden by SMT-profiling artifacts", 719 438 "MetricConstraint": "NO_GROUP_EVENTS", 720 - "MetricExpr": "(100 * (1 - tma_core_bound / tma_ports_utilization if tma_core_bound < tma_ports_utilization else 1) if tma_info_smt_2t_utilization > 0.5 else 0)", 439 + "MetricExpr": "(100 * (1 - tma_core_bound / tma_ports_utilization if tma_core_bound < tma_ports_utilization else 1) if tma_info_system_smt_2t_utilization > 0.5 else 0)", 721 440 "MetricGroup": "Cor;SMT", 722 - "MetricName": "tma_info_core_bound_likely", 723 - "MetricThreshold": "tma_info_core_bound_likely > 0.5" 724 - }, 725 - { 726 - "BriefDescription": "Core actual clocks when any Logical Processor is active on the Physical Core", 727 - "MetricExpr": "CPU_CLK_UNHALTED.DISTRIBUTED", 728 - "MetricGroup": "SMT", 729 - "MetricName": "tma_info_core_clks" 730 - }, 731 - { 732 - "BriefDescription": "Instructions Per Cycle across hyper-threads (per physical core)", 733 - "MetricExpr": "INST_RETIRED.ANY / tma_info_core_clks", 734 - "MetricGroup": "Ret;SMT;TmaL1;tma_L1_group", 735 - "MetricName": "tma_info_coreipc" 736 - }, 737 - { 738 - "BriefDescription": "Cycles Per Instruction (per Logical Processor)", 739 - "MetricExpr": "1 / tma_info_ipc", 740 - "MetricGroup": "Mem;Pipeline", 741 - "MetricName": "tma_info_cpi" 742 - }, 743 - { 744 - "BriefDescription": "Average CPU Utilization", 745 - "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / TSC", 746 - "MetricGroup": "HPC;Summary", 747 - "MetricName": "tma_info_cpu_utilization" 748 - }, 749 - { 750 - "BriefDescription": "Average Parallel L2 cache miss data reads", 751 - "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD", 752 - "MetricGroup": "Memory_BW;Offcore", 753 - "MetricName": "tma_info_data_l2_mlp" 754 - }, 755 - { 756 - 
"BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]", 757 - "MetricExpr": "64 * (UNC_M_CAS_COUNT.RD + UNC_M_CAS_COUNT.WR) / 1e9 / duration_time", 758 - "MetricGroup": "HPC;Mem;MemoryBW;SoC;tma_issueBW", 759 - "MetricName": "tma_info_dram_bw_use", 760 - "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_fb_full, tma_info_memory_bandwidth, tma_mem_bandwidth, tma_sq_full" 761 - }, 762 - { 763 - "BriefDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache)", 764 - "MetricExpr": "IDQ.DSB_UOPS / UOPS_ISSUED.ANY", 765 - "MetricGroup": "DSB;Fed;FetchBW;tma_issueFB", 766 - "MetricName": "tma_info_dsb_coverage", 767 - "MetricThreshold": "tma_info_dsb_coverage < 0.7 & tma_info_ipc / 6 > 0.35", 768 - "PublicDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache). Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_dsb_misses, tma_info_iptb, tma_lcp" 441 + "MetricName": "tma_info_botlnk_l0_core_bound_likely", 442 + "MetricThreshold": "tma_info_botlnk_l0_core_bound_likely > 0.5" 769 443 }, 770 444 { 771 445 "BriefDescription": "Total pipeline cost of DSB (uop cache) misses - subset of the Instruction_Fetch_BW Bottleneck", 772 446 "MetricConstraint": "NO_GROUP_EVENTS", 773 447 "MetricExpr": "100 * (tma_fetch_latency * tma_dsb_switches / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) + tma_fetch_bandwidth * tma_mite / (tma_dsb + tma_mite))", 774 448 "MetricGroup": "DSBmiss;Fed;tma_issueFB", 775 - "MetricName": "tma_info_dsb_misses", 776 - "MetricThreshold": "tma_info_dsb_misses > 10", 777 - "PublicDescription": "Total pipeline cost of DSB (uop cache) misses - subset of the Instruction_Fetch_BW Bottleneck. 
Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_dsb_coverage, tma_info_iptb, tma_lcp" 778 - }, 779 - { 780 - "BriefDescription": "Average number of cycles of a switch from the DSB fetch-unit to MITE fetch unit - see DSB_Switches tree node for details.", 781 - "MetricExpr": "DSB2MITE_SWITCHES.PENALTY_CYCLES / cpu@DSB2MITE_SWITCHES.PENALTY_CYCLES\\,cmask\\=1\\,edge@", 782 - "MetricGroup": "DSBmiss", 783 - "MetricName": "tma_info_dsb_switch_cost" 784 - }, 785 - { 786 - "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-thread", 787 - "MetricExpr": "UOPS_EXECUTED.THREAD / cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@", 788 - "MetricGroup": "Cor;Pipeline;PortsUtil;SMT", 789 - "MetricName": "tma_info_execute" 790 - }, 791 - { 792 - "BriefDescription": "The ratio of Executed- by Issued-Uops", 793 - "MetricExpr": "UOPS_EXECUTED.THREAD / UOPS_ISSUED.ANY", 794 - "MetricGroup": "Cor;Pipeline", 795 - "MetricName": "tma_info_execute_per_issue", 796 - "PublicDescription": "The ratio of Executed- by Issued-Uops. Ratio > 1 suggests high rate of uop micro-fusions. Ratio < 1 suggest high rate of \"execute\" at rename stage." 
797 - }, 798 - { 799 - "BriefDescription": "Fill Buffer (FB) hits per kilo instructions for retired demand loads (L1D misses that merge into ongoing miss-handling entries)", 800 - "MetricExpr": "1e3 * MEM_LOAD_RETIRED.FB_HIT / INST_RETIRED.ANY", 801 - "MetricGroup": "CacheMisses;Mem", 802 - "MetricName": "tma_info_fb_hpki" 803 - }, 804 - { 805 - "BriefDescription": "Average number of Uops issued by front-end when it issued something", 806 - "MetricExpr": "UOPS_ISSUED.ANY / cpu@UOPS_ISSUED.ANY\\,cmask\\=1@", 807 - "MetricGroup": "Fed;FetchBW", 808 - "MetricName": "tma_info_fetch_upc" 809 - }, 810 - { 811 - "BriefDescription": "Floating Point Operations Per Cycle", 812 - "MetricConstraint": "NO_GROUP_EVENTS", 813 - "MetricExpr": "cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ + FP_ARITH_INST_RETIRED2.SCALAR_HALF + 2 * (FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED2.COMPLEX_SCALAR_HALF) + 4 * cpu@FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE\\,umask\\=0x18@ + 8 * (FP_ARITH_INST_RETIRED2.128B_PACKED_HALF + cpu@FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE\\,umask\\=0x60@) + 16 * (FP_ARITH_INST_RETIRED2.256B_PACKED_HALF + FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) + 32 * FP_ARITH_INST_RETIRED2.512B_PACKED_HALF + 4 * AMX_OPS_RETIRED.BF16", 814 - "MetricGroup": "Flops;Ret", 815 - "MetricName": "tma_info_flopc" 816 - }, 817 - { 818 - "BriefDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width)", 819 - "MetricConstraint": "NO_GROUP_EVENTS", 820 - "MetricExpr": "(FP_ARITH_DISPATCHED.PORT_0 + FP_ARITH_DISPATCHED.PORT_1 + FP_ARITH_DISPATCHED.PORT_5) / (2 * tma_info_core_clks)", 821 - "MetricGroup": "Cor;Flops;HPC", 822 - "MetricName": "tma_info_fp_arith_utilization", 823 - "PublicDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width). 
Values > 1 are possible due to ([BDW+] Fused-Multiply Add (FMA) counting - common; [ADL+] use all of ADD/MUL/FMA in Scalar or 128/256-bit vectors - less common)." 824 - }, 825 - { 826 - "BriefDescription": "Giga Floating Point Operations Per Second", 827 - "MetricExpr": "tma_info_flopc / duration_time", 828 - "MetricGroup": "Cor;Flops;HPC", 829 - "MetricName": "tma_info_gflops", 830 - "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width and AMX engine." 449 + "MetricName": "tma_info_botlnk_l2_dsb_misses", 450 + "MetricThreshold": "tma_info_botlnk_l2_dsb_misses > 10", 451 + "PublicDescription": "Total pipeline cost of DSB (uop cache) misses - subset of the Instruction_Fetch_BW Bottleneck. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb, tma_lcp" 831 452 }, 832 453 { 833 454 "BriefDescription": "Total pipeline cost of Instruction Cache misses - subset of the Big_Code Bottleneck", 834 455 "MetricExpr": "100 * (tma_fetch_latency * tma_icache_misses / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches))", 835 456 "MetricGroup": "Fed;FetchLat;IcMiss;tma_issueFL", 836 - "MetricName": "tma_info_ic_misses", 837 - "MetricThreshold": "tma_info_ic_misses > 5", 457 + "MetricName": "tma_info_botlnk_l2_ic_misses", 458 + "MetricThreshold": "tma_info_botlnk_l2_ic_misses > 5", 838 459 "PublicDescription": "Total pipeline cost of Instruction Cache misses - subset of the Big_Code Bottleneck. 
Related metrics: " 839 460 }, 840 461 { 841 - "BriefDescription": "Average Latency for L1 instruction cache misses", 842 - "MetricExpr": "ICACHE_DATA.STALLS / cpu@ICACHE_DATA.STALLS\\,cmask\\=1\\,edge@", 843 - "MetricGroup": "Fed;FetchLat;IcMiss", 844 - "MetricName": "tma_info_icache_miss_latency" 462 + "BriefDescription": "Total pipeline cost of instruction fetch related bottlenecks by large code footprint programs (i-side cache; TLB and BTB misses)", 463 + "MetricConstraint": "NO_GROUP_EVENTS", 464 + "MetricExpr": "100 * tma_fetch_latency * (tma_itlb_misses + tma_icache_misses + tma_unknown_branches) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)", 465 + "MetricGroup": "BigFoot;Fed;Frontend;IcMiss;MemoryTLB;tma_issueBC", 466 + "MetricName": "tma_info_bottleneck_big_code", 467 + "MetricThreshold": "tma_info_bottleneck_big_code > 20", 468 + "PublicDescription": "Total pipeline cost of instruction fetch related bottlenecks by large code footprint programs (i-side cache; TLB and BTB misses). 
Related metrics: tma_info_bottleneck_branching_overhead" 845 469 }, 846 470 { 847 - "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-core", 848 - "MetricExpr": "UOPS_EXECUTED.THREAD / (UOPS_EXECUTED.CORE_CYCLES_GE_1 / 2 if #SMT_on else UOPS_EXECUTED.CORE_CYCLES_GE_1)", 849 - "MetricGroup": "Backend;Cor;Pipeline;PortsUtil", 850 - "MetricName": "tma_info_ilp" 471 + "BriefDescription": "Total pipeline cost of branch related instructions (used for program control-flow including function calls)", 472 + "MetricExpr": "100 * ((BR_INST_RETIRED.COND + 3 * BR_INST_RETIRED.NEAR_CALL + (BR_INST_RETIRED.NEAR_TAKEN - BR_INST_RETIRED.COND_TAKEN - 2 * BR_INST_RETIRED.NEAR_CALL)) / tma_info_thread_slots)", 473 + "MetricGroup": "Ret;tma_issueBC", 474 + "MetricName": "tma_info_bottleneck_branching_overhead", 475 + "MetricThreshold": "tma_info_bottleneck_branching_overhead > 10", 476 + "PublicDescription": "Total pipeline cost of branch related instructions (used for program control-flow including function calls). 
Related metrics: tma_info_bottleneck_big_code" 851 477 }, 852 478 { 853 479 "BriefDescription": "Total pipeline cost of instruction fetch bandwidth related bottlenecks", 854 480 "MetricConstraint": "NO_GROUP_EVENTS", 855 - "MetricExpr": "100 * (tma_frontend_bound - tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) - tma_info_big_code", 481 + "MetricExpr": "100 * (tma_frontend_bound - tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) - tma_info_bottleneck_big_code", 856 482 "MetricGroup": "Fed;FetchBW;Frontend", 857 - "MetricName": "tma_info_instruction_fetch_bw", 858 - "MetricThreshold": "tma_info_instruction_fetch_bw > 20" 859 - }, 860 - { 861 - "BriefDescription": "Total number of retired Instructions", 862 - "MetricExpr": "INST_RETIRED.ANY", 863 - "MetricGroup": "Summary;TmaL1;tma_L1_group", 864 - "MetricName": "tma_info_instructions", 865 - "PublicDescription": "Total number of retired Instructions. 
Sample with: INST_RETIRED.PREC_DIST" 866 - }, 867 - { 868 - "BriefDescription": "Average IO (network or disk) Bandwidth Use for Writes [GB / sec]", 869 - "MetricExpr": "UNC_CHA_TOR_INSERTS.IO_PCIRDCUR * 64 / 1e9 / duration_time", 870 - "MetricGroup": "IoBW;Mem;Server;SoC", 871 - "MetricName": "tma_info_io_write_bw" 872 - }, 873 - { 874 - "BriefDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate)", 875 - "MetricExpr": "INST_RETIRED.ANY / (cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ + FP_ARITH_INST_RETIRED2.SCALAR + (cpu@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE\\,umask\\=0x3c@ + FP_ARITH_INST_RETIRED2.VECTOR))", 876 - "MetricGroup": "Flops;InsType", 877 - "MetricName": "tma_info_iparith", 878 - "MetricThreshold": "tma_info_iparith < 10", 879 - "PublicDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. Approximated prior to BDW." 880 - }, 881 - { 882 - "BriefDescription": "Instructions per FP Arithmetic AMX operation (lower number means higher occurrence rate)", 883 - "MetricExpr": "INST_RETIRED.ANY / AMX_OPS_RETIRED.BF16", 884 - "MetricGroup": "Flops;FpVector;InsType;Server", 885 - "MetricName": "tma_info_iparith_amx_f16", 886 - "MetricThreshold": "tma_info_iparith_amx_f16 < 10", 887 - "PublicDescription": "Instructions per FP Arithmetic AMX operation (lower number means higher occurrence rate). Operations factored per matrices' sizes of the AMX instructions." 
888 - }, 889 - { 890 - "BriefDescription": "Instructions per Integer Arithmetic AMX operation (lower number means higher occurrence rate)", 891 - "MetricExpr": "INST_RETIRED.ANY / AMX_OPS_RETIRED.INT8", 892 - "MetricGroup": "InsType;IntVector;Server", 893 - "MetricName": "tma_info_iparith_amx_int8", 894 - "MetricThreshold": "tma_info_iparith_amx_int8 < 10", 895 - "PublicDescription": "Instructions per Integer Arithmetic AMX operation (lower number means higher occurrence rate). Operations factored per matrices' sizes of the AMX instructions." 896 - }, 897 - { 898 - "BriefDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate)", 899 - "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED2.128B_PACKED_HALF)", 900 - "MetricGroup": "Flops;FpVector;InsType", 901 - "MetricName": "tma_info_iparith_avx128", 902 - "MetricThreshold": "tma_info_iparith_avx128 < 10", 903 - "PublicDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." 904 - }, 905 - { 906 - "BriefDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate)", 907 - "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED2.256B_PACKED_HALF)", 908 - "MetricGroup": "Flops;FpVector;InsType", 909 - "MetricName": "tma_info_iparith_avx256", 910 - "MetricThreshold": "tma_info_iparith_avx256 < 10", 911 - "PublicDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." 
912 - }, 913 - { 914 - "BriefDescription": "Instructions per FP Arithmetic AVX 512-bit instruction (lower number means higher occurrence rate)", 915 - "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE + FP_ARITH_INST_RETIRED2.512B_PACKED_HALF)", 916 - "MetricGroup": "Flops;FpVector;InsType", 917 - "MetricName": "tma_info_iparith_avx512", 918 - "MetricThreshold": "tma_info_iparith_avx512 < 10", 919 - "PublicDescription": "Instructions per FP Arithmetic AVX 512-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." 920 - }, 921 - { 922 - "BriefDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate)", 923 - "MetricExpr": "INST_RETIRED.ANY / FP_ARITH_INST_RETIRED.SCALAR_DOUBLE", 924 - "MetricGroup": "Flops;FpScalar;InsType", 925 - "MetricName": "tma_info_iparith_scalar_dp", 926 - "MetricThreshold": "tma_info_iparith_scalar_dp < 10", 927 - "PublicDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." 928 - }, 929 - { 930 - "BriefDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate)", 931 - "MetricExpr": "INST_RETIRED.ANY / FP_ARITH_INST_RETIRED.SCALAR_SINGLE", 932 - "MetricGroup": "Flops;FpScalar;InsType", 933 - "MetricName": "tma_info_iparith_scalar_sp", 934 - "MetricThreshold": "tma_info_iparith_scalar_sp < 10", 935 - "PublicDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." 
936 - }, 937 - { 938 - "BriefDescription": "Instructions per a microcode Assist invocation", 939 - "MetricExpr": "INST_RETIRED.ANY / cpu@ASSISTS.ANY\\,umask\\=0x1B@", 940 - "MetricGroup": "Pipeline;Ret;Retire", 941 - "MetricName": "tma_info_ipassist", 942 - "MetricThreshold": "tma_info_ipassist < 100e3", 943 - "PublicDescription": "Instructions per a microcode Assist invocation. See Assists tree node for details (lower number means higher occurrence rate)" 944 - }, 945 - { 946 - "BriefDescription": "Instructions per Branch (lower number means higher occurrence rate)", 947 - "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.ALL_BRANCHES", 948 - "MetricGroup": "Branches;Fed;InsType", 949 - "MetricName": "tma_info_ipbranch", 950 - "MetricThreshold": "tma_info_ipbranch < 8" 951 - }, 952 - { 953 - "BriefDescription": "Instructions Per Cycle (per Logical Processor)", 954 - "MetricExpr": "INST_RETIRED.ANY / tma_info_clks", 955 - "MetricGroup": "Ret;Summary", 956 - "MetricName": "tma_info_ipc" 957 - }, 958 - { 959 - "BriefDescription": "Instructions per (near) call (lower number means higher occurrence rate)", 960 - "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.NEAR_CALL", 961 - "MetricGroup": "Branches;Fed;PGO", 962 - "MetricName": "tma_info_ipcall", 963 - "MetricThreshold": "tma_info_ipcall < 200" 964 - }, 965 - { 966 - "BriefDescription": "Instructions per non-speculative DSB miss (lower number means higher occurrence rate)", 967 - "MetricExpr": "INST_RETIRED.ANY / FRONTEND_RETIRED.ANY_DSB_MISS", 968 - "MetricGroup": "DSBmiss;Fed", 969 - "MetricName": "tma_info_ipdsb_miss_ret", 970 - "MetricThreshold": "tma_info_ipdsb_miss_ret < 50" 971 - }, 972 - { 973 - "BriefDescription": "Instructions per Far Branch ( Far Branches apply upon transition from application to operating system, handling interrupts, exceptions) [lower number means higher occurrence rate]", 974 - "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.FAR_BRANCH:u", 975 - "MetricGroup": "Branches;OS", 976 - 
"MetricName": "tma_info_ipfarbranch", 977 - "MetricThreshold": "tma_info_ipfarbranch < 1e6" 978 - }, 979 - { 980 - "BriefDescription": "Instructions per Floating Point (FP) Operation (lower number means higher occurrence rate)", 981 - "MetricExpr": "INST_RETIRED.ANY / tma_info_flopc", 982 - "MetricGroup": "Flops;InsType", 983 - "MetricName": "tma_info_ipflop", 984 - "MetricThreshold": "tma_info_ipflop < 10" 985 - }, 986 - { 987 - "BriefDescription": "Instructions per Load (lower number means higher occurrence rate)", 988 - "MetricExpr": "INST_RETIRED.ANY / MEM_INST_RETIRED.ALL_LOADS", 989 - "MetricGroup": "InsType", 990 - "MetricName": "tma_info_ipload", 991 - "MetricThreshold": "tma_info_ipload < 3" 992 - }, 993 - { 994 - "BriefDescription": "Instructions per retired mispredicts for conditional non-taken branches (lower number means higher occurrence rate).", 995 - "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.COND_NTAKEN", 996 - "MetricGroup": "Bad;BrMispredicts", 997 - "MetricName": "tma_info_ipmisp_cond_ntaken", 998 - "MetricThreshold": "tma_info_ipmisp_cond_ntaken < 200" 999 - }, 1000 - { 1001 - "BriefDescription": "Instructions per retired mispredicts for conditional taken branches (lower number means higher occurrence rate).", 1002 - "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.COND_TAKEN", 1003 - "MetricGroup": "Bad;BrMispredicts", 1004 - "MetricName": "tma_info_ipmisp_cond_taken", 1005 - "MetricThreshold": "tma_info_ipmisp_cond_taken < 200" 1006 - }, 1007 - { 1008 - "BriefDescription": "Instructions per retired mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate).", 1009 - "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.INDIRECT", 1010 - "MetricGroup": "Bad;BrMispredicts", 1011 - "MetricName": "tma_info_ipmisp_indirect", 1012 - "MetricThreshold": "tma_info_ipmisp_indirect < 1e3" 1013 - }, 1014 - { 1015 - "BriefDescription": "Instructions per retired mispredicts for return branches (lower number means higher 
occurrence rate).", 1016 - "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.RET", 1017 - "MetricGroup": "Bad;BrMispredicts", 1018 - "MetricName": "tma_info_ipmisp_ret", 1019 - "MetricThreshold": "tma_info_ipmisp_ret < 500" 1020 - }, 1021 - { 1022 - "BriefDescription": "Number of Instructions per non-speculative Branch Misprediction (JEClear) (lower number means higher occurrence rate)", 1023 - "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.ALL_BRANCHES", 1024 - "MetricGroup": "Bad;BadSpec;BrMispredicts", 1025 - "MetricName": "tma_info_ipmispredict", 1026 - "MetricThreshold": "tma_info_ipmispredict < 200" 1027 - }, 1028 - { 1029 - "BriefDescription": "Instructions per Store (lower number means higher occurrence rate)", 1030 - "MetricExpr": "INST_RETIRED.ANY / MEM_INST_RETIRED.ALL_STORES", 1031 - "MetricGroup": "InsType", 1032 - "MetricName": "tma_info_ipstore", 1033 - "MetricThreshold": "tma_info_ipstore < 8" 1034 - }, 1035 - { 1036 - "BriefDescription": "Instructions per Software prefetch instruction (of any type: NTA/T0/T1/T2/Prefetch) (lower number means higher occurrence rate)", 1037 - "MetricExpr": "INST_RETIRED.ANY / cpu@SW_PREFETCH_ACCESS.T0\\,umask\\=0xF@", 1038 - "MetricGroup": "Prefetches", 1039 - "MetricName": "tma_info_ipswpf", 1040 - "MetricThreshold": "tma_info_ipswpf < 100" 1041 - }, 1042 - { 1043 - "BriefDescription": "Instruction per taken branch", 1044 - "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.NEAR_TAKEN", 1045 - "MetricGroup": "Branches;Fed;FetchBW;Frontend;PGO;tma_issueFB", 1046 - "MetricName": "tma_info_iptb", 1047 - "MetricThreshold": "tma_info_iptb < 13", 1048 - "PublicDescription": "Instruction per taken branch. 
Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_dsb_coverage, tma_info_dsb_misses, tma_lcp" 1049 - }, 1050 - { 1051 - "BriefDescription": "Instructions per speculative Unknown Branch Misprediction (BAClear) (lower number means higher occurrence rate)", 1052 - "MetricExpr": "tma_info_instructions / BACLEARS.ANY", 1053 - "MetricGroup": "Fed", 1054 - "MetricName": "tma_info_ipunknown_branch" 1055 - }, 1056 - { 1057 - "BriefDescription": "Fraction of branches that are unconditional (direct or indirect) jumps", 1058 - "MetricExpr": "(BR_INST_RETIRED.NEAR_TAKEN - BR_INST_RETIRED.COND_TAKEN - 2 * BR_INST_RETIRED.NEAR_CALL) / BR_INST_RETIRED.ALL_BRANCHES", 1059 - "MetricGroup": "Bad;Branches", 1060 - "MetricName": "tma_info_jump" 1061 - }, 1062 - { 1063 - "BriefDescription": "Cycles Per Instruction for the Operating System (OS) Kernel mode", 1064 - "MetricExpr": "CPU_CLK_UNHALTED.THREAD_P:k / INST_RETIRED.ANY_P:k", 1065 - "MetricGroup": "OS", 1066 - "MetricName": "tma_info_kernel_cpi" 1067 - }, 1068 - { 1069 - "BriefDescription": "Fraction of cycles spent in the Operating System (OS) Kernel mode", 1070 - "MetricExpr": "CPU_CLK_UNHALTED.THREAD_P:k / CPU_CLK_UNHALTED.THREAD", 1071 - "MetricGroup": "OS", 1072 - "MetricName": "tma_info_kernel_utilization", 1073 - "MetricThreshold": "tma_info_kernel_utilization > 0.05" 1074 - }, 1075 - { 1076 - "BriefDescription": "Average per-core data fill bandwidth to the L1 data cache [GB / sec]", 1077 - "MetricExpr": "64 * L1D.REPLACEMENT / 1e9 / duration_time", 1078 - "MetricGroup": "Mem;MemoryBW", 1079 - "MetricName": "tma_info_l1d_cache_fill_bw" 1080 - }, 1081 - { 1082 - "BriefDescription": "Average per-thread data fill bandwidth to the L1 data cache [GB / sec]", 1083 - "MetricExpr": "tma_info_l1d_cache_fill_bw", 1084 - "MetricGroup": "Mem;MemoryBW", 1085 - "MetricName": "tma_info_l1d_cache_fill_bw_1t" 1086 - }, 1087 - { 1088 - "BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads", 
1089 - "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L1_MISS / INST_RETIRED.ANY", 1090 - "MetricGroup": "CacheMisses;Mem", 1091 - "MetricName": "tma_info_l1mpki" 1092 - }, 1093 - { 1094 - "BriefDescription": "L1 cache true misses per kilo instruction for all demand loads (including speculative)", 1095 - "MetricExpr": "1e3 * L2_RQSTS.ALL_DEMAND_DATA_RD / INST_RETIRED.ANY", 1096 - "MetricGroup": "CacheMisses;Mem", 1097 - "MetricName": "tma_info_l1mpki_load" 1098 - }, 1099 - { 1100 - "BriefDescription": "Average per-core data fill bandwidth to the L2 cache [GB / sec]", 1101 - "MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / duration_time", 1102 - "MetricGroup": "Mem;MemoryBW", 1103 - "MetricName": "tma_info_l2_cache_fill_bw" 1104 - }, 1105 - { 1106 - "BriefDescription": "Average per-thread data fill bandwidth to the L2 cache [GB / sec]", 1107 - "MetricExpr": "tma_info_l2_cache_fill_bw", 1108 - "MetricGroup": "Mem;MemoryBW", 1109 - "MetricName": "tma_info_l2_cache_fill_bw_1t" 1110 - }, 1111 - { 1112 - "BriefDescription": "Rate of non silent evictions from the L2 cache per Kilo instruction", 1113 - "MetricExpr": "1e3 * L2_LINES_OUT.NON_SILENT / tma_info_instructions", 1114 - "MetricGroup": "L2Evicts;Mem;Server", 1115 - "MetricName": "tma_info_l2_evictions_nonsilent_pki" 1116 - }, 1117 - { 1118 - "BriefDescription": "Rate of silent evictions from the L2 cache per Kilo instruction where the evicted lines are dropped (no writeback to L3 or memory)", 1119 - "MetricExpr": "1e3 * L2_LINES_OUT.SILENT / tma_info_instructions", 1120 - "MetricGroup": "L2Evicts;Mem;Server", 1121 - "MetricName": "tma_info_l2_evictions_silent_pki" 1122 - }, 1123 - { 1124 - "BriefDescription": "L2 cache hits per kilo instruction for all request types (including speculative)", 1125 - "MetricExpr": "1e3 * (L2_RQSTS.REFERENCES - L2_RQSTS.MISS) / INST_RETIRED.ANY", 1126 - "MetricGroup": "CacheMisses;Mem", 1127 - "MetricName": "tma_info_l2hpki_all" 1128 - }, 1129 - { 1130 - "BriefDescription": "L2 cache hits per kilo 
instruction for all demand loads (including speculative)", 1131 - "MetricExpr": "1e3 * L2_RQSTS.DEMAND_DATA_RD_HIT / INST_RETIRED.ANY", 1132 - "MetricGroup": "CacheMisses;Mem", 1133 - "MetricName": "tma_info_l2hpki_load" 1134 - }, 1135 - { 1136 - "BriefDescription": "L2 cache true misses per kilo instruction for retired demand loads", 1137 - "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L2_MISS / INST_RETIRED.ANY", 1138 - "MetricGroup": "Backend;CacheMisses;Mem", 1139 - "MetricName": "tma_info_l2mpki" 1140 - }, 1141 - { 1142 - "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all request types (including speculative)", 1143 - "MetricExpr": "1e3 * L2_RQSTS.MISS / INST_RETIRED.ANY", 1144 - "MetricGroup": "CacheMisses;Mem;Offcore", 1145 - "MetricName": "tma_info_l2mpki_all" 1146 - }, 1147 - { 1148 - "BriefDescription": "L2 cache true code cacheline misses per kilo instruction", 1149 - "MetricExpr": "1e3 * FRONTEND_RETIRED.L2_MISS / INST_RETIRED.ANY", 1150 - "MetricGroup": "IcMiss", 1151 - "MetricName": "tma_info_l2mpki_code" 1152 - }, 1153 - { 1154 - "BriefDescription": "L2 cache speculative code cacheline misses per kilo instruction", 1155 - "MetricExpr": "1e3 * L2_RQSTS.CODE_RD_MISS / INST_RETIRED.ANY", 1156 - "MetricGroup": "IcMiss", 1157 - "MetricName": "tma_info_l2mpki_code_all" 1158 - }, 1159 - { 1160 - "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all demand loads (including speculative)", 1161 - "MetricExpr": "1e3 * L2_RQSTS.DEMAND_DATA_RD_MISS / INST_RETIRED.ANY", 1162 - "MetricGroup": "CacheMisses;Mem", 1163 - "MetricName": "tma_info_l2mpki_load" 1164 - }, 1165 - { 1166 - "BriefDescription": "Average per-core data access bandwidth to the L3 cache [GB / sec]", 1167 - "MetricExpr": "64 * OFFCORE_REQUESTS.ALL_REQUESTS / 1e9 / duration_time", 1168 - "MetricGroup": "Mem;MemoryBW;Offcore", 1169 - "MetricName": "tma_info_l3_cache_access_bw" 1170 - }, 1171 - { 1172 - "BriefDescription": "Average per-thread data access 
bandwidth to the L3 cache [GB / sec]", 1173 - "MetricExpr": "tma_info_l3_cache_access_bw", 1174 - "MetricGroup": "Mem;MemoryBW;Offcore", 1175 - "MetricName": "tma_info_l3_cache_access_bw_1t" 1176 - }, 1177 - { 1178 - "BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]", 1179 - "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / duration_time", 1180 - "MetricGroup": "Mem;MemoryBW", 1181 - "MetricName": "tma_info_l3_cache_fill_bw" 1182 - }, 1183 - { 1184 - "BriefDescription": "Average per-thread data fill bandwidth to the L3 cache [GB / sec]", 1185 - "MetricExpr": "tma_info_l3_cache_fill_bw", 1186 - "MetricGroup": "Mem;MemoryBW", 1187 - "MetricName": "tma_info_l3_cache_fill_bw_1t" 1188 - }, 1189 - { 1190 - "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads", 1191 - "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L3_MISS / INST_RETIRED.ANY", 1192 - "MetricGroup": "CacheMisses;Mem", 1193 - "MetricName": "tma_info_l3mpki" 1194 - }, 1195 - { 1196 - "BriefDescription": "Average Latency for L2 cache miss demand Loads", 1197 - "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS.DEMAND_DATA_RD", 1198 - "MetricGroup": "Memory_Lat;Offcore", 1199 - "MetricName": "tma_info_load_l2_miss_latency" 1200 - }, 1201 - { 1202 - "BriefDescription": "Average Parallel L2 cache miss demand Loads", 1203 - "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / cpu@OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD\\,cmask\\=1@", 1204 - "MetricGroup": "Memory_BW;Offcore", 1205 - "MetricName": "tma_info_load_l2_mlp" 1206 - }, 1207 - { 1208 - "BriefDescription": "Average Latency for L3 cache miss demand Loads", 1209 - "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.L3_MISS_DEMAND_DATA_RD / OFFCORE_REQUESTS.L3_MISS_DEMAND_DATA_RD", 1210 - "MetricGroup": "Memory_Lat;Offcore", 1211 - "MetricName": "tma_info_load_l3_miss_latency" 1212 - }, 1213 - { 1214 - "BriefDescription": "Actual Average Latency for L1 data-cache miss 
demand load operations (in core cycles)", 1215 - "MetricExpr": "L1D_PEND_MISS.PENDING / MEM_LOAD_COMPLETED.L1_MISS_ANY", 1216 - "MetricGroup": "Mem;MemoryBound;MemoryLat", 1217 - "MetricName": "tma_info_load_miss_real_latency" 1218 - }, 1219 - { 1220 - "BriefDescription": "STLB (2nd level TLB) data load speculative misses per kilo instruction (misses of any page-size that complete the page walk)", 1221 - "MetricExpr": "1e3 * DTLB_LOAD_MISSES.WALK_COMPLETED / INST_RETIRED.ANY", 1222 - "MetricGroup": "Mem;MemoryTLB", 1223 - "MetricName": "tma_info_load_stlb_mpki" 1224 - }, 1225 - { 1226 - "BriefDescription": "Average latency of data read request to external DRAM memory [in nanoseconds]", 1227 - "MetricExpr": "1e9 * (UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_DDR / UNC_CHA_TOR_INSERTS.IA_MISS_DRD_DDR) / uncore_cha_0@event\\=0x1@", 1228 - "MetricGroup": "Mem;MemoryLat;Server;SoC", 1229 - "MetricName": "tma_info_mem_dram_read_latency", 1230 - "PublicDescription": "Average latency of data read request to external DRAM memory [in nanoseconds]. Accounts for demand loads and L1/L2 data-read prefetches" 1231 - }, 1232 - { 1233 - "BriefDescription": "Average number of parallel data read requests to external memory", 1234 - "MetricExpr": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD / UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD@thresh\\=1@", 1235 - "MetricGroup": "Mem;MemoryBW;SoC", 1236 - "MetricName": "tma_info_mem_parallel_reads", 1237 - "PublicDescription": "Average number of parallel data read requests to external memory. 
Accounts for demand loads and L1/L2 prefetches" 1238 - }, 1239 - { 1240 - "BriefDescription": "Average latency of data read request to external 3D X-Point memory [in nanoseconds]", 1241 - "MetricExpr": "(1e9 * (UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_PMM / UNC_CHA_TOR_INSERTS.IA_MISS_DRD_PMM) / uncore_cha_0@event\\=0x1@ if #has_pmem > 0 else 0)", 1242 - "MetricGroup": "Mem;MemoryLat;Server;SoC", 1243 - "MetricName": "tma_info_mem_pmm_read_latency", 1244 - "PublicDescription": "Average latency of data read request to external 3D X-Point memory [in nanoseconds]. Accounts for demand loads and L1/L2 data-read prefetches" 1245 - }, 1246 - { 1247 - "BriefDescription": "Average latency of data read request to external memory (in nanoseconds)", 1248 - "MetricExpr": "1e9 * (UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD / UNC_CHA_TOR_INSERTS.IA_MISS_DRD) / (tma_info_socket_clks / duration_time)", 1249 - "MetricGroup": "Mem;MemoryLat;SoC", 1250 - "MetricName": "tma_info_mem_read_latency", 1251 - "PublicDescription": "Average latency of data read request to external memory (in nanoseconds). Accounts for demand loads and L1/L2 prefetches. 
([RKL+]memory-controller only)" 483 + "MetricName": "tma_info_bottleneck_instruction_fetch_bw", 484 + "MetricThreshold": "tma_info_bottleneck_instruction_fetch_bw > 20" 1252 485 }, 1253 486 { 1254 487 "BriefDescription": "Total pipeline cost of (external) Memory Bandwidth related bottlenecks", 1255 488 "MetricExpr": "100 * tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * (tma_mem_bandwidth / (tma_mem_bandwidth + tma_mem_latency)) + tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * (tma_sq_full / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full))) + tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * (tma_fb_full / (tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk))", 1256 489 "MetricGroup": "Mem;MemoryBW;Offcore;tma_issueBW", 1257 - "MetricName": "tma_info_memory_bandwidth", 1258 - "MetricThreshold": "tma_info_memory_bandwidth > 20", 1259 - "PublicDescription": "Total pipeline cost of (external) Memory Bandwidth related bottlenecks. Related metrics: tma_fb_full, tma_info_dram_bw_use, tma_mem_bandwidth, tma_sq_full" 490 + "MetricName": "tma_info_bottleneck_memory_bandwidth", 491 + "MetricThreshold": "tma_info_bottleneck_memory_bandwidth > 20", 492 + "PublicDescription": "Total pipeline cost of (external) Memory Bandwidth related bottlenecks. 
Related metrics: tma_fb_full, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full" 1260 493 }, 1261 494 { 1262 495 "BriefDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs)", 1263 496 "MetricConstraint": "NO_GROUP_EVENTS", 1264 497 "MetricExpr": "100 * tma_memory_bound * (tma_l1_bound / max(tma_memory_bound, tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * (tma_dtlb_load / max(tma_l1_bound, tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * (tma_dtlb_store / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores)))", 1265 498 "MetricGroup": "Mem;MemoryTLB;Offcore;tma_issueTLB", 1266 - "MetricName": "tma_info_memory_data_tlbs", 1267 - "MetricThreshold": "tma_info_memory_data_tlbs > 20", 499 + "MetricName": "tma_info_bottleneck_memory_data_tlbs", 500 + "MetricThreshold": "tma_info_bottleneck_memory_data_tlbs > 20", 1268 501 "PublicDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs). 
Related metrics: tma_dtlb_load, tma_dtlb_store" 1269 502 }, 1270 503 { ··· 762 1015 "MetricConstraint": "NO_GROUP_EVENTS", 763 1016 "MetricExpr": "100 * tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * (tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency)) + tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * (tma_l3_hit_latency / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_l2_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound))", 764 1017 "MetricGroup": "Mem;MemoryLat;Offcore;tma_issueLat", 765 - "MetricName": "tma_info_memory_latency", 766 - "MetricThreshold": "tma_info_memory_latency > 20", 1018 + "MetricName": "tma_info_bottleneck_memory_latency", 1019 + "MetricThreshold": "tma_info_bottleneck_memory_latency > 20", 767 1020 "PublicDescription": "Total pipeline cost of Memory Latency related bottlenecks (external memory and off-core caches). Related metrics: tma_l3_hit_latency, tma_mem_latency" 768 1021 }, 769 1022 { ··· 771 1024 "MetricConstraint": "NO_GROUP_EVENTS", 772 1025 "MetricExpr": "100 * (tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches))", 773 1026 "MetricGroup": "Bad;BadSpec;BrMispredicts;tma_issueBM", 774 - "MetricName": "tma_info_mispredictions", 775 - "MetricThreshold": "tma_info_mispredictions > 20", 776 - "PublicDescription": "Total pipeline cost of Branch Misprediction related bottlenecks. 
Related metrics: tma_branch_mispredicts, tma_info_branch_misprediction_cost, tma_mispredicts_resteers" 1027 + "MetricName": "tma_info_bottleneck_mispredictions", 1028 + "MetricThreshold": "tma_info_bottleneck_mispredictions > 20", 1029 + "PublicDescription": "Total pipeline cost of Branch Misprediction related bottlenecks. Related metrics: tma_branch_mispredicts, tma_info_bad_spec_branch_misprediction_cost, tma_mispredicts_resteers" 1030 + }, 1031 + { 1032 + "BriefDescription": "Fraction of branches that are CALL or RET", 1033 + "MetricExpr": "(BR_INST_RETIRED.NEAR_CALL + BR_INST_RETIRED.NEAR_RETURN) / BR_INST_RETIRED.ALL_BRANCHES", 1034 + "MetricGroup": "Bad;Branches", 1035 + "MetricName": "tma_info_branches_callret" 1036 + }, 1037 + { 1038 + "BriefDescription": "Fraction of branches that are non-taken conditionals", 1039 + "MetricExpr": "BR_INST_RETIRED.COND_NTAKEN / BR_INST_RETIRED.ALL_BRANCHES", 1040 + "MetricGroup": "Bad;Branches;CodeGen;PGO", 1041 + "MetricName": "tma_info_branches_cond_nt" 1042 + }, 1043 + { 1044 + "BriefDescription": "Fraction of branches that are taken conditionals", 1045 + "MetricExpr": "BR_INST_RETIRED.COND_TAKEN / BR_INST_RETIRED.ALL_BRANCHES", 1046 + "MetricGroup": "Bad;Branches;CodeGen;PGO", 1047 + "MetricName": "tma_info_branches_cond_tk" 1048 + }, 1049 + { 1050 + "BriefDescription": "Fraction of branches that are unconditional (direct or indirect) jumps", 1051 + "MetricExpr": "(BR_INST_RETIRED.NEAR_TAKEN - BR_INST_RETIRED.COND_TAKEN - 2 * BR_INST_RETIRED.NEAR_CALL) / BR_INST_RETIRED.ALL_BRANCHES", 1052 + "MetricGroup": "Bad;Branches", 1053 + "MetricName": "tma_info_branches_jump" 1054 + }, 1055 + { 1056 + "BriefDescription": "Fraction of branches of other types (not individually covered by other metrics in Info.Branches group)", 1057 + "MetricExpr": "1 - (tma_info_branches_cond_nt + tma_info_branches_cond_tk + tma_info_branches_callret + tma_info_branches_jump)", 1058 + "MetricGroup": "Bad;Branches", 1059 + "MetricName": 
"tma_info_branches_other_branches" 1060 + }, 1061 + { 1062 + "BriefDescription": "Core actual clocks when any Logical Processor is active on the Physical Core", 1063 + "MetricExpr": "CPU_CLK_UNHALTED.DISTRIBUTED", 1064 + "MetricGroup": "SMT", 1065 + "MetricName": "tma_info_core_core_clks" 1066 + }, 1067 + { 1068 + "BriefDescription": "Instructions Per Cycle across hyper-threads (per physical core)", 1069 + "MetricExpr": "INST_RETIRED.ANY / tma_info_core_core_clks", 1070 + "MetricGroup": "Ret;SMT;TmaL1;tma_L1_group", 1071 + "MetricName": "tma_info_core_coreipc" 1072 + }, 1073 + { 1074 + "BriefDescription": "Floating Point Operations Per Cycle", 1075 + "MetricConstraint": "NO_GROUP_EVENTS", 1076 + "MetricExpr": "cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ + FP_ARITH_INST_RETIRED2.SCALAR_HALF + 2 * (FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED2.COMPLEX_SCALAR_HALF) + 4 * cpu@FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE\\,umask\\=0x18@ + 8 * (FP_ARITH_INST_RETIRED2.128B_PACKED_HALF + cpu@FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE\\,umask\\=0x60@) + 16 * (FP_ARITH_INST_RETIRED2.256B_PACKED_HALF + FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) + 32 * FP_ARITH_INST_RETIRED2.512B_PACKED_HALF + 4 * AMX_OPS_RETIRED.BF16", 1077 + "MetricGroup": "Flops;Ret", 1078 + "MetricName": "tma_info_core_flopc" 1079 + }, 1080 + { 1081 + "BriefDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width)", 1082 + "MetricConstraint": "NO_GROUP_EVENTS", 1083 + "MetricExpr": "(FP_ARITH_DISPATCHED.PORT_0 + FP_ARITH_DISPATCHED.PORT_1 + FP_ARITH_DISPATCHED.PORT_5) / (2 * tma_info_core_core_clks)", 1084 + "MetricGroup": "Cor;Flops;HPC", 1085 + "MetricName": "tma_info_core_fp_arith_utilization", 1086 + "PublicDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width). 
Values > 1 are possible due to ([BDW+] Fused-Multiply Add (FMA) counting - common; [ADL+] use all of ADD/MUL/FMA in Scalar or 128/256-bit vectors - less common)." 1087 + }, 1088 + { 1089 + "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-core", 1090 + "MetricExpr": "UOPS_EXECUTED.THREAD / (UOPS_EXECUTED.CORE_CYCLES_GE_1 / 2 if #SMT_on else UOPS_EXECUTED.CORE_CYCLES_GE_1)", 1091 + "MetricGroup": "Backend;Cor;Pipeline;PortsUtil", 1092 + "MetricName": "tma_info_core_ilp" 1093 + }, 1094 + { 1095 + "BriefDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache)", 1096 + "MetricExpr": "IDQ.DSB_UOPS / UOPS_ISSUED.ANY", 1097 + "MetricGroup": "DSB;Fed;FetchBW;tma_issueFB", 1098 + "MetricName": "tma_info_frontend_dsb_coverage", 1099 + "MetricThreshold": "tma_info_frontend_dsb_coverage < 0.7 & tma_info_thread_ipc / 6 > 0.35", 1100 + "PublicDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache). 
Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_botlnk_l2_dsb_misses, tma_info_inst_mix_iptb, tma_lcp" 1101 + }, 1102 + { 1103 + "BriefDescription": "Average number of cycles of a switch from the DSB fetch-unit to MITE fetch unit - see DSB_Switches tree node for details.", 1104 + "MetricExpr": "DSB2MITE_SWITCHES.PENALTY_CYCLES / cpu@DSB2MITE_SWITCHES.PENALTY_CYCLES\\,cmask\\=1\\,edge@", 1105 + "MetricGroup": "DSBmiss", 1106 + "MetricName": "tma_info_frontend_dsb_switch_cost" 1107 + }, 1108 + { 1109 + "BriefDescription": "Average number of Uops issued by front-end when it issued something", 1110 + "MetricExpr": "UOPS_ISSUED.ANY / cpu@UOPS_ISSUED.ANY\\,cmask\\=1@", 1111 + "MetricGroup": "Fed;FetchBW", 1112 + "MetricName": "tma_info_frontend_fetch_upc" 1113 + }, 1114 + { 1115 + "BriefDescription": "Average Latency for L1 instruction cache misses", 1116 + "MetricExpr": "ICACHE_DATA.STALLS / cpu@ICACHE_DATA.STALLS\\,cmask\\=1\\,edge@", 1117 + "MetricGroup": "Fed;FetchLat;IcMiss", 1118 + "MetricName": "tma_info_frontend_icache_miss_latency" 1119 + }, 1120 + { 1121 + "BriefDescription": "Instructions per non-speculative DSB miss (lower number means higher occurrence rate)", 1122 + "MetricExpr": "INST_RETIRED.ANY / FRONTEND_RETIRED.ANY_DSB_MISS", 1123 + "MetricGroup": "DSBmiss;Fed", 1124 + "MetricName": "tma_info_frontend_ipdsb_miss_ret", 1125 + "MetricThreshold": "tma_info_frontend_ipdsb_miss_ret < 50" 1126 + }, 1127 + { 1128 + "BriefDescription": "Instructions per speculative Unknown Branch Misprediction (BAClear) (lower number means higher occurrence rate)", 1129 + "MetricExpr": "tma_info_inst_mix_instructions / BACLEARS.ANY", 1130 + "MetricGroup": "Fed", 1131 + "MetricName": "tma_info_frontend_ipunknown_branch" 1132 + }, 1133 + { 1134 + "BriefDescription": "L2 cache true code cacheline misses per kilo instruction", 1135 + "MetricExpr": "1e3 * FRONTEND_RETIRED.L2_MISS / INST_RETIRED.ANY", 1136 + "MetricGroup": "IcMiss", 1137 + "MetricName": 
"tma_info_frontend_l2mpki_code" 1138 + }, 1139 + { 1140 + "BriefDescription": "L2 cache speculative code cacheline misses per kilo instruction", 1141 + "MetricExpr": "1e3 * L2_RQSTS.CODE_RD_MISS / INST_RETIRED.ANY", 1142 + "MetricGroup": "IcMiss", 1143 + "MetricName": "tma_info_frontend_l2mpki_code_all" 1144 + }, 1145 + { 1146 + "BriefDescription": "Branch instructions per taken branch.", 1147 + "MetricExpr": "BR_INST_RETIRED.ALL_BRANCHES / BR_INST_RETIRED.NEAR_TAKEN", 1148 + "MetricGroup": "Branches;Fed;PGO", 1149 + "MetricName": "tma_info_inst_mix_bptkbranch" 1150 + }, 1151 + { 1152 + "BriefDescription": "Total number of retired Instructions", 1153 + "MetricExpr": "INST_RETIRED.ANY", 1154 + "MetricGroup": "Summary;TmaL1;tma_L1_group", 1155 + "MetricName": "tma_info_inst_mix_instructions", 1156 + "PublicDescription": "Total number of retired Instructions. Sample with: INST_RETIRED.PREC_DIST" 1157 + }, 1158 + { 1159 + "BriefDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate)", 1160 + "MetricExpr": "INST_RETIRED.ANY / (cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ + FP_ARITH_INST_RETIRED2.SCALAR + (cpu@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE\\,umask\\=0x3c@ + FP_ARITH_INST_RETIRED2.VECTOR))", 1161 + "MetricGroup": "Flops;InsType", 1162 + "MetricName": "tma_info_inst_mix_iparith", 1163 + "MetricThreshold": "tma_info_inst_mix_iparith < 10", 1164 + "PublicDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. Approximated prior to BDW." 
1165 + }, 1166 + { 1167 + "BriefDescription": "Instructions per FP Arithmetic AMX operation (lower number means higher occurrence rate)", 1168 + "MetricExpr": "INST_RETIRED.ANY / AMX_OPS_RETIRED.BF16", 1169 + "MetricGroup": "Flops;FpVector;InsType;Server", 1170 + "MetricName": "tma_info_inst_mix_iparith_amx_f16", 1171 + "MetricThreshold": "tma_info_inst_mix_iparith_amx_f16 < 10", 1172 + "PublicDescription": "Instructions per FP Arithmetic AMX operation (lower number means higher occurrence rate). Operations factored per matrices' sizes of the AMX instructions." 1173 + }, 1174 + { 1175 + "BriefDescription": "Instructions per Integer Arithmetic AMX operation (lower number means higher occurrence rate)", 1176 + "MetricExpr": "INST_RETIRED.ANY / AMX_OPS_RETIRED.INT8", 1177 + "MetricGroup": "InsType;IntVector;Server", 1178 + "MetricName": "tma_info_inst_mix_iparith_amx_int8", 1179 + "MetricThreshold": "tma_info_inst_mix_iparith_amx_int8 < 10", 1180 + "PublicDescription": "Instructions per Integer Arithmetic AMX operation (lower number means higher occurrence rate). Operations factored per matrices' sizes of the AMX instructions." 1181 + }, 1182 + { 1183 + "BriefDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate)", 1184 + "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED2.128B_PACKED_HALF)", 1185 + "MetricGroup": "Flops;FpVector;InsType", 1186 + "MetricName": "tma_info_inst_mix_iparith_avx128", 1187 + "MetricThreshold": "tma_info_inst_mix_iparith_avx128 < 10", 1188 + "PublicDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." 
1189 + }, 1190 + { 1191 + "BriefDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate)", 1192 + "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED2.256B_PACKED_HALF)", 1193 + "MetricGroup": "Flops;FpVector;InsType", 1194 + "MetricName": "tma_info_inst_mix_iparith_avx256", 1195 + "MetricThreshold": "tma_info_inst_mix_iparith_avx256 < 10", 1196 + "PublicDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." 1197 + }, 1198 + { 1199 + "BriefDescription": "Instructions per FP Arithmetic AVX 512-bit instruction (lower number means higher occurrence rate)", 1200 + "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE + FP_ARITH_INST_RETIRED2.512B_PACKED_HALF)", 1201 + "MetricGroup": "Flops;FpVector;InsType", 1202 + "MetricName": "tma_info_inst_mix_iparith_avx512", 1203 + "MetricThreshold": "tma_info_inst_mix_iparith_avx512 < 10", 1204 + "PublicDescription": "Instructions per FP Arithmetic AVX 512-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." 1205 + }, 1206 + { 1207 + "BriefDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate)", 1208 + "MetricExpr": "INST_RETIRED.ANY / FP_ARITH_INST_RETIRED.SCALAR_DOUBLE", 1209 + "MetricGroup": "Flops;FpScalar;InsType", 1210 + "MetricName": "tma_info_inst_mix_iparith_scalar_dp", 1211 + "MetricThreshold": "tma_info_inst_mix_iparith_scalar_dp < 10", 1212 + "PublicDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." 
1213 + }, 1214 + { 1215 + "BriefDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate)", 1216 + "MetricExpr": "INST_RETIRED.ANY / FP_ARITH_INST_RETIRED.SCALAR_SINGLE", 1217 + "MetricGroup": "Flops;FpScalar;InsType", 1218 + "MetricName": "tma_info_inst_mix_iparith_scalar_sp", 1219 + "MetricThreshold": "tma_info_inst_mix_iparith_scalar_sp < 10", 1220 + "PublicDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." 1221 + }, 1222 + { 1223 + "BriefDescription": "Instructions per Branch (lower number means higher occurrence rate)", 1224 + "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.ALL_BRANCHES", 1225 + "MetricGroup": "Branches;Fed;InsType", 1226 + "MetricName": "tma_info_inst_mix_ipbranch", 1227 + "MetricThreshold": "tma_info_inst_mix_ipbranch < 8" 1228 + }, 1229 + { 1230 + "BriefDescription": "Instructions per (near) call (lower number means higher occurrence rate)", 1231 + "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.NEAR_CALL", 1232 + "MetricGroup": "Branches;Fed;PGO", 1233 + "MetricName": "tma_info_inst_mix_ipcall", 1234 + "MetricThreshold": "tma_info_inst_mix_ipcall < 200" 1235 + }, 1236 + { 1237 + "BriefDescription": "Instructions per Floating Point (FP) Operation (lower number means higher occurrence rate)", 1238 + "MetricExpr": "INST_RETIRED.ANY / tma_info_core_flopc", 1239 + "MetricGroup": "Flops;InsType", 1240 + "MetricName": "tma_info_inst_mix_ipflop", 1241 + "MetricThreshold": "tma_info_inst_mix_ipflop < 10" 1242 + }, 1243 + { 1244 + "BriefDescription": "Instructions per Load (lower number means higher occurrence rate)", 1245 + "MetricExpr": "INST_RETIRED.ANY / MEM_INST_RETIRED.ALL_LOADS", 1246 + "MetricGroup": "InsType", 1247 + "MetricName": "tma_info_inst_mix_ipload", 1248 + "MetricThreshold": "tma_info_inst_mix_ipload < 3" 1249 + }, 1250 + { 1251 + 
"BriefDescription": "Instructions per Store (lower number means higher occurrence rate)", 1252 + "MetricExpr": "INST_RETIRED.ANY / MEM_INST_RETIRED.ALL_STORES", 1253 + "MetricGroup": "InsType", 1254 + "MetricName": "tma_info_inst_mix_ipstore", 1255 + "MetricThreshold": "tma_info_inst_mix_ipstore < 8" 1256 + }, 1257 + { 1258 + "BriefDescription": "Instructions per Software prefetch instruction (of any type: NTA/T0/T1/T2/Prefetch) (lower number means higher occurrence rate)", 1259 + "MetricExpr": "INST_RETIRED.ANY / cpu@SW_PREFETCH_ACCESS.T0\\,umask\\=0xF@", 1260 + "MetricGroup": "Prefetches", 1261 + "MetricName": "tma_info_inst_mix_ipswpf", 1262 + "MetricThreshold": "tma_info_inst_mix_ipswpf < 100" 1263 + }, 1264 + { 1265 + "BriefDescription": "Instruction per taken branch", 1266 + "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.NEAR_TAKEN", 1267 + "MetricGroup": "Branches;Fed;FetchBW;Frontend;PGO;tma_issueFB", 1268 + "MetricName": "tma_info_inst_mix_iptb", 1269 + "MetricThreshold": "tma_info_inst_mix_iptb < 13", 1270 + "PublicDescription": "Instruction per taken branch. 
Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_botlnk_l2_dsb_misses, tma_info_frontend_dsb_coverage, tma_lcp" 1271 + }, 1272 + { 1273 + "BriefDescription": "Average per-core data fill bandwidth to the L1 data cache [GB / sec]", 1274 + "MetricExpr": "64 * L1D.REPLACEMENT / 1e9 / duration_time", 1275 + "MetricGroup": "Mem;MemoryBW", 1276 + "MetricName": "tma_info_memory_core_l1d_cache_fill_bw" 1277 + }, 1278 + { 1279 + "BriefDescription": "Average per-core data fill bandwidth to the L2 cache [GB / sec]", 1280 + "MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / duration_time", 1281 + "MetricGroup": "Mem;MemoryBW", 1282 + "MetricName": "tma_info_memory_core_l2_cache_fill_bw" 1283 + }, 1284 + { 1285 + "BriefDescription": "Rate of non silent evictions from the L2 cache per Kilo instruction", 1286 + "MetricExpr": "1e3 * L2_LINES_OUT.NON_SILENT / tma_info_inst_mix_instructions", 1287 + "MetricGroup": "L2Evicts;Mem;Server", 1288 + "MetricName": "tma_info_memory_core_l2_evictions_nonsilent_pki" 1289 + }, 1290 + { 1291 + "BriefDescription": "Rate of silent evictions from the L2 cache per Kilo instruction where the evicted lines are dropped (no writeback to L3 or memory)", 1292 + "MetricExpr": "1e3 * L2_LINES_OUT.SILENT / tma_info_inst_mix_instructions", 1293 + "MetricGroup": "L2Evicts;Mem;Server", 1294 + "MetricName": "tma_info_memory_core_l2_evictions_silent_pki" 1295 + }, 1296 + { 1297 + "BriefDescription": "Average per-core data access bandwidth to the L3 cache [GB / sec]", 1298 + "MetricExpr": "64 * OFFCORE_REQUESTS.ALL_REQUESTS / 1e9 / duration_time", 1299 + "MetricGroup": "Mem;MemoryBW;Offcore", 1300 + "MetricName": "tma_info_memory_core_l3_cache_access_bw" 1301 + }, 1302 + { 1303 + "BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]", 1304 + "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / duration_time", 1305 + "MetricGroup": "Mem;MemoryBW", 1306 + "MetricName": "tma_info_memory_core_l3_cache_fill_bw" 1307 + }, 1308 + { 
1309 + "BriefDescription": "Fill Buffer (FB) hits per kilo instructions for retired demand loads (L1D misses that merge into ongoing miss-handling entries)", 1310 + "MetricExpr": "1e3 * MEM_LOAD_RETIRED.FB_HIT / INST_RETIRED.ANY", 1311 + "MetricGroup": "CacheMisses;Mem", 1312 + "MetricName": "tma_info_memory_fb_hpki" 1313 + }, 1314 + { 1315 + "BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads", 1316 + "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L1_MISS / INST_RETIRED.ANY", 1317 + "MetricGroup": "CacheMisses;Mem", 1318 + "MetricName": "tma_info_memory_l1mpki" 1319 + }, 1320 + { 1321 + "BriefDescription": "L1 cache true misses per kilo instruction for all demand loads (including speculative)", 1322 + "MetricExpr": "1e3 * L2_RQSTS.ALL_DEMAND_DATA_RD / INST_RETIRED.ANY", 1323 + "MetricGroup": "CacheMisses;Mem", 1324 + "MetricName": "tma_info_memory_l1mpki_load" 1325 + }, 1326 + { 1327 + "BriefDescription": "L2 cache hits per kilo instruction for all request types (including speculative)", 1328 + "MetricExpr": "1e3 * (L2_RQSTS.REFERENCES - L2_RQSTS.MISS) / INST_RETIRED.ANY", 1329 + "MetricGroup": "CacheMisses;Mem", 1330 + "MetricName": "tma_info_memory_l2hpki_all" 1331 + }, 1332 + { 1333 + "BriefDescription": "L2 cache hits per kilo instruction for all demand loads (including speculative)", 1334 + "MetricExpr": "1e3 * L2_RQSTS.DEMAND_DATA_RD_HIT / INST_RETIRED.ANY", 1335 + "MetricGroup": "CacheMisses;Mem", 1336 + "MetricName": "tma_info_memory_l2hpki_load" 1337 + }, 1338 + { 1339 + "BriefDescription": "L2 cache true misses per kilo instruction for retired demand loads", 1340 + "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L2_MISS / INST_RETIRED.ANY", 1341 + "MetricGroup": "Backend;CacheMisses;Mem", 1342 + "MetricName": "tma_info_memory_l2mpki" 1343 + }, 1344 + { 1345 + "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all request types (including speculative)", 1346 + "MetricExpr": "1e3 * L2_RQSTS.MISS / 
INST_RETIRED.ANY", 1347 + "MetricGroup": "CacheMisses;Mem;Offcore", 1348 + "MetricName": "tma_info_memory_l2mpki_all" 1349 + }, 1350 + { 1351 + "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all demand loads (including speculative)", 1352 + "MetricExpr": "1e3 * L2_RQSTS.DEMAND_DATA_RD_MISS / INST_RETIRED.ANY", 1353 + "MetricGroup": "CacheMisses;Mem", 1354 + "MetricName": "tma_info_memory_l2mpki_load" 1355 + }, 1356 + { 1357 + "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads", 1358 + "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L3_MISS / INST_RETIRED.ANY", 1359 + "MetricGroup": "CacheMisses;Mem", 1360 + "MetricName": "tma_info_memory_l3mpki" 1361 + }, 1362 + { 1363 + "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)", 1364 + "MetricExpr": "L1D_PEND_MISS.PENDING / MEM_LOAD_COMPLETED.L1_MISS_ANY", 1365 + "MetricGroup": "Mem;MemoryBound;MemoryLat", 1366 + "MetricName": "tma_info_memory_load_miss_real_latency" 777 1367 }, 778 1368 { 779 1369 "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss)", 780 1370 "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES", 781 1371 "MetricGroup": "Mem;MemoryBW;MemoryBound", 782 - "MetricName": "tma_info_mlp", 1372 + "MetricName": "tma_info_memory_mlp", 783 1373 "PublicDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. 
Per-Logical Processor)" 784 1374 }, 785 1375 { 786 - "BriefDescription": "Fraction of branches of other types (not individually covered by other metrics in Info.Branches group)", 787 - "MetricExpr": "1 - (tma_info_cond_nt + tma_info_cond_tk + tma_info_callret + tma_info_jump)", 788 - "MetricGroup": "Bad;Branches", 789 - "MetricName": "tma_info_other_branches" 1376 + "BriefDescription": "Average Parallel L2 cache miss data reads", 1377 + "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD", 1378 + "MetricGroup": "Memory_BW;Offcore", 1379 + "MetricName": "tma_info_memory_oro_data_l2_mlp" 1380 + }, 1381 + { 1382 + "BriefDescription": "Average Latency for L2 cache miss demand Loads", 1383 + "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS.DEMAND_DATA_RD", 1384 + "MetricGroup": "Memory_Lat;Offcore", 1385 + "MetricName": "tma_info_memory_oro_load_l2_miss_latency" 1386 + }, 1387 + { 1388 + "BriefDescription": "Average Parallel L2 cache miss demand Loads", 1389 + "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / cpu@OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD\\,cmask\\=1@", 1390 + "MetricGroup": "Memory_BW;Offcore", 1391 + "MetricName": "tma_info_memory_oro_load_l2_mlp" 1392 + }, 1393 + { 1394 + "BriefDescription": "Average Latency for L3 cache miss demand Loads", 1395 + "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.L3_MISS_DEMAND_DATA_RD / OFFCORE_REQUESTS.L3_MISS_DEMAND_DATA_RD", 1396 + "MetricGroup": "Memory_Lat;Offcore", 1397 + "MetricName": "tma_info_memory_oro_load_l3_miss_latency" 1398 + }, 1399 + { 1400 + "BriefDescription": "Average per-thread data fill bandwidth to the L1 data cache [GB / sec]", 1401 + "MetricExpr": "tma_info_memory_core_l1d_cache_fill_bw", 1402 + "MetricGroup": "Mem;MemoryBW", 1403 + "MetricName": "tma_info_memory_thread_l1d_cache_fill_bw_1t" 1404 + }, 1405 + { 1406 + "BriefDescription": "Average per-thread data fill bandwidth to the L2 cache [GB / 
sec]", 1407 + "MetricExpr": "tma_info_memory_core_l2_cache_fill_bw", 1408 + "MetricGroup": "Mem;MemoryBW", 1409 + "MetricName": "tma_info_memory_thread_l2_cache_fill_bw_1t" 1410 + }, 1411 + { 1412 + "BriefDescription": "Average per-thread data access bandwidth to the L3 cache [GB / sec]", 1413 + "MetricExpr": "tma_info_memory_core_l3_cache_access_bw", 1414 + "MetricGroup": "Mem;MemoryBW;Offcore", 1415 + "MetricName": "tma_info_memory_thread_l3_cache_access_bw_1t" 1416 + }, 1417 + { 1418 + "BriefDescription": "Average per-thread data fill bandwidth to the L3 cache [GB / sec]", 1419 + "MetricExpr": "tma_info_memory_core_l3_cache_fill_bw", 1420 + "MetricGroup": "Mem;MemoryBW", 1421 + "MetricName": "tma_info_memory_thread_l3_cache_fill_bw_1t" 1422 + }, 1423 + { 1424 + "BriefDescription": "STLB (2nd level TLB) code speculative misses per kilo instruction (misses of any page-size that complete the page walk)", 1425 + "MetricExpr": "1e3 * ITLB_MISSES.WALK_COMPLETED / INST_RETIRED.ANY", 1426 + "MetricGroup": "Fed;MemoryTLB", 1427 + "MetricName": "tma_info_memory_tlb_code_stlb_mpki" 1428 + }, 1429 + { 1430 + "BriefDescription": "STLB (2nd level TLB) data load speculative misses per kilo instruction (misses of any page-size that complete the page walk)", 1431 + "MetricExpr": "1e3 * DTLB_LOAD_MISSES.WALK_COMPLETED / INST_RETIRED.ANY", 1432 + "MetricGroup": "Mem;MemoryTLB", 1433 + "MetricName": "tma_info_memory_tlb_load_stlb_mpki" 790 1434 }, 791 1435 { 792 1436 "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses", 793 - "MetricExpr": "(ITLB_MISSES.WALK_PENDING + DTLB_LOAD_MISSES.WALK_PENDING + DTLB_STORE_MISSES.WALK_PENDING) / (4 * tma_info_core_clks)", 1437 + "MetricExpr": "(ITLB_MISSES.WALK_PENDING + DTLB_LOAD_MISSES.WALK_PENDING + DTLB_STORE_MISSES.WALK_PENDING) / (4 * tma_info_core_core_clks)", 794 1438 "MetricGroup": "Mem;MemoryTLB", 795 - "MetricName": "tma_info_page_walks_utilization", 796 - 
"MetricThreshold": "tma_info_page_walks_utilization > 0.5" 797 - }, 798 - { 799 - "BriefDescription": "Average 3DXP Memory Bandwidth Use for reads [GB / sec]", 800 - "MetricExpr": "(64 * UNC_M_PMM_RPQ_INSERTS / 1e9 / duration_time if #has_pmem > 0 else 0)", 801 - "MetricGroup": "Mem;MemoryBW;Server;SoC", 802 - "MetricName": "tma_info_pmm_read_bw" 803 - }, 804 - { 805 - "BriefDescription": "Average 3DXP Memory Bandwidth Use for Writes [GB / sec]", 806 - "MetricExpr": "(64 * UNC_M_PMM_WPQ_INSERTS / 1e9 / duration_time if #has_pmem > 0 else 0)", 807 - "MetricGroup": "Mem;MemoryBW;Server;SoC", 808 - "MetricName": "tma_info_pmm_write_bw" 809 - }, 810 - { 811 - "BriefDescription": "Average number of Uops retired in cycles where at least one uop has retired.", 812 - "MetricConstraint": "NO_GROUP_EVENTS", 813 - "MetricExpr": "tma_retiring * tma_info_slots / cpu@UOPS_RETIRED.SLOTS\\,cmask\\=1@", 814 - "MetricGroup": "Pipeline;Ret", 815 - "MetricName": "tma_info_retire" 816 - }, 817 - { 818 - "BriefDescription": "Total issue-pipeline slots (per-Physical Core till ICL; per-Logical Processor ICL onward)", 819 - "MetricExpr": "TOPDOWN.SLOTS", 820 - "MetricGroup": "TmaL1;tma_L1_group", 821 - "MetricName": "tma_info_slots" 822 - }, 823 - { 824 - "BriefDescription": "Fraction of Physical Core issue-slots utilized by this Logical Processor", 825 - "MetricExpr": "(tma_info_slots / (TOPDOWN.SLOTS / 2) if #SMT_on else 1)", 826 - "MetricGroup": "SMT;TmaL1;tma_L1_group", 827 - "MetricName": "tma_info_slots_utilization" 828 - }, 829 - { 830 - "BriefDescription": "Fraction of cycles where both hardware Logical Processors were active", 831 - "MetricExpr": "(1 - CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_DISTRIBUTED if #SMT_on else 0)", 832 - "MetricGroup": "SMT", 833 - "MetricName": "tma_info_smt_2t_utilization" 834 - }, 835 - { 836 - "BriefDescription": "Socket actual clocks when any core is active on that socket", 837 - "MetricExpr": "uncore_cha_0@event\\=0x1@", 838 - 
"MetricGroup": "SoC", 839 - "MetricName": "tma_info_socket_clks" 1439 + "MetricName": "tma_info_memory_tlb_page_walks_utilization", 1440 + "MetricThreshold": "tma_info_memory_tlb_page_walks_utilization > 0.5" 840 1441 }, 841 1442 { 842 1443 "BriefDescription": "STLB (2nd level TLB) data store speculative misses per kilo instruction (misses of any page-size that complete the page walk)", 843 1444 "MetricExpr": "1e3 * DTLB_STORE_MISSES.WALK_COMPLETED / INST_RETIRED.ANY", 844 1445 "MetricGroup": "Mem;MemoryTLB", 845 - "MetricName": "tma_info_store_stlb_mpki" 1446 + "MetricName": "tma_info_memory_tlb_store_stlb_mpki" 1447 + }, 1448 + { 1449 + "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-thread", 1450 + "MetricExpr": "UOPS_EXECUTED.THREAD / cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@", 1451 + "MetricGroup": "Cor;Pipeline;PortsUtil;SMT", 1452 + "MetricName": "tma_info_pipeline_execute" 1453 + }, 1454 + { 1455 + "BriefDescription": "Instructions per a microcode Assist invocation", 1456 + "MetricExpr": "INST_RETIRED.ANY / cpu@ASSISTS.ANY\\,umask\\=0x1B@", 1457 + "MetricGroup": "Pipeline;Ret;Retire", 1458 + "MetricName": "tma_info_pipeline_ipassist", 1459 + "MetricThreshold": "tma_info_pipeline_ipassist < 100e3", 1460 + "PublicDescription": "Instructions per a microcode Assist invocation. 
See Assists tree node for details (lower number means higher occurrence rate)" 1461 + }, 1462 + { 1463 + "BriefDescription": "Average number of Uops retired in cycles where at least one uop has retired.", 1464 + "MetricConstraint": "NO_GROUP_EVENTS", 1465 + "MetricExpr": "tma_retiring * tma_info_thread_slots / cpu@UOPS_RETIRED.SLOTS\\,cmask\\=1@", 1466 + "MetricGroup": "Pipeline;Ret", 1467 + "MetricName": "tma_info_pipeline_retire" 846 1468 }, 847 1469 { 848 1470 "BriefDescription": "Estimated fraction of retirement-cycles dealing with repeat instructions", 849 1471 "MetricExpr": "INST_RETIRED.REP_ITERATION / cpu@UOPS_RETIRED.SLOTS\\,cmask\\=1@", 850 1472 "MetricGroup": "Pipeline;Ret", 851 - "MetricName": "tma_info_strings_cycles", 852 - "MetricThreshold": "tma_info_strings_cycles > 0.1" 1473 + "MetricName": "tma_info_pipeline_strings_cycles", 1474 + "MetricThreshold": "tma_info_pipeline_strings_cycles > 0.1" 1475 + }, 1476 + { 1477 + "BriefDescription": "Measured Average Frequency for unhalted processors [GHz]", 1478 + "MetricExpr": "tma_info_system_turbo_utilization * TSC / 1e9 / duration_time", 1479 + "MetricGroup": "Power;Summary", 1480 + "MetricName": "tma_info_system_average_frequency" 1481 + }, 1482 + { 1483 + "BriefDescription": "Average CPU Utilization", 1484 + "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / TSC", 1485 + "MetricGroup": "HPC;Summary", 1486 + "MetricName": "tma_info_system_cpu_utilization" 1487 + }, 1488 + { 1489 + "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]", 1490 + "MetricExpr": "64 * (UNC_M_CAS_COUNT.RD + UNC_M_CAS_COUNT.WR) / 1e9 / duration_time", 1491 + "MetricGroup": "HPC;Mem;MemoryBW;SoC;tma_issueBW", 1492 + "MetricName": "tma_info_system_dram_bw_use", 1493 + "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. 
Related metrics: tma_fb_full, tma_info_bottleneck_memory_bandwidth, tma_mem_bandwidth, tma_sq_full" 1494 + }, 1495 + { 1496 + "BriefDescription": "Giga Floating Point Operations Per Second", 1497 + "MetricExpr": "tma_info_core_flopc / duration_time", 1498 + "MetricGroup": "Cor;Flops;HPC", 1499 + "MetricName": "tma_info_system_gflops", 1500 + "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width and AMX engine." 1501 + }, 1502 + { 1503 + "BriefDescription": "Average IO (network or disk) Bandwidth Use for Writes [GB / sec]", 1504 + "MetricExpr": "UNC_CHA_TOR_INSERTS.IO_PCIRDCUR * 64 / 1e9 / duration_time", 1505 + "MetricGroup": "IoBW;Mem;Server;SoC", 1506 + "MetricName": "tma_info_system_io_write_bw" 1507 + }, 1508 + { 1509 + "BriefDescription": "Instructions per Far Branch ( Far Branches apply upon transition from application to operating system, handling interrupts, exceptions) [lower number means higher occurrence rate]", 1510 + "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.FAR_BRANCH:u", 1511 + "MetricGroup": "Branches;OS", 1512 + "MetricName": "tma_info_system_ipfarbranch", 1513 + "MetricThreshold": "tma_info_system_ipfarbranch < 1e6" 1514 + }, 1515 + { 1516 + "BriefDescription": "Cycles Per Instruction for the Operating System (OS) Kernel mode", 1517 + "MetricExpr": "CPU_CLK_UNHALTED.THREAD_P:k / INST_RETIRED.ANY_P:k", 1518 + "MetricGroup": "OS", 1519 + "MetricName": "tma_info_system_kernel_cpi" 1520 + }, 1521 + { 1522 + "BriefDescription": "Fraction of cycles spent in the Operating System (OS) Kernel mode", 1523 + "MetricExpr": "CPU_CLK_UNHALTED.THREAD_P:k / CPU_CLK_UNHALTED.THREAD", 1524 + "MetricGroup": "OS", 1525 + "MetricName": "tma_info_system_kernel_utilization", 1526 + "MetricThreshold": "tma_info_system_kernel_utilization > 0.05" 1527 + }, 1528 + { 1529 + "BriefDescription": "Average latency of data read request to external DRAM memory 
[in nanoseconds]", 1530 + "MetricExpr": "1e9 * (UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_DDR / UNC_CHA_TOR_INSERTS.IA_MISS_DRD_DDR) / uncore_cha_0@event\\=0x1@", 1531 + "MetricGroup": "Mem;MemoryLat;Server;SoC", 1532 + "MetricName": "tma_info_system_mem_dram_read_latency", 1533 + "PublicDescription": "Average latency of data read request to external DRAM memory [in nanoseconds]. Accounts for demand loads and L1/L2 data-read prefetches" 1534 + }, 1535 + { 1536 + "BriefDescription": "Average number of parallel data read requests to external memory", 1537 + "MetricExpr": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD / UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD@thresh\\=1@", 1538 + "MetricGroup": "Mem;MemoryBW;SoC", 1539 + "MetricName": "tma_info_system_mem_parallel_reads", 1540 + "PublicDescription": "Average number of parallel data read requests to external memory. Accounts for demand loads and L1/L2 prefetches" 1541 + }, 1542 + { 1543 + "BriefDescription": "Average latency of data read request to external 3D X-Point memory [in nanoseconds]", 1544 + "MetricExpr": "(1e9 * (UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_PMM / UNC_CHA_TOR_INSERTS.IA_MISS_DRD_PMM) / uncore_cha_0@event\\=0x1@ if #has_pmem > 0 else 0)", 1545 + "MetricGroup": "Mem;MemoryLat;Server;SoC", 1546 + "MetricName": "tma_info_system_mem_pmm_read_latency", 1547 + "PublicDescription": "Average latency of data read request to external 3D X-Point memory [in nanoseconds]. Accounts for demand loads and L1/L2 data-read prefetches" 1548 + }, 1549 + { 1550 + "BriefDescription": "Average latency of data read request to external memory (in nanoseconds)", 1551 + "MetricExpr": "1e9 * (UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD / UNC_CHA_TOR_INSERTS.IA_MISS_DRD) / (tma_info_system_socket_clks / duration_time)", 1552 + "MetricGroup": "Mem;MemoryLat;SoC", 1553 + "MetricName": "tma_info_system_mem_read_latency", 1554 + "PublicDescription": "Average latency of data read request to external memory (in nanoseconds). Accounts for demand loads and L1/L2 prefetches. 
([RKL+]memory-controller only)" 1555 + }, 1556 + { 1557 + "BriefDescription": "Average 3DXP Memory Bandwidth Use for reads [GB / sec]", 1558 + "MetricExpr": "(64 * UNC_M_PMM_RPQ_INSERTS / 1e9 / duration_time if #has_pmem > 0 else 0)", 1559 + "MetricGroup": "Mem;MemoryBW;Server;SoC", 1560 + "MetricName": "tma_info_system_pmm_read_bw" 1561 + }, 1562 + { 1563 + "BriefDescription": "Average 3DXP Memory Bandwidth Use for Writes [GB / sec]", 1564 + "MetricExpr": "(64 * UNC_M_PMM_WPQ_INSERTS / 1e9 / duration_time if #has_pmem > 0 else 0)", 1565 + "MetricGroup": "Mem;MemoryBW;Server;SoC", 1566 + "MetricName": "tma_info_system_pmm_write_bw" 1567 + }, 1568 + { 1569 + "BriefDescription": "Fraction of cycles where both hardware Logical Processors were active", 1570 + "MetricExpr": "(1 - CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_DISTRIBUTED if #SMT_on else 0)", 1571 + "MetricGroup": "SMT", 1572 + "MetricName": "tma_info_system_smt_2t_utilization" 1573 + }, 1574 + { 1575 + "BriefDescription": "Socket actual clocks when any core is active on that socket", 1576 + "MetricExpr": "uncore_cha_0@event\\=0x1@", 1577 + "MetricGroup": "SoC", 1578 + "MetricName": "tma_info_system_socket_clks" 853 1579 }, 854 1580 { 855 1581 "BriefDescription": "Tera Integer (matrix) Operations Per Second", 856 1582 "MetricExpr": "8 * AMX_OPS_RETIRED.INT8 / 1e12 / duration_time", 857 1583 "MetricGroup": "Cor;HPC;IntVector;Server", 858 - "MetricName": "tma_info_tiops" 1584 + "MetricName": "tma_info_system_tiops" 859 1585 }, 860 1586 { 861 1587 "BriefDescription": "Average Frequency Utilization relative nominal frequency", 862 - "MetricExpr": "tma_info_clks / CPU_CLK_UNHALTED.REF_TSC", 1588 + "MetricExpr": "tma_info_thread_clks / CPU_CLK_UNHALTED.REF_TSC", 863 1589 "MetricGroup": "Power", 864 - "MetricName": "tma_info_turbo_utilization" 865 - }, 866 - { 867 - "BriefDescription": "Uops Per Instruction", 868 - "MetricExpr": "tma_retiring * tma_info_slots / INST_RETIRED.ANY", 869 - "MetricGroup": 
"Pipeline;Ret;Retire", 870 - "MetricName": "tma_info_uoppi", 871 - "MetricThreshold": "tma_info_uoppi > 1.05" 1590 + "MetricName": "tma_info_system_turbo_utilization" 872 1591 }, 873 1592 { 874 1593 "BriefDescription": "Cross-socket Ultra Path Interconnect (UPI) data transmit bandwidth for data only [MB / sec]", 875 1594 "MetricExpr": "UNC_UPI_TxL_FLITS.ALL_DATA * 64 / 9 / 1e6", 876 1595 "MetricGroup": "Server;SoC", 877 - "MetricName": "tma_info_upi_data_transmit_bw" 1596 + "MetricName": "tma_info_system_upi_data_transmit_bw" 1597 + }, 1598 + { 1599 + "BriefDescription": "Per-Logical Processor actual clocks when the Logical Processor is active.", 1600 + "MetricExpr": "CPU_CLK_UNHALTED.THREAD", 1601 + "MetricGroup": "Pipeline", 1602 + "MetricName": "tma_info_thread_clks" 1603 + }, 1604 + { 1605 + "BriefDescription": "Cycles Per Instruction (per Logical Processor)", 1606 + "MetricExpr": "1 / tma_info_thread_ipc", 1607 + "MetricGroup": "Mem;Pipeline", 1608 + "MetricName": "tma_info_thread_cpi" 1609 + }, 1610 + { 1611 + "BriefDescription": "The ratio of Executed- by Issued-Uops", 1612 + "MetricExpr": "UOPS_EXECUTED.THREAD / UOPS_ISSUED.ANY", 1613 + "MetricGroup": "Cor;Pipeline", 1614 + "MetricName": "tma_info_thread_execute_per_issue", 1615 + "PublicDescription": "The ratio of Executed- by Issued-Uops. Ratio > 1 suggests high rate of uop micro-fusions. Ratio < 1 suggest high rate of \"execute\" at rename stage." 
1616 + }, 1617 + { 1618 + "BriefDescription": "Instructions Per Cycle (per Logical Processor)", 1619 + "MetricExpr": "INST_RETIRED.ANY / tma_info_thread_clks", 1620 + "MetricGroup": "Ret;Summary", 1621 + "MetricName": "tma_info_thread_ipc" 1622 + }, 1623 + { 1624 + "BriefDescription": "Total issue-pipeline slots (per-Physical Core till ICL; per-Logical Processor ICL onward)", 1625 + "MetricExpr": "TOPDOWN.SLOTS", 1626 + "MetricGroup": "TmaL1;tma_L1_group", 1627 + "MetricName": "tma_info_thread_slots" 1628 + }, 1629 + { 1630 + "BriefDescription": "Fraction of Physical Core issue-slots utilized by this Logical Processor", 1631 + "MetricExpr": "(tma_info_thread_slots / (TOPDOWN.SLOTS / 2) if #SMT_on else 1)", 1632 + "MetricGroup": "SMT;TmaL1;tma_L1_group", 1633 + "MetricName": "tma_info_thread_slots_utilization" 1634 + }, 1635 + { 1636 + "BriefDescription": "Uops Per Instruction", 1637 + "MetricExpr": "tma_retiring * tma_info_thread_slots / INST_RETIRED.ANY", 1638 + "MetricGroup": "Pipeline;Ret;Retire", 1639 + "MetricName": "tma_info_thread_uoppi", 1640 + "MetricThreshold": "tma_info_thread_uoppi > 1.05" 878 1641 }, 879 1642 { 880 1643 "BriefDescription": "Uops per taken branch", 881 - "MetricExpr": "tma_retiring * tma_info_slots / BR_INST_RETIRED.NEAR_TAKEN", 1644 + "MetricExpr": "tma_retiring * tma_info_thread_slots / BR_INST_RETIRED.NEAR_TAKEN", 882 1645 "MetricGroup": "Branches;Fed;FetchBW", 883 - "MetricName": "tma_info_uptb", 884 - "MetricThreshold": "tma_info_uptb < 9" 1646 + "MetricName": "tma_info_thread_uptb", 1647 + "MetricThreshold": "tma_info_thread_uptb < 9" 885 1648 }, 886 1649 { 887 1650 "BriefDescription": "This metric approximates arithmetic Integer (Int) matrix uops fraction the CPU has retired (aggregated across all supported Int datatypes in AMX engine)", 888 - "MetricExpr": "cpu@AMX_OPS_RETIRED.INT8\\,cmask\\=1@ / (tma_retiring * tma_info_slots)", 1651 + "MetricExpr": "cpu@AMX_OPS_RETIRED.INT8\\,cmask\\=1@ / (tma_retiring *
tma_info_thread_slots)", 889 1652 "MetricGroup": "Compute;HPC;IntVector;Pipeline;Server;TopdownL4;tma_L4_group;tma_int_operations_group", 890 1653 "MetricName": "tma_int_amx", 891 1654 "MetricThreshold": "tma_int_amx > 0.1 & (tma_int_operations > 0.1 & tma_light_operations > 0.6)", ··· 1413 1156 }, 1414 1157 { 1415 1158 "BriefDescription": "This metric represents 128-bit vector Integer ADD/SUB/SAD or VNNI (Vector Neural Network Instructions) uops fraction the CPU has retired", 1416 - "MetricExpr": "(INT_VEC_RETIRED.ADD_128 + INT_VEC_RETIRED.VNNI_128) / (tma_retiring * tma_info_slots)", 1159 + "MetricExpr": "(INT_VEC_RETIRED.ADD_128 + INT_VEC_RETIRED.VNNI_128) / (tma_retiring * tma_info_thread_slots)", 1417 1160 "MetricGroup": "Compute;IntVector;Pipeline;TopdownL4;tma_L4_group;tma_int_operations_group;tma_issue2P", 1418 1161 "MetricName": "tma_int_vector_128b", 1419 1162 "MetricThreshold": "tma_int_vector_128b > 0.1 & (tma_int_operations > 0.1 & tma_light_operations > 0.6)", ··· 1422 1165 }, 1423 1166 { 1424 1167 "BriefDescription": "This metric represents 256-bit vector Integer ADD/SUB/SAD or VNNI (Vector Neural Network Instructions) uops fraction the CPU has retired", 1425 - "MetricExpr": "(INT_VEC_RETIRED.ADD_256 + INT_VEC_RETIRED.MUL_256 + INT_VEC_RETIRED.VNNI_256) / (tma_retiring * tma_info_slots)", 1168 + "MetricExpr": "(INT_VEC_RETIRED.ADD_256 + INT_VEC_RETIRED.MUL_256 + INT_VEC_RETIRED.VNNI_256) / (tma_retiring * tma_info_thread_slots)", 1426 1169 "MetricGroup": "Compute;IntVector;Pipeline;TopdownL4;tma_L4_group;tma_int_operations_group;tma_issue2P", 1427 1170 "MetricName": "tma_int_vector_256b", 1428 1171 "MetricThreshold": "tma_int_vector_256b > 0.1 & (tma_int_operations > 0.1 & tma_light_operations > 0.6)", ··· 1431 1174 }, 1432 1175 { 1433 1176 "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses", 1434 - "MetricExpr": "ICACHE_TAG.STALLS / tma_info_clks", 1177 + "MetricExpr": 
"ICACHE_TAG.STALLS / tma_info_thread_clks", 1435 1178 "MetricGroup": "BigFoot;FetchLat;MemoryTLB;TopdownL3;tma_L3_group;tma_fetch_latency_group", 1436 1179 "MetricName": "tma_itlb_misses", 1437 1180 "MetricThreshold": "tma_itlb_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", ··· 1440 1183 }, 1441 1184 { 1442 1185 "BriefDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache", 1443 - "MetricExpr": "max((EXE_ACTIVITY.BOUND_ON_LOADS - MEMORY_ACTIVITY.STALLS_L1D_MISS) / tma_info_clks, 0)", 1186 + "MetricExpr": "max((EXE_ACTIVITY.BOUND_ON_LOADS - MEMORY_ACTIVITY.STALLS_L1D_MISS) / tma_info_thread_clks, 0)", 1444 1187 "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_issueL1;tma_issueMC;tma_memory_bound_group", 1445 1188 "MetricName": "tma_l1_bound", 1446 1189 "MetricThreshold": "tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", ··· 1450 1193 { 1451 1194 "BriefDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads", 1452 1195 "MetricConstraint": "NO_GROUP_EVENTS", 1453 - "MetricExpr": "(MEMORY_ACTIVITY.STALLS_L1D_MISS - MEMORY_ACTIVITY.STALLS_L2_MISS) / tma_info_clks", 1196 + "MetricExpr": "(MEMORY_ACTIVITY.STALLS_L1D_MISS - MEMORY_ACTIVITY.STALLS_L2_MISS) / tma_info_thread_clks", 1454 1197 "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", 1455 1198 "MetricName": "tma_l2_bound", 1456 1199 "MetricThreshold": "tma_l2_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", ··· 1459 1202 }, 1460 1203 { 1461 1204 "BriefDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core", 1462 - "MetricExpr": "(MEMORY_ACTIVITY.STALLS_L2_MISS - MEMORY_ACTIVITY.STALLS_L3_MISS) / tma_info_clks", 1205 + "MetricExpr": "(MEMORY_ACTIVITY.STALLS_L2_MISS - MEMORY_ACTIVITY.STALLS_L3_MISS) / 
tma_info_thread_clks", 1463 1206 "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", 1464 1207 "MetricName": "tma_l3_bound", 1465 1208 "MetricThreshold": "tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", ··· 1468 1211 }, 1469 1212 { 1470 1213 "BriefDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)", 1471 - "MetricExpr": "33 * tma_info_average_frequency * MEM_LOAD_RETIRED.L3_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_clks", 1214 + "MetricExpr": "33 * tma_info_system_average_frequency * MEM_LOAD_RETIRED.L3_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", 1472 1215 "MetricGroup": "MemoryLat;TopdownL4;tma_L4_group;tma_issueLat;tma_l3_bound_group", 1473 1216 "MetricName": "tma_l3_hit_latency", 1474 1217 "MetricThreshold": "tma_l3_hit_latency > 0.1 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", 1475 - "PublicDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited). Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance. Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS. Related metrics: tma_info_memory_latency, tma_mem_latency", 1218 + "PublicDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited). Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance. Note the value of this node may overlap with its siblings. 
Sample with: MEM_LOAD_RETIRED.L3_HIT_PS. Related metrics: tma_info_bottleneck_memory_latency, tma_mem_latency", 1476 1219 "ScaleUnit": "100%" 1477 1220 }, 1478 1221 { 1479 1222 "BriefDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs)", 1480 - "MetricExpr": "DECODE.LCP / tma_info_clks", 1223 + "MetricExpr": "DECODE.LCP / tma_info_thread_clks", 1481 1224 "MetricGroup": "FetchLat;TopdownL3;tma_L3_group;tma_fetch_latency_group;tma_issueFB", 1482 1225 "MetricName": "tma_lcp", 1483 1226 "MetricThreshold": "tma_lcp > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", 1484 - "PublicDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs). Using proper compiler flags or Intel Compiler by default will certainly avoid this. #Link: Optimization Guide about LCP BKMs. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_dsb_coverage, tma_info_dsb_misses, tma_info_iptb", 1227 + "PublicDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs). Using proper compiler flags or Intel Compiler by default will certainly avoid this. #Link: Optimization Guide about LCP BKMs. 
Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_botlnk_l2_dsb_misses, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb", 1485 1228 "ScaleUnit": "100%" 1486 1229 }, 1487 1230 { ··· 1496 1239 }, 1497 1240 { 1498 1241 "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port for Load operations", 1499 - "MetricExpr": "UOPS_DISPATCHED.PORT_2_3_10 / (3 * tma_info_core_clks)", 1242 + "MetricExpr": "UOPS_DISPATCHED.PORT_2_3_10 / (3 * tma_info_core_core_clks)", 1500 1243 "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group", 1501 1244 "MetricName": "tma_load_op_utilization", 1502 1245 "MetricThreshold": "tma_load_op_utilization > 0.6", ··· 1513 1256 }, 1514 1257 { 1515 1258 "BriefDescription": "This metric estimates the fraction of cycles where the Second-level TLB (STLB) was missed by load accesses, performing a hardware page walk", 1516 - "MetricExpr": "DTLB_LOAD_MISSES.WALK_ACTIVE / tma_info_clks", 1259 + "MetricExpr": "DTLB_LOAD_MISSES.WALK_ACTIVE / tma_info_thread_clks", 1517 1260 "MetricGroup": "MemoryTLB;TopdownL5;tma_L5_group;tma_dtlb_load_group", 1518 1261 "MetricName": "tma_load_stlb_miss", 1519 1262 "MetricThreshold": "tma_load_stlb_miss > 0.05 & (tma_dtlb_load > 0.1 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))", ··· 1521 1264 }, 1522 1265 { 1523 1266 "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from local memory", 1524 - "MetricExpr": "71 * tma_info_average_frequency * MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_clks", 1267 + "MetricExpr": "71 * tma_info_system_average_frequency * MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", 1525 1268 "MetricGroup": "Server;TopdownL5;tma_L5_group;tma_mem_latency_group", 1526 1269 "MetricName": 
"tma_local_dram", 1527 1270 "MetricThreshold": "tma_local_dram > 0.1 & (tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))", ··· 1531 1274 { 1532 1275 "BriefDescription": "This metric represents fraction of cycles the CPU spent handling cache misses due to lock operations", 1533 1276 "MetricConstraint": "NO_GROUP_EVENTS", 1534 - "MetricExpr": "(16 * max(0, MEM_INST_RETIRED.LOCK_LOADS - L2_RQSTS.ALL_RFO) + MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES * (10 * L2_RQSTS.RFO_HIT + min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO))) / tma_info_clks", 1277 + "MetricExpr": "(16 * max(0, MEM_INST_RETIRED.LOCK_LOADS - L2_RQSTS.ALL_RFO) + MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES * (10 * L2_RQSTS.RFO_HIT + min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO))) / tma_info_thread_clks", 1535 1278 "MetricGroup": "Offcore;TopdownL4;tma_L4_group;tma_issueRFO;tma_l1_bound_group", 1536 1279 "MetricName": "tma_lock_latency", 1537 1280 "MetricThreshold": "tma_lock_latency > 0.2 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", ··· 1550 1293 }, 1551 1294 { 1552 1295 "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to memory bandwidth Allocation feature (RDT's memory bandwidth throttling).", 1553 - "MetricExpr": "INT_MISC.MBA_STALLS / tma_info_clks", 1296 + "MetricExpr": "INT_MISC.MBA_STALLS / tma_info_thread_clks", 1554 1297 "MetricGroup": "MemoryBW;Offcore;Server;TopdownL5;tma_L5_group;tma_mem_bandwidth_group", 1555 1298 "MetricName": "tma_mba_stalls", 1556 1299 "MetricThreshold": "tma_mba_stalls > 0.1 & (tma_mem_bandwidth > 0.2 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))", ··· 1558 1301 }, 1559 1302 { 1560 1303 "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to 
approaching bandwidth limits of external memory (DRAM)", 1561 - "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, cpu@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=4@) / tma_info_clks", 1304 + "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, cpu@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=4@) / tma_info_thread_clks", 1562 1305 "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueBW", 1563 1306 "MetricName": "tma_mem_bandwidth", 1564 1307 "MetricThreshold": "tma_mem_bandwidth > 0.2 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", 1565 - "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_fb_full, tma_info_dram_bw_use, tma_info_memory_bandwidth, tma_sq_full", 1308 + "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). 
Related metrics: tma_fb_full, tma_info_bottleneck_memory_bandwidth, tma_info_system_dram_bw_use, tma_sq_full", 1566 1309 "ScaleUnit": "100%" 1567 1310 }, 1568 1311 { 1569 1312 "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM)", 1570 - "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD) / tma_info_clks - tma_mem_bandwidth", 1313 + "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD) / tma_info_thread_clks - tma_mem_bandwidth", 1571 1314 "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueLat", 1572 1315 "MetricName": "tma_mem_latency", 1573 1316 "MetricThreshold": "tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", 1574 - "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM). This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: tma_info_memory_latency, tma_l3_hit_latency", 1317 + "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM). This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). 
Related metrics: tma_info_bottleneck_memory_latency, tma_l3_hit_latency", 1575 1318 "ScaleUnit": "100%" 1576 1319 }, 1577 1320 { 1578 1321 "BriefDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck", 1579 - "MetricExpr": "topdown\\-mem\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 0 * tma_info_slots", 1322 + "MetricExpr": "topdown\\-mem\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 0 * tma_info_thread_slots", 1580 1323 "MetricGroup": "Backend;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group", 1581 1324 "MetricName": "tma_memory_bound", 1582 1325 "MetricThreshold": "tma_memory_bound > 0.2 & tma_backend_bound > 0.2", ··· 1586 1329 }, 1587 1330 { 1588 1331 "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to LFENCE Instructions.", 1589 - "MetricExpr": "13 * MISC2_RETIRED.LFENCE / tma_info_clks", 1332 + "MetricExpr": "13 * MISC2_RETIRED.LFENCE / tma_info_thread_clks", 1590 1333 "MetricGroup": "TopdownL6;tma_L6_group;tma_serializing_operation_group", 1591 1334 "MetricName": "tma_memory_fence", 1592 1335 "MetricThreshold": "tma_memory_fence > 0.05 & (tma_serializing_operation > 0.1 & (tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))))", ··· 1595 1338 { 1596 1339 "BriefDescription": "This metric represents fraction of slots where the CPU was retiring memory operations -- uops for memory load or store accesses.", 1597 1340 "MetricConstraint": "NO_GROUP_EVENTS", 1598 - "MetricExpr": "tma_light_operations * MEM_UOP_RETIRED.ANY / (tma_retiring * tma_info_slots)", 1341 + "MetricExpr": "tma_light_operations * MEM_UOP_RETIRED.ANY / (tma_retiring * tma_info_thread_slots)", 1599 1342 "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group", 1600 1343 "MetricName": "tma_memory_operations", 
1601 1344 "MetricThreshold": "tma_memory_operations > 0.1 & tma_light_operations > 0.6", ··· 1603 1346 }, 1604 1347 { 1605 1348 "BriefDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit", 1606 - "MetricExpr": "UOPS_RETIRED.MS / tma_info_slots", 1349 + "MetricExpr": "UOPS_RETIRED.MS / tma_info_thread_slots", 1607 1350 "MetricGroup": "MicroSeq;TopdownL3;tma_L3_group;tma_heavy_operations_group;tma_issueMC;tma_issueMS", 1608 1351 "MetricName": "tma_microcode_sequencer", 1609 1352 "MetricThreshold": "tma_microcode_sequencer > 0.05 & tma_heavy_operations > 0.1", ··· 1612 1355 }, 1613 1356 { 1614 1357 "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Branch Misprediction at execution stage", 1615 - "MetricExpr": "tma_branch_mispredicts / tma_bad_speculation * INT_MISC.CLEAR_RESTEER_CYCLES / tma_info_clks", 1358 + "MetricExpr": "tma_branch_mispredicts / tma_bad_speculation * INT_MISC.CLEAR_RESTEER_CYCLES / tma_info_thread_clks", 1616 1359 "MetricGroup": "BadSpec;BrMispredicts;TopdownL4;tma_L4_group;tma_branch_resteers_group;tma_issueBM", 1617 1360 "MetricName": "tma_mispredicts_resteers", 1618 1361 "MetricThreshold": "tma_mispredicts_resteers > 0.05 & (tma_branch_resteers > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15))", 1619 - "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Branch Misprediction at execution stage. Sample with: INT_MISC.CLEAR_RESTEER_CYCLES. Related metrics: tma_branch_mispredicts, tma_info_branch_misprediction_cost, tma_info_mispredictions", 1362 + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Branch Misprediction at execution stage. Sample with: INT_MISC.CLEAR_RESTEER_CYCLES. 
Related metrics: tma_branch_mispredicts, tma_info_bad_spec_branch_misprediction_cost, tma_info_bottleneck_mispredictions", 1620 1363 "ScaleUnit": "100%" 1621 1364 }, 1622 1365 { 1623 1366 "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline)", 1624 - "MetricExpr": "(IDQ.MITE_CYCLES_ANY - IDQ.MITE_CYCLES_OK) / tma_info_core_clks / 2", 1367 + "MetricExpr": "(IDQ.MITE_CYCLES_ANY - IDQ.MITE_CYCLES_OK) / tma_info_core_core_clks / 2", 1625 1368 "MetricGroup": "DSBmiss;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group", 1626 1369 "MetricName": "tma_mite", 1627 - "MetricThreshold": "tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 6 > 0.35)", 1370 + "MetricThreshold": "tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 6 > 0.35)", 1628 1371 "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline). This pipeline is used for code that was not pre-cached in the DSB or LSD. For example; inefficiencies due to asymmetric decoders; use of long immediate or LCP can manifest as MITE fetch bandwidth bottleneck. 
Sample with: FRONTEND_RETIRED.ANY_DSB_MISS", 1629 1372 "ScaleUnit": "100%" 1630 1373 }, 1631 1374 { 1632 1375 "BriefDescription": "The Mixing_Vectors metric gives the percentage of injected blend uops out of all uops issued", 1633 - "MetricExpr": "160 * ASSISTS.SSE_AVX_MIX / tma_info_clks", 1376 + "MetricExpr": "160 * ASSISTS.SSE_AVX_MIX / tma_info_thread_clks", 1634 1377 "MetricGroup": "TopdownL5;tma_L5_group;tma_issueMV;tma_ports_utilized_0_group", 1635 1378 "MetricName": "tma_mixing_vectors", 1636 1379 "MetricThreshold": "tma_mixing_vectors > 0.05", ··· 1639 1382 }, 1640 1383 { 1641 1384 "BriefDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS)", 1642 - "MetricExpr": "3 * cpu@UOPS_RETIRED.MS\\,cmask\\=1\\,edge@ / (tma_retiring * tma_info_slots / UOPS_ISSUED.ANY) / tma_info_clks", 1385 + "MetricExpr": "3 * cpu@UOPS_RETIRED.MS\\,cmask\\=1\\,edge@ / (tma_retiring * tma_info_thread_slots / UOPS_ISSUED.ANY) / tma_info_thread_clks", 1643 1386 "MetricGroup": "FetchLat;MicroSeq;TopdownL3;tma_L3_group;tma_fetch_latency_group;tma_issueMC;tma_issueMS;tma_issueMV;tma_issueSO", 1644 1387 "MetricName": "tma_ms_switches", 1645 1388 "MetricThreshold": "tma_ms_switches > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", ··· 1648 1391 }, 1649 1392 { 1650 1393 "BriefDescription": "This metric represents fraction of slots where the CPU was retiring branch instructions that were not fused", 1651 - "MetricExpr": "tma_light_operations * (BR_INST_RETIRED.ALL_BRANCHES - INST_RETIRED.MACRO_FUSED) / (tma_retiring * tma_info_slots)", 1394 + "MetricExpr": "tma_light_operations * (BR_INST_RETIRED.ALL_BRANCHES - INST_RETIRED.MACRO_FUSED) / (tma_retiring * tma_info_thread_slots)", 1652 1395 "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group", 1653 1396 "MetricName": "tma_non_fused_branches", 1654 1397 "MetricThreshold": "tma_non_fused_branches > 0.1 & 
tma_light_operations > 0.6", ··· 1657 1400 }, 1658 1401 { 1659 1402 "BriefDescription": "This metric represents fraction of slots where the CPU was retiring NOP (no op) instructions", 1660 - "MetricExpr": "tma_light_operations * INST_RETIRED.NOP / (tma_retiring * tma_info_slots)", 1403 + "MetricExpr": "tma_light_operations * INST_RETIRED.NOP / (tma_retiring * tma_info_thread_slots)", 1661 1404 "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group", 1662 1405 "MetricName": "tma_nop_instructions", 1663 1406 "MetricThreshold": "tma_nop_instructions > 0.1 & tma_light_operations > 0.6", ··· 1676 1419 }, 1677 1420 { 1678 1421 "BriefDescription": "This metric roughly estimates fraction of slots the CPU retired uops as a result of handling Page Faults", 1679 - "MetricExpr": "99 * ASSISTS.PAGE_FAULT / tma_info_slots", 1422 + "MetricExpr": "99 * ASSISTS.PAGE_FAULT / tma_info_thread_slots", 1680 1423 "MetricGroup": "TopdownL5;tma_L5_group;tma_assists_group", 1681 1424 "MetricName": "tma_page_faults", 1682 1425 "MetricThreshold": "tma_page_faults > 0.05", ··· 1685 1428 }, 1686 1429 { 1687 1430 "BriefDescription": "This metric roughly estimates (based on idle latencies) how often the CPU was stalled on accesses to external 3D-Xpoint (Crystal Ridge, a.k.a", 1688 - "MetricExpr": "(((1 - ((19 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS)) + 10 * (MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS))) / (19 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS)) + 10 * (MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD * (1 + MEM_LOAD_RETIRED.FB_HIT /
MEM_LOAD_RETIRED.L1_MISS) + MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS)) + (25 * (MEM_LOAD_RETIRED.LOCAL_PMM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) if #has_pmem > 0 else 0) + 33 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) if #has_pmem > 0 else 0))) if #has_pmem > 0 else 0)) * (MEMORY_ACTIVITY.STALLS_L3_MISS / tma_info_clks) if 1e6 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM + MEM_LOAD_RETIRED.LOCAL_PMM) > MEM_LOAD_RETIRED.L1_MISS else 0) if #has_pmem > 0 else 0)", 1431 + "MetricExpr": "(((1 - ((19 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS)) + 10 * (MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS))) / (19 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS)) + 10 * (MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS)) + (25 * (MEM_LOAD_RETIRED.LOCAL_PMM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) if #has_pmem > 0 else 0) + 33 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) if #has_pmem > 0 else 0))) if #has_pmem > 0 else 0)) * (MEMORY_ACTIVITY.STALLS_L3_MISS / tma_info_thread_clks) if 1e6 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM + MEM_LOAD_RETIRED.LOCAL_PMM) > MEM_LOAD_RETIRED.L1_MISS else 0) if #has_pmem > 0 else 0)", 1689 1432 "MetricGroup": 
"MemoryBound;Server;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", 1690 1433 "MetricName": "tma_pmm_bound", 1691 1434 "MetricThreshold": "tma_pmm_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", ··· 1694 1437 }, 1695 1438 { 1696 1439 "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 0 ([SNB+] ALU; [HSW+] ALU and 2nd branch)", 1697 - "MetricExpr": "UOPS_DISPATCHED.PORT_0 / tma_info_core_clks", 1440 + "MetricExpr": "UOPS_DISPATCHED.PORT_0 / tma_info_core_core_clks", 1698 1441 "MetricGroup": "Compute;TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P", 1699 1442 "MetricName": "tma_port_0", 1700 1443 "MetricThreshold": "tma_port_0 > 0.6", ··· 1703 1446 }, 1704 1447 { 1705 1448 "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 1 (ALU)", 1706 - "MetricExpr": "UOPS_DISPATCHED.PORT_1 / tma_info_core_clks", 1449 + "MetricExpr": "UOPS_DISPATCHED.PORT_1 / tma_info_core_core_clks", 1707 1450 "MetricGroup": "TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P", 1708 1451 "MetricName": "tma_port_1", 1709 1452 "MetricThreshold": "tma_port_1 > 0.6", ··· 1712 1455 }, 1713 1456 { 1714 1457 "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+]Primary Branch and simple ALU)", 1715 - "MetricExpr": "UOPS_DISPATCHED.PORT_6 / tma_info_core_clks", 1458 + "MetricExpr": "UOPS_DISPATCHED.PORT_6 / tma_info_core_core_clks", 1716 1459 "MetricGroup": "TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P", 1717 1460 "MetricName": "tma_port_6", 1718 1461 "MetricThreshold": "tma_port_6 > 0.6", ··· 1721 1464 }, 1722 1465 { 1723 1466 "BriefDescription": "This metric estimates fraction of cycles the CPU performance was potentially limited due to Core computation issues (non divider-related)", 1724 - "MetricExpr": "((cpu@EXE_ACTIVITY.3_PORTS_UTIL\\,umask\\=0x80@ + 
tma_serializing_operation * (CYCLE_ACTIVITY.STALLS_TOTAL - EXE_ACTIVITY.BOUND_ON_LOADS) + (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * cpu@EXE_ACTIVITY.2_PORTS_UTIL\\,umask\\=0xc@)) / tma_info_clks if ARITH.DIV_ACTIVE < CYCLE_ACTIVITY.STALLS_TOTAL - EXE_ACTIVITY.BOUND_ON_LOADS else (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * cpu@EXE_ACTIVITY.2_PORTS_UTIL\\,umask\\=0xc@) / tma_info_clks)", 1467 + "MetricExpr": "((cpu@EXE_ACTIVITY.3_PORTS_UTIL\\,umask\\=0x80@ + tma_serializing_operation * (CYCLE_ACTIVITY.STALLS_TOTAL - EXE_ACTIVITY.BOUND_ON_LOADS) + (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * cpu@EXE_ACTIVITY.2_PORTS_UTIL\\,umask\\=0xc@)) / tma_info_thread_clks if ARITH.DIV_ACTIVE < CYCLE_ACTIVITY.STALLS_TOTAL - EXE_ACTIVITY.BOUND_ON_LOADS else (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * cpu@EXE_ACTIVITY.2_PORTS_UTIL\\,umask\\=0xc@) / tma_info_thread_clks)", 1725 1468 "MetricGroup": "PortsUtil;TopdownL3;tma_L3_group;tma_core_bound_group", 1726 1469 "MetricName": "tma_ports_utilization", 1727 1470 "MetricThreshold": "tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)", ··· 1730 1473 }, 1731 1474 { 1732 1475 "BriefDescription": "This metric represents fraction of cycles CPU executed no uops on any execution port (Logical Processor cycles since ICL, Physical Core cycles otherwise)", 1733 - "MetricExpr": "cpu@EXE_ACTIVITY.3_PORTS_UTIL\\,umask\\=0x80@ / tma_info_clks + tma_serializing_operation * (CYCLE_ACTIVITY.STALLS_TOTAL - EXE_ACTIVITY.BOUND_ON_LOADS) / tma_info_clks", 1476 + "MetricExpr": "cpu@EXE_ACTIVITY.3_PORTS_UTIL\\,umask\\=0x80@ / tma_info_thread_clks + tma_serializing_operation * (CYCLE_ACTIVITY.STALLS_TOTAL - EXE_ACTIVITY.BOUND_ON_LOADS) / tma_info_thread_clks", 1734 1477 "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group", 1735 1478 "MetricName": "tma_ports_utilized_0", 1736 1479 "MetricThreshold": "tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 
0.2))", ··· 1739 1482 }, 1740 1483 { 1741 1484 "BriefDescription": "This metric represents fraction of cycles where the CPU executed total of 1 uop per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", 1742 - "MetricExpr": "EXE_ACTIVITY.1_PORTS_UTIL / tma_info_clks", 1485 + "MetricExpr": "EXE_ACTIVITY.1_PORTS_UTIL / tma_info_thread_clks", 1743 1486 "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_issueL1;tma_ports_utilization_group", 1744 1487 "MetricName": "tma_ports_utilized_1", 1745 1488 "MetricThreshold": "tma_ports_utilized_1 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", ··· 1748 1491 }, 1749 1492 { 1750 1493 "BriefDescription": "This metric represents fraction of cycles CPU executed total of 2 uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", 1751 - "MetricExpr": "EXE_ACTIVITY.2_PORTS_UTIL / tma_info_clks", 1494 + "MetricExpr": "EXE_ACTIVITY.2_PORTS_UTIL / tma_info_thread_clks", 1752 1495 "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_issue2P;tma_ports_utilization_group", 1753 1496 "MetricName": "tma_ports_utilized_2", 1754 1497 "MetricThreshold": "tma_ports_utilized_2 > 0.15 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", ··· 1757 1500 }, 1758 1501 { 1759 1502 "BriefDescription": "This metric represents fraction of cycles CPU executed total of 3 or more uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", 1760 - "MetricExpr": "UOPS_EXECUTED.CYCLES_GE_3 / tma_info_clks", 1503 + "MetricExpr": "UOPS_EXECUTED.CYCLES_GE_3 / tma_info_thread_clks", 1761 1504 "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group", 1762 1505 "MetricName": "tma_ports_utilized_3m", 1763 1506 "MetricThreshold": "tma_ports_utilized_3m > 0.7 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & 
tma_backend_bound > 0.2))", ··· 1766 1509 }, 1767 1510 { 1768 1511 "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from remote cache in other sockets including synchronizations issues", 1769 - "MetricExpr": "(135.5 * tma_info_average_frequency * MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM + 135.5 * tma_info_average_frequency * MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_clks", 1512 + "MetricExpr": "(135.5 * tma_info_system_average_frequency * MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM + 135.5 * tma_info_system_average_frequency * MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", 1770 1513 "MetricGroup": "Offcore;Server;Snoop;TopdownL5;tma_L5_group;tma_issueSyncxn;tma_mem_latency_group", 1771 1514 "MetricName": "tma_remote_cache", 1772 1515 "MetricThreshold": "tma_remote_cache > 0.05 & (tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))", ··· 1775 1518 }, 1776 1519 { 1777 1520 "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from remote memory", 1778 - "MetricExpr": "149 * tma_info_average_frequency * MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_clks", 1521 + "MetricExpr": "149 * tma_info_system_average_frequency * MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", 1779 1522 "MetricGroup": "Server;Snoop;TopdownL5;tma_L5_group;tma_mem_latency_group", 1780 1523 "MetricName": "tma_remote_dram", 1781 1524 "MetricThreshold": "tma_remote_dram > 0.1 & (tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))", ··· 1784 1527 }, 1785 1528 { 1786 1529 "BriefDescription": "This category represents 
fraction of slots utilized by useful work i.e. issued uops that eventually get retired", 1787 - "MetricExpr": "topdown\\-retiring / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 0 * tma_info_slots", 1530 + "MetricExpr": "topdown\\-retiring / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 0 * tma_info_thread_slots", 1788 1531 "MetricGroup": "TmaL1;TopdownL1;tma_L1_group", 1789 1532 "MetricName": "tma_retiring", 1790 1533 "MetricThreshold": "tma_retiring > 0.7 | tma_heavy_operations > 0.1", ··· 1794 1537 }, 1795 1538 { 1796 1539 "BriefDescription": "This metric represents fraction of cycles the CPU issue-pipeline was stalled due to serializing operations", 1797 - "MetricExpr": "RESOURCE_STALLS.SCOREBOARD / tma_info_clks", 1540 + "MetricExpr": "RESOURCE_STALLS.SCOREBOARD / tma_info_thread_clks", 1798 1541 "MetricGroup": "PortsUtil;TopdownL5;tma_L5_group;tma_issueSO;tma_ports_utilized_0_group", 1799 1542 "MetricName": "tma_serializing_operation", 1800 1543 "MetricThreshold": "tma_serializing_operation > 0.1 & (tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)))", ··· 1803 1546 }, 1804 1547 { 1805 1548 "BriefDescription": "This metric represents Shuffle (cross \"vector lane\" data transfers) uops fraction the CPU has retired.", 1806 - "MetricExpr": "INT_VEC_RETIRED.SHUFFLES / (tma_retiring * tma_info_slots)", 1549 + "MetricExpr": "INT_VEC_RETIRED.SHUFFLES / (tma_retiring * tma_info_thread_slots)", 1807 1550 "MetricGroup": "HPC;Pipeline;TopdownL4;tma_L4_group;tma_int_operations_group", 1808 1551 "MetricName": "tma_shuffles", 1809 1552 "MetricThreshold": "tma_shuffles > 0.1 & (tma_int_operations > 0.1 & tma_light_operations > 0.6)", ··· 1811 1554 }, 1812 1555 { 1813 1556 "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to PAUSE Instructions", 1814 - "MetricExpr": "CPU_CLK_UNHALTED.PAUSE 
/ tma_info_clks", 1557 + "MetricExpr": "CPU_CLK_UNHALTED.PAUSE / tma_info_thread_clks", 1815 1558 "MetricGroup": "TopdownL6;tma_L6_group;tma_serializing_operation_group", 1816 1559 "MetricName": "tma_slow_pause", 1817 1560 "MetricThreshold": "tma_slow_pause > 0.05 & (tma_serializing_operation > 0.1 & (tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))))", ··· 1820 1563 }, 1821 1564 { 1822 1565 "BriefDescription": "This metric estimates fraction of cycles handling memory load split accesses - load that cross 64-byte cache line boundary", 1823 - "MetricExpr": "tma_info_load_miss_real_latency * LD_BLOCKS.NO_SR / tma_info_clks", 1566 + "MetricExpr": "tma_info_memory_load_miss_real_latency * LD_BLOCKS.NO_SR / tma_info_thread_clks", 1824 1567 "MetricGroup": "TopdownL4;tma_L4_group;tma_l1_bound_group", 1825 1568 "MetricName": "tma_split_loads", 1826 1569 "MetricThreshold": "tma_split_loads > 0.2 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", ··· 1829 1572 }, 1830 1573 { 1831 1574 "BriefDescription": "This metric represents rate of split store accesses", 1832 - "MetricExpr": "MEM_INST_RETIRED.SPLIT_STORES / tma_info_core_clks", 1575 + "MetricExpr": "MEM_INST_RETIRED.SPLIT_STORES / tma_info_core_core_clks", 1833 1576 "MetricGroup": "TopdownL4;tma_L4_group;tma_issueSpSt;tma_store_bound_group", 1834 1577 "MetricName": "tma_split_stores", 1835 1578 "MetricThreshold": "tma_split_stores > 0.2 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", ··· 1838 1581 }, 1839 1582 { 1840 1583 "BriefDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors)", 1841 - "MetricExpr": "(XQ.FULL_CYCLES + L1D_PEND_MISS.L2_STALLS) / tma_info_clks", 1584 + "MetricExpr": "(XQ.FULL_CYCLES + L1D_PEND_MISS.L2_STALLS) / tma_info_thread_clks", 1842 1585 "MetricGroup": 
"MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_issueBW;tma_l3_bound_group", 1843 1586 "MetricName": "tma_sq_full", 1844 1587 "MetricThreshold": "tma_sq_full > 0.3 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", 1845 - "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). Related metrics: tma_fb_full, tma_info_dram_bw_use, tma_info_memory_bandwidth, tma_mem_bandwidth", 1588 + "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). Related metrics: tma_fb_full, tma_info_bottleneck_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth", 1846 1589 "ScaleUnit": "100%" 1847 1590 }, 1848 1591 { 1849 1592 "BriefDescription": "This metric estimates how often CPU was stalled due to RFO store memory accesses; RFO store issue a read-for-ownership request before the write", 1850 - "MetricExpr": "EXE_ACTIVITY.BOUND_ON_STORES / tma_info_clks", 1593 + "MetricExpr": "EXE_ACTIVITY.BOUND_ON_STORES / tma_info_thread_clks", 1851 1594 "MetricGroup": "MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", 1852 1595 "MetricName": "tma_store_bound", 1853 1596 "MetricThreshold": "tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", ··· 1856 1599 }, 1857 1600 { 1858 1601 "BriefDescription": "This metric roughly estimates fraction of cycles when the memory subsystem had loads blocked since they could not forward data from earlier (in program order) overlapping stores", 1859 - "MetricExpr": "13 * LD_BLOCKS.STORE_FORWARD / tma_info_clks", 1602 + "MetricExpr": "13 * LD_BLOCKS.STORE_FORWARD / tma_info_thread_clks", 1860 1603 "MetricGroup": "TopdownL4;tma_L4_group;tma_l1_bound_group", 1861 1604 "MetricName": "tma_store_fwd_blk", 1862 1605 "MetricThreshold": 
"tma_store_fwd_blk > 0.1 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", ··· 1865 1608 }, 1866 1609 { 1867 1610 "BriefDescription": "This metric estimates fraction of cycles the CPU spent handling L1D store misses", 1868 - "MetricExpr": "(MEM_STORE_RETIRED.L2_HIT * 10 * (1 - MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES) + (1 - MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES) * min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO)) / tma_info_clks", 1611 + "MetricExpr": "(MEM_STORE_RETIRED.L2_HIT * 10 * (1 - MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES) + (1 - MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES) * min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO)) / tma_info_thread_clks", 1869 1612 "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_L4_group;tma_issueRFO;tma_issueSL;tma_store_bound_group", 1870 1613 "MetricName": "tma_store_latency", 1871 1614 "MetricThreshold": "tma_store_latency > 0.1 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", ··· 1874 1617 }, 1875 1618 { 1876 1619 "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port for Store operations", 1877 - "MetricExpr": "(UOPS_DISPATCHED.PORT_4_9 + UOPS_DISPATCHED.PORT_7_8) / (4 * tma_info_core_clks)", 1620 + "MetricExpr": "(UOPS_DISPATCHED.PORT_4_9 + UOPS_DISPATCHED.PORT_7_8) / (4 * tma_info_core_core_clks)", 1878 1621 "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group", 1879 1622 "MetricName": "tma_store_op_utilization", 1880 1623 "MetricThreshold": "tma_store_op_utilization > 0.6", ··· 1891 1634 }, 1892 1635 { 1893 1636 "BriefDescription": "This metric estimates the fraction of cycles where the STLB was missed by store accesses, performing a hardware page walk", 1894 - "MetricExpr": "DTLB_STORE_MISSES.WALK_ACTIVE / tma_info_core_clks", 1637 + "MetricExpr": 
"DTLB_STORE_MISSES.WALK_ACTIVE / tma_info_core_core_clks", 1895 1638 "MetricGroup": "MemoryTLB;TopdownL5;tma_L5_group;tma_dtlb_store_group", 1896 1639 "MetricName": "tma_store_stlb_miss", 1897 1640 "MetricThreshold": "tma_store_stlb_miss > 0.05 & (tma_dtlb_store > 0.05 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))", ··· 1899 1642 }, 1900 1643 { 1901 1644 "BriefDescription": "This metric estimates how often CPU was stalled due to Streaming store memory accesses; Streaming store optimize out a read request required by RFO stores", 1902 - "MetricExpr": "9 * OCR.STREAMING_WR.ANY_RESPONSE / tma_info_clks", 1645 + "MetricExpr": "9 * OCR.STREAMING_WR.ANY_RESPONSE / tma_info_thread_clks", 1903 1646 "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_issueSmSt;tma_store_bound_group", 1904 1647 "MetricName": "tma_streaming_stores", 1905 1648 "MetricThreshold": "tma_streaming_stores > 0.2 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", ··· 1908 1651 }, 1909 1652 { 1910 1653 "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears", 1911 - "MetricExpr": "INT_MISC.UNKNOWN_BRANCH_CYCLES / tma_info_clks", 1654 + "MetricExpr": "INT_MISC.UNKNOWN_BRANCH_CYCLES / tma_info_thread_clks", 1912 1655 "MetricGroup": "BigFoot;FetchLat;TopdownL4;tma_L4_group;tma_branch_resteers_group", 1913 1656 "MetricName": "tma_unknown_branches", 1914 1657 "MetricThreshold": "tma_unknown_branches > 0.05 & (tma_branch_resteers > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15))", ··· 1951 1694 "MetricGroup": "transaction", 1952 1695 "MetricName": "tsx_transactional_cycles", 1953 1696 "ScaleUnit": "100%" 1697 + }, 1698 + { 1699 + "BriefDescription": "Uncore operating frequency in GHz", 1700 + "MetricExpr": "UNC_CHA_CLOCKTICKS / (source_count(UNC_CHA_CLOCKTICKS) * #num_packages) / 1e9 / duration_time", 1701 + "MetricName": "uncore_frequency", 1702 + "ScaleUnit": 
"1GHz" 1703 + }, 1704 + { 1705 + "BriefDescription": "Intel(R) Ultra Path Interconnect (UPI) data transmit bandwidth (MB/sec)", 1706 + "MetricExpr": "UNC_UPI_TxL_FLITS.ALL_DATA * 7.111111111111111 / 1e6 / duration_time", 1707 + "MetricName": "upi_data_transmit_bw", 1708 + "ScaleUnit": "1MB/s" 1954 1709 } 1955 1710 ]

+1 -1

tools/perf/pmu-events/arch/x86/sapphirerapids/uncore-interconnect.json

··· 464 464 "Unit": "M2M" 465 465 }, 466 466 { 467 - "BriefDescription": "Counts the time when FM didn? do d2c for fill reads (cross tile case)", 467 + "BriefDescription": "Counts the time when FM didn't do d2c for fill reads (cross tile case)", 468 468 "EventCode": "0x4a", 469 469 "EventName": "UNC_M2M_DIRECT2CORE_NOT_TAKEN_NOTFORKED", 470 470 "PerPkg": "1",

+4 -4

tools/perf/pmu-events/arch/x86/sapphirerapids/uncore-memory.json

··· 2480 2480 "Unit": "iMC" 2481 2481 }, 2482 2482 { 2483 - "BriefDescription": "DRAM Precharge commands. : Precharge due to (?)", 2483 + "BriefDescription": "DRAM Precharge commands", 2484 2484 "EventCode": "0x03", 2485 2485 "EventName": "UNC_M_PRE_COUNT.PGT", 2486 2486 "PerPkg": "1", 2487 - "PublicDescription": "DRAM Precharge commands. : Precharge due to (?) : Counts the number of DRAM Precharge commands sent on this channel.", 2487 + "PublicDescription": "DRAM Precharge commands. Counts the number of DRAM Precharge commands sent on this channel.", 2488 2488 "UMask": "0x88", 2489 2489 "Unit": "iMC" 2490 2490 }, ··· 3236 3236 "Unit": "iMC" 3237 3237 }, 3238 3238 { 3239 - "BriefDescription": "2LM Tag check hit due to memory read (bug?)", 3239 + "BriefDescription": "2LM Tag check hit due to memory read", 3240 3240 "EventCode": "0xd3", 3241 3241 "EventName": "UNC_M_TAGCHK.NM_RD_HIT", 3242 3242 "PerPkg": "1", ··· 3244 3244 "Unit": "iMC" 3245 3245 }, 3246 3246 { 3247 - "BriefDescription": "2LM Tag check hit due to memory write (bug?)", 3247 + "BriefDescription": "2LM Tag check hit due to memory write", 3248 3248 "EventCode": "0xd3", 3249 3249 "EventName": "UNC_M_TAGCHK.NM_WR_HIT", 3250 3250 "PerPkg": "1",