perf vendor events intel: Update tigerlake events/metrics

+1 -1

tools/perf/pmu-events/arch/x86/mapfile.csv

··· 29 29 GenuineIntel-6-(4E|5E|8E|9E|A5|A6),v56,skylake,core 30 30 GenuineIntel-6-55-[01234],v1.30,skylakex,core 31 31 GenuineIntel-6-86,v1.21,snowridgex,core 32 - GenuineIntel-6-8[CD],v1.10,tigerlake,core 32 + GenuineIntel-6-8[CD],v1.12,tigerlake,core 33 33 GenuineIntel-6-2C,v4,westmereep-dp,core 34 34 GenuineIntel-6-25,v3,westmereep-sp,core 35 35 GenuineIntel-6-2F,v3,westmereex,core

+18

tools/perf/pmu-events/arch/x86/tigerlake/cache.json

··· 323 323 "UMask": "0x2" 324 324 }, 325 325 { 326 + "BriefDescription": "Retired instructions with at least 1 uncacheable load or lock.", 327 + "Data_LA": "1", 328 + "EventCode": "0xd4", 329 + "EventName": "MEM_LOAD_MISC_RETIRED.UC", 330 + "PEBS": "1", 331 + "PublicDescription": "Retired instructions with at least one load to uncacheable memory-type, or at least one cache-line split locked access", 332 + "SampleAfterValue": "100007", 333 + "UMask": "0x4" 334 + }, 335 + { 326 336 "BriefDescription": "Number of completed demand load requests that missed the L1, but hit the FB(fill buffer), because a preceding miss to the same cacheline initiated the line to be brought into L1, but data is not yet ready in L1.", 327 337 "Data_LA": "1", 328 338 "EventCode": "0xd1", ··· 519 509 "PublicDescription": "Counts the number of off-core outstanding read-for-ownership (RFO) store transactions every cycle. An RFO transaction is considered to be in the Off-core outstanding state between L2 cache miss and transaction completion.", 520 510 "SampleAfterValue": "1000003", 521 511 "UMask": "0x4" 512 + }, 513 + { 514 + "BriefDescription": "Counts bus locks, accounts for cache line split locks and UC locks.", 515 + "EventCode": "0xf4", 516 + "EventName": "SQ_MISC.BUS_LOCK", 517 + "PublicDescription": "Counts the more expensive bus lock needed to enforce cache coherency for certain memory accesses that need to be done atomically. Can be created by issuing an atomic instruction (via the LOCK prefix) which causes a cache line split or accesses uncacheable memory.", 518 + "SampleAfterValue": "100003", 519 + "UMask": "0x10" 522 520 }, 523 521 { 524 522 "BriefDescription": "Cycles the superQ cannot take any more entries.",

+1

tools/perf/pmu-events/arch/x86/tigerlake/pipeline.json

··· 395 395 { 396 396 "BriefDescription": "Clears speculative count", 397 397 "CounterMask": "1", 398 + "EdgeDetect": "1", 398 399 "EventCode": "0x0d", 399 400 "EventName": "INT_MISC.CLEARS_COUNT", 400 401 "PublicDescription": "Counts the number of speculative clears due to any type of branch misprediction or machine clears",

+677 -677

tools/perf/pmu-events/arch/x86/tigerlake/tgl-metrics.json

··· 79 79 }, 80 80 { 81 81 "BriefDescription": "This metric estimates how often memory load accesses were aliased by preceding stores (in program order) with a 4K address offset", 82 - "MetricExpr": "LD_BLOCKS_PARTIAL.ADDRESS_ALIAS / tma_info_clks", 82 + "MetricExpr": "LD_BLOCKS_PARTIAL.ADDRESS_ALIAS / tma_info_thread_clks", 83 83 "MetricGroup": "TopdownL4;tma_L4_group;tma_l1_bound_group", 84 84 "MetricName": "tma_4k_aliasing", 85 85 "MetricThreshold": "tma_4k_aliasing > 0.2 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", ··· 88 88 }, 89 89 { 90 90 "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution ports for ALU operations.", 91 - "MetricExpr": "(UOPS_DISPATCHED.PORT_0 + UOPS_DISPATCHED.PORT_1 + UOPS_DISPATCHED.PORT_5 + UOPS_DISPATCHED.PORT_6) / (4 * tma_info_core_clks)", 91 + "MetricExpr": "(UOPS_DISPATCHED.PORT_0 + UOPS_DISPATCHED.PORT_1 + UOPS_DISPATCHED.PORT_5 + UOPS_DISPATCHED.PORT_6) / (4 * tma_info_core_core_clks)", 92 92 "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group", 93 93 "MetricName": "tma_alu_op_utilization", 94 94 "MetricThreshold": "tma_alu_op_utilization > 0.6", ··· 96 96 }, 97 97 { 98 98 "BriefDescription": "This metric estimates fraction of slots the CPU retired uops delivered by the Microcode_Sequencer as a result of Assists", 99 - "MetricExpr": "100 * ASSISTS.ANY / tma_info_slots", 99 + "MetricExpr": "100 * ASSISTS.ANY / tma_info_thread_slots", 100 100 "MetricGroup": "TopdownL4;tma_L4_group;tma_microcode_sequencer_group", 101 101 "MetricName": "tma_assists", 102 102 "MetricThreshold": "tma_assists > 0.1 & (tma_microcode_sequencer > 0.05 & tma_heavy_operations > 0.1)", ··· 105 105 }, 106 106 { 107 107 "BriefDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend", 108 - "MetricExpr": "topdown\\-be\\-bound / (topdown\\-fe\\-bound + 
topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 5 * cpu@INT_MISC.RECOVERY_CYCLES\\,cmask\\=1\\,edge@ / tma_info_slots", 108 + "MetricExpr": "topdown\\-be\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 5 * cpu@INT_MISC.RECOVERY_CYCLES\\,cmask\\=1\\,edge@ / tma_info_thread_slots", 109 109 "MetricGroup": "TmaL1;TopdownL1;tma_L1_group", 110 110 "MetricName": "tma_backend_bound", 111 111 "MetricThreshold": "tma_backend_bound > 0.2", ··· 125 125 }, 126 126 { 127 127 "BriefDescription": "This metric represents fraction of slots where the CPU was retiring branch instructions.", 128 - "MetricExpr": "tma_light_operations * BR_INST_RETIRED.ALL_BRANCHES / (tma_retiring * tma_info_slots)", 128 + "MetricExpr": "tma_light_operations * BR_INST_RETIRED.ALL_BRANCHES / (tma_retiring * tma_info_thread_slots)", 129 129 "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group", 130 130 "MetricName": "tma_branch_instructions", 131 131 "MetricThreshold": "tma_branch_instructions > 0.1 & tma_light_operations > 0.6", ··· 138 138 "MetricName": "tma_branch_mispredicts", 139 139 "MetricThreshold": "tma_branch_mispredicts > 0.1 & tma_bad_speculation > 0.15", 140 140 "MetricgroupNoGroup": "TopdownL2", 141 - "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction. These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path. Sample with: BR_MISP_RETIRED.ALL_BRANCHES. Related metrics: tma_info_branch_misprediction_cost, tma_info_mispredictions, tma_mispredicts_resteers", 141 + "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction. 
These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path. Sample with: BR_MISP_RETIRED.ALL_BRANCHES. Related metrics: tma_info_bad_spec_branch_misprediction_cost, tma_info_bottleneck_mispredictions, tma_mispredicts_resteers", 142 142 "ScaleUnit": "100%" 143 143 }, 144 144 { 145 145 "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers", 146 - "MetricExpr": "INT_MISC.CLEAR_RESTEER_CYCLES / tma_info_clks + tma_unknown_branches", 146 + "MetricExpr": "INT_MISC.CLEAR_RESTEER_CYCLES / tma_info_thread_clks + tma_unknown_branches", 147 147 "MetricGroup": "FetchLat;TopdownL3;tma_L3_group;tma_fetch_latency_group", 148 148 "MetricName": "tma_branch_resteers", 149 149 "MetricThreshold": "tma_branch_resteers > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", ··· 161 161 }, 162 162 { 163 163 "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Machine Clears", 164 - "MetricExpr": "(1 - BR_MISP_RETIRED.ALL_BRANCHES / (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT)) * INT_MISC.CLEAR_RESTEER_CYCLES / tma_info_clks", 164 + "MetricExpr": "(1 - BR_MISP_RETIRED.ALL_BRANCHES / (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT)) * INT_MISC.CLEAR_RESTEER_CYCLES / tma_info_thread_clks", 165 165 "MetricGroup": "BadSpec;MachineClears;TopdownL4;tma_L4_group;tma_branch_resteers_group;tma_issueMC", 166 166 "MetricName": "tma_clears_resteers", 167 167 "MetricThreshold": "tma_clears_resteers > 0.05 & (tma_branch_resteers > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15))", ··· 171 171 { 172 172 "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to contested accesses", 173 173 "MetricConstraint": "NO_GROUP_EVENTS", 174 - "MetricExpr": "(49 
* tma_info_average_frequency * (MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD * (OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM / (OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM + OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD))) + 48 * tma_info_average_frequency * MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_clks", 174 + "MetricExpr": "(49 * tma_info_system_average_frequency * (MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD * (OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM / (OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM + OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD))) + 48 * tma_info_system_average_frequency * MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", 175 175 "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_l3_bound_group", 176 176 "MetricName": "tma_contested_accesses", 177 177 "MetricThreshold": "tma_contested_accesses > 0.05 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", ··· 191 191 { 192 192 "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to data-sharing accesses", 193 193 "MetricConstraint": "NO_GROUP_EVENTS", 194 - "MetricExpr": "48 * tma_info_average_frequency * (MEM_LOAD_L3_HIT_RETIRED.XSNP_NO_FWD + MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD * (1 - OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM / (OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM + OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD))) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_clks", 194 + "MetricExpr": "48 * tma_info_system_average_frequency * (MEM_LOAD_L3_HIT_RETIRED.XSNP_NO_FWD + MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD * (1 - OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM / (OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM + OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD))) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", 195 195 "MetricGroup": 
"Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_l3_bound_group", 196 196 "MetricName": "tma_data_sharing", 197 197 "MetricThreshold": "tma_data_sharing > 0.05 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", ··· 200 200 }, 201 201 { 202 202 "BriefDescription": "This metric represents fraction of cycles where decoder-0 was the only active decoder", 203 - "MetricExpr": "(cpu@INST_DECODED.DECODERS\\,cmask\\=1@ - cpu@INST_DECODED.DECODERS\\,cmask\\=2@) / tma_info_core_clks / 2", 203 + "MetricExpr": "(cpu@INST_DECODED.DECODERS\\,cmask\\=1@ - cpu@INST_DECODED.DECODERS\\,cmask\\=2@) / tma_info_core_core_clks / 2", 204 204 "MetricGroup": "DSBmiss;FetchBW;TopdownL4;tma_L4_group;tma_issueD0;tma_mite_group", 205 205 "MetricName": "tma_decoder0_alone", 206 - "MetricThreshold": "tma_decoder0_alone > 0.1 & (tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 5 > 0.35))", 206 + "MetricThreshold": "tma_decoder0_alone > 0.1 & (tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 5 > 0.35))", 207 207 "PublicDescription": "This metric represents fraction of cycles where decoder-0 was the only active decoder. 
Related metrics: tma_few_uops_instructions", 208 208 "ScaleUnit": "100%" 209 209 }, 210 210 { 211 211 "BriefDescription": "This metric represents fraction of cycles where the Divider unit was active", 212 - "MetricExpr": "ARITH.DIVIDER_ACTIVE / tma_info_clks", 212 + "MetricExpr": "ARITH.DIVIDER_ACTIVE / tma_info_thread_clks", 213 213 "MetricGroup": "TopdownL3;tma_L3_group;tma_core_bound_group", 214 214 "MetricName": "tma_divider", 215 215 "MetricThreshold": "tma_divider > 0.2 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)", ··· 219 219 { 220 220 "BriefDescription": "This metric estimates how often the CPU was stalled on accesses to external memory (DRAM) by loads", 221 221 "MetricConstraint": "NO_GROUP_EVENTS", 222 - "MetricExpr": "CYCLE_ACTIVITY.STALLS_L3_MISS / tma_info_clks + (CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS) / tma_info_clks - tma_l2_bound", 222 + "MetricExpr": "CYCLE_ACTIVITY.STALLS_L3_MISS / tma_info_thread_clks + (CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS) / tma_info_thread_clks - tma_l2_bound", 223 223 "MetricGroup": "MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", 224 224 "MetricName": "tma_dram_bound", 225 225 "MetricThreshold": "tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", ··· 228 228 }, 229 229 { 230 230 "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline", 231 - "MetricExpr": "(IDQ.DSB_CYCLES_ANY - IDQ.DSB_CYCLES_OK) / tma_info_core_clks / 2", 231 + "MetricExpr": "(IDQ.DSB_CYCLES_ANY - IDQ.DSB_CYCLES_OK) / tma_info_core_core_clks / 2", 232 232 "MetricGroup": "DSB;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group", 233 233 "MetricName": "tma_dsb", 234 - "MetricThreshold": "tma_dsb > 0.15 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 5 > 0.35)", 234 + "MetricThreshold": "tma_dsb > 0.15 & (tma_fetch_bandwidth > 0.1 & 
tma_frontend_bound > 0.15 & tma_info_thread_ipc / 5 > 0.35)", 235 235 "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline. For example; inefficient utilization of the DSB cache structure or bank conflict when reading from it; are categorized here.", 236 236 "ScaleUnit": "100%" 237 237 }, 238 238 { 239 239 "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines", 240 - "MetricExpr": "DSB2MITE_SWITCHES.PENALTY_CYCLES / tma_info_clks", 240 + "MetricExpr": "DSB2MITE_SWITCHES.PENALTY_CYCLES / tma_info_thread_clks", 241 241 "MetricGroup": "DSBmiss;FetchLat;TopdownL3;tma_L3_group;tma_fetch_latency_group;tma_issueFB", 242 242 "MetricName": "tma_dsb_switches", 243 243 "MetricThreshold": "tma_dsb_switches > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", 244 - "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines. The DSB (decoded i-cache) is a Uop Cache where the front-end directly delivers Uops (micro operations) avoiding heavy x86 decoding. The DSB pipeline has shorter latency and delivered higher bandwidth than the MITE (legacy instruction decode pipeline). Switching between the two pipelines can cause penalties hence this metric measures the exposed penalty. Sample with: FRONTEND_RETIRED.DSB_MISS_PS. Related metrics: tma_fetch_bandwidth, tma_info_dsb_coverage, tma_info_dsb_misses, tma_info_iptb, tma_lcp", 244 + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines. The DSB (decoded i-cache) is a Uop Cache where the front-end directly delivers Uops (micro operations) avoiding heavy x86 decoding. The DSB pipeline has shorter latency and delivered higher bandwidth than the MITE (legacy instruction decode pipeline). 
Switching between the two pipelines can cause penalties hence this metric measures the exposed penalty. Sample with: FRONTEND_RETIRED.DSB_MISS_PS. Related metrics: tma_fetch_bandwidth, tma_info_botlnk_l2_dsb_misses, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb, tma_lcp", 245 245 "ScaleUnit": "100%" 246 246 }, 247 247 { 248 248 "BriefDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses", 249 - "MetricExpr": "min(7 * cpu@DTLB_LOAD_MISSES.STLB_HIT\\,cmask\\=1@ + DTLB_LOAD_MISSES.WALK_ACTIVE, max(CYCLE_ACTIVITY.CYCLES_MEM_ANY - CYCLE_ACTIVITY.CYCLES_L1D_MISS, 0)) / tma_info_clks", 249 + "MetricExpr": "min(7 * cpu@DTLB_LOAD_MISSES.STLB_HIT\\,cmask\\=1@ + DTLB_LOAD_MISSES.WALK_ACTIVE, max(CYCLE_ACTIVITY.CYCLES_MEM_ANY - CYCLE_ACTIVITY.CYCLES_L1D_MISS, 0)) / tma_info_thread_clks", 250 250 "MetricGroup": "MemoryTLB;TopdownL4;tma_L4_group;tma_issueTLB;tma_l1_bound_group", 251 251 "MetricName": "tma_dtlb_load", 252 252 "MetricThreshold": "tma_dtlb_load > 0.1 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", 253 - "PublicDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses. TLBs (Translation Look-aside Buffers) are processor caches for recently used entries out of the Page Tables that are used to map virtual- to physical-addresses by the operating system. This metric approximates the potential delay of demand loads missing the first-level data TLB (assuming worst case scenario with back to back misses to different pages). This includes hitting in the second-level TLB (STLB) as well as performing a hardware page walk on an STLB miss. Sample with: MEM_INST_RETIRED.STLB_MISS_LOADS_PS. Related metrics: tma_dtlb_store, tma_info_memory_data_tlbs", 253 + "PublicDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses. 
TLBs (Translation Look-aside Buffers) are processor caches for recently used entries out of the Page Tables that are used to map virtual- to physical-addresses by the operating system. This metric approximates the potential delay of demand loads missing the first-level data TLB (assuming worst case scenario with back to back misses to different pages). This includes hitting in the second-level TLB (STLB) as well as performing a hardware page walk on an STLB miss. Sample with: MEM_INST_RETIRED.STLB_MISS_LOADS_PS. Related metrics: tma_dtlb_store, tma_info_bottleneck_memory_data_tlbs", 254 254 "ScaleUnit": "100%" 255 255 }, 256 256 { 257 257 "BriefDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses", 258 - "MetricExpr": "(7 * cpu@DTLB_STORE_MISSES.STLB_HIT\\,cmask\\=1@ + DTLB_STORE_MISSES.WALK_ACTIVE) / tma_info_core_clks", 258 + "MetricExpr": "(7 * cpu@DTLB_STORE_MISSES.STLB_HIT\\,cmask\\=1@ + DTLB_STORE_MISSES.WALK_ACTIVE) / tma_info_core_core_clks", 259 259 "MetricGroup": "MemoryTLB;TopdownL4;tma_L4_group;tma_issueTLB;tma_store_bound_group", 260 260 "MetricName": "tma_dtlb_store", 261 261 "MetricThreshold": "tma_dtlb_store > 0.05 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", 262 - "PublicDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses. As with ordinary data caching; focus on improving data locality and reducing working-set size to reduce DTLB overhead. Additionally; consider using profile-guided optimization (PGO) to collocate frequently-used data on the same page. Try using larger page sizes for large amounts of frequently-used data. Sample with: MEM_INST_RETIRED.STLB_MISS_STORES_PS. Related metrics: tma_dtlb_load, tma_info_memory_data_tlbs", 262 + "PublicDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses. 
As with ordinary data caching; focus on improving data locality and reducing working-set size to reduce DTLB overhead. Additionally; consider using profile-guided optimization (PGO) to collocate frequently-used data on the same page. Try using larger page sizes for large amounts of frequently-used data. Sample with: MEM_INST_RETIRED.STLB_MISS_STORES_PS. Related metrics: tma_dtlb_load, tma_info_bottleneck_memory_data_tlbs", 263 263 "ScaleUnit": "100%" 264 264 }, 265 265 { 266 266 "BriefDescription": "This metric roughly estimates how often CPU was handling synchronizations due to False Sharing", 267 - "MetricExpr": "54 * tma_info_average_frequency * OCR.DEMAND_RFO.L3_HIT.SNOOP_HITM / tma_info_clks", 267 + "MetricExpr": "54 * tma_info_system_average_frequency * OCR.DEMAND_RFO.L3_HIT.SNOOP_HITM / tma_info_thread_clks", 268 268 "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_store_bound_group", 269 269 "MetricName": "tma_false_sharing", 270 270 "MetricThreshold": "tma_false_sharing > 0.05 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", ··· 273 273 }, 274 274 { 275 275 "BriefDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed", 276 - "MetricExpr": "L1D_PEND_MISS.FB_FULL / tma_info_clks", 276 + "MetricExpr": "L1D_PEND_MISS.FB_FULL / tma_info_thread_clks", 277 277 "MetricGroup": "MemoryBW;TopdownL4;tma_L4_group;tma_issueBW;tma_issueSL;tma_issueSmSt;tma_l1_bound_group", 278 278 "MetricName": "tma_fb_full", 279 279 "MetricThreshold": "tma_fb_full > 0.3", 280 - "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). 
Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory). Related metrics: tma_info_dram_bw_use, tma_info_memory_bandwidth, tma_mem_bandwidth, tma_sq_full, tma_store_latency, tma_streaming_stores", 280 + "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory). Related metrics: tma_info_bottleneck_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full, tma_store_latency, tma_streaming_stores", 281 281 "ScaleUnit": "100%" 282 282 }, 283 283 { ··· 285 285 "MetricExpr": "max(0, tma_frontend_bound - tma_fetch_latency)", 286 286 "MetricGroup": "FetchBW;Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group;tma_issueFB", 287 287 "MetricName": "tma_fetch_bandwidth", 288 - "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 5 > 0.35", 288 + "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 5 > 0.35", 289 289 "MetricgroupNoGroup": "TopdownL2", 290 - "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Sample with: FRONTEND_RETIRED.LATENCY_GE_2_BUBBLES_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_2_PS. 
Related metrics: tma_dsb_switches, tma_info_dsb_coverage, tma_info_dsb_misses, tma_info_iptb, tma_lcp", 290 + "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Sample with: FRONTEND_RETIRED.LATENCY_GE_2_BUBBLES_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_2_PS. Related metrics: tma_dsb_switches, tma_info_botlnk_l2_dsb_misses, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb, tma_lcp", 291 291 "ScaleUnit": "100%" 292 292 }, 293 293 { 294 294 "BriefDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues", 295 - "MetricExpr": "(5 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE - INT_MISC.UOP_DROPPING) / tma_info_slots", 295 + "MetricExpr": "(5 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE - INT_MISC.UOP_DROPPING) / tma_info_thread_slots", 296 296 "MetricGroup": "Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group", 297 297 "MetricName": "tma_fetch_latency", 298 298 "MetricThreshold": "tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15", ··· 321 321 }, 322 322 { 323 323 "BriefDescription": "This metric approximates arithmetic floating-point (FP) scalar uops fraction the CPU has retired", 324 - "MetricExpr": "cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ / (tma_retiring * tma_info_slots)", 324 + "MetricExpr": "cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ / (tma_retiring * tma_info_thread_slots)", 325 325 "MetricGroup": "Compute;Flops;TopdownL4;tma_L4_group;tma_fp_arith_group;tma_issue2P", 326 326 "MetricName": "tma_fp_scalar", 327 327 "MetricThreshold": "tma_fp_scalar > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6)", ··· 330 330 }, 331 
331 { 332 332 "BriefDescription": "This metric approximates arithmetic floating-point (FP) vector uops fraction the CPU has retired aggregated across all vector widths", 333 - "MetricExpr": "cpu@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE\\,umask\\=0xfc@ / (tma_retiring * tma_info_slots)", 333 + "MetricExpr": "cpu@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE\\,umask\\=0xfc@ / (tma_retiring * tma_info_thread_slots)", 334 334 "MetricGroup": "Compute;Flops;TopdownL4;tma_L4_group;tma_fp_arith_group;tma_issue2P", 335 335 "MetricName": "tma_fp_vector", 336 336 "MetricThreshold": "tma_fp_vector > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6)", ··· 339 339 }, 340 340 { 341 341 "BriefDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 128-bit wide vectors", 342 - "MetricExpr": "(FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE) / (tma_retiring * tma_info_slots)", 342 + "MetricExpr": "(FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE) / (tma_retiring * tma_info_thread_slots)", 343 343 "MetricGroup": "Compute;Flops;TopdownL5;tma_L5_group;tma_fp_vector_group;tma_issue2P", 344 344 "MetricName": "tma_fp_vector_128b", 345 345 "MetricThreshold": "tma_fp_vector_128b > 0.1 & (tma_fp_vector > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6))", ··· 348 348 }, 349 349 { 350 350 "BriefDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 256-bit wide vectors", 351 - "MetricExpr": "(FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) / (tma_retiring * tma_info_slots)", 351 + "MetricExpr": "(FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) / (tma_retiring * tma_info_thread_slots)", 352 352 "MetricGroup": "Compute;Flops;TopdownL5;tma_L5_group;tma_fp_vector_group;tma_issue2P", 353 353 "MetricName": "tma_fp_vector_256b", 354 354 "MetricThreshold": 
"tma_fp_vector_256b > 0.1 & (tma_fp_vector > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6))", ··· 357 357 }, 358 358 { 359 359 "BriefDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 512-bit wide vectors", 360 - "MetricExpr": "(FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / (tma_retiring * tma_info_slots)", 360 + "MetricExpr": "(FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / (tma_retiring * tma_info_thread_slots)", 361 361 "MetricGroup": "Compute;Flops;TopdownL5;tma_L5_group;tma_fp_vector_group;tma_issue2P", 362 362 "MetricName": "tma_fp_vector_512b", 363 363 "MetricThreshold": "tma_fp_vector_512b > 0.1 & (tma_fp_vector > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6))", ··· 366 366 }, 367 367 { 368 368 "BriefDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend", 369 - "MetricExpr": "topdown\\-fe\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) - INT_MISC.UOP_DROPPING / tma_info_slots", 369 + "MetricExpr": "topdown\\-fe\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) - INT_MISC.UOP_DROPPING / tma_info_thread_slots", 370 370 "MetricGroup": "PGO;TmaL1;TopdownL1;tma_L1_group", 371 371 "MetricName": "tma_frontend_bound", 372 372 "MetricThreshold": "tma_frontend_bound > 0.15", ··· 386 386 }, 387 387 { 388 388 "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses", 389 - "MetricExpr": "ICACHE_16B.IFDATA_STALL / tma_info_clks", 389 + "MetricExpr": "ICACHE_16B.IFDATA_STALL / tma_info_thread_clks", 390 390 "MetricGroup": "BigFoot;FetchLat;IcMiss;TopdownL3;tma_L3_group;tma_fetch_latency_group", 391 391 "MetricName": "tma_icache_misses", 392 392 "MetricThreshold": "tma_icache_misses > 0.05 & 
(tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", ··· 394 394 "ScaleUnit": "100%" 395 395 }, 396 396 { 397 - "BriefDescription": "Measured Average Frequency for unhalted processors [GHz]", 398 - "MetricExpr": "tma_info_turbo_utilization * TSC / 1e9 / duration_time", 399 - "MetricGroup": "Power;Summary", 400 - "MetricName": "tma_info_average_frequency" 401 - }, 402 - { 403 - "BriefDescription": "Total pipeline cost of instruction fetch related bottlenecks by large code footprint programs (i-side cache; TLB and BTB misses)", 404 - "MetricConstraint": "NO_GROUP_EVENTS", 405 - "MetricExpr": "100 * tma_fetch_latency * (tma_itlb_misses + tma_icache_misses + tma_unknown_branches) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)", 406 - "MetricGroup": "BigFoot;Fed;Frontend;IcMiss;MemoryTLB;tma_issueBC", 407 - "MetricName": "tma_info_big_code", 408 - "MetricThreshold": "tma_info_big_code > 20", 409 - "PublicDescription": "Total pipeline cost of instruction fetch related bottlenecks by large code footprint programs (i-side cache; TLB and BTB misses). 
Related metrics: tma_info_branching_overhead" 410 - }, 411 - { 412 - "BriefDescription": "Branch instructions per taken branch.", 413 - "MetricExpr": "BR_INST_RETIRED.ALL_BRANCHES / BR_INST_RETIRED.NEAR_TAKEN", 414 - "MetricGroup": "Branches;Fed;PGO", 415 - "MetricName": "tma_info_bptkbranch" 416 - }, 417 - { 418 397 "BriefDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear)", 419 398 "MetricConstraint": "NO_GROUP_EVENTS", 420 - "MetricExpr": "(tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) * tma_info_slots / BR_MISP_RETIRED.ALL_BRANCHES", 399 + "MetricExpr": "(tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) * tma_info_thread_slots / BR_MISP_RETIRED.ALL_BRANCHES", 421 400 "MetricGroup": "Bad;BrMispredicts;tma_issueBM", 422 - "MetricName": "tma_info_branch_misprediction_cost", 423 - "PublicDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear). Related metrics: tma_branch_mispredicts, tma_info_mispredictions, tma_mispredicts_resteers" 401 + "MetricName": "tma_info_bad_spec_branch_misprediction_cost", 402 + "PublicDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear). 
Related metrics: tma_branch_mispredicts, tma_info_bottleneck_mispredictions, tma_mispredicts_resteers" 424 403 }, 425 404 { 426 - "BriefDescription": "Total pipeline cost of branch related instructions (used for program control-flow including function calls)", 427 - "MetricExpr": "100 * ((BR_INST_RETIRED.COND + 3 * BR_INST_RETIRED.NEAR_CALL + (BR_INST_RETIRED.NEAR_TAKEN - BR_INST_RETIRED.COND_TAKEN - 2 * BR_INST_RETIRED.NEAR_CALL)) / tma_info_slots)", 428 - "MetricGroup": "Ret;tma_issueBC", 429 - "MetricName": "tma_info_branching_overhead", 430 - "MetricThreshold": "tma_info_branching_overhead > 10", 431 - "PublicDescription": "Total pipeline cost of branch related instructions (used for program control-flow including function calls). Related metrics: tma_info_big_code" 405 + "BriefDescription": "Instructions per retired mispredicts for conditional non-taken branches (lower number means higher occurrence rate).", 406 + "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.COND_NTAKEN", 407 + "MetricGroup": "Bad;BrMispredicts", 408 + "MetricName": "tma_info_bad_spec_ipmisp_cond_ntaken", 409 + "MetricThreshold": "tma_info_bad_spec_ipmisp_cond_ntaken < 200" 432 410 }, 433 411 { 434 - "BriefDescription": "Fraction of branches that are CALL or RET", 435 - "MetricExpr": "(BR_INST_RETIRED.NEAR_CALL + BR_INST_RETIRED.NEAR_RETURN) / BR_INST_RETIRED.ALL_BRANCHES", 436 - "MetricGroup": "Bad;Branches", 437 - "MetricName": "tma_info_callret" 412 + "BriefDescription": "Instructions per retired mispredicts for conditional taken branches (lower number means higher occurrence rate).", 413 + "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.COND_TAKEN", 414 + "MetricGroup": "Bad;BrMispredicts", 415 + "MetricName": "tma_info_bad_spec_ipmisp_cond_taken", 416 + "MetricThreshold": "tma_info_bad_spec_ipmisp_cond_taken < 200" 438 417 }, 439 418 { 440 - "BriefDescription": "Per-Logical Processor actual clocks when the Logical Processor is active.", 441 - "MetricExpr": 
"CPU_CLK_UNHALTED.THREAD", 442 - "MetricGroup": "Pipeline", 443 - "MetricName": "tma_info_clks" 419 + "BriefDescription": "Instructions per retired mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate).", 420 + "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.INDIRECT", 421 + "MetricGroup": "Bad;BrMispredicts", 422 + "MetricName": "tma_info_bad_spec_ipmisp_indirect", 423 + "MetricThreshold": "tma_info_bad_spec_ipmisp_indirect < 1e3" 444 424 }, 445 425 { 446 - "BriefDescription": "STLB (2nd level TLB) code speculative misses per kilo instruction (misses of any page-size that complete the page walk)", 447 - "MetricExpr": "1e3 * ITLB_MISSES.WALK_COMPLETED / INST_RETIRED.ANY", 448 - "MetricGroup": "Fed;MemoryTLB", 449 - "MetricName": "tma_info_code_stlb_mpki" 426 + "BriefDescription": "Instructions per retired mispredicts for return branches (lower number means higher occurrence rate).", 427 + "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.RET", 428 + "MetricGroup": "Bad;BrMispredicts", 429 + "MetricName": "tma_info_bad_spec_ipmisp_ret", 430 + "MetricThreshold": "tma_info_bad_spec_ipmisp_ret < 500" 450 431 }, 451 432 { 452 - "BriefDescription": "Fraction of branches that are non-taken conditionals", 453 - "MetricExpr": "BR_INST_RETIRED.COND_NTAKEN / BR_INST_RETIRED.ALL_BRANCHES", 454 - "MetricGroup": "Bad;Branches;CodeGen;PGO", 455 - "MetricName": "tma_info_cond_nt" 456 - }, 457 - { 458 - "BriefDescription": "Fraction of branches that are taken conditionals", 459 - "MetricExpr": "BR_INST_RETIRED.COND_TAKEN / BR_INST_RETIRED.ALL_BRANCHES", 460 - "MetricGroup": "Bad;Branches;CodeGen;PGO", 461 - "MetricName": "tma_info_cond_tk" 433 + "BriefDescription": "Number of Instructions per non-speculative Branch Misprediction (JEClear) (lower number means higher occurrence rate)", 434 + "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.ALL_BRANCHES", 435 + "MetricGroup": "Bad;BadSpec;BrMispredicts", 436 + "MetricName": 
"tma_info_bad_spec_ipmispredict", 437 + "MetricThreshold": "tma_info_bad_spec_ipmispredict < 200" 462 438 }, 463 439 { 464 440 "BriefDescription": "Probability of Core Bound bottleneck hidden by SMT-profiling artifacts", 465 441 "MetricConstraint": "NO_GROUP_EVENTS", 466 - "MetricExpr": "(100 * (1 - tma_core_bound / tma_ports_utilization if tma_core_bound < tma_ports_utilization else 1) if tma_info_smt_2t_utilization > 0.5 else 0)", 442 + "MetricExpr": "(100 * (1 - tma_core_bound / tma_ports_utilization if tma_core_bound < tma_ports_utilization else 1) if tma_info_system_smt_2t_utilization > 0.5 else 0)", 467 443 "MetricGroup": "Cor;SMT", 468 - "MetricName": "tma_info_core_bound_likely", 469 - "MetricThreshold": "tma_info_core_bound_likely > 0.5" 470 - }, 471 - { 472 - "BriefDescription": "Core actual clocks when any Logical Processor is active on the Physical Core", 473 - "MetricExpr": "CPU_CLK_UNHALTED.DISTRIBUTED", 474 - "MetricGroup": "SMT", 475 - "MetricName": "tma_info_core_clks" 476 - }, 477 - { 478 - "BriefDescription": "Instructions Per Cycle across hyper-threads (per physical core)", 479 - "MetricExpr": "INST_RETIRED.ANY / tma_info_core_clks", 480 - "MetricGroup": "Ret;SMT;TmaL1;tma_L1_group", 481 - "MetricName": "tma_info_coreipc" 482 - }, 483 - { 484 - "BriefDescription": "Cycles Per Instruction (per Logical Processor)", 485 - "MetricExpr": "1 / tma_info_ipc", 486 - "MetricGroup": "Mem;Pipeline", 487 - "MetricName": "tma_info_cpi" 488 - }, 489 - { 490 - "BriefDescription": "Average CPU Utilization", 491 - "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / TSC", 492 - "MetricGroup": "HPC;Summary", 493 - "MetricName": "tma_info_cpu_utilization" 494 - }, 495 - { 496 - "BriefDescription": "Average Parallel L2 cache miss data reads", 497 - "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD", 498 - "MetricGroup": "Memory_BW;Offcore", 499 - "MetricName": "tma_info_data_l2_mlp" 500 - }, 501 - { 502 - 
"BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]", 503 - "MetricExpr": "64 * (arb@event\\=0x81\\,umask\\=0x1@ + arb@event\\=0x84\\,umask\\=0x1@) / 1e6 / duration_time / 1e3", 504 - "MetricGroup": "HPC;Mem;MemoryBW;SoC;tma_issueBW", 505 - "MetricName": "tma_info_dram_bw_use", 506 - "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_fb_full, tma_info_memory_bandwidth, tma_mem_bandwidth, tma_sq_full" 507 - }, 508 - { 509 - "BriefDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache)", 510 - "MetricExpr": "IDQ.DSB_UOPS / UOPS_ISSUED.ANY", 511 - "MetricGroup": "DSB;Fed;FetchBW;tma_issueFB", 512 - "MetricName": "tma_info_dsb_coverage", 513 - "MetricThreshold": "tma_info_dsb_coverage < 0.7 & tma_info_ipc / 5 > 0.35", 514 - "PublicDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache). Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_dsb_misses, tma_info_iptb, tma_lcp" 444 + "MetricName": "tma_info_botlnk_l0_core_bound_likely", 445 + "MetricThreshold": "tma_info_botlnk_l0_core_bound_likely > 0.5" 515 446 }, 516 447 { 517 448 "BriefDescription": "Total pipeline cost of DSB (uop cache) misses - subset of the Instruction_Fetch_BW Bottleneck", 518 449 "MetricConstraint": "NO_GROUP_EVENTS", 519 450 "MetricExpr": "100 * (tma_fetch_latency * tma_dsb_switches / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) + tma_fetch_bandwidth * tma_mite / (tma_dsb + tma_lsd + tma_mite))", 520 451 "MetricGroup": "DSBmiss;Fed;tma_issueFB", 521 - "MetricName": "tma_info_dsb_misses", 522 - "MetricThreshold": "tma_info_dsb_misses > 10", 523 - "PublicDescription": "Total pipeline cost of DSB (uop cache) misses - subset of the Instruction_Fetch_BW Bottleneck. 
Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_dsb_coverage, tma_info_iptb, tma_lcp" 524 - }, 525 - { 526 - "BriefDescription": "Average number of cycles of a switch from the DSB fetch-unit to MITE fetch unit - see DSB_Switches tree node for details.", 527 - "MetricExpr": "DSB2MITE_SWITCHES.PENALTY_CYCLES / cpu@DSB2MITE_SWITCHES.PENALTY_CYCLES\\,cmask\\=1\\,edge@", 528 - "MetricGroup": "DSBmiss", 529 - "MetricName": "tma_info_dsb_switch_cost" 530 - }, 531 - { 532 - "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-thread", 533 - "MetricExpr": "UOPS_EXECUTED.THREAD / cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@", 534 - "MetricGroup": "Cor;Pipeline;PortsUtil;SMT", 535 - "MetricName": "tma_info_execute" 536 - }, 537 - { 538 - "BriefDescription": "The ratio of Executed- by Issued-Uops", 539 - "MetricExpr": "UOPS_EXECUTED.THREAD / UOPS_ISSUED.ANY", 540 - "MetricGroup": "Cor;Pipeline", 541 - "MetricName": "tma_info_execute_per_issue", 542 - "PublicDescription": "The ratio of Executed- by Issued-Uops. Ratio > 1 suggests high rate of uop micro-fusions. Ratio < 1 suggest high rate of \"execute\" at rename stage." 
543 - }, 544 - { 545 - "BriefDescription": "Fill Buffer (FB) hits per kilo instructions for retired demand loads (L1D misses that merge into ongoing miss-handling entries)", 546 - "MetricExpr": "1e3 * MEM_LOAD_RETIRED.FB_HIT / INST_RETIRED.ANY", 547 - "MetricGroup": "CacheMisses;Mem", 548 - "MetricName": "tma_info_fb_hpki" 549 - }, 550 - { 551 - "BriefDescription": "Average number of Uops issued by front-end when it issued something", 552 - "MetricExpr": "UOPS_ISSUED.ANY / cpu@UOPS_ISSUED.ANY\\,cmask\\=1@", 553 - "MetricGroup": "Fed;FetchBW", 554 - "MetricName": "tma_info_fetch_upc" 555 - }, 556 - { 557 - "BriefDescription": "Floating Point Operations Per Cycle", 558 - "MetricConstraint": "NO_GROUP_EVENTS", 559 - "MetricExpr": "(cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * cpu@FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE\\,umask\\=0x18@ + 8 * cpu@FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE\\,umask\\=0x60@ + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / tma_info_core_clks", 560 - "MetricGroup": "Flops;Ret", 561 - "MetricName": "tma_info_flopc" 562 - }, 563 - { 564 - "BriefDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width)", 565 - "MetricConstraint": "NO_GROUP_EVENTS", 566 - "MetricExpr": "(cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ + cpu@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE\\,umask\\=0xfc@) / (2 * tma_info_core_clks)", 567 - "MetricGroup": "Cor;Flops;HPC", 568 - "MetricName": "tma_info_fp_arith_utilization", 569 - "PublicDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width). Values > 1 are possible due to ([BDW+] Fused-Multiply Add (FMA) counting - common; [ADL+] use all of ADD/MUL/FMA in Scalar or 128/256-bit vectors - less common)." 
570 - }, 571 - { 572 - "BriefDescription": "Giga Floating Point Operations Per Second", 573 - "MetricExpr": "(cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * cpu@FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE\\,umask\\=0x18@ + 8 * cpu@FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE\\,umask\\=0x60@ + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / 1e9 / duration_time", 574 - "MetricGroup": "Cor;Flops;HPC", 575 - "MetricName": "tma_info_gflops", 576 - "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width and AMX engine." 452 + "MetricName": "tma_info_botlnk_l2_dsb_misses", 453 + "MetricThreshold": "tma_info_botlnk_l2_dsb_misses > 10", 454 + "PublicDescription": "Total pipeline cost of DSB (uop cache) misses - subset of the Instruction_Fetch_BW Bottleneck. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb, tma_lcp" 577 455 }, 578 456 { 579 457 "BriefDescription": "Total pipeline cost of Instruction Cache misses - subset of the Big_Code Bottleneck", 580 458 "MetricExpr": "100 * (tma_fetch_latency * tma_icache_misses / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches))", 581 459 "MetricGroup": "Fed;FetchLat;IcMiss;tma_issueFL", 582 - "MetricName": "tma_info_ic_misses", 583 - "MetricThreshold": "tma_info_ic_misses > 5", 460 + "MetricName": "tma_info_botlnk_l2_ic_misses", 461 + "MetricThreshold": "tma_info_botlnk_l2_ic_misses > 5", 584 462 "PublicDescription": "Total pipeline cost of Instruction Cache misses - subset of the Big_Code Bottleneck. 
Related metrics: " 585 463 }, 586 464 { 587 - "BriefDescription": "Average Latency for L1 instruction cache misses", 588 - "MetricExpr": "ICACHE_16B.IFDATA_STALL / cpu@ICACHE_16B.IFDATA_STALL\\,cmask\\=1\\,edge@", 589 - "MetricGroup": "Fed;FetchLat;IcMiss", 590 - "MetricName": "tma_info_icache_miss_latency" 465 + "BriefDescription": "Total pipeline cost of instruction fetch related bottlenecks by large code footprint programs (i-side cache; TLB and BTB misses)", 466 + "MetricConstraint": "NO_GROUP_EVENTS", 467 + "MetricExpr": "100 * tma_fetch_latency * (tma_itlb_misses + tma_icache_misses + tma_unknown_branches) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)", 468 + "MetricGroup": "BigFoot;Fed;Frontend;IcMiss;MemoryTLB;tma_issueBC", 469 + "MetricName": "tma_info_bottleneck_big_code", 470 + "MetricThreshold": "tma_info_bottleneck_big_code > 20", 471 + "PublicDescription": "Total pipeline cost of instruction fetch related bottlenecks by large code footprint programs (i-side cache; TLB and BTB misses). 
Related metrics: tma_info_bottleneck_branching_overhead" 591 472 }, 592 473 { 593 - "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-core", 594 - "MetricExpr": "UOPS_EXECUTED.THREAD / (UOPS_EXECUTED.CORE_CYCLES_GE_1 / 2 if #SMT_on else UOPS_EXECUTED.CORE_CYCLES_GE_1)", 595 - "MetricGroup": "Backend;Cor;Pipeline;PortsUtil", 596 - "MetricName": "tma_info_ilp" 474 + "BriefDescription": "Total pipeline cost of branch related instructions (used for program control-flow including function calls)", 475 + "MetricExpr": "100 * ((BR_INST_RETIRED.COND + 3 * BR_INST_RETIRED.NEAR_CALL + (BR_INST_RETIRED.NEAR_TAKEN - BR_INST_RETIRED.COND_TAKEN - 2 * BR_INST_RETIRED.NEAR_CALL)) / tma_info_thread_slots)", 476 + "MetricGroup": "Ret;tma_issueBC", 477 + "MetricName": "tma_info_bottleneck_branching_overhead", 478 + "MetricThreshold": "tma_info_bottleneck_branching_overhead > 10", 479 + "PublicDescription": "Total pipeline cost of branch related instructions (used for program control-flow including function calls). 
Related metrics: tma_info_bottleneck_big_code" 597 480 }, 598 481 { 599 482 "BriefDescription": "Total pipeline cost of instruction fetch bandwidth related bottlenecks", 600 483 "MetricConstraint": "NO_GROUP_EVENTS", 601 - "MetricExpr": "100 * (tma_frontend_bound - tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) - tma_info_big_code", 484 + "MetricExpr": "100 * (tma_frontend_bound - tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) - tma_info_bottleneck_big_code", 602 485 "MetricGroup": "Fed;FetchBW;Frontend", 603 - "MetricName": "tma_info_instruction_fetch_bw", 604 - "MetricThreshold": "tma_info_instruction_fetch_bw > 20" 605 - }, 606 - { 607 - "BriefDescription": "Total number of retired Instructions", 608 - "MetricExpr": "INST_RETIRED.ANY", 609 - "MetricGroup": "Summary;TmaL1;tma_L1_group", 610 - "MetricName": "tma_info_instructions", 611 - "PublicDescription": "Total number of retired Instructions. Sample with: INST_RETIRED.PREC_DIST" 612 - }, 613 - { 614 - "BriefDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate)", 615 - "MetricExpr": "INST_RETIRED.ANY / (cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ + cpu@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE\\,umask\\=0xfc@)", 616 - "MetricGroup": "Flops;InsType", 617 - "MetricName": "tma_info_iparith", 618 - "MetricThreshold": "tma_info_iparith < 10", 619 - "PublicDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. Approximated prior to BDW." 
620 - }, 621 - { 622 - "BriefDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate)", 623 - "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE)", 624 - "MetricGroup": "Flops;FpVector;InsType", 625 - "MetricName": "tma_info_iparith_avx128", 626 - "MetricThreshold": "tma_info_iparith_avx128 < 10", 627 - "PublicDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." 628 - }, 629 - { 630 - "BriefDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate)", 631 - "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE)", 632 - "MetricGroup": "Flops;FpVector;InsType", 633 - "MetricName": "tma_info_iparith_avx256", 634 - "MetricThreshold": "tma_info_iparith_avx256 < 10", 635 - "PublicDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." 636 - }, 637 - { 638 - "BriefDescription": "Instructions per FP Arithmetic AVX 512-bit instruction (lower number means higher occurrence rate)", 639 - "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE)", 640 - "MetricGroup": "Flops;FpVector;InsType", 641 - "MetricName": "tma_info_iparith_avx512", 642 - "MetricThreshold": "tma_info_iparith_avx512 < 10", 643 - "PublicDescription": "Instructions per FP Arithmetic AVX 512-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." 
644 - }, 645 - { 646 - "BriefDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate)", 647 - "MetricExpr": "INST_RETIRED.ANY / FP_ARITH_INST_RETIRED.SCALAR_DOUBLE", 648 - "MetricGroup": "Flops;FpScalar;InsType", 649 - "MetricName": "tma_info_iparith_scalar_dp", 650 - "MetricThreshold": "tma_info_iparith_scalar_dp < 10", 651 - "PublicDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." 652 - }, 653 - { 654 - "BriefDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate)", 655 - "MetricExpr": "INST_RETIRED.ANY / FP_ARITH_INST_RETIRED.SCALAR_SINGLE", 656 - "MetricGroup": "Flops;FpScalar;InsType", 657 - "MetricName": "tma_info_iparith_scalar_sp", 658 - "MetricThreshold": "tma_info_iparith_scalar_sp < 10", 659 - "PublicDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." 
660 - }, 661 - { 662 - "BriefDescription": "Instructions per Branch (lower number means higher occurrence rate)", 663 - "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.ALL_BRANCHES", 664 - "MetricGroup": "Branches;Fed;InsType", 665 - "MetricName": "tma_info_ipbranch", 666 - "MetricThreshold": "tma_info_ipbranch < 8" 667 - }, 668 - { 669 - "BriefDescription": "Instructions Per Cycle (per Logical Processor)", 670 - "MetricExpr": "INST_RETIRED.ANY / tma_info_clks", 671 - "MetricGroup": "Ret;Summary", 672 - "MetricName": "tma_info_ipc" 673 - }, 674 - { 675 - "BriefDescription": "Instructions per (near) call (lower number means higher occurrence rate)", 676 - "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.NEAR_CALL", 677 - "MetricGroup": "Branches;Fed;PGO", 678 - "MetricName": "tma_info_ipcall", 679 - "MetricThreshold": "tma_info_ipcall < 200" 680 - }, 681 - { 682 - "BriefDescription": "Instructions per non-speculative DSB miss (lower number means higher occurrence rate)", 683 - "MetricExpr": "INST_RETIRED.ANY / FRONTEND_RETIRED.ANY_DSB_MISS", 684 - "MetricGroup": "DSBmiss;Fed", 685 - "MetricName": "tma_info_ipdsb_miss_ret", 686 - "MetricThreshold": "tma_info_ipdsb_miss_ret < 50" 687 - }, 688 - { 689 - "BriefDescription": "Instructions per Far Branch ( Far Branches apply upon transition from application to operating system, handling interrupts, exceptions) [lower number means higher occurrence rate]", 690 - "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.FAR_BRANCH:u", 691 - "MetricGroup": "Branches;OS", 692 - "MetricName": "tma_info_ipfarbranch", 693 - "MetricThreshold": "tma_info_ipfarbranch < 1e6" 694 - }, 695 - { 696 - "BriefDescription": "Instructions per Floating Point (FP) Operation (lower number means higher occurrence rate)", 697 - "MetricExpr": "INST_RETIRED.ANY / (cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * cpu@FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE\\,umask\\=0x18@ + 8 * 
cpu@FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE\\,umask\\=0x60@ + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE)", 698 - "MetricGroup": "Flops;InsType", 699 - "MetricName": "tma_info_ipflop", 700 - "MetricThreshold": "tma_info_ipflop < 10" 701 - }, 702 - { 703 - "BriefDescription": "Instructions per Load (lower number means higher occurrence rate)", 704 - "MetricExpr": "INST_RETIRED.ANY / MEM_INST_RETIRED.ALL_LOADS", 705 - "MetricGroup": "InsType", 706 - "MetricName": "tma_info_ipload", 707 - "MetricThreshold": "tma_info_ipload < 3" 708 - }, 709 - { 710 - "BriefDescription": "Instructions per retired mispredicts for conditional non-taken branches (lower number means higher occurrence rate).", 711 - "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.COND_NTAKEN", 712 - "MetricGroup": "Bad;BrMispredicts", 713 - "MetricName": "tma_info_ipmisp_cond_ntaken", 714 - "MetricThreshold": "tma_info_ipmisp_cond_ntaken < 200" 715 - }, 716 - { 717 - "BriefDescription": "Instructions per retired mispredicts for conditional taken branches (lower number means higher occurrence rate).", 718 - "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.COND_TAKEN", 719 - "MetricGroup": "Bad;BrMispredicts", 720 - "MetricName": "tma_info_ipmisp_cond_taken", 721 - "MetricThreshold": "tma_info_ipmisp_cond_taken < 200" 722 - }, 723 - { 724 - "BriefDescription": "Instructions per retired mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate).", 725 - "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.INDIRECT", 726 - "MetricGroup": "Bad;BrMispredicts", 727 - "MetricName": "tma_info_ipmisp_indirect", 728 - "MetricThreshold": "tma_info_ipmisp_indirect < 1e3" 729 - }, 730 - { 731 - "BriefDescription": "Instructions per retired mispredicts for return branches (lower number means higher occurrence rate).", 732 - "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.RET", 733 - "MetricGroup": "Bad;BrMispredicts", 734 - "MetricName": "tma_info_ipmisp_ret", 735 - "MetricThreshold": 
"tma_info_ipmisp_ret < 500" 736 - }, 737 - { 738 - "BriefDescription": "Number of Instructions per non-speculative Branch Misprediction (JEClear) (lower number means higher occurrence rate)", 739 - "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.ALL_BRANCHES", 740 - "MetricGroup": "Bad;BadSpec;BrMispredicts", 741 - "MetricName": "tma_info_ipmispredict", 742 - "MetricThreshold": "tma_info_ipmispredict < 200" 743 - }, 744 - { 745 - "BriefDescription": "Instructions per Store (lower number means higher occurrence rate)", 746 - "MetricExpr": "INST_RETIRED.ANY / MEM_INST_RETIRED.ALL_STORES", 747 - "MetricGroup": "InsType", 748 - "MetricName": "tma_info_ipstore", 749 - "MetricThreshold": "tma_info_ipstore < 8" 750 - }, 751 - { 752 - "BriefDescription": "Instructions per Software prefetch instruction (of any type: NTA/T0/T1/T2/Prefetch) (lower number means higher occurrence rate)", 753 - "MetricExpr": "INST_RETIRED.ANY / cpu@SW_PREFETCH_ACCESS.T0\\,umask\\=0xF@", 754 - "MetricGroup": "Prefetches", 755 - "MetricName": "tma_info_ipswpf", 756 - "MetricThreshold": "tma_info_ipswpf < 100" 757 - }, 758 - { 759 - "BriefDescription": "Instruction per taken branch", 760 - "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.NEAR_TAKEN", 761 - "MetricGroup": "Branches;Fed;FetchBW;Frontend;PGO;tma_issueFB", 762 - "MetricName": "tma_info_iptb", 763 - "MetricThreshold": "tma_info_iptb < 11", 764 - "PublicDescription": "Instruction per taken branch. 
Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_dsb_coverage, tma_info_dsb_misses, tma_lcp" 765 - }, 766 - { 767 - "BriefDescription": "Instructions per speculative Unknown Branch Misprediction (BAClear) (lower number means higher occurrence rate)", 768 - "MetricExpr": "tma_info_instructions / BACLEARS.ANY", 769 - "MetricGroup": "Fed", 770 - "MetricName": "tma_info_ipunknown_branch" 771 - }, 772 - { 773 - "BriefDescription": "Fraction of branches that are unconditional (direct or indirect) jumps", 774 - "MetricExpr": "(BR_INST_RETIRED.NEAR_TAKEN - BR_INST_RETIRED.COND_TAKEN - 2 * BR_INST_RETIRED.NEAR_CALL) / BR_INST_RETIRED.ALL_BRANCHES", 775 - "MetricGroup": "Bad;Branches", 776 - "MetricName": "tma_info_jump" 777 - }, 778 - { 779 - "BriefDescription": "Cycles Per Instruction for the Operating System (OS) Kernel mode", 780 - "MetricExpr": "CPU_CLK_UNHALTED.THREAD_P:k / INST_RETIRED.ANY_P:k", 781 - "MetricGroup": "OS", 782 - "MetricName": "tma_info_kernel_cpi" 783 - }, 784 - { 785 - "BriefDescription": "Fraction of cycles spent in the Operating System (OS) Kernel mode", 786 - "MetricExpr": "CPU_CLK_UNHALTED.THREAD_P:k / CPU_CLK_UNHALTED.THREAD", 787 - "MetricGroup": "OS", 788 - "MetricName": "tma_info_kernel_utilization", 789 - "MetricThreshold": "tma_info_kernel_utilization > 0.05" 790 - }, 791 - { 792 - "BriefDescription": "Average per-core data fill bandwidth to the L1 data cache [GB / sec]", 793 - "MetricExpr": "64 * L1D.REPLACEMENT / 1e9 / duration_time", 794 - "MetricGroup": "Mem;MemoryBW", 795 - "MetricName": "tma_info_l1d_cache_fill_bw" 796 - }, 797 - { 798 - "BriefDescription": "Average per-thread data fill bandwidth to the L1 data cache [GB / sec]", 799 - "MetricExpr": "tma_info_l1d_cache_fill_bw", 800 - "MetricGroup": "Mem;MemoryBW", 801 - "MetricName": "tma_info_l1d_cache_fill_bw_1t" 802 - }, 803 - { 804 - "BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads", 805 - "MetricExpr": "1e3 * 
MEM_LOAD_RETIRED.L1_MISS / INST_RETIRED.ANY", 806 - "MetricGroup": "CacheMisses;Mem", 807 - "MetricName": "tma_info_l1mpki" 808 - }, 809 - { 810 - "BriefDescription": "L1 cache true misses per kilo instruction for all demand loads (including speculative)", 811 - "MetricExpr": "1e3 * L2_RQSTS.ALL_DEMAND_DATA_RD / INST_RETIRED.ANY", 812 - "MetricGroup": "CacheMisses;Mem", 813 - "MetricName": "tma_info_l1mpki_load" 814 - }, 815 - { 816 - "BriefDescription": "Average per-core data fill bandwidth to the L2 cache [GB / sec]", 817 - "MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / duration_time", 818 - "MetricGroup": "Mem;MemoryBW", 819 - "MetricName": "tma_info_l2_cache_fill_bw" 820 - }, 821 - { 822 - "BriefDescription": "Average per-thread data fill bandwidth to the L2 cache [GB / sec]", 823 - "MetricExpr": "tma_info_l2_cache_fill_bw", 824 - "MetricGroup": "Mem;MemoryBW", 825 - "MetricName": "tma_info_l2_cache_fill_bw_1t" 826 - }, 827 - { 828 - "BriefDescription": "L2 cache hits per kilo instruction for all request types (including speculative)", 829 - "MetricExpr": "1e3 * (L2_RQSTS.REFERENCES - L2_RQSTS.MISS) / INST_RETIRED.ANY", 830 - "MetricGroup": "CacheMisses;Mem", 831 - "MetricName": "tma_info_l2hpki_all" 832 - }, 833 - { 834 - "BriefDescription": "L2 cache hits per kilo instruction for all demand loads (including speculative)", 835 - "MetricExpr": "1e3 * L2_RQSTS.DEMAND_DATA_RD_HIT / INST_RETIRED.ANY", 836 - "MetricGroup": "CacheMisses;Mem", 837 - "MetricName": "tma_info_l2hpki_load" 838 - }, 839 - { 840 - "BriefDescription": "L2 cache true misses per kilo instruction for retired demand loads", 841 - "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L2_MISS / INST_RETIRED.ANY", 842 - "MetricGroup": "Backend;CacheMisses;Mem", 843 - "MetricName": "tma_info_l2mpki" 844 - }, 845 - { 846 - "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all request types (including speculative)", 847 - "MetricExpr": "1e3 * L2_RQSTS.MISS / INST_RETIRED.ANY", 848 - 
"MetricGroup": "CacheMisses;Mem;Offcore", 849 - "MetricName": "tma_info_l2mpki_all" 850 - }, 851 - { 852 - "BriefDescription": "L2 cache true code cacheline misses per kilo instruction", 853 - "MetricExpr": "1e3 * FRONTEND_RETIRED.L2_MISS / INST_RETIRED.ANY", 854 - "MetricGroup": "IcMiss", 855 - "MetricName": "tma_info_l2mpki_code" 856 - }, 857 - { 858 - "BriefDescription": "L2 cache speculative code cacheline misses per kilo instruction", 859 - "MetricExpr": "1e3 * L2_RQSTS.CODE_RD_MISS / INST_RETIRED.ANY", 860 - "MetricGroup": "IcMiss", 861 - "MetricName": "tma_info_l2mpki_code_all" 862 - }, 863 - { 864 - "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all demand loads (including speculative)", 865 - "MetricExpr": "1e3 * L2_RQSTS.DEMAND_DATA_RD_MISS / INST_RETIRED.ANY", 866 - "MetricGroup": "CacheMisses;Mem", 867 - "MetricName": "tma_info_l2mpki_load" 868 - }, 869 - { 870 - "BriefDescription": "Average per-core data access bandwidth to the L3 cache [GB / sec]", 871 - "MetricExpr": "64 * OFFCORE_REQUESTS.ALL_REQUESTS / 1e9 / duration_time", 872 - "MetricGroup": "Mem;MemoryBW;Offcore", 873 - "MetricName": "tma_info_l3_cache_access_bw" 874 - }, 875 - { 876 - "BriefDescription": "Average per-thread data access bandwidth to the L3 cache [GB / sec]", 877 - "MetricExpr": "tma_info_l3_cache_access_bw", 878 - "MetricGroup": "Mem;MemoryBW;Offcore", 879 - "MetricName": "tma_info_l3_cache_access_bw_1t" 880 - }, 881 - { 882 - "BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]", 883 - "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / duration_time", 884 - "MetricGroup": "Mem;MemoryBW", 885 - "MetricName": "tma_info_l3_cache_fill_bw" 886 - }, 887 - { 888 - "BriefDescription": "Average per-thread data fill bandwidth to the L3 cache [GB / sec]", 889 - "MetricExpr": "tma_info_l3_cache_fill_bw", 890 - "MetricGroup": "Mem;MemoryBW", 891 - "MetricName": "tma_info_l3_cache_fill_bw_1t" 892 - }, 893 - { 894 - 
"BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads", 895 - "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L3_MISS / INST_RETIRED.ANY", 896 - "MetricGroup": "CacheMisses;Mem", 897 - "MetricName": "tma_info_l3mpki" 898 - }, 899 - { 900 - "BriefDescription": "Average Latency for L2 cache miss demand Loads", 901 - "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS.DEMAND_DATA_RD", 902 - "MetricGroup": "Memory_Lat;Offcore", 903 - "MetricName": "tma_info_load_l2_miss_latency" 904 - }, 905 - { 906 - "BriefDescription": "Average Parallel L2 cache miss demand Loads", 907 - "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / cpu@OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD\\,cmask\\=1@", 908 - "MetricGroup": "Memory_BW;Offcore", 909 - "MetricName": "tma_info_load_l2_mlp" 910 - }, 911 - { 912 - "BriefDescription": "Average Latency for L3 cache miss demand Loads", 913 - "MetricExpr": "cpu@OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD\\,umask\\=0x10@ / OFFCORE_REQUESTS.L3_MISS_DEMAND_DATA_RD", 914 - "MetricGroup": "Memory_Lat;Offcore", 915 - "MetricName": "tma_info_load_l3_miss_latency" 916 - }, 917 - { 918 - "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)", 919 - "MetricExpr": "L1D_PEND_MISS.PENDING / (MEM_LOAD_RETIRED.L1_MISS + MEM_LOAD_RETIRED.FB_HIT)", 920 - "MetricGroup": "Mem;MemoryBound;MemoryLat", 921 - "MetricName": "tma_info_load_miss_real_latency" 922 - }, 923 - { 924 - "BriefDescription": "STLB (2nd level TLB) data load speculative misses per kilo instruction (misses of any page-size that complete the page walk)", 925 - "MetricExpr": "1e3 * DTLB_LOAD_MISSES.WALK_COMPLETED / INST_RETIRED.ANY", 926 - "MetricGroup": "Mem;MemoryTLB", 927 - "MetricName": "tma_info_load_stlb_mpki" 928 - }, 929 - { 930 - "BriefDescription": "Fraction of Uops delivered by the LSD (Loop Stream Detector; aka Loop Cache)", 931 - "MetricExpr": "LSD.UOPS / UOPS_ISSUED.ANY", 932 
- "MetricGroup": "Fed;LSD", 933 - "MetricName": "tma_info_lsd_coverage" 934 - }, 935 - { 936 - "BriefDescription": "Average number of parallel data read requests to external memory", 937 - "MetricExpr": "UNC_ARB_DAT_OCCUPANCY.RD / UNC_ARB_DAT_OCCUPANCY.RD@cmask\\=1@", 938 - "MetricGroup": "Mem;MemoryBW;SoC", 939 - "MetricName": "tma_info_mem_parallel_reads", 940 - "PublicDescription": "Average number of parallel data read requests to external memory. Accounts for demand loads and L1/L2 prefetches" 941 - }, 942 - { 943 - "BriefDescription": "Average latency of data read request to external memory (in nanoseconds)", 944 - "MetricExpr": "(UNC_ARB_TRK_OCCUPANCY.RD + UNC_ARB_DAT_OCCUPANCY.RD) / UNC_ARB_TRK_REQUESTS.RD", 945 - "MetricGroup": "Mem;MemoryLat;SoC", 946 - "MetricName": "tma_info_mem_read_latency", 947 - "PublicDescription": "Average latency of data read request to external memory (in nanoseconds). Accounts for demand loads and L1/L2 prefetches. ([RKL+]memory-controller only)" 948 - }, 949 - { 950 - "BriefDescription": "Average latency of all requests to external memory (in Uncore cycles)", 951 - "MetricExpr": "(UNC_ARB_TRK_OCCUPANCY.ALL + UNC_ARB_DAT_OCCUPANCY.RD) / arb@event\\=0x81\\,umask\\=0x1@", 952 - "MetricGroup": "Mem;SoC", 953 - "MetricName": "tma_info_mem_request_latency" 486 + "MetricName": "tma_info_bottleneck_instruction_fetch_bw", 487 + "MetricThreshold": "tma_info_bottleneck_instruction_fetch_bw > 20" 954 488 }, 955 489 { 956 490 "BriefDescription": "Total pipeline cost of (external) Memory Bandwidth related bottlenecks", 957 491 "MetricExpr": "100 * tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_mem_bandwidth / (tma_mem_bandwidth + tma_mem_latency)) + tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_sq_full / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full))) + tma_l1_bound / 
(tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_fb_full / (tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk))", 958 492 "MetricGroup": "Mem;MemoryBW;Offcore;tma_issueBW", 959 - "MetricName": "tma_info_memory_bandwidth", 960 - "MetricThreshold": "tma_info_memory_bandwidth > 20", 961 - "PublicDescription": "Total pipeline cost of (external) Memory Bandwidth related bottlenecks. Related metrics: tma_fb_full, tma_info_dram_bw_use, tma_mem_bandwidth, tma_sq_full" 493 + "MetricName": "tma_info_bottleneck_memory_bandwidth", 494 + "MetricThreshold": "tma_info_bottleneck_memory_bandwidth > 20", 495 + "PublicDescription": "Total pipeline cost of (external) Memory Bandwidth related bottlenecks. Related metrics: tma_fb_full, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full" 962 496 }, 963 497 { 964 498 "BriefDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs)", 965 499 "MetricConstraint": "NO_GROUP_EVENTS", 966 500 "MetricExpr": "100 * tma_memory_bound * (tma_l1_bound / max(tma_memory_bound, tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_dtlb_load / max(tma_l1_bound, tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_dtlb_store / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores)))", 967 501 "MetricGroup": "Mem;MemoryTLB;Offcore;tma_issueTLB", 968 - "MetricName": "tma_info_memory_data_tlbs", 969 - "MetricThreshold": "tma_info_memory_data_tlbs > 20", 502 + "MetricName": "tma_info_bottleneck_memory_data_tlbs", 503 + "MetricThreshold": "tma_info_bottleneck_memory_data_tlbs > 20", 970 504 "PublicDescription": "Total pipeline cost of Memory Address Translation related bottlenecks 
(data-side TLBs). Related metrics: tma_dtlb_load, tma_dtlb_store" 971 505 }, 972 506 { ··· 508 974 "MetricConstraint": "NO_GROUP_EVENTS", 509 975 "MetricExpr": "100 * tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency)) + tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_l3_hit_latency / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_l2_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound))", 510 976 "MetricGroup": "Mem;MemoryLat;Offcore;tma_issueLat", 511 - "MetricName": "tma_info_memory_latency", 512 - "MetricThreshold": "tma_info_memory_latency > 20", 977 + "MetricName": "tma_info_bottleneck_memory_latency", 978 + "MetricThreshold": "tma_info_bottleneck_memory_latency > 20", 513 979 "PublicDescription": "Total pipeline cost of Memory Latency related bottlenecks (external memory and off-core caches). Related metrics: tma_l3_hit_latency, tma_mem_latency" 514 980 }, 515 981 { ··· 517 983 "MetricConstraint": "NO_GROUP_EVENTS", 518 984 "MetricExpr": "100 * (tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches))", 519 985 "MetricGroup": "Bad;BadSpec;BrMispredicts;tma_issueBM", 520 - "MetricName": "tma_info_mispredictions", 521 - "MetricThreshold": "tma_info_mispredictions > 20", 522 - "PublicDescription": "Total pipeline cost of Branch Misprediction related bottlenecks. Related metrics: tma_branch_mispredicts, tma_info_branch_misprediction_cost, tma_mispredicts_resteers" 986 + "MetricName": "tma_info_bottleneck_mispredictions", 987 + "MetricThreshold": "tma_info_bottleneck_mispredictions > 20", 988 + "PublicDescription": "Total pipeline cost of Branch Misprediction related bottlenecks. 
Related metrics: tma_branch_mispredicts, tma_info_bad_spec_branch_misprediction_cost, tma_mispredicts_resteers" 989 + }, 990 + { 991 + "BriefDescription": "Fraction of branches that are CALL or RET", 992 + "MetricExpr": "(BR_INST_RETIRED.NEAR_CALL + BR_INST_RETIRED.NEAR_RETURN) / BR_INST_RETIRED.ALL_BRANCHES", 993 + "MetricGroup": "Bad;Branches", 994 + "MetricName": "tma_info_branches_callret" 995 + }, 996 + { 997 + "BriefDescription": "Fraction of branches that are non-taken conditionals", 998 + "MetricExpr": "BR_INST_RETIRED.COND_NTAKEN / BR_INST_RETIRED.ALL_BRANCHES", 999 + "MetricGroup": "Bad;Branches;CodeGen;PGO", 1000 + "MetricName": "tma_info_branches_cond_nt" 1001 + }, 1002 + { 1003 + "BriefDescription": "Fraction of branches that are taken conditionals", 1004 + "MetricExpr": "BR_INST_RETIRED.COND_TAKEN / BR_INST_RETIRED.ALL_BRANCHES", 1005 + "MetricGroup": "Bad;Branches;CodeGen;PGO", 1006 + "MetricName": "tma_info_branches_cond_tk" 1007 + }, 1008 + { 1009 + "BriefDescription": "Fraction of branches that are unconditional (direct or indirect) jumps", 1010 + "MetricExpr": "(BR_INST_RETIRED.NEAR_TAKEN - BR_INST_RETIRED.COND_TAKEN - 2 * BR_INST_RETIRED.NEAR_CALL) / BR_INST_RETIRED.ALL_BRANCHES", 1011 + "MetricGroup": "Bad;Branches", 1012 + "MetricName": "tma_info_branches_jump" 1013 + }, 1014 + { 1015 + "BriefDescription": "Fraction of branches of other types (not individually covered by other metrics in Info.Branches group)", 1016 + "MetricExpr": "1 - (tma_info_branches_cond_nt + tma_info_branches_cond_tk + tma_info_branches_callret + tma_info_branches_jump)", 1017 + "MetricGroup": "Bad;Branches", 1018 + "MetricName": "tma_info_branches_other_branches" 1019 + }, 1020 + { 1021 + "BriefDescription": "Core actual clocks when any Logical Processor is active on the Physical Core", 1022 + "MetricExpr": "CPU_CLK_UNHALTED.DISTRIBUTED", 1023 + "MetricGroup": "SMT", 1024 + "MetricName": "tma_info_core_core_clks" 1025 + }, 1026 + { 1027 + "BriefDescription": 
"Instructions Per Cycle across hyper-threads (per physical core)", 1028 + "MetricExpr": "INST_RETIRED.ANY / tma_info_core_core_clks", 1029 + "MetricGroup": "Ret;SMT;TmaL1;tma_L1_group", 1030 + "MetricName": "tma_info_core_coreipc" 1031 + }, 1032 + { 1033 + "BriefDescription": "Floating Point Operations Per Cycle", 1034 + "MetricConstraint": "NO_GROUP_EVENTS", 1035 + "MetricExpr": "(cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * cpu@FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE\\,umask\\=0x18@ + 8 * cpu@FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE\\,umask\\=0x60@ + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / tma_info_core_core_clks", 1036 + "MetricGroup": "Flops;Ret", 1037 + "MetricName": "tma_info_core_flopc" 1038 + }, 1039 + { 1040 + "BriefDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width)", 1041 + "MetricConstraint": "NO_GROUP_EVENTS", 1042 + "MetricExpr": "(cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ + cpu@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE\\,umask\\=0xfc@) / (2 * tma_info_core_core_clks)", 1043 + "MetricGroup": "Cor;Flops;HPC", 1044 + "MetricName": "tma_info_core_fp_arith_utilization", 1045 + "PublicDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width). Values > 1 are possible due to ([BDW+] Fused-Multiply Add (FMA) counting - common; [ADL+] use all of ADD/MUL/FMA in Scalar or 128/256-bit vectors - less common)." 
1046 + }, 1047 + { 1048 + "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-core", 1049 + "MetricExpr": "UOPS_EXECUTED.THREAD / (UOPS_EXECUTED.CORE_CYCLES_GE_1 / 2 if #SMT_on else UOPS_EXECUTED.CORE_CYCLES_GE_1)", 1050 + "MetricGroup": "Backend;Cor;Pipeline;PortsUtil", 1051 + "MetricName": "tma_info_core_ilp" 1052 + }, 1053 + { 1054 + "BriefDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache)", 1055 + "MetricExpr": "IDQ.DSB_UOPS / UOPS_ISSUED.ANY", 1056 + "MetricGroup": "DSB;Fed;FetchBW;tma_issueFB", 1057 + "MetricName": "tma_info_frontend_dsb_coverage", 1058 + "MetricThreshold": "tma_info_frontend_dsb_coverage < 0.7 & tma_info_thread_ipc / 5 > 0.35", 1059 + "PublicDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache). Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_botlnk_l2_dsb_misses, tma_info_inst_mix_iptb, tma_lcp" 1060 + }, 1061 + { 1062 + "BriefDescription": "Average number of cycles of a switch from the DSB fetch-unit to MITE fetch unit - see DSB_Switches tree node for details.", 1063 + "MetricExpr": "DSB2MITE_SWITCHES.PENALTY_CYCLES / cpu@DSB2MITE_SWITCHES.PENALTY_CYCLES\\,cmask\\=1\\,edge@", 1064 + "MetricGroup": "DSBmiss", 1065 + "MetricName": "tma_info_frontend_dsb_switch_cost" 1066 + }, 1067 + { 1068 + "BriefDescription": "Average number of Uops issued by front-end when it issued something", 1069 + "MetricExpr": "UOPS_ISSUED.ANY / cpu@UOPS_ISSUED.ANY\\,cmask\\=1@", 1070 + "MetricGroup": "Fed;FetchBW", 1071 + "MetricName": "tma_info_frontend_fetch_upc" 1072 + }, 1073 + { 1074 + "BriefDescription": "Average Latency for L1 instruction cache misses", 1075 + "MetricExpr": "ICACHE_16B.IFDATA_STALL / cpu@ICACHE_16B.IFDATA_STALL\\,cmask\\=1\\,edge@", 1076 + "MetricGroup": "Fed;FetchLat;IcMiss", 1077 + "MetricName": "tma_info_frontend_icache_miss_latency" 1078 + }, 1079 + { 1080 + "BriefDescription": "Instructions 
per non-speculative DSB miss (lower number means higher occurrence rate)", 1081 + "MetricExpr": "INST_RETIRED.ANY / FRONTEND_RETIRED.ANY_DSB_MISS", 1082 + "MetricGroup": "DSBmiss;Fed", 1083 + "MetricName": "tma_info_frontend_ipdsb_miss_ret", 1084 + "MetricThreshold": "tma_info_frontend_ipdsb_miss_ret < 50" 1085 + }, 1086 + { 1087 + "BriefDescription": "Instructions per speculative Unknown Branch Misprediction (BAClear) (lower number means higher occurrence rate)", 1088 + "MetricExpr": "tma_info_inst_mix_instructions / BACLEARS.ANY", 1089 + "MetricGroup": "Fed", 1090 + "MetricName": "tma_info_frontend_ipunknown_branch" 1091 + }, 1092 + { 1093 + "BriefDescription": "L2 cache true code cacheline misses per kilo instruction", 1094 + "MetricExpr": "1e3 * FRONTEND_RETIRED.L2_MISS / INST_RETIRED.ANY", 1095 + "MetricGroup": "IcMiss", 1096 + "MetricName": "tma_info_frontend_l2mpki_code" 1097 + }, 1098 + { 1099 + "BriefDescription": "L2 cache speculative code cacheline misses per kilo instruction", 1100 + "MetricExpr": "1e3 * L2_RQSTS.CODE_RD_MISS / INST_RETIRED.ANY", 1101 + "MetricGroup": "IcMiss", 1102 + "MetricName": "tma_info_frontend_l2mpki_code_all" 1103 + }, 1104 + { 1105 + "BriefDescription": "Fraction of Uops delivered by the LSD (Loop Stream Detector; aka Loop Cache)", 1106 + "MetricExpr": "LSD.UOPS / UOPS_ISSUED.ANY", 1107 + "MetricGroup": "Fed;LSD", 1108 + "MetricName": "tma_info_frontend_lsd_coverage" 1109 + }, 1110 + { 1111 + "BriefDescription": "Branch instructions per taken branch.", 1112 + "MetricExpr": "BR_INST_RETIRED.ALL_BRANCHES / BR_INST_RETIRED.NEAR_TAKEN", 1113 + "MetricGroup": "Branches;Fed;PGO", 1114 + "MetricName": "tma_info_inst_mix_bptkbranch" 1115 + }, 1116 + { 1117 + "BriefDescription": "Total number of retired Instructions", 1118 + "MetricExpr": "INST_RETIRED.ANY", 1119 + "MetricGroup": "Summary;TmaL1;tma_L1_group", 1120 + "MetricName": "tma_info_inst_mix_instructions", 1121 + "PublicDescription": "Total number of retired Instructions. 
Sample with: INST_RETIRED.PREC_DIST" 1122 + }, 1123 + { 1124 + "BriefDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate)", 1125 + "MetricExpr": "INST_RETIRED.ANY / (cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ + cpu@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE\\,umask\\=0xfc@)", 1126 + "MetricGroup": "Flops;InsType", 1127 + "MetricName": "tma_info_inst_mix_iparith", 1128 + "MetricThreshold": "tma_info_inst_mix_iparith < 10", 1129 + "PublicDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. Approximated prior to BDW." 1130 + }, 1131 + { 1132 + "BriefDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate)", 1133 + "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE)", 1134 + "MetricGroup": "Flops;FpVector;InsType", 1135 + "MetricName": "tma_info_inst_mix_iparith_avx128", 1136 + "MetricThreshold": "tma_info_inst_mix_iparith_avx128 < 10", 1137 + "PublicDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." 1138 + }, 1139 + { 1140 + "BriefDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate)", 1141 + "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE)", 1142 + "MetricGroup": "Flops;FpVector;InsType", 1143 + "MetricName": "tma_info_inst_mix_iparith_avx256", 1144 + "MetricThreshold": "tma_info_inst_mix_iparith_avx256 < 10", 1145 + "PublicDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." 
1146 + }, 1147 + { 1148 + "BriefDescription": "Instructions per FP Arithmetic AVX 512-bit instruction (lower number means higher occurrence rate)", 1149 + "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE)", 1150 + "MetricGroup": "Flops;FpVector;InsType", 1151 + "MetricName": "tma_info_inst_mix_iparith_avx512", 1152 + "MetricThreshold": "tma_info_inst_mix_iparith_avx512 < 10", 1153 + "PublicDescription": "Instructions per FP Arithmetic AVX 512-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." 1154 + }, 1155 + { 1156 + "BriefDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate)", 1157 + "MetricExpr": "INST_RETIRED.ANY / FP_ARITH_INST_RETIRED.SCALAR_DOUBLE", 1158 + "MetricGroup": "Flops;FpScalar;InsType", 1159 + "MetricName": "tma_info_inst_mix_iparith_scalar_dp", 1160 + "MetricThreshold": "tma_info_inst_mix_iparith_scalar_dp < 10", 1161 + "PublicDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." 1162 + }, 1163 + { 1164 + "BriefDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate)", 1165 + "MetricExpr": "INST_RETIRED.ANY / FP_ARITH_INST_RETIRED.SCALAR_SINGLE", 1166 + "MetricGroup": "Flops;FpScalar;InsType", 1167 + "MetricName": "tma_info_inst_mix_iparith_scalar_sp", 1168 + "MetricThreshold": "tma_info_inst_mix_iparith_scalar_sp < 10", 1169 + "PublicDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." 
1170 + }, 1171 + { 1172 + "BriefDescription": "Instructions per Branch (lower number means higher occurrence rate)", 1173 + "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.ALL_BRANCHES", 1174 + "MetricGroup": "Branches;Fed;InsType", 1175 + "MetricName": "tma_info_inst_mix_ipbranch", 1176 + "MetricThreshold": "tma_info_inst_mix_ipbranch < 8" 1177 + }, 1178 + { 1179 + "BriefDescription": "Instructions per (near) call (lower number means higher occurrence rate)", 1180 + "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.NEAR_CALL", 1181 + "MetricGroup": "Branches;Fed;PGO", 1182 + "MetricName": "tma_info_inst_mix_ipcall", 1183 + "MetricThreshold": "tma_info_inst_mix_ipcall < 200" 1184 + }, 1185 + { 1186 + "BriefDescription": "Instructions per Floating Point (FP) Operation (lower number means higher occurrence rate)", 1187 + "MetricExpr": "INST_RETIRED.ANY / (cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * cpu@FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE\\,umask\\=0x18@ + 8 * cpu@FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE\\,umask\\=0x60@ + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE)", 1188 + "MetricGroup": "Flops;InsType", 1189 + "MetricName": "tma_info_inst_mix_ipflop", 1190 + "MetricThreshold": "tma_info_inst_mix_ipflop < 10" 1191 + }, 1192 + { 1193 + "BriefDescription": "Instructions per Load (lower number means higher occurrence rate)", 1194 + "MetricExpr": "INST_RETIRED.ANY / MEM_INST_RETIRED.ALL_LOADS", 1195 + "MetricGroup": "InsType", 1196 + "MetricName": "tma_info_inst_mix_ipload", 1197 + "MetricThreshold": "tma_info_inst_mix_ipload < 3" 1198 + }, 1199 + { 1200 + "BriefDescription": "Instructions per Store (lower number means higher occurrence rate)", 1201 + "MetricExpr": "INST_RETIRED.ANY / MEM_INST_RETIRED.ALL_STORES", 1202 + "MetricGroup": "InsType", 1203 + "MetricName": "tma_info_inst_mix_ipstore", 1204 + "MetricThreshold": "tma_info_inst_mix_ipstore < 8" 1205 + }, 1206 + { 1207 + "BriefDescription": 
"Instructions per Software prefetch instruction (of any type: NTA/T0/T1/T2/Prefetch) (lower number means higher occurrence rate)", 1208 + "MetricExpr": "INST_RETIRED.ANY / cpu@SW_PREFETCH_ACCESS.T0\\,umask\\=0xF@", 1209 + "MetricGroup": "Prefetches", 1210 + "MetricName": "tma_info_inst_mix_ipswpf", 1211 + "MetricThreshold": "tma_info_inst_mix_ipswpf < 100" 1212 + }, 1213 + { 1214 + "BriefDescription": "Instruction per taken branch", 1215 + "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.NEAR_TAKEN", 1216 + "MetricGroup": "Branches;Fed;FetchBW;Frontend;PGO;tma_issueFB", 1217 + "MetricName": "tma_info_inst_mix_iptb", 1218 + "MetricThreshold": "tma_info_inst_mix_iptb < 11", 1219 + "PublicDescription": "Instruction per taken branch. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_botlnk_l2_dsb_misses, tma_info_frontend_dsb_coverage, tma_lcp" 1220 + }, 1221 + { 1222 + "BriefDescription": "Average per-core data fill bandwidth to the L1 data cache [GB / sec]", 1223 + "MetricExpr": "64 * L1D.REPLACEMENT / 1e9 / duration_time", 1224 + "MetricGroup": "Mem;MemoryBW", 1225 + "MetricName": "tma_info_memory_core_l1d_cache_fill_bw" 1226 + }, 1227 + { 1228 + "BriefDescription": "Average per-core data fill bandwidth to the L2 cache [GB / sec]", 1229 + "MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / duration_time", 1230 + "MetricGroup": "Mem;MemoryBW", 1231 + "MetricName": "tma_info_memory_core_l2_cache_fill_bw" 1232 + }, 1233 + { 1234 + "BriefDescription": "Average per-core data access bandwidth to the L3 cache [GB / sec]", 1235 + "MetricExpr": "64 * OFFCORE_REQUESTS.ALL_REQUESTS / 1e9 / duration_time", 1236 + "MetricGroup": "Mem;MemoryBW;Offcore", 1237 + "MetricName": "tma_info_memory_core_l3_cache_access_bw" 1238 + }, 1239 + { 1240 + "BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]", 1241 + "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / duration_time", 1242 + "MetricGroup": "Mem;MemoryBW", 1243 + "MetricName": 
"tma_info_memory_core_l3_cache_fill_bw" 1244 + }, 1245 + { 1246 + "BriefDescription": "Fill Buffer (FB) hits per kilo instructions for retired demand loads (L1D misses that merge into ongoing miss-handling entries)", 1247 + "MetricExpr": "1e3 * MEM_LOAD_RETIRED.FB_HIT / INST_RETIRED.ANY", 1248 + "MetricGroup": "CacheMisses;Mem", 1249 + "MetricName": "tma_info_memory_fb_hpki" 1250 + }, 1251 + { 1252 + "BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads", 1253 + "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L1_MISS / INST_RETIRED.ANY", 1254 + "MetricGroup": "CacheMisses;Mem", 1255 + "MetricName": "tma_info_memory_l1mpki" 1256 + }, 1257 + { 1258 + "BriefDescription": "L1 cache true misses per kilo instruction for all demand loads (including speculative)", 1259 + "MetricExpr": "1e3 * L2_RQSTS.ALL_DEMAND_DATA_RD / INST_RETIRED.ANY", 1260 + "MetricGroup": "CacheMisses;Mem", 1261 + "MetricName": "tma_info_memory_l1mpki_load" 1262 + }, 1263 + { 1264 + "BriefDescription": "L2 cache hits per kilo instruction for all request types (including speculative)", 1265 + "MetricExpr": "1e3 * (L2_RQSTS.REFERENCES - L2_RQSTS.MISS) / INST_RETIRED.ANY", 1266 + "MetricGroup": "CacheMisses;Mem", 1267 + "MetricName": "tma_info_memory_l2hpki_all" 1268 + }, 1269 + { 1270 + "BriefDescription": "L2 cache hits per kilo instruction for all demand loads (including speculative)", 1271 + "MetricExpr": "1e3 * L2_RQSTS.DEMAND_DATA_RD_HIT / INST_RETIRED.ANY", 1272 + "MetricGroup": "CacheMisses;Mem", 1273 + "MetricName": "tma_info_memory_l2hpki_load" 1274 + }, 1275 + { 1276 + "BriefDescription": "L2 cache true misses per kilo instruction for retired demand loads", 1277 + "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L2_MISS / INST_RETIRED.ANY", 1278 + "MetricGroup": "Backend;CacheMisses;Mem", 1279 + "MetricName": "tma_info_memory_l2mpki" 1280 + }, 1281 + { 1282 + "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all request types (including speculative)", 
1283 + "MetricExpr": "1e3 * L2_RQSTS.MISS / INST_RETIRED.ANY", 1284 + "MetricGroup": "CacheMisses;Mem;Offcore", 1285 + "MetricName": "tma_info_memory_l2mpki_all" 1286 + }, 1287 + { 1288 + "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all demand loads (including speculative)", 1289 + "MetricExpr": "1e3 * L2_RQSTS.DEMAND_DATA_RD_MISS / INST_RETIRED.ANY", 1290 + "MetricGroup": "CacheMisses;Mem", 1291 + "MetricName": "tma_info_memory_l2mpki_load" 1292 + }, 1293 + { 1294 + "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads", 1295 + "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L3_MISS / INST_RETIRED.ANY", 1296 + "MetricGroup": "CacheMisses;Mem", 1297 + "MetricName": "tma_info_memory_l3mpki" 1298 + }, 1299 + { 1300 + "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)", 1301 + "MetricExpr": "L1D_PEND_MISS.PENDING / (MEM_LOAD_RETIRED.L1_MISS + MEM_LOAD_RETIRED.FB_HIT)", 1302 + "MetricGroup": "Mem;MemoryBound;MemoryLat", 1303 + "MetricName": "tma_info_memory_load_miss_real_latency" 523 1304 }, 524 1305 { 525 1306 "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss)", 526 1307 "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES", 527 1308 "MetricGroup": "Mem;MemoryBW;MemoryBound", 528 - "MetricName": "tma_info_mlp", 1309 + "MetricName": "tma_info_memory_mlp", 529 1310 "PublicDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. 
Per-Logical Processor)" 530 1311 }, 531 1312 { 532 - "BriefDescription": "Fraction of branches of other types (not individually covered by other metrics in Info.Branches group)", 533 - "MetricExpr": "1 - (tma_info_cond_nt + tma_info_cond_tk + tma_info_callret + tma_info_jump)", 534 - "MetricGroup": "Bad;Branches", 535 - "MetricName": "tma_info_other_branches" 1313 + "BriefDescription": "Average Parallel L2 cache miss data reads", 1314 + "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD", 1315 + "MetricGroup": "Memory_BW;Offcore", 1316 + "MetricName": "tma_info_memory_oro_data_l2_mlp" 1317 + }, 1318 + { 1319 + "BriefDescription": "Average Latency for L2 cache miss demand Loads", 1320 + "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS.DEMAND_DATA_RD", 1321 + "MetricGroup": "Memory_Lat;Offcore", 1322 + "MetricName": "tma_info_memory_oro_load_l2_miss_latency" 1323 + }, 1324 + { 1325 + "BriefDescription": "Average Parallel L2 cache miss demand Loads", 1326 + "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / cpu@OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD\\,cmask\\=1@", 1327 + "MetricGroup": "Memory_BW;Offcore", 1328 + "MetricName": "tma_info_memory_oro_load_l2_mlp" 1329 + }, 1330 + { 1331 + "BriefDescription": "Average Latency for L3 cache miss demand Loads", 1332 + "MetricExpr": "cpu@OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD\\,umask\\=0x10@ / OFFCORE_REQUESTS.L3_MISS_DEMAND_DATA_RD", 1333 + "MetricGroup": "Memory_Lat;Offcore", 1334 + "MetricName": "tma_info_memory_oro_load_l3_miss_latency" 1335 + }, 1336 + { 1337 + "BriefDescription": "Average per-thread data fill bandwidth to the L1 data cache [GB / sec]", 1338 + "MetricExpr": "tma_info_memory_core_l1d_cache_fill_bw", 1339 + "MetricGroup": "Mem;MemoryBW", 1340 + "MetricName": "tma_info_memory_thread_l1d_cache_fill_bw_1t" 1341 + }, 1342 + { 1343 + "BriefDescription": "Average per-thread data fill bandwidth to the L2 cache 
[GB / sec]", 1344 + "MetricExpr": "tma_info_memory_core_l2_cache_fill_bw", 1345 + "MetricGroup": "Mem;MemoryBW", 1346 + "MetricName": "tma_info_memory_thread_l2_cache_fill_bw_1t" 1347 + }, 1348 + { 1349 + "BriefDescription": "Average per-thread data access bandwidth to the L3 cache [GB / sec]", 1350 + "MetricExpr": "tma_info_memory_core_l3_cache_access_bw", 1351 + "MetricGroup": "Mem;MemoryBW;Offcore", 1352 + "MetricName": "tma_info_memory_thread_l3_cache_access_bw_1t" 1353 + }, 1354 + { 1355 + "BriefDescription": "Average per-thread data fill bandwidth to the L3 cache [GB / sec]", 1356 + "MetricExpr": "tma_info_memory_core_l3_cache_fill_bw", 1357 + "MetricGroup": "Mem;MemoryBW", 1358 + "MetricName": "tma_info_memory_thread_l3_cache_fill_bw_1t" 1359 + }, 1360 + { 1361 + "BriefDescription": "STLB (2nd level TLB) code speculative misses per kilo instruction (misses of any page-size that complete the page walk)", 1362 + "MetricExpr": "1e3 * ITLB_MISSES.WALK_COMPLETED / INST_RETIRED.ANY", 1363 + "MetricGroup": "Fed;MemoryTLB", 1364 + "MetricName": "tma_info_memory_tlb_code_stlb_mpki" 1365 + }, 1366 + { 1367 + "BriefDescription": "STLB (2nd level TLB) data load speculative misses per kilo instruction (misses of any page-size that complete the page walk)", 1368 + "MetricExpr": "1e3 * DTLB_LOAD_MISSES.WALK_COMPLETED / INST_RETIRED.ANY", 1369 + "MetricGroup": "Mem;MemoryTLB", 1370 + "MetricName": "tma_info_memory_tlb_load_stlb_mpki" 536 1371 }, 537 1372 { 538 1373 "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses", 539 - "MetricExpr": "(ITLB_MISSES.WALK_PENDING + DTLB_LOAD_MISSES.WALK_PENDING + DTLB_STORE_MISSES.WALK_PENDING) / (2 * tma_info_core_clks)", 1374 + "MetricExpr": "(ITLB_MISSES.WALK_PENDING + DTLB_LOAD_MISSES.WALK_PENDING + DTLB_STORE_MISSES.WALK_PENDING) / (2 * tma_info_core_core_clks)", 540 1375 "MetricGroup": "Mem;MemoryTLB", 541 - "MetricName": "tma_info_page_walks_utilization", 542 
- "MetricThreshold": "tma_info_page_walks_utilization > 0.5" 543 - }, 544 - { 545 - "BriefDescription": "Fraction of Core cycles where the core was running with power-delivery for baseline license level 0", 546 - "MetricExpr": "CORE_POWER.LVL0_TURBO_LICENSE / tma_info_core_clks", 547 - "MetricGroup": "Power", 548 - "MetricName": "tma_info_power_license0_utilization", 549 - "PublicDescription": "Fraction of Core cycles where the core was running with power-delivery for baseline license level 0. This includes non-AVX codes, SSE, AVX 128-bit, and low-current AVX 256-bit codes." 550 - }, 551 - { 552 - "BriefDescription": "Fraction of Core cycles where the core was running with power-delivery for license level 1", 553 - "MetricExpr": "CORE_POWER.LVL1_TURBO_LICENSE / tma_info_core_clks", 554 - "MetricGroup": "Power", 555 - "MetricName": "tma_info_power_license1_utilization", 556 - "MetricThreshold": "tma_info_power_license1_utilization > 0.5", 557 - "PublicDescription": "Fraction of Core cycles where the core was running with power-delivery for license level 1. This includes high current AVX 256-bit instructions as well as low current AVX 512-bit instructions." 558 - }, 559 - { 560 - "BriefDescription": "Fraction of Core cycles where the core was running with power-delivery for license level 2 (introduced in SKX)", 561 - "MetricExpr": "CORE_POWER.LVL2_TURBO_LICENSE / tma_info_core_clks", 562 - "MetricGroup": "Power", 563 - "MetricName": "tma_info_power_license2_utilization", 564 - "MetricThreshold": "tma_info_power_license2_utilization > 0.5", 565 - "PublicDescription": "Fraction of Core cycles where the core was running with power-delivery for license level 2 (introduced in SKX). This includes high current AVX 512-bit instructions." 
566 - }, 567 - { 568 - "BriefDescription": "Average number of Uops retired in cycles where at least one uop has retired.", 569 - "MetricConstraint": "NO_GROUP_EVENTS", 570 - "MetricExpr": "tma_retiring * tma_info_slots / cpu@UOPS_RETIRED.SLOTS\\,cmask\\=1@", 571 - "MetricGroup": "Pipeline;Ret", 572 - "MetricName": "tma_info_retire" 573 - }, 574 - { 575 - "BriefDescription": "Total issue-pipeline slots (per-Physical Core till ICL; per-Logical Processor ICL onward)", 576 - "MetricExpr": "TOPDOWN.SLOTS", 577 - "MetricGroup": "TmaL1;tma_L1_group", 578 - "MetricName": "tma_info_slots" 579 - }, 580 - { 581 - "BriefDescription": "Fraction of Physical Core issue-slots utilized by this Logical Processor", 582 - "MetricExpr": "(tma_info_slots / (TOPDOWN.SLOTS / 2) if #SMT_on else 1)", 583 - "MetricGroup": "SMT;TmaL1;tma_L1_group", 584 - "MetricName": "tma_info_slots_utilization" 585 - }, 586 - { 587 - "BriefDescription": "Fraction of cycles where both hardware Logical Processors were active", 588 - "MetricExpr": "(1 - CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_DISTRIBUTED if #SMT_on else 0)", 589 - "MetricGroup": "SMT", 590 - "MetricName": "tma_info_smt_2t_utilization" 1376 + "MetricName": "tma_info_memory_tlb_page_walks_utilization", 1377 + "MetricThreshold": "tma_info_memory_tlb_page_walks_utilization > 0.5" 591 1378 }, 592 1379 { 593 1380 "BriefDescription": "STLB (2nd level TLB) data store speculative misses per kilo instruction (misses of any page-size that complete the page walk)", 594 1381 "MetricExpr": "1e3 * DTLB_STORE_MISSES.WALK_COMPLETED / INST_RETIRED.ANY", 595 1382 "MetricGroup": "Mem;MemoryTLB", 596 - "MetricName": "tma_info_store_stlb_mpki" 1383 + "MetricName": "tma_info_memory_tlb_store_stlb_mpki" 1384 + }, 1385 + { 1386 + "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-thread", 1387 + "MetricExpr": "UOPS_EXECUTED.THREAD / cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@", 1388 + 
"MetricGroup": "Cor;Pipeline;PortsUtil;SMT", 1389 + "MetricName": "tma_info_pipeline_execute" 1390 + }, 1391 + { 1392 + "BriefDescription": "Average number of Uops retired in cycles where at least one uop has retired.", 1393 + "MetricConstraint": "NO_GROUP_EVENTS", 1394 + "MetricExpr": "tma_retiring * tma_info_thread_slots / cpu@UOPS_RETIRED.SLOTS\\,cmask\\=1@", 1395 + "MetricGroup": "Pipeline;Ret", 1396 + "MetricName": "tma_info_pipeline_retire" 1397 + }, 1398 + { 1399 + "BriefDescription": "Measured Average Frequency for unhalted processors [GHz]", 1400 + "MetricExpr": "tma_info_system_turbo_utilization * TSC / 1e9 / duration_time", 1401 + "MetricGroup": "Power;Summary", 1402 + "MetricName": "tma_info_system_average_frequency" 1403 + }, 1404 + { 1405 + "BriefDescription": "Average CPU Utilization", 1406 + "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / TSC", 1407 + "MetricGroup": "HPC;Summary", 1408 + "MetricName": "tma_info_system_cpu_utilization" 1409 + }, 1410 + { 1411 + "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]", 1412 + "MetricExpr": "64 * (arb@event\\=0x81\\,umask\\=0x1@ + arb@event\\=0x84\\,umask\\=0x1@) / 1e6 / duration_time / 1e3", 1413 + "MetricGroup": "HPC;Mem;MemoryBW;SoC;tma_issueBW", 1414 + "MetricName": "tma_info_system_dram_bw_use", 1415 + "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. 
Related metrics: tma_fb_full, tma_info_bottleneck_memory_bandwidth, tma_mem_bandwidth, tma_sq_full" 1416 + }, 1417 + { 1418 + "BriefDescription": "Giga Floating Point Operations Per Second", 1419 + "MetricExpr": "(cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * cpu@FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE\\,umask\\=0x18@ + 8 * cpu@FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE\\,umask\\=0x60@ + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / 1e9 / duration_time", 1420 + "MetricGroup": "Cor;Flops;HPC", 1421 + "MetricName": "tma_info_system_gflops", 1422 + "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width and AMX engine." 1423 + }, 1424 + { 1425 + "BriefDescription": "Instructions per Far Branch ( Far Branches apply upon transition from application to operating system, handling interrupts, exceptions) [lower number means higher occurrence rate]", 1426 + "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.FAR_BRANCH:u", 1427 + "MetricGroup": "Branches;OS", 1428 + "MetricName": "tma_info_system_ipfarbranch", 1429 + "MetricThreshold": "tma_info_system_ipfarbranch < 1e6" 1430 + }, 1431 + { 1432 + "BriefDescription": "Cycles Per Instruction for the Operating System (OS) Kernel mode", 1433 + "MetricExpr": "CPU_CLK_UNHALTED.THREAD_P:k / INST_RETIRED.ANY_P:k", 1434 + "MetricGroup": "OS", 1435 + "MetricName": "tma_info_system_kernel_cpi" 1436 + }, 1437 + { 1438 + "BriefDescription": "Fraction of cycles spent in the Operating System (OS) Kernel mode", 1439 + "MetricExpr": "CPU_CLK_UNHALTED.THREAD_P:k / CPU_CLK_UNHALTED.THREAD", 1440 + "MetricGroup": "OS", 1441 + "MetricName": "tma_info_system_kernel_utilization", 1442 + "MetricThreshold": "tma_info_system_kernel_utilization > 0.05" 1443 + }, 1444 + { 1445 + "BriefDescription": "Average number of parallel data read requests to external memory", 1446 + 
"MetricExpr": "UNC_ARB_DAT_OCCUPANCY.RD / UNC_ARB_DAT_OCCUPANCY.RD@cmask\\=1@", 1447 + "MetricGroup": "Mem;MemoryBW;SoC", 1448 + "MetricName": "tma_info_system_mem_parallel_reads", 1449 + "PublicDescription": "Average number of parallel data read requests to external memory. Accounts for demand loads and L1/L2 prefetches" 1450 + }, 1451 + { 1452 + "BriefDescription": "Average latency of data read request to external memory (in nanoseconds)", 1453 + "MetricExpr": "(UNC_ARB_TRK_OCCUPANCY.RD + UNC_ARB_DAT_OCCUPANCY.RD) / UNC_ARB_TRK_REQUESTS.RD", 1454 + "MetricGroup": "Mem;MemoryLat;SoC", 1455 + "MetricName": "tma_info_system_mem_read_latency", 1456 + "PublicDescription": "Average latency of data read request to external memory (in nanoseconds). Accounts for demand loads and L1/L2 prefetches. ([RKL+]memory-controller only)" 1457 + }, 1458 + { 1459 + "BriefDescription": "Average latency of all requests to external memory (in Uncore cycles)", 1460 + "MetricExpr": "(UNC_ARB_TRK_OCCUPANCY.ALL + UNC_ARB_DAT_OCCUPANCY.RD) / arb@event\\=0x81\\,umask\\=0x1@", 1461 + "MetricGroup": "Mem;SoC", 1462 + "MetricName": "tma_info_system_mem_request_latency" 1463 + }, 1464 + { 1465 + "BriefDescription": "Fraction of Core cycles where the core was running with power-delivery for baseline license level 0", 1466 + "MetricExpr": "CORE_POWER.LVL0_TURBO_LICENSE / tma_info_core_core_clks", 1467 + "MetricGroup": "Power", 1468 + "MetricName": "tma_info_system_power_license0_utilization", 1469 + "PublicDescription": "Fraction of Core cycles where the core was running with power-delivery for baseline license level 0. This includes non-AVX codes, SSE, AVX 128-bit, and low-current AVX 256-bit codes." 
1470 + }, 1471 + { 1472 + "BriefDescription": "Fraction of Core cycles where the core was running with power-delivery for license level 1", 1473 + "MetricExpr": "CORE_POWER.LVL1_TURBO_LICENSE / tma_info_core_core_clks", 1474 + "MetricGroup": "Power", 1475 + "MetricName": "tma_info_system_power_license1_utilization", 1476 + "MetricThreshold": "tma_info_system_power_license1_utilization > 0.5", 1477 + "PublicDescription": "Fraction of Core cycles where the core was running with power-delivery for license level 1. This includes high current AVX 256-bit instructions as well as low current AVX 512-bit instructions." 1478 + }, 1479 + { 1480 + "BriefDescription": "Fraction of Core cycles where the core was running with power-delivery for license level 2 (introduced in SKX)", 1481 + "MetricExpr": "CORE_POWER.LVL2_TURBO_LICENSE / tma_info_core_core_clks", 1482 + "MetricGroup": "Power", 1483 + "MetricName": "tma_info_system_power_license2_utilization", 1484 + "MetricThreshold": "tma_info_system_power_license2_utilization > 0.5", 1485 + "PublicDescription": "Fraction of Core cycles where the core was running with power-delivery for license level 2 (introduced in SKX). This includes high current AVX 512-bit instructions." 
1486 + }, 1487 + { 1488 + "BriefDescription": "Fraction of cycles where both hardware Logical Processors were active", 1489 + "MetricExpr": "(1 - CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_DISTRIBUTED if #SMT_on else 0)", 1490 + "MetricGroup": "SMT", 1491 + "MetricName": "tma_info_system_smt_2t_utilization" 597 1492 }, 598 1493 { 599 1494 "BriefDescription": "Average Frequency Utilization relative to nominal frequency", 600 - "MetricExpr": "tma_info_clks / CPU_CLK_UNHALTED.REF_TSC", 1495 + "MetricExpr": "tma_info_thread_clks / CPU_CLK_UNHALTED.REF_TSC", 601 1496 "MetricGroup": "Power", 602 - "MetricName": "tma_info_turbo_utilization" 1497 + "MetricName": "tma_info_system_turbo_utilization" 1498 + }, 1499 + { 1500 + "BriefDescription": "Per-Logical Processor actual clocks when the Logical Processor is active.", 1501 + "MetricExpr": "CPU_CLK_UNHALTED.THREAD", 1502 + "MetricGroup": "Pipeline", 1503 + "MetricName": "tma_info_thread_clks" 1504 + }, 1505 + { 1506 + "BriefDescription": "Cycles Per Instruction (per Logical Processor)", 1507 + "MetricExpr": "1 / tma_info_thread_ipc", 1508 + "MetricGroup": "Mem;Pipeline", 1509 + "MetricName": "tma_info_thread_cpi" 1510 + }, 1511 + { 1512 + "BriefDescription": "The ratio of Executed- by Issued-Uops", 1513 + "MetricExpr": "UOPS_EXECUTED.THREAD / UOPS_ISSUED.ANY", 1514 + "MetricGroup": "Cor;Pipeline", 1515 + "MetricName": "tma_info_thread_execute_per_issue", 1516 + "PublicDescription": "The ratio of Executed- by Issued-Uops. Ratio > 1 suggests high rate of uop micro-fusions. Ratio < 1 suggests high rate of \"execute\" at rename stage." 
1517 + }, 1518 + { 1519 + "BriefDescription": "Instructions Per Cycle (per Logical Processor)", 1520 + "MetricExpr": "INST_RETIRED.ANY / tma_info_thread_clks", 1521 + "MetricGroup": "Ret;Summary", 1522 + "MetricName": "tma_info_thread_ipc" 1523 + }, 1524 + { 1525 + "BriefDescription": "Total issue-pipeline slots (per-Physical Core till ICL; per-Logical Processor ICL onward)", 1526 + "MetricExpr": "TOPDOWN.SLOTS", 1527 + "MetricGroup": "TmaL1;tma_L1_group", 1528 + "MetricName": "tma_info_thread_slots" 1529 + }, 1530 + { 1531 + "BriefDescription": "Fraction of Physical Core issue-slots utilized by this Logical Processor", 1532 + "MetricExpr": "(tma_info_thread_slots / (TOPDOWN.SLOTS / 2) if #SMT_on else 1)", 1533 + "MetricGroup": "SMT;TmaL1;tma_L1_group", 1534 + "MetricName": "tma_info_thread_slots_utilization" 603 1535 }, 604 1536 { 605 1537 "BriefDescription": "Uops Per Instruction", 606 - "MetricExpr": "tma_retiring * tma_info_slots / INST_RETIRED.ANY", 1538 + "MetricExpr": "tma_retiring * tma_info_thread_slots / INST_RETIRED.ANY", 607 1539 "MetricGroup": "Pipeline;Ret;Retire", 608 - "MetricName": "tma_info_uoppi", 609 - "MetricThreshold": "tma_info_uoppi > 1.05" 1540 + "MetricName": "tma_info_thread_uoppi", 1541 + "MetricThreshold": "tma_info_thread_uoppi > 1.05" 610 1542 }, 611 1543 { 612 1544 "BriefDescription": "Uops per taken branch", 613 - "MetricExpr": "tma_retiring * tma_info_slots / BR_INST_RETIRED.NEAR_TAKEN", 1545 + "MetricExpr": "tma_retiring * tma_info_thread_slots / BR_INST_RETIRED.NEAR_TAKEN", 614 1546 "MetricGroup": "Branches;Fed;FetchBW", 615 - "MetricName": "tma_info_uptb", 616 - "MetricThreshold": "tma_info_uptb < 7.5" 1547 + "MetricName": "tma_info_thread_uptb", 1548 + "MetricThreshold": "tma_info_thread_uptb < 7.5" 617 1549 }, 618 1550 { 619 1551 "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses", 620 - "MetricExpr": "ICACHE_64B.IFTAG_STALL / tma_info_clks", 1552 + 
"MetricExpr": "ICACHE_64B.IFTAG_STALL / tma_info_thread_clks", 621 1553 "MetricGroup": "BigFoot;FetchLat;MemoryTLB;TopdownL3;tma_L3_group;tma_fetch_latency_group", 622 1554 "MetricName": "tma_itlb_misses", 623 1555 "MetricThreshold": "tma_itlb_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", ··· 1092 1092 }, 1093 1093 { 1094 1094 "BriefDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache", 1095 - "MetricExpr": "max((CYCLE_ACTIVITY.STALLS_MEM_ANY - CYCLE_ACTIVITY.STALLS_L1D_MISS) / tma_info_clks, 0)", 1095 + "MetricExpr": "max((CYCLE_ACTIVITY.STALLS_MEM_ANY - CYCLE_ACTIVITY.STALLS_L1D_MISS) / tma_info_thread_clks, 0)", 1096 1096 "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_issueL1;tma_issueMC;tma_memory_bound_group", 1097 1097 "MetricName": "tma_l1_bound", 1098 1098 "MetricThreshold": "tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", ··· 1102 1102 { 1103 1103 "BriefDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads", 1104 1104 "MetricConstraint": "NO_GROUP_EVENTS", 1105 - "MetricExpr": "MEM_LOAD_RETIRED.L2_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) / (MEM_LOAD_RETIRED.L2_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + L1D_PEND_MISS.FB_FULL_PERIODS) * ((CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS) / tma_info_clks)", 1105 + "MetricExpr": "MEM_LOAD_RETIRED.L2_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) / (MEM_LOAD_RETIRED.L2_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + L1D_PEND_MISS.FB_FULL_PERIODS) * ((CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS) / tma_info_thread_clks)", 1106 1106 "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", 1107 1107 "MetricName": "tma_l2_bound", 1108 1108 "MetricThreshold": "tma_l2_bound > 0.05 & 
(tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", ··· 1111 1111 }, 1112 1112 { 1113 1113 "BriefDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core", 1114 - "MetricExpr": "(CYCLE_ACTIVITY.STALLS_L2_MISS - CYCLE_ACTIVITY.STALLS_L3_MISS) / tma_info_clks", 1114 + "MetricExpr": "(CYCLE_ACTIVITY.STALLS_L2_MISS - CYCLE_ACTIVITY.STALLS_L3_MISS) / tma_info_thread_clks", 1115 1115 "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", 1116 1116 "MetricName": "tma_l3_bound", 1117 1117 "MetricThreshold": "tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", ··· 1120 1120 }, 1121 1121 { 1122 1122 "BriefDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)", 1123 - "MetricExpr": "17.5 * tma_info_average_frequency * MEM_LOAD_RETIRED.L3_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_clks", 1123 + "MetricExpr": "17.5 * tma_info_system_average_frequency * MEM_LOAD_RETIRED.L3_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", 1124 1124 "MetricGroup": "MemoryLat;TopdownL4;tma_L4_group;tma_issueLat;tma_l3_bound_group", 1125 1125 "MetricName": "tma_l3_hit_latency", 1126 1126 "MetricThreshold": "tma_l3_hit_latency > 0.1 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", 1127 - "PublicDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited). Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance. Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS. 
Related metrics: tma_info_memory_latency, tma_mem_latency", 1127 + "PublicDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited). Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance. Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS. Related metrics: tma_info_bottleneck_memory_latency, tma_mem_latency", 1128 1128 "ScaleUnit": "100%" 1129 1129 }, 1130 1130 { 1131 1131 "BriefDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs)", 1132 - "MetricExpr": "ILD_STALL.LCP / tma_info_clks", 1132 + "MetricExpr": "ILD_STALL.LCP / tma_info_thread_clks", 1133 1133 "MetricGroup": "FetchLat;TopdownL3;tma_L3_group;tma_fetch_latency_group;tma_issueFB", 1134 1134 "MetricName": "tma_lcp", 1135 1135 "MetricThreshold": "tma_lcp > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", 1136 - "PublicDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs). Using proper compiler flags or Intel Compiler by default will certainly avoid this. #Link: Optimization Guide about LCP BKMs. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_dsb_coverage, tma_info_dsb_misses, tma_info_iptb", 1136 + "PublicDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs). Using proper compiler flags or Intel Compiler by default will certainly avoid this. #Link: Optimization Guide about LCP BKMs. 
Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_botlnk_l2_dsb_misses, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb", 1137 1137 "ScaleUnit": "100%" 1138 1138 }, 1139 1139 { ··· 1148 1148 }, 1149 1149 { 1150 1150 "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port for Load operations", 1151 - "MetricExpr": "UOPS_DISPATCHED.PORT_2_3 / (2 * tma_info_core_clks)", 1151 + "MetricExpr": "UOPS_DISPATCHED.PORT_2_3 / (2 * tma_info_core_core_clks)", 1152 1152 "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group", 1153 1153 "MetricName": "tma_load_op_utilization", 1154 1154 "MetricThreshold": "tma_load_op_utilization > 0.6", ··· 1165 1165 }, 1166 1166 { 1167 1167 "BriefDescription": "This metric estimates the fraction of cycles where the Second-level TLB (STLB) was missed by load accesses, performing a hardware page walk", 1168 - "MetricExpr": "DTLB_LOAD_MISSES.WALK_ACTIVE / tma_info_clks", 1168 + "MetricExpr": "DTLB_LOAD_MISSES.WALK_ACTIVE / tma_info_thread_clks", 1169 1169 "MetricGroup": "MemoryTLB;TopdownL5;tma_L5_group;tma_dtlb_load_group", 1170 1170 "MetricName": "tma_load_stlb_miss", 1171 1171 "MetricThreshold": "tma_load_stlb_miss > 0.05 & (tma_dtlb_load > 0.1 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))", ··· 1174 1174 { 1175 1175 "BriefDescription": "This metric represents fraction of cycles the CPU spent handling cache misses due to lock operations", 1176 1176 "MetricConstraint": "NO_GROUP_EVENTS", 1177 - "MetricExpr": "(16 * max(0, MEM_INST_RETIRED.LOCK_LOADS - L2_RQSTS.ALL_RFO) + MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES * (10 * L2_RQSTS.RFO_HIT + min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO))) / tma_info_clks", 1177 + "MetricExpr": "(16 * max(0, MEM_INST_RETIRED.LOCK_LOADS - L2_RQSTS.ALL_RFO) + MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES * (10 * L2_RQSTS.RFO_HIT + 
min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO))) / tma_info_thread_clks", 1178 1178 "MetricGroup": "Offcore;TopdownL4;tma_L4_group;tma_issueRFO;tma_l1_bound_group", 1179 1179 "MetricName": "tma_lock_latency", 1180 1180 "MetricThreshold": "tma_lock_latency > 0.2 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", ··· 1183 1183 }, 1184 1184 { 1185 1185 "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to LSD (Loop Stream Detector) unit", 1186 - "MetricExpr": "(LSD.CYCLES_ACTIVE - LSD.CYCLES_OK) / tma_info_core_clks / 2", 1186 + "MetricExpr": "(LSD.CYCLES_ACTIVE - LSD.CYCLES_OK) / tma_info_core_core_clks / 2", 1187 1187 "MetricGroup": "FetchBW;LSD;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group", 1188 1188 "MetricName": "tma_lsd", 1189 - "MetricThreshold": "tma_lsd > 0.15 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 5 > 0.35)", 1189 + "MetricThreshold": "tma_lsd > 0.15 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 5 > 0.35)", 1190 1190 "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to LSD (Loop Stream Detector) unit. LSD typically does well sustaining Uop supply. 
However; in some rare cases; optimal uop-delivery could not be reached for small loops whose size (in terms of number of uops) does not suit well the LSD structure.", 1191 1191 "ScaleUnit": "100%" 1192 1192 }, ··· 1202 1202 }, 1203 1203 { 1204 1204 "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM)", 1205 - "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, cpu@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=4@) / tma_info_clks", 1205 + "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, cpu@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=4@) / tma_info_thread_clks", 1206 1206 "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueBW", 1207 1207 "MetricName": "tma_mem_bandwidth", 1208 1208 "MetricThreshold": "tma_mem_bandwidth > 0.2 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", 1209 - "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_fb_full, tma_info_dram_bw_use, tma_info_memory_bandwidth, tma_sq_full", 1209 + "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. 
This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_fb_full, tma_info_bottleneck_memory_bandwidth, tma_info_system_dram_bw_use, tma_sq_full", 1210 1210 "ScaleUnit": "100%" 1211 1211 }, 1212 1212 { 1213 1213 "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM)", 1214 - "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD) / tma_info_clks - tma_mem_bandwidth", 1214 + "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD) / tma_info_thread_clks - tma_mem_bandwidth", 1215 1215 "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueLat", 1216 1216 "MetricName": "tma_mem_latency", 1217 1217 "MetricThreshold": "tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", 1218 - "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM). This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: tma_info_memory_latency, tma_l3_hit_latency", 1218 + "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM). This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). 
Related metrics: tma_info_bottleneck_memory_latency, tma_l3_hit_latency", 1219 1219 "ScaleUnit": "100%" 1220 1220 }, 1221 1221 { ··· 1239 1239 }, 1240 1240 { 1241 1241 "BriefDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit", 1242 - "MetricExpr": "tma_retiring * tma_info_slots / UOPS_ISSUED.ANY * IDQ.MS_UOPS / tma_info_slots", 1242 + "MetricExpr": "tma_retiring * tma_info_thread_slots / UOPS_ISSUED.ANY * IDQ.MS_UOPS / tma_info_thread_slots", 1243 1243 "MetricGroup": "MicroSeq;TopdownL3;tma_L3_group;tma_heavy_operations_group;tma_issueMC;tma_issueMS", 1244 1244 "MetricName": "tma_microcode_sequencer", 1245 1245 "MetricThreshold": "tma_microcode_sequencer > 0.05 & tma_heavy_operations > 0.1", ··· 1248 1248 }, 1249 1249 { 1250 1250 "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Branch Misprediction at execution stage", 1251 - "MetricExpr": "BR_MISP_RETIRED.ALL_BRANCHES / (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT) * INT_MISC.CLEAR_RESTEER_CYCLES / tma_info_clks", 1251 + "MetricExpr": "BR_MISP_RETIRED.ALL_BRANCHES / (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT) * INT_MISC.CLEAR_RESTEER_CYCLES / tma_info_thread_clks", 1252 1252 "MetricGroup": "BadSpec;BrMispredicts;TopdownL4;tma_L4_group;tma_branch_resteers_group;tma_issueBM", 1253 1253 "MetricName": "tma_mispredicts_resteers", 1254 1254 "MetricThreshold": "tma_mispredicts_resteers > 0.05 & (tma_branch_resteers > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15))", 1255 - "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Branch Misprediction at execution stage. Sample with: INT_MISC.CLEAR_RESTEER_CYCLES. 
Related metrics: tma_branch_mispredicts, tma_info_branch_misprediction_cost, tma_info_mispredictions", 1255 + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Branch Misprediction at execution stage. Sample with: INT_MISC.CLEAR_RESTEER_CYCLES. Related metrics: tma_branch_mispredicts, tma_info_bad_spec_branch_misprediction_cost, tma_info_bottleneck_mispredictions", 1256 1256 "ScaleUnit": "100%" 1257 1257 }, 1258 1258 { 1259 1259 "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline)", 1260 - "MetricExpr": "(IDQ.MITE_CYCLES_ANY - IDQ.MITE_CYCLES_OK) / tma_info_core_clks / 2", 1260 + "MetricExpr": "(IDQ.MITE_CYCLES_ANY - IDQ.MITE_CYCLES_OK) / tma_info_core_core_clks / 2", 1261 1261 "MetricGroup": "DSBmiss;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group", 1262 1262 "MetricName": "tma_mite", 1263 - "MetricThreshold": "tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 5 > 0.35)", 1263 + "MetricThreshold": "tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 5 > 0.35)", 1264 1264 "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline). This pipeline is used for code that was not pre-cached in the DSB or LSD. For example; inefficiencies due to asymmetric decoders; use of long immediate or LCP can manifest as MITE fetch bandwidth bottleneck. 
Sample with: FRONTEND_RETIRED.ANY_DSB_MISS", 1265 1265 "ScaleUnit": "100%" 1266 1266 }, 1267 1267 { 1268 1268 "BriefDescription": "This metric represents fraction of cycles where (only) 4 uops were delivered by the MITE pipeline", 1269 - "MetricExpr": "(cpu@IDQ.MITE_UOPS\\,cmask\\=4@ - cpu@IDQ.MITE_UOPS\\,cmask\\=5@) / tma_info_clks", 1269 + "MetricExpr": "(cpu@IDQ.MITE_UOPS\\,cmask\\=4@ - cpu@IDQ.MITE_UOPS\\,cmask\\=5@) / tma_info_thread_clks", 1270 1270 "MetricGroup": "DSBmiss;FetchBW;TopdownL4;tma_L4_group;tma_mite_group", 1271 1271 "MetricName": "tma_mite_4wide", 1272 - "MetricThreshold": "tma_mite_4wide > 0.05 & (tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 5 > 0.35))", 1272 + "MetricThreshold": "tma_mite_4wide > 0.05 & (tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 5 > 0.35))", 1273 1273 "ScaleUnit": "100%" 1274 1274 }, 1275 1275 { ··· 1283 1283 }, 1284 1284 { 1285 1285 "BriefDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS)", 1286 - "MetricExpr": "3 * IDQ.MS_SWITCHES / tma_info_clks", 1286 + "MetricExpr": "3 * IDQ.MS_SWITCHES / tma_info_thread_clks", 1287 1287 "MetricGroup": "FetchLat;MicroSeq;TopdownL3;tma_L3_group;tma_fetch_latency_group;tma_issueMC;tma_issueMS;tma_issueMV;tma_issueSO", 1288 1288 "MetricName": "tma_ms_switches", 1289 1289 "MetricThreshold": "tma_ms_switches > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", ··· 1292 1292 }, 1293 1293 { 1294 1294 "BriefDescription": "This metric represents fraction of slots where the CPU was retiring NOP (no op) instructions", 1295 - "MetricExpr": "tma_light_operations * INST_RETIRED.NOP / (tma_retiring * tma_info_slots)", 1295 + "MetricExpr": "tma_light_operations * INST_RETIRED.NOP / (tma_retiring * tma_info_thread_slots)", 1296 1296 "MetricGroup": 
"Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group", 1297 1297 "MetricName": "tma_nop_instructions", 1298 1298 "MetricThreshold": "tma_nop_instructions > 0.1 & tma_light_operations > 0.6", ··· 1311 1311 }, 1312 1312 { 1313 1313 "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 0 ([SNB+] ALU; [HSW+] ALU and 2nd branch)", 1314 - "MetricExpr": "UOPS_DISPATCHED.PORT_0 / tma_info_core_clks", 1314 + "MetricExpr": "UOPS_DISPATCHED.PORT_0 / tma_info_core_core_clks", 1315 1315 "MetricGroup": "Compute;TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P", 1316 1316 "MetricName": "tma_port_0", 1317 1317 "MetricThreshold": "tma_port_0 > 0.6", ··· 1320 1320 }, 1321 1321 { 1322 1322 "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 1 (ALU)", 1323 - "MetricExpr": "UOPS_DISPATCHED.PORT_1 / tma_info_core_clks", 1323 + "MetricExpr": "UOPS_DISPATCHED.PORT_1 / tma_info_core_core_clks", 1324 1324 "MetricGroup": "TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P", 1325 1325 "MetricName": "tma_port_1", 1326 1326 "MetricThreshold": "tma_port_1 > 0.6", ··· 1329 1329 }, 1330 1330 { 1331 1331 "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 5 ([SNB+] Branches and ALU; [HSW+] ALU)", 1332 - "MetricExpr": "UOPS_DISPATCHED.PORT_5 / tma_info_core_clks", 1332 + "MetricExpr": "UOPS_DISPATCHED.PORT_5 / tma_info_core_core_clks", 1333 1333 "MetricGroup": "TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P", 1334 1334 "MetricName": "tma_port_5", 1335 1335 "MetricThreshold": "tma_port_5 > 0.6", ··· 1338 1338 }, 1339 1339 { 1340 1340 "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+]Primary Branch and simple ALU)", 1341 - "MetricExpr": "UOPS_DISPATCHED.PORT_6 / tma_info_core_clks", 1341 + "MetricExpr": "UOPS_DISPATCHED.PORT_6 / 
tma_info_core_core_clks", 1342 1342 "MetricGroup": "TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P", 1343 1343 "MetricName": "tma_port_6", 1344 1344 "MetricThreshold": "tma_port_6 > 0.6", ··· 1347 1347 }, 1348 1348 { 1349 1349 "BriefDescription": "This metric estimates fraction of cycles the CPU performance was potentially limited due to Core computation issues (non divider-related)", 1350 - "MetricExpr": "((cpu@EXE_ACTIVITY.3_PORTS_UTIL\\,umask\\=0x80@ + tma_serializing_operation * (CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY) + (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL)) / tma_info_clks if ARITH.DIVIDER_ACTIVE < CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY else (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL) / tma_info_clks)", 1350 + "MetricExpr": "((cpu@EXE_ACTIVITY.3_PORTS_UTIL\\,umask\\=0x80@ + tma_serializing_operation * (CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY) + (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL)) / tma_info_thread_clks if ARITH.DIVIDER_ACTIVE < CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY else (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL) / tma_info_thread_clks)", 1351 1351 "MetricGroup": "PortsUtil;TopdownL3;tma_L3_group;tma_core_bound_group", 1352 1352 "MetricName": "tma_ports_utilization", 1353 1353 "MetricThreshold": "tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)", ··· 1356 1356 }, 1357 1357 { 1358 1358 "BriefDescription": "This metric represents fraction of cycles CPU executed no uops on any execution port (Logical Processor cycles since ICL, Physical Core cycles otherwise)", 1359 - "MetricExpr": "cpu@EXE_ACTIVITY.3_PORTS_UTIL\\,umask\\=0x80@ / tma_info_clks + tma_serializing_operation * (CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY) / tma_info_clks", 1359 + "MetricExpr": 
"cpu@EXE_ACTIVITY.3_PORTS_UTIL\\,umask\\=0x80@ / tma_info_thread_clks + tma_serializing_operation * (CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY) / tma_info_thread_clks", 1360 1360 "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group", 1361 1361 "MetricName": "tma_ports_utilized_0", 1362 1362 "MetricThreshold": "tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", ··· 1365 1365 }, 1366 1366 { 1367 1367 "BriefDescription": "This metric represents fraction of cycles where the CPU executed total of 1 uop per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", 1368 - "MetricExpr": "EXE_ACTIVITY.1_PORTS_UTIL / tma_info_clks", 1368 + "MetricExpr": "EXE_ACTIVITY.1_PORTS_UTIL / tma_info_thread_clks", 1369 1369 "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_issueL1;tma_ports_utilization_group", 1370 1370 "MetricName": "tma_ports_utilized_1", 1371 1371 "MetricThreshold": "tma_ports_utilized_1 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", ··· 1374 1374 }, 1375 1375 { 1376 1376 "BriefDescription": "This metric represents fraction of cycles CPU executed total of 2 uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", 1377 - "MetricExpr": "EXE_ACTIVITY.2_PORTS_UTIL / tma_info_clks", 1377 + "MetricExpr": "EXE_ACTIVITY.2_PORTS_UTIL / tma_info_thread_clks", 1378 1378 "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_issue2P;tma_ports_utilization_group", 1379 1379 "MetricName": "tma_ports_utilized_2", 1380 1380 "MetricThreshold": "tma_ports_utilized_2 > 0.15 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", ··· 1383 1383 }, 1384 1384 { 1385 1385 "BriefDescription": "This metric represents fraction of cycles CPU executed total of 3 or more uops per cycle on all execution ports (Logical Processor 
cycles since ICL, Physical Core cycles otherwise)", 1386 - "MetricExpr": "UOPS_EXECUTED.CYCLES_GE_3 / tma_info_clks", 1386 + "MetricExpr": "UOPS_EXECUTED.CYCLES_GE_3 / tma_info_thread_clks", 1387 1387 "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group", 1388 1388 "MetricName": "tma_ports_utilized_3m", 1389 1389 "MetricThreshold": "tma_ports_utilized_3m > 0.7 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", ··· 1392 1392 }, 1393 1393 { 1394 1394 "BriefDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired", 1395 - "MetricExpr": "topdown\\-retiring / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 0 * tma_info_slots", 1395 + "MetricExpr": "topdown\\-retiring / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 0 * tma_info_thread_slots", 1396 1396 "MetricGroup": "TmaL1;TopdownL1;tma_L1_group", 1397 1397 "MetricName": "tma_retiring", 1398 1398 "MetricThreshold": "tma_retiring > 0.7 | tma_heavy_operations > 0.1", ··· 1402 1402 }, 1403 1403 { 1404 1404 "BriefDescription": "This metric represents fraction of cycles the CPU issue-pipeline was stalled due to serializing operations", 1405 - "MetricExpr": "RESOURCE_STALLS.SCOREBOARD / tma_info_clks", 1405 + "MetricExpr": "RESOURCE_STALLS.SCOREBOARD / tma_info_thread_clks", 1406 1406 "MetricGroup": "PortsUtil;TopdownL5;tma_L5_group;tma_issueSO;tma_ports_utilized_0_group", 1407 1407 "MetricName": "tma_serializing_operation", 1408 1408 "MetricThreshold": "tma_serializing_operation > 0.1 & (tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)))", ··· 1411 1411 }, 1412 1412 { 1413 1413 "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to PAUSE Instructions", 1414 - "MetricExpr": "140 * MISC_RETIRED.PAUSE_INST / 
tma_info_clks", 1414 + "MetricExpr": "140 * MISC_RETIRED.PAUSE_INST / tma_info_thread_clks", 1415 1415 "MetricGroup": "TopdownL6;tma_L6_group;tma_serializing_operation_group", 1416 1416 "MetricName": "tma_slow_pause", 1417 1417 "MetricThreshold": "tma_slow_pause > 0.05 & (tma_serializing_operation > 0.1 & (tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))))", ··· 1420 1420 }, 1421 1421 { 1422 1422 "BriefDescription": "This metric estimates fraction of cycles handling memory load split accesses - load that cross 64-byte cache line boundary", 1423 - "MetricExpr": "tma_info_load_miss_real_latency * LD_BLOCKS.NO_SR / tma_info_clks", 1423 + "MetricExpr": "tma_info_memory_load_miss_real_latency * LD_BLOCKS.NO_SR / tma_info_thread_clks", 1424 1424 "MetricGroup": "TopdownL4;tma_L4_group;tma_l1_bound_group", 1425 1425 "MetricName": "tma_split_loads", 1426 1426 "MetricThreshold": "tma_split_loads > 0.2 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", ··· 1429 1429 }, 1430 1430 { 1431 1431 "BriefDescription": "This metric represents rate of split store accesses", 1432 - "MetricExpr": "MEM_INST_RETIRED.SPLIT_STORES / tma_info_core_clks", 1432 + "MetricExpr": "MEM_INST_RETIRED.SPLIT_STORES / tma_info_core_core_clks", 1433 1433 "MetricGroup": "TopdownL4;tma_L4_group;tma_issueSpSt;tma_store_bound_group", 1434 1434 "MetricName": "tma_split_stores", 1435 1435 "MetricThreshold": "tma_split_stores > 0.2 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", ··· 1438 1438 }, 1439 1439 { 1440 1440 "BriefDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors)", 1441 - "MetricExpr": "L1D_PEND_MISS.L2_STALL / tma_info_clks", 1441 + "MetricExpr": "L1D_PEND_MISS.L2_STALL / tma_info_thread_clks", 1442 1442 "MetricGroup": 
"MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_issueBW;tma_l3_bound_group", 1443 1443 "MetricName": "tma_sq_full", 1444 1444 "MetricThreshold": "tma_sq_full > 0.3 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", 1445 - "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). Related metrics: tma_fb_full, tma_info_dram_bw_use, tma_info_memory_bandwidth, tma_mem_bandwidth", 1445 + "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). Related metrics: tma_fb_full, tma_info_bottleneck_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth", 1446 1446 "ScaleUnit": "100%" 1447 1447 }, 1448 1448 { 1449 1449 "BriefDescription": "This metric estimates how often CPU was stalled due to RFO store memory accesses; RFO store issue a read-for-ownership request before the write", 1450 - "MetricExpr": "EXE_ACTIVITY.BOUND_ON_STORES / tma_info_clks", 1450 + "MetricExpr": "EXE_ACTIVITY.BOUND_ON_STORES / tma_info_thread_clks", 1451 1451 "MetricGroup": "MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", 1452 1452 "MetricName": "tma_store_bound", 1453 1453 "MetricThreshold": "tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", ··· 1456 1456 }, 1457 1457 { 1458 1458 "BriefDescription": "This metric roughly estimates fraction of cycles when the memory subsystem had loads blocked since they could not forward data from earlier (in program order) overlapping stores", 1459 - "MetricExpr": "13 * LD_BLOCKS.STORE_FORWARD / tma_info_clks", 1459 + "MetricExpr": "13 * LD_BLOCKS.STORE_FORWARD / tma_info_thread_clks", 1460 1460 "MetricGroup": "TopdownL4;tma_L4_group;tma_l1_bound_group", 1461 1461 "MetricName": "tma_store_fwd_blk", 1462 1462 "MetricThreshold": 
"tma_store_fwd_blk > 0.1 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", ··· 1465 1465 }, 1466 1466 { 1467 1467 "BriefDescription": "This metric estimates fraction of cycles the CPU spent handling L1D store misses", 1468 - "MetricExpr": "(L2_RQSTS.RFO_HIT * 10 * (1 - MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES) + (1 - MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES) * min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO)) / tma_info_clks", 1468 + "MetricExpr": "(L2_RQSTS.RFO_HIT * 10 * (1 - MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES) + (1 - MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES) * min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO)) / tma_info_thread_clks", 1469 1469 "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_L4_group;tma_issueRFO;tma_issueSL;tma_store_bound_group", 1470 1470 "MetricName": "tma_store_latency", 1471 1471 "MetricThreshold": "tma_store_latency > 0.1 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", ··· 1474 1474 }, 1475 1475 { 1476 1476 "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port for Store operations", 1477 - "MetricExpr": "(UOPS_DISPATCHED.PORT_4_9 + UOPS_DISPATCHED.PORT_7_8) / (4 * tma_info_core_clks)", 1477 + "MetricExpr": "(UOPS_DISPATCHED.PORT_4_9 + UOPS_DISPATCHED.PORT_7_8) / (4 * tma_info_core_core_clks)", 1478 1478 "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group", 1479 1479 "MetricName": "tma_store_op_utilization", 1480 1480 "MetricThreshold": "tma_store_op_utilization > 0.6", ··· 1491 1491 }, 1492 1492 { 1493 1493 "BriefDescription": "This metric estimates the fraction of cycles where the STLB was missed by store accesses, performing a hardware page walk", 1494 - "MetricExpr": "DTLB_STORE_MISSES.WALK_ACTIVE / tma_info_core_clks", 1494 + "MetricExpr": "DTLB_STORE_MISSES.WALK_ACTIVE / 
tma_info_core_core_clks", 1495 1495 "MetricGroup": "MemoryTLB;TopdownL5;tma_L5_group;tma_dtlb_store_group", 1496 1496 "MetricName": "tma_store_stlb_miss", 1497 1497 "MetricThreshold": "tma_store_stlb_miss > 0.05 & (tma_dtlb_store > 0.05 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))", ··· 1499 1499 }, 1500 1500 { 1501 1501 "BriefDescription": "This metric estimates how often CPU was stalled due to Streaming store memory accesses; Streaming store optimize out a read request required by RFO stores", 1502 - "MetricExpr": "9 * OCR.STREAMING_WR.ANY_RESPONSE / tma_info_clks", 1502 + "MetricExpr": "9 * OCR.STREAMING_WR.ANY_RESPONSE / tma_info_thread_clks", 1503 1503 "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_issueSmSt;tma_store_bound_group", 1504 1504 "MetricName": "tma_streaming_stores", 1505 1505 "MetricThreshold": "tma_streaming_stores > 0.2 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", ··· 1508 1508 }, 1509 1509 { 1510 1510 "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears", 1511 - "MetricExpr": "10 * BACLEARS.ANY / tma_info_clks", 1511 + "MetricExpr": "10 * BACLEARS.ANY / tma_info_thread_clks", 1512 1512 "MetricGroup": "BigFoot;FetchLat;TopdownL4;tma_L4_group;tma_branch_resteers_group", 1513 1513 "MetricName": "tma_unknown_branches", 1514 1514 "MetricThreshold": "tma_unknown_branches > 0.05 & (tma_branch_resteers > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15))",