perf vendor events intel: Update skylake/skylakex events/metrics

+2 -2

tools/perf/pmu-events/arch/x86/mapfile.csv

··· 26 26 GenuineIntel-6-(8F|CF),v1.13,sapphirerapids,core 27 27 GenuineIntel-6-AF,v1.00,sierraforest,core 28 28 GenuineIntel-6-(37|4A|4C|4D|5A),v15,silvermont,core 29 - GenuineIntel-6-(4E|5E|8E|9E|A5|A6),v55,skylake,core 30 - GenuineIntel-6-55-[01234],v1.29,skylakex,core 29 + GenuineIntel-6-(4E|5E|8E|9E|A5|A6),v56,skylake,core 30 + GenuineIntel-6-55-[01234],v1.30,skylakex,core 31 31 GenuineIntel-6-86,v1.20,snowridgex,core 32 32 GenuineIntel-6-8[CD],v1.10,tigerlake,core 33 33 GenuineIntel-6-2C,v4,westmereep-dp,core

+8

tools/perf/pmu-events/arch/x86/skylake/floating-point.json

··· 32 32 "UMask": "0x20" 33 33 }, 34 34 { 35 + "BriefDescription": "Number of SSE/AVX computational 128-bit packed single and 256-bit packed double precision FP instructions retired; some instructions will count twice as noted below. Each count represents 2 or/and 4 computation operations, 1 for each element. Applies to SSE* and AVX* packed single precision and packed double precision FP instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX RCP14 RSQRT14 SQRT DPP FM(N)ADD/SUB. DPP and FM(N)ADD/SUB count twice as they perform 2 calculations per element.", 36 + "EventCode": "0xC7", 37 + "EventName": "FP_ARITH_INST_RETIRED.4_FLOPS", 38 + "PublicDescription": "Number of SSE/AVX computational 128-bit packed single precision and 256-bit packed double precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 2 or/and 4 computation operations, one for each element. Applies to SSE* and AVX* packed single precision floating-point and packed double precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX RCP14 RSQRT14 SQRT DPP FM(N)ADD/SUB. DPP and FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events.", 39 + "SampleAfterValue": "1000003", 40 + "UMask": "0x18" 41 + }, 42 + { 35 43 "BriefDescription": "Counts once for most SIMD scalar computational floating-point instructions retired. Counts twice for DPP and FM(N)ADD/SUB instructions retired.", 36 44 "EventCode": "0xC7", 37 45 "EventName": "FP_ARITH_INST_RETIRED.SCALAR",

+12 -3

tools/perf/pmu-events/arch/x86/skylake/pipeline.json

··· 26 26 "UMask": "0x4" 27 27 }, 28 28 { 29 - "BriefDescription": "Conditional branch instructions retired.", 29 + "BriefDescription": "Conditional branch instructions retired. [This event is alias to BR_INST_RETIRED.CONDITIONAL]", 30 + "Errata": "SKL091", 31 + "EventCode": "0xC4", 32 + "EventName": "BR_INST_RETIRED.COND", 33 + "PublicDescription": "This event counts conditional branch instructions retired. [This event is alias to BR_INST_RETIRED.CONDITIONAL]", 34 + "SampleAfterValue": "400009", 35 + "UMask": "0x1" 36 + }, 37 + { 38 + "BriefDescription": "Conditional branch instructions retired. [This event is alias to BR_INST_RETIRED.COND]", 30 39 "Errata": "SKL091", 31 40 "EventCode": "0xC4", 32 41 "EventName": "BR_INST_RETIRED.CONDITIONAL", 33 42 "PEBS": "1", 34 - "PublicDescription": "This event counts conditional branch instructions retired.", 43 + "PublicDescription": "This event counts conditional branch instructions retired. [This event is alias to BR_INST_RETIRED.COND]", 35 44 "SampleAfterValue": "400009", 36 45 "UMask": "0x1" 37 46 }, ··· 414 405 "UMask": "0x1" 415 406 }, 416 407 { 417 - "AnyThread": "1", 418 408 "BriefDescription": "Clears speculative count", 419 409 "CounterMask": "1", 410 + "EdgeDetect": "1", 420 411 "EventCode": "0x0D", 421 412 "EventName": "INT_MISC.CLEARS_COUNT", 422 413 "PublicDescription": "Counts the number of speculative clears due to any type of branch misprediction or machine clears",

+615 -616

tools/perf/pmu-events/arch/x86/skylake/skl-metrics.json

··· 50 50 }, 51 51 { 52 52 "BriefDescription": "Uncore frequency per die [GHZ]", 53 - "MetricExpr": "tma_info_socket_clks / #num_dies / duration_time / 1e9", 53 + "MetricExpr": "tma_info_system_socket_clks / #num_dies / duration_time / 1e9", 54 54 "MetricGroup": "SoC", 55 55 "MetricName": "UNCORE_FREQ" 56 56 }, ··· 71 71 }, 72 72 { 73 73 "BriefDescription": "This metric estimates how often memory load accesses were aliased by preceding stores (in program order) with a 4K address offset", 74 - "MetricExpr": "LD_BLOCKS_PARTIAL.ADDRESS_ALIAS / tma_info_clks", 74 + "MetricExpr": "LD_BLOCKS_PARTIAL.ADDRESS_ALIAS / tma_info_thread_clks", 75 75 "MetricGroup": "TopdownL4;tma_L4_group;tma_l1_bound_group", 76 76 "MetricName": "tma_4k_aliasing", 77 77 "MetricThreshold": "tma_4k_aliasing > 0.2 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", ··· 80 80 }, 81 81 { 82 82 "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution ports for ALU operations.", 83 - "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_0 + UOPS_DISPATCHED_PORT.PORT_1 + UOPS_DISPATCHED_PORT.PORT_5 + UOPS_DISPATCHED_PORT.PORT_6) / tma_info_slots", 83 + "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_0 + UOPS_DISPATCHED_PORT.PORT_1 + UOPS_DISPATCHED_PORT.PORT_5 + UOPS_DISPATCHED_PORT.PORT_6) / tma_info_thread_slots", 84 84 "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group", 85 85 "MetricName": "tma_alu_op_utilization", 86 86 "MetricThreshold": "tma_alu_op_utilization > 0.6", ··· 88 88 }, 89 89 { 90 90 "BriefDescription": "This metric estimates fraction of slots the CPU retired uops delivered by the Microcode_Sequencer as a result of Assists", 91 - "MetricExpr": "100 * (FP_ASSIST.ANY + OTHER_ASSISTS.ANY) / tma_info_slots", 91 + "MetricExpr": "100 * (FP_ASSIST.ANY + OTHER_ASSISTS.ANY) / tma_info_thread_slots", 92 92 "MetricGroup": "TopdownL4;tma_L4_group;tma_microcode_sequencer_group", 93 93 "MetricName": "tma_assists", 94 94 
"MetricThreshold": "tma_assists > 0.1 & (tma_microcode_sequencer > 0.05 & tma_heavy_operations > 0.1)", ··· 97 97 }, 98 98 { 99 99 "BriefDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend", 100 - "MetricExpr": "1 - tma_frontend_bound - (UOPS_ISSUED.ANY + 4 * (INT_MISC.RECOVERY_CYCLES_ANY / 2 if #SMT_on else INT_MISC.RECOVERY_CYCLES)) / tma_info_slots", 100 + "MetricExpr": "1 - tma_frontend_bound - (UOPS_ISSUED.ANY + 4 * (INT_MISC.RECOVERY_CYCLES_ANY / 2 if #SMT_on else INT_MISC.RECOVERY_CYCLES)) / tma_info_thread_slots", 101 101 "MetricGroup": "TmaL1;TopdownL1;tma_L1_group", 102 102 "MetricName": "tma_backend_bound", 103 103 "MetricThreshold": "tma_backend_bound > 0.2", ··· 107 107 }, 108 108 { 109 109 "BriefDescription": "This category represents fraction of slots wasted due to incorrect speculations", 110 - "MetricExpr": "(UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * (INT_MISC.RECOVERY_CYCLES_ANY / 2 if #SMT_on else INT_MISC.RECOVERY_CYCLES)) / tma_info_slots", 110 + "MetricExpr": "(UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * (INT_MISC.RECOVERY_CYCLES_ANY / 2 if #SMT_on else INT_MISC.RECOVERY_CYCLES)) / tma_info_thread_slots", 111 111 "MetricGroup": "TmaL1;TopdownL1;tma_L1_group", 112 112 "MetricName": "tma_bad_speculation", 113 113 "MetricThreshold": "tma_bad_speculation > 0.15", ··· 123 123 "MetricName": "tma_branch_mispredicts", 124 124 "MetricThreshold": "tma_branch_mispredicts > 0.1 & tma_bad_speculation > 0.15", 125 125 "MetricgroupNoGroup": "TopdownL2", 126 - "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction. These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path. Sample with: BR_MISP_RETIRED.ALL_BRANCHES. 
Related metrics: tma_info_branch_misprediction_cost, tma_info_mispredictions, tma_mispredicts_resteers", 126 + "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction. These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path. Sample with: BR_MISP_RETIRED.ALL_BRANCHES. Related metrics: tma_info_bad_spec_branch_misprediction_cost, tma_info_bottleneck_mispredictions, tma_mispredicts_resteers", 127 127 "ScaleUnit": "100%" 128 128 }, 129 129 { 130 130 "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers", 131 - "MetricExpr": "INT_MISC.CLEAR_RESTEER_CYCLES / tma_info_clks + tma_unknown_branches", 131 + "MetricExpr": "INT_MISC.CLEAR_RESTEER_CYCLES / tma_info_thread_clks + tma_unknown_branches", 132 132 "MetricGroup": "FetchLat;TopdownL3;tma_L3_group;tma_fetch_latency_group", 133 133 "MetricName": "tma_branch_resteers", 134 134 "MetricThreshold": "tma_branch_resteers > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", ··· 146 146 }, 147 147 { 148 148 "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Machine Clears", 149 - "MetricExpr": "(1 - BR_MISP_RETIRED.ALL_BRANCHES / (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT)) * INT_MISC.CLEAR_RESTEER_CYCLES / tma_info_clks", 149 + "MetricExpr": "(1 - BR_MISP_RETIRED.ALL_BRANCHES / (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT)) * INT_MISC.CLEAR_RESTEER_CYCLES / tma_info_thread_clks", 150 150 "MetricGroup": "BadSpec;MachineClears;TopdownL4;tma_L4_group;tma_branch_resteers_group;tma_issueMC", 151 151 "MetricName": "tma_clears_resteers", 152 152 "MetricThreshold": "tma_clears_resteers > 0.05 & (tma_branch_resteers > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15))", ··· 156 156 { 157 
157 "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to contested accesses", 158 158 "MetricConstraint": "NO_GROUP_EVENTS", 159 - "MetricExpr": "(18.5 * tma_info_average_frequency * MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM + 16.5 * tma_info_average_frequency * MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_clks", 159 + "MetricExpr": "(18.5 * tma_info_system_average_frequency * MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM + 16.5 * tma_info_system_average_frequency * MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", 160 160 "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_l3_bound_group", 161 161 "MetricName": "tma_contested_accesses", 162 162 "MetricThreshold": "tma_contested_accesses > 0.05 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", ··· 177 177 { 178 178 "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to data-sharing accesses", 179 179 "MetricConstraint": "NO_GROUP_EVENTS", 180 - "MetricExpr": "16.5 * tma_info_average_frequency * MEM_LOAD_L3_HIT_RETIRED.XSNP_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_clks", 180 + "MetricExpr": "16.5 * tma_info_system_average_frequency * MEM_LOAD_L3_HIT_RETIRED.XSNP_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", 181 181 "MetricGroup": "Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_l3_bound_group", 182 182 "MetricName": "tma_data_sharing", 183 183 "MetricThreshold": "tma_data_sharing > 0.05 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", ··· 186 186 }, 187 187 { 188 188 "BriefDescription": "This metric represents fraction of cycles where decoder-0 was the only active decoder", 189 - 
"MetricExpr": "(cpu@INST_DECODED.DECODERS\\,cmask\\=1@ - cpu@INST_DECODED.DECODERS\\,cmask\\=2@) / tma_info_core_clks / 2", 189 + "MetricExpr": "(cpu@INST_DECODED.DECODERS\\,cmask\\=1@ - cpu@INST_DECODED.DECODERS\\,cmask\\=2@) / tma_info_core_core_clks / 2", 190 190 "MetricGroup": "DSBmiss;FetchBW;TopdownL4;tma_L4_group;tma_issueD0;tma_mite_group", 191 191 "MetricName": "tma_decoder0_alone", 192 - "MetricThreshold": "tma_decoder0_alone > 0.1 & (tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 4 > 0.35))", 192 + "MetricThreshold": "tma_decoder0_alone > 0.1 & (tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 4 > 0.35))", 193 193 "PublicDescription": "This metric represents fraction of cycles where decoder-0 was the only active decoder. Related metrics: tma_few_uops_instructions", 194 194 "ScaleUnit": "100%" 195 195 }, 196 196 { 197 197 "BriefDescription": "This metric represents fraction of cycles where the Divider unit was active", 198 - "MetricExpr": "ARITH.DIVIDER_ACTIVE / tma_info_clks", 198 + "MetricExpr": "ARITH.DIVIDER_ACTIVE / tma_info_thread_clks", 199 199 "MetricGroup": "TopdownL3;tma_L3_group;tma_core_bound_group", 200 200 "MetricName": "tma_divider", 201 201 "MetricThreshold": "tma_divider > 0.2 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)", ··· 205 205 { 206 206 "BriefDescription": "This metric estimates how often the CPU was stalled on accesses to external memory (DRAM) by loads", 207 207 "MetricConstraint": "NO_GROUP_EVENTS", 208 - "MetricExpr": "CYCLE_ACTIVITY.STALLS_L3_MISS / tma_info_clks + (CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS) / tma_info_clks - tma_l2_bound", 208 + "MetricExpr": "CYCLE_ACTIVITY.STALLS_L3_MISS / tma_info_thread_clks + (CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS) / tma_info_thread_clks - tma_l2_bound", 209 209 "MetricGroup": "MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", 
210 210 "MetricName": "tma_dram_bound", 211 211 "MetricThreshold": "tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", ··· 214 214 }, 215 215 { 216 216 "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline", 217 - "MetricExpr": "(IDQ.ALL_DSB_CYCLES_ANY_UOPS - IDQ.ALL_DSB_CYCLES_4_UOPS) / tma_info_core_clks / 2", 217 + "MetricExpr": "(IDQ.ALL_DSB_CYCLES_ANY_UOPS - IDQ.ALL_DSB_CYCLES_4_UOPS) / tma_info_core_core_clks / 2", 218 218 "MetricGroup": "DSB;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group", 219 219 "MetricName": "tma_dsb", 220 - "MetricThreshold": "tma_dsb > 0.15 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 4 > 0.35)", 220 + "MetricThreshold": "tma_dsb > 0.15 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 4 > 0.35)", 221 221 "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline. For example; inefficient utilization of the DSB cache structure or bank conflict when reading from it; are categorized here.", 222 222 "ScaleUnit": "100%" 223 223 }, 224 224 { 225 225 "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines", 226 - "MetricExpr": "DSB2MITE_SWITCHES.PENALTY_CYCLES / tma_info_clks", 226 + "MetricExpr": "DSB2MITE_SWITCHES.PENALTY_CYCLES / tma_info_thread_clks", 227 227 "MetricGroup": "DSBmiss;FetchLat;TopdownL3;tma_L3_group;tma_fetch_latency_group;tma_issueFB", 228 228 "MetricName": "tma_dsb_switches", 229 229 "MetricThreshold": "tma_dsb_switches > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", 230 - "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines. 
The DSB (decoded i-cache) is a Uop Cache where the front-end directly delivers Uops (micro operations) avoiding heavy x86 decoding. The DSB pipeline has shorter latency and delivered higher bandwidth than the MITE (legacy instruction decode pipeline). Switching between the two pipelines can cause penalties hence this metric measures the exposed penalty. Sample with: FRONTEND_RETIRED.DSB_MISS_PS. Related metrics: tma_fetch_bandwidth, tma_info_dsb_coverage, tma_info_dsb_misses, tma_info_iptb, tma_lcp", 230 + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines. The DSB (decoded i-cache) is a Uop Cache where the front-end directly delivers Uops (micro operations) avoiding heavy x86 decoding. The DSB pipeline has shorter latency and delivered higher bandwidth than the MITE (legacy instruction decode pipeline). Switching between the two pipelines can cause penalties hence this metric measures the exposed penalty. Sample with: FRONTEND_RETIRED.DSB_MISS_PS. 
Related metrics: tma_fetch_bandwidth, tma_info_botlnk_l2_dsb_misses, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb, tma_lcp", 231 231 "ScaleUnit": "100%" 232 232 }, 233 233 { 234 234 "BriefDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses", 235 235 "MetricConstraint": "NO_GROUP_EVENTS_NMI", 236 - "MetricExpr": "min(9 * cpu@DTLB_LOAD_MISSES.STLB_HIT\\,cmask\\=1@ + DTLB_LOAD_MISSES.WALK_ACTIVE, max(CYCLE_ACTIVITY.CYCLES_MEM_ANY - CYCLE_ACTIVITY.CYCLES_L1D_MISS, 0)) / tma_info_clks", 236 + "MetricExpr": "min(9 * cpu@DTLB_LOAD_MISSES.STLB_HIT\\,cmask\\=1@ + DTLB_LOAD_MISSES.WALK_ACTIVE, max(CYCLE_ACTIVITY.CYCLES_MEM_ANY - CYCLE_ACTIVITY.CYCLES_L1D_MISS, 0)) / tma_info_thread_clks", 237 237 "MetricGroup": "MemoryTLB;TopdownL4;tma_L4_group;tma_issueTLB;tma_l1_bound_group", 238 238 "MetricName": "tma_dtlb_load", 239 239 "MetricThreshold": "tma_dtlb_load > 0.1 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", 240 - "PublicDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses. TLBs (Translation Look-aside Buffers) are processor caches for recently used entries out of the Page Tables that are used to map virtual- to physical-addresses by the operating system. This metric approximates the potential delay of demand loads missing the first-level data TLB (assuming worst case scenario with back to back misses to different pages). This includes hitting in the second-level TLB (STLB) as well as performing a hardware page walk on an STLB miss. Sample with: MEM_INST_RETIRED.STLB_MISS_LOADS_PS. Related metrics: tma_dtlb_store, tma_info_memory_data_tlbs", 240 + "PublicDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses. 
TLBs (Translation Look-aside Buffers) are processor caches for recently used entries out of the Page Tables that are used to map virtual- to physical-addresses by the operating system. This metric approximates the potential delay of demand loads missing the first-level data TLB (assuming worst case scenario with back to back misses to different pages). This includes hitting in the second-level TLB (STLB) as well as performing a hardware page walk on an STLB miss. Sample with: MEM_INST_RETIRED.STLB_MISS_LOADS_PS. Related metrics: tma_dtlb_store, tma_info_bottleneck_memory_data_tlbs", 241 241 "ScaleUnit": "100%" 242 242 }, 243 243 { 244 244 "BriefDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses", 245 - "MetricExpr": "(9 * cpu@DTLB_STORE_MISSES.STLB_HIT\\,cmask\\=1@ + DTLB_STORE_MISSES.WALK_ACTIVE) / tma_info_core_clks", 245 + "MetricExpr": "(9 * cpu@DTLB_STORE_MISSES.STLB_HIT\\,cmask\\=1@ + DTLB_STORE_MISSES.WALK_ACTIVE) / tma_info_core_core_clks", 246 246 "MetricGroup": "MemoryTLB;TopdownL4;tma_L4_group;tma_issueTLB;tma_store_bound_group", 247 247 "MetricName": "tma_dtlb_store", 248 248 "MetricThreshold": "tma_dtlb_store > 0.05 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", 249 - "PublicDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses. As with ordinary data caching; focus on improving data locality and reducing working-set size to reduce DTLB overhead. Additionally; consider using profile-guided optimization (PGO) to collocate frequently-used data on the same page. Try using larger page sizes for large amounts of frequently-used data. Sample with: MEM_INST_RETIRED.STLB_MISS_STORES_PS. Related metrics: tma_dtlb_load, tma_info_memory_data_tlbs", 249 + "PublicDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses. 
As with ordinary data caching; focus on improving data locality and reducing working-set size to reduce DTLB overhead. Additionally; consider using profile-guided optimization (PGO) to collocate frequently-used data on the same page. Try using larger page sizes for large amounts of frequently-used data. Sample with: MEM_INST_RETIRED.STLB_MISS_STORES_PS. Related metrics: tma_dtlb_load, tma_info_bottleneck_memory_data_tlbs", 250 250 "ScaleUnit": "100%" 251 251 }, 252 252 { 253 253 "BriefDescription": "This metric roughly estimates how often CPU was handling synchronizations due to False Sharing", 254 254 "MetricConstraint": "NO_GROUP_EVENTS", 255 - "MetricExpr": "22 * tma_info_average_frequency * OFFCORE_RESPONSE.DEMAND_RFO.L3_HIT.SNOOP_HITM / tma_info_clks", 255 + "MetricExpr": "22 * tma_info_system_average_frequency * OFFCORE_RESPONSE.DEMAND_RFO.L3_HIT.SNOOP_HITM / tma_info_thread_clks", 256 256 "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_store_bound_group", 257 257 "MetricName": "tma_false_sharing", 258 258 "MetricThreshold": "tma_false_sharing > 0.05 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", ··· 262 262 { 263 263 "BriefDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed", 264 264 "MetricConstraint": "NO_GROUP_EVENTS_NMI", 265 - "MetricExpr": "tma_info_load_miss_real_latency * cpu@L1D_PEND_MISS.FB_FULL\\,cmask\\=1@ / tma_info_clks", 265 + "MetricExpr": "tma_info_memory_load_miss_real_latency * cpu@L1D_PEND_MISS.FB_FULL\\,cmask\\=1@ / tma_info_thread_clks", 266 266 "MetricGroup": "MemoryBW;TopdownL4;tma_L4_group;tma_issueBW;tma_issueSL;tma_issueSmSt;tma_l1_bound_group", 267 267 "MetricName": "tma_fb_full", 268 268 "MetricThreshold": "tma_fb_full > 0.3", 269 - "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional 
L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory). Related metrics: tma_info_dram_bw_use, tma_info_memory_bandwidth, tma_mem_bandwidth, tma_sq_full, tma_store_latency, tma_streaming_stores", 269 + "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory). Related metrics: tma_info_bottleneck_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full, tma_store_latency, tma_streaming_stores", 270 270 "ScaleUnit": "100%" 271 271 }, 272 272 { ··· 274 274 "MetricExpr": "tma_frontend_bound - tma_fetch_latency", 275 275 "MetricGroup": "FetchBW;Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group;tma_issueFB", 276 276 "MetricName": "tma_fetch_bandwidth", 277 - "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 4 > 0.35", 277 + "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 4 > 0.35", 278 278 "MetricgroupNoGroup": "TopdownL2", 279 - "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Sample with: FRONTEND_RETIRED.LATENCY_GE_2_BUBBLES_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_2_PS. 
Related metrics: tma_dsb_switches, tma_info_dsb_coverage, tma_info_dsb_misses, tma_info_iptb, tma_lcp", 279 + "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Sample with: FRONTEND_RETIRED.LATENCY_GE_2_BUBBLES_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_2_PS. Related metrics: tma_dsb_switches, tma_info_botlnk_l2_dsb_misses, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb, tma_lcp", 280 280 "ScaleUnit": "100%" 281 281 }, 282 282 { 283 283 "BriefDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues", 284 - "MetricExpr": "4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / tma_info_slots", 284 + "MetricExpr": "4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / tma_info_thread_slots", 285 285 "MetricGroup": "Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group", 286 286 "MetricName": "tma_fetch_latency", 287 287 "MetricThreshold": "tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15", ··· 347 347 }, 348 348 { 349 349 "BriefDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend", 350 - "MetricExpr": "IDQ_UOPS_NOT_DELIVERED.CORE / tma_info_slots", 350 + "MetricExpr": "IDQ_UOPS_NOT_DELIVERED.CORE / tma_info_thread_slots", 351 351 "MetricGroup": "PGO;TmaL1;TopdownL1;tma_L1_group", 352 352 "MetricName": "tma_frontend_bound", 353 353 "MetricThreshold": "tma_frontend_bound > 0.15", ··· 366 366 }, 367 367 { 368 368 "BriefDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences", 369 - 
"MetricExpr": "(UOPS_RETIRED.RETIRE_SLOTS + UOPS_RETIRED.MACRO_FUSED - INST_RETIRED.ANY) / tma_info_slots", 369 + "MetricExpr": "(UOPS_RETIRED.RETIRE_SLOTS + UOPS_RETIRED.MACRO_FUSED - INST_RETIRED.ANY) / tma_info_thread_slots", 370 370 "MetricGroup": "Retire;TmaL2;TopdownL2;tma_L2_group;tma_retiring_group", 371 371 "MetricName": "tma_heavy_operations", 372 372 "MetricThreshold": "tma_heavy_operations > 0.1", ··· 376 376 }, 377 377 { 378 378 "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses", 379 - "MetricExpr": "(ICACHE_16B.IFDATA_STALL + 2 * cpu@ICACHE_16B.IFDATA_STALL\\,cmask\\=1\\,edge@) / tma_info_clks", 379 + "MetricExpr": "(ICACHE_16B.IFDATA_STALL + 2 * cpu@ICACHE_16B.IFDATA_STALL\\,cmask\\=1\\,edge@) / tma_info_thread_clks", 380 380 "MetricGroup": "BigFoot;FetchLat;IcMiss;TopdownL3;tma_L3_group;tma_fetch_latency_group", 381 381 "MetricName": "tma_icache_misses", 382 382 "MetricThreshold": "tma_icache_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", ··· 384 384 "ScaleUnit": "100%" 385 385 }, 386 386 { 387 - "BriefDescription": "Measured Average Frequency for unhalted processors [GHz]", 388 - "MetricExpr": "tma_info_turbo_utilization * TSC / 1e9 / duration_time", 389 - "MetricGroup": "Power;Summary", 390 - "MetricName": "tma_info_average_frequency" 391 - }, 392 - { 393 - "BriefDescription": "Total pipeline cost of instruction fetch related bottlenecks by large code footprint programs (i-side cache; TLB and BTB misses)", 394 - "MetricConstraint": "NO_GROUP_EVENTS", 395 - "MetricExpr": "100 * tma_fetch_latency * (tma_itlb_misses + tma_icache_misses + tma_unknown_branches) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)", 396 - "MetricGroup": "BigFoot;Fed;Frontend;IcMiss;MemoryTLB;tma_issueBC", 397 - "MetricName": "tma_info_big_code", 398 - "MetricThreshold": "tma_info_big_code > 20", 399 - "PublicDescription": 
"Total pipeline cost of instruction fetch related bottlenecks by large code footprint programs (i-side cache; TLB and BTB misses). Related metrics: tma_info_branching_overhead" 400 - }, 401 - { 402 - "BriefDescription": "Branch instructions per taken branch.", 403 - "MetricExpr": "BR_INST_RETIRED.ALL_BRANCHES / BR_INST_RETIRED.NEAR_TAKEN", 404 - "MetricGroup": "Branches;Fed;PGO", 405 - "MetricName": "tma_info_bptkbranch" 406 - }, 407 - { 408 387 "BriefDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear)", 409 - "MetricExpr": "(tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) * tma_info_slots / BR_MISP_RETIRED.ALL_BRANCHES", 388 + "MetricExpr": "(tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) * tma_info_thread_slots / BR_MISP_RETIRED.ALL_BRANCHES", 410 389 "MetricGroup": "Bad;BrMispredicts;tma_issueBM", 411 - "MetricName": "tma_info_branch_misprediction_cost", 412 - "PublicDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear). Related metrics: tma_branch_mispredicts, tma_info_mispredictions, tma_mispredicts_resteers" 390 + "MetricName": "tma_info_bad_spec_branch_misprediction_cost", 391 + "PublicDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear). 
Related metrics: tma_branch_mispredicts, tma_info_bottleneck_mispredictions, tma_mispredicts_resteers" 413 392 }, 414 393 { 415 - "BriefDescription": "Total pipeline cost of branch related instructions (used for program control-flow including function calls)", 416 - "MetricExpr": "100 * ((BR_INST_RETIRED.CONDITIONAL + 3 * BR_INST_RETIRED.NEAR_CALL + (BR_INST_RETIRED.NEAR_TAKEN - (BR_INST_RETIRED.CONDITIONAL - BR_INST_RETIRED.NOT_TAKEN) - 2 * BR_INST_RETIRED.NEAR_CALL)) / tma_info_slots)", 417 - "MetricGroup": "Ret;tma_issueBC", 418 - "MetricName": "tma_info_branching_overhead", 419 - "MetricThreshold": "tma_info_branching_overhead > 10", 420 - "PublicDescription": "Total pipeline cost of branch related instructions (used for program control-flow including function calls). Related metrics: tma_info_big_code" 394 + "BriefDescription": "Instructions per retired mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate).", 395 + "MetricExpr": "tma_info_inst_mix_instructions / (UOPS_RETIRED.RETIRE_SLOTS / UOPS_ISSUED.ANY * cpu@BR_MISP_EXEC.ALL_BRANCHES\\,umask\\=0xE4@)", 396 + "MetricGroup": "Bad;BrMispredicts", 397 + "MetricName": "tma_info_bad_spec_ipmisp_indirect", 398 + "MetricThreshold": "tma_info_bad_spec_ipmisp_indirect < 1e3" 421 399 }, 422 400 { 423 - "BriefDescription": "Fraction of branches that are CALL or RET", 424 - "MetricExpr": "(BR_INST_RETIRED.NEAR_CALL + BR_INST_RETIRED.NEAR_RETURN) / BR_INST_RETIRED.ALL_BRANCHES", 425 - "MetricGroup": "Bad;Branches", 426 - "MetricName": "tma_info_callret" 427 - }, 428 - { 429 - "BriefDescription": "Per-Logical Processor actual clocks when the Logical Processor is active.", 430 - "MetricExpr": "CPU_CLK_UNHALTED.THREAD", 431 - "MetricGroup": "Pipeline", 432 - "MetricName": "tma_info_clks" 433 - }, 434 - { 435 - "BriefDescription": "STLB (2nd level TLB) code speculative misses per kilo instruction (misses of any page-size that complete the page walk)", 436 - "MetricExpr": "1e3 * 
ITLB_MISSES.WALK_COMPLETED / INST_RETIRED.ANY", 437 - "MetricGroup": "Fed;MemoryTLB", 438 - "MetricName": "tma_info_code_stlb_mpki" 439 - }, 440 - { 441 - "BriefDescription": "Fraction of branches that are non-taken conditionals", 442 - "MetricExpr": "BR_INST_RETIRED.NOT_TAKEN / BR_INST_RETIRED.ALL_BRANCHES", 443 - "MetricGroup": "Bad;Branches;CodeGen;PGO", 444 - "MetricName": "tma_info_cond_nt" 445 - }, 446 - { 447 - "BriefDescription": "Fraction of branches that are taken conditionals", 448 - "MetricExpr": "(BR_INST_RETIRED.CONDITIONAL - BR_INST_RETIRED.NOT_TAKEN) / BR_INST_RETIRED.ALL_BRANCHES", 449 - "MetricGroup": "Bad;Branches;CodeGen;PGO", 450 - "MetricName": "tma_info_cond_tk" 401 + "BriefDescription": "Number of Instructions per non-speculative Branch Misprediction (JEClear) (lower number means higher occurrence rate)", 402 + "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.ALL_BRANCHES", 403 + "MetricGroup": "Bad;BadSpec;BrMispredicts", 404 + "MetricName": "tma_info_bad_spec_ipmispredict", 405 + "MetricThreshold": "tma_info_bad_spec_ipmispredict < 200" 451 406 }, 452 407 { 453 408 "BriefDescription": "Probability of Core Bound bottleneck hidden by SMT-profiling artifacts", 454 409 "MetricConstraint": "NO_GROUP_EVENTS", 455 - "MetricExpr": "(100 * (1 - tma_core_bound / tma_ports_utilization if tma_core_bound < tma_ports_utilization else 1) if tma_info_smt_2t_utilization > 0.5 else 0)", 410 + "MetricExpr": "(100 * (1 - tma_core_bound / tma_ports_utilization if tma_core_bound < tma_ports_utilization else 1) if tma_info_system_smt_2t_utilization > 0.5 else 0)", 456 411 "MetricGroup": "Cor;SMT", 457 - "MetricName": "tma_info_core_bound_likely", 458 - "MetricThreshold": "tma_info_core_bound_likely > 0.5" 459 - }, 460 - { 461 - "BriefDescription": "Core actual clocks when any Logical Processor is active on the Physical Core", 462 - "MetricExpr": "(CPU_CLK_UNHALTED.THREAD / 2 * (1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK) if #core_wide < 
1 else (CPU_CLK_UNHALTED.THREAD_ANY / 2 if #SMT_on else tma_info_clks))", 463 - "MetricGroup": "SMT", 464 - "MetricName": "tma_info_core_clks" 465 - }, 466 - { 467 - "BriefDescription": "Instructions Per Cycle across hyper-threads (per physical core)", 468 - "MetricExpr": "INST_RETIRED.ANY / tma_info_core_clks", 469 - "MetricGroup": "Ret;SMT;TmaL1;tma_L1_group", 470 - "MetricName": "tma_info_coreipc" 471 - }, 472 - { 473 - "BriefDescription": "Cycles Per Instruction (per Logical Processor)", 474 - "MetricExpr": "1 / tma_info_ipc", 475 - "MetricGroup": "Mem;Pipeline", 476 - "MetricName": "tma_info_cpi" 477 - }, 478 - { 479 - "BriefDescription": "Average CPU Utilization", 480 - "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / TSC", 481 - "MetricGroup": "HPC;Summary", 482 - "MetricName": "tma_info_cpu_utilization" 483 - }, 484 - { 485 - "BriefDescription": "Average Parallel L2 cache miss data reads", 486 - "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD", 487 - "MetricGroup": "Memory_BW;Offcore", 488 - "MetricName": "tma_info_data_l2_mlp" 489 - }, 490 - { 491 - "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]", 492 - "MetricExpr": "64 * (UNC_ARB_TRK_REQUESTS.ALL + UNC_ARB_COH_TRK_REQUESTS.ALL) / 1e6 / duration_time / 1e3", 493 - "MetricGroup": "HPC;Mem;MemoryBW;SoC;tma_issueBW", 494 - "MetricName": "tma_info_dram_bw_use", 495 - "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. 
Related metrics: tma_fb_full, tma_info_memory_bandwidth, tma_mem_bandwidth, tma_sq_full" 496 - }, 497 - { 498 - "BriefDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache)", 499 - "MetricExpr": "IDQ.DSB_UOPS / (IDQ.DSB_UOPS + IDQ.MITE_UOPS + IDQ.MS_UOPS)", 500 - "MetricGroup": "DSB;Fed;FetchBW;tma_issueFB", 501 - "MetricName": "tma_info_dsb_coverage", 502 - "MetricThreshold": "tma_info_dsb_coverage < 0.7 & tma_info_ipc / 4 > 0.35", 503 - "PublicDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache). Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_dsb_misses, tma_info_iptb, tma_lcp" 412 + "MetricName": "tma_info_botlnk_l0_core_bound_likely", 413 + "MetricThreshold": "tma_info_botlnk_l0_core_bound_likely > 0.5" 504 414 }, 505 415 { 506 416 "BriefDescription": "Total pipeline cost of DSB (uop cache) misses - subset of the Instruction_Fetch_BW Bottleneck", 507 417 "MetricConstraint": "NO_GROUP_EVENTS", 508 418 "MetricExpr": "100 * (tma_fetch_latency * tma_dsb_switches / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) + tma_fetch_bandwidth * tma_mite / (tma_dsb + tma_mite))", 509 419 "MetricGroup": "DSBmiss;Fed;tma_issueFB", 510 - "MetricName": "tma_info_dsb_misses", 511 - "MetricThreshold": "tma_info_dsb_misses > 10", 512 - "PublicDescription": "Total pipeline cost of DSB (uop cache) misses - subset of the Instruction_Fetch_BW Bottleneck. 
Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_dsb_coverage, tma_info_iptb, tma_lcp" 513 - }, 514 - { 515 - "BriefDescription": "Average number of cycles of a switch from the DSB fetch-unit to MITE fetch unit - see DSB_Switches tree node for details.", 516 - "MetricExpr": "DSB2MITE_SWITCHES.PENALTY_CYCLES / DSB2MITE_SWITCHES.COUNT", 517 - "MetricGroup": "DSBmiss", 518 - "MetricName": "tma_info_dsb_switch_cost" 519 - }, 520 - { 521 - "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-thread", 522 - "MetricExpr": "UOPS_EXECUTED.THREAD / cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@", 523 - "MetricGroup": "Cor;Pipeline;PortsUtil;SMT", 524 - "MetricName": "tma_info_execute" 525 - }, 526 - { 527 - "BriefDescription": "The ratio of Executed- by Issued-Uops", 528 - "MetricExpr": "UOPS_EXECUTED.THREAD / UOPS_ISSUED.ANY", 529 - "MetricGroup": "Cor;Pipeline", 530 - "MetricName": "tma_info_execute_per_issue", 531 - "PublicDescription": "The ratio of Executed- by Issued-Uops. Ratio > 1 suggests high rate of uop micro-fusions. Ratio < 1 suggest high rate of \"execute\" at rename stage." 
532 - }, 533 - { 534 - "BriefDescription": "Fill Buffer (FB) hits per kilo instructions for retired demand loads (L1D misses that merge into ongoing miss-handling entries)", 535 - "MetricExpr": "1e3 * MEM_LOAD_RETIRED.FB_HIT / INST_RETIRED.ANY", 536 - "MetricGroup": "CacheMisses;Mem", 537 - "MetricName": "tma_info_fb_hpki" 538 - }, 539 - { 540 - "BriefDescription": "Average number of Uops issued by front-end when it issued something", 541 - "MetricExpr": "UOPS_ISSUED.ANY / cpu@UOPS_ISSUED.ANY\\,cmask\\=1@", 542 - "MetricGroup": "Fed;FetchBW", 543 - "MetricName": "tma_info_fetch_upc" 544 - }, 545 - { 546 - "BriefDescription": "Floating Point Operations Per Cycle", 547 - "MetricConstraint": "NO_GROUP_EVENTS", 548 - "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) / tma_info_core_clks", 549 - "MetricGroup": "Flops;Ret", 550 - "MetricName": "tma_info_flopc" 551 - }, 552 - { 553 - "BriefDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width)", 554 - "MetricConstraint": "NO_GROUP_EVENTS", 555 - "MetricExpr": "(cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ + cpu@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE\\,umask\\=0x3c@) / (2 * tma_info_core_clks)", 556 - "MetricGroup": "Cor;Flops;HPC", 557 - "MetricName": "tma_info_fp_arith_utilization", 558 - "PublicDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width). Values > 1 are possible due to ([BDW+] Fused-Multiply Add (FMA) counting - common; [ADL+] use all of ADD/MUL/FMA in Scalar or 128/256-bit vectors - less common)." 
559 - }, 560 - { 561 - "BriefDescription": "Giga Floating Point Operations Per Second", 562 - "MetricConstraint": "NO_GROUP_EVENTS", 563 - "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) / 1e9 / duration_time", 564 - "MetricGroup": "Cor;Flops;HPC", 565 - "MetricName": "tma_info_gflops", 566 - "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width and AMX engine." 420 + "MetricName": "tma_info_botlnk_l2_dsb_misses", 421 + "MetricThreshold": "tma_info_botlnk_l2_dsb_misses > 10", 422 + "PublicDescription": "Total pipeline cost of DSB (uop cache) misses - subset of the Instruction_Fetch_BW Bottleneck. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb, tma_lcp" 567 423 }, 568 424 { 569 425 "BriefDescription": "Total pipeline cost of Instruction Cache misses - subset of the Big_Code Bottleneck", 570 426 "MetricExpr": "100 * (tma_fetch_latency * tma_icache_misses / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches))", 571 427 "MetricGroup": "Fed;FetchLat;IcMiss;tma_issueFL", 572 - "MetricName": "tma_info_ic_misses", 573 - "MetricThreshold": "tma_info_ic_misses > 5", 428 + "MetricName": "tma_info_botlnk_l2_ic_misses", 429 + "MetricThreshold": "tma_info_botlnk_l2_ic_misses > 5", 574 430 "PublicDescription": "Total pipeline cost of Instruction Cache misses - subset of the Big_Code Bottleneck. 
Related metrics: " 575 431 }, 576 432 { 577 - "BriefDescription": "Average Latency for L1 instruction cache misses", 578 - "MetricExpr": "ICACHE_16B.IFDATA_STALL / cpu@ICACHE_16B.IFDATA_STALL\\,cmask\\=1\\,edge@ + 2", 579 - "MetricGroup": "Fed;FetchLat;IcMiss", 580 - "MetricName": "tma_info_icache_miss_latency" 433 + "BriefDescription": "Total pipeline cost of instruction fetch related bottlenecks by large code footprint programs (i-side cache; TLB and BTB misses)", 434 + "MetricConstraint": "NO_GROUP_EVENTS", 435 + "MetricExpr": "100 * tma_fetch_latency * (tma_itlb_misses + tma_icache_misses + tma_unknown_branches) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)", 436 + "MetricGroup": "BigFoot;Fed;Frontend;IcMiss;MemoryTLB;tma_issueBC", 437 + "MetricName": "tma_info_bottleneck_big_code", 438 + "MetricThreshold": "tma_info_bottleneck_big_code > 20", 439 + "PublicDescription": "Total pipeline cost of instruction fetch related bottlenecks by large code footprint programs (i-side cache; TLB and BTB misses). 
Related metrics: tma_info_bottleneck_branching_overhead" 581 440 }, 582 441 { 583 - "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-core", 584 - "MetricExpr": "UOPS_EXECUTED.THREAD / (UOPS_EXECUTED.CORE_CYCLES_GE_1 / 2 if #SMT_on else UOPS_EXECUTED.CORE_CYCLES_GE_1)", 585 - "MetricGroup": "Backend;Cor;Pipeline;PortsUtil", 586 - "MetricName": "tma_info_ilp" 442 + "BriefDescription": "Total pipeline cost of branch related instructions (used for program control-flow including function calls)", 443 + "MetricExpr": "100 * ((BR_INST_RETIRED.CONDITIONAL + 3 * BR_INST_RETIRED.NEAR_CALL + (BR_INST_RETIRED.NEAR_TAKEN - (BR_INST_RETIRED.CONDITIONAL - BR_INST_RETIRED.NOT_TAKEN) - 2 * BR_INST_RETIRED.NEAR_CALL)) / tma_info_thread_slots)", 444 + "MetricGroup": "Ret;tma_issueBC", 445 + "MetricName": "tma_info_bottleneck_branching_overhead", 446 + "MetricThreshold": "tma_info_bottleneck_branching_overhead > 10", 447 + "PublicDescription": "Total pipeline cost of branch related instructions (used for program control-flow including function calls). 
Related metrics: tma_info_bottleneck_big_code" 587 448 }, 588 449 { 589 450 "BriefDescription": "Total pipeline cost of instruction fetch bandwidth related bottlenecks", 590 451 "MetricConstraint": "NO_GROUP_EVENTS", 591 - "MetricExpr": "100 * (tma_frontend_bound - tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) - tma_info_big_code", 452 + "MetricExpr": "100 * (tma_frontend_bound - tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) - tma_info_bottleneck_big_code", 592 453 "MetricGroup": "Fed;FetchBW;Frontend", 593 - "MetricName": "tma_info_instruction_fetch_bw", 594 - "MetricThreshold": "tma_info_instruction_fetch_bw > 20" 595 - }, 596 - { 597 - "BriefDescription": "Total number of retired Instructions", 598 - "MetricExpr": "INST_RETIRED.ANY", 599 - "MetricGroup": "Summary;TmaL1;tma_L1_group", 600 - "MetricName": "tma_info_instructions", 601 - "PublicDescription": "Total number of retired Instructions. Sample with: INST_RETIRED.PREC_DIST" 602 - }, 603 - { 604 - "BriefDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate)", 605 - "MetricConstraint": "NO_GROUP_EVENTS", 606 - "MetricExpr": "INST_RETIRED.ANY / (cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ + cpu@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE\\,umask\\=0x3c@)", 607 - "MetricGroup": "Flops;InsType", 608 - "MetricName": "tma_info_iparith", 609 - "MetricThreshold": "tma_info_iparith < 10", 610 - "PublicDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. Approximated prior to BDW." 
611 - }, 612 - { 613 - "BriefDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate)", 614 - "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE)", 615 - "MetricGroup": "Flops;FpVector;InsType", 616 - "MetricName": "tma_info_iparith_avx128", 617 - "MetricThreshold": "tma_info_iparith_avx128 < 10", 618 - "PublicDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." 619 - }, 620 - { 621 - "BriefDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate)", 622 - "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE)", 623 - "MetricGroup": "Flops;FpVector;InsType", 624 - "MetricName": "tma_info_iparith_avx256", 625 - "MetricThreshold": "tma_info_iparith_avx256 < 10", 626 - "PublicDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." 627 - }, 628 - { 629 - "BriefDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate)", 630 - "MetricExpr": "INST_RETIRED.ANY / FP_ARITH_INST_RETIRED.SCALAR_DOUBLE", 631 - "MetricGroup": "Flops;FpScalar;InsType", 632 - "MetricName": "tma_info_iparith_scalar_dp", 633 - "MetricThreshold": "tma_info_iparith_scalar_dp < 10", 634 - "PublicDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." 
635 - }, 636 - { 637 - "BriefDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate)", 638 - "MetricExpr": "INST_RETIRED.ANY / FP_ARITH_INST_RETIRED.SCALAR_SINGLE", 639 - "MetricGroup": "Flops;FpScalar;InsType", 640 - "MetricName": "tma_info_iparith_scalar_sp", 641 - "MetricThreshold": "tma_info_iparith_scalar_sp < 10", 642 - "PublicDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." 643 - }, 644 - { 645 - "BriefDescription": "Instructions per Branch (lower number means higher occurrence rate)", 646 - "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.ALL_BRANCHES", 647 - "MetricGroup": "Branches;Fed;InsType", 648 - "MetricName": "tma_info_ipbranch", 649 - "MetricThreshold": "tma_info_ipbranch < 8" 650 - }, 651 - { 652 - "BriefDescription": "Instructions Per Cycle (per Logical Processor)", 653 - "MetricExpr": "INST_RETIRED.ANY / tma_info_clks", 654 - "MetricGroup": "Ret;Summary", 655 - "MetricName": "tma_info_ipc" 656 - }, 657 - { 658 - "BriefDescription": "Instructions per (near) call (lower number means higher occurrence rate)", 659 - "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.NEAR_CALL", 660 - "MetricGroup": "Branches;Fed;PGO", 661 - "MetricName": "tma_info_ipcall", 662 - "MetricThreshold": "tma_info_ipcall < 200" 663 - }, 664 - { 665 - "BriefDescription": "Instructions per non-speculative DSB miss (lower number means higher occurrence rate)", 666 - "MetricExpr": "INST_RETIRED.ANY / FRONTEND_RETIRED.ANY_DSB_MISS", 667 - "MetricGroup": "DSBmiss;Fed", 668 - "MetricName": "tma_info_ipdsb_miss_ret", 669 - "MetricThreshold": "tma_info_ipdsb_miss_ret < 50" 670 - }, 671 - { 672 - "BriefDescription": "Instructions per Far Branch ( Far Branches apply upon transition from application to operating system, handling interrupts, exceptions) [lower number means higher occurrence 
rate]", 673 - "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.FAR_BRANCH:u", 674 - "MetricGroup": "Branches;OS", 675 - "MetricName": "tma_info_ipfarbranch", 676 - "MetricThreshold": "tma_info_ipfarbranch < 1e6" 677 - }, 678 - { 679 - "BriefDescription": "Instructions per Floating Point (FP) Operation (lower number means higher occurrence rate)", 680 - "MetricConstraint": "NO_GROUP_EVENTS", 681 - "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE)", 682 - "MetricGroup": "Flops;InsType", 683 - "MetricName": "tma_info_ipflop", 684 - "MetricThreshold": "tma_info_ipflop < 10" 685 - }, 686 - { 687 - "BriefDescription": "Instructions per Load (lower number means higher occurrence rate)", 688 - "MetricExpr": "INST_RETIRED.ANY / MEM_INST_RETIRED.ALL_LOADS", 689 - "MetricGroup": "InsType", 690 - "MetricName": "tma_info_ipload", 691 - "MetricThreshold": "tma_info_ipload < 3" 692 - }, 693 - { 694 - "BriefDescription": "Instructions per retired mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate).", 695 - "MetricExpr": "tma_info_instructions / (UOPS_RETIRED.RETIRE_SLOTS / UOPS_ISSUED.ANY * cpu@BR_MISP_EXEC.ALL_BRANCHES\\,umask\\=0xE4@)", 696 - "MetricGroup": "Bad;BrMispredicts", 697 - "MetricName": "tma_info_ipmisp_indirect", 698 - "MetricThreshold": "tma_info_ipmisp_indirect < 1e3" 699 - }, 700 - { 701 - "BriefDescription": "Number of Instructions per non-speculative Branch Misprediction (JEClear) (lower number means higher occurrence rate)", 702 - "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.ALL_BRANCHES", 703 - "MetricGroup": "Bad;BadSpec;BrMispredicts", 704 - "MetricName": "tma_info_ipmispredict", 705 - "MetricThreshold": "tma_info_ipmispredict < 200" 706 - }, 707 - { 708 - 
"BriefDescription": "Instructions per Store (lower number means higher occurrence rate)", 709 - "MetricExpr": "INST_RETIRED.ANY / MEM_INST_RETIRED.ALL_STORES", 710 - "MetricGroup": "InsType", 711 - "MetricName": "tma_info_ipstore", 712 - "MetricThreshold": "tma_info_ipstore < 8" 713 - }, 714 - { 715 - "BriefDescription": "Instructions per Software prefetch instruction (of any type: NTA/T0/T1/T2/Prefetch) (lower number means higher occurrence rate)", 716 - "MetricExpr": "INST_RETIRED.ANY / cpu@SW_PREFETCH_ACCESS.T0\\,umask\\=0xF@", 717 - "MetricGroup": "Prefetches", 718 - "MetricName": "tma_info_ipswpf", 719 - "MetricThreshold": "tma_info_ipswpf < 100" 720 - }, 721 - { 722 - "BriefDescription": "Instruction per taken branch", 723 - "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.NEAR_TAKEN", 724 - "MetricGroup": "Branches;Fed;FetchBW;Frontend;PGO;tma_issueFB", 725 - "MetricName": "tma_info_iptb", 726 - "MetricThreshold": "tma_info_iptb < 9", 727 - "PublicDescription": "Instruction per taken branch. 
Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_dsb_coverage, tma_info_dsb_misses, tma_lcp" 728 - }, 729 - { 730 - "BriefDescription": "Instructions per speculative Unknown Branch Misprediction (BAClear) (lower number means higher occurrence rate)", 731 - "MetricExpr": "tma_info_instructions / BACLEARS.ANY", 732 - "MetricGroup": "Fed", 733 - "MetricName": "tma_info_ipunknown_branch" 734 - }, 735 - { 736 - "BriefDescription": "Fraction of branches that are unconditional (direct or indirect) jumps", 737 - "MetricConstraint": "NO_GROUP_EVENTS", 738 - "MetricExpr": "(BR_INST_RETIRED.NEAR_TAKEN - (BR_INST_RETIRED.CONDITIONAL - BR_INST_RETIRED.NOT_TAKEN) - 2 * BR_INST_RETIRED.NEAR_CALL) / BR_INST_RETIRED.ALL_BRANCHES", 739 - "MetricGroup": "Bad;Branches", 740 - "MetricName": "tma_info_jump" 741 - }, 742 - { 743 - "BriefDescription": "Cycles Per Instruction for the Operating System (OS) Kernel mode", 744 - "MetricExpr": "CPU_CLK_UNHALTED.THREAD_P:k / INST_RETIRED.ANY_P:k", 745 - "MetricGroup": "OS", 746 - "MetricName": "tma_info_kernel_cpi" 747 - }, 748 - { 749 - "BriefDescription": "Fraction of cycles spent in the Operating System (OS) Kernel mode", 750 - "MetricExpr": "CPU_CLK_UNHALTED.THREAD_P:k / CPU_CLK_UNHALTED.THREAD", 751 - "MetricGroup": "OS", 752 - "MetricName": "tma_info_kernel_utilization", 753 - "MetricThreshold": "tma_info_kernel_utilization > 0.05" 754 - }, 755 - { 756 - "BriefDescription": "Average per-core data fill bandwidth to the L1 data cache [GB / sec]", 757 - "MetricExpr": "64 * L1D.REPLACEMENT / 1e9 / duration_time", 758 - "MetricGroup": "Mem;MemoryBW", 759 - "MetricName": "tma_info_l1d_cache_fill_bw" 760 - }, 761 - { 762 - "BriefDescription": "Average per-thread data fill bandwidth to the L1 data cache [GB / sec]", 763 - "MetricExpr": "tma_info_l1d_cache_fill_bw", 764 - "MetricGroup": "Mem;MemoryBW", 765 - "MetricName": "tma_info_l1d_cache_fill_bw_1t" 766 - }, 767 - { 768 - "BriefDescription": "L1 cache true misses per kilo 
instruction for retired demand loads", 769 - "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L1_MISS / INST_RETIRED.ANY", 770 - "MetricGroup": "CacheMisses;Mem", 771 - "MetricName": "tma_info_l1mpki" 772 - }, 773 - { 774 - "BriefDescription": "L1 cache true misses per kilo instruction for all demand loads (including speculative)", 775 - "MetricExpr": "1e3 * L2_RQSTS.ALL_DEMAND_DATA_RD / INST_RETIRED.ANY", 776 - "MetricGroup": "CacheMisses;Mem", 777 - "MetricName": "tma_info_l1mpki_load" 778 - }, 779 - { 780 - "BriefDescription": "Average per-core data fill bandwidth to the L2 cache [GB / sec]", 781 - "MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / duration_time", 782 - "MetricGroup": "Mem;MemoryBW", 783 - "MetricName": "tma_info_l2_cache_fill_bw" 784 - }, 785 - { 786 - "BriefDescription": "Average per-thread data fill bandwidth to the L2 cache [GB / sec]", 787 - "MetricExpr": "tma_info_l2_cache_fill_bw", 788 - "MetricGroup": "Mem;MemoryBW", 789 - "MetricName": "tma_info_l2_cache_fill_bw_1t" 790 - }, 791 - { 792 - "BriefDescription": "L2 cache hits per kilo instruction for all request types (including speculative)", 793 - "MetricExpr": "1e3 * (L2_RQSTS.REFERENCES - L2_RQSTS.MISS) / INST_RETIRED.ANY", 794 - "MetricGroup": "CacheMisses;Mem", 795 - "MetricName": "tma_info_l2hpki_all" 796 - }, 797 - { 798 - "BriefDescription": "L2 cache hits per kilo instruction for all demand loads (including speculative)", 799 - "MetricExpr": "1e3 * L2_RQSTS.DEMAND_DATA_RD_HIT / INST_RETIRED.ANY", 800 - "MetricGroup": "CacheMisses;Mem", 801 - "MetricName": "tma_info_l2hpki_load" 802 - }, 803 - { 804 - "BriefDescription": "L2 cache true misses per kilo instruction for retired demand loads", 805 - "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L2_MISS / INST_RETIRED.ANY", 806 - "MetricGroup": "Backend;CacheMisses;Mem", 807 - "MetricName": "tma_info_l2mpki" 808 - }, 809 - { 810 - "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all request types (including speculative)", 811 - 
"MetricExpr": "1e3 * L2_RQSTS.MISS / INST_RETIRED.ANY", 812 - "MetricGroup": "CacheMisses;Mem;Offcore", 813 - "MetricName": "tma_info_l2mpki_all" 814 - }, 815 - { 816 - "BriefDescription": "L2 cache true code cacheline misses per kilo instruction", 817 - "MetricExpr": "1e3 * FRONTEND_RETIRED.L2_MISS / INST_RETIRED.ANY", 818 - "MetricGroup": "IcMiss", 819 - "MetricName": "tma_info_l2mpki_code" 820 - }, 821 - { 822 - "BriefDescription": "L2 cache speculative code cacheline misses per kilo instruction", 823 - "MetricExpr": "1e3 * L2_RQSTS.CODE_RD_MISS / INST_RETIRED.ANY", 824 - "MetricGroup": "IcMiss", 825 - "MetricName": "tma_info_l2mpki_code_all" 826 - }, 827 - { 828 - "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all demand loads (including speculative)", 829 - "MetricExpr": "1e3 * L2_RQSTS.DEMAND_DATA_RD_MISS / INST_RETIRED.ANY", 830 - "MetricGroup": "CacheMisses;Mem", 831 - "MetricName": "tma_info_l2mpki_load" 832 - }, 833 - { 834 - "BriefDescription": "Average per-core data access bandwidth to the L3 cache [GB / sec]", 835 - "MetricExpr": "64 * OFFCORE_REQUESTS.ALL_REQUESTS / 1e9 / duration_time", 836 - "MetricGroup": "Mem;MemoryBW;Offcore", 837 - "MetricName": "tma_info_l3_cache_access_bw" 838 - }, 839 - { 840 - "BriefDescription": "Average per-thread data access bandwidth to the L3 cache [GB / sec]", 841 - "MetricExpr": "tma_info_l3_cache_access_bw", 842 - "MetricGroup": "Mem;MemoryBW;Offcore", 843 - "MetricName": "tma_info_l3_cache_access_bw_1t" 844 - }, 845 - { 846 - "BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]", 847 - "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / duration_time", 848 - "MetricGroup": "Mem;MemoryBW", 849 - "MetricName": "tma_info_l3_cache_fill_bw" 850 - }, 851 - { 852 - "BriefDescription": "Average per-thread data fill bandwidth to the L3 cache [GB / sec]", 853 - "MetricExpr": "tma_info_l3_cache_fill_bw", 854 - "MetricGroup": "Mem;MemoryBW", 855 - "MetricName": 
"tma_info_l3_cache_fill_bw_1t" 856 - }, 857 - { 858 - "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads", 859 - "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L3_MISS / INST_RETIRED.ANY", 860 - "MetricGroup": "CacheMisses;Mem", 861 - "MetricName": "tma_info_l3mpki" 862 - }, 863 - { 864 - "BriefDescription": "Average Latency for L2 cache miss demand Loads", 865 - "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS.DEMAND_DATA_RD", 866 - "MetricGroup": "Memory_Lat;Offcore", 867 - "MetricName": "tma_info_load_l2_miss_latency" 868 - }, 869 - { 870 - "BriefDescription": "Average Parallel L2 cache miss demand Loads", 871 - "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_DATA_RD", 872 - "MetricGroup": "Memory_BW;Offcore", 873 - "MetricName": "tma_info_load_l2_mlp" 874 - }, 875 - { 876 - "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)", 877 - "MetricExpr": "L1D_PEND_MISS.PENDING / (MEM_LOAD_RETIRED.L1_MISS + MEM_LOAD_RETIRED.FB_HIT)", 878 - "MetricGroup": "Mem;MemoryBound;MemoryLat", 879 - "MetricName": "tma_info_load_miss_real_latency" 880 - }, 881 - { 882 - "BriefDescription": "STLB (2nd level TLB) data load speculative misses per kilo instruction (misses of any page-size that complete the page walk)", 883 - "MetricExpr": "1e3 * DTLB_LOAD_MISSES.WALK_COMPLETED / INST_RETIRED.ANY", 884 - "MetricGroup": "Mem;MemoryTLB", 885 - "MetricName": "tma_info_load_stlb_mpki" 886 - }, 887 - { 888 - "BriefDescription": "Average number of parallel data read requests to external memory", 889 - "MetricExpr": "UNC_ARB_TRK_OCCUPANCY.DATA_READ / UNC_ARB_TRK_OCCUPANCY.DATA_READ@thresh\\=1@", 890 - "MetricGroup": "Mem;MemoryBW;SoC", 891 - "MetricName": "tma_info_mem_parallel_reads", 892 - "PublicDescription": "Average number of parallel data read requests to external memory. 
Accounts for demand loads and L1/L2 prefetches" 893 - }, 894 - { 895 - "BriefDescription": "Average number of parallel requests to external memory", 896 - "MetricExpr": "UNC_ARB_TRK_OCCUPANCY.ALL / UNC_ARB_TRK_OCCUPANCY.CYCLES_WITH_ANY_REQUEST", 897 - "MetricGroup": "Mem;SoC", 898 - "MetricName": "tma_info_mem_parallel_requests", 899 - "PublicDescription": "Average number of parallel requests to external memory. Accounts for all requests" 900 - }, 901 - { 902 - "BriefDescription": "Average latency of data read request to external memory (in nanoseconds)", 903 - "MetricExpr": "1e9 * (UNC_ARB_TRK_OCCUPANCY.DATA_READ / UNC_ARB_TRK_REQUESTS.DATA_READ) / (tma_info_socket_clks / duration_time)", 904 - "MetricGroup": "Mem;MemoryLat;SoC", 905 - "MetricName": "tma_info_mem_read_latency", 906 - "PublicDescription": "Average latency of data read request to external memory (in nanoseconds). Accounts for demand loads and L1/L2 prefetches. ([RKL+]memory-controller only)" 907 - }, 908 - { 909 - "BriefDescription": "Average latency of all requests to external memory (in Uncore cycles)", 910 - "MetricExpr": "UNC_ARB_TRK_OCCUPANCY.ALL / UNC_ARB_TRK_REQUESTS.ALL", 911 - "MetricGroup": "Mem;SoC", 912 - "MetricName": "tma_info_mem_request_latency" 454 + "MetricName": "tma_info_bottleneck_instruction_fetch_bw", 455 + "MetricThreshold": "tma_info_bottleneck_instruction_fetch_bw > 20" 913 456 }, 914 457 { 915 458 "BriefDescription": "Total pipeline cost of (external) Memory Bandwidth related bottlenecks", 916 459 "MetricConstraint": "NO_GROUP_EVENTS", 917 460 "MetricExpr": "100 * tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_mem_bandwidth / (tma_mem_bandwidth + tma_mem_latency)) + tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_sq_full / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full))) + tma_l1_bound / (tma_dram_bound + 
tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_fb_full / (tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk))", 918 461 "MetricGroup": "Mem;MemoryBW;Offcore;tma_issueBW", 919 - "MetricName": "tma_info_memory_bandwidth", 920 - "MetricThreshold": "tma_info_memory_bandwidth > 20", 921 - "PublicDescription": "Total pipeline cost of (external) Memory Bandwidth related bottlenecks. Related metrics: tma_fb_full, tma_info_dram_bw_use, tma_mem_bandwidth, tma_sq_full" 462 + "MetricName": "tma_info_bottleneck_memory_bandwidth", 463 + "MetricThreshold": "tma_info_bottleneck_memory_bandwidth > 20", 464 + "PublicDescription": "Total pipeline cost of (external) Memory Bandwidth related bottlenecks. Related metrics: tma_fb_full, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full" 922 465 }, 923 466 { 924 467 "BriefDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs)", 925 468 "MetricConstraint": "NO_GROUP_EVENTS", 926 469 "MetricExpr": "100 * tma_memory_bound * (tma_l1_bound / max(tma_memory_bound, tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_dtlb_load / max(tma_l1_bound, tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_dtlb_store / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency)))", 927 470 "MetricGroup": "Mem;MemoryTLB;Offcore;tma_issueTLB", 928 - "MetricName": "tma_info_memory_data_tlbs", 929 - "MetricThreshold": "tma_info_memory_data_tlbs > 20", 471 + "MetricName": "tma_info_bottleneck_memory_data_tlbs", 472 + "MetricThreshold": "tma_info_bottleneck_memory_data_tlbs > 20", 930 473 "PublicDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs). 
Related metrics: tma_dtlb_load, tma_dtlb_store" 931 474 }, 932 475 { ··· 477 934 "MetricConstraint": "NO_GROUP_EVENTS", 478 935 "MetricExpr": "100 * tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency)) + tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_l3_hit_latency / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_l2_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound))", 479 936 "MetricGroup": "Mem;MemoryLat;Offcore;tma_issueLat", 480 - "MetricName": "tma_info_memory_latency", 481 - "MetricThreshold": "tma_info_memory_latency > 20", 937 + "MetricName": "tma_info_bottleneck_memory_latency", 938 + "MetricThreshold": "tma_info_bottleneck_memory_latency > 20", 482 939 "PublicDescription": "Total pipeline cost of Memory Latency related bottlenecks (external memory and off-core caches). Related metrics: tma_l3_hit_latency, tma_mem_latency" 483 940 }, 484 941 { ··· 486 943 "MetricConstraint": "NO_GROUP_EVENTS", 487 944 "MetricExpr": "100 * (tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches))", 488 945 "MetricGroup": "Bad;BadSpec;BrMispredicts;tma_issueBM", 489 - "MetricName": "tma_info_mispredictions", 490 - "MetricThreshold": "tma_info_mispredictions > 20", 491 - "PublicDescription": "Total pipeline cost of Branch Misprediction related bottlenecks. Related metrics: tma_branch_mispredicts, tma_info_branch_misprediction_cost, tma_mispredicts_resteers" 946 + "MetricName": "tma_info_bottleneck_mispredictions", 947 + "MetricThreshold": "tma_info_bottleneck_mispredictions > 20", 948 + "PublicDescription": "Total pipeline cost of Branch Misprediction related bottlenecks. 
Related metrics: tma_branch_mispredicts, tma_info_bad_spec_branch_misprediction_cost, tma_mispredicts_resteers" 949 + }, 950 + { 951 + "BriefDescription": "Fraction of branches that are CALL or RET", 952 + "MetricExpr": "(BR_INST_RETIRED.NEAR_CALL + BR_INST_RETIRED.NEAR_RETURN) / BR_INST_RETIRED.ALL_BRANCHES", 953 + "MetricGroup": "Bad;Branches", 954 + "MetricName": "tma_info_branches_callret" 955 + }, 956 + { 957 + "BriefDescription": "Fraction of branches that are non-taken conditionals", 958 + "MetricExpr": "BR_INST_RETIRED.NOT_TAKEN / BR_INST_RETIRED.ALL_BRANCHES", 959 + "MetricGroup": "Bad;Branches;CodeGen;PGO", 960 + "MetricName": "tma_info_branches_cond_nt" 961 + }, 962 + { 963 + "BriefDescription": "Fraction of branches that are taken conditionals", 964 + "MetricExpr": "(BR_INST_RETIRED.CONDITIONAL - BR_INST_RETIRED.NOT_TAKEN) / BR_INST_RETIRED.ALL_BRANCHES", 965 + "MetricGroup": "Bad;Branches;CodeGen;PGO", 966 + "MetricName": "tma_info_branches_cond_tk" 967 + }, 968 + { 969 + "BriefDescription": "Fraction of branches that are unconditional (direct or indirect) jumps", 970 + "MetricConstraint": "NO_GROUP_EVENTS", 971 + "MetricExpr": "(BR_INST_RETIRED.NEAR_TAKEN - (BR_INST_RETIRED.CONDITIONAL - BR_INST_RETIRED.NOT_TAKEN) - 2 * BR_INST_RETIRED.NEAR_CALL) / BR_INST_RETIRED.ALL_BRANCHES", 972 + "MetricGroup": "Bad;Branches", 973 + "MetricName": "tma_info_branches_jump" 974 + }, 975 + { 976 + "BriefDescription": "Core actual clocks when any Logical Processor is active on the Physical Core", 977 + "MetricExpr": "(CPU_CLK_UNHALTED.THREAD / 2 * (1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK) if #core_wide < 1 else (CPU_CLK_UNHALTED.THREAD_ANY / 2 if #SMT_on else tma_info_thread_clks))", 978 + "MetricGroup": "SMT", 979 + "MetricName": "tma_info_core_core_clks" 980 + }, 981 + { 982 + "BriefDescription": "Instructions Per Cycle across hyper-threads (per physical core)", 983 + "MetricExpr": "INST_RETIRED.ANY / tma_info_core_core_clks", 984 + 
"MetricGroup": "Ret;SMT;TmaL1;tma_L1_group", 985 + "MetricName": "tma_info_core_coreipc" 986 + }, 987 + { 988 + "BriefDescription": "Floating Point Operations Per Cycle", 989 + "MetricConstraint": "NO_GROUP_EVENTS", 990 + "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) / tma_info_core_core_clks", 991 + "MetricGroup": "Flops;Ret", 992 + "MetricName": "tma_info_core_flopc" 993 + }, 994 + { 995 + "BriefDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width)", 996 + "MetricExpr": "(cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ + cpu@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE\\,umask\\=0x3c@) / (2 * tma_info_core_core_clks)", 997 + "MetricGroup": "Cor;Flops;HPC", 998 + "MetricName": "tma_info_core_fp_arith_utilization", 999 + "PublicDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width). Values > 1 are possible due to ([BDW+] Fused-Multiply Add (FMA) counting - common; [ADL+] use all of ADD/MUL/FMA in Scalar or 128/256-bit vectors - less common)." 
1000 + }, 1001 + { 1002 + "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-core", 1003 + "MetricExpr": "UOPS_EXECUTED.THREAD / (UOPS_EXECUTED.CORE_CYCLES_GE_1 / 2 if #SMT_on else UOPS_EXECUTED.CORE_CYCLES_GE_1)", 1004 + "MetricGroup": "Backend;Cor;Pipeline;PortsUtil", 1005 + "MetricName": "tma_info_core_ilp" 1006 + }, 1007 + { 1008 + "BriefDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache)", 1009 + "MetricExpr": "IDQ.DSB_UOPS / (IDQ.DSB_UOPS + IDQ.MITE_UOPS + IDQ.MS_UOPS)", 1010 + "MetricGroup": "DSB;Fed;FetchBW;tma_issueFB", 1011 + "MetricName": "tma_info_frontend_dsb_coverage", 1012 + "MetricThreshold": "tma_info_frontend_dsb_coverage < 0.7 & tma_info_thread_ipc / 4 > 0.35", 1013 + "PublicDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache). Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_botlnk_l2_dsb_misses, tma_info_inst_mix_iptb, tma_lcp" 1014 + }, 1015 + { 1016 + "BriefDescription": "Average number of cycles of a switch from the DSB fetch-unit to MITE fetch unit - see DSB_Switches tree node for details.", 1017 + "MetricExpr": "DSB2MITE_SWITCHES.PENALTY_CYCLES / DSB2MITE_SWITCHES.COUNT", 1018 + "MetricGroup": "DSBmiss", 1019 + "MetricName": "tma_info_frontend_dsb_switch_cost" 1020 + }, 1021 + { 1022 + "BriefDescription": "Average number of Uops issued by front-end when it issued something", 1023 + "MetricExpr": "UOPS_ISSUED.ANY / cpu@UOPS_ISSUED.ANY\\,cmask\\=1@", 1024 + "MetricGroup": "Fed;FetchBW", 1025 + "MetricName": "tma_info_frontend_fetch_upc" 1026 + }, 1027 + { 1028 + "BriefDescription": "Average Latency for L1 instruction cache misses", 1029 + "MetricExpr": "ICACHE_16B.IFDATA_STALL / cpu@ICACHE_16B.IFDATA_STALL\\,cmask\\=1\\,edge@ + 2", 1030 + "MetricGroup": "Fed;FetchLat;IcMiss", 1031 + "MetricName": "tma_info_frontend_icache_miss_latency" 1032 + }, 1033 + { 1034 + "BriefDescription": "Instructions 
per non-speculative DSB miss (lower number means higher occurrence rate)", 1035 + "MetricExpr": "INST_RETIRED.ANY / FRONTEND_RETIRED.ANY_DSB_MISS", 1036 + "MetricGroup": "DSBmiss;Fed", 1037 + "MetricName": "tma_info_frontend_ipdsb_miss_ret", 1038 + "MetricThreshold": "tma_info_frontend_ipdsb_miss_ret < 50" 1039 + }, 1040 + { 1041 + "BriefDescription": "Instructions per speculative Unknown Branch Misprediction (BAClear) (lower number means higher occurrence rate)", 1042 + "MetricExpr": "tma_info_inst_mix_instructions / BACLEARS.ANY", 1043 + "MetricGroup": "Fed", 1044 + "MetricName": "tma_info_frontend_ipunknown_branch" 1045 + }, 1046 + { 1047 + "BriefDescription": "L2 cache true code cacheline misses per kilo instruction", 1048 + "MetricExpr": "1e3 * FRONTEND_RETIRED.L2_MISS / INST_RETIRED.ANY", 1049 + "MetricGroup": "IcMiss", 1050 + "MetricName": "tma_info_frontend_l2mpki_code" 1051 + }, 1052 + { 1053 + "BriefDescription": "L2 cache speculative code cacheline misses per kilo instruction", 1054 + "MetricExpr": "1e3 * L2_RQSTS.CODE_RD_MISS / INST_RETIRED.ANY", 1055 + "MetricGroup": "IcMiss", 1056 + "MetricName": "tma_info_frontend_l2mpki_code_all" 1057 + }, 1058 + { 1059 + "BriefDescription": "Branch instructions per taken branch.", 1060 + "MetricExpr": "BR_INST_RETIRED.ALL_BRANCHES / BR_INST_RETIRED.NEAR_TAKEN", 1061 + "MetricGroup": "Branches;Fed;PGO", 1062 + "MetricName": "tma_info_inst_mix_bptkbranch" 1063 + }, 1064 + { 1065 + "BriefDescription": "Total number of retired Instructions", 1066 + "MetricExpr": "INST_RETIRED.ANY", 1067 + "MetricGroup": "Summary;TmaL1;tma_L1_group", 1068 + "MetricName": "tma_info_inst_mix_instructions", 1069 + "PublicDescription": "Total number of retired Instructions. 
Sample with: INST_RETIRED.PREC_DIST" 1070 + }, 1071 + { 1072 + "BriefDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate)", 1073 + "MetricConstraint": "NO_GROUP_EVENTS", 1074 + "MetricExpr": "INST_RETIRED.ANY / (cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ + cpu@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE\\,umask\\=0x3c@)", 1075 + "MetricGroup": "Flops;InsType", 1076 + "MetricName": "tma_info_inst_mix_iparith", 1077 + "MetricThreshold": "tma_info_inst_mix_iparith < 10", 1078 + "PublicDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. Approximated prior to BDW." 1079 + }, 1080 + { 1081 + "BriefDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate)", 1082 + "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE)", 1083 + "MetricGroup": "Flops;FpVector;InsType", 1084 + "MetricName": "tma_info_inst_mix_iparith_avx128", 1085 + "MetricThreshold": "tma_info_inst_mix_iparith_avx128 < 10", 1086 + "PublicDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." 1087 + }, 1088 + { 1089 + "BriefDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate)", 1090 + "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE)", 1091 + "MetricGroup": "Flops;FpVector;InsType", 1092 + "MetricName": "tma_info_inst_mix_iparith_avx256", 1093 + "MetricThreshold": "tma_info_inst_mix_iparith_avx256 < 10", 1094 + "PublicDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." 
1095 + }, 1096 + { 1097 + "BriefDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate)", 1098 + "MetricExpr": "INST_RETIRED.ANY / FP_ARITH_INST_RETIRED.SCALAR_DOUBLE", 1099 + "MetricGroup": "Flops;FpScalar;InsType", 1100 + "MetricName": "tma_info_inst_mix_iparith_scalar_dp", 1101 + "MetricThreshold": "tma_info_inst_mix_iparith_scalar_dp < 10", 1102 + "PublicDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." 1103 + }, 1104 + { 1105 + "BriefDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate)", 1106 + "MetricExpr": "INST_RETIRED.ANY / FP_ARITH_INST_RETIRED.SCALAR_SINGLE", 1107 + "MetricGroup": "Flops;FpScalar;InsType", 1108 + "MetricName": "tma_info_inst_mix_iparith_scalar_sp", 1109 + "MetricThreshold": "tma_info_inst_mix_iparith_scalar_sp < 10", 1110 + "PublicDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." 
1111 + }, 1112 + { 1113 + "BriefDescription": "Instructions per Branch (lower number means higher occurrence rate)", 1114 + "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.ALL_BRANCHES", 1115 + "MetricGroup": "Branches;Fed;InsType", 1116 + "MetricName": "tma_info_inst_mix_ipbranch", 1117 + "MetricThreshold": "tma_info_inst_mix_ipbranch < 8" 1118 + }, 1119 + { 1120 + "BriefDescription": "Instructions per (near) call (lower number means higher occurrence rate)", 1121 + "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.NEAR_CALL", 1122 + "MetricGroup": "Branches;Fed;PGO", 1123 + "MetricName": "tma_info_inst_mix_ipcall", 1124 + "MetricThreshold": "tma_info_inst_mix_ipcall < 200" 1125 + }, 1126 + { 1127 + "BriefDescription": "Instructions per Floating Point (FP) Operation (lower number means higher occurrence rate)", 1128 + "MetricConstraint": "NO_GROUP_EVENTS", 1129 + "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE)", 1130 + "MetricGroup": "Flops;InsType", 1131 + "MetricName": "tma_info_inst_mix_ipflop", 1132 + "MetricThreshold": "tma_info_inst_mix_ipflop < 10" 1133 + }, 1134 + { 1135 + "BriefDescription": "Instructions per Load (lower number means higher occurrence rate)", 1136 + "MetricExpr": "INST_RETIRED.ANY / MEM_INST_RETIRED.ALL_LOADS", 1137 + "MetricGroup": "InsType", 1138 + "MetricName": "tma_info_inst_mix_ipload", 1139 + "MetricThreshold": "tma_info_inst_mix_ipload < 3" 1140 + }, 1141 + { 1142 + "BriefDescription": "Instructions per Store (lower number means higher occurrence rate)", 1143 + "MetricExpr": "INST_RETIRED.ANY / MEM_INST_RETIRED.ALL_STORES", 1144 + "MetricGroup": "InsType", 1145 + "MetricName": "tma_info_inst_mix_ipstore", 1146 + "MetricThreshold": "tma_info_inst_mix_ipstore < 8" 1147 + }, 1148 + { 1149 
+ "BriefDescription": "Instructions per Software prefetch instruction (of any type: NTA/T0/T1/T2/Prefetch) (lower number means higher occurrence rate)", 1150 + "MetricExpr": "INST_RETIRED.ANY / cpu@SW_PREFETCH_ACCESS.T0\\,umask\\=0xF@", 1151 + "MetricGroup": "Prefetches", 1152 + "MetricName": "tma_info_inst_mix_ipswpf", 1153 + "MetricThreshold": "tma_info_inst_mix_ipswpf < 100" 1154 + }, 1155 + { 1156 + "BriefDescription": "Instruction per taken branch", 1157 + "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.NEAR_TAKEN", 1158 + "MetricGroup": "Branches;Fed;FetchBW;Frontend;PGO;tma_issueFB", 1159 + "MetricName": "tma_info_inst_mix_iptb", 1160 + "MetricThreshold": "tma_info_inst_mix_iptb < 9", 1161 + "PublicDescription": "Instruction per taken branch. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_botlnk_l2_dsb_misses, tma_info_frontend_dsb_coverage, tma_lcp" 1162 + }, 1163 + { 1164 + "BriefDescription": "Average per-core data fill bandwidth to the L1 data cache [GB / sec]", 1165 + "MetricExpr": "64 * L1D.REPLACEMENT / 1e9 / duration_time", 1166 + "MetricGroup": "Mem;MemoryBW", 1167 + "MetricName": "tma_info_memory_core_l1d_cache_fill_bw" 1168 + }, 1169 + { 1170 + "BriefDescription": "Average per-core data fill bandwidth to the L2 cache [GB / sec]", 1171 + "MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / duration_time", 1172 + "MetricGroup": "Mem;MemoryBW", 1173 + "MetricName": "tma_info_memory_core_l2_cache_fill_bw" 1174 + }, 1175 + { 1176 + "BriefDescription": "Average per-core data access bandwidth to the L3 cache [GB / sec]", 1177 + "MetricExpr": "64 * OFFCORE_REQUESTS.ALL_REQUESTS / 1e9 / duration_time", 1178 + "MetricGroup": "Mem;MemoryBW;Offcore", 1179 + "MetricName": "tma_info_memory_core_l3_cache_access_bw" 1180 + }, 1181 + { 1182 + "BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]", 1183 + "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / duration_time", 1184 + "MetricGroup": "Mem;MemoryBW", 1185 + 
"MetricName": "tma_info_memory_core_l3_cache_fill_bw" 1186 + }, 1187 + { 1188 + "BriefDescription": "Fill Buffer (FB) hits per kilo instructions for retired demand loads (L1D misses that merge into ongoing miss-handling entries)", 1189 + "MetricExpr": "1e3 * MEM_LOAD_RETIRED.FB_HIT / INST_RETIRED.ANY", 1190 + "MetricGroup": "CacheMisses;Mem", 1191 + "MetricName": "tma_info_memory_fb_hpki" 1192 + }, 1193 + { 1194 + "BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads", 1195 + "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L1_MISS / INST_RETIRED.ANY", 1196 + "MetricGroup": "CacheMisses;Mem", 1197 + "MetricName": "tma_info_memory_l1mpki" 1198 + }, 1199 + { 1200 + "BriefDescription": "L1 cache true misses per kilo instruction for all demand loads (including speculative)", 1201 + "MetricExpr": "1e3 * L2_RQSTS.ALL_DEMAND_DATA_RD / INST_RETIRED.ANY", 1202 + "MetricGroup": "CacheMisses;Mem", 1203 + "MetricName": "tma_info_memory_l1mpki_load" 1204 + }, 1205 + { 1206 + "BriefDescription": "L2 cache hits per kilo instruction for all request types (including speculative)", 1207 + "MetricExpr": "1e3 * (L2_RQSTS.REFERENCES - L2_RQSTS.MISS) / INST_RETIRED.ANY", 1208 + "MetricGroup": "CacheMisses;Mem", 1209 + "MetricName": "tma_info_memory_l2hpki_all" 1210 + }, 1211 + { 1212 + "BriefDescription": "L2 cache hits per kilo instruction for all demand loads (including speculative)", 1213 + "MetricExpr": "1e3 * L2_RQSTS.DEMAND_DATA_RD_HIT / INST_RETIRED.ANY", 1214 + "MetricGroup": "CacheMisses;Mem", 1215 + "MetricName": "tma_info_memory_l2hpki_load" 1216 + }, 1217 + { 1218 + "BriefDescription": "L2 cache true misses per kilo instruction for retired demand loads", 1219 + "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L2_MISS / INST_RETIRED.ANY", 1220 + "MetricGroup": "Backend;CacheMisses;Mem", 1221 + "MetricName": "tma_info_memory_l2mpki" 1222 + }, 1223 + { 1224 + "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all request types (including 
speculative)", 1225 + "MetricExpr": "1e3 * L2_RQSTS.MISS / INST_RETIRED.ANY", 1226 + "MetricGroup": "CacheMisses;Mem;Offcore", 1227 + "MetricName": "tma_info_memory_l2mpki_all" 1228 + }, 1229 + { 1230 + "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all demand loads (including speculative)", 1231 + "MetricExpr": "1e3 * L2_RQSTS.DEMAND_DATA_RD_MISS / INST_RETIRED.ANY", 1232 + "MetricGroup": "CacheMisses;Mem", 1233 + "MetricName": "tma_info_memory_l2mpki_load" 1234 + }, 1235 + { 1236 + "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads", 1237 + "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L3_MISS / INST_RETIRED.ANY", 1238 + "MetricGroup": "CacheMisses;Mem", 1239 + "MetricName": "tma_info_memory_l3mpki" 1240 + }, 1241 + { 1242 + "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)", 1243 + "MetricExpr": "L1D_PEND_MISS.PENDING / (MEM_LOAD_RETIRED.L1_MISS + MEM_LOAD_RETIRED.FB_HIT)", 1244 + "MetricGroup": "Mem;MemoryBound;MemoryLat", 1245 + "MetricName": "tma_info_memory_load_miss_real_latency" 492 1246 }, 493 1247 { 494 1248 "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss", 495 1249 "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES", 496 1250 "MetricGroup": "Mem;MemoryBW;MemoryBound", 497 - "MetricName": "tma_info_mlp", 1251 + "MetricName": "tma_info_memory_mlp", 498 1252 "PublicDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. 
Per-Logical Processor)" 1253 + }, 1254 + { 1255 + "BriefDescription": "Average Parallel L2 cache miss data reads", 1256 + "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD", 1257 + "MetricGroup": "Memory_BW;Offcore", 1258 + "MetricName": "tma_info_memory_oro_data_l2_mlp" 1259 + }, 1260 + { 1261 + "BriefDescription": "Average Latency for L2 cache miss demand Loads", 1262 + "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS.DEMAND_DATA_RD", 1263 + "MetricGroup": "Memory_Lat;Offcore", 1264 + "MetricName": "tma_info_memory_oro_load_l2_miss_latency" 1265 + }, 1266 + { 1267 + "BriefDescription": "Average Parallel L2 cache miss demand Loads", 1268 + "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_DATA_RD", 1269 + "MetricGroup": "Memory_BW;Offcore", 1270 + "MetricName": "tma_info_memory_oro_load_l2_mlp" 1271 + }, 1272 + { 1273 + "BriefDescription": "Average per-thread data fill bandwidth to the L1 data cache [GB / sec]", 1274 + "MetricExpr": "tma_info_memory_core_l1d_cache_fill_bw", 1275 + "MetricGroup": "Mem;MemoryBW", 1276 + "MetricName": "tma_info_memory_thread_l1d_cache_fill_bw_1t" 1277 + }, 1278 + { 1279 + "BriefDescription": "Average per-thread data fill bandwidth to the L2 cache [GB / sec]", 1280 + "MetricExpr": "tma_info_memory_core_l2_cache_fill_bw", 1281 + "MetricGroup": "Mem;MemoryBW", 1282 + "MetricName": "tma_info_memory_thread_l2_cache_fill_bw_1t" 1283 + }, 1284 + { 1285 + "BriefDescription": "Average per-thread data access bandwidth to the L3 cache [GB / sec]", 1286 + "MetricExpr": "tma_info_memory_core_l3_cache_access_bw", 1287 + "MetricGroup": "Mem;MemoryBW;Offcore", 1288 + "MetricName": "tma_info_memory_thread_l3_cache_access_bw_1t" 1289 + }, 1290 + { 1291 + "BriefDescription": "Average per-thread data fill bandwidth to the L3 cache [GB / sec]", 1292 + "MetricExpr": "tma_info_memory_core_l3_cache_fill_bw", 1293 + 
"MetricGroup": "Mem;MemoryBW", 1294 + "MetricName": "tma_info_memory_thread_l3_cache_fill_bw_1t" 1295 + }, 1296 + { 1297 + "BriefDescription": "STLB (2nd level TLB) code speculative misses per kilo instruction (misses of any page-size that complete the page walk)", 1298 + "MetricExpr": "1e3 * ITLB_MISSES.WALK_COMPLETED / INST_RETIRED.ANY", 1299 + "MetricGroup": "Fed;MemoryTLB", 1300 + "MetricName": "tma_info_memory_tlb_code_stlb_mpki" 1301 + }, 1302 + { 1303 + "BriefDescription": "STLB (2nd level TLB) data load speculative misses per kilo instruction (misses of any page-size that complete the page walk)", 1304 + "MetricExpr": "1e3 * DTLB_LOAD_MISSES.WALK_COMPLETED / INST_RETIRED.ANY", 1305 + "MetricGroup": "Mem;MemoryTLB", 1306 + "MetricName": "tma_info_memory_tlb_load_stlb_mpki" 499 1307 }, 500 1308 { 501 1309 "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses", 502 1310 "MetricConstraint": "NO_GROUP_EVENTS_NMI", 503 - "MetricExpr": "(ITLB_MISSES.WALK_PENDING + DTLB_LOAD_MISSES.WALK_PENDING + DTLB_STORE_MISSES.WALK_PENDING + EPT.WALK_PENDING) / (2 * tma_info_core_clks)", 1311 + "MetricExpr": "(ITLB_MISSES.WALK_PENDING + DTLB_LOAD_MISSES.WALK_PENDING + DTLB_STORE_MISSES.WALK_PENDING + EPT.WALK_PENDING) / (2 * tma_info_core_core_clks)", 504 1312 "MetricGroup": "Mem;MemoryTLB", 505 - "MetricName": "tma_info_page_walks_utilization", 506 - "MetricThreshold": "tma_info_page_walks_utilization > 0.5" 507 - }, 508 - { 509 - "BriefDescription": "Average number of Uops retired in cycles where at least one uop has retired.", 510 - "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / cpu@UOPS_RETIRED.RETIRE_SLOTS\\,cmask\\=1@", 511 - "MetricGroup": "Pipeline;Ret", 512 - "MetricName": "tma_info_retire" 513 - }, 514 - { 515 - "BriefDescription": "Total issue-pipeline slots (per-Physical Core till ICL; per-Logical Processor ICL onward)", 516 - "MetricExpr": "4 * tma_info_core_clks", 517 - "MetricGroup": 
"TmaL1;tma_L1_group", 518 - "MetricName": "tma_info_slots" 519 - }, 520 - { 521 - "BriefDescription": "Fraction of cycles where both hardware Logical Processors were active", 522 - "MetricExpr": "(1 - CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / (CPU_CLK_UNHALTED.REF_XCLK_ANY / 2) if #SMT_on else 0)", 523 - "MetricGroup": "SMT", 524 - "MetricName": "tma_info_smt_2t_utilization" 525 - }, 526 - { 527 - "BriefDescription": "Socket actual clocks when any core is active on that socket", 528 - "MetricExpr": "UNC_CLOCK.SOCKET", 529 - "MetricGroup": "SoC", 530 - "MetricName": "tma_info_socket_clks" 1313 + "MetricName": "tma_info_memory_tlb_page_walks_utilization", 1314 + "MetricThreshold": "tma_info_memory_tlb_page_walks_utilization > 0.5" 531 1315 }, 532 1316 { 533 1317 "BriefDescription": "STLB (2nd level TLB) data store speculative misses per kilo instruction (misses of any page-size that complete the page walk)", 534 1318 "MetricExpr": "1e3 * DTLB_STORE_MISSES.WALK_COMPLETED / INST_RETIRED.ANY", 535 1319 "MetricGroup": "Mem;MemoryTLB", 536 - "MetricName": "tma_info_store_stlb_mpki" 1320 + "MetricName": "tma_info_memory_tlb_store_stlb_mpki" 1321 + }, 1322 + { 1323 + "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-thread", 1324 + "MetricExpr": "UOPS_EXECUTED.THREAD / cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@", 1325 + "MetricGroup": "Cor;Pipeline;PortsUtil;SMT", 1326 + "MetricName": "tma_info_pipeline_execute" 1327 + }, 1328 + { 1329 + "BriefDescription": "Average number of Uops retired in cycles where at least one uop has retired.", 1330 + "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / cpu@UOPS_RETIRED.RETIRE_SLOTS\\,cmask\\=1@", 1331 + "MetricGroup": "Pipeline;Ret", 1332 + "MetricName": "tma_info_pipeline_retire" 1333 + }, 1334 + { 1335 + "BriefDescription": "Measured Average Frequency for unhalted processors [GHz]", 1336 + "MetricExpr": "tma_info_system_turbo_utilization * TSC / 1e9 / duration_time", 1337 + 
"MetricGroup": "Power;Summary", 1338 + "MetricName": "tma_info_system_average_frequency" 1339 + }, 1340 + { 1341 + "BriefDescription": "Average CPU Utilization", 1342 + "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / TSC", 1343 + "MetricGroup": "HPC;Summary", 1344 + "MetricName": "tma_info_system_cpu_utilization" 1345 + }, 1346 + { 1347 + "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]", 1348 + "MetricExpr": "64 * (UNC_ARB_TRK_REQUESTS.ALL + UNC_ARB_COH_TRK_REQUESTS.ALL) / 1e6 / duration_time / 1e3", 1349 + "MetricGroup": "HPC;Mem;MemoryBW;SoC;tma_issueBW", 1350 + "MetricName": "tma_info_system_dram_bw_use", 1351 + "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_fb_full, tma_info_bottleneck_memory_bandwidth, tma_mem_bandwidth, tma_sq_full" 1352 + }, 1353 + { 1354 + "BriefDescription": "Giga Floating Point Operations Per Second", 1355 + "MetricConstraint": "NO_GROUP_EVENTS", 1356 + "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) / 1e9 / duration_time", 1357 + "MetricGroup": "Cor;Flops;HPC", 1358 + "MetricName": "tma_info_system_gflops", 1359 + "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width and AMX engine." 
1360 + }, 1361 + { 1362 + "BriefDescription": "Instructions per Far Branch ( Far Branches apply upon transition from application to operating system, handling interrupts, exceptions) [lower number means higher occurrence rate]", 1363 + "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.FAR_BRANCH:u", 1364 + "MetricGroup": "Branches;OS", 1365 + "MetricName": "tma_info_system_ipfarbranch", 1366 + "MetricThreshold": "tma_info_system_ipfarbranch < 1e6" 1367 + }, 1368 + { 1369 + "BriefDescription": "Cycles Per Instruction for the Operating System (OS) Kernel mode", 1370 + "MetricExpr": "CPU_CLK_UNHALTED.THREAD_P:k / INST_RETIRED.ANY_P:k", 1371 + "MetricGroup": "OS", 1372 + "MetricName": "tma_info_system_kernel_cpi" 1373 + }, 1374 + { 1375 + "BriefDescription": "Fraction of cycles spent in the Operating System (OS) Kernel mode", 1376 + "MetricExpr": "CPU_CLK_UNHALTED.THREAD_P:k / CPU_CLK_UNHALTED.THREAD", 1377 + "MetricGroup": "OS", 1378 + "MetricName": "tma_info_system_kernel_utilization", 1379 + "MetricThreshold": "tma_info_system_kernel_utilization > 0.05" 1380 + }, 1381 + { 1382 + "BriefDescription": "Average number of parallel data read requests to external memory", 1383 + "MetricExpr": "UNC_ARB_TRK_OCCUPANCY.DATA_READ / UNC_ARB_TRK_OCCUPANCY.DATA_READ@thresh\\=1@", 1384 + "MetricGroup": "Mem;MemoryBW;SoC", 1385 + "MetricName": "tma_info_system_mem_parallel_reads", 1386 + "PublicDescription": "Average number of parallel data read requests to external memory. Accounts for demand loads and L1/L2 prefetches" 1387 + }, 1388 + { 1389 + "BriefDescription": "Average number of parallel requests to external memory", 1390 + "MetricExpr": "UNC_ARB_TRK_OCCUPANCY.ALL / UNC_ARB_TRK_OCCUPANCY.CYCLES_WITH_ANY_REQUEST", 1391 + "MetricGroup": "Mem;SoC", 1392 + "MetricName": "tma_info_system_mem_parallel_requests", 1393 + "PublicDescription": "Average number of parallel requests to external memory. 
Accounts for all requests" 1394 + }, 1395 + { 1396 + "BriefDescription": "Average latency of data read request to external memory (in nanoseconds)", 1397 + "MetricExpr": "1e9 * (UNC_ARB_TRK_OCCUPANCY.DATA_READ / UNC_ARB_TRK_REQUESTS.DATA_READ) / (tma_info_system_socket_clks / duration_time)", 1398 + "MetricGroup": "Mem;MemoryLat;SoC", 1399 + "MetricName": "tma_info_system_mem_read_latency", 1400 + "PublicDescription": "Average latency of data read request to external memory (in nanoseconds). Accounts for demand loads and L1/L2 prefetches. ([RKL+]memory-controller only)" 1401 + }, 1402 + { 1403 + "BriefDescription": "Average latency of all requests to external memory (in Uncore cycles)", 1404 + "MetricExpr": "UNC_ARB_TRK_OCCUPANCY.ALL / UNC_ARB_TRK_REQUESTS.ALL", 1405 + "MetricGroup": "Mem;SoC", 1406 + "MetricName": "tma_info_system_mem_request_latency" 1407 + }, 1408 + { 1409 + "BriefDescription": "Fraction of cycles where both hardware Logical Processors were active", 1410 + "MetricExpr": "(1 - CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / (CPU_CLK_UNHALTED.REF_XCLK_ANY / 2) if #SMT_on else 0)", 1411 + "MetricGroup": "SMT", 1412 + "MetricName": "tma_info_system_smt_2t_utilization" 1413 + }, 1414 + { 1415 + "BriefDescription": "Socket actual clocks when any core is active on that socket", 1416 + "MetricExpr": "UNC_CLOCK.SOCKET", 1417 + "MetricGroup": "SoC", 1418 + "MetricName": "tma_info_system_socket_clks" 537 1419 }, 538 1420 { 539 1421 "BriefDescription": "Average Frequency Utilization relative to nominal frequency", 540 - "MetricExpr": "tma_info_clks / CPU_CLK_UNHALTED.REF_TSC", 1422 + "MetricExpr": "tma_info_thread_clks / CPU_CLK_UNHALTED.REF_TSC", 541 1423 "MetricGroup": "Power", 542 - "MetricName": "tma_info_turbo_utilization" 1424 + "MetricName": "tma_info_system_turbo_utilization" 1425 + }, 1426 + { 1427 + "BriefDescription": "Per-Logical Processor actual clocks when the Logical Processor is active.", 1428 + "MetricExpr": "CPU_CLK_UNHALTED.THREAD", 1429 + 
"MetricGroup": "Pipeline", 1430 + "MetricName": "tma_info_thread_clks" 1431 + }, 1432 + { 1433 + "BriefDescription": "Cycles Per Instruction (per Logical Processor)", 1434 + "MetricExpr": "1 / tma_info_thread_ipc", 1435 + "MetricGroup": "Mem;Pipeline", 1436 + "MetricName": "tma_info_thread_cpi" 1437 + }, 1438 + { 1439 + "BriefDescription": "The ratio of Executed- by Issued-Uops", 1440 + "MetricExpr": "UOPS_EXECUTED.THREAD / UOPS_ISSUED.ANY", 1441 + "MetricGroup": "Cor;Pipeline", 1442 + "MetricName": "tma_info_thread_execute_per_issue", 1443 + "PublicDescription": "The ratio of Executed- by Issued-Uops. Ratio > 1 suggests high rate of uop micro-fusions. Ratio < 1 suggest high rate of \"execute\" at rename stage." 1444 + }, 1445 + { 1446 + "BriefDescription": "Instructions Per Cycle (per Logical Processor)", 1447 + "MetricExpr": "INST_RETIRED.ANY / tma_info_thread_clks", 1448 + "MetricGroup": "Ret;Summary", 1449 + "MetricName": "tma_info_thread_ipc" 1450 + }, 1451 + { 1452 + "BriefDescription": "Total issue-pipeline slots (per-Physical Core till ICL; per-Logical Processor ICL onward)", 1453 + "MetricExpr": "4 * tma_info_core_core_clks", 1454 + "MetricGroup": "TmaL1;tma_L1_group", 1455 + "MetricName": "tma_info_thread_slots" 543 1456 }, 544 1457 { 545 1458 "BriefDescription": "Uops Per Instruction", 546 1459 "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / INST_RETIRED.ANY", 547 1460 "MetricGroup": "Pipeline;Ret;Retire", 548 - "MetricName": "tma_info_uoppi", 549 - "MetricThreshold": "tma_info_uoppi > 1.05" 1461 + "MetricName": "tma_info_thread_uoppi", 1462 + "MetricThreshold": "tma_info_thread_uoppi > 1.05" 550 1463 }, 551 1464 { 552 1465 "BriefDescription": "Instruction per taken branch", 553 1466 "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / BR_INST_RETIRED.NEAR_TAKEN", 554 1467 "MetricGroup": "Branches;Fed;FetchBW", 555 - "MetricName": "tma_info_uptb", 556 - "MetricThreshold": "tma_info_uptb < 6" 1468 + "MetricName": "tma_info_thread_uptb", 1469 + "MetricThreshold": 
"tma_info_thread_uptb < 6" 557 1470 }, 558 1471 { 559 1472 "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses", 560 - "MetricExpr": "ICACHE_64B.IFTAG_STALL / tma_info_clks", 1473 + "MetricExpr": "ICACHE_64B.IFTAG_STALL / tma_info_thread_clks", 561 1474 "MetricGroup": "BigFoot;FetchLat;MemoryTLB;TopdownL3;tma_L3_group;tma_fetch_latency_group", 562 1475 "MetricName": "tma_itlb_misses", 563 1476 "MetricThreshold": "tma_itlb_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", ··· 1022 1023 }, 1023 1024 { 1024 1025 "BriefDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache", 1025 - "MetricExpr": "max((CYCLE_ACTIVITY.STALLS_MEM_ANY - CYCLE_ACTIVITY.STALLS_L1D_MISS) / tma_info_clks, 0)", 1026 + "MetricExpr": "max((CYCLE_ACTIVITY.STALLS_MEM_ANY - CYCLE_ACTIVITY.STALLS_L1D_MISS) / tma_info_thread_clks, 0)", 1026 1027 "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_issueL1;tma_issueMC;tma_memory_bound_group", 1027 1028 "MetricName": "tma_l1_bound", 1028 1029 "MetricThreshold": "tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", ··· 1032 1033 { 1033 1034 "BriefDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads", 1034 1035 "MetricConstraint": "NO_GROUP_EVENTS", 1035 - "MetricExpr": "MEM_LOAD_RETIRED.L2_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) / (MEM_LOAD_RETIRED.L2_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + cpu@L1D_PEND_MISS.FB_FULL\\,cmask\\=1@) * ((CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS) / tma_info_clks)", 1036 + "MetricExpr": "MEM_LOAD_RETIRED.L2_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) / (MEM_LOAD_RETIRED.L2_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + cpu@L1D_PEND_MISS.FB_FULL\\,cmask\\=1@) * ((CYCLE_ACTIVITY.STALLS_L1D_MISS 
- CYCLE_ACTIVITY.STALLS_L2_MISS) / tma_info_thread_clks)", 1036 1037 "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", 1037 1038 "MetricName": "tma_l2_bound", 1038 1039 "MetricThreshold": "tma_l2_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", ··· 1041 1042 }, 1042 1043 { 1043 1044 "BriefDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core", 1044 - "MetricExpr": "(CYCLE_ACTIVITY.STALLS_L2_MISS - CYCLE_ACTIVITY.STALLS_L3_MISS) / tma_info_clks", 1045 + "MetricExpr": "(CYCLE_ACTIVITY.STALLS_L2_MISS - CYCLE_ACTIVITY.STALLS_L3_MISS) / tma_info_thread_clks", 1045 1046 "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", 1046 1047 "MetricName": "tma_l3_bound", 1047 1048 "MetricThreshold": "tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", ··· 1050 1051 }, 1051 1052 { 1052 1053 "BriefDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)", 1053 - "MetricExpr": "6.5 * tma_info_average_frequency * MEM_LOAD_RETIRED.L3_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_clks", 1054 + "MetricExpr": "6.5 * tma_info_system_average_frequency * MEM_LOAD_RETIRED.L3_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", 1054 1055 "MetricGroup": "MemoryLat;TopdownL4;tma_L4_group;tma_issueLat;tma_l3_bound_group", 1055 1056 "MetricName": "tma_l3_hit_latency", 1056 1057 "MetricThreshold": "tma_l3_hit_latency > 0.1 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", 1057 - "PublicDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited). Avoiding private cache misses (i.e. 
L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance. Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS. Related metrics: tma_info_memory_latency, tma_mem_latency", 1058 + "PublicDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited). Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance. Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS. Related metrics: tma_info_bottleneck_memory_latency, tma_mem_latency", 1058 1059 "ScaleUnit": "100%" 1059 1060 }, 1060 1061 { 1061 1062 "BriefDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs)", 1062 - "MetricExpr": "ILD_STALL.LCP / tma_info_clks", 1063 + "MetricExpr": "ILD_STALL.LCP / tma_info_thread_clks", 1063 1064 "MetricGroup": "FetchLat;TopdownL3;tma_L3_group;tma_fetch_latency_group;tma_issueFB", 1064 1065 "MetricName": "tma_lcp", 1065 1066 "MetricThreshold": "tma_lcp > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", 1066 - "PublicDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs). Using proper compiler flags or Intel Compiler by default will certainly avoid this. #Link: Optimization Guide about LCP BKMs. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_dsb_coverage, tma_info_dsb_misses, tma_info_iptb", 1067 + "PublicDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs). Using proper compiler flags or Intel Compiler by default will certainly avoid this. #Link: Optimization Guide about LCP BKMs. 
Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_botlnk_l2_dsb_misses, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb", 1067 1068 "ScaleUnit": "100%" 1068 1069 }, 1069 1070 { ··· 1078 1079 }, 1079 1080 { 1080 1081 "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port for Load operations", 1081 - "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_2 + UOPS_DISPATCHED_PORT.PORT_3 + UOPS_DISPATCHED_PORT.PORT_7 - UOPS_DISPATCHED_PORT.PORT_4) / (2 * tma_info_core_clks)", 1082 + "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_2 + UOPS_DISPATCHED_PORT.PORT_3 + UOPS_DISPATCHED_PORT.PORT_7 - UOPS_DISPATCHED_PORT.PORT_4) / (2 * tma_info_core_core_clks)", 1082 1083 "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group", 1083 1084 "MetricName": "tma_load_op_utilization", 1084 1085 "MetricThreshold": "tma_load_op_utilization > 0.6", ··· 1096 1097 }, 1097 1098 { 1098 1099 "BriefDescription": "This metric estimates the fraction of cycles where the Second-level TLB (STLB) was missed by load accesses, performing a hardware page walk", 1099 - "MetricExpr": "DTLB_LOAD_MISSES.WALK_ACTIVE / tma_info_clks", 1100 + "MetricExpr": "DTLB_LOAD_MISSES.WALK_ACTIVE / tma_info_thread_clks", 1100 1101 "MetricGroup": "MemoryTLB;TopdownL5;tma_L5_group;tma_dtlb_load_group", 1101 1102 "MetricName": "tma_load_stlb_miss", 1102 1103 "MetricThreshold": "tma_load_stlb_miss > 0.05 & (tma_dtlb_load > 0.1 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))", ··· 1104 1105 }, 1105 1106 { 1106 1107 "BriefDescription": "This metric represents fraction of cycles the CPU spent handling cache misses due to lock operations", 1107 - "MetricExpr": "(12 * max(0, MEM_INST_RETIRED.LOCK_LOADS - L2_RQSTS.ALL_RFO) + MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES * (9 * L2_RQSTS.RFO_HIT + min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO))) / tma_info_clks", 1108 + "MetricExpr": "(12 * 
max(0, MEM_INST_RETIRED.LOCK_LOADS - L2_RQSTS.ALL_RFO) + MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES * (9 * L2_RQSTS.RFO_HIT + min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO))) / tma_info_thread_clks", 1108 1109 "MetricGroup": "Offcore;TopdownL4;tma_L4_group;tma_issueRFO;tma_l1_bound_group", 1109 1110 "MetricName": "tma_lock_latency", 1110 1111 "MetricThreshold": "tma_lock_latency > 0.2 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", ··· 1124 1125 }, 1125 1126 { 1126 1127 "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM)", 1127 - "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, cpu@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=4@) / tma_info_clks", 1128 + "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, cpu@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=4@) / tma_info_thread_clks", 1128 1129 "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueBW", 1129 1130 "MetricName": "tma_mem_bandwidth", 1130 1131 "MetricThreshold": "tma_mem_bandwidth > 0.2 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", 1131 - "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). 
Related metrics: tma_fb_full, tma_info_dram_bw_use, tma_info_memory_bandwidth, tma_sq_full", 1132 + "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_fb_full, tma_info_bottleneck_memory_bandwidth, tma_info_system_dram_bw_use, tma_sq_full", 1132 1133 "ScaleUnit": "100%" 1133 1134 }, 1134 1135 { 1135 1136 "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM)", 1136 - "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD) / tma_info_clks - tma_mem_bandwidth", 1137 + "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD) / tma_info_thread_clks - tma_mem_bandwidth", 1137 1138 "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueLat", 1138 1139 "MetricName": "tma_mem_latency", 1139 1140 "MetricThreshold": "tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", 1140 - "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM). This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). 
Related metrics: tma_info_memory_latency, tma_l3_hit_latency", 1141 + "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM). This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: tma_info_bottleneck_memory_latency, tma_l3_hit_latency", 1141 1142 "ScaleUnit": "100%" 1142 1143 }, 1143 1144 { ··· 1161 1162 }, 1162 1163 { 1163 1164 "BriefDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit", 1164 - "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / UOPS_ISSUED.ANY * IDQ.MS_UOPS / tma_info_slots", 1165 + "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / UOPS_ISSUED.ANY * IDQ.MS_UOPS / tma_info_thread_slots", 1165 1166 "MetricGroup": "MicroSeq;TopdownL3;tma_L3_group;tma_heavy_operations_group;tma_issueMC;tma_issueMS", 1166 1167 "MetricName": "tma_microcode_sequencer", 1167 1168 "MetricThreshold": "tma_microcode_sequencer > 0.05 & tma_heavy_operations > 0.1", ··· 1170 1171 }, 1171 1172 { 1172 1173 "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Branch Misprediction at execution stage", 1173 - "MetricExpr": "BR_MISP_RETIRED.ALL_BRANCHES / (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT) * INT_MISC.CLEAR_RESTEER_CYCLES / tma_info_clks", 1174 + "MetricExpr": "BR_MISP_RETIRED.ALL_BRANCHES / (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT) * INT_MISC.CLEAR_RESTEER_CYCLES / tma_info_thread_clks", 1174 1175 "MetricGroup": "BadSpec;BrMispredicts;TopdownL4;tma_L4_group;tma_branch_resteers_group;tma_issueBM", 1175 1176 "MetricName": "tma_mispredicts_resteers", 1176 1177 "MetricThreshold": "tma_mispredicts_resteers > 0.05 & (tma_branch_resteers > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15))", 1177 - "PublicDescription": "This metric represents fraction 
of cycles the CPU was stalled due to Branch Resteers as a result of Branch Misprediction at execution stage. Sample with: INT_MISC.CLEAR_RESTEER_CYCLES. Related metrics: tma_branch_mispredicts, tma_info_branch_misprediction_cost, tma_info_mispredictions", 1178 + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Branch Misprediction at execution stage. Sample with: INT_MISC.CLEAR_RESTEER_CYCLES. Related metrics: tma_branch_mispredicts, tma_info_bad_spec_branch_misprediction_cost, tma_info_bottleneck_mispredictions", 1178 1179 "ScaleUnit": "100%" 1179 1180 }, 1180 1181 { 1181 1182 "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline)", 1182 - "MetricExpr": "(IDQ.ALL_MITE_CYCLES_ANY_UOPS - IDQ.ALL_MITE_CYCLES_4_UOPS) / tma_info_core_clks / 2", 1183 + "MetricExpr": "(IDQ.ALL_MITE_CYCLES_ANY_UOPS - IDQ.ALL_MITE_CYCLES_4_UOPS) / tma_info_core_core_clks / 2", 1183 1184 "MetricGroup": "DSBmiss;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group", 1184 1185 "MetricName": "tma_mite", 1185 - "MetricThreshold": "tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 4 > 0.35)", 1186 + "MetricThreshold": "tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 4 > 0.35)", 1186 1187 "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline). This pipeline is used for code that was not pre-cached in the DSB or LSD. For example; inefficiencies due to asymmetric decoders; use of long immediate or LCP can manifest as MITE fetch bandwidth bottleneck. 
Sample with: FRONTEND_RETIRED.ANY_DSB_MISS", 1187 1188 "ScaleUnit": "100%" 1188 1189 }, ··· 1197 1198 }, 1198 1199 { 1199 1200 "BriefDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS)", 1200 - "MetricExpr": "2 * IDQ.MS_SWITCHES / tma_info_clks", 1201 + "MetricExpr": "2 * IDQ.MS_SWITCHES / tma_info_thread_clks", 1201 1202 "MetricGroup": "FetchLat;MicroSeq;TopdownL3;tma_L3_group;tma_fetch_latency_group;tma_issueMC;tma_issueMS;tma_issueMV;tma_issueSO", 1202 1203 "MetricName": "tma_ms_switches", 1203 1204 "MetricThreshold": "tma_ms_switches > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", ··· 1233 1234 }, 1234 1235 { 1235 1236 "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 0 ([SNB+] ALU; [HSW+] ALU and 2nd branch)", 1236 - "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_0 / tma_info_core_clks", 1237 + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_0 / tma_info_core_core_clks", 1237 1238 "MetricGroup": "Compute;TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P", 1238 1239 "MetricName": "tma_port_0", 1239 1240 "MetricThreshold": "tma_port_0 > 0.6", ··· 1242 1243 }, 1243 1244 { 1244 1245 "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 1 (ALU)", 1245 - "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_1 / tma_info_core_clks", 1246 + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_1 / tma_info_core_core_clks", 1246 1247 "MetricGroup": "TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P", 1247 1248 "MetricName": "tma_port_1", 1248 1249 "MetricThreshold": "tma_port_1 > 0.6", ··· 1251 1252 }, 1252 1253 { 1253 1254 "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 2 ([SNB+]Loads and Store-address; [ICL+] Loads)", 1254 - "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_2 / tma_info_core_clks", 1255 + 
"MetricExpr": "UOPS_DISPATCHED_PORT.PORT_2 / tma_info_core_core_clks", 1255 1256 "MetricGroup": "TopdownL6;tma_L6_group;tma_load_op_utilization_group", 1256 1257 "MetricName": "tma_port_2", 1257 1258 "MetricThreshold": "tma_port_2 > 0.6", ··· 1260 1261 }, 1261 1262 { 1262 1263 "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 3 ([SNB+]Loads and Store-address; [ICL+] Loads)", 1263 - "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_3 / tma_info_core_clks", 1264 + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_3 / tma_info_core_core_clks", 1264 1265 "MetricGroup": "TopdownL6;tma_L6_group;tma_load_op_utilization_group", 1265 1266 "MetricName": "tma_port_3", 1266 1267 "MetricThreshold": "tma_port_3 > 0.6", ··· 1278 1279 }, 1279 1280 { 1280 1281 "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 5 ([SNB+] Branches and ALU; [HSW+] ALU)", 1281 - "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_5 / tma_info_core_clks", 1282 + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_5 / tma_info_core_core_clks", 1282 1283 "MetricGroup": "TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P", 1283 1284 "MetricName": "tma_port_5", 1284 1285 "MetricThreshold": "tma_port_5 > 0.6", ··· 1287 1288 }, 1288 1289 { 1289 1290 "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+]Primary Branch and simple ALU)", 1290 - "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_6 / tma_info_core_clks", 1291 + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_6 / tma_info_core_core_clks", 1291 1292 "MetricGroup": "TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P", 1292 1293 "MetricName": "tma_port_6", 1293 1294 "MetricThreshold": "tma_port_6 > 0.6", ··· 1296 1297 }, 1297 1298 { 1298 1299 "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 7 ([HSW+]simple Store-address)", 1299 - "MetricExpr": 
"UOPS_DISPATCHED_PORT.PORT_7 / tma_info_core_clks", 1300 + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_7 / tma_info_core_core_clks", 1300 1301 "MetricGroup": "TopdownL6;tma_L6_group;tma_store_op_utilization_group", 1301 1302 "MetricName": "tma_port_7", 1302 1303 "MetricThreshold": "tma_port_7 > 0.6", ··· 1305 1306 }, 1306 1307 { 1307 1308 "BriefDescription": "This metric estimates fraction of cycles the CPU performance was potentially limited due to Core computation issues (non divider-related)", 1308 - "MetricExpr": "((EXE_ACTIVITY.EXE_BOUND_0_PORTS + (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL)) / tma_info_clks if ARITH.DIVIDER_ACTIVE < CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY else (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL) / tma_info_clks)", 1309 + "MetricExpr": "((EXE_ACTIVITY.EXE_BOUND_0_PORTS + (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL)) / tma_info_thread_clks if ARITH.DIVIDER_ACTIVE < CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY else (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL) / tma_info_thread_clks)", 1309 1310 "MetricGroup": "PortsUtil;TopdownL3;tma_L3_group;tma_core_bound_group", 1310 1311 "MetricName": "tma_ports_utilization", 1311 1312 "MetricThreshold": "tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)", ··· 1314 1315 }, 1315 1316 { 1316 1317 "BriefDescription": "This metric represents fraction of cycles CPU executed no uops on any execution port (Logical Processor cycles since ICL, Physical Core cycles otherwise)", 1317 - "MetricExpr": "(UOPS_EXECUTED.CORE_CYCLES_NONE / 2 if #SMT_on else CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY) / tma_info_core_clks", 1318 + "MetricExpr": "(UOPS_EXECUTED.CORE_CYCLES_NONE / 2 if #SMT_on else CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY) / tma_info_core_core_clks", 1318 1319 "MetricGroup": 
"PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group", 1319 1320 "MetricName": "tma_ports_utilized_0", 1320 1321 "MetricThreshold": "tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", ··· 1323 1324 }, 1324 1325 { 1325 1326 "BriefDescription": "This metric represents fraction of cycles where the CPU executed total of 1 uop per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", 1326 - "MetricExpr": "((UOPS_EXECUTED.CORE_CYCLES_GE_1 - UOPS_EXECUTED.CORE_CYCLES_GE_2) / 2 if #SMT_on else EXE_ACTIVITY.1_PORTS_UTIL) / tma_info_core_clks", 1327 + "MetricExpr": "((UOPS_EXECUTED.CORE_CYCLES_GE_1 - UOPS_EXECUTED.CORE_CYCLES_GE_2) / 2 if #SMT_on else EXE_ACTIVITY.1_PORTS_UTIL) / tma_info_core_core_clks", 1327 1328 "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_issueL1;tma_ports_utilization_group", 1328 1329 "MetricName": "tma_ports_utilized_1", 1329 1330 "MetricThreshold": "tma_ports_utilized_1 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", ··· 1332 1333 }, 1333 1334 { 1334 1335 "BriefDescription": "This metric represents fraction of cycles CPU executed total of 2 uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", 1335 - "MetricExpr": "((UOPS_EXECUTED.CORE_CYCLES_GE_2 - UOPS_EXECUTED.CORE_CYCLES_GE_3) / 2 if #SMT_on else EXE_ACTIVITY.2_PORTS_UTIL) / tma_info_core_clks", 1336 + "MetricExpr": "((UOPS_EXECUTED.CORE_CYCLES_GE_2 - UOPS_EXECUTED.CORE_CYCLES_GE_3) / 2 if #SMT_on else EXE_ACTIVITY.2_PORTS_UTIL) / tma_info_core_core_clks", 1336 1337 "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_issue2P;tma_ports_utilization_group", 1337 1338 "MetricName": "tma_ports_utilized_2", 1338 1339 "MetricThreshold": "tma_ports_utilized_2 > 0.15 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", ··· 1341 1342 }, 1342 1343 { 1343 
1344 "BriefDescription": "This metric represents fraction of cycles CPU executed total of 3 or more uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise).", 1344 - "MetricExpr": "(UOPS_EXECUTED.CORE_CYCLES_GE_3 / 2 if #SMT_on else UOPS_EXECUTED.CORE_CYCLES_GE_3) / tma_info_core_clks", 1345 + "MetricExpr": "(UOPS_EXECUTED.CORE_CYCLES_GE_3 / 2 if #SMT_on else UOPS_EXECUTED.CORE_CYCLES_GE_3) / tma_info_core_core_clks", 1345 1346 "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group", 1346 1347 "MetricName": "tma_ports_utilized_3m", 1347 1348 "MetricThreshold": "tma_ports_utilized_3m > 0.7 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", ··· 1349 1350 }, 1350 1351 { 1351 1352 "BriefDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired", 1352 - "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / tma_info_slots", 1353 + "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / tma_info_thread_slots", 1353 1354 "MetricGroup": "TmaL1;TopdownL1;tma_L1_group", 1354 1355 "MetricName": "tma_retiring", 1355 1356 "MetricThreshold": "tma_retiring > 0.7 | tma_heavy_operations > 0.1", ··· 1359 1360 }, 1360 1361 { 1361 1362 "BriefDescription": "This metric represents fraction of cycles the CPU issue-pipeline was stalled due to serializing operations", 1362 - "MetricExpr": "PARTIAL_RAT_STALLS.SCOREBOARD / tma_info_clks", 1363 + "MetricExpr": "PARTIAL_RAT_STALLS.SCOREBOARD / tma_info_thread_clks", 1363 1364 "MetricGroup": "PortsUtil;TopdownL5;tma_L5_group;tma_issueSO;tma_ports_utilized_0_group", 1364 1365 "MetricName": "tma_serializing_operation", 1365 1366 "MetricThreshold": "tma_serializing_operation > 0.1 & (tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)))", ··· 1369 1370 { 1370 1371 "BriefDescription": "This metric estimates fraction of cycles handling memory 
load split accesses - load that cross 64-byte cache line boundary", 1371 1372 "MetricConstraint": "NO_GROUP_EVENTS_NMI", 1372 - "MetricExpr": "tma_info_load_miss_real_latency * LD_BLOCKS.NO_SR / tma_info_clks", 1373 + "MetricExpr": "tma_info_memory_load_miss_real_latency * LD_BLOCKS.NO_SR / tma_info_thread_clks", 1373 1374 "MetricGroup": "TopdownL4;tma_L4_group;tma_l1_bound_group", 1374 1375 "MetricName": "tma_split_loads", 1375 1376 "MetricThreshold": "tma_split_loads > 0.2 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", ··· 1378 1379 }, 1379 1380 { 1380 1381 "BriefDescription": "This metric represents rate of split store accesses", 1381 - "MetricExpr": "MEM_INST_RETIRED.SPLIT_STORES / tma_info_core_clks", 1382 + "MetricExpr": "MEM_INST_RETIRED.SPLIT_STORES / tma_info_core_core_clks", 1382 1383 "MetricGroup": "TopdownL4;tma_L4_group;tma_issueSpSt;tma_store_bound_group", 1383 1384 "MetricName": "tma_split_stores", 1384 1385 "MetricThreshold": "tma_split_stores > 0.2 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", ··· 1387 1388 }, 1388 1389 { 1389 1390 "BriefDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors)", 1390 - "MetricExpr": "(OFFCORE_REQUESTS_BUFFER.SQ_FULL / 2 if #SMT_on else OFFCORE_REQUESTS_BUFFER.SQ_FULL) / tma_info_core_clks", 1391 + "MetricExpr": "(OFFCORE_REQUESTS_BUFFER.SQ_FULL / 2 if #SMT_on else OFFCORE_REQUESTS_BUFFER.SQ_FULL) / tma_info_core_core_clks", 1391 1392 "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_issueBW;tma_l3_bound_group", 1392 1393 "MetricName": "tma_sq_full", 1393 1394 "MetricThreshold": "tma_sq_full > 0.3 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", 1394 - "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and 
both hardware SMT threads (Logical Processors). Related metrics: tma_fb_full, tma_info_dram_bw_use, tma_info_memory_bandwidth, tma_mem_bandwidth", 1395 + "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). Related metrics: tma_fb_full, tma_info_bottleneck_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth", 1395 1396 "ScaleUnit": "100%" 1396 1397 }, 1397 1398 { 1398 1399 "BriefDescription": "This metric estimates how often CPU was stalled due to RFO store memory accesses; RFO store issue a read-for-ownership request before the write", 1399 - "MetricExpr": "EXE_ACTIVITY.BOUND_ON_STORES / tma_info_clks", 1400 + "MetricExpr": "EXE_ACTIVITY.BOUND_ON_STORES / tma_info_thread_clks", 1400 1401 "MetricGroup": "MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", 1401 1402 "MetricName": "tma_store_bound", 1402 1403 "MetricThreshold": "tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", ··· 1405 1406 }, 1406 1407 { 1407 1408 "BriefDescription": "This metric roughly estimates fraction of cycles when the memory subsystem had loads blocked since they could not forward data from earlier (in program order) overlapping stores", 1408 - "MetricExpr": "13 * LD_BLOCKS.STORE_FORWARD / tma_info_clks", 1409 + "MetricExpr": "13 * LD_BLOCKS.STORE_FORWARD / tma_info_thread_clks", 1409 1410 "MetricGroup": "TopdownL4;tma_L4_group;tma_l1_bound_group", 1410 1411 "MetricName": "tma_store_fwd_blk", 1411 1412 "MetricThreshold": "tma_store_fwd_blk > 0.1 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", ··· 1415 1416 { 1416 1417 "BriefDescription": "This metric estimates fraction of cycles the CPU spent handling L1D store misses", 1417 1418 "MetricConstraint": "NO_GROUP_EVENTS_NMI", 1418 - "MetricExpr": "(L2_RQSTS.RFO_HIT * 9 * (1 - MEM_INST_RETIRED.LOCK_LOADS / 
MEM_INST_RETIRED.ALL_STORES) + (1 - MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES) * min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO)) / tma_info_clks", 1419 + "MetricExpr": "(L2_RQSTS.RFO_HIT * 9 * (1 - MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES) + (1 - MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES) * min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO)) / tma_info_thread_clks", 1419 1420 "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_L4_group;tma_issueRFO;tma_issueSL;tma_store_bound_group", 1420 1421 "MetricName": "tma_store_latency", 1421 1422 "MetricThreshold": "tma_store_latency > 0.1 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", ··· 1424 1425 }, 1425 1426 { 1426 1427 "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port for Store operations", 1427 - "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_4 / tma_info_core_clks", 1428 + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_4 / tma_info_core_core_clks", 1428 1429 "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group", 1429 1430 "MetricName": "tma_store_op_utilization", 1430 1431 "MetricThreshold": "tma_store_op_utilization > 0.6", ··· 1440 1441 }, 1441 1442 { 1442 1443 "BriefDescription": "This metric estimates the fraction of cycles where the STLB was missed by store accesses, performing a hardware page walk", 1443 - "MetricExpr": "DTLB_STORE_MISSES.WALK_ACTIVE / tma_info_core_clks", 1444 + "MetricExpr": "DTLB_STORE_MISSES.WALK_ACTIVE / tma_info_core_core_clks", 1444 1445 "MetricGroup": "MemoryTLB;TopdownL5;tma_L5_group;tma_dtlb_store_group", 1445 1446 "MetricName": "tma_store_stlb_miss", 1446 1447 "MetricThreshold": "tma_store_stlb_miss > 0.05 & (tma_dtlb_store > 0.05 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))", ··· 1448 1449 }, 1449 1450 { 1450 1451 "BriefDescription": "This metric 
represents fraction of cycles the CPU was stalled due to new branch address clears", 1451 - "MetricExpr": "9 * BACLEARS.ANY / tma_info_clks", 1452 + "MetricExpr": "9 * BACLEARS.ANY / tma_info_thread_clks", 1452 1453 "MetricGroup": "BigFoot;FetchLat;TopdownL4;tma_L4_group;tma_branch_resteers_group", 1453 1454 "MetricName": "tma_unknown_branches", 1454 1455 "MetricThreshold": "tma_unknown_branches > 0.05 & (tma_branch_resteers > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15))",

+31

tools/perf/pmu-events/arch/x86/skylakex/floating-point.json

··· 32 32 "UMask": "0x20" 33 33 }, 34 34 { 35 + "BriefDescription": "Number of SSE/AVX computational 128-bit packed single and 256-bit packed double precision FP instructions retired; some instructions will count twice as noted below. Each count represents 2 or/and 4 computation operations, 1 for each element. Applies to SSE* and AVX* packed single precision and packed double precision FP instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX RCP14 RSQRT14 SQRT DPP FM(N)ADD/SUB. DPP and FM(N)ADD/SUB count twice as they perform 2 calculations per element.", 36 + "EventCode": "0xC7", 37 + "EventName": "FP_ARITH_INST_RETIRED.4_FLOPS", 38 + "PublicDescription": "Number of SSE/AVX computational 128-bit packed single precision and 256-bit packed double precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 2 or/and 4 computation operations, one for each element. Applies to SSE* and AVX* packed single precision floating-point and packed double precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX RCP14 RSQRT14 SQRT DPP FM(N)ADD/SUB. DPP and FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events.", 39 + "SampleAfterValue": "1000003", 40 + "UMask": "0x18" 41 + }, 42 + { 35 43 "BriefDescription": "Counts number of SSE/AVX computational 512-bit packed double precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 8 computation operations, one for each element. Applies to SSE* and AVX* packed double precision floating-point instructions: ADD SUB MUL DIV MIN MAX SQRT RSQRT14 RCP14 FM(N)ADD/SUB. 
FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element.", 36 44 "EventCode": "0xC7", 37 45 "EventName": "FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE", ··· 56 48 "UMask": "0x80" 57 49 }, 58 50 { 51 + "BriefDescription": "Number of SSE/AVX computational 256-bit packed single precision and 512-bit packed double precision FP instructions retired; some instructions will count twice as noted below. Each count represents 8 computation operations, 1 for each element. Applies to SSE* and AVX* packed single precision and double precision FP instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX SQRT RSQRT RSQRT14 RCP RCP14 DPP FM(N)ADD/SUB. DPP and FM(N)ADD/SUB count twice as they perform 2 calculations per element.", 52 + "EventCode": "0xC7", 53 + "EventName": "FP_ARITH_INST_RETIRED.8_FLOPS", 54 + "PublicDescription": "Number of SSE/AVX computational 256-bit packed single precision and 512-bit packed double precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 8 computation operations, one for each element. Applies to SSE* and AVX* packed single precision and double precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX SQRT RSQRT RSQRT14 RCP RCP14 DPP FM(N)ADD/SUB. DPP and FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events.", 55 + "SampleAfterValue": "1000003", 56 + "UMask": "0x18" 57 + }, 58 + { 59 + "BriefDescription": "Counts once for most SIMD scalar computational floating-point instructions retired. Counts twice for DPP and FM(N)ADD/SUB instructions retired.", 60 + "EventCode": "0xC7", 61 + "EventName": "FP_ARITH_INST_RETIRED.SCALAR", 62 + "PublicDescription": "Counts once for most SIMD scalar computational single precision and double precision floating-point instructions retired; some instructions will count twice as noted below. 
Each count represents 1 computational operation. Applies to SIMD scalar single precision floating-point instructions: ADD SUB MUL DIV MIN MAX SQRT RSQRT RCP FM(N)ADD/SUB. FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events.", 63 + "SampleAfterValue": "2000003", 64 + "UMask": "0x3" 65 + }, 66 + { 59 67 "BriefDescription": "Counts once for most SIMD scalar computational double precision floating-point instructions retired. Counts twice for DPP and FM(N)ADD/SUB instructions retired.", 60 68 "EventCode": "0xC7", 61 69 "EventName": "FP_ARITH_INST_RETIRED.SCALAR_DOUBLE", ··· 86 62 "PublicDescription": "Counts once for most SIMD scalar computational single precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 1 computational operation. Applies to SIMD scalar single precision floating-point instructions: ADD SUB MUL DIV MIN MAX SQRT RSQRT RCP FM(N)ADD/SUB. FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events.", 87 63 "SampleAfterValue": "2000003", 88 64 "UMask": "0x2" 65 + }, 66 + { 67 + "BriefDescription": "Number of any Vector retired FP arithmetic instructions", 68 + "EventCode": "0xC7", 69 + "EventName": "FP_ARITH_INST_RETIRED.VECTOR", 70 + "SampleAfterValue": "2000003", 71 + "UMask": "0xfc" 89 72 }, 90 73 { 91 74 "BriefDescription": "Cycles with any input/output SSE or FP assist",

+21 -2

tools/perf/pmu-events/arch/x86/skylakex/pipeline.json

··· 26 26 "UMask": "0x4" 27 27 }, 28 28 { 29 - "BriefDescription": "Conditional branch instructions retired.", 29 + "BriefDescription": "Conditional branch instructions retired. [This event is alias to BR_INST_RETIRED.CONDITIONAL]", 30 + "Errata": "SKL091", 31 + "EventCode": "0xC4", 32 + "EventName": "BR_INST_RETIRED.COND", 33 + "PublicDescription": "This event counts conditional branch instructions retired. [This event is alias to BR_INST_RETIRED.CONDITIONAL]", 34 + "SampleAfterValue": "400009", 35 + "UMask": "0x1" 36 + }, 37 + { 38 + "BriefDescription": "Conditional branch instructions retired. [This event is alias to BR_INST_RETIRED.COND]", 30 39 "Errata": "SKL091", 31 40 "EventCode": "0xC4", 32 41 "EventName": "BR_INST_RETIRED.CONDITIONAL", 33 42 "PEBS": "1", 34 - "PublicDescription": "This event counts conditional branch instructions retired.", 43 + "PublicDescription": "This event counts conditional branch instructions retired. [This event is alias to BR_INST_RETIRED.COND]", 35 44 "SampleAfterValue": "400009", 36 45 "UMask": "0x1" 37 46 }, ··· 419 410 "Invert": "1", 420 411 "PEBS": "2", 421 412 "PublicDescription": "Number of cycles using an always true condition applied to PEBS instructions retired event. (inst_ret< 16)", 413 + "SampleAfterValue": "2000003", 414 + "UMask": "0x1" 415 + }, 416 + { 417 + "BriefDescription": "Clears speculative count", 418 + "CounterMask": "1", 419 + "EdgeDetect": "1", 420 + "EventCode": "0x0D", 421 + "EventName": "INT_MISC.CLEARS_COUNT", 422 + "PublicDescription": "Counts the number of speculative clears due to any type of branch misprediction or machine clears", 422 423 "SampleAfterValue": "2000003", 423 424 "UMask": "0x1" 424 425 },

+901 -668

tools/perf/pmu-events/arch/x86/skylakex/skx-metrics.json

··· 50 50 }, 51 51 { 52 52 "BriefDescription": "Uncore frequency per die [GHZ]", 53 - "MetricExpr": "tma_info_socket_clks / #num_dies / duration_time / 1e9", 53 + "MetricExpr": "tma_info_system_socket_clks / #num_dies / duration_time / 1e9", 54 54 "MetricGroup": "SoC", 55 55 "MetricName": "UNCORE_FREQ" 56 + }, 57 + { 58 + "BriefDescription": "Cycles per instruction retired; indicating how much time each executed instruction took; in units of cycles.", 59 + "MetricExpr": "CPU_CLK_UNHALTED.THREAD / INST_RETIRED.ANY", 60 + "MetricName": "cpi", 61 + "ScaleUnit": "1per_instr" 62 + }, 63 + { 64 + "BriefDescription": "CPU operating frequency (in GHz)", 65 + "MetricExpr": "CPU_CLK_UNHALTED.THREAD / CPU_CLK_UNHALTED.REF_TSC * #SYSTEM_TSC_FREQ / 1e9", 66 + "MetricName": "cpu_operating_frequency", 67 + "ScaleUnit": "1GHz" 68 + }, 69 + { 70 + "BriefDescription": "Percentage of time spent in the active CPU power state C0", 71 + "MetricExpr": "tma_info_system_cpu_utilization", 72 + "MetricName": "cpu_utilization", 73 + "ScaleUnit": "100%" 74 + }, 75 + { 76 + "BriefDescription": "Ratio of number of completed page walks (for 2 megabyte page sizes) caused by demand data loads to the total number of completed instructions", 77 + "MetricExpr": "DTLB_LOAD_MISSES.WALK_COMPLETED_2M_4M / INST_RETIRED.ANY", 78 + "MetricName": "dtlb_2mb_large_page_load_mpi", 79 + "PublicDescription": "Ratio of number of completed page walks (for 2 megabyte page sizes) caused by demand data loads to the total number of completed instructions. 
This implies it missed in the Data Translation Lookaside Buffer (DTLB) and further levels of TLB.", 80 + "ScaleUnit": "1per_instr" 81 + }, 82 + { 83 + "BriefDescription": "Ratio of number of completed page walks (for all page sizes) caused by demand data loads to the total number of completed instructions", 84 + "MetricExpr": "DTLB_LOAD_MISSES.WALK_COMPLETED / INST_RETIRED.ANY", 85 + "MetricName": "dtlb_load_mpi", 86 + "PublicDescription": "Ratio of number of completed page walks (for all page sizes) caused by demand data loads to the total number of completed instructions. This implies it missed in the DTLB and further levels of TLB.", 87 + "ScaleUnit": "1per_instr" 88 + }, 89 + { 90 + "BriefDescription": "Ratio of number of completed page walks (for all page sizes) caused by demand data stores to the total number of completed instructions", 91 + "MetricExpr": "DTLB_STORE_MISSES.WALK_COMPLETED / INST_RETIRED.ANY", 92 + "MetricName": "dtlb_store_mpi", 93 + "PublicDescription": "Ratio of number of completed page walks (for all page sizes) caused by demand data stores to the total number of completed instructions. 
This implies it missed in the DTLB and further levels of TLB.", 94 + "ScaleUnit": "1per_instr" 95 + }, 96 + { 97 + "BriefDescription": "Bandwidth of IO reads that are initiated by end device controllers that are requesting memory from the CPU.", 98 + "MetricExpr": "(UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART0 + UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART1 + UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART2 + UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART3) * 4 / 1e6 / duration_time", 99 + "MetricName": "io_bandwidth_read", 100 + "ScaleUnit": "1MB/s" 101 + }, 102 + { 103 + "BriefDescription": "Bandwidth of IO writes that are initiated by end device controllers that are writing memory to the CPU.", 104 + "MetricExpr": "(UNC_IIO_PAYLOAD_BYTES_IN.MEM_WRITE.PART0 + UNC_IIO_PAYLOAD_BYTES_IN.MEM_WRITE.PART1 + UNC_IIO_PAYLOAD_BYTES_IN.MEM_WRITE.PART2 + UNC_IIO_PAYLOAD_BYTES_IN.MEM_WRITE.PART3) * 4 / 1e6 / duration_time", 105 + "MetricName": "io_bandwidth_write", 106 + "ScaleUnit": "1MB/s" 107 + }, 108 + { 109 + "BriefDescription": "Ratio of number of completed page walks (for 2 megabyte and 4 megabyte page sizes) caused by a code fetch to the total number of completed instructions", 110 + "MetricExpr": "ITLB_MISSES.WALK_COMPLETED_2M_4M / INST_RETIRED.ANY", 111 + "MetricName": "itlb_large_page_mpi", 112 + "PublicDescription": "Ratio of number of completed page walks (for 2 megabyte and 4 megabyte page sizes) caused by a code fetch to the total number of completed instructions. 
This implies it missed in the Instruction Translation Lookaside Buffer (ITLB) and further levels of TLB.", 113 + "ScaleUnit": "1per_instr" 114 + }, 115 + { 116 + "BriefDescription": "Ratio of number of completed page walks (for all page sizes) caused by a code fetch to the total number of completed instructions", 117 + "MetricExpr": "ITLB_MISSES.WALK_COMPLETED / INST_RETIRED.ANY", 118 + "MetricName": "itlb_mpi", 119 + "PublicDescription": "Ratio of number of completed page walks (for all page sizes) caused by a code fetch to the total number of completed instructions. This implies it missed in the ITLB (Instruction TLB) and further levels of TLB.", 120 + "ScaleUnit": "1per_instr" 121 + }, 122 + { 123 + "BriefDescription": "Ratio of number of code read requests missing in L1 instruction cache (includes prefetches) to the total number of completed instructions", 124 + "MetricExpr": "L2_RQSTS.ALL_CODE_RD / INST_RETIRED.ANY", 125 + "MetricName": "l1_i_code_read_misses_with_prefetches_per_instr", 126 + "ScaleUnit": "1per_instr" 127 + }, 128 + { 129 + "BriefDescription": "Ratio of number of demand load requests hitting in L1 data cache to the total number of completed instructions", 130 + "MetricExpr": "MEM_LOAD_RETIRED.L1_HIT / INST_RETIRED.ANY", 131 + "MetricName": "l1d_demand_data_read_hits_per_instr", 132 + "ScaleUnit": "1per_instr" 133 + }, 134 + { 135 + "BriefDescription": "Ratio of number of requests missing L1 data cache (includes data+rfo w/ prefetches) to the total number of completed instructions", 136 + "MetricExpr": "L1D.REPLACEMENT / INST_RETIRED.ANY", 137 + "MetricName": "l1d_mpi", 138 + "ScaleUnit": "1per_instr" 139 + }, 140 + { 141 + "BriefDescription": "Ratio of number of code read request missing L2 cache to the total number of completed instructions", 142 + "MetricExpr": "L2_RQSTS.CODE_RD_MISS / INST_RETIRED.ANY", 143 + "MetricName": "l2_demand_code_mpi", 144 + "ScaleUnit": "1per_instr" 145 + }, 146 + { 147 + "BriefDescription": "Ratio of number of 
completed demand load requests hitting in L2 cache to the total number of completed instructions", 148 + "MetricExpr": "MEM_LOAD_RETIRED.L2_HIT / INST_RETIRED.ANY", 149 + "MetricName": "l2_demand_data_read_hits_per_instr", 150 + "ScaleUnit": "1per_instr" 151 + }, 152 + { 153 + "BriefDescription": "Ratio of number of completed data read request missing L2 cache to the total number of completed instructions", 154 + "MetricExpr": "MEM_LOAD_RETIRED.L2_MISS / INST_RETIRED.ANY", 155 + "MetricName": "l2_demand_data_read_mpi", 156 + "ScaleUnit": "1per_instr" 157 + }, 158 + { 159 + "BriefDescription": "Ratio of number of requests missing L2 cache (includes code+data+rfo w/ prefetches) to the total number of completed instructions", 160 + "MetricExpr": "L2_LINES_IN.ALL / INST_RETIRED.ANY", 161 + "MetricName": "l2_mpi", 162 + "ScaleUnit": "1per_instr" 163 + }, 164 + { 165 + "BriefDescription": "Ratio of number of code read requests missing last level core cache (includes demand w/ prefetches) to the total number of completed instructions", 166 + "MetricExpr": "cha@UNC_CHA_TOR_INSERTS.IA_MISS\\,config1\\=0x12CC0233@ / INST_RETIRED.ANY", 167 + "MetricName": "llc_code_read_mpi_demand_plus_prefetch", 168 + "ScaleUnit": "1per_instr" 169 + }, 170 + { 171 + "BriefDescription": "Average latency of a last level cache (LLC) demand and prefetch data read miss (read memory access) in nano seconds", 172 + "MetricExpr": "1e9 * (cha@UNC_CHA_TOR_OCCUPANCY.IA_MISS\\,config1\\=0x40433@ / cha@UNC_CHA_TOR_INSERTS.IA_MISS\\,config1\\=0x40433@) / (UNC_CHA_CLOCKTICKS / (#num_cores / #num_packages * #num_packages)) * duration_time", 173 + "MetricName": "llc_data_read_demand_plus_prefetch_miss_latency", 174 + "ScaleUnit": "1ns" 175 + }, 176 + { 177 + "BriefDescription": "Average latency of a last level cache (LLC) demand and prefetch data read miss (read memory access) addressed to local memory in nano seconds", 178 + "MetricExpr": "1e9 * (cha@UNC_CHA_TOR_OCCUPANCY.IA_MISS\\,config1\\=0x40432@ / 
cha@UNC_CHA_TOR_INSERTS.IA_MISS\\,config1\\=0x40432@) / (UNC_CHA_CLOCKTICKS / (#num_cores / #num_packages * #num_packages)) * duration_time", 179 + "MetricName": "llc_data_read_demand_plus_prefetch_miss_latency_for_local_requests", 180 + "ScaleUnit": "1ns" 181 + }, 182 + { 183 + "BriefDescription": "Average latency of a last level cache (LLC) demand and prefetch data read miss (read memory access) addressed to remote memory in nano seconds", 184 + "MetricExpr": "1e9 * (cha@UNC_CHA_TOR_OCCUPANCY.IA_MISS\\,config1\\=0x40431@ / cha@UNC_CHA_TOR_INSERTS.IA_MISS\\,config1\\=0x40431@) / (UNC_CHA_CLOCKTICKS / (#num_cores / #num_packages * #num_packages)) * duration_time", 185 + "MetricName": "llc_data_read_demand_plus_prefetch_miss_latency_for_remote_requests", 186 + "ScaleUnit": "1ns" 187 + }, 188 + { 189 + "BriefDescription": "Ratio of number of data read requests missing last level core cache (includes demand w/ prefetches) to the total number of completed instructions", 190 + "MetricExpr": "cha@UNC_CHA_TOR_INSERTS.IA_MISS\\,config1\\=0x12D40433@ / INST_RETIRED.ANY", 191 + "MetricName": "llc_data_read_mpi_demand_plus_prefetch", 192 + "ScaleUnit": "1per_instr" 193 + }, 194 + { 195 + "BriefDescription": "Bandwidth (MB/sec) of read requests that miss the last level cache (LLC) and go to local memory.", 196 + "MetricExpr": "UNC_CHA_REQUESTS.READS_LOCAL * 64 / 1e6 / duration_time", 197 + "MetricName": "llc_miss_local_memory_bandwidth_read", 198 + "ScaleUnit": "1MB/s" 199 + }, 200 + { 201 + "BriefDescription": "Bandwidth (MB/sec) of write requests that miss the last level cache (LLC) and go to local memory.", 202 + "MetricExpr": "UNC_CHA_REQUESTS.WRITES_LOCAL * 64 / 1e6 / duration_time", 203 + "MetricName": "llc_miss_local_memory_bandwidth_write", 204 + "ScaleUnit": "1MB/s" 205 + }, 206 + { 207 + "BriefDescription": "Bandwidth (MB/sec) of read requests that miss the last level cache (LLC) and go to remote memory.", 208 + "MetricExpr": "UNC_CHA_REQUESTS.READS_REMOTE * 64 / 1e6 
/ duration_time", 209 + "MetricName": "llc_miss_remote_memory_bandwidth_read", 210 + "ScaleUnit": "1MB/s" 211 + }, 212 + { 213 + "BriefDescription": "The ratio of number of completed memory load instructions to the total number completed instructions", 214 + "MetricExpr": "MEM_INST_RETIRED.ALL_LOADS / INST_RETIRED.ANY", 215 + "MetricName": "loads_per_instr", 216 + "ScaleUnit": "1per_instr" 217 + }, 218 + { 219 + "BriefDescription": "DDR memory read bandwidth (MB/sec)", 220 + "MetricExpr": "UNC_M_CAS_COUNT.RD * 64 / 1e6 / duration_time", 221 + "MetricName": "memory_bandwidth_read", 222 + "ScaleUnit": "1MB/s" 223 + }, 224 + { 225 + "BriefDescription": "DDR memory bandwidth (MB/sec)", 226 + "MetricExpr": "(UNC_M_CAS_COUNT.RD + UNC_M_CAS_COUNT.WR) * 64 / 1e6 / duration_time", 227 + "MetricName": "memory_bandwidth_total", 228 + "ScaleUnit": "1MB/s" 229 + }, 230 + { 231 + "BriefDescription": "DDR memory write bandwidth (MB/sec)", 232 + "MetricExpr": "UNC_M_CAS_COUNT.WR * 64 / 1e6 / duration_time", 233 + "MetricName": "memory_bandwidth_write", 234 + "ScaleUnit": "1MB/s" 235 + }, 236 + { 237 + "BriefDescription": "Memory read that miss the last level cache (LLC) addressed to local DRAM as a percentage of total memory read accesses, does not include LLC prefetches.", 238 + "MetricExpr": "cha@UNC_CHA_TOR_INSERTS.IA_MISS\\,config1\\=0x40432@ / (cha@UNC_CHA_TOR_INSERTS.IA_MISS\\,config1\\=0x40432@ + cha@UNC_CHA_TOR_INSERTS.IA_MISS\\,config1\\=0x40431@)", 239 + "MetricName": "numa_reads_addressed_to_local_dram", 240 + "ScaleUnit": "100%" 241 + }, 242 + { 243 + "BriefDescription": "Memory reads that miss the last level cache (LLC) addressed to remote DRAM as a percentage of total memory read accesses, does not include LLC prefetches.", 244 + "MetricExpr": "cha@UNC_CHA_TOR_INSERTS.IA_MISS\\,config1\\=0x40431@ / (cha@UNC_CHA_TOR_INSERTS.IA_MISS\\,config1\\=0x40432@ + cha@UNC_CHA_TOR_INSERTS.IA_MISS\\,config1\\=0x40431@)", 245 + "MetricName": "numa_reads_addressed_to_remote_dram", 
246 + "ScaleUnit": "100%" 247 + }, 248 + { 249 + "BriefDescription": "Uops delivered from decoded instruction cache (decoded stream buffer or DSB) as a percent of total uops delivered to Instruction Decode Queue", 250 + "MetricExpr": "IDQ.DSB_UOPS / UOPS_ISSUED.ANY", 251 + "MetricName": "percent_uops_delivered_from_decoded_icache", 252 + "ScaleUnit": "100%" 253 + }, 254 + { 255 + "BriefDescription": "Uops delivered from legacy decode pipeline (Micro-instruction Translation Engine or MITE) as a percent of total uops delivered to Instruction Decode Queue", 256 + "MetricExpr": "IDQ.MITE_UOPS / UOPS_ISSUED.ANY", 257 + "MetricName": "percent_uops_delivered_from_legacy_decode_pipeline", 258 + "ScaleUnit": "100%" 259 + }, 260 + { 261 + "BriefDescription": "Uops delivered from microcode sequencer (MS) as a percent of total uops delivered to Instruction Decode Queue", 262 + "MetricExpr": "IDQ.MS_UOPS / UOPS_ISSUED.ANY", 263 + "MetricName": "percent_uops_delivered_from_microcode_sequencer", 264 + "ScaleUnit": "100%" 56 265 }, 57 266 { 58 267 "BriefDescription": "Percentage of cycles spent in System Management Interrupts.", ··· 279 70 "ScaleUnit": "1SMI#" 280 71 }, 281 72 { 73 + "BriefDescription": "The ratio of number of completed memory store instructions to the total number completed instructions", 74 + "MetricExpr": "MEM_INST_RETIRED.ALL_STORES / INST_RETIRED.ANY", 75 + "MetricName": "stores_per_instr", 76 + "ScaleUnit": "1per_instr" 77 + }, 78 + { 282 79 "BriefDescription": "This metric estimates how often memory load accesses were aliased by preceding stores (in program order) with a 4K address offset", 283 - "MetricExpr": "LD_BLOCKS_PARTIAL.ADDRESS_ALIAS / tma_info_clks", 80 + "MetricExpr": "LD_BLOCKS_PARTIAL.ADDRESS_ALIAS / tma_info_thread_clks", 284 81 "MetricGroup": "TopdownL4;tma_L4_group;tma_l1_bound_group", 285 82 "MetricName": "tma_4k_aliasing", 286 83 "MetricThreshold": "tma_4k_aliasing > 0.2 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 
0.2))", ··· 295 80 }, 296 81 { 297 82 "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution ports for ALU operations.", 298 - "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_0 + UOPS_DISPATCHED_PORT.PORT_1 + UOPS_DISPATCHED_PORT.PORT_5 + UOPS_DISPATCHED_PORT.PORT_6) / tma_info_slots", 83 + "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_0 + UOPS_DISPATCHED_PORT.PORT_1 + UOPS_DISPATCHED_PORT.PORT_5 + UOPS_DISPATCHED_PORT.PORT_6) / tma_info_thread_slots", 299 84 "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group", 300 85 "MetricName": "tma_alu_op_utilization", 301 86 "MetricThreshold": "tma_alu_op_utilization > 0.6", ··· 303 88 }, 304 89 { 305 90 "BriefDescription": "This metric estimates fraction of slots the CPU retired uops delivered by the Microcode_Sequencer as a result of Assists", 306 - "MetricExpr": "100 * (FP_ASSIST.ANY + OTHER_ASSISTS.ANY) / tma_info_slots", 91 + "MetricExpr": "100 * (FP_ASSIST.ANY + OTHER_ASSISTS.ANY) / tma_info_thread_slots", 307 92 "MetricGroup": "TopdownL4;tma_L4_group;tma_microcode_sequencer_group", 308 93 "MetricName": "tma_assists", 309 94 "MetricThreshold": "tma_assists > 0.1 & (tma_microcode_sequencer > 0.05 & tma_heavy_operations > 0.1)", ··· 312 97 }, 313 98 { 314 99 "BriefDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend", 315 - "MetricExpr": "1 - tma_frontend_bound - (UOPS_ISSUED.ANY + 4 * (INT_MISC.RECOVERY_CYCLES_ANY / 2 if #SMT_on else INT_MISC.RECOVERY_CYCLES)) / tma_info_slots", 100 + "MetricExpr": "1 - tma_frontend_bound - (UOPS_ISSUED.ANY + 4 * (INT_MISC.RECOVERY_CYCLES_ANY / 2 if #SMT_on else INT_MISC.RECOVERY_CYCLES)) / tma_info_thread_slots", 316 101 "MetricGroup": "TmaL1;TopdownL1;tma_L1_group", 317 102 "MetricName": "tma_backend_bound", 318 103 "MetricThreshold": "tma_backend_bound > 0.2", ··· 322 107 }, 323 108 { 324 109 "BriefDescription": "This 
category represents fraction of slots wasted due to incorrect speculations", 325 - "MetricExpr": "(UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * (INT_MISC.RECOVERY_CYCLES_ANY / 2 if #SMT_on else INT_MISC.RECOVERY_CYCLES)) / tma_info_slots", 110 + "MetricExpr": "(UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * (INT_MISC.RECOVERY_CYCLES_ANY / 2 if #SMT_on else INT_MISC.RECOVERY_CYCLES)) / tma_info_thread_slots", 326 111 "MetricGroup": "TmaL1;TopdownL1;tma_L1_group", 327 112 "MetricName": "tma_bad_speculation", 328 113 "MetricThreshold": "tma_bad_speculation > 0.15", ··· 338 123 "MetricName": "tma_branch_mispredicts", 339 124 "MetricThreshold": "tma_branch_mispredicts > 0.1 & tma_bad_speculation > 0.15", 340 125 "MetricgroupNoGroup": "TopdownL2", 341 - "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction. These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path. Sample with: BR_MISP_RETIRED.ALL_BRANCHES. Related metrics: tma_info_branch_misprediction_cost, tma_info_mispredictions, tma_mispredicts_resteers", 126 + "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction. These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path. Sample with: BR_MISP_RETIRED.ALL_BRANCHES. 
Related metrics: tma_info_bad_spec_branch_misprediction_cost, tma_info_bottleneck_mispredictions, tma_mispredicts_resteers", 342 127 "ScaleUnit": "100%" 343 128 }, 344 129 { 345 130 "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers", 346 - "MetricExpr": "INT_MISC.CLEAR_RESTEER_CYCLES / tma_info_clks + tma_unknown_branches", 131 + "MetricExpr": "INT_MISC.CLEAR_RESTEER_CYCLES / tma_info_thread_clks + tma_unknown_branches", 347 132 "MetricGroup": "FetchLat;TopdownL3;tma_L3_group;tma_fetch_latency_group", 348 133 "MetricName": "tma_branch_resteers", 349 134 "MetricThreshold": "tma_branch_resteers > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", ··· 361 146 }, 362 147 { 363 148 "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Machine Clears", 364 - "MetricExpr": "(1 - BR_MISP_RETIRED.ALL_BRANCHES / (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT)) * INT_MISC.CLEAR_RESTEER_CYCLES / tma_info_clks", 149 + "MetricExpr": "(1 - BR_MISP_RETIRED.ALL_BRANCHES / (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT)) * INT_MISC.CLEAR_RESTEER_CYCLES / tma_info_thread_clks", 365 150 "MetricGroup": "BadSpec;MachineClears;TopdownL4;tma_L4_group;tma_branch_resteers_group;tma_issueMC", 366 151 "MetricName": "tma_clears_resteers", 367 152 "MetricThreshold": "tma_clears_resteers > 0.05 & (tma_branch_resteers > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15))", ··· 371 156 { 372 157 "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to contested accesses", 373 158 "MetricConstraint": "NO_GROUP_EVENTS", 374 - "MetricExpr": "(44 * tma_info_average_frequency * (MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM * (OFFCORE_RESPONSE.DEMAND_DATA_RD.L3_HIT.HITM_OTHER_CORE / (OFFCORE_RESPONSE.DEMAND_DATA_RD.L3_HIT.HITM_OTHER_CORE + 
OFFCORE_RESPONSE.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD))) + 44 * tma_info_average_frequency * MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_clks", 159 + "MetricExpr": "(44 * tma_info_system_average_frequency * (MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM * (OFFCORE_RESPONSE.DEMAND_DATA_RD.L3_HIT.HITM_OTHER_CORE / (OFFCORE_RESPONSE.DEMAND_DATA_RD.L3_HIT.HITM_OTHER_CORE + OFFCORE_RESPONSE.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD))) + 44 * tma_info_system_average_frequency * MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", 375 160 "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_l3_bound_group", 376 161 "MetricName": "tma_contested_accesses", 377 162 "MetricThreshold": "tma_contested_accesses > 0.05 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", ··· 392 177 { 393 178 "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to data-sharing accesses", 394 179 "MetricConstraint": "NO_GROUP_EVENTS", 395 - "MetricExpr": "44 * tma_info_average_frequency * (MEM_LOAD_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM * (1 - OFFCORE_RESPONSE.DEMAND_DATA_RD.L3_HIT.HITM_OTHER_CORE / (OFFCORE_RESPONSE.DEMAND_DATA_RD.L3_HIT.HITM_OTHER_CORE + OFFCORE_RESPONSE.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD))) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_clks", 180 + "MetricExpr": "44 * tma_info_system_average_frequency * (MEM_LOAD_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM * (1 - OFFCORE_RESPONSE.DEMAND_DATA_RD.L3_HIT.HITM_OTHER_CORE / (OFFCORE_RESPONSE.DEMAND_DATA_RD.L3_HIT.HITM_OTHER_CORE + OFFCORE_RESPONSE.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD))) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", 396 181 "MetricGroup": 
"Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_l3_bound_group", 397 182 "MetricName": "tma_data_sharing", 398 183 "MetricThreshold": "tma_data_sharing > 0.05 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", ··· 401 186 }, 402 187 { 403 188 "BriefDescription": "This metric represents fraction of cycles where decoder-0 was the only active decoder", 404 - "MetricExpr": "(cpu@INST_DECODED.DECODERS\\,cmask\\=1@ - cpu@INST_DECODED.DECODERS\\,cmask\\=2@) / tma_info_core_clks / 2", 189 + "MetricExpr": "(cpu@INST_DECODED.DECODERS\\,cmask\\=1@ - cpu@INST_DECODED.DECODERS\\,cmask\\=2@) / tma_info_core_core_clks / 2", 405 190 "MetricGroup": "DSBmiss;FetchBW;TopdownL4;tma_L4_group;tma_issueD0;tma_mite_group", 406 191 "MetricName": "tma_decoder0_alone", 407 - "MetricThreshold": "tma_decoder0_alone > 0.1 & (tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 4 > 0.35))", 192 + "MetricThreshold": "tma_decoder0_alone > 0.1 & (tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 4 > 0.35))", 408 193 "PublicDescription": "This metric represents fraction of cycles where decoder-0 was the only active decoder. 
Related metrics: tma_few_uops_instructions", 409 194 "ScaleUnit": "100%" 410 195 }, 411 196 { 412 197 "BriefDescription": "This metric represents fraction of cycles where the Divider unit was active", 413 - "MetricExpr": "ARITH.DIVIDER_ACTIVE / tma_info_clks", 198 + "MetricExpr": "ARITH.DIVIDER_ACTIVE / tma_info_thread_clks", 414 199 "MetricGroup": "TopdownL3;tma_L3_group;tma_core_bound_group", 415 200 "MetricName": "tma_divider", 416 201 "MetricThreshold": "tma_divider > 0.2 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)", ··· 420 205 { 421 206 "BriefDescription": "This metric estimates how often the CPU was stalled on accesses to external memory (DRAM) by loads", 422 207 "MetricConstraint": "NO_GROUP_EVENTS", 423 - "MetricExpr": "CYCLE_ACTIVITY.STALLS_L3_MISS / tma_info_clks + (CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS) / tma_info_clks - tma_l2_bound", 208 + "MetricExpr": "CYCLE_ACTIVITY.STALLS_L3_MISS / tma_info_thread_clks + (CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS) / tma_info_thread_clks - tma_l2_bound", 424 209 "MetricGroup": "MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", 425 210 "MetricName": "tma_dram_bound", 426 211 "MetricThreshold": "tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", ··· 429 214 }, 430 215 { 431 216 "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline", 432 - "MetricExpr": "(IDQ.ALL_DSB_CYCLES_ANY_UOPS - IDQ.ALL_DSB_CYCLES_4_UOPS) / tma_info_core_clks / 2", 217 + "MetricExpr": "(IDQ.ALL_DSB_CYCLES_ANY_UOPS - IDQ.ALL_DSB_CYCLES_4_UOPS) / tma_info_core_core_clks / 2", 433 218 "MetricGroup": "DSB;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group", 434 219 "MetricName": "tma_dsb", 435 - "MetricThreshold": "tma_dsb > 0.15 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 4 > 0.35)", 220 + "MetricThreshold": "tma_dsb > 0.15 & 
(tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 4 > 0.35)", 436 221 "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline. For example; inefficient utilization of the DSB cache structure or bank conflict when reading from it; are categorized here.", 437 222 "ScaleUnit": "100%" 438 223 }, 439 224 { 440 225 "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines", 441 - "MetricExpr": "DSB2MITE_SWITCHES.PENALTY_CYCLES / tma_info_clks", 226 + "MetricExpr": "DSB2MITE_SWITCHES.PENALTY_CYCLES / tma_info_thread_clks", 442 227 "MetricGroup": "DSBmiss;FetchLat;TopdownL3;tma_L3_group;tma_fetch_latency_group;tma_issueFB", 443 228 "MetricName": "tma_dsb_switches", 444 229 "MetricThreshold": "tma_dsb_switches > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", 445 - "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines. The DSB (decoded i-cache) is a Uop Cache where the front-end directly delivers Uops (micro operations) avoiding heavy x86 decoding. The DSB pipeline has shorter latency and delivered higher bandwidth than the MITE (legacy instruction decode pipeline). Switching between the two pipelines can cause penalties hence this metric measures the exposed penalty. Sample with: FRONTEND_RETIRED.DSB_MISS_PS. Related metrics: tma_fetch_bandwidth, tma_info_dsb_coverage, tma_info_dsb_misses, tma_info_iptb, tma_lcp", 230 + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines. The DSB (decoded i-cache) is a Uop Cache where the front-end directly delivers Uops (micro operations) avoiding heavy x86 decoding. The DSB pipeline has shorter latency and delivered higher bandwidth than the MITE (legacy instruction decode pipeline). 
Switching between the two pipelines can cause penalties hence this metric measures the exposed penalty. Sample with: FRONTEND_RETIRED.DSB_MISS_PS. Related metrics: tma_fetch_bandwidth, tma_info_botlnk_l2_dsb_misses, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb, tma_lcp", 446 231 "ScaleUnit": "100%" 447 232 }, 448 233 { 449 234 "BriefDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses", 450 235 "MetricConstraint": "NO_GROUP_EVENTS_NMI", 451 - "MetricExpr": "min(9 * cpu@DTLB_LOAD_MISSES.STLB_HIT\\,cmask\\=1@ + DTLB_LOAD_MISSES.WALK_ACTIVE, max(CYCLE_ACTIVITY.CYCLES_MEM_ANY - CYCLE_ACTIVITY.CYCLES_L1D_MISS, 0)) / tma_info_clks", 236 + "MetricExpr": "min(9 * cpu@DTLB_LOAD_MISSES.STLB_HIT\\,cmask\\=1@ + DTLB_LOAD_MISSES.WALK_ACTIVE, max(CYCLE_ACTIVITY.CYCLES_MEM_ANY - CYCLE_ACTIVITY.CYCLES_L1D_MISS, 0)) / tma_info_thread_clks", 452 237 "MetricGroup": "MemoryTLB;TopdownL4;tma_L4_group;tma_issueTLB;tma_l1_bound_group", 453 238 "MetricName": "tma_dtlb_load", 454 239 "MetricThreshold": "tma_dtlb_load > 0.1 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", 455 - "PublicDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses. TLBs (Translation Look-aside Buffers) are processor caches for recently used entries out of the Page Tables that are used to map virtual- to physical-addresses by the operating system. This metric approximates the potential delay of demand loads missing the first-level data TLB (assuming worst case scenario with back to back misses to different pages). This includes hitting in the second-level TLB (STLB) as well as performing a hardware page walk on an STLB miss. Sample with: MEM_INST_RETIRED.STLB_MISS_LOADS_PS. 
Related metrics: tma_dtlb_store, tma_info_memory_data_tlbs", 240 + "PublicDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses. TLBs (Translation Look-aside Buffers) are processor caches for recently used entries out of the Page Tables that are used to map virtual- to physical-addresses by the operating system. This metric approximates the potential delay of demand loads missing the first-level data TLB (assuming worst case scenario with back to back misses to different pages). This includes hitting in the second-level TLB (STLB) as well as performing a hardware page walk on an STLB miss. Sample with: MEM_INST_RETIRED.STLB_MISS_LOADS_PS. Related metrics: tma_dtlb_store, tma_info_bottleneck_memory_data_tlbs", 456 241 "ScaleUnit": "100%" 457 242 }, 458 243 { 459 244 "BriefDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses", 460 - "MetricExpr": "(9 * cpu@DTLB_STORE_MISSES.STLB_HIT\\,cmask\\=1@ + DTLB_STORE_MISSES.WALK_ACTIVE) / tma_info_core_clks", 245 + "MetricExpr": "(9 * cpu@DTLB_STORE_MISSES.STLB_HIT\\,cmask\\=1@ + DTLB_STORE_MISSES.WALK_ACTIVE) / tma_info_core_core_clks", 461 246 "MetricGroup": "MemoryTLB;TopdownL4;tma_L4_group;tma_issueTLB;tma_store_bound_group", 462 247 "MetricName": "tma_dtlb_store", 463 248 "MetricThreshold": "tma_dtlb_store > 0.05 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", 464 - "PublicDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses. As with ordinary data caching; focus on improving data locality and reducing working-set size to reduce DTLB overhead. Additionally; consider using profile-guided optimization (PGO) to collocate frequently-used data on the same page. Try using larger page sizes for large amounts of frequently-used data. Sample with: MEM_INST_RETIRED.STLB_MISS_STORES_PS. 
Related metrics: tma_dtlb_load, tma_info_memory_data_tlbs", 249 + "PublicDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses. As with ordinary data caching; focus on improving data locality and reducing working-set size to reduce DTLB overhead. Additionally; consider using profile-guided optimization (PGO) to collocate frequently-used data on the same page. Try using larger page sizes for large amounts of frequently-used data. Sample with: MEM_INST_RETIRED.STLB_MISS_STORES_PS. Related metrics: tma_dtlb_load, tma_info_bottleneck_memory_data_tlbs", 465 250 "ScaleUnit": "100%" 466 251 }, 467 252 { 468 253 "BriefDescription": "This metric roughly estimates how often CPU was handling synchronizations due to False Sharing", 469 254 "MetricConstraint": "NO_GROUP_EVENTS", 470 - "MetricExpr": "(110 * tma_info_average_frequency * (OFFCORE_RESPONSE.DEMAND_RFO.L3_MISS.REMOTE_HITM + OFFCORE_RESPONSE.PF_L2_RFO.L3_MISS.REMOTE_HITM) + 47.5 * tma_info_average_frequency * (OFFCORE_RESPONSE.DEMAND_RFO.L3_HIT.HITM_OTHER_CORE + OFFCORE_RESPONSE.PF_L2_RFO.L3_HIT.HITM_OTHER_CORE)) / tma_info_clks", 255 + "MetricExpr": "(110 * tma_info_system_average_frequency * (OFFCORE_RESPONSE.DEMAND_RFO.L3_MISS.REMOTE_HITM + OFFCORE_RESPONSE.PF_L2_RFO.L3_MISS.REMOTE_HITM) + 47.5 * tma_info_system_average_frequency * (OFFCORE_RESPONSE.DEMAND_RFO.L3_HIT.HITM_OTHER_CORE + OFFCORE_RESPONSE.PF_L2_RFO.L3_HIT.HITM_OTHER_CORE)) / tma_info_thread_clks", 471 256 "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_store_bound_group", 472 257 "MetricName": "tma_false_sharing", 473 258 "MetricThreshold": "tma_false_sharing > 0.05 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", ··· 477 262 { 478 263 "BriefDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed", 479 264 
"MetricConstraint": "NO_GROUP_EVENTS_NMI", 480 - "MetricExpr": "tma_info_load_miss_real_latency * cpu@L1D_PEND_MISS.FB_FULL\\,cmask\\=1@ / tma_info_clks", 265 + "MetricExpr": "tma_info_memory_load_miss_real_latency * cpu@L1D_PEND_MISS.FB_FULL\\,cmask\\=1@ / tma_info_thread_clks", 481 266 "MetricGroup": "MemoryBW;TopdownL4;tma_L4_group;tma_issueBW;tma_issueSL;tma_issueSmSt;tma_l1_bound_group", 482 267 "MetricName": "tma_fb_full", 483 268 "MetricThreshold": "tma_fb_full > 0.3", 484 - "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory). Related metrics: tma_info_dram_bw_use, tma_info_memory_bandwidth, tma_mem_bandwidth, tma_sq_full, tma_store_latency, tma_streaming_stores", 269 + "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory). 
Related metrics: tma_info_bottleneck_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full, tma_store_latency, tma_streaming_stores", 485 270 "ScaleUnit": "100%" 486 271 }, 487 272 { ··· 489 274 "MetricExpr": "tma_frontend_bound - tma_fetch_latency", 490 275 "MetricGroup": "FetchBW;Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group;tma_issueFB", 491 276 "MetricName": "tma_fetch_bandwidth", 492 - "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 4 > 0.35", 277 + "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 4 > 0.35", 493 278 "MetricgroupNoGroup": "TopdownL2", 494 - "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Sample with: FRONTEND_RETIRED.LATENCY_GE_2_BUBBLES_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_2_PS. Related metrics: tma_dsb_switches, tma_info_dsb_coverage, tma_info_dsb_misses, tma_info_iptb, tma_lcp", 279 + "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Sample with: FRONTEND_RETIRED.LATENCY_GE_2_BUBBLES_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_2_PS. 
Related metrics: tma_dsb_switches, tma_info_botlnk_l2_dsb_misses, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb, tma_lcp", 495 280 "ScaleUnit": "100%" 496 281 }, 497 282 { 498 283 "BriefDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues", 499 - "MetricExpr": "4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / tma_info_slots", 284 + "MetricExpr": "4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / tma_info_thread_slots", 500 285 "MetricGroup": "Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group", 501 286 "MetricName": "tma_fetch_latency", 502 287 "MetricThreshold": "tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15", ··· 571 356 }, 572 357 { 573 358 "BriefDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend", 574 - "MetricExpr": "IDQ_UOPS_NOT_DELIVERED.CORE / tma_info_slots", 359 + "MetricExpr": "IDQ_UOPS_NOT_DELIVERED.CORE / tma_info_thread_slots", 575 360 "MetricGroup": "PGO;TmaL1;TopdownL1;tma_L1_group", 576 361 "MetricName": "tma_frontend_bound", 577 362 "MetricThreshold": "tma_frontend_bound > 0.15", ··· 590 375 }, 591 376 { 592 377 "BriefDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences", 593 - "MetricExpr": "(UOPS_RETIRED.RETIRE_SLOTS + UOPS_RETIRED.MACRO_FUSED - INST_RETIRED.ANY) / tma_info_slots", 378 + "MetricExpr": "(UOPS_RETIRED.RETIRE_SLOTS + UOPS_RETIRED.MACRO_FUSED - INST_RETIRED.ANY) / tma_info_thread_slots", 594 379 "MetricGroup": "Retire;TmaL2;TopdownL2;tma_L2_group;tma_retiring_group", 595 380 "MetricName": "tma_heavy_operations", 596 381 "MetricThreshold": "tma_heavy_operations > 0.1", ··· 600 385 }, 601 386 { 602 387 "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses", 603 - "MetricExpr": 
"(ICACHE_16B.IFDATA_STALL + 2 * cpu@ICACHE_16B.IFDATA_STALL\\,cmask\\=1\\,edge@) / tma_info_clks", 388 + "MetricExpr": "(ICACHE_16B.IFDATA_STALL + 2 * cpu@ICACHE_16B.IFDATA_STALL\\,cmask\\=1\\,edge@) / tma_info_thread_clks", 604 389 "MetricGroup": "BigFoot;FetchLat;IcMiss;TopdownL3;tma_L3_group;tma_fetch_latency_group", 605 390 "MetricName": "tma_icache_misses", 606 391 "MetricThreshold": "tma_icache_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", ··· 608 393 "ScaleUnit": "100%" 609 394 }, 610 395 { 611 - "BriefDescription": "Measured Average Frequency for unhalted processors [GHz]", 612 - "MetricExpr": "tma_info_turbo_utilization * TSC / 1e9 / duration_time", 613 - "MetricGroup": "Power;Summary", 614 - "MetricName": "tma_info_average_frequency" 615 - }, 616 - { 617 - "BriefDescription": "Total pipeline cost of instruction fetch related bottlenecks by large code footprint programs (i-side cache; TLB and BTB misses)", 618 - "MetricConstraint": "NO_GROUP_EVENTS", 619 - "MetricExpr": "100 * tma_fetch_latency * (tma_itlb_misses + tma_icache_misses + tma_unknown_branches) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)", 620 - "MetricGroup": "BigFoot;Fed;Frontend;IcMiss;MemoryTLB;tma_issueBC", 621 - "MetricName": "tma_info_big_code", 622 - "MetricThreshold": "tma_info_big_code > 20", 623 - "PublicDescription": "Total pipeline cost of instruction fetch related bottlenecks by large code footprint programs (i-side cache; TLB and BTB misses). 
Related metrics: tma_info_branching_overhead" 624 - }, 625 - { 626 - "BriefDescription": "Branch instructions per taken branch.", 627 - "MetricExpr": "BR_INST_RETIRED.ALL_BRANCHES / BR_INST_RETIRED.NEAR_TAKEN", 628 - "MetricGroup": "Branches;Fed;PGO", 629 - "MetricName": "tma_info_bptkbranch" 630 - }, 631 - { 632 396 "BriefDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear)", 633 - "MetricExpr": "(tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) * tma_info_slots / BR_MISP_RETIRED.ALL_BRANCHES", 397 + "MetricExpr": "(tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) * tma_info_thread_slots / BR_MISP_RETIRED.ALL_BRANCHES", 634 398 "MetricGroup": "Bad;BrMispredicts;tma_issueBM", 635 - "MetricName": "tma_info_branch_misprediction_cost", 636 - "PublicDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear). Related metrics: tma_branch_mispredicts, tma_info_mispredictions, tma_mispredicts_resteers" 399 + "MetricName": "tma_info_bad_spec_branch_misprediction_cost", 400 + "PublicDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear). 
Related metrics: tma_branch_mispredicts, tma_info_bottleneck_mispredictions, tma_mispredicts_resteers" 637 401 }, 638 402 { 639 - "BriefDescription": "Total pipeline cost of branch related instructions (used for program control-flow including function calls)", 640 - "MetricExpr": "100 * ((BR_INST_RETIRED.CONDITIONAL + 3 * BR_INST_RETIRED.NEAR_CALL + (BR_INST_RETIRED.NEAR_TAKEN - (BR_INST_RETIRED.CONDITIONAL - BR_INST_RETIRED.NOT_TAKEN) - 2 * BR_INST_RETIRED.NEAR_CALL)) / tma_info_slots)", 641 - "MetricGroup": "Ret;tma_issueBC", 642 - "MetricName": "tma_info_branching_overhead", 643 - "MetricThreshold": "tma_info_branching_overhead > 10", 644 - "PublicDescription": "Total pipeline cost of branch related instructions (used for program control-flow including function calls). Related metrics: tma_info_big_code" 403 + "BriefDescription": "Instructions per retired mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate).", 404 + "MetricExpr": "tma_info_inst_mix_instructions / (UOPS_RETIRED.RETIRE_SLOTS / UOPS_ISSUED.ANY * cpu@BR_MISP_EXEC.ALL_BRANCHES\\,umask\\=0xE4@)", 405 + "MetricGroup": "Bad;BrMispredicts", 406 + "MetricName": "tma_info_bad_spec_ipmisp_indirect", 407 + "MetricThreshold": "tma_info_bad_spec_ipmisp_indirect < 1e3" 645 408 }, 646 409 { 647 - "BriefDescription": "Fraction of branches that are CALL or RET", 648 - "MetricExpr": "(BR_INST_RETIRED.NEAR_CALL + BR_INST_RETIRED.NEAR_RETURN) / BR_INST_RETIRED.ALL_BRANCHES", 649 - "MetricGroup": "Bad;Branches", 650 - "MetricName": "tma_info_callret" 651 - }, 652 - { 653 - "BriefDescription": "Per-Logical Processor actual clocks when the Logical Processor is active.", 654 - "MetricExpr": "CPU_CLK_UNHALTED.THREAD", 655 - "MetricGroup": "Pipeline", 656 - "MetricName": "tma_info_clks" 657 - }, 658 - { 659 - "BriefDescription": "STLB (2nd level TLB) code speculative misses per kilo instruction (misses of any page-size that complete the page walk)", 660 - "MetricExpr": "1e3 * 
ITLB_MISSES.WALK_COMPLETED / INST_RETIRED.ANY", 661 - "MetricGroup": "Fed;MemoryTLB", 662 - "MetricName": "tma_info_code_stlb_mpki" 663 - }, 664 - { 665 - "BriefDescription": "Fraction of branches that are non-taken conditionals", 666 - "MetricExpr": "BR_INST_RETIRED.NOT_TAKEN / BR_INST_RETIRED.ALL_BRANCHES", 667 - "MetricGroup": "Bad;Branches;CodeGen;PGO", 668 - "MetricName": "tma_info_cond_nt" 669 - }, 670 - { 671 - "BriefDescription": "Fraction of branches that are taken conditionals", 672 - "MetricExpr": "(BR_INST_RETIRED.CONDITIONAL - BR_INST_RETIRED.NOT_TAKEN) / BR_INST_RETIRED.ALL_BRANCHES", 673 - "MetricGroup": "Bad;Branches;CodeGen;PGO", 674 - "MetricName": "tma_info_cond_tk" 410 + "BriefDescription": "Number of Instructions per non-speculative Branch Misprediction (JEClear) (lower number means higher occurrence rate)", 411 + "MetricExpr": "tma_info_core_ipmispredict", 412 + "MetricGroup": "Bad;BadSpec;BrMispredicts", 413 + "MetricName": "tma_info_bad_spec_ipmispredict", 414 + "MetricThreshold": "tma_info_bad_spec_ipmispredict < 200" 675 415 }, 676 416 { 677 417 "BriefDescription": "Probability of Core Bound bottleneck hidden by SMT-profiling artifacts", 678 418 "MetricConstraint": "NO_GROUP_EVENTS", 679 - "MetricExpr": "(100 * (1 - tma_core_bound / tma_ports_utilization if tma_core_bound < tma_ports_utilization else 1) if tma_info_smt_2t_utilization > 0.5 else 0)", 419 + "MetricExpr": "(100 * (1 - tma_core_bound / tma_ports_utilization if tma_core_bound < tma_ports_utilization else 1) if tma_info_system_smt_2t_utilization > 0.5 else 0)", 680 420 "MetricGroup": "Cor;SMT", 681 - "MetricName": "tma_info_core_bound_likely", 682 - "MetricThreshold": "tma_info_core_bound_likely > 0.5" 683 - }, 684 - { 685 - "BriefDescription": "Core actual clocks when any Logical Processor is active on the Physical Core", 686 - "MetricExpr": "(CPU_CLK_UNHALTED.THREAD / 2 * (1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK) if #core_wide < 1 else 
(CPU_CLK_UNHALTED.THREAD_ANY / 2 if #SMT_on else tma_info_clks))", 687 - "MetricGroup": "SMT", 688 - "MetricName": "tma_info_core_clks" 689 - }, 690 - { 691 - "BriefDescription": "Instructions Per Cycle across hyper-threads (per physical core)", 692 - "MetricExpr": "INST_RETIRED.ANY / tma_info_core_clks", 693 - "MetricGroup": "Ret;SMT;TmaL1;tma_L1_group", 694 - "MetricName": "tma_info_coreipc" 695 - }, 696 - { 697 - "BriefDescription": "Cycles Per Instruction (per Logical Processor)", 698 - "MetricExpr": "1 / tma_info_ipc", 699 - "MetricGroup": "Mem;Pipeline", 700 - "MetricName": "tma_info_cpi" 701 - }, 702 - { 703 - "BriefDescription": "Average CPU Utilization", 704 - "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / TSC", 705 - "MetricGroup": "HPC;Summary", 706 - "MetricName": "tma_info_cpu_utilization" 707 - }, 708 - { 709 - "BriefDescription": "Average Parallel L2 cache miss data reads", 710 - "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD", 711 - "MetricGroup": "Memory_BW;Offcore", 712 - "MetricName": "tma_info_data_l2_mlp" 713 - }, 714 - { 715 - "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]", 716 - "MetricExpr": "64 * (UNC_M_CAS_COUNT.RD + UNC_M_CAS_COUNT.WR) / 1e9 / duration_time", 717 - "MetricGroup": "HPC;Mem;MemoryBW;SoC;tma_issueBW", 718 - "MetricName": "tma_info_dram_bw_use", 719 - "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. 
Related metrics: tma_fb_full, tma_info_memory_bandwidth, tma_mem_bandwidth, tma_sq_full" 720 - }, 721 - { 722 - "BriefDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache)", 723 - "MetricExpr": "IDQ.DSB_UOPS / (IDQ.DSB_UOPS + IDQ.MITE_UOPS + IDQ.MS_UOPS)", 724 - "MetricGroup": "DSB;Fed;FetchBW;tma_issueFB", 725 - "MetricName": "tma_info_dsb_coverage", 726 - "MetricThreshold": "tma_info_dsb_coverage < 0.7 & tma_info_ipc / 4 > 0.35", 727 - "PublicDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache). Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_dsb_misses, tma_info_iptb, tma_lcp" 421 + "MetricName": "tma_info_botlnk_l0_core_bound_likely", 422 + "MetricThreshold": "tma_info_botlnk_l0_core_bound_likely > 0.5" 728 423 }, 729 424 { 730 425 "BriefDescription": "Total pipeline cost of DSB (uop cache) misses - subset of the Instruction_Fetch_BW Bottleneck", 731 426 "MetricConstraint": "NO_GROUP_EVENTS", 732 427 "MetricExpr": "100 * (tma_fetch_latency * tma_dsb_switches / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) + tma_fetch_bandwidth * tma_mite / (tma_dsb + tma_mite))", 733 428 "MetricGroup": "DSBmiss;Fed;tma_issueFB", 734 - "MetricName": "tma_info_dsb_misses", 735 - "MetricThreshold": "tma_info_dsb_misses > 10", 736 - "PublicDescription": "Total pipeline cost of DSB (uop cache) misses - subset of the Instruction_Fetch_BW Bottleneck. 
Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_dsb_coverage, tma_info_iptb, tma_lcp" 737 - }, 738 - { 739 - "BriefDescription": "Average number of cycles of a switch from the DSB fetch-unit to MITE fetch unit - see DSB_Switches tree node for details.", 740 - "MetricExpr": "DSB2MITE_SWITCHES.PENALTY_CYCLES / DSB2MITE_SWITCHES.COUNT", 741 - "MetricGroup": "DSBmiss", 742 - "MetricName": "tma_info_dsb_switch_cost" 743 - }, 744 - { 745 - "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-thread", 746 - "MetricExpr": "UOPS_EXECUTED.THREAD / cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@", 747 - "MetricGroup": "Cor;Pipeline;PortsUtil;SMT", 748 - "MetricName": "tma_info_execute" 749 - }, 750 - { 751 - "BriefDescription": "The ratio of Executed- by Issued-Uops", 752 - "MetricExpr": "UOPS_EXECUTED.THREAD / UOPS_ISSUED.ANY", 753 - "MetricGroup": "Cor;Pipeline", 754 - "MetricName": "tma_info_execute_per_issue", 755 - "PublicDescription": "The ratio of Executed- by Issued-Uops. Ratio > 1 suggests high rate of uop micro-fusions. Ratio < 1 suggest high rate of \"execute\" at rename stage." 
756 - }, 757 - { 758 - "BriefDescription": "Fill Buffer (FB) hits per kilo instructions for retired demand loads (L1D misses that merge into ongoing miss-handling entries)", 759 - "MetricExpr": "1e3 * MEM_LOAD_RETIRED.FB_HIT / INST_RETIRED.ANY", 760 - "MetricGroup": "CacheMisses;Mem", 761 - "MetricName": "tma_info_fb_hpki" 762 - }, 763 - { 764 - "BriefDescription": "Average number of Uops issued by front-end when it issued something", 765 - "MetricExpr": "UOPS_ISSUED.ANY / cpu@UOPS_ISSUED.ANY\\,cmask\\=1@", 766 - "MetricGroup": "Fed;FetchBW", 767 - "MetricName": "tma_info_fetch_upc" 768 - }, 769 - { 770 - "BriefDescription": "Floating Point Operations Per Cycle", 771 - "MetricConstraint": "NO_GROUP_EVENTS", 772 - "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * (FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE) + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / tma_info_core_clks", 773 - "MetricGroup": "Flops;Ret", 774 - "MetricName": "tma_info_flopc" 775 - }, 776 - { 777 - "BriefDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width)", 778 - "MetricConstraint": "NO_GROUP_EVENTS", 779 - "MetricExpr": "(cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ + cpu@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE\\,umask\\=0xfc@) / (2 * tma_info_core_clks)", 780 - "MetricGroup": "Cor;Flops;HPC", 781 - "MetricName": "tma_info_fp_arith_utilization", 782 - "PublicDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width). Values > 1 are possible due to ([BDW+] Fused-Multiply Add (FMA) counting - common; [ADL+] use all of ADD/MUL/FMA in Scalar or 128/256-bit vectors - less common)." 
783 - }, 784 - { 785 - "BriefDescription": "Giga Floating Point Operations Per Second", 786 - "MetricConstraint": "NO_GROUP_EVENTS", 787 - "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * (FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE) + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / 1e9 / duration_time", 788 - "MetricGroup": "Cor;Flops;HPC", 789 - "MetricName": "tma_info_gflops", 790 - "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width and AMX engine." 429 + "MetricName": "tma_info_botlnk_l2_dsb_misses", 430 + "MetricThreshold": "tma_info_botlnk_l2_dsb_misses > 10", 431 + "PublicDescription": "Total pipeline cost of DSB (uop cache) misses - subset of the Instruction_Fetch_BW Bottleneck. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb, tma_lcp" 791 432 }, 792 433 { 793 434 "BriefDescription": "Total pipeline cost of Instruction Cache misses - subset of the Big_Code Bottleneck", 794 435 "MetricExpr": "100 * (tma_fetch_latency * tma_icache_misses / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches))", 795 436 "MetricGroup": "Fed;FetchLat;IcMiss;tma_issueFL", 796 - "MetricName": "tma_info_ic_misses", 797 - "MetricThreshold": "tma_info_ic_misses > 5", 437 + "MetricName": "tma_info_botlnk_l2_ic_misses", 438 + "MetricThreshold": "tma_info_botlnk_l2_ic_misses > 5", 798 439 "PublicDescription": "Total pipeline cost of Instruction Cache misses - subset of the Big_Code Bottleneck. 
Related metrics: " 799 440 }, 800 441 { 801 - "BriefDescription": "Average Latency for L1 instruction cache misses", 802 - "MetricExpr": "ICACHE_16B.IFDATA_STALL / cpu@ICACHE_16B.IFDATA_STALL\\,cmask\\=1\\,edge@ + 2", 803 - "MetricGroup": "Fed;FetchLat;IcMiss", 804 - "MetricName": "tma_info_icache_miss_latency" 442 + "BriefDescription": "Total pipeline cost of instruction fetch related bottlenecks by large code footprint programs (i-side cache; TLB and BTB misses)", 443 + "MetricConstraint": "NO_GROUP_EVENTS", 444 + "MetricExpr": "100 * tma_fetch_latency * (tma_itlb_misses + tma_icache_misses + tma_unknown_branches) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)", 445 + "MetricGroup": "BigFoot;Fed;Frontend;IcMiss;MemoryTLB;tma_issueBC", 446 + "MetricName": "tma_info_bottleneck_big_code", 447 + "MetricThreshold": "tma_info_bottleneck_big_code > 20", 448 + "PublicDescription": "Total pipeline cost of instruction fetch related bottlenecks by large code footprint programs (i-side cache; TLB and BTB misses). 
Related metrics: tma_info_bottleneck_branching_overhead" 805 449 }, 806 450 { 807 - "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-core", 808 - "MetricExpr": "UOPS_EXECUTED.THREAD / (UOPS_EXECUTED.CORE_CYCLES_GE_1 / 2 if #SMT_on else UOPS_EXECUTED.CORE_CYCLES_GE_1)", 809 - "MetricGroup": "Backend;Cor;Pipeline;PortsUtil", 810 - "MetricName": "tma_info_ilp" 451 + "BriefDescription": "Total pipeline cost of branch related instructions (used for program control-flow including function calls)", 452 + "MetricExpr": "100 * ((BR_INST_RETIRED.CONDITIONAL + 3 * BR_INST_RETIRED.NEAR_CALL + (BR_INST_RETIRED.NEAR_TAKEN - (BR_INST_RETIRED.CONDITIONAL - BR_INST_RETIRED.NOT_TAKEN) - 2 * BR_INST_RETIRED.NEAR_CALL)) / tma_info_thread_slots)", 453 + "MetricGroup": "Ret;tma_issueBC", 454 + "MetricName": "tma_info_bottleneck_branching_overhead", 455 + "MetricThreshold": "tma_info_bottleneck_branching_overhead > 10", 456 + "PublicDescription": "Total pipeline cost of branch related instructions (used for program control-flow including function calls). 
Related metrics: tma_info_bottleneck_big_code" 811 457 }, 812 458 { 813 459 "BriefDescription": "Total pipeline cost of instruction fetch bandwidth related bottlenecks", 814 460 "MetricConstraint": "NO_GROUP_EVENTS", 815 - "MetricExpr": "100 * (tma_frontend_bound - tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) - tma_info_big_code", 461 + "MetricExpr": "100 * (tma_frontend_bound - tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) - tma_info_bottleneck_big_code", 816 462 "MetricGroup": "Fed;FetchBW;Frontend", 817 - "MetricName": "tma_info_instruction_fetch_bw", 818 - "MetricThreshold": "tma_info_instruction_fetch_bw > 20" 819 - }, 820 - { 821 - "BriefDescription": "Total number of retired Instructions", 822 - "MetricExpr": "INST_RETIRED.ANY", 823 - "MetricGroup": "Summary;TmaL1;tma_L1_group", 824 - "MetricName": "tma_info_instructions", 825 - "PublicDescription": "Total number of retired Instructions. 
Sample with: INST_RETIRED.PREC_DIST" 826 - }, 827 - { 828 - "BriefDescription": "Average IO (network or disk) Bandwidth Use for Reads [GB / sec]", 829 - "MetricExpr": "(UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART0 + UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART1 + UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART2 + UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART3) * 4 / 1e9 / duration_time", 830 - "MetricGroup": "IoBW;Mem;Server;SoC", 831 - "MetricName": "tma_info_io_read_bw" 832 - }, 833 - { 834 - "BriefDescription": "Average IO (network or disk) Bandwidth Use for Writes [GB / sec]", 835 - "MetricExpr": "(UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART0 + UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART1 + UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART2 + UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART3) * 4 / 1e9 / duration_time", 836 - "MetricGroup": "IoBW;Mem;Server;SoC", 837 - "MetricName": "tma_info_io_write_bw" 838 - }, 839 - { 840 - "BriefDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate)", 841 - "MetricConstraint": "NO_GROUP_EVENTS", 842 - "MetricExpr": "INST_RETIRED.ANY / (cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ + cpu@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE\\,umask\\=0xfc@)", 843 - "MetricGroup": "Flops;InsType", 844 - "MetricName": "tma_info_iparith", 845 - "MetricThreshold": "tma_info_iparith < 10", 846 - "PublicDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. Approximated prior to BDW." 
847 - }, 848 - { 849 - "BriefDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate)", 850 - "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE)", 851 - "MetricGroup": "Flops;FpVector;InsType", 852 - "MetricName": "tma_info_iparith_avx128", 853 - "MetricThreshold": "tma_info_iparith_avx128 < 10", 854 - "PublicDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." 855 - }, 856 - { 857 - "BriefDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate)", 858 - "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE)", 859 - "MetricGroup": "Flops;FpVector;InsType", 860 - "MetricName": "tma_info_iparith_avx256", 861 - "MetricThreshold": "tma_info_iparith_avx256 < 10", 862 - "PublicDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." 863 - }, 864 - { 865 - "BriefDescription": "Instructions per FP Arithmetic AVX 512-bit instruction (lower number means higher occurrence rate)", 866 - "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE)", 867 - "MetricGroup": "Flops;FpVector;InsType", 868 - "MetricName": "tma_info_iparith_avx512", 869 - "MetricThreshold": "tma_info_iparith_avx512 < 10", 870 - "PublicDescription": "Instructions per FP Arithmetic AVX 512-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." 
871 - }, 872 - { 873 - "BriefDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate)", 874 - "MetricExpr": "INST_RETIRED.ANY / FP_ARITH_INST_RETIRED.SCALAR_DOUBLE", 875 - "MetricGroup": "Flops;FpScalar;InsType", 876 - "MetricName": "tma_info_iparith_scalar_dp", 877 - "MetricThreshold": "tma_info_iparith_scalar_dp < 10", 878 - "PublicDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." 879 - }, 880 - { 881 - "BriefDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate)", 882 - "MetricExpr": "INST_RETIRED.ANY / FP_ARITH_INST_RETIRED.SCALAR_SINGLE", 883 - "MetricGroup": "Flops;FpScalar;InsType", 884 - "MetricName": "tma_info_iparith_scalar_sp", 885 - "MetricThreshold": "tma_info_iparith_scalar_sp < 10", 886 - "PublicDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." 
887 - }, 888 - { 889 - "BriefDescription": "Instructions per Branch (lower number means higher occurrence rate)", 890 - "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.ALL_BRANCHES", 891 - "MetricGroup": "Branches;Fed;InsType", 892 - "MetricName": "tma_info_ipbranch", 893 - "MetricThreshold": "tma_info_ipbranch < 8" 894 - }, 895 - { 896 - "BriefDescription": "Instructions Per Cycle (per Logical Processor)", 897 - "MetricExpr": "INST_RETIRED.ANY / tma_info_clks", 898 - "MetricGroup": "Ret;Summary", 899 - "MetricName": "tma_info_ipc" 900 - }, 901 - { 902 - "BriefDescription": "Instructions per (near) call (lower number means higher occurrence rate)", 903 - "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.NEAR_CALL", 904 - "MetricGroup": "Branches;Fed;PGO", 905 - "MetricName": "tma_info_ipcall", 906 - "MetricThreshold": "tma_info_ipcall < 200" 907 - }, 908 - { 909 - "BriefDescription": "Instructions per non-speculative DSB miss (lower number means higher occurrence rate)", 910 - "MetricExpr": "INST_RETIRED.ANY / FRONTEND_RETIRED.ANY_DSB_MISS", 911 - "MetricGroup": "DSBmiss;Fed", 912 - "MetricName": "tma_info_ipdsb_miss_ret", 913 - "MetricThreshold": "tma_info_ipdsb_miss_ret < 50" 914 - }, 915 - { 916 - "BriefDescription": "Instructions per Far Branch ( Far Branches apply upon transition from application to operating system, handling interrupts, exceptions) [lower number means higher occurrence rate]", 917 - "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.FAR_BRANCH:u", 918 - "MetricGroup": "Branches;OS", 919 - "MetricName": "tma_info_ipfarbranch", 920 - "MetricThreshold": "tma_info_ipfarbranch < 1e6" 921 - }, 922 - { 923 - "BriefDescription": "Instructions per Floating Point (FP) Operation (lower number means higher occurrence rate)", 924 - "MetricConstraint": "NO_GROUP_EVENTS", 925 - "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * 
(FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * (FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE) + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE)", 926 - "MetricGroup": "Flops;InsType", 927 - "MetricName": "tma_info_ipflop", 928 - "MetricThreshold": "tma_info_ipflop < 10" 929 - }, 930 - { 931 - "BriefDescription": "Instructions per Load (lower number means higher occurrence rate)", 932 - "MetricExpr": "INST_RETIRED.ANY / MEM_INST_RETIRED.ALL_LOADS", 933 - "MetricGroup": "InsType", 934 - "MetricName": "tma_info_ipload", 935 - "MetricThreshold": "tma_info_ipload < 3" 936 - }, 937 - { 938 - "BriefDescription": "Instructions per retired mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate).", 939 - "MetricExpr": "tma_info_instructions / (UOPS_RETIRED.RETIRE_SLOTS / UOPS_ISSUED.ANY * cpu@BR_MISP_EXEC.ALL_BRANCHES\\,umask\\=0xE4@)", 940 - "MetricGroup": "Bad;BrMispredicts", 941 - "MetricName": "tma_info_ipmisp_indirect", 942 - "MetricThreshold": "tma_info_ipmisp_indirect < 1e3" 943 - }, 944 - { 945 - "BriefDescription": "Number of Instructions per non-speculative Branch Misprediction (JEClear) (lower number means higher occurrence rate)", 946 - "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.ALL_BRANCHES", 947 - "MetricGroup": "Bad;BadSpec;BrMispredicts", 948 - "MetricName": "tma_info_ipmispredict", 949 - "MetricThreshold": "tma_info_ipmispredict < 200" 950 - }, 951 - { 952 - "BriefDescription": "Instructions per Store (lower number means higher occurrence rate)", 953 - "MetricExpr": "INST_RETIRED.ANY / MEM_INST_RETIRED.ALL_STORES", 954 - "MetricGroup": "InsType", 955 - "MetricName": "tma_info_ipstore", 956 - "MetricThreshold": "tma_info_ipstore < 8" 957 - }, 958 - { 959 - "BriefDescription": "Instructions per Software prefetch instruction (of any type: NTA/T0/T1/T2/Prefetch) (lower number means higher occurrence rate)", 960 - "MetricExpr": 
"INST_RETIRED.ANY / cpu@SW_PREFETCH_ACCESS.T0\\,umask\\=0xF@", 961 - "MetricGroup": "Prefetches", 962 - "MetricName": "tma_info_ipswpf", 963 - "MetricThreshold": "tma_info_ipswpf < 100" 964 - }, 965 - { 966 - "BriefDescription": "Instruction per taken branch", 967 - "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.NEAR_TAKEN", 968 - "MetricGroup": "Branches;Fed;FetchBW;Frontend;PGO;tma_issueFB", 969 - "MetricName": "tma_info_iptb", 970 - "MetricThreshold": "tma_info_iptb < 9", 971 - "PublicDescription": "Instruction per taken branch. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_dsb_coverage, tma_info_dsb_misses, tma_lcp" 972 - }, 973 - { 974 - "BriefDescription": "Instructions per speculative Unknown Branch Misprediction (BAClear) (lower number means higher occurrence rate)", 975 - "MetricExpr": "tma_info_instructions / BACLEARS.ANY", 976 - "MetricGroup": "Fed", 977 - "MetricName": "tma_info_ipunknown_branch" 978 - }, 979 - { 980 - "BriefDescription": "Fraction of branches that are unconditional (direct or indirect) jumps", 981 - "MetricConstraint": "NO_GROUP_EVENTS", 982 - "MetricExpr": "(BR_INST_RETIRED.NEAR_TAKEN - (BR_INST_RETIRED.CONDITIONAL - BR_INST_RETIRED.NOT_TAKEN) - 2 * BR_INST_RETIRED.NEAR_CALL) / BR_INST_RETIRED.ALL_BRANCHES", 983 - "MetricGroup": "Bad;Branches", 984 - "MetricName": "tma_info_jump" 985 - }, 986 - { 987 - "BriefDescription": "Cycles Per Instruction for the Operating System (OS) Kernel mode", 988 - "MetricExpr": "CPU_CLK_UNHALTED.THREAD_P:k / INST_RETIRED.ANY_P:k", 989 - "MetricGroup": "OS", 990 - "MetricName": "tma_info_kernel_cpi" 991 - }, 992 - { 993 - "BriefDescription": "Fraction of cycles spent in the Operating System (OS) Kernel mode", 994 - "MetricExpr": "CPU_CLK_UNHALTED.THREAD_P:k / CPU_CLK_UNHALTED.THREAD", 995 - "MetricGroup": "OS", 996 - "MetricName": "tma_info_kernel_utilization", 997 - "MetricThreshold": "tma_info_kernel_utilization > 0.05" 998 - }, 999 - { 1000 - "BriefDescription": "Average per-core 
data fill bandwidth to the L1 data cache [GB / sec]", 1001 - "MetricExpr": "64 * L1D.REPLACEMENT / 1e9 / duration_time", 1002 - "MetricGroup": "Mem;MemoryBW", 1003 - "MetricName": "tma_info_l1d_cache_fill_bw" 1004 - }, 1005 - { 1006 - "BriefDescription": "Average per-thread data fill bandwidth to the L1 data cache [GB / sec]", 1007 - "MetricExpr": "tma_info_l1d_cache_fill_bw", 1008 - "MetricGroup": "Mem;MemoryBW", 1009 - "MetricName": "tma_info_l1d_cache_fill_bw_1t" 1010 - }, 1011 - { 1012 - "BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads", 1013 - "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L1_MISS / INST_RETIRED.ANY", 1014 - "MetricGroup": "CacheMisses;Mem", 1015 - "MetricName": "tma_info_l1mpki" 1016 - }, 1017 - { 1018 - "BriefDescription": "L1 cache true misses per kilo instruction for all demand loads (including speculative)", 1019 - "MetricExpr": "1e3 * L2_RQSTS.ALL_DEMAND_DATA_RD / INST_RETIRED.ANY", 1020 - "MetricGroup": "CacheMisses;Mem", 1021 - "MetricName": "tma_info_l1mpki_load" 1022 - }, 1023 - { 1024 - "BriefDescription": "Average per-core data fill bandwidth to the L2 cache [GB / sec]", 1025 - "MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / duration_time", 1026 - "MetricGroup": "Mem;MemoryBW", 1027 - "MetricName": "tma_info_l2_cache_fill_bw" 1028 - }, 1029 - { 1030 - "BriefDescription": "Average per-thread data fill bandwidth to the L2 cache [GB / sec]", 1031 - "MetricExpr": "tma_info_l2_cache_fill_bw", 1032 - "MetricGroup": "Mem;MemoryBW", 1033 - "MetricName": "tma_info_l2_cache_fill_bw_1t" 1034 - }, 1035 - { 1036 - "BriefDescription": "Rate of non silent evictions from the L2 cache per Kilo instruction", 1037 - "MetricExpr": "1e3 * L2_LINES_OUT.NON_SILENT / tma_info_instructions", 1038 - "MetricGroup": "L2Evicts;Mem;Server", 1039 - "MetricName": "tma_info_l2_evictions_nonsilent_pki" 1040 - }, 1041 - { 1042 - "BriefDescription": "Rate of silent evictions from the L2 cache per Kilo instruction where the evicted lines are 
dropped (no writeback to L3 or memory)", 1043 - "MetricExpr": "1e3 * L2_LINES_OUT.SILENT / tma_info_instructions", 1044 - "MetricGroup": "L2Evicts;Mem;Server", 1045 - "MetricName": "tma_info_l2_evictions_silent_pki" 1046 - }, 1047 - { 1048 - "BriefDescription": "L2 cache hits per kilo instruction for all request types (including speculative)", 1049 - "MetricExpr": "1e3 * (L2_RQSTS.REFERENCES - L2_RQSTS.MISS) / INST_RETIRED.ANY", 1050 - "MetricGroup": "CacheMisses;Mem", 1051 - "MetricName": "tma_info_l2hpki_all" 1052 - }, 1053 - { 1054 - "BriefDescription": "L2 cache hits per kilo instruction for all demand loads (including speculative)", 1055 - "MetricExpr": "1e3 * L2_RQSTS.DEMAND_DATA_RD_HIT / INST_RETIRED.ANY", 1056 - "MetricGroup": "CacheMisses;Mem", 1057 - "MetricName": "tma_info_l2hpki_load" 1058 - }, 1059 - { 1060 - "BriefDescription": "L2 cache true misses per kilo instruction for retired demand loads", 1061 - "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L2_MISS / INST_RETIRED.ANY", 1062 - "MetricGroup": "Backend;CacheMisses;Mem", 1063 - "MetricName": "tma_info_l2mpki" 1064 - }, 1065 - { 1066 - "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all request types (including speculative)", 1067 - "MetricExpr": "1e3 * L2_RQSTS.MISS / INST_RETIRED.ANY", 1068 - "MetricGroup": "CacheMisses;Mem;Offcore", 1069 - "MetricName": "tma_info_l2mpki_all" 1070 - }, 1071 - { 1072 - "BriefDescription": "L2 cache true code cacheline misses per kilo instruction", 1073 - "MetricExpr": "1e3 * FRONTEND_RETIRED.L2_MISS / INST_RETIRED.ANY", 1074 - "MetricGroup": "IcMiss", 1075 - "MetricName": "tma_info_l2mpki_code" 1076 - }, 1077 - { 1078 - "BriefDescription": "L2 cache speculative code cacheline misses per kilo instruction", 1079 - "MetricExpr": "1e3 * L2_RQSTS.CODE_RD_MISS / INST_RETIRED.ANY", 1080 - "MetricGroup": "IcMiss", 1081 - "MetricName": "tma_info_l2mpki_code_all" 1082 - }, 1083 - { 1084 - "BriefDescription": "L2 cache ([RKL+] true) misses per kilo 
instruction for all demand loads (including speculative)", 1085 - "MetricExpr": "1e3 * L2_RQSTS.DEMAND_DATA_RD_MISS / INST_RETIRED.ANY", 1086 - "MetricGroup": "CacheMisses;Mem", 1087 - "MetricName": "tma_info_l2mpki_load" 1088 - }, 1089 - { 1090 - "BriefDescription": "Average per-core data access bandwidth to the L3 cache [GB / sec]", 1091 - "MetricExpr": "64 * OFFCORE_REQUESTS.ALL_REQUESTS / 1e9 / duration_time", 1092 - "MetricGroup": "Mem;MemoryBW;Offcore", 1093 - "MetricName": "tma_info_l3_cache_access_bw" 1094 - }, 1095 - { 1096 - "BriefDescription": "Average per-thread data access bandwidth to the L3 cache [GB / sec]", 1097 - "MetricExpr": "tma_info_l3_cache_access_bw", 1098 - "MetricGroup": "Mem;MemoryBW;Offcore", 1099 - "MetricName": "tma_info_l3_cache_access_bw_1t" 1100 - }, 1101 - { 1102 - "BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]", 1103 - "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / duration_time", 1104 - "MetricGroup": "Mem;MemoryBW", 1105 - "MetricName": "tma_info_l3_cache_fill_bw" 1106 - }, 1107 - { 1108 - "BriefDescription": "Average per-thread data fill bandwidth to the L3 cache [GB / sec]", 1109 - "MetricExpr": "tma_info_l3_cache_fill_bw", 1110 - "MetricGroup": "Mem;MemoryBW", 1111 - "MetricName": "tma_info_l3_cache_fill_bw_1t" 1112 - }, 1113 - { 1114 - "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads", 1115 - "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L3_MISS / INST_RETIRED.ANY", 1116 - "MetricGroup": "CacheMisses;Mem", 1117 - "MetricName": "tma_info_l3mpki" 1118 - }, 1119 - { 1120 - "BriefDescription": "Average Latency for L2 cache miss demand Loads", 1121 - "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS.DEMAND_DATA_RD", 1122 - "MetricGroup": "Memory_Lat;Offcore", 1123 - "MetricName": "tma_info_load_l2_miss_latency" 1124 - }, 1125 - { 1126 - "BriefDescription": "Average Parallel L2 cache miss demand Loads", 1127 - "MetricExpr": 
"OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_DATA_RD", 1128 - "MetricGroup": "Memory_BW;Offcore", 1129 - "MetricName": "tma_info_load_l2_mlp" 1130 - }, 1131 - { 1132 - "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)", 1133 - "MetricExpr": "L1D_PEND_MISS.PENDING / (MEM_LOAD_RETIRED.L1_MISS + MEM_LOAD_RETIRED.FB_HIT)", 1134 - "MetricGroup": "Mem;MemoryBound;MemoryLat", 1135 - "MetricName": "tma_info_load_miss_real_latency" 1136 - }, 1137 - { 1138 - "BriefDescription": "STLB (2nd level TLB) data load speculative misses per kilo instruction (misses of any page-size that complete the page walk)", 1139 - "MetricExpr": "1e3 * DTLB_LOAD_MISSES.WALK_COMPLETED / INST_RETIRED.ANY", 1140 - "MetricGroup": "Mem;MemoryTLB", 1141 - "MetricName": "tma_info_load_stlb_mpki" 1142 - }, 1143 - { 1144 - "BriefDescription": "Average latency of data read request to external DRAM memory [in nanoseconds]", 1145 - "MetricExpr": "1e9 * (UNC_M_RPQ_OCCUPANCY / UNC_M_RPQ_INSERTS) / imc_0@event\\=0x0@", 1146 - "MetricGroup": "Mem;MemoryLat;Server;SoC", 1147 - "MetricName": "tma_info_mem_dram_read_latency", 1148 - "PublicDescription": "Average latency of data read request to external DRAM memory [in nanoseconds]. Accounts for demand loads and L1/L2 data-read prefetches" 1149 - }, 1150 - { 1151 - "BriefDescription": "Average number of parallel data read requests to external memory", 1152 - "MetricExpr": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD / UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD@thresh\\=1@", 1153 - "MetricGroup": "Mem;MemoryBW;SoC", 1154 - "MetricName": "tma_info_mem_parallel_reads", 1155 - "PublicDescription": "Average number of parallel data read requests to external memory. 
Accounts for demand loads and L1/L2 prefetches" 1156 - }, 1157 - { 1158 - "BriefDescription": "Average latency of data read request to external memory (in nanoseconds)", 1159 - "MetricExpr": "1e9 * (UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD / UNC_CHA_TOR_INSERTS.IA_MISS_DRD) / (tma_info_socket_clks / duration_time)", 1160 - "MetricGroup": "Mem;MemoryLat;SoC", 1161 - "MetricName": "tma_info_mem_read_latency", 1162 - "PublicDescription": "Average latency of data read request to external memory (in nanoseconds). Accounts for demand loads and L1/L2 prefetches. ([RKL+]memory-controller only)" 463 + "MetricName": "tma_info_bottleneck_instruction_fetch_bw", 464 + "MetricThreshold": "tma_info_bottleneck_instruction_fetch_bw > 20" 1163 465 }, 1164 466 { 1165 467 "BriefDescription": "Total pipeline cost of (external) Memory Bandwidth related bottlenecks", 1166 468 "MetricConstraint": "NO_GROUP_EVENTS", 1167 469 "MetricExpr": "100 * tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_mem_bandwidth / (tma_mem_bandwidth + tma_mem_latency)) + tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_sq_full / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full))) + tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_fb_full / (tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk))", 1168 470 "MetricGroup": "Mem;MemoryBW;Offcore;tma_issueBW", 1169 - "MetricName": "tma_info_memory_bandwidth", 1170 - "MetricThreshold": "tma_info_memory_bandwidth > 20", 1171 - "PublicDescription": "Total pipeline cost of (external) Memory Bandwidth related bottlenecks. 
Related metrics: tma_fb_full, tma_info_dram_bw_use, tma_mem_bandwidth, tma_sq_full" 471 + "MetricName": "tma_info_bottleneck_memory_bandwidth", 472 + "MetricThreshold": "tma_info_bottleneck_memory_bandwidth > 20", 473 + "PublicDescription": "Total pipeline cost of (external) Memory Bandwidth related bottlenecks. Related metrics: tma_fb_full, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full" 1172 474 }, 1173 475 { 1174 476 "BriefDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs)", 1175 477 "MetricConstraint": "NO_GROUP_EVENTS", 1176 478 "MetricExpr": "100 * tma_memory_bound * (tma_l1_bound / max(tma_memory_bound, tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_dtlb_load / max(tma_l1_bound, tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_dtlb_store / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency)))", 1177 479 "MetricGroup": "Mem;MemoryTLB;Offcore;tma_issueTLB", 1178 - "MetricName": "tma_info_memory_data_tlbs", 1179 - "MetricThreshold": "tma_info_memory_data_tlbs > 20", 480 + "MetricName": "tma_info_bottleneck_memory_data_tlbs", 481 + "MetricThreshold": "tma_info_bottleneck_memory_data_tlbs > 20", 1180 482 "PublicDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs). 
Related metrics: tma_dtlb_load, tma_dtlb_store" 1181 483 }, 1182 484 { ··· 701 969 "MetricConstraint": "NO_GROUP_EVENTS", 702 970 "MetricExpr": "100 * tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency)) + tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_l3_hit_latency / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_l2_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound))", 703 971 "MetricGroup": "Mem;MemoryLat;Offcore;tma_issueLat", 704 - "MetricName": "tma_info_memory_latency", 705 - "MetricThreshold": "tma_info_memory_latency > 20", 972 + "MetricName": "tma_info_bottleneck_memory_latency", 973 + "MetricThreshold": "tma_info_bottleneck_memory_latency > 20", 706 974 "PublicDescription": "Total pipeline cost of Memory Latency related bottlenecks (external memory and off-core caches). Related metrics: tma_l3_hit_latency, tma_mem_latency" 707 975 }, 708 976 { ··· 710 978 "MetricConstraint": "NO_GROUP_EVENTS", 711 979 "MetricExpr": "100 * (tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches))", 712 980 "MetricGroup": "Bad;BadSpec;BrMispredicts;tma_issueBM", 713 - "MetricName": "tma_info_mispredictions", 714 - "MetricThreshold": "tma_info_mispredictions > 20", 715 - "PublicDescription": "Total pipeline cost of Branch Misprediction related bottlenecks. Related metrics: tma_branch_mispredicts, tma_info_branch_misprediction_cost, tma_mispredicts_resteers" 981 + "MetricName": "tma_info_bottleneck_mispredictions", 982 + "MetricThreshold": "tma_info_bottleneck_mispredictions > 20", 983 + "PublicDescription": "Total pipeline cost of Branch Misprediction related bottlenecks. 
Related metrics: tma_branch_mispredicts, tma_info_bad_spec_branch_misprediction_cost, tma_mispredicts_resteers" 984 + }, 985 + { 986 + "BriefDescription": "Fraction of branches that are CALL or RET", 987 + "MetricExpr": "(BR_INST_RETIRED.NEAR_CALL + BR_INST_RETIRED.NEAR_RETURN) / BR_INST_RETIRED.ALL_BRANCHES", 988 + "MetricGroup": "Bad;Branches", 989 + "MetricName": "tma_info_branches_callret" 990 + }, 991 + { 992 + "BriefDescription": "Fraction of branches that are non-taken conditionals", 993 + "MetricExpr": "BR_INST_RETIRED.NOT_TAKEN / BR_INST_RETIRED.ALL_BRANCHES", 994 + "MetricGroup": "Bad;Branches;CodeGen;PGO", 995 + "MetricName": "tma_info_branches_cond_nt" 996 + }, 997 + { 998 + "BriefDescription": "Fraction of branches that are taken conditionals", 999 + "MetricExpr": "(BR_INST_RETIRED.CONDITIONAL - BR_INST_RETIRED.NOT_TAKEN) / BR_INST_RETIRED.ALL_BRANCHES", 1000 + "MetricGroup": "Bad;Branches;CodeGen;PGO", 1001 + "MetricName": "tma_info_branches_cond_tk" 1002 + }, 1003 + { 1004 + "BriefDescription": "Fraction of branches that are unconditional (direct or indirect) jumps", 1005 + "MetricConstraint": "NO_GROUP_EVENTS", 1006 + "MetricExpr": "(BR_INST_RETIRED.NEAR_TAKEN - (BR_INST_RETIRED.CONDITIONAL - BR_INST_RETIRED.NOT_TAKEN) - 2 * BR_INST_RETIRED.NEAR_CALL) / BR_INST_RETIRED.ALL_BRANCHES", 1007 + "MetricGroup": "Bad;Branches", 1008 + "MetricName": "tma_info_branches_jump" 1009 + }, 1010 + { 1011 + "BriefDescription": "Core actual clocks when any Logical Processor is active on the Physical Core", 1012 + "MetricExpr": "(CPU_CLK_UNHALTED.THREAD / 2 * (1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK) if #core_wide < 1 else (CPU_CLK_UNHALTED.THREAD_ANY / 2 if #SMT_on else tma_info_thread_clks))", 1013 + "MetricGroup": "SMT", 1014 + "MetricName": "tma_info_core_core_clks" 1015 + }, 1016 + { 1017 + "BriefDescription": "Instructions Per Cycle across hyper-threads (per physical core)", 1018 + "MetricExpr": "INST_RETIRED.ANY / 
tma_info_core_core_clks", 1019 + "MetricGroup": "Ret;SMT;TmaL1;tma_L1_group", 1020 + "MetricName": "tma_info_core_coreipc" 1021 + }, 1022 + { 1023 + "BriefDescription": "Floating Point Operations Per Cycle", 1024 + "MetricConstraint": "NO_GROUP_EVENTS", 1025 + "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * (FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE) + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / tma_info_core_core_clks", 1026 + "MetricGroup": "Flops;Ret", 1027 + "MetricName": "tma_info_core_flopc" 1028 + }, 1029 + { 1030 + "BriefDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width)", 1031 + "MetricExpr": "(cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ + cpu@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE\\,umask\\=0xfc@) / (2 * tma_info_core_core_clks)", 1032 + "MetricGroup": "Cor;Flops;HPC", 1033 + "MetricName": "tma_info_core_fp_arith_utilization", 1034 + "PublicDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width). Values > 1 are possible due to ([BDW+] Fused-Multiply Add (FMA) counting - common; [ADL+] use all of ADD/MUL/FMA in Scalar or 128/256-bit vectors - less common)." 
1035 + }, 1036 + { 1037 + "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-core", 1038 + "MetricExpr": "UOPS_EXECUTED.THREAD / (UOPS_EXECUTED.CORE_CYCLES_GE_1 / 2 if #SMT_on else UOPS_EXECUTED.CORE_CYCLES_GE_1)", 1039 + "MetricGroup": "Backend;Cor;Pipeline;PortsUtil", 1040 + "MetricName": "tma_info_core_ilp" 1041 + }, 1042 + { 1043 + "BriefDescription": "Number of Instructions per non-speculative Branch Misprediction (JEClear)", 1044 + "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.ALL_BRANCHES", 1045 + "MetricGroup": "Bad;BadSpec;BrMispredicts;TopdownL1;tma_L1_group", 1046 + "MetricName": "tma_info_core_ipmispredict", 1047 + "MetricgroupNoGroup": "TopdownL1" 1048 + }, 1049 + { 1050 + "BriefDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache)", 1051 + "MetricExpr": "IDQ.DSB_UOPS / (IDQ.DSB_UOPS + IDQ.MITE_UOPS + IDQ.MS_UOPS)", 1052 + "MetricGroup": "DSB;Fed;FetchBW;tma_issueFB", 1053 + "MetricName": "tma_info_frontend_dsb_coverage", 1054 + "MetricThreshold": "tma_info_frontend_dsb_coverage < 0.7 & tma_info_thread_ipc / 4 > 0.35", 1055 + "PublicDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache). 
Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_botlnk_l2_dsb_misses, tma_info_inst_mix_iptb, tma_lcp" 1056 + }, 1057 + { 1058 + "BriefDescription": "Average number of cycles of a switch from the DSB fetch-unit to MITE fetch unit - see DSB_Switches tree node for details.", 1059 + "MetricExpr": "DSB2MITE_SWITCHES.PENALTY_CYCLES / DSB2MITE_SWITCHES.COUNT", 1060 + "MetricGroup": "DSBmiss", 1061 + "MetricName": "tma_info_frontend_dsb_switch_cost" 1062 + }, 1063 + { 1064 + "BriefDescription": "Average number of Uops issued by front-end when it issued something", 1065 + "MetricExpr": "UOPS_ISSUED.ANY / cpu@UOPS_ISSUED.ANY\\,cmask\\=1@", 1066 + "MetricGroup": "Fed;FetchBW", 1067 + "MetricName": "tma_info_frontend_fetch_upc" 1068 + }, 1069 + { 1070 + "BriefDescription": "Average Latency for L1 instruction cache misses", 1071 + "MetricExpr": "ICACHE_16B.IFDATA_STALL / cpu@ICACHE_16B.IFDATA_STALL\\,cmask\\=1\\,edge@ + 2", 1072 + "MetricGroup": "Fed;FetchLat;IcMiss", 1073 + "MetricName": "tma_info_frontend_icache_miss_latency" 1074 + }, 1075 + { 1076 + "BriefDescription": "Instructions per non-speculative DSB miss (lower number means higher occurrence rate)", 1077 + "MetricExpr": "INST_RETIRED.ANY / FRONTEND_RETIRED.ANY_DSB_MISS", 1078 + "MetricGroup": "DSBmiss;Fed", 1079 + "MetricName": "tma_info_frontend_ipdsb_miss_ret", 1080 + "MetricThreshold": "tma_info_frontend_ipdsb_miss_ret < 50" 1081 + }, 1082 + { 1083 + "BriefDescription": "Instructions per speculative Unknown Branch Misprediction (BAClear) (lower number means higher occurrence rate)", 1084 + "MetricExpr": "tma_info_inst_mix_instructions / BACLEARS.ANY", 1085 + "MetricGroup": "Fed", 1086 + "MetricName": "tma_info_frontend_ipunknown_branch" 1087 + }, 1088 + { 1089 + "BriefDescription": "L2 cache true code cacheline misses per kilo instruction", 1090 + "MetricExpr": "1e3 * FRONTEND_RETIRED.L2_MISS / INST_RETIRED.ANY", 1091 + "MetricGroup": "IcMiss", 1092 + "MetricName": "tma_info_frontend_l2mpki_code" 
1093 + }, 1094 + { 1095 + "BriefDescription": "L2 cache speculative code cacheline misses per kilo instruction", 1096 + "MetricExpr": "1e3 * L2_RQSTS.CODE_RD_MISS / INST_RETIRED.ANY", 1097 + "MetricGroup": "IcMiss", 1098 + "MetricName": "tma_info_frontend_l2mpki_code_all" 1099 + }, 1100 + { 1101 + "BriefDescription": "Branch instructions per taken branch.", 1102 + "MetricExpr": "BR_INST_RETIRED.ALL_BRANCHES / BR_INST_RETIRED.NEAR_TAKEN", 1103 + "MetricGroup": "Branches;Fed;PGO", 1104 + "MetricName": "tma_info_inst_mix_bptkbranch" 1105 + }, 1106 + { 1107 + "BriefDescription": "Total number of retired Instructions", 1108 + "MetricExpr": "INST_RETIRED.ANY", 1109 + "MetricGroup": "Summary;TmaL1;tma_L1_group", 1110 + "MetricName": "tma_info_inst_mix_instructions", 1111 + "PublicDescription": "Total number of retired Instructions. Sample with: INST_RETIRED.PREC_DIST" 1112 + }, 1113 + { 1114 + "BriefDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate)", 1115 + "MetricConstraint": "NO_GROUP_EVENTS", 1116 + "MetricExpr": "INST_RETIRED.ANY / (cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ + cpu@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE\\,umask\\=0xfc@)", 1117 + "MetricGroup": "Flops;InsType", 1118 + "MetricName": "tma_info_inst_mix_iparith", 1119 + "MetricThreshold": "tma_info_inst_mix_iparith < 10", 1120 + "PublicDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. Approximated prior to BDW." 
1121 + }, 1122 + { 1123 + "BriefDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate)", 1124 + "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE)", 1125 + "MetricGroup": "Flops;FpVector;InsType", 1126 + "MetricName": "tma_info_inst_mix_iparith_avx128", 1127 + "MetricThreshold": "tma_info_inst_mix_iparith_avx128 < 10", 1128 + "PublicDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." 1129 + }, 1130 + { 1131 + "BriefDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate)", 1132 + "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE)", 1133 + "MetricGroup": "Flops;FpVector;InsType", 1134 + "MetricName": "tma_info_inst_mix_iparith_avx256", 1135 + "MetricThreshold": "tma_info_inst_mix_iparith_avx256 < 10", 1136 + "PublicDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." 1137 + }, 1138 + { 1139 + "BriefDescription": "Instructions per FP Arithmetic AVX 512-bit instruction (lower number means higher occurrence rate)", 1140 + "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE)", 1141 + "MetricGroup": "Flops;FpVector;InsType", 1142 + "MetricName": "tma_info_inst_mix_iparith_avx512", 1143 + "MetricThreshold": "tma_info_inst_mix_iparith_avx512 < 10", 1144 + "PublicDescription": "Instructions per FP Arithmetic AVX 512-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." 
1145 + }, 1146 + { 1147 + "BriefDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate)", 1148 + "MetricExpr": "INST_RETIRED.ANY / FP_ARITH_INST_RETIRED.SCALAR_DOUBLE", 1149 + "MetricGroup": "Flops;FpScalar;InsType", 1150 + "MetricName": "tma_info_inst_mix_iparith_scalar_dp", 1151 + "MetricThreshold": "tma_info_inst_mix_iparith_scalar_dp < 10", 1152 + "PublicDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." 1153 + }, 1154 + { 1155 + "BriefDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate)", 1156 + "MetricExpr": "INST_RETIRED.ANY / FP_ARITH_INST_RETIRED.SCALAR_SINGLE", 1157 + "MetricGroup": "Flops;FpScalar;InsType", 1158 + "MetricName": "tma_info_inst_mix_iparith_scalar_sp", 1159 + "MetricThreshold": "tma_info_inst_mix_iparith_scalar_sp < 10", 1160 + "PublicDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting." 
1161 + }, 1162 + { 1163 + "BriefDescription": "Instructions per Branch (lower number means higher occurrence rate)", 1164 + "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.ALL_BRANCHES", 1165 + "MetricGroup": "Branches;Fed;InsType", 1166 + "MetricName": "tma_info_inst_mix_ipbranch", 1167 + "MetricThreshold": "tma_info_inst_mix_ipbranch < 8" 1168 + }, 1169 + { 1170 + "BriefDescription": "Instructions per (near) call (lower number means higher occurrence rate)", 1171 + "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.NEAR_CALL", 1172 + "MetricGroup": "Branches;Fed;PGO", 1173 + "MetricName": "tma_info_inst_mix_ipcall", 1174 + "MetricThreshold": "tma_info_inst_mix_ipcall < 200" 1175 + }, 1176 + { 1177 + "BriefDescription": "Instructions per Floating Point (FP) Operation (lower number means higher occurrence rate)", 1178 + "MetricConstraint": "NO_GROUP_EVENTS", 1179 + "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * (FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE) + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE)", 1180 + "MetricGroup": "Flops;InsType", 1181 + "MetricName": "tma_info_inst_mix_ipflop", 1182 + "MetricThreshold": "tma_info_inst_mix_ipflop < 10" 1183 + }, 1184 + { 1185 + "BriefDescription": "Instructions per Load (lower number means higher occurrence rate)", 1186 + "MetricExpr": "INST_RETIRED.ANY / MEM_INST_RETIRED.ALL_LOADS", 1187 + "MetricGroup": "InsType", 1188 + "MetricName": "tma_info_inst_mix_ipload", 1189 + "MetricThreshold": "tma_info_inst_mix_ipload < 3" 1190 + }, 1191 + { 1192 + "BriefDescription": "Instructions per Store (lower number means higher occurrence rate)", 1193 + "MetricExpr": "INST_RETIRED.ANY / MEM_INST_RETIRED.ALL_STORES", 1194 + "MetricGroup": "InsType", 1195 + "MetricName": 
"tma_info_inst_mix_ipstore", 1196 + "MetricThreshold": "tma_info_inst_mix_ipstore < 8" 1197 + }, 1198 + { 1199 + "BriefDescription": "Instructions per Software prefetch instruction (of any type: NTA/T0/T1/T2/Prefetch) (lower number means higher occurrence rate)", 1200 + "MetricExpr": "INST_RETIRED.ANY / cpu@SW_PREFETCH_ACCESS.T0\\,umask\\=0xF@", 1201 + "MetricGroup": "Prefetches", 1202 + "MetricName": "tma_info_inst_mix_ipswpf", 1203 + "MetricThreshold": "tma_info_inst_mix_ipswpf < 100" 1204 + }, 1205 + { 1206 + "BriefDescription": "Instruction per taken branch", 1207 + "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.NEAR_TAKEN", 1208 + "MetricGroup": "Branches;Fed;FetchBW;Frontend;PGO;tma_issueFB", 1209 + "MetricName": "tma_info_inst_mix_iptb", 1210 + "MetricThreshold": "tma_info_inst_mix_iptb < 9", 1211 + "PublicDescription": "Instruction per taken branch. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_botlnk_l2_dsb_misses, tma_info_frontend_dsb_coverage, tma_lcp" 1212 + }, 1213 + { 1214 + "BriefDescription": "Average per-core data fill bandwidth to the L1 data cache [GB / sec]", 1215 + "MetricExpr": "64 * L1D.REPLACEMENT / 1e9 / duration_time", 1216 + "MetricGroup": "Mem;MemoryBW", 1217 + "MetricName": "tma_info_memory_core_l1d_cache_fill_bw" 1218 + }, 1219 + { 1220 + "BriefDescription": "Average per-core data fill bandwidth to the L2 cache [GB / sec]", 1221 + "MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / duration_time", 1222 + "MetricGroup": "Mem;MemoryBW", 1223 + "MetricName": "tma_info_memory_core_l2_cache_fill_bw" 1224 + }, 1225 + { 1226 + "BriefDescription": "Rate of non silent evictions from the L2 cache per Kilo instruction", 1227 + "MetricExpr": "1e3 * L2_LINES_OUT.NON_SILENT / tma_info_inst_mix_instructions", 1228 + "MetricGroup": "L2Evicts;Mem;Server", 1229 + "MetricName": "tma_info_memory_core_l2_evictions_nonsilent_pki" 1230 + }, 1231 + { 1232 + "BriefDescription": "Rate of silent evictions from the L2 cache per Kilo instruction 
where the evicted lines are dropped (no writeback to L3 or memory)", 1233 + "MetricExpr": "1e3 * L2_LINES_OUT.SILENT / tma_info_inst_mix_instructions", 1234 + "MetricGroup": "L2Evicts;Mem;Server", 1235 + "MetricName": "tma_info_memory_core_l2_evictions_silent_pki" 1236 + }, 1237 + { 1238 + "BriefDescription": "Average per-core data access bandwidth to the L3 cache [GB / sec]", 1239 + "MetricExpr": "64 * OFFCORE_REQUESTS.ALL_REQUESTS / 1e9 / duration_time", 1240 + "MetricGroup": "Mem;MemoryBW;Offcore", 1241 + "MetricName": "tma_info_memory_core_l3_cache_access_bw" 1242 + }, 1243 + { 1244 + "BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]", 1245 + "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / duration_time", 1246 + "MetricGroup": "Mem;MemoryBW", 1247 + "MetricName": "tma_info_memory_core_l3_cache_fill_bw" 1248 + }, 1249 + { 1250 + "BriefDescription": "Fill Buffer (FB) hits per kilo instructions for retired demand loads (L1D misses that merge into ongoing miss-handling entries)", 1251 + "MetricExpr": "1e3 * MEM_LOAD_RETIRED.FB_HIT / INST_RETIRED.ANY", 1252 + "MetricGroup": "CacheMisses;Mem", 1253 + "MetricName": "tma_info_memory_fb_hpki" 1254 + }, 1255 + { 1256 + "BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads", 1257 + "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L1_MISS / INST_RETIRED.ANY", 1258 + "MetricGroup": "CacheMisses;Mem", 1259 + "MetricName": "tma_info_memory_l1mpki" 1260 + }, 1261 + { 1262 + "BriefDescription": "L1 cache true misses per kilo instruction for all demand loads (including speculative)", 1263 + "MetricExpr": "1e3 * L2_RQSTS.ALL_DEMAND_DATA_RD / INST_RETIRED.ANY", 1264 + "MetricGroup": "CacheMisses;Mem", 1265 + "MetricName": "tma_info_memory_l1mpki_load" 1266 + }, 1267 + { 1268 + "BriefDescription": "L2 cache hits per kilo instruction for all request types (including speculative)", 1269 + "MetricExpr": "1e3 * (L2_RQSTS.REFERENCES - L2_RQSTS.MISS) / INST_RETIRED.ANY", 1270 + 
"MetricGroup": "CacheMisses;Mem", 1271 + "MetricName": "tma_info_memory_l2hpki_all" 1272 + }, 1273 + { 1274 + "BriefDescription": "L2 cache hits per kilo instruction for all demand loads (including speculative)", 1275 + "MetricExpr": "1e3 * L2_RQSTS.DEMAND_DATA_RD_HIT / INST_RETIRED.ANY", 1276 + "MetricGroup": "CacheMisses;Mem", 1277 + "MetricName": "tma_info_memory_l2hpki_load" 1278 + }, 1279 + { 1280 + "BriefDescription": "L2 cache true misses per kilo instruction for retired demand loads", 1281 + "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L2_MISS / INST_RETIRED.ANY", 1282 + "MetricGroup": "Backend;CacheMisses;Mem", 1283 + "MetricName": "tma_info_memory_l2mpki" 1284 + }, 1285 + { 1286 + "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all request types (including speculative)", 1287 + "MetricExpr": "1e3 * L2_RQSTS.MISS / INST_RETIRED.ANY", 1288 + "MetricGroup": "CacheMisses;Mem;Offcore", 1289 + "MetricName": "tma_info_memory_l2mpki_all" 1290 + }, 1291 + { 1292 + "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all demand loads (including speculative)", 1293 + "MetricExpr": "1e3 * L2_RQSTS.DEMAND_DATA_RD_MISS / INST_RETIRED.ANY", 1294 + "MetricGroup": "CacheMisses;Mem", 1295 + "MetricName": "tma_info_memory_l2mpki_load" 1296 + }, 1297 + { 1298 + "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads", 1299 + "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L3_MISS / INST_RETIRED.ANY", 1300 + "MetricGroup": "CacheMisses;Mem", 1301 + "MetricName": "tma_info_memory_l3mpki" 1302 + }, 1303 + { 1304 + "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)", 1305 + "MetricExpr": "L1D_PEND_MISS.PENDING / (MEM_LOAD_RETIRED.L1_MISS + MEM_LOAD_RETIRED.FB_HIT)", 1306 + "MetricGroup": "Mem;MemoryBound;MemoryLat", 1307 + "MetricName": "tma_info_memory_load_miss_real_latency" 716 1308 }, 717 1309 { 718 1310 "BriefDescription": "Memory-Level-Parallelism 
(average number of L1 miss demand load when there is at least one such miss", 719 1311 "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES", 720 1312 "MetricGroup": "Mem;MemoryBW;MemoryBound", 721 - "MetricName": "tma_info_mlp", 1313 + "MetricName": "tma_info_memory_mlp", 722 1314 "PublicDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-Logical Processor)" 1315 + }, 1316 + { 1317 + "BriefDescription": "Average Parallel L2 cache miss data reads", 1318 + "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD", 1319 + "MetricGroup": "Memory_BW;Offcore", 1320 + "MetricName": "tma_info_memory_oro_data_l2_mlp" 1321 + }, 1322 + { 1323 + "BriefDescription": "Average Latency for L2 cache miss demand Loads", 1324 + "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS.DEMAND_DATA_RD", 1325 + "MetricGroup": "Memory_Lat;Offcore", 1326 + "MetricName": "tma_info_memory_oro_load_l2_miss_latency" 1327 + }, 1328 + { 1329 + "BriefDescription": "Average Parallel L2 cache miss demand Loads", 1330 + "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_DATA_RD", 1331 + "MetricGroup": "Memory_BW;Offcore", 1332 + "MetricName": "tma_info_memory_oro_load_l2_mlp" 1333 + }, 1334 + { 1335 + "BriefDescription": "Average per-thread data fill bandwidth to the L1 data cache [GB / sec]", 1336 + "MetricExpr": "tma_info_memory_core_l1d_cache_fill_bw", 1337 + "MetricGroup": "Mem;MemoryBW", 1338 + "MetricName": "tma_info_memory_thread_l1d_cache_fill_bw_1t" 1339 + }, 1340 + { 1341 + "BriefDescription": "Average per-thread data fill bandwidth to the L2 cache [GB / sec]", 1342 + "MetricExpr": "tma_info_memory_core_l2_cache_fill_bw", 1343 + "MetricGroup": "Mem;MemoryBW", 1344 + "MetricName": "tma_info_memory_thread_l2_cache_fill_bw_1t" 1345 + }, 1346 + { 1347 + "BriefDescription": "Average 
per-thread data access bandwidth to the L3 cache [GB / sec]", 1348 + "MetricExpr": "tma_info_memory_core_l3_cache_access_bw", 1349 + "MetricGroup": "Mem;MemoryBW;Offcore", 1350 + "MetricName": "tma_info_memory_thread_l3_cache_access_bw_1t" 1351 + }, 1352 + { 1353 + "BriefDescription": "Average per-thread data fill bandwidth to the L3 cache [GB / sec]", 1354 + "MetricExpr": "tma_info_memory_core_l3_cache_fill_bw", 1355 + "MetricGroup": "Mem;MemoryBW", 1356 + "MetricName": "tma_info_memory_thread_l3_cache_fill_bw_1t" 1357 + }, 1358 + { 1359 + "BriefDescription": "STLB (2nd level TLB) code speculative misses per kilo instruction (misses of any page-size that complete the page walk)", 1360 + "MetricExpr": "1e3 * ITLB_MISSES.WALK_COMPLETED / INST_RETIRED.ANY", 1361 + "MetricGroup": "Fed;MemoryTLB", 1362 + "MetricName": "tma_info_memory_tlb_code_stlb_mpki" 1363 + }, 1364 + { 1365 + "BriefDescription": "STLB (2nd level TLB) data load speculative misses per kilo instruction (misses of any page-size that complete the page walk)", 1366 + "MetricExpr": "1e3 * DTLB_LOAD_MISSES.WALK_COMPLETED / INST_RETIRED.ANY", 1367 + "MetricGroup": "Mem;MemoryTLB", 1368 + "MetricName": "tma_info_memory_tlb_load_stlb_mpki" 723 1369 }, 724 1370 { 725 1371 "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses", 726 1372 "MetricConstraint": "NO_GROUP_EVENTS_NMI", 727 - "MetricExpr": "(ITLB_MISSES.WALK_PENDING + DTLB_LOAD_MISSES.WALK_PENDING + DTLB_STORE_MISSES.WALK_PENDING + EPT.WALK_PENDING) / (2 * tma_info_core_clks)", 1373 + "MetricExpr": "(ITLB_MISSES.WALK_PENDING + DTLB_LOAD_MISSES.WALK_PENDING + DTLB_STORE_MISSES.WALK_PENDING + EPT.WALK_PENDING) / (2 * tma_info_core_core_clks)", 728 1374 "MetricGroup": "Mem;MemoryTLB", 729 - "MetricName": "tma_info_page_walks_utilization", 730 - "MetricThreshold": "tma_info_page_walks_utilization > 0.5" 731 - }, 732 - { 733 - "BriefDescription": "Fraction of Core cycles where the core 
was running with power-delivery for baseline license level 0", 734 - "MetricExpr": "(CORE_POWER.LVL0_TURBO_LICENSE / 2 / tma_info_core_clks if #SMT_on else CORE_POWER.LVL0_TURBO_LICENSE / tma_info_core_clks)", 735 - "MetricGroup": "Power", 736 - "MetricName": "tma_info_power_license0_utilization", 737 - "PublicDescription": "Fraction of Core cycles where the core was running with power-delivery for baseline license level 0. This includes non-AVX codes, SSE, AVX 128-bit, and low-current AVX 256-bit codes." 738 - }, 739 - { 740 - "BriefDescription": "Fraction of Core cycles where the core was running with power-delivery for license level 1", 741 - "MetricExpr": "(CORE_POWER.LVL1_TURBO_LICENSE / 2 / tma_info_core_clks if #SMT_on else CORE_POWER.LVL1_TURBO_LICENSE / tma_info_core_clks)", 742 - "MetricGroup": "Power", 743 - "MetricName": "tma_info_power_license1_utilization", 744 - "MetricThreshold": "tma_info_power_license1_utilization > 0.5", 745 - "PublicDescription": "Fraction of Core cycles where the core was running with power-delivery for license level 1. This includes high current AVX 256-bit instructions as well as low current AVX 512-bit instructions." 746 - }, 747 - { 748 - "BriefDescription": "Fraction of Core cycles where the core was running with power-delivery for license level 2 (introduced in SKX)", 749 - "MetricExpr": "(CORE_POWER.LVL2_TURBO_LICENSE / 2 / tma_info_core_clks if #SMT_on else CORE_POWER.LVL2_TURBO_LICENSE / tma_info_core_clks)", 750 - "MetricGroup": "Power", 751 - "MetricName": "tma_info_power_license2_utilization", 752 - "MetricThreshold": "tma_info_power_license2_utilization > 0.5", 753 - "PublicDescription": "Fraction of Core cycles where the core was running with power-delivery for license level 2 (introduced in SKX). This includes high current AVX 512-bit instructions." 
754 - }, 755 - { 756 - "BriefDescription": "Average number of Uops retired in cycles where at least one uop has retired.", 757 - "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / cpu@UOPS_RETIRED.RETIRE_SLOTS\\,cmask\\=1@", 758 - "MetricGroup": "Pipeline;Ret", 759 - "MetricName": "tma_info_retire" 760 - }, 761 - { 762 - "BriefDescription": "Total issue-pipeline slots (per-Physical Core till ICL; per-Logical Processor ICL onward)", 763 - "MetricExpr": "4 * tma_info_core_clks", 764 - "MetricGroup": "TmaL1;tma_L1_group", 765 - "MetricName": "tma_info_slots" 766 - }, 767 - { 768 - "BriefDescription": "Fraction of cycles where both hardware Logical Processors were active", 769 - "MetricExpr": "(1 - CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / (CPU_CLK_UNHALTED.REF_XCLK_ANY / 2) if #SMT_on else 0)", 770 - "MetricGroup": "SMT", 771 - "MetricName": "tma_info_smt_2t_utilization" 772 - }, 773 - { 774 - "BriefDescription": "Socket actual clocks when any core is active on that socket", 775 - "MetricExpr": "cha_0@event\\=0x0@", 776 - "MetricGroup": "SoC", 777 - "MetricName": "tma_info_socket_clks" 1375 + "MetricName": "tma_info_memory_tlb_page_walks_utilization", 1376 + "MetricThreshold": "tma_info_memory_tlb_page_walks_utilization > 0.5" 778 1377 }, 779 1378 { 780 1379 "BriefDescription": "STLB (2nd level TLB) data store speculative misses per kilo instruction (misses of any page-size that complete the page walk)", 781 1380 "MetricExpr": "1e3 * DTLB_STORE_MISSES.WALK_COMPLETED / INST_RETIRED.ANY", 782 1381 "MetricGroup": "Mem;MemoryTLB", 783 - "MetricName": "tma_info_store_stlb_mpki" 1382 + "MetricName": "tma_info_memory_tlb_store_stlb_mpki" 1383 + }, 1384 + { 1385 + "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-thread", 1386 + "MetricExpr": "UOPS_EXECUTED.THREAD / cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@", 1387 + "MetricGroup": "Cor;Pipeline;PortsUtil;SMT", 1388 + "MetricName": "tma_info_pipeline_execute" 1389 + }, 1390 + { 
1391 + "BriefDescription": "Average number of Uops retired in cycles where at least one uop has retired.", 1392 + "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / cpu@UOPS_RETIRED.RETIRE_SLOTS\\,cmask\\=1@", 1393 + "MetricGroup": "Pipeline;Ret", 1394 + "MetricName": "tma_info_pipeline_retire" 1395 + }, 1396 + { 1397 + "BriefDescription": "Measured Average Frequency for unhalted processors [GHz]", 1398 + "MetricExpr": "tma_info_system_turbo_utilization * TSC / 1e9 / duration_time", 1399 + "MetricGroup": "Power;Summary", 1400 + "MetricName": "tma_info_system_average_frequency" 1401 + }, 1402 + { 1403 + "BriefDescription": "Average CPU Utilization", 1404 + "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / TSC", 1405 + "MetricGroup": "HPC;Summary", 1406 + "MetricName": "tma_info_system_cpu_utilization" 1407 + }, 1408 + { 1409 + "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]", 1410 + "MetricExpr": "64 * (UNC_M_CAS_COUNT.RD + UNC_M_CAS_COUNT.WR) / 1e9 / duration_time", 1411 + "MetricGroup": "HPC;Mem;MemoryBW;SoC;tma_issueBW", 1412 + "MetricName": "tma_info_system_dram_bw_use", 1413 + "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. 
Related metrics: tma_fb_full, tma_info_bottleneck_memory_bandwidth, tma_mem_bandwidth, tma_sq_full" 1414 + }, 1415 + { 1416 + "BriefDescription": "Giga Floating Point Operations Per Second", 1417 + "MetricConstraint": "NO_GROUP_EVENTS", 1418 + "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * (FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE) + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / 1e9 / duration_time", 1419 + "MetricGroup": "Cor;Flops;HPC", 1420 + "MetricName": "tma_info_system_gflops", 1421 + "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width and AMX engine." 1422 + }, 1423 + { 1424 + "BriefDescription": "Average IO (network or disk) Bandwidth Use for Reads [GB / sec]", 1425 + "MetricExpr": "(UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART0 + UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART1 + UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART2 + UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART3) * 4 / 1e9 / duration_time", 1426 + "MetricGroup": "IoBW;Mem;Server;SoC", 1427 + "MetricName": "tma_info_system_io_read_bw" 1428 + }, 1429 + { 1430 + "BriefDescription": "Average IO (network or disk) Bandwidth Use for Writes [GB / sec]", 1431 + "MetricExpr": "(UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART0 + UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART1 + UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART2 + UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART3) * 4 / 1e9 / duration_time", 1432 + "MetricGroup": "IoBW;Mem;Server;SoC", 1433 + "MetricName": "tma_info_system_io_write_bw" 1434 + }, 1435 + { 1436 + "BriefDescription": "Instructions per Far Branch ( Far Branches apply upon transition from application to operating system, handling interrupts, exceptions) [lower number means higher occurrence rate]", 1437 
+ "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.FAR_BRANCH:u", 1438 + "MetricGroup": "Branches;OS", 1439 + "MetricName": "tma_info_system_ipfarbranch", 1440 + "MetricThreshold": "tma_info_system_ipfarbranch < 1e6" 1441 + }, 1442 + { 1443 + "BriefDescription": "Cycles Per Instruction for the Operating System (OS) Kernel mode", 1444 + "MetricExpr": "CPU_CLK_UNHALTED.THREAD_P:k / INST_RETIRED.ANY_P:k", 1445 + "MetricGroup": "OS", 1446 + "MetricName": "tma_info_system_kernel_cpi" 1447 + }, 1448 + { 1449 + "BriefDescription": "Fraction of cycles spent in the Operating System (OS) Kernel mode", 1450 + "MetricExpr": "CPU_CLK_UNHALTED.THREAD_P:k / CPU_CLK_UNHALTED.THREAD", 1451 + "MetricGroup": "OS", 1452 + "MetricName": "tma_info_system_kernel_utilization", 1453 + "MetricThreshold": "tma_info_system_kernel_utilization > 0.05" 1454 + }, 1455 + { 1456 + "BriefDescription": "Average latency of data read request to external DRAM memory [in nanoseconds]", 1457 + "MetricExpr": "1e9 * (UNC_M_RPQ_OCCUPANCY / UNC_M_RPQ_INSERTS) / imc_0@event\\=0x0@", 1458 + "MetricGroup": "Mem;MemoryLat;Server;SoC", 1459 + "MetricName": "tma_info_system_mem_dram_read_latency", 1460 + "PublicDescription": "Average latency of data read request to external DRAM memory [in nanoseconds]. Accounts for demand loads and L1/L2 data-read prefetches" 1461 + }, 1462 + { 1463 + "BriefDescription": "Average number of parallel data read requests to external memory", 1464 + "MetricExpr": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD / UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD@thresh\\=1@", 1465 + "MetricGroup": "Mem;MemoryBW;SoC", 1466 + "MetricName": "tma_info_system_mem_parallel_reads", 1467 + "PublicDescription": "Average number of parallel data read requests to external memory. 
Accounts for demand loads and L1/L2 prefetches" 1468 + }, 1469 + { 1470 + "BriefDescription": "Average latency of data read request to external memory (in nanoseconds)", 1471 + "MetricExpr": "1e9 * (UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD / UNC_CHA_TOR_INSERTS.IA_MISS_DRD) / (tma_info_system_socket_clks / duration_time)", 1472 + "MetricGroup": "Mem;MemoryLat;SoC", 1473 + "MetricName": "tma_info_system_mem_read_latency", 1474 + "PublicDescription": "Average latency of data read request to external memory (in nanoseconds). Accounts for demand loads and L1/L2 prefetches. ([RKL+]memory-controller only)" 1475 + }, 1476 + { 1477 + "BriefDescription": "Fraction of Core cycles where the core was running with power-delivery for baseline license level 0", 1478 + "MetricExpr": "(CORE_POWER.LVL0_TURBO_LICENSE / 2 / tma_info_core_core_clks if #SMT_on else CORE_POWER.LVL0_TURBO_LICENSE / tma_info_core_core_clks)", 1479 + "MetricGroup": "Power", 1480 + "MetricName": "tma_info_system_power_license0_utilization", 1481 + "PublicDescription": "Fraction of Core cycles where the core was running with power-delivery for baseline license level 0. This includes non-AVX codes, SSE, AVX 128-bit, and low-current AVX 256-bit codes." 1482 + }, 1483 + { 1484 + "BriefDescription": "Fraction of Core cycles where the core was running with power-delivery for license level 1", 1485 + "MetricExpr": "(CORE_POWER.LVL1_TURBO_LICENSE / 2 / tma_info_core_core_clks if #SMT_on else CORE_POWER.LVL1_TURBO_LICENSE / tma_info_core_core_clks)", 1486 + "MetricGroup": "Power", 1487 + "MetricName": "tma_info_system_power_license1_utilization", 1488 + "MetricThreshold": "tma_info_system_power_license1_utilization > 0.5", 1489 + "PublicDescription": "Fraction of Core cycles where the core was running with power-delivery for license level 1. This includes high current AVX 256-bit instructions as well as low current AVX 512-bit instructions." 
1490 + }, 1491 + { 1492 + "BriefDescription": "Fraction of Core cycles where the core was running with power-delivery for license level 2 (introduced in SKX)", 1493 + "MetricExpr": "(CORE_POWER.LVL2_TURBO_LICENSE / 2 / tma_info_core_core_clks if #SMT_on else CORE_POWER.LVL2_TURBO_LICENSE / tma_info_core_core_clks)", 1494 + "MetricGroup": "Power", 1495 + "MetricName": "tma_info_system_power_license2_utilization", 1496 + "MetricThreshold": "tma_info_system_power_license2_utilization > 0.5", 1497 + "PublicDescription": "Fraction of Core cycles where the core was running with power-delivery for license level 2 (introduced in SKX). This includes high current AVX 512-bit instructions." 1498 + }, 1499 + { 1500 + "BriefDescription": "Fraction of cycles where both hardware Logical Processors were active", 1501 + "MetricExpr": "(1 - CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / (CPU_CLK_UNHALTED.REF_XCLK_ANY / 2) if #SMT_on else 0)", 1502 + "MetricGroup": "SMT", 1503 + "MetricName": "tma_info_system_smt_2t_utilization" 1504 + }, 1505 + { 1506 + "BriefDescription": "Socket actual clocks when any core is active on that socket", 1507 + "MetricExpr": "cha_0@event\\=0x0@", 1508 + "MetricGroup": "SoC", 1509 + "MetricName": "tma_info_system_socket_clks" 784 1510 }, 785 1511 { 786 1512 "BriefDescription": "Average Frequency Utilization relative nominal frequency", 787 - "MetricExpr": "tma_info_clks / CPU_CLK_UNHALTED.REF_TSC", 1513 + "MetricExpr": "tma_info_thread_clks / CPU_CLK_UNHALTED.REF_TSC", 788 1514 "MetricGroup": "Power", 789 - "MetricName": "tma_info_turbo_utilization" 1515 + "MetricName": "tma_info_system_turbo_utilization" 1516 + }, 1517 + { 1518 + "BriefDescription": "Per-Logical Processor actual clocks when the Logical Processor is active.", 1519 + "MetricExpr": "CPU_CLK_UNHALTED.THREAD", 1520 + "MetricGroup": "Pipeline", 1521 + "MetricName": "tma_info_thread_clks" 1522 + }, 1523 + { 1524 + "BriefDescription": "Cycles Per Instruction (per Logical Processor)", 1525 + 
"MetricExpr": "1 / tma_info_thread_ipc", 1526 + "MetricGroup": "Mem;Pipeline", 1527 + "MetricName": "tma_info_thread_cpi" 1528 + }, 1529 + { 1530 + "BriefDescription": "The ratio of Executed- by Issued-Uops", 1531 + "MetricExpr": "UOPS_EXECUTED.THREAD / UOPS_ISSUED.ANY", 1532 + "MetricGroup": "Cor;Pipeline", 1533 + "MetricName": "tma_info_thread_execute_per_issue", 1534 + "PublicDescription": "The ratio of Executed- by Issued-Uops. Ratio > 1 suggests high rate of uop micro-fusions. Ratio < 1 suggest high rate of \"execute\" at rename stage." 1535 + }, 1536 + { 1537 + "BriefDescription": "Instructions Per Cycle (per Logical Processor)", 1538 + "MetricExpr": "INST_RETIRED.ANY / tma_info_thread_clks", 1539 + "MetricGroup": "Ret;Summary", 1540 + "MetricName": "tma_info_thread_ipc" 1541 + }, 1542 + { 1543 + "BriefDescription": "Total issue-pipeline slots (per-Physical Core till ICL; per-Logical Processor ICL onward)", 1544 + "MetricExpr": "4 * tma_info_core_core_clks", 1545 + "MetricGroup": "TmaL1;tma_L1_group", 1546 + "MetricName": "tma_info_thread_slots" 790 1547 }, 791 1548 { 792 1549 "BriefDescription": "Uops Per Instruction", 793 1550 "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / INST_RETIRED.ANY", 794 1551 "MetricGroup": "Pipeline;Ret;Retire", 795 - "MetricName": "tma_info_uoppi", 796 - "MetricThreshold": "tma_info_uoppi > 1.05" 1552 + "MetricName": "tma_info_thread_uoppi", 1553 + "MetricThreshold": "tma_info_thread_uoppi > 1.05" 797 1554 }, 798 1555 { 799 1556 "BriefDescription": "Instruction per taken branch", 800 1557 "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / BR_INST_RETIRED.NEAR_TAKEN", 801 1558 "MetricGroup": "Branches;Fed;FetchBW", 802 - "MetricName": "tma_info_uptb", 803 - "MetricThreshold": "tma_info_uptb < 6" 1559 + "MetricName": "tma_info_thread_uptb", 1560 + "MetricThreshold": "tma_info_thread_uptb < 6" 804 1561 }, 805 1562 { 806 1563 "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses", 
807 - "MetricExpr": "ICACHE_64B.IFTAG_STALL / tma_info_clks", 1564 + "MetricExpr": "ICACHE_64B.IFTAG_STALL / tma_info_thread_clks", 808 1565 "MetricGroup": "BigFoot;FetchLat;MemoryTLB;TopdownL3;tma_L3_group;tma_fetch_latency_group", 809 1566 "MetricName": "tma_itlb_misses", 810 1567 "MetricThreshold": "tma_itlb_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", ··· 1302 1081 }, 1303 1082 { 1304 1083 "BriefDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache", 1305 - "MetricExpr": "max((CYCLE_ACTIVITY.STALLS_MEM_ANY - CYCLE_ACTIVITY.STALLS_L1D_MISS) / tma_info_clks, 0)", 1084 + "MetricExpr": "max((CYCLE_ACTIVITY.STALLS_MEM_ANY - CYCLE_ACTIVITY.STALLS_L1D_MISS) / tma_info_thread_clks, 0)", 1306 1085 "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_issueL1;tma_issueMC;tma_memory_bound_group", 1307 1086 "MetricName": "tma_l1_bound", 1308 1087 "MetricThreshold": "tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", ··· 1312 1091 { 1313 1092 "BriefDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads", 1314 1093 "MetricConstraint": "NO_GROUP_EVENTS", 1315 - "MetricExpr": "MEM_LOAD_RETIRED.L2_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) / (MEM_LOAD_RETIRED.L2_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + cpu@L1D_PEND_MISS.FB_FULL\\,cmask\\=1@) * ((CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS) / tma_info_clks)", 1094 + "MetricExpr": "MEM_LOAD_RETIRED.L2_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) / (MEM_LOAD_RETIRED.L2_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + cpu@L1D_PEND_MISS.FB_FULL\\,cmask\\=1@) * ((CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS) / tma_info_thread_clks)", 1316 1095 "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", 1317 1096 
"MetricName": "tma_l2_bound", 1318 1097 "MetricThreshold": "tma_l2_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", ··· 1321 1100 }, 1322 1101 { 1323 1102 "BriefDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core", 1324 - "MetricExpr": "(CYCLE_ACTIVITY.STALLS_L2_MISS - CYCLE_ACTIVITY.STALLS_L3_MISS) / tma_info_clks", 1103 + "MetricExpr": "(CYCLE_ACTIVITY.STALLS_L2_MISS - CYCLE_ACTIVITY.STALLS_L3_MISS) / tma_info_thread_clks", 1325 1104 "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", 1326 1105 "MetricName": "tma_l3_bound", 1327 1106 "MetricThreshold": "tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", ··· 1330 1109 }, 1331 1110 { 1332 1111 "BriefDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)", 1333 - "MetricExpr": "17 * tma_info_average_frequency * MEM_LOAD_RETIRED.L3_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_clks", 1112 + "MetricExpr": "17 * tma_info_system_average_frequency * MEM_LOAD_RETIRED.L3_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", 1334 1113 "MetricGroup": "MemoryLat;TopdownL4;tma_L4_group;tma_issueLat;tma_l3_bound_group", 1335 1114 "MetricName": "tma_l3_hit_latency", 1336 1115 "MetricThreshold": "tma_l3_hit_latency > 0.1 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", 1337 - "PublicDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited). Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance. Note the value of this node may overlap with its siblings. 
Sample with: MEM_LOAD_RETIRED.L3_HIT_PS. Related metrics: tma_info_memory_latency, tma_mem_latency", 1116 + "PublicDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited). Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance. Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS. Related metrics: tma_info_bottleneck_memory_latency, tma_mem_latency", 1338 1117 "ScaleUnit": "100%" 1339 1118 }, 1340 1119 { 1341 1120 "BriefDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs)", 1342 - "MetricExpr": "ILD_STALL.LCP / tma_info_clks", 1121 + "MetricExpr": "ILD_STALL.LCP / tma_info_thread_clks", 1343 1122 "MetricGroup": "FetchLat;TopdownL3;tma_L3_group;tma_fetch_latency_group;tma_issueFB", 1344 1123 "MetricName": "tma_lcp", 1345 1124 "MetricThreshold": "tma_lcp > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", 1346 - "PublicDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs). Using proper compiler flags or Intel Compiler by default will certainly avoid this. #Link: Optimization Guide about LCP BKMs. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_dsb_coverage, tma_info_dsb_misses, tma_info_iptb", 1125 + "PublicDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs). Using proper compiler flags or Intel Compiler by default will certainly avoid this. #Link: Optimization Guide about LCP BKMs. 
Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_botlnk_l2_dsb_misses, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb", 1347 1126 "ScaleUnit": "100%" 1348 1127 }, 1349 1128 { ··· 1358 1137 }, 1359 1138 { 1360 1139 "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port for Load operations", 1361 - "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_2 + UOPS_DISPATCHED_PORT.PORT_3 + UOPS_DISPATCHED_PORT.PORT_7 - UOPS_DISPATCHED_PORT.PORT_4) / (2 * tma_info_core_clks)", 1140 + "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_2 + UOPS_DISPATCHED_PORT.PORT_3 + UOPS_DISPATCHED_PORT.PORT_7 - UOPS_DISPATCHED_PORT.PORT_4) / (2 * tma_info_core_core_clks)", 1362 1141 "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group", 1363 1142 "MetricName": "tma_load_op_utilization", 1364 1143 "MetricThreshold": "tma_load_op_utilization > 0.6", ··· 1376 1155 }, 1377 1156 { 1378 1157 "BriefDescription": "This metric estimates the fraction of cycles where the Second-level TLB (STLB) was missed by load accesses, performing a hardware page walk", 1379 - "MetricExpr": "DTLB_LOAD_MISSES.WALK_ACTIVE / tma_info_clks", 1158 + "MetricExpr": "DTLB_LOAD_MISSES.WALK_ACTIVE / tma_info_thread_clks", 1380 1159 "MetricGroup": "MemoryTLB;TopdownL5;tma_L5_group;tma_dtlb_load_group", 1381 1160 "MetricName": "tma_load_stlb_miss", 1382 1161 "MetricThreshold": "tma_load_stlb_miss > 0.05 & (tma_dtlb_load > 0.1 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))", ··· 1384 1163 }, 1385 1164 { 1386 1165 "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from local memory", 1387 - "MetricExpr": "59.5 * tma_info_average_frequency * MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_clks", 1166 + "MetricExpr": "59.5 * tma_info_system_average_frequency * MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * (1 + 
MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", 1388 1167 "MetricGroup": "Server;TopdownL5;tma_L5_group;tma_mem_latency_group", 1389 1168 "MetricName": "tma_local_dram", 1390 1169 "MetricThreshold": "tma_local_dram > 0.1 & (tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))", ··· 1393 1172 }, 1394 1173 { 1395 1174 "BriefDescription": "This metric represents fraction of cycles the CPU spent handling cache misses due to lock operations", 1396 - "MetricExpr": "(12 * max(0, MEM_INST_RETIRED.LOCK_LOADS - L2_RQSTS.ALL_RFO) + MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES * (11 * L2_RQSTS.RFO_HIT + min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO))) / tma_info_clks", 1175 + "MetricExpr": "(12 * max(0, MEM_INST_RETIRED.LOCK_LOADS - L2_RQSTS.ALL_RFO) + MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES * (11 * L2_RQSTS.RFO_HIT + min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO))) / tma_info_thread_clks", 1397 1176 "MetricGroup": "Offcore;TopdownL4;tma_L4_group;tma_issueRFO;tma_l1_bound_group", 1398 1177 "MetricName": "tma_lock_latency", 1399 1178 "MetricThreshold": "tma_lock_latency > 0.2 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", ··· 1413 1192 }, 1414 1193 { 1415 1194 "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM)", 1416 - "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, cpu@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=4@) / tma_info_clks", 1195 + "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, cpu@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=4@) / tma_info_thread_clks", 1417 1196 "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueBW", 1418 1197 "MetricName": "tma_mem_bandwidth", 1419 1198 "MetricThreshold": 
"tma_mem_bandwidth > 0.2 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", 1420 - "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_fb_full, tma_info_dram_bw_use, tma_info_memory_bandwidth, tma_sq_full", 1199 + "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). 
Related metrics: tma_fb_full, tma_info_bottleneck_memory_bandwidth, tma_info_system_dram_bw_use, tma_sq_full", 1421 1200 "ScaleUnit": "100%" 1422 1201 }, 1423 1202 { 1424 1203 "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM)", 1425 - "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD) / tma_info_clks - tma_mem_bandwidth", 1204 + "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD) / tma_info_thread_clks - tma_mem_bandwidth", 1426 1205 "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueLat", 1427 1206 "MetricName": "tma_mem_latency", 1428 1207 "MetricThreshold": "tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", 1429 - "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM). This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: tma_info_memory_latency, tma_l3_hit_latency", 1208 + "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM). This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). 
Related metrics: tma_info_bottleneck_memory_latency, tma_l3_hit_latency", 1430 1209 "ScaleUnit": "100%" 1431 1210 }, 1432 1211 { ··· 1450 1229 }, 1451 1230 { 1452 1231 "BriefDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit", 1453 - "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / UOPS_ISSUED.ANY * IDQ.MS_UOPS / tma_info_slots", 1232 + "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / UOPS_ISSUED.ANY * IDQ.MS_UOPS / tma_info_thread_slots", 1454 1233 "MetricGroup": "MicroSeq;TopdownL3;tma_L3_group;tma_heavy_operations_group;tma_issueMC;tma_issueMS", 1455 1234 "MetricName": "tma_microcode_sequencer", 1456 1235 "MetricThreshold": "tma_microcode_sequencer > 0.05 & tma_heavy_operations > 0.1", ··· 1459 1238 }, 1460 1239 { 1461 1240 "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Branch Misprediction at execution stage", 1462 - "MetricExpr": "BR_MISP_RETIRED.ALL_BRANCHES / (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT) * INT_MISC.CLEAR_RESTEER_CYCLES / tma_info_clks", 1241 + "MetricExpr": "BR_MISP_RETIRED.ALL_BRANCHES / (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT) * INT_MISC.CLEAR_RESTEER_CYCLES / tma_info_thread_clks", 1463 1242 "MetricGroup": "BadSpec;BrMispredicts;TopdownL4;tma_L4_group;tma_branch_resteers_group;tma_issueBM", 1464 1243 "MetricName": "tma_mispredicts_resteers", 1465 1244 "MetricThreshold": "tma_mispredicts_resteers > 0.05 & (tma_branch_resteers > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15))", 1466 - "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Branch Misprediction at execution stage. Sample with: INT_MISC.CLEAR_RESTEER_CYCLES. 
Related metrics: tma_branch_mispredicts, tma_info_branch_misprediction_cost, tma_info_mispredictions", 1245 + "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Branch Misprediction at execution stage. Sample with: INT_MISC.CLEAR_RESTEER_CYCLES. Related metrics: tma_branch_mispredicts, tma_info_bad_spec_branch_misprediction_cost, tma_info_bottleneck_mispredictions", 1467 1246 "ScaleUnit": "100%" 1468 1247 }, 1469 1248 { 1470 1249 "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline)", 1471 - "MetricExpr": "(IDQ.ALL_MITE_CYCLES_ANY_UOPS - IDQ.ALL_MITE_CYCLES_4_UOPS) / tma_info_core_clks / 2", 1250 + "MetricExpr": "(IDQ.ALL_MITE_CYCLES_ANY_UOPS - IDQ.ALL_MITE_CYCLES_4_UOPS) / tma_info_core_core_clks / 2", 1472 1251 "MetricGroup": "DSBmiss;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group", 1473 1252 "MetricName": "tma_mite", 1474 - "MetricThreshold": "tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 4 > 0.35)", 1253 + "MetricThreshold": "tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 4 > 0.35)", 1475 1254 "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline). This pipeline is used for code that was not pre-cached in the DSB or LSD. For example; inefficiencies due to asymmetric decoders; use of long immediate or LCP can manifest as MITE fetch bandwidth bottleneck. 
Sample with: FRONTEND_RETIRED.ANY_DSB_MISS", 1476 1255 "ScaleUnit": "100%" 1477 1256 }, ··· 1486 1265 }, 1487 1266 { 1488 1267 "BriefDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS)", 1489 - "MetricExpr": "2 * IDQ.MS_SWITCHES / tma_info_clks", 1268 + "MetricExpr": "2 * IDQ.MS_SWITCHES / tma_info_thread_clks", 1490 1269 "MetricGroup": "FetchLat;MicroSeq;TopdownL3;tma_L3_group;tma_fetch_latency_group;tma_issueMC;tma_issueMS;tma_issueMV;tma_issueSO", 1491 1270 "MetricName": "tma_ms_switches", 1492 1271 "MetricThreshold": "tma_ms_switches > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)", ··· 1522 1301 }, 1523 1302 { 1524 1303 "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 0 ([SNB+] ALU; [HSW+] ALU and 2nd branch)", 1525 - "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_0 / tma_info_core_clks", 1304 + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_0 / tma_info_core_core_clks", 1526 1305 "MetricGroup": "Compute;TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P", 1527 1306 "MetricName": "tma_port_0", 1528 1307 "MetricThreshold": "tma_port_0 > 0.6", ··· 1531 1310 }, 1532 1311 { 1533 1312 "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 1 (ALU)", 1534 - "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_1 / tma_info_core_clks", 1313 + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_1 / tma_info_core_core_clks", 1535 1314 "MetricGroup": "TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P", 1536 1315 "MetricName": "tma_port_1", 1537 1316 "MetricThreshold": "tma_port_1 > 0.6", ··· 1540 1319 }, 1541 1320 { 1542 1321 "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 2 ([SNB+]Loads and Store-address; [ICL+] Loads)", 1543 - "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_2 / tma_info_core_clks", 1322 + 
"MetricExpr": "UOPS_DISPATCHED_PORT.PORT_2 / tma_info_core_core_clks", 1544 1323 "MetricGroup": "TopdownL6;tma_L6_group;tma_load_op_utilization_group", 1545 1324 "MetricName": "tma_port_2", 1546 1325 "MetricThreshold": "tma_port_2 > 0.6", ··· 1549 1328 }, 1550 1329 { 1551 1330 "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 3 ([SNB+]Loads and Store-address; [ICL+] Loads)", 1552 - "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_3 / tma_info_core_clks", 1331 + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_3 / tma_info_core_core_clks", 1553 1332 "MetricGroup": "TopdownL6;tma_L6_group;tma_load_op_utilization_group", 1554 1333 "MetricName": "tma_port_3", 1555 1334 "MetricThreshold": "tma_port_3 > 0.6", ··· 1567 1346 }, 1568 1347 { 1569 1348 "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 5 ([SNB+] Branches and ALU; [HSW+] ALU)", 1570 - "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_5 / tma_info_core_clks", 1349 + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_5 / tma_info_core_core_clks", 1571 1350 "MetricGroup": "TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P", 1572 1351 "MetricName": "tma_port_5", 1573 1352 "MetricThreshold": "tma_port_5 > 0.6", ··· 1576 1355 }, 1577 1356 { 1578 1357 "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+]Primary Branch and simple ALU)", 1579 - "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_6 / tma_info_core_clks", 1358 + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_6 / tma_info_core_core_clks", 1580 1359 "MetricGroup": "TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P", 1581 1360 "MetricName": "tma_port_6", 1582 1361 "MetricThreshold": "tma_port_6 > 0.6", ··· 1585 1364 }, 1586 1365 { 1587 1366 "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 7 ([HSW+]simple Store-address)", 1588 - "MetricExpr": 
"UOPS_DISPATCHED_PORT.PORT_7 / tma_info_core_clks", 1367 + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_7 / tma_info_core_core_clks", 1589 1368 "MetricGroup": "TopdownL6;tma_L6_group;tma_store_op_utilization_group", 1590 1369 "MetricName": "tma_port_7", 1591 1370 "MetricThreshold": "tma_port_7 > 0.6", ··· 1594 1373 }, 1595 1374 { 1596 1375 "BriefDescription": "This metric estimates fraction of cycles the CPU performance was potentially limited due to Core computation issues (non divider-related)", 1597 - "MetricExpr": "((EXE_ACTIVITY.EXE_BOUND_0_PORTS + (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL)) / tma_info_clks if ARITH.DIVIDER_ACTIVE < CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY else (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL) / tma_info_clks)", 1376 + "MetricExpr": "((EXE_ACTIVITY.EXE_BOUND_0_PORTS + (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL)) / tma_info_thread_clks if ARITH.DIVIDER_ACTIVE < CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY else (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL) / tma_info_thread_clks)", 1598 1377 "MetricGroup": "PortsUtil;TopdownL3;tma_L3_group;tma_core_bound_group", 1599 1378 "MetricName": "tma_ports_utilization", 1600 1379 "MetricThreshold": "tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)", ··· 1603 1382 }, 1604 1383 { 1605 1384 "BriefDescription": "This metric represents fraction of cycles CPU executed no uops on any execution port (Logical Processor cycles since ICL, Physical Core cycles otherwise)", 1606 - "MetricExpr": "(UOPS_EXECUTED.CORE_CYCLES_NONE / 2 if #SMT_on else CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY) / tma_info_core_clks", 1385 + "MetricExpr": "(UOPS_EXECUTED.CORE_CYCLES_NONE / 2 if #SMT_on else CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY) / tma_info_core_core_clks", 1607 1386 "MetricGroup": 
"PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group", 1608 1387 "MetricName": "tma_ports_utilized_0", 1609 1388 "MetricThreshold": "tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", ··· 1612 1391 }, 1613 1392 { 1614 1393 "BriefDescription": "This metric represents fraction of cycles where the CPU executed total of 1 uop per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", 1615 - "MetricExpr": "((UOPS_EXECUTED.CORE_CYCLES_GE_1 - UOPS_EXECUTED.CORE_CYCLES_GE_2) / 2 if #SMT_on else EXE_ACTIVITY.1_PORTS_UTIL) / tma_info_core_clks", 1394 + "MetricExpr": "((UOPS_EXECUTED.CORE_CYCLES_GE_1 - UOPS_EXECUTED.CORE_CYCLES_GE_2) / 2 if #SMT_on else EXE_ACTIVITY.1_PORTS_UTIL) / tma_info_core_core_clks", 1616 1395 "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_issueL1;tma_ports_utilization_group", 1617 1396 "MetricName": "tma_ports_utilized_1", 1618 1397 "MetricThreshold": "tma_ports_utilized_1 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", ··· 1621 1400 }, 1622 1401 { 1623 1402 "BriefDescription": "This metric represents fraction of cycles CPU executed total of 2 uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", 1624 - "MetricExpr": "((UOPS_EXECUTED.CORE_CYCLES_GE_2 - UOPS_EXECUTED.CORE_CYCLES_GE_3) / 2 if #SMT_on else EXE_ACTIVITY.2_PORTS_UTIL) / tma_info_core_clks", 1403 + "MetricExpr": "((UOPS_EXECUTED.CORE_CYCLES_GE_2 - UOPS_EXECUTED.CORE_CYCLES_GE_3) / 2 if #SMT_on else EXE_ACTIVITY.2_PORTS_UTIL) / tma_info_core_core_clks", 1625 1404 "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_issue2P;tma_ports_utilization_group", 1626 1405 "MetricName": "tma_ports_utilized_2", 1627 1406 "MetricThreshold": "tma_ports_utilized_2 > 0.15 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", ··· 1630 1409 }, 1631 1410 { 1632 
1411 "BriefDescription": "This metric represents fraction of cycles CPU executed total of 3 or more uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise).", 1633 - "MetricExpr": "(UOPS_EXECUTED.CORE_CYCLES_GE_3 / 2 if #SMT_on else UOPS_EXECUTED.CORE_CYCLES_GE_3) / tma_info_core_clks", 1412 + "MetricExpr": "(UOPS_EXECUTED.CORE_CYCLES_GE_3 / 2 if #SMT_on else UOPS_EXECUTED.CORE_CYCLES_GE_3) / tma_info_core_core_clks", 1634 1413 "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group", 1635 1414 "MetricName": "tma_ports_utilized_3m", 1636 1415 "MetricThreshold": "tma_ports_utilized_3m > 0.7 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", ··· 1639 1418 { 1640 1419 "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from remote cache in other sockets including synchronizations issues", 1641 1420 "MetricConstraint": "NO_GROUP_EVENTS_NMI", 1642 - "MetricExpr": "(89.5 * tma_info_average_frequency * MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM + 89.5 * tma_info_average_frequency * MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_clks", 1421 + "MetricExpr": "(89.5 * tma_info_system_average_frequency * MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM + 89.5 * tma_info_system_average_frequency * MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", 1643 1422 "MetricGroup": "Offcore;Server;Snoop;TopdownL5;tma_L5_group;tma_issueSyncxn;tma_mem_latency_group", 1644 1423 "MetricName": "tma_remote_cache", 1645 1424 "MetricThreshold": "tma_remote_cache > 0.05 & (tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))", ··· 1648 1427 }, 1649 1428 { 1650 1429 "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads 
from remote memory", 1651 - "MetricExpr": "127 * tma_info_average_frequency * MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_clks", 1430 + "MetricExpr": "127 * tma_info_system_average_frequency * MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", 1652 1431 "MetricGroup": "Server;Snoop;TopdownL5;tma_L5_group;tma_mem_latency_group", 1653 1432 "MetricName": "tma_remote_dram", 1654 1433 "MetricThreshold": "tma_remote_dram > 0.1 & (tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))", ··· 1657 1436 }, 1658 1437 { 1659 1438 "BriefDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired", 1660 - "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / tma_info_slots", 1439 + "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / tma_info_thread_slots", 1661 1440 "MetricGroup": "TmaL1;TopdownL1;tma_L1_group", 1662 1441 "MetricName": "tma_retiring", 1663 1442 "MetricThreshold": "tma_retiring > 0.7 | tma_heavy_operations > 0.1", ··· 1667 1446 }, 1668 1447 { 1669 1448 "BriefDescription": "This metric represents fraction of cycles the CPU issue-pipeline was stalled due to serializing operations", 1670 - "MetricExpr": "PARTIAL_RAT_STALLS.SCOREBOARD / tma_info_clks", 1449 + "MetricExpr": "PARTIAL_RAT_STALLS.SCOREBOARD / tma_info_thread_clks", 1671 1450 "MetricGroup": "PortsUtil;TopdownL5;tma_L5_group;tma_issueSO;tma_ports_utilized_0_group", 1672 1451 "MetricName": "tma_serializing_operation", 1673 1452 "MetricThreshold": "tma_serializing_operation > 0.1 & (tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)))", ··· 1677 1456 { 1678 1457 "BriefDescription": "This metric estimates fraction of cycles handling memory load split accesses - load that cross 64-byte cache line boundary", 1679 1458 
"MetricConstraint": "NO_GROUP_EVENTS_NMI", 1680 - "MetricExpr": "tma_info_load_miss_real_latency * LD_BLOCKS.NO_SR / tma_info_clks", 1459 + "MetricExpr": "tma_info_memory_load_miss_real_latency * LD_BLOCKS.NO_SR / tma_info_thread_clks", 1681 1460 "MetricGroup": "TopdownL4;tma_L4_group;tma_l1_bound_group", 1682 1461 "MetricName": "tma_split_loads", 1683 1462 "MetricThreshold": "tma_split_loads > 0.2 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", ··· 1686 1465 }, 1687 1466 { 1688 1467 "BriefDescription": "This metric represents rate of split store accesses", 1689 - "MetricExpr": "MEM_INST_RETIRED.SPLIT_STORES / tma_info_core_clks", 1468 + "MetricExpr": "MEM_INST_RETIRED.SPLIT_STORES / tma_info_core_core_clks", 1690 1469 "MetricGroup": "TopdownL4;tma_L4_group;tma_issueSpSt;tma_store_bound_group", 1691 1470 "MetricName": "tma_split_stores", 1692 1471 "MetricThreshold": "tma_split_stores > 0.2 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", ··· 1695 1474 }, 1696 1475 { 1697 1476 "BriefDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors)", 1698 - "MetricExpr": "(OFFCORE_REQUESTS_BUFFER.SQ_FULL / 2 if #SMT_on else OFFCORE_REQUESTS_BUFFER.SQ_FULL) / tma_info_core_clks", 1477 + "MetricExpr": "(OFFCORE_REQUESTS_BUFFER.SQ_FULL / 2 if #SMT_on else OFFCORE_REQUESTS_BUFFER.SQ_FULL) / tma_info_core_core_clks", 1699 1478 "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_issueBW;tma_l3_bound_group", 1700 1479 "MetricName": "tma_sq_full", 1701 1480 "MetricThreshold": "tma_sq_full > 0.3 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", 1702 - "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). 
Related metrics: tma_fb_full, tma_info_dram_bw_use, tma_info_memory_bandwidth, tma_mem_bandwidth", 1481 + "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). Related metrics: tma_fb_full, tma_info_bottleneck_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth", 1703 1482 "ScaleUnit": "100%" 1704 1483 }, 1705 1484 { 1706 1485 "BriefDescription": "This metric estimates how often CPU was stalled due to RFO store memory accesses; RFO store issue a read-for-ownership request before the write", 1707 - "MetricExpr": "EXE_ACTIVITY.BOUND_ON_STORES / tma_info_clks", 1486 + "MetricExpr": "EXE_ACTIVITY.BOUND_ON_STORES / tma_info_thread_clks", 1708 1487 "MetricGroup": "MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", 1709 1488 "MetricName": "tma_store_bound", 1710 1489 "MetricThreshold": "tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", ··· 1713 1492 }, 1714 1493 { 1715 1494 "BriefDescription": "This metric roughly estimates fraction of cycles when the memory subsystem had loads blocked since they could not forward data from earlier (in program order) overlapping stores", 1716 - "MetricExpr": "13 * LD_BLOCKS.STORE_FORWARD / tma_info_clks", 1495 + "MetricExpr": "13 * LD_BLOCKS.STORE_FORWARD / tma_info_thread_clks", 1717 1496 "MetricGroup": "TopdownL4;tma_L4_group;tma_l1_bound_group", 1718 1497 "MetricName": "tma_store_fwd_blk", 1719 1498 "MetricThreshold": "tma_store_fwd_blk > 0.1 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", ··· 1723 1502 { 1724 1503 "BriefDescription": "This metric estimates fraction of cycles the CPU spent handling L1D store misses", 1725 1504 "MetricConstraint": "NO_GROUP_EVENTS_NMI", 1726 - "MetricExpr": "(L2_RQSTS.RFO_HIT * 11 * (1 - MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES) + (1 - MEM_INST_RETIRED.LOCK_LOADS / 
MEM_INST_RETIRED.ALL_STORES) * min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO)) / tma_info_clks", 1505 + "MetricExpr": "(L2_RQSTS.RFO_HIT * 11 * (1 - MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES) + (1 - MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES) * min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO)) / tma_info_thread_clks", 1727 1506 "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_L4_group;tma_issueRFO;tma_issueSL;tma_store_bound_group", 1728 1507 "MetricName": "tma_store_latency", 1729 1508 "MetricThreshold": "tma_store_latency > 0.1 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", ··· 1732 1511 }, 1733 1512 { 1734 1513 "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port for Store operations", 1735 - "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_4 / tma_info_core_clks", 1514 + "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_4 / tma_info_core_core_clks", 1736 1515 "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group", 1737 1516 "MetricName": "tma_store_op_utilization", 1738 1517 "MetricThreshold": "tma_store_op_utilization > 0.6", ··· 1748 1527 }, 1749 1528 { 1750 1529 "BriefDescription": "This metric estimates the fraction of cycles where the STLB was missed by store accesses, performing a hardware page walk", 1751 - "MetricExpr": "DTLB_STORE_MISSES.WALK_ACTIVE / tma_info_core_clks", 1530 + "MetricExpr": "DTLB_STORE_MISSES.WALK_ACTIVE / tma_info_core_core_clks", 1752 1531 "MetricGroup": "MemoryTLB;TopdownL5;tma_L5_group;tma_dtlb_store_group", 1753 1532 "MetricName": "tma_store_stlb_miss", 1754 1533 "MetricThreshold": "tma_store_stlb_miss > 0.05 & (tma_dtlb_store > 0.05 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))", ··· 1756 1535 }, 1757 1536 { 1758 1537 "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch 
address clears", 1759 - "MetricExpr": "9 * BACLEARS.ANY / tma_info_clks", 1538 + "MetricExpr": "9 * BACLEARS.ANY / tma_info_thread_clks", 1760 1539 "MetricGroup": "BigFoot;FetchLat;TopdownL4;tma_L4_group;tma_branch_resteers_group", 1761 1540 "MetricName": "tma_unknown_branches", 1762 1541 "MetricThreshold": "tma_unknown_branches > 0.05 & (tma_branch_resteers > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15))", ··· 1799 1578 "MetricGroup": "transaction", 1800 1579 "MetricName": "tsx_transactional_cycles", 1801 1580 "ScaleUnit": "100%" 1581 + }, 1582 + { 1583 + "BriefDescription": "Uncore operating frequency in GHz", 1584 + "MetricExpr": "UNC_CHA_CLOCKTICKS / (#num_cores / #num_packages * #num_packages) / 1e9 / duration_time", 1585 + "MetricName": "uncore_frequency", 1586 + "ScaleUnit": "1GHz" 1587 + }, 1588 + { 1589 + "BriefDescription": "Intel(R) Ultra Path Interconnect (UPI) data transmit bandwidth (MB/sec)", 1590 + "MetricExpr": "UNC_UPI_TxL_FLITS.ALL_DATA * 7.111111111111111 / 1e6 / duration_time", 1591 + "MetricName": "upi_data_transmit_bw", 1592 + "ScaleUnit": "1MB/s" 1802 1593 } 1803 1594 ]