Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

perf arm-spe: Use SPE data source for Neoverse cores

When synthesizing data from SPE, augment the type with source information
for Arm Neoverse cores. The data source field is IMPLEMENTATION DEFINED,
but the Neoverse cores all use the same encoding. I can't find encoding
information for any other SPE implementations to unify their choices with
Arm's, so that is left for future work.

This change populates the mem_lvl_num for Neoverse cores as well as the
deprecated mem_lvl namespace.

Reviewed-by: German Gomez <german.gomez@arm.com>
Reviewed-by: Leo Yan <leo.yan@linaro.org>
Signed-off-by: Ali Saidi <alisaidi@amazon.com>
Tested-by: Leo Yan <leo.yan@linaro.org>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Gustavo A. R. Silva <gustavoars@kernel.org>
Cc: Ian Rogers <irogers@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@arm.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: John Garry <john.garry@huawei.com>
Cc: Kajol Jain <kjain@linux.ibm.com>
Cc: Like Xu <likexu@tencent.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mike Leach <mike.leach@linaro.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Timothy Hayes <timothy.hayes@arm.com>
Cc: Will Deacon <will@kernel.org>
Cc: linux-arm-kernel@lists.infradead.org
Link: https://lore.kernel.org/r/20220811062451.435810-4-leo.yan@linaro.org
Signed-off-by: Leo Yan <leo.yan@linaro.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

authored by

Ali Saidi and committed by
Arnaldo Carvalho de Melo
4e6430cb f78d6250

+131 -20
+1
tools/perf/util/arm-spe-decoder/arm-spe-decoder.c
··· 220 220 221 221 break; 222 222 case ARM_SPE_DATA_SOURCE: 223 + decoder->record.source = payload; 223 224 break; 224 225 case ARM_SPE_BAD: 225 226 break;
+12
tools/perf/util/arm-spe-decoder/arm-spe-decoder.h
··· 29 29 ARM_SPE_ST = 1 << 1, 30 30 }; 31 31 32 + enum arm_spe_neoverse_data_source { 33 + ARM_SPE_NV_L1D = 0x0, 34 + ARM_SPE_NV_L2 = 0x8, 35 + ARM_SPE_NV_PEER_CORE = 0x9, 36 + ARM_SPE_NV_LOCAL_CLUSTER = 0xa, 37 + ARM_SPE_NV_SYS_CACHE = 0xb, 38 + ARM_SPE_NV_PEER_CLUSTER = 0xc, 39 + ARM_SPE_NV_REMOTE = 0xd, 40 + ARM_SPE_NV_DRAM = 0xe, 41 + }; 42 + 32 43 struct arm_spe_record { 33 44 enum arm_spe_sample_type type; 34 45 int err; ··· 51 40 u64 virt_addr; 52 41 u64 phys_addr; 53 42 u64 context_id; 43 + u16 source; 54 44 }; 55 45 56 46 struct arm_spe_insn;
+118 -20
tools/perf/util/arm-spe.c
··· 34 34 #include "arm-spe-decoder/arm-spe-decoder.h" 35 35 #include "arm-spe-decoder/arm-spe-pkt-decoder.h" 36 36 37 + #include "../../arch/arm64/include/asm/cputype.h" 37 38 #define MAX_TIMESTAMP (~0ULL) 38 39 39 40 struct arm_spe { ··· 46 45 struct perf_session *session; 47 46 struct machine *machine; 48 47 u32 pmu_type; 48 + u64 midr; 49 49 50 50 struct perf_tsc_conversion tc; 51 51 ··· 389 387 return arm_spe_deliver_synth_event(spe, speq, event, &sample); 390 388 } 391 389 392 - static u64 arm_spe__synth_data_source(const struct arm_spe_record *record) 390 + static const struct midr_range neoverse_spe[] = { 391 + MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N1), 392 + MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N2), 393 + MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V1), 394 + {}, 395 + }; 396 + 397 + static void arm_spe__synth_data_source_neoverse(const struct arm_spe_record *record, 398 + union perf_mem_data_src *data_src) 399 + { 400 + /* 401 + * Even though four levels of cache hierarchy are possible, no known 402 + * production Neoverse systems currently include more than three levels 403 + * so for the time being we assume three exist. If a production system 404 + * is built with four the this function would have to be changed to 405 + * detect the number of levels for reporting. 406 + */ 407 + 408 + /* 409 + * We have no data on the hit level or data source for stores in the 410 + * Neoverse SPE records. 
411 + */ 412 + if (record->op & ARM_SPE_ST) { 413 + data_src->mem_lvl = PERF_MEM_LVL_NA; 414 + data_src->mem_lvl_num = PERF_MEM_LVLNUM_NA; 415 + data_src->mem_snoop = PERF_MEM_SNOOP_NA; 416 + return; 417 + } 418 + 419 + switch (record->source) { 420 + case ARM_SPE_NV_L1D: 421 + data_src->mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_HIT; 422 + data_src->mem_lvl_num = PERF_MEM_LVLNUM_L1; 423 + data_src->mem_snoop = PERF_MEM_SNOOP_NONE; 424 + break; 425 + case ARM_SPE_NV_L2: 426 + data_src->mem_lvl = PERF_MEM_LVL_L2 | PERF_MEM_LVL_HIT; 427 + data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2; 428 + data_src->mem_snoop = PERF_MEM_SNOOP_NONE; 429 + break; 430 + case ARM_SPE_NV_PEER_CORE: 431 + data_src->mem_lvl = PERF_MEM_LVL_L2 | PERF_MEM_LVL_HIT; 432 + data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2; 433 + data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER; 434 + break; 435 + /* 436 + * We don't know if this is L1, L2 but we do know it was a cache-2-cache 437 + * transfer, so set SNOOPX_PEER 438 + */ 439 + case ARM_SPE_NV_LOCAL_CLUSTER: 440 + case ARM_SPE_NV_PEER_CLUSTER: 441 + data_src->mem_lvl = PERF_MEM_LVL_L3 | PERF_MEM_LVL_HIT; 442 + data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3; 443 + data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER; 444 + break; 445 + /* 446 + * System cache is assumed to be L3 447 + */ 448 + case ARM_SPE_NV_SYS_CACHE: 449 + data_src->mem_lvl = PERF_MEM_LVL_L3 | PERF_MEM_LVL_HIT; 450 + data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3; 451 + data_src->mem_snoop = PERF_MEM_SNOOP_HIT; 452 + break; 453 + /* 454 + * We don't know what level it hit in, except it came from the other 455 + * socket 456 + */ 457 + case ARM_SPE_NV_REMOTE: 458 + data_src->mem_lvl = PERF_MEM_LVL_REM_CCE1; 459 + data_src->mem_lvl_num = PERF_MEM_LVLNUM_ANY_CACHE; 460 + data_src->mem_remote = PERF_MEM_REMOTE_REMOTE; 461 + data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER; 462 + break; 463 + case ARM_SPE_NV_DRAM: 464 + data_src->mem_lvl = PERF_MEM_LVL_LOC_RAM | PERF_MEM_LVL_HIT; 465 + data_src->mem_lvl_num = 
PERF_MEM_LVLNUM_RAM; 466 + data_src->mem_snoop = PERF_MEM_SNOOP_NONE; 467 + break; 468 + default: 469 + break; 470 + } 471 + } 472 + 473 + static void arm_spe__synth_data_source_generic(const struct arm_spe_record *record, 474 + union perf_mem_data_src *data_src) 475 + { 476 + if (record->type & (ARM_SPE_LLC_ACCESS | ARM_SPE_LLC_MISS)) { 477 + data_src->mem_lvl = PERF_MEM_LVL_L3; 478 + 479 + if (record->type & ARM_SPE_LLC_MISS) 480 + data_src->mem_lvl |= PERF_MEM_LVL_MISS; 481 + else 482 + data_src->mem_lvl |= PERF_MEM_LVL_HIT; 483 + } else if (record->type & (ARM_SPE_L1D_ACCESS | ARM_SPE_L1D_MISS)) { 484 + data_src->mem_lvl = PERF_MEM_LVL_L1; 485 + 486 + if (record->type & ARM_SPE_L1D_MISS) 487 + data_src->mem_lvl |= PERF_MEM_LVL_MISS; 488 + else 489 + data_src->mem_lvl |= PERF_MEM_LVL_HIT; 490 + } 491 + 492 + if (record->type & ARM_SPE_REMOTE_ACCESS) 493 + data_src->mem_lvl |= PERF_MEM_LVL_REM_CCE1; 494 + } 495 + 496 + static u64 arm_spe__synth_data_source(const struct arm_spe_record *record, u64 midr) 393 497 { 394 498 union perf_mem_data_src data_src = { 0 }; 499 + bool is_neoverse = is_midr_in_range(midr, neoverse_spe); 395 500 396 501 if (record->op == ARM_SPE_LD) 397 502 data_src.mem_op = PERF_MEM_OP_LOAD; ··· 507 398 else 508 399 return 0; 509 400 510 - if (record->type & (ARM_SPE_LLC_ACCESS | ARM_SPE_LLC_MISS)) { 511 - data_src.mem_lvl = PERF_MEM_LVL_L3; 512 - 513 - if (record->type & ARM_SPE_LLC_MISS) 514 - data_src.mem_lvl |= PERF_MEM_LVL_MISS; 515 - else 516 - data_src.mem_lvl |= PERF_MEM_LVL_HIT; 517 - } else if (record->type & (ARM_SPE_L1D_ACCESS | ARM_SPE_L1D_MISS)) { 518 - data_src.mem_lvl = PERF_MEM_LVL_L1; 519 - 520 - if (record->type & ARM_SPE_L1D_MISS) 521 - data_src.mem_lvl |= PERF_MEM_LVL_MISS; 522 - else 523 - data_src.mem_lvl |= PERF_MEM_LVL_HIT; 524 - } 525 - 526 - if (record->type & ARM_SPE_REMOTE_ACCESS) 527 - data_src.mem_lvl |= PERF_MEM_LVL_REM_CCE1; 401 + if (is_neoverse) 402 + arm_spe__synth_data_source_neoverse(record, &data_src); 
403 + else 404 + arm_spe__synth_data_source_generic(record, &data_src); 528 405 529 406 if (record->type & (ARM_SPE_TLB_ACCESS | ARM_SPE_TLB_MISS)) { 530 407 data_src.mem_dtlb = PERF_MEM_TLB_WK; ··· 531 436 u64 data_src; 532 437 int err; 533 438 534 - data_src = arm_spe__synth_data_source(record); 439 + data_src = arm_spe__synth_data_source(record, spe->midr); 535 440 536 441 if (spe->sample_flc) { 537 442 if (record->type & ARM_SPE_L1D_MISS) { ··· 1273 1178 struct perf_record_auxtrace_info *auxtrace_info = &event->auxtrace_info; 1274 1179 size_t min_sz = sizeof(u64) * ARM_SPE_AUXTRACE_PRIV_MAX; 1275 1180 struct perf_record_time_conv *tc = &session->time_conv; 1181 + const char *cpuid = perf_env__cpuid(session->evlist->env); 1182 + u64 midr = strtol(cpuid, NULL, 16); 1276 1183 struct arm_spe *spe; 1277 1184 int err; 1278 1185 ··· 1294 1197 spe->machine = &session->machines.host; /* No kvm support */ 1295 1198 spe->auxtrace_type = auxtrace_info->type; 1296 1199 spe->pmu_type = auxtrace_info->priv[ARM_SPE_PMU_TYPE]; 1200 + spe->midr = midr; 1297 1201 1298 1202 spe->timeless_decoding = arm_spe__is_timeless_decoding(spe); 1299 1203