Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

perf arm-spe: Support previous branch target (PBT) address

When FEAT_SPE_PBT is implemented, the previous branch target address
(PBT) preceding the sampled operation is recorded.

This commit first introduces a 'prev_br_tgt' field in the record for
saving the PBT address in the decoder.

If the current operation is a branch instruction, combining it with the
PBT creates a chain of two consecutive branches. The branch stack stores
branches in descending order, meaning a newer branch is stored in a lower
entry in the stack. Arm SPE therefore stores the latest branch in the
first entry of the branch stack, and the previous branch coming from PBT
is stored in the second entry.

Otherwise, if the current operation is not a branch, only the previous
branch from PBT is saved. PBT lacks associated information such as the
branch source address, branch type, and events, so the branch entry
fills the corresponding fields with zeros and only sets its target
address.

After:

perf script -f --itrace=bl -F flags,addr,brstack
jcc ffff800080187914 0xffff8000801878fc/0xffff800080187914/P/-/-/1/COND/- 0x0/0xffff8000801878f8/-/-/-/0//-
jcc ffff8000802d12d8 0xffff8000802d12f8/0xffff8000802d12d8/P/-/-/1/COND/- 0x0/0xffff8000802d12ec/-/-/-/0//-
jcc ffff8000813fe200 0xffff8000813fe20c/0xffff8000813fe200/P/-/-/1/COND/- 0x0/0xffff8000813fe200/-/-/-/0//-
jcc ffff8000813fe200 0xffff8000813fe20c/0xffff8000813fe200/P/-/-/1/COND/- 0x0/0xffff8000813fe200/-/-/-/0//-
jmp ffff800081410980 0xffff800081419108/0xffff800081410980/P/-/-/1//- 0x0/0xffff800081419104/-/-/-/0//-
return ffff80008036e064 0xffff80008141ba84/0xffff80008036e064/P/-/-/1/RET/- 0x0/0xffff80008141ba60/-/-/-/0//-
jcc ffff8000803d54f0 0xffff8000803d54e8/0xffff8000803d54f0/P/-/-/1/COND/- 0x0/0xffff8000803d54e0/-/-/-/0//-
jmp ffff80008015e468 0xffff8000803d46dc/0xffff80008015e468/P/-/-/1//- 0x0/0xffff8000803d46c8/-/-/-/0//-
jmp ffff8000806e2d50 0xffff80008040f710/0xffff8000806e2d50/P/-/-/1//- 0x0/0xffff80008040f6e8/-/-/-/0//-
jcc ffff800080721704 0xffff8000807216b4/0xffff800080721704/P/-/-/1/COND/- 0x0/0xffff8000807216ac/-/-/-/0//-

Reviewed-by: Ian Rogers <irogers@google.com>
Reviewed-by: James Clark <james.clark@linaro.org>
Signed-off-by: Leo Yan <leo.yan@arm.com>
Link: https://lore.kernel.org/r/20250304111240.3378214-13-leo.yan@arm.com
Signed-off-by: Namhyung Kim <namhyung@kernel.org>

authored by

Leo Yan and committed by
Namhyung Kim
2cc2f258 73cb57f5

+71 -51
+4 -1
tools/perf/util/arm-spe-decoder/arm-spe-decoder.c
··· 28 28 29 29 /* Instruction virtual address or Branch target address */ 30 30 if (index == SPE_ADDR_PKT_HDR_INDEX_INS || 31 - index == SPE_ADDR_PKT_HDR_INDEX_BRANCH) { 31 + index == SPE_ADDR_PKT_HDR_INDEX_BRANCH || 32 + index == SPE_ADDR_PKT_HDR_INDEX_PREV_BRANCH) { 32 33 ns = SPE_ADDR_PKT_GET_NS(payload); 33 34 el = SPE_ADDR_PKT_GET_EL(payload); 34 35 ··· 182 181 decoder->record.virt_addr = ip; 183 182 else if (idx == SPE_ADDR_PKT_HDR_INDEX_DATA_PHYS) 184 183 decoder->record.phys_addr = ip; 184 + else if (idx == SPE_ADDR_PKT_HDR_INDEX_PREV_BRANCH) 185 + decoder->record.prev_br_tgt = ip; 185 186 break; 186 187 case ARM_SPE_COUNTER: 187 188 if (idx == SPE_CNT_PKT_HDR_INDEX_TOTAL_LAT)
+1
tools/perf/util/arm-spe-decoder/arm-spe-decoder.h
··· 89 89 u32 latency; 90 90 u64 from_ip; 91 91 u64 to_ip; 92 + u64 prev_br_tgt; 92 93 u64 timestamp; 93 94 u64 virt_addr; 94 95 u64 phys_addr;
+66 -50
tools/perf/util/arm-spe.c
··· 237 237 if (spe->synth_opts.last_branch) { 238 238 size_t sz = sizeof(struct branch_stack); 239 239 240 - /* Allocate one entry for TGT */ 241 - sz += sizeof(struct branch_entry); 240 + /* Allocate up to two entries for PBT + TGT */ 241 + sz += sizeof(struct branch_entry) * 242 + min(spe->synth_opts.last_branch_sz, 2U); 242 243 speq->last_branch = zalloc(sz); 243 244 if (!speq->last_branch) 244 245 goto out_free; ··· 363 362 364 363 static void arm_spe__prep_branch_stack(struct arm_spe_queue *speq) 365 364 { 365 + struct arm_spe *spe = speq->spe; 366 366 struct arm_spe_record *record = &speq->decoder->record; 367 367 struct branch_stack *bstack = speq->last_branch; 368 368 struct branch_flags *bs_flags; 369 + unsigned int last_branch_sz = spe->synth_opts.last_branch_sz; 370 + bool have_tgt = !!(speq->flags & PERF_IP_FLAG_BRANCH); 371 + bool have_pbt = last_branch_sz >= (have_tgt + 1U) && record->prev_br_tgt; 369 372 size_t sz = sizeof(struct branch_stack) + 370 - sizeof(struct branch_entry) /* TGT */; 373 + sizeof(struct branch_entry) * min(last_branch_sz, 2U) /* PBT + TGT */; 374 + int i = 0; 371 375 372 376 /* Clean up branch stack */ 373 377 memset(bstack, 0x0, sz); 374 378 375 - if (!(speq->flags & PERF_IP_FLAG_BRANCH)) 379 + if (!have_tgt && !have_pbt) 376 380 return; 377 381 378 - bstack->entries[0].from = record->from_ip; 379 - bstack->entries[0].to = record->to_ip; 382 + if (have_tgt) { 383 + bstack->entries[i].from = record->from_ip; 384 + bstack->entries[i].to = record->to_ip; 380 385 381 - bs_flags = &bstack->entries[0].flags; 382 - bs_flags->value = 0; 386 + bs_flags = &bstack->entries[i].flags; 387 + bs_flags->value = 0; 383 388 384 - if (record->op & ARM_SPE_OP_BR_CR_BL) { 385 - if (record->op & ARM_SPE_OP_BR_COND) 386 - bs_flags->type |= PERF_BR_COND_CALL; 387 - else 388 - bs_flags->type |= PERF_BR_CALL; 389 - /* 390 - * Indirect branch instruction without link (e.g. BR), 391 - * take this case as function return. 
392 - */ 393 - } else if (record->op & ARM_SPE_OP_BR_CR_RET || 394 - record->op & ARM_SPE_OP_BR_INDIRECT) { 395 - if (record->op & ARM_SPE_OP_BR_COND) 396 - bs_flags->type |= PERF_BR_COND_RET; 397 - else 398 - bs_flags->type |= PERF_BR_RET; 399 - } else if (record->op & ARM_SPE_OP_BR_CR_NON_BL_RET) { 400 - if (record->op & ARM_SPE_OP_BR_COND) 401 - bs_flags->type |= PERF_BR_COND; 402 - else 403 - bs_flags->type |= PERF_BR_UNCOND; 404 - } else { 405 - if (record->op & ARM_SPE_OP_BR_COND) 406 - bs_flags->type |= PERF_BR_COND; 407 - else 408 - bs_flags->type |= PERF_BR_UNKNOWN; 389 + if (record->op & ARM_SPE_OP_BR_CR_BL) { 390 + if (record->op & ARM_SPE_OP_BR_COND) 391 + bs_flags->type |= PERF_BR_COND_CALL; 392 + else 393 + bs_flags->type |= PERF_BR_CALL; 394 + /* 395 + * Indirect branch instruction without link (e.g. BR), 396 + * take this case as function return. 397 + */ 398 + } else if (record->op & ARM_SPE_OP_BR_CR_RET || 399 + record->op & ARM_SPE_OP_BR_INDIRECT) { 400 + if (record->op & ARM_SPE_OP_BR_COND) 401 + bs_flags->type |= PERF_BR_COND_RET; 402 + else 403 + bs_flags->type |= PERF_BR_RET; 404 + } else if (record->op & ARM_SPE_OP_BR_CR_NON_BL_RET) { 405 + if (record->op & ARM_SPE_OP_BR_COND) 406 + bs_flags->type |= PERF_BR_COND; 407 + else 408 + bs_flags->type |= PERF_BR_UNCOND; 409 + } else { 410 + if (record->op & ARM_SPE_OP_BR_COND) 411 + bs_flags->type |= PERF_BR_COND; 412 + else 413 + bs_flags->type |= PERF_BR_UNKNOWN; 414 + } 415 + 416 + if (record->type & ARM_SPE_BRANCH_MISS) { 417 + bs_flags->mispred = 1; 418 + bs_flags->predicted = 0; 419 + } else { 420 + bs_flags->mispred = 0; 421 + bs_flags->predicted = 1; 422 + } 423 + 424 + if (record->type & ARM_SPE_BRANCH_NOT_TAKEN) 425 + bs_flags->not_taken = 1; 426 + 427 + if (record->type & ARM_SPE_IN_TXN) 428 + bs_flags->in_tx = 1; 429 + 430 + bs_flags->cycles = min(record->latency, 0xFFFFU); 431 + i++; 409 432 } 410 433 411 - if (record->type & ARM_SPE_BRANCH_MISS) { 412 - bs_flags->mispred = 1; 413 - 
bs_flags->predicted = 0; 414 - } else { 415 - bs_flags->mispred = 0; 416 - bs_flags->predicted = 1; 434 + if (have_pbt) { 435 + bs_flags = &bstack->entries[i].flags; 436 + bs_flags->type |= PERF_BR_UNKNOWN; 437 + bstack->entries[i].to = record->prev_br_tgt; 438 + i++; 417 439 } 418 440 419 - if (record->type & ARM_SPE_BRANCH_NOT_TAKEN) 420 - bs_flags->not_taken = 1; 421 - 422 - if (record->type & ARM_SPE_IN_TXN) 423 - bs_flags->in_tx = 1; 424 - 425 - bs_flags->cycles = min(record->latency, 0xFFFFU); 426 - 427 - bstack->nr = 1; 441 + bstack->nr = i; 428 442 bstack->hw_idx = -1ULL; 429 443 } 430 444 ··· 1600 1584 } 1601 1585 1602 1586 if (spe->synth_opts.last_branch) { 1603 - if (spe->synth_opts.last_branch_sz > 1) 1604 - pr_debug("Arm SPE supports only one bstack entry (TGT).\n"); 1587 + if (spe->synth_opts.last_branch_sz > 2) 1588 + pr_debug("Arm SPE supports only two bstack entries (PBT+TGT).\n"); 1605 1589 1606 1590 attr.sample_type |= PERF_SAMPLE_BRANCH_STACK; 1607 1591 /*