utils/telemetry/perfettoTracing.ts at main · nonbinary.computer/claude-code

nonbinary.computer / claude-code
forked from oppi.li/claude-code
fork atom
source dump of claude code
fork atom
claude-code / utils / telemetry / perfettoTracing.ts
at main 1120 lines 30 kB view raw
wrap content
oppi.li dump from zip 15d ago
63aada3f
   1/**
   2 * Perfetto Tracing for Claude Code (Ant-only)
   3 *
   4 * This module generates traces in the Chrome Trace Event format that can be
   5 * viewed in ui.perfetto.dev or Chrome's chrome://tracing.
   6 *
   7 * NOTE: This feature is ant-only and eliminated from external builds.
   8 *
   9 * The trace file includes:
  10 * - Agent hierarchy (parent-child relationships in a swarm)
  11 * - API requests with TTFT, TTLT, prompt length, cache stats, msg ID, speculative flag
  12 * - Tool executions with name, duration, and token usage
  13 * - User input waiting time
  14 *
  15 * Usage:
  16 * 1. Enable via CLAUDE_CODE_PERFETTO_TRACE=1 or CLAUDE_CODE_PERFETTO_TRACE=<path>
  17 * 2. Optionally set CLAUDE_CODE_PERFETTO_WRITE_INTERVAL_S=<positive integer> to write the
  18 *    trace file periodically (default: write only on exit).
  19 * 3. Run Claude Code normally
  20 * 4. Trace file is written to ~/.claude/traces/trace-<session-id>.json
  21 *    or to the specified path
  22 * 5. Open in ui.perfetto.dev to visualize
  23 */
  24
  25import { feature } from 'bun:bundle'
  26import { mkdirSync, writeFileSync } from 'fs'
  27import { mkdir, writeFile } from 'fs/promises'
  28import { dirname, join } from 'path'
  29import { getSessionId } from '../../bootstrap/state.js'
  30import { registerCleanup } from '../cleanupRegistry.js'
  31import { logForDebugging } from '../debug.js'
  32import {
  33  getClaudeConfigHomeDir,
  34  isEnvDefinedFalsy,
  35  isEnvTruthy,
  36} from '../envUtils.js'
  37import { errorMessage } from '../errors.js'
  38import { djb2Hash } from '../hash.js'
  39import { jsonStringify } from '../slowOperations.js'
  40import { getAgentId, getAgentName, getParentSessionId } from '../teammate.js'
  41
  42/**
  43 * Chrome Trace Event format types
  44 * See: https://docs.google.com/document/d/1CvAClvFfyA5R-PhYUmn5OOQtYMH4h6I0nSsKchNAySU
  45 */
  46
  47export type TraceEventPhase =
  48  | 'B' // Begin duration event
  49  | 'E' // End duration event
  50  | 'X' // Complete event (with duration)
  51  | 'i' // Instant event
  52  | 'C' // Counter event
  53  | 'b' // Async begin
  54  | 'n' // Async instant
  55  | 'e' // Async end
  56  | 'M' // Metadata event
  57
  58export type TraceEvent = {
  59  name: string
  60  cat: string
  61  ph: TraceEventPhase
  62  ts: number // Timestamp in microseconds
  63  pid: number // Process ID (we use 1 for main, agent IDs for subagents)
  64  tid: number // Thread ID (we use numeric hash of agent name or 1 for main)
  65  dur?: number // Duration in microseconds (for 'X' events)
  66  args?: Record<string, unknown>
  67  id?: string // For async events
  68  scope?: string
  69}
  70
  71/**
  72 * Agent info for tracking hierarchy
  73 */
  74type AgentInfo = {
  75  agentId: string
  76  agentName: string
  77  parentAgentId?: string
  78  processId: number
  79  threadId: number
  80}
  81
  82/**
  83 * Pending span for tracking begin/end pairs
  84 */
  85type PendingSpan = {
  86  name: string
  87  category: string
  88  startTime: number
  89  agentInfo: AgentInfo
  90  args: Record<string, unknown>
  91}
  92
  93// Global state for the Perfetto tracer
  94let isEnabled = false
  95let tracePath: string | null = null
  96// Metadata events (ph: 'M' — process/thread names, parent links) are kept
  97// separate so they survive eviction — Perfetto UI needs them to label
  98// tracks. Bounded by agent count (~3 events per agent).
  99const metadataEvents: TraceEvent[] = []
 100const events: TraceEvent[] = []
 101// events[] cap. Cron-driven sessions run for days; 22 push sites × many
 102// turns would otherwise grow unboundedly (periodicWrite flushes to disk but
 103// does not truncate — it writes the full snapshot). At ~300B/event this is
 104// ~30MB, enough trace history for any debugging session. Eviction drops the
 105// oldest half when hit, amortized O(1).
 106const MAX_EVENTS = 100_000
 107const pendingSpans = new Map<string, PendingSpan>()
 108const agentRegistry = new Map<string, AgentInfo>()
 109let totalAgentCount = 0
 110let startTimeMs = 0
 111let spanIdCounter = 0
 112let traceWritten = false // Flag to avoid double writes
 113
 114// Map agent IDs to numeric process IDs (Perfetto requires numeric IDs)
 115let processIdCounter = 1
 116const agentIdToProcessId = new Map<string, number>()
 117
 118// Periodic write interval handle
 119let writeIntervalId: ReturnType<typeof setInterval> | null = null
 120
 121const STALE_SPAN_TTL_MS = 30 * 60 * 1000 // 30 minutes
 122const STALE_SPAN_CLEANUP_INTERVAL_MS = 60 * 1000 // 1 minute
 123let staleSpanCleanupId: ReturnType<typeof setInterval> | null = null
 124
 125/**
 126 * Convert a string to a numeric hash for use as thread ID
 127 */
 128function stringToNumericHash(str: string): number {
 129  return Math.abs(djb2Hash(str)) || 1 // Ensure non-zero
 130}
 131
 132/**
 133 * Get or create a numeric process ID for an agent
 134 */
 135function getProcessIdForAgent(agentId: string): number {
 136  const existing = agentIdToProcessId.get(agentId)
 137  if (existing !== undefined) return existing
 138
 139  processIdCounter++
 140  agentIdToProcessId.set(agentId, processIdCounter)
 141  return processIdCounter
 142}
 143
 144/**
 145 * Get current agent info
 146 */
 147function getCurrentAgentInfo(): AgentInfo {
 148  const agentId = getAgentId() ?? getSessionId()
 149  const agentName = getAgentName() ?? 'main'
 150  const parentSessionId = getParentSessionId()
 151
 152  // Check if we've already registered this agent
 153  const existing = agentRegistry.get(agentId)
 154  if (existing) return existing
 155
 156  const info: AgentInfo = {
 157    agentId,
 158    agentName,
 159    parentAgentId: parentSessionId,
 160    processId: agentId === getSessionId() ? 1 : getProcessIdForAgent(agentId),
 161    threadId: stringToNumericHash(agentName),
 162  }
 163
 164  agentRegistry.set(agentId, info)
 165  totalAgentCount++
 166  return info
 167}
 168
 169/**
 170 * Get timestamp in microseconds relative to trace start
 171 */
 172function getTimestamp(): number {
 173  return (Date.now() - startTimeMs) * 1000
 174}
 175
 176/**
 177 * Generate a unique span ID
 178 */
 179function generateSpanId(): string {
 180  return `span_${++spanIdCounter}`
 181}
 182
 183/**
 184 * Evict pending spans older than STALE_SPAN_TTL_MS.
 185 * Mirrors the TTL cleanup pattern in sessionTracing.ts.
 186 */
 187function evictStaleSpans(): void {
 188  const now = getTimestamp()
 189  const ttlUs = STALE_SPAN_TTL_MS * 1000 // Convert ms to microseconds
 190  for (const [spanId, span] of pendingSpans) {
 191    if (now - span.startTime > ttlUs) {
 192      // Emit an end event so the span shows up in the trace as incomplete
 193      events.push({
 194        name: span.name,
 195        cat: span.category,
 196        ph: 'E',
 197        ts: now,
 198        pid: span.agentInfo.processId,
 199        tid: span.agentInfo.threadId,
 200        args: {
 201          ...span.args,
 202          evicted: true,
 203          duration_ms: (now - span.startTime) / 1000,
 204        },
 205      })
 206      pendingSpans.delete(spanId)
 207    }
 208  }
 209}
 210
 211/**
 212 * Build the full trace document (Chrome Trace JSON format).
 213 */
 214function buildTraceDocument(): string {
 215  return jsonStringify({
 216    traceEvents: [...metadataEvents, ...events],
 217    metadata: {
 218      session_id: getSessionId(),
 219      trace_start_time: new Date(startTimeMs).toISOString(),
 220      agent_count: totalAgentCount,
 221      total_event_count: metadataEvents.length + events.length,
 222    },
 223  })
 224}
 225
 226/**
 227 * Drop the oldest half of events[] when over MAX_EVENTS. Called from the
 228 * stale-span cleanup interval (60s). The half-batch splice keeps this
 229 * amortized O(1) — we don't pay splice cost per-push. A synthetic marker
 230 * is inserted so the gap is visible in ui.perfetto.dev.
 231 */
 232function evictOldestEvents(): void {
 233  if (events.length < MAX_EVENTS) return
 234  const dropped = events.splice(0, MAX_EVENTS / 2)
 235  events.unshift({
 236    name: 'trace_truncated',
 237    cat: '__metadata',
 238    ph: 'i',
 239    ts: dropped[dropped.length - 1]?.ts ?? 0,
 240    pid: 1,
 241    tid: 0,
 242    args: { dropped_events: dropped.length },
 243  })
 244  logForDebugging(
 245    `[Perfetto] Evicted ${dropped.length} oldest events (cap ${MAX_EVENTS})`,
 246  )
 247}
 248
 249/**
 250 * Initialize Perfetto tracing
 251 * Call this early in the application lifecycle
 252 */
 253export function initializePerfettoTracing(): void {
 254  const envValue = process.env.CLAUDE_CODE_PERFETTO_TRACE
 255  logForDebugging(
 256    `[Perfetto] initializePerfettoTracing called, env value: ${envValue}`,
 257  )
 258
 259  // Wrap in feature() for dead code elimination - entire block removed from external builds
 260  if (feature('PERFETTO_TRACING')) {
 261    if (!envValue || isEnvDefinedFalsy(envValue)) {
 262      logForDebugging(
 263        '[Perfetto] Tracing disabled (env var not set or disabled)',
 264      )
 265      return
 266    }
 267
 268    isEnabled = true
 269    startTimeMs = Date.now()
 270
 271    // Determine trace file path
 272    if (isEnvTruthy(envValue)) {
 273      const tracesDir = join(getClaudeConfigHomeDir(), 'traces')
 274      tracePath = join(tracesDir, `trace-${getSessionId()}.json`)
 275    } else {
 276      // Use the provided path
 277      tracePath = envValue
 278    }
 279
 280    logForDebugging(
 281      `[Perfetto] Tracing enabled, will write to: ${tracePath}, isEnabled=${isEnabled}`,
 282    )
 283
 284    // Start periodic full-trace write if CLAUDE_CODE_PERFETTO_WRITE_INTERVAL_S is a positive integer
 285    const intervalSec = parseInt(
 286      process.env.CLAUDE_CODE_PERFETTO_WRITE_INTERVAL_S ?? '',
 287      10,
 288    )
 289    if (intervalSec > 0) {
 290      writeIntervalId = setInterval(() => {
 291        void periodicWrite()
 292      }, intervalSec * 1000)
 293      // Don't let the interval keep the process alive on its own
 294      if (writeIntervalId.unref) writeIntervalId.unref()
 295      logForDebugging(
 296        `[Perfetto] Periodic write enabled, interval: ${intervalSec}s`,
 297      )
 298    }
 299
 300    // Start stale span cleanup interval
 301    staleSpanCleanupId = setInterval(() => {
 302      evictStaleSpans()
 303      evictOldestEvents()
 304    }, STALE_SPAN_CLEANUP_INTERVAL_MS)
 305    if (staleSpanCleanupId.unref) staleSpanCleanupId.unref()
 306
 307    // Register cleanup to write final trace on exit
 308    registerCleanup(async () => {
 309      logForDebugging('[Perfetto] Cleanup callback invoked')
 310      await writePerfettoTrace()
 311    })
 312
 313    // Also register a beforeExit handler as a fallback
 314    // This ensures the trace is written even if cleanup registry is not called
 315    process.on('beforeExit', () => {
 316      logForDebugging('[Perfetto] beforeExit handler invoked')
 317      void writePerfettoTrace()
 318    })
 319
 320    // Register a synchronous exit handler as a last resort
 321    // This is the final fallback to ensure trace is written before process exits
 322    process.on('exit', () => {
 323      if (!traceWritten) {
 324        logForDebugging(
 325          '[Perfetto] exit handler invoked, writing trace synchronously',
 326        )
 327        writePerfettoTraceSync()
 328      }
 329    })
 330
 331    // Emit process metadata events for main process
 332    const mainAgent = getCurrentAgentInfo()
 333    emitProcessMetadata(mainAgent)
 334  }
 335}
 336
 337/**
 338 * Emit metadata events for a process/agent
 339 */
 340function emitProcessMetadata(agentInfo: AgentInfo): void {
 341  if (!isEnabled) return
 342
 343  // Process name
 344  metadataEvents.push({
 345    name: 'process_name',
 346    cat: '__metadata',
 347    ph: 'M',
 348    ts: 0,
 349    pid: agentInfo.processId,
 350    tid: 0,
 351    args: { name: agentInfo.agentName },
 352  })
 353
 354  // Thread name (same as process for now)
 355  metadataEvents.push({
 356    name: 'thread_name',
 357    cat: '__metadata',
 358    ph: 'M',
 359    ts: 0,
 360    pid: agentInfo.processId,
 361    tid: agentInfo.threadId,
 362    args: { name: agentInfo.agentName },
 363  })
 364
 365  // Add parent info if available
 366  if (agentInfo.parentAgentId) {
 367    metadataEvents.push({
 368      name: 'parent_agent',
 369      cat: '__metadata',
 370      ph: 'M',
 371      ts: 0,
 372      pid: agentInfo.processId,
 373      tid: 0,
 374      args: {
 375        parent_agent_id: agentInfo.parentAgentId,
 376      },
 377    })
 378  }
 379}
 380
 381/**
 382 * Check if Perfetto tracing is enabled
 383 */
 384export function isPerfettoTracingEnabled(): boolean {
 385  return isEnabled
 386}
 387
 388/**
 389 * Register a new agent in the trace
 390 * Call this when a subagent/teammate is spawned
 391 */
 392export function registerAgent(
 393  agentId: string,
 394  agentName: string,
 395  parentAgentId?: string,
 396): void {
 397  if (!isEnabled) return
 398
 399  const info: AgentInfo = {
 400    agentId,
 401    agentName,
 402    parentAgentId,
 403    processId: getProcessIdForAgent(agentId),
 404    threadId: stringToNumericHash(agentName),
 405  }
 406
 407  agentRegistry.set(agentId, info)
 408  totalAgentCount++
 409  emitProcessMetadata(info)
 410}
 411
 412/**
 413 * Unregister an agent from the trace.
 414 * Call this when an agent completes, fails, or is aborted to free memory.
 415 */
 416export function unregisterAgent(agentId: string): void {
 417  if (!isEnabled) return
 418  agentRegistry.delete(agentId)
 419  agentIdToProcessId.delete(agentId)
 420}
 421
 422/**
 423 * Start an API call span
 424 */
 425export function startLLMRequestPerfettoSpan(args: {
 426  model: string
 427  promptTokens?: number
 428  messageId?: string
 429  isSpeculative?: boolean
 430  querySource?: string
 431}): string {
 432  if (!isEnabled) return ''
 433
 434  const spanId = generateSpanId()
 435  const agentInfo = getCurrentAgentInfo()
 436
 437  pendingSpans.set(spanId, {
 438    name: 'API Call',
 439    category: 'api',
 440    startTime: getTimestamp(),
 441    agentInfo,
 442    args: {
 443      model: args.model,
 444      prompt_tokens: args.promptTokens,
 445      message_id: args.messageId,
 446      is_speculative: args.isSpeculative ?? false,
 447      query_source: args.querySource,
 448    },
 449  })
 450
 451  // Emit begin event
 452  events.push({
 453    name: 'API Call',
 454    cat: 'api',
 455    ph: 'B',
 456    ts: pendingSpans.get(spanId)!.startTime,
 457    pid: agentInfo.processId,
 458    tid: agentInfo.threadId,
 459    args: pendingSpans.get(spanId)!.args,
 460  })
 461
 462  return spanId
 463}
 464
 465/**
 466 * End an API call span with response metadata
 467 */
 468export function endLLMRequestPerfettoSpan(
 469  spanId: string,
 470  metadata: {
 471    ttftMs?: number
 472    ttltMs?: number
 473    promptTokens?: number
 474    outputTokens?: number
 475    cacheReadTokens?: number
 476    cacheCreationTokens?: number
 477    messageId?: string
 478    success?: boolean
 479    error?: string
 480    /** Time spent in pre-request setup (client creation, retries) before the successful attempt */
 481    requestSetupMs?: number
 482    /** Timestamps (Date.now()) of each attempt start — used to emit retry sub-spans */
 483    attemptStartTimes?: number[]
 484  },
 485): void {
 486  if (!isEnabled || !spanId) return
 487
 488  const pending = pendingSpans.get(spanId)
 489  if (!pending) return
 490
 491  const endTime = getTimestamp()
 492  const duration = endTime - pending.startTime
 493
 494  const promptTokens =
 495    metadata.promptTokens ?? (pending.args.prompt_tokens as number | undefined)
 496  const ttftMs = metadata.ttftMs
 497  const ttltMs = metadata.ttltMs
 498  const outputTokens = metadata.outputTokens
 499  const cacheReadTokens = metadata.cacheReadTokens
 500
 501  // Compute derived metrics
 502  // ITPS: input tokens per second (prompt processing speed)
 503  const itps =
 504    ttftMs !== undefined && promptTokens !== undefined && ttftMs > 0
 505      ? Math.round((promptTokens / (ttftMs / 1000)) * 100) / 100
 506      : undefined
 507
 508  // OTPS: output tokens per second (sampling speed)
 509  const samplingMs =
 510    ttltMs !== undefined && ttftMs !== undefined ? ttltMs - ttftMs : undefined
 511  const otps =
 512    samplingMs !== undefined && outputTokens !== undefined && samplingMs > 0
 513      ? Math.round((outputTokens / (samplingMs / 1000)) * 100) / 100
 514      : undefined
 515
 516  // Cache hit rate: percentage of prompt tokens from cache
 517  const cacheHitRate =
 518    cacheReadTokens !== undefined &&
 519    promptTokens !== undefined &&
 520    promptTokens > 0
 521      ? Math.round((cacheReadTokens / promptTokens) * 10000) / 100
 522      : undefined
 523
 524  const requestSetupMs = metadata.requestSetupMs
 525  const attemptStartTimes = metadata.attemptStartTimes
 526
 527  // Merge metadata with original args
 528  const args = {
 529    ...pending.args,
 530    ttft_ms: ttftMs,
 531    ttlt_ms: ttltMs,
 532    prompt_tokens: promptTokens,
 533    output_tokens: outputTokens,
 534    cache_read_tokens: cacheReadTokens,
 535    cache_creation_tokens: metadata.cacheCreationTokens,
 536    message_id: metadata.messageId ?? pending.args.message_id,
 537    success: metadata.success ?? true,
 538    error: metadata.error,
 539    duration_ms: duration / 1000,
 540    request_setup_ms: requestSetupMs,
 541    // Derived metrics
 542    itps,
 543    otps,
 544    cache_hit_rate_pct: cacheHitRate,
 545  }
 546
 547  // Emit Request Setup sub-span when there was measurable setup time
 548  // (client creation, param building, retries before the successful attempt)
 549  const setupUs =
 550    requestSetupMs !== undefined && requestSetupMs > 0
 551      ? requestSetupMs * 1000
 552      : 0
 553  if (setupUs > 0) {
 554    const setupEndTs = pending.startTime + setupUs
 555
 556    events.push({
 557      name: 'Request Setup',
 558      cat: 'api,setup',
 559      ph: 'B',
 560      ts: pending.startTime,
 561      pid: pending.agentInfo.processId,
 562      tid: pending.agentInfo.threadId,
 563      args: {
 564        request_setup_ms: requestSetupMs,
 565        attempt_count: attemptStartTimes?.length ?? 1,
 566      },
 567    })
 568
 569    // Emit retry attempt sub-spans within Request Setup.
 570    // Each failed attempt runs from its start to the next attempt's start.
 571    if (attemptStartTimes && attemptStartTimes.length > 1) {
 572      // attemptStartTimes[0] is the reference point (first attempt).
 573      // Convert wall-clock deltas into Perfetto-relative microseconds.
 574      const baseWallMs = attemptStartTimes[0]!
 575      for (let i = 0; i < attemptStartTimes.length - 1; i++) {
 576        const attemptStartUs =
 577          pending.startTime + (attemptStartTimes[i]! - baseWallMs) * 1000
 578        const attemptEndUs =
 579          pending.startTime + (attemptStartTimes[i + 1]! - baseWallMs) * 1000
 580
 581        events.push({
 582          name: `Attempt ${i + 1} (retry)`,
 583          cat: 'api,retry',
 584          ph: 'B',
 585          ts: attemptStartUs,
 586          pid: pending.agentInfo.processId,
 587          tid: pending.agentInfo.threadId,
 588          args: { attempt: i + 1 },
 589        })
 590        events.push({
 591          name: `Attempt ${i + 1} (retry)`,
 592          cat: 'api,retry',
 593          ph: 'E',
 594          ts: attemptEndUs,
 595          pid: pending.agentInfo.processId,
 596          tid: pending.agentInfo.threadId,
 597        })
 598      }
 599    }
 600
 601    events.push({
 602      name: 'Request Setup',
 603      cat: 'api,setup',
 604      ph: 'E',
 605      ts: setupEndTs,
 606      pid: pending.agentInfo.processId,
 607      tid: pending.agentInfo.threadId,
 608    })
 609  }
 610
 611  // Emit sub-spans for First Token and Sampling phases (before API Call end)
 612  // Using B/E pairs in proper nesting order for correct Perfetto visualization
 613  if (ttftMs !== undefined) {
 614    // First Token starts after request setup (if any)
 615    const firstTokenStartTs = pending.startTime + setupUs
 616    const firstTokenEndTs = firstTokenStartTs + ttftMs * 1000
 617
 618    // First Token phase: from successful attempt start to first token
 619    events.push({
 620      name: 'First Token',
 621      cat: 'api,ttft',
 622      ph: 'B',
 623      ts: firstTokenStartTs,
 624      pid: pending.agentInfo.processId,
 625      tid: pending.agentInfo.threadId,
 626      args: {
 627        ttft_ms: ttftMs,
 628        prompt_tokens: promptTokens,
 629        itps,
 630        cache_hit_rate_pct: cacheHitRate,
 631      },
 632    })
 633    events.push({
 634      name: 'First Token',
 635      cat: 'api,ttft',
 636      ph: 'E',
 637      ts: firstTokenEndTs,
 638      pid: pending.agentInfo.processId,
 639      tid: pending.agentInfo.threadId,
 640    })
 641
 642    // Sampling phase: from first token to last token
 643    // Note: samplingMs = ttltMs - ttftMs still includes setup time in ttltMs,
 644    // so we compute the actual sampling duration for the span as the time from
 645    // first token to API call end (endTime), not samplingMs directly.
 646    const actualSamplingMs =
 647      ttltMs !== undefined ? ttltMs - ttftMs - setupUs / 1000 : undefined
 648    if (actualSamplingMs !== undefined && actualSamplingMs > 0) {
 649      events.push({
 650        name: 'Sampling',
 651        cat: 'api,sampling',
 652        ph: 'B',
 653        ts: firstTokenEndTs,
 654        pid: pending.agentInfo.processId,
 655        tid: pending.agentInfo.threadId,
 656        args: {
 657          sampling_ms: actualSamplingMs,
 658          output_tokens: outputTokens,
 659          otps,
 660        },
 661      })
 662      events.push({
 663        name: 'Sampling',
 664        cat: 'api,sampling',
 665        ph: 'E',
 666        ts: firstTokenEndTs + actualSamplingMs * 1000,
 667        pid: pending.agentInfo.processId,
 668        tid: pending.agentInfo.threadId,
 669      })
 670    }
 671  }
 672
 673  // Emit API Call end event (after sub-spans)
 674  events.push({
 675    name: pending.name,
 676    cat: pending.category,
 677    ph: 'E',
 678    ts: endTime,
 679    pid: pending.agentInfo.processId,
 680    tid: pending.agentInfo.threadId,
 681    args,
 682  })
 683
 684  pendingSpans.delete(spanId)
 685}
 686
 687/**
 688 * Start a tool execution span
 689 */
 690export function startToolPerfettoSpan(
 691  toolName: string,
 692  args?: Record<string, unknown>,
 693): string {
 694  if (!isEnabled) return ''
 695
 696  const spanId = generateSpanId()
 697  const agentInfo = getCurrentAgentInfo()
 698
 699  pendingSpans.set(spanId, {
 700    name: `Tool: ${toolName}`,
 701    category: 'tool',
 702    startTime: getTimestamp(),
 703    agentInfo,
 704    args: {
 705      tool_name: toolName,
 706      ...args,
 707    },
 708  })
 709
 710  // Emit begin event
 711  events.push({
 712    name: `Tool: ${toolName}`,
 713    cat: 'tool',
 714    ph: 'B',
 715    ts: pendingSpans.get(spanId)!.startTime,
 716    pid: agentInfo.processId,
 717    tid: agentInfo.threadId,
 718    args: pendingSpans.get(spanId)!.args,
 719  })
 720
 721  return spanId
 722}
 723
 724/**
 725 * End a tool execution span
 726 */
 727export function endToolPerfettoSpan(
 728  spanId: string,
 729  metadata?: {
 730    success?: boolean
 731    error?: string
 732    resultTokens?: number
 733  },
 734): void {
 735  if (!isEnabled || !spanId) return
 736
 737  const pending = pendingSpans.get(spanId)
 738  if (!pending) return
 739
 740  const endTime = getTimestamp()
 741  const duration = endTime - pending.startTime
 742
 743  const args = {
 744    ...pending.args,
 745    success: metadata?.success ?? true,
 746    error: metadata?.error,
 747    result_tokens: metadata?.resultTokens,
 748    duration_ms: duration / 1000,
 749  }
 750
 751  // Emit end event
 752  events.push({
 753    name: pending.name,
 754    cat: pending.category,
 755    ph: 'E',
 756    ts: endTime,
 757    pid: pending.agentInfo.processId,
 758    tid: pending.agentInfo.threadId,
 759    args,
 760  })
 761
 762  pendingSpans.delete(spanId)
 763}
 764
 765/**
 766 * Start a user input waiting span
 767 */
 768export function startUserInputPerfettoSpan(context?: string): string {
 769  if (!isEnabled) return ''
 770
 771  const spanId = generateSpanId()
 772  const agentInfo = getCurrentAgentInfo()
 773
 774  pendingSpans.set(spanId, {
 775    name: 'Waiting for User Input',
 776    category: 'user_input',
 777    startTime: getTimestamp(),
 778    agentInfo,
 779    args: {
 780      context,
 781    },
 782  })
 783
 784  // Emit begin event
 785  events.push({
 786    name: 'Waiting for User Input',
 787    cat: 'user_input',
 788    ph: 'B',
 789    ts: pendingSpans.get(spanId)!.startTime,
 790    pid: agentInfo.processId,
 791    tid: agentInfo.threadId,
 792    args: pendingSpans.get(spanId)!.args,
 793  })
 794
 795  return spanId
 796}
 797
 798/**
 799 * End a user input waiting span
 800 */
 801export function endUserInputPerfettoSpan(
 802  spanId: string,
 803  metadata?: {
 804    decision?: string
 805    source?: string
 806  },
 807): void {
 808  if (!isEnabled || !spanId) return
 809
 810  const pending = pendingSpans.get(spanId)
 811  if (!pending) return
 812
 813  const endTime = getTimestamp()
 814  const duration = endTime - pending.startTime
 815
 816  const args = {
 817    ...pending.args,
 818    decision: metadata?.decision,
 819    source: metadata?.source,
 820    duration_ms: duration / 1000,
 821  }
 822
 823  // Emit end event
 824  events.push({
 825    name: pending.name,
 826    cat: pending.category,
 827    ph: 'E',
 828    ts: endTime,
 829    pid: pending.agentInfo.processId,
 830    tid: pending.agentInfo.threadId,
 831    args,
 832  })
 833
 834  pendingSpans.delete(spanId)
 835}
 836
 837/**
 838 * Emit an instant event (marker)
 839 */
 840export function emitPerfettoInstant(
 841  name: string,
 842  category: string,
 843  args?: Record<string, unknown>,
 844): void {
 845  if (!isEnabled) return
 846
 847  const agentInfo = getCurrentAgentInfo()
 848
 849  events.push({
 850    name,
 851    cat: category,
 852    ph: 'i',
 853    ts: getTimestamp(),
 854    pid: agentInfo.processId,
 855    tid: agentInfo.threadId,
 856    args,
 857  })
 858}
 859
 860/**
 861 * Emit a counter event for tracking metrics over time
 862 */
 863export function emitPerfettoCounter(
 864  name: string,
 865  values: Record<string, number>,
 866): void {
 867  if (!isEnabled) return
 868
 869  const agentInfo = getCurrentAgentInfo()
 870
 871  events.push({
 872    name,
 873    cat: 'counter',
 874    ph: 'C',
 875    ts: getTimestamp(),
 876    pid: agentInfo.processId,
 877    tid: agentInfo.threadId,
 878    args: values,
 879  })
 880}
 881
 882/**
 883 * Start an interaction span (wraps a full user request cycle)
 884 */
 885export function startInteractionPerfettoSpan(userPrompt?: string): string {
 886  if (!isEnabled) return ''
 887
 888  const spanId = generateSpanId()
 889  const agentInfo = getCurrentAgentInfo()
 890
 891  pendingSpans.set(spanId, {
 892    name: 'Interaction',
 893    category: 'interaction',
 894    startTime: getTimestamp(),
 895    agentInfo,
 896    args: {
 897      user_prompt_length: userPrompt?.length,
 898    },
 899  })
 900
 901  // Emit begin event
 902  events.push({
 903    name: 'Interaction',
 904    cat: 'interaction',
 905    ph: 'B',
 906    ts: pendingSpans.get(spanId)!.startTime,
 907    pid: agentInfo.processId,
 908    tid: agentInfo.threadId,
 909    args: pendingSpans.get(spanId)!.args,
 910  })
 911
 912  return spanId
 913}
 914
 915/**
 916 * End an interaction span
 917 */
 918export function endInteractionPerfettoSpan(spanId: string): void {
 919  if (!isEnabled || !spanId) return
 920
 921  const pending = pendingSpans.get(spanId)
 922  if (!pending) return
 923
 924  const endTime = getTimestamp()
 925  const duration = endTime - pending.startTime
 926
 927  // Emit end event
 928  events.push({
 929    name: pending.name,
 930    cat: pending.category,
 931    ph: 'E',
 932    ts: endTime,
 933    pid: pending.agentInfo.processId,
 934    tid: pending.agentInfo.threadId,
 935    args: {
 936      ...pending.args,
 937      duration_ms: duration / 1000,
 938    },
 939  })
 940
 941  pendingSpans.delete(spanId)
 942}
 943
 944// ---------------------------------------------------------------------------
 945// Periodic write helpers
 946// ---------------------------------------------------------------------------
 947
 948/**
 949 * Stop the periodic write timer.
 950 */
 951function stopWriteInterval(): void {
 952  if (staleSpanCleanupId) {
 953    clearInterval(staleSpanCleanupId)
 954    staleSpanCleanupId = null
 955  }
 956  if (writeIntervalId) {
 957    clearInterval(writeIntervalId)
 958    writeIntervalId = null
 959  }
 960}
 961
 962/**
 963 * Force-close any remaining open spans at session end.
 964 */
 965function closeOpenSpans(): void {
 966  for (const [spanId, pending] of pendingSpans) {
 967    const endTime = getTimestamp()
 968    events.push({
 969      name: pending.name,
 970      cat: pending.category,
 971      ph: 'E',
 972      ts: endTime,
 973      pid: pending.agentInfo.processId,
 974      tid: pending.agentInfo.threadId,
 975      args: {
 976        ...pending.args,
 977        incomplete: true,
 978        duration_ms: (endTime - pending.startTime) / 1000,
 979      },
 980    })
 981    pendingSpans.delete(spanId)
 982  }
 983}
 984
 985/**
 986 * Write the full trace to disk.  Errors are logged but swallowed so that a
 987 * transient I/O problem does not crash the session — the next periodic tick
 988 * (or the final exit write) will retry with a complete snapshot.
 989 */
 990async function periodicWrite(): Promise<void> {
 991  if (!isEnabled || !tracePath || traceWritten) return
 992
 993  try {
 994    await mkdir(dirname(tracePath), { recursive: true })
 995    await writeFile(tracePath, buildTraceDocument())
 996    logForDebugging(
 997      `[Perfetto] Periodic write: ${events.length} events to ${tracePath}`,
 998    )
 999  } catch (error) {
1000    logForDebugging(
1001      `[Perfetto] Periodic write failed: ${errorMessage(error)}`,
1002      { level: 'error' },
1003    )
1004  }
1005}
1006
1007/**
1008 * Final async write: close open spans and write the complete trace.
1009 * Idempotent — sets `traceWritten` on success so subsequent calls are no-ops.
1010 */
1011async function writePerfettoTrace(): Promise<void> {
1012  if (!isEnabled || !tracePath || traceWritten) {
1013    logForDebugging(
1014      `[Perfetto] Skipping final write: isEnabled=${isEnabled}, tracePath=${tracePath}, traceWritten=${traceWritten}`,
1015    )
1016    return
1017  }
1018
1019  stopWriteInterval()
1020  closeOpenSpans()
1021
1022  logForDebugging(
1023    `[Perfetto] writePerfettoTrace called: events=${events.length}`,
1024  )
1025
1026  try {
1027    await mkdir(dirname(tracePath), { recursive: true })
1028    await writeFile(tracePath, buildTraceDocument())
1029    traceWritten = true
1030    logForDebugging(`[Perfetto] Trace finalized at: ${tracePath}`)
1031  } catch (error) {
1032    logForDebugging(
1033      `[Perfetto] Failed to write final trace: ${errorMessage(error)}`,
1034      { level: 'error' },
1035    )
1036  }
1037}
1038
1039/**
1040 * Final synchronous write (fallback for process 'exit' handler where async is forbidden).
1041 */
1042function writePerfettoTraceSync(): void {
1043  if (!isEnabled || !tracePath || traceWritten) {
1044    logForDebugging(
1045      `[Perfetto] Skipping final sync write: isEnabled=${isEnabled}, tracePath=${tracePath}, traceWritten=${traceWritten}`,
1046    )
1047    return
1048  }
1049
1050  stopWriteInterval()
1051  closeOpenSpans()
1052
1053  logForDebugging(
1054    `[Perfetto] writePerfettoTraceSync called: events=${events.length}`,
1055  )
1056
1057  try {
1058    const dir = dirname(tracePath)
1059    // eslint-disable-next-line custom-rules/no-sync-fs -- Only called from process.on('exit') handler
1060    mkdirSync(dir, { recursive: true })
1061    // eslint-disable-next-line custom-rules/no-sync-fs, eslint-plugin-n/no-sync -- Required for process 'exit' handler which doesn't support async
1062    writeFileSync(tracePath, buildTraceDocument())
1063    traceWritten = true
1064    logForDebugging(`[Perfetto] Trace finalized synchronously at: ${tracePath}`)
1065  } catch (error) {
1066    logForDebugging(
1067      `[Perfetto] Failed to write final trace synchronously: ${errorMessage(error)}`,
1068      { level: 'error' },
1069    )
1070  }
1071}
1072
1073/**
1074 * Get all recorded events (for testing)
1075 */
1076export function getPerfettoEvents(): TraceEvent[] {
1077  return [...metadataEvents, ...events]
1078}
1079
1080/**
1081 * Reset the tracer state (for testing)
1082 */
1083export function resetPerfettoTracer(): void {
1084  if (staleSpanCleanupId) {
1085    clearInterval(staleSpanCleanupId)
1086    staleSpanCleanupId = null
1087  }
1088  stopWriteInterval()
1089  metadataEvents.length = 0
1090  events.length = 0
1091  pendingSpans.clear()
1092  agentRegistry.clear()
1093  agentIdToProcessId.clear()
1094  totalAgentCount = 0
1095  processIdCounter = 1
1096  spanIdCounter = 0
1097  isEnabled = false
1098  tracePath = null
1099  startTimeMs = 0
1100  traceWritten = false
1101}
1102
1103/**
1104 * Trigger a periodic write immediately (for testing)
1105 */
1106export async function triggerPeriodicWriteForTesting(): Promise<void> {
1107  await periodicWrite()
1108}
1109
1110/**
1111 * Evict stale spans immediately (for testing)
1112 */
1113export function evictStaleSpansForTesting(): void {
1114  evictStaleSpans()
1115}
1116
1117export const MAX_EVENTS_FOR_TESTING = MAX_EVENTS
1118export function evictOldestEventsForTesting(): void {
1119  evictOldestEvents()
1120}