utils/toolResultStorage.ts at main · oppi.li/claude-code

oppi.li / claude-code
fork atom
source dump of claude code
fork atom
claude-code / utils / toolResultStorage.ts
at main 1040 lines 38 kB view raw
wrap content
oppi.li dump from zip 2d ago
63aada3f
   1/**
   2 * Utility for persisting large tool results to disk instead of truncating them.
   3 */
   4
   5import type { ToolResultBlockParam } from '@anthropic-ai/sdk/resources/index.mjs'
   6import { mkdir, writeFile } from 'fs/promises'
   7import { join } from 'path'
   8import { getOriginalCwd, getSessionId } from '../bootstrap/state.js'
   9import {
  10  BYTES_PER_TOKEN,
  11  DEFAULT_MAX_RESULT_SIZE_CHARS,
  12  MAX_TOOL_RESULT_BYTES,
  13  MAX_TOOL_RESULTS_PER_MESSAGE_CHARS,
  14} from '../constants/toolLimits.js'
  15import { getFeatureValue_CACHED_MAY_BE_STALE } from '../services/analytics/growthbook.js'
  16import { logEvent } from '../services/analytics/index.js'
  17import { sanitizeToolNameForAnalytics } from '../services/analytics/metadata.js'
  18import type { Message } from '../types/message.js'
  19import { logForDebugging } from './debug.js'
  20import { getErrnoCode, toError } from './errors.js'
  21import { formatFileSize } from './format.js'
  22import { logError } from './log.js'
  23import { getProjectDir } from './sessionStorage.js'
  24import { jsonStringify } from './slowOperations.js'
  25
// Subdirectory name for tool results within a session. getToolResultsDir()
// joins it onto the session directory.
export const TOOL_RESULTS_SUBDIR = 'tool-results'

// XML tags used to wrap persisted output messages.
// buildLargeToolResultMessage() emits the opening tag first, and
// isContentAlreadyCompacted() relies on that prefix to recognise content that
// has already been replaced.
export const PERSISTED_OUTPUT_TAG = '<persisted-output>'
export const PERSISTED_OUTPUT_CLOSING_TAG = '</persisted-output>'

// Message used when tool result content was cleared without persisting to file
export const TOOL_RESULT_CLEARED_MESSAGE = '[Old tool result content cleared]'
  35
/**
 * Name of the GrowthBook flag holding an override map:
 * tool name -> persistence threshold (chars).
 *
 * When a tool name is present in this map, that value is used directly as the
 * effective threshold, bypassing the Math.min() clamp against
 * DEFAULT_MAX_RESULT_SIZE_CHARS. Tools absent from the map use the hardcoded
 * fallback. Flag default is {} (no overrides == behavior unchanged).
 *
 * Read by getPersistenceThreshold().
 */
const PERSIST_THRESHOLD_OVERRIDE_FLAG = 'tengu_satin_quoll'
  44
  45/**
  46 * Resolve the effective persistence threshold for a tool.
  47 * GrowthBook override wins when present; otherwise falls back to the declared
  48 * per-tool cap clamped by the global default.
  49 *
  50 * Defensive: GrowthBook's cache returns `cached !== undefined ? cached : default`,
  51 * so a flag served as `null` leaks through. We guard with optional chaining and a
  52 * typeof check so any non-object flag value (null, string, number) falls through
  53 * to the hardcoded default instead of throwing on index or returning 0.
  54 */
  55export function getPersistenceThreshold(
  56  toolName: string,
  57  declaredMaxResultSizeChars: number,
  58): number {
  59  // Infinity = hard opt-out. Read self-bounds via maxTokens; persisting its
  60  // output to a file the model reads back with Read is circular. Checked
  61  // before the GB override so tengu_satin_quoll can't force it back on.
  62  if (!Number.isFinite(declaredMaxResultSizeChars)) {
  63    return declaredMaxResultSizeChars
  64  }
  65  const overrides = getFeatureValue_CACHED_MAY_BE_STALE<Record<
  66    string,
  67    number
  68  > | null>(PERSIST_THRESHOLD_OVERRIDE_FLAG, {})
  69  const override = overrides?.[toolName]
  70  if (
  71    typeof override === 'number' &&
  72    Number.isFinite(override) &&
  73    override > 0
  74  ) {
  75    return override
  76  }
  77  return Math.min(declaredMaxResultSizeChars, DEFAULT_MAX_RESULT_SIZE_CHARS)
  78}
  79
// Result of persisting a tool result to disk (see persistToolResult)
export type PersistedToolResult = {
  // Path of the file the content was written to (from getToolResultPath)
  filepath: string
  // Length of the persisted string (`String.length`, i.e. UTF-16 code units)
  originalSize: number
  // True when the content was an array of blocks and was stored as JSON
  isJson: boolean
  // Leading portion of the persisted string, as produced by generatePreview
  preview: string
  // True when `preview` is shorter than the full persisted string
  hasMore: boolean
}

// Error result when persistence fails
export type PersistToolResultError = {
  // Human-readable description of why the result could not be persisted
  error: string
}
  93
  94/**
  95 * Get the session directory (projectDir/sessionId)
  96 */
  97function getSessionDir(): string {
  98  return join(getProjectDir(getOriginalCwd()), getSessionId())
  99}
 100
 101/**
 102 * Get the tool results directory for this session (projectDir/sessionId/tool-results)
 103 */
 104export function getToolResultsDir(): string {
 105  return join(getSessionDir(), TOOL_RESULTS_SUBDIR)
 106}
 107
// Preview size for the reference message. Passed to generatePreview(), which
// compares it against the string's length, and shown to the model through
// formatFileSize() in buildLargeToolResultMessage().
export const PREVIEW_SIZE_BYTES = 2000
 110
 111/**
 112 * Get the filepath where a tool result would be persisted.
 113 */
 114export function getToolResultPath(id: string, isJson: boolean): string {
 115  const ext = isJson ? 'json' : 'txt'
 116  return join(getToolResultsDir(), `${id}.${ext}`)
 117}
 118
 119/**
 120 * Ensure the session-specific tool results directory exists
 121 */
 122export async function ensureToolResultsDir(): Promise<void> {
 123  try {
 124    await mkdir(getToolResultsDir(), { recursive: true })
 125  } catch {
 126    // Directory may already exist
 127  }
 128}
 129
 130/**
 131 * Persist a tool result to disk and return information about the persisted file
 132 *
 133 * @param content - The tool result content to persist (string or array of content blocks)
 134 * @param toolUseId - The ID of the tool use that produced the result
 135 * @returns Information about the persisted file including filepath and preview
 136 */
 137export async function persistToolResult(
 138  content: NonNullable<ToolResultBlockParam['content']>,
 139  toolUseId: string,
 140): Promise<PersistedToolResult | PersistToolResultError> {
 141  const isJson = Array.isArray(content)
 142
 143  // Check for non-text content - we can only persist text blocks
 144  if (isJson) {
 145    const hasNonTextContent = content.some(block => block.type !== 'text')
 146    if (hasNonTextContent) {
 147      return {
 148        error: 'Cannot persist tool results containing non-text content',
 149      }
 150    }
 151  }
 152
 153  await ensureToolResultsDir()
 154  const filepath = getToolResultPath(toolUseId, isJson)
 155  const contentStr = isJson ? jsonStringify(content, null, 2) : content
 156
 157  // tool_use_id is unique per invocation and content is deterministic for a
 158  // given id, so skip if the file already exists. This prevents re-writing
 159  // the same content on every API turn when microcompact replays the
 160  // original messages. Use 'wx' instead of a stat-then-write race.
 161  try {
 162    await writeFile(filepath, contentStr, { encoding: 'utf-8', flag: 'wx' })
 163    logForDebugging(
 164      `Persisted tool result to ${filepath} (${formatFileSize(contentStr.length)})`,
 165    )
 166  } catch (error) {
 167    if (getErrnoCode(error) !== 'EEXIST') {
 168      logError(toError(error))
 169      return { error: getFileSystemErrorMessage(toError(error)) }
 170    }
 171    // EEXIST: already persisted on a prior turn, fall through to preview
 172  }
 173
 174  // Generate a preview
 175  const { preview, hasMore } = generatePreview(contentStr, PREVIEW_SIZE_BYTES)
 176
 177  return {
 178    filepath,
 179    originalSize: contentStr.length,
 180    isJson,
 181    preview,
 182    hasMore,
 183  }
 184}
 185
 186/**
 187 * Build a message for large tool results with preview
 188 */
 189export function buildLargeToolResultMessage(
 190  result: PersistedToolResult,
 191): string {
 192  let message = `${PERSISTED_OUTPUT_TAG}\n`
 193  message += `Output too large (${formatFileSize(result.originalSize)}). Full output saved to: ${result.filepath}\n\n`
 194  message += `Preview (first ${formatFileSize(PREVIEW_SIZE_BYTES)}):\n`
 195  message += result.preview
 196  message += result.hasMore ? '\n...\n' : '\n'
 197  message += PERSISTED_OUTPUT_CLOSING_TAG
 198  return message
 199}
 200
 201/**
 202 * Process a tool result for inclusion in a message.
 203 * Maps the result to the API format and persists large results to disk.
 204 */
 205export async function processToolResultBlock<T>(
 206  tool: {
 207    name: string
 208    maxResultSizeChars: number
 209    mapToolResultToToolResultBlockParam: (
 210      result: T,
 211      toolUseID: string,
 212    ) => ToolResultBlockParam
 213  },
 214  toolUseResult: T,
 215  toolUseID: string,
 216): Promise<ToolResultBlockParam> {
 217  const toolResultBlock = tool.mapToolResultToToolResultBlockParam(
 218    toolUseResult,
 219    toolUseID,
 220  )
 221  return maybePersistLargeToolResult(
 222    toolResultBlock,
 223    tool.name,
 224    getPersistenceThreshold(tool.name, tool.maxResultSizeChars),
 225  )
 226}
 227
 228/**
 229 * Process a pre-mapped tool result block. Applies persistence for large results
 230 * without re-calling mapToolResultToToolResultBlockParam.
 231 */
 232export async function processPreMappedToolResultBlock(
 233  toolResultBlock: ToolResultBlockParam,
 234  toolName: string,
 235  maxResultSizeChars: number,
 236): Promise<ToolResultBlockParam> {
 237  return maybePersistLargeToolResult(
 238    toolResultBlock,
 239    toolName,
 240    getPersistenceThreshold(toolName, maxResultSizeChars),
 241  )
 242}
 243
 244/**
 245 * True when a tool_result's content is empty or effectively empty. Covers:
 246 * undefined/null/'', whitespace-only strings, empty arrays, and arrays whose
 247 * only blocks are text blocks with empty/whitespace text. Non-text blocks
 248 * (images, tool_reference) are treated as non-empty.
 249 */
 250export function isToolResultContentEmpty(
 251  content: ToolResultBlockParam['content'],
 252): boolean {
 253  if (!content) return true
 254  if (typeof content === 'string') return content.trim() === ''
 255  if (!Array.isArray(content)) return false
 256  if (content.length === 0) return true
 257  return content.every(
 258    block =>
 259      typeof block === 'object' &&
 260      'type' in block &&
 261      block.type === 'text' &&
 262      'text' in block &&
 263      (typeof block.text !== 'string' || block.text.trim() === ''),
 264  )
 265}
 266
 267/**
 268 * Handle large tool results by persisting to disk instead of truncating.
 269 * Returns the original block if no persistence needed, or a modified block
 270 * with the content replaced by a reference to the persisted file.
 271 */
 272async function maybePersistLargeToolResult(
 273  toolResultBlock: ToolResultBlockParam,
 274  toolName: string,
 275  persistenceThreshold?: number,
 276): Promise<ToolResultBlockParam> {
 277  // Check size first before doing any async work - most tool results are small
 278  const content = toolResultBlock.content
 279
 280  // inc-4586: Empty tool_result content at the prompt tail causes some models
 281  // (notably capybara) to emit the \n\nHuman: stop sequence and end their turn
 282  // with zero output. The server renderer inserts no \n\nAssistant: marker after
 283  // tool results, so a bare </function_results>\n\n pattern-matches to a turn
 284  // boundary. Several tools can legitimately produce empty output (silent-success
 285  // shell commands, MCP servers returning content:[], REPL statements, etc.).
 286  // Inject a short marker so the model always has something to react to.
 287  if (isToolResultContentEmpty(content)) {
 288    logEvent('tengu_tool_empty_result', {
 289      toolName: sanitizeToolNameForAnalytics(toolName),
 290    })
 291    return {
 292      ...toolResultBlock,
 293      content: `(${toolName} completed with no output)`,
 294    }
 295  }
 296  // Narrow after the emptiness guard — content is non-nullish past this point.
 297  if (!content) {
 298    return toolResultBlock
 299  }
 300
 301  // Skip persistence for image content blocks - they need to be sent as-is to Claude
 302  if (hasImageBlock(content)) {
 303    return toolResultBlock
 304  }
 305
 306  const size = contentSize(content)
 307
 308  // Use tool-specific threshold if provided, otherwise fall back to global limit
 309  const threshold = persistenceThreshold ?? MAX_TOOL_RESULT_BYTES
 310  if (size <= threshold) {
 311    return toolResultBlock
 312  }
 313
 314  // Persist the entire content as a unit
 315  const result = await persistToolResult(content, toolResultBlock.tool_use_id)
 316  if (isPersistError(result)) {
 317    // If persistence failed, return the original block unchanged
 318    return toolResultBlock
 319  }
 320
 321  const message = buildLargeToolResultMessage(result)
 322
 323  // Log analytics
 324  logEvent('tengu_tool_result_persisted', {
 325    toolName: sanitizeToolNameForAnalytics(toolName),
 326    originalSizeBytes: result.originalSize,
 327    persistedSizeBytes: message.length,
 328    estimatedOriginalTokens: Math.ceil(result.originalSize / BYTES_PER_TOKEN),
 329    estimatedPersistedTokens: Math.ceil(message.length / BYTES_PER_TOKEN),
 330    thresholdUsed: threshold,
 331  })
 332
 333  return { ...toolResultBlock, content: message }
 334}
 335
 336/**
 337 * Generate a preview of content, truncating at a newline boundary when possible.
 338 */
 339export function generatePreview(
 340  content: string,
 341  maxBytes: number,
 342): { preview: string; hasMore: boolean } {
 343  if (content.length <= maxBytes) {
 344    return { preview: content, hasMore: false }
 345  }
 346
 347  // Find the last newline within the limit to avoid cutting mid-line
 348  const truncated = content.slice(0, maxBytes)
 349  const lastNewline = truncated.lastIndexOf('\n')
 350
 351  // If we found a newline reasonably close to the limit, use it
 352  // Otherwise fall back to the exact limit
 353  const cutPoint = lastNewline > maxBytes * 0.5 ? lastNewline : maxBytes
 354
 355  return { preview: content.slice(0, cutPoint), hasMore: true }
 356}
 357
 358/**
 359 * Type guard to check if persist result is an error
 360 */
 361export function isPersistError(
 362  result: PersistedToolResult | PersistToolResultError,
 363): result is PersistToolResultError {
 364  return 'error' in result
 365}
 366
 367// --- Message-level aggregate tool result budget ---
 368//
 369// Tracks replacement state across turns so enforceToolResultBudget makes the
 370// same choices every time (preserves prompt cache prefix).
 371
/**
 * Per-conversation-thread state for the aggregate tool result budget.
 * State must be stable to preserve prompt cache:
 *   - seenIds: results that have passed through the budget check (replaced
 *     or not). Once seen, a result's fate is frozen for the conversation.
 *   - replacements: subset of seenIds that were persisted to disk and
 *     replaced with previews, mapped to the exact preview string shown to
 *     the model. Re-application is a Map lookup — no file I/O, guaranteed
 *     byte-identical, cannot fail.
 *
 * Lifecycle: one instance per conversation thread, carried on ToolUseContext.
 * Main thread: REPL provisions once, never resets — stale entries after
 * /clear, rewind, resume, or compact are never looked up (tool_use_ids are
 * UUIDs) so they're harmless. Subagents: createSubagentContext clones the
 * parent's state by default (cache-sharing forks like agentSummary need
 * identical decisions), or resumeAgentBackground threads one reconstructed
 * from sidechain records.
 */
export type ContentReplacementState = {
  // tool_use_ids whose replace-or-keep decision has already been made
  seenIds: Set<string>
  // tool_use_id -> exact replacement string shown to the model
  replacements: Map<string, string>
}
 394
 395export function createContentReplacementState(): ContentReplacementState {
 396  return { seenIds: new Set(), replacements: new Map() }
 397}
 398
 399/**
 400 * Clone replacement state for a cache-sharing fork (e.g. agentSummary).
 401 * The fork needs state identical to the source at fork time so
 402 * enforceToolResultBudget makes the same choices → same wire prefix →
 403 * prompt cache hit. Mutating the clone does not affect the source.
 404 */
 405export function cloneContentReplacementState(
 406  source: ContentReplacementState,
 407): ContentReplacementState {
 408  return {
 409    seenIds: new Set(source.seenIds),
 410    replacements: new Map(source.replacements),
 411  }
 412}
 413
 414/**
 415 * Resolve the per-message aggregate budget limit. GrowthBook override
 416 * (tengu_hawthorn_window) wins when present and a finite positive number;
 417 * otherwise falls back to the hardcoded constant. Defensive typeof/finite
 418 * check: GrowthBook's cache returns `cached !== undefined ? cached : default`,
 419 * so a flag served as null/string/NaN leaks through.
 420 */
 421export function getPerMessageBudgetLimit(): number {
 422  const override = getFeatureValue_CACHED_MAY_BE_STALE<number | null>(
 423    'tengu_hawthorn_window',
 424    null,
 425  )
 426  if (
 427    typeof override === 'number' &&
 428    Number.isFinite(override) &&
 429    override > 0
 430  ) {
 431    return override
 432  }
 433  return MAX_TOOL_RESULTS_PER_MESSAGE_CHARS
 434}
 435
 436/**
 437 * Provision replacement state for a new conversation thread.
 438 *
 439 * Encapsulates the feature-flag gate + reconstruct-vs-fresh choice:
 440 *   - Flag off → undefined (query.ts skips enforcement entirely)
 441 *   - No initialMessages (cold start) → fresh
 442 *   - initialMessages present → reconstruct (freeze all candidate IDs so the
 443 *     budget never replaces content the model already saw unreplaced). Empty
 444 *     or absent records freeze everything; non-empty records additionally
 445 *     populate the replacements Map for byte-identical re-apply.
 446 */
 447export function provisionContentReplacementState(
 448  initialMessages?: Message[],
 449  initialContentReplacements?: ContentReplacementRecord[],
 450): ContentReplacementState | undefined {
 451  const enabled = getFeatureValue_CACHED_MAY_BE_STALE(
 452    'tengu_hawthorn_steeple',
 453    false,
 454  )
 455  if (!enabled) return undefined
 456  if (initialMessages) {
 457    return reconstructContentReplacementState(
 458      initialMessages,
 459      initialContentReplacements ?? [],
 460    )
 461  }
 462  return createContentReplacementState()
 463}
 464
/**
 * Serializable record of one content-replacement decision. Written to the
 * transcript as a ContentReplacementEntry so decisions survive resume.
 * Discriminated by `kind` so future replacement mechanisms (user text,
 * offloaded images) can share the same transcript entry type.
 *
 * `replacement` is the exact string the model saw — stored rather than
 * derived on resume so code changes to the preview template, size formatting,
 * or path layout can't silently break prompt cache.
 */
export type ContentReplacementRecord = {
  // Discriminant; 'tool-result' is the only kind defined so far
  kind: 'tool-result'
  // tool_use_id of the tool_result block whose content was replaced
  toolUseId: string
  // Exact replacement string shown to the model
  replacement: string
}

// The 'tool-result' member of ContentReplacementRecord. Currently the same
// type, since that is the only kind.
export type ToolResultReplacementRecord = Extract<
  ContentReplacementRecord,
  { kind: 'tool-result' }
>
 485
// A tool_result block that is eligible for the per-message budget check.
type ToolResultCandidate = {
  // tool_use_id of the originating block
  toolUseId: string
  // The block's content (never null/undefined)
  content: NonNullable<ToolResultBlockParam['content']>
  // Size of `content` as computed by contentSize()
  size: number
}

// Candidates split by what was decided about them on earlier passes
// (see partitionByPriorDecision).
type CandidatePartition = {
  // Previously replaced: carries the cached replacement to re-apply
  mustReapply: Array<ToolResultCandidate & { replacement: string }>
  // Previously seen and left unreplaced
  frozen: ToolResultCandidate[]
  // Never seen before
  fresh: ToolResultCandidate[]
}
 497
 498function isContentAlreadyCompacted(
 499  content: ToolResultBlockParam['content'],
 500): boolean {
 501  // All budget-produced content starts with the tag (buildLargeToolResultMessage).
 502  // `.startsWith()` avoids false-positives when the tag appears anywhere else
 503  // in the content (e.g., reading this source file).
 504  return typeof content === 'string' && content.startsWith(PERSISTED_OUTPUT_TAG)
 505}
 506
 507function hasImageBlock(
 508  content: NonNullable<ToolResultBlockParam['content']>,
 509): boolean {
 510  return (
 511    Array.isArray(content) &&
 512    content.some(
 513      b => typeof b === 'object' && 'type' in b && b.type === 'image',
 514    )
 515  )
 516}
 517
 518function contentSize(
 519  content: NonNullable<ToolResultBlockParam['content']>,
 520): number {
 521  if (typeof content === 'string') return content.length
 522  // Sum text-block lengths directly. Slightly under-counts vs serialized
 523  // (no JSON framing), but the budget is a rough token heuristic anyway.
 524  // Avoids allocating a content-sized string every enforcement pass.
 525  return content.reduce(
 526    (sum, b) => sum + (b.type === 'text' ? b.text.length : 0),
 527    0,
 528  )
 529}
 530
 531/**
 532 * Walk messages and build tool_use_id → tool_name from assistant tool_use
 533 * blocks. tool_use always precedes its tool_result (model calls, then result
 534 * arrives), so by the time budget enforcement sees a result, its name is known.
 535 */
 536function buildToolNameMap(messages: Message[]): Map<string, string> {
 537  const map = new Map<string, string>()
 538  for (const message of messages) {
 539    if (message.type !== 'assistant') continue
 540    const content = message.message.content
 541    if (!Array.isArray(content)) continue
 542    for (const block of content) {
 543      if (block.type === 'tool_use') {
 544        map.set(block.id, block.name)
 545      }
 546    }
 547  }
 548  return map
 549}
 550
 551/**
 552 * Extract candidate tool_result blocks from a single user message: blocks
 553 * that are non-empty, non-image, and not already compacted by tag (i.e. by
 554 * the per-tool limit, or an earlier iteration of this same query call).
 555 * Returns [] for messages with no eligible blocks.
 556 */
 557function collectCandidatesFromMessage(message: Message): ToolResultCandidate[] {
 558  if (message.type !== 'user' || !Array.isArray(message.message.content)) {
 559    return []
 560  }
 561  return message.message.content.flatMap(block => {
 562    if (block.type !== 'tool_result' || !block.content) return []
 563    if (isContentAlreadyCompacted(block.content)) return []
 564    if (hasImageBlock(block.content)) return []
 565    return [
 566      {
 567        toolUseId: block.tool_use_id,
 568        content: block.content,
 569        size: contentSize(block.content),
 570      },
 571    ]
 572  })
 573}
 574
 575/**
 576 * Extract candidate tool_result blocks grouped by API-level user message.
 577 *
 578 * normalizeMessagesForAPI merges consecutive user messages into one
 579 * (Bedrock compat; 1P does the same server-side), so parallel tool
 580 * results that arrive as N separate user messages in our state become
 581 * ONE user message on the wire. The budget must group the same way or
 582 * it would see N under-budget messages instead of one over-budget
 583 * message and fail to enforce exactly when it matters most.
 584 *
 585 * A "group" is a maximal run of user messages NOT separated by an
 586 * assistant message. Only assistant messages create wire-level
 587 * boundaries — normalizeMessagesForAPI filters out progress entirely
 588 * and merges attachment / system(local_command) INTO adjacent user
 589 * blocks, so those types do NOT break groups here either.
 590 *
 591 * This matters for abort-during-parallel-tools paths: agent_progress
 592 * messages (non-ephemeral, persisted in REPL state) can interleave
 593 * between fresh tool_result messages. If we flushed on progress, those
 594 * tool_results would split into under-budget groups, slip through
 595 * unreplaced, get frozen, then be merged by normalizeMessagesForAPI
 596 * into one over-budget wire message — defeating the feature.
 597 *
 598 * Only groups with at least one eligible candidate are returned.
 599 */
 600function collectCandidatesByMessage(
 601  messages: Message[],
 602): ToolResultCandidate[][] {
 603  const groups: ToolResultCandidate[][] = []
 604  let current: ToolResultCandidate[] = []
 605
 606  const flush = () => {
 607    if (current.length > 0) groups.push(current)
 608    current = []
 609  }
 610
 611  // Track all assistant message.ids seen so far — same-ID fragments are
 612  // merged by normalizeMessagesForAPI (messages.ts ~2126 walks back PAST
 613  // different-ID assistants via `continue`), so any re-appearance of a
 614  // previously-seen ID must NOT create a group boundary. Two scenarios:
 615  //   • Consecutive: streamingToolExecution yields one AssistantMessage per
 616  //     content_block_stop (same id); a fast tool drains between blocks;
 617  //     abort/hook-stop leaves [asst(X), user(trA), asst(X), user(trB)].
 618  //   • Interleaved: coordinator/teammate streams mix different responses
 619  //     so [asst(X), user(trA), asst(Y), user(trB), asst(X), user(trC)].
 620  // In both, normalizeMessagesForAPI merges the X fragments into one wire
 621  // assistant, and their following tool_results merge into one wire user
 622  // message — so the budget must see them as one group too.
 623  const seenAsstIds = new Set<string>()
 624  for (const message of messages) {
 625    if (message.type === 'user') {
 626      current.push(...collectCandidatesFromMessage(message))
 627    } else if (message.type === 'assistant') {
 628      if (!seenAsstIds.has(message.message.id)) {
 629        flush()
 630        seenAsstIds.add(message.message.id)
 631      }
 632    }
 633    // progress / attachment / system are filtered or merged by
 634    // normalizeMessagesForAPI — they don't create wire boundaries.
 635  }
 636  flush()
 637
 638  return groups
 639}
 640
 641/**
 642 * Partition candidates by their prior decision state:
 643 *  - mustReapply: previously replaced → re-apply the cached replacement for
 644 *    prefix stability
 645 *  - frozen: previously seen and left unreplaced → off-limits (replacing
 646 *    now would change a prefix that was already cached)
 647 *  - fresh: never seen → eligible for new replacement decisions
 648 */
 649function partitionByPriorDecision(
 650  candidates: ToolResultCandidate[],
 651  state: ContentReplacementState,
 652): CandidatePartition {
 653  return candidates.reduce<CandidatePartition>(
 654    (acc, c) => {
 655      const replacement = state.replacements.get(c.toolUseId)
 656      if (replacement !== undefined) {
 657        acc.mustReapply.push({ ...c, replacement })
 658      } else if (state.seenIds.has(c.toolUseId)) {
 659        acc.frozen.push(c)
 660      } else {
 661        acc.fresh.push(c)
 662      }
 663      return acc
 664    },
 665    { mustReapply: [], frozen: [], fresh: [] },
 666  )
 667}
 668
 669/**
 670 * Pick the largest fresh results to replace until the model-visible total
 671 * (frozen + remaining fresh) is at or under budget, or fresh is exhausted.
 672 * If frozen results alone exceed budget we accept the overage — microcompact
 673 * will eventually clear them.
 674 */
 675function selectFreshToReplace(
 676  fresh: ToolResultCandidate[],
 677  frozenSize: number,
 678  limit: number,
 679): ToolResultCandidate[] {
 680  const sorted = [...fresh].sort((a, b) => b.size - a.size)
 681  const selected: ToolResultCandidate[] = []
 682  let remaining = frozenSize + fresh.reduce((sum, c) => sum + c.size, 0)
 683  for (const c of sorted) {
 684    if (remaining <= limit) break
 685    selected.push(c)
 686    // We don't know the replacement size until after persist, but previews
 687    // are ~2K and results hitting this path are much larger, so subtracting
 688    // the full size is a close approximation for selection purposes.
 689    remaining -= c.size
 690  }
 691  return selected
 692}
 693
 694/**
 695 * Return a new Message[] where each tool_result block whose id appears in
 696 * replacementMap has its content replaced. Messages and blocks with no
 697 * replacements are passed through by reference.
 698 */
 699function replaceToolResultContents(
 700  messages: Message[],
 701  replacementMap: Map<string, string>,
 702): Message[] {
 703  return messages.map(message => {
 704    if (message.type !== 'user' || !Array.isArray(message.message.content)) {
 705      return message
 706    }
 707    const content = message.message.content
 708    const needsReplace = content.some(
 709      b => b.type === 'tool_result' && replacementMap.has(b.tool_use_id),
 710    )
 711    if (!needsReplace) return message
 712    return {
 713      ...message,
 714      message: {
 715        ...message.message,
 716        content: content.map(block => {
 717          if (block.type !== 'tool_result') return block
 718          const replacement = replacementMap.get(block.tool_use_id)
 719          return replacement === undefined
 720            ? block
 721            : { ...block, content: replacement }
 722        }),
 723      },
 724    }
 725  })
 726}
 727
 728async function buildReplacement(
 729  candidate: ToolResultCandidate,
 730): Promise<{ content: string; originalSize: number } | null> {
 731  const result = await persistToolResult(candidate.content, candidate.toolUseId)
 732  if (isPersistError(result)) return null
 733  return {
 734    content: buildLargeToolResultMessage(result),
 735    originalSize: result.originalSize,
 736  }
 737}
 738
 739/**
 740 * Enforce the per-message budget on aggregate tool result size.
 741 *
 742 * For each user message whose tool_result blocks together exceed the
 743 * per-message limit (see getPerMessageBudgetLimit), the largest FRESH
 744 * (never-before-seen) results in THAT message are persisted to disk and
 745 * replaced with previews.
 746 * Messages are evaluated independently — a 150K result in one message and
 747 * a 150K result in another are both under budget and untouched.
 748 *
 749 * State is tracked by tool_use_id in `state`. Once a result is seen its
 750 * fate is frozen: previously-replaced results get the same replacement
 751 * re-applied every turn from the cached preview string (zero I/O,
 752 * byte-identical), and previously-unreplaced results are never replaced
 753 * later (would break prompt cache).
 754 *
 755 * Each turn adds at most one new user message with tool_result blocks,
 756 * so the per-message loop typically does the budget check at most once;
 757 * all prior messages just re-apply cached replacements.
 758 *
 759 * @param state — MUTATED: seenIds and replacements are updated in place
 760 *   to record choices made this call. The caller holds a stable reference
 761 *   across turns; returning a new object would require error-prone ref
 762 *   updates after every query.
 763 *
 764 * Returns `{ messages, newlyReplaced }`:
 765 *   - messages: same array instance when no replacement is needed
 766 *   - newlyReplaced: replacements made THIS call (not re-applies).
 767 *     Caller persists these to the transcript for resume reconstruction.
 768 */
 769export async function enforceToolResultBudget(
 770  messages: Message[],
 771  state: ContentReplacementState,
 772  skipToolNames: ReadonlySet<string> = new Set(),
 773): Promise<{
 774  messages: Message[]
 775  newlyReplaced: ToolResultReplacementRecord[]
 776}> {
 777  const candidatesByMessage = collectCandidatesByMessage(messages)
 778  const nameByToolUseId =
 779    skipToolNames.size > 0 ? buildToolNameMap(messages) : undefined
 780  const shouldSkip = (id: string): boolean =>
 781    nameByToolUseId !== undefined &&
 782    skipToolNames.has(nameByToolUseId.get(id) ?? '')
 783  // Resolve once per call. A mid-session flag change only affects FRESH
 784  // messages (prior decisions are frozen via seenIds/replacements), so
 785  // prompt cache for already-seen content is preserved regardless.
 786  const limit = getPerMessageBudgetLimit()
 787
 788  // Walk each API-level message group independently. For previously-processed messages
 789  // (all IDs in seenIds) this just re-applies cached replacements. For the
 790  // single new message this turn added, it runs the budget check.
 791  const replacementMap = new Map<string, string>()
 792  const toPersist: ToolResultCandidate[] = []
 793  let reappliedCount = 0
 794  let messagesOverBudget = 0
 795
 796  for (const candidates of candidatesByMessage) {
 797    const { mustReapply, frozen, fresh } = partitionByPriorDecision(
 798      candidates,
 799      state,
 800    )
 801
 802    // Re-apply: pure Map lookups. No file I/O, byte-identical, cannot fail.
 803    mustReapply.forEach(c => replacementMap.set(c.toolUseId, c.replacement))
 804    reappliedCount += mustReapply.length
 805
 806    // Fresh means this is a new message. Check its per-message budget.
 807    // (A previously-processed message has fresh.length === 0 because all
 808    // its IDs were added to seenIds when first seen.)
 809    if (fresh.length === 0) {
 810      // mustReapply/frozen are already in seenIds from their first pass —
 811      // re-adding is a no-op but keeps the invariant explicit.
 812      candidates.forEach(c => state.seenIds.add(c.toolUseId))
 813      continue
 814    }
 815
 816    // Tools with maxResultSizeChars: Infinity (Read) — never persist.
 817    // Mark as seen (frozen) so the decision sticks across turns. They don't
 818    // count toward freshSize; if that lets the group slip under budget and
 819    // the wire message is still large, that's the contract — Read's own
 820    // maxTokens is the bound, not this wrapper.
 821    const skipped = fresh.filter(c => shouldSkip(c.toolUseId))
 822    skipped.forEach(c => state.seenIds.add(c.toolUseId))
 823    const eligible = fresh.filter(c => !shouldSkip(c.toolUseId))
 824
 825    const frozenSize = frozen.reduce((sum, c) => sum + c.size, 0)
 826    const freshSize = eligible.reduce((sum, c) => sum + c.size, 0)
 827
 828    const selected =
 829      frozenSize + freshSize > limit
 830        ? selectFreshToReplace(eligible, frozenSize, limit)
 831        : []
 832
 833    // Mark non-persisting candidates as seen NOW (synchronously). IDs
 834    // selected for persist are marked seen AFTER the await, alongside
 835    // replacements.set — keeps the pair atomic under observation so no
 836    // concurrent reader (once subagents share state) ever sees X∈seenIds
 837    // but X∉replacements, which would misclassify X as frozen and send
 838    // full content while the main thread sends the preview → cache miss.
 839    const selectedIds = new Set(selected.map(c => c.toolUseId))
 840    candidates
 841      .filter(c => !selectedIds.has(c.toolUseId))
 842      .forEach(c => state.seenIds.add(c.toolUseId))
 843
 844    if (selected.length === 0) continue
 845    messagesOverBudget++
 846    toPersist.push(...selected)
 847  }
 848
 849  if (replacementMap.size === 0 && toPersist.length === 0) {
 850    return { messages, newlyReplaced: [] }
 851  }
 852
 853  // Fresh: concurrent persist for all selected candidates across all
 854  // messages. In practice toPersist comes from a single message per turn.
 855  const freshReplacements = await Promise.all(
 856    toPersist.map(async c => [c, await buildReplacement(c)] as const),
 857  )
 858  const newlyReplaced: ToolResultReplacementRecord[] = []
 859  let replacedSize = 0
 860  for (const [candidate, replacement] of freshReplacements) {
 861    // Mark seen HERE, post-await, atomically with replacements.set for
 862    // success cases. For persist failures (replacement === null) the ID
 863    // is seen-but-unreplaced — the original content was sent to the
 864    // model, so treating it as frozen going forward is correct.
 865    state.seenIds.add(candidate.toolUseId)
 866    if (replacement === null) continue
 867    replacedSize += candidate.size
 868    replacementMap.set(candidate.toolUseId, replacement.content)
 869    state.replacements.set(candidate.toolUseId, replacement.content)
 870    newlyReplaced.push({
 871      kind: 'tool-result',
 872      toolUseId: candidate.toolUseId,
 873      replacement: replacement.content,
 874    })
 875    logEvent('tengu_tool_result_persisted_message_budget', {
 876      originalSizeBytes: replacement.originalSize,
 877      persistedSizeBytes: replacement.content.length,
 878      estimatedOriginalTokens: Math.ceil(
 879        replacement.originalSize / BYTES_PER_TOKEN,
 880      ),
 881      estimatedPersistedTokens: Math.ceil(
 882        replacement.content.length / BYTES_PER_TOKEN,
 883      ),
 884    })
 885  }
 886
 887  if (replacementMap.size === 0) {
 888    return { messages, newlyReplaced: [] }
 889  }
 890
 891  if (newlyReplaced.length > 0) {
 892    logForDebugging(
 893      `Per-message budget: persisted ${newlyReplaced.length} tool results ` +
 894        `across ${messagesOverBudget} over-budget message(s), ` +
 895        `shed ~${formatFileSize(replacedSize)}, ${reappliedCount} re-applied`,
 896    )
 897    logEvent('tengu_message_level_tool_result_budget_enforced', {
 898      resultsPersisted: newlyReplaced.length,
 899      messagesOverBudget,
 900      replacedSizeBytes: replacedSize,
 901      reapplied: reappliedCount,
 902    })
 903  }
 904
 905  return {
 906    messages: replaceToolResultContents(messages, replacementMap),
 907    newlyReplaced,
 908  }
 909}
 910
 911/**
 912 * Query-loop integration point for the aggregate budget.
 913 *
 914 * Gates on `state` (undefined means feature disabled → no-op return),
 915 * applies enforcement, and fires an optional transcript-write callback
 916 * for new replacements. The caller (query.ts) owns the persistence gate
 917 * — it passes a callback only for querySources that read records back on
 918 * resume (repl_main_thread*, agent:*); ephemeral runForkedAgent callers
 919 * (agentSummary, sessionMemory, /btw, compact) pass undefined.
 920 *
 921 * @returns messages with replacements applied, or the input array unchanged
 922 *   when the feature is off or no replacement occurred.
 923 */
 924export async function applyToolResultBudget(
 925  messages: Message[],
 926  state: ContentReplacementState | undefined,
 927  writeToTranscript?: (records: ToolResultReplacementRecord[]) => void,
 928  skipToolNames?: ReadonlySet<string>,
 929): Promise<Message[]> {
 930  if (!state) return messages
 931  const result = await enforceToolResultBudget(messages, state, skipToolNames)
 932  if (result.newlyReplaced.length > 0) {
 933    writeToTranscript?.(result.newlyReplaced)
 934  }
 935  return result.messages
 936}
 937
 938/**
 939 * Reconstruct replacement state from content-replacement records loaded from
 940 * the transcript. Used on resume so the budget makes the same choices it
 941 * made in the original session (prompt cache stability).
 942 *
 943 * Accepts the full ContentReplacementRecord[] from LogOption (may include
 944 * future non-tool-result kinds); only tool-result records are applied here.
 945 *
 946 *   - replacements: populated directly from the stored replacement strings.
 947 *     Records for IDs not in messages (e.g. after compact) are skipped —
 948 *     they're inert anyway.
 949 *   - seenIds: every candidate tool_use_id in the loaded messages. A result
 950 *     being in the transcript means it was sent to the model, so it was seen.
 951 *     This freezes unreplaced results against future replacement.
 952 *   - inheritedReplacements: gap-fill for fork-subagent resume. A fork's
 953 *     original run applies parent-inherited replacements via mustReapply
 954 *     (never persisted — not newlyReplaced). On resume the sidechain has
 955 *     the original content but no record, so records alone would classify
 956 *     it as frozen. The parent's live state still has the mapping; copy
 957 *     it for IDs in messages that records don't cover. No-op for non-fork
 958 *     resumes (parent IDs aren't in the subagent's messages).
 959 */
 960export function reconstructContentReplacementState(
 961  messages: Message[],
 962  records: ContentReplacementRecord[],
 963  inheritedReplacements?: ReadonlyMap<string, string>,
 964): ContentReplacementState {
 965  const state = createContentReplacementState()
 966  const candidateIds = new Set(
 967    collectCandidatesByMessage(messages)
 968      .flat()
 969      .map(c => c.toolUseId),
 970  )
 971
 972  for (const id of candidateIds) {
 973    state.seenIds.add(id)
 974  }
 975  for (const r of records) {
 976    if (r.kind === 'tool-result' && candidateIds.has(r.toolUseId)) {
 977      state.replacements.set(r.toolUseId, r.replacement)
 978    }
 979  }
 980  if (inheritedReplacements) {
 981    for (const [id, replacement] of inheritedReplacements) {
 982      if (candidateIds.has(id) && !state.replacements.has(id)) {
 983        state.replacements.set(id, replacement)
 984      }
 985    }
 986  }
 987  return state
 988}
 989
 990/**
 991 * AgentTool-resume variant: encapsulates the feature-flag gate + parent
 992 * gap-fill so both AgentTool.call and resumeAgentBackground share one
 993 * implementation. Returns undefined when parentState is undefined (feature
 994 * off); otherwise reconstructs from sidechain records with parent's live
 995 * replacements filling gaps for fork-inherited mustReapply entries.
 996 *
 997 * Kept out of AgentTool.tsx — that file is at the feature() DCE complexity
 998 * cliff and cannot tolerate even +1 net source line without silently
 999 * breaking feature('TRANSCRIPT_CLASSIFIER') eval in tests.
1000 */
1001export function reconstructForSubagentResume(
1002  parentState: ContentReplacementState | undefined,
1003  resumedMessages: Message[],
1004  sidechainRecords: ContentReplacementRecord[],
1005): ContentReplacementState | undefined {
1006  if (!parentState) return undefined
1007  return reconstructContentReplacementState(
1008    resumedMessages,
1009    sidechainRecords,
1010    parentState.replacements,
1011  )
1012}
1013
/**
 * Turn a filesystem error into a short human-readable message.
 *
 * Errors without a `code` fall back to their plain `message`. Recognized
 * codes get a fixed description (with the offending path when the error
 * carries one); any other code is rendered as `CODE: message`.
 *
 * @param error — the error to describe
 * @returns the message text to show
 */
function getFileSystemErrorMessage(error: Error): string {
  // Node.js filesystem errors carry extra `code` and `path` properties.
  // eslint-disable-next-line no-restricted-syntax -- uses .path, not just .code
  const fsError = error as NodeJS.ErrnoException
  const { code } = fsError
  if (!code) {
    return error.message
  }

  // Path-bearing messages share the same placeholder when path is absent.
  const location = fsError.path ?? 'unknown path'
  if (code === 'ENOENT') return `Directory not found: ${location}`
  if (code === 'EACCES') return `Permission denied: ${location}`
  if (code === 'EEXIST') return `File already exists: ${location}`

  if (code === 'ENOSPC') return 'No space left on device'
  if (code === 'EROFS') return 'Read-only file system'
  if (code === 'EMFILE') return 'Too many open files'

  // Unrecognized code: surface both the code and the original message.
  return `${code}: ${fsError.message}`
}