// Source dump of Claude Code — at main, 1040 lines, 38 kB (raw view).
/**
 * Utility for persisting large tool results to disk instead of truncating them.
 */

import type { ToolResultBlockParam } from '@anthropic-ai/sdk/resources/index.mjs'
import { mkdir, writeFile } from 'fs/promises'
import { join } from 'path'
import { getOriginalCwd, getSessionId } from '../bootstrap/state.js'
import {
  BYTES_PER_TOKEN,
  DEFAULT_MAX_RESULT_SIZE_CHARS,
  MAX_TOOL_RESULT_BYTES,
  MAX_TOOL_RESULTS_PER_MESSAGE_CHARS,
} from '../constants/toolLimits.js'
import { getFeatureValue_CACHED_MAY_BE_STALE } from '../services/analytics/growthbook.js'
import { logEvent } from '../services/analytics/index.js'
import { sanitizeToolNameForAnalytics } from '../services/analytics/metadata.js'
import type { Message } from '../types/message.js'
import { logForDebugging } from './debug.js'
import { getErrnoCode, toError } from './errors.js'
import { formatFileSize } from './format.js'
import { logError } from './log.js'
import { getProjectDir } from './sessionStorage.js'
import { jsonStringify } from './slowOperations.js'

// Subdirectory name for tool results within a session
export const TOOL_RESULTS_SUBDIR = 'tool-results'

// XML tag used to wrap persisted output messages
export const PERSISTED_OUTPUT_TAG = '<persisted-output>'
export const PERSISTED_OUTPUT_CLOSING_TAG = '</persisted-output>'

// Message used when tool result content was cleared without persisting to file
export const TOOL_RESULT_CLEARED_MESSAGE = '[Old tool result content cleared]'
/**
 * GrowthBook override map: tool name -> persistence threshold (chars).
 * When a tool name is present in this map, that value is used directly as the
 * effective threshold, bypassing the Math.min() clamp against the 50k default.
 * Tools absent from the map use the hardcoded fallback.
 * Flag default is {} (no overrides == behavior unchanged).
 */
const PERSIST_THRESHOLD_OVERRIDE_FLAG = 'tengu_satin_quoll'

/**
 * Resolve the effective persistence threshold for a tool.
 * GrowthBook override wins when present; otherwise falls back to the declared
 * per-tool cap clamped by the global default.
 *
 * Defensive: GrowthBook's cache returns `cached !== undefined ? cached : default`,
 * so a flag served as `null` (or any other non-map value) leaks through. Only a
 * plain object's OWN property is consulted, so a string/array flag (whose
 * `length` is a number) or an inherited key can never be mistaken for an
 * override; every such value falls through to the hardcoded default.
 *
 * @param toolName - Tool whose threshold is being resolved (override map key).
 * @param declaredMaxResultSizeChars - The tool's declared cap; a non-finite
 *   value (e.g. Infinity) is returned unchanged as a hard opt-out.
 * @returns The threshold in chars: the override if it is a finite positive
 *   number, else min(declared cap, DEFAULT_MAX_RESULT_SIZE_CHARS).
 */
export function getPersistenceThreshold(
  toolName: string,
  declaredMaxResultSizeChars: number,
): number {
  // Infinity = hard opt-out. Read self-bounds via maxTokens; persisting its
  // output to a file the model reads back with Read is circular. Checked
  // before the GB override so tengu_satin_quoll can't force it back on.
  if (!Number.isFinite(declaredMaxResultSizeChars)) {
    return declaredMaxResultSizeChars
  }
  // Treat the flag payload as untrusted: it is typed as a map but may be
  // anything the flag service happens to serve.
  const overrides: unknown = getFeatureValue_CACHED_MAY_BE_STALE<Record<
    string,
    number
  > | null>(PERSIST_THRESHOLD_OVERRIDE_FLAG, {})
  if (
    typeof overrides === 'object' &&
    overrides !== null &&
    !Array.isArray(overrides) &&
    Object.prototype.hasOwnProperty.call(overrides, toolName)
  ) {
    const override = (overrides as Record<string, unknown>)[toolName]
    if (
      typeof override === 'number' &&
      Number.isFinite(override) &&
      override > 0
    ) {
      return override
    }
  }
  return Math.min(declaredMaxResultSizeChars, DEFAULT_MAX_RESULT_SIZE_CHARS)
}

// Result of persisting a tool result to disk
export type PersistedToolResult = {
  filepath: string
  originalSize: number
  isJson: boolean
  preview: string
  hasMore: boolean
}

// Error result when persistence fails
export type PersistToolResultError = {
  error: string
}

/**
 * Get the session directory (projectDir/sessionId)
 */
function getSessionDir(): string {
  return join(getProjectDir(getOriginalCwd()), getSessionId())
}
/**
 * Get the tool results directory for this session
 * (projectDir/sessionId/tool-results)
 */
export function getToolResultsDir(): string {
  return join(getSessionDir(), TOOL_RESULTS_SUBDIR)
}

// Preview size in bytes for the reference message
export const PREVIEW_SIZE_BYTES = 2000

/**
 * Get the filepath where a tool result would be persisted.
 *
 * @param id - Tool use id; used verbatim as the file's base name.
 * @param isJson - Selects the `.json` extension when true, `.txt` otherwise.
 */
export function getToolResultPath(id: string, isJson: boolean): string {
  const ext = isJson ? 'json' : 'txt'
  return join(getToolResultsDir(), `${id}.${ext}`)
}

/**
 * Ensure the session-specific tool results directory exists.
 *
 * Best-effort: never throws. A recursive mkdir does not fail when the
 * directory already exists, so anything caught here is a real failure
 * (permissions, read-only filesystem, ...). It is logged for debugging and
 * otherwise left for the subsequent file write to report.
 */
export async function ensureToolResultsDir(): Promise<void> {
  try {
    await mkdir(getToolResultsDir(), { recursive: true })
  } catch (error) {
    logForDebugging(
      `Failed to create tool results directory: ${toError(error).message}`,
    )
  }
}

/**
 * Persist a tool result to disk and return information about the persisted file
 *
 * @param content - The tool result content to persist (string or array of content blocks)
 * @param toolUseId - The ID of the tool use that produced the result
 * @returns Information about the persisted file including filepath and preview,
 *   or an `{ error }` object when the content has non-text blocks or the write
 *   fails for any reason other than the file already existing.
 */
export async function persistToolResult(
  content: NonNullable<ToolResultBlockParam['content']>,
  toolUseId: string,
): Promise<PersistedToolResult | PersistToolResultError> {
  const isJson = Array.isArray(content)

  // Check for non-text content - we can only persist text blocks
  if (isJson) {
    const hasNonTextContent = content.some(block => block.type !== 'text')
    if (hasNonTextContent) {
      return {
        error: 'Cannot persist tool results containing non-text content',
      }
    }
  }

  await ensureToolResultsDir()
  const filepath = getToolResultPath(toolUseId, isJson)
  const contentStr = isJson ? jsonStringify(content, null, 2) : content

  // tool_use_id is unique per invocation and content is deterministic for a
  // given id, so skip if the file already exists. This prevents re-writing
  // the same content on every API turn when microcompact replays the
  // original messages. Use 'wx' instead of a stat-then-write race.
  try {
    await writeFile(filepath, contentStr, { encoding: 'utf-8', flag: 'wx' })
    logForDebugging(
      `Persisted tool result to ${filepath} (${formatFileSize(contentStr.length)})`,
    )
  } catch (error) {
    if (getErrnoCode(error) !== 'EEXIST') {
      logError(toError(error))
      return { error: getFileSystemErrorMessage(toError(error)) }
    }
    // EEXIST: already persisted on a prior turn, fall through to preview
  }

  // Generate a preview
  const { preview, hasMore } = generatePreview(contentStr, PREVIEW_SIZE_BYTES)

  return {
    filepath,
    // Size is the string length (UTF-16 code units), not an encoded byte count.
    originalSize: contentStr.length,
    isJson,
    preview,
    hasMore,
  }
}

/**
 * Build a message for large tool results with preview
 *
 * @param result - Outcome of a successful persistToolResult call.
 * @returns The persisted-output wrapper: size/path header, the preview, an
 *   ellipsis line when the preview was truncated, and the closing tag.
 */
export function buildLargeToolResultMessage(
  result: PersistedToolResult,
): string {
  let message = `${PERSISTED_OUTPUT_TAG}\n`
  message += `Output too large (${formatFileSize(result.originalSize)}). Full output saved to: ${result.filepath}\n\n`
  message += `Preview (first ${formatFileSize(PREVIEW_SIZE_BYTES)}):\n`
  message += result.preview
  message += result.hasMore ? '\n...\n' : '\n'
  message += PERSISTED_OUTPUT_CLOSING_TAG
  return message
}
/**
 * Process a tool result for inclusion in a message.
 * Maps the result to the API format and persists large results to disk.
 *
 * @param tool - Name, declared size cap, and result mapper of the tool.
 * @param toolUseResult - Raw tool output handed to the tool's mapper.
 * @param toolUseID - ID of the tool use that produced the result.
 * @returns The mapped block, possibly with its content replaced.
 */
export async function processToolResultBlock<T>(
  tool: {
    name: string
    maxResultSizeChars: number
    mapToolResultToToolResultBlockParam: (
      result: T,
      toolUseID: string,
    ) => ToolResultBlockParam
  },
  toolUseResult: T,
  toolUseID: string,
): Promise<ToolResultBlockParam> {
  const toolResultBlock = tool.mapToolResultToToolResultBlockParam(
    toolUseResult,
    toolUseID,
  )
  return maybePersistLargeToolResult(
    toolResultBlock,
    tool.name,
    getPersistenceThreshold(tool.name, tool.maxResultSizeChars),
  )
}

/**
 * Process a pre-mapped tool result block. Applies persistence for large results
 * without re-calling mapToolResultToToolResultBlockParam.
 *
 * @param toolResultBlock - Block already in API format.
 * @param toolName - Tool name, used for threshold resolution and analytics.
 * @param maxResultSizeChars - The tool's declared size cap.
 */
export async function processPreMappedToolResultBlock(
  toolResultBlock: ToolResultBlockParam,
  toolName: string,
  maxResultSizeChars: number,
): Promise<ToolResultBlockParam> {
  return maybePersistLargeToolResult(
    toolResultBlock,
    toolName,
    getPersistenceThreshold(toolName, maxResultSizeChars),
  )
}

/**
 * True when a tool_result's content is empty or effectively empty. Covers:
 * undefined/null/'', whitespace-only strings, empty arrays, and arrays whose
 * only blocks are text blocks with empty/whitespace text. Non-text blocks
 * (images, tool_reference) are treated as non-empty.
 */
export function isToolResultContentEmpty(
  content: ToolResultBlockParam['content'],
): boolean {
  if (!content) return true
  if (typeof content === 'string') return content.trim() === ''
  if (!Array.isArray(content)) return false
  if (content.length === 0) return true
  return content.every(
    block =>
      typeof block === 'object' &&
      'type' in block &&
      block.type === 'text' &&
      'text' in block &&
      // A text block whose `text` is not a string carries nothing either.
      (typeof block.text !== 'string' || block.text.trim() === ''),
  )
}
/**
 * Handle large tool results by persisting to disk instead of truncating.
 * Returns the original block if no persistence needed, or a modified block
 * with the content replaced by a reference to the persisted file.
 *
 * @param toolResultBlock - Block in API format.
 * @param toolName - Tool name, used in the empty-output marker and analytics.
 * @param persistenceThreshold - Size above which content is persisted;
 *   defaults to MAX_TOOL_RESULT_BYTES when omitted.
 */
async function maybePersistLargeToolResult(
  toolResultBlock: ToolResultBlockParam,
  toolName: string,
  persistenceThreshold?: number,
): Promise<ToolResultBlockParam> {
  // Check size first before doing any async work - most tool results are small
  const content = toolResultBlock.content

  // inc-4586: Empty tool_result content at the prompt tail causes some models
  // (notably capybara) to emit the \n\nHuman: stop sequence and end their turn
  // with zero output. The server renderer inserts no \n\nAssistant: marker after
  // tool results, so a bare </function_results>\n\n pattern-matches to a turn
  // boundary. Several tools can legitimately produce empty output (silent-success
  // shell commands, MCP servers returning content:[], REPL statements, etc.).
  // Inject a short marker so the model always has something to react to.
  if (isToolResultContentEmpty(content)) {
    logEvent('tengu_tool_empty_result', {
      toolName: sanitizeToolNameForAnalytics(toolName),
    })
    return {
      ...toolResultBlock,
      content: `(${toolName} completed with no output)`,
    }
  }
  // Narrow after the emptiness guard — content is non-nullish past this point.
  if (!content) {
    return toolResultBlock
  }

  // Skip persistence for image content blocks - they need to be sent as-is to Claude
  if (hasImageBlock(content)) {
    return toolResultBlock
  }

  const size = contentSize(content)

  // Use tool-specific threshold if provided, otherwise fall back to global limit
  const threshold = persistenceThreshold ?? MAX_TOOL_RESULT_BYTES
  if (size <= threshold) {
    return toolResultBlock
  }

  // Persist the entire content as a unit
  const result = await persistToolResult(content, toolResultBlock.tool_use_id)
  if (isPersistError(result)) {
    // If persistence failed, return the original block unchanged
    return toolResultBlock
  }

  const message = buildLargeToolResultMessage(result)

  // Log analytics
  logEvent('tengu_tool_result_persisted', {
    toolName: sanitizeToolNameForAnalytics(toolName),
    originalSizeBytes: result.originalSize,
    persistedSizeBytes: message.length,
    estimatedOriginalTokens: Math.ceil(result.originalSize / BYTES_PER_TOKEN),
    estimatedPersistedTokens: Math.ceil(message.length / BYTES_PER_TOKEN),
    thresholdUsed: threshold,
  })

  return { ...toolResultBlock, content: message }
}

/**
 * Generate a preview of content, truncating at a newline boundary when possible.
 *
 * @param content - Full text to preview.
 * @param maxBytes - Maximum preview length. Compared against the string's
 *   length, i.e. it is measured in UTF-16 code units rather than encoded bytes.
 * @returns The preview and whether any content was cut off. The preview never
 *   ends on the first half of a surrogate pair.
 */
export function generatePreview(
  content: string,
  maxBytes: number,
): { preview: string; hasMore: boolean } {
  if (content.length <= maxBytes) {
    return { preview: content, hasMore: false }
  }

  // Find the last newline within the limit to avoid cutting mid-line
  const truncated = content.slice(0, maxBytes)
  const lastNewline = truncated.lastIndexOf('\n')

  // If we found a newline reasonably close to the limit, use it
  // Otherwise fall back to the exact limit
  let cutPoint = lastNewline > maxBytes * 0.5 ? lastNewline : maxBytes

  // A hard cut can land between the two halves of a surrogate pair (emoji,
  // astral-plane characters). Back off one unit so the preview never ends in
  // a lone high surrogate, which is not valid Unicode text once serialized.
  if (cutPoint > 0) {
    const before = content.charCodeAt(cutPoint - 1)
    const after = content.charCodeAt(cutPoint)
    const splitsSurrogatePair =
      before >= 0xd800 &&
      before <= 0xdbff &&
      after >= 0xdc00 &&
      after <= 0xdfff
    if (splitsSurrogatePair) {
      cutPoint -= 1
    }
  }

  return { preview: content.slice(0, cutPoint), hasMore: true }
}

/**
 * Type guard to check if persist result is an error
 */
export function isPersistError(
  result: PersistedToolResult | PersistToolResultError,
): result is PersistToolResultError {
  return 'error' in result
}

// --- Message-level aggregate tool result budget ---
//
// Tracks replacement state across turns so enforceToolResultBudget makes the
// same choices every time (preserves prompt cache prefix).

/**
 * Per-conversation-thread state for the aggregate tool result budget.
 * State must be stable to preserve prompt cache:
 * - seenIds: results that have passed through the budget check (replaced
 *   or not). Once seen, a result's fate is frozen for the conversation.
 * - replacements: subset of seenIds that were persisted to disk and
 *   replaced with previews, mapped to the exact preview string shown to
 *   the model. Re-application is a Map lookup — no file I/O, guaranteed
 *   byte-identical, cannot fail.
 *
 * Lifecycle: one instance per conversation thread, carried on ToolUseContext.
 * Main thread: REPL provisions once, never resets — stale entries after
 * /clear, rewind, resume, or compact are never looked up (tool_use_ids are
 * UUIDs) so they're harmless. Subagents: createSubagentContext clones the
 * parent's state by default (cache-sharing forks like agentSummary need
 * identical decisions), or resumeAgentBackground threads one reconstructed
 * from sidechain records.
 */
export type ContentReplacementState = {
  seenIds: Set<string>
  replacements: Map<string, string>
}

/** Create an empty replacement state (nothing seen, nothing replaced). */
export function createContentReplacementState(): ContentReplacementState {
  return { seenIds: new Set(), replacements: new Map() }
}
/**
 * Clone replacement state for a cache-sharing fork (e.g. agentSummary).
 * The fork needs state identical to the source at fork time so
 * enforceToolResultBudget makes the same choices → same wire prefix →
 * prompt cache hit. Mutating the clone does not affect the source.
 *
 * @param source - State to copy; its Set and Map are copied, not shared.
 */
export function cloneContentReplacementState(
  source: ContentReplacementState,
): ContentReplacementState {
  return {
    seenIds: new Set(source.seenIds),
    replacements: new Map(source.replacements),
  }
}

/**
 * Resolve the per-message aggregate budget limit. GrowthBook override
 * (tengu_hawthorn_window) wins when present and a finite positive number;
 * otherwise falls back to the hardcoded constant. Defensive typeof/finite
 * check: GrowthBook's cache returns `cached !== undefined ? cached : default`,
 * so a flag served as null/string/NaN leaks through.
 *
 * @returns The override when valid, else MAX_TOOL_RESULTS_PER_MESSAGE_CHARS.
 */
export function getPerMessageBudgetLimit(): number {
  const override = getFeatureValue_CACHED_MAY_BE_STALE<number | null>(
    'tengu_hawthorn_window',
    null,
  )
  if (
    typeof override === 'number' &&
    Number.isFinite(override) &&
    override > 0
  ) {
    return override
  }
  return MAX_TOOL_RESULTS_PER_MESSAGE_CHARS
}
/**
 * Provision replacement state for a new conversation thread.
 *
 * Encapsulates the feature-flag gate + reconstruct-vs-fresh choice:
 * - Flag off → undefined (query.ts skips enforcement entirely)
 * - No initialMessages (cold start) → fresh
 * - initialMessages present → reconstruct (freeze all candidate IDs so the
 *   budget never replaces content the model already saw unreplaced). Empty
 *   or absent records freeze everything; non-empty records additionally
 *   populate the replacements Map for byte-identical re-apply.
 */
export function provisionContentReplacementState(
  initialMessages?: Message[],
  initialContentReplacements?: ContentReplacementRecord[],
): ContentReplacementState | undefined {
  const enabled = getFeatureValue_CACHED_MAY_BE_STALE(
    'tengu_hawthorn_steeple',
    false,
  )
  if (!enabled) return undefined
  if (initialMessages) {
    return reconstructContentReplacementState(
      initialMessages,
      initialContentReplacements ?? [],
    )
  }
  return createContentReplacementState()
}

/**
 * Serializable record of one content-replacement decision. Written to the
 * transcript as a ContentReplacementEntry so decisions survive resume.
 * Discriminated by `kind` so future replacement mechanisms (user text,
 * offloaded images) can share the same transcript entry type.
 *
 * `replacement` is the exact string the model saw — stored rather than
 * derived on resume so code changes to the preview template, size formatting,
 * or path layout can't silently break prompt cache.
 */
export type ContentReplacementRecord = {
  kind: 'tool-result'
  toolUseId: string
  replacement: string
}

// The tool-result member of ContentReplacementRecord.
export type ToolResultReplacementRecord = Extract<
  ContentReplacementRecord,
  { kind: 'tool-result' }
>

// One tool_result block considered by the budget, with its measured size.
type ToolResultCandidate = {
  toolUseId: string
  content: NonNullable<ToolResultBlockParam['content']>
  size: number
}

// Candidates of one message group, split by what was decided for them before.
type CandidatePartition = {
  mustReapply: Array<ToolResultCandidate & { replacement: string }>
  frozen: ToolResultCandidate[]
  fresh: ToolResultCandidate[]
}

/** True when string content already starts with the persisted-output tag. */
function isContentAlreadyCompacted(
  content: ToolResultBlockParam['content'],
): boolean {
  // All budget-produced content starts with the tag (buildLargeToolResultMessage).
  // `.startsWith()` avoids false-positives when the tag appears anywhere else
  // in the content (e.g., reading this source file).
  return typeof content === 'string' && content.startsWith(PERSISTED_OUTPUT_TAG)
}

/** True when array content contains at least one block of type 'image'. */
function hasImageBlock(
  content: NonNullable<ToolResultBlockParam['content']>,
): boolean {
  return (
    Array.isArray(content) &&
    content.some(
      b => typeof b === 'object' && 'type' in b && b.type === 'image',
    )
  )
}

/**
 * Size of tool result content: string length, or the summed text lengths of
 * an array's text blocks. Non-text blocks, and text blocks whose `text` is not
 * a string (a shape isToolResultContentEmpty also tolerates), count as 0
 * instead of throwing.
 */
function contentSize(
  content: NonNullable<ToolResultBlockParam['content']>,
): number {
  if (typeof content === 'string') return content.length
  // Sum text-block lengths directly. Slightly under-counts vs serialized
  // (no JSON framing), but the budget is a rough token heuristic anyway.
  // Avoids allocating a content-sized string every enforcement pass.
  return content.reduce(
    (sum, b) =>
      sum +
      (b.type === 'text' && typeof b.text === 'string' ? b.text.length : 0),
    0,
  )
}

/**
 * Walk messages and build tool_use_id → tool_name from assistant tool_use
 * blocks. tool_use always precedes its tool_result (model calls, then result
 * arrives), so by the time budget enforcement sees a result, its name is known.
 */
function buildToolNameMap(messages: Message[]): Map<string, string> {
  const map = new Map<string, string>()
  for (const message of messages) {
    if (message.type !== 'assistant') continue
    const content = message.message.content
    if (!Array.isArray(content)) continue
    for (const block of content) {
      if (block.type === 'tool_use') {
        map.set(block.id, block.name)
      }
    }
  }
  return map
}
/**
 * Extract candidate tool_result blocks from a single user message: blocks
 * that are non-empty, non-image, and not already compacted by tag (i.e. by
 * the per-tool limit, or an earlier iteration of this same query call).
 * Returns [] for messages with no eligible blocks.
 *
 * @param message - Any message; non-user messages and user messages whose
 *   content is not an array yield no candidates.
 * @returns One candidate per eligible block, in block order, each carrying
 *   its tool_use_id, original content, and measured size.
 */
function collectCandidatesFromMessage(message: Message): ToolResultCandidate[] {
  if (message.type !== 'user' || !Array.isArray(message.message.content)) {
    return []
  }
  return message.message.content.flatMap(block => {
    if (block.type !== 'tool_result' || !block.content) return []
    if (isContentAlreadyCompacted(block.content)) return []
    if (hasImageBlock(block.content)) return []
    return [
      {
        toolUseId: block.tool_use_id,
        content: block.content,
        size: contentSize(block.content),
      },
    ]
  })
}
/**
 * Extract candidate tool_result blocks grouped by API-level user message.
 *
 * normalizeMessagesForAPI merges consecutive user messages into one
 * (Bedrock compat; 1P does the same server-side), so parallel tool
 * results that arrive as N separate user messages in our state become
 * ONE user message on the wire. The budget must group the same way or
 * it would see N under-budget messages instead of one over-budget
 * message and fail to enforce exactly when it matters most.
 *
 * A "group" is a maximal run of user messages NOT separated by an
 * assistant message. Only assistant messages create wire-level
 * boundaries — normalizeMessagesForAPI filters out progress entirely
 * and merges attachment / system(local_command) INTO adjacent user
 * blocks, so those types do NOT break groups here either.
 *
 * This matters for abort-during-parallel-tools paths: agent_progress
 * messages (non-ephemeral, persisted in REPL state) can interleave
 * between fresh tool_result messages. If we flushed on progress, those
 * tool_results would split into under-budget groups, slip through
 * unreplaced, get frozen, then be merged by normalizeMessagesForAPI
 * into one over-budget wire message — defeating the feature.
 *
 * Only groups with at least one eligible candidate are returned.
 */
function collectCandidatesByMessage(
  messages: Message[],
): ToolResultCandidate[][] {
  const groups: ToolResultCandidate[][] = []
  let current: ToolResultCandidate[] = []

  // Close the group being accumulated; empty groups are not recorded.
  const flush = () => {
    if (current.length > 0) groups.push(current)
    current = []
  }

  // Track all assistant message.ids seen so far — same-ID fragments are
  // merged by normalizeMessagesForAPI (messages.ts ~2126 walks back PAST
  // different-ID assistants via `continue`), so any re-appearance of a
  // previously-seen ID must NOT create a group boundary. Two scenarios:
  //   • Consecutive: streamingToolExecution yields one AssistantMessage per
  //     content_block_stop (same id); a fast tool drains between blocks;
  //     abort/hook-stop leaves [asst(X), user(trA), asst(X), user(trB)].
  //   • Interleaved: coordinator/teammate streams mix different responses
  //     so [asst(X), user(trA), asst(Y), user(trB), asst(X), user(trC)].
  // In both, normalizeMessagesForAPI merges the X fragments into one wire
  // assistant, and their following tool_results merge into one wire user
  // message — so the budget must see them as one group too.
  const seenAsstIds = new Set<string>()
  for (const message of messages) {
    if (message.type === 'user') {
      current.push(...collectCandidatesFromMessage(message))
    } else if (message.type === 'assistant') {
      if (!seenAsstIds.has(message.message.id)) {
        flush()
        seenAsstIds.add(message.message.id)
      }
    }
    // progress / attachment / system are filtered or merged by
    // normalizeMessagesForAPI — they don't create wire boundaries.
  }
  flush()

  return groups
}

/**
 * Partition candidates by their prior decision state:
 * - mustReapply: previously replaced → re-apply the cached replacement for
 *   prefix stability
 * - frozen: previously seen and left unreplaced → off-limits (replacing
 *   now would change a prefix that was already cached)
 * - fresh: never seen → eligible for new replacement decisions
 */
function partitionByPriorDecision(
  candidates: ToolResultCandidate[],
  state: ContentReplacementState,
): CandidatePartition {
  return candidates.reduce<CandidatePartition>(
    (acc, c) => {
      const replacement = state.replacements.get(c.toolUseId)
      if (replacement !== undefined) {
        acc.mustReapply.push({ ...c, replacement })
      } else if (state.seenIds.has(c.toolUseId)) {
        acc.frozen.push(c)
      } else {
        acc.fresh.push(c)
      }
      return acc
    },
    { mustReapply: [], frozen: [], fresh: [] },
  )
}

/**
 * Pick the largest fresh results to replace until the model-visible total
 * (frozen + remaining fresh) is at or under budget, or fresh is exhausted.
 * If frozen results alone exceed budget we accept the overage — microcompact
 * will eventually clear them.
 *
 * @param fresh - Never-seen candidates; not mutated (a sorted copy is walked).
 * @param frozenSize - Combined size of candidates that cannot be replaced.
 * @param limit - Per-message budget.
 * @returns The selected candidates, largest first.
 */
function selectFreshToReplace(
  fresh: ToolResultCandidate[],
  frozenSize: number,
  limit: number,
): ToolResultCandidate[] {
  const sorted = [...fresh].sort((a, b) => b.size - a.size)
  const selected: ToolResultCandidate[] = []
  let remaining = frozenSize + fresh.reduce((sum, c) => sum + c.size, 0)
  for (const c of sorted) {
    if (remaining <= limit) break
    selected.push(c)
    // We don't know the replacement size until after persist, but previews
    // are ~2K and results hitting this path are much larger, so subtracting
    // the full size is a close approximation for selection purposes.
    remaining -= c.size
  }
  return selected
}

/**
 * Return a new Message[] where each tool_result block whose id appears in
 * replacementMap has its content replaced. Messages and blocks with no
 * replacements are passed through by reference.
 */
function replaceToolResultContents(
  messages: Message[],
  replacementMap: Map<string, string>,
): Message[] {
  return messages.map(message => {
    if (message.type !== 'user' || !Array.isArray(message.message.content)) {
      return message
    }
    const content = message.message.content
    const needsReplace = content.some(
      b => b.type === 'tool_result' && replacementMap.has(b.tool_use_id),
    )
    if (!needsReplace) return message
    return {
      ...message,
      message: {
        ...message.message,
        content: content.map(block => {
          if (block.type !== 'tool_result') return block
          const replacement = replacementMap.get(block.tool_use_id)
          return replacement === undefined
            ? block
            : { ...block, content: replacement }
        }),
      },
    }
  })
}

/**
 * Persist one candidate and build the preview message that replaces it.
 *
 * @returns The replacement text and the persisted content's original size,
 *   or null when persisting failed.
 */
async function buildReplacement(
  candidate: ToolResultCandidate,
): Promise<{ content: string; originalSize: number } | null> {
  const result = await persistToolResult(candidate.content, candidate.toolUseId)
  if (isPersistError(result)) return null
  return {
    content: buildLargeToolResultMessage(result),
    originalSize: result.originalSize,
  }
}
/**
 * Enforce the per-message budget on aggregate tool result size.
 *
 * For each user message whose tool_result blocks together exceed the
 * per-message limit (see getPerMessageBudgetLimit), the largest FRESH
 * (never-before-seen) results in THAT message are persisted to disk and
 * replaced with previews.
 * Messages are evaluated independently — a 150K result in one message and
 * a 150K result in another are both under budget and untouched.
 *
 * State is tracked by tool_use_id in `state`. Once a result is seen its
 * fate is frozen: previously-replaced results get the same replacement
 * re-applied every turn from the cached preview string (zero I/O,
 * byte-identical), and previously-unreplaced results are never replaced
 * later (would break prompt cache).
 *
 * Each turn adds at most one new user message with tool_result blocks,
 * so the per-message loop typically does the budget check at most once;
 * all prior messages just re-apply cached replacements.
 *
 * @param messages — conversation messages to scan; never mutated.
 * @param state — MUTATED: seenIds and replacements are updated in place
 *   to record choices made this call. The caller holds a stable reference
 *   across turns; returning a new object would require error-prone ref
 *   updates after every query.
 * @param skipToolNames — tool names whose results are never persisted; they
 *   are marked seen and excluded from the budget sum.
 *
 * Returns `{ messages, newlyReplaced }`:
 *   - messages: same array instance when no replacement is needed
 *   - newlyReplaced: replacements made THIS call (not re-applies).
 *     Caller persists these to the transcript for resume reconstruction.
 */
export async function enforceToolResultBudget(
  messages: Message[],
  state: ContentReplacementState,
  skipToolNames: ReadonlySet<string> = new Set(),
): Promise<{
  messages: Message[]
  newlyReplaced: ToolResultReplacementRecord[]
}> {
  const candidatesByMessage = collectCandidatesByMessage(messages)
  // The id → name map is only needed (and only built) when there is a skip list.
  const nameByToolUseId =
    skipToolNames.size > 0 ? buildToolNameMap(messages) : undefined
  const shouldSkip = (id: string): boolean =>
    nameByToolUseId !== undefined &&
    skipToolNames.has(nameByToolUseId.get(id) ?? '')
  // Resolve once per call. A mid-session flag change only affects FRESH
  // messages (prior decisions are frozen via seenIds/replacements), so
  // prompt cache for already-seen content is preserved regardless.
  const limit = getPerMessageBudgetLimit()

  // Walk each API-level message group independently. For previously-processed messages
  // (all IDs in seenIds) this just re-applies cached replacements. For the
  // single new message this turn added, it runs the budget check.
  const replacementMap = new Map<string, string>()
  const toPersist: ToolResultCandidate[] = []
  let reappliedCount = 0
  let messagesOverBudget = 0

  for (const candidates of candidatesByMessage) {
    const { mustReapply, frozen, fresh } = partitionByPriorDecision(
      candidates,
      state,
    )

    // Re-apply: pure Map lookups. No file I/O, byte-identical, cannot fail.
    mustReapply.forEach(c => replacementMap.set(c.toolUseId, c.replacement))
    reappliedCount += mustReapply.length

    // Fresh means this is a new message. Check its per-message budget.
    // (A previously-processed message has fresh.length === 0 because all
    // its IDs were added to seenIds when first seen.)
    if (fresh.length === 0) {
      // mustReapply/frozen are already in seenIds from their first pass —
      // re-adding is a no-op but keeps the invariant explicit.
      candidates.forEach(c => state.seenIds.add(c.toolUseId))
      continue
    }

    // Tools with maxResultSizeChars: Infinity (Read) — never persist.
    // Mark as seen (frozen) so the decision sticks across turns. They don't
    // count toward freshSize; if that lets the group slip under budget and
    // the wire message is still large, that's the contract — Read's own
    // maxTokens is the bound, not this wrapper.
    const skipped = fresh.filter(c => shouldSkip(c.toolUseId))
    skipped.forEach(c => state.seenIds.add(c.toolUseId))
    const eligible = fresh.filter(c => !shouldSkip(c.toolUseId))

    const frozenSize = frozen.reduce((sum, c) => sum + c.size, 0)
    const freshSize = eligible.reduce((sum, c) => sum + c.size, 0)

    const selected =
      frozenSize + freshSize > limit
        ? selectFreshToReplace(eligible, frozenSize, limit)
        : []

    // Mark non-persisting candidates as seen NOW (synchronously). IDs
    // selected for persist are marked seen AFTER the await, alongside
    // replacements.set — keeps the pair atomic under observation so no
    // concurrent reader (once subagents share state) ever sees X∈seenIds
    // but X∉replacements, which would misclassify X as frozen and send
    // full content while the main thread sends the preview → cache miss.
    const selectedIds = new Set(selected.map(c => c.toolUseId))
    candidates
      .filter(c => !selectedIds.has(c.toolUseId))
      .forEach(c => state.seenIds.add(c.toolUseId))

    if (selected.length === 0) continue
    messagesOverBudget++
    toPersist.push(...selected)
  }

  if (replacementMap.size === 0 && toPersist.length === 0) {
    return { messages, newlyReplaced: [] }
  }

  // Fresh: concurrent persist for all selected candidates across all
  // messages. In practice toPersist comes from a single message per turn.
  const freshReplacements = await Promise.all(
    toPersist.map(async c => [c, await buildReplacement(c)] as const),
  )
  const newlyReplaced: ToolResultReplacementRecord[] = []
  let replacedSize = 0
  for (const [candidate, replacement] of freshReplacements) {
    // Mark seen HERE, post-await, atomically with replacements.set for
    // success cases. For persist failures (replacement === null) the ID
    // is seen-but-unreplaced — the original content was sent to the
    // model, so treating it as frozen going forward is correct.
    state.seenIds.add(candidate.toolUseId)
    if (replacement === null) continue
    replacedSize += candidate.size
    replacementMap.set(candidate.toolUseId, replacement.content)
    state.replacements.set(candidate.toolUseId, replacement.content)
    newlyReplaced.push({
      kind: 'tool-result',
      toolUseId: candidate.toolUseId,
      replacement: replacement.content,
    })
    logEvent('tengu_tool_result_persisted_message_budget', {
      originalSizeBytes: replacement.originalSize,
      persistedSizeBytes: replacement.content.length,
      estimatedOriginalTokens: Math.ceil(
        replacement.originalSize / BYTES_PER_TOKEN,
      ),
      estimatedPersistedTokens: Math.ceil(
        replacement.content.length / BYTES_PER_TOKEN,
      ),
    })
  }

  // Every persist failed and nothing was re-applied: nothing to rewrite.
  if (replacementMap.size === 0) {
    return { messages, newlyReplaced: [] }
  }

  if (newlyReplaced.length > 0) {
    logForDebugging(
      `Per-message budget: persisted ${newlyReplaced.length} tool results ` +
        `across ${messagesOverBudget} over-budget message(s), ` +
        `shed ~${formatFileSize(replacedSize)}, ${reappliedCount} re-applied`,
    )
    logEvent('tengu_message_level_tool_result_budget_enforced', {
      resultsPersisted: newlyReplaced.length,
      messagesOverBudget,
      replacedSizeBytes: replacedSize,
      reapplied: reappliedCount,
    })
  }

  return {
    messages: replaceToolResultContents(messages, replacementMap),
    newlyReplaced,
  }
}
*
 * @returns messages with replacements applied, or the input array unchanged
 * when the feature is off or no replacement occurred.
 */
export async function applyToolResultBudget(
  messages: Message[],
  state: ContentReplacementState | undefined,
  writeToTranscript?: (records: ToolResultReplacementRecord[]) => void,
  skipToolNames?: ReadonlySet<string>,
): Promise<Message[]> {
  // No replacement state: hand the caller's array back untouched.
  if (!state) {
    return messages
  }

  const { messages: budgetedMessages, newlyReplaced } =
    await enforceToolResultBudget(messages, state, skipToolNames)

  // The optional callback is only invoked when this pass produced new
  // replacement records; an empty batch is never reported.
  if (newlyReplaced.length > 0 && writeToTranscript) {
    writeToTranscript(newlyReplaced)
  }
  return budgetedMessages
}

/**
 * Reconstruct replacement state from content-replacement records loaded from
 * the transcript. Used on resume so the budget makes the same choices it
 * made in the original session (prompt cache stability).
 *
 * Accepts the full ContentReplacementRecord[] from LogOption (may include
 * future non-tool-result kinds); only tool-result records are applied here.
 *
 * - replacements: populated directly from the stored replacement strings.
 *   Records for IDs not in messages (e.g. after compact) are skipped —
 *   they're inert anyway.
 * - seenIds: every candidate tool_use_id in the loaded messages. A result
 *   being in the transcript means it was sent to the model, so it was seen.
 *   This freezes unreplaced results against future replacement.
 * - inheritedReplacements: gap-fill for fork-subagent resume. A fork's
 *   original run applies parent-inherited replacements via mustReapply
 *   (never persisted — not newlyReplaced). On resume the sidechain has
 *   the original content but no record, so records alone would classify
 *   it as frozen. The parent's live state still has the mapping; copy
 *   it for IDs in messages that records don't cover. No-op for non-fork
 *   resumes (parent IDs aren't in the subagent's messages).
*/
export function reconstructContentReplacementState(
  messages: Message[],
  records: ContentReplacementRecord[],
  inheritedReplacements?: ReadonlyMap<string, string>,
): ContentReplacementState {
  const rebuilt = createContentReplacementState()

  // Every candidate tool_use_id found in the loaded messages counts as seen.
  const allCandidates = collectCandidatesByMessage(messages).flat()
  const candidateIds = new Set(
    allCandidates.map(candidate => candidate.toolUseId),
  )
  candidateIds.forEach(id => rebuilt.seenIds.add(id))

  // Stored tool-result records are applied first. Records of any other kind,
  // or for IDs that are not candidates in these messages, are ignored.
  records.forEach(record => {
    if (record.kind !== 'tool-result' || !candidateIds.has(record.toolUseId)) {
      return
    }
    rebuilt.replacements.set(record.toolUseId, record.replacement)
  })

  // Inherited entries only fill gaps: candidate IDs that still have no
  // replacement after the records above were applied.
  inheritedReplacements?.forEach((replacement, id) => {
    if (candidateIds.has(id) && !rebuilt.replacements.has(id)) {
      rebuilt.replacements.set(id, replacement)
    }
  })

  return rebuilt
}

/**
 * AgentTool-resume variant: encapsulates the feature-flag gate + parent
 * gap-fill so both AgentTool.call and resumeAgentBackground share one
 * implementation. Returns undefined when parentState is undefined (feature
 * off); otherwise reconstructs from sidechain records with parent's live
 * replacements filling gaps for fork-inherited mustReapply entries.
 *
 * Kept out of AgentTool.tsx — that file is at the feature() DCE complexity
 * cliff and cannot tolerate even +1 net source line without silently
 * breaking feature('TRANSCRIPT_CLASSIFIER') eval in tests.
*/
export function reconstructForSubagentResume(
  parentState: ContentReplacementState | undefined,
  resumedMessages: Message[],
  sidechainRecords: ContentReplacementRecord[],
): ContentReplacementState | undefined {
  // Without a parent state there is nothing to rebuild from. Otherwise the
  // parent's replacements are passed along as the gap-fill source.
  return parentState
    ? reconstructContentReplacementState(
        resumedMessages,
        sidechainRecords,
        parentState.replacements,
      )
    : undefined
}

/**
 * Translate a filesystem error into a short, human-readable message.
 *
 * Errors carrying an errno-style `code` get a dedicated message for the codes
 * listed in the table below (with the offending path where the message shows
 * one); any other code is rendered as `<code>: <message>`. Errors without a
 * `code` fall back to their plain `message`.
 */
function getFileSystemErrorMessage(error: Error): string {
  // Node.js filesystem errors have a 'code' property
  // eslint-disable-next-line no-restricted-syntax -- uses .path, not just .code
  const fsError = error as NodeJS.ErrnoException
  const { code } = fsError
  if (!code) {
    return error.message
  }

  // Shared by the messages that mention a path; the error may not carry one.
  const location = fsError.path ?? 'unknown path'
  const knownMessages = new Map<string, string>([
    ['ENOENT', `Directory not found: ${location}`],
    ['EACCES', `Permission denied: ${location}`],
    ['ENOSPC', 'No space left on device'],
    ['EROFS', 'Read-only file system'],
    ['EMFILE', 'Too many open files'],
    ['EEXIST', `File already exists: ${location}`],
  ])
  return knownMessages.get(code) ?? `${code}: ${fsError.message}`
}