Source dump of Claude Code (query loop module)
at branch `main` — 1729 lines, 69 kB, raw view
1// biome-ignore-all assist/source/organizeImports: ANT-ONLY import markers must not be reordered 2import type { 3 ToolResultBlockParam, 4 ToolUseBlock, 5} from '@anthropic-ai/sdk/resources/index.mjs' 6import type { CanUseToolFn } from './hooks/useCanUseTool.js' 7import { FallbackTriggeredError } from './services/api/withRetry.js' 8import { 9 calculateTokenWarningState, 10 isAutoCompactEnabled, 11 type AutoCompactTrackingState, 12} from './services/compact/autoCompact.js' 13import { buildPostCompactMessages } from './services/compact/compact.js' 14/* eslint-disable @typescript-eslint/no-require-imports */ 15const reactiveCompact = feature('REACTIVE_COMPACT') 16 ? (require('./services/compact/reactiveCompact.js') as typeof import('./services/compact/reactiveCompact.js')) 17 : null 18const contextCollapse = feature('CONTEXT_COLLAPSE') 19 ? (require('./services/contextCollapse/index.js') as typeof import('./services/contextCollapse/index.js')) 20 : null 21/* eslint-enable @typescript-eslint/no-require-imports */ 22import { 23 logEvent, 24 type AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS, 25} from 'src/services/analytics/index.js' 26import { ImageSizeError } from './utils/imageValidation.js' 27import { ImageResizeError } from './utils/imageResizer.js' 28import { findToolByName, type ToolUseContext } from './Tool.js' 29import { asSystemPrompt, type SystemPrompt } from './utils/systemPromptType.js' 30import type { 31 AssistantMessage, 32 AttachmentMessage, 33 Message, 34 RequestStartEvent, 35 StreamEvent, 36 ToolUseSummaryMessage, 37 UserMessage, 38 TombstoneMessage, 39} from './types/message.js' 40import { logError } from './utils/log.js' 41import { 42 PROMPT_TOO_LONG_ERROR_MESSAGE, 43 isPromptTooLongMessage, 44} from './services/api/errors.js' 45import { logAntError, logForDebugging } from './utils/debug.js' 46import { 47 createUserMessage, 48 createUserInterruptionMessage, 49 normalizeMessagesForAPI, 50 createSystemMessage, 51 
createAssistantAPIErrorMessage, 52 getMessagesAfterCompactBoundary, 53 createToolUseSummaryMessage, 54 createMicrocompactBoundaryMessage, 55 stripSignatureBlocks, 56} from './utils/messages.js' 57import { generateToolUseSummary } from './services/toolUseSummary/toolUseSummaryGenerator.js' 58import { prependUserContext, appendSystemContext } from './utils/api.js' 59import { 60 createAttachmentMessage, 61 filterDuplicateMemoryAttachments, 62 getAttachmentMessages, 63 startRelevantMemoryPrefetch, 64} from './utils/attachments.js' 65/* eslint-disable @typescript-eslint/no-require-imports */ 66const skillPrefetch = feature('EXPERIMENTAL_SKILL_SEARCH') 67 ? (require('./services/skillSearch/prefetch.js') as typeof import('./services/skillSearch/prefetch.js')) 68 : null 69const jobClassifier = feature('TEMPLATES') 70 ? (require('./jobs/classifier.js') as typeof import('./jobs/classifier.js')) 71 : null 72/* eslint-enable @typescript-eslint/no-require-imports */ 73import { 74 remove as removeFromQueue, 75 getCommandsByMaxPriority, 76 isSlashCommand, 77} from './utils/messageQueueManager.js' 78import { notifyCommandLifecycle } from './utils/commandLifecycle.js' 79import { headlessProfilerCheckpoint } from './utils/headlessProfiler.js' 80import { 81 getRuntimeMainLoopModel, 82 renderModelName, 83} from './utils/model/model.js' 84import { 85 doesMostRecentAssistantMessageExceed200k, 86 finalContextTokensFromLastResponse, 87 tokenCountWithEstimation, 88} from './utils/tokens.js' 89import { ESCALATED_MAX_TOKENS } from './utils/context.js' 90import { getFeatureValue_CACHED_MAY_BE_STALE } from './services/analytics/growthbook.js' 91import { SLEEP_TOOL_NAME } from './tools/SleepTool/prompt.js' 92import { executePostSamplingHooks } from './utils/hooks/postSamplingHooks.js' 93import { executeStopFailureHooks } from './utils/hooks.js' 94import type { QuerySource } from './constants/querySource.js' 95import { createDumpPromptsFetch } from './services/api/dumpPrompts.js' 96import { 
StreamingToolExecutor } from './services/tools/StreamingToolExecutor.js' 97import { queryCheckpoint } from './utils/queryProfiler.js' 98import { runTools } from './services/tools/toolOrchestration.js' 99import { applyToolResultBudget } from './utils/toolResultStorage.js' 100import { recordContentReplacement } from './utils/sessionStorage.js' 101import { handleStopHooks } from './query/stopHooks.js' 102import { buildQueryConfig } from './query/config.js' 103import { productionDeps, type QueryDeps } from './query/deps.js' 104import type { Terminal, Continue } from './query/transitions.js' 105import { feature } from 'bun:bundle' 106import { 107 getCurrentTurnTokenBudget, 108 getTurnOutputTokens, 109 incrementBudgetContinuationCount, 110} from './bootstrap/state.js' 111import { createBudgetTracker, checkTokenBudget } from './query/tokenBudget.js' 112import { count } from './utils/array.js' 113 114/* eslint-disable @typescript-eslint/no-require-imports */ 115const snipModule = feature('HISTORY_SNIP') 116 ? (require('./services/compact/snipCompact.js') as typeof import('./services/compact/snipCompact.js')) 117 : null 118const taskSummaryModule = feature('BG_SESSIONS') 119 ? 
(require('./utils/taskSummary.js') as typeof import('./utils/taskSummary.js')) 120 : null 121/* eslint-enable @typescript-eslint/no-require-imports */ 122 123function* yieldMissingToolResultBlocks( 124 assistantMessages: AssistantMessage[], 125 errorMessage: string, 126) { 127 for (const assistantMessage of assistantMessages) { 128 // Extract all tool use blocks from this assistant message 129 const toolUseBlocks = assistantMessage.message.content.filter( 130 content => content.type === 'tool_use', 131 ) as ToolUseBlock[] 132 133 // Emit an interruption message for each tool use 134 for (const toolUse of toolUseBlocks) { 135 yield createUserMessage({ 136 content: [ 137 { 138 type: 'tool_result', 139 content: errorMessage, 140 is_error: true, 141 tool_use_id: toolUse.id, 142 }, 143 ], 144 toolUseResult: errorMessage, 145 sourceToolAssistantUUID: assistantMessage.uuid, 146 }) 147 } 148 } 149} 150 151/** 152 * The rules of thinking are lengthy and fortuitous. They require plenty of thinking 153 * of most long duration and deep meditation for a wizard to wrap one's noggin around. 154 * 155 * The rules follow: 156 * 1. A message that contains a thinking or redacted_thinking block must be part of a query whose max_thinking_length > 0 157 * 2. A thinking block may not be the last message in a block 158 * 3. Thinking blocks must be preserved for the duration of an assistant trajectory (a single turn, or if that turn includes a tool_use block then also its subsequent tool_result and the following assistant message) 159 * 160 * Heed these rules well, young wizard. For they are the rules of thinking, and 161 * the rules of thinking are the rules of the universe. If ye does not heed these 162 * rules, ye will be punished with an entire day of debugging and hair pulling. 163 */ 164const MAX_OUTPUT_TOKENS_RECOVERY_LIMIT = 3 165 166/** 167 * Is this a max_output_tokens error message? 
If so, the streaming loop should 168 * withhold it from SDK callers until we know whether the recovery loop can 169 * continue. Yielding early leaks an intermediate error to SDK callers (e.g. 170 * cowork/desktop) that terminate the session on any `error` field — the 171 * recovery loop keeps running but nobody is listening. 172 * 173 * Mirrors reactiveCompact.isWithheldPromptTooLong. 174 */ 175function isWithheldMaxOutputTokens( 176 msg: Message | StreamEvent | undefined, 177): msg is AssistantMessage { 178 return msg?.type === 'assistant' && msg.apiError === 'max_output_tokens' 179} 180 181export type QueryParams = { 182 messages: Message[] 183 systemPrompt: SystemPrompt 184 userContext: { [k: string]: string } 185 systemContext: { [k: string]: string } 186 canUseTool: CanUseToolFn 187 toolUseContext: ToolUseContext 188 fallbackModel?: string 189 querySource: QuerySource 190 maxOutputTokensOverride?: number 191 maxTurns?: number 192 skipCacheWrite?: boolean 193 // API task_budget (output_config.task_budget, beta task-budgets-2026-03-13). 194 // Distinct from the tokenBudget +500k auto-continue feature. `total` is the 195 // budget for the whole agentic turn; `remaining` is computed per iteration 196 // from cumulative API usage. See configureTaskBudgetParams in claude.ts. 197 taskBudget?: { total: number } 198 deps?: QueryDeps 199} 200 201// -- query loop state 202 203// Mutable state carried between loop iterations 204type State = { 205 messages: Message[] 206 toolUseContext: ToolUseContext 207 autoCompactTracking: AutoCompactTrackingState | undefined 208 maxOutputTokensRecoveryCount: number 209 hasAttemptedReactiveCompact: boolean 210 maxOutputTokensOverride: number | undefined 211 pendingToolUseSummary: Promise<ToolUseSummaryMessage | null> | undefined 212 stopHookActive: boolean | undefined 213 turnCount: number 214 // Why the previous iteration continued. Undefined on first iteration. 
215 // Lets tests assert recovery paths fired without inspecting message contents. 216 transition: Continue | undefined 217} 218 219export async function* query( 220 params: QueryParams, 221): AsyncGenerator< 222 | StreamEvent 223 | RequestStartEvent 224 | Message 225 | TombstoneMessage 226 | ToolUseSummaryMessage, 227 Terminal 228> { 229 const consumedCommandUuids: string[] = [] 230 const terminal = yield* queryLoop(params, consumedCommandUuids) 231 // Only reached if queryLoop returned normally. Skipped on throw (error 232 // propagates through yield*) and on .return() (Return completion closes 233 // both generators). This gives the same asymmetric started-without-completed 234 // signal as print.ts's drainCommandQueue when the turn fails. 235 for (const uuid of consumedCommandUuids) { 236 notifyCommandLifecycle(uuid, 'completed') 237 } 238 return terminal 239} 240 241async function* queryLoop( 242 params: QueryParams, 243 consumedCommandUuids: string[], 244): AsyncGenerator< 245 | StreamEvent 246 | RequestStartEvent 247 | Message 248 | TombstoneMessage 249 | ToolUseSummaryMessage, 250 Terminal 251> { 252 // Immutable params — never reassigned during the query loop. 253 const { 254 systemPrompt, 255 userContext, 256 systemContext, 257 canUseTool, 258 fallbackModel, 259 querySource, 260 maxTurns, 261 skipCacheWrite, 262 } = params 263 const deps = params.deps ?? productionDeps() 264 265 // Mutable cross-iteration state. The loop body destructures this at the top 266 // of each iteration so reads stay bare-name (`messages`, `toolUseContext`). 267 // Continue sites write `state = { ... }` instead of 9 separate assignments. 
268 let state: State = { 269 messages: params.messages, 270 toolUseContext: params.toolUseContext, 271 maxOutputTokensOverride: params.maxOutputTokensOverride, 272 autoCompactTracking: undefined, 273 stopHookActive: undefined, 274 maxOutputTokensRecoveryCount: 0, 275 hasAttemptedReactiveCompact: false, 276 turnCount: 1, 277 pendingToolUseSummary: undefined, 278 transition: undefined, 279 } 280 const budgetTracker = feature('TOKEN_BUDGET') ? createBudgetTracker() : null 281 282 // task_budget.remaining tracking across compaction boundaries. Undefined 283 // until first compact fires — while context is uncompacted the server can 284 // see the full history and handles the countdown from {total} itself (see 285 // api/api/sampling/prompt/renderer.py:292). After a compact, the server sees 286 // only the summary and would under-count spend; remaining tells it the 287 // pre-compact final window that got summarized away. Cumulative across 288 // multiple compacts: each subtracts the final context at that compact's 289 // trigger point. Loop-local (not on State) to avoid touching the 7 continue 290 // sites. 291 let taskBudgetRemaining: number | undefined = undefined 292 293 // Snapshot immutable env/statsig/session state once at entry. See QueryConfig 294 // for what's included and why feature() gates are intentionally excluded. 295 const config = buildQueryConfig() 296 297 // Fired once per user turn — the prompt is invariant across loop iterations, 298 // so per-iteration firing would ask sideQuery the same question N times. 299 // Consume point polls settledAt (never blocks). `using` disposes on all 300 // generator exit paths — see MemoryPrefetch for dispose/telemetry semantics. 301 using pendingMemoryPrefetch = startRelevantMemoryPrefetch( 302 state.messages, 303 state.toolUseContext, 304 ) 305 306 // eslint-disable-next-line no-constant-condition 307 while (true) { 308 // Destructure state at the top of each iteration. 
toolUseContext alone 309 // is reassigned within an iteration (queryTracking, messages updates); 310 // the rest are read-only between continue sites. 311 let { toolUseContext } = state 312 const { 313 messages, 314 autoCompactTracking, 315 maxOutputTokensRecoveryCount, 316 hasAttemptedReactiveCompact, 317 maxOutputTokensOverride, 318 pendingToolUseSummary, 319 stopHookActive, 320 turnCount, 321 } = state 322 323 // Skill discovery prefetch — per-iteration (uses findWritePivot guard 324 // that returns early on non-write iterations). Discovery runs while the 325 // model streams and tools execute; awaited post-tools alongside the 326 // memory prefetch consume. Replaces the blocking assistant_turn path 327 // that ran inside getAttachmentMessages (97% of those calls found 328 // nothing in prod). Turn-0 user-input discovery still blocks in 329 // userInputAttachments — that's the one signal where there's no prior 330 // work to hide under. 331 const pendingSkillPrefetch = skillPrefetch?.startSkillDiscoveryPrefetch( 332 null, 333 messages, 334 toolUseContext, 335 ) 336 337 yield { type: 'stream_request_start' } 338 339 queryCheckpoint('query_fn_entry') 340 341 // Record query start for headless latency tracking (skip for subagents) 342 if (!toolUseContext.agentId) { 343 headlessProfilerCheckpoint('query_started') 344 } 345 346 // Initialize or increment query chain tracking 347 const queryTracking = toolUseContext.queryTracking 348 ? 
{ 349 chainId: toolUseContext.queryTracking.chainId, 350 depth: toolUseContext.queryTracking.depth + 1, 351 } 352 : { 353 chainId: deps.uuid(), 354 depth: 0, 355 } 356 357 const queryChainIdForAnalytics = 358 queryTracking.chainId as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS 359 360 toolUseContext = { 361 ...toolUseContext, 362 queryTracking, 363 } 364 365 let messagesForQuery = [...getMessagesAfterCompactBoundary(messages)] 366 367 let tracking = autoCompactTracking 368 369 // Enforce per-message budget on aggregate tool result size. Runs BEFORE 370 // microcompact — cached MC operates purely by tool_use_id (never inspects 371 // content), so content replacement is invisible to it and the two compose 372 // cleanly. No-ops when contentReplacementState is undefined (feature off). 373 // Persist only for querySources that read records back on resume: agentId 374 // routes to sidechain file (AgentTool resume) or session file (/resume). 375 // Ephemeral runForkedAgent callers (agent_summary etc.) don't persist. 376 const persistReplacements = 377 querySource.startsWith('agent:') || 378 querySource.startsWith('repl_main_thread') 379 messagesForQuery = await applyToolResultBudget( 380 messagesForQuery, 381 toolUseContext.contentReplacementState, 382 persistReplacements 383 ? records => 384 void recordContentReplacement( 385 records, 386 toolUseContext.agentId, 387 ).catch(logError) 388 : undefined, 389 new Set( 390 toolUseContext.options.tools 391 .filter(t => !Number.isFinite(t.maxResultSizeChars)) 392 .map(t => t.name), 393 ), 394 ) 395 396 // Apply snip before microcompact (both may run — they are not mutually exclusive). 397 // snipTokensFreed is plumbed to autocompact so its threshold check reflects 398 // what snip removed; tokenCountWithEstimation alone can't see it (reads usage 399 // from the protected-tail assistant, which survives snip unchanged). 
400 let snipTokensFreed = 0 401 if (feature('HISTORY_SNIP')) { 402 queryCheckpoint('query_snip_start') 403 const snipResult = snipModule!.snipCompactIfNeeded(messagesForQuery) 404 messagesForQuery = snipResult.messages 405 snipTokensFreed = snipResult.tokensFreed 406 if (snipResult.boundaryMessage) { 407 yield snipResult.boundaryMessage 408 } 409 queryCheckpoint('query_snip_end') 410 } 411 412 // Apply microcompact before autocompact 413 queryCheckpoint('query_microcompact_start') 414 const microcompactResult = await deps.microcompact( 415 messagesForQuery, 416 toolUseContext, 417 querySource, 418 ) 419 messagesForQuery = microcompactResult.messages 420 // For cached microcompact (cache editing), defer boundary message until after 421 // the API response so we can use actual cache_deleted_input_tokens. 422 // Gated behind feature() so the string is eliminated from external builds. 423 const pendingCacheEdits = feature('CACHED_MICROCOMPACT') 424 ? microcompactResult.compactionInfo?.pendingCacheEdits 425 : undefined 426 queryCheckpoint('query_microcompact_end') 427 428 // Project the collapsed context view and maybe commit more collapses. 429 // Runs BEFORE autocompact so that if collapse gets us under the 430 // autocompact threshold, autocompact is a no-op and we keep granular 431 // context instead of a single summary. 432 // 433 // Nothing is yielded — the collapsed view is a read-time projection 434 // over the REPL's full history. Summary messages live in the collapse 435 // store, not the REPL array. This is what makes collapses persist 436 // across turns: projectView() replays the commit log on every entry. 437 // Within a turn, the view flows forward via state.messages at the 438 // continue site (query.ts:1192), and the next projectView() no-ops 439 // because the archived messages are already gone from its input. 
440 if (feature('CONTEXT_COLLAPSE') && contextCollapse) { 441 const collapseResult = await contextCollapse.applyCollapsesIfNeeded( 442 messagesForQuery, 443 toolUseContext, 444 querySource, 445 ) 446 messagesForQuery = collapseResult.messages 447 } 448 449 const fullSystemPrompt = asSystemPrompt( 450 appendSystemContext(systemPrompt, systemContext), 451 ) 452 453 queryCheckpoint('query_autocompact_start') 454 const { compactionResult, consecutiveFailures } = await deps.autocompact( 455 messagesForQuery, 456 toolUseContext, 457 { 458 systemPrompt, 459 userContext, 460 systemContext, 461 toolUseContext, 462 forkContextMessages: messagesForQuery, 463 }, 464 querySource, 465 tracking, 466 snipTokensFreed, 467 ) 468 queryCheckpoint('query_autocompact_end') 469 470 if (compactionResult) { 471 const { 472 preCompactTokenCount, 473 postCompactTokenCount, 474 truePostCompactTokenCount, 475 compactionUsage, 476 } = compactionResult 477 478 logEvent('tengu_auto_compact_succeeded', { 479 originalMessageCount: messages.length, 480 compactedMessageCount: 481 compactionResult.summaryMessages.length + 482 compactionResult.attachments.length + 483 compactionResult.hookResults.length, 484 preCompactTokenCount, 485 postCompactTokenCount, 486 truePostCompactTokenCount, 487 compactionInputTokens: compactionUsage?.input_tokens, 488 compactionOutputTokens: compactionUsage?.output_tokens, 489 compactionCacheReadTokens: 490 compactionUsage?.cache_read_input_tokens ?? 0, 491 compactionCacheCreationTokens: 492 compactionUsage?.cache_creation_input_tokens ?? 0, 493 compactionTotalTokens: compactionUsage 494 ? compactionUsage.input_tokens + 495 (compactionUsage.cache_creation_input_tokens ?? 0) + 496 (compactionUsage.cache_read_input_tokens ?? 
0) + 497 compactionUsage.output_tokens 498 : 0, 499 500 queryChainId: queryChainIdForAnalytics, 501 queryDepth: queryTracking.depth, 502 }) 503 504 // task_budget: capture pre-compact final context window before 505 // messagesForQuery is replaced with postCompactMessages below. 506 // iterations[-1] is the authoritative final window (post server tool 507 // loops); see #304930. 508 if (params.taskBudget) { 509 const preCompactContext = 510 finalContextTokensFromLastResponse(messagesForQuery) 511 taskBudgetRemaining = Math.max( 512 0, 513 (taskBudgetRemaining ?? params.taskBudget.total) - preCompactContext, 514 ) 515 } 516 517 // Reset on every compact so turnCounter/turnId reflect the MOST RECENT 518 // compact. recompactionInfo (autoCompact.ts:190) already captured the 519 // old values for turnsSincePreviousCompact/previousCompactTurnId before 520 // the call, so this reset doesn't lose those. 521 tracking = { 522 compacted: true, 523 turnId: deps.uuid(), 524 turnCounter: 0, 525 consecutiveFailures: 0, 526 } 527 528 const postCompactMessages = buildPostCompactMessages(compactionResult) 529 530 for (const message of postCompactMessages) { 531 yield message 532 } 533 534 // Continue on with the current query call using the post compact messages 535 messagesForQuery = postCompactMessages 536 } else if (consecutiveFailures !== undefined) { 537 // Autocompact failed — propagate failure count so the circuit breaker 538 // can stop retrying on the next iteration. 539 tracking = { 540 ...(tracking ?? 
{ compacted: false, turnId: '', turnCounter: 0 }), 541 consecutiveFailures, 542 } 543 } 544 545 //TODO: no need to set toolUseContext.messages during set-up since it is updated here 546 toolUseContext = { 547 ...toolUseContext, 548 messages: messagesForQuery, 549 } 550 551 const assistantMessages: AssistantMessage[] = [] 552 const toolResults: (UserMessage | AttachmentMessage)[] = [] 553 // @see https://docs.claude.com/en/docs/build-with-claude/tool-use 554 // Note: stop_reason === 'tool_use' is unreliable -- it's not always set correctly. 555 // Set during streaming whenever a tool_use block arrives — the sole 556 // loop-exit signal. If false after streaming, we're done (modulo stop-hook retry). 557 const toolUseBlocks: ToolUseBlock[] = [] 558 let needsFollowUp = false 559 560 queryCheckpoint('query_setup_start') 561 const useStreamingToolExecution = config.gates.streamingToolExecution 562 let streamingToolExecutor = useStreamingToolExecution 563 ? new StreamingToolExecutor( 564 toolUseContext.options.tools, 565 canUseTool, 566 toolUseContext, 567 ) 568 : null 569 570 const appState = toolUseContext.getAppState() 571 const permissionMode = appState.toolPermissionContext.mode 572 let currentModel = getRuntimeMainLoopModel({ 573 permissionMode, 574 mainLoopModel: toolUseContext.options.mainLoopModel, 575 exceeds200kTokens: 576 permissionMode === 'plan' && 577 doesMostRecentAssistantMessageExceed200k(messagesForQuery), 578 }) 579 580 queryCheckpoint('query_setup_end') 581 582 // Create fetch wrapper once per query session to avoid memory retention. 583 // Each call to createDumpPromptsFetch creates a closure that captures the request body. 584 // Creating it once means only the latest request body is retained (~700KB), 585 // instead of all request bodies from the session (~500MB for long sessions). 586 // Note: agentId is effectively constant during a query() call - it only changes 587 // between queries (e.g., /clear command or session resume). 
588 const dumpPromptsFetch = config.gates.isAnt 589 ? createDumpPromptsFetch(toolUseContext.agentId ?? config.sessionId) 590 : undefined 591 592 // Block if we've hit the hard blocking limit (only applies when auto-compact is OFF) 593 // This reserves space so users can still run /compact manually 594 // Skip this check if compaction just happened - the compaction result is already 595 // validated to be under the threshold, and tokenCountWithEstimation would use 596 // stale input_tokens from kept messages that reflect pre-compaction context size. 597 // Same staleness applies to snip: subtract snipTokensFreed (otherwise we'd 598 // falsely block in the window where snip brought us under autocompact threshold 599 // but the stale usage is still above blocking limit — before this PR that 600 // window never existed because autocompact always fired on the stale count). 601 // Also skip for compact/session_memory queries — these are forked agents that 602 // inherit the full conversation and would deadlock if blocked here (the compact 603 // agent needs to run to REDUCE the token count). 604 // Also skip when reactive compact is enabled and automatic compaction is 605 // allowed — the preempt's synthetic error returns before the API call, 606 // so reactive compact would never see a prompt-too-long to react to. 607 // Widened to walrus so RC can act as fallback when proactive fails. 608 // 609 // Same skip for context-collapse: its recoverFromOverflow drains 610 // staged collapses on a REAL API 413, then falls through to 611 // reactiveCompact. A synthetic preempt here would return before the 612 // API call and starve both recovery paths. The isAutoCompactEnabled() 613 // conjunct preserves the user's explicit "no automatic anything" 614 // config — if they set DISABLE_AUTO_COMPACT, they get the preempt. 615 let collapseOwnsIt = false 616 if (feature('CONTEXT_COLLAPSE')) { 617 collapseOwnsIt = 618 (contextCollapse?.isContextCollapseEnabled() ?? 
false) && 619 isAutoCompactEnabled() 620 } 621 // Hoist media-recovery gate once per turn. Withholding (inside the 622 // stream loop) and recovery (after) must agree; CACHED_MAY_BE_STALE can 623 // flip during the 5-30s stream, and withhold-without-recover would eat 624 // the message. PTL doesn't hoist because its withholding is ungated — 625 // it predates the experiment and is already the control-arm baseline. 626 const mediaRecoveryEnabled = 627 reactiveCompact?.isReactiveCompactEnabled() ?? false 628 if ( 629 !compactionResult && 630 querySource !== 'compact' && 631 querySource !== 'session_memory' && 632 !( 633 reactiveCompact?.isReactiveCompactEnabled() && isAutoCompactEnabled() 634 ) && 635 !collapseOwnsIt 636 ) { 637 const { isAtBlockingLimit } = calculateTokenWarningState( 638 tokenCountWithEstimation(messagesForQuery) - snipTokensFreed, 639 toolUseContext.options.mainLoopModel, 640 ) 641 if (isAtBlockingLimit) { 642 yield createAssistantAPIErrorMessage({ 643 content: PROMPT_TOO_LONG_ERROR_MESSAGE, 644 error: 'invalid_request', 645 }) 646 return { reason: 'blocking_limit' } 647 } 648 } 649 650 let attemptWithFallback = true 651 652 queryCheckpoint('query_api_loop_start') 653 try { 654 while (attemptWithFallback) { 655 attemptWithFallback = false 656 try { 657 let streamingFallbackOccured = false 658 queryCheckpoint('query_api_streaming_start') 659 for await (const message of deps.callModel({ 660 messages: prependUserContext(messagesForQuery, userContext), 661 systemPrompt: fullSystemPrompt, 662 thinkingConfig: toolUseContext.options.thinkingConfig, 663 tools: toolUseContext.options.tools, 664 signal: toolUseContext.abortController.signal, 665 options: { 666 async getToolPermissionContext() { 667 const appState = toolUseContext.getAppState() 668 return appState.toolPermissionContext 669 }, 670 model: currentModel, 671 ...(config.gates.fastModeEnabled && { 672 fastMode: appState.fastMode, 673 }), 674 toolChoice: undefined, 675 isNonInteractiveSession: 676 
toolUseContext.options.isNonInteractiveSession, 677 fallbackModel, 678 onStreamingFallback: () => { 679 streamingFallbackOccured = true 680 }, 681 querySource, 682 agents: toolUseContext.options.agentDefinitions.activeAgents, 683 allowedAgentTypes: 684 toolUseContext.options.agentDefinitions.allowedAgentTypes, 685 hasAppendSystemPrompt: 686 !!toolUseContext.options.appendSystemPrompt, 687 maxOutputTokensOverride, 688 fetchOverride: dumpPromptsFetch, 689 mcpTools: appState.mcp.tools, 690 hasPendingMcpServers: appState.mcp.clients.some( 691 c => c.type === 'pending', 692 ), 693 queryTracking, 694 effortValue: appState.effortValue, 695 advisorModel: appState.advisorModel, 696 skipCacheWrite, 697 agentId: toolUseContext.agentId, 698 addNotification: toolUseContext.addNotification, 699 ...(params.taskBudget && { 700 taskBudget: { 701 total: params.taskBudget.total, 702 ...(taskBudgetRemaining !== undefined && { 703 remaining: taskBudgetRemaining, 704 }), 705 }, 706 }), 707 }, 708 })) { 709 // We won't use the tool_calls from the first attempt 710 // We could.. but then we'd have to merge assistant messages 711 // with different ids and double up on full the tool_results 712 if (streamingFallbackOccured) { 713 // Yield tombstones for orphaned messages so they're removed from UI and transcript. 714 // These partial messages (especially thinking blocks) have invalid signatures 715 // that would cause "thinking blocks cannot be modified" API errors. 716 for (const msg of assistantMessages) { 717 yield { type: 'tombstone' as const, message: msg } 718 } 719 logEvent('tengu_orphaned_messages_tombstoned', { 720 orphanedMessageCount: assistantMessages.length, 721 queryChainId: queryChainIdForAnalytics, 722 queryDepth: queryTracking.depth, 723 }) 724 725 assistantMessages.length = 0 726 toolResults.length = 0 727 toolUseBlocks.length = 0 728 needsFollowUp = false 729 730 // Discard pending results from the failed streaming attempt and create 731 // a fresh executor. 
This prevents orphan tool_results (with old tool_use_ids) 732 // from being yielded after the fallback response arrives. 733 if (streamingToolExecutor) { 734 streamingToolExecutor.discard() 735 streamingToolExecutor = new StreamingToolExecutor( 736 toolUseContext.options.tools, 737 canUseTool, 738 toolUseContext, 739 ) 740 } 741 } 742 // Backfill tool_use inputs on a cloned message before yield so 743 // SDK stream output and transcript serialization see legacy/derived 744 // fields. The original `message` is left untouched for 745 // assistantMessages.push below — it flows back to the API and 746 // mutating it would break prompt caching (byte mismatch). 747 let yieldMessage: typeof message = message 748 if (message.type === 'assistant') { 749 let clonedContent: typeof message.message.content | undefined 750 for (let i = 0; i < message.message.content.length; i++) { 751 const block = message.message.content[i]! 752 if ( 753 block.type === 'tool_use' && 754 typeof block.input === 'object' && 755 block.input !== null 756 ) { 757 const tool = findToolByName( 758 toolUseContext.options.tools, 759 block.name, 760 ) 761 if (tool?.backfillObservableInput) { 762 const originalInput = block.input as Record<string, unknown> 763 const inputCopy = { ...originalInput } 764 tool.backfillObservableInput(inputCopy) 765 // Only yield a clone when backfill ADDED fields; skip if 766 // it only OVERWROTE existing ones (e.g. file tools 767 // expanding file_path). Overwrites change the serialized 768 // transcript and break VCR fixture hashes on resume, 769 // while adding nothing the SDK stream needs — hooks get 770 // the expanded path via toolExecution.ts separately. 
771 const addedFields = Object.keys(inputCopy).some( 772 k => !(k in originalInput), 773 ) 774 if (addedFields) { 775 clonedContent ??= [...message.message.content] 776 clonedContent[i] = { ...block, input: inputCopy } 777 } 778 } 779 } 780 } 781 if (clonedContent) { 782 yieldMessage = { 783 ...message, 784 message: { ...message.message, content: clonedContent }, 785 } 786 } 787 } 788 // Withhold recoverable errors (prompt-too-long, max-output-tokens) 789 // until we know whether recovery (collapse drain / reactive 790 // compact / truncation retry) can succeed. Still pushed to 791 // assistantMessages so the recovery checks below find them. 792 // Either subsystem's withhold is sufficient — they're 793 // independent so turning one off doesn't break the other's 794 // recovery path. 795 // 796 // feature() only works in if/ternary conditions (bun:bundle 797 // tree-shaking constraint), so the collapse check is nested 798 // rather than composed. 799 let withheld = false 800 if (feature('CONTEXT_COLLAPSE')) { 801 if ( 802 contextCollapse?.isWithheldPromptTooLong( 803 message, 804 isPromptTooLongMessage, 805 querySource, 806 ) 807 ) { 808 withheld = true 809 } 810 } 811 if (reactiveCompact?.isWithheldPromptTooLong(message)) { 812 withheld = true 813 } 814 if ( 815 mediaRecoveryEnabled && 816 reactiveCompact?.isWithheldMediaSizeError(message) 817 ) { 818 withheld = true 819 } 820 if (isWithheldMaxOutputTokens(message)) { 821 withheld = true 822 } 823 if (!withheld) { 824 yield yieldMessage 825 } 826 if (message.type === 'assistant') { 827 assistantMessages.push(message) 828 829 const msgToolUseBlocks = message.message.content.filter( 830 content => content.type === 'tool_use', 831 ) as ToolUseBlock[] 832 if (msgToolUseBlocks.length > 0) { 833 toolUseBlocks.push(...msgToolUseBlocks) 834 needsFollowUp = true 835 } 836 837 if ( 838 streamingToolExecutor && 839 !toolUseContext.abortController.signal.aborted 840 ) { 841 for (const toolBlock of msgToolUseBlocks) { 842 
streamingToolExecutor.addTool(toolBlock, message) 843 } 844 } 845 } 846 847 if ( 848 streamingToolExecutor && 849 !toolUseContext.abortController.signal.aborted 850 ) { 851 for (const result of streamingToolExecutor.getCompletedResults()) { 852 if (result.message) { 853 yield result.message 854 toolResults.push( 855 ...normalizeMessagesForAPI( 856 [result.message], 857 toolUseContext.options.tools, 858 ).filter(_ => _.type === 'user'), 859 ) 860 } 861 } 862 } 863 } 864 queryCheckpoint('query_api_streaming_end') 865 866 // Yield deferred microcompact boundary message using actual API-reported 867 // token deletion count instead of client-side estimates. 868 // Entire block gated behind feature() so the excluded string 869 // is eliminated from external builds. 870 if (feature('CACHED_MICROCOMPACT') && pendingCacheEdits) { 871 const lastAssistant = assistantMessages.at(-1) 872 // The API field is cumulative/sticky across requests, so we 873 // subtract the baseline captured before this request to get the delta. 874 const usage = lastAssistant?.message.usage 875 const cumulativeDeleted = usage 876 ? ((usage as unknown as Record<string, number>) 877 .cache_deleted_input_tokens ?? 
0) 878 : 0 879 const deletedTokens = Math.max( 880 0, 881 cumulativeDeleted - pendingCacheEdits.baselineCacheDeletedTokens, 882 ) 883 if (deletedTokens > 0) { 884 yield createMicrocompactBoundaryMessage( 885 pendingCacheEdits.trigger, 886 0, 887 deletedTokens, 888 pendingCacheEdits.deletedToolIds, 889 [], 890 ) 891 } 892 } 893 } catch (innerError) { 894 if (innerError instanceof FallbackTriggeredError && fallbackModel) { 895 // Fallback was triggered - switch model and retry 896 currentModel = fallbackModel 897 attemptWithFallback = true 898 899 // Clear assistant messages since we'll retry the entire request 900 yield* yieldMissingToolResultBlocks( 901 assistantMessages, 902 'Model fallback triggered', 903 ) 904 assistantMessages.length = 0 905 toolResults.length = 0 906 toolUseBlocks.length = 0 907 needsFollowUp = false 908 909 // Discard pending results from the failed attempt and create a 910 // fresh executor. This prevents orphan tool_results (with old 911 // tool_use_ids) from leaking into the retry. 912 if (streamingToolExecutor) { 913 streamingToolExecutor.discard() 914 streamingToolExecutor = new StreamingToolExecutor( 915 toolUseContext.options.tools, 916 canUseTool, 917 toolUseContext, 918 ) 919 } 920 921 // Update tool use context with new model 922 toolUseContext.options.mainLoopModel = fallbackModel 923 924 // Thinking signatures are model-bound: replaying a protected-thinking 925 // block (e.g. capybara) to an unprotected fallback (e.g. opus) 400s. 926 // Strip before retry so the fallback model gets clean history. 
927 if (process.env.USER_TYPE === 'ant') { 928 messagesForQuery = stripSignatureBlocks(messagesForQuery) 929 } 930 931 // Log the fallback event 932 logEvent('tengu_model_fallback_triggered', { 933 original_model: 934 innerError.originalModel as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS, 935 fallback_model: 936 fallbackModel as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS, 937 entrypoint: 938 'cli' as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS, 939 queryChainId: queryChainIdForAnalytics, 940 queryDepth: queryTracking.depth, 941 }) 942 943 // Yield system message about fallback — use 'warning' level so 944 // users see the notification without needing verbose mode 945 yield createSystemMessage( 946 `Switched to ${renderModelName(innerError.fallbackModel)} due to high demand for ${renderModelName(innerError.originalModel)}`, 947 'warning', 948 ) 949 950 continue 951 } 952 throw innerError 953 } 954 } 955 } catch (error) { 956 logError(error) 957 const errorMessage = 958 error instanceof Error ? error.message : String(error) 959 logEvent('tengu_query_error', { 960 assistantMessages: assistantMessages.length, 961 toolUses: assistantMessages.flatMap(_ => 962 _.message.content.filter(content => content.type === 'tool_use'), 963 ).length, 964 965 queryChainId: queryChainIdForAnalytics, 966 queryDepth: queryTracking.depth, 967 }) 968 969 // Handle image size/resize errors with user-friendly messages 970 if ( 971 error instanceof ImageSizeError || 972 error instanceof ImageResizeError 973 ) { 974 yield createAssistantAPIErrorMessage({ 975 content: error.message, 976 }) 977 return { reason: 'image_error' } 978 } 979 980 // Generally queryModelWithStreaming should not throw errors but instead 981 // yield them as synthetic assistant messages. However if it does throw 982 // due to a bug, we may end up in a state where we have already emitted 983 // a tool_use block but will stop before emitting the tool_result. 
984 yield* yieldMissingToolResultBlocks(assistantMessages, errorMessage) 985 986 // Surface the real error instead of a misleading "[Request interrupted 987 // by user]" — this path is a model/runtime failure, not a user action. 988 // SDK consumers were seeing phantom interrupts on e.g. Node 18's missing 989 // Array.prototype.with(), masking the actual cause. 990 yield createAssistantAPIErrorMessage({ 991 content: errorMessage, 992 }) 993 994 // To help track down bugs, log loudly for ants 995 logAntError('Query error', error) 996 return { reason: 'model_error', error } 997 } 998 999 // Execute post-sampling hooks after model response is complete 1000 if (assistantMessages.length > 0) { 1001 void executePostSamplingHooks( 1002 [...messagesForQuery, ...assistantMessages], 1003 systemPrompt, 1004 userContext, 1005 systemContext, 1006 toolUseContext, 1007 querySource, 1008 ) 1009 } 1010 1011 // We need to handle a streaming abort before anything else. 1012 // When using streamingToolExecutor, we must consume getRemainingResults() so the 1013 // executor can generate synthetic tool_result blocks for queued/in-progress tools. 1014 // Without this, tool_use blocks would lack matching tool_result blocks. 1015 if (toolUseContext.abortController.signal.aborted) { 1016 if (streamingToolExecutor) { 1017 // Consume remaining results - executor generates synthetic tool_results for 1018 // aborted tools since it checks the abort signal in executeTool() 1019 for await (const update of streamingToolExecutor.getRemainingResults()) { 1020 if (update.message) { 1021 yield update.message 1022 } 1023 } 1024 } else { 1025 yield* yieldMissingToolResultBlocks( 1026 assistantMessages, 1027 'Interrupted by user', 1028 ) 1029 } 1030 // chicago MCP: auto-unhide + lock release on interrupt. Same cleanup 1031 // as the natural turn-end path in stopHooks.ts. Main thread only — 1032 // see stopHooks.ts for the subagent-releasing-main's-lock rationale. 
1033 if (feature('CHICAGO_MCP') && !toolUseContext.agentId) { 1034 try { 1035 const { cleanupComputerUseAfterTurn } = await import( 1036 './utils/computerUse/cleanup.js' 1037 ) 1038 await cleanupComputerUseAfterTurn(toolUseContext) 1039 } catch { 1040 // Failures are silent — this is dogfooding cleanup, not critical path 1041 } 1042 } 1043 1044 // Skip the interruption message for submit-interrupts — the queued 1045 // user message that follows provides sufficient context. 1046 if (toolUseContext.abortController.signal.reason !== 'interrupt') { 1047 yield createUserInterruptionMessage({ 1048 toolUse: false, 1049 }) 1050 } 1051 return { reason: 'aborted_streaming' } 1052 } 1053 1054 // Yield tool use summary from previous turn — haiku (~1s) resolved during model streaming (5-30s) 1055 if (pendingToolUseSummary) { 1056 const summary = await pendingToolUseSummary 1057 if (summary) { 1058 yield summary 1059 } 1060 } 1061 1062 if (!needsFollowUp) { 1063 const lastMessage = assistantMessages.at(-1) 1064 1065 // Prompt-too-long recovery: the streaming loop withheld the error 1066 // (see withheldByCollapse / withheldByReactive above). Try collapse 1067 // drain first (cheap, keeps granular context), then reactive compact 1068 // (full summary). Single-shot on each — if a retry still 413's, 1069 // the next stage handles it or the error surfaces. 1070 const isWithheld413 = 1071 lastMessage?.type === 'assistant' && 1072 lastMessage.isApiErrorMessage && 1073 isPromptTooLongMessage(lastMessage) 1074 // Media-size rejections (image/PDF/many-image) are recoverable via 1075 // reactive compact's strip-retry. Unlike PTL, media errors skip the 1076 // collapse drain — collapse doesn't strip images. mediaRecoveryEnabled 1077 // is the hoisted gate from before the stream loop (same value as the 1078 // withholding check — these two must agree or a withheld message is 1079 // lost). 
If the oversized media is in the preserved tail, the 1080 // post-compact turn will media-error again; hasAttemptedReactiveCompact 1081 // prevents a spiral and the error surfaces. 1082 const isWithheldMedia = 1083 mediaRecoveryEnabled && 1084 reactiveCompact?.isWithheldMediaSizeError(lastMessage) 1085 if (isWithheld413) { 1086 // First: drain all staged context-collapses. Gated on the PREVIOUS 1087 // transition not being collapse_drain_retry — if we already drained 1088 // and the retry still 413'd, fall through to reactive compact. 1089 if ( 1090 feature('CONTEXT_COLLAPSE') && 1091 contextCollapse && 1092 state.transition?.reason !== 'collapse_drain_retry' 1093 ) { 1094 const drained = contextCollapse.recoverFromOverflow( 1095 messagesForQuery, 1096 querySource, 1097 ) 1098 if (drained.committed > 0) { 1099 const next: State = { 1100 messages: drained.messages, 1101 toolUseContext, 1102 autoCompactTracking: tracking, 1103 maxOutputTokensRecoveryCount, 1104 hasAttemptedReactiveCompact, 1105 maxOutputTokensOverride: undefined, 1106 pendingToolUseSummary: undefined, 1107 stopHookActive: undefined, 1108 turnCount, 1109 transition: { 1110 reason: 'collapse_drain_retry', 1111 committed: drained.committed, 1112 }, 1113 } 1114 state = next 1115 continue 1116 } 1117 } 1118 } 1119 if ((isWithheld413 || isWithheldMedia) && reactiveCompact) { 1120 const compacted = await reactiveCompact.tryReactiveCompact({ 1121 hasAttempted: hasAttemptedReactiveCompact, 1122 querySource, 1123 aborted: toolUseContext.abortController.signal.aborted, 1124 messages: messagesForQuery, 1125 cacheSafeParams: { 1126 systemPrompt, 1127 userContext, 1128 systemContext, 1129 toolUseContext, 1130 forkContextMessages: messagesForQuery, 1131 }, 1132 }) 1133 1134 if (compacted) { 1135 // task_budget: same carryover as the proactive path above. 1136 // messagesForQuery still holds the pre-compact array here (the 1137 // 413-failed attempt's input). 
1138 if (params.taskBudget) { 1139 const preCompactContext = 1140 finalContextTokensFromLastResponse(messagesForQuery) 1141 taskBudgetRemaining = Math.max( 1142 0, 1143 (taskBudgetRemaining ?? params.taskBudget.total) - 1144 preCompactContext, 1145 ) 1146 } 1147 1148 const postCompactMessages = buildPostCompactMessages(compacted) 1149 for (const msg of postCompactMessages) { 1150 yield msg 1151 } 1152 const next: State = { 1153 messages: postCompactMessages, 1154 toolUseContext, 1155 autoCompactTracking: undefined, 1156 maxOutputTokensRecoveryCount, 1157 hasAttemptedReactiveCompact: true, 1158 maxOutputTokensOverride: undefined, 1159 pendingToolUseSummary: undefined, 1160 stopHookActive: undefined, 1161 turnCount, 1162 transition: { reason: 'reactive_compact_retry' }, 1163 } 1164 state = next 1165 continue 1166 } 1167 1168 // No recovery — surface the withheld error and exit. Do NOT fall 1169 // through to stop hooks: the model never produced a valid response, 1170 // so hooks have nothing meaningful to evaluate. Running stop hooks 1171 // on prompt-too-long creates a death spiral: error → hook blocking 1172 // → retry → error → … (the hook injects more tokens each cycle). 1173 yield lastMessage 1174 void executeStopFailureHooks(lastMessage, toolUseContext) 1175 return { reason: isWithheldMedia ? 'image_error' : 'prompt_too_long' } 1176 } else if (feature('CONTEXT_COLLAPSE') && isWithheld413) { 1177 // reactiveCompact compiled out but contextCollapse withheld and 1178 // couldn't recover (staged queue empty/stale). Surface. Same 1179 // early-return rationale — don't fall through to stop hooks. 1180 yield lastMessage 1181 void executeStopFailureHooks(lastMessage, toolUseContext) 1182 return { reason: 'prompt_too_long' } 1183 } 1184 1185 // Check for max_output_tokens and inject recovery message. The error 1186 // was withheld from the stream above; only surface it if recovery 1187 // exhausts. 
1188 if (isWithheldMaxOutputTokens(lastMessage)) { 1189 // Escalating retry: if we used the capped 8k default and hit the 1190 // limit, retry the SAME request at 64k — no meta message, no 1191 // multi-turn dance. This fires once per turn (guarded by the 1192 // override check), then falls through to multi-turn recovery if 1193 // 64k also hits the cap. 1194 // 3P default: false (not validated on Bedrock/Vertex) 1195 const capEnabled = getFeatureValue_CACHED_MAY_BE_STALE( 1196 'tengu_otk_slot_v1', 1197 false, 1198 ) 1199 if ( 1200 capEnabled && 1201 maxOutputTokensOverride === undefined && 1202 !process.env.CLAUDE_CODE_MAX_OUTPUT_TOKENS 1203 ) { 1204 logEvent('tengu_max_tokens_escalate', { 1205 escalatedTo: ESCALATED_MAX_TOKENS, 1206 }) 1207 const next: State = { 1208 messages: messagesForQuery, 1209 toolUseContext, 1210 autoCompactTracking: tracking, 1211 maxOutputTokensRecoveryCount, 1212 hasAttemptedReactiveCompact, 1213 maxOutputTokensOverride: ESCALATED_MAX_TOKENS, 1214 pendingToolUseSummary: undefined, 1215 stopHookActive: undefined, 1216 turnCount, 1217 transition: { reason: 'max_output_tokens_escalate' }, 1218 } 1219 state = next 1220 continue 1221 } 1222 1223 if (maxOutputTokensRecoveryCount < MAX_OUTPUT_TOKENS_RECOVERY_LIMIT) { 1224 const recoveryMessage = createUserMessage({ 1225 content: 1226 `Output token limit hit. Resume directly — no apology, no recap of what you were doing. ` + 1227 `Pick up mid-thought if that is where the cut happened. 
Break remaining work into smaller pieces.`, 1228 isMeta: true, 1229 }) 1230 1231 const next: State = { 1232 messages: [ 1233 ...messagesForQuery, 1234 ...assistantMessages, 1235 recoveryMessage, 1236 ], 1237 toolUseContext, 1238 autoCompactTracking: tracking, 1239 maxOutputTokensRecoveryCount: maxOutputTokensRecoveryCount + 1, 1240 hasAttemptedReactiveCompact, 1241 maxOutputTokensOverride: undefined, 1242 pendingToolUseSummary: undefined, 1243 stopHookActive: undefined, 1244 turnCount, 1245 transition: { 1246 reason: 'max_output_tokens_recovery', 1247 attempt: maxOutputTokensRecoveryCount + 1, 1248 }, 1249 } 1250 state = next 1251 continue 1252 } 1253 1254 // Recovery exhausted — surface the withheld error now. 1255 yield lastMessage 1256 } 1257 1258 // Skip stop hooks when the last message is an API error (rate limit, 1259 // prompt-too-long, auth failure, etc.). The model never produced a 1260 // real response — hooks evaluating it create a death spiral: 1261 // error → hook blocking → retry → error → … 1262 if (lastMessage?.isApiErrorMessage) { 1263 void executeStopFailureHooks(lastMessage, toolUseContext) 1264 return { reason: 'completed' } 1265 } 1266 1267 const stopHookResult = yield* handleStopHooks( 1268 messagesForQuery, 1269 assistantMessages, 1270 systemPrompt, 1271 userContext, 1272 systemContext, 1273 toolUseContext, 1274 querySource, 1275 stopHookActive, 1276 ) 1277 1278 if (stopHookResult.preventContinuation) { 1279 return { reason: 'stop_hook_prevented' } 1280 } 1281 1282 if (stopHookResult.blockingErrors.length > 0) { 1283 const next: State = { 1284 messages: [ 1285 ...messagesForQuery, 1286 ...assistantMessages, 1287 ...stopHookResult.blockingErrors, 1288 ], 1289 toolUseContext, 1290 autoCompactTracking: tracking, 1291 maxOutputTokensRecoveryCount: 0, 1292 // Preserve the reactive compact guard — if compact already ran and 1293 // couldn't recover from prompt-too-long, retrying after a stop-hook 1294 // blocking error will produce the same result. 
Resetting to false 1295 // here caused an infinite loop: compact → still too long → error → 1296 // stop hook blocking → compact → … burning thousands of API calls. 1297 hasAttemptedReactiveCompact, 1298 maxOutputTokensOverride: undefined, 1299 pendingToolUseSummary: undefined, 1300 stopHookActive: true, 1301 turnCount, 1302 transition: { reason: 'stop_hook_blocking' }, 1303 } 1304 state = next 1305 continue 1306 } 1307 1308 if (feature('TOKEN_BUDGET')) { 1309 const decision = checkTokenBudget( 1310 budgetTracker!, 1311 toolUseContext.agentId, 1312 getCurrentTurnTokenBudget(), 1313 getTurnOutputTokens(), 1314 ) 1315 1316 if (decision.action === 'continue') { 1317 incrementBudgetContinuationCount() 1318 logForDebugging( 1319 `Token budget continuation #${decision.continuationCount}: ${decision.pct}% (${decision.turnTokens.toLocaleString()} / ${decision.budget.toLocaleString()})`, 1320 ) 1321 state = { 1322 messages: [ 1323 ...messagesForQuery, 1324 ...assistantMessages, 1325 createUserMessage({ 1326 content: decision.nudgeMessage, 1327 isMeta: true, 1328 }), 1329 ], 1330 toolUseContext, 1331 autoCompactTracking: tracking, 1332 maxOutputTokensRecoveryCount: 0, 1333 hasAttemptedReactiveCompact: false, 1334 maxOutputTokensOverride: undefined, 1335 pendingToolUseSummary: undefined, 1336 stopHookActive: undefined, 1337 turnCount, 1338 transition: { reason: 'token_budget_continuation' }, 1339 } 1340 continue 1341 } 1342 1343 if (decision.completionEvent) { 1344 if (decision.completionEvent.diminishingReturns) { 1345 logForDebugging( 1346 `Token budget early stop: diminishing returns at ${decision.completionEvent.pct}%`, 1347 ) 1348 } 1349 logEvent('tengu_token_budget_completed', { 1350 ...decision.completionEvent, 1351 queryChainId: queryChainIdForAnalytics, 1352 queryDepth: queryTracking.depth, 1353 }) 1354 } 1355 } 1356 1357 return { reason: 'completed' } 1358 } 1359 1360 let shouldPreventContinuation = false 1361 let updatedToolUseContext = toolUseContext 1362 1363 
queryCheckpoint('query_tool_execution_start') 1364 1365 1366 if (streamingToolExecutor) { 1367 logEvent('tengu_streaming_tool_execution_used', { 1368 tool_count: toolUseBlocks.length, 1369 queryChainId: queryChainIdForAnalytics, 1370 queryDepth: queryTracking.depth, 1371 }) 1372 } else { 1373 logEvent('tengu_streaming_tool_execution_not_used', { 1374 tool_count: toolUseBlocks.length, 1375 queryChainId: queryChainIdForAnalytics, 1376 queryDepth: queryTracking.depth, 1377 }) 1378 } 1379 1380 const toolUpdates = streamingToolExecutor 1381 ? streamingToolExecutor.getRemainingResults() 1382 : runTools(toolUseBlocks, assistantMessages, canUseTool, toolUseContext) 1383 1384 for await (const update of toolUpdates) { 1385 if (update.message) { 1386 yield update.message 1387 1388 if ( 1389 update.message.type === 'attachment' && 1390 update.message.attachment.type === 'hook_stopped_continuation' 1391 ) { 1392 shouldPreventContinuation = true 1393 } 1394 1395 toolResults.push( 1396 ...normalizeMessagesForAPI( 1397 [update.message], 1398 toolUseContext.options.tools, 1399 ).filter(_ => _.type === 'user'), 1400 ) 1401 } 1402 if (update.newContext) { 1403 updatedToolUseContext = { 1404 ...update.newContext, 1405 queryTracking, 1406 } 1407 } 1408 } 1409 queryCheckpoint('query_tool_execution_end') 1410 1411 // Generate tool use summary after tool batch completes — passed to next recursive call 1412 let nextPendingToolUseSummary: 1413 | Promise<ToolUseSummaryMessage | null> 1414 | undefined 1415 if ( 1416 config.gates.emitToolUseSummaries && 1417 toolUseBlocks.length > 0 && 1418 !toolUseContext.abortController.signal.aborted && 1419 !toolUseContext.agentId // subagents don't surface in mobile UI — skip the Haiku call 1420 ) { 1421 // Extract the last assistant text block for context 1422 const lastAssistantMessage = assistantMessages.at(-1) 1423 let lastAssistantText: string | undefined 1424 if (lastAssistantMessage) { 1425 const textBlocks = 
lastAssistantMessage.message.content.filter( 1426 block => block.type === 'text', 1427 ) 1428 if (textBlocks.length > 0) { 1429 const lastTextBlock = textBlocks.at(-1) 1430 if (lastTextBlock && 'text' in lastTextBlock) { 1431 lastAssistantText = lastTextBlock.text 1432 } 1433 } 1434 } 1435 1436 // Collect tool info for summary generation 1437 const toolUseIds = toolUseBlocks.map(block => block.id) 1438 const toolInfoForSummary = toolUseBlocks.map(block => { 1439 // Find the corresponding tool result 1440 const toolResult = toolResults.find( 1441 result => 1442 result.type === 'user' && 1443 Array.isArray(result.message.content) && 1444 result.message.content.some( 1445 content => 1446 content.type === 'tool_result' && 1447 content.tool_use_id === block.id, 1448 ), 1449 ) 1450 const resultContent = 1451 toolResult?.type === 'user' && 1452 Array.isArray(toolResult.message.content) 1453 ? toolResult.message.content.find( 1454 (c): c is ToolResultBlockParam => 1455 c.type === 'tool_result' && c.tool_use_id === block.id, 1456 ) 1457 : undefined 1458 return { 1459 name: block.name, 1460 input: block.input, 1461 output: 1462 resultContent && 'content' in resultContent 1463 ? resultContent.content 1464 : null, 1465 } 1466 }) 1467 1468 // Fire off summary generation without blocking the next API call 1469 nextPendingToolUseSummary = generateToolUseSummary({ 1470 tools: toolInfoForSummary, 1471 signal: toolUseContext.abortController.signal, 1472 isNonInteractiveSession: toolUseContext.options.isNonInteractiveSession, 1473 lastAssistantText, 1474 }) 1475 .then(summary => { 1476 if (summary) { 1477 return createToolUseSummaryMessage(summary, toolUseIds) 1478 } 1479 return null 1480 }) 1481 .catch(() => null) 1482 } 1483 1484 // We were aborted during tool calls 1485 if (toolUseContext.abortController.signal.aborted) { 1486 // chicago MCP: auto-unhide + lock release when aborted mid-tool-call. 1487 // This is the most likely Ctrl+C path for CU (e.g. slow screenshot). 
1488 // Main thread only — see stopHooks.ts for the subagent rationale. 1489 if (feature('CHICAGO_MCP') && !toolUseContext.agentId) { 1490 try { 1491 const { cleanupComputerUseAfterTurn } = await import( 1492 './utils/computerUse/cleanup.js' 1493 ) 1494 await cleanupComputerUseAfterTurn(toolUseContext) 1495 } catch { 1496 // Failures are silent — this is dogfooding cleanup, not critical path 1497 } 1498 } 1499 // Skip the interruption message for submit-interrupts — the queued 1500 // user message that follows provides sufficient context. 1501 if (toolUseContext.abortController.signal.reason !== 'interrupt') { 1502 yield createUserInterruptionMessage({ 1503 toolUse: true, 1504 }) 1505 } 1506 // Check maxTurns before returning when aborted 1507 const nextTurnCountOnAbort = turnCount + 1 1508 if (maxTurns && nextTurnCountOnAbort > maxTurns) { 1509 yield createAttachmentMessage({ 1510 type: 'max_turns_reached', 1511 maxTurns, 1512 turnCount: nextTurnCountOnAbort, 1513 }) 1514 } 1515 return { reason: 'aborted_tools' } 1516 } 1517 1518 // If a hook indicated to prevent continuation, stop here 1519 if (shouldPreventContinuation) { 1520 return { reason: 'hook_stopped' } 1521 } 1522 1523 if (tracking?.compacted) { 1524 tracking.turnCounter++ 1525 logEvent('tengu_post_autocompact_turn', { 1526 turnId: 1527 tracking.turnId as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS, 1528 turnCounter: tracking.turnCounter, 1529 1530 queryChainId: queryChainIdForAnalytics, 1531 queryDepth: queryTracking.depth, 1532 }) 1533 } 1534 1535 // Be careful to do this after tool calls are done, because the API 1536 // will error if we interleave tool_result messages with regular user messages. 
1537 1538 // Instrumentation: Track message count before attachments 1539 logEvent('tengu_query_before_attachments', { 1540 messagesForQueryCount: messagesForQuery.length, 1541 assistantMessagesCount: assistantMessages.length, 1542 toolResultsCount: toolResults.length, 1543 queryChainId: queryChainIdForAnalytics, 1544 queryDepth: queryTracking.depth, 1545 }) 1546 1547 // Get queued commands snapshot before processing attachments. 1548 // These will be sent as attachments so Claude can respond to them in the current turn. 1549 // 1550 // Drain pending notifications. LocalShellTask completions are 'next' 1551 // (when MONITOR_TOOL is on) and drain without Sleep. Other task types 1552 // (agent/workflow/framework) still default to 'later' — the Sleep flush 1553 // covers those. If all task types move to 'next', this branch could go. 1554 // 1555 // Slash commands are excluded from mid-turn drain — they must go through 1556 // processSlashCommand after the turn ends (via useQueueProcessor), not be 1557 // sent to the model as text. Bash-mode commands are already excluded by 1558 // INLINE_NOTIFICATION_MODES in getQueuedCommandAttachments. 1559 // 1560 // Agent scoping: the queue is a process-global singleton shared by the 1561 // coordinator and all in-process subagents. Each loop drains only what's 1562 // addressed to it — main thread drains agentId===undefined, subagents 1563 // drain their own agentId. User prompts (mode:'prompt') still go to main 1564 // only; subagents never see the prompt stream. 1565 // eslint-disable-next-line custom-rules/require-tool-match-name -- ToolUseBlock.name has no aliases 1566 const sleepRan = toolUseBlocks.some(b => b.name === SLEEP_TOOL_NAME) 1567 const isMainThread = 1568 querySource.startsWith('repl_main_thread') || querySource === 'sdk' 1569 const currentAgentId = toolUseContext.agentId 1570 const queuedCommandsSnapshot = getCommandsByMaxPriority( 1571 sleepRan ? 
'later' : 'next', 1572 ).filter(cmd => { 1573 if (isSlashCommand(cmd)) return false 1574 if (isMainThread) return cmd.agentId === undefined 1575 // Subagents only drain task-notifications addressed to them — never 1576 // user prompts, even if someone stamps an agentId on one. 1577 return cmd.mode === 'task-notification' && cmd.agentId === currentAgentId 1578 }) 1579 1580 for await (const attachment of getAttachmentMessages( 1581 null, 1582 updatedToolUseContext, 1583 null, 1584 queuedCommandsSnapshot, 1585 [...messagesForQuery, ...assistantMessages, ...toolResults], 1586 querySource, 1587 )) { 1588 yield attachment 1589 toolResults.push(attachment) 1590 } 1591 1592 // Memory prefetch consume: only if settled and not already consumed on 1593 // an earlier iteration. If not settled yet, skip (zero-wait) and retry 1594 // next iteration — the prefetch gets as many chances as there are loop 1595 // iterations before the turn ends. readFileState (cumulative across 1596 // iterations) filters out memories the model already Read/Wrote/Edited 1597 // — including in earlier iterations, which the per-iteration 1598 // toolUseBlocks array would miss. 1599 if ( 1600 pendingMemoryPrefetch && 1601 pendingMemoryPrefetch.settledAt !== null && 1602 pendingMemoryPrefetch.consumedOnIteration === -1 1603 ) { 1604 const memoryAttachments = filterDuplicateMemoryAttachments( 1605 await pendingMemoryPrefetch.promise, 1606 toolUseContext.readFileState, 1607 ) 1608 for (const memAttachment of memoryAttachments) { 1609 const msg = createAttachmentMessage(memAttachment) 1610 yield msg 1611 toolResults.push(msg) 1612 } 1613 pendingMemoryPrefetch.consumedOnIteration = turnCount - 1 1614 } 1615 1616 1617 // Inject prefetched skill discovery. collectSkillDiscoveryPrefetch emits 1618 // hidden_by_main_turn — true when the prefetch resolved before this point 1619 // (should be >98% at AKI@250ms / Haiku@573ms vs turn durations of 2-30s). 
1620 if (skillPrefetch && pendingSkillPrefetch) { 1621 const skillAttachments = 1622 await skillPrefetch.collectSkillDiscoveryPrefetch(pendingSkillPrefetch) 1623 for (const att of skillAttachments) { 1624 const msg = createAttachmentMessage(att) 1625 yield msg 1626 toolResults.push(msg) 1627 } 1628 } 1629 1630 // Remove only commands that were actually consumed as attachments. 1631 // Prompt and task-notification commands are converted to attachments above. 1632 const consumedCommands = queuedCommandsSnapshot.filter( 1633 cmd => cmd.mode === 'prompt' || cmd.mode === 'task-notification', 1634 ) 1635 if (consumedCommands.length > 0) { 1636 for (const cmd of consumedCommands) { 1637 if (cmd.uuid) { 1638 consumedCommandUuids.push(cmd.uuid) 1639 notifyCommandLifecycle(cmd.uuid, 'started') 1640 } 1641 } 1642 removeFromQueue(consumedCommands) 1643 } 1644 1645 // Instrumentation: Track file change attachments after they're added 1646 const fileChangeAttachmentCount = count( 1647 toolResults, 1648 tr => 1649 tr.type === 'attachment' && tr.attachment.type === 'edited_text_file', 1650 ) 1651 1652 logEvent('tengu_query_after_attachments', { 1653 totalToolResultsCount: toolResults.length, 1654 fileChangeAttachmentCount, 1655 queryChainId: queryChainIdForAnalytics, 1656 queryDepth: queryTracking.depth, 1657 }) 1658 1659 // Refresh tools between turns so newly-connected MCP servers become available 1660 if (updatedToolUseContext.options.refreshTools) { 1661 const refreshedTools = updatedToolUseContext.options.refreshTools() 1662 if (refreshedTools !== updatedToolUseContext.options.tools) { 1663 updatedToolUseContext = { 1664 ...updatedToolUseContext, 1665 options: { 1666 ...updatedToolUseContext.options, 1667 tools: refreshedTools, 1668 }, 1669 } 1670 } 1671 } 1672 1673 const toolUseContextWithQueryTracking = { 1674 ...updatedToolUseContext, 1675 queryTracking, 1676 } 1677 1678 // Each time we have tool results and are about to recurse, that's a turn 1679 const nextTurnCount = 
turnCount + 1 1680 1681 // Periodic task summary for `claude ps` — fires mid-turn so a 1682 // long-running agent still refreshes what it's working on. Gated 1683 // only on !agentId so every top-level conversation (REPL, SDK, HFI, 1684 // remote) generates summaries; subagents/forks don't. 1685 if (feature('BG_SESSIONS')) { 1686 if ( 1687 !toolUseContext.agentId && 1688 taskSummaryModule!.shouldGenerateTaskSummary() 1689 ) { 1690 taskSummaryModule!.maybeGenerateTaskSummary({ 1691 systemPrompt, 1692 userContext, 1693 systemContext, 1694 toolUseContext, 1695 forkContextMessages: [ 1696 ...messagesForQuery, 1697 ...assistantMessages, 1698 ...toolResults, 1699 ], 1700 }) 1701 } 1702 } 1703 1704 // Check if we've reached the max turns limit 1705 if (maxTurns && nextTurnCount > maxTurns) { 1706 yield createAttachmentMessage({ 1707 type: 'max_turns_reached', 1708 maxTurns, 1709 turnCount: nextTurnCount, 1710 }) 1711 return { reason: 'max_turns', turnCount: nextTurnCount } 1712 } 1713 1714 queryCheckpoint('query_recursive_call') 1715 const next: State = { 1716 messages: [...messagesForQuery, ...assistantMessages, ...toolResults], 1717 toolUseContext: toolUseContextWithQueryTracking, 1718 autoCompactTracking: tracking, 1719 turnCount: nextTurnCount, 1720 maxOutputTokensRecoveryCount: 0, 1721 hasAttemptedReactiveCompact: false, 1722 pendingToolUseSummary: nextPendingToolUseSummary, 1723 maxOutputTokensOverride: undefined, 1724 stopHookActive, 1725 transition: { reason: 'next_turn' }, 1726 } 1727 state = next 1728 } // while (true) 1729}