source dump of claude code
at main 261 lines 9.6 kB view raw
import type { BetaUsage as Usage } from '@anthropic-ai/sdk/resources/beta/messages/messages.mjs'
import { roughTokenCountEstimationForMessages } from '../services/tokenEstimation.js'
import type { AssistantMessage, Message } from '../types/message.js'
import { SYNTHETIC_MESSAGES, SYNTHETIC_MODEL } from './messages.js'
import { jsonStringify } from './slowOperations.js'

/**
 * Return the API usage attached to a message, or undefined when the message
 * carries no real usage.
 *
 * Usage is returned only when ALL of the following hold:
 * - the message is an assistant message whose payload has a `usage` field,
 * - its first content block is not a text block whose text is one of
 *   SYNTHETIC_MESSAGES,
 * - its model is not SYNTHETIC_MODEL.
 */
export function getTokenUsage(message: Message): Usage | undefined {
  if (
    message?.type === 'assistant' &&
    'usage' in message.message &&
    !(
      message.message.content[0]?.type === 'text' &&
      SYNTHETIC_MESSAGES.has(message.message.content[0].text)
    ) &&
    message.message.model !== SYNTHETIC_MODEL
  ) {
    return message.message.usage
  }
  return undefined
}

/**
 * Get the API response id for an assistant message with real (non-synthetic) usage.
 * Used to identify split assistant records that came from the same API response —
 * when parallel tool calls are streamed, each content block becomes a separate
 * AssistantMessage record, but they all share the same message.id.
 */
function getAssistantMessageId(message: Message): string | undefined {
  if (
    message?.type === 'assistant' &&
    'id' in message.message &&
    message.message.model !== SYNTHETIC_MODEL
  ) {
    return message.message.id
  }
  return undefined
}

/**
 * Scan `messages` from newest to oldest and return the first message for
 * which getTokenUsage() yields usage, together with its index and that usage.
 * Returns undefined when no message carries real usage.
 *
 * Shared by every "last API response" helper below so they all agree on
 * which record counts as the last response.
 */
function findLastUsage(
  messages: readonly Message[],
): { index: number; message: Message; usage: Usage } | undefined {
  for (let index = messages.length - 1; index >= 0; index--) {
    const message = messages[index]
    const usage = message ? getTokenUsage(message) : undefined
    if (message && usage) {
      return { index, message, usage }
    }
  }
  return undefined
}

/**
 * Calculate total context window tokens from an API response's usage data.
 * Includes input_tokens + cache tokens + output_tokens.
 *
 * This represents the full context size at the time of that API call.
 * Use tokenCountWithEstimation() when you need context size from messages.
 */
export function getTokenCountFromUsage(usage: Usage): number {
  return (
    usage.input_tokens +
    (usage.cache_creation_input_tokens ?? 0) +
    (usage.cache_read_input_tokens ?? 0) +
    usage.output_tokens
  )
}

/**
 * Total tokens (input + cache + output, via getTokenCountFromUsage) reported
 * by the most recent message that carries real usage.
 *
 * Does not estimate messages added after that response; returns 0 when no
 * message carries usage.
 */
export function tokenCountFromLastAPIResponse(
  messages: readonly Message[],
): number {
  const found = findLastUsage(messages)
  return found ? getTokenCountFromUsage(found.usage) : 0
}

/**
 * Final context window size from the last API response's usage.iterations[-1].
 * Used for task_budget.remaining computation across compaction boundaries —
 * the server's budget countdown is context-based, so remaining decrements by
 * the pre-compact final window, not billing spend. See monorepo
 * api/api/sampling/prompt/renderer.py:292 for the server-side computation.
 *
 * Falls back to top-level input_tokens + output_tokens when iterations is
 * absent (no server-side tool loops, so top-level usage IS the final window).
 * Both paths exclude cache tokens to match #304930's formula.
 *
 * Returns 0 when no message carries usage.
 */
export function finalContextTokensFromLastResponse(
  messages: readonly Message[],
): number {
  const found = findLastUsage(messages)
  if (!found) {
    return 0
  }
  const { usage } = found

  // Stainless types don't include iterations yet — cast like advisor.ts:43
  const iterations = (
    usage as {
      iterations?: Array<{
        input_tokens: number
        output_tokens: number
      }> | null
    }
  ).iterations

  // `at(-1)` is undefined for a missing, null, or empty iterations list, so a
  // single guard covers all three cases without a non-null assertion.
  const lastIteration = iterations?.at(-1)
  if (lastIteration) {
    return lastIteration.input_tokens + lastIteration.output_tokens
  }

  // No iterations → no server tool loop → top-level usage IS the final
  // window. Match the iterations path's formula (input + output, no cache)
  // rather than getTokenCountFromUsage — #304930 defines final window as
  // non-cache input + output. Whether the server's budget countdown
  // (renderer.py:292 calculate_context_tokens) counts cache the same way
  // is an open question; aligning with the iterations path keeps the two
  // branches consistent until that's resolved.
  return usage.input_tokens + usage.output_tokens
}

/**
 * Get only the output_tokens from the last API response.
 * This excludes input context (system prompt, tools, prior messages).
 *
 * WARNING: Do NOT use this for threshold comparisons (autocompact, session memory).
 * Use tokenCountWithEstimation() instead, which measures full context size.
 * This function is only useful for measuring how many tokens Claude generated
 * in a single response, not how full the context window is.
 *
 * Returns 0 when no message carries usage.
 */
export function messageTokenCountFromLastAPIResponse(
  messages: readonly Message[],
): number {
  const found = findLastUsage(messages)
  return found ? found.usage.output_tokens : 0
}

/**
 * Return the four usage counters from the most recent message that carries
 * real usage, with absent cache counters normalised to 0.
 *
 * Returns null when no message carries usage.
 */
export function getCurrentUsage(messages: readonly Message[]): {
  input_tokens: number
  output_tokens: number
  cache_creation_input_tokens: number
  cache_read_input_tokens: number
} | null {
  const found = findLastUsage(messages)
  if (!found) {
    return null
  }
  const { usage } = found
  return {
    input_tokens: usage.input_tokens,
    output_tokens: usage.output_tokens,
    cache_creation_input_tokens: usage.cache_creation_input_tokens ?? 0,
    cache_read_input_tokens: usage.cache_read_input_tokens ?? 0,
  }
}

/**
 * Report whether the most recent assistant message's total usage
 * (input + cache + output) is strictly greater than 200,000 tokens.
 *
 * Only the LAST assistant record is inspected: if it carries no real usage
 * (e.g. it is synthetic), the result is false even when an earlier assistant
 * message exceeded the threshold. Returns false when there is no assistant
 * message at all.
 */
export function doesMostRecentAssistantMessageExceed200k(
  messages: readonly Message[],
): boolean {
  const THRESHOLD = 200_000

  const lastAsst = messages.findLast(m => m.type === 'assistant')
  if (!lastAsst) return false
  const usage = getTokenUsage(lastAsst)
  return usage ? getTokenCountFromUsage(usage) > THRESHOLD : false
}

/**
 * Calculate the character content length of an assistant message.
 * Used for spinner token estimation (characters / 4 ≈ tokens).
 * This is used when subagent streaming events are filtered out and we
 * need to count content from completed messages instead.
 *
 * Counts the same content that handleMessageFromStream would count via deltas:
 * - text (text_delta)
 * - thinking (thinking_delta)
 * - redacted_thinking data
 * - tool_use input (input_json_delta)
 * Note: signature_delta is excluded from streaming counts (not model output).
 *
 * Blocks of any other type contribute nothing to the total.
 */
export function getAssistantMessageContentLength(
  message: AssistantMessage,
): number {
  let contentLength = 0
  for (const block of message.message.content) {
    switch (block.type) {
      case 'text':
        contentLength += block.text.length
        break
      case 'thinking':
        contentLength += block.thinking.length
        break
      case 'redacted_thinking':
        contentLength += block.data.length
        break
      case 'tool_use':
        // Length of the serialised input, mirroring what input_json_delta streams.
        contentLength += jsonStringify(block.input).length
        break
      default:
        // Other block types are intentionally not counted.
        break
    }
  }
  return contentLength
}

/**
 * Get the current context window size in tokens.
 *
 * This is the CANONICAL function for measuring context size when checking
 * thresholds (autocompact, session memory init, etc.). Uses the last API
 * response's token count (input + output + cache) plus estimates for any
 * messages added since.
 *
 * Always use this instead of:
 * - Cumulative token counting (which double-counts as context grows)
 * - messageTokenCountFromLastAPIResponse (which only counts output_tokens)
 * - tokenCountFromLastAPIResponse (which doesn't estimate new messages)
 *
 * Implementation note on parallel tool calls: when the model makes multiple
 * tool calls in one response, the streaming code emits a SEPARATE assistant
 * record per content block (all sharing the same message.id and usage), and
 * the query loop interleaves each tool_result immediately after its tool_use.
 * So the messages array looks like:
 *   [..., assistant(id=A), user(result), assistant(id=A), user(result), ...]
 * If we stop at the LAST assistant record, we only estimate the one tool_result
 * after it and miss all the earlier interleaved tool_results — which will ALL
 * be in the next API request. To avoid undercounting, after finding a usage-
 * bearing record we walk back to the FIRST sibling with the same message.id
 * so every interleaved tool_result is included in the rough estimate.
 *
 * When no message carries usage, the whole array is rough-estimated.
 */
export function tokenCountWithEstimation(messages: readonly Message[]): number {
  const found = findLastUsage(messages)
  if (!found) {
    return roughTokenCountEstimationForMessages(messages)
  }

  // Index of the record after which messages are estimated rather than
  // covered by the API-reported usage.
  let anchor = found.index

  // Walk back past any earlier sibling records split from the same API
  // response (same message.id) so interleaved tool_results between them
  // are included in the estimation slice.
  const responseId = getAssistantMessageId(found.message)
  if (responseId) {
    for (let j = found.index - 1; j >= 0; j--) {
      const prior = messages[j]
      const priorId = prior ? getAssistantMessageId(prior) : undefined
      if (priorId === responseId) {
        // Earlier split of the same API response — anchor here instead.
        anchor = j
      } else if (priorId !== undefined) {
        // Hit a different API response — stop walking.
        break
      }
      // priorId === undefined: a user/tool_result/attachment message,
      // possibly interleaved between splits — keep walking.
    }
  }

  return (
    getTokenCountFromUsage(found.usage) +
    roughTokenCountEstimationForMessages(messages.slice(anchor + 1))
  )
}