utils/tokens.ts at main · nonbinary.computer/claude-code

nonbinary.computer / claude-code
forked from oppi.li/claude-code
fork atom
source dump of claude code
fork atom
claude-code / utils / tokens.ts
at main 261 lines 9.6 kB view raw
wrap content
oppi.li dump from zip 14d ago
63aada3f
  1import type { BetaUsage as Usage } from '@anthropic-ai/sdk/resources/beta/messages/messages.mjs'
  2import { roughTokenCountEstimationForMessages } from '../services/tokenEstimation.js'
  3import type { AssistantMessage, Message } from '../types/message.js'
  4import { SYNTHETIC_MESSAGES, SYNTHETIC_MODEL } from './messages.js'
  5import { jsonStringify } from './slowOperations.js'
  6
  7export function getTokenUsage(message: Message): Usage | undefined {
  8  if (
  9    message?.type === 'assistant' &&
 10    'usage' in message.message &&
 11    !(
 12      message.message.content[0]?.type === 'text' &&
 13      SYNTHETIC_MESSAGES.has(message.message.content[0].text)
 14    ) &&
 15    message.message.model !== SYNTHETIC_MODEL
 16  ) {
 17    return message.message.usage
 18  }
 19  return undefined
 20}
 21
 22/**
 23 * Get the API response id for an assistant message with real (non-synthetic) usage.
 24 * Used to identify split assistant records that came from the same API response —
 25 * when parallel tool calls are streamed, each content block becomes a separate
 26 * AssistantMessage record, but they all share the same message.id.
 27 */
 28function getAssistantMessageId(message: Message): string | undefined {
 29  if (
 30    message?.type === 'assistant' &&
 31    'id' in message.message &&
 32    message.message.model !== SYNTHETIC_MODEL
 33  ) {
 34    return message.message.id
 35  }
 36  return undefined
 37}
 38
 39/**
 40 * Calculate total context window tokens from an API response's usage data.
 41 * Includes input_tokens + cache tokens + output_tokens.
 42 *
 43 * This represents the full context size at the time of that API call.
 44 * Use tokenCountWithEstimation() when you need context size from messages.
 45 */
 46export function getTokenCountFromUsage(usage: Usage): number {
 47  return (
 48    usage.input_tokens +
 49    (usage.cache_creation_input_tokens ?? 0) +
 50    (usage.cache_read_input_tokens ?? 0) +
 51    usage.output_tokens
 52  )
 53}
 54
 55export function tokenCountFromLastAPIResponse(messages: Message[]): number {
 56  let i = messages.length - 1
 57  while (i >= 0) {
 58    const message = messages[i]
 59    const usage = message ? getTokenUsage(message) : undefined
 60    if (usage) {
 61      return getTokenCountFromUsage(usage)
 62    }
 63    i--
 64  }
 65  return 0
 66}
 67
 68/**
 69 * Final context window size from the last API response's usage.iterations[-1].
 70 * Used for task_budget.remaining computation across compaction boundaries —
 71 * the server's budget countdown is context-based, so remaining decrements by
 72 * the pre-compact final window, not billing spend. See monorepo
 73 * api/api/sampling/prompt/renderer.py:292 for the server-side computation.
 74 *
 75 * Falls back to top-level input_tokens + output_tokens when iterations is
 76 * absent (no server-side tool loops, so top-level usage IS the final window).
 77 * Both paths exclude cache tokens to match #304930's formula.
 78 */
 79export function finalContextTokensFromLastResponse(
 80  messages: Message[],
 81): number {
 82  let i = messages.length - 1
 83  while (i >= 0) {
 84    const message = messages[i]
 85    const usage = message ? getTokenUsage(message) : undefined
 86    if (usage) {
 87      // Stainless types don't include iterations yet — cast like advisor.ts:43
 88      const iterations = (
 89        usage as {
 90          iterations?: Array<{
 91            input_tokens: number
 92            output_tokens: number
 93          }> | null
 94        }
 95      ).iterations
 96      if (iterations && iterations.length > 0) {
 97        const last = iterations.at(-1)!
 98        return last.input_tokens + last.output_tokens
 99      }
100      // No iterations → no server tool loop → top-level usage IS the final
101      // window. Match the iterations path's formula (input + output, no cache)
102      // rather than getTokenCountFromUsage — #304930 defines final window as
103      // non-cache input + output. Whether the server's budget countdown
104      // (renderer.py:292 calculate_context_tokens) counts cache the same way
105      // is an open question; aligning with the iterations path keeps the two
106      // branches consistent until that's resolved.
107      return usage.input_tokens + usage.output_tokens
108    }
109    i--
110  }
111  return 0
112}
113
/**
 * Get only the output_tokens from the last API response.
 * This excludes input context (system prompt, tools, prior messages).
 *
 * WARNING: Do NOT use this for threshold comparisons (autocompact, session
 * memory). Use tokenCountWithEstimation() instead, which measures full
 * context size. This function only measures how many tokens were generated
 * in a single response, not how full the context window is.
 *
 * @param messages Conversation messages, oldest first. Never mutated, so
 *   read-only arrays are accepted.
 * @returns output_tokens of the most recent real usage record, or 0 when no
 *   message carries one.
 */
export function messageTokenCountFromLastAPIResponse(
  messages: readonly Message[],
): number {
  for (let i = messages.length - 1; i >= 0; i--) {
    const message = messages[i]
    const usage = message ? getTokenUsage(message) : undefined
    if (usage) {
      return usage.output_tokens
    }
  }
  return 0
}
137
/**
 * Return the four token buckets of the most recent real usage record, with
 * missing (null/undefined) cache fields normalised to 0.
 *
 * @param messages Conversation messages, oldest first. Never mutated, so
 *   read-only arrays are accepted.
 * @returns A fresh object holding the usage numbers, or `null` when no
 *   message carries real usage.
 */
export function getCurrentUsage(messages: readonly Message[]): {
  input_tokens: number
  output_tokens: number
  cache_creation_input_tokens: number
  cache_read_input_tokens: number
} | null {
  for (let i = messages.length - 1; i >= 0; i--) {
    const message = messages[i]
    const usage = message ? getTokenUsage(message) : undefined
    if (!usage) {
      continue
    }
    return {
      input_tokens: usage.input_tokens,
      output_tokens: usage.output_tokens,
      cache_creation_input_tokens: usage.cache_creation_input_tokens ?? 0,
      cache_read_input_tokens: usage.cache_read_input_tokens ?? 0,
    }
  }
  return null
}
158
/**
 * Report whether the most recent assistant message's usage adds up to more
 * than 200,000 tokens (input + cache + output).
 *
 * Only the LAST assistant record is inspected. If it carries no real usage
 * (getTokenUsage() returns undefined), the result is `false`; earlier
 * assistant messages are not consulted.
 *
 * @param messages Conversation messages, oldest first. Never mutated, so
 *   read-only arrays are accepted.
 * @returns `true` only when that usage is strictly greater than the threshold.
 */
export function doesMostRecentAssistantMessageExceed200k(
  messages: readonly Message[],
): boolean {
  const THRESHOLD = 200_000

  const lastAssistant = messages.findLast(m => m.type === 'assistant')
  if (!lastAssistant) {
    return false
  }
  const usage = getTokenUsage(lastAssistant)
  return usage ? getTokenCountFromUsage(usage) > THRESHOLD : false
}
169
/**
 * Count the characters of content in an assistant message.
 * Used for spinner token estimation (characters / 4 ≈ tokens) when subagent
 * streaming events are filtered out and content has to be counted from
 * completed messages instead.
 *
 * Counts the same content that handleMessageFromStream would count via deltas:
 * - text blocks: the text (text_delta)
 * - thinking blocks: the thinking text (thinking_delta)
 * - redacted_thinking blocks: the data string
 * - tool_use blocks: the JSON-serialised input (input_json_delta)
 * Any other block type contributes nothing.
 * Note: signature_delta is excluded from streaming counts (not model output).
 */
export function getAssistantMessageContentLength(
  message: AssistantMessage,
): number {
  let total = 0
  for (const block of message.message.content) {
    switch (block.type) {
      case 'text':
        total += block.text.length
        break
      case 'thinking':
        total += block.thinking.length
        break
      case 'redacted_thinking':
        total += block.data.length
        break
      case 'tool_use':
        total += jsonStringify(block.input).length
        break
      default:
        break
    }
  }
  return total
}
200
/**
 * Get the current context window size in tokens.
 *
 * This is the CANONICAL way to measure context size for threshold checks
 * (autocompact, session memory init, etc.). It takes the token count of the
 * last API response (input + output + cache) and adds a rough estimate for
 * every message that came after it.
 *
 * Prefer this over:
 * - cumulative token counting (double-counts as context grows)
 * - messageTokenCountFromLastAPIResponse (only counts output_tokens)
 * - tokenCountFromLastAPIResponse (doesn't estimate new messages)
 *
 * Parallel tool calls: when one response contains several tool calls, the
 * streaming code emits a SEPARATE assistant record per content block (all
 * sharing the same message.id and usage), and the query loop places each
 * tool_result right after its tool_use:
 *   [..., assistant(id=A), user(result), assistant(id=A), user(result), ...]
 * Anchoring on the LAST assistant record would estimate only the one
 * tool_result after it and miss the earlier interleaved ones, all of which
 * are part of the next API request. So once a usage-bearing record is found,
 * the anchor is moved back to the FIRST sibling with the same message.id and
 * everything after that anchor is estimated.
 *
 * With no usage-bearing message at all, the whole list is roughly estimated.
 */
export function tokenCountWithEstimation(messages: readonly Message[]): number {
  for (let idx = messages.length - 1; idx >= 0; idx--) {
    const candidate = messages[idx]
    if (!candidate) {
      continue
    }
    const usage = getTokenUsage(candidate)
    if (!usage) {
      continue
    }

    // Index of the earliest record belonging to this API response.
    let anchor = idx
    const responseId = getAssistantMessageId(candidate)
    if (responseId) {
      for (let k = idx - 1; k >= 0; k--) {
        const earlier = messages[k]
        const earlierId = earlier ? getAssistantMessageId(earlier) : undefined
        if (earlierId === undefined) {
          // A user/tool_result/attachment message, possibly interleaved
          // between splits — keep walking.
          continue
        }
        if (earlierId !== responseId) {
          // Reached a different API response — stop walking.
          break
        }
        // Earlier split of the same API response — anchor here instead.
        anchor = k
      }
    }

    return (
      getTokenCountFromUsage(usage) +
      roughTokenCountEstimationForMessages(messages.slice(anchor + 1))
    )
  }
  return roughTokenCountEstimationForMessages(messages)
}