source dump of claude code
at main 495 lines 17 kB view raw
import type { Anthropic } from '@anthropic-ai/sdk'
import type { BetaMessageParam as MessageParam } from '@anthropic-ai/sdk/resources/beta/messages/messages.mjs'
// @aws-sdk/client-bedrock-runtime is imported dynamically in countTokensWithBedrock()
// to defer ~279KB of AWS SDK code until a Bedrock call is actually made
import type { CountTokensCommandInput } from '@aws-sdk/client-bedrock-runtime'
import { getAPIProvider } from 'src/utils/model/providers.js'
import { VERTEX_COUNT_TOKENS_ALLOWED_BETAS } from '../constants/betas.js'
import type { Attachment } from '../utils/attachments.js'
import { getModelBetas } from '../utils/betas.js'
import { getVertexRegionForModel, isEnvTruthy } from '../utils/envUtils.js'
import { logError } from '../utils/log.js'
import { normalizeAttachmentForAPI } from '../utils/messages.js'
import {
  createBedrockRuntimeClient,
  getInferenceProfileBackingModel,
  isFoundationModel,
} from '../utils/model/bedrock.js'
import {
  getDefaultSonnetModel,
  getMainLoopModel,
  getSmallFastModel,
  normalizeModelStringForAPI,
} from '../utils/model/model.js'
import { jsonStringify } from '../utils/slowOperations.js'
import { isToolReferenceBlock } from '../utils/toolSearch.js'
import { getAPIMetadata, getExtraBodyParams } from './api/claude.js'
import { getAnthropicClient } from './api/client.js'
import { withTokenCountVCR } from './vcr.js'

// Minimal values for token counting with thinking enabled
// API constraint: max_tokens must be greater than thinking.budget_tokens
const TOKEN_COUNT_THINKING_BUDGET = 1024
const TOKEN_COUNT_MAX_TOKENS = 2048

/**
 * Check if messages contain thinking blocks
 */
function hasThinkingBlocks(
  messages: Anthropic.Beta.Messages.BetaMessageParam[],
): boolean {
  for (const message of messages) {
    // Thinking blocks only ever appear in assistant messages with array content.
    if (message.role === 'assistant' && Array.isArray(message.content)) {
      for (const block of message.content) {
        if (
          typeof block === 'object' &&
          block !== null &&
          'type' in block &&
          (block.type === 'thinking' || block.type === 'redacted_thinking')
        ) {
          return true
        }
      }
    }
  }
  return false
}

/**
 * Strip tool search-specific fields from messages before sending for token counting.
 * This removes 'caller' from tool_use blocks and 'tool_reference' from tool_result content.
 * These fields are only valid with the tool search beta and will cause errors otherwise.
 *
 * Note: We use 'as unknown as' casts because the SDK types don't include tool search beta fields,
 * but at runtime these fields may exist from API responses when tool search was enabled.
 */
function stripToolSearchFieldsFromMessages(
  messages: Anthropic.Beta.Messages.BetaMessageParam[],
): Anthropic.Beta.Messages.BetaMessageParam[] {
  return messages.map(message => {
    // String content can't contain tool blocks; pass it through untouched.
    if (!Array.isArray(message.content)) {
      return message
    }

    const normalizedContent = message.content.map(block => {
      // Strip 'caller' from tool_use blocks (assistant messages)
      if (block.type === 'tool_use') {
        // Destructure to exclude any extra fields like 'caller'
        const toolUse =
          block as Anthropic.Beta.Messages.BetaToolUseBlockParam & {
            caller?: unknown
          }
        // Rebuild with only the fields the non-beta API accepts.
        return {
          type: 'tool_use' as const,
          id: toolUse.id,
          name: toolUse.name,
          input: toolUse.input,
        }
      }

      // Strip tool_reference blocks from tool_result content (user messages)
      if (block.type === 'tool_result') {
        const toolResult =
          block as Anthropic.Beta.Messages.BetaToolResultBlockParam
        if (Array.isArray(toolResult.content)) {
          const filteredContent = (toolResult.content as unknown[]).filter(
            c => !isToolReferenceBlock(c),
          ) as typeof toolResult.content

          // If filtering removed everything, substitute a placeholder so the
          // tool_result isn't sent with empty content.
          if (filteredContent.length === 0) {
            return {
              ...toolResult,
              content: [{ type: 'text' as const, text: '[tool references]' }],
            }
          }
          // Only allocate a new block when something was actually removed.
          if (filteredContent.length !== toolResult.content.length) {
            return {
              ...toolResult,
              content: filteredContent,
            }
          }
        }
      }

      return block
    })

    return {
      ...message,
      content: normalizedContent,
    }
  })
}

/**
 * Count tokens for a single string of user content via the token-counting API.
 *
 * @param content - raw text to count; wrapped in a single user message.
 * @returns the token count, 0 for empty content, or null if counting failed.
 */
export async function countTokensWithAPI(
  content: string,
): Promise<number | null> {
  // Special case for empty content - API doesn't accept empty messages
  if (!content) {
    return 0
  }

  const message: Anthropic.Beta.Messages.BetaMessageParam = {
    role: 'user',
    content: content,
  }

  return countMessagesTokensWithAPI([message], [])
}

/**
 * Count tokens for a messages + tools payload using the provider-appropriate
 * endpoint: Bedrock goes through countTokensWithBedrock(); Anthropic/Vertex use
 * the SDK's beta countTokens API. Calls are wrapped in the token-count VCR for
 * record/replay.
 *
 * @returns the input token count, or null on any failure (errors are logged,
 *   never thrown).
 */
export async function countMessagesTokensWithAPI(
  messages: Anthropic.Beta.Messages.BetaMessageParam[],
  tools: Anthropic.Beta.Messages.BetaToolUnion[],
): Promise<number | null> {
  return withTokenCountVCR(messages, tools, async () => {
    try {
      const model = getMainLoopModel()
      const betas = getModelBetas(model)
      const containsThinking = hasThinkingBlocks(messages)

      if (getAPIProvider() === 'bedrock') {
        // @anthropic-sdk/bedrock-sdk doesn't support countTokens currently
        return countTokensWithBedrock({
          model: normalizeModelStringForAPI(model),
          messages,
          tools,
          betas,
          containsThinking,
        })
      }

      const anthropic = await getAnthropicClient({
        maxRetries: 1,
        model,
        source: 'count_tokens',
      })

      // Vertex only accepts an allow-listed subset of betas on countTokens.
      const filteredBetas =
        getAPIProvider() === 'vertex'
          ? betas.filter(b => VERTEX_COUNT_TOKENS_ALLOWED_BETAS.has(b))
          : betas

      const response = await anthropic.beta.messages.countTokens({
        model: normalizeModelStringForAPI(model),
        messages:
          // When we pass tools and no messages, we need to pass a dummy message
          // to get an accurate tool token count.
          messages.length > 0 ? messages : [{ role: 'user', content: 'foo' }],
        tools,
        ...(filteredBetas.length > 0 && { betas: filteredBetas }),
        // Enable thinking if messages contain thinking blocks
        ...(containsThinking && {
          thinking: {
            type: 'enabled',
            budget_tokens: TOKEN_COUNT_THINKING_BUDGET,
          },
        }),
      })

      if (typeof response.input_tokens !== 'number') {
        // Vertex client throws
        // Bedrock client succeeds with { Output: { __type: 'com.amazon.coral.service#UnknownOperationException' }, Version: '1.0' }
        return null
      }

      return response.input_tokens
    } catch (error) {
      logError(error)
      return null
    }
  })
}

/**
 * Cheap local token estimate: assumes ~`bytesPerToken` characters per token
 * (default 4). Used when the API-based count is unavailable.
 */
export function roughTokenCountEstimation(
  content: string,
  bytesPerToken: number = 4,
): number {
  return Math.round(content.length / bytesPerToken)
}

/**
 * Returns an estimated bytes-per-token ratio for a given file extension.
 * Dense JSON has many single-character tokens (`{`, `}`, `:`, `,`, `"`)
 * which makes the real ratio closer to 2 rather than the default 4.
 */
export function bytesPerTokenForFileType(fileExtension: string): number {
  switch (fileExtension) {
    case 'json':
    case 'jsonl':
    case 'jsonc':
      return 2
    default:
      return 4
  }
}

/**
 * Like {@link roughTokenCountEstimation} but uses a more accurate
 * bytes-per-token ratio when the file type is known.
 *
 * This matters when the API-based token count is unavailable (e.g. on
 * Bedrock) and we fall back to the rough estimate — an underestimate can
 * let an oversized tool result slip into the conversation.
 */
export function roughTokenCountEstimationForFileType(
  content: string,
  fileExtension: string,
): number {
  return roughTokenCountEstimation(
    content,
    bytesPerTokenForFileType(fileExtension),
  )
}

/**
 * Estimates token count for a Message object by extracting and analyzing its text content.
 * This provides a more reliable estimate than getTokenUsage for messages that may have been compacted.
 * Uses Haiku for token counting (Haiku 4.5 supports thinking blocks), except:
 * - Vertex global region: uses Sonnet (Haiku not available)
 * - Bedrock with thinking blocks: uses Sonnet (Haiku 3.5 doesn't support thinking)
 *
 * NOTE(review): unlike countMessagesTokensWithAPI, this function has no
 * try/catch — API errors propagate to the caller despite the `number | null`
 * return type; confirm callers handle rejection.
 */
export async function countTokensViaHaikuFallback(
  messages: Anthropic.Beta.Messages.BetaMessageParam[],
  tools: Anthropic.Beta.Messages.BetaToolUnion[],
): Promise<number | null> {
  // Check if messages contain thinking blocks
  const containsThinking = hasThinkingBlocks(messages)

  // If we're on Vertex and using global region, always use Sonnet since Haiku is not available there.
  const isVertexGlobalEndpoint =
    isEnvTruthy(process.env.CLAUDE_CODE_USE_VERTEX) &&
    getVertexRegionForModel(getSmallFastModel()) === 'global'
  // If we're on Bedrock with thinking blocks, use Sonnet since Haiku 3.5 doesn't support thinking
  const isBedrockWithThinking =
    isEnvTruthy(process.env.CLAUDE_CODE_USE_BEDROCK) && containsThinking
  // If we're on Vertex with thinking blocks, use Sonnet since Haiku 3.5 doesn't support thinking
  const isVertexWithThinking =
    isEnvTruthy(process.env.CLAUDE_CODE_USE_VERTEX) && containsThinking
  // Otherwise always use Haiku - Haiku 4.5 supports thinking blocks.
  // WARNING: if you change this to use a non-Haiku model, this request will fail in 1P unless it uses getCLISyspromptPrefix.
  // Note: We don't need Sonnet for tool_reference blocks because we strip them via
  // stripToolSearchFieldsFromMessages() before sending.
  // Use getSmallFastModel() to respect ANTHROPIC_SMALL_FAST_MODEL env var for Bedrock users
  // with global inference profiles (see issue #10883).
  const model =
    isVertexGlobalEndpoint || isBedrockWithThinking || isVertexWithThinking
      ? getDefaultSonnetModel()
      : getSmallFastModel()
  const anthropic = await getAnthropicClient({
    maxRetries: 1,
    model,
    source: 'count_tokens',
  })

  // Strip tool search-specific fields (caller, tool_reference) before sending
  // These fields are only valid with the tool search beta header
  const normalizedMessages = stripToolSearchFieldsFromMessages(messages)

  // messages.create rejects an empty messages array, so send a 1-token dummy.
  const messagesToSend: MessageParam[] =
    normalizedMessages.length > 0
      ? (normalizedMessages as MessageParam[])
      : [{ role: 'user', content: 'count' }]

  const betas = getModelBetas(model)
  // Filter betas for Vertex - some betas (like web-search) cause 400 errors
  // on certain Vertex endpoints. See issue #10789.
  const filteredBetas =
    getAPIProvider() === 'vertex'
      ? betas.filter(b => VERTEX_COUNT_TOKENS_ALLOWED_BETAS.has(b))
      : betas

  // biome-ignore lint/plugin: token counting needs specialized parameters (thinking, betas) that sideQuery doesn't support
  const response = await anthropic.beta.messages.create({
    model: normalizeModelStringForAPI(model),
    // max_tokens: 1 keeps generation (and cost) minimal; with thinking enabled
    // the API requires max_tokens > thinking.budget_tokens, hence the larger value.
    max_tokens: containsThinking ? TOKEN_COUNT_MAX_TOKENS : 1,
    messages: messagesToSend,
    tools: tools.length > 0 ? tools : undefined,
    ...(filteredBetas.length > 0 && { betas: filteredBetas }),
    metadata: getAPIMetadata(),
    ...getExtraBodyParams(),
    // Enable thinking if messages contain thinking blocks
    ...(containsThinking && {
      thinking: {
        type: 'enabled',
        budget_tokens: TOKEN_COUNT_THINKING_BUDGET,
      },
    }),
  })

  const usage = response.usage
  const inputTokens = usage.input_tokens
  // Cache-creation and cache-read tokens still occupy context, so include them
  // in the total; `|| 0` covers providers that omit the cache fields.
  const cacheCreationTokens = usage.cache_creation_input_tokens || 0
  const cacheReadTokens = usage.cache_read_input_tokens || 0

  return inputTokens + cacheCreationTokens + cacheReadTokens
}

/**
 * Sum of {@link roughTokenCountEstimationForMessage} over a list of CLI messages.
 */
export function roughTokenCountEstimationForMessages(
  messages: readonly {
    type: string
    message?: { content?: unknown }
    attachment?: Attachment
  }[],
): number {
  let totalTokens = 0
  for (const message of messages) {
    totalTokens += roughTokenCountEstimationForMessage(message)
  }
  return totalTokens
}

/**
 * Rough token estimate for one CLI message.
 *
 * - user/assistant messages: estimated from their content blocks.
 * - attachments: normalized to API user messages first, then summed.
 * - anything else contributes 0.
 */
export function roughTokenCountEstimationForMessage(message: {
  type: string
  message?: { content?: unknown }
  attachment?: Attachment
}): number {
  if (
    (message.type === 'assistant' || message.type === 'user') &&
    message.message?.content
  ) {
    return roughTokenCountEstimationForContent(
      message.message?.content as
        | string
        | Array<Anthropic.ContentBlock>
        | Array<Anthropic.ContentBlockParam>
        | undefined,
    )
  }

  if (message.type === 'attachment' && message.attachment) {
    const userMessages = normalizeAttachmentForAPI(message.attachment)
    let total = 0
    for (const userMsg of userMessages) {
      total += roughTokenCountEstimationForContent(userMsg.message.content)
    }
    return total
  }

  return 0
}

/**
 * Rough token estimate for message content: a plain string is estimated
 * directly; an array of blocks is summed block by block; missing content is 0.
 */
function roughTokenCountEstimationForContent(
  content:
    | string
    | Array<Anthropic.ContentBlock>
    | Array<Anthropic.ContentBlockParam>
    | undefined,
): number {
  if (!content) {
    return 0
  }
  if (typeof content === 'string') {
    return roughTokenCountEstimation(content)
  }
  let totalTokens = 0
  for (const block of content) {
    totalTokens += roughTokenCountEstimationForBlock(block)
  }
  return totalTokens
}

/**
 * Rough token estimate for a single content block, dispatching on block type.
 * Images/documents use a flat constant; everything else is estimated from the
 * character length of its text (or serialized) payload.
 */
function roughTokenCountEstimationForBlock(
  block: string | Anthropic.ContentBlock | Anthropic.ContentBlockParam,
): number {
  if (typeof block === 'string') {
    return roughTokenCountEstimation(block)
  }
  if (block.type === 'text') {
    return roughTokenCountEstimation(block.text)
  }
  if (block.type === 'image' || block.type === 'document') {
    // https://platform.claude.com/docs/en/build-with-claude/vision#calculate-image-costs
    // tokens = (width px * height px)/750
    // Images are resized to max 2000x2000 (5333 tokens). Use a conservative
    // estimate that matches microCompact's IMAGE_MAX_TOKEN_SIZE to avoid
    // underestimating and triggering auto-compact too late.
    //
    // document: base64 PDF in source.data. Must NOT reach the
    // jsonStringify catch-all — a 1MB PDF is ~1.33M base64 chars →
    // ~325k estimated tokens, vs the ~2000 the API actually charges.
    // Same constant as microCompact's calculateToolResultTokens.
    return 2000
  }
  if (block.type === 'tool_result') {
    return roughTokenCountEstimationForContent(block.content)
  }
  if (block.type === 'tool_use') {
    // input is the JSON the model generated — arbitrarily large (bash
    // commands, Edit diffs, file contents). Stringify once for the
    // char count; the API re-serializes anyway so this is what it sees.
    return roughTokenCountEstimation(
      block.name + jsonStringify(block.input ?? {}),
    )
  }
  if (block.type === 'thinking') {
    return roughTokenCountEstimation(block.thinking)
  }
  if (block.type === 'redacted_thinking') {
    return roughTokenCountEstimation(block.data)
  }
  // server_tool_use, web_search_tool_result, mcp_tool_use, etc. —
  // text-like payloads (tool inputs, search results, no base64).
  // Stringify-length tracks the serialized form the API sees; the
  // key/bracket overhead is single-digit percent on real blocks.
  return roughTokenCountEstimation(jsonStringify(block))
}

/**
 * Count tokens via the Bedrock runtime CountTokens API, used because the
 * Anthropic Bedrock SDK has no countTokens support. The AWS SDK command class
 * is imported dynamically to keep ~279KB of code off the startup path.
 *
 * @returns the input token count, or null if the backing foundation-model ID
 *   cannot be resolved or the request fails (errors are logged, never thrown).
 */
async function countTokensWithBedrock({
  model,
  messages,
  tools,
  betas,
  containsThinking,
}: {
  model: string
  messages: Anthropic.Beta.Messages.BetaMessageParam[]
  tools: Anthropic.Beta.Messages.BetaToolUnion[]
  betas: string[]
  containsThinking: boolean
}): Promise<number | null> {
  try {
    const client = await createBedrockRuntimeClient()
    // Bedrock CountTokens requires a model ID, not an inference profile / ARN
    const modelId = isFoundationModel(model)
      ? model
      : await getInferenceProfileBackingModel(model)
    if (!modelId) {
      return null
    }

    // Body mirrors what an InvokeModel call would send, so the count matches
    // what an actual generation request would be charged.
    const requestBody = {
      anthropic_version: 'bedrock-2023-05-31',
      // When we pass tools and no messages, we need to pass a dummy message
      // to get an accurate tool token count.
      messages:
        messages.length > 0 ? messages : [{ role: 'user', content: 'foo' }],
      max_tokens: containsThinking ? TOKEN_COUNT_MAX_TOKENS : 1,
      ...(tools.length > 0 && { tools }),
      ...(betas.length > 0 && { anthropic_beta: betas }),
      ...(containsThinking && {
        thinking: {
          type: 'enabled',
          budget_tokens: TOKEN_COUNT_THINKING_BUDGET,
        },
      }),
    }

    const { CountTokensCommand } = await import(
      '@aws-sdk/client-bedrock-runtime'
    )
    const input: CountTokensCommandInput = {
      modelId,
      input: {
        invokeModel: {
          body: new TextEncoder().encode(jsonStringify(requestBody)),
        },
      },
    }
    const response = await client.send(new CountTokensCommand(input))
    const tokenCount = response.inputTokens ?? null
    return tokenCount
  } catch (error) {
    logError(error)
    return null
  }
}