services/api/withRetry.ts at main · oppi.li/claude-code

oppi.li / claude-code
fork atom
source dump of claude code
fork atom
claude-code / services / api / withRetry.ts
at main 822 lines 28 kB view raw
wrap content
oppi.li dump from zip 9d ago
63aada3f
  1import { feature } from 'bun:bundle'
  2import type Anthropic from '@anthropic-ai/sdk'
  3import {
  4  APIConnectionError,
  5  APIError,
  6  APIUserAbortError,
  7} from '@anthropic-ai/sdk'
  8import type { QuerySource } from 'src/constants/querySource.js'
  9import type { SystemAPIErrorMessage } from 'src/types/message.js'
 10import { isAwsCredentialsProviderError } from 'src/utils/aws.js'
 11import { logForDebugging } from 'src/utils/debug.js'
 12import { logError } from 'src/utils/log.js'
 13import { createSystemAPIErrorMessage } from 'src/utils/messages.js'
 14import { getAPIProviderForStatsig } from 'src/utils/model/providers.js'
 15import {
 16  clearApiKeyHelperCache,
 17  clearAwsCredentialsCache,
 18  clearGcpCredentialsCache,
 19  getClaudeAIOAuthTokens,
 20  handleOAuth401Error,
 21  isClaudeAISubscriber,
 22  isEnterpriseSubscriber,
 23} from '../../utils/auth.js'
 24import { isEnvTruthy } from '../../utils/envUtils.js'
 25import { errorMessage } from '../../utils/errors.js'
 26import {
 27  type CooldownReason,
 28  handleFastModeOverageRejection,
 29  handleFastModeRejectedByAPI,
 30  isFastModeCooldown,
 31  isFastModeEnabled,
 32  triggerFastModeCooldown,
 33} from '../../utils/fastMode.js'
 34import { isNonCustomOpusModel } from '../../utils/model/model.js'
 35import { disableKeepAlive } from '../../utils/proxy.js'
 36import { sleep } from '../../utils/sleep.js'
 37import type { ThinkingConfig } from '../../utils/thinking.js'
 38import { getFeatureValue_CACHED_MAY_BE_STALE } from '../analytics/growthbook.js'
 39import {
 40  type AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
 41  logEvent,
 42} from '../analytics/index.js'
 43import {
 44  checkMockRateLimitError,
 45  isMockRateLimitError,
 46} from '../rateLimitMocking.js'
 47import { REPEATED_529_ERROR_MESSAGE } from './errors.js'
 48import { extractConnectionErrorDetails } from './errorUtils.js'
 49
// Factory passed to `sleep` through its `abortError` option; presumably
// `sleep` calls it to build the error it raises when the signal aborts.
const abortError = () => new APIUserAbortError()

// Retry budget used when neither `options.maxRetries` nor the
// CLAUDE_CODE_MAX_RETRIES environment variable is set (see
// getDefaultMaxRetries / getMaxRetries).
const DEFAULT_MAX_RETRIES = 10
// Minimum context headroom required to retry after a max-tokens context
// overflow, and the lower bound of the adjusted `max_tokens` override.
const FLOOR_OUTPUT_TOKENS = 3000
// Number of counted 529 errors after which withRetry triggers the fallback
// model or, for external users, gives up with REPEATED_529_ERROR_MESSAGE.
const MAX_529_RETRIES = 3
// Delay of the first attempt in getRetryDelay's exponential backoff.
export const BASE_DELAY_MS = 500
 56
// Allow-list consulted by shouldRetry529 to decide whether a 529 is retried.
//
// Foreground query sources where the user IS blocking on the result — these
// retry on 529. Everything else (summaries, titles, suggestions, classifiers)
// bails immediately: during a capacity cascade each retry is 3-10× gateway
// amplification, and the user never sees those fail anyway. New sources
// default to no-retry — add here only if the user is waiting on the result.
const FOREGROUND_529_RETRY_SOURCES = new Set<QuerySource>([
  'repl_main_thread',
  'repl_main_thread:outputStyle:custom',
  'repl_main_thread:outputStyle:Explanatory',
  'repl_main_thread:outputStyle:Learning',
  'sdk',
  'agent:custom',
  'agent:default',
  'agent:builtin',
  'compact',
  'hook_agent',
  'hook_prompt',
  'verification_agent',
  'side_question',
  // Security classifiers — must complete for auto-mode correctness.
  // yoloClassifier.ts uses 'auto_mode' (not 'yolo_classifier' — that's
  // type-only). bash_classifier is ant-only; feature-gate so the string
  // tree-shakes out of external builds (excluded-strings.txt).
  'auto_mode',
  ...(feature('BASH_CLASSIFIER') ? (['bash_classifier'] as const) : []),
])
 83
 84function shouldRetry529(querySource: QuerySource | undefined): boolean {
 85  // undefined → retry (conservative for untagged call paths)
 86  return (
 87    querySource === undefined || FOREGROUND_529_RETRY_SOURCES.has(querySource)
 88  )
 89}
 90
 91// CLAUDE_CODE_UNATTENDED_RETRY: for unattended sessions (ant-only). Retries 429/529
 92// indefinitely with higher backoff and periodic keep-alive yields so the host
 93// environment does not mark the session idle mid-wait.
 94// TODO(ANT-344): the keep-alive via SystemAPIErrorMessage yields is a stopgap
 95// until there's a dedicated keep-alive channel.
 96const PERSISTENT_MAX_BACKOFF_MS = 5 * 60 * 1000
 97const PERSISTENT_RESET_CAP_MS = 6 * 60 * 60 * 1000
 98const HEARTBEAT_INTERVAL_MS = 30_000
 99
/**
 * Reports whether persistent (unattended) retry mode is on.
 *
 * Requires both the UNATTENDED_RETRY build feature and a truthy
 * CLAUDE_CODE_UNATTENDED_RETRY environment variable. `feature()` stays the
 * direct condition of the branch so the bundler can drop it when disabled.
 */
function isPersistentRetryEnabled(): boolean {
  if (feature('UNATTENDED_RETRY')) {
    return isEnvTruthy(process.env.CLAUDE_CODE_UNATTENDED_RETRY)
  }
  return false
}
105
/**
 * True when the error is a capacity signal: a 529/overloaded error (per
 * is529Error) or an APIError with status 429.
 */
function isTransientCapacityError(error: unknown): boolean {
  if (is529Error(error)) {
    return true
  }
  return error instanceof APIError && error.status === 429
}
111
/**
 * True when the error is an APIConnectionError whose extracted connection
 * details carry code ECONNRESET or EPIPE.
 */
function isStaleConnectionError(error: unknown): boolean {
  if (error instanceof APIConnectionError) {
    const code = extractConnectionErrorDetails(error)?.code
    return code === 'ECONNRESET' || code === 'EPIPE'
  }
  return false
}
119
/**
 * Mutable per-call state that withRetry passes to `operation` on every
 * attempt and attaches to CannotRetryError when it gives up.
 */
export interface RetryContext {
  /**
   * Set by withRetry after a max-tokens context-overflow error, to the
   * reduced `max_tokens` value computed for the next attempt.
   */
  maxTokensOverride?: number
  /** Model name, initialised from `RetryOptions.model`. */
  model: string
  /** Thinking configuration, initialised from `RetryOptions.thinkingConfig`. */
  thinkingConfig: ThinkingConfig
  /**
   * Only populated when fast mode is enabled; withRetry sets it to `false`
   * when it falls back from fast mode.
   */
  fastMode?: boolean
}
126
/** Caller-supplied configuration for a single withRetry invocation. */
interface RetryOptions {
  /**
   * Retries allowed after the first attempt. Falls back to
   * getDefaultMaxRetries() when omitted.
   */
  maxRetries?: number
  /** Model used for the request; seeds `RetryContext.model`. */
  model: string
  /**
   * When set, reaching MAX_529_RETRIES counted 529s throws
   * FallbackTriggeredError naming this model.
   */
  fallbackModel?: string
  /** Thinking configuration; seeds `RetryContext.thinkingConfig`. */
  thinkingConfig: ThinkingConfig
  /** Requested fast-mode state; only honoured when fast mode is enabled. */
  fastMode?: boolean
  /** Abort signal checked before each attempt and during waits. */
  signal?: AbortSignal
  /** Call-path tag; decides whether 529s are retried (see shouldRetry529). */
  querySource?: QuerySource
  /**
   * Pre-seed the consecutive 529 counter. Used when this retry loop is a
   * non-streaming fallback after a streaming 529 — the streaming 529 should
   * count toward MAX_529_RETRIES so total 529s-before-fallback is consistent
   * regardless of which request mode hit the overload.
   */
  initialConsecutive529Errors?: number
}
143
/**
 * Thrown by withRetry when an error is not retryable or the retry budget is
 * exhausted. Wraps the triggering error together with the retry context that
 * was in effect.
 *
 * The message is derived from the wrapped error, and `name` is the literal
 * 'RetryError' (not the class name).
 */
export class CannotRetryError extends Error {
  constructor(
    public readonly originalError: unknown,
    public readonly retryContext: RetryContext,
  ) {
    super(errorMessage(originalError))
    this.name = 'RetryError'

    // Reuse the wrapped error's stack, when it has one, so traces point at
    // the original failure site.
    if (originalError instanceof Error) {
      const { stack } = originalError
      if (stack) {
        this.stack = stack
      }
    }
  }
}
159
/**
 * Thrown by withRetry to signal that the caller should switch from
 * `originalModel` to `fallbackModel` after repeated 529 errors.
 */
export class FallbackTriggeredError extends Error {
  constructor(
    public readonly originalModel: string,
    public readonly fallbackModel: string,
  ) {
    super(
      'Model fallback triggered: ' + originalModel + ' -> ' + fallbackModel,
    )
    this.name = 'FallbackTriggeredError'
  }
}
169
/**
 * Runs `operation` and retries it on retryable failures, yielding a
 * SystemAPIErrorMessage before each wait so callers can surface progress.
 *
 * Each iteration re-checks the abort signal, obtains a client when needed
 * (first attempt, auth failures, stale connections) and invokes `operation`.
 * On failure the catch block handles, in order: fast-mode fallback,
 * background 529 drop, 529 counting and model fallback, retry-budget and
 * retryability checks, max-tokens overflow adjustment, and finally the
 * backoff wait.
 *
 * @param getClient Produces an SDK client; called again after auth or
 *   stale-connection errors.
 * @param operation The request to execute; receives the client, the 1-based
 *   attempt number and the mutable retry context.
 * @param options Retry configuration (see RetryOptions).
 * @returns The value `operation` resolves with.
 * @throws APIUserAbortError when `options.signal` is aborted.
 * @throws FallbackTriggeredError when counted 529s reach MAX_529_RETRIES and
 *   a fallback model is configured.
 * @throws CannotRetryError when the error is not retryable or attempts are
 *   exhausted.
 * @throws The original APIError, unwrapped, when a context overflow leaves
 *   less than FLOOR_OUTPUT_TOKENS of headroom.
 */
export async function* withRetry<T>(
  getClient: () => Promise<Anthropic>,
  operation: (
    client: Anthropic,
    attempt: number,
    context: RetryContext,
  ) => Promise<T>,
  options: RetryOptions,
): AsyncGenerator<SystemAPIErrorMessage, T> {
  const maxRetries = getMaxRetries(options)
  // fastMode is only carried in the context when fast mode is enabled.
  const retryContext: RetryContext = {
    model: options.model,
    thinkingConfig: options.thinkingConfig,
    ...(isFastModeEnabled() && { fastMode: options.fastMode }),
  }
  let client: Anthropic | null = null
  let consecutive529Errors = options.initialConsecutive529Errors ?? 0
  let lastError: unknown
  // Counts persistent-mode retries separately from `attempt`, which is
  // clamped in that mode to keep the loop alive.
  let persistentAttempt = 0
  // One initial attempt plus up to `maxRetries` retries.
  for (let attempt = 1; attempt <= maxRetries + 1; attempt++) {
    if (options.signal?.aborted) {
      throw new APIUserAbortError()
    }

    // Capture whether fast mode is active before this attempt
    // (fallback may change the state mid-loop)
    const wasFastModeActive = isFastModeEnabled()
      ? retryContext.fastMode && !isFastModeCooldown()
      : false

    try {
      // Check for mock rate limits (used by /mock-limits command for Ant employees)
      if (process.env.USER_TYPE === 'ant') {
        const mockError = checkMockRateLimitError(
          retryContext.model,
          wasFastModeActive,
        )
        if (mockError) {
          throw mockError
        }
      }

      // Get a fresh client instance on first attempt or after authentication errors
      // - 401 for first-party API authentication failures
      // - 403 "OAuth token has been revoked" (another process refreshed the token)
      // - Bedrock-specific auth errors (403 or CredentialsProviderError)
      // - Vertex-specific auth errors (credential refresh failures, 401)
      // - ECONNRESET/EPIPE: stale keep-alive socket; disable pooling and reconnect
      const isStaleConnection = isStaleConnectionError(lastError)
      if (
        isStaleConnection &&
        getFeatureValue_CACHED_MAY_BE_STALE(
          'tengu_disable_keepalive_on_econnreset',
          false,
        )
      ) {
        logForDebugging(
          'Stale connection (ECONNRESET/EPIPE) — disabling keep-alive for retry',
        )
        disableKeepAlive()
      }

      if (
        client === null ||
        (lastError instanceof APIError && lastError.status === 401) ||
        isOAuthTokenRevokedError(lastError) ||
        isBedrockAuthError(lastError) ||
        isVertexAuthError(lastError) ||
        isStaleConnection
      ) {
        // On 401 "token expired" or 403 "token revoked", force a token refresh
        if (
          (lastError instanceof APIError && lastError.status === 401) ||
          isOAuthTokenRevokedError(lastError)
        ) {
          const failedAccessToken = getClaudeAIOAuthTokens()?.accessToken
          if (failedAccessToken) {
            await handleOAuth401Error(failedAccessToken)
          }
        }
        client = await getClient()
      }

      // Success path: the generator completes with the operation's result.
      return await operation(client, attempt, retryContext)
    } catch (error) {
      // Remembered so the next iteration can decide whether to rebuild the
      // client, and so the final throw can report it.
      lastError = error
      logForDebugging(
        `API error (attempt ${attempt}/${maxRetries + 1}): ${error instanceof APIError ? `${error.status} ${error.message}` : errorMessage(error)}`,
        { level: 'error' },
      )

      // Fast mode fallback: on 429/529, either wait and retry (short delays)
      // or fall back to standard speed (long delays) to avoid cache thrashing.
      // Skip in persistent mode: the short-retry path below loops with fast
      // mode still active, so its `continue` never reaches the attempt clamp
      // and the for-loop terminates. Persistent sessions want the chunked
      // keep-alive path instead of fast-mode cache-preservation anyway.
      if (
        wasFastModeActive &&
        !isPersistentRetryEnabled() &&
        error instanceof APIError &&
        (error.status === 429 || is529Error(error))
      ) {
        // If the 429 is specifically because extra usage (overage) is not
        // available, permanently disable fast mode with a specific message.
        const overageReason = error.headers?.get(
          'anthropic-ratelimit-unified-overage-disabled-reason',
        )
        if (overageReason !== null && overageReason !== undefined) {
          handleFastModeOverageRejection(overageReason)
          retryContext.fastMode = false
          continue
        }

        const retryAfterMs = getRetryAfterMs(error)
        if (retryAfterMs !== null && retryAfterMs < SHORT_RETRY_THRESHOLD_MS) {
          // Short retry-after: wait and retry with fast mode still active
          // to preserve prompt cache (same model name on retry).
          await sleep(retryAfterMs, options.signal, { abortError })
          continue
        }
        // Long or unknown retry-after: enter cooldown (switches to standard
        // speed model), with a minimum floor to avoid flip-flopping.
        const cooldownMs = Math.max(
          retryAfterMs ?? DEFAULT_FAST_MODE_FALLBACK_HOLD_MS,
          MIN_COOLDOWN_MS,
        )
        const cooldownReason: CooldownReason = is529Error(error)
          ? 'overloaded'
          : 'rate_limit'
        triggerFastModeCooldown(Date.now() + cooldownMs, cooldownReason)
        if (isFastModeEnabled()) {
          retryContext.fastMode = false
        }
        continue
      }

      // Fast mode fallback: if the API rejects the fast mode parameter
      // (e.g., org doesn't have fast mode enabled), permanently disable fast
      // mode and retry at standard speed.
      if (wasFastModeActive && isFastModeNotEnabledError(error)) {
        handleFastModeRejectedByAPI()
        retryContext.fastMode = false
        continue
      }

      // Non-foreground sources bail immediately on 529 — no retry amplification
      // during capacity cascades. User never sees these fail.
      if (is529Error(error) && !shouldRetry529(options.querySource)) {
        logEvent('tengu_api_529_background_dropped', {
          query_source:
            options.querySource as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
        })
        throw new CannotRetryError(error, retryContext)
      }

      // Track consecutive 529 errors
      // NOTE(review): the counter is never reset when a non-529 error occurs
      // in between — confirm whether "consecutive" is meant strictly.
      if (
        is529Error(error) &&
        // If FALLBACK_FOR_ALL_PRIMARY_MODELS is not set, fall through only if the primary model is a non-custom Opus model.
        // TODO: Revisit if the isNonCustomOpusModel check should still exist, or if isNonCustomOpusModel is a stale artifact of when Claude Code was hardcoded on Opus.
        (process.env.FALLBACK_FOR_ALL_PRIMARY_MODELS ||
          (!isClaudeAISubscriber() && isNonCustomOpusModel(options.model)))
      ) {
        consecutive529Errors++
        if (consecutive529Errors >= MAX_529_RETRIES) {
          // Check if fallback model is specified
          if (options.fallbackModel) {
            logEvent('tengu_api_opus_fallback_triggered', {
              original_model:
                options.model as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
              fallback_model:
                options.fallbackModel as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
              provider: getAPIProviderForStatsig(),
            })

            // Throw special error to indicate fallback was triggered
            throw new FallbackTriggeredError(
              options.model,
              options.fallbackModel,
            )
          }

          // No fallback model: external, non-sandbox, non-persistent sessions
          // stop here with the dedicated repeated-529 message.
          if (
            process.env.USER_TYPE === 'external' &&
            !process.env.IS_SANDBOX &&
            !isPersistentRetryEnabled()
          ) {
            logEvent('tengu_api_custom_529_overloaded_error', {})
            throw new CannotRetryError(
              new Error(REPEATED_529_ERROR_MESSAGE),
              retryContext,
            )
          }
        }
      }

      // Only retry if the error indicates we should
      // Persistent mode ignores the retry budget for 429/529 errors.
      const persistent =
        isPersistentRetryEnabled() && isTransientCapacityError(error)
      if (attempt > maxRetries && !persistent) {
        throw new CannotRetryError(error, retryContext)
      }

      // AWS/GCP errors aren't always APIError, but can be retried
      // Both handlers clear their credential cache as a side effect when
      // they recognise the error.
      const handledCloudAuthError =
        handleAwsCredentialError(error) || handleGcpCredentialError(error)
      if (
        !handledCloudAuthError &&
        (!(error instanceof APIError) || !shouldRetry(error))
      ) {
        throw new CannotRetryError(error, retryContext)
      }

      // Handle max tokens context overflow errors by adjusting max_tokens for the next attempt
      // NOTE: With extended-context-window beta, this 400 error should not occur.
      // The API now returns 'model_context_window_exceeded' stop_reason instead.
      // Keeping for backward compatibility.
      if (error instanceof APIError) {
        const overflowData = parseMaxTokensContextOverflowError(error)
        if (overflowData) {
          const { inputTokens, contextLimit } = overflowData

          const safetyBuffer = 1000
          const availableContext = Math.max(
            0,
            contextLimit - inputTokens - safetyBuffer,
          )
          if (availableContext < FLOOR_OUTPUT_TOKENS) {
            logError(
              new Error(
                `availableContext ${availableContext} is less than FLOOR_OUTPUT_TOKENS ${FLOOR_OUTPUT_TOKENS}`,
              ),
            )
            // Rethrown as-is (not wrapped in CannotRetryError).
            throw error
          }
          // Ensure we have enough tokens for thinking + at least 1 output token
          const minRequired =
            (retryContext.thinkingConfig.type === 'enabled'
              ? retryContext.thinkingConfig.budgetTokens
              : 0) + 1
          const adjustedMaxTokens = Math.max(
            FLOOR_OUTPUT_TOKENS,
            availableContext,
            minRequired,
          )
          retryContext.maxTokensOverride = adjustedMaxTokens

          logEvent('tengu_max_tokens_context_overflow_adjustment', {
            inputTokens,
            contextLimit,
            adjustedMaxTokens,
            attempt,
          })

          // Retry immediately with the reduced max_tokens; no backoff wait.
          continue
        }
      }

      // For other errors, proceed with normal retry logic
      // Get retry-after header if available
      const retryAfter = getRetryAfter(error)
      let delayMs: number
      if (persistent && error instanceof APIError && error.status === 429) {
        persistentAttempt++
        // Window-based limits (e.g. 5hr Max/Pro) include a reset timestamp.
        // Wait until reset rather than polling every 5 min uselessly.
        const resetDelay = getRateLimitResetDelayMs(error)
        delayMs =
          resetDelay ??
          Math.min(
            getRetryDelay(
              persistentAttempt,
              retryAfter,
              PERSISTENT_MAX_BACKOFF_MS,
            ),
            PERSISTENT_RESET_CAP_MS,
          )
      } else if (persistent) {
        persistentAttempt++
        // Retry-After is a server directive and bypasses maxDelayMs inside
        // getRetryDelay (intentional — honoring it is correct). Cap at the
        // 6hr reset-cap here so a pathological header can't wait unbounded.
        delayMs = Math.min(
          getRetryDelay(
            persistentAttempt,
            retryAfter,
            PERSISTENT_MAX_BACKOFF_MS,
          ),
          PERSISTENT_RESET_CAP_MS,
        )
      } else {
        delayMs = getRetryDelay(attempt, retryAfter)
      }

      // In persistent mode the for-loop `attempt` is clamped at maxRetries+1;
      // use persistentAttempt for telemetry/yields so they show the true count.
      const reportedAttempt = persistent ? persistentAttempt : attempt
      logEvent('tengu_api_retry', {
        attempt: reportedAttempt,
        delayMs: delayMs,
        error: (error as APIError)
          .message as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
        status: (error as APIError).status,
        provider: getAPIProviderForStatsig(),
      })

      if (persistent) {
        if (delayMs > 60_000) {
          logEvent('tengu_api_persistent_retry_wait', {
            status: (error as APIError).status,
            delayMs,
            attempt: reportedAttempt,
            provider: getAPIProviderForStatsig(),
          })
        }
        // Chunk long sleeps so the host sees periodic stdout activity and
        // does not mark the session idle. Each yield surfaces as
        // {type:'system', subtype:'api_retry'} on stdout via QueryEngine.
        let remaining = delayMs
        while (remaining > 0) {
          if (options.signal?.aborted) throw new APIUserAbortError()
          if (error instanceof APIError) {
            yield createSystemAPIErrorMessage(
              error,
              remaining,
              reportedAttempt,
              maxRetries,
            )
          }
          const chunk = Math.min(remaining, HEARTBEAT_INTERVAL_MS)
          await sleep(chunk, options.signal, { abortError })
          remaining -= chunk
        }
        // Clamp so the for-loop never terminates. Backoff uses the separate
        // persistentAttempt counter which keeps growing to the 5-min cap.
        if (attempt >= maxRetries) attempt = maxRetries
      } else {
        // Non-persistent: one notification, then a single uninterrupted wait.
        if (error instanceof APIError) {
          yield createSystemAPIErrorMessage(error, delayMs, attempt, maxRetries)
        }
        await sleep(delayMs, options.signal, { abortError })
      }
    }
  }

  // Reached when the loop ends without returning or throwing, e.g. after the
  // last attempt took one of the `continue` paths above.
  throw new CannotRetryError(lastError, retryContext)
}
518
/**
 * Reads the `retry-after` header from an error, whatever shape its headers
 * take.
 *
 * A plain-object property lookup is tried first; if that yields nothing
 * truthy, a `get('retry-after')` accessor is used when the headers object
 * provides one.
 *
 * @returns The header value, or `null` when it is absent.
 */
function getRetryAfter(error: unknown): string | null {
  const headers = (error as { headers?: unknown }).headers

  const fromProperty = (
    headers as { 'retry-after'?: string } | null | undefined
  )?.['retry-after']
  if (fromProperty) {
    return fromProperty
  }

  const fromAccessor = (
    headers as
      | { get?: (name: string) => string | null | undefined }
      | null
      | undefined
  )?.get?.('retry-after')
  return fromAccessor ?? null
}
529
/**
 * Computes how long to wait before the next attempt, in milliseconds.
 *
 * A `retry-after` header that parses as an integer number of seconds wins
 * outright and is not capped by `maxDelayMs`. Otherwise the delay is
 * exponential backoff from BASE_DELAY_MS, capped at `maxDelayMs`, plus up to
 * 25% random jitter.
 *
 * @param attempt 1-based attempt number driving the exponent.
 * @param retryAfterHeader Raw `retry-after` header value, if any.
 * @param maxDelayMs Ceiling for the backoff component (before jitter).
 */
export function getRetryDelay(
  attempt: number,
  retryAfterHeader?: string | null,
  maxDelayMs = 32000,
): number {
  const headerSeconds = retryAfterHeader
    ? parseInt(retryAfterHeader, 10)
    : NaN
  if (!isNaN(headerSeconds)) {
    return headerSeconds * 1000
  }

  const exponential = BASE_DELAY_MS * 2 ** (attempt - 1)
  const capped = Math.min(exponential, maxDelayMs)
  return capped + Math.random() * 0.25 * capped
}
549
/**
 * Extracts the token counts from a 400 "input length and `max_tokens` exceed
 * context limit" error.
 *
 * Expected message format:
 * "input length and `max_tokens` exceed context limit: 188059 + 20000 > 200000"
 *
 * @returns The parsed input, max and limit token counts, or `undefined` when
 *   the error is not a 400, has no message, or does not match the format.
 */
export function parseMaxTokensContextOverflowError(error: APIError):
  | {
      inputTokens: number
      maxTokens: number
      contextLimit: number
    }
  | undefined {
  const { status, message } = error
  if (status !== 400 || !message) {
    return undefined
  }

  const marker = 'input length and `max_tokens` exceed context limit'
  if (!message.includes(marker)) {
    return undefined
  }

  const pattern =
    /input length and `max_tokens` exceed context limit: (\d+) \+ (\d+) > (\d+)/
  const groups = pattern.exec(message)
  if (groups === null || groups.length !== 4) {
    return undefined
  }

  const [, rawInput, rawMax, rawLimit] = groups
  if (!rawInput || !rawMax || !rawLimit) {
    logError(
      new Error(
        'Unable to parse max_tokens from max_tokens exceed context limit error message',
      ),
    )
    return undefined
  }

  const inputTokens = parseInt(rawInput, 10)
  const maxTokens = parseInt(rawMax, 10)
  const contextLimit = parseInt(rawLimit, 10)
  const anyInvalid = [inputTokens, maxTokens, contextLimit].some(value =>
    isNaN(value),
  )
  return anyInvalid ? undefined : { inputTokens, maxTokens, contextLimit }
}
596
// TODO: Replace with a response header check once the API adds a dedicated
// header for fast-mode rejection (e.g., x-fast-mode-rejected). String-matching
// the error message is fragile and will break if the API wording changes.
/**
 * True when the error is a 400 APIError whose message says fast mode is not
 * enabled.
 */
function isFastModeNotEnabledError(error: unknown): boolean {
  if (error instanceof APIError && error.status === 400) {
    return error.message?.includes('Fast mode is not enabled') ?? false
  }
  return false
}
609
/**
 * True when the error is an APIError reporting overload: either status 529
 * or a message containing the serialized `overloaded_error` type.
 *
 * The message check exists because the SDK sometimes fails to pass the 529
 * status code through during streaming.
 */
export function is529Error(error: unknown): boolean {
  if (error instanceof APIError) {
    if (error.status === 529) {
      return true
    }
    return error.message?.includes('"type":"overloaded_error"') ?? false
  }
  return false
}
622
/**
 * True when the error is a 403 APIError whose message reports that the OAuth
 * token has been revoked.
 */
function isOAuthTokenRevokedError(error: unknown): boolean {
  if (!(error instanceof APIError) || error.status !== 403) {
    return false
  }
  return error.message?.includes('OAuth token has been revoked') ?? false
}
630
/**
 * True when Bedrock is in use (CLAUDE_CODE_USE_BEDROCK is truthy) and the
 * error looks like an AWS auth failure.
 *
 * AWS libs reject without an API call if .aws holds a past Expiration value;
 * otherwise, API calls that receive expired tokens give a generic 403
 * "The security token included in the request is invalid".
 */
function isBedrockAuthError(error: unknown): boolean {
  if (!isEnvTruthy(process.env.CLAUDE_CODE_USE_BEDROCK)) {
    return false
  }
  if (isAwsCredentialsProviderError(error)) {
    return true
  }
  return error instanceof APIError && error.status === 403
}
645
/**
 * Clears the AWS credentials cache when the error is a Bedrock auth error.
 * @returns true if the cache was cleared, false otherwise.
 */
function handleAwsCredentialError(error: unknown): boolean {
  const isAuthFailure = isBedrockAuthError(error)
  if (isAuthFailure) {
    clearAwsCredentialsCache()
  }
  return isAuthFailure
}
657
// google-auth-library throws plain Error (no typed name like AWS's
// CredentialsProviderError), so credential failures are recognised by
// matching common SDK-level messages.
function isGoogleAuthLibraryCredentialError(error: unknown): boolean {
  if (error instanceof Error) {
    const { message } = error
    const credentialFailureMarkers = [
      'Could not load the default credentials',
      'Could not refresh access token',
      'invalid_grant',
    ]
    return credentialFailureMarkers.some(marker => message.includes(marker))
  }
  return false
}
669
/**
 * True when Vertex is in use (CLAUDE_CODE_USE_VERTEX is truthy) and the error
 * looks like a GCP auth failure.
 *
 * Two sources are recognised: SDK-level failures, where google-auth-library
 * fails in prepareOptions() before the HTTP call, and server-side 401s that
 * Vertex returns for expired/invalid tokens.
 */
function isVertexAuthError(error: unknown): boolean {
  if (!isEnvTruthy(process.env.CLAUDE_CODE_USE_VERTEX)) {
    return false
  }
  if (isGoogleAuthLibraryCredentialError(error)) {
    return true
  }
  return error instanceof APIError && error.status === 401
}
683
/**
 * Clears the GCP credentials cache when the error is a Vertex auth error.
 * @returns true if the cache was cleared, false otherwise.
 */
function handleGcpCredentialError(error: unknown): boolean {
  const isAuthFailure = isVertexAuthError(error)
  if (isAuthFailure) {
    clearGcpCredentialsCache()
  }
  return isAuthFailure
}
695
/**
 * Decides whether an API error is worth retrying.
 *
 * The checks run in a fixed order and the first match wins: mock errors,
 * persistent-mode capacity errors, remote-mode auth blips, overloaded
 * messages, max-tokens overflow, the server's `x-should-retry` hint,
 * connection errors, and finally the status code.
 *
 * Side effect: a 401 that reaches the status-code stage clears the API key
 * helper cache before allowing the retry.
 */
function shouldRetry(error: APIError): boolean {
  // Never retry mock errors - they're from /mock-limits command for testing
  if (isMockRateLimitError(error)) {
    return false
  }

  // Persistent mode: 429/529 always retryable, bypass subscriber gates and
  // x-should-retry header.
  if (isPersistentRetryEnabled() && isTransientCapacityError(error)) {
    return true
  }

  const { status } = error

  // CCR mode: auth is via infrastructure-provided JWTs, so a 401/403 is a
  // transient blip (auth service flap, network hiccup) rather than bad
  // credentials. Bypass x-should-retry:false — the server assumes we'd retry
  // the same bad key, but our key is fine.
  const isAuthStatus = status === 401 || status === 403
  if (isEnvTruthy(process.env.CLAUDE_CODE_REMOTE) && isAuthStatus) {
    return true
  }

  // The SDK sometimes fails to properly pass the 529 status code during
  // streaming, so overloaded errors are detected from the message content
  // before any status-based check.
  if (error.message?.includes('"type":"overloaded_error"')) {
    return true
  }

  // Max tokens context overflow errors are handled by the retry loop.
  if (parseMaxTokensContextOverflowError(error)) {
    return true
  }

  // Note this is not a standard header.
  const serverRetryHint = error.headers?.get('x-should-retry')

  // If the server explicitly says to retry, obey — except for Max and Pro
  // users, where should-retry is true but only in several hours.
  // Enterprise users can retry because they typically use PAYG instead of
  // rate limits.
  if (
    serverRetryHint === 'true' &&
    (!isClaudeAISubscriber() || isEnterpriseSubscriber())
  ) {
    return true
  }

  // Ants can ignore x-should-retry: false for 5xx server errors only.
  // For other status codes (401, 403, 400, 429, etc.), respect the header.
  if (serverRetryHint === 'false') {
    const isAntServerError =
      process.env.USER_TYPE === 'ant' && status !== undefined && status >= 500
    if (!isAntServerError) {
      return false
    }
  }

  if (error instanceof APIConnectionError) {
    return true
  }

  if (!status) {
    return false
  }

  switch (status) {
    // Request timeouts and lock timeouts.
    case 408:
    case 409:
      return true
    // Rate limits: not for ClaudeAI Subscription users, except Enterprise
    // (they typically use PAYG instead of rate limits).
    case 429:
      return !isClaudeAISubscriber() || isEnterpriseSubscriber()
    // Clear API key cache on 401 and allow retry. OAuth token handling is
    // done in the main retry loop via handleOAuth401Error.
    case 401:
      clearApiKeyHelperCache()
      return true
  }

  // Retry on 403 "token revoked" (same refresh logic as 401)
  if (isOAuthTokenRevokedError(error)) {
    return true
  }

  // Retry internal errors; everything else is final.
  return status >= 500
}
788
/**
 * Resolves the default retry budget.
 *
 * Reads CLAUDE_CODE_MAX_RETRIES and uses it when it parses to a non-negative
 * integer. An unset, empty, non-numeric or negative value falls back to
 * DEFAULT_MAX_RETRIES: a NaN or negative budget would make withRetry's
 * `attempt <= maxRetries + 1` loop run zero times, failing the request
 * without ever sending it.
 *
 * @returns The number of retries allowed after the first attempt.
 */
export function getDefaultMaxRetries(): number {
  const raw = process.env.CLAUDE_CODE_MAX_RETRIES
  if (raw) {
    const parsed = parseInt(raw, 10)
    if (Number.isInteger(parsed) && parsed >= 0) {
      return parsed
    }
  }
  return DEFAULT_MAX_RETRIES
}
/**
 * Picks the retry budget for a call: the caller's `maxRetries` when given,
 * otherwise the environment/default value from getDefaultMaxRetries().
 */
function getMaxRetries(options: RetryOptions): number {
  const { maxRetries } = options
  return maxRetries ?? getDefaultMaxRetries()
}
798
// Fast-mode cooldown used when a 429/529 carries no usable retry-after
// (30 minutes).
const DEFAULT_FAST_MODE_FALLBACK_HOLD_MS = 1_800_000
// Retry-after values below this are waited out with fast mode still active
// (20 seconds).
const SHORT_RETRY_THRESHOLD_MS = 20_000
// Minimum fast-mode cooldown, to avoid flip-flopping (10 minutes).
const MIN_COOLDOWN_MS = 600_000
802
/**
 * Converts the error's `retry-after` header to milliseconds.
 *
 * @returns The delay in milliseconds, or `null` when the header is missing,
 *   empty, or does not start with an integer number of seconds.
 */
function getRetryAfterMs(error: APIError): number | null {
  const header = getRetryAfter(error)
  if (!header) {
    return null
  }
  const seconds = parseInt(header, 10)
  return isNaN(seconds) ? null : seconds * 1000
}
813
/**
 * Computes how long to wait until the rate-limit window resets, based on the
 * `anthropic-ratelimit-unified-reset` header (a Unix timestamp in seconds).
 *
 * @returns Milliseconds until the reset, capped at PERSISTENT_RESET_CAP_MS,
 *   or `null` when the header is missing, not a finite number, or not in the
 *   future.
 */
function getRateLimitResetDelayMs(error: APIError): number | null {
  const rawReset = error.headers?.get?.('anthropic-ratelimit-unified-reset')
  const resetUnixSec = rawReset ? Number(rawReset) : NaN
  if (!Number.isFinite(resetUnixSec)) {
    return null
  }
  const msUntilReset = resetUnixSec * 1000 - Date.now()
  return msUntilReset > 0
    ? Math.min(msUntilReset, PERSISTENT_RESET_CAP_MS)
    : null
}