tools/WebFetchTool/utils.ts at main

oppi.li / claude-code
fork atom
source dump of claude code
fork atom
claude-code / tools / WebFetchTool / utils.ts
at main 530 lines 17 kB view raw
wrap content
oppi.li dump from zip 8d ago
63aada3f
  1import axios, { type AxiosResponse } from 'axios'
  2import { LRUCache } from 'lru-cache'
  3import {
  4  type AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
  5  logEvent,
  6} from '../../services/analytics/index.js'
  7import { queryHaiku } from '../../services/api/claude.js'
  8import { AbortError } from '../../utils/errors.js'
  9import { getWebFetchUserAgent } from '../../utils/http.js'
 10import { logError } from '../../utils/log.js'
 11import {
 12  isBinaryContentType,
 13  persistBinaryContent,
 14} from '../../utils/mcpOutputStorage.js'
 15import { getSettings_DEPRECATED } from '../../utils/settings/settings.js'
 16import { asSystemPrompt } from '../../utils/systemPromptType.js'
 17import { isPreapprovedHost } from './preapproved.js'
 18import { makeSecondaryModelPrompt } from './prompt.js'
 19
 20// Custom error classes for domain blocking
 21class DomainBlockedError extends Error {
 22  constructor(domain: string) {
 23    super(`Claude Code is unable to fetch from ${domain}`)
 24    this.name = 'DomainBlockedError'
 25  }
 26}
 27
 28class DomainCheckFailedError extends Error {
 29  constructor(domain: string) {
 30    super(
 31      `Unable to verify if domain ${domain} is safe to fetch. This may be due to network restrictions or enterprise security policies blocking claude.ai.`,
 32    )
 33    this.name = 'DomainCheckFailedError'
 34  }
 35}
 36
 37class EgressBlockedError extends Error {
 38  constructor(public readonly domain: string) {
 39    super(
 40      JSON.stringify({
 41        error_type: 'EGRESS_BLOCKED',
 42        domain,
 43        message: `Access to ${domain} is blocked by the network egress proxy.`,
 44      }),
 45    )
 46    this.name = 'EgressBlockedError'
 47  }
 48}
 49
 50// Cache for storing fetched URL content
 51type CacheEntry = {
 52  bytes: number
 53  code: number
 54  codeText: string
 55  content: string
 56  contentType: string
 57  persistedPath?: string
 58  persistedSize?: number
 59}
 60
 61// Cache with 15-minute TTL and 50MB size limit
 62// LRUCache handles automatic expiration and eviction
 63const CACHE_TTL_MS = 15 * 60 * 1000 // 15 minutes
 64const MAX_CACHE_SIZE_BYTES = 50 * 1024 * 1024 // 50MB
 65
 66const URL_CACHE = new LRUCache<string, CacheEntry>({
 67  maxSize: MAX_CACHE_SIZE_BYTES,
 68  ttl: CACHE_TTL_MS,
 69})
 70
 71// Separate cache for preflight domain checks. URL_CACHE is URL-keyed, so
 72// fetching two paths on the same domain triggers two identical preflight
 73// HTTP round-trips to api.anthropic.com. This hostname-keyed cache avoids
 74// that. Only 'allowed' is cached — blocked/failed re-check on next attempt.
 75const DOMAIN_CHECK_CACHE = new LRUCache<string, true>({
 76  max: 128,
 77  ttl: 5 * 60 * 1000, // 5 minutes — shorter than URL_CACHE TTL
 78})
 79
 80export function clearWebFetchCache(): void {
 81  URL_CACHE.clear()
 82  DOMAIN_CHECK_CACHE.clear()
 83}
 84
 85// Lazy singleton — defers the turndown → @mixmark-io/domino import (~1.4MB
 86// retained heap) until the first HTML fetch, and reuses one instance across
 87// calls (construction builds 15 rule objects; .turndown() is stateless).
 88// @types/turndown ships only `export =` (no .d.mts), so TS types the import
 89// as the class itself while Bun wraps CJS in { default } — hence the cast.
 90type TurndownCtor = typeof import('turndown')
 91let turndownServicePromise: Promise<InstanceType<TurndownCtor>> | undefined
 92function getTurndownService(): Promise<InstanceType<TurndownCtor>> {
 93  return (turndownServicePromise ??= import('turndown').then(m => {
 94    const Turndown = (m as unknown as { default: TurndownCtor }).default
 95    return new Turndown()
 96  }))
 97}
 98
 99// PSR requested limiting the length of URLs to 250 to lower the potential
100// for a data exfiltration. However, this is too restrictive for some customers'
101// legitimate use cases, such as JWT-signed URLs (e.g., cloud service signed URLs)
102// that can be much longer. We already require user approval for each domain,
103// which provides a primary security boundary. In addition, Claude Code has
104// other data exfil channels, and this one does not seem relatively high risk,
105// so I'm removing that length restriction. -ab
106const MAX_URL_LENGTH = 2000
107
108// Per PSR:
109// "Implement resource consumption controls because setting limits on CPU,
110// memory, and network usage for the Web Fetch tool can prevent a single
111// request or user from overwhelming the system."
112const MAX_HTTP_CONTENT_LENGTH = 10 * 1024 * 1024
113
114// Timeout for the main HTTP fetch request (60 seconds).
115// Prevents hanging indefinitely on slow/unresponsive servers.
116const FETCH_TIMEOUT_MS = 60_000
117
118// Timeout for the domain blocklist preflight check (10 seconds).
119const DOMAIN_CHECK_TIMEOUT_MS = 10_000
120
121// Cap same-host redirect hops. Without this a malicious server can return
122// a redirect loop (/a → /b → /a …) and the per-request FETCH_TIMEOUT_MS
123// resets on every hop, hanging the tool until user interrupt. 10 matches
124// common client defaults (axios=5, follow-redirects=21, Chrome=20).
125const MAX_REDIRECTS = 10
126
127// Truncate to not spend too many tokens
128export const MAX_MARKDOWN_LENGTH = 100_000
129
/**
 * Reports whether `url` points at a preapproved host/path combination.
 *
 * @param url - Candidate URL as a string.
 * @returns The result of `isPreapprovedHost` for the URL's hostname and
 *   pathname, or `false` when the URL cannot be parsed (or the lookup throws).
 */
export function isPreapprovedUrl(url: string): boolean {
  try {
    const { hostname, pathname } = new URL(url)
    return isPreapprovedHost(hostname, pathname)
  } catch {
    return false
  }
}
138
/**
 * Cheap syntactic gate applied before any network activity.
 *
 * Rejects URLs that are longer than MAX_URL_LENGTH, fail to parse, carry
 * embedded credentials, or whose hostname has fewer than two non-empty
 * dot-separated labels (e.g. `localhost`, `localhost.`, `intranet`).
 *
 * This is only an initial filter: dotted IP literals such as `127.0.0.1`
 * still pass, so it must not be treated as the sole protection against
 * internal addresses.
 *
 * @param url - Candidate URL as a string.
 * @returns `true` when the URL passes all checks.
 */
export function validateURL(url: string): boolean {
  if (url.length > MAX_URL_LENGTH) {
    return false
  }

  let parsed: URL
  try {
    parsed = new URL(url)
  } catch {
    return false
  }

  // We don't need to check protocol here, as we'll upgrade http to https when making the request

  // As long as we aren't supporting cookies or internal domains, we should
  // block URLs with usernames/passwords too, even though these seem
  // exceedingly unlikely.
  if (parsed.username || parsed.password) {
    return false
  }

  // Initial filter that this isn't a privileged, company-internal URL:
  // require at least two real labels. Empty labels are ignored so that a
  // fully-qualified single-label name like "localhost." (which splits into
  // ['localhost', '']) cannot slip through on part count alone.
  const labels = parsed.hostname.split('.').filter(label => label.length > 0)
  return labels.length >= 2
}
170
// Outcome of the preflight domain check. 'check_failed' means the check
// itself could not be completed, and carries the underlying error.
type DomainCheckResult =
  | { status: 'allowed' }
  | { status: 'blocked' }
  | { status: 'check_failed'; error: Error }
175
/**
 * Asks the domain_info endpoint on api.anthropic.com whether `domain` may be
 * fetched.
 *
 * Never throws: transport errors, timeouts and unexpected statuses are
 * logged (for thrown errors) and reported as `check_failed`.
 *
 * @param domain - Hostname to check; also the cache key.
 * @returns `allowed` when the endpoint answers 200 with `can_fetch === true`
 *   (or a previous allowed answer is still cached), `blocked` for any other
 *   200 answer, `check_failed` otherwise.
 */
export async function checkDomainBlocklist(
  domain: string,
): Promise<DomainCheckResult> {
  // Only positive answers are cached, so a hit always means "allowed".
  if (DOMAIN_CHECK_CACHE.has(domain)) {
    return { status: 'allowed' }
  }
  try {
    const response = await axios.get(
      `https://api.anthropic.com/api/web/domain_info?domain=${encodeURIComponent(domain)}`,
      { timeout: DOMAIN_CHECK_TIMEOUT_MS },
    )
    if (response.status === 200) {
      if (response.data.can_fetch === true) {
        DOMAIN_CHECK_CACHE.set(domain, true)
        return { status: 'allowed' }
      }
      return { status: 'blocked' }
    }
    // Non-200 status but didn't throw
    return {
      status: 'check_failed',
      error: new Error(`Domain check returned status ${response.status}`),
    }
  } catch (e) {
    logError(e)
    // Anything can be thrown; make sure callers really receive an Error.
    const error = e instanceof Error ? e : new Error(String(e))
    return { status: 'check_failed', error }
  }
}
204
/**
 * Check if a redirect is safe to follow automatically.
 *
 * A redirect is permitted only when the protocol and port are unchanged, the
 * target carries no embedded credentials, and the hostnames are equal after
 * removing a leading "www." from each. In other words it may:
 * - Add or remove "www." in the hostname
 * - Keep the origin the same but change path/query params
 * - Or both of the above
 *
 * @param originalUrl - URL that produced the redirect.
 * @param redirectUrl - Absolute redirect target.
 * @returns `true` if the redirect may be followed; `false` otherwise,
 *   including when either URL cannot be parsed.
 */
export function isPermittedRedirect(
  originalUrl: string,
  redirectUrl: string,
): boolean {
  try {
    const parsedOriginal = new URL(originalUrl)
    const parsedRedirect = new URL(redirectUrl)

    if (parsedRedirect.protocol !== parsedOriginal.protocol) {
      return false
    }

    if (parsedRedirect.port !== parsedOriginal.port) {
      return false
    }

    if (parsedRedirect.username || parsedRedirect.password) {
      return false
    }

    // Now check hostname conditions
    // 1. Adding www. is allowed: example.com -> www.example.com
    // 2. Removing www. is allowed: www.example.com -> example.com
    // 3. Same host (with or without www.) is allowed: paths can change
    const stripWww = (hostname: string): string =>
      hostname.replace(/^www\./, '')
    return (
      stripWww(parsedOriginal.hostname) === stripWww(parsedRedirect.hostname)
    )
  } catch {
    return false
  }
}
244
/**
 * Describes a redirect that was NOT followed because the redirect checker
 * rejected it. Returned instead of a response so the caller can decide what
 * to do with the new location.
 */
type RedirectInfo = {
  type: 'redirect'
  originalUrl: string
  redirectUrl: string
  statusCode: number
}

// Redirect statuses handled here. 303 (See Other) is included: it tells the
// client to GET the Location target, which is exactly what this helper does.
const REDIRECT_STATUS_CODES: readonly number[] = [301, 302, 303, 307, 308]

/**
 * Helper function to handle fetching URLs with custom redirect handling.
 * Recursively follows redirects if they pass the redirectChecker function.
 *
 * Per PSR:
 * "Do not automatically follow redirects because following redirects could
 * allow for an attacker to exploit an open redirect vulnerability in a
 * trusted domain to force a user to make a request to a malicious domain
 * unknowingly"
 *
 * @param url - URL to request.
 * @param signal - Abort signal forwarded to every request in the chain.
 * @param redirectChecker - Decides whether a redirect from `originalUrl` to
 *   `redirectUrl` may be followed automatically.
 * @param depth - Number of redirects already followed (internal; callers
 *   should leave the default).
 * @returns The final response, or a RedirectInfo when a redirect was refused
 *   by `redirectChecker`.
 * @throws Error when more than MAX_REDIRECTS redirects are chained or a
 *   redirect lacks a Location header; EgressBlockedError when the egress
 *   proxy blocks the host; otherwise the original request error.
 */
export async function getWithPermittedRedirects(
  url: string,
  signal: AbortSignal,
  redirectChecker: (originalUrl: string, redirectUrl: string) => boolean,
  depth = 0,
): Promise<AxiosResponse<ArrayBuffer> | RedirectInfo> {
  if (depth > MAX_REDIRECTS) {
    throw new Error(`Too many redirects (exceeded ${MAX_REDIRECTS})`)
  }
  try {
    return await axios.get(url, {
      signal,
      timeout: FETCH_TIMEOUT_MS,
      // Redirects are handled manually below, so axios surfaces 3xx as errors.
      maxRedirects: 0,
      responseType: 'arraybuffer',
      maxContentLength: MAX_HTTP_CONTENT_LENGTH,
      headers: {
        Accept: 'text/markdown, text/html, */*',
        'User-Agent': getWebFetchUserAgent(),
      },
    })
  } catch (error) {
    // Only HTTP-level failures (an axios error carrying a response) get
    // special treatment; everything else propagates untouched.
    const errorResponse = axios.isAxiosError(error) ? error.response : undefined
    if (!errorResponse) {
      throw error
    }

    if (REDIRECT_STATUS_CODES.includes(errorResponse.status)) {
      const redirectLocation = errorResponse.headers.location
      if (!redirectLocation) {
        throw new Error('Redirect missing Location header')
      }

      // Resolve relative URLs against the original URL
      const redirectUrl = new URL(redirectLocation, url).toString()

      if (redirectChecker(url, redirectUrl)) {
        // Recursively follow the permitted redirect
        return getWithPermittedRedirects(
          redirectUrl,
          signal,
          redirectChecker,
          depth + 1,
        )
      }

      // Return redirect information to the caller
      return {
        type: 'redirect',
        originalUrl: url,
        redirectUrl,
        statusCode: errorResponse.status,
      }
    }

    // Detect egress proxy blocks: the proxy returns 403 with
    // X-Proxy-Error: blocked-by-allowlist when egress is restricted
    if (
      errorResponse.status === 403 &&
      errorResponse.headers['x-proxy-error'] === 'blocked-by-allowlist'
    ) {
      throw new EgressBlockedError(new URL(url).hostname)
    }

    throw error
  }
}
330
/**
 * Type guard distinguishing a refused-redirect descriptor from a real HTTP
 * response, based on the `type: 'redirect'` tag.
 */
function isRedirectInfo(
  response: AxiosResponse<ArrayBuffer> | RedirectInfo,
): response is RedirectInfo {
  return 'type' in response && response.type === 'redirect'
}
336
/** Result of a successful fetch, as handed to callers of getURLMarkdownContent. */
export type FetchedContent = {
  // Markdown for HTML responses; the UTF-8 decoded body otherwise
  content: string
  // Size of the raw response body, in bytes
  bytes: number
  // HTTP status code of the final response
  code: number
  // HTTP status text of the final response
  codeText: string
  // Value of the Content-Type response header ('' when absent)
  contentType: string
  // Where the raw bytes were saved, set only for binary content that was persisted
  persistedPath?: string
  // Size reported by the persistence layer for the saved file
  persistedSize?: number
}
346
/**
 * Fetches `url` and returns its content, converting HTML to markdown.
 *
 * Steps: validate the URL, serve from cache when possible, upgrade http to
 * https, run the domain preflight check (unless disabled in settings), fetch
 * with restricted redirect handling, persist binary payloads to disk, convert
 * HTML to markdown, and cache the result under the ORIGINAL url.
 *
 * @param url - URL requested by the caller.
 * @param abortController - Its signal cancels the HTTP request.
 * @returns The fetched content, or a RedirectInfo when the server redirected
 *   somewhere the redirect policy does not allow following automatically.
 * @throws Error('Invalid URL') when validation fails; DomainBlockedError /
 *   DomainCheckFailedError from the preflight check; any error raised by the
 *   HTTP fetch itself.
 */
export async function getURLMarkdownContent(
  url: string,
  abortController: AbortController,
): Promise<FetchedContent | RedirectInfo> {
  if (!validateURL(url)) {
    throw new Error('Invalid URL')
  }

  // Check cache (LRUCache handles TTL automatically). Return a copy so the
  // caller can never mutate the cached entry.
  const cachedEntry = URL_CACHE.get(url)
  if (cachedEntry) {
    return { ...cachedEntry }
  }

  let upgradedUrl = url

  try {
    const parsedUrl = new URL(url)

    // Upgrade http to https if needed
    if (parsedUrl.protocol === 'http:') {
      parsedUrl.protocol = 'https:'
      upgradedUrl = parsedUrl.toString()
    }

    const hostname = parsedUrl.hostname

    // Check if the user has opted to skip the blocklist check
    // This is for enterprise customers with restrictive security policies
    // that prevent outbound connections to claude.ai
    const settings = getSettings_DEPRECATED()
    if (!settings.skipWebFetchPreflight) {
      const checkResult = await checkDomainBlocklist(hostname)
      switch (checkResult.status) {
        case 'allowed':
          // Continue with the fetch
          break
        case 'blocked':
          throw new DomainBlockedError(hostname)
        case 'check_failed':
          throw new DomainCheckFailedError(hostname)
      }
    }

    if (process.env.USER_TYPE === 'ant') {
      logEvent('tengu_web_fetch_host', {
        hostname:
          hostname as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
      })
    }
  } catch (e) {
    if (
      e instanceof DomainBlockedError ||
      e instanceof DomainCheckFailedError
    ) {
      // Expected user-facing failures - re-throw without logging as internal error
      throw e
    }
    // Anything else (settings lookup, analytics) is best-effort: log it and
    // still attempt the fetch.
    logError(e)
  }

  const response = await getWithPermittedRedirects(
    upgradedUrl,
    abortController.signal,
    isPermittedRedirect,
  )

  // Check if we got a redirect response
  if (isRedirectInfo(response)) {
    return response
  }

  const rawBuffer = Buffer.from(response.data)
  // Release the axios-held ArrayBuffer copy; rawBuffer owns the bytes now.
  // This lets GC reclaim up to MAX_HTTP_CONTENT_LENGTH (10MB) before Turndown
  // builds its DOM tree (which can be 3-5x the HTML size).
  ;(response as { data: unknown }).data = null
  const contentType = response.headers['content-type'] ?? ''

  // Binary content: save raw bytes to disk with a proper extension so Claude
  // can inspect the file later. We still fall through to the utf-8 decode +
  // Haiku path below — for PDFs in particular the decoded string has enough
  // ASCII structure (/Title, text streams) that Haiku can summarize it, and
  // the saved file is a supplement rather than a replacement.
  let persistedPath: string | undefined
  let persistedSize: number | undefined
  if (isBinaryContentType(contentType)) {
    const persistId = `webfetch-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`
    const result = await persistBinaryContent(rawBuffer, contentType, persistId)
    // Persistence is best-effort: on failure the fields simply stay unset.
    if (!('error' in result)) {
      persistedPath = result.filepath
      persistedSize = result.size
    }
  }

  const bytes = rawBuffer.length
  const htmlContent = rawBuffer.toString('utf-8')

  let markdownContent: string
  let contentBytes: number
  // Media types are case-insensitive, so compare on a lowercased copy.
  if (contentType.toLowerCase().includes('text/html')) {
    markdownContent = (await getTurndownService()).turndown(htmlContent)
    contentBytes = Buffer.byteLength(markdownContent)
  } else {
    // It's not HTML - just use it raw. The decoded string's UTF-8 byte
    // length equals rawBuffer.length (modulo U+FFFD replacement on invalid
    // bytes — negligible for cache eviction accounting), so skip the O(n)
    // Buffer.byteLength scan.
    markdownContent = htmlContent
    contentBytes = bytes
  }

  // Store the fetched content in cache. Note that it's stored under
  // the original URL, not the upgraded or redirected URL.
  const entry: CacheEntry = {
    bytes,
    code: response.status,
    codeText: response.statusText,
    content: markdownContent,
    contentType,
    persistedPath,
    persistedSize,
  }
  // lru-cache requires positive integers; clamp to 1 for empty responses.
  URL_CACHE.set(url, entry, { size: Math.max(1, contentBytes) })
  // Return a copy, matching the cache-hit path, so the cached object is
  // never shared with the caller.
  return { ...entry }
}
483
/**
 * Runs the user's prompt against fetched page content using the secondary
 * (Haiku) model and returns the model's text answer.
 *
 * @param prompt - What the user wants extracted from the content.
 * @param markdownContent - Fetched content; truncated to MAX_MARKDOWN_LENGTH
 *   UTF-16 code units before being sent.
 * @param signal - Abort signal forwarded to the model call.
 * @param isNonInteractiveSession - Forwarded to the model call options.
 * @param isPreapprovedDomain - Forwarded to the prompt builder.
 * @returns The text of the first content block, or 'No response from model'
 *   when the response is empty or its first block is not text.
 * @throws AbortError when `signal` is aborted by the time the model call
 *   returns.
 */
export async function applyPromptToMarkdown(
  prompt: string,
  markdownContent: string,
  signal: AbortSignal,
  isNonInteractiveSession: boolean,
  isPreapprovedDomain: boolean,
): Promise<string> {
  // Truncate content to avoid "Prompt is too long" errors from the secondary model
  let truncatedContent = markdownContent
  if (markdownContent.length > MAX_MARKDOWN_LENGTH) {
    let end = MAX_MARKDOWN_LENGTH
    // Don't cut a surrogate pair in half: if the last kept code unit is a
    // high surrogate, drop it so the text stays well-formed.
    const lastUnit = markdownContent.charCodeAt(end - 1)
    if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
      end -= 1
    }
    truncatedContent =
      markdownContent.slice(0, end) + '\n\n[Content truncated due to length...]'
  }

  const modelPrompt = makeSecondaryModelPrompt(
    truncatedContent,
    prompt,
    isPreapprovedDomain,
  )
  const assistantMessage = await queryHaiku({
    systemPrompt: asSystemPrompt([]),
    userPrompt: modelPrompt,
    signal,
    options: {
      querySource: 'web_fetch_apply',
      agents: [],
      isNonInteractiveSession,
      hasAppendSystemPrompt: false,
      mcpTools: [],
    },
  })

  // We need to bubble this up, so that the tool call throws, causing us to return
  // an is_error tool_use block to the server, and render a red dot in the UI.
  if (signal.aborted) {
    throw new AbortError()
  }

  const { content } = assistantMessage.message
  const firstBlock = content[0]
  if (firstBlock && 'text' in firstBlock) {
    return firstBlock.text
  }
  return 'No response from model'
}