// source dump of claude code
// at main · 530 lines · 17 kB (view raw)
import axios, { type AxiosResponse } from 'axios'
import { LRUCache } from 'lru-cache'
import {
  type AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
  logEvent,
} from '../../services/analytics/index.js'
import { queryHaiku } from '../../services/api/claude.js'
import { AbortError } from '../../utils/errors.js'
import { getWebFetchUserAgent } from '../../utils/http.js'
import { logError } from '../../utils/log.js'
import {
  isBinaryContentType,
  persistBinaryContent,
} from '../../utils/mcpOutputStorage.js'
import { getSettings_DEPRECATED } from '../../utils/settings/settings.js'
import { asSystemPrompt } from '../../utils/systemPromptType.js'
import { isPreapprovedHost } from './preapproved.js'
import { makeSecondaryModelPrompt } from './prompt.js'

// ---------------------------------------------------------------------------
// Error types raised while deciding whether a domain may be fetched
// ---------------------------------------------------------------------------

/** Thrown when the preflight check reports that the domain must not be fetched. */
class DomainBlockedError extends Error {
  constructor(domain: string) {
    const message = `Claude Code is unable to fetch from ${domain}`
    super(message)
    this.name = 'DomainBlockedError'
  }
}

/** Thrown when the preflight check itself could not be completed. */
class DomainCheckFailedError extends Error {
  constructor(domain: string) {
    const message = `Unable to verify if domain ${domain} is safe to fetch. This may be due to network restrictions or enterprise security policies blocking claude.ai.`
    super(message)
    this.name = 'DomainCheckFailedError'
  }
}

/**
 * Thrown when the network egress proxy refuses the request. The error message
 * is a JSON document (`error_type`, `domain`, `message`), and the blocked
 * domain is also exposed as a readonly property.
 */
class EgressBlockedError extends Error {
  public readonly domain: string

  constructor(domain: string) {
    const payload = {
      error_type: 'EGRESS_BLOCKED',
      domain,
      message: `Access to ${domain} is blocked by the network egress proxy.`,
    }
    super(JSON.stringify(payload))
    this.domain = domain
    this.name = 'EgressBlockedError'
  }
}

// ---------------------------------------------------------------------------
// Caches
// ---------------------------------------------------------------------------

/** Shape of one cached fetch result, keyed by URL in URL_CACHE. */
type CacheEntry = {
  bytes: number
  code: number
  codeText: string
  content: string
  contentType: string
  persistedPath?: string
  persistedSize?: number
}

// URL content cache: entries expire after 15 minutes and the cache is bounded
// to 50MB in total; LRUCache takes care of both expiry and eviction.
const CACHE_TTL_MS = 15 * 60 * 1000 // 15 minutes
const MAX_CACHE_SIZE_BYTES = 50 * 1024 * 1024 // 50MB

const URL_CACHE = new LRUCache<string, CacheEntry>({
  maxSize: MAX_CACHE_SIZE_BYTES,
  ttl: CACHE_TTL_MS,
})

// Hostname-keyed cache for preflight domain checks. Because URL_CACHE is keyed
// by full URL, two different paths on one domain would otherwise each pay for
// an identical preflight HTTP round-trip to api.anthropic.com. Only the
// 'allowed' outcome is remembered; blocked/failed results are re-checked on
// the next attempt.
const DOMAIN_CHECK_CACHE = new LRUCache<string, true>({
  max: 128,
  ttl: 5 * 60 * 1000, // 5 minutes — shorter than URL_CACHE TTL
})

/** Empties both the URL content cache and the domain preflight cache. */
export function clearWebFetchCache(): void {
  for (const cache of [URL_CACHE, DOMAIN_CHECK_CACHE]) {
    cache.clear()
  }
}

// Lazy singleton — defers the turndown → @mixmark-io/domino import (~1.4MB
// retained heap) until the first HTML fetch, and reuses one instance across
// calls (construction builds 15 rule objects; .turndown() is stateless).
// @types/turndown ships only `export =` (no .d.mts), so TS types the import
// as the class itself while Bun wraps CJS in { default } — hence the cast.
type TurndownCtor = typeof import('turndown')
let turndownServicePromise: Promise<InstanceType<TurndownCtor>> | undefined

/**
 * Returns the shared Turndown instance, importing the module on first use.
 *
 * The in-flight/settled promise is memoized so concurrent and later callers
 * share one instance. A rejected promise is NOT kept: if the dynamic import
 * or the constructor fails, the memo is cleared so the next call retries
 * instead of replaying the same failure for the rest of the process lifetime.
 *
 * @returns A promise resolving to the shared Turndown service.
 */
function getTurndownService(): Promise<InstanceType<TurndownCtor>> {
  if (turndownServicePromise !== undefined) {
    return turndownServicePromise
  }

  const pending: Promise<InstanceType<TurndownCtor>> = import('turndown').then(
    m => {
      const Turndown = (m as unknown as { default: TurndownCtor }).default
      return new Turndown()
    },
  )

  // Side-channel handler only: callers still observe the rejection through
  // `pending` itself. The identity check avoids clobbering a newer attempt.
  pending.catch(() => {
    if (turndownServicePromise === pending) {
      turndownServicePromise = undefined
    }
  })

  turndownServicePromise = pending
  return pending
}

// PSR requested limiting the length of URLs to 250 to lower the potential
// for a data exfiltration. However, this is too restrictive for some customers'
// legitimate use cases, such as JWT-signed URLs (e.g., cloud service signed URLs)
// that can be much longer. We already require user approval for each domain,
// which provides a primary security boundary. In addition, Claude Code has
// other data exfil channels, and this one does not seem relatively high risk,
// so the 250-character restriction was dropped in favor of the much looser
// cap below. -ab
const MAX_URL_LENGTH = 2000

// Per PSR:
// "Implement resource consumption controls because setting limits on CPU,
// memory, and network usage for the Web Fetch tool can prevent a single
// request or user from overwhelming the system."
const MAX_HTTP_CONTENT_LENGTH = 10 * 1024 * 1024

// Timeout for the main HTTP fetch request (60 seconds).
// Prevents hanging indefinitely on slow/unresponsive servers.
const FETCH_TIMEOUT_MS = 60_000

// Timeout for the domain blocklist preflight check (10 seconds).
const DOMAIN_CHECK_TIMEOUT_MS = 10_000

// Cap same-host redirect hops. Without this a malicious server can return
// a redirect loop (/a → /b → /a …) and the per-request FETCH_TIMEOUT_MS
// resets on every hop, hanging the tool until user interrupt. 10 matches
// common client defaults (axios=5, follow-redirects=21, Chrome=20).
const MAX_REDIRECTS = 10

// Truncate to not spend too many tokens
export const MAX_MARKDOWN_LENGTH = 100_000

/**
 * Reports whether a URL points at a preapproved host/path combination.
 *
 * @param url - The URL string to test.
 * @returns The result of `isPreapprovedHost` for the URL's hostname and
 *   pathname, or `false` when the string cannot be parsed as a URL.
 */
export function isPreapprovedUrl(url: string): boolean {
  try {
    const parsedUrl = new URL(url)
    return isPreapprovedHost(parsedUrl.hostname, parsedUrl.pathname)
  } catch {
    return false
  }
}

/**
 * Cheap syntactic validation applied before any network activity.
 *
 * Rejects URLs that are longer than MAX_URL_LENGTH, that do not parse, that
 * embed a username or password, or whose hostname has fewer than two
 * dot-separated labels.
 *
 * @param url - The URL string to validate.
 * @returns `true` when the URL passes every check, otherwise `false`.
 */
export function validateURL(url: string): boolean {
  if (url.length > MAX_URL_LENGTH) {
    return false
  }

  let parsed: URL
  try {
    parsed = new URL(url)
  } catch {
    return false
  }

  // We don't need to check protocol here, as we'll upgrade http to https when making the request

  // Cookies and internal domains are not supported, so URLs carrying
  // usernames/passwords are blocked too, even though these seem exceedingly
  // unlikely.
  if (parsed.username || parsed.password) {
    return false
  }

  // Initial filter against privileged, company-internal URLs: a hostname
  // without at least one dot (e.g. a bare intranet name) is rejected. This is
  // a label count only; no DNS resolution happens here.
  const labels = parsed.hostname.split('.')
  return labels.length >= 2
}

type DomainCheckResult =
  | { status: 'allowed' }
  | { status: 'blocked' }
  | { status: 'check_failed'; error: Error }

/**
 * Asks the domain-info endpoint whether `domain` may be fetched.
 *
 * Positive answers are remembered in DOMAIN_CHECK_CACHE; blocked and failed
 * results are never cached, so they are re-checked on the next call.
 *
 * @param domain - Hostname to check.
 * @returns `allowed` when the endpoint answers 200 with `can_fetch === true`,
 *   `blocked` for any other 200 answer, and `check_failed` (carrying the
 *   underlying error) when the request throws or returns a non-200 status.
 */
export async function checkDomainBlocklist(
  domain: string,
): Promise<DomainCheckResult> {
  if (DOMAIN_CHECK_CACHE.has(domain)) {
    return { status: 'allowed' }
  }
  try {
    const response = await axios.get(
      `https://api.anthropic.com/api/web/domain_info?domain=${encodeURIComponent(domain)}`,
      { timeout: DOMAIN_CHECK_TIMEOUT_MS },
    )
    if (response.status === 200) {
      if (response.data.can_fetch === true) {
        DOMAIN_CHECK_CACHE.set(domain, true)
        return { status: 'allowed' }
      }
      return { status: 'blocked' }
    }
    // Non-200 status but didn't throw
    return {
      status: 'check_failed',
      error: new Error(`Domain check returned status ${response.status}`),
    }
  } catch (e) {
    logError(e)
    // Anything can be thrown in JS; make sure the result really carries an
    // Error instead of asserting the type.
    const error = e instanceof Error ? e : new Error(String(e))
    return { status: 'check_failed', error }
  }
}

/**
 * Check if a redirect is safe to follow
 * Allows redirects that:
 * - Add or remove "www." in the hostname
 * - Keep the origin the same but change path/query params
 * - Or both of the above
 *
 * Redirects that change protocol or port, or that carry credentials, are
 * refused, as is any pair of URLs that cannot be parsed.
 *
 * @param originalUrl - URL that produced the redirect response.
 * @param redirectUrl - Absolute redirect target.
 * @returns `true` when the redirect may be followed automatically.
 */
export function isPermittedRedirect(
  originalUrl: string,
  redirectUrl: string,
): boolean {
  try {
    const parsedOriginal = new URL(originalUrl)
    const parsedRedirect = new URL(redirectUrl)

    if (parsedRedirect.protocol !== parsedOriginal.protocol) {
      return false
    }

    if (parsedRedirect.port !== parsedOriginal.port) {
      return false
    }

    if (parsedRedirect.username || parsedRedirect.password) {
      return false
    }

    // Now check hostname conditions
    // 1. Adding www. is allowed: example.com -> www.example.com
    // 2. Removing www. is allowed: www.example.com -> example.com
    // 3. Same host (with or without www.) is allowed: paths can change
    const stripWww = (hostname: string): string =>
      hostname.replace(/^www\./, '')
    return (
      stripWww(parsedOriginal.hostname) === stripWww(parsedRedirect.hostname)
    )
  } catch {
    return false
  }
}

/** Returned instead of a response when a redirect was not followed. */
type RedirectInfo = {
  type: 'redirect'
  originalUrl: string
  redirectUrl: string
  statusCode: number
}

/**
 * Helper function to handle fetching URLs with custom redirect handling
 * Recursively follows redirects if they pass the redirectChecker function
 *
 * Per PSR:
 * "Do not automatically follow redirects because following redirects could
 * allow for an attacker to exploit an open redirect vulnerability in a
 * trusted domain to force a user to make a request to a malicious domain
 * unknowingly"
 *
 * @param url - URL to request.
 * @param signal - Abort signal forwarded to axios.
 * @param redirectChecker - Decides whether a redirect hop may be followed.
 * @param depth - Number of redirect hops already followed (internal).
 * @returns The final response, or a RedirectInfo describing the first
 *   redirect that `redirectChecker` refused.
 * @throws Error when more than MAX_REDIRECTS hops were followed or a redirect
 *   lacks a Location header; EgressBlockedError when the egress proxy blocks
 *   the host; otherwise the original request error.
 */
export async function getWithPermittedRedirects(
  url: string,
  signal: AbortSignal,
  redirectChecker: (originalUrl: string, redirectUrl: string) => boolean,
  depth = 0,
): Promise<AxiosResponse<ArrayBuffer> | RedirectInfo> {
  if (depth > MAX_REDIRECTS) {
    throw new Error(`Too many redirects (exceeded ${MAX_REDIRECTS})`)
  }
  try {
    return await axios.get(url, {
      signal,
      timeout: FETCH_TIMEOUT_MS,
      // Redirects are handled manually below so each hop can be vetted.
      maxRedirects: 0,
      responseType: 'arraybuffer',
      maxContentLength: MAX_HTTP_CONTENT_LENGTH,
      headers: {
        Accept: 'text/markdown, text/html, */*',
        'User-Agent': getWebFetchUserAgent(),
      },
    })
  } catch (error) {
    // Only HTTP-level failures carrying a response are interpreted here.
    if (!axios.isAxiosError(error) || !error.response) {
      throw error
    }
    const failedResponse = error.response

    // NOTE(review): 303 is not in this list, so a 303 response is rethrown
    // as an error rather than treated as a redirect — confirm that is intended.
    if ([301, 302, 307, 308].includes(failedResponse.status)) {
      const redirectLocation = failedResponse.headers.location
      if (!redirectLocation) {
        throw new Error('Redirect missing Location header')
      }

      // Resolve relative URLs against the original URL
      const redirectUrl = new URL(redirectLocation, url).toString()

      if (redirectChecker(url, redirectUrl)) {
        // Recursively follow the permitted redirect
        return getWithPermittedRedirects(
          redirectUrl,
          signal,
          redirectChecker,
          depth + 1,
        )
      }

      // Return redirect information to the caller
      return {
        type: 'redirect',
        originalUrl: url,
        redirectUrl,
        statusCode: failedResponse.status,
      }
    }

    // Detect egress proxy blocks: the proxy returns 403 with
    // X-Proxy-Error: blocked-by-allowlist when egress is restricted
    if (
      failedResponse.status === 403 &&
      failedResponse.headers['x-proxy-error'] === 'blocked-by-allowlist'
    ) {
      throw new EgressBlockedError(new URL(url).hostname)
    }

    throw error
  }
}

/** Type guard distinguishing a RedirectInfo from an axios response. */
function isRedirectInfo(
  response: AxiosResponse<ArrayBuffer> | RedirectInfo,
): response is RedirectInfo {
  return 'type' in response && response.type === 'redirect'
}

export type FetchedContent = {
  content: string
  bytes: number
  code: number
  codeText: string
  contentType: string
  persistedPath?: string
  persistedSize?: number
}

/**
 * Fetches a URL and returns its content, converting HTML to markdown.
 *
 * Steps: validate the URL, serve from URL_CACHE when possible, upgrade http
 * to https, run the domain preflight check (unless disabled in settings),
 * fetch with vetted redirects, persist binary payloads to disk, convert
 * `text/html` bodies with Turndown, then cache the result under the original
 * URL.
 *
 * @param url - URL requested by the caller.
 * @param abortController - Its signal cancels the HTTP request.
 * @returns The fetched content, or a RedirectInfo when the server redirected
 *   to a target that is not followed automatically.
 * @throws Error('Invalid URL') when validation fails; DomainBlockedError or
 *   DomainCheckFailedError from the preflight check; and any error raised by
 *   the fetch itself.
 */
export async function getURLMarkdownContent(
  url: string,
  abortController: AbortController,
): Promise<FetchedContent | RedirectInfo> {
  if (!validateURL(url)) {
    throw new Error('Invalid URL')
  }

  // Check cache (LRUCache handles TTL automatically). A copy is returned so
  // callers cannot mutate the cached object.
  const cachedEntry = URL_CACHE.get(url)
  if (cachedEntry) {
    return { ...cachedEntry }
  }

  let upgradedUrl = url

  try {
    const parsedUrl = new URL(url)

    // Upgrade http to https if needed
    if (parsedUrl.protocol === 'http:') {
      parsedUrl.protocol = 'https:'
      upgradedUrl = parsedUrl.toString()
    }

    const hostname = parsedUrl.hostname

    // Check if the user has opted to skip the blocklist check
    // This is for enterprise customers with restrictive security policies
    // that prevent outbound connections to claude.ai
    const settings = getSettings_DEPRECATED()
    if (!settings.skipWebFetchPreflight) {
      const checkResult = await checkDomainBlocklist(hostname)
      switch (checkResult.status) {
        case 'allowed':
          // Continue with the fetch
          break
        case 'blocked':
          throw new DomainBlockedError(hostname)
        case 'check_failed':
          throw new DomainCheckFailedError(hostname)
      }
    }

    if (process.env.USER_TYPE === 'ant') {
      logEvent('tengu_web_fetch_host', {
        hostname:
          hostname as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
      })
    }
  } catch (e) {
    if (
      e instanceof DomainBlockedError ||
      e instanceof DomainCheckFailedError
    ) {
      // Expected user-facing failures - re-throw without logging as internal error
      throw e
    }
    // Any other failure in this preamble is logged and the fetch proceeds.
    logError(e)
  }

  const response = await getWithPermittedRedirects(
    upgradedUrl,
    abortController.signal,
    isPermittedRedirect,
  )

  // Check if we got a redirect response
  if (isRedirectInfo(response)) {
    return response
  }

  const rawBuffer = Buffer.from(response.data)
  // Release the axios-held ArrayBuffer copy; rawBuffer owns the bytes now.
  // This lets GC reclaim up to MAX_HTTP_CONTENT_LENGTH (10MB) before Turndown
  // builds its DOM tree (which can be 3-5x the HTML size).
  ;(response as { data: unknown }).data = null
  const contentType = response.headers['content-type'] ?? ''

  // Binary content: save raw bytes to disk with a proper extension so Claude
  // can inspect the file later. We still fall through to the utf-8 decode +
  // Haiku path below — for PDFs in particular the decoded string has enough
  // ASCII structure (/Title, text streams) that Haiku can summarize it, and
  // the saved file is a supplement rather than a replacement.
  let persistedPath: string | undefined
  let persistedSize: number | undefined
  if (isBinaryContentType(contentType)) {
    const persistId = `webfetch-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`
    const result = await persistBinaryContent(rawBuffer, contentType, persistId)
    // A persistence failure is non-fatal: the path/size simply stay unset.
    if (!('error' in result)) {
      persistedPath = result.filepath
      persistedSize = result.size
    }
  }

  const bytes = rawBuffer.length
  const htmlContent = rawBuffer.toString('utf-8')

  let markdownContent: string
  let contentBytes: number
  if (contentType.includes('text/html')) {
    markdownContent = (await getTurndownService()).turndown(htmlContent)
    contentBytes = Buffer.byteLength(markdownContent)
  } else {
    // It's not HTML - just use it raw. The decoded string's UTF-8 byte
    // length equals rawBuffer.length (modulo U+FFFD replacement on invalid
    // bytes — negligible for cache eviction accounting), so skip the O(n)
    // Buffer.byteLength scan.
    markdownContent = htmlContent
    contentBytes = bytes
  }

  // Store the fetched content in cache. Note that it's stored under
  // the original URL, not the upgraded or redirected URL.
  const entry: CacheEntry = {
    bytes,
    code: response.status,
    codeText: response.statusText,
    content: markdownContent,
    contentType,
    persistedPath,
    persistedSize,
  }
  // lru-cache requires positive integers; clamp to 1 for empty responses.
  URL_CACHE.set(url, entry, { size: Math.max(1, contentBytes) })
  // Hand out a copy, matching the cache-hit path, so the cached object is
  // never shared with callers.
  return { ...entry }
}

/**
 * Runs the secondary model over fetched content with the user's prompt.
 *
 * @param prompt - The user's question/instruction about the content.
 * @param markdownContent - Fetched content; truncated to MAX_MARKDOWN_LENGTH
 *   characters (with a trailing notice) before being sent.
 * @param signal - Abort signal forwarded to the model query.
 * @param isNonInteractiveSession - Forwarded to the query options.
 * @param isPreapprovedDomain - Forwarded to the prompt builder.
 * @returns The text of the first content block of the model's reply, or
 *   'No response from model' when there is no such text block.
 * @throws AbortError when `signal` is aborted by the time the query returns.
 */
export async function applyPromptToMarkdown(
  prompt: string,
  markdownContent: string,
  signal: AbortSignal,
  isNonInteractiveSession: boolean,
  isPreapprovedDomain: boolean,
): Promise<string> {
  // Truncate content to avoid "Prompt is too long" errors from the secondary model
  const truncatedContent =
    markdownContent.length > MAX_MARKDOWN_LENGTH
      ? markdownContent.slice(0, MAX_MARKDOWN_LENGTH) +
        '\n\n[Content truncated due to length...]'
      : markdownContent

  const modelPrompt = makeSecondaryModelPrompt(
    truncatedContent,
    prompt,
    isPreapprovedDomain,
  )
  const assistantMessage = await queryHaiku({
    systemPrompt: asSystemPrompt([]),
    userPrompt: modelPrompt,
    signal,
    options: {
      querySource: 'web_fetch_apply',
      agents: [],
      isNonInteractiveSession,
      hasAppendSystemPrompt: false,
      mcpTools: [],
    },
  })

  // We need to bubble this up, so that the tool call throws, causing us to return
  // an is_error tool_use block to the server, and render a red dot in the UI.
  if (signal.aborted) {
    throw new AbortError()
  }

  // Guard the first block explicitly instead of asserting it is non-null.
  const firstBlock = assistantMessage.message.content[0]
  if (firstBlock !== undefined && 'text' in firstBlock) {
    return firstBlock.text
  }
  return 'No response from model'
}