source dump of claude code
at main 2679 lines 112 kB view raw
1/** 2 * AST-based bash command analysis using tree-sitter. 3 * 4 * This module replaces the shell-quote + hand-rolled char-walker approach in 5 * bashSecurity.ts / commands.ts. Instead of detecting parser differentials 6 * one-by-one, we parse with tree-sitter-bash and walk the tree with an 7 * EXPLICIT allowlist of node types. Any node type not in the allowlist causes 8 * the entire command to be classified as 'too-complex', which means it goes 9 * through the normal permission prompt flow. 10 * 11 * The key design property is FAIL-CLOSED: we never interpret structure we 12 * don't understand. If tree-sitter produces a node we haven't explicitly 13 * allowlisted, we refuse to extract argv and the caller must ask the user. 14 * 15 * This is NOT a sandbox. It does not prevent dangerous commands from running. 16 * It answers exactly one question: "Can we produce a trustworthy argv[] for 17 * each simple command in this string?" If yes, downstream code can match 18 * argv[0] against permission rules and flag allowlists. If no, ask the user. 19 */ 20 21import { SHELL_KEYWORDS } from './bashParser.js' 22import type { Node } from './parser.js' 23import { PARSE_ABORTED, parseCommandRaw } from './parser.js' 24 25export type Redirect = { 26 op: '>' | '>>' | '<' | '<<' | '>&' | '>|' | '<&' | '&>' | '&>>' | '<<<' 27 target: string 28 fd?: number 29} 30 31export type SimpleCommand = { 32 /** argv[0] is the command name, rest are arguments with quotes already resolved */ 33 argv: string[] 34 /** Leading VAR=val assignments */ 35 envVars: { name: string; value: string }[] 36 /** Output/input redirects */ 37 redirects: Redirect[] 38 /** Original source span for this command (for UI display) */ 39 text: string 40} 41 42export type ParseForSecurityResult = 43 | { kind: 'simple'; commands: SimpleCommand[] } 44 | { kind: 'too-complex'; reason: string; nodeType?: string } 45 | { kind: 'parse-unavailable' } 46 47/** 48 * Structural node types that represent composition of commands. We recurse 49 * through these to find the leaf `command` nodes. `program` is the root; 50 * `list` is `a && b || c`; `pipeline` is `a | b`; `redirected_statement` 51 * wraps a command with its redirects. Semicolon-separated commands appear 52 * as direct siblings under `program` (no wrapper node). 53 */ 54const STRUCTURAL_TYPES = new Set([ 55 'program', 56 'list', 57 'pipeline', 58 'redirected_statement', 59]) 60 61/** 62 * Operator tokens that separate commands. These are leaf nodes that appear 63 * between commands in `list`/`pipeline`/`program` and carry no payload. 64 */ 65const SEPARATOR_TYPES = new Set(['&&', '||', '|', ';', '&', '|&', '\n']) 66 67/** 68 * Placeholder string used in outer argv when a $() is recursively extracted. 69 * The actual $() output is runtime-determined; the inner command(s) are 70 * checked against permission rules separately. Using a placeholder keeps 71 * the outer argv clean (no multi-line heredoc bodies polluting path 72 * extraction or triggering newline checks). 73 */ 74const CMDSUB_PLACEHOLDER = '__CMDSUB_OUTPUT__' 75 76/** 77 * Placeholder for simple_expansion ($VAR) references to variables set earlier 78 * in the same command via variable_assignment. Since we tracked the assignment, 79 * we know the var exists and its value is either a static string or 80 * __CMDSUB_OUTPUT__ (if set via $()). Either way, safe to substitute. 81 */ 82const VAR_PLACEHOLDER = '__TRACKED_VAR__' 83 84/** 85 * All placeholder strings. Used for defense-in-depth: if a varScope value 86 * contains ANY placeholder (exact or embedded), the value is NOT a pure 87 * literal and cannot be trusted as a bare argument. Covers composites like 88 * `VAR="prefix$(cmd)"` → `"prefix__CMDSUB_OUTPUT__"` — the substring check 89 * catches these where exact-match Set.has() would miss. 90 * 91 * Also catches user-typed literals that collide with placeholder strings: 92 * `VAR=__TRACKED_VAR__ && rm $VAR` — treated as non-literal (conservative). 93 */ 94function containsAnyPlaceholder(value: string): boolean { 95 return value.includes(CMDSUB_PLACEHOLDER) || value.includes(VAR_PLACEHOLDER) 96} 97 98/** 99 * Unquoted $VAR in bash undergoes word-splitting (on $IFS: space/tab/NL) 100 * and pathname expansion (glob matching on * ? [). Our argv stores a 101 * single string — but at runtime bash may produce MULTIPLE args, or paths 102 * matched by a glob. A value containing these metacharacters cannot be 103 * trusted as a bare arg: `VAR="-rf /" && rm $VAR` → bash runs `rm -rf /` 104 * (two args) but our argv would have `['rm', '-rf /']` (one arg). Similarly 105 * `VAR="/etc/*" && cat $VAR` → bash expands to all /etc files. 106 * 107 * Inside double-quotes ("$VAR"), neither splitting nor globbing applies — 108 * the value IS a single literal argument. 109 */ 110const BARE_VAR_UNSAFE_RE = /[ \t\n*?[]/ 111 112// stdbuf flag forms — hoisted from the wrapper-stripping while-loop 113const STDBUF_SHORT_SEP_RE = /^-[ioe]$/ 114const STDBUF_SHORT_FUSED_RE = /^-[ioe]./ 115const STDBUF_LONG_RE = /^--(input|output|error)=/ 116 117/** 118 * Known-safe environment variables that bash sets automatically. Their values 119 * are controlled by the shell/OS, not arbitrary user input. Referencing these 120 * via $VAR is safe — the expansion is deterministic and doesn't introduce 121 * injection risk. Covers `$HOME`, `$PWD`, `$USER`, `$PATH`, `$SHELL`, etc. 122 * Intentionally small: only vars that are always set by bash/login and whose 123 * values are paths/names (not arbitrary content). 124 */ 125const SAFE_ENV_VARS = new Set([ 126 'HOME', // user's home directory 127 'PWD', // current working directory (bash maintains) 128 'OLDPWD', // previous directory 129 'USER', // current username 130 'LOGNAME', // login name 131 'SHELL', // user's login shell 132 'PATH', // executable search path 133 'HOSTNAME', // machine hostname 134 'UID', // user id 135 'EUID', // effective user id 136 'PPID', // parent process id 137 'RANDOM', // random number (bash builtin) 138 'SECONDS', // seconds since shell start 139 'LINENO', // current line number 140 'TMPDIR', // temp directory 141 // Special bash variables — always set, values are shell-controlled: 142 'BASH_VERSION', // bash version string 143 'BASHPID', // current bash process id 144 'SHLVL', // shell nesting level 145 'HISTFILE', // history file path 146 'IFS', // field separator (NOTE: only safe INSIDE strings; as bare arg 147 // $IFS is the classic injection primitive and the insideString 148 // gate in resolveSimpleExpansion correctly blocks it) 149]) 150 151/** 152 * Special shell variables ($?, $$, $!, $#, $0-$9). tree-sitter uses 153 * `special_variable_name` for these (not `variable_name`). Values are 154 * shell-controlled: exit status, PIDs, positional args. Safe to resolve 155 * ONLY inside strings (same rationale as SAFE_ENV_VARS — as bare args 156 * their value IS the argument and might be a path/flag from $1 etc.). 157 * 158 * SECURITY: '@' and '*' are NOT in this set. Inside "...", they expand to 159 * the positional params — which are EMPTY in a fresh BashTool shell (how we 160 * always spawn). Returning VAR_PLACEHOLDER would lie: `git "push$*"` gives 161 * argv ['git','push__TRACKED_VAR__'] while bash passes ['git','push']. Deny 162 * rule Bash(git push:*) fails on both .text (raw `$*`) AND rebuilt argv 163 * (placeholder). With them removed, resolveSimpleExpansion falls through to 164 * tooComplex for `$*` / `$@`. `echo "args: $*"` becomes too-complex — 165 * acceptable (rare in BashTool usage; `"$@"` even rarer). 166 */ 167const SPECIAL_VAR_NAMES = new Set([ 168 '?', // exit status of last command 169 '$', // current shell PID 170 '!', // last background PID 171 '#', // number of positional params 172 '0', // script name 173 '-', // shell option flags 174]) 175 176/** 177 * Node types that mean "this command cannot be statically analyzed." These 178 * either execute arbitrary code (substitutions, subshells, control flow) or 179 * expand to values we can't determine statically (parameter/arithmetic 180 * expansion, brace expressions). 181 * 182 * This set is not exhaustive — it documents KNOWN dangerous types. The real 183 * safety property is the allowlist in walkArgument/walkCommand: any type NOT 184 * explicitly handled there also triggers too-complex. 185 */ 186const DANGEROUS_TYPES = new Set([ 187 'command_substitution', 188 'process_substitution', 189 'expansion', 190 'simple_expansion', 191 'brace_expression', 192 'subshell', 193 'compound_statement', 194 'for_statement', 195 'while_statement', 196 'until_statement', 197 'if_statement', 198 'case_statement', 199 'function_definition', 200 'test_command', 201 'ansi_c_string', 202 'translated_string', 203 'herestring_redirect', 204 'heredoc_redirect', 205]) 206 207/** 208 * Numeric IDs for analytics (logEvent doesn't accept strings). Index into 209 * DANGEROUS_TYPES. Append new entries at the end to keep IDs stable. 210 * 0 = unknown/other, -1 = ERROR (parse failure), -2 = pre-check. 211 */ 212const DANGEROUS_TYPE_IDS = [...DANGEROUS_TYPES] 213export function nodeTypeId(nodeType: string | undefined): number { 214 if (!nodeType) return -2 215 if (nodeType === 'ERROR') return -1 216 const i = DANGEROUS_TYPE_IDS.indexOf(nodeType) 217 return i >= 0 ? i + 1 : 0 218} 219 220/** 221 * Redirect operator tokens → canonical operator. tree-sitter produces these 222 * as child nodes of `file_redirect`. 223 */ 224const REDIRECT_OPS: Record<string, Redirect['op']> = { 225 '>': '>', 226 '>>': '>>', 227 '<': '<', 228 '>&': '>&', 229 '<&': '<&', 230 '>|': '>|', 231 '&>': '&>', 232 '&>>': '&>>', 233 '<<<': '<<<', 234} 235 236/** 237 * Brace expansion pattern: {a,b} or {a..b}. Must have , or .. inside 238 * braces. We deliberately do NOT try to determine whether the opening brace 239 * is backslash-escaped: tree-sitter doesn't unescape backslashes, so 240 * distinguishing `\{a,b}` (escaped, literal) from `\\{a,b}` (literal 241 * backslash + expansion) would require reimplementing bash quote removal. 242 * Reject both — the escaped-brace case is rare and trivially rewritten 243 * with single quotes. 244 */ 245const BRACE_EXPANSION_RE = /\{[^{}\s]*(,|\.\.)[^{}\s]*\}/ 246 247/** 248 * Control characters that bash silently drops but confuse static analysis. 249 * Includes CR (0x0D): tree-sitter treats CR as a word separator but bash's 250 * default IFS does not include CR, so tree-sitter and bash disagree on 251 * word boundaries. 252 */ 253// eslint-disable-next-line no-control-regex 254const CONTROL_CHAR_RE = /[\x00-\x08\x0B-\x1F\x7F]/ 255 256/** 257 * Unicode whitespace beyond ASCII. These render invisibly (or as regular 258 * spaces) in terminals so a user reviewing the command can't see them, but 259 * bash treats them as literal word characters. Blocks NBSP, zero-width 260 * spaces, line/paragraph separators, BOM. 261 */ 262const UNICODE_WHITESPACE_RE = 263 /[\u00A0\u1680\u2000-\u200B\u2028\u2029\u202F\u205F\u3000\uFEFF]/ 264 265/** 266 * Backslash immediately before whitespace. bash treats `\ ` as a literal 267 * space inside the current word, but tree-sitter returns the raw text with 268 * the backslash still present. argv[0] from tree-sitter is `cat\ test` 269 * while bash runs `cat test` (with a literal space). Rather than 270 * reimplement bash's unescaping rules, we reject these — they're rare in 271 * practice and trivial to rewrite with quotes. 272 * 273 * Also matches `\` before newline (line continuation) when adjacent to a 274 * non-whitespace char. `tr\<NL>aceroute` — bash joins to `traceroute`, but 275 * tree-sitter splits into two words (differential). When `\<NL>` is preceded 276 * by whitespace (e.g. `foo && \<NL>bar`), there's no word to join — both 277 * parsers agree, so we allow it. 278 */ 279const BACKSLASH_WHITESPACE_RE = /\\[ \t]|[^ \t\n\\]\\\n/ 280 281/** 282 * Zsh dynamic named directory expansion: ~[name]. In zsh this invokes the 283 * zsh_directory_name hook, which can run arbitrary code. bash treats it as 284 * a literal tilde followed by a glob character class. Since BashTool runs 285 * via the user's default shell (often zsh), reject conservatively. 286 */ 287const ZSH_TILDE_BRACKET_RE = /~\[/ 288 289/** 290 * Zsh EQUALS expansion: word-initial `=cmd` expands to the absolute path of 291 * `cmd` (equivalent to `$(which cmd)`). `=curl evil.com` runs as 292 * `/usr/bin/curl evil.com`. tree-sitter parses `=curl` as a literal word, so 293 * a `Bash(curl:*)` deny rule matching on base command name won't see `curl`. 294 * Only matches word-initial `=` followed by a command-name char — `VAR=val` 295 * and `--flag=val` have `=` mid-word and are not expanded by zsh. 296 */ 297const ZSH_EQUALS_EXPANSION_RE = /(?:^|[\s;&|])=[a-zA-Z_]/ 298 299/** 300 * Brace character combined with quote characters. Constructions like 301 * `{a'}',b}` use quoted braces inside brace expansion context to obfuscate 302 * the expansion from regex-based detection. In bash, `{a'}',b}` expands to 303 * `a} b` (the quoted `}` becomes literal inside the first alternative). 304 * These are hard to analyze correctly and have no legitimate use in 305 * commands we'd want to auto-allow. 306 * 307 * This check runs on a version of the command with `{` masked out of 308 * single-quoted and double-quoted spans, so JSON payloads like 309 * `curl -d '{"k":"v"}'` don't trigger a false positive. Brace expansion 310 * cannot occur inside quotes, so a `{` there can never start an obfuscation 311 * pattern. The quote characters themselves stay visible so `{a'}',b}` and 312 * `{@'{'0},...}` still match via the outer unquoted `{`. 313 */ 314const BRACE_WITH_QUOTE_RE = /\{[^}]*['"]/ 315 316/** 317 * Mask `{` characters that appear inside single- or double-quoted contexts. 318 * Uses a single-pass bash-aware quote-state scanner instead of a regex. 319 * 320 * A naive regex (`/'[^']*'/g`) mis-detects spans when a `'` appears inside 321 * a double-quoted string: for `echo "it's" {a'}',b}`, it matches from the 322 * `'` in `it's` across to the `'` in `{a'}`, masking the unquoted `{` and 323 * producing a false negative. The scanner tracks actual bash quote state: 324 * `'` toggles single-quote only in unquoted context; `"` toggles 325 * double-quote only outside single quotes; `\` escapes the next char in 326 * unquoted context and escapes `"` / `\\` inside double quotes. 327 * 328 * Brace expansion is impossible in both quote contexts, so masking `{` in 329 * either is safe. Secondary defense: BRACE_EXPANSION_RE in walkArgument. 330 */ 331function maskBracesInQuotedContexts(cmd: string): string { 332 // Fast path: no `{` → nothing to mask. Skips the char-by-char scan for 333 // the >90% of commands with no braces (`ls -la`, `git status`, etc). 334 if (!cmd.includes('{')) return cmd 335 const out: string[] = [] 336 let inSingle = false 337 let inDouble = false 338 let i = 0 339 while (i < cmd.length) { 340 const c = cmd[i]! 341 if (inSingle) { 342 // Bash single quotes: no escapes, `'` always terminates. 343 if (c === "'") inSingle = false 344 out.push(c === '{' ? ' ' : c) 345 i++ 346 } else if (inDouble) { 347 // Bash double quotes: `\` escapes `"` and `\` (also `$`, backtick, 348 // newline — but those don't affect quote state so we let them pass). 349 if (c === '\\' && (cmd[i + 1] === '"' || cmd[i + 1] === '\\')) { 350 out.push(c, cmd[i + 1]!) 351 i += 2 352 } else { 353 if (c === '"') inDouble = false 354 out.push(c === '{' ? ' ' : c) 355 i++ 356 } 357 } else { 358 // Unquoted: `\` escapes any next char. 359 if (c === '\\' && i + 1 < cmd.length) { 360 out.push(c, cmd[i + 1]!) 361 i += 2 362 } else { 363 if (c === "'") inSingle = true 364 else if (c === '"') inDouble = true 365 out.push(c) 366 i++ 367 } 368 } 369 } 370 return out.join('') 371} 372 373const DOLLAR = String.fromCharCode(0x24) 374 375/** 376 * Parse a bash command string and extract a flat list of simple commands. 377 * Returns 'too-complex' if the command uses any shell feature we can't 378 * statically analyze. Returns 'parse-unavailable' if tree-sitter WASM isn't 379 * loaded — caller should fall back to conservative behavior. 380 */ 381export async function parseForSecurity( 382 cmd: string, 383): Promise<ParseForSecurityResult> { 384 // parseCommandRaw('') returns null (falsy check), so short-circuit here. 385 // Don't use .trim() — it strips Unicode whitespace (\u00a0 etc.) which the 386 // pre-checks in parseForSecurityFromAst need to see and reject. 387 if (cmd === '') return { kind: 'simple', commands: [] } 388 const root = await parseCommandRaw(cmd) 389 return root === null 390 ? { kind: 'parse-unavailable' } 391 : parseForSecurityFromAst(cmd, root) 392} 393 394/** 395 * Same as parseForSecurity but takes a pre-parsed AST root so callers that 396 * need the tree for other purposes can parse once and share. Pre-checks 397 * still run on `cmd` — they catch tree-sitter/bash differentials that a 398 * successful parse doesn't. 399 */ 400export function parseForSecurityFromAst( 401 cmd: string, 402 root: Node | typeof PARSE_ABORTED, 403): ParseForSecurityResult { 404 // Pre-checks: characters that cause tree-sitter and bash to disagree on 405 // word boundaries. These run before tree-sitter because they're the known 406 // tree-sitter/bash differentials. Everything after this point trusts 407 // tree-sitter's tokenization. 408 if (CONTROL_CHAR_RE.test(cmd)) { 409 return { kind: 'too-complex', reason: 'Contains control characters' } 410 } 411 if (UNICODE_WHITESPACE_RE.test(cmd)) { 412 return { kind: 'too-complex', reason: 'Contains Unicode whitespace' } 413 } 414 if (BACKSLASH_WHITESPACE_RE.test(cmd)) { 415 return { 416 kind: 'too-complex', 417 reason: 'Contains backslash-escaped whitespace', 418 } 419 } 420 if (ZSH_TILDE_BRACKET_RE.test(cmd)) { 421 return { 422 kind: 'too-complex', 423 reason: 'Contains zsh ~[ dynamic directory syntax', 424 } 425 } 426 if (ZSH_EQUALS_EXPANSION_RE.test(cmd)) { 427 return { 428 kind: 'too-complex', 429 reason: 'Contains zsh =cmd equals expansion', 430 } 431 } 432 if (BRACE_WITH_QUOTE_RE.test(maskBracesInQuotedContexts(cmd))) { 433 return { 434 kind: 'too-complex', 435 reason: 'Contains brace with quote character (expansion obfuscation)', 436 } 437 } 438 439 const trimmed = cmd.trim() 440 if (trimmed === '') { 441 return { kind: 'simple', commands: [] } 442 } 443 444 if (root === PARSE_ABORTED) { 445 // SECURITY: module loaded but parse aborted (timeout / node budget / 446 // panic). Adversarially triggerable — `(( a[0][0]... ))` with ~2800 447 // subscripts hits PARSE_TIMEOUT_MICROS under the 10K length limit. 448 // Previously indistinguishable from module-not-loaded → routed to 449 // legacy (parse-unavailable), which lacks EVAL_LIKE_BUILTINS — `trap`, 450 // `enable`, `hash` leaked with Bash(*). Fail closed: too-complex → ask. 451 return { 452 kind: 'too-complex', 453 reason: 454 'Parser aborted (timeout or resource limit) — possible adversarial input', 455 nodeType: 'PARSE_ABORT', 456 } 457 } 458 459 return walkProgram(root) 460} 461 462function walkProgram(root: Node): ParseForSecurityResult { 463 // ERROR-node check folded into collectCommands — any unhandled node type 464 // (including ERROR) falls through to tooComplex() in the default branch. 465 // Avoids a separate full-tree walk for error detection. 466 const commands: SimpleCommand[] = [] 467 // Track variables assigned earlier in the same command. When a 468 // simple_expansion ($VAR) references a tracked var, we can substitute 469 // a placeholder instead of returning too-complex. Enables patterns like 470 // `NOW=$(date) && jq --arg now "$NOW" ...` — $NOW is known to be the 471 // $(date) output (already extracted as inner command). 472 const varScope = new Map<string, string>() 473 const err = collectCommands(root, commands, varScope) 474 if (err) return err 475 return { kind: 'simple', commands } 476} 477 478/** 479 * Recursively collect leaf `command` nodes from a structural wrapper node. 480 * Returns an error result on any disallowed node type, or null on success. 481 */ 482function collectCommands( 483 node: Node, 484 commands: SimpleCommand[], 485 varScope: Map<string, string>, 486): ParseForSecurityResult | null { 487 if (node.type === 'command') { 488 // Pass `commands` as the innerCommands accumulator — any $() extracted 489 // during walkCommand gets appended alongside the outer command. 490 const result = walkCommand(node, [], commands, varScope) 491 if (result.kind !== 'simple') return result 492 commands.push(...result.commands) 493 return null 494 } 495 496 if (node.type === 'redirected_statement') { 497 return walkRedirectedStatement(node, commands, varScope) 498 } 499 500 if (node.type === 'comment') { 501 return null 502 } 503 504 if (STRUCTURAL_TYPES.has(node.type)) { 505 // SECURITY: `||`, `|`, `|&`, `&` must NOT carry varScope linearly. In bash: 506 // `||` RHS runs conditionally → vars set there MAY not be set 507 // `|`/`|&` stages run in subshells → vars set there are NEVER visible after 508 // `&` LHS runs in a background subshell → same as above 509 // Flag-omission attack: `true || FLAG=--dry-run && cmd $FLAG` — bash skips 510 // the `||` RHS (FLAG unset → $FLAG empty), runs `cmd` WITHOUT --dry-run. 511 // With linear scope, our argv has ['cmd','--dry-run'] → looks SAFE → bypass. 512 // 513 // Fix: snapshot incoming scope at entry. After these separators, reset to 514 // the snapshot — vars set in clauses between separators don't leak. `scope` 515 // for clauses BETWEEN `&&`/`;` chains shares state (common `VAR=x && cmd 516 // $VAR`). `scope` crosses `||`/`|`/`&` as the pre-structure snapshot only. 517 // 518 // `&&` and `;` DO carry scope: `VAR=x && cmd $VAR` is sequential, VAR is set. 519 // 520 // NOTE: `scope` and `varScope` diverge after the first `||`/`|`/`&`. The 521 // caller's varScope is only mutated for the `&&`/`;` prefix — this is 522 // conservative (vars set in `A && B | C && D` leak A+B into caller, not 523 // C+D) but safe. 524 // 525 // Efficiency: snapshot is only needed if we hit `||`/`|`/`|&`/`&`. For 526 // the dominant case (`ls`, `git status` — no such separators), skip the 527 // Map alloc via a cheap pre-scan. For `pipeline`, node.type already tells 528 // us stages are subshells — copy once at entry, no snapshot needed (each 529 // reset uses the entry copy pattern via varScope, which is untouched). 530 const isPipeline = node.type === 'pipeline' 531 let needsSnapshot = false 532 if (!isPipeline) { 533 for (const c of node.children) { 534 if (c && (c.type === '||' || c.type === '&')) { 535 needsSnapshot = true 536 break 537 } 538 } 539 } 540 const snapshot = needsSnapshot ? new Map(varScope) : null 541 // For `pipeline`, ALL stages run in subshells — start with a copy so 542 // nothing mutates caller's scope. For `list`/`program`, the `&&`/`;` 543 // chain mutates caller's scope (sequential); fork only on `||`/`&`. 544 let scope = isPipeline ? new Map(varScope) : varScope 545 for (const child of node.children) { 546 if (!child) continue 547 if (SEPARATOR_TYPES.has(child.type)) { 548 if ( 549 child.type === '||' || 550 child.type === '|' || 551 child.type === '|&' || 552 child.type === '&' 553 ) { 554 // For pipeline: varScope is untouched (we started with a copy). 555 // For list/program: snapshot is non-null (pre-scan set it). 556 // `|`/`|&` only appear under `pipeline` nodes; `||`/`&` under list. 557 scope = new Map(snapshot ?? varScope) 558 } 559 continue 560 } 561 const err = collectCommands(child, commands, scope) 562 if (err) return err 563 } 564 return null 565 } 566 567 if (node.type === 'negated_command') { 568 // `! cmd` inverts exit code only — doesn't execute code or affect 569 // argv. Recurse into the wrapped command. Common in CI: `! grep err`, 570 // `! test -f lock`, `! git diff --quiet`. 571 for (const child of node.children) { 572 if (!child) continue 573 if (child.type === '!') continue 574 return collectCommands(child, commands, varScope) 575 } 576 return null 577 } 578 579 if (node.type === 'declaration_command') { 580 // `export`/`local`/`readonly`/`declare`/`typeset`. tree-sitter emits 581 // these as declaration_command, not command, so they previously fell 582 // through to tooComplex. Values are validated via walkVariableAssignment: 583 // `$()` in the value is recursively extracted (inner command pushed to 584 // commands[], outer argv gets CMDSUB_PLACEHOLDER); other disallowed 585 // expansions still reject via walkArgument. argv[0] is the builtin name so 586 // `Bash(export:*)` rules match. 587 const argv: string[] = [] 588 for (const child of node.children) { 589 if (!child) continue 590 switch (child.type) { 591 case 'export': 592 case 'local': 593 case 'readonly': 594 case 'declare': 595 case 'typeset': 596 argv.push(child.text) 597 break 598 case 'word': 599 case 'number': 600 case 'raw_string': 601 case 'string': 602 case 'concatenation': { 603 // Flags (`declare -r`), quoted names (`export "FOO=bar"`), numbers 604 // (`declare -i 42`). Mirrors walkCommand's argv handling — before 605 // this, `export "FOO=bar"` hit tooComplex on the `string` child. 606 // walkArgument validates each (expansions still reject). 607 const arg = walkArgument(child, commands, varScope) 608 if (typeof arg !== 'string') return arg 609 // SECURITY: declare/typeset/local flags that change assignment 610 // semantics break our static model. -n (nameref): `declare -n X=Y` 611 // then `$X` dereferences to $Y's VALUE — varScope stores 'Y' 612 // (target NAME), argv[0] shows 'Y' while bash runs whatever $Y 613 // holds. -i (integer): `declare -i X='a[$(cmd)]'` arithmetically 614 // evaluates the RHS at assignment time, running $(cmd) even from 615 // a single-quoted raw_string (same primitive walkArithmetic 616 // guards in $((…))). -a/-A (array): subscript arithmetic on 617 // assignment. -r/-x/-g/-p/-f/-F are inert. Check the resolved 618 // arg (not child.text) so `\-n` and quoted `-n` are caught. 619 // Scope to declare/typeset/local only: `export -n` means "remove 620 // export attribute" (not nameref), and export/readonly don't 621 // accept -i; readonly -a/-A rejects subscripted args as invalid 622 // identifiers so subscript-arith doesn't fire. 623 if ( 624 (argv[0] === 'declare' || 625 argv[0] === 'typeset' || 626 argv[0] === 'local') && 627 /^-[a-zA-Z]*[niaA]/.test(arg) 628 ) { 629 return { 630 kind: 'too-complex', 631 reason: `declare flag ${arg} changes assignment semantics (nameref/integer/array)`, 632 nodeType: 'declaration_command', 633 } 634 } 635 // SECURITY: bare positional assignment with a subscript also 636 // evaluates — no -a/-i flag needed. `declare 'x[$(id)]=val'` 637 // implicitly creates an array element, arithmetically evaluating 638 // the subscript and running $(id). tree-sitter delivers the 639 // single-quoted form as a raw_string leaf so walkArgument sees 640 // only the literal text. Scoped to declare/typeset/local: 641 // export/readonly reject `[` in identifiers before eval. 642 if ( 643 (argv[0] === 'declare' || 644 argv[0] === 'typeset' || 645 argv[0] === 'local') && 646 arg[0] !== '-' && 647 /^[^=]*\[/.test(arg) 648 ) { 649 return { 650 kind: 'too-complex', 651 reason: `declare positional '${arg}' contains array subscript — bash evaluates $(cmd) in subscripts`, 652 nodeType: 'declaration_command', 653 } 654 } 655 argv.push(arg) 656 break 657 } 658 case 'variable_assignment': { 659 const ev = walkVariableAssignment(child, commands, varScope) 660 if ('kind' in ev) return ev 661 // export/declare assignments populate the scope so later $VAR refs resolve. 662 applyVarToScope(varScope, ev) 663 argv.push(`${ev.name}=${ev.value}`) 664 break 665 } 666 case 'variable_name': 667 // `export FOO` — bare name, no assignment. 668 argv.push(child.text) 669 break 670 default: 671 return tooComplex(child) 672 } 673 } 674 commands.push({ argv, envVars: [], redirects: [], text: node.text }) 675 return null 676 } 677 678 if (node.type === 'variable_assignment') { 679 // Bare `VAR=value` at statement level (not a command env prefix). 680 // Sets a shell variable — no code execution, no filesystem I/O. 681 // The value is validated via walkVariableAssignment → walkArgument, 682 // so `VAR=$(evil)` still recursively extracts/rejects based on the 683 // inner command. Does NOT push to commands — a bare assignment needs 684 // no permission rule (it's inert). Common pattern: `VAR=x && cmd` 685 // where cmd references $VAR. ~35% of too-complex in top-5k ant cmds. 686 const ev = walkVariableAssignment(node, commands, varScope) 687 if ('kind' in ev) return ev 688 // Populate scope so later `$VAR` references resolve. 689 applyVarToScope(varScope, ev) 690 return null 691 } 692 693 if (node.type === 'for_statement') { 694 // `for VAR in WORD...; do BODY; done` — iterate BODY once per word. 695 // Body commands extracted once; every iteration runs the same commands. 696 // 697 // SECURITY: Loop var is ALWAYS treated as unknown-value (VAR_PLACEHOLDER). 698 // Even "static" iteration words can be: 699 // - Absolute paths: `for i in /etc/passwd; do rm $i; done` — body argv 700 // would have placeholder, path validation never sees /etc/passwd. 701 // - Globs: `for i in /etc/*; do rm $i; done` — `/etc/*` is a static word 702 // at parse time but bash expands it at runtime. 703 // - Flags: `for i in -rf /; do rm $i; done` — flag smuggling. 704 // 705 // VAR_PLACEHOLDER means bare `$i` in body → too-complex. Only 706 // string-embedding (`echo "item: $i"`) stays simple. This reverts some 707 // of the too-complex→simple rescues in the original PR — each one was a 708 // potential path-validation bypass. 709 let loopVar: string | null = null 710 let doGroup: Node | null = null 711 for (const child of node.children) { 712 if (!child) continue 713 if (child.type === 'variable_name') { 714 loopVar = child.text 715 } else if (child.type === 'do_group') { 716 doGroup = child 717 } else if ( 718 child.type === 'for' || 719 child.type === 'in' || 720 child.type === 'select' || 721 child.type === ';' 722 ) { 723 continue // structural tokens 724 } else if (child.type === 'command_substitution') { 725 // `for i in $(seq 1 3)` — inner cmd IS extracted and rule-checked. 726 const err = collectCommandSubstitution(child, commands, varScope) 727 if (err) return err 728 } else { 729 // Iteration values — validated via walkArgument. Value discarded: 730 // body argv gets VAR_PLACEHOLDER regardless of the iteration words, 731 // and bare `$i` in body → too-complex (see SECURITY comment above). 732 // We still validate to reject e.g. `for i in $(cmd); do ...; done` 733 // where the iteration word itself is a disallowed expansion. 734 const arg = walkArgument(child, commands, varScope) 735 if (typeof arg !== 'string') return arg 736 } 737 } 738 if (loopVar === null || doGroup === null) return tooComplex(node) 739 // SECURITY: `for PS4 in '$(id)'; do set -x; :; done` sets PS4 directly 740 // via varScope.set below — walkVariableAssignment's PS4/IFS checks never 741 // fire. Trace-time RCE (PS4) or word-split bypass (IFS). No legit use. 742 if (loopVar === 'PS4' || loopVar === 'IFS') { 743 return { 744 kind: 'too-complex', 745 reason: `${loopVar} as loop variable bypasses assignment validation`, 746 nodeType: 'for_statement', 747 } 748 } 749 // SECURITY: Body uses a scope COPY — vars assigned inside the loop 750 // body don't leak to commands after `done`. The loop var itself is 751 // set in the REAL scope (bash semantics: $i still set after loop) 752 // and copied into the body scope. ALWAYS VAR_PLACEHOLDER — see above. 753 varScope.set(loopVar, VAR_PLACEHOLDER) 754 const bodyScope = new Map(varScope) 755 for (const c of doGroup.children) { 756 if (!c) continue 757 if (c.type === 'do' || c.type === 'done' || c.type === ';') continue 758 const err = collectCommands(c, commands, bodyScope) 759 if (err) return err 760 } 761 return null 762 } 763 764 if (node.type === 'if_statement' || node.type === 'while_statement') { 765 // `if COND; then BODY; [elif...; else...;] fi` 766 // `while COND; do BODY; done` 767 // Extract condition command(s) + all branch/body commands. All get 768 // checked against permission rules. `while read VAR` tracks VAR so 769 // body can reference $VAR. 770 // 771 // SECURITY: Branch bodies use scope COPIES — vars assigned inside a 772 // conditional branch (which may not execute) must not leak to commands 773 // after fi/done. `if false; then T=safe; fi && rm $T` must reject $T. 774 // Condition commands use the REAL varScope (they always run for the 775 // check, so assignments there are unconditional — e.g., `while read V` 776 // tracking must persist to the body copy). 777 // 778 // tree-sitter if_statement children: if, COND..., then, THEN-BODY..., 779 // [elif_clause...], [else_clause], fi. We distinguish condition from 780 // then-body by tracking whether we've seen the `then` token. 781 let seenThen = false 782 for (const child of node.children) { 783 if (!child) continue 784 if ( 785 child.type === 'if' || 786 child.type === 'fi' || 787 child.type === 'else' || 788 child.type === 'elif' || 789 child.type === 'while' || 790 child.type === 'until' || 791 child.type === ';' 792 ) { 793 continue 794 } 795 if (child.type === 'then') { 796 seenThen = true 797 continue 798 } 799 if (child.type === 'do_group') { 800 // while body: recurse with scope COPY (body assignments don't leak 801 // past done). The COPY contains any `read VAR` tracking from the 802 // condition (already in real varScope at this point). 803 const bodyScope = new Map(varScope) 804 for (const c of child.children) { 805 if (!c) continue 806 if (c.type === 'do' || c.type === 'done' || c.type === ';') continue 807 const err = collectCommands(c, commands, bodyScope) 808 if (err) return err 809 } 810 continue 811 } 812 if (child.type === 'elif_clause' || child.type === 'else_clause') { 813 // elif_clause: elif, cond, ;, then, body... / else_clause: else, body... 814 // Scope COPY — elif/else branch assignments don't leak past fi. 815 const branchScope = new Map(varScope) 816 for (const c of child.children) { 817 if (!c) continue 818 if ( 819 c.type === 'elif' || 820 c.type === 'else' || 821 c.type === 'then' || 822 c.type === ';' 823 ) { 824 continue 825 } 826 const err = collectCommands(c, commands, branchScope) 827 if (err) return err 828 } 829 continue 830 } 831 // Condition (seenThen=false) or then-body (seenThen=true). 832 // Condition uses REAL varScope (always runs). Then-body uses a COPY. 833 // Special-case `while read VAR`: after condition `read VAR` is 834 // collected, track VAR in the REAL scope so the body COPY inherits it. 835 const targetScope = seenThen ? new Map(varScope) : varScope 836 const before = commands.length 837 const err = collectCommands(child, commands, targetScope) 838 if (err) return err 839 // If condition included `read VAR...`, track vars in REAL scope. 840 // read var value is UNKNOWN (stdin input) → use VAR_PLACEHOLDER 841 // (unknown-value sentinel, string-only). 842 if (!seenThen) { 843 for (let i = before; i < commands.length; i++) { 844 const c = commands[i] 845 if (c?.argv[0] === 'read') { 846 for (const a of c.argv.slice(1)) { 847 // Skip flags (-r, -d, etc.); track bare identifier args as var names. 848 if (!a.startsWith('-') && /^[A-Za-z_][A-Za-z0-9_]*$/.test(a)) { 849 // SECURITY: commands[] is a flat accumulator. `true || read 850 // VAR` in the condition: the list handler correctly uses a 851 // scope COPY for the ||-RHS (may not run), but `read VAR` 852 // IS still pushed to commands[] — we can't tell it was 853 // scope-isolated from here. Same for `echo | read VAR` 854 // (pipeline, subshell in bash) and `(read VAR)` (subshell). 855 // Overwriting a tracked literal with VAR_PLACEHOLDER hides 856 // path traversal: `VAR=../../etc/passwd && if true || read 857 // VAR; then cat "/tmp/$VAR"; fi` — parser would see 858 // /tmp/__TRACKED_VAR__, bash reads /etc/passwd. Fail closed 859 // when a tracked literal would be overwritten. Safe case 860 // (no prior value or already a placeholder) → proceed. 861 const existing = varScope.get(a) 862 if ( 863 existing !== undefined && 864 !containsAnyPlaceholder(existing) 865 ) { 866 return { 867 kind: 'too-complex', 868 reason: `'read ${a}' in condition may not execute (||/pipeline/subshell); cannot prove it overwrites tracked literal '${existing}'`, 869 nodeType: 'if_statement', 870 } 871 } 872 varScope.set(a, VAR_PLACEHOLDER) 873 } 874 } 875 } 876 } 877 } 878 } 879 return null 880 } 881 882 if (node.type === 'subshell') { 883 // `(cmd1; cmd2)` — run commands in a subshell. Inner commands ARE 884 // executed, so extract them for permission checking. Subshell has 885 // isolated scope: vars set inside don't leak out. Use a COPY of 886 // varScope (outer vars visible, inner changes discarded). 887 const innerScope = new Map(varScope) 888 for (const child of node.children) { 889 if (!child) continue 890 if (child.type === '(' || child.type === ')') continue 891 const err = collectCommands(child, commands, innerScope) 892 if (err) return err 893 } 894 return null 895 } 896 897 if (node.type === 'test_command') { 898 // `[[ EXPR ]]` or `[ EXPR ]` — conditional test. Evaluates to true/false 899 // based on file tests (-f, -d), string comparisons (==, !=), etc. 900 // No code execution (no command_substitution inside — that would be a 901 // child and we'd recurse into it via walkArgument and reject it). 902 // Push as a synthetic command with argv[0]='[[' so permission rules 903 // can match — `Bash([[ :*)` would be unusual but legal. 904 // Walk arguments to validate (no cmdsub/expansion inside operands). 905 const argv: string[] = ['[['] 906 for (const child of node.children) { 907 if (!child) continue 908 if (child.type === '[[' || child.type === ']]') continue 909 if (child.type === '[' || child.type === ']') continue 910 // Recurse into test expression structure: unary_expression, 911 // binary_expression, parenthesized_expression, negated_expression. 912 // The leaves are test_operator (-f, -d, ==) and operand words. 913 const err = walkTestExpr(child, argv, commands, varScope) 914 if (err) return err 915 } 916 commands.push({ argv, envVars: [], redirects: [], text: node.text }) 917 return null 918 } 919 920 if (node.type === 'unset_command') { 921 // `unset FOO BAR`, `unset -f func`. Safe: only removes shell 922 // variables/functions from the current shell — no code execution, no 923 // filesystem I/O. tree-sitter emits a dedicated node type so it 924 // previously fell through to tooComplex. Children: `unset` keyword, 925 // `variable_name` for each name, `word` for flags like `-f`/`-v`. 926 const argv: string[] = [] 927 for (const child of node.children) { 928 if (!child) continue 929 switch (child.type) { 930 case 'unset': 931 argv.push(child.text) 932 break 933 case 'variable_name': 934 argv.push(child.text) 935 // SECURITY: unset removes the var from bash's scope. Remove from 936 // varScope so subsequent `$VAR` references correctly reject. 937 // `VAR=safe && unset VAR && rm $VAR` must NOT resolve $VAR. 938 varScope.delete(child.text) 939 break 940 case 'word': { 941 const arg = walkArgument(child, commands, varScope) 942 if (typeof arg !== 'string') return arg 943 argv.push(arg) 944 break 945 } 946 default: 947 return tooComplex(child) 948 } 949 } 950 commands.push({ argv, envVars: [], redirects: [], text: node.text }) 951 return null 952 } 953 954 return tooComplex(node) 955} 956 957/** 958 * Recursively walk a test_command expression tree (unary/binary/negated/ 959 * parenthesized expressions). Leaves are test_operator tokens and operands 960 * (word/string/number/etc). Operands are validated via walkArgument. 961 */ 962function walkTestExpr( 963 node: Node, 964 argv: string[], 965 innerCommands: SimpleCommand[], 966 varScope: Map<string, string>, 967): ParseForSecurityResult | null { 968 switch (node.type) { 969 case 'unary_expression': 970 case 'binary_expression': 971 case 'negated_expression': 972 case 'parenthesized_expression': { 973 for (const c of node.children) { 974 if (!c) continue 975 const err = walkTestExpr(c, argv, innerCommands, varScope) 976 if (err) return err 977 } 978 return null 979 } 980 case 'test_operator': 981 case '!': 982 case '(': 983 case ')': 984 case '&&': 985 case '||': 986 case '==': 987 case '=': 988 case '!=': 989 case '<': 990 case '>': 991 case '=~': 992 argv.push(node.text) 993 return null 994 case 'regex': 995 case 'extglob_pattern': 996 // RHS of =~ or ==/!= in [[ ]]. Pattern text only — no code execution. 997 // Parser emits these as leaf nodes with no children (any $(...) or ${...} 998 // inside the pattern is a sibling, not a child, and is walked separately). 999 argv.push(node.text) 1000 return null 1001 default: { 1002 // Operand — word, string, number, etc. Validate via walkArgument. 1003 const arg = walkArgument(node, innerCommands, varScope) 1004 if (typeof arg !== 'string') return arg 1005 argv.push(arg) 1006 return null 1007 } 1008 } 1009} 1010 1011/** 1012 * A `redirected_statement` wraps a command (or pipeline) plus one or more 1013 * `file_redirect`/`heredoc_redirect` nodes. Extract redirects, walk the 1014 * inner command, attach redirects to the LAST command (the one whose output 1015 * is being redirected). 1016 */ 1017function walkRedirectedStatement( 1018 node: Node, 1019 commands: SimpleCommand[], 1020 varScope: Map<string, string>, 1021): ParseForSecurityResult | null { 1022 const redirects: Redirect[] = [] 1023 let innerCommand: Node | null = null 1024 1025 for (const child of node.children) { 1026 if (!child) continue 1027 if (child.type === 'file_redirect') { 1028 // Thread `commands` so $() in redirect targets (e.g., `> $(mktemp)`) 1029 // extracts the inner command for permission checking. 1030 const r = walkFileRedirect(child, commands, varScope) 1031 if ('kind' in r) return r 1032 redirects.push(r) 1033 } else if (child.type === 'heredoc_redirect') { 1034 const r = walkHeredocRedirect(child) 1035 if (r) return r 1036 } else if ( 1037 child.type === 'command' || 1038 child.type === 'pipeline' || 1039 child.type === 'list' || 1040 child.type === 'negated_command' || 1041 child.type === 'declaration_command' || 1042 child.type === 'unset_command' 1043 ) { 1044 innerCommand = child 1045 } else { 1046 return tooComplex(child) 1047 } 1048 } 1049 1050 if (!innerCommand) { 1051 // `> file` alone is valid bash (truncates file). Represent as a command 1052 // with empty argv so downstream sees the write. 1053 commands.push({ argv: [], envVars: [], redirects, text: node.text }) 1054 return null 1055 } 1056 1057 const before = commands.length 1058 const err = collectCommands(innerCommand, commands, varScope) 1059 if (err) return err 1060 if (commands.length > before && redirects.length > 0) { 1061 const last = commands[commands.length - 1] 1062 if (last) last.redirects.push(...redirects) 1063 } 1064 return null 1065} 1066 1067/** 1068 * Extract operator + target from a `file_redirect` node. The target must be 1069 * a static word or string. 1070 */ 1071function walkFileRedirect( 1072 node: Node, 1073 innerCommands: SimpleCommand[], 1074 varScope: Map<string, string>, 1075): Redirect | ParseForSecurityResult { 1076 let op: Redirect['op'] | null = null 1077 let target: string | null = null 1078 let fd: number | undefined 1079 1080 for (const child of node.children) { 1081 if (!child) continue 1082 if (child.type === 'file_descriptor') { 1083 fd = Number(child.text) 1084 } else if (child.type in REDIRECT_OPS) { 1085 op = REDIRECT_OPS[child.type] ?? null 1086 } else if (child.type === 'word' || child.type === 'number') { 1087 // SECURITY: `number` nodes can contain expansion children via the 1088 // `NN#<expansion>` arithmetic-base grammar quirk — same issue as 1089 // walkArgument's number case. `> 10#$(cmd)` runs cmd at runtime. 1090 // Plain word/number nodes have zero children. 1091 if (child.children.length > 0) return tooComplex(child) 1092 // Symmetry with walkArgument (~608): `echo foo > {a,b}` is an 1093 // ambiguous redirect in bash. tree-sitter actually emits a 1094 // `concatenation` node for brace targets (caught by the default 1095 // branch below), but check `word` text too for defense-in-depth. 1096 if (BRACE_EXPANSION_RE.test(child.text)) return tooComplex(child) 1097 // Unescape backslash sequences — same as walkArgument. Bash quote 1098 // removal turns `\X` → `X`. Without this, `cat < /proc/self/\environ` 1099 // stores target `/proc/self/\environ` which evades PROC_ENVIRON_RE, 1100 // but bash reads /proc/self/environ. 1101 target = child.text.replace(/\\(.)/g, '$1') 1102 } else if (child.type === 'raw_string') { 1103 target = stripRawString(child.text) 1104 } else if (child.type === 'string') { 1105 const s = walkString(child, innerCommands, varScope) 1106 if (typeof s !== 'string') return s 1107 target = s 1108 } else if (child.type === 'concatenation') { 1109 // `echo > "foo"bar` — tree-sitter produces a concatenation of string + 1110 // word children. walkArgument already validates concatenation (rejects 1111 // expansions, checks brace syntax) and returns the joined text. 1112 const s = walkArgument(child, innerCommands, varScope) 1113 if (typeof s !== 'string') return s 1114 target = s 1115 } else { 1116 return tooComplex(child) 1117 } 1118 } 1119 1120 if (!op || target === null) { 1121 return { 1122 kind: 'too-complex', 1123 reason: 'Unrecognized redirect shape', 1124 nodeType: node.type, 1125 } 1126 } 1127 return { op, target, fd } 1128} 1129 1130/** 1131 * Heredoc redirect. Only quoted-delimiter heredocs (<<'EOF') are safe — 1132 * their bodies are literal text. Unquoted-delimiter heredocs (<<EOF) 1133 * undergo full parameter/command/arithmetic expansion in the body. 1134 * 1135 * SECURITY: tree-sitter-bash has a grammar gap — backticks (`...`) inside 1136 * an unquoted heredoc body are NOT parsed as command_substitution nodes 1137 * (body.children is empty, backticks are in body.text). But bash DOES 1138 * execute them. We cannot safely relax the quoted-delimiter requirement 1139 * by checking body children for expansion nodes — we'd miss backtick 1140 * substitution. Keep rejecting all unquoted heredocs. Users should use 1141 * <<'EOF' to get a literal body, which the model already prefers. 1142 */ 1143function walkHeredocRedirect(node: Node): ParseForSecurityResult | null { 1144 let startText: string | null = null 1145 let body: Node | null = null 1146 1147 for (const child of node.children) { 1148 if (!child) continue 1149 if (child.type === 'heredoc_start') startText = child.text 1150 else if (child.type === 'heredoc_body') body = child 1151 else if ( 1152 child.type === '<<' || 1153 child.type === '<<-' || 1154 child.type === 'heredoc_end' || 1155 child.type === 'file_descriptor' 1156 ) { 1157 // expected structural tokens — safe to skip. file_descriptor 1158 // covers fd-prefixed heredocs (`cat 3<<'EOF'`) — walkFileRedirect 1159 // already treats it as a benign structural token. 1160 } else { 1161 // SECURITY: tree-sitter places pipeline / command / file_redirect / 1162 // && / etc. as children of heredoc_redirect when they follow the 1163 // delimiter on the same line (e.g. `ls <<'EOF' | rm x`). Previously 1164 // these were silently skipped, hiding the piped command from 1165 // permission checks. Fail closed like every other walker. 1166 return tooComplex(child) 1167 } 1168 } 1169 1170 const isQuoted = 1171 startText !== null && 1172 ((startText.startsWith("'") && startText.endsWith("'")) || 1173 (startText.startsWith('"') && startText.endsWith('"')) || 1174 startText.startsWith('\\')) 1175 1176 if (!isQuoted) { 1177 return { 1178 kind: 'too-complex', 1179 reason: 'Heredoc with unquoted delimiter undergoes shell expansion', 1180 nodeType: 'heredoc_redirect', 1181 } 1182 } 1183 1184 if (body) { 1185 for (const child of body.children) { 1186 if (!child) continue 1187 if (child.type !== 'heredoc_content') { 1188 return tooComplex(child) 1189 } 1190 } 1191 } 1192 return null 1193} 1194 1195/** 1196 * Here-string redirect (`<<< content`). The content becomes stdin — not 1197 * argv, not a path. Safe when content is a literal word, raw_string, or 1198 * string with no expansions. Reject when content contains $()/${}/$VAR — 1199 * those execute arbitrary code or inject runtime values. 1200 * 1201 * Reuses walkArgument for content validation: it already rejects 1202 * command_substitution, expansion, and (for strings) simple_expansion 1203 * unless the var is tracked/safe. The result string is discarded — we only 1204 * care that it's statically resolvable. 1205 * 1206 * NOTE: `VAR=$(cmd) && cat <<< "$VAR"` would be safe in principle (inner 1207 * cmd is extracted separately, herestring content is stdin) but is 1208 * currently rejected conservatively — walkString's solo-placeholder guard 1209 * fires because it has no awareness of herestring vs argv context. 1210 */ 1211function walkHerestringRedirect( 1212 node: Node, 1213 innerCommands: SimpleCommand[], 1214 varScope: Map<string, string>, 1215): ParseForSecurityResult | null { 1216 for (const child of node.children) { 1217 if (!child) continue 1218 if (child.type === '<<<') continue 1219 // Content node: reuse walkArgument. It returns a string on success 1220 // (which we discard — content is stdin, irrelevant to permissions) or 1221 // a too-complex result on failure (expansion found, unresolvable var). 1222 const content = walkArgument(child, innerCommands, varScope) 1223 if (typeof content !== 'string') return content 1224 // Herestring content is discarded (not in argv/envVars/redirects) but 1225 // remains in .text via raw node.text. Scan it here so checkSemantics's 1226 // NEWLINE_HASH invariant (bashPermissions.ts relies on it) still holds. 1227 if (NEWLINE_HASH_RE.test(content)) return tooComplex(child) 1228 } 1229 return null 1230} 1231 1232/** 1233 * Walk a `command` node and extract argv. Children appear in order: 1234 * [variable_assignment...] command_name [argument...] [file_redirect...] 1235 * Any child type not explicitly handled triggers too-complex. 1236 */ 1237function walkCommand( 1238 node: Node, 1239 extraRedirects: Redirect[], 1240 innerCommands: SimpleCommand[], 1241 varScope: Map<string, string>, 1242): ParseForSecurityResult { 1243 const argv: string[] = [] 1244 const envVars: { name: string; value: string }[] = [] 1245 const redirects: Redirect[] = [...extraRedirects] 1246 1247 for (const child of node.children) { 1248 if (!child) continue 1249 1250 switch (child.type) { 1251 case 'variable_assignment': { 1252 const ev = walkVariableAssignment(child, innerCommands, varScope) 1253 if ('kind' in ev) return ev 1254 // SECURITY: Env-prefix assignments (`VAR=x cmd`) are command-local in 1255 // bash — VAR is only visible to `cmd` as an env var, NOT to 1256 // subsequent commands. Do NOT add to global varScope — that would 1257 // let `VAR=safe cmd1 && rm $VAR` resolve $VAR when bash has unset it. 1258 envVars.push({ name: ev.name, value: ev.value }) 1259 break 1260 } 1261 case 'command_name': { 1262 const arg = walkArgument( 1263 child.children[0] ?? child, 1264 innerCommands, 1265 varScope, 1266 ) 1267 if (typeof arg !== 'string') return arg 1268 argv.push(arg) 1269 break 1270 } 1271 case 'word': 1272 case 'number': 1273 case 'raw_string': 1274 case 'string': 1275 case 'concatenation': 1276 case 'arithmetic_expansion': { 1277 const arg = walkArgument(child, innerCommands, varScope) 1278 if (typeof arg !== 'string') return arg 1279 argv.push(arg) 1280 break 1281 } 1282 // NOTE: command_substitution as a BARE argument (not inside a string) 1283 // is intentionally NOT handled here — the $() output IS the argument, 1284 // and for path-sensitive commands (cd, rm, chmod) the placeholder would 1285 // hide the real path from downstream checks. `cd $(echo /etc)` must 1286 // stay too-complex so the path-check can't be bypassed. $() inside 1287 // strings ("Timer: $(date)") is handled in walkString where the output 1288 // is embedded in a longer string (safer). 1289 case 'simple_expansion': { 1290 // Bare `$VAR` as an argument. Tracked static vars return the ACTUAL 1291 // value (e.g. VAR=/etc → '/etc'). Values with IFS/glob chars or 1292 // placeholders reject. See resolveSimpleExpansion. 1293 const v = resolveSimpleExpansion(child, varScope, false) 1294 if (typeof v !== 'string') return v 1295 argv.push(v) 1296 break 1297 } 1298 case 'file_redirect': { 1299 const r = walkFileRedirect(child, innerCommands, varScope) 1300 if ('kind' in r) return r 1301 redirects.push(r) 1302 break 1303 } 1304 case 'herestring_redirect': { 1305 // `cmd <<< "content"` — content is stdin, not argv. Validate it's 1306 // literal (no expansion); discard the content string. 1307 const err = walkHerestringRedirect(child, innerCommands, varScope) 1308 if (err) return err 1309 break 1310 } 1311 default: 1312 return tooComplex(child) 1313 } 1314 } 1315 1316 // .text is the raw source span. Downstream (bashToolCheckPermission → 1317 // splitCommand_DEPRECATED) re-tokenizes it via shell-quote. Normally .text 1318 // is used unchanged — but if we resolved a $VAR into argv, .text diverges 1319 // (has raw `$VAR`) and downstream RULE MATCHING would miss deny rules. 1320 // 1321 // SECURITY: `SUB=push && git $SUB --force` with `Bash(git push:*)` deny: 1322 // argv = ['git', 'push', '--force'] ← correct, path validation sees 'push' 1323 // .text = 'git $SUB --force' ← deny rule 'git push:*' doesn't match 1324 // 1325 // Detection: any `$<identifier>` in node.text means a simple_expansion was 1326 // resolved (or we'd have returned too-complex). This catches $VAR at any 1327 // position — command_name, word, string interior, concatenation part. 1328 // `$(...)` doesn't match (paren, not identifier start). `'$VAR'` in single 1329 // quotes: tree-sitter's .text includes the quotes, so a naive check would 1330 // FP on `echo '$VAR'`. But single-quoted $ is LITERAL in bash — argv has 1331 // the literal `$VAR` string, so rebuilding from argv produces `'$VAR'` 1332 // anyway (shell-escape wraps it). Same net .text. No rule-matching error. 1333 // 1334 // Rebuild .text from argv. Shell-escape each arg: single-quote wrap with 1335 // `'\''` for embedded single quotes. Empty string, metacharacters, and 1336 // placeholders all get quoted. Downstream shell-quote re-parse is correct. 1337 // 1338 // NOTE: This does NOT include redirects/envVars in the rebuilt .text — 1339 // walkFileRedirect rejects simple_expansion, and envVars aren't used for 1340 // rule matching. If either changes, this rebuild must include them. 1341 // 1342 // SECURITY: also rebuild when node.text contains a newline. Line 1343 // continuations `<space>\<LF>` are invisible to argv (tree-sitter collapses 1344 // them) but preserved in node.text. `timeout 5 \<LF>curl evil.com` → argv 1345 // is correct, but raw .text → stripSafeWrappers matches `timeout 5 ` (the 1346 // space before \), leaving `\<LF>curl evil.com` — Bash(curl:*) deny doesn't 1347 // prefix-match. Rebuilt .text joins argv with ' ' → no newlines → 1348 // stripSafeWrappers works. Also covers heredoc-body leakage. 1349 const text = 1350 /\$[A-Za-z_]/.test(node.text) || node.text.includes('\n') 1351 ? argv 1352 .map(a => 1353 a === '' || /["'\\ \t\n$`;|&<>(){}*?[\]~#]/.test(a) 1354 ? `'${a.replace(/'/g, "'\\''")}'` 1355 : a, 1356 ) 1357 .join(' ') 1358 : node.text 1359 return { 1360 kind: 'simple', 1361 commands: [{ argv, envVars, redirects, text }], 1362 } 1363} 1364 1365/** 1366 * Recurse into a command_substitution node's inner command(s). If the inner 1367 * command(s) parse cleanly (simple), add them to the innerCommands 1368 * accumulator and return null (success). If the inner command is itself 1369 * too-complex (e.g., nested arith expansion, process sub), return the error. 1370 * This enables recursive permission checking: `echo $(git rev-parse HEAD)` 1371 * extracts BOTH `echo $(git rev-parse HEAD)` (outer) AND `git rev-parse HEAD` 1372 * (inner) — permission rules must match BOTH for the whole command to allow. 1373 */ 1374function collectCommandSubstitution( 1375 csNode: Node, 1376 innerCommands: SimpleCommand[], 1377 varScope: Map<string, string>, 1378): ParseForSecurityResult | null { 1379 // Vars set BEFORE the $() are visible inside (bash subshell semantics), 1380 // but vars set INSIDE don't leak out. Pass a COPY of the outer scope so 1381 // inner assignments don't mutate the outer map. 1382 const innerScope = new Map(varScope) 1383 // command_substitution children: `$(` or `` ` ``, inner statement(s), `)` 1384 for (const child of csNode.children) { 1385 if (!child) continue 1386 if (child.type === '$(' || child.type === '`' || child.type === ')') { 1387 continue 1388 } 1389 const err = collectCommands(child, innerCommands, innerScope) 1390 if (err) return err 1391 } 1392 return null 1393} 1394 1395/** 1396 * Convert an argument node to its literal string value. Quotes are resolved. 1397 * This function implements the argument-position allowlist. 1398 */ 1399function walkArgument( 1400 node: Node | null, 1401 innerCommands: SimpleCommand[], 1402 varScope: Map<string, string>, 1403): string | ParseForSecurityResult { 1404 if (!node) { 1405 return { kind: 'too-complex', reason: 'Null argument node' } 1406 } 1407 1408 switch (node.type) { 1409 case 'word': { 1410 // Unescape backslash sequences. In unquoted context, bash's quote 1411 // removal turns `\X` → `X` for any character X. tree-sitter preserves 1412 // the raw text. Required for checkSemantics: `\eval` must match 1413 // EVAL_LIKE_BUILTINS, `\zmodload` must match ZSH_DANGEROUS_BUILTINS. 1414 // Also makes argv accurate: `find -exec {} \;` → argv has `;` not 1415 // `\;`. (Deny-rule matching on .text already worked via downstream 1416 // splitCommand_DEPRECATED unescaping — see walkCommand comment.) `\<whitespace>` 1417 // is already rejected by BACKSLASH_WHITESPACE_RE. 1418 if (BRACE_EXPANSION_RE.test(node.text)) { 1419 return { 1420 kind: 'too-complex', 1421 reason: 'Word contains brace expansion syntax', 1422 nodeType: 'word', 1423 } 1424 } 1425 return node.text.replace(/\\(.)/g, '$1') 1426 } 1427 1428 case 'number': 1429 // SECURITY: tree-sitter-bash parses `NN#<expansion>` (arithmetic base 1430 // syntax) as a `number` node with the expansion as a CHILD. `10#$(cmd)` 1431 // is a number node whose .text is the full literal but whose child is a 1432 // command_substitution — bash runs the substitution. .text on a node 1433 // with children would smuggle the expansion past permission checks. 1434 // Plain numbers (`10`, `16#ff`) have zero children. 1435 if (node.children.length > 0) { 1436 return { 1437 kind: 'too-complex', 1438 reason: 'Number node contains expansion (NN# arithmetic base syntax)', 1439 nodeType: node.children[0]?.type, 1440 } 1441 } 1442 return node.text 1443 1444 case 'raw_string': 1445 return stripRawString(node.text) 1446 1447 case 'string': 1448 return walkString(node, innerCommands, varScope) 1449 1450 case 'concatenation': { 1451 if (BRACE_EXPANSION_RE.test(node.text)) { 1452 return { 1453 kind: 'too-complex', 1454 reason: 'Brace expansion', 1455 nodeType: 'concatenation', 1456 } 1457 } 1458 let result = '' 1459 for (const child of node.children) { 1460 if (!child) continue 1461 const part = walkArgument(child, innerCommands, varScope) 1462 if (typeof part !== 'string') return part 1463 result += part 1464 } 1465 return result 1466 } 1467 1468 case 'arithmetic_expansion': { 1469 const err = walkArithmetic(node) 1470 if (err) return err 1471 return node.text 1472 } 1473 1474 case 'simple_expansion': { 1475 // `$VAR` inside a concatenation (e.g., `prefix$VAR`). Same rules 1476 // as the bare case in walkCommand: must be tracked or SAFE_ENV_VARS. 1477 // inside-concatenation counts as bare arg (the whole concat IS the arg) 1478 return resolveSimpleExpansion(node, varScope, false) 1479 } 1480 1481 // NOTE: command_substitution at arg position (bare or inside concatenation) 1482 // is intentionally NOT handled — the output is/becomes-part-of a positional 1483 // argument which might be a path or flag. `rm $(foo)` or `rm $(foo)bar` 1484 // would hide the real path behind the placeholder. Only $() inside a 1485 // `string` node (walkString) is extracted, since the output is embedded 1486 // in a longer string rather than BEING the argument. 1487 1488 default: 1489 return tooComplex(node) 1490 } 1491} 1492 1493/** 1494 * Extract literal content from a double-quoted string node. A `string` node's 1495 * children are `"` delimiters, `string_content` literals, and possibly 1496 * expansion nodes. 1497 * 1498 * tree-sitter quirk: literal newlines inside double quotes are NOT included 1499 * in `string_content` node text. bash preserves them. For `"a\nb"`, 1500 * tree-sitter produces two `string_content` children (`"a"`, `"b"`) with the 1501 * newline in neither. For `"\n#"`, it produces ONE child (`"#"`) with the 1502 * leading newline eaten. Concatenating children therefore loses newlines. 1503 * 1504 * Fix: track child `startIndex` and insert one `\n` per index gap. The gap 1505 * between children IS the dropped newline(s). This makes the argv value 1506 * match what bash actually sees. 1507 */ 1508function walkString( 1509 node: Node, 1510 innerCommands: SimpleCommand[], 1511 varScope: Map<string, string>, 1512): string | ParseForSecurityResult { 1513 let result = '' 1514 let cursor = -1 1515 // SECURITY: Track whether the string contains a runtime-unknown 1516 // placeholder ($() output or unknown-value tracked var) vs any literal 1517 // content. A string that is ONLY a placeholder (`"$(cmd)"`, `"$VAR"` 1518 // where VAR holds an unknown sentinel) produces an argv element that IS 1519 // the placeholder — which downstream path validation resolves as a 1520 // relative filename within cwd, bypassing the check. `cd "$(echo /etc)"` 1521 // would pass validation but runtime-cd into /etc. We reject 1522 // solo-placeholder strings; placeholders mixed with literal content 1523 // (`"prefix: $(cmd)"`) are safe — runtime value can't equal a bare path. 1524 let sawDynamicPlaceholder = false 1525 let sawLiteralContent = false 1526 for (const child of node.children) { 1527 if (!child) continue 1528 // Index gap between this child and the previous one = dropped newline(s). 1529 // Ignore the gap before the first non-delimiter child (cursor === -1). 1530 // Skip gap-fill for `"` delimiters: a gap before the closing `"` is the 1531 // tree-sitter whitespace-only-string quirk (space/tab, not newline) — let 1532 // the Fix C check below catch it as too-complex instead of mis-filling 1533 // with `\n` and diverging from bash. 1534 if (cursor !== -1 && child.startIndex > cursor && child.type !== '"') { 1535 result += '\n'.repeat(child.startIndex - cursor) 1536 sawLiteralContent = true 1537 } 1538 cursor = child.endIndex 1539 switch (child.type) { 1540 case '"': 1541 // Reset cursor after opening quote so the gap between `"` and the 1542 // first content child is captured. 1543 cursor = child.endIndex 1544 break 1545 case 'string_content': 1546 // Bash double-quote escape rules (NOT the generic /\\(.)/g used for 1547 // unquoted words in walkArgument): inside "...", a backslash only 1548 // escapes $ ` " \ — other sequences like \n stay literal. So 1549 // `"fix \"bug\""` → `fix "bug"`, but `"a\nb"` → `a\nb` (backslash 1550 // kept). tree-sitter preserves the raw escapes in .text; we resolve 1551 // them here so argv matches what bash actually passes. 1552 result += child.text.replace(/\\([$`"\\])/g, '$1') 1553 sawLiteralContent = true 1554 break 1555 case DOLLAR: 1556 // A bare dollar sign before closing quote or a non-name char is 1557 // literal in bash. tree-sitter emits it as a standalone node. 1558 result += DOLLAR 1559 sawLiteralContent = true 1560 break 1561 case 'command_substitution': { 1562 // Carve-out: `$(cat <<'EOF' ... EOF)` is safe. The quoted-delimiter 1563 // heredoc body is literal (no expansion), and `cat` just prints it. 1564 // The substitution result is therefore a known static string. This 1565 // pattern is the idiomatic way to pass multi-line content to tools 1566 // like `gh pr create --body`. We replace the substitution with a 1567 // placeholder argv value — the actual content doesn't matter for 1568 // permission checking, only that it IS static. 1569 const heredocBody = extractSafeCatHeredoc(child) 1570 if (heredocBody === 'DANGEROUS') return tooComplex(child) 1571 if (heredocBody !== null) { 1572 // SECURITY: the body IS the substitution result. Previously we 1573 // dropped it → `rm "$(cat <<'EOF'\n/etc/passwd\nEOF)"` produced 1574 // argv ['rm',''] while bash runs `rm /etc/passwd`. validatePath('') 1575 // resolves to cwd → allowed. Every path-constrained command 1576 // bypassed via this. Now: append the body (trailing LF trimmed — 1577 // bash $() strips trailing newlines). 1578 // 1579 // Tradeoff: bodies with internal newlines are multi-line text 1580 // (markdown, scripts) which cannot be valid paths — safe to drop 1581 // to avoid NEWLINE_HASH_RE false positives on `## Summary`. A 1582 // single-line body (like `/etc/passwd`) MUST go into argv so 1583 // downstream path validation sees the real target. 1584 const trimmed = heredocBody.replace(/\n+$/, '') 1585 if (trimmed.includes('\n')) { 1586 sawLiteralContent = true 1587 break 1588 } 1589 result += trimmed 1590 sawLiteralContent = true 1591 break 1592 } 1593 // General $() inside "...": recurse into inner command(s). If they 1594 // parse cleanly, they become additional subcommands that the 1595 // permission system must match rules against. The outer argv gets 1596 // the original $() text as placeholder (runtime-determined value). 1597 // `echo "SHA: $(git rev-parse HEAD)"` → extracts BOTH 1598 // `echo "SHA: $(...)"` AND `git rev-parse HEAD` — both must match 1599 // permission rules. ~27% of too-complex in top-5k ant cmds. 1600 const err = collectCommandSubstitution(child, innerCommands, varScope) 1601 if (err) return err 1602 result += CMDSUB_PLACEHOLDER 1603 sawDynamicPlaceholder = true 1604 break 1605 } 1606 case 'simple_expansion': { 1607 // `$VAR` inside "...". Tracked/safe vars resolve; untracked reject. 1608 const v = resolveSimpleExpansion(child, varScope, true) 1609 if (typeof v !== 'string') return v 1610 // VAR_PLACEHOLDER = runtime-unknown (loop var, read var, $() output, 1611 // SAFE_ENV_VARS, special vars). Any other string = actual literal 1612 // value from a tracked static var (e.g. VAR=/tmp → v='/tmp'). 1613 if (v === VAR_PLACEHOLDER) sawDynamicPlaceholder = true 1614 else sawLiteralContent = true 1615 result += v 1616 break 1617 } 1618 case 'arithmetic_expansion': { 1619 const err = walkArithmetic(child) 1620 if (err) return err 1621 result += child.text 1622 // Validated to be literal-numeric — static content. 1623 sawLiteralContent = true 1624 break 1625 } 1626 default: 1627 // expansion (${...}) inside "..." 1628 return tooComplex(child) 1629 } 1630 } 1631 // SECURITY: Reject solo-placeholder strings. `"$(cmd)"` or `"$VAR"` (where 1632 // VAR holds an unknown value) would produce an argv element that IS the 1633 // placeholder — which bypasses downstream path validation (validatePath 1634 // resolves placeholders as relative filenames within cwd). Only allow 1635 // placeholders embedded alongside literal content (`"prefix: $(cmd)"`). 1636 if (sawDynamicPlaceholder && !sawLiteralContent) { 1637 return tooComplex(node) 1638 } 1639 // SECURITY: tree-sitter-bash quirk — a double-quoted string containing 1640 // ONLY whitespace (` "`, `" "`, `"\t"`) produces NO string_content child; 1641 // the whitespace is attributed to the closing `"` node's text. Our loop 1642 // only adds to `result` from string_content/expansion children, so we'd 1643 // return "" when bash sees " ". Detect: we saw no content children 1644 // (both flags false — neither literal nor placeholder added) but the 1645 // source span is longer than bare `""`. Genuine `""` has text.length==2. 1646 // `"$V"` with V="" doesn't hit this — the simple_expansion child sets 1647 // sawLiteralContent via the `else` branch even when v is empty. 1648 if (!sawLiteralContent && !sawDynamicPlaceholder && node.text.length > 2) { 1649 return tooComplex(node) 1650 } 1651 return result 1652} 1653 1654/** 1655 * Safe leaf nodes inside arithmetic expansion: integer literals (decimal, 1656 * hex, octal, bash base#digits) and operator/paren tokens. Anything else at 1657 * leaf position (notably variable_name that isn't a numeric literal) rejects. 1658 */ 1659const ARITH_LEAF_RE = 1660 /^(?:[0-9]+|0[xX][0-9a-fA-F]+|[0-9]+#[0-9a-zA-Z]+|[-+*/%^&|~!<>=?:(),]+|<<|>>|\*\*|&&|\|\||[<>=!]=|\$\(\(|\)\))$/ 1661 1662/** 1663 * Recursively validate an arithmetic_expansion node. Allows only literal 1664 * numeric expressions — no variables, no substitutions. Returns null if 1665 * safe, or a too-complex result if not. 1666 * 1667 * Variables are rejected because bash arithmetic recursively evaluates 1668 * variable values: if x='a[$(cmd)]' then $((x)) executes cmd. See 1669 * https://www.vidarholen.net/contents/blog/?p=716 (arithmetic injection). 1670 * 1671 * When safe, the caller puts the full `$((…))` span into argv as a literal 1672 * string. bash will expand it to an integer at runtime; the static string 1673 * won't match any sensitive path/deny patterns. 1674 */ 1675function walkArithmetic(node: Node): ParseForSecurityResult | null { 1676 for (const child of node.children) { 1677 if (!child) continue 1678 if (child.children.length === 0) { 1679 if (!ARITH_LEAF_RE.test(child.text)) { 1680 return { 1681 kind: 'too-complex', 1682 reason: `Arithmetic expansion references variable or non-literal: ${child.text}`, 1683 nodeType: 'arithmetic_expansion', 1684 } 1685 } 1686 continue 1687 } 1688 switch (child.type) { 1689 case 'binary_expression': 1690 case 'unary_expression': 1691 case 'ternary_expression': 1692 case 'parenthesized_expression': { 1693 const err = walkArithmetic(child) 1694 if (err) return err 1695 break 1696 } 1697 default: 1698 return tooComplex(child) 1699 } 1700 } 1701 return null 1702} 1703 1704/** 1705 * Check if a command_substitution node is exactly `$(cat <<'DELIM'...DELIM)` 1706 * and return the heredoc body if so. Any deviation (extra args to cat, 1707 * unquoted delimiter, additional commands) returns null. 1708 * 1709 * tree-sitter structure: 1710 * command_substitution 1711 * $( 1712 * redirected_statement 1713 * command → command_name → word "cat" (exactly one child) 1714 * heredoc_redirect 1715 * << 1716 * heredoc_start 'DELIM' (quoted) 1717 * heredoc_body (pure heredoc_content) 1718 * heredoc_end 1719 * ) 1720 */ 1721function extractSafeCatHeredoc(subNode: Node): string | 'DANGEROUS' | null { 1722 // Expect exactly: $( + one redirected_statement + ) 1723 let stmt: Node | null = null 1724 for (const child of subNode.children) { 1725 if (!child) continue 1726 if (child.type === '$(' || child.type === ')') continue 1727 if (child.type === 'redirected_statement' && stmt === null) { 1728 stmt = child 1729 } else { 1730 return null 1731 } 1732 } 1733 if (!stmt) return null 1734 1735 // redirected_statement must be: command(cat) + heredoc_redirect (quoted) 1736 let sawCat = false 1737 let body: string | null = null 1738 for (const child of stmt.children) { 1739 if (!child) continue 1740 if (child.type === 'command') { 1741 // Must be bare `cat` — no args, no env vars 1742 const cmdChildren = child.children.filter(c => c) 1743 if (cmdChildren.length !== 1) return null 1744 const nameNode = cmdChildren[0] 1745 if (nameNode?.type !== 'command_name' || nameNode.text !== 'cat') { 1746 return null 1747 } 1748 sawCat = true 1749 } else if (child.type === 'heredoc_redirect') { 1750 // Reuse the existing validator: quoted delimiter, body is pure text. 1751 // walkHeredocRedirect returns null on success, non-null on rejection. 1752 if (walkHeredocRedirect(child) !== null) return null 1753 for (const hc of child.children) { 1754 if (hc?.type === 'heredoc_body') body = hc.text 1755 } 1756 } else { 1757 return null 1758 } 1759 } 1760 1761 if (!sawCat || body === null) return null 1762 // SECURITY: the heredoc body becomes the outer command's argv value via 1763 // substitution, so a body like `/proc/self/environ` is semantically 1764 // `cat /proc/self/environ`. checkSemantics never sees the body (we drop it 1765 // at the walkString call site to avoid newline+# FPs). Returning `null` 1766 // here would fall through to collectCommandSubstitution in walkString, 1767 // which would extract the inner `cat` via walkHeredocRedirect (body text 1768 // not inspected there) — effectively bypassing this check. Return a 1769 // distinct sentinel so the caller can reject instead of falling through. 1770 if (PROC_ENVIRON_RE.test(body)) return 'DANGEROUS' 1771 // Same for jq system(): checkSemantics checks argv but never sees the 1772 // heredoc body. Check unconditionally (we don't know the outer command). 1773 if (/\bsystem\s*\(/.test(body)) return 'DANGEROUS' 1774 return body 1775} 1776 1777function walkVariableAssignment( 1778 node: Node, 1779 innerCommands: SimpleCommand[], 1780 varScope: Map<string, string>, 1781): { name: string; value: string; isAppend: boolean } | ParseForSecurityResult { 1782 let name: string | null = null 1783 let value = '' 1784 let isAppend = false 1785 1786 for (const child of node.children) { 1787 if (!child) continue 1788 if (child.type === 'variable_name') { 1789 name = child.text 1790 } else if (child.type === '=' || child.type === '+=') { 1791 // `PATH+=":/new"` — tree-sitter emits `+=` as a distinct operator 1792 // node. Without this case it falls through to walkArgument below 1793 // → tooComplex on unknown type `+=`. 1794 isAppend = child.type === '+=' 1795 continue 1796 } else if (child.type === 'command_substitution') { 1797 // $() as the variable's value. The output becomes a STRING stored in 1798 // the variable — it's NOT a positional argument (no path/flag concern). 1799 // `VAR=$(date)` runs `date`, stores output. `VAR=$(rm -rf /)` runs 1800 // `rm` — the inner command IS checked against permission rules, so 1801 // `rm` must match a rule. The variable just holds whatever `rm` prints. 1802 const err = collectCommandSubstitution(child, innerCommands, varScope) 1803 if (err) return err 1804 value = CMDSUB_PLACEHOLDER 1805 } else if (child.type === 'simple_expansion') { 1806 // `VAR=$OTHER` — assignment RHS does NOT word-split or glob-expand 1807 // in bash (unlike command arguments). So `A="a b"; B=$A` sets B to 1808 // the literal "a b". Resolve as if inside a string (insideString=true) 1809 // so BARE_VAR_UNSAFE_RE doesn't over-reject. The resulting value may 1810 // contain spaces/globs — if B is later used as a bare arg, THAT use 1811 // will correctly reject via BARE_VAR_UNSAFE_RE. 1812 const v = resolveSimpleExpansion(child, varScope, true) 1813 if (typeof v !== 'string') return v 1814 // If v is VAR_PLACEHOLDER (OTHER holds unknown), store it — combined 1815 // with containsAnyPlaceholder in the caller to treat as unknown. 1816 value = v 1817 } else { 1818 const v = walkArgument(child, innerCommands, varScope) 1819 if (typeof v !== 'string') return v 1820 value = v 1821 } 1822 } 1823 1824 if (name === null) { 1825 return { 1826 kind: 'too-complex', 1827 reason: 'Variable assignment without name', 1828 nodeType: 'variable_assignment', 1829 } 1830 } 1831 // SECURITY: tree-sitter-bash accepts invalid var names (e.g. `1VAR=value`) 1832 // as variable_assignment. Bash only recognizes [A-Za-z_][A-Za-z0-9_]* — 1833 // anything else is run as a COMMAND. `1VAR=value` → bash tries to execute 1834 // `1VAR=value` from PATH. We must not treat it as an inert assignment. 1835 if (!/^[A-Za-z_][A-Za-z0-9_]*$/.test(name)) { 1836 return { 1837 kind: 'too-complex', 1838 reason: `Invalid variable name (bash treats as command): ${name}`, 1839 nodeType: 'variable_assignment', 1840 } 1841 } 1842 // SECURITY: Setting IFS changes word-splitting behavior for subsequent 1843 // unquoted $VAR expansions. `IFS=: && VAR=a:b && rm $VAR` → bash splits 1844 // on `:` → `rm a b`. Our BARE_VAR_UNSAFE_RE only checks default IFS 1845 // chars (space/tab/NL) — we can't model custom IFS. Reject. 1846 if (name === 'IFS') { 1847 return { 1848 kind: 'too-complex', 1849 reason: 'IFS assignment changes word-splitting — cannot model statically', 1850 nodeType: 'variable_assignment', 1851 } 1852 } 1853 // SECURITY: PS4 is expanded via promptvars (default on) on every command 1854 // traced after `set -x`. A raw_string value containing $(cmd) or `cmd` 1855 // executes at trace time: `PS4='$(id)' && set -x && :` runs id, but our 1856 // argv is only [["set","-x"],[":"]] — the payload is invisible to 1857 // permission checks. PS0-3 and PROMPT_COMMAND are not expanded in 1858 // non-interactive shells (BashTool). 1859 // 1860 // ALLOWLIST, not blocklist. 5 rounds of bypass patches taught us that a 1861 // value-dependent blocklist is structurally fragile: 1862 // - `+=` effective-value computation diverges from bash in multiple 1863 // scope-model gaps: `||` reset, env-prefix chain (PS4='' && PS4='$' 1864 // PS4+='(id)' cmd reads stale parent value), subshell. 1865 // - bash's decode_prompt_string runs BEFORE promptvars, so `\044(id)` 1866 // (octal for `$`) becomes `$(id)` at trace time — any literal-char 1867 // check must model prompt-escape decoding exactly. 1868 // - assignment paths exist outside walkVariableAssignment (for_statement 1869 // sets loopVar directly, see that handler's PS4 check). 1870 // 1871 // Policy: (1) reject += outright — no scope-tracking dependency; user can 1872 // combine into one PS4=... (2) reject placeholders — runtime unknowable. 1873 // (3) allowlist remaining value: ${identifier} refs (value-read only, safe) 1874 // plus [A-Za-z0-9 _+:.\/=[\]-]. No bare `$` (blocks split primitive), no 1875 // `\` (blocks octal \044/\140), no backtick, no parens. Covers all known 1876 // encoding vectors and future ones — anything off the allowlist fails. 1877 // Legit `PS4='+${BASH_SOURCE}:${LINENO}: '` still passes. 1878 if (name === 'PS4') { 1879 if (isAppend) { 1880 return { 1881 kind: 'too-complex', 1882 reason: 1883 'PS4 += cannot be statically verified — combine into a single PS4= assignment', 1884 nodeType: 'variable_assignment', 1885 } 1886 } 1887 if (containsAnyPlaceholder(value)) { 1888 return { 1889 kind: 'too-complex', 1890 reason: 'PS4 value derived from cmdsub/variable — runtime unknowable', 1891 nodeType: 'variable_assignment', 1892 } 1893 } 1894 if ( 1895 !/^[A-Za-z0-9 _+:./=[\]-]*$/.test( 1896 value.replace(/\$\{[A-Za-z_][A-Za-z0-9_]*\}/g, ''), 1897 ) 1898 ) { 1899 return { 1900 kind: 'too-complex', 1901 reason: 1902 'PS4 value outside safe charset — only ${VAR} refs and [A-Za-z0-9 _+:.=/[]-] allowed', 1903 nodeType: 'variable_assignment', 1904 } 1905 } 1906 } 1907 // SECURITY: Tilde expansion in assignment RHS. `VAR=~/x` (unquoted) → 1908 // bash expands `~` at ASSIGNMENT time → VAR='/home/user/x'. We see the 1909 // literal `~/x`. Later `cd $VAR` → our argv `['cd','~/x']`, bash runs 1910 // `cd /home/user/x`. Tilde expansion also happens after `=` and `:` in 1911 // assignment values (e.g. PATH=~/bin:~/sbin). We can't model it — reject 1912 // any value containing `~` that isn't already quoted-literal (where bash 1913 // doesn't expand). Conservative: any `~` in value → reject. 1914 if (value.includes('~')) { 1915 return { 1916 kind: 'too-complex', 1917 reason: 'Tilde in assignment value — bash may expand at assignment time', 1918 nodeType: 'variable_assignment', 1919 } 1920 } 1921 return { name, value, isAppend } 1922} 1923 1924/** 1925 * Resolve a `simple_expansion` ($VAR) node. Returns VAR_PLACEHOLDER if 1926 * resolvable, too-complex otherwise. 1927 * 1928 * @param insideString true when $VAR is inside a `string` node ("...$VAR...") 1929 * rather than a bare/concatenation argument. SAFE_ENV_VARS and unknown-value 1930 * tracked vars are only allowed inside strings — as bare args their runtime 1931 * value IS the argument and we don't know it statically. 1932 * `cd $HOME/../x` would hide the real path behind the placeholder; 1933 * `echo "Home: $HOME"` just embeds text in a string. Tracked vars holding 1934 * STATIC strings (VAR=literal) are allowed in both positions since their 1935 * value IS known. 1936 */ 1937function resolveSimpleExpansion( 1938 node: Node, 1939 varScope: Map<string, string>, 1940 insideString: boolean, 1941): string | ParseForSecurityResult { 1942 let varName: string | null = null 1943 let isSpecial = false 1944 for (const c of node.children) { 1945 if (c?.type === 'variable_name') { 1946 varName = c.text 1947 break 1948 } 1949 if (c?.type === 'special_variable_name') { 1950 varName = c.text 1951 isSpecial = true 1952 break 1953 } 1954 } 1955 if (varName === null) return tooComplex(node) 1956 // Tracked vars: check stored value. Literal strings (VAR=/tmp) are 1957 // returned DIRECTLY so downstream path validation sees the real path. 1958 // Non-literal values (containing any placeholder — loop vars, $() output, 1959 // read vars, composites like `VAR="prefix$(cmd)"`) are ONLY safe inside 1960 // strings; as bare args they'd hide the runtime path/flag from validation. 1961 // 1962 // SECURITY: Returning the actual trackedValue (not a placeholder) is the 1963 // critical fix. `VAR=/etc && rm $VAR` → argv ['rm', '/etc'] → validatePath 1964 // correctly rejects. Previously returned a placeholder → validatePath saw 1965 // '__LOOP_STATIC__', resolved as cwd-relative → PASSED → bypass. 1966 const trackedValue = varScope.get(varName) 1967 if (trackedValue !== undefined) { 1968 if (containsAnyPlaceholder(trackedValue)) { 1969 // Non-literal: bare → reject, inside string → VAR_PLACEHOLDER 1970 // (walkString's solo-placeholder gate rejects `"$VAR"` alone). 1971 if (!insideString) return tooComplex(node) 1972 return VAR_PLACEHOLDER 1973 } 1974 // Pure literal (e.g. '/tmp', 'foo') — return it directly. Downstream 1975 // path validation / checkSemantics operate on the REAL value. 1976 // 1977 // SECURITY: For BARE args (not inside a string), bash word-splits on 1978 // $IFS and glob-expands the result. `VAR="-rf /" && rm $VAR` → bash 1979 // runs `rm -rf /` (two args); `VAR="/etc/*" && cat $VAR` → expands to 1980 // all files. Reject values containing IFS/glob chars unless in "...". 1981 // 1982 // SECURITY: Empty value as bare arg. Bash word-splitting on "" produces 1983 // ZERO fields — the expansion disappears. `V="" && $V eval x` → bash 1984 // runs `eval x` (our argv would be ["","eval","x"] with name="" — 1985 // every EVAL_LIKE/ZSH/keyword check misses). `V="" && ls $V /etc` → 1986 // bash runs `ls /etc`, our argv has a phantom "" shifting positions. 1987 // Inside "...": `"$V"` → bash produces one empty-string arg → our "" 1988 // is correct, keep allowing. 1989 if (!insideString) { 1990 if (trackedValue === '') return tooComplex(node) 1991 if (BARE_VAR_UNSAFE_RE.test(trackedValue)) return tooComplex(node) 1992 } 1993 return trackedValue 1994 } 1995 // SAFE_ENV_VARS + special vars ($?, $$, $@, $1, etc.): value unknown 1996 // (shell-controlled). Only safe when embedded in a string, NOT as a 1997 // bare argument to a path-sensitive command. 1998 if (insideString) { 1999 if (SAFE_ENV_VARS.has(varName)) return VAR_PLACEHOLDER 2000 if ( 2001 isSpecial && 2002 (SPECIAL_VAR_NAMES.has(varName) || /^[0-9]+$/.test(varName)) 2003 ) { 2004 return VAR_PLACEHOLDER 2005 } 2006 } 2007 return tooComplex(node) 2008} 2009 2010/** 2011 * Apply a variable assignment to the scope, handling `+=` append semantics. 2012 * SECURITY: If EITHER side (existing value or appended value) contains a 2013 * placeholder, the result is non-literal — store VAR_PLACEHOLDER so later 2014 * $VAR correctly rejects as bare arg. 2015 * `VAR=/etc && VAR+=$(cmd)` must not leave VAR looking static. 2016 */ 2017function applyVarToScope( 2018 varScope: Map<string, string>, 2019 ev: { name: string; value: string; isAppend: boolean }, 2020): void { 2021 const existing = varScope.get(ev.name) ?? '' 2022 const combined = ev.isAppend ? existing + ev.value : ev.value 2023 varScope.set( 2024 ev.name, 2025 containsAnyPlaceholder(combined) ? VAR_PLACEHOLDER : combined, 2026 ) 2027} 2028 2029function stripRawString(text: string): string { 2030 return text.slice(1, -1) 2031} 2032 2033function tooComplex(node: Node): ParseForSecurityResult { 2034 const reason = 2035 node.type === 'ERROR' 2036 ? 'Parse error' 2037 : DANGEROUS_TYPES.has(node.type) 2038 ? `Contains ${node.type}` 2039 : `Unhandled node type: ${node.type}` 2040 return { kind: 'too-complex', reason, nodeType: node.type } 2041} 2042 2043// ──────────────────────────────────────────────────────────────────────────── 2044// Post-argv semantic checks 2045// 2046// Everything above answers "can we tokenize?". Everything below answers 2047// "is the resulting argv dangerous in ways that don't involve parsing?". 2048// These are checks on argv[0] or argv content that the old bashSecurity.ts 2049// validators performed but which have nothing to do with parser 2050// differentials. They're here (not in bashSecurity.ts) because they operate 2051// on SimpleCommand and need to run for every extracted command. 2052// ──────────────────────────────────────────────────────────────────────────── 2053 2054/** 2055 * Zsh module builtins. These are not binaries on PATH — they're zsh 2056 * internals loaded via zmodload. Since BashTool runs via the user's default 2057 * shell (often zsh), and these parse as plain `command` nodes with no 2058 * distinguishing syntax, we can only catch them by name. 2059 */ 2060const ZSH_DANGEROUS_BUILTINS = new Set([ 2061 'zmodload', 2062 'emulate', 2063 'sysopen', 2064 'sysread', 2065 'syswrite', 2066 'sysseek', 2067 'zpty', 2068 'ztcp', 2069 'zsocket', 2070 'zf_rm', 2071 'zf_mv', 2072 'zf_ln', 2073 'zf_chmod', 2074 'zf_chown', 2075 'zf_mkdir', 2076 'zf_rmdir', 2077 'zf_chgrp', 2078]) 2079 2080/** 2081 * Shell builtins that evaluate their arguments as code or otherwise escape 2082 * the argv abstraction. A command like `eval "rm -rf /"` has argv 2083 * ['eval', 'rm -rf /'] which looks inert to flag validation but executes 2084 * the string. Treat these the same as command substitution. 2085 */ 2086const EVAL_LIKE_BUILTINS = new Set([ 2087 'eval', 2088 'source', 2089 '.', 2090 'exec', 2091 'command', 2092 'builtin', 2093 'fc', 2094 // `coproc rm -rf /` spawns rm as a coprocess. tree-sitter parses it as 2095 // a plain command with argv[0]='coproc', so permission rules and path 2096 // validation would check 'coproc' not 'rm'. 2097 'coproc', 2098 // Zsh precommand modifiers: `noglob cmd args` runs cmd with globbing off. 2099 // They parse as ordinary commands (noglob is argv[0], the real command is 2100 // argv[1]) so permission matching against argv[0] would see 'noglob', not 2101 // the wrapped command. 2102 'noglob', 2103 'nocorrect', 2104 // `trap 'cmd' SIGNAL` — cmd runs as shell code on signal/exit. EXIT fires 2105 // at end of every BashTool invocation, so this is guaranteed execution. 2106 'trap', 2107 // `enable -f /path/lib.so name` — dlopen arbitrary .so as a builtin. 2108 // Native code execution. 2109 'enable', 2110 // `mapfile -C callback -c N` / `readarray -C callback` — callback runs as 2111 // shell code every N input lines. 2112 'mapfile', 2113 'readarray', 2114 // `hash -p /path cmd` — poisons bash's command-lookup cache. Subsequent 2115 // `cmd` in the same command resolves to /path instead of PATH lookup. 2116 'hash', 2117 // `bind -x '"key":cmd'` / `complete -C cmd` — interactive-only callbacks 2118 // but still code-string arguments. Low impact in non-interactive BashTool 2119 // shells, blocked for consistency. `compgen -C cmd` is NOT interactive-only: 2120 // it immediately executes the -C argument to generate completions. 2121 'bind', 2122 'complete', 2123 'compgen', 2124 // `alias name='cmd'` — aliases not expanded in non-interactive bash by 2125 // default, but `shopt -s expand_aliases` enables them. Also blocked as 2126 // defense-in-depth (alias followed by name use in same command). 2127 'alias', 2128 // `let EXPR` arithmetically evaluates EXPR — identical to $(( EXPR )). 2129 // Array subscripts in the expression expand $(cmd) at eval time even when 2130 // the argument arrived single-quoted: `let 'x=a[$(id)]'` executes id. 2131 // tree-sitter sees the raw_string as an opaque leaf. Same primitive 2132 // walkArithmetic guards, but `let` is a plain command node. 2133 'let', 2134]) 2135 2136/** 2137 * Builtins that re-parse a NAME operand internally and arithmetically 2138 * evaluate `arr[EXPR]` subscripts — including $(cmd) in the subscript — 2139 * even when the argv element arrived from a single-quoted raw_string. 2140 * `test -v 'a[$(id)]'` → tree-sitter sees an opaque leaf, bash runs id. 2141 * Maps: builtin name → set of flags whose next argument is a NAME. 2142 */ 2143const SUBSCRIPT_EVAL_FLAGS: Record<string, Set<string>> = { 2144 test: new Set(['-v', '-R']), 2145 '[': new Set(['-v', '-R']), 2146 '[[': new Set(['-v', '-R']), 2147 printf: new Set(['-v']), 2148 read: new Set(['-a']), 2149 unset: new Set(['-v']), 2150 // bash 5.1+: `wait -p VAR [id...]` stores the waited PID into VAR. When VAR 2151 // is `arr[EXPR]`, bash arithmetically evaluates the subscript — running 2152 // $(cmd) even from a single-quoted raw_string. Verified bash 5.3.9: 2153 // `: & wait -p 'a[$(id)]' %1` executes id. 2154 wait: new Set(['-p']), 2155} 2156 2157/** 2158 * `[[ ARG1 OP ARG2 ]]` where OP is an arithmetic comparison. bash manual: 2159 * "When used with [[, Arg1 and Arg2 are evaluated as arithmetic 2160 * expressions." Arithmetic evaluation recursively expands array subscripts, 2161 * so `[[ 'a[$(id)]' -eq 0 ]]` executes `id` even though tree-sitter sees 2162 * the operand as an opaque raw_string leaf. Unlike -v/-R (unary, NAME after 2163 * flag), these are binary — the subscript can appear on EITHER side, so 2164 * SUBSCRIPT_EVAL_FLAGS's "next arg" logic is insufficient. 2165 * `[` / `test` are not vulnerable (bash errors with "integer expression 2166 * expected"), but the test_command handler normalizes argv[0]='[[' for 2167 * both forms, so they get this check too — mild over-blocking, safe side. 2168 */ 2169const TEST_ARITH_CMP_OPS = new Set(['-eq', '-ne', '-lt', '-le', '-gt', '-ge']) 2170 2171/** 2172 * Builtins where EVERY non-flag positional argument is a NAME that bash 2173 * re-parses and arithmetically evaluates subscripts on — no flag required. 2174 * `read 'a[$(id)]'` executes id: each positional is a variable name to 2175 * assign into, and `arr[EXPR]` is valid syntax there. `unset NAME...` is 2176 * the same (though tree-sitter's unset_command handler currently rejects 2177 * raw_string children before reaching here — this is defense-in-depth). 2178 * NOT printf (positional args are FORMAT/data), NOT test/[ (operands are 2179 * values, only -v/-R take a NAME). declare/typeset/local handled in 2180 * declaration_command since they never reach here as plain commands. 2181 */ 2182const BARE_SUBSCRIPT_NAME_BUILTINS = new Set(['read', 'unset']) 2183 2184/** 2185 * `read` flags whose NEXT argument is data (prompt/delimiter/count/fd), 2186 * not a NAME. `read -p '[foo] ' var` must not trip on the `[` in the 2187 * prompt string. `-a` is intentionally absent — its operand IS a NAME. 2188 */ 2189const READ_DATA_FLAGS = new Set(['-p', '-d', '-n', '-N', '-t', '-u', '-i']) 2190 2191// SHELL_KEYWORDS imported from bashParser.ts — shell reserved words can never 2192// be legitimate argv[0]; if they appear, the parser mis-parsed a compound 2193// command. Reject to avoid nonsense argv reaching downstream. 2194 2195// Use `.*` not `[^/]*` — Linux resolves `..` in procfs, so 2196// `/proc/self/../self/environ` works and must be caught. 2197const PROC_ENVIRON_RE = /\/proc\/.*\/environ/ 2198 2199/** 2200 * Newline followed by `#` in an argv element, env var value, or redirect target. 2201 * Downstream stripSafeWrappers re-tokenizes .text line-by-line and treats `#` 2202 * after a newline as a comment, hiding arguments that follow. 2203 */ 2204const NEWLINE_HASH_RE = /\n[ \t]*#/ 2205 2206export type SemanticCheckResult = { ok: true } | { ok: false; reason: string } 2207 2208/** 2209 * Post-argv semantic checks. Run after parseForSecurity returns 'simple' to 2210 * catch commands that tokenize fine but are dangerous by name or argument 2211 * content. Returns the first failure or {ok: true}. 2212 */ 2213export function checkSemantics(commands: SimpleCommand[]): SemanticCheckResult { 2214 for (const cmd of commands) { 2215 // Strip safe wrapper commands (nohup, time, timeout N, nice -n N) so 2216 // `nohup eval "..."` and `timeout 5 jq 'system(...)'` are checked 2217 // against the wrapped command, not the wrapper. Inlined here to avoid 2218 // circular import with bashPermissions.ts. 2219 let a = cmd.argv 2220 for (;;) { 2221 if (a[0] === 'time' || a[0] === 'nohup') { 2222 a = a.slice(1) 2223 } else if (a[0] === 'timeout') { 2224 // `timeout 5`, `timeout 5s`, `timeout 5.5`, plus optional GNU flags 2225 // preceding the duration. Long: --foreground, --kill-after=N, 2226 // --signal=SIG, --preserve-status. Short: -k DUR, -s SIG, -v (also 2227 // fused: -k5, -sTERM). 2228 // SECURITY (SAST Mar 2026): the previous loop only skipped `--long` 2229 // flags, so `timeout -k 5 10 eval ...` broke out with name='timeout' 2230 // and the wrapped eval was never checked. Now handle known short 2231 // flags AND fail closed on any unrecognized flag — an unknown flag 2232 // means we can't locate the wrapped command, so we must not silently 2233 // fall through to name='timeout'. 2234 let i = 1 2235 while (i < a.length) { 2236 const arg = a[i]! 2237 if ( 2238 arg === '--foreground' || 2239 arg === '--preserve-status' || 2240 arg === '--verbose' 2241 ) { 2242 i++ // known no-value long flags 2243 } else if (/^--(?:kill-after|signal)=[A-Za-z0-9_.+-]+$/.test(arg)) { 2244 i++ // --kill-after=5, --signal=TERM (value fused with =) 2245 } else if ( 2246 (arg === '--kill-after' || arg === '--signal') && 2247 a[i + 1] && 2248 /^[A-Za-z0-9_.+-]+$/.test(a[i + 1]!) 2249 ) { 2250 i += 2 // --kill-after 5, --signal TERM (space-separated) 2251 } else if (arg.startsWith('--')) { 2252 // Unknown long flag, OR --kill-after/--signal with non-allowlisted 2253 // value (e.g. placeholder from $() substitution). Fail closed. 2254 return { 2255 ok: false, 2256 reason: `timeout with ${arg} flag cannot be statically analyzed`, 2257 } 2258 } else if (arg === '-v') { 2259 i++ // --verbose, no argument 2260 } else if ( 2261 (arg === '-k' || arg === '-s') && 2262 a[i + 1] && 2263 /^[A-Za-z0-9_.+-]+$/.test(a[i + 1]!) 2264 ) { 2265 i += 2 // -k DURATION / -s SIGNAL — separate value 2266 } else if (/^-[ks][A-Za-z0-9_.+-]+$/.test(arg)) { 2267 i++ // fused: -k5, -sTERM 2268 } else if (arg.startsWith('-')) { 2269 // Unknown flag OR -k/-s with non-allowlisted value — can't locate 2270 // wrapped cmd. Reject, don't fall through to name='timeout'. 2271 return { 2272 ok: false, 2273 reason: `timeout with ${arg} flag cannot be statically analyzed`, 2274 } 2275 } else { 2276 break // non-flag — should be the duration 2277 } 2278 } 2279 if (a[i] && /^\d+(?:\.\d+)?[smhd]?$/.test(a[i]!)) { 2280 a = a.slice(i + 1) 2281 } else if (a[i]) { 2282 // SECURITY (PR #21503 round 3): a[i] exists but doesn't match our 2283 // duration regex. GNU timeout parses via xstrtod() (libc strtod) and 2284 // accepts `.5`, `+5`, `5e-1`, `inf`, `infinity`, hex floats — none 2285 // of which match `/^\d+(\.\d+)?[smhd]?$/`. Empirically verified: 2286 // `timeout .5 echo ok` works. Previously this branch `break`ed 2287 // (fail-OPEN) so `timeout .5 eval "id"` with `Bash(timeout:*)` left 2288 // name='timeout' and eval was never checked. Now fail CLOSED — 2289 // consistent with the unknown-FLAG handling above (lines ~1895,1912). 2290 return { 2291 ok: false, 2292 reason: `timeout duration '${a[i]}' cannot be statically analyzed`, 2293 } 2294 } else { 2295 break // no more args — `timeout` alone, inert 2296 } 2297 } else if (a[0] === 'nice') { 2298 // `nice cmd`, `nice -n N cmd`, `nice -N cmd` (legacy). All run cmd 2299 // at a lower priority. argv[0] check must see the wrapped cmd. 2300 if (a[1] === '-n' && a[2] && /^-?\d+$/.test(a[2])) { 2301 a = a.slice(3) 2302 } else if (a[1] && /^-\d+$/.test(a[1])) { 2303 a = a.slice(2) // `nice -10 cmd` 2304 } else if (a[1] && /[$(`]/.test(a[1])) { 2305 // SECURITY: walkArgument returns node.text for arithmetic_expansion, 2306 // so `nice $((0-5)) jq ...` has a[1]='$((0-5))'. Bash expands it to 2307 // '-5' (legacy nice syntax) and execs jq; we'd slice(1) here and 2308 // set name='$((0-5))' which skips the jq system() check entirely. 2309 // Fail closed — mirrors the timeout-duration fail-closed above. 2310 return { 2311 ok: false, 2312 reason: `nice argument '${a[1]}' contains expansion — cannot statically determine wrapped command`, 2313 } 2314 } else { 2315 a = a.slice(1) // bare `nice cmd` 2316 } 2317 } else if (a[0] === 'env') { 2318 // `env [VAR=val...] [-i] [-0] [-v] [-u NAME...] cmd args` runs cmd. 2319 // argv[0] check must see cmd, not env. Skip known-safe forms only. 2320 // SECURITY: -S splits a string into argv (mini-shell) — must reject. 2321 // -C/-P change cwd/PATH — wrapped cmd runs elsewhere, reject. 2322 // Any OTHER flag → reject (fail-closed, not fail-open to name='env'). 2323 let i = 1 2324 while (i < a.length) { 2325 const arg = a[i]! 2326 if (arg.includes('=') && !arg.startsWith('-')) { 2327 i++ // VAR=val assignment 2328 } else if (arg === '-i' || arg === '-0' || arg === '-v') { 2329 i++ // flags with no argument 2330 } else if (arg === '-u' && a[i + 1]) { 2331 i += 2 // -u NAME unsets; takes one arg 2332 } else if (arg.startsWith('-')) { 2333 // -S (argv splitter), -C (altwd), -P (altpath), --anything, 2334 // or unknown flag. Can't model — reject the whole command. 2335 return { 2336 ok: false, 2337 reason: `env with ${arg} flag cannot be statically analyzed`, 2338 } 2339 } else { 2340 break // the wrapped command 2341 } 2342 } 2343 if (i < a.length) { 2344 a = a.slice(i) 2345 } else { 2346 break // `env` alone (no wrapped cmd) — inert, name='env' 2347 } 2348 } else if (a[0] === 'stdbuf') { 2349 // `stdbuf -o0 cmd` (fused), `stdbuf -o 0 cmd` (space-separated), 2350 // multiple flags (`stdbuf -o0 -eL cmd`), long forms (`--output=0`). 2351 // SECURITY: previous handling only stripped ONE flag and fell through 2352 // to slice(2) for anything unrecognized, so `stdbuf --output 0 eval` 2353 // → ['0','eval',...] → name='0' hid eval. Now iterate all known flag 2354 // forms and fail closed on any unknown flag. 2355 let i = 1 2356 while (i < a.length) { 2357 const arg = a[i]! 2358 if (STDBUF_SHORT_SEP_RE.test(arg) && a[i + 1]) { 2359 i += 2 // -o MODE (space-separated) 2360 } else if (STDBUF_SHORT_FUSED_RE.test(arg)) { 2361 i++ // -o0 (fused) 2362 } else if (STDBUF_LONG_RE.test(arg)) { 2363 i++ // --output=MODE (fused long) 2364 } else if (arg.startsWith('-')) { 2365 // --output MODE (space-separated long) or unknown flag. GNU 2366 // stdbuf long options use `=` syntax, but getopt_long also 2367 // accepts space-separated — we can't enumerate safely, reject. 2368 return { 2369 ok: false, 2370 reason: `stdbuf with ${arg} flag cannot be statically analyzed`, 2371 } 2372 } else { 2373 break // the wrapped command 2374 } 2375 } 2376 if (i > 1 && i < a.length) { 2377 a = a.slice(i) 2378 } else { 2379 break // `stdbuf` with no flags or no wrapped cmd — inert 2380 } 2381 } else { 2382 break 2383 } 2384 } 2385 const name = a[0] 2386 if (name === undefined) continue 2387 2388 // SECURITY: Empty command name. Quoted empty (`"" cmd`) is harmless — 2389 // bash tries to exec "" and fails with "command not found". But an 2390 // UNQUOTED empty expansion at command position (`V="" && $V cmd`) is a 2391 // bypass: bash drops the empty field and runs `cmd` as argv[0], while 2392 // our name="" skips every builtin check below. resolveSimpleExpansion 2393 // rejects the $V case; this catches any other path to empty argv[0] 2394 // (concatenation of empties, walkString whitespace-quirk, future bugs). 2395 if (name === '') { 2396 return { 2397 ok: false, 2398 reason: 'Empty command name — argv[0] may not reflect what bash runs', 2399 } 2400 } 2401 2402 // Defense-in-depth: argv[0] should never be a placeholder after the 2403 // var-tracking fix (static vars return real value, unknown vars reject). 2404 // But if a bug upstream ever lets one through, catch it here — a 2405 // placeholder-as-command-name means runtime-determined command → unsafe. 2406 if (name.includes(CMDSUB_PLACEHOLDER) || name.includes(VAR_PLACEHOLDER)) { 2407 return { 2408 ok: false, 2409 reason: 'Command name is runtime-determined (placeholder argv[0])', 2410 } 2411 } 2412 2413 // argv[0] starts with an operator/flag: this is a fragment, not a 2414 // command. Likely a line-continuation leak or a mistake. 2415 if (name.startsWith('-') || name.startsWith('|') || name.startsWith('&')) { 2416 return { 2417 ok: false, 2418 reason: 'Command appears to be an incomplete fragment', 2419 } 2420 } 2421 2422 // SECURITY: builtins that re-parse a NAME operand internally. bash 2423 // arithmetically evaluates `arr[EXPR]` in NAME position, running $(cmd) 2424 // in the subscript even when the argv element arrived from a 2425 // single-quoted raw_string (opaque leaf to tree-sitter). Two forms: 2426 // separate (`printf -v NAME`) and fused (`printf -vNAME`, getopt-style). 2427 // `printf '[%s]' x` stays safe — `[` in format string, not after `-v`. 2428 const dangerFlags = SUBSCRIPT_EVAL_FLAGS[name] 2429 if (dangerFlags !== undefined) { 2430 for (let i = 1; i < a.length; i++) { 2431 const arg = a[i]! 2432 // Separate form: `-v` then NAME in next arg. 2433 if (dangerFlags.has(arg) && a[i + 1]?.includes('[')) { 2434 return { 2435 ok: false, 2436 reason: `'${name} ${arg}' operand contains array subscript — bash evaluates $(cmd) in subscripts`, 2437 } 2438 } 2439 // Combined short flags: `-ra` is bash shorthand for `-r -a`. 2440 // Check if any danger flag character appears in a combined flag 2441 // string. The danger flag's NAME operand is the next argument. 2442 if ( 2443 arg.length > 2 && 2444 arg[0] === '-' && 2445 arg[1] !== '-' && 2446 !arg.includes('[') 2447 ) { 2448 for (const flag of dangerFlags) { 2449 if (flag.length === 2 && arg.includes(flag[1]!)) { 2450 if (a[i + 1]?.includes('[')) { 2451 return { 2452 ok: false, 2453 reason: `'${name} ${flag}' (combined in '${arg}') operand contains array subscript — bash evaluates $(cmd) in subscripts`, 2454 } 2455 } 2456 } 2457 } 2458 } 2459 // Fused form: `-vNAME` in one arg. Only short-option flags fuse 2460 // (getopt), so check -v/-a/-R. `[[` uses test_operator nodes only. 2461 for (const flag of dangerFlags) { 2462 if ( 2463 flag.length === 2 && 2464 arg.startsWith(flag) && 2465 arg.length > 2 && 2466 arg.includes('[') 2467 ) { 2468 return { 2469 ok: false, 2470 reason: `'${name} ${flag}' (fused) operand contains array subscript — bash evaluates $(cmd) in subscripts`, 2471 } 2472 } 2473 } 2474 } 2475 } 2476 2477 // SECURITY: `[[ ARG OP ARG ]]` arithmetic comparison. bash evaluates 2478 // BOTH operands as arithmetic expressions, recursively expanding 2479 // `arr[$(cmd)]` subscripts even from single-quoted raw_string. Check 2480 // the operand adjacent to each arith-cmp operator on BOTH sides — 2481 // SUBSCRIPT_EVAL_FLAGS's "flag then next-arg" pattern can't express 2482 // "either side of a binary op". String comparisons (==/!=/=~) do NOT 2483 // trigger arithmetic eval — `[[ 'a[x]' == y ]]` is a literal string cmp. 2484 if (name === '[[') { 2485 // i starts at 2: a[0]='[[' (contains '['), a[1] is the first real 2486 // operand. A binary op can't appear before index 2. 2487 for (let i = 2; i < a.length; i++) { 2488 if (!TEST_ARITH_CMP_OPS.has(a[i]!)) continue 2489 if (a[i - 1]?.includes('[') || a[i + 1]?.includes('[')) { 2490 return { 2491 ok: false, 2492 reason: `'[[ ... ${a[i]} ... ]]' operand contains array subscript — bash arithmetically evaluates $(cmd) in subscripts`, 2493 } 2494 } 2495 } 2496 } 2497 2498 // SECURITY: `read`/`unset` treat EVERY bare positional as a NAME — 2499 // no flag needed. `read 'a[$(id)]' <<< data` executes id even though 2500 // argv[1] arrived from a single-quoted raw_string and no -a flag is 2501 // present. Same primitive as SUBSCRIPT_EVAL_FLAGS but the trigger is 2502 // positional, not flag-gated. Skip operands of read's data-taking 2503 // flags (-p PROMPT etc.) to avoid blocking `read -p '[foo] ' var`. 2504 if (BARE_SUBSCRIPT_NAME_BUILTINS.has(name)) { 2505 let skipNext = false 2506 for (let i = 1; i < a.length; i++) { 2507 const arg = a[i]! 2508 if (skipNext) { 2509 skipNext = false 2510 continue 2511 } 2512 if (arg[0] === '-') { 2513 if (name === 'read') { 2514 if (READ_DATA_FLAGS.has(arg)) { 2515 skipNext = true 2516 } else if (arg.length > 2 && arg[1] !== '-') { 2517 // Combined short flag like `-rp`. Getopt-style: first 2518 // data-flag char consumes rest-of-arg as its operand 2519 // (`-p[foo]` → prompt=`[foo]`), or next-arg if last 2520 // (`-rp '[foo]'` → prompt=`[foo]`). So skipNext iff a 2521 // data-flag char appears at the END after only no-arg 2522 // flags like `-r`/`-s`. 2523 for (let j = 1; j < arg.length; j++) { 2524 if (READ_DATA_FLAGS.has('-' + arg[j])) { 2525 if (j === arg.length - 1) skipNext = true 2526 break 2527 } 2528 } 2529 } 2530 } 2531 continue 2532 } 2533 if (arg.includes('[')) { 2534 return { 2535 ok: false, 2536 reason: `'${name}' positional NAME '${arg}' contains array subscript — bash evaluates $(cmd) in subscripts`, 2537 } 2538 } 2539 } 2540 } 2541 2542 // SECURITY: Shell reserved keywords as argv[0] indicate a tree-sitter 2543 // mis-parse. `! for i in a; do :; done` parses as `command "for i in a"` 2544 // + `command "do :"` + `command "done"` — tree-sitter fails to recognize 2545 // `for` after `!` as a compound command start. Reject: keywords can never 2546 // be legitimate command names, and argv like ['do','false'] is nonsense. 2547 if (SHELL_KEYWORDS.has(name)) { 2548 return { 2549 ok: false, 2550 reason: `Shell keyword '${name}' as command name — tree-sitter mis-parse`, 2551 } 2552 } 2553 2554 // Check argv (not .text) to catch both single-quote (`'\n#'`) and 2555 // double-quote (`"\n#"`) variants. Env vars and redirects are also 2556 // part of the .text span so the same downstream bug applies. 2557 // Heredoc bodies are excluded from argv so markdown `##` headers 2558 // don't trigger this. 2559 // TODO: remove once downstream path validation operates on argv. 2560 for (const arg of cmd.argv) { 2561 if (arg.includes('\n') && NEWLINE_HASH_RE.test(arg)) { 2562 return { 2563 ok: false, 2564 reason: 2565 'Newline followed by # inside a quoted argument can hide arguments from path validation', 2566 } 2567 } 2568 } 2569 for (const ev of cmd.envVars) { 2570 if (ev.value.includes('\n') && NEWLINE_HASH_RE.test(ev.value)) { 2571 return { 2572 ok: false, 2573 reason: 2574 'Newline followed by # inside an env var value can hide arguments from path validation', 2575 } 2576 } 2577 } 2578 for (const r of cmd.redirects) { 2579 if (r.target.includes('\n') && NEWLINE_HASH_RE.test(r.target)) { 2580 return { 2581 ok: false, 2582 reason: 2583 'Newline followed by # inside a redirect target can hide arguments from path validation', 2584 } 2585 } 2586 } 2587 2588 // jq's system() built-in executes arbitrary shell commands, and flags 2589 // like --from-file can read arbitrary files into jq variables. On the 2590 // legacy path these are caught by validateJqCommand in bashSecurity.ts, 2591 // but that validator is gated behind `astSubcommands === null` and 2592 // never runs when the AST parse succeeds. Mirror the checks here so 2593 // the AST path has the same defence. 2594 if (name === 'jq') { 2595 for (const arg of a) { 2596 if (/\bsystem\s*\(/.test(arg)) { 2597 return { 2598 ok: false, 2599 reason: 2600 'jq command contains system() function which executes arbitrary commands', 2601 } 2602 } 2603 } 2604 if ( 2605 a.some(arg => 2606 /^(?:-[fL](?:$|[^A-Za-z])|--(?:from-file|rawfile|slurpfile|library-path)(?:$|=))/.test( 2607 arg, 2608 ), 2609 ) 2610 ) { 2611 return { 2612 ok: false, 2613 reason: 2614 'jq command contains dangerous flags that could execute code or read arbitrary files', 2615 } 2616 } 2617 } 2618 2619 if (ZSH_DANGEROUS_BUILTINS.has(name)) { 2620 return { 2621 ok: false, 2622 reason: `Zsh builtin '${name}' can bypass security checks`, 2623 } 2624 } 2625 2626 if (EVAL_LIKE_BUILTINS.has(name)) { 2627 // `command -v foo` / `command -V foo` are POSIX existence checks that 2628 // only print paths — they never execute argv[1]. Bare `command foo` 2629 // does bypass function/alias lookup (the concern), so keep blocking it. 2630 if (name === 'command' && (a[1] === '-v' || a[1] === '-V')) { 2631 // fall through to remaining checks 2632 } else if ( 2633 name === 'fc' && 2634 !a.slice(1).some(arg => /^-[^-]*[es]/.test(arg)) 2635 ) { 2636 // `fc -l`, `fc -ln` list history — safe. `fc -e ed` invokes an 2637 // editor then executes. `fc -s [pat=rep]` RE-EXECUTES the last 2638 // matching command (optionally with substitution) — as dangerous 2639 // as eval. Block any short-opt containing `e` or `s`. 2640 // to avoid introducing FPs for `fc -l` (list history). 2641 } else if ( 2642 name === 'compgen' && 2643 !a.slice(1).some(arg => /^-[^-]*[CFW]/.test(arg)) 2644 ) { 2645 // `compgen -c/-f/-v` only list completions — safe. `compgen -C cmd` 2646 // immediately executes cmd; `-F func` calls a shell function; `-W list` 2647 // word-expands its argument (including $(cmd) even from single-quoted 2648 // raw_string). Block any short-opt containing C/F/W (case-sensitive: 2649 // -c/-f are safe). 2650 } else { 2651 return { 2652 ok: false, 2653 reason: `'${name}' evaluates arguments as shell code`, 2654 } 2655 } 2656 } 2657 2658 // /proc/*/environ exposes env vars (including secrets) of other processes. 2659 // Check argv and redirect targets — `cat /proc/self/environ` and 2660 // `cat < /proc/self/environ` both read it. 2661 for (const arg of cmd.argv) { 2662 if (arg.includes('/proc/') && PROC_ENVIRON_RE.test(arg)) { 2663 return { 2664 ok: false, 2665 reason: 'Accesses /proc/*/environ which may expose secrets', 2666 } 2667 } 2668 } 2669 for (const r of cmd.redirects) { 2670 if (r.target.includes('/proc/') && PROC_ENVIRON_RE.test(r.target)) { 2671 return { 2672 ok: false, 2673 reason: 'Accesses /proc/*/environ which may expose secrets', 2674 } 2675 } 2676 } 2677 } 2678 return { ok: true } 2679}