// Source dump of Claude Code (at main).
/**
 * Tree-sitter AST analysis utilities for bash command security validation.
 *
 * These functions extract security-relevant information from tree-sitter
 * parse trees, providing more accurate analysis than regex/shell-quote
 * parsing. Each function takes a root node and command string, and returns
 * structured data that can be used by security validators.
 *
 * The native NAPI parser returns plain JS objects — no cleanup needed.
 */

type TreeSitterNode = {
  type: string
  text: string
  startIndex: number
  endIndex: number
  // Every walker in this file null-checks its children, so the type admits
  // empty slots instead of pretending they cannot occur.
  children: Array<TreeSitterNode | null>
  childCount: number
}

export type QuoteContext = {
  /** Command text with single-quoted content removed (double-quoted content preserved) */
  withDoubleQuotes: string
  /** Command text with all quoted content removed */
  fullyUnquoted: string
  /** Like fullyUnquoted but preserves quote characters (', ") */
  unquotedKeepQuoteChars: string
}

export type CompoundStructure = {
  /** Whether the command has compound operators (&&, ||, ;) at the top level */
  hasCompoundOperators: boolean
  /** Whether the command has pipelines */
  hasPipeline: boolean
  /** Whether the command has subshells */
  hasSubshell: boolean
  /** Whether the command has command groups ({...}) */
  hasCommandGroup: boolean
  /** Top-level compound operator types found */
  operators: string[]
  /** Individual command segments split by compound operators */
  segments: string[]
}

export type DangerousPatterns = {
  /** Has $() or backtick command substitution (outside quotes that would make it safe) */
  hasCommandSubstitution: boolean
  /** Has <() or >() process substitution */
  hasProcessSubstitution: boolean
  /** Has ${...} parameter expansion */
  hasParameterExpansion: boolean
  /** Has heredoc */
  hasHeredoc: boolean
  /** Has comment */
  hasComment: boolean
}

export type TreeSitterAnalysis = {
  quoteContext: QuoteContext
  compoundStructure: CompoundStructure
  /** Whether actual operator nodes (;, &&, ||) exist — if false, \; is just a word argument */
  hasActualOperatorNodes: boolean
  dangerousPatterns: DangerousPatterns
}

type QuoteSpans = {
  raw: Array<[number, number]> // raw_string (single-quoted)
  ansiC: Array<[number, number]> // ansi_c_string ($'...')
  double: Array<[number, number]> // string (double-quoted)
  heredoc: Array<[number, number]> // quoted heredoc_redirect
}

/**
 * Single-pass collection of all quote-related spans.
 * Previously this was 5 separate tree walks (one per type-set plus
 * allQuoteTypes plus heredoc); fusing cuts tree-traversal ~5x.
 *
 * Replicates the per-type walk semantics: each original walk stopped at
 * its own type. So the raw_string walk would recurse THROUGH a string
 * node (not its type) to reach nested raw_string inside $(...), but the
 * string walk would stop at the outer string. We track `inDouble` to
 * collect the *outermost* string span per path, while still descending
 * into $()/${} bodies to pick up inner raw_string/ansi_c_string.
 *
 * raw_string / ansi_c_string / quoted-heredoc bodies are literal text
 * in bash (no expansion), so no nested quote nodes exist — return early.
 *
 * @param node Subtree to scan.
 * @param out Accumulator; spans are appended as `[startIndex, endIndex)`.
 * @param inDouble True once an enclosing `string` node has been entered.
 */
function collectQuoteSpans(
  node: TreeSitterNode,
  out: QuoteSpans,
  inDouble: boolean,
): void {
  switch (node.type) {
    case 'raw_string':
      out.raw.push([node.startIndex, node.endIndex])
      return // literal body, no nested quotes possible
    case 'ansi_c_string':
      out.ansiC.push([node.startIndex, node.endIndex])
      return // literal body
    case 'string':
      // Only collect the outermost string (matches old per-type walk
      // which stops at first match). Recurse regardless — a nested
      // $(cmd 'x') inside "..." has a real inner raw_string.
      if (!inDouble) out.double.push([node.startIndex, node.endIndex])
      for (const child of node.children) {
        if (child) collectQuoteSpans(child, out, true)
      }
      return
    case 'heredoc_redirect': {
      // Quoted heredocs (<<'EOF', <<"EOF", <<\EOF): literal body.
      // Unquoted (<<EOF) expands $()/${} — the body can contain
      // $(cmd 'x') whose inner '...' IS a real raw_string node.
      // Detection: heredoc_start text starts with ' or " or \
      // Matches sync path's extractHeredocs({ quotedOnly: true }).
      let isQuoted = false
      for (const child of node.children) {
        if (child && child.type === 'heredoc_start') {
          const first = child.text[0]
          isQuoted = first === "'" || first === '"' || first === '\\'
          break
        }
      }
      if (isQuoted) {
        out.heredoc.push([node.startIndex, node.endIndex])
        return // literal body, no nested quote nodes
      }
      // Unquoted: recurse into heredoc_body → command_substitution →
      // inner quote nodes. The original per-type walks did NOT stop at
      // heredoc_redirect (not in their type sets), so they recursed here.
      break
    }
  }

  for (const child of node.children) {
    if (child) collectQuoteSpans(child, out, inDouble)
  }
}

/**
 * Builds a Set of all character positions covered by the given spans.
 * Each span is half-open: `start` is included, `end` is not.
 */
function buildPositionSet(spans: Array<[number, number]>): Set<number> {
  const set = new Set<number>()
  for (const [start, end] of spans) {
    for (let i = start; i < end; i++) {
      set.add(i)
    }
  }
  return set
}

/**
 * Drops spans that are fully contained within another span, keeping only the
 * outermost. Nested quotes (e.g., `"$(echo 'hi')"`) yield overlapping spans
 * — the inner raw_string is found by recursing into the outer string node.
 * Processing overlapping spans corrupts indices since removing/replacing the
 * outer span shifts the inner span's start/end into stale positions.
 *
 * Containment must be strict: two spans with identical bounds do not drop
 * each other. Returns a new array in the input's order; the input is not
 * mutated.
 */
function dropContainedSpans<T extends readonly [number, number, ...unknown[]]>(
  spans: T[],
): T[] {
  return spans.filter(
    (s, i) =>
      !spans.some(
        (other, j) =>
          j !== i &&
          other[0] <= s[0] &&
          other[1] >= s[1] &&
          (other[0] < s[0] || other[1] > s[1]),
      ),
  )
}

/**
 * Removes spans from a string, returning the string with those character
 * ranges removed.
 */
function removeSpans(command: string, spans: Array<[number, number]>): string {
  if (spans.length === 0) return command

  // Drop inner spans that are fully contained in an outer one, then sort by
  // start index descending so we can splice without offset shifts.
  // dropContainedSpans returns a fresh array, so sorting in place is safe.
  const sorted = dropContainedSpans(spans).sort((a, b) => b[0] - a[0])
  let result = command
  for (const [start, end] of sorted) {
    result = result.slice(0, start) + result.slice(end)
  }
  return result
}

/**
 * Replaces spans with just the quote delimiters (preserving ' and " characters).
 * Each span carries its own opening and closing delimiter text as the third
 * and fourth tuple elements.
 */
function replaceSpansKeepQuotes(
  command: string,
  spans: Array<[number, number, string, string]>,
): string {
  if (spans.length === 0) return command

  // Same ordering trick as removeSpans: rightmost span first, so earlier
  // indices stay valid while the string is rewritten.
  const sorted = dropContainedSpans(spans).sort((a, b) => b[0] - a[0])
  let result = command
  for (const [start, end, open, close] of sorted) {
    // Replace content but keep the quote delimiters
    result = result.slice(0, start) + open + close + result.slice(end)
  }
  return result
}

/**
 * Extract quote context from the tree-sitter AST.
 * Replaces the manual character-by-character extractQuotedContent() function.
 *
 * Tree-sitter node types:
 * - raw_string: single-quoted ('...')
 * - string: double-quoted ("...")
 * - ansi_c_string: ANSI-C quoting ($'...') — span includes the leading $
 * - heredoc_redirect: QUOTED heredocs only (<<'EOF', <<"EOF", <<\EOF) —
 *   the full redirect span (<<, delimiters, body, newlines) is stripped
 *   since the body is literal text in bash (no expansion). UNQUOTED
 *   heredocs (<<EOF) are left in place since bash expands $(...)/${...}
 *   inside them, and validators need to see those patterns. Matches the
 *   sync path's extractHeredocs({ quotedOnly: true }).
 *
 * @param rootNode Root of the parse tree; treated as a TreeSitterNode.
 * @param command The exact text the tree was parsed from. Node indices are
 *   used directly as indices into this string.
 * @returns The three quote-stripped views described on QuoteContext.
 */
export function extractQuoteContext(
  rootNode: unknown,
  command: string,
): QuoteContext {
  // Single walk collects all quote span types at once.
  const spans: QuoteSpans = { raw: [], ansiC: [], double: [], heredoc: [] }
  collectQuoteSpans(rootNode as TreeSitterNode, spans, false)
  const singleQuoteSpans = spans.raw
  const ansiCSpans = spans.ansiC
  const doubleQuoteSpans = spans.double
  const quotedHeredocSpans = spans.heredoc
  const allQuoteSpans = [
    ...singleQuoteSpans,
    ...ansiCSpans,
    ...doubleQuoteSpans,
    ...quotedHeredocSpans,
  ]

  // Build a set of positions that should be excluded for each output variant.
  // For withDoubleQuotes: remove single-quoted spans entirely, plus the
  // opening/closing `"` delimiters of double-quoted spans (but keep the
  // content between them). This matches the regex extractQuotedContent()
  // semantics where `"` toggles quote state but content is still emitted.
  const singleQuoteSet = buildPositionSet([
    ...singleQuoteSpans,
    ...ansiCSpans,
    ...quotedHeredocSpans,
  ])
  const doubleQuoteDelimSet = new Set<number>()
  for (const [start, end] of doubleQuoteSpans) {
    // Only drop positions that really hold a `"`. If a span has no closing
    // quote (e.g. the parser recovered from an unterminated string), the
    // last position is content, and validators must still see it.
    if (command[start] === '"') doubleQuoteDelimSet.add(start) // opening "
    if (end - 1 > start && command[end - 1] === '"') {
      doubleQuoteDelimSet.add(end - 1) // closing "
    }
  }
  let withDoubleQuotes = ''
  for (let i = 0; i < command.length; i++) {
    if (singleQuoteSet.has(i)) continue
    if (doubleQuoteDelimSet.has(i)) continue
    withDoubleQuotes += command[i]
  }

  // fullyUnquoted: remove all quoted content
  const fullyUnquoted = removeSpans(command, allQuoteSpans)

  // unquotedKeepQuoteChars: remove content but keep delimiter chars
  const spansWithQuoteChars: Array<[number, number, string, string]> = []
  for (const [start, end] of singleQuoteSpans) {
    spansWithQuoteChars.push([start, end, "'", "'"])
  }
  for (const [start, end] of ansiCSpans) {
    // ansi_c_string spans include the leading $; preserve it so this
    // matches the regex path, which treats $ as unquoted preceding '.
    spansWithQuoteChars.push([start, end, "$'", "'"])
  }
  for (const [start, end] of doubleQuoteSpans) {
    spansWithQuoteChars.push([start, end, '"', '"'])
  }
  for (const [start, end] of quotedHeredocSpans) {
    // Heredoc redirect spans have no inline quote delimiters — strip entirely.
    spansWithQuoteChars.push([start, end, '', ''])
  }
  const unquotedKeepQuoteChars = replaceSpansKeepQuotes(
    command,
    spansWithQuoteChars,
  )

  return { withDoubleQuotes, fullyUnquoted, unquotedKeepQuoteChars }
}

/**
 * Extract compound command structure from the AST.
 * Replaces isUnsafeCompoundCommand() and splitCommand() for tree-sitter path.
 *
 * @param rootNode Root of the parse tree; treated as a TreeSitterNode.
 * @param command Original command text; used as the single segment when the
 *   walk records none.
 * @returns Operators and segments in visit order, plus structure flags.
 */
export function extractCompoundStructure(
  rootNode: unknown,
  command: string,
): CompoundStructure {
  const root = rootNode as TreeSitterNode
  const operators: string[] = []
  const segments: string[] = []
  let hasSubshell = false
  let hasCommandGroup = false
  let hasPipeline = false

  // Classify each non-null child of `node` as a top-level statement.
  function walkTopLevel(node: TreeSitterNode): void {
    for (const child of node.children) {
      if (child) classifyStatement(child)
    }
  }

  // Handle one direct child of a `list` node (list nodes hold && and ||).
  function classifyListChild(listChild: TreeSitterNode): void {
    switch (listChild.type) {
      case '&&':
      case '||':
        operators.push(listChild.type)
        break
      case 'list':
      case 'redirected_statement':
        // Nested list, or redirected_statement wrapping a list/pipeline —
        // recurse so inner operators/pipelines are detected. For
        // `cmd1 && cmd2 2>/dev/null && cmd3`, the redirected_statement
        // wraps `list(cmd1 && cmd2)` — the inner `&&` would be missed
        // without recursion.
        classifyStatement(listChild)
        break
      case 'pipeline':
        hasPipeline = true
        segments.push(listChild.text)
        break
      case 'subshell':
        hasSubshell = true
        segments.push(listChild.text)
        break
      case 'compound_statement':
        hasCommandGroup = true
        segments.push(listChild.text)
        break
      default:
        // Anything else inside a list is recorded as one opaque segment.
        // NOTE(review): unlike the top-level path, negated commands and
        // control-flow constructs are not recursed into here — confirm
        // that this difference is intended.
        segments.push(listChild.text)
    }
  }

  // Handle one statement-level node. Node types not listed are ignored.
  function classifyStatement(child: TreeSitterNode): void {
    switch (child.type) {
      case 'list':
        for (const listChild of child.children) {
          if (listChild) classifyListChild(listChild)
        }
        break
      case ';':
        operators.push(';')
        break
      case 'pipeline':
        hasPipeline = true
        segments.push(child.text)
        break
      case 'subshell':
        hasSubshell = true
        segments.push(child.text)
        break
      case 'compound_statement':
        hasCommandGroup = true
        segments.push(child.text)
        break
      case 'command':
      case 'declaration_command':
      case 'variable_assignment':
        segments.push(child.text)
        break
      case 'redirected_statement': {
        // `cd ~/src && find path 2>/dev/null` — tree-sitter wraps the ENTIRE
        // compound in a redirected_statement: program → redirected_statement →
        // (list → cmd1, &&, cmd2) + file_redirect. Same for `cmd1 | cmd2 > out`
        // (wraps pipeline) and `(cmd) > out` (wraps subshell). Recurse to
        // detect the inner structure; skip file_redirect children (redirects
        // don't affect compound/pipeline classification).
        let foundInner = false
        for (const inner of child.children) {
          if (!inner || inner.type === 'file_redirect') continue
          foundInner = true
          classifyStatement(inner)
        }
        if (!foundInner) {
          // Standalone redirect with no body (shouldn't happen, but fail-safe)
          segments.push(child.text)
        }
        break
      }
      case 'negated_command':
      // `! cmd` — recurse into the inner command so its structure is
      // classified (pipeline/subshell/etc.), but also record the full
      // negated text as a segment so segments.length stays meaningful.
      case 'if_statement':
      case 'while_statement':
      case 'for_statement':
      case 'case_statement':
      case 'function_definition':
        // Control-flow constructs: the construct itself is one segment,
        // but recurse so inner pipelines/subshells/operators are detected.
        segments.push(child.text)
        walkTopLevel(child)
        break
    }
  }

  walkTopLevel(root)

  // If no segments found, the whole command is one segment
  if (segments.length === 0) {
    segments.push(command)
  }

  return {
    hasCompoundOperators: operators.length > 0,
    hasPipeline,
    hasSubshell,
    hasCommandGroup,
    operators,
    segments,
  }
}

/**
 * Check whether the AST contains actual operator nodes (;, &&, ||).
 *
 * This is the key function for eliminating the `find -exec \;` false positive.
 * Tree-sitter parses `\;` as part of a `word` node (an argument to find),
 * NOT as a `;` operator. So if no actual `;` operator nodes exist in the AST,
 * there are no compound operators and hasBackslashEscapedOperator() can be skipped.
 *
 * The check is by node type only and is deliberately broad: a `;`, `&&`,
 * `||` or `list` node anywhere in the tree yields true, whatever its parent.
 */
export function hasActualOperatorNodes(rootNode: unknown): boolean {
  const root = rootNode as TreeSitterNode

  function walk(node: TreeSitterNode): boolean {
    // Operator tokens that indicate compound commands. The parent is not
    // inspected; any occurrence counts.
    if (node.type === ';' || node.type === '&&' || node.type === '||') {
      return true
    }

    if (node.type === 'list') {
      // A list node means there are compound operators
      return true
    }

    for (const child of node.children) {
      if (child && walk(child)) return true
    }
    return false
  }

  return walk(root)
}

/**
 * Extract dangerous pattern information from the AST.
 *
 * Walks the entire tree and sets a flag for each node type of interest:
 * command_substitution, process_substitution, expansion, heredoc_redirect
 * and comment. Flags are purely type-based; quoting context is not considered.
 */
export function extractDangerousPatterns(rootNode: unknown): DangerousPatterns {
  const root = rootNode as TreeSitterNode
  let hasCommandSubstitution = false
  let hasProcessSubstitution = false
  let hasParameterExpansion = false
  let hasHeredoc = false
  let hasComment = false

  function walk(node: TreeSitterNode): void {
    switch (node.type) {
      case 'command_substitution':
        hasCommandSubstitution = true
        break
      case 'process_substitution':
        hasProcessSubstitution = true
        break
      case 'expansion':
        hasParameterExpansion = true
        break
      case 'heredoc_redirect':
        hasHeredoc = true
        break
      case 'comment':
        hasComment = true
        break
    }

    // Always descend: flagged nodes can contain further flagged nodes.
    for (const child of node.children) {
      if (child) walk(child)
    }
  }

  walk(root)

  return {
    hasCommandSubstitution,
    hasProcessSubstitution,
    hasParameterExpansion,
    hasHeredoc,
    hasComment,
  }
}

/**
 * Perform complete tree-sitter analysis of a command.
 * Runs each extractor over the tree (one walk per extractor) and bundles
 * the results into a single object.
 *
 * NOTE(review): an earlier comment here said this data must be extracted
 * before tree.delete() is called, while the file header says the native
 * parser returns plain JS objects that need no cleanup — confirm which
 * still applies.
 */
export function analyzeCommand(
  rootNode: unknown,
  command: string,
): TreeSitterAnalysis {
  return {
    quoteContext: extractQuoteContext(rootNode, command),
    compoundStructure: extractCompoundStructure(rootNode, command),
    hasActualOperatorNodes: hasActualOperatorNodes(rootNode),
    dangerousPatterns: extractDangerousPatterns(rootNode),
  }
}