we (web engine): Experimental web browser project to understand the limits of Claude
at encoding-sniffing 1370 lines 44 kB view raw
1//! Pure-Rust regular expression engine for the JavaScript RegExp built-in. 2//! 3//! Implements a regex parser (pattern → IR) and a backtracking matcher. 4//! Supports: character classes, quantifiers (greedy/lazy), anchors, groups 5//! (capturing, non-capturing, named), alternation, backreferences, lookahead, 6//! and standard escape sequences. 7 8// ── Regex AST ─────────────────────────────────────────────── 9 10/// A parsed regex node. 11#[derive(Debug, Clone)] 12pub enum Node { 13 /// Match a single literal character. 14 Literal(char), 15 /// `.` — match any character (respects dotAll flag). 16 Dot, 17 /// `^` — start anchor. 18 Start, 19 /// `$` — end anchor. 20 End, 21 /// `\b` — word boundary. 22 WordBoundary, 23 /// `\B` — non-word boundary. 24 NonWordBoundary, 25 /// Character class `[...]` or `[^...]`. 26 CharClass { 27 negated: bool, 28 ranges: Vec<ClassRange>, 29 }, 30 /// Alternation `a|b`. 31 Alternation(Vec<Node>), 32 /// Sequence of nodes (implicit concatenation). 33 Sequence(Vec<Node>), 34 /// Quantifier applied to a sub-node. 35 Quantifier { 36 node: Box<Node>, 37 min: u32, 38 max: Option<u32>, // None = unbounded 39 greedy: bool, 40 }, 41 /// Capturing group `(...)`. 42 Group { 43 index: u32, 44 name: Option<String>, 45 node: Box<Node>, 46 }, 47 /// Non-capturing group `(?:...)`. 48 NonCapturingGroup(Box<Node>), 49 /// Backreference `\1` or `\k<name>`. 50 Backref(u32), 51 /// Positive lookahead `(?=...)`. 52 Lookahead(Box<Node>), 53 /// Negative lookahead `(?!...)`. 54 NegativeLookahead(Box<Node>), 55} 56 57/// A range within a character class. 58#[derive(Debug, Clone)] 59pub enum ClassRange { 60 /// Single character. 61 Char(char), 62 /// Character range `a-z`. 63 Range(char, char), 64 /// Predefined class like `\d`, `\w`, `\s`. 65 Predefined(PredefinedClass), 66} 67 68/// Predefined character classes. 69#[derive(Debug, Clone, Copy)] 70pub enum PredefinedClass { 71 Digit, // \d 72 NonDigit, // \D 73 Word, // \w 74 NonWord, // \W 75 Space, // \s 76 NonSpace, // \S 77} 78 79// ── Regex flags ───────────────────────────────────────────── 80 81/// Parsed regex flags. 82#[derive(Debug, Clone, Copy, Default)] 83pub struct RegexFlags { 84 pub global: bool, 85 pub ignore_case: bool, 86 pub multiline: bool, 87 pub dot_all: bool, 88 pub unicode: bool, 89 pub sticky: bool, 90} 91 92impl RegexFlags { 93 pub fn parse(flags_str: &str) -> Result<Self, String> { 94 let mut f = RegexFlags::default(); 95 for ch in flags_str.chars() { 96 match ch { 97 'g' => { 98 if f.global { 99 return Err(format!("duplicate flag '{}'", ch)); 100 } 101 f.global = true; 102 } 103 'i' => { 104 if f.ignore_case { 105 return Err(format!("duplicate flag '{}'", ch)); 106 } 107 f.ignore_case = true; 108 } 109 'm' => { 110 if f.multiline { 111 return Err(format!("duplicate flag '{}'", ch)); 112 } 113 f.multiline = true; 114 } 115 's' => { 116 if f.dot_all { 117 return Err(format!("duplicate flag '{}'", ch)); 118 } 119 f.dot_all = true; 120 } 121 'u' => { 122 if f.unicode { 123 return Err(format!("duplicate flag '{}'", ch)); 124 } 125 f.unicode = true; 126 } 127 'y' => { 128 if f.sticky { 129 return Err(format!("duplicate flag '{}'", ch)); 130 } 131 f.sticky = true; 132 } 133 _ => return Err(format!("invalid flag '{}'", ch)), 134 } 135 } 136 Ok(f) 137 } 138 139 pub fn as_flag_string(&self) -> String { 140 let mut s = String::new(); 141 if self.global { 142 s.push('g'); 143 } 144 if self.ignore_case { 145 s.push('i'); 146 } 147 if self.multiline { 148 s.push('m'); 149 } 150 if self.dot_all { 151 s.push('s'); 152 } 153 if self.unicode { 154 s.push('u'); 155 } 156 if self.sticky { 157 s.push('y'); 158 } 159 s 160 } 161} 162 163// ── Regex Parser ──────────────────────────────────────────── 164 165/// Parse a regex pattern string into a `Node` tree. 166pub fn parse_pattern(pattern: &str) -> Result<(Node, u32), String> { 167 let chars: Vec<char> = pattern.chars().collect(); 168 let mut parser = PatternParser { 169 chars: &chars, 170 pos: 0, 171 group_count: 0, 172 group_names: Vec::new(), 173 }; 174 let node = parser.parse_alternation()?; 175 if parser.pos < parser.chars.len() { 176 return Err(format!( 177 "unexpected '{}' at position {}", 178 parser.chars[parser.pos], parser.pos 179 )); 180 } 181 Ok((node, parser.group_count)) 182} 183 184struct PatternParser<'a> { 185 chars: &'a [char], 186 pos: usize, 187 group_count: u32, 188 group_names: Vec<(String, u32)>, 189} 190 191impl<'a> PatternParser<'a> { 192 fn peek(&self) -> Option<char> { 193 self.chars.get(self.pos).copied() 194 } 195 196 fn advance(&mut self) -> Option<char> { 197 let ch = self.chars.get(self.pos).copied(); 198 if ch.is_some() { 199 self.pos += 1; 200 } 201 ch 202 } 203 204 fn expect(&mut self, expected: char) -> Result<(), String> { 205 match self.advance() { 206 Some(ch) if ch == expected => Ok(()), 207 Some(ch) => Err(format!("expected '{}', got '{}'", expected, ch)), 208 None => Err(format!("expected '{}', got end of pattern", expected)), 209 } 210 } 211 212 /// Parse alternation: `a|b|c` 213 fn parse_alternation(&mut self) -> Result<Node, String> { 214 let mut branches = vec![self.parse_sequence()?]; 215 while self.peek() == Some('|') { 216 self.advance(); // consume '|' 217 branches.push(self.parse_sequence()?); 218 } 219 if branches.len() == 1 { 220 Ok(branches.pop().unwrap()) 221 } else { 222 Ok(Node::Alternation(branches)) 223 } 224 } 225 226 /// Parse a sequence of atoms (concatenation). 227 fn parse_sequence(&mut self) -> Result<Node, String> { 228 let mut nodes = Vec::new(); 229 while let Some(ch) = self.peek() { 230 if ch == '|' || ch == ')' { 231 break; 232 } 233 nodes.push(self.parse_quantified()?); 234 } 235 if nodes.len() == 1 { 236 Ok(nodes.pop().unwrap()) 237 } else { 238 Ok(Node::Sequence(nodes)) 239 } 240 } 241 242 /// Parse an atom with optional quantifier. 243 fn parse_quantified(&mut self) -> Result<Node, String> { 244 let node = self.parse_atom()?; 245 if let Some(ch) = self.peek() { 246 match ch { 247 '*' | '+' | '?' => { 248 self.advance(); 249 let (min, max) = match ch { 250 '*' => (0, None), 251 '+' => (1, None), 252 '?' => (0, Some(1)), 253 _ => unreachable!(), 254 }; 255 let greedy = if self.peek() == Some('?') { 256 self.advance(); 257 false 258 } else { 259 true 260 }; 261 Ok(Node::Quantifier { 262 node: Box::new(node), 263 min, 264 max, 265 greedy, 266 }) 267 } 268 '{' => self.parse_brace_quantifier(node), 269 _ => Ok(node), 270 } 271 } else { 272 Ok(node) 273 } 274 } 275 276 /// Parse `{n}`, `{n,}`, `{n,m}` quantifier. 277 fn parse_brace_quantifier(&mut self, node: Node) -> Result<Node, String> { 278 let save = self.pos; 279 self.advance(); // consume '{' 280 let min = match self.parse_decimal() { 281 Some(n) => n, 282 None => { 283 // Not a valid quantifier — treat '{' as literal. 284 self.pos = save; 285 return Ok(node); 286 } 287 }; 288 let max; 289 match self.peek() { 290 Some('}') => { 291 self.advance(); 292 max = Some(min); 293 } 294 Some(',') => { 295 self.advance(); 296 if self.peek() == Some('}') { 297 self.advance(); 298 max = None; 299 } else { 300 match self.parse_decimal() { 301 Some(n) => { 302 max = Some(n); 303 if self.peek() != Some('}') { 304 self.pos = save; 305 return Ok(node); 306 } 307 self.advance(); 308 } 309 None => { 310 self.pos = save; 311 return Ok(node); 312 } 313 } 314 } 315 } 316 _ => { 317 self.pos = save; 318 return Ok(node); 319 } 320 } 321 let greedy = if self.peek() == Some('?') { 322 self.advance(); 323 false 324 } else { 325 true 326 }; 327 Ok(Node::Quantifier { 328 node: Box::new(node), 329 min, 330 max, 331 greedy, 332 }) 333 } 334 335 fn parse_decimal(&mut self) -> Option<u32> { 336 let start = self.pos; 337 while let Some(ch) = self.peek() { 338 if ch.is_ascii_digit() { 339 self.advance(); 340 } else { 341 break; 342 } 343 } 344 if self.pos == start { 345 return None; 346 } 347 let s: String = self.chars[start..self.pos].iter().collect(); 348 s.parse().ok() 349 } 350 351 /// Parse a single atom. 352 fn parse_atom(&mut self) -> Result<Node, String> { 353 match self.peek() { 354 None => Err("unexpected end of pattern".to_string()), 355 Some('.') => { 356 self.advance(); 357 Ok(Node::Dot) 358 } 359 Some('^') => { 360 self.advance(); 361 Ok(Node::Start) 362 } 363 Some('$') => { 364 self.advance(); 365 Ok(Node::End) 366 } 367 Some('\\') => self.parse_escape(), 368 Some('[') => self.parse_char_class(), 369 Some('(') => self.parse_group(), 370 Some(ch) => { 371 self.advance(); 372 Ok(Node::Literal(ch)) 373 } 374 } 375 } 376 377 /// Parse an escape sequence. 378 fn parse_escape(&mut self) -> Result<Node, String> { 379 self.advance(); // consume '\' 380 match self.advance() { 381 None => Err("unexpected end of pattern after '\\'".to_string()), 382 Some('d') => Ok(Node::CharClass { 383 negated: false, 384 ranges: vec![ClassRange::Predefined(PredefinedClass::Digit)], 385 }), 386 Some('D') => Ok(Node::CharClass { 387 negated: false, 388 ranges: vec![ClassRange::Predefined(PredefinedClass::NonDigit)], 389 }), 390 Some('w') => Ok(Node::CharClass { 391 negated: false, 392 ranges: vec![ClassRange::Predefined(PredefinedClass::Word)], 393 }), 394 Some('W') => Ok(Node::CharClass { 395 negated: false, 396 ranges: vec![ClassRange::Predefined(PredefinedClass::NonWord)], 397 }), 398 Some('s') => Ok(Node::CharClass { 399 negated: false, 400 ranges: vec![ClassRange::Predefined(PredefinedClass::Space)], 401 }), 402 Some('S') => Ok(Node::CharClass { 403 negated: false, 404 ranges: vec![ClassRange::Predefined(PredefinedClass::NonSpace)], 405 }), 406 Some('b') => Ok(Node::WordBoundary), 407 Some('B') => Ok(Node::NonWordBoundary), 408 Some('n') => Ok(Node::Literal('\n')), 409 Some('r') => Ok(Node::Literal('\r')), 410 Some('t') => Ok(Node::Literal('\t')), 411 Some('f') => Ok(Node::Literal('\x0C')), 412 Some('v') => Ok(Node::Literal('\x0B')), 413 Some('0') => Ok(Node::Literal('\0')), 414 Some('x') => { 415 let hi = self.advance().ok_or("expected hex digit")?; 416 let lo = self.advance().ok_or("expected hex digit")?; 417 let code = hex2(hi, lo)?; 418 Ok(Node::Literal(char::from(code))) 419 } 420 Some('u') => { 421 if self.peek() == Some('{') { 422 self.advance(); 423 let mut code_str = String::new(); 424 while self.peek() != Some('}') { 425 code_str.push(self.advance().ok_or("expected '}'")?); 426 } 427 self.advance(); // consume '}' 428 let code = 429 u32::from_str_radix(&code_str, 16).map_err(|_| "invalid unicode escape")?; 430 let ch = char::from_u32(code).ok_or("invalid unicode code point")?; 431 Ok(Node::Literal(ch)) 432 } else { 433 let a = self.advance().ok_or("expected hex digit")?; 434 let b = self.advance().ok_or("expected hex digit")?; 435 let c = self.advance().ok_or("expected hex digit")?; 436 let d = self.advance().ok_or("expected hex digit")?; 437 let code = hex4(a, b, c, d)?; 438 let ch = char::from_u32(code).ok_or("invalid unicode escape")?; 439 Ok(Node::Literal(ch)) 440 } 441 } 442 Some('k') => { 443 // Named backreference \k<name> 444 self.expect('<')?; 445 let mut name = String::new(); 446 while self.peek() != Some('>') { 447 name.push(self.advance().ok_or("expected '>'")?); 448 } 449 self.advance(); // consume '>' 450 // Resolve name to group index. 451 for &(ref n, idx) in &self.group_names { 452 if *n == name { 453 return Ok(Node::Backref(idx)); 454 } 455 } 456 Err(format!("unknown group name '{}'", name)) 457 } 458 Some(ch) if ch.is_ascii_digit() && ch != '0' => { 459 // Numeric backreference \1, \12, etc. 460 let mut num_str = String::new(); 461 num_str.push(ch); 462 while let Some(next) = self.peek() { 463 if next.is_ascii_digit() { 464 num_str.push(next); 465 self.advance(); 466 } else { 467 break; 468 } 469 } 470 let idx: u32 = num_str.parse().map_err(|_| "invalid backreference")?; 471 Ok(Node::Backref(idx)) 472 } 473 // Escaped metacharacters — treat as literal. 474 Some(ch) => Ok(Node::Literal(ch)), 475 } 476 } 477 478 /// Parse a character class `[...]` or `[^...]`. 479 fn parse_char_class(&mut self) -> Result<Node, String> { 480 self.advance(); // consume '[' 481 let negated = if self.peek() == Some('^') { 482 self.advance(); 483 true 484 } else { 485 false 486 }; 487 let mut ranges = Vec::new(); 488 // Allow ']' as first character in class. 489 if self.peek() == Some(']') { 490 self.advance(); 491 ranges.push(ClassRange::Char(']')); 492 } 493 while self.peek() != Some(']') { 494 if self.peek().is_none() { 495 return Err("unterminated character class".to_string()); 496 } 497 let item = self.parse_class_atom()?; 498 // Check for range `a-b`. 499 if self.peek() == Some('-') 500 && self.chars.get(self.pos + 1) != Some(&']') 501 && self.chars.get(self.pos + 1).is_some() 502 { 503 if let ClassRange::Char(start) = item { 504 self.advance(); // consume '-' 505 let end_item = self.parse_class_atom()?; 506 if let ClassRange::Char(end) = end_item { 507 if start > end { 508 return Err("character class range out of order".to_string()); 509 } 510 ranges.push(ClassRange::Range(start, end)); 511 continue; 512 } else { 513 // Not a valid range, push start, '-', and end_item. 514 ranges.push(ClassRange::Char(start)); 515 ranges.push(ClassRange::Char('-')); 516 ranges.push(end_item); 517 continue; 518 } 519 } 520 } 521 ranges.push(item); 522 } 523 self.advance(); // consume ']' 524 Ok(Node::CharClass { negated, ranges }) 525 } 526 527 fn parse_class_atom(&mut self) -> Result<ClassRange, String> { 528 match self.peek() { 529 Some('\\') => { 530 self.advance(); // consume '\' 531 match self.advance() { 532 None => Err("unexpected end of class".to_string()), 533 Some('d') => Ok(ClassRange::Predefined(PredefinedClass::Digit)), 534 Some('D') => Ok(ClassRange::Predefined(PredefinedClass::NonDigit)), 535 Some('w') => Ok(ClassRange::Predefined(PredefinedClass::Word)), 536 Some('W') => Ok(ClassRange::Predefined(PredefinedClass::NonWord)), 537 Some('s') => Ok(ClassRange::Predefined(PredefinedClass::Space)), 538 Some('S') => Ok(ClassRange::Predefined(PredefinedClass::NonSpace)), 539 Some('n') => Ok(ClassRange::Char('\n')), 540 Some('r') => Ok(ClassRange::Char('\r')), 541 Some('t') => Ok(ClassRange::Char('\t')), 542 Some('f') => Ok(ClassRange::Char('\x0C')), 543 Some('v') => Ok(ClassRange::Char('\x0B')), 544 Some('0') => Ok(ClassRange::Char('\0')), 545 Some('x') => { 546 let hi = self.advance().ok_or("expected hex digit")?; 547 let lo = self.advance().ok_or("expected hex digit")?; 548 let code = hex2(hi, lo)?; 549 Ok(ClassRange::Char(char::from(code))) 550 } 551 Some('u') => { 552 if self.peek() == Some('{') { 553 self.advance(); 554 let mut code_str = String::new(); 555 while self.peek() != Some('}') { 556 code_str.push(self.advance().ok_or("expected '}'")?); 557 } 558 self.advance(); 559 let code = u32::from_str_radix(&code_str, 16) 560 .map_err(|_| "invalid unicode escape")?; 561 let ch = char::from_u32(code) 562 .ok_or_else(|| "invalid unicode code point".to_string())?; 563 Ok(ClassRange::Char(ch)) 564 } else { 565 let a = self.advance().ok_or("expected hex digit")?; 566 let b = self.advance().ok_or("expected hex digit")?; 567 let c = self.advance().ok_or("expected hex digit")?; 568 let d = self.advance().ok_or("expected hex digit")?; 569 let code = hex4(a, b, c, d)?; 570 let ch = char::from_u32(code) 571 .ok_or_else(|| "invalid unicode escape".to_string())?; 572 Ok(ClassRange::Char(ch)) 573 } 574 } 575 Some(ch) => Ok(ClassRange::Char(ch)), 576 } 577 } 578 Some(ch) => { 579 self.advance(); 580 Ok(ClassRange::Char(ch)) 581 } 582 None => Err("unexpected end of character class".to_string()), 583 } 584 } 585 586 /// Parse a group `(...)`. 587 fn parse_group(&mut self) -> Result<Node, String> { 588 self.advance(); // consume '(' 589 if self.peek() == Some('?') { 590 self.advance(); // consume '?' 591 match self.peek() { 592 Some(':') => { 593 self.advance(); 594 let inner = self.parse_alternation()?; 595 self.expect(')')?; 596 Ok(Node::NonCapturingGroup(Box::new(inner))) 597 } 598 Some('=') => { 599 self.advance(); 600 let inner = self.parse_alternation()?; 601 self.expect(')')?; 602 Ok(Node::Lookahead(Box::new(inner))) 603 } 604 Some('!') => { 605 self.advance(); 606 let inner = self.parse_alternation()?; 607 self.expect(')')?; 608 Ok(Node::NegativeLookahead(Box::new(inner))) 609 } 610 Some('<') => { 611 self.advance(); // consume '<' 612 let mut name = String::new(); 613 while self.peek() != Some('>') { 614 name.push(self.advance().ok_or("expected '>'")?); 615 } 616 self.advance(); // consume '>' 617 self.group_count += 1; 618 let idx = self.group_count; 619 self.group_names.push((name.clone(), idx)); 620 let inner = self.parse_alternation()?; 621 self.expect(')')?; 622 Ok(Node::Group { 623 index: idx, 624 name: Some(name), 625 node: Box::new(inner), 626 }) 627 } 628 _ => Err("invalid group specifier".to_string()), 629 } 630 } else { 631 self.group_count += 1; 632 let idx = self.group_count; 633 let inner = self.parse_alternation()?; 634 self.expect(')')?; 635 Ok(Node::Group { 636 index: idx, 637 name: None, 638 node: Box::new(inner), 639 }) 640 } 641 } 642} 643 644// ── Hex helpers ───────────────────────────────────────────── 645 646fn hex_digit(ch: char) -> Result<u32, String> { 647 match ch { 648 '0'..='9' => Ok(ch as u32 - '0' as u32), 649 'a'..='f' => Ok(ch as u32 - 'a' as u32 + 10), 650 'A'..='F' => Ok(ch as u32 - 'A' as u32 + 10), 651 _ => Err(format!("invalid hex digit '{}'", ch)), 652 } 653} 654 655fn hex2(hi: char, lo: char) -> Result<u8, String> { 656 Ok((hex_digit(hi)? * 16 + hex_digit(lo)?) as u8) 657} 658 659fn hex4(a: char, b: char, c: char, d: char) -> Result<u32, String> { 660 Ok(hex_digit(a)? * 4096 + hex_digit(b)? * 256 + hex_digit(c)? * 16 + hex_digit(d)?) 661} 662 663// ── Compiled regex ────────────────────────────────────────── 664 665/// A compiled regular expression ready for matching. 666#[derive(Debug, Clone)] 667pub struct CompiledRegex { 668 pub pattern: String, 669 pub flags: RegexFlags, 670 pub node: Node, 671 pub group_count: u32, 672} 673 674impl CompiledRegex { 675 pub fn new(pattern: &str, flags_str: &str) -> Result<Self, String> { 676 let flags = RegexFlags::parse(flags_str)?; 677 let (node, group_count) = parse_pattern(pattern)?; 678 Ok(CompiledRegex { 679 pattern: pattern.to_string(), 680 flags, 681 node, 682 group_count, 683 }) 684 } 685} 686 687// ── Backtracking matcher ──────────────────────────────────── 688 689/// Result of a successful regex match. 690#[derive(Debug, Clone)] 691pub struct MatchResult { 692 /// Overall match start index (in chars). 693 pub start: usize, 694 /// Overall match end index (in chars). 695 pub end: usize, 696 /// Capture group contents: index 0 = full match, 1..n = groups. 697 pub captures: Vec<Option<(usize, usize)>>, 698} 699 700/// Execute the regex on the given string, starting the search at `start_pos` (char index). 701pub fn exec(regex: &CompiledRegex, input: &str, start_pos: usize) -> Option<MatchResult> { 702 let chars: Vec<char> = input.chars().collect(); 703 let group_count = regex.group_count as usize; 704 705 if regex.flags.sticky { 706 // Sticky: only try at start_pos. 707 let mut captures = vec![None; group_count + 1]; 708 let mut ctx = MatchContext { 709 chars: &chars, 710 flags: &regex.flags, 711 captures: &mut captures, 712 backtrack_limit: 1_000_000, 713 backtrack_count: 0, 714 }; 715 if let Some(end) = match_node(&mut ctx, &regex.node, start_pos) { 716 captures = ctx.captures.to_vec(); 717 captures[0] = Some((start_pos, end)); 718 return Some(MatchResult { 719 start: start_pos, 720 end, 721 captures, 722 }); 723 } 724 return None; 725 } 726 727 // Non-sticky: try each position starting from start_pos. 728 for i in start_pos..=chars.len() { 729 let mut captures = vec![None; group_count + 1]; 730 let mut ctx = MatchContext { 731 chars: &chars, 732 flags: &regex.flags, 733 captures: &mut captures, 734 backtrack_limit: 1_000_000, 735 backtrack_count: 0, 736 }; 737 if let Some(end) = match_node(&mut ctx, &regex.node, i) { 738 captures = ctx.captures.to_vec(); 739 captures[0] = Some((i, end)); 740 return Some(MatchResult { 741 start: i, 742 end, 743 captures, 744 }); 745 } 746 } 747 None 748} 749 750struct MatchContext<'a> { 751 chars: &'a [char], 752 flags: &'a RegexFlags, 753 captures: &'a mut Vec<Option<(usize, usize)>>, 754 backtrack_limit: u32, 755 backtrack_count: u32, 756} 757 758/// Try to match `node` at position `pos`, returning the end position on success. 759fn match_node(ctx: &mut MatchContext, node: &Node, pos: usize) -> Option<usize> { 760 ctx.backtrack_count += 1; 761 if ctx.backtrack_count > ctx.backtrack_limit { 762 return None; 763 } 764 765 match node { 766 Node::Literal(ch) => { 767 if pos < ctx.chars.len() { 768 let input_ch = ctx.chars[pos]; 769 if ctx.flags.ignore_case { 770 if char_eq_ignore_case(input_ch, *ch) { 771 Some(pos + 1) 772 } else { 773 None 774 } 775 } else if input_ch == *ch { 776 Some(pos + 1) 777 } else { 778 None 779 } 780 } else { 781 None 782 } 783 } 784 785 Node::Dot => { 786 if pos < ctx.chars.len() { 787 let ch = ctx.chars[pos]; 788 if ctx.flags.dot_all || !is_line_terminator(ch) { 789 Some(pos + 1) 790 } else { 791 None 792 } 793 } else { 794 None 795 } 796 } 797 798 Node::Start => { 799 if pos == 0 800 || (ctx.flags.multiline && pos > 0 && is_line_terminator(ctx.chars[pos - 1])) 801 { 802 Some(pos) 803 } else { 804 None 805 } 806 } 807 808 Node::End => { 809 if pos == ctx.chars.len() 810 || (ctx.flags.multiline 811 && pos < ctx.chars.len() 812 && is_line_terminator(ctx.chars[pos])) 813 { 814 Some(pos) 815 } else { 816 None 817 } 818 } 819 820 Node::WordBoundary => { 821 let before = if pos > 0 { 822 is_word_char(ctx.chars[pos - 1]) 823 } else { 824 false 825 }; 826 let after = if pos < ctx.chars.len() { 827 is_word_char(ctx.chars[pos]) 828 } else { 829 false 830 }; 831 if before != after { 832 Some(pos) 833 } else { 834 None 835 } 836 } 837 838 Node::NonWordBoundary => { 839 let before = if pos > 0 { 840 is_word_char(ctx.chars[pos - 1]) 841 } else { 842 false 843 }; 844 let after = if pos < ctx.chars.len() { 845 is_word_char(ctx.chars[pos]) 846 } else { 847 false 848 }; 849 if before == after { 850 Some(pos) 851 } else { 852 None 853 } 854 } 855 856 Node::CharClass { negated, ranges } => { 857 if pos >= ctx.chars.len() { 858 return None; 859 } 860 let ch = ctx.chars[pos]; 861 let matched = class_matches(ranges, ch, ctx.flags.ignore_case); 862 if matched != *negated { 863 Some(pos + 1) 864 } else { 865 None 866 } 867 } 868 869 Node::Sequence(nodes) => match_sequence(ctx, nodes, 0, pos), 870 871 Node::Alternation(branches) => { 872 for branch in branches { 873 let saved = ctx.captures.clone(); 874 if let Some(end) = match_node(ctx, branch, pos) { 875 return Some(end); 876 } 877 *ctx.captures = saved; 878 } 879 None 880 } 881 882 Node::Quantifier { 883 node: inner, 884 min, 885 max, 886 greedy, 887 } => match_quantifier_standalone(ctx, inner, pos, *min, *max, *greedy), 888 889 Node::Group { 890 index, node: inner, .. 891 } => { 892 let idx = *index as usize; 893 let saved = if idx < ctx.captures.len() { 894 ctx.captures[idx] 895 } else { 896 None 897 }; 898 let result = match_node(ctx, inner, pos); 899 if let Some(end) = result { 900 if idx < ctx.captures.len() { 901 ctx.captures[idx] = Some((pos, end)); 902 } 903 Some(end) 904 } else { 905 if idx < ctx.captures.len() { 906 ctx.captures[idx] = saved; 907 } 908 None 909 } 910 } 911 912 Node::NonCapturingGroup(inner) => match_node(ctx, inner, pos), 913 914 Node::Backref(idx) => { 915 let idx = *idx as usize; 916 if idx >= ctx.captures.len() { 917 return Some(pos); // Unmatched backref matches empty. 918 } 919 match ctx.captures[idx] { 920 Some((start, end)) => { 921 let cap_len = end - start; 922 if pos + cap_len > ctx.chars.len() { 923 return None; 924 } 925 for i in 0..cap_len { 926 let a = ctx.chars[start + i]; 927 let b = ctx.chars[pos + i]; 928 if ctx.flags.ignore_case { 929 if !char_eq_ignore_case(a, b) { 930 return None; 931 } 932 } else if a != b { 933 return None; 934 } 935 } 936 Some(pos + cap_len) 937 } 938 None => Some(pos), // Unmatched group — backref matches empty. 939 } 940 } 941 942 Node::Lookahead(inner) => { 943 let saved = ctx.captures.clone(); 944 if match_node(ctx, inner, pos).is_some() { 945 Some(pos) // Lookahead doesn't consume. 946 } else { 947 *ctx.captures = saved; 948 None 949 } 950 } 951 952 Node::NegativeLookahead(inner) => { 953 let saved = ctx.captures.clone(); 954 if match_node(ctx, inner, pos).is_some() { 955 *ctx.captures = saved; 956 None 957 } else { 958 *ctx.captures = saved; 959 Some(pos) 960 } 961 } 962 } 963} 964 965/// Match a sequence of nodes with backtracking support for quantifiers. 966fn match_sequence(ctx: &mut MatchContext, nodes: &[Node], idx: usize, pos: usize) -> Option<usize> { 967 if idx >= nodes.len() { 968 return Some(pos); 969 } 970 971 let node = &nodes[idx]; 972 973 // For quantifiers, try each count with the remaining sequence as continuation. 974 if let Node::Quantifier { 975 node: inner, 976 min, 977 max, 978 greedy, 979 } = node 980 { 981 return match_quantifier_in_seq(ctx, inner, pos, *min, *max, *greedy, &nodes[idx + 1..]); 982 } 983 984 // For non-quantifier nodes, match and continue. 985 let saved = ctx.captures.clone(); 986 match match_node(ctx, node, pos) { 987 Some(next) => match_sequence(ctx, nodes, idx + 1, next).or_else(|| { 988 *ctx.captures = saved; 989 None 990 }), 991 None => None, 992 } 993} 994 995/// Match a quantifier within a sequence, trying each count with the continuation. 996fn match_quantifier_in_seq( 997 ctx: &mut MatchContext, 998 inner: &Node, 999 pos: usize, 1000 min: u32, 1001 max: Option<u32>, 1002 greedy: bool, 1003 continuation: &[Node], 1004) -> Option<usize> { 1005 // Collect all reachable positions (match inner 0 to max times). 1006 let mut positions = vec![pos]; // positions[0] = 0 matches 1007 let mut cur = pos; 1008 1009 loop { 1010 let count = positions.len() - 1; 1011 if let Some(m) = max { 1012 if count as u32 >= m { 1013 break; 1014 } 1015 } 1016 let saved = ctx.captures.clone(); 1017 match match_node(ctx, inner, cur) { 1018 Some(next) if next > cur => { 1019 cur = next; 1020 positions.push(cur); 1021 } 1022 Some(next) => { 1023 // Zero-width match — record once and stop to prevent infinite loop. 1024 *ctx.captures = saved; 1025 if next == cur && (count as u32) < min { 1026 positions.push(cur); 1027 } 1028 break; 1029 } 1030 None => { 1031 *ctx.captures = saved; 1032 break; 1033 } 1034 } 1035 } 1036 1037 let max_count = positions.len() - 1; 1038 let min_count = min as usize; 1039 if max_count < min_count { 1040 return None; 1041 } 1042 1043 if greedy { 1044 // Try from most matches to fewest. 1045 for &p in positions[min_count..=max_count].iter().rev() { 1046 let saved = ctx.captures.clone(); 1047 if let Some(end) = match_sequence(ctx, continuation, 0, p) { 1048 return Some(end); 1049 } 1050 *ctx.captures = saved; 1051 } 1052 None 1053 } else { 1054 // Lazy: try from fewest to most. 1055 for &p in &positions[min_count..=max_count] { 1056 let saved = ctx.captures.clone(); 1057 if let Some(end) = match_sequence(ctx, continuation, 0, p) { 1058 return Some(end); 1059 } 1060 *ctx.captures = saved; 1061 } 1062 None 1063 } 1064} 1065 1066/// Match a standalone quantifier (not in a sequence context). 1067fn match_quantifier_standalone( 1068 ctx: &mut MatchContext, 1069 inner: &Node, 1070 pos: usize, 1071 min: u32, 1072 max: Option<u32>, 1073 greedy: bool, 1074) -> Option<usize> { 1075 let mut positions = vec![pos]; 1076 let mut cur = pos; 1077 1078 loop { 1079 let count = positions.len() - 1; 1080 if let Some(m) = max { 1081 if count as u32 >= m { 1082 break; 1083 } 1084 } 1085 let saved = ctx.captures.clone(); 1086 match match_node(ctx, inner, cur) { 1087 Some(next) if next > cur => { 1088 cur = next; 1089 positions.push(cur); 1090 } 1091 Some(next) => { 1092 *ctx.captures = saved; 1093 if next == cur && (count as u32) < min { 1094 positions.push(cur); 1095 } 1096 break; 1097 } 1098 None => { 1099 *ctx.captures = saved; 1100 break; 1101 } 1102 } 1103 } 1104 1105 let max_count = positions.len() - 1; 1106 let min_count = min as usize; 1107 if max_count < min_count { 1108 return None; 1109 } 1110 1111 if greedy { 1112 Some(positions[max_count]) 1113 } else { 1114 Some(positions[min_count]) 1115 } 1116} 1117 1118// ── Character helpers ─────────────────────────────────────── 1119 1120fn is_line_terminator(ch: char) -> bool { 1121 matches!(ch, '\n' | '\r' | '\u{2028}' | '\u{2029}') 1122} 1123 1124fn is_word_char(ch: char) -> bool { 1125 ch.is_ascii_alphanumeric() || ch == '_' 1126} 1127 1128fn char_eq_ignore_case(a: char, b: char) -> bool { 1129 a.eq_ignore_ascii_case(&b) 1130} 1131 1132fn class_matches(ranges: &[ClassRange], ch: char, ignore_case: bool) -> bool { 1133 for range in ranges { 1134 match range { 1135 ClassRange::Char(c) => { 1136 if ignore_case { 1137 if char_eq_ignore_case(ch, *c) { 1138 return true; 1139 } 1140 } else if ch == *c { 1141 return true; 1142 } 1143 } 1144 ClassRange::Range(start, end) => { 1145 if ignore_case { 1146 let ch_lower = ch.to_ascii_lowercase(); 1147 let start_lower = start.to_ascii_lowercase(); 1148 let end_lower = end.to_ascii_lowercase(); 1149 if ch_lower >= start_lower && ch_lower <= end_lower { 1150 return true; 1151 } 1152 } else if ch >= *start && ch <= *end { 1153 return true; 1154 } 1155 } 1156 ClassRange::Predefined(class) => { 1157 if predefined_matches(*class, ch) { 1158 return true; 1159 } 1160 } 1161 } 1162 } 1163 false 1164} 1165 1166fn predefined_matches(class: PredefinedClass, ch: char) -> bool { 1167 match class { 1168 PredefinedClass::Digit => ch.is_ascii_digit(), 1169 PredefinedClass::NonDigit => !ch.is_ascii_digit(), 1170 PredefinedClass::Word => is_word_char(ch), 1171 PredefinedClass::NonWord => !is_word_char(ch), 1172 PredefinedClass::Space => matches!( 1173 ch, 1174 ' ' | '\t' | '\n' | '\r' | '\x0B' | '\x0C' | '\u{00A0}' | '\u{FEFF}' 1175 ), 1176 PredefinedClass::NonSpace => !matches!( 1177 ch, 1178 ' ' | '\t' | '\n' | '\r' | '\x0B' | '\x0C' | '\u{00A0}' | '\u{FEFF}' 1179 ), 1180 } 1181} 1182 1183// ── Tests ─────────────────────────────────────────────────── 1184 1185#[cfg(test)] 1186mod tests { 1187 use super::*; 1188 1189 fn match_at(pattern: &str, flags: &str, input: &str, start: usize) -> Option<MatchResult> { 1190 let regex = CompiledRegex::new(pattern, flags).unwrap(); 1191 exec(&regex, input, start) 1192 } 1193 1194 fn assert_match(pattern: &str, input: &str, expected_start: usize, expected_end: usize) { 1195 let m = match_at(pattern, "", input, 0).expect("expected match"); 1196 assert_eq!( 1197 m.start, expected_start, 1198 "pattern={} input={}", 1199 pattern, input 1200 ); 1201 assert_eq!(m.end, expected_end, "pattern={} input={}", pattern, input); 1202 } 1203 1204 fn assert_no_match(pattern: &str, input: &str) { 1205 assert!( 1206 match_at(pattern, "", input, 0).is_none(), 1207 "expected no match: pattern={} input={}", 1208 pattern, 1209 input 1210 ); 1211 } 1212 1213 #[test] 1214 fn test_literal() { 1215 assert_match("abc", "xabcx", 1, 4); 1216 assert_no_match("abc", "xyz"); 1217 } 1218 1219 #[test] 1220 fn test_dot() { 1221 assert_match("a.c", "abc", 0, 3); 1222 assert_match("a.c", "axc", 0, 3); 1223 assert_no_match("a.c", "a\nc"); 1224 // dotAll flag 1225 let m = match_at("a.c", "s", "a\nc", 0).unwrap(); 1226 assert_eq!(m.start, 0); 1227 assert_eq!(m.end, 3); 1228 } 1229 1230 #[test] 1231 fn test_anchors() { 1232 assert_match("^abc", "abc", 0, 3); 1233 assert_no_match("^abc", "xabc"); 1234 assert_match("abc$", "abc", 0, 3); 1235 assert_no_match("abc$", "abcx"); 1236 } 1237 1238 #[test] 1239 fn test_quantifiers() { 1240 assert_match("a*", "aaa", 0, 3); 1241 assert_match("a+", "aaa", 0, 3); 1242 assert_no_match("a+", "bbb"); 1243 assert_match("a?b", "ab", 0, 2); 1244 assert_match("a?b", "b", 0, 1); 1245 assert_match("a{2}", "aaa", 0, 2); 1246 assert_match("a{2,3}", "aaaa", 0, 3); 1247 assert_match("a{2,}", "aaaa", 0, 4); 1248 } 1249 1250 #[test] 1251 fn test_char_class() { 1252 assert_match("[abc]", "b", 0, 1); 1253 assert_no_match("[abc]", "d"); 1254 assert_match("[a-z]", "m", 0, 1); 1255 assert_no_match("[a-z]", "M"); 1256 assert_match("[^abc]", "d", 0, 1); 1257 assert_no_match("[^abc]", "a"); 1258 } 1259 1260 #[test] 1261 fn test_predefined_classes() { 1262 assert_match("\\d+", "abc123", 3, 6); 1263 assert_match("\\w+", "hello world", 0, 5); 1264 assert_match("\\s+", "a b", 1, 2); 1265 } 1266 1267 #[test] 1268 fn test_alternation() { 1269 assert_match("cat|dog", "dog", 0, 3); 1270 assert_match("cat|dog", "catdog", 0, 3); 1271 } 1272 1273 #[test] 1274 fn test_groups() { 1275 let m = match_at("(a)(b)(c)", "", "abc", 0).unwrap(); 1276 assert_eq!(m.captures[1], Some((0, 1))); 1277 assert_eq!(m.captures[2], Some((1, 2))); 1278 assert_eq!(m.captures[3], Some((2, 3))); 1279 } 1280 1281 #[test] 1282 fn test_non_capturing_group() { 1283 let m = match_at("(?:ab)(c)", "", "abc", 0).unwrap(); 1284 assert_eq!(m.captures[1], Some((2, 3))); 1285 } 1286 1287 #[test] 1288 fn test_named_group() { 1289 let m = match_at("(?<word>\\w+)", "", "hello", 0).unwrap(); 1290 assert_eq!(m.captures[1], Some((0, 5))); 1291 } 1292 1293 #[test] 1294 fn test_backreference() { 1295 assert_match("(a)\\1", "aa", 0, 2); 1296 assert_no_match("(a)\\1", "ab"); 1297 } 1298 1299 #[test] 1300 fn test_word_boundary() { 1301 assert_match("\\bfoo\\b", "a foo b", 2, 5); 1302 assert_no_match("\\bfoo\\b", "afoo"); 1303 } 1304 1305 #[test] 1306 fn test_lookahead() { 1307 assert_match("a(?=b)", "ab", 0, 1); 1308 assert_no_match("a(?=b)", "ac"); 1309 assert_match("a(?!b)", "ac", 0, 1); 1310 assert_no_match("a(?!b)", "ab"); 1311 } 1312 1313 #[test] 1314 fn test_ignore_case() { 1315 let m = match_at("abc", "i", "ABC", 0).unwrap(); 1316 assert_eq!(m.start, 0); 1317 assert_eq!(m.end, 3); 1318 } 1319 1320 #[test] 1321 fn test_multiline() { 1322 let m = match_at("^b", "m", "a\nb", 0).unwrap(); 1323 assert_eq!(m.start, 2); 1324 } 1325 1326 #[test] 1327 fn test_global_multiple() { 1328 let regex = CompiledRegex::new("a", "g").unwrap(); 1329 let m1 = exec(&regex, "aba", 0).unwrap(); 1330 assert_eq!(m1.start, 0); 1331 let m2 = exec(&regex, "aba", m1.end).unwrap(); 1332 assert_eq!(m2.start, 2); 1333 assert!(exec(&regex, "aba", m2.end).is_none()); 1334 } 1335 1336 #[test] 1337 fn test_escape_sequences() { 1338 assert_match("\\n", "\n", 0, 1); 1339 assert_match("\\t", "\t", 0, 1); 1340 assert_match("\\x41", "A", 0, 1); 1341 assert_match("\\u0041", "A", 0, 1); 1342 } 1343 1344 #[test] 1345 fn test_lazy_quantifiers() { 1346 let m = match_at("a+?", "", "aaa", 0).unwrap(); 1347 assert_eq!(m.end, 1); // Lazy: match as few as possible. 1348 } 1349 1350 #[test] 1351 fn test_empty_pattern() { 1352 let m = match_at("", "", "abc", 0).unwrap(); 1353 assert_eq!(m.start, 0); 1354 assert_eq!(m.end, 0); 1355 } 1356 1357 #[test] 1358 fn test_flags_parse() { 1359 let f = RegexFlags::parse("gims").unwrap(); 1360 assert!(f.global); 1361 assert!(f.ignore_case); 1362 assert!(f.multiline); 1363 assert!(f.dot_all); 1364 assert!(!f.unicode); 1365 assert!(!f.sticky); 1366 1367 assert!(RegexFlags::parse("gg").is_err()); 1368 assert!(RegexFlags::parse("x").is_err()); 1369 } 1370}