we (web engine): Experimental web browser project to understand the limits of Claude
1//! Pure-Rust regular expression engine for the JavaScript RegExp built-in.
2//!
3//! Implements a regex parser (pattern → IR) and a backtracking matcher.
4//! Supports: character classes, quantifiers (greedy/lazy), anchors, groups
5//! (capturing, non-capturing, named), alternation, backreferences, lookahead,
6//! and standard escape sequences.
7
8// ── Regex AST ───────────────────────────────────────────────
9
10/// A parsed regex node.
11#[derive(Debug, Clone)]
12pub enum Node {
13 /// Match a single literal character.
14 Literal(char),
15 /// `.` — match any character (respects dotAll flag).
16 Dot,
17 /// `^` — start anchor.
18 Start,
19 /// `$` — end anchor.
20 End,
21 /// `\b` — word boundary.
22 WordBoundary,
23 /// `\B` — non-word boundary.
24 NonWordBoundary,
25 /// Character class `[...]` or `[^...]`.
26 CharClass {
27 negated: bool,
28 ranges: Vec<ClassRange>,
29 },
30 /// Alternation `a|b`.
31 Alternation(Vec<Node>),
32 /// Sequence of nodes (implicit concatenation).
33 Sequence(Vec<Node>),
34 /// Quantifier applied to a sub-node.
35 Quantifier {
36 node: Box<Node>,
37 min: u32,
38 max: Option<u32>, // None = unbounded
39 greedy: bool,
40 },
41 /// Capturing group `(...)`.
42 Group {
43 index: u32,
44 name: Option<String>,
45 node: Box<Node>,
46 },
47 /// Non-capturing group `(?:...)`.
48 NonCapturingGroup(Box<Node>),
49 /// Backreference `\1` or `\k<name>`.
50 Backref(u32),
51 /// Positive lookahead `(?=...)`.
52 Lookahead(Box<Node>),
53 /// Negative lookahead `(?!...)`.
54 NegativeLookahead(Box<Node>),
55}
56
57/// A range within a character class.
58#[derive(Debug, Clone)]
59pub enum ClassRange {
60 /// Single character.
61 Char(char),
62 /// Character range `a-z`.
63 Range(char, char),
64 /// Predefined class like `\d`, `\w`, `\s`.
65 Predefined(PredefinedClass),
66}
67
68/// Predefined character classes.
69#[derive(Debug, Clone, Copy)]
70pub enum PredefinedClass {
71 Digit, // \d
72 NonDigit, // \D
73 Word, // \w
74 NonWord, // \W
75 Space, // \s
76 NonSpace, // \S
77}
78
79// ── Regex flags ─────────────────────────────────────────────
80
81/// Parsed regex flags.
82#[derive(Debug, Clone, Copy, Default)]
83pub struct RegexFlags {
84 pub global: bool,
85 pub ignore_case: bool,
86 pub multiline: bool,
87 pub dot_all: bool,
88 pub unicode: bool,
89 pub sticky: bool,
90}
91
92impl RegexFlags {
93 pub fn parse(flags_str: &str) -> Result<Self, String> {
94 let mut f = RegexFlags::default();
95 for ch in flags_str.chars() {
96 match ch {
97 'g' => {
98 if f.global {
99 return Err(format!("duplicate flag '{}'", ch));
100 }
101 f.global = true;
102 }
103 'i' => {
104 if f.ignore_case {
105 return Err(format!("duplicate flag '{}'", ch));
106 }
107 f.ignore_case = true;
108 }
109 'm' => {
110 if f.multiline {
111 return Err(format!("duplicate flag '{}'", ch));
112 }
113 f.multiline = true;
114 }
115 's' => {
116 if f.dot_all {
117 return Err(format!("duplicate flag '{}'", ch));
118 }
119 f.dot_all = true;
120 }
121 'u' => {
122 if f.unicode {
123 return Err(format!("duplicate flag '{}'", ch));
124 }
125 f.unicode = true;
126 }
127 'y' => {
128 if f.sticky {
129 return Err(format!("duplicate flag '{}'", ch));
130 }
131 f.sticky = true;
132 }
133 _ => return Err(format!("invalid flag '{}'", ch)),
134 }
135 }
136 Ok(f)
137 }
138
139 pub fn as_flag_string(&self) -> String {
140 let mut s = String::new();
141 if self.global {
142 s.push('g');
143 }
144 if self.ignore_case {
145 s.push('i');
146 }
147 if self.multiline {
148 s.push('m');
149 }
150 if self.dot_all {
151 s.push('s');
152 }
153 if self.unicode {
154 s.push('u');
155 }
156 if self.sticky {
157 s.push('y');
158 }
159 s
160 }
161}
162
163// ── Regex Parser ────────────────────────────────────────────
164
165/// Parse a regex pattern string into a `Node` tree.
166pub fn parse_pattern(pattern: &str) -> Result<(Node, u32), String> {
167 let chars: Vec<char> = pattern.chars().collect();
168 let mut parser = PatternParser {
169 chars: &chars,
170 pos: 0,
171 group_count: 0,
172 group_names: Vec::new(),
173 };
174 let node = parser.parse_alternation()?;
175 if parser.pos < parser.chars.len() {
176 return Err(format!(
177 "unexpected '{}' at position {}",
178 parser.chars[parser.pos], parser.pos
179 ));
180 }
181 Ok((node, parser.group_count))
182}
183
184struct PatternParser<'a> {
185 chars: &'a [char],
186 pos: usize,
187 group_count: u32,
188 group_names: Vec<(String, u32)>,
189}
190
191impl<'a> PatternParser<'a> {
192 fn peek(&self) -> Option<char> {
193 self.chars.get(self.pos).copied()
194 }
195
196 fn advance(&mut self) -> Option<char> {
197 let ch = self.chars.get(self.pos).copied();
198 if ch.is_some() {
199 self.pos += 1;
200 }
201 ch
202 }
203
204 fn expect(&mut self, expected: char) -> Result<(), String> {
205 match self.advance() {
206 Some(ch) if ch == expected => Ok(()),
207 Some(ch) => Err(format!("expected '{}', got '{}'", expected, ch)),
208 None => Err(format!("expected '{}', got end of pattern", expected)),
209 }
210 }
211
212 /// Parse alternation: `a|b|c`
213 fn parse_alternation(&mut self) -> Result<Node, String> {
214 let mut branches = vec![self.parse_sequence()?];
215 while self.peek() == Some('|') {
216 self.advance(); // consume '|'
217 branches.push(self.parse_sequence()?);
218 }
219 if branches.len() == 1 {
220 Ok(branches.pop().unwrap())
221 } else {
222 Ok(Node::Alternation(branches))
223 }
224 }
225
226 /// Parse a sequence of atoms (concatenation).
227 fn parse_sequence(&mut self) -> Result<Node, String> {
228 let mut nodes = Vec::new();
229 while let Some(ch) = self.peek() {
230 if ch == '|' || ch == ')' {
231 break;
232 }
233 nodes.push(self.parse_quantified()?);
234 }
235 if nodes.len() == 1 {
236 Ok(nodes.pop().unwrap())
237 } else {
238 Ok(Node::Sequence(nodes))
239 }
240 }
241
242 /// Parse an atom with optional quantifier.
243 fn parse_quantified(&mut self) -> Result<Node, String> {
244 let node = self.parse_atom()?;
245 if let Some(ch) = self.peek() {
246 match ch {
247 '*' | '+' | '?' => {
248 self.advance();
249 let (min, max) = match ch {
250 '*' => (0, None),
251 '+' => (1, None),
252 '?' => (0, Some(1)),
253 _ => unreachable!(),
254 };
255 let greedy = if self.peek() == Some('?') {
256 self.advance();
257 false
258 } else {
259 true
260 };
261 Ok(Node::Quantifier {
262 node: Box::new(node),
263 min,
264 max,
265 greedy,
266 })
267 }
268 '{' => self.parse_brace_quantifier(node),
269 _ => Ok(node),
270 }
271 } else {
272 Ok(node)
273 }
274 }
275
276 /// Parse `{n}`, `{n,}`, `{n,m}` quantifier.
277 fn parse_brace_quantifier(&mut self, node: Node) -> Result<Node, String> {
278 let save = self.pos;
279 self.advance(); // consume '{'
280 let min = match self.parse_decimal() {
281 Some(n) => n,
282 None => {
283 // Not a valid quantifier — treat '{' as literal.
284 self.pos = save;
285 return Ok(node);
286 }
287 };
288 let max;
289 match self.peek() {
290 Some('}') => {
291 self.advance();
292 max = Some(min);
293 }
294 Some(',') => {
295 self.advance();
296 if self.peek() == Some('}') {
297 self.advance();
298 max = None;
299 } else {
300 match self.parse_decimal() {
301 Some(n) => {
302 max = Some(n);
303 if self.peek() != Some('}') {
304 self.pos = save;
305 return Ok(node);
306 }
307 self.advance();
308 }
309 None => {
310 self.pos = save;
311 return Ok(node);
312 }
313 }
314 }
315 }
316 _ => {
317 self.pos = save;
318 return Ok(node);
319 }
320 }
321 let greedy = if self.peek() == Some('?') {
322 self.advance();
323 false
324 } else {
325 true
326 };
327 Ok(Node::Quantifier {
328 node: Box::new(node),
329 min,
330 max,
331 greedy,
332 })
333 }
334
335 fn parse_decimal(&mut self) -> Option<u32> {
336 let start = self.pos;
337 while let Some(ch) = self.peek() {
338 if ch.is_ascii_digit() {
339 self.advance();
340 } else {
341 break;
342 }
343 }
344 if self.pos == start {
345 return None;
346 }
347 let s: String = self.chars[start..self.pos].iter().collect();
348 s.parse().ok()
349 }
350
351 /// Parse a single atom.
352 fn parse_atom(&mut self) -> Result<Node, String> {
353 match self.peek() {
354 None => Err("unexpected end of pattern".to_string()),
355 Some('.') => {
356 self.advance();
357 Ok(Node::Dot)
358 }
359 Some('^') => {
360 self.advance();
361 Ok(Node::Start)
362 }
363 Some('$') => {
364 self.advance();
365 Ok(Node::End)
366 }
367 Some('\\') => self.parse_escape(),
368 Some('[') => self.parse_char_class(),
369 Some('(') => self.parse_group(),
370 Some(ch) => {
371 self.advance();
372 Ok(Node::Literal(ch))
373 }
374 }
375 }
376
377 /// Parse an escape sequence.
378 fn parse_escape(&mut self) -> Result<Node, String> {
379 self.advance(); // consume '\'
380 match self.advance() {
381 None => Err("unexpected end of pattern after '\\'".to_string()),
382 Some('d') => Ok(Node::CharClass {
383 negated: false,
384 ranges: vec![ClassRange::Predefined(PredefinedClass::Digit)],
385 }),
386 Some('D') => Ok(Node::CharClass {
387 negated: false,
388 ranges: vec![ClassRange::Predefined(PredefinedClass::NonDigit)],
389 }),
390 Some('w') => Ok(Node::CharClass {
391 negated: false,
392 ranges: vec![ClassRange::Predefined(PredefinedClass::Word)],
393 }),
394 Some('W') => Ok(Node::CharClass {
395 negated: false,
396 ranges: vec![ClassRange::Predefined(PredefinedClass::NonWord)],
397 }),
398 Some('s') => Ok(Node::CharClass {
399 negated: false,
400 ranges: vec![ClassRange::Predefined(PredefinedClass::Space)],
401 }),
402 Some('S') => Ok(Node::CharClass {
403 negated: false,
404 ranges: vec![ClassRange::Predefined(PredefinedClass::NonSpace)],
405 }),
406 Some('b') => Ok(Node::WordBoundary),
407 Some('B') => Ok(Node::NonWordBoundary),
408 Some('n') => Ok(Node::Literal('\n')),
409 Some('r') => Ok(Node::Literal('\r')),
410 Some('t') => Ok(Node::Literal('\t')),
411 Some('f') => Ok(Node::Literal('\x0C')),
412 Some('v') => Ok(Node::Literal('\x0B')),
413 Some('0') => Ok(Node::Literal('\0')),
414 Some('x') => {
415 let hi = self.advance().ok_or("expected hex digit")?;
416 let lo = self.advance().ok_or("expected hex digit")?;
417 let code = hex2(hi, lo)?;
418 Ok(Node::Literal(char::from(code)))
419 }
420 Some('u') => {
421 if self.peek() == Some('{') {
422 self.advance();
423 let mut code_str = String::new();
424 while self.peek() != Some('}') {
425 code_str.push(self.advance().ok_or("expected '}'")?);
426 }
427 self.advance(); // consume '}'
428 let code =
429 u32::from_str_radix(&code_str, 16).map_err(|_| "invalid unicode escape")?;
430 let ch = char::from_u32(code).ok_or("invalid unicode code point")?;
431 Ok(Node::Literal(ch))
432 } else {
433 let a = self.advance().ok_or("expected hex digit")?;
434 let b = self.advance().ok_or("expected hex digit")?;
435 let c = self.advance().ok_or("expected hex digit")?;
436 let d = self.advance().ok_or("expected hex digit")?;
437 let code = hex4(a, b, c, d)?;
438 let ch = char::from_u32(code).ok_or("invalid unicode escape")?;
439 Ok(Node::Literal(ch))
440 }
441 }
442 Some('k') => {
443 // Named backreference \k<name>
444 self.expect('<')?;
445 let mut name = String::new();
446 while self.peek() != Some('>') {
447 name.push(self.advance().ok_or("expected '>'")?);
448 }
449 self.advance(); // consume '>'
450 // Resolve name to group index.
451 for &(ref n, idx) in &self.group_names {
452 if *n == name {
453 return Ok(Node::Backref(idx));
454 }
455 }
456 Err(format!("unknown group name '{}'", name))
457 }
458 Some(ch) if ch.is_ascii_digit() && ch != '0' => {
459 // Numeric backreference \1, \12, etc.
460 let mut num_str = String::new();
461 num_str.push(ch);
462 while let Some(next) = self.peek() {
463 if next.is_ascii_digit() {
464 num_str.push(next);
465 self.advance();
466 } else {
467 break;
468 }
469 }
470 let idx: u32 = num_str.parse().map_err(|_| "invalid backreference")?;
471 Ok(Node::Backref(idx))
472 }
473 // Escaped metacharacters — treat as literal.
474 Some(ch) => Ok(Node::Literal(ch)),
475 }
476 }
477
478 /// Parse a character class `[...]` or `[^...]`.
479 fn parse_char_class(&mut self) -> Result<Node, String> {
480 self.advance(); // consume '['
481 let negated = if self.peek() == Some('^') {
482 self.advance();
483 true
484 } else {
485 false
486 };
487 let mut ranges = Vec::new();
488 // Allow ']' as first character in class.
489 if self.peek() == Some(']') {
490 self.advance();
491 ranges.push(ClassRange::Char(']'));
492 }
493 while self.peek() != Some(']') {
494 if self.peek().is_none() {
495 return Err("unterminated character class".to_string());
496 }
497 let item = self.parse_class_atom()?;
498 // Check for range `a-b`.
499 if self.peek() == Some('-')
500 && self.chars.get(self.pos + 1) != Some(&']')
501 && self.chars.get(self.pos + 1).is_some()
502 {
503 if let ClassRange::Char(start) = item {
504 self.advance(); // consume '-'
505 let end_item = self.parse_class_atom()?;
506 if let ClassRange::Char(end) = end_item {
507 if start > end {
508 return Err("character class range out of order".to_string());
509 }
510 ranges.push(ClassRange::Range(start, end));
511 continue;
512 } else {
513 // Not a valid range, push start, '-', and end_item.
514 ranges.push(ClassRange::Char(start));
515 ranges.push(ClassRange::Char('-'));
516 ranges.push(end_item);
517 continue;
518 }
519 }
520 }
521 ranges.push(item);
522 }
523 self.advance(); // consume ']'
524 Ok(Node::CharClass { negated, ranges })
525 }
526
527 fn parse_class_atom(&mut self) -> Result<ClassRange, String> {
528 match self.peek() {
529 Some('\\') => {
530 self.advance(); // consume '\'
531 match self.advance() {
532 None => Err("unexpected end of class".to_string()),
533 Some('d') => Ok(ClassRange::Predefined(PredefinedClass::Digit)),
534 Some('D') => Ok(ClassRange::Predefined(PredefinedClass::NonDigit)),
535 Some('w') => Ok(ClassRange::Predefined(PredefinedClass::Word)),
536 Some('W') => Ok(ClassRange::Predefined(PredefinedClass::NonWord)),
537 Some('s') => Ok(ClassRange::Predefined(PredefinedClass::Space)),
538 Some('S') => Ok(ClassRange::Predefined(PredefinedClass::NonSpace)),
539 Some('n') => Ok(ClassRange::Char('\n')),
540 Some('r') => Ok(ClassRange::Char('\r')),
541 Some('t') => Ok(ClassRange::Char('\t')),
542 Some('f') => Ok(ClassRange::Char('\x0C')),
543 Some('v') => Ok(ClassRange::Char('\x0B')),
544 Some('0') => Ok(ClassRange::Char('\0')),
545 Some('x') => {
546 let hi = self.advance().ok_or("expected hex digit")?;
547 let lo = self.advance().ok_or("expected hex digit")?;
548 let code = hex2(hi, lo)?;
549 Ok(ClassRange::Char(char::from(code)))
550 }
551 Some('u') => {
552 if self.peek() == Some('{') {
553 self.advance();
554 let mut code_str = String::new();
555 while self.peek() != Some('}') {
556 code_str.push(self.advance().ok_or("expected '}'")?);
557 }
558 self.advance();
559 let code = u32::from_str_radix(&code_str, 16)
560 .map_err(|_| "invalid unicode escape")?;
561 let ch = char::from_u32(code)
562 .ok_or_else(|| "invalid unicode code point".to_string())?;
563 Ok(ClassRange::Char(ch))
564 } else {
565 let a = self.advance().ok_or("expected hex digit")?;
566 let b = self.advance().ok_or("expected hex digit")?;
567 let c = self.advance().ok_or("expected hex digit")?;
568 let d = self.advance().ok_or("expected hex digit")?;
569 let code = hex4(a, b, c, d)?;
570 let ch = char::from_u32(code)
571 .ok_or_else(|| "invalid unicode escape".to_string())?;
572 Ok(ClassRange::Char(ch))
573 }
574 }
575 Some(ch) => Ok(ClassRange::Char(ch)),
576 }
577 }
578 Some(ch) => {
579 self.advance();
580 Ok(ClassRange::Char(ch))
581 }
582 None => Err("unexpected end of character class".to_string()),
583 }
584 }
585
586 /// Parse a group `(...)`.
587 fn parse_group(&mut self) -> Result<Node, String> {
588 self.advance(); // consume '('
589 if self.peek() == Some('?') {
590 self.advance(); // consume '?'
591 match self.peek() {
592 Some(':') => {
593 self.advance();
594 let inner = self.parse_alternation()?;
595 self.expect(')')?;
596 Ok(Node::NonCapturingGroup(Box::new(inner)))
597 }
598 Some('=') => {
599 self.advance();
600 let inner = self.parse_alternation()?;
601 self.expect(')')?;
602 Ok(Node::Lookahead(Box::new(inner)))
603 }
604 Some('!') => {
605 self.advance();
606 let inner = self.parse_alternation()?;
607 self.expect(')')?;
608 Ok(Node::NegativeLookahead(Box::new(inner)))
609 }
610 Some('<') => {
611 self.advance(); // consume '<'
612 let mut name = String::new();
613 while self.peek() != Some('>') {
614 name.push(self.advance().ok_or("expected '>'")?);
615 }
616 self.advance(); // consume '>'
617 self.group_count += 1;
618 let idx = self.group_count;
619 self.group_names.push((name.clone(), idx));
620 let inner = self.parse_alternation()?;
621 self.expect(')')?;
622 Ok(Node::Group {
623 index: idx,
624 name: Some(name),
625 node: Box::new(inner),
626 })
627 }
628 _ => Err("invalid group specifier".to_string()),
629 }
630 } else {
631 self.group_count += 1;
632 let idx = self.group_count;
633 let inner = self.parse_alternation()?;
634 self.expect(')')?;
635 Ok(Node::Group {
636 index: idx,
637 name: None,
638 node: Box::new(inner),
639 })
640 }
641 }
642}
643
644// ── Hex helpers ─────────────────────────────────────────────
645
646fn hex_digit(ch: char) -> Result<u32, String> {
647 match ch {
648 '0'..='9' => Ok(ch as u32 - '0' as u32),
649 'a'..='f' => Ok(ch as u32 - 'a' as u32 + 10),
650 'A'..='F' => Ok(ch as u32 - 'A' as u32 + 10),
651 _ => Err(format!("invalid hex digit '{}'", ch)),
652 }
653}
654
655fn hex2(hi: char, lo: char) -> Result<u8, String> {
656 Ok((hex_digit(hi)? * 16 + hex_digit(lo)?) as u8)
657}
658
659fn hex4(a: char, b: char, c: char, d: char) -> Result<u32, String> {
660 Ok(hex_digit(a)? * 4096 + hex_digit(b)? * 256 + hex_digit(c)? * 16 + hex_digit(d)?)
661}
662
663// ── Compiled regex ──────────────────────────────────────────
664
665/// A compiled regular expression ready for matching.
666#[derive(Debug, Clone)]
667pub struct CompiledRegex {
668 pub pattern: String,
669 pub flags: RegexFlags,
670 pub node: Node,
671 pub group_count: u32,
672}
673
674impl CompiledRegex {
675 pub fn new(pattern: &str, flags_str: &str) -> Result<Self, String> {
676 let flags = RegexFlags::parse(flags_str)?;
677 let (node, group_count) = parse_pattern(pattern)?;
678 Ok(CompiledRegex {
679 pattern: pattern.to_string(),
680 flags,
681 node,
682 group_count,
683 })
684 }
685}
686
687// ── Backtracking matcher ────────────────────────────────────
688
689/// Result of a successful regex match.
690#[derive(Debug, Clone)]
691pub struct MatchResult {
692 /// Overall match start index (in chars).
693 pub start: usize,
694 /// Overall match end index (in chars).
695 pub end: usize,
696 /// Capture group contents: index 0 = full match, 1..n = groups.
697 pub captures: Vec<Option<(usize, usize)>>,
698}
699
700/// Execute the regex on the given string, starting the search at `start_pos` (char index).
701pub fn exec(regex: &CompiledRegex, input: &str, start_pos: usize) -> Option<MatchResult> {
702 let chars: Vec<char> = input.chars().collect();
703 let group_count = regex.group_count as usize;
704
705 if regex.flags.sticky {
706 // Sticky: only try at start_pos.
707 let mut captures = vec![None; group_count + 1];
708 let mut ctx = MatchContext {
709 chars: &chars,
710 flags: ®ex.flags,
711 captures: &mut captures,
712 backtrack_limit: 1_000_000,
713 backtrack_count: 0,
714 };
715 if let Some(end) = match_node(&mut ctx, ®ex.node, start_pos) {
716 captures = ctx.captures.to_vec();
717 captures[0] = Some((start_pos, end));
718 return Some(MatchResult {
719 start: start_pos,
720 end,
721 captures,
722 });
723 }
724 return None;
725 }
726
727 // Non-sticky: try each position starting from start_pos.
728 for i in start_pos..=chars.len() {
729 let mut captures = vec![None; group_count + 1];
730 let mut ctx = MatchContext {
731 chars: &chars,
732 flags: ®ex.flags,
733 captures: &mut captures,
734 backtrack_limit: 1_000_000,
735 backtrack_count: 0,
736 };
737 if let Some(end) = match_node(&mut ctx, ®ex.node, i) {
738 captures = ctx.captures.to_vec();
739 captures[0] = Some((i, end));
740 return Some(MatchResult {
741 start: i,
742 end,
743 captures,
744 });
745 }
746 }
747 None
748}
749
750struct MatchContext<'a> {
751 chars: &'a [char],
752 flags: &'a RegexFlags,
753 captures: &'a mut Vec<Option<(usize, usize)>>,
754 backtrack_limit: u32,
755 backtrack_count: u32,
756}
757
758/// Try to match `node` at position `pos`, returning the end position on success.
759fn match_node(ctx: &mut MatchContext, node: &Node, pos: usize) -> Option<usize> {
760 ctx.backtrack_count += 1;
761 if ctx.backtrack_count > ctx.backtrack_limit {
762 return None;
763 }
764
765 match node {
766 Node::Literal(ch) => {
767 if pos < ctx.chars.len() {
768 let input_ch = ctx.chars[pos];
769 if ctx.flags.ignore_case {
770 if char_eq_ignore_case(input_ch, *ch) {
771 Some(pos + 1)
772 } else {
773 None
774 }
775 } else if input_ch == *ch {
776 Some(pos + 1)
777 } else {
778 None
779 }
780 } else {
781 None
782 }
783 }
784
785 Node::Dot => {
786 if pos < ctx.chars.len() {
787 let ch = ctx.chars[pos];
788 if ctx.flags.dot_all || !is_line_terminator(ch) {
789 Some(pos + 1)
790 } else {
791 None
792 }
793 } else {
794 None
795 }
796 }
797
798 Node::Start => {
799 if pos == 0
800 || (ctx.flags.multiline && pos > 0 && is_line_terminator(ctx.chars[pos - 1]))
801 {
802 Some(pos)
803 } else {
804 None
805 }
806 }
807
808 Node::End => {
809 if pos == ctx.chars.len()
810 || (ctx.flags.multiline
811 && pos < ctx.chars.len()
812 && is_line_terminator(ctx.chars[pos]))
813 {
814 Some(pos)
815 } else {
816 None
817 }
818 }
819
820 Node::WordBoundary => {
821 let before = if pos > 0 {
822 is_word_char(ctx.chars[pos - 1])
823 } else {
824 false
825 };
826 let after = if pos < ctx.chars.len() {
827 is_word_char(ctx.chars[pos])
828 } else {
829 false
830 };
831 if before != after {
832 Some(pos)
833 } else {
834 None
835 }
836 }
837
838 Node::NonWordBoundary => {
839 let before = if pos > 0 {
840 is_word_char(ctx.chars[pos - 1])
841 } else {
842 false
843 };
844 let after = if pos < ctx.chars.len() {
845 is_word_char(ctx.chars[pos])
846 } else {
847 false
848 };
849 if before == after {
850 Some(pos)
851 } else {
852 None
853 }
854 }
855
856 Node::CharClass { negated, ranges } => {
857 if pos >= ctx.chars.len() {
858 return None;
859 }
860 let ch = ctx.chars[pos];
861 let matched = class_matches(ranges, ch, ctx.flags.ignore_case);
862 if matched != *negated {
863 Some(pos + 1)
864 } else {
865 None
866 }
867 }
868
869 Node::Sequence(nodes) => match_sequence(ctx, nodes, 0, pos),
870
871 Node::Alternation(branches) => {
872 for branch in branches {
873 let saved = ctx.captures.clone();
874 if let Some(end) = match_node(ctx, branch, pos) {
875 return Some(end);
876 }
877 *ctx.captures = saved;
878 }
879 None
880 }
881
882 Node::Quantifier {
883 node: inner,
884 min,
885 max,
886 greedy,
887 } => match_quantifier_standalone(ctx, inner, pos, *min, *max, *greedy),
888
889 Node::Group {
890 index, node: inner, ..
891 } => {
892 let idx = *index as usize;
893 let saved = if idx < ctx.captures.len() {
894 ctx.captures[idx]
895 } else {
896 None
897 };
898 let result = match_node(ctx, inner, pos);
899 if let Some(end) = result {
900 if idx < ctx.captures.len() {
901 ctx.captures[idx] = Some((pos, end));
902 }
903 Some(end)
904 } else {
905 if idx < ctx.captures.len() {
906 ctx.captures[idx] = saved;
907 }
908 None
909 }
910 }
911
912 Node::NonCapturingGroup(inner) => match_node(ctx, inner, pos),
913
914 Node::Backref(idx) => {
915 let idx = *idx as usize;
916 if idx >= ctx.captures.len() {
917 return Some(pos); // Unmatched backref matches empty.
918 }
919 match ctx.captures[idx] {
920 Some((start, end)) => {
921 let cap_len = end - start;
922 if pos + cap_len > ctx.chars.len() {
923 return None;
924 }
925 for i in 0..cap_len {
926 let a = ctx.chars[start + i];
927 let b = ctx.chars[pos + i];
928 if ctx.flags.ignore_case {
929 if !char_eq_ignore_case(a, b) {
930 return None;
931 }
932 } else if a != b {
933 return None;
934 }
935 }
936 Some(pos + cap_len)
937 }
938 None => Some(pos), // Unmatched group — backref matches empty.
939 }
940 }
941
942 Node::Lookahead(inner) => {
943 let saved = ctx.captures.clone();
944 if match_node(ctx, inner, pos).is_some() {
945 Some(pos) // Lookahead doesn't consume.
946 } else {
947 *ctx.captures = saved;
948 None
949 }
950 }
951
952 Node::NegativeLookahead(inner) => {
953 let saved = ctx.captures.clone();
954 if match_node(ctx, inner, pos).is_some() {
955 *ctx.captures = saved;
956 None
957 } else {
958 *ctx.captures = saved;
959 Some(pos)
960 }
961 }
962 }
963}
964
965/// Match a sequence of nodes with backtracking support for quantifiers.
966fn match_sequence(ctx: &mut MatchContext, nodes: &[Node], idx: usize, pos: usize) -> Option<usize> {
967 if idx >= nodes.len() {
968 return Some(pos);
969 }
970
971 let node = &nodes[idx];
972
973 // For quantifiers, try each count with the remaining sequence as continuation.
974 if let Node::Quantifier {
975 node: inner,
976 min,
977 max,
978 greedy,
979 } = node
980 {
981 return match_quantifier_in_seq(ctx, inner, pos, *min, *max, *greedy, &nodes[idx + 1..]);
982 }
983
984 // For non-quantifier nodes, match and continue.
985 let saved = ctx.captures.clone();
986 match match_node(ctx, node, pos) {
987 Some(next) => match_sequence(ctx, nodes, idx + 1, next).or_else(|| {
988 *ctx.captures = saved;
989 None
990 }),
991 None => None,
992 }
993}
994
995/// Match a quantifier within a sequence, trying each count with the continuation.
996fn match_quantifier_in_seq(
997 ctx: &mut MatchContext,
998 inner: &Node,
999 pos: usize,
1000 min: u32,
1001 max: Option<u32>,
1002 greedy: bool,
1003 continuation: &[Node],
1004) -> Option<usize> {
1005 // Collect all reachable positions (match inner 0 to max times).
1006 let mut positions = vec![pos]; // positions[0] = 0 matches
1007 let mut cur = pos;
1008
1009 loop {
1010 let count = positions.len() - 1;
1011 if let Some(m) = max {
1012 if count as u32 >= m {
1013 break;
1014 }
1015 }
1016 let saved = ctx.captures.clone();
1017 match match_node(ctx, inner, cur) {
1018 Some(next) if next > cur => {
1019 cur = next;
1020 positions.push(cur);
1021 }
1022 Some(next) => {
1023 // Zero-width match — record once and stop to prevent infinite loop.
1024 *ctx.captures = saved;
1025 if next == cur && (count as u32) < min {
1026 positions.push(cur);
1027 }
1028 break;
1029 }
1030 None => {
1031 *ctx.captures = saved;
1032 break;
1033 }
1034 }
1035 }
1036
1037 let max_count = positions.len() - 1;
1038 let min_count = min as usize;
1039 if max_count < min_count {
1040 return None;
1041 }
1042
1043 if greedy {
1044 // Try from most matches to fewest.
1045 for &p in positions[min_count..=max_count].iter().rev() {
1046 let saved = ctx.captures.clone();
1047 if let Some(end) = match_sequence(ctx, continuation, 0, p) {
1048 return Some(end);
1049 }
1050 *ctx.captures = saved;
1051 }
1052 None
1053 } else {
1054 // Lazy: try from fewest to most.
1055 for &p in &positions[min_count..=max_count] {
1056 let saved = ctx.captures.clone();
1057 if let Some(end) = match_sequence(ctx, continuation, 0, p) {
1058 return Some(end);
1059 }
1060 *ctx.captures = saved;
1061 }
1062 None
1063 }
1064}
1065
1066/// Match a standalone quantifier (not in a sequence context).
1067fn match_quantifier_standalone(
1068 ctx: &mut MatchContext,
1069 inner: &Node,
1070 pos: usize,
1071 min: u32,
1072 max: Option<u32>,
1073 greedy: bool,
1074) -> Option<usize> {
1075 let mut positions = vec![pos];
1076 let mut cur = pos;
1077
1078 loop {
1079 let count = positions.len() - 1;
1080 if let Some(m) = max {
1081 if count as u32 >= m {
1082 break;
1083 }
1084 }
1085 let saved = ctx.captures.clone();
1086 match match_node(ctx, inner, cur) {
1087 Some(next) if next > cur => {
1088 cur = next;
1089 positions.push(cur);
1090 }
1091 Some(next) => {
1092 *ctx.captures = saved;
1093 if next == cur && (count as u32) < min {
1094 positions.push(cur);
1095 }
1096 break;
1097 }
1098 None => {
1099 *ctx.captures = saved;
1100 break;
1101 }
1102 }
1103 }
1104
1105 let max_count = positions.len() - 1;
1106 let min_count = min as usize;
1107 if max_count < min_count {
1108 return None;
1109 }
1110
1111 if greedy {
1112 Some(positions[max_count])
1113 } else {
1114 Some(positions[min_count])
1115 }
1116}
1117
1118// ── Character helpers ───────────────────────────────────────
1119
1120fn is_line_terminator(ch: char) -> bool {
1121 matches!(ch, '\n' | '\r' | '\u{2028}' | '\u{2029}')
1122}
1123
1124fn is_word_char(ch: char) -> bool {
1125 ch.is_ascii_alphanumeric() || ch == '_'
1126}
1127
1128fn char_eq_ignore_case(a: char, b: char) -> bool {
1129 a.eq_ignore_ascii_case(&b)
1130}
1131
1132fn class_matches(ranges: &[ClassRange], ch: char, ignore_case: bool) -> bool {
1133 for range in ranges {
1134 match range {
1135 ClassRange::Char(c) => {
1136 if ignore_case {
1137 if char_eq_ignore_case(ch, *c) {
1138 return true;
1139 }
1140 } else if ch == *c {
1141 return true;
1142 }
1143 }
1144 ClassRange::Range(start, end) => {
1145 if ignore_case {
1146 let ch_lower = ch.to_ascii_lowercase();
1147 let start_lower = start.to_ascii_lowercase();
1148 let end_lower = end.to_ascii_lowercase();
1149 if ch_lower >= start_lower && ch_lower <= end_lower {
1150 return true;
1151 }
1152 } else if ch >= *start && ch <= *end {
1153 return true;
1154 }
1155 }
1156 ClassRange::Predefined(class) => {
1157 if predefined_matches(*class, ch) {
1158 return true;
1159 }
1160 }
1161 }
1162 }
1163 false
1164}
1165
1166fn predefined_matches(class: PredefinedClass, ch: char) -> bool {
1167 match class {
1168 PredefinedClass::Digit => ch.is_ascii_digit(),
1169 PredefinedClass::NonDigit => !ch.is_ascii_digit(),
1170 PredefinedClass::Word => is_word_char(ch),
1171 PredefinedClass::NonWord => !is_word_char(ch),
1172 PredefinedClass::Space => matches!(
1173 ch,
1174 ' ' | '\t' | '\n' | '\r' | '\x0B' | '\x0C' | '\u{00A0}' | '\u{FEFF}'
1175 ),
1176 PredefinedClass::NonSpace => !matches!(
1177 ch,
1178 ' ' | '\t' | '\n' | '\r' | '\x0B' | '\x0C' | '\u{00A0}' | '\u{FEFF}'
1179 ),
1180 }
1181}
1182
1183// ── Tests ───────────────────────────────────────────────────
1184
1185#[cfg(test)]
1186mod tests {
1187 use super::*;
1188
1189 fn match_at(pattern: &str, flags: &str, input: &str, start: usize) -> Option<MatchResult> {
1190 let regex = CompiledRegex::new(pattern, flags).unwrap();
1191 exec(®ex, input, start)
1192 }
1193
1194 fn assert_match(pattern: &str, input: &str, expected_start: usize, expected_end: usize) {
1195 let m = match_at(pattern, "", input, 0).expect("expected match");
1196 assert_eq!(
1197 m.start, expected_start,
1198 "pattern={} input={}",
1199 pattern, input
1200 );
1201 assert_eq!(m.end, expected_end, "pattern={} input={}", pattern, input);
1202 }
1203
1204 fn assert_no_match(pattern: &str, input: &str) {
1205 assert!(
1206 match_at(pattern, "", input, 0).is_none(),
1207 "expected no match: pattern={} input={}",
1208 pattern,
1209 input
1210 );
1211 }
1212
1213 #[test]
1214 fn test_literal() {
1215 assert_match("abc", "xabcx", 1, 4);
1216 assert_no_match("abc", "xyz");
1217 }
1218
1219 #[test]
1220 fn test_dot() {
1221 assert_match("a.c", "abc", 0, 3);
1222 assert_match("a.c", "axc", 0, 3);
1223 assert_no_match("a.c", "a\nc");
1224 // dotAll flag
1225 let m = match_at("a.c", "s", "a\nc", 0).unwrap();
1226 assert_eq!(m.start, 0);
1227 assert_eq!(m.end, 3);
1228 }
1229
1230 #[test]
1231 fn test_anchors() {
1232 assert_match("^abc", "abc", 0, 3);
1233 assert_no_match("^abc", "xabc");
1234 assert_match("abc$", "abc", 0, 3);
1235 assert_no_match("abc$", "abcx");
1236 }
1237
1238 #[test]
1239 fn test_quantifiers() {
1240 assert_match("a*", "aaa", 0, 3);
1241 assert_match("a+", "aaa", 0, 3);
1242 assert_no_match("a+", "bbb");
1243 assert_match("a?b", "ab", 0, 2);
1244 assert_match("a?b", "b", 0, 1);
1245 assert_match("a{2}", "aaa", 0, 2);
1246 assert_match("a{2,3}", "aaaa", 0, 3);
1247 assert_match("a{2,}", "aaaa", 0, 4);
1248 }
1249
1250 #[test]
1251 fn test_char_class() {
1252 assert_match("[abc]", "b", 0, 1);
1253 assert_no_match("[abc]", "d");
1254 assert_match("[a-z]", "m", 0, 1);
1255 assert_no_match("[a-z]", "M");
1256 assert_match("[^abc]", "d", 0, 1);
1257 assert_no_match("[^abc]", "a");
1258 }
1259
1260 #[test]
1261 fn test_predefined_classes() {
1262 assert_match("\\d+", "abc123", 3, 6);
1263 assert_match("\\w+", "hello world", 0, 5);
1264 assert_match("\\s+", "a b", 1, 2);
1265 }
1266
1267 #[test]
1268 fn test_alternation() {
1269 assert_match("cat|dog", "dog", 0, 3);
1270 assert_match("cat|dog", "catdog", 0, 3);
1271 }
1272
1273 #[test]
1274 fn test_groups() {
1275 let m = match_at("(a)(b)(c)", "", "abc", 0).unwrap();
1276 assert_eq!(m.captures[1], Some((0, 1)));
1277 assert_eq!(m.captures[2], Some((1, 2)));
1278 assert_eq!(m.captures[3], Some((2, 3)));
1279 }
1280
1281 #[test]
1282 fn test_non_capturing_group() {
1283 let m = match_at("(?:ab)(c)", "", "abc", 0).unwrap();
1284 assert_eq!(m.captures[1], Some((2, 3)));
1285 }
1286
1287 #[test]
1288 fn test_named_group() {
1289 let m = match_at("(?<word>\\w+)", "", "hello", 0).unwrap();
1290 assert_eq!(m.captures[1], Some((0, 5)));
1291 }
1292
1293 #[test]
1294 fn test_backreference() {
1295 assert_match("(a)\\1", "aa", 0, 2);
1296 assert_no_match("(a)\\1", "ab");
1297 }
1298
1299 #[test]
1300 fn test_word_boundary() {
1301 assert_match("\\bfoo\\b", "a foo b", 2, 5);
1302 assert_no_match("\\bfoo\\b", "afoo");
1303 }
1304
1305 #[test]
1306 fn test_lookahead() {
1307 assert_match("a(?=b)", "ab", 0, 1);
1308 assert_no_match("a(?=b)", "ac");
1309 assert_match("a(?!b)", "ac", 0, 1);
1310 assert_no_match("a(?!b)", "ab");
1311 }
1312
1313 #[test]
1314 fn test_ignore_case() {
1315 let m = match_at("abc", "i", "ABC", 0).unwrap();
1316 assert_eq!(m.start, 0);
1317 assert_eq!(m.end, 3);
1318 }
1319
1320 #[test]
1321 fn test_multiline() {
1322 let m = match_at("^b", "m", "a\nb", 0).unwrap();
1323 assert_eq!(m.start, 2);
1324 }
1325
1326 #[test]
1327 fn test_global_multiple() {
1328 let regex = CompiledRegex::new("a", "g").unwrap();
1329 let m1 = exec(®ex, "aba", 0).unwrap();
1330 assert_eq!(m1.start, 0);
1331 let m2 = exec(®ex, "aba", m1.end).unwrap();
1332 assert_eq!(m2.start, 2);
1333 assert!(exec(®ex, "aba", m2.end).is_none());
1334 }
1335
1336 #[test]
1337 fn test_escape_sequences() {
1338 assert_match("\\n", "\n", 0, 1);
1339 assert_match("\\t", "\t", 0, 1);
1340 assert_match("\\x41", "A", 0, 1);
1341 assert_match("\\u0041", "A", 0, 1);
1342 }
1343
1344 #[test]
1345 fn test_lazy_quantifiers() {
1346 let m = match_at("a+?", "", "aaa", 0).unwrap();
1347 assert_eq!(m.end, 1); // Lazy: match as few as possible.
1348 }
1349
1350 #[test]
1351 fn test_empty_pattern() {
1352 let m = match_at("", "", "abc", 0).unwrap();
1353 assert_eq!(m.start, 0);
1354 assert_eq!(m.end, 0);
1355 }
1356
1357 #[test]
1358 fn test_flags_parse() {
1359 let f = RegexFlags::parse("gims").unwrap();
1360 assert!(f.global);
1361 assert!(f.ignore_case);
1362 assert!(f.multiline);
1363 assert!(f.dot_all);
1364 assert!(!f.unicode);
1365 assert!(!f.sticky);
1366
1367 assert!(RegexFlags::parse("gg").is_err());
1368 assert!(RegexFlags::parse("x").is_err());
1369 }
1370}