//! Pure-Rust regular expression engine for the JavaScript RegExp built-in. //! //! Implements a regex parser (pattern → IR) and a backtracking matcher. //! Supports: character classes, quantifiers (greedy/lazy), anchors, groups //! (capturing, non-capturing, named), alternation, backreferences, lookahead, //! and standard escape sequences. // ── Regex AST ─────────────────────────────────────────────── /// A parsed regex node. #[derive(Debug, Clone)] pub enum Node { /// Match a single literal character. Literal(char), /// `.` — match any character (respects dotAll flag). Dot, /// `^` — start anchor. Start, /// `$` — end anchor. End, /// `\b` — word boundary. WordBoundary, /// `\B` — non-word boundary. NonWordBoundary, /// Character class `[...]` or `[^...]`. CharClass { negated: bool, ranges: Vec, }, /// Alternation `a|b`. Alternation(Vec), /// Sequence of nodes (implicit concatenation). Sequence(Vec), /// Quantifier applied to a sub-node. Quantifier { node: Box, min: u32, max: Option, // None = unbounded greedy: bool, }, /// Capturing group `(...)`. Group { index: u32, name: Option, node: Box, }, /// Non-capturing group `(?:...)`. NonCapturingGroup(Box), /// Backreference `\1` or `\k`. Backref(u32), /// Positive lookahead `(?=...)`. Lookahead(Box), /// Negative lookahead `(?!...)`. NegativeLookahead(Box), } /// A range within a character class. #[derive(Debug, Clone)] pub enum ClassRange { /// Single character. Char(char), /// Character range `a-z`. Range(char, char), /// Predefined class like `\d`, `\w`, `\s`. Predefined(PredefinedClass), } /// Predefined character classes. #[derive(Debug, Clone, Copy)] pub enum PredefinedClass { Digit, // \d NonDigit, // \D Word, // \w NonWord, // \W Space, // \s NonSpace, // \S } // ── Regex flags ───────────────────────────────────────────── /// Parsed regex flags. #[derive(Debug, Clone, Copy, Default)] pub struct RegexFlags { pub global: bool, pub ignore_case: bool, pub multiline: bool, pub dot_all: bool, pub unicode: bool, pub sticky: bool, } impl RegexFlags { pub fn parse(flags_str: &str) -> Result { let mut f = RegexFlags::default(); for ch in flags_str.chars() { match ch { 'g' => { if f.global { return Err(format!("duplicate flag '{}'", ch)); } f.global = true; } 'i' => { if f.ignore_case { return Err(format!("duplicate flag '{}'", ch)); } f.ignore_case = true; } 'm' => { if f.multiline { return Err(format!("duplicate flag '{}'", ch)); } f.multiline = true; } 's' => { if f.dot_all { return Err(format!("duplicate flag '{}'", ch)); } f.dot_all = true; } 'u' => { if f.unicode { return Err(format!("duplicate flag '{}'", ch)); } f.unicode = true; } 'y' => { if f.sticky { return Err(format!("duplicate flag '{}'", ch)); } f.sticky = true; } _ => return Err(format!("invalid flag '{}'", ch)), } } Ok(f) } pub fn as_flag_string(&self) -> String { let mut s = String::new(); if self.global { s.push('g'); } if self.ignore_case { s.push('i'); } if self.multiline { s.push('m'); } if self.dot_all { s.push('s'); } if self.unicode { s.push('u'); } if self.sticky { s.push('y'); } s } } // ── Regex Parser ──────────────────────────────────────────── /// Parse a regex pattern string into a `Node` tree. pub fn parse_pattern(pattern: &str) -> Result<(Node, u32), String> { let chars: Vec = pattern.chars().collect(); let mut parser = PatternParser { chars: &chars, pos: 0, group_count: 0, group_names: Vec::new(), }; let node = parser.parse_alternation()?; if parser.pos < parser.chars.len() { return Err(format!( "unexpected '{}' at position {}", parser.chars[parser.pos], parser.pos )); } Ok((node, parser.group_count)) } struct PatternParser<'a> { chars: &'a [char], pos: usize, group_count: u32, group_names: Vec<(String, u32)>, } impl<'a> PatternParser<'a> { fn peek(&self) -> Option { self.chars.get(self.pos).copied() } fn advance(&mut self) -> Option { let ch = self.chars.get(self.pos).copied(); if ch.is_some() { self.pos += 1; } ch } fn expect(&mut self, expected: char) -> Result<(), String> { match self.advance() { Some(ch) if ch == expected => Ok(()), Some(ch) => Err(format!("expected '{}', got '{}'", expected, ch)), None => Err(format!("expected '{}', got end of pattern", expected)), } } /// Parse alternation: `a|b|c` fn parse_alternation(&mut self) -> Result { let mut branches = vec![self.parse_sequence()?]; while self.peek() == Some('|') { self.advance(); // consume '|' branches.push(self.parse_sequence()?); } if branches.len() == 1 { Ok(branches.pop().unwrap()) } else { Ok(Node::Alternation(branches)) } } /// Parse a sequence of atoms (concatenation). fn parse_sequence(&mut self) -> Result { let mut nodes = Vec::new(); while let Some(ch) = self.peek() { if ch == '|' || ch == ')' { break; } nodes.push(self.parse_quantified()?); } if nodes.len() == 1 { Ok(nodes.pop().unwrap()) } else { Ok(Node::Sequence(nodes)) } } /// Parse an atom with optional quantifier. fn parse_quantified(&mut self) -> Result { let node = self.parse_atom()?; if let Some(ch) = self.peek() { match ch { '*' | '+' | '?' => { self.advance(); let (min, max) = match ch { '*' => (0, None), '+' => (1, None), '?' => (0, Some(1)), _ => unreachable!(), }; let greedy = if self.peek() == Some('?') { self.advance(); false } else { true }; Ok(Node::Quantifier { node: Box::new(node), min, max, greedy, }) } '{' => self.parse_brace_quantifier(node), _ => Ok(node), } } else { Ok(node) } } /// Parse `{n}`, `{n,}`, `{n,m}` quantifier. fn parse_brace_quantifier(&mut self, node: Node) -> Result { let save = self.pos; self.advance(); // consume '{' let min = match self.parse_decimal() { Some(n) => n, None => { // Not a valid quantifier — treat '{' as literal. self.pos = save; return Ok(node); } }; let max; match self.peek() { Some('}') => { self.advance(); max = Some(min); } Some(',') => { self.advance(); if self.peek() == Some('}') { self.advance(); max = None; } else { match self.parse_decimal() { Some(n) => { max = Some(n); if self.peek() != Some('}') { self.pos = save; return Ok(node); } self.advance(); } None => { self.pos = save; return Ok(node); } } } } _ => { self.pos = save; return Ok(node); } } let greedy = if self.peek() == Some('?') { self.advance(); false } else { true }; Ok(Node::Quantifier { node: Box::new(node), min, max, greedy, }) } fn parse_decimal(&mut self) -> Option { let start = self.pos; while let Some(ch) = self.peek() { if ch.is_ascii_digit() { self.advance(); } else { break; } } if self.pos == start { return None; } let s: String = self.chars[start..self.pos].iter().collect(); s.parse().ok() } /// Parse a single atom. fn parse_atom(&mut self) -> Result { match self.peek() { None => Err("unexpected end of pattern".to_string()), Some('.') => { self.advance(); Ok(Node::Dot) } Some('^') => { self.advance(); Ok(Node::Start) } Some('$') => { self.advance(); Ok(Node::End) } Some('\\') => self.parse_escape(), Some('[') => self.parse_char_class(), Some('(') => self.parse_group(), Some(ch) => { self.advance(); Ok(Node::Literal(ch)) } } } /// Parse an escape sequence. fn parse_escape(&mut self) -> Result { self.advance(); // consume '\' match self.advance() { None => Err("unexpected end of pattern after '\\'".to_string()), Some('d') => Ok(Node::CharClass { negated: false, ranges: vec![ClassRange::Predefined(PredefinedClass::Digit)], }), Some('D') => Ok(Node::CharClass { negated: false, ranges: vec![ClassRange::Predefined(PredefinedClass::NonDigit)], }), Some('w') => Ok(Node::CharClass { negated: false, ranges: vec![ClassRange::Predefined(PredefinedClass::Word)], }), Some('W') => Ok(Node::CharClass { negated: false, ranges: vec![ClassRange::Predefined(PredefinedClass::NonWord)], }), Some('s') => Ok(Node::CharClass { negated: false, ranges: vec![ClassRange::Predefined(PredefinedClass::Space)], }), Some('S') => Ok(Node::CharClass { negated: false, ranges: vec![ClassRange::Predefined(PredefinedClass::NonSpace)], }), Some('b') => Ok(Node::WordBoundary), Some('B') => Ok(Node::NonWordBoundary), Some('n') => Ok(Node::Literal('\n')), Some('r') => Ok(Node::Literal('\r')), Some('t') => Ok(Node::Literal('\t')), Some('f') => Ok(Node::Literal('\x0C')), Some('v') => Ok(Node::Literal('\x0B')), Some('0') => Ok(Node::Literal('\0')), Some('x') => { let hi = self.advance().ok_or("expected hex digit")?; let lo = self.advance().ok_or("expected hex digit")?; let code = hex2(hi, lo)?; Ok(Node::Literal(char::from(code))) } Some('u') => { if self.peek() == Some('{') { self.advance(); let mut code_str = String::new(); while self.peek() != Some('}') { code_str.push(self.advance().ok_or("expected '}'")?); } self.advance(); // consume '}' let code = u32::from_str_radix(&code_str, 16).map_err(|_| "invalid unicode escape")?; let ch = char::from_u32(code).ok_or("invalid unicode code point")?; Ok(Node::Literal(ch)) } else { let a = self.advance().ok_or("expected hex digit")?; let b = self.advance().ok_or("expected hex digit")?; let c = self.advance().ok_or("expected hex digit")?; let d = self.advance().ok_or("expected hex digit")?; let code = hex4(a, b, c, d)?; let ch = char::from_u32(code).ok_or("invalid unicode escape")?; Ok(Node::Literal(ch)) } } Some('k') => { // Named backreference \k self.expect('<')?; let mut name = String::new(); while self.peek() != Some('>') { name.push(self.advance().ok_or("expected '>'")?); } self.advance(); // consume '>' // Resolve name to group index. for &(ref n, idx) in &self.group_names { if *n == name { return Ok(Node::Backref(idx)); } } Err(format!("unknown group name '{}'", name)) } Some(ch) if ch.is_ascii_digit() && ch != '0' => { // Numeric backreference \1, \12, etc. let mut num_str = String::new(); num_str.push(ch); while let Some(next) = self.peek() { if next.is_ascii_digit() { num_str.push(next); self.advance(); } else { break; } } let idx: u32 = num_str.parse().map_err(|_| "invalid backreference")?; Ok(Node::Backref(idx)) } // Escaped metacharacters — treat as literal. Some(ch) => Ok(Node::Literal(ch)), } } /// Parse a character class `[...]` or `[^...]`. fn parse_char_class(&mut self) -> Result { self.advance(); // consume '[' let negated = if self.peek() == Some('^') { self.advance(); true } else { false }; let mut ranges = Vec::new(); // Allow ']' as first character in class. if self.peek() == Some(']') { self.advance(); ranges.push(ClassRange::Char(']')); } while self.peek() != Some(']') { if self.peek().is_none() { return Err("unterminated character class".to_string()); } let item = self.parse_class_atom()?; // Check for range `a-b`. if self.peek() == Some('-') && self.chars.get(self.pos + 1) != Some(&']') && self.chars.get(self.pos + 1).is_some() { if let ClassRange::Char(start) = item { self.advance(); // consume '-' let end_item = self.parse_class_atom()?; if let ClassRange::Char(end) = end_item { if start > end { return Err("character class range out of order".to_string()); } ranges.push(ClassRange::Range(start, end)); continue; } else { // Not a valid range, push start, '-', and end_item. ranges.push(ClassRange::Char(start)); ranges.push(ClassRange::Char('-')); ranges.push(end_item); continue; } } } ranges.push(item); } self.advance(); // consume ']' Ok(Node::CharClass { negated, ranges }) } fn parse_class_atom(&mut self) -> Result { match self.peek() { Some('\\') => { self.advance(); // consume '\' match self.advance() { None => Err("unexpected end of class".to_string()), Some('d') => Ok(ClassRange::Predefined(PredefinedClass::Digit)), Some('D') => Ok(ClassRange::Predefined(PredefinedClass::NonDigit)), Some('w') => Ok(ClassRange::Predefined(PredefinedClass::Word)), Some('W') => Ok(ClassRange::Predefined(PredefinedClass::NonWord)), Some('s') => Ok(ClassRange::Predefined(PredefinedClass::Space)), Some('S') => Ok(ClassRange::Predefined(PredefinedClass::NonSpace)), Some('n') => Ok(ClassRange::Char('\n')), Some('r') => Ok(ClassRange::Char('\r')), Some('t') => Ok(ClassRange::Char('\t')), Some('f') => Ok(ClassRange::Char('\x0C')), Some('v') => Ok(ClassRange::Char('\x0B')), Some('0') => Ok(ClassRange::Char('\0')), Some('x') => { let hi = self.advance().ok_or("expected hex digit")?; let lo = self.advance().ok_or("expected hex digit")?; let code = hex2(hi, lo)?; Ok(ClassRange::Char(char::from(code))) } Some('u') => { if self.peek() == Some('{') { self.advance(); let mut code_str = String::new(); while self.peek() != Some('}') { code_str.push(self.advance().ok_or("expected '}'")?); } self.advance(); let code = u32::from_str_radix(&code_str, 16) .map_err(|_| "invalid unicode escape")?; let ch = char::from_u32(code) .ok_or_else(|| "invalid unicode code point".to_string())?; Ok(ClassRange::Char(ch)) } else { let a = self.advance().ok_or("expected hex digit")?; let b = self.advance().ok_or("expected hex digit")?; let c = self.advance().ok_or("expected hex digit")?; let d = self.advance().ok_or("expected hex digit")?; let code = hex4(a, b, c, d)?; let ch = char::from_u32(code) .ok_or_else(|| "invalid unicode escape".to_string())?; Ok(ClassRange::Char(ch)) } } Some(ch) => Ok(ClassRange::Char(ch)), } } Some(ch) => { self.advance(); Ok(ClassRange::Char(ch)) } None => Err("unexpected end of character class".to_string()), } } /// Parse a group `(...)`. fn parse_group(&mut self) -> Result { self.advance(); // consume '(' if self.peek() == Some('?') { self.advance(); // consume '?' match self.peek() { Some(':') => { self.advance(); let inner = self.parse_alternation()?; self.expect(')')?; Ok(Node::NonCapturingGroup(Box::new(inner))) } Some('=') => { self.advance(); let inner = self.parse_alternation()?; self.expect(')')?; Ok(Node::Lookahead(Box::new(inner))) } Some('!') => { self.advance(); let inner = self.parse_alternation()?; self.expect(')')?; Ok(Node::NegativeLookahead(Box::new(inner))) } Some('<') => { self.advance(); // consume '<' let mut name = String::new(); while self.peek() != Some('>') { name.push(self.advance().ok_or("expected '>'")?); } self.advance(); // consume '>' self.group_count += 1; let idx = self.group_count; self.group_names.push((name.clone(), idx)); let inner = self.parse_alternation()?; self.expect(')')?; Ok(Node::Group { index: idx, name: Some(name), node: Box::new(inner), }) } _ => Err("invalid group specifier".to_string()), } } else { self.group_count += 1; let idx = self.group_count; let inner = self.parse_alternation()?; self.expect(')')?; Ok(Node::Group { index: idx, name: None, node: Box::new(inner), }) } } } // ── Hex helpers ───────────────────────────────────────────── fn hex_digit(ch: char) -> Result { match ch { '0'..='9' => Ok(ch as u32 - '0' as u32), 'a'..='f' => Ok(ch as u32 - 'a' as u32 + 10), 'A'..='F' => Ok(ch as u32 - 'A' as u32 + 10), _ => Err(format!("invalid hex digit '{}'", ch)), } } fn hex2(hi: char, lo: char) -> Result { Ok((hex_digit(hi)? * 16 + hex_digit(lo)?) as u8) } fn hex4(a: char, b: char, c: char, d: char) -> Result { Ok(hex_digit(a)? * 4096 + hex_digit(b)? * 256 + hex_digit(c)? * 16 + hex_digit(d)?) } // ── Compiled regex ────────────────────────────────────────── /// A compiled regular expression ready for matching. #[derive(Debug, Clone)] pub struct CompiledRegex { pub pattern: String, pub flags: RegexFlags, pub node: Node, pub group_count: u32, } impl CompiledRegex { pub fn new(pattern: &str, flags_str: &str) -> Result { let flags = RegexFlags::parse(flags_str)?; let (node, group_count) = parse_pattern(pattern)?; Ok(CompiledRegex { pattern: pattern.to_string(), flags, node, group_count, }) } } // ── Backtracking matcher ──────────────────────────────────── /// Result of a successful regex match. #[derive(Debug, Clone)] pub struct MatchResult { /// Overall match start index (in chars). pub start: usize, /// Overall match end index (in chars). pub end: usize, /// Capture group contents: index 0 = full match, 1..n = groups. pub captures: Vec>, } /// Execute the regex on the given string, starting the search at `start_pos` (char index). pub fn exec(regex: &CompiledRegex, input: &str, start_pos: usize) -> Option { let chars: Vec = input.chars().collect(); let group_count = regex.group_count as usize; if regex.flags.sticky { // Sticky: only try at start_pos. let mut captures = vec![None; group_count + 1]; let mut ctx = MatchContext { chars: &chars, flags: ®ex.flags, captures: &mut captures, backtrack_limit: 1_000_000, backtrack_count: 0, }; if let Some(end) = match_node(&mut ctx, ®ex.node, start_pos) { captures = ctx.captures.to_vec(); captures[0] = Some((start_pos, end)); return Some(MatchResult { start: start_pos, end, captures, }); } return None; } // Non-sticky: try each position starting from start_pos. for i in start_pos..=chars.len() { let mut captures = vec![None; group_count + 1]; let mut ctx = MatchContext { chars: &chars, flags: ®ex.flags, captures: &mut captures, backtrack_limit: 1_000_000, backtrack_count: 0, }; if let Some(end) = match_node(&mut ctx, ®ex.node, i) { captures = ctx.captures.to_vec(); captures[0] = Some((i, end)); return Some(MatchResult { start: i, end, captures, }); } } None } struct MatchContext<'a> { chars: &'a [char], flags: &'a RegexFlags, captures: &'a mut Vec>, backtrack_limit: u32, backtrack_count: u32, } /// Try to match `node` at position `pos`, returning the end position on success. fn match_node(ctx: &mut MatchContext, node: &Node, pos: usize) -> Option { ctx.backtrack_count += 1; if ctx.backtrack_count > ctx.backtrack_limit { return None; } match node { Node::Literal(ch) => { if pos < ctx.chars.len() { let input_ch = ctx.chars[pos]; if ctx.flags.ignore_case { if char_eq_ignore_case(input_ch, *ch) { Some(pos + 1) } else { None } } else if input_ch == *ch { Some(pos + 1) } else { None } } else { None } } Node::Dot => { if pos < ctx.chars.len() { let ch = ctx.chars[pos]; if ctx.flags.dot_all || !is_line_terminator(ch) { Some(pos + 1) } else { None } } else { None } } Node::Start => { if pos == 0 || (ctx.flags.multiline && pos > 0 && is_line_terminator(ctx.chars[pos - 1])) { Some(pos) } else { None } } Node::End => { if pos == ctx.chars.len() || (ctx.flags.multiline && pos < ctx.chars.len() && is_line_terminator(ctx.chars[pos])) { Some(pos) } else { None } } Node::WordBoundary => { let before = if pos > 0 { is_word_char(ctx.chars[pos - 1]) } else { false }; let after = if pos < ctx.chars.len() { is_word_char(ctx.chars[pos]) } else { false }; if before != after { Some(pos) } else { None } } Node::NonWordBoundary => { let before = if pos > 0 { is_word_char(ctx.chars[pos - 1]) } else { false }; let after = if pos < ctx.chars.len() { is_word_char(ctx.chars[pos]) } else { false }; if before == after { Some(pos) } else { None } } Node::CharClass { negated, ranges } => { if pos >= ctx.chars.len() { return None; } let ch = ctx.chars[pos]; let matched = class_matches(ranges, ch, ctx.flags.ignore_case); if matched != *negated { Some(pos + 1) } else { None } } Node::Sequence(nodes) => match_sequence(ctx, nodes, 0, pos), Node::Alternation(branches) => { for branch in branches { let saved = ctx.captures.clone(); if let Some(end) = match_node(ctx, branch, pos) { return Some(end); } *ctx.captures = saved; } None } Node::Quantifier { node: inner, min, max, greedy, } => match_quantifier_standalone(ctx, inner, pos, *min, *max, *greedy), Node::Group { index, node: inner, .. } => { let idx = *index as usize; let saved = if idx < ctx.captures.len() { ctx.captures[idx] } else { None }; let result = match_node(ctx, inner, pos); if let Some(end) = result { if idx < ctx.captures.len() { ctx.captures[idx] = Some((pos, end)); } Some(end) } else { if idx < ctx.captures.len() { ctx.captures[idx] = saved; } None } } Node::NonCapturingGroup(inner) => match_node(ctx, inner, pos), Node::Backref(idx) => { let idx = *idx as usize; if idx >= ctx.captures.len() { return Some(pos); // Unmatched backref matches empty. } match ctx.captures[idx] { Some((start, end)) => { let cap_len = end - start; if pos + cap_len > ctx.chars.len() { return None; } for i in 0..cap_len { let a = ctx.chars[start + i]; let b = ctx.chars[pos + i]; if ctx.flags.ignore_case { if !char_eq_ignore_case(a, b) { return None; } } else if a != b { return None; } } Some(pos + cap_len) } None => Some(pos), // Unmatched group — backref matches empty. } } Node::Lookahead(inner) => { let saved = ctx.captures.clone(); if match_node(ctx, inner, pos).is_some() { Some(pos) // Lookahead doesn't consume. } else { *ctx.captures = saved; None } } Node::NegativeLookahead(inner) => { let saved = ctx.captures.clone(); if match_node(ctx, inner, pos).is_some() { *ctx.captures = saved; None } else { *ctx.captures = saved; Some(pos) } } } } /// Match a sequence of nodes with backtracking support for quantifiers. fn match_sequence(ctx: &mut MatchContext, nodes: &[Node], idx: usize, pos: usize) -> Option { if idx >= nodes.len() { return Some(pos); } let node = &nodes[idx]; // For quantifiers, try each count with the remaining sequence as continuation. if let Node::Quantifier { node: inner, min, max, greedy, } = node { return match_quantifier_in_seq(ctx, inner, pos, *min, *max, *greedy, &nodes[idx + 1..]); } // For non-quantifier nodes, match and continue. let saved = ctx.captures.clone(); match match_node(ctx, node, pos) { Some(next) => match_sequence(ctx, nodes, idx + 1, next).or_else(|| { *ctx.captures = saved; None }), None => None, } } /// Match a quantifier within a sequence, trying each count with the continuation. fn match_quantifier_in_seq( ctx: &mut MatchContext, inner: &Node, pos: usize, min: u32, max: Option, greedy: bool, continuation: &[Node], ) -> Option { // Collect all reachable positions (match inner 0 to max times). let mut positions = vec![pos]; // positions[0] = 0 matches let mut cur = pos; loop { let count = positions.len() - 1; if let Some(m) = max { if count as u32 >= m { break; } } let saved = ctx.captures.clone(); match match_node(ctx, inner, cur) { Some(next) if next > cur => { cur = next; positions.push(cur); } Some(next) => { // Zero-width match — record once and stop to prevent infinite loop. *ctx.captures = saved; if next == cur && (count as u32) < min { positions.push(cur); } break; } None => { *ctx.captures = saved; break; } } } let max_count = positions.len() - 1; let min_count = min as usize; if max_count < min_count { return None; } if greedy { // Try from most matches to fewest. for &p in positions[min_count..=max_count].iter().rev() { let saved = ctx.captures.clone(); if let Some(end) = match_sequence(ctx, continuation, 0, p) { return Some(end); } *ctx.captures = saved; } None } else { // Lazy: try from fewest to most. for &p in &positions[min_count..=max_count] { let saved = ctx.captures.clone(); if let Some(end) = match_sequence(ctx, continuation, 0, p) { return Some(end); } *ctx.captures = saved; } None } } /// Match a standalone quantifier (not in a sequence context). fn match_quantifier_standalone( ctx: &mut MatchContext, inner: &Node, pos: usize, min: u32, max: Option, greedy: bool, ) -> Option { let mut positions = vec![pos]; let mut cur = pos; loop { let count = positions.len() - 1; if let Some(m) = max { if count as u32 >= m { break; } } let saved = ctx.captures.clone(); match match_node(ctx, inner, cur) { Some(next) if next > cur => { cur = next; positions.push(cur); } Some(next) => { *ctx.captures = saved; if next == cur && (count as u32) < min { positions.push(cur); } break; } None => { *ctx.captures = saved; break; } } } let max_count = positions.len() - 1; let min_count = min as usize; if max_count < min_count { return None; } if greedy { Some(positions[max_count]) } else { Some(positions[min_count]) } } // ── Character helpers ─────────────────────────────────────── fn is_line_terminator(ch: char) -> bool { matches!(ch, '\n' | '\r' | '\u{2028}' | '\u{2029}') } fn is_word_char(ch: char) -> bool { ch.is_ascii_alphanumeric() || ch == '_' } fn char_eq_ignore_case(a: char, b: char) -> bool { a.eq_ignore_ascii_case(&b) } fn class_matches(ranges: &[ClassRange], ch: char, ignore_case: bool) -> bool { for range in ranges { match range { ClassRange::Char(c) => { if ignore_case { if char_eq_ignore_case(ch, *c) { return true; } } else if ch == *c { return true; } } ClassRange::Range(start, end) => { if ignore_case { let ch_lower = ch.to_ascii_lowercase(); let start_lower = start.to_ascii_lowercase(); let end_lower = end.to_ascii_lowercase(); if ch_lower >= start_lower && ch_lower <= end_lower { return true; } } else if ch >= *start && ch <= *end { return true; } } ClassRange::Predefined(class) => { if predefined_matches(*class, ch) { return true; } } } } false } fn predefined_matches(class: PredefinedClass, ch: char) -> bool { match class { PredefinedClass::Digit => ch.is_ascii_digit(), PredefinedClass::NonDigit => !ch.is_ascii_digit(), PredefinedClass::Word => is_word_char(ch), PredefinedClass::NonWord => !is_word_char(ch), PredefinedClass::Space => matches!( ch, ' ' | '\t' | '\n' | '\r' | '\x0B' | '\x0C' | '\u{00A0}' | '\u{FEFF}' ), PredefinedClass::NonSpace => !matches!( ch, ' ' | '\t' | '\n' | '\r' | '\x0B' | '\x0C' | '\u{00A0}' | '\u{FEFF}' ), } } // ── Tests ─────────────────────────────────────────────────── #[cfg(test)] mod tests { use super::*; fn match_at(pattern: &str, flags: &str, input: &str, start: usize) -> Option { let regex = CompiledRegex::new(pattern, flags).unwrap(); exec(®ex, input, start) } fn assert_match(pattern: &str, input: &str, expected_start: usize, expected_end: usize) { let m = match_at(pattern, "", input, 0).expect("expected match"); assert_eq!( m.start, expected_start, "pattern={} input={}", pattern, input ); assert_eq!(m.end, expected_end, "pattern={} input={}", pattern, input); } fn assert_no_match(pattern: &str, input: &str) { assert!( match_at(pattern, "", input, 0).is_none(), "expected no match: pattern={} input={}", pattern, input ); } #[test] fn test_literal() { assert_match("abc", "xabcx", 1, 4); assert_no_match("abc", "xyz"); } #[test] fn test_dot() { assert_match("a.c", "abc", 0, 3); assert_match("a.c", "axc", 0, 3); assert_no_match("a.c", "a\nc"); // dotAll flag let m = match_at("a.c", "s", "a\nc", 0).unwrap(); assert_eq!(m.start, 0); assert_eq!(m.end, 3); } #[test] fn test_anchors() { assert_match("^abc", "abc", 0, 3); assert_no_match("^abc", "xabc"); assert_match("abc$", "abc", 0, 3); assert_no_match("abc$", "abcx"); } #[test] fn test_quantifiers() { assert_match("a*", "aaa", 0, 3); assert_match("a+", "aaa", 0, 3); assert_no_match("a+", "bbb"); assert_match("a?b", "ab", 0, 2); assert_match("a?b", "b", 0, 1); assert_match("a{2}", "aaa", 0, 2); assert_match("a{2,3}", "aaaa", 0, 3); assert_match("a{2,}", "aaaa", 0, 4); } #[test] fn test_char_class() { assert_match("[abc]", "b", 0, 1); assert_no_match("[abc]", "d"); assert_match("[a-z]", "m", 0, 1); assert_no_match("[a-z]", "M"); assert_match("[^abc]", "d", 0, 1); assert_no_match("[^abc]", "a"); } #[test] fn test_predefined_classes() { assert_match("\\d+", "abc123", 3, 6); assert_match("\\w+", "hello world", 0, 5); assert_match("\\s+", "a b", 1, 2); } #[test] fn test_alternation() { assert_match("cat|dog", "dog", 0, 3); assert_match("cat|dog", "catdog", 0, 3); } #[test] fn test_groups() { let m = match_at("(a)(b)(c)", "", "abc", 0).unwrap(); assert_eq!(m.captures[1], Some((0, 1))); assert_eq!(m.captures[2], Some((1, 2))); assert_eq!(m.captures[3], Some((2, 3))); } #[test] fn test_non_capturing_group() { let m = match_at("(?:ab)(c)", "", "abc", 0).unwrap(); assert_eq!(m.captures[1], Some((2, 3))); } #[test] fn test_named_group() { let m = match_at("(?\\w+)", "", "hello", 0).unwrap(); assert_eq!(m.captures[1], Some((0, 5))); } #[test] fn test_backreference() { assert_match("(a)\\1", "aa", 0, 2); assert_no_match("(a)\\1", "ab"); } #[test] fn test_word_boundary() { assert_match("\\bfoo\\b", "a foo b", 2, 5); assert_no_match("\\bfoo\\b", "afoo"); } #[test] fn test_lookahead() { assert_match("a(?=b)", "ab", 0, 1); assert_no_match("a(?=b)", "ac"); assert_match("a(?!b)", "ac", 0, 1); assert_no_match("a(?!b)", "ab"); } #[test] fn test_ignore_case() { let m = match_at("abc", "i", "ABC", 0).unwrap(); assert_eq!(m.start, 0); assert_eq!(m.end, 3); } #[test] fn test_multiline() { let m = match_at("^b", "m", "a\nb", 0).unwrap(); assert_eq!(m.start, 2); } #[test] fn test_global_multiple() { let regex = CompiledRegex::new("a", "g").unwrap(); let m1 = exec(®ex, "aba", 0).unwrap(); assert_eq!(m1.start, 0); let m2 = exec(®ex, "aba", m1.end).unwrap(); assert_eq!(m2.start, 2); assert!(exec(®ex, "aba", m2.end).is_none()); } #[test] fn test_escape_sequences() { assert_match("\\n", "\n", 0, 1); assert_match("\\t", "\t", 0, 1); assert_match("\\x41", "A", 0, 1); assert_match("\\u0041", "A", 0, 1); } #[test] fn test_lazy_quantifiers() { let m = match_at("a+?", "", "aaa", 0).unwrap(); assert_eq!(m.end, 1); // Lazy: match as few as possible. } #[test] fn test_empty_pattern() { let m = match_at("", "", "abc", 0).unwrap(); assert_eq!(m.start, 0); assert_eq!(m.end, 0); } #[test] fn test_flags_parse() { let f = RegexFlags::parse("gims").unwrap(); assert!(f.global); assert!(f.ignore_case); assert!(f.multiline); assert!(f.dot_all); assert!(!f.unicode); assert!(!f.sticky); assert!(RegexFlags::parse("gg").is_err()); assert!(RegexFlags::parse("x").is_err()); } }