// we (web engine): Experimental web browser project to understand the limits of Claude
1//! JavaScript lexer/tokenizer conforming to ECMAScript 2024.
2//!
3//! Converts JavaScript source text into a stream of [`Token`]s, each annotated
4//! with its [`Span`] (byte offset, line, column).
5
6use std::fmt;
7
/// A position in the source text.
///
/// Both fields are 1-based. `col` is maintained per *byte* (see
/// `Lexer::advance`), so a multi-byte UTF-8 character advances it by more
/// than one.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct SourcePos {
    /// 1-based line number.
    pub line: u32,
    /// 1-based column (in bytes from the start of the line).
    pub col: u32,
}
16
/// A span covering a range of source text.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Span {
    /// Position of the first byte of the token.
    pub start: SourcePos,
    /// Position immediately after the last byte of the token.
    pub end: SourcePos,
}
23
/// A token produced by the lexer.
#[derive(Debug, Clone, PartialEq)]
pub struct Token {
    /// What kind of token this is, including any literal payload.
    pub kind: TokenKind,
    /// The source range the token was scanned from.
    pub span: Span,
    /// Whether at least one newline preceded this token (for ASI).
    pub preceded_by_newline: bool,
}
32
/// Every distinct token kind the lexer can produce.
#[derive(Debug, Clone, PartialEq)]
pub enum TokenKind {
    // ── Literals ──────────────────────────────────────────────
    /// Numeric literal (the parsed `f64` value).
    ///
    /// NOTE(review): BigInt literals (`123n`) are also lexed into this
    /// variant with the `n` suffix dropped, so values beyond 2^53 lose
    /// precision — confirm whether a dedicated BigInt variant is planned.
    Number(f64),
    /// String literal (the decoded content, without quotes).
    String(std::string::String),
    /// Regular expression literal: pattern and flags.
    RegExp {
        pattern: std::string::String,
        flags: std::string::String,
    },
    /// Template literal with no substitutions (full string content).
    TemplateFull(std::string::String),
    /// Opening part of a template literal (before the first `${`).
    TemplateHead(std::string::String),
    /// Middle part of a template literal (between `}` and next `${`).
    TemplateMiddle(std::string::String),
    /// Closing part of a template literal (after the last `}`).
    TemplateTail(std::string::String),

    // ── Identifiers & Keywords ───────────────────────────────
    Identifier(std::string::String),

    // Keywords.
    // NOTE(review): `let`, `static`, `of`, `async` (and `await`/`yield`
    // outside async/generator contexts) are contextual in ECMA-262 but are
    // unconditionally keyword tokens here — the parser must accept them as
    // identifiers where the grammar allows.
    Await,
    Break,
    Case,
    Catch,
    Class,
    Const,
    Continue,
    Debugger,
    Default,
    Delete,
    Do,
    Else,
    Export,
    Extends,
    Finally,
    For,
    Function,
    If,
    Import,
    In,
    Instanceof,
    Let,
    New,
    Of,
    Return,
    Static,
    Super,
    Switch,
    This,
    Throw,
    Try,
    Typeof,
    Var,
    Void,
    While,
    With,
    Yield,
    Async,

    // Literal keywords
    True,
    False,
    Null,

    // ── Punctuators ──────────────────────────────────────────
    // Grouping
    LParen,   // (
    RParen,   // )
    LBracket, // [
    RBracket, // ]
    LBrace,   // {
    RBrace,   // }

    // Delimiters
    Semicolon, // ;
    Comma,     // ,
    Colon,     // :
    Dot,       // .
    Ellipsis,  // ...

    // Arrow
    Arrow, // =>

    // Optional chaining
    QuestionDot, // ?.

    // Ternary
    Question, // ?

    // Assignment
    Assign,        // =
    PlusAssign,    // +=
    MinusAssign,   // -=
    StarAssign,    // *=
    SlashAssign,   // /=
    PercentAssign, // %=
    ExpAssign,     // **=
    AmpAssign,     // &=
    PipeAssign,    // |=
    CaretAssign,   // ^=
    ShlAssign,     // <<=
    ShrAssign,     // >>=
    UshrAssign,    // >>>=
    AndAssign,     // &&=
    OrAssign,      // ||=
    NullishAssign, // ??=

    // Comparison
    Eq,       // ==
    Ne,       // !=
    StrictEq, // ===
    StrictNe, // !==
    Lt,       // <
    Gt,       // >
    Le,       // <=
    Ge,       // >=

    // Arithmetic
    Plus,    // +
    Minus,   // -
    Star,    // *
    Slash,   // /
    Percent, // %
    Exp,     // **

    // Increment / Decrement
    PlusPlus,   // ++
    MinusMinus, // --

    // Bitwise
    Amp,   // &
    Pipe,  // |
    Caret, // ^
    Tilde, // ~
    Shl,   // <<
    Shr,   // >>
    Ushr,  // >>>

    // Logical
    And,     // &&
    Or,      // ||
    Not,     // !
    Nullish, // ??

    // ── Special ──────────────────────────────────────────────
    /// End of input.
    Eof,
}
187
188impl fmt::Display for TokenKind {
189 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
190 match self {
191 TokenKind::Number(n) => write!(f, "{}", n),
192 TokenKind::String(s) => write!(f, "\"{}\"", s),
193 TokenKind::RegExp { pattern, flags } => write!(f, "/{}/{}", pattern, flags),
194 TokenKind::TemplateFull(s) => write!(f, "`{}`", s),
195 TokenKind::TemplateHead(s) => write!(f, "`{}${{", s),
196 TokenKind::TemplateMiddle(s) => write!(f, "}}{}${{", s),
197 TokenKind::TemplateTail(s) => write!(f, "}}{}`", s),
198 TokenKind::Identifier(s) => write!(f, "{}", s),
199 TokenKind::Await => write!(f, "await"),
200 TokenKind::Break => write!(f, "break"),
201 TokenKind::Case => write!(f, "case"),
202 TokenKind::Catch => write!(f, "catch"),
203 TokenKind::Class => write!(f, "class"),
204 TokenKind::Const => write!(f, "const"),
205 TokenKind::Continue => write!(f, "continue"),
206 TokenKind::Debugger => write!(f, "debugger"),
207 TokenKind::Default => write!(f, "default"),
208 TokenKind::Delete => write!(f, "delete"),
209 TokenKind::Do => write!(f, "do"),
210 TokenKind::Else => write!(f, "else"),
211 TokenKind::Export => write!(f, "export"),
212 TokenKind::Extends => write!(f, "extends"),
213 TokenKind::Finally => write!(f, "finally"),
214 TokenKind::For => write!(f, "for"),
215 TokenKind::Function => write!(f, "function"),
216 TokenKind::If => write!(f, "if"),
217 TokenKind::Import => write!(f, "import"),
218 TokenKind::In => write!(f, "in"),
219 TokenKind::Instanceof => write!(f, "instanceof"),
220 TokenKind::Let => write!(f, "let"),
221 TokenKind::New => write!(f, "new"),
222 TokenKind::Of => write!(f, "of"),
223 TokenKind::Return => write!(f, "return"),
224 TokenKind::Static => write!(f, "static"),
225 TokenKind::Super => write!(f, "super"),
226 TokenKind::Switch => write!(f, "switch"),
227 TokenKind::This => write!(f, "this"),
228 TokenKind::Throw => write!(f, "throw"),
229 TokenKind::Try => write!(f, "try"),
230 TokenKind::Typeof => write!(f, "typeof"),
231 TokenKind::Var => write!(f, "var"),
232 TokenKind::Void => write!(f, "void"),
233 TokenKind::While => write!(f, "while"),
234 TokenKind::With => write!(f, "with"),
235 TokenKind::Yield => write!(f, "yield"),
236 TokenKind::Async => write!(f, "async"),
237 TokenKind::True => write!(f, "true"),
238 TokenKind::False => write!(f, "false"),
239 TokenKind::Null => write!(f, "null"),
240 TokenKind::LParen => write!(f, "("),
241 TokenKind::RParen => write!(f, ")"),
242 TokenKind::LBracket => write!(f, "["),
243 TokenKind::RBracket => write!(f, "]"),
244 TokenKind::LBrace => write!(f, "{{"),
245 TokenKind::RBrace => write!(f, "}}"),
246 TokenKind::Semicolon => write!(f, ";"),
247 TokenKind::Comma => write!(f, ","),
248 TokenKind::Colon => write!(f, ":"),
249 TokenKind::Dot => write!(f, "."),
250 TokenKind::Ellipsis => write!(f, "..."),
251 TokenKind::Arrow => write!(f, "=>"),
252 TokenKind::QuestionDot => write!(f, "?."),
253 TokenKind::Question => write!(f, "?"),
254 TokenKind::Assign => write!(f, "="),
255 TokenKind::PlusAssign => write!(f, "+="),
256 TokenKind::MinusAssign => write!(f, "-="),
257 TokenKind::StarAssign => write!(f, "*="),
258 TokenKind::SlashAssign => write!(f, "/="),
259 TokenKind::PercentAssign => write!(f, "%="),
260 TokenKind::ExpAssign => write!(f, "**="),
261 TokenKind::AmpAssign => write!(f, "&="),
262 TokenKind::PipeAssign => write!(f, "|="),
263 TokenKind::CaretAssign => write!(f, "^="),
264 TokenKind::ShlAssign => write!(f, "<<="),
265 TokenKind::ShrAssign => write!(f, ">>="),
266 TokenKind::UshrAssign => write!(f, ">>>="),
267 TokenKind::AndAssign => write!(f, "&&="),
268 TokenKind::OrAssign => write!(f, "||="),
269 TokenKind::NullishAssign => write!(f, "??="),
270 TokenKind::Eq => write!(f, "=="),
271 TokenKind::Ne => write!(f, "!="),
272 TokenKind::StrictEq => write!(f, "==="),
273 TokenKind::StrictNe => write!(f, "!=="),
274 TokenKind::Lt => write!(f, "<"),
275 TokenKind::Gt => write!(f, ">"),
276 TokenKind::Le => write!(f, "<="),
277 TokenKind::Ge => write!(f, ">="),
278 TokenKind::Plus => write!(f, "+"),
279 TokenKind::Minus => write!(f, "-"),
280 TokenKind::Star => write!(f, "*"),
281 TokenKind::Slash => write!(f, "/"),
282 TokenKind::Percent => write!(f, "%"),
283 TokenKind::Exp => write!(f, "**"),
284 TokenKind::PlusPlus => write!(f, "++"),
285 TokenKind::MinusMinus => write!(f, "--"),
286 TokenKind::Amp => write!(f, "&"),
287 TokenKind::Pipe => write!(f, "|"),
288 TokenKind::Caret => write!(f, "^"),
289 TokenKind::Tilde => write!(f, "~"),
290 TokenKind::Shl => write!(f, "<<"),
291 TokenKind::Shr => write!(f, ">>"),
292 TokenKind::Ushr => write!(f, ">>>"),
293 TokenKind::And => write!(f, "&&"),
294 TokenKind::Or => write!(f, "||"),
295 TokenKind::Not => write!(f, "!"),
296 TokenKind::Nullish => write!(f, "??"),
297 TokenKind::Eof => write!(f, "<EOF>"),
298 }
299 }
300}
301
/// The lexer converts JavaScript source text into tokens.
pub struct Lexer<'a> {
    /// Source text as bytes. Always valid UTF-8 (constructed from a `&str`
    /// in [`Lexer::new`]).
    source: &'a [u8],
    /// Current byte offset into `source`.
    pos: usize,
    /// Current 1-based line number.
    line: u32,
    /// Current 1-based column (byte offset from line start).
    col: u32,
    /// Whether we have crossed at least one newline since the last token.
    saw_newline: bool,
    /// Nesting depth for template literal `${...}` expressions.
    /// When > 0, a `}` at the matching depth resumes template scanning.
    template_depth: u32,
    /// Stack tracking brace depth at each template nesting level.
    /// When we enter `${`, we push the current brace depth.
    template_brace_stack: Vec<u32>,
    /// Current brace depth (incremented on `{`, decremented on `}`).
    brace_depth: u32,
    /// Tracks whether the previous token could end an expression.
    /// Used to disambiguate `/` as division vs RegExp.
    prev_token_is_expr_end: bool,
}
325
326impl<'a> Lexer<'a> {
327 /// Create a new lexer for the given source text.
328 pub fn new(source: &'a str) -> Self {
329 Self {
330 source: source.as_bytes(),
331 pos: 0,
332 line: 1,
333 col: 1,
334 saw_newline: false,
335 template_depth: 0,
336 template_brace_stack: Vec::new(),
337 brace_depth: 0,
338 prev_token_is_expr_end: false,
339 }
340 }
341
342 /// Tokenize the entire source and return all tokens (including final `Eof`).
343 pub fn tokenize(source: &str) -> Result<Vec<Token>, LexError> {
344 let mut lexer = Lexer::new(source);
345 let mut tokens = Vec::new();
346 loop {
347 let tok = lexer.next_token()?;
348 let is_eof = tok.kind == TokenKind::Eof;
349 tokens.push(tok);
350 if is_eof {
351 break;
352 }
353 }
354 Ok(tokens)
355 }
356
357 // ── Helpers ──────────────────────────────────────────────
358
359 fn current_pos(&self) -> SourcePos {
360 SourcePos {
361 line: self.line,
362 col: self.col,
363 }
364 }
365
366 fn peek(&self) -> Option<u8> {
367 self.source.get(self.pos).copied()
368 }
369
370 fn peek_at(&self, offset: usize) -> Option<u8> {
371 self.source.get(self.pos + offset).copied()
372 }
373
374 fn advance(&mut self) -> Option<u8> {
375 let b = self.source.get(self.pos).copied()?;
376 self.pos += 1;
377 if b == b'\n' {
378 self.line += 1;
379 self.col = 1;
380 self.saw_newline = true;
381 } else {
382 self.col += 1;
383 }
384 Some(b)
385 }
386
387 fn advance_if(&mut self, expected: u8) -> bool {
388 if self.peek() == Some(expected) {
389 self.advance();
390 true
391 } else {
392 false
393 }
394 }
395
396 fn slice(&self, start: usize, end: usize) -> &'a str {
397 // Safety: we only slice at positions we've already walked over,
398 // and we trust the input to be valid UTF-8 at identifier/keyword
399 // boundaries. In practice this is safe because the lexer only
400 // slices ASCII-compatible byte sequences.
401 std::str::from_utf8(&self.source[start..end]).unwrap_or("")
402 }
403
404 // ── Whitespace & Comments ────────────────────────────────
405
    /// Skip whitespace and comments; fails only on an unterminated `/* */`.
    ///
    /// Recognizes ASCII whitespace, U+00A0 (NBSP) and the U+FEFF BOM.
    /// NOTE(review): other Unicode whitespace and the U+2028/U+2029 line
    /// separators are not handled here — confirm whether that matters for
    /// ASI before relying on this for conformance.
    fn skip_whitespace_and_comments(&mut self) -> Result<(), LexError> {
        loop {
            match self.peek() {
                Some(b' ' | b'\t' | b'\r' | b'\n') => {
                    self.advance();
                }
                // Unicode BOM / non-breaking spaces
                Some(0xC2) if self.peek_at(1) == Some(0xA0) => {
                    // U+00A0 non-breaking space (2-byte UTF-8)
                    self.advance();
                    self.advance();
                }
                Some(0xEF) if self.peek_at(1) == Some(0xBB) && self.peek_at(2) == Some(0xBF) => {
                    // BOM U+FEFF
                    self.advance();
                    self.advance();
                    self.advance();
                }
                Some(b'/') => {
                    match self.peek_at(1) {
                        Some(b'/') => {
                            // single-line comment — the terminating `\n` is
                            // left for the whitespace arm so that newline
                            // bookkeeping stays inside `advance`.
                            self.advance(); // /
                            self.advance(); // /
                            while let Some(b) = self.peek() {
                                if b == b'\n' {
                                    break;
                                }
                                self.advance();
                            }
                        }
                        Some(b'*') => {
                            // multi-line comment
                            let start = self.current_pos();
                            self.advance(); // /
                            self.advance(); // *
                            let mut closed = false;
                            while let Some(b) = self.advance() {
                                if b == b'*' && self.peek() == Some(b'/') {
                                    self.advance(); // /
                                    closed = true;
                                    break;
                                }
                            }
                            if !closed {
                                return Err(LexError {
                                    message: "unterminated block comment".into(),
                                    pos: start,
                                });
                            }
                        }
                        // A lone `/` is a real token (division or RegExp),
                        // not trivia — stop skipping.
                        _ => break,
                    }
                }
                _ => break,
            }
        }
        Ok(())
    }
465
466 // ── Main dispatch ────────────────────────────────────────
467
    /// Produce the next token.
    ///
    /// Skips leading trivia, then dispatches on the first byte. Once the
    /// input is exhausted this keeps returning `Eof` tokens.
    pub fn next_token(&mut self) -> Result<Token, LexError> {
        // Reset per-token newline tracking; `advance` re-sets it when the
        // trivia skip below crosses a line break.
        self.saw_newline = false;
        self.skip_whitespace_and_comments()?;

        let start = self.current_pos();

        let Some(b) = self.peek() else {
            return Ok(Token {
                kind: TokenKind::Eof,
                span: Span {
                    start,
                    end: self.current_pos(),
                },
                preceded_by_newline: self.saw_newline,
            });
        };

        // If we're inside a template `${...}` and hit the matching `}`,
        // resume template scanning.
        if b == b'}'
            && !self.template_brace_stack.is_empty()
            && self.brace_depth == *self.template_brace_stack.last().unwrap()
        {
            self.template_brace_stack.pop();
            self.template_depth -= 1;
            self.advance(); // consume }
            return self.scan_template_continuation(start);
        }

        let kind = match b {
            b'`' => {
                self.advance();
                // Template scanning builds its own Token (it manages
                // `prev_token_is_expr_end` itself), so return directly.
                return self.scan_template_start(start);
            }

            b'0'..=b'9' => self.scan_number()?,
            // A leading `.` starts a number only when a digit follows
            // (e.g. `.5`); otherwise it is the `.`/`...` punctuator.
            b'.' if matches!(self.peek_at(1), Some(b'0'..=b'9')) => self.scan_number()?,

            b'"' | b'\'' => self.scan_string()?,

            b'a'..=b'z' | b'A'..=b'Z' | b'_' | b'$' => self.scan_identifier_or_keyword(),
            // UTF-8 multi-byte identifier start
            0xC0..=0xF7 if is_unicode_id_start(self.source, self.pos) => {
                self.scan_identifier_or_keyword()
            }

            // `/` is a RegExp literal only where an expression may begin;
            // after an expression-ending token it is division
            // (see `token_is_expr_end`).
            b'/' if !self.prev_token_is_expr_end => self.scan_regexp()?,

            _ => self.scan_punctuator()?,
        };

        let end = self.current_pos();
        let preceded_by_newline = self.saw_newline;

        // Track whether this token ends an expression (for `/` disambiguation).
        self.prev_token_is_expr_end = token_is_expr_end(&kind);

        Ok(Token {
            kind,
            span: Span { start, end },
            preceded_by_newline,
        })
    }
532
533 // ── Numbers ──────────────────────────────────────────────
534
535 fn scan_number(&mut self) -> Result<TokenKind, LexError> {
536 let start = self.pos;
537
538 if self.peek() == Some(b'0') {
539 match self.peek_at(1) {
540 Some(b'x' | b'X') => return self.scan_hex_number(),
541 Some(b'o' | b'O') => return self.scan_octal_number(),
542 Some(b'b' | b'B') => return self.scan_binary_number(),
543 _ => {}
544 }
545 }
546
547 // Decimal integer or float
548 self.eat_decimal_digits();
549
550 if self.peek() == Some(b'.') {
551 // Could be `1..toString()` — only consume `.` if followed by a digit
552 // or if this is a leading dot (start already has a digit, so peek is safe).
553 // Actually, `1.` is a valid numeric literal (= 1.0), and `1.e2` = 100.
554 // We consume the dot always unless it's `..` (spread).
555 if self.peek_at(1) != Some(b'.') {
556 self.advance(); // .
557 self.eat_decimal_digits();
558 }
559 }
560
561 // Exponent
562 if matches!(self.peek(), Some(b'e' | b'E')) {
563 self.advance();
564 if matches!(self.peek(), Some(b'+' | b'-')) {
565 self.advance();
566 }
567 if !matches!(self.peek(), Some(b'0'..=b'9')) {
568 return Err(LexError {
569 message: "expected digit after exponent".into(),
570 pos: self.current_pos(),
571 });
572 }
573 self.eat_decimal_digits();
574 }
575
576 // BigInt suffix `n` — we tokenize it but store as f64 (for now)
577 self.advance_if(b'n');
578
579 let text = self.slice(start, self.pos);
580 let value = parse_decimal(text);
581 Ok(TokenKind::Number(value))
582 }
583
584 fn scan_hex_number(&mut self) -> Result<TokenKind, LexError> {
585 self.advance(); // 0
586 self.advance(); // x/X
587 let digit_start = self.pos;
588 self.eat_hex_digits();
589 if self.pos == digit_start {
590 return Err(LexError {
591 message: "expected hex digit after 0x".into(),
592 pos: self.current_pos(),
593 });
594 }
595 self.advance_if(b'n');
596 let text = self.slice(digit_start, self.pos);
597 let text = text.trim_end_matches('n');
598 let value = u64_from_hex(text) as f64;
599 Ok(TokenKind::Number(value))
600 }
601
602 fn scan_octal_number(&mut self) -> Result<TokenKind, LexError> {
603 self.advance(); // 0
604 self.advance(); // o/O
605 let digit_start = self.pos;
606 while matches!(self.peek(), Some(b'0'..=b'7' | b'_')) {
607 self.advance();
608 }
609 if self.pos == digit_start {
610 return Err(LexError {
611 message: "expected octal digit after 0o".into(),
612 pos: self.current_pos(),
613 });
614 }
615 self.advance_if(b'n');
616 let text = self.slice(digit_start, self.pos).trim_end_matches('n');
617 let value = u64_from_octal(text) as f64;
618 Ok(TokenKind::Number(value))
619 }
620
621 fn scan_binary_number(&mut self) -> Result<TokenKind, LexError> {
622 self.advance(); // 0
623 self.advance(); // b/B
624 let digit_start = self.pos;
625 while matches!(self.peek(), Some(b'0' | b'1' | b'_')) {
626 self.advance();
627 }
628 if self.pos == digit_start {
629 return Err(LexError {
630 message: "expected binary digit after 0b".into(),
631 pos: self.current_pos(),
632 });
633 }
634 self.advance_if(b'n');
635 let text = self.slice(digit_start, self.pos).trim_end_matches('n');
636 let value = u64_from_binary(text) as f64;
637 Ok(TokenKind::Number(value))
638 }
639
640 fn eat_decimal_digits(&mut self) {
641 while matches!(self.peek(), Some(b'0'..=b'9' | b'_')) {
642 self.advance();
643 }
644 }
645
646 fn eat_hex_digits(&mut self) {
647 while matches!(
648 self.peek(),
649 Some(b'0'..=b'9' | b'a'..=b'f' | b'A'..=b'F' | b'_')
650 ) {
651 self.advance();
652 }
653 }
654
655 // ── Strings ──────────────────────────────────────────────
656
657 fn scan_string(&mut self) -> Result<TokenKind, LexError> {
658 let quote = self.advance().unwrap(); // opening quote
659 let start_pos = self.current_pos();
660 let mut value = std::string::String::new();
661
662 loop {
663 match self.peek() {
664 None | Some(b'\n') => {
665 return Err(LexError {
666 message: "unterminated string literal".into(),
667 pos: start_pos,
668 });
669 }
670 Some(b) if b == quote => {
671 self.advance();
672 break;
673 }
674 Some(b'\\') => {
675 self.advance(); // backslash
676 if let Some(ch) = self.scan_escape_sequence()? {
677 value.push(ch);
678 }
679 }
680 Some(_) => {
681 let ch = self.advance_char();
682 value.push(ch);
683 }
684 }
685 }
686
687 Ok(TokenKind::String(value))
688 }
689
690 /// Scan an escape sequence after the backslash has been consumed.
691 /// Returns `None` for line continuations (`\<newline>`), which produce no character.
692 fn scan_escape_sequence(&mut self) -> Result<Option<char>, LexError> {
693 let pos = self.current_pos();
694 match self.advance() {
695 Some(b'n') => Ok(Some('\n')),
696 Some(b'r') => Ok(Some('\r')),
697 Some(b't') => Ok(Some('\t')),
698 Some(b'b') => Ok(Some('\u{0008}')),
699 Some(b'f') => Ok(Some('\u{000C}')),
700 Some(b'v') => Ok(Some('\u{000B}')),
701 Some(b'0') if !matches!(self.peek(), Some(b'0'..=b'9')) => Ok(Some('\0')),
702 Some(b'\\') => Ok(Some('\\')),
703 Some(b'\'') => Ok(Some('\'')),
704 Some(b'"') => Ok(Some('"')),
705 Some(b'`') => Ok(Some('`')),
706 // Line continuation: \<newline> produces no character
707 Some(b'\n') => Ok(None),
708 Some(b'\r') => {
709 self.advance_if(b'\n');
710 Ok(None)
711 }
712 Some(b'x') => {
713 let hi = self.advance().and_then(hex_digit_val).ok_or(LexError {
714 message: "invalid hex escape".into(),
715 pos,
716 })?;
717 let lo = self.advance().and_then(hex_digit_val).ok_or(LexError {
718 message: "invalid hex escape".into(),
719 pos,
720 })?;
721 let code = (hi << 4) | lo;
722 Ok(Some(code as char))
723 }
724 Some(b'u') => self.scan_unicode_escape(pos).map(Some),
725 Some(b) => {
726 // identity escape
727 Ok(Some(b as char))
728 }
729 None => Err(LexError {
730 message: "unexpected end of input in escape sequence".into(),
731 pos,
732 }),
733 }
734 }
735
    /// Scan a Unicode escape after `\u` has been consumed: either the
    /// four-digit form `\uXXXX` or the braced form `\u{X…}` (one or more
    /// digits, at most U+10FFFF).
    ///
    /// NOTE(review): lone surrogate escapes (e.g. `\uD800`) are rejected
    /// because `char::from_u32` returns `None` for surrogate code points,
    /// whereas JS strings may contain unpaired surrogates — confirm this
    /// restriction is acceptable for the engine's string model.
    fn scan_unicode_escape(&mut self, pos: SourcePos) -> Result<char, LexError> {
        if self.advance_if(b'{') {
            // \u{XXXXX}
            let mut code: u32 = 0;
            let mut count = 0;
            while let Some(b) = self.peek() {
                if b == b'}' {
                    break;
                }
                let d = hex_digit_val(self.advance().unwrap()).ok_or(LexError {
                    message: "invalid unicode escape".into(),
                    pos,
                })?;
                code = code * 16 + d as u32;
                count += 1;
                // Reject inside the loop so a long digit run cannot
                // overflow `code`.
                if code > 0x10FFFF {
                    return Err(LexError {
                        message: "unicode escape out of range".into(),
                        pos,
                    });
                }
            }
            // Require at least one digit and the closing brace.
            if count == 0 || !self.advance_if(b'}') {
                return Err(LexError {
                    message: "invalid unicode escape".into(),
                    pos,
                });
            }
            char::from_u32(code).ok_or(LexError {
                message: "invalid unicode code point".into(),
                pos,
            })
        } else {
            // \uXXXX — exactly four hex digits
            let mut code: u32 = 0;
            for _ in 0..4 {
                let d = self.advance().and_then(hex_digit_val).ok_or(LexError {
                    message: "invalid unicode escape".into(),
                    pos,
                })?;
                code = code * 16 + d as u32;
            }
            char::from_u32(code).ok_or(LexError {
                message: "invalid unicode code point".into(),
                pos,
            })
        }
    }
784
    /// Advance one full UTF-8 character and return it.
    ///
    /// The sequence length is derived from the lead byte; the source comes
    /// from a `&str` (see [`Lexer::new`]), so sequences are well-formed.
    /// Decoding falls back to U+FFFD instead of panicking.
    fn advance_char(&mut self) -> char {
        let start = self.pos;
        let b = self.advance().unwrap();
        if b < 0x80 {
            return b as char;
        }
        // multi-byte: determine length from the lead byte
        let len = if b >= 0xF0 {
            4
        } else if b >= 0xE0 {
            3
        } else {
            2
        };
        for _ in 1..len {
            self.advance();
        }
        let s = std::str::from_utf8(&self.source[start..self.pos]).unwrap_or("\u{FFFD}");
        s.chars().next().unwrap_or('\u{FFFD}')
    }
806
807 // ── Template Literals ────────────────────────────────────
808
809 fn scan_template_start(&mut self, start: SourcePos) -> Result<Token, LexError> {
810 let mut value = std::string::String::new();
811 loop {
812 match self.peek() {
813 None => {
814 return Err(LexError {
815 message: "unterminated template literal".into(),
816 pos: start,
817 });
818 }
819 Some(b'`') => {
820 self.advance();
821 let end = self.current_pos();
822 let kind = TokenKind::TemplateFull(value);
823 self.prev_token_is_expr_end = true;
824 return Ok(Token {
825 kind,
826 span: Span { start, end },
827 preceded_by_newline: self.saw_newline,
828 });
829 }
830 Some(b'$') if self.peek_at(1) == Some(b'{') => {
831 self.advance(); // $
832 self.advance(); // {
833 self.template_depth += 1;
834 self.template_brace_stack.push(self.brace_depth);
835 let end = self.current_pos();
836 let kind = TokenKind::TemplateHead(value);
837 self.prev_token_is_expr_end = false;
838 return Ok(Token {
839 kind,
840 span: Span { start, end },
841 preceded_by_newline: self.saw_newline,
842 });
843 }
844 Some(b'\\') => {
845 self.advance();
846 if let Some(ch) = self.scan_escape_sequence()? {
847 value.push(ch);
848 }
849 }
850 Some(_) => {
851 let ch = self.advance_char();
852 value.push(ch);
853 }
854 }
855 }
856 }
857
858 fn scan_template_continuation(&mut self, start: SourcePos) -> Result<Token, LexError> {
859 let mut value = std::string::String::new();
860 loop {
861 match self.peek() {
862 None => {
863 return Err(LexError {
864 message: "unterminated template literal".into(),
865 pos: start,
866 });
867 }
868 Some(b'`') => {
869 self.advance();
870 let end = self.current_pos();
871 let kind = TokenKind::TemplateTail(value);
872 self.prev_token_is_expr_end = true;
873 return Ok(Token {
874 kind,
875 span: Span { start, end },
876 preceded_by_newline: self.saw_newline,
877 });
878 }
879 Some(b'$') if self.peek_at(1) == Some(b'{') => {
880 self.advance(); // $
881 self.advance(); // {
882 self.template_depth += 1;
883 self.template_brace_stack.push(self.brace_depth);
884 let end = self.current_pos();
885 let kind = TokenKind::TemplateMiddle(value);
886 self.prev_token_is_expr_end = false;
887 return Ok(Token {
888 kind,
889 span: Span { start, end },
890 preceded_by_newline: self.saw_newline,
891 });
892 }
893 Some(b'\\') => {
894 self.advance();
895 if let Some(ch) = self.scan_escape_sequence()? {
896 value.push(ch);
897 }
898 }
899 Some(_) => {
900 let ch = self.advance_char();
901 value.push(ch);
902 }
903 }
904 }
905 }
906
907 // ── Identifiers & Keywords ───────────────────────────────
908
909 fn scan_identifier_or_keyword(&mut self) -> TokenKind {
910 let start = self.pos;
911
912 // Consume the first character (which we already validated)
913 self.advance_char();
914
915 // Consume continue characters
916 while self.pos < self.source.len() {
917 let b = self.source[self.pos];
918 match b {
919 b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' | b'_' | b'$' => {
920 self.advance();
921 }
922 0xC0..=0xF7 if is_unicode_id_continue(self.source, self.pos) => {
923 self.advance_char();
924 }
925 _ => break,
926 }
927 }
928
929 let text = self.slice(start, self.pos);
930 keyword_or_ident(text)
931 }
932
933 // ── Regular Expressions ──────────────────────────────────
934
935 fn scan_regexp(&mut self) -> Result<TokenKind, LexError> {
936 let start_pos = self.current_pos();
937 self.advance(); // opening /
938
939 let mut pattern = std::string::String::new();
940 let mut in_class = false;
941
942 loop {
943 match self.peek() {
944 None | Some(b'\n') => {
945 return Err(LexError {
946 message: "unterminated regexp literal".into(),
947 pos: start_pos,
948 });
949 }
950 Some(b'/') if !in_class => {
951 self.advance();
952 break;
953 }
954 Some(b'[') => {
955 in_class = true;
956 pattern.push('[');
957 self.advance();
958 }
959 Some(b']') if in_class => {
960 in_class = false;
961 pattern.push(']');
962 self.advance();
963 }
964 Some(b'\\') => {
965 self.advance();
966 pattern.push('\\');
967 if let Some(b2) = self.peek() {
968 if b2 != b'\n' {
969 pattern.push(b2 as char);
970 self.advance();
971 }
972 }
973 }
974 Some(b) => {
975 pattern.push(b as char);
976 self.advance();
977 }
978 }
979 }
980
981 // Flags
982 let mut flags = std::string::String::new();
983 while matches!(
984 self.peek(),
985 Some(b'g' | b'i' | b'm' | b's' | b'u' | b'v' | b'y' | b'd')
986 ) {
987 flags.push(self.advance().unwrap() as char);
988 }
989
990 Ok(TokenKind::RegExp { pattern, flags })
991 }
992
993 // ── Punctuators ──────────────────────────────────────────
994
    /// Scan a single- or multi-character punctuator (longest match wins).
    ///
    /// Also maintains `brace_depth`, which `next_token` compares against
    /// `template_brace_stack` to find the `}` that closes a template
    /// `${...}` substitution.
    fn scan_punctuator(&mut self) -> Result<TokenKind, LexError> {
        let pos = self.current_pos();
        let b = self.advance().unwrap();

        let kind = match b {
            b'(' => TokenKind::LParen,
            b')' => TokenKind::RParen,
            b'[' => TokenKind::LBracket,
            b']' => TokenKind::RBracket,
            b'{' => {
                self.brace_depth += 1;
                TokenKind::LBrace
            }
            b'}' => {
                // saturating_sub: never underflow on unbalanced input.
                self.brace_depth = self.brace_depth.saturating_sub(1);
                TokenKind::RBrace
            }
            b';' => TokenKind::Semicolon,
            b',' => TokenKind::Comma,
            b':' => TokenKind::Colon,
            b'~' => TokenKind::Tilde,

            b'.' => {
                if self.peek() == Some(b'.') && self.peek_at(1) == Some(b'.') {
                    self.advance();
                    self.advance();
                    TokenKind::Ellipsis
                } else {
                    TokenKind::Dot
                }
            }

            b'?' => {
                if self.advance_if(b'?') {
                    if self.advance_if(b'=') {
                        TokenKind::NullishAssign
                    } else {
                        TokenKind::Nullish
                    }
                // `?.5` must lex as `?` then `.5` (ternary with a number),
                // so `?.` only forms when no digit follows the dot.
                } else if self.peek() == Some(b'.') && !matches!(self.peek_at(1), Some(b'0'..=b'9'))
                {
                    self.advance();
                    TokenKind::QuestionDot
                } else {
                    TokenKind::Question
                }
            }

            b'+' => {
                if self.advance_if(b'+') {
                    TokenKind::PlusPlus
                } else if self.advance_if(b'=') {
                    TokenKind::PlusAssign
                } else {
                    TokenKind::Plus
                }
            }

            b'-' => {
                if self.advance_if(b'-') {
                    TokenKind::MinusMinus
                } else if self.advance_if(b'=') {
                    TokenKind::MinusAssign
                } else {
                    TokenKind::Minus
                }
            }

            b'*' => {
                if self.advance_if(b'*') {
                    if self.advance_if(b'=') {
                        TokenKind::ExpAssign
                    } else {
                        TokenKind::Exp
                    }
                } else if self.advance_if(b'=') {
                    TokenKind::StarAssign
                } else {
                    TokenKind::Star
                }
            }

            b'/' => {
                // We only get here for division (regexp was handled earlier)
                if self.advance_if(b'=') {
                    TokenKind::SlashAssign
                } else {
                    TokenKind::Slash
                }
            }

            b'%' => {
                if self.advance_if(b'=') {
                    TokenKind::PercentAssign
                } else {
                    TokenKind::Percent
                }
            }

            b'=' => {
                if self.advance_if(b'=') {
                    if self.advance_if(b'=') {
                        TokenKind::StrictEq
                    } else {
                        TokenKind::Eq
                    }
                } else if self.advance_if(b'>') {
                    TokenKind::Arrow
                } else {
                    TokenKind::Assign
                }
            }

            b'!' => {
                if self.advance_if(b'=') {
                    if self.advance_if(b'=') {
                        TokenKind::StrictNe
                    } else {
                        TokenKind::Ne
                    }
                } else {
                    TokenKind::Not
                }
            }

            b'<' => {
                if self.advance_if(b'<') {
                    if self.advance_if(b'=') {
                        TokenKind::ShlAssign
                    } else {
                        TokenKind::Shl
                    }
                } else if self.advance_if(b'=') {
                    TokenKind::Le
                } else {
                    TokenKind::Lt
                }
            }

            b'>' => {
                if self.advance_if(b'>') {
                    if self.advance_if(b'>') {
                        if self.advance_if(b'=') {
                            TokenKind::UshrAssign
                        } else {
                            TokenKind::Ushr
                        }
                    } else if self.advance_if(b'=') {
                        TokenKind::ShrAssign
                    } else {
                        TokenKind::Shr
                    }
                } else if self.advance_if(b'=') {
                    TokenKind::Ge
                } else {
                    TokenKind::Gt
                }
            }

            b'&' => {
                if self.advance_if(b'&') {
                    if self.advance_if(b'=') {
                        TokenKind::AndAssign
                    } else {
                        TokenKind::And
                    }
                } else if self.advance_if(b'=') {
                    TokenKind::AmpAssign
                } else {
                    TokenKind::Amp
                }
            }

            b'|' => {
                if self.advance_if(b'|') {
                    if self.advance_if(b'=') {
                        TokenKind::OrAssign
                    } else {
                        TokenKind::Or
                    }
                } else if self.advance_if(b'=') {
                    TokenKind::PipeAssign
                } else {
                    TokenKind::Pipe
                }
            }

            b'^' => {
                if self.advance_if(b'=') {
                    TokenKind::CaretAssign
                } else {
                    TokenKind::Caret
                }
            }

            _ => {
                return Err(LexError {
                    message: format!("unexpected character: {:?}", b as char),
                    pos,
                });
            }
        };

        Ok(kind)
    }
1200}
1201
1202// ── Keyword lookup ───────────────────────────────────────────
1203
/// Map an identifier's text to its keyword token, or wrap it in
/// `Identifier` when it is not a reserved word.
///
/// NOTE(review): `let`, `static`, `of`, `async` (and `await`/`yield` in
/// some contexts) are contextual keywords in ECMA-262 but are produced
/// unconditionally as keyword tokens here — the parser must re-interpret
/// them where they are valid identifiers.
fn keyword_or_ident(s: &str) -> TokenKind {
    match s {
        "await" => TokenKind::Await,
        "break" => TokenKind::Break,
        "case" => TokenKind::Case,
        "catch" => TokenKind::Catch,
        "class" => TokenKind::Class,
        "const" => TokenKind::Const,
        "continue" => TokenKind::Continue,
        "debugger" => TokenKind::Debugger,
        "default" => TokenKind::Default,
        "delete" => TokenKind::Delete,
        "do" => TokenKind::Do,
        "else" => TokenKind::Else,
        "export" => TokenKind::Export,
        "extends" => TokenKind::Extends,
        "finally" => TokenKind::Finally,
        "for" => TokenKind::For,
        "function" => TokenKind::Function,
        "if" => TokenKind::If,
        "import" => TokenKind::Import,
        "in" => TokenKind::In,
        "instanceof" => TokenKind::Instanceof,
        "let" => TokenKind::Let,
        "new" => TokenKind::New,
        "of" => TokenKind::Of,
        "return" => TokenKind::Return,
        "static" => TokenKind::Static,
        "super" => TokenKind::Super,
        "switch" => TokenKind::Switch,
        "this" => TokenKind::This,
        "throw" => TokenKind::Throw,
        "try" => TokenKind::Try,
        "typeof" => TokenKind::Typeof,
        "var" => TokenKind::Var,
        "void" => TokenKind::Void,
        "while" => TokenKind::While,
        "with" => TokenKind::With,
        "yield" => TokenKind::Yield,
        "async" => TokenKind::Async,
        "true" => TokenKind::True,
        "false" => TokenKind::False,
        "null" => TokenKind::Null,
        _ => TokenKind::Identifier(s.to_owned()),
    }
}
1250
1251// ── Expression-end tracking ──────────────────────────────────
1252
1253/// Returns `true` if a token of this kind could end an expression.
1254/// Used to decide whether a following `/` is division or a RegExp literal.
1255fn token_is_expr_end(kind: &TokenKind) -> bool {
1256 matches!(
1257 kind,
1258 TokenKind::Identifier(_)
1259 | TokenKind::Number(_)
1260 | TokenKind::String(_)
1261 | TokenKind::TemplateFull(_)
1262 | TokenKind::TemplateTail(_)
1263 | TokenKind::True
1264 | TokenKind::False
1265 | TokenKind::Null
1266 | TokenKind::This
1267 | TokenKind::Super
1268 | TokenKind::RParen
1269 | TokenKind::RBracket
1270 | TokenKind::RBrace
1271 | TokenKind::PlusPlus
1272 | TokenKind::MinusMinus
1273 | TokenKind::RegExp { .. }
1274 )
1275}
1276
1277// ── Unicode helpers ──────────────────────────────────────────
1278
/// Check if the byte sequence at `pos` starts a valid Unicode identifier start character.
///
/// Decodes at most one UTF-8 character starting at `pos` (a character is at
/// most 4 bytes). Previously the whole tail `&source[pos..]` was validated at
/// once, so a single invalid byte *anywhere later* in the source made every
/// earlier identifier check return `false`; and an out-of-range `pos`
/// panicked on the slice. Both cases now simply yield `false`.
///
/// NOTE(review): `char::is_alphabetic` approximates Unicode `ID_Start`; the
/// sets differ for a handful of code points — confirm that is acceptable here.
fn is_unicode_id_start(source: &[u8], pos: usize) -> bool {
    // Decode only the first character beginning at `pos`, tolerating
    // invalid bytes after it.
    fn first_char(source: &[u8], pos: usize) -> Option<char> {
        let tail = source.get(pos..)?;
        let head = &tail[..tail.len().min(4)];
        let valid = match std::str::from_utf8(head) {
            Ok(s) => s,
            // Invalid byte after a valid prefix: decode just the prefix.
            Err(e) => std::str::from_utf8(&head[..e.valid_up_to()]).ok()?,
        };
        valid.chars().next()
    }
    match first_char(source, pos) {
        Some(ch) => ch.is_alphabetic() || ch == '_' || ch == '$',
        None => false,
    }
}
1288
/// Check if the byte sequence at `pos` starts a valid Unicode identifier continue character.
///
/// Accepts alphanumerics, `_`, `$`, and the joiners ZWNJ (U+200C) / ZWJ
/// (U+200D). Decodes at most one UTF-8 character starting at `pos` (max 4
/// bytes). Previously the whole tail `&source[pos..]` was validated at once,
/// so one invalid byte later in the source made every earlier check return
/// `false`; and an out-of-range `pos` panicked. Both now yield `false`.
///
/// NOTE(review): `char::is_alphanumeric` approximates Unicode `ID_Continue`
/// — confirm that is acceptable here.
fn is_unicode_id_continue(source: &[u8], pos: usize) -> bool {
    // Decode only the first character beginning at `pos`, tolerating
    // invalid bytes after it.
    fn first_char(source: &[u8], pos: usize) -> Option<char> {
        let tail = source.get(pos..)?;
        let head = &tail[..tail.len().min(4)];
        let valid = match std::str::from_utf8(head) {
            Ok(s) => s,
            // Invalid byte after a valid prefix: decode just the prefix.
            Err(e) => std::str::from_utf8(&head[..e.valid_up_to()]).ok()?,
        };
        valid.chars().next()
    }
    match first_char(source, pos) {
        Some(ch) => {
            ch.is_alphanumeric() || ch == '_' || ch == '$' || ch == '\u{200C}' || ch == '\u{200D}'
        }
        None => false,
    }
}
1298
1299// ── Numeric parsing helpers ──────────────────────────────────
1300
/// Decode one ASCII hex digit (`0-9`, `a-f`, `A-F`) to its numeric value.
fn hex_digit_val(b: u8) -> Option<u8> {
    // `char::to_digit(16)` accepts exactly the same digit set.
    (b as char).to_digit(16).map(|v| v as u8)
}
1309
/// Parse a decimal numeric literal into an `f64`.
///
/// Numeric separators (`_`) are stripped and a trailing BigInt suffix (`n`)
/// is ignored; anything `f64`'s parser rejects falls back to `0.0`.
fn parse_decimal(s: &str) -> f64 {
    let digits: std::string::String = s.chars().filter(|&c| c != '_').collect();
    digits.trim_end_matches('n').parse().unwrap_or(0.0)
}
1319
/// Accumulate a hexadecimal digit string (numeric separators `_` allowed)
/// into a `u64` with wrapping arithmetic; non-hex bytes count as 0.
fn u64_from_hex(s: &str) -> u64 {
    s.bytes()
        .filter(|&b| b != b'_')
        .map(|b| (b as char).to_digit(16).unwrap_or(0) as u64)
        .fold(0u64, |acc, d| acc.wrapping_mul(16).wrapping_add(d))
}
1331
/// Accumulate an octal digit string (numeric separators `_` allowed) into a
/// `u64` with wrapping arithmetic.
///
/// NOTE(review): assumes the lexer only passes bytes in `0`-`7`; other bytes
/// produce arbitrary digit values (and `b - b'0'` underflows below `'0'`).
fn u64_from_octal(s: &str) -> u64 {
    s.bytes()
        .filter(|&b| b != b'_')
        .fold(0u64, |acc, b| {
            acc.wrapping_mul(8).wrapping_add((b - b'0') as u64)
        })
}
1343
/// Accumulate a binary digit string (numeric separators `_` allowed) into a
/// `u64` with wrapping arithmetic.
///
/// NOTE(review): assumes the lexer only passes `0`/`1` bytes; other bytes
/// produce arbitrary digit values (and `b - b'0'` underflows below `'0'`).
fn u64_from_binary(s: &str) -> u64 {
    s.bytes()
        .filter(|&b| b != b'_')
        .fold(0u64, |acc, b| {
            acc.wrapping_mul(2).wrapping_add((b - b'0') as u64)
        })
}
1355
1356// ── Error type ───────────────────────────────────────────────
1357
/// An error produced during lexing.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct LexError {
    /// Human-readable description of what went wrong.
    pub message: std::string::String,
    /// Source position (1-based line/column) where the error was detected.
    pub pos: SourcePos,
}
1364
1365impl fmt::Display for LexError {
1366 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1367 write!(
1368 f,
1369 "LexError at {}:{}: {}",
1370 self.pos.line, self.pos.col, self.message
1371 )
1372 }
1373}
1374
1375// ── Tests ────────────────────────────────────────────────────
1376
#[cfg(test)]
mod tests {
    //! Unit tests for the lexer: keywords, literals, template scanning,
    //! regexp-vs-division disambiguation, punctuators, source positions,
    //! ASI newline tracking, and error cases.

    use super::*;

    /// Tokenize `src` and return only the token kinds (trailing `Eof` included).
    fn kinds(src: &str) -> Vec<TokenKind> {
        Lexer::tokenize(src)
            .unwrap()
            .into_iter()
            .map(|t| t.kind)
            .collect()
    }

    /// Tokenize `src` and return the kind of its first token.
    /// Panics unless `src` produces at least one token before `Eof`.
    fn kind(src: &str) -> TokenKind {
        let tokens = Lexer::tokenize(src).unwrap();
        assert!(tokens.len() >= 2, "expected at least one token + Eof");
        tokens[0].kind.clone()
    }

    // ── Keywords ──────────────────────────────────────────

    #[test]
    fn test_keywords() {
        assert_eq!(kind("var"), TokenKind::Var);
        assert_eq!(kind("let"), TokenKind::Let);
        assert_eq!(kind("const"), TokenKind::Const);
        assert_eq!(kind("function"), TokenKind::Function);
        assert_eq!(kind("class"), TokenKind::Class);
        assert_eq!(kind("if"), TokenKind::If);
        assert_eq!(kind("else"), TokenKind::Else);
        assert_eq!(kind("for"), TokenKind::For);
        assert_eq!(kind("while"), TokenKind::While);
        assert_eq!(kind("do"), TokenKind::Do);
        assert_eq!(kind("switch"), TokenKind::Switch);
        assert_eq!(kind("case"), TokenKind::Case);
        assert_eq!(kind("break"), TokenKind::Break);
        assert_eq!(kind("continue"), TokenKind::Continue);
        assert_eq!(kind("return"), TokenKind::Return);
        assert_eq!(kind("throw"), TokenKind::Throw);
        assert_eq!(kind("try"), TokenKind::Try);
        assert_eq!(kind("catch"), TokenKind::Catch);
        assert_eq!(kind("finally"), TokenKind::Finally);
        assert_eq!(kind("new"), TokenKind::New);
        assert_eq!(kind("delete"), TokenKind::Delete);
        assert_eq!(kind("typeof"), TokenKind::Typeof);
        assert_eq!(kind("instanceof"), TokenKind::Instanceof);
        assert_eq!(kind("void"), TokenKind::Void);
        assert_eq!(kind("in"), TokenKind::In);
        assert_eq!(kind("of"), TokenKind::Of);
        assert_eq!(kind("import"), TokenKind::Import);
        assert_eq!(kind("export"), TokenKind::Export);
        assert_eq!(kind("default"), TokenKind::Default);
        assert_eq!(kind("async"), TokenKind::Async);
        assert_eq!(kind("await"), TokenKind::Await);
        assert_eq!(kind("yield"), TokenKind::Yield);
        assert_eq!(kind("this"), TokenKind::This);
        assert_eq!(kind("super"), TokenKind::Super);
        assert_eq!(kind("extends"), TokenKind::Extends);
        assert_eq!(kind("static"), TokenKind::Static);
        assert_eq!(kind("debugger"), TokenKind::Debugger);
        assert_eq!(kind("with"), TokenKind::With);
    }

    #[test]
    fn test_literal_keywords() {
        assert_eq!(kind("true"), TokenKind::True);
        assert_eq!(kind("false"), TokenKind::False);
        assert_eq!(kind("null"), TokenKind::Null);
    }

    // ── Identifiers ──────────────────────────────────────

    #[test]
    fn test_identifiers() {
        assert_eq!(kind("foo"), TokenKind::Identifier("foo".into()));
        assert_eq!(kind("_bar"), TokenKind::Identifier("_bar".into()));
        assert_eq!(kind("$baz"), TokenKind::Identifier("$baz".into()));
        assert_eq!(kind("abc123"), TokenKind::Identifier("abc123".into()));
        assert_eq!(kind("camelCase"), TokenKind::Identifier("camelCase".into()));
    }

    #[test]
    fn test_unicode_identifiers() {
        assert_eq!(kind("café"), TokenKind::Identifier("café".into()));
    }

    // ── Numbers ──────────────────────────────────────────

    #[test]
    fn test_integers() {
        assert_eq!(kind("0"), TokenKind::Number(0.0));
        assert_eq!(kind("42"), TokenKind::Number(42.0));
        assert_eq!(kind("123456"), TokenKind::Number(123456.0));
    }

    #[test]
    fn test_floats() {
        assert_eq!(kind("3.14"), TokenKind::Number(3.14));
        assert_eq!(kind("0.5"), TokenKind::Number(0.5));
        assert_eq!(kind(".5"), TokenKind::Number(0.5));
        assert_eq!(kind("1."), TokenKind::Number(1.0));
    }

    #[test]
    fn test_exponents() {
        assert_eq!(kind("1e2"), TokenKind::Number(100.0));
        assert_eq!(kind("1E2"), TokenKind::Number(100.0));
        assert_eq!(kind("1e+2"), TokenKind::Number(100.0));
        assert_eq!(kind("1e-2"), TokenKind::Number(0.01));
        assert_eq!(kind("2.5e3"), TokenKind::Number(2500.0));
    }

    #[test]
    fn test_hex() {
        assert_eq!(kind("0xFF"), TokenKind::Number(255.0));
        assert_eq!(kind("0x0"), TokenKind::Number(0.0));
        assert_eq!(kind("0xDEAD"), TokenKind::Number(0xDEAD as f64));
    }

    #[test]
    fn test_octal() {
        assert_eq!(kind("0o77"), TokenKind::Number(63.0));
        assert_eq!(kind("0O10"), TokenKind::Number(8.0));
    }

    #[test]
    fn test_binary() {
        assert_eq!(kind("0b1010"), TokenKind::Number(10.0));
        assert_eq!(kind("0B11"), TokenKind::Number(3.0));
    }

    #[test]
    fn test_numeric_separators() {
        assert_eq!(kind("1_000"), TokenKind::Number(1000.0));
        assert_eq!(kind("0xFF_FF"), TokenKind::Number(65535.0));
        assert_eq!(kind("0b1010_0101"), TokenKind::Number(165.0));
    }

    // ── Strings ──────────────────────────────────────────

    #[test]
    fn test_double_quoted_string() {
        assert_eq!(kind(r#""hello""#), TokenKind::String("hello".into()));
    }

    #[test]
    fn test_single_quoted_string() {
        assert_eq!(kind("'world'"), TokenKind::String("world".into()));
    }

    #[test]
    fn test_string_escapes() {
        assert_eq!(kind(r#""\n\t\r""#), TokenKind::String("\n\t\r".into()));
        assert_eq!(kind(r#""\\""#), TokenKind::String("\\".into()));
        assert_eq!(kind(r#""\"""#), TokenKind::String("\"".into()));
    }

    #[test]
    fn test_string_hex_escape() {
        assert_eq!(kind(r#""\x41""#), TokenKind::String("A".into()));
    }

    #[test]
    fn test_string_unicode_escape() {
        assert_eq!(kind(r#""\u0041""#), TokenKind::String("A".into()));
        assert_eq!(
            kind(r#""\u{1F600}""#),
            TokenKind::String("\u{1F600}".into())
        );
    }

    #[test]
    fn test_string_line_continuation() {
        // \<newline> is a line continuation producing no character
        assert_eq!(
            kind("\"line1\\\nline2\""),
            TokenKind::String("line1line2".into())
        );
    }

    #[test]
    fn test_empty_string() {
        assert_eq!(kind(r#""""#), TokenKind::String("".into()));
        assert_eq!(kind("''"), TokenKind::String("".into()));
    }

    // ── Template Literals ────────────────────────────────

    #[test]
    fn test_template_no_substitution() {
        assert_eq!(kind("`hello`"), TokenKind::TemplateFull("hello".into()));
    }

    #[test]
    fn test_template_with_substitution() {
        let tokens = Lexer::tokenize("`hello ${name}!`").unwrap();
        let k: Vec<_> = tokens.iter().map(|t| &t.kind).collect();
        assert_eq!(k[0], &TokenKind::TemplateHead("hello ".into()));
        assert_eq!(k[1], &TokenKind::Identifier("name".into()));
        assert_eq!(k[2], &TokenKind::TemplateTail("!".into()));
    }

    #[test]
    fn test_template_multiple_substitutions() {
        let tokens = Lexer::tokenize("`a${1}b${2}c`").unwrap();
        let k: Vec<_> = tokens.iter().map(|t| &t.kind).collect();
        assert_eq!(k[0], &TokenKind::TemplateHead("a".into()));
        assert_eq!(k[1], &TokenKind::Number(1.0));
        assert_eq!(k[2], &TokenKind::TemplateMiddle("b".into()));
        assert_eq!(k[3], &TokenKind::Number(2.0));
        assert_eq!(k[4], &TokenKind::TemplateTail("c".into()));
    }

    #[test]
    fn test_template_with_nested_braces() {
        // `${({a:1})}` — the object literal inside ${ } has its own braces
        let tokens = Lexer::tokenize("`${({a:1})}`").unwrap();
        let k: Vec<_> = tokens.iter().map(|t| &t.kind).collect();
        assert_eq!(k[0], &TokenKind::TemplateHead("".into()));
        assert_eq!(k[1], &TokenKind::LParen);
        assert_eq!(k[2], &TokenKind::LBrace);
        assert_eq!(k[3], &TokenKind::Identifier("a".into()));
        assert_eq!(k[4], &TokenKind::Colon);
        assert_eq!(k[5], &TokenKind::Number(1.0));
        assert_eq!(k[6], &TokenKind::RBrace);
        assert_eq!(k[7], &TokenKind::RParen);
        assert_eq!(k[8], &TokenKind::TemplateTail("".into()));
    }

    // ── Regular Expressions ──────────────────────────────

    #[test]
    fn test_regexp_basic() {
        let tokens = Lexer::tokenize("x = /foo/gi").unwrap();
        let k: Vec<_> = tokens.iter().map(|t| &t.kind).collect();
        assert_eq!(
            k[2],
            &TokenKind::RegExp {
                pattern: "foo".into(),
                flags: "gi".into()
            }
        );
    }

    #[test]
    fn test_regexp_with_class() {
        // /[a/b]/ — the `/` inside the character class does not end the pattern
        let tokens = Lexer::tokenize("x = /[a/b]/").unwrap();
        let k: Vec<_> = tokens.iter().map(|t| &t.kind).collect();
        assert_eq!(
            k[2],
            &TokenKind::RegExp {
                pattern: "[a/b]".into(),
                flags: "".into()
            }
        );
    }

    #[test]
    fn test_regexp_vs_division() {
        // After an identifier, `/` is division
        let tokens = Lexer::tokenize("a / b").unwrap();
        let k: Vec<_> = tokens.iter().map(|t| &t.kind).collect();
        assert_eq!(k[1], &TokenKind::Slash);
    }

    // ── Punctuators ──────────────────────────────────────

    #[test]
    fn test_simple_punctuators() {
        assert_eq!(kind("("), TokenKind::LParen);
        assert_eq!(kind(")"), TokenKind::RParen);
        assert_eq!(kind("["), TokenKind::LBracket);
        assert_eq!(kind("]"), TokenKind::RBracket);
        assert_eq!(kind("{"), TokenKind::LBrace);
        assert_eq!(kind("}"), TokenKind::RBrace);
        assert_eq!(kind(";"), TokenKind::Semicolon);
        assert_eq!(kind(","), TokenKind::Comma);
        assert_eq!(kind(":"), TokenKind::Colon);
        assert_eq!(kind("~"), TokenKind::Tilde);
    }

    #[test]
    fn test_dot_and_ellipsis() {
        assert_eq!(kind("."), TokenKind::Dot);
        assert_eq!(kind("..."), TokenKind::Ellipsis);
    }

    #[test]
    fn test_arrow() {
        assert_eq!(kind("=>"), TokenKind::Arrow);
    }

    #[test]
    fn test_optional_chaining() {
        assert_eq!(kind("?."), TokenKind::QuestionDot);
    }

    #[test]
    fn test_comparison_operators() {
        assert_eq!(kind("=="), TokenKind::Eq);
        assert_eq!(kind("!="), TokenKind::Ne);
        assert_eq!(kind("==="), TokenKind::StrictEq);
        assert_eq!(kind("!=="), TokenKind::StrictNe);
        assert_eq!(kind("<"), TokenKind::Lt);
        assert_eq!(kind(">"), TokenKind::Gt);
        assert_eq!(kind("<="), TokenKind::Le);
        assert_eq!(kind(">="), TokenKind::Ge);
    }

    #[test]
    fn test_arithmetic_operators() {
        assert_eq!(kind("+"), TokenKind::Plus);
        assert_eq!(kind("-"), TokenKind::Minus);
        assert_eq!(kind("*"), TokenKind::Star);
        assert_eq!(kind("%"), TokenKind::Percent);
        assert_eq!(kind("**"), TokenKind::Exp);
        assert_eq!(kind("++"), TokenKind::PlusPlus);
        assert_eq!(kind("--"), TokenKind::MinusMinus);
    }

    #[test]
    fn test_bitwise_operators() {
        assert_eq!(kind("&"), TokenKind::Amp);
        assert_eq!(kind("|"), TokenKind::Pipe);
        assert_eq!(kind("^"), TokenKind::Caret);
        assert_eq!(kind("<<"), TokenKind::Shl);
        assert_eq!(kind(">>"), TokenKind::Shr);
        assert_eq!(kind(">>>"), TokenKind::Ushr);
    }

    #[test]
    fn test_logical_operators() {
        assert_eq!(kind("&&"), TokenKind::And);
        assert_eq!(kind("||"), TokenKind::Or);
        assert_eq!(kind("!"), TokenKind::Not);
        assert_eq!(kind("??"), TokenKind::Nullish);
    }

    #[test]
    fn test_assignment_operators() {
        assert_eq!(kind("="), TokenKind::Assign);
        assert_eq!(kind("+="), TokenKind::PlusAssign);
        assert_eq!(kind("-="), TokenKind::MinusAssign);
        assert_eq!(kind("*="), TokenKind::StarAssign);
        assert_eq!(kind("%="), TokenKind::PercentAssign);
        assert_eq!(kind("**="), TokenKind::ExpAssign);
        assert_eq!(kind("&="), TokenKind::AmpAssign);
        assert_eq!(kind("|="), TokenKind::PipeAssign);
        assert_eq!(kind("^="), TokenKind::CaretAssign);
        assert_eq!(kind("<<="), TokenKind::ShlAssign);
        assert_eq!(kind(">>="), TokenKind::ShrAssign);
        assert_eq!(kind(">>>="), TokenKind::UshrAssign);
        assert_eq!(kind("&&="), TokenKind::AndAssign);
        assert_eq!(kind("||="), TokenKind::OrAssign);
        assert_eq!(kind("??="), TokenKind::NullishAssign);
    }

    // ── Comments ─────────────────────────────────────────

    #[test]
    fn test_single_line_comment() {
        let tokens = kinds("a // comment\nb");
        assert_eq!(tokens.len(), 3); // a, b, Eof
        assert_eq!(tokens[0], TokenKind::Identifier("a".into()));
        assert_eq!(tokens[1], TokenKind::Identifier("b".into()));
    }

    #[test]
    fn test_multi_line_comment() {
        let tokens = kinds("a /* comment */ b");
        assert_eq!(tokens.len(), 3);
        assert_eq!(tokens[0], TokenKind::Identifier("a".into()));
        assert_eq!(tokens[1], TokenKind::Identifier("b".into()));
    }

    // ── Source positions ─────────────────────────────────

    #[test]
    fn test_source_positions() {
        let tokens = Lexer::tokenize("let x = 42").unwrap();
        // `let` at line 1, col 1
        assert_eq!(tokens[0].span.start, SourcePos { line: 1, col: 1 });
        // `x` at line 1, col 5
        assert_eq!(tokens[1].span.start, SourcePos { line: 1, col: 5 });
        // `=` at line 1, col 7
        assert_eq!(tokens[2].span.start, SourcePos { line: 1, col: 7 });
        // `42` at line 1, col 9
        assert_eq!(tokens[3].span.start, SourcePos { line: 1, col: 9 });
    }

    #[test]
    fn test_multiline_positions() {
        let tokens = Lexer::tokenize("a\nb\nc").unwrap();
        assert_eq!(tokens[0].span.start, SourcePos { line: 1, col: 1 });
        assert_eq!(tokens[1].span.start, SourcePos { line: 2, col: 1 });
        assert_eq!(tokens[2].span.start, SourcePos { line: 3, col: 1 });
    }

    // ── Newline tracking (ASI) ───────────────────────────

    #[test]
    fn test_preceded_by_newline() {
        let tokens = Lexer::tokenize("a\nb").unwrap();
        assert!(!tokens[0].preceded_by_newline); // `a`
        assert!(tokens[1].preceded_by_newline); // `b`
    }

    // ── Error cases ──────────────────────────────────────

    #[test]
    fn test_unterminated_string() {
        assert!(Lexer::tokenize("\"hello").is_err());
    }

    #[test]
    fn test_unterminated_block_comment() {
        assert!(Lexer::tokenize("/* oops").is_err());
    }

    #[test]
    fn test_unterminated_template() {
        assert!(Lexer::tokenize("`hello").is_err());
    }

    #[test]
    fn test_bad_hex_literal() {
        assert!(Lexer::tokenize("0x").is_err());
    }

    // ── Full statement tokenization ──────────────────────

    #[test]
    fn test_full_statement() {
        let tokens = kinds("const x = 42 + y;");
        assert_eq!(
            tokens,
            vec![
                TokenKind::Const,
                TokenKind::Identifier("x".into()),
                TokenKind::Assign,
                TokenKind::Number(42.0),
                TokenKind::Plus,
                TokenKind::Identifier("y".into()),
                TokenKind::Semicolon,
                TokenKind::Eof,
            ]
        );
    }

    #[test]
    fn test_arrow_function() {
        let tokens = kinds("(x) => x + 1");
        assert_eq!(
            tokens,
            vec![
                TokenKind::LParen,
                TokenKind::Identifier("x".into()),
                TokenKind::RParen,
                TokenKind::Arrow,
                TokenKind::Identifier("x".into()),
                TokenKind::Plus,
                TokenKind::Number(1.0),
                TokenKind::Eof,
            ]
        );
    }

    #[test]
    fn test_complex_expression() {
        let tokens = kinds("a?.b ?? c !== d");
        assert_eq!(
            tokens,
            vec![
                TokenKind::Identifier("a".into()),
                TokenKind::QuestionDot,
                TokenKind::Identifier("b".into()),
                TokenKind::Nullish,
                TokenKind::Identifier("c".into()),
                TokenKind::StrictNe,
                TokenKind::Identifier("d".into()),
                TokenKind::Eof,
            ]
        );
    }

    #[test]
    fn test_division_after_paren() {
        // `(a) / b` — the `/` after `)` should be division, not regexp
        let tokens = kinds("(a) / b");
        assert_eq!(
            tokens,
            vec![
                TokenKind::LParen,
                TokenKind::Identifier("a".into()),
                TokenKind::RParen,
                TokenKind::Slash,
                TokenKind::Identifier("b".into()),
                TokenKind::Eof,
            ]
        );
    }

    #[test]
    fn test_slash_assign() {
        let tokens = kinds("a /= b");
        assert_eq!(
            tokens,
            vec![
                TokenKind::Identifier("a".into()),
                TokenKind::SlashAssign,
                TokenKind::Identifier("b".into()),
                TokenKind::Eof,
            ]
        );
    }

    #[test]
    fn test_regexp_after_assign() {
        let tokens = kinds("x = /test/g");
        assert_eq!(
            tokens,
            vec![
                TokenKind::Identifier("x".into()),
                TokenKind::Assign,
                TokenKind::RegExp {
                    pattern: "test".into(),
                    flags: "g".into()
                },
                TokenKind::Eof,
            ]
        );
    }
}