we (web engine): Experimental web browser project to understand the limits of Claude
//! JavaScript lexer/tokenizer conforming to ECMAScript 2024.
//!
//! Converts JavaScript source text into a stream of [`Token`]s, each annotated
//! with its [`Span`] (start and end line/column positions).
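//!
//! # Example
//!
//! A minimal usage sketch (added for illustration, not part of the original
//! file; the `js::lexer` module path is assumed):
//!
//! ```ignore
//! use js::lexer::{Lexer, TokenKind};
//!
//! let tokens = Lexer::tokenize("let x = 42;").unwrap();
//! assert_eq!(tokens[0].kind, TokenKind::Let);
//! assert_eq!(tokens[3].kind, TokenKind::Number(42.0));
//! assert_eq!(tokens.last().unwrap().kind, TokenKind::Eof);
//! ```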

use std::fmt;

/// A position in the source text.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct SourcePos {
    /// 1-based line number.
    pub line: u32,
    /// 1-based column (in bytes from the start of the line).
    pub col: u32,
}

/// A span covering a range of source text.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Span {
    pub start: SourcePos,
    pub end: SourcePos,
}

/// A token produced by the lexer.
#[derive(Debug, Clone, PartialEq)]
pub struct Token {
    pub kind: TokenKind,
    pub span: Span,
    /// Whether at least one newline preceded this token (for ASI).
    pub preceded_by_newline: bool,
}

/// Every distinct token kind the lexer can produce.
#[derive(Debug, Clone, PartialEq)]
pub enum TokenKind {
    // ── Literals ──────────────────────────────────────────────
    /// Numeric literal (the parsed `f64` value).
    Number(f64),
    /// String literal (the decoded content, without quotes).
    String(std::string::String),
    /// Regular expression literal: pattern and flags.
    RegExp {
        pattern: std::string::String,
        flags: std::string::String,
    },
    /// Template literal with no substitutions (full string content).
    TemplateFull(std::string::String),
    /// Opening part of a template literal (before the first `${`).
    TemplateHead(std::string::String),
    /// Middle part of a template literal (between `}` and next `${`).
    TemplateMiddle(std::string::String),
    /// Closing part of a template literal (after the last `}`).
    TemplateTail(std::string::String),

    // ── Identifiers & Keywords ───────────────────────────────
    Identifier(std::string::String),

    // Keywords
    Await,
    Break,
    Case,
    Catch,
    Class,
    Const,
    Continue,
    Debugger,
    Default,
    Delete,
    Do,
    Else,
    Export,
    Extends,
    Finally,
    For,
    Function,
    If,
    Import,
    In,
    Instanceof,
    Let,
    New,
    Of,
    Return,
    Static,
    Super,
    Switch,
    This,
    Throw,
    Try,
    Typeof,
    Var,
    Void,
    While,
    With,
    Yield,
    Async,

    // Literal keywords
    True,
    False,
    Null,

    // ── Punctuators ──────────────────────────────────────────
    // Grouping
    LParen,   // (
    RParen,   // )
    LBracket, // [
    RBracket, // ]
    LBrace,   // {
    RBrace,   // }

    // Delimiters
    Semicolon, // ;
    Comma,     // ,
    Colon,     // :
    Dot,       // .
    Ellipsis,  // ...

    // Arrow
    Arrow, // =>

    // Optional chaining
    QuestionDot, // ?.

    // Ternary
    Question, // ?

    // Assignment
    Assign,        // =
    PlusAssign,    // +=
    MinusAssign,   // -=
    StarAssign,    // *=
    SlashAssign,   // /=
    PercentAssign, // %=
    ExpAssign,     // **=
    AmpAssign,     // &=
    PipeAssign,    // |=
    CaretAssign,   // ^=
    ShlAssign,     // <<=
    ShrAssign,     // >>=
    UshrAssign,    // >>>=
    AndAssign,     // &&=
    OrAssign,      // ||=
    NullishAssign, // ??=

    // Comparison
    Eq,       // ==
    Ne,       // !=
    StrictEq, // ===
    StrictNe, // !==
    Lt,       // <
    Gt,       // >
    Le,       // <=
    Ge,       // >=

    // Arithmetic
    Plus,    // +
    Minus,   // -
    Star,    // *
    Slash,   // /
    Percent, // %
    Exp,     // **

    // Increment / Decrement
    PlusPlus,   // ++
    MinusMinus, // --

    // Bitwise
    Amp,   // &
    Pipe,  // |
    Caret, // ^
    Tilde, // ~
    Shl,   // <<
    Shr,   // >>
    Ushr,  // >>>

    // Logical
    And,     // &&
    Or,      // ||
    Not,     // !
    Nullish, // ??

    // ── Special ──────────────────────────────────────────────
    /// End of input.
    Eof,
}

impl fmt::Display for TokenKind {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            TokenKind::Number(n) => write!(f, "{}", n),
            TokenKind::String(s) => write!(f, "\"{}\"", s),
            TokenKind::RegExp { pattern, flags } => write!(f, "/{}/{}", pattern, flags),
            TokenKind::TemplateFull(s) => write!(f, "`{}`", s),
            TokenKind::TemplateHead(s) => write!(f, "`{}${{", s),
            TokenKind::TemplateMiddle(s) => write!(f, "}}{}${{", s),
            TokenKind::TemplateTail(s) => write!(f, "}}{}`", s),
            TokenKind::Identifier(s) => write!(f, "{}", s),
            TokenKind::Await => write!(f, "await"),
            TokenKind::Break => write!(f, "break"),
            TokenKind::Case => write!(f, "case"),
            TokenKind::Catch => write!(f, "catch"),
            TokenKind::Class => write!(f, "class"),
            TokenKind::Const => write!(f, "const"),
            TokenKind::Continue => write!(f, "continue"),
            TokenKind::Debugger => write!(f, "debugger"),
            TokenKind::Default => write!(f, "default"),
            TokenKind::Delete => write!(f, "delete"),
            TokenKind::Do => write!(f, "do"),
            TokenKind::Else => write!(f, "else"),
            TokenKind::Export => write!(f, "export"),
            TokenKind::Extends => write!(f, "extends"),
            TokenKind::Finally => write!(f, "finally"),
            TokenKind::For => write!(f, "for"),
            TokenKind::Function => write!(f, "function"),
            TokenKind::If => write!(f, "if"),
            TokenKind::Import => write!(f, "import"),
            TokenKind::In => write!(f, "in"),
            TokenKind::Instanceof => write!(f, "instanceof"),
            TokenKind::Let => write!(f, "let"),
            TokenKind::New => write!(f, "new"),
            TokenKind::Of => write!(f, "of"),
            TokenKind::Return => write!(f, "return"),
            TokenKind::Static => write!(f, "static"),
            TokenKind::Super => write!(f, "super"),
            TokenKind::Switch => write!(f, "switch"),
            TokenKind::This => write!(f, "this"),
            TokenKind::Throw => write!(f, "throw"),
            TokenKind::Try => write!(f, "try"),
            TokenKind::Typeof => write!(f, "typeof"),
            TokenKind::Var => write!(f, "var"),
            TokenKind::Void => write!(f, "void"),
            TokenKind::While => write!(f, "while"),
            TokenKind::With => write!(f, "with"),
            TokenKind::Yield => write!(f, "yield"),
            TokenKind::Async => write!(f, "async"),
            TokenKind::True => write!(f, "true"),
            TokenKind::False => write!(f, "false"),
            TokenKind::Null => write!(f, "null"),
            TokenKind::LParen => write!(f, "("),
            TokenKind::RParen => write!(f, ")"),
            TokenKind::LBracket => write!(f, "["),
            TokenKind::RBracket => write!(f, "]"),
            TokenKind::LBrace => write!(f, "{{"),
            TokenKind::RBrace => write!(f, "}}"),
            TokenKind::Semicolon => write!(f, ";"),
            TokenKind::Comma => write!(f, ","),
            TokenKind::Colon => write!(f, ":"),
            TokenKind::Dot => write!(f, "."),
            TokenKind::Ellipsis => write!(f, "..."),
            TokenKind::Arrow => write!(f, "=>"),
            TokenKind::QuestionDot => write!(f, "?."),
            TokenKind::Question => write!(f, "?"),
            TokenKind::Assign => write!(f, "="),
            TokenKind::PlusAssign => write!(f, "+="),
            TokenKind::MinusAssign => write!(f, "-="),
            TokenKind::StarAssign => write!(f, "*="),
            TokenKind::SlashAssign => write!(f, "/="),
            TokenKind::PercentAssign => write!(f, "%="),
            TokenKind::ExpAssign => write!(f, "**="),
            TokenKind::AmpAssign => write!(f, "&="),
            TokenKind::PipeAssign => write!(f, "|="),
            TokenKind::CaretAssign => write!(f, "^="),
            TokenKind::ShlAssign => write!(f, "<<="),
            TokenKind::ShrAssign => write!(f, ">>="),
            TokenKind::UshrAssign => write!(f, ">>>="),
            TokenKind::AndAssign => write!(f, "&&="),
            TokenKind::OrAssign => write!(f, "||="),
            TokenKind::NullishAssign => write!(f, "??="),
            TokenKind::Eq => write!(f, "=="),
            TokenKind::Ne => write!(f, "!="),
            TokenKind::StrictEq => write!(f, "==="),
            TokenKind::StrictNe => write!(f, "!=="),
            TokenKind::Lt => write!(f, "<"),
            TokenKind::Gt => write!(f, ">"),
            TokenKind::Le => write!(f, "<="),
            TokenKind::Ge => write!(f, ">="),
            TokenKind::Plus => write!(f, "+"),
            TokenKind::Minus => write!(f, "-"),
            TokenKind::Star => write!(f, "*"),
            TokenKind::Slash => write!(f, "/"),
            TokenKind::Percent => write!(f, "%"),
            TokenKind::Exp => write!(f, "**"),
            TokenKind::PlusPlus => write!(f, "++"),
            TokenKind::MinusMinus => write!(f, "--"),
            TokenKind::Amp => write!(f, "&"),
            TokenKind::Pipe => write!(f, "|"),
            TokenKind::Caret => write!(f, "^"),
            TokenKind::Tilde => write!(f, "~"),
            TokenKind::Shl => write!(f, "<<"),
            TokenKind::Shr => write!(f, ">>"),
            TokenKind::Ushr => write!(f, ">>>"),
            TokenKind::And => write!(f, "&&"),
            TokenKind::Or => write!(f, "||"),
            TokenKind::Not => write!(f, "!"),
            TokenKind::Nullish => write!(f, "??"),
            TokenKind::Eof => write!(f, "<EOF>"),
        }
    }
}

/// The lexer converts JavaScript source text into tokens.
pub struct Lexer<'a> {
    source: &'a [u8],
    /// Current byte offset into `source`.
    pos: usize,
    /// Current 1-based line number.
    line: u32,
    /// Current 1-based column (byte offset from line start).
    col: u32,
    /// Whether we have crossed at least one newline since the last token.
    saw_newline: bool,
    /// Nesting depth for template literal `${...}` expressions.
    /// When > 0, a `}` at the matching depth resumes template scanning.
    template_depth: u32,
    /// Stack tracking brace depth at each template nesting level.
    /// When we enter `${`, we push the current brace depth.
    template_brace_stack: Vec<u32>,
    /// Current brace depth (incremented on `{`, decremented on `}`).
    brace_depth: u32,
    /// Tracks whether the previous token could end an expression.
    /// Used to disambiguate `/` as division vs RegExp.
    prev_token_is_expr_end: bool,
}

impl<'a> Lexer<'a> {
    /// Create a new lexer for the given source text.
    pub fn new(source: &'a str) -> Self {
        Self {
            source: source.as_bytes(),
            pos: 0,
            line: 1,
            col: 1,
            saw_newline: false,
            template_depth: 0,
            template_brace_stack: Vec::new(),
            brace_depth: 0,
            prev_token_is_expr_end: false,
        }
    }

    /// Tokenize the entire source and return all tokens (including final `Eof`).
    pub fn tokenize(source: &str) -> Result<Vec<Token>, LexError> {
        let mut lexer = Lexer::new(source);
        let mut tokens = Vec::new();
        loop {
            let tok = lexer.next_token()?;
            let is_eof = tok.kind == TokenKind::Eof;
            tokens.push(tok);
            if is_eof {
                break;
            }
        }
        Ok(tokens)
    }

    // ── Helpers ──────────────────────────────────────────────

    fn current_pos(&self) -> SourcePos {
        SourcePos {
            line: self.line,
            col: self.col,
        }
    }

    fn peek(&self) -> Option<u8> {
        self.source.get(self.pos).copied()
    }

    fn peek_at(&self, offset: usize) -> Option<u8> {
        self.source.get(self.pos + offset).copied()
    }

    fn advance(&mut self) -> Option<u8> {
        let b = self.source.get(self.pos).copied()?;
        self.pos += 1;
        if b == b'\n' {
            self.line += 1;
            self.col = 1;
            self.saw_newline = true;
        } else {
            self.col += 1;
        }
        Some(b)
    }

    fn advance_if(&mut self, expected: u8) -> bool {
        if self.peek() == Some(expected) {
            self.advance();
            true
        } else {
            false
        }
    }

    fn slice(&self, start: usize, end: usize) -> &'a str {
        // Safety: we only slice at positions we've already walked over,
        // and we trust the input to be valid UTF-8 at identifier/keyword
        // boundaries. In practice this is safe because the lexer only
        // slices ASCII-compatible byte sequences.
        std::str::from_utf8(&self.source[start..end]).unwrap_or("")
    }

    // ── Whitespace & Comments ────────────────────────────────

    fn skip_whitespace_and_comments(&mut self) -> Result<(), LexError> {
        loop {
            match self.peek() {
                Some(b' ' | b'\t' | b'\r' | b'\n') => {
                    self.advance();
                }
                // Unicode BOM / non-breaking spaces
                Some(0xC2) if self.peek_at(1) == Some(0xA0) => {
                    // U+00A0 non-breaking space (2-byte UTF-8)
                    self.advance();
                    self.advance();
                }
                Some(0xEF) if self.peek_at(1) == Some(0xBB) && self.peek_at(2) == Some(0xBF) => {
                    // BOM U+FEFF
                    self.advance();
                    self.advance();
                    self.advance();
                }
                Some(b'/') => {
                    match self.peek_at(1) {
                        Some(b'/') => {
                            // single-line comment
                            self.advance(); // /
                            self.advance(); // /
                            while let Some(b) = self.peek() {
                                if b == b'\n' {
                                    break;
                                }
                                self.advance();
                            }
                        }
                        Some(b'*') => {
                            // multi-line comment
                            let start = self.current_pos();
                            self.advance(); // /
                            self.advance(); // *
                            let mut closed = false;
                            while let Some(b) = self.advance() {
                                if b == b'*' && self.peek() == Some(b'/') {
                                    self.advance(); // /
                                    closed = true;
                                    break;
                                }
                            }
                            if !closed {
                                return Err(LexError {
                                    message: "unterminated block comment".into(),
                                    pos: start,
                                });
                            }
                        }
                        _ => break,
                    }
                }
                _ => break,
            }
        }
        Ok(())
    }

    // ── Main dispatch ────────────────────────────────────────

    /// Produce the next token.
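    ///
    /// Illustrative sketch (added commentary, not part of the original file):
    /// callers typically loop until they see [`TokenKind::Eof`], e.g.
    ///
    /// ```ignore
    /// let mut lexer = Lexer::new("1 + 2");
    /// loop {
    ///     let tok = lexer.next_token()?;
    ///     if tok.kind == TokenKind::Eof {
    ///         break;
    ///     }
    ///     println!("{} at line {}", tok.kind, tok.span.start.line);
    /// }
    /// ```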
    pub fn next_token(&mut self) -> Result<Token, LexError> {
        self.saw_newline = false;
        self.skip_whitespace_and_comments()?;

        let start = self.current_pos();

        let Some(b) = self.peek() else {
            return Ok(Token {
                kind: TokenKind::Eof,
                span: Span {
                    start,
                    end: self.current_pos(),
                },
                preceded_by_newline: self.saw_newline,
            });
        };

        // If we're inside a template `${...}` and hit the matching `}`,
        // resume template scanning.
        if b == b'}'
            && !self.template_brace_stack.is_empty()
            && self.brace_depth == *self.template_brace_stack.last().unwrap()
        {
            self.template_brace_stack.pop();
            self.template_depth -= 1;
            self.advance(); // consume }
            return self.scan_template_continuation(start);
        }

        let kind = match b {
            b'`' => {
                self.advance();
                return self.scan_template_start(start);
            }

            b'0'..=b'9' => self.scan_number()?,
            b'.' if matches!(self.peek_at(1), Some(b'0'..=b'9')) => self.scan_number()?,

            b'"' | b'\'' => self.scan_string()?,

            b'a'..=b'z' | b'A'..=b'Z' | b'_' | b'$' => self.scan_identifier_or_keyword(),
            // UTF-8 multi-byte identifier start
            0xC0..=0xF7 if is_unicode_id_start(self.source, self.pos) => {
                self.scan_identifier_or_keyword()
            }

            b'/' if !self.prev_token_is_expr_end => self.scan_regexp()?,

            _ => self.scan_punctuator()?,
        };

        let end = self.current_pos();
        let preceded_by_newline = self.saw_newline;

        // Track whether this token ends an expression (for `/` disambiguation).
        self.prev_token_is_expr_end = token_is_expr_end(&kind);

        Ok(Token {
            kind,
            span: Span { start, end },
            preceded_by_newline,
        })
    }

    // ── Numbers ──────────────────────────────────────────────

    fn scan_number(&mut self) -> Result<TokenKind, LexError> {
        let start = self.pos;

        if self.peek() == Some(b'0') {
            match self.peek_at(1) {
                Some(b'x' | b'X') => return self.scan_hex_number(),
                Some(b'o' | b'O') => return self.scan_octal_number(),
                Some(b'b' | b'B') => return self.scan_binary_number(),
                _ => {}
            }
        }

        // Decimal integer or float
        self.eat_decimal_digits();

        if self.peek() == Some(b'.') {
            // `1.` is a valid numeric literal (= 1.0) and `1.e2` = 100, so we
            // consume the dot unless it starts a `..` sequence (spread, or the
            // second dot in `1..toString()`).
            if self.peek_at(1) != Some(b'.') {
                self.advance(); // .
                self.eat_decimal_digits();
            }
        }

        // Exponent
        if matches!(self.peek(), Some(b'e' | b'E')) {
            self.advance();
            if matches!(self.peek(), Some(b'+' | b'-')) {
                self.advance();
            }
            if !matches!(self.peek(), Some(b'0'..=b'9')) {
                return Err(LexError {
                    message: "expected digit after exponent".into(),
                    pos: self.current_pos(),
                });
            }
            self.eat_decimal_digits();
        }

        // BigInt suffix `n` — we tokenize it but store as f64 (for now)
        self.advance_if(b'n');

        let text = self.slice(start, self.pos);
        let value = parse_decimal(text);
        Ok(TokenKind::Number(value))
    }

    fn scan_hex_number(&mut self) -> Result<TokenKind, LexError> {
        self.advance(); // 0
        self.advance(); // x/X
        let digit_start = self.pos;
        self.eat_hex_digits();
        if self.pos == digit_start {
            return Err(LexError {
                message: "expected hex digit after 0x".into(),
                pos: self.current_pos(),
            });
        }
        self.advance_if(b'n');
        let text = self.slice(digit_start, self.pos);
        let text = text.trim_end_matches('n');
        let value = u64_from_hex(text) as f64;
        Ok(TokenKind::Number(value))
    }

    fn scan_octal_number(&mut self) -> Result<TokenKind, LexError> {
        self.advance(); // 0
        self.advance(); // o/O
        let digit_start = self.pos;
        while matches!(self.peek(), Some(b'0'..=b'7' | b'_')) {
            self.advance();
        }
        if self.pos == digit_start {
            return Err(LexError {
                message: "expected octal digit after 0o".into(),
                pos: self.current_pos(),
            });
        }
        self.advance_if(b'n');
        let text = self.slice(digit_start, self.pos).trim_end_matches('n');
        let value = u64_from_octal(text) as f64;
        Ok(TokenKind::Number(value))
    }

    fn scan_binary_number(&mut self) -> Result<TokenKind, LexError> {
        self.advance(); // 0
        self.advance(); // b/B
        let digit_start = self.pos;
        while matches!(self.peek(), Some(b'0' | b'1' | b'_')) {
            self.advance();
        }
        if self.pos == digit_start {
            return Err(LexError {
                message: "expected binary digit after 0b".into(),
                pos: self.current_pos(),
            });
        }
        self.advance_if(b'n');
        let text = self.slice(digit_start, self.pos).trim_end_matches('n');
        let value = u64_from_binary(text) as f64;
        Ok(TokenKind::Number(value))
    }

    fn eat_decimal_digits(&mut self) {
        while matches!(self.peek(), Some(b'0'..=b'9' | b'_')) {
            self.advance();
        }
    }

    fn eat_hex_digits(&mut self) {
        while matches!(
            self.peek(),
            Some(b'0'..=b'9' | b'a'..=b'f' | b'A'..=b'F' | b'_')
        ) {
            self.advance();
        }
    }

    // ── Strings ──────────────────────────────────────────────

    fn scan_string(&mut self) -> Result<TokenKind, LexError> {
        let quote = self.advance().unwrap(); // opening quote
        let start_pos = self.current_pos();
        let mut value = std::string::String::new();

        loop {
            match self.peek() {
                None | Some(b'\n') => {
                    return Err(LexError {
                        message: "unterminated string literal".into(),
                        pos: start_pos,
                    });
                }
                Some(b) if b == quote => {
                    self.advance();
                    break;
                }
                Some(b'\\') => {
                    self.advance(); // backslash
                    if let Some(ch) = self.scan_escape_sequence()? {
                        value.push(ch);
                    }
                }
                Some(_) => {
                    let ch = self.advance_char();
                    value.push(ch);
                }
            }
        }

        Ok(TokenKind::String(value))
    }

    /// Scan an escape sequence after the backslash has been consumed.
    /// Returns `None` for line continuations (`\<newline>`), which produce no character.
    fn scan_escape_sequence(&mut self) -> Result<Option<char>, LexError> {
        let pos = self.current_pos();
        match self.advance() {
            Some(b'n') => Ok(Some('\n')),
            Some(b'r') => Ok(Some('\r')),
            Some(b't') => Ok(Some('\t')),
            Some(b'b') => Ok(Some('\u{0008}')),
            Some(b'f') => Ok(Some('\u{000C}')),
            Some(b'v') => Ok(Some('\u{000B}')),
            Some(b'0') if !matches!(self.peek(), Some(b'0'..=b'9')) => Ok(Some('\0')),
            Some(b'\\') => Ok(Some('\\')),
            Some(b'\'') => Ok(Some('\'')),
            Some(b'"') => Ok(Some('"')),
            Some(b'`') => Ok(Some('`')),
            // Line continuation: \<newline> produces no character
            Some(b'\n') => Ok(None),
            Some(b'\r') => {
                self.advance_if(b'\n');
                Ok(None)
            }
            Some(b'x') => {
                let hi = self.advance().and_then(hex_digit_val).ok_or(LexError {
                    message: "invalid hex escape".into(),
                    pos,
                })?;
                let lo = self.advance().and_then(hex_digit_val).ok_or(LexError {
                    message: "invalid hex escape".into(),
                    pos,
                })?;
                let code = (hi << 4) | lo;
                Ok(Some(code as char))
            }
            Some(b'u') => self.scan_unicode_escape(pos).map(Some),
            Some(b) => {
                // identity escape
                Ok(Some(b as char))
            }
            None => Err(LexError {
                message: "unexpected end of input in escape sequence".into(),
                pos,
            }),
        }
    }

    fn scan_unicode_escape(&mut self, pos: SourcePos) -> Result<char, LexError> {
        if self.advance_if(b'{') {
            // \u{XXXXX}
            let mut code: u32 = 0;
            let mut count = 0;
            while let Some(b) = self.peek() {
                if b == b'}' {
                    break;
                }
                let d = hex_digit_val(self.advance().unwrap()).ok_or(LexError {
                    message: "invalid unicode escape".into(),
                    pos,
                })?;
                code = code * 16 + d as u32;
                count += 1;
                if code > 0x10FFFF {
                    return Err(LexError {
                        message: "unicode escape out of range".into(),
                        pos,
                    });
                }
            }
            if count == 0 || !self.advance_if(b'}') {
                return Err(LexError {
                    message: "invalid unicode escape".into(),
                    pos,
                });
            }
            char::from_u32(code).ok_or(LexError {
                message: "invalid unicode code point".into(),
                pos,
            })
        } else {
            // \uXXXX
            let mut code: u32 = 0;
            for _ in 0..4 {
                let d = self.advance().and_then(hex_digit_val).ok_or(LexError {
                    message: "invalid unicode escape".into(),
                    pos,
                })?;
                code = code * 16 + d as u32;
            }
            char::from_u32(code).ok_or(LexError {
                message: "invalid unicode code point".into(),
                pos,
            })
        }
    }

    /// Advance one full UTF-8 character and return it.
    fn advance_char(&mut self) -> char {
        let start = self.pos;
        let b = self.advance().unwrap();
        if b < 0x80 {
            return b as char;
        }
        // multi-byte: determine length
        let len = if b >= 0xF0 {
            4
        } else if b >= 0xE0 {
            3
        } else {
            2
        };
        for _ in 1..len {
            self.advance();
        }
        let s = std::str::from_utf8(&self.source[start..self.pos]).unwrap_or("\u{FFFD}");
        s.chars().next().unwrap_or('\u{FFFD}')
    }

    // ── Template Literals ────────────────────────────────────

    fn scan_template_start(&mut self, start: SourcePos) -> Result<Token, LexError> {
        let mut value = std::string::String::new();
        loop {
            match self.peek() {
                None => {
                    return Err(LexError {
                        message: "unterminated template literal".into(),
                        pos: start,
                    });
                }
                Some(b'`') => {
                    self.advance();
                    let end = self.current_pos();
                    let kind = TokenKind::TemplateFull(value);
                    self.prev_token_is_expr_end = true;
                    return Ok(Token {
                        kind,
                        span: Span { start, end },
                        preceded_by_newline: self.saw_newline,
                    });
                }
                Some(b'$') if self.peek_at(1) == Some(b'{') => {
                    self.advance(); // $
                    self.advance(); // {
                    self.template_depth += 1;
                    self.template_brace_stack.push(self.brace_depth);
                    let end = self.current_pos();
                    let kind = TokenKind::TemplateHead(value);
                    self.prev_token_is_expr_end = false;
                    return Ok(Token {
                        kind,
                        span: Span { start, end },
                        preceded_by_newline: self.saw_newline,
                    });
                }
                Some(b'\\') => {
                    self.advance();
                    if let Some(ch) = self.scan_escape_sequence()? {
                        value.push(ch);
                    }
                }
                Some(_) => {
                    let ch = self.advance_char();
                    value.push(ch);
                }
            }
        }
    }

    fn scan_template_continuation(&mut self, start: SourcePos) -> Result<Token, LexError> {
        let mut value = std::string::String::new();
        loop {
            match self.peek() {
                None => {
                    return Err(LexError {
                        message: "unterminated template literal".into(),
                        pos: start,
                    });
                }
                Some(b'`') => {
                    self.advance();
                    let end = self.current_pos();
                    let kind = TokenKind::TemplateTail(value);
                    self.prev_token_is_expr_end = true;
                    return Ok(Token {
                        kind,
                        span: Span { start, end },
                        preceded_by_newline: self.saw_newline,
                    });
                }
                Some(b'$') if self.peek_at(1) == Some(b'{') => {
                    self.advance(); // $
                    self.advance(); // {
                    self.template_depth += 1;
                    self.template_brace_stack.push(self.brace_depth);
                    let end = self.current_pos();
                    let kind = TokenKind::TemplateMiddle(value);
                    self.prev_token_is_expr_end = false;
                    return Ok(Token {
                        kind,
                        span: Span { start, end },
                        preceded_by_newline: self.saw_newline,
                    });
                }
                Some(b'\\') => {
                    self.advance();
                    if let Some(ch) = self.scan_escape_sequence()? {
                        value.push(ch);
                    }
                }
                Some(_) => {
                    let ch = self.advance_char();
                    value.push(ch);
                }
            }
        }
    }

    // ── Identifiers & Keywords ───────────────────────────────

    fn scan_identifier_or_keyword(&mut self) -> TokenKind {
        let start = self.pos;

        // Consume the first character (which we already validated)
        self.advance_char();

        // Consume continue characters
        while self.pos < self.source.len() {
            let b = self.source[self.pos];
            match b {
                b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' | b'_' | b'$' => {
                    self.advance();
                }
                0xC0..=0xF7 if is_unicode_id_continue(self.source, self.pos) => {
                    self.advance_char();
                }
                _ => break,
            }
        }

        let text = self.slice(start, self.pos);
        keyword_or_ident(text)
    }

    // ── Regular Expressions ──────────────────────────────────

    fn scan_regexp(&mut self) -> Result<TokenKind, LexError> {
        let start_pos = self.current_pos();
        self.advance(); // opening /

        let mut pattern = std::string::String::new();
        let mut in_class = false;

        loop {
            match self.peek() {
                None | Some(b'\n') => {
                    return Err(LexError {
                        message: "unterminated regexp literal".into(),
                        pos: start_pos,
                    });
                }
                Some(b'/') if !in_class => {
                    self.advance();
                    break;
                }
                Some(b'[') => {
                    in_class = true;
                    pattern.push('[');
                    self.advance();
                }
                Some(b']') if in_class => {
                    in_class = false;
                    pattern.push(']');
                    self.advance();
                }
                Some(b'\\') => {
                    self.advance();
                    pattern.push('\\');
                    if let Some(b2) = self.peek() {
                        if b2 != b'\n' {
                            pattern.push(b2 as char);
                            self.advance();
                        }
                    }
                }
                Some(b) => {
                    pattern.push(b as char);
                    self.advance();
                }
            }
        }

        // Flags
        let mut flags = std::string::String::new();
        while matches!(self.peek(), Some(b'g' | b'i' | b'm' | b's' | b'u' | b'y')) {
            flags.push(self.advance().unwrap() as char);
        }

        Ok(TokenKind::RegExp { pattern, flags })
    }

    // ── Punctuators ──────────────────────────────────────────

    fn scan_punctuator(&mut self) -> Result<TokenKind, LexError> {
        let pos = self.current_pos();
        let b = self.advance().unwrap();

        let kind = match b {
            b'(' => TokenKind::LParen,
            b')' => TokenKind::RParen,
            b'[' => TokenKind::LBracket,
            b']' => TokenKind::RBracket,
            b'{' => {
                self.brace_depth += 1;
                TokenKind::LBrace
            }
            b'}' => {
                self.brace_depth = self.brace_depth.saturating_sub(1);
                TokenKind::RBrace
            }
            b';' => TokenKind::Semicolon,
            b',' => TokenKind::Comma,
            b':' => TokenKind::Colon,
            b'~' => TokenKind::Tilde,

            b'.' => {
                if self.peek() == Some(b'.') && self.peek_at(1) == Some(b'.') {
                    self.advance();
                    self.advance();
                    TokenKind::Ellipsis
                } else {
                    TokenKind::Dot
                }
            }

            b'?' => {
                if self.advance_if(b'?') {
                    if self.advance_if(b'=') {
                        TokenKind::NullishAssign
                    } else {
                        TokenKind::Nullish
                    }
                } else if self.peek() == Some(b'.') && !matches!(self.peek_at(1), Some(b'0'..=b'9'))
                {
                    self.advance();
                    TokenKind::QuestionDot
                } else {
                    TokenKind::Question
                }
            }

            b'+' => {
                if self.advance_if(b'+') {
                    TokenKind::PlusPlus
                } else if self.advance_if(b'=') {
                    TokenKind::PlusAssign
                } else {
                    TokenKind::Plus
                }
            }

            b'-' => {
                if self.advance_if(b'-') {
                    TokenKind::MinusMinus
                } else if self.advance_if(b'=') {
                    TokenKind::MinusAssign
                } else {
                    TokenKind::Minus
                }
            }

            b'*' => {
                if self.advance_if(b'*') {
                    if self.advance_if(b'=') {
                        TokenKind::ExpAssign
                    } else {
                        TokenKind::Exp
                    }
                } else if self.advance_if(b'=') {
                    TokenKind::StarAssign
                } else {
                    TokenKind::Star
                }
            }

            b'/' => {
                // We only get here for division (regexp was handled earlier)
                if self.advance_if(b'=') {
                    TokenKind::SlashAssign
                } else {
                    TokenKind::Slash
                }
            }

            b'%' => {
                if self.advance_if(b'=') {
                    TokenKind::PercentAssign
                } else {
                    TokenKind::Percent
                }
            }

            b'=' => {
                if self.advance_if(b'=') {
                    if self.advance_if(b'=') {
                        TokenKind::StrictEq
                    } else {
                        TokenKind::Eq
                    }
                } else if self.advance_if(b'>') {
                    TokenKind::Arrow
                } else {
                    TokenKind::Assign
                }
            }

            b'!' => {
                if self.advance_if(b'=') {
                    if self.advance_if(b'=') {
                        TokenKind::StrictNe
                    } else {
                        TokenKind::Ne
                    }
                } else {
                    TokenKind::Not
                }
            }

            b'<' => {
                if self.advance_if(b'<') {
                    if self.advance_if(b'=') {
                        TokenKind::ShlAssign
                    } else {
                        TokenKind::Shl
                    }
                } else if self.advance_if(b'=') {
                    TokenKind::Le
                } else {
                    TokenKind::Lt
                }
            }

            b'>' => {
                if self.advance_if(b'>') {
                    if self.advance_if(b'>') {
                        if self.advance_if(b'=') {
                            TokenKind::UshrAssign
                        } else {
                            TokenKind::Ushr
                        }
                    } else if self.advance_if(b'=') {
                        TokenKind::ShrAssign
                    } else {
                        TokenKind::Shr
                    }
                } else if self.advance_if(b'=') {
                    TokenKind::Ge
                } else {
                    TokenKind::Gt
                }
            }

            b'&' => {
                if self.advance_if(b'&') {
                    if self.advance_if(b'=') {
                        TokenKind::AndAssign
                    } else {
                        TokenKind::And
                    }
                } else if self.advance_if(b'=') {
                    TokenKind::AmpAssign
                } else {
                    TokenKind::Amp
                }
            }

            b'|' => {
                if self.advance_if(b'|') {
                    if self.advance_if(b'=') {
                        TokenKind::OrAssign
                    } else {
                        TokenKind::Or
                    }
                } else if self.advance_if(b'=') {
                    TokenKind::PipeAssign
                } else {
                    TokenKind::Pipe
                }
            }

            b'^' => {
                if self.advance_if(b'=') {
                    TokenKind::CaretAssign
                } else {
                    TokenKind::Caret
                }
            }

            _ => {
                return Err(LexError {
                    message: format!("unexpected character: {:?}", b as char),
                    pos,
                });
            }
        };

        Ok(kind)
    }
}

// ── Keyword lookup ───────────────────────────────────────────

fn keyword_or_ident(s: &str) -> TokenKind {
    match s {
        "await" => TokenKind::Await,
        "break" => TokenKind::Break,
        "case" => TokenKind::Case,
        "catch" => TokenKind::Catch,
"class" => TokenKind::Class, 1208 "const" => TokenKind::Const, 1209 "continue" => TokenKind::Continue, 1210 "debugger" => TokenKind::Debugger, 1211 "default" => TokenKind::Default, 1212 "delete" => TokenKind::Delete, 1213 "do" => TokenKind::Do, 1214 "else" => TokenKind::Else, 1215 "export" => TokenKind::Export, 1216 "extends" => TokenKind::Extends, 1217 "finally" => TokenKind::Finally, 1218 "for" => TokenKind::For, 1219 "function" => TokenKind::Function, 1220 "if" => TokenKind::If, 1221 "import" => TokenKind::Import, 1222 "in" => TokenKind::In, 1223 "instanceof" => TokenKind::Instanceof, 1224 "let" => TokenKind::Let, 1225 "new" => TokenKind::New, 1226 "of" => TokenKind::Of, 1227 "return" => TokenKind::Return, 1228 "static" => TokenKind::Static, 1229 "super" => TokenKind::Super, 1230 "switch" => TokenKind::Switch, 1231 "this" => TokenKind::This, 1232 "throw" => TokenKind::Throw, 1233 "try" => TokenKind::Try, 1234 "typeof" => TokenKind::Typeof, 1235 "var" => TokenKind::Var, 1236 "void" => TokenKind::Void, 1237 "while" => TokenKind::While, 1238 "with" => TokenKind::With, 1239 "yield" => TokenKind::Yield, 1240 "async" => TokenKind::Async, 1241 "true" => TokenKind::True, 1242 "false" => TokenKind::False, 1243 "null" => TokenKind::Null, 1244 _ => TokenKind::Identifier(s.to_owned()), 1245 } 1246} 1247 1248// ── Expression-end tracking ────────────────────────────────── 1249 1250/// Returns `true` if a token of this kind could end an expression. 1251/// Used to decide whether a following `/` is division or a RegExp literal. 1252fn token_is_expr_end(kind: &TokenKind) -> bool { 1253 matches!( 1254 kind, 1255 TokenKind::Identifier(_) 1256 | TokenKind::Number(_) 1257 | TokenKind::String(_) 1258 | TokenKind::TemplateFull(_) 1259 | TokenKind::TemplateTail(_) 1260 | TokenKind::True 1261 | TokenKind::False 1262 | TokenKind::Null 1263 | TokenKind::This 1264 | TokenKind::Super 1265 | TokenKind::RParen 1266 | TokenKind::RBracket 1267 | TokenKind::RBrace 1268 | TokenKind::PlusPlus 1269 | TokenKind::MinusMinus 1270 | TokenKind::RegExp { .. } 1271 ) 1272} 1273 1274// ── Unicode helpers ────────────────────────────────────────── 1275 1276/// Check if the byte sequence at `pos` starts a valid Unicode identifier start character. 1277fn is_unicode_id_start(source: &[u8], pos: usize) -> bool { 1278 let s = std::str::from_utf8(&source[pos..]).unwrap_or(""); 1279 if let Some(ch) = s.chars().next() { 1280 ch.is_alphabetic() || ch == '_' || ch == '$' 1281 } else { 1282 false 1283 } 1284} 1285 1286/// Check if the byte sequence at `pos` starts a valid Unicode identifier continue character. 
fn is_unicode_id_continue(source: &[u8], pos: usize) -> bool {
    let s = std::str::from_utf8(&source[pos..]).unwrap_or("");
    if let Some(ch) = s.chars().next() {
        ch.is_alphanumeric() || ch == '_' || ch == '$' || ch == '\u{200C}' || ch == '\u{200D}'
    } else {
        false
    }
}

// ── Numeric parsing helpers ──────────────────────────────────

fn hex_digit_val(b: u8) -> Option<u8> {
    match b {
        b'0'..=b'9' => Some(b - b'0'),
        b'a'..=b'f' => Some(b - b'a' + 10),
        b'A'..=b'F' => Some(b - b'A' + 10),
        _ => None,
    }
}

fn parse_decimal(s: &str) -> f64 {
    let s = s.replace('_', "");
    let s = s.trim_end_matches('n');
    // Separators and the BigInt suffix are stripped above; `str::parse`
    // handles the remaining decimal/exponent syntax.
    if let Ok(v) = s.parse::<f64>() {
        return v;
    }
    0.0
}

fn u64_from_hex(s: &str) -> u64 {
    let mut result: u64 = 0;
    for b in s.bytes() {
        if b == b'_' {
            continue;
        }
        let d = hex_digit_val(b).unwrap_or(0) as u64;
        result = result.wrapping_mul(16).wrapping_add(d);
    }
    result
}

fn u64_from_octal(s: &str) -> u64 {
    let mut result: u64 = 0;
    for b in s.bytes() {
        if b == b'_' {
            continue;
        }
        let d = (b - b'0') as u64;
        result = result.wrapping_mul(8).wrapping_add(d);
    }
    result
}

fn u64_from_binary(s: &str) -> u64 {
    let mut result: u64 = 0;
    for b in s.bytes() {
        if b == b'_' {
            continue;
        }
        let d = (b - b'0') as u64;
        result = result.wrapping_mul(2).wrapping_add(d);
    }
    result
}

// ── Error type ───────────────────────────────────────────────

/// An error produced during lexing.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct LexError {
    pub message: std::string::String,
    pub pos: SourcePos,
}

impl fmt::Display for LexError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(
            f,
            "LexError at {}:{}: {}",
            self.pos.line, self.pos.col, self.message
        )
    }
}

// ── Tests ────────────────────────────────────────────────────

#[cfg(test)]
mod tests {
    use super::*;

    fn kinds(src: &str) -> Vec<TokenKind> {
        Lexer::tokenize(src)
            .unwrap()
            .into_iter()
            .map(|t| t.kind)
            .collect()
    }

    fn kind(src: &str) -> TokenKind {
        let tokens = Lexer::tokenize(src).unwrap();
        assert!(tokens.len() >= 2, "expected at least one token + Eof");
        tokens[0].kind.clone()
    }

    // ── Keywords ──────────────────────────────────────────

    #[test]
    fn test_keywords() {
        assert_eq!(kind("var"), TokenKind::Var);
        assert_eq!(kind("let"), TokenKind::Let);
        assert_eq!(kind("const"), TokenKind::Const);
        assert_eq!(kind("function"), TokenKind::Function);
        assert_eq!(kind("class"), TokenKind::Class);
        assert_eq!(kind("if"), TokenKind::If);
        assert_eq!(kind("else"), TokenKind::Else);
        assert_eq!(kind("for"), TokenKind::For);
        assert_eq!(kind("while"), TokenKind::While);
        assert_eq!(kind("do"), TokenKind::Do);
        assert_eq!(kind("switch"), TokenKind::Switch);
        assert_eq!(kind("case"), TokenKind::Case);
        assert_eq!(kind("break"), TokenKind::Break);
        assert_eq!(kind("continue"), TokenKind::Continue);
        assert_eq!(kind("return"), TokenKind::Return);
        assert_eq!(kind("throw"), TokenKind::Throw);
        assert_eq!(kind("try"), TokenKind::Try);
assert_eq!(kind("catch"), TokenKind::Catch); 1414 assert_eq!(kind("finally"), TokenKind::Finally); 1415 assert_eq!(kind("new"), TokenKind::New); 1416 assert_eq!(kind("delete"), TokenKind::Delete); 1417 assert_eq!(kind("typeof"), TokenKind::Typeof); 1418 assert_eq!(kind("instanceof"), TokenKind::Instanceof); 1419 assert_eq!(kind("void"), TokenKind::Void); 1420 assert_eq!(kind("in"), TokenKind::In); 1421 assert_eq!(kind("of"), TokenKind::Of); 1422 assert_eq!(kind("import"), TokenKind::Import); 1423 assert_eq!(kind("export"), TokenKind::Export); 1424 assert_eq!(kind("default"), TokenKind::Default); 1425 assert_eq!(kind("async"), TokenKind::Async); 1426 assert_eq!(kind("await"), TokenKind::Await); 1427 assert_eq!(kind("yield"), TokenKind::Yield); 1428 assert_eq!(kind("this"), TokenKind::This); 1429 assert_eq!(kind("super"), TokenKind::Super); 1430 assert_eq!(kind("extends"), TokenKind::Extends); 1431 assert_eq!(kind("static"), TokenKind::Static); 1432 assert_eq!(kind("debugger"), TokenKind::Debugger); 1433 assert_eq!(kind("with"), TokenKind::With); 1434 } 1435 1436 #[test] 1437 fn test_literal_keywords() { 1438 assert_eq!(kind("true"), TokenKind::True); 1439 assert_eq!(kind("false"), TokenKind::False); 1440 assert_eq!(kind("null"), TokenKind::Null); 1441 } 1442 1443 // ── Identifiers ────────────────────────────────────── 1444 1445 #[test] 1446 fn test_identifiers() { 1447 assert_eq!(kind("foo"), TokenKind::Identifier("foo".into())); 1448 assert_eq!(kind("_bar"), TokenKind::Identifier("_bar".into())); 1449 assert_eq!(kind("$baz"), TokenKind::Identifier("$baz".into())); 1450 assert_eq!(kind("abc123"), TokenKind::Identifier("abc123".into())); 1451 assert_eq!(kind("camelCase"), TokenKind::Identifier("camelCase".into())); 1452 } 1453 1454 #[test] 1455 fn test_unicode_identifiers() { 1456 assert_eq!(kind("café"), TokenKind::Identifier("café".into())); 1457 } 1458 1459 // ── Numbers ────────────────────────────────────────── 1460 1461 #[test] 1462 fn test_integers() { 1463 assert_eq!(kind("0"), TokenKind::Number(0.0)); 1464 assert_eq!(kind("42"), TokenKind::Number(42.0)); 1465 assert_eq!(kind("123456"), TokenKind::Number(123456.0)); 1466 } 1467 1468 #[test] 1469 fn test_floats() { 1470 assert_eq!(kind("3.14"), TokenKind::Number(3.14)); 1471 assert_eq!(kind("0.5"), TokenKind::Number(0.5)); 1472 assert_eq!(kind(".5"), TokenKind::Number(0.5)); 1473 assert_eq!(kind("1."), TokenKind::Number(1.0)); 1474 } 1475 1476 #[test] 1477 fn test_exponents() { 1478 assert_eq!(kind("1e2"), TokenKind::Number(100.0)); 1479 assert_eq!(kind("1E2"), TokenKind::Number(100.0)); 1480 assert_eq!(kind("1e+2"), TokenKind::Number(100.0)); 1481 assert_eq!(kind("1e-2"), TokenKind::Number(0.01)); 1482 assert_eq!(kind("2.5e3"), TokenKind::Number(2500.0)); 1483 } 1484 1485 #[test] 1486 fn test_hex() { 1487 assert_eq!(kind("0xFF"), TokenKind::Number(255.0)); 1488 assert_eq!(kind("0x0"), TokenKind::Number(0.0)); 1489 assert_eq!(kind("0xDEAD"), TokenKind::Number(0xDEAD as f64)); 1490 } 1491 1492 #[test] 1493 fn test_octal() { 1494 assert_eq!(kind("0o77"), TokenKind::Number(63.0)); 1495 assert_eq!(kind("0O10"), TokenKind::Number(8.0)); 1496 } 1497 1498 #[test] 1499 fn test_binary() { 1500 assert_eq!(kind("0b1010"), TokenKind::Number(10.0)); 1501 assert_eq!(kind("0B11"), TokenKind::Number(3.0)); 1502 } 1503 1504 #[test] 1505 fn test_numeric_separators() { 1506 assert_eq!(kind("1_000"), TokenKind::Number(1000.0)); 1507 assert_eq!(kind("0xFF_FF"), TokenKind::Number(65535.0)); 1508 assert_eq!(kind("0b1010_0101"), TokenKind::Number(165.0)); 
    }

    // ── Strings ──────────────────────────────────────────

    #[test]
    fn test_double_quoted_string() {
        assert_eq!(kind(r#""hello""#), TokenKind::String("hello".into()));
    }

    #[test]
    fn test_single_quoted_string() {
        assert_eq!(kind("'world'"), TokenKind::String("world".into()));
    }

    #[test]
    fn test_string_escapes() {
        assert_eq!(kind(r#""\n\t\r""#), TokenKind::String("\n\t\r".into()));
        assert_eq!(kind(r#""\\""#), TokenKind::String("\\".into()));
        assert_eq!(kind(r#""\"""#), TokenKind::String("\"".into()));
    }

    #[test]
    fn test_string_hex_escape() {
        assert_eq!(kind(r#""\x41""#), TokenKind::String("A".into()));
    }

    #[test]
    fn test_string_unicode_escape() {
        assert_eq!(kind(r#""\u0041""#), TokenKind::String("A".into()));
        assert_eq!(
            kind(r#""\u{1F600}""#),
            TokenKind::String("\u{1F600}".into())
        );
    }

    #[test]
    fn test_string_line_continuation() {
        // \<newline> is a line continuation producing no character
        assert_eq!(
            kind("\"line1\\\nline2\""),
            TokenKind::String("line1line2".into())
        );
    }

    #[test]
    fn test_empty_string() {
        assert_eq!(kind(r#""""#), TokenKind::String("".into()));
        assert_eq!(kind("''"), TokenKind::String("".into()));
    }

    // ── Template Literals ────────────────────────────────

    #[test]
    fn test_template_no_substitution() {
        assert_eq!(kind("`hello`"), TokenKind::TemplateFull("hello".into()));
    }

    #[test]
    fn test_template_with_substitution() {
        let tokens = Lexer::tokenize("`hello ${name}!`").unwrap();
        let k: Vec<_> = tokens.iter().map(|t| &t.kind).collect();
        assert_eq!(k[0], &TokenKind::TemplateHead("hello ".into()));
        assert_eq!(k[1], &TokenKind::Identifier("name".into()));
        assert_eq!(k[2], &TokenKind::TemplateTail("!".into()));
    }

    #[test]
    fn test_template_multiple_substitutions() {
        let tokens = Lexer::tokenize("`a${1}b${2}c`").unwrap();
        let k: Vec<_> = tokens.iter().map(|t| &t.kind).collect();
        assert_eq!(k[0], &TokenKind::TemplateHead("a".into()));
        assert_eq!(k[1], &TokenKind::Number(1.0));
        assert_eq!(k[2], &TokenKind::TemplateMiddle("b".into()));
        assert_eq!(k[3], &TokenKind::Number(2.0));
        assert_eq!(k[4], &TokenKind::TemplateTail("c".into()));
    }

    #[test]
    fn test_template_with_nested_braces() {
        // `${({a:1})}` — the object literal inside ${ } has its own braces
        let tokens = Lexer::tokenize("`${({a:1})}`").unwrap();
        let k: Vec<_> = tokens.iter().map(|t| &t.kind).collect();
        assert_eq!(k[0], &TokenKind::TemplateHead("".into()));
        assert_eq!(k[1], &TokenKind::LParen);
        assert_eq!(k[2], &TokenKind::LBrace);
        assert_eq!(k[3], &TokenKind::Identifier("a".into()));
        assert_eq!(k[4], &TokenKind::Colon);
        assert_eq!(k[5], &TokenKind::Number(1.0));
        assert_eq!(k[6], &TokenKind::RBrace);
        assert_eq!(k[7], &TokenKind::RParen);
        assert_eq!(k[8], &TokenKind::TemplateTail("".into()));
    }

    // ── Regular Expressions ──────────────────────────────

    #[test]
    fn test_regexp_basic() {
        let tokens = Lexer::tokenize("x = /foo/gi").unwrap();
        let k: Vec<_> = tokens.iter().map(|t| &t.kind).collect();
        assert_eq!(
            k[2],
            &TokenKind::RegExp {
                pattern: "foo".into(),
                flags: "gi".into()
            }
        );
    }

    #[test]
    fn test_regexp_with_class() {
        // /[a-z]/ — the `/` inside the character class is not the end
        let tokens = Lexer::tokenize("x = /[a/b]/").unwrap();
        let k: Vec<_> = tokens.iter().map(|t| &t.kind).collect();
        assert_eq!(
            k[2],
            &TokenKind::RegExp {
                pattern: "[a/b]".into(),
                flags: "".into()
            }
        );
    }

    #[test]
    fn test_regexp_vs_division() {
        // After an identifier, `/` is division
        let tokens = Lexer::tokenize("a / b").unwrap();
        let k: Vec<_> = tokens.iter().map(|t| &t.kind).collect();
        assert_eq!(k[1], &TokenKind::Slash);
    }

    // ── Punctuators ──────────────────────────────────────

    #[test]
    fn test_simple_punctuators() {
        assert_eq!(kind("("), TokenKind::LParen);
        assert_eq!(kind(")"), TokenKind::RParen);
        assert_eq!(kind("["), TokenKind::LBracket);
        assert_eq!(kind("]"), TokenKind::RBracket);
        assert_eq!(kind("{"), TokenKind::LBrace);
        assert_eq!(kind("}"), TokenKind::RBrace);
        assert_eq!(kind(";"), TokenKind::Semicolon);
        assert_eq!(kind(","), TokenKind::Comma);
        assert_eq!(kind(":"), TokenKind::Colon);
        assert_eq!(kind("~"), TokenKind::Tilde);
    }

    #[test]
    fn test_dot_and_ellipsis() {
        assert_eq!(kind("."), TokenKind::Dot);
        assert_eq!(kind("..."), TokenKind::Ellipsis);
    }

    #[test]
    fn test_arrow() {
        assert_eq!(kind("=>"), TokenKind::Arrow);
    }

    #[test]
    fn test_optional_chaining() {
        assert_eq!(kind("?."), TokenKind::QuestionDot);
    }

    #[test]
    fn test_comparison_operators() {
        assert_eq!(kind("=="), TokenKind::Eq);
        assert_eq!(kind("!="), TokenKind::Ne);
        assert_eq!(kind("==="), TokenKind::StrictEq);
        assert_eq!(kind("!=="), TokenKind::StrictNe);
        assert_eq!(kind("<"), TokenKind::Lt);
        assert_eq!(kind(">"), TokenKind::Gt);
        assert_eq!(kind("<="), TokenKind::Le);
        assert_eq!(kind(">="), TokenKind::Ge);
    }

    #[test]
    fn test_arithmetic_operators() {
        assert_eq!(kind("+"), TokenKind::Plus);
        assert_eq!(kind("-"), TokenKind::Minus);
        assert_eq!(kind("*"), TokenKind::Star);
        assert_eq!(kind("%"), TokenKind::Percent);
        assert_eq!(kind("**"), TokenKind::Exp);
        assert_eq!(kind("++"), TokenKind::PlusPlus);
        assert_eq!(kind("--"), TokenKind::MinusMinus);
    }

    #[test]
    fn test_bitwise_operators() {
        assert_eq!(kind("&"), TokenKind::Amp);
        assert_eq!(kind("|"), TokenKind::Pipe);
        assert_eq!(kind("^"), TokenKind::Caret);
        assert_eq!(kind("<<"), TokenKind::Shl);
        assert_eq!(kind(">>"), TokenKind::Shr);
        assert_eq!(kind(">>>"), TokenKind::Ushr);
    }

    #[test]
    fn test_logical_operators() {
        assert_eq!(kind("&&"), TokenKind::And);
        assert_eq!(kind("||"), TokenKind::Or);
        assert_eq!(kind("!"), TokenKind::Not);
        assert_eq!(kind("??"), TokenKind::Nullish);
    }

    #[test]
    fn test_assignment_operators() {
        assert_eq!(kind("="), TokenKind::Assign);
        assert_eq!(kind("+="), TokenKind::PlusAssign);
        assert_eq!(kind("-="), TokenKind::MinusAssign);
        assert_eq!(kind("*="), TokenKind::StarAssign);
        assert_eq!(kind("%="), TokenKind::PercentAssign);
        assert_eq!(kind("**="), TokenKind::ExpAssign);
        assert_eq!(kind("&="), TokenKind::AmpAssign);
        assert_eq!(kind("|="), TokenKind::PipeAssign);
        assert_eq!(kind("^="), TokenKind::CaretAssign);
        assert_eq!(kind("<<="), TokenKind::ShlAssign);
assert_eq!(kind(">>="), TokenKind::ShrAssign); 1725 assert_eq!(kind(">>>="), TokenKind::UshrAssign); 1726 assert_eq!(kind("&&="), TokenKind::AndAssign); 1727 assert_eq!(kind("||="), TokenKind::OrAssign); 1728 assert_eq!(kind("??="), TokenKind::NullishAssign); 1729 } 1730 1731 // ── Comments ───────────────────────────────────────── 1732 1733 #[test] 1734 fn test_single_line_comment() { 1735 let tokens = kinds("a // comment\nb"); 1736 assert_eq!(tokens.len(), 3); // a, b, Eof 1737 assert_eq!(tokens[0], TokenKind::Identifier("a".into())); 1738 assert_eq!(tokens[1], TokenKind::Identifier("b".into())); 1739 } 1740 1741 #[test] 1742 fn test_multi_line_comment() { 1743 let tokens = kinds("a /* comment */ b"); 1744 assert_eq!(tokens.len(), 3); 1745 assert_eq!(tokens[0], TokenKind::Identifier("a".into())); 1746 assert_eq!(tokens[1], TokenKind::Identifier("b".into())); 1747 } 1748 1749 // ── Source positions ───────────────────────────────── 1750 1751 #[test] 1752 fn test_source_positions() { 1753 let tokens = Lexer::tokenize("let x = 42").unwrap(); 1754 // `let` at line 1, col 1 1755 assert_eq!(tokens[0].span.start, SourcePos { line: 1, col: 1 }); 1756 // `x` at line 1, col 5 1757 assert_eq!(tokens[1].span.start, SourcePos { line: 1, col: 5 }); 1758 // `=` at line 1, col 7 1759 assert_eq!(tokens[2].span.start, SourcePos { line: 1, col: 7 }); 1760 // `42` at line 1, col 9 1761 assert_eq!(tokens[3].span.start, SourcePos { line: 1, col: 9 }); 1762 } 1763 1764 #[test] 1765 fn test_multiline_positions() { 1766 let tokens = Lexer::tokenize("a\nb\nc").unwrap(); 1767 assert_eq!(tokens[0].span.start, SourcePos { line: 1, col: 1 }); 1768 assert_eq!(tokens[1].span.start, SourcePos { line: 2, col: 1 }); 1769 assert_eq!(tokens[2].span.start, SourcePos { line: 3, col: 1 }); 1770 } 1771 1772 // ── Newline tracking (ASI) ─────────────────────────── 1773 1774 #[test] 1775 fn test_preceded_by_newline() { 1776 let tokens = Lexer::tokenize("a\nb").unwrap(); 1777 assert!(!tokens[0].preceded_by_newline); // `a` 1778 assert!(tokens[1].preceded_by_newline); // `b` 1779 } 1780 1781 // ── Error cases ────────────────────────────────────── 1782 1783 #[test] 1784 fn test_unterminated_string() { 1785 assert!(Lexer::tokenize("\"hello").is_err()); 1786 } 1787 1788 #[test] 1789 fn test_unterminated_block_comment() { 1790 assert!(Lexer::tokenize("/* oops").is_err()); 1791 } 1792 1793 #[test] 1794 fn test_unterminated_template() { 1795 assert!(Lexer::tokenize("`hello").is_err()); 1796 } 1797 1798 #[test] 1799 fn test_bad_hex_literal() { 1800 assert!(Lexer::tokenize("0x").is_err()); 1801 } 1802 1803 // ── Full statement tokenization ────────────────────── 1804 1805 #[test] 1806 fn test_full_statement() { 1807 let tokens = kinds("const x = 42 + y;"); 1808 assert_eq!( 1809 tokens, 1810 vec![ 1811 TokenKind::Const, 1812 TokenKind::Identifier("x".into()), 1813 TokenKind::Assign, 1814 TokenKind::Number(42.0), 1815 TokenKind::Plus, 1816 TokenKind::Identifier("y".into()), 1817 TokenKind::Semicolon, 1818 TokenKind::Eof, 1819 ] 1820 ); 1821 } 1822 1823 #[test] 1824 fn test_arrow_function() { 1825 let tokens = kinds("(x) => x + 1"); 1826 assert_eq!( 1827 tokens, 1828 vec![ 1829 TokenKind::LParen, 1830 TokenKind::Identifier("x".into()), 1831 TokenKind::RParen, 1832 TokenKind::Arrow, 1833 TokenKind::Identifier("x".into()), 1834 TokenKind::Plus, 1835 TokenKind::Number(1.0), 1836 TokenKind::Eof, 1837 ] 1838 ); 1839 } 1840 1841 #[test] 1842 fn test_complex_expression() { 1843 let tokens = kinds("a?.b ?? 
c !== d"); 1844 assert_eq!( 1845 tokens, 1846 vec![ 1847 TokenKind::Identifier("a".into()), 1848 TokenKind::QuestionDot, 1849 TokenKind::Identifier("b".into()), 1850 TokenKind::Nullish, 1851 TokenKind::Identifier("c".into()), 1852 TokenKind::StrictNe, 1853 TokenKind::Identifier("d".into()), 1854 TokenKind::Eof, 1855 ] 1856 ); 1857 } 1858 1859 #[test] 1860 fn test_division_after_paren() { 1861 // `(a) / b` — the `/` after `)` should be division, not regexp 1862 let tokens = kinds("(a) / b"); 1863 assert_eq!( 1864 tokens, 1865 vec![ 1866 TokenKind::LParen, 1867 TokenKind::Identifier("a".into()), 1868 TokenKind::RParen, 1869 TokenKind::Slash, 1870 TokenKind::Identifier("b".into()), 1871 TokenKind::Eof, 1872 ] 1873 ); 1874 } 1875 1876 #[test] 1877 fn test_slash_assign() { 1878 let tokens = kinds("a /= b"); 1879 assert_eq!( 1880 tokens, 1881 vec![ 1882 TokenKind::Identifier("a".into()), 1883 TokenKind::SlashAssign, 1884 TokenKind::Identifier("b".into()), 1885 TokenKind::Eof, 1886 ] 1887 ); 1888 } 1889 1890 #[test] 1891 fn test_regexp_after_assign() { 1892 let tokens = kinds("x = /test/g"); 1893 assert_eq!( 1894 tokens, 1895 vec![ 1896 TokenKind::Identifier("x".into()), 1897 TokenKind::Assign, 1898 TokenKind::RegExp { 1899 pattern: "test".into(), 1900 flags: "g".into() 1901 }, 1902 TokenKind::Eof, 1903 ] 1904 ); 1905 } 1906}