we (web engine): Experimental web browser project to understand the limits of Claude
js-bytecode · 1909 lines · 65 kB
//! JavaScript lexer/tokenizer conforming to ECMAScript 2024.
//!
//! Converts JavaScript source text into a stream of [`Token`]s, each annotated
//! with its [`Span`] (byte offset, line, column).

use std::fmt;

/// A position in the source text.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct SourcePos {
    /// 1-based line number.
    pub line: u32,
    /// 1-based column (in bytes from the start of the line).
    pub col: u32,
}

/// A span covering a range of source text.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Span {
    pub start: SourcePos,
    pub end: SourcePos,
}

/// A token produced by the lexer.
#[derive(Debug, Clone, PartialEq)]
pub struct Token {
    pub kind: TokenKind,
    pub span: Span,
    /// Whether at least one newline preceded this token (for ASI).
    pub preceded_by_newline: bool,
}

/// Every distinct token kind the lexer can produce.
#[derive(Debug, Clone, PartialEq)]
pub enum TokenKind {
    // ── Literals ──────────────────────────────────────────────
    /// Numeric literal (the parsed `f64` value).
    Number(f64),
    /// String literal (the decoded content, without quotes).
    String(std::string::String),
    /// Regular expression literal: pattern and flags.
    RegExp {
        pattern: std::string::String,
        flags: std::string::String,
    },
    /// Template literal with no substitutions (full string content).
    TemplateFull(std::string::String),
    /// Opening part of a template literal (before the first `${`).
    TemplateHead(std::string::String),
    /// Middle part of a template literal (between `}` and next `${`).
    TemplateMiddle(std::string::String),
    /// Closing part of a template literal (after the last `}`).
    TemplateTail(std::string::String),

    // ── Identifiers & Keywords ─────────────────────────────────
    Identifier(std::string::String),

    // Keywords
    Await,
    Break,
    Case,
    Catch,
    Class,
    Const,
    Continue,
    Debugger,
    Default,
    Delete,
    Do,
    Else,
    Export,
    Extends,
    Finally,
    For,
    Function,
    If,
    Import,
    In,
    Instanceof,
    Let,
    New,
    Of,
    Return,
    Static,
    Super,
    Switch,
    This,
    Throw,
    Try,
    Typeof,
    Var,
    Void,
    While,
    With,
    Yield,
    Async,

    // Literal keywords
    True,
    False,
    Null,

    // ── Punctuators ────────────────────────────────────────────
    // Grouping
    LParen,   // (
    RParen,   // )
    LBracket, // [
    RBracket, // ]
    LBrace,   // {
    RBrace,   // }

    // Delimiters
    Semicolon, // ;
    Comma,     // ,
    Colon,     // :
    Dot,       // .
    Ellipsis,  // ...

    // Arrow
    Arrow, // =>

    // Optional chaining
    QuestionDot, // ?.

    // Ternary
    Question, // ?

    // Assignment
    Assign,        // =
    PlusAssign,    // +=
    MinusAssign,   // -=
    StarAssign,    // *=
    SlashAssign,   // /=
    PercentAssign, // %=
    ExpAssign,     // **=
    AmpAssign,     // &=
    PipeAssign,    // |=
    CaretAssign,   // ^=
    ShlAssign,     // <<=
    ShrAssign,     // >>=
    UshrAssign,    // >>>=
    AndAssign,     // &&=
    OrAssign,      // ||=
    NullishAssign, // ??=

    // Comparison
    Eq,       // ==
    Ne,       // !=
    StrictEq, // ===
    StrictNe, // !==
    Lt,       // <
    Gt,       // >
    Le,       // <=
    Ge,       // >=

    // Arithmetic
    Plus,    // +
    Minus,   // -
    Star,    // *
    Slash,   // /
    Percent, // %
    Exp,     // **

    // Increment / Decrement
    PlusPlus,   // ++
    MinusMinus, // --

    // Bitwise
    Amp,   // &
    Pipe,  // |
    Caret, // ^
    Tilde, // ~
    Shl,   // <<
    Shr,   // >>
    Ushr,  // >>>

    // Logical
    And,     // &&
    Or,      // ||
    Not,     // !
    Nullish, // ??

    // ── Special ────────────────────────────────────────────────
    /// End of input.
    Eof,
}
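
// For orientation: `let x = 1;` lexes to the kind sequence
// [Let, Identifier("x"), Assign, Number(1.0), Semicolon, Eof].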

impl fmt::Display for TokenKind {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            TokenKind::Number(n) => write!(f, "{}", n),
            TokenKind::String(s) => write!(f, "\"{}\"", s),
            TokenKind::RegExp { pattern, flags } => write!(f, "/{}/{}", pattern, flags),
            TokenKind::TemplateFull(s) => write!(f, "`{}`", s),
            TokenKind::TemplateHead(s) => write!(f, "`{}${{", s),
            TokenKind::TemplateMiddle(s) => write!(f, "}}{}${{", s),
            TokenKind::TemplateTail(s) => write!(f, "}}{}`", s),
            TokenKind::Identifier(s) => write!(f, "{}", s),
            TokenKind::Await => write!(f, "await"),
            TokenKind::Break => write!(f, "break"),
            TokenKind::Case => write!(f, "case"),
            TokenKind::Catch => write!(f, "catch"),
            TokenKind::Class => write!(f, "class"),
            TokenKind::Const => write!(f, "const"),
            TokenKind::Continue => write!(f, "continue"),
            TokenKind::Debugger => write!(f, "debugger"),
            TokenKind::Default => write!(f, "default"),
            TokenKind::Delete => write!(f, "delete"),
            TokenKind::Do => write!(f, "do"),
            TokenKind::Else => write!(f, "else"),
            TokenKind::Export => write!(f, "export"),
            TokenKind::Extends => write!(f, "extends"),
            TokenKind::Finally => write!(f, "finally"),
            TokenKind::For => write!(f, "for"),
            TokenKind::Function => write!(f, "function"),
            TokenKind::If => write!(f, "if"),
            TokenKind::Import => write!(f, "import"),
            TokenKind::In => write!(f, "in"),
            TokenKind::Instanceof => write!(f, "instanceof"),
            TokenKind::Let => write!(f, "let"),
            TokenKind::New => write!(f, "new"),
            TokenKind::Of => write!(f, "of"),
            TokenKind::Return => write!(f, "return"),
            TokenKind::Static => write!(f, "static"),
            TokenKind::Super => write!(f, "super"),
            TokenKind::Switch => write!(f, "switch"),
            TokenKind::This => write!(f, "this"),
            TokenKind::Throw => write!(f, "throw"),
            TokenKind::Try => write!(f, "try"),
            TokenKind::Typeof => write!(f, "typeof"),
            TokenKind::Var => write!(f, "var"),
            TokenKind::Void => write!(f, "void"),
            TokenKind::While => write!(f, "while"),
            TokenKind::With => write!(f, "with"),
            TokenKind::Yield => write!(f, "yield"),
            TokenKind::Async => write!(f, "async"),
            TokenKind::True => write!(f, "true"),
            TokenKind::False => write!(f, "false"),
            TokenKind::Null => write!(f, "null"),
            TokenKind::LParen => write!(f, "("),
            TokenKind::RParen => write!(f, ")"),
            TokenKind::LBracket => write!(f, "["),
            TokenKind::RBracket => write!(f, "]"),
            TokenKind::LBrace => write!(f, "{{"),
            TokenKind::RBrace => write!(f, "}}"),
            TokenKind::Semicolon => write!(f, ";"),
            TokenKind::Comma => write!(f, ","),
            TokenKind::Colon => write!(f, ":"),
            TokenKind::Dot => write!(f, "."),
            TokenKind::Ellipsis => write!(f, "..."),
            TokenKind::Arrow => write!(f, "=>"),
            TokenKind::QuestionDot => write!(f, "?."),
            TokenKind::Question => write!(f, "?"),
            TokenKind::Assign => write!(f, "="),
            TokenKind::PlusAssign => write!(f, "+="),
            TokenKind::MinusAssign => write!(f, "-="),
            TokenKind::StarAssign => write!(f, "*="),
            TokenKind::SlashAssign => write!(f, "/="),
            TokenKind::PercentAssign => write!(f, "%="),
            TokenKind::ExpAssign => write!(f, "**="),
            TokenKind::AmpAssign => write!(f, "&="),
            TokenKind::PipeAssign => write!(f, "|="),
            TokenKind::CaretAssign => write!(f, "^="),
            TokenKind::ShlAssign => write!(f, "<<="),
            TokenKind::ShrAssign => write!(f, ">>="),
            TokenKind::UshrAssign => write!(f, ">>>="),
            TokenKind::AndAssign => write!(f, "&&="),
            TokenKind::OrAssign => write!(f, "||="),
            TokenKind::NullishAssign => write!(f, "??="),
            TokenKind::Eq => write!(f, "=="),
            TokenKind::Ne => write!(f, "!="),
            TokenKind::StrictEq => write!(f, "==="),
            TokenKind::StrictNe => write!(f, "!=="),
            TokenKind::Lt => write!(f, "<"),
            TokenKind::Gt => write!(f, ">"),
            TokenKind::Le => write!(f, "<="),
            TokenKind::Ge => write!(f, ">="),
            TokenKind::Plus => write!(f, "+"),
            TokenKind::Minus => write!(f, "-"),
            TokenKind::Star => write!(f, "*"),
            TokenKind::Slash => write!(f, "/"),
            TokenKind::Percent => write!(f, "%"),
            TokenKind::Exp => write!(f, "**"),
            TokenKind::PlusPlus => write!(f, "++"),
            TokenKind::MinusMinus => write!(f, "--"),
            TokenKind::Amp => write!(f, "&"),
            TokenKind::Pipe => write!(f, "|"),
            TokenKind::Caret => write!(f, "^"),
            TokenKind::Tilde => write!(f, "~"),
            TokenKind::Shl => write!(f, "<<"),
            TokenKind::Shr => write!(f, ">>"),
            TokenKind::Ushr => write!(f, ">>>"),
            TokenKind::And => write!(f, "&&"),
            TokenKind::Or => write!(f, "||"),
            TokenKind::Not => write!(f, "!"),
            TokenKind::Nullish => write!(f, "??"),
            TokenKind::Eof => write!(f, "<EOF>"),
        }
    }
}
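
// e.g. `TokenKind::Arrow.to_string() == "=>"` and
// `TokenKind::Number(1.5).to_string() == "1.5"` — handy for diagnostics.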

/// The lexer converts JavaScript source text into tokens.
pub struct Lexer<'a> {
    source: &'a [u8],
    /// Current byte offset into `source`.
    pos: usize,
    /// Current 1-based line number.
    line: u32,
    /// Current 1-based column (byte offset from line start).
    col: u32,
    /// Whether we have crossed at least one newline since the last token.
    saw_newline: bool,
    /// Nesting depth for template literal `${...}` expressions.
    /// When > 0, a `}` at the matching depth resumes template scanning.
    template_depth: u32,
    /// Stack tracking brace depth at each template nesting level.
    /// When we enter `${`, we push the current brace depth.
    template_brace_stack: Vec<u32>,
    /// Current brace depth (incremented on `{`, decremented on `}`).
    brace_depth: u32,
    /// Tracks whether the previous token could end an expression.
    /// Used to disambiguate `/` as division vs RegExp.
    prev_token_is_expr_end: bool,
}
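
// A minimal usage sketch. The `dump` helper below is illustrative, not part
// of the original API; it lexes a snippet and prints one token per line.
#[allow(dead_code)]
fn dump(src: &str) {
    for tok in Lexer::tokenize(src).expect("lex error") {
        // `SourcePos` fields are 1-based; `TokenKind` implements Display.
        println!("{}:{} {}", tok.span.start.line, tok.span.start.col, tok.kind);
    }
}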

impl<'a> Lexer<'a> {
    /// Create a new lexer for the given source text.
    pub fn new(source: &'a str) -> Self {
        Self {
            source: source.as_bytes(),
            pos: 0,
            line: 1,
            col: 1,
            saw_newline: false,
            template_depth: 0,
            template_brace_stack: Vec::new(),
            brace_depth: 0,
            prev_token_is_expr_end: false,
        }
    }

    /// Tokenize the entire source and return all tokens (including final `Eof`).
    pub fn tokenize(source: &str) -> Result<Vec<Token>, LexError> {
        let mut lexer = Lexer::new(source);
        let mut tokens = Vec::new();
        loop {
            let tok = lexer.next_token()?;
            let is_eof = tok.kind == TokenKind::Eof;
            tokens.push(tok);
            if is_eof {
                break;
            }
        }
        Ok(tokens)
    }

    // ── Helpers ────────────────────────────────────────────────

    fn current_pos(&self) -> SourcePos {
        SourcePos {
            line: self.line,
            col: self.col,
        }
    }

    fn peek(&self) -> Option<u8> {
        self.source.get(self.pos).copied()
    }

    fn peek_at(&self, offset: usize) -> Option<u8> {
        self.source.get(self.pos + offset).copied()
    }

    fn advance(&mut self) -> Option<u8> {
        let b = self.source.get(self.pos).copied()?;
        self.pos += 1;
        if b == b'\n' {
            self.line += 1;
            self.col = 1;
            self.saw_newline = true;
        } else {
            self.col += 1;
        }
        Some(b)
    }

    fn advance_if(&mut self, expected: u8) -> bool {
        if self.peek() == Some(expected) {
            self.advance();
            true
        } else {
            false
        }
    }

    fn slice(&self, start: usize, end: usize) -> &'a str {
        // We only slice at boundaries the lexer has already walked, which in
        // practice fall on UTF-8 character boundaries. Rather than panicking
        // if that invariant is ever violated, fall back to "".
        std::str::from_utf8(&self.source[start..end]).unwrap_or("")
    }

    // ── Whitespace & Comments ──────────────────────────────────

    fn skip_whitespace_and_comments(&mut self) -> Result<(), LexError> {
        loop {
            match self.peek() {
                Some(b' ' | b'\t' | b'\r' | b'\n') => {
                    self.advance();
                }
                // Unicode BOM / non-breaking spaces
                Some(0xC2) if self.peek_at(1) == Some(0xA0) => {
                    // U+00A0 non-breaking space (2-byte UTF-8)
                    self.advance();
                    self.advance();
                }
                Some(0xEF) if self.peek_at(1) == Some(0xBB) && self.peek_at(2) == Some(0xBF) => {
                    // BOM U+FEFF
                    self.advance();
                    self.advance();
                    self.advance();
                }
                Some(b'/') => {
                    match self.peek_at(1) {
                        Some(b'/') => {
                            // single-line comment
                            self.advance(); // /
                            self.advance(); // /
                            while let Some(b) = self.peek() {
                                if b == b'\n' {
                                    break;
                                }
                                self.advance();
                            }
                        }
                        Some(b'*') => {
                            // multi-line comment
                            let start = self.current_pos();
                            self.advance(); // /
                            self.advance(); // *
                            let mut closed = false;
                            while let Some(b) = self.advance() {
                                if b == b'*' && self.peek() == Some(b'/') {
                                    self.advance(); // /
                                    closed = true;
                                    break;
                                }
                            }
                            if !closed {
                                return Err(LexError {
                                    message: "unterminated block comment".into(),
                                    pos: start,
                                });
                            }
                        }
                        _ => break,
                    }
                }
                _ => break,
            }
        }
        Ok(())
    }
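
    // Known gaps vs. the spec in the skipper above: only `\n` counts as a
    // LineTerminator (U+2028/U+2029 are not recognized), and the Annex B
    // HTML-like comments (`<!--`, `-->`) are not skipped.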

    // ── Main dispatch ──────────────────────────────────────────

    /// Produce the next token.
    pub fn next_token(&mut self) -> Result<Token, LexError> {
        self.saw_newline = false;
        self.skip_whitespace_and_comments()?;

        let start = self.current_pos();

        let Some(b) = self.peek() else {
            return Ok(Token {
                kind: TokenKind::Eof,
                span: Span {
                    start,
                    end: self.current_pos(),
                },
                preceded_by_newline: self.saw_newline,
            });
        };

        // If we're inside a template `${...}` and hit the matching `}`,
        // resume template scanning.
        if b == b'}'
            && !self.template_brace_stack.is_empty()
            && self.brace_depth == *self.template_brace_stack.last().unwrap()
        {
            self.template_brace_stack.pop();
            self.template_depth -= 1;
            self.advance(); // consume }
            return self.scan_template_continuation(start);
        }

        let kind = match b {
            b'`' => {
                self.advance();
                return self.scan_template_start(start);
            }

            b'0'..=b'9' => self.scan_number()?,
            b'.' if matches!(self.peek_at(1), Some(b'0'..=b'9')) => self.scan_number()?,

            b'"' | b'\'' => self.scan_string()?,

            b'a'..=b'z' | b'A'..=b'Z' | b'_' | b'$' => self.scan_identifier_or_keyword(),
            // UTF-8 multi-byte identifier start
            0xC0..=0xF7 if is_unicode_id_start(self.source, self.pos) => {
                self.scan_identifier_or_keyword()
            }

            b'/' if !self.prev_token_is_expr_end => self.scan_regexp()?,

            _ => self.scan_punctuator()?,
        };

        let end = self.current_pos();
        let preceded_by_newline = self.saw_newline;

        // Track whether this token ends an expression (for `/` disambiguation).
        self.prev_token_is_expr_end = token_is_expr_end(&kind);

        Ok(Token {
            kind,
            span: Span { start, end },
            preceded_by_newline,
        })
    }
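
    // Worked example of the `}` resume check above, for `` `x${ {a:1} }y` ``:
    // TemplateHead("x") pushes brace_depth (0) onto the stack; the inner
    // `{`/`}` are ordinary LBrace/RBrace tokens that raise and lower
    // brace_depth again; the final `}` then matches the stack top, pops it,
    // and resumes template scanning, yielding TemplateTail("y").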

    // ── Numbers ────────────────────────────────────────────────

    fn scan_number(&mut self) -> Result<TokenKind, LexError> {
        let start = self.pos;

        if self.peek() == Some(b'0') {
            match self.peek_at(1) {
                Some(b'x' | b'X') => return self.scan_hex_number(),
                Some(b'o' | b'O') => return self.scan_octal_number(),
                Some(b'b' | b'B') => return self.scan_binary_number(),
                _ => {}
            }
        }

        // Decimal integer or float
        self.eat_decimal_digits();

        if self.peek() == Some(b'.') {
            // `1.` is a valid numeric literal (= 1.0) and `1.e2` is 100, so
            // per maximal munch the dot always belongs to the number;
            // `1..toString()` therefore lexes as `1.` `.` `toString`.
            self.advance(); // .
            self.eat_decimal_digits();
        }

        // Exponent
        if matches!(self.peek(), Some(b'e' | b'E')) {
            self.advance();
            if matches!(self.peek(), Some(b'+' | b'-')) {
                self.advance();
            }
            if !matches!(self.peek(), Some(b'0'..=b'9')) {
                return Err(LexError {
                    message: "expected digit after exponent".into(),
                    pos: self.current_pos(),
                });
            }
            self.eat_decimal_digits();
        }

        // BigInt suffix `n` — we tokenize it but store as f64 (for now)
        self.advance_if(b'n');

        let text = self.slice(start, self.pos);
        let value = parse_decimal(text);
        Ok(TokenKind::Number(value))
    }

    fn scan_hex_number(&mut self) -> Result<TokenKind, LexError> {
        self.advance(); // 0
        self.advance(); // x/X
        let digit_start = self.pos;
        self.eat_hex_digits();
        if self.pos == digit_start {
            return Err(LexError {
                message: "expected hex digit after 0x".into(),
                pos: self.current_pos(),
            });
        }
        self.advance_if(b'n');
        let text = self.slice(digit_start, self.pos);
        let text = text.trim_end_matches('n');
        let value = u64_from_hex(text) as f64;
        Ok(TokenKind::Number(value))
    }

    fn scan_octal_number(&mut self) -> Result<TokenKind, LexError> {
        self.advance(); // 0
        self.advance(); // o/O
        let digit_start = self.pos;
        while matches!(self.peek(), Some(b'0'..=b'7' | b'_')) {
            self.advance();
        }
        if self.pos == digit_start {
            return Err(LexError {
                message: "expected octal digit after 0o".into(),
                pos: self.current_pos(),
            });
        }
        self.advance_if(b'n');
        let text = self.slice(digit_start, self.pos).trim_end_matches('n');
        let value = u64_from_octal(text) as f64;
        Ok(TokenKind::Number(value))
    }

    fn scan_binary_number(&mut self) -> Result<TokenKind, LexError> {
        self.advance(); // 0
        self.advance(); // b/B
        let digit_start = self.pos;
        while matches!(self.peek(), Some(b'0' | b'1' | b'_')) {
            self.advance();
        }
        if self.pos == digit_start {
            return Err(LexError {
                message: "expected binary digit after 0b".into(),
                pos: self.current_pos(),
            });
        }
        self.advance_if(b'n');
        let text = self.slice(digit_start, self.pos).trim_end_matches('n');
        let value = u64_from_binary(text) as f64;
        Ok(TokenKind::Number(value))
    }

    fn eat_decimal_digits(&mut self) {
        while matches!(self.peek(), Some(b'0'..=b'9' | b'_')) {
            self.advance();
        }
    }

    fn eat_hex_digits(&mut self) {
        while matches!(
            self.peek(),
            Some(b'0'..=b'9' | b'a'..=b'f' | b'A'..=b'F' | b'_')
        ) {
            self.advance();
        }
    }

    // ── Strings ────────────────────────────────────────────────

    fn scan_string(&mut self) -> Result<TokenKind, LexError> {
        let quote = self.advance().unwrap(); // opening quote
        let start_pos = self.current_pos();
        let mut value = std::string::String::new();

        loop {
            match self.peek() {
                None | Some(b'\n') => {
                    return Err(LexError {
                        message: "unterminated string literal".into(),
                        pos: start_pos,
                    });
                }
                Some(b) if b == quote => {
                    self.advance();
                    break;
                }
                Some(b'\\') => {
                    self.advance(); // backslash
                    if let Some(ch) = self.scan_escape_sequence()? {
                        value.push(ch);
                    }
                }
                Some(_) => {
                    let ch = self.advance_char();
                    value.push(ch);
                }
            }
        }

        Ok(TokenKind::String(value))
    }
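
    // Note: in `scan_escape_sequence` below, legacy octal escapes
    // (`\1`..`\7`) are not decoded as octal; they fall through to the
    // identity-escape arm and yield the digit characters themselves.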
    /// Scan an escape sequence after the backslash has been consumed.
    /// Returns `None` for line continuations (`\<newline>`), which produce no character.
    fn scan_escape_sequence(&mut self) -> Result<Option<char>, LexError> {
        let pos = self.current_pos();
        match self.advance() {
            Some(b'n') => Ok(Some('\n')),
            Some(b'r') => Ok(Some('\r')),
            Some(b't') => Ok(Some('\t')),
            Some(b'b') => Ok(Some('\u{0008}')),
            Some(b'f') => Ok(Some('\u{000C}')),
            Some(b'v') => Ok(Some('\u{000B}')),
            Some(b'0') if !matches!(self.peek(), Some(b'0'..=b'9')) => Ok(Some('\0')),
            Some(b'\\') => Ok(Some('\\')),
            Some(b'\'') => Ok(Some('\'')),
            Some(b'"') => Ok(Some('"')),
            Some(b'`') => Ok(Some('`')),
            // Line continuation: \<newline> produces no character
            Some(b'\n') => Ok(None),
            Some(b'\r') => {
                self.advance_if(b'\n');
                Ok(None)
            }
            Some(b'x') => {
                let hi = self.advance().and_then(hex_digit_val).ok_or(LexError {
                    message: "invalid hex escape".into(),
                    pos,
                })?;
                let lo = self.advance().and_then(hex_digit_val).ok_or(LexError {
                    message: "invalid hex escape".into(),
                    pos,
                })?;
                let code = (hi << 4) | lo;
                Ok(Some(code as char))
            }
            Some(b'u') => self.scan_unicode_escape(pos).map(Some),
            Some(b) => {
                // identity escape
                Ok(Some(b as char))
            }
            None => Err(LexError {
                message: "unexpected end of input in escape sequence".into(),
                pos,
            }),
        }
    }

    fn scan_unicode_escape(&mut self, pos: SourcePos) -> Result<char, LexError> {
        if self.advance_if(b'{') {
            // \u{XXXXX}
            let mut code: u32 = 0;
            let mut count = 0;
            while let Some(b) = self.peek() {
                if b == b'}' {
                    break;
                }
                let d = hex_digit_val(self.advance().unwrap()).ok_or(LexError {
                    message: "invalid unicode escape".into(),
                    pos,
                })?;
                code = code * 16 + d as u32;
                count += 1;
                if code > 0x10FFFF {
                    return Err(LexError {
                        message: "unicode escape out of range".into(),
                        pos,
                    });
                }
            }
            if count == 0 || !self.advance_if(b'}') {
                return Err(LexError {
                    message: "invalid unicode escape".into(),
                    pos,
                });
            }
            char::from_u32(code).ok_or(LexError {
                message: "invalid unicode code point".into(),
                pos,
            })
        } else {
            // \uXXXX
            let mut code: u32 = 0;
            for _ in 0..4 {
                let d = self.advance().and_then(hex_digit_val).ok_or(LexError {
                    message: "invalid unicode escape".into(),
                    pos,
                })?;
                code = code * 16 + d as u32;
            }
            char::from_u32(code).ok_or(LexError {
                message: "invalid unicode code point".into(),
                pos,
            })
        }
    }
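
    // Note: `scan_unicode_escape` rejects surrogate code points via
    // `char::from_u32`, so a UTF-16 surrogate pair written as
    // `"\uD83D\uDE00"` is an error here; the `"\u{1F600}"` form works.
    // Full ES string semantics would also admit lone surrogates.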
    /// Advance one full UTF-8 character and return it.
    fn advance_char(&mut self) -> char {
        let start = self.pos;
        let b = self.advance().unwrap();
        if b < 0x80 {
            return b as char;
        }
        // multi-byte: determine length
        let len = if b >= 0xF0 {
            4
        } else if b >= 0xE0 {
            3
        } else {
            2
        };
        for _ in 1..len {
            self.advance();
        }
        let s = std::str::from_utf8(&self.source[start..self.pos]).unwrap_or("\u{FFFD}");
        s.chars().next().unwrap_or('\u{FFFD}')
    }

    // ── Template Literals ──────────────────────────────────────

    fn scan_template_start(&mut self, start: SourcePos) -> Result<Token, LexError> {
        let mut value = std::string::String::new();
        loop {
            match self.peek() {
                None => {
                    return Err(LexError {
                        message: "unterminated template literal".into(),
                        pos: start,
                    });
                }
                Some(b'`') => {
                    self.advance();
                    let end = self.current_pos();
                    let kind = TokenKind::TemplateFull(value);
                    self.prev_token_is_expr_end = true;
                    return Ok(Token {
                        kind,
                        span: Span { start, end },
                        preceded_by_newline: self.saw_newline,
                    });
                }
                Some(b'$') if self.peek_at(1) == Some(b'{') => {
                    self.advance(); // $
                    self.advance(); // {
                    self.template_depth += 1;
                    self.template_brace_stack.push(self.brace_depth);
                    let end = self.current_pos();
                    let kind = TokenKind::TemplateHead(value);
                    self.prev_token_is_expr_end = false;
                    return Ok(Token {
                        kind,
                        span: Span { start, end },
                        preceded_by_newline: self.saw_newline,
                    });
                }
                Some(b'\\') => {
                    self.advance();
                    if let Some(ch) = self.scan_escape_sequence()? {
                        value.push(ch);
                    }
                }
                Some(_) => {
                    let ch = self.advance_char();
                    value.push(ch);
                }
            }
        }
    }
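
    // Note: only the "cooked" value of each template chunk is kept; the raw
    // text (needed for `String.raw` and tagged templates) is not preserved.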
    fn scan_template_continuation(&mut self, start: SourcePos) -> Result<Token, LexError> {
        let mut value = std::string::String::new();
        loop {
            match self.peek() {
                None => {
                    return Err(LexError {
                        message: "unterminated template literal".into(),
                        pos: start,
                    });
                }
                Some(b'`') => {
                    self.advance();
                    let end = self.current_pos();
                    let kind = TokenKind::TemplateTail(value);
                    self.prev_token_is_expr_end = true;
                    return Ok(Token {
                        kind,
                        span: Span { start, end },
                        preceded_by_newline: self.saw_newline,
                    });
                }
                Some(b'$') if self.peek_at(1) == Some(b'{') => {
                    self.advance(); // $
                    self.advance(); // {
                    self.template_depth += 1;
                    self.template_brace_stack.push(self.brace_depth);
                    let end = self.current_pos();
                    let kind = TokenKind::TemplateMiddle(value);
                    self.prev_token_is_expr_end = false;
                    return Ok(Token {
                        kind,
                        span: Span { start, end },
                        preceded_by_newline: self.saw_newline,
                    });
                }
                Some(b'\\') => {
                    self.advance();
                    if let Some(ch) = self.scan_escape_sequence()? {
                        value.push(ch);
                    }
                }
                Some(_) => {
                    let ch = self.advance_char();
                    value.push(ch);
                }
            }
        }
    }

    // ── Identifiers & Keywords ─────────────────────────────────

    fn scan_identifier_or_keyword(&mut self) -> TokenKind {
        let start = self.pos;

        // Consume the first character (which we already validated)
        self.advance_char();

        // Consume continue characters
        while self.pos < self.source.len() {
            let b = self.source[self.pos];
            match b {
                b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' | b'_' | b'$' => {
                    self.advance();
                }
                0xC0..=0xF7 if is_unicode_id_continue(self.source, self.pos) => {
                    self.advance_char();
                }
                _ => break,
            }
        }

        let text = self.slice(start, self.pos);
        keyword_or_ident(text)
    }

    // ── Regular Expressions ────────────────────────────────────

    fn scan_regexp(&mut self) -> Result<TokenKind, LexError> {
        let start_pos = self.current_pos();
        self.advance(); // opening /

        let mut pattern = std::string::String::new();
        let mut in_class = false;

        loop {
            match self.peek() {
                None | Some(b'\n') => {
                    return Err(LexError {
                        message: "unterminated regexp literal".into(),
                        pos: start_pos,
                    });
                }
                Some(b'/') if !in_class => {
                    self.advance();
                    break;
                }
                Some(b'[') => {
                    in_class = true;
                    pattern.push('[');
                    self.advance();
                }
                Some(b']') if in_class => {
                    in_class = false;
                    pattern.push(']');
                    self.advance();
                }
                Some(b'\\') => {
                    self.advance();
                    pattern.push('\\');
                    if let Some(b2) = self.peek() {
                        if b2 != b'\n' {
                            pattern.push(b2 as char);
                            self.advance();
                        }
                    }
                }
                Some(b) => {
                    pattern.push(b as char);
                    self.advance();
                }
            }
        }

        // Flags
        let mut flags = std::string::String::new();
        while matches!(
            self.peek(),
            Some(b'g' | b'i' | b'm' | b's' | b'u' | b'v' | b'y' | b'd')
        ) {
            flags.push(self.advance().unwrap() as char);
        }

        Ok(TokenKind::RegExp { pattern, flags })
    }
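
    // Caveat: `prev_token_is_expr_end` is a heuristic. Because `}` counts as
    // an expression end (object literals), a regexp at the start of a
    // statement after a block — `{} /re/g` — lexes as division. Resolving
    // that case fully requires parser feedback.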
    // ── Punctuators ────────────────────────────────────────────

    fn scan_punctuator(&mut self) -> Result<TokenKind, LexError> {
        let pos = self.current_pos();
        let b = self.advance().unwrap();

        let kind = match b {
            b'(' => TokenKind::LParen,
            b')' => TokenKind::RParen,
            b'[' => TokenKind::LBracket,
            b']' => TokenKind::RBracket,
            b'{' => {
                self.brace_depth += 1;
                TokenKind::LBrace
            }
            b'}' => {
                self.brace_depth = self.brace_depth.saturating_sub(1);
                TokenKind::RBrace
            }
            b';' => TokenKind::Semicolon,
            b',' => TokenKind::Comma,
            b':' => TokenKind::Colon,
            b'~' => TokenKind::Tilde,

            b'.' => {
                if self.peek() == Some(b'.') && self.peek_at(1) == Some(b'.') {
                    self.advance();
                    self.advance();
                    TokenKind::Ellipsis
                } else {
                    TokenKind::Dot
                }
            }

            b'?' => {
                if self.advance_if(b'?') {
                    if self.advance_if(b'=') {
                        TokenKind::NullishAssign
                    } else {
                        TokenKind::Nullish
                    }
                } else if self.peek() == Some(b'.') && !matches!(self.peek_at(1), Some(b'0'..=b'9'))
                {
                    self.advance();
                    TokenKind::QuestionDot
                } else {
                    TokenKind::Question
                }
            }

            b'+' => {
                if self.advance_if(b'+') {
                    TokenKind::PlusPlus
                } else if self.advance_if(b'=') {
                    TokenKind::PlusAssign
                } else {
                    TokenKind::Plus
                }
            }

            b'-' => {
                if self.advance_if(b'-') {
                    TokenKind::MinusMinus
                } else if self.advance_if(b'=') {
                    TokenKind::MinusAssign
                } else {
                    TokenKind::Minus
                }
            }

            b'*' => {
                if self.advance_if(b'*') {
                    if self.advance_if(b'=') {
                        TokenKind::ExpAssign
                    } else {
                        TokenKind::Exp
                    }
                } else if self.advance_if(b'=') {
                    TokenKind::StarAssign
                } else {
                    TokenKind::Star
                }
            }

            b'/' => {
                // We only get here for division (regexp was handled earlier)
                if self.advance_if(b'=') {
                    TokenKind::SlashAssign
                } else {
                    TokenKind::Slash
                }
            }

            b'%' => {
                if self.advance_if(b'=') {
                    TokenKind::PercentAssign
                } else {
                    TokenKind::Percent
                }
            }

            b'=' => {
                if self.advance_if(b'=') {
                    if self.advance_if(b'=') {
                        TokenKind::StrictEq
                    } else {
                        TokenKind::Eq
                    }
                } else if self.advance_if(b'>') {
                    TokenKind::Arrow
                } else {
                    TokenKind::Assign
                }
            }

            b'!' => {
                if self.advance_if(b'=') {
                    if self.advance_if(b'=') {
                        TokenKind::StrictNe
                    } else {
                        TokenKind::Ne
                    }
                } else {
                    TokenKind::Not
                }
            }

            b'<' => {
                if self.advance_if(b'<') {
                    if self.advance_if(b'=') {
                        TokenKind::ShlAssign
                    } else {
                        TokenKind::Shl
                    }
                } else if self.advance_if(b'=') {
                    TokenKind::Le
                } else {
                    TokenKind::Lt
                }
            }

            b'>' => {
                if self.advance_if(b'>') {
                    if self.advance_if(b'>') {
                        if self.advance_if(b'=') {
                            TokenKind::UshrAssign
                        } else {
                            TokenKind::Ushr
                        }
                    } else if self.advance_if(b'=') {
                        TokenKind::ShrAssign
                    } else {
                        TokenKind::Shr
                    }
                } else if self.advance_if(b'=') {
                    TokenKind::Ge
                } else {
                    TokenKind::Gt
                }
            }

            b'&' => {
                if self.advance_if(b'&') {
                    if self.advance_if(b'=') {
                        TokenKind::AndAssign
                    } else {
                        TokenKind::And
                    }
                } else if self.advance_if(b'=') {
                    TokenKind::AmpAssign
                } else {
                    TokenKind::Amp
                }
            }

            b'|' => {
                if self.advance_if(b'|') {
                    if self.advance_if(b'=') {
                        TokenKind::OrAssign
                    } else {
                        TokenKind::Or
                    }
                } else if self.advance_if(b'=') {
                    TokenKind::PipeAssign
                } else {
                    TokenKind::Pipe
                }
            }

            b'^' => {
                if self.advance_if(b'=') {
                    TokenKind::CaretAssign
                } else {
                    TokenKind::Caret
                }
            }

            _ => {
                return Err(LexError {
                    message: format!("unexpected character: {:?}", b as char),
                    pos,
                });
            }
        };

        Ok(kind)
    }
}
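
// Maximal munch in action: `a+++b` lexes as `a`, `++`, `+`, `b`, and `>>>=`
// is consumed as one four-character token before any shorter `>`-operator
// is considered. The digit guard in the `?` arm keeps `x?.5:y` lexing as
// `?` `.5` `:` (ternary) rather than optional chaining.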
"class" => TokenKind::Class, 1211 "const" => TokenKind::Const, 1212 "continue" => TokenKind::Continue, 1213 "debugger" => TokenKind::Debugger, 1214 "default" => TokenKind::Default, 1215 "delete" => TokenKind::Delete, 1216 "do" => TokenKind::Do, 1217 "else" => TokenKind::Else, 1218 "export" => TokenKind::Export, 1219 "extends" => TokenKind::Extends, 1220 "finally" => TokenKind::Finally, 1221 "for" => TokenKind::For, 1222 "function" => TokenKind::Function, 1223 "if" => TokenKind::If, 1224 "import" => TokenKind::Import, 1225 "in" => TokenKind::In, 1226 "instanceof" => TokenKind::Instanceof, 1227 "let" => TokenKind::Let, 1228 "new" => TokenKind::New, 1229 "of" => TokenKind::Of, 1230 "return" => TokenKind::Return, 1231 "static" => TokenKind::Static, 1232 "super" => TokenKind::Super, 1233 "switch" => TokenKind::Switch, 1234 "this" => TokenKind::This, 1235 "throw" => TokenKind::Throw, 1236 "try" => TokenKind::Try, 1237 "typeof" => TokenKind::Typeof, 1238 "var" => TokenKind::Var, 1239 "void" => TokenKind::Void, 1240 "while" => TokenKind::While, 1241 "with" => TokenKind::With, 1242 "yield" => TokenKind::Yield, 1243 "async" => TokenKind::Async, 1244 "true" => TokenKind::True, 1245 "false" => TokenKind::False, 1246 "null" => TokenKind::Null, 1247 _ => TokenKind::Identifier(s.to_owned()), 1248 } 1249} 1250 1251// ── Expression-end tracking ────────────────────────────────── 1252 1253/// Returns `true` if a token of this kind could end an expression. 1254/// Used to decide whether a following `/` is division or a RegExp literal. 1255fn token_is_expr_end(kind: &TokenKind) -> bool { 1256 matches!( 1257 kind, 1258 TokenKind::Identifier(_) 1259 | TokenKind::Number(_) 1260 | TokenKind::String(_) 1261 | TokenKind::TemplateFull(_) 1262 | TokenKind::TemplateTail(_) 1263 | TokenKind::True 1264 | TokenKind::False 1265 | TokenKind::Null 1266 | TokenKind::This 1267 | TokenKind::Super 1268 | TokenKind::RParen 1269 | TokenKind::RBracket 1270 | TokenKind::RBrace 1271 | TokenKind::PlusPlus 1272 | TokenKind::MinusMinus 1273 | TokenKind::RegExp { .. } 1274 ) 1275} 1276 1277// ── Unicode helpers ────────────────────────────────────────── 1278 1279/// Check if the byte sequence at `pos` starts a valid Unicode identifier start character. 1280fn is_unicode_id_start(source: &[u8], pos: usize) -> bool { 1281 let s = std::str::from_utf8(&source[pos..]).unwrap_or(""); 1282 if let Some(ch) = s.chars().next() { 1283 ch.is_alphabetic() || ch == '_' || ch == '$' 1284 } else { 1285 false 1286 } 1287} 1288 1289/// Check if the byte sequence at `pos` starts a valid Unicode identifier continue character. 
/// Check if the byte sequence at `pos` starts a valid Unicode identifier continue character.
fn is_unicode_id_continue(source: &[u8], pos: usize) -> bool {
    let s = std::str::from_utf8(&source[pos..]).unwrap_or("");
    if let Some(ch) = s.chars().next() {
        ch.is_alphanumeric() || ch == '_' || ch == '$' || ch == '\u{200C}' || ch == '\u{200D}'
    } else {
        false
    }
}

// ── Numeric parsing helpers ──────────────────────────────────

fn hex_digit_val(b: u8) -> Option<u8> {
    match b {
        b'0'..=b'9' => Some(b - b'0'),
        b'a'..=b'f' => Some(b - b'a' + 10),
        b'A'..=b'F' => Some(b - b'A' + 10),
        _ => None,
    }
}

fn parse_decimal(s: &str) -> f64 {
    let s = s.replace('_', "");
    let s = s.trim_end_matches('n');
    // With separators and the BigInt suffix stripped, the rest is a plain
    // Rust float literal; fall back to 0.0 if it somehow is not.
    s.parse::<f64>().unwrap_or(0.0)
}

fn u64_from_hex(s: &str) -> u64 {
    let mut result: u64 = 0;
    for b in s.bytes() {
        if b == b'_' {
            continue;
        }
        let d = hex_digit_val(b).unwrap_or(0) as u64;
        result = result.wrapping_mul(16).wrapping_add(d);
    }
    result
}

fn u64_from_octal(s: &str) -> u64 {
    let mut result: u64 = 0;
    for b in s.bytes() {
        if b == b'_' {
            continue;
        }
        let d = (b - b'0') as u64;
        result = result.wrapping_mul(8).wrapping_add(d);
    }
    result
}

fn u64_from_binary(s: &str) -> u64 {
    let mut result: u64 = 0;
    for b in s.bytes() {
        if b == b'_' {
            continue;
        }
        let d = (b - b'0') as u64;
        result = result.wrapping_mul(2).wrapping_add(d);
    }
    result
}
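
// Note: values above `u64::MAX` wrap (`wrapping_mul`) instead of widening
// into `f64`, and anything above 2^53 loses precision in the `as f64` cast —
// acceptable for this experiment, but not spec-exact for huge literals.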

// ── Error type ───────────────────────────────────────────────

/// An error produced during lexing.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct LexError {
    pub message: std::string::String,
    pub pos: SourcePos,
}

impl fmt::Display for LexError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(
            f,
            "LexError at {}:{}: {}",
            self.pos.line, self.pos.col, self.message
        )
    }
}

// ── Tests ────────────────────────────────────────────────────

#[cfg(test)]
mod tests {
    use super::*;

    fn kinds(src: &str) -> Vec<TokenKind> {
        Lexer::tokenize(src)
            .unwrap()
            .into_iter()
            .map(|t| t.kind)
            .collect()
    }

    fn kind(src: &str) -> TokenKind {
        let tokens = Lexer::tokenize(src).unwrap();
        assert!(tokens.len() >= 2, "expected at least one token + Eof");
        tokens[0].kind.clone()
    }

    // ── Keywords ───────────────────────────────────────────

    #[test]
    fn test_keywords() {
        assert_eq!(kind("var"), TokenKind::Var);
        assert_eq!(kind("let"), TokenKind::Let);
        assert_eq!(kind("const"), TokenKind::Const);
        assert_eq!(kind("function"), TokenKind::Function);
        assert_eq!(kind("class"), TokenKind::Class);
        assert_eq!(kind("if"), TokenKind::If);
        assert_eq!(kind("else"), TokenKind::Else);
        assert_eq!(kind("for"), TokenKind::For);
        assert_eq!(kind("while"), TokenKind::While);
        assert_eq!(kind("do"), TokenKind::Do);
        assert_eq!(kind("switch"), TokenKind::Switch);
        assert_eq!(kind("case"), TokenKind::Case);
        assert_eq!(kind("break"), TokenKind::Break);
        assert_eq!(kind("continue"), TokenKind::Continue);
        assert_eq!(kind("return"), TokenKind::Return);
        assert_eq!(kind("throw"), TokenKind::Throw);
        assert_eq!(kind("try"), TokenKind::Try);
        assert_eq!(kind("catch"), TokenKind::Catch);
        assert_eq!(kind("finally"), TokenKind::Finally);
        assert_eq!(kind("new"), TokenKind::New);
        assert_eq!(kind("delete"), TokenKind::Delete);
        assert_eq!(kind("typeof"), TokenKind::Typeof);
        assert_eq!(kind("instanceof"), TokenKind::Instanceof);
        assert_eq!(kind("void"), TokenKind::Void);
        assert_eq!(kind("in"), TokenKind::In);
        assert_eq!(kind("of"), TokenKind::Of);
        assert_eq!(kind("import"), TokenKind::Import);
        assert_eq!(kind("export"), TokenKind::Export);
        assert_eq!(kind("default"), TokenKind::Default);
        assert_eq!(kind("async"), TokenKind::Async);
        assert_eq!(kind("await"), TokenKind::Await);
        assert_eq!(kind("yield"), TokenKind::Yield);
        assert_eq!(kind("this"), TokenKind::This);
        assert_eq!(kind("super"), TokenKind::Super);
        assert_eq!(kind("extends"), TokenKind::Extends);
        assert_eq!(kind("static"), TokenKind::Static);
        assert_eq!(kind("debugger"), TokenKind::Debugger);
        assert_eq!(kind("with"), TokenKind::With);
    }

    #[test]
    fn test_literal_keywords() {
        assert_eq!(kind("true"), TokenKind::True);
        assert_eq!(kind("false"), TokenKind::False);
        assert_eq!(kind("null"), TokenKind::Null);
    }

    // ── Identifiers ────────────────────────────────────────

    #[test]
    fn test_identifiers() {
        assert_eq!(kind("foo"), TokenKind::Identifier("foo".into()));
        assert_eq!(kind("_bar"), TokenKind::Identifier("_bar".into()));
        assert_eq!(kind("$baz"), TokenKind::Identifier("$baz".into()));
        assert_eq!(kind("abc123"), TokenKind::Identifier("abc123".into()));
        assert_eq!(kind("camelCase"), TokenKind::Identifier("camelCase".into()));
    }

    #[test]
    fn test_unicode_identifiers() {
        assert_eq!(kind("café"), TokenKind::Identifier("café".into()));
    }

    // ── Numbers ────────────────────────────────────────────

    #[test]
    fn test_integers() {
        assert_eq!(kind("0"), TokenKind::Number(0.0));
        assert_eq!(kind("42"), TokenKind::Number(42.0));
        assert_eq!(kind("123456"), TokenKind::Number(123456.0));
    }

    #[test]
    fn test_floats() {
        assert_eq!(kind("3.14"), TokenKind::Number(3.14));
        assert_eq!(kind("0.5"), TokenKind::Number(0.5));
        assert_eq!(kind(".5"), TokenKind::Number(0.5));
        assert_eq!(kind("1."), TokenKind::Number(1.0));
    }

    #[test]
    fn test_exponents() {
        assert_eq!(kind("1e2"), TokenKind::Number(100.0));
        assert_eq!(kind("1E2"), TokenKind::Number(100.0));
        assert_eq!(kind("1e+2"), TokenKind::Number(100.0));
        assert_eq!(kind("1e-2"), TokenKind::Number(0.01));
        assert_eq!(kind("2.5e3"), TokenKind::Number(2500.0));
    }

    #[test]
    fn test_hex() {
        assert_eq!(kind("0xFF"), TokenKind::Number(255.0));
        assert_eq!(kind("0x0"), TokenKind::Number(0.0));
        assert_eq!(kind("0xDEAD"), TokenKind::Number(0xDEAD as f64));
    }

    #[test]
    fn test_octal() {
        assert_eq!(kind("0o77"), TokenKind::Number(63.0));
        assert_eq!(kind("0O10"), TokenKind::Number(8.0));
    }

    #[test]
    fn test_binary() {
        assert_eq!(kind("0b1010"), TokenKind::Number(10.0));
        assert_eq!(kind("0B11"), TokenKind::Number(3.0));
    }

    #[test]
    fn test_numeric_separators() {
        assert_eq!(kind("1_000"), TokenKind::Number(1000.0));
        assert_eq!(kind("0xFF_FF"), TokenKind::Number(65535.0));
        assert_eq!(kind("0b1010_0101"), TokenKind::Number(165.0));
    }
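
    #[test]
    fn test_bigint_suffix() {
        // Illustrative addition: the `n` BigInt suffix is consumed, and the
        // value is (for now) stored as f64, matching `scan_number`'s comment.
        assert_eq!(kind("10n"), TokenKind::Number(10.0));
        assert_eq!(kind("0xFFn"), TokenKind::Number(255.0));
    }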

    // ── Strings ────────────────────────────────────────────

    #[test]
    fn test_double_quoted_string() {
        assert_eq!(kind(r#""hello""#), TokenKind::String("hello".into()));
    }

    #[test]
    fn test_single_quoted_string() {
        assert_eq!(kind("'world'"), TokenKind::String("world".into()));
    }

    #[test]
    fn test_string_escapes() {
        assert_eq!(kind(r#""\n\t\r""#), TokenKind::String("\n\t\r".into()));
        assert_eq!(kind(r#""\\""#), TokenKind::String("\\".into()));
        assert_eq!(kind(r#""\"""#), TokenKind::String("\"".into()));
    }

    #[test]
    fn test_string_hex_escape() {
        assert_eq!(kind(r#""\x41""#), TokenKind::String("A".into()));
    }

    #[test]
    fn test_string_unicode_escape() {
        assert_eq!(kind(r#""\u0041""#), TokenKind::String("A".into()));
        assert_eq!(
            kind(r#""\u{1F600}""#),
            TokenKind::String("\u{1F600}".into())
        );
    }

    #[test]
    fn test_string_line_continuation() {
        // \<newline> is a line continuation producing no character
        assert_eq!(
            kind("\"line1\\\nline2\""),
            TokenKind::String("line1line2".into())
        );
    }

    #[test]
    fn test_empty_string() {
        assert_eq!(kind(r#""""#), TokenKind::String("".into()));
        assert_eq!(kind("''"), TokenKind::String("".into()));
    }

    // ── Template Literals ──────────────────────────────────

    #[test]
    fn test_template_no_substitution() {
        assert_eq!(kind("`hello`"), TokenKind::TemplateFull("hello".into()));
    }

    #[test]
    fn test_template_with_substitution() {
        let tokens = Lexer::tokenize("`hello ${name}!`").unwrap();
        let k: Vec<_> = tokens.iter().map(|t| &t.kind).collect();
        assert_eq!(k[0], &TokenKind::TemplateHead("hello ".into()));
        assert_eq!(k[1], &TokenKind::Identifier("name".into()));
        assert_eq!(k[2], &TokenKind::TemplateTail("!".into()));
    }

    #[test]
    fn test_template_multiple_substitutions() {
        let tokens = Lexer::tokenize("`a${1}b${2}c`").unwrap();
        let k: Vec<_> = tokens.iter().map(|t| &t.kind).collect();
        assert_eq!(k[0], &TokenKind::TemplateHead("a".into()));
        assert_eq!(k[1], &TokenKind::Number(1.0));
        assert_eq!(k[2], &TokenKind::TemplateMiddle("b".into()));
        assert_eq!(k[3], &TokenKind::Number(2.0));
        assert_eq!(k[4], &TokenKind::TemplateTail("c".into()));
    }

    #[test]
    fn test_template_with_nested_braces() {
        // `${({a:1})}` — the object literal inside ${ } has its own braces
        let tokens = Lexer::tokenize("`${({a:1})}`").unwrap();
        let k: Vec<_> = tokens.iter().map(|t| &t.kind).collect();
        assert_eq!(k[0], &TokenKind::TemplateHead("".into()));
        assert_eq!(k[1], &TokenKind::LParen);
        assert_eq!(k[2], &TokenKind::LBrace);
        assert_eq!(k[3], &TokenKind::Identifier("a".into()));
        assert_eq!(k[4], &TokenKind::Colon);
        assert_eq!(k[5], &TokenKind::Number(1.0));
        assert_eq!(k[6], &TokenKind::RBrace);
        assert_eq!(k[7], &TokenKind::RParen);
        assert_eq!(k[8], &TokenKind::TemplateTail("".into()));
    }
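
    #[test]
    fn test_nested_template() {
        // Illustrative addition: a template literal nested inside another
        // template's substitution.
        let tokens = Lexer::tokenize("`a${`b`}c`").unwrap();
        let k: Vec<_> = tokens.iter().map(|t| &t.kind).collect();
        assert_eq!(k[0], &TokenKind::TemplateHead("a".into()));
        assert_eq!(k[1], &TokenKind::TemplateFull("b".into()));
        assert_eq!(k[2], &TokenKind::TemplateTail("c".into()));
    }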

    // ── Regular Expressions ────────────────────────────────

    #[test]
    fn test_regexp_basic() {
        let tokens = Lexer::tokenize("x = /foo/gi").unwrap();
        let k: Vec<_> = tokens.iter().map(|t| &t.kind).collect();
        assert_eq!(
            k[2],
            &TokenKind::RegExp {
                pattern: "foo".into(),
                flags: "gi".into()
            }
        );
    }

    #[test]
    fn test_regexp_with_class() {
        // /[a/b]/ — the `/` inside the character class does not end the literal
        let tokens = Lexer::tokenize("x = /[a/b]/").unwrap();
        let k: Vec<_> = tokens.iter().map(|t| &t.kind).collect();
        assert_eq!(
            k[2],
            &TokenKind::RegExp {
                pattern: "[a/b]".into(),
                flags: "".into()
            }
        );
    }

    #[test]
    fn test_regexp_vs_division() {
        // After an identifier, `/` is division
        let tokens = Lexer::tokenize("a / b").unwrap();
        let k: Vec<_> = tokens.iter().map(|t| &t.kind).collect();
        assert_eq!(k[1], &TokenKind::Slash);
    }

    // ── Punctuators ────────────────────────────────────────

    #[test]
    fn test_simple_punctuators() {
        assert_eq!(kind("("), TokenKind::LParen);
        assert_eq!(kind(")"), TokenKind::RParen);
        assert_eq!(kind("["), TokenKind::LBracket);
        assert_eq!(kind("]"), TokenKind::RBracket);
        assert_eq!(kind("{"), TokenKind::LBrace);
        assert_eq!(kind("}"), TokenKind::RBrace);
        assert_eq!(kind(";"), TokenKind::Semicolon);
        assert_eq!(kind(","), TokenKind::Comma);
        assert_eq!(kind(":"), TokenKind::Colon);
        assert_eq!(kind("~"), TokenKind::Tilde);
    }

    #[test]
    fn test_dot_and_ellipsis() {
        assert_eq!(kind("."), TokenKind::Dot);
        assert_eq!(kind("..."), TokenKind::Ellipsis);
    }

    #[test]
    fn test_arrow() {
        assert_eq!(kind("=>"), TokenKind::Arrow);
    }

    #[test]
    fn test_optional_chaining() {
        assert_eq!(kind("?."), TokenKind::QuestionDot);
    }

    #[test]
    fn test_comparison_operators() {
        assert_eq!(kind("=="), TokenKind::Eq);
        assert_eq!(kind("!="), TokenKind::Ne);
        assert_eq!(kind("==="), TokenKind::StrictEq);
        assert_eq!(kind("!=="), TokenKind::StrictNe);
        assert_eq!(kind("<"), TokenKind::Lt);
        assert_eq!(kind(">"), TokenKind::Gt);
        assert_eq!(kind("<="), TokenKind::Le);
        assert_eq!(kind(">="), TokenKind::Ge);
    }

    #[test]
    fn test_arithmetic_operators() {
        assert_eq!(kind("+"), TokenKind::Plus);
        assert_eq!(kind("-"), TokenKind::Minus);
        assert_eq!(kind("*"), TokenKind::Star);
        assert_eq!(kind("%"), TokenKind::Percent);
        assert_eq!(kind("**"), TokenKind::Exp);
        assert_eq!(kind("++"), TokenKind::PlusPlus);
        assert_eq!(kind("--"), TokenKind::MinusMinus);
    }

    #[test]
    fn test_bitwise_operators() {
        assert_eq!(kind("&"), TokenKind::Amp);
        assert_eq!(kind("|"), TokenKind::Pipe);
        assert_eq!(kind("^"), TokenKind::Caret);
        assert_eq!(kind("<<"), TokenKind::Shl);
        assert_eq!(kind(">>"), TokenKind::Shr);
        assert_eq!(kind(">>>"), TokenKind::Ushr);
    }

    #[test]
    fn test_logical_operators() {
        assert_eq!(kind("&&"), TokenKind::And);
        assert_eq!(kind("||"), TokenKind::Or);
        assert_eq!(kind("!"), TokenKind::Not);
        assert_eq!(kind("??"), TokenKind::Nullish);
    }
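
    #[test]
    fn test_question_dot_digit_guard() {
        // Illustrative addition: `x?.5:y` must lex as `?` `.5` `:` (ternary),
        // not optional chaining — see the digit guard in `scan_punctuator`.
        let tokens = kinds("x?.5:y");
        assert_eq!(tokens[1], TokenKind::Question);
        assert_eq!(tokens[2], TokenKind::Number(0.5));
        assert_eq!(tokens[3], TokenKind::Colon);
    }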

    #[test]
    fn test_assignment_operators() {
        assert_eq!(kind("="), TokenKind::Assign);
        assert_eq!(kind("+="), TokenKind::PlusAssign);
        assert_eq!(kind("-="), TokenKind::MinusAssign);
        assert_eq!(kind("*="), TokenKind::StarAssign);
        assert_eq!(kind("%="), TokenKind::PercentAssign);
        assert_eq!(kind("**="), TokenKind::ExpAssign);
        assert_eq!(kind("&="), TokenKind::AmpAssign);
        assert_eq!(kind("|="), TokenKind::PipeAssign);
        assert_eq!(kind("^="), TokenKind::CaretAssign);
        assert_eq!(kind("<<="), TokenKind::ShlAssign);
        assert_eq!(kind(">>="), TokenKind::ShrAssign);
        assert_eq!(kind(">>>="), TokenKind::UshrAssign);
        assert_eq!(kind("&&="), TokenKind::AndAssign);
        assert_eq!(kind("||="), TokenKind::OrAssign);
        assert_eq!(kind("??="), TokenKind::NullishAssign);
    }

    // ── Comments ───────────────────────────────────────────

    #[test]
    fn test_single_line_comment() {
        let tokens = kinds("a // comment\nb");
        assert_eq!(tokens.len(), 3); // a, b, Eof
        assert_eq!(tokens[0], TokenKind::Identifier("a".into()));
        assert_eq!(tokens[1], TokenKind::Identifier("b".into()));
    }

    #[test]
    fn test_multi_line_comment() {
        let tokens = kinds("a /* comment */ b");
        assert_eq!(tokens.len(), 3);
        assert_eq!(tokens[0], TokenKind::Identifier("a".into()));
        assert_eq!(tokens[1], TokenKind::Identifier("b".into()));
    }

    // ── Source positions ───────────────────────────────────

    #[test]
    fn test_source_positions() {
        let tokens = Lexer::tokenize("let x = 42").unwrap();
        // `let` at line 1, col 1
        assert_eq!(tokens[0].span.start, SourcePos { line: 1, col: 1 });
        // `x` at line 1, col 5
        assert_eq!(tokens[1].span.start, SourcePos { line: 1, col: 5 });
        // `=` at line 1, col 7
        assert_eq!(tokens[2].span.start, SourcePos { line: 1, col: 7 });
        // `42` at line 1, col 9
        assert_eq!(tokens[3].span.start, SourcePos { line: 1, col: 9 });
    }

    #[test]
    fn test_multiline_positions() {
        let tokens = Lexer::tokenize("a\nb\nc").unwrap();
        assert_eq!(tokens[0].span.start, SourcePos { line: 1, col: 1 });
        assert_eq!(tokens[1].span.start, SourcePos { line: 2, col: 1 });
        assert_eq!(tokens[2].span.start, SourcePos { line: 3, col: 1 });
    }

    // ── Newline tracking (ASI) ─────────────────────────────

    #[test]
    fn test_preceded_by_newline() {
        let tokens = Lexer::tokenize("a\nb").unwrap();
        assert!(!tokens[0].preceded_by_newline); // `a`
        assert!(tokens[1].preceded_by_newline); // `b`
    }
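
    #[test]
    fn test_newline_after_return() {
        // Illustrative addition: a parser applying ASI needs to see the
        // newline between `return` and its would-be operand.
        let tokens = Lexer::tokenize("return\n1").unwrap();
        assert_eq!(tokens[0].kind, TokenKind::Return);
        assert!(tokens[1].preceded_by_newline);
    }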

    // ── Error cases ────────────────────────────────────────

    #[test]
    fn test_unterminated_string() {
        assert!(Lexer::tokenize("\"hello").is_err());
    }

    #[test]
    fn test_unterminated_block_comment() {
        assert!(Lexer::tokenize("/* oops").is_err());
    }

    #[test]
    fn test_unterminated_template() {
        assert!(Lexer::tokenize("`hello").is_err());
    }

    #[test]
    fn test_bad_hex_literal() {
        assert!(Lexer::tokenize("0x").is_err());
    }

    // ── Full statement tokenization ────────────────────────

    #[test]
    fn test_full_statement() {
        let tokens = kinds("const x = 42 + y;");
        assert_eq!(
            tokens,
            vec![
                TokenKind::Const,
                TokenKind::Identifier("x".into()),
                TokenKind::Assign,
                TokenKind::Number(42.0),
                TokenKind::Plus,
                TokenKind::Identifier("y".into()),
                TokenKind::Semicolon,
                TokenKind::Eof,
            ]
        );
    }

    #[test]
    fn test_arrow_function() {
        let tokens = kinds("(x) => x + 1");
        assert_eq!(
            tokens,
            vec![
                TokenKind::LParen,
                TokenKind::Identifier("x".into()),
                TokenKind::RParen,
                TokenKind::Arrow,
                TokenKind::Identifier("x".into()),
                TokenKind::Plus,
                TokenKind::Number(1.0),
                TokenKind::Eof,
            ]
        );
    }

    #[test]
    fn test_complex_expression() {
        let tokens = kinds("a?.b ?? c !== d");
        assert_eq!(
            tokens,
            vec![
                TokenKind::Identifier("a".into()),
                TokenKind::QuestionDot,
                TokenKind::Identifier("b".into()),
                TokenKind::Nullish,
                TokenKind::Identifier("c".into()),
                TokenKind::StrictNe,
                TokenKind::Identifier("d".into()),
                TokenKind::Eof,
            ]
        );
    }

    #[test]
    fn test_division_after_paren() {
        // `(a) / b` — the `/` after `)` should be division, not regexp
        let tokens = kinds("(a) / b");
        assert_eq!(
            tokens,
            vec![
                TokenKind::LParen,
                TokenKind::Identifier("a".into()),
                TokenKind::RParen,
                TokenKind::Slash,
                TokenKind::Identifier("b".into()),
                TokenKind::Eof,
            ]
        );
    }

    #[test]
    fn test_slash_assign() {
        let tokens = kinds("a /= b");
        assert_eq!(
            tokens,
            vec![
                TokenKind::Identifier("a".into()),
                TokenKind::SlashAssign,
                TokenKind::Identifier("b".into()),
                TokenKind::Eof,
            ]
        );
    }

    #[test]
    fn test_regexp_after_assign() {
        let tokens = kinds("x = /test/g");
        assert_eq!(
            tokens,
            vec![
                TokenKind::Identifier("x".into()),
                TokenKind::Assign,
                TokenKind::RegExp {
                    pattern: "test".into(),
                    flags: "g".into()
                },
                TokenKind::Eof,
            ]
        );
    }
}
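
// Illustrative addition: `LexError` reports the position where scanning of
// the offending token began (here, just after the opening quote).
#[cfg(test)]
mod error_position_examples {
    use super::*;

    #[test]
    fn unterminated_string_reports_start() {
        let err = Lexer::tokenize("  \"abc").unwrap_err();
        assert_eq!(err.pos, SourcePos { line: 1, col: 4 });
    }
}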