Serenity Operating System
at master 810 lines 23 kB view raw
1/* 2 * Copyright (c) 2018-2020, Andreas Kling <kling@serenityos.org> 3 * 4 * SPDX-License-Identifier: BSD-2-Clause 5 */ 6 7#include "Lexer.h" 8#include <AK/CharacterTypes.h> 9#include <AK/DeprecatedString.h> 10#include <AK/Function.h> 11#include <AK/HashTable.h> 12#include <AK/StdLibExtras.h> 13 14namespace Cpp { 15 16Lexer::Lexer(StringView input, size_t start_line) 17 : m_input(input) 18 , m_previous_position { start_line, 0 } 19 , m_position { start_line, 0 } 20{ 21} 22 23char Lexer::peek(size_t offset) const 24{ 25 if ((m_index + offset) >= m_input.length()) 26 return 0; 27 return m_input[m_index + offset]; 28} 29 30char Lexer::consume() 31{ 32 VERIFY(m_index < m_input.length()); 33 char ch = m_input[m_index++]; 34 m_previous_position = m_position; 35 if (ch == '\n') { 36 m_position.line++; 37 m_position.column = 0; 38 } else { 39 m_position.column++; 40 } 41 return ch; 42} 43 44constexpr bool is_valid_first_character_of_identifier(char ch) 45{ 46 return is_ascii_alpha(ch) || ch == '_' || ch == '$'; 47} 48 49constexpr bool is_valid_nonfirst_character_of_identifier(char ch) 50{ 51 return is_valid_first_character_of_identifier(ch) || is_ascii_digit(ch); 52} 53 54constexpr StringView s_known_keywords[] = { 55 "alignas"sv, 56 "alignof"sv, 57 "and"sv, 58 "and_eq"sv, 59 "asm"sv, 60 "bitand"sv, 61 "bitor"sv, 62 "break"sv, 63 "case"sv, 64 "catch"sv, 65 "class"sv, 66 "compl"sv, 67 "const"sv, 68 "const_cast"sv, 69 "constexpr"sv, 70 "continue"sv, 71 "decltype"sv, 72 "default"sv, 73 "delete"sv, 74 "do"sv, 75 "dynamic_cast"sv, 76 "else"sv, 77 "enum"sv, 78 "explicit"sv, 79 "export"sv, 80 "extern"sv, 81 "false"sv, 82 "final"sv, 83 "for"sv, 84 "friend"sv, 85 "goto"sv, 86 "if"sv, 87 "inline"sv, 88 "mutable"sv, 89 "namespace"sv, 90 "new"sv, 91 "noexcept"sv, 92 "not"sv, 93 "not_eq"sv, 94 "nullptr"sv, 95 "operator"sv, 96 "or"sv, 97 "or_eq"sv, 98 "override"sv, 99 "private"sv, 100 "protected"sv, 101 "public"sv, 102 "register"sv, 103 "reinterpret_cast"sv, 104 "return"sv, 105 "signed"sv, 106 "sizeof"sv, 107 "static"sv, 108 "static_assert"sv, 109 "static_cast"sv, 110 "struct"sv, 111 "switch"sv, 112 "template"sv, 113 "this"sv, 114 "thread_local"sv, 115 "throw"sv, 116 "true"sv, 117 "try"sv, 118 "typedef"sv, 119 "typeid"sv, 120 "typename"sv, 121 "union"sv, 122 "using"sv, 123 "virtual"sv, 124 "volatile"sv, 125 "while"sv, 126 "xor"sv, 127 "xor_eq"sv 128}; 129 130constexpr StringView s_known_types[] = { 131 "Array"sv, 132 "Array"sv, 133 "Badge"sv, 134 "Bitmap"sv, 135 "ByteBuffer"sv, 136 "Bytes"sv, 137 "Checked"sv, 138 "CircularDeque"sv, 139 "CircularQueue"sv, 140 "Deque"sv, 141 "DoublyLinkedList"sv, 142 "Error"sv, 143 "ErrorOr"sv, 144 "FlyString"sv, 145 "Function"sv, 146 "HashMap"sv, 147 "HashTable"sv, 148 "IPv4Address"sv, 149 "IntrusiveList"sv, 150 "IntrusiveList"sv, 151 "JsonArray"sv, 152 "JsonObject"sv, 153 "JsonValue"sv, 154 "LexicalPath"sv, 155 "MappedFile"sv, 156 "NetworkOrdered"sv, 157 "NeverDestroyed"sv, 158 "NonnullOwnPtr"sv, 159 "NonnullRefPtr"sv, 160 "Optional"sv, 161 "OwnPtr"sv, 162 "ReadonlyBytes"sv, 163 "RedBlackTree"sv, 164 "RefPtr"sv, 165 "Result"sv, 166 "ScopeGuard"sv, 167 "Singleton"sv, 168 "SinglyLinkedList"sv, 169 "Span"sv, 170 "String"sv, 171 "StringBuilder"sv, 172 "StringImpl"sv, 173 "StringView"sv, 174 "Utf8View"sv, 175 "Variant"sv, 176 "Vector"sv, 177 "WeakPtr"sv, 178 "auto"sv, 179 "bool"sv, 180 "char"sv, 181 "char16_t"sv, 182 "char32_t"sv, 183 "char8_t"sv, 184 "double"sv, 185 "float"sv, 186 "i16"sv, 187 "i32"sv, 188 "i64"sv, 189 "i8"sv, 190 "int"sv, 191 "int"sv, 192 "long"sv, 193 "short"sv, 194 "signed"sv, 195 "u16"sv, 196 "u32"sv, 197 "u64"sv, 198 "u8"sv, 199 "unsigned"sv, 200 "void"sv, 201 "wchar_t"sv, 202}; 203 204static bool is_keyword(StringView string) 205{ 206 static HashTable<DeprecatedString> keywords(array_size(s_known_keywords)); 207 if (keywords.is_empty()) { 208 keywords.set_from(s_known_keywords); 209 } 210 return keywords.contains(string); 211} 212 213static bool is_known_type(StringView string) 214{ 215 static HashTable<DeprecatedString> types(array_size(s_known_types)); 216 if (types.is_empty()) { 217 types.set_from(s_known_types); 218 } 219 return types.contains(string); 220} 221 222void Lexer::lex_impl(Function<void(Token)> callback) 223{ 224 size_t token_start_index = 0; 225 Position token_start_position; 226 227 auto emit_single_char_token = [&](auto type) { 228 callback(Token(type, m_position, m_position, m_input.substring_view(m_index, 1))); 229 consume(); 230 }; 231 232 auto begin_token = [&] { 233 token_start_index = m_index; 234 token_start_position = m_position; 235 }; 236 auto commit_token = [&](auto type) { 237 if (m_options.ignore_whitespace && type == Token::Type::Whitespace) 238 return; 239 callback(Token(type, token_start_position, m_previous_position, m_input.substring_view(token_start_index, m_index - token_start_index))); 240 }; 241 242 auto emit_token_equals = [&](auto type, auto equals_type) { 243 if (peek(1) == '=') { 244 begin_token(); 245 consume(); 246 consume(); 247 commit_token(equals_type); 248 return; 249 } 250 emit_single_char_token(type); 251 }; 252 253 auto match_escape_sequence = [&]() -> size_t { 254 switch (peek(1)) { 255 case '\'': 256 case '"': 257 case '?': 258 case '\\': 259 case 'a': 260 case 'b': 261 case 'f': 262 case 'n': 263 case 'r': 264 case 't': 265 case 'v': 266 return 2; 267 case '0': 268 case '1': 269 case '2': 270 case '3': 271 case '4': 272 case '5': 273 case '6': 274 case '7': { 275 size_t octal_digits = 1; 276 for (size_t i = 0; i < 2; ++i) { 277 char next = peek(2 + i); 278 if (next < '0' || next > '7') 279 break; 280 ++octal_digits; 281 } 282 return 1 + octal_digits; 283 } 284 case 'x': { 285 size_t hex_digits = 0; 286 while (is_ascii_hex_digit(peek(2 + hex_digits))) 287 ++hex_digits; 288 return 2 + hex_digits; 289 } 290 case 'u': 291 case 'U': { 292 bool is_unicode = true; 293 size_t number_of_digits = peek(1) == 'u' ? 4 : 8; 294 for (size_t i = 0; i < number_of_digits; ++i) { 295 if (!is_ascii_hex_digit(peek(2 + i))) { 296 is_unicode = false; 297 break; 298 } 299 } 300 return is_unicode ? 2 + number_of_digits : 0; 301 } 302 default: 303 return 0; 304 } 305 }; 306 307 auto match_string_prefix = [&](char quote) -> size_t { 308 if (peek() == quote) 309 return 1; 310 if (peek() == 'L' && peek(1) == quote) 311 return 2; 312 if (peek() == 'u') { 313 if (peek(1) == quote) 314 return 2; 315 if (peek(1) == '8' && peek(2) == quote) 316 return 3; 317 } 318 if (peek() == 'U' && peek(1) == quote) 319 return 2; 320 return 0; 321 }; 322 323 while (m_index < m_input.length()) { 324 auto ch = peek(); 325 if (is_ascii_space(ch)) { 326 begin_token(); 327 while (is_ascii_space(peek())) 328 consume(); 329 commit_token(Token::Type::Whitespace); 330 continue; 331 } 332 if (ch == '(') { 333 emit_single_char_token(Token::Type::LeftParen); 334 continue; 335 } 336 if (ch == ')') { 337 emit_single_char_token(Token::Type::RightParen); 338 continue; 339 } 340 if (ch == '{') { 341 emit_single_char_token(Token::Type::LeftCurly); 342 continue; 343 } 344 if (ch == '}') { 345 emit_single_char_token(Token::Type::RightCurly); 346 continue; 347 } 348 if (ch == '[') { 349 emit_single_char_token(Token::Type::LeftBracket); 350 continue; 351 } 352 if (ch == ']') { 353 emit_single_char_token(Token::Type::RightBracket); 354 continue; 355 } 356 if (ch == '<') { 357 begin_token(); 358 consume(); 359 if (peek() == '<') { 360 consume(); 361 if (peek() == '=') { 362 consume(); 363 commit_token(Token::Type::LessLessEquals); 364 continue; 365 } 366 commit_token(Token::Type::LessLess); 367 continue; 368 } 369 if (peek() == '=') { 370 consume(); 371 commit_token(Token::Type::LessEquals); 372 continue; 373 } 374 if (peek() == '>') { 375 consume(); 376 commit_token(Token::Type::LessGreater); 377 continue; 378 } 379 commit_token(Token::Type::Less); 380 continue; 381 } 382 if (ch == '>') { 383 begin_token(); 384 consume(); 385 if (peek() == '>') { 386 consume(); 387 if (peek() == '=') { 388 consume(); 389 commit_token(Token::Type::GreaterGreaterEquals); 390 continue; 391 } 392 commit_token(Token::Type::GreaterGreater); 393 continue; 394 } 395 if (peek() == '=') { 396 consume(); 397 commit_token(Token::Type::GreaterEquals); 398 continue; 399 } 400 commit_token(Token::Type::Greater); 401 continue; 402 } 403 if (ch == ',') { 404 emit_single_char_token(Token::Type::Comma); 405 continue; 406 } 407 if (ch == '+') { 408 begin_token(); 409 consume(); 410 if (peek() == '+') { 411 consume(); 412 commit_token(Token::Type::PlusPlus); 413 continue; 414 } 415 if (peek() == '=') { 416 consume(); 417 commit_token(Token::Type::PlusEquals); 418 continue; 419 } 420 commit_token(Token::Type::Plus); 421 continue; 422 } 423 if (ch == '-') { 424 begin_token(); 425 consume(); 426 if (peek() == '-') { 427 consume(); 428 commit_token(Token::Type::MinusMinus); 429 continue; 430 } 431 if (peek() == '=') { 432 consume(); 433 commit_token(Token::Type::MinusEquals); 434 continue; 435 } 436 if (peek() == '>') { 437 consume(); 438 if (peek() == '*') { 439 consume(); 440 commit_token(Token::Type::ArrowAsterisk); 441 continue; 442 } 443 commit_token(Token::Type::Arrow); 444 continue; 445 } 446 commit_token(Token::Type::Minus); 447 continue; 448 } 449 if (ch == '*') { 450 emit_token_equals(Token::Type::Asterisk, Token::Type::AsteriskEquals); 451 continue; 452 } 453 if (ch == '%') { 454 emit_token_equals(Token::Type::Percent, Token::Type::PercentEquals); 455 continue; 456 } 457 if (ch == '^') { 458 emit_token_equals(Token::Type::Caret, Token::Type::CaretEquals); 459 continue; 460 } 461 if (ch == '!') { 462 emit_token_equals(Token::Type::ExclamationMark, Token::Type::ExclamationMarkEquals); 463 continue; 464 } 465 if (ch == '=') { 466 emit_token_equals(Token::Type::Equals, Token::Type::EqualsEquals); 467 continue; 468 } 469 if (ch == '&') { 470 begin_token(); 471 consume(); 472 if (peek() == '&') { 473 consume(); 474 commit_token(Token::Type::AndAnd); 475 continue; 476 } 477 if (peek() == '=') { 478 consume(); 479 commit_token(Token::Type::AndEquals); 480 continue; 481 } 482 commit_token(Token::Type::And); 483 continue; 484 } 485 if (ch == '|') { 486 begin_token(); 487 consume(); 488 if (peek() == '|') { 489 consume(); 490 commit_token(Token::Type::PipePipe); 491 continue; 492 } 493 if (peek() == '=') { 494 consume(); 495 commit_token(Token::Type::PipeEquals); 496 continue; 497 } 498 commit_token(Token::Type::Pipe); 499 continue; 500 } 501 if (ch == '~') { 502 emit_single_char_token(Token::Type::Tilde); 503 continue; 504 } 505 if (ch == '?') { 506 emit_single_char_token(Token::Type::QuestionMark); 507 continue; 508 } 509 if (ch == ':') { 510 begin_token(); 511 consume(); 512 if (peek() == ':') { 513 consume(); 514 if (peek() == '*') { 515 consume(); 516 commit_token(Token::Type::ColonColonAsterisk); 517 continue; 518 } 519 commit_token(Token::Type::ColonColon); 520 continue; 521 } 522 commit_token(Token::Type::Colon); 523 continue; 524 } 525 if (ch == ';') { 526 emit_single_char_token(Token::Type::Semicolon); 527 continue; 528 } 529 if (ch == '.') { 530 begin_token(); 531 consume(); 532 if (peek() == '*') { 533 consume(); 534 commit_token(Token::Type::DotAsterisk); 535 continue; 536 } 537 commit_token(Token::Type::Dot); 538 continue; 539 } 540 if (ch == '#') { 541 begin_token(); 542 consume(); 543 while (AK::is_ascii_space(peek())) 544 consume(); 545 546 size_t directive_start = m_index; 547 if (is_valid_first_character_of_identifier(peek())) 548 while (peek() && is_valid_nonfirst_character_of_identifier(peek())) 549 consume(); 550 551 auto directive = StringView(m_input.characters_without_null_termination() + directive_start, m_index - directive_start); 552 if (directive == "include"sv) { 553 commit_token(Token::Type::IncludeStatement); 554 555 if (is_ascii_space(peek())) { 556 begin_token(); 557 do { 558 consume(); 559 } while (is_ascii_space(peek())); 560 commit_token(Token::Type::Whitespace); 561 } 562 563 begin_token(); 564 if (peek() == '<' || peek() == '"') { 565 char closing = consume() == '<' ? '>' : '"'; 566 while (peek() && peek() != closing && peek() != '\n') 567 consume(); 568 569 if (peek() && consume() == '\n') { 570 commit_token(Token::Type::IncludePath); 571 continue; 572 } 573 574 commit_token(Token::Type::IncludePath); 575 begin_token(); 576 } 577 } else { 578 while (peek()) { 579 if (peek() == '\\' && peek(1) == '\n') { 580 consume(); 581 consume(); 582 } else if (peek() == '\n') { 583 break; 584 } else { 585 consume(); 586 } 587 } 588 589 commit_token(Token::Type::PreprocessorStatement); 590 } 591 592 continue; 593 } 594 if (ch == '/' && peek(1) == '/') { 595 begin_token(); 596 while (peek() && peek() != '\n') 597 consume(); 598 commit_token(Token::Type::Comment); 599 continue; 600 } 601 if (ch == '/' && peek(1) == '*') { 602 begin_token(); 603 consume(); 604 consume(); 605 bool comment_block_ends = false; 606 while (peek()) { 607 if (peek() == '*' && peek(1) == '/') { 608 comment_block_ends = true; 609 break; 610 } 611 612 consume(); 613 } 614 615 if (comment_block_ends) { 616 consume(); 617 consume(); 618 } 619 620 commit_token(Token::Type::Comment); 621 continue; 622 } 623 if (ch == '/') { 624 emit_token_equals(Token::Type::Slash, Token::Type::SlashEquals); 625 continue; 626 } 627 if (size_t prefix = match_string_prefix('"'); prefix > 0) { 628 begin_token(); 629 for (size_t i = 0; i < prefix; ++i) 630 consume(); 631 while (peek()) { 632 if (peek() == '\\') { 633 if (size_t escape = match_escape_sequence(); escape > 0) { 634 commit_token(Token::Type::DoubleQuotedString); 635 begin_token(); 636 for (size_t i = 0; i < escape; ++i) 637 consume(); 638 commit_token(Token::Type::EscapeSequence); 639 begin_token(); 640 continue; 641 } 642 } 643 644 // If string is not terminated - stop before EOF 645 if (!peek(1)) 646 break; 647 648 if (consume() == '"') 649 break; 650 } 651 commit_token(Token::Type::DoubleQuotedString); 652 continue; 653 } 654 if (size_t prefix = match_string_prefix('R'); prefix > 0 && peek(prefix) == '"') { 655 begin_token(); 656 for (size_t i = 0; i < prefix + 1; ++i) 657 consume(); 658 size_t prefix_start = m_index; 659 while (peek() && peek() != '(') 660 consume(); 661 StringView prefix_string = m_input.substring_view(prefix_start, m_index - prefix_start); 662 while (peek()) { 663 if (consume() == '"') { 664 VERIFY(m_index >= prefix_string.length() + 2); 665 VERIFY(m_input[m_index - 1] == '"'); 666 if (m_input[m_index - 1 - prefix_string.length() - 1] == ')') { 667 StringView suffix_string = m_input.substring_view(m_index - 1 - prefix_string.length(), prefix_string.length()); 668 if (prefix_string == suffix_string) 669 break; 670 } 671 } 672 } 673 commit_token(Token::Type::RawString); 674 continue; 675 } 676 if (size_t prefix = match_string_prefix('\''); prefix > 0) { 677 begin_token(); 678 for (size_t i = 0; i < prefix; ++i) 679 consume(); 680 while (peek()) { 681 if (peek() == '\\') { 682 if (size_t escape = match_escape_sequence(); escape > 0) { 683 commit_token(Token::Type::SingleQuotedString); 684 begin_token(); 685 for (size_t i = 0; i < escape; ++i) 686 consume(); 687 commit_token(Token::Type::EscapeSequence); 688 begin_token(); 689 continue; 690 } 691 } 692 693 if (consume() == '\'') 694 break; 695 } 696 commit_token(Token::Type::SingleQuotedString); 697 continue; 698 } 699 if (is_ascii_digit(ch) || (ch == '.' && is_ascii_digit(peek(1)))) { 700 begin_token(); 701 consume(); 702 703 auto type = ch == '.' ? Token::Type::Float : Token::Type::Integer; 704 bool is_hex = false; 705 bool is_binary = false; 706 707 auto match_exponent = [&]() -> size_t { 708 char ch = peek(); 709 if (ch != 'e' && ch != 'E' && ch != 'p' && ch != 'P') 710 return 0; 711 712 type = Token::Type::Float; 713 size_t length = 1; 714 ch = peek(length); 715 if (ch == '+' || ch == '-') { 716 ++length; 717 } 718 for (ch = peek(length); is_ascii_digit(ch); ch = peek(length)) { 719 ++length; 720 } 721 return length; 722 }; 723 724 auto match_type_literal = [&]() -> size_t { 725 size_t length = 0; 726 for (;;) { 727 char ch = peek(length); 728 if ((ch == 'u' || ch == 'U') && type == Token::Type::Integer) { 729 ++length; 730 } else if ((ch == 'f' || ch == 'F') && !is_binary) { 731 type = Token::Type::Float; 732 ++length; 733 } else if (ch == 'l' || ch == 'L') { 734 ++length; 735 } else 736 return length; 737 } 738 }; 739 740 if (peek() == 'b' || peek() == 'B') { 741 consume(); 742 is_binary = true; 743 for (char ch = peek(); ch == '0' || ch == '1' || (ch == '\'' && peek(1) != '\''); ch = peek()) { 744 consume(); 745 } 746 } else { 747 if (peek() == 'x' || peek() == 'X') { 748 consume(); 749 is_hex = true; 750 } 751 752 for (char ch = peek(); (is_hex ? is_ascii_hex_digit(ch) : is_ascii_digit(ch)) || (ch == '\'' && peek(1) != '\'') || ch == '.'; ch = peek()) { 753 if (ch == '.') { 754 if (type == Token::Type::Integer) { 755 type = Token::Type::Float; 756 } else 757 break; 758 }; 759 consume(); 760 } 761 } 762 763 if (!is_binary) { 764 size_t length = match_exponent(); 765 for (size_t i = 0; i < length; ++i) 766 consume(); 767 } 768 769 size_t length = match_type_literal(); 770 for (size_t i = 0; i < length; ++i) 771 consume(); 772 773 commit_token(type); 774 continue; 775 } 776 if (is_valid_first_character_of_identifier(ch)) { 777 begin_token(); 778 while (peek() && is_valid_nonfirst_character_of_identifier(peek())) 779 consume(); 780 auto token_view = StringView(m_input.characters_without_null_termination() + token_start_index, m_index - token_start_index); 781 if (is_keyword(token_view)) 782 commit_token(Token::Type::Keyword); 783 else if (is_known_type(token_view)) 784 commit_token(Token::Type::KnownType); 785 else 786 commit_token(Token::Type::Identifier); 787 continue; 788 } 789 790 if (ch == '\\' && peek(1) == '\n') { 791 consume(); 792 consume(); 793 continue; 794 } 795 796 dbgln("Unimplemented token character: {}", ch); 797 emit_single_char_token(Token::Type::Unknown); 798 } 799} 800 801Vector<Token> Lexer::lex() 802{ 803 Vector<Token> tokens; 804 lex_impl([&](auto token) { 805 tokens.append(move(token)); 806 }); 807 return tokens; 808} 809 810}