Userland/Libraries/LibCpp/Lexer.cpp at master

jcs.org / serenity
fork atom
Serenity Operating System
fork atom
serenity / Userland / Libraries / LibCpp / Lexer.cpp
at master 810 lines 23 kB view raw
wrap content
Andreas Kling LibCpp: Remove Nonnull*PtrVector from list of known SerenityOS types 3y ago
f4849521
  1/*
  2 * Copyright (c) 2018-2020, Andreas Kling <kling@serenityos.org>
  3 *
  4 * SPDX-License-Identifier: BSD-2-Clause
  5 */
  6
  7#include "Lexer.h"
  8#include <AK/CharacterTypes.h>
  9#include <AK/DeprecatedString.h>
 10#include <AK/Function.h>
 11#include <AK/HashTable.h>
 12#include <AK/StdLibExtras.h>
 13
 14namespace Cpp {
 15
 16Lexer::Lexer(StringView input, size_t start_line)
 17    : m_input(input)
 18    , m_previous_position { start_line, 0 }
 19    , m_position { start_line, 0 }
 20{
 21}
 22
 23char Lexer::peek(size_t offset) const
 24{
 25    if ((m_index + offset) >= m_input.length())
 26        return 0;
 27    return m_input[m_index + offset];
 28}
 29
 30char Lexer::consume()
 31{
 32    VERIFY(m_index < m_input.length());
 33    char ch = m_input[m_index++];
 34    m_previous_position = m_position;
 35    if (ch == '\n') {
 36        m_position.line++;
 37        m_position.column = 0;
 38    } else {
 39        m_position.column++;
 40    }
 41    return ch;
 42}
 43
 44constexpr bool is_valid_first_character_of_identifier(char ch)
 45{
 46    return is_ascii_alpha(ch) || ch == '_' || ch == '$';
 47}
 48
 49constexpr bool is_valid_nonfirst_character_of_identifier(char ch)
 50{
 51    return is_valid_first_character_of_identifier(ch) || is_ascii_digit(ch);
 52}
 53
 54constexpr StringView s_known_keywords[] = {
 55    "alignas"sv,
 56    "alignof"sv,
 57    "and"sv,
 58    "and_eq"sv,
 59    "asm"sv,
 60    "bitand"sv,
 61    "bitor"sv,
 62    "break"sv,
 63    "case"sv,
 64    "catch"sv,
 65    "class"sv,
 66    "compl"sv,
 67    "const"sv,
 68    "const_cast"sv,
 69    "constexpr"sv,
 70    "continue"sv,
 71    "decltype"sv,
 72    "default"sv,
 73    "delete"sv,
 74    "do"sv,
 75    "dynamic_cast"sv,
 76    "else"sv,
 77    "enum"sv,
 78    "explicit"sv,
 79    "export"sv,
 80    "extern"sv,
 81    "false"sv,
 82    "final"sv,
 83    "for"sv,
 84    "friend"sv,
 85    "goto"sv,
 86    "if"sv,
 87    "inline"sv,
 88    "mutable"sv,
 89    "namespace"sv,
 90    "new"sv,
 91    "noexcept"sv,
 92    "not"sv,
 93    "not_eq"sv,
 94    "nullptr"sv,
 95    "operator"sv,
 96    "or"sv,
 97    "or_eq"sv,
 98    "override"sv,
 99    "private"sv,
100    "protected"sv,
101    "public"sv,
102    "register"sv,
103    "reinterpret_cast"sv,
104    "return"sv,
105    "signed"sv,
106    "sizeof"sv,
107    "static"sv,
108    "static_assert"sv,
109    "static_cast"sv,
110    "struct"sv,
111    "switch"sv,
112    "template"sv,
113    "this"sv,
114    "thread_local"sv,
115    "throw"sv,
116    "true"sv,
117    "try"sv,
118    "typedef"sv,
119    "typeid"sv,
120    "typename"sv,
121    "union"sv,
122    "using"sv,
123    "virtual"sv,
124    "volatile"sv,
125    "while"sv,
126    "xor"sv,
127    "xor_eq"sv
128};
129
130constexpr StringView s_known_types[] = {
131    "Array"sv,
132    "Array"sv,
133    "Badge"sv,
134    "Bitmap"sv,
135    "ByteBuffer"sv,
136    "Bytes"sv,
137    "Checked"sv,
138    "CircularDeque"sv,
139    "CircularQueue"sv,
140    "Deque"sv,
141    "DoublyLinkedList"sv,
142    "Error"sv,
143    "ErrorOr"sv,
144    "FlyString"sv,
145    "Function"sv,
146    "HashMap"sv,
147    "HashTable"sv,
148    "IPv4Address"sv,
149    "IntrusiveList"sv,
150    "IntrusiveList"sv,
151    "JsonArray"sv,
152    "JsonObject"sv,
153    "JsonValue"sv,
154    "LexicalPath"sv,
155    "MappedFile"sv,
156    "NetworkOrdered"sv,
157    "NeverDestroyed"sv,
158    "NonnullOwnPtr"sv,
159    "NonnullRefPtr"sv,
160    "Optional"sv,
161    "OwnPtr"sv,
162    "ReadonlyBytes"sv,
163    "RedBlackTree"sv,
164    "RefPtr"sv,
165    "Result"sv,
166    "ScopeGuard"sv,
167    "Singleton"sv,
168    "SinglyLinkedList"sv,
169    "Span"sv,
170    "String"sv,
171    "StringBuilder"sv,
172    "StringImpl"sv,
173    "StringView"sv,
174    "Utf8View"sv,
175    "Variant"sv,
176    "Vector"sv,
177    "WeakPtr"sv,
178    "auto"sv,
179    "bool"sv,
180    "char"sv,
181    "char16_t"sv,
182    "char32_t"sv,
183    "char8_t"sv,
184    "double"sv,
185    "float"sv,
186    "i16"sv,
187    "i32"sv,
188    "i64"sv,
189    "i8"sv,
190    "int"sv,
191    "int"sv,
192    "long"sv,
193    "short"sv,
194    "signed"sv,
195    "u16"sv,
196    "u32"sv,
197    "u64"sv,
198    "u8"sv,
199    "unsigned"sv,
200    "void"sv,
201    "wchar_t"sv,
202};
203
204static bool is_keyword(StringView string)
205{
206    static HashTable<DeprecatedString> keywords(array_size(s_known_keywords));
207    if (keywords.is_empty()) {
208        keywords.set_from(s_known_keywords);
209    }
210    return keywords.contains(string);
211}
212
213static bool is_known_type(StringView string)
214{
215    static HashTable<DeprecatedString> types(array_size(s_known_types));
216    if (types.is_empty()) {
217        types.set_from(s_known_types);
218    }
219    return types.contains(string);
220}
221
222void Lexer::lex_impl(Function<void(Token)> callback)
223{
224    size_t token_start_index = 0;
225    Position token_start_position;
226
227    auto emit_single_char_token = [&](auto type) {
228        callback(Token(type, m_position, m_position, m_input.substring_view(m_index, 1)));
229        consume();
230    };
231
232    auto begin_token = [&] {
233        token_start_index = m_index;
234        token_start_position = m_position;
235    };
236    auto commit_token = [&](auto type) {
237        if (m_options.ignore_whitespace && type == Token::Type::Whitespace)
238            return;
239        callback(Token(type, token_start_position, m_previous_position, m_input.substring_view(token_start_index, m_index - token_start_index)));
240    };
241
242    auto emit_token_equals = [&](auto type, auto equals_type) {
243        if (peek(1) == '=') {
244            begin_token();
245            consume();
246            consume();
247            commit_token(equals_type);
248            return;
249        }
250        emit_single_char_token(type);
251    };
252
253    auto match_escape_sequence = [&]() -> size_t {
254        switch (peek(1)) {
255        case '\'':
256        case '"':
257        case '?':
258        case '\\':
259        case 'a':
260        case 'b':
261        case 'f':
262        case 'n':
263        case 'r':
264        case 't':
265        case 'v':
266            return 2;
267        case '0':
268        case '1':
269        case '2':
270        case '3':
271        case '4':
272        case '5':
273        case '6':
274        case '7': {
275            size_t octal_digits = 1;
276            for (size_t i = 0; i < 2; ++i) {
277                char next = peek(2 + i);
278                if (next < '0' || next > '7')
279                    break;
280                ++octal_digits;
281            }
282            return 1 + octal_digits;
283        }
284        case 'x': {
285            size_t hex_digits = 0;
286            while (is_ascii_hex_digit(peek(2 + hex_digits)))
287                ++hex_digits;
288            return 2 + hex_digits;
289        }
290        case 'u':
291        case 'U': {
292            bool is_unicode = true;
293            size_t number_of_digits = peek(1) == 'u' ? 4 : 8;
294            for (size_t i = 0; i < number_of_digits; ++i) {
295                if (!is_ascii_hex_digit(peek(2 + i))) {
296                    is_unicode = false;
297                    break;
298                }
299            }
300            return is_unicode ? 2 + number_of_digits : 0;
301        }
302        default:
303            return 0;
304        }
305    };
306
307    auto match_string_prefix = [&](char quote) -> size_t {
308        if (peek() == quote)
309            return 1;
310        if (peek() == 'L' && peek(1) == quote)
311            return 2;
312        if (peek() == 'u') {
313            if (peek(1) == quote)
314                return 2;
315            if (peek(1) == '8' && peek(2) == quote)
316                return 3;
317        }
318        if (peek() == 'U' && peek(1) == quote)
319            return 2;
320        return 0;
321    };
322
323    while (m_index < m_input.length()) {
324        auto ch = peek();
325        if (is_ascii_space(ch)) {
326            begin_token();
327            while (is_ascii_space(peek()))
328                consume();
329            commit_token(Token::Type::Whitespace);
330            continue;
331        }
332        if (ch == '(') {
333            emit_single_char_token(Token::Type::LeftParen);
334            continue;
335        }
336        if (ch == ')') {
337            emit_single_char_token(Token::Type::RightParen);
338            continue;
339        }
340        if (ch == '{') {
341            emit_single_char_token(Token::Type::LeftCurly);
342            continue;
343        }
344        if (ch == '}') {
345            emit_single_char_token(Token::Type::RightCurly);
346            continue;
347        }
348        if (ch == '[') {
349            emit_single_char_token(Token::Type::LeftBracket);
350            continue;
351        }
352        if (ch == ']') {
353            emit_single_char_token(Token::Type::RightBracket);
354            continue;
355        }
356        if (ch == '<') {
357            begin_token();
358            consume();
359            if (peek() == '<') {
360                consume();
361                if (peek() == '=') {
362                    consume();
363                    commit_token(Token::Type::LessLessEquals);
364                    continue;
365                }
366                commit_token(Token::Type::LessLess);
367                continue;
368            }
369            if (peek() == '=') {
370                consume();
371                commit_token(Token::Type::LessEquals);
372                continue;
373            }
374            if (peek() == '>') {
375                consume();
376                commit_token(Token::Type::LessGreater);
377                continue;
378            }
379            commit_token(Token::Type::Less);
380            continue;
381        }
382        if (ch == '>') {
383            begin_token();
384            consume();
385            if (peek() == '>') {
386                consume();
387                if (peek() == '=') {
388                    consume();
389                    commit_token(Token::Type::GreaterGreaterEquals);
390                    continue;
391                }
392                commit_token(Token::Type::GreaterGreater);
393                continue;
394            }
395            if (peek() == '=') {
396                consume();
397                commit_token(Token::Type::GreaterEquals);
398                continue;
399            }
400            commit_token(Token::Type::Greater);
401            continue;
402        }
403        if (ch == ',') {
404            emit_single_char_token(Token::Type::Comma);
405            continue;
406        }
407        if (ch == '+') {
408            begin_token();
409            consume();
410            if (peek() == '+') {
411                consume();
412                commit_token(Token::Type::PlusPlus);
413                continue;
414            }
415            if (peek() == '=') {
416                consume();
417                commit_token(Token::Type::PlusEquals);
418                continue;
419            }
420            commit_token(Token::Type::Plus);
421            continue;
422        }
423        if (ch == '-') {
424            begin_token();
425            consume();
426            if (peek() == '-') {
427                consume();
428                commit_token(Token::Type::MinusMinus);
429                continue;
430            }
431            if (peek() == '=') {
432                consume();
433                commit_token(Token::Type::MinusEquals);
434                continue;
435            }
436            if (peek() == '>') {
437                consume();
438                if (peek() == '*') {
439                    consume();
440                    commit_token(Token::Type::ArrowAsterisk);
441                    continue;
442                }
443                commit_token(Token::Type::Arrow);
444                continue;
445            }
446            commit_token(Token::Type::Minus);
447            continue;
448        }
449        if (ch == '*') {
450            emit_token_equals(Token::Type::Asterisk, Token::Type::AsteriskEquals);
451            continue;
452        }
453        if (ch == '%') {
454            emit_token_equals(Token::Type::Percent, Token::Type::PercentEquals);
455            continue;
456        }
457        if (ch == '^') {
458            emit_token_equals(Token::Type::Caret, Token::Type::CaretEquals);
459            continue;
460        }
461        if (ch == '!') {
462            emit_token_equals(Token::Type::ExclamationMark, Token::Type::ExclamationMarkEquals);
463            continue;
464        }
465        if (ch == '=') {
466            emit_token_equals(Token::Type::Equals, Token::Type::EqualsEquals);
467            continue;
468        }
469        if (ch == '&') {
470            begin_token();
471            consume();
472            if (peek() == '&') {
473                consume();
474                commit_token(Token::Type::AndAnd);
475                continue;
476            }
477            if (peek() == '=') {
478                consume();
479                commit_token(Token::Type::AndEquals);
480                continue;
481            }
482            commit_token(Token::Type::And);
483            continue;
484        }
485        if (ch == '|') {
486            begin_token();
487            consume();
488            if (peek() == '|') {
489                consume();
490                commit_token(Token::Type::PipePipe);
491                continue;
492            }
493            if (peek() == '=') {
494                consume();
495                commit_token(Token::Type::PipeEquals);
496                continue;
497            }
498            commit_token(Token::Type::Pipe);
499            continue;
500        }
501        if (ch == '~') {
502            emit_single_char_token(Token::Type::Tilde);
503            continue;
504        }
505        if (ch == '?') {
506            emit_single_char_token(Token::Type::QuestionMark);
507            continue;
508        }
509        if (ch == ':') {
510            begin_token();
511            consume();
512            if (peek() == ':') {
513                consume();
514                if (peek() == '*') {
515                    consume();
516                    commit_token(Token::Type::ColonColonAsterisk);
517                    continue;
518                }
519                commit_token(Token::Type::ColonColon);
520                continue;
521            }
522            commit_token(Token::Type::Colon);
523            continue;
524        }
525        if (ch == ';') {
526            emit_single_char_token(Token::Type::Semicolon);
527            continue;
528        }
529        if (ch == '.') {
530            begin_token();
531            consume();
532            if (peek() == '*') {
533                consume();
534                commit_token(Token::Type::DotAsterisk);
535                continue;
536            }
537            commit_token(Token::Type::Dot);
538            continue;
539        }
540        if (ch == '#') {
541            begin_token();
542            consume();
543            while (AK::is_ascii_space(peek()))
544                consume();
545
546            size_t directive_start = m_index;
547            if (is_valid_first_character_of_identifier(peek()))
548                while (peek() && is_valid_nonfirst_character_of_identifier(peek()))
549                    consume();
550
551            auto directive = StringView(m_input.characters_without_null_termination() + directive_start, m_index - directive_start);
552            if (directive == "include"sv) {
553                commit_token(Token::Type::IncludeStatement);
554
555                if (is_ascii_space(peek())) {
556                    begin_token();
557                    do {
558                        consume();
559                    } while (is_ascii_space(peek()));
560                    commit_token(Token::Type::Whitespace);
561                }
562
563                begin_token();
564                if (peek() == '<' || peek() == '"') {
565                    char closing = consume() == '<' ? '>' : '"';
566                    while (peek() && peek() != closing && peek() != '\n')
567                        consume();
568
569                    if (peek() && consume() == '\n') {
570                        commit_token(Token::Type::IncludePath);
571                        continue;
572                    }
573
574                    commit_token(Token::Type::IncludePath);
575                    begin_token();
576                }
577            } else {
578                while (peek()) {
579                    if (peek() == '\\' && peek(1) == '\n') {
580                        consume();
581                        consume();
582                    } else if (peek() == '\n') {
583                        break;
584                    } else {
585                        consume();
586                    }
587                }
588
589                commit_token(Token::Type::PreprocessorStatement);
590            }
591
592            continue;
593        }
594        if (ch == '/' && peek(1) == '/') {
595            begin_token();
596            while (peek() && peek() != '\n')
597                consume();
598            commit_token(Token::Type::Comment);
599            continue;
600        }
601        if (ch == '/' && peek(1) == '*') {
602            begin_token();
603            consume();
604            consume();
605            bool comment_block_ends = false;
606            while (peek()) {
607                if (peek() == '*' && peek(1) == '/') {
608                    comment_block_ends = true;
609                    break;
610                }
611
612                consume();
613            }
614
615            if (comment_block_ends) {
616                consume();
617                consume();
618            }
619
620            commit_token(Token::Type::Comment);
621            continue;
622        }
623        if (ch == '/') {
624            emit_token_equals(Token::Type::Slash, Token::Type::SlashEquals);
625            continue;
626        }
627        if (size_t prefix = match_string_prefix('"'); prefix > 0) {
628            begin_token();
629            for (size_t i = 0; i < prefix; ++i)
630                consume();
631            while (peek()) {
632                if (peek() == '\\') {
633                    if (size_t escape = match_escape_sequence(); escape > 0) {
634                        commit_token(Token::Type::DoubleQuotedString);
635                        begin_token();
636                        for (size_t i = 0; i < escape; ++i)
637                            consume();
638                        commit_token(Token::Type::EscapeSequence);
639                        begin_token();
640                        continue;
641                    }
642                }
643
644                // If string is not terminated - stop before EOF
645                if (!peek(1))
646                    break;
647
648                if (consume() == '"')
649                    break;
650            }
651            commit_token(Token::Type::DoubleQuotedString);
652            continue;
653        }
654        if (size_t prefix = match_string_prefix('R'); prefix > 0 && peek(prefix) == '"') {
655            begin_token();
656            for (size_t i = 0; i < prefix + 1; ++i)
657                consume();
658            size_t prefix_start = m_index;
659            while (peek() && peek() != '(')
660                consume();
661            StringView prefix_string = m_input.substring_view(prefix_start, m_index - prefix_start);
662            while (peek()) {
663                if (consume() == '"') {
664                    VERIFY(m_index >= prefix_string.length() + 2);
665                    VERIFY(m_input[m_index - 1] == '"');
666                    if (m_input[m_index - 1 - prefix_string.length() - 1] == ')') {
667                        StringView suffix_string = m_input.substring_view(m_index - 1 - prefix_string.length(), prefix_string.length());
668                        if (prefix_string == suffix_string)
669                            break;
670                    }
671                }
672            }
673            commit_token(Token::Type::RawString);
674            continue;
675        }
676        if (size_t prefix = match_string_prefix('\''); prefix > 0) {
677            begin_token();
678            for (size_t i = 0; i < prefix; ++i)
679                consume();
680            while (peek()) {
681                if (peek() == '\\') {
682                    if (size_t escape = match_escape_sequence(); escape > 0) {
683                        commit_token(Token::Type::SingleQuotedString);
684                        begin_token();
685                        for (size_t i = 0; i < escape; ++i)
686                            consume();
687                        commit_token(Token::Type::EscapeSequence);
688                        begin_token();
689                        continue;
690                    }
691                }
692
693                if (consume() == '\'')
694                    break;
695            }
696            commit_token(Token::Type::SingleQuotedString);
697            continue;
698        }
699        if (is_ascii_digit(ch) || (ch == '.' && is_ascii_digit(peek(1)))) {
700            begin_token();
701            consume();
702
703            auto type = ch == '.' ? Token::Type::Float : Token::Type::Integer;
704            bool is_hex = false;
705            bool is_binary = false;
706
707            auto match_exponent = [&]() -> size_t {
708                char ch = peek();
709                if (ch != 'e' && ch != 'E' && ch != 'p' && ch != 'P')
710                    return 0;
711
712                type = Token::Type::Float;
713                size_t length = 1;
714                ch = peek(length);
715                if (ch == '+' || ch == '-') {
716                    ++length;
717                }
718                for (ch = peek(length); is_ascii_digit(ch); ch = peek(length)) {
719                    ++length;
720                }
721                return length;
722            };
723
724            auto match_type_literal = [&]() -> size_t {
725                size_t length = 0;
726                for (;;) {
727                    char ch = peek(length);
728                    if ((ch == 'u' || ch == 'U') && type == Token::Type::Integer) {
729                        ++length;
730                    } else if ((ch == 'f' || ch == 'F') && !is_binary) {
731                        type = Token::Type::Float;
732                        ++length;
733                    } else if (ch == 'l' || ch == 'L') {
734                        ++length;
735                    } else
736                        return length;
737                }
738            };
739
740            if (peek() == 'b' || peek() == 'B') {
741                consume();
742                is_binary = true;
743                for (char ch = peek(); ch == '0' || ch == '1' || (ch == '\'' && peek(1) != '\''); ch = peek()) {
744                    consume();
745                }
746            } else {
747                if (peek() == 'x' || peek() == 'X') {
748                    consume();
749                    is_hex = true;
750                }
751
752                for (char ch = peek(); (is_hex ? is_ascii_hex_digit(ch) : is_ascii_digit(ch)) || (ch == '\'' && peek(1) != '\'') || ch == '.'; ch = peek()) {
753                    if (ch == '.') {
754                        if (type == Token::Type::Integer) {
755                            type = Token::Type::Float;
756                        } else
757                            break;
758                    };
759                    consume();
760                }
761            }
762
763            if (!is_binary) {
764                size_t length = match_exponent();
765                for (size_t i = 0; i < length; ++i)
766                    consume();
767            }
768
769            size_t length = match_type_literal();
770            for (size_t i = 0; i < length; ++i)
771                consume();
772
773            commit_token(type);
774            continue;
775        }
776        if (is_valid_first_character_of_identifier(ch)) {
777            begin_token();
778            while (peek() && is_valid_nonfirst_character_of_identifier(peek()))
779                consume();
780            auto token_view = StringView(m_input.characters_without_null_termination() + token_start_index, m_index - token_start_index);
781            if (is_keyword(token_view))
782                commit_token(Token::Type::Keyword);
783            else if (is_known_type(token_view))
784                commit_token(Token::Type::KnownType);
785            else
786                commit_token(Token::Type::Identifier);
787            continue;
788        }
789
790        if (ch == '\\' && peek(1) == '\n') {
791            consume();
792            consume();
793            continue;
794        }
795
796        dbgln("Unimplemented token character: {}", ch);
797        emit_single_char_token(Token::Type::Unknown);
798    }
799}
800
801Vector<Token> Lexer::lex()
802{
803    Vector<Token> tokens;
804    lex_impl([&](auto token) {
805        tokens.append(move(token));
806    });
807    return tokens;
808}
809
810}