Serenity Operating System
at hosted 556 lines 17 kB view raw
1/* 2 * Copyright (c) 2018-2020, Andreas Kling <kling@serenityos.org> 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions are met: 7 * 8 * 1. Redistributions of source code must retain the above copyright notice, this 9 * list of conditions and the following disclaimer. 10 * 11 * 2. Redistributions in binary form must reproduce the above copyright notice, 12 * this list of conditions and the following disclaimer in the documentation 13 * and/or other materials provided with the distribution. 14 * 15 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 21 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 22 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 23 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 24 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 */ 26 27#include "CppLexer.h" 28#include <AK/HashTable.h> 29#include <AK/String.h> 30#include <ctype.h> 31 32namespace GUI { 33 34CppLexer::CppLexer(const StringView& input) 35 : m_input(input) 36{ 37} 38 39char CppLexer::peek(size_t offset) const 40{ 41 if ((m_index + offset) >= m_input.length()) 42 return 0; 43 return m_input[m_index + offset]; 44} 45 46char CppLexer::consume() 47{ 48 ASSERT(m_index < m_input.length()); 49 char ch = m_input[m_index++]; 50 m_previous_position = m_position; 51 if (ch == '\n') { 52 m_position.line++; 53 m_position.column = 0; 54 } else { 55 m_position.column++; 56 } 57 return ch; 58} 59 60static bool is_valid_first_character_of_identifier(char ch) 61{ 62 return isalpha(ch) || ch == '_' || ch == '$'; 63} 64 65static bool is_valid_nonfirst_character_of_identifier(char ch) 66{ 67 return is_valid_first_character_of_identifier(ch) || isdigit(ch); 68} 69 70static bool is_keyword(const StringView& string) 71{ 72 static HashTable<String> keywords; 73 if (keywords.is_empty()) { 74 keywords.set("alignas"); 75 keywords.set("alignof"); 76 keywords.set("and"); 77 keywords.set("and_eq"); 78 keywords.set("asm"); 79 keywords.set("bitand"); 80 keywords.set("bitor"); 81 keywords.set("bool"); 82 keywords.set("break"); 83 keywords.set("case"); 84 keywords.set("catch"); 85 keywords.set("class"); 86 keywords.set("compl"); 87 keywords.set("const"); 88 keywords.set("const_cast"); 89 keywords.set("constexpr"); 90 keywords.set("continue"); 91 keywords.set("decltype"); 92 keywords.set("default"); 93 keywords.set("delete"); 94 keywords.set("do"); 95 keywords.set("dynamic_cast"); 96 keywords.set("else"); 97 keywords.set("enum"); 98 keywords.set("explicit"); 99 keywords.set("export"); 100 keywords.set("extern"); 101 keywords.set("false"); 102 keywords.set("final"); 103 keywords.set("for"); 104 keywords.set("friend"); 105 keywords.set("goto"); 106 keywords.set("if"); 107 keywords.set("inline"); 108 keywords.set("mutable"); 109 keywords.set("namespace"); 110 keywords.set("new"); 111 keywords.set("noexcept"); 112 keywords.set("not"); 113 keywords.set("not_eq"); 114 keywords.set("nullptr"); 115 keywords.set("operator"); 116 keywords.set("or"); 117 keywords.set("or_eq"); 118 keywords.set("override"); 119 keywords.set("private"); 120 keywords.set("protected"); 121 keywords.set("public"); 122 keywords.set("register"); 123 keywords.set("reinterpret_cast"); 124 keywords.set("return"); 125 keywords.set("signed"); 126 keywords.set("sizeof"); 127 keywords.set("static"); 128 keywords.set("static_assert"); 129 keywords.set("static_cast"); 130 keywords.set("struct"); 131 keywords.set("switch"); 132 keywords.set("template"); 133 keywords.set("this"); 134 keywords.set("thread_local"); 135 keywords.set("throw"); 136 keywords.set("true"); 137 keywords.set("try"); 138 keywords.set("typedef"); 139 keywords.set("typeid"); 140 keywords.set("typename"); 141 keywords.set("union"); 142 keywords.set("using"); 143 keywords.set("virtual"); 144 keywords.set("volatile"); 145 keywords.set("while"); 146 keywords.set("xor"); 147 keywords.set("xor_eq"); 148 } 149 return keywords.contains(string); 150} 151 152static bool is_known_type(const StringView& string) 153{ 154 static HashTable<String> types; 155 if (types.is_empty()) { 156 types.set("ByteBuffer"); 157 types.set("CircularDeque"); 158 types.set("CircularQueue"); 159 types.set("Deque"); 160 types.set("DoublyLinkedList"); 161 types.set("FileSystemPath"); 162 types.set("FixedArray"); 163 types.set("Function"); 164 types.set("HashMap"); 165 types.set("HashTable"); 166 types.set("IPv4Address"); 167 types.set("InlineLinkedList"); 168 types.set("IntrusiveList"); 169 types.set("JsonArray"); 170 types.set("JsonObject"); 171 types.set("JsonValue"); 172 types.set("MappedFile"); 173 types.set("NetworkOrdered"); 174 types.set("NonnullOwnPtr"); 175 types.set("NonnullOwnPtrVector"); 176 types.set("NonnullRefPtr"); 177 types.set("NonnullRefPtrVector"); 178 types.set("Optional"); 179 types.set("OwnPtr"); 180 types.set("RefPtr"); 181 types.set("Result"); 182 types.set("ScopeGuard"); 183 types.set("SinglyLinkedList"); 184 types.set("String"); 185 types.set("StringBuilder"); 186 types.set("StringImpl"); 187 types.set("StringView"); 188 types.set("Utf8View"); 189 types.set("Vector"); 190 types.set("WeakPtr"); 191 types.set("auto"); 192 types.set("char"); 193 types.set("char16_t"); 194 types.set("char32_t"); 195 types.set("char8_t"); 196 types.set("double"); 197 types.set("float"); 198 types.set("i16"); 199 types.set("i32"); 200 types.set("i64"); 201 types.set("i8"); 202 types.set("int"); 203 types.set("int"); 204 types.set("long"); 205 types.set("short"); 206 types.set("signed"); 207 types.set("u16"); 208 types.set("u32"); 209 types.set("u64"); 210 types.set("u8"); 211 types.set("unsigned"); 212 types.set("void"); 213 types.set("wchar_t"); 214 } 215 return types.contains(string); 216} 217 218Vector<CppToken> CppLexer::lex() 219{ 220 Vector<CppToken> tokens; 221 222 size_t token_start_index = 0; 223 CppPosition token_start_position; 224 225 auto emit_token = [&](auto type) { 226 CppToken token; 227 token.m_type = type; 228 token.m_start = m_position; 229 token.m_end = m_position; 230 tokens.append(token); 231 consume(); 232 }; 233 234 auto begin_token = [&] { 235 token_start_index = m_index; 236 token_start_position = m_position; 237 }; 238 auto commit_token = [&](auto type) { 239 CppToken token; 240 token.m_type = type; 241 token.m_start = token_start_position; 242 token.m_end = m_previous_position; 243 tokens.append(token); 244 }; 245 246 auto match_escape_sequence = [&]() -> size_t { 247 switch (peek(1)) { 248 case '\'': 249 case '"': 250 case '?': 251 case '\\': 252 case 'a': 253 case 'b': 254 case 'f': 255 case 'n': 256 case 'r': 257 case 't': 258 case 'v': 259 return 2; 260 case '0': 261 case '1': 262 case '2': 263 case '3': 264 case '4': 265 case '5': 266 case '6': 267 case '7': { 268 size_t octal_digits = 1; 269 for (size_t i = 0; i < 2; ++i) { 270 char next = peek(2 + i); 271 if (next < '0' || next > '7') 272 break; 273 ++octal_digits; 274 } 275 return 1 + octal_digits; 276 } 277 case 'x': { 278 size_t hex_digits = 0; 279 for (size_t i = 0; i < 2; ++i) { 280 if (!isxdigit(peek(2 + i))) 281 break; 282 ++hex_digits; 283 } 284 return 2 + hex_digits; 285 } 286 case 'u': { 287 bool is_unicode = true; 288 for (size_t i = 0; i < 4; ++i) { 289 if (!isxdigit(peek(2 + i))) { 290 is_unicode = false; 291 break; 292 } 293 } 294 return is_unicode ? 6 : 0; 295 } 296 default: 297 return 0; 298 } 299 }; 300 301 while (m_index < m_input.length()) { 302 auto ch = peek(); 303 if (isspace(ch)) { 304 begin_token(); 305 while (isspace(peek())) 306 consume(); 307 commit_token(CppToken::Type::Whitespace); 308 continue; 309 } 310 if (ch == '(') { 311 emit_token(CppToken::Type::LeftParen); 312 continue; 313 } 314 if (ch == ')') { 315 emit_token(CppToken::Type::RightParen); 316 continue; 317 } 318 if (ch == '{') { 319 emit_token(CppToken::Type::LeftCurly); 320 continue; 321 } 322 if (ch == '}') { 323 emit_token(CppToken::Type::RightCurly); 324 continue; 325 } 326 if (ch == '[') { 327 emit_token(CppToken::Type::LeftBracket); 328 continue; 329 } 330 if (ch == ']') { 331 emit_token(CppToken::Type::RightBracket); 332 continue; 333 } 334 if (ch == ',') { 335 emit_token(CppToken::Type::Comma); 336 continue; 337 } 338 if (ch == '*') { 339 emit_token(CppToken::Type::Asterisk); 340 continue; 341 } 342 if (ch == ';') { 343 emit_token(CppToken::Type::Semicolon); 344 continue; 345 } 346 if (ch == '#') { 347 begin_token(); 348 consume(); 349 350 if (is_valid_first_character_of_identifier(peek())) 351 while (peek() && is_valid_nonfirst_character_of_identifier(peek())) 352 consume(); 353 354 auto directive = StringView(m_input.characters_without_null_termination() + token_start_index, m_index - token_start_index); 355 if (directive == "#include") { 356 commit_token(CppToken::Type::IncludeStatement); 357 358 begin_token(); 359 while (isspace(peek())) 360 consume(); 361 commit_token(CppToken::Type::Whitespace); 362 363 begin_token(); 364 if (peek() == '<' || peek() == '"') { 365 char closing = consume() == '<' ? '>' : '"'; 366 while (peek() && peek() != closing && peek() != '\n') 367 consume(); 368 369 if (peek() && consume() == '\n') { 370 commit_token(CppToken::Type::IncludePath); 371 continue; 372 } 373 374 commit_token(CppToken::Type::IncludePath); 375 begin_token(); 376 } 377 } 378 379 while (peek() && peek() != '\n') 380 consume(); 381 382 commit_token(CppToken::Type::PreprocessorStatement); 383 continue; 384 } 385 if (ch == '/' && peek(1) == '/') { 386 begin_token(); 387 while (peek() && peek() != '\n') 388 consume(); 389 commit_token(CppToken::Type::Comment); 390 continue; 391 } 392 if (ch == '/' && peek(1) == '*') { 393 begin_token(); 394 consume(); 395 consume(); 396 bool comment_block_ends = false; 397 while (peek()) { 398 if (peek() == '*' && peek(1) == '/') { 399 comment_block_ends = true; 400 break; 401 } 402 403 consume(); 404 } 405 406 if (comment_block_ends) { 407 consume(); 408 consume(); 409 } 410 411 commit_token(CppToken::Type::Comment); 412 continue; 413 } 414 if (ch == '"') { 415 begin_token(); 416 consume(); 417 while (peek()) { 418 if (peek() == '\\') { 419 size_t escape = match_escape_sequence(); 420 if (escape > 0) { 421 commit_token(CppToken::Type::DoubleQuotedString); 422 begin_token(); 423 for (size_t i = 0; i < escape; ++i) 424 consume(); 425 commit_token(CppToken::Type::EscapeSequence); 426 begin_token(); 427 continue; 428 } 429 } 430 431 if (consume() == '"') 432 break; 433 } 434 commit_token(CppToken::Type::DoubleQuotedString); 435 continue; 436 } 437 if (ch == '\'') { 438 begin_token(); 439 consume(); 440 while (peek()) { 441 if (peek() == '\\') { 442 size_t escape = match_escape_sequence(); 443 if (escape > 0) { 444 commit_token(CppToken::Type::SingleQuotedString); 445 begin_token(); 446 for (size_t i = 0; i < escape; ++i) 447 consume(); 448 commit_token(CppToken::Type::EscapeSequence); 449 begin_token(); 450 continue; 451 } 452 } 453 454 if (consume() == '\'') 455 break; 456 } 457 commit_token(CppToken::Type::SingleQuotedString); 458 continue; 459 } 460 if (isdigit(ch) || (ch == '.' && isdigit(peek(1)))) { 461 begin_token(); 462 consume(); 463 464 auto type = ch == '.' ? CppToken::Type::Float : CppToken::Type::Integer; 465 bool is_hex = false; 466 bool is_binary = false; 467 468 auto match_exponent = [&]() -> size_t { 469 char ch = peek(); 470 if (ch != 'e' && ch != 'E' && ch != 'p' && ch != 'P') 471 return 0; 472 473 type = CppToken::Type::Float; 474 size_t length = 1; 475 ch = peek(length); 476 if (ch == '+' || ch == '-') { 477 ++length; 478 } 479 for (ch = peek(length); isdigit(ch); ch = peek(length)) { 480 ++length; 481 } 482 return length; 483 }; 484 485 auto match_type_literal = [&]() -> size_t { 486 size_t length = 0; 487 for (;;) { 488 char ch = peek(length); 489 if ((ch == 'u' || ch == 'U') && type == CppToken::Type::Integer) { 490 ++length; 491 } else if ((ch == 'f' || ch == 'F') && !is_binary) { 492 type = CppToken::Type::Float; 493 ++length; 494 } else if (ch == 'l' || ch == 'L') { 495 ++length; 496 } else 497 return length; 498 } 499 }; 500 501 if (peek() == 'b' || peek() == 'B') { 502 consume(); 503 is_binary = true; 504 for (char ch = peek(); ch == '0' || ch == '1' || (ch == '\'' && peek(1) != '\''); ch = peek()) { 505 consume(); 506 } 507 } else { 508 if (peek() == 'x' || peek() == 'X') { 509 consume(); 510 is_hex = true; 511 } 512 513 for (char ch = peek(); (is_hex ? isxdigit(ch) : isdigit(ch)) || (ch == '\'' && peek(1) != '\'') || ch == '.'; ch = peek()) { 514 if (ch == '.') { 515 if (type == CppToken::Type::Integer) { 516 type = CppToken::Type::Float; 517 } else 518 break; 519 }; 520 consume(); 521 } 522 } 523 524 if (!is_binary) { 525 size_t length = match_exponent(); 526 for (size_t i = 0; i < length; ++i) 527 consume(); 528 } 529 530 size_t length = match_type_literal(); 531 for (size_t i = 0; i < length; ++i) 532 consume(); 533 534 commit_token(type); 535 continue; 536 } 537 if (is_valid_first_character_of_identifier(ch)) { 538 begin_token(); 539 while (peek() && is_valid_nonfirst_character_of_identifier(peek())) 540 consume(); 541 auto token_view = StringView(m_input.characters_without_null_termination() + token_start_index, m_index - token_start_index); 542 if (is_keyword(token_view)) 543 commit_token(CppToken::Type::Keyword); 544 else if (is_known_type(token_view)) 545 commit_token(CppToken::Type::KnownType); 546 else 547 commit_token(CppToken::Type::Identifier); 548 continue; 549 } 550 dbg() << "Unimplemented token character: " << ch; 551 emit_token(CppToken::Type::Unknown); 552 } 553 return tokens; 554} 555 556}