Serenity Operating System
1/*
2 * Copyright (c) 2018-2020, Andreas Kling <kling@serenityos.org>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * 1. Redistributions of source code must retain the above copyright notice, this
9 * list of conditions and the following disclaimer.
10 *
11 * 2. Redistributions in binary form must reproduce the above copyright notice,
12 * this list of conditions and the following disclaimer in the documentation
13 * and/or other materials provided with the distribution.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
16 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
21 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
22 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
23 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 */
26
27#include "CppLexer.h"
28#include <AK/HashTable.h>
29#include <AK/String.h>
30#include <ctype.h>
31
32namespace GUI {
33
34CppLexer::CppLexer(const StringView& input)
35 : m_input(input)
36{
37}
38
39char CppLexer::peek(size_t offset) const
40{
41 if ((m_index + offset) >= m_input.length())
42 return 0;
43 return m_input[m_index + offset];
44}
45
46char CppLexer::consume()
47{
48 ASSERT(m_index < m_input.length());
49 char ch = m_input[m_index++];
50 m_previous_position = m_position;
51 if (ch == '\n') {
52 m_position.line++;
53 m_position.column = 0;
54 } else {
55 m_position.column++;
56 }
57 return ch;
58}
59
60static bool is_valid_first_character_of_identifier(char ch)
61{
62 return isalpha(ch) || ch == '_' || ch == '$';
63}
64
65static bool is_valid_nonfirst_character_of_identifier(char ch)
66{
67 return is_valid_first_character_of_identifier(ch) || isdigit(ch);
68}
69
70static bool is_keyword(const StringView& string)
71{
72 static HashTable<String> keywords;
73 if (keywords.is_empty()) {
74 keywords.set("alignas");
75 keywords.set("alignof");
76 keywords.set("and");
77 keywords.set("and_eq");
78 keywords.set("asm");
79 keywords.set("bitand");
80 keywords.set("bitor");
81 keywords.set("bool");
82 keywords.set("break");
83 keywords.set("case");
84 keywords.set("catch");
85 keywords.set("class");
86 keywords.set("compl");
87 keywords.set("const");
88 keywords.set("const_cast");
89 keywords.set("constexpr");
90 keywords.set("continue");
91 keywords.set("decltype");
92 keywords.set("default");
93 keywords.set("delete");
94 keywords.set("do");
95 keywords.set("dynamic_cast");
96 keywords.set("else");
97 keywords.set("enum");
98 keywords.set("explicit");
99 keywords.set("export");
100 keywords.set("extern");
101 keywords.set("false");
102 keywords.set("final");
103 keywords.set("for");
104 keywords.set("friend");
105 keywords.set("goto");
106 keywords.set("if");
107 keywords.set("inline");
108 keywords.set("mutable");
109 keywords.set("namespace");
110 keywords.set("new");
111 keywords.set("noexcept");
112 keywords.set("not");
113 keywords.set("not_eq");
114 keywords.set("nullptr");
115 keywords.set("operator");
116 keywords.set("or");
117 keywords.set("or_eq");
118 keywords.set("override");
119 keywords.set("private");
120 keywords.set("protected");
121 keywords.set("public");
122 keywords.set("register");
123 keywords.set("reinterpret_cast");
124 keywords.set("return");
125 keywords.set("signed");
126 keywords.set("sizeof");
127 keywords.set("static");
128 keywords.set("static_assert");
129 keywords.set("static_cast");
130 keywords.set("struct");
131 keywords.set("switch");
132 keywords.set("template");
133 keywords.set("this");
134 keywords.set("thread_local");
135 keywords.set("throw");
136 keywords.set("true");
137 keywords.set("try");
138 keywords.set("typedef");
139 keywords.set("typeid");
140 keywords.set("typename");
141 keywords.set("union");
142 keywords.set("using");
143 keywords.set("virtual");
144 keywords.set("volatile");
145 keywords.set("while");
146 keywords.set("xor");
147 keywords.set("xor_eq");
148 }
149 return keywords.contains(string);
150}
151
152static bool is_known_type(const StringView& string)
153{
154 static HashTable<String> types;
155 if (types.is_empty()) {
156 types.set("ByteBuffer");
157 types.set("CircularDeque");
158 types.set("CircularQueue");
159 types.set("Deque");
160 types.set("DoublyLinkedList");
161 types.set("FileSystemPath");
162 types.set("FixedArray");
163 types.set("Function");
164 types.set("HashMap");
165 types.set("HashTable");
166 types.set("IPv4Address");
167 types.set("InlineLinkedList");
168 types.set("IntrusiveList");
169 types.set("JsonArray");
170 types.set("JsonObject");
171 types.set("JsonValue");
172 types.set("MappedFile");
173 types.set("NetworkOrdered");
174 types.set("NonnullOwnPtr");
175 types.set("NonnullOwnPtrVector");
176 types.set("NonnullRefPtr");
177 types.set("NonnullRefPtrVector");
178 types.set("Optional");
179 types.set("OwnPtr");
180 types.set("RefPtr");
181 types.set("Result");
182 types.set("ScopeGuard");
183 types.set("SinglyLinkedList");
184 types.set("String");
185 types.set("StringBuilder");
186 types.set("StringImpl");
187 types.set("StringView");
188 types.set("Utf8View");
189 types.set("Vector");
190 types.set("WeakPtr");
191 types.set("auto");
192 types.set("char");
193 types.set("char16_t");
194 types.set("char32_t");
195 types.set("char8_t");
196 types.set("double");
197 types.set("float");
198 types.set("i16");
199 types.set("i32");
200 types.set("i64");
201 types.set("i8");
202 types.set("int");
203 types.set("int");
204 types.set("long");
205 types.set("short");
206 types.set("signed");
207 types.set("u16");
208 types.set("u32");
209 types.set("u64");
210 types.set("u8");
211 types.set("unsigned");
212 types.set("void");
213 types.set("wchar_t");
214 }
215 return types.contains(string);
216}
217
218Vector<CppToken> CppLexer::lex()
219{
220 Vector<CppToken> tokens;
221
222 size_t token_start_index = 0;
223 CppPosition token_start_position;
224
225 auto emit_token = [&](auto type) {
226 CppToken token;
227 token.m_type = type;
228 token.m_start = m_position;
229 token.m_end = m_position;
230 tokens.append(token);
231 consume();
232 };
233
234 auto begin_token = [&] {
235 token_start_index = m_index;
236 token_start_position = m_position;
237 };
238 auto commit_token = [&](auto type) {
239 CppToken token;
240 token.m_type = type;
241 token.m_start = token_start_position;
242 token.m_end = m_previous_position;
243 tokens.append(token);
244 };
245
246 auto match_escape_sequence = [&]() -> size_t {
247 switch (peek(1)) {
248 case '\'':
249 case '"':
250 case '?':
251 case '\\':
252 case 'a':
253 case 'b':
254 case 'f':
255 case 'n':
256 case 'r':
257 case 't':
258 case 'v':
259 return 2;
260 case '0':
261 case '1':
262 case '2':
263 case '3':
264 case '4':
265 case '5':
266 case '6':
267 case '7': {
268 size_t octal_digits = 1;
269 for (size_t i = 0; i < 2; ++i) {
270 char next = peek(2 + i);
271 if (next < '0' || next > '7')
272 break;
273 ++octal_digits;
274 }
275 return 1 + octal_digits;
276 }
277 case 'x': {
278 size_t hex_digits = 0;
279 for (size_t i = 0; i < 2; ++i) {
280 if (!isxdigit(peek(2 + i)))
281 break;
282 ++hex_digits;
283 }
284 return 2 + hex_digits;
285 }
286 case 'u': {
287 bool is_unicode = true;
288 for (size_t i = 0; i < 4; ++i) {
289 if (!isxdigit(peek(2 + i))) {
290 is_unicode = false;
291 break;
292 }
293 }
294 return is_unicode ? 6 : 0;
295 }
296 default:
297 return 0;
298 }
299 };
300
301 while (m_index < m_input.length()) {
302 auto ch = peek();
303 if (isspace(ch)) {
304 begin_token();
305 while (isspace(peek()))
306 consume();
307 commit_token(CppToken::Type::Whitespace);
308 continue;
309 }
310 if (ch == '(') {
311 emit_token(CppToken::Type::LeftParen);
312 continue;
313 }
314 if (ch == ')') {
315 emit_token(CppToken::Type::RightParen);
316 continue;
317 }
318 if (ch == '{') {
319 emit_token(CppToken::Type::LeftCurly);
320 continue;
321 }
322 if (ch == '}') {
323 emit_token(CppToken::Type::RightCurly);
324 continue;
325 }
326 if (ch == '[') {
327 emit_token(CppToken::Type::LeftBracket);
328 continue;
329 }
330 if (ch == ']') {
331 emit_token(CppToken::Type::RightBracket);
332 continue;
333 }
334 if (ch == ',') {
335 emit_token(CppToken::Type::Comma);
336 continue;
337 }
338 if (ch == '*') {
339 emit_token(CppToken::Type::Asterisk);
340 continue;
341 }
342 if (ch == ';') {
343 emit_token(CppToken::Type::Semicolon);
344 continue;
345 }
346 if (ch == '#') {
347 begin_token();
348 consume();
349
350 if (is_valid_first_character_of_identifier(peek()))
351 while (peek() && is_valid_nonfirst_character_of_identifier(peek()))
352 consume();
353
354 auto directive = StringView(m_input.characters_without_null_termination() + token_start_index, m_index - token_start_index);
355 if (directive == "#include") {
356 commit_token(CppToken::Type::IncludeStatement);
357
358 begin_token();
359 while (isspace(peek()))
360 consume();
361 commit_token(CppToken::Type::Whitespace);
362
363 begin_token();
364 if (peek() == '<' || peek() == '"') {
365 char closing = consume() == '<' ? '>' : '"';
366 while (peek() && peek() != closing && peek() != '\n')
367 consume();
368
369 if (peek() && consume() == '\n') {
370 commit_token(CppToken::Type::IncludePath);
371 continue;
372 }
373
374 commit_token(CppToken::Type::IncludePath);
375 begin_token();
376 }
377 }
378
379 while (peek() && peek() != '\n')
380 consume();
381
382 commit_token(CppToken::Type::PreprocessorStatement);
383 continue;
384 }
385 if (ch == '/' && peek(1) == '/') {
386 begin_token();
387 while (peek() && peek() != '\n')
388 consume();
389 commit_token(CppToken::Type::Comment);
390 continue;
391 }
392 if (ch == '/' && peek(1) == '*') {
393 begin_token();
394 consume();
395 consume();
396 bool comment_block_ends = false;
397 while (peek()) {
398 if (peek() == '*' && peek(1) == '/') {
399 comment_block_ends = true;
400 break;
401 }
402
403 consume();
404 }
405
406 if (comment_block_ends) {
407 consume();
408 consume();
409 }
410
411 commit_token(CppToken::Type::Comment);
412 continue;
413 }
414 if (ch == '"') {
415 begin_token();
416 consume();
417 while (peek()) {
418 if (peek() == '\\') {
419 size_t escape = match_escape_sequence();
420 if (escape > 0) {
421 commit_token(CppToken::Type::DoubleQuotedString);
422 begin_token();
423 for (size_t i = 0; i < escape; ++i)
424 consume();
425 commit_token(CppToken::Type::EscapeSequence);
426 begin_token();
427 continue;
428 }
429 }
430
431 if (consume() == '"')
432 break;
433 }
434 commit_token(CppToken::Type::DoubleQuotedString);
435 continue;
436 }
437 if (ch == '\'') {
438 begin_token();
439 consume();
440 while (peek()) {
441 if (peek() == '\\') {
442 size_t escape = match_escape_sequence();
443 if (escape > 0) {
444 commit_token(CppToken::Type::SingleQuotedString);
445 begin_token();
446 for (size_t i = 0; i < escape; ++i)
447 consume();
448 commit_token(CppToken::Type::EscapeSequence);
449 begin_token();
450 continue;
451 }
452 }
453
454 if (consume() == '\'')
455 break;
456 }
457 commit_token(CppToken::Type::SingleQuotedString);
458 continue;
459 }
460 if (isdigit(ch) || (ch == '.' && isdigit(peek(1)))) {
461 begin_token();
462 consume();
463
464 auto type = ch == '.' ? CppToken::Type::Float : CppToken::Type::Integer;
465 bool is_hex = false;
466 bool is_binary = false;
467
468 auto match_exponent = [&]() -> size_t {
469 char ch = peek();
470 if (ch != 'e' && ch != 'E' && ch != 'p' && ch != 'P')
471 return 0;
472
473 type = CppToken::Type::Float;
474 size_t length = 1;
475 ch = peek(length);
476 if (ch == '+' || ch == '-') {
477 ++length;
478 }
479 for (ch = peek(length); isdigit(ch); ch = peek(length)) {
480 ++length;
481 }
482 return length;
483 };
484
485 auto match_type_literal = [&]() -> size_t {
486 size_t length = 0;
487 for (;;) {
488 char ch = peek(length);
489 if ((ch == 'u' || ch == 'U') && type == CppToken::Type::Integer) {
490 ++length;
491 } else if ((ch == 'f' || ch == 'F') && !is_binary) {
492 type = CppToken::Type::Float;
493 ++length;
494 } else if (ch == 'l' || ch == 'L') {
495 ++length;
496 } else
497 return length;
498 }
499 };
500
501 if (peek() == 'b' || peek() == 'B') {
502 consume();
503 is_binary = true;
504 for (char ch = peek(); ch == '0' || ch == '1' || (ch == '\'' && peek(1) != '\''); ch = peek()) {
505 consume();
506 }
507 } else {
508 if (peek() == 'x' || peek() == 'X') {
509 consume();
510 is_hex = true;
511 }
512
513 for (char ch = peek(); (is_hex ? isxdigit(ch) : isdigit(ch)) || (ch == '\'' && peek(1) != '\'') || ch == '.'; ch = peek()) {
514 if (ch == '.') {
515 if (type == CppToken::Type::Integer) {
516 type = CppToken::Type::Float;
517 } else
518 break;
519 };
520 consume();
521 }
522 }
523
524 if (!is_binary) {
525 size_t length = match_exponent();
526 for (size_t i = 0; i < length; ++i)
527 consume();
528 }
529
530 size_t length = match_type_literal();
531 for (size_t i = 0; i < length; ++i)
532 consume();
533
534 commit_token(type);
535 continue;
536 }
537 if (is_valid_first_character_of_identifier(ch)) {
538 begin_token();
539 while (peek() && is_valid_nonfirst_character_of_identifier(peek()))
540 consume();
541 auto token_view = StringView(m_input.characters_without_null_termination() + token_start_index, m_index - token_start_index);
542 if (is_keyword(token_view))
543 commit_token(CppToken::Type::Keyword);
544 else if (is_known_type(token_view))
545 commit_token(CppToken::Type::KnownType);
546 else
547 commit_token(CppToken::Type::Identifier);
548 continue;
549 }
550 dbg() << "Unimplemented token character: " << ch;
551 emit_token(CppToken::Type::Unknown);
552 }
553 return tokens;
554}
555
556}