Serenity Operating System
1/*
2 * Copyright (c) 2018-2020, Andreas Kling <kling@serenityos.org>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * 1. Redistributions of source code must retain the above copyright notice, this
9 * list of conditions and the following disclaimer.
10 *
11 * 2. Redistributions in binary form must reproduce the above copyright notice,
12 * this list of conditions and the following disclaimer in the documentation
13 * and/or other materials provided with the distribution.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
16 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
21 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
22 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
23 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 */
26
27#include "CppLexer.h"
28#include <AK/HashTable.h>
29#include <AK/String.h>
30#include <ctype.h>
31
32namespace GUI {
33
34CppLexer::CppLexer(const StringView& input)
35 : m_input(input)
36{
37}
38
39char CppLexer::peek(size_t offset) const
40{
41 if ((m_index + offset) >= m_input.length())
42 return 0;
43 return m_input[m_index + offset];
44}
45
46char CppLexer::consume()
47{
48 ASSERT(m_index < m_input.length());
49 char ch = m_input[m_index++];
50 m_previous_position = m_position;
51 if (ch == '\n') {
52 m_position.line++;
53 m_position.column = 0;
54 } else {
55 m_position.column++;
56 }
57 return ch;
58}
59
// Returns true if `ch` may start an identifier: an alphabetic character
// (as classified by isalpha), an underscore, or a dollar sign.
static bool is_valid_first_character_of_identifier(char ch)
{
    // The <ctype.h> classifiers require an argument representable as
    // unsigned char (or EOF). Plain char may be signed, so bytes >= 0x80
    // would otherwise be passed as negative values, which is undefined.
    return isalpha(static_cast<unsigned char>(ch)) || ch == '_' || ch == '$';
}
64
65static bool is_valid_nonfirst_character_of_identifier(char ch)
66{
67 return is_valid_first_character_of_identifier(ch) || isdigit(ch);
68}
69
70static bool is_keyword(const StringView& string)
71{
72 static HashTable<String> keywords;
73 if (keywords.is_empty()) {
74 keywords.set("alignas");
75 keywords.set("alignof");
76 keywords.set("and");
77 keywords.set("and_eq");
78 keywords.set("asm");
79 keywords.set("bitand");
80 keywords.set("bitor");
81 keywords.set("bool");
82 keywords.set("break");
83 keywords.set("case");
84 keywords.set("catch");
85 keywords.set("class");
86 keywords.set("compl");
87 keywords.set("const");
88 keywords.set("const_cast");
89 keywords.set("constexpr");
90 keywords.set("continue");
91 keywords.set("decltype");
92 keywords.set("default");
93 keywords.set("delete");
94 keywords.set("do");
95 keywords.set("dynamic_cast");
96 keywords.set("else");
97 keywords.set("enum");
98 keywords.set("explicit");
99 keywords.set("export");
100 keywords.set("extern");
101 keywords.set("false");
102 keywords.set("final");
103 keywords.set("for");
104 keywords.set("friend");
105 keywords.set("goto");
106 keywords.set("if");
107 keywords.set("inline");
108 keywords.set("mutable");
109 keywords.set("namespace");
110 keywords.set("new");
111 keywords.set("noexcept");
112 keywords.set("not");
113 keywords.set("not_eq");
114 keywords.set("nullptr");
115 keywords.set("operator");
116 keywords.set("or");
117 keywords.set("or_eq");
118 keywords.set("override");
119 keywords.set("private");
120 keywords.set("protected");
121 keywords.set("public");
122 keywords.set("register");
123 keywords.set("reinterpret_cast");
124 keywords.set("return");
125 keywords.set("signed");
126 keywords.set("sizeof");
127 keywords.set("static");
128 keywords.set("static_assert");
129 keywords.set("static_cast");
130 keywords.set("struct");
131 keywords.set("switch");
132 keywords.set("template");
133 keywords.set("this");
134 keywords.set("thread_local");
135 keywords.set("throw");
136 keywords.set("true");
137 keywords.set("try");
138 keywords.set("typedef");
139 keywords.set("typeid");
140 keywords.set("typename");
141 keywords.set("union");
142 keywords.set("using");
143 keywords.set("virtual");
144 keywords.set("volatile");
145 keywords.set("while");
146 keywords.set("xor");
147 keywords.set("xor_eq");
148 }
149 return keywords.contains(string);
150}
151
152static bool is_known_type(const StringView& string)
153{
154 static HashTable<String> types;
155 if (types.is_empty()) {
156 types.set("ByteBuffer");
157 types.set("CircularDeque");
158 types.set("CircularQueue");
159 types.set("Deque");
160 types.set("DoublyLinkedList");
161 types.set("FileSystemPath");
162 types.set("FixedArray");
163 types.set("Function");
164 types.set("HashMap");
165 types.set("HashTable");
166 types.set("IPv4Address");
167 types.set("InlineLinkedList");
168 types.set("IntrusiveList");
169 types.set("JsonArray");
170 types.set("JsonObject");
171 types.set("JsonValue");
172 types.set("MappedFile");
173 types.set("NetworkOrdered");
174 types.set("NonnullOwnPtr");
175 types.set("NonnullOwnPtrVector");
176 types.set("NonnullRefPtr");
177 types.set("NonnullRefPtrVector");
178 types.set("Optional");
179 types.set("OwnPtr");
180 types.set("RefPtr");
181 types.set("Result");
182 types.set("ScopeGuard");
183 types.set("SinglyLinkedList");
184 types.set("String");
185 types.set("StringBuilder");
186 types.set("StringImpl");
187 types.set("StringView");
188 types.set("Utf8View");
189 types.set("Vector");
190 types.set("WeakPtr");
191 types.set("auto");
192 types.set("char");
193 types.set("char16_t");
194 types.set("char32_t");
195 types.set("char8_t");
196 types.set("double");
197 types.set("float");
198 types.set("i16");
199 types.set("i32");
200 types.set("i64");
201 types.set("i8");
202 types.set("int");
203 types.set("int");
204 types.set("long");
205 types.set("short");
206 types.set("signed");
207 types.set("u16");
208 types.set("u32");
209 types.set("u64");
210 types.set("u8");
211 types.set("unsigned");
212 types.set("void");
213 types.set("wchar_t");
214 }
215 return types.contains(string);
216}
217
218Vector<CppToken> CppLexer::lex()
219{
220 Vector<CppToken> tokens;
221
222 size_t token_start_index = 0;
223 CppPosition token_start_position;
224
225 auto emit_token = [&](auto type) {
226 CppToken token;
227 token.m_type = type;
228 token.m_start = m_position;
229 token.m_end = m_position;
230 tokens.append(token);
231 consume();
232 };
233
234 auto begin_token = [&] {
235 token_start_index = m_index;
236 token_start_position = m_position;
237 };
238 auto commit_token = [&](auto type) {
239 CppToken token;
240 token.m_type = type;
241 token.m_start = token_start_position;
242 token.m_end = m_previous_position;
243 tokens.append(token);
244 };
245
246 while (m_index < m_input.length()) {
247 auto ch = peek();
248 if (isspace(ch)) {
249 begin_token();
250 while (isspace(peek()))
251 consume();
252 commit_token(CppToken::Type::Whitespace);
253 continue;
254 }
255 if (ch == '(') {
256 emit_token(CppToken::Type::LeftParen);
257 continue;
258 }
259 if (ch == ')') {
260 emit_token(CppToken::Type::RightParen);
261 continue;
262 }
263 if (ch == '{') {
264 emit_token(CppToken::Type::LeftCurly);
265 continue;
266 }
267 if (ch == '}') {
268 emit_token(CppToken::Type::RightCurly);
269 continue;
270 }
271 if (ch == '[') {
272 emit_token(CppToken::Type::LeftBracket);
273 continue;
274 }
275 if (ch == ']') {
276 emit_token(CppToken::Type::RightBracket);
277 continue;
278 }
279 if (ch == ',') {
280 emit_token(CppToken::Type::Comma);
281 continue;
282 }
283 if (ch == '*') {
284 emit_token(CppToken::Type::Asterisk);
285 continue;
286 }
287 if (ch == ';') {
288 emit_token(CppToken::Type::Semicolon);
289 continue;
290 }
291 if (ch == '#') {
292 begin_token();
293 while (peek() && peek() != '\n')
294 consume();
295 commit_token(CppToken::Type::PreprocessorStatement);
296 continue;
297 }
298 if (ch == '/' && peek(1) == '/') {
299 begin_token();
300 while (peek() && peek() != '\n')
301 consume();
302 commit_token(CppToken::Type::Comment);
303 continue;
304 }
305 if (ch == '/' && peek(1) == '*') {
306 begin_token();
307 consume();
308 consume();
309 bool comment_block_ends = false;
310 while (peek()) {
311 if (peek() == '*' && peek(1) == '/') {
312 comment_block_ends = true;
313 break;
314 }
315
316 consume();
317 }
318
319 if (comment_block_ends) {
320 consume();
321 consume();
322 }
323
324 commit_token(CppToken::Type::Comment);
325 continue;
326 }
327 if (ch == '"') {
328 begin_token();
329 consume();
330 while (peek()) {
331 if (consume() == '"')
332 break;
333 }
334 commit_token(CppToken::Type::DoubleQuotedString);
335 continue;
336 }
337 if (ch == '\'') {
338 begin_token();
339 consume();
340 while (peek()) {
341 if (consume() == '\'')
342 break;
343 }
344 commit_token(CppToken::Type::SingleQuotedString);
345 continue;
346 }
347 if (isdigit(ch)) {
348 begin_token();
349 while (peek() && isdigit(peek())) {
350 consume();
351 }
352 commit_token(CppToken::Type::Number);
353 continue;
354 }
355 if (is_valid_first_character_of_identifier(ch)) {
356 begin_token();
357 while (peek() && is_valid_nonfirst_character_of_identifier(peek()))
358 consume();
359 auto token_view = StringView(m_input.characters_without_null_termination() + token_start_index, m_index - token_start_index);
360 if (is_keyword(token_view))
361 commit_token(CppToken::Type::Keyword);
362 else if (is_known_type(token_view))
363 commit_token(CppToken::Type::KnownType);
364 else
365 commit_token(CppToken::Type::Identifier);
366 continue;
367 }
368 dbg() << "Unimplemented token character: " << ch;
369 emit_token(CppToken::Type::Unknown);
370 }
371 return tokens;
372}
373
374}