Serenity Operating System
1/*
2 * Copyright (c) 2020, Stephan Unverwerth <s.unverwerth@gmx.de>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * 1. Redistributions of source code must retain the above copyright notice, this
9 * list of conditions and the following disclaimer.
10 *
11 * 2. Redistributions in binary form must reproduce the above copyright notice,
12 * this list of conditions and the following disclaimer in the documentation
13 * and/or other materials provided with the distribution.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
16 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
21 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
22 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
23 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 */
26
27#include "Lexer.h"
28#include <AK/HashMap.h>
29#include <AK/StringBuilder.h>
30#include <ctype.h>
31#include <stdio.h>
32
33namespace JS {
34
35HashMap<String, TokenType> Lexer::s_keywords;
36HashMap<String, TokenType> Lexer::s_three_char_tokens;
37HashMap<String, TokenType> Lexer::s_two_char_tokens;
38HashMap<char, TokenType> Lexer::s_single_char_tokens;
39
40Lexer::Lexer(StringView source)
41 : m_source(source)
42 , m_current_token(TokenType::Eof, StringView(nullptr), StringView(nullptr), 0, 0)
43{
44 if (s_keywords.is_empty()) {
45 s_keywords.set("await", TokenType::Await);
46 s_keywords.set("break", TokenType::Break);
47 s_keywords.set("case", TokenType::Case);
48 s_keywords.set("catch", TokenType::Catch);
49 s_keywords.set("class", TokenType::Class);
50 s_keywords.set("const", TokenType::Const);
51 s_keywords.set("continue", TokenType::Continue);
52 s_keywords.set("default", TokenType::Default);
53 s_keywords.set("delete", TokenType::Delete);
54 s_keywords.set("do", TokenType::Do);
55 s_keywords.set("else", TokenType::Else);
56 s_keywords.set("false", TokenType::BoolLiteral);
57 s_keywords.set("finally", TokenType::Finally);
58 s_keywords.set("for", TokenType::For);
59 s_keywords.set("function", TokenType::Function);
60 s_keywords.set("if", TokenType::If);
61 s_keywords.set("in", TokenType::In);
62 s_keywords.set("instanceof", TokenType::Instanceof);
63 s_keywords.set("interface", TokenType::Interface);
64 s_keywords.set("let", TokenType::Let);
65 s_keywords.set("new", TokenType::New);
66 s_keywords.set("null", TokenType::NullLiteral);
67 s_keywords.set("return", TokenType::Return);
68 s_keywords.set("switch", TokenType::Switch);
69 s_keywords.set("throw", TokenType::Throw);
70 s_keywords.set("true", TokenType::BoolLiteral);
71 s_keywords.set("try", TokenType::Try);
72 s_keywords.set("typeof", TokenType::Typeof);
73 s_keywords.set("var", TokenType::Var);
74 s_keywords.set("void", TokenType::Void);
75 s_keywords.set("while", TokenType::While);
76 s_keywords.set("yield", TokenType::Yield);
77 }
78
79 if (s_three_char_tokens.is_empty()) {
80 s_three_char_tokens.set("===", TokenType::EqualsEqualsEquals);
81 s_three_char_tokens.set("!==", TokenType::ExclamationMarkEqualsEquals);
82 s_three_char_tokens.set("**=", TokenType::AsteriskAsteriskEquals);
83 s_three_char_tokens.set("<<=", TokenType::ShiftLeftEquals);
84 s_three_char_tokens.set(">>=", TokenType::ShiftRightEquals);
85 s_three_char_tokens.set(">>>", TokenType::UnsignedShiftRight);
86 }
87
88 if (s_two_char_tokens.is_empty()) {
89 s_two_char_tokens.set("=>", TokenType::Arrow);
90 s_two_char_tokens.set("+=", TokenType::PlusEquals);
91 s_two_char_tokens.set("-=", TokenType::MinusEquals);
92 s_two_char_tokens.set("*=", TokenType::AsteriskEquals);
93 s_two_char_tokens.set("/=", TokenType::SlashEquals);
94 s_two_char_tokens.set("%=", TokenType::PercentEquals);
95 s_two_char_tokens.set("&=", TokenType::AmpersandEquals);
96 s_two_char_tokens.set("|=", TokenType::PipeEquals);
97 s_two_char_tokens.set("&&", TokenType::DoubleAmpersand);
98 s_two_char_tokens.set("||", TokenType::DoublePipe);
99 s_two_char_tokens.set("??", TokenType::DoubleQuestionMark);
100 s_two_char_tokens.set("**", TokenType::DoubleAsterisk);
101 s_two_char_tokens.set("==", TokenType::EqualsEquals);
102 s_two_char_tokens.set("<=", TokenType::LessThanEquals);
103 s_two_char_tokens.set(">=", TokenType::GreaterThanEquals);
104 s_two_char_tokens.set("!=", TokenType::ExclamationMarkEquals);
105 s_two_char_tokens.set("--", TokenType::MinusMinus);
106 s_two_char_tokens.set("++", TokenType::PlusPlus);
107 s_two_char_tokens.set("<<", TokenType::ShiftLeft);
108 s_two_char_tokens.set(">>", TokenType::ShiftRight);
109 s_two_char_tokens.set("?.", TokenType::QuestionMarkPeriod);
110 }
111
112 if (s_single_char_tokens.is_empty()) {
113 s_single_char_tokens.set('&', TokenType::Ampersand);
114 s_single_char_tokens.set('*', TokenType::Asterisk);
115 s_single_char_tokens.set('[', TokenType::BracketOpen);
116 s_single_char_tokens.set(']', TokenType::BracketClose);
117 s_single_char_tokens.set('^', TokenType::Caret);
118 s_single_char_tokens.set(':', TokenType::Colon);
119 s_single_char_tokens.set(',', TokenType::Comma);
120 s_single_char_tokens.set('{', TokenType::CurlyOpen);
121 s_single_char_tokens.set('}', TokenType::CurlyClose);
122 s_single_char_tokens.set('=', TokenType::Equals);
123 s_single_char_tokens.set('!', TokenType::ExclamationMark);
124 s_single_char_tokens.set('-', TokenType::Minus);
125 s_single_char_tokens.set('(', TokenType::ParenOpen);
126 s_single_char_tokens.set(')', TokenType::ParenClose);
127 s_single_char_tokens.set('%', TokenType::Percent);
128 s_single_char_tokens.set('.', TokenType::Period);
129 s_single_char_tokens.set('|', TokenType::Pipe);
130 s_single_char_tokens.set('+', TokenType::Plus);
131 s_single_char_tokens.set('?', TokenType::QuestionMark);
132 s_single_char_tokens.set(';', TokenType::Semicolon);
133 s_single_char_tokens.set('/', TokenType::Slash);
134 s_single_char_tokens.set('~', TokenType::Tilde);
135 s_single_char_tokens.set('<', TokenType::LessThan);
136 s_single_char_tokens.set('>', TokenType::GreaterThan);
137 }
138 consume();
139}
140
141void Lexer::consume()
142{
143 if (m_position >= m_source.length()) {
144 m_position = m_source.length() + 1;
145 m_current_char = EOF;
146 return;
147 }
148
149 if (m_current_char == '\n') {
150 m_line_number++;
151 m_line_column = 1;
152 } else {
153 m_line_column++;
154 }
155
156 m_current_char = m_source[m_position++];
157}
158
159void Lexer::consume_exponent()
160{
161 consume();
162 if (m_current_char == '-' || m_current_char == '+')
163 consume();
164 while (isdigit(m_current_char)) {
165 consume();
166 }
167}
168
169bool Lexer::is_eof() const
170{
171 return m_current_char == EOF;
172}
173
174bool Lexer::is_identifier_start() const
175{
176 return isalpha(m_current_char) || m_current_char == '_' || m_current_char == '$';
177}
178
179bool Lexer::is_identifier_middle() const
180{
181 return is_identifier_start() || isdigit(m_current_char);
182}
183
184bool Lexer::is_line_comment_start() const
185{
186 return m_current_char == '/' && m_position < m_source.length() && m_source[m_position] == '/';
187}
188
189bool Lexer::is_block_comment_start() const
190{
191 return m_current_char == '/' && m_position < m_source.length() && m_source[m_position] == '*';
192}
193
194bool Lexer::is_block_comment_end() const
195{
196 return m_current_char == '*' && m_position < m_source.length() && m_source[m_position] == '/';
197}
198
199bool Lexer::is_numeric_literal_start() const
200{
201 return isdigit(m_current_char) || (m_current_char == '.' && m_position < m_source.length() && isdigit(m_source[m_position]));
202}
203
204void Lexer::syntax_error(const char* msg)
205{
206 m_has_errors = true;
207 if (m_log_errors)
208 fprintf(stderr, "Syntax Error: %s (line: %zu, column: %zu)\n", msg, m_line_number, m_line_column);
209}
210
211Token Lexer::next()
212{
213 size_t trivia_start = m_position;
214
215 // consume whitespace and comments
216 while (true) {
217 if (isspace(m_current_char)) {
218 do {
219 consume();
220 } while (isspace(m_current_char));
221 } else if (is_line_comment_start()) {
222 consume();
223 do {
224 consume();
225 } while (!is_eof() && m_current_char != '\n');
226 } else if (is_block_comment_start()) {
227 consume();
228 do {
229 consume();
230 } while (!is_eof() && !is_block_comment_end());
231 consume(); // consume *
232 consume(); // consume /
233 } else {
234 break;
235 }
236 }
237
238 size_t value_start = m_position;
239 auto token_type = TokenType::Invalid;
240
241 if (is_identifier_start()) {
242 // identifier or keyword
243 do {
244 consume();
245 } while (is_identifier_middle());
246
247 StringView value = m_source.substring_view(value_start - 1, m_position - value_start);
248 auto it = s_keywords.find(value);
249 if (it == s_keywords.end()) {
250 token_type = TokenType::Identifier;
251 } else {
252 token_type = it->value;
253 }
254 } else if (is_numeric_literal_start()) {
255 if (m_current_char == '0') {
256 consume();
257 if (m_current_char == '.') {
258 // decimal
259 consume();
260 while (isdigit(m_current_char)) {
261 consume();
262 }
263 if (m_current_char == 'e' || m_current_char == 'E') {
264 consume_exponent();
265 }
266 } else if (m_current_char == 'e' || m_current_char == 'E') {
267 consume_exponent();
268 } else if (m_current_char == 'o' || m_current_char == 'O') {
269 // octal
270 consume();
271 while (m_current_char >= '0' && m_current_char <= '7') {
272 consume();
273 }
274 } else if (m_current_char == 'b' || m_current_char == 'B') {
275 // binary
276 consume();
277 while (m_current_char == '0' || m_current_char == '1') {
278 consume();
279 }
280 } else if (m_current_char == 'x' || m_current_char == 'X') {
281 // hexadecimal
282 consume();
283 while (isxdigit(m_current_char)) {
284 consume();
285 }
286 } else if (isdigit(m_current_char)) {
287 // octal without 'O' prefix. Forbidden in 'strict mode'
288 // FIXME: We need to make sure this produces a syntax error when in strict mode
289 do {
290 consume();
291 } while (isdigit(m_current_char));
292 }
293 } else {
294 // 1...9 or period
295 while (isdigit(m_current_char)) {
296 consume();
297 }
298 if (m_current_char == '.') {
299 consume();
300 while (isdigit(m_current_char)) {
301 consume();
302 }
303 }
304 if (m_current_char == 'e' || m_current_char == 'E') {
305 consume_exponent();
306 }
307 }
308 token_type = TokenType::NumericLiteral;
309 } else if (m_current_char == '"' || m_current_char == '\'') {
310 char stop_char = m_current_char;
311 consume();
312 while (m_current_char != stop_char && m_current_char != '\n' && !is_eof()) {
313 if (m_current_char == '\\') {
314 consume();
315 }
316 consume();
317 }
318 if (m_current_char != stop_char) {
319 syntax_error("unterminated string literal");
320 token_type = TokenType::UnterminatedStringLiteral;
321 } else {
322 consume();
323 token_type = TokenType::StringLiteral;
324 }
325 } else if (m_current_char == EOF) {
326 token_type = TokenType::Eof;
327 } else {
328 // There is only one four-char operator: >>>=
329 bool found_four_char_token = false;
330 if (m_position + 2 < m_source.length()) {
331 if (m_current_char == '>'
332 && m_source[m_position] == '>'
333 && m_source[m_position + 1] == '>'
334 && m_source[m_position + 2] == '=') {
335
336 found_four_char_token = true;
337 consume();
338 consume();
339 consume();
340 consume();
341 token_type = TokenType::UnsignedShiftRightEquals;
342 }
343 }
344
345 bool found_three_char_token = false;
346 if (!found_four_char_token && m_position + 1 < m_source.length()) {
347 char second_char = m_source[m_position];
348 char third_char = m_source[m_position + 1];
349 char three_chars[] { (char)m_current_char, second_char, third_char, 0 };
350 auto it = s_three_char_tokens.find(three_chars);
351 if (it != s_three_char_tokens.end()) {
352 found_three_char_token = true;
353 consume();
354 consume();
355 consume();
356 token_type = it->value;
357 }
358 }
359
360 bool found_two_char_token = false;
361 if (!found_four_char_token && !found_three_char_token && m_position < m_source.length()) {
362 char second_char = m_source[m_position];
363 char two_chars[] { (char)m_current_char, second_char, 0 };
364 auto it = s_two_char_tokens.find(two_chars);
365 if (it != s_two_char_tokens.end()) {
366 found_two_char_token = true;
367 consume();
368 consume();
369 token_type = it->value;
370 }
371 }
372
373 bool found_one_char_token = false;
374 if (!found_four_char_token && !found_three_char_token && !found_two_char_token) {
375 auto it = s_single_char_tokens.find(m_current_char);
376 if (it != s_single_char_tokens.end()) {
377 found_one_char_token = true;
378 consume();
379 token_type = it->value;
380 }
381 }
382
383 if (!found_four_char_token && !found_three_char_token && !found_two_char_token && !found_one_char_token) {
384 consume();
385 token_type = TokenType::Invalid;
386 }
387 }
388
389 m_current_token = Token(
390 token_type,
391 m_source.substring_view(trivia_start - 1, value_start - trivia_start),
392 m_source.substring_view(value_start - 1, m_position - value_start),
393 m_line_number,
394 m_line_column - m_position + value_start);
395
396 return m_current_token;
397}
398}