Serenity Operating System
at hosted 398 lines 15 kB view raw
1/* 2 * Copyright (c) 2020, Stephan Unverwerth <s.unverwerth@gmx.de> 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions are met: 7 * 8 * 1. Redistributions of source code must retain the above copyright notice, this 9 * list of conditions and the following disclaimer. 10 * 11 * 2. Redistributions in binary form must reproduce the above copyright notice, 12 * this list of conditions and the following disclaimer in the documentation 13 * and/or other materials provided with the distribution. 14 * 15 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 21 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 22 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 23 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 24 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 */ 26 27#include "Lexer.h" 28#include <AK/HashMap.h> 29#include <AK/StringBuilder.h> 30#include <ctype.h> 31#include <stdio.h> 32 33namespace JS { 34 35HashMap<String, TokenType> Lexer::s_keywords; 36HashMap<String, TokenType> Lexer::s_three_char_tokens; 37HashMap<String, TokenType> Lexer::s_two_char_tokens; 38HashMap<char, TokenType> Lexer::s_single_char_tokens; 39 40Lexer::Lexer(StringView source) 41 : m_source(source) 42 , m_current_token(TokenType::Eof, StringView(nullptr), StringView(nullptr), 0, 0) 43{ 44 if (s_keywords.is_empty()) { 45 s_keywords.set("await", TokenType::Await); 46 s_keywords.set("break", TokenType::Break); 47 s_keywords.set("case", TokenType::Case); 48 s_keywords.set("catch", TokenType::Catch); 49 s_keywords.set("class", TokenType::Class); 50 s_keywords.set("const", TokenType::Const); 51 s_keywords.set("continue", TokenType::Continue); 52 s_keywords.set("default", TokenType::Default); 53 s_keywords.set("delete", TokenType::Delete); 54 s_keywords.set("do", TokenType::Do); 55 s_keywords.set("else", TokenType::Else); 56 s_keywords.set("false", TokenType::BoolLiteral); 57 s_keywords.set("finally", TokenType::Finally); 58 s_keywords.set("for", TokenType::For); 59 s_keywords.set("function", TokenType::Function); 60 s_keywords.set("if", TokenType::If); 61 s_keywords.set("in", TokenType::In); 62 s_keywords.set("instanceof", TokenType::Instanceof); 63 s_keywords.set("interface", TokenType::Interface); 64 s_keywords.set("let", TokenType::Let); 65 s_keywords.set("new", TokenType::New); 66 s_keywords.set("null", TokenType::NullLiteral); 67 s_keywords.set("return", TokenType::Return); 68 s_keywords.set("switch", TokenType::Switch); 69 s_keywords.set("throw", TokenType::Throw); 70 s_keywords.set("true", TokenType::BoolLiteral); 71 s_keywords.set("try", TokenType::Try); 72 s_keywords.set("typeof", TokenType::Typeof); 73 s_keywords.set("var", TokenType::Var); 74 s_keywords.set("void", TokenType::Void); 75 s_keywords.set("while", TokenType::While); 76 s_keywords.set("yield", TokenType::Yield); 77 } 78 79 if (s_three_char_tokens.is_empty()) { 80 s_three_char_tokens.set("===", TokenType::EqualsEqualsEquals); 81 s_three_char_tokens.set("!==", TokenType::ExclamationMarkEqualsEquals); 82 s_three_char_tokens.set("**=", TokenType::AsteriskAsteriskEquals); 83 s_three_char_tokens.set("<<=", TokenType::ShiftLeftEquals); 84 s_three_char_tokens.set(">>=", TokenType::ShiftRightEquals); 85 s_three_char_tokens.set(">>>", TokenType::UnsignedShiftRight); 86 } 87 88 if (s_two_char_tokens.is_empty()) { 89 s_two_char_tokens.set("=>", TokenType::Arrow); 90 s_two_char_tokens.set("+=", TokenType::PlusEquals); 91 s_two_char_tokens.set("-=", TokenType::MinusEquals); 92 s_two_char_tokens.set("*=", TokenType::AsteriskEquals); 93 s_two_char_tokens.set("/=", TokenType::SlashEquals); 94 s_two_char_tokens.set("%=", TokenType::PercentEquals); 95 s_two_char_tokens.set("&=", TokenType::AmpersandEquals); 96 s_two_char_tokens.set("|=", TokenType::PipeEquals); 97 s_two_char_tokens.set("&&", TokenType::DoubleAmpersand); 98 s_two_char_tokens.set("||", TokenType::DoublePipe); 99 s_two_char_tokens.set("??", TokenType::DoubleQuestionMark); 100 s_two_char_tokens.set("**", TokenType::DoubleAsterisk); 101 s_two_char_tokens.set("==", TokenType::EqualsEquals); 102 s_two_char_tokens.set("<=", TokenType::LessThanEquals); 103 s_two_char_tokens.set(">=", TokenType::GreaterThanEquals); 104 s_two_char_tokens.set("!=", TokenType::ExclamationMarkEquals); 105 s_two_char_tokens.set("--", TokenType::MinusMinus); 106 s_two_char_tokens.set("++", TokenType::PlusPlus); 107 s_two_char_tokens.set("<<", TokenType::ShiftLeft); 108 s_two_char_tokens.set(">>", TokenType::ShiftRight); 109 s_two_char_tokens.set("?.", TokenType::QuestionMarkPeriod); 110 } 111 112 if (s_single_char_tokens.is_empty()) { 113 s_single_char_tokens.set('&', TokenType::Ampersand); 114 s_single_char_tokens.set('*', TokenType::Asterisk); 115 s_single_char_tokens.set('[', TokenType::BracketOpen); 116 s_single_char_tokens.set(']', TokenType::BracketClose); 117 s_single_char_tokens.set('^', TokenType::Caret); 118 s_single_char_tokens.set(':', TokenType::Colon); 119 s_single_char_tokens.set(',', TokenType::Comma); 120 s_single_char_tokens.set('{', TokenType::CurlyOpen); 121 s_single_char_tokens.set('}', TokenType::CurlyClose); 122 s_single_char_tokens.set('=', TokenType::Equals); 123 s_single_char_tokens.set('!', TokenType::ExclamationMark); 124 s_single_char_tokens.set('-', TokenType::Minus); 125 s_single_char_tokens.set('(', TokenType::ParenOpen); 126 s_single_char_tokens.set(')', TokenType::ParenClose); 127 s_single_char_tokens.set('%', TokenType::Percent); 128 s_single_char_tokens.set('.', TokenType::Period); 129 s_single_char_tokens.set('|', TokenType::Pipe); 130 s_single_char_tokens.set('+', TokenType::Plus); 131 s_single_char_tokens.set('?', TokenType::QuestionMark); 132 s_single_char_tokens.set(';', TokenType::Semicolon); 133 s_single_char_tokens.set('/', TokenType::Slash); 134 s_single_char_tokens.set('~', TokenType::Tilde); 135 s_single_char_tokens.set('<', TokenType::LessThan); 136 s_single_char_tokens.set('>', TokenType::GreaterThan); 137 } 138 consume(); 139} 140 141void Lexer::consume() 142{ 143 if (m_position >= m_source.length()) { 144 m_position = m_source.length() + 1; 145 m_current_char = EOF; 146 return; 147 } 148 149 if (m_current_char == '\n') { 150 m_line_number++; 151 m_line_column = 1; 152 } else { 153 m_line_column++; 154 } 155 156 m_current_char = m_source[m_position++]; 157} 158 159void Lexer::consume_exponent() 160{ 161 consume(); 162 if (m_current_char == '-' || m_current_char == '+') 163 consume(); 164 while (isdigit(m_current_char)) { 165 consume(); 166 } 167} 168 169bool Lexer::is_eof() const 170{ 171 return m_current_char == EOF; 172} 173 174bool Lexer::is_identifier_start() const 175{ 176 return isalpha(m_current_char) || m_current_char == '_' || m_current_char == '$'; 177} 178 179bool Lexer::is_identifier_middle() const 180{ 181 return is_identifier_start() || isdigit(m_current_char); 182} 183 184bool Lexer::is_line_comment_start() const 185{ 186 return m_current_char == '/' && m_position < m_source.length() && m_source[m_position] == '/'; 187} 188 189bool Lexer::is_block_comment_start() const 190{ 191 return m_current_char == '/' && m_position < m_source.length() && m_source[m_position] == '*'; 192} 193 194bool Lexer::is_block_comment_end() const 195{ 196 return m_current_char == '*' && m_position < m_source.length() && m_source[m_position] == '/'; 197} 198 199bool Lexer::is_numeric_literal_start() const 200{ 201 return isdigit(m_current_char) || (m_current_char == '.' && m_position < m_source.length() && isdigit(m_source[m_position])); 202} 203 204void Lexer::syntax_error(const char* msg) 205{ 206 m_has_errors = true; 207 if (m_log_errors) 208 fprintf(stderr, "Syntax Error: %s (line: %zu, column: %zu)\n", msg, m_line_number, m_line_column); 209} 210 211Token Lexer::next() 212{ 213 size_t trivia_start = m_position; 214 215 // consume whitespace and comments 216 while (true) { 217 if (isspace(m_current_char)) { 218 do { 219 consume(); 220 } while (isspace(m_current_char)); 221 } else if (is_line_comment_start()) { 222 consume(); 223 do { 224 consume(); 225 } while (!is_eof() && m_current_char != '\n'); 226 } else if (is_block_comment_start()) { 227 consume(); 228 do { 229 consume(); 230 } while (!is_eof() && !is_block_comment_end()); 231 consume(); // consume * 232 consume(); // consume / 233 } else { 234 break; 235 } 236 } 237 238 size_t value_start = m_position; 239 auto token_type = TokenType::Invalid; 240 241 if (is_identifier_start()) { 242 // identifier or keyword 243 do { 244 consume(); 245 } while (is_identifier_middle()); 246 247 StringView value = m_source.substring_view(value_start - 1, m_position - value_start); 248 auto it = s_keywords.find(value); 249 if (it == s_keywords.end()) { 250 token_type = TokenType::Identifier; 251 } else { 252 token_type = it->value; 253 } 254 } else if (is_numeric_literal_start()) { 255 if (m_current_char == '0') { 256 consume(); 257 if (m_current_char == '.') { 258 // decimal 259 consume(); 260 while (isdigit(m_current_char)) { 261 consume(); 262 } 263 if (m_current_char == 'e' || m_current_char == 'E') { 264 consume_exponent(); 265 } 266 } else if (m_current_char == 'e' || m_current_char == 'E') { 267 consume_exponent(); 268 } else if (m_current_char == 'o' || m_current_char == 'O') { 269 // octal 270 consume(); 271 while (m_current_char >= '0' && m_current_char <= '7') { 272 consume(); 273 } 274 } else if (m_current_char == 'b' || m_current_char == 'B') { 275 // binary 276 consume(); 277 while (m_current_char == '0' || m_current_char == '1') { 278 consume(); 279 } 280 } else if (m_current_char == 'x' || m_current_char == 'X') { 281 // hexadecimal 282 consume(); 283 while (isxdigit(m_current_char)) { 284 consume(); 285 } 286 } else if (isdigit(m_current_char)) { 287 // octal without 'O' prefix. Forbidden in 'strict mode' 288 // FIXME: We need to make sure this produces a syntax error when in strict mode 289 do { 290 consume(); 291 } while (isdigit(m_current_char)); 292 } 293 } else { 294 // 1...9 or period 295 while (isdigit(m_current_char)) { 296 consume(); 297 } 298 if (m_current_char == '.') { 299 consume(); 300 while (isdigit(m_current_char)) { 301 consume(); 302 } 303 } 304 if (m_current_char == 'e' || m_current_char == 'E') { 305 consume_exponent(); 306 } 307 } 308 token_type = TokenType::NumericLiteral; 309 } else if (m_current_char == '"' || m_current_char == '\'') { 310 char stop_char = m_current_char; 311 consume(); 312 while (m_current_char != stop_char && m_current_char != '\n' && !is_eof()) { 313 if (m_current_char == '\\') { 314 consume(); 315 } 316 consume(); 317 } 318 if (m_current_char != stop_char) { 319 syntax_error("unterminated string literal"); 320 token_type = TokenType::UnterminatedStringLiteral; 321 } else { 322 consume(); 323 token_type = TokenType::StringLiteral; 324 } 325 } else if (m_current_char == EOF) { 326 token_type = TokenType::Eof; 327 } else { 328 // There is only one four-char operator: >>>= 329 bool found_four_char_token = false; 330 if (m_position + 2 < m_source.length()) { 331 if (m_current_char == '>' 332 && m_source[m_position] == '>' 333 && m_source[m_position + 1] == '>' 334 && m_source[m_position + 2] == '=') { 335 336 found_four_char_token = true; 337 consume(); 338 consume(); 339 consume(); 340 consume(); 341 token_type = TokenType::UnsignedShiftRightEquals; 342 } 343 } 344 345 bool found_three_char_token = false; 346 if (!found_four_char_token && m_position + 1 < m_source.length()) { 347 char second_char = m_source[m_position]; 348 char third_char = m_source[m_position + 1]; 349 char three_chars[] { (char)m_current_char, second_char, third_char, 0 }; 350 auto it = s_three_char_tokens.find(three_chars); 351 if (it != s_three_char_tokens.end()) { 352 found_three_char_token = true; 353 consume(); 354 consume(); 355 consume(); 356 token_type = it->value; 357 } 358 } 359 360 bool found_two_char_token = false; 361 if (!found_four_char_token && !found_three_char_token && m_position < m_source.length()) { 362 char second_char = m_source[m_position]; 363 char two_chars[] { (char)m_current_char, second_char, 0 }; 364 auto it = s_two_char_tokens.find(two_chars); 365 if (it != s_two_char_tokens.end()) { 366 found_two_char_token = true; 367 consume(); 368 consume(); 369 token_type = it->value; 370 } 371 } 372 373 bool found_one_char_token = false; 374 if (!found_four_char_token && !found_three_char_token && !found_two_char_token) { 375 auto it = s_single_char_tokens.find(m_current_char); 376 if (it != s_single_char_tokens.end()) { 377 found_one_char_token = true; 378 consume(); 379 token_type = it->value; 380 } 381 } 382 383 if (!found_four_char_token && !found_three_char_token && !found_two_char_token && !found_one_char_token) { 384 consume(); 385 token_type = TokenType::Invalid; 386 } 387 } 388 389 m_current_token = Token( 390 token_type, 391 m_source.substring_view(trivia_start - 1, value_start - trivia_start), 392 m_source.substring_view(value_start - 1, m_position - value_start), 393 m_line_number, 394 m_line_column - m_position + value_start); 395 396 return m_current_token; 397} 398}