Serenity Operating System
at master 410 lines 12 kB view raw
1/* 2 * Copyright (c) 2021, Tim Flynn <trflynn89@serenityos.org> 3 * Copyright (c) 2021, Jan de Visser <jan@de-visser.net> 4 * 5 * SPDX-License-Identifier: BSD-2-Clause 6 */ 7 8#include "Lexer.h" 9#include <AK/Debug.h> 10#include <ctype.h> 11 12namespace SQL::AST { 13 14HashMap<DeprecatedString, TokenType> Lexer::s_keywords; 15HashMap<char, TokenType> Lexer::s_one_char_tokens; 16HashMap<DeprecatedString, TokenType> Lexer::s_two_char_tokens; 17 18Lexer::Lexer(StringView source) 19 : m_source(source) 20{ 21 if (s_keywords.is_empty()) { 22#define __ENUMERATE_SQL_TOKEN(value, type, category) \ 23 if (TokenCategory::category == TokenCategory::Keyword) \ 24 s_keywords.set(value, TokenType::type); 25 ENUMERATE_SQL_TOKENS 26#undef __ENUMERATE_SQL_TOKEN 27 } 28 29 if (s_one_char_tokens.is_empty()) { 30#define __ENUMERATE_SQL_TOKEN(value, type, category) \ 31 if (TokenCategory::category != TokenCategory::Keyword && value##sv.length() == 1) \ 32 s_one_char_tokens.set(value[0], TokenType::type); 33 ENUMERATE_SQL_TOKENS 34#undef __ENUMERATE_SQL_TOKEN 35 } 36 37 if (s_two_char_tokens.is_empty()) { 38#define __ENUMERATE_SQL_TOKEN(value, type, category) \ 39 if (TokenCategory::category != TokenCategory::Keyword && value##sv.length() == 2) \ 40 s_two_char_tokens.set(value, TokenType::type); 41 ENUMERATE_SQL_TOKENS 42#undef __ENUMERATE_SQL_TOKEN 43 } 44 45 consume(); 46} 47 48Token Lexer::next() 49{ 50 bool found_invalid_comment = consume_whitespace_and_comments(); 51 52 size_t value_start_line_number = m_line_number; 53 size_t value_start_column_number = m_line_column; 54 auto token_type = TokenType::Invalid; 55 StringBuilder current_token; 56 57 if (is_eof()) { 58 token_type = found_invalid_comment ? TokenType::Invalid : TokenType::Eof; 59 } else if (is_numeric_literal_start()) { 60 token_type = TokenType::NumericLiteral; 61 if (!consume_numeric_literal(current_token)) 62 token_type = TokenType::Invalid; 63 } else if (is_string_literal_start()) { 64 token_type = TokenType::StringLiteral; 65 if (!consume_string_literal(current_token)) 66 token_type = TokenType::Invalid; 67 } else if (is_quoted_identifier_start()) { 68 token_type = TokenType::Identifier; 69 if (!consume_quoted_identifier(current_token)) 70 token_type = TokenType::Invalid; 71 } else if (is_blob_literal_start()) { 72 token_type = TokenType::BlobLiteral; 73 if (!consume_blob_literal(current_token)) 74 token_type = TokenType::Invalid; 75 } else if (is_identifier_start()) { 76 do { 77 current_token.append((char)toupper(m_current_char)); 78 consume(); 79 } while (is_identifier_middle()); 80 81 if (auto it = s_keywords.find(current_token.string_view()); it != s_keywords.end()) { 82 token_type = it->value; 83 } else { 84 token_type = TokenType::Identifier; 85 } 86 } else { 87 bool found_two_char_token = false; 88 if (m_position < m_source.length()) { 89 if (auto it = s_two_char_tokens.find(m_source.substring_view(m_position - 1, 2)); it != s_two_char_tokens.end()) { 90 found_two_char_token = true; 91 token_type = it->value; 92 consume(&current_token); 93 consume(&current_token); 94 } 95 } 96 97 bool found_one_char_token = false; 98 if (!found_two_char_token) { 99 if (auto it = s_one_char_tokens.find(m_current_char); it != s_one_char_tokens.end()) { 100 found_one_char_token = true; 101 token_type = it->value; 102 consume(&current_token); 103 } 104 } 105 106 if (!found_two_char_token && !found_one_char_token) { 107 token_type = TokenType::Invalid; 108 consume(&current_token); 109 } 110 } 111 112 Token token(token_type, current_token.to_deprecated_string(), 113 { value_start_line_number, value_start_column_number }, 114 { m_line_number, m_line_column }); 115 116 if constexpr (SQL_DEBUG) { 117 dbgln("------------------------------"); 118 dbgln("Token: {}", token.name()); 119 dbgln("Value: {}", token.value()); 120 dbgln("Line: {}, Column: {}", token.start_position().line, token.start_position().column); 121 dbgln("------------------------------"); 122 } 123 124 return token; 125} 126 127void Lexer::consume(StringBuilder* current_token) 128{ 129 auto did_reach_eof = [this] { 130 if (m_position != m_source.length()) 131 return false; 132 m_eof = true; 133 m_current_char = '\0'; 134 ++m_line_column; 135 ++m_position; 136 return true; 137 }; 138 139 if (current_token) 140 current_token->append(m_current_char); 141 142 if (m_position > m_source.length()) 143 return; 144 145 if (did_reach_eof()) 146 return; 147 148 if (is_line_break()) { 149 ++m_line_number; 150 m_line_column = 1; 151 } else { 152 ++m_line_column; 153 } 154 155 m_current_char = m_source[m_position++]; 156} 157 158bool Lexer::consume_whitespace_and_comments() 159{ 160 bool found_invalid_comment = false; 161 162 while (true) { 163 if (isspace(m_current_char)) { 164 do { 165 consume(); 166 } while (isspace(m_current_char)); 167 } else if (is_line_comment_start()) { 168 consume(); 169 do { 170 consume(); 171 } while (!is_eof() && !is_line_break()); 172 } else if (is_block_comment_start()) { 173 consume(); 174 do { 175 consume(); 176 } while (!is_eof() && !is_block_comment_end()); 177 if (is_eof()) 178 found_invalid_comment = true; 179 consume(); // consume * 180 if (is_eof()) 181 found_invalid_comment = true; 182 consume(); // consume / 183 } else { 184 break; 185 } 186 } 187 188 return found_invalid_comment; 189} 190 191bool Lexer::consume_numeric_literal(StringBuilder& current_token) 192{ 193 // https://sqlite.org/syntax/numeric-literal.html 194 bool is_valid_numeric_literal = true; 195 196 if (m_current_char == '0') { 197 consume(&current_token); 198 if (m_current_char == '.') { 199 consume(&current_token); 200 while (isdigit(m_current_char)) 201 consume(&current_token); 202 if (m_current_char == 'e' || m_current_char == 'E') 203 is_valid_numeric_literal = consume_exponent(current_token); 204 } else if (m_current_char == 'e' || m_current_char == 'E') { 205 is_valid_numeric_literal = consume_exponent(current_token); 206 } else if (m_current_char == 'x' || m_current_char == 'X') { 207 is_valid_numeric_literal = consume_hexadecimal_number(current_token); 208 } else if (isdigit(m_current_char)) { 209 do { 210 consume(&current_token); 211 } while (isdigit(m_current_char)); 212 } 213 } else { 214 do { 215 consume(&current_token); 216 } while (isdigit(m_current_char)); 217 218 if (m_current_char == '.') { 219 consume(&current_token); 220 while (isdigit(m_current_char)) 221 consume(&current_token); 222 } 223 if (m_current_char == 'e' || m_current_char == 'E') 224 is_valid_numeric_literal = consume_exponent(current_token); 225 } 226 227 return is_valid_numeric_literal; 228} 229 230bool Lexer::consume_string_literal(StringBuilder& current_token) 231{ 232 // https://sqlite.org/lang_expr.html - See "3. Literal Values (Constants)" 233 bool is_valid_string_literal = true; 234 235 // Skip the opening single quote: 236 consume(); 237 238 while (!is_eof() && !is_string_literal_end()) { 239 // If both the current character and the next one are single quotes, 240 // consume one single quote into the current token, and drop the 241 // other one on the floor: 242 if (match('\'', '\'')) 243 consume(); 244 consume(&current_token); 245 } 246 247 if (is_eof()) 248 is_valid_string_literal = false; 249 // Drop the closing quote on the floor: 250 consume(); 251 252 return is_valid_string_literal; 253} 254 255bool Lexer::consume_quoted_identifier(StringBuilder& current_token) 256{ 257 // I have not found a reference to the syntax for identifiers in the 258 // SQLite documentation, but PostgreSQL has this: 259 // https://www.postgresql.org/docs/current/sql-syntax-lexical.html#SQL-SYNTAX-IDENTIFIERS 260 bool is_valid_identifier = true; 261 262 // Skip the opening double quote: 263 consume(); 264 265 while (!is_eof() && !is_quoted_identifier_end()) { 266 // If both the current character and the next one are double quotes, 267 // consume one single quote into the current token, and drop the 268 // other one on the floor: 269 if (match('"', '"')) 270 consume(); 271 consume(&current_token); 272 } 273 274 if (is_eof()) 275 is_valid_identifier = false; 276 // Drop the closing double quote on the floor: 277 consume(); 278 279 return is_valid_identifier; 280} 281 282bool Lexer::consume_blob_literal(StringBuilder& current_token) 283{ 284 // https://sqlite.org/lang_expr.html - See "3. Literal Values (Constants)" 285 286 // Skip starting 'X'/'x' character: 287 consume(); 288 289 if (!consume_string_literal(current_token)) 290 return false; 291 for (auto ix = 0u; ix < current_token.length(); ix++) { 292 if (!isxdigit(current_token.string_view()[ix])) 293 return false; 294 } 295 return true; 296} 297 298bool Lexer::consume_exponent(StringBuilder& current_token) 299{ 300 consume(&current_token); 301 if (m_current_char == '-' || m_current_char == '+') 302 consume(&current_token); 303 304 if (!isdigit(m_current_char)) 305 return false; 306 307 // FIXME This code results in the string "1e" being rejected as a 308 // malformed numeric literal. We do however accept "1a" which 309 // is inconsistent. We have to decide what we want to do: 310 // - Be like `SQLite` and reject both "1a" and "1e" because we 311 // require a space between the two tokens. This is pretty invasive; 312 // we would have to decide where all spaces are required and fix 313 // the lexer accordingly. 314 // - Be like `PostgreSQL` and accept both "1e" and "1a" as two 315 // separate tokens, and accept "1e3" as a single token. This would 316 // would require pushing back the "e" we lexed here, terminate the 317 // numeric literal, and re-process the "e" as the first char of 318 // a new token. 319 while (isdigit(m_current_char)) { 320 consume(&current_token); 321 } 322 return true; 323} 324 325bool Lexer::consume_hexadecimal_number(StringBuilder& current_token) 326{ 327 consume(&current_token); 328 if (!isxdigit(m_current_char)) 329 return false; 330 331 while (isxdigit(m_current_char)) 332 consume(&current_token); 333 334 return true; 335} 336 337bool Lexer::match(char a, char b) const 338{ 339 if (m_position >= m_source.length()) 340 return false; 341 342 return m_current_char == a && m_source[m_position] == b; 343} 344 345bool Lexer::is_identifier_start() const 346{ 347 return isalpha(m_current_char) || m_current_char == '_'; 348} 349 350bool Lexer::is_identifier_middle() const 351{ 352 return is_identifier_start() || isdigit(m_current_char); 353} 354 355bool Lexer::is_numeric_literal_start() const 356{ 357 return isdigit(m_current_char) || (m_current_char == '.' && m_position < m_source.length() && isdigit(m_source[m_position])); 358} 359 360bool Lexer::is_string_literal_start() const 361{ 362 return m_current_char == '\''; 363} 364 365bool Lexer::is_string_literal_end() const 366{ 367 return m_current_char == '\'' && !(m_position < m_source.length() && m_source[m_position] == '\''); 368} 369 370bool Lexer::is_quoted_identifier_start() const 371{ 372 return m_current_char == '"'; 373} 374 375bool Lexer::is_quoted_identifier_end() const 376{ 377 return m_current_char == '"' && !(m_position < m_source.length() && m_source[m_position] == '"'); 378} 379 380bool Lexer::is_blob_literal_start() const 381{ 382 return match('x', '\'') || match('X', '\''); 383} 384 385bool Lexer::is_line_comment_start() const 386{ 387 return match('-', '-'); 388} 389 390bool Lexer::is_block_comment_start() const 391{ 392 return match('/', '*'); 393} 394 395bool Lexer::is_block_comment_end() const 396{ 397 return match('*', '/'); 398} 399 400bool Lexer::is_line_break() const 401{ 402 return m_current_char == '\n'; 403} 404 405bool Lexer::is_eof() const 406{ 407 return m_eof; 408} 409 410}