Userland/Libraries/LibSQL/AST/Lexer.cpp at master

jcs.org / serenity
fork atom
Serenity Operating System
fork atom
serenity / Userland / Libraries / LibSQL / AST / Lexer.cpp
at master 410 lines 12 kB view raw
wrap content
Linus Groh AK: Remove StringBuilder::build() in favor of to_deprecated_string() 3y ago
6e745932
  1/*
  2 * Copyright (c) 2021, Tim Flynn <trflynn89@serenityos.org>
  3 * Copyright (c) 2021, Jan de Visser <jan@de-visser.net>
  4 *
  5 * SPDX-License-Identifier: BSD-2-Clause
  6 */
  7
  8#include "Lexer.h"
  9#include <AK/Debug.h>
 10#include <ctype.h>
 11
 12namespace SQL::AST {
 13
 14HashMap<DeprecatedString, TokenType> Lexer::s_keywords;
 15HashMap<char, TokenType> Lexer::s_one_char_tokens;
 16HashMap<DeprecatedString, TokenType> Lexer::s_two_char_tokens;
 17
 18Lexer::Lexer(StringView source)
 19    : m_source(source)
 20{
 21    if (s_keywords.is_empty()) {
 22#define __ENUMERATE_SQL_TOKEN(value, type, category)       \
 23    if (TokenCategory::category == TokenCategory::Keyword) \
 24        s_keywords.set(value, TokenType::type);
 25        ENUMERATE_SQL_TOKENS
 26#undef __ENUMERATE_SQL_TOKEN
 27    }
 28
 29    if (s_one_char_tokens.is_empty()) {
 30#define __ENUMERATE_SQL_TOKEN(value, type, category)                                  \
 31    if (TokenCategory::category != TokenCategory::Keyword && value##sv.length() == 1) \
 32        s_one_char_tokens.set(value[0], TokenType::type);
 33        ENUMERATE_SQL_TOKENS
 34#undef __ENUMERATE_SQL_TOKEN
 35    }
 36
 37    if (s_two_char_tokens.is_empty()) {
 38#define __ENUMERATE_SQL_TOKEN(value, type, category)                                  \
 39    if (TokenCategory::category != TokenCategory::Keyword && value##sv.length() == 2) \
 40        s_two_char_tokens.set(value, TokenType::type);
 41        ENUMERATE_SQL_TOKENS
 42#undef __ENUMERATE_SQL_TOKEN
 43    }
 44
 45    consume();
 46}
 47
 48Token Lexer::next()
 49{
 50    bool found_invalid_comment = consume_whitespace_and_comments();
 51
 52    size_t value_start_line_number = m_line_number;
 53    size_t value_start_column_number = m_line_column;
 54    auto token_type = TokenType::Invalid;
 55    StringBuilder current_token;
 56
 57    if (is_eof()) {
 58        token_type = found_invalid_comment ? TokenType::Invalid : TokenType::Eof;
 59    } else if (is_numeric_literal_start()) {
 60        token_type = TokenType::NumericLiteral;
 61        if (!consume_numeric_literal(current_token))
 62            token_type = TokenType::Invalid;
 63    } else if (is_string_literal_start()) {
 64        token_type = TokenType::StringLiteral;
 65        if (!consume_string_literal(current_token))
 66            token_type = TokenType::Invalid;
 67    } else if (is_quoted_identifier_start()) {
 68        token_type = TokenType::Identifier;
 69        if (!consume_quoted_identifier(current_token))
 70            token_type = TokenType::Invalid;
 71    } else if (is_blob_literal_start()) {
 72        token_type = TokenType::BlobLiteral;
 73        if (!consume_blob_literal(current_token))
 74            token_type = TokenType::Invalid;
 75    } else if (is_identifier_start()) {
 76        do {
 77            current_token.append((char)toupper(m_current_char));
 78            consume();
 79        } while (is_identifier_middle());
 80
 81        if (auto it = s_keywords.find(current_token.string_view()); it != s_keywords.end()) {
 82            token_type = it->value;
 83        } else {
 84            token_type = TokenType::Identifier;
 85        }
 86    } else {
 87        bool found_two_char_token = false;
 88        if (m_position < m_source.length()) {
 89            if (auto it = s_two_char_tokens.find(m_source.substring_view(m_position - 1, 2)); it != s_two_char_tokens.end()) {
 90                found_two_char_token = true;
 91                token_type = it->value;
 92                consume(&current_token);
 93                consume(&current_token);
 94            }
 95        }
 96
 97        bool found_one_char_token = false;
 98        if (!found_two_char_token) {
 99            if (auto it = s_one_char_tokens.find(m_current_char); it != s_one_char_tokens.end()) {
100                found_one_char_token = true;
101                token_type = it->value;
102                consume(&current_token);
103            }
104        }
105
106        if (!found_two_char_token && !found_one_char_token) {
107            token_type = TokenType::Invalid;
108            consume(&current_token);
109        }
110    }
111
112    Token token(token_type, current_token.to_deprecated_string(),
113        { value_start_line_number, value_start_column_number },
114        { m_line_number, m_line_column });
115
116    if constexpr (SQL_DEBUG) {
117        dbgln("------------------------------");
118        dbgln("Token: {}", token.name());
119        dbgln("Value: {}", token.value());
120        dbgln("Line: {}, Column: {}", token.start_position().line, token.start_position().column);
121        dbgln("------------------------------");
122    }
123
124    return token;
125}
126
127void Lexer::consume(StringBuilder* current_token)
128{
129    auto did_reach_eof = [this] {
130        if (m_position != m_source.length())
131            return false;
132        m_eof = true;
133        m_current_char = '\0';
134        ++m_line_column;
135        ++m_position;
136        return true;
137    };
138
139    if (current_token)
140        current_token->append(m_current_char);
141
142    if (m_position > m_source.length())
143        return;
144
145    if (did_reach_eof())
146        return;
147
148    if (is_line_break()) {
149        ++m_line_number;
150        m_line_column = 1;
151    } else {
152        ++m_line_column;
153    }
154
155    m_current_char = m_source[m_position++];
156}
157
158bool Lexer::consume_whitespace_and_comments()
159{
160    bool found_invalid_comment = false;
161
162    while (true) {
163        if (isspace(m_current_char)) {
164            do {
165                consume();
166            } while (isspace(m_current_char));
167        } else if (is_line_comment_start()) {
168            consume();
169            do {
170                consume();
171            } while (!is_eof() && !is_line_break());
172        } else if (is_block_comment_start()) {
173            consume();
174            do {
175                consume();
176            } while (!is_eof() && !is_block_comment_end());
177            if (is_eof())
178                found_invalid_comment = true;
179            consume(); // consume *
180            if (is_eof())
181                found_invalid_comment = true;
182            consume(); // consume /
183        } else {
184            break;
185        }
186    }
187
188    return found_invalid_comment;
189}
190
191bool Lexer::consume_numeric_literal(StringBuilder& current_token)
192{
193    // https://sqlite.org/syntax/numeric-literal.html
194    bool is_valid_numeric_literal = true;
195
196    if (m_current_char == '0') {
197        consume(&current_token);
198        if (m_current_char == '.') {
199            consume(&current_token);
200            while (isdigit(m_current_char))
201                consume(&current_token);
202            if (m_current_char == 'e' || m_current_char == 'E')
203                is_valid_numeric_literal = consume_exponent(current_token);
204        } else if (m_current_char == 'e' || m_current_char == 'E') {
205            is_valid_numeric_literal = consume_exponent(current_token);
206        } else if (m_current_char == 'x' || m_current_char == 'X') {
207            is_valid_numeric_literal = consume_hexadecimal_number(current_token);
208        } else if (isdigit(m_current_char)) {
209            do {
210                consume(&current_token);
211            } while (isdigit(m_current_char));
212        }
213    } else {
214        do {
215            consume(&current_token);
216        } while (isdigit(m_current_char));
217
218        if (m_current_char == '.') {
219            consume(&current_token);
220            while (isdigit(m_current_char))
221                consume(&current_token);
222        }
223        if (m_current_char == 'e' || m_current_char == 'E')
224            is_valid_numeric_literal = consume_exponent(current_token);
225    }
226
227    return is_valid_numeric_literal;
228}
229
230bool Lexer::consume_string_literal(StringBuilder& current_token)
231{
232    // https://sqlite.org/lang_expr.html - See "3. Literal Values (Constants)"
233    bool is_valid_string_literal = true;
234
235    // Skip the opening single quote:
236    consume();
237
238    while (!is_eof() && !is_string_literal_end()) {
239        // If both the current character and the next one are single quotes,
240        // consume one single quote into the current token, and drop the
241        // other one on the floor:
242        if (match('\'', '\''))
243            consume();
244        consume(&current_token);
245    }
246
247    if (is_eof())
248        is_valid_string_literal = false;
249    // Drop the closing quote on the floor:
250    consume();
251
252    return is_valid_string_literal;
253}
254
255bool Lexer::consume_quoted_identifier(StringBuilder& current_token)
256{
257    // I have not found a reference to the syntax for identifiers in the
258    // SQLite documentation, but PostgreSQL has this:
259    // https://www.postgresql.org/docs/current/sql-syntax-lexical.html#SQL-SYNTAX-IDENTIFIERS
260    bool is_valid_identifier = true;
261
262    // Skip the opening double quote:
263    consume();
264
265    while (!is_eof() && !is_quoted_identifier_end()) {
266        // If both the current character and the next one are double quotes,
267        // consume one single quote into the current token, and drop the
268        // other one on the floor:
269        if (match('"', '"'))
270            consume();
271        consume(&current_token);
272    }
273
274    if (is_eof())
275        is_valid_identifier = false;
276    // Drop the closing double quote on the floor:
277    consume();
278
279    return is_valid_identifier;
280}
281
282bool Lexer::consume_blob_literal(StringBuilder& current_token)
283{
284    // https://sqlite.org/lang_expr.html - See "3. Literal Values (Constants)"
285
286    // Skip starting 'X'/'x' character:
287    consume();
288
289    if (!consume_string_literal(current_token))
290        return false;
291    for (auto ix = 0u; ix < current_token.length(); ix++) {
292        if (!isxdigit(current_token.string_view()[ix]))
293            return false;
294    }
295    return true;
296}
297
298bool Lexer::consume_exponent(StringBuilder& current_token)
299{
300    consume(&current_token);
301    if (m_current_char == '-' || m_current_char == '+')
302        consume(&current_token);
303
304    if (!isdigit(m_current_char))
305        return false;
306
307    // FIXME This code results in the string "1e" being rejected as a
308    //       malformed numeric literal. We do however accept "1a" which
309    //       is inconsistent. We have to decide what we want to do:
310    //        - Be like `SQLite` and reject both "1a" and "1e" because we
311    //          require a space between the two tokens. This is pretty invasive;
312    //          we would have to decide where all spaces are required and fix
313    //          the lexer accordingly.
314    //        - Be like `PostgreSQL` and accept both "1e" and "1a" as two
315    //          separate tokens, and accept "1e3" as a single token. This would
316    //          would require pushing back the "e" we lexed here, terminate the
317    //          numeric literal, and re-process the "e" as the first char of
318    //          a new token.
319    while (isdigit(m_current_char)) {
320        consume(&current_token);
321    }
322    return true;
323}
324
325bool Lexer::consume_hexadecimal_number(StringBuilder& current_token)
326{
327    consume(&current_token);
328    if (!isxdigit(m_current_char))
329        return false;
330
331    while (isxdigit(m_current_char))
332        consume(&current_token);
333
334    return true;
335}
336
337bool Lexer::match(char a, char b) const
338{
339    if (m_position >= m_source.length())
340        return false;
341
342    return m_current_char == a && m_source[m_position] == b;
343}
344
345bool Lexer::is_identifier_start() const
346{
347    return isalpha(m_current_char) || m_current_char == '_';
348}
349
350bool Lexer::is_identifier_middle() const
351{
352    return is_identifier_start() || isdigit(m_current_char);
353}
354
355bool Lexer::is_numeric_literal_start() const
356{
357    return isdigit(m_current_char) || (m_current_char == '.' && m_position < m_source.length() && isdigit(m_source[m_position]));
358}
359
360bool Lexer::is_string_literal_start() const
361{
362    return m_current_char == '\'';
363}
364
365bool Lexer::is_string_literal_end() const
366{
367    return m_current_char == '\'' && !(m_position < m_source.length() && m_source[m_position] == '\'');
368}
369
370bool Lexer::is_quoted_identifier_start() const
371{
372    return m_current_char == '"';
373}
374
375bool Lexer::is_quoted_identifier_end() const
376{
377    return m_current_char == '"' && !(m_position < m_source.length() && m_source[m_position] == '"');
378}
379
380bool Lexer::is_blob_literal_start() const
381{
382    return match('x', '\'') || match('X', '\'');
383}
384
385bool Lexer::is_line_comment_start() const
386{
387    return match('-', '-');
388}
389
390bool Lexer::is_block_comment_start() const
391{
392    return match('/', '*');
393}
394
395bool Lexer::is_block_comment_end() const
396{
397    return match('*', '/');
398}
399
400bool Lexer::is_line_break() const
401{
402    return m_current_char == '\n';
403}
404
405bool Lexer::is_eof() const
406{
407    return m_eof;
408}
409
410}