AK/GenericLexer.cpp at master · jcs.org/serenity

jcs.org / serenity
fork atom
Serenity Operating System
fork atom
serenity / AK / GenericLexer.cpp
at master 215 lines 5.8 kB view raw
wrap content
Linus Groh Everywhere: Rename to_{string => deprecated_string}() where applicable 3y ago
57dc179b
  1/*
  2 * Copyright (c) 2020, Benoit Lormeau <blormeau@outlook.com>
  3 *
  4 * SPDX-License-Identifier: BSD-2-Clause
  5 */
  6
  7#include <AK/Assertions.h>
  8#include <AK/CharacterTypes.h>
  9#include <AK/GenericLexer.h>
 10#include <AK/StringBuilder.h>
 11
 12#ifndef KERNEL
 13#    include <AK/DeprecatedString.h>
 14#    include <AK/Utf16View.h>
 15#endif
 16
 17namespace AK {
 18// Consume a number of characters
 19StringView GenericLexer::consume(size_t count)
 20{
 21    if (count == 0)
 22        return {};
 23
 24    size_t start = m_index;
 25    size_t length = min(count, m_input.length() - m_index);
 26    m_index += length;
 27
 28    return m_input.substring_view(start, length);
 29}
 30
 31// Consume the rest of the input
 32StringView GenericLexer::consume_all()
 33{
 34    if (is_eof())
 35        return {};
 36
 37    auto rest = m_input.substring_view(m_index, m_input.length() - m_index);
 38    m_index = m_input.length();
 39    return rest;
 40}
 41
 42// Consume until a new line is found
 43StringView GenericLexer::consume_line()
 44{
 45    size_t start = m_index;
 46    while (!is_eof() && peek() != '\r' && peek() != '\n')
 47        m_index++;
 48    size_t length = m_index - start;
 49
 50    consume_specific('\r');
 51    consume_specific('\n');
 52
 53    if (length == 0)
 54        return {};
 55    return m_input.substring_view(start, length);
 56}
 57
 58// Consume and return characters until `stop` is peek'd
 59StringView GenericLexer::consume_until(char stop)
 60{
 61    size_t start = m_index;
 62    while (!is_eof() && peek() != stop)
 63        m_index++;
 64    size_t length = m_index - start;
 65
 66    if (length == 0)
 67        return {};
 68    return m_input.substring_view(start, length);
 69}
 70
 71// Consume and return characters until the string `stop` is found
 72StringView GenericLexer::consume_until(char const* stop)
 73{
 74    size_t start = m_index;
 75    while (!is_eof() && !next_is(stop))
 76        m_index++;
 77    size_t length = m_index - start;
 78
 79    if (length == 0)
 80        return {};
 81    return m_input.substring_view(start, length);
 82}
 83
 84// Consume and return characters until the string `stop` is found
 85StringView GenericLexer::consume_until(StringView stop)
 86{
 87    size_t start = m_index;
 88    while (!is_eof() && !next_is(stop))
 89        m_index++;
 90    size_t length = m_index - start;
 91
 92    if (length == 0)
 93        return {};
 94    return m_input.substring_view(start, length);
 95}
 96
 97/*
 98 * Consume a string surrounded by single or double quotes. The returned
 99 * StringView does not include the quotes. An escape character can be provided
100 * to capture the enclosing quotes. Please note that the escape character will
101 * still be in the resulting StringView
102 */
103StringView GenericLexer::consume_quoted_string(char escape_char)
104{
105    if (!next_is(is_quote))
106        return {};
107
108    char quote_char = consume();
109    size_t start = m_index;
110    while (!is_eof()) {
111        if (next_is(escape_char))
112            m_index++;
113        else if (next_is(quote_char))
114            break;
115        m_index++;
116    }
117    size_t length = m_index - start;
118
119    if (peek() != quote_char) {
120        // Restore the index in case the string is unterminated
121        m_index = start - 1;
122        return {};
123    }
124
125    // Ignore closing quote
126    ignore();
127
128    return m_input.substring_view(start, length);
129}
130
131#ifndef KERNEL
132DeprecatedString GenericLexer::consume_and_unescape_string(char escape_char)
133{
134    auto view = consume_quoted_string(escape_char);
135    if (view.is_null())
136        return {};
137
138    StringBuilder builder;
139    for (size_t i = 0; i < view.length(); ++i)
140        builder.append(consume_escaped_character(escape_char));
141    return builder.to_deprecated_string();
142}
143
144auto GenericLexer::consume_escaped_code_point(bool combine_surrogate_pairs) -> Result<u32, UnicodeEscapeError>
145{
146    if (!consume_specific("\\u"sv))
147        return UnicodeEscapeError::MalformedUnicodeEscape;
148
149    if (next_is('{'))
150        return decode_code_point();
151    return decode_single_or_paired_surrogate(combine_surrogate_pairs);
152}
153
154auto GenericLexer::decode_code_point() -> Result<u32, UnicodeEscapeError>
155{
156    bool starts_with_open_bracket = consume_specific('{');
157    VERIFY(starts_with_open_bracket);
158
159    u32 code_point = 0;
160
161    while (true) {
162        if (!next_is(is_ascii_hex_digit))
163            return UnicodeEscapeError::MalformedUnicodeEscape;
164
165        auto new_code_point = (code_point << 4u) | parse_ascii_hex_digit(consume());
166        if (new_code_point < code_point)
167            return UnicodeEscapeError::UnicodeEscapeOverflow;
168
169        code_point = new_code_point;
170        if (consume_specific('}'))
171            break;
172    }
173
174    if (is_unicode(code_point))
175        return code_point;
176    return UnicodeEscapeError::UnicodeEscapeOverflow;
177}
178
179auto GenericLexer::decode_single_or_paired_surrogate(bool combine_surrogate_pairs) -> Result<u32, UnicodeEscapeError>
180{
181    constexpr size_t surrogate_length = 4;
182
183    auto decode_one_surrogate = [&]() -> Optional<u16> {
184        u16 surrogate = 0;
185
186        for (size_t i = 0; i < surrogate_length; ++i) {
187            if (!next_is(is_ascii_hex_digit))
188                return {};
189
190            surrogate = (surrogate << 4u) | parse_ascii_hex_digit(consume());
191        }
192
193        return surrogate;
194    };
195
196    auto high_surrogate = decode_one_surrogate();
197    if (!high_surrogate.has_value())
198        return UnicodeEscapeError::MalformedUnicodeEscape;
199    if (!Utf16View::is_high_surrogate(*high_surrogate))
200        return *high_surrogate;
201    if (!combine_surrogate_pairs || !consume_specific("\\u"sv))
202        return *high_surrogate;
203
204    auto low_surrogate = decode_one_surrogate();
205    if (!low_surrogate.has_value())
206        return UnicodeEscapeError::MalformedUnicodeEscape;
207    if (Utf16View::is_low_surrogate(*low_surrogate))
208        return Utf16View::decode_surrogate_pair(*high_surrogate, *low_surrogate);
209
210    retreat(6);
211    return *high_surrogate;
212}
213#endif
214
215}