Serenity Operating System
at master 215 lines 5.8 kB view raw
1/* 2 * Copyright (c) 2020, Benoit Lormeau <blormeau@outlook.com> 3 * 4 * SPDX-License-Identifier: BSD-2-Clause 5 */ 6 7#include <AK/Assertions.h> 8#include <AK/CharacterTypes.h> 9#include <AK/GenericLexer.h> 10#include <AK/StringBuilder.h> 11 12#ifndef KERNEL 13# include <AK/DeprecatedString.h> 14# include <AK/Utf16View.h> 15#endif 16 17namespace AK { 18// Consume a number of characters 19StringView GenericLexer::consume(size_t count) 20{ 21 if (count == 0) 22 return {}; 23 24 size_t start = m_index; 25 size_t length = min(count, m_input.length() - m_index); 26 m_index += length; 27 28 return m_input.substring_view(start, length); 29} 30 31// Consume the rest of the input 32StringView GenericLexer::consume_all() 33{ 34 if (is_eof()) 35 return {}; 36 37 auto rest = m_input.substring_view(m_index, m_input.length() - m_index); 38 m_index = m_input.length(); 39 return rest; 40} 41 42// Consume until a new line is found 43StringView GenericLexer::consume_line() 44{ 45 size_t start = m_index; 46 while (!is_eof() && peek() != '\r' && peek() != '\n') 47 m_index++; 48 size_t length = m_index - start; 49 50 consume_specific('\r'); 51 consume_specific('\n'); 52 53 if (length == 0) 54 return {}; 55 return m_input.substring_view(start, length); 56} 57 58// Consume and return characters until `stop` is peek'd 59StringView GenericLexer::consume_until(char stop) 60{ 61 size_t start = m_index; 62 while (!is_eof() && peek() != stop) 63 m_index++; 64 size_t length = m_index - start; 65 66 if (length == 0) 67 return {}; 68 return m_input.substring_view(start, length); 69} 70 71// Consume and return characters until the string `stop` is found 72StringView GenericLexer::consume_until(char const* stop) 73{ 74 size_t start = m_index; 75 while (!is_eof() && !next_is(stop)) 76 m_index++; 77 size_t length = m_index - start; 78 79 if (length == 0) 80 return {}; 81 return m_input.substring_view(start, length); 82} 83 84// Consume and return characters until the string `stop` is found 85StringView GenericLexer::consume_until(StringView stop) 86{ 87 size_t start = m_index; 88 while (!is_eof() && !next_is(stop)) 89 m_index++; 90 size_t length = m_index - start; 91 92 if (length == 0) 93 return {}; 94 return m_input.substring_view(start, length); 95} 96 97/* 98 * Consume a string surrounded by single or double quotes. The returned 99 * StringView does not include the quotes. An escape character can be provided 100 * to capture the enclosing quotes. Please note that the escape character will 101 * still be in the resulting StringView 102 */ 103StringView GenericLexer::consume_quoted_string(char escape_char) 104{ 105 if (!next_is(is_quote)) 106 return {}; 107 108 char quote_char = consume(); 109 size_t start = m_index; 110 while (!is_eof()) { 111 if (next_is(escape_char)) 112 m_index++; 113 else if (next_is(quote_char)) 114 break; 115 m_index++; 116 } 117 size_t length = m_index - start; 118 119 if (peek() != quote_char) { 120 // Restore the index in case the string is unterminated 121 m_index = start - 1; 122 return {}; 123 } 124 125 // Ignore closing quote 126 ignore(); 127 128 return m_input.substring_view(start, length); 129} 130 131#ifndef KERNEL 132DeprecatedString GenericLexer::consume_and_unescape_string(char escape_char) 133{ 134 auto view = consume_quoted_string(escape_char); 135 if (view.is_null()) 136 return {}; 137 138 StringBuilder builder; 139 for (size_t i = 0; i < view.length(); ++i) 140 builder.append(consume_escaped_character(escape_char)); 141 return builder.to_deprecated_string(); 142} 143 144auto GenericLexer::consume_escaped_code_point(bool combine_surrogate_pairs) -> Result<u32, UnicodeEscapeError> 145{ 146 if (!consume_specific("\\u"sv)) 147 return UnicodeEscapeError::MalformedUnicodeEscape; 148 149 if (next_is('{')) 150 return decode_code_point(); 151 return decode_single_or_paired_surrogate(combine_surrogate_pairs); 152} 153 154auto GenericLexer::decode_code_point() -> Result<u32, UnicodeEscapeError> 155{ 156 bool starts_with_open_bracket = consume_specific('{'); 157 VERIFY(starts_with_open_bracket); 158 159 u32 code_point = 0; 160 161 while (true) { 162 if (!next_is(is_ascii_hex_digit)) 163 return UnicodeEscapeError::MalformedUnicodeEscape; 164 165 auto new_code_point = (code_point << 4u) | parse_ascii_hex_digit(consume()); 166 if (new_code_point < code_point) 167 return UnicodeEscapeError::UnicodeEscapeOverflow; 168 169 code_point = new_code_point; 170 if (consume_specific('}')) 171 break; 172 } 173 174 if (is_unicode(code_point)) 175 return code_point; 176 return UnicodeEscapeError::UnicodeEscapeOverflow; 177} 178 179auto GenericLexer::decode_single_or_paired_surrogate(bool combine_surrogate_pairs) -> Result<u32, UnicodeEscapeError> 180{ 181 constexpr size_t surrogate_length = 4; 182 183 auto decode_one_surrogate = [&]() -> Optional<u16> { 184 u16 surrogate = 0; 185 186 for (size_t i = 0; i < surrogate_length; ++i) { 187 if (!next_is(is_ascii_hex_digit)) 188 return {}; 189 190 surrogate = (surrogate << 4u) | parse_ascii_hex_digit(consume()); 191 } 192 193 return surrogate; 194 }; 195 196 auto high_surrogate = decode_one_surrogate(); 197 if (!high_surrogate.has_value()) 198 return UnicodeEscapeError::MalformedUnicodeEscape; 199 if (!Utf16View::is_high_surrogate(*high_surrogate)) 200 return *high_surrogate; 201 if (!combine_surrogate_pairs || !consume_specific("\\u"sv)) 202 return *high_surrogate; 203 204 auto low_surrogate = decode_one_surrogate(); 205 if (!low_surrogate.has_value()) 206 return UnicodeEscapeError::MalformedUnicodeEscape; 207 if (Utf16View::is_low_surrogate(*low_surrogate)) 208 return Utf16View::decode_surrogate_pair(*high_surrogate, *low_surrogate); 209 210 retreat(6); 211 return *high_surrogate; 212} 213#endif 214 215}