Serenity Operating System
1/*
2 * Copyright (c) 2020, Benoit Lormeau <blormeau@outlook.com>
3 *
4 * SPDX-License-Identifier: BSD-2-Clause
5 */
6
7#include <AK/Assertions.h>
8#include <AK/CharacterTypes.h>
9#include <AK/GenericLexer.h>
10#include <AK/StringBuilder.h>
11
12#ifndef KERNEL
13# include <AK/DeprecatedString.h>
14# include <AK/Utf16View.h>
15#endif
16
17namespace AK {
18// Consume a number of characters
19StringView GenericLexer::consume(size_t count)
20{
21 if (count == 0)
22 return {};
23
24 size_t start = m_index;
25 size_t length = min(count, m_input.length() - m_index);
26 m_index += length;
27
28 return m_input.substring_view(start, length);
29}
30
31// Consume the rest of the input
32StringView GenericLexer::consume_all()
33{
34 if (is_eof())
35 return {};
36
37 auto rest = m_input.substring_view(m_index, m_input.length() - m_index);
38 m_index = m_input.length();
39 return rest;
40}
41
42// Consume until a new line is found
43StringView GenericLexer::consume_line()
44{
45 size_t start = m_index;
46 while (!is_eof() && peek() != '\r' && peek() != '\n')
47 m_index++;
48 size_t length = m_index - start;
49
50 consume_specific('\r');
51 consume_specific('\n');
52
53 if (length == 0)
54 return {};
55 return m_input.substring_view(start, length);
56}
57
58// Consume and return characters until `stop` is peek'd
59StringView GenericLexer::consume_until(char stop)
60{
61 size_t start = m_index;
62 while (!is_eof() && peek() != stop)
63 m_index++;
64 size_t length = m_index - start;
65
66 if (length == 0)
67 return {};
68 return m_input.substring_view(start, length);
69}
70
71// Consume and return characters until the string `stop` is found
72StringView GenericLexer::consume_until(char const* stop)
73{
74 size_t start = m_index;
75 while (!is_eof() && !next_is(stop))
76 m_index++;
77 size_t length = m_index - start;
78
79 if (length == 0)
80 return {};
81 return m_input.substring_view(start, length);
82}
83
84// Consume and return characters until the string `stop` is found
85StringView GenericLexer::consume_until(StringView stop)
86{
87 size_t start = m_index;
88 while (!is_eof() && !next_is(stop))
89 m_index++;
90 size_t length = m_index - start;
91
92 if (length == 0)
93 return {};
94 return m_input.substring_view(start, length);
95}
96
97/*
98 * Consume a string surrounded by single or double quotes. The returned
99 * StringView does not include the quotes. An escape character can be provided
100 * to capture the enclosing quotes. Please note that the escape character will
101 * still be in the resulting StringView
102 */
103StringView GenericLexer::consume_quoted_string(char escape_char)
104{
105 if (!next_is(is_quote))
106 return {};
107
108 char quote_char = consume();
109 size_t start = m_index;
110 while (!is_eof()) {
111 if (next_is(escape_char))
112 m_index++;
113 else if (next_is(quote_char))
114 break;
115 m_index++;
116 }
117 size_t length = m_index - start;
118
119 if (peek() != quote_char) {
120 // Restore the index in case the string is unterminated
121 m_index = start - 1;
122 return {};
123 }
124
125 // Ignore closing quote
126 ignore();
127
128 return m_input.substring_view(start, length);
129}
130
131#ifndef KERNEL
132DeprecatedString GenericLexer::consume_and_unescape_string(char escape_char)
133{
134 auto view = consume_quoted_string(escape_char);
135 if (view.is_null())
136 return {};
137
138 StringBuilder builder;
139 for (size_t i = 0; i < view.length(); ++i)
140 builder.append(consume_escaped_character(escape_char));
141 return builder.to_deprecated_string();
142}
143
144auto GenericLexer::consume_escaped_code_point(bool combine_surrogate_pairs) -> Result<u32, UnicodeEscapeError>
145{
146 if (!consume_specific("\\u"sv))
147 return UnicodeEscapeError::MalformedUnicodeEscape;
148
149 if (next_is('{'))
150 return decode_code_point();
151 return decode_single_or_paired_surrogate(combine_surrogate_pairs);
152}
153
154auto GenericLexer::decode_code_point() -> Result<u32, UnicodeEscapeError>
155{
156 bool starts_with_open_bracket = consume_specific('{');
157 VERIFY(starts_with_open_bracket);
158
159 u32 code_point = 0;
160
161 while (true) {
162 if (!next_is(is_ascii_hex_digit))
163 return UnicodeEscapeError::MalformedUnicodeEscape;
164
165 auto new_code_point = (code_point << 4u) | parse_ascii_hex_digit(consume());
166 if (new_code_point < code_point)
167 return UnicodeEscapeError::UnicodeEscapeOverflow;
168
169 code_point = new_code_point;
170 if (consume_specific('}'))
171 break;
172 }
173
174 if (is_unicode(code_point))
175 return code_point;
176 return UnicodeEscapeError::UnicodeEscapeOverflow;
177}
178
179auto GenericLexer::decode_single_or_paired_surrogate(bool combine_surrogate_pairs) -> Result<u32, UnicodeEscapeError>
180{
181 constexpr size_t surrogate_length = 4;
182
183 auto decode_one_surrogate = [&]() -> Optional<u16> {
184 u16 surrogate = 0;
185
186 for (size_t i = 0; i < surrogate_length; ++i) {
187 if (!next_is(is_ascii_hex_digit))
188 return {};
189
190 surrogate = (surrogate << 4u) | parse_ascii_hex_digit(consume());
191 }
192
193 return surrogate;
194 };
195
196 auto high_surrogate = decode_one_surrogate();
197 if (!high_surrogate.has_value())
198 return UnicodeEscapeError::MalformedUnicodeEscape;
199 if (!Utf16View::is_high_surrogate(*high_surrogate))
200 return *high_surrogate;
201 if (!combine_surrogate_pairs || !consume_specific("\\u"sv))
202 return *high_surrogate;
203
204 auto low_surrogate = decode_one_surrogate();
205 if (!low_surrogate.has_value())
206 return UnicodeEscapeError::MalformedUnicodeEscape;
207 if (Utf16View::is_low_surrogate(*low_surrogate))
208 return Utf16View::decode_surrogate_pair(*high_surrogate, *low_surrogate);
209
210 retreat(6);
211 return *high_surrogate;
212}
213#endif
214
215}