Serenity Operating System
1/*
2 * Copyright (c) 2023, Sam Atkins <atkinssj@serenityos.org>
3 *
4 * SPDX-License-Identifier: BSD-2-Clause
5 */
6
7#include "Lexer.h"
8#include <AK/CharacterTypes.h>
9#include <AK/Debug.h>
10#include <AK/Format.h>
11#include <AK/ScopeLogger.h>
12
13namespace CMake {
14
15static bool is_valid_identifier_initial_char(char c)
16{
17 return is_ascii_alpha(c) || c == '_';
18}
19
20static bool is_valid_identifier_char(char c)
21{
22 return is_ascii_alphanumeric(c) || c == '_';
23}
24
25ErrorOr<Vector<Token>> Lexer::lex(StringView input)
26{
27 Lexer lexer { input };
28 return lexer.lex_file();
29}
30
31Lexer::Lexer(StringView input)
32 : GenericLexer(input)
33{
34}
35
36ErrorOr<Vector<Token>> Lexer::lex_file()
37{
38 m_tokens.clear_with_capacity();
39
40 while (!is_eof()) {
41 consume_whitespace_or_comments();
42
43 if (is_eof())
44 break;
45
46 if (is_valid_identifier_initial_char(peek())) {
47 consume_command_invocation();
48 } else {
49 consume_garbage();
50 }
51 }
52
53 return m_tokens;
54}
55
56void Lexer::skip_whitespace()
57{
58 while (!is_eof()) {
59 if (next_is('\n')) {
60 next_line();
61 continue;
62 }
63 auto consumed = consume_while([&](char c) {
64 return c == ' ' || c == '\t';
65 });
66 if (consumed.is_empty())
67 break;
68 }
69}
70
71void Lexer::consume_whitespace_or_comments()
72{
73 ScopeLogger<CMAKE_DEBUG> log;
74 while (!is_eof()) {
75 skip_whitespace();
76
77 if (next_is('#')) {
78 consume_comment();
79 } else {
80 break;
81 }
82 }
83}
84
85// https://cmake.org/cmake/help/latest/manual/cmake-language.7.html#command-invocations
86void Lexer::consume_command_invocation()
87{
88 ScopeLogger<CMAKE_DEBUG> log;
89 auto identifier_start = position();
90 auto identifier = consume_while(is_valid_identifier_char);
91 auto control_keyword = control_keyword_from_string(identifier);
92 if (control_keyword.has_value()) {
93 emit_token(Token::Type::ControlKeyword, identifier, identifier_start, position(), control_keyword.release_value());
94 } else {
95 emit_token(Token::Type::Identifier, identifier, identifier_start, position());
96 }
97
98 consume_whitespace_or_comments();
99
100 if (next_is('('))
101 consume_open_paren();
102
103 consume_arguments();
104
105 if (next_is(')'))
106 consume_close_paren();
107}
108
109void Lexer::consume_arguments()
110{
111 ScopeLogger<CMAKE_DEBUG> log;
112 while (!is_eof()) {
113 consume_whitespace_or_comments();
114
115 if (next_is('(')) {
116 consume_open_paren();
117
118 consume_whitespace_or_comments();
119 consume_arguments();
120 consume_whitespace_or_comments();
121
122 if (next_is(')'))
123 consume_close_paren();
124
125 continue;
126 }
127
128 if (next_is(')'))
129 return;
130
131 consume_argument();
132 }
133}
134
135// https://cmake.org/cmake/help/latest/manual/cmake-language.7.html#command-arguments
136void Lexer::consume_argument()
137{
138 ScopeLogger<CMAKE_DEBUG> log;
139 consume_whitespace_or_comments();
140
141 if (next_is('[')) {
142 consume_bracket_argument();
143 return;
144 }
145
146 if (next_is('"')) {
147 consume_quoted_argument();
148 return;
149 }
150
151 consume_unquoted_argument();
152}
153
154// https://cmake.org/cmake/help/latest/manual/cmake-language.7.html#bracket-argument
155void Lexer::consume_bracket_argument()
156{
157 ScopeLogger<CMAKE_DEBUG> log;
158 auto start = position();
159 auto value = read_bracket_argument();
160 emit_token(Token::Type::BracketArgument, value, start, position());
161}
162
163// https://cmake.org/cmake/help/latest/manual/cmake-language.7.html#quoted-argument
164void Lexer::consume_quoted_argument()
165{
166 ScopeLogger<CMAKE_DEBUG> log;
167 auto start = position();
168 auto start_offset = tell();
169
170 VERIFY(consume_specific('"'));
171 while (!is_eof()) {
172 if (next_is('"')) {
173 ignore();
174 break;
175 }
176
177 if (next_is("\\\""sv)) {
178 ignore(2);
179 continue;
180 }
181
182 if (next_is('\n')) {
183 next_line();
184 continue;
185 }
186
187 ignore();
188 }
189
190 auto whole_token = m_input.substring_view(start_offset, tell() - start_offset);
191 auto value = whole_token.substring_view(1, whole_token.length() - 2);
192 auto variable_references = parse_variable_references_from_argument(whole_token, start);
193 emit_token(Token::Type::QuotedArgument, value, start, position(), {}, move(variable_references));
194}
195
196// https://cmake.org/cmake/help/latest/manual/cmake-language.7.html#unquoted-argument
197void Lexer::consume_unquoted_argument()
198{
199 ScopeLogger<CMAKE_DEBUG> log;
200 auto start_offset = tell();
201 auto start = position();
202
203 while (!is_eof()) {
204 if (next_is('\\')) {
205 consume_escaped_character('\\');
206 continue;
207 }
208
209 auto consumed = consume_until([](char c) { return is_ascii_space(c) || "()#\"\\'"sv.contains(c); });
210 if (consumed.is_empty())
211 break;
212
213 // FIXME: `unquoted_legacy`
214 }
215
216 auto value = m_input.substring_view(start_offset, tell() - start_offset);
217 auto variable_references = parse_variable_references_from_argument(value, start);
218 emit_token(Token::Type::UnquotedArgument, value, start, position(), {}, move(variable_references));
219}
220
221// https://cmake.org/cmake/help/latest/manual/cmake-language.7.html#comments
222void Lexer::consume_comment()
223{
224 ScopeLogger<CMAKE_DEBUG> log;
225 auto start = position();
226
227 VERIFY(consume_specific('#'));
228 if (next_is('[')) {
229 // Bracket comment
230 // https://cmake.org/cmake/help/latest/manual/cmake-language.7.html#bracket-comment
231 auto comment = read_bracket_argument();
232 emit_token(Token::Type::BracketComment, comment, start, position());
233 return;
234 }
235
236 // Line comment
237 // https://cmake.org/cmake/help/latest/manual/cmake-language.7.html#line-comment
238 auto comment = consume_until('\n');
239 emit_token(Token::Type::LineComment, comment, start, position());
240}
241
242void Lexer::consume_open_paren()
243{
244 auto start = position();
245 VERIFY(consume_specific('('));
246 emit_token(Token::Type::OpenParen, "("sv, start, position());
247}
248
249void Lexer::consume_close_paren()
250{
251 auto start = position();
252 VERIFY(consume_specific(')'));
253 emit_token(Token::Type::CloseParen, ")"sv, start, position());
254}
255
256void Lexer::consume_garbage()
257{
258 ScopeLogger<CMAKE_DEBUG> log;
259 auto start = position();
260 auto contents = consume_until(is_ascii_space);
261 if (!contents.is_empty())
262 emit_token(Token::Type::Garbage, contents, start, position());
263}
264
265// https://cmake.org/cmake/help/latest/manual/cmake-language.7.html#bracket-argument
266// Used by both bracket arguments and bracket comments.
267StringView Lexer::read_bracket_argument()
268{
269 VERIFY(consume_specific('['));
270 auto leading_equals_signs = consume_while([](char c) { return c == '='; });
271 consume_specific('[');
272 auto start = tell();
273 auto end = start;
274 while (!is_eof()) {
275 // Read everything until we see `]={len}]`.
276 ignore_until(']');
277 end = tell();
278 ignore();
279 if (next_is(leading_equals_signs))
280 ignore(leading_equals_signs.length());
281 if (consume_specific(']'))
282 break;
283 }
284
285 return m_input.substring_view(start, end - start);
286}
287
288// https://cmake.org/cmake/help/latest/manual/cmake-language.7.html#variable-references
289Vector<VariableReference> Lexer::parse_variable_references_from_argument(StringView argument_value, Position argument_start)
290{
291 auto position = argument_start;
292 GenericLexer lexer { argument_value };
293 Vector<VariableReference> variable_references;
294
295 while (!lexer.is_eof()) {
296 if (lexer.next_is('\n')) {
297 lexer.ignore();
298 position.column = 0;
299 position.line++;
300 continue;
301 }
302
303 if (lexer.next_is('\\')) {
304 lexer.ignore();
305 if (lexer.next_is('\n')) {
306 lexer.ignore();
307 position.column = 0;
308 position.line++;
309 continue;
310 }
311 lexer.ignore();
312 position.column += 2;
313 }
314
315 if (lexer.next_is('$')) {
316 auto start = position;
317 lexer.ignore();
318 position.column++;
319
320 if (lexer.next_is("ENV{"sv)) {
321 lexer.ignore(4);
322 position.column += 4;
323 } else if (lexer.next_is('{')) {
324 lexer.ignore();
325 position.column++;
326 } else {
327 auto skipped = lexer.consume_until(is_any_of("$ \n"sv));
328 position.column += skipped.length();
329 continue;
330 }
331
332 auto variable_name = lexer.consume_until(is_any_of("} \n"sv));
333 position.column += variable_name.length();
334 if (lexer.next_is('}')) {
335 lexer.ignore();
336 position.column++;
337 variable_references.empend(variable_name, start, position);
338 }
339
340 continue;
341 }
342
343 lexer.ignore();
344 position.column++;
345 }
346
347 return variable_references;
348}
349
350Position Lexer::position() const
351{
352 return Position {
353 .line = m_line,
354 .column = tell() - m_string_offset_after_previous_newline,
355 };
356}
357
358void Lexer::next_line()
359{
360 VERIFY(consume_specific('\n'));
361 m_string_offset_after_previous_newline = tell();
362 m_line++;
363}
364
365void Lexer::emit_token(Token::Type type, StringView value, Position start, Position end, Optional<ControlKeywordType> control_keyword, Vector<VariableReference> variable_references)
366{
367 dbgln_if(CMAKE_DEBUG, "Emitting {} token: `{}` ({}:{} to {}:{})", to_string(type), value, start.line, start.column, end.line, end.column);
368 m_tokens.empend(type, value, start, end, move(control_keyword), move(variable_references));
369}
370
371}