Serenity Operating System
at master 371 lines 9.9 kB view raw
1/* 2 * Copyright (c) 2023, Sam Atkins <atkinssj@serenityos.org> 3 * 4 * SPDX-License-Identifier: BSD-2-Clause 5 */ 6 7#include "Lexer.h" 8#include <AK/CharacterTypes.h> 9#include <AK/Debug.h> 10#include <AK/Format.h> 11#include <AK/ScopeLogger.h> 12 13namespace CMake { 14 15static bool is_valid_identifier_initial_char(char c) 16{ 17 return is_ascii_alpha(c) || c == '_'; 18} 19 20static bool is_valid_identifier_char(char c) 21{ 22 return is_ascii_alphanumeric(c) || c == '_'; 23} 24 25ErrorOr<Vector<Token>> Lexer::lex(StringView input) 26{ 27 Lexer lexer { input }; 28 return lexer.lex_file(); 29} 30 31Lexer::Lexer(StringView input) 32 : GenericLexer(input) 33{ 34} 35 36ErrorOr<Vector<Token>> Lexer::lex_file() 37{ 38 m_tokens.clear_with_capacity(); 39 40 while (!is_eof()) { 41 consume_whitespace_or_comments(); 42 43 if (is_eof()) 44 break; 45 46 if (is_valid_identifier_initial_char(peek())) { 47 consume_command_invocation(); 48 } else { 49 consume_garbage(); 50 } 51 } 52 53 return m_tokens; 54} 55 56void Lexer::skip_whitespace() 57{ 58 while (!is_eof()) { 59 if (next_is('\n')) { 60 next_line(); 61 continue; 62 } 63 auto consumed = consume_while([&](char c) { 64 return c == ' ' || c == '\t'; 65 }); 66 if (consumed.is_empty()) 67 break; 68 } 69} 70 71void Lexer::consume_whitespace_or_comments() 72{ 73 ScopeLogger<CMAKE_DEBUG> log; 74 while (!is_eof()) { 75 skip_whitespace(); 76 77 if (next_is('#')) { 78 consume_comment(); 79 } else { 80 break; 81 } 82 } 83} 84 85// https://cmake.org/cmake/help/latest/manual/cmake-language.7.html#command-invocations 86void Lexer::consume_command_invocation() 87{ 88 ScopeLogger<CMAKE_DEBUG> log; 89 auto identifier_start = position(); 90 auto identifier = consume_while(is_valid_identifier_char); 91 auto control_keyword = control_keyword_from_string(identifier); 92 if (control_keyword.has_value()) { 93 emit_token(Token::Type::ControlKeyword, identifier, identifier_start, position(), control_keyword.release_value()); 94 } else { 95 emit_token(Token::Type::Identifier, identifier, identifier_start, position()); 96 } 97 98 consume_whitespace_or_comments(); 99 100 if (next_is('(')) 101 consume_open_paren(); 102 103 consume_arguments(); 104 105 if (next_is(')')) 106 consume_close_paren(); 107} 108 109void Lexer::consume_arguments() 110{ 111 ScopeLogger<CMAKE_DEBUG> log; 112 while (!is_eof()) { 113 consume_whitespace_or_comments(); 114 115 if (next_is('(')) { 116 consume_open_paren(); 117 118 consume_whitespace_or_comments(); 119 consume_arguments(); 120 consume_whitespace_or_comments(); 121 122 if (next_is(')')) 123 consume_close_paren(); 124 125 continue; 126 } 127 128 if (next_is(')')) 129 return; 130 131 consume_argument(); 132 } 133} 134 135// https://cmake.org/cmake/help/latest/manual/cmake-language.7.html#command-arguments 136void Lexer::consume_argument() 137{ 138 ScopeLogger<CMAKE_DEBUG> log; 139 consume_whitespace_or_comments(); 140 141 if (next_is('[')) { 142 consume_bracket_argument(); 143 return; 144 } 145 146 if (next_is('"')) { 147 consume_quoted_argument(); 148 return; 149 } 150 151 consume_unquoted_argument(); 152} 153 154// https://cmake.org/cmake/help/latest/manual/cmake-language.7.html#bracket-argument 155void Lexer::consume_bracket_argument() 156{ 157 ScopeLogger<CMAKE_DEBUG> log; 158 auto start = position(); 159 auto value = read_bracket_argument(); 160 emit_token(Token::Type::BracketArgument, value, start, position()); 161} 162 163// https://cmake.org/cmake/help/latest/manual/cmake-language.7.html#quoted-argument 164void Lexer::consume_quoted_argument() 165{ 166 ScopeLogger<CMAKE_DEBUG> log; 167 auto start = position(); 168 auto start_offset = tell(); 169 170 VERIFY(consume_specific('"')); 171 while (!is_eof()) { 172 if (next_is('"')) { 173 ignore(); 174 break; 175 } 176 177 if (next_is("\\\""sv)) { 178 ignore(2); 179 continue; 180 } 181 182 if (next_is('\n')) { 183 next_line(); 184 continue; 185 } 186 187 ignore(); 188 } 189 190 auto whole_token = m_input.substring_view(start_offset, tell() - start_offset); 191 auto value = whole_token.substring_view(1, whole_token.length() - 2); 192 auto variable_references = parse_variable_references_from_argument(whole_token, start); 193 emit_token(Token::Type::QuotedArgument, value, start, position(), {}, move(variable_references)); 194} 195 196// https://cmake.org/cmake/help/latest/manual/cmake-language.7.html#unquoted-argument 197void Lexer::consume_unquoted_argument() 198{ 199 ScopeLogger<CMAKE_DEBUG> log; 200 auto start_offset = tell(); 201 auto start = position(); 202 203 while (!is_eof()) { 204 if (next_is('\\')) { 205 consume_escaped_character('\\'); 206 continue; 207 } 208 209 auto consumed = consume_until([](char c) { return is_ascii_space(c) || "()#\"\\'"sv.contains(c); }); 210 if (consumed.is_empty()) 211 break; 212 213 // FIXME: `unquoted_legacy` 214 } 215 216 auto value = m_input.substring_view(start_offset, tell() - start_offset); 217 auto variable_references = parse_variable_references_from_argument(value, start); 218 emit_token(Token::Type::UnquotedArgument, value, start, position(), {}, move(variable_references)); 219} 220 221// https://cmake.org/cmake/help/latest/manual/cmake-language.7.html#comments 222void Lexer::consume_comment() 223{ 224 ScopeLogger<CMAKE_DEBUG> log; 225 auto start = position(); 226 227 VERIFY(consume_specific('#')); 228 if (next_is('[')) { 229 // Bracket comment 230 // https://cmake.org/cmake/help/latest/manual/cmake-language.7.html#bracket-comment 231 auto comment = read_bracket_argument(); 232 emit_token(Token::Type::BracketComment, comment, start, position()); 233 return; 234 } 235 236 // Line comment 237 // https://cmake.org/cmake/help/latest/manual/cmake-language.7.html#line-comment 238 auto comment = consume_until('\n'); 239 emit_token(Token::Type::LineComment, comment, start, position()); 240} 241 242void Lexer::consume_open_paren() 243{ 244 auto start = position(); 245 VERIFY(consume_specific('(')); 246 emit_token(Token::Type::OpenParen, "("sv, start, position()); 247} 248 249void Lexer::consume_close_paren() 250{ 251 auto start = position(); 252 VERIFY(consume_specific(')')); 253 emit_token(Token::Type::CloseParen, ")"sv, start, position()); 254} 255 256void Lexer::consume_garbage() 257{ 258 ScopeLogger<CMAKE_DEBUG> log; 259 auto start = position(); 260 auto contents = consume_until(is_ascii_space); 261 if (!contents.is_empty()) 262 emit_token(Token::Type::Garbage, contents, start, position()); 263} 264 265// https://cmake.org/cmake/help/latest/manual/cmake-language.7.html#bracket-argument 266// Used by both bracket arguments and bracket comments. 267StringView Lexer::read_bracket_argument() 268{ 269 VERIFY(consume_specific('[')); 270 auto leading_equals_signs = consume_while([](char c) { return c == '='; }); 271 consume_specific('['); 272 auto start = tell(); 273 auto end = start; 274 while (!is_eof()) { 275 // Read everything until we see `]={len}]`. 276 ignore_until(']'); 277 end = tell(); 278 ignore(); 279 if (next_is(leading_equals_signs)) 280 ignore(leading_equals_signs.length()); 281 if (consume_specific(']')) 282 break; 283 } 284 285 return m_input.substring_view(start, end - start); 286} 287 288// https://cmake.org/cmake/help/latest/manual/cmake-language.7.html#variable-references 289Vector<VariableReference> Lexer::parse_variable_references_from_argument(StringView argument_value, Position argument_start) 290{ 291 auto position = argument_start; 292 GenericLexer lexer { argument_value }; 293 Vector<VariableReference> variable_references; 294 295 while (!lexer.is_eof()) { 296 if (lexer.next_is('\n')) { 297 lexer.ignore(); 298 position.column = 0; 299 position.line++; 300 continue; 301 } 302 303 if (lexer.next_is('\\')) { 304 lexer.ignore(); 305 if (lexer.next_is('\n')) { 306 lexer.ignore(); 307 position.column = 0; 308 position.line++; 309 continue; 310 } 311 lexer.ignore(); 312 position.column += 2; 313 } 314 315 if (lexer.next_is('$')) { 316 auto start = position; 317 lexer.ignore(); 318 position.column++; 319 320 if (lexer.next_is("ENV{"sv)) { 321 lexer.ignore(4); 322 position.column += 4; 323 } else if (lexer.next_is('{')) { 324 lexer.ignore(); 325 position.column++; 326 } else { 327 auto skipped = lexer.consume_until(is_any_of("$ \n"sv)); 328 position.column += skipped.length(); 329 continue; 330 } 331 332 auto variable_name = lexer.consume_until(is_any_of("} \n"sv)); 333 position.column += variable_name.length(); 334 if (lexer.next_is('}')) { 335 lexer.ignore(); 336 position.column++; 337 variable_references.empend(variable_name, start, position); 338 } 339 340 continue; 341 } 342 343 lexer.ignore(); 344 position.column++; 345 } 346 347 return variable_references; 348} 349 350Position Lexer::position() const 351{ 352 return Position { 353 .line = m_line, 354 .column = tell() - m_string_offset_after_previous_newline, 355 }; 356} 357 358void Lexer::next_line() 359{ 360 VERIFY(consume_specific('\n')); 361 m_string_offset_after_previous_newline = tell(); 362 m_line++; 363} 364 365void Lexer::emit_token(Token::Type type, StringView value, Position start, Position end, Optional<ControlKeywordType> control_keyword, Vector<VariableReference> variable_references) 366{ 367 dbgln_if(CMAKE_DEBUG, "Emitting {} token: `{}` ({}:{} to {}:{})", to_string(type), value, start.line, start.column, end.line, end.column); 368 m_tokens.empend(type, value, start, end, move(control_keyword), move(variable_references)); 369} 370 371}