Serenity Operating System
at master 1291 lines 45 kB view raw
1/* 2 * Copyright (c) 2020-2022, the SerenityOS developers. 3 * Copyright (c) 2021-2023, Sam Atkins <atkinssj@serenityos.org> 4 * 5 * SPDX-License-Identifier: BSD-2-Clause 6 */ 7 8#include <AK/CharacterTypes.h> 9#include <AK/Debug.h> 10#include <AK/FloatingPointStringConversions.h> 11#include <AK/SourceLocation.h> 12#include <AK/Vector.h> 13#include <LibTextCodec/Decoder.h> 14#include <LibWeb/CSS/Parser/Tokenizer.h> 15#include <LibWeb/Infra/Strings.h> 16 17namespace Web::CSS::Parser { 18 19// U+FFFD REPLACEMENT CHARACTER (�) 20#define REPLACEMENT_CHARACTER 0xFFFD 21static constexpr u32 TOKENIZER_EOF = 0xFFFFFFFF; 22 23static inline void log_parse_error(SourceLocation const& location = SourceLocation::current()) 24{ 25 dbgln_if(CSS_TOKENIZER_DEBUG, "Parse error (css tokenization) {} ", location); 26} 27 28static inline bool is_eof(u32 code_point) 29{ 30 return code_point == TOKENIZER_EOF; 31} 32 33static inline bool is_quotation_mark(u32 code_point) 34{ 35 return code_point == 0x22; 36} 37 38static inline bool is_greater_than_maximum_allowed_code_point(u32 code_point) 39{ 40 return code_point > 0x10FFFF; 41} 42 43static inline bool is_low_line(u32 code_point) 44{ 45 return code_point == 0x5F; 46} 47 48// https://www.w3.org/TR/css-syntax-3/#ident-start-code-point 49static inline bool is_ident_start_code_point(u32 code_point) 50{ 51 // FIXME: We use !is_ascii() for "non-ASCII code point" in the spec, but it's not quite right - 52 // it treats EOF as a valid! The spec also lacks a definition of code point. For now, the 53 // !is_eof() check is a hack, but it should work. 54 return !is_eof(code_point) && (is_ascii_alpha(code_point) || !is_ascii(code_point) || is_low_line(code_point)); 55} 56 57static inline bool is_hyphen_minus(u32 code_point) 58{ 59 return code_point == 0x2D; 60} 61 62// https://www.w3.org/TR/css-syntax-3/#ident-code-point 63static inline bool is_ident_code_point(u32 code_point) 64{ 65 return is_ident_start_code_point(code_point) || is_ascii_digit(code_point) || is_hyphen_minus(code_point); 66} 67 68static inline bool is_non_printable(u32 code_point) 69{ 70 return code_point <= 0x8 || code_point == 0xB || (code_point >= 0xE && code_point <= 0x1F) || code_point == 0x7F; 71} 72 73static inline bool is_number_sign(u32 code_point) 74{ 75 return code_point == 0x23; 76} 77 78static inline bool is_reverse_solidus(u32 code_point) 79{ 80 return code_point == 0x5C; 81} 82 83static inline bool is_apostrophe(u32 code_point) 84{ 85 return code_point == 0x27; 86} 87 88static inline bool is_left_paren(u32 code_point) 89{ 90 return code_point == 0x28; 91} 92 93static inline bool is_right_paren(u32 code_point) 94{ 95 return code_point == 0x29; 96} 97 98static inline bool is_plus_sign(u32 code_point) 99{ 100 return code_point == 0x2B; 101} 102 103static inline bool is_comma(u32 code_point) 104{ 105 return code_point == 0x2C; 106} 107 108static inline bool is_full_stop(u32 code_point) 109{ 110 return code_point == 0x2E; 111} 112 113static inline bool is_newline(u32 code_point) 114{ 115 return code_point == 0xA; 116} 117 118static inline bool is_asterisk(u32 code_point) 119{ 120 return code_point == 0x2A; 121} 122 123static inline bool is_solidus(u32 code_point) 124{ 125 return code_point == 0x2F; 126} 127 128static inline bool is_colon(u32 code_point) 129{ 130 return code_point == 0x3A; 131} 132 133static inline bool is_semicolon(u32 code_point) 134{ 135 return code_point == 0x3B; 136} 137 138static inline bool is_less_than_sign(u32 code_point) 139{ 140 return code_point == 0x3C; 141} 142 143static inline bool is_greater_than_sign(u32 code_point) 144{ 145 return code_point == 0x3E; 146} 147 148static inline bool is_at(u32 code_point) 149{ 150 return code_point == 0x40; 151} 152 153static inline bool is_open_square_bracket(u32 code_point) 154{ 155 return code_point == 0x5B; 156} 157 158static inline bool is_closed_square_bracket(u32 code_point) 159{ 160 return code_point == 0x5D; 161} 162 163static inline bool is_open_curly_bracket(u32 code_point) 164{ 165 return code_point == 0x7B; 166} 167 168static inline bool is_closed_curly_bracket(u32 code_point) 169{ 170 return code_point == 0x7D; 171} 172 173static inline bool is_whitespace(u32 code_point) 174{ 175 return code_point == 0x9 || code_point == 0xA || code_point == 0x20; 176} 177 178static inline bool is_percent(u32 code_point) 179{ 180 return code_point == 0x25; 181} 182 183static inline bool is_exclamation_mark(u32 code_point) 184{ 185 return code_point == 0x21; 186} 187 188static inline bool is_e(u32 code_point) 189{ 190 return code_point == 0x65; 191} 192 193static inline bool is_E(u32 code_point) 194{ 195 return code_point == 0x45; 196} 197 198ErrorOr<Vector<Token>> Tokenizer::tokenize(StringView input, StringView encoding) 199{ 200 // https://www.w3.org/TR/css-syntax-3/#css-filter-code-points 201 auto filter_code_points = [](StringView input, auto encoding) -> ErrorOr<String> { 202 auto decoder = TextCodec::decoder_for(encoding); 203 VERIFY(decoder.has_value()); 204 205 StringBuilder builder { input.length() }; 206 bool last_was_carriage_return = false; 207 208 // To filter code points from a stream of (unfiltered) code points input: 209 TRY(decoder->process(input, [&builder, &last_was_carriage_return](u32 code_point) -> ErrorOr<void> { 210 // Replace any U+000D CARRIAGE RETURN (CR) code points, 211 // U+000C FORM FEED (FF) code points, 212 // or pairs of U+000D CARRIAGE RETURN (CR) followed by U+000A LINE FEED (LF) 213 // in input by a single U+000A LINE FEED (LF) code point. 214 if (code_point == '\r') { 215 if (last_was_carriage_return) { 216 TRY(builder.try_append('\n')); 217 } else { 218 last_was_carriage_return = true; 219 } 220 } else { 221 if (last_was_carriage_return) 222 TRY(builder.try_append('\n')); 223 224 if (code_point == '\n') { 225 if (!last_was_carriage_return) 226 TRY(builder.try_append('\n')); 227 228 } else if (code_point == '\f') { 229 TRY(builder.try_append('\n')); 230 // Replace any U+0000 NULL or surrogate code points in input with U+FFFD REPLACEMENT CHARACTER (�). 231 } else if (code_point == 0x00 || (code_point >= 0xD800 && code_point <= 0xDFFF)) { 232 TRY(builder.try_append_code_point(REPLACEMENT_CHARACTER)); 233 } else { 234 TRY(builder.try_append_code_point(code_point)); 235 } 236 237 last_was_carriage_return = false; 238 } 239 return {}; 240 })); 241 return builder.to_string(); 242 }; 243 244 Tokenizer tokenizer { TRY(filter_code_points(input, encoding)) }; 245 return tokenizer.tokenize(); 246} 247 248Tokenizer::Tokenizer(String decoded_input) 249 : m_decoded_input(move(decoded_input)) 250 , m_utf8_view(m_decoded_input) 251 , m_utf8_iterator(m_utf8_view.begin()) 252{ 253} 254 255ErrorOr<Vector<Token>> Tokenizer::tokenize() 256{ 257 Vector<Token> tokens; 258 for (;;) { 259 auto token_start = m_position; 260 auto token = TRY(consume_a_token()); 261 token.m_start_position = token_start; 262 token.m_end_position = m_position; 263 TRY(tokens.try_append(token)); 264 265 if (token.is(Token::Type::EndOfFile)) { 266 return tokens; 267 } 268 } 269} 270 271u32 Tokenizer::next_code_point() 272{ 273 if (m_utf8_iterator == m_utf8_view.end()) 274 return TOKENIZER_EOF; 275 m_prev_utf8_iterator = m_utf8_iterator; 276 ++m_utf8_iterator; 277 auto code_point = *m_prev_utf8_iterator; 278 279 m_prev_position = m_position; 280 if (is_newline(code_point)) { 281 m_position.line++; 282 m_position.column = 0; 283 } else { 284 m_position.column++; 285 } 286 287 dbgln_if(CSS_TOKENIZER_DEBUG, "(Tokenizer) Next code_point: {:d}", code_point); 288 return code_point; 289} 290 291u32 Tokenizer::peek_code_point(size_t offset) const 292{ 293 auto it = m_utf8_iterator; 294 for (size_t i = 0; i < offset && it != m_utf8_view.end(); ++i) 295 ++it; 296 if (it == m_utf8_view.end()) 297 return TOKENIZER_EOF; 298 dbgln_if(CSS_TOKENIZER_DEBUG, "(Tokenizer) Peek code_point: {:d}", *m_prev_utf8_iterator); 299 return *it; 300} 301 302U32Twin Tokenizer::peek_twin() const 303{ 304 U32Twin values { TOKENIZER_EOF, TOKENIZER_EOF }; 305 auto it = m_utf8_iterator; 306 for (size_t i = 0; i < 2 && it != m_utf8_view.end(); ++i) { 307 values.set(i, *it); 308 ++it; 309 } 310 dbgln_if(CSS_TOKENIZER_DEBUG, "(Tokenizer) Peek twin: {:d},{:d}", values.first, values.second); 311 return values; 312} 313 314U32Triplet Tokenizer::peek_triplet() const 315{ 316 U32Triplet values { TOKENIZER_EOF, TOKENIZER_EOF, TOKENIZER_EOF }; 317 auto it = m_utf8_iterator; 318 for (size_t i = 0; i < 3 && it != m_utf8_view.end(); ++i) { 319 values.set(i, *it); 320 ++it; 321 } 322 dbgln_if(CSS_TOKENIZER_DEBUG, "(Tokenizer) Peek triplet: {:d},{:d},{:d}", values.first, values.second, values.third); 323 return values; 324} 325 326U32Twin Tokenizer::start_of_input_stream_twin() 327{ 328 U32Twin twin; 329 // FIXME: Reconsuming just to read the current code point again is weird. 330 reconsume_current_input_code_point(); 331 twin.first = next_code_point(); 332 twin.second = peek_code_point(); 333 334 return twin; 335} 336 337U32Triplet Tokenizer::start_of_input_stream_triplet() 338{ 339 U32Triplet triplet; 340 // FIXME: Reconsuming just to read the current code point again is weird. 341 reconsume_current_input_code_point(); 342 triplet.first = next_code_point(); 343 auto next_two = peek_twin(); 344 triplet.second = next_two.first; 345 triplet.third = next_two.second; 346 347 return triplet; 348} 349 350Token Tokenizer::create_new_token(Token::Type type) 351{ 352 Token token = {}; 353 token.m_type = type; 354 return token; 355} 356 357Token Tokenizer::create_eof_token() 358{ 359 return create_new_token(Token::Type::EndOfFile); 360} 361 362Token Tokenizer::create_value_token(Token::Type type, FlyString&& value) 363{ 364 Token token; 365 token.m_type = type; 366 token.m_value = move(value); 367 return token; 368} 369 370Token Tokenizer::create_value_token(Token::Type type, u32 value) 371{ 372 Token token = {}; 373 token.m_type = type; 374 token.m_value = String::from_code_point(value); 375 return token; 376} 377 378// https://www.w3.org/TR/css-syntax-3/#consume-escaped-code-point 379u32 Tokenizer::consume_escaped_code_point() 380{ 381 // This section describes how to consume an escaped code point. 382 // It assumes that the U+005C REVERSE SOLIDUS (\) has already been consumed and that the next 383 // input code point has already been verified to be part of a valid escape. 384 // It will return a code point. 385 386 // Consume the next input code point. 387 auto input = next_code_point(); 388 389 // hex digit 390 if (is_ascii_hex_digit(input)) { 391 // Consume as many hex digits as possible, but no more than 5. 392 // Note that this means 1-6 hex digits have been consumed in total. 393 StringBuilder builder; 394 builder.append_code_point(input); 395 396 size_t counter = 0; 397 while (is_ascii_hex_digit(peek_code_point()) && counter++ < 5) { 398 builder.append_code_point(next_code_point()); 399 } 400 401 // If the next input code point is whitespace, consume it as well. 402 if (is_whitespace(peek_code_point())) { 403 (void)next_code_point(); 404 } 405 406 // Interpret the hex digits as a hexadecimal number. 407 auto unhexed = AK::StringUtils::convert_to_uint_from_hex<u32>(builder.string_view()).value_or(0); 408 // If this number is zero, or is for a surrogate, or is greater than the maximum allowed 409 // code point, return U+FFFD REPLACEMENT CHARACTER (�). 410 if (unhexed == 0 || is_unicode_surrogate(unhexed) || is_greater_than_maximum_allowed_code_point(unhexed)) { 411 return REPLACEMENT_CHARACTER; 412 } 413 414 // Otherwise, return the code point with that value. 415 return unhexed; 416 } 417 418 // EOF 419 if (is_eof(input)) { 420 // This is a parse error. Return U+FFFD REPLACEMENT CHARACTER (�). 421 log_parse_error(); 422 return REPLACEMENT_CHARACTER; 423 } 424 425 // anything else 426 // Return the current input code point. 427 return input; 428} 429 430// https://www.w3.org/TR/css-syntax-3/#consume-ident-like-token 431ErrorOr<Token> Tokenizer::consume_an_ident_like_token() 432{ 433 // This section describes how to consume an ident-like token from a stream of code points. 434 // It returns an <ident-token>, <function-token>, <url-token>, or <bad-url-token>. 435 436 // Consume an ident sequence, and let string be the result. 437 auto string = TRY(consume_an_ident_sequence()); 438 439 // If string’s value is an ASCII case-insensitive match for "url", and the next input code 440 // point is U+0028 LEFT PARENTHESIS ((), consume it. 441 if (Infra::is_ascii_case_insensitive_match(string, "url"sv) && is_left_paren(peek_code_point())) { 442 (void)next_code_point(); 443 444 // While the next two input code points are whitespace, consume the next input code point. 445 for (;;) { 446 auto maybe_whitespace = peek_twin(); 447 if (!(is_whitespace(maybe_whitespace.first) && is_whitespace(maybe_whitespace.second))) { 448 break; 449 } 450 451 (void)next_code_point(); 452 } 453 454 // If the next one or two input code points are U+0022 QUOTATION MARK ("), U+0027 APOSTROPHE ('), 455 // or whitespace followed by U+0022 QUOTATION MARK (") or U+0027 APOSTROPHE ('), then create a 456 // <function-token> with its value set to string and return it. 457 auto next_two = peek_twin(); 458 if (is_quotation_mark(next_two.first) || is_apostrophe(next_two.first) || (is_whitespace(next_two.first) && (is_quotation_mark(next_two.second) || is_apostrophe(next_two.second)))) { 459 return create_value_token(Token::Type::Function, move(string)); 460 } 461 462 // Otherwise, consume a url token, and return it. 463 return consume_a_url_token(); 464 } 465 466 // Otherwise, if the next input code point is U+0028 LEFT PARENTHESIS ((), consume it. 467 if (is_left_paren(peek_code_point())) { 468 (void)next_code_point(); 469 470 // Create a <function-token> with its value set to string and return it. 471 return create_value_token(Token::Type::Function, move(string)); 472 } 473 474 // Otherwise, create an <ident-token> with its value set to string and return it. 475 return create_value_token(Token::Type::Ident, move(string)); 476} 477 478// https://www.w3.org/TR/css-syntax-3/#consume-number 479Number Tokenizer::consume_a_number() 480{ 481 // This section describes how to consume a number from a stream of code points. 482 // It returns a numeric value, and a type which is either "integer" or "number". 483 // 484 // Note: This algorithm does not do the verification of the first few code points 485 // that are necessary to ensure a number can be obtained from the stream. Ensure 486 // that the stream starts with a number before calling this algorithm. 487 488 // Execute the following steps in order: 489 490 // 1. Initially set type to "integer". Let repr be the empty string. 491 StringBuilder repr; 492 Number::Type type = Number::Type::Integer; 493 494 // 2. If the next input code point is U+002B PLUS SIGN (+) or U+002D HYPHEN-MINUS (-), 495 // consume it and append it to repr. 496 bool has_explicit_sign = false; 497 auto next_input = peek_code_point(); 498 if (is_plus_sign(next_input) || is_hyphen_minus(next_input)) { 499 has_explicit_sign = true; 500 repr.append_code_point(next_code_point()); 501 } 502 503 // 3. While the next input code point is a digit, consume it and append it to repr. 504 for (;;) { 505 auto digits = peek_code_point(); 506 if (!is_ascii_digit(digits)) 507 break; 508 509 repr.append_code_point(next_code_point()); 510 } 511 512 // 4. If the next 2 input code points are U+002E FULL STOP (.) followed by a digit, then: 513 auto maybe_number = peek_twin(); 514 if (is_full_stop(maybe_number.first) && is_ascii_digit(maybe_number.second)) { 515 // 1. Consume them. 516 // 2. Append them to repr. 517 repr.append_code_point(next_code_point()); 518 repr.append_code_point(next_code_point()); 519 520 // 3. Set type to "number". 521 type = Number::Type::Number; 522 523 // 4. While the next input code point is a digit, consume it and append it to repr. 524 for (;;) { 525 auto digit = peek_code_point(); 526 if (!is_ascii_digit(digit)) 527 break; 528 529 repr.append_code_point(next_code_point()); 530 } 531 } 532 533 // 5. If the next 2 or 3 input code points are U+0045 LATIN CAPITAL LETTER E (E) or 534 // U+0065 LATIN SMALL LETTER E (e), optionally followed by U+002D HYPHEN-MINUS (-) 535 // or U+002B PLUS SIGN (+), followed by a digit, then: 536 auto maybe_exp = peek_triplet(); 537 if (is_E(maybe_exp.first) || is_e(maybe_exp.first)) { 538 // 1. Consume them. 539 // 2. Append them to repr. 540 // FIXME: These conditions should be part of step 5 above. 541 if (is_plus_sign(maybe_exp.second) || is_hyphen_minus(maybe_exp.second)) { 542 if (is_ascii_digit(maybe_exp.third)) { 543 repr.append_code_point(next_code_point()); 544 repr.append_code_point(next_code_point()); 545 repr.append_code_point(next_code_point()); 546 } 547 } else if (is_ascii_digit(maybe_exp.second)) { 548 repr.append_code_point(next_code_point()); 549 repr.append_code_point(next_code_point()); 550 } 551 552 // 3. Set type to "number". 553 type = Number::Type::Number; 554 555 // 4. While the next input code point is a digit, consume it and append it to repr. 556 for (;;) { 557 auto digits = peek_code_point(); 558 if (!is_ascii_digit(digits)) 559 break; 560 561 repr.append_code_point(next_code_point()); 562 } 563 } 564 565 // 6. Convert repr to a number, and set the value to the returned value. 566 auto value = convert_a_string_to_a_number(repr.string_view()); 567 568 // 7. Return value and type. 569 if (type == Number::Type::Integer && has_explicit_sign) 570 return Number { Number::Type::IntegerWithExplicitSign, value }; 571 return Number { type, value }; 572} 573 574// https://www.w3.org/TR/css-syntax-3/#convert-string-to-number 575float Tokenizer::convert_a_string_to_a_number(StringView string) 576{ 577 // FIXME: We already found the whole part, fraction part and exponent during 578 // validation, we could probably skip 579 return string.to_float(AK::TrimWhitespace::No).release_value(); 580} 581 582// https://www.w3.org/TR/css-syntax-3/#consume-name 583ErrorOr<FlyString> Tokenizer::consume_an_ident_sequence() 584{ 585 // This section describes how to consume an ident sequence from a stream of code points. 586 // It returns a string containing the largest name that can be formed from adjacent 587 // code points in the stream, starting from the first. 588 // 589 // Note: This algorithm does not do the verification of the first few code points that 590 // are necessary to ensure the returned code points would constitute an <ident-token>. 591 // If that is the intended use, ensure that the stream starts with an ident sequence before 592 // calling this algorithm. 593 594 // Let result initially be an empty string. 595 StringBuilder result; 596 597 // Repeatedly consume the next input code point from the stream: 598 for (;;) { 599 auto input = next_code_point(); 600 601 if (is_eof(input)) 602 break; 603 604 // name code point 605 if (is_ident_code_point(input)) { 606 // Append the code point to result. 607 TRY(result.try_append_code_point(input)); 608 continue; 609 } 610 611 // the stream starts with a valid escape 612 if (is_valid_escape_sequence(start_of_input_stream_twin())) { 613 // Consume an escaped code point. Append the returned code point to result. 614 TRY(result.try_append_code_point(consume_escaped_code_point())); 615 continue; 616 } 617 618 // anything else 619 // Reconsume the current input code point. Return result. 620 reconsume_current_input_code_point(); 621 break; 622 } 623 624 return result.to_fly_string(); 625} 626 627// https://www.w3.org/TR/css-syntax-3/#consume-url-token 628ErrorOr<Token> Tokenizer::consume_a_url_token() 629{ 630 // This section describes how to consume a url token from a stream of code points. 631 // It returns either a <url-token> or a <bad-url-token>. 632 // 633 // Note: This algorithm assumes that the initial "url(" has already been consumed. 634 // This algorithm also assumes that it’s being called to consume an "unquoted" value, 635 // like url(foo). A quoted value, like url("foo"), is parsed as a <function-token>. 636 // Consume an ident-like token automatically handles this distinction; this algorithm 637 // shouldn’t be called directly otherwise. 638 639 // 1. Initially create a <url-token> with its value set to the empty string. 640 auto token = create_new_token(Token::Type::Url); 641 StringBuilder builder; 642 643 // 2. Consume as much whitespace as possible. 644 consume_as_much_whitespace_as_possible(); 645 646 auto make_token = [&]() -> ErrorOr<Token> { 647 token.m_value = TRY(FlyString::from_utf8(builder.string_view())); 648 return token; 649 }; 650 651 // 3. Repeatedly consume the next input code point from the stream: 652 for (;;) { 653 auto input = next_code_point(); 654 655 // U+0029 RIGHT PARENTHESIS ()) 656 if (is_right_paren(input)) { 657 // Return the <url-token>. 658 return make_token(); 659 } 660 661 // EOF 662 if (is_eof(input)) { 663 // This is a parse error. Return the <url-token>. 664 log_parse_error(); 665 return make_token(); 666 } 667 668 // whitespace 669 if (is_whitespace(input)) { 670 // Consume as much whitespace as possible. 671 consume_as_much_whitespace_as_possible(); 672 673 // If the next input code point is U+0029 RIGHT PARENTHESIS ()) or EOF, consume it 674 // and return the <url-token> (if EOF was encountered, this is a parse error); 675 input = peek_code_point(); 676 677 if (is_right_paren(input)) { 678 (void)next_code_point(); 679 return make_token(); 680 } 681 682 if (is_eof(input)) { 683 (void)next_code_point(); 684 log_parse_error(); 685 return make_token(); 686 } 687 688 // otherwise, consume the remnants of a bad url, create a <bad-url-token>, and return it. 689 consume_the_remnants_of_a_bad_url(); 690 return create_new_token(Token::Type::BadUrl); 691 } 692 693 // U+0022 QUOTATION MARK (") 694 // U+0027 APOSTROPHE (') 695 // U+0028 LEFT PARENTHESIS (() 696 // non-printable code point 697 if (is_quotation_mark(input) || is_apostrophe(input) || is_left_paren(input) || is_non_printable(input)) { 698 // This is a parse error. Consume the remnants of a bad url, create a <bad-url-token>, and return it. 699 log_parse_error(); 700 consume_the_remnants_of_a_bad_url(); 701 return create_new_token(Token::Type::BadUrl); 702 } 703 704 // U+005C REVERSE SOLIDUS (\) 705 if (is_reverse_solidus(input)) { 706 // If the stream starts with a valid escape, 707 if (is_valid_escape_sequence(start_of_input_stream_twin())) { 708 // consume an escaped code point and append the returned code point to the <url-token>’s value. 709 builder.append_code_point(consume_escaped_code_point()); 710 continue; 711 } else { 712 // Otherwise, this is a parse error. 713 log_parse_error(); 714 // Consume the remnants of a bad url, create a <bad-url-token>, and return it. 715 consume_the_remnants_of_a_bad_url(); 716 return create_new_token(Token::Type::BadUrl); 717 } 718 } 719 720 // anything else 721 // Append the current input code point to the <url-token>’s value. 722 builder.append_code_point(input); 723 } 724} 725 726// https://www.w3.org/TR/css-syntax-3/#consume-remnants-of-bad-url 727void Tokenizer::consume_the_remnants_of_a_bad_url() 728{ 729 // This section describes how to consume the remnants of a bad url from a stream of code points, 730 // "cleaning up" after the tokenizer realizes that it’s in the middle of a <bad-url-token> rather 731 // than a <url-token>. It returns nothing; its sole use is to consume enough of the input stream 732 // to reach a recovery point where normal tokenizing can resume. 733 734 // Repeatedly consume the next input code point from the stream: 735 for (;;) { 736 auto input = next_code_point(); 737 738 // U+0029 RIGHT PARENTHESIS ()) 739 // EOF 740 if (is_eof(input) || is_right_paren(input)) { 741 // Return. 742 return; 743 } 744 745 // the input stream starts with a valid escape 746 if (is_valid_escape_sequence(start_of_input_stream_twin())) { 747 // Consume an escaped code point. 748 // This allows an escaped right parenthesis ("\)") to be encountered without ending 749 // the <bad-url-token>. This is otherwise identical to the "anything else" clause. 750 (void)consume_escaped_code_point(); 751 } 752 753 // anything else 754 // Do nothing. 755 } 756} 757 758void Tokenizer::consume_as_much_whitespace_as_possible() 759{ 760 while (is_whitespace(peek_code_point())) { 761 (void)next_code_point(); 762 } 763} 764 765void Tokenizer::reconsume_current_input_code_point() 766{ 767 m_utf8_iterator = m_prev_utf8_iterator; 768 m_position = m_prev_position; 769} 770 771// https://www.w3.org/TR/css-syntax-3/#consume-numeric-token 772ErrorOr<Token> Tokenizer::consume_a_numeric_token() 773{ 774 // This section describes how to consume a numeric token from a stream of code points. 775 // It returns either a <number-token>, <percentage-token>, or <dimension-token>. 776 777 // Consume a number and let number be the result. 778 auto number = consume_a_number(); 779 780 // If the next 3 input code points would start an ident sequence, then: 781 if (would_start_an_ident_sequence(peek_triplet())) { 782 // 1. Create a <dimension-token> with the same value and type flag as number, 783 // and a unit set initially to the empty string. 784 auto token = create_new_token(Token::Type::Dimension); 785 token.m_number_value = number; 786 787 // 2. Consume an ident sequence. Set the <dimension-token>’s unit to the returned value. 788 auto unit = TRY(consume_an_ident_sequence()); 789 VERIFY(!unit.is_empty()); 790 // NOTE: We intentionally store this in the `value`, to save space. 791 token.m_value = move(unit); 792 793 // 3. Return the <dimension-token>. 794 return token; 795 } 796 797 // Otherwise, if the next input code point is U+0025 PERCENTAGE SIGN (%), consume it. 798 if (is_percent(peek_code_point())) { 799 (void)next_code_point(); 800 801 // Create a <percentage-token> with the same value as number, and return it. 802 auto token = create_new_token(Token::Type::Percentage); 803 token.m_number_value = number; 804 return token; 805 } 806 807 // Otherwise, create a <number-token> with the same value and type flag as number, and return it. 808 auto token = create_new_token(Token::Type::Number); 809 token.m_number_value = number; 810 return token; 811} 812 813// https://www.w3.org/TR/css-syntax-3/#starts-with-a-number 814bool Tokenizer::would_start_a_number(U32Triplet values) 815{ 816 // This section describes how to check if three code points would start a number. 817 // The algorithm described here can be called explicitly with three code points, 818 // or can be called with the input stream itself. In the latter case, the three 819 // code points in question are the current input code point and the next two input 820 // code points, in that order. 821 // 822 // Note: This algorithm will not consume any additional code points. 823 824 // Look at the first code point: 825 826 // U+002B PLUS SIGN (+) 827 // U+002D HYPHEN-MINUS (-) 828 if (is_plus_sign(values.first) || is_hyphen_minus(values.first)) { 829 // If the second code point is a digit, return true. 830 if (is_ascii_digit(values.second)) 831 return true; 832 833 // Otherwise, if the second code point is a U+002E FULL STOP (.) and the third 834 // code point is a digit, return true. 835 if (is_full_stop(values.second) && is_ascii_digit(values.third)) 836 return true; 837 838 // Otherwise, return false. 839 return false; 840 } 841 842 // U+002E FULL STOP (.) 843 if (is_full_stop(values.first)) 844 // If the second code point is a digit, return true. Otherwise, return false. 845 return is_ascii_digit(values.second); 846 847 // digit 848 if (is_ascii_digit(values.first)) 849 // Return true. 850 return true; 851 852 // anything else 853 // Return false. 854 return false; 855} 856 857// https://www.w3.org/TR/css-syntax-3/#starts-with-a-valid-escape 858bool Tokenizer::is_valid_escape_sequence(U32Twin values) 859{ 860 // This section describes how to check if two code points are a valid escape. 861 // The algorithm described here can be called explicitly with two code points, 862 // or can be called with the input stream itself. In the latter case, the two 863 // code points in question are the current input code point and the next input 864 // code point, in that order. 865 // 866 // Note: This algorithm will not consume any additional code point. 867 868 // If the first code point is not U+005C REVERSE SOLIDUS (\), return false. 869 if (!is_reverse_solidus(values.first)) 870 return false; 871 872 // Otherwise, if the second code point is a newline, return false. 873 if (is_newline(values.second)) 874 return false; 875 876 // Otherwise, return true. 877 return true; 878} 879 880// https://www.w3.org/TR/css-syntax-3/#would-start-an-identifier 881bool Tokenizer::would_start_an_ident_sequence(U32Triplet values) 882{ 883 // This section describes how to check if three code points would start an ident sequence. 884 // The algorithm described here can be called explicitly with three code points, or 885 // can be called with the input stream itself. In the latter case, the three code 886 // points in question are the current input code point and the next two input code 887 // points, in that order. 888 // 889 // Note: This algorithm will not consume any additional code points. 890 891 // Look at the first code point: 892 893 // U+002D HYPHEN-MINUS 894 if (is_hyphen_minus(values.first)) { 895 // If the second code point is a name-start code point or a U+002D HYPHEN-MINUS, 896 // or the second and third code points are a valid escape, return true. 897 if (is_ident_start_code_point(values.second) || is_hyphen_minus(values.second) || is_valid_escape_sequence(values.to_twin_23())) 898 return true; 899 // Otherwise, return false. 900 return false; 901 } 902 903 // name-start code point 904 if (is_ident_start_code_point(values.first)) { 905 // Return true. 906 return true; 907 } 908 909 // U+005C REVERSE SOLIDUS (\) 910 if (is_reverse_solidus(values.first)) { 911 // If the first and second code points are a valid escape, return true. 912 if (is_valid_escape_sequence(values.to_twin_12())) 913 return true; 914 // Otherwise, return false. 915 return false; 916 } 917 918 // anything else 919 // Return false. 920 return false; 921} 922 923// https://www.w3.org/TR/css-syntax-3/#consume-string-token 924ErrorOr<Token> Tokenizer::consume_string_token(u32 ending_code_point) 925{ 926 // This section describes how to consume a string token from a stream of code points. 927 // It returns either a <string-token> or <bad-string-token>. 928 // 929 // This algorithm may be called with an ending code point, which denotes the code point 930 // that ends the string. If an ending code point is not specified, the current input 931 // code point is used. 932 933 // Initially create a <string-token> with its value set to the empty string. 934 auto token = create_new_token(Token::Type::String); 935 StringBuilder builder; 936 937 auto make_token = [&]() -> ErrorOr<Token> { 938 token.m_value = TRY(FlyString::from_utf8(builder.string_view())); 939 return token; 940 }; 941 942 // Repeatedly consume the next input code point from the stream: 943 for (;;) { 944 auto input = next_code_point(); 945 946 // ending code point 947 if (input == ending_code_point) 948 return make_token(); 949 950 // EOF 951 if (is_eof(input)) { 952 // This is a parse error. Return the <string-token>. 953 log_parse_error(); 954 return make_token(); 955 } 956 957 // newline 958 if (is_newline(input)) { 959 // This is a parse error. Reconsume the current input code point, create a 960 // <bad-string-token>, and return it. 961 reconsume_current_input_code_point(); 962 return create_new_token(Token::Type::BadString); 963 } 964 965 // U+005C REVERSE SOLIDUS (\) 966 if (is_reverse_solidus(input)) { 967 // If the next input code point is EOF, do nothing. 968 auto next_input = peek_code_point(); 969 if (is_eof(next_input)) 970 continue; 971 972 // Otherwise, if the next input code point is a newline, consume it. 973 if (is_newline(next_input)) { 974 (void)next_code_point(); 975 continue; 976 } 977 978 // Otherwise, (the stream starts with a valid escape) consume an escaped code 979 // point and append the returned code point to the <string-token>’s value. 980 auto escaped = consume_escaped_code_point(); 981 builder.append_code_point(escaped); 982 continue; 983 } 984 985 // anything else 986 // Append the current input code point to the <string-token>’s value. 987 builder.append_code_point(input); 988 } 989} 990 991// https://www.w3.org/TR/css-syntax-3/#consume-comment 992void Tokenizer::consume_comments() 993{ 994 // This section describes how to consume comments from a stream of code points. 995 // It returns nothing. 996 997start: 998 // If the next two input code point are U+002F SOLIDUS (/) followed by a U+002A ASTERISK (*), 999 // consume them and all following code points up to and including the first U+002A ASTERISK (*) 1000 // followed by a U+002F SOLIDUS (/), or up to an EOF code point. Return to the start of this step. 1001 // 1002 // If the preceding paragraph ended by consuming an EOF code point, this is a parse error. 1003 // 1004 // Return nothing. 1005 auto twin = peek_twin(); 1006 if (!(is_solidus(twin.first) && is_asterisk(twin.second))) 1007 return; 1008 1009 (void)next_code_point(); 1010 (void)next_code_point(); 1011 1012 for (;;) { 1013 auto twin_inner = peek_twin(); 1014 if (is_eof(twin_inner.first) || is_eof(twin_inner.second)) { 1015 log_parse_error(); 1016 return; 1017 } 1018 1019 if (is_asterisk(twin_inner.first) && is_solidus(twin_inner.second)) { 1020 (void)next_code_point(); 1021 (void)next_code_point(); 1022 goto start; 1023 } 1024 1025 (void)next_code_point(); 1026 } 1027} 1028 1029// https://www.w3.org/TR/css-syntax-3/#consume-token 1030ErrorOr<Token> Tokenizer::consume_a_token() 1031{ 1032 // This section describes how to consume a token from a stream of code points. 1033 // It will return a single token of any type. 1034 1035 // Consume comments. 1036 consume_comments(); 1037 1038 // Consume the next input code point. 1039 auto input = next_code_point(); 1040 1041 // whitespace 1042 if (is_whitespace(input)) { 1043 dbgln_if(CSS_TOKENIZER_DEBUG, "is whitespace"); 1044 // Consume as much whitespace as possible. Return a <whitespace-token>. 1045 consume_as_much_whitespace_as_possible(); 1046 return create_new_token(Token::Type::Whitespace); 1047 } 1048 1049 // U+0022 QUOTATION MARK (") 1050 if (is_quotation_mark(input)) { 1051 dbgln_if(CSS_TOKENIZER_DEBUG, "is quotation mark"); 1052 // Consume a string token and return it. 1053 return consume_string_token(input); 1054 } 1055 1056 // U+0023 NUMBER SIGN (#) 1057 if (is_number_sign(input)) { 1058 dbgln_if(CSS_TOKENIZER_DEBUG, "is number sign"); 1059 1060 // If the next input code point is an ident code point or the next two input code points 1061 // are a valid escape, then: 1062 auto next_input = peek_code_point(); 1063 auto maybe_escape = peek_twin(); 1064 1065 if (is_ident_code_point(next_input) || is_valid_escape_sequence(maybe_escape)) { 1066 // 1. Create a <hash-token>. 1067 auto token = create_new_token(Token::Type::Hash); 1068 1069 // 2. If the next 3 input code points would start an ident sequence, set the <hash-token>’s 1070 // type flag to "id". 1071 if (would_start_an_ident_sequence(peek_triplet())) 1072 token.m_hash_type = Token::HashType::Id; 1073 1074 // 3. Consume an ident sequence, and set the <hash-token>’s value to the returned string. 1075 auto name = TRY(consume_an_ident_sequence()); 1076 token.m_value = move(name); 1077 1078 // 4. Return the <hash-token>. 1079 return token; 1080 } 1081 1082 // Otherwise, return a <delim-token> with its value set to the current input code point. 1083 return create_value_token(Token::Type::Delim, input); 1084 } 1085 1086 // U+0027 APOSTROPHE (') 1087 if (is_apostrophe(input)) { 1088 dbgln_if(CSS_TOKENIZER_DEBUG, "is apostrophe"); 1089 // Consume a string token and return it. 1090 return consume_string_token(input); 1091 } 1092 1093 // U+0028 LEFT PARENTHESIS (() 1094 if (is_left_paren(input)) { 1095 dbgln_if(CSS_TOKENIZER_DEBUG, "is left paren"); 1096 // Return a <(-token>. 1097 return create_new_token(Token::Type::OpenParen); 1098 } 1099 1100 // U+0029 RIGHT PARENTHESIS ()) 1101 if (is_right_paren(input)) { 1102 dbgln_if(CSS_TOKENIZER_DEBUG, "is right paren"); 1103 // Return a <)-token>. 1104 return create_new_token(Token::Type::CloseParen); 1105 } 1106 1107 // U+002B PLUS SIGN (+) 1108 if (is_plus_sign(input)) { 1109 dbgln_if(CSS_TOKENIZER_DEBUG, "is plus sign"); 1110 // If the input stream starts with a number, reconsume the current input code point, 1111 // consume a numeric token and return it. 1112 if (would_start_a_number(start_of_input_stream_triplet())) { 1113 reconsume_current_input_code_point(); 1114 return consume_a_numeric_token(); 1115 } 1116 1117 // Otherwise, return a <delim-token> with its value set to the current input code point. 1118 return create_value_token(Token::Type::Delim, input); 1119 } 1120 1121 // U+002C COMMA (,) 1122 if (is_comma(input)) { 1123 dbgln_if(CSS_TOKENIZER_DEBUG, "is comma"); 1124 // Return a <comma-token>. 1125 return create_new_token(Token::Type::Comma); 1126 } 1127 1128 // U+002D HYPHEN-MINUS (-) 1129 if (is_hyphen_minus(input)) { 1130 dbgln_if(CSS_TOKENIZER_DEBUG, "is hyphen minus"); 1131 // If the input stream starts with a number, reconsume the current input code point, 1132 // consume a numeric token, and return it. 1133 if (would_start_a_number(start_of_input_stream_triplet())) { 1134 reconsume_current_input_code_point(); 1135 return consume_a_numeric_token(); 1136 } 1137 1138 // Otherwise, if the next 2 input code points are U+002D HYPHEN-MINUS U+003E 1139 // GREATER-THAN SIGN (->), consume them and return a <CDC-token>. 1140 auto next_twin = peek_twin(); 1141 if (is_hyphen_minus(next_twin.first) && is_greater_than_sign(next_twin.second)) { 1142 (void)next_code_point(); 1143 (void)next_code_point(); 1144 1145 return create_new_token(Token::Type::CDC); 1146 } 1147 1148 // Otherwise, if the input stream starts with an identifier, reconsume the current 1149 // input code point, consume an ident-like token, and return it. 1150 if (would_start_an_ident_sequence(start_of_input_stream_triplet())) { 1151 reconsume_current_input_code_point(); 1152 return consume_an_ident_like_token(); 1153 } 1154 1155 // Otherwise, return a <delim-token> with its value set to the current input code point. 1156 return create_value_token(Token::Type::Delim, input); 1157 } 1158 1159 // U+002E FULL STOP (.) 1160 if (is_full_stop(input)) { 1161 dbgln_if(CSS_TOKENIZER_DEBUG, "is full stop"); 1162 // If the input stream starts with a number, reconsume the current input code point, 1163 // consume a numeric token, and return it. 1164 if (would_start_a_number(start_of_input_stream_triplet())) { 1165 reconsume_current_input_code_point(); 1166 return consume_a_numeric_token(); 1167 } 1168 1169 // Otherwise, return a <delim-token> with its value set to the current input code point. 1170 return create_value_token(Token::Type::Delim, input); 1171 } 1172 1173 // U+003A COLON (:) 1174 if (is_colon(input)) { 1175 dbgln_if(CSS_TOKENIZER_DEBUG, "is colon"); 1176 // Return a <colon-token>. 1177 return create_new_token(Token::Type::Colon); 1178 } 1179 1180 // U+003B SEMICOLON (;) 1181 if (is_semicolon(input)) { 1182 dbgln_if(CSS_TOKENIZER_DEBUG, "is semicolon"); 1183 // Return a <semicolon-token>. 1184 return create_new_token(Token::Type::Semicolon); 1185 } 1186 1187 // U+003C LESS-THAN SIGN (<) 1188 if (is_less_than_sign(input)) { 1189 dbgln_if(CSS_TOKENIZER_DEBUG, "is less than"); 1190 // If the next 3 input code points are U+0021 EXCLAMATION MARK U+002D HYPHEN-MINUS 1191 // U+002D HYPHEN-MINUS (!--), consume them and return a <CDO-token>. 1192 auto maybe_cdo = peek_triplet(); 1193 if (is_exclamation_mark(maybe_cdo.first) && is_hyphen_minus(maybe_cdo.second) && is_hyphen_minus(maybe_cdo.third)) { 1194 (void)next_code_point(); 1195 (void)next_code_point(); 1196 (void)next_code_point(); 1197 1198 return create_new_token(Token::Type::CDO); 1199 } 1200 1201 // Otherwise, return a <delim-token> with its value set to the current input code point. 1202 return create_value_token(Token::Type::Delim, input); 1203 } 1204 1205 // U+0040 COMMERCIAL AT (@) 1206 if (is_at(input)) { 1207 dbgln_if(CSS_TOKENIZER_DEBUG, "is at"); 1208 // If the next 3 input code points would start an ident sequence, consume an ident sequence, create 1209 // an <at-keyword-token> with its value set to the returned value, and return it. 1210 if (would_start_an_ident_sequence(peek_triplet())) { 1211 auto name = TRY(consume_an_ident_sequence()); 1212 return create_value_token(Token::Type::AtKeyword, move(name)); 1213 } 1214 1215 // Otherwise, return a <delim-token> with its value set to the current input code point. 1216 return create_value_token(Token::Type::Delim, input); 1217 } 1218 1219 // U+005B LEFT SQUARE BRACKET ([) 1220 if (is_open_square_bracket(input)) { 1221 dbgln_if(CSS_TOKENIZER_DEBUG, "is open square"); 1222 // Return a <[-token>. 1223 return create_new_token(Token::Type::OpenSquare); 1224 } 1225 1226 // U+005C REVERSE SOLIDUS (\) 1227 if (is_reverse_solidus(input)) { 1228 dbgln_if(CSS_TOKENIZER_DEBUG, "is reverse solidus"); 1229 // If the input stream starts with a valid escape, reconsume the current input code point, 1230 // consume an ident-like token, and return it. 1231 if (is_valid_escape_sequence(start_of_input_stream_twin())) { 1232 reconsume_current_input_code_point(); 1233 return consume_an_ident_like_token(); 1234 } 1235 1236 // Otherwise, this is a parse error. Return a <delim-token> with its value set to the 1237 // current input code point. 1238 log_parse_error(); 1239 return create_value_token(Token::Type::Delim, input); 1240 } 1241 1242 // U+005D RIGHT SQUARE BRACKET (]) 1243 if (is_closed_square_bracket(input)) { 1244 dbgln_if(CSS_TOKENIZER_DEBUG, "is closed square"); 1245 // Return a <]-token>. 1246 return create_new_token(Token::Type::CloseSquare); 1247 } 1248 1249 // U+007B LEFT CURLY BRACKET ({) 1250 if (is_open_curly_bracket(input)) { 1251 dbgln_if(CSS_TOKENIZER_DEBUG, "is open curly"); 1252 // Return a <{-token>. 1253 return create_new_token(Token::Type::OpenCurly); 1254 } 1255 1256 // U+007D RIGHT CURLY BRACKET (}) 1257 if (is_closed_curly_bracket(input)) { 1258 dbgln_if(CSS_TOKENIZER_DEBUG, "is closed curly"); 1259 // Return a <}-token>. 1260 return create_new_token(Token::Type::CloseCurly); 1261 } 1262 1263 // digit 1264 if (is_ascii_digit(input)) { 1265 dbgln_if(CSS_TOKENIZER_DEBUG, "is digit"); 1266 // Reconsume the current input code point, consume a numeric token, and return it. 1267 reconsume_current_input_code_point(); 1268 return consume_a_numeric_token(); 1269 } 1270 1271 // name-start code point 1272 if (is_ident_start_code_point(input)) { 1273 dbgln_if(CSS_TOKENIZER_DEBUG, "is name start"); 1274 // Reconsume the current input code point, consume an ident-like token, and return it. 1275 reconsume_current_input_code_point(); 1276 return consume_an_ident_like_token(); 1277 } 1278 1279 // EOF 1280 if (is_eof(input)) { 1281 // Return an <EOF-token>. 1282 return create_new_token(Token::Type::EndOfFile); 1283 } 1284 1285 // anything else 1286 dbgln_if(CSS_TOKENIZER_DEBUG, "is delimiter"); 1287 // Return a <delim-token> with its value set to the current input code point. 1288 return create_value_token(Token::Type::Delim, input); 1289} 1290 1291}