Serenity Operating System
at master 2895 lines 112 kB view raw
1/* 2 * Copyright (c) 2020, Andreas Kling <kling@serenityos.org> 3 * Copyright (c) 2022, Linus Groh <linusg@serenityos.org> 4 * 5 * SPDX-License-Identifier: BSD-2-Clause 6 */ 7 8#include <AK/CharacterTypes.h> 9#include <AK/Debug.h> 10#include <AK/SourceLocation.h> 11#include <LibTextCodec/Decoder.h> 12#include <LibWeb/HTML/Parser/Entities.h> 13#include <LibWeb/HTML/Parser/HTMLParser.h> 14#include <LibWeb/HTML/Parser/HTMLToken.h> 15#include <LibWeb/HTML/Parser/HTMLTokenizer.h> 16#include <LibWeb/Namespace.h> 17#include <string.h> 18 19namespace Web::HTML { 20 21#pragma GCC diagnostic ignored "-Wunused-label" 22 23#define CONSUME_NEXT_INPUT_CHARACTER \ 24 current_input_character = next_code_point(); 25 26#define SWITCH_TO(new_state) \ 27 do { \ 28 VERIFY(m_current_builder.is_empty()); \ 29 SWITCH_TO_WITH_UNCLEAN_BUILDER(new_state); \ 30 } while (0) 31 32#define SWITCH_TO_WITH_UNCLEAN_BUILDER(new_state) \ 33 do { \ 34 will_switch_to(State::new_state); \ 35 m_state = State::new_state; \ 36 CONSUME_NEXT_INPUT_CHARACTER; \ 37 goto new_state; \ 38 } while (0) 39 40#define RECONSUME_IN(new_state) \ 41 do { \ 42 will_reconsume_in(State::new_state); \ 43 m_state = State::new_state; \ 44 goto new_state; \ 45 } while (0) 46 47#define SWITCH_TO_RETURN_STATE \ 48 do { \ 49 will_switch_to(m_return_state); \ 50 m_state = m_return_state; \ 51 goto _StartOfFunction; \ 52 } while (0) 53 54#define RECONSUME_IN_RETURN_STATE \ 55 do { \ 56 will_reconsume_in(m_return_state); \ 57 m_state = m_return_state; \ 58 if (current_input_character.has_value()) \ 59 restore_to(m_prev_utf8_iterator); \ 60 goto _StartOfFunction; \ 61 } while (0) 62 63#define SWITCH_TO_AND_EMIT_CURRENT_TOKEN(new_state) \ 64 do { \ 65 VERIFY(m_current_builder.is_empty()); \ 66 will_switch_to(State::new_state); \ 67 m_state = State::new_state; \ 68 will_emit(m_current_token); \ 69 m_queued_tokens.enqueue(move(m_current_token)); \ 70 return m_queued_tokens.dequeue(); \ 71 } while (0) 72 73#define EMIT_CHARACTER_AND_RECONSUME_IN(code_point, new_state) \ 74 do { \ 75 m_queued_tokens.enqueue(HTMLToken::make_character(code_point)); \ 76 will_reconsume_in(State::new_state); \ 77 m_state = State::new_state; \ 78 goto new_state; \ 79 } while (0) 80 81#define FLUSH_CODEPOINTS_CONSUMED_AS_A_CHARACTER_REFERENCE \ 82 do { \ 83 for (auto code_point : m_temporary_buffer) { \ 84 if (consumed_as_part_of_an_attribute()) { \ 85 m_current_builder.append_code_point(code_point); \ 86 } else { \ 87 create_new_token(HTMLToken::Type::Character); \ 88 m_current_token.set_code_point(code_point); \ 89 m_queued_tokens.enqueue(move(m_current_token)); \ 90 } \ 91 } \ 92 } while (0) 93 94#define DONT_CONSUME_NEXT_INPUT_CHARACTER \ 95 do { \ 96 restore_to(m_prev_utf8_iterator); \ 97 } while (0) 98 99#define ON(code_point) \ 100 if (current_input_character.has_value() && current_input_character.value() == code_point) 101 102#define ON_EOF \ 103 if (!current_input_character.has_value()) 104 105#define ON_ASCII_ALPHA \ 106 if (current_input_character.has_value() && is_ascii_alpha(current_input_character.value())) 107 108#define ON_ASCII_ALPHANUMERIC \ 109 if (current_input_character.has_value() && is_ascii_alphanumeric(current_input_character.value())) 110 111#define ON_ASCII_UPPER_ALPHA \ 112 if (current_input_character.has_value() && is_ascii_upper_alpha(current_input_character.value())) 113 114#define ON_ASCII_LOWER_ALPHA \ 115 if (current_input_character.has_value() && is_ascii_lower_alpha(current_input_character.value())) 116 117#define ON_ASCII_DIGIT \ 118 if (current_input_character.has_value() && is_ascii_digit(current_input_character.value())) 119 120#define ON_ASCII_HEX_DIGIT \ 121 if (current_input_character.has_value() && is_ascii_hex_digit(current_input_character.value())) 122 123#define ON_WHITESPACE \ 124 if (current_input_character.has_value() && is_ascii(*current_input_character) && first_is_one_of(static_cast<char>(*current_input_character), '\t', '\n', '\f', ' ')) 125 126#define ANYTHING_ELSE if (1) 127 128#define EMIT_EOF \ 129 do { \ 130 if (m_has_emitted_eof) \ 131 return {}; \ 132 m_has_emitted_eof = true; \ 133 create_new_token(HTMLToken::Type::EndOfFile); \ 134 will_emit(m_current_token); \ 135 m_queued_tokens.enqueue(move(m_current_token)); \ 136 return m_queued_tokens.dequeue(); \ 137 } while (0) 138 139#define EMIT_CURRENT_TOKEN \ 140 do { \ 141 VERIFY(m_current_builder.is_empty()); \ 142 will_emit(m_current_token); \ 143 m_queued_tokens.enqueue(move(m_current_token)); \ 144 return m_queued_tokens.dequeue(); \ 145 } while (0) 146 147#define EMIT_CHARACTER(code_point) \ 148 do { \ 149 create_new_token(HTMLToken::Type::Character); \ 150 m_current_token.set_code_point(code_point); \ 151 m_queued_tokens.enqueue(move(m_current_token)); \ 152 return m_queued_tokens.dequeue(); \ 153 } while (0) 154 155#define EMIT_CURRENT_CHARACTER \ 156 EMIT_CHARACTER(current_input_character.value()); 157 158#define SWITCH_TO_AND_EMIT_CHARACTER(code_point, new_state) \ 159 do { \ 160 will_switch_to(State::new_state); \ 161 m_state = State::new_state; \ 162 EMIT_CHARACTER(code_point); \ 163 } while (0) 164 165#define SWITCH_TO_AND_EMIT_CURRENT_CHARACTER(new_state) \ 166 SWITCH_TO_AND_EMIT_CHARACTER(current_input_character.value(), new_state) 167 168#define BEGIN_STATE(state) \ 169 state: \ 170 case State::state: { \ 171 { \ 172 { 173 174#define END_STATE \ 175 VERIFY_NOT_REACHED(); \ 176 break; \ 177 } \ 178 } \ 179 } 180 181static inline void log_parse_error(SourceLocation const& location = SourceLocation::current()) 182{ 183 dbgln_if(TOKENIZER_TRACE_DEBUG, "Parse error (tokenization) {}", location); 184} 185 186Optional<u32> HTMLTokenizer::next_code_point() 187{ 188 if (m_utf8_iterator == m_utf8_view.end()) 189 return {}; 190 191 u32 code_point; 192 // https://html.spec.whatwg.org/multipage/parsing.html#preprocessing-the-input-stream:tokenization 193 // https://infra.spec.whatwg.org/#normalize-newlines 194 if (peek_code_point(0).value_or(0) == '\r' && peek_code_point(1).value_or(0) == '\n') { 195 // replace every U+000D CR U+000A LF code point pair with a single U+000A LF code point, 196 skip(2); 197 code_point = '\n'; 198 } else if (peek_code_point(0).value_or(0) == '\r') { 199 // replace every remaining U+000D CR code point with a U+000A LF code point. 200 skip(1); 201 code_point = '\n'; 202 } else { 203 skip(1); 204 code_point = *m_prev_utf8_iterator; 205 } 206 207 dbgln_if(TOKENIZER_TRACE_DEBUG, "(Tokenizer) Next code_point: {}", code_point); 208 return code_point; 209} 210 211void HTMLTokenizer::skip(size_t count) 212{ 213 if (!m_source_positions.is_empty()) 214 m_source_positions.append(m_source_positions.last()); 215 for (size_t i = 0; i < count; ++i) { 216 m_prev_utf8_iterator = m_utf8_iterator; 217 auto code_point = *m_utf8_iterator; 218 if (!m_source_positions.is_empty()) { 219 if (code_point == '\n') { 220 m_source_positions.last().column = 0; 221 m_source_positions.last().line++; 222 } else { 223 m_source_positions.last().column++; 224 } 225 } 226 ++m_utf8_iterator; 227 } 228} 229 230Optional<u32> HTMLTokenizer::peek_code_point(size_t offset) const 231{ 232 auto it = m_utf8_iterator; 233 for (size_t i = 0; i < offset && it != m_utf8_view.end(); ++i) 234 ++it; 235 if (it == m_utf8_view.end()) 236 return {}; 237 return *it; 238} 239 240HTMLToken::Position HTMLTokenizer::nth_last_position(size_t n) 241{ 242 if (n + 1 > m_source_positions.size()) { 243 dbgln_if(TOKENIZER_TRACE_DEBUG, "(Tokenizer::nth_last_position) Invalid position requested: {}th-last of {}. Returning (0-0).", n, m_source_positions.size()); 244 return HTMLToken::Position { 0, 0 }; 245 }; 246 return m_source_positions.at(m_source_positions.size() - 1 - n); 247} 248 249Optional<HTMLToken> HTMLTokenizer::next_token() 250{ 251 if (!m_source_positions.is_empty()) { 252 auto last_position = m_source_positions.last(); 253 m_source_positions.clear_with_capacity(); 254 m_source_positions.append(move(last_position)); 255 } 256_StartOfFunction: 257 if (!m_queued_tokens.is_empty()) 258 return m_queued_tokens.dequeue(); 259 260 if (m_aborted) 261 return {}; 262 263 for (;;) { 264 auto current_input_character = next_code_point(); 265 switch (m_state) { 266 // 13.2.5.1 Data state, https://html.spec.whatwg.org/multipage/parsing.html#data-state 267 BEGIN_STATE(Data) 268 { 269 ON('&') 270 { 271 m_return_state = State::Data; 272 SWITCH_TO(CharacterReference); 273 } 274 ON('<') 275 { 276 SWITCH_TO(TagOpen); 277 } 278 ON(0) 279 { 280 log_parse_error(); 281 EMIT_CURRENT_CHARACTER; 282 } 283 ON_EOF 284 { 285 EMIT_EOF; 286 } 287 ANYTHING_ELSE 288 { 289 EMIT_CURRENT_CHARACTER; 290 } 291 } 292 END_STATE 293 294 // 13.2.5.6 Tag open state, https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state 295 BEGIN_STATE(TagOpen) 296 { 297 ON('!') 298 { 299 SWITCH_TO(MarkupDeclarationOpen); 300 } 301 ON('/') 302 { 303 SWITCH_TO(EndTagOpen); 304 } 305 ON_ASCII_ALPHA 306 { 307 create_new_token(HTMLToken::Type::StartTag); 308 RECONSUME_IN(TagName); 309 } 310 ON('?') 311 { 312 log_parse_error(); 313 create_new_token(HTMLToken::Type::Comment); 314 m_current_token.set_start_position({}, nth_last_position(2)); 315 RECONSUME_IN(BogusComment); 316 } 317 ON_EOF 318 { 319 log_parse_error(); 320 m_queued_tokens.enqueue(HTMLToken::make_character('<')); 321 EMIT_EOF; 322 } 323 ANYTHING_ELSE 324 { 325 log_parse_error(); 326 EMIT_CHARACTER_AND_RECONSUME_IN('<', Data); 327 } 328 } 329 END_STATE 330 331 // 13.2.5.8 Tag name state, https://html.spec.whatwg.org/multipage/parsing.html#tag-name-state 332 BEGIN_STATE(TagName) 333 { 334 ON_WHITESPACE 335 { 336 m_current_token.set_tag_name(consume_current_builder()); 337 m_current_token.set_end_position({}, nth_last_position(1)); 338 SWITCH_TO(BeforeAttributeName); 339 } 340 ON('/') 341 { 342 m_current_token.set_tag_name(consume_current_builder()); 343 m_current_token.set_end_position({}, nth_last_position(0)); 344 SWITCH_TO(SelfClosingStartTag); 345 } 346 ON('>') 347 { 348 m_current_token.set_tag_name(consume_current_builder()); 349 m_current_token.set_end_position({}, nth_last_position(1)); 350 SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data); 351 } 352 ON_ASCII_UPPER_ALPHA 353 { 354 m_current_builder.append_code_point(to_ascii_lowercase(current_input_character.value())); 355 m_current_token.set_end_position({}, nth_last_position(0)); 356 continue; 357 } 358 ON(0) 359 { 360 log_parse_error(); 361 m_current_builder.append_code_point(0xFFFD); 362 m_current_token.set_end_position({}, nth_last_position(0)); 363 continue; 364 } 365 ON_EOF 366 { 367 log_parse_error(); 368 m_current_token.set_end_position({}, nth_last_position(0)); 369 EMIT_EOF; 370 } 371 ANYTHING_ELSE 372 { 373 m_current_builder.append_code_point(current_input_character.value()); 374 m_current_token.set_end_position({}, nth_last_position(0)); 375 continue; 376 } 377 } 378 END_STATE 379 380 // 13.2.5.7 End tag open state, https://html.spec.whatwg.org/multipage/parsing.html#end-tag-open-state 381 BEGIN_STATE(EndTagOpen) 382 { 383 ON_ASCII_ALPHA 384 { 385 create_new_token(HTMLToken::Type::EndTag); 386 RECONSUME_IN(TagName); 387 } 388 ON('>') 389 { 390 log_parse_error(); 391 SWITCH_TO(Data); 392 } 393 ON_EOF 394 { 395 log_parse_error(); 396 m_queued_tokens.enqueue(HTMLToken::make_character('<')); 397 m_queued_tokens.enqueue(HTMLToken::make_character('/')); 398 EMIT_EOF; 399 } 400 ANYTHING_ELSE 401 { 402 log_parse_error(); 403 create_new_token(HTMLToken::Type::Comment); 404 RECONSUME_IN(BogusComment); 405 } 406 } 407 END_STATE 408 409 // 13.2.5.42 Markup declaration open state, https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state 410 BEGIN_STATE(MarkupDeclarationOpen) 411 { 412 DONT_CONSUME_NEXT_INPUT_CHARACTER; 413 if (consume_next_if_match("--"sv)) { 414 create_new_token(HTMLToken::Type::Comment); 415 m_current_token.set_start_position({}, nth_last_position(3)); 416 SWITCH_TO(CommentStart); 417 } 418 if (consume_next_if_match("DOCTYPE"sv, CaseSensitivity::CaseInsensitive)) { 419 SWITCH_TO(DOCTYPE); 420 } 421 if (consume_next_if_match("[CDATA["sv)) { 422 // We keep the parser optional so that syntax highlighting can be lexer-only. 423 // The parser registers itself with the lexer it creates. 424 if (m_parser != nullptr && m_parser->adjusted_current_node().namespace_() != Namespace::HTML) { 425 SWITCH_TO(CDATASection); 426 } else { 427 create_new_token(HTMLToken::Type::Comment); 428 m_current_builder.append("[CDATA["sv); 429 SWITCH_TO_WITH_UNCLEAN_BUILDER(BogusComment); 430 } 431 } 432 ANYTHING_ELSE 433 { 434 log_parse_error(); 435 create_new_token(HTMLToken::Type::Comment); 436 SWITCH_TO(BogusComment); 437 } 438 } 439 END_STATE 440 441 // 13.2.5.41 Bogus comment state, https://html.spec.whatwg.org/multipage/parsing.html#bogus-comment-state 442 BEGIN_STATE(BogusComment) 443 { 444 ON('>') 445 { 446 m_current_token.set_comment(consume_current_builder()); 447 SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data); 448 } 449 ON_EOF 450 { 451 m_queued_tokens.enqueue(move(m_current_token)); 452 EMIT_EOF; 453 } 454 ON(0) 455 { 456 log_parse_error(); 457 m_current_builder.append_code_point(0xFFFD); 458 continue; 459 } 460 ANYTHING_ELSE 461 { 462 m_current_builder.append_code_point(current_input_character.value()); 463 continue; 464 } 465 } 466 END_STATE 467 468 // 13.2.5.53 DOCTYPE state, https://html.spec.whatwg.org/multipage/parsing.html#doctype-state 469 BEGIN_STATE(DOCTYPE) 470 { 471 ON_WHITESPACE 472 { 473 SWITCH_TO(BeforeDOCTYPEName); 474 } 475 ON('>') 476 { 477 RECONSUME_IN(BeforeDOCTYPEName); 478 } 479 ON_EOF 480 { 481 log_parse_error(); 482 create_new_token(HTMLToken::Type::DOCTYPE); 483 m_current_token.ensure_doctype_data().force_quirks = true; 484 m_queued_tokens.enqueue(move(m_current_token)); 485 EMIT_EOF; 486 } 487 ANYTHING_ELSE 488 { 489 log_parse_error(); 490 RECONSUME_IN(BeforeDOCTYPEName); 491 } 492 } 493 END_STATE 494 495 // 13.2.5.54 Before DOCTYPE name state, https://html.spec.whatwg.org/multipage/parsing.html#before-doctype-name-state 496 BEGIN_STATE(BeforeDOCTYPEName) 497 { 498 ON_WHITESPACE 499 { 500 continue; 501 } 502 ON_ASCII_UPPER_ALPHA 503 { 504 create_new_token(HTMLToken::Type::DOCTYPE); 505 m_current_builder.append_code_point(to_ascii_lowercase(current_input_character.value())); 506 m_current_token.ensure_doctype_data().missing_name = false; 507 SWITCH_TO_WITH_UNCLEAN_BUILDER(DOCTYPEName); 508 } 509 ON(0) 510 { 511 log_parse_error(); 512 create_new_token(HTMLToken::Type::DOCTYPE); 513 m_current_builder.append_code_point(0xFFFD); 514 m_current_token.ensure_doctype_data().missing_name = false; 515 SWITCH_TO_WITH_UNCLEAN_BUILDER(DOCTYPEName); 516 } 517 ON('>') 518 { 519 log_parse_error(); 520 create_new_token(HTMLToken::Type::DOCTYPE); 521 m_current_token.ensure_doctype_data().force_quirks = true; 522 SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data); 523 } 524 ON_EOF 525 { 526 log_parse_error(); 527 create_new_token(HTMLToken::Type::DOCTYPE); 528 m_current_token.ensure_doctype_data().force_quirks = true; 529 m_queued_tokens.enqueue(move(m_current_token)); 530 EMIT_EOF; 531 } 532 ANYTHING_ELSE 533 { 534 create_new_token(HTMLToken::Type::DOCTYPE); 535 m_current_builder.append_code_point(current_input_character.value()); 536 m_current_token.ensure_doctype_data().missing_name = false; 537 SWITCH_TO_WITH_UNCLEAN_BUILDER(DOCTYPEName); 538 } 539 } 540 END_STATE 541 542 // 13.2.5.55 DOCTYPE name state, https://html.spec.whatwg.org/multipage/parsing.html#doctype-name-state 543 BEGIN_STATE(DOCTYPEName) 544 { 545 ON_WHITESPACE 546 { 547 m_current_token.ensure_doctype_data().name = consume_current_builder(); 548 SWITCH_TO(AfterDOCTYPEName); 549 } 550 ON('>') 551 { 552 m_current_token.ensure_doctype_data().name = consume_current_builder(); 553 SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data); 554 } 555 ON_ASCII_UPPER_ALPHA 556 { 557 m_current_builder.append_code_point(to_ascii_lowercase(current_input_character.value())); 558 continue; 559 } 560 ON(0) 561 { 562 log_parse_error(); 563 m_current_builder.append_code_point(0xFFFD); 564 continue; 565 } 566 ON_EOF 567 { 568 log_parse_error(); 569 m_current_token.ensure_doctype_data().force_quirks = true; 570 m_queued_tokens.enqueue(move(m_current_token)); 571 EMIT_EOF; 572 } 573 ANYTHING_ELSE 574 { 575 m_current_builder.append_code_point(current_input_character.value()); 576 continue; 577 } 578 } 579 END_STATE 580 581 // 13.2.5.56 After DOCTYPE name state, https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-name-state 582 BEGIN_STATE(AfterDOCTYPEName) 583 { 584 ON_WHITESPACE 585 { 586 continue; 587 } 588 ON('>') 589 { 590 SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data); 591 } 592 ON_EOF 593 { 594 log_parse_error(); 595 m_current_token.ensure_doctype_data().force_quirks = true; 596 m_queued_tokens.enqueue(move(m_current_token)); 597 EMIT_EOF; 598 } 599 ANYTHING_ELSE 600 { 601 if (to_ascii_uppercase(current_input_character.value()) == 'P' && consume_next_if_match("UBLIC"sv, CaseSensitivity::CaseInsensitive)) { 602 SWITCH_TO(AfterDOCTYPEPublicKeyword); 603 } 604 if (to_ascii_uppercase(current_input_character.value()) == 'S' && consume_next_if_match("YSTEM"sv, CaseSensitivity::CaseInsensitive)) { 605 SWITCH_TO(AfterDOCTYPESystemKeyword); 606 } 607 log_parse_error(); 608 m_current_token.ensure_doctype_data().force_quirks = true; 609 RECONSUME_IN(BogusDOCTYPE); 610 } 611 } 612 END_STATE 613 614 // 13.2.5.57 After DOCTYPE public keyword state, https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-public-keyword-state 615 BEGIN_STATE(AfterDOCTYPEPublicKeyword) 616 { 617 ON_WHITESPACE 618 { 619 SWITCH_TO(BeforeDOCTYPEPublicIdentifier); 620 } 621 ON('"') 622 { 623 log_parse_error(); 624 m_current_token.ensure_doctype_data().missing_public_identifier = false; 625 SWITCH_TO(DOCTYPEPublicIdentifierDoubleQuoted); 626 } 627 ON('\'') 628 { 629 log_parse_error(); 630 m_current_token.ensure_doctype_data().missing_public_identifier = false; 631 SWITCH_TO(DOCTYPEPublicIdentifierSingleQuoted); 632 } 633 ON('>') 634 { 635 log_parse_error(); 636 m_current_token.ensure_doctype_data().force_quirks = true; 637 SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data); 638 } 639 ON_EOF 640 { 641 log_parse_error(); 642 m_current_token.ensure_doctype_data().force_quirks = true; 643 m_queued_tokens.enqueue(move(m_current_token)); 644 EMIT_EOF; 645 } 646 ANYTHING_ELSE 647 { 648 log_parse_error(); 649 m_current_token.ensure_doctype_data().force_quirks = true; 650 RECONSUME_IN(BogusDOCTYPE); 651 } 652 } 653 END_STATE 654 655 // 13.2.5.63 After DOCTYPE system keyword state, https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-system-keyword-state 656 BEGIN_STATE(AfterDOCTYPESystemKeyword) 657 { 658 ON_WHITESPACE 659 { 660 SWITCH_TO(BeforeDOCTYPESystemIdentifier); 661 } 662 ON('"') 663 { 664 log_parse_error(); 665 m_current_token.ensure_doctype_data().system_identifier = {}; 666 m_current_token.ensure_doctype_data().missing_system_identifier = false; 667 SWITCH_TO(DOCTYPESystemIdentifierDoubleQuoted); 668 } 669 ON('\'') 670 { 671 log_parse_error(); 672 m_current_token.ensure_doctype_data().system_identifier = {}; 673 m_current_token.ensure_doctype_data().missing_system_identifier = false; 674 SWITCH_TO(DOCTYPESystemIdentifierSingleQuoted); 675 } 676 ON('>') 677 { 678 log_parse_error(); 679 m_current_token.ensure_doctype_data().force_quirks = true; 680 SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data); 681 } 682 ON_EOF 683 { 684 log_parse_error(); 685 m_current_token.ensure_doctype_data().force_quirks = true; 686 m_queued_tokens.enqueue(move(m_current_token)); 687 EMIT_EOF; 688 } 689 ANYTHING_ELSE 690 { 691 log_parse_error(); 692 m_current_token.ensure_doctype_data().force_quirks = true; 693 RECONSUME_IN(BogusDOCTYPE); 694 } 695 } 696 END_STATE 697 698 // 13.2.5.58 Before DOCTYPE public identifier state, https://html.spec.whatwg.org/multipage/parsing.html#before-doctype-public-identifier-state 699 BEGIN_STATE(BeforeDOCTYPEPublicIdentifier) 700 { 701 ON_WHITESPACE 702 { 703 continue; 704 } 705 ON('"') 706 { 707 m_current_token.ensure_doctype_data().missing_public_identifier = false; 708 SWITCH_TO(DOCTYPEPublicIdentifierDoubleQuoted); 709 } 710 ON('\'') 711 { 712 m_current_token.ensure_doctype_data().missing_public_identifier = false; 713 SWITCH_TO(DOCTYPEPublicIdentifierSingleQuoted); 714 } 715 ON('>') 716 { 717 log_parse_error(); 718 m_current_token.ensure_doctype_data().force_quirks = true; 719 SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data); 720 } 721 ON_EOF 722 { 723 log_parse_error(); 724 m_current_token.ensure_doctype_data().force_quirks = true; 725 m_queued_tokens.enqueue(move(m_current_token)); 726 EMIT_EOF; 727 } 728 ANYTHING_ELSE 729 { 730 log_parse_error(); 731 m_current_token.ensure_doctype_data().force_quirks = true; 732 RECONSUME_IN(BogusDOCTYPE); 733 } 734 } 735 END_STATE 736 737 // 13.2.5.64 Before DOCTYPE system identifier state, https://html.spec.whatwg.org/multipage/parsing.html#before-doctype-system-identifier-state 738 BEGIN_STATE(BeforeDOCTYPESystemIdentifier) 739 { 740 ON_WHITESPACE 741 { 742 continue; 743 } 744 ON('"') 745 { 746 m_current_token.ensure_doctype_data().missing_system_identifier = false; 747 SWITCH_TO(DOCTYPESystemIdentifierDoubleQuoted); 748 } 749 ON('\'') 750 { 751 m_current_token.ensure_doctype_data().missing_system_identifier = false; 752 SWITCH_TO(DOCTYPESystemIdentifierSingleQuoted); 753 } 754 ON('>') 755 { 756 log_parse_error(); 757 m_current_token.ensure_doctype_data().force_quirks = true; 758 SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data); 759 } 760 ON_EOF 761 { 762 log_parse_error(); 763 m_current_token.ensure_doctype_data().force_quirks = true; 764 m_queued_tokens.enqueue(move(m_current_token)); 765 EMIT_EOF; 766 } 767 ANYTHING_ELSE 768 { 769 log_parse_error(); 770 m_current_token.ensure_doctype_data().force_quirks = true; 771 RECONSUME_IN(BogusDOCTYPE); 772 } 773 } 774 END_STATE 775 776 // 13.2.5.59 DOCTYPE public identifier (double-quoted) state, https://html.spec.whatwg.org/multipage/parsing.html#doctype-public-identifier-(double-quoted)-state 777 BEGIN_STATE(DOCTYPEPublicIdentifierDoubleQuoted) 778 { 779 ON('"') 780 { 781 m_current_token.ensure_doctype_data().public_identifier = consume_current_builder(); 782 SWITCH_TO(AfterDOCTYPEPublicIdentifier); 783 } 784 ON(0) 785 { 786 log_parse_error(); 787 m_current_builder.append_code_point(0xFFFD); 788 continue; 789 } 790 ON('>') 791 { 792 log_parse_error(); 793 m_current_token.ensure_doctype_data().public_identifier = consume_current_builder(); 794 m_current_token.ensure_doctype_data().force_quirks = true; 795 SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data); 796 } 797 ON_EOF 798 { 799 log_parse_error(); 800 m_current_token.ensure_doctype_data().force_quirks = true; 801 m_queued_tokens.enqueue(move(m_current_token)); 802 EMIT_EOF; 803 } 804 ANYTHING_ELSE 805 { 806 m_current_builder.append_code_point(current_input_character.value()); 807 continue; 808 } 809 } 810 END_STATE 811 812 // 13.2.5.60 DOCTYPE public identifier (single-quoted) state, https://html.spec.whatwg.org/multipage/parsing.html#doctype-public-identifier-(single-quoted)-state 813 BEGIN_STATE(DOCTYPEPublicIdentifierSingleQuoted) 814 { 815 ON('\'') 816 { 817 m_current_token.ensure_doctype_data().public_identifier = consume_current_builder(); 818 SWITCH_TO(AfterDOCTYPEPublicIdentifier); 819 } 820 ON(0) 821 { 822 log_parse_error(); 823 m_current_builder.append_code_point(0xFFFD); 824 continue; 825 } 826 ON('>') 827 { 828 log_parse_error(); 829 m_current_token.ensure_doctype_data().public_identifier = consume_current_builder(); 830 m_current_token.ensure_doctype_data().force_quirks = true; 831 SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data); 832 } 833 ON_EOF 834 { 835 log_parse_error(); 836 m_current_token.ensure_doctype_data().force_quirks = true; 837 m_queued_tokens.enqueue(move(m_current_token)); 838 EMIT_EOF; 839 } 840 ANYTHING_ELSE 841 { 842 m_current_builder.append_code_point(current_input_character.value()); 843 continue; 844 } 845 } 846 END_STATE 847 848 // 13.2.5.65 DOCTYPE system identifier (double-quoted) state, https://html.spec.whatwg.org/multipage/parsing.html#doctype-system-identifier-(double-quoted)-state 849 BEGIN_STATE(DOCTYPESystemIdentifierDoubleQuoted) 850 { 851 ON('"') 852 { 853 m_current_token.ensure_doctype_data().system_identifier = consume_current_builder(); 854 SWITCH_TO(AfterDOCTYPESystemIdentifier); 855 } 856 ON(0) 857 { 858 log_parse_error(); 859 m_current_builder.append_code_point(0xFFFD); 860 continue; 861 } 862 ON('>') 863 { 864 log_parse_error(); 865 m_current_token.ensure_doctype_data().system_identifier = consume_current_builder(); 866 m_current_token.ensure_doctype_data().force_quirks = true; 867 SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data); 868 } 869 ON_EOF 870 { 871 log_parse_error(); 872 m_current_token.ensure_doctype_data().force_quirks = true; 873 m_queued_tokens.enqueue(move(m_current_token)); 874 EMIT_EOF; 875 } 876 ANYTHING_ELSE 877 { 878 m_current_builder.append_code_point(current_input_character.value()); 879 continue; 880 } 881 } 882 END_STATE 883 884 // 13.2.5.66 DOCTYPE system identifier (single-quoted) state, https://html.spec.whatwg.org/multipage/parsing.html#doctype-system-identifier-(single-quoted)-state 885 BEGIN_STATE(DOCTYPESystemIdentifierSingleQuoted) 886 { 887 ON('\'') 888 { 889 m_current_token.ensure_doctype_data().system_identifier = consume_current_builder(); 890 SWITCH_TO(AfterDOCTYPESystemIdentifier); 891 } 892 ON(0) 893 { 894 log_parse_error(); 895 m_current_builder.append_code_point(0xFFFD); 896 continue; 897 } 898 ON('>') 899 { 900 log_parse_error(); 901 m_current_token.ensure_doctype_data().system_identifier = consume_current_builder(); 902 m_current_token.ensure_doctype_data().force_quirks = true; 903 SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data); 904 } 905 ON_EOF 906 { 907 log_parse_error(); 908 m_current_token.ensure_doctype_data().force_quirks = true; 909 m_queued_tokens.enqueue(move(m_current_token)); 910 EMIT_EOF; 911 } 912 ANYTHING_ELSE 913 { 914 m_current_builder.append_code_point(current_input_character.value()); 915 continue; 916 } 917 } 918 END_STATE 919 920 // 13.2.5.61 After DOCTYPE public identifier state, https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-public-identifier-state 921 BEGIN_STATE(AfterDOCTYPEPublicIdentifier) 922 { 923 ON_WHITESPACE 924 { 925 SWITCH_TO(BetweenDOCTYPEPublicAndSystemIdentifiers); 926 } 927 ON('>') 928 { 929 SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data); 930 } 931 ON('"') 932 { 933 log_parse_error(); 934 m_current_token.ensure_doctype_data().missing_system_identifier = false; 935 SWITCH_TO(DOCTYPESystemIdentifierDoubleQuoted); 936 } 937 ON('\'') 938 { 939 log_parse_error(); 940 m_current_token.ensure_doctype_data().missing_system_identifier = false; 941 SWITCH_TO(DOCTYPESystemIdentifierSingleQuoted); 942 } 943 ON_EOF 944 { 945 log_parse_error(); 946 m_current_token.ensure_doctype_data().force_quirks = true; 947 m_queued_tokens.enqueue(move(m_current_token)); 948 EMIT_EOF; 949 } 950 ANYTHING_ELSE 951 { 952 log_parse_error(); 953 m_current_token.ensure_doctype_data().force_quirks = true; 954 RECONSUME_IN(BogusDOCTYPE); 955 } 956 } 957 END_STATE 958 959 // 13.2.5.62 Between DOCTYPE public and system identifiers state, https://html.spec.whatwg.org/multipage/parsing.html#between-doctype-public-and-system-identifiers-state 960 BEGIN_STATE(BetweenDOCTYPEPublicAndSystemIdentifiers) 961 { 962 ON_WHITESPACE 963 { 964 continue; 965 } 966 ON('>') 967 { 968 SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data); 969 } 970 ON('"') 971 { 972 m_current_token.ensure_doctype_data().missing_system_identifier = false; 973 SWITCH_TO(DOCTYPESystemIdentifierDoubleQuoted); 974 } 975 ON('\'') 976 { 977 m_current_token.ensure_doctype_data().missing_system_identifier = false; 978 SWITCH_TO(DOCTYPESystemIdentifierSingleQuoted); 979 } 980 ON_EOF 981 { 982 log_parse_error(); 983 m_current_token.ensure_doctype_data().force_quirks = true; 984 m_queued_tokens.enqueue(move(m_current_token)); 985 EMIT_EOF; 986 } 987 ANYTHING_ELSE 988 { 989 log_parse_error(); 990 m_current_token.ensure_doctype_data().force_quirks = true; 991 RECONSUME_IN(BogusDOCTYPE); 992 } 993 } 994 END_STATE 995 996 // 13.2.5.67 After DOCTYPE system identifier state, https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-system-identifier-state 997 BEGIN_STATE(AfterDOCTYPESystemIdentifier) 998 { 999 ON_WHITESPACE 1000 { 1001 continue; 1002 } 1003 ON('>') 1004 { 1005 SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data); 1006 } 1007 ON_EOF 1008 { 1009 log_parse_error(); 1010 m_current_token.ensure_doctype_data().force_quirks = true; 1011 m_queued_tokens.enqueue(move(m_current_token)); 1012 EMIT_EOF; 1013 } 1014 ANYTHING_ELSE 1015 { 1016 log_parse_error(); 1017 RECONSUME_IN(BogusDOCTYPE); 1018 } 1019 } 1020 END_STATE 1021 1022 // 13.2.5.68 Bogus DOCTYPE state, https://html.spec.whatwg.org/multipage/parsing.html#bogus-doctype-state 1023 BEGIN_STATE(BogusDOCTYPE) 1024 { 1025 ON('>') 1026 { 1027 SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data); 1028 } 1029 ON(0) 1030 { 1031 log_parse_error(); 1032 continue; 1033 } 1034 ON_EOF 1035 { 1036 m_queued_tokens.enqueue(move(m_current_token)); 1037 EMIT_EOF; 1038 } 1039 ANYTHING_ELSE 1040 { 1041 continue; 1042 } 1043 } 1044 END_STATE 1045 1046 // 13.2.5.32 Before attribute name state, https://html.spec.whatwg.org/multipage/parsing.html#before-attribute-name-state 1047 BEGIN_STATE(BeforeAttributeName) 1048 { 1049 ON_WHITESPACE 1050 { 1051 continue; 1052 } 1053 ON('/') 1054 { 1055 if (m_current_token.has_attributes()) 1056 m_current_token.last_attribute().name_end_position = nth_last_position(1); 1057 RECONSUME_IN(AfterAttributeName); 1058 } 1059 ON('>') 1060 { 1061 RECONSUME_IN(AfterAttributeName); 1062 } 1063 ON_EOF 1064 { 1065 RECONSUME_IN(AfterAttributeName); 1066 } 1067 ON('=') 1068 { 1069 log_parse_error(); 1070 HTMLToken::Attribute new_attribute; 1071 new_attribute.name_start_position = nth_last_position(1); 1072 m_current_builder.append_code_point(current_input_character.value()); 1073 m_current_token.add_attribute(move(new_attribute)); 1074 SWITCH_TO_WITH_UNCLEAN_BUILDER(AttributeName); 1075 } 1076 ANYTHING_ELSE 1077 { 1078 HTMLToken::Attribute new_attribute; 1079 new_attribute.name_start_position = nth_last_position(1); 1080 m_current_token.add_attribute(move(new_attribute)); 1081 RECONSUME_IN(AttributeName); 1082 } 1083 } 1084 END_STATE 1085 1086 // 13.2.5.40 Self-closing start tag state, https://html.spec.whatwg.org/multipage/parsing.html#self-closing-start-tag-state 1087 BEGIN_STATE(SelfClosingStartTag) 1088 { 1089 ON('>') 1090 { 1091 m_current_token.set_self_closing(true); 1092 SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data); 1093 } 1094 ON_EOF 1095 { 1096 log_parse_error(); 1097 EMIT_EOF; 1098 } 1099 ANYTHING_ELSE 1100 { 1101 log_parse_error(); 1102 RECONSUME_IN(BeforeAttributeName); 1103 } 1104 } 1105 END_STATE 1106 1107 // 13.2.5.33 Attribute name state, https://html.spec.whatwg.org/multipage/parsing.html#attribute-name-state 1108 BEGIN_STATE(AttributeName) 1109 { 1110 ON_WHITESPACE 1111 { 1112 m_current_token.last_attribute().local_name = consume_current_builder(); 1113 RECONSUME_IN(AfterAttributeName); 1114 } 1115 ON('/') 1116 { 1117 m_current_token.last_attribute().local_name = consume_current_builder(); 1118 RECONSUME_IN(AfterAttributeName); 1119 } 1120 ON('>') 1121 { 1122 m_current_token.last_attribute().local_name = consume_current_builder(); 1123 RECONSUME_IN(AfterAttributeName); 1124 } 1125 ON_EOF 1126 { 1127 m_current_token.last_attribute().local_name = consume_current_builder(); 1128 RECONSUME_IN(AfterAttributeName); 1129 } 1130 ON('=') 1131 { 1132 m_current_token.last_attribute().name_end_position = nth_last_position(1); 1133 m_current_token.last_attribute().local_name = consume_current_builder(); 1134 SWITCH_TO(BeforeAttributeValue); 1135 } 1136 ON_ASCII_UPPER_ALPHA 1137 { 1138 m_current_builder.append_code_point(to_ascii_lowercase(current_input_character.value())); 1139 continue; 1140 } 1141 ON(0) 1142 { 1143 log_parse_error(); 1144 m_current_builder.append_code_point(0xFFFD); 1145 continue; 1146 } 1147 ON('"') 1148 { 1149 log_parse_error(); 1150 goto AnythingElseAttributeName; 1151 } 1152 ON('\'') 1153 { 1154 log_parse_error(); 1155 goto AnythingElseAttributeName; 1156 } 1157 ON('<') 1158 { 1159 log_parse_error(); 1160 goto AnythingElseAttributeName; 1161 } 1162 ANYTHING_ELSE 1163 { 1164 AnythingElseAttributeName: 1165 m_current_builder.append_code_point(current_input_character.value()); 1166 continue; 1167 } 1168 } 1169 END_STATE 1170 1171 // 13.2.5.34 After attribute name state, https://html.spec.whatwg.org/multipage/parsing.html#after-attribute-name-state 1172 BEGIN_STATE(AfterAttributeName) 1173 { 1174 ON_WHITESPACE 1175 { 1176 continue; 1177 } 1178 ON('/') 1179 { 1180 SWITCH_TO(SelfClosingStartTag); 1181 } 1182 ON('=') 1183 { 1184 m_current_token.last_attribute().name_end_position = nth_last_position(1); 1185 SWITCH_TO(BeforeAttributeValue); 1186 } 1187 ON('>') 1188 { 1189 SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data); 1190 } 1191 ON_EOF 1192 { 1193 log_parse_error(); 1194 EMIT_EOF; 1195 } 1196 ANYTHING_ELSE 1197 { 1198 m_current_token.add_attribute({}); 1199 if (!m_source_positions.is_empty()) 1200 m_current_token.last_attribute().name_start_position = m_source_positions.last(); 1201 RECONSUME_IN(AttributeName); 1202 } 1203 } 1204 END_STATE 1205 1206 // 13.2.5.35 Before attribute value state, https://html.spec.whatwg.org/multipage/parsing.html#before-attribute-value-state 1207 BEGIN_STATE(BeforeAttributeValue) 1208 { 1209 m_current_token.last_attribute().value_start_position = nth_last_position(1); 1210 ON_WHITESPACE 1211 { 1212 continue; 1213 } 1214 ON('"') 1215 { 1216 SWITCH_TO(AttributeValueDoubleQuoted); 1217 } 1218 ON('\'') 1219 { 1220 SWITCH_TO(AttributeValueSingleQuoted); 1221 } 1222 ON('>') 1223 { 1224 log_parse_error(); 1225 SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data); 1226 } 1227 ANYTHING_ELSE 1228 { 1229 RECONSUME_IN(AttributeValueUnquoted); 1230 } 1231 } 1232 END_STATE 1233 1234 // 13.2.5.36 Attribute value (double-quoted) state, https://html.spec.whatwg.org/multipage/parsing.html#attribute-value-(double-quoted)-state 1235 BEGIN_STATE(AttributeValueDoubleQuoted) 1236 { 1237 ON('"') 1238 { 1239 m_current_token.last_attribute().value = consume_current_builder(); 1240 SWITCH_TO(AfterAttributeValueQuoted); 1241 } 1242 ON('&') 1243 { 1244 m_return_state = State::AttributeValueDoubleQuoted; 1245 SWITCH_TO_WITH_UNCLEAN_BUILDER(CharacterReference); 1246 } 1247 ON(0) 1248 { 1249 log_parse_error(); 1250 m_current_builder.append_code_point(0xFFFD); 1251 continue; 1252 } 1253 ON_EOF 1254 { 1255 log_parse_error(); 1256 EMIT_EOF; 1257 } 1258 ANYTHING_ELSE 1259 { 1260 m_current_builder.append_code_point(current_input_character.value()); 1261 continue; 1262 } 1263 } 1264 END_STATE 1265 1266 // 13.2.5.37 Attribute value (single-quoted) state, https://html.spec.whatwg.org/multipage/parsing.html#attribute-value-(single-quoted)-state 1267 BEGIN_STATE(AttributeValueSingleQuoted) 1268 { 1269 ON('\'') 1270 { 1271 m_current_token.last_attribute().value = consume_current_builder(); 1272 SWITCH_TO(AfterAttributeValueQuoted); 1273 } 1274 ON('&') 1275 { 1276 m_return_state = State::AttributeValueSingleQuoted; 1277 SWITCH_TO_WITH_UNCLEAN_BUILDER(CharacterReference); 1278 } 1279 ON(0) 1280 { 1281 log_parse_error(); 1282 m_current_builder.append_code_point(0xFFFD); 1283 continue; 1284 } 1285 ON_EOF 1286 { 1287 log_parse_error(); 1288 EMIT_EOF; 1289 } 1290 ANYTHING_ELSE 1291 { 1292 m_current_builder.append_code_point(current_input_character.value()); 1293 continue; 1294 } 1295 } 1296 END_STATE 1297 1298 // 13.2.5.38 Attribute value (unquoted) state, https://html.spec.whatwg.org/multipage/parsing.html#attribute-value-(single-quoted)-state 1299 BEGIN_STATE(AttributeValueUnquoted) 1300 { 1301 ON_WHITESPACE 1302 { 1303 m_current_token.last_attribute().value = consume_current_builder(); 1304 m_current_token.last_attribute().value_end_position = nth_last_position(1); 1305 SWITCH_TO(BeforeAttributeName); 1306 } 1307 ON('&') 1308 { 1309 m_return_state = State::AttributeValueUnquoted; 1310 SWITCH_TO_WITH_UNCLEAN_BUILDER(CharacterReference); 1311 } 1312 ON('>') 1313 { 1314 m_current_token.last_attribute().value = consume_current_builder(); 1315 m_current_token.last_attribute().value_end_position = nth_last_position(1); 1316 SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data); 1317 } 1318 ON(0) 1319 { 1320 log_parse_error(); 1321 m_current_builder.append_code_point(0xFFFD); 1322 continue; 1323 } 1324 ON('"') 1325 { 1326 log_parse_error(); 1327 goto AnythingElseAttributeValueUnquoted; 1328 } 1329 ON('\'') 1330 { 1331 log_parse_error(); 1332 goto AnythingElseAttributeValueUnquoted; 1333 } 1334 ON('<') 1335 { 1336 log_parse_error(); 1337 goto AnythingElseAttributeValueUnquoted; 1338 } 1339 ON('=') 1340 { 1341 log_parse_error(); 1342 goto AnythingElseAttributeValueUnquoted; 1343 } 1344 ON('`') 1345 { 1346 log_parse_error(); 1347 goto AnythingElseAttributeValueUnquoted; 1348 } 1349 ON_EOF 1350 { 1351 log_parse_error(); 1352 EMIT_EOF; 1353 } 1354 ANYTHING_ELSE 1355 { 1356 AnythingElseAttributeValueUnquoted: 1357 m_current_builder.append_code_point(current_input_character.value()); 1358 continue; 1359 } 1360 } 1361 END_STATE 1362 1363 // 13.2.5.39 After attribute value (quoted) state, https://html.spec.whatwg.org/multipage/parsing.html#after-attribute-value-(quoted)-state 1364 BEGIN_STATE(AfterAttributeValueQuoted) 1365 { 1366 m_current_token.last_attribute().value_end_position = nth_last_position(1); 1367 ON_WHITESPACE 1368 { 1369 SWITCH_TO(BeforeAttributeName); 1370 } 1371 ON('/') 1372 { 1373 SWITCH_TO(SelfClosingStartTag); 1374 } 1375 ON('>') 1376 { 1377 SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data); 1378 } 1379 ON_EOF 1380 { 1381 log_parse_error(); 1382 EMIT_EOF; 1383 } 1384 ANYTHING_ELSE 1385 { 1386 log_parse_error(); 1387 RECONSUME_IN(BeforeAttributeName); 1388 } 1389 } 1390 END_STATE 1391 1392 // 13.2.5.43 Comment start state, https://html.spec.whatwg.org/multipage/parsing.html#comment-start-state 1393 BEGIN_STATE(CommentStart) 1394 { 1395 ON('-') 1396 { 1397 SWITCH_TO(CommentStartDash); 1398 } 1399 ON('>') 1400 { 1401 log_parse_error(); 1402 SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data); 1403 } 1404 ANYTHING_ELSE 1405 { 1406 RECONSUME_IN(Comment); 1407 } 1408 } 1409 END_STATE 1410 1411 // 13.2.5.44 Comment start dash state, https://html.spec.whatwg.org/multipage/parsing.html#comment-start-dash-state 1412 BEGIN_STATE(CommentStartDash) 1413 { 1414 ON('-') 1415 { 1416 SWITCH_TO(CommentEnd); 1417 } 1418 ON('>') 1419 { 1420 log_parse_error(); 1421 SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data); 1422 } 1423 ON_EOF 1424 { 1425 log_parse_error(); 1426 EMIT_EOF; 1427 } 1428 ANYTHING_ELSE 1429 { 1430 m_current_builder.append('-'); 1431 RECONSUME_IN(Comment); 1432 } 1433 } 1434 END_STATE 1435 1436 // 13.2.5.45 Comment state, https://html.spec.whatwg.org/multipage/parsing.html#comment-state 1437 BEGIN_STATE(Comment) 1438 { 1439 ON('<') 1440 { 1441 m_current_builder.append_code_point(current_input_character.value()); 1442 SWITCH_TO_WITH_UNCLEAN_BUILDER(CommentLessThanSign); 1443 } 1444 ON('-') 1445 { 1446 SWITCH_TO_WITH_UNCLEAN_BUILDER(CommentEndDash); 1447 } 1448 ON(0) 1449 { 1450 log_parse_error(); 1451 m_current_builder.append_code_point(0xFFFD); 1452 continue; 1453 } 1454 ON_EOF 1455 { 1456 log_parse_error(); 1457 m_current_token.set_comment(consume_current_builder()); 1458 EMIT_EOF; 1459 } 1460 ANYTHING_ELSE 1461 { 1462 m_current_builder.append_code_point(current_input_character.value()); 1463 continue; 1464 } 1465 } 1466 END_STATE 1467 1468 // 13.2.5.51 Comment end state, https://html.spec.whatwg.org/multipage/parsing.html#comment-end-state 1469 BEGIN_STATE(CommentEnd) 1470 { 1471 ON('>') 1472 { 1473 m_current_token.set_comment(consume_current_builder()); 1474 SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data); 1475 } 1476 ON('!') 1477 { 1478 SWITCH_TO_WITH_UNCLEAN_BUILDER(CommentEndBang); 1479 } 1480 ON('-') 1481 { 1482 m_current_builder.append('-'); 1483 continue; 1484 } 1485 ON_EOF 1486 { 1487 log_parse_error(); 1488 m_current_token.set_comment(consume_current_builder()); 1489 EMIT_EOF; 1490 } 1491 ANYTHING_ELSE 1492 { 1493 m_current_builder.append("--"sv); 1494 RECONSUME_IN(Comment); 1495 } 1496 } 1497 END_STATE 1498 1499 // 13.2.5.52 Comment end bang state, https://html.spec.whatwg.org/multipage/parsing.html#comment-end-bang-state 1500 BEGIN_STATE(CommentEndBang) 1501 { 1502 ON('-') 1503 { 1504 m_current_builder.append("--!"sv); 1505 SWITCH_TO_WITH_UNCLEAN_BUILDER(CommentEndDash); 1506 } 1507 ON('>') 1508 { 1509 log_parse_error(); 1510 m_current_token.set_comment(consume_current_builder()); 1511 SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data); 1512 } 1513 ON_EOF 1514 { 1515 log_parse_error(); 1516 m_current_token.set_comment(consume_current_builder()); 1517 EMIT_EOF; 1518 } 1519 ANYTHING_ELSE 1520 { 1521 m_current_builder.append("--!"sv); 1522 RECONSUME_IN(Comment); 1523 } 1524 } 1525 END_STATE 1526 1527 // 13.2.5.50 Comment end dash state, https://html.spec.whatwg.org/multipage/parsing.html#comment-end-dash-state 1528 BEGIN_STATE(CommentEndDash) 1529 { 1530 ON('-') 1531 { 1532 SWITCH_TO_WITH_UNCLEAN_BUILDER(CommentEnd); 1533 } 1534 ON_EOF 1535 { 1536 log_parse_error(); 1537 m_current_token.set_comment(consume_current_builder()); 1538 EMIT_EOF; 1539 } 1540 ANYTHING_ELSE 1541 { 1542 m_current_builder.append('-'); 1543 RECONSUME_IN(Comment); 1544 } 1545 } 1546 END_STATE 1547 1548 // 13.2.5.46 Comment less-than sign state, https://html.spec.whatwg.org/multipage/parsing.html#comment-less-than-sign-state 1549 BEGIN_STATE(CommentLessThanSign) 1550 { 1551 ON('!') 1552 { 1553 m_current_builder.append_code_point(current_input_character.value()); 1554 SWITCH_TO_WITH_UNCLEAN_BUILDER(CommentLessThanSignBang); 1555 } 1556 ON('<') 1557 { 1558 m_current_builder.append_code_point(current_input_character.value()); 1559 continue; 1560 } 1561 ANYTHING_ELSE 1562 { 1563 RECONSUME_IN(Comment); 1564 } 1565 } 1566 END_STATE 1567 1568 // 13.2.5.47 Comment less-than sign bang state, https://html.spec.whatwg.org/multipage/parsing.html#comment-less-than-sign-bang-state 1569 BEGIN_STATE(CommentLessThanSignBang) 1570 { 1571 ON('-') 1572 { 1573 SWITCH_TO_WITH_UNCLEAN_BUILDER(CommentLessThanSignBangDash); 1574 } 1575 ANYTHING_ELSE 1576 { 1577 RECONSUME_IN(Comment); 1578 } 1579 } 1580 END_STATE 1581 1582 // 13.2.5.48 Comment less-than sign bang dash state, https://html.spec.whatwg.org/multipage/parsing.html#comment-less-than-sign-bang-dash-state 1583 BEGIN_STATE(CommentLessThanSignBangDash) 1584 { 1585 ON('-') 1586 { 1587 SWITCH_TO_WITH_UNCLEAN_BUILDER(CommentLessThanSignBangDashDash); 1588 } 1589 ANYTHING_ELSE 1590 { 1591 RECONSUME_IN(CommentEndDash); 1592 } 1593 } 1594 END_STATE 1595 1596 // 13.2.5.49 Comment less-than sign bang dash dash state, https://html.spec.whatwg.org/multipage/parsing.html#comment-less-than-sign-bang-dash-dash-state 1597 BEGIN_STATE(CommentLessThanSignBangDashDash) 1598 { 1599 ON('>') 1600 { 1601 RECONSUME_IN(CommentEnd); 1602 } 1603 ON_EOF 1604 { 1605 RECONSUME_IN(CommentEnd); 1606 } 1607 ANYTHING_ELSE 1608 { 1609 log_parse_error(); 1610 RECONSUME_IN(CommentEnd); 1611 } 1612 } 1613 END_STATE 1614 1615 // 13.2.5.72 Character reference state, https://html.spec.whatwg.org/multipage/parsing.html#character-reference-state 1616 BEGIN_STATE(CharacterReference) 1617 { 1618 m_temporary_buffer.clear(); 1619 m_temporary_buffer.append('&'); 1620 1621 ON_ASCII_ALPHANUMERIC 1622 { 1623 RECONSUME_IN(NamedCharacterReference); 1624 } 1625 ON('#') 1626 { 1627 m_temporary_buffer.append(current_input_character.value()); 1628 SWITCH_TO_WITH_UNCLEAN_BUILDER(NumericCharacterReference); 1629 } 1630 ANYTHING_ELSE 1631 { 1632 FLUSH_CODEPOINTS_CONSUMED_AS_A_CHARACTER_REFERENCE; 1633 RECONSUME_IN_RETURN_STATE; 1634 } 1635 } 1636 END_STATE 1637 1638 // 13.2.5.73 Named character reference state, https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state 1639 BEGIN_STATE(NamedCharacterReference) 1640 { 1641 size_t byte_offset = m_utf8_view.byte_offset_of(m_prev_utf8_iterator); 1642 1643 auto match = HTML::code_points_from_entity(m_decoded_input.substring_view(byte_offset, m_decoded_input.length() - byte_offset)); 1644 1645 if (match.has_value()) { 1646 skip(match->entity.length() - 1); 1647 for (auto ch : match.value().entity) 1648 m_temporary_buffer.append(ch); 1649 1650 if (consumed_as_part_of_an_attribute() && !match.value().entity.ends_with(';')) { 1651 auto next_code_point = peek_code_point(0); 1652 if (next_code_point.has_value() && (next_code_point.value() == '=' || is_ascii_alphanumeric(next_code_point.value()))) { 1653 FLUSH_CODEPOINTS_CONSUMED_AS_A_CHARACTER_REFERENCE; 1654 SWITCH_TO_RETURN_STATE; 1655 } 1656 } 1657 1658 if (!match.value().entity.ends_with(';')) { 1659 log_parse_error(); 1660 } 1661 1662 m_temporary_buffer = match.value().code_points; 1663 1664 FLUSH_CODEPOINTS_CONSUMED_AS_A_CHARACTER_REFERENCE; 1665 SWITCH_TO_RETURN_STATE; 1666 } else { 1667 FLUSH_CODEPOINTS_CONSUMED_AS_A_CHARACTER_REFERENCE; 1668 // FIXME: This should be SWITCH_TO, but we always lose the first character on this path, so just reconsume it. 1669 // I can't wrap my head around how to do it as the spec says. 1670 RECONSUME_IN(AmbiguousAmpersand); 1671 } 1672 } 1673 END_STATE 1674 1675 // 13.2.5.74 Ambiguous ampersand state, https://html.spec.whatwg.org/multipage/parsing.html#ambiguous-ampersand-state 1676 BEGIN_STATE(AmbiguousAmpersand) 1677 { 1678 ON_ASCII_ALPHANUMERIC 1679 { 1680 if (consumed_as_part_of_an_attribute()) { 1681 m_current_builder.append_code_point(current_input_character.value()); 1682 continue; 1683 } else { 1684 EMIT_CURRENT_CHARACTER; 1685 } 1686 } 1687 ON(';') 1688 { 1689 log_parse_error(); 1690 RECONSUME_IN_RETURN_STATE; 1691 } 1692 ANYTHING_ELSE 1693 { 1694 RECONSUME_IN_RETURN_STATE; 1695 } 1696 } 1697 END_STATE 1698 1699 // 13.2.5.75 Numeric character reference state, https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-state 1700 BEGIN_STATE(NumericCharacterReference) 1701 { 1702 m_character_reference_code = 0; 1703 1704 ON('X') 1705 { 1706 m_temporary_buffer.append(current_input_character.value()); 1707 SWITCH_TO_WITH_UNCLEAN_BUILDER(HexadecimalCharacterReferenceStart); 1708 } 1709 ON('x') 1710 { 1711 m_temporary_buffer.append(current_input_character.value()); 1712 SWITCH_TO_WITH_UNCLEAN_BUILDER(HexadecimalCharacterReferenceStart); 1713 } 1714 ANYTHING_ELSE 1715 { 1716 RECONSUME_IN(DecimalCharacterReferenceStart); 1717 } 1718 } 1719 END_STATE 1720 1721 // 13.2.5.76 Hexadecimal character reference start state, https://html.spec.whatwg.org/multipage/parsing.html#hexadecimal-character-reference-start-state 1722 BEGIN_STATE(HexadecimalCharacterReferenceStart) 1723 { 1724 ON_ASCII_HEX_DIGIT 1725 { 1726 RECONSUME_IN(HexadecimalCharacterReference); 1727 } 1728 ANYTHING_ELSE 1729 { 1730 log_parse_error(); 1731 FLUSH_CODEPOINTS_CONSUMED_AS_A_CHARACTER_REFERENCE; 1732 RECONSUME_IN_RETURN_STATE; 1733 } 1734 } 1735 END_STATE 1736 1737 // 13.2.5.77 Decimal character reference start state, https://html.spec.whatwg.org/multipage/parsing.html#decimal-character-reference-start-state 1738 BEGIN_STATE(DecimalCharacterReferenceStart) 1739 { 1740 ON_ASCII_DIGIT 1741 { 1742 RECONSUME_IN(DecimalCharacterReference); 1743 } 1744 ANYTHING_ELSE 1745 { 1746 log_parse_error(); 1747 FLUSH_CODEPOINTS_CONSUMED_AS_A_CHARACTER_REFERENCE; 1748 RECONSUME_IN_RETURN_STATE; 1749 } 1750 } 1751 END_STATE 1752 1753 // 13.2.5.78 Hexadecimal character reference state, https://html.spec.whatwg.org/multipage/parsing.html#decimal-character-reference-start-state 1754 BEGIN_STATE(HexadecimalCharacterReference) 1755 { 1756 ON_ASCII_DIGIT 1757 { 1758 m_character_reference_code *= 16; 1759 m_character_reference_code += current_input_character.value() - 0x30; 1760 continue; 1761 } 1762 ON_ASCII_UPPER_ALPHA 1763 { 1764 m_character_reference_code *= 16; 1765 m_character_reference_code += current_input_character.value() - 0x37; 1766 continue; 1767 } 1768 ON_ASCII_LOWER_ALPHA 1769 { 1770 m_character_reference_code *= 16; 1771 m_character_reference_code += current_input_character.value() - 0x57; 1772 continue; 1773 } 1774 ON(';') 1775 { 1776 SWITCH_TO_WITH_UNCLEAN_BUILDER(NumericCharacterReferenceEnd); 1777 } 1778 ANYTHING_ELSE 1779 { 1780 log_parse_error(); 1781 RECONSUME_IN(NumericCharacterReferenceEnd); 1782 } 1783 } 1784 END_STATE 1785 1786 // 13.2.5.79 Decimal character reference state, https://html.spec.whatwg.org/multipage/parsing.html#decimal-character-reference-state 1787 BEGIN_STATE(DecimalCharacterReference) 1788 { 1789 ON_ASCII_DIGIT 1790 { 1791 m_character_reference_code *= 10; 1792 m_character_reference_code += current_input_character.value() - 0x30; 1793 continue; 1794 } 1795 ON(';') 1796 { 1797 SWITCH_TO_WITH_UNCLEAN_BUILDER(NumericCharacterReferenceEnd); 1798 } 1799 ANYTHING_ELSE 1800 { 1801 log_parse_error(); 1802 RECONSUME_IN(NumericCharacterReferenceEnd); 1803 } 1804 } 1805 END_STATE 1806 1807 // 13.2.5.80 Numeric character reference end state, https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-end-state 1808 BEGIN_STATE(NumericCharacterReferenceEnd) 1809 { 1810 DONT_CONSUME_NEXT_INPUT_CHARACTER; 1811 1812 if (m_character_reference_code == 0) { 1813 log_parse_error(); 1814 m_character_reference_code = 0xFFFD; 1815 } 1816 if (m_character_reference_code > 0x10ffff) { 1817 log_parse_error(); 1818 m_character_reference_code = 0xFFFD; 1819 } 1820 if (is_unicode_surrogate(m_character_reference_code)) { 1821 log_parse_error(); 1822 m_character_reference_code = 0xFFFD; 1823 } 1824 if (is_unicode_noncharacter(m_character_reference_code)) { 1825 log_parse_error(); 1826 } 1827 if (m_character_reference_code == 0xd || (is_unicode_control(m_character_reference_code) && !is_ascii_space(m_character_reference_code))) { 1828 log_parse_error(); 1829 constexpr struct { 1830 u32 number; 1831 u32 code_point; 1832 } conversion_table[] = { 1833 { 0x80, 0x20AC }, 1834 { 0x82, 0x201A }, 1835 { 0x83, 0x0192 }, 1836 { 0x84, 0x201E }, 1837 { 0x85, 0x2026 }, 1838 { 0x86, 0x2020 }, 1839 { 0x87, 0x2021 }, 1840 { 0x88, 0x02C6 }, 1841 { 0x89, 0x2030 }, 1842 { 0x8A, 0x0160 }, 1843 { 0x8B, 0x2039 }, 1844 { 0x8C, 0x0152 }, 1845 { 0x8E, 0x017D }, 1846 { 0x91, 0x2018 }, 1847 { 0x92, 0x2019 }, 1848 { 0x93, 0x201C }, 1849 { 0x94, 0x201D }, 1850 { 0x95, 0x2022 }, 1851 { 0x96, 0x2013 }, 1852 { 0x97, 0x2014 }, 1853 { 0x98, 0x02DC }, 1854 { 0x99, 0x2122 }, 1855 { 0x9A, 0x0161 }, 1856 { 0x9B, 0x203A }, 1857 { 0x9C, 0x0153 }, 1858 { 0x9E, 0x017E }, 1859 { 0x9F, 0x0178 }, 1860 }; 1861 for (auto& entry : conversion_table) { 1862 if (m_character_reference_code == entry.number) { 1863 m_character_reference_code = entry.code_point; 1864 break; 1865 } 1866 } 1867 } 1868 1869 m_temporary_buffer.clear(); 1870 m_temporary_buffer.append(m_character_reference_code); 1871 FLUSH_CODEPOINTS_CONSUMED_AS_A_CHARACTER_REFERENCE; 1872 SWITCH_TO_RETURN_STATE; 1873 } 1874 END_STATE 1875 1876 // 13.2.5.2 RCDATA state, https://html.spec.whatwg.org/multipage/parsing.html#rcdata-state 1877 BEGIN_STATE(RCDATA) 1878 { 1879 ON('&') 1880 { 1881 m_return_state = State::RCDATA; 1882 SWITCH_TO(CharacterReference); 1883 } 1884 ON('<') 1885 { 1886 SWITCH_TO(RCDATALessThanSign); 1887 } 1888 ON(0) 1889 { 1890 log_parse_error(); 1891 EMIT_CHARACTER(0xFFFD); 1892 } 1893 ON_EOF 1894 { 1895 EMIT_EOF; 1896 } 1897 ANYTHING_ELSE 1898 { 1899 EMIT_CURRENT_CHARACTER; 1900 } 1901 } 1902 END_STATE 1903 1904 // 13.2.5.9 RCDATA less-than sign state, https://html.spec.whatwg.org/multipage/parsing.html#rcdata-less-than-sign-state 1905 BEGIN_STATE(RCDATALessThanSign) 1906 { 1907 ON('/') 1908 { 1909 m_temporary_buffer.clear(); 1910 SWITCH_TO(RCDATAEndTagOpen); 1911 } 1912 ANYTHING_ELSE 1913 { 1914 EMIT_CHARACTER_AND_RECONSUME_IN('<', RCDATA); 1915 } 1916 } 1917 END_STATE 1918 1919 // 13.2.5.10 RCDATA end tag open state, https://html.spec.whatwg.org/multipage/parsing.html#rcdata-end-tag-open-state 1920 BEGIN_STATE(RCDATAEndTagOpen) 1921 { 1922 ON_ASCII_ALPHA 1923 { 1924 create_new_token(HTMLToken::Type::EndTag); 1925 RECONSUME_IN(RCDATAEndTagName); 1926 } 1927 ANYTHING_ELSE 1928 { 1929 m_queued_tokens.enqueue(HTMLToken::make_character('<')); 1930 m_queued_tokens.enqueue(HTMLToken::make_character('/')); 1931 RECONSUME_IN(RCDATA); 1932 } 1933 } 1934 END_STATE 1935 1936 // 13.2.5.11 RCDATA end tag name state, https://html.spec.whatwg.org/multipage/parsing.html#rcdata-end-tag-name-state 1937 BEGIN_STATE(RCDATAEndTagName) 1938 { 1939 ON_WHITESPACE 1940 { 1941 m_current_token.set_tag_name(consume_current_builder()); 1942 if (!current_end_tag_token_is_appropriate()) { 1943 m_queued_tokens.enqueue(HTMLToken::make_character('<')); 1944 m_queued_tokens.enqueue(HTMLToken::make_character('/')); 1945 for (auto code_point : m_temporary_buffer) 1946 m_queued_tokens.enqueue(HTMLToken::make_character(code_point)); 1947 RECONSUME_IN(RCDATA); 1948 } 1949 SWITCH_TO(BeforeAttributeName); 1950 } 1951 ON('/') 1952 { 1953 m_current_token.set_tag_name(consume_current_builder()); 1954 if (!current_end_tag_token_is_appropriate()) { 1955 m_queued_tokens.enqueue(HTMLToken::make_character('<')); 1956 m_queued_tokens.enqueue(HTMLToken::make_character('/')); 1957 for (auto code_point : m_temporary_buffer) 1958 m_queued_tokens.enqueue(HTMLToken::make_character(code_point)); 1959 RECONSUME_IN(RCDATA); 1960 } 1961 SWITCH_TO(SelfClosingStartTag); 1962 } 1963 ON('>') 1964 { 1965 m_current_token.set_tag_name(consume_current_builder()); 1966 if (!current_end_tag_token_is_appropriate()) { 1967 m_queued_tokens.enqueue(HTMLToken::make_character('<')); 1968 m_queued_tokens.enqueue(HTMLToken::make_character('/')); 1969 for (auto code_point : m_temporary_buffer) 1970 m_queued_tokens.enqueue(HTMLToken::make_character(code_point)); 1971 RECONSUME_IN(RCDATA); 1972 } 1973 SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data); 1974 } 1975 ON_ASCII_UPPER_ALPHA 1976 { 1977 m_current_builder.append_code_point(to_ascii_lowercase(current_input_character.value())); 1978 m_temporary_buffer.append(current_input_character.value()); 1979 continue; 1980 } 1981 ON_ASCII_LOWER_ALPHA 1982 { 1983 m_current_builder.append_code_point(current_input_character.value()); 1984 m_temporary_buffer.append(current_input_character.value()); 1985 continue; 1986 } 1987 ANYTHING_ELSE 1988 { 1989 m_queued_tokens.enqueue(HTMLToken::make_character('<')); 1990 m_queued_tokens.enqueue(HTMLToken::make_character('/')); 1991 // NOTE: The spec doesn't mention this, but it seems that m_current_token (an end tag) is just dropped in this case. 1992 m_current_builder.clear(); 1993 for (auto code_point : m_temporary_buffer) 1994 m_queued_tokens.enqueue(HTMLToken::make_character(code_point)); 1995 RECONSUME_IN(RCDATA); 1996 } 1997 } 1998 END_STATE 1999 2000 // 13.2.5.3 RAWTEXT state, https://html.spec.whatwg.org/multipage/parsing.html#rawtext-state 2001 BEGIN_STATE(RAWTEXT) 2002 { 2003 ON('<') 2004 { 2005 SWITCH_TO(RAWTEXTLessThanSign); 2006 } 2007 ON(0) 2008 { 2009 log_parse_error(); 2010 EMIT_CHARACTER(0xFFFD); 2011 } 2012 ON_EOF 2013 { 2014 EMIT_EOF; 2015 } 2016 ANYTHING_ELSE 2017 { 2018 EMIT_CURRENT_CHARACTER; 2019 } 2020 } 2021 END_STATE 2022 2023 // 13.2.5.12 RAWTEXT less-than sign state, https://html.spec.whatwg.org/multipage/parsing.html#rawtext-less-than-sign-state 2024 BEGIN_STATE(RAWTEXTLessThanSign) 2025 { 2026 ON('/') 2027 { 2028 m_temporary_buffer.clear(); 2029 SWITCH_TO(RAWTEXTEndTagOpen); 2030 } 2031 ANYTHING_ELSE 2032 { 2033 EMIT_CHARACTER_AND_RECONSUME_IN('<', RAWTEXT); 2034 } 2035 } 2036 END_STATE 2037 2038 // 13.2.5.13 RAWTEXT end tag open state, https://html.spec.whatwg.org/multipage/parsing.html#rawtext-end-tag-open-state 2039 BEGIN_STATE(RAWTEXTEndTagOpen) 2040 { 2041 ON_ASCII_ALPHA 2042 { 2043 create_new_token(HTMLToken::Type::EndTag); 2044 RECONSUME_IN(RAWTEXTEndTagName); 2045 } 2046 ANYTHING_ELSE 2047 { 2048 m_queued_tokens.enqueue(HTMLToken::make_character('<')); 2049 m_queued_tokens.enqueue(HTMLToken::make_character('/')); 2050 RECONSUME_IN(RAWTEXT); 2051 } 2052 } 2053 END_STATE 2054 2055 // 13.2.5.14 RAWTEXT end tag name state, https://html.spec.whatwg.org/multipage/parsing.html#rawtext-end-tag-name-state 2056 BEGIN_STATE(RAWTEXTEndTagName) 2057 { 2058 ON_WHITESPACE 2059 { 2060 m_current_token.set_tag_name(consume_current_builder()); 2061 if (!current_end_tag_token_is_appropriate()) { 2062 m_queued_tokens.enqueue(HTMLToken::make_character('<')); 2063 m_queued_tokens.enqueue(HTMLToken::make_character('/')); 2064 for (auto code_point : m_temporary_buffer) 2065 m_queued_tokens.enqueue(HTMLToken::make_character(code_point)); 2066 RECONSUME_IN(RAWTEXT); 2067 } 2068 SWITCH_TO(BeforeAttributeName); 2069 } 2070 ON('/') 2071 { 2072 m_current_token.set_tag_name(consume_current_builder()); 2073 if (!current_end_tag_token_is_appropriate()) { 2074 m_queued_tokens.enqueue(HTMLToken::make_character('<')); 2075 m_queued_tokens.enqueue(HTMLToken::make_character('/')); 2076 for (auto code_point : m_temporary_buffer) 2077 m_queued_tokens.enqueue(HTMLToken::make_character(code_point)); 2078 RECONSUME_IN(RAWTEXT); 2079 } 2080 SWITCH_TO(SelfClosingStartTag); 2081 } 2082 ON('>') 2083 { 2084 m_current_token.set_tag_name(consume_current_builder()); 2085 if (!current_end_tag_token_is_appropriate()) { 2086 m_queued_tokens.enqueue(HTMLToken::make_character('<')); 2087 m_queued_tokens.enqueue(HTMLToken::make_character('/')); 2088 for (auto code_point : m_temporary_buffer) 2089 m_queued_tokens.enqueue(HTMLToken::make_character(code_point)); 2090 RECONSUME_IN(RAWTEXT); 2091 } 2092 SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data); 2093 } 2094 ON_ASCII_UPPER_ALPHA 2095 { 2096 m_current_builder.append_code_point(to_ascii_lowercase(current_input_character.value())); 2097 m_temporary_buffer.append(current_input_character.value()); 2098 continue; 2099 } 2100 ON_ASCII_LOWER_ALPHA 2101 { 2102 m_current_builder.append(current_input_character.value()); 2103 m_temporary_buffer.append(current_input_character.value()); 2104 continue; 2105 } 2106 ANYTHING_ELSE 2107 { 2108 m_queued_tokens.enqueue(HTMLToken::make_character('<')); 2109 m_queued_tokens.enqueue(HTMLToken::make_character('/')); 2110 // NOTE: The spec doesn't mention this, but it seems that m_current_token (an end tag) is just dropped in this case. 2111 m_current_builder.clear(); 2112 for (auto code_point : m_temporary_buffer) 2113 m_queued_tokens.enqueue(HTMLToken::make_character(code_point)); 2114 RECONSUME_IN(RAWTEXT); 2115 } 2116 } 2117 END_STATE 2118 2119 // 13.2.5.4 Script data state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-state 2120 BEGIN_STATE(ScriptData) 2121 { 2122 ON('<') 2123 { 2124 SWITCH_TO(ScriptDataLessThanSign); 2125 } 2126 ON(0) 2127 { 2128 log_parse_error(); 2129 EMIT_CHARACTER(0xFFFD); 2130 } 2131 ON_EOF 2132 { 2133 EMIT_EOF; 2134 } 2135 ANYTHING_ELSE 2136 { 2137 EMIT_CURRENT_CHARACTER; 2138 } 2139 } 2140 END_STATE 2141 2142 // 13.2.5.5 PLAINTEXT state, https://html.spec.whatwg.org/multipage/parsing.html#plaintext-state 2143 BEGIN_STATE(PLAINTEXT) 2144 { 2145 ON(0) 2146 { 2147 log_parse_error(); 2148 EMIT_CHARACTER(0xFFFD); 2149 } 2150 ON_EOF 2151 { 2152 EMIT_EOF; 2153 } 2154 ANYTHING_ELSE 2155 { 2156 EMIT_CURRENT_CHARACTER; 2157 } 2158 } 2159 END_STATE 2160 2161 // 13.2.5.15 Script data less-than sign state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-less-than-sign-state 2162 BEGIN_STATE(ScriptDataLessThanSign) 2163 { 2164 ON('/') 2165 { 2166 m_temporary_buffer.clear(); 2167 SWITCH_TO(ScriptDataEndTagOpen); 2168 } 2169 ON('!') 2170 { 2171 m_queued_tokens.enqueue(HTMLToken::make_character('<')); 2172 m_queued_tokens.enqueue(HTMLToken::make_character('!')); 2173 SWITCH_TO(ScriptDataEscapeStart); 2174 } 2175 ANYTHING_ELSE 2176 { 2177 EMIT_CHARACTER_AND_RECONSUME_IN('<', ScriptData); 2178 } 2179 } 2180 END_STATE 2181 2182 // 13.2.5.18 Script data escape start state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-escape-start-state 2183 BEGIN_STATE(ScriptDataEscapeStart) 2184 { 2185 ON('-') 2186 { 2187 SWITCH_TO_AND_EMIT_CHARACTER('-', ScriptDataEscapeStartDash); 2188 } 2189 ANYTHING_ELSE 2190 { 2191 RECONSUME_IN(ScriptData); 2192 } 2193 } 2194 END_STATE 2195 2196 // 13.2.5.19 Script data escape start dash state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-escape-start-dash-state 2197 BEGIN_STATE(ScriptDataEscapeStartDash) 2198 { 2199 ON('-') 2200 { 2201 SWITCH_TO_AND_EMIT_CHARACTER('-', ScriptDataEscapedDashDash); 2202 } 2203 ANYTHING_ELSE 2204 { 2205 RECONSUME_IN(ScriptData); 2206 } 2207 } 2208 END_STATE 2209 2210 // 13.2.5.22 Script data escaped dash dash state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-dash-dash-state 2211 BEGIN_STATE(ScriptDataEscapedDashDash) 2212 { 2213 ON('-') 2214 { 2215 EMIT_CHARACTER('-'); 2216 } 2217 ON('<') 2218 { 2219 SWITCH_TO(ScriptDataEscapedLessThanSign); 2220 } 2221 ON('>') 2222 { 2223 SWITCH_TO_AND_EMIT_CHARACTER('>', ScriptData); 2224 } 2225 ON(0) 2226 { 2227 log_parse_error(); 2228 SWITCH_TO_AND_EMIT_CHARACTER(0xFFFD, ScriptDataEscaped); 2229 } 2230 ON_EOF 2231 { 2232 log_parse_error(); 2233 EMIT_EOF; 2234 } 2235 ANYTHING_ELSE 2236 { 2237 SWITCH_TO_AND_EMIT_CURRENT_CHARACTER(ScriptDataEscaped); 2238 } 2239 } 2240 END_STATE 2241 2242 // 13.2.5.23 Script data escaped less-than sign state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-less-than-sign-state 2243 BEGIN_STATE(ScriptDataEscapedLessThanSign) 2244 { 2245 ON('/') 2246 { 2247 m_temporary_buffer.clear(); 2248 SWITCH_TO(ScriptDataEscapedEndTagOpen); 2249 } 2250 ON_ASCII_ALPHA 2251 { 2252 m_temporary_buffer.clear(); 2253 EMIT_CHARACTER_AND_RECONSUME_IN('<', ScriptDataDoubleEscapeStart); 2254 } 2255 ANYTHING_ELSE 2256 { 2257 EMIT_CHARACTER_AND_RECONSUME_IN('<', ScriptDataEscaped); 2258 } 2259 } 2260 END_STATE 2261 2262 // 13.2.5.24 Script data escaped end tag open state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-end-tag-open-state 2263 BEGIN_STATE(ScriptDataEscapedEndTagOpen) 2264 { 2265 ON_ASCII_ALPHA 2266 { 2267 create_new_token(HTMLToken::Type::EndTag); 2268 RECONSUME_IN(ScriptDataEscapedEndTagName); 2269 } 2270 ANYTHING_ELSE 2271 { 2272 m_queued_tokens.enqueue(HTMLToken::make_character('<')); 2273 m_queued_tokens.enqueue(HTMLToken::make_character('/')); 2274 RECONSUME_IN(ScriptDataEscaped); 2275 } 2276 } 2277 END_STATE 2278 2279 // 13.2.5.25 Script data escaped end tag name state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-end-tag-name-state 2280 BEGIN_STATE(ScriptDataEscapedEndTagName) 2281 { 2282 ON_WHITESPACE 2283 { 2284 m_current_token.set_tag_name(consume_current_builder()); 2285 if (current_end_tag_token_is_appropriate()) 2286 SWITCH_TO(BeforeAttributeName); 2287 2288 m_queued_tokens.enqueue(HTMLToken::make_character('<')); 2289 m_queued_tokens.enqueue(HTMLToken::make_character('/')); 2290 // NOTE: The spec doesn't mention this, but it seems that m_current_token (an end tag) is just dropped in this case. 2291 m_current_builder.clear(); 2292 for (auto code_point : m_temporary_buffer) { 2293 m_queued_tokens.enqueue(HTMLToken::make_character(code_point)); 2294 } 2295 RECONSUME_IN(ScriptDataEscaped); 2296 } 2297 ON('/') 2298 { 2299 m_current_token.set_tag_name(consume_current_builder()); 2300 if (current_end_tag_token_is_appropriate()) 2301 SWITCH_TO(SelfClosingStartTag); 2302 2303 m_queued_tokens.enqueue(HTMLToken::make_character('<')); 2304 m_queued_tokens.enqueue(HTMLToken::make_character('/')); 2305 // NOTE: The spec doesn't mention this, but it seems that m_current_token (an end tag) is just dropped in this case. 2306 m_current_builder.clear(); 2307 for (auto code_point : m_temporary_buffer) { 2308 m_queued_tokens.enqueue(HTMLToken::make_character(code_point)); 2309 } 2310 RECONSUME_IN(ScriptDataEscaped); 2311 } 2312 ON('>') 2313 { 2314 m_current_token.set_tag_name(consume_current_builder()); 2315 if (current_end_tag_token_is_appropriate()) 2316 SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data); 2317 2318 m_queued_tokens.enqueue(HTMLToken::make_character('<')); 2319 m_queued_tokens.enqueue(HTMLToken::make_character('/')); 2320 // NOTE: The spec doesn't mention this, but it seems that m_current_token (an end tag) is just dropped in this case. 2321 m_current_builder.clear(); 2322 for (auto code_point : m_temporary_buffer) { 2323 m_queued_tokens.enqueue(HTMLToken::make_character(code_point)); 2324 } 2325 RECONSUME_IN(ScriptDataEscaped); 2326 } 2327 ON_ASCII_UPPER_ALPHA 2328 { 2329 m_current_builder.append_code_point(to_ascii_lowercase(current_input_character.value())); 2330 m_temporary_buffer.append(current_input_character.value()); 2331 continue; 2332 } 2333 ON_ASCII_LOWER_ALPHA 2334 { 2335 m_current_builder.append(current_input_character.value()); 2336 m_temporary_buffer.append(current_input_character.value()); 2337 continue; 2338 } 2339 ANYTHING_ELSE 2340 { 2341 m_queued_tokens.enqueue(HTMLToken::make_character('<')); 2342 m_queued_tokens.enqueue(HTMLToken::make_character('/')); 2343 // NOTE: The spec doesn't mention this, but it seems that m_current_token (an end tag) is just dropped in this case. 2344 m_current_builder.clear(); 2345 for (auto code_point : m_temporary_buffer) { 2346 m_queued_tokens.enqueue(HTMLToken::make_character(code_point)); 2347 } 2348 RECONSUME_IN(ScriptDataEscaped); 2349 } 2350 } 2351 END_STATE 2352 2353 // 13.2.5.26 Script data double escape start state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escape-start-state 2354 BEGIN_STATE(ScriptDataDoubleEscapeStart) 2355 { 2356 auto temporary_buffer_equal_to_script = [this]() -> bool { 2357 if (m_temporary_buffer.size() != 6) 2358 return false; 2359 2360 // FIXME: Is there a better way of doing this? 2361 return m_temporary_buffer[0] == 's' && m_temporary_buffer[1] == 'c' && m_temporary_buffer[2] == 'r' && m_temporary_buffer[3] == 'i' && m_temporary_buffer[4] == 'p' && m_temporary_buffer[5] == 't'; 2362 }; 2363 ON_WHITESPACE 2364 { 2365 if (temporary_buffer_equal_to_script()) 2366 SWITCH_TO_AND_EMIT_CURRENT_CHARACTER(ScriptDataDoubleEscaped); 2367 else 2368 SWITCH_TO_AND_EMIT_CURRENT_CHARACTER(ScriptDataEscaped); 2369 } 2370 ON('/') 2371 { 2372 if (temporary_buffer_equal_to_script()) 2373 SWITCH_TO_AND_EMIT_CURRENT_CHARACTER(ScriptDataDoubleEscaped); 2374 else 2375 SWITCH_TO_AND_EMIT_CURRENT_CHARACTER(ScriptDataEscaped); 2376 } 2377 ON('>') 2378 { 2379 if (temporary_buffer_equal_to_script()) 2380 SWITCH_TO_AND_EMIT_CURRENT_CHARACTER(ScriptDataDoubleEscaped); 2381 else 2382 SWITCH_TO_AND_EMIT_CURRENT_CHARACTER(ScriptDataEscaped); 2383 } 2384 ON_ASCII_UPPER_ALPHA 2385 { 2386 m_temporary_buffer.append(to_ascii_lowercase(current_input_character.value())); 2387 EMIT_CURRENT_CHARACTER; 2388 } 2389 ON_ASCII_LOWER_ALPHA 2390 { 2391 m_temporary_buffer.append(current_input_character.value()); 2392 EMIT_CURRENT_CHARACTER; 2393 } 2394 ANYTHING_ELSE 2395 { 2396 RECONSUME_IN(ScriptDataEscaped); 2397 } 2398 } 2399 END_STATE 2400 2401 // 13.2.5.27 Script data double escaped state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-state 2402 BEGIN_STATE(ScriptDataDoubleEscaped) 2403 { 2404 ON('-') 2405 { 2406 SWITCH_TO_AND_EMIT_CHARACTER('-', ScriptDataDoubleEscapedDash); 2407 } 2408 ON('<') 2409 { 2410 SWITCH_TO_AND_EMIT_CHARACTER('<', ScriptDataDoubleEscapedLessThanSign); 2411 } 2412 ON(0) 2413 { 2414 log_parse_error(); 2415 EMIT_CHARACTER(0xFFFD); 2416 } 2417 ON_EOF 2418 { 2419 log_parse_error(); 2420 EMIT_EOF; 2421 } 2422 ANYTHING_ELSE 2423 { 2424 EMIT_CURRENT_CHARACTER; 2425 } 2426 } 2427 END_STATE 2428 2429 // 13.2.5.28 Script data double escaped dash state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-dash-state 2430 BEGIN_STATE(ScriptDataDoubleEscapedDash) 2431 { 2432 ON('-') 2433 { 2434 SWITCH_TO_AND_EMIT_CHARACTER('-', ScriptDataDoubleEscapedDashDash); 2435 } 2436 ON('<') 2437 { 2438 SWITCH_TO_AND_EMIT_CHARACTER('<', ScriptDataDoubleEscapedLessThanSign); 2439 } 2440 ON(0) 2441 { 2442 log_parse_error(); 2443 SWITCH_TO_AND_EMIT_CHARACTER(0xFFFD, ScriptDataDoubleEscaped); 2444 } 2445 ON_EOF 2446 { 2447 log_parse_error(); 2448 EMIT_EOF; 2449 } 2450 ANYTHING_ELSE 2451 { 2452 SWITCH_TO_AND_EMIT_CURRENT_CHARACTER(ScriptDataDoubleEscaped); 2453 } 2454 } 2455 END_STATE 2456 2457 // 13.2.5.29 Script data double escaped dash dash state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-dash-dash-state 2458 BEGIN_STATE(ScriptDataDoubleEscapedDashDash) 2459 { 2460 ON('-') 2461 { 2462 EMIT_CHARACTER('-'); 2463 } 2464 ON('<') 2465 { 2466 SWITCH_TO_AND_EMIT_CHARACTER('<', ScriptDataDoubleEscapedLessThanSign); 2467 } 2468 ON('>') 2469 { 2470 SWITCH_TO_AND_EMIT_CHARACTER('>', ScriptData); 2471 } 2472 ON(0) 2473 { 2474 log_parse_error(); 2475 SWITCH_TO_AND_EMIT_CHARACTER(0xFFFD, ScriptDataDoubleEscaped); 2476 } 2477 ON_EOF 2478 { 2479 log_parse_error(); 2480 EMIT_EOF; 2481 } 2482 ANYTHING_ELSE 2483 { 2484 SWITCH_TO_AND_EMIT_CURRENT_CHARACTER(ScriptDataDoubleEscaped); 2485 } 2486 } 2487 END_STATE 2488 2489 // 13.2.5.30 Script data double escaped less-than sign state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-less-than-sign-state 2490 BEGIN_STATE(ScriptDataDoubleEscapedLessThanSign) 2491 { 2492 ON('/') 2493 { 2494 m_temporary_buffer.clear(); 2495 SWITCH_TO_AND_EMIT_CHARACTER('/', ScriptDataDoubleEscapeEnd); 2496 } 2497 ANYTHING_ELSE 2498 { 2499 RECONSUME_IN(ScriptDataDoubleEscaped); 2500 } 2501 } 2502 END_STATE 2503 2504 // 13.2.5.31 Script data double escape end state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escape-end-state 2505 BEGIN_STATE(ScriptDataDoubleEscapeEnd) 2506 { 2507 auto temporary_buffer_equal_to_script = [this]() -> bool { 2508 if (m_temporary_buffer.size() != 6) 2509 return false; 2510 2511 // FIXME: Is there a better way of doing this? 2512 return m_temporary_buffer[0] == 's' && m_temporary_buffer[1] == 'c' && m_temporary_buffer[2] == 'r' && m_temporary_buffer[3] == 'i' && m_temporary_buffer[4] == 'p' && m_temporary_buffer[5] == 't'; 2513 }; 2514 ON_WHITESPACE 2515 { 2516 if (temporary_buffer_equal_to_script()) 2517 SWITCH_TO_AND_EMIT_CURRENT_CHARACTER(ScriptDataEscaped); 2518 else 2519 SWITCH_TO_AND_EMIT_CURRENT_CHARACTER(ScriptDataDoubleEscaped); 2520 } 2521 ON('/') 2522 { 2523 if (temporary_buffer_equal_to_script()) 2524 SWITCH_TO_AND_EMIT_CURRENT_CHARACTER(ScriptDataEscaped); 2525 else 2526 SWITCH_TO_AND_EMIT_CURRENT_CHARACTER(ScriptDataDoubleEscaped); 2527 } 2528 ON('>') 2529 { 2530 if (temporary_buffer_equal_to_script()) 2531 SWITCH_TO_AND_EMIT_CURRENT_CHARACTER(ScriptDataEscaped); 2532 else 2533 SWITCH_TO_AND_EMIT_CURRENT_CHARACTER(ScriptDataDoubleEscaped); 2534 } 2535 ON_ASCII_UPPER_ALPHA 2536 { 2537 m_temporary_buffer.append(to_ascii_lowercase(current_input_character.value())); 2538 EMIT_CURRENT_CHARACTER; 2539 } 2540 ON_ASCII_LOWER_ALPHA 2541 { 2542 m_temporary_buffer.append(current_input_character.value()); 2543 EMIT_CURRENT_CHARACTER; 2544 } 2545 ANYTHING_ELSE 2546 { 2547 RECONSUME_IN(ScriptDataDoubleEscaped); 2548 } 2549 } 2550 END_STATE 2551 2552 // 13.2.5.21 Script data escaped dash state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-dash-state 2553 BEGIN_STATE(ScriptDataEscapedDash) 2554 { 2555 ON('-') 2556 { 2557 SWITCH_TO_AND_EMIT_CHARACTER('-', ScriptDataEscapedDashDash); 2558 } 2559 ON('<') 2560 { 2561 SWITCH_TO(ScriptDataEscapedLessThanSign); 2562 } 2563 ON(0) 2564 { 2565 log_parse_error(); 2566 SWITCH_TO_AND_EMIT_CHARACTER(0xFFFD, ScriptDataEscaped); 2567 } 2568 ON_EOF 2569 { 2570 log_parse_error(); 2571 EMIT_EOF; 2572 } 2573 ANYTHING_ELSE 2574 { 2575 SWITCH_TO_AND_EMIT_CURRENT_CHARACTER(ScriptDataEscaped); 2576 } 2577 } 2578 END_STATE 2579 2580 // 13.2.5.20 Script data escaped state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-state 2581 BEGIN_STATE(ScriptDataEscaped) 2582 { 2583 ON('-') 2584 { 2585 SWITCH_TO_AND_EMIT_CHARACTER('-', ScriptDataEscapedDash); 2586 } 2587 ON('<') 2588 { 2589 SWITCH_TO(ScriptDataEscapedLessThanSign); 2590 } 2591 ON(0) 2592 { 2593 log_parse_error(); 2594 EMIT_CHARACTER(0xFFFD); 2595 } 2596 ON_EOF 2597 { 2598 log_parse_error(); 2599 EMIT_EOF; 2600 } 2601 ANYTHING_ELSE 2602 { 2603 EMIT_CURRENT_CHARACTER; 2604 } 2605 } 2606 END_STATE 2607 2608 // 13.2.5.16 Script data end tag open state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-end-tag-open-state 2609 BEGIN_STATE(ScriptDataEndTagOpen) 2610 { 2611 ON_ASCII_ALPHA 2612 { 2613 create_new_token(HTMLToken::Type::EndTag); 2614 RECONSUME_IN(ScriptDataEndTagName); 2615 } 2616 ANYTHING_ELSE 2617 { 2618 m_queued_tokens.enqueue(HTMLToken::make_character('<')); 2619 m_queued_tokens.enqueue(HTMLToken::make_character('/')); 2620 RECONSUME_IN(ScriptData); 2621 } 2622 } 2623 END_STATE 2624 2625 // 13.2.5.17 Script data end tag name state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-end-tag-name-state 2626 BEGIN_STATE(ScriptDataEndTagName) 2627 { 2628 ON_WHITESPACE 2629 { 2630 m_current_token.set_tag_name(consume_current_builder()); 2631 if (current_end_tag_token_is_appropriate()) 2632 SWITCH_TO(BeforeAttributeName); 2633 m_queued_tokens.enqueue(HTMLToken::make_character('<')); 2634 m_queued_tokens.enqueue(HTMLToken::make_character('/')); 2635 // NOTE: The spec doesn't mention this, but it seems that m_current_token (an end tag) is just dropped in this case. 2636 m_current_builder.clear(); 2637 for (auto code_point : m_temporary_buffer) 2638 m_queued_tokens.enqueue(HTMLToken::make_character(code_point)); 2639 RECONSUME_IN(ScriptData); 2640 } 2641 ON('/') 2642 { 2643 m_current_token.set_tag_name(consume_current_builder()); 2644 if (current_end_tag_token_is_appropriate()) 2645 SWITCH_TO(SelfClosingStartTag); 2646 m_queued_tokens.enqueue(HTMLToken::make_character('<')); 2647 m_queued_tokens.enqueue(HTMLToken::make_character('/')); 2648 // NOTE: The spec doesn't mention this, but it seems that m_current_token (an end tag) is just dropped in this case. 2649 m_current_builder.clear(); 2650 for (auto code_point : m_temporary_buffer) 2651 m_queued_tokens.enqueue(HTMLToken::make_character(code_point)); 2652 RECONSUME_IN(ScriptData); 2653 } 2654 ON('>') 2655 { 2656 m_current_token.set_tag_name(consume_current_builder()); 2657 if (current_end_tag_token_is_appropriate()) 2658 SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data); 2659 m_queued_tokens.enqueue(HTMLToken::make_character('<')); 2660 m_queued_tokens.enqueue(HTMLToken::make_character('/')); 2661 // NOTE: The spec doesn't mention this, but it seems that m_current_token (an end tag) is just dropped in this case. 2662 m_current_builder.clear(); 2663 for (auto code_point : m_temporary_buffer) 2664 m_queued_tokens.enqueue(HTMLToken::make_character(code_point)); 2665 RECONSUME_IN(ScriptData); 2666 } 2667 ON_ASCII_UPPER_ALPHA 2668 { 2669 m_current_builder.append_code_point(to_ascii_lowercase(current_input_character.value())); 2670 m_temporary_buffer.append(current_input_character.value()); 2671 continue; 2672 } 2673 ON_ASCII_LOWER_ALPHA 2674 { 2675 m_current_builder.append(current_input_character.value()); 2676 m_temporary_buffer.append(current_input_character.value()); 2677 continue; 2678 } 2679 ANYTHING_ELSE 2680 { 2681 m_queued_tokens.enqueue(HTMLToken::make_character('<')); 2682 m_queued_tokens.enqueue(HTMLToken::make_character('/')); 2683 // NOTE: The spec doesn't mention this, but it seems that m_current_token (an end tag) is just dropped in this case. 2684 m_current_builder.clear(); 2685 for (auto code_point : m_temporary_buffer) 2686 m_queued_tokens.enqueue(HTMLToken::make_character(code_point)); 2687 RECONSUME_IN(ScriptData); 2688 } 2689 } 2690 END_STATE 2691 2692 // 13.2.5.69 CDATA section state, https://html.spec.whatwg.org/multipage/parsing.html#cdata-section-state 2693 BEGIN_STATE(CDATASection) 2694 { 2695 ON(']') 2696 { 2697 SWITCH_TO(CDATASectionBracket); 2698 } 2699 ON_EOF 2700 { 2701 log_parse_error(); 2702 EMIT_EOF; 2703 } 2704 ANYTHING_ELSE 2705 { 2706 EMIT_CURRENT_CHARACTER; 2707 } 2708 } 2709 END_STATE 2710 2711 // 13.2.5.70 CDATA section bracket state, https://html.spec.whatwg.org/multipage/parsing.html#cdata-section-bracket-state 2712 BEGIN_STATE(CDATASectionBracket) 2713 { 2714 ON(']') 2715 { 2716 SWITCH_TO(CDATASectionEnd); 2717 } 2718 ANYTHING_ELSE 2719 { 2720 EMIT_CHARACTER_AND_RECONSUME_IN(']', CDATASection); 2721 } 2722 } 2723 END_STATE 2724 2725 // 13.2.5.71 CDATA section end state, https://html.spec.whatwg.org/multipage/parsing.html#cdata-section-end-state 2726 BEGIN_STATE(CDATASectionEnd) 2727 { 2728 ON(']') 2729 { 2730 EMIT_CHARACTER(']'); 2731 } 2732 ON('>') 2733 { 2734 SWITCH_TO(Data); 2735 } 2736 ANYTHING_ELSE 2737 { 2738 m_queued_tokens.enqueue(HTMLToken::make_character(']')); 2739 m_queued_tokens.enqueue(HTMLToken::make_character(']')); 2740 RECONSUME_IN(CDATASection); 2741 } 2742 } 2743 END_STATE 2744 2745 default: 2746 TODO(); 2747 } 2748 } 2749} 2750 2751bool HTMLTokenizer::consume_next_if_match(StringView string, CaseSensitivity case_sensitivity) 2752{ 2753 for (size_t i = 0; i < string.length(); ++i) { 2754 auto code_point = peek_code_point(i); 2755 if (!code_point.has_value()) 2756 return false; 2757 // FIXME: This should be more Unicode-aware. 2758 if (case_sensitivity == CaseSensitivity::CaseInsensitive) { 2759 if (code_point.value() < 0x80) { 2760 if (to_ascii_lowercase(code_point.value()) != to_ascii_lowercase(string[i])) 2761 return false; 2762 continue; 2763 } 2764 } 2765 if (code_point.value() != (u32)string[i]) 2766 return false; 2767 } 2768 skip(string.length()); 2769 return true; 2770} 2771 2772void HTMLTokenizer::create_new_token(HTMLToken::Type type) 2773{ 2774 m_current_token = { type }; 2775 size_t offset = 0; 2776 switch (type) { 2777 case HTMLToken::Type::StartTag: 2778 offset = 1; 2779 break; 2780 case HTMLToken::Type::EndTag: 2781 offset = 2; 2782 break; 2783 default: 2784 break; 2785 } 2786 2787 m_current_token.set_start_position({}, nth_last_position(offset)); 2788} 2789 2790HTMLTokenizer::HTMLTokenizer() 2791{ 2792 m_decoded_input = ""; 2793 m_utf8_view = Utf8View(m_decoded_input); 2794 m_utf8_iterator = m_utf8_view.begin(); 2795 m_prev_utf8_iterator = m_utf8_view.begin(); 2796 m_source_positions.empend(0u, 0u); 2797} 2798 2799HTMLTokenizer::HTMLTokenizer(StringView input, DeprecatedString const& encoding) 2800{ 2801 auto decoder = TextCodec::decoder_for(encoding); 2802 VERIFY(decoder.has_value()); 2803 m_decoded_input = decoder->to_utf8(input).release_value_but_fixme_should_propagate_errors().to_deprecated_string(); 2804 m_utf8_view = Utf8View(m_decoded_input); 2805 m_utf8_iterator = m_utf8_view.begin(); 2806 m_prev_utf8_iterator = m_utf8_view.begin(); 2807 m_source_positions.empend(0u, 0u); 2808} 2809 2810void HTMLTokenizer::insert_input_at_insertion_point(DeprecatedString const& input) 2811{ 2812 auto utf8_iterator_byte_offset = m_utf8_view.byte_offset_of(m_utf8_iterator); 2813 2814 // FIXME: Implement a InputStream to handle insertion_point and iterators. 2815 StringBuilder builder {}; 2816 builder.append(m_decoded_input.substring(0, m_insertion_point.position)); 2817 builder.append(input); 2818 builder.append(m_decoded_input.substring(m_insertion_point.position)); 2819 m_decoded_input = builder.to_deprecated_string(); 2820 2821 m_utf8_view = Utf8View(m_decoded_input); 2822 m_utf8_iterator = m_utf8_view.iterator_at_byte_offset(utf8_iterator_byte_offset); 2823 2824 m_insertion_point.position += input.length(); 2825} 2826 2827void HTMLTokenizer::insert_eof() 2828{ 2829 m_explicit_eof_inserted = true; 2830} 2831 2832bool HTMLTokenizer::is_eof_inserted() 2833{ 2834 return m_explicit_eof_inserted; 2835} 2836 2837void HTMLTokenizer::will_switch_to([[maybe_unused]] State new_state) 2838{ 2839 dbgln_if(TOKENIZER_TRACE_DEBUG, "[{}] Switch to {}", state_name(m_state), state_name(new_state)); 2840} 2841 2842void HTMLTokenizer::will_reconsume_in([[maybe_unused]] State new_state) 2843{ 2844 dbgln_if(TOKENIZER_TRACE_DEBUG, "[{}] Reconsume in {}", state_name(m_state), state_name(new_state)); 2845} 2846 2847void HTMLTokenizer::switch_to(Badge<HTMLParser>, State new_state) 2848{ 2849 dbgln_if(TOKENIZER_TRACE_DEBUG, "[{}] Parser switches tokenizer state to {}", state_name(m_state), state_name(new_state)); 2850 m_state = new_state; 2851} 2852 2853void HTMLTokenizer::will_emit(HTMLToken& token) 2854{ 2855 if (token.is_start_tag()) 2856 m_last_emitted_start_tag_name = token.tag_name(); 2857 token.set_end_position({}, nth_last_position(0)); 2858} 2859 2860bool HTMLTokenizer::current_end_tag_token_is_appropriate() const 2861{ 2862 VERIFY(m_current_token.is_end_tag()); 2863 if (!m_last_emitted_start_tag_name.has_value()) 2864 return false; 2865 return m_current_token.tag_name() == m_last_emitted_start_tag_name.value(); 2866} 2867 2868bool HTMLTokenizer::consumed_as_part_of_an_attribute() const 2869{ 2870 return m_return_state == State::AttributeValueUnquoted || m_return_state == State::AttributeValueSingleQuoted || m_return_state == State::AttributeValueDoubleQuoted; 2871} 2872 2873void HTMLTokenizer::restore_to(Utf8CodePointIterator const& new_iterator) 2874{ 2875 auto diff = m_utf8_iterator - new_iterator; 2876 if (diff > 0) { 2877 for (ssize_t i = 0; i < diff; ++i) { 2878 if (!m_source_positions.is_empty()) 2879 m_source_positions.take_last(); 2880 } 2881 } else { 2882 // Going forwards...? 2883 TODO(); 2884 } 2885 m_utf8_iterator = new_iterator; 2886} 2887 2888DeprecatedString HTMLTokenizer::consume_current_builder() 2889{ 2890 auto string = m_current_builder.to_deprecated_string(); 2891 m_current_builder.clear(); 2892 return string; 2893} 2894 2895}