Serenity Operating System
at master 1781 lines 59 kB view raw
1/* 2 * Copyright (c) 2022, Ali Mohammad Pur <mpfard@serenityos.org> 3 * 4 * SPDX-License-Identifier: BSD-2-Clause 5 */ 6 7#include <LibXML/DOM/Document.h> 8#include <LibXML/Parser/Parser.h> 9 10struct Range { 11 consteval Range(u32 start, u32 end) 12 : start(start) 13 , end(end) 14 { 15 } 16 17 u32 start; 18 u32 end; 19}; 20 21template<auto... ranges> 22struct ranges_for_search { 23 auto contains(u32 value) const 24 { 25 return ((value >= ranges.start && value <= ranges.end) || ...); 26 } 27 28 bool operator()(u32 value) const 29 { 30 return contains(value); 31 } 32 33 template<auto... ranges_to_include> 34 consteval auto with() const 35 { 36 return ranges_for_search<ranges..., ranges_to_include...>(); 37 } 38 39 template<auto... ranges_to_include> 40 consteval auto unify(ranges_for_search<ranges_to_include...> const&) const 41 { 42 return ranges_for_search<ranges..., ranges_to_include...>(); 43 } 44}; 45 46template<size_t Count, typename Element> 47struct StringSet { 48 consteval StringSet(Element const (&entries)[Count]) 49 { 50 for (size_t i = 0; i < Count - 1; ++i) 51 elements[i] = entries[i]; 52 } 53 54 consteval auto operator[](size_t i) const { return elements[i]; } 55 56 Element elements[Count - 1]; 57}; 58 59template<StringSet chars> 60consteval static auto set_to_search() 61{ 62 return ([&]<auto... Ix>(IndexSequence<Ix...>) { 63 return ranges_for_search<Range(chars[Ix], chars[Ix])...>(); 64 }(MakeIndexSequence<array_size(chars.elements)>())); 65} 66 67namespace XML { 68 69size_t Parser::s_debug_indent_level { 0 }; 70 71void Parser::append_node(NonnullOwnPtr<Node> node) 72{ 73 if (m_entered_node) { 74 m_entered_node->content.get<Node::Element>().children.append(move(node)); 75 } else { 76 m_root_node = move(node); 77 m_entered_node = m_root_node.ptr(); 78 } 79} 80 81void Parser::append_text(StringView text) 82{ 83 if (m_listener) { 84 m_listener->text(text); 85 return; 86 } 87 88 if (!m_entered_node) { 89 Node::Text node; 90 node.builder.append(text); 91 m_root_node = make<Node>(move(node)); 92 return; 93 } 94 95 m_entered_node->content.visit( 96 [&](Node::Element& node) { 97 if (!node.children.is_empty()) { 98 auto* text_node = node.children.last()->content.get_pointer<Node::Text>(); 99 if (text_node) { 100 text_node->builder.append(text); 101 return; 102 } 103 } 104 Node::Text text_node; 105 text_node.builder.append(text); 106 node.children.append(make<Node>(move(text_node))); 107 }, 108 [&](auto&) { 109 // Can't enter a text or comment node. 110 VERIFY_NOT_REACHED(); 111 }); 112} 113 114void Parser::append_comment(StringView text) 115{ 116 if (m_listener) { 117 m_listener->comment(text); 118 return; 119 } 120 121 // If there's no node to attach this to, drop it on the floor. 122 // This can happen to comments in the prolog. 123 if (!m_entered_node) 124 return; 125 126 m_entered_node->content.visit( 127 [&](Node::Element& node) { 128 node.children.append(make<Node>(Node::Comment { text })); 129 }, 130 [&](auto&) { 131 // Can't enter a text or comment node. 132 VERIFY_NOT_REACHED(); 133 }); 134} 135 136void Parser::enter_node(Node& node) 137{ 138 if (m_listener) { 139 auto& element = node.content.get<Node::Element>(); 140 m_listener->element_start(element.name, element.attributes); 141 } 142 143 if (&node != m_root_node.ptr()) 144 node.parent = m_entered_node; 145 m_entered_node = &node; 146} 147 148void Parser::leave_node() 149{ 150 if (m_listener) { 151 auto& element = m_entered_node->content.get<Node::Element>(); 152 m_listener->element_end(element.name); 153 } 154 155 m_entered_node = m_entered_node->parent; 156} 157 158ErrorOr<Document, ParseError> Parser::parse() 159{ 160 if (auto result = parse_internal(); result.is_error()) { 161 if (m_parse_errors.is_empty()) 162 return result.release_error(); 163 return m_parse_errors.take_first(); 164 } 165 return Document { 166 m_root_node.release_nonnull(), 167 move(m_doctype), 168 move(m_processing_instructions), 169 m_version, 170 }; 171} 172 173ErrorOr<void, ParseError> Parser::parse_with_listener(Listener& listener) 174{ 175 m_listener = &listener; 176 ScopeGuard unset_listener { [this] { m_listener = nullptr; } }; 177 m_listener->set_source(m_source); 178 m_listener->document_start(); 179 auto result = parse_internal(); 180 if (result.is_error()) 181 m_listener->error(result.error()); 182 m_listener->document_end(); 183 m_root_node.clear(); 184 return result; 185} 186 187// 2.3.3. S, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-S 188ErrorOr<void, ParseError> Parser::skip_whitespace(Required required) 189{ 190 auto rollback = rollback_point(); 191 auto rule = enter_rule(); 192 193 // S ::= (#x20 | #x9 | #xD | #xA)+ 194 auto matched = m_lexer.consume_while(is_any_of("\x20\x09\x0d\x0a"sv)); 195 if (required == Required::Yes && matched.is_empty()) 196 return parse_error(m_lexer.tell(), "Expected whitespace"); 197 198 rollback.disarm(); 199 return {}; 200} 201 202// 2.2.a. RestrictedChar, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-RestrictedChar 203constexpr static auto s_restricted_characters = ranges_for_search<Range(0x1, 0x8), Range(0xb, 0xc), Range(0xe, 0x1f), Range(0x7f, 0x84), Range(0x86, 0x9f)>(); 204 205// 2.1.1. Document, https://www.w3.org/TR/2006/REC-xml11-20060816/#sec-well-formed 206ErrorOr<void, ParseError> Parser::parse_internal() 207{ 208 auto rule = enter_rule(); 209 210 // document ::= ( prolog element Misc* ) - ( Char* RestrictedChar Char* ) 211 TRY(parse_prolog()); 212 TRY(parse_element()); 213 while (true) { 214 if (auto result = parse_misc(); result.is_error()) 215 break; 216 } 217 218 auto matched_source = m_source.substring_view(0, m_lexer.tell()); 219 if (auto it = find_if(matched_source.begin(), matched_source.end(), s_restricted_characters); !it.is_end()) { 220 return parse_error( 221 it.index(), 222 DeprecatedString::formatted("Invalid character #{:x} used in document", *it)); 223 } 224 225 if (!m_lexer.is_eof()) 226 return parse_error(m_lexer.tell(), "Garbage after document"); 227 228 return {}; 229} 230 231ErrorOr<void, ParseError> Parser::expect(StringView expected) 232{ 233 auto rollback = rollback_point(); 234 235 if (!m_lexer.consume_specific(expected)) { 236 if (m_options.treat_errors_as_fatal) 237 return parse_error(m_lexer.tell(), DeprecatedString::formatted("Expected '{}'", expected)); 238 } 239 240 rollback.disarm(); 241 return {}; 242} 243 244template<typename Pred> 245requires(IsCallableWithArguments<Pred, bool, char>) ErrorOr<StringView, ParseError> Parser::expect(Pred predicate, StringView description) 246{ 247 auto rollback = rollback_point(); 248 auto start = m_lexer.tell(); 249 if (!m_lexer.next_is(predicate)) { 250 if (m_options.treat_errors_as_fatal) 251 return parse_error(m_lexer.tell(), DeprecatedString::formatted("Expected {}", description)); 252 } 253 254 m_lexer.ignore(); 255 rollback.disarm(); 256 return m_source.substring_view(start, m_lexer.tell() - start); 257} 258 259template<typename Pred> 260requires(IsCallableWithArguments<Pred, bool, char>) ErrorOr<StringView, ParseError> Parser::expect_many(Pred predicate, StringView description) 261{ 262 auto rollback = rollback_point(); 263 auto start = m_lexer.tell(); 264 while (m_lexer.next_is(predicate)) { 265 if (m_lexer.is_eof()) 266 break; 267 m_lexer.ignore(); 268 } 269 270 if (m_lexer.tell() == start) { 271 if (m_options.treat_errors_as_fatal) { 272 return parse_error(m_lexer.tell(), DeprecatedString::formatted("Expected {}", description)); 273 } 274 } 275 276 rollback.disarm(); 277 return m_source.substring_view(start, m_lexer.tell() - start); 278} 279 280// 2.8.22. Prolog, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-prolog 281ErrorOr<void, ParseError> Parser::parse_prolog() 282{ 283 auto rollback = rollback_point(); 284 auto rule = enter_rule(); 285 286 // prolog ::= XMLDecl Misc* (doctypedecl Misc*)? 287 // The following is valid in XML 1.0. 288 // prolog ::= XMLDecl? Misc* (doctypedecl Misc*)? 289 if (auto result = parse_xml_decl(); result.is_error()) { 290 m_version = Version::Version10; 291 m_in_compatibility_mode = true; 292 } 293 auto accept = accept_rule(); 294 295 while (true) { 296 if (auto result = parse_misc(); result.is_error()) 297 break; 298 } 299 300 if (auto result = parse_doctype_decl(); !result.is_error()) { 301 while (true) { 302 if (auto result = parse_misc(); result.is_error()) 303 break; 304 } 305 } 306 307 rollback.disarm(); 308 return {}; 309} 310 311// 2.8.23. XMLDecl, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-XMLDecl 312ErrorOr<void, ParseError> Parser::parse_xml_decl() 313{ 314 auto rollback = rollback_point(); 315 auto rule = enter_rule(); 316 317 // XMLDecl::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>' 318 319 TRY(expect("<?xml"sv)); 320 auto accept = accept_rule(); 321 322 TRY(parse_version_info()); 323 (void)parse_encoding_decl(); 324 (void)parse_standalone_document_decl(); 325 TRY(skip_whitespace()); 326 TRY(expect("?>"sv)); 327 328 rollback.disarm(); 329 return {}; 330} 331 332// 2.8.24. VersionInfo, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-VersionInfo 333ErrorOr<void, ParseError> Parser::parse_version_info() 334{ 335 auto rollback = rollback_point(); 336 auto rule = enter_rule(); 337 338 // VersionInfo ::= S 'version' Eq ("'" VersionNum "'" | '"' VersionNum '"') 339 TRY(skip_whitespace(Required::Yes)); 340 TRY(expect("version"sv)); 341 auto accept = accept_rule(); 342 343 TRY(parse_eq()); 344 TRY(expect(is_any_of("'\""sv), "one of ' or \""sv)); 345 m_lexer.retreat(); 346 347 auto version_string = m_lexer.consume_quoted_string(); 348 if (version_string == "1.0") { 349 // FIXME: Compatibility mode, figure out which rules are different in XML 1.0. 350 m_version = Version::Version10; 351 m_in_compatibility_mode = true; 352 } else { 353 if (version_string != "1.1" && m_options.treat_errors_as_fatal) 354 return parse_error(m_lexer.tell(), DeprecatedString::formatted("Expected '1.1', found '{}'", version_string)); 355 } 356 357 m_version = Version::Version11; 358 rollback.disarm(); 359 return {}; 360} 361 362// 2.8.25. Eq, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-Eq 363ErrorOr<void, ParseError> Parser::parse_eq() 364{ 365 auto rollback = rollback_point(); 366 auto rule = enter_rule(); 367 368 // Eq ::= S? '=' S? 369 auto accept = accept_rule(); 370 TRY(skip_whitespace()); 371 TRY(expect("="sv)); 372 TRY(skip_whitespace()); 373 rollback.disarm(); 374 return {}; 375} 376 377// 4.3.3.80. EncodingDecl, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-EncodingDecl 378ErrorOr<void, ParseError> Parser::parse_encoding_decl() 379{ 380 auto rollback = rollback_point(); 381 auto rule = enter_rule(); 382 383 // EncodingDecl ::= S 'encoding' Eq ('"' EncName '"' | "'" EncName "'" ) 384 TRY(skip_whitespace(Required::Yes)); 385 TRY(expect("encoding"sv)); 386 auto accept = accept_rule(); 387 388 TRY(parse_eq()); 389 TRY(expect(is_any_of("'\""sv), "one of ' or \""sv)); 390 m_lexer.retreat(); 391 392 // FIXME: Actually do something with this encoding. 393 m_encoding = m_lexer.consume_quoted_string(); 394 395 rollback.disarm(); 396 return {}; 397} 398 399// 2.9.32 SDDecl, https://www.w3.org/TR/2006/REC-xml11-20060816/#sec-rmd 400ErrorOr<void, ParseError> Parser::parse_standalone_document_decl() 401{ 402 auto rollback = rollback_point(); 403 auto rule = enter_rule(); 404 405 // SDDecl ::= S 'standalone' Eq (("'" ('yes' | 'no') "'") | ('"' ('yes' | 'no') '"')) 406 TRY(skip_whitespace(Required::Yes)); 407 TRY(expect("standalone"sv)); 408 auto accept = accept_rule(); 409 410 TRY(expect(is_any_of("'\""sv), "one of ' or \""sv)); 411 m_lexer.retreat(); 412 413 auto value = m_lexer.consume_quoted_string(); 414 if (!value.is_one_of("yes", "no")) 415 return parse_error(m_lexer.tell() - value.length(), "Expected one of 'yes' or 'no'"); 416 417 m_standalone = value == "yes"; 418 419 rollback.disarm(); 420 return {}; 421} 422 423// 2.8.27. Misc, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-Misc 424ErrorOr<void, ParseError> Parser::parse_misc() 425{ 426 auto rollback = rollback_point(); 427 auto rule = enter_rule(); 428 429 // Misc ::= Comment | PI | S 430 if (auto result = parse_comment(); !result.is_error()) { 431 rollback.disarm(); 432 return {}; 433 } 434 435 if (auto result = parse_processing_instruction(); !result.is_error()) { 436 rollback.disarm(); 437 return {}; 438 } 439 440 if (auto result = skip_whitespace(Required::Yes); !result.is_error()) { 441 rollback.disarm(); 442 return {}; 443 } 444 445 return parse_error(m_lexer.tell(), "Expected a match for 'Misc', but found none"); 446} 447 448// 2.5.15 Comment, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-Comment 449ErrorOr<void, ParseError> Parser::parse_comment() 450{ 451 auto rollback = rollback_point(); 452 auto rule = enter_rule(); 453 454 // Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->' 455 TRY(expect("<!--"sv)); 456 auto accept = accept_rule(); 457 458 bool last_seen_a_dash = false; 459 // FIXME: This should disallow surrogate blocks 460 auto text = m_lexer.consume_while([&](auto ch) { 461 if (ch != '-') { 462 last_seen_a_dash = false; 463 return true; 464 } 465 466 if (last_seen_a_dash) 467 return false; 468 469 last_seen_a_dash = true; 470 return true; 471 }); 472 473 if (last_seen_a_dash) { 474 m_lexer.retreat(); 475 text = text.substring_view(0, text.length() - 1); 476 } 477 478 TRY(expect("-->"sv)); 479 480 if (m_options.preserve_comments) 481 append_comment(text); 482 483 rollback.disarm(); 484 return {}; 485} 486 487// 2.6.16 PI, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PI 488ErrorOr<void, ParseError> Parser::parse_processing_instruction() 489{ 490 auto rollback = rollback_point(); 491 auto rule = enter_rule(); 492 493 // PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>' 494 TRY(expect("<?"sv)); 495 auto accept = accept_rule(); 496 497 auto target = TRY(parse_processing_instruction_target()); 498 DeprecatedString data; 499 if (auto result = skip_whitespace(Required::Yes); !result.is_error()) 500 data = m_lexer.consume_until("?>"); 501 TRY(expect("?>"sv)); 502 503 m_processing_instructions.set(target, data); 504 rollback.disarm(); 505 return {}; 506} 507 508// 2.6.17. PITarget, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PITarget 509ErrorOr<Name, ParseError> Parser::parse_processing_instruction_target() 510{ 511 auto rollback = rollback_point(); 512 auto rule = enter_rule(); 513 514 // PITarget ::= Name - (('X' | 'x') ('M' | 'm') ('L' | 'l')) 515 auto target = TRY(parse_name()); 516 auto accept = accept_rule(); 517 518 if (target.equals_ignoring_ascii_case("xml"sv) && m_options.treat_errors_as_fatal) { 519 return parse_error( 520 m_lexer.tell() - target.length(), 521 "Use of the reserved 'xml' name for processing instruction target name is disallowed"); 522 } 523 524 rollback.disarm(); 525 return target; 526} 527 528// NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | [#x10000-#xEFFFF] 529constexpr static auto s_name_start_characters = ranges_for_search<Range(':', ':'), Range('A', 'Z'), Range('_', '_'), Range('a', 'z'), Range(0xc0, 0xd6), Range(0xd8, 0xf6), Range(0xf8, 0x2ff), Range(0x370, 0x37d), Range(0x37f, 0x1fff), Range(0x200c, 0x200d), Range(0x2070, 0x218f), Range(0x2c00, 0x2fef), Range(0x3001, 0xd7ff), Range(0xf900, 0xfdcf), Range(0xfdf0, 0xfffd), Range(0x10000, 0xeffff)> {}; 530 531// NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] | [#x203F-#x2040] 532constexpr static auto s_name_characters = s_name_start_characters.with<Range('-', '-'), Range('.', '.'), Range('0', '9'), Range(0xb7, 0xb7), Range(0x0300, 0x036f), Range(0x203f, 0x2040)>(); 533 534// 2.3.5. Name, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-Name 535ErrorOr<Name, ParseError> Parser::parse_name() 536{ 537 auto rollback = rollback_point(); 538 auto rule = enter_rule(); 539 540 // Name ::= NameStartChar (NameChar)* 541 auto start = TRY(expect(s_name_start_characters, "a NameStartChar"sv)); 542 auto accept = accept_rule(); 543 544 auto rest = m_lexer.consume_while(s_name_characters); 545 StringBuilder builder; 546 builder.append(start); 547 builder.append(rest); 548 549 rollback.disarm(); 550 return builder.to_deprecated_string(); 551} 552 553// 2.8.28. doctypedecl, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-doctypedecl 554ErrorOr<void, ParseError> Parser::parse_doctype_decl() 555{ 556 auto rollback = rollback_point(); 557 auto rule = enter_rule(); 558 Doctype doctype; 559 560 // doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S? ('[' intSubset ']' S?)? '>' 561 TRY(expect("<!DOCTYPE"sv)); 562 auto accept = accept_rule(); 563 564 TRY(skip_whitespace(Required::Yes)); 565 doctype.type = TRY(parse_name()); 566 if (auto result = skip_whitespace(Required::Yes); !result.is_error()) { 567 auto id_start = m_lexer.tell(); 568 if (auto id_result = parse_external_id(); !id_result.is_error()) { 569 doctype.external_id = id_result.release_value(); 570 if (m_options.resolve_external_resource) { 571 auto resource_result = m_options.resolve_external_resource(doctype.external_id->system_id, doctype.external_id->public_id); 572 if (resource_result.is_error()) { 573 return parse_error( 574 id_start, 575 DeprecatedString::formatted("Failed to resolve external subset '{}': {}", doctype.external_id->system_id.system_literal, resource_result.error())); 576 } 577 StringView resolved_source = resource_result.value(); 578 TemporaryChange source { m_source, resolved_source }; 579 TemporaryChange lexer { m_lexer, GenericLexer(m_source) }; 580 auto declarations = TRY(parse_external_subset()); 581 if (!m_lexer.is_eof()) { 582 return parse_error( 583 m_lexer.tell(), 584 DeprecatedString::formatted("Failed to resolve external subset '{}': garbage after declarations", doctype.external_id->system_id.system_literal)); 585 } 586 doctype.markup_declarations.extend(move(declarations)); 587 } 588 } 589 } 590 TRY(skip_whitespace(Required::No)); 591 if (m_lexer.consume_specific('[')) { 592 auto internal_subset = TRY(parse_internal_subset()); 593 TRY(expect("]"sv)); 594 TRY(skip_whitespace()); 595 doctype.markup_declarations.extend(internal_subset); 596 } 597 598 TRY(expect(">"sv)); 599 600 rollback.disarm(); 601 m_doctype = move(doctype); 602 return {}; 603} 604 605// 3.39. element, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-element 606ErrorOr<void, ParseError> Parser::parse_element() 607{ 608 auto rollback = rollback_point(); 609 auto rule = enter_rule(); 610 611 // element ::= EmptyElemTag 612 // | STag content ETag 613 if (auto result = parse_empty_element_tag(); !result.is_error()) { 614 append_node(result.release_value()); 615 rollback.disarm(); 616 return {}; 617 } 618 619 auto start_tag = TRY(parse_start_tag()); 620 auto& node = *start_tag; 621 auto& tag = node.content.get<Node::Element>(); 622 append_node(move(start_tag)); 623 enter_node(node); 624 ScopeGuard quit { 625 [&] { 626 leave_node(); 627 } 628 }; 629 630 TRY(parse_content()); 631 632 auto tag_location = m_lexer.tell(); 633 auto closing_name = TRY(parse_end_tag()); 634 635 // Well-formedness constraint: The Name in an element's end-tag MUST match the element type in the start-tag. 636 if (m_options.treat_errors_as_fatal && closing_name != tag.name) 637 return parse_error(tag_location, "Invalid closing tag"); 638 639 rollback.disarm(); 640 return {}; 641} 642 643// 3.1.44. EmptyElemTag, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-EmptyElemTag 644ErrorOr<NonnullOwnPtr<Node>, ParseError> Parser::parse_empty_element_tag() 645{ 646 auto rollback = rollback_point(); 647 auto rule = enter_rule(); 648 649 // EmptyElemTag ::= '<' Name (S Attribute)* S? '/>' 650 TRY(expect("<"sv)); 651 auto accept = accept_rule(); 652 653 auto name = TRY(parse_name()); 654 HashMap<Name, DeprecatedString> attributes; 655 656 while (true) { 657 if (auto result = skip_whitespace(Required::Yes); result.is_error()) 658 break; 659 660 if (auto result = parse_attribute(); !result.is_error()) { 661 auto attribute = result.release_value(); 662 attributes.set(move(attribute.name), move(attribute.value)); 663 } else { 664 break; 665 } 666 } 667 668 TRY(skip_whitespace()); 669 TRY(expect("/>"sv)); 670 671 rollback.disarm(); 672 return make<Node>(Node::Element { move(name), move(attributes), {} }); 673} 674 675// 3.1.41. Attribute, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-Attribute 676ErrorOr<Attribute, ParseError> Parser::parse_attribute() 677{ 678 auto rollback = rollback_point(); 679 auto rule = enter_rule(); 680 681 // Attribute ::= Name Eq AttValue 682 auto name = TRY(parse_name()); 683 auto accept = accept_rule(); 684 685 TRY(parse_eq()); 686 auto value = TRY(parse_attribute_value()); 687 688 rollback.disarm(); 689 return Attribute { 690 move(name), 691 move(value), 692 }; 693} 694 695// 2.3.10. AttValue, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-AttValue 696ErrorOr<DeprecatedString, ParseError> Parser::parse_attribute_value() 697{ 698 auto rollback = rollback_point(); 699 auto rule = enter_rule(); 700 701 // AttValue ::= '"' ([^<&"] | Reference)* '"' 702 // | "'" ([^<&'] | Reference)* "'" 703 auto quote = TRY(expect(is_any_of("'\""sv), "one of ' or \""sv)); 704 auto accept = accept_rule(); 705 706 auto text = TRY(parse_attribute_value_inner(quote)); 707 TRY(expect(quote)); 708 709 rollback.disarm(); 710 return text; 711} 712 713ErrorOr<DeprecatedString, ParseError> Parser::parse_attribute_value_inner(StringView disallow) 714{ 715 StringBuilder builder; 716 while (true) { 717 if (m_lexer.next_is(is_any_of(disallow)) || m_lexer.is_eof()) 718 break; 719 720 if (m_lexer.next_is('<')) { 721 // Not allowed, return a nice error to make it easier to debug. 722 return parse_error(m_lexer.tell(), "Unescaped '<' not allowed in attribute values"); 723 } 724 725 if (m_lexer.next_is('&')) { 726 auto reference = TRY(parse_reference()); 727 if (auto* char_reference = reference.get_pointer<DeprecatedString>()) 728 builder.append(*char_reference); 729 else 730 builder.append(TRY(resolve_reference(reference.get<EntityReference>(), ReferencePlacement::AttributeValue))); 731 } else { 732 builder.append(m_lexer.consume()); 733 } 734 } 735 return builder.to_deprecated_string(); 736} 737 738// Char ::= [#x1-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF] 739constexpr static auto s_characters = ranges_for_search<Range(0x1, 0xd7ff), Range(0xe000, 0xfffd), Range(0x10000, 0x10ffff)>(); 740 741// 4.1.67. Reference, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-Reference 742ErrorOr<Variant<Parser::EntityReference, DeprecatedString>, ParseError> Parser::parse_reference() 743{ 744 auto rollback = rollback_point(); 745 auto rule = enter_rule(); 746 // Reference ::= EntityRef | CharRef 747 748 // 4.1.68. EntityRef 749 // EntityRef ::= '&' Name ';' 750 751 // 4.1.66. CharRef 752 // CharRef ::= '&#' [0-9]+ ';' 753 // | '&#x' [0-9a-fA-F]+ ';' 754 755 auto reference_start = m_lexer.tell(); 756 TRY(expect("&"sv)); 757 auto accept = accept_rule(); 758 759 auto name_result = parse_name(); 760 if (name_result.is_error()) { 761 TRY(expect("#"sv)); 762 Optional<u32> code_point; 763 if (m_lexer.consume_specific('x')) { 764 auto hex = TRY(expect_many( 765 ranges_for_search<Range('0', '9'), Range('a', 'f'), Range('A', 'F')>(), 766 "any of [0-9a-fA-F]"sv)); 767 code_point = AK::StringUtils::convert_to_uint_from_hex<u32>(hex); 768 } else { 769 auto decimal = TRY(expect_many( 770 ranges_for_search<Range('0', '9')>(), 771 "any of [0-9]"sv)); 772 code_point = decimal.to_uint<u32>(); 773 } 774 775 if (!code_point.has_value() || !s_characters.contains(*code_point)) 776 return parse_error(reference_start, "Invalid character reference"); 777 778 TRY(expect(";"sv)); 779 780 StringBuilder builder; 781 builder.append_code_point(*code_point); 782 783 rollback.disarm(); 784 return builder.to_deprecated_string(); 785 } 786 787 auto name = name_result.release_value(); 788 TRY(expect(";"sv)); 789 790 rollback.disarm(); 791 return EntityReference { move(name) }; 792} 793 794// 3.1.40 STag, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-STag 795ErrorOr<NonnullOwnPtr<Node>, ParseError> Parser::parse_start_tag() 796{ 797 auto rollback = rollback_point(); 798 auto rule = enter_rule(); 799 800 // STag ::= '<' Name (S Attribute)* S? '>' 801 TRY(expect("<"sv)); 802 auto accept = accept_rule(); 803 804 auto name = TRY(parse_name()); 805 HashMap<Name, DeprecatedString> attributes; 806 807 while (true) { 808 if (auto result = skip_whitespace(Required::Yes); result.is_error()) 809 break; 810 811 if (auto result = parse_attribute(); !result.is_error()) { 812 auto attribute = result.release_value(); 813 attributes.set(move(attribute.name), move(attribute.value)); 814 } else { 815 break; 816 } 817 } 818 819 TRY(skip_whitespace()); 820 TRY(expect(">"sv)); 821 822 rollback.disarm(); 823 return make<Node>(Node::Element { move(name), move(attributes), {} }); 824} 825 826// 3.1.42 ETag, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-ETag 827ErrorOr<Name, ParseError> Parser::parse_end_tag() 828{ 829 auto rollback = rollback_point(); 830 auto rule = enter_rule(); 831 832 // ETag ::= '</' Name S? '>' 833 TRY(expect("</"sv)); 834 auto accept = accept_rule(); 835 836 auto name = TRY(parse_name()); 837 TRY(skip_whitespace()); 838 TRY(expect(">"sv)); 839 840 rollback.disarm(); 841 return name; 842} 843 844// 3.1.42 content, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-content 845ErrorOr<void, ParseError> Parser::parse_content() 846{ 847 auto rollback = rollback_point(); 848 auto rule = enter_rule(); 849 850 // content ::= CharData? ((element | Reference | CDSect | PI | Comment) CharData?)* 851 if (auto result = parse_char_data(); !result.is_error()) 852 append_text(result.release_value()); 853 854 while (true) { 855 if (auto result = parse_element(); !result.is_error()) 856 goto try_char_data; 857 if (auto result = parse_reference(); !result.is_error()) { 858 auto reference = result.release_value(); 859 if (auto char_reference = reference.get_pointer<DeprecatedString>()) 860 append_text(*char_reference); 861 else 862 TRY(resolve_reference(reference.get<EntityReference>(), ReferencePlacement::Content)); 863 goto try_char_data; 864 } 865 if (auto result = parse_cdata_section(); !result.is_error()) { 866 if (m_options.preserve_cdata) 867 append_text(result.release_value()); 868 goto try_char_data; 869 } 870 if (auto result = parse_processing_instruction(); !result.is_error()) 871 goto try_char_data; 872 if (auto result = parse_comment(); !result.is_error()) 873 goto try_char_data; 874 875 break; 876 877 try_char_data:; 878 if (auto result = parse_char_data(); !result.is_error()) 879 append_text(result.release_value()); 880 } 881 882 rollback.disarm(); 883 return {}; 884} 885 886// 2.4.14 CharData, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-CharData 887ErrorOr<StringView, ParseError> Parser::parse_char_data() 888{ 889 auto rollback = rollback_point(); 890 auto rule = enter_rule(); 891 892 // CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*) 893 auto cend_state = 0; // 1: ], 2: ], 3: > 894 auto text = m_lexer.consume_while([&](auto ch) { 895 if (ch == '<' || ch == '&' || cend_state == 3) 896 return false; 897 switch (cend_state) { 898 case 0: 899 case 1: 900 if (ch == ']') 901 cend_state++; 902 else 903 cend_state = 0; 904 return true; 905 case 2: 906 if (ch == '>') { 907 cend_state++; 908 return true; 909 } 910 cend_state = 0; 911 return true; 912 default: 913 VERIFY_NOT_REACHED(); 914 } 915 }); 916 if (cend_state == 3) { 917 m_lexer.retreat(3); 918 text = text.substring_view(0, text.length() - 3); 919 } 920 921 rollback.disarm(); 922 return text; 923} 924 925// 2.8.28b intSubset, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-intSubset 926ErrorOr<Vector<MarkupDeclaration>, ParseError> Parser::parse_internal_subset() 927{ 928 auto rollback = rollback_point(); 929 auto rule = enter_rule(); 930 Vector<MarkupDeclaration> declarations; 931 932 // intSubset ::= (markupdecl | DeclSep)* 933 while (true) { 934 if (auto result = parse_markup_declaration(); !result.is_error()) { 935 auto maybe_declaration = result.release_value(); 936 if (maybe_declaration.has_value()) 937 declarations.append(maybe_declaration.release_value()); 938 continue; 939 } 940 if (auto result = parse_declaration_separator(); !result.is_error()) { 941 // The markup declarations may be made up in whole or in part of the replacement text of parameter entities. 942 // The replacement text of a parameter entity reference in a DeclSep MUST match the production extSubsetDecl. 943 auto maybe_replacement_text = result.release_value(); 944 if (maybe_replacement_text.has_value()) { 945 TemporaryChange<StringView> source { m_source, maybe_replacement_text.value() }; 946 TemporaryChange lexer { m_lexer, GenericLexer { m_source } }; 947 948 auto contained_declarations = TRY(parse_external_subset_declaration()); 949 declarations.extend(move(contained_declarations)); 950 } 951 continue; 952 } 953 break; 954 } 955 956 rollback.disarm(); 957 return declarations; 958} 959 960// 2.8.29 markupdecl, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-markupdecl 961ErrorOr<Optional<MarkupDeclaration>, ParseError> Parser::parse_markup_declaration() 962{ 963 auto rollback = rollback_point(); 964 auto rule = enter_rule(); 965 966 // markupdecl ::= elementdecl | AttlistDecl | EntityDecl | NotationDecl | PI | Comment 967 if (auto result = parse_element_declaration(); !result.is_error()) { 968 rollback.disarm(); 969 return MarkupDeclaration { result.release_value() }; 970 } 971 if (auto result = parse_attribute_list_declaration(); !result.is_error()) { 972 rollback.disarm(); 973 return MarkupDeclaration { result.release_value() }; 974 } 975 if (auto result = parse_entity_declaration(); !result.is_error()) { 976 rollback.disarm(); 977 return MarkupDeclaration { result.release_value() }; 978 } 979 if (auto result = parse_notation_declaration(); !result.is_error()) { 980 rollback.disarm(); 981 return MarkupDeclaration { result.release_value() }; 982 } 983 if (auto result = parse_processing_instruction(); !result.is_error()) { 984 rollback.disarm(); 985 return Optional<MarkupDeclaration> {}; 986 } 987 if (auto result = parse_comment(); !result.is_error()) { 988 rollback.disarm(); 989 return Optional<MarkupDeclaration> {}; 990 } 991 992 return parse_error(m_lexer.tell(), "Expected one of elementdecl, attlistdecl, entitydecl, notationdecl, PI or comment"); 993} 994 995// 2.8.28a DeclSep, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-DeclSep 996ErrorOr<Optional<DeprecatedString>, ParseError> Parser::parse_declaration_separator() 997{ 998 auto rollback = rollback_point(); 999 auto rule = enter_rule(); 1000 1001 // DeclSep ::= PEReference | S 1002 if (auto name = parse_parameter_entity_reference(); !name.is_error()) { 1003 rollback.disarm(); 1004 // FIXME: Resolve this PEReference. 1005 return ""; 1006 } 1007 1008 if (auto result = skip_whitespace(Required::Yes); !result.is_error()) { 1009 rollback.disarm(); 1010 return Optional<DeprecatedString> {}; 1011 } 1012 1013 return parse_error(m_lexer.tell(), "Expected either whitespace, or a PEReference"); 1014} 1015 1016// 4.1.69 PEReference, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PEReference 1017ErrorOr<Name, ParseError> Parser::parse_parameter_entity_reference() 1018{ 1019 auto rollback = rollback_point(); 1020 auto rule = enter_rule(); 1021 1022 // PEReference ::= '%' Name ';' 1023 TRY(expect("%"sv)); 1024 auto accept = accept_rule(); 1025 1026 auto name = TRY(parse_name()); 1027 TRY(expect(";"sv)); 1028 1029 rollback.disarm(); 1030 return name; 1031} 1032 1033// 3.2.46 elementdecl, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-elementdecl 1034ErrorOr<ElementDeclaration, ParseError> Parser::parse_element_declaration() 1035{ 1036 auto rollback = rollback_point(); 1037 auto rule = enter_rule(); 1038 1039 // FIXME: Apparently both name _and_ contentspec here are allowed to be PEReferences, 1040 // but the grammar does not allow that, figure this out. 1041 // elementdecl ::= '<!ELEMENT' S Name S contentspec S? '>' 1042 TRY(expect("<!ELEMENT"sv)); 1043 auto accept = accept_rule(); 1044 1045 TRY(skip_whitespace(Required::Yes)); 1046 auto name = TRY(parse_name()); 1047 TRY(skip_whitespace(Required::Yes)); 1048 auto spec = TRY(parse_content_spec()); 1049 TRY(expect(">"sv)); 1050 1051 rollback.disarm(); 1052 return ElementDeclaration { 1053 move(name), 1054 move(spec), 1055 }; 1056} 1057 1058// 3.3.52 AttlistDecl, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-AttlistDecl 1059ErrorOr<AttributeListDeclaration, ParseError> Parser::parse_attribute_list_declaration() 1060{ 1061 auto rollback = rollback_point(); 1062 auto rule = enter_rule(); 1063 AttributeListDeclaration declaration; 1064 1065 // AttlistDecl ::= '<!ATTLIST' S Name AttDef* S? '>' 1066 TRY(expect("<!ATTLIST"sv)); 1067 auto accept = accept_rule(); 1068 1069 TRY(skip_whitespace(Required::Yes)); 1070 declaration.type = TRY(parse_name()); 1071 1072 while (true) { 1073 if (auto result = parse_attribute_definition(); !result.is_error()) 1074 declaration.attributes.append(result.release_value()); 1075 else 1076 break; 1077 } 1078 1079 TRY(skip_whitespace()); 1080 TRY(expect(">"sv)); 1081 1082 rollback.disarm(); 1083 return declaration; 1084} 1085 1086// 3.3.53 AttDef, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-AttDef 1087ErrorOr<AttributeListDeclaration::Definition, ParseError> Parser::parse_attribute_definition() 1088{ 1089 auto rollback = rollback_point(); 1090 auto rule = enter_rule(); 1091 Optional<AttributeListDeclaration::Type> type; 1092 Optional<AttributeListDeclaration::Default> default_; 1093 1094 // AttDef ::= S Name S AttType S DefaultDecl 1095 TRY(skip_whitespace(Required::Yes)); 1096 auto name = TRY(parse_name()); 1097 auto accept = accept_rule(); 1098 1099 TRY(skip_whitespace(Required::Yes)); 1100 1101 // AttType ::= StringType | TokenizedType | EnumeratedType 1102 // StringType ::= 'CDATA' 1103 // TokenizedType ::= 'ID' 1104 // | 'IDREF' 1105 // | 'IDREFS' 1106 // | 'ENTITY' 1107 // | 'ENTITIES' 1108 // | 'NMTOKEN' 1109 // | 'NMTOKENS' 1110 // EnumeratedType ::= NotationType | Enumeration 1111 // NotationType ::= 'NOTATION' S '(' S? Name (S? '|' S? Name)* S? ')' 1112 // Enumeration ::= '(' S? Nmtoken (S? '|' S? Nmtoken)* S? ')' 1113 if (m_lexer.consume_specific("CDATA")) { 1114 type = AttributeListDeclaration::StringType::CData; 1115 } else if (m_lexer.consume_specific("IDREFS")) { 1116 type = AttributeListDeclaration::TokenizedType::IDRefs; 1117 } else if (m_lexer.consume_specific("IDREF")) { 1118 type = AttributeListDeclaration::TokenizedType::IDRef; 1119 } else if (m_lexer.consume_specific("ID")) { 1120 type = AttributeListDeclaration::TokenizedType::ID; 1121 } else if (m_lexer.consume_specific("ENTITIES")) { 1122 type = AttributeListDeclaration::TokenizedType::Entities; 1123 } else if (m_lexer.consume_specific("ENTITY")) { 1124 type = AttributeListDeclaration::TokenizedType::Entity; 1125 } else if (m_lexer.consume_specific("NMTOKENS")) { 1126 type = AttributeListDeclaration::TokenizedType::NMTokens; 1127 } else if (m_lexer.consume_specific("NMTOKEN")) { 1128 type = AttributeListDeclaration::TokenizedType::NMToken; 1129 } else if (m_lexer.consume_specific("NOTATION")) { 1130 HashTable<Name> names; 1131 TRY(skip_whitespace(Required::Yes)); 1132 TRY(expect("("sv)); 1133 TRY(skip_whitespace()); 1134 names.set(TRY(parse_name())); 1135 while (true) { 1136 TRY(skip_whitespace()); 1137 if (auto result = expect("|"sv); result.is_error()) 1138 break; 1139 TRY(skip_whitespace()); 1140 names.set(TRY(parse_name())); 1141 } 1142 TRY(skip_whitespace()); 1143 TRY(expect(")"sv)); 1144 type = AttributeListDeclaration::NotationType { move(names) }; 1145 } else { 1146 HashTable<DeprecatedString> names; 1147 TRY(expect("("sv)); 1148 TRY(skip_whitespace()); 1149 names.set(TRY(parse_nm_token())); 1150 while (true) { 1151 TRY(skip_whitespace()); 1152 if (auto result = expect("|"sv); result.is_error()) 1153 break; 1154 TRY(skip_whitespace()); 1155 names.set(TRY(parse_nm_token())); 1156 } 1157 TRY(skip_whitespace()); 1158 TRY(expect(")"sv)); 1159 type = AttributeListDeclaration::Enumeration { move(names) }; 1160 } 1161 1162 TRY(skip_whitespace(Required::Yes)); 1163 1164 // DefaultDecl ::= '#REQUIRED' | '#IMPLIED' 1165 // | (('#FIXED' S)? AttValue) 1166 if (m_lexer.consume_specific("#REQUIRED")) { 1167 default_ = AttributeListDeclaration::Required {}; 1168 } else if (m_lexer.consume_specific("#IMPLIED")) { 1169 default_ = AttributeListDeclaration::Implied {}; 1170 } else { 1171 bool fixed = false; 1172 if (m_lexer.consume_specific("#FIXED")) { 1173 TRY(skip_whitespace(Required::Yes)); 1174 fixed = true; 1175 } 1176 auto value = TRY(parse_attribute_value()); 1177 if (fixed) 1178 default_ = AttributeListDeclaration::Fixed { move(value) }; 1179 else 1180 default_ = AttributeListDeclaration::DefaultValue { move(value) }; 1181 } 1182 1183 rollback.disarm(); 1184 return AttributeListDeclaration::Definition { 1185 move(name), 1186 type.release_value(), 1187 default_.release_value(), 1188 }; 1189} 1190 1191// 2.3.7 Nmtoken, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-Nmtoken 1192ErrorOr<StringView, ParseError> Parser::parse_nm_token() 1193{ 1194 auto rollback = rollback_point(); 1195 auto rule = enter_rule(); 1196 1197 // Nmtoken ::= (NameChar)+ 1198 auto token = TRY(expect_many(s_name_characters, "a NameChar"sv)); 1199 1200 rollback.disarm(); 1201 return token; 1202} 1203 1204// 4.7.82 NotationDecl, https://www.w3.org/TR/2006/REC-xml11-20060816/#Notations 1205ErrorOr<NotationDeclaration, ParseError> Parser::parse_notation_declaration() 1206{ 1207 auto rollback = rollback_point(); 1208 auto rule = enter_rule(); 1209 Variant<ExternalID, PublicID, Empty> notation; 1210 1211 // NotationDecl ::= '<!NOTATION' S Name S (ExternalID | PublicID) S? '>' 1212 TRY(expect("<!NOTATION"sv)); 1213 auto accept = accept_rule(); 1214 1215 TRY(skip_whitespace(Required::Yes)); 1216 auto name = TRY(parse_name()); 1217 TRY(skip_whitespace(Required::Yes)); 1218 1219 if (auto result = parse_external_id(); !result.is_error()) 1220 notation = result.release_value(); 1221 else 1222 notation = TRY(parse_public_id()); 1223 1224 TRY(expect(">"sv)); 1225 1226 rollback.disarm(); 1227 return NotationDeclaration { 1228 move(name), 1229 move(notation).downcast<ExternalID, PublicID>(), 1230 }; 1231} 1232 1233// 3.2.46 contentspec, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-contentspec 1234ErrorOr<ElementDeclaration::ContentSpec, ParseError> Parser::parse_content_spec() 1235{ 1236 auto rollback = rollback_point(); 1237 auto rule = enter_rule(); 1238 Optional<ElementDeclaration::ContentSpec> content_spec; 1239 1240 // contentspec ::= 'EMPTY' | 'ANY' | Mixed | children 1241 if (m_lexer.consume_specific("EMPTY")) { 1242 content_spec = ElementDeclaration::Empty {}; 1243 } else if (m_lexer.consume_specific("ANY")) { 1244 content_spec = ElementDeclaration::Any {}; 1245 } else { 1246 TRY(expect("("sv)); 1247 TRY(skip_whitespace()); 1248 if (m_lexer.consume_specific("#PCDATA")) { 1249 HashTable<Name> names; 1250 // Mixed ::= '(' S? '#PCDATA' (S? '|' S? Name)* S? ')*' 1251 // | '(' S? '#PCDATA' S? ')' 1252 TRY(skip_whitespace()); 1253 if (m_lexer.consume_specific(")*")) { 1254 content_spec = ElementDeclaration::Mixed { .types = {}, .many = true }; 1255 } else if (m_lexer.consume_specific(')')) { 1256 content_spec = ElementDeclaration::Mixed { .types = {}, .many = false }; 1257 } else { 1258 while (true) { 1259 TRY(skip_whitespace()); 1260 if (!m_lexer.consume_specific('|')) 1261 break; 1262 TRY(skip_whitespace()); 1263 if (auto result = parse_name(); !result.is_error()) 1264 names.set(result.release_value()); 1265 else 1266 return parse_error(m_lexer.tell(), "Expected a Name"); 1267 } 1268 TRY(skip_whitespace()); 1269 TRY(expect(")*"sv)); 1270 content_spec = ElementDeclaration::Mixed { .types = move(names), .many = true }; 1271 } 1272 } else { 1273 while (!m_lexer.next_is('(')) 1274 m_lexer.retreat(); 1275 // children ::= (choice | seq) ('?' | '*' | '+')? 1276 // cp ::= (Name | choice | seq) ('?' | '*' | '+')? 1277 // choice ::= '(' S? cp ( S? '|' S? cp )+ S? ')' 1278 // seq ::= '(' S? cp ( S? ',' S? cp )* S? ')' 1279 Function<ErrorOr<ElementDeclaration::Children::Choice, ParseError>()> parse_choice; 1280 Function<ErrorOr<ElementDeclaration::Children::Sequence, ParseError>()> parse_sequence; 1281 1282 auto parse_cp_init = [&]() -> ErrorOr<Variant<Name, ElementDeclaration::Children::Choice, ElementDeclaration::Children::Sequence>, ParseError> { 1283 if (auto result = parse_name(); !result.is_error()) 1284 return result.release_value(); 1285 if (auto result = parse_choice(); !result.is_error()) 1286 return result.release_value(); 1287 return TRY(parse_sequence()); 1288 }; 1289 auto parse_qualifier = [&]() -> ElementDeclaration::Children::Qualifier { 1290 ElementDeclaration::Children::Qualifier qualifier { ElementDeclaration::Children::Qualifier::ExactlyOnce }; 1291 if (m_lexer.consume_specific('?')) 1292 qualifier = ElementDeclaration::Children::Qualifier::Optional; 1293 else if (m_lexer.consume_specific('*')) 1294 qualifier = ElementDeclaration::Children::Qualifier::Any; 1295 else if (m_lexer.consume_specific('+')) 1296 qualifier = ElementDeclaration::Children::Qualifier::OneOrMore; 1297 return qualifier; 1298 }; 1299 auto parse_cp = [&]() -> ErrorOr<ElementDeclaration::Children::Entry, ParseError> { 1300 auto sub_entry = TRY(parse_cp_init()); 1301 auto qualifier = parse_qualifier(); 1302 return ElementDeclaration::Children::Entry { 1303 move(sub_entry), 1304 qualifier, 1305 }; 1306 }; 1307 parse_choice = [&]() -> ErrorOr<ElementDeclaration::Children::Choice, ParseError> { 1308 auto rollback = rollback_point(); 1309 auto rule = enter_rule(); 1310 1311 TRY(expect("("sv)); 1312 auto accept = accept_rule(); 1313 1314 TRY(skip_whitespace()); 1315 Vector<ElementDeclaration::Children::Entry> choices; 1316 choices.append(TRY(parse_cp())); 1317 while (true) { 1318 TRY(skip_whitespace()); 1319 if (!m_lexer.consume_specific('|')) 1320 break; 1321 TRY(skip_whitespace()); 1322 choices.append(TRY(parse_cp())); 1323 } 1324 1325 TRY(expect(")"sv)); 1326 1327 if (choices.size() < 2) 1328 return parse_error(m_lexer.tell(), "Expected more than one choice"); 1329 1330 TRY(skip_whitespace()); 1331 auto qualifier = parse_qualifier(); 1332 1333 rollback.disarm(); 1334 return ElementDeclaration::Children::Choice { 1335 move(choices), 1336 qualifier, 1337 }; 1338 }; 1339 parse_sequence = [&]() -> ErrorOr<ElementDeclaration::Children::Sequence, ParseError> { 1340 auto rollback = rollback_point(); 1341 auto rule = enter_rule(); 1342 1343 TRY(expect("("sv)); 1344 auto accept = accept_rule(); 1345 1346 TRY(skip_whitespace()); 1347 Vector<ElementDeclaration::Children::Entry> entries; 1348 entries.append(TRY(parse_cp())); 1349 while (true) { 1350 TRY(skip_whitespace()); 1351 if (!m_lexer.consume_specific(',')) 1352 break; 1353 TRY(skip_whitespace()); 1354 entries.append(TRY(parse_cp())); 1355 } 1356 1357 TRY(expect(")"sv)); 1358 1359 TRY(skip_whitespace()); 1360 auto qualifier = parse_qualifier(); 1361 1362 rollback.disarm(); 1363 return ElementDeclaration::Children::Sequence { 1364 move(entries), 1365 qualifier, 1366 }; 1367 }; 1368 if (auto result = parse_choice(); !result.is_error()) { 1369 auto qualifier = parse_qualifier(); 1370 content_spec = ElementDeclaration::Children { 1371 result.release_value(), 1372 qualifier, 1373 }; 1374 } else { 1375 auto sequence = TRY(parse_sequence()); 1376 auto qualifier = parse_qualifier(); 1377 content_spec = ElementDeclaration::Children { 1378 move(sequence), 1379 qualifier, 1380 }; 1381 } 1382 } 1383 } 1384 1385 rollback.disarm(); 1386 return content_spec.release_value(); 1387} 1388 1389// 2.8.31 extSubsetDecl, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-extSubsetDecl 1390ErrorOr<Vector<MarkupDeclaration>, ParseError> Parser::parse_external_subset_declaration() 1391{ 1392 auto rollback = rollback_point(); 1393 auto rule = enter_rule(); 1394 Vector<MarkupDeclaration> declarations; 1395 1396 // extSubsetDecl ::= ( markupdecl | conditionalSect | DeclSep )* 1397 while (true) { 1398 if (auto result = parse_markup_declaration(); !result.is_error()) { 1399 if (result.value().has_value()) 1400 declarations.append(result.release_value().release_value()); 1401 continue; 1402 } 1403 1404 // FIXME: conditionalSect 1405 1406 if (auto result = parse_declaration_separator(); !result.is_error()) 1407 continue; 1408 1409 break; 1410 } 1411 1412 rollback.disarm(); 1413 return declarations; 1414} 1415 1416// 4.2.70 EntityDecl, https://www.w3.org/TR/xml/#NT-EntityDecl 1417ErrorOr<EntityDeclaration, ParseError> Parser::parse_entity_declaration() 1418{ 1419 // EntityDecl ::= GEDecl | PEDecl 1420 if (auto result = parse_general_entity_declaration(); !result.is_error()) 1421 return result; 1422 1423 return parse_parameter_entity_declaration(); 1424} 1425 1426// 4.2.71 GEDecl, https://www.w3.org/TR/xml/#NT-GEDecl 1427ErrorOr<EntityDeclaration, ParseError> Parser::parse_general_entity_declaration() 1428{ 1429 auto rollback = rollback_point(); 1430 auto rule = enter_rule(); 1431 Variant<DeprecatedString, EntityDefinition, Empty> definition; 1432 1433 // GEDecl ::= '<!ENTITY' S Name S EntityDef S? '>' 1434 TRY(expect("<!ENTITY"sv)); 1435 auto accept = accept_rule(); 1436 1437 TRY(skip_whitespace(Required::Yes)); 1438 auto name = TRY(parse_name()); 1439 TRY(skip_whitespace(Required::Yes)); 1440 // EntityDef ::= EntityValue | (ExternalID NDataDecl?) 1441 if (auto result = parse_entity_value(); !result.is_error()) { 1442 definition = result.release_value(); 1443 } else { 1444 auto external_id = TRY(parse_external_id()); 1445 Optional<Name> notation; 1446 if (auto notation_result = parse_notation_data_declaration(); !notation_result.is_error()) 1447 notation = notation_result.release_value(); 1448 1449 definition = EntityDefinition { 1450 move(external_id), 1451 move(notation), 1452 }; 1453 } 1454 1455 TRY(skip_whitespace()); 1456 TRY(expect(">"sv)); 1457 1458 rollback.disarm(); 1459 return GEDeclaration { 1460 move(name), 1461 move(definition).downcast<DeprecatedString, EntityDefinition>(), 1462 }; 1463} 1464 1465// 4.2.72 PEDecl, https://www.w3.org/TR/xml/#NT-PEDecl 1466ErrorOr<EntityDeclaration, ParseError> Parser::parse_parameter_entity_declaration() 1467{ 1468 auto rollback = rollback_point(); 1469 auto rule = enter_rule(); 1470 1471 Variant<DeprecatedString, ExternalID, Empty> definition; 1472 // PEDecl ::= '<!ENTITY' S '%' S Name S PEDef S? '>' 1473 TRY(expect("<!ENTITY"sv)); 1474 auto accept = accept_rule(); 1475 1476 TRY(skip_whitespace(Required::Yes)); 1477 TRY(expect("%"sv)); 1478 TRY(skip_whitespace(Required::Yes)); 1479 auto name = TRY(parse_name()); 1480 TRY(skip_whitespace(Required::Yes)); 1481 // PEDef ::= EntityValue | ExternalID 1482 if (auto result = parse_entity_value(); !result.is_error()) 1483 definition = result.release_value(); 1484 else 1485 definition = TRY(parse_external_id()); 1486 1487 TRY(skip_whitespace()); 1488 TRY(expect(">"sv)); 1489 1490 rollback.disarm(); 1491 return PEDeclaration { 1492 move(name), 1493 move(definition).downcast<DeprecatedString, ExternalID>(), 1494 }; 1495} 1496 1497// 4.7.83 PublicID, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PublicID 1498ErrorOr<PublicID, ParseError> Parser::parse_public_id() 1499{ 1500 auto rollback = rollback_point(); 1501 auto rule = enter_rule(); 1502 1503 // PublicID ::= 'PUBLIC' S PubidLiteral 1504 TRY(expect("PUBLIC"sv)); 1505 auto accept = accept_rule(); 1506 1507 TRY(skip_whitespace(Required::Yes)); 1508 auto text = TRY(parse_public_id_literal()); 1509 1510 rollback.disarm(); 1511 return PublicID { 1512 text, 1513 }; 1514} 1515 1516constexpr static auto s_public_id_characters = set_to_search<StringSet("\x20\x0d\x0a-'()+,./:=?;!*#@$_%")>().unify(ranges_for_search<Range('a', 'z'), Range('A', 'Z'), Range('0', '9')>()); 1517 1518// 2.3.12, PubidLiteral, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PubidLiteral 1519ErrorOr<StringView, ParseError> Parser::parse_public_id_literal() 1520{ 1521 auto rollback = rollback_point(); 1522 auto rule = enter_rule(); 1523 1524 // PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'" 1525 auto quote = TRY(expect(is_any_of("'\""sv), "any of ' or \""sv)); 1526 auto accept = accept_rule(); 1527 1528 auto id = TRY(expect_many( 1529 [q = quote[0]](auto x) { 1530 return (q == '\'' ? x != '\'' : true) && s_public_id_characters.contains(x); 1531 }, 1532 "a PubidChar"sv)); 1533 TRY(expect(quote)); 1534 1535 rollback.disarm(); 1536 return id; 1537} 1538 1539// 2.3.11 SystemLiteral, https://www.w3.org/TR/xml/#NT-SystemLiteral 1540ErrorOr<StringView, ParseError> Parser::parse_system_id_literal() 1541{ 1542 auto rollback = rollback_point(); 1543 auto rule = enter_rule(); 1544 1545 // SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'") 1546 auto quote = TRY(expect(is_any_of("'\""sv), "any of ' or \""sv)); 1547 auto accept = accept_rule(); 1548 1549 auto id = TRY(expect_many(is_not_any_of(quote), "not a quote"sv)); 1550 TRY(expect(quote)); 1551 1552 rollback.disarm(); 1553 return id; 1554} 1555 1556// 4.2.75 ExternalID, https://www.w3.org/TR/xml/#NT-ExternalID 1557ErrorOr<ExternalID, ParseError> Parser::parse_external_id() 1558{ 1559 auto rollback = rollback_point(); 1560 auto rule = enter_rule(); 1561 1562 // ExternalID ::= 'SYSTEM' S SystemLiteral 1563 // | 'PUBLIC' S PubidLiteral S SystemLiteral 1564 Optional<PublicID> public_id; 1565 SystemID system_id; 1566 1567 if (m_lexer.consume_specific("SYSTEM")) { 1568 auto accept = accept_rule(); 1569 TRY(skip_whitespace(Required::Yes)); 1570 system_id = SystemID { TRY(parse_system_id_literal()) }; 1571 } else { 1572 TRY(expect("PUBLIC"sv)); 1573 auto accept = accept_rule(); 1574 1575 TRY(skip_whitespace(Required::Yes)); 1576 public_id = PublicID { TRY(parse_public_id_literal()) }; 1577 TRY(skip_whitespace(Required::Yes)); 1578 system_id = SystemID { TRY(parse_system_id_literal()) }; 1579 } 1580 1581 rollback.disarm(); 1582 return ExternalID { 1583 move(public_id), 1584 move(system_id), 1585 }; 1586} 1587 1588// 4.2.2.76 NDataDecl, https://www.w3.org/TR/xml/#NT-NDataDecl 1589ErrorOr<Name, ParseError> Parser::parse_notation_data_declaration() 1590{ 1591 auto rollback = rollback_point(); 1592 auto rule = enter_rule(); 1593 1594 // NDataDecl ::= S 'NDATA' S Name 1595 TRY(skip_whitespace(Required::Yes)); 1596 auto accept = accept_rule(); 1597 1598 TRY(expect("NDATA"sv)); 1599 TRY(skip_whitespace(Required::Yes)); 1600 auto name = TRY(parse_name()); 1601 1602 rollback.disarm(); 1603 return name; 1604} 1605 1606// 2.3.9 EntityValue, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-EntityValue 1607ErrorOr<DeprecatedString, ParseError> Parser::parse_entity_value() 1608{ 1609 auto rollback = rollback_point(); 1610 auto rule = enter_rule(); 1611 StringBuilder builder; 1612 1613 // EntityValue ::= '"' ([^%&"] | PEReference | Reference)* '"' 1614 // | "'" ([^%&'] | PEReference | Reference)* "'" 1615 auto quote = TRY(expect(is_any_of("'\""sv), "any of ' or \""sv)); 1616 auto accept = accept_rule(); 1617 1618 while (true) { 1619 if (m_lexer.is_eof()) 1620 break; 1621 if (m_lexer.next_is(quote)) 1622 break; 1623 if (m_lexer.next_is('%')) { 1624 auto start = m_lexer.tell(); 1625 TRY(parse_parameter_entity_reference()); 1626 builder.append(m_source.substring_view(start, m_lexer.tell() - start)); 1627 continue; 1628 } 1629 if (m_lexer.next_is('&')) { 1630 auto start = m_lexer.tell(); 1631 TRY(parse_reference()); 1632 builder.append(m_source.substring_view(start, m_lexer.tell() - start)); 1633 continue; 1634 } 1635 builder.append(m_lexer.consume()); 1636 } 1637 TRY(expect(quote)); 1638 1639 rollback.disarm(); 1640 return builder.to_deprecated_string(); 1641} 1642 1643// 2.7.18 CDSect, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-CDSect 1644ErrorOr<StringView, ParseError> Parser::parse_cdata_section() 1645{ 1646 auto rollback = rollback_point(); 1647 auto rule = enter_rule(); 1648 1649 // CDSect ::= CDStart CData CDEnd 1650 // CDStart ::= '<![CDATA[' 1651 // CData ::= (Char* - (Char* ']]>' Char*)) 1652 // CDEnd ::= ']]>' 1653 TRY(expect("<![CDATA["sv)); 1654 auto accept = accept_rule(); 1655 1656 auto section_start = m_lexer.tell(); 1657 while (!m_lexer.next_is("]]>")) { 1658 if (m_lexer.is_eof()) 1659 break; 1660 m_lexer.ignore(); 1661 } 1662 auto section_end = m_lexer.tell(); 1663 TRY(expect("]]>"sv)); 1664 1665 rollback.disarm(); 1666 return m_source.substring_view(section_start, section_end - section_start); 1667} 1668 1669// 2.8.30 extSubset, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-extSubset 1670ErrorOr<Vector<MarkupDeclaration>, ParseError> Parser::parse_external_subset() 1671{ 1672 auto rollback = rollback_point(); 1673 auto rule = enter_rule(); 1674 1675 // extSubset ::= TextDecl? extSubsetDecl 1676 (void)parse_text_declaration(); 1677 auto result = TRY(parse_external_subset_declaration()); 1678 1679 rollback.disarm(); 1680 return result; 1681} 1682 1683// 4.3.1.77 TextDecl, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-TextDecl 1684ErrorOr<void, ParseError> Parser::parse_text_declaration() 1685{ 1686 auto rollback = rollback_point(); 1687 auto rule = enter_rule(); 1688 1689 // TextDecl ::= '<?xml' VersionInfo? EncodingDecl S? '?>' 1690 TRY(expect("<?xml"sv)); 1691 auto accept = accept_rule(); 1692 1693 (void)parse_version_info(); 1694 TRY(parse_encoding_decl()); 1695 TRY(skip_whitespace()); 1696 TRY(expect("?>"sv)); 1697 1698 rollback.disarm(); 1699 return {}; 1700} 1701 1702ErrorOr<DeprecatedString, ParseError> Parser::resolve_reference(EntityReference const& reference, ReferencePlacement placement) 1703{ 1704 static HashTable<Name> reference_lookup {}; 1705 if (reference_lookup.contains(reference.name)) 1706 return parse_error(m_lexer.tell(), DeprecatedString::formatted("Invalid recursive definition for '{}'", reference.name)); 1707 1708 reference_lookup.set(reference.name); 1709 ScopeGuard remove_lookup { 1710 [&] { 1711 reference_lookup.remove(reference.name); 1712 } 1713 }; 1714 1715 Optional<DeprecatedString> resolved; 1716 if (m_doctype.has_value()) { 1717 // FIXME: Split these up and resolve them ahead of time. 1718 for (auto& declaration : m_doctype->markup_declarations) { 1719 auto entity = declaration.get_pointer<EntityDeclaration>(); 1720 if (!entity) 1721 continue; 1722 auto ge_declaration = entity->get_pointer<GEDeclaration>(); 1723 if (!ge_declaration) 1724 continue; 1725 if (ge_declaration->name != reference.name) 1726 continue; 1727 TRY(ge_declaration->definition.visit( 1728 [&](DeprecatedString const& definition) -> ErrorOr<void, ParseError> { 1729 resolved = definition; 1730 return {}; 1731 }, 1732 [&](EntityDefinition const& definition) -> ErrorOr<void, ParseError> { 1733 if (placement == ReferencePlacement::AttributeValue) 1734 return parse_error(m_lexer.tell(), DeprecatedString::formatted("Attribute references external entity '{}'", reference.name)); 1735 1736 if (definition.notation.has_value()) 1737 return parse_error(0u, DeprecatedString::formatted("Entity reference to unparsed entity '{}'", reference.name)); 1738 1739 if (!m_options.resolve_external_resource) 1740 return parse_error(0u, DeprecatedString::formatted("Failed to resolve external entity '{}'", reference.name)); 1741 1742 auto result = m_options.resolve_external_resource(definition.id.system_id, definition.id.public_id); 1743 if (result.is_error()) 1744 return parse_error(0u, DeprecatedString::formatted("Failed to resolve external entity '{}': {}", reference.name, result.error())); 1745 1746 resolved = result.release_value(); 1747 return {}; 1748 })); 1749 break; 1750 } 1751 } 1752 1753 if (!resolved.has_value()) { 1754 if (reference.name == "amp") 1755 return "&"; 1756 if (reference.name == "lt") 1757 return "<"; 1758 if (reference.name == "gt") 1759 return ">"; 1760 if (reference.name == "apos") 1761 return "'"; 1762 if (reference.name == "quot") 1763 return "\""; 1764 return parse_error(0u, DeprecatedString::formatted("Reference to undeclared entity '{}'", reference.name)); 1765 } 1766 1767 StringView resolved_source = *resolved; 1768 TemporaryChange source { m_source, resolved_source }; 1769 TemporaryChange lexer { m_lexer, GenericLexer(m_source) }; 1770 switch (placement) { 1771 case ReferencePlacement::AttributeValue: 1772 return TRY(parse_attribute_value_inner(""sv)); 1773 case ReferencePlacement::Content: 1774 TRY(parse_content()); 1775 return ""; 1776 default: 1777 VERIFY_NOT_REACHED(); 1778 } 1779} 1780 1781}