// Serenity Operating System
1/*
2 * Copyright (c) 2022, Ali Mohammad Pur <mpfard@serenityos.org>
3 *
4 * SPDX-License-Identifier: BSD-2-Clause
5 */
6
7#include <LibXML/DOM/Document.h>
8#include <LibXML/Parser/Parser.h>
9
10struct Range {
11 consteval Range(u32 start, u32 end)
12 : start(start)
13 , end(end)
14 {
15 }
16
17 u32 start;
18 u32 end;
19};
20
21template<auto... ranges>
22struct ranges_for_search {
23 auto contains(u32 value) const
24 {
25 return ((value >= ranges.start && value <= ranges.end) || ...);
26 }
27
28 bool operator()(u32 value) const
29 {
30 return contains(value);
31 }
32
33 template<auto... ranges_to_include>
34 consteval auto with() const
35 {
36 return ranges_for_search<ranges..., ranges_to_include...>();
37 }
38
39 template<auto... ranges_to_include>
40 consteval auto unify(ranges_for_search<ranges_to_include...> const&) const
41 {
42 return ranges_for_search<ranges..., ranges_to_include...>();
43 }
44};
45
46template<size_t Count, typename Element>
47struct StringSet {
48 consteval StringSet(Element const (&entries)[Count])
49 {
50 for (size_t i = 0; i < Count - 1; ++i)
51 elements[i] = entries[i];
52 }
53
54 consteval auto operator[](size_t i) const { return elements[i]; }
55
56 Element elements[Count - 1];
57};
58
59template<StringSet chars>
60consteval static auto set_to_search()
61{
62 return ([&]<auto... Ix>(IndexSequence<Ix...>) {
63 return ranges_for_search<Range(chars[Ix], chars[Ix])...>();
64 }(MakeIndexSequence<array_size(chars.elements)>()));
65}
66
67namespace XML {
68
// Out-of-class definition of the static member, starting at zero.
// NOTE(review): presumably tracks rule nesting depth for debug output - confirm against enter_rule().
size_t Parser::s_debug_indent_level { 0 };
70
71void Parser::append_node(NonnullOwnPtr<Node> node)
72{
73 if (m_entered_node) {
74 m_entered_node->content.get<Node::Element>().children.append(move(node));
75 } else {
76 m_root_node = move(node);
77 m_entered_node = m_root_node.ptr();
78 }
79}
80
81void Parser::append_text(StringView text)
82{
83 if (m_listener) {
84 m_listener->text(text);
85 return;
86 }
87
88 if (!m_entered_node) {
89 Node::Text node;
90 node.builder.append(text);
91 m_root_node = make<Node>(move(node));
92 return;
93 }
94
95 m_entered_node->content.visit(
96 [&](Node::Element& node) {
97 if (!node.children.is_empty()) {
98 auto* text_node = node.children.last()->content.get_pointer<Node::Text>();
99 if (text_node) {
100 text_node->builder.append(text);
101 return;
102 }
103 }
104 Node::Text text_node;
105 text_node.builder.append(text);
106 node.children.append(make<Node>(move(text_node)));
107 },
108 [&](auto&) {
109 // Can't enter a text or comment node.
110 VERIFY_NOT_REACHED();
111 });
112}
113
114void Parser::append_comment(StringView text)
115{
116 if (m_listener) {
117 m_listener->comment(text);
118 return;
119 }
120
121 // If there's no node to attach this to, drop it on the floor.
122 // This can happen to comments in the prolog.
123 if (!m_entered_node)
124 return;
125
126 m_entered_node->content.visit(
127 [&](Node::Element& node) {
128 node.children.append(make<Node>(Node::Comment { text }));
129 },
130 [&](auto&) {
131 // Can't enter a text or comment node.
132 VERIFY_NOT_REACHED();
133 });
134}
135
136void Parser::enter_node(Node& node)
137{
138 if (m_listener) {
139 auto& element = node.content.get<Node::Element>();
140 m_listener->element_start(element.name, element.attributes);
141 }
142
143 if (&node != m_root_node.ptr())
144 node.parent = m_entered_node;
145 m_entered_node = &node;
146}
147
148void Parser::leave_node()
149{
150 if (m_listener) {
151 auto& element = m_entered_node->content.get<Node::Element>();
152 m_listener->element_end(element.name);
153 }
154
155 m_entered_node = m_entered_node->parent;
156}
157
158ErrorOr<Document, ParseError> Parser::parse()
159{
160 if (auto result = parse_internal(); result.is_error()) {
161 if (m_parse_errors.is_empty())
162 return result.release_error();
163 return m_parse_errors.take_first();
164 }
165 return Document {
166 m_root_node.release_nonnull(),
167 move(m_doctype),
168 move(m_processing_instructions),
169 m_version,
170 };
171}
172
173ErrorOr<void, ParseError> Parser::parse_with_listener(Listener& listener)
174{
175 m_listener = &listener;
176 ScopeGuard unset_listener { [this] { m_listener = nullptr; } };
177 m_listener->set_source(m_source);
178 m_listener->document_start();
179 auto result = parse_internal();
180 if (result.is_error())
181 m_listener->error(result.error());
182 m_listener->document_end();
183 m_root_node.clear();
184 return result;
185}
186
187// 2.3.3. S, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-S
188ErrorOr<void, ParseError> Parser::skip_whitespace(Required required)
189{
190 auto rollback = rollback_point();
191 auto rule = enter_rule();
192
193 // S ::= (#x20 | #x9 | #xD | #xA)+
194 auto matched = m_lexer.consume_while(is_any_of("\x20\x09\x0d\x0a"sv));
195 if (required == Required::Yes && matched.is_empty())
196 return parse_error(m_lexer.tell(), "Expected whitespace");
197
198 rollback.disarm();
199 return {};
200}
201
202// 2.2.a. RestrictedChar, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-RestrictedChar
203constexpr static auto s_restricted_characters = ranges_for_search<Range(0x1, 0x8), Range(0xb, 0xc), Range(0xe, 0x1f), Range(0x7f, 0x84), Range(0x86, 0x9f)>();
204
205// 2.1.1. Document, https://www.w3.org/TR/2006/REC-xml11-20060816/#sec-well-formed
206ErrorOr<void, ParseError> Parser::parse_internal()
207{
208 auto rule = enter_rule();
209
210 // document ::= ( prolog element Misc* ) - ( Char* RestrictedChar Char* )
211 TRY(parse_prolog());
212 TRY(parse_element());
213 while (true) {
214 if (auto result = parse_misc(); result.is_error())
215 break;
216 }
217
218 auto matched_source = m_source.substring_view(0, m_lexer.tell());
219 if (auto it = find_if(matched_source.begin(), matched_source.end(), s_restricted_characters); !it.is_end()) {
220 return parse_error(
221 it.index(),
222 DeprecatedString::formatted("Invalid character #{:x} used in document", *it));
223 }
224
225 if (!m_lexer.is_eof())
226 return parse_error(m_lexer.tell(), "Garbage after document");
227
228 return {};
229}
230
231ErrorOr<void, ParseError> Parser::expect(StringView expected)
232{
233 auto rollback = rollback_point();
234
235 if (!m_lexer.consume_specific(expected)) {
236 if (m_options.treat_errors_as_fatal)
237 return parse_error(m_lexer.tell(), DeprecatedString::formatted("Expected '{}'", expected));
238 }
239
240 rollback.disarm();
241 return {};
242}
243
244template<typename Pred>
245requires(IsCallableWithArguments<Pred, bool, char>) ErrorOr<StringView, ParseError> Parser::expect(Pred predicate, StringView description)
246{
247 auto rollback = rollback_point();
248 auto start = m_lexer.tell();
249 if (!m_lexer.next_is(predicate)) {
250 if (m_options.treat_errors_as_fatal)
251 return parse_error(m_lexer.tell(), DeprecatedString::formatted("Expected {}", description));
252 }
253
254 m_lexer.ignore();
255 rollback.disarm();
256 return m_source.substring_view(start, m_lexer.tell() - start);
257}
258
259template<typename Pred>
260requires(IsCallableWithArguments<Pred, bool, char>) ErrorOr<StringView, ParseError> Parser::expect_many(Pred predicate, StringView description)
261{
262 auto rollback = rollback_point();
263 auto start = m_lexer.tell();
264 while (m_lexer.next_is(predicate)) {
265 if (m_lexer.is_eof())
266 break;
267 m_lexer.ignore();
268 }
269
270 if (m_lexer.tell() == start) {
271 if (m_options.treat_errors_as_fatal) {
272 return parse_error(m_lexer.tell(), DeprecatedString::formatted("Expected {}", description));
273 }
274 }
275
276 rollback.disarm();
277 return m_source.substring_view(start, m_lexer.tell() - start);
278}
279
280// 2.8.22. Prolog, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-prolog
281ErrorOr<void, ParseError> Parser::parse_prolog()
282{
283 auto rollback = rollback_point();
284 auto rule = enter_rule();
285
286 // prolog ::= XMLDecl Misc* (doctypedecl Misc*)?
287 // The following is valid in XML 1.0.
288 // prolog ::= XMLDecl? Misc* (doctypedecl Misc*)?
289 if (auto result = parse_xml_decl(); result.is_error()) {
290 m_version = Version::Version10;
291 m_in_compatibility_mode = true;
292 }
293 auto accept = accept_rule();
294
295 while (true) {
296 if (auto result = parse_misc(); result.is_error())
297 break;
298 }
299
300 if (auto result = parse_doctype_decl(); !result.is_error()) {
301 while (true) {
302 if (auto result = parse_misc(); result.is_error())
303 break;
304 }
305 }
306
307 rollback.disarm();
308 return {};
309}
310
311// 2.8.23. XMLDecl, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-XMLDecl
312ErrorOr<void, ParseError> Parser::parse_xml_decl()
313{
314 auto rollback = rollback_point();
315 auto rule = enter_rule();
316
317 // XMLDecl::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>'
318
319 TRY(expect("<?xml"sv));
320 auto accept = accept_rule();
321
322 TRY(parse_version_info());
323 (void)parse_encoding_decl();
324 (void)parse_standalone_document_decl();
325 TRY(skip_whitespace());
326 TRY(expect("?>"sv));
327
328 rollback.disarm();
329 return {};
330}
331
332// 2.8.24. VersionInfo, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-VersionInfo
333ErrorOr<void, ParseError> Parser::parse_version_info()
334{
335 auto rollback = rollback_point();
336 auto rule = enter_rule();
337
338 // VersionInfo ::= S 'version' Eq ("'" VersionNum "'" | '"' VersionNum '"')
339 TRY(skip_whitespace(Required::Yes));
340 TRY(expect("version"sv));
341 auto accept = accept_rule();
342
343 TRY(parse_eq());
344 TRY(expect(is_any_of("'\""sv), "one of ' or \""sv));
345 m_lexer.retreat();
346
347 auto version_string = m_lexer.consume_quoted_string();
348 if (version_string == "1.0") {
349 // FIXME: Compatibility mode, figure out which rules are different in XML 1.0.
350 m_version = Version::Version10;
351 m_in_compatibility_mode = true;
352 } else {
353 if (version_string != "1.1" && m_options.treat_errors_as_fatal)
354 return parse_error(m_lexer.tell(), DeprecatedString::formatted("Expected '1.1', found '{}'", version_string));
355 }
356
357 m_version = Version::Version11;
358 rollback.disarm();
359 return {};
360}
361
362// 2.8.25. Eq, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-Eq
363ErrorOr<void, ParseError> Parser::parse_eq()
364{
365 auto rollback = rollback_point();
366 auto rule = enter_rule();
367
368 // Eq ::= S? '=' S?
369 auto accept = accept_rule();
370 TRY(skip_whitespace());
371 TRY(expect("="sv));
372 TRY(skip_whitespace());
373 rollback.disarm();
374 return {};
375}
376
377// 4.3.3.80. EncodingDecl, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-EncodingDecl
378ErrorOr<void, ParseError> Parser::parse_encoding_decl()
379{
380 auto rollback = rollback_point();
381 auto rule = enter_rule();
382
383 // EncodingDecl ::= S 'encoding' Eq ('"' EncName '"' | "'" EncName "'" )
384 TRY(skip_whitespace(Required::Yes));
385 TRY(expect("encoding"sv));
386 auto accept = accept_rule();
387
388 TRY(parse_eq());
389 TRY(expect(is_any_of("'\""sv), "one of ' or \""sv));
390 m_lexer.retreat();
391
392 // FIXME: Actually do something with this encoding.
393 m_encoding = m_lexer.consume_quoted_string();
394
395 rollback.disarm();
396 return {};
397}
398
399// 2.9.32 SDDecl, https://www.w3.org/TR/2006/REC-xml11-20060816/#sec-rmd
400ErrorOr<void, ParseError> Parser::parse_standalone_document_decl()
401{
402 auto rollback = rollback_point();
403 auto rule = enter_rule();
404
405 // SDDecl ::= S 'standalone' Eq (("'" ('yes' | 'no') "'") | ('"' ('yes' | 'no') '"'))
406 TRY(skip_whitespace(Required::Yes));
407 TRY(expect("standalone"sv));
408 auto accept = accept_rule();
409
410 TRY(expect(is_any_of("'\""sv), "one of ' or \""sv));
411 m_lexer.retreat();
412
413 auto value = m_lexer.consume_quoted_string();
414 if (!value.is_one_of("yes", "no"))
415 return parse_error(m_lexer.tell() - value.length(), "Expected one of 'yes' or 'no'");
416
417 m_standalone = value == "yes";
418
419 rollback.disarm();
420 return {};
421}
422
423// 2.8.27. Misc, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-Misc
424ErrorOr<void, ParseError> Parser::parse_misc()
425{
426 auto rollback = rollback_point();
427 auto rule = enter_rule();
428
429 // Misc ::= Comment | PI | S
430 if (auto result = parse_comment(); !result.is_error()) {
431 rollback.disarm();
432 return {};
433 }
434
435 if (auto result = parse_processing_instruction(); !result.is_error()) {
436 rollback.disarm();
437 return {};
438 }
439
440 if (auto result = skip_whitespace(Required::Yes); !result.is_error()) {
441 rollback.disarm();
442 return {};
443 }
444
445 return parse_error(m_lexer.tell(), "Expected a match for 'Misc', but found none");
446}
447
448// 2.5.15 Comment, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-Comment
449ErrorOr<void, ParseError> Parser::parse_comment()
450{
451 auto rollback = rollback_point();
452 auto rule = enter_rule();
453
454 // Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
455 TRY(expect("<!--"sv));
456 auto accept = accept_rule();
457
458 bool last_seen_a_dash = false;
459 // FIXME: This should disallow surrogate blocks
460 auto text = m_lexer.consume_while([&](auto ch) {
461 if (ch != '-') {
462 last_seen_a_dash = false;
463 return true;
464 }
465
466 if (last_seen_a_dash)
467 return false;
468
469 last_seen_a_dash = true;
470 return true;
471 });
472
473 if (last_seen_a_dash) {
474 m_lexer.retreat();
475 text = text.substring_view(0, text.length() - 1);
476 }
477
478 TRY(expect("-->"sv));
479
480 if (m_options.preserve_comments)
481 append_comment(text);
482
483 rollback.disarm();
484 return {};
485}
486
487// 2.6.16 PI, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PI
488ErrorOr<void, ParseError> Parser::parse_processing_instruction()
489{
490 auto rollback = rollback_point();
491 auto rule = enter_rule();
492
493 // PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
494 TRY(expect("<?"sv));
495 auto accept = accept_rule();
496
497 auto target = TRY(parse_processing_instruction_target());
498 DeprecatedString data;
499 if (auto result = skip_whitespace(Required::Yes); !result.is_error())
500 data = m_lexer.consume_until("?>");
501 TRY(expect("?>"sv));
502
503 m_processing_instructions.set(target, data);
504 rollback.disarm();
505 return {};
506}
507
508// 2.6.17. PITarget, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PITarget
509ErrorOr<Name, ParseError> Parser::parse_processing_instruction_target()
510{
511 auto rollback = rollback_point();
512 auto rule = enter_rule();
513
514 // PITarget ::= Name - (('X' | 'x') ('M' | 'm') ('L' | 'l'))
515 auto target = TRY(parse_name());
516 auto accept = accept_rule();
517
518 if (target.equals_ignoring_ascii_case("xml"sv) && m_options.treat_errors_as_fatal) {
519 return parse_error(
520 m_lexer.tell() - target.length(),
521 "Use of the reserved 'xml' name for processing instruction target name is disallowed");
522 }
523
524 rollback.disarm();
525 return target;
526}
527
528// NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | [#x10000-#xEFFFF]
529constexpr static auto s_name_start_characters = ranges_for_search<Range(':', ':'), Range('A', 'Z'), Range('_', '_'), Range('a', 'z'), Range(0xc0, 0xd6), Range(0xd8, 0xf6), Range(0xf8, 0x2ff), Range(0x370, 0x37d), Range(0x37f, 0x1fff), Range(0x200c, 0x200d), Range(0x2070, 0x218f), Range(0x2c00, 0x2fef), Range(0x3001, 0xd7ff), Range(0xf900, 0xfdcf), Range(0xfdf0, 0xfffd), Range(0x10000, 0xeffff)> {};
530
531// NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] | [#x203F-#x2040]
532constexpr static auto s_name_characters = s_name_start_characters.with<Range('-', '-'), Range('.', '.'), Range('0', '9'), Range(0xb7, 0xb7), Range(0x0300, 0x036f), Range(0x203f, 0x2040)>();
533
534// 2.3.5. Name, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-Name
535ErrorOr<Name, ParseError> Parser::parse_name()
536{
537 auto rollback = rollback_point();
538 auto rule = enter_rule();
539
540 // Name ::= NameStartChar (NameChar)*
541 auto start = TRY(expect(s_name_start_characters, "a NameStartChar"sv));
542 auto accept = accept_rule();
543
544 auto rest = m_lexer.consume_while(s_name_characters);
545 StringBuilder builder;
546 builder.append(start);
547 builder.append(rest);
548
549 rollback.disarm();
550 return builder.to_deprecated_string();
551}
552
553// 2.8.28. doctypedecl, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-doctypedecl
554ErrorOr<void, ParseError> Parser::parse_doctype_decl()
555{
556 auto rollback = rollback_point();
557 auto rule = enter_rule();
558 Doctype doctype;
559
560 // doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S? ('[' intSubset ']' S?)? '>'
561 TRY(expect("<!DOCTYPE"sv));
562 auto accept = accept_rule();
563
564 TRY(skip_whitespace(Required::Yes));
565 doctype.type = TRY(parse_name());
566 if (auto result = skip_whitespace(Required::Yes); !result.is_error()) {
567 auto id_start = m_lexer.tell();
568 if (auto id_result = parse_external_id(); !id_result.is_error()) {
569 doctype.external_id = id_result.release_value();
570 if (m_options.resolve_external_resource) {
571 auto resource_result = m_options.resolve_external_resource(doctype.external_id->system_id, doctype.external_id->public_id);
572 if (resource_result.is_error()) {
573 return parse_error(
574 id_start,
575 DeprecatedString::formatted("Failed to resolve external subset '{}': {}", doctype.external_id->system_id.system_literal, resource_result.error()));
576 }
577 StringView resolved_source = resource_result.value();
578 TemporaryChange source { m_source, resolved_source };
579 TemporaryChange lexer { m_lexer, GenericLexer(m_source) };
580 auto declarations = TRY(parse_external_subset());
581 if (!m_lexer.is_eof()) {
582 return parse_error(
583 m_lexer.tell(),
584 DeprecatedString::formatted("Failed to resolve external subset '{}': garbage after declarations", doctype.external_id->system_id.system_literal));
585 }
586 doctype.markup_declarations.extend(move(declarations));
587 }
588 }
589 }
590 TRY(skip_whitespace(Required::No));
591 if (m_lexer.consume_specific('[')) {
592 auto internal_subset = TRY(parse_internal_subset());
593 TRY(expect("]"sv));
594 TRY(skip_whitespace());
595 doctype.markup_declarations.extend(internal_subset);
596 }
597
598 TRY(expect(">"sv));
599
600 rollback.disarm();
601 m_doctype = move(doctype);
602 return {};
603}
604
605// 3.39. element, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-element
606ErrorOr<void, ParseError> Parser::parse_element()
607{
608 auto rollback = rollback_point();
609 auto rule = enter_rule();
610
611 // element ::= EmptyElemTag
612 // | STag content ETag
613 if (auto result = parse_empty_element_tag(); !result.is_error()) {
614 append_node(result.release_value());
615 rollback.disarm();
616 return {};
617 }
618
619 auto start_tag = TRY(parse_start_tag());
620 auto& node = *start_tag;
621 auto& tag = node.content.get<Node::Element>();
622 append_node(move(start_tag));
623 enter_node(node);
624 ScopeGuard quit {
625 [&] {
626 leave_node();
627 }
628 };
629
630 TRY(parse_content());
631
632 auto tag_location = m_lexer.tell();
633 auto closing_name = TRY(parse_end_tag());
634
635 // Well-formedness constraint: The Name in an element's end-tag MUST match the element type in the start-tag.
636 if (m_options.treat_errors_as_fatal && closing_name != tag.name)
637 return parse_error(tag_location, "Invalid closing tag");
638
639 rollback.disarm();
640 return {};
641}
642
643// 3.1.44. EmptyElemTag, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-EmptyElemTag
644ErrorOr<NonnullOwnPtr<Node>, ParseError> Parser::parse_empty_element_tag()
645{
646 auto rollback = rollback_point();
647 auto rule = enter_rule();
648
649 // EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
650 TRY(expect("<"sv));
651 auto accept = accept_rule();
652
653 auto name = TRY(parse_name());
654 HashMap<Name, DeprecatedString> attributes;
655
656 while (true) {
657 if (auto result = skip_whitespace(Required::Yes); result.is_error())
658 break;
659
660 if (auto result = parse_attribute(); !result.is_error()) {
661 auto attribute = result.release_value();
662 attributes.set(move(attribute.name), move(attribute.value));
663 } else {
664 break;
665 }
666 }
667
668 TRY(skip_whitespace());
669 TRY(expect("/>"sv));
670
671 rollback.disarm();
672 return make<Node>(Node::Element { move(name), move(attributes), {} });
673}
674
675// 3.1.41. Attribute, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-Attribute
676ErrorOr<Attribute, ParseError> Parser::parse_attribute()
677{
678 auto rollback = rollback_point();
679 auto rule = enter_rule();
680
681 // Attribute ::= Name Eq AttValue
682 auto name = TRY(parse_name());
683 auto accept = accept_rule();
684
685 TRY(parse_eq());
686 auto value = TRY(parse_attribute_value());
687
688 rollback.disarm();
689 return Attribute {
690 move(name),
691 move(value),
692 };
693}
694
695// 2.3.10. AttValue, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-AttValue
696ErrorOr<DeprecatedString, ParseError> Parser::parse_attribute_value()
697{
698 auto rollback = rollback_point();
699 auto rule = enter_rule();
700
701 // AttValue ::= '"' ([^<&"] | Reference)* '"'
702 // | "'" ([^<&'] | Reference)* "'"
703 auto quote = TRY(expect(is_any_of("'\""sv), "one of ' or \""sv));
704 auto accept = accept_rule();
705
706 auto text = TRY(parse_attribute_value_inner(quote));
707 TRY(expect(quote));
708
709 rollback.disarm();
710 return text;
711}
712
713ErrorOr<DeprecatedString, ParseError> Parser::parse_attribute_value_inner(StringView disallow)
714{
715 StringBuilder builder;
716 while (true) {
717 if (m_lexer.next_is(is_any_of(disallow)) || m_lexer.is_eof())
718 break;
719
720 if (m_lexer.next_is('<')) {
721 // Not allowed, return a nice error to make it easier to debug.
722 return parse_error(m_lexer.tell(), "Unescaped '<' not allowed in attribute values");
723 }
724
725 if (m_lexer.next_is('&')) {
726 auto reference = TRY(parse_reference());
727 if (auto* char_reference = reference.get_pointer<DeprecatedString>())
728 builder.append(*char_reference);
729 else
730 builder.append(TRY(resolve_reference(reference.get<EntityReference>(), ReferencePlacement::AttributeValue)));
731 } else {
732 builder.append(m_lexer.consume());
733 }
734 }
735 return builder.to_deprecated_string();
736}
737
738// Char ::= [#x1-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
739constexpr static auto s_characters = ranges_for_search<Range(0x1, 0xd7ff), Range(0xe000, 0xfffd), Range(0x10000, 0x10ffff)>();
740
741// 4.1.67. Reference, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-Reference
742ErrorOr<Variant<Parser::EntityReference, DeprecatedString>, ParseError> Parser::parse_reference()
743{
744 auto rollback = rollback_point();
745 auto rule = enter_rule();
746 // Reference ::= EntityRef | CharRef
747
748 // 4.1.68. EntityRef
749 // EntityRef ::= '&' Name ';'
750
751 // 4.1.66. CharRef
752 // CharRef ::= '&#' [0-9]+ ';'
753 // | '&#x' [0-9a-fA-F]+ ';'
754
755 auto reference_start = m_lexer.tell();
756 TRY(expect("&"sv));
757 auto accept = accept_rule();
758
759 auto name_result = parse_name();
760 if (name_result.is_error()) {
761 TRY(expect("#"sv));
762 Optional<u32> code_point;
763 if (m_lexer.consume_specific('x')) {
764 auto hex = TRY(expect_many(
765 ranges_for_search<Range('0', '9'), Range('a', 'f'), Range('A', 'F')>(),
766 "any of [0-9a-fA-F]"sv));
767 code_point = AK::StringUtils::convert_to_uint_from_hex<u32>(hex);
768 } else {
769 auto decimal = TRY(expect_many(
770 ranges_for_search<Range('0', '9')>(),
771 "any of [0-9]"sv));
772 code_point = decimal.to_uint<u32>();
773 }
774
775 if (!code_point.has_value() || !s_characters.contains(*code_point))
776 return parse_error(reference_start, "Invalid character reference");
777
778 TRY(expect(";"sv));
779
780 StringBuilder builder;
781 builder.append_code_point(*code_point);
782
783 rollback.disarm();
784 return builder.to_deprecated_string();
785 }
786
787 auto name = name_result.release_value();
788 TRY(expect(";"sv));
789
790 rollback.disarm();
791 return EntityReference { move(name) };
792}
793
794// 3.1.40 STag, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-STag
795ErrorOr<NonnullOwnPtr<Node>, ParseError> Parser::parse_start_tag()
796{
797 auto rollback = rollback_point();
798 auto rule = enter_rule();
799
800 // STag ::= '<' Name (S Attribute)* S? '>'
801 TRY(expect("<"sv));
802 auto accept = accept_rule();
803
804 auto name = TRY(parse_name());
805 HashMap<Name, DeprecatedString> attributes;
806
807 while (true) {
808 if (auto result = skip_whitespace(Required::Yes); result.is_error())
809 break;
810
811 if (auto result = parse_attribute(); !result.is_error()) {
812 auto attribute = result.release_value();
813 attributes.set(move(attribute.name), move(attribute.value));
814 } else {
815 break;
816 }
817 }
818
819 TRY(skip_whitespace());
820 TRY(expect(">"sv));
821
822 rollback.disarm();
823 return make<Node>(Node::Element { move(name), move(attributes), {} });
824}
825
826// 3.1.42 ETag, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-ETag
827ErrorOr<Name, ParseError> Parser::parse_end_tag()
828{
829 auto rollback = rollback_point();
830 auto rule = enter_rule();
831
832 // ETag ::= '</' Name S? '>'
833 TRY(expect("</"sv));
834 auto accept = accept_rule();
835
836 auto name = TRY(parse_name());
837 TRY(skip_whitespace());
838 TRY(expect(">"sv));
839
840 rollback.disarm();
841 return name;
842}
843
844// 3.1.42 content, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-content
845ErrorOr<void, ParseError> Parser::parse_content()
846{
847 auto rollback = rollback_point();
848 auto rule = enter_rule();
849
850 // content ::= CharData? ((element | Reference | CDSect | PI | Comment) CharData?)*
851 if (auto result = parse_char_data(); !result.is_error())
852 append_text(result.release_value());
853
854 while (true) {
855 if (auto result = parse_element(); !result.is_error())
856 goto try_char_data;
857 if (auto result = parse_reference(); !result.is_error()) {
858 auto reference = result.release_value();
859 if (auto char_reference = reference.get_pointer<DeprecatedString>())
860 append_text(*char_reference);
861 else
862 TRY(resolve_reference(reference.get<EntityReference>(), ReferencePlacement::Content));
863 goto try_char_data;
864 }
865 if (auto result = parse_cdata_section(); !result.is_error()) {
866 if (m_options.preserve_cdata)
867 append_text(result.release_value());
868 goto try_char_data;
869 }
870 if (auto result = parse_processing_instruction(); !result.is_error())
871 goto try_char_data;
872 if (auto result = parse_comment(); !result.is_error())
873 goto try_char_data;
874
875 break;
876
877 try_char_data:;
878 if (auto result = parse_char_data(); !result.is_error())
879 append_text(result.release_value());
880 }
881
882 rollback.disarm();
883 return {};
884}
885
886// 2.4.14 CharData, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-CharData
887ErrorOr<StringView, ParseError> Parser::parse_char_data()
888{
889 auto rollback = rollback_point();
890 auto rule = enter_rule();
891
892 // CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
893 auto cend_state = 0; // 1: ], 2: ], 3: >
894 auto text = m_lexer.consume_while([&](auto ch) {
895 if (ch == '<' || ch == '&' || cend_state == 3)
896 return false;
897 switch (cend_state) {
898 case 0:
899 case 1:
900 if (ch == ']')
901 cend_state++;
902 else
903 cend_state = 0;
904 return true;
905 case 2:
906 if (ch == '>') {
907 cend_state++;
908 return true;
909 }
910 cend_state = 0;
911 return true;
912 default:
913 VERIFY_NOT_REACHED();
914 }
915 });
916 if (cend_state == 3) {
917 m_lexer.retreat(3);
918 text = text.substring_view(0, text.length() - 3);
919 }
920
921 rollback.disarm();
922 return text;
923}
924
925// 2.8.28b intSubset, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-intSubset
926ErrorOr<Vector<MarkupDeclaration>, ParseError> Parser::parse_internal_subset()
927{
928 auto rollback = rollback_point();
929 auto rule = enter_rule();
930 Vector<MarkupDeclaration> declarations;
931
932 // intSubset ::= (markupdecl | DeclSep)*
933 while (true) {
934 if (auto result = parse_markup_declaration(); !result.is_error()) {
935 auto maybe_declaration = result.release_value();
936 if (maybe_declaration.has_value())
937 declarations.append(maybe_declaration.release_value());
938 continue;
939 }
940 if (auto result = parse_declaration_separator(); !result.is_error()) {
941 // The markup declarations may be made up in whole or in part of the replacement text of parameter entities.
942 // The replacement text of a parameter entity reference in a DeclSep MUST match the production extSubsetDecl.
943 auto maybe_replacement_text = result.release_value();
944 if (maybe_replacement_text.has_value()) {
945 TemporaryChange<StringView> source { m_source, maybe_replacement_text.value() };
946 TemporaryChange lexer { m_lexer, GenericLexer { m_source } };
947
948 auto contained_declarations = TRY(parse_external_subset_declaration());
949 declarations.extend(move(contained_declarations));
950 }
951 continue;
952 }
953 break;
954 }
955
956 rollback.disarm();
957 return declarations;
958}
959
960// 2.8.29 markupdecl, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-markupdecl
961ErrorOr<Optional<MarkupDeclaration>, ParseError> Parser::parse_markup_declaration()
962{
963 auto rollback = rollback_point();
964 auto rule = enter_rule();
965
966 // markupdecl ::= elementdecl | AttlistDecl | EntityDecl | NotationDecl | PI | Comment
967 if (auto result = parse_element_declaration(); !result.is_error()) {
968 rollback.disarm();
969 return MarkupDeclaration { result.release_value() };
970 }
971 if (auto result = parse_attribute_list_declaration(); !result.is_error()) {
972 rollback.disarm();
973 return MarkupDeclaration { result.release_value() };
974 }
975 if (auto result = parse_entity_declaration(); !result.is_error()) {
976 rollback.disarm();
977 return MarkupDeclaration { result.release_value() };
978 }
979 if (auto result = parse_notation_declaration(); !result.is_error()) {
980 rollback.disarm();
981 return MarkupDeclaration { result.release_value() };
982 }
983 if (auto result = parse_processing_instruction(); !result.is_error()) {
984 rollback.disarm();
985 return Optional<MarkupDeclaration> {};
986 }
987 if (auto result = parse_comment(); !result.is_error()) {
988 rollback.disarm();
989 return Optional<MarkupDeclaration> {};
990 }
991
992 return parse_error(m_lexer.tell(), "Expected one of elementdecl, attlistdecl, entitydecl, notationdecl, PI or comment");
993}
994
995// 2.8.28a DeclSep, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-DeclSep
996ErrorOr<Optional<DeprecatedString>, ParseError> Parser::parse_declaration_separator()
997{
998 auto rollback = rollback_point();
999 auto rule = enter_rule();
1000
1001 // DeclSep ::= PEReference | S
1002 if (auto name = parse_parameter_entity_reference(); !name.is_error()) {
1003 rollback.disarm();
1004 // FIXME: Resolve this PEReference.
1005 return "";
1006 }
1007
1008 if (auto result = skip_whitespace(Required::Yes); !result.is_error()) {
1009 rollback.disarm();
1010 return Optional<DeprecatedString> {};
1011 }
1012
1013 return parse_error(m_lexer.tell(), "Expected either whitespace, or a PEReference");
1014}
1015
1016// 4.1.69 PEReference, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PEReference
1017ErrorOr<Name, ParseError> Parser::parse_parameter_entity_reference()
1018{
1019 auto rollback = rollback_point();
1020 auto rule = enter_rule();
1021
1022 // PEReference ::= '%' Name ';'
1023 TRY(expect("%"sv));
1024 auto accept = accept_rule();
1025
1026 auto name = TRY(parse_name());
1027 TRY(expect(";"sv));
1028
1029 rollback.disarm();
1030 return name;
1031}
1032
1033// 3.2.46 elementdecl, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-elementdecl
1034ErrorOr<ElementDeclaration, ParseError> Parser::parse_element_declaration()
1035{
1036 auto rollback = rollback_point();
1037 auto rule = enter_rule();
1038
1039 // FIXME: Apparently both name _and_ contentspec here are allowed to be PEReferences,
1040 // but the grammar does not allow that, figure this out.
1041 // elementdecl ::= '<!ELEMENT' S Name S contentspec S? '>'
1042 TRY(expect("<!ELEMENT"sv));
1043 auto accept = accept_rule();
1044
1045 TRY(skip_whitespace(Required::Yes));
1046 auto name = TRY(parse_name());
1047 TRY(skip_whitespace(Required::Yes));
1048 auto spec = TRY(parse_content_spec());
1049 TRY(expect(">"sv));
1050
1051 rollback.disarm();
1052 return ElementDeclaration {
1053 move(name),
1054 move(spec),
1055 };
1056}
1057
1058// 3.3.52 AttlistDecl, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-AttlistDecl
1059ErrorOr<AttributeListDeclaration, ParseError> Parser::parse_attribute_list_declaration()
1060{
1061 auto rollback = rollback_point();
1062 auto rule = enter_rule();
1063 AttributeListDeclaration declaration;
1064
1065 // AttlistDecl ::= '<!ATTLIST' S Name AttDef* S? '>'
1066 TRY(expect("<!ATTLIST"sv));
1067 auto accept = accept_rule();
1068
1069 TRY(skip_whitespace(Required::Yes));
1070 declaration.type = TRY(parse_name());
1071
1072 while (true) {
1073 if (auto result = parse_attribute_definition(); !result.is_error())
1074 declaration.attributes.append(result.release_value());
1075 else
1076 break;
1077 }
1078
1079 TRY(skip_whitespace());
1080 TRY(expect(">"sv));
1081
1082 rollback.disarm();
1083 return declaration;
1084}
1085
1086// 3.3.53 AttDef, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-AttDef
1087ErrorOr<AttributeListDeclaration::Definition, ParseError> Parser::parse_attribute_definition()
1088{
1089 auto rollback = rollback_point();
1090 auto rule = enter_rule();
1091 Optional<AttributeListDeclaration::Type> type;
1092 Optional<AttributeListDeclaration::Default> default_;
1093
1094 // AttDef ::= S Name S AttType S DefaultDecl
1095 TRY(skip_whitespace(Required::Yes));
1096 auto name = TRY(parse_name());
1097 auto accept = accept_rule();
1098
1099 TRY(skip_whitespace(Required::Yes));
1100
1101 // AttType ::= StringType | TokenizedType | EnumeratedType
1102 // StringType ::= 'CDATA'
1103 // TokenizedType ::= 'ID'
1104 // | 'IDREF'
1105 // | 'IDREFS'
1106 // | 'ENTITY'
1107 // | 'ENTITIES'
1108 // | 'NMTOKEN'
1109 // | 'NMTOKENS'
1110 // EnumeratedType ::= NotationType | Enumeration
1111 // NotationType ::= 'NOTATION' S '(' S? Name (S? '|' S? Name)* S? ')'
1112 // Enumeration ::= '(' S? Nmtoken (S? '|' S? Nmtoken)* S? ')'
1113 if (m_lexer.consume_specific("CDATA")) {
1114 type = AttributeListDeclaration::StringType::CData;
1115 } else if (m_lexer.consume_specific("IDREFS")) {
1116 type = AttributeListDeclaration::TokenizedType::IDRefs;
1117 } else if (m_lexer.consume_specific("IDREF")) {
1118 type = AttributeListDeclaration::TokenizedType::IDRef;
1119 } else if (m_lexer.consume_specific("ID")) {
1120 type = AttributeListDeclaration::TokenizedType::ID;
1121 } else if (m_lexer.consume_specific("ENTITIES")) {
1122 type = AttributeListDeclaration::TokenizedType::Entities;
1123 } else if (m_lexer.consume_specific("ENTITY")) {
1124 type = AttributeListDeclaration::TokenizedType::Entity;
1125 } else if (m_lexer.consume_specific("NMTOKENS")) {
1126 type = AttributeListDeclaration::TokenizedType::NMTokens;
1127 } else if (m_lexer.consume_specific("NMTOKEN")) {
1128 type = AttributeListDeclaration::TokenizedType::NMToken;
1129 } else if (m_lexer.consume_specific("NOTATION")) {
1130 HashTable<Name> names;
1131 TRY(skip_whitespace(Required::Yes));
1132 TRY(expect("("sv));
1133 TRY(skip_whitespace());
1134 names.set(TRY(parse_name()));
1135 while (true) {
1136 TRY(skip_whitespace());
1137 if (auto result = expect("|"sv); result.is_error())
1138 break;
1139 TRY(skip_whitespace());
1140 names.set(TRY(parse_name()));
1141 }
1142 TRY(skip_whitespace());
1143 TRY(expect(")"sv));
1144 type = AttributeListDeclaration::NotationType { move(names) };
1145 } else {
1146 HashTable<DeprecatedString> names;
1147 TRY(expect("("sv));
1148 TRY(skip_whitespace());
1149 names.set(TRY(parse_nm_token()));
1150 while (true) {
1151 TRY(skip_whitespace());
1152 if (auto result = expect("|"sv); result.is_error())
1153 break;
1154 TRY(skip_whitespace());
1155 names.set(TRY(parse_nm_token()));
1156 }
1157 TRY(skip_whitespace());
1158 TRY(expect(")"sv));
1159 type = AttributeListDeclaration::Enumeration { move(names) };
1160 }
1161
1162 TRY(skip_whitespace(Required::Yes));
1163
1164 // DefaultDecl ::= '#REQUIRED' | '#IMPLIED'
1165 // | (('#FIXED' S)? AttValue)
1166 if (m_lexer.consume_specific("#REQUIRED")) {
1167 default_ = AttributeListDeclaration::Required {};
1168 } else if (m_lexer.consume_specific("#IMPLIED")) {
1169 default_ = AttributeListDeclaration::Implied {};
1170 } else {
1171 bool fixed = false;
1172 if (m_lexer.consume_specific("#FIXED")) {
1173 TRY(skip_whitespace(Required::Yes));
1174 fixed = true;
1175 }
1176 auto value = TRY(parse_attribute_value());
1177 if (fixed)
1178 default_ = AttributeListDeclaration::Fixed { move(value) };
1179 else
1180 default_ = AttributeListDeclaration::DefaultValue { move(value) };
1181 }
1182
1183 rollback.disarm();
1184 return AttributeListDeclaration::Definition {
1185 move(name),
1186 type.release_value(),
1187 default_.release_value(),
1188 };
1189}
1190
1191// 2.3.7 Nmtoken, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-Nmtoken
1192ErrorOr<StringView, ParseError> Parser::parse_nm_token()
1193{
1194 auto rollback = rollback_point();
1195 auto rule = enter_rule();
1196
1197 // Nmtoken ::= (NameChar)+
1198 auto token = TRY(expect_many(s_name_characters, "a NameChar"sv));
1199
1200 rollback.disarm();
1201 return token;
1202}
1203
1204// 4.7.82 NotationDecl, https://www.w3.org/TR/2006/REC-xml11-20060816/#Notations
1205ErrorOr<NotationDeclaration, ParseError> Parser::parse_notation_declaration()
1206{
1207 auto rollback = rollback_point();
1208 auto rule = enter_rule();
1209 Variant<ExternalID, PublicID, Empty> notation;
1210
1211 // NotationDecl ::= '<!NOTATION' S Name S (ExternalID | PublicID) S? '>'
1212 TRY(expect("<!NOTATION"sv));
1213 auto accept = accept_rule();
1214
1215 TRY(skip_whitespace(Required::Yes));
1216 auto name = TRY(parse_name());
1217 TRY(skip_whitespace(Required::Yes));
1218
1219 if (auto result = parse_external_id(); !result.is_error())
1220 notation = result.release_value();
1221 else
1222 notation = TRY(parse_public_id());
1223
1224 TRY(expect(">"sv));
1225
1226 rollback.disarm();
1227 return NotationDeclaration {
1228 move(name),
1229 move(notation).downcast<ExternalID, PublicID>(),
1230 };
1231}
1232
1233// 3.2.46 contentspec, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-contentspec
1234ErrorOr<ElementDeclaration::ContentSpec, ParseError> Parser::parse_content_spec()
1235{
1236 auto rollback = rollback_point();
1237 auto rule = enter_rule();
1238 Optional<ElementDeclaration::ContentSpec> content_spec;
1239
1240 // contentspec ::= 'EMPTY' | 'ANY' | Mixed | children
1241 if (m_lexer.consume_specific("EMPTY")) {
1242 content_spec = ElementDeclaration::Empty {};
1243 } else if (m_lexer.consume_specific("ANY")) {
1244 content_spec = ElementDeclaration::Any {};
1245 } else {
1246 TRY(expect("("sv));
1247 TRY(skip_whitespace());
1248 if (m_lexer.consume_specific("#PCDATA")) {
1249 HashTable<Name> names;
1250 // Mixed ::= '(' S? '#PCDATA' (S? '|' S? Name)* S? ')*'
1251 // | '(' S? '#PCDATA' S? ')'
1252 TRY(skip_whitespace());
1253 if (m_lexer.consume_specific(")*")) {
1254 content_spec = ElementDeclaration::Mixed { .types = {}, .many = true };
1255 } else if (m_lexer.consume_specific(')')) {
1256 content_spec = ElementDeclaration::Mixed { .types = {}, .many = false };
1257 } else {
1258 while (true) {
1259 TRY(skip_whitespace());
1260 if (!m_lexer.consume_specific('|'))
1261 break;
1262 TRY(skip_whitespace());
1263 if (auto result = parse_name(); !result.is_error())
1264 names.set(result.release_value());
1265 else
1266 return parse_error(m_lexer.tell(), "Expected a Name");
1267 }
1268 TRY(skip_whitespace());
1269 TRY(expect(")*"sv));
1270 content_spec = ElementDeclaration::Mixed { .types = move(names), .many = true };
1271 }
1272 } else {
1273 while (!m_lexer.next_is('('))
1274 m_lexer.retreat();
1275 // children ::= (choice | seq) ('?' | '*' | '+')?
1276 // cp ::= (Name | choice | seq) ('?' | '*' | '+')?
1277 // choice ::= '(' S? cp ( S? '|' S? cp )+ S? ')'
1278 // seq ::= '(' S? cp ( S? ',' S? cp )* S? ')'
1279 Function<ErrorOr<ElementDeclaration::Children::Choice, ParseError>()> parse_choice;
1280 Function<ErrorOr<ElementDeclaration::Children::Sequence, ParseError>()> parse_sequence;
1281
1282 auto parse_cp_init = [&]() -> ErrorOr<Variant<Name, ElementDeclaration::Children::Choice, ElementDeclaration::Children::Sequence>, ParseError> {
1283 if (auto result = parse_name(); !result.is_error())
1284 return result.release_value();
1285 if (auto result = parse_choice(); !result.is_error())
1286 return result.release_value();
1287 return TRY(parse_sequence());
1288 };
1289 auto parse_qualifier = [&]() -> ElementDeclaration::Children::Qualifier {
1290 ElementDeclaration::Children::Qualifier qualifier { ElementDeclaration::Children::Qualifier::ExactlyOnce };
1291 if (m_lexer.consume_specific('?'))
1292 qualifier = ElementDeclaration::Children::Qualifier::Optional;
1293 else if (m_lexer.consume_specific('*'))
1294 qualifier = ElementDeclaration::Children::Qualifier::Any;
1295 else if (m_lexer.consume_specific('+'))
1296 qualifier = ElementDeclaration::Children::Qualifier::OneOrMore;
1297 return qualifier;
1298 };
1299 auto parse_cp = [&]() -> ErrorOr<ElementDeclaration::Children::Entry, ParseError> {
1300 auto sub_entry = TRY(parse_cp_init());
1301 auto qualifier = parse_qualifier();
1302 return ElementDeclaration::Children::Entry {
1303 move(sub_entry),
1304 qualifier,
1305 };
1306 };
1307 parse_choice = [&]() -> ErrorOr<ElementDeclaration::Children::Choice, ParseError> {
1308 auto rollback = rollback_point();
1309 auto rule = enter_rule();
1310
1311 TRY(expect("("sv));
1312 auto accept = accept_rule();
1313
1314 TRY(skip_whitespace());
1315 Vector<ElementDeclaration::Children::Entry> choices;
1316 choices.append(TRY(parse_cp()));
1317 while (true) {
1318 TRY(skip_whitespace());
1319 if (!m_lexer.consume_specific('|'))
1320 break;
1321 TRY(skip_whitespace());
1322 choices.append(TRY(parse_cp()));
1323 }
1324
1325 TRY(expect(")"sv));
1326
1327 if (choices.size() < 2)
1328 return parse_error(m_lexer.tell(), "Expected more than one choice");
1329
1330 TRY(skip_whitespace());
1331 auto qualifier = parse_qualifier();
1332
1333 rollback.disarm();
1334 return ElementDeclaration::Children::Choice {
1335 move(choices),
1336 qualifier,
1337 };
1338 };
1339 parse_sequence = [&]() -> ErrorOr<ElementDeclaration::Children::Sequence, ParseError> {
1340 auto rollback = rollback_point();
1341 auto rule = enter_rule();
1342
1343 TRY(expect("("sv));
1344 auto accept = accept_rule();
1345
1346 TRY(skip_whitespace());
1347 Vector<ElementDeclaration::Children::Entry> entries;
1348 entries.append(TRY(parse_cp()));
1349 while (true) {
1350 TRY(skip_whitespace());
1351 if (!m_lexer.consume_specific(','))
1352 break;
1353 TRY(skip_whitespace());
1354 entries.append(TRY(parse_cp()));
1355 }
1356
1357 TRY(expect(")"sv));
1358
1359 TRY(skip_whitespace());
1360 auto qualifier = parse_qualifier();
1361
1362 rollback.disarm();
1363 return ElementDeclaration::Children::Sequence {
1364 move(entries),
1365 qualifier,
1366 };
1367 };
1368 if (auto result = parse_choice(); !result.is_error()) {
1369 auto qualifier = parse_qualifier();
1370 content_spec = ElementDeclaration::Children {
1371 result.release_value(),
1372 qualifier,
1373 };
1374 } else {
1375 auto sequence = TRY(parse_sequence());
1376 auto qualifier = parse_qualifier();
1377 content_spec = ElementDeclaration::Children {
1378 move(sequence),
1379 qualifier,
1380 };
1381 }
1382 }
1383 }
1384
1385 rollback.disarm();
1386 return content_spec.release_value();
1387}
1388
1389// 2.8.31 extSubsetDecl, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-extSubsetDecl
1390ErrorOr<Vector<MarkupDeclaration>, ParseError> Parser::parse_external_subset_declaration()
1391{
1392 auto rollback = rollback_point();
1393 auto rule = enter_rule();
1394 Vector<MarkupDeclaration> declarations;
1395
1396 // extSubsetDecl ::= ( markupdecl | conditionalSect | DeclSep )*
1397 while (true) {
1398 if (auto result = parse_markup_declaration(); !result.is_error()) {
1399 if (result.value().has_value())
1400 declarations.append(result.release_value().release_value());
1401 continue;
1402 }
1403
1404 // FIXME: conditionalSect
1405
1406 if (auto result = parse_declaration_separator(); !result.is_error())
1407 continue;
1408
1409 break;
1410 }
1411
1412 rollback.disarm();
1413 return declarations;
1414}
1415
1416// 4.2.70 EntityDecl, https://www.w3.org/TR/xml/#NT-EntityDecl
1417ErrorOr<EntityDeclaration, ParseError> Parser::parse_entity_declaration()
1418{
1419 // EntityDecl ::= GEDecl | PEDecl
1420 if (auto result = parse_general_entity_declaration(); !result.is_error())
1421 return result;
1422
1423 return parse_parameter_entity_declaration();
1424}
1425
1426// 4.2.71 GEDecl, https://www.w3.org/TR/xml/#NT-GEDecl
1427ErrorOr<EntityDeclaration, ParseError> Parser::parse_general_entity_declaration()
1428{
1429 auto rollback = rollback_point();
1430 auto rule = enter_rule();
1431 Variant<DeprecatedString, EntityDefinition, Empty> definition;
1432
1433 // GEDecl ::= '<!ENTITY' S Name S EntityDef S? '>'
1434 TRY(expect("<!ENTITY"sv));
1435 auto accept = accept_rule();
1436
1437 TRY(skip_whitespace(Required::Yes));
1438 auto name = TRY(parse_name());
1439 TRY(skip_whitespace(Required::Yes));
1440 // EntityDef ::= EntityValue | (ExternalID NDataDecl?)
1441 if (auto result = parse_entity_value(); !result.is_error()) {
1442 definition = result.release_value();
1443 } else {
1444 auto external_id = TRY(parse_external_id());
1445 Optional<Name> notation;
1446 if (auto notation_result = parse_notation_data_declaration(); !notation_result.is_error())
1447 notation = notation_result.release_value();
1448
1449 definition = EntityDefinition {
1450 move(external_id),
1451 move(notation),
1452 };
1453 }
1454
1455 TRY(skip_whitespace());
1456 TRY(expect(">"sv));
1457
1458 rollback.disarm();
1459 return GEDeclaration {
1460 move(name),
1461 move(definition).downcast<DeprecatedString, EntityDefinition>(),
1462 };
1463}
1464
1465// 4.2.72 PEDecl, https://www.w3.org/TR/xml/#NT-PEDecl
1466ErrorOr<EntityDeclaration, ParseError> Parser::parse_parameter_entity_declaration()
1467{
1468 auto rollback = rollback_point();
1469 auto rule = enter_rule();
1470
1471 Variant<DeprecatedString, ExternalID, Empty> definition;
1472 // PEDecl ::= '<!ENTITY' S '%' S Name S PEDef S? '>'
1473 TRY(expect("<!ENTITY"sv));
1474 auto accept = accept_rule();
1475
1476 TRY(skip_whitespace(Required::Yes));
1477 TRY(expect("%"sv));
1478 TRY(skip_whitespace(Required::Yes));
1479 auto name = TRY(parse_name());
1480 TRY(skip_whitespace(Required::Yes));
1481 // PEDef ::= EntityValue | ExternalID
1482 if (auto result = parse_entity_value(); !result.is_error())
1483 definition = result.release_value();
1484 else
1485 definition = TRY(parse_external_id());
1486
1487 TRY(skip_whitespace());
1488 TRY(expect(">"sv));
1489
1490 rollback.disarm();
1491 return PEDeclaration {
1492 move(name),
1493 move(definition).downcast<DeprecatedString, ExternalID>(),
1494 };
1495}
1496
1497// 4.7.83 PublicID, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PublicID
1498ErrorOr<PublicID, ParseError> Parser::parse_public_id()
1499{
1500 auto rollback = rollback_point();
1501 auto rule = enter_rule();
1502
1503 // PublicID ::= 'PUBLIC' S PubidLiteral
1504 TRY(expect("PUBLIC"sv));
1505 auto accept = accept_rule();
1506
1507 TRY(skip_whitespace(Required::Yes));
1508 auto text = TRY(parse_public_id_literal());
1509
1510 rollback.disarm();
1511 return PublicID {
1512 text,
1513 };
1514}
1515
1516constexpr static auto s_public_id_characters = set_to_search<StringSet("\x20\x0d\x0a-'()+,./:=?;!*#@$_%")>().unify(ranges_for_search<Range('a', 'z'), Range('A', 'Z'), Range('0', '9')>());
1517
1518// 2.3.12, PubidLiteral, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PubidLiteral
1519ErrorOr<StringView, ParseError> Parser::parse_public_id_literal()
1520{
1521 auto rollback = rollback_point();
1522 auto rule = enter_rule();
1523
1524 // PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
1525 auto quote = TRY(expect(is_any_of("'\""sv), "any of ' or \""sv));
1526 auto accept = accept_rule();
1527
1528 auto id = TRY(expect_many(
1529 [q = quote[0]](auto x) {
1530 return (q == '\'' ? x != '\'' : true) && s_public_id_characters.contains(x);
1531 },
1532 "a PubidChar"sv));
1533 TRY(expect(quote));
1534
1535 rollback.disarm();
1536 return id;
1537}
1538
1539// 2.3.11 SystemLiteral, https://www.w3.org/TR/xml/#NT-SystemLiteral
1540ErrorOr<StringView, ParseError> Parser::parse_system_id_literal()
1541{
1542 auto rollback = rollback_point();
1543 auto rule = enter_rule();
1544
1545 // SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
1546 auto quote = TRY(expect(is_any_of("'\""sv), "any of ' or \""sv));
1547 auto accept = accept_rule();
1548
1549 auto id = TRY(expect_many(is_not_any_of(quote), "not a quote"sv));
1550 TRY(expect(quote));
1551
1552 rollback.disarm();
1553 return id;
1554}
1555
1556// 4.2.75 ExternalID, https://www.w3.org/TR/xml/#NT-ExternalID
1557ErrorOr<ExternalID, ParseError> Parser::parse_external_id()
1558{
1559 auto rollback = rollback_point();
1560 auto rule = enter_rule();
1561
1562 // ExternalID ::= 'SYSTEM' S SystemLiteral
1563 // | 'PUBLIC' S PubidLiteral S SystemLiteral
1564 Optional<PublicID> public_id;
1565 SystemID system_id;
1566
1567 if (m_lexer.consume_specific("SYSTEM")) {
1568 auto accept = accept_rule();
1569 TRY(skip_whitespace(Required::Yes));
1570 system_id = SystemID { TRY(parse_system_id_literal()) };
1571 } else {
1572 TRY(expect("PUBLIC"sv));
1573 auto accept = accept_rule();
1574
1575 TRY(skip_whitespace(Required::Yes));
1576 public_id = PublicID { TRY(parse_public_id_literal()) };
1577 TRY(skip_whitespace(Required::Yes));
1578 system_id = SystemID { TRY(parse_system_id_literal()) };
1579 }
1580
1581 rollback.disarm();
1582 return ExternalID {
1583 move(public_id),
1584 move(system_id),
1585 };
1586}
1587
1588// 4.2.2.76 NDataDecl, https://www.w3.org/TR/xml/#NT-NDataDecl
1589ErrorOr<Name, ParseError> Parser::parse_notation_data_declaration()
1590{
1591 auto rollback = rollback_point();
1592 auto rule = enter_rule();
1593
1594 // NDataDecl ::= S 'NDATA' S Name
1595 TRY(skip_whitespace(Required::Yes));
1596 auto accept = accept_rule();
1597
1598 TRY(expect("NDATA"sv));
1599 TRY(skip_whitespace(Required::Yes));
1600 auto name = TRY(parse_name());
1601
1602 rollback.disarm();
1603 return name;
1604}
1605
1606// 2.3.9 EntityValue, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-EntityValue
1607ErrorOr<DeprecatedString, ParseError> Parser::parse_entity_value()
1608{
1609 auto rollback = rollback_point();
1610 auto rule = enter_rule();
1611 StringBuilder builder;
1612
1613 // EntityValue ::= '"' ([^%&"] | PEReference | Reference)* '"'
1614 // | "'" ([^%&'] | PEReference | Reference)* "'"
1615 auto quote = TRY(expect(is_any_of("'\""sv), "any of ' or \""sv));
1616 auto accept = accept_rule();
1617
1618 while (true) {
1619 if (m_lexer.is_eof())
1620 break;
1621 if (m_lexer.next_is(quote))
1622 break;
1623 if (m_lexer.next_is('%')) {
1624 auto start = m_lexer.tell();
1625 TRY(parse_parameter_entity_reference());
1626 builder.append(m_source.substring_view(start, m_lexer.tell() - start));
1627 continue;
1628 }
1629 if (m_lexer.next_is('&')) {
1630 auto start = m_lexer.tell();
1631 TRY(parse_reference());
1632 builder.append(m_source.substring_view(start, m_lexer.tell() - start));
1633 continue;
1634 }
1635 builder.append(m_lexer.consume());
1636 }
1637 TRY(expect(quote));
1638
1639 rollback.disarm();
1640 return builder.to_deprecated_string();
1641}
1642
1643// 2.7.18 CDSect, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-CDSect
1644ErrorOr<StringView, ParseError> Parser::parse_cdata_section()
1645{
1646 auto rollback = rollback_point();
1647 auto rule = enter_rule();
1648
1649 // CDSect ::= CDStart CData CDEnd
1650 // CDStart ::= '<![CDATA['
1651 // CData ::= (Char* - (Char* ']]>' Char*))
1652 // CDEnd ::= ']]>'
1653 TRY(expect("<![CDATA["sv));
1654 auto accept = accept_rule();
1655
1656 auto section_start = m_lexer.tell();
1657 while (!m_lexer.next_is("]]>")) {
1658 if (m_lexer.is_eof())
1659 break;
1660 m_lexer.ignore();
1661 }
1662 auto section_end = m_lexer.tell();
1663 TRY(expect("]]>"sv));
1664
1665 rollback.disarm();
1666 return m_source.substring_view(section_start, section_end - section_start);
1667}
1668
1669// 2.8.30 extSubset, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-extSubset
1670ErrorOr<Vector<MarkupDeclaration>, ParseError> Parser::parse_external_subset()
1671{
1672 auto rollback = rollback_point();
1673 auto rule = enter_rule();
1674
1675 // extSubset ::= TextDecl? extSubsetDecl
1676 (void)parse_text_declaration();
1677 auto result = TRY(parse_external_subset_declaration());
1678
1679 rollback.disarm();
1680 return result;
1681}
1682
1683// 4.3.1.77 TextDecl, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-TextDecl
1684ErrorOr<void, ParseError> Parser::parse_text_declaration()
1685{
1686 auto rollback = rollback_point();
1687 auto rule = enter_rule();
1688
1689 // TextDecl ::= '<?xml' VersionInfo? EncodingDecl S? '?>'
1690 TRY(expect("<?xml"sv));
1691 auto accept = accept_rule();
1692
1693 (void)parse_version_info();
1694 TRY(parse_encoding_decl());
1695 TRY(skip_whitespace());
1696 TRY(expect("?>"sv));
1697
1698 rollback.disarm();
1699 return {};
1700}
1701
1702ErrorOr<DeprecatedString, ParseError> Parser::resolve_reference(EntityReference const& reference, ReferencePlacement placement)
1703{
1704 static HashTable<Name> reference_lookup {};
1705 if (reference_lookup.contains(reference.name))
1706 return parse_error(m_lexer.tell(), DeprecatedString::formatted("Invalid recursive definition for '{}'", reference.name));
1707
1708 reference_lookup.set(reference.name);
1709 ScopeGuard remove_lookup {
1710 [&] {
1711 reference_lookup.remove(reference.name);
1712 }
1713 };
1714
1715 Optional<DeprecatedString> resolved;
1716 if (m_doctype.has_value()) {
1717 // FIXME: Split these up and resolve them ahead of time.
1718 for (auto& declaration : m_doctype->markup_declarations) {
1719 auto entity = declaration.get_pointer<EntityDeclaration>();
1720 if (!entity)
1721 continue;
1722 auto ge_declaration = entity->get_pointer<GEDeclaration>();
1723 if (!ge_declaration)
1724 continue;
1725 if (ge_declaration->name != reference.name)
1726 continue;
1727 TRY(ge_declaration->definition.visit(
1728 [&](DeprecatedString const& definition) -> ErrorOr<void, ParseError> {
1729 resolved = definition;
1730 return {};
1731 },
1732 [&](EntityDefinition const& definition) -> ErrorOr<void, ParseError> {
1733 if (placement == ReferencePlacement::AttributeValue)
1734 return parse_error(m_lexer.tell(), DeprecatedString::formatted("Attribute references external entity '{}'", reference.name));
1735
1736 if (definition.notation.has_value())
1737 return parse_error(0u, DeprecatedString::formatted("Entity reference to unparsed entity '{}'", reference.name));
1738
1739 if (!m_options.resolve_external_resource)
1740 return parse_error(0u, DeprecatedString::formatted("Failed to resolve external entity '{}'", reference.name));
1741
1742 auto result = m_options.resolve_external_resource(definition.id.system_id, definition.id.public_id);
1743 if (result.is_error())
1744 return parse_error(0u, DeprecatedString::formatted("Failed to resolve external entity '{}': {}", reference.name, result.error()));
1745
1746 resolved = result.release_value();
1747 return {};
1748 }));
1749 break;
1750 }
1751 }
1752
1753 if (!resolved.has_value()) {
1754 if (reference.name == "amp")
1755 return "&";
1756 if (reference.name == "lt")
1757 return "<";
1758 if (reference.name == "gt")
1759 return ">";
1760 if (reference.name == "apos")
1761 return "'";
1762 if (reference.name == "quot")
1763 return "\"";
1764 return parse_error(0u, DeprecatedString::formatted("Reference to undeclared entity '{}'", reference.name));
1765 }
1766
1767 StringView resolved_source = *resolved;
1768 TemporaryChange source { m_source, resolved_source };
1769 TemporaryChange lexer { m_lexer, GenericLexer(m_source) };
1770 switch (placement) {
1771 case ReferencePlacement::AttributeValue:
1772 return TRY(parse_attribute_value_inner(""sv));
1773 case ReferencePlacement::Content:
1774 TRY(parse_content());
1775 return "";
1776 default:
1777 VERIFY_NOT_REACHED();
1778 }
1779}
1780
1781}