// Serenity Operating System
/*
 * Copyright (c) 2020, Andreas Kling <kling@serenityos.org>
 * Copyright (c) 2022, Linus Groh <linusg@serenityos.org>
 *
 * SPDX-License-Identifier: BSD-2-Clause
 */
7
8#include <AK/CharacterTypes.h>
9#include <AK/Debug.h>
10#include <AK/SourceLocation.h>
11#include <LibTextCodec/Decoder.h>
12#include <LibWeb/HTML/Parser/Entities.h>
13#include <LibWeb/HTML/Parser/HTMLParser.h>
14#include <LibWeb/HTML/Parser/HTMLToken.h>
15#include <LibWeb/HTML/Parser/HTMLTokenizer.h>
16#include <LibWeb/Namespace.h>
17#include <string.h>
18
19namespace Web::HTML {
20
21#pragma GCC diagnostic ignored "-Wunused-label"
22
// Stores the result of next_code_point() in the tokenizer loop's local
// `current_input_character`.
// NOTE: The expansion already ends with a semicolon.
#define CONSUME_NEXT_INPUT_CHARACTER \
    current_input_character = next_code_point();

// Enters `new_state`, fetches the next input character and jumps to the label
// of that state. Asserts that m_current_builder is empty.
#define SWITCH_TO(new_state)                         \
    do {                                             \
        VERIFY(m_current_builder.is_empty());        \
        will_switch_to(State::new_state);            \
        m_state = State::new_state;                  \
        current_input_character = next_code_point(); \
        goto new_state;                              \
    } while (false)

// Same as SWITCH_TO, but without the check that m_current_builder is empty.
#define SWITCH_TO_WITH_UNCLEAN_BUILDER(new_state)    \
    do {                                             \
        will_switch_to(State::new_state);            \
        m_state = State::new_state;                  \
        current_input_character = next_code_point(); \
        goto new_state;                              \
    } while (false)

// Enters `new_state` and jumps to its label while keeping the current input
// character, so that the new state handles it.
#define RECONSUME_IN(new_state)              \
    do {                                     \
        will_reconsume_in(State::new_state); \
        m_state = State::new_state;          \
        goto new_state;                      \
    } while (false)

// Makes m_return_state the current state and restarts at _StartOfFunction.
#define SWITCH_TO_RETURN_STATE          \
    do {                                \
        will_switch_to(m_return_state); \
        m_state = m_return_state;       \
        goto _StartOfFunction;          \
    } while (false)

// Makes m_return_state the current state and restarts at _StartOfFunction.
// Unless we are at end-of-file, the input is first rewound to
// m_prev_utf8_iterator.
#define RECONSUME_IN_RETURN_STATE                  \
    do {                                           \
        will_reconsume_in(m_return_state);         \
        m_state = m_return_state;                  \
        if (current_input_character.has_value()) { \
            restore_to(m_prev_utf8_iterator);      \
        }                                          \
        goto _StartOfFunction;                     \
    } while (false)

// Enters `new_state`, then queues the current token and returns the token at
// the front of the queue from the enclosing function. Asserts that
// m_current_builder is empty.
#define SWITCH_TO_AND_EMIT_CURRENT_TOKEN(new_state)     \
    do {                                                \
        VERIFY(m_current_builder.is_empty());           \
        will_switch_to(State::new_state);               \
        m_state = State::new_state;                     \
        will_emit(m_current_token);                     \
        m_queued_tokens.enqueue(move(m_current_token)); \
        return m_queued_tokens.dequeue();               \
    } while (false)

// Queues a character token for `code_point`, then reconsumes the current
// input character in `new_state`.
#define EMIT_CHARACTER_AND_RECONSUME_IN(code_point, new_state)          \
    do {                                                                \
        m_queued_tokens.enqueue(HTMLToken::make_character(code_point)); \
        RECONSUME_IN(new_state);                                        \
    } while (false)
80
// Flushes every code point in m_temporary_buffer: when
// consumed_as_part_of_an_attribute() is true the code point is appended to
// m_current_builder, otherwise it is queued as its own character token.
#define FLUSH_CODEPOINTS_CONSUMED_AS_A_CHARACTER_REFERENCE            \
    do {                                                              \
        for (auto buffered_code_point : m_temporary_buffer) {         \
            if (!consumed_as_part_of_an_attribute()) {                \
                create_new_token(HTMLToken::Type::Character);         \
                m_current_token.set_code_point(buffered_code_point);  \
                m_queued_tokens.enqueue(move(m_current_token));       \
                continue;                                             \
            }                                                         \
            m_current_builder.append_code_point(buffered_code_point); \
        }                                                             \
    } while (false)

// Rewinds the input to m_prev_utf8_iterator.
#define DONT_CONSUME_NEXT_INPUT_CHARACTER \
    do {                                  \
        restore_to(m_prev_utf8_iterator); \
    } while (false)
98
// Matchers used inside a tokenizer state. Each one expands to a bare `if`
// that tests the loop-local `current_input_character` and must be followed by
// the statement (usually a braced block) to run on a match.

// Matches when the current input character equals `code_point`.
// The argument is parenthesized so that compound expressions (for example a
// conditional expression) are compared as a whole.
#define ON(code_point) \
    if (current_input_character.has_value() && current_input_character.value() == (code_point))

// Matches when there is no current input character (end of input).
#define ON_EOF \
    if (!current_input_character.has_value())

// Matches when is_ascii_alpha() accepts the current input character.
#define ON_ASCII_ALPHA \
    if (current_input_character.has_value() && is_ascii_alpha(current_input_character.value()))

// Matches when is_ascii_alphanumeric() accepts the current input character.
#define ON_ASCII_ALPHANUMERIC \
    if (current_input_character.has_value() && is_ascii_alphanumeric(current_input_character.value()))

// Matches when is_ascii_upper_alpha() accepts the current input character.
#define ON_ASCII_UPPER_ALPHA \
    if (current_input_character.has_value() && is_ascii_upper_alpha(current_input_character.value()))

// Matches when is_ascii_lower_alpha() accepts the current input character.
#define ON_ASCII_LOWER_ALPHA \
    if (current_input_character.has_value() && is_ascii_lower_alpha(current_input_character.value()))

// Matches when is_ascii_digit() accepts the current input character.
#define ON_ASCII_DIGIT \
    if (current_input_character.has_value() && is_ascii_digit(current_input_character.value()))

// Matches when is_ascii_hex_digit() accepts the current input character.
#define ON_ASCII_HEX_DIGIT \
    if (current_input_character.has_value() && is_ascii_hex_digit(current_input_character.value()))

// Matches a tab, line feed, form feed or space.
#define ON_WHITESPACE \
    if (current_input_character.has_value() && is_ascii(*current_input_character) && first_is_one_of(static_cast<char>(*current_input_character), '\t', '\n', '\f', ' '))

// Always matches; used as the final fallback branch of a state.
#define ANYTHING_ELSE if (1)
127
// Emits the end-of-file token once: returns an empty value from the enclosing
// function if it was already emitted, otherwise queues a fresh EndOfFile
// token and returns the token at the front of the queue.
#define EMIT_EOF                                        \
    do {                                                \
        if (m_has_emitted_eof) {                        \
            return {};                                  \
        }                                               \
        m_has_emitted_eof = true;                       \
        create_new_token(HTMLToken::Type::EndOfFile);   \
        will_emit(m_current_token);                     \
        m_queued_tokens.enqueue(move(m_current_token)); \
        return m_queued_tokens.dequeue();               \
    } while (false)

// Queues the current token and returns the token at the front of the queue
// from the enclosing function. Asserts that m_current_builder is empty.
#define EMIT_CURRENT_TOKEN                              \
    do {                                                \
        VERIFY(m_current_builder.is_empty());           \
        will_emit(m_current_token);                     \
        m_queued_tokens.enqueue(move(m_current_token)); \
        return m_queued_tokens.dequeue();               \
    } while (false)

// Creates a character token carrying `code_point`, queues it and returns the
// token at the front of the queue from the enclosing function.
#define EMIT_CHARACTER(code_point)                      \
    do {                                                \
        create_new_token(HTMLToken::Type::Character);   \
        m_current_token.set_code_point(code_point);     \
        m_queued_tokens.enqueue(move(m_current_token)); \
        return m_queued_tokens.dequeue();               \
    } while (false)

// EMIT_CHARACTER for the current input character.
// NOTE: The expansion already ends with a semicolon.
#define EMIT_CURRENT_CHARACTER \
    EMIT_CHARACTER(current_input_character.value());

// Records `new_state` as the current state, then emits a character token for
// `code_point` (see EMIT_CHARACTER).
#define SWITCH_TO_AND_EMIT_CHARACTER(code_point, new_state) \
    do {                                                    \
        will_switch_to(State::new_state);                   \
        m_state = State::new_state;                         \
        EMIT_CHARACTER(code_point);                         \
    } while (false)

// SWITCH_TO_AND_EMIT_CHARACTER for the current input character.
#define SWITCH_TO_AND_EMIT_CURRENT_CHARACTER(new_state) \
    SWITCH_TO_AND_EMIT_CHARACTER(current_input_character.value(), new_state)
167
// Opens the handler for one tokenizer state. It provides both a goto label
// (target of the SWITCH_TO / RECONSUME_IN macros) and the matching `case` of
// the state switch, followed by three nested scopes that END_STATE closes.
#define BEGIN_STATE(state_name)  \
    state_name:                  \
    case State::state_name: {    \
        {                        \
            {

// Closes the scopes opened by BEGIN_STATE. Every branch of a state is
// expected to leave via a jump, `continue` or `return`, so reaching the end
// of a state trips VERIFY_NOT_REACHED().
#define END_STATE         \
    VERIFY_NOT_REACHED(); \
    break;                \
    }                     \
    }                     \
    }
180
181static inline void log_parse_error(SourceLocation const& location = SourceLocation::current())
182{
183 dbgln_if(TOKENIZER_TRACE_DEBUG, "Parse error (tokenization) {}", location);
184}
185
186Optional<u32> HTMLTokenizer::next_code_point()
187{
188 if (m_utf8_iterator == m_utf8_view.end())
189 return {};
190
191 u32 code_point;
192 // https://html.spec.whatwg.org/multipage/parsing.html#preprocessing-the-input-stream:tokenization
193 // https://infra.spec.whatwg.org/#normalize-newlines
194 if (peek_code_point(0).value_or(0) == '\r' && peek_code_point(1).value_or(0) == '\n') {
195 // replace every U+000D CR U+000A LF code point pair with a single U+000A LF code point,
196 skip(2);
197 code_point = '\n';
198 } else if (peek_code_point(0).value_or(0) == '\r') {
199 // replace every remaining U+000D CR code point with a U+000A LF code point.
200 skip(1);
201 code_point = '\n';
202 } else {
203 skip(1);
204 code_point = *m_prev_utf8_iterator;
205 }
206
207 dbgln_if(TOKENIZER_TRACE_DEBUG, "(Tokenizer) Next code_point: {}", code_point);
208 return code_point;
209}
210
211void HTMLTokenizer::skip(size_t count)
212{
213 if (!m_source_positions.is_empty())
214 m_source_positions.append(m_source_positions.last());
215 for (size_t i = 0; i < count; ++i) {
216 m_prev_utf8_iterator = m_utf8_iterator;
217 auto code_point = *m_utf8_iterator;
218 if (!m_source_positions.is_empty()) {
219 if (code_point == '\n') {
220 m_source_positions.last().column = 0;
221 m_source_positions.last().line++;
222 } else {
223 m_source_positions.last().column++;
224 }
225 }
226 ++m_utf8_iterator;
227 }
228}
229
230Optional<u32> HTMLTokenizer::peek_code_point(size_t offset) const
231{
232 auto it = m_utf8_iterator;
233 for (size_t i = 0; i < offset && it != m_utf8_view.end(); ++i)
234 ++it;
235 if (it == m_utf8_view.end())
236 return {};
237 return *it;
238}
239
240HTMLToken::Position HTMLTokenizer::nth_last_position(size_t n)
241{
242 if (n + 1 > m_source_positions.size()) {
243 dbgln_if(TOKENIZER_TRACE_DEBUG, "(Tokenizer::nth_last_position) Invalid position requested: {}th-last of {}. Returning (0-0).", n, m_source_positions.size());
244 return HTMLToken::Position { 0, 0 };
245 };
246 return m_source_positions.at(m_source_positions.size() - 1 - n);
247}
248
249Optional<HTMLToken> HTMLTokenizer::next_token()
250{
251 if (!m_source_positions.is_empty()) {
252 auto last_position = m_source_positions.last();
253 m_source_positions.clear_with_capacity();
254 m_source_positions.append(move(last_position));
255 }
256_StartOfFunction:
257 if (!m_queued_tokens.is_empty())
258 return m_queued_tokens.dequeue();
259
260 if (m_aborted)
261 return {};
262
263 for (;;) {
264 auto current_input_character = next_code_point();
265 switch (m_state) {
266 // 13.2.5.1 Data state, https://html.spec.whatwg.org/multipage/parsing.html#data-state
267 BEGIN_STATE(Data)
268 {
269 ON('&')
270 {
271 m_return_state = State::Data;
272 SWITCH_TO(CharacterReference);
273 }
274 ON('<')
275 {
276 SWITCH_TO(TagOpen);
277 }
278 ON(0)
279 {
280 log_parse_error();
281 EMIT_CURRENT_CHARACTER;
282 }
283 ON_EOF
284 {
285 EMIT_EOF;
286 }
287 ANYTHING_ELSE
288 {
289 EMIT_CURRENT_CHARACTER;
290 }
291 }
292 END_STATE
293
294 // 13.2.5.6 Tag open state, https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
295 BEGIN_STATE(TagOpen)
296 {
297 ON('!')
298 {
299 SWITCH_TO(MarkupDeclarationOpen);
300 }
301 ON('/')
302 {
303 SWITCH_TO(EndTagOpen);
304 }
305 ON_ASCII_ALPHA
306 {
307 create_new_token(HTMLToken::Type::StartTag);
308 RECONSUME_IN(TagName);
309 }
310 ON('?')
311 {
312 log_parse_error();
313 create_new_token(HTMLToken::Type::Comment);
314 m_current_token.set_start_position({}, nth_last_position(2));
315 RECONSUME_IN(BogusComment);
316 }
317 ON_EOF
318 {
319 log_parse_error();
320 m_queued_tokens.enqueue(HTMLToken::make_character('<'));
321 EMIT_EOF;
322 }
323 ANYTHING_ELSE
324 {
325 log_parse_error();
326 EMIT_CHARACTER_AND_RECONSUME_IN('<', Data);
327 }
328 }
329 END_STATE
330
331 // 13.2.5.8 Tag name state, https://html.spec.whatwg.org/multipage/parsing.html#tag-name-state
332 BEGIN_STATE(TagName)
333 {
334 ON_WHITESPACE
335 {
336 m_current_token.set_tag_name(consume_current_builder());
337 m_current_token.set_end_position({}, nth_last_position(1));
338 SWITCH_TO(BeforeAttributeName);
339 }
340 ON('/')
341 {
342 m_current_token.set_tag_name(consume_current_builder());
343 m_current_token.set_end_position({}, nth_last_position(0));
344 SWITCH_TO(SelfClosingStartTag);
345 }
346 ON('>')
347 {
348 m_current_token.set_tag_name(consume_current_builder());
349 m_current_token.set_end_position({}, nth_last_position(1));
350 SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
351 }
352 ON_ASCII_UPPER_ALPHA
353 {
354 m_current_builder.append_code_point(to_ascii_lowercase(current_input_character.value()));
355 m_current_token.set_end_position({}, nth_last_position(0));
356 continue;
357 }
358 ON(0)
359 {
360 log_parse_error();
361 m_current_builder.append_code_point(0xFFFD);
362 m_current_token.set_end_position({}, nth_last_position(0));
363 continue;
364 }
365 ON_EOF
366 {
367 log_parse_error();
368 m_current_token.set_end_position({}, nth_last_position(0));
369 EMIT_EOF;
370 }
371 ANYTHING_ELSE
372 {
373 m_current_builder.append_code_point(current_input_character.value());
374 m_current_token.set_end_position({}, nth_last_position(0));
375 continue;
376 }
377 }
378 END_STATE
379
380 // 13.2.5.7 End tag open state, https://html.spec.whatwg.org/multipage/parsing.html#end-tag-open-state
381 BEGIN_STATE(EndTagOpen)
382 {
383 ON_ASCII_ALPHA
384 {
385 create_new_token(HTMLToken::Type::EndTag);
386 RECONSUME_IN(TagName);
387 }
388 ON('>')
389 {
390 log_parse_error();
391 SWITCH_TO(Data);
392 }
393 ON_EOF
394 {
395 log_parse_error();
396 m_queued_tokens.enqueue(HTMLToken::make_character('<'));
397 m_queued_tokens.enqueue(HTMLToken::make_character('/'));
398 EMIT_EOF;
399 }
400 ANYTHING_ELSE
401 {
402 log_parse_error();
403 create_new_token(HTMLToken::Type::Comment);
404 RECONSUME_IN(BogusComment);
405 }
406 }
407 END_STATE
408
409 // 13.2.5.42 Markup declaration open state, https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state
410 BEGIN_STATE(MarkupDeclarationOpen)
411 {
412 DONT_CONSUME_NEXT_INPUT_CHARACTER;
413 if (consume_next_if_match("--"sv)) {
414 create_new_token(HTMLToken::Type::Comment);
415 m_current_token.set_start_position({}, nth_last_position(3));
416 SWITCH_TO(CommentStart);
417 }
418 if (consume_next_if_match("DOCTYPE"sv, CaseSensitivity::CaseInsensitive)) {
419 SWITCH_TO(DOCTYPE);
420 }
421 if (consume_next_if_match("[CDATA["sv)) {
422 // We keep the parser optional so that syntax highlighting can be lexer-only.
423 // The parser registers itself with the lexer it creates.
424 if (m_parser != nullptr && m_parser->adjusted_current_node().namespace_() != Namespace::HTML) {
425 SWITCH_TO(CDATASection);
426 } else {
427 create_new_token(HTMLToken::Type::Comment);
428 m_current_builder.append("[CDATA["sv);
429 SWITCH_TO_WITH_UNCLEAN_BUILDER(BogusComment);
430 }
431 }
432 ANYTHING_ELSE
433 {
434 log_parse_error();
435 create_new_token(HTMLToken::Type::Comment);
436 SWITCH_TO(BogusComment);
437 }
438 }
439 END_STATE
440
441 // 13.2.5.41 Bogus comment state, https://html.spec.whatwg.org/multipage/parsing.html#bogus-comment-state
442 BEGIN_STATE(BogusComment)
443 {
444 ON('>')
445 {
446 m_current_token.set_comment(consume_current_builder());
447 SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
448 }
449 ON_EOF
450 {
451 m_queued_tokens.enqueue(move(m_current_token));
452 EMIT_EOF;
453 }
454 ON(0)
455 {
456 log_parse_error();
457 m_current_builder.append_code_point(0xFFFD);
458 continue;
459 }
460 ANYTHING_ELSE
461 {
462 m_current_builder.append_code_point(current_input_character.value());
463 continue;
464 }
465 }
466 END_STATE
467
468 // 13.2.5.53 DOCTYPE state, https://html.spec.whatwg.org/multipage/parsing.html#doctype-state
469 BEGIN_STATE(DOCTYPE)
470 {
471 ON_WHITESPACE
472 {
473 SWITCH_TO(BeforeDOCTYPEName);
474 }
475 ON('>')
476 {
477 RECONSUME_IN(BeforeDOCTYPEName);
478 }
479 ON_EOF
480 {
481 log_parse_error();
482 create_new_token(HTMLToken::Type::DOCTYPE);
483 m_current_token.ensure_doctype_data().force_quirks = true;
484 m_queued_tokens.enqueue(move(m_current_token));
485 EMIT_EOF;
486 }
487 ANYTHING_ELSE
488 {
489 log_parse_error();
490 RECONSUME_IN(BeforeDOCTYPEName);
491 }
492 }
493 END_STATE
494
495 // 13.2.5.54 Before DOCTYPE name state, https://html.spec.whatwg.org/multipage/parsing.html#before-doctype-name-state
496 BEGIN_STATE(BeforeDOCTYPEName)
497 {
498 ON_WHITESPACE
499 {
500 continue;
501 }
502 ON_ASCII_UPPER_ALPHA
503 {
504 create_new_token(HTMLToken::Type::DOCTYPE);
505 m_current_builder.append_code_point(to_ascii_lowercase(current_input_character.value()));
506 m_current_token.ensure_doctype_data().missing_name = false;
507 SWITCH_TO_WITH_UNCLEAN_BUILDER(DOCTYPEName);
508 }
509 ON(0)
510 {
511 log_parse_error();
512 create_new_token(HTMLToken::Type::DOCTYPE);
513 m_current_builder.append_code_point(0xFFFD);
514 m_current_token.ensure_doctype_data().missing_name = false;
515 SWITCH_TO_WITH_UNCLEAN_BUILDER(DOCTYPEName);
516 }
517 ON('>')
518 {
519 log_parse_error();
520 create_new_token(HTMLToken::Type::DOCTYPE);
521 m_current_token.ensure_doctype_data().force_quirks = true;
522 SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
523 }
524 ON_EOF
525 {
526 log_parse_error();
527 create_new_token(HTMLToken::Type::DOCTYPE);
528 m_current_token.ensure_doctype_data().force_quirks = true;
529 m_queued_tokens.enqueue(move(m_current_token));
530 EMIT_EOF;
531 }
532 ANYTHING_ELSE
533 {
534 create_new_token(HTMLToken::Type::DOCTYPE);
535 m_current_builder.append_code_point(current_input_character.value());
536 m_current_token.ensure_doctype_data().missing_name = false;
537 SWITCH_TO_WITH_UNCLEAN_BUILDER(DOCTYPEName);
538 }
539 }
540 END_STATE
541
542 // 13.2.5.55 DOCTYPE name state, https://html.spec.whatwg.org/multipage/parsing.html#doctype-name-state
543 BEGIN_STATE(DOCTYPEName)
544 {
545 ON_WHITESPACE
546 {
547 m_current_token.ensure_doctype_data().name = consume_current_builder();
548 SWITCH_TO(AfterDOCTYPEName);
549 }
550 ON('>')
551 {
552 m_current_token.ensure_doctype_data().name = consume_current_builder();
553 SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
554 }
555 ON_ASCII_UPPER_ALPHA
556 {
557 m_current_builder.append_code_point(to_ascii_lowercase(current_input_character.value()));
558 continue;
559 }
560 ON(0)
561 {
562 log_parse_error();
563 m_current_builder.append_code_point(0xFFFD);
564 continue;
565 }
566 ON_EOF
567 {
568 log_parse_error();
569 m_current_token.ensure_doctype_data().force_quirks = true;
570 m_queued_tokens.enqueue(move(m_current_token));
571 EMIT_EOF;
572 }
573 ANYTHING_ELSE
574 {
575 m_current_builder.append_code_point(current_input_character.value());
576 continue;
577 }
578 }
579 END_STATE
580
581 // 13.2.5.56 After DOCTYPE name state, https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-name-state
582 BEGIN_STATE(AfterDOCTYPEName)
583 {
584 ON_WHITESPACE
585 {
586 continue;
587 }
588 ON('>')
589 {
590 SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
591 }
592 ON_EOF
593 {
594 log_parse_error();
595 m_current_token.ensure_doctype_data().force_quirks = true;
596 m_queued_tokens.enqueue(move(m_current_token));
597 EMIT_EOF;
598 }
599 ANYTHING_ELSE
600 {
601 if (to_ascii_uppercase(current_input_character.value()) == 'P' && consume_next_if_match("UBLIC"sv, CaseSensitivity::CaseInsensitive)) {
602 SWITCH_TO(AfterDOCTYPEPublicKeyword);
603 }
604 if (to_ascii_uppercase(current_input_character.value()) == 'S' && consume_next_if_match("YSTEM"sv, CaseSensitivity::CaseInsensitive)) {
605 SWITCH_TO(AfterDOCTYPESystemKeyword);
606 }
607 log_parse_error();
608 m_current_token.ensure_doctype_data().force_quirks = true;
609 RECONSUME_IN(BogusDOCTYPE);
610 }
611 }
612 END_STATE
613
614 // 13.2.5.57 After DOCTYPE public keyword state, https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-public-keyword-state
615 BEGIN_STATE(AfterDOCTYPEPublicKeyword)
616 {
617 ON_WHITESPACE
618 {
619 SWITCH_TO(BeforeDOCTYPEPublicIdentifier);
620 }
621 ON('"')
622 {
623 log_parse_error();
624 m_current_token.ensure_doctype_data().missing_public_identifier = false;
625 SWITCH_TO(DOCTYPEPublicIdentifierDoubleQuoted);
626 }
627 ON('\'')
628 {
629 log_parse_error();
630 m_current_token.ensure_doctype_data().missing_public_identifier = false;
631 SWITCH_TO(DOCTYPEPublicIdentifierSingleQuoted);
632 }
633 ON('>')
634 {
635 log_parse_error();
636 m_current_token.ensure_doctype_data().force_quirks = true;
637 SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
638 }
639 ON_EOF
640 {
641 log_parse_error();
642 m_current_token.ensure_doctype_data().force_quirks = true;
643 m_queued_tokens.enqueue(move(m_current_token));
644 EMIT_EOF;
645 }
646 ANYTHING_ELSE
647 {
648 log_parse_error();
649 m_current_token.ensure_doctype_data().force_quirks = true;
650 RECONSUME_IN(BogusDOCTYPE);
651 }
652 }
653 END_STATE
654
655 // 13.2.5.63 After DOCTYPE system keyword state, https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-system-keyword-state
656 BEGIN_STATE(AfterDOCTYPESystemKeyword)
657 {
658 ON_WHITESPACE
659 {
660 SWITCH_TO(BeforeDOCTYPESystemIdentifier);
661 }
662 ON('"')
663 {
664 log_parse_error();
665 m_current_token.ensure_doctype_data().system_identifier = {};
666 m_current_token.ensure_doctype_data().missing_system_identifier = false;
667 SWITCH_TO(DOCTYPESystemIdentifierDoubleQuoted);
668 }
669 ON('\'')
670 {
671 log_parse_error();
672 m_current_token.ensure_doctype_data().system_identifier = {};
673 m_current_token.ensure_doctype_data().missing_system_identifier = false;
674 SWITCH_TO(DOCTYPESystemIdentifierSingleQuoted);
675 }
676 ON('>')
677 {
678 log_parse_error();
679 m_current_token.ensure_doctype_data().force_quirks = true;
680 SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
681 }
682 ON_EOF
683 {
684 log_parse_error();
685 m_current_token.ensure_doctype_data().force_quirks = true;
686 m_queued_tokens.enqueue(move(m_current_token));
687 EMIT_EOF;
688 }
689 ANYTHING_ELSE
690 {
691 log_parse_error();
692 m_current_token.ensure_doctype_data().force_quirks = true;
693 RECONSUME_IN(BogusDOCTYPE);
694 }
695 }
696 END_STATE
697
698 // 13.2.5.58 Before DOCTYPE public identifier state, https://html.spec.whatwg.org/multipage/parsing.html#before-doctype-public-identifier-state
699 BEGIN_STATE(BeforeDOCTYPEPublicIdentifier)
700 {
701 ON_WHITESPACE
702 {
703 continue;
704 }
705 ON('"')
706 {
707 m_current_token.ensure_doctype_data().missing_public_identifier = false;
708 SWITCH_TO(DOCTYPEPublicIdentifierDoubleQuoted);
709 }
710 ON('\'')
711 {
712 m_current_token.ensure_doctype_data().missing_public_identifier = false;
713 SWITCH_TO(DOCTYPEPublicIdentifierSingleQuoted);
714 }
715 ON('>')
716 {
717 log_parse_error();
718 m_current_token.ensure_doctype_data().force_quirks = true;
719 SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
720 }
721 ON_EOF
722 {
723 log_parse_error();
724 m_current_token.ensure_doctype_data().force_quirks = true;
725 m_queued_tokens.enqueue(move(m_current_token));
726 EMIT_EOF;
727 }
728 ANYTHING_ELSE
729 {
730 log_parse_error();
731 m_current_token.ensure_doctype_data().force_quirks = true;
732 RECONSUME_IN(BogusDOCTYPE);
733 }
734 }
735 END_STATE
736
737 // 13.2.5.64 Before DOCTYPE system identifier state, https://html.spec.whatwg.org/multipage/parsing.html#before-doctype-system-identifier-state
738 BEGIN_STATE(BeforeDOCTYPESystemIdentifier)
739 {
740 ON_WHITESPACE
741 {
742 continue;
743 }
744 ON('"')
745 {
746 m_current_token.ensure_doctype_data().missing_system_identifier = false;
747 SWITCH_TO(DOCTYPESystemIdentifierDoubleQuoted);
748 }
749 ON('\'')
750 {
751 m_current_token.ensure_doctype_data().missing_system_identifier = false;
752 SWITCH_TO(DOCTYPESystemIdentifierSingleQuoted);
753 }
754 ON('>')
755 {
756 log_parse_error();
757 m_current_token.ensure_doctype_data().force_quirks = true;
758 SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
759 }
760 ON_EOF
761 {
762 log_parse_error();
763 m_current_token.ensure_doctype_data().force_quirks = true;
764 m_queued_tokens.enqueue(move(m_current_token));
765 EMIT_EOF;
766 }
767 ANYTHING_ELSE
768 {
769 log_parse_error();
770 m_current_token.ensure_doctype_data().force_quirks = true;
771 RECONSUME_IN(BogusDOCTYPE);
772 }
773 }
774 END_STATE
775
776 // 13.2.5.59 DOCTYPE public identifier (double-quoted) state, https://html.spec.whatwg.org/multipage/parsing.html#doctype-public-identifier-(double-quoted)-state
777 BEGIN_STATE(DOCTYPEPublicIdentifierDoubleQuoted)
778 {
779 ON('"')
780 {
781 m_current_token.ensure_doctype_data().public_identifier = consume_current_builder();
782 SWITCH_TO(AfterDOCTYPEPublicIdentifier);
783 }
784 ON(0)
785 {
786 log_parse_error();
787 m_current_builder.append_code_point(0xFFFD);
788 continue;
789 }
790 ON('>')
791 {
792 log_parse_error();
793 m_current_token.ensure_doctype_data().public_identifier = consume_current_builder();
794 m_current_token.ensure_doctype_data().force_quirks = true;
795 SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
796 }
797 ON_EOF
798 {
799 log_parse_error();
800 m_current_token.ensure_doctype_data().force_quirks = true;
801 m_queued_tokens.enqueue(move(m_current_token));
802 EMIT_EOF;
803 }
804 ANYTHING_ELSE
805 {
806 m_current_builder.append_code_point(current_input_character.value());
807 continue;
808 }
809 }
810 END_STATE
811
812 // 13.2.5.60 DOCTYPE public identifier (single-quoted) state, https://html.spec.whatwg.org/multipage/parsing.html#doctype-public-identifier-(single-quoted)-state
813 BEGIN_STATE(DOCTYPEPublicIdentifierSingleQuoted)
814 {
815 ON('\'')
816 {
817 m_current_token.ensure_doctype_data().public_identifier = consume_current_builder();
818 SWITCH_TO(AfterDOCTYPEPublicIdentifier);
819 }
820 ON(0)
821 {
822 log_parse_error();
823 m_current_builder.append_code_point(0xFFFD);
824 continue;
825 }
826 ON('>')
827 {
828 log_parse_error();
829 m_current_token.ensure_doctype_data().public_identifier = consume_current_builder();
830 m_current_token.ensure_doctype_data().force_quirks = true;
831 SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
832 }
833 ON_EOF
834 {
835 log_parse_error();
836 m_current_token.ensure_doctype_data().force_quirks = true;
837 m_queued_tokens.enqueue(move(m_current_token));
838 EMIT_EOF;
839 }
840 ANYTHING_ELSE
841 {
842 m_current_builder.append_code_point(current_input_character.value());
843 continue;
844 }
845 }
846 END_STATE
847
848 // 13.2.5.65 DOCTYPE system identifier (double-quoted) state, https://html.spec.whatwg.org/multipage/parsing.html#doctype-system-identifier-(double-quoted)-state
849 BEGIN_STATE(DOCTYPESystemIdentifierDoubleQuoted)
850 {
851 ON('"')
852 {
853 m_current_token.ensure_doctype_data().system_identifier = consume_current_builder();
854 SWITCH_TO(AfterDOCTYPESystemIdentifier);
855 }
856 ON(0)
857 {
858 log_parse_error();
859 m_current_builder.append_code_point(0xFFFD);
860 continue;
861 }
862 ON('>')
863 {
864 log_parse_error();
865 m_current_token.ensure_doctype_data().system_identifier = consume_current_builder();
866 m_current_token.ensure_doctype_data().force_quirks = true;
867 SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
868 }
869 ON_EOF
870 {
871 log_parse_error();
872 m_current_token.ensure_doctype_data().force_quirks = true;
873 m_queued_tokens.enqueue(move(m_current_token));
874 EMIT_EOF;
875 }
876 ANYTHING_ELSE
877 {
878 m_current_builder.append_code_point(current_input_character.value());
879 continue;
880 }
881 }
882 END_STATE
883
884 // 13.2.5.66 DOCTYPE system identifier (single-quoted) state, https://html.spec.whatwg.org/multipage/parsing.html#doctype-system-identifier-(single-quoted)-state
885 BEGIN_STATE(DOCTYPESystemIdentifierSingleQuoted)
886 {
887 ON('\'')
888 {
889 m_current_token.ensure_doctype_data().system_identifier = consume_current_builder();
890 SWITCH_TO(AfterDOCTYPESystemIdentifier);
891 }
892 ON(0)
893 {
894 log_parse_error();
895 m_current_builder.append_code_point(0xFFFD);
896 continue;
897 }
898 ON('>')
899 {
900 log_parse_error();
901 m_current_token.ensure_doctype_data().system_identifier = consume_current_builder();
902 m_current_token.ensure_doctype_data().force_quirks = true;
903 SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
904 }
905 ON_EOF
906 {
907 log_parse_error();
908 m_current_token.ensure_doctype_data().force_quirks = true;
909 m_queued_tokens.enqueue(move(m_current_token));
910 EMIT_EOF;
911 }
912 ANYTHING_ELSE
913 {
914 m_current_builder.append_code_point(current_input_character.value());
915 continue;
916 }
917 }
918 END_STATE
919
920 // 13.2.5.61 After DOCTYPE public identifier state, https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-public-identifier-state
921 BEGIN_STATE(AfterDOCTYPEPublicIdentifier)
922 {
923 ON_WHITESPACE
924 {
925 SWITCH_TO(BetweenDOCTYPEPublicAndSystemIdentifiers);
926 }
927 ON('>')
928 {
929 SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
930 }
931 ON('"')
932 {
933 log_parse_error();
934 m_current_token.ensure_doctype_data().missing_system_identifier = false;
935 SWITCH_TO(DOCTYPESystemIdentifierDoubleQuoted);
936 }
937 ON('\'')
938 {
939 log_parse_error();
940 m_current_token.ensure_doctype_data().missing_system_identifier = false;
941 SWITCH_TO(DOCTYPESystemIdentifierSingleQuoted);
942 }
943 ON_EOF
944 {
945 log_parse_error();
946 m_current_token.ensure_doctype_data().force_quirks = true;
947 m_queued_tokens.enqueue(move(m_current_token));
948 EMIT_EOF;
949 }
950 ANYTHING_ELSE
951 {
952 log_parse_error();
953 m_current_token.ensure_doctype_data().force_quirks = true;
954 RECONSUME_IN(BogusDOCTYPE);
955 }
956 }
957 END_STATE
958
959 // 13.2.5.62 Between DOCTYPE public and system identifiers state, https://html.spec.whatwg.org/multipage/parsing.html#between-doctype-public-and-system-identifiers-state
960 BEGIN_STATE(BetweenDOCTYPEPublicAndSystemIdentifiers)
961 {
962 ON_WHITESPACE
963 {
964 continue;
965 }
966 ON('>')
967 {
968 SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
969 }
970 ON('"')
971 {
972 m_current_token.ensure_doctype_data().missing_system_identifier = false;
973 SWITCH_TO(DOCTYPESystemIdentifierDoubleQuoted);
974 }
975 ON('\'')
976 {
977 m_current_token.ensure_doctype_data().missing_system_identifier = false;
978 SWITCH_TO(DOCTYPESystemIdentifierSingleQuoted);
979 }
980 ON_EOF
981 {
982 log_parse_error();
983 m_current_token.ensure_doctype_data().force_quirks = true;
984 m_queued_tokens.enqueue(move(m_current_token));
985 EMIT_EOF;
986 }
987 ANYTHING_ELSE
988 {
989 log_parse_error();
990 m_current_token.ensure_doctype_data().force_quirks = true;
991 RECONSUME_IN(BogusDOCTYPE);
992 }
993 }
994 END_STATE
995
996 // 13.2.5.67 After DOCTYPE system identifier state, https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-system-identifier-state
997 BEGIN_STATE(AfterDOCTYPESystemIdentifier)
998 {
999 ON_WHITESPACE
1000 {
1001 continue;
1002 }
1003 ON('>')
1004 {
1005 SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
1006 }
1007 ON_EOF
1008 {
1009 log_parse_error();
1010 m_current_token.ensure_doctype_data().force_quirks = true;
1011 m_queued_tokens.enqueue(move(m_current_token));
1012 EMIT_EOF;
1013 }
1014 ANYTHING_ELSE
1015 {
1016 log_parse_error();
1017 RECONSUME_IN(BogusDOCTYPE);
1018 }
1019 }
1020 END_STATE
1021
1022 // 13.2.5.68 Bogus DOCTYPE state, https://html.spec.whatwg.org/multipage/parsing.html#bogus-doctype-state
1023 BEGIN_STATE(BogusDOCTYPE)
1024 {
1025 ON('>')
1026 {
1027 SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
1028 }
1029 ON(0)
1030 {
1031 log_parse_error();
1032 continue;
1033 }
1034 ON_EOF
1035 {
1036 m_queued_tokens.enqueue(move(m_current_token));
1037 EMIT_EOF;
1038 }
1039 ANYTHING_ELSE
1040 {
1041 continue;
1042 }
1043 }
1044 END_STATE
1045
1046 // 13.2.5.32 Before attribute name state, https://html.spec.whatwg.org/multipage/parsing.html#before-attribute-name-state
1047 BEGIN_STATE(BeforeAttributeName)
1048 {
1049 ON_WHITESPACE
1050 {
1051 continue;
1052 }
1053 ON('/')
1054 {
1055 if (m_current_token.has_attributes())
1056 m_current_token.last_attribute().name_end_position = nth_last_position(1);
1057 RECONSUME_IN(AfterAttributeName);
1058 }
1059 ON('>')
1060 {
1061 RECONSUME_IN(AfterAttributeName);
1062 }
1063 ON_EOF
1064 {
1065 RECONSUME_IN(AfterAttributeName);
1066 }
1067 ON('=')
1068 {
1069 log_parse_error();
1070 HTMLToken::Attribute new_attribute;
1071 new_attribute.name_start_position = nth_last_position(1);
1072 m_current_builder.append_code_point(current_input_character.value());
1073 m_current_token.add_attribute(move(new_attribute));
1074 SWITCH_TO_WITH_UNCLEAN_BUILDER(AttributeName);
1075 }
1076 ANYTHING_ELSE
1077 {
1078 HTMLToken::Attribute new_attribute;
1079 new_attribute.name_start_position = nth_last_position(1);
1080 m_current_token.add_attribute(move(new_attribute));
1081 RECONSUME_IN(AttributeName);
1082 }
1083 }
1084 END_STATE
1085
1086 // 13.2.5.40 Self-closing start tag state, https://html.spec.whatwg.org/multipage/parsing.html#self-closing-start-tag-state
1087 BEGIN_STATE(SelfClosingStartTag)
1088 {
1089 ON('>')
1090 {
1091 m_current_token.set_self_closing(true);
1092 SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
1093 }
1094 ON_EOF
1095 {
1096 log_parse_error();
1097 EMIT_EOF;
1098 }
1099 ANYTHING_ELSE
1100 {
1101 log_parse_error();
1102 RECONSUME_IN(BeforeAttributeName);
1103 }
1104 }
1105 END_STATE
1106
1107 // 13.2.5.33 Attribute name state, https://html.spec.whatwg.org/multipage/parsing.html#attribute-name-state
// Collects the attribute name in m_current_builder; the terminating branches store it as the
// local_name of the token's last attribute.
1108 BEGIN_STATE(AttributeName)
1109 {
// Whitespace, '/', '>' and EOF all end the name the same way: store it, then let
// AfterAttributeName look at the same character again (RECONSUME_IN does not consume).
1110 ON_WHITESPACE
1111 {
1112 m_current_token.last_attribute().local_name = consume_current_builder();
1113 RECONSUME_IN(AfterAttributeName);
1114 }
1115 ON('/')
1116 {
1117 m_current_token.last_attribute().local_name = consume_current_builder();
1118 RECONSUME_IN(AfterAttributeName);
1119 }
1120 ON('>')
1121 {
1122 m_current_token.last_attribute().local_name = consume_current_builder();
1123 RECONSUME_IN(AfterAttributeName);
1124 }
1125 ON_EOF
1126 {
1127 m_current_token.last_attribute().local_name = consume_current_builder();
1128 RECONSUME_IN(AfterAttributeName);
1129 }
1130 ON('=')
1131 {
// '=' is consumed in this state, so the end position of the name is recorded here.
// The builder is taken before switching because SWITCH_TO VERIFYs that m_current_builder is empty.
1132 m_current_token.last_attribute().name_end_position = nth_last_position(1);
1133 m_current_token.last_attribute().local_name = consume_current_builder();
1134 SWITCH_TO(BeforeAttributeValue);
1135 }
1136 ON_ASCII_UPPER_ALPHA
1137 {
// Names are stored lower-cased.
1138 m_current_builder.append_code_point(to_ascii_lowercase(current_input_character.value()));
1139 continue;
1140 }
1141 ON(0)
1142 {
// Parse error; U+FFFD is stored in place of the NUL.
1143 log_parse_error();
1144 m_current_builder.append_code_point(0xFFFD);
1145 continue;
1146 }
// '"', '\'' and '<' are parse errors but are still appended to the name: each branch logs
// and then jumps to the shared append code under the AnythingElseAttributeName label.
1147 ON('"')
1148 {
1149 log_parse_error();
1150 goto AnythingElseAttributeName;
1151 }
1152 ON('\'')
1153 {
1154 log_parse_error();
1155 goto AnythingElseAttributeName;
1156 }
1157 ON('<')
1158 {
1159 log_parse_error();
1160 goto AnythingElseAttributeName;
1161 }
1162 ANYTHING_ELSE
1163 {
1164 AnythingElseAttributeName:
1165 m_current_builder.append_code_point(current_input_character.value());
1166 continue;
1167 }
1168 }
1169 END_STATE
1170
1171 // 13.2.5.34 After attribute name state, https://html.spec.whatwg.org/multipage/parsing.html#after-attribute-name-state
// Decides what follows a finished attribute name: a value ('='), the end of the tag, or another attribute.
1172 BEGIN_STATE(AfterAttributeName)
1173 {
1174 ON_WHITESPACE
1175 {
// Whitespace after the name is skipped.
1176 continue;
1177 }
1178 ON('/')
1179 {
1180 SWITCH_TO(SelfClosingStartTag);
1181 }
1182 ON('=')
1183 {
1184 m_current_token.last_attribute().name_end_position = nth_last_position(1);
1185 SWITCH_TO(BeforeAttributeValue);
1186 }
1187 ON('>')
1188 {
1189 SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
1190 }
1191 ON_EOF
1192 {
1193 log_parse_error();
1194 EMIT_EOF;
1195 }
1196 ANYTHING_ELSE
1197 {
// Any other character begins a new attribute. Its start position is only recorded when
// m_source_positions has an entry to take it from.
// NOTE(review): BeforeAttributeName uses nth_last_position(1) for the same field while this
// branch uses m_source_positions.last() - confirm the difference is intended.
1198 m_current_token.add_attribute({});
1199 if (!m_source_positions.is_empty())
1200 m_current_token.last_attribute().name_start_position = m_source_positions.last();
1201 RECONSUME_IN(AttributeName);
1202 }
1203 }
1204 END_STATE
1205
1206 // 13.2.5.35 Before attribute value state, https://html.spec.whatwg.org/multipage/parsing.html#before-attribute-value-state
// Skips whitespace after '=' and picks the quoted or unquoted attribute value state.
1207 BEGIN_STATE(BeforeAttributeValue)
1208 {
// Executed before any branch is tested.
// NOTE(review): presumably re-run for every whitespace character skipped below, so the
// position recorded last wins - confirm against the enclosing loop.
1209 m_current_token.last_attribute().value_start_position = nth_last_position(1);
1210 ON_WHITESPACE
1211 {
1212 continue;
1213 }
1214 ON('"')
1215 {
1216 SWITCH_TO(AttributeValueDoubleQuoted);
1217 }
1218 ON('\'')
1219 {
1220 SWITCH_TO(AttributeValueSingleQuoted);
1221 }
1222 ON('>')
1223 {
// A '>' where a value was expected is a parse error; the token is emitted without a value.
1224 log_parse_error();
1225 SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
1226 }
1227 ANYTHING_ELSE
1228 {
// The character belongs to an unquoted value, so it is reconsumed rather than dropped.
1229 RECONSUME_IN(AttributeValueUnquoted);
1230 }
1231 }
1232 END_STATE
1233
1234 // 13.2.5.36 Attribute value (double-quoted) state, https://html.spec.whatwg.org/multipage/parsing.html#attribute-value-(double-quoted)-state
// Accumulates the value in m_current_builder until the closing '"'.
1235 BEGIN_STATE(AttributeValueDoubleQuoted)
1236 {
1237 ON('"')
1238 {
1239 m_current_token.last_attribute().value = consume_current_builder();
1240 SWITCH_TO(AfterAttributeValueQuoted);
1241 }
1242 ON('&')
1243 {
// Remember this state so the character reference states can return to it. The builder still
// holds the partial value, hence the variant without the "builder is empty" VERIFY.
1244 m_return_state = State::AttributeValueDoubleQuoted;
1245 SWITCH_TO_WITH_UNCLEAN_BUILDER(CharacterReference);
1246 }
1247 ON(0)
1248 {
// Parse error; U+FFFD is stored in place of the NUL.
1249 log_parse_error();
1250 m_current_builder.append_code_point(0xFFFD);
1251 continue;
1252 }
1253 ON_EOF
1254 {
1255 log_parse_error();
1256 EMIT_EOF;
1257 }
1258 ANYTHING_ELSE
1259 {
1260 m_current_builder.append_code_point(current_input_character.value());
1261 continue;
1262 }
1263 }
1264 END_STATE
1265
1266 // 13.2.5.37 Attribute value (single-quoted) state, https://html.spec.whatwg.org/multipage/parsing.html#attribute-value-(single-quoted)-state
// Accumulates the value in m_current_builder until the closing '\''.
1267 BEGIN_STATE(AttributeValueSingleQuoted)
1268 {
1269 ON('\'')
1270 {
1271 m_current_token.last_attribute().value = consume_current_builder();
1272 SWITCH_TO(AfterAttributeValueQuoted);
1273 }
1274 ON('&')
1275 {
// Remember this state so the character reference states can return to it. The builder still
// holds the partial value, hence the variant without the "builder is empty" VERIFY.
1276 m_return_state = State::AttributeValueSingleQuoted;
1277 SWITCH_TO_WITH_UNCLEAN_BUILDER(CharacterReference);
1278 }
1279 ON(0)
1280 {
// Parse error; U+FFFD is stored in place of the NUL.
1281 log_parse_error();
1282 m_current_builder.append_code_point(0xFFFD);
1283 continue;
1284 }
1285 ON_EOF
1286 {
1287 log_parse_error();
1288 EMIT_EOF;
1289 }
1290 ANYTHING_ELSE
1291 {
1292 m_current_builder.append_code_point(current_input_character.value());
1293 continue;
1294 }
1295 }
1296 END_STATE
1297
1298 // 13.2.5.38 Attribute value (unquoted) state, https://html.spec.whatwg.org/multipage/parsing.html#attribute-value-(unquoted)-state
// Accumulates an unquoted value in m_current_builder; whitespace or '>' ends it.
1299 BEGIN_STATE(AttributeValueUnquoted)
1300 {
1301 ON_WHITESPACE
1302 {
// End of the value: store it together with its end position, then look for the next attribute.
1303 m_current_token.last_attribute().value = consume_current_builder();
1304 m_current_token.last_attribute().value_end_position = nth_last_position(1);
1305 SWITCH_TO(BeforeAttributeName);
1306 }
1307 ON('&')
1308 {
// Remember this state so the character reference states can return to it. The builder still
// holds the partial value, hence the variant without the "builder is empty" VERIFY.
1309 m_return_state = State::AttributeValueUnquoted;
1310 SWITCH_TO_WITH_UNCLEAN_BUILDER(CharacterReference);
1311 }
1312 ON('>')
1313 {
1314 m_current_token.last_attribute().value = consume_current_builder();
1315 m_current_token.last_attribute().value_end_position = nth_last_position(1);
1316 SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
1317 }
1318 ON(0)
1319 {
// Parse error; U+FFFD is stored in place of the NUL.
1320 log_parse_error();
1321 m_current_builder.append_code_point(0xFFFD);
1322 continue;
1323 }
// '"', '\'', '<', '=' and '`' are parse errors but still become part of the value: each branch
// logs and then jumps to the shared append code under AnythingElseAttributeValueUnquoted.
1324 ON('"')
1325 {
1326 log_parse_error();
1327 goto AnythingElseAttributeValueUnquoted;
1328 }
1329 ON('\'')
1330 {
1331 log_parse_error();
1332 goto AnythingElseAttributeValueUnquoted;
1333 }
1334 ON('<')
1335 {
1336 log_parse_error();
1337 goto AnythingElseAttributeValueUnquoted;
1338 }
1339 ON('=')
1340 {
1341 log_parse_error();
1342 goto AnythingElseAttributeValueUnquoted;
1343 }
1344 ON('`')
1345 {
1346 log_parse_error();
1347 goto AnythingElseAttributeValueUnquoted;
1348 }
1349 ON_EOF
1350 {
1351 log_parse_error();
1352 EMIT_EOF;
1353 }
1354 ANYTHING_ELSE
1355 {
1356 AnythingElseAttributeValueUnquoted:
1357 m_current_builder.append_code_point(current_input_character.value());
1358 continue;
1359 }
1360 }
1361 END_STATE
1362
1363 // 13.2.5.39 After attribute value (quoted) state, https://html.spec.whatwg.org/multipage/parsing.html#after-attribute-value-(quoted)-state
// Handles the character that follows the closing quote of an attribute value.
1364 BEGIN_STATE(AfterAttributeValueQuoted)
1365 {
// Recorded before any branch is tested, whichever branch ends up being taken.
1366 m_current_token.last_attribute().value_end_position = nth_last_position(1);
1367 ON_WHITESPACE
1368 {
1369 SWITCH_TO(BeforeAttributeName);
1370 }
1371 ON('/')
1372 {
1373 SWITCH_TO(SelfClosingStartTag);
1374 }
1375 ON('>')
1376 {
1377 SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
1378 }
1379 ON_EOF
1380 {
1381 log_parse_error();
1382 EMIT_EOF;
1383 }
1384 ANYTHING_ELSE
1385 {
// Something directly after the closing quote is a parse error; the character is not
// consumed and is processed again by BeforeAttributeName.
1386 log_parse_error();
1387 RECONSUME_IN(BeforeAttributeName);
1388 }
1389 }
1390 END_STATE
1391
1392 // 13.2.5.43 Comment start state, https://html.spec.whatwg.org/multipage/parsing.html#comment-start-state
// First state of a comment: looks for an immediate '-' or '>' before falling through to Comment.
1393 BEGIN_STATE(CommentStart)
1394 {
1395 ON('-')
1396 {
1397 SWITCH_TO(CommentStartDash);
1398 }
1399 ON('>')
1400 {
// The comment is closed right away; this is logged as a parse error and the token is emitted.
1401 log_parse_error();
1402 SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
1403 }
1404 ANYTHING_ELSE
1405 {
// No EOF branch here: everything else, EOF included, is handed to the Comment state unconsumed.
1406 RECONSUME_IN(Comment);
1407 }
1408 }
1409 END_STATE
1410
1411 // 13.2.5.44 Comment start dash state, https://html.spec.whatwg.org/multipage/parsing.html#comment-start-dash-state
// Reached after a '-' in CommentStart.
1412 BEGIN_STATE(CommentStartDash)
1413 {
1414 ON('-')
1415 {
1416 SWITCH_TO(CommentEnd);
1417 }
1418 ON('>')
1419 {
1420 log_parse_error();
1421 SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
1422 }
1423 ON_EOF
1424 {
1425 log_parse_error();
1426 EMIT_EOF;
1427 }
1428 ANYTHING_ELSE
1429 {
// The '-' consumed by CommentStart turned out to be comment text, so it is added to the
// builder now; the current character is left for the Comment state.
1430 m_current_builder.append('-');
1431 RECONSUME_IN(Comment);
1432 }
1433 }
1434 END_STATE
1435
1436 // 13.2.5.45 Comment state, https://html.spec.whatwg.org/multipage/parsing.html#comment-state
// Main comment loop: appends characters to m_current_builder and watches for '<' and '-'.
1437 BEGIN_STATE(Comment)
1438 {
1439 ON('<')
1440 {
// '<' is kept as comment text. The builder is non-empty from here on, hence the switch
// variant without the "builder is empty" VERIFY.
1441 m_current_builder.append_code_point(current_input_character.value());
1442 SWITCH_TO_WITH_UNCLEAN_BUILDER(CommentLessThanSign);
1443 }
1444 ON('-')
1445 {
// The '-' is not appended here; CommentEndDash adds it back if it does not start "--".
1446 SWITCH_TO_WITH_UNCLEAN_BUILDER(CommentEndDash);
1447 }
1448 ON(0)
1449 {
// Parse error; U+FFFD is stored in place of the NUL.
1450 log_parse_error();
1451 m_current_builder.append_code_point(0xFFFD);
1452 continue;
1453 }
1454 ON_EOF
1455 {
// Store what was collected so far on the current token before EMIT_EOF runs.
1456 log_parse_error();
1457 m_current_token.set_comment(consume_current_builder());
1458 EMIT_EOF;
1459 }
1460 ANYTHING_ELSE
1461 {
1462 m_current_builder.append_code_point(current_input_character.value());
1463 continue;
1464 }
1465 }
1466 END_STATE
1467
1468 // 13.2.5.51 Comment end state, https://html.spec.whatwg.org/multipage/parsing.html#comment-end-state
// Reached after "--" inside a comment; '>' finishes the comment.
1469 BEGIN_STATE(CommentEnd)
1470 {
1471 ON('>')
1472 {
// The builder is moved into the token first: SWITCH_TO_AND_EMIT_CURRENT_TOKEN VERIFYs it is empty.
1473 m_current_token.set_comment(consume_current_builder());
1474 SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
1475 }
1476 ON('!')
1477 {
1478 SWITCH_TO_WITH_UNCLEAN_BUILDER(CommentEndBang);
1479 }
1480 ON('-')
1481 {
// A further '-' is comment text; stay in this state.
1482 m_current_builder.append('-');
1483 continue;
1484 }
1485 ON_EOF
1486 {
1487 log_parse_error();
1488 m_current_token.set_comment(consume_current_builder());
1489 EMIT_EOF;
1490 }
1491 ANYTHING_ELSE
1492 {
// The "--" did not close the comment, so both dashes are put into the text and the
// current character is processed again by Comment.
1493 m_current_builder.append("--"sv);
1494 RECONSUME_IN(Comment);
1495 }
1496 }
1497 END_STATE
1498
1499 // 13.2.5.52 Comment end bang state, https://html.spec.whatwg.org/multipage/parsing.html#comment-end-bang-state
// Reached after "--!" inside a comment.
1500 BEGIN_STATE(CommentEndBang)
1501 {
1502 ON('-')
1503 {
// "--!" becomes comment text; the new '-' may begin another terminator.
1504 m_current_builder.append("--!"sv);
1505 SWITCH_TO_WITH_UNCLEAN_BUILDER(CommentEndDash);
1506 }
1507 ON('>')
1508 {
// "--!>" ends the comment but is logged as a parse error.
1509 log_parse_error();
1510 m_current_token.set_comment(consume_current_builder());
1511 SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
1512 }
1513 ON_EOF
1514 {
1515 log_parse_error();
1516 m_current_token.set_comment(consume_current_builder());
1517 EMIT_EOF;
1518 }
1519 ANYTHING_ELSE
1520 {
1521 m_current_builder.append("--!"sv);
1522 RECONSUME_IN(Comment);
1523 }
1524 }
1525 END_STATE
1526
1527 // 13.2.5.50 Comment end dash state, https://html.spec.whatwg.org/multipage/parsing.html#comment-end-dash-state
// Reached after a single '-' inside a comment.
1528 BEGIN_STATE(CommentEndDash)
1529 {
1530 ON('-')
1531 {
1532 SWITCH_TO_WITH_UNCLEAN_BUILDER(CommentEnd);
1533 }
1534 ON_EOF
1535 {
1536 log_parse_error();
1537 m_current_token.set_comment(consume_current_builder());
1538 EMIT_EOF;
1539 }
1540 ANYTHING_ELSE
1541 {
// The lone '-' was ordinary comment text: add it and let Comment process the current character.
1542 m_current_builder.append('-');
1543 RECONSUME_IN(Comment);
1544 }
1545 }
1546 END_STATE
1547
1548 // 13.2.5.46 Comment less-than sign state, https://html.spec.whatwg.org/multipage/parsing.html#comment-less-than-sign-state
// Reached after a '<' inside a comment (the '<' itself was already appended by Comment).
1549 BEGIN_STATE(CommentLessThanSign)
1550 {
1551 ON('!')
1552 {
1553 m_current_builder.append_code_point(current_input_character.value());
1554 SWITCH_TO_WITH_UNCLEAN_BUILDER(CommentLessThanSignBang);
1555 }
1556 ON('<')
1557 {
// Repeated '<' characters are appended while staying in this state.
1558 m_current_builder.append_code_point(current_input_character.value());
1559 continue;
1560 }
1561 ANYTHING_ELSE
1562 {
1563 RECONSUME_IN(Comment);
1564 }
1565 }
1566 END_STATE
1567
1568 // 13.2.5.47 Comment less-than sign bang state, https://html.spec.whatwg.org/multipage/parsing.html#comment-less-than-sign-bang-state
// Reached after "<!" inside a comment; only a following '-' continues the sequence.
1569 BEGIN_STATE(CommentLessThanSignBang)
1570 {
1571 ON('-')
1572 {
1573 SWITCH_TO_WITH_UNCLEAN_BUILDER(CommentLessThanSignBangDash);
1574 }
1575 ANYTHING_ELSE
1576 {
1577 RECONSUME_IN(Comment);
1578 }
1579 }
1580 END_STATE
1581
1582 // 13.2.5.48 Comment less-than sign bang dash state, https://html.spec.whatwg.org/multipage/parsing.html#comment-less-than-sign-bang-dash-state
// Reached after "<!-" inside a comment.
1583 BEGIN_STATE(CommentLessThanSignBangDash)
1584 {
1585 ON('-')
1586 {
1587 SWITCH_TO_WITH_UNCLEAN_BUILDER(CommentLessThanSignBangDashDash);
1588 }
1589 ANYTHING_ELSE
1590 {
// The '-' already seen is treated like any single dash: CommentEndDash handles the current character.
1591 RECONSUME_IN(CommentEndDash);
1592 }
1593 }
1594 END_STATE
1595
1596 // 13.2.5.49 Comment less-than sign bang dash dash state, https://html.spec.whatwg.org/multipage/parsing.html#comment-less-than-sign-bang-dash-dash-state
// Reached after "<!--" inside a comment. Every branch continues in CommentEnd without consuming;
// only the characters other than '>' and EOF are logged as a parse error first.
1597 BEGIN_STATE(CommentLessThanSignBangDashDash)
1598 {
1599 ON('>')
1600 {
1601 RECONSUME_IN(CommentEnd);
1602 }
1603 ON_EOF
1604 {
1605 RECONSUME_IN(CommentEnd);
1606 }
1607 ANYTHING_ELSE
1608 {
1609 log_parse_error();
1610 RECONSUME_IN(CommentEnd);
1611 }
1612 }
1613 END_STATE
1614
1615 // 13.2.5.72 Character reference state, https://html.spec.whatwg.org/multipage/parsing.html#character-reference-state
// Entry point for "&...": callers set m_return_state before switching here.
1616 BEGIN_STATE(CharacterReference)
1617 {
// The temporary buffer starts over with just the '&' that led here.
1618 m_temporary_buffer.clear();
1619 m_temporary_buffer.append('&');
1620
1621 ON_ASCII_ALPHANUMERIC
1622 {
1623 RECONSUME_IN(NamedCharacterReference);
1624 }
1625 ON('#')
1626 {
1627 m_temporary_buffer.append(current_input_character.value());
1628 SWITCH_TO_WITH_UNCLEAN_BUILDER(NumericCharacterReference);
1629 }
1630 ANYTHING_ELSE
1631 {
// Not a reference after all: flush the buffered '&' and let the return state
// process the current character again.
1632 FLUSH_CODEPOINTS_CONSUMED_AS_A_CHARACTER_REFERENCE;
1633 RECONSUME_IN_RETURN_STATE;
1634 }
1635 }
1636 END_STATE
1637
1638 // 13.2.5.73 Named character reference state, https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state
1639 BEGIN_STATE(NamedCharacterReference)
1640 {
1641 size_t byte_offset = m_utf8_view.byte_offset_of(m_prev_utf8_iterator);
1642
1643 auto match = HTML::code_points_from_entity(m_decoded_input.substring_view(byte_offset, m_decoded_input.length() - byte_offset));
1644
1645 if (match.has_value()) {
1646 skip(match->entity.length() - 1);
1647 for (auto ch : match.value().entity)
1648 m_temporary_buffer.append(ch);
1649
1650 if (consumed_as_part_of_an_attribute() && !match.value().entity.ends_with(';')) {
1651 auto next_code_point = peek_code_point(0);
1652 if (next_code_point.has_value() && (next_code_point.value() == '=' || is_ascii_alphanumeric(next_code_point.value()))) {
1653 FLUSH_CODEPOINTS_CONSUMED_AS_A_CHARACTER_REFERENCE;
1654 SWITCH_TO_RETURN_STATE;
1655 }
1656 }
1657
1658 if (!match.value().entity.ends_with(';')) {
1659 log_parse_error();
1660 }
1661
1662 m_temporary_buffer = match.value().code_points;
1663
1664 FLUSH_CODEPOINTS_CONSUMED_AS_A_CHARACTER_REFERENCE;
1665 SWITCH_TO_RETURN_STATE;
1666 } else {
1667 FLUSH_CODEPOINTS_CONSUMED_AS_A_CHARACTER_REFERENCE;
1668 // FIXME: This should be SWITCH_TO, but we always lose the first character on this path, so just reconsume it.
1669 // I can't wrap my head around how to do it as the spec says.
1670 RECONSUME_IN(AmbiguousAmpersand);
1671 }
1672 }
1673 END_STATE
1674
1675 // 13.2.5.74 Ambiguous ampersand state, https://html.spec.whatwg.org/multipage/parsing.html#ambiguous-ampersand-state
// Reached from NamedCharacterReference when no entity matched.
1676 BEGIN_STATE(AmbiguousAmpersand)
1677 {
1678 ON_ASCII_ALPHANUMERIC
1679 {
// Alphanumerics are passed through: into the builder when inside an attribute,
// otherwise via EMIT_CURRENT_CHARACTER.
1680 if (consumed_as_part_of_an_attribute()) {
1681 m_current_builder.append_code_point(current_input_character.value());
1682 continue;
1683 } else {
1684 EMIT_CURRENT_CHARACTER;
1685 }
1686 }
1687 ON(';')
1688 {
// A ';' after an unknown name is a parse error; the ';' itself is left for the return state.
1689 log_parse_error();
1690 RECONSUME_IN_RETURN_STATE;
1691 }
1692 ANYTHING_ELSE
1693 {
1694 RECONSUME_IN_RETURN_STATE;
1695 }
1696 }
1697 END_STATE
1698
1699 // 13.2.5.75 Numeric character reference state, https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-state
// Reached after "&#": resets the accumulator and picks the hexadecimal or decimal path.
1700 BEGIN_STATE(NumericCharacterReference)
1701 {
1702 m_character_reference_code = 0;
1703
// 'X' and 'x' are buffered and select the hexadecimal path.
1704 ON('X')
1705 {
1706 m_temporary_buffer.append(current_input_character.value());
1707 SWITCH_TO_WITH_UNCLEAN_BUILDER(HexadecimalCharacterReferenceStart);
1708 }
1709 ON('x')
1710 {
1711 m_temporary_buffer.append(current_input_character.value());
1712 SWITCH_TO_WITH_UNCLEAN_BUILDER(HexadecimalCharacterReferenceStart);
1713 }
1714 ANYTHING_ELSE
1715 {
1716 RECONSUME_IN(DecimalCharacterReferenceStart);
1717 }
1718 }
1719 END_STATE
1720
1721 // 13.2.5.76 Hexadecimal character reference start state, https://html.spec.whatwg.org/multipage/parsing.html#hexadecimal-character-reference-start-state
// Requires at least one hex digit after "&#x" / "&#X".
1722 BEGIN_STATE(HexadecimalCharacterReferenceStart)
1723 {
1724 ON_ASCII_HEX_DIGIT
1725 {
1726 RECONSUME_IN(HexadecimalCharacterReference);
1727 }
1728 ANYTHING_ELSE
1729 {
// No digit: parse error, flush the buffered text and let the return state process
// the current character again.
1730 log_parse_error();
1731 FLUSH_CODEPOINTS_CONSUMED_AS_A_CHARACTER_REFERENCE;
1732 RECONSUME_IN_RETURN_STATE;
1733 }
1734 }
1735 END_STATE
1736
1737 // 13.2.5.77 Decimal character reference start state, https://html.spec.whatwg.org/multipage/parsing.html#decimal-character-reference-start-state
// Requires at least one decimal digit after "&#".
1738 BEGIN_STATE(DecimalCharacterReferenceStart)
1739 {
1740 ON_ASCII_DIGIT
1741 {
1742 RECONSUME_IN(DecimalCharacterReference);
1743 }
1744 ANYTHING_ELSE
1745 {
// No digit: parse error, flush the buffered text and let the return state process
// the current character again.
1746 log_parse_error();
1747 FLUSH_CODEPOINTS_CONSUMED_AS_A_CHARACTER_REFERENCE;
1748 RECONSUME_IN_RETURN_STATE;
1749 }
1750 }
1751 END_STATE
1752
1753 // 13.2.5.78 Hexadecimal character reference state, https://html.spec.whatwg.org/multipage/parsing.html#decimal-character-reference-start-state
1754 BEGIN_STATE(HexadecimalCharacterReference)
1755 {
1756 ON_ASCII_DIGIT
1757 {
1758 m_character_reference_code *= 16;
1759 m_character_reference_code += current_input_character.value() - 0x30;
1760 continue;
1761 }
1762 ON_ASCII_UPPER_ALPHA
1763 {
1764 m_character_reference_code *= 16;
1765 m_character_reference_code += current_input_character.value() - 0x37;
1766 continue;
1767 }
1768 ON_ASCII_LOWER_ALPHA
1769 {
1770 m_character_reference_code *= 16;
1771 m_character_reference_code += current_input_character.value() - 0x57;
1772 continue;
1773 }
1774 ON(';')
1775 {
1776 SWITCH_TO_WITH_UNCLEAN_BUILDER(NumericCharacterReferenceEnd);
1777 }
1778 ANYTHING_ELSE
1779 {
1780 log_parse_error();
1781 RECONSUME_IN(NumericCharacterReferenceEnd);
1782 }
1783 }
1784 END_STATE
1785
1786 // 13.2.5.79 Decimal character reference state, https://html.spec.whatwg.org/multipage/parsing.html#decimal-character-reference-state
1787 BEGIN_STATE(DecimalCharacterReference)
1788 {
1789 ON_ASCII_DIGIT
1790 {
1791 m_character_reference_code *= 10;
1792 m_character_reference_code += current_input_character.value() - 0x30;
1793 continue;
1794 }
1795 ON(';')
1796 {
1797 SWITCH_TO_WITH_UNCLEAN_BUILDER(NumericCharacterReferenceEnd);
1798 }
1799 ANYTHING_ELSE
1800 {
1801 log_parse_error();
1802 RECONSUME_IN(NumericCharacterReferenceEnd);
1803 }
1804 }
1805 END_STATE
1806
1807 // 13.2.5.80 Numeric character reference end state, https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-end-state
1808 BEGIN_STATE(NumericCharacterReferenceEnd)
1809 {
1810 DONT_CONSUME_NEXT_INPUT_CHARACTER;
1811
1812 if (m_character_reference_code == 0) {
1813 log_parse_error();
1814 m_character_reference_code = 0xFFFD;
1815 }
1816 if (m_character_reference_code > 0x10ffff) {
1817 log_parse_error();
1818 m_character_reference_code = 0xFFFD;
1819 }
1820 if (is_unicode_surrogate(m_character_reference_code)) {
1821 log_parse_error();
1822 m_character_reference_code = 0xFFFD;
1823 }
1824 if (is_unicode_noncharacter(m_character_reference_code)) {
1825 log_parse_error();
1826 }
1827 if (m_character_reference_code == 0xd || (is_unicode_control(m_character_reference_code) && !is_ascii_space(m_character_reference_code))) {
1828 log_parse_error();
1829 constexpr struct {
1830 u32 number;
1831 u32 code_point;
1832 } conversion_table[] = {
1833 { 0x80, 0x20AC },
1834 { 0x82, 0x201A },
1835 { 0x83, 0x0192 },
1836 { 0x84, 0x201E },
1837 { 0x85, 0x2026 },
1838 { 0x86, 0x2020 },
1839 { 0x87, 0x2021 },
1840 { 0x88, 0x02C6 },
1841 { 0x89, 0x2030 },
1842 { 0x8A, 0x0160 },
1843 { 0x8B, 0x2039 },
1844 { 0x8C, 0x0152 },
1845 { 0x8E, 0x017D },
1846 { 0x91, 0x2018 },
1847 { 0x92, 0x2019 },
1848 { 0x93, 0x201C },
1849 { 0x94, 0x201D },
1850 { 0x95, 0x2022 },
1851 { 0x96, 0x2013 },
1852 { 0x97, 0x2014 },
1853 { 0x98, 0x02DC },
1854 { 0x99, 0x2122 },
1855 { 0x9A, 0x0161 },
1856 { 0x9B, 0x203A },
1857 { 0x9C, 0x0153 },
1858 { 0x9E, 0x017E },
1859 { 0x9F, 0x0178 },
1860 };
1861 for (auto& entry : conversion_table) {
1862 if (m_character_reference_code == entry.number) {
1863 m_character_reference_code = entry.code_point;
1864 break;
1865 }
1866 }
1867 }
1868
1869 m_temporary_buffer.clear();
1870 m_temporary_buffer.append(m_character_reference_code);
1871 FLUSH_CODEPOINTS_CONSUMED_AS_A_CHARACTER_REFERENCE;
1872 SWITCH_TO_RETURN_STATE;
1873 }
1874 END_STATE
1875
1876 // 13.2.5.2 RCDATA state, https://html.spec.whatwg.org/multipage/parsing.html#rcdata-state
// Text mode in which '&' starts a character reference and '<' may start an end tag.
1877 BEGIN_STATE(RCDATA)
1878 {
1879 ON('&')
1880 {
// Record where the character reference states should come back to.
1881 m_return_state = State::RCDATA;
1882 SWITCH_TO(CharacterReference);
1883 }
1884 ON('<')
1885 {
1886 SWITCH_TO(RCDATALessThanSign);
1887 }
1888 ON(0)
1889 {
// Parse error; U+FFFD is emitted in place of the NUL.
1890 log_parse_error();
1891 EMIT_CHARACTER(0xFFFD);
1892 }
1893 ON_EOF
1894 {
1895 EMIT_EOF;
1896 }
1897 ANYTHING_ELSE
1898 {
1899 EMIT_CURRENT_CHARACTER;
1900 }
1901 }
1902 END_STATE
1903
1904 // 13.2.5.9 RCDATA less-than sign state, https://html.spec.whatwg.org/multipage/parsing.html#rcdata-less-than-sign-state
// Reached after '<' in RCDATA.
1905 BEGIN_STATE(RCDATALessThanSign)
1906 {
1907 ON('/')
1908 {
// The temporary buffer is emptied here; RCDATAEndTagName then collects the tag name letters in it.
1909 m_temporary_buffer.clear();
1910 SWITCH_TO(RCDATAEndTagOpen);
1911 }
1912 ANYTHING_ELSE
1913 {
// The '<' was plain text.
1914 EMIT_CHARACTER_AND_RECONSUME_IN('<', RCDATA);
1915 }
1916 }
1917 END_STATE
1918
1919 // 13.2.5.10 RCDATA end tag open state, https://html.spec.whatwg.org/multipage/parsing.html#rcdata-end-tag-open-state
// Reached after "</" in RCDATA.
1920 BEGIN_STATE(RCDATAEndTagOpen)
1921 {
1922 ON_ASCII_ALPHA
1923 {
// A letter may start an end tag: create the token and let RCDATAEndTagName read the letter.
1924 create_new_token(HTMLToken::Type::EndTag);
1925 RECONSUME_IN(RCDATAEndTagName);
1926 }
1927 ANYTHING_ELSE
1928 {
// Not a tag: "</" is queued as two character tokens and the current character goes back to RCDATA.
1929 m_queued_tokens.enqueue(HTMLToken::make_character('<'));
1930 m_queued_tokens.enqueue(HTMLToken::make_character('/'));
1931 RECONSUME_IN(RCDATA);
1932 }
1933 }
1934 END_STATE
1935
1936 // 13.2.5.11 RCDATA end tag name state, https://html.spec.whatwg.org/multipage/parsing.html#rcdata-end-tag-name-state
// Reads a candidate end tag name. The lower-cased name goes to m_current_builder while
// m_temporary_buffer keeps the letters exactly as written, so they can be queued as text
// if this turns out not to be an appropriate end tag.
1937 BEGIN_STATE(RCDATAEndTagName)
1938 {
// Whitespace, '/' and '>' each finish the name. If the end tag is not appropriate, "</" plus the
// buffered letters are queued as character tokens and RCDATA reprocesses the current character.
// RECONSUME_IN ends in a goto, so the statement after each `if` only runs for an appropriate tag.
1939 ON_WHITESPACE
1940 {
1941 m_current_token.set_tag_name(consume_current_builder());
1942 if (!current_end_tag_token_is_appropriate()) {
1943 m_queued_tokens.enqueue(HTMLToken::make_character('<'));
1944 m_queued_tokens.enqueue(HTMLToken::make_character('/'));
1945 for (auto code_point : m_temporary_buffer)
1946 m_queued_tokens.enqueue(HTMLToken::make_character(code_point));
1947 RECONSUME_IN(RCDATA);
1948 }
1949 SWITCH_TO(BeforeAttributeName);
1950 }
1951 ON('/')
1952 {
1953 m_current_token.set_tag_name(consume_current_builder());
1954 if (!current_end_tag_token_is_appropriate()) {
1955 m_queued_tokens.enqueue(HTMLToken::make_character('<'));
1956 m_queued_tokens.enqueue(HTMLToken::make_character('/'));
1957 for (auto code_point : m_temporary_buffer)
1958 m_queued_tokens.enqueue(HTMLToken::make_character(code_point));
1959 RECONSUME_IN(RCDATA);
1960 }
1961 SWITCH_TO(SelfClosingStartTag);
1962 }
1963 ON('>')
1964 {
1965 m_current_token.set_tag_name(consume_current_builder());
1966 if (!current_end_tag_token_is_appropriate()) {
1967 m_queued_tokens.enqueue(HTMLToken::make_character('<'));
1968 m_queued_tokens.enqueue(HTMLToken::make_character('/'));
1969 for (auto code_point : m_temporary_buffer)
1970 m_queued_tokens.enqueue(HTMLToken::make_character(code_point));
1971 RECONSUME_IN(RCDATA);
1972 }
1973 SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
1974 }
1975 ON_ASCII_UPPER_ALPHA
1976 {
// Lower-cased for the tag name, original case for the fallback text.
1977 m_current_builder.append_code_point(to_ascii_lowercase(current_input_character.value()));
1978 m_temporary_buffer.append(current_input_character.value());
1979 continue;
1980 }
1981 ON_ASCII_LOWER_ALPHA
1982 {
1983 m_current_builder.append_code_point(current_input_character.value());
1984 m_temporary_buffer.append(current_input_character.value());
1985 continue;
1986 }
1987 ANYTHING_ELSE
1988 {
// Any other character means this was text: queue "</" and the buffered letters, discard the
// partially built name and let RCDATA reprocess the current character.
1989 m_queued_tokens.enqueue(HTMLToken::make_character('<'));
1990 m_queued_tokens.enqueue(HTMLToken::make_character('/'));
1991 // NOTE: The spec doesn't mention this, but it seems that m_current_token (an end tag) is just dropped in this case.
1992 m_current_builder.clear();
1993 for (auto code_point : m_temporary_buffer)
1994 m_queued_tokens.enqueue(HTMLToken::make_character(code_point));
1995 RECONSUME_IN(RCDATA);
1996 }
1997 }
1998 END_STATE
1999
2000 // 13.2.5.3 RAWTEXT state, https://html.spec.whatwg.org/multipage/parsing.html#rawtext-state
// Text mode without a '&' branch: only '<', NUL and EOF are handled specially.
2001 BEGIN_STATE(RAWTEXT)
2002 {
2003 ON('<')
2004 {
2005 SWITCH_TO(RAWTEXTLessThanSign);
2006 }
2007 ON(0)
2008 {
// Parse error; U+FFFD is emitted in place of the NUL.
2009 log_parse_error();
2010 EMIT_CHARACTER(0xFFFD);
2011 }
2012 ON_EOF
2013 {
2014 EMIT_EOF;
2015 }
2016 ANYTHING_ELSE
2017 {
2018 EMIT_CURRENT_CHARACTER;
2019 }
2020 }
2021 END_STATE
2022
2023 // 13.2.5.12 RAWTEXT less-than sign state, https://html.spec.whatwg.org/multipage/parsing.html#rawtext-less-than-sign-state
// Reached after '<' in RAWTEXT.
2024 BEGIN_STATE(RAWTEXTLessThanSign)
2025 {
2026 ON('/')
2027 {
// The temporary buffer is emptied here; RAWTEXTEndTagName then collects the tag name letters in it.
2028 m_temporary_buffer.clear();
2029 SWITCH_TO(RAWTEXTEndTagOpen);
2030 }
2031 ANYTHING_ELSE
2032 {
// The '<' was plain text.
2033 EMIT_CHARACTER_AND_RECONSUME_IN('<', RAWTEXT);
2034 }
2035 }
2036 END_STATE
2037
2038 // 13.2.5.13 RAWTEXT end tag open state, https://html.spec.whatwg.org/multipage/parsing.html#rawtext-end-tag-open-state
// Reached after "</" in RAWTEXT.
2039 BEGIN_STATE(RAWTEXTEndTagOpen)
2040 {
2041 ON_ASCII_ALPHA
2042 {
// A letter may start an end tag: create the token and let RAWTEXTEndTagName read the letter.
2043 create_new_token(HTMLToken::Type::EndTag);
2044 RECONSUME_IN(RAWTEXTEndTagName);
2045 }
2046 ANYTHING_ELSE
2047 {
// Not a tag: "</" is queued as two character tokens and the current character goes back to RAWTEXT.
2048 m_queued_tokens.enqueue(HTMLToken::make_character('<'));
2049 m_queued_tokens.enqueue(HTMLToken::make_character('/'));
2050 RECONSUME_IN(RAWTEXT);
2051 }
2052 }
2053 END_STATE
2054
2055 // 13.2.5.14 RAWTEXT end tag name state, https://html.spec.whatwg.org/multipage/parsing.html#rawtext-end-tag-name-state
2056 BEGIN_STATE(RAWTEXTEndTagName)
2057 {
2058 ON_WHITESPACE
2059 {
2060 m_current_token.set_tag_name(consume_current_builder());
2061 if (!current_end_tag_token_is_appropriate()) {
2062 m_queued_tokens.enqueue(HTMLToken::make_character('<'));
2063 m_queued_tokens.enqueue(HTMLToken::make_character('/'));
2064 for (auto code_point : m_temporary_buffer)
2065 m_queued_tokens.enqueue(HTMLToken::make_character(code_point));
2066 RECONSUME_IN(RAWTEXT);
2067 }
2068 SWITCH_TO(BeforeAttributeName);
2069 }
2070 ON('/')
2071 {
2072 m_current_token.set_tag_name(consume_current_builder());
2073 if (!current_end_tag_token_is_appropriate()) {
2074 m_queued_tokens.enqueue(HTMLToken::make_character('<'));
2075 m_queued_tokens.enqueue(HTMLToken::make_character('/'));
2076 for (auto code_point : m_temporary_buffer)
2077 m_queued_tokens.enqueue(HTMLToken::make_character(code_point));
2078 RECONSUME_IN(RAWTEXT);
2079 }
2080 SWITCH_TO(SelfClosingStartTag);
2081 }
2082 ON('>')
2083 {
2084 m_current_token.set_tag_name(consume_current_builder());
2085 if (!current_end_tag_token_is_appropriate()) {
2086 m_queued_tokens.enqueue(HTMLToken::make_character('<'));
2087 m_queued_tokens.enqueue(HTMLToken::make_character('/'));
2088 for (auto code_point : m_temporary_buffer)
2089 m_queued_tokens.enqueue(HTMLToken::make_character(code_point));
2090 RECONSUME_IN(RAWTEXT);
2091 }
2092 SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
2093 }
2094 ON_ASCII_UPPER_ALPHA
2095 {
2096 m_current_builder.append_code_point(to_ascii_lowercase(current_input_character.value()));
2097 m_temporary_buffer.append(current_input_character.value());
2098 continue;
2099 }
2100 ON_ASCII_LOWER_ALPHA
2101 {
2102 m_current_builder.append(current_input_character.value());
2103 m_temporary_buffer.append(current_input_character.value());
2104 continue;
2105 }
2106 ANYTHING_ELSE
2107 {
2108 m_queued_tokens.enqueue(HTMLToken::make_character('<'));
2109 m_queued_tokens.enqueue(HTMLToken::make_character('/'));
2110 // NOTE: The spec doesn't mention this, but it seems that m_current_token (an end tag) is just dropped in this case.
2111 m_current_builder.clear();
2112 for (auto code_point : m_temporary_buffer)
2113 m_queued_tokens.enqueue(HTMLToken::make_character(code_point));
2114 RECONSUME_IN(RAWTEXT);
2115 }
2116 }
2117 END_STATE
2118
2119 // 13.2.5.4 Script data state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-state
// Script text mode: only '<', NUL and EOF are handled specially.
2120 BEGIN_STATE(ScriptData)
2121 {
2122 ON('<')
2123 {
2124 SWITCH_TO(ScriptDataLessThanSign);
2125 }
2126 ON(0)
2127 {
// Parse error; U+FFFD is emitted in place of the NUL.
2128 log_parse_error();
2129 EMIT_CHARACTER(0xFFFD);
2130 }
2131 ON_EOF
2132 {
2133 EMIT_EOF;
2134 }
2135 ANYTHING_ELSE
2136 {
2137 EMIT_CURRENT_CHARACTER;
2138 }
2139 }
2140 END_STATE
2141
2142 // 13.2.5.5 PLAINTEXT state, https://html.spec.whatwg.org/multipage/parsing.html#plaintext-state
// Every character is emitted as text; no branch here switches to another state.
2143 BEGIN_STATE(PLAINTEXT)
2144 {
2145 ON(0)
2146 {
// Parse error; U+FFFD is emitted in place of the NUL.
2147 log_parse_error();
2148 EMIT_CHARACTER(0xFFFD);
2149 }
2150 ON_EOF
2151 {
2152 EMIT_EOF;
2153 }
2154 ANYTHING_ELSE
2155 {
2156 EMIT_CURRENT_CHARACTER;
2157 }
2158 }
2159 END_STATE
2160
2161 // 13.2.5.15 Script data less-than sign state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-less-than-sign-state
// Reached after '<' in ScriptData.
2162 BEGIN_STATE(ScriptDataLessThanSign)
2163 {
2164 ON('/')
2165 {
2166 m_temporary_buffer.clear();
2167 SWITCH_TO(ScriptDataEndTagOpen);
2168 }
2169 ON('!')
2170 {
// "<!" is queued as two character tokens before moving to the escape-start state.
2171 m_queued_tokens.enqueue(HTMLToken::make_character('<'));
2172 m_queued_tokens.enqueue(HTMLToken::make_character('!'));
2173 SWITCH_TO(ScriptDataEscapeStart);
2174 }
2175 ANYTHING_ELSE
2176 {
// The '<' was plain script text.
2177 EMIT_CHARACTER_AND_RECONSUME_IN('<', ScriptData);
2178 }
2179 }
2180 END_STATE
2181
2182 // 13.2.5.18 Script data escape start state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-escape-start-state
// Reached after "<!" in script data; a '-' continues towards the escaped states.
2183 BEGIN_STATE(ScriptDataEscapeStart)
2184 {
2185 ON('-')
2186 {
2187 SWITCH_TO_AND_EMIT_CHARACTER('-', ScriptDataEscapeStartDash);
2188 }
2189 ANYTHING_ELSE
2190 {
2191 RECONSUME_IN(ScriptData);
2192 }
2193 }
2194 END_STATE
2195
2196 // 13.2.5.19 Script data escape start dash state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-escape-start-dash-state
// Reached after "<!-" in script data; a second '-' enters the escaped dash-dash state.
2197 BEGIN_STATE(ScriptDataEscapeStartDash)
2198 {
2199 ON('-')
2200 {
2201 SWITCH_TO_AND_EMIT_CHARACTER('-', ScriptDataEscapedDashDash);
2202 }
2203 ANYTHING_ELSE
2204 {
2205 RECONSUME_IN(ScriptData);
2206 }
2207 }
2208 END_STATE
2209
2210 // 13.2.5.22 Script data escaped dash dash state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-dash-dash-state
// Reached after "--" in the escaped script data states.
2211 BEGIN_STATE(ScriptDataEscapedDashDash)
2212 {
2213 ON('-')
2214 {
// Further dashes are emitted without switching state.
2215 EMIT_CHARACTER('-');
2216 }
2217 ON('<')
2218 {
2219 SWITCH_TO(ScriptDataEscapedLessThanSign);
2220 }
2221 ON('>')
2222 {
// '>' after "--" leaves the escaped states and goes back to ScriptData.
2223 SWITCH_TO_AND_EMIT_CHARACTER('>', ScriptData);
2224 }
2225 ON(0)
2226 {
// Parse error; U+FFFD is emitted in place of the NUL.
2227 log_parse_error();
2228 SWITCH_TO_AND_EMIT_CHARACTER(0xFFFD, ScriptDataEscaped);
2229 }
2230 ON_EOF
2231 {
2232 log_parse_error();
2233 EMIT_EOF;
2234 }
2235 ANYTHING_ELSE
2236 {
2237 SWITCH_TO_AND_EMIT_CURRENT_CHARACTER(ScriptDataEscaped);
2238 }
2239 }
2240 END_STATE
2241
2242 // 13.2.5.23 Script data escaped less-than sign state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-less-than-sign-state
// Reached after '<' in the escaped script data states.
2243 BEGIN_STATE(ScriptDataEscapedLessThanSign)
2244 {
2245 ON('/')
2246 {
2247 m_temporary_buffer.clear();
2248 SWITCH_TO(ScriptDataEscapedEndTagOpen);
2249 }
2250 ON_ASCII_ALPHA
2251 {
// A letter after '<': emit the '<' as text and let ScriptDataDoubleEscapeStart read the letter,
// starting from an empty temporary buffer.
2252 m_temporary_buffer.clear();
2253 EMIT_CHARACTER_AND_RECONSUME_IN('<', ScriptDataDoubleEscapeStart);
2254 }
2255 ANYTHING_ELSE
2256 {
2257 EMIT_CHARACTER_AND_RECONSUME_IN('<', ScriptDataEscaped);
2258 }
2259 }
2260 END_STATE
2261
2262 // 13.2.5.24 Script data escaped end tag open state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-end-tag-open-state
// Reached after "</" in the escaped script data states.
2263 BEGIN_STATE(ScriptDataEscapedEndTagOpen)
2264 {
2265 ON_ASCII_ALPHA
2266 {
// A letter may start an end tag: create the token and let ScriptDataEscapedEndTagName read the letter.
2267 create_new_token(HTMLToken::Type::EndTag);
2268 RECONSUME_IN(ScriptDataEscapedEndTagName);
2269 }
2270 ANYTHING_ELSE
2271 {
// Not a tag: "</" is queued as two character tokens and the current character goes back to ScriptDataEscaped.
2272 m_queued_tokens.enqueue(HTMLToken::make_character('<'));
2273 m_queued_tokens.enqueue(HTMLToken::make_character('/'));
2274 RECONSUME_IN(ScriptDataEscaped);
2275 }
2276 }
2277 END_STATE
2278
2279 // 13.2.5.25 Script data escaped end tag name state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-end-tag-name-state
2280 BEGIN_STATE(ScriptDataEscapedEndTagName)
2281 {
2282 ON_WHITESPACE
2283 {
2284 m_current_token.set_tag_name(consume_current_builder());
2285 if (current_end_tag_token_is_appropriate())
2286 SWITCH_TO(BeforeAttributeName);
2287
2288 m_queued_tokens.enqueue(HTMLToken::make_character('<'));
2289 m_queued_tokens.enqueue(HTMLToken::make_character('/'));
2290 // NOTE: The spec doesn't mention this, but it seems that m_current_token (an end tag) is just dropped in this case.
2291 m_current_builder.clear();
2292 for (auto code_point : m_temporary_buffer) {
2293 m_queued_tokens.enqueue(HTMLToken::make_character(code_point));
2294 }
2295 RECONSUME_IN(ScriptDataEscaped);
2296 }
2297 ON('/')
2298 {
2299 m_current_token.set_tag_name(consume_current_builder());
2300 if (current_end_tag_token_is_appropriate())
2301 SWITCH_TO(SelfClosingStartTag);
2302
2303 m_queued_tokens.enqueue(HTMLToken::make_character('<'));
2304 m_queued_tokens.enqueue(HTMLToken::make_character('/'));
2305 // NOTE: The spec doesn't mention this, but it seems that m_current_token (an end tag) is just dropped in this case.
2306 m_current_builder.clear();
2307 for (auto code_point : m_temporary_buffer) {
2308 m_queued_tokens.enqueue(HTMLToken::make_character(code_point));
2309 }
2310 RECONSUME_IN(ScriptDataEscaped);
2311 }
2312 ON('>')
2313 {
2314 m_current_token.set_tag_name(consume_current_builder());
2315 if (current_end_tag_token_is_appropriate())
2316 SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
2317
2318 m_queued_tokens.enqueue(HTMLToken::make_character('<'));
2319 m_queued_tokens.enqueue(HTMLToken::make_character('/'));
2320 // NOTE: The spec doesn't mention this, but it seems that m_current_token (an end tag) is just dropped in this case.
2321 m_current_builder.clear();
2322 for (auto code_point : m_temporary_buffer) {
2323 m_queued_tokens.enqueue(HTMLToken::make_character(code_point));
2324 }
2325 RECONSUME_IN(ScriptDataEscaped);
2326 }
2327 ON_ASCII_UPPER_ALPHA
2328 {
2329 m_current_builder.append_code_point(to_ascii_lowercase(current_input_character.value()));
2330 m_temporary_buffer.append(current_input_character.value());
2331 continue;
2332 }
2333 ON_ASCII_LOWER_ALPHA
2334 {
2335 m_current_builder.append(current_input_character.value());
2336 m_temporary_buffer.append(current_input_character.value());
2337 continue;
2338 }
2339 ANYTHING_ELSE
2340 {
2341 m_queued_tokens.enqueue(HTMLToken::make_character('<'));
2342 m_queued_tokens.enqueue(HTMLToken::make_character('/'));
2343 // NOTE: The spec doesn't mention this, but it seems that m_current_token (an end tag) is just dropped in this case.
2344 m_current_builder.clear();
2345 for (auto code_point : m_temporary_buffer) {
2346 m_queued_tokens.enqueue(HTMLToken::make_character(code_point));
2347 }
2348 RECONSUME_IN(ScriptDataEscaped);
2349 }
2350 }
2351 END_STATE
2352
2353 // 13.2.5.26 Script data double escape start state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escape-start-state
2354 BEGIN_STATE(ScriptDataDoubleEscapeStart)
2355 {
2356 auto temporary_buffer_equal_to_script = [this]() -> bool {
2357 if (m_temporary_buffer.size() != 6)
2358 return false;
2359
2360 // FIXME: Is there a better way of doing this?
2361 return m_temporary_buffer[0] == 's' && m_temporary_buffer[1] == 'c' && m_temporary_buffer[2] == 'r' && m_temporary_buffer[3] == 'i' && m_temporary_buffer[4] == 'p' && m_temporary_buffer[5] == 't';
2362 };
2363 ON_WHITESPACE
2364 {
2365 if (temporary_buffer_equal_to_script())
2366 SWITCH_TO_AND_EMIT_CURRENT_CHARACTER(ScriptDataDoubleEscaped);
2367 else
2368 SWITCH_TO_AND_EMIT_CURRENT_CHARACTER(ScriptDataEscaped);
2369 }
2370 ON('/')
2371 {
2372 if (temporary_buffer_equal_to_script())
2373 SWITCH_TO_AND_EMIT_CURRENT_CHARACTER(ScriptDataDoubleEscaped);
2374 else
2375 SWITCH_TO_AND_EMIT_CURRENT_CHARACTER(ScriptDataEscaped);
2376 }
2377 ON('>')
2378 {
2379 if (temporary_buffer_equal_to_script())
2380 SWITCH_TO_AND_EMIT_CURRENT_CHARACTER(ScriptDataDoubleEscaped);
2381 else
2382 SWITCH_TO_AND_EMIT_CURRENT_CHARACTER(ScriptDataEscaped);
2383 }
2384 ON_ASCII_UPPER_ALPHA
2385 {
2386 m_temporary_buffer.append(to_ascii_lowercase(current_input_character.value()));
2387 EMIT_CURRENT_CHARACTER;
2388 }
2389 ON_ASCII_LOWER_ALPHA
2390 {
2391 m_temporary_buffer.append(current_input_character.value());
2392 EMIT_CURRENT_CHARACTER;
2393 }
2394 ANYTHING_ELSE
2395 {
2396 RECONSUME_IN(ScriptDataEscaped);
2397 }
2398 }
2399 END_STATE
2400
2401 // 13.2.5.27 Script data double escaped state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-state
2402 BEGIN_STATE(ScriptDataDoubleEscaped)
2403 {
2404 ON('-')
2405 {
2406 SWITCH_TO_AND_EMIT_CHARACTER('-', ScriptDataDoubleEscapedDash);
2407 }
2408 ON('<')
2409 {
2410 SWITCH_TO_AND_EMIT_CHARACTER('<', ScriptDataDoubleEscapedLessThanSign);
2411 }
2412 ON(0)
2413 {
2414 log_parse_error();
2415 EMIT_CHARACTER(0xFFFD);
2416 }
2417 ON_EOF
2418 {
2419 log_parse_error();
2420 EMIT_EOF;
2421 }
2422 ANYTHING_ELSE
2423 {
2424 EMIT_CURRENT_CHARACTER;
2425 }
2426 }
2427 END_STATE
2428
2429 // 13.2.5.28 Script data double escaped dash state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-dash-state
2430 BEGIN_STATE(ScriptDataDoubleEscapedDash)
2431 {
2432 ON('-')
2433 {
2434 SWITCH_TO_AND_EMIT_CHARACTER('-', ScriptDataDoubleEscapedDashDash);
2435 }
2436 ON('<')
2437 {
2438 SWITCH_TO_AND_EMIT_CHARACTER('<', ScriptDataDoubleEscapedLessThanSign);
2439 }
2440 ON(0)
2441 {
2442 log_parse_error();
2443 SWITCH_TO_AND_EMIT_CHARACTER(0xFFFD, ScriptDataDoubleEscaped);
2444 }
2445 ON_EOF
2446 {
2447 log_parse_error();
2448 EMIT_EOF;
2449 }
2450 ANYTHING_ELSE
2451 {
2452 SWITCH_TO_AND_EMIT_CURRENT_CHARACTER(ScriptDataDoubleEscaped);
2453 }
2454 }
2455 END_STATE
2456
2457 // 13.2.5.29 Script data double escaped dash dash state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-dash-dash-state
2458 BEGIN_STATE(ScriptDataDoubleEscapedDashDash)
2459 {
2460 ON('-')
2461 {
2462 EMIT_CHARACTER('-');
2463 }
2464 ON('<')
2465 {
2466 SWITCH_TO_AND_EMIT_CHARACTER('<', ScriptDataDoubleEscapedLessThanSign);
2467 }
2468 ON('>')
2469 {
2470 SWITCH_TO_AND_EMIT_CHARACTER('>', ScriptData);
2471 }
2472 ON(0)
2473 {
2474 log_parse_error();
2475 SWITCH_TO_AND_EMIT_CHARACTER(0xFFFD, ScriptDataDoubleEscaped);
2476 }
2477 ON_EOF
2478 {
2479 log_parse_error();
2480 EMIT_EOF;
2481 }
2482 ANYTHING_ELSE
2483 {
2484 SWITCH_TO_AND_EMIT_CURRENT_CHARACTER(ScriptDataDoubleEscaped);
2485 }
2486 }
2487 END_STATE
2488
2489 // 13.2.5.30 Script data double escaped less-than sign state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-less-than-sign-state
2490 BEGIN_STATE(ScriptDataDoubleEscapedLessThanSign)
2491 {
2492 ON('/')
2493 {
2494 m_temporary_buffer.clear();
2495 SWITCH_TO_AND_EMIT_CHARACTER('/', ScriptDataDoubleEscapeEnd);
2496 }
2497 ANYTHING_ELSE
2498 {
2499 RECONSUME_IN(ScriptDataDoubleEscaped);
2500 }
2501 }
2502 END_STATE
2503
2504 // 13.2.5.31 Script data double escape end state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escape-end-state
2505 BEGIN_STATE(ScriptDataDoubleEscapeEnd)
2506 {
2507 auto temporary_buffer_equal_to_script = [this]() -> bool {
2508 if (m_temporary_buffer.size() != 6)
2509 return false;
2510
2511 // FIXME: Is there a better way of doing this?
2512 return m_temporary_buffer[0] == 's' && m_temporary_buffer[1] == 'c' && m_temporary_buffer[2] == 'r' && m_temporary_buffer[3] == 'i' && m_temporary_buffer[4] == 'p' && m_temporary_buffer[5] == 't';
2513 };
2514 ON_WHITESPACE
2515 {
2516 if (temporary_buffer_equal_to_script())
2517 SWITCH_TO_AND_EMIT_CURRENT_CHARACTER(ScriptDataEscaped);
2518 else
2519 SWITCH_TO_AND_EMIT_CURRENT_CHARACTER(ScriptDataDoubleEscaped);
2520 }
2521 ON('/')
2522 {
2523 if (temporary_buffer_equal_to_script())
2524 SWITCH_TO_AND_EMIT_CURRENT_CHARACTER(ScriptDataEscaped);
2525 else
2526 SWITCH_TO_AND_EMIT_CURRENT_CHARACTER(ScriptDataDoubleEscaped);
2527 }
2528 ON('>')
2529 {
2530 if (temporary_buffer_equal_to_script())
2531 SWITCH_TO_AND_EMIT_CURRENT_CHARACTER(ScriptDataEscaped);
2532 else
2533 SWITCH_TO_AND_EMIT_CURRENT_CHARACTER(ScriptDataDoubleEscaped);
2534 }
2535 ON_ASCII_UPPER_ALPHA
2536 {
2537 m_temporary_buffer.append(to_ascii_lowercase(current_input_character.value()));
2538 EMIT_CURRENT_CHARACTER;
2539 }
2540 ON_ASCII_LOWER_ALPHA
2541 {
2542 m_temporary_buffer.append(current_input_character.value());
2543 EMIT_CURRENT_CHARACTER;
2544 }
2545 ANYTHING_ELSE
2546 {
2547 RECONSUME_IN(ScriptDataDoubleEscaped);
2548 }
2549 }
2550 END_STATE
2551
2552 // 13.2.5.21 Script data escaped dash state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-dash-state
2553 BEGIN_STATE(ScriptDataEscapedDash)
2554 {
2555 ON('-')
2556 {
2557 SWITCH_TO_AND_EMIT_CHARACTER('-', ScriptDataEscapedDashDash);
2558 }
2559 ON('<')
2560 {
2561 SWITCH_TO(ScriptDataEscapedLessThanSign);
2562 }
2563 ON(0)
2564 {
2565 log_parse_error();
2566 SWITCH_TO_AND_EMIT_CHARACTER(0xFFFD, ScriptDataEscaped);
2567 }
2568 ON_EOF
2569 {
2570 log_parse_error();
2571 EMIT_EOF;
2572 }
2573 ANYTHING_ELSE
2574 {
2575 SWITCH_TO_AND_EMIT_CURRENT_CHARACTER(ScriptDataEscaped);
2576 }
2577 }
2578 END_STATE
2579
2580 // 13.2.5.20 Script data escaped state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-state
2581 BEGIN_STATE(ScriptDataEscaped)
2582 {
2583 ON('-')
2584 {
2585 SWITCH_TO_AND_EMIT_CHARACTER('-', ScriptDataEscapedDash);
2586 }
2587 ON('<')
2588 {
2589 SWITCH_TO(ScriptDataEscapedLessThanSign);
2590 }
2591 ON(0)
2592 {
2593 log_parse_error();
2594 EMIT_CHARACTER(0xFFFD);
2595 }
2596 ON_EOF
2597 {
2598 log_parse_error();
2599 EMIT_EOF;
2600 }
2601 ANYTHING_ELSE
2602 {
2603 EMIT_CURRENT_CHARACTER;
2604 }
2605 }
2606 END_STATE
2607
2608 // 13.2.5.16 Script data end tag open state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-end-tag-open-state
2609 BEGIN_STATE(ScriptDataEndTagOpen)
2610 {
2611 ON_ASCII_ALPHA
2612 {
2613 create_new_token(HTMLToken::Type::EndTag);
2614 RECONSUME_IN(ScriptDataEndTagName);
2615 }
2616 ANYTHING_ELSE
2617 {
2618 m_queued_tokens.enqueue(HTMLToken::make_character('<'));
2619 m_queued_tokens.enqueue(HTMLToken::make_character('/'));
2620 RECONSUME_IN(ScriptData);
2621 }
2622 }
2623 END_STATE
2624
2625 // 13.2.5.17 Script data end tag name state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-end-tag-name-state
2626 BEGIN_STATE(ScriptDataEndTagName)
2627 {
2628 ON_WHITESPACE
2629 {
2630 m_current_token.set_tag_name(consume_current_builder());
2631 if (current_end_tag_token_is_appropriate())
2632 SWITCH_TO(BeforeAttributeName);
2633 m_queued_tokens.enqueue(HTMLToken::make_character('<'));
2634 m_queued_tokens.enqueue(HTMLToken::make_character('/'));
2635 // NOTE: The spec doesn't mention this, but it seems that m_current_token (an end tag) is just dropped in this case.
2636 m_current_builder.clear();
2637 for (auto code_point : m_temporary_buffer)
2638 m_queued_tokens.enqueue(HTMLToken::make_character(code_point));
2639 RECONSUME_IN(ScriptData);
2640 }
2641 ON('/')
2642 {
2643 m_current_token.set_tag_name(consume_current_builder());
2644 if (current_end_tag_token_is_appropriate())
2645 SWITCH_TO(SelfClosingStartTag);
2646 m_queued_tokens.enqueue(HTMLToken::make_character('<'));
2647 m_queued_tokens.enqueue(HTMLToken::make_character('/'));
2648 // NOTE: The spec doesn't mention this, but it seems that m_current_token (an end tag) is just dropped in this case.
2649 m_current_builder.clear();
2650 for (auto code_point : m_temporary_buffer)
2651 m_queued_tokens.enqueue(HTMLToken::make_character(code_point));
2652 RECONSUME_IN(ScriptData);
2653 }
2654 ON('>')
2655 {
2656 m_current_token.set_tag_name(consume_current_builder());
2657 if (current_end_tag_token_is_appropriate())
2658 SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
2659 m_queued_tokens.enqueue(HTMLToken::make_character('<'));
2660 m_queued_tokens.enqueue(HTMLToken::make_character('/'));
2661 // NOTE: The spec doesn't mention this, but it seems that m_current_token (an end tag) is just dropped in this case.
2662 m_current_builder.clear();
2663 for (auto code_point : m_temporary_buffer)
2664 m_queued_tokens.enqueue(HTMLToken::make_character(code_point));
2665 RECONSUME_IN(ScriptData);
2666 }
2667 ON_ASCII_UPPER_ALPHA
2668 {
2669 m_current_builder.append_code_point(to_ascii_lowercase(current_input_character.value()));
2670 m_temporary_buffer.append(current_input_character.value());
2671 continue;
2672 }
2673 ON_ASCII_LOWER_ALPHA
2674 {
2675 m_current_builder.append(current_input_character.value());
2676 m_temporary_buffer.append(current_input_character.value());
2677 continue;
2678 }
2679 ANYTHING_ELSE
2680 {
2681 m_queued_tokens.enqueue(HTMLToken::make_character('<'));
2682 m_queued_tokens.enqueue(HTMLToken::make_character('/'));
2683 // NOTE: The spec doesn't mention this, but it seems that m_current_token (an end tag) is just dropped in this case.
2684 m_current_builder.clear();
2685 for (auto code_point : m_temporary_buffer)
2686 m_queued_tokens.enqueue(HTMLToken::make_character(code_point));
2687 RECONSUME_IN(ScriptData);
2688 }
2689 }
2690 END_STATE
2691
2692 // 13.2.5.69 CDATA section state, https://html.spec.whatwg.org/multipage/parsing.html#cdata-section-state
2693 BEGIN_STATE(CDATASection)
2694 {
2695 ON(']')
2696 {
2697 SWITCH_TO(CDATASectionBracket);
2698 }
2699 ON_EOF
2700 {
2701 log_parse_error();
2702 EMIT_EOF;
2703 }
2704 ANYTHING_ELSE
2705 {
2706 EMIT_CURRENT_CHARACTER;
2707 }
2708 }
2709 END_STATE
2710
2711 // 13.2.5.70 CDATA section bracket state, https://html.spec.whatwg.org/multipage/parsing.html#cdata-section-bracket-state
2712 BEGIN_STATE(CDATASectionBracket)
2713 {
2714 ON(']')
2715 {
2716 SWITCH_TO(CDATASectionEnd);
2717 }
2718 ANYTHING_ELSE
2719 {
2720 EMIT_CHARACTER_AND_RECONSUME_IN(']', CDATASection);
2721 }
2722 }
2723 END_STATE
2724
2725 // 13.2.5.71 CDATA section end state, https://html.spec.whatwg.org/multipage/parsing.html#cdata-section-end-state
2726 BEGIN_STATE(CDATASectionEnd)
2727 {
2728 ON(']')
2729 {
2730 EMIT_CHARACTER(']');
2731 }
2732 ON('>')
2733 {
2734 SWITCH_TO(Data);
2735 }
2736 ANYTHING_ELSE
2737 {
2738 m_queued_tokens.enqueue(HTMLToken::make_character(']'));
2739 m_queued_tokens.enqueue(HTMLToken::make_character(']'));
2740 RECONSUME_IN(CDATASection);
2741 }
2742 }
2743 END_STATE
2744
2745 default:
2746 TODO();
2747 }
2748 }
2749}
2750
2751bool HTMLTokenizer::consume_next_if_match(StringView string, CaseSensitivity case_sensitivity)
2752{
2753 for (size_t i = 0; i < string.length(); ++i) {
2754 auto code_point = peek_code_point(i);
2755 if (!code_point.has_value())
2756 return false;
2757 // FIXME: This should be more Unicode-aware.
2758 if (case_sensitivity == CaseSensitivity::CaseInsensitive) {
2759 if (code_point.value() < 0x80) {
2760 if (to_ascii_lowercase(code_point.value()) != to_ascii_lowercase(string[i]))
2761 return false;
2762 continue;
2763 }
2764 }
2765 if (code_point.value() != (u32)string[i])
2766 return false;
2767 }
2768 skip(string.length());
2769 return true;
2770}
2771
2772void HTMLTokenizer::create_new_token(HTMLToken::Type type)
2773{
2774 m_current_token = { type };
2775 size_t offset = 0;
2776 switch (type) {
2777 case HTMLToken::Type::StartTag:
2778 offset = 1;
2779 break;
2780 case HTMLToken::Type::EndTag:
2781 offset = 2;
2782 break;
2783 default:
2784 break;
2785 }
2786
2787 m_current_token.set_start_position({}, nth_last_position(offset));
2788}
2789
2790HTMLTokenizer::HTMLTokenizer()
2791{
2792 m_decoded_input = "";
2793 m_utf8_view = Utf8View(m_decoded_input);
2794 m_utf8_iterator = m_utf8_view.begin();
2795 m_prev_utf8_iterator = m_utf8_view.begin();
2796 m_source_positions.empend(0u, 0u);
2797}
2798
2799HTMLTokenizer::HTMLTokenizer(StringView input, DeprecatedString const& encoding)
2800{
2801 auto decoder = TextCodec::decoder_for(encoding);
2802 VERIFY(decoder.has_value());
2803 m_decoded_input = decoder->to_utf8(input).release_value_but_fixme_should_propagate_errors().to_deprecated_string();
2804 m_utf8_view = Utf8View(m_decoded_input);
2805 m_utf8_iterator = m_utf8_view.begin();
2806 m_prev_utf8_iterator = m_utf8_view.begin();
2807 m_source_positions.empend(0u, 0u);
2808}
2809
2810void HTMLTokenizer::insert_input_at_insertion_point(DeprecatedString const& input)
2811{
2812 auto utf8_iterator_byte_offset = m_utf8_view.byte_offset_of(m_utf8_iterator);
2813
2814 // FIXME: Implement a InputStream to handle insertion_point and iterators.
2815 StringBuilder builder {};
2816 builder.append(m_decoded_input.substring(0, m_insertion_point.position));
2817 builder.append(input);
2818 builder.append(m_decoded_input.substring(m_insertion_point.position));
2819 m_decoded_input = builder.to_deprecated_string();
2820
2821 m_utf8_view = Utf8View(m_decoded_input);
2822 m_utf8_iterator = m_utf8_view.iterator_at_byte_offset(utf8_iterator_byte_offset);
2823
2824 m_insertion_point.position += input.length();
2825}
2826
2827void HTMLTokenizer::insert_eof()
2828{
2829 m_explicit_eof_inserted = true;
2830}
2831
2832bool HTMLTokenizer::is_eof_inserted()
2833{
2834 return m_explicit_eof_inserted;
2835}
2836
2837void HTMLTokenizer::will_switch_to([[maybe_unused]] State new_state)
2838{
2839 dbgln_if(TOKENIZER_TRACE_DEBUG, "[{}] Switch to {}", state_name(m_state), state_name(new_state));
2840}
2841
2842void HTMLTokenizer::will_reconsume_in([[maybe_unused]] State new_state)
2843{
2844 dbgln_if(TOKENIZER_TRACE_DEBUG, "[{}] Reconsume in {}", state_name(m_state), state_name(new_state));
2845}
2846
2847void HTMLTokenizer::switch_to(Badge<HTMLParser>, State new_state)
2848{
2849 dbgln_if(TOKENIZER_TRACE_DEBUG, "[{}] Parser switches tokenizer state to {}", state_name(m_state), state_name(new_state));
2850 m_state = new_state;
2851}
2852
2853void HTMLTokenizer::will_emit(HTMLToken& token)
2854{
2855 if (token.is_start_tag())
2856 m_last_emitted_start_tag_name = token.tag_name();
2857 token.set_end_position({}, nth_last_position(0));
2858}
2859
2860bool HTMLTokenizer::current_end_tag_token_is_appropriate() const
2861{
2862 VERIFY(m_current_token.is_end_tag());
2863 if (!m_last_emitted_start_tag_name.has_value())
2864 return false;
2865 return m_current_token.tag_name() == m_last_emitted_start_tag_name.value();
2866}
2867
2868bool HTMLTokenizer::consumed_as_part_of_an_attribute() const
2869{
2870 return m_return_state == State::AttributeValueUnquoted || m_return_state == State::AttributeValueSingleQuoted || m_return_state == State::AttributeValueDoubleQuoted;
2871}
2872
2873void HTMLTokenizer::restore_to(Utf8CodePointIterator const& new_iterator)
2874{
2875 auto diff = m_utf8_iterator - new_iterator;
2876 if (diff > 0) {
2877 for (ssize_t i = 0; i < diff; ++i) {
2878 if (!m_source_positions.is_empty())
2879 m_source_positions.take_last();
2880 }
2881 } else {
2882 // Going forwards...?
2883 TODO();
2884 }
2885 m_utf8_iterator = new_iterator;
2886}
2887
2888DeprecatedString HTMLTokenizer::consume_current_builder()
2889{
2890 auto string = m_current_builder.to_deprecated_string();
2891 m_current_builder.clear();
2892 return string;
2893}
2894
2895}