// Serenity Operating System
/*
 * Copyright (c) 2020, Andreas Kling <kling@serenityos.org>
 * Copyright (c) 2022, Linus Groh <linusg@serenityos.org>
 *
 * SPDX-License-Identifier: BSD-2-Clause
 */
7
8#pragma once
9
10#include <AK/Queue.h>
11#include <AK/StringBuilder.h>
12#include <AK/StringView.h>
13#include <AK/Types.h>
14#include <AK/Utf8View.h>
15#include <LibWeb/Forward.h>
16#include <LibWeb/HTML/Parser/HTMLToken.h>
17
18namespace Web::HTML {
19
// X-macro list of every tokenizer state.
//
// Usage: define __ENUMERATE_TOKENIZER_STATE(name) to whatever should be
// generated per state, expand ENUMERATE_TOKENIZER_STATES, then #undef the
// helper again. The list expands to one __ENUMERATE_TOKENIZER_STATE(...)
// invocation per line, in the order written here, so reordering entries
// changes the numeric value of any enum generated from it.
//
// NOTE(review): the names appear to follow the tokenizer state names of the
// WHATWG HTML Standard; confirm against the spec before adding or renaming.
#define ENUMERATE_TOKENIZER_STATES \
    __ENUMERATE_TOKENIZER_STATE(Data) \
    __ENUMERATE_TOKENIZER_STATE(RCDATA) \
    __ENUMERATE_TOKENIZER_STATE(RAWTEXT) \
    __ENUMERATE_TOKENIZER_STATE(ScriptData) \
    __ENUMERATE_TOKENIZER_STATE(PLAINTEXT) \
    __ENUMERATE_TOKENIZER_STATE(TagOpen) \
    __ENUMERATE_TOKENIZER_STATE(EndTagOpen) \
    __ENUMERATE_TOKENIZER_STATE(TagName) \
    __ENUMERATE_TOKENIZER_STATE(RCDATALessThanSign) \
    __ENUMERATE_TOKENIZER_STATE(RCDATAEndTagOpen) \
    __ENUMERATE_TOKENIZER_STATE(RCDATAEndTagName) \
    __ENUMERATE_TOKENIZER_STATE(RAWTEXTLessThanSign) \
    __ENUMERATE_TOKENIZER_STATE(RAWTEXTEndTagOpen) \
    __ENUMERATE_TOKENIZER_STATE(RAWTEXTEndTagName) \
    __ENUMERATE_TOKENIZER_STATE(ScriptDataLessThanSign) \
    __ENUMERATE_TOKENIZER_STATE(ScriptDataEndTagOpen) \
    __ENUMERATE_TOKENIZER_STATE(ScriptDataEndTagName) \
    __ENUMERATE_TOKENIZER_STATE(ScriptDataEscapeStart) \
    __ENUMERATE_TOKENIZER_STATE(ScriptDataEscapeStartDash) \
    __ENUMERATE_TOKENIZER_STATE(ScriptDataEscaped) \
    __ENUMERATE_TOKENIZER_STATE(ScriptDataEscapedDash) \
    __ENUMERATE_TOKENIZER_STATE(ScriptDataEscapedDashDash) \
    __ENUMERATE_TOKENIZER_STATE(ScriptDataEscapedLessThanSign) \
    __ENUMERATE_TOKENIZER_STATE(ScriptDataEscapedEndTagOpen) \
    __ENUMERATE_TOKENIZER_STATE(ScriptDataEscapedEndTagName) \
    __ENUMERATE_TOKENIZER_STATE(ScriptDataDoubleEscapeStart) \
    __ENUMERATE_TOKENIZER_STATE(ScriptDataDoubleEscaped) \
    __ENUMERATE_TOKENIZER_STATE(ScriptDataDoubleEscapedDash) \
    __ENUMERATE_TOKENIZER_STATE(ScriptDataDoubleEscapedDashDash) \
    __ENUMERATE_TOKENIZER_STATE(ScriptDataDoubleEscapedLessThanSign) \
    __ENUMERATE_TOKENIZER_STATE(ScriptDataDoubleEscapeEnd) \
    __ENUMERATE_TOKENIZER_STATE(BeforeAttributeName) \
    __ENUMERATE_TOKENIZER_STATE(AttributeName) \
    __ENUMERATE_TOKENIZER_STATE(AfterAttributeName) \
    __ENUMERATE_TOKENIZER_STATE(BeforeAttributeValue) \
    __ENUMERATE_TOKENIZER_STATE(AttributeValueDoubleQuoted) \
    __ENUMERATE_TOKENIZER_STATE(AttributeValueSingleQuoted) \
    __ENUMERATE_TOKENIZER_STATE(AttributeValueUnquoted) \
    __ENUMERATE_TOKENIZER_STATE(AfterAttributeValueQuoted) \
    __ENUMERATE_TOKENIZER_STATE(SelfClosingStartTag) \
    __ENUMERATE_TOKENIZER_STATE(BogusComment) \
    __ENUMERATE_TOKENIZER_STATE(MarkupDeclarationOpen) \
    __ENUMERATE_TOKENIZER_STATE(CommentStart) \
    __ENUMERATE_TOKENIZER_STATE(CommentStartDash) \
    __ENUMERATE_TOKENIZER_STATE(Comment) \
    __ENUMERATE_TOKENIZER_STATE(CommentLessThanSign) \
    __ENUMERATE_TOKENIZER_STATE(CommentLessThanSignBang) \
    __ENUMERATE_TOKENIZER_STATE(CommentLessThanSignBangDash) \
    __ENUMERATE_TOKENIZER_STATE(CommentLessThanSignBangDashDash) \
    __ENUMERATE_TOKENIZER_STATE(CommentEndDash) \
    __ENUMERATE_TOKENIZER_STATE(CommentEnd) \
    __ENUMERATE_TOKENIZER_STATE(CommentEndBang) \
    __ENUMERATE_TOKENIZER_STATE(DOCTYPE) \
    __ENUMERATE_TOKENIZER_STATE(BeforeDOCTYPEName) \
    __ENUMERATE_TOKENIZER_STATE(DOCTYPEName) \
    __ENUMERATE_TOKENIZER_STATE(AfterDOCTYPEName) \
    __ENUMERATE_TOKENIZER_STATE(AfterDOCTYPEPublicKeyword) \
    __ENUMERATE_TOKENIZER_STATE(BeforeDOCTYPEPublicIdentifier) \
    __ENUMERATE_TOKENIZER_STATE(DOCTYPEPublicIdentifierDoubleQuoted) \
    __ENUMERATE_TOKENIZER_STATE(DOCTYPEPublicIdentifierSingleQuoted) \
    __ENUMERATE_TOKENIZER_STATE(AfterDOCTYPEPublicIdentifier) \
    __ENUMERATE_TOKENIZER_STATE(BetweenDOCTYPEPublicAndSystemIdentifiers) \
    __ENUMERATE_TOKENIZER_STATE(AfterDOCTYPESystemKeyword) \
    __ENUMERATE_TOKENIZER_STATE(BeforeDOCTYPESystemIdentifier) \
    __ENUMERATE_TOKENIZER_STATE(DOCTYPESystemIdentifierDoubleQuoted) \
    __ENUMERATE_TOKENIZER_STATE(DOCTYPESystemIdentifierSingleQuoted) \
    __ENUMERATE_TOKENIZER_STATE(AfterDOCTYPESystemIdentifier) \
    __ENUMERATE_TOKENIZER_STATE(BogusDOCTYPE) \
    __ENUMERATE_TOKENIZER_STATE(CDATASection) \
    __ENUMERATE_TOKENIZER_STATE(CDATASectionBracket) \
    __ENUMERATE_TOKENIZER_STATE(CDATASectionEnd) \
    __ENUMERATE_TOKENIZER_STATE(CharacterReference) \
    __ENUMERATE_TOKENIZER_STATE(NamedCharacterReference) \
    __ENUMERATE_TOKENIZER_STATE(AmbiguousAmpersand) \
    __ENUMERATE_TOKENIZER_STATE(NumericCharacterReference) \
    __ENUMERATE_TOKENIZER_STATE(HexadecimalCharacterReferenceStart) \
    __ENUMERATE_TOKENIZER_STATE(DecimalCharacterReferenceStart) \
    __ENUMERATE_TOKENIZER_STATE(HexadecimalCharacterReference) \
    __ENUMERATE_TOKENIZER_STATE(DecimalCharacterReference) \
    __ENUMERATE_TOKENIZER_STATE(NumericCharacterReferenceEnd)
101
102class HTMLTokenizer {
103public:
104 explicit HTMLTokenizer();
105 explicit HTMLTokenizer(StringView input, DeprecatedString const& encoding);
106
107 enum class State {
108#define __ENUMERATE_TOKENIZER_STATE(state) state,
109 ENUMERATE_TOKENIZER_STATES
110#undef __ENUMERATE_TOKENIZER_STATE
111 };
112
113 Optional<HTMLToken> next_token();
114
115 void set_parser(Badge<HTMLParser>, HTMLParser& parser) { m_parser = &parser; }
116
117 void switch_to(Badge<HTMLParser>, State new_state);
118 void switch_to(State new_state)
119 {
120 m_state = new_state;
121 }
122
123 void set_blocked(bool b) { m_blocked = b; }
124 bool is_blocked() const { return m_blocked; }
125
126 DeprecatedString source() const { return m_decoded_input; }
127
128 void insert_input_at_insertion_point(DeprecatedString const& input);
129 void insert_eof();
130 bool is_eof_inserted();
131
132 bool is_insertion_point_defined() const { return m_insertion_point.defined; }
133 bool is_insertion_point_reached()
134 {
135 return m_insertion_point.defined && m_utf8_view.iterator_offset(m_utf8_iterator) >= m_insertion_point.position;
136 }
137 void undefine_insertion_point() { m_insertion_point.defined = false; }
138 void store_insertion_point() { m_old_insertion_point = m_insertion_point; }
139 void restore_insertion_point() { m_insertion_point = m_old_insertion_point; }
140 void update_insertion_point()
141 {
142 m_insertion_point.defined = true;
143 m_insertion_point.position = m_utf8_view.iterator_offset(m_utf8_iterator);
144 }
145
146 // This permanently cuts off the tokenizer input stream.
147 void abort() { m_aborted = true; }
148
149private:
150 void skip(size_t count);
151 Optional<u32> next_code_point();
152 Optional<u32> peek_code_point(size_t offset) const;
153 bool consume_next_if_match(StringView, CaseSensitivity = CaseSensitivity::CaseSensitive);
154 void create_new_token(HTMLToken::Type);
155 bool current_end_tag_token_is_appropriate() const;
156 DeprecatedString consume_current_builder();
157
158 static char const* state_name(State state)
159 {
160 switch (state) {
161#define __ENUMERATE_TOKENIZER_STATE(state) \
162 case State::state: \
163 return #state;
164 ENUMERATE_TOKENIZER_STATES
165#undef __ENUMERATE_TOKENIZER_STATE
166 };
167 VERIFY_NOT_REACHED();
168 }
169
170 void will_emit(HTMLToken&);
171 void will_switch_to(State);
172 void will_reconsume_in(State);
173
174 bool consumed_as_part_of_an_attribute() const;
175
176 void restore_to(Utf8CodePointIterator const& new_iterator);
177 HTMLToken::Position nth_last_position(size_t n = 0);
178
179 HTMLParser* m_parser { nullptr };
180
181 State m_state { State::Data };
182 State m_return_state { State::Data };
183
184 Vector<u32> m_temporary_buffer;
185
186 DeprecatedString m_decoded_input;
187
188 struct InsertionPoint {
189 size_t position { 0 };
190 bool defined { false };
191 };
192 InsertionPoint m_insertion_point {};
193 InsertionPoint m_old_insertion_point {};
194
195 Utf8View m_utf8_view;
196 Utf8CodePointIterator m_utf8_iterator;
197 Utf8CodePointIterator m_prev_utf8_iterator;
198
199 HTMLToken m_current_token;
200 StringBuilder m_current_builder;
201
202 Optional<DeprecatedString> m_last_emitted_start_tag_name;
203
204 bool m_explicit_eof_inserted { false };
205 bool m_has_emitted_eof { false };
206
207 Queue<HTMLToken> m_queued_tokens;
208
209 u32 m_character_reference_code { 0 };
210
211 bool m_blocked { false };
212
213 bool m_aborted { false };
214
215 Vector<HTMLToken::Position> m_source_positions;
216};
217
218}