Serenity Operating System
at master 218 lines 10 kB view raw
1/* 2 * Copyright (c) 2020, Andreas Kling <kling@serenityos.org> 3 * Copyright (c) 2022, Linus Groh <linusg@serenityos.org> 4 * 5 * SPDX-License-Identifier: BSD-2-Clause 6 */ 7 8#pragma once 9 10#include <AK/Queue.h> 11#include <AK/StringBuilder.h> 12#include <AK/StringView.h> 13#include <AK/Types.h> 14#include <AK/Utf8View.h> 15#include <LibWeb/Forward.h> 16#include <LibWeb/HTML/Parser/HTMLToken.h> 17 18namespace Web::HTML { 19 20#define ENUMERATE_TOKENIZER_STATES \ 21 __ENUMERATE_TOKENIZER_STATE(Data) \ 22 __ENUMERATE_TOKENIZER_STATE(RCDATA) \ 23 __ENUMERATE_TOKENIZER_STATE(RAWTEXT) \ 24 __ENUMERATE_TOKENIZER_STATE(ScriptData) \ 25 __ENUMERATE_TOKENIZER_STATE(PLAINTEXT) \ 26 __ENUMERATE_TOKENIZER_STATE(TagOpen) \ 27 __ENUMERATE_TOKENIZER_STATE(EndTagOpen) \ 28 __ENUMERATE_TOKENIZER_STATE(TagName) \ 29 __ENUMERATE_TOKENIZER_STATE(RCDATALessThanSign) \ 30 __ENUMERATE_TOKENIZER_STATE(RCDATAEndTagOpen) \ 31 __ENUMERATE_TOKENIZER_STATE(RCDATAEndTagName) \ 32 __ENUMERATE_TOKENIZER_STATE(RAWTEXTLessThanSign) \ 33 __ENUMERATE_TOKENIZER_STATE(RAWTEXTEndTagOpen) \ 34 __ENUMERATE_TOKENIZER_STATE(RAWTEXTEndTagName) \ 35 __ENUMERATE_TOKENIZER_STATE(ScriptDataLessThanSign) \ 36 __ENUMERATE_TOKENIZER_STATE(ScriptDataEndTagOpen) \ 37 __ENUMERATE_TOKENIZER_STATE(ScriptDataEndTagName) \ 38 __ENUMERATE_TOKENIZER_STATE(ScriptDataEscapeStart) \ 39 __ENUMERATE_TOKENIZER_STATE(ScriptDataEscapeStartDash) \ 40 __ENUMERATE_TOKENIZER_STATE(ScriptDataEscaped) \ 41 __ENUMERATE_TOKENIZER_STATE(ScriptDataEscapedDash) \ 42 __ENUMERATE_TOKENIZER_STATE(ScriptDataEscapedDashDash) \ 43 __ENUMERATE_TOKENIZER_STATE(ScriptDataEscapedLessThanSign) \ 44 __ENUMERATE_TOKENIZER_STATE(ScriptDataEscapedEndTagOpen) \ 45 __ENUMERATE_TOKENIZER_STATE(ScriptDataEscapedEndTagName) \ 46 __ENUMERATE_TOKENIZER_STATE(ScriptDataDoubleEscapeStart) \ 47 __ENUMERATE_TOKENIZER_STATE(ScriptDataDoubleEscaped) \ 48 __ENUMERATE_TOKENIZER_STATE(ScriptDataDoubleEscapedDash) \ 49 __ENUMERATE_TOKENIZER_STATE(ScriptDataDoubleEscapedDashDash) \ 50 __ENUMERATE_TOKENIZER_STATE(ScriptDataDoubleEscapedLessThanSign) \ 51 __ENUMERATE_TOKENIZER_STATE(ScriptDataDoubleEscapeEnd) \ 52 __ENUMERATE_TOKENIZER_STATE(BeforeAttributeName) \ 53 __ENUMERATE_TOKENIZER_STATE(AttributeName) \ 54 __ENUMERATE_TOKENIZER_STATE(AfterAttributeName) \ 55 __ENUMERATE_TOKENIZER_STATE(BeforeAttributeValue) \ 56 __ENUMERATE_TOKENIZER_STATE(AttributeValueDoubleQuoted) \ 57 __ENUMERATE_TOKENIZER_STATE(AttributeValueSingleQuoted) \ 58 __ENUMERATE_TOKENIZER_STATE(AttributeValueUnquoted) \ 59 __ENUMERATE_TOKENIZER_STATE(AfterAttributeValueQuoted) \ 60 __ENUMERATE_TOKENIZER_STATE(SelfClosingStartTag) \ 61 __ENUMERATE_TOKENIZER_STATE(BogusComment) \ 62 __ENUMERATE_TOKENIZER_STATE(MarkupDeclarationOpen) \ 63 __ENUMERATE_TOKENIZER_STATE(CommentStart) \ 64 __ENUMERATE_TOKENIZER_STATE(CommentStartDash) \ 65 __ENUMERATE_TOKENIZER_STATE(Comment) \ 66 __ENUMERATE_TOKENIZER_STATE(CommentLessThanSign) \ 67 __ENUMERATE_TOKENIZER_STATE(CommentLessThanSignBang) \ 68 __ENUMERATE_TOKENIZER_STATE(CommentLessThanSignBangDash) \ 69 __ENUMERATE_TOKENIZER_STATE(CommentLessThanSignBangDashDash) \ 70 __ENUMERATE_TOKENIZER_STATE(CommentEndDash) \ 71 __ENUMERATE_TOKENIZER_STATE(CommentEnd) \ 72 __ENUMERATE_TOKENIZER_STATE(CommentEndBang) \ 73 __ENUMERATE_TOKENIZER_STATE(DOCTYPE) \ 74 __ENUMERATE_TOKENIZER_STATE(BeforeDOCTYPEName) \ 75 __ENUMERATE_TOKENIZER_STATE(DOCTYPEName) \ 76 __ENUMERATE_TOKENIZER_STATE(AfterDOCTYPEName) \ 77 __ENUMERATE_TOKENIZER_STATE(AfterDOCTYPEPublicKeyword) \ 78 __ENUMERATE_TOKENIZER_STATE(BeforeDOCTYPEPublicIdentifier) \ 79 __ENUMERATE_TOKENIZER_STATE(DOCTYPEPublicIdentifierDoubleQuoted) \ 80 __ENUMERATE_TOKENIZER_STATE(DOCTYPEPublicIdentifierSingleQuoted) \ 81 __ENUMERATE_TOKENIZER_STATE(AfterDOCTYPEPublicIdentifier) \ 82 __ENUMERATE_TOKENIZER_STATE(BetweenDOCTYPEPublicAndSystemIdentifiers) \ 83 __ENUMERATE_TOKENIZER_STATE(AfterDOCTYPESystemKeyword) \ 84 __ENUMERATE_TOKENIZER_STATE(BeforeDOCTYPESystemIdentifier) \ 85 __ENUMERATE_TOKENIZER_STATE(DOCTYPESystemIdentifierDoubleQuoted) \ 86 __ENUMERATE_TOKENIZER_STATE(DOCTYPESystemIdentifierSingleQuoted) \ 87 __ENUMERATE_TOKENIZER_STATE(AfterDOCTYPESystemIdentifier) \ 88 __ENUMERATE_TOKENIZER_STATE(BogusDOCTYPE) \ 89 __ENUMERATE_TOKENIZER_STATE(CDATASection) \ 90 __ENUMERATE_TOKENIZER_STATE(CDATASectionBracket) \ 91 __ENUMERATE_TOKENIZER_STATE(CDATASectionEnd) \ 92 __ENUMERATE_TOKENIZER_STATE(CharacterReference) \ 93 __ENUMERATE_TOKENIZER_STATE(NamedCharacterReference) \ 94 __ENUMERATE_TOKENIZER_STATE(AmbiguousAmpersand) \ 95 __ENUMERATE_TOKENIZER_STATE(NumericCharacterReference) \ 96 __ENUMERATE_TOKENIZER_STATE(HexadecimalCharacterReferenceStart) \ 97 __ENUMERATE_TOKENIZER_STATE(DecimalCharacterReferenceStart) \ 98 __ENUMERATE_TOKENIZER_STATE(HexadecimalCharacterReference) \ 99 __ENUMERATE_TOKENIZER_STATE(DecimalCharacterReference) \ 100 __ENUMERATE_TOKENIZER_STATE(NumericCharacterReferenceEnd) 101 102class HTMLTokenizer { 103public: 104 explicit HTMLTokenizer(); 105 explicit HTMLTokenizer(StringView input, DeprecatedString const& encoding); 106 107 enum class State { 108#define __ENUMERATE_TOKENIZER_STATE(state) state, 109 ENUMERATE_TOKENIZER_STATES 110#undef __ENUMERATE_TOKENIZER_STATE 111 }; 112 113 Optional<HTMLToken> next_token(); 114 115 void set_parser(Badge<HTMLParser>, HTMLParser& parser) { m_parser = &parser; } 116 117 void switch_to(Badge<HTMLParser>, State new_state); 118 void switch_to(State new_state) 119 { 120 m_state = new_state; 121 } 122 123 void set_blocked(bool b) { m_blocked = b; } 124 bool is_blocked() const { return m_blocked; } 125 126 DeprecatedString source() const { return m_decoded_input; } 127 128 void insert_input_at_insertion_point(DeprecatedString const& input); 129 void insert_eof(); 130 bool is_eof_inserted(); 131 132 bool is_insertion_point_defined() const { return m_insertion_point.defined; } 133 bool is_insertion_point_reached() 134 { 135 return m_insertion_point.defined && m_utf8_view.iterator_offset(m_utf8_iterator) >= m_insertion_point.position; 136 } 137 void undefine_insertion_point() { m_insertion_point.defined = false; } 138 void store_insertion_point() { m_old_insertion_point = m_insertion_point; } 139 void restore_insertion_point() { m_insertion_point = m_old_insertion_point; } 140 void update_insertion_point() 141 { 142 m_insertion_point.defined = true; 143 m_insertion_point.position = m_utf8_view.iterator_offset(m_utf8_iterator); 144 } 145 146 // This permanently cuts off the tokenizer input stream. 147 void abort() { m_aborted = true; } 148 149private: 150 void skip(size_t count); 151 Optional<u32> next_code_point(); 152 Optional<u32> peek_code_point(size_t offset) const; 153 bool consume_next_if_match(StringView, CaseSensitivity = CaseSensitivity::CaseSensitive); 154 void create_new_token(HTMLToken::Type); 155 bool current_end_tag_token_is_appropriate() const; 156 DeprecatedString consume_current_builder(); 157 158 static char const* state_name(State state) 159 { 160 switch (state) { 161#define __ENUMERATE_TOKENIZER_STATE(state) \ 162 case State::state: \ 163 return #state; 164 ENUMERATE_TOKENIZER_STATES 165#undef __ENUMERATE_TOKENIZER_STATE 166 }; 167 VERIFY_NOT_REACHED(); 168 } 169 170 void will_emit(HTMLToken&); 171 void will_switch_to(State); 172 void will_reconsume_in(State); 173 174 bool consumed_as_part_of_an_attribute() const; 175 176 void restore_to(Utf8CodePointIterator const& new_iterator); 177 HTMLToken::Position nth_last_position(size_t n = 0); 178 179 HTMLParser* m_parser { nullptr }; 180 181 State m_state { State::Data }; 182 State m_return_state { State::Data }; 183 184 Vector<u32> m_temporary_buffer; 185 186 DeprecatedString m_decoded_input; 187 188 struct InsertionPoint { 189 size_t position { 0 }; 190 bool defined { false }; 191 }; 192 InsertionPoint m_insertion_point {}; 193 InsertionPoint m_old_insertion_point {}; 194 195 Utf8View m_utf8_view; 196 Utf8CodePointIterator m_utf8_iterator; 197 Utf8CodePointIterator m_prev_utf8_iterator; 198 199 HTMLToken m_current_token; 200 StringBuilder m_current_builder; 201 202 Optional<DeprecatedString> m_last_emitted_start_tag_name; 203 204 bool m_explicit_eof_inserted { false }; 205 bool m_has_emitted_eof { false }; 206 207 Queue<HTMLToken> m_queued_tokens; 208 209 u32 m_character_reference_code { 0 }; 210 211 bool m_blocked { false }; 212 213 bool m_aborted { false }; 214 215 Vector<HTMLToken::Position> m_source_positions; 216}; 217 218}