Serenity Operating System
1/*
2 * Copyright (c) 2018-2020, Andreas Kling <kling@serenityos.org>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * 1. Redistributions of source code must retain the above copyright notice, this
9 * list of conditions and the following disclaimer.
10 *
11 * 2. Redistributions in binary form must reproduce the above copyright notice,
12 * this list of conditions and the following disclaimer in the documentation
13 * and/or other materials provided with the distribution.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
16 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
21 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
22 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
23 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 */
26
27#include <AK/Function.h>
28#include <AK/NonnullRefPtrVector.h>
29#include <AK/StringBuilder.h>
30#include <LibHTML/DOM/Comment.h>
31#include <LibHTML/DOM/DocumentFragment.h>
32#include <LibHTML/DOM/DocumentType.h>
33#include <LibHTML/DOM/Element.h>
34#include <LibHTML/DOM/ElementFactory.h>
35#include <LibHTML/DOM/Text.h>
36#include <LibHTML/Parser/HTMLParser.h>
37#include <ctype.h>
38#include <stdio.h>
39
// Returns true if `ch` may appear inside an attribute name as far as this
// parser is concerned: an alphanumeric character, '_' or '-'.
static bool is_valid_in_attribute_name(char ch)
{
    // The <ctype.h> classifiers require a value representable as unsigned char
    // (or EOF). Plain `char` may be signed, so a non-ASCII byte would otherwise
    // be passed as a negative int, which is undefined behavior.
    return isalnum(static_cast<unsigned char>(ch)) || ch == '_' || ch == '-';
}
44
45static bool is_self_closing_tag(const StringView& tag_name)
46{
47 return tag_name == "area"
48 || tag_name == "base"
49 || tag_name == "br"
50 || tag_name == "col"
51 || tag_name == "embed"
52 || tag_name == "hr"
53 || tag_name == "img"
54 || tag_name == "input"
55 || tag_name == "link"
56 || tag_name == "meta"
57 || tag_name == "param"
58 || tag_name == "source"
59 || tag_name == "track"
60 || tag_name == "wbr";
61}
62
63static bool parse_html_document(const StringView& html, Document& document, ParentNode& root)
64{
65 NonnullRefPtrVector<ParentNode> node_stack;
66 node_stack.append(root);
67
68 enum class State {
69 Free = 0,
70 BeforeTagName,
71 InTagName,
72 InDoctype,
73 InComment,
74 InAttributeList,
75 InAttributeName,
76 BeforeAttributeValue,
77 InAttributeValueNoQuote,
78 InAttributeValueSingleQuote,
79 InAttributeValueDoubleQuote,
80 };
81
82 auto state = State::Free;
83
84 StringBuilder text_buffer;
85
86 Vector<char, 32> tag_name_buffer;
87
88 Vector<Attribute> attributes;
89 Vector<char, 256> attribute_name_buffer;
90 Vector<char, 256> attribute_value_buffer;
91
92 bool is_slash_tag = false;
93 bool is_exclamation_tag = false;
94
95 auto move_to_state = [&](State new_state) {
96 if (new_state == State::BeforeTagName) {
97 is_slash_tag = false;
98 is_exclamation_tag = false;
99 tag_name_buffer.clear();
100 attributes.clear();
101 }
102 if (new_state == State::InAttributeName)
103 attribute_name_buffer.clear();
104 if (new_state == State::BeforeAttributeValue)
105 attribute_value_buffer.clear();
106 if (state == State::Free && !text_buffer.string_view().is_empty()) {
107 auto text_node = adopt(*new Text(document, text_buffer.to_string()));
108 node_stack.last().append_child(text_node, false);
109 }
110 state = new_state;
111 text_buffer.clear();
112 };
113
114 auto close_tag = [&] {
115 if (node_stack.size() > 1)
116 node_stack.take_last();
117 };
118
119 auto open_tag = [&] {
120 auto new_element = create_element(document, String::copy(tag_name_buffer));
121 tag_name_buffer.clear();
122 new_element->set_attributes(move(attributes));
123 node_stack.append(new_element);
124 if (node_stack.size() != 1)
125 node_stack[node_stack.size() - 2].append_child(new_element, false);
126
127 if (is_self_closing_tag(new_element->tag_name()))
128 close_tag();
129 };
130
131 auto commit_doctype = [&] {
132 node_stack.last().append_child(adopt(*new DocumentType(document)), false);
133 };
134
135 auto commit_comment = [&] {
136 node_stack.last().append_child(adopt(*new Comment(document, text_buffer.to_string())), false);
137 };
138
139 auto commit_tag = [&] {
140 if (is_slash_tag)
141 close_tag();
142 else
143 open_tag();
144 };
145
146 auto commit_attribute = [&] {
147 if (!attribute_name_buffer.is_empty()) {
148 auto name = String::copy(attribute_name_buffer);
149 String value;
150 if (attribute_value_buffer.is_empty())
151 value = String::empty();
152 else
153 value = String::copy(attribute_value_buffer);
154 attributes.empend(name, value);
155 }
156 };
157
158 for (size_t i = 0; i < html.length(); ++i) {
159 auto peek = [&](size_t offset) -> char {
160 if (i + offset >= html.length())
161 return '\0';
162 return html[i + offset];
163 };
164 char ch = html[i];
165 switch (state) {
166 case State::Free:
167 if (ch == '<') {
168 is_slash_tag = false;
169 move_to_state(State::BeforeTagName);
170 break;
171 }
172 if (ch != '&') {
173 text_buffer.append(ch);
174 } else {
175 struct Escape {
176 const char* code;
177 const char* value;
178 };
179 static Escape escapes[] = {
180 { "<", "<" },
181 { ">", ">" },
182 { "&", "&" },
183 { "—", "-" },
184 };
185 auto rest_of_html = html.substring_view(i, html.length() - i);
186 bool found = false;
187 for (auto& escape : escapes) {
188 if (rest_of_html.starts_with(escape.code)) {
189 text_buffer.append(escape.value);
190 found = true;
191 i += strlen(escape.code) - 1;
192 break;
193 }
194 }
195 if (!found)
196 dbg() << "Unhandled escape sequence";
197 }
198 break;
199 case State::BeforeTagName:
200 if (ch == '/') {
201 is_slash_tag = true;
202 break;
203 }
204 if (ch == '!') {
205 if (toupper(peek(1)) == 'D'
206 && toupper(peek(2)) == 'O'
207 && toupper(peek(3)) == 'C'
208 && toupper(peek(4)) == 'T'
209 && toupper(peek(5)) == 'Y'
210 && toupper(peek(6)) == 'P'
211 && toupper(peek(7)) == 'E') {
212 i += 7;
213 move_to_state(State::InDoctype);
214 break;
215 }
216 if (peek(1) == '-' && peek(2) == '-') {
217 i += 2;
218 move_to_state(State::InComment);
219 break;
220 }
221 break;
222 }
223 if (ch == '>') {
224 move_to_state(State::Free);
225 break;
226 }
227 if (!isalpha(ch))
228 break;
229 move_to_state(State::InTagName);
230 [[fallthrough]];
231 case State::InTagName:
232 if (isspace(ch)) {
233 move_to_state(State::InAttributeList);
234 break;
235 }
236 if (ch == '>') {
237 commit_tag();
238 move_to_state(State::Free);
239 break;
240 }
241 tag_name_buffer.append(ch);
242 break;
243 case State::InDoctype:
244 if (ch == '>') {
245 commit_doctype();
246 move_to_state(State::Free);
247 break;
248 }
249 break;
250 case State::InComment:
251 if (ch == '-' && peek(1) == '-' && peek(2) == '>') {
252 commit_comment();
253 i += 2;
254 move_to_state(State::Free);
255 break;
256 }
257 text_buffer.append(ch);
258 break;
259 case State::InAttributeList:
260 if (ch == '>') {
261 commit_tag();
262 move_to_state(State::Free);
263 break;
264 }
265 if (!isalpha(ch))
266 break;
267 move_to_state(State::InAttributeName);
268 [[fallthrough]];
269 case State::InAttributeName:
270 if (is_valid_in_attribute_name(ch)) {
271 attribute_name_buffer.append(ch);
272 break;
273 }
274 if (isspace(ch)) {
275 commit_attribute();
276 break;
277 }
278
279 if (ch == '>') {
280 commit_attribute();
281 commit_tag();
282 move_to_state(State::Free);
283 break;
284 }
285
286 if (ch == '=') {
287 move_to_state(State::BeforeAttributeValue);
288 break;
289 }
290 break;
291 case State::BeforeAttributeValue:
292 if (ch == '\'') {
293 move_to_state(State::InAttributeValueSingleQuote);
294 break;
295 }
296 if (ch == '"') {
297 move_to_state(State::InAttributeValueDoubleQuote);
298 break;
299 }
300 if (ch == '>') {
301 commit_tag();
302 move_to_state(State::Free);
303 break;
304 }
305 if (isspace(ch)) {
306 commit_attribute();
307 move_to_state(State::InAttributeList);
308 break;
309 }
310 move_to_state(State::InAttributeValueNoQuote);
311 [[fallthrough]];
312 case State::InAttributeValueNoQuote:
313 if (isspace(ch)) {
314 commit_attribute();
315 move_to_state(State::InAttributeList);
316 break;
317 }
318 if (ch == '>') {
319 commit_attribute();
320 commit_tag();
321 move_to_state(State::Free);
322 break;
323 }
324 attribute_value_buffer.append(ch);
325 break;
326 case State::InAttributeValueSingleQuote:
327 if (ch == '\'') {
328 commit_attribute();
329 move_to_state(State::InAttributeList);
330 break;
331 }
332 attribute_value_buffer.append(ch);
333 break;
334 case State::InAttributeValueDoubleQuote:
335 if (ch == '"') {
336 commit_attribute();
337 move_to_state(State::InAttributeList);
338 break;
339 }
340 attribute_value_buffer.append(ch);
341 break;
342 default:
343 fprintf(stderr, "Unhandled state %d\n", (int)state);
344 ASSERT_NOT_REACHED();
345 }
346 }
347
348 return true;
349}
350
351RefPtr<DocumentFragment> parse_html_fragment(Document& document, const StringView& html)
352{
353 auto fragment = adopt(*new DocumentFragment(document));
354 if (!parse_html_document(html, document, *fragment))
355 return nullptr;
356 return fragment;
357}
358
359RefPtr<Document> parse_html_document(const StringView& html, const URL& url)
360{
361 auto document = adopt(*new Document);
362 document->set_url(url);
363 document->set_source(html);
364
365 if (!parse_html_document(html, *document, *document))
366 return nullptr;
367
368 document->fixup();
369
370 Function<void(Node&)> fire_insertion_callbacks = [&](Node& node) {
371 for (auto* child = node.first_child(); child; child = child->next_sibling()) {
372 fire_insertion_callbacks(*child);
373 }
374 if (node.parent())
375 node.inserted_into(*node.parent());
376 };
377 fire_insertion_callbacks(document);
378
379 return document;
380}