Serenity Operating System
1/*
2 * Copyright (c) 2018-2020, Andreas Kling <kling@serenityos.org>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * 1. Redistributions of source code must retain the above copyright notice, this
9 * list of conditions and the following disclaimer.
10 *
11 * 2. Redistributions in binary form must reproduce the above copyright notice,
12 * this list of conditions and the following disclaimer in the documentation
13 * and/or other materials provided with the distribution.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
16 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
21 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
22 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
23 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 */
26
27#include <AK/Function.h>
28#include <AK/NonnullRefPtrVector.h>
29#include <AK/StringBuilder.h>
30#include <LibWeb/DOM/Comment.h>
31#include <LibWeb/DOM/DocumentFragment.h>
32#include <LibWeb/DOM/DocumentType.h>
33#include <LibWeb/DOM/Element.h>
34#include <LibWeb/DOM/ElementFactory.h>
35#include <LibWeb/DOM/Event.h>
36#include <LibWeb/DOM/Text.h>
37#include <LibWeb/Parser/HTMLParser.h>
38#include <ctype.h>
39#include <stdio.h>
40
41namespace Web {
42
// Returns true if `ch` is accepted as part of an attribute name by this parser:
// anything isalnum() classifies as alphanumeric, plus '_' and '-'.
static bool is_valid_in_attribute_name(char ch)
{
    // The <ctype.h> classifiers are undefined for negative arguments (other
    // than EOF) and plain `char` may be signed, so go through unsigned char.
    return isalnum(static_cast<unsigned char>(ch)) || ch == '_' || ch == '-';
}
47
48static bool is_self_closing_tag(const StringView& tag_name)
49{
50 return tag_name == "area"
51 || tag_name == "base"
52 || tag_name == "br"
53 || tag_name == "col"
54 || tag_name == "embed"
55 || tag_name == "hr"
56 || tag_name == "img"
57 || tag_name == "input"
58 || tag_name == "link"
59 || tag_name == "meta"
60 || tag_name == "param"
61 || tag_name == "source"
62 || tag_name == "track"
63 || tag_name == "wbr";
64}
65
66static bool parse_html_document(const StringView& html, Document& document, ParentNode& root)
67{
68 NonnullRefPtrVector<ParentNode> node_stack;
69 node_stack.append(root);
70
71 enum class State {
72 Free = 0,
73 BeforeTagName,
74 InTagName,
75 InDoctype,
76 InComment,
77 InAttributeList,
78 InAttributeName,
79 BeforeAttributeValue,
80 InAttributeValueNoQuote,
81 InAttributeValueSingleQuote,
82 InAttributeValueDoubleQuote,
83 };
84
85 auto state = State::Free;
86
87 StringBuilder text_buffer;
88
89 Vector<char, 32> tag_name_buffer;
90
91 Vector<Attribute> attributes;
92 Vector<char, 256> attribute_name_buffer;
93 Vector<char, 256> attribute_value_buffer;
94
95 bool is_slash_tag = false;
96 bool is_exclamation_tag = false;
97
98 auto commit_text_node = [&] {
99 auto text_node = adopt(*new Text(document, text_buffer.to_string()));
100 node_stack.last().append_child(text_node);
101 text_buffer.clear();
102 };
103
104 auto move_to_state = [&](State new_state) {
105 if (new_state == State::BeforeTagName) {
106 is_slash_tag = false;
107 is_exclamation_tag = false;
108 tag_name_buffer.clear();
109 attributes.clear();
110 }
111 if (new_state == State::InAttributeName)
112 attribute_name_buffer.clear();
113 if (new_state == State::BeforeAttributeValue)
114 attribute_value_buffer.clear();
115 if (state == State::Free && !text_buffer.is_empty()) {
116 commit_text_node();
117 }
118 state = new_state;
119 text_buffer.clear();
120 };
121
122 auto close_tag = [&] {
123 if (node_stack.size() > 1)
124 node_stack.take_last();
125 };
126
127 auto open_tag = [&] {
128 auto new_element = create_element(document, String::copy(tag_name_buffer));
129 tag_name_buffer.clear();
130 new_element->set_attributes(move(attributes));
131 node_stack.append(new_element);
132 if (node_stack.size() != 1) {
133 node_stack[node_stack.size() - 2].append_child(new_element);
134 }
135
136 if (is_self_closing_tag(new_element->tag_name()))
137 close_tag();
138 };
139
140 auto commit_doctype = [&] {
141 node_stack.last().append_child(adopt(*new DocumentType(document)));
142 };
143
144 auto commit_comment = [&] {
145 node_stack.last().append_child(adopt(*new Comment(document, text_buffer.to_string())));
146 };
147
148 auto commit_tag = [&] {
149 if (is_slash_tag)
150 close_tag();
151 else
152 open_tag();
153 };
154
155 auto commit_attribute = [&] {
156 if (!attribute_name_buffer.is_empty()) {
157 auto name = String::copy(attribute_name_buffer);
158 String value;
159 if (attribute_value_buffer.is_empty())
160 value = String::empty();
161 else
162 value = String::copy(attribute_value_buffer);
163 attributes.empend(name, value);
164 }
165 };
166
167 for (size_t i = 0; i < html.length(); ++i) {
168 auto peek = [&](size_t offset) -> char {
169 if (i + offset >= html.length())
170 return '\0';
171 return html[i + offset];
172 };
173 char ch = html[i];
174 switch (state) {
175 case State::Free:
176 if (ch == '<') {
177 bool should_treat_as_text = false;
178 if (node_stack.last().tag_name() == "script") {
179 bool is_script_close_tag = peek(1) == '/'
180 && tolower(peek(2)) == 's'
181 && tolower(peek(3)) == 'c'
182 && tolower(peek(4)) == 'r'
183 && tolower(peek(5)) == 'i'
184 && tolower(peek(6)) == 'p'
185 && tolower(peek(7)) == 't'
186 && tolower(peek(8)) == '>';
187 if (!is_script_close_tag)
188 should_treat_as_text = true;
189 }
190 if (!should_treat_as_text) {
191 is_slash_tag = false;
192 move_to_state(State::BeforeTagName);
193 break;
194 }
195 }
196
197 if (ch != '&') {
198 text_buffer.append(ch);
199 } else {
200 struct Escape {
201 const char* code;
202 const char* value;
203 };
204 static Escape escapes[] = {
205 { "<", "<" },
206 { ">", ">" },
207 { "&", "&" },
208 { "—", "-" },
209 };
210 auto rest_of_html = html.substring_view(i, html.length() - i);
211 bool found = false;
212 for (auto& escape : escapes) {
213 if (rest_of_html.starts_with(escape.code)) {
214 text_buffer.append(escape.value);
215 found = true;
216 i += strlen(escape.code) - 1;
217 break;
218 }
219 }
220 if (!found)
221 dbg() << "Unhandled escape sequence";
222 }
223 break;
224 case State::BeforeTagName:
225 if (ch == '/') {
226 is_slash_tag = true;
227 break;
228 }
229 if (ch == '!') {
230 if (toupper(peek(1)) == 'D'
231 && toupper(peek(2)) == 'O'
232 && toupper(peek(3)) == 'C'
233 && toupper(peek(4)) == 'T'
234 && toupper(peek(5)) == 'Y'
235 && toupper(peek(6)) == 'P'
236 && toupper(peek(7)) == 'E') {
237 i += 7;
238 move_to_state(State::InDoctype);
239 break;
240 }
241 if (peek(1) == '-' && peek(2) == '-') {
242 i += 2;
243 move_to_state(State::InComment);
244 break;
245 }
246 break;
247 }
248 if (ch == '>') {
249 move_to_state(State::Free);
250 break;
251 }
252 if (!isalpha(ch))
253 break;
254 move_to_state(State::InTagName);
255 [[fallthrough]];
256 case State::InTagName:
257 if (isspace(ch)) {
258 move_to_state(State::InAttributeList);
259 break;
260 }
261 if (ch == '>') {
262 commit_tag();
263 move_to_state(State::Free);
264 break;
265 }
266 tag_name_buffer.append(ch);
267 break;
268 case State::InDoctype:
269 if (ch == '>') {
270 commit_doctype();
271 move_to_state(State::Free);
272 break;
273 }
274 break;
275 case State::InComment:
276 if (ch == '-' && peek(1) == '-' && peek(2) == '>') {
277 commit_comment();
278 i += 2;
279 move_to_state(State::Free);
280 break;
281 }
282 text_buffer.append(ch);
283 break;
284 case State::InAttributeList:
285 if (ch == '>') {
286 commit_tag();
287 move_to_state(State::Free);
288 break;
289 }
290 if (!isalpha(ch))
291 break;
292 move_to_state(State::InAttributeName);
293 [[fallthrough]];
294 case State::InAttributeName:
295 if (is_valid_in_attribute_name(ch)) {
296 attribute_name_buffer.append(ch);
297 break;
298 }
299 if (isspace(ch)) {
300 commit_attribute();
301 break;
302 }
303
304 if (ch == '>') {
305 commit_attribute();
306 commit_tag();
307 move_to_state(State::Free);
308 break;
309 }
310
311 if (ch == '=') {
312 move_to_state(State::BeforeAttributeValue);
313 break;
314 }
315 break;
316 case State::BeforeAttributeValue:
317 if (ch == '\'') {
318 move_to_state(State::InAttributeValueSingleQuote);
319 break;
320 }
321 if (ch == '"') {
322 move_to_state(State::InAttributeValueDoubleQuote);
323 break;
324 }
325 if (ch == '>') {
326 commit_tag();
327 move_to_state(State::Free);
328 break;
329 }
330 if (isspace(ch)) {
331 commit_attribute();
332 move_to_state(State::InAttributeList);
333 break;
334 }
335 move_to_state(State::InAttributeValueNoQuote);
336 [[fallthrough]];
337 case State::InAttributeValueNoQuote:
338 if (isspace(ch)) {
339 commit_attribute();
340 move_to_state(State::InAttributeList);
341 break;
342 }
343 if (ch == '>') {
344 commit_attribute();
345 commit_tag();
346 move_to_state(State::Free);
347 break;
348 }
349 attribute_value_buffer.append(ch);
350 break;
351 case State::InAttributeValueSingleQuote:
352 if (ch == '\'') {
353 commit_attribute();
354 move_to_state(State::InAttributeList);
355 break;
356 }
357 attribute_value_buffer.append(ch);
358 break;
359 case State::InAttributeValueDoubleQuote:
360 if (ch == '"') {
361 commit_attribute();
362 move_to_state(State::InAttributeList);
363 break;
364 }
365 attribute_value_buffer.append(ch);
366 break;
367 default:
368 fprintf(stderr, "Unhandled state %d\n", (int)state);
369 ASSERT_NOT_REACHED();
370 }
371 }
372
373 if (!text_buffer.is_empty())
374 commit_text_node();
375
376 return true;
377}
378
379RefPtr<DocumentFragment> parse_html_fragment(Document& document, const StringView& html)
380{
381 auto fragment = adopt(*new DocumentFragment(document));
382 if (!parse_html_document(html, document, *fragment))
383 return nullptr;
384 return fragment;
385}
386
387RefPtr<Document> parse_html_document(const StringView& html, const URL& url)
388{
389 auto document = adopt(*new Document(url));
390 document->set_source(html);
391
392 if (!parse_html_document(html, *document, *document))
393 return nullptr;
394
395 document->fixup();
396
397#if 0
398 Function<void(Node&)> fire_insertion_callbacks = [&](Node& node) {
399 for (auto* child = node.first_child(); child; child = child->next_sibling()) {
400 fire_insertion_callbacks(*child);
401 }
402 if (node.parent())
403 node.inserted_into(*node.parent());
404 };
405 fire_insertion_callbacks(document);
406#endif
407
408 document->dispatch_event(Event::create("DOMContentLoaded"));
409
410 return document;
411}
412}