Serenity Operating System
at hosted 412 lines 13 kB view raw
1/* 2 * Copyright (c) 2018-2020, Andreas Kling <kling@serenityos.org> 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions are met: 7 * 8 * 1. Redistributions of source code must retain the above copyright notice, this 9 * list of conditions and the following disclaimer. 10 * 11 * 2. Redistributions in binary form must reproduce the above copyright notice, 12 * this list of conditions and the following disclaimer in the documentation 13 * and/or other materials provided with the distribution. 14 * 15 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 21 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 22 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 23 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 24 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 */ 26 27#include <AK/Function.h> 28#include <AK/NonnullRefPtrVector.h> 29#include <AK/StringBuilder.h> 30#include <LibWeb/DOM/Comment.h> 31#include <LibWeb/DOM/DocumentFragment.h> 32#include <LibWeb/DOM/DocumentType.h> 33#include <LibWeb/DOM/Element.h> 34#include <LibWeb/DOM/ElementFactory.h> 35#include <LibWeb/DOM/Event.h> 36#include <LibWeb/DOM/Text.h> 37#include <LibWeb/Parser/HTMLParser.h> 38#include <ctype.h> 39#include <stdio.h> 40 41namespace Web { 42 43static bool is_valid_in_attribute_name(char ch) 44{ 45 return isalnum(ch) || ch == '_' || ch == '-'; 46} 47 48static bool is_self_closing_tag(const StringView& tag_name) 49{ 50 return tag_name == "area" 51 || tag_name == "base" 52 || tag_name == "br" 53 || tag_name == "col" 54 || tag_name == "embed" 55 || tag_name == "hr" 56 || tag_name == "img" 57 || tag_name == "input" 58 || tag_name == "link" 59 || tag_name == "meta" 60 || tag_name == "param" 61 || tag_name == "source" 62 || tag_name == "track" 63 || tag_name == "wbr"; 64} 65 66static bool parse_html_document(const StringView& html, Document& document, ParentNode& root) 67{ 68 NonnullRefPtrVector<ParentNode> node_stack; 69 node_stack.append(root); 70 71 enum class State { 72 Free = 0, 73 BeforeTagName, 74 InTagName, 75 InDoctype, 76 InComment, 77 InAttributeList, 78 InAttributeName, 79 BeforeAttributeValue, 80 InAttributeValueNoQuote, 81 InAttributeValueSingleQuote, 82 InAttributeValueDoubleQuote, 83 }; 84 85 auto state = State::Free; 86 87 StringBuilder text_buffer; 88 89 Vector<char, 32> tag_name_buffer; 90 91 Vector<Attribute> attributes; 92 Vector<char, 256> attribute_name_buffer; 93 Vector<char, 256> attribute_value_buffer; 94 95 bool is_slash_tag = false; 96 bool is_exclamation_tag = false; 97 98 auto commit_text_node = [&] { 99 auto text_node = adopt(*new Text(document, text_buffer.to_string())); 100 node_stack.last().append_child(text_node); 101 text_buffer.clear(); 102 }; 103 104 auto move_to_state = [&](State new_state) { 105 if (new_state == State::BeforeTagName) { 106 is_slash_tag = false; 107 is_exclamation_tag = false; 108 tag_name_buffer.clear(); 109 attributes.clear(); 110 } 111 if (new_state == State::InAttributeName) 112 attribute_name_buffer.clear(); 113 if (new_state == State::BeforeAttributeValue) 114 attribute_value_buffer.clear(); 115 if (state == State::Free && !text_buffer.is_empty()) { 116 commit_text_node(); 117 } 118 state = new_state; 119 text_buffer.clear(); 120 }; 121 122 auto close_tag = [&] { 123 if (node_stack.size() > 1) 124 node_stack.take_last(); 125 }; 126 127 auto open_tag = [&] { 128 auto new_element = create_element(document, String::copy(tag_name_buffer)); 129 tag_name_buffer.clear(); 130 new_element->set_attributes(move(attributes)); 131 node_stack.append(new_element); 132 if (node_stack.size() != 1) { 133 node_stack[node_stack.size() - 2].append_child(new_element); 134 } 135 136 if (is_self_closing_tag(new_element->tag_name())) 137 close_tag(); 138 }; 139 140 auto commit_doctype = [&] { 141 node_stack.last().append_child(adopt(*new DocumentType(document))); 142 }; 143 144 auto commit_comment = [&] { 145 node_stack.last().append_child(adopt(*new Comment(document, text_buffer.to_string()))); 146 }; 147 148 auto commit_tag = [&] { 149 if (is_slash_tag) 150 close_tag(); 151 else 152 open_tag(); 153 }; 154 155 auto commit_attribute = [&] { 156 if (!attribute_name_buffer.is_empty()) { 157 auto name = String::copy(attribute_name_buffer); 158 String value; 159 if (attribute_value_buffer.is_empty()) 160 value = String::empty(); 161 else 162 value = String::copy(attribute_value_buffer); 163 attributes.empend(name, value); 164 } 165 }; 166 167 for (size_t i = 0; i < html.length(); ++i) { 168 auto peek = [&](size_t offset) -> char { 169 if (i + offset >= html.length()) 170 return '\0'; 171 return html[i + offset]; 172 }; 173 char ch = html[i]; 174 switch (state) { 175 case State::Free: 176 if (ch == '<') { 177 bool should_treat_as_text = false; 178 if (node_stack.last().tag_name() == "script") { 179 bool is_script_close_tag = peek(1) == '/' 180 && tolower(peek(2)) == 's' 181 && tolower(peek(3)) == 'c' 182 && tolower(peek(4)) == 'r' 183 && tolower(peek(5)) == 'i' 184 && tolower(peek(6)) == 'p' 185 && tolower(peek(7)) == 't' 186 && tolower(peek(8)) == '>'; 187 if (!is_script_close_tag) 188 should_treat_as_text = true; 189 } 190 if (!should_treat_as_text) { 191 is_slash_tag = false; 192 move_to_state(State::BeforeTagName); 193 break; 194 } 195 } 196 197 if (ch != '&') { 198 text_buffer.append(ch); 199 } else { 200 struct Escape { 201 const char* code; 202 const char* value; 203 }; 204 static Escape escapes[] = { 205 { "&lt;", "<" }, 206 { "&gt;", ">" }, 207 { "&amp;", "&" }, 208 { "&mdash;", "-" }, 209 }; 210 auto rest_of_html = html.substring_view(i, html.length() - i); 211 bool found = false; 212 for (auto& escape : escapes) { 213 if (rest_of_html.starts_with(escape.code)) { 214 text_buffer.append(escape.value); 215 found = true; 216 i += strlen(escape.code) - 1; 217 break; 218 } 219 } 220 if (!found) 221 dbg() << "Unhandled escape sequence"; 222 } 223 break; 224 case State::BeforeTagName: 225 if (ch == '/') { 226 is_slash_tag = true; 227 break; 228 } 229 if (ch == '!') { 230 if (toupper(peek(1)) == 'D' 231 && toupper(peek(2)) == 'O' 232 && toupper(peek(3)) == 'C' 233 && toupper(peek(4)) == 'T' 234 && toupper(peek(5)) == 'Y' 235 && toupper(peek(6)) == 'P' 236 && toupper(peek(7)) == 'E') { 237 i += 7; 238 move_to_state(State::InDoctype); 239 break; 240 } 241 if (peek(1) == '-' && peek(2) == '-') { 242 i += 2; 243 move_to_state(State::InComment); 244 break; 245 } 246 break; 247 } 248 if (ch == '>') { 249 move_to_state(State::Free); 250 break; 251 } 252 if (!isalpha(ch)) 253 break; 254 move_to_state(State::InTagName); 255 [[fallthrough]]; 256 case State::InTagName: 257 if (isspace(ch)) { 258 move_to_state(State::InAttributeList); 259 break; 260 } 261 if (ch == '>') { 262 commit_tag(); 263 move_to_state(State::Free); 264 break; 265 } 266 tag_name_buffer.append(ch); 267 break; 268 case State::InDoctype: 269 if (ch == '>') { 270 commit_doctype(); 271 move_to_state(State::Free); 272 break; 273 } 274 break; 275 case State::InComment: 276 if (ch == '-' && peek(1) == '-' && peek(2) == '>') { 277 commit_comment(); 278 i += 2; 279 move_to_state(State::Free); 280 break; 281 } 282 text_buffer.append(ch); 283 break; 284 case State::InAttributeList: 285 if (ch == '>') { 286 commit_tag(); 287 move_to_state(State::Free); 288 break; 289 } 290 if (!isalpha(ch)) 291 break; 292 move_to_state(State::InAttributeName); 293 [[fallthrough]]; 294 case State::InAttributeName: 295 if (is_valid_in_attribute_name(ch)) { 296 attribute_name_buffer.append(ch); 297 break; 298 } 299 if (isspace(ch)) { 300 commit_attribute(); 301 break; 302 } 303 304 if (ch == '>') { 305 commit_attribute(); 306 commit_tag(); 307 move_to_state(State::Free); 308 break; 309 } 310 311 if (ch == '=') { 312 move_to_state(State::BeforeAttributeValue); 313 break; 314 } 315 break; 316 case State::BeforeAttributeValue: 317 if (ch == '\'') { 318 move_to_state(State::InAttributeValueSingleQuote); 319 break; 320 } 321 if (ch == '"') { 322 move_to_state(State::InAttributeValueDoubleQuote); 323 break; 324 } 325 if (ch == '>') { 326 commit_tag(); 327 move_to_state(State::Free); 328 break; 329 } 330 if (isspace(ch)) { 331 commit_attribute(); 332 move_to_state(State::InAttributeList); 333 break; 334 } 335 move_to_state(State::InAttributeValueNoQuote); 336 [[fallthrough]]; 337 case State::InAttributeValueNoQuote: 338 if (isspace(ch)) { 339 commit_attribute(); 340 move_to_state(State::InAttributeList); 341 break; 342 } 343 if (ch == '>') { 344 commit_attribute(); 345 commit_tag(); 346 move_to_state(State::Free); 347 break; 348 } 349 attribute_value_buffer.append(ch); 350 break; 351 case State::InAttributeValueSingleQuote: 352 if (ch == '\'') { 353 commit_attribute(); 354 move_to_state(State::InAttributeList); 355 break; 356 } 357 attribute_value_buffer.append(ch); 358 break; 359 case State::InAttributeValueDoubleQuote: 360 if (ch == '"') { 361 commit_attribute(); 362 move_to_state(State::InAttributeList); 363 break; 364 } 365 attribute_value_buffer.append(ch); 366 break; 367 default: 368 fprintf(stderr, "Unhandled state %d\n", (int)state); 369 ASSERT_NOT_REACHED(); 370 } 371 } 372 373 if (!text_buffer.is_empty()) 374 commit_text_node(); 375 376 return true; 377} 378 379RefPtr<DocumentFragment> parse_html_fragment(Document& document, const StringView& html) 380{ 381 auto fragment = adopt(*new DocumentFragment(document)); 382 if (!parse_html_document(html, document, *fragment)) 383 return nullptr; 384 return fragment; 385} 386 387RefPtr<Document> parse_html_document(const StringView& html, const URL& url) 388{ 389 auto document = adopt(*new Document(url)); 390 document->set_source(html); 391 392 if (!parse_html_document(html, *document, *document)) 393 return nullptr; 394 395 document->fixup(); 396 397#if 0 398 Function<void(Node&)> fire_insertion_callbacks = [&](Node& node) { 399 for (auto* child = node.first_child(); child; child = child->next_sibling()) { 400 fire_insertion_callbacks(*child); 401 } 402 if (node.parent()) 403 node.inserted_into(*node.parent()); 404 }; 405 fire_insertion_callbacks(document); 406#endif 407 408 document->dispatch_event(Event::create("DOMContentLoaded")); 409 410 return document; 411} 412}