Serenity Operating System
at portability 380 lines 12 kB view raw
1/* 2 * Copyright (c) 2018-2020, Andreas Kling <kling@serenityos.org> 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions are met: 7 * 8 * 1. Redistributions of source code must retain the above copyright notice, this 9 * list of conditions and the following disclaimer. 10 * 11 * 2. Redistributions in binary form must reproduce the above copyright notice, 12 * this list of conditions and the following disclaimer in the documentation 13 * and/or other materials provided with the distribution. 14 * 15 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 21 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 22 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 23 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 24 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 */ 26 27#include <AK/Function.h> 28#include <AK/NonnullRefPtrVector.h> 29#include <AK/StringBuilder.h> 30#include <LibHTML/DOM/Comment.h> 31#include <LibHTML/DOM/DocumentFragment.h> 32#include <LibHTML/DOM/DocumentType.h> 33#include <LibHTML/DOM/Element.h> 34#include <LibHTML/DOM/ElementFactory.h> 35#include <LibHTML/DOM/Text.h> 36#include <LibHTML/Parser/HTMLParser.h> 37#include <ctype.h> 38#include <stdio.h> 39 40static bool is_valid_in_attribute_name(char ch) 41{ 42 return isalnum(ch) || ch == '_' || ch == '-'; 43} 44 45static bool is_self_closing_tag(const StringView& tag_name) 46{ 47 return tag_name == "area" 48 || tag_name == "base" 49 || tag_name == "br" 50 || tag_name == "col" 51 || tag_name == "embed" 52 || tag_name == "hr" 53 || tag_name == "img" 54 || tag_name == "input" 55 || tag_name == "link" 56 || tag_name == "meta" 57 || tag_name == "param" 58 || tag_name == "source" 59 || tag_name == "track" 60 || tag_name == "wbr"; 61} 62 63static bool parse_html_document(const StringView& html, Document& document, ParentNode& root) 64{ 65 NonnullRefPtrVector<ParentNode> node_stack; 66 node_stack.append(root); 67 68 enum class State { 69 Free = 0, 70 BeforeTagName, 71 InTagName, 72 InDoctype, 73 InComment, 74 InAttributeList, 75 InAttributeName, 76 BeforeAttributeValue, 77 InAttributeValueNoQuote, 78 InAttributeValueSingleQuote, 79 InAttributeValueDoubleQuote, 80 }; 81 82 auto state = State::Free; 83 84 StringBuilder text_buffer; 85 86 Vector<char, 32> tag_name_buffer; 87 88 Vector<Attribute> attributes; 89 Vector<char, 256> attribute_name_buffer; 90 Vector<char, 256> attribute_value_buffer; 91 92 bool is_slash_tag = false; 93 bool is_exclamation_tag = false; 94 95 auto move_to_state = [&](State new_state) { 96 if (new_state == State::BeforeTagName) { 97 is_slash_tag = false; 98 is_exclamation_tag = false; 99 tag_name_buffer.clear(); 100 attributes.clear(); 101 } 102 if (new_state == State::InAttributeName) 103 attribute_name_buffer.clear(); 104 if (new_state == State::BeforeAttributeValue) 105 attribute_value_buffer.clear(); 106 if (state == State::Free && !text_buffer.string_view().is_empty()) { 107 auto text_node = adopt(*new Text(document, text_buffer.to_string())); 108 node_stack.last().append_child(text_node, false); 109 } 110 state = new_state; 111 text_buffer.clear(); 112 }; 113 114 auto close_tag = [&] { 115 if (node_stack.size() > 1) 116 node_stack.take_last(); 117 }; 118 119 auto open_tag = [&] { 120 auto new_element = create_element(document, String::copy(tag_name_buffer)); 121 tag_name_buffer.clear(); 122 new_element->set_attributes(move(attributes)); 123 node_stack.append(new_element); 124 if (node_stack.size() != 1) 125 node_stack[node_stack.size() - 2].append_child(new_element, false); 126 127 if (is_self_closing_tag(new_element->tag_name())) 128 close_tag(); 129 }; 130 131 auto commit_doctype = [&] { 132 node_stack.last().append_child(adopt(*new DocumentType(document)), false); 133 }; 134 135 auto commit_comment = [&] { 136 node_stack.last().append_child(adopt(*new Comment(document, text_buffer.to_string())), false); 137 }; 138 139 auto commit_tag = [&] { 140 if (is_slash_tag) 141 close_tag(); 142 else 143 open_tag(); 144 }; 145 146 auto commit_attribute = [&] { 147 if (!attribute_name_buffer.is_empty()) { 148 auto name = String::copy(attribute_name_buffer); 149 String value; 150 if (attribute_value_buffer.is_empty()) 151 value = String::empty(); 152 else 153 value = String::copy(attribute_value_buffer); 154 attributes.empend(name, value); 155 } 156 }; 157 158 for (size_t i = 0; i < html.length(); ++i) { 159 auto peek = [&](size_t offset) -> char { 160 if (i + offset >= html.length()) 161 return '\0'; 162 return html[i + offset]; 163 }; 164 char ch = html[i]; 165 switch (state) { 166 case State::Free: 167 if (ch == '<') { 168 is_slash_tag = false; 169 move_to_state(State::BeforeTagName); 170 break; 171 } 172 if (ch != '&') { 173 text_buffer.append(ch); 174 } else { 175 struct Escape { 176 const char* code; 177 const char* value; 178 }; 179 static Escape escapes[] = { 180 { "&lt;", "<" }, 181 { "&gt;", ">" }, 182 { "&amp;", "&" }, 183 { "&mdash;", "-" }, 184 }; 185 auto rest_of_html = html.substring_view(i, html.length() - i); 186 bool found = false; 187 for (auto& escape : escapes) { 188 if (rest_of_html.starts_with(escape.code)) { 189 text_buffer.append(escape.value); 190 found = true; 191 i += strlen(escape.code) - 1; 192 break; 193 } 194 } 195 if (!found) 196 dbg() << "Unhandled escape sequence"; 197 } 198 break; 199 case State::BeforeTagName: 200 if (ch == '/') { 201 is_slash_tag = true; 202 break; 203 } 204 if (ch == '!') { 205 if (toupper(peek(1)) == 'D' 206 && toupper(peek(2)) == 'O' 207 && toupper(peek(3)) == 'C' 208 && toupper(peek(4)) == 'T' 209 && toupper(peek(5)) == 'Y' 210 && toupper(peek(6)) == 'P' 211 && toupper(peek(7)) == 'E') { 212 i += 7; 213 move_to_state(State::InDoctype); 214 break; 215 } 216 if (peek(1) == '-' && peek(2) == '-') { 217 i += 2; 218 move_to_state(State::InComment); 219 break; 220 } 221 break; 222 } 223 if (ch == '>') { 224 move_to_state(State::Free); 225 break; 226 } 227 if (!isalpha(ch)) 228 break; 229 move_to_state(State::InTagName); 230 [[fallthrough]]; 231 case State::InTagName: 232 if (isspace(ch)) { 233 move_to_state(State::InAttributeList); 234 break; 235 } 236 if (ch == '>') { 237 commit_tag(); 238 move_to_state(State::Free); 239 break; 240 } 241 tag_name_buffer.append(ch); 242 break; 243 case State::InDoctype: 244 if (ch == '>') { 245 commit_doctype(); 246 move_to_state(State::Free); 247 break; 248 } 249 break; 250 case State::InComment: 251 if (ch == '-' && peek(1) == '-' && peek(2) == '>') { 252 commit_comment(); 253 i += 2; 254 move_to_state(State::Free); 255 break; 256 } 257 text_buffer.append(ch); 258 break; 259 case State::InAttributeList: 260 if (ch == '>') { 261 commit_tag(); 262 move_to_state(State::Free); 263 break; 264 } 265 if (!isalpha(ch)) 266 break; 267 move_to_state(State::InAttributeName); 268 [[fallthrough]]; 269 case State::InAttributeName: 270 if (is_valid_in_attribute_name(ch)) { 271 attribute_name_buffer.append(ch); 272 break; 273 } 274 if (isspace(ch)) { 275 commit_attribute(); 276 break; 277 } 278 279 if (ch == '>') { 280 commit_attribute(); 281 commit_tag(); 282 move_to_state(State::Free); 283 break; 284 } 285 286 if (ch == '=') { 287 move_to_state(State::BeforeAttributeValue); 288 break; 289 } 290 break; 291 case State::BeforeAttributeValue: 292 if (ch == '\'') { 293 move_to_state(State::InAttributeValueSingleQuote); 294 break; 295 } 296 if (ch == '"') { 297 move_to_state(State::InAttributeValueDoubleQuote); 298 break; 299 } 300 if (ch == '>') { 301 commit_tag(); 302 move_to_state(State::Free); 303 break; 304 } 305 if (isspace(ch)) { 306 commit_attribute(); 307 move_to_state(State::InAttributeList); 308 break; 309 } 310 move_to_state(State::InAttributeValueNoQuote); 311 [[fallthrough]]; 312 case State::InAttributeValueNoQuote: 313 if (isspace(ch)) { 314 commit_attribute(); 315 move_to_state(State::InAttributeList); 316 break; 317 } 318 if (ch == '>') { 319 commit_attribute(); 320 commit_tag(); 321 move_to_state(State::Free); 322 break; 323 } 324 attribute_value_buffer.append(ch); 325 break; 326 case State::InAttributeValueSingleQuote: 327 if (ch == '\'') { 328 commit_attribute(); 329 move_to_state(State::InAttributeList); 330 break; 331 } 332 attribute_value_buffer.append(ch); 333 break; 334 case State::InAttributeValueDoubleQuote: 335 if (ch == '"') { 336 commit_attribute(); 337 move_to_state(State::InAttributeList); 338 break; 339 } 340 attribute_value_buffer.append(ch); 341 break; 342 default: 343 fprintf(stderr, "Unhandled state %d\n", (int)state); 344 ASSERT_NOT_REACHED(); 345 } 346 } 347 348 return true; 349} 350 351RefPtr<DocumentFragment> parse_html_fragment(Document& document, const StringView& html) 352{ 353 auto fragment = adopt(*new DocumentFragment(document)); 354 if (!parse_html_document(html, document, *fragment)) 355 return nullptr; 356 return fragment; 357} 358 359RefPtr<Document> parse_html_document(const StringView& html, const URL& url) 360{ 361 auto document = adopt(*new Document); 362 document->set_url(url); 363 document->set_source(html); 364 365 if (!parse_html_document(html, *document, *document)) 366 return nullptr; 367 368 document->fixup(); 369 370 Function<void(Node&)> fire_insertion_callbacks = [&](Node& node) { 371 for (auto* child = node.first_child(); child; child = child->next_sibling()) { 372 fire_insertion_callbacks(*child); 373 } 374 if (node.parent()) 375 node.inserted_into(*node.parent()); 376 }; 377 fire_insertion_callbacks(document); 378 379 return document; 380}