// Serenity Operating System
// Listing captured from the repository viewer (at master; original listing: 288 lines, 12 kB).
1/* 2 * Copyright (c) 2021, Max Wipfli <mail@maxwipfli.ch> 3 * 4 * SPDX-License-Identifier: BSD-2-Clause 5 */ 6 7#include <AK/CharacterTypes.h> 8#include <AK/GenericLexer.h> 9#include <AK/StringView.h> 10#include <AK/Utf8View.h> 11#include <LibTextCodec/Decoder.h> 12#include <LibWeb/DOM/Attr.h> 13#include <LibWeb/DOM/Document.h> 14#include <LibWeb/HTML/Parser/HTMLEncodingDetection.h> 15#include <LibWeb/Infra/CharacterTypes.h> 16#include <ctype.h> 17 18namespace Web::HTML { 19 20bool prescan_should_abort(ByteBuffer const& input, size_t const& position) 21{ 22 return position >= input.size() || position >= 1024; 23} 24 25bool prescan_is_whitespace_or_slash(u8 const& byte) 26{ 27 return byte == '\t' || byte == '\n' || byte == '\f' || byte == '\r' || byte == ' ' || byte == '/'; 28} 29 30bool prescan_skip_whitespace_and_slashes(ByteBuffer const& input, size_t& position) 31{ 32 while (!prescan_should_abort(input, position) && (input[position] == '\t' || input[position] == '\n' || input[position] == '\f' || input[position] == '\r' || input[position] == ' ' || input[position] == '/')) 33 ++position; 34 return !prescan_should_abort(input, position); 35} 36 37// https://html.spec.whatwg.org/multipage/urls-and-fetching.html#algorithm-for-extracting-a-character-encoding-from-a-meta-element 38Optional<StringView> extract_character_encoding_from_meta_element(DeprecatedString const& string) 39{ 40 // Checking for "charset" is case insensitive, as is getting an encoding. 41 // Therefore, stick to lowercase from the start for simplicity. 42 auto lowercase_string = string.to_lowercase(); 43 GenericLexer lexer(lowercase_string); 44 45 for (;;) { 46 auto charset_index = lexer.remaining().find("charset"sv); 47 if (!charset_index.has_value()) 48 return {}; 49 50 // 7 is the length of "charset". 
51 lexer.ignore(charset_index.value() + 7); 52 53 lexer.ignore_while([](char c) { 54 return Infra::is_ascii_whitespace(c); 55 }); 56 57 if (lexer.peek() != '=') 58 continue; 59 60 break; 61 } 62 63 // Ignore the '='. 64 lexer.ignore(); 65 66 lexer.ignore_while([](char c) { 67 return Infra::is_ascii_whitespace(c); 68 }); 69 70 if (lexer.is_eof()) 71 return {}; 72 73 if (lexer.consume_specific('"')) { 74 auto matching_double_quote = lexer.remaining().find('"'); 75 if (!matching_double_quote.has_value()) 76 return {}; 77 78 auto encoding = lexer.remaining().substring_view(0, matching_double_quote.value()); 79 return TextCodec::get_standardized_encoding(encoding); 80 } 81 82 if (lexer.consume_specific('\'')) { 83 auto matching_single_quote = lexer.remaining().find('\''); 84 if (!matching_single_quote.has_value()) 85 return {}; 86 87 auto encoding = lexer.remaining().substring_view(0, matching_single_quote.value()); 88 return TextCodec::get_standardized_encoding(encoding); 89 } 90 91 auto encoding = lexer.consume_until([](char c) { 92 return Infra::is_ascii_whitespace(c) || c == ';'; 93 }); 94 return TextCodec::get_standardized_encoding(encoding); 95} 96 97JS::GCPtr<DOM::Attr> prescan_get_attribute(DOM::Document& document, ByteBuffer const& input, size_t& position) 98{ 99 if (!prescan_skip_whitespace_and_slashes(input, position)) 100 return {}; 101 if (input[position] == '>') 102 return {}; 103 104 StringBuilder attribute_name; 105 while (true) { 106 if (input[position] == '=' && !attribute_name.is_empty()) { 107 ++position; 108 goto value; 109 } else if (input[position] == '\t' || input[position] == '\n' || input[position] == '\f' || input[position] == '\r' || input[position] == ' ') 110 goto spaces; 111 else if (input[position] == '/' || input[position] == '>') 112 return *DOM::Attr::create(document, attribute_name.to_deprecated_string(), "").release_value_but_fixme_should_propagate_errors(); 113 else 114 attribute_name.append_as_lowercase(input[position]); 115 
++position; 116 if (prescan_should_abort(input, position)) 117 return {}; 118 } 119 120spaces: 121 if (!prescan_skip_whitespace_and_slashes(input, position)) 122 return {}; 123 if (input[position] != '=') 124 return DOM::Attr::create(document, attribute_name.to_deprecated_string(), "").release_value_but_fixme_should_propagate_errors(); 125 ++position; 126 127value: 128 if (!prescan_skip_whitespace_and_slashes(input, position)) 129 return {}; 130 131 StringBuilder attribute_value; 132 if (input[position] == '"' || input[position] == '\'') { 133 u8 quote_character = input[position]; 134 ++position; 135 for (; !prescan_should_abort(input, position); ++position) { 136 if (input[position] == quote_character) 137 return DOM::Attr::create(document, attribute_name.to_deprecated_string(), attribute_value.to_deprecated_string()).release_value_but_fixme_should_propagate_errors(); 138 else 139 attribute_value.append_as_lowercase(input[position]); 140 } 141 return {}; 142 } else if (input[position] == '>') 143 return DOM::Attr::create(document, attribute_name.to_deprecated_string(), "").release_value_but_fixme_should_propagate_errors(); 144 else 145 attribute_value.append_as_lowercase(input[position]); 146 147 ++position; 148 if (prescan_should_abort(input, position)) 149 return {}; 150 151 for (; !prescan_should_abort(input, position); ++position) { 152 if (input[position] == '\t' || input[position] == '\n' || input[position] == '\f' || input[position] == '\r' || input[position] == ' ' || input[position] == '>') 153 return DOM::Attr::create(document, attribute_name.to_deprecated_string(), attribute_value.to_deprecated_string()).release_value_but_fixme_should_propagate_errors(); 154 else 155 attribute_value.append_as_lowercase(input[position]); 156 } 157 return {}; 158} 159 160// https://html.spec.whatwg.org/multipage/parsing.html#prescan-a-byte-stream-to-determine-its-encoding 161Optional<DeprecatedString> run_prescan_byte_stream_algorithm(DOM::Document& document, ByteBuffer 
const& input) 162{ 163 // https://html.spec.whatwg.org/multipage/parsing.html#prescan-a-byte-stream-to-determine-its-encoding 164 165 // Detects '<?x' 166 if (!prescan_should_abort(input, 6)) { 167 if (input[0] == 0x3C && input[1] == 0x00 && input[2] == 0x3F && input[3] == 0x00 && input[4] == 0x78 && input[5] == 0x00) 168 return "utf-16le"; 169 if (input[0] == 0x00 && input[1] == 0x3C && input[2] == 0x00 && input[4] == 0x3F && input[5] == 0x00 && input[6] == 0x78) 170 return "utf-16be"; 171 } 172 173 for (size_t position = 0; !prescan_should_abort(input, position); ++position) { 174 if (!prescan_should_abort(input, position + 5) && input[position] == '<' && input[position + 1] == '!' 175 && input[position + 2] == '-' && input[position + 3] == '-') { 176 position += 2; 177 for (; !prescan_should_abort(input, position + 3); ++position) { 178 if (input[position] == '-' && input[position + 1] == '-' && input[position + 2] == '>') { 179 position += 2; 180 break; 181 } 182 } 183 } else if (!prescan_should_abort(input, position + 6) 184 && input[position] == '<' 185 && (input[position + 1] == 'M' || input[position + 1] == 'm') 186 && (input[position + 2] == 'E' || input[position + 2] == 'e') 187 && (input[position + 3] == 'T' || input[position + 3] == 't') 188 && (input[position + 4] == 'A' || input[position + 4] == 'a') 189 && prescan_is_whitespace_or_slash(input[position + 5])) { 190 position += 6; 191 Vector<DeprecatedString> attribute_list {}; 192 bool got_pragma = false; 193 Optional<bool> need_pragma {}; 194 Optional<DeprecatedString> charset {}; 195 196 while (true) { 197 auto attribute = prescan_get_attribute(document, input, position); 198 if (!attribute) 199 break; 200 if (attribute_list.contains_slow(attribute->name())) 201 continue; 202 auto& attribute_name = attribute->name(); 203 attribute_list.append(attribute->name()); 204 205 if (attribute_name == "http-equiv") { 206 got_pragma = attribute->value() == "content-type"; 207 } else if (attribute_name == 
"content") { 208 auto encoding = extract_character_encoding_from_meta_element(attribute->value()); 209 if (encoding.has_value() && !charset.has_value()) { 210 charset = encoding.value(); 211 need_pragma = true; 212 } 213 } else if (attribute_name == "charset") { 214 auto maybe_charset = TextCodec::get_standardized_encoding(attribute->value()); 215 if (maybe_charset.has_value()) { 216 charset = Optional<DeprecatedString> { maybe_charset }; 217 need_pragma = { false }; 218 } 219 } 220 } 221 222 if (!need_pragma.has_value() || (need_pragma.value() && !got_pragma) || !charset.has_value()) 223 continue; 224 if (charset.value() == "UTF-16BE/LE") 225 return "UTF-8"; 226 else if (charset.value() == "x-user-defined") 227 return "windows-1252"; 228 else 229 return charset.value(); 230 } else if (!prescan_should_abort(input, position + 3) && input[position] == '<' 231 && ((input[position + 1] == '/' && isalpha(input[position + 2])) || isalpha(input[position + 1]))) { 232 position += 2; 233 prescan_skip_whitespace_and_slashes(input, position); 234 while (prescan_get_attribute(document, input, position)) { }; 235 } else if (!prescan_should_abort(input, position + 1) && input[position] == '<' && (input[position + 1] == '!' || input[position + 1] == '/' || input[position + 1] == '?')) { 236 position += 2; 237 while (input[position] != '>') { 238 ++position; 239 if (prescan_should_abort(input, position)) 240 return {}; 241 } 242 } else { 243 // Do nothing. 
244 } 245 } 246 return {}; 247} 248 249// https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding 250DeprecatedString run_encoding_sniffing_algorithm(DOM::Document& document, ByteBuffer const& input) 251{ 252 if (input.size() >= 2) { 253 if (input[0] == 0xFE && input[1] == 0xFF) { 254 return "UTF-16BE"; 255 } else if (input[0] == 0xFF && input[1] == 0xFE) { 256 return "UTF-16LE"; 257 } else if (input.size() >= 3 && input[0] == 0xEF && input[1] == 0xBB && input[2] == 0xBF) { 258 return "UTF-8"; 259 } 260 } 261 262 // FIXME: If the user has explicitly instructed the user agent to override the document's character 263 // encoding with a specific encoding. 264 // FIXME: The user agent may wait for more bytes of the resource to be available, either in this step or 265 // at any later step in this algorithm. 266 // FIXME: If the transport layer specifies a character encoding, and it is supported. 267 268 auto optional_encoding = run_prescan_byte_stream_algorithm(document, input); 269 if (optional_encoding.has_value()) { 270 return optional_encoding.value(); 271 } 272 273 // FIXME: If the HTML parser for which this algorithm is being run is associated with a Document whose browsing context 274 // is non-null and a child browsing context. 275 // FIXME: If the user agent has information on the likely encoding for this page, e.g. based on the encoding of the page 276 // when it was last visited. 277 278 if (!Utf8View(StringView(input)).validate()) { 279 // FIXME: As soon as Locale is supported, this should sometimes return a different encoding based on the locale. 280 return "windows-1252"; 281 } 282 283 // NOTE: This is the authoritative place to actually decide on using the default encoding as per the HTML specification. 284 // "Otherwise, return an implementation-defined or user-specified default character encoding, [...]." 285 return "UTF-8"; 286} 287 288}