Userland/Libraries/LibWeb/HTML/Parser/HTMLEncodingDetection.cpp at master

jcs.org / serenity
fork atom
Serenity Operating System
fork atom
serenity / Userland / Libraries / LibWeb / HTML / Parser / HTMLEncodingDetection.cpp
at master 288 lines 12 kB view raw
wrap content
Kenneth Myhra LibWeb: Make factory method of DOM::Attr fallible 3y ago
50c5f0d7
  1/*
  2 * Copyright (c) 2021, Max Wipfli <mail@maxwipfli.ch>
  3 *
  4 * SPDX-License-Identifier: BSD-2-Clause
  5 */
  6
  7#include <AK/CharacterTypes.h>
  8#include <AK/GenericLexer.h>
  9#include <AK/StringView.h>
 10#include <AK/Utf8View.h>
 11#include <LibTextCodec/Decoder.h>
 12#include <LibWeb/DOM/Attr.h>
 13#include <LibWeb/DOM/Document.h>
 14#include <LibWeb/HTML/Parser/HTMLEncodingDetection.h>
 15#include <LibWeb/Infra/CharacterTypes.h>
 16#include <ctype.h>
 17
 18namespace Web::HTML {
 19
 20bool prescan_should_abort(ByteBuffer const& input, size_t const& position)
 21{
 22    return position >= input.size() || position >= 1024;
 23}
 24
 25bool prescan_is_whitespace_or_slash(u8 const& byte)
 26{
 27    return byte == '\t' || byte == '\n' || byte == '\f' || byte == '\r' || byte == ' ' || byte == '/';
 28}
 29
 30bool prescan_skip_whitespace_and_slashes(ByteBuffer const& input, size_t& position)
 31{
 32    while (!prescan_should_abort(input, position) && (input[position] == '\t' || input[position] == '\n' || input[position] == '\f' || input[position] == '\r' || input[position] == ' ' || input[position] == '/'))
 33        ++position;
 34    return !prescan_should_abort(input, position);
 35}
 36
 37// https://html.spec.whatwg.org/multipage/urls-and-fetching.html#algorithm-for-extracting-a-character-encoding-from-a-meta-element
 38Optional<StringView> extract_character_encoding_from_meta_element(DeprecatedString const& string)
 39{
 40    // Checking for "charset" is case insensitive, as is getting an encoding.
 41    // Therefore, stick to lowercase from the start for simplicity.
 42    auto lowercase_string = string.to_lowercase();
 43    GenericLexer lexer(lowercase_string);
 44
 45    for (;;) {
 46        auto charset_index = lexer.remaining().find("charset"sv);
 47        if (!charset_index.has_value())
 48            return {};
 49
 50        // 7 is the length of "charset".
 51        lexer.ignore(charset_index.value() + 7);
 52
 53        lexer.ignore_while([](char c) {
 54            return Infra::is_ascii_whitespace(c);
 55        });
 56
 57        if (lexer.peek() != '=')
 58            continue;
 59
 60        break;
 61    }
 62
 63    // Ignore the '='.
 64    lexer.ignore();
 65
 66    lexer.ignore_while([](char c) {
 67        return Infra::is_ascii_whitespace(c);
 68    });
 69
 70    if (lexer.is_eof())
 71        return {};
 72
 73    if (lexer.consume_specific('"')) {
 74        auto matching_double_quote = lexer.remaining().find('"');
 75        if (!matching_double_quote.has_value())
 76            return {};
 77
 78        auto encoding = lexer.remaining().substring_view(0, matching_double_quote.value());
 79        return TextCodec::get_standardized_encoding(encoding);
 80    }
 81
 82    if (lexer.consume_specific('\'')) {
 83        auto matching_single_quote = lexer.remaining().find('\'');
 84        if (!matching_single_quote.has_value())
 85            return {};
 86
 87        auto encoding = lexer.remaining().substring_view(0, matching_single_quote.value());
 88        return TextCodec::get_standardized_encoding(encoding);
 89    }
 90
 91    auto encoding = lexer.consume_until([](char c) {
 92        return Infra::is_ascii_whitespace(c) || c == ';';
 93    });
 94    return TextCodec::get_standardized_encoding(encoding);
 95}
 96
 97JS::GCPtr<DOM::Attr> prescan_get_attribute(DOM::Document& document, ByteBuffer const& input, size_t& position)
 98{
 99    if (!prescan_skip_whitespace_and_slashes(input, position))
100        return {};
101    if (input[position] == '>')
102        return {};
103
104    StringBuilder attribute_name;
105    while (true) {
106        if (input[position] == '=' && !attribute_name.is_empty()) {
107            ++position;
108            goto value;
109        } else if (input[position] == '\t' || input[position] == '\n' || input[position] == '\f' || input[position] == '\r' || input[position] == ' ')
110            goto spaces;
111        else if (input[position] == '/' || input[position] == '>')
112            return *DOM::Attr::create(document, attribute_name.to_deprecated_string(), "").release_value_but_fixme_should_propagate_errors();
113        else
114            attribute_name.append_as_lowercase(input[position]);
115        ++position;
116        if (prescan_should_abort(input, position))
117            return {};
118    }
119
120spaces:
121    if (!prescan_skip_whitespace_and_slashes(input, position))
122        return {};
123    if (input[position] != '=')
124        return DOM::Attr::create(document, attribute_name.to_deprecated_string(), "").release_value_but_fixme_should_propagate_errors();
125    ++position;
126
127value:
128    if (!prescan_skip_whitespace_and_slashes(input, position))
129        return {};
130
131    StringBuilder attribute_value;
132    if (input[position] == '"' || input[position] == '\'') {
133        u8 quote_character = input[position];
134        ++position;
135        for (; !prescan_should_abort(input, position); ++position) {
136            if (input[position] == quote_character)
137                return DOM::Attr::create(document, attribute_name.to_deprecated_string(), attribute_value.to_deprecated_string()).release_value_but_fixme_should_propagate_errors();
138            else
139                attribute_value.append_as_lowercase(input[position]);
140        }
141        return {};
142    } else if (input[position] == '>')
143        return DOM::Attr::create(document, attribute_name.to_deprecated_string(), "").release_value_but_fixme_should_propagate_errors();
144    else
145        attribute_value.append_as_lowercase(input[position]);
146
147    ++position;
148    if (prescan_should_abort(input, position))
149        return {};
150
151    for (; !prescan_should_abort(input, position); ++position) {
152        if (input[position] == '\t' || input[position] == '\n' || input[position] == '\f' || input[position] == '\r' || input[position] == ' ' || input[position] == '>')
153            return DOM::Attr::create(document, attribute_name.to_deprecated_string(), attribute_value.to_deprecated_string()).release_value_but_fixme_should_propagate_errors();
154        else
155            attribute_value.append_as_lowercase(input[position]);
156    }
157    return {};
158}
159
160// https://html.spec.whatwg.org/multipage/parsing.html#prescan-a-byte-stream-to-determine-its-encoding
161Optional<DeprecatedString> run_prescan_byte_stream_algorithm(DOM::Document& document, ByteBuffer const& input)
162{
163    // https://html.spec.whatwg.org/multipage/parsing.html#prescan-a-byte-stream-to-determine-its-encoding
164
165    // Detects '<?x'
166    if (!prescan_should_abort(input, 6)) {
167        if (input[0] == 0x3C && input[1] == 0x00 && input[2] == 0x3F && input[3] == 0x00 && input[4] == 0x78 && input[5] == 0x00)
168            return "utf-16le";
169        if (input[0] == 0x00 && input[1] == 0x3C && input[2] == 0x00 && input[4] == 0x3F && input[5] == 0x00 && input[6] == 0x78)
170            return "utf-16be";
171    }
172
173    for (size_t position = 0; !prescan_should_abort(input, position); ++position) {
174        if (!prescan_should_abort(input, position + 5) && input[position] == '<' && input[position + 1] == '!'
175            && input[position + 2] == '-' && input[position + 3] == '-') {
176            position += 2;
177            for (; !prescan_should_abort(input, position + 3); ++position) {
178                if (input[position] == '-' && input[position + 1] == '-' && input[position + 2] == '>') {
179                    position += 2;
180                    break;
181                }
182            }
183        } else if (!prescan_should_abort(input, position + 6)
184            && input[position] == '<'
185            && (input[position + 1] == 'M' || input[position + 1] == 'm')
186            && (input[position + 2] == 'E' || input[position + 2] == 'e')
187            && (input[position + 3] == 'T' || input[position + 3] == 't')
188            && (input[position + 4] == 'A' || input[position + 4] == 'a')
189            && prescan_is_whitespace_or_slash(input[position + 5])) {
190            position += 6;
191            Vector<DeprecatedString> attribute_list {};
192            bool got_pragma = false;
193            Optional<bool> need_pragma {};
194            Optional<DeprecatedString> charset {};
195
196            while (true) {
197                auto attribute = prescan_get_attribute(document, input, position);
198                if (!attribute)
199                    break;
200                if (attribute_list.contains_slow(attribute->name()))
201                    continue;
202                auto& attribute_name = attribute->name();
203                attribute_list.append(attribute->name());
204
205                if (attribute_name == "http-equiv") {
206                    got_pragma = attribute->value() == "content-type";
207                } else if (attribute_name == "content") {
208                    auto encoding = extract_character_encoding_from_meta_element(attribute->value());
209                    if (encoding.has_value() && !charset.has_value()) {
210                        charset = encoding.value();
211                        need_pragma = true;
212                    }
213                } else if (attribute_name == "charset") {
214                    auto maybe_charset = TextCodec::get_standardized_encoding(attribute->value());
215                    if (maybe_charset.has_value()) {
216                        charset = Optional<DeprecatedString> { maybe_charset };
217                        need_pragma = { false };
218                    }
219                }
220            }
221
222            if (!need_pragma.has_value() || (need_pragma.value() && !got_pragma) || !charset.has_value())
223                continue;
224            if (charset.value() == "UTF-16BE/LE")
225                return "UTF-8";
226            else if (charset.value() == "x-user-defined")
227                return "windows-1252";
228            else
229                return charset.value();
230        } else if (!prescan_should_abort(input, position + 3) && input[position] == '<'
231            && ((input[position + 1] == '/' && isalpha(input[position + 2])) || isalpha(input[position + 1]))) {
232            position += 2;
233            prescan_skip_whitespace_and_slashes(input, position);
234            while (prescan_get_attribute(document, input, position)) { };
235        } else if (!prescan_should_abort(input, position + 1) && input[position] == '<' && (input[position + 1] == '!' || input[position + 1] == '/' || input[position + 1] == '?')) {
236            position += 2;
237            while (input[position] != '>') {
238                ++position;
239                if (prescan_should_abort(input, position))
240                    return {};
241            }
242        } else {
243            // Do nothing.
244        }
245    }
246    return {};
247}
248
249// https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding
250DeprecatedString run_encoding_sniffing_algorithm(DOM::Document& document, ByteBuffer const& input)
251{
252    if (input.size() >= 2) {
253        if (input[0] == 0xFE && input[1] == 0xFF) {
254            return "UTF-16BE";
255        } else if (input[0] == 0xFF && input[1] == 0xFE) {
256            return "UTF-16LE";
257        } else if (input.size() >= 3 && input[0] == 0xEF && input[1] == 0xBB && input[2] == 0xBF) {
258            return "UTF-8";
259        }
260    }
261
262    // FIXME: If the user has explicitly instructed the user agent to override the document's character
263    //        encoding with a specific encoding.
264    // FIXME: The user agent may wait for more bytes of the resource to be available, either in this step or
265    //        at any later step in this algorithm.
266    // FIXME: If the transport layer specifies a character encoding, and it is supported.
267
268    auto optional_encoding = run_prescan_byte_stream_algorithm(document, input);
269    if (optional_encoding.has_value()) {
270        return optional_encoding.value();
271    }
272
273    // FIXME: If the HTML parser for which this algorithm is being run is associated with a Document whose browsing context
274    //        is non-null and a child browsing context.
275    // FIXME: If the user agent has information on the likely encoding for this page, e.g. based on the encoding of the page
276    //        when it was last visited.
277
278    if (!Utf8View(StringView(input)).validate()) {
279        // FIXME: As soon as Locale is supported, this should sometimes return a different encoding based on the locale.
280        return "windows-1252";
281    }
282
283    // NOTE: This is the authoritative place to actually decide on using the default encoding as per the HTML specification.
284    //       "Otherwise, return an implementation-defined or user-specified default character encoding, [...]."
285    return "UTF-8";
286}
287
288}