Serenity Operating System
1/*
2 * Copyright (c) 2021, Max Wipfli <mail@maxwipfli.ch>
3 *
4 * SPDX-License-Identifier: BSD-2-Clause
5 */
6
7#include <AK/CharacterTypes.h>
8#include <AK/GenericLexer.h>
9#include <AK/StringView.h>
10#include <AK/Utf8View.h>
11#include <LibTextCodec/Decoder.h>
12#include <LibWeb/DOM/Attr.h>
13#include <LibWeb/DOM/Document.h>
14#include <LibWeb/HTML/Parser/HTMLEncodingDetection.h>
15#include <LibWeb/Infra/CharacterTypes.h>
16#include <ctype.h>
17
18namespace Web::HTML {
19
20bool prescan_should_abort(ByteBuffer const& input, size_t const& position)
21{
22 return position >= input.size() || position >= 1024;
23}
24
25bool prescan_is_whitespace_or_slash(u8 const& byte)
26{
27 return byte == '\t' || byte == '\n' || byte == '\f' || byte == '\r' || byte == ' ' || byte == '/';
28}
29
30bool prescan_skip_whitespace_and_slashes(ByteBuffer const& input, size_t& position)
31{
32 while (!prescan_should_abort(input, position) && (input[position] == '\t' || input[position] == '\n' || input[position] == '\f' || input[position] == '\r' || input[position] == ' ' || input[position] == '/'))
33 ++position;
34 return !prescan_should_abort(input, position);
35}
36
37// https://html.spec.whatwg.org/multipage/urls-and-fetching.html#algorithm-for-extracting-a-character-encoding-from-a-meta-element
38Optional<StringView> extract_character_encoding_from_meta_element(DeprecatedString const& string)
39{
40 // Checking for "charset" is case insensitive, as is getting an encoding.
41 // Therefore, stick to lowercase from the start for simplicity.
42 auto lowercase_string = string.to_lowercase();
43 GenericLexer lexer(lowercase_string);
44
45 for (;;) {
46 auto charset_index = lexer.remaining().find("charset"sv);
47 if (!charset_index.has_value())
48 return {};
49
50 // 7 is the length of "charset".
51 lexer.ignore(charset_index.value() + 7);
52
53 lexer.ignore_while([](char c) {
54 return Infra::is_ascii_whitespace(c);
55 });
56
57 if (lexer.peek() != '=')
58 continue;
59
60 break;
61 }
62
63 // Ignore the '='.
64 lexer.ignore();
65
66 lexer.ignore_while([](char c) {
67 return Infra::is_ascii_whitespace(c);
68 });
69
70 if (lexer.is_eof())
71 return {};
72
73 if (lexer.consume_specific('"')) {
74 auto matching_double_quote = lexer.remaining().find('"');
75 if (!matching_double_quote.has_value())
76 return {};
77
78 auto encoding = lexer.remaining().substring_view(0, matching_double_quote.value());
79 return TextCodec::get_standardized_encoding(encoding);
80 }
81
82 if (lexer.consume_specific('\'')) {
83 auto matching_single_quote = lexer.remaining().find('\'');
84 if (!matching_single_quote.has_value())
85 return {};
86
87 auto encoding = lexer.remaining().substring_view(0, matching_single_quote.value());
88 return TextCodec::get_standardized_encoding(encoding);
89 }
90
91 auto encoding = lexer.consume_until([](char c) {
92 return Infra::is_ascii_whitespace(c) || c == ';';
93 });
94 return TextCodec::get_standardized_encoding(encoding);
95}
96
97JS::GCPtr<DOM::Attr> prescan_get_attribute(DOM::Document& document, ByteBuffer const& input, size_t& position)
98{
99 if (!prescan_skip_whitespace_and_slashes(input, position))
100 return {};
101 if (input[position] == '>')
102 return {};
103
104 StringBuilder attribute_name;
105 while (true) {
106 if (input[position] == '=' && !attribute_name.is_empty()) {
107 ++position;
108 goto value;
109 } else if (input[position] == '\t' || input[position] == '\n' || input[position] == '\f' || input[position] == '\r' || input[position] == ' ')
110 goto spaces;
111 else if (input[position] == '/' || input[position] == '>')
112 return *DOM::Attr::create(document, attribute_name.to_deprecated_string(), "").release_value_but_fixme_should_propagate_errors();
113 else
114 attribute_name.append_as_lowercase(input[position]);
115 ++position;
116 if (prescan_should_abort(input, position))
117 return {};
118 }
119
120spaces:
121 if (!prescan_skip_whitespace_and_slashes(input, position))
122 return {};
123 if (input[position] != '=')
124 return DOM::Attr::create(document, attribute_name.to_deprecated_string(), "").release_value_but_fixme_should_propagate_errors();
125 ++position;
126
127value:
128 if (!prescan_skip_whitespace_and_slashes(input, position))
129 return {};
130
131 StringBuilder attribute_value;
132 if (input[position] == '"' || input[position] == '\'') {
133 u8 quote_character = input[position];
134 ++position;
135 for (; !prescan_should_abort(input, position); ++position) {
136 if (input[position] == quote_character)
137 return DOM::Attr::create(document, attribute_name.to_deprecated_string(), attribute_value.to_deprecated_string()).release_value_but_fixme_should_propagate_errors();
138 else
139 attribute_value.append_as_lowercase(input[position]);
140 }
141 return {};
142 } else if (input[position] == '>')
143 return DOM::Attr::create(document, attribute_name.to_deprecated_string(), "").release_value_but_fixme_should_propagate_errors();
144 else
145 attribute_value.append_as_lowercase(input[position]);
146
147 ++position;
148 if (prescan_should_abort(input, position))
149 return {};
150
151 for (; !prescan_should_abort(input, position); ++position) {
152 if (input[position] == '\t' || input[position] == '\n' || input[position] == '\f' || input[position] == '\r' || input[position] == ' ' || input[position] == '>')
153 return DOM::Attr::create(document, attribute_name.to_deprecated_string(), attribute_value.to_deprecated_string()).release_value_but_fixme_should_propagate_errors();
154 else
155 attribute_value.append_as_lowercase(input[position]);
156 }
157 return {};
158}
159
160// https://html.spec.whatwg.org/multipage/parsing.html#prescan-a-byte-stream-to-determine-its-encoding
161Optional<DeprecatedString> run_prescan_byte_stream_algorithm(DOM::Document& document, ByteBuffer const& input)
162{
163 // https://html.spec.whatwg.org/multipage/parsing.html#prescan-a-byte-stream-to-determine-its-encoding
164
165 // Detects '<?x'
166 if (!prescan_should_abort(input, 6)) {
167 if (input[0] == 0x3C && input[1] == 0x00 && input[2] == 0x3F && input[3] == 0x00 && input[4] == 0x78 && input[5] == 0x00)
168 return "utf-16le";
169 if (input[0] == 0x00 && input[1] == 0x3C && input[2] == 0x00 && input[4] == 0x3F && input[5] == 0x00 && input[6] == 0x78)
170 return "utf-16be";
171 }
172
173 for (size_t position = 0; !prescan_should_abort(input, position); ++position) {
174 if (!prescan_should_abort(input, position + 5) && input[position] == '<' && input[position + 1] == '!'
175 && input[position + 2] == '-' && input[position + 3] == '-') {
176 position += 2;
177 for (; !prescan_should_abort(input, position + 3); ++position) {
178 if (input[position] == '-' && input[position + 1] == '-' && input[position + 2] == '>') {
179 position += 2;
180 break;
181 }
182 }
183 } else if (!prescan_should_abort(input, position + 6)
184 && input[position] == '<'
185 && (input[position + 1] == 'M' || input[position + 1] == 'm')
186 && (input[position + 2] == 'E' || input[position + 2] == 'e')
187 && (input[position + 3] == 'T' || input[position + 3] == 't')
188 && (input[position + 4] == 'A' || input[position + 4] == 'a')
189 && prescan_is_whitespace_or_slash(input[position + 5])) {
190 position += 6;
191 Vector<DeprecatedString> attribute_list {};
192 bool got_pragma = false;
193 Optional<bool> need_pragma {};
194 Optional<DeprecatedString> charset {};
195
196 while (true) {
197 auto attribute = prescan_get_attribute(document, input, position);
198 if (!attribute)
199 break;
200 if (attribute_list.contains_slow(attribute->name()))
201 continue;
202 auto& attribute_name = attribute->name();
203 attribute_list.append(attribute->name());
204
205 if (attribute_name == "http-equiv") {
206 got_pragma = attribute->value() == "content-type";
207 } else if (attribute_name == "content") {
208 auto encoding = extract_character_encoding_from_meta_element(attribute->value());
209 if (encoding.has_value() && !charset.has_value()) {
210 charset = encoding.value();
211 need_pragma = true;
212 }
213 } else if (attribute_name == "charset") {
214 auto maybe_charset = TextCodec::get_standardized_encoding(attribute->value());
215 if (maybe_charset.has_value()) {
216 charset = Optional<DeprecatedString> { maybe_charset };
217 need_pragma = { false };
218 }
219 }
220 }
221
222 if (!need_pragma.has_value() || (need_pragma.value() && !got_pragma) || !charset.has_value())
223 continue;
224 if (charset.value() == "UTF-16BE/LE")
225 return "UTF-8";
226 else if (charset.value() == "x-user-defined")
227 return "windows-1252";
228 else
229 return charset.value();
230 } else if (!prescan_should_abort(input, position + 3) && input[position] == '<'
231 && ((input[position + 1] == '/' && isalpha(input[position + 2])) || isalpha(input[position + 1]))) {
232 position += 2;
233 prescan_skip_whitespace_and_slashes(input, position);
234 while (prescan_get_attribute(document, input, position)) { };
235 } else if (!prescan_should_abort(input, position + 1) && input[position] == '<' && (input[position + 1] == '!' || input[position + 1] == '/' || input[position + 1] == '?')) {
236 position += 2;
237 while (input[position] != '>') {
238 ++position;
239 if (prescan_should_abort(input, position))
240 return {};
241 }
242 } else {
243 // Do nothing.
244 }
245 }
246 return {};
247}
248
249// https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding
250DeprecatedString run_encoding_sniffing_algorithm(DOM::Document& document, ByteBuffer const& input)
251{
252 if (input.size() >= 2) {
253 if (input[0] == 0xFE && input[1] == 0xFF) {
254 return "UTF-16BE";
255 } else if (input[0] == 0xFF && input[1] == 0xFE) {
256 return "UTF-16LE";
257 } else if (input.size() >= 3 && input[0] == 0xEF && input[1] == 0xBB && input[2] == 0xBF) {
258 return "UTF-8";
259 }
260 }
261
262 // FIXME: If the user has explicitly instructed the user agent to override the document's character
263 // encoding with a specific encoding.
264 // FIXME: The user agent may wait for more bytes of the resource to be available, either in this step or
265 // at any later step in this algorithm.
266 // FIXME: If the transport layer specifies a character encoding, and it is supported.
267
268 auto optional_encoding = run_prescan_byte_stream_algorithm(document, input);
269 if (optional_encoding.has_value()) {
270 return optional_encoding.value();
271 }
272
273 // FIXME: If the HTML parser for which this algorithm is being run is associated with a Document whose browsing context
274 // is non-null and a child browsing context.
275 // FIXME: If the user agent has information on the likely encoding for this page, e.g. based on the encoding of the page
276 // when it was last visited.
277
278 if (!Utf8View(StringView(input)).validate()) {
279 // FIXME: As soon as Locale is supported, this should sometimes return a different encoding based on the locale.
280 return "windows-1252";
281 }
282
283 // NOTE: This is the authoritative place to actually decide on using the default encoding as per the HTML specification.
284 // "Otherwise, return an implementation-defined or user-specified default character encoding, [...]."
285 return "UTF-8";
286}
287
288}