Serenity Operating System
1/*
2 * Copyright (c) 2021, Max Wipfli <mail@maxwipfli.ch>
3 *
4 * SPDX-License-Identifier: BSD-2-Clause
5 */
6
7#include <AK/CharacterTypes.h>
8#include <AK/Debug.h>
9#include <AK/DeprecatedString.h>
10#include <AK/Optional.h>
11#include <AK/SourceLocation.h>
12#include <AK/StringBuilder.h>
13#include <AK/StringUtils.h>
14#include <AK/URLParser.h>
15#include <AK/Utf8View.h>
16
17namespace AK {
18
19// NOTE: This is similar to the LibC macro EOF = -1.
20constexpr u32 end_of_file = 0xFFFFFFFF;
21
22static bool is_url_code_point(u32 code_point)
23{
24 // FIXME: [...] and code points in the range U+00A0 to U+10FFFD, inclusive, excluding surrogates and noncharacters.
25 return is_ascii_alphanumeric(code_point) || code_point >= 0xA0 || "!$&'()*+,-./:;=?@_~"sv.contains(code_point);
26}
27
28static void report_validation_error(SourceLocation const& location = SourceLocation::current())
29{
30 dbgln_if(URL_PARSER_DEBUG, "URLParser::parse: Validation error! {}", location);
31}
32
33static Optional<DeprecatedString> parse_opaque_host(StringView input)
34{
35 auto forbidden_host_characters_excluding_percent = "\0\t\n\r #/:<>?@[\\]^|"sv;
36 for (auto character : forbidden_host_characters_excluding_percent) {
37 if (input.contains(character)) {
38 report_validation_error();
39 return {};
40 }
41 }
42 // FIXME: If input contains a code point that is not a URL code point and not U+0025 (%), validation error.
43 // FIXME: If input contains a U+0025 (%) and the two code points following it are not ASCII hex digits, validation error.
44 return URL::percent_encode(input, URL::PercentEncodeSet::C0Control);
45}
46
47static Optional<DeprecatedString> parse_ipv4_address(StringView input)
48{
49 // FIXME: Implement the correct IPv4 parser as specified by https://url.spec.whatwg.org/#concept-ipv4-parser.
50 return input;
51}
52
53// https://url.spec.whatwg.org/#concept-host-parser
54// NOTE: This is a very bare-bones implementation.
55static Optional<DeprecatedString> parse_host(StringView input, bool is_not_special = false)
56{
57 if (input.starts_with('[')) {
58 if (!input.ends_with(']')) {
59 report_validation_error();
60 return {};
61 }
62 // FIXME: Return the result of IPv6 parsing input with its leading U+005B ([) and trailing U+005D (]) removed.
63 TODO();
64 }
65
66 if (is_not_special)
67 return parse_opaque_host(input);
68 VERIFY(!input.is_empty());
69
70 // FIXME: Let domain be the result of running UTF-8 decode without BOM on the percent-decoding of input.
71 auto domain = URL::percent_decode(input);
72 // FIXME: Let asciiDomain be the result of running domain to ASCII on domain.
73 auto& ascii_domain = domain;
74
75 auto forbidden_host_characters = "\0\t\n\r #%/:<>?@[\\]^|"sv;
76 for (auto character : forbidden_host_characters) {
77 if (ascii_domain.view().contains(character)) {
78 report_validation_error();
79 return {};
80 }
81 }
82
83 auto ipv4_host = parse_ipv4_address(ascii_domain);
84 return ipv4_host;
85}
86
87// https://url.spec.whatwg.org/#start-with-a-windows-drive-letter
88constexpr bool starts_with_windows_drive_letter(StringView input)
89{
90 if (input.length() < 2)
91 return false;
92 if (!is_ascii_alpha(input[0]) || !(input[1] == ':' || input[1] == '|'))
93 return false;
94 if (input.length() == 2)
95 return true;
96 return "/\\?#"sv.contains(input[2]);
97}
98
99constexpr bool is_windows_drive_letter(StringView input)
100{
101 return input.length() == 2 && is_ascii_alpha(input[0]) && (input[1] == ':' || input[1] == '|');
102}
103
104constexpr bool is_normalized_windows_drive_letter(StringView input)
105{
106 return input.length() == 2 && is_ascii_alpha(input[0]) && input[1] == ':';
107}
108
109constexpr bool is_single_dot_path_segment(StringView input)
110{
111 return input == "."sv || input.equals_ignoring_ascii_case("%2e"sv);
112}
113
114constexpr bool is_double_dot_path_segment(StringView input)
115{
116 return input == ".."sv || input.equals_ignoring_ascii_case(".%2e"sv) || input.equals_ignoring_ascii_case("%2e."sv) || input.equals_ignoring_ascii_case("%2e%2e"sv);
117}
118
119// https://url.spec.whatwg.org/#string-percent-encode-after-encoding
120static DeprecatedString percent_encode_after_encoding(StringView input, URL::PercentEncodeSet percent_encode_set, bool space_as_plus = false)
121{
122 // NOTE: This is written somewhat ad-hoc since we don't yet implement the Encoding spec.
123
124 StringBuilder output;
125
126 // 3. For each byte of encodeOutput converted to a byte sequence:
127 for (auto byte : input) {
128 // 1. If spaceAsPlus is true and byte is 0x20 (SP), then append U+002B (+) to output and continue.
129 if (space_as_plus && byte == ' ') {
130 output.append('+');
131 continue;
132 }
133
134 // 2. Let isomorph be a code point whose value is byte’s value.
135 u32 isomorph = byte;
136
137 // 3. Assert: percentEncodeSet includes all non-ASCII code points.
138
139 // 4. If isomorphic is not in percentEncodeSet, then append isomorph to output.
140 if (!URL::code_point_is_in_percent_encode_set(isomorph, percent_encode_set)) {
141 output.append_code_point(isomorph);
142 }
143
144 // 5. Otherwise, percent-encode byte and append the result to output.
145 else {
146 output.appendff("%{:02X}", byte);
147 }
148 }
149
150 // 6. Return output.
151 return output.to_deprecated_string();
152}
153
154// https://fetch.spec.whatwg.org/#data-urls
155// FIXME: This only loosely follows the spec, as we use the same class for "regular" and data URLs, unlike the spec.
156Optional<URL> URLParser::parse_data_url(StringView raw_input)
157{
158 dbgln_if(URL_PARSER_DEBUG, "URLParser::parse_data_url: Parsing '{}'.", raw_input);
159 VERIFY(raw_input.starts_with("data:"sv));
160 auto input = raw_input.substring_view(5);
161 auto comma_offset = input.find(',');
162 if (!comma_offset.has_value())
163 return {};
164 auto mime_type = StringUtils::trim(input.substring_view(0, comma_offset.value()), "\t\n\f\r "sv, TrimMode::Both);
165 auto encoded_body = input.substring_view(comma_offset.value() + 1);
166 auto body = URL::percent_decode(encoded_body);
167 bool is_base64_encoded = false;
168 if (mime_type.ends_with("base64"sv, CaseSensitivity::CaseInsensitive)) {
169 auto substring_view = mime_type.substring_view(0, mime_type.length() - 6);
170 auto trimmed_substring_view = StringUtils::trim(substring_view, " "sv, TrimMode::Right);
171 if (trimmed_substring_view.ends_with(';')) {
172 is_base64_encoded = true;
173 mime_type = trimmed_substring_view.substring_view(0, trimmed_substring_view.length() - 1);
174 }
175 }
176
177 StringBuilder builder;
178 if (mime_type.starts_with(";"sv) || mime_type.is_empty()) {
179 builder.append("text/plain"sv);
180 builder.append(mime_type);
181 mime_type = builder.string_view();
182 }
183
184 // FIXME: Parse the MIME type's components according to https://mimesniff.spec.whatwg.org/#parse-a-mime-type
185 URL url { StringUtils::trim(mime_type, "\n\r\t "sv, TrimMode::Both), move(body), is_base64_encoded };
186 dbgln_if(URL_PARSER_DEBUG, "URLParser::parse_data_url: Parsed data URL to be '{}'.", url.serialize());
187 return url;
188}
189
190// https://url.spec.whatwg.org/#concept-basic-url-parser
191// NOTE: This parser assumes a UTF-8 encoding.
192// NOTE: Refrain from using the URL classes setters inside this algorithm. Rather, set the values directly. This bypasses the setters' built-in
193// validation, which is strictly unnecessary since we set m_valid=true at the end anyways. Furthermore, this algorithm may be used in the
194// future for validation of URLs, which would then lead to infinite recursion.
195// The same goes for base_url, because e.g. the port() getter does not always return m_port, and we are interested in the underlying member
196// variables' values here, not what the URL class presents to its users.
197// NOTE: Since the URL class's member variables contain percent decoded data, we have to deviate from the URL parser specification when setting
198// some of those values. Because the specification leaves all values percent encoded in their URL data structure, we have to percent decode
199// everything before setting the member variables.
200URL URLParser::parse(StringView raw_input, URL const* base_url, Optional<URL> url, Optional<State> state_override)
201{
202 dbgln_if(URL_PARSER_DEBUG, "URLParser::parse: Parsing '{}'", raw_input);
203 if (raw_input.is_empty())
204 return base_url ? *base_url : URL {};
205
206 if (raw_input.starts_with("data:"sv)) {
207 auto maybe_url = parse_data_url(raw_input);
208 if (!maybe_url.has_value())
209 return {};
210 return maybe_url.release_value();
211 }
212
213 size_t start_index = 0;
214 size_t end_index = raw_input.length();
215 if (!url.has_value()) {
216 url = URL();
217
218 // NOTE: This removes all leading and trailing C0 control or space characters.
219 bool has_validation_error = false;
220 for (size_t i = 0; i < raw_input.length(); ++i) {
221 i8 ch = raw_input[i];
222 if (0 <= ch && ch <= 0x20) {
223 ++start_index;
224 has_validation_error = true;
225 } else {
226 break;
227 }
228 }
229 for (ssize_t i = raw_input.length() - 1; i >= 0; --i) {
230 i8 ch = raw_input[i];
231 if (0 <= ch && ch <= 0x20) {
232 --end_index;
233 has_validation_error = true;
234 } else {
235 break;
236 }
237 }
238 if (has_validation_error)
239 report_validation_error();
240 }
241 if (start_index >= end_index)
242 return {};
243
244 DeprecatedString processed_input = raw_input.substring_view(start_index, end_index - start_index);
245
246 // NOTE: This replaces all tab and newline characters with nothing.
247 if (processed_input.contains("\t"sv) || processed_input.contains("\n"sv)) {
248 report_validation_error();
249 processed_input = processed_input.replace("\t"sv, ""sv, ReplaceMode::All).replace("\n"sv, ""sv, ReplaceMode::All);
250 }
251
252 State state = state_override.value_or(State::SchemeStart);
253 StringBuilder buffer;
254 bool at_sign_seen = false;
255 bool inside_brackets = false;
256 bool password_token_seen = false;
257
258 Utf8View input(processed_input);
259 Utf8CodePointIterator iterator = input.begin();
260
261 auto get_remaining = [&input, &iterator] {
262 return input.substring_view(iterator - input.begin() + iterator.underlying_code_point_length_in_bytes()).as_string();
263 };
264
265 // NOTE: "continue" should only be used to prevent incrementing the iterator, as this is done at the end of the loop.
266 // ++iterator : "increase pointer by 1"
267 // continue : "decrease pointer by 1"
268 for (;;) {
269 u32 code_point = end_of_file;
270 if (!iterator.done())
271 code_point = *iterator;
272
273 if constexpr (URL_PARSER_DEBUG) {
274 if (code_point == end_of_file)
275 dbgln("URLParser::parse: {} state with EOF.", state_name(state));
276 else if (is_ascii_printable(code_point))
277 dbgln("URLParser::parse: {} state with code point U+{:04X} ({:c}).", state_name(state), code_point, code_point);
278 else
279 dbgln("URLParser::parse: {} state with code point U+{:04X}.", state_name(state), code_point);
280 }
281
282 switch (state) {
283 case State::SchemeStart:
284 if (is_ascii_alpha(code_point)) {
285 buffer.append_as_lowercase(code_point);
286 state = State::Scheme;
287 } else {
288 state = State::NoScheme;
289 continue;
290 }
291 break;
292 case State::Scheme:
293 if (is_ascii_alphanumeric(code_point) || code_point == '+' || code_point == '-' || code_point == '.') {
294 buffer.append_as_lowercase(code_point);
295 } else if (code_point == ':') {
296 url->m_scheme = buffer.to_deprecated_string();
297 buffer.clear();
298 if (url->scheme() == "file") {
299 if (!get_remaining().starts_with("//"sv)) {
300 report_validation_error();
301 }
302 state = State::File;
303 } else if (url->is_special()) {
304 if (base_url && base_url->m_scheme == url->m_scheme)
305 state = State::SpecialRelativeOrAuthority;
306 else
307 state = State::SpecialAuthoritySlashes;
308 } else if (get_remaining().starts_with("/"sv)) {
309 state = State::PathOrAuthority;
310 ++iterator;
311 } else {
312 url->m_cannot_be_a_base_url = true;
313 url->append_path("");
314 state = State::CannotBeABaseUrlPath;
315 }
316 } else {
317 buffer.clear();
318 state = State::NoScheme;
319 iterator = input.begin();
320 continue;
321 }
322 break;
323 case State::NoScheme:
324 if (!base_url || (base_url->m_cannot_be_a_base_url && code_point != '#')) {
325 report_validation_error();
326 return {};
327 } else if (base_url->m_cannot_be_a_base_url && code_point == '#') {
328 url->m_scheme = base_url->m_scheme;
329 url->m_paths = base_url->m_paths;
330 url->m_query = base_url->m_query;
331 url->m_fragment = "";
332 url->m_cannot_be_a_base_url = true;
333 state = State::Fragment;
334 } else if (base_url->m_scheme != "file") {
335 state = State::Relative;
336 continue;
337 } else {
338 state = State::File;
339 continue;
340 }
341 break;
342 case State::SpecialRelativeOrAuthority:
343 if (code_point == '/' && get_remaining().starts_with("/"sv)) {
344 state = State::SpecialAuthorityIgnoreSlashes;
345 ++iterator;
346 } else {
347 report_validation_error();
348 state = State::Relative;
349 continue;
350 }
351 break;
352 case State::PathOrAuthority:
353 if (code_point == '/') {
354 state = State::Authority;
355 } else {
356 state = State::Path;
357 continue;
358 }
359 break;
360 case State::Relative:
361 url->m_scheme = base_url->m_scheme;
362 if (code_point == '/') {
363 state = State::RelativeSlash;
364 } else if (url->is_special() && code_point == '\\') {
365 report_validation_error();
366 state = State::RelativeSlash;
367 } else {
368 url->m_username = base_url->m_username;
369 url->m_password = base_url->m_password;
370 url->m_host = base_url->m_host;
371 url->m_port = base_url->m_port;
372 url->m_paths = base_url->m_paths;
373 url->m_query = base_url->m_query;
374
375 if (code_point == '?') {
376 url->m_query = "";
377 state = State::Query;
378 } else if (code_point == '#') {
379 url->m_fragment = "";
380 state = State::Fragment;
381 } else if (code_point != end_of_file) {
382 url->m_query = {};
383 if (url->m_paths.size())
384 url->m_paths.remove(url->m_paths.size() - 1);
385 state = State::Path;
386 continue;
387 }
388 }
389 break;
390 case State::RelativeSlash:
391 if (url->is_special() && (code_point == '/' || code_point == '\\')) {
392 if (code_point == '\\')
393 report_validation_error();
394 state = State::SpecialAuthorityIgnoreSlashes;
395 } else if (code_point == '/') {
396 state = State::Authority;
397 } else {
398 url->m_username = base_url->m_username;
399 url->m_password = base_url->m_password;
400 url->m_host = base_url->m_host;
401 url->m_port = base_url->m_port;
402 state = State::Path;
403 continue;
404 }
405 break;
406 case State::SpecialAuthoritySlashes:
407 if (code_point == '/' && get_remaining().starts_with("/"sv)) {
408 state = State::SpecialAuthorityIgnoreSlashes;
409 ++iterator;
410 } else {
411 report_validation_error();
412 state = State::SpecialAuthorityIgnoreSlashes;
413 continue;
414 }
415 break;
416 case State::SpecialAuthorityIgnoreSlashes:
417 if (code_point != '/' && code_point != '\\') {
418 state = State::Authority;
419 continue;
420 } else {
421 report_validation_error();
422 }
423 break;
424 case State::Authority:
425 if (code_point == '@') {
426 report_validation_error();
427 if (at_sign_seen) {
428 auto content = buffer.to_deprecated_string();
429 buffer.clear();
430 buffer.append("%40"sv);
431 buffer.append(content);
432 }
433 at_sign_seen = true;
434 StringBuilder builder;
435 for (auto c : Utf8View(builder.string_view())) {
436 if (c == ':' && !password_token_seen) {
437 password_token_seen = true;
438 continue;
439 }
440 builder.clear();
441 if (password_token_seen) {
442 builder.append(url->password());
443 URL::append_percent_encoded_if_necessary(builder, c, URL::PercentEncodeSet::Userinfo);
444 // NOTE: This is has to be encoded and then decoded because the original sequence could contain already percent-encoded sequences.
445 url->m_password = URL::percent_decode(builder.string_view());
446 } else {
447 builder.append(url->username());
448 URL::append_percent_encoded_if_necessary(builder, c, URL::PercentEncodeSet::Userinfo);
449 // NOTE: This is has to be encoded and then decoded because the original sequence could contain already percent-encoded sequences.
450 url->m_username = URL::percent_decode(builder.string_view());
451 }
452 }
453 buffer.clear();
454 } else if (code_point == end_of_file || code_point == '/' || code_point == '?' || code_point == '#' || (url->is_special() && code_point == '\\')) {
455 if (at_sign_seen && buffer.is_empty()) {
456 report_validation_error();
457 return {};
458 }
459 // NOTE: This decreases the iterator by the number of code points in buffer plus one.
460 iterator = input.iterator_at_byte_offset(iterator - input.begin() - buffer.length() - 1);
461 buffer.clear();
462 state = State::Host;
463 } else {
464 buffer.append_code_point(code_point);
465 }
466 break;
467 case State::Host:
468 case State::Hostname:
469 if (code_point == ':' && !inside_brackets) {
470 if (buffer.is_empty()) {
471 report_validation_error();
472 return {};
473 }
474 auto host = parse_host(buffer.string_view(), !url->is_special());
475 if (!host.has_value())
476 return {};
477 url->m_host = host.release_value();
478 buffer.clear();
479 state = State::Port;
480 } else if (code_point == end_of_file || code_point == '/' || code_point == '?' || code_point == '#' || (url->is_special() && code_point == '\\')) {
481 if (url->is_special() && buffer.is_empty()) {
482 report_validation_error();
483 return {};
484 }
485 auto host = parse_host(buffer.string_view(), !url->is_special());
486 if (!host.has_value())
487 return {};
488 url->m_host = host.value();
489 buffer.clear();
490 state = State::Port;
491 continue;
492 } else if (code_point == '[') {
493 inside_brackets = true;
494 } else if (code_point == ']') {
495 inside_brackets = false;
496 } else {
497 buffer.append_code_point(code_point);
498 }
499 break;
500 case State::Port:
501 if (is_ascii_digit(code_point)) {
502 buffer.append_code_point(code_point);
503 } else if (code_point == end_of_file || code_point == '/' || code_point == '?' || code_point == '#' || (url->is_special() && code_point == '\\')) {
504 if (!buffer.is_empty()) {
505 auto port = buffer.string_view().to_uint();
506 if (!port.has_value() || port.value() > 65535) {
507 report_validation_error();
508 return {};
509 }
510 if (port.value() == URL::default_port_for_scheme(url->scheme()))
511 url->m_port = {};
512 else
513 url->m_port = port.value();
514 buffer.clear();
515 }
516 state = State::PathStart;
517 continue;
518 } else {
519 report_validation_error();
520 return {};
521 }
522 break;
523 case State::File:
524 url->m_scheme = "file";
525 url->m_host = "";
526 if (code_point == '/' || code_point == '\\') {
527 if (code_point == '\\')
528 report_validation_error();
529 state = State::FileSlash;
530 } else if (base_url && base_url->m_scheme == "file") {
531 url->m_host = base_url->m_host;
532 url->m_paths = base_url->m_paths;
533 url->m_query = base_url->m_query;
534 if (code_point == '?') {
535 url->m_query = "";
536 state = State::Query;
537 } else if (code_point == '#') {
538 url->m_fragment = "";
539 state = State::Fragment;
540 } else if (code_point != end_of_file) {
541 url->m_query = {};
542 auto substring_from_pointer = input.substring_view(iterator - input.begin()).as_string();
543 if (!starts_with_windows_drive_letter(substring_from_pointer)) {
544 if (!url->paths().is_empty() && !(url->scheme() == "file" && url->paths().size() == 1 && is_normalized_windows_drive_letter(url->paths()[0])))
545 url->m_paths.remove(url->m_paths.size() - 1);
546 } else {
547 report_validation_error();
548 url->m_paths.clear();
549 }
550 state = State::Path;
551 continue;
552 }
553 }
554 break;
555 case State::FileSlash:
556 if (code_point == '/' || code_point == '\\') {
557 if (code_point == '\\')
558 report_validation_error();
559 state = State::FileHost;
560 } else if (base_url && base_url->m_scheme == "file") {
561 url->m_host = base_url->m_host;
562 auto substring_from_pointer = input.substring_view(iterator - input.begin()).as_string();
563 if (!starts_with_windows_drive_letter(substring_from_pointer) && is_normalized_windows_drive_letter(base_url->m_paths[0]))
564 url->append_path(base_url->m_paths[0]);
565 state = State::Path;
566 continue;
567 }
568 break;
569 case State::FileHost:
570 if (code_point == end_of_file || code_point == '/' || code_point == '\\' || code_point == '?' || code_point == '#') {
571 if (is_windows_drive_letter(buffer.string_view())) {
572 report_validation_error();
573 state = State::Path;
574 } else if (buffer.is_empty()) {
575 url->m_host = "";
576 state = State::PathStart;
577 } else {
578 auto host = parse_host(buffer.string_view(), true);
579 if (!host.has_value())
580 return {};
581 if (host.value() == "localhost")
582 host = "";
583 url->m_host = host.release_value();
584 buffer.clear();
585 state = State::PathStart;
586 }
587 continue;
588 } else {
589 buffer.append_code_point(code_point);
590 }
591 break;
592 case State::PathStart:
593 if (url->is_special()) {
594 if (code_point == '\\')
595 report_validation_error();
596 state = State::Path;
597 if (code_point != '/' && code_point != '\\')
598 continue;
599 } else if (code_point == '?') {
600 url->m_query = "";
601 state = State::Query;
602 } else if (code_point == '#') {
603 url->m_fragment = "";
604 state = State::Fragment;
605 } else if (code_point != end_of_file) {
606 state = State::Path;
607 if (code_point != '/')
608 continue;
609 }
610 break;
611 case State::Path:
612 if (code_point == end_of_file || code_point == '/' || (url->is_special() && code_point == '\\') || code_point == '?' || code_point == '#') {
613 if (url->is_special() && code_point == '\\')
614 report_validation_error();
615 if (is_double_dot_path_segment(buffer.string_view())) {
616 if (!url->m_paths.is_empty() && !(url->m_scheme == "file" && url->m_paths.size() == 1 && is_normalized_windows_drive_letter(url->m_paths[0])))
617 url->m_paths.remove(url->m_paths.size() - 1);
618 if (code_point != '/' && !(url->is_special() && code_point == '\\'))
619 url->append_path("");
620 } else if (is_single_dot_path_segment(buffer.string_view()) && code_point != '/' && !(url->is_special() && code_point == '\\')) {
621 url->append_path("");
622 } else if (!is_single_dot_path_segment(buffer.string_view())) {
623 if (url->m_scheme == "file" && url->m_paths.is_empty() && is_windows_drive_letter(buffer.string_view())) {
624 auto drive_letter = buffer.string_view()[0];
625 buffer.clear();
626 buffer.append(drive_letter);
627 buffer.append(':');
628 }
629 // NOTE: This needs to be percent decoded since the member variables contain decoded data.
630 url->append_path(URL::percent_decode(buffer.string_view()));
631 }
632 buffer.clear();
633 if (code_point == '?') {
634 url->m_query = "";
635 state = State::Query;
636 } else if (code_point == '#') {
637 url->m_fragment = "";
638 state = State::Fragment;
639 }
640 } else {
641 if (!is_url_code_point(code_point) && code_point != '%')
642 report_validation_error();
643 // FIXME: If c is U+0025 (%) and remaining does not start with two ASCII hex digits, validation error.
644 URL::append_percent_encoded_if_necessary(buffer, code_point, URL::PercentEncodeSet::Path);
645 }
646 break;
647 case State::CannotBeABaseUrlPath:
648 // NOTE: This does not follow the spec exactly but rather uses the buffer and only sets the path on EOF.
649 // NOTE: Verify that the assumptions required for this simplification are correct.
650 VERIFY(url->m_paths.size() == 1 && url->m_paths[0].is_empty());
651 if (code_point == '?') {
652 // NOTE: This needs to be percent decoded since the member variables contain decoded data.
653 url->m_paths[0] = URL::percent_decode(buffer.string_view());
654 url->m_query = "";
655 state = State::Query;
656 } else if (code_point == '#') {
657 // NOTE: This needs to be percent decoded since the member variables contain decoded data.
658 url->m_paths[0] = URL::percent_decode(buffer.string_view());
659 url->m_fragment = "";
660 state = State::Fragment;
661 } else {
662 if (code_point != end_of_file && !is_url_code_point(code_point) && code_point != '%')
663 report_validation_error();
664 // FIXME: If c is U+0025 (%) and remaining does not start with two ASCII hex digits, validation error.
665 if (code_point != end_of_file) {
666 URL::append_percent_encoded_if_necessary(buffer, code_point, URL::PercentEncodeSet::C0Control);
667 } else {
668 // NOTE: This needs to be percent decoded since the member variables contain decoded data.
669 url->m_paths[0] = URL::percent_decode(buffer.string_view());
670 }
671 }
672 break;
673 case State::Query:
674 // https://url.spec.whatwg.org/#query-state
675 if (code_point == end_of_file || code_point == '#') {
676 VERIFY(url->m_query == "");
677 auto query_percent_encode_set = url->is_special() ? URL::PercentEncodeSet::SpecialQuery : URL::PercentEncodeSet::Query;
678 url->m_query = percent_encode_after_encoding(buffer.string_view(), query_percent_encode_set);
679 buffer.clear();
680 if (code_point == '#') {
681 url->m_fragment = "";
682 state = State::Fragment;
683 }
684 } else if (code_point != end_of_file) {
685 if (!is_url_code_point(code_point) && code_point != '%')
686 report_validation_error();
687 // FIXME: If c is U+0025 (%) and remaining does not start with two ASCII hex digits, validation error.
688 buffer.append_code_point(code_point);
689 }
690 break;
691 case State::Fragment:
692 // NOTE: This does not follow the spec exactly but rather uses the buffer and only sets the fragment on EOF.
693 if (code_point != end_of_file) {
694 if (!is_url_code_point(code_point) && code_point != '%')
695 report_validation_error();
696 // FIXME: If c is U+0025 (%) and remaining does not start with two ASCII hex digits, validation error.
697 buffer.append_code_point(code_point);
698 } else {
699 // NOTE: This needs to be percent decoded since the member variables contain decoded data.
700 url->m_fragment = URL::percent_decode(buffer.string_view());
701 buffer.clear();
702 }
703 break;
704 default:
705 VERIFY_NOT_REACHED();
706 }
707
708 if (iterator.done())
709 break;
710 ++iterator;
711 }
712
713 url->m_valid = true;
714 dbgln_if(URL_PARSER_DEBUG, "URLParser::parse: Parsed URL to be '{}'.", url->serialize());
715 return url.release_value();
716}
717
718}