AK/URLParser.cpp at master · jcs.org/serenity

jcs.org / serenity
fork atom
Serenity Operating System
fork atom
serenity / AK / URLParser.cpp
at master 718 lines 32 kB view raw
wrap content
Andreas Kling Everywhere: Rename equals_ignoring_case => equals_ignoring_ascii_case 3y ago
a504ac3e
  1/*
  2 * Copyright (c) 2021, Max Wipfli <mail@maxwipfli.ch>
  3 *
  4 * SPDX-License-Identifier: BSD-2-Clause
  5 */
  6
  7#include <AK/CharacterTypes.h>
  8#include <AK/Debug.h>
  9#include <AK/DeprecatedString.h>
 10#include <AK/Optional.h>
 11#include <AK/SourceLocation.h>
 12#include <AK/StringBuilder.h>
 13#include <AK/StringUtils.h>
 14#include <AK/URLParser.h>
 15#include <AK/Utf8View.h>
 16
 17namespace AK {
 18
 19// NOTE: This is similar to the LibC macro EOF = -1.
 20constexpr u32 end_of_file = 0xFFFFFFFF;
 21
 22static bool is_url_code_point(u32 code_point)
 23{
 24    // FIXME: [...] and code points in the range U+00A0 to U+10FFFD, inclusive, excluding surrogates and noncharacters.
 25    return is_ascii_alphanumeric(code_point) || code_point >= 0xA0 || "!$&'()*+,-./:;=?@_~"sv.contains(code_point);
 26}
 27
 28static void report_validation_error(SourceLocation const& location = SourceLocation::current())
 29{
 30    dbgln_if(URL_PARSER_DEBUG, "URLParser::parse: Validation error! {}", location);
 31}
 32
 33static Optional<DeprecatedString> parse_opaque_host(StringView input)
 34{
 35    auto forbidden_host_characters_excluding_percent = "\0\t\n\r #/:<>?@[\\]^|"sv;
 36    for (auto character : forbidden_host_characters_excluding_percent) {
 37        if (input.contains(character)) {
 38            report_validation_error();
 39            return {};
 40        }
 41    }
 42    // FIXME: If input contains a code point that is not a URL code point and not U+0025 (%), validation error.
 43    // FIXME: If input contains a U+0025 (%) and the two code points following it are not ASCII hex digits, validation error.
 44    return URL::percent_encode(input, URL::PercentEncodeSet::C0Control);
 45}
 46
 47static Optional<DeprecatedString> parse_ipv4_address(StringView input)
 48{
 49    // FIXME: Implement the correct IPv4 parser as specified by https://url.spec.whatwg.org/#concept-ipv4-parser.
 50    return input;
 51}
 52
 53// https://url.spec.whatwg.org/#concept-host-parser
 54// NOTE: This is a very bare-bones implementation.
 55static Optional<DeprecatedString> parse_host(StringView input, bool is_not_special = false)
 56{
 57    if (input.starts_with('[')) {
 58        if (!input.ends_with(']')) {
 59            report_validation_error();
 60            return {};
 61        }
 62        // FIXME: Return the result of IPv6 parsing input with its leading U+005B ([) and trailing U+005D (]) removed.
 63        TODO();
 64    }
 65
 66    if (is_not_special)
 67        return parse_opaque_host(input);
 68    VERIFY(!input.is_empty());
 69
 70    // FIXME: Let domain be the result of running UTF-8 decode without BOM on the percent-decoding of input.
 71    auto domain = URL::percent_decode(input);
 72    // FIXME: Let asciiDomain be the result of running domain to ASCII on domain.
 73    auto& ascii_domain = domain;
 74
 75    auto forbidden_host_characters = "\0\t\n\r #%/:<>?@[\\]^|"sv;
 76    for (auto character : forbidden_host_characters) {
 77        if (ascii_domain.view().contains(character)) {
 78            report_validation_error();
 79            return {};
 80        }
 81    }
 82
 83    auto ipv4_host = parse_ipv4_address(ascii_domain);
 84    return ipv4_host;
 85}
 86
 87// https://url.spec.whatwg.org/#start-with-a-windows-drive-letter
 88constexpr bool starts_with_windows_drive_letter(StringView input)
 89{
 90    if (input.length() < 2)
 91        return false;
 92    if (!is_ascii_alpha(input[0]) || !(input[1] == ':' || input[1] == '|'))
 93        return false;
 94    if (input.length() == 2)
 95        return true;
 96    return "/\\?#"sv.contains(input[2]);
 97}
 98
 99constexpr bool is_windows_drive_letter(StringView input)
100{
101    return input.length() == 2 && is_ascii_alpha(input[0]) && (input[1] == ':' || input[1] == '|');
102}
103
104constexpr bool is_normalized_windows_drive_letter(StringView input)
105{
106    return input.length() == 2 && is_ascii_alpha(input[0]) && input[1] == ':';
107}
108
109constexpr bool is_single_dot_path_segment(StringView input)
110{
111    return input == "."sv || input.equals_ignoring_ascii_case("%2e"sv);
112}
113
114constexpr bool is_double_dot_path_segment(StringView input)
115{
116    return input == ".."sv || input.equals_ignoring_ascii_case(".%2e"sv) || input.equals_ignoring_ascii_case("%2e."sv) || input.equals_ignoring_ascii_case("%2e%2e"sv);
117}
118
119// https://url.spec.whatwg.org/#string-percent-encode-after-encoding
120static DeprecatedString percent_encode_after_encoding(StringView input, URL::PercentEncodeSet percent_encode_set, bool space_as_plus = false)
121{
122    // NOTE: This is written somewhat ad-hoc since we don't yet implement the Encoding spec.
123
124    StringBuilder output;
125
126    // 3. For each byte of encodeOutput converted to a byte sequence:
127    for (auto byte : input) {
128        // 1. If spaceAsPlus is true and byte is 0x20 (SP), then append U+002B (+) to output and continue.
129        if (space_as_plus && byte == ' ') {
130            output.append('+');
131            continue;
132        }
133
134        // 2. Let isomorph be a code point whose value is byte’s value.
135        u32 isomorph = byte;
136
137        // 3. Assert: percentEncodeSet includes all non-ASCII code points.
138
139        // 4. If isomorphic is not in percentEncodeSet, then append isomorph to output.
140        if (!URL::code_point_is_in_percent_encode_set(isomorph, percent_encode_set)) {
141            output.append_code_point(isomorph);
142        }
143
144        // 5. Otherwise, percent-encode byte and append the result to output.
145        else {
146            output.appendff("%{:02X}", byte);
147        }
148    }
149
150    // 6. Return output.
151    return output.to_deprecated_string();
152}
153
154// https://fetch.spec.whatwg.org/#data-urls
155// FIXME: This only loosely follows the spec, as we use the same class for "regular" and data URLs, unlike the spec.
156Optional<URL> URLParser::parse_data_url(StringView raw_input)
157{
158    dbgln_if(URL_PARSER_DEBUG, "URLParser::parse_data_url: Parsing '{}'.", raw_input);
159    VERIFY(raw_input.starts_with("data:"sv));
160    auto input = raw_input.substring_view(5);
161    auto comma_offset = input.find(',');
162    if (!comma_offset.has_value())
163        return {};
164    auto mime_type = StringUtils::trim(input.substring_view(0, comma_offset.value()), "\t\n\f\r "sv, TrimMode::Both);
165    auto encoded_body = input.substring_view(comma_offset.value() + 1);
166    auto body = URL::percent_decode(encoded_body);
167    bool is_base64_encoded = false;
168    if (mime_type.ends_with("base64"sv, CaseSensitivity::CaseInsensitive)) {
169        auto substring_view = mime_type.substring_view(0, mime_type.length() - 6);
170        auto trimmed_substring_view = StringUtils::trim(substring_view, " "sv, TrimMode::Right);
171        if (trimmed_substring_view.ends_with(';')) {
172            is_base64_encoded = true;
173            mime_type = trimmed_substring_view.substring_view(0, trimmed_substring_view.length() - 1);
174        }
175    }
176
177    StringBuilder builder;
178    if (mime_type.starts_with(";"sv) || mime_type.is_empty()) {
179        builder.append("text/plain"sv);
180        builder.append(mime_type);
181        mime_type = builder.string_view();
182    }
183
184    // FIXME: Parse the MIME type's components according to https://mimesniff.spec.whatwg.org/#parse-a-mime-type
185    URL url { StringUtils::trim(mime_type, "\n\r\t "sv, TrimMode::Both), move(body), is_base64_encoded };
186    dbgln_if(URL_PARSER_DEBUG, "URLParser::parse_data_url: Parsed data URL to be '{}'.", url.serialize());
187    return url;
188}
189
190// https://url.spec.whatwg.org/#concept-basic-url-parser
191// NOTE: This parser assumes a UTF-8 encoding.
192// NOTE: Refrain from using the URL classes setters inside this algorithm. Rather, set the values directly. This bypasses the setters' built-in
193//       validation, which is strictly unnecessary since we set m_valid=true at the end anyways. Furthermore, this algorithm may be used in the
194//       future for validation of URLs, which would then lead to infinite recursion.
195//       The same goes for base_url, because e.g. the port() getter does not always return m_port, and we are interested in the underlying member
196//       variables' values here, not what the URL class presents to its users.
197// NOTE: Since the URL class's member variables contain percent decoded data, we have to deviate from the URL parser specification when setting
198//       some of those values. Because the specification leaves all values percent encoded in their URL data structure, we have to percent decode
199//       everything before setting the member variables.
200URL URLParser::parse(StringView raw_input, URL const* base_url, Optional<URL> url, Optional<State> state_override)
201{
202    dbgln_if(URL_PARSER_DEBUG, "URLParser::parse: Parsing '{}'", raw_input);
203    if (raw_input.is_empty())
204        return base_url ? *base_url : URL {};
205
206    if (raw_input.starts_with("data:"sv)) {
207        auto maybe_url = parse_data_url(raw_input);
208        if (!maybe_url.has_value())
209            return {};
210        return maybe_url.release_value();
211    }
212
213    size_t start_index = 0;
214    size_t end_index = raw_input.length();
215    if (!url.has_value()) {
216        url = URL();
217
218        // NOTE: This removes all leading and trailing C0 control or space characters.
219        bool has_validation_error = false;
220        for (size_t i = 0; i < raw_input.length(); ++i) {
221            i8 ch = raw_input[i];
222            if (0 <= ch && ch <= 0x20) {
223                ++start_index;
224                has_validation_error = true;
225            } else {
226                break;
227            }
228        }
229        for (ssize_t i = raw_input.length() - 1; i >= 0; --i) {
230            i8 ch = raw_input[i];
231            if (0 <= ch && ch <= 0x20) {
232                --end_index;
233                has_validation_error = true;
234            } else {
235                break;
236            }
237        }
238        if (has_validation_error)
239            report_validation_error();
240    }
241    if (start_index >= end_index)
242        return {};
243
244    DeprecatedString processed_input = raw_input.substring_view(start_index, end_index - start_index);
245
246    // NOTE: This replaces all tab and newline characters with nothing.
247    if (processed_input.contains("\t"sv) || processed_input.contains("\n"sv)) {
248        report_validation_error();
249        processed_input = processed_input.replace("\t"sv, ""sv, ReplaceMode::All).replace("\n"sv, ""sv, ReplaceMode::All);
250    }
251
252    State state = state_override.value_or(State::SchemeStart);
253    StringBuilder buffer;
254    bool at_sign_seen = false;
255    bool inside_brackets = false;
256    bool password_token_seen = false;
257
258    Utf8View input(processed_input);
259    Utf8CodePointIterator iterator = input.begin();
260
261    auto get_remaining = [&input, &iterator] {
262        return input.substring_view(iterator - input.begin() + iterator.underlying_code_point_length_in_bytes()).as_string();
263    };
264
265    // NOTE: "continue" should only be used to prevent incrementing the iterator, as this is done at the end of the loop.
266    //       ++iterator : "increase pointer by 1"
267    //       continue   : "decrease pointer by 1"
268    for (;;) {
269        u32 code_point = end_of_file;
270        if (!iterator.done())
271            code_point = *iterator;
272
273        if constexpr (URL_PARSER_DEBUG) {
274            if (code_point == end_of_file)
275                dbgln("URLParser::parse: {} state with EOF.", state_name(state));
276            else if (is_ascii_printable(code_point))
277                dbgln("URLParser::parse: {} state with code point U+{:04X} ({:c}).", state_name(state), code_point, code_point);
278            else
279                dbgln("URLParser::parse: {} state with code point U+{:04X}.", state_name(state), code_point);
280        }
281
282        switch (state) {
283        case State::SchemeStart:
284            if (is_ascii_alpha(code_point)) {
285                buffer.append_as_lowercase(code_point);
286                state = State::Scheme;
287            } else {
288                state = State::NoScheme;
289                continue;
290            }
291            break;
292        case State::Scheme:
293            if (is_ascii_alphanumeric(code_point) || code_point == '+' || code_point == '-' || code_point == '.') {
294                buffer.append_as_lowercase(code_point);
295            } else if (code_point == ':') {
296                url->m_scheme = buffer.to_deprecated_string();
297                buffer.clear();
298                if (url->scheme() == "file") {
299                    if (!get_remaining().starts_with("//"sv)) {
300                        report_validation_error();
301                    }
302                    state = State::File;
303                } else if (url->is_special()) {
304                    if (base_url && base_url->m_scheme == url->m_scheme)
305                        state = State::SpecialRelativeOrAuthority;
306                    else
307                        state = State::SpecialAuthoritySlashes;
308                } else if (get_remaining().starts_with("/"sv)) {
309                    state = State::PathOrAuthority;
310                    ++iterator;
311                } else {
312                    url->m_cannot_be_a_base_url = true;
313                    url->append_path("");
314                    state = State::CannotBeABaseUrlPath;
315                }
316            } else {
317                buffer.clear();
318                state = State::NoScheme;
319                iterator = input.begin();
320                continue;
321            }
322            break;
323        case State::NoScheme:
324            if (!base_url || (base_url->m_cannot_be_a_base_url && code_point != '#')) {
325                report_validation_error();
326                return {};
327            } else if (base_url->m_cannot_be_a_base_url && code_point == '#') {
328                url->m_scheme = base_url->m_scheme;
329                url->m_paths = base_url->m_paths;
330                url->m_query = base_url->m_query;
331                url->m_fragment = "";
332                url->m_cannot_be_a_base_url = true;
333                state = State::Fragment;
334            } else if (base_url->m_scheme != "file") {
335                state = State::Relative;
336                continue;
337            } else {
338                state = State::File;
339                continue;
340            }
341            break;
342        case State::SpecialRelativeOrAuthority:
343            if (code_point == '/' && get_remaining().starts_with("/"sv)) {
344                state = State::SpecialAuthorityIgnoreSlashes;
345                ++iterator;
346            } else {
347                report_validation_error();
348                state = State::Relative;
349                continue;
350            }
351            break;
352        case State::PathOrAuthority:
353            if (code_point == '/') {
354                state = State::Authority;
355            } else {
356                state = State::Path;
357                continue;
358            }
359            break;
360        case State::Relative:
361            url->m_scheme = base_url->m_scheme;
362            if (code_point == '/') {
363                state = State::RelativeSlash;
364            } else if (url->is_special() && code_point == '\\') {
365                report_validation_error();
366                state = State::RelativeSlash;
367            } else {
368                url->m_username = base_url->m_username;
369                url->m_password = base_url->m_password;
370                url->m_host = base_url->m_host;
371                url->m_port = base_url->m_port;
372                url->m_paths = base_url->m_paths;
373                url->m_query = base_url->m_query;
374
375                if (code_point == '?') {
376                    url->m_query = "";
377                    state = State::Query;
378                } else if (code_point == '#') {
379                    url->m_fragment = "";
380                    state = State::Fragment;
381                } else if (code_point != end_of_file) {
382                    url->m_query = {};
383                    if (url->m_paths.size())
384                        url->m_paths.remove(url->m_paths.size() - 1);
385                    state = State::Path;
386                    continue;
387                }
388            }
389            break;
390        case State::RelativeSlash:
391            if (url->is_special() && (code_point == '/' || code_point == '\\')) {
392                if (code_point == '\\')
393                    report_validation_error();
394                state = State::SpecialAuthorityIgnoreSlashes;
395            } else if (code_point == '/') {
396                state = State::Authority;
397            } else {
398                url->m_username = base_url->m_username;
399                url->m_password = base_url->m_password;
400                url->m_host = base_url->m_host;
401                url->m_port = base_url->m_port;
402                state = State::Path;
403                continue;
404            }
405            break;
406        case State::SpecialAuthoritySlashes:
407            if (code_point == '/' && get_remaining().starts_with("/"sv)) {
408                state = State::SpecialAuthorityIgnoreSlashes;
409                ++iterator;
410            } else {
411                report_validation_error();
412                state = State::SpecialAuthorityIgnoreSlashes;
413                continue;
414            }
415            break;
416        case State::SpecialAuthorityIgnoreSlashes:
417            if (code_point != '/' && code_point != '\\') {
418                state = State::Authority;
419                continue;
420            } else {
421                report_validation_error();
422            }
423            break;
424        case State::Authority:
425            if (code_point == '@') {
426                report_validation_error();
427                if (at_sign_seen) {
428                    auto content = buffer.to_deprecated_string();
429                    buffer.clear();
430                    buffer.append("%40"sv);
431                    buffer.append(content);
432                }
433                at_sign_seen = true;
434                StringBuilder builder;
435                for (auto c : Utf8View(builder.string_view())) {
436                    if (c == ':' && !password_token_seen) {
437                        password_token_seen = true;
438                        continue;
439                    }
440                    builder.clear();
441                    if (password_token_seen) {
442                        builder.append(url->password());
443                        URL::append_percent_encoded_if_necessary(builder, c, URL::PercentEncodeSet::Userinfo);
444                        // NOTE: This is has to be encoded and then decoded because the original sequence could contain already percent-encoded sequences.
445                        url->m_password = URL::percent_decode(builder.string_view());
446                    } else {
447                        builder.append(url->username());
448                        URL::append_percent_encoded_if_necessary(builder, c, URL::PercentEncodeSet::Userinfo);
449                        // NOTE: This is has to be encoded and then decoded because the original sequence could contain already percent-encoded sequences.
450                        url->m_username = URL::percent_decode(builder.string_view());
451                    }
452                }
453                buffer.clear();
454            } else if (code_point == end_of_file || code_point == '/' || code_point == '?' || code_point == '#' || (url->is_special() && code_point == '\\')) {
455                if (at_sign_seen && buffer.is_empty()) {
456                    report_validation_error();
457                    return {};
458                }
459                // NOTE: This decreases the iterator by the number of code points in buffer plus one.
460                iterator = input.iterator_at_byte_offset(iterator - input.begin() - buffer.length() - 1);
461                buffer.clear();
462                state = State::Host;
463            } else {
464                buffer.append_code_point(code_point);
465            }
466            break;
467        case State::Host:
468        case State::Hostname:
469            if (code_point == ':' && !inside_brackets) {
470                if (buffer.is_empty()) {
471                    report_validation_error();
472                    return {};
473                }
474                auto host = parse_host(buffer.string_view(), !url->is_special());
475                if (!host.has_value())
476                    return {};
477                url->m_host = host.release_value();
478                buffer.clear();
479                state = State::Port;
480            } else if (code_point == end_of_file || code_point == '/' || code_point == '?' || code_point == '#' || (url->is_special() && code_point == '\\')) {
481                if (url->is_special() && buffer.is_empty()) {
482                    report_validation_error();
483                    return {};
484                }
485                auto host = parse_host(buffer.string_view(), !url->is_special());
486                if (!host.has_value())
487                    return {};
488                url->m_host = host.value();
489                buffer.clear();
490                state = State::Port;
491                continue;
492            } else if (code_point == '[') {
493                inside_brackets = true;
494            } else if (code_point == ']') {
495                inside_brackets = false;
496            } else {
497                buffer.append_code_point(code_point);
498            }
499            break;
500        case State::Port:
501            if (is_ascii_digit(code_point)) {
502                buffer.append_code_point(code_point);
503            } else if (code_point == end_of_file || code_point == '/' || code_point == '?' || code_point == '#' || (url->is_special() && code_point == '\\')) {
504                if (!buffer.is_empty()) {
505                    auto port = buffer.string_view().to_uint();
506                    if (!port.has_value() || port.value() > 65535) {
507                        report_validation_error();
508                        return {};
509                    }
510                    if (port.value() == URL::default_port_for_scheme(url->scheme()))
511                        url->m_port = {};
512                    else
513                        url->m_port = port.value();
514                    buffer.clear();
515                }
516                state = State::PathStart;
517                continue;
518            } else {
519                report_validation_error();
520                return {};
521            }
522            break;
523        case State::File:
524            url->m_scheme = "file";
525            url->m_host = "";
526            if (code_point == '/' || code_point == '\\') {
527                if (code_point == '\\')
528                    report_validation_error();
529                state = State::FileSlash;
530            } else if (base_url && base_url->m_scheme == "file") {
531                url->m_host = base_url->m_host;
532                url->m_paths = base_url->m_paths;
533                url->m_query = base_url->m_query;
534                if (code_point == '?') {
535                    url->m_query = "";
536                    state = State::Query;
537                } else if (code_point == '#') {
538                    url->m_fragment = "";
539                    state = State::Fragment;
540                } else if (code_point != end_of_file) {
541                    url->m_query = {};
542                    auto substring_from_pointer = input.substring_view(iterator - input.begin()).as_string();
543                    if (!starts_with_windows_drive_letter(substring_from_pointer)) {
544                        if (!url->paths().is_empty() && !(url->scheme() == "file" && url->paths().size() == 1 && is_normalized_windows_drive_letter(url->paths()[0])))
545                            url->m_paths.remove(url->m_paths.size() - 1);
546                    } else {
547                        report_validation_error();
548                        url->m_paths.clear();
549                    }
550                    state = State::Path;
551                    continue;
552                }
553            }
554            break;
555        case State::FileSlash:
556            if (code_point == '/' || code_point == '\\') {
557                if (code_point == '\\')
558                    report_validation_error();
559                state = State::FileHost;
560            } else if (base_url && base_url->m_scheme == "file") {
561                url->m_host = base_url->m_host;
562                auto substring_from_pointer = input.substring_view(iterator - input.begin()).as_string();
563                if (!starts_with_windows_drive_letter(substring_from_pointer) && is_normalized_windows_drive_letter(base_url->m_paths[0]))
564                    url->append_path(base_url->m_paths[0]);
565                state = State::Path;
566                continue;
567            }
568            break;
569        case State::FileHost:
570            if (code_point == end_of_file || code_point == '/' || code_point == '\\' || code_point == '?' || code_point == '#') {
571                if (is_windows_drive_letter(buffer.string_view())) {
572                    report_validation_error();
573                    state = State::Path;
574                } else if (buffer.is_empty()) {
575                    url->m_host = "";
576                    state = State::PathStart;
577                } else {
578                    auto host = parse_host(buffer.string_view(), true);
579                    if (!host.has_value())
580                        return {};
581                    if (host.value() == "localhost")
582                        host = "";
583                    url->m_host = host.release_value();
584                    buffer.clear();
585                    state = State::PathStart;
586                }
587                continue;
588            } else {
589                buffer.append_code_point(code_point);
590            }
591            break;
592        case State::PathStart:
593            if (url->is_special()) {
594                if (code_point == '\\')
595                    report_validation_error();
596                state = State::Path;
597                if (code_point != '/' && code_point != '\\')
598                    continue;
599            } else if (code_point == '?') {
600                url->m_query = "";
601                state = State::Query;
602            } else if (code_point == '#') {
603                url->m_fragment = "";
604                state = State::Fragment;
605            } else if (code_point != end_of_file) {
606                state = State::Path;
607                if (code_point != '/')
608                    continue;
609            }
610            break;
611        case State::Path:
612            if (code_point == end_of_file || code_point == '/' || (url->is_special() && code_point == '\\') || code_point == '?' || code_point == '#') {
613                if (url->is_special() && code_point == '\\')
614                    report_validation_error();
615                if (is_double_dot_path_segment(buffer.string_view())) {
616                    if (!url->m_paths.is_empty() && !(url->m_scheme == "file" && url->m_paths.size() == 1 && is_normalized_windows_drive_letter(url->m_paths[0])))
617                        url->m_paths.remove(url->m_paths.size() - 1);
618                    if (code_point != '/' && !(url->is_special() && code_point == '\\'))
619                        url->append_path("");
620                } else if (is_single_dot_path_segment(buffer.string_view()) && code_point != '/' && !(url->is_special() && code_point == '\\')) {
621                    url->append_path("");
622                } else if (!is_single_dot_path_segment(buffer.string_view())) {
623                    if (url->m_scheme == "file" && url->m_paths.is_empty() && is_windows_drive_letter(buffer.string_view())) {
624                        auto drive_letter = buffer.string_view()[0];
625                        buffer.clear();
626                        buffer.append(drive_letter);
627                        buffer.append(':');
628                    }
629                    // NOTE: This needs to be percent decoded since the member variables contain decoded data.
630                    url->append_path(URL::percent_decode(buffer.string_view()));
631                }
632                buffer.clear();
633                if (code_point == '?') {
634                    url->m_query = "";
635                    state = State::Query;
636                } else if (code_point == '#') {
637                    url->m_fragment = "";
638                    state = State::Fragment;
639                }
640            } else {
641                if (!is_url_code_point(code_point) && code_point != '%')
642                    report_validation_error();
643                // FIXME: If c is U+0025 (%) and remaining does not start with two ASCII hex digits, validation error.
644                URL::append_percent_encoded_if_necessary(buffer, code_point, URL::PercentEncodeSet::Path);
645            }
646            break;
647        case State::CannotBeABaseUrlPath:
648            // NOTE: This does not follow the spec exactly but rather uses the buffer and only sets the path on EOF.
649            // NOTE: Verify that the assumptions required for this simplification are correct.
650            VERIFY(url->m_paths.size() == 1 && url->m_paths[0].is_empty());
651            if (code_point == '?') {
652                // NOTE: This needs to be percent decoded since the member variables contain decoded data.
653                url->m_paths[0] = URL::percent_decode(buffer.string_view());
654                url->m_query = "";
655                state = State::Query;
656            } else if (code_point == '#') {
657                // NOTE: This needs to be percent decoded since the member variables contain decoded data.
658                url->m_paths[0] = URL::percent_decode(buffer.string_view());
659                url->m_fragment = "";
660                state = State::Fragment;
661            } else {
662                if (code_point != end_of_file && !is_url_code_point(code_point) && code_point != '%')
663                    report_validation_error();
664                // FIXME: If c is U+0025 (%) and remaining does not start with two ASCII hex digits, validation error.
665                if (code_point != end_of_file) {
666                    URL::append_percent_encoded_if_necessary(buffer, code_point, URL::PercentEncodeSet::C0Control);
667                } else {
668                    // NOTE: This needs to be percent decoded since the member variables contain decoded data.
669                    url->m_paths[0] = URL::percent_decode(buffer.string_view());
670                }
671            }
672            break;
673        case State::Query:
674            // https://url.spec.whatwg.org/#query-state
675            if (code_point == end_of_file || code_point == '#') {
676                VERIFY(url->m_query == "");
677                auto query_percent_encode_set = url->is_special() ? URL::PercentEncodeSet::SpecialQuery : URL::PercentEncodeSet::Query;
678                url->m_query = percent_encode_after_encoding(buffer.string_view(), query_percent_encode_set);
679                buffer.clear();
680                if (code_point == '#') {
681                    url->m_fragment = "";
682                    state = State::Fragment;
683                }
684            } else if (code_point != end_of_file) {
685                if (!is_url_code_point(code_point) && code_point != '%')
686                    report_validation_error();
687                // FIXME: If c is U+0025 (%) and remaining does not start with two ASCII hex digits, validation error.
688                buffer.append_code_point(code_point);
689            }
690            break;
691        case State::Fragment:
692            // NOTE: This does not follow the spec exactly but rather uses the buffer and only sets the fragment on EOF.
693            if (code_point != end_of_file) {
694                if (!is_url_code_point(code_point) && code_point != '%')
695                    report_validation_error();
696                // FIXME: If c is U+0025 (%) and remaining does not start with two ASCII hex digits, validation error.
697                buffer.append_code_point(code_point);
698            } else {
699                // NOTE: This needs to be percent decoded since the member variables contain decoded data.
700                url->m_fragment = URL::percent_decode(buffer.string_view());
701                buffer.clear();
702            }
703            break;
704        default:
705            VERIFY_NOT_REACHED();
706        }
707
708        if (iterator.done())
709            break;
710        ++iterator;
711    }
712
713    url->m_valid = true;
714    dbgln_if(URL_PARSER_DEBUG, "URLParser::parse: Parsed URL to be '{}'.", url->serialize());
715    return url.release_value();
716}
717
718}