Serenity Operating System
at master 718 lines 32 kB view raw
1/* 2 * Copyright (c) 2021, Max Wipfli <mail@maxwipfli.ch> 3 * 4 * SPDX-License-Identifier: BSD-2-Clause 5 */ 6 7#include <AK/CharacterTypes.h> 8#include <AK/Debug.h> 9#include <AK/DeprecatedString.h> 10#include <AK/Optional.h> 11#include <AK/SourceLocation.h> 12#include <AK/StringBuilder.h> 13#include <AK/StringUtils.h> 14#include <AK/URLParser.h> 15#include <AK/Utf8View.h> 16 17namespace AK { 18 19// NOTE: This is similar to the LibC macro EOF = -1. 20constexpr u32 end_of_file = 0xFFFFFFFF; 21 22static bool is_url_code_point(u32 code_point) 23{ 24 // FIXME: [...] and code points in the range U+00A0 to U+10FFFD, inclusive, excluding surrogates and noncharacters. 25 return is_ascii_alphanumeric(code_point) || code_point >= 0xA0 || "!$&'()*+,-./:;=?@_~"sv.contains(code_point); 26} 27 28static void report_validation_error(SourceLocation const& location = SourceLocation::current()) 29{ 30 dbgln_if(URL_PARSER_DEBUG, "URLParser::parse: Validation error! {}", location); 31} 32 33static Optional<DeprecatedString> parse_opaque_host(StringView input) 34{ 35 auto forbidden_host_characters_excluding_percent = "\0\t\n\r #/:<>?@[\\]^|"sv; 36 for (auto character : forbidden_host_characters_excluding_percent) { 37 if (input.contains(character)) { 38 report_validation_error(); 39 return {}; 40 } 41 } 42 // FIXME: If input contains a code point that is not a URL code point and not U+0025 (%), validation error. 43 // FIXME: If input contains a U+0025 (%) and the two code points following it are not ASCII hex digits, validation error. 44 return URL::percent_encode(input, URL::PercentEncodeSet::C0Control); 45} 46 47static Optional<DeprecatedString> parse_ipv4_address(StringView input) 48{ 49 // FIXME: Implement the correct IPv4 parser as specified by https://url.spec.whatwg.org/#concept-ipv4-parser. 50 return input; 51} 52 53// https://url.spec.whatwg.org/#concept-host-parser 54// NOTE: This is a very bare-bones implementation. 55static Optional<DeprecatedString> parse_host(StringView input, bool is_not_special = false) 56{ 57 if (input.starts_with('[')) { 58 if (!input.ends_with(']')) { 59 report_validation_error(); 60 return {}; 61 } 62 // FIXME: Return the result of IPv6 parsing input with its leading U+005B ([) and trailing U+005D (]) removed. 63 TODO(); 64 } 65 66 if (is_not_special) 67 return parse_opaque_host(input); 68 VERIFY(!input.is_empty()); 69 70 // FIXME: Let domain be the result of running UTF-8 decode without BOM on the percent-decoding of input. 71 auto domain = URL::percent_decode(input); 72 // FIXME: Let asciiDomain be the result of running domain to ASCII on domain. 73 auto& ascii_domain = domain; 74 75 auto forbidden_host_characters = "\0\t\n\r #%/:<>?@[\\]^|"sv; 76 for (auto character : forbidden_host_characters) { 77 if (ascii_domain.view().contains(character)) { 78 report_validation_error(); 79 return {}; 80 } 81 } 82 83 auto ipv4_host = parse_ipv4_address(ascii_domain); 84 return ipv4_host; 85} 86 87// https://url.spec.whatwg.org/#start-with-a-windows-drive-letter 88constexpr bool starts_with_windows_drive_letter(StringView input) 89{ 90 if (input.length() < 2) 91 return false; 92 if (!is_ascii_alpha(input[0]) || !(input[1] == ':' || input[1] == '|')) 93 return false; 94 if (input.length() == 2) 95 return true; 96 return "/\\?#"sv.contains(input[2]); 97} 98 99constexpr bool is_windows_drive_letter(StringView input) 100{ 101 return input.length() == 2 && is_ascii_alpha(input[0]) && (input[1] == ':' || input[1] == '|'); 102} 103 104constexpr bool is_normalized_windows_drive_letter(StringView input) 105{ 106 return input.length() == 2 && is_ascii_alpha(input[0]) && input[1] == ':'; 107} 108 109constexpr bool is_single_dot_path_segment(StringView input) 110{ 111 return input == "."sv || input.equals_ignoring_ascii_case("%2e"sv); 112} 113 114constexpr bool is_double_dot_path_segment(StringView input) 115{ 116 return input == ".."sv || input.equals_ignoring_ascii_case(".%2e"sv) || input.equals_ignoring_ascii_case("%2e."sv) || input.equals_ignoring_ascii_case("%2e%2e"sv); 117} 118 119// https://url.spec.whatwg.org/#string-percent-encode-after-encoding 120static DeprecatedString percent_encode_after_encoding(StringView input, URL::PercentEncodeSet percent_encode_set, bool space_as_plus = false) 121{ 122 // NOTE: This is written somewhat ad-hoc since we don't yet implement the Encoding spec. 123 124 StringBuilder output; 125 126 // 3. For each byte of encodeOutput converted to a byte sequence: 127 for (auto byte : input) { 128 // 1. If spaceAsPlus is true and byte is 0x20 (SP), then append U+002B (+) to output and continue. 129 if (space_as_plus && byte == ' ') { 130 output.append('+'); 131 continue; 132 } 133 134 // 2. Let isomorph be a code point whose value is byte’s value. 135 u32 isomorph = byte; 136 137 // 3. Assert: percentEncodeSet includes all non-ASCII code points. 138 139 // 4. If isomorphic is not in percentEncodeSet, then append isomorph to output. 140 if (!URL::code_point_is_in_percent_encode_set(isomorph, percent_encode_set)) { 141 output.append_code_point(isomorph); 142 } 143 144 // 5. Otherwise, percent-encode byte and append the result to output. 145 else { 146 output.appendff("%{:02X}", byte); 147 } 148 } 149 150 // 6. Return output. 151 return output.to_deprecated_string(); 152} 153 154// https://fetch.spec.whatwg.org/#data-urls 155// FIXME: This only loosely follows the spec, as we use the same class for "regular" and data URLs, unlike the spec. 156Optional<URL> URLParser::parse_data_url(StringView raw_input) 157{ 158 dbgln_if(URL_PARSER_DEBUG, "URLParser::parse_data_url: Parsing '{}'.", raw_input); 159 VERIFY(raw_input.starts_with("data:"sv)); 160 auto input = raw_input.substring_view(5); 161 auto comma_offset = input.find(','); 162 if (!comma_offset.has_value()) 163 return {}; 164 auto mime_type = StringUtils::trim(input.substring_view(0, comma_offset.value()), "\t\n\f\r "sv, TrimMode::Both); 165 auto encoded_body = input.substring_view(comma_offset.value() + 1); 166 auto body = URL::percent_decode(encoded_body); 167 bool is_base64_encoded = false; 168 if (mime_type.ends_with("base64"sv, CaseSensitivity::CaseInsensitive)) { 169 auto substring_view = mime_type.substring_view(0, mime_type.length() - 6); 170 auto trimmed_substring_view = StringUtils::trim(substring_view, " "sv, TrimMode::Right); 171 if (trimmed_substring_view.ends_with(';')) { 172 is_base64_encoded = true; 173 mime_type = trimmed_substring_view.substring_view(0, trimmed_substring_view.length() - 1); 174 } 175 } 176 177 StringBuilder builder; 178 if (mime_type.starts_with(";"sv) || mime_type.is_empty()) { 179 builder.append("text/plain"sv); 180 builder.append(mime_type); 181 mime_type = builder.string_view(); 182 } 183 184 // FIXME: Parse the MIME type's components according to https://mimesniff.spec.whatwg.org/#parse-a-mime-type 185 URL url { StringUtils::trim(mime_type, "\n\r\t "sv, TrimMode::Both), move(body), is_base64_encoded }; 186 dbgln_if(URL_PARSER_DEBUG, "URLParser::parse_data_url: Parsed data URL to be '{}'.", url.serialize()); 187 return url; 188} 189 190// https://url.spec.whatwg.org/#concept-basic-url-parser 191// NOTE: This parser assumes a UTF-8 encoding. 192// NOTE: Refrain from using the URL classes setters inside this algorithm. Rather, set the values directly. This bypasses the setters' built-in 193// validation, which is strictly unnecessary since we set m_valid=true at the end anyways. Furthermore, this algorithm may be used in the 194// future for validation of URLs, which would then lead to infinite recursion. 195// The same goes for base_url, because e.g. the port() getter does not always return m_port, and we are interested in the underlying member 196// variables' values here, not what the URL class presents to its users. 197// NOTE: Since the URL class's member variables contain percent decoded data, we have to deviate from the URL parser specification when setting 198// some of those values. Because the specification leaves all values percent encoded in their URL data structure, we have to percent decode 199// everything before setting the member variables. 200URL URLParser::parse(StringView raw_input, URL const* base_url, Optional<URL> url, Optional<State> state_override) 201{ 202 dbgln_if(URL_PARSER_DEBUG, "URLParser::parse: Parsing '{}'", raw_input); 203 if (raw_input.is_empty()) 204 return base_url ? *base_url : URL {}; 205 206 if (raw_input.starts_with("data:"sv)) { 207 auto maybe_url = parse_data_url(raw_input); 208 if (!maybe_url.has_value()) 209 return {}; 210 return maybe_url.release_value(); 211 } 212 213 size_t start_index = 0; 214 size_t end_index = raw_input.length(); 215 if (!url.has_value()) { 216 url = URL(); 217 218 // NOTE: This removes all leading and trailing C0 control or space characters. 219 bool has_validation_error = false; 220 for (size_t i = 0; i < raw_input.length(); ++i) { 221 i8 ch = raw_input[i]; 222 if (0 <= ch && ch <= 0x20) { 223 ++start_index; 224 has_validation_error = true; 225 } else { 226 break; 227 } 228 } 229 for (ssize_t i = raw_input.length() - 1; i >= 0; --i) { 230 i8 ch = raw_input[i]; 231 if (0 <= ch && ch <= 0x20) { 232 --end_index; 233 has_validation_error = true; 234 } else { 235 break; 236 } 237 } 238 if (has_validation_error) 239 report_validation_error(); 240 } 241 if (start_index >= end_index) 242 return {}; 243 244 DeprecatedString processed_input = raw_input.substring_view(start_index, end_index - start_index); 245 246 // NOTE: This replaces all tab and newline characters with nothing. 247 if (processed_input.contains("\t"sv) || processed_input.contains("\n"sv)) { 248 report_validation_error(); 249 processed_input = processed_input.replace("\t"sv, ""sv, ReplaceMode::All).replace("\n"sv, ""sv, ReplaceMode::All); 250 } 251 252 State state = state_override.value_or(State::SchemeStart); 253 StringBuilder buffer; 254 bool at_sign_seen = false; 255 bool inside_brackets = false; 256 bool password_token_seen = false; 257 258 Utf8View input(processed_input); 259 Utf8CodePointIterator iterator = input.begin(); 260 261 auto get_remaining = [&input, &iterator] { 262 return input.substring_view(iterator - input.begin() + iterator.underlying_code_point_length_in_bytes()).as_string(); 263 }; 264 265 // NOTE: "continue" should only be used to prevent incrementing the iterator, as this is done at the end of the loop. 266 // ++iterator : "increase pointer by 1" 267 // continue : "decrease pointer by 1" 268 for (;;) { 269 u32 code_point = end_of_file; 270 if (!iterator.done()) 271 code_point = *iterator; 272 273 if constexpr (URL_PARSER_DEBUG) { 274 if (code_point == end_of_file) 275 dbgln("URLParser::parse: {} state with EOF.", state_name(state)); 276 else if (is_ascii_printable(code_point)) 277 dbgln("URLParser::parse: {} state with code point U+{:04X} ({:c}).", state_name(state), code_point, code_point); 278 else 279 dbgln("URLParser::parse: {} state with code point U+{:04X}.", state_name(state), code_point); 280 } 281 282 switch (state) { 283 case State::SchemeStart: 284 if (is_ascii_alpha(code_point)) { 285 buffer.append_as_lowercase(code_point); 286 state = State::Scheme; 287 } else { 288 state = State::NoScheme; 289 continue; 290 } 291 break; 292 case State::Scheme: 293 if (is_ascii_alphanumeric(code_point) || code_point == '+' || code_point == '-' || code_point == '.') { 294 buffer.append_as_lowercase(code_point); 295 } else if (code_point == ':') { 296 url->m_scheme = buffer.to_deprecated_string(); 297 buffer.clear(); 298 if (url->scheme() == "file") { 299 if (!get_remaining().starts_with("//"sv)) { 300 report_validation_error(); 301 } 302 state = State::File; 303 } else if (url->is_special()) { 304 if (base_url && base_url->m_scheme == url->m_scheme) 305 state = State::SpecialRelativeOrAuthority; 306 else 307 state = State::SpecialAuthoritySlashes; 308 } else if (get_remaining().starts_with("/"sv)) { 309 state = State::PathOrAuthority; 310 ++iterator; 311 } else { 312 url->m_cannot_be_a_base_url = true; 313 url->append_path(""); 314 state = State::CannotBeABaseUrlPath; 315 } 316 } else { 317 buffer.clear(); 318 state = State::NoScheme; 319 iterator = input.begin(); 320 continue; 321 } 322 break; 323 case State::NoScheme: 324 if (!base_url || (base_url->m_cannot_be_a_base_url && code_point != '#')) { 325 report_validation_error(); 326 return {}; 327 } else if (base_url->m_cannot_be_a_base_url && code_point == '#') { 328 url->m_scheme = base_url->m_scheme; 329 url->m_paths = base_url->m_paths; 330 url->m_query = base_url->m_query; 331 url->m_fragment = ""; 332 url->m_cannot_be_a_base_url = true; 333 state = State::Fragment; 334 } else if (base_url->m_scheme != "file") { 335 state = State::Relative; 336 continue; 337 } else { 338 state = State::File; 339 continue; 340 } 341 break; 342 case State::SpecialRelativeOrAuthority: 343 if (code_point == '/' && get_remaining().starts_with("/"sv)) { 344 state = State::SpecialAuthorityIgnoreSlashes; 345 ++iterator; 346 } else { 347 report_validation_error(); 348 state = State::Relative; 349 continue; 350 } 351 break; 352 case State::PathOrAuthority: 353 if (code_point == '/') { 354 state = State::Authority; 355 } else { 356 state = State::Path; 357 continue; 358 } 359 break; 360 case State::Relative: 361 url->m_scheme = base_url->m_scheme; 362 if (code_point == '/') { 363 state = State::RelativeSlash; 364 } else if (url->is_special() && code_point == '\\') { 365 report_validation_error(); 366 state = State::RelativeSlash; 367 } else { 368 url->m_username = base_url->m_username; 369 url->m_password = base_url->m_password; 370 url->m_host = base_url->m_host; 371 url->m_port = base_url->m_port; 372 url->m_paths = base_url->m_paths; 373 url->m_query = base_url->m_query; 374 375 if (code_point == '?') { 376 url->m_query = ""; 377 state = State::Query; 378 } else if (code_point == '#') { 379 url->m_fragment = ""; 380 state = State::Fragment; 381 } else if (code_point != end_of_file) { 382 url->m_query = {}; 383 if (url->m_paths.size()) 384 url->m_paths.remove(url->m_paths.size() - 1); 385 state = State::Path; 386 continue; 387 } 388 } 389 break; 390 case State::RelativeSlash: 391 if (url->is_special() && (code_point == '/' || code_point == '\\')) { 392 if (code_point == '\\') 393 report_validation_error(); 394 state = State::SpecialAuthorityIgnoreSlashes; 395 } else if (code_point == '/') { 396 state = State::Authority; 397 } else { 398 url->m_username = base_url->m_username; 399 url->m_password = base_url->m_password; 400 url->m_host = base_url->m_host; 401 url->m_port = base_url->m_port; 402 state = State::Path; 403 continue; 404 } 405 break; 406 case State::SpecialAuthoritySlashes: 407 if (code_point == '/' && get_remaining().starts_with("/"sv)) { 408 state = State::SpecialAuthorityIgnoreSlashes; 409 ++iterator; 410 } else { 411 report_validation_error(); 412 state = State::SpecialAuthorityIgnoreSlashes; 413 continue; 414 } 415 break; 416 case State::SpecialAuthorityIgnoreSlashes: 417 if (code_point != '/' && code_point != '\\') { 418 state = State::Authority; 419 continue; 420 } else { 421 report_validation_error(); 422 } 423 break; 424 case State::Authority: 425 if (code_point == '@') { 426 report_validation_error(); 427 if (at_sign_seen) { 428 auto content = buffer.to_deprecated_string(); 429 buffer.clear(); 430 buffer.append("%40"sv); 431 buffer.append(content); 432 } 433 at_sign_seen = true; 434 StringBuilder builder; 435 for (auto c : Utf8View(builder.string_view())) { 436 if (c == ':' && !password_token_seen) { 437 password_token_seen = true; 438 continue; 439 } 440 builder.clear(); 441 if (password_token_seen) { 442 builder.append(url->password()); 443 URL::append_percent_encoded_if_necessary(builder, c, URL::PercentEncodeSet::Userinfo); 444 // NOTE: This is has to be encoded and then decoded because the original sequence could contain already percent-encoded sequences. 445 url->m_password = URL::percent_decode(builder.string_view()); 446 } else { 447 builder.append(url->username()); 448 URL::append_percent_encoded_if_necessary(builder, c, URL::PercentEncodeSet::Userinfo); 449 // NOTE: This is has to be encoded and then decoded because the original sequence could contain already percent-encoded sequences. 450 url->m_username = URL::percent_decode(builder.string_view()); 451 } 452 } 453 buffer.clear(); 454 } else if (code_point == end_of_file || code_point == '/' || code_point == '?' || code_point == '#' || (url->is_special() && code_point == '\\')) { 455 if (at_sign_seen && buffer.is_empty()) { 456 report_validation_error(); 457 return {}; 458 } 459 // NOTE: This decreases the iterator by the number of code points in buffer plus one. 460 iterator = input.iterator_at_byte_offset(iterator - input.begin() - buffer.length() - 1); 461 buffer.clear(); 462 state = State::Host; 463 } else { 464 buffer.append_code_point(code_point); 465 } 466 break; 467 case State::Host: 468 case State::Hostname: 469 if (code_point == ':' && !inside_brackets) { 470 if (buffer.is_empty()) { 471 report_validation_error(); 472 return {}; 473 } 474 auto host = parse_host(buffer.string_view(), !url->is_special()); 475 if (!host.has_value()) 476 return {}; 477 url->m_host = host.release_value(); 478 buffer.clear(); 479 state = State::Port; 480 } else if (code_point == end_of_file || code_point == '/' || code_point == '?' || code_point == '#' || (url->is_special() && code_point == '\\')) { 481 if (url->is_special() && buffer.is_empty()) { 482 report_validation_error(); 483 return {}; 484 } 485 auto host = parse_host(buffer.string_view(), !url->is_special()); 486 if (!host.has_value()) 487 return {}; 488 url->m_host = host.value(); 489 buffer.clear(); 490 state = State::Port; 491 continue; 492 } else if (code_point == '[') { 493 inside_brackets = true; 494 } else if (code_point == ']') { 495 inside_brackets = false; 496 } else { 497 buffer.append_code_point(code_point); 498 } 499 break; 500 case State::Port: 501 if (is_ascii_digit(code_point)) { 502 buffer.append_code_point(code_point); 503 } else if (code_point == end_of_file || code_point == '/' || code_point == '?' || code_point == '#' || (url->is_special() && code_point == '\\')) { 504 if (!buffer.is_empty()) { 505 auto port = buffer.string_view().to_uint(); 506 if (!port.has_value() || port.value() > 65535) { 507 report_validation_error(); 508 return {}; 509 } 510 if (port.value() == URL::default_port_for_scheme(url->scheme())) 511 url->m_port = {}; 512 else 513 url->m_port = port.value(); 514 buffer.clear(); 515 } 516 state = State::PathStart; 517 continue; 518 } else { 519 report_validation_error(); 520 return {}; 521 } 522 break; 523 case State::File: 524 url->m_scheme = "file"; 525 url->m_host = ""; 526 if (code_point == '/' || code_point == '\\') { 527 if (code_point == '\\') 528 report_validation_error(); 529 state = State::FileSlash; 530 } else if (base_url && base_url->m_scheme == "file") { 531 url->m_host = base_url->m_host; 532 url->m_paths = base_url->m_paths; 533 url->m_query = base_url->m_query; 534 if (code_point == '?') { 535 url->m_query = ""; 536 state = State::Query; 537 } else if (code_point == '#') { 538 url->m_fragment = ""; 539 state = State::Fragment; 540 } else if (code_point != end_of_file) { 541 url->m_query = {}; 542 auto substring_from_pointer = input.substring_view(iterator - input.begin()).as_string(); 543 if (!starts_with_windows_drive_letter(substring_from_pointer)) { 544 if (!url->paths().is_empty() && !(url->scheme() == "file" && url->paths().size() == 1 && is_normalized_windows_drive_letter(url->paths()[0]))) 545 url->m_paths.remove(url->m_paths.size() - 1); 546 } else { 547 report_validation_error(); 548 url->m_paths.clear(); 549 } 550 state = State::Path; 551 continue; 552 } 553 } 554 break; 555 case State::FileSlash: 556 if (code_point == '/' || code_point == '\\') { 557 if (code_point == '\\') 558 report_validation_error(); 559 state = State::FileHost; 560 } else if (base_url && base_url->m_scheme == "file") { 561 url->m_host = base_url->m_host; 562 auto substring_from_pointer = input.substring_view(iterator - input.begin()).as_string(); 563 if (!starts_with_windows_drive_letter(substring_from_pointer) && is_normalized_windows_drive_letter(base_url->m_paths[0])) 564 url->append_path(base_url->m_paths[0]); 565 state = State::Path; 566 continue; 567 } 568 break; 569 case State::FileHost: 570 if (code_point == end_of_file || code_point == '/' || code_point == '\\' || code_point == '?' || code_point == '#') { 571 if (is_windows_drive_letter(buffer.string_view())) { 572 report_validation_error(); 573 state = State::Path; 574 } else if (buffer.is_empty()) { 575 url->m_host = ""; 576 state = State::PathStart; 577 } else { 578 auto host = parse_host(buffer.string_view(), true); 579 if (!host.has_value()) 580 return {}; 581 if (host.value() == "localhost") 582 host = ""; 583 url->m_host = host.release_value(); 584 buffer.clear(); 585 state = State::PathStart; 586 } 587 continue; 588 } else { 589 buffer.append_code_point(code_point); 590 } 591 break; 592 case State::PathStart: 593 if (url->is_special()) { 594 if (code_point == '\\') 595 report_validation_error(); 596 state = State::Path; 597 if (code_point != '/' && code_point != '\\') 598 continue; 599 } else if (code_point == '?') { 600 url->m_query = ""; 601 state = State::Query; 602 } else if (code_point == '#') { 603 url->m_fragment = ""; 604 state = State::Fragment; 605 } else if (code_point != end_of_file) { 606 state = State::Path; 607 if (code_point != '/') 608 continue; 609 } 610 break; 611 case State::Path: 612 if (code_point == end_of_file || code_point == '/' || (url->is_special() && code_point == '\\') || code_point == '?' || code_point == '#') { 613 if (url->is_special() && code_point == '\\') 614 report_validation_error(); 615 if (is_double_dot_path_segment(buffer.string_view())) { 616 if (!url->m_paths.is_empty() && !(url->m_scheme == "file" && url->m_paths.size() == 1 && is_normalized_windows_drive_letter(url->m_paths[0]))) 617 url->m_paths.remove(url->m_paths.size() - 1); 618 if (code_point != '/' && !(url->is_special() && code_point == '\\')) 619 url->append_path(""); 620 } else if (is_single_dot_path_segment(buffer.string_view()) && code_point != '/' && !(url->is_special() && code_point == '\\')) { 621 url->append_path(""); 622 } else if (!is_single_dot_path_segment(buffer.string_view())) { 623 if (url->m_scheme == "file" && url->m_paths.is_empty() && is_windows_drive_letter(buffer.string_view())) { 624 auto drive_letter = buffer.string_view()[0]; 625 buffer.clear(); 626 buffer.append(drive_letter); 627 buffer.append(':'); 628 } 629 // NOTE: This needs to be percent decoded since the member variables contain decoded data. 630 url->append_path(URL::percent_decode(buffer.string_view())); 631 } 632 buffer.clear(); 633 if (code_point == '?') { 634 url->m_query = ""; 635 state = State::Query; 636 } else if (code_point == '#') { 637 url->m_fragment = ""; 638 state = State::Fragment; 639 } 640 } else { 641 if (!is_url_code_point(code_point) && code_point != '%') 642 report_validation_error(); 643 // FIXME: If c is U+0025 (%) and remaining does not start with two ASCII hex digits, validation error. 644 URL::append_percent_encoded_if_necessary(buffer, code_point, URL::PercentEncodeSet::Path); 645 } 646 break; 647 case State::CannotBeABaseUrlPath: 648 // NOTE: This does not follow the spec exactly but rather uses the buffer and only sets the path on EOF. 649 // NOTE: Verify that the assumptions required for this simplification are correct. 650 VERIFY(url->m_paths.size() == 1 && url->m_paths[0].is_empty()); 651 if (code_point == '?') { 652 // NOTE: This needs to be percent decoded since the member variables contain decoded data. 653 url->m_paths[0] = URL::percent_decode(buffer.string_view()); 654 url->m_query = ""; 655 state = State::Query; 656 } else if (code_point == '#') { 657 // NOTE: This needs to be percent decoded since the member variables contain decoded data. 658 url->m_paths[0] = URL::percent_decode(buffer.string_view()); 659 url->m_fragment = ""; 660 state = State::Fragment; 661 } else { 662 if (code_point != end_of_file && !is_url_code_point(code_point) && code_point != '%') 663 report_validation_error(); 664 // FIXME: If c is U+0025 (%) and remaining does not start with two ASCII hex digits, validation error. 665 if (code_point != end_of_file) { 666 URL::append_percent_encoded_if_necessary(buffer, code_point, URL::PercentEncodeSet::C0Control); 667 } else { 668 // NOTE: This needs to be percent decoded since the member variables contain decoded data. 669 url->m_paths[0] = URL::percent_decode(buffer.string_view()); 670 } 671 } 672 break; 673 case State::Query: 674 // https://url.spec.whatwg.org/#query-state 675 if (code_point == end_of_file || code_point == '#') { 676 VERIFY(url->m_query == ""); 677 auto query_percent_encode_set = url->is_special() ? URL::PercentEncodeSet::SpecialQuery : URL::PercentEncodeSet::Query; 678 url->m_query = percent_encode_after_encoding(buffer.string_view(), query_percent_encode_set); 679 buffer.clear(); 680 if (code_point == '#') { 681 url->m_fragment = ""; 682 state = State::Fragment; 683 } 684 } else if (code_point != end_of_file) { 685 if (!is_url_code_point(code_point) && code_point != '%') 686 report_validation_error(); 687 // FIXME: If c is U+0025 (%) and remaining does not start with two ASCII hex digits, validation error. 688 buffer.append_code_point(code_point); 689 } 690 break; 691 case State::Fragment: 692 // NOTE: This does not follow the spec exactly but rather uses the buffer and only sets the fragment on EOF. 693 if (code_point != end_of_file) { 694 if (!is_url_code_point(code_point) && code_point != '%') 695 report_validation_error(); 696 // FIXME: If c is U+0025 (%) and remaining does not start with two ASCII hex digits, validation error. 697 buffer.append_code_point(code_point); 698 } else { 699 // NOTE: This needs to be percent decoded since the member variables contain decoded data. 700 url->m_fragment = URL::percent_decode(buffer.string_view()); 701 buffer.clear(); 702 } 703 break; 704 default: 705 VERIFY_NOT_REACHED(); 706 } 707 708 if (iterator.done()) 709 break; 710 ++iterator; 711 } 712 713 url->m_valid = true; 714 dbgln_if(URL_PARSER_DEBUG, "URLParser::parse: Parsed URL to be '{}'.", url->serialize()); 715 return url.release_value(); 716} 717 718}