we (web engine): Experimental web browser project to understand the limits of Claude
at main 2055 lines 63 kB view raw
1//! WHATWG URL parser. 2//! 3//! Implements the URL Standard (<https://url.spec.whatwg.org/>): 4//! - URL record type with scheme, username, password, host, port, path, query, fragment 5//! - State-machine parser following the spec 6//! - Host parsing: domains, IPv4 addresses, IPv6 addresses 7//! - Percent-encoding and decoding (UTF-8) 8//! - Special scheme handling (http, https, ftp, ws, wss, file) 9//! - Relative URL resolution via base URL 10//! - URL serialization 11//! - Origin derivation 12 13pub mod data_url; 14 15use core::fmt; 16 17// --------------------------------------------------------------------------- 18// Error types 19// --------------------------------------------------------------------------- 20 21#[derive(Debug, Clone, PartialEq, Eq)] 22pub enum UrlError { 23 /// Input is empty or contains only whitespace. 24 EmptyInput, 25 /// Invalid URL syntax. 26 InvalidUrl, 27 /// Invalid scheme. 28 InvalidScheme, 29 /// Invalid authority. 30 InvalidAuthority, 31 /// Invalid host. 32 InvalidHost, 33 /// Invalid port number. 34 InvalidPort, 35 /// Invalid IPv4 address. 36 InvalidIpv4, 37 /// Invalid IPv6 address. 38 InvalidIpv6, 39 /// Invalid percent-encoding. 40 InvalidPercentEncoding, 41 /// Relative URL without a base. 42 RelativeWithoutBase, 43 /// Missing scheme. 
44 MissingScheme, 45} 46 47impl fmt::Display for UrlError { 48 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 49 match self { 50 Self::EmptyInput => write!(f, "empty input"), 51 Self::InvalidUrl => write!(f, "invalid URL"), 52 Self::InvalidScheme => write!(f, "invalid scheme"), 53 Self::InvalidAuthority => write!(f, "invalid authority"), 54 Self::InvalidHost => write!(f, "invalid host"), 55 Self::InvalidPort => write!(f, "invalid port number"), 56 Self::InvalidIpv4 => write!(f, "invalid IPv4 address"), 57 Self::InvalidIpv6 => write!(f, "invalid IPv6 address"), 58 Self::InvalidPercentEncoding => write!(f, "invalid percent-encoding"), 59 Self::RelativeWithoutBase => write!(f, "relative URL without a base"), 60 Self::MissingScheme => write!(f, "missing scheme"), 61 } 62 } 63} 64 65pub type Result<T> = core::result::Result<T, UrlError>; 66 67// --------------------------------------------------------------------------- 68// Host 69// --------------------------------------------------------------------------- 70 71/// A parsed URL host. 72#[derive(Debug, Clone, PartialEq, Eq)] 73pub enum Host { 74 /// A domain name (already lowercased). 75 Domain(String), 76 /// An IPv4 address. 77 Ipv4(u32), 78 /// An IPv6 address (128 bits as eight 16-bit pieces). 79 Ipv6([u16; 8]), 80} 81 82impl Host { 83 /// Serialize the host to a string. 84 pub fn serialize(&self) -> String { 85 match self { 86 Host::Domain(d) => d.clone(), 87 Host::Ipv4(addr) => serialize_ipv4(*addr), 88 Host::Ipv6(pieces) => format!("[{}]", serialize_ipv6(pieces)), 89 } 90 } 91} 92 93impl fmt::Display for Host { 94 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 95 write!(f, "{}", self.serialize()) 96 } 97} 98 99// --------------------------------------------------------------------------- 100// Origin 101// --------------------------------------------------------------------------- 102 103/// A URL origin (scheme, host, port). 
104#[derive(Debug, Clone, PartialEq, Eq)] 105pub enum Origin { 106 /// A tuple origin (scheme, host, port). 107 Tuple(String, Host, Option<u16>), 108 /// An opaque origin (unique, not equal to anything). 109 Opaque, 110} 111 112impl Origin { 113 /// Check whether two origins are the same origin per the HTML spec. 114 /// 115 /// Two tuple origins are same-origin iff their schemes, hosts, and ports 116 /// are identical after normalizing default ports (http→80, https→443, etc.). 117 /// Opaque origins are never same-origin, even with themselves. 118 pub fn same_origin(&self, other: &Origin) -> bool { 119 match (self, other) { 120 (Origin::Tuple(scheme_a, host_a, port_a), Origin::Tuple(scheme_b, host_b, port_b)) => { 121 let effective_port_a = port_a.or_else(|| default_port(scheme_a)); 122 let effective_port_b = port_b.or_else(|| default_port(scheme_b)); 123 scheme_a == scheme_b && host_a == host_b && effective_port_a == effective_port_b 124 } 125 _ => false, 126 } 127 } 128 129 /// Serialize this origin to a string (e.g. `"https://example.com"`). 130 /// 131 /// Opaque origins serialize to `"null"`. 132 pub fn serialize(&self) -> String { 133 match self { 134 Origin::Opaque => "null".to_string(), 135 Origin::Tuple(scheme, host, port) => { 136 let mut s = String::new(); 137 s.push_str(scheme); 138 s.push_str("://"); 139 s.push_str(&host.serialize()); 140 if let Some(p) = port { 141 if default_port(scheme) != Some(*p) { 142 s.push(':'); 143 s.push_str(&p.to_string()); 144 } 145 } 146 s 147 } 148 } 149 } 150} 151 152impl fmt::Display for Origin { 153 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 154 write!(f, "{}", self.serialize()) 155 } 156} 157 158// --------------------------------------------------------------------------- 159// URL record 160// --------------------------------------------------------------------------- 161 162/// A parsed URL record per the WHATWG URL Standard. 
163#[derive(Debug, Clone, PartialEq, Eq)] 164pub struct Url { 165 /// The scheme (e.g., "http", "https", "file"). 166 pub scheme: String, 167 /// The username (percent-encoded). 168 username: String, 169 /// The password (percent-encoded). 170 password: String, 171 /// The host. 172 pub host: Option<Host>, 173 /// The port (None = default or absent). 174 pub port: Option<u16>, 175 /// Path segments. For non-opaque paths, these are the segments. 176 /// For opaque paths (cannot-be-a-base URL), this is a single element. 177 path: Vec<String>, 178 /// Whether this URL has an opaque path (cannot-be-a-base URL). 179 opaque_path: bool, 180 /// The query string (without leading '?'). 181 pub query: Option<String>, 182 /// The fragment (without leading '#'). 183 pub fragment: Option<String>, 184} 185 186impl Url { 187 /// Parse a URL string. 188 pub fn parse(input: &str) -> Result<Self> { 189 parse_url(input, None) 190 } 191 192 /// Parse a URL string with a base URL for resolving relative references. 193 pub fn parse_with_base(input: &str, base: &Url) -> Result<Self> { 194 parse_url(input, Some(base)) 195 } 196 197 /// Get the scheme. 198 pub fn scheme(&self) -> &str { 199 &self.scheme 200 } 201 202 /// Get the username (percent-encoded). 203 pub fn username(&self) -> &str { 204 &self.username 205 } 206 207 /// Get the password (percent-encoded). 208 pub fn password(&self) -> &str { 209 &self.password 210 } 211 212 /// Get the host. 213 pub fn host(&self) -> Option<&Host> { 214 self.host.as_ref() 215 } 216 217 /// Get the host as a string. 218 pub fn host_str(&self) -> Option<String> { 219 self.host.as_ref().map(|h| h.serialize()) 220 } 221 222 /// Get the port. 223 pub fn port(&self) -> Option<u16> { 224 self.port 225 } 226 227 /// Get the port or the default port for the scheme. 228 pub fn port_or_default(&self) -> Option<u16> { 229 self.port.or_else(|| default_port(&self.scheme)) 230 } 231 232 /// Get the path as a string. 
233 pub fn path(&self) -> String { 234 if self.opaque_path { 235 self.path.first().cloned().unwrap_or_default() 236 } else { 237 let mut s = String::new(); 238 for seg in &self.path { 239 s.push('/'); 240 s.push_str(seg); 241 } 242 if s.is_empty() { 243 s.push('/'); 244 } 245 s 246 } 247 } 248 249 /// Get the path segments. 250 pub fn path_segments(&self) -> &[String] { 251 &self.path 252 } 253 254 /// Get the query string. 255 pub fn query(&self) -> Option<&str> { 256 self.query.as_deref() 257 } 258 259 /// Get the fragment. 260 pub fn fragment(&self) -> Option<&str> { 261 self.fragment.as_deref() 262 } 263 264 /// Whether this URL has an opaque path (cannot-be-a-base). 265 pub fn cannot_be_a_base(&self) -> bool { 266 self.opaque_path 267 } 268 269 /// Whether this URL includes credentials. 270 pub fn has_credentials(&self) -> bool { 271 !self.username.is_empty() || !self.password.is_empty() 272 } 273 274 /// Derive the origin of this URL. 275 pub fn origin(&self) -> Origin { 276 match self.scheme.as_str() { 277 "http" | "https" | "ws" | "wss" | "ftp" => { 278 if let Some(host) = &self.host { 279 Origin::Tuple(self.scheme.clone(), host.clone(), self.port) 280 } else { 281 Origin::Opaque 282 } 283 } 284 _ => Origin::Opaque, 285 } 286 } 287 288 /// Serialize this URL to a string (the href). 
289 pub fn serialize(&self) -> String { 290 let mut output = String::new(); 291 output.push_str(&self.scheme); 292 output.push(':'); 293 294 if self.host.is_some() { 295 output.push_str("//"); 296 if self.has_credentials() { 297 output.push_str(&self.username); 298 if !self.password.is_empty() { 299 output.push(':'); 300 output.push_str(&self.password); 301 } 302 output.push('@'); 303 } 304 if let Some(ref host) = self.host { 305 output.push_str(&host.serialize()); 306 } 307 if let Some(port) = self.port { 308 output.push(':'); 309 output.push_str(&port.to_string()); 310 } 311 } else if !self.opaque_path && self.scheme == "file" { 312 output.push_str("//"); 313 } 314 315 output.push_str(&self.path()); 316 317 if let Some(ref query) = self.query { 318 output.push('?'); 319 output.push_str(query); 320 } 321 if let Some(ref fragment) = self.fragment { 322 output.push('#'); 323 output.push_str(fragment); 324 } 325 326 output 327 } 328} 329 330impl fmt::Display for Url { 331 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 332 write!(f, "{}", self.serialize()) 333 } 334} 335 336// --------------------------------------------------------------------------- 337// Special schemes 338// --------------------------------------------------------------------------- 339 340/// Whether a scheme is "special" per the URL standard. 341fn is_special_scheme(scheme: &str) -> bool { 342 matches!(scheme, "http" | "https" | "ftp" | "ws" | "wss" | "file") 343} 344 345/// Default port for a special scheme. 346fn default_port(scheme: &str) -> Option<u16> { 347 match scheme { 348 "http" | "ws" => Some(80), 349 "https" | "wss" => Some(443), 350 "ftp" => Some(21), 351 _ => None, 352 } 353} 354 355// --------------------------------------------------------------------------- 356// Percent encoding / decoding 357// --------------------------------------------------------------------------- 358 359/// The C0 control percent-encode set. 
/// The C0 control percent-encode set.
///
/// Matches U+0000..=U+001F and every code point above U+007E, so DEL and all
/// non-ASCII characters are included.
fn is_c0_control(c: char) -> bool {
    matches!(c, '\u{0000}'..='\u{001F}') || c > '\u{007E}'
}

/// The fragment percent-encode set.
fn is_fragment_encode(c: char) -> bool {
    is_c0_control(c) || matches!(c, ' ' | '"' | '<' | '>' | '`')
}

/// The query percent-encode set.
fn is_query_encode(c: char) -> bool {
    is_c0_control(c) || matches!(c, ' ' | '"' | '#' | '<' | '>')
}

/// The special query percent-encode set.
fn is_special_query_encode(c: char) -> bool {
    is_query_encode(c) || c == '\''
}

/// The path percent-encode set.
fn is_path_encode(c: char) -> bool {
    is_query_encode(c) || matches!(c, '?' | '`' | '{' | '}')
}

/// The userinfo percent-encode set.
fn is_userinfo_encode(c: char) -> bool {
    is_path_encode(c)
        || matches!(
            c,
            '/' | ':' | ';' | '=' | '@' | '[' | '\\' | ']' | '^' | '|'
        )
}

/// Percent-encode a string using the given encode set predicate.
///
/// Characters for which `should_encode` returns `true` are replaced by the
/// `%XX` form (uppercase hex) of each of their UTF-8 bytes; all other
/// characters, including `%` itself unless the predicate selects it, are
/// copied through unchanged.
fn percent_encode(input: &str, should_encode: fn(char) -> bool) -> String {
    let mut out = String::with_capacity(input.len());
    // Scratch space for one character's UTF-8 bytes; avoids allocating a
    // temporary `String` for every encoded character.
    let mut utf8 = [0u8; 4];
    for c in input.chars() {
        if should_encode(c) {
            for &byte in c.encode_utf8(&mut utf8).as_bytes() {
                out.push('%');
                out.push(to_hex_upper(byte >> 4));
                out.push(to_hex_upper(byte & 0x0F));
            }
        } else {
            out.push(c);
        }
    }
    out
}

/// Map a nibble (0..=15) to its uppercase hexadecimal digit.
fn to_hex_upper(n: u8) -> char {
    if n < 10 {
        char::from(b'0' + n)
    } else {
        char::from(b'A' + n - 10)
    }
}

425pub fn percent_decode(input: &str) -> Vec<u8> { 426 let bytes = input.as_bytes(); 427 let mut out = Vec::with_capacity(bytes.len()); 428 let mut i = 0; 429 while i < bytes.len() { 430 if bytes[i] == b'%' && i + 2 < bytes.len() { 431 if let (Some(hi), Some(lo)) = (hex_val(bytes[i + 1]), hex_val(bytes[i + 2])) { 432 out.push(hi << 4 | lo); 433 i += 3; 434 continue; 435 } 436 } 437 out.push(bytes[i]); 438 i += 1; 439 } 440 out 441} 442 443/// Percent-decode to a UTF-8 string (lossy). 444pub fn percent_decode_string(input: &str) -> String { 445 String::from_utf8_lossy(&percent_decode(input)).into_owned() 446} 447 448fn hex_val(b: u8) -> Option<u8> { 449 match b { 450 b'0'..=b'9' => Some(b - b'0'), 451 b'a'..=b'f' => Some(b - b'a' + 10), 452 b'A'..=b'F' => Some(b - b'A' + 10), 453 _ => None, 454 } 455} 456 457// --------------------------------------------------------------------------- 458// IPv4 parsing 459// --------------------------------------------------------------------------- 460 461fn parse_ipv4(input: &str) -> Result<u32> { 462 let parts: Vec<&str> = input.split('.').collect(); 463 if parts.len() < 2 || parts.len() > 4 { 464 return Err(UrlError::InvalidIpv4); 465 } 466 let mut numbers: Vec<u64> = Vec::with_capacity(parts.len()); 467 for part in &parts { 468 if part.is_empty() { 469 return Err(UrlError::InvalidIpv4); 470 } 471 let n = parse_ipv4_number(part)?; 472 numbers.push(n); 473 } 474 let last = numbers.len() - 1; 475 for (i, &n) in numbers.iter().enumerate() { 476 if i < last && n > 255 { 477 return Err(UrlError::InvalidIpv4); 478 } 479 } 480 if numbers[last] >= 256u64.pow((4 - last) as u32) { 481 return Err(UrlError::InvalidIpv4); 482 } 483 484 let mut ipv4 = numbers[last] as u32; 485 for (i, &n) in numbers.iter().enumerate().take(last) { 486 ipv4 += (n as u32) << (8 * (3 - i)); 487 } 488 Ok(ipv4) 489} 490 491fn parse_ipv4_number(input: &str) -> Result<u64> { 492 if input.is_empty() { 493 return Err(UrlError::InvalidIpv4); 494 } 495 let (s, radix) 
= if input.starts_with("0x") || input.starts_with("0X") { 496 (&input[2..], 16) 497 } else if input.len() > 1 && input.starts_with('0') { 498 (&input[1..], 8) 499 } else { 500 (input, 10) 501 }; 502 if s.is_empty() { 503 return Ok(0); 504 } 505 u64::from_str_radix(s, radix).map_err(|_| UrlError::InvalidIpv4) 506} 507 508fn serialize_ipv4(addr: u32) -> String { 509 format!( 510 "{}.{}.{}.{}", 511 (addr >> 24) & 0xFF, 512 (addr >> 16) & 0xFF, 513 (addr >> 8) & 0xFF, 514 addr & 0xFF 515 ) 516} 517 518// --------------------------------------------------------------------------- 519// IPv6 parsing 520// --------------------------------------------------------------------------- 521 522fn parse_ipv6(input: &str) -> Result<[u16; 8]> { 523 let mut pieces = [0u16; 8]; 524 let mut piece_index: usize = 0; 525 let mut compress: Option<usize> = None; 526 let chars: Vec<char> = input.chars().collect(); 527 let len = chars.len(); 528 let mut pointer = 0; 529 530 if pointer < len && chars[pointer] == ':' { 531 if pointer + 1 >= len || chars[pointer + 1] != ':' { 532 return Err(UrlError::InvalidIpv6); 533 } 534 pointer += 2; 535 piece_index += 1; 536 compress = Some(piece_index); 537 } 538 539 while pointer < len { 540 if piece_index >= 8 { 541 return Err(UrlError::InvalidIpv6); 542 } 543 544 if chars[pointer] == ':' { 545 if compress.is_some() { 546 return Err(UrlError::InvalidIpv6); 547 } 548 pointer += 1; 549 piece_index += 1; 550 compress = Some(piece_index); 551 continue; 552 } 553 554 let mut value: u16 = 0; 555 let mut length = 0; 556 while length < 4 && pointer < len && chars[pointer].is_ascii_hexdigit() { 557 value = value * 0x10 + hex_val(chars[pointer] as u8).unwrap() as u16; 558 pointer += 1; 559 length += 1; 560 } 561 562 if pointer < len && chars[pointer] == '.' { 563 // IPv4-mapped IPv6. 
564 if length == 0 { 565 return Err(UrlError::InvalidIpv6); 566 } 567 pointer -= length; 568 if piece_index > 6 { 569 return Err(UrlError::InvalidIpv6); 570 } 571 let mut numbers_seen = 0; 572 while pointer < len { 573 let mut ipv4_piece: Option<u16> = None; 574 if numbers_seen > 0 { 575 if chars[pointer] == '.' && numbers_seen < 4 { 576 pointer += 1; 577 } else { 578 return Err(UrlError::InvalidIpv6); 579 } 580 } 581 if pointer >= len || !chars[pointer].is_ascii_digit() { 582 return Err(UrlError::InvalidIpv6); 583 } 584 while pointer < len && chars[pointer].is_ascii_digit() { 585 let number = (chars[pointer] as u8 - b'0') as u16; 586 match ipv4_piece { 587 None => ipv4_piece = Some(number), 588 Some(0) => return Err(UrlError::InvalidIpv6), // leading zero 589 Some(v) => ipv4_piece = Some(v * 10 + number), 590 } 591 if ipv4_piece.unwrap_or(0) > 255 { 592 return Err(UrlError::InvalidIpv6); 593 } 594 pointer += 1; 595 } 596 pieces[piece_index] = 597 pieces[piece_index] * 0x100 + ipv4_piece.ok_or(UrlError::InvalidIpv6)?; 598 numbers_seen += 1; 599 if numbers_seen == 2 || numbers_seen == 4 { 600 piece_index += 1; 601 } 602 } 603 if numbers_seen != 4 { 604 return Err(UrlError::InvalidIpv6); 605 } 606 break; 607 } 608 609 if pointer < len && chars[pointer] == ':' { 610 pointer += 1; 611 if pointer >= len { 612 // Trailing single colon after a piece — only valid with compress. 
613 } 614 } else if pointer < len { 615 return Err(UrlError::InvalidIpv6); 616 } 617 618 if piece_index >= 8 { 619 return Err(UrlError::InvalidIpv6); 620 } 621 pieces[piece_index] = value; 622 piece_index += 1; 623 } 624 625 if let Some(comp) = compress { 626 let mut swaps = piece_index - comp; 627 piece_index = 7; 628 while piece_index != 0 && swaps > 0 { 629 let swap_index = comp + swaps - 1; 630 pieces.swap(piece_index, swap_index); 631 piece_index -= 1; 632 swaps -= 1; 633 } 634 } else if piece_index != 8 { 635 return Err(UrlError::InvalidIpv6); 636 } 637 638 Ok(pieces) 639} 640 641fn serialize_ipv6(pieces: &[u16; 8]) -> String { 642 // Find the longest run of consecutive zeros for :: compression. 643 let mut best_start = None; 644 let mut best_len = 0usize; 645 let mut cur_start = None; 646 let mut cur_len = 0usize; 647 648 for (i, &p) in pieces.iter().enumerate() { 649 if p == 0 { 650 if cur_start.is_none() { 651 cur_start = Some(i); 652 cur_len = 1; 653 } else { 654 cur_len += 1; 655 } 656 } else { 657 if cur_len > best_len && cur_len >= 2 { 658 best_start = cur_start; 659 best_len = cur_len; 660 } 661 cur_start = None; 662 cur_len = 0; 663 } 664 } 665 if cur_len > best_len && cur_len >= 2 { 666 best_start = cur_start; 667 best_len = cur_len; 668 } 669 670 let mut out = String::new(); 671 let mut i = 0; 672 while i < 8 { 673 if Some(i) == best_start { 674 out.push_str("::"); 675 i += best_len; 676 continue; 677 } 678 if !out.is_empty() && !out.ends_with(':') { 679 out.push(':'); 680 } 681 out.push_str(&format!("{:x}", pieces[i])); 682 i += 1; 683 } 684 out 685} 686 687// --------------------------------------------------------------------------- 688// Host parsing 689// --------------------------------------------------------------------------- 690 691fn parse_host(input: &str, is_special: bool) -> Result<Host> { 692 if input.is_empty() { 693 if is_special { 694 return Err(UrlError::InvalidHost); 695 } 696 return Ok(Host::Domain(String::new())); 697 } 698 
699 // IPv6 700 if input.starts_with('[') { 701 if !input.ends_with(']') { 702 return Err(UrlError::InvalidIpv6); 703 } 704 let inner = &input[1..input.len() - 1]; 705 let pieces = parse_ipv6(inner)?; 706 return Ok(Host::Ipv6(pieces)); 707 } 708 709 if !is_special { 710 let encoded = percent_encode(input, is_c0_control); 711 return Ok(Host::Domain(encoded)); 712 } 713 714 // Domain — percent-decode then lowercase. 715 let decoded = percent_decode_string(input); 716 let lowered = decoded.to_ascii_lowercase(); 717 718 // Check if it's an IPv4 address. 719 if ends_with_number(&lowered) { 720 match parse_ipv4(&lowered) { 721 Ok(addr) => return Ok(Host::Ipv4(addr)), 722 Err(_) => return Err(UrlError::InvalidHost), 723 } 724 } 725 726 // Validate domain characters. 727 for c in lowered.chars() { 728 if c == '\0' 729 || c == '\t' 730 || c == '\n' 731 || c == '\r' 732 || c == ' ' 733 || c == '#' 734 || c == '/' 735 || c == ':' 736 || c == '<' 737 || c == '>' 738 || c == '?' 739 || c == '@' 740 || c == '[' 741 || c == '\\' 742 || c == ']' 743 || c == '^' 744 || c == '|' 745 { 746 return Err(UrlError::InvalidHost); 747 } 748 } 749 750 Ok(Host::Domain(lowered)) 751} 752 753/// Check if a domain string ends with a number (suggesting IPv4). 
754fn ends_with_number(input: &str) -> bool { 755 let last_part = match input.rsplit('.').next() { 756 Some(p) => p, 757 None => return false, 758 }; 759 if last_part.is_empty() { 760 return false; 761 } 762 if last_part.starts_with("0x") || last_part.starts_with("0X") { 763 return last_part[2..].chars().all(|c| c.is_ascii_hexdigit()); 764 } 765 last_part.chars().all(|c| c.is_ascii_digit()) 766} 767 768// --------------------------------------------------------------------------- 769// Shorten path helper 770// --------------------------------------------------------------------------- 771 772fn shorten_path(scheme: &str, path: &mut Vec<String>) { 773 if scheme == "file" && path.len() == 1 { 774 if let Some(first) = path.first() { 775 if is_normalized_windows_drive_letter(first) { 776 return; 777 } 778 } 779 } 780 path.pop(); 781} 782 783fn is_normalized_windows_drive_letter(s: &str) -> bool { 784 let bytes = s.as_bytes(); 785 bytes.len() == 2 && bytes[0].is_ascii_alphabetic() && bytes[1] == b':' 786} 787 788fn starts_with_windows_drive_letter(s: &str) -> bool { 789 let bytes = s.as_bytes(); 790 if bytes.len() < 2 { 791 return false; 792 } 793 if !bytes[0].is_ascii_alphabetic() { 794 return false; 795 } 796 if bytes[1] != b':' && bytes[1] != b'|' { 797 return false; 798 } 799 if bytes.len() >= 3 { 800 matches!(bytes[2], b'/' | b'\\' | b'?' | b'#') 801 } else { 802 true 803 } 804} 805 806// --------------------------------------------------------------------------- 807// URL parser 808// --------------------------------------------------------------------------- 809 810fn parse_url(input: &str, base: Option<&Url>) -> Result<Url> { 811 // Strip leading/trailing C0 controls and spaces. 812 let input = input.trim_matches(|c: char| c <= '\u{0020}'); 813 814 if input.is_empty() { 815 if let Some(base) = base { 816 return parse_relative("", base); 817 } 818 return Err(UrlError::EmptyInput); 819 } 820 821 // Remove tab and newline characters. 
822 let input: String = input 823 .chars() 824 .filter(|&c| c != '\t' && c != '\n' && c != '\r') 825 .collect(); 826 827 let chars: Vec<char> = input.chars().collect(); 828 let len = chars.len(); 829 830 let mut pointer = 0; 831 832 // Try to parse a scheme. 833 let mut scheme = String::new(); 834 let mut has_scheme = false; 835 836 if pointer < len && chars[pointer].is_ascii_alphabetic() { 837 let mut temp = String::new(); 838 temp.push(chars[pointer].to_ascii_lowercase()); 839 let mut p = pointer + 1; 840 while p < len 841 && (chars[p].is_ascii_alphanumeric() 842 || chars[p] == '+' 843 || chars[p] == '-' 844 || chars[p] == '.') 845 { 846 temp.push(chars[p].to_ascii_lowercase()); 847 p += 1; 848 } 849 if p < len && chars[p] == ':' { 850 scheme = temp; 851 has_scheme = true; 852 pointer = p + 1; // skip the ':' 853 } 854 } 855 856 if !has_scheme { 857 if let Some(base) = base { 858 return parse_relative(&input, base); 859 } 860 return Err(UrlError::MissingScheme); 861 } 862 863 let is_special = is_special_scheme(&scheme); 864 865 let mut url = Url { 866 scheme: scheme.clone(), 867 username: String::new(), 868 password: String::new(), 869 host: None, 870 port: None, 871 path: Vec::new(), 872 opaque_path: false, 873 query: None, 874 fragment: None, 875 }; 876 877 let remaining: String = chars[pointer..].iter().collect(); 878 879 if scheme == "file" { 880 return parse_file_url(&remaining, base, url); 881 } 882 883 if let Some(after_slashes) = remaining.strip_prefix("//") { 884 parse_authority_and_path(&mut url, after_slashes, is_special)?; 885 } else if is_special { 886 if let Some(base) = base { 887 if base.scheme == url.scheme { 888 return parse_relative_special(&remaining, base, url); 889 } 890 } 891 if let Some(after_slash) = remaining.strip_prefix('/') { 892 parse_authority_and_path(&mut url, after_slash, is_special)?; 893 } else { 894 parse_authority_and_path(&mut url, &remaining, is_special)?; 895 } 896 } else { 897 parse_opaque_or_path(&mut url, &remaining)?; 
898 } 899 900 Ok(url) 901} 902 903fn parse_authority_and_path(url: &mut Url, input: &str, is_special: bool) -> Result<()> { 904 let authority_end = input 905 .find(|c: char| c == '/' || c == '?' || c == '#' || (is_special && c == '\\')) 906 .unwrap_or(input.len()); 907 908 let authority = &input[..authority_end]; 909 let rest = &input[authority_end..]; 910 911 let (userinfo_part, hostport) = if let Some(at_pos) = authority.rfind('@') { 912 (&authority[..at_pos], &authority[at_pos + 1..]) 913 } else { 914 ("", authority) 915 }; 916 917 if !userinfo_part.is_empty() { 918 if let Some(colon_pos) = userinfo_part.find(':') { 919 url.username = percent_encode(&userinfo_part[..colon_pos], is_userinfo_encode); 920 url.password = percent_encode(&userinfo_part[colon_pos + 1..], is_userinfo_encode); 921 } else { 922 url.username = percent_encode(userinfo_part, is_userinfo_encode); 923 } 924 } 925 926 let (host_str, port_str) = split_host_port(hostport); 927 928 url.host = Some(parse_host(host_str, is_special)?); 929 930 if let Some(port_s) = port_str { 931 if !port_s.is_empty() { 932 let port: u16 = port_s.parse().map_err(|_| UrlError::InvalidPort)?; 933 if default_port(&url.scheme) != Some(port) { 934 url.port = Some(port); 935 } 936 } 937 } 938 939 parse_path_query_fragment(url, rest, is_special) 940} 941 942fn split_host_port(input: &str) -> (&str, Option<&str>) { 943 if input.starts_with('[') { 944 if let Some(bracket_end) = input.find(']') { 945 let host = &input[..bracket_end + 1]; 946 let after = &input[bracket_end + 1..]; 947 if let Some(port_str) = after.strip_prefix(':') { 948 return (host, Some(port_str)); 949 } 950 return (host, None); 951 } 952 return (input, None); 953 } 954 955 if let Some(colon_pos) = input.rfind(':') { 956 let port_part = &input[colon_pos + 1..]; 957 if port_part.is_empty() || port_part.chars().all(|c| c.is_ascii_digit()) { 958 return (&input[..colon_pos], Some(port_part)); 959 } 960 } 961 (input, None) 962} 963 964fn 
parse_path_query_fragment(url: &mut Url, input: &str, is_special: bool) -> Result<()> { 965 let mut remaining = input; 966 967 let path_end = remaining.find(['?', '#']).unwrap_or(remaining.len()); 968 let path_str = &remaining[..path_end]; 969 remaining = &remaining[path_end..]; 970 971 parse_path_into(url, path_str, is_special); 972 973 if let Some(after_q) = remaining.strip_prefix('?') { 974 remaining = after_q; 975 let query_end = remaining.find('#').unwrap_or(remaining.len()); 976 let query_str = &remaining[..query_end]; 977 remaining = &remaining[query_end..]; 978 979 let encode_fn = if is_special { 980 is_special_query_encode 981 } else { 982 is_query_encode 983 }; 984 url.query = Some(percent_encode(query_str, encode_fn)); 985 } 986 987 if let Some(after_hash) = remaining.strip_prefix('#') { 988 url.fragment = Some(percent_encode(after_hash, is_fragment_encode)); 989 } 990 991 Ok(()) 992} 993 994fn parse_path_into(url: &mut Url, path: &str, is_special: bool) { 995 if path.is_empty() { 996 if is_special { 997 url.path = vec![String::new()]; 998 } 999 return; 1000 } 1001 1002 let segments: Vec<&str> = if is_special { 1003 path.split(['/', '\\']).collect() 1004 } else { 1005 path.split('/').collect() 1006 }; 1007 1008 for (i, seg) in segments.iter().enumerate() { 1009 if i == 0 && seg.is_empty() { 1010 continue; 1011 } 1012 1013 let decoded = *seg; 1014 if decoded == "." || decoded.eq_ignore_ascii_case("%2e") { 1015 if i == segments.len() - 1 { 1016 url.path.push(String::new()); 1017 } 1018 } else if decoded == ".." 
1019 || decoded.eq_ignore_ascii_case(".%2e") 1020 || decoded.eq_ignore_ascii_case("%2e.") 1021 || decoded.eq_ignore_ascii_case("%2e%2e") 1022 { 1023 shorten_path(&url.scheme, &mut url.path); 1024 if i == segments.len() - 1 { 1025 url.path.push(String::new()); 1026 } 1027 } else { 1028 url.path.push(percent_encode(decoded, is_path_encode)); 1029 } 1030 } 1031} 1032 1033fn parse_opaque_or_path(url: &mut Url, input: &str) -> Result<()> { 1034 let mut remaining = input; 1035 1036 let path_end = remaining.find(['?', '#']).unwrap_or(remaining.len()); 1037 let path_str = &remaining[..path_end]; 1038 remaining = &remaining[path_end..]; 1039 1040 if path_str.starts_with('/') { 1041 url.opaque_path = false; 1042 parse_path_into(url, path_str, false); 1043 } else { 1044 url.opaque_path = true; 1045 url.path = vec![percent_encode(path_str, is_c0_control)]; 1046 } 1047 1048 if let Some(after_q) = remaining.strip_prefix('?') { 1049 remaining = after_q; 1050 let query_end = remaining.find('#').unwrap_or(remaining.len()); 1051 let query_str = &remaining[..query_end]; 1052 remaining = &remaining[query_end..]; 1053 url.query = Some(percent_encode(query_str, is_query_encode)); 1054 } 1055 1056 if let Some(after_hash) = remaining.strip_prefix('#') { 1057 url.fragment = Some(percent_encode(after_hash, is_fragment_encode)); 1058 } 1059 1060 Ok(()) 1061} 1062 1063// --------------------------------------------------------------------------- 1064// Relative URL parsing 1065// --------------------------------------------------------------------------- 1066 1067fn parse_relative(input: &str, base: &Url) -> Result<Url> { 1068 let mut url = Url { 1069 scheme: base.scheme.clone(), 1070 username: base.username.clone(), 1071 password: base.password.clone(), 1072 host: base.host.clone(), 1073 port: base.port, 1074 path: base.path.clone(), 1075 opaque_path: base.opaque_path, 1076 query: base.query.clone(), 1077 fragment: None, 1078 }; 1079 1080 let is_special = is_special_scheme(&url.scheme); 1081 
1082 if input.is_empty() { 1083 return Ok(url); 1084 } 1085 1086 let chars: Vec<char> = input.chars().collect(); 1087 1088 if chars[0] == '/' || (is_special && chars[0] == '\\') { 1089 if input.starts_with("//") || (is_special && input.starts_with("\\/")) { 1090 let after_slashes = &input[2..]; 1091 url.username = String::new(); 1092 url.password = String::new(); 1093 url.path = Vec::new(); 1094 url.query = None; 1095 parse_authority_and_path(&mut url, after_slashes, is_special)?; 1096 return Ok(url); 1097 } 1098 url.path = Vec::new(); 1099 url.query = None; 1100 parse_path_query_fragment(&mut url, input, is_special)?; 1101 return Ok(url); 1102 } 1103 1104 if let Some(after_q) = input.strip_prefix('?') { 1105 url.query = None; 1106 url.fragment = None; 1107 let query_end = after_q.find('#').unwrap_or(after_q.len()); 1108 let query_str = &after_q[..query_end]; 1109 let after = &after_q[query_end..]; 1110 1111 let encode_fn = if is_special { 1112 is_special_query_encode 1113 } else { 1114 is_query_encode 1115 }; 1116 url.query = Some(percent_encode(query_str, encode_fn)); 1117 1118 if let Some(frag) = after.strip_prefix('#') { 1119 url.fragment = Some(percent_encode(frag, is_fragment_encode)); 1120 } 1121 return Ok(url); 1122 } 1123 1124 if let Some(frag) = input.strip_prefix('#') { 1125 url.fragment = Some(percent_encode(frag, is_fragment_encode)); 1126 return Ok(url); 1127 } 1128 1129 // Path-relative. 
if !url.opaque_path {
        shorten_path(&url.scheme, &mut url.path);
    }
    url.query = None;
    url.fragment = None;

    parse_path_query_fragment(&mut url, &format!("/{input}"), is_special)?;
    Ok(url)
}

/// Resolves `remaining` against `base` using the special-scheme rules.
///
/// `url` first receives `base`'s username, password, host, port, path and
/// query; `remaining` then decides which of those are replaced:
///
/// - empty: nothing is replaced, the copied components are returned as-is;
/// - leading `/` or `\`: path and query are cleared and `remaining` is parsed
///   as a fresh path / query / fragment;
/// - leading `?`: the query is replaced, and the fragment is set from whatever
///   follows the first `#` (or cleared when there is none);
/// - leading `#`: only the fragment is set;
/// - anything else: the copied path goes through `shorten_path`, the query is
///   cleared, and `/{remaining}` is parsed on top of what is left.
///
/// # Errors
///
/// Propagates any error returned by `parse_path_query_fragment`.
fn parse_relative_special(remaining: &str, base: &Url, mut url: Url) -> Result<Url> {
    url.username = base.username.clone();
    url.password = base.password.clone();
    url.host = base.host.clone();
    url.port = base.port;
    url.path = base.path.clone();
    url.query = base.query.clone();

    // This resolver is only for special schemes, so the flag is fixed.
    let is_special = true;

    if remaining.is_empty() {
        return Ok(url);
    }

    // NOTE(review): a second slash would introduce an authority per the URL
    // Standard; presumably the caller has already handled that case — confirm.
    if remaining.starts_with('/') || remaining.starts_with('\\') {
        url.path = Vec::new();
        url.query = None;
        parse_path_query_fragment(&mut url, remaining, is_special)?;
        return Ok(url);
    }

    if let Some(rest) = remaining.strip_prefix('?') {
        // Everything before the first `#` is the query, everything after it
        // is the fragment.
        let (query, fragment) = match rest.split_once('#') {
            Some((query, fragment)) => (query, Some(fragment)),
            None => (rest, None),
        };
        url.query = Some(percent_encode(query, is_special_query_encode));
        url.fragment = fragment.map(|f| percent_encode(f, is_fragment_encode));
        return Ok(url);
    }

    if let Some(frag) = remaining.strip_prefix('#') {
        url.fragment = Some(percent_encode(frag, is_fragment_encode));
        return Ok(url);
    }

    // Path-relative reference.
    shorten_path(&url.scheme, &mut url.path);
    url.query = None;
    parse_path_query_fragment(&mut url, &format!("/{remaining}"), is_special)?;
    Ok(url)
}

// ---------------------------------------------------------------------------
// File URL parsing
// ---------------------------------------------------------------------------

/// Parses what follows the `file:` scheme into `url`.
///
/// `input` is the text after the scheme (including any leading slashes),
/// `base` is the optional base URL, and `url` is the record being filled in.
/// The host defaults to the empty domain. The shape of `input` selects the
/// branch:
///
/// - `//…`: an authority follows. A leading Windows drive letter means there
///   is no host and the whole remainder is the path; otherwise the text up to
///   the first `/`, `\`, `?` or `#` is parsed as the host (`localhost` and the
///   empty domain both leave the host empty) and the rest as
///   path / query / fragment.
/// - `/…`: no authority. The whole input is an absolute path; the host is
///   inherited from `base` when `base` is itself a `file` URL.
/// - no leading slash, `file` base: relative reference. Host, path and query
///   start as the base's; an empty input keeps them, `?…` replaces the query
///   (and fragment), `#…` sets only the fragment, anything else is resolved
///   against the shortened base path.
/// - no leading slash, no `file` base: the input is a path (or a bare
///   `?query` / `#fragment`), never a host.
///
/// Except on the relative-reference branch, a leading `X|` path segment is
/// rewritten to `X:` before returning.
///
/// # Errors
///
/// Propagates errors from `parse_host` and `parse_path_query_fragment`.
fn parse_file_url(input: &str, base: Option<&Url>, mut url: Url) -> Result<Url> {
    url.host = Some(Host::Domain(String::new()));

    // Only a `file` base contributes anything to a `file` URL.
    let file_base = base.filter(|b| b.scheme == "file");

    // NOTE(review): `file` is a special scheme in the URL Standard, yet this
    // function has always passed `false` to `parse_path_query_fragment` and
    // used `is_query_encode`; that is kept as-is here — confirm it is intended.
    if let Some(after_slashes) = input.strip_prefix("//") {
        if starts_with_windows_drive_letter(after_slashes) {
            // `file://C:/dir`: the drive letter belongs to the path, not the host.
            parse_path_query_fragment(&mut url, &format!("/{after_slashes}"), false)?;
        } else {
            let path_start = after_slashes
                .find(['/', '\\', '?', '#'])
                .unwrap_or(after_slashes.len());
            let (potential_host, rest) = after_slashes.split_at(path_start);

            if !potential_host.is_empty() {
                let host = parse_host(potential_host, false)?;
                // The URL Standard maps `localhost` to the empty host for file URLs.
                let keeps_empty_host =
                    matches!(&host, Host::Domain(d) if matches!(d.as_str(), "" | "localhost"));
                if !keeps_empty_host {
                    url.host = Some(host);
                }
            }

            parse_path_query_fragment(&mut url, rest, false)?;
        }
    } else if input.starts_with('/') {
        // A single slash starts an absolute path; there is no authority, so
        // the first segment must not be mistaken for a host.
        // NOTE(review): the spec also carries over the base's drive letter
        // here; that quirk is not implemented.
        if let Some(base) = file_base {
            url.host = base.host.clone();
        }
        parse_path_query_fragment(&mut url, input, false)?;
    } else if let Some(base) = file_base {
        url.host = base.host.clone();
        url.path = base.path.clone();
        // Keep the base query so that empty and fragment-only references
        // preserve it, matching `parse_relative_special`.
        url.query = base.query.clone();

        if input.is_empty() {
            return Ok(url);
        }

        if let Some(rest) = input.strip_prefix('?') {
            let (query, fragment) = match rest.split_once('#') {
                Some((query, fragment)) => (query, Some(fragment)),
                None => (rest, None),
            };
            url.query = Some(percent_encode(query, is_query_encode));
            url.fragment = fragment.map(|f| percent_encode(f, is_fragment_encode));
            return Ok(url);
        }

        if let Some(frag) = input.strip_prefix('#') {
            url.fragment = Some(percent_encode(frag, is_fragment_encode));
            return Ok(url);
        }

        // Path-relative reference.
        shorten_path(&url.scheme, &mut url.path);
        url.query = None;
        parse_path_query_fragment(&mut url, &format!("/{input}"), false)?;
        return Ok(url);
    } else if input.is_empty() || input.starts_with(['\\', '?', '#']) {
        // Nothing, or something that already starts with a delimiter: hand it
        // over unchanged.
        parse_path_query_fragment(&mut url, input, false)?;
    } else {
        // `file:dir/name` without a base is a path, not `host/path`.
        parse_path_query_fragment(&mut url, &format!("/{input}"), false)?;
    }

    // Normalize Windows drive letters in path.
    if let Some(first) = url.path.first_mut() {
        if first.len() == 2 {
            let bytes = first.as_bytes();
            if bytes[0].is_ascii_alphabetic() && bytes[1] == b'|' {
                let mut normalized = String::new();
                normalized.push(bytes[0] as char);
                normalized.push(':');
                *first = normalized;
            }
        }
    }

    Ok(url)
}

// ---------------------------------------------------------------------------
// Tests
// ---------------------------------------------------------------------------

#[cfg(test)]
mod tests {
    use super::*;

    // -------------------------------------------------------------------
    // Basic absolute URL parsing
    // -------------------------------------------------------------------

    #[test]
    fn parse_simple_http() {
        let url = Url::parse("http://example.com").unwrap();
        assert_eq!(url.scheme(), "http");
        assert_eq!(url.host_str(), Some("example.com".into()));
        assert_eq!(url.port(), None);
        assert_eq!(url.path(), "/");
        assert_eq!(url.query(), None);
        assert_eq!(url.fragment(), None);
    }

    #[test]
    fn parse_https_with_path() {
        let url = Url::parse("https://example.com/foo/bar").unwrap();
        assert_eq!(url.scheme(), "https");
        assert_eq!(url.host_str(), Some("example.com".into()));
        assert_eq!(url.path(), "/foo/bar");
    }

    #[test]
    fn parse_full_url() {
        let url =
            Url::parse("https://user:pass@example.com:8080/path/to/page?q=1&r=2#frag").unwrap();
        assert_eq!(url.scheme(), "https");
        assert_eq!(url.username(), "user");
        assert_eq!(url.password(), "pass");
        assert_eq!(url.host_str(), Some("example.com".into()));
        assert_eq!(url.port(), Some(8080));
        assert_eq!(url.path(), "/path/to/page");
        assert_eq!(url.query(), Some("q=1&r=2"));
        assert_eq!(url.fragment(), Some("frag"));
    }

    #[test]
    fn parse_default_port_omitted() {
        let url
= Url::parse("http://example.com:80/").unwrap();
        assert_eq!(url.port(), None);
        assert_eq!(url.port_or_default(), Some(80));
    }

    #[test]
    fn parse_non_default_port() {
        let url = Url::parse("http://example.com:8080/").unwrap();
        assert_eq!(url.port(), Some(8080));
    }

    #[test]
    fn parse_https_default_port() {
        let url = Url::parse("https://example.com:443/").unwrap();
        assert_eq!(url.port(), None);
    }

    #[test]
    fn parse_ftp_default_port() {
        let url = Url::parse("ftp://files.example.com:21/readme.txt").unwrap();
        assert_eq!(url.port(), None);
        assert_eq!(url.port_or_default(), Some(21));
    }

    // -------------------------------------------------------------------
    // Scheme handling
    // -------------------------------------------------------------------

    #[test]
    fn scheme_is_lowercased() {
        let url = Url::parse("HTTP://EXAMPLE.COM").unwrap();
        assert_eq!(url.scheme(), "http");
    }

    #[test]
    fn non_special_scheme() {
        let url = Url::parse("custom://host/path").unwrap();
        assert_eq!(url.scheme(), "custom");
        assert_eq!(url.host_str(), Some("host".into()));
        assert_eq!(url.path(), "/path");
    }

    #[test]
    fn data_uri() {
        let url = Url::parse("data:text/html,<h1>Hello</h1>").unwrap();
        assert_eq!(url.scheme(), "data");
        assert!(url.cannot_be_a_base());
    }

    #[test]
    fn javascript_uri() {
        let url = Url::parse("javascript:alert(1)").unwrap();
        assert_eq!(url.scheme(), "javascript");
        assert!(url.cannot_be_a_base());
    }

    #[test]
    fn mailto_uri() {
        let url = Url::parse("mailto:user@example.com").unwrap();
        assert_eq!(url.scheme(), "mailto");
        assert!(url.cannot_be_a_base());
    }

    // -------------------------------------------------------------------
    // Host parsing
    // -------------------------------------------------------------------

    #[test]
    fn host_is_lowercased() {
        let url = Url::parse("http://EXAMPLE.COM/").unwrap();
        assert_eq!(url.host_str(), Some("example.com".into()));
    }

    #[test]
    fn ipv4_host() {
        let url = Url::parse("http://127.0.0.1/").unwrap();
        assert_eq!(url.host(), Some(&Host::Ipv4(0x7F000001)));
        assert_eq!(url.host_str(), Some("127.0.0.1".into()));
    }

    #[test]
    fn ipv4_host_all_zeros() {
        let url = Url::parse("http://0.0.0.0/").unwrap();
        assert_eq!(url.host(), Some(&Host::Ipv4(0)));
    }

    #[test]
    fn ipv6_host() {
        let url = Url::parse("http://[::1]/").unwrap();
        assert_eq!(url.host(), Some(&Host::Ipv6([0, 0, 0, 0, 0, 0, 0, 1])));
    }

    #[test]
    fn ipv6_full() {
        let url = Url::parse("http://[2001:db8:85a3:0:0:8a2e:370:7334]/").unwrap();
        assert_eq!(
            url.host(),
            Some(&Host::Ipv6([
                0x2001, 0x0db8, 0x85a3, 0, 0, 0x8a2e, 0x0370, 0x7334
            ]))
        );
    }

    #[test]
    fn ipv6_serialization_compressed() {
        let url = Url::parse("http://[2001:db8::1]/").unwrap();
        assert_eq!(url.host_str(), Some("[2001:db8::1]".into()));
    }

    #[test]
    fn ipv6_all_zeros() {
        let url = Url::parse("http://[::]/").unwrap();
        assert_eq!(url.host(), Some(&Host::Ipv6([0; 8])));
        assert_eq!(url.host_str(), Some("[::]".into()));
    }

    #[test]
    fn ipv6_loopback() {
        let pieces = parse_ipv6("::1").unwrap();
        assert_eq!(pieces, [0, 0, 0, 0, 0, 0, 0, 1]);
    }

    #[test]
    fn ipv6_with_ipv4() {
        // The trailing dotted quad fills the last two 16-bit pieces.
        let pieces = parse_ipv6("::ffff:192.168.1.1").unwrap();
        assert_eq!(pieces, [0, 0, 0, 0, 0, 0xffff, 0xc0a8, 0x0101]);
    }

    // -------------------------------------------------------------------
    // IPv4 parsing
    // -------------------------------------------------------------------

    #[test]
    fn ipv4_basic() {
        assert_eq!(parse_ipv4("192.168.1.1").unwrap(), 0xC0A80101);
    }

    #[test]
    fn ipv4_hex() {
        assert_eq!(parse_ipv4("0xC0.0xA8.0x01.0x01").unwrap(), 0xC0A80101);
    }

    #[test]
    fn ipv4_octal() {
        assert_eq!(parse_ipv4("0300.0250.01.01").unwrap(), 0xC0A80101);
    }

    #[test]
    fn ipv4_single_number() {
        // A lone number is the whole 32-bit address in the WHATWG IPv4 parser:
        // 3232235777 == 0xC0A80101 == 192.168.1.1.
        assert_eq!(parse_ipv4("3232235777").unwrap(), 0xC0A80101);
    }

    #[test]
    fn ipv4_two_parts() {
        // Two parts: first is top 8 bits, second is bottom 24 bits.
        // 192.168.1.1 => 168*65536 + 1*256 + 1 = 11010305
        assert_eq!(parse_ipv4("192.11010305").unwrap(), 0xC0A80101);
    }

    #[test]
    fn ipv4_reject_overflow() {
        assert!(parse_ipv4("256.0.0.0").is_err());
    }

    #[test]
    fn ipv4_reject_empty_part() {
        assert!(parse_ipv4("1..1.1").is_err());
    }

    // -------------------------------------------------------------------
    // Percent encoding/decoding
    // -------------------------------------------------------------------

    #[test]
    fn percent_decode_basic() {
        assert_eq!(percent_decode_string("%48%65%6C%6C%6F"), "Hello");
    }

    #[test]
    fn percent_decode_mixed() {
        assert_eq!(percent_decode_string("Hello%20World"), "Hello World");
    }

    #[test]
    fn percent_decode_passthrough() {
        assert_eq!(percent_decode_string("no-encoding"), "no-encoding");
    }

    #[test]
    fn percent_decode_partial() {
        // Incomplete escapes are left untouched rather than rejected.
        assert_eq!(percent_decode_string("100%"), "100%");
        assert_eq!(percent_decode_string("%2"), "%2");
    }

    #[test]
    fn percent_encode_userinfo() {
        let encoded = percent_encode("user@host", is_userinfo_encode);
        assert_eq!(encoded, "user%40host");
    }

    #[test]
    fn percent_encode_path() {
        let encoded = percent_encode("hello world", is_path_encode);
        assert_eq!(encoded, "hello%20world");
    }

    // -------------------------------------------------------------------
    // Path parsing and dot segments
    // -------------------------------------------------------------------

    #[test]
    fn path_dot_removal() {
        let url = Url::parse("http://example.com/a/b/../c").unwrap();
        assert_eq!(url.path(), "/a/c");
    }

    #[test]
    fn path_dot_current() {
        let url = Url::parse("http://example.com/a/./b").unwrap();
        assert_eq!(url.path(), "/a/b");
    }

    #[test]
    fn path_multiple_dots() {
        let url = Url::parse("http://example.com/a/b/c/../../d").unwrap();
        assert_eq!(url.path(), "/a/d");
    }

    #[test]
    fn path_trailing_slash() {
        let url = Url::parse("http://example.com/a/b/").unwrap();
        assert_eq!(url.path(), "/a/b/");
    }

    #[test]
    fn path_empty() {
        let url = Url::parse("http://example.com").unwrap();
        assert_eq!(url.path(), "/");
    }

    #[test]
    fn path_double_dot_at_root() {
        // `..` cannot climb above the root.
        let url = Url::parse("http://example.com/../a").unwrap();
        assert_eq!(url.path(), "/a");
    }

    // -------------------------------------------------------------------
    // Relative URL resolution
    // -------------------------------------------------------------------

    #[test]
    fn relative_path() {
        let base = Url::parse("http://example.com/a/b/c").unwrap();
        let url = Url::parse_with_base("d", &base).unwrap();
        assert_eq!(url.path(), "/a/b/d");
        assert_eq!(url.host_str(), Some("example.com".into()));
    }

    #[test]
    fn relative_path_with_dots() {
        let base = Url::parse("http://example.com/a/b/c").unwrap();
        let url = Url::parse_with_base("../d", &base).unwrap();
        assert_eq!(url.path(), "/a/d");
    }

    #[test]
    fn relative_absolute_path() {
        let base = Url::parse("http://example.com/a/b/c").unwrap();
        let url = Url::parse_with_base("/d/e", &base).unwrap();
        assert_eq!(url.path(), "/d/e");
        assert_eq!(url.host_str(), Some("example.com".into()));
    }

    #[test]
    fn relative_query_only() {
        let base = Url::parse("http://example.com/a/b?old=1").unwrap();
        let url = Url::parse_with_base("?new=2", &base).unwrap();
        assert_eq!(url.path(), "/a/b");
        assert_eq!(url.query(), Some("new=2"));
    }

    #[test]
    fn relative_fragment_only() {
        let base = Url::parse("http://example.com/a/b#old").unwrap();
        let url = Url::parse_with_base("#new", &base).unwrap();
        assert_eq!(url.path(), "/a/b");
        assert_eq!(url.fragment(), Some("new"));
    }

    #[test]
    fn relative_authority_override() {
        let base = Url::parse("http://example.com/a/b").unwrap();
        let url = Url::parse_with_base("//other.com/c", &base).unwrap();
        assert_eq!(url.scheme(), "http");
        assert_eq!(url.host_str(), Some("other.com".into()));
        assert_eq!(url.path(), "/c");
    }

    #[test]
    fn absolute_url_ignores_base() {
        let base = Url::parse("http://example.com/a").unwrap();
        let url = Url::parse_with_base("https://other.com/b", &base).unwrap();
        assert_eq!(url.scheme(), "https");
        assert_eq!(url.host_str(), Some("other.com".into()));
        assert_eq!(url.path(), "/b");
    }

    #[test]
    fn relative_empty_string() {
        // An empty reference keeps the base path and query but drops the fragment.
        let base = Url::parse("http://example.com/a/b?q=1#f").unwrap();
        let url = Url::parse_with_base("", &base).unwrap();
        assert_eq!(url.path(), "/a/b");
        assert_eq!(url.query(), Some("q=1"));
        assert_eq!(url.fragment(), None);
    }

    // -------------------------------------------------------------------
    // Serialization
    // -------------------------------------------------------------------

    #[test]
    fn serialize_simple() {
        let url = Url::parse("http://example.com/path").unwrap();
        assert_eq!(url.serialize(), "http://example.com/path");
    }

    #[test]
    fn serialize_with_credentials() {
        let url = Url::parse("http://user:pass@example.com/").unwrap();
        assert_eq!(url.serialize(), "http://user:pass@example.com/");
    }

    #[test]
    fn serialize_with_port() {
        let url = Url::parse("http://example.com:8080/").unwrap();
        assert_eq!(url.serialize(), "http://example.com:8080/");
    }

    #[test]
    fn serialize_with_query_fragment() {
        let url = Url::parse("http://example.com/path?q=1#frag").unwrap();
        assert_eq!(url.serialize(), "http://example.com/path?q=1#frag");
    }

    #[test]
    fn serialize_data_uri() {
        let url = Url::parse("data:text/html,hello").unwrap();
        assert_eq!(url.serialize(), "data:text/html,hello");
    }

    #[test]
    fn roundtrip_full_url() {
        let input = "https://user:pass@example.com:8080/a/b?q=1#frag";
        let url = Url::parse(input).unwrap();
        assert_eq!(url.serialize(), input);
    }

    #[test]
    fn roundtrip_ipv4() {
        let url = Url::parse("http://192.168.1.1/path").unwrap();
        assert_eq!(url.serialize(), "http://192.168.1.1/path");
    }

    #[test]
    fn roundtrip_ipv6() {
        let url = Url::parse("http://[::1]/path").unwrap();
        assert_eq!(url.serialize(), "http://[::1]/path");
    }

    // -------------------------------------------------------------------
    // Origin
    // -------------------------------------------------------------------

    #[test]
    fn origin_http() {
        let url = Url::parse("http://example.com:8080/path").unwrap();
        match url.origin() {
            Origin::Tuple(scheme, host, port) => {
                assert_eq!(scheme, "http");
                assert_eq!(host, Host::Domain("example.com".into()));
                assert_eq!(port, Some(8080));
            }
            _ => panic!("expected tuple origin"),
        }
    }

    #[test]
    fn origin_https_default_port() {
        let url = Url::parse("https://example.com/").unwrap();
        match url.origin() {
            Origin::Tuple(scheme, host, port) => {
                assert_eq!(scheme, "https");
                assert_eq!(host, Host::Domain("example.com".into()));
                assert_eq!(port, None);
            }
            _ => panic!("expected tuple origin"),
        }
    }

    #[test]
    fn origin_data_is_opaque() {
        let url = Url::parse("data:text/html,hello").unwrap();
        assert_eq!(url.origin(), Origin::Opaque);
    }

    // -------------------------------------------------------------------
    // Origin::same_origin
    // -------------------------------------------------------------------

    #[test]
    fn same_origin_identical_tuple() {
        let a = Url::parse("http://example.com/page1").unwrap();
        let b = Url::parse("http://example.com/page2").unwrap();
        assert!(a.origin().same_origin(&b.origin()));
    }

    #[test]
    fn same_origin_different_path_query_fragment() {
        let a = Url::parse("https://example.com/a?x=1#frag").unwrap();
        let b = Url::parse("https://example.com/b?y=2#other").unwrap();
        assert!(a.origin().same_origin(&b.origin()));
    }

    #[test]
    fn same_origin_different_scheme() {
        let a = Url::parse("http://example.com/").unwrap();
        let b = Url::parse("https://example.com/").unwrap();
        assert!(!a.origin().same_origin(&b.origin()));
    }

    #[test]
    fn same_origin_different_host() {
        let a = Url::parse("http://example.com/").unwrap();
        let b = Url::parse("http://other.com/").unwrap();
        assert!(!a.origin().same_origin(&b.origin()));
    }

    #[test]
    fn same_origin_different_port() {
        let a = Url::parse("http://example.com:8080/").unwrap();
        let b = Url::parse("http://example.com:9090/").unwrap();
        assert!(!a.origin().same_origin(&b.origin()));
    }

    #[test]
    fn same_origin_default_port_normalization_http() {
        // http://example.com (port=None) should match http://example.com:80
        let a = Url::parse("http://example.com/").unwrap();
        let b = Url::parse("http://example.com:80/").unwrap();
        assert!(a.origin().same_origin(&b.origin()));
    }

    #[test]
    fn same_origin_default_port_normalization_https() {
        let a = Url::parse("https://example.com/").unwrap();
        let b = Url::parse("https://example.com:443/").unwrap();
        assert!(a.origin().same_origin(&b.origin()));
    }

    #[test]
    fn same_origin_default_port_normalization_ftp() {
        let a = Url::parse("ftp://example.com/").unwrap();
        let b = Url::parse("ftp://example.com:21/").unwrap();
        assert!(a.origin().same_origin(&b.origin()));
    }

    #[test]
    fn same_origin_non_default_port_vs_none() {
        let a = Url::parse("http://example.com/").unwrap();
        let b = Url::parse("http://example.com:8080/").unwrap();
        assert!(!a.origin().same_origin(&b.origin()));
    }

    #[test]
    fn same_origin_opaque_never_matches() {
        // Two opaque origins are never the same origin, even for identical URLs.
        let a = Url::parse("data:text/html,hello").unwrap();
        let b = Url::parse("data:text/html,hello").unwrap();
        assert!(!a.origin().same_origin(&b.origin()));
    }

    #[test]
    fn same_origin_opaque_vs_tuple() {
        let a = Url::parse("data:text/html,hello").unwrap();
        let b = Url::parse("http://example.com/").unwrap();
        assert!(!a.origin().same_origin(&b.origin()));
    }

    #[test]
    fn same_origin_ipv4() {
        let a = Url::parse("http://127.0.0.1/a").unwrap();
        let b = Url::parse("http://127.0.0.1/b").unwrap();
        assert!(a.origin().same_origin(&b.origin()));
    }

    #[test]
    fn same_origin_ipv4_different() {
        let a = Url::parse("http://127.0.0.1/").unwrap();
        let b = Url::parse("http://192.168.1.1/").unwrap();
        assert!(!a.origin().same_origin(&b.origin()));
    }

    // -------------------------------------------------------------------
    // Origin::serialize / Display
    // -------------------------------------------------------------------

    #[test]
    fn origin_serialize_http() {
        let url = Url::parse("http://example.com/path").unwrap();
        assert_eq!(url.origin().serialize(), "http://example.com");
    }

    #[test]
    fn origin_serialize_https_with_port() {
        let url = Url::parse("https://example.com:8443/").unwrap();
        assert_eq!(url.origin().serialize(), "https://example.com:8443");
    }

    #[test]
    fn origin_serialize_default_port_omitted() {
        // Default port should not appear in serialization
        let url = Url::parse("http://example.com:80/").unwrap();
        assert_eq!(url.origin().serialize(), "http://example.com");
    }

    #[test]
    fn origin_serialize_opaque() {
        let url = Url::parse("data:text/html,hi").unwrap();
        assert_eq!(url.origin().serialize(), "null");
    }

    #[test]
    fn origin_display() {
        let url = Url::parse("https://example.com/").unwrap();
        assert_eq!(format!("{}", url.origin()), "https://example.com");
    }

    // -------------------------------------------------------------------
    // File URLs
    // -------------------------------------------------------------------

    #[test]
    fn file_url_unix() {
        let url = Url::parse("file:///home/user/file.txt").unwrap();
        assert_eq!(url.scheme(), "file");
        assert_eq!(url.host_str(), Some("".into()));
        assert_eq!(url.path(), "/home/user/file.txt");
    }

    #[test]
    fn file_url_windows_drive() {
        let url = Url::parse("file:///C:/Windows/system32").unwrap();
        assert_eq!(url.scheme(), "file");
        assert_eq!(url.path(), "/C:/Windows/system32");
    }

    #[test]
    fn file_url_with_host() {
        let url = Url::parse("file://server/share/file.txt").unwrap();
        assert_eq!(url.scheme(), "file");
        assert_eq!(url.host_str(), Some("server".into()));
        assert_eq!(url.path(), "/share/file.txt");
    }

    // -------------------------------------------------------------------
    // Edge cases
    // -------------------------------------------------------------------

    #[test]
    fn empty_input_fails() {
        assert_eq!(Url::parse(""), Err(UrlError::EmptyInput));
    }

    #[test]
    fn whitespace_only_fails() {
        assert_eq!(Url::parse(" "), Err(UrlError::EmptyInput));
    }

    #[test]
    fn missing_scheme_fails() {
        assert!(Url::parse("example.com").is_err());
    }

    #[test]
    fn leading_whitespace_stripped() {
        let url = Url::parse(" http://example.com ").unwrap();
        assert_eq!(url.host_str(), Some("example.com".into()));
    }

    #[test]
    fn tab_newline_stripped() {
        // Tabs and newlines are removed from anywhere in the input.
        let url = Url::parse("http://exa\tmple\n.com/").unwrap();
        assert_eq!(url.host_str(), Some("example.com".into()));
    }

    #[test]
    fn query_with_special_chars() {
        let url = Url::parse("http://example.com/?key=val ue&foo=bar").unwrap();
        assert!(url.query().unwrap().contains("key=val%20ue"));
    }

    #[test]
    fn fragment_with_special_chars() {
        let url = Url::parse("http://example.com/#sec tion").unwrap();
        assert!(url.fragment().unwrap().contains("sec%20tion"));
    }

    #[test]
    fn username_only() {
        let url = Url::parse("http://user@example.com/").unwrap();
        assert_eq!(url.username(), "user");
        assert_eq!(url.password(), "");
        assert!(url.has_credentials());
    }

    #[test]
    fn no_credentials() {
        let url = Url::parse("http://example.com/").unwrap();
        assert!(!url.has_credentials());
    }

    #[test]
    fn port_overflow_fails() {
        assert!(Url::parse("http://example.com:99999/").is_err());
    }

    #[test]
    fn ws_scheme() {
        let url = Url::parse("ws://example.com/chat").unwrap();
        assert_eq!(url.scheme(), "ws");
        assert_eq!(url.port_or_default(), Some(80));
    }

    #[test]
    fn wss_scheme() {
        let url = Url::parse("wss://example.com/chat").unwrap();
        assert_eq!(url.scheme(), "wss");
        assert_eq!(url.port_or_default(), Some(443));
    }

    #[test]
    fn cannot_be_a_base() {
        let url = Url::parse("data:text/html,hello").unwrap();
        assert!(url.cannot_be_a_base());
    }

    #[test]
    fn http_can_be_a_base() {
        let url = Url::parse("http://example.com/").unwrap();
        assert!(!url.cannot_be_a_base());
    }

    // -------------------------------------------------------------------
    // Display/ToString
    // -------------------------------------------------------------------

    #[test]
    fn display_matches_serialize() {
        let url = Url::parse("https://example.com:8443/path?q=1#f").unwrap();
        assert_eq!(format!("{url}"), url.serialize());
    }

    // -------------------------------------------------------------------
    // Multiple path segments
    // -------------------------------------------------------------------

    #[test]
    fn path_segments() {
        let url = Url::parse("http://example.com/a/b/c").unwrap();
        assert_eq!(url.path_segments(), &["a", "b", "c"]);
    }

    #[test]
    fn path_segments_trailing_slash() {
        // A trailing slash yields a final empty segment.
        let url = Url::parse("http://example.com/a/b/").unwrap();
        assert_eq!(url.path_segments(), &["a", "b", ""]);
    }

    // -------------------------------------------------------------------
    // Host type
    // -------------------------------------------------------------------

    #[test]
    fn host_serialize_domain() {
        let h = Host::Domain("example.com".into());
        assert_eq!(h.serialize(), "example.com");
    }

    #[test]
    fn host_serialize_ipv4() {
        let h = Host::Ipv4(0x7F000001);
        assert_eq!(h.serialize(), "127.0.0.1");
    }

    #[test]
    fn host_serialize_ipv6() {
        let h = Host::Ipv6([0, 0, 0, 0, 0, 0, 0, 1]);
        assert_eq!(h.serialize(), "[::1]");
    }

    // -------------------------------------------------------------------
    // IPv6 serialization
    // -------------------------------------------------------------------

    #[test]
    fn ipv6_serialize_full() {
        let pieces = [
            0x2001, 0x0db8, 0x85a3, 0x0001, 0x0002, 0x8a2e, 0x0370, 0x7334,
        ];
        assert_eq!(serialize_ipv6(&pieces), "2001:db8:85a3:1:2:8a2e:370:7334");
    }

    #[test]
    fn ipv6_serialize_compress() {
        let pieces = [0x2001, 0x0db8, 0, 0, 0, 0, 0, 1];
        assert_eq!(serialize_ipv6(&pieces), "2001:db8::1");
    }

    #[test]
    fn ipv6_serialize_all_zeros() {
        let pieces = [0u16; 8];
        assert_eq!(serialize_ipv6(&pieces), "::");
    }

    #[test]
    fn ipv6_serialize_no_compress_single_zero() {
        // Isolated zero pieces are written out; only longer runs are compressed.
        let pieces = [1, 0, 2, 0, 3, 0, 4, 0];
        assert_eq!(serialize_ipv6(&pieces), "1:0:2:0:3:0:4:0");
    }

    // -------------------------------------------------------------------
    // Percent encoding edge cases
    // -------------------------------------------------------------------

    #[test]
    fn percent_encode_preserves_unreserved() {
        let encoded = percent_encode("hello-world_test.page~1", is_path_encode);
        assert_eq!(encoded, "hello-world_test.page~1");
    }

    #[test]
    fn percent_encode_multibyte_utf8() {
        // Each UTF-8 byte of a non-ASCII character is escaped separately.
        let encoded = percent_encode("café", is_path_encode);
        assert_eq!(encoded, "caf%C3%A9");
    }
}