//! WHATWG URL parser. //! //! Implements the URL Standard (): //! - URL record type with scheme, username, password, host, port, path, query, fragment //! - State-machine parser following the spec //! - Host parsing: domains, IPv4 addresses, IPv6 addresses //! - Percent-encoding and decoding (UTF-8) //! - Special scheme handling (http, https, ftp, ws, wss, file) //! - Relative URL resolution via base URL //! - URL serialization //! - Origin derivation use core::fmt; // --------------------------------------------------------------------------- // Error types // --------------------------------------------------------------------------- #[derive(Debug, Clone, PartialEq, Eq)] pub enum UrlError { /// Input is empty or contains only whitespace. EmptyInput, /// Invalid URL syntax. InvalidUrl, /// Invalid scheme. InvalidScheme, /// Invalid authority. InvalidAuthority, /// Invalid host. InvalidHost, /// Invalid port number. InvalidPort, /// Invalid IPv4 address. InvalidIpv4, /// Invalid IPv6 address. InvalidIpv6, /// Invalid percent-encoding. InvalidPercentEncoding, /// Relative URL without a base. RelativeWithoutBase, /// Missing scheme. 
MissingScheme, } impl fmt::Display for UrlError { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match self { Self::EmptyInput => write!(f, "empty input"), Self::InvalidUrl => write!(f, "invalid URL"), Self::InvalidScheme => write!(f, "invalid scheme"), Self::InvalidAuthority => write!(f, "invalid authority"), Self::InvalidHost => write!(f, "invalid host"), Self::InvalidPort => write!(f, "invalid port number"), Self::InvalidIpv4 => write!(f, "invalid IPv4 address"), Self::InvalidIpv6 => write!(f, "invalid IPv6 address"), Self::InvalidPercentEncoding => write!(f, "invalid percent-encoding"), Self::RelativeWithoutBase => write!(f, "relative URL without a base"), Self::MissingScheme => write!(f, "missing scheme"), } } } pub type Result = core::result::Result; // --------------------------------------------------------------------------- // Host // --------------------------------------------------------------------------- /// A parsed URL host. #[derive(Debug, Clone, PartialEq, Eq)] pub enum Host { /// A domain name (already lowercased). Domain(String), /// An IPv4 address. Ipv4(u32), /// An IPv6 address (128 bits as eight 16-bit pieces). Ipv6([u16; 8]), } impl Host { /// Serialize the host to a string. pub fn serialize(&self) -> String { match self { Host::Domain(d) => d.clone(), Host::Ipv4(addr) => serialize_ipv4(*addr), Host::Ipv6(pieces) => format!("[{}]", serialize_ipv6(pieces)), } } } impl fmt::Display for Host { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!(f, "{}", self.serialize()) } } // --------------------------------------------------------------------------- // Origin // --------------------------------------------------------------------------- /// A URL origin (scheme, host, port). #[derive(Debug, Clone, PartialEq, Eq)] pub enum Origin { /// A tuple origin (scheme, host, port). Tuple(String, Host, Option), /// An opaque origin (unique, not equal to anything). 
Opaque, } // --------------------------------------------------------------------------- // URL record // --------------------------------------------------------------------------- /// A parsed URL record per the WHATWG URL Standard. #[derive(Debug, Clone, PartialEq, Eq)] pub struct Url { /// The scheme (e.g., "http", "https", "file"). pub scheme: String, /// The username (percent-encoded). username: String, /// The password (percent-encoded). password: String, /// The host. pub host: Option, /// The port (None = default or absent). pub port: Option, /// Path segments. For non-opaque paths, these are the segments. /// For opaque paths (cannot-be-a-base URL), this is a single element. path: Vec, /// Whether this URL has an opaque path (cannot-be-a-base URL). opaque_path: bool, /// The query string (without leading '?'). pub query: Option, /// The fragment (without leading '#'). pub fragment: Option, } impl Url { /// Parse a URL string. pub fn parse(input: &str) -> Result { parse_url(input, None) } /// Parse a URL string with a base URL for resolving relative references. pub fn parse_with_base(input: &str, base: &Url) -> Result { parse_url(input, Some(base)) } /// Get the scheme. pub fn scheme(&self) -> &str { &self.scheme } /// Get the username (percent-encoded). pub fn username(&self) -> &str { &self.username } /// Get the password (percent-encoded). pub fn password(&self) -> &str { &self.password } /// Get the host. pub fn host(&self) -> Option<&Host> { self.host.as_ref() } /// Get the host as a string. pub fn host_str(&self) -> Option { self.host.as_ref().map(|h| h.serialize()) } /// Get the port. pub fn port(&self) -> Option { self.port } /// Get the port or the default port for the scheme. pub fn port_or_default(&self) -> Option { self.port.or_else(|| default_port(&self.scheme)) } /// Get the path as a string. 
pub fn path(&self) -> String { if self.opaque_path { self.path.first().cloned().unwrap_or_default() } else { let mut s = String::new(); for seg in &self.path { s.push('/'); s.push_str(seg); } if s.is_empty() { s.push('/'); } s } } /// Get the path segments. pub fn path_segments(&self) -> &[String] { &self.path } /// Get the query string. pub fn query(&self) -> Option<&str> { self.query.as_deref() } /// Get the fragment. pub fn fragment(&self) -> Option<&str> { self.fragment.as_deref() } /// Whether this URL has an opaque path (cannot-be-a-base). pub fn cannot_be_a_base(&self) -> bool { self.opaque_path } /// Whether this URL includes credentials. pub fn has_credentials(&self) -> bool { !self.username.is_empty() || !self.password.is_empty() } /// Derive the origin of this URL. pub fn origin(&self) -> Origin { match self.scheme.as_str() { "http" | "https" | "ws" | "wss" | "ftp" => { if let Some(host) = &self.host { Origin::Tuple(self.scheme.clone(), host.clone(), self.port) } else { Origin::Opaque } } _ => Origin::Opaque, } } /// Serialize this URL to a string (the href). 
pub fn serialize(&self) -> String { let mut output = String::new(); output.push_str(&self.scheme); output.push(':'); if self.host.is_some() { output.push_str("//"); if self.has_credentials() { output.push_str(&self.username); if !self.password.is_empty() { output.push(':'); output.push_str(&self.password); } output.push('@'); } if let Some(ref host) = self.host { output.push_str(&host.serialize()); } if let Some(port) = self.port { output.push(':'); output.push_str(&port.to_string()); } } else if !self.opaque_path && self.scheme == "file" { output.push_str("//"); } output.push_str(&self.path()); if let Some(ref query) = self.query { output.push('?'); output.push_str(query); } if let Some(ref fragment) = self.fragment { output.push('#'); output.push_str(fragment); } output } } impl fmt::Display for Url { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!(f, "{}", self.serialize()) } } // --------------------------------------------------------------------------- // Special schemes // --------------------------------------------------------------------------- /// Whether a scheme is "special" per the URL standard. fn is_special_scheme(scheme: &str) -> bool { matches!(scheme, "http" | "https" | "ftp" | "ws" | "wss" | "file") } /// Default port for a special scheme. fn default_port(scheme: &str) -> Option { match scheme { "http" | "ws" => Some(80), "https" | "wss" => Some(443), "ftp" => Some(21), _ => None, } } // --------------------------------------------------------------------------- // Percent encoding / decoding // --------------------------------------------------------------------------- /// The C0 control percent-encode set. fn is_c0_control(c: char) -> bool { c <= '\u{001F}' || c > '\u{007E}' } /// The fragment percent-encode set. fn is_fragment_encode(c: char) -> bool { is_c0_control(c) || c == ' ' || c == '"' || c == '<' || c == '>' || c == '`' } /// The query percent-encode set. 
fn is_query_encode(c: char) -> bool {
    is_c0_control(c) || matches!(c, ' ' | '"' | '#' | '<' | '>')
}

/// The special query percent-encode set.
///
/// The query set plus `'`.
fn is_special_query_encode(c: char) -> bool {
    is_query_encode(c) || c == '\''
}

/// The path percent-encode set.
///
/// The query set plus `?`, `` ` ``, `{` and `}`.
fn is_path_encode(c: char) -> bool {
    is_query_encode(c) || matches!(c, '?' | '`' | '{' | '}')
}

/// The userinfo percent-encode set.
///
/// The path set plus `/ : ; = @ [ \ ] ^ |`.
fn is_userinfo_encode(c: char) -> bool {
    is_path_encode(c)
        || matches!(c, '/' | ':' | ';' | '=' | '@' | '[' | '\\' | ']' | '^' | '|')
}

/// Percent-encode a string using the given encode set predicate.
///
/// Every character for which `should_encode` returns `true` is replaced by the
/// `%XX` escapes (uppercase hex) of its UTF-8 bytes; all other characters,
/// including existing `%` signs, are copied through unchanged.
fn percent_encode(input: &str, should_encode: fn(char) -> bool) -> String {
    let mut out = String::with_capacity(input.len());
    // Scratch space for one UTF-8 encoded char; avoids a heap allocation per character.
    let mut utf8 = [0u8; 4];
    for c in input.chars() {
        if should_encode(c) {
            for &byte in c.encode_utf8(&mut utf8).as_bytes() {
                out.push('%');
                out.push(to_hex_upper(byte >> 4));
                out.push(to_hex_upper(byte & 0x0F));
            }
        } else {
            out.push(c);
        }
    }
    out
}

/// Uppercase hexadecimal digit for a nibble (`0..=15`).
fn to_hex_upper(n: u8) -> char {
    match n {
        0..=9 => char::from(b'0' + n),
        _ => char::from(b'A' + n - 10),
    }
}

/// Percent-decode a byte string.
///
/// A `%` followed by two hexadecimal digits becomes the byte they denote. A `%`
/// that is not followed by two hex digits is copied through unchanged, as is
/// every other byte.
pub fn percent_decode(input: &str) -> Vec<u8> {
    let bytes = input.as_bytes();
    let mut out = Vec::with_capacity(bytes.len());
    let mut i = 0;
    while i < bytes.len() {
        if bytes[i] == b'%' {
            let hi = bytes.get(i + 1).copied().and_then(hex_val);
            let lo = bytes.get(i + 2).copied().and_then(hex_val);
            if let (Some(hi), Some(lo)) = (hi, lo) {
                out.push(hi << 4 | lo);
                i += 3;
                continue;
            }
        }
        out.push(bytes[i]);
        i += 1;
    }
    out
}

/// Percent-decode to a UTF-8 string (lossy).
pub fn percent_decode_string(input: &str) -> String { String::from_utf8_lossy(&percent_decode(input)).into_owned() } fn hex_val(b: u8) -> Option { match b { b'0'..=b'9' => Some(b - b'0'), b'a'..=b'f' => Some(b - b'a' + 10), b'A'..=b'F' => Some(b - b'A' + 10), _ => None, } } // --------------------------------------------------------------------------- // IPv4 parsing // --------------------------------------------------------------------------- fn parse_ipv4(input: &str) -> Result { let parts: Vec<&str> = input.split('.').collect(); if parts.len() < 2 || parts.len() > 4 { return Err(UrlError::InvalidIpv4); } let mut numbers: Vec = Vec::with_capacity(parts.len()); for part in &parts { if part.is_empty() { return Err(UrlError::InvalidIpv4); } let n = parse_ipv4_number(part)?; numbers.push(n); } let last = numbers.len() - 1; for (i, &n) in numbers.iter().enumerate() { if i < last && n > 255 { return Err(UrlError::InvalidIpv4); } } if numbers[last] >= 256u64.pow((4 - last) as u32) { return Err(UrlError::InvalidIpv4); } let mut ipv4 = numbers[last] as u32; for (i, &n) in numbers.iter().enumerate().take(last) { ipv4 += (n as u32) << (8 * (3 - i)); } Ok(ipv4) } fn parse_ipv4_number(input: &str) -> Result { if input.is_empty() { return Err(UrlError::InvalidIpv4); } let (s, radix) = if input.starts_with("0x") || input.starts_with("0X") { (&input[2..], 16) } else if input.len() > 1 && input.starts_with('0') { (&input[1..], 8) } else { (input, 10) }; if s.is_empty() { return Ok(0); } u64::from_str_radix(s, radix).map_err(|_| UrlError::InvalidIpv4) } fn serialize_ipv4(addr: u32) -> String { format!( "{}.{}.{}.{}", (addr >> 24) & 0xFF, (addr >> 16) & 0xFF, (addr >> 8) & 0xFF, addr & 0xFF ) } // --------------------------------------------------------------------------- // IPv6 parsing // --------------------------------------------------------------------------- fn parse_ipv6(input: &str) -> Result<[u16; 8]> { let mut pieces = [0u16; 8]; let mut piece_index: usize = 0; let mut 
compress: Option = None; let chars: Vec = input.chars().collect(); let len = chars.len(); let mut pointer = 0; if pointer < len && chars[pointer] == ':' { if pointer + 1 >= len || chars[pointer + 1] != ':' { return Err(UrlError::InvalidIpv6); } pointer += 2; piece_index += 1; compress = Some(piece_index); } while pointer < len { if piece_index >= 8 { return Err(UrlError::InvalidIpv6); } if chars[pointer] == ':' { if compress.is_some() { return Err(UrlError::InvalidIpv6); } pointer += 1; piece_index += 1; compress = Some(piece_index); continue; } let mut value: u16 = 0; let mut length = 0; while length < 4 && pointer < len && chars[pointer].is_ascii_hexdigit() { value = value * 0x10 + hex_val(chars[pointer] as u8).unwrap() as u16; pointer += 1; length += 1; } if pointer < len && chars[pointer] == '.' { // IPv4-mapped IPv6. if length == 0 { return Err(UrlError::InvalidIpv6); } pointer -= length; if piece_index > 6 { return Err(UrlError::InvalidIpv6); } let mut numbers_seen = 0; while pointer < len { let mut ipv4_piece: Option = None; if numbers_seen > 0 { if chars[pointer] == '.' 
&& numbers_seen < 4 { pointer += 1; } else { return Err(UrlError::InvalidIpv6); } } if pointer >= len || !chars[pointer].is_ascii_digit() { return Err(UrlError::InvalidIpv6); } while pointer < len && chars[pointer].is_ascii_digit() { let number = (chars[pointer] as u8 - b'0') as u16; match ipv4_piece { None => ipv4_piece = Some(number), Some(0) => return Err(UrlError::InvalidIpv6), // leading zero Some(v) => ipv4_piece = Some(v * 10 + number), } if ipv4_piece.unwrap_or(0) > 255 { return Err(UrlError::InvalidIpv6); } pointer += 1; } pieces[piece_index] = pieces[piece_index] * 0x100 + ipv4_piece.ok_or(UrlError::InvalidIpv6)?; numbers_seen += 1; if numbers_seen == 2 || numbers_seen == 4 { piece_index += 1; } } if numbers_seen != 4 { return Err(UrlError::InvalidIpv6); } break; } if pointer < len && chars[pointer] == ':' { pointer += 1; if pointer >= len { // Trailing single colon after a piece — only valid with compress. } } else if pointer < len { return Err(UrlError::InvalidIpv6); } if piece_index >= 8 { return Err(UrlError::InvalidIpv6); } pieces[piece_index] = value; piece_index += 1; } if let Some(comp) = compress { let mut swaps = piece_index - comp; piece_index = 7; while piece_index != 0 && swaps > 0 { let swap_index = comp + swaps - 1; pieces.swap(piece_index, swap_index); piece_index -= 1; swaps -= 1; } } else if piece_index != 8 { return Err(UrlError::InvalidIpv6); } Ok(pieces) } fn serialize_ipv6(pieces: &[u16; 8]) -> String { // Find the longest run of consecutive zeros for :: compression. 
let mut best_start = None; let mut best_len = 0usize; let mut cur_start = None; let mut cur_len = 0usize; for (i, &p) in pieces.iter().enumerate() { if p == 0 { if cur_start.is_none() { cur_start = Some(i); cur_len = 1; } else { cur_len += 1; } } else { if cur_len > best_len && cur_len >= 2 { best_start = cur_start; best_len = cur_len; } cur_start = None; cur_len = 0; } } if cur_len > best_len && cur_len >= 2 { best_start = cur_start; best_len = cur_len; } let mut out = String::new(); let mut i = 0; while i < 8 { if Some(i) == best_start { out.push_str("::"); i += best_len; continue; } if !out.is_empty() && !out.ends_with(':') { out.push(':'); } out.push_str(&format!("{:x}", pieces[i])); i += 1; } out } // --------------------------------------------------------------------------- // Host parsing // --------------------------------------------------------------------------- fn parse_host(input: &str, is_special: bool) -> Result { if input.is_empty() { if is_special { return Err(UrlError::InvalidHost); } return Ok(Host::Domain(String::new())); } // IPv6 if input.starts_with('[') { if !input.ends_with(']') { return Err(UrlError::InvalidIpv6); } let inner = &input[1..input.len() - 1]; let pieces = parse_ipv6(inner)?; return Ok(Host::Ipv6(pieces)); } if !is_special { let encoded = percent_encode(input, is_c0_control); return Ok(Host::Domain(encoded)); } // Domain — percent-decode then lowercase. let decoded = percent_decode_string(input); let lowered = decoded.to_ascii_lowercase(); // Check if it's an IPv4 address. if ends_with_number(&lowered) { match parse_ipv4(&lowered) { Ok(addr) => return Ok(Host::Ipv4(addr)), Err(_) => return Err(UrlError::InvalidHost), } } // Validate domain characters. for c in lowered.chars() { if c == '\0' || c == '\t' || c == '\n' || c == '\r' || c == ' ' || c == '#' || c == '/' || c == ':' || c == '<' || c == '>' || c == '?' 
|| c == '@' || c == '[' || c == '\\' || c == ']' || c == '^' || c == '|' { return Err(UrlError::InvalidHost); } } Ok(Host::Domain(lowered)) } /// Check if a domain string ends with a number (suggesting IPv4). fn ends_with_number(input: &str) -> bool { let last_part = match input.rsplit('.').next() { Some(p) => p, None => return false, }; if last_part.is_empty() { return false; } if last_part.starts_with("0x") || last_part.starts_with("0X") { return last_part[2..].chars().all(|c| c.is_ascii_hexdigit()); } last_part.chars().all(|c| c.is_ascii_digit()) } // --------------------------------------------------------------------------- // Shorten path helper // --------------------------------------------------------------------------- fn shorten_path(scheme: &str, path: &mut Vec) { if scheme == "file" && path.len() == 1 { if let Some(first) = path.first() { if is_normalized_windows_drive_letter(first) { return; } } } path.pop(); } fn is_normalized_windows_drive_letter(s: &str) -> bool { let bytes = s.as_bytes(); bytes.len() == 2 && bytes[0].is_ascii_alphabetic() && bytes[1] == b':' } fn starts_with_windows_drive_letter(s: &str) -> bool { let bytes = s.as_bytes(); if bytes.len() < 2 { return false; } if !bytes[0].is_ascii_alphabetic() { return false; } if bytes[1] != b':' && bytes[1] != b'|' { return false; } if bytes.len() >= 3 { matches!(bytes[2], b'/' | b'\\' | b'?' | b'#') } else { true } } // --------------------------------------------------------------------------- // URL parser // --------------------------------------------------------------------------- fn parse_url(input: &str, base: Option<&Url>) -> Result { // Strip leading/trailing C0 controls and spaces. let input = input.trim_matches(|c: char| c <= '\u{0020}'); if input.is_empty() { if let Some(base) = base { return parse_relative("", base); } return Err(UrlError::EmptyInput); } // Remove tab and newline characters. 
let input: String = input .chars() .filter(|&c| c != '\t' && c != '\n' && c != '\r') .collect(); let chars: Vec = input.chars().collect(); let len = chars.len(); let mut pointer = 0; // Try to parse a scheme. let mut scheme = String::new(); let mut has_scheme = false; if pointer < len && chars[pointer].is_ascii_alphabetic() { let mut temp = String::new(); temp.push(chars[pointer].to_ascii_lowercase()); let mut p = pointer + 1; while p < len && (chars[p].is_ascii_alphanumeric() || chars[p] == '+' || chars[p] == '-' || chars[p] == '.') { temp.push(chars[p].to_ascii_lowercase()); p += 1; } if p < len && chars[p] == ':' { scheme = temp; has_scheme = true; pointer = p + 1; // skip the ':' } } if !has_scheme { if let Some(base) = base { return parse_relative(&input, base); } return Err(UrlError::MissingScheme); } let is_special = is_special_scheme(&scheme); let mut url = Url { scheme: scheme.clone(), username: String::new(), password: String::new(), host: None, port: None, path: Vec::new(), opaque_path: false, query: None, fragment: None, }; let remaining: String = chars[pointer..].iter().collect(); if scheme == "file" { return parse_file_url(&remaining, base, url); } if let Some(after_slashes) = remaining.strip_prefix("//") { parse_authority_and_path(&mut url, after_slashes, is_special)?; } else if is_special { if let Some(base) = base { if base.scheme == url.scheme { return parse_relative_special(&remaining, base, url); } } if let Some(after_slash) = remaining.strip_prefix('/') { parse_authority_and_path(&mut url, after_slash, is_special)?; } else { parse_authority_and_path(&mut url, &remaining, is_special)?; } } else { parse_opaque_or_path(&mut url, &remaining)?; } Ok(url) } fn parse_authority_and_path(url: &mut Url, input: &str, is_special: bool) -> Result<()> { let authority_end = input .find(|c: char| c == '/' || c == '?' 
|| c == '#' || (is_special && c == '\\')) .unwrap_or(input.len()); let authority = &input[..authority_end]; let rest = &input[authority_end..]; let (userinfo_part, hostport) = if let Some(at_pos) = authority.rfind('@') { (&authority[..at_pos], &authority[at_pos + 1..]) } else { ("", authority) }; if !userinfo_part.is_empty() { if let Some(colon_pos) = userinfo_part.find(':') { url.username = percent_encode(&userinfo_part[..colon_pos], is_userinfo_encode); url.password = percent_encode(&userinfo_part[colon_pos + 1..], is_userinfo_encode); } else { url.username = percent_encode(userinfo_part, is_userinfo_encode); } } let (host_str, port_str) = split_host_port(hostport); url.host = Some(parse_host(host_str, is_special)?); if let Some(port_s) = port_str { if !port_s.is_empty() { let port: u16 = port_s.parse().map_err(|_| UrlError::InvalidPort)?; if default_port(&url.scheme) != Some(port) { url.port = Some(port); } } } parse_path_query_fragment(url, rest, is_special) } fn split_host_port(input: &str) -> (&str, Option<&str>) { if input.starts_with('[') { if let Some(bracket_end) = input.find(']') { let host = &input[..bracket_end + 1]; let after = &input[bracket_end + 1..]; if let Some(port_str) = after.strip_prefix(':') { return (host, Some(port_str)); } return (host, None); } return (input, None); } if let Some(colon_pos) = input.rfind(':') { let port_part = &input[colon_pos + 1..]; if port_part.is_empty() || port_part.chars().all(|c| c.is_ascii_digit()) { return (&input[..colon_pos], Some(port_part)); } } (input, None) } fn parse_path_query_fragment(url: &mut Url, input: &str, is_special: bool) -> Result<()> { let mut remaining = input; let path_end = remaining.find(['?', '#']).unwrap_or(remaining.len()); let path_str = &remaining[..path_end]; remaining = &remaining[path_end..]; parse_path_into(url, path_str, is_special); if let Some(after_q) = remaining.strip_prefix('?') { remaining = after_q; let query_end = remaining.find('#').unwrap_or(remaining.len()); let 
query_str = &remaining[..query_end]; remaining = &remaining[query_end..]; let encode_fn = if is_special { is_special_query_encode } else { is_query_encode }; url.query = Some(percent_encode(query_str, encode_fn)); } if let Some(after_hash) = remaining.strip_prefix('#') { url.fragment = Some(percent_encode(after_hash, is_fragment_encode)); } Ok(()) } fn parse_path_into(url: &mut Url, path: &str, is_special: bool) { if path.is_empty() { if is_special { url.path = vec![String::new()]; } return; } let segments: Vec<&str> = if is_special { path.split(['/', '\\']).collect() } else { path.split('/').collect() }; for (i, seg) in segments.iter().enumerate() { if i == 0 && seg.is_empty() { continue; } let decoded = *seg; if decoded == "." || decoded.eq_ignore_ascii_case("%2e") { if i == segments.len() - 1 { url.path.push(String::new()); } } else if decoded == ".." || decoded.eq_ignore_ascii_case(".%2e") || decoded.eq_ignore_ascii_case("%2e.") || decoded.eq_ignore_ascii_case("%2e%2e") { shorten_path(&url.scheme, &mut url.path); if i == segments.len() - 1 { url.path.push(String::new()); } } else { url.path.push(percent_encode(decoded, is_path_encode)); } } } fn parse_opaque_or_path(url: &mut Url, input: &str) -> Result<()> { let mut remaining = input; let path_end = remaining.find(['?', '#']).unwrap_or(remaining.len()); let path_str = &remaining[..path_end]; remaining = &remaining[path_end..]; if path_str.starts_with('/') { url.opaque_path = false; parse_path_into(url, path_str, false); } else { url.opaque_path = true; url.path = vec![percent_encode(path_str, is_c0_control)]; } if let Some(after_q) = remaining.strip_prefix('?') { remaining = after_q; let query_end = remaining.find('#').unwrap_or(remaining.len()); let query_str = &remaining[..query_end]; remaining = &remaining[query_end..]; url.query = Some(percent_encode(query_str, is_query_encode)); } if let Some(after_hash) = remaining.strip_prefix('#') { url.fragment = Some(percent_encode(after_hash, is_fragment_encode)); } 
Ok(()) } // --------------------------------------------------------------------------- // Relative URL parsing // --------------------------------------------------------------------------- fn parse_relative(input: &str, base: &Url) -> Result { let mut url = Url { scheme: base.scheme.clone(), username: base.username.clone(), password: base.password.clone(), host: base.host.clone(), port: base.port, path: base.path.clone(), opaque_path: base.opaque_path, query: base.query.clone(), fragment: None, }; let is_special = is_special_scheme(&url.scheme); if input.is_empty() { return Ok(url); } let chars: Vec = input.chars().collect(); if chars[0] == '/' || (is_special && chars[0] == '\\') { if input.starts_with("//") || (is_special && input.starts_with("\\/")) { let after_slashes = &input[2..]; url.username = String::new(); url.password = String::new(); url.path = Vec::new(); url.query = None; parse_authority_and_path(&mut url, after_slashes, is_special)?; return Ok(url); } url.path = Vec::new(); url.query = None; parse_path_query_fragment(&mut url, input, is_special)?; return Ok(url); } if let Some(after_q) = input.strip_prefix('?') { url.query = None; url.fragment = None; let query_end = after_q.find('#').unwrap_or(after_q.len()); let query_str = &after_q[..query_end]; let after = &after_q[query_end..]; let encode_fn = if is_special { is_special_query_encode } else { is_query_encode }; url.query = Some(percent_encode(query_str, encode_fn)); if let Some(frag) = after.strip_prefix('#') { url.fragment = Some(percent_encode(frag, is_fragment_encode)); } return Ok(url); } if let Some(frag) = input.strip_prefix('#') { url.fragment = Some(percent_encode(frag, is_fragment_encode)); return Ok(url); } // Path-relative. 
if !url.opaque_path { shorten_path(&url.scheme, &mut url.path); } url.query = None; url.fragment = None; parse_path_query_fragment(&mut url, &format!("/{input}"), is_special)?; Ok(url) } fn parse_relative_special(remaining: &str, base: &Url, mut url: Url) -> Result { url.username = base.username.clone(); url.password = base.password.clone(); url.host = base.host.clone(); url.port = base.port; url.path = base.path.clone(); url.query = base.query.clone(); let is_special = true; if remaining.is_empty() { return Ok(url); } if remaining.starts_with('/') || remaining.starts_with('\\') { url.path = Vec::new(); url.query = None; parse_path_query_fragment(&mut url, remaining, is_special)?; return Ok(url); } if let Some(rest) = remaining.strip_prefix('?') { url.query = None; url.fragment = None; let query_end = rest.find('#').unwrap_or(rest.len()); url.query = Some(percent_encode(&rest[..query_end], is_special_query_encode)); if query_end < rest.len() { url.fragment = Some(percent_encode(&rest[query_end + 1..], is_fragment_encode)); } return Ok(url); } if let Some(frag) = remaining.strip_prefix('#') { url.fragment = Some(percent_encode(frag, is_fragment_encode)); return Ok(url); } shorten_path(&url.scheme, &mut url.path); url.query = None; parse_path_query_fragment(&mut url, &format!("/{remaining}"), is_special)?; Ok(url) } // --------------------------------------------------------------------------- // File URL parsing // --------------------------------------------------------------------------- fn parse_file_url(input: &str, base: Option<&Url>, mut url: Url) -> Result { url.host = Some(Host::Domain(String::new())); let remaining = if let Some(after) = input.strip_prefix("//") { after } else if let Some(after) = input.strip_prefix('/') { after } else if let Some(base) = base { if base.scheme == "file" { url.host = base.host.clone(); url.path = base.path.clone(); if let Some(rest) = input.strip_prefix('?') { url.query = None; url.fragment = None; let query_end = 
rest.find('#').unwrap_or(rest.len()); url.query = Some(percent_encode(&rest[..query_end], is_query_encode)); if query_end < rest.len() { url.fragment = Some(percent_encode(&rest[query_end + 1..], is_fragment_encode)); } return Ok(url); } if let Some(frag) = input.strip_prefix('#') { url.fragment = Some(percent_encode(frag, is_fragment_encode)); return Ok(url); } shorten_path(&url.scheme, &mut url.path); url.query = None; parse_path_query_fragment(&mut url, &format!("/{input}"), false)?; return Ok(url); } else { input } } else { input }; let path_start = remaining .find(['/', '\\', '?', '#']) .unwrap_or(remaining.len()); let potential_host = &remaining[..path_start]; let rest = &remaining[path_start..]; if starts_with_windows_drive_letter(remaining) { url.host = Some(Host::Domain(String::new())); parse_path_query_fragment(&mut url, &format!("/{remaining}"), false)?; return Ok(url); } if !potential_host.is_empty() { let host = parse_host(potential_host, false)?; if host != Host::Domain(String::new()) { url.host = Some(host); } } parse_path_query_fragment(&mut url, rest, false)?; // Normalize Windows drive letters in path. 
if let Some(first) = url.path.first_mut() { if first.len() == 2 { let bytes = first.as_bytes(); if bytes[0].is_ascii_alphabetic() && bytes[1] == b'|' { let mut normalized = String::new(); normalized.push(bytes[0] as char); normalized.push(':'); *first = normalized; } } } Ok(url) } // --------------------------------------------------------------------------- // Tests // --------------------------------------------------------------------------- #[cfg(test)] mod tests { use super::*; // ------------------------------------------------------------------- // Basic absolute URL parsing // ------------------------------------------------------------------- #[test] fn parse_simple_http() { let url = Url::parse("http://example.com").unwrap(); assert_eq!(url.scheme(), "http"); assert_eq!(url.host_str(), Some("example.com".into())); assert_eq!(url.port(), None); assert_eq!(url.path(), "/"); assert_eq!(url.query(), None); assert_eq!(url.fragment(), None); } #[test] fn parse_https_with_path() { let url = Url::parse("https://example.com/foo/bar").unwrap(); assert_eq!(url.scheme(), "https"); assert_eq!(url.host_str(), Some("example.com".into())); assert_eq!(url.path(), "/foo/bar"); } #[test] fn parse_full_url() { let url = Url::parse("https://user:pass@example.com:8080/path/to/page?q=1&r=2#frag").unwrap(); assert_eq!(url.scheme(), "https"); assert_eq!(url.username(), "user"); assert_eq!(url.password(), "pass"); assert_eq!(url.host_str(), Some("example.com".into())); assert_eq!(url.port(), Some(8080)); assert_eq!(url.path(), "/path/to/page"); assert_eq!(url.query(), Some("q=1&r=2")); assert_eq!(url.fragment(), Some("frag")); } #[test] fn parse_default_port_omitted() { let url = Url::parse("http://example.com:80/").unwrap(); assert_eq!(url.port(), None); assert_eq!(url.port_or_default(), Some(80)); } #[test] fn parse_non_default_port() { let url = Url::parse("http://example.com:8080/").unwrap(); assert_eq!(url.port(), Some(8080)); } #[test] fn parse_https_default_port() { let url 
= Url::parse("https://example.com:443/").unwrap(); assert_eq!(url.port(), None); } #[test] fn parse_ftp_default_port() { let url = Url::parse("ftp://files.example.com:21/readme.txt").unwrap(); assert_eq!(url.port(), None); assert_eq!(url.port_or_default(), Some(21)); } // ------------------------------------------------------------------- // Scheme handling // ------------------------------------------------------------------- #[test] fn scheme_is_lowercased() { let url = Url::parse("HTTP://EXAMPLE.COM").unwrap(); assert_eq!(url.scheme(), "http"); } #[test] fn non_special_scheme() { let url = Url::parse("custom://host/path").unwrap(); assert_eq!(url.scheme(), "custom"); assert_eq!(url.host_str(), Some("host".into())); assert_eq!(url.path(), "/path"); } #[test] fn data_uri() { let url = Url::parse("data:text/html,

Hello

").unwrap(); assert_eq!(url.scheme(), "data"); assert!(url.cannot_be_a_base()); } #[test] fn javascript_uri() { let url = Url::parse("javascript:alert(1)").unwrap(); assert_eq!(url.scheme(), "javascript"); assert!(url.cannot_be_a_base()); } #[test] fn mailto_uri() { let url = Url::parse("mailto:user@example.com").unwrap(); assert_eq!(url.scheme(), "mailto"); assert!(url.cannot_be_a_base()); } // ------------------------------------------------------------------- // Host parsing // ------------------------------------------------------------------- #[test] fn host_is_lowercased() { let url = Url::parse("http://EXAMPLE.COM/").unwrap(); assert_eq!(url.host_str(), Some("example.com".into())); } #[test] fn ipv4_host() { let url = Url::parse("http://127.0.0.1/").unwrap(); assert_eq!(url.host(), Some(&Host::Ipv4(0x7F000001))); assert_eq!(url.host_str(), Some("127.0.0.1".into())); } #[test] fn ipv4_host_all_zeros() { let url = Url::parse("http://0.0.0.0/").unwrap(); assert_eq!(url.host(), Some(&Host::Ipv4(0))); } #[test] fn ipv6_host() { let url = Url::parse("http://[::1]/").unwrap(); assert_eq!(url.host(), Some(&Host::Ipv6([0, 0, 0, 0, 0, 0, 0, 1]))); } #[test] fn ipv6_full() { let url = Url::parse("http://[2001:db8:85a3:0:0:8a2e:370:7334]/").unwrap(); assert_eq!( url.host(), Some(&Host::Ipv6([ 0x2001, 0x0db8, 0x85a3, 0, 0, 0x8a2e, 0x0370, 0x7334 ])) ); } #[test] fn ipv6_serialization_compressed() { let url = Url::parse("http://[2001:db8::1]/").unwrap(); assert_eq!(url.host_str(), Some("[2001:db8::1]".into())); } #[test] fn ipv6_all_zeros() { let url = Url::parse("http://[::]/").unwrap(); assert_eq!(url.host(), Some(&Host::Ipv6([0; 8]))); assert_eq!(url.host_str(), Some("[::]".into())); } #[test] fn ipv6_loopback() { let pieces = parse_ipv6("::1").unwrap(); assert_eq!(pieces, [0, 0, 0, 0, 0, 0, 0, 1]); } #[test] fn ipv6_with_ipv4() { let pieces = parse_ipv6("::ffff:192.168.1.1").unwrap(); assert_eq!(pieces, [0, 0, 0, 0, 0, 0xffff, 0xc0a8, 0x0101]); } // 
// -------------------------------------------------------------------
// IPv4 parsing
// -------------------------------------------------------------------

    // Four decimal parts.
    #[test]
    fn ipv4_basic() {
        assert_eq!(parse_ipv4("192.168.1.1").unwrap(), 0xC0A80101);
    }

    // Parts written with a `0x` prefix are read as hexadecimal.
    #[test]
    fn ipv4_hex() {
        assert_eq!(parse_ipv4("0xC0.0xA8.0x01.0x01").unwrap(), 0xC0A80101);
    }

    // Parts written with a leading `0` are read as octal.
    #[test]
    fn ipv4_octal() {
        assert_eq!(parse_ipv4("0300.0250.01.01").unwrap(), 0xC0A80101);
    }

    #[test]
    fn ipv4_single_number() {
        // One part: the URL Standard's IPv4 parser treats a lone number as the
        // whole 32-bit address. 3232235777 == 0xC0A80101 (192.168.1.1).
        assert_eq!(parse_ipv4("3232235777").unwrap(), 0xC0A80101);
    }

    #[test]
    fn ipv4_two_parts() {
        // Two parts: first is top 8 bits, second is bottom 24 bits.
        // 192.168.1.1 => 168*65536 + 1*256 + 1 = 11010305
        assert_eq!(parse_ipv4("192.11010305").unwrap(), 0xC0A80101);
    }

    // A part above 255 in a four-part address is an error.
    #[test]
    fn ipv4_reject_overflow() {
        assert!(parse_ipv4("256.0.0.0").is_err());
    }

    // An empty part between two dots is an error.
    #[test]
    fn ipv4_reject_empty_part() {
        assert!(parse_ipv4("1..1.1").is_err());
    }

    // -------------------------------------------------------------------
    // Percent encoding/decoding
    // -------------------------------------------------------------------

    // A fully percent-encoded string decodes to its ASCII text.
    #[test]
    fn percent_decode_basic() {
        assert_eq!(percent_decode_string("%48%65%6C%6C%6F"), "Hello");
    }

    // Literal characters and escapes can be mixed.
    #[test]
    fn percent_decode_mixed() {
        assert_eq!(percent_decode_string("Hello%20World"), "Hello World");
    }

    // Input without any `%` comes back unchanged.
    #[test]
    fn percent_decode_passthrough() {
        assert_eq!(percent_decode_string("no-encoding"), "no-encoding");
    }

    // Truncated escapes are left in place rather than decoded or rejected.
    #[test]
    fn percent_decode_partial() {
        assert_eq!(percent_decode_string("100%"), "100%");
        assert_eq!(percent_decode_string("%2"), "%2");
    }

    // `@` is escaped under the userinfo predicate.
    #[test]
    fn percent_encode_userinfo() {
        let encoded = percent_encode("user@host", is_userinfo_encode);
        assert_eq!(encoded, "user%40host");
    }

    // A space is escaped under the path predicate.
    #[test]
    fn percent_encode_path() {
        let encoded = percent_encode("hello world", is_path_encode);
        assert_eq!(encoded, "hello%20world");
    }

    // -------------------------------------------------------------------
    // Path parsing and dot segments
    // -------------------------------------------------------------------

    // `..` removes the preceding segment.
    #[test]
    fn path_dot_removal() {
        let url = Url::parse("http://example.com/a/b/../c").unwrap();
        assert_eq!(url.path(), "/a/c");
    }

    // `.` is dropped without affecting its neighbours.
    #[test]
    fn path_dot_current() {
        let url = Url::parse("http://example.com/a/./b").unwrap();
        assert_eq!(url.path(), "/a/b");
    }

    // Consecutive `..` segments each remove one preceding segment.
    #[test]
    fn path_multiple_dots() {
        let url = Url::parse("http://example.com/a/b/c/../../d").unwrap();
        assert_eq!(url.path(), "/a/d");
    }

    // A trailing slash is kept in the reported path.
    #[test]
    fn path_trailing_slash() {
        let url = Url::parse("http://example.com/a/b/").unwrap();
        assert_eq!(url.path(), "/a/b/");
    }

    // An `http` URL with no path reports "/".
    #[test]
    fn path_empty() {
        let url = Url::parse("http://example.com").unwrap();
        assert_eq!(url.path(), "/");
    }

    // `..` at the root has nothing to remove and is simply dropped.
    #[test]
    fn path_double_dot_at_root() {
        let url = Url::parse("http://example.com/../a").unwrap();
        assert_eq!(url.path(), "/a");
    }

    // -------------------------------------------------------------------
    // Relative URL resolution
    // -------------------------------------------------------------------

    // A bare segment replaces the last segment of the base path.
    #[test]
    fn relative_path() {
        let base = Url::parse("http://example.com/a/b/c").unwrap();
        let url = Url::parse_with_base("d", &base).unwrap();
        assert_eq!(url.path(), "/a/b/d");
        assert_eq!(url.host_str(), Some("example.com".into()));
    }

    // A leading `..` climbs one directory relative to the base path.
    #[test]
    fn relative_path_with_dots() {
        let base = Url::parse("http://example.com/a/b/c").unwrap();
        let url = Url::parse_with_base("../d", &base).unwrap();
        assert_eq!(url.path(), "/a/d");
    }

    // A path starting with `/` replaces the base path but keeps the host.
    #[test]
    fn relative_absolute_path() {
        let base = Url::parse("http://example.com/a/b/c").unwrap();
        let url = Url::parse_with_base("/d/e", &base).unwrap();
        assert_eq!(url.path(), "/d/e");
        assert_eq!(url.host_str(), Some("example.com".into()));
    }

    // A `?query` reference keeps the base path and replaces the query.
    #[test]
    fn relative_query_only() {
        let base = Url::parse("http://example.com/a/b?old=1").unwrap();
        let url = Url::parse_with_base("?new=2", &base).unwrap();
        assert_eq!(url.path(), "/a/b");
        assert_eq!(url.query(), Some("new=2"));
    }

    // A `#fragment` reference keeps the base path and replaces the fragment.
    #[test]
    fn relative_fragment_only() {
        let base = Url::parse("http://example.com/a/b#old").unwrap();
        let url = Url::parse_with_base("#new", &base).unwrap();
        assert_eq!(url.path(), "/a/b");
        assert_eq!(url.fragment(), Some("new"));
    }

    // A `//host/path` reference keeps only the scheme of the base.
    #[test]
    fn relative_authority_override() {
        let base = Url::parse("http://example.com/a/b").unwrap();
        let url = Url::parse_with_base("//other.com/c", &base).unwrap();
        assert_eq!(url.scheme(), "http");
        assert_eq!(url.host_str(), Some("other.com".into()));
        assert_eq!(url.path(), "/c");
    }

    // An input that carries its own scheme takes nothing from the base.
    #[test]
    fn absolute_url_ignores_base() {
        let base = Url::parse("http://example.com/a").unwrap();
        let url = Url::parse_with_base("https://other.com/b", &base).unwrap();
        assert_eq!(url.scheme(), "https");
        assert_eq!(url.host_str(), Some("other.com".into()));
        assert_eq!(url.path(), "/b");
    }

    // An empty reference keeps the base path and query but not its fragment.
    #[test]
    fn relative_empty_string() {
        let base = Url::parse("http://example.com/a/b?q=1#f").unwrap();
        let url = Url::parse_with_base("", &base).unwrap();
        assert_eq!(url.path(), "/a/b");
        assert_eq!(url.query(), Some("q=1"));
        assert_eq!(url.fragment(), None);
    }

    // -------------------------------------------------------------------
    // Serialization
    // -------------------------------------------------------------------

    // Scheme, host and path serialize back to the input.
    #[test]
    fn serialize_simple() {
        let url = Url::parse("http://example.com/path").unwrap();
        assert_eq!(url.serialize(), "http://example.com/path");
    }

    // Username and password are emitted before the host.
    #[test]
    fn serialize_with_credentials() {
        let url = Url::parse("http://user:pass@example.com/").unwrap();
        assert_eq!(url.serialize(), "http://user:pass@example.com/");
    }

    // A non-default port is emitted after the host.
    #[test]
    fn serialize_with_port() {
        let url = Url::parse("http://example.com:8080/").unwrap();
        assert_eq!(url.serialize(), "http://example.com:8080/");
    }

    // Query and fragment are emitted with their `?` and `#` delimiters.
    #[test]
    fn serialize_with_query_fragment() {
        let url = Url::parse("http://example.com/path?q=1#frag").unwrap();
        assert_eq!(url.serialize(), "http://example.com/path?q=1#frag");
    }

    // A `data:` URL serializes back to the input.
    #[test]
    fn serialize_data_uri() {
        let url = Url::parse("data:text/html,hello").unwrap();
        assert_eq!(url.serialize(), "data:text/html,hello");
    }

    // A URL using every component survives parse + serialize unchanged.
    #[test]
    fn roundtrip_full_url() {
        let input = "https://user:pass@example.com:8080/a/b?q=1#frag";
        let url = Url::parse(input).unwrap();
        assert_eq!(url.serialize(), input);
    }

    // An IPv4 host survives parse + serialize unchanged.
    #[test]
    fn roundtrip_ipv4() {
        let url = Url::parse("http://192.168.1.1/path").unwrap();
        assert_eq!(url.serialize(), "http://192.168.1.1/path");
    }

    // A bracketed IPv6 host survives parse + serialize unchanged.
    #[test]
    fn roundtrip_ipv6() {
        let url = Url::parse("http://[::1]/path").unwrap();
        assert_eq!(url.serialize(), "http://[::1]/path");
    }

    // -------------------------------------------------------------------
    // Origin
    // -------------------------------------------------------------------

    // An `http` URL with an explicit port yields a tuple origin carrying it.
    #[test]
    fn origin_http() {
        let url = Url::parse("http://example.com:8080/path").unwrap();
        match url.origin() {
            Origin::Tuple(scheme, host, port) => {
                assert_eq!(scheme, "http");
                assert_eq!(host, Host::Domain("example.com".into()));
                assert_eq!(port, Some(8080));
            }
            _ => panic!("expected tuple origin"),
        }
    }

    // With no port in the URL the tuple origin's port is `None`.
    #[test]
    fn origin_https_default_port() {
        let url = Url::parse("https://example.com/").unwrap();
        match url.origin() {
            Origin::Tuple(scheme, host, port) => {
                assert_eq!(scheme, "https");
                assert_eq!(host, Host::Domain("example.com".into()));
                assert_eq!(port, None);
            }
            _ => panic!("expected tuple origin"),
        }
    }

    // A `data:` URL has an opaque origin.
    #[test]
    fn origin_data_is_opaque() {
        let url = Url::parse("data:text/html,hello").unwrap();
        assert_eq!(url.origin(), Origin::Opaque);
    }

    // -------------------------------------------------------------------
    // File URLs
    // -------------------------------------------------------------------

    // `file:///...` has an empty (but present) host and an absolute path.
    #[test]
    fn file_url_unix() {
        let url = Url::parse("file:///home/user/file.txt").unwrap();
        assert_eq!(url.scheme(), "file");
        assert_eq!(url.host_str(), Some("".into()));
        assert_eq!(url.path(), "/home/user/file.txt");
    }

    // A Windows drive letter is kept as the first path segment.
    #[test]
    fn file_url_windows_drive() {
        let url = Url::parse("file:///C:/Windows/system32").unwrap();
        assert_eq!(url.scheme(), "file");
        assert_eq!(url.path(), "/C:/Windows/system32");
    }

    // `file://server/...` reports the server name as the host.
    #[test]
    fn file_url_with_host() {
        let url = Url::parse("file://server/share/file.txt").unwrap();
        assert_eq!(url.scheme(), "file");
        assert_eq!(url.host_str(), Some("server".into()));
        assert_eq!(url.path(), "/share/file.txt");
    }
// -------------------------------------------------------------------
// Edge cases
// -------------------------------------------------------------------

    // The empty string is rejected with `EmptyInput`.
    #[test]
    fn empty_input_fails() {
        assert_eq!(Url::parse(""), Err(UrlError::EmptyInput));
    }

    // Whitespace-only input is rejected with the same `EmptyInput` error.
    #[test]
    fn whitespace_only_fails() {
        assert_eq!(Url::parse(" "), Err(UrlError::EmptyInput));
    }

    // Without a base, input that has no scheme is an error.
    #[test]
    fn missing_scheme_fails() {
        assert!(Url::parse("example.com").is_err());
    }

    // Spaces around the URL do not end up in the host.
    #[test]
    fn leading_whitespace_stripped() {
        let url = Url::parse(" http://example.com ").unwrap();
        assert_eq!(url.host_str(), Some("example.com".into()));
    }

    // Tabs and newlines inside the input are removed before parsing.
    #[test]
    fn tab_newline_stripped() {
        let url = Url::parse("http://exa\tmple\n.com/").unwrap();
        assert_eq!(url.host_str(), Some("example.com".into()));
    }

    // A space inside the query is reported as `%20`.
    #[test]
    fn query_with_special_chars() {
        let url = Url::parse("http://example.com/?key=val ue&foo=bar").unwrap();
        assert!(url.query().unwrap().contains("key=val%20ue"));
    }

    // A space inside the fragment is reported as `%20`.
    #[test]
    fn fragment_with_special_chars() {
        let url = Url::parse("http://example.com/#sec tion").unwrap();
        assert!(url.fragment().unwrap().contains("sec%20tion"));
    }

    // A username without a password still counts as having credentials.
    #[test]
    fn username_only() {
        let url = Url::parse("http://user@example.com/").unwrap();
        assert_eq!(url.username(), "user");
        assert_eq!(url.password(), "");
        assert!(url.has_credentials());
    }

    // A URL with no userinfo reports no credentials.
    #[test]
    fn no_credentials() {
        let url = Url::parse("http://example.com/").unwrap();
        assert!(!url.has_credentials());
    }

    // A port that does not fit in 16 bits is an error.
    #[test]
    fn port_overflow_fails() {
        assert!(Url::parse("http://example.com:99999/").is_err());
    }

    // `ws` falls back to port 80.
    #[test]
    fn ws_scheme() {
        let url = Url::parse("ws://example.com/chat").unwrap();
        assert_eq!(url.scheme(), "ws");
        assert_eq!(url.port_or_default(), Some(80));
    }

    // `wss` falls back to port 443.
    #[test]
    fn wss_scheme() {
        let url = Url::parse("wss://example.com/chat").unwrap();
        assert_eq!(url.scheme(), "wss");
        assert_eq!(url.port_or_default(), Some(443));
    }

    // A `data:` URL is flagged as cannot-be-a-base.
    #[test]
    fn cannot_be_a_base() {
        let url = Url::parse("data:text/html,hello").unwrap();
        assert!(url.cannot_be_a_base());
    }

    // An `http` URL is not flagged as cannot-be-a-base.
    #[test]
    fn http_can_be_a_base() {
        let url = Url::parse("http://example.com/").unwrap();
        assert!(!url.cannot_be_a_base());
    }

    // -------------------------------------------------------------------
    // Display/ToString
    // -------------------------------------------------------------------

    // Formatting with `{}` produces the same text as `serialize()`.
    #[test]
    fn display_matches_serialize() {
        let url = Url::parse("https://example.com:8443/path?q=1#f").unwrap();
        assert_eq!(format!("{url}"), url.serialize());
    }

    // -------------------------------------------------------------------
    // Multiple path segments
    // -------------------------------------------------------------------

    // Each `/`-separated component is its own segment.
    #[test]
    fn path_segments() {
        let url = Url::parse("http://example.com/a/b/c").unwrap();
        assert_eq!(url.path_segments(), &["a", "b", "c"]);
    }

    // A trailing slash shows up as a final empty segment.
    #[test]
    fn path_segments_trailing_slash() {
        let url = Url::parse("http://example.com/a/b/").unwrap();
        assert_eq!(url.path_segments(), &["a", "b", ""]);
    }

    // -------------------------------------------------------------------
    // Host type
    // -------------------------------------------------------------------

    // A domain host serializes to the stored string.
    #[test]
    fn host_serialize_domain() {
        let h = Host::Domain("example.com".into());
        assert_eq!(h.serialize(), "example.com");
    }

    // An IPv4 host serializes as a dotted quad.
    #[test]
    fn host_serialize_ipv4() {
        let h = Host::Ipv4(0x7F000001);
        assert_eq!(h.serialize(), "127.0.0.1");
    }

    // An IPv6 host serializes inside square brackets.
    #[test]
    fn host_serialize_ipv6() {
        let h = Host::Ipv6([0, 0, 0, 0, 0, 0, 0, 1]);
        assert_eq!(h.serialize(), "[::1]");
    }

    // -------------------------------------------------------------------
    // IPv6 serialization
    // -------------------------------------------------------------------

    // With no zero pieces nothing is compressed; leading zeros are dropped.
    #[test]
    fn ipv6_serialize_full() {
        let pieces = [
            0x2001, 0x0db8, 0x85a3, 0x0001, 0x0002, 0x8a2e, 0x0370, 0x7334,
        ];
        assert_eq!(serialize_ipv6(&pieces), "2001:db8:85a3:1:2:8a2e:370:7334");
    }

    // A run of zero pieces collapses to `::`.
    #[test]
    fn ipv6_serialize_compress() {
        let pieces = [0x2001, 0x0db8, 0, 0, 0, 0, 0, 1];
        assert_eq!(serialize_ipv6(&pieces), "2001:db8::1");
    }

    // Eight zero pieces serialize as `::`.
    #[test]
    fn ipv6_serialize_all_zeros() {
        let pieces = [0u16; 8];
        assert_eq!(serialize_ipv6(&pieces), "::");
    }

    // Isolated single zero pieces are written out, not compressed.
    #[test]
    fn ipv6_serialize_no_compress_single_zero() {
        let pieces = [1, 0, 2, 0, 3, 0, 4, 0];
        assert_eq!(serialize_ipv6(&pieces), "1:0:2:0:3:0:4:0");
    }

    // -------------------------------------------------------------------
    // Percent encoding edge cases
    // -------------------------------------------------------------------

    // `-`, `_`, `.` and `~` pass through the path predicate unescaped.
    #[test]
    fn percent_encode_preserves_unreserved() {
        let encoded = percent_encode("hello-world_test.page~1", is_path_encode);
        assert_eq!(encoded, "hello-world_test.page~1");
    }

    // A non-ASCII character is escaped one UTF-8 byte at a time.
    #[test]
    fn percent_encode_multibyte_utf8() {
        let encoded = percent_encode("café", is_path_encode);
        assert_eq!(encoded, "caf%C3%A9");
    }
}