we (web engine): Experimental web browser project to understand the limits of Claude
1//! WHATWG URL parser.
2//!
3//! Implements the URL Standard (<https://url.spec.whatwg.org/>):
4//! - URL record type with scheme, username, password, host, port, path, query, fragment
5//! - State-machine parser following the spec
6//! - Host parsing: domains, IPv4 addresses, IPv6 addresses
7//! - Percent-encoding and decoding (UTF-8)
8//! - Special scheme handling (http, https, ftp, ws, wss, file)
9//! - Relative URL resolution via base URL
10//! - URL serialization
11//! - Origin derivation
12
13pub mod data_url;
14
15use core::fmt;
16
17// ---------------------------------------------------------------------------
18// Error types
19// ---------------------------------------------------------------------------
20
/// Errors reported by the URL parser and its host/address sub-parsers.
///
/// Each variant is a plain marker without payload; the human-readable text is
/// produced by the `Display` implementation.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum UrlError {
    /// Input is empty (after trimming leading/trailing C0 controls and spaces)
    /// and no base URL was supplied.
    EmptyInput,
    /// Invalid URL syntax.
    InvalidUrl,
    /// Invalid scheme.
    InvalidScheme,
    /// Invalid authority.
    InvalidAuthority,
    /// Invalid host.
    InvalidHost,
    /// Invalid port number.
    InvalidPort,
    /// Invalid IPv4 address.
    InvalidIpv4,
    /// Invalid IPv6 address.
    InvalidIpv6,
    /// Invalid percent-encoding.
    InvalidPercentEncoding,
    /// Relative URL without a base.
    RelativeWithoutBase,
    /// The input does not start with a scheme and no base URL was supplied.
    MissingScheme,
}
46
47impl fmt::Display for UrlError {
48 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
49 match self {
50 Self::EmptyInput => write!(f, "empty input"),
51 Self::InvalidUrl => write!(f, "invalid URL"),
52 Self::InvalidScheme => write!(f, "invalid scheme"),
53 Self::InvalidAuthority => write!(f, "invalid authority"),
54 Self::InvalidHost => write!(f, "invalid host"),
55 Self::InvalidPort => write!(f, "invalid port number"),
56 Self::InvalidIpv4 => write!(f, "invalid IPv4 address"),
57 Self::InvalidIpv6 => write!(f, "invalid IPv6 address"),
58 Self::InvalidPercentEncoding => write!(f, "invalid percent-encoding"),
59 Self::RelativeWithoutBase => write!(f, "relative URL without a base"),
60 Self::MissingScheme => write!(f, "missing scheme"),
61 }
62 }
63}
64
/// Result alias used throughout this module; the error type is always [`UrlError`].
pub type Result<T> = core::result::Result<T, UrlError>;
66
67// ---------------------------------------------------------------------------
68// Host
69// ---------------------------------------------------------------------------
70
/// A parsed URL host.
///
/// Use [`Host::serialize`] (or `Display`) to obtain the textual form.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Host {
    /// A domain name (already lowercased).
    Domain(String),
    /// An IPv4 address; the most significant byte is the first dotted component.
    Ipv4(u32),
    /// An IPv6 address (128 bits as eight 16-bit pieces).
    Ipv6([u16; 8]),
}
81
82impl Host {
83 /// Serialize the host to a string.
84 pub fn serialize(&self) -> String {
85 match self {
86 Host::Domain(d) => d.clone(),
87 Host::Ipv4(addr) => serialize_ipv4(*addr),
88 Host::Ipv6(pieces) => format!("[{}]", serialize_ipv6(pieces)),
89 }
90 }
91}
92
93impl fmt::Display for Host {
94 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
95 write!(f, "{}", self.serialize())
96 }
97}
98
99// ---------------------------------------------------------------------------
100// Origin
101// ---------------------------------------------------------------------------
102
/// A URL origin (scheme, host, port).
///
/// Note that the derived `PartialEq` compares structurally, so
/// `Origin::Opaque == Origin::Opaque` is `true`. Use [`Origin::same_origin`]
/// for the comparison in which opaque origins never match.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Origin {
    /// A tuple origin (scheme, host, port).
    Tuple(String, Host, Option<u16>),
    /// An opaque origin; `same_origin` treats it as different from every
    /// origin, including itself.
    Opaque,
}
111
112impl Origin {
113 /// Check whether two origins are the same origin per the HTML spec.
114 ///
115 /// Two tuple origins are same-origin iff their schemes, hosts, and ports
116 /// are identical after normalizing default ports (http→80, https→443, etc.).
117 /// Opaque origins are never same-origin, even with themselves.
118 pub fn same_origin(&self, other: &Origin) -> bool {
119 match (self, other) {
120 (Origin::Tuple(scheme_a, host_a, port_a), Origin::Tuple(scheme_b, host_b, port_b)) => {
121 let effective_port_a = port_a.or_else(|| default_port(scheme_a));
122 let effective_port_b = port_b.or_else(|| default_port(scheme_b));
123 scheme_a == scheme_b && host_a == host_b && effective_port_a == effective_port_b
124 }
125 _ => false,
126 }
127 }
128
129 /// Serialize this origin to a string (e.g. `"https://example.com"`).
130 ///
131 /// Opaque origins serialize to `"null"`.
132 pub fn serialize(&self) -> String {
133 match self {
134 Origin::Opaque => "null".to_string(),
135 Origin::Tuple(scheme, host, port) => {
136 let mut s = String::new();
137 s.push_str(scheme);
138 s.push_str("://");
139 s.push_str(&host.serialize());
140 if let Some(p) = port {
141 if default_port(scheme) != Some(*p) {
142 s.push(':');
143 s.push_str(&p.to_string());
144 }
145 }
146 s
147 }
148 }
149 }
150}
151
152impl fmt::Display for Origin {
153 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
154 write!(f, "{}", self.serialize())
155 }
156}
157
158// ---------------------------------------------------------------------------
159// URL record
160// ---------------------------------------------------------------------------
161
/// A parsed URL record per the WHATWG URL Standard.
///
/// Values are produced by [`Url::parse`] and [`Url::parse_with_base`]. The
/// private fields are readable through the accessor methods on `Url`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Url {
    /// The scheme (e.g., "http", "https", "file").
    pub scheme: String,
    /// The username (percent-encoded); empty when absent.
    username: String,
    /// The password (percent-encoded); empty when absent.
    password: String,
    /// The host.
    pub host: Option<Host>,
    /// The port (None = default or absent).
    pub port: Option<u16>,
    /// Path segments. For non-opaque paths, these are the segments.
    /// For opaque paths (cannot-be-a-base URL), this is a single element.
    path: Vec<String>,
    /// Whether this URL has an opaque path (cannot-be-a-base URL).
    opaque_path: bool,
    /// The query string (without leading '?').
    pub query: Option<String>,
    /// The fragment (without leading '#').
    pub fragment: Option<String>,
}
185
186impl Url {
187 /// Parse a URL string.
188 pub fn parse(input: &str) -> Result<Self> {
189 parse_url(input, None)
190 }
191
192 /// Parse a URL string with a base URL for resolving relative references.
193 pub fn parse_with_base(input: &str, base: &Url) -> Result<Self> {
194 parse_url(input, Some(base))
195 }
196
197 /// Get the scheme.
198 pub fn scheme(&self) -> &str {
199 &self.scheme
200 }
201
202 /// Get the username (percent-encoded).
203 pub fn username(&self) -> &str {
204 &self.username
205 }
206
207 /// Get the password (percent-encoded).
208 pub fn password(&self) -> &str {
209 &self.password
210 }
211
212 /// Get the host.
213 pub fn host(&self) -> Option<&Host> {
214 self.host.as_ref()
215 }
216
217 /// Get the host as a string.
218 pub fn host_str(&self) -> Option<String> {
219 self.host.as_ref().map(|h| h.serialize())
220 }
221
222 /// Get the port.
223 pub fn port(&self) -> Option<u16> {
224 self.port
225 }
226
227 /// Get the port or the default port for the scheme.
228 pub fn port_or_default(&self) -> Option<u16> {
229 self.port.or_else(|| default_port(&self.scheme))
230 }
231
232 /// Get the path as a string.
233 pub fn path(&self) -> String {
234 if self.opaque_path {
235 self.path.first().cloned().unwrap_or_default()
236 } else {
237 let mut s = String::new();
238 for seg in &self.path {
239 s.push('/');
240 s.push_str(seg);
241 }
242 if s.is_empty() {
243 s.push('/');
244 }
245 s
246 }
247 }
248
249 /// Get the path segments.
250 pub fn path_segments(&self) -> &[String] {
251 &self.path
252 }
253
254 /// Get the query string.
255 pub fn query(&self) -> Option<&str> {
256 self.query.as_deref()
257 }
258
259 /// Get the fragment.
260 pub fn fragment(&self) -> Option<&str> {
261 self.fragment.as_deref()
262 }
263
264 /// Whether this URL has an opaque path (cannot-be-a-base).
265 pub fn cannot_be_a_base(&self) -> bool {
266 self.opaque_path
267 }
268
269 /// Whether this URL includes credentials.
270 pub fn has_credentials(&self) -> bool {
271 !self.username.is_empty() || !self.password.is_empty()
272 }
273
274 /// Derive the origin of this URL.
275 pub fn origin(&self) -> Origin {
276 match self.scheme.as_str() {
277 "http" | "https" | "ws" | "wss" | "ftp" => {
278 if let Some(host) = &self.host {
279 Origin::Tuple(self.scheme.clone(), host.clone(), self.port)
280 } else {
281 Origin::Opaque
282 }
283 }
284 _ => Origin::Opaque,
285 }
286 }
287
288 /// Serialize this URL to a string (the href).
289 pub fn serialize(&self) -> String {
290 let mut output = String::new();
291 output.push_str(&self.scheme);
292 output.push(':');
293
294 if self.host.is_some() {
295 output.push_str("//");
296 if self.has_credentials() {
297 output.push_str(&self.username);
298 if !self.password.is_empty() {
299 output.push(':');
300 output.push_str(&self.password);
301 }
302 output.push('@');
303 }
304 if let Some(ref host) = self.host {
305 output.push_str(&host.serialize());
306 }
307 if let Some(port) = self.port {
308 output.push(':');
309 output.push_str(&port.to_string());
310 }
311 } else if !self.opaque_path && self.scheme == "file" {
312 output.push_str("//");
313 }
314
315 output.push_str(&self.path());
316
317 if let Some(ref query) = self.query {
318 output.push('?');
319 output.push_str(query);
320 }
321 if let Some(ref fragment) = self.fragment {
322 output.push('#');
323 output.push_str(fragment);
324 }
325
326 output
327 }
328}
329
330impl fmt::Display for Url {
331 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
332 write!(f, "{}", self.serialize())
333 }
334}
335
336// ---------------------------------------------------------------------------
337// Special schemes
338// ---------------------------------------------------------------------------
339
/// Report whether `scheme` is one of the URL Standard's "special" schemes.
///
/// The comparison is exact, so callers must pass an already-lowercased scheme.
fn is_special_scheme(scheme: &str) -> bool {
    const SPECIAL_SCHEMES: [&str; 6] = ["http", "https", "ftp", "ws", "wss", "file"];
    SPECIAL_SCHEMES.contains(&scheme)
}
344
/// Look up the default port of a scheme.
///
/// Returns `None` for schemes without a default port (including `file`).
fn default_port(scheme: &str) -> Option<u16> {
    const DEFAULT_PORTS: [(&str, u16); 5] = [
        ("http", 80),
        ("ws", 80),
        ("https", 443),
        ("wss", 443),
        ("ftp", 21),
    ];
    DEFAULT_PORTS
        .iter()
        .find(|(name, _)| *name == scheme)
        .map(|&(_, port)| port)
}
354
355// ---------------------------------------------------------------------------
356// Percent encoding / decoding
357// ---------------------------------------------------------------------------
358
/// The C0 control percent-encode set: every code point outside the printable
/// ASCII range U+0020..=U+007E.
fn is_c0_control(c: char) -> bool {
    !('\u{0020}'..='\u{007E}').contains(&c)
}
363
364/// The fragment percent-encode set.
365fn is_fragment_encode(c: char) -> bool {
366 is_c0_control(c) || c == ' ' || c == '"' || c == '<' || c == '>' || c == '`'
367}
368
369/// The query percent-encode set.
370fn is_query_encode(c: char) -> bool {
371 is_c0_control(c) || c == ' ' || c == '"' || c == '#' || c == '<' || c == '>'
372}
373
374/// The special query percent-encode set.
375fn is_special_query_encode(c: char) -> bool {
376 is_query_encode(c) || c == '\''
377}
378
379/// The path percent-encode set.
380fn is_path_encode(c: char) -> bool {
381 is_query_encode(c) || c == '?' || c == '`' || c == '{' || c == '}'
382}
383
384/// The userinfo percent-encode set.
385fn is_userinfo_encode(c: char) -> bool {
386 is_path_encode(c)
387 || c == '/'
388 || c == ':'
389 || c == ';'
390 || c == '='
391 || c == '@'
392 || c == '['
393 || c == '\\'
394 || c == ']'
395 || c == '^'
396 || c == '|'
397}
398
399/// Percent-encode a string using the given encode set predicate.
400fn percent_encode(input: &str, should_encode: fn(char) -> bool) -> String {
401 let mut out = String::with_capacity(input.len());
402 for c in input.chars() {
403 if should_encode(c) {
404 for b in c.to_string().as_bytes() {
405 out.push('%');
406 out.push(to_hex_upper(b >> 4));
407 out.push(to_hex_upper(b & 0x0F));
408 }
409 } else {
410 out.push(c);
411 }
412 }
413 out
414}
415
/// Map a nibble (0..=15) to its uppercase hexadecimal digit.
fn to_hex_upper(n: u8) -> char {
    match n {
        0..=9 => char::from(b'0' + n),
        _ => char::from(b'A' + n - 10),
    }
}
423
424/// Percent-decode a byte string.
425pub fn percent_decode(input: &str) -> Vec<u8> {
426 let bytes = input.as_bytes();
427 let mut out = Vec::with_capacity(bytes.len());
428 let mut i = 0;
429 while i < bytes.len() {
430 if bytes[i] == b'%' && i + 2 < bytes.len() {
431 if let (Some(hi), Some(lo)) = (hex_val(bytes[i + 1]), hex_val(bytes[i + 2])) {
432 out.push(hi << 4 | lo);
433 i += 3;
434 continue;
435 }
436 }
437 out.push(bytes[i]);
438 i += 1;
439 }
440 out
441}
442
443/// Percent-decode to a UTF-8 string (lossy).
444pub fn percent_decode_string(input: &str) -> String {
445 String::from_utf8_lossy(&percent_decode(input)).into_owned()
446}
447
/// Value of an ASCII hexadecimal digit (either case), or `None` for any other byte.
fn hex_val(b: u8) -> Option<u8> {
    // `to_digit(16)` only recognises ASCII digits, so non-ASCII bytes yield `None`.
    char::from(b).to_digit(16).map(|digit| digit as u8)
}
456
457// ---------------------------------------------------------------------------
458// IPv4 parsing
459// ---------------------------------------------------------------------------
460
461fn parse_ipv4(input: &str) -> Result<u32> {
462 let parts: Vec<&str> = input.split('.').collect();
463 if parts.len() < 2 || parts.len() > 4 {
464 return Err(UrlError::InvalidIpv4);
465 }
466 let mut numbers: Vec<u64> = Vec::with_capacity(parts.len());
467 for part in &parts {
468 if part.is_empty() {
469 return Err(UrlError::InvalidIpv4);
470 }
471 let n = parse_ipv4_number(part)?;
472 numbers.push(n);
473 }
474 let last = numbers.len() - 1;
475 for (i, &n) in numbers.iter().enumerate() {
476 if i < last && n > 255 {
477 return Err(UrlError::InvalidIpv4);
478 }
479 }
480 if numbers[last] >= 256u64.pow((4 - last) as u32) {
481 return Err(UrlError::InvalidIpv4);
482 }
483
484 let mut ipv4 = numbers[last] as u32;
485 for (i, &n) in numbers.iter().enumerate().take(last) {
486 ipv4 += (n as u32) << (8 * (3 - i));
487 }
488 Ok(ipv4)
489}
490
491fn parse_ipv4_number(input: &str) -> Result<u64> {
492 if input.is_empty() {
493 return Err(UrlError::InvalidIpv4);
494 }
495 let (s, radix) = if input.starts_with("0x") || input.starts_with("0X") {
496 (&input[2..], 16)
497 } else if input.len() > 1 && input.starts_with('0') {
498 (&input[1..], 8)
499 } else {
500 (input, 10)
501 };
502 if s.is_empty() {
503 return Ok(0);
504 }
505 u64::from_str_radix(s, radix).map_err(|_| UrlError::InvalidIpv4)
506}
507
/// Render an IPv4 address in dotted-decimal form, most significant byte first.
fn serialize_ipv4(addr: u32) -> String {
    let [first, second, third, fourth] = addr.to_be_bytes();
    format!("{first}.{second}.{third}.{fourth}")
}
517
518// ---------------------------------------------------------------------------
519// IPv6 parsing
520// ---------------------------------------------------------------------------
521
522fn parse_ipv6(input: &str) -> Result<[u16; 8]> {
523 let mut pieces = [0u16; 8];
524 let mut piece_index: usize = 0;
525 let mut compress: Option<usize> = None;
526 let chars: Vec<char> = input.chars().collect();
527 let len = chars.len();
528 let mut pointer = 0;
529
530 if pointer < len && chars[pointer] == ':' {
531 if pointer + 1 >= len || chars[pointer + 1] != ':' {
532 return Err(UrlError::InvalidIpv6);
533 }
534 pointer += 2;
535 piece_index += 1;
536 compress = Some(piece_index);
537 }
538
539 while pointer < len {
540 if piece_index >= 8 {
541 return Err(UrlError::InvalidIpv6);
542 }
543
544 if chars[pointer] == ':' {
545 if compress.is_some() {
546 return Err(UrlError::InvalidIpv6);
547 }
548 pointer += 1;
549 piece_index += 1;
550 compress = Some(piece_index);
551 continue;
552 }
553
554 let mut value: u16 = 0;
555 let mut length = 0;
556 while length < 4 && pointer < len && chars[pointer].is_ascii_hexdigit() {
557 value = value * 0x10 + hex_val(chars[pointer] as u8).unwrap() as u16;
558 pointer += 1;
559 length += 1;
560 }
561
562 if pointer < len && chars[pointer] == '.' {
563 // IPv4-mapped IPv6.
564 if length == 0 {
565 return Err(UrlError::InvalidIpv6);
566 }
567 pointer -= length;
568 if piece_index > 6 {
569 return Err(UrlError::InvalidIpv6);
570 }
571 let mut numbers_seen = 0;
572 while pointer < len {
573 let mut ipv4_piece: Option<u16> = None;
574 if numbers_seen > 0 {
575 if chars[pointer] == '.' && numbers_seen < 4 {
576 pointer += 1;
577 } else {
578 return Err(UrlError::InvalidIpv6);
579 }
580 }
581 if pointer >= len || !chars[pointer].is_ascii_digit() {
582 return Err(UrlError::InvalidIpv6);
583 }
584 while pointer < len && chars[pointer].is_ascii_digit() {
585 let number = (chars[pointer] as u8 - b'0') as u16;
586 match ipv4_piece {
587 None => ipv4_piece = Some(number),
588 Some(0) => return Err(UrlError::InvalidIpv6), // leading zero
589 Some(v) => ipv4_piece = Some(v * 10 + number),
590 }
591 if ipv4_piece.unwrap_or(0) > 255 {
592 return Err(UrlError::InvalidIpv6);
593 }
594 pointer += 1;
595 }
596 pieces[piece_index] =
597 pieces[piece_index] * 0x100 + ipv4_piece.ok_or(UrlError::InvalidIpv6)?;
598 numbers_seen += 1;
599 if numbers_seen == 2 || numbers_seen == 4 {
600 piece_index += 1;
601 }
602 }
603 if numbers_seen != 4 {
604 return Err(UrlError::InvalidIpv6);
605 }
606 break;
607 }
608
609 if pointer < len && chars[pointer] == ':' {
610 pointer += 1;
611 if pointer >= len {
612 // Trailing single colon after a piece — only valid with compress.
613 }
614 } else if pointer < len {
615 return Err(UrlError::InvalidIpv6);
616 }
617
618 if piece_index >= 8 {
619 return Err(UrlError::InvalidIpv6);
620 }
621 pieces[piece_index] = value;
622 piece_index += 1;
623 }
624
625 if let Some(comp) = compress {
626 let mut swaps = piece_index - comp;
627 piece_index = 7;
628 while piece_index != 0 && swaps > 0 {
629 let swap_index = comp + swaps - 1;
630 pieces.swap(piece_index, swap_index);
631 piece_index -= 1;
632 swaps -= 1;
633 }
634 } else if piece_index != 8 {
635 return Err(UrlError::InvalidIpv6);
636 }
637
638 Ok(pieces)
639}
640
/// Render eight IPv6 pieces as lowercase hexadecimal text (no brackets).
///
/// The first longest run of two or more zero pieces is collapsed to `::`;
/// every other piece is written without leading zeros.
fn serialize_ipv6(pieces: &[u16; 8]) -> String {
    // (start, length) of the zero run to compress, if any.
    let mut compressed: Option<(usize, usize)> = None;
    let mut idx = 0;
    while idx < pieces.len() {
        if pieces[idx] != 0 {
            idx += 1;
            continue;
        }
        let start = idx;
        while idx < pieces.len() && pieces[idx] == 0 {
            idx += 1;
        }
        let run = idx - start;
        // Strictly-greater keeps the earliest run when several are equally long.
        let longer = compressed.map_or(true, |(_, best)| run > best);
        if run >= 2 && longer {
            compressed = Some((start, run));
        }
    }

    let mut text = String::new();
    let mut idx = 0;
    while idx < pieces.len() {
        match compressed {
            Some((start, run)) if start == idx => {
                text.push_str("::");
                idx += run;
            }
            _ => {
                // No separator at the very start or right after "::".
                if !(text.is_empty() || text.ends_with(':')) {
                    text.push(':');
                }
                text.push_str(&format!("{:x}", pieces[idx]));
                idx += 1;
            }
        }
    }
    text
}
686
687// ---------------------------------------------------------------------------
688// Host parsing
689// ---------------------------------------------------------------------------
690
691fn parse_host(input: &str, is_special: bool) -> Result<Host> {
692 if input.is_empty() {
693 if is_special {
694 return Err(UrlError::InvalidHost);
695 }
696 return Ok(Host::Domain(String::new()));
697 }
698
699 // IPv6
700 if input.starts_with('[') {
701 if !input.ends_with(']') {
702 return Err(UrlError::InvalidIpv6);
703 }
704 let inner = &input[1..input.len() - 1];
705 let pieces = parse_ipv6(inner)?;
706 return Ok(Host::Ipv6(pieces));
707 }
708
709 if !is_special {
710 let encoded = percent_encode(input, is_c0_control);
711 return Ok(Host::Domain(encoded));
712 }
713
714 // Domain — percent-decode then lowercase.
715 let decoded = percent_decode_string(input);
716 let lowered = decoded.to_ascii_lowercase();
717
718 // Check if it's an IPv4 address.
719 if ends_with_number(&lowered) {
720 match parse_ipv4(&lowered) {
721 Ok(addr) => return Ok(Host::Ipv4(addr)),
722 Err(_) => return Err(UrlError::InvalidHost),
723 }
724 }
725
726 // Validate domain characters.
727 for c in lowered.chars() {
728 if c == '\0'
729 || c == '\t'
730 || c == '\n'
731 || c == '\r'
732 || c == ' '
733 || c == '#'
734 || c == '/'
735 || c == ':'
736 || c == '<'
737 || c == '>'
738 || c == '?'
739 || c == '@'
740 || c == '['
741 || c == '\\'
742 || c == ']'
743 || c == '^'
744 || c == '|'
745 {
746 return Err(UrlError::InvalidHost);
747 }
748 }
749
750 Ok(Host::Domain(lowered))
751}
752
/// Report whether the last dot-separated label of `input` looks numeric,
/// which means the host has to be treated as an IPv4 address.
///
/// A label counts as numeric when it consists solely of ASCII digits, or when
/// it starts with `0x`/`0X` followed by zero or more hexadecimal digits. An
/// empty last label (empty input or trailing dot) is not numeric.
fn ends_with_number(input: &str) -> bool {
    let tail = input.rsplit('.').next().unwrap_or("");
    if tail.is_empty() {
        return false;
    }
    let hex_digits = tail.strip_prefix("0x").or_else(|| tail.strip_prefix("0X"));
    match hex_digits {
        Some(digits) => digits.bytes().all(|b| b.is_ascii_hexdigit()),
        None => tail.bytes().all(|b| b.is_ascii_digit()),
    }
}
767
768// ---------------------------------------------------------------------------
769// Shorten path helper
770// ---------------------------------------------------------------------------
771
772fn shorten_path(scheme: &str, path: &mut Vec<String>) {
773 if scheme == "file" && path.len() == 1 {
774 if let Some(first) = path.first() {
775 if is_normalized_windows_drive_letter(first) {
776 return;
777 }
778 }
779 }
780 path.pop();
781}
782
/// Report whether `s` is exactly an ASCII letter followed by `:` (e.g. `C:`).
fn is_normalized_windows_drive_letter(s: &str) -> bool {
    match s.as_bytes() {
        [letter, b':'] => letter.is_ascii_alphabetic(),
        _ => false,
    }
}
787
/// Report whether `s` starts with a Windows drive letter.
///
/// That is: an ASCII letter, then `:` or `|`, and then either the end of the
/// string or one of `/`, `\`, `?`, `#`.
fn starts_with_windows_drive_letter(s: &str) -> bool {
    match s.as_bytes() {
        [letter, b':' | b'|'] => letter.is_ascii_alphabetic(),
        [letter, b':' | b'|', b'/' | b'\\' | b'?' | b'#', ..] => letter.is_ascii_alphabetic(),
        _ => false,
    }
}
805
806// ---------------------------------------------------------------------------
807// URL parser
808// ---------------------------------------------------------------------------
809
810fn parse_url(input: &str, base: Option<&Url>) -> Result<Url> {
811 // Strip leading/trailing C0 controls and spaces.
812 let input = input.trim_matches(|c: char| c <= '\u{0020}');
813
814 if input.is_empty() {
815 if let Some(base) = base {
816 return parse_relative("", base);
817 }
818 return Err(UrlError::EmptyInput);
819 }
820
821 // Remove tab and newline characters.
822 let input: String = input
823 .chars()
824 .filter(|&c| c != '\t' && c != '\n' && c != '\r')
825 .collect();
826
827 let chars: Vec<char> = input.chars().collect();
828 let len = chars.len();
829
830 let mut pointer = 0;
831
832 // Try to parse a scheme.
833 let mut scheme = String::new();
834 let mut has_scheme = false;
835
836 if pointer < len && chars[pointer].is_ascii_alphabetic() {
837 let mut temp = String::new();
838 temp.push(chars[pointer].to_ascii_lowercase());
839 let mut p = pointer + 1;
840 while p < len
841 && (chars[p].is_ascii_alphanumeric()
842 || chars[p] == '+'
843 || chars[p] == '-'
844 || chars[p] == '.')
845 {
846 temp.push(chars[p].to_ascii_lowercase());
847 p += 1;
848 }
849 if p < len && chars[p] == ':' {
850 scheme = temp;
851 has_scheme = true;
852 pointer = p + 1; // skip the ':'
853 }
854 }
855
856 if !has_scheme {
857 if let Some(base) = base {
858 return parse_relative(&input, base);
859 }
860 return Err(UrlError::MissingScheme);
861 }
862
863 let is_special = is_special_scheme(&scheme);
864
865 let mut url = Url {
866 scheme: scheme.clone(),
867 username: String::new(),
868 password: String::new(),
869 host: None,
870 port: None,
871 path: Vec::new(),
872 opaque_path: false,
873 query: None,
874 fragment: None,
875 };
876
877 let remaining: String = chars[pointer..].iter().collect();
878
879 if scheme == "file" {
880 return parse_file_url(&remaining, base, url);
881 }
882
883 if let Some(after_slashes) = remaining.strip_prefix("//") {
884 parse_authority_and_path(&mut url, after_slashes, is_special)?;
885 } else if is_special {
886 if let Some(base) = base {
887 if base.scheme == url.scheme {
888 return parse_relative_special(&remaining, base, url);
889 }
890 }
891 if let Some(after_slash) = remaining.strip_prefix('/') {
892 parse_authority_and_path(&mut url, after_slash, is_special)?;
893 } else {
894 parse_authority_and_path(&mut url, &remaining, is_special)?;
895 }
896 } else {
897 parse_opaque_or_path(&mut url, &remaining)?;
898 }
899
900 Ok(url)
901}
902
903fn parse_authority_and_path(url: &mut Url, input: &str, is_special: bool) -> Result<()> {
904 let authority_end = input
905 .find(|c: char| c == '/' || c == '?' || c == '#' || (is_special && c == '\\'))
906 .unwrap_or(input.len());
907
908 let authority = &input[..authority_end];
909 let rest = &input[authority_end..];
910
911 let (userinfo_part, hostport) = if let Some(at_pos) = authority.rfind('@') {
912 (&authority[..at_pos], &authority[at_pos + 1..])
913 } else {
914 ("", authority)
915 };
916
917 if !userinfo_part.is_empty() {
918 if let Some(colon_pos) = userinfo_part.find(':') {
919 url.username = percent_encode(&userinfo_part[..colon_pos], is_userinfo_encode);
920 url.password = percent_encode(&userinfo_part[colon_pos + 1..], is_userinfo_encode);
921 } else {
922 url.username = percent_encode(userinfo_part, is_userinfo_encode);
923 }
924 }
925
926 let (host_str, port_str) = split_host_port(hostport);
927
928 url.host = Some(parse_host(host_str, is_special)?);
929
930 if let Some(port_s) = port_str {
931 if !port_s.is_empty() {
932 let port: u16 = port_s.parse().map_err(|_| UrlError::InvalidPort)?;
933 if default_port(&url.scheme) != Some(port) {
934 url.port = Some(port);
935 }
936 }
937 }
938
939 parse_path_query_fragment(url, rest, is_special)
940}
941
/// Split `host[:port]` into its host and optional port text.
///
/// * Bracketed (IPv6) hosts: the host runs up to and including the first
///   `]`. What follows must be empty or `:` plus the port text; anything
///   else makes the whole input come back as the host, so that host
///   validation rejects it instead of the trailing text being dropped.
/// * Other hosts: the text after the last `:` is the port if it is empty or
///   consists only of ASCII digits; otherwise the whole input is the host.
///
/// The port text is returned unparsed (and may be empty).
fn split_host_port(input: &str) -> (&str, Option<&str>) {
    if input.starts_with('[') {
        if let Some(bracket_end) = input.find(']') {
            let (host, after) = input.split_at(bracket_end + 1);
            if after.is_empty() {
                return (host, None);
            }
            if let Some(port_str) = after.strip_prefix(':') {
                return (host, Some(port_str));
            }
        }
        // Unterminated bracket or junk after ']': let host parsing fail on it.
        return (input, None);
    }

    match input.rsplit_once(':') {
        Some((host, port)) if port.bytes().all(|b| b.is_ascii_digit()) => (host, Some(port)),
        _ => (input, None),
    }
}
963
964fn parse_path_query_fragment(url: &mut Url, input: &str, is_special: bool) -> Result<()> {
965 let mut remaining = input;
966
967 let path_end = remaining.find(['?', '#']).unwrap_or(remaining.len());
968 let path_str = &remaining[..path_end];
969 remaining = &remaining[path_end..];
970
971 parse_path_into(url, path_str, is_special);
972
973 if let Some(after_q) = remaining.strip_prefix('?') {
974 remaining = after_q;
975 let query_end = remaining.find('#').unwrap_or(remaining.len());
976 let query_str = &remaining[..query_end];
977 remaining = &remaining[query_end..];
978
979 let encode_fn = if is_special {
980 is_special_query_encode
981 } else {
982 is_query_encode
983 };
984 url.query = Some(percent_encode(query_str, encode_fn));
985 }
986
987 if let Some(after_hash) = remaining.strip_prefix('#') {
988 url.fragment = Some(percent_encode(after_hash, is_fragment_encode));
989 }
990
991 Ok(())
992}
993
994fn parse_path_into(url: &mut Url, path: &str, is_special: bool) {
995 if path.is_empty() {
996 if is_special {
997 url.path = vec![String::new()];
998 }
999 return;
1000 }
1001
1002 let segments: Vec<&str> = if is_special {
1003 path.split(['/', '\\']).collect()
1004 } else {
1005 path.split('/').collect()
1006 };
1007
1008 for (i, seg) in segments.iter().enumerate() {
1009 if i == 0 && seg.is_empty() {
1010 continue;
1011 }
1012
1013 let decoded = *seg;
1014 if decoded == "." || decoded.eq_ignore_ascii_case("%2e") {
1015 if i == segments.len() - 1 {
1016 url.path.push(String::new());
1017 }
1018 } else if decoded == ".."
1019 || decoded.eq_ignore_ascii_case(".%2e")
1020 || decoded.eq_ignore_ascii_case("%2e.")
1021 || decoded.eq_ignore_ascii_case("%2e%2e")
1022 {
1023 shorten_path(&url.scheme, &mut url.path);
1024 if i == segments.len() - 1 {
1025 url.path.push(String::new());
1026 }
1027 } else {
1028 url.path.push(percent_encode(decoded, is_path_encode));
1029 }
1030 }
1031}
1032
1033fn parse_opaque_or_path(url: &mut Url, input: &str) -> Result<()> {
1034 let mut remaining = input;
1035
1036 let path_end = remaining.find(['?', '#']).unwrap_or(remaining.len());
1037 let path_str = &remaining[..path_end];
1038 remaining = &remaining[path_end..];
1039
1040 if path_str.starts_with('/') {
1041 url.opaque_path = false;
1042 parse_path_into(url, path_str, false);
1043 } else {
1044 url.opaque_path = true;
1045 url.path = vec![percent_encode(path_str, is_c0_control)];
1046 }
1047
1048 if let Some(after_q) = remaining.strip_prefix('?') {
1049 remaining = after_q;
1050 let query_end = remaining.find('#').unwrap_or(remaining.len());
1051 let query_str = &remaining[..query_end];
1052 remaining = &remaining[query_end..];
1053 url.query = Some(percent_encode(query_str, is_query_encode));
1054 }
1055
1056 if let Some(after_hash) = remaining.strip_prefix('#') {
1057 url.fragment = Some(percent_encode(after_hash, is_fragment_encode));
1058 }
1059
1060 Ok(())
1061}
1062
1063// ---------------------------------------------------------------------------
1064// Relative URL parsing
1065// ---------------------------------------------------------------------------
1066
1067fn parse_relative(input: &str, base: &Url) -> Result<Url> {
1068 let mut url = Url {
1069 scheme: base.scheme.clone(),
1070 username: base.username.clone(),
1071 password: base.password.clone(),
1072 host: base.host.clone(),
1073 port: base.port,
1074 path: base.path.clone(),
1075 opaque_path: base.opaque_path,
1076 query: base.query.clone(),
1077 fragment: None,
1078 };
1079
1080 let is_special = is_special_scheme(&url.scheme);
1081
1082 if input.is_empty() {
1083 return Ok(url);
1084 }
1085
1086 let chars: Vec<char> = input.chars().collect();
1087
1088 if chars[0] == '/' || (is_special && chars[0] == '\\') {
1089 if input.starts_with("//") || (is_special && input.starts_with("\\/")) {
1090 let after_slashes = &input[2..];
1091 url.username = String::new();
1092 url.password = String::new();
1093 url.path = Vec::new();
1094 url.query = None;
1095 parse_authority_and_path(&mut url, after_slashes, is_special)?;
1096 return Ok(url);
1097 }
1098 url.path = Vec::new();
1099 url.query = None;
1100 parse_path_query_fragment(&mut url, input, is_special)?;
1101 return Ok(url);
1102 }
1103
1104 if let Some(after_q) = input.strip_prefix('?') {
1105 url.query = None;
1106 url.fragment = None;
1107 let query_end = after_q.find('#').unwrap_or(after_q.len());
1108 let query_str = &after_q[..query_end];
1109 let after = &after_q[query_end..];
1110
1111 let encode_fn = if is_special {
1112 is_special_query_encode
1113 } else {
1114 is_query_encode
1115 };
1116 url.query = Some(percent_encode(query_str, encode_fn));
1117
1118 if let Some(frag) = after.strip_prefix('#') {
1119 url.fragment = Some(percent_encode(frag, is_fragment_encode));
1120 }
1121 return Ok(url);
1122 }
1123
1124 if let Some(frag) = input.strip_prefix('#') {
1125 url.fragment = Some(percent_encode(frag, is_fragment_encode));
1126 return Ok(url);
1127 }
1128
1129 // Path-relative.
1130 if !url.opaque_path {
1131 shorten_path(&url.scheme, &mut url.path);
1132 }
1133 url.query = None;
1134 url.fragment = None;
1135
1136 parse_path_query_fragment(&mut url, &format!("/{input}"), is_special)?;
1137 Ok(url)
1138}
1139
1140fn parse_relative_special(remaining: &str, base: &Url, mut url: Url) -> Result<Url> {
1141 url.username = base.username.clone();
1142 url.password = base.password.clone();
1143 url.host = base.host.clone();
1144 url.port = base.port;
1145 url.path = base.path.clone();
1146 url.query = base.query.clone();
1147
1148 let is_special = true;
1149
1150 if remaining.is_empty() {
1151 return Ok(url);
1152 }
1153
1154 if remaining.starts_with('/') || remaining.starts_with('\\') {
1155 url.path = Vec::new();
1156 url.query = None;
1157 parse_path_query_fragment(&mut url, remaining, is_special)?;
1158 return Ok(url);
1159 }
1160
1161 if let Some(rest) = remaining.strip_prefix('?') {
1162 url.query = None;
1163 url.fragment = None;
1164 let query_end = rest.find('#').unwrap_or(rest.len());
1165 url.query = Some(percent_encode(&rest[..query_end], is_special_query_encode));
1166 if query_end < rest.len() {
1167 url.fragment = Some(percent_encode(&rest[query_end + 1..], is_fragment_encode));
1168 }
1169 return Ok(url);
1170 }
1171
1172 if let Some(frag) = remaining.strip_prefix('#') {
1173 url.fragment = Some(percent_encode(frag, is_fragment_encode));
1174 return Ok(url);
1175 }
1176
1177 shorten_path(&url.scheme, &mut url.path);
1178 url.query = None;
1179 parse_path_query_fragment(&mut url, &format!("/{remaining}"), is_special)?;
1180 Ok(url)
1181}
1182
1183// ---------------------------------------------------------------------------
1184// File URL parsing
1185// ---------------------------------------------------------------------------
1186
1187fn parse_file_url(input: &str, base: Option<&Url>, mut url: Url) -> Result<Url> {
1188 url.host = Some(Host::Domain(String::new()));
1189
1190 let remaining = if let Some(after) = input.strip_prefix("//") {
1191 after
1192 } else if let Some(after) = input.strip_prefix('/') {
1193 after
1194 } else if let Some(base) = base {
1195 if base.scheme == "file" {
1196 url.host = base.host.clone();
1197 url.path = base.path.clone();
1198
1199 if let Some(rest) = input.strip_prefix('?') {
1200 url.query = None;
1201 url.fragment = None;
1202 let query_end = rest.find('#').unwrap_or(rest.len());
1203 url.query = Some(percent_encode(&rest[..query_end], is_query_encode));
1204 if query_end < rest.len() {
1205 url.fragment = Some(percent_encode(&rest[query_end + 1..], is_fragment_encode));
1206 }
1207 return Ok(url);
1208 }
1209
1210 if let Some(frag) = input.strip_prefix('#') {
1211 url.fragment = Some(percent_encode(frag, is_fragment_encode));
1212 return Ok(url);
1213 }
1214
1215 shorten_path(&url.scheme, &mut url.path);
1216 url.query = None;
1217 parse_path_query_fragment(&mut url, &format!("/{input}"), false)?;
1218 return Ok(url);
1219 } else {
1220 input
1221 }
1222 } else {
1223 input
1224 };
1225
1226 let path_start = remaining
1227 .find(['/', '\\', '?', '#'])
1228 .unwrap_or(remaining.len());
1229
1230 let potential_host = &remaining[..path_start];
1231 let rest = &remaining[path_start..];
1232
1233 if starts_with_windows_drive_letter(remaining) {
1234 url.host = Some(Host::Domain(String::new()));
1235 parse_path_query_fragment(&mut url, &format!("/{remaining}"), false)?;
1236 return Ok(url);
1237 }
1238
1239 if !potential_host.is_empty() {
1240 let host = parse_host(potential_host, false)?;
1241 if host != Host::Domain(String::new()) {
1242 url.host = Some(host);
1243 }
1244 }
1245
1246 parse_path_query_fragment(&mut url, rest, false)?;
1247
1248 // Normalize Windows drive letters in path.
1249 if let Some(first) = url.path.first_mut() {
1250 if first.len() == 2 {
1251 let bytes = first.as_bytes();
1252 if bytes[0].is_ascii_alphabetic() && bytes[1] == b'|' {
1253 let mut normalized = String::new();
1254 normalized.push(bytes[0] as char);
1255 normalized.push(':');
1256 *first = normalized;
1257 }
1258 }
1259 }
1260
1261 Ok(url)
1262}
1263
1264// ---------------------------------------------------------------------------
1265// Tests
1266// ---------------------------------------------------------------------------
1267
#[cfg(test)]
mod tests {
    use super::*;

    /// Parses `input` as an absolute URL, panicking if the parser rejects it.
    fn must_parse(input: &str) -> Url {
        Url::parse(input).unwrap()
    }

    /// Resolves `input` against the absolute URL `base`, panicking if either
    /// parse fails.
    fn must_resolve(input: &str, base: &str) -> Url {
        let base = must_parse(base);
        Url::parse_with_base(input, &base).unwrap()
    }

    /// Reports whether the origins of the two absolute URLs are same-origin.
    fn origins_match(left: &str, right: &str) -> bool {
        let left = must_parse(left).origin();
        let right = must_parse(right).origin();
        left.same_origin(&right)
    }

    // ===== Basic absolute URL parsing =====

    #[test]
    fn parse_simple_http() {
        let parsed = must_parse("http://example.com");
        assert_eq!(parsed.scheme(), "http");
        assert_eq!(parsed.host_str(), Some("example.com".into()));
        assert_eq!(parsed.port(), None);
        assert_eq!(parsed.path(), "/");
        assert_eq!(parsed.query(), None);
        assert_eq!(parsed.fragment(), None);
    }

    #[test]
    fn parse_https_with_path() {
        let parsed = must_parse("https://example.com/foo/bar");
        assert_eq!(parsed.scheme(), "https");
        assert_eq!(parsed.host_str(), Some("example.com".into()));
        assert_eq!(parsed.path(), "/foo/bar");
    }

    #[test]
    fn parse_full_url() {
        let parsed = must_parse("https://user:pass@example.com:8080/path/to/page?q=1&r=2#frag");
        assert_eq!(parsed.scheme(), "https");
        assert_eq!(parsed.username(), "user");
        assert_eq!(parsed.password(), "pass");
        assert_eq!(parsed.host_str(), Some("example.com".into()));
        assert_eq!(parsed.port(), Some(8080));
        assert_eq!(parsed.path(), "/path/to/page");
        assert_eq!(parsed.query(), Some("q=1&r=2"));
        assert_eq!(parsed.fragment(), Some("frag"));
    }

    #[test]
    fn parse_default_port_omitted() {
        let parsed = must_parse("http://example.com:80/");
        assert_eq!(parsed.port(), None);
        assert_eq!(parsed.port_or_default(), Some(80));
    }

    #[test]
    fn parse_non_default_port() {
        assert_eq!(must_parse("http://example.com:8080/").port(), Some(8080));
    }

    #[test]
    fn parse_https_default_port() {
        assert_eq!(must_parse("https://example.com:443/").port(), None);
    }

    #[test]
    fn parse_ftp_default_port() {
        let parsed = must_parse("ftp://files.example.com:21/readme.txt");
        assert_eq!(parsed.port(), None);
        assert_eq!(parsed.port_or_default(), Some(21));
    }

    // ===== Scheme handling =====

    #[test]
    fn scheme_is_lowercased() {
        assert_eq!(must_parse("HTTP://EXAMPLE.COM").scheme(), "http");
    }

    #[test]
    fn non_special_scheme() {
        let parsed = must_parse("custom://host/path");
        assert_eq!(parsed.scheme(), "custom");
        assert_eq!(parsed.host_str(), Some("host".into()));
        assert_eq!(parsed.path(), "/path");
    }

    #[test]
    fn data_uri() {
        let parsed = must_parse("data:text/html,<h1>Hello</h1>");
        assert_eq!(parsed.scheme(), "data");
        assert!(parsed.cannot_be_a_base());
    }

    #[test]
    fn javascript_uri() {
        let parsed = must_parse("javascript:alert(1)");
        assert_eq!(parsed.scheme(), "javascript");
        assert!(parsed.cannot_be_a_base());
    }

    #[test]
    fn mailto_uri() {
        let parsed = must_parse("mailto:user@example.com");
        assert_eq!(parsed.scheme(), "mailto");
        assert!(parsed.cannot_be_a_base());
    }

    // ===== Host parsing =====

    #[test]
    fn host_is_lowercased() {
        let parsed = must_parse("http://EXAMPLE.COM/");
        assert_eq!(parsed.host_str(), Some("example.com".into()));
    }

    #[test]
    fn ipv4_host() {
        let parsed = must_parse("http://127.0.0.1/");
        assert_eq!(parsed.host(), Some(&Host::Ipv4(0x7F000001)));
        assert_eq!(parsed.host_str(), Some("127.0.0.1".into()));
    }

    #[test]
    fn ipv4_host_all_zeros() {
        let parsed = must_parse("http://0.0.0.0/");
        assert_eq!(parsed.host(), Some(&Host::Ipv4(0)));
    }

    #[test]
    fn ipv6_host() {
        let parsed = must_parse("http://[::1]/");
        assert_eq!(parsed.host(), Some(&Host::Ipv6([0, 0, 0, 0, 0, 0, 0, 1])));
    }

    #[test]
    fn ipv6_full() {
        let parsed = must_parse("http://[2001:db8:85a3:0:0:8a2e:370:7334]/");
        let expected = Host::Ipv6([0x2001, 0x0db8, 0x85a3, 0, 0, 0x8a2e, 0x0370, 0x7334]);
        assert_eq!(parsed.host(), Some(&expected));
    }

    #[test]
    fn ipv6_serialization_compressed() {
        let parsed = must_parse("http://[2001:db8::1]/");
        assert_eq!(parsed.host_str(), Some("[2001:db8::1]".into()));
    }

    #[test]
    fn ipv6_all_zeros() {
        let parsed = must_parse("http://[::]/");
        assert_eq!(parsed.host(), Some(&Host::Ipv6([0; 8])));
        assert_eq!(parsed.host_str(), Some("[::]".into()));
    }

    #[test]
    fn ipv6_loopback() {
        assert_eq!(parse_ipv6("::1").unwrap(), [0, 0, 0, 0, 0, 0, 0, 1]);
    }

    #[test]
    fn ipv6_with_ipv4() {
        assert_eq!(
            parse_ipv6("::ffff:192.168.1.1").unwrap(),
            [0, 0, 0, 0, 0, 0xffff, 0xc0a8, 0x0101]
        );
    }

    // ===== IPv4 parsing =====

    #[test]
    fn ipv4_basic() {
        let address = parse_ipv4("192.168.1.1").unwrap();
        assert_eq!(address, 0xC0A80101);
    }

    #[test]
    fn ipv4_hex() {
        let address = parse_ipv4("0xC0.0xA8.0x01.0x01").unwrap();
        assert_eq!(address, 0xC0A80101);
    }

    #[test]
    fn ipv4_octal() {
        let address = parse_ipv4("0300.0250.01.01").unwrap();
        assert_eq!(address, 0xC0A80101);
    }

    #[test]
    fn ipv4_single_number() {
        let outcome = parse_ipv4("3232235777");
        assert!(outcome.is_err());
    }

    #[test]
    fn ipv4_two_parts() {
        // With two parts the first supplies the top 8 bits and the second the
        // remaining 24: 168 * 65536 + 1 * 256 + 1 = 11010305.
        let address = parse_ipv4("192.11010305").unwrap();
        assert_eq!(address, 0xC0A80101);
    }

    #[test]
    fn ipv4_reject_overflow() {
        let outcome = parse_ipv4("256.0.0.0");
        assert!(outcome.is_err());
    }

    #[test]
    fn ipv4_reject_empty_part() {
        let outcome = parse_ipv4("1..1.1");
        assert!(outcome.is_err());
    }

    // ===== Percent encoding / decoding =====

    #[test]
    fn percent_decode_basic() {
        let decoded = percent_decode_string("%48%65%6C%6C%6F");
        assert_eq!(decoded, "Hello");
    }

    #[test]
    fn percent_decode_mixed() {
        let decoded = percent_decode_string("Hello%20World");
        assert_eq!(decoded, "Hello World");
    }

    #[test]
    fn percent_decode_passthrough() {
        let decoded = percent_decode_string("no-encoding");
        assert_eq!(decoded, "no-encoding");
    }

    #[test]
    fn percent_decode_partial() {
        // Incomplete escapes are left untouched.
        let lone_percent = percent_decode_string("100%");
        assert_eq!(lone_percent, "100%");
        let one_digit = percent_decode_string("%2");
        assert_eq!(one_digit, "%2");
    }

    #[test]
    fn percent_encode_userinfo() {
        assert_eq!(percent_encode("user@host", is_userinfo_encode), "user%40host");
    }

    #[test]
    fn percent_encode_path() {
        assert_eq!(percent_encode("hello world", is_path_encode), "hello%20world");
    }

    // ===== Path parsing and dot segments =====

    #[test]
    fn path_dot_removal() {
        assert_eq!(must_parse("http://example.com/a/b/../c").path(), "/a/c");
    }

    #[test]
    fn path_dot_current() {
        assert_eq!(must_parse("http://example.com/a/./b").path(), "/a/b");
    }

    #[test]
    fn path_multiple_dots() {
        assert_eq!(must_parse("http://example.com/a/b/c/../../d").path(), "/a/d");
    }

    #[test]
    fn path_trailing_slash() {
        assert_eq!(must_parse("http://example.com/a/b/").path(), "/a/b/");
    }

    #[test]
    fn path_empty() {
        assert_eq!(must_parse("http://example.com").path(), "/");
    }

    #[test]
    fn path_double_dot_at_root() {
        assert_eq!(must_parse("http://example.com/../a").path(), "/a");
    }

    // ===== Relative URL resolution =====

    #[test]
    fn relative_path() {
        let resolved = must_resolve("d", "http://example.com/a/b/c");
        assert_eq!(resolved.path(), "/a/b/d");
        assert_eq!(resolved.host_str(), Some("example.com".into()));
    }

    #[test]
    fn relative_path_with_dots() {
        let resolved = must_resolve("../d", "http://example.com/a/b/c");
        assert_eq!(resolved.path(), "/a/d");
    }

    #[test]
    fn relative_absolute_path() {
        let resolved = must_resolve("/d/e", "http://example.com/a/b/c");
        assert_eq!(resolved.path(), "/d/e");
        assert_eq!(resolved.host_str(), Some("example.com".into()));
    }

    #[test]
    fn relative_query_only() {
        let resolved = must_resolve("?new=2", "http://example.com/a/b?old=1");
        assert_eq!(resolved.path(), "/a/b");
        assert_eq!(resolved.query(), Some("new=2"));
    }

    #[test]
    fn relative_fragment_only() {
        let resolved = must_resolve("#new", "http://example.com/a/b#old");
        assert_eq!(resolved.path(), "/a/b");
        assert_eq!(resolved.fragment(), Some("new"));
    }

    #[test]
    fn relative_authority_override() {
        let resolved = must_resolve("//other.com/c", "http://example.com/a/b");
        assert_eq!(resolved.scheme(), "http");
        assert_eq!(resolved.host_str(), Some("other.com".into()));
        assert_eq!(resolved.path(), "/c");
    }

    #[test]
    fn absolute_url_ignores_base() {
        let resolved = must_resolve("https://other.com/b", "http://example.com/a");
        assert_eq!(resolved.scheme(), "https");
        assert_eq!(resolved.host_str(), Some("other.com".into()));
        assert_eq!(resolved.path(), "/b");
    }

    #[test]
    fn relative_empty_string() {
        let resolved = must_resolve("", "http://example.com/a/b?q=1#f");
        assert_eq!(resolved.path(), "/a/b");
        assert_eq!(resolved.query(), Some("q=1"));
        assert_eq!(resolved.fragment(), None);
    }

    // ===== Serialization =====

    #[test]
    fn serialize_simple() {
        let text = must_parse("http://example.com/path").serialize();
        assert_eq!(text, "http://example.com/path");
    }

    #[test]
    fn serialize_with_credentials() {
        let text = must_parse("http://user:pass@example.com/").serialize();
        assert_eq!(text, "http://user:pass@example.com/");
    }

    #[test]
    fn serialize_with_port() {
        let text = must_parse("http://example.com:8080/").serialize();
        assert_eq!(text, "http://example.com:8080/");
    }

    #[test]
    fn serialize_with_query_fragment() {
        let text = must_parse("http://example.com/path?q=1#frag").serialize();
        assert_eq!(text, "http://example.com/path?q=1#frag");
    }

    #[test]
    fn serialize_data_uri() {
        let text = must_parse("data:text/html,hello").serialize();
        assert_eq!(text, "data:text/html,hello");
    }

    #[test]
    fn roundtrip_full_url() {
        let original = "https://user:pass@example.com:8080/a/b?q=1#frag";
        assert_eq!(must_parse(original).serialize(), original);
    }

    #[test]
    fn roundtrip_ipv4() {
        let text = must_parse("http://192.168.1.1/path").serialize();
        assert_eq!(text, "http://192.168.1.1/path");
    }

    #[test]
    fn roundtrip_ipv6() {
        let text = must_parse("http://[::1]/path").serialize();
        assert_eq!(text, "http://[::1]/path");
    }

    // ===== Origin =====

    #[test]
    fn origin_http() {
        if let Origin::Tuple(scheme, host, port) = must_parse("http://example.com:8080/path").origin() {
            assert_eq!(scheme, "http");
            assert_eq!(host, Host::Domain("example.com".into()));
            assert_eq!(port, Some(8080));
        } else {
            panic!("expected tuple origin");
        }
    }

    #[test]
    fn origin_https_default_port() {
        if let Origin::Tuple(scheme, host, port) = must_parse("https://example.com/").origin() {
            assert_eq!(scheme, "https");
            assert_eq!(host, Host::Domain("example.com".into()));
            assert_eq!(port, None);
        } else {
            panic!("expected tuple origin");
        }
    }

    #[test]
    fn origin_data_is_opaque() {
        assert_eq!(must_parse("data:text/html,hello").origin(), Origin::Opaque);
    }

    // ===== Origin::same_origin =====

    #[test]
    fn same_origin_identical_tuple() {
        assert!(origins_match("http://example.com/page1", "http://example.com/page2"));
    }

    #[test]
    fn same_origin_different_path_query_fragment() {
        assert!(origins_match(
            "https://example.com/a?x=1#frag",
            "https://example.com/b?y=2#other"
        ));
    }

    #[test]
    fn same_origin_different_scheme() {
        assert!(!origins_match("http://example.com/", "https://example.com/"));
    }

    #[test]
    fn same_origin_different_host() {
        assert!(!origins_match("http://example.com/", "http://other.com/"));
    }

    #[test]
    fn same_origin_different_port() {
        assert!(!origins_match("http://example.com:8080/", "http://example.com:9090/"));
    }

    #[test]
    fn same_origin_default_port_normalization_http() {
        // An absent port and an explicit :80 must compare equal for http.
        assert!(origins_match("http://example.com/", "http://example.com:80/"));
    }

    #[test]
    fn same_origin_default_port_normalization_https() {
        assert!(origins_match("https://example.com/", "https://example.com:443/"));
    }

    #[test]
    fn same_origin_default_port_normalization_ftp() {
        assert!(origins_match("ftp://example.com/", "ftp://example.com:21/"));
    }

    #[test]
    fn same_origin_non_default_port_vs_none() {
        assert!(!origins_match("http://example.com/", "http://example.com:8080/"));
    }

    #[test]
    fn same_origin_opaque_never_matches() {
        assert!(!origins_match("data:text/html,hello", "data:text/html,hello"));
    }

    #[test]
    fn same_origin_opaque_vs_tuple() {
        assert!(!origins_match("data:text/html,hello", "http://example.com/"));
    }

    #[test]
    fn same_origin_ipv4() {
        assert!(origins_match("http://127.0.0.1/a", "http://127.0.0.1/b"));
    }

    #[test]
    fn same_origin_ipv4_different() {
        assert!(!origins_match("http://127.0.0.1/", "http://192.168.1.1/"));
    }

    // ===== Origin::serialize / Display =====

    #[test]
    fn origin_serialize_http() {
        let origin = must_parse("http://example.com/path").origin();
        assert_eq!(origin.serialize(), "http://example.com");
    }

    #[test]
    fn origin_serialize_https_with_port() {
        let origin = must_parse("https://example.com:8443/").origin();
        assert_eq!(origin.serialize(), "https://example.com:8443");
    }

    #[test]
    fn origin_serialize_default_port_omitted() {
        // The scheme's default port is left out of the serialized origin.
        let origin = must_parse("http://example.com:80/").origin();
        assert_eq!(origin.serialize(), "http://example.com");
    }

    #[test]
    fn origin_serialize_opaque() {
        let origin = must_parse("data:text/html,hi").origin();
        assert_eq!(origin.serialize(), "null");
    }

    #[test]
    fn origin_display() {
        let origin = must_parse("https://example.com/").origin();
        assert_eq!(format!("{}", origin), "https://example.com");
    }

    // ===== File URLs =====

    #[test]
    fn file_url_unix() {
        let parsed = must_parse("file:///home/user/file.txt");
        assert_eq!(parsed.scheme(), "file");
        assert_eq!(parsed.host_str(), Some("".into()));
        assert_eq!(parsed.path(), "/home/user/file.txt");
    }

    #[test]
    fn file_url_windows_drive() {
        let parsed = must_parse("file:///C:/Windows/system32");
        assert_eq!(parsed.scheme(), "file");
        assert_eq!(parsed.path(), "/C:/Windows/system32");
    }

    #[test]
    fn file_url_with_host() {
        let parsed = must_parse("file://server/share/file.txt");
        assert_eq!(parsed.scheme(), "file");
        assert_eq!(parsed.host_str(), Some("server".into()));
        assert_eq!(parsed.path(), "/share/file.txt");
    }

    // ===== Edge cases =====

    #[test]
    fn empty_input_fails() {
        let outcome = Url::parse("");
        assert_eq!(outcome, Err(UrlError::EmptyInput));
    }

    #[test]
    fn whitespace_only_fails() {
        let outcome = Url::parse(" ");
        assert_eq!(outcome, Err(UrlError::EmptyInput));
    }

    #[test]
    fn missing_scheme_fails() {
        let outcome = Url::parse("example.com");
        assert!(outcome.is_err());
    }

    #[test]
    fn leading_whitespace_stripped() {
        let parsed = must_parse(" http://example.com ");
        assert_eq!(parsed.host_str(), Some("example.com".into()));
    }

    #[test]
    fn tab_newline_stripped() {
        let parsed = must_parse("http://exa\tmple\n.com/");
        assert_eq!(parsed.host_str(), Some("example.com".into()));
    }

    #[test]
    fn query_with_special_chars() {
        let parsed = must_parse("http://example.com/?key=val ue&foo=bar");
        let query = parsed.query().unwrap();
        assert!(query.contains("key=val%20ue"));
    }

    #[test]
    fn fragment_with_special_chars() {
        let parsed = must_parse("http://example.com/#sec tion");
        let fragment = parsed.fragment().unwrap();
        assert!(fragment.contains("sec%20tion"));
    }

    #[test]
    fn username_only() {
        let parsed = must_parse("http://user@example.com/");
        assert_eq!(parsed.username(), "user");
        assert_eq!(parsed.password(), "");
        assert!(parsed.has_credentials());
    }

    #[test]
    fn no_credentials() {
        assert!(!must_parse("http://example.com/").has_credentials());
    }

    #[test]
    fn port_overflow_fails() {
        let outcome = Url::parse("http://example.com:99999/");
        assert!(outcome.is_err());
    }

    #[test]
    fn ws_scheme() {
        let parsed = must_parse("ws://example.com/chat");
        assert_eq!(parsed.scheme(), "ws");
        assert_eq!(parsed.port_or_default(), Some(80));
    }

    #[test]
    fn wss_scheme() {
        let parsed = must_parse("wss://example.com/chat");
        assert_eq!(parsed.scheme(), "wss");
        assert_eq!(parsed.port_or_default(), Some(443));
    }

    #[test]
    fn cannot_be_a_base() {
        assert!(must_parse("data:text/html,hello").cannot_be_a_base());
    }

    #[test]
    fn http_can_be_a_base() {
        assert!(!must_parse("http://example.com/").cannot_be_a_base());
    }

    // ===== Display / ToString =====

    #[test]
    fn display_matches_serialize() {
        let parsed = must_parse("https://example.com:8443/path?q=1#f");
        let displayed = format!("{parsed}");
        assert_eq!(displayed, parsed.serialize());
    }

    // ===== Multiple path segments =====

    #[test]
    fn path_segments() {
        let parsed = must_parse("http://example.com/a/b/c");
        assert_eq!(parsed.path_segments(), &["a", "b", "c"]);
    }

    #[test]
    fn path_segments_trailing_slash() {
        let parsed = must_parse("http://example.com/a/b/");
        assert_eq!(parsed.path_segments(), &["a", "b", ""]);
    }

    // ===== Host type =====

    #[test]
    fn host_serialize_domain() {
        let host = Host::Domain("example.com".into());
        assert_eq!(host.serialize(), "example.com");
    }

    #[test]
    fn host_serialize_ipv4() {
        let host = Host::Ipv4(0x7F000001);
        assert_eq!(host.serialize(), "127.0.0.1");
    }

    #[test]
    fn host_serialize_ipv6() {
        let host = Host::Ipv6([0, 0, 0, 0, 0, 0, 0, 1]);
        assert_eq!(host.serialize(), "[::1]");
    }

    // ===== IPv6 serialization =====

    #[test]
    fn ipv6_serialize_full() {
        let groups = [0x2001, 0x0db8, 0x85a3, 0x0001, 0x0002, 0x8a2e, 0x0370, 0x7334];
        let text = serialize_ipv6(&groups);
        assert_eq!(text, "2001:db8:85a3:1:2:8a2e:370:7334");
    }

    #[test]
    fn ipv6_serialize_compress() {
        let groups = [0x2001, 0x0db8, 0, 0, 0, 0, 0, 1];
        let text = serialize_ipv6(&groups);
        assert_eq!(text, "2001:db8::1");
    }

    #[test]
    fn ipv6_serialize_all_zeros() {
        let groups = [0u16; 8];
        let text = serialize_ipv6(&groups);
        assert_eq!(text, "::");
    }

    #[test]
    fn ipv6_serialize_no_compress_single_zero() {
        // Isolated zero groups are written out rather than collapsed to `::`.
        let groups = [1, 0, 2, 0, 3, 0, 4, 0];
        let text = serialize_ipv6(&groups);
        assert_eq!(text, "1:0:2:0:3:0:4:0");
    }

    // ===== Percent encoding edge cases =====

    #[test]
    fn percent_encode_preserves_unreserved() {
        let untouched = "hello-world_test.page~1";
        assert_eq!(percent_encode(untouched, is_path_encode), untouched);
    }

    #[test]
    fn percent_encode_multibyte_utf8() {
        assert_eq!(percent_encode("café", is_path_encode), "caf%C3%A9");
    }
}
2055}