we (web engine): Experimental web browser project to understand the limits of Claude
at data-urls 718 lines 24 kB view raw
1//! Resource loader: fetch URLs and decode their content. 2//! 3//! Brings together `net` (HTTP client), `encoding` (charset detection and decoding), 4//! `url` (URL parsing and resolution), and `image` (image decoding) into a single 5//! `ResourceLoader` that the browser uses to load web pages and subresources. 6 7use std::fmt; 8 9use we_encoding::sniff::sniff_encoding; 10use we_encoding::Encoding; 11use we_net::client::{ClientError, HttpClient}; 12use we_net::http::ContentType; 13use we_url::data_url::{is_data_url, parse_data_url}; 14use we_url::Url; 15 16// --------------------------------------------------------------------------- 17// Error type 18// --------------------------------------------------------------------------- 19 20/// Errors that can occur during resource loading. 21#[derive(Debug)] 22pub enum LoadError { 23 /// URL parsing failed. 24 InvalidUrl(String), 25 /// Network or HTTP error from the underlying client. 26 Network(ClientError), 27 /// HTTP response indicated an error status. 28 HttpStatus { status: u16, reason: String }, 29 /// Encoding or decoding error. 30 Encoding(String), 31} 32 33impl fmt::Display for LoadError { 34 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 35 match self { 36 Self::InvalidUrl(s) => write!(f, "invalid URL: {s}"), 37 Self::Network(e) => write!(f, "network error: {e}"), 38 Self::HttpStatus { status, reason } => { 39 write!(f, "HTTP {status} {reason}") 40 } 41 Self::Encoding(s) => write!(f, "encoding error: {s}"), 42 } 43 } 44} 45 46impl From<ClientError> for LoadError { 47 fn from(e: ClientError) -> Self { 48 Self::Network(e) 49 } 50} 51 52// --------------------------------------------------------------------------- 53// Resource types 54// --------------------------------------------------------------------------- 55 56/// A loaded resource with its decoded content and metadata. 57#[derive(Debug)] 58pub enum Resource { 59 /// An HTML document. 60 Html { 61 text: String, 62 base_url: Url, 63 encoding: Encoding, 64 }, 65 /// A CSS stylesheet. 66 Css { text: String, url: Url }, 67 /// A decoded image. 68 Image { 69 data: Vec<u8>, 70 mime_type: String, 71 url: Url, 72 }, 73 /// Any other resource type (binary). 74 Other { 75 data: Vec<u8>, 76 mime_type: String, 77 url: Url, 78 }, 79} 80 81// --------------------------------------------------------------------------- 82// ResourceLoader 83// --------------------------------------------------------------------------- 84 85/// Loads resources over HTTP/HTTPS with encoding detection and content-type handling. 86pub struct ResourceLoader { 87 client: HttpClient, 88} 89 90impl ResourceLoader { 91 /// Create a new resource loader with default settings. 92 pub fn new() -> Self { 93 Self { 94 client: HttpClient::new(), 95 } 96 } 97 98 /// Fetch a resource at the given URL. 99 /// 100 /// Determines the resource type from the HTTP Content-Type header, decodes 101 /// text resources using the appropriate character encoding (per WHATWG spec), 102 /// and returns the result as a typed `Resource`. 103 /// 104 /// Handles `data:` URLs locally without network access. 105 pub fn fetch(&mut self, url: &Url) -> Result<Resource, LoadError> { 106 // Handle data: URLs without network fetch. 107 if url.scheme() == "data" { 108 return fetch_data_url(&url.serialize()); 109 } 110 111 let response = self.client.get(url)?; 112 113 // Check for HTTP error status codes 114 if response.status_code >= 400 { 115 return Err(LoadError::HttpStatus { 116 status: response.status_code, 117 reason: response.reason.clone(), 118 }); 119 } 120 121 let content_type = response.content_type(); 122 let mime = content_type 123 .as_ref() 124 .map(|ct| ct.mime_type.as_str()) 125 .unwrap_or("application/octet-stream"); 126 127 match classify_mime(mime) { 128 MimeClass::Html => { 129 let (text, encoding) = 130 decode_text_resource(&response.body, content_type.as_ref(), true); 131 Ok(Resource::Html { 132 text, 133 base_url: url.clone(), 134 encoding, 135 }) 136 } 137 MimeClass::Css => { 138 let (text, _encoding) = 139 decode_text_resource(&response.body, content_type.as_ref(), false); 140 Ok(Resource::Css { 141 text, 142 url: url.clone(), 143 }) 144 } 145 MimeClass::Image => Ok(Resource::Image { 146 data: response.body, 147 mime_type: mime.to_string(), 148 url: url.clone(), 149 }), 150 MimeClass::Other => { 151 // Check if it's a text type we should decode 152 if mime.starts_with("text/") { 153 let (text, _encoding) = 154 decode_text_resource(&response.body, content_type.as_ref(), false); 155 Ok(Resource::Other { 156 data: text.into_bytes(), 157 mime_type: mime.to_string(), 158 url: url.clone(), 159 }) 160 } else { 161 Ok(Resource::Other { 162 data: response.body, 163 mime_type: mime.to_string(), 164 url: url.clone(), 165 }) 166 } 167 } 168 } 169 } 170 171 /// Fetch a URL string, resolving it against an optional base URL. 172 /// 173 /// Handles `data:` URLs locally without network access. 174 pub fn fetch_url(&mut self, url_str: &str, base: Option<&Url>) -> Result<Resource, LoadError> { 175 // Handle data URLs directly — no network fetch needed. 176 if is_data_url(url_str) { 177 return fetch_data_url(url_str); 178 } 179 180 let url = match base { 181 Some(base_url) => Url::parse_with_base(url_str, base_url) 182 .or_else(|_| Url::parse(url_str)) 183 .map_err(|_| LoadError::InvalidUrl(url_str.to_string()))?, 184 None => Url::parse(url_str).map_err(|_| LoadError::InvalidUrl(url_str.to_string()))?, 185 }; 186 self.fetch(&url) 187 } 188} 189 190impl Default for ResourceLoader { 191 fn default() -> Self { 192 Self::new() 193 } 194} 195 196// --------------------------------------------------------------------------- 197// MIME classification 198// --------------------------------------------------------------------------- 199 200enum MimeClass { 201 Html, 202 Css, 203 Image, 204 Other, 205} 206 207fn classify_mime(mime: &str) -> MimeClass { 208 match mime { 209 "text/html" | "application/xhtml+xml" => MimeClass::Html, 210 "text/css" => MimeClass::Css, 211 "image/png" | "image/jpeg" | "image/gif" | "image/webp" | "image/svg+xml" => { 212 MimeClass::Image 213 } 214 _ => MimeClass::Other, 215 } 216} 217 218// --------------------------------------------------------------------------- 219// Text decoding 220// --------------------------------------------------------------------------- 221 222/// Decode a text resource's bytes to a String using WHATWG encoding sniffing. 223/// 224/// For HTML resources, uses BOM > HTTP charset > meta prescan > default. 225/// For non-HTML text resources, uses BOM > HTTP charset > default (UTF-8). 226fn decode_text_resource( 227 bytes: &[u8], 228 content_type: Option<&ContentType>, 229 is_html: bool, 230) -> (String, Encoding) { 231 let http_ct_value = content_type.map(|ct| { 232 // Reconstruct a Content-Type header value for the sniffing function 233 match &ct.charset { 234 Some(charset) => format!("{}; charset={}", ct.mime_type, charset), 235 None => ct.mime_type.clone(), 236 } 237 }); 238 239 if is_html { 240 // Full WHATWG sniffing: BOM > HTTP > meta prescan > default (Windows-1252) 241 let (encoding, _source) = sniff_encoding(bytes, http_ct_value.as_deref()); 242 let text = decode_with_bom_handling(bytes, encoding); 243 (text, encoding) 244 } else { 245 // Non-HTML: BOM > HTTP charset > default (UTF-8) 246 let (bom_enc, after_bom) = we_encoding::bom_sniff(bytes); 247 if let Some(enc) = bom_enc { 248 let text = we_encoding::decode(after_bom, enc); 249 return (text, enc); 250 } 251 252 // Try HTTP charset 253 if let Some(charset) = content_type.and_then(|ct| ct.charset.as_deref()) { 254 if let Some(enc) = we_encoding::lookup(charset) { 255 let text = we_encoding::decode(bytes, enc); 256 return (text, enc); 257 } 258 } 259 260 // Default to UTF-8 for non-HTML text 261 let text = we_encoding::decode(bytes, Encoding::Utf8); 262 (text, Encoding::Utf8) 263 } 264} 265 266/// Decode bytes with BOM handling — strip BOM bytes before decoding. 267fn decode_with_bom_handling(bytes: &[u8], encoding: Encoding) -> String { 268 let (bom_enc, after_bom) = we_encoding::bom_sniff(bytes); 269 if bom_enc.is_some() { 270 // BOM was present — decode the bytes after the BOM 271 we_encoding::decode(after_bom, encoding) 272 } else { 273 we_encoding::decode(bytes, encoding) 274 } 275} 276 277// --------------------------------------------------------------------------- 278// Data URL handling 279// --------------------------------------------------------------------------- 280 281/// Fetch a data URL, decoding its payload and returning the appropriate Resource type. 282fn fetch_data_url(url_str: &str) -> Result<Resource, LoadError> { 283 let parsed = parse_data_url(url_str) 284 .map_err(|e| LoadError::InvalidUrl(format!("data URL error: {e}")))?; 285 286 let mime = &parsed.mime_type; 287 288 // Create a synthetic Url for the resource metadata. 289 let url = Url::parse(url_str).map_err(|_| LoadError::InvalidUrl(url_str.to_string()))?; 290 291 match classify_mime(mime) { 292 MimeClass::Html => { 293 let encoding = charset_to_encoding(parsed.charset.as_deref()); 294 let text = we_encoding::decode(&parsed.data, encoding); 295 Ok(Resource::Html { 296 text, 297 base_url: url, 298 encoding, 299 }) 300 } 301 MimeClass::Css => { 302 let encoding = charset_to_encoding(parsed.charset.as_deref()); 303 let text = we_encoding::decode(&parsed.data, encoding); 304 Ok(Resource::Css { text, url }) 305 } 306 MimeClass::Image => Ok(Resource::Image { 307 data: parsed.data, 308 mime_type: mime.to_string(), 309 url, 310 }), 311 MimeClass::Other => { 312 if mime.starts_with("text/") { 313 let encoding = charset_to_encoding(parsed.charset.as_deref()); 314 let text = we_encoding::decode(&parsed.data, encoding); 315 Ok(Resource::Other { 316 data: text.into_bytes(), 317 mime_type: mime.to_string(), 318 url, 319 }) 320 } else { 321 Ok(Resource::Other { 322 data: parsed.data, 323 mime_type: mime.to_string(), 324 url, 325 }) 326 } 327 } 328 } 329} 330 331/// Map a charset name to an Encoding, defaulting to UTF-8. 332fn charset_to_encoding(charset: Option<&str>) -> Encoding { 333 charset 334 .and_then(we_encoding::lookup) 335 .unwrap_or(Encoding::Utf8) 336} 337 338// --------------------------------------------------------------------------- 339// Tests 340// --------------------------------------------------------------------------- 341 342#[cfg(test)] 343mod tests { 344 use super::*; 345 346 // ----------------------------------------------------------------------- 347 // LoadError Display 348 // ----------------------------------------------------------------------- 349 350 #[test] 351 fn load_error_display_invalid_url() { 352 let e = LoadError::InvalidUrl("bad://url".to_string()); 353 assert_eq!(e.to_string(), "invalid URL: bad://url"); 354 } 355 356 #[test] 357 fn load_error_display_http_status() { 358 let e = LoadError::HttpStatus { 359 status: 404, 360 reason: "Not Found".to_string(), 361 }; 362 assert_eq!(e.to_string(), "HTTP 404 Not Found"); 363 } 364 365 #[test] 366 fn load_error_display_encoding() { 367 let e = LoadError::Encoding("bad charset".to_string()); 368 assert_eq!(e.to_string(), "encoding error: bad charset"); 369 } 370 371 // ----------------------------------------------------------------------- 372 // MIME classification 373 // ----------------------------------------------------------------------- 374 375 #[test] 376 fn classify_text_html() { 377 assert!(matches!(classify_mime("text/html"), MimeClass::Html)); 378 } 379 380 #[test] 381 fn classify_xhtml() { 382 assert!(matches!( 383 classify_mime("application/xhtml+xml"), 384 MimeClass::Html 385 )); 386 } 387 388 #[test] 389 fn classify_text_css() { 390 assert!(matches!(classify_mime("text/css"), MimeClass::Css)); 391 } 392 393 #[test] 394 fn classify_image_png() { 395 assert!(matches!(classify_mime("image/png"), MimeClass::Image)); 396 } 397 398 #[test] 399 fn classify_image_jpeg() { 400 assert!(matches!(classify_mime("image/jpeg"), MimeClass::Image)); 401 } 402 403 #[test] 404 fn classify_image_gif() { 405 assert!(matches!(classify_mime("image/gif"), MimeClass::Image)); 406 } 407 408 #[test] 409 fn classify_application_json() { 410 assert!(matches!( 411 classify_mime("application/json"), 412 MimeClass::Other 413 )); 414 } 415 416 #[test] 417 fn classify_text_plain() { 418 assert!(matches!(classify_mime("text/plain"), MimeClass::Other)); 419 } 420 421 #[test] 422 fn classify_octet_stream() { 423 assert!(matches!( 424 classify_mime("application/octet-stream"), 425 MimeClass::Other 426 )); 427 } 428 429 // ----------------------------------------------------------------------- 430 // Text decoding — HTML 431 // ----------------------------------------------------------------------- 432 433 #[test] 434 fn decode_html_utf8_bom() { 435 let bytes = b"\xEF\xBB\xBF<html>Hello</html>"; 436 let (text, enc) = decode_text_resource(bytes, None, true); 437 assert_eq!(enc, Encoding::Utf8); 438 assert_eq!(text, "<html>Hello</html>"); 439 } 440 441 #[test] 442 fn decode_html_utf8_from_http_charset() { 443 let ct = ContentType { 444 mime_type: "text/html".to_string(), 445 charset: Some("utf-8".to_string()), 446 }; 447 let bytes = b"<html>Hello</html>"; 448 let (text, enc) = decode_text_resource(bytes, Some(&ct), true); 449 assert_eq!(enc, Encoding::Utf8); 450 assert_eq!(text, "<html>Hello</html>"); 451 } 452 453 #[test] 454 fn decode_html_meta_charset() { 455 let html = b"<meta charset=\"utf-8\"><html>Hello</html>"; 456 let (text, enc) = decode_text_resource(html, None, true); 457 assert_eq!(enc, Encoding::Utf8); 458 assert!(text.contains("Hello")); 459 } 460 461 #[test] 462 fn decode_html_default_windows_1252() { 463 let bytes = b"<html>Hello</html>"; 464 let (text, enc) = decode_text_resource(bytes, None, true); 465 assert_eq!(enc, Encoding::Windows1252); 466 assert!(text.contains("Hello")); 467 } 468 469 #[test] 470 fn decode_html_windows_1252_special_chars() { 471 // \x93 and \x94 are left/right double quotation marks in Windows-1252 472 let bytes = b"<html>\x93Hello\x94</html>"; 473 let (text, enc) = decode_text_resource(bytes, None, true); 474 assert_eq!(enc, Encoding::Windows1252); 475 assert!(text.contains('\u{201C}')); // left double quote 476 assert!(text.contains('\u{201D}')); // right double quote 477 } 478 479 #[test] 480 fn decode_html_bom_beats_http_charset() { 481 let ct = ContentType { 482 mime_type: "text/html".to_string(), 483 charset: Some("windows-1252".to_string()), 484 }; 485 let mut bytes = vec![0xEF, 0xBB, 0xBF]; 486 bytes.extend_from_slice(b"<html>Hello</html>"); 487 let (text, enc) = decode_text_resource(&bytes, Some(&ct), true); 488 assert_eq!(enc, Encoding::Utf8); 489 assert_eq!(text, "<html>Hello</html>"); 490 } 491 492 // ----------------------------------------------------------------------- 493 // Text decoding — non-HTML (CSS, etc.) 494 // ----------------------------------------------------------------------- 495 496 #[test] 497 fn decode_css_utf8_default() { 498 let bytes = b"body { color: red; }"; 499 let (text, enc) = decode_text_resource(bytes, None, false); 500 assert_eq!(enc, Encoding::Utf8); 501 assert_eq!(text, "body { color: red; }"); 502 } 503 504 #[test] 505 fn decode_css_bom_utf8() { 506 let bytes = b"\xEF\xBB\xBFbody { color: red; }"; 507 let (text, enc) = decode_text_resource(bytes, None, false); 508 assert_eq!(enc, Encoding::Utf8); 509 assert_eq!(text, "body { color: red; }"); 510 } 511 512 #[test] 513 fn decode_css_http_charset() { 514 let ct = ContentType { 515 mime_type: "text/css".to_string(), 516 charset: Some("utf-8".to_string()), 517 }; 518 let bytes = b"body { color: red; }"; 519 let (text, enc) = decode_text_resource(bytes, Some(&ct), false); 520 assert_eq!(enc, Encoding::Utf8); 521 assert_eq!(text, "body { color: red; }"); 522 } 523 524 // ----------------------------------------------------------------------- 525 // BOM handling 526 // ----------------------------------------------------------------------- 527 528 #[test] 529 fn decode_with_bom_strips_utf8_bom() { 530 let bytes = b"\xEF\xBB\xBFHello"; 531 let text = decode_with_bom_handling(bytes, Encoding::Utf8); 532 assert_eq!(text, "Hello"); 533 } 534 535 #[test] 536 fn decode_without_bom_passes_through() { 537 let bytes = b"Hello"; 538 let text = decode_with_bom_handling(bytes, Encoding::Utf8); 539 assert_eq!(text, "Hello"); 540 } 541 542 #[test] 543 fn decode_with_utf16le_bom() { 544 let bytes = b"\xFF\xFEH\x00e\x00l\x00l\x00o\x00"; 545 let text = decode_with_bom_handling(bytes, Encoding::Utf16Le); 546 assert_eq!(text, "Hello"); 547 } 548 549 #[test] 550 fn decode_with_utf16be_bom() { 551 let bytes = b"\xFE\xFF\x00H\x00e\x00l\x00l\x00o"; 552 let text = decode_with_bom_handling(bytes, Encoding::Utf16Be); 553 assert_eq!(text, "Hello"); 554 } 555 556 // ----------------------------------------------------------------------- 557 // ResourceLoader construction 558 // ----------------------------------------------------------------------- 559 560 #[test] 561 fn resource_loader_new() { 562 let _loader = ResourceLoader::new(); 563 } 564 565 #[test] 566 fn resource_loader_default() { 567 let _loader = ResourceLoader::default(); 568 } 569 570 // ----------------------------------------------------------------------- 571 // URL resolution 572 // ----------------------------------------------------------------------- 573 574 #[test] 575 fn fetch_url_invalid_url_error() { 576 let mut loader = ResourceLoader::new(); 577 let result = loader.fetch_url("not a url at all", None); 578 assert!(matches!(result, Err(LoadError::InvalidUrl(_)))); 579 } 580 581 #[test] 582 fn fetch_url_relative_without_base_errors() { 583 let mut loader = ResourceLoader::new(); 584 let result = loader.fetch_url("/relative/path", None); 585 assert!(matches!(result, Err(LoadError::InvalidUrl(_)))); 586 } 587 588 #[test] 589 fn fetch_url_relative_with_base_resolves() { 590 let mut loader = ResourceLoader::new(); 591 let base = Url::parse("http://example.com/page").unwrap(); 592 // This will fail since we can't actually connect in tests, 593 // but the URL resolution itself should work (it won't be InvalidUrl). 594 let result = loader.fetch_url("/style.css", Some(&base)); 595 assert!(result.is_err()); 596 // The error should NOT be InvalidUrl — the URL resolved successfully. 597 assert!(!matches!(result, Err(LoadError::InvalidUrl(_)))); 598 } 599 600 // ----------------------------------------------------------------------- 601 // Data URL loading 602 // ----------------------------------------------------------------------- 603 604 #[test] 605 fn data_url_plain_text() { 606 let mut loader = ResourceLoader::new(); 607 let result = loader.fetch_url("data:text/plain,Hello%20World", None); 608 assert!(result.is_ok()); 609 match result.unwrap() { 610 Resource::Other { 611 data, mime_type, .. 612 } => { 613 assert_eq!(mime_type, "text/plain"); 614 assert_eq!(String::from_utf8(data).unwrap(), "Hello World"); 615 } 616 other => panic!("expected Other, got {:?}", other), 617 } 618 } 619 620 #[test] 621 fn data_url_html() { 622 let mut loader = ResourceLoader::new(); 623 let result = loader.fetch_url("data:text/html,<h1>Hello</h1>", None); 624 assert!(result.is_ok()); 625 match result.unwrap() { 626 Resource::Html { text, .. } => { 627 assert_eq!(text, "<h1>Hello</h1>"); 628 } 629 other => panic!("expected Html, got {:?}", other), 630 } 631 } 632 633 #[test] 634 fn data_url_css() { 635 let mut loader = ResourceLoader::new(); 636 let result = loader.fetch_url("data:text/css,body{color:red}", None); 637 assert!(result.is_ok()); 638 match result.unwrap() { 639 Resource::Css { text, .. } => { 640 assert_eq!(text, "body{color:red}"); 641 } 642 other => panic!("expected Css, got {:?}", other), 643 } 644 } 645 646 #[test] 647 fn data_url_image() { 648 let mut loader = ResourceLoader::new(); 649 let result = loader.fetch_url("data:image/png;base64,/wCq", None); 650 assert!(result.is_ok()); 651 match result.unwrap() { 652 Resource::Image { 653 data, mime_type, .. 654 } => { 655 assert_eq!(mime_type, "image/png"); 656 assert_eq!(data, vec![0xFF, 0x00, 0xAA]); 657 } 658 other => panic!("expected Image, got {:?}", other), 659 } 660 } 661 662 #[test] 663 fn data_url_base64() { 664 let mut loader = ResourceLoader::new(); 665 let result = loader.fetch_url("data:text/plain;base64,SGVsbG8=", None); 666 assert!(result.is_ok()); 667 match result.unwrap() { 668 Resource::Other { data, .. } => { 669 assert_eq!(String::from_utf8(data).unwrap(), "Hello"); 670 } 671 other => panic!("expected Other, got {:?}", other), 672 } 673 } 674 675 #[test] 676 fn data_url_empty() { 677 let mut loader = ResourceLoader::new(); 678 let result = loader.fetch_url("data:,", None); 679 assert!(result.is_ok()); 680 } 681 682 #[test] 683 fn data_url_via_fetch_method() { 684 let mut loader = ResourceLoader::new(); 685 let url = Url::parse("data:text/plain,Hello").unwrap(); 686 let result = loader.fetch(&url); 687 assert!(result.is_ok()); 688 match result.unwrap() { 689 Resource::Other { data, .. } => { 690 assert_eq!(String::from_utf8(data).unwrap(), "Hello"); 691 } 692 other => panic!("expected Other, got {:?}", other), 693 } 694 } 695 696 #[test] 697 fn data_url_invalid() { 698 let mut loader = ResourceLoader::new(); 699 let result = loader.fetch_url("data:text/plain", None); 700 assert!(matches!(result, Err(LoadError::InvalidUrl(_)))); 701 } 702 703 #[test] 704 fn data_url_binary() { 705 let mut loader = ResourceLoader::new(); 706 let result = loader.fetch_url("data:application/octet-stream;base64,/wCq", None); 707 assert!(result.is_ok()); 708 match result.unwrap() { 709 Resource::Other { 710 data, mime_type, .. 711 } => { 712 assert_eq!(mime_type, "application/octet-stream"); 713 assert_eq!(data, vec![0xFF, 0x00, 0xAA]); 714 } 715 other => panic!("expected Other, got {:?}", other), 716 } 717 } 718}