we (web engine): Experimental web browser project to understand the limits of Claude
at resource-loader 523 lines 17 kB view raw
1//! Resource loader: fetch URLs and decode their content. 2//! 3//! Brings together `net` (HTTP client), `encoding` (charset detection and decoding), 4//! `url` (URL parsing and resolution), and `image` (image decoding) into a single 5//! `ResourceLoader` that the browser uses to load web pages and subresources. 6 7use std::fmt; 8 9use we_encoding::sniff::sniff_encoding; 10use we_encoding::Encoding; 11use we_net::client::{ClientError, HttpClient}; 12use we_net::http::ContentType; 13use we_url::Url; 14 15// --------------------------------------------------------------------------- 16// Error type 17// --------------------------------------------------------------------------- 18 19/// Errors that can occur during resource loading. 20#[derive(Debug)] 21pub enum LoadError { 22 /// URL parsing failed. 23 InvalidUrl(String), 24 /// Network or HTTP error from the underlying client. 25 Network(ClientError), 26 /// HTTP response indicated an error status. 27 HttpStatus { status: u16, reason: String }, 28 /// Encoding or decoding error. 29 Encoding(String), 30} 31 32impl fmt::Display for LoadError { 33 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 34 match self { 35 Self::InvalidUrl(s) => write!(f, "invalid URL: {s}"), 36 Self::Network(e) => write!(f, "network error: {e}"), 37 Self::HttpStatus { status, reason } => { 38 write!(f, "HTTP {status} {reason}") 39 } 40 Self::Encoding(s) => write!(f, "encoding error: {s}"), 41 } 42 } 43} 44 45impl From<ClientError> for LoadError { 46 fn from(e: ClientError) -> Self { 47 Self::Network(e) 48 } 49} 50 51// --------------------------------------------------------------------------- 52// Resource types 53// --------------------------------------------------------------------------- 54 55/// A loaded resource with its decoded content and metadata. 56#[derive(Debug)] 57pub enum Resource { 58 /// An HTML document. 59 Html { 60 text: String, 61 base_url: Url, 62 encoding: Encoding, 63 }, 64 /// A CSS stylesheet. 65 Css { text: String, url: Url }, 66 /// A decoded image. 67 Image { 68 data: Vec<u8>, 69 mime_type: String, 70 url: Url, 71 }, 72 /// Any other resource type (binary). 73 Other { 74 data: Vec<u8>, 75 mime_type: String, 76 url: Url, 77 }, 78} 79 80// --------------------------------------------------------------------------- 81// ResourceLoader 82// --------------------------------------------------------------------------- 83 84/// Loads resources over HTTP/HTTPS with encoding detection and content-type handling. 85pub struct ResourceLoader { 86 client: HttpClient, 87} 88 89impl ResourceLoader { 90 /// Create a new resource loader with default settings. 91 pub fn new() -> Self { 92 Self { 93 client: HttpClient::new(), 94 } 95 } 96 97 /// Fetch a resource at the given URL. 98 /// 99 /// Determines the resource type from the HTTP Content-Type header, decodes 100 /// text resources using the appropriate character encoding (per WHATWG spec), 101 /// and returns the result as a typed `Resource`. 102 pub fn fetch(&mut self, url: &Url) -> Result<Resource, LoadError> { 103 let response = self.client.get(url)?; 104 105 // Check for HTTP error status codes 106 if response.status_code >= 400 { 107 return Err(LoadError::HttpStatus { 108 status: response.status_code, 109 reason: response.reason.clone(), 110 }); 111 } 112 113 let content_type = response.content_type(); 114 let mime = content_type 115 .as_ref() 116 .map(|ct| ct.mime_type.as_str()) 117 .unwrap_or("application/octet-stream"); 118 119 match classify_mime(mime) { 120 MimeClass::Html => { 121 let (text, encoding) = 122 decode_text_resource(&response.body, content_type.as_ref(), true); 123 Ok(Resource::Html { 124 text, 125 base_url: url.clone(), 126 encoding, 127 }) 128 } 129 MimeClass::Css => { 130 let (text, _encoding) = 131 decode_text_resource(&response.body, content_type.as_ref(), false); 132 Ok(Resource::Css { 133 text, 134 url: url.clone(), 135 }) 136 } 137 MimeClass::Image => Ok(Resource::Image { 138 data: response.body, 139 mime_type: mime.to_string(), 140 url: url.clone(), 141 }), 142 MimeClass::Other => { 143 // Check if it's a text type we should decode 144 if mime.starts_with("text/") { 145 let (text, _encoding) = 146 decode_text_resource(&response.body, content_type.as_ref(), false); 147 Ok(Resource::Other { 148 data: text.into_bytes(), 149 mime_type: mime.to_string(), 150 url: url.clone(), 151 }) 152 } else { 153 Ok(Resource::Other { 154 data: response.body, 155 mime_type: mime.to_string(), 156 url: url.clone(), 157 }) 158 } 159 } 160 } 161 } 162 163 /// Fetch a URL string, resolving it against an optional base URL. 164 pub fn fetch_url(&mut self, url_str: &str, base: Option<&Url>) -> Result<Resource, LoadError> { 165 let url = match base { 166 Some(base_url) => Url::parse_with_base(url_str, base_url) 167 .or_else(|_| Url::parse(url_str)) 168 .map_err(|_| LoadError::InvalidUrl(url_str.to_string()))?, 169 None => Url::parse(url_str).map_err(|_| LoadError::InvalidUrl(url_str.to_string()))?, 170 }; 171 self.fetch(&url) 172 } 173} 174 175impl Default for ResourceLoader { 176 fn default() -> Self { 177 Self::new() 178 } 179} 180 181// --------------------------------------------------------------------------- 182// MIME classification 183// --------------------------------------------------------------------------- 184 185enum MimeClass { 186 Html, 187 Css, 188 Image, 189 Other, 190} 191 192fn classify_mime(mime: &str) -> MimeClass { 193 match mime { 194 "text/html" | "application/xhtml+xml" => MimeClass::Html, 195 "text/css" => MimeClass::Css, 196 "image/png" | "image/jpeg" | "image/gif" | "image/webp" | "image/svg+xml" => { 197 MimeClass::Image 198 } 199 _ => MimeClass::Other, 200 } 201} 202 203// --------------------------------------------------------------------------- 204// Text decoding 205// --------------------------------------------------------------------------- 206 207/// Decode a text resource's bytes to a String using WHATWG encoding sniffing. 208/// 209/// For HTML resources, uses BOM > HTTP charset > meta prescan > default. 210/// For non-HTML text resources, uses BOM > HTTP charset > default (UTF-8). 211fn decode_text_resource( 212 bytes: &[u8], 213 content_type: Option<&ContentType>, 214 is_html: bool, 215) -> (String, Encoding) { 216 let http_ct_value = content_type.map(|ct| { 217 // Reconstruct a Content-Type header value for the sniffing function 218 match &ct.charset { 219 Some(charset) => format!("{}; charset={}", ct.mime_type, charset), 220 None => ct.mime_type.clone(), 221 } 222 }); 223 224 if is_html { 225 // Full WHATWG sniffing: BOM > HTTP > meta prescan > default (Windows-1252) 226 let (encoding, _source) = sniff_encoding(bytes, http_ct_value.as_deref()); 227 let text = decode_with_bom_handling(bytes, encoding); 228 (text, encoding) 229 } else { 230 // Non-HTML: BOM > HTTP charset > default (UTF-8) 231 let (bom_enc, after_bom) = we_encoding::bom_sniff(bytes); 232 if let Some(enc) = bom_enc { 233 let text = we_encoding::decode(after_bom, enc); 234 return (text, enc); 235 } 236 237 // Try HTTP charset 238 if let Some(charset) = content_type.and_then(|ct| ct.charset.as_deref()) { 239 if let Some(enc) = we_encoding::lookup(charset) { 240 let text = we_encoding::decode(bytes, enc); 241 return (text, enc); 242 } 243 } 244 245 // Default to UTF-8 for non-HTML text 246 let text = we_encoding::decode(bytes, Encoding::Utf8); 247 (text, Encoding::Utf8) 248 } 249} 250 251/// Decode bytes with BOM handling — strip BOM bytes before decoding. 252fn decode_with_bom_handling(bytes: &[u8], encoding: Encoding) -> String { 253 let (bom_enc, after_bom) = we_encoding::bom_sniff(bytes); 254 if bom_enc.is_some() { 255 // BOM was present — decode the bytes after the BOM 256 we_encoding::decode(after_bom, encoding) 257 } else { 258 we_encoding::decode(bytes, encoding) 259 } 260} 261 262// --------------------------------------------------------------------------- 263// Tests 264// --------------------------------------------------------------------------- 265 266#[cfg(test)] 267mod tests { 268 use super::*; 269 270 // ----------------------------------------------------------------------- 271 // LoadError Display 272 // ----------------------------------------------------------------------- 273 274 #[test] 275 fn load_error_display_invalid_url() { 276 let e = LoadError::InvalidUrl("bad://url".to_string()); 277 assert_eq!(e.to_string(), "invalid URL: bad://url"); 278 } 279 280 #[test] 281 fn load_error_display_http_status() { 282 let e = LoadError::HttpStatus { 283 status: 404, 284 reason: "Not Found".to_string(), 285 }; 286 assert_eq!(e.to_string(), "HTTP 404 Not Found"); 287 } 288 289 #[test] 290 fn load_error_display_encoding() { 291 let e = LoadError::Encoding("bad charset".to_string()); 292 assert_eq!(e.to_string(), "encoding error: bad charset"); 293 } 294 295 // ----------------------------------------------------------------------- 296 // MIME classification 297 // ----------------------------------------------------------------------- 298 299 #[test] 300 fn classify_text_html() { 301 assert!(matches!(classify_mime("text/html"), MimeClass::Html)); 302 } 303 304 #[test] 305 fn classify_xhtml() { 306 assert!(matches!( 307 classify_mime("application/xhtml+xml"), 308 MimeClass::Html 309 )); 310 } 311 312 #[test] 313 fn classify_text_css() { 314 assert!(matches!(classify_mime("text/css"), MimeClass::Css)); 315 } 316 317 #[test] 318 fn classify_image_png() { 319 assert!(matches!(classify_mime("image/png"), MimeClass::Image)); 320 } 321 322 #[test] 323 fn classify_image_jpeg() { 324 assert!(matches!(classify_mime("image/jpeg"), MimeClass::Image)); 325 } 326 327 #[test] 328 fn classify_image_gif() { 329 assert!(matches!(classify_mime("image/gif"), MimeClass::Image)); 330 } 331 332 #[test] 333 fn classify_application_json() { 334 assert!(matches!( 335 classify_mime("application/json"), 336 MimeClass::Other 337 )); 338 } 339 340 #[test] 341 fn classify_text_plain() { 342 assert!(matches!(classify_mime("text/plain"), MimeClass::Other)); 343 } 344 345 #[test] 346 fn classify_octet_stream() { 347 assert!(matches!( 348 classify_mime("application/octet-stream"), 349 MimeClass::Other 350 )); 351 } 352 353 // ----------------------------------------------------------------------- 354 // Text decoding — HTML 355 // ----------------------------------------------------------------------- 356 357 #[test] 358 fn decode_html_utf8_bom() { 359 let bytes = b"\xEF\xBB\xBF<html>Hello</html>"; 360 let (text, enc) = decode_text_resource(bytes, None, true); 361 assert_eq!(enc, Encoding::Utf8); 362 assert_eq!(text, "<html>Hello</html>"); 363 } 364 365 #[test] 366 fn decode_html_utf8_from_http_charset() { 367 let ct = ContentType { 368 mime_type: "text/html".to_string(), 369 charset: Some("utf-8".to_string()), 370 }; 371 let bytes = b"<html>Hello</html>"; 372 let (text, enc) = decode_text_resource(bytes, Some(&ct), true); 373 assert_eq!(enc, Encoding::Utf8); 374 assert_eq!(text, "<html>Hello</html>"); 375 } 376 377 #[test] 378 fn decode_html_meta_charset() { 379 let html = b"<meta charset=\"utf-8\"><html>Hello</html>"; 380 let (text, enc) = decode_text_resource(html, None, true); 381 assert_eq!(enc, Encoding::Utf8); 382 assert!(text.contains("Hello")); 383 } 384 385 #[test] 386 fn decode_html_default_windows_1252() { 387 let bytes = b"<html>Hello</html>"; 388 let (text, enc) = decode_text_resource(bytes, None, true); 389 assert_eq!(enc, Encoding::Windows1252); 390 assert!(text.contains("Hello")); 391 } 392 393 #[test] 394 fn decode_html_windows_1252_special_chars() { 395 // \x93 and \x94 are left/right double quotation marks in Windows-1252 396 let bytes = b"<html>\x93Hello\x94</html>"; 397 let (text, enc) = decode_text_resource(bytes, None, true); 398 assert_eq!(enc, Encoding::Windows1252); 399 assert!(text.contains('\u{201C}')); // left double quote 400 assert!(text.contains('\u{201D}')); // right double quote 401 } 402 403 #[test] 404 fn decode_html_bom_beats_http_charset() { 405 let ct = ContentType { 406 mime_type: "text/html".to_string(), 407 charset: Some("windows-1252".to_string()), 408 }; 409 let mut bytes = vec![0xEF, 0xBB, 0xBF]; 410 bytes.extend_from_slice(b"<html>Hello</html>"); 411 let (text, enc) = decode_text_resource(&bytes, Some(&ct), true); 412 assert_eq!(enc, Encoding::Utf8); 413 assert_eq!(text, "<html>Hello</html>"); 414 } 415 416 // ----------------------------------------------------------------------- 417 // Text decoding — non-HTML (CSS, etc.) 418 // ----------------------------------------------------------------------- 419 420 #[test] 421 fn decode_css_utf8_default() { 422 let bytes = b"body { color: red; }"; 423 let (text, enc) = decode_text_resource(bytes, None, false); 424 assert_eq!(enc, Encoding::Utf8); 425 assert_eq!(text, "body { color: red; }"); 426 } 427 428 #[test] 429 fn decode_css_bom_utf8() { 430 let bytes = b"\xEF\xBB\xBFbody { color: red; }"; 431 let (text, enc) = decode_text_resource(bytes, None, false); 432 assert_eq!(enc, Encoding::Utf8); 433 assert_eq!(text, "body { color: red; }"); 434 } 435 436 #[test] 437 fn decode_css_http_charset() { 438 let ct = ContentType { 439 mime_type: "text/css".to_string(), 440 charset: Some("utf-8".to_string()), 441 }; 442 let bytes = b"body { color: red; }"; 443 let (text, enc) = decode_text_resource(bytes, Some(&ct), false); 444 assert_eq!(enc, Encoding::Utf8); 445 assert_eq!(text, "body { color: red; }"); 446 } 447 448 // ----------------------------------------------------------------------- 449 // BOM handling 450 // ----------------------------------------------------------------------- 451 452 #[test] 453 fn decode_with_bom_strips_utf8_bom() { 454 let bytes = b"\xEF\xBB\xBFHello"; 455 let text = decode_with_bom_handling(bytes, Encoding::Utf8); 456 assert_eq!(text, "Hello"); 457 } 458 459 #[test] 460 fn decode_without_bom_passes_through() { 461 let bytes = b"Hello"; 462 let text = decode_with_bom_handling(bytes, Encoding::Utf8); 463 assert_eq!(text, "Hello"); 464 } 465 466 #[test] 467 fn decode_with_utf16le_bom() { 468 let bytes = b"\xFF\xFEH\x00e\x00l\x00l\x00o\x00"; 469 let text = decode_with_bom_handling(bytes, Encoding::Utf16Le); 470 assert_eq!(text, "Hello"); 471 } 472 473 #[test] 474 fn decode_with_utf16be_bom() { 475 let bytes = b"\xFE\xFF\x00H\x00e\x00l\x00l\x00o"; 476 let text = decode_with_bom_handling(bytes, Encoding::Utf16Be); 477 assert_eq!(text, "Hello"); 478 } 479 480 // ----------------------------------------------------------------------- 481 // ResourceLoader construction 482 // ----------------------------------------------------------------------- 483 484 #[test] 485 fn resource_loader_new() { 486 let _loader = ResourceLoader::new(); 487 } 488 489 #[test] 490 fn resource_loader_default() { 491 let _loader = ResourceLoader::default(); 492 } 493 494 // ----------------------------------------------------------------------- 495 // URL resolution 496 // ----------------------------------------------------------------------- 497 498 #[test] 499 fn fetch_url_invalid_url_error() { 500 let mut loader = ResourceLoader::new(); 501 let result = loader.fetch_url("not a url at all", None); 502 assert!(matches!(result, Err(LoadError::InvalidUrl(_)))); 503 } 504 505 #[test] 506 fn fetch_url_relative_without_base_errors() { 507 let mut loader = ResourceLoader::new(); 508 let result = loader.fetch_url("/relative/path", None); 509 assert!(matches!(result, Err(LoadError::InvalidUrl(_)))); 510 } 511 512 #[test] 513 fn fetch_url_relative_with_base_resolves() { 514 let mut loader = ResourceLoader::new(); 515 let base = Url::parse("http://example.com/page").unwrap(); 516 // This will fail since we can't actually connect in tests, 517 // but the URL resolution itself should work (it won't be InvalidUrl). 518 let result = loader.fetch_url("/style.css", Some(&base)); 519 assert!(result.is_err()); 520 // The error should NOT be InvalidUrl — the URL resolved successfully. 521 assert!(!matches!(result, Err(LoadError::InvalidUrl(_)))); 522 } 523}