Implement resource loader: fetch URLs and decode content

we (web engine): Experimental web browser project to understand the limits of Claude

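Add ResourceLoader to the browser crate that integrates the net, encoding,
and url crates into a unified resource fetching API (image resources are
classified by MIME type and returned as raw bytes; no image decoding is
performed by the loader itself):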
- ResourceLoader wraps HttpClient for HTTP/HTTPS fetching
- Content-Type parsing determines resource type (HTML, CSS, image, other)
- WHATWG encoding sniffing for HTML (BOM > HTTP > meta prescan > default)
- BOM/HTTP charset detection for non-HTML text resources (default UTF-8)
- Relative URL resolution via base URL support
- HTTP error status handling (4xx, 5xx)
- Resource enum: Html, Css, Image, Other with appropriate metadata

30 tests covering MIME classification, text decoding (BOM, HTTP charset,
meta prescan, Windows-1252 special chars), URL resolution, and error types.

Implements issue 3mhkt6hnbhp25

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

pierrelf.com 6 days ago 8b330b7c 276dfeb1

+525

2 changed files

expand all

crates

browser

src

lib.rs

loader.rs

crates/browser/src/lib.rs

··· 1 1 //! Event loop, resource loading, navigation, UI chrome. 2 + 3 + pub mod loader;

+523

crates/browser/src/loader.rs

··· 1 + //! Resource loader: fetch URLs and decode their content. 2 + //! 3 + //! Brings together `net` (HTTP client), `encoding` (charset detection and decoding), 4 + //! `url` (URL parsing and resolution), and `image` (image decoding) into a single 5 + //! `ResourceLoader` that the browser uses to load web pages and subresources. 6 + 7 + use std::fmt; 8 + 9 + use we_encoding::sniff::sniff_encoding; 10 + use we_encoding::Encoding; 11 + use we_net::client::{ClientError, HttpClient}; 12 + use we_net::http::ContentType; 13 + use we_url::Url; 14 + 15 + // --------------------------------------------------------------------------- 16 + // Error type 17 + // --------------------------------------------------------------------------- 18 + 19 + /// Errors that can occur during resource loading. 20 + #[derive(Debug)] 21 + pub enum LoadError { 22 + /// URL parsing failed. 23 + InvalidUrl(String), 24 + /// Network or HTTP error from the underlying client. 25 + Network(ClientError), 26 + /// HTTP response indicated an error status. 27 + HttpStatus { status: u16, reason: String }, 28 + /// Encoding or decoding error. 29 + Encoding(String), 30 + } 31 + 32 + impl fmt::Display for LoadError { 33 + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 34 + match self { 35 + Self::InvalidUrl(s) => write!(f, "invalid URL: {s}"), 36 + Self::Network(e) => write!(f, "network error: {e}"), 37 + Self::HttpStatus { status, reason } => { 38 + write!(f, "HTTP {status} {reason}") 39 + } 40 + Self::Encoding(s) => write!(f, "encoding error: {s}"), 41 + } 42 + } 43 + } 44 + 45 + impl From<ClientError> for LoadError { 46 + fn from(e: ClientError) -> Self { 47 + Self::Network(e) 48 + } 49 + } 50 + 51 + // --------------------------------------------------------------------------- 52 + // Resource types 53 + // --------------------------------------------------------------------------- 54 + 55 + /// A loaded resource with its decoded content and metadata. 
56 + #[derive(Debug)] 57 + pub enum Resource { 58 + /// An HTML document. 59 + Html { 60 + text: String, 61 + base_url: Url, 62 + encoding: Encoding, 63 + }, 64 + /// A CSS stylesheet. 65 + Css { text: String, url: Url }, 66 + /// A decoded image. 67 + Image { 68 + data: Vec<u8>, 69 + mime_type: String, 70 + url: Url, 71 + }, 72 + /// Any other resource type (binary). 73 + Other { 74 + data: Vec<u8>, 75 + mime_type: String, 76 + url: Url, 77 + }, 78 + } 79 + 80 + // --------------------------------------------------------------------------- 81 + // ResourceLoader 82 + // --------------------------------------------------------------------------- 83 + 84 + /// Loads resources over HTTP/HTTPS with encoding detection and content-type handling. 85 + pub struct ResourceLoader { 86 + client: HttpClient, 87 + } 88 + 89 + impl ResourceLoader { 90 + /// Create a new resource loader with default settings. 91 + pub fn new() -> Self { 92 + Self { 93 + client: HttpClient::new(), 94 + } 95 + } 96 + 97 + /// Fetch a resource at the given URL. 98 + /// 99 + /// Determines the resource type from the HTTP Content-Type header, decodes 100 + /// text resources using the appropriate character encoding (per WHATWG spec), 101 + /// and returns the result as a typed `Resource`. 
102 + pub fn fetch(&mut self, url: &Url) -> Result<Resource, LoadError> { 103 + let response = self.client.get(url)?; 104 + 105 + // Check for HTTP error status codes 106 + if response.status_code >= 400 { 107 + return Err(LoadError::HttpStatus { 108 + status: response.status_code, 109 + reason: response.reason.clone(), 110 + }); 111 + } 112 + 113 + let content_type = response.content_type(); 114 + let mime = content_type 115 + .as_ref() 116 + .map(|ct| ct.mime_type.as_str()) 117 + .unwrap_or("application/octet-stream"); 118 + 119 + match classify_mime(mime) { 120 + MimeClass::Html => { 121 + let (text, encoding) = 122 + decode_text_resource(&response.body, content_type.as_ref(), true); 123 + Ok(Resource::Html { 124 + text, 125 + base_url: url.clone(), 126 + encoding, 127 + }) 128 + } 129 + MimeClass::Css => { 130 + let (text, _encoding) = 131 + decode_text_resource(&response.body, content_type.as_ref(), false); 132 + Ok(Resource::Css { 133 + text, 134 + url: url.clone(), 135 + }) 136 + } 137 + MimeClass::Image => Ok(Resource::Image { 138 + data: response.body, 139 + mime_type: mime.to_string(), 140 + url: url.clone(), 141 + }), 142 + MimeClass::Other => { 143 + // Check if it's a text type we should decode 144 + if mime.starts_with("text/") { 145 + let (text, _encoding) = 146 + decode_text_resource(&response.body, content_type.as_ref(), false); 147 + Ok(Resource::Other { 148 + data: text.into_bytes(), 149 + mime_type: mime.to_string(), 150 + url: url.clone(), 151 + }) 152 + } else { 153 + Ok(Resource::Other { 154 + data: response.body, 155 + mime_type: mime.to_string(), 156 + url: url.clone(), 157 + }) 158 + } 159 + } 160 + } 161 + } 162 + 163 + /// Fetch a URL string, resolving it against an optional base URL. 
164 + pub fn fetch_url(&mut self, url_str: &str, base: Option<&Url>) -> Result<Resource, LoadError> { 165 + let url = match base { 166 + Some(base_url) => Url::parse_with_base(url_str, base_url) 167 + .or_else(|_| Url::parse(url_str)) 168 + .map_err(|_| LoadError::InvalidUrl(url_str.to_string()))?, 169 + None => Url::parse(url_str).map_err(|_| LoadError::InvalidUrl(url_str.to_string()))?, 170 + }; 171 + self.fetch(&url) 172 + } 173 + } 174 + 175 + impl Default for ResourceLoader { 176 + fn default() -> Self { 177 + Self::new() 178 + } 179 + } 180 + 181 + // --------------------------------------------------------------------------- 182 + // MIME classification 183 + // --------------------------------------------------------------------------- 184 + 185 + enum MimeClass { 186 + Html, 187 + Css, 188 + Image, 189 + Other, 190 + } 191 + 192 + fn classify_mime(mime: &str) -> MimeClass { 193 + match mime { 194 + "text/html" | "application/xhtml+xml" => MimeClass::Html, 195 + "text/css" => MimeClass::Css, 196 + "image/png" | "image/jpeg" | "image/gif" | "image/webp" | "image/svg+xml" => { 197 + MimeClass::Image 198 + } 199 + _ => MimeClass::Other, 200 + } 201 + } 202 + 203 + // --------------------------------------------------------------------------- 204 + // Text decoding 205 + // --------------------------------------------------------------------------- 206 + 207 + /// Decode a text resource's bytes to a String using WHATWG encoding sniffing. 208 + /// 209 + /// For HTML resources, uses BOM > HTTP charset > meta prescan > default. 210 + /// For non-HTML text resources, uses BOM > HTTP charset > default (UTF-8). 
211 + fn decode_text_resource( 212 + bytes: &[u8], 213 + content_type: Option<&ContentType>, 214 + is_html: bool, 215 + ) -> (String, Encoding) { 216 + let http_ct_value = content_type.map(|ct| { 217 + // Reconstruct a Content-Type header value for the sniffing function 218 + match &ct.charset { 219 + Some(charset) => format!("{}; charset={}", ct.mime_type, charset), 220 + None => ct.mime_type.clone(), 221 + } 222 + }); 223 + 224 + if is_html { 225 + // Full WHATWG sniffing: BOM > HTTP > meta prescan > default (Windows-1252) 226 + let (encoding, _source) = sniff_encoding(bytes, http_ct_value.as_deref()); 227 + let text = decode_with_bom_handling(bytes, encoding); 228 + (text, encoding) 229 + } else { 230 + // Non-HTML: BOM > HTTP charset > default (UTF-8) 231 + let (bom_enc, after_bom) = we_encoding::bom_sniff(bytes); 232 + if let Some(enc) = bom_enc { 233 + let text = we_encoding::decode(after_bom, enc); 234 + return (text, enc); 235 + } 236 + 237 + // Try HTTP charset 238 + if let Some(charset) = content_type.and_then(|ct| ct.charset.as_deref()) { 239 + if let Some(enc) = we_encoding::lookup(charset) { 240 + let text = we_encoding::decode(bytes, enc); 241 + return (text, enc); 242 + } 243 + } 244 + 245 + // Default to UTF-8 for non-HTML text 246 + let text = we_encoding::decode(bytes, Encoding::Utf8); 247 + (text, Encoding::Utf8) 248 + } 249 + } 250 + 251 + /// Decode bytes with BOM handling — strip BOM bytes before decoding. 
252 + fn decode_with_bom_handling(bytes: &[u8], encoding: Encoding) -> String { 253 + let (bom_enc, after_bom) = we_encoding::bom_sniff(bytes); 254 + if bom_enc.is_some() { 255 + // BOM was present — decode the bytes after the BOM 256 + we_encoding::decode(after_bom, encoding) 257 + } else { 258 + we_encoding::decode(bytes, encoding) 259 + } 260 + } 261 + 262 + // --------------------------------------------------------------------------- 263 + // Tests 264 + // --------------------------------------------------------------------------- 265 + 266 + #[cfg(test)] 267 + mod tests { 268 + use super::*; 269 + 270 + // ----------------------------------------------------------------------- 271 + // LoadError Display 272 + // ----------------------------------------------------------------------- 273 + 274 + #[test] 275 + fn load_error_display_invalid_url() { 276 + let e = LoadError::InvalidUrl("bad://url".to_string()); 277 + assert_eq!(e.to_string(), "invalid URL: bad://url"); 278 + } 279 + 280 + #[test] 281 + fn load_error_display_http_status() { 282 + let e = LoadError::HttpStatus { 283 + status: 404, 284 + reason: "Not Found".to_string(), 285 + }; 286 + assert_eq!(e.to_string(), "HTTP 404 Not Found"); 287 + } 288 + 289 + #[test] 290 + fn load_error_display_encoding() { 291 + let e = LoadError::Encoding("bad charset".to_string()); 292 + assert_eq!(e.to_string(), "encoding error: bad charset"); 293 + } 294 + 295 + // ----------------------------------------------------------------------- 296 + // MIME classification 297 + // ----------------------------------------------------------------------- 298 + 299 + #[test] 300 + fn classify_text_html() { 301 + assert!(matches!(classify_mime("text/html"), MimeClass::Html)); 302 + } 303 + 304 + #[test] 305 + fn classify_xhtml() { 306 + assert!(matches!( 307 + classify_mime("application/xhtml+xml"), 308 + MimeClass::Html 309 + )); 310 + } 311 + 312 + #[test] 313 + fn classify_text_css() { 314 + 
assert!(matches!(classify_mime("text/css"), MimeClass::Css)); 315 + } 316 + 317 + #[test] 318 + fn classify_image_png() { 319 + assert!(matches!(classify_mime("image/png"), MimeClass::Image)); 320 + } 321 + 322 + #[test] 323 + fn classify_image_jpeg() { 324 + assert!(matches!(classify_mime("image/jpeg"), MimeClass::Image)); 325 + } 326 + 327 + #[test] 328 + fn classify_image_gif() { 329 + assert!(matches!(classify_mime("image/gif"), MimeClass::Image)); 330 + } 331 + 332 + #[test] 333 + fn classify_application_json() { 334 + assert!(matches!( 335 + classify_mime("application/json"), 336 + MimeClass::Other 337 + )); 338 + } 339 + 340 + #[test] 341 + fn classify_text_plain() { 342 + assert!(matches!(classify_mime("text/plain"), MimeClass::Other)); 343 + } 344 + 345 + #[test] 346 + fn classify_octet_stream() { 347 + assert!(matches!( 348 + classify_mime("application/octet-stream"), 349 + MimeClass::Other 350 + )); 351 + } 352 + 353 + // ----------------------------------------------------------------------- 354 + // Text decoding — HTML 355 + // ----------------------------------------------------------------------- 356 + 357 + #[test] 358 + fn decode_html_utf8_bom() { 359 + let bytes = b"\xEF\xBB\xBF<html>Hello</html>"; 360 + let (text, enc) = decode_text_resource(bytes, None, true); 361 + assert_eq!(enc, Encoding::Utf8); 362 + assert_eq!(text, "<html>Hello</html>"); 363 + } 364 + 365 + #[test] 366 + fn decode_html_utf8_from_http_charset() { 367 + let ct = ContentType { 368 + mime_type: "text/html".to_string(), 369 + charset: Some("utf-8".to_string()), 370 + }; 371 + let bytes = b"<html>Hello</html>"; 372 + let (text, enc) = decode_text_resource(bytes, Some(&ct), true); 373 + assert_eq!(enc, Encoding::Utf8); 374 + assert_eq!(text, "<html>Hello</html>"); 375 + } 376 + 377 + #[test] 378 + fn decode_html_meta_charset() { 379 + let html = b"<meta charset=\"utf-8\"><html>Hello</html>"; 380 + let (text, enc) = decode_text_resource(html, None, true); 381 + assert_eq!(enc, 
Encoding::Utf8); 382 + assert!(text.contains("Hello")); 383 + } 384 + 385 + #[test] 386 + fn decode_html_default_windows_1252() { 387 + let bytes = b"<html>Hello</html>"; 388 + let (text, enc) = decode_text_resource(bytes, None, true); 389 + assert_eq!(enc, Encoding::Windows1252); 390 + assert!(text.contains("Hello")); 391 + } 392 + 393 + #[test] 394 + fn decode_html_windows_1252_special_chars() { 395 + // \x93 and \x94 are left/right double quotation marks in Windows-1252 396 + let bytes = b"<html>\x93Hello\x94</html>"; 397 + let (text, enc) = decode_text_resource(bytes, None, true); 398 + assert_eq!(enc, Encoding::Windows1252); 399 + assert!(text.contains('\u{201C}')); // left double quote 400 + assert!(text.contains('\u{201D}')); // right double quote 401 + } 402 + 403 + #[test] 404 + fn decode_html_bom_beats_http_charset() { 405 + let ct = ContentType { 406 + mime_type: "text/html".to_string(), 407 + charset: Some("windows-1252".to_string()), 408 + }; 409 + let mut bytes = vec![0xEF, 0xBB, 0xBF]; 410 + bytes.extend_from_slice(b"<html>Hello</html>"); 411 + let (text, enc) = decode_text_resource(&bytes, Some(&ct), true); 412 + assert_eq!(enc, Encoding::Utf8); 413 + assert_eq!(text, "<html>Hello</html>"); 414 + } 415 + 416 + // ----------------------------------------------------------------------- 417 + // Text decoding — non-HTML (CSS, etc.) 
418 + // ----------------------------------------------------------------------- 419 + 420 + #[test] 421 + fn decode_css_utf8_default() { 422 + let bytes = b"body { color: red; }"; 423 + let (text, enc) = decode_text_resource(bytes, None, false); 424 + assert_eq!(enc, Encoding::Utf8); 425 + assert_eq!(text, "body { color: red; }"); 426 + } 427 + 428 + #[test] 429 + fn decode_css_bom_utf8() { 430 + let bytes = b"\xEF\xBB\xBFbody { color: red; }"; 431 + let (text, enc) = decode_text_resource(bytes, None, false); 432 + assert_eq!(enc, Encoding::Utf8); 433 + assert_eq!(text, "body { color: red; }"); 434 + } 435 + 436 + #[test] 437 + fn decode_css_http_charset() { 438 + let ct = ContentType { 439 + mime_type: "text/css".to_string(), 440 + charset: Some("utf-8".to_string()), 441 + }; 442 + let bytes = b"body { color: red; }"; 443 + let (text, enc) = decode_text_resource(bytes, Some(&ct), false); 444 + assert_eq!(enc, Encoding::Utf8); 445 + assert_eq!(text, "body { color: red; }"); 446 + } 447 + 448 + // ----------------------------------------------------------------------- 449 + // BOM handling 450 + // ----------------------------------------------------------------------- 451 + 452 + #[test] 453 + fn decode_with_bom_strips_utf8_bom() { 454 + let bytes = b"\xEF\xBB\xBFHello"; 455 + let text = decode_with_bom_handling(bytes, Encoding::Utf8); 456 + assert_eq!(text, "Hello"); 457 + } 458 + 459 + #[test] 460 + fn decode_without_bom_passes_through() { 461 + let bytes = b"Hello"; 462 + let text = decode_with_bom_handling(bytes, Encoding::Utf8); 463 + assert_eq!(text, "Hello"); 464 + } 465 + 466 + #[test] 467 + fn decode_with_utf16le_bom() { 468 + let bytes = b"\xFF\xFEH\x00e\x00l\x00l\x00o\x00"; 469 + let text = decode_with_bom_handling(bytes, Encoding::Utf16Le); 470 + assert_eq!(text, "Hello"); 471 + } 472 + 473 + #[test] 474 + fn decode_with_utf16be_bom() { 475 + let bytes = b"\xFE\xFF\x00H\x00e\x00l\x00l\x00o"; 476 + let text = decode_with_bom_handling(bytes, 
Encoding::Utf16Be); 477 + assert_eq!(text, "Hello"); 478 + } 479 + 480 + // ----------------------------------------------------------------------- 481 + // ResourceLoader construction 482 + // ----------------------------------------------------------------------- 483 + 484 + #[test] 485 + fn resource_loader_new() { 486 + let _loader = ResourceLoader::new(); 487 + } 488 + 489 + #[test] 490 + fn resource_loader_default() { 491 + let _loader = ResourceLoader::default(); 492 + } 493 + 494 + // ----------------------------------------------------------------------- 495 + // URL resolution 496 + // ----------------------------------------------------------------------- 497 + 498 + #[test] 499 + fn fetch_url_invalid_url_error() { 500 + let mut loader = ResourceLoader::new(); 501 + let result = loader.fetch_url("not a url at all", None); 502 + assert!(matches!(result, Err(LoadError::InvalidUrl(_)))); 503 + } 504 + 505 + #[test] 506 + fn fetch_url_relative_without_base_errors() { 507 + let mut loader = ResourceLoader::new(); 508 + let result = loader.fetch_url("/relative/path", None); 509 + assert!(matches!(result, Err(LoadError::InvalidUrl(_)))); 510 + } 511 + 512 + #[test] 513 + fn fetch_url_relative_with_base_resolves() { 514 + let mut loader = ResourceLoader::new(); 515 + let base = Url::parse("http://example.com/page").unwrap(); 516 + // This will fail since we can't actually connect in tests, 517 + // but the URL resolution itself should work (it won't be InvalidUrl). 518 + let result = loader.fetch_url("/style.css", Some(&base)); 519 + assert!(result.is_err()); 520 + // The error should NOT be InvalidUrl — the URL resolved successfully. 521 + assert!(!matches!(result, Err(LoadError::InvalidUrl(_)))); 522 + } 523 + }