//! Resource loader: fetch URLs and decode their content. //! //! Brings together `net` (HTTP client), `encoding` (charset detection and decoding), //! `url` (URL parsing and resolution), and `image` (image decoding) into a single //! `ResourceLoader` that the browser uses to load web pages and subresources. use std::fmt; use we_encoding::sniff::sniff_encoding; use we_encoding::Encoding; use we_net::client::{ClientError, HttpClient}; use we_net::http::ContentType; use we_url::Url; // --------------------------------------------------------------------------- // Error type // --------------------------------------------------------------------------- /// Errors that can occur during resource loading. #[derive(Debug)] pub enum LoadError { /// URL parsing failed. InvalidUrl(String), /// Network or HTTP error from the underlying client. Network(ClientError), /// HTTP response indicated an error status. HttpStatus { status: u16, reason: String }, /// Encoding or decoding error. Encoding(String), } impl fmt::Display for LoadError { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match self { Self::InvalidUrl(s) => write!(f, "invalid URL: {s}"), Self::Network(e) => write!(f, "network error: {e}"), Self::HttpStatus { status, reason } => { write!(f, "HTTP {status} {reason}") } Self::Encoding(s) => write!(f, "encoding error: {s}"), } } } impl From for LoadError { fn from(e: ClientError) -> Self { Self::Network(e) } } // --------------------------------------------------------------------------- // Resource types // --------------------------------------------------------------------------- /// A loaded resource with its decoded content and metadata. #[derive(Debug)] pub enum Resource { /// An HTML document. Html { text: String, base_url: Url, encoding: Encoding, }, /// A CSS stylesheet. Css { text: String, url: Url }, /// A decoded image. Image { data: Vec, mime_type: String, url: Url, }, /// Any other resource type (binary). Other { data: Vec, mime_type: String, url: Url, }, } // --------------------------------------------------------------------------- // ResourceLoader // --------------------------------------------------------------------------- /// Loads resources over HTTP/HTTPS with encoding detection and content-type handling. pub struct ResourceLoader { client: HttpClient, } impl ResourceLoader { /// Create a new resource loader with default settings. pub fn new() -> Self { Self { client: HttpClient::new(), } } /// Fetch a resource at the given URL. /// /// Determines the resource type from the HTTP Content-Type header, decodes /// text resources using the appropriate character encoding (per WHATWG spec), /// and returns the result as a typed `Resource`. pub fn fetch(&mut self, url: &Url) -> Result { let response = self.client.get(url)?; // Check for HTTP error status codes if response.status_code >= 400 { return Err(LoadError::HttpStatus { status: response.status_code, reason: response.reason.clone(), }); } let content_type = response.content_type(); let mime = content_type .as_ref() .map(|ct| ct.mime_type.as_str()) .unwrap_or("application/octet-stream"); match classify_mime(mime) { MimeClass::Html => { let (text, encoding) = decode_text_resource(&response.body, content_type.as_ref(), true); Ok(Resource::Html { text, base_url: url.clone(), encoding, }) } MimeClass::Css => { let (text, _encoding) = decode_text_resource(&response.body, content_type.as_ref(), false); Ok(Resource::Css { text, url: url.clone(), }) } MimeClass::Image => Ok(Resource::Image { data: response.body, mime_type: mime.to_string(), url: url.clone(), }), MimeClass::Other => { // Check if it's a text type we should decode if mime.starts_with("text/") { let (text, _encoding) = decode_text_resource(&response.body, content_type.as_ref(), false); Ok(Resource::Other { data: text.into_bytes(), mime_type: mime.to_string(), url: url.clone(), }) } else { Ok(Resource::Other { data: response.body, mime_type: mime.to_string(), url: url.clone(), }) } } } } /// Fetch a URL string, resolving it against an optional base URL. pub fn fetch_url(&mut self, url_str: &str, base: Option<&Url>) -> Result { let url = match base { Some(base_url) => Url::parse_with_base(url_str, base_url) .or_else(|_| Url::parse(url_str)) .map_err(|_| LoadError::InvalidUrl(url_str.to_string()))?, None => Url::parse(url_str).map_err(|_| LoadError::InvalidUrl(url_str.to_string()))?, }; self.fetch(&url) } } impl Default for ResourceLoader { fn default() -> Self { Self::new() } } // --------------------------------------------------------------------------- // MIME classification // --------------------------------------------------------------------------- enum MimeClass { Html, Css, Image, Other, } fn classify_mime(mime: &str) -> MimeClass { match mime { "text/html" | "application/xhtml+xml" => MimeClass::Html, "text/css" => MimeClass::Css, "image/png" | "image/jpeg" | "image/gif" | "image/webp" | "image/svg+xml" => { MimeClass::Image } _ => MimeClass::Other, } } // --------------------------------------------------------------------------- // Text decoding // --------------------------------------------------------------------------- /// Decode a text resource's bytes to a String using WHATWG encoding sniffing. /// /// For HTML resources, uses BOM > HTTP charset > meta prescan > default. /// For non-HTML text resources, uses BOM > HTTP charset > default (UTF-8). fn decode_text_resource( bytes: &[u8], content_type: Option<&ContentType>, is_html: bool, ) -> (String, Encoding) { let http_ct_value = content_type.map(|ct| { // Reconstruct a Content-Type header value for the sniffing function match &ct.charset { Some(charset) => format!("{}; charset={}", ct.mime_type, charset), None => ct.mime_type.clone(), } }); if is_html { // Full WHATWG sniffing: BOM > HTTP > meta prescan > default (Windows-1252) let (encoding, _source) = sniff_encoding(bytes, http_ct_value.as_deref()); let text = decode_with_bom_handling(bytes, encoding); (text, encoding) } else { // Non-HTML: BOM > HTTP charset > default (UTF-8) let (bom_enc, after_bom) = we_encoding::bom_sniff(bytes); if let Some(enc) = bom_enc { let text = we_encoding::decode(after_bom, enc); return (text, enc); } // Try HTTP charset if let Some(charset) = content_type.and_then(|ct| ct.charset.as_deref()) { if let Some(enc) = we_encoding::lookup(charset) { let text = we_encoding::decode(bytes, enc); return (text, enc); } } // Default to UTF-8 for non-HTML text let text = we_encoding::decode(bytes, Encoding::Utf8); (text, Encoding::Utf8) } } /// Decode bytes with BOM handling — strip BOM bytes before decoding. fn decode_with_bom_handling(bytes: &[u8], encoding: Encoding) -> String { let (bom_enc, after_bom) = we_encoding::bom_sniff(bytes); if bom_enc.is_some() { // BOM was present — decode the bytes after the BOM we_encoding::decode(after_bom, encoding) } else { we_encoding::decode(bytes, encoding) } } // --------------------------------------------------------------------------- // Tests // --------------------------------------------------------------------------- #[cfg(test)] mod tests { use super::*; // ----------------------------------------------------------------------- // LoadError Display // ----------------------------------------------------------------------- #[test] fn load_error_display_invalid_url() { let e = LoadError::InvalidUrl("bad://url".to_string()); assert_eq!(e.to_string(), "invalid URL: bad://url"); } #[test] fn load_error_display_http_status() { let e = LoadError::HttpStatus { status: 404, reason: "Not Found".to_string(), }; assert_eq!(e.to_string(), "HTTP 404 Not Found"); } #[test] fn load_error_display_encoding() { let e = LoadError::Encoding("bad charset".to_string()); assert_eq!(e.to_string(), "encoding error: bad charset"); } // ----------------------------------------------------------------------- // MIME classification // ----------------------------------------------------------------------- #[test] fn classify_text_html() { assert!(matches!(classify_mime("text/html"), MimeClass::Html)); } #[test] fn classify_xhtml() { assert!(matches!( classify_mime("application/xhtml+xml"), MimeClass::Html )); } #[test] fn classify_text_css() { assert!(matches!(classify_mime("text/css"), MimeClass::Css)); } #[test] fn classify_image_png() { assert!(matches!(classify_mime("image/png"), MimeClass::Image)); } #[test] fn classify_image_jpeg() { assert!(matches!(classify_mime("image/jpeg"), MimeClass::Image)); } #[test] fn classify_image_gif() { assert!(matches!(classify_mime("image/gif"), MimeClass::Image)); } #[test] fn classify_application_json() { assert!(matches!( classify_mime("application/json"), MimeClass::Other )); } #[test] fn classify_text_plain() { assert!(matches!(classify_mime("text/plain"), MimeClass::Other)); } #[test] fn classify_octet_stream() { assert!(matches!( classify_mime("application/octet-stream"), MimeClass::Other )); } // ----------------------------------------------------------------------- // Text decoding — HTML // ----------------------------------------------------------------------- #[test] fn decode_html_utf8_bom() { let bytes = b"\xEF\xBB\xBFHello"; let (text, enc) = decode_text_resource(bytes, None, true); assert_eq!(enc, Encoding::Utf8); assert_eq!(text, "Hello"); } #[test] fn decode_html_utf8_from_http_charset() { let ct = ContentType { mime_type: "text/html".to_string(), charset: Some("utf-8".to_string()), }; let bytes = b"Hello"; let (text, enc) = decode_text_resource(bytes, Some(&ct), true); assert_eq!(enc, Encoding::Utf8); assert_eq!(text, "Hello"); } #[test] fn decode_html_meta_charset() { let html = b"Hello"; let (text, enc) = decode_text_resource(html, None, true); assert_eq!(enc, Encoding::Utf8); assert!(text.contains("Hello")); } #[test] fn decode_html_default_windows_1252() { let bytes = b"Hello"; let (text, enc) = decode_text_resource(bytes, None, true); assert_eq!(enc, Encoding::Windows1252); assert!(text.contains("Hello")); } #[test] fn decode_html_windows_1252_special_chars() { // \x93 and \x94 are left/right double quotation marks in Windows-1252 let bytes = b"\x93Hello\x94"; let (text, enc) = decode_text_resource(bytes, None, true); assert_eq!(enc, Encoding::Windows1252); assert!(text.contains('\u{201C}')); // left double quote assert!(text.contains('\u{201D}')); // right double quote } #[test] fn decode_html_bom_beats_http_charset() { let ct = ContentType { mime_type: "text/html".to_string(), charset: Some("windows-1252".to_string()), }; let mut bytes = vec![0xEF, 0xBB, 0xBF]; bytes.extend_from_slice(b"Hello"); let (text, enc) = decode_text_resource(&bytes, Some(&ct), true); assert_eq!(enc, Encoding::Utf8); assert_eq!(text, "Hello"); } // ----------------------------------------------------------------------- // Text decoding — non-HTML (CSS, etc.) // ----------------------------------------------------------------------- #[test] fn decode_css_utf8_default() { let bytes = b"body { color: red; }"; let (text, enc) = decode_text_resource(bytes, None, false); assert_eq!(enc, Encoding::Utf8); assert_eq!(text, "body { color: red; }"); } #[test] fn decode_css_bom_utf8() { let bytes = b"\xEF\xBB\xBFbody { color: red; }"; let (text, enc) = decode_text_resource(bytes, None, false); assert_eq!(enc, Encoding::Utf8); assert_eq!(text, "body { color: red; }"); } #[test] fn decode_css_http_charset() { let ct = ContentType { mime_type: "text/css".to_string(), charset: Some("utf-8".to_string()), }; let bytes = b"body { color: red; }"; let (text, enc) = decode_text_resource(bytes, Some(&ct), false); assert_eq!(enc, Encoding::Utf8); assert_eq!(text, "body { color: red; }"); } // ----------------------------------------------------------------------- // BOM handling // ----------------------------------------------------------------------- #[test] fn decode_with_bom_strips_utf8_bom() { let bytes = b"\xEF\xBB\xBFHello"; let text = decode_with_bom_handling(bytes, Encoding::Utf8); assert_eq!(text, "Hello"); } #[test] fn decode_without_bom_passes_through() { let bytes = b"Hello"; let text = decode_with_bom_handling(bytes, Encoding::Utf8); assert_eq!(text, "Hello"); } #[test] fn decode_with_utf16le_bom() { let bytes = b"\xFF\xFEH\x00e\x00l\x00l\x00o\x00"; let text = decode_with_bom_handling(bytes, Encoding::Utf16Le); assert_eq!(text, "Hello"); } #[test] fn decode_with_utf16be_bom() { let bytes = b"\xFE\xFF\x00H\x00e\x00l\x00l\x00o"; let text = decode_with_bom_handling(bytes, Encoding::Utf16Be); assert_eq!(text, "Hello"); } // ----------------------------------------------------------------------- // ResourceLoader construction // ----------------------------------------------------------------------- #[test] fn resource_loader_new() { let _loader = ResourceLoader::new(); } #[test] fn resource_loader_default() { let _loader = ResourceLoader::default(); } // ----------------------------------------------------------------------- // URL resolution // ----------------------------------------------------------------------- #[test] fn fetch_url_invalid_url_error() { let mut loader = ResourceLoader::new(); let result = loader.fetch_url("not a url at all", None); assert!(matches!(result, Err(LoadError::InvalidUrl(_)))); } #[test] fn fetch_url_relative_without_base_errors() { let mut loader = ResourceLoader::new(); let result = loader.fetch_url("/relative/path", None); assert!(matches!(result, Err(LoadError::InvalidUrl(_)))); } #[test] fn fetch_url_relative_with_base_resolves() { let mut loader = ResourceLoader::new(); let base = Url::parse("http://example.com/page").unwrap(); // This will fail since we can't actually connect in tests, // but the URL resolution itself should work (it won't be InvalidUrl). let result = loader.fetch_url("/style.css", Some(&base)); assert!(result.is_err()); // The error should NOT be InvalidUrl — the URL resolved successfully. assert!(!matches!(result, Err(LoadError::InvalidUrl(_)))); } }