//! Resource loader: fetch URLs and decode their content. //! //! Brings together `net` (HTTP client), `encoding` (charset detection and decoding), //! `url` (URL parsing and resolution), and `image` (image decoding) into a single //! `ResourceLoader` that the browser uses to load web pages and subresources. use std::fmt; use we_encoding::sniff::sniff_encoding; use we_encoding::Encoding; use we_net::client::{ClientError, HttpClient}; use we_net::http::ContentType; use we_url::data_url::{is_data_url, parse_data_url}; use we_url::Url; // --------------------------------------------------------------------------- // Error type // --------------------------------------------------------------------------- /// Errors that can occur during resource loading. #[derive(Debug)] pub enum LoadError { /// URL parsing failed. InvalidUrl(String), /// Network or HTTP error from the underlying client. Network(ClientError), /// HTTP response indicated an error status. HttpStatus { status: u16, reason: String }, /// Encoding or decoding error. Encoding(String), } impl fmt::Display for LoadError { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match self { Self::InvalidUrl(s) => write!(f, "invalid URL: {s}"), Self::Network(e) => write!(f, "network error: {e}"), Self::HttpStatus { status, reason } => { write!(f, "HTTP {status} {reason}") } Self::Encoding(s) => write!(f, "encoding error: {s}"), } } } impl From for LoadError { fn from(e: ClientError) -> Self { Self::Network(e) } } // --------------------------------------------------------------------------- // Resource types // --------------------------------------------------------------------------- /// A loaded resource with its decoded content and metadata. #[derive(Debug)] pub enum Resource { /// An HTML document. Html { text: String, base_url: Url, encoding: Encoding, }, /// A CSS stylesheet. Css { text: String, url: Url }, /// A decoded image. Image { data: Vec, mime_type: String, url: Url, }, /// Any other resource type (binary). Other { data: Vec, mime_type: String, url: Url, }, } // --------------------------------------------------------------------------- // ResourceLoader // --------------------------------------------------------------------------- /// Loads resources over HTTP/HTTPS with encoding detection and content-type handling. pub struct ResourceLoader { client: HttpClient, } impl ResourceLoader { /// Create a new resource loader with default settings. pub fn new() -> Self { Self { client: HttpClient::new(), } } /// Fetch a resource at the given URL. /// /// Determines the resource type from the HTTP Content-Type header, decodes /// text resources using the appropriate character encoding (per WHATWG spec), /// and returns the result as a typed `Resource`. /// /// Handles `data:` and `about:` URLs locally without network access. pub fn fetch(&mut self, url: &Url) -> Result { // Handle data: URLs without network fetch. if url.scheme() == "data" { return fetch_data_url(&url.serialize()); } // Handle about: URLs without network fetch. if url.scheme() == "about" { return fetch_about_url(url); } let response = self.client.get(url)?; // Check for HTTP error status codes if response.status_code >= 400 { return Err(LoadError::HttpStatus { status: response.status_code, reason: response.reason.clone(), }); } let content_type = response.content_type(); let mime = content_type .as_ref() .map(|ct| ct.mime_type.as_str()) .unwrap_or("application/octet-stream"); match classify_mime(mime) { MimeClass::Html => { let (text, encoding) = decode_text_resource(&response.body, content_type.as_ref(), true); Ok(Resource::Html { text, base_url: url.clone(), encoding, }) } MimeClass::Css => { let (text, _encoding) = decode_text_resource(&response.body, content_type.as_ref(), false); Ok(Resource::Css { text, url: url.clone(), }) } MimeClass::Image => Ok(Resource::Image { data: response.body, mime_type: mime.to_string(), url: url.clone(), }), MimeClass::Other => { // Check if it's a text type we should decode if mime.starts_with("text/") { let (text, _encoding) = decode_text_resource(&response.body, content_type.as_ref(), false); Ok(Resource::Other { data: text.into_bytes(), mime_type: mime.to_string(), url: url.clone(), }) } else { Ok(Resource::Other { data: response.body, mime_type: mime.to_string(), url: url.clone(), }) } } } } /// Fetch a URL string, resolving it against an optional base URL. /// /// Handles `data:` and `about:` URLs locally without network access. pub fn fetch_url(&mut self, url_str: &str, base: Option<&Url>) -> Result { // Handle data URLs directly — no network fetch needed. if is_data_url(url_str) { return fetch_data_url(url_str); } // Handle about: URLs without network fetch. if url_str.starts_with("about:") { let url = Url::parse(url_str).map_err(|_| LoadError::InvalidUrl(url_str.to_string()))?; return fetch_about_url(&url); } let url = match base { Some(base_url) => Url::parse_with_base(url_str, base_url) .or_else(|_| Url::parse(url_str)) .map_err(|_| LoadError::InvalidUrl(url_str.to_string()))?, None => Url::parse(url_str).map_err(|_| LoadError::InvalidUrl(url_str.to_string()))?, }; self.fetch(&url) } } impl Default for ResourceLoader { fn default() -> Self { Self::new() } } // --------------------------------------------------------------------------- // MIME classification // --------------------------------------------------------------------------- enum MimeClass { Html, Css, Image, Other, } fn classify_mime(mime: &str) -> MimeClass { match mime { "text/html" | "application/xhtml+xml" => MimeClass::Html, "text/css" => MimeClass::Css, "image/png" | "image/jpeg" | "image/gif" | "image/webp" | "image/svg+xml" => { MimeClass::Image } _ => MimeClass::Other, } } // --------------------------------------------------------------------------- // Text decoding // --------------------------------------------------------------------------- /// Decode a text resource's bytes to a String using WHATWG encoding sniffing. /// /// For HTML resources, uses BOM > HTTP charset > meta prescan > default. /// For non-HTML text resources, uses BOM > HTTP charset > default (UTF-8). fn decode_text_resource( bytes: &[u8], content_type: Option<&ContentType>, is_html: bool, ) -> (String, Encoding) { let http_ct_value = content_type.map(|ct| { // Reconstruct a Content-Type header value for the sniffing function match &ct.charset { Some(charset) => format!("{}; charset={}", ct.mime_type, charset), None => ct.mime_type.clone(), } }); if is_html { // Full WHATWG sniffing: BOM > HTTP > meta prescan > default (Windows-1252) let (encoding, _source) = sniff_encoding(bytes, http_ct_value.as_deref()); let text = decode_with_bom_handling(bytes, encoding); (text, encoding) } else { // Non-HTML: BOM > HTTP charset > default (UTF-8) let (bom_enc, after_bom) = we_encoding::bom_sniff(bytes); if let Some(enc) = bom_enc { let text = we_encoding::decode(after_bom, enc); return (text, enc); } // Try HTTP charset if let Some(charset) = content_type.and_then(|ct| ct.charset.as_deref()) { if let Some(enc) = we_encoding::lookup(charset) { let text = we_encoding::decode(bytes, enc); return (text, enc); } } // Default to UTF-8 for non-HTML text let text = we_encoding::decode(bytes, Encoding::Utf8); (text, Encoding::Utf8) } } /// Decode bytes with BOM handling — strip BOM bytes before decoding. fn decode_with_bom_handling(bytes: &[u8], encoding: Encoding) -> String { let (bom_enc, after_bom) = we_encoding::bom_sniff(bytes); if bom_enc.is_some() { // BOM was present — decode the bytes after the BOM we_encoding::decode(after_bom, encoding) } else { we_encoding::decode(bytes, encoding) } } // --------------------------------------------------------------------------- // Data URL handling // --------------------------------------------------------------------------- /// Fetch a data URL, decoding its payload and returning the appropriate Resource type. fn fetch_data_url(url_str: &str) -> Result { let parsed = parse_data_url(url_str) .map_err(|e| LoadError::InvalidUrl(format!("data URL error: {e}")))?; let mime = &parsed.mime_type; // Create a synthetic Url for the resource metadata. let url = Url::parse(url_str).map_err(|_| LoadError::InvalidUrl(url_str.to_string()))?; match classify_mime(mime) { MimeClass::Html => { let encoding = charset_to_encoding(parsed.charset.as_deref()); let text = we_encoding::decode(&parsed.data, encoding); Ok(Resource::Html { text, base_url: url, encoding, }) } MimeClass::Css => { let encoding = charset_to_encoding(parsed.charset.as_deref()); let text = we_encoding::decode(&parsed.data, encoding); Ok(Resource::Css { text, url }) } MimeClass::Image => Ok(Resource::Image { data: parsed.data, mime_type: mime.to_string(), url, }), MimeClass::Other => { if mime.starts_with("text/") { let encoding = charset_to_encoding(parsed.charset.as_deref()); let text = we_encoding::decode(&parsed.data, encoding); Ok(Resource::Other { data: text.into_bytes(), mime_type: mime.to_string(), url, }) } else { Ok(Resource::Other { data: parsed.data, mime_type: mime.to_string(), url, }) } } } } // --------------------------------------------------------------------------- // about: URL handling // --------------------------------------------------------------------------- /// The minimal HTML document for about:blank. pub const ABOUT_BLANK_HTML: &str = ""; /// Fetch an about: URL, returning the appropriate resource. /// /// Currently only `about:blank` is supported, which returns an empty HTML /// document with UTF-8 encoding. fn fetch_about_url(url: &Url) -> Result { match url.path().as_str() { "blank" => Ok(Resource::Html { text: ABOUT_BLANK_HTML.to_string(), base_url: url.clone(), encoding: Encoding::Utf8, }), other => Err(LoadError::InvalidUrl(format!( "unsupported about: URL: about:{other}" ))), } } /// Map a charset name to an Encoding, defaulting to UTF-8. fn charset_to_encoding(charset: Option<&str>) -> Encoding { charset .and_then(we_encoding::lookup) .unwrap_or(Encoding::Utf8) } // --------------------------------------------------------------------------- // Tests // --------------------------------------------------------------------------- #[cfg(test)] mod tests { use super::*; // ----------------------------------------------------------------------- // LoadError Display // ----------------------------------------------------------------------- #[test] fn load_error_display_invalid_url() { let e = LoadError::InvalidUrl("bad://url".to_string()); assert_eq!(e.to_string(), "invalid URL: bad://url"); } #[test] fn load_error_display_http_status() { let e = LoadError::HttpStatus { status: 404, reason: "Not Found".to_string(), }; assert_eq!(e.to_string(), "HTTP 404 Not Found"); } #[test] fn load_error_display_encoding() { let e = LoadError::Encoding("bad charset".to_string()); assert_eq!(e.to_string(), "encoding error: bad charset"); } // ----------------------------------------------------------------------- // MIME classification // ----------------------------------------------------------------------- #[test] fn classify_text_html() { assert!(matches!(classify_mime("text/html"), MimeClass::Html)); } #[test] fn classify_xhtml() { assert!(matches!( classify_mime("application/xhtml+xml"), MimeClass::Html )); } #[test] fn classify_text_css() { assert!(matches!(classify_mime("text/css"), MimeClass::Css)); } #[test] fn classify_image_png() { assert!(matches!(classify_mime("image/png"), MimeClass::Image)); } #[test] fn classify_image_jpeg() { assert!(matches!(classify_mime("image/jpeg"), MimeClass::Image)); } #[test] fn classify_image_gif() { assert!(matches!(classify_mime("image/gif"), MimeClass::Image)); } #[test] fn classify_application_json() { assert!(matches!( classify_mime("application/json"), MimeClass::Other )); } #[test] fn classify_text_plain() { assert!(matches!(classify_mime("text/plain"), MimeClass::Other)); } #[test] fn classify_octet_stream() { assert!(matches!( classify_mime("application/octet-stream"), MimeClass::Other )); } // ----------------------------------------------------------------------- // Text decoding — HTML // ----------------------------------------------------------------------- #[test] fn decode_html_utf8_bom() { let bytes = b"\xEF\xBB\xBFHello"; let (text, enc) = decode_text_resource(bytes, None, true); assert_eq!(enc, Encoding::Utf8); assert_eq!(text, "Hello"); } #[test] fn decode_html_utf8_from_http_charset() { let ct = ContentType { mime_type: "text/html".to_string(), charset: Some("utf-8".to_string()), }; let bytes = b"Hello"; let (text, enc) = decode_text_resource(bytes, Some(&ct), true); assert_eq!(enc, Encoding::Utf8); assert_eq!(text, "Hello"); } #[test] fn decode_html_meta_charset() { let html = b"Hello"; let (text, enc) = decode_text_resource(html, None, true); assert_eq!(enc, Encoding::Utf8); assert!(text.contains("Hello")); } #[test] fn decode_html_default_windows_1252() { let bytes = b"Hello"; let (text, enc) = decode_text_resource(bytes, None, true); assert_eq!(enc, Encoding::Windows1252); assert!(text.contains("Hello")); } #[test] fn decode_html_windows_1252_special_chars() { // \x93 and \x94 are left/right double quotation marks in Windows-1252 let bytes = b"\x93Hello\x94"; let (text, enc) = decode_text_resource(bytes, None, true); assert_eq!(enc, Encoding::Windows1252); assert!(text.contains('\u{201C}')); // left double quote assert!(text.contains('\u{201D}')); // right double quote } #[test] fn decode_html_bom_beats_http_charset() { let ct = ContentType { mime_type: "text/html".to_string(), charset: Some("windows-1252".to_string()), }; let mut bytes = vec![0xEF, 0xBB, 0xBF]; bytes.extend_from_slice(b"Hello"); let (text, enc) = decode_text_resource(&bytes, Some(&ct), true); assert_eq!(enc, Encoding::Utf8); assert_eq!(text, "Hello"); } // ----------------------------------------------------------------------- // Text decoding — non-HTML (CSS, etc.) // ----------------------------------------------------------------------- #[test] fn decode_css_utf8_default() { let bytes = b"body { color: red; }"; let (text, enc) = decode_text_resource(bytes, None, false); assert_eq!(enc, Encoding::Utf8); assert_eq!(text, "body { color: red; }"); } #[test] fn decode_css_bom_utf8() { let bytes = b"\xEF\xBB\xBFbody { color: red; }"; let (text, enc) = decode_text_resource(bytes, None, false); assert_eq!(enc, Encoding::Utf8); assert_eq!(text, "body { color: red; }"); } #[test] fn decode_css_http_charset() { let ct = ContentType { mime_type: "text/css".to_string(), charset: Some("utf-8".to_string()), }; let bytes = b"body { color: red; }"; let (text, enc) = decode_text_resource(bytes, Some(&ct), false); assert_eq!(enc, Encoding::Utf8); assert_eq!(text, "body { color: red; }"); } // ----------------------------------------------------------------------- // BOM handling // ----------------------------------------------------------------------- #[test] fn decode_with_bom_strips_utf8_bom() { let bytes = b"\xEF\xBB\xBFHello"; let text = decode_with_bom_handling(bytes, Encoding::Utf8); assert_eq!(text, "Hello"); } #[test] fn decode_without_bom_passes_through() { let bytes = b"Hello"; let text = decode_with_bom_handling(bytes, Encoding::Utf8); assert_eq!(text, "Hello"); } #[test] fn decode_with_utf16le_bom() { let bytes = b"\xFF\xFEH\x00e\x00l\x00l\x00o\x00"; let text = decode_with_bom_handling(bytes, Encoding::Utf16Le); assert_eq!(text, "Hello"); } #[test] fn decode_with_utf16be_bom() { let bytes = b"\xFE\xFF\x00H\x00e\x00l\x00l\x00o"; let text = decode_with_bom_handling(bytes, Encoding::Utf16Be); assert_eq!(text, "Hello"); } // ----------------------------------------------------------------------- // ResourceLoader construction // ----------------------------------------------------------------------- #[test] fn resource_loader_new() { let _loader = ResourceLoader::new(); } #[test] fn resource_loader_default() { let _loader = ResourceLoader::default(); } // ----------------------------------------------------------------------- // URL resolution // ----------------------------------------------------------------------- #[test] fn fetch_url_invalid_url_error() { let mut loader = ResourceLoader::new(); let result = loader.fetch_url("not a url at all", None); assert!(matches!(result, Err(LoadError::InvalidUrl(_)))); } #[test] fn fetch_url_relative_without_base_errors() { let mut loader = ResourceLoader::new(); let result = loader.fetch_url("/relative/path", None); assert!(matches!(result, Err(LoadError::InvalidUrl(_)))); } #[test] fn fetch_url_relative_with_base_resolves() { let mut loader = ResourceLoader::new(); let base = Url::parse("http://example.com/page").unwrap(); // This will fail since we can't actually connect in tests, // but the URL resolution itself should work (it won't be InvalidUrl). let result = loader.fetch_url("/style.css", Some(&base)); assert!(result.is_err()); // The error should NOT be InvalidUrl — the URL resolved successfully. assert!(!matches!(result, Err(LoadError::InvalidUrl(_)))); } // ----------------------------------------------------------------------- // Data URL loading // ----------------------------------------------------------------------- #[test] fn data_url_plain_text() { let mut loader = ResourceLoader::new(); let result = loader.fetch_url("data:text/plain,Hello%20World", None); assert!(result.is_ok()); match result.unwrap() { Resource::Other { data, mime_type, .. } => { assert_eq!(mime_type, "text/plain"); assert_eq!(String::from_utf8(data).unwrap(), "Hello World"); } other => panic!("expected Other, got {:?}", other), } } #[test] fn data_url_html() { let mut loader = ResourceLoader::new(); let result = loader.fetch_url("data:text/html,

Hello

", None); assert!(result.is_ok()); match result.unwrap() { Resource::Html { text, .. } => { assert_eq!(text, "

Hello

"); } other => panic!("expected Html, got {:?}", other), } } #[test] fn data_url_css() { let mut loader = ResourceLoader::new(); let result = loader.fetch_url("data:text/css,body{color:red}", None); assert!(result.is_ok()); match result.unwrap() { Resource::Css { text, .. } => { assert_eq!(text, "body{color:red}"); } other => panic!("expected Css, got {:?}", other), } } #[test] fn data_url_image() { let mut loader = ResourceLoader::new(); let result = loader.fetch_url("data:image/png;base64,/wCq", None); assert!(result.is_ok()); match result.unwrap() { Resource::Image { data, mime_type, .. } => { assert_eq!(mime_type, "image/png"); assert_eq!(data, vec![0xFF, 0x00, 0xAA]); } other => panic!("expected Image, got {:?}", other), } } #[test] fn data_url_base64() { let mut loader = ResourceLoader::new(); let result = loader.fetch_url("data:text/plain;base64,SGVsbG8=", None); assert!(result.is_ok()); match result.unwrap() { Resource::Other { data, .. } => { assert_eq!(String::from_utf8(data).unwrap(), "Hello"); } other => panic!("expected Other, got {:?}", other), } } #[test] fn data_url_empty() { let mut loader = ResourceLoader::new(); let result = loader.fetch_url("data:,", None); assert!(result.is_ok()); } #[test] fn data_url_via_fetch_method() { let mut loader = ResourceLoader::new(); let url = Url::parse("data:text/plain,Hello").unwrap(); let result = loader.fetch(&url); assert!(result.is_ok()); match result.unwrap() { Resource::Other { data, .. } => { assert_eq!(String::from_utf8(data).unwrap(), "Hello"); } other => panic!("expected Other, got {:?}", other), } } #[test] fn data_url_invalid() { let mut loader = ResourceLoader::new(); let result = loader.fetch_url("data:text/plain", None); assert!(matches!(result, Err(LoadError::InvalidUrl(_)))); } #[test] fn data_url_binary() { let mut loader = ResourceLoader::new(); let result = loader.fetch_url("data:application/octet-stream;base64,/wCq", None); assert!(result.is_ok()); match result.unwrap() { Resource::Other { data, mime_type, .. } => { assert_eq!(mime_type, "application/octet-stream"); assert_eq!(data, vec![0xFF, 0x00, 0xAA]); } other => panic!("expected Other, got {:?}", other), } } // ----------------------------------------------------------------------- // about: URL loading // ----------------------------------------------------------------------- #[test] fn about_blank_via_fetch_url() { let mut loader = ResourceLoader::new(); let result = loader.fetch_url("about:blank", None); assert!(result.is_ok()); match result.unwrap() { Resource::Html { text, encoding, base_url, .. } => { assert_eq!(text, ABOUT_BLANK_HTML); assert_eq!(encoding, Encoding::Utf8); assert_eq!(base_url.scheme(), "about"); } other => panic!("expected Html, got {:?}", other), } } #[test] fn about_blank_via_fetch() { let mut loader = ResourceLoader::new(); let url = Url::parse("about:blank").unwrap(); let result = loader.fetch(&url); assert!(result.is_ok()); match result.unwrap() { Resource::Html { text, encoding, base_url, .. } => { assert_eq!(text, ABOUT_BLANK_HTML); assert_eq!(encoding, Encoding::Utf8); assert_eq!(base_url.scheme(), "about"); } other => panic!("expected Html, got {:?}", other), } } #[test] fn about_blank_dom_structure() { let doc = we_html::parse_html(ABOUT_BLANK_HTML); // Find the element under the document root. let html = doc .children(doc.root()) .find(|&n| doc.tag_name(n) == Some("html")); assert!(html.is_some(), "document should have an element"); let html = html.unwrap(); // The DOM should have html > head + body structure. let children: Vec<_> = doc .children(html) .filter(|&n| doc.tag_name(n).is_some()) .collect(); assert_eq!(children.len(), 2); assert_eq!(doc.tag_name(children[0]).unwrap(), "head"); assert_eq!(doc.tag_name(children[1]).unwrap(), "body"); // Body should have no child elements. let body_children: Vec<_> = doc .children(children[1]) .filter(|&n| doc.tag_name(n).is_some()) .collect(); assert!(body_children.is_empty()); } #[test] fn about_unsupported_url() { let mut loader = ResourceLoader::new(); let result = loader.fetch_url("about:invalid", None); assert!(matches!(result, Err(LoadError::InvalidUrl(_)))); } }