we (web engine): Experimental web browser project to understand the limits of Claude
//! Resource loader: fetch URLs and decode their content.
//!
//! Brings together `net` (HTTP client), `encoding` (charset detection and decoding),
//! `url` (URL parsing and resolution), and `image` (image decoding) into a single
//! `ResourceLoader` that the browser uses to load web pages and subresources.
//!
//! Note: the loader itself does not decode images; image responses are returned
//! as their raw body bytes together with the MIME type.
6
7use std::fmt;
8
9use we_encoding::sniff::sniff_encoding;
10use we_encoding::Encoding;
11use we_net::client::{ClientError, HttpClient};
12use we_net::http::ContentType;
13use we_url::Url;
14
15// ---------------------------------------------------------------------------
16// Error type
17// ---------------------------------------------------------------------------
18
19/// Errors that can occur during resource loading.
20#[derive(Debug)]
21pub enum LoadError {
22 /// URL parsing failed.
23 InvalidUrl(String),
24 /// Network or HTTP error from the underlying client.
25 Network(ClientError),
26 /// HTTP response indicated an error status.
27 HttpStatus { status: u16, reason: String },
28 /// Encoding or decoding error.
29 Encoding(String),
30}
31
32impl fmt::Display for LoadError {
33 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
34 match self {
35 Self::InvalidUrl(s) => write!(f, "invalid URL: {s}"),
36 Self::Network(e) => write!(f, "network error: {e}"),
37 Self::HttpStatus { status, reason } => {
38 write!(f, "HTTP {status} {reason}")
39 }
40 Self::Encoding(s) => write!(f, "encoding error: {s}"),
41 }
42 }
43}
44
45impl From<ClientError> for LoadError {
46 fn from(e: ClientError) -> Self {
47 Self::Network(e)
48 }
49}
50
51// ---------------------------------------------------------------------------
52// Resource types
53// ---------------------------------------------------------------------------
54
55/// A loaded resource with its decoded content and metadata.
56#[derive(Debug)]
57pub enum Resource {
58 /// An HTML document.
59 Html {
60 text: String,
61 base_url: Url,
62 encoding: Encoding,
63 },
64 /// A CSS stylesheet.
65 Css { text: String, url: Url },
66 /// A decoded image.
67 Image {
68 data: Vec<u8>,
69 mime_type: String,
70 url: Url,
71 },
72 /// Any other resource type (binary).
73 Other {
74 data: Vec<u8>,
75 mime_type: String,
76 url: Url,
77 },
78}
79
80// ---------------------------------------------------------------------------
81// ResourceLoader
82// ---------------------------------------------------------------------------
83
84/// Loads resources over HTTP/HTTPS with encoding detection and content-type handling.
85pub struct ResourceLoader {
86 client: HttpClient,
87}
88
89impl ResourceLoader {
90 /// Create a new resource loader with default settings.
91 pub fn new() -> Self {
92 Self {
93 client: HttpClient::new(),
94 }
95 }
96
97 /// Fetch a resource at the given URL.
98 ///
99 /// Determines the resource type from the HTTP Content-Type header, decodes
100 /// text resources using the appropriate character encoding (per WHATWG spec),
101 /// and returns the result as a typed `Resource`.
102 pub fn fetch(&mut self, url: &Url) -> Result<Resource, LoadError> {
103 let response = self.client.get(url)?;
104
105 // Check for HTTP error status codes
106 if response.status_code >= 400 {
107 return Err(LoadError::HttpStatus {
108 status: response.status_code,
109 reason: response.reason.clone(),
110 });
111 }
112
113 let content_type = response.content_type();
114 let mime = content_type
115 .as_ref()
116 .map(|ct| ct.mime_type.as_str())
117 .unwrap_or("application/octet-stream");
118
119 match classify_mime(mime) {
120 MimeClass::Html => {
121 let (text, encoding) =
122 decode_text_resource(&response.body, content_type.as_ref(), true);
123 Ok(Resource::Html {
124 text,
125 base_url: url.clone(),
126 encoding,
127 })
128 }
129 MimeClass::Css => {
130 let (text, _encoding) =
131 decode_text_resource(&response.body, content_type.as_ref(), false);
132 Ok(Resource::Css {
133 text,
134 url: url.clone(),
135 })
136 }
137 MimeClass::Image => Ok(Resource::Image {
138 data: response.body,
139 mime_type: mime.to_string(),
140 url: url.clone(),
141 }),
142 MimeClass::Other => {
143 // Check if it's a text type we should decode
144 if mime.starts_with("text/") {
145 let (text, _encoding) =
146 decode_text_resource(&response.body, content_type.as_ref(), false);
147 Ok(Resource::Other {
148 data: text.into_bytes(),
149 mime_type: mime.to_string(),
150 url: url.clone(),
151 })
152 } else {
153 Ok(Resource::Other {
154 data: response.body,
155 mime_type: mime.to_string(),
156 url: url.clone(),
157 })
158 }
159 }
160 }
161 }
162
163 /// Fetch a URL string, resolving it against an optional base URL.
164 pub fn fetch_url(&mut self, url_str: &str, base: Option<&Url>) -> Result<Resource, LoadError> {
165 let url = match base {
166 Some(base_url) => Url::parse_with_base(url_str, base_url)
167 .or_else(|_| Url::parse(url_str))
168 .map_err(|_| LoadError::InvalidUrl(url_str.to_string()))?,
169 None => Url::parse(url_str).map_err(|_| LoadError::InvalidUrl(url_str.to_string()))?,
170 };
171 self.fetch(&url)
172 }
173}
174
175impl Default for ResourceLoader {
176 fn default() -> Self {
177 Self::new()
178 }
179}
180
181// ---------------------------------------------------------------------------
182// MIME classification
183// ---------------------------------------------------------------------------
184
/// Coarse resource category derived from a MIME type.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum MimeClass {
    /// `text/html` or `application/xhtml+xml`.
    Html,
    /// `text/css`.
    Css,
    /// One of the image types recognised by `classify_mime`.
    Image,
    /// Anything that is not HTML, CSS, or a recognised image type.
    Other,
}

/// Map a MIME type string to its [`MimeClass`].
///
/// Only the `type/subtype` essence is considered: anything from the first `;`
/// onwards (parameters such as `charset`) and surrounding whitespace are
/// ignored, and the comparison is ASCII case-insensitive, so `Text/HTML` and
/// `text/html; charset=utf-8` both classify as HTML. Unrecognised or empty
/// input yields [`MimeClass::Other`].
fn classify_mime(mime: &str) -> MimeClass {
    const HTML_TYPES: &[&str] = &["text/html", "application/xhtml+xml"];
    const CSS_TYPES: &[&str] = &["text/css"];
    const IMAGE_TYPES: &[&str] = &[
        "image/png",
        "image/jpeg",
        "image/gif",
        "image/webp",
        "image/svg+xml",
    ];

    /// True when `essence` equals one of `known`, ignoring ASCII case.
    fn is_one_of(essence: &str, known: &[&str]) -> bool {
        known
            .iter()
            .any(|candidate| essence.eq_ignore_ascii_case(candidate))
    }

    // `split` always yields at least one item, so the fallback is never taken;
    // it only avoids an `unwrap`.
    let essence = mime.split(';').next().unwrap_or(mime).trim();

    if is_one_of(essence, HTML_TYPES) {
        MimeClass::Html
    } else if is_one_of(essence, CSS_TYPES) {
        MimeClass::Css
    } else if is_one_of(essence, IMAGE_TYPES) {
        MimeClass::Image
    } else {
        MimeClass::Other
    }
}
202
203// ---------------------------------------------------------------------------
204// Text decoding
205// ---------------------------------------------------------------------------
206
207/// Decode a text resource's bytes to a String using WHATWG encoding sniffing.
208///
209/// For HTML resources, uses BOM > HTTP charset > meta prescan > default.
210/// For non-HTML text resources, uses BOM > HTTP charset > default (UTF-8).
211fn decode_text_resource(
212 bytes: &[u8],
213 content_type: Option<&ContentType>,
214 is_html: bool,
215) -> (String, Encoding) {
216 let http_ct_value = content_type.map(|ct| {
217 // Reconstruct a Content-Type header value for the sniffing function
218 match &ct.charset {
219 Some(charset) => format!("{}; charset={}", ct.mime_type, charset),
220 None => ct.mime_type.clone(),
221 }
222 });
223
224 if is_html {
225 // Full WHATWG sniffing: BOM > HTTP > meta prescan > default (Windows-1252)
226 let (encoding, _source) = sniff_encoding(bytes, http_ct_value.as_deref());
227 let text = decode_with_bom_handling(bytes, encoding);
228 (text, encoding)
229 } else {
230 // Non-HTML: BOM > HTTP charset > default (UTF-8)
231 let (bom_enc, after_bom) = we_encoding::bom_sniff(bytes);
232 if let Some(enc) = bom_enc {
233 let text = we_encoding::decode(after_bom, enc);
234 return (text, enc);
235 }
236
237 // Try HTTP charset
238 if let Some(charset) = content_type.and_then(|ct| ct.charset.as_deref()) {
239 if let Some(enc) = we_encoding::lookup(charset) {
240 let text = we_encoding::decode(bytes, enc);
241 return (text, enc);
242 }
243 }
244
245 // Default to UTF-8 for non-HTML text
246 let text = we_encoding::decode(bytes, Encoding::Utf8);
247 (text, Encoding::Utf8)
248 }
249}
250
251/// Decode bytes with BOM handling — strip BOM bytes before decoding.
252fn decode_with_bom_handling(bytes: &[u8], encoding: Encoding) -> String {
253 let (bom_enc, after_bom) = we_encoding::bom_sniff(bytes);
254 if bom_enc.is_some() {
255 // BOM was present — decode the bytes after the BOM
256 we_encoding::decode(after_bom, encoding)
257 } else {
258 we_encoding::decode(bytes, encoding)
259 }
260}
261
262// ---------------------------------------------------------------------------
263// Tests
264// ---------------------------------------------------------------------------
265
#[cfg(test)]
mod tests {
    use super::*;

    /// Build a `ContentType` that carries an explicit charset parameter.
    fn content_type_with_charset(mime: &str, charset: &str) -> ContentType {
        ContentType {
            mime_type: mime.to_string(),
            charset: Some(charset.to_string()),
        }
    }

    /// Prefix `body` with the three-byte UTF-8 byte-order mark.
    fn with_utf8_bom(body: &[u8]) -> Vec<u8> {
        let mut bytes = vec![0xEF, 0xBB, 0xBF];
        bytes.extend_from_slice(body);
        bytes
    }

    // -----------------------------------------------------------------------
    // LoadError Display
    // -----------------------------------------------------------------------

    #[test]
    fn load_error_display_invalid_url() {
        let rendered = LoadError::InvalidUrl(String::from("bad://url")).to_string();
        assert_eq!(rendered, "invalid URL: bad://url");
    }

    #[test]
    fn load_error_display_http_status() {
        let error = LoadError::HttpStatus {
            status: 404,
            reason: String::from("Not Found"),
        };
        assert_eq!(error.to_string(), "HTTP 404 Not Found");
    }

    #[test]
    fn load_error_display_encoding() {
        let rendered = LoadError::Encoding(String::from("bad charset")).to_string();
        assert_eq!(rendered, "encoding error: bad charset");
    }

    // -----------------------------------------------------------------------
    // MIME classification
    // -----------------------------------------------------------------------

    #[test]
    fn classify_text_html() {
        let class = classify_mime("text/html");
        assert!(matches!(class, MimeClass::Html));
    }

    #[test]
    fn classify_xhtml() {
        let class = classify_mime("application/xhtml+xml");
        assert!(matches!(class, MimeClass::Html));
    }

    #[test]
    fn classify_text_css() {
        let class = classify_mime("text/css");
        assert!(matches!(class, MimeClass::Css));
    }

    #[test]
    fn classify_image_png() {
        let class = classify_mime("image/png");
        assert!(matches!(class, MimeClass::Image));
    }

    #[test]
    fn classify_image_jpeg() {
        let class = classify_mime("image/jpeg");
        assert!(matches!(class, MimeClass::Image));
    }

    #[test]
    fn classify_image_gif() {
        let class = classify_mime("image/gif");
        assert!(matches!(class, MimeClass::Image));
    }

    #[test]
    fn classify_application_json() {
        let class = classify_mime("application/json");
        assert!(matches!(class, MimeClass::Other));
    }

    #[test]
    fn classify_text_plain() {
        let class = classify_mime("text/plain");
        assert!(matches!(class, MimeClass::Other));
    }

    #[test]
    fn classify_octet_stream() {
        let class = classify_mime("application/octet-stream");
        assert!(matches!(class, MimeClass::Other));
    }

    // -----------------------------------------------------------------------
    // Text decoding — HTML
    // -----------------------------------------------------------------------

    #[test]
    fn decode_html_utf8_bom() {
        let input = with_utf8_bom(b"<html>Hello</html>");
        let (text, enc) = decode_text_resource(&input, None, true);
        assert_eq!(enc, Encoding::Utf8);
        assert_eq!(text, "<html>Hello</html>");
    }

    #[test]
    fn decode_html_utf8_from_http_charset() {
        let header = content_type_with_charset("text/html", "utf-8");
        let (text, enc) = decode_text_resource(b"<html>Hello</html>", Some(&header), true);
        assert_eq!(enc, Encoding::Utf8);
        assert_eq!(text, "<html>Hello</html>");
    }

    #[test]
    fn decode_html_meta_charset() {
        let document = b"<meta charset=\"utf-8\"><html>Hello</html>";
        let (text, enc) = decode_text_resource(document, None, true);
        assert_eq!(enc, Encoding::Utf8);
        assert!(text.contains("Hello"));
    }

    #[test]
    fn decode_html_default_windows_1252() {
        // No BOM, no HTTP charset, no <meta>: the HTML default applies.
        let (text, enc) = decode_text_resource(b"<html>Hello</html>", None, true);
        assert_eq!(enc, Encoding::Windows1252);
        assert!(text.contains("Hello"));
    }

    #[test]
    fn decode_html_windows_1252_special_chars() {
        // 0x93 / 0x94 are the left / right double quotation marks in Windows-1252.
        let document = b"<html>\x93Hello\x94</html>";
        let (text, enc) = decode_text_resource(document, None, true);
        assert_eq!(enc, Encoding::Windows1252);
        assert!(text.contains('\u{201C}')); // left double quote
        assert!(text.contains('\u{201D}')); // right double quote
    }

    #[test]
    fn decode_html_bom_beats_http_charset() {
        let header = content_type_with_charset("text/html", "windows-1252");
        let input = with_utf8_bom(b"<html>Hello</html>");
        let (text, enc) = decode_text_resource(&input, Some(&header), true);
        assert_eq!(enc, Encoding::Utf8);
        assert_eq!(text, "<html>Hello</html>");
    }

    // -----------------------------------------------------------------------
    // Text decoding — non-HTML (CSS, etc.)
    // -----------------------------------------------------------------------

    #[test]
    fn decode_css_utf8_default() {
        let (text, enc) = decode_text_resource(b"body { color: red; }", None, false);
        assert_eq!(enc, Encoding::Utf8);
        assert_eq!(text, "body { color: red; }");
    }

    #[test]
    fn decode_css_bom_utf8() {
        let input = with_utf8_bom(b"body { color: red; }");
        let (text, enc) = decode_text_resource(&input, None, false);
        assert_eq!(enc, Encoding::Utf8);
        assert_eq!(text, "body { color: red; }");
    }

    #[test]
    fn decode_css_http_charset() {
        let header = content_type_with_charset("text/css", "utf-8");
        let (text, enc) = decode_text_resource(b"body { color: red; }", Some(&header), false);
        assert_eq!(enc, Encoding::Utf8);
        assert_eq!(text, "body { color: red; }");
    }

    // -----------------------------------------------------------------------
    // BOM handling
    // -----------------------------------------------------------------------

    #[test]
    fn decode_with_bom_strips_utf8_bom() {
        let input = with_utf8_bom(b"Hello");
        assert_eq!(decode_with_bom_handling(&input, Encoding::Utf8), "Hello");
    }

    #[test]
    fn decode_without_bom_passes_through() {
        assert_eq!(decode_with_bom_handling(b"Hello", Encoding::Utf8), "Hello");
    }

    #[test]
    fn decode_with_utf16le_bom() {
        // FF FE, then "Hello" as little-endian 16-bit code units.
        let input = [
            0xFF, 0xFE, b'H', 0, b'e', 0, b'l', 0, b'l', 0, b'o', 0,
        ];
        assert_eq!(decode_with_bom_handling(&input, Encoding::Utf16Le), "Hello");
    }

    #[test]
    fn decode_with_utf16be_bom() {
        // FE FF, then "Hello" as big-endian 16-bit code units.
        let input = [
            0xFE, 0xFF, 0, b'H', 0, b'e', 0, b'l', 0, b'l', 0, b'o',
        ];
        assert_eq!(decode_with_bom_handling(&input, Encoding::Utf16Be), "Hello");
    }

    // -----------------------------------------------------------------------
    // ResourceLoader construction
    // -----------------------------------------------------------------------

    #[test]
    fn resource_loader_new() {
        let _ = ResourceLoader::new();
    }

    #[test]
    fn resource_loader_default() {
        let _ = ResourceLoader::default();
    }

    // -----------------------------------------------------------------------
    // URL resolution
    // -----------------------------------------------------------------------

    #[test]
    fn fetch_url_invalid_url_error() {
        let mut loader = ResourceLoader::new();
        let outcome = loader.fetch_url("not a url at all", None);
        assert!(matches!(outcome, Err(LoadError::InvalidUrl(_))));
    }

    #[test]
    fn fetch_url_relative_without_base_errors() {
        let mut loader = ResourceLoader::new();
        let outcome = loader.fetch_url("/relative/path", None);
        assert!(matches!(outcome, Err(LoadError::InvalidUrl(_))));
    }

    #[test]
    fn fetch_url_relative_with_base_resolves() {
        let base = Url::parse("http://example.com/page").unwrap();
        let mut loader = ResourceLoader::new();
        // The request itself is expected to fail (tests cannot rely on a
        // connection), but resolving the relative URL must have succeeded,
        // so the failure must be something other than `InvalidUrl`.
        let outcome = loader.fetch_url("/style.css", Some(&base));
        let rejected_as_invalid_url = matches!(outcome, Err(LoadError::InvalidUrl(_)));
        assert!(outcome.is_err());
        assert!(!rejected_as_invalid_url);
    }
}