we (web engine): Experimental web browser project to understand the limits of Claude
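//! Resource loader: fetch URLs and decode their content.
//!
//! Brings together `net` (HTTP client), `encoding` (charset detection and decoding),
//! and `url` (URL parsing and resolution) into a single `ResourceLoader` that the
//! browser uses to load web pages and subresources. Text resources are decoded to
//! strings here; image resources are returned as raw, still-encoded bytes tagged with
//! their MIME type, so decoding them with `image` is left to the caller.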
6
7use std::fmt;
8
9use we_encoding::sniff::sniff_encoding;
10use we_encoding::Encoding;
11use we_net::client::{ClientError, HttpClient};
12use we_net::http::ContentType;
13use we_url::data_url::{is_data_url, parse_data_url};
14use we_url::Url;
15
16// ---------------------------------------------------------------------------
17// Error type
18// ---------------------------------------------------------------------------
19
20/// Errors that can occur during resource loading.
21#[derive(Debug)]
22pub enum LoadError {
23 /// URL parsing failed.
24 InvalidUrl(String),
25 /// Network or HTTP error from the underlying client.
26 Network(ClientError),
27 /// HTTP response indicated an error status.
28 HttpStatus { status: u16, reason: String },
29 /// Encoding or decoding error.
30 Encoding(String),
31}
32
33impl fmt::Display for LoadError {
34 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
35 match self {
36 Self::InvalidUrl(s) => write!(f, "invalid URL: {s}"),
37 Self::Network(e) => write!(f, "network error: {e}"),
38 Self::HttpStatus { status, reason } => {
39 write!(f, "HTTP {status} {reason}")
40 }
41 Self::Encoding(s) => write!(f, "encoding error: {s}"),
42 }
43 }
44}
45
46impl From<ClientError> for LoadError {
47 fn from(e: ClientError) -> Self {
48 Self::Network(e)
49 }
50}
51
52// ---------------------------------------------------------------------------
53// Resource types
54// ---------------------------------------------------------------------------
55
56/// A loaded resource with its decoded content and metadata.
57#[derive(Debug)]
58pub enum Resource {
59 /// An HTML document.
60 Html {
61 text: String,
62 base_url: Url,
63 encoding: Encoding,
64 },
65 /// A CSS stylesheet.
66 Css { text: String, url: Url },
67 /// A decoded image.
68 Image {
69 data: Vec<u8>,
70 mime_type: String,
71 url: Url,
72 },
73 /// Any other resource type (binary).
74 Other {
75 data: Vec<u8>,
76 mime_type: String,
77 url: Url,
78 },
79}
80
81// ---------------------------------------------------------------------------
82// ResourceLoader
83// ---------------------------------------------------------------------------
84
85/// Loads resources over HTTP/HTTPS with encoding detection and content-type handling.
86pub struct ResourceLoader {
87 client: HttpClient,
88}
89
90impl ResourceLoader {
91 /// Create a new resource loader with default settings.
92 pub fn new() -> Self {
93 Self {
94 client: HttpClient::new(),
95 }
96 }
97
98 /// Fetch a resource at the given URL.
99 ///
100 /// Determines the resource type from the HTTP Content-Type header, decodes
101 /// text resources using the appropriate character encoding (per WHATWG spec),
102 /// and returns the result as a typed `Resource`.
103 ///
104 /// Handles `data:` URLs locally without network access.
105 pub fn fetch(&mut self, url: &Url) -> Result<Resource, LoadError> {
106 // Handle data: URLs without network fetch.
107 if url.scheme() == "data" {
108 return fetch_data_url(&url.serialize());
109 }
110
111 let response = self.client.get(url)?;
112
113 // Check for HTTP error status codes
114 if response.status_code >= 400 {
115 return Err(LoadError::HttpStatus {
116 status: response.status_code,
117 reason: response.reason.clone(),
118 });
119 }
120
121 let content_type = response.content_type();
122 let mime = content_type
123 .as_ref()
124 .map(|ct| ct.mime_type.as_str())
125 .unwrap_or("application/octet-stream");
126
127 match classify_mime(mime) {
128 MimeClass::Html => {
129 let (text, encoding) =
130 decode_text_resource(&response.body, content_type.as_ref(), true);
131 Ok(Resource::Html {
132 text,
133 base_url: url.clone(),
134 encoding,
135 })
136 }
137 MimeClass::Css => {
138 let (text, _encoding) =
139 decode_text_resource(&response.body, content_type.as_ref(), false);
140 Ok(Resource::Css {
141 text,
142 url: url.clone(),
143 })
144 }
145 MimeClass::Image => Ok(Resource::Image {
146 data: response.body,
147 mime_type: mime.to_string(),
148 url: url.clone(),
149 }),
150 MimeClass::Other => {
151 // Check if it's a text type we should decode
152 if mime.starts_with("text/") {
153 let (text, _encoding) =
154 decode_text_resource(&response.body, content_type.as_ref(), false);
155 Ok(Resource::Other {
156 data: text.into_bytes(),
157 mime_type: mime.to_string(),
158 url: url.clone(),
159 })
160 } else {
161 Ok(Resource::Other {
162 data: response.body,
163 mime_type: mime.to_string(),
164 url: url.clone(),
165 })
166 }
167 }
168 }
169 }
170
171 /// Fetch a URL string, resolving it against an optional base URL.
172 ///
173 /// Handles `data:` URLs locally without network access.
174 pub fn fetch_url(&mut self, url_str: &str, base: Option<&Url>) -> Result<Resource, LoadError> {
175 // Handle data URLs directly — no network fetch needed.
176 if is_data_url(url_str) {
177 return fetch_data_url(url_str);
178 }
179
180 let url = match base {
181 Some(base_url) => Url::parse_with_base(url_str, base_url)
182 .or_else(|_| Url::parse(url_str))
183 .map_err(|_| LoadError::InvalidUrl(url_str.to_string()))?,
184 None => Url::parse(url_str).map_err(|_| LoadError::InvalidUrl(url_str.to_string()))?,
185 };
186 self.fetch(&url)
187 }
188}
189
190impl Default for ResourceLoader {
191 fn default() -> Self {
192 Self::new()
193 }
194}
195
196// ---------------------------------------------------------------------------
197// MIME classification
198// ---------------------------------------------------------------------------
199
/// Coarse category of a resource, derived from its MIME type.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum MimeClass {
    /// `text/html` or `application/xhtml+xml`.
    Html,
    /// `text/css`.
    Css,
    /// One of the recognized image formats.
    Image,
    /// Anything not covered by the other classes.
    Other,
}

/// Map a MIME type (the `type/subtype` essence, without parameters) to a [`MimeClass`].
///
/// The comparison is ASCII case-insensitive because media type tokens are
/// case-insensitive: `Text/HTML` must be treated exactly like `text/html`.
/// Unrecognized types fall into [`MimeClass::Other`].
fn classify_mime(mime: &str) -> MimeClass {
    const HTML_TYPES: &[&str] = &["text/html", "application/xhtml+xml"];
    const CSS_TYPES: &[&str] = &["text/css"];
    const IMAGE_TYPES: &[&str] = &[
        "image/png",
        "image/jpeg",
        "image/gif",
        "image/webp",
        "image/svg+xml",
    ];

    /// True when `mime` equals any entry of `known`, ignoring ASCII case.
    fn is_one_of(mime: &str, known: &[&str]) -> bool {
        known
            .iter()
            .any(|&candidate| mime.eq_ignore_ascii_case(candidate))
    }

    if is_one_of(mime, HTML_TYPES) {
        MimeClass::Html
    } else if is_one_of(mime, CSS_TYPES) {
        MimeClass::Css
    } else if is_one_of(mime, IMAGE_TYPES) {
        MimeClass::Image
    } else {
        MimeClass::Other
    }
}
217
218// ---------------------------------------------------------------------------
219// Text decoding
220// ---------------------------------------------------------------------------
221
222/// Decode a text resource's bytes to a String using WHATWG encoding sniffing.
223///
224/// For HTML resources, uses BOM > HTTP charset > meta prescan > default.
225/// For non-HTML text resources, uses BOM > HTTP charset > default (UTF-8).
226fn decode_text_resource(
227 bytes: &[u8],
228 content_type: Option<&ContentType>,
229 is_html: bool,
230) -> (String, Encoding) {
231 let http_ct_value = content_type.map(|ct| {
232 // Reconstruct a Content-Type header value for the sniffing function
233 match &ct.charset {
234 Some(charset) => format!("{}; charset={}", ct.mime_type, charset),
235 None => ct.mime_type.clone(),
236 }
237 });
238
239 if is_html {
240 // Full WHATWG sniffing: BOM > HTTP > meta prescan > default (Windows-1252)
241 let (encoding, _source) = sniff_encoding(bytes, http_ct_value.as_deref());
242 let text = decode_with_bom_handling(bytes, encoding);
243 (text, encoding)
244 } else {
245 // Non-HTML: BOM > HTTP charset > default (UTF-8)
246 let (bom_enc, after_bom) = we_encoding::bom_sniff(bytes);
247 if let Some(enc) = bom_enc {
248 let text = we_encoding::decode(after_bom, enc);
249 return (text, enc);
250 }
251
252 // Try HTTP charset
253 if let Some(charset) = content_type.and_then(|ct| ct.charset.as_deref()) {
254 if let Some(enc) = we_encoding::lookup(charset) {
255 let text = we_encoding::decode(bytes, enc);
256 return (text, enc);
257 }
258 }
259
260 // Default to UTF-8 for non-HTML text
261 let text = we_encoding::decode(bytes, Encoding::Utf8);
262 (text, Encoding::Utf8)
263 }
264}
265
266/// Decode bytes with BOM handling — strip BOM bytes before decoding.
267fn decode_with_bom_handling(bytes: &[u8], encoding: Encoding) -> String {
268 let (bom_enc, after_bom) = we_encoding::bom_sniff(bytes);
269 if bom_enc.is_some() {
270 // BOM was present — decode the bytes after the BOM
271 we_encoding::decode(after_bom, encoding)
272 } else {
273 we_encoding::decode(bytes, encoding)
274 }
275}
276
277// ---------------------------------------------------------------------------
278// Data URL handling
279// ---------------------------------------------------------------------------
280
281/// Fetch a data URL, decoding its payload and returning the appropriate Resource type.
282fn fetch_data_url(url_str: &str) -> Result<Resource, LoadError> {
283 let parsed = parse_data_url(url_str)
284 .map_err(|e| LoadError::InvalidUrl(format!("data URL error: {e}")))?;
285
286 let mime = &parsed.mime_type;
287
288 // Create a synthetic Url for the resource metadata.
289 let url = Url::parse(url_str).map_err(|_| LoadError::InvalidUrl(url_str.to_string()))?;
290
291 match classify_mime(mime) {
292 MimeClass::Html => {
293 let encoding = charset_to_encoding(parsed.charset.as_deref());
294 let text = we_encoding::decode(&parsed.data, encoding);
295 Ok(Resource::Html {
296 text,
297 base_url: url,
298 encoding,
299 })
300 }
301 MimeClass::Css => {
302 let encoding = charset_to_encoding(parsed.charset.as_deref());
303 let text = we_encoding::decode(&parsed.data, encoding);
304 Ok(Resource::Css { text, url })
305 }
306 MimeClass::Image => Ok(Resource::Image {
307 data: parsed.data,
308 mime_type: mime.to_string(),
309 url,
310 }),
311 MimeClass::Other => {
312 if mime.starts_with("text/") {
313 let encoding = charset_to_encoding(parsed.charset.as_deref());
314 let text = we_encoding::decode(&parsed.data, encoding);
315 Ok(Resource::Other {
316 data: text.into_bytes(),
317 mime_type: mime.to_string(),
318 url,
319 })
320 } else {
321 Ok(Resource::Other {
322 data: parsed.data,
323 mime_type: mime.to_string(),
324 url,
325 })
326 }
327 }
328 }
329}
330
331/// Map a charset name to an Encoding, defaulting to UTF-8.
332fn charset_to_encoding(charset: Option<&str>) -> Encoding {
333 charset
334 .and_then(we_encoding::lookup)
335 .unwrap_or(Encoding::Utf8)
336}
337
338// ---------------------------------------------------------------------------
339// Tests
340// ---------------------------------------------------------------------------
341
#[cfg(test)]
mod tests {
    use super::*;

    // -----------------------------------------------------------------------
    // Shared helpers
    // -----------------------------------------------------------------------

    /// Build a `ContentType` that carries an explicit charset parameter.
    fn content_type_with_charset(mime: &str, charset: &str) -> ContentType {
        ContentType {
            mime_type: mime.to_string(),
            charset: Some(charset.to_string()),
        }
    }

    /// Run `fetch_url` without a base URL on a throwaway loader.
    fn load(url_str: &str) -> Result<Resource, LoadError> {
        ResourceLoader::new().fetch_url(url_str, None)
    }

    /// Unwrap a `Resource::Other` into `(data, mime_type)`, panicking on other variants.
    fn into_other(resource: Resource) -> (Vec<u8>, String) {
        match resource {
            Resource::Other {
                data, mime_type, ..
            } => (data, mime_type),
            other => panic!("expected Other, got {:?}", other),
        }
    }

    /// Unwrap a `Resource::Image` into `(data, mime_type)`, panicking on other variants.
    fn into_image(resource: Resource) -> (Vec<u8>, String) {
        match resource {
            Resource::Image {
                data, mime_type, ..
            } => (data, mime_type),
            other => panic!("expected Image, got {:?}", other),
        }
    }

    /// Unwrap the text of a `Resource::Html`, panicking on other variants.
    fn into_html_text(resource: Resource) -> String {
        match resource {
            Resource::Html { text, .. } => text,
            other => panic!("expected Html, got {:?}", other),
        }
    }

    /// Unwrap the text of a `Resource::Css`, panicking on other variants.
    fn into_css_text(resource: Resource) -> String {
        match resource {
            Resource::Css { text, .. } => text,
            other => panic!("expected Css, got {:?}", other),
        }
    }

    // -----------------------------------------------------------------------
    // LoadError Display
    // -----------------------------------------------------------------------

    #[test]
    fn load_error_display_invalid_url() {
        let rendered = LoadError::InvalidUrl(String::from("bad://url")).to_string();
        assert_eq!(rendered, "invalid URL: bad://url");
    }

    #[test]
    fn load_error_display_http_status() {
        let error = LoadError::HttpStatus {
            status: 404,
            reason: String::from("Not Found"),
        };
        assert_eq!(error.to_string(), "HTTP 404 Not Found");
    }

    #[test]
    fn load_error_display_encoding() {
        let rendered = LoadError::Encoding(String::from("bad charset")).to_string();
        assert_eq!(rendered, "encoding error: bad charset");
    }

    // -----------------------------------------------------------------------
    // MIME classification
    // -----------------------------------------------------------------------

    #[test]
    fn classify_text_html() {
        let class = classify_mime("text/html");
        assert!(matches!(class, MimeClass::Html));
    }

    #[test]
    fn classify_xhtml() {
        let class = classify_mime("application/xhtml+xml");
        assert!(matches!(class, MimeClass::Html));
    }

    #[test]
    fn classify_text_css() {
        let class = classify_mime("text/css");
        assert!(matches!(class, MimeClass::Css));
    }

    #[test]
    fn classify_image_png() {
        let class = classify_mime("image/png");
        assert!(matches!(class, MimeClass::Image));
    }

    #[test]
    fn classify_image_jpeg() {
        let class = classify_mime("image/jpeg");
        assert!(matches!(class, MimeClass::Image));
    }

    #[test]
    fn classify_image_gif() {
        let class = classify_mime("image/gif");
        assert!(matches!(class, MimeClass::Image));
    }

    #[test]
    fn classify_application_json() {
        let class = classify_mime("application/json");
        assert!(matches!(class, MimeClass::Other));
    }

    #[test]
    fn classify_text_plain() {
        let class = classify_mime("text/plain");
        assert!(matches!(class, MimeClass::Other));
    }

    #[test]
    fn classify_octet_stream() {
        let class = classify_mime("application/octet-stream");
        assert!(matches!(class, MimeClass::Other));
    }

    // -----------------------------------------------------------------------
    // Text decoding — HTML
    // -----------------------------------------------------------------------

    #[test]
    fn decode_html_utf8_bom() {
        let (text, enc) = decode_text_resource(b"\xEF\xBB\xBF<html>Hello</html>", None, true);
        assert_eq!(enc, Encoding::Utf8);
        assert_eq!(text, "<html>Hello</html>");
    }

    #[test]
    fn decode_html_utf8_from_http_charset() {
        let header = content_type_with_charset("text/html", "utf-8");
        let (text, enc) = decode_text_resource(b"<html>Hello</html>", Some(&header), true);
        assert_eq!(enc, Encoding::Utf8);
        assert_eq!(text, "<html>Hello</html>");
    }

    #[test]
    fn decode_html_meta_charset() {
        let markup = b"<meta charset=\"utf-8\"><html>Hello</html>";
        let (text, enc) = decode_text_resource(markup, None, true);
        assert_eq!(enc, Encoding::Utf8);
        assert!(text.contains("Hello"));
    }

    #[test]
    fn decode_html_default_windows_1252() {
        let (text, enc) = decode_text_resource(b"<html>Hello</html>", None, true);
        assert_eq!(enc, Encoding::Windows1252);
        assert!(text.contains("Hello"));
    }

    #[test]
    fn decode_html_windows_1252_special_chars() {
        // 0x93 / 0x94 map to the curly double quotes in Windows-1252.
        let (text, enc) = decode_text_resource(b"<html>\x93Hello\x94</html>", None, true);
        assert_eq!(enc, Encoding::Windows1252);
        assert!(text.contains('\u{201C}')); // left double quote
        assert!(text.contains('\u{201D}')); // right double quote
    }

    #[test]
    fn decode_html_bom_beats_http_charset() {
        let header = content_type_with_charset("text/html", "windows-1252");
        let bytes = [&[0xEF, 0xBB, 0xBF][..], &b"<html>Hello</html>"[..]].concat();
        let (text, enc) = decode_text_resource(&bytes, Some(&header), true);
        assert_eq!(enc, Encoding::Utf8);
        assert_eq!(text, "<html>Hello</html>");
    }

    // -----------------------------------------------------------------------
    // Text decoding — non-HTML (CSS, etc.)
    // -----------------------------------------------------------------------

    #[test]
    fn decode_css_utf8_default() {
        let (text, enc) = decode_text_resource(b"body { color: red; }", None, false);
        assert_eq!(enc, Encoding::Utf8);
        assert_eq!(text, "body { color: red; }");
    }

    #[test]
    fn decode_css_bom_utf8() {
        let (text, enc) = decode_text_resource(b"\xEF\xBB\xBFbody { color: red; }", None, false);
        assert_eq!(enc, Encoding::Utf8);
        assert_eq!(text, "body { color: red; }");
    }

    #[test]
    fn decode_css_http_charset() {
        let header = content_type_with_charset("text/css", "utf-8");
        let (text, enc) = decode_text_resource(b"body { color: red; }", Some(&header), false);
        assert_eq!(enc, Encoding::Utf8);
        assert_eq!(text, "body { color: red; }");
    }

    // -----------------------------------------------------------------------
    // BOM handling
    // -----------------------------------------------------------------------

    #[test]
    fn decode_with_bom_strips_utf8_bom() {
        let decoded = decode_with_bom_handling(b"\xEF\xBB\xBFHello", Encoding::Utf8);
        assert_eq!(decoded, "Hello");
    }

    #[test]
    fn decode_without_bom_passes_through() {
        let decoded = decode_with_bom_handling(b"Hello", Encoding::Utf8);
        assert_eq!(decoded, "Hello");
    }

    #[test]
    fn decode_with_utf16le_bom() {
        let input = b"\xFF\xFEH\x00e\x00l\x00l\x00o\x00";
        let decoded = decode_with_bom_handling(input, Encoding::Utf16Le);
        assert_eq!(decoded, "Hello");
    }

    #[test]
    fn decode_with_utf16be_bom() {
        let input = b"\xFE\xFF\x00H\x00e\x00l\x00l\x00o";
        let decoded = decode_with_bom_handling(input, Encoding::Utf16Be);
        assert_eq!(decoded, "Hello");
    }

    // -----------------------------------------------------------------------
    // ResourceLoader construction
    // -----------------------------------------------------------------------

    #[test]
    fn resource_loader_new() {
        let _ = ResourceLoader::new();
    }

    #[test]
    fn resource_loader_default() {
        let _ = ResourceLoader::default();
    }

    // -----------------------------------------------------------------------
    // URL resolution
    // -----------------------------------------------------------------------

    #[test]
    fn fetch_url_invalid_url_error() {
        let outcome = load("not a url at all");
        assert!(matches!(outcome, Err(LoadError::InvalidUrl(_))));
    }

    #[test]
    fn fetch_url_relative_without_base_errors() {
        let outcome = load("/relative/path");
        assert!(matches!(outcome, Err(LoadError::InvalidUrl(_))));
    }

    #[test]
    fn fetch_url_relative_with_base_resolves() {
        let base = Url::parse("http://example.com/page").unwrap();
        // The request itself is expected to fail in the test environment; what matters
        // is that resolution against the base succeeded, i.e. the error is not InvalidUrl.
        let outcome = ResourceLoader::new().fetch_url("/style.css", Some(&base));
        assert!(outcome.is_err());
        assert!(!matches!(outcome, Err(LoadError::InvalidUrl(_))));
    }

    // -----------------------------------------------------------------------
    // Data URL loading
    // -----------------------------------------------------------------------

    #[test]
    fn data_url_plain_text() {
        let outcome = load("data:text/plain,Hello%20World");
        assert!(outcome.is_ok());
        let (data, mime_type) = into_other(outcome.unwrap());
        assert_eq!(mime_type, "text/plain");
        assert_eq!(String::from_utf8(data).unwrap(), "Hello World");
    }

    #[test]
    fn data_url_html() {
        let outcome = load("data:text/html,<h1>Hello</h1>");
        assert!(outcome.is_ok());
        let text = into_html_text(outcome.unwrap());
        assert_eq!(text, "<h1>Hello</h1>");
    }

    #[test]
    fn data_url_css() {
        let outcome = load("data:text/css,body{color:red}");
        assert!(outcome.is_ok());
        let text = into_css_text(outcome.unwrap());
        assert_eq!(text, "body{color:red}");
    }

    #[test]
    fn data_url_image() {
        let outcome = load("data:image/png;base64,/wCq");
        assert!(outcome.is_ok());
        let (data, mime_type) = into_image(outcome.unwrap());
        assert_eq!(mime_type, "image/png");
        assert_eq!(data, vec![0xFF, 0x00, 0xAA]);
    }

    #[test]
    fn data_url_base64() {
        let outcome = load("data:text/plain;base64,SGVsbG8=");
        assert!(outcome.is_ok());
        let (data, _mime_type) = into_other(outcome.unwrap());
        assert_eq!(String::from_utf8(data).unwrap(), "Hello");
    }

    #[test]
    fn data_url_empty() {
        let outcome = load("data:,");
        assert!(outcome.is_ok());
    }

    #[test]
    fn data_url_via_fetch_method() {
        let url = Url::parse("data:text/plain,Hello").unwrap();
        let outcome = ResourceLoader::new().fetch(&url);
        assert!(outcome.is_ok());
        let (data, _mime_type) = into_other(outcome.unwrap());
        assert_eq!(String::from_utf8(data).unwrap(), "Hello");
    }

    #[test]
    fn data_url_invalid() {
        let outcome = load("data:text/plain");
        assert!(matches!(outcome, Err(LoadError::InvalidUrl(_))));
    }

    #[test]
    fn data_url_binary() {
        let outcome = load("data:application/octet-stream;base64,/wCq");
        assert!(outcome.is_ok());
        let (data, mime_type) = into_other(outcome.unwrap());
        assert_eq!(mime_type, "application/octet-stream");
        assert_eq!(data, vec![0xFF, 0x00, 0xAA]);
    }
}