we (web engine): Experimental web browser project to understand the limits of Claude
1//! Data URL parsing per RFC 2397.
2//!
3//! Parses `data:[<mediatype>][;base64],<data>` URLs into their components:
4//! MIME type, optional charset, and decoded payload.
5
/// A parsed data URL.
///
/// Holds the three components that [`parse_data_url`] extracts from a
/// `data:[<mediatype>][;base64],<data>` string.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct DataUrl {
    /// The MIME type (e.g., `text/plain`, `image/png`), ASCII-lowercased.
    /// Falls back to `text/plain` when the URL does not name one.
    pub mime_type: String,
    /// Optional `charset` parameter from the media type, trimmed of
    /// surrounding whitespace but otherwise kept as written in the URL.
    /// `Some("US-ASCII")` when the media type is omitted entirely.
    pub charset: Option<String>,
    /// The decoded payload bytes (base64-decoded or percent-decoded).
    pub data: Vec<u8>,
}
16
/// Errors from parsing a data URL.
///
/// Implements [`core::fmt::Display`] for human-readable messages and
/// [`core::error::Error`] so it composes with `?` and `Box<dyn Error>`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum DataUrlError {
    /// Input does not start with `data:`.
    NotDataUrl,
    /// Missing comma separator between metadata and data.
    MissingComma,
    /// Base64 payload is malformed.
    InvalidBase64,
}

impl core::fmt::Display for DataUrlError {
    /// Writes a short, lowercase description of the failure.
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        let message = match self {
            Self::NotDataUrl => "not a data URL",
            Self::MissingComma => "data URL missing comma separator",
            Self::InvalidBase64 => "invalid base64 in data URL",
        };
        f.write_str(message)
    }
}

// No underlying source error is carried, so the default trait methods suffice.
impl core::error::Error for DataUrlError {}
37
38/// Parse a data URL string into its components.
39///
40/// Format: `data:[<mediatype>][;base64],<data>`
41///
42/// If the media type is omitted, defaults to `text/plain;charset=US-ASCII`.
43/// The data portion is either base64-decoded or percent-decoded depending on
44/// whether `;base64` is present in the metadata.
45pub fn parse_data_url(url: &str) -> Result<DataUrl, DataUrlError> {
46 // Must start with "data:"
47 let rest = url.strip_prefix("data:").ok_or(DataUrlError::NotDataUrl)?;
48
49 // Find the comma that separates metadata from data.
50 let comma_pos = rest.find(',').ok_or(DataUrlError::MissingComma)?;
51
52 let metadata = &rest[..comma_pos];
53 let payload = &rest[comma_pos + 1..];
54
55 // Check for ;base64 flag.
56 let (metadata, is_base64) = if let Some(meta) = metadata.strip_suffix(";base64") {
57 (meta, true)
58 } else {
59 (metadata, false)
60 };
61
62 // Parse MIME type and charset.
63 let (mime_type, charset) = parse_mime_type(metadata);
64
65 // Decode the payload.
66 let data = if is_base64 {
67 base64_decode(payload).map_err(|_| DataUrlError::InvalidBase64)?
68 } else {
69 percent_decode_bytes(payload)
70 };
71
72 Ok(DataUrl {
73 mime_type,
74 charset,
75 data,
76 })
77}
78
/// Returns true if the URL string starts with the `data:` scheme.
///
/// The scheme is compared ASCII case-insensitively, so `DATA:` and `Data:`
/// are also recognised. Everything after the scheme is ignored.
pub fn is_data_url(url: &str) -> bool {
    const SCHEME: &str = "data:";

    // `get` returns `None` for inputs shorter than the scheme or when the cut
    // would split a multi-byte character; neither can be a data URL.
    url.get(..SCHEME.len())
        .is_some_and(|prefix| prefix.eq_ignore_ascii_case(SCHEME))
}
83
84/// Parse the MIME type portion of a data URL's metadata.
85///
86/// Returns (mime_type, optional_charset). If metadata is empty,
87/// defaults to `text/plain` with charset `US-ASCII`.
88fn parse_mime_type(metadata: &str) -> (String, Option<String>) {
89 if metadata.is_empty() {
90 return ("text/plain".to_string(), Some("US-ASCII".to_string()));
91 }
92
93 // Split on ';' to separate MIME type from parameters.
94 let mut parts = metadata.splitn(2, ';');
95 let mime = parts.next().unwrap_or("").trim();
96 let params = parts.next().unwrap_or("");
97
98 let mime_type = if mime.is_empty() {
99 "text/plain".to_string()
100 } else {
101 mime.to_ascii_lowercase()
102 };
103
104 // Extract charset from parameters if present.
105 let charset = extract_charset(params);
106
107 (mime_type, charset)
108}
109
/// Extract `charset=VALUE` from a `;`-separated parameter string.
///
/// Each parameter is trimmed before matching, and the parameter name is
/// compared ASCII case-insensitively (`Charset=`, `CHARSET=` all match).
/// Returns the trimmed value of the first matching parameter, or `None` if
/// no parameter is a charset.
fn extract_charset(params: &str) -> Option<String> {
    const KEY: &str = "charset=";

    params.split(';').map(str::trim).find_map(|param| {
        // `get` is `None` when the parameter is shorter than the key or the
        // cut would split a multi-byte character; neither can match.
        let name = param.get(..KEY.len())?;
        name.eq_ignore_ascii_case(KEY)
            .then(|| param[KEY.len()..].trim().to_string())
    })
}
120
121/// Percent-decode a string into raw bytes.
122fn percent_decode_bytes(input: &str) -> Vec<u8> {
123 let bytes = input.as_bytes();
124 let mut result = Vec::with_capacity(bytes.len());
125 let mut i = 0;
126
127 while i < bytes.len() {
128 if bytes[i] == b'%' && i + 2 < bytes.len() {
129 if let (Some(hi), Some(lo)) = (hex_val(bytes[i + 1]), hex_val(bytes[i + 2])) {
130 result.push(hi << 4 | lo);
131 i += 3;
132 continue;
133 }
134 }
135 result.push(bytes[i]);
136 i += 1;
137 }
138
139 result
140}
141
/// Numeric value (0..=15) of an ASCII hex digit, accepting either letter
/// case; `None` for any other byte.
fn hex_val(b: u8) -> Option<u8> {
    char::from(b)
        .to_digit(16)
        // A base-16 digit is at most 15, so the conversion never fails.
        .and_then(|digit| u8::try_from(digit).ok())
}
150
151// ---------------------------------------------------------------------------
152// Base64 decoder (RFC 4648)
153// ---------------------------------------------------------------------------
154
/// Decode a base64-encoded string (standard alphabet, RFC 4648).
///
/// ASCII whitespace anywhere in the input is ignored. After stripping it, the
/// input must be a whole number of 4-character quartets. `=` padding is
/// accepted only as the last one or two characters of the final quartet.
/// Empty (or all-whitespace) input decodes to an empty vector.
///
/// # Errors
///
/// * [`Base64Error::InvalidLength`] if the stripped length is not a multiple
///   of 4.
/// * [`Base64Error::InvalidCharacter`] if a byte outside the standard
///   alphabet appears where a data character is expected (this includes `=`
///   in the first two positions of a quartet).
/// * [`Base64Error::InvalidPadding`] if `=` is followed by a data character
///   within a quartet, or if padding appears before the final quartet.
pub fn base64_decode(input: &str) -> Result<Vec<u8>, Base64Error> {
    // Strip whitespace.
    let clean: Vec<u8> = input
        .bytes()
        .filter(|b| !b.is_ascii_whitespace())
        .collect();

    if clean.is_empty() {
        return Ok(Vec::new());
    }

    // Length after stripping must be a multiple of 4.
    if !clean.len().is_multiple_of(4) {
        return Err(Base64Error::InvalidLength);
    }

    // `clean` is non-empty and a multiple of 4, so there is at least one
    // quartet and the subtraction cannot underflow.
    let last_quartet = clean.len() / 4 - 1;
    let mut result = Vec::with_capacity(clean.len() / 4 * 3);

    for (index, chunk) in clean.chunks_exact(4).enumerate() {
        // Padding terminates the data, so it is only legal in the last quartet.
        let padding_allowed = index == last_quartet;

        let a = base64_val(chunk[0])?;
        let b = base64_val(chunk[1])?;

        // First byte is always present. The shifts below deliberately let the
        // high bits fall off the `u8`; only the low bits belong to each byte.
        result.push((a << 2) | (b >> 4));

        if chunk[2] == b'=' {
            // Two padding chars: one output byte.
            if chunk[3] != b'=' || !padding_allowed {
                return Err(Base64Error::InvalidPadding);
            }
        } else {
            let c = base64_val(chunk[2])?;
            result.push((b << 4) | (c >> 2));

            if chunk[3] == b'=' {
                // One padding char: two output bytes.
                if !padding_allowed {
                    return Err(Base64Error::InvalidPadding);
                }
            } else {
                let d = base64_val(chunk[3])?;
                result.push((c << 6) | d);
            }
        }
    }

    Ok(result)
}

/// Base64 decoding error.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Base64Error {
    /// Invalid character in input; carries the offending byte.
    InvalidCharacter(u8),
    /// Input length (after whitespace removal) is not a multiple of 4.
    InvalidLength,
    /// Padding is malformed or appears before the end of the input.
    InvalidPadding,
}

impl core::fmt::Display for Base64Error {
    /// Writes a short description; the invalid byte is shown as two hex digits.
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        match self {
            Self::InvalidCharacter(c) => write!(f, "invalid base64 character: 0x{c:02X}"),
            Self::InvalidLength => write!(f, "invalid base64 length"),
            Self::InvalidPadding => write!(f, "invalid base64 padding"),
        }
    }
}

// No underlying source error is carried, so the default trait methods suffice.
impl core::error::Error for Base64Error {}

/// Map one character of the standard base64 alphabet to its 6-bit value.
///
/// Returns [`Base64Error::InvalidCharacter`] for any other byte, including `=`.
fn base64_val(b: u8) -> Result<u8, Base64Error> {
    match b {
        b'A'..=b'Z' => Ok(b - b'A'),
        b'a'..=b'z' => Ok(b - b'a' + 26),
        b'0'..=b'9' => Ok(b - b'0' + 52),
        b'+' => Ok(62),
        b'/' => Ok(63),
        _ => Err(Base64Error::InvalidCharacter(b)),
    }
}
233
234// ---------------------------------------------------------------------------
235// Tests
236// ---------------------------------------------------------------------------
237
/// Unit tests for the base64 decoder, the data URL parser, percent-decoding
/// and MIME metadata parsing.
#[cfg(test)]
mod tests {
    use super::*;

    // -----------------------------------------------------------------------
    // Base64 decoding
    // -----------------------------------------------------------------------

    /// Empty input decodes to no bytes rather than an error.
    #[test]
    fn base64_empty() {
        assert_eq!(base64_decode("").unwrap(), b"");
    }

    /// A 5-byte payload ends in a single `=` pad.
    #[test]
    fn base64_hello() {
        assert_eq!(base64_decode("SGVsbG8=").unwrap(), b"Hello");
    }

    /// Multi-quartet input with one trailing pad.
    #[test]
    fn base64_hello_world() {
        assert_eq!(base64_decode("SGVsbG8gV29ybGQ=").unwrap(), b"Hello World");
    }

    /// A payload whose length is a multiple of 3 needs no padding.
    #[test]
    fn base64_no_padding() {
        assert_eq!(base64_decode("YWJj").unwrap(), b"abc");
    }

    /// One `=` pad yields two bytes from the final quartet.
    #[test]
    fn base64_one_pad() {
        assert_eq!(base64_decode("YWI=").unwrap(), b"ab");
    }

    /// Two `=` pads yield one byte from the final quartet.
    #[test]
    fn base64_two_pad() {
        assert_eq!(base64_decode("YQ==").unwrap(), b"a");
    }

    /// ASCII whitespace inside the input is skipped before decoding.
    #[test]
    fn base64_with_whitespace() {
        assert_eq!(base64_decode("SGVs\nbG8=").unwrap(), b"Hello");
    }

    /// Every character of the alphabet is accepted.
    #[test]
    fn base64_all_chars() {
        // The full 64-character standard alphabet in value order; 64 sextets
        // decode to 48 bytes.
        let encoded = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
        let decoded = base64_decode(encoded).unwrap();
        assert_eq!(decoded.len(), 48);
        // First byte: A(0)<<2 | B(1)>>4 = 0
        assert_eq!(decoded[0], 0x00);
    }

    /// A byte outside the alphabet is reported together with its value.
    #[test]
    fn base64_invalid_char() {
        assert!(matches!(
            base64_decode("SGV!bG8="),
            Err(Base64Error::InvalidCharacter(b'!'))
        ));
    }

    /// Input whose length is not a multiple of 4 is rejected.
    #[test]
    fn base64_invalid_length() {
        assert!(matches!(
            base64_decode("SGVsb"),
            Err(Base64Error::InvalidLength)
        ));
    }

    /// A data character after `=` within a quartet is a padding error.
    #[test]
    fn base64_invalid_padding() {
        assert!(matches!(
            base64_decode("SG=b"),
            Err(Base64Error::InvalidPadding)
        ));
    }

    /// Non-text bytes round-trip, including the `/` alphabet character.
    #[test]
    fn base64_binary_data() {
        // Raw bytes [0xFF, 0x00, 0xAA]
        assert_eq!(base64_decode("/wCq").unwrap(), vec![0xFF, 0x00, 0xAA]);
    }

    // -----------------------------------------------------------------------
    // Data URL parsing
    // -----------------------------------------------------------------------

    /// Omitted media type falls back to `text/plain;charset=US-ASCII`, and the
    /// payload is percent-decoded.
    #[test]
    fn data_url_plain_text() {
        let result = parse_data_url("data:,Hello%20World").unwrap();
        assert_eq!(result.mime_type, "text/plain");
        assert_eq!(result.charset, Some("US-ASCII".to_string()));
        assert_eq!(result.data, b"Hello World");
    }

    /// An explicit MIME type without parameters has no charset.
    #[test]
    fn data_url_explicit_mime() {
        let result = parse_data_url("data:text/html,<h1>Hello</h1>").unwrap();
        assert_eq!(result.mime_type, "text/html");
        assert_eq!(result.charset, None);
        assert_eq!(result.data, b"<h1>Hello</h1>");
    }

    /// The `charset` parameter is extracted from the media type.
    #[test]
    fn data_url_with_charset() {
        let result = parse_data_url("data:text/plain;charset=utf-8,Hello").unwrap();
        assert_eq!(result.mime_type, "text/plain");
        assert_eq!(result.charset, Some("utf-8".to_string()));
        assert_eq!(result.data, b"Hello");
    }

    /// A `;base64` marker switches the payload to base64 decoding.
    #[test]
    fn data_url_base64() {
        let result = parse_data_url("data:text/plain;base64,SGVsbG8=").unwrap();
        assert_eq!(result.mime_type, "text/plain");
        assert_eq!(result.data, b"Hello");
    }

    /// Binary payloads survive base64 decoding unchanged.
    #[test]
    fn data_url_base64_image() {
        // Minimal data: 3 bytes as base64.
        let result = parse_data_url("data:image/png;base64,/wCq").unwrap();
        assert_eq!(result.mime_type, "image/png");
        assert_eq!(result.data, vec![0xFF, 0x00, 0xAA]);
    }

    /// A charset parameter and the `;base64` marker can be combined.
    #[test]
    fn data_url_base64_with_charset() {
        let result = parse_data_url("data:text/plain;charset=utf-8;base64,SGVsbG8=").unwrap();
        assert_eq!(result.mime_type, "text/plain");
        assert_eq!(result.charset, Some("utf-8".to_string()));
        assert_eq!(result.data, b"Hello");
    }

    /// The shortest valid data URL: no metadata and no payload.
    #[test]
    fn data_url_empty_data() {
        let result = parse_data_url("data:,").unwrap();
        assert_eq!(result.mime_type, "text/plain");
        assert_eq!(result.data, b"");
    }

    /// An empty base64 payload is valid and decodes to no bytes.
    #[test]
    fn data_url_empty_base64() {
        let result = parse_data_url("data:;base64,").unwrap();
        assert_eq!(result.mime_type, "text/plain");
        assert_eq!(result.data, b"");
    }

    /// URLs with another scheme are rejected.
    #[test]
    fn data_url_not_data() {
        assert!(matches!(
            parse_data_url("http://example.com"),
            Err(DataUrlError::NotDataUrl)
        ));
    }

    /// A data URL without the `,` separator is rejected.
    #[test]
    fn data_url_missing_comma() {
        assert!(matches!(
            parse_data_url("data:text/plain"),
            Err(DataUrlError::MissingComma)
        ));
    }

    /// Any base64 decoding failure surfaces as `InvalidBase64`.
    #[test]
    fn data_url_invalid_base64() {
        assert!(matches!(
            parse_data_url("data:;base64,!!!"),
            Err(DataUrlError::InvalidBase64)
        ));
    }

    /// A fully percent-encoded payload is decoded byte by byte.
    #[test]
    fn data_url_percent_encoded() {
        let result = parse_data_url("data:text/plain,%48%65%6C%6C%6F").unwrap();
        assert_eq!(result.data, b"Hello");
    }

    /// The MIME type is normalised to lowercase.
    #[test]
    fn data_url_mime_case_insensitive() {
        let result = parse_data_url("data:Text/HTML,<p>hi</p>").unwrap();
        assert_eq!(result.mime_type, "text/html");
    }

    /// Commas after the first one are part of the payload.
    #[test]
    fn data_url_comma_in_data() {
        // Only the first comma splits metadata from data.
        let result = parse_data_url("data:text/plain,a,b,c").unwrap();
        assert_eq!(result.data, b"a,b,c");
    }

    /// A `data:` URL is recognised.
    #[test]
    fn is_data_url_positive() {
        assert!(is_data_url("data:text/plain,hello"));
    }

    /// A URL with another scheme is not a data URL.
    #[test]
    fn is_data_url_negative() {
        assert!(!is_data_url("http://example.com"));
    }

    // -----------------------------------------------------------------------
    // percent_decode_bytes
    // -----------------------------------------------------------------------

    /// A `%XY` escape is replaced by the byte it encodes.
    #[test]
    fn percent_decode_basic() {
        assert_eq!(percent_decode_bytes("Hello%20World"), b"Hello World");
    }

    /// Input without escapes is returned unchanged.
    #[test]
    fn percent_decode_no_encoding() {
        assert_eq!(percent_decode_bytes("Hello"), b"Hello");
    }

    /// A trailing `%` with no hex digits after it is kept literally.
    #[test]
    fn percent_decode_incomplete_sequence() {
        assert_eq!(percent_decode_bytes("100%"), b"100%");
    }

    /// Escapes may produce arbitrary bytes, including 0x00 and 0xFF.
    #[test]
    fn percent_decode_binary() {
        assert_eq!(percent_decode_bytes("%FF%00"), vec![0xFF, 0x00]);
    }

    // -----------------------------------------------------------------------
    // MIME parsing
    // -----------------------------------------------------------------------

    /// Empty metadata yields the default type and charset.
    #[test]
    fn mime_empty_defaults() {
        let (mime, charset) = parse_mime_type("");
        assert_eq!(mime, "text/plain");
        assert_eq!(charset, Some("US-ASCII".to_string()));
    }

    /// Type and charset are split at the first `;`.
    #[test]
    fn mime_with_charset() {
        let (mime, charset) = parse_mime_type("text/html;charset=utf-8");
        assert_eq!(mime, "text/html");
        assert_eq!(charset, Some("utf-8".to_string()));
    }

    /// A MIME type without parameters has no charset.
    #[test]
    fn mime_no_charset() {
        let (mime, charset) = parse_mime_type("image/png");
        assert_eq!(mime, "image/png");
        assert_eq!(charset, None);
    }
}