crates/url/src/data_url.rs at js-parser

pierrelf.com / we
fork atom
we (web engine): Experimental web browser project to understand the limits of Claude
fork atom
we / crates / url / src / data_url.rs
at js-parser 487 lines 14 kB view raw
wrap content
pierrelf.com Implement data URL parsing and loading (RFC 2397) 9d ago
9a646b13
  1//! Data URL parsing per RFC 2397.
  2//!
  3//! Parses `data:[<mediatype>][;base64],<data>` URLs into their components:
  4//! MIME type, optional charset, and decoded payload.
  5
  6/// A parsed data URL.
  7#[derive(Debug, Clone, PartialEq, Eq)]
  8pub struct DataUrl {
  9    /// The MIME type (e.g., `text/plain`, `image/png`).
 10    pub mime_type: String,
 11    /// Optional charset parameter from the MIME type.
 12    pub charset: Option<String>,
 13    /// The decoded payload bytes.
 14    pub data: Vec<u8>,
 15}
 16
 17/// Errors from parsing a data URL.
 18#[derive(Debug, Clone, PartialEq, Eq)]
 19pub enum DataUrlError {
 20    /// Input does not start with `data:`.
 21    NotDataUrl,
 22    /// Missing comma separator between metadata and data.
 23    MissingComma,
 24    /// Base64 payload is malformed.
 25    InvalidBase64,
 26}
 27
 28impl core::fmt::Display for DataUrlError {
 29    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
 30        match self {
 31            Self::NotDataUrl => write!(f, "not a data URL"),
 32            Self::MissingComma => write!(f, "data URL missing comma separator"),
 33            Self::InvalidBase64 => write!(f, "invalid base64 in data URL"),
 34        }
 35    }
 36}
 37
 38/// Parse a data URL string into its components.
 39///
 40/// Format: `data:[<mediatype>][;base64],<data>`
 41///
 42/// If the media type is omitted, defaults to `text/plain;charset=US-ASCII`.
 43/// The data portion is either base64-decoded or percent-decoded depending on
 44/// whether `;base64` is present in the metadata.
 45pub fn parse_data_url(url: &str) -> Result<DataUrl, DataUrlError> {
 46    // Must start with "data:"
 47    let rest = url.strip_prefix("data:").ok_or(DataUrlError::NotDataUrl)?;
 48
 49    // Find the comma that separates metadata from data.
 50    let comma_pos = rest.find(',').ok_or(DataUrlError::MissingComma)?;
 51
 52    let metadata = &rest[..comma_pos];
 53    let payload = &rest[comma_pos + 1..];
 54
 55    // Check for ;base64 flag.
 56    let (metadata, is_base64) = if let Some(meta) = metadata.strip_suffix(";base64") {
 57        (meta, true)
 58    } else {
 59        (metadata, false)
 60    };
 61
 62    // Parse MIME type and charset.
 63    let (mime_type, charset) = parse_mime_type(metadata);
 64
 65    // Decode the payload.
 66    let data = if is_base64 {
 67        base64_decode(payload).map_err(|_| DataUrlError::InvalidBase64)?
 68    } else {
 69        percent_decode_bytes(payload)
 70    };
 71
 72    Ok(DataUrl {
 73        mime_type,
 74        charset,
 75        data,
 76    })
 77}
 78
 79/// Returns true if the URL string starts with `data:`.
 80pub fn is_data_url(url: &str) -> bool {
 81    url.starts_with("data:")
 82}
 83
 84/// Parse the MIME type portion of a data URL's metadata.
 85///
 86/// Returns (mime_type, optional_charset). If metadata is empty,
 87/// defaults to `text/plain` with charset `US-ASCII`.
 88fn parse_mime_type(metadata: &str) -> (String, Option<String>) {
 89    if metadata.is_empty() {
 90        return ("text/plain".to_string(), Some("US-ASCII".to_string()));
 91    }
 92
 93    // Split on ';' to separate MIME type from parameters.
 94    let mut parts = metadata.splitn(2, ';');
 95    let mime = parts.next().unwrap_or("").trim();
 96    let params = parts.next().unwrap_or("");
 97
 98    let mime_type = if mime.is_empty() {
 99        "text/plain".to_string()
100    } else {
101        mime.to_ascii_lowercase()
102    };
103
104    // Extract charset from parameters if present.
105    let charset = extract_charset(params);
106
107    (mime_type, charset)
108}
109
/// Extract the value of the `charset` parameter from a `;`-separated
/// parameter string.
///
/// The parameter name is matched ASCII case-insensitively and whitespace
/// around the name and the value is ignored, so `Charset=UTF-8` and
/// `charset = utf-8` are both recognised. The value is returned trimmed but
/// otherwise unchanged. If several `charset` parameters are present, the
/// first one wins. Returns `None` when no `charset` parameter exists.
fn extract_charset(params: &str) -> Option<String> {
    params.split(';').find_map(|param| {
        // Only the first '=' separates the name from the value.
        let (name, value) = param.split_once('=')?;
        name.trim()
            .eq_ignore_ascii_case("charset")
            .then(|| value.trim().to_string())
    })
}
120
121/// Percent-decode a string into raw bytes.
122fn percent_decode_bytes(input: &str) -> Vec<u8> {
123    let bytes = input.as_bytes();
124    let mut result = Vec::with_capacity(bytes.len());
125    let mut i = 0;
126
127    while i < bytes.len() {
128        if bytes[i] == b'%' && i + 2 < bytes.len() {
129            if let (Some(hi), Some(lo)) = (hex_val(bytes[i + 1]), hex_val(bytes[i + 2])) {
130                result.push(hi << 4 | lo);
131                i += 3;
132                continue;
133            }
134        }
135        result.push(bytes[i]);
136        i += 1;
137    }
138
139    result
140}
141
/// Numeric value (0–15) of an ASCII hexadecimal digit in either letter case,
/// or `None` if `b` is not a hexadecimal digit.
fn hex_val(b: u8) -> Option<u8> {
    // `to_digit(16)` only ever yields values below 16, so the cast is lossless.
    char::from(b).to_digit(16).map(|digit| digit as u8)
}
150
151// ---------------------------------------------------------------------------
152// Base64 decoder (RFC 4648)
153// ---------------------------------------------------------------------------
154
/// Decode a base64-encoded string (standard alphabet, RFC 4648).
///
/// ASCII whitespace anywhere in `input` is ignored. After whitespace is
/// removed, the input is processed in groups of four characters. `=` padding
/// is accepted only in the last one or two positions of the **final** group;
/// padding inside an earlier group is rejected rather than silently treated
/// as the end of a concatenated stream.
///
/// # Errors
///
/// - [`Base64Error::InvalidLength`] if the whitespace-stripped length is not
///   a multiple of four.
/// - [`Base64Error::InvalidCharacter`] if a byte outside the standard
///   alphabet appears where a data character is required.
/// - [`Base64Error::InvalidPadding`] if `=` is followed by a non-`=`
///   character within a group, or appears before the final group.
pub fn base64_decode(input: &str) -> Result<Vec<u8>, Base64Error> {
    // Strip whitespace.
    let clean: Vec<u8> = input
        .bytes()
        .filter(|b| !b.is_ascii_whitespace())
        .collect();

    // Any leftover after splitting into groups of four means a bad length.
    let quads = clean.chunks_exact(4);
    if !quads.remainder().is_empty() {
        return Err(Base64Error::InvalidLength);
    }

    let final_index = quads.len().saturating_sub(1);
    let mut result = Vec::with_capacity(clean.len() / 4 * 3);

    for (index, quad) in quads.enumerate() {
        let is_final = index == final_index;

        let a = base64_val(quad[0])?;
        let b = base64_val(quad[1])?;

        // First byte is always present.
        result.push((a << 2) | (b >> 4));

        match (quad[2], quad[3]) {
            // Two padding chars: one output byte, only valid at the very end.
            (b'=', b'=') => {
                if !is_final {
                    return Err(Base64Error::InvalidPadding);
                }
            }
            // A data character may never follow padding inside a group.
            (b'=', _) => return Err(Base64Error::InvalidPadding),
            // One padding char: two output bytes, only valid at the very end.
            (third, b'=') => {
                let c = base64_val(third)?;
                if !is_final {
                    return Err(Base64Error::InvalidPadding);
                }
                result.push((b << 4) | (c >> 2));
            }
            // Full group: three output bytes.
            (third, fourth) => {
                let c = base64_val(third)?;
                result.push((b << 4) | (c >> 2));
                let d = base64_val(fourth)?;
                result.push((c << 6) | d);
            }
        }
    }

    Ok(result)
}

/// Base64 decoding error.
///
/// Implements [`std::error::Error`], so it can be boxed as
/// `Box<dyn std::error::Error>` or propagated with `?` alongside other errors.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Base64Error {
    /// Invalid character in input (carries the offending byte).
    InvalidCharacter(u8),
    /// Input length is not a multiple of 4.
    InvalidLength,
    /// Invalid padding.
    InvalidPadding,
}

impl core::fmt::Display for Base64Error {
    /// Writes a short, lowercase description; the invalid byte is shown in hex.
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        match self {
            Self::InvalidCharacter(c) => write!(f, "invalid base64 character: 0x{c:02X}"),
            Self::InvalidLength => write!(f, "invalid base64 length"),
            Self::InvalidPadding => write!(f, "invalid base64 padding"),
        }
    }
}

// No `source()` override: none of the variants wrap an underlying error.
impl std::error::Error for Base64Error {}

/// Map one character of the standard base64 alphabet to its 6-bit value.
///
/// Returns [`Base64Error::InvalidCharacter`] for any other byte, including `=`.
fn base64_val(b: u8) -> Result<u8, Base64Error> {
    match b {
        b'A'..=b'Z' => Ok(b - b'A'),
        b'a'..=b'z' => Ok(b - b'a' + 26),
        b'0'..=b'9' => Ok(b - b'0' + 52),
        b'+' => Ok(62),
        b'/' => Ok(63),
        _ => Err(Base64Error::InvalidCharacter(b)),
    }
}
233
234// ---------------------------------------------------------------------------
235// Tests
236// ---------------------------------------------------------------------------
237
#[cfg(test)]
mod tests {
    use super::*;

    /// Parse `url`, failing the test if the parser rejects it.
    fn parsed(url: &str) -> DataUrl {
        parse_data_url(url).expect("data URL should parse")
    }

    /// Base64-decode `input`, failing the test if the decoder rejects it.
    fn decoded(input: &str) -> Vec<u8> {
        base64_decode(input).expect("base64 input should decode")
    }

    // -----------------------------------------------------------------------
    // Base64 decoding
    // -----------------------------------------------------------------------

    #[test]
    fn base64_empty() {
        assert_eq!(decoded(""), b"");
    }

    #[test]
    fn base64_hello() {
        assert_eq!(decoded("SGVsbG8="), b"Hello");
    }

    #[test]
    fn base64_hello_world() {
        assert_eq!(decoded("SGVsbG8gV29ybGQ="), b"Hello World");
    }

    #[test]
    fn base64_no_padding() {
        assert_eq!(decoded("YWJj"), b"abc");
    }

    #[test]
    fn base64_one_pad() {
        assert_eq!(decoded("YWI="), b"ab");
    }

    #[test]
    fn base64_two_pad() {
        assert_eq!(decoded("YQ=="), b"a");
    }

    #[test]
    fn base64_with_whitespace() {
        assert_eq!(decoded("SGVs\nbG8="), b"Hello");
    }

    #[test]
    fn base64_all_chars() {
        // Every symbol of the standard alphabet, once each, in value order.
        let alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
        let bytes = decoded(alphabet);
        // 64 symbols form 16 groups of four, each producing three bytes.
        assert_eq!(bytes.len(), 48);
        // First byte: A(0)<<2 | B(1)>>4 = 0
        assert_eq!(bytes[0], 0x00);
    }

    #[test]
    fn base64_invalid_char() {
        assert_eq!(
            base64_decode("SGV!bG8="),
            Err(Base64Error::InvalidCharacter(b'!'))
        );
    }

    #[test]
    fn base64_invalid_length() {
        assert_eq!(base64_decode("SGVsb"), Err(Base64Error::InvalidLength));
    }

    #[test]
    fn base64_invalid_padding() {
        assert_eq!(base64_decode("SG=b"), Err(Base64Error::InvalidPadding));
    }

    #[test]
    fn base64_binary_data() {
        // Raw bytes [0xFF, 0x00, 0xAA]
        assert_eq!(decoded("/wCq"), [0xFF, 0x00, 0xAA]);
    }

    // -----------------------------------------------------------------------
    // Data URL parsing
    // -----------------------------------------------------------------------

    #[test]
    fn data_url_plain_text() {
        let url = parsed("data:,Hello%20World");
        assert_eq!(url.mime_type, "text/plain");
        assert_eq!(url.charset.as_deref(), Some("US-ASCII"));
        assert_eq!(url.data, b"Hello World");
    }

    #[test]
    fn data_url_explicit_mime() {
        let url = parsed("data:text/html,<h1>Hello</h1>");
        assert_eq!(url.mime_type, "text/html");
        assert!(url.charset.is_none());
        assert_eq!(url.data, b"<h1>Hello</h1>");
    }

    #[test]
    fn data_url_with_charset() {
        let url = parsed("data:text/plain;charset=utf-8,Hello");
        assert_eq!(url.mime_type, "text/plain");
        assert_eq!(url.charset.as_deref(), Some("utf-8"));
        assert_eq!(url.data, b"Hello");
    }

    #[test]
    fn data_url_base64() {
        let url = parsed("data:text/plain;base64,SGVsbG8=");
        assert_eq!(url.mime_type, "text/plain");
        assert_eq!(url.data, b"Hello");
    }

    #[test]
    fn data_url_base64_image() {
        // Minimal data: 3 bytes as base64.
        let url = parsed("data:image/png;base64,/wCq");
        assert_eq!(url.mime_type, "image/png");
        assert_eq!(url.data, [0xFF, 0x00, 0xAA]);
    }

    #[test]
    fn data_url_base64_with_charset() {
        let url = parsed("data:text/plain;charset=utf-8;base64,SGVsbG8=");
        assert_eq!(url.mime_type, "text/plain");
        assert_eq!(url.charset.as_deref(), Some("utf-8"));
        assert_eq!(url.data, b"Hello");
    }

    #[test]
    fn data_url_empty_data() {
        let url = parsed("data:,");
        assert_eq!(url.mime_type, "text/plain");
        assert_eq!(url.data, b"");
    }

    #[test]
    fn data_url_empty_base64() {
        let url = parsed("data:;base64,");
        assert_eq!(url.mime_type, "text/plain");
        assert_eq!(url.data, b"");
    }

    #[test]
    fn data_url_not_data() {
        assert_eq!(
            parse_data_url("http://example.com"),
            Err(DataUrlError::NotDataUrl)
        );
    }

    #[test]
    fn data_url_missing_comma() {
        assert_eq!(
            parse_data_url("data:text/plain"),
            Err(DataUrlError::MissingComma)
        );
    }

    #[test]
    fn data_url_invalid_base64() {
        assert_eq!(
            parse_data_url("data:;base64,!!!"),
            Err(DataUrlError::InvalidBase64)
        );
    }

    #[test]
    fn data_url_percent_encoded() {
        let url = parsed("data:text/plain,%48%65%6C%6C%6F");
        assert_eq!(url.data, b"Hello");
    }

    #[test]
    fn data_url_mime_case_insensitive() {
        let url = parsed("data:Text/HTML,<p>hi</p>");
        assert_eq!(url.mime_type, "text/html");
    }

    #[test]
    fn data_url_comma_in_data() {
        // Only the first comma splits metadata from data.
        let url = parsed("data:text/plain,a,b,c");
        assert_eq!(url.data, b"a,b,c");
    }

    #[test]
    fn is_data_url_positive() {
        assert_eq!(is_data_url("data:text/plain,hello"), true);
    }

    #[test]
    fn is_data_url_negative() {
        assert_eq!(is_data_url("http://example.com"), false);
    }

    // -----------------------------------------------------------------------
    // percent_decode_bytes
    // -----------------------------------------------------------------------

    #[test]
    fn percent_decode_basic() {
        let bytes = percent_decode_bytes("Hello%20World");
        assert_eq!(bytes, b"Hello World");
    }

    #[test]
    fn percent_decode_no_encoding() {
        let bytes = percent_decode_bytes("Hello");
        assert_eq!(bytes, b"Hello");
    }

    #[test]
    fn percent_decode_incomplete_sequence() {
        // A trailing '%' with no hex digits is passed through untouched.
        let bytes = percent_decode_bytes("100%");
        assert_eq!(bytes, b"100%");
    }

    #[test]
    fn percent_decode_binary() {
        let bytes = percent_decode_bytes("%FF%00");
        assert_eq!(bytes, [0xFF, 0x00]);
    }

    // -----------------------------------------------------------------------
    // MIME parsing
    // -----------------------------------------------------------------------

    #[test]
    fn mime_empty_defaults() {
        assert_eq!(
            parse_mime_type(""),
            ("text/plain".to_string(), Some("US-ASCII".to_string()))
        );
    }

    #[test]
    fn mime_with_charset() {
        assert_eq!(
            parse_mime_type("text/html;charset=utf-8"),
            ("text/html".to_string(), Some("utf-8".to_string()))
        );
    }

    #[test]
    fn mime_no_charset() {
        assert_eq!(
            parse_mime_type("image/png"),
            ("image/png".to_string(), None)
        );
    }
}