//! Data URL parsing per RFC 2397.
//!
//! Parses `data:[<mediatype>][;base64],<data>` URLs into their components:
//! MIME type, optional charset, and decoded payload.

/// URL scheme prefix that introduces a data URL (matched ASCII case-insensitively).
const DATA_SCHEME: &str = "data:";

/// Metadata suffix that marks the payload as base64 (matched ASCII case-insensitively).
const BASE64_FLAG: &str = ";base64";

/// MIME type assumed when the media type is omitted.
const DEFAULT_MIME_TYPE: &str = "text/plain";

/// Charset assumed when the metadata is entirely empty.
const DEFAULT_CHARSET: &str = "US-ASCII";

/// A parsed data URL.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct DataUrl {
    /// The MIME type (e.g., `text/plain`, `image/png`), lowercased.
    pub mime_type: String,
    /// Optional charset parameter from the MIME type.
    pub charset: Option<String>,
    /// The decoded payload bytes.
    pub data: Vec<u8>,
}

/// Errors from parsing a data URL.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum DataUrlError {
    /// Input does not start with `data:` (compared ASCII case-insensitively).
    NotDataUrl,
    /// Missing comma separator between metadata and data.
    MissingComma,
    /// Base64 payload is malformed.
    InvalidBase64,
}

impl core::fmt::Display for DataUrlError {
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        match self {
            Self::NotDataUrl => write!(f, "not a data URL"),
            Self::MissingComma => write!(f, "data URL missing comma separator"),
            Self::InvalidBase64 => write!(f, "invalid base64 in data URL"),
        }
    }
}

impl std::error::Error for DataUrlError {}

/// Parse a data URL string into its components.
///
/// Format: `data:[<mediatype>][;base64],<data>`
///
/// If the media type is omitted, defaults to `text/plain;charset=US-ASCII`.
/// The data portion is always percent-decoded; when `;base64` terminates the
/// metadata, the percent-decoded bytes are then base64-decoded. The `data:`
/// scheme and the `;base64` flag are both matched ASCII case-insensitively.
///
/// # Errors
///
/// * [`DataUrlError::NotDataUrl`] if the input does not start with `data:`.
/// * [`DataUrlError::MissingComma`] if no `,` follows the scheme.
/// * [`DataUrlError::InvalidBase64`] if `;base64` is present and the payload
///   is not valid base64 after percent-decoding.
pub fn parse_data_url(url: &str) -> Result<DataUrl, DataUrlError> {
    let rest = strip_prefix_ignore_ascii_case(url, DATA_SCHEME).ok_or(DataUrlError::NotDataUrl)?;

    // Only the first comma separates metadata from data; later commas are payload.
    let (metadata, payload) = rest.split_once(',').ok_or(DataUrlError::MissingComma)?;

    let (metadata, is_base64) = match strip_suffix_ignore_ascii_case(metadata, BASE64_FLAG) {
        Some(meta) => (meta, true),
        None => (metadata, false),
    };

    let (mime_type, charset) = parse_mime_type(metadata);

    // Percent-decoding comes first so that escaped base64 characters such as
    // `%3D` (`=`), `%2B` (`+`) and `%2F` (`/`) are accepted.
    let raw = percent_decode_bytes(payload);
    let data = if is_base64 {
        // Every base64 character is ASCII, so non-UTF-8 bytes are invalid base64.
        let text = std::str::from_utf8(&raw).map_err(|_| DataUrlError::InvalidBase64)?;
        base64_decode(text).map_err(|_| DataUrlError::InvalidBase64)?
    } else {
        raw
    };

    Ok(DataUrl {
        mime_type,
        charset,
        data,
    })
}

/// Returns true if the URL string starts with `data:` (ASCII case-insensitive).
pub fn is_data_url(url: &str) -> bool {
    strip_prefix_ignore_ascii_case(url, DATA_SCHEME).is_some()
}

/// Strip `prefix` from the start of `s`, comparing ASCII case-insensitively.
///
/// Returns `None` when `s` is shorter than `prefix`, when the split point is
/// not a character boundary, or when the leading text does not match.
fn strip_prefix_ignore_ascii_case<'a>(s: &'a str, prefix: &str) -> Option<&'a str> {
    let head = s.get(..prefix.len())?;
    head.eq_ignore_ascii_case(prefix)
        .then(|| &s[prefix.len()..])
}

/// Strip `suffix` from the end of `s`, comparing ASCII case-insensitively.
///
/// Returns `None` when `s` is shorter than `suffix`, when the split point is
/// not a character boundary, or when the trailing text does not match.
fn strip_suffix_ignore_ascii_case<'a>(s: &'a str, suffix: &str) -> Option<&'a str> {
    let split = s.len().checked_sub(suffix.len())?;
    let tail = s.get(split..)?;
    tail.eq_ignore_ascii_case(suffix).then(|| &s[..split])
}

/// Parse the MIME type portion of a data URL's metadata.
///
/// Returns (mime_type, optional_charset). If metadata is empty,
/// defaults to `text/plain` with charset `US-ASCII`. If only the type is
/// missing (e.g. `;charset=utf-8`), the type defaults to `text/plain` and the
/// charset is taken from the parameters, if any.
fn parse_mime_type(metadata: &str) -> (String, Option<String>) {
    if metadata.is_empty() {
        return (
            DEFAULT_MIME_TYPE.to_string(),
            Some(DEFAULT_CHARSET.to_string()),
        );
    }

    // Everything before the first ';' is the type; the rest are parameters.
    let (mime, params) = metadata.split_once(';').unwrap_or((metadata, ""));
    let mime = mime.trim();

    let mime_type = if mime.is_empty() {
        DEFAULT_MIME_TYPE.to_string()
    } else {
        mime.to_ascii_lowercase()
    };

    (mime_type, extract_charset(params))
}

/// Extract `charset=VALUE` from a `;`-separated parameter string.
///
/// The parameter name is matched ASCII case-insensitively; the value is
/// returned trimmed but otherwise unchanged. The first match wins.
fn extract_charset(params: &str) -> Option<String> {
    params
        .split(';')
        .filter_map(|param| param.trim().split_once('='))
        .find(|(name, _)| name.eq_ignore_ascii_case("charset"))
        .map(|(_, value)| value.trim().to_string())
}

/// Percent-decode a string into raw bytes.
///
/// A `%` that is not followed by two hexadecimal digits is copied through
/// unchanged rather than treated as an error.
fn percent_decode_bytes(input: &str) -> Vec<u8> {
    let bytes = input.as_bytes();
    let mut result = Vec::with_capacity(bytes.len());
    let mut i = 0;

    while i < bytes.len() {
        // The slice pattern only matches when two bytes follow the `%`.
        if let [b'%', hi, lo, ..] = &bytes[i..] {
            if let (Some(hi), Some(lo)) = (hex_val(*hi), hex_val(*lo)) {
                result.push((hi << 4) | lo);
                i += 3;
                continue;
            }
        }
        result.push(bytes[i]);
        i += 1;
    }

    result
}

/// Numeric value of an ASCII hexadecimal digit, or `None` for any other byte.
fn hex_val(b: u8) -> Option<u8> {
    match b {
        b'0'..=b'9' => Some(b - b'0'),
        b'a'..=b'f' => Some(b - b'a' + 10),
        b'A'..=b'F' => Some(b - b'A' + 10),
        _ => None,
    }
}

// ---------------------------------------------------------------------------
// Base64 decoder (RFC 4648)
// ---------------------------------------------------------------------------

/// Decode a base64-encoded string (standard alphabet, RFC 4648).
///
/// Ignores ASCII whitespace. Handles padding with `=`, which is only accepted
/// in the final 4-character group.
///
/// # Errors
///
/// * [`Base64Error::InvalidLength`] if the input length, after whitespace is
///   removed, is not a multiple of 4.
/// * [`Base64Error::InvalidCharacter`] if a byte outside the alphabet appears
///   where a data character is required.
/// * [`Base64Error::InvalidPadding`] if `=` is followed by a data character
///   within a group, or appears in a group other than the last.
pub fn base64_decode(input: &str) -> Result<Vec<u8>, Base64Error> {
    // Strip whitespace.
    let clean: Vec<u8> = input
        .bytes()
        .filter(|b| !b.is_ascii_whitespace())
        .collect();

    if clean.is_empty() {
        return Ok(Vec::new());
    }

    // Length after stripping must be a multiple of 4.
    let groups = clean.chunks_exact(4);
    if !groups.remainder().is_empty() {
        return Err(Base64Error::InvalidLength);
    }

    // `clean` is non-empty and a multiple of 4, so there is at least one group.
    let last_group = groups.len() - 1;
    let mut result = Vec::with_capacity(clean.len() / 4 * 3);

    for (index, chunk) in groups.enumerate() {
        let a = base64_val(chunk[0])?;
        let b = base64_val(chunk[1])?;

        // Padding terminates the data, so it may only occur in the last group.
        if index != last_group && (chunk[2] == b'=' || chunk[3] == b'=') {
            return Err(Base64Error::InvalidPadding);
        }

        // First byte is always present.
        result.push((a << 2) | (b >> 4));

        match (chunk[2], chunk[3]) {
            // Two padding chars — one output byte.
            (b'=', b'=') => {}
            // A data character after padding is malformed.
            (b'=', _) => return Err(Base64Error::InvalidPadding),
            (c, d) => {
                let c = base64_val(c)?;
                result.push((b << 4) | (c >> 2));

                // One padding char — two output bytes; otherwise three.
                if d != b'=' {
                    let d = base64_val(d)?;
                    result.push((c << 6) | d);
                }
            }
        }
    }

    Ok(result)
}

/// Base64 decoding error.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Base64Error {
    /// Invalid character in input.
    InvalidCharacter(u8),
    /// Input length is not a multiple of 4.
    InvalidLength,
    /// Invalid padding.
    InvalidPadding,
}

impl core::fmt::Display for Base64Error {
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        match self {
            Self::InvalidCharacter(c) => write!(f, "invalid base64 character: 0x{c:02X}"),
            Self::InvalidLength => write!(f, "invalid base64 length"),
            Self::InvalidPadding => write!(f, "invalid base64 padding"),
        }
    }
}

impl std::error::Error for Base64Error {}

/// Six-bit value of a standard-alphabet base64 character.
///
/// Returns [`Base64Error::InvalidCharacter`] for any other byte, including `=`.
fn base64_val(b: u8) -> Result<u8, Base64Error> {
    match b {
        b'A'..=b'Z' => Ok(b - b'A'),
        b'a'..=b'z' => Ok(b - b'a' + 26),
        b'0'..=b'9' => Ok(b - b'0' + 52),
        b'+' => Ok(62),
        b'/' => Ok(63),
        _ => Err(Base64Error::InvalidCharacter(b)),
    }
}

// ---------------------------------------------------------------------------
// Tests
// ---------------------------------------------------------------------------

#[cfg(test)]
mod tests {
    use super::*;

    // -----------------------------------------------------------------------
    // Base64 decoding
    // -----------------------------------------------------------------------

    #[test]
    fn base64_empty() {
        assert_eq!(base64_decode("").unwrap(), b"");
    }

    #[test]
    fn base64_hello() {
        assert_eq!(base64_decode("SGVsbG8=").unwrap(), b"Hello");
    }

    #[test]
    fn base64_hello_world() {
        assert_eq!(base64_decode("SGVsbG8gV29ybGQ=").unwrap(), b"Hello World");
    }

    #[test]
    fn base64_no_padding() {
        assert_eq!(base64_decode("YWJj").unwrap(), b"abc");
    }

    #[test]
    fn base64_one_pad() {
        assert_eq!(base64_decode("YWI=").unwrap(), b"ab");
    }

    #[test]
    fn base64_two_pad() {
        assert_eq!(base64_decode("YQ==").unwrap(), b"a");
    }

    #[test]
    fn base64_with_whitespace() {
        assert_eq!(base64_decode("SGVs\nbG8=").unwrap(), b"Hello");
    }

    #[test]
    fn base64_all_chars() {
        // Encode bytes 0..63 using standard alphabet.
        let encoded = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
        let decoded = base64_decode(encoded).unwrap();
        assert_eq!(decoded.len(), 48);
        // First byte: A(0)<<2 | B(1)>>4 = 0
        assert_eq!(decoded[0], 0x00);
    }

    #[test]
    fn base64_invalid_char() {
        assert!(matches!(
            base64_decode("SGV!bG8="),
            Err(Base64Error::InvalidCharacter(b'!'))
        ));
    }

    #[test]
    fn base64_invalid_length() {
        assert!(matches!(
            base64_decode("SGVsb"),
            Err(Base64Error::InvalidLength)
        ));
    }

    #[test]
    fn base64_invalid_padding() {
        assert!(matches!(
            base64_decode("SG=b"),
            Err(Base64Error::InvalidPadding)
        ));
    }

    #[test]
    fn base64_padding_before_final_group() {
        assert!(matches!(
            base64_decode("YQ==YWJj"),
            Err(Base64Error::InvalidPadding)
        ));
        assert!(matches!(
            base64_decode("YWI=YWJj"),
            Err(Base64Error::InvalidPadding)
        ));
    }

    #[test]
    fn base64_binary_data() {
        // Raw bytes [0xFF, 0x00, 0xAA]
        assert_eq!(base64_decode("/wCq").unwrap(), vec![0xFF, 0x00, 0xAA]);
    }

    // -----------------------------------------------------------------------
    // Data URL parsing
    // -----------------------------------------------------------------------

    #[test]
    fn data_url_plain_text() {
        let result = parse_data_url("data:,Hello%20World").unwrap();
        assert_eq!(result.mime_type, "text/plain");
        assert_eq!(result.charset, Some("US-ASCII".to_string()));
        assert_eq!(result.data, b"Hello World");
    }

    #[test]
    fn data_url_explicit_mime() {
        let result = parse_data_url("data:text/html,\n\nHello\n\n").unwrap();
        assert_eq!(result.mime_type, "text/html");
        assert_eq!(result.charset, None);
        assert_eq!(result.data, b"\n\nHello\n\n");
    }

    #[test]
    fn data_url_with_charset() {
        let result = parse_data_url("data:text/plain;charset=utf-8,Hello").unwrap();
        assert_eq!(result.mime_type, "text/plain");
        assert_eq!(result.charset, Some("utf-8".to_string()));
        assert_eq!(result.data, b"Hello");
    }

    #[test]
    fn data_url_charset_name_case_insensitive() {
        let result = parse_data_url("data:text/plain;Charset=utf-8,Hello").unwrap();
        assert_eq!(result.charset, Some("utf-8".to_string()));
    }

    #[test]
    fn data_url_base64() {
        let result = parse_data_url("data:text/plain;base64,SGVsbG8=").unwrap();
        assert_eq!(result.mime_type, "text/plain");
        assert_eq!(result.data, b"Hello");
    }

    #[test]
    fn data_url_base64_flag_case_insensitive() {
        let result = parse_data_url("data:text/plain;BASE64,SGVsbG8=").unwrap();
        assert_eq!(result.mime_type, "text/plain");
        assert_eq!(result.data, b"Hello");
    }

    #[test]
    fn data_url_base64_percent_encoded_padding() {
        let result = parse_data_url("data:text/plain;base64,SGVsbG8%3D").unwrap();
        assert_eq!(result.data, b"Hello");
    }

    #[test]
    fn data_url_base64_image() {
        // Minimal data: 3 bytes as base64.
        let result = parse_data_url("data:image/png;base64,/wCq").unwrap();
        assert_eq!(result.mime_type, "image/png");
        assert_eq!(result.data, vec![0xFF, 0x00, 0xAA]);
    }

    #[test]
    fn data_url_base64_with_charset() {
        let result = parse_data_url("data:text/plain;charset=utf-8;base64,SGVsbG8=").unwrap();
        assert_eq!(result.mime_type, "text/plain");
        assert_eq!(result.charset, Some("utf-8".to_string()));
        assert_eq!(result.data, b"Hello");
    }

    #[test]
    fn data_url_empty_data() {
        let result = parse_data_url("data:,").unwrap();
        assert_eq!(result.mime_type, "text/plain");
        assert_eq!(result.data, b"");
    }

    #[test]
    fn data_url_empty_base64() {
        let result = parse_data_url("data:;base64,").unwrap();
        assert_eq!(result.mime_type, "text/plain");
        assert_eq!(result.data, b"");
    }

    #[test]
    fn data_url_not_data() {
        assert!(matches!(
            parse_data_url("http://example.com"),
            Err(DataUrlError::NotDataUrl)
        ));
    }

    #[test]
    fn data_url_scheme_case_insensitive() {
        let result = parse_data_url("DATA:,Hello").unwrap();
        assert_eq!(result.data, b"Hello");
    }

    #[test]
    fn data_url_missing_comma() {
        assert!(matches!(
            parse_data_url("data:text/plain"),
            Err(DataUrlError::MissingComma)
        ));
    }

    #[test]
    fn data_url_invalid_base64() {
        assert!(matches!(
            parse_data_url("data:;base64,!!!"),
            Err(DataUrlError::InvalidBase64)
        ));
    }

    #[test]
    fn data_url_percent_encoded() {
        let result = parse_data_url("data:text/plain,%48%65%6C%6C%6F").unwrap();
        assert_eq!(result.data, b"Hello");
    }

    #[test]
    fn data_url_mime_case_insensitive() {
        let result = parse_data_url("data:Text/HTML,\n\nhi\n\n").unwrap();
        assert_eq!(result.mime_type, "text/html");
    }

    #[test]
    fn data_url_comma_in_data() {
        // Only the first comma splits metadata from data.
        let result = parse_data_url("data:text/plain,a,b,c").unwrap();
        assert_eq!(result.data, b"a,b,c");
    }

    #[test]
    fn is_data_url_positive() {
        assert!(is_data_url("data:text/plain,hello"));
    }

    #[test]
    fn is_data_url_negative() {
        assert!(!is_data_url("http://example.com"));
    }

    #[test]
    fn is_data_url_non_ascii_does_not_panic() {
        assert!(!is_data_url("dat\u{e9}"));
        assert!(!is_data_url("data\u{e9}"));
    }

    // -----------------------------------------------------------------------
    // percent_decode_bytes
    // -----------------------------------------------------------------------

    #[test]
    fn percent_decode_basic() {
        assert_eq!(percent_decode_bytes("Hello%20World"), b"Hello World");
    }

    #[test]
    fn percent_decode_no_encoding() {
        assert_eq!(percent_decode_bytes("Hello"), b"Hello");
    }

    #[test]
    fn percent_decode_incomplete_sequence() {
        assert_eq!(percent_decode_bytes("100%"), b"100%");
    }

    #[test]
    fn percent_decode_binary() {
        assert_eq!(percent_decode_bytes("%FF%00"), vec![0xFF, 0x00]);
    }

    // -----------------------------------------------------------------------
    // MIME parsing
    // -----------------------------------------------------------------------

    #[test]
    fn mime_empty_defaults() {
        let (mime, charset) = parse_mime_type("");
        assert_eq!(mime, "text/plain");
        assert_eq!(charset, Some("US-ASCII".to_string()));
    }

    #[test]
    fn mime_with_charset() {
        let (mime, charset) = parse_mime_type("text/html;charset=utf-8");
        assert_eq!(mime, "text/html");
        assert_eq!(charset, Some("utf-8".to_string()));
    }

    #[test]
    fn mime_no_charset() {
        let (mime, charset) = parse_mime_type("image/png");
        assert_eq!(mime, "image/png");
        assert_eq!(charset, None);
    }
}