we (web engine): Experimental web browser project to understand the limits of Claude
at js-bytecode 487 lines 14 kB view raw
//! Data URL parsing per RFC 2397.
//!
//! Parses `data:[<mediatype>][;base64],<data>` URLs into their components:
//! MIME type, optional charset, and decoded payload.
//!
//! The parser is deliberately lenient about letter case: the `data:` scheme,
//! the `;base64` flag and the `charset` parameter name are all matched
//! ASCII-case-insensitively.

/// A parsed data URL.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct DataUrl {
    /// The MIME type (e.g., `text/plain`, `image/png`), lowercased.
    pub mime_type: String,
    /// Optional charset parameter from the MIME type.
    pub charset: Option<String>,
    /// The decoded payload bytes.
    pub data: Vec<u8>,
}

/// Errors from parsing a data URL.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum DataUrlError {
    /// Input does not start with `data:`.
    NotDataUrl,
    /// Missing comma separator between metadata and data.
    MissingComma,
    /// Base64 payload is malformed.
    InvalidBase64,
}

impl core::fmt::Display for DataUrlError {
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        match self {
            Self::NotDataUrl => write!(f, "not a data URL"),
            Self::MissingComma => write!(f, "data URL missing comma separator"),
            Self::InvalidBase64 => write!(f, "invalid base64 in data URL"),
        }
    }
}

impl std::error::Error for DataUrlError {}

/// Parse a data URL string into its components.
///
/// Format: `data:[<mediatype>][;base64],<data>`
///
/// If the media type is omitted, defaults to `text/plain;charset=US-ASCII`.
/// The data portion is always percent-decoded; if `;base64` is present in the
/// metadata, the percent-decoded bytes are then base64-decoded.
///
/// # Errors
///
/// * [`DataUrlError::NotDataUrl`] if `url` does not start with the `data:`
///   scheme (compared ASCII-case-insensitively).
/// * [`DataUrlError::MissingComma`] if there is no `,` after the scheme.
/// * [`DataUrlError::InvalidBase64`] if `;base64` is set and the payload is
///   not valid base64.
pub fn parse_data_url(url: &str) -> Result<DataUrl, DataUrlError> {
    let rest = strip_data_scheme(url).ok_or(DataUrlError::NotDataUrl)?;

    // Only the first comma separates metadata from data; any later commas
    // belong to the payload.
    let (metadata, payload) = rest.split_once(',').ok_or(DataUrlError::MissingComma)?;

    let (metadata, is_base64) = split_base64_flag(metadata);
    let (mime_type, charset) = parse_mime_type(metadata);

    // Percent-decoding comes first so that escaped base64 characters such as
    // `%3D` (`=`), `%2B` (`+`) and `%2F` (`/`) are accepted in base64 bodies.
    let unescaped = percent_decode_bytes(payload);
    let data = if is_base64 {
        base64_decode_bytes(&unescaped).map_err(|_| DataUrlError::InvalidBase64)?
    } else {
        unescaped
    };

    Ok(DataUrl {
        mime_type,
        charset,
        data,
    })
}

/// Returns true if the URL string starts with the `data:` scheme.
///
/// The scheme is compared ASCII-case-insensitively, so `DATA:` and `Data:`
/// are accepted as well.
pub fn is_data_url(url: &str) -> bool {
    strip_data_scheme(url).is_some()
}

/// Strip a leading `data:` scheme (any ASCII letter case) from `url`.
///
/// Returns the text after the colon, or `None` if `url` does not start with
/// the scheme.
fn strip_data_scheme(url: &str) -> Option<&str> {
    const SCHEME: &str = "data:";
    // `get` yields `None` both when the input is too short and when the cut
    // would land inside a multi-byte character, so this never panics.
    let head = url.get(..SCHEME.len())?;
    if head.eq_ignore_ascii_case(SCHEME) {
        url.get(SCHEME.len()..)
    } else {
        None
    }
}

/// Split a trailing `;base64` flag off the metadata.
///
/// The flag is matched ASCII-case-insensitively and may be surrounded by
/// whitespace (`; base64`). Returns the metadata without the flag and whether
/// the flag was present.
fn split_base64_flag(metadata: &str) -> (&str, bool) {
    match metadata.rfind(';') {
        Some(pos) if metadata[pos + 1..].trim().eq_ignore_ascii_case("base64") => {
            (&metadata[..pos], true)
        }
        _ => (metadata, false),
    }
}

/// Parse the MIME type portion of a data URL's metadata.
///
/// Returns (mime_type, optional_charset). If metadata is empty,
/// defaults to `text/plain` with charset `US-ASCII`. If only the MIME type is
/// missing (e.g. `;charset=utf-8`), the type defaults to `text/plain` and the
/// charset is taken from the parameters.
fn parse_mime_type(metadata: &str) -> (String, Option<String>) {
    if metadata.is_empty() {
        return ("text/plain".to_string(), Some("US-ASCII".to_string()));
    }

    // Everything before the first ';' is the MIME type, the rest is parameters.
    let (mime, params) = metadata.split_once(';').unwrap_or((metadata, ""));
    let mime = mime.trim();

    let mime_type = if mime.is_empty() {
        "text/plain".to_string()
    } else {
        mime.to_ascii_lowercase()
    };

    (mime_type, extract_charset(params))
}

/// Extract `charset=VALUE` from a `;`-separated parameter string.
///
/// The parameter name is matched ASCII-case-insensitively. Surrounding
/// whitespace is trimmed and one pair of enclosing double quotes is removed
/// from the value. Returns the first `charset` parameter found, if any.
fn extract_charset(params: &str) -> Option<String> {
    params.split(';').find_map(|param| {
        let (name, value) = param.split_once('=')?;
        if !name.trim().eq_ignore_ascii_case("charset") {
            return None;
        }

        let value = value.trim();
        let unquoted = value
            .strip_prefix('"')
            .and_then(|inner| inner.strip_suffix('"'))
            .unwrap_or(value);
        Some(unquoted.to_string())
    })
}

/// Percent-decode a string into raw bytes.
///
/// Every `%XX` sequence with two hex digits becomes the corresponding byte.
/// A `%` that is not followed by two hex digits is copied through unchanged.
fn percent_decode_bytes(input: &str) -> Vec<u8> {
    let bytes = input.as_bytes();
    let mut result = Vec::with_capacity(bytes.len());
    let mut i = 0;

    while i < bytes.len() {
        if let [b'%', hi, lo, ..] = &bytes[i..] {
            if let (Some(hi), Some(lo)) = (hex_val(*hi), hex_val(*lo)) {
                result.push((hi << 4) | lo);
                i += 3;
                continue;
            }
        }
        result.push(bytes[i]);
        i += 1;
    }

    result
}

/// Value of an ASCII hex digit, or `None` if `b` is not a hex digit.
fn hex_val(b: u8) -> Option<u8> {
    match b {
        b'0'..=b'9' => Some(b - b'0'),
        b'a'..=b'f' => Some(b - b'a' + 10),
        b'A'..=b'F' => Some(b - b'A' + 10),
        _ => None,
    }
}

// ---------------------------------------------------------------------------
// Base64 decoder (RFC 4648)
// ---------------------------------------------------------------------------

/// Decode a base64-encoded string (standard alphabet, RFC 4648).
///
/// Ignores ASCII whitespace. Handles padding with `=`, which is only accepted
/// as the last one or two characters of the input.
///
/// # Errors
///
/// * [`Base64Error::InvalidLength`] if the input length (after removing
///   whitespace) is not a multiple of 4.
/// * [`Base64Error::InvalidCharacter`] if a byte outside the base64 alphabet
///   is found.
/// * [`Base64Error::InvalidPadding`] if `=` appears anywhere other than the
///   end of the input, or is followed by a non-`=` character.
pub fn base64_decode(input: &str) -> Result<Vec<u8>, Base64Error> {
    base64_decode_bytes(input.as_bytes())
}

/// Byte-slice implementation behind [`base64_decode`].
///
/// Kept separate so callers holding raw bytes (such as a percent-decoded data
/// URL body) can decode them without a UTF-8 round trip.
fn base64_decode_bytes(input: &[u8]) -> Result<Vec<u8>, Base64Error> {
    // Strip whitespace.
    let clean: Vec<u8> = input
        .iter()
        .copied()
        .filter(|b| !b.is_ascii_whitespace())
        .collect();

    // Length after stripping must be a multiple of 4. Empty input has no
    // quads and no remainder, so it decodes to an empty vector.
    let quads = clean.chunks_exact(4);
    if !quads.remainder().is_empty() {
        return Err(Base64Error::InvalidLength);
    }

    let last_index = quads.len().saturating_sub(1);
    let mut result = Vec::with_capacity(clean.len() * 3 / 4);

    for (index, quad) in quads.enumerate() {
        let a = base64_val(quad[0])?;
        let b = base64_val(quad[1])?;

        // First byte is always present.
        result.push((a << 2) | (b >> 4));

        // Padding terminates the data, so it is only legal in the final quad.
        let has_padding = quad[2] == b'=' || quad[3] == b'=';
        if has_padding && index != last_index {
            return Err(Base64Error::InvalidPadding);
        }

        if quad[2] == b'=' {
            // Two padding chars — one output byte.
            if quad[3] != b'=' {
                return Err(Base64Error::InvalidPadding);
            }
            continue;
        }

        // The shifts intentionally discard the high bits that were already
        // emitted as part of the previous output byte.
        let c = base64_val(quad[2])?;
        result.push((b << 4) | (c >> 2));

        if quad[3] != b'=' {
            let d = base64_val(quad[3])?;
            result.push((c << 6) | d);
        }
    }

    Ok(result)
}

/// Base64 decoding error.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Base64Error {
    /// Invalid character in input.
    InvalidCharacter(u8),
    /// Input length is not a multiple of 4.
    InvalidLength,
    /// Invalid padding.
    InvalidPadding,
}

impl core::fmt::Display for Base64Error {
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        match self {
            Self::InvalidCharacter(c) => write!(f, "invalid base64 character: 0x{c:02X}"),
            Self::InvalidLength => write!(f, "invalid base64 length"),
            Self::InvalidPadding => write!(f, "invalid base64 padding"),
        }
    }
}

impl std::error::Error for Base64Error {}

/// Map a character of the standard base64 alphabet to its 6-bit value.
fn base64_val(b: u8) -> Result<u8, Base64Error> {
    match b {
        b'A'..=b'Z' => Ok(b - b'A'),
        b'a'..=b'z' => Ok(b - b'a' + 26),
        b'0'..=b'9' => Ok(b - b'0' + 52),
        b'+' => Ok(62),
        b'/' => Ok(63),
        _ => Err(Base64Error::InvalidCharacter(b)),
    }
}

// ---------------------------------------------------------------------------
// Tests
// ---------------------------------------------------------------------------

#[cfg(test)]
mod tests {
    use super::*;

    // -----------------------------------------------------------------------
    // Base64 decoding
    // -----------------------------------------------------------------------

    #[test]
    fn base64_empty() {
        assert_eq!(base64_decode("").unwrap(), b"");
    }

    #[test]
    fn base64_hello() {
        assert_eq!(base64_decode("SGVsbG8=").unwrap(), b"Hello");
    }

    #[test]
    fn base64_hello_world() {
        assert_eq!(base64_decode("SGVsbG8gV29ybGQ=").unwrap(), b"Hello World");
    }

    #[test]
    fn base64_no_padding() {
        assert_eq!(base64_decode("YWJj").unwrap(), b"abc");
    }

    #[test]
    fn base64_one_pad() {
        assert_eq!(base64_decode("YWI=").unwrap(), b"ab");
    }

    #[test]
    fn base64_two_pad() {
        assert_eq!(base64_decode("YQ==").unwrap(), b"a");
    }

    #[test]
    fn base64_with_whitespace() {
        assert_eq!(base64_decode("SGVs\nbG8=").unwrap(), b"Hello");
    }

    #[test]
    fn base64_all_chars() {
        // Encode bytes 0..63 using standard alphabet.
        let encoded = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
        let decoded = base64_decode(encoded).unwrap();
        assert_eq!(decoded.len(), 48);
        // First byte: A(0)<<2 | B(1)>>4 = 0
        assert_eq!(decoded[0], 0x00);
    }

    #[test]
    fn base64_invalid_char() {
        assert!(matches!(
            base64_decode("SGV!bG8="),
            Err(Base64Error::InvalidCharacter(b'!'))
        ));
    }

    #[test]
    fn base64_invalid_length() {
        assert!(matches!(
            base64_decode("SGVsb"),
            Err(Base64Error::InvalidLength)
        ));
    }

    #[test]
    fn base64_invalid_padding() {
        assert!(matches!(
            base64_decode("SG=b"),
            Err(Base64Error::InvalidPadding)
        ));
    }

    #[test]
    fn base64_padding_before_end_is_rejected() {
        assert!(matches!(
            base64_decode("YQ==YWJj"),
            Err(Base64Error::InvalidPadding)
        ));
        assert!(matches!(
            base64_decode("YWI=YWJj"),
            Err(Base64Error::InvalidPadding)
        ));
    }

    #[test]
    fn base64_binary_data() {
        // Raw bytes [0xFF, 0x00, 0xAA]
        assert_eq!(base64_decode("/wCq").unwrap(), vec![0xFF, 0x00, 0xAA]);
    }

    // -----------------------------------------------------------------------
    // Data URL parsing
    // -----------------------------------------------------------------------

    #[test]
    fn data_url_plain_text() {
        let result = parse_data_url("data:,Hello%20World").unwrap();
        assert_eq!(result.mime_type, "text/plain");
        assert_eq!(result.charset, Some("US-ASCII".to_string()));
        assert_eq!(result.data, b"Hello World");
    }

    #[test]
    fn data_url_explicit_mime() {
        let result = parse_data_url("data:text/html,<h1>Hello</h1>").unwrap();
        assert_eq!(result.mime_type, "text/html");
        assert_eq!(result.charset, None);
        assert_eq!(result.data, b"<h1>Hello</h1>");
    }

    #[test]
    fn data_url_with_charset() {
        let result = parse_data_url("data:text/plain;charset=utf-8,Hello").unwrap();
        assert_eq!(result.mime_type, "text/plain");
        assert_eq!(result.charset, Some("utf-8".to_string()));
        assert_eq!(result.data, b"Hello");
    }

    #[test]
    fn data_url_charset_name_case_and_quotes() {
        let result = parse_data_url("data:text/plain;Charset=\"UTF-8\",Hello").unwrap();
        assert_eq!(result.charset, Some("UTF-8".to_string()));
    }

    #[test]
    fn data_url_base64() {
        let result = parse_data_url("data:text/plain;base64,SGVsbG8=").unwrap();
        assert_eq!(result.mime_type, "text/plain");
        assert_eq!(result.data, b"Hello");
    }

    #[test]
    fn data_url_base64_flag_case_insensitive() {
        let result = parse_data_url("data:text/plain;BASE64,SGVsbG8=").unwrap();
        assert_eq!(result.mime_type, "text/plain");
        assert_eq!(result.data, b"Hello");
    }

    #[test]
    fn data_url_base64_percent_encoded_padding() {
        let result = parse_data_url("data:;base64,SGVsbG8%3D").unwrap();
        assert_eq!(result.data, b"Hello");
    }

    #[test]
    fn data_url_base64_image() {
        // Minimal data: 3 bytes as base64.
        let result = parse_data_url("data:image/png;base64,/wCq").unwrap();
        assert_eq!(result.mime_type, "image/png");
        assert_eq!(result.data, vec![0xFF, 0x00, 0xAA]);
    }

    #[test]
    fn data_url_base64_with_charset() {
        let result = parse_data_url("data:text/plain;charset=utf-8;base64,SGVsbG8=").unwrap();
        assert_eq!(result.mime_type, "text/plain");
        assert_eq!(result.charset, Some("utf-8".to_string()));
        assert_eq!(result.data, b"Hello");
    }

    #[test]
    fn data_url_empty_data() {
        let result = parse_data_url("data:,").unwrap();
        assert_eq!(result.mime_type, "text/plain");
        assert_eq!(result.data, b"");
    }

    #[test]
    fn data_url_empty_base64() {
        let result = parse_data_url("data:;base64,").unwrap();
        assert_eq!(result.mime_type, "text/plain");
        assert_eq!(result.data, b"");
    }

    #[test]
    fn data_url_not_data() {
        assert!(matches!(
            parse_data_url("http://example.com"),
            Err(DataUrlError::NotDataUrl)
        ));
    }

    #[test]
    fn data_url_scheme_case_insensitive() {
        let result = parse_data_url("DATA:text/plain,hi").unwrap();
        assert_eq!(result.mime_type, "text/plain");
        assert_eq!(result.data, b"hi");
    }

    #[test]
    fn data_url_missing_comma() {
        assert!(matches!(
            parse_data_url("data:text/plain"),
            Err(DataUrlError::MissingComma)
        ));
    }

    #[test]
    fn data_url_invalid_base64() {
        assert!(matches!(
            parse_data_url("data:;base64,!!!"),
            Err(DataUrlError::InvalidBase64)
        ));
    }

    #[test]
    fn data_url_percent_encoded() {
        let result = parse_data_url("data:text/plain,%48%65%6C%6C%6F").unwrap();
        assert_eq!(result.data, b"Hello");
    }

    #[test]
    fn data_url_mime_case_insensitive() {
        let result = parse_data_url("data:Text/HTML,<p>hi</p>").unwrap();
        assert_eq!(result.mime_type, "text/html");
    }

    #[test]
    fn data_url_comma_in_data() {
        // Only the first comma splits metadata from data.
        let result = parse_data_url("data:text/plain,a,b,c").unwrap();
        assert_eq!(result.data, b"a,b,c");
    }

    #[test]
    fn is_data_url_positive() {
        assert!(is_data_url("data:text/plain,hello"));
    }

    #[test]
    fn is_data_url_uppercase_scheme() {
        assert!(is_data_url("DATA:text/plain,hello"));
    }

    #[test]
    fn is_data_url_negative() {
        assert!(!is_data_url("http://example.com"));
    }

    // -----------------------------------------------------------------------
    // percent_decode_bytes
    // -----------------------------------------------------------------------

    #[test]
    fn percent_decode_basic() {
        assert_eq!(percent_decode_bytes("Hello%20World"), b"Hello World");
    }

    #[test]
    fn percent_decode_no_encoding() {
        assert_eq!(percent_decode_bytes("Hello"), b"Hello");
    }

    #[test]
    fn percent_decode_incomplete_sequence() {
        assert_eq!(percent_decode_bytes("100%"), b"100%");
    }

    #[test]
    fn percent_decode_binary() {
        assert_eq!(percent_decode_bytes("%FF%00"), vec![0xFF, 0x00]);
    }

    // -----------------------------------------------------------------------
    // MIME parsing
    // -----------------------------------------------------------------------

    #[test]
    fn mime_empty_defaults() {
        let (mime, charset) = parse_mime_type("");
        assert_eq!(mime, "text/plain");
        assert_eq!(charset, Some("US-ASCII".to_string()));
    }

    #[test]
    fn mime_with_charset() {
        let (mime, charset) = parse_mime_type("text/html;charset=utf-8");
        assert_eq!(mime, "text/html");
        assert_eq!(charset, Some("utf-8".to_string()));
    }

    #[test]
    fn mime_no_charset() {
        let (mime, charset) = parse_mime_type("image/png");
        assert_eq!(mime, "image/png");
        assert_eq!(charset, None);
    }
}