// Hello

//! Data URL parsing per RFC 2397.
//!
//! Parses `data:[<mediatype>][;base64],<data>` URLs into their components:
//! MIME type, optional charset, and decoded payload.

/// A parsed data URL.
///
/// Produced by [`parse_data_url`]: the media type, the optional `charset`
/// parameter, and the payload after decoding.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct DataUrl {
    /// The MIME type (e.g., `text/plain`, `image/png`), ASCII-lowercased.
    /// Falls back to `text/plain` when the URL does not name one.
    pub mime_type: String,
    /// Optional charset parameter from the MIME type.
    /// Set to `US-ASCII` when the metadata section is empty.
    pub charset: Option<String>,
    /// The decoded payload bytes (base64- or percent-decoded).
    pub data: Vec<u8>,
}

/// Errors from parsing a data URL.
///
/// Implements [`std::error::Error`], so it can be boxed as `dyn Error` and
/// propagated with `?` alongside other error types.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum DataUrlError {
    /// Input does not start with `data:`.
    NotDataUrl,
    /// Missing comma separator between metadata and data.
    MissingComma,
    /// Base64 payload is malformed.
    InvalidBase64,
}

impl core::fmt::Display for DataUrlError {
    /// Writes a short, lowercase, human-readable description of the error.
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        let message = match self {
            Self::NotDataUrl => "not a data URL",
            Self::MissingComma => "data URL missing comma separator",
            Self::InvalidBase64 => "invalid base64 in data URL",
        };
        f.write_str(message)
    }
}

// No `source()`: none of the variants carries an underlying error.
impl std::error::Error for DataUrlError {}

/// Parse a data URL string into its components.
///
/// Format: `data:[<mediatype>][;base64],<data>`
///
/// The `data:` scheme and the `;base64` marker are matched without regard to
/// ASCII case. If the media type is omitted, defaults to
/// `text/plain;charset=US-ASCII`.
///
/// The data portion is always percent-decoded. When `;base64` is the last
/// metadata token, the percent-decoded bytes are then base64-decoded, so
/// payloads that escape `=`, `+` or `/` (for example `%3D`) are accepted.
///
/// # Errors
///
/// * [`DataUrlError::NotDataUrl`] if `url` does not start with `data:`.
/// * [`DataUrlError::MissingComma`] if no `,` follows the scheme.
/// * [`DataUrlError::InvalidBase64`] if a `;base64` payload fails to decode.
pub fn parse_data_url(url: &str) -> Result<DataUrl, DataUrlError> {
    const SCHEME: &str = "data:";

    // URL schemes are case-insensitive, so `DATA:` must be accepted as well.
    // `str::get` returns `None` for input that is too short or that would be
    // cut inside a multi-byte character; neither can be the scheme.
    let rest = match url.get(..SCHEME.len()) {
        Some(scheme) if scheme.eq_ignore_ascii_case(SCHEME) => &url[SCHEME.len()..],
        _ => return Err(DataUrlError::NotDataUrl),
    };

    // Only the first comma separates metadata from data; any later comma is
    // part of the payload.
    let (metadata, payload) = rest.split_once(',').ok_or(DataUrlError::MissingComma)?;

    // The base64 marker is the final `;`-separated metadata token.
    let (metadata, is_base64) = match metadata.rsplit_once(';') {
        Some((head, flag)) if flag.trim().eq_ignore_ascii_case("base64") => (head, true),
        _ => (metadata, false),
    };

    // Parse MIME type and charset.
    let (mime_type, charset) = parse_mime_type(metadata);

    // Percent-decoding comes first in both modes: base64 text inside a URL may
    // itself carry escapes such as `%3D` for the `=` padding.
    let unescaped = percent_decode_bytes(payload);
    let data = if is_base64 {
        // The base64 alphabet is pure ASCII, so bytes that are not valid UTF-8
        // can never be valid base64 either.
        let text = core::str::from_utf8(&unescaped).map_err(|_| DataUrlError::InvalidBase64)?;
        base64_decode(text).map_err(|_| DataUrlError::InvalidBase64)?
    } else {
        unescaped
    };

    Ok(DataUrl {
        mime_type,
        charset,
        data,
    })
}

/// Returns true if the URL string starts with the `data:` scheme.
///
/// URL schemes are case-insensitive, so the prefix is compared ignoring ASCII
/// case (`DATA:` and `Data:` both match). Only the scheme is inspected; the
/// rest of the URL is not validated.
pub fn is_data_url(url: &str) -> bool {
    const SCHEME: &str = "data:";

    // `str::get` is `None` when the input is shorter than the scheme or when
    // the cut would land inside a multi-byte character.
    url.get(..SCHEME.len())
        .is_some_and(|prefix| prefix.eq_ignore_ascii_case(SCHEME))
}

/// Parse the MIME type portion of a data URL's metadata.
///
/// `metadata` is the text between `data:` and the comma, with any `;base64`
/// marker already removed.
///
/// Returns (mime_type, optional_charset). If metadata is empty,
/// defaults to `text/plain` with charset `US-ASCII`. If only the type is
/// missing (e.g. `;charset=utf-8`), the type defaults to `text/plain` and the
/// charset is taken from the parameters. The MIME type is ASCII-lowercased.
fn parse_mime_type(metadata: &str) -> (String, Option<String>) {
    if metadata.is_empty() {
        return ("text/plain".to_string(), Some("US-ASCII".to_string()));
    }

    // Everything before the first ';' is the type; the remainder is the
    // parameter list.
    let (mime, params) = match metadata.split_once(';') {
        Some((mime, params)) => (mime.trim(), params),
        None => (metadata.trim(), ""),
    };

    let mime_type = if mime.is_empty() {
        "text/plain".to_string()
    } else {
        mime.to_ascii_lowercase()
    };

    (mime_type, extract_charset(params))
}

/// Extract the value of the first `charset` parameter from a `;`-separated
/// parameter string.
///
/// The parameter name is matched ignoring ASCII case (`Charset=`, `CHARSET=`),
/// as MIME parameter names are case-insensitive. Surrounding whitespace is
/// trimmed, and one pair of enclosing double quotes is removed from the value.
/// Returns `None` when no `charset` parameter is present.
fn extract_charset(params: &str) -> Option<String> {
    params.split(';').find_map(|param| {
        let (name, value) = param.trim().split_once('=')?;
        if !name.eq_ignore_ascii_case("charset") {
            return None;
        }

        let value = value.trim();
        // The quotes of a quoted-string are syntax, not part of the value.
        let unquoted = value
            .strip_prefix('"')
            .and_then(|inner| inner.strip_suffix('"'))
            .unwrap_or(value);
        Some(unquoted.to_string())
    })
}

/// Percent-decode a string into raw bytes.
///
/// Every `%XY` triplet whose `X` and `Y` are hexadecimal digits becomes the
/// single byte `0xXY`. All other bytes are copied through unchanged, including
/// a `%` that is not followed by two hex digits.
fn percent_decode_bytes(input: &str) -> Vec<u8> {
    let mut decoded = Vec::with_capacity(input.len());
    let mut remaining = input.as_bytes();

    while let [first, tail @ ..] = remaining {
        if let (b'%', [high, low, ..]) = (*first, tail) {
            if let (Some(hi), Some(lo)) = (hex_val(*high), hex_val(*low)) {
                decoded.push(hi << 4 | lo);
                // Skip the two hex digits that were just consumed.
                remaining = &tail[2..];
                continue;
            }
        }
        decoded.push(*first);
        remaining = tail;
    }

    decoded
}

/// Numeric value (0..=15) of an ASCII hexadecimal digit, in either case.
///
/// Returns `None` for any byte that is not `0-9`, `a-f` or `A-F`.
fn hex_val(b: u8) -> Option<u8> {
    char::from(b)
        .to_digit(16)
        .and_then(|digit| u8::try_from(digit).ok())
}

// ---------------------------------------------------------------------------
// Base64 decoder (RFC 4648)
// ---------------------------------------------------------------------------

/// Decode a base64-encoded string (standard alphabet, RFC 4648).
///
/// Ignores ASCII whitespace. Handles padding with `=`, which is only accepted
/// in the final 4-character group: padding followed by more data (two
/// encodings glued together, e.g. `"YQ==YWJj"`) is rejected. Unused low bits
/// of the last symbol before padding are ignored rather than validated.
///
/// # Errors
///
/// * [`Base64Error::InvalidLength`] if the input, after whitespace removal,
///   is not a multiple of 4 characters long.
/// * [`Base64Error::InvalidCharacter`] for a byte outside the alphabet
///   (including `=` in the first two positions of a group).
/// * [`Base64Error::InvalidPadding`] for `=` followed by a non-`=` character
///   within a group, or for padding in any group but the last.
pub fn base64_decode(input: &str) -> Result<Vec<u8>, Base64Error> {
    // Strip whitespace.
    let clean: Vec<u8> = input
        .bytes()
        .filter(|b| !b.is_ascii_whitespace())
        .collect();

    if clean.is_empty() {
        return Ok(Vec::new());
    }

    // Length after stripping must be a multiple of 4.
    if !clean.len().is_multiple_of(4) {
        return Err(Base64Error::InvalidLength);
    }

    let group_count = clean.len() / 4;
    let mut result = Vec::with_capacity(group_count * 3);

    for (index, chunk) in clean.chunks_exact(4).enumerate() {
        let a = base64_val(chunk[0])?;
        let b = base64_val(chunk[1])?;

        // First byte is always present.
        result.push((a << 2) | (b >> 4));

        let padded = match (chunk[2], chunk[3]) {
            // Two padding chars: one output byte.
            (b'=', b'=') => true,
            // `=` in the third position must be followed by another `=`.
            (b'=', _) => return Err(Base64Error::InvalidPadding),
            // One padding char: two output bytes.
            (third, b'=') => {
                let c = base64_val(third)?;
                result.push((b << 4) | (c >> 2));
                true
            }
            // Full group: three output bytes.
            (third, fourth) => {
                let c = base64_val(third)?;
                let d = base64_val(fourth)?;
                result.push((b << 4) | (c >> 2));
                result.push((c << 6) | d);
                false
            }
        };

        // Padding marks the end of the data, so nothing may follow it.
        if padded && index + 1 != group_count {
            return Err(Base64Error::InvalidPadding);
        }
    }

    Ok(result)
}

/// Base64 decoding error.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Base64Error {
    /// Invalid character in input; carries the offending byte.
    InvalidCharacter(u8),
    /// Input length is not a multiple of 4.
    InvalidLength,
    /// Invalid padding.
    InvalidPadding,
}

impl core::fmt::Display for Base64Error {
    /// Writes a short description; an invalid character is shown as a
    /// two-digit uppercase hex byte.
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        match self {
            Self::InvalidCharacter(c) => write!(f, "invalid base64 character: 0x{c:02X}"),
            Self::InvalidLength => write!(f, "invalid base64 length"),
            Self::InvalidPadding => write!(f, "invalid base64 padding"),
        }
    }
}

// No `source()`: none of the variants wraps another error.
impl std::error::Error for Base64Error {}

/// Map one byte of the standard base64 alphabet to its 6-bit value.
///
/// `A-Z` map to 0..=25, `a-z` to 26..=51, `0-9` to 52..=61, `+` to 62 and `/`
/// to 63. Any other byte, including the padding character `=`, yields
/// [`Base64Error::InvalidCharacter`].
fn base64_val(b: u8) -> Result<u8, Base64Error> {
    match b {
        b'A'..=b'Z' => Ok(b - b'A'),
        b'a'..=b'z' => Ok(b - b'a' + 26),
        b'0'..=b'9' => Ok(b - b'0' + 52),
        b'+' => Ok(62),
        b'/' => Ok(63),
        _ => Err(Base64Error::InvalidCharacter(b)),
    }
}

// ---------------------------------------------------------------------------
// Tests
// ---------------------------------------------------------------------------

// Unit tests for the public entry points (`parse_data_url`, `is_data_url`,
// `base64_decode`) and for the private helpers `percent_decode_bytes` and
// `parse_mime_type`, which are reachable here through `use super::*`.
#[cfg(test)]
mod tests {
    use super::*;

    // -----------------------------------------------------------------------
    // Base64 decoding
    // -----------------------------------------------------------------------

    // Empty input decodes to an empty byte vector rather than an error.
    #[test]
    fn base64_empty() {
        assert_eq!(base64_decode("").unwrap(), b"");
    }

    #[test]
    fn base64_hello() {
        assert_eq!(base64_decode("SGVsbG8=").unwrap(), b"Hello");
    }

    #[test]
    fn base64_hello_world() {
        assert_eq!(base64_decode("SGVsbG8gV29ybGQ=").unwrap(), b"Hello World");
    }

    // A full 4-character group yields three bytes.
    #[test]
    fn base64_no_padding() {
        assert_eq!(base64_decode("YWJj").unwrap(), b"abc");
    }

    // One `=` means the final group yields two bytes.
    #[test]
    fn base64_one_pad() {
        assert_eq!(base64_decode("YWI=").unwrap(), b"ab");
    }

    // Two `=` mean the final group yields a single byte.
    #[test]
    fn base64_two_pad() {
        assert_eq!(base64_decode("YQ==").unwrap(), b"a");
    }

    // ASCII whitespace (here a newline) is removed before decoding.
    #[test]
    fn base64_with_whitespace() {
        assert_eq!(base64_decode("SGVs\nbG8=").unwrap(), b"Hello");
    }

    #[test]
    fn base64_all_chars() {
        // Decode the 64-character standard alphabet itself, so that every
        // symbol is exercised once: 64 symbols * 6 bits = 48 bytes.
        let encoded = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
        let decoded = base64_decode(encoded).unwrap();
        assert_eq!(decoded.len(), 48);
        // First byte: A(0)<<2 | B(1)>>4 = 0
        assert_eq!(decoded[0], 0x00);
    }

    // A byte outside the alphabet is reported together with its value.
    #[test]
    fn base64_invalid_char() {
        assert!(matches!(
            base64_decode("SGV!bG8="),
            Err(Base64Error::InvalidCharacter(b'!'))
        ));
    }

    // Five characters is not a multiple of four.
    #[test]
    fn base64_invalid_length() {
        assert!(matches!(
            base64_decode("SGVsb"),
            Err(Base64Error::InvalidLength)
        ));
    }

    // `=` in the third position must be followed by `=` in the fourth.
    #[test]
    fn base64_invalid_padding() {
        assert!(matches!(
            base64_decode("SG=b"),
            Err(Base64Error::InvalidPadding)
        ));
    }

    #[test]
    fn base64_binary_data() {
        // Raw bytes [0xFF, 0x00, 0xAA]
        assert_eq!(base64_decode("/wCq").unwrap(), vec![0xFF, 0x00, 0xAA]);
    }

    // -----------------------------------------------------------------------
    // Data URL parsing
    // -----------------------------------------------------------------------

    // Empty metadata falls back to text/plain with charset US-ASCII, and the
    // payload is percent-decoded.
    #[test]
    fn data_url_plain_text() {
        let result = parse_data_url("data:,Hello%20World").unwrap();
        assert_eq!(result.mime_type, "text/plain");
        assert_eq!(result.charset, Some("US-ASCII".to_string()));
        assert_eq!(result.data, b"Hello World");
    }

    // An explicit MIME type without parameters carries no charset.
    #[test]
    fn data_url_explicit_mime() {
        let result = parse_data_url("data:text/html,<h1>Hello</h1>").unwrap();
        assert_eq!(result.mime_type, "text/html");
        assert_eq!(result.charset, None);
        assert_eq!(result.data, b"<h1>Hello</h1>");
    }

    #[test]
    fn data_url_with_charset() {
        let result = parse_data_url("data:text/plain;charset=utf-8,Hello").unwrap();
        assert_eq!(result.mime_type, "text/plain");
        assert_eq!(result.charset, Some("utf-8".to_string()));
        assert_eq!(result.data, b"Hello");
    }

    #[test]
    fn data_url_base64() {
        let result = parse_data_url("data:text/plain;base64,SGVsbG8=").unwrap();
        assert_eq!(result.mime_type, "text/plain");
        assert_eq!(result.data, b"Hello");
    }

    #[test]
    fn data_url_base64_image() {
        // Minimal data: 3 bytes as base64.
        let result = parse_data_url("data:image/png;base64,/wCq").unwrap();
        assert_eq!(result.mime_type, "image/png");
        assert_eq!(result.data, vec![0xFF, 0x00, 0xAA]);
    }

    // The `;base64` marker follows the other parameters and is not mistaken
    // for one of them.
    #[test]
    fn data_url_base64_with_charset() {
        let result = parse_data_url("data:text/plain;charset=utf-8;base64,SGVsbG8=").unwrap();
        assert_eq!(result.mime_type, "text/plain");
        assert_eq!(result.charset, Some("utf-8".to_string()));
        assert_eq!(result.data, b"Hello");
    }

    // The shortest valid data URL: no metadata and no payload.
    #[test]
    fn data_url_empty_data() {
        let result = parse_data_url("data:,").unwrap();
        assert_eq!(result.mime_type, "text/plain");
        assert_eq!(result.data, b"");
    }

    #[test]
    fn data_url_empty_base64() {
        let result = parse_data_url("data:;base64,").unwrap();
        assert_eq!(result.mime_type, "text/plain");
        assert_eq!(result.data, b"");
    }

    #[test]
    fn data_url_not_data() {
        assert!(matches!(
            parse_data_url("http://example.com"),
            Err(DataUrlError::NotDataUrl)
        ));
    }

    #[test]
    fn data_url_missing_comma() {
        assert!(matches!(
            parse_data_url("data:text/plain"),
            Err(DataUrlError::MissingComma)
        ));
    }

    // Any base64 decoder failure surfaces as the single InvalidBase64 variant.
    #[test]
    fn data_url_invalid_base64() {
        assert!(matches!(
            parse_data_url("data:;base64,!!!"),
            Err(DataUrlError::InvalidBase64)
        ));
    }

    #[test]
    fn data_url_percent_encoded() {
        let result = parse_data_url("data:text/plain,%48%65%6C%6C%6F").unwrap();
        assert_eq!(result.data, b"Hello");
    }

    // The MIME type is returned ASCII-lowercased.
    #[test]
    fn data_url_mime_case_insensitive() {
        let result = parse_data_url("data:Text/HTML,<p>hi</p>").unwrap();
        assert_eq!(result.mime_type, "text/html");
    }

    #[test]
    fn data_url_comma_in_data() {
        // Only the first comma splits metadata from data.
        let result = parse_data_url("data:text/plain,a,b,c").unwrap();
        assert_eq!(result.data, b"a,b,c");
    }

    #[test]
    fn is_data_url_positive() {
        assert!(is_data_url("data:text/plain,hello"));
    }

    #[test]
    fn is_data_url_negative() {
        assert!(!is_data_url("http://example.com"));
    }

    // -----------------------------------------------------------------------
    // percent_decode_bytes
    // -----------------------------------------------------------------------

    #[test]
    fn percent_decode_basic() {
        assert_eq!(percent_decode_bytes("Hello%20World"), b"Hello World");
    }

    #[test]
    fn percent_decode_no_encoding() {
        assert_eq!(percent_decode_bytes("Hello"), b"Hello");
    }

    // A trailing `%` with no hex digits after it is copied through verbatim.
    #[test]
    fn percent_decode_incomplete_sequence() {
        assert_eq!(percent_decode_bytes("100%"), b"100%");
    }

    // Escapes may produce arbitrary bytes, including 0x00 and non-UTF-8 values.
    #[test]
    fn percent_decode_binary() {
        assert_eq!(percent_decode_bytes("%FF%00"), vec![0xFF, 0x00]);
    }

    // -----------------------------------------------------------------------
    // MIME parsing
    // -----------------------------------------------------------------------

    #[test]
    fn mime_empty_defaults() {
        let (mime, charset) = parse_mime_type("");
        assert_eq!(mime, "text/plain");
        assert_eq!(charset, Some("US-ASCII".to_string()));
    }

    #[test]
    fn mime_with_charset() {
        let (mime, charset) = parse_mime_type("text/html;charset=utf-8");
        assert_eq!(mime, "text/html");
        assert_eq!(charset, Some("utf-8".to_string()));
    }

    #[test]
    fn mime_no_charset() {
        let (mime, charset) = parse_mime_type("image/png");
        assert_eq!(mime, "image/png");
        assert_eq!(charset, None);
    }
}