//! WHATWG Encoding Standard — UTF-8 and UTF-16 codecs, pure Rust.

pub mod error;
mod utf16;
mod utf8;

use error::{EncodingError, Result};
use utf8::ErrorMode;

// ---------------------------------------------------------------------------
// Encoding enum
// ---------------------------------------------------------------------------

/// Text encodings this crate understands, following the WHATWG Encoding Standard.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Encoding {
    /// UTF-8.
    Utf8,
    /// UTF-16, big-endian.
    Utf16Be,
    /// UTF-16, little-endian.
    Utf16Le,
}

impl Encoding {
    /// Returns the canonical WHATWG name of this encoding
    /// (`"UTF-8"`, `"UTF-16BE"` or `"UTF-16LE"`).
    pub fn name(&self) -> &'static str {
        match *self {
            Encoding::Utf8 => "UTF-8",
            Encoding::Utf16Be => "UTF-16BE",
            Encoding::Utf16Le => "UTF-16LE",
        }
    }
}

// ---------------------------------------------------------------------------
// Label lookup (WHATWG Encoding Standard §4.2)
// ---------------------------------------------------------------------------

/// WHATWG encoding label mappings, as `(label, encoding)` pairs.
///
/// Every label in this table is written in lowercase. `lookup` compares a
/// trimmed input against these entries ASCII case-insensitively, so the input
/// itself is never rewritten.
const ENCODING_LABELS: &[(&str, Encoding)] = &[
    // UTF-8 labels
    ("unicode-1-1-utf-8", Encoding::Utf8),
    ("unicode11utf8", Encoding::Utf8),
    ("unicode20utf8", Encoding::Utf8),
    ("utf-8", Encoding::Utf8),
    ("utf8", Encoding::Utf8),
    ("x-unicode20utf8", Encoding::Utf8),
    // UTF-16BE labels
    ("unicodefffe", Encoding::Utf16Be),
    ("utf-16be", Encoding::Utf16Be),
    // UTF-16LE labels (note: the bare "utf-16" label maps to little-endian)
    ("csunicode", Encoding::Utf16Le),
    ("iso-10646-ucs-2", Encoding::Utf16Le),
    ("ucs-2", Encoding::Utf16Le),
    ("unicode", Encoding::Utf16Le),
    ("unicodefeff", Encoding::Utf16Le),
    ("utf-16", Encoding::Utf16Le),
    ("utf-16le", Encoding::Utf16Le),
];

/// Resolve a WHATWG encoding label to an [`Encoding`].
///
/// Leading and trailing ASCII whitespace is ignored and the comparison is
/// ASCII case-insensitive, as the WHATWG Encoding Standard requires.
///
/// Returns `None` when the label is empty after trimming or is not one of the
/// known labels.
pub fn lookup(label: &str) -> Option<Encoding> {
    let candidate = trim_ascii_whitespace(label);
    if candidate.is_empty() {
        return None;
    }
    ENCODING_LABELS
        .iter()
        .find(|&&(known, _)| ascii_eq_ignore_case(candidate, known))
        .map(|&(_, encoding)| encoding)
}

/// Detect a byte-order mark at the start of `bytes`.
///
/// Recognizes the UTF-8 (`EF BB BF`), UTF-16BE (`FE FF`) and UTF-16LE
/// (`FF FE`) marks, checked in that order.
///
/// Returns the detected encoding together with the bytes that follow the BOM.
/// When no BOM is present the result is `(None, bytes)` with the input
/// untouched.
pub fn bom_sniff(bytes: &[u8]) -> (Option<Encoding>, &[u8]) {
    match bytes {
        [0xEF, 0xBB, 0xBF, rest @ ..] => (Some(Encoding::Utf8), rest),
        [0xFE, 0xFF, rest @ ..] => (Some(Encoding::Utf16Be), rest),
        [0xFF, 0xFE, rest @ ..] => (Some(Encoding::Utf16Le), rest),
        _ => (None, bytes),
    }
}

// ---------------------------------------------------------------------------
// Public API
// ---------------------------------------------------------------------------

/// Decode bytes to a `String` using the given encoding.
///
/// The decoder runs in replacement mode (per the WHATWG spec): invalid
/// sequences are replaced with U+FFFD instead of producing an error.
///
/// # Panics
///
/// Panics only if the underlying decoder returns `Err` while in replacement
/// mode. That is expected never to happen and would indicate a decoder bug
/// rather than bad input.
pub fn decode(bytes: &[u8], encoding: Encoding) -> String {
    let decoded = match encoding {
        Encoding::Utf8 => utf8::decode_utf8(bytes, ErrorMode::Replacement),
        Encoding::Utf16Be => utf16::decode_utf16be(bytes, ErrorMode::Replacement),
        Encoding::Utf16Le => utf16::decode_utf16le(bytes, ErrorMode::Replacement),
    };
    // State the invariant in the panic message so a violation is diagnosable.
    decoded.expect("replacement-mode decoding must never fail")
}

/// Decode bytes to a `String`, failing on malformed input.
///
/// The decoder runs in fatal mode (per the WHATWG spec).
///
/// # Errors
///
/// Returns `Err` as soon as the first invalid byte sequence is encountered.
pub fn decode_strict(bytes: &[u8], encoding: Encoding) -> Result<String> {
    let mode = ErrorMode::Fatal;
    match encoding {
        Encoding::Utf8 => utf8::decode_utf8(bytes, mode),
        Encoding::Utf16Be => utf16::decode_utf16be(bytes, mode),
        Encoding::Utf16Le => utf16::decode_utf16le(bytes, mode),
    }
}

/// Encode a string to bytes using the given encoding.
///
/// Only UTF-8 is supported as an output encoding. Per the WHATWG spec, the
/// UTF-16 encodings are decode-only.
///
/// # Errors
///
/// Returns [`EncodingError::EncodeNotSupported`] carrying the encoding's
/// canonical name when `encoding` is UTF-16BE or UTF-16LE.
pub fn encode(text: &str, encoding: Encoding) -> Result<Vec<u8>> {
    match encoding {
        Encoding::Utf8 => Ok(utf8::encode_utf8(text)),
        // Variants are listed explicitly (no `_` arm) so that adding a new
        // encoding forces a decision here. The name comes from
        // `Encoding::name` so it cannot drift from the canonical spelling.
        Encoding::Utf16Be | Encoding::Utf16Le => Err(EncodingError::EncodeNotSupported {
            encoding: encoding.name(),
        }),
    }
}

// ---------------------------------------------------------------------------
// Internal helpers
// ---------------------------------------------------------------------------

/// ASCII whitespace per WHATWG spec: TAB, LF, FF, CR, SPACE.
fn trim_ascii_whitespace(s: &str) -> &str {
    let bytes = s.as_bytes();
    let start = bytes
        .iter()
        .position(|&b| !is_ascii_whitespace(b))
        .unwrap_or(bytes.len());
    let end = bytes
        .iter()
        .rposition(|&b| !is_ascii_whitespace(b))
        .map(|p| p + 1)
        .unwrap_or(0);
    if start >= end {
        return "";
    }
    &s[start..end]
}

/// Returns `true` when `b` is ASCII whitespace as defined by WHATWG:
/// TAB (0x09), LF (0x0A), FF (0x0C), CR (0x0D) or SPACE (0x20).
fn is_ascii_whitespace(b: u8) -> bool {
    // The standard library's definition is this exact five-byte set
    // (vertical tab, 0x0B, is deliberately excluded).
    b.is_ascii_whitespace()
}

/// Compares two strings for equality, ignoring case for ASCII letters only.
///
/// Non-ASCII bytes must match exactly, so `"é"` and `"É"` are not equal.
fn ascii_eq_ignore_case(a: &str, b: &str) -> bool {
    a.as_bytes().eq_ignore_ascii_case(b.as_bytes())
}

// ---------------------------------------------------------------------------
// Tests
// ---------------------------------------------------------------------------

#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that every label in `labels` resolves to `expected`.
    fn assert_lookup_all(labels: &[&str], expected: Option<Encoding>) {
        for label in labels {
            assert_eq!(lookup(label), expected, "label: {:?}", label);
        }
    }

    /// Asserts that sniffing `input` reports `encoding` and leaves `remainder`.
    fn assert_bom(input: &[u8], encoding: Option<Encoding>, remainder: &[u8]) {
        let (detected, rest) = bom_sniff(input);
        assert_eq!(detected, encoding);
        assert_eq!(rest, remainder);
    }

    // -- Encoding enum --

    #[test]
    fn encoding_names() {
        let expected = [
            (Encoding::Utf8, "UTF-8"),
            (Encoding::Utf16Be, "UTF-16BE"),
            (Encoding::Utf16Le, "UTF-16LE"),
        ];
        for &(encoding, name) in expected.iter() {
            assert_eq!(encoding.name(), name);
        }
    }

    // -- Label lookup --

    #[test]
    fn lookup_utf8_labels() {
        assert_lookup_all(
            &[
                "utf-8",
                "UTF-8",
                "utf8",
                "Utf8",
                "unicode-1-1-utf-8",
                "x-unicode20utf8",
            ],
            Some(Encoding::Utf8),
        );
    }

    #[test]
    fn lookup_utf16_labels() {
        assert_lookup_all(
            &["utf-16be", "UTF-16BE", "unicodefffe"],
            Some(Encoding::Utf16Be),
        );
        assert_lookup_all(
            &["utf-16le", "utf-16", "unicode", "ucs-2", "iso-10646-ucs-2"],
            Some(Encoding::Utf16Le),
        );
    }

    #[test]
    fn lookup_with_whitespace() {
        assert_lookup_all(&["  utf-8  ", "\tutf-8\n"], Some(Encoding::Utf8));
        assert_lookup_all(&["\r\nutf-16le\r\n"], Some(Encoding::Utf16Le));
    }

    #[test]
    fn lookup_unknown() {
        assert_lookup_all(&["latin1", "", "   ", "utf-99"], None);
    }

    // -- BOM sniffing --

    #[test]
    fn bom_utf8() {
        assert_bom(&[0xEF, 0xBB, 0xBF, 0x41], Some(Encoding::Utf8), &[0x41]);
    }

    #[test]
    fn bom_utf16be() {
        assert_bom(
            &[0xFE, 0xFF, 0x00, 0x41],
            Some(Encoding::Utf16Be),
            &[0x00, 0x41],
        );
    }

    #[test]
    fn bom_utf16le() {
        assert_bom(
            &[0xFF, 0xFE, 0x41, 0x00],
            Some(Encoding::Utf16Le),
            &[0x41, 0x00],
        );
    }

    #[test]
    fn bom_none() {
        // No BOM: the input must come back unchanged.
        assert_bom(&[0x41, 0x42, 0x43], None, &[0x41, 0x42, 0x43]);
    }

    #[test]
    fn bom_empty() {
        assert_bom(&[], None, &[]);
    }

    #[test]
    fn bom_short() {
        // A truncated UTF-8 BOM is not a BOM.
        assert_bom(&[0xEF, 0xBB], None, &[0xEF, 0xBB]);
    }

    // -- Top-level decode --

    #[test]
    fn decode_utf8_basic() {
        let text = decode(b"Hello", Encoding::Utf8);
        assert_eq!(text, "Hello");
    }

    #[test]
    fn decode_utf8_invalid_replaces() {
        let text = decode(&[0xFF], Encoding::Utf8);
        assert_eq!(text, "\u{FFFD}");
    }

    #[test]
    fn decode_utf16le_basic() {
        let text = decode(&[0x41, 0x00], Encoding::Utf16Le);
        assert_eq!(text, "A");
    }

    #[test]
    fn decode_utf16be_basic() {
        let text = decode(&[0x00, 0x41], Encoding::Utf16Be);
        assert_eq!(text, "A");
    }

    // -- Top-level decode_strict --

    #[test]
    fn decode_strict_valid() {
        let outcome = decode_strict(b"Hello", Encoding::Utf8);
        assert_eq!(outcome.unwrap(), "Hello");
    }

    #[test]
    fn decode_strict_invalid() {
        let outcome = decode_strict(&[0xFF], Encoding::Utf8);
        assert!(outcome.is_err());
    }

    // -- Top-level encode --

    #[test]
    fn encode_utf8_basic() {
        let bytes = encode("Hello", Encoding::Utf8).unwrap();
        assert_eq!(bytes, b"Hello".to_vec());
    }

    #[test]
    fn encode_utf16_not_supported() {
        let cases = [
            (Encoding::Utf16Le, "UTF-16LE"),
            (Encoding::Utf16Be, "UTF-16BE"),
        ];
        for &(target, canonical) in cases.iter() {
            assert!(matches!(
                encode("Hello", target),
                Err(EncodingError::EncodeNotSupported { encoding: reported })
                    if reported == canonical
            ));
        }
    }

    // -- Trim helpers --

    #[test]
    fn trim_ascii_whitespace_basic() {
        let cases = [
            ("  hello  ", "hello"),
            ("hello", "hello"),
            ("", ""),
            ("   ", ""),
            ("\t\nhello\r\n", "hello"),
        ];
        for &(input, trimmed) in cases.iter() {
            assert_eq!(trim_ascii_whitespace(input), trimmed, "input: {:?}", input);
        }
    }

    #[test]
    fn ascii_eq_ignore_case_basic() {
        let cases = [
            ("utf-8", "UTF-8", true),
            ("Utf-8", "utf-8", true),
            ("utf-8", "utf-9", false),
            ("utf-8", "utf-8x", false),
        ];
        for &(left, right, equal) in cases.iter() {
            assert_eq!(
                ascii_eq_ignore_case(left, right),
                equal,
                "{:?} vs {:?}",
                left,
                right
            );
        }
    }
}