//! WHATWG Encoding Standard — UTF-8 and UTF-16 codecs, pure Rust. pub mod error; mod utf16; mod utf8; use error::{EncodingError, Result}; use utf8::ErrorMode; // --------------------------------------------------------------------------- // Encoding enum // --------------------------------------------------------------------------- /// Supported text encodings per WHATWG Encoding Standard. #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] pub enum Encoding { Utf8, Utf16Be, Utf16Le, } impl Encoding { /// Canonical name per WHATWG spec. pub fn name(&self) -> &'static str { match self { Self::Utf8 => "UTF-8", Self::Utf16Be => "UTF-16BE", Self::Utf16Le => "UTF-16LE", } } } // --------------------------------------------------------------------------- // Label lookup (WHATWG Encoding Standard §4.2) // --------------------------------------------------------------------------- /// WHATWG encoding label mappings. /// Labels are stored in lowercase; lookup normalizes input to lowercase. const ENCODING_LABELS: &[(&str, Encoding)] = &[ // UTF-8 labels ("unicode-1-1-utf-8", Encoding::Utf8), ("unicode11utf8", Encoding::Utf8), ("unicode20utf8", Encoding::Utf8), ("utf-8", Encoding::Utf8), ("utf8", Encoding::Utf8), ("x-unicode20utf8", Encoding::Utf8), // UTF-16BE labels ("unicodefffe", Encoding::Utf16Be), ("utf-16be", Encoding::Utf16Be), // UTF-16LE labels ("csunicode", Encoding::Utf16Le), ("iso-10646-ucs-2", Encoding::Utf16Le), ("ucs-2", Encoding::Utf16Le), ("unicode", Encoding::Utf16Le), ("unicodefeff", Encoding::Utf16Le), ("utf-16", Encoding::Utf16Le), ("utf-16le", Encoding::Utf16Le), ]; /// Look up an encoding by its WHATWG label. /// /// Strips leading/trailing ASCII whitespace and compares case-insensitively, /// per the WHATWG Encoding Standard. pub fn lookup(label: &str) -> Option { let trimmed = trim_ascii_whitespace(label); if trimmed.is_empty() { return None; } for &(name, enc) in ENCODING_LABELS { if ascii_eq_ignore_case(trimmed, name) { return Some(enc); } } None } /// Sniff BOM from the start of a byte slice. /// /// Returns the detected encoding (if any) and the remaining bytes after the BOM. pub fn bom_sniff(bytes: &[u8]) -> (Option, &[u8]) { if bytes.len() >= 3 && bytes[0] == 0xEF && bytes[1] == 0xBB && bytes[2] == 0xBF { (Some(Encoding::Utf8), &bytes[3..]) } else if bytes.len() >= 2 && bytes[0] == 0xFE && bytes[1] == 0xFF { (Some(Encoding::Utf16Be), &bytes[2..]) } else if bytes.len() >= 2 && bytes[0] == 0xFF && bytes[1] == 0xFE { (Some(Encoding::Utf16Le), &bytes[2..]) } else { (None, bytes) } } // --------------------------------------------------------------------------- // Public API // --------------------------------------------------------------------------- /// Decode bytes to a `String` using the given encoding. /// /// Invalid sequences are replaced with U+FFFD (replacement mode per WHATWG spec). pub fn decode(bytes: &[u8], encoding: Encoding) -> String { // Replacement mode never fails match encoding { Encoding::Utf8 => utf8::decode_utf8(bytes, ErrorMode::Replacement).unwrap(), Encoding::Utf16Le => utf16::decode_utf16le(bytes, ErrorMode::Replacement).unwrap(), Encoding::Utf16Be => utf16::decode_utf16be(bytes, ErrorMode::Replacement).unwrap(), } } /// Decode bytes to a `String`, returning an error on any invalid sequence. /// /// Fatal mode per WHATWG spec — returns `Err` on the first invalid byte sequence. pub fn decode_strict(bytes: &[u8], encoding: Encoding) -> Result { match encoding { Encoding::Utf8 => utf8::decode_utf8(bytes, ErrorMode::Fatal), Encoding::Utf16Le => utf16::decode_utf16le(bytes, ErrorMode::Fatal), Encoding::Utf16Be => utf16::decode_utf16be(bytes, ErrorMode::Fatal), } } /// Encode a string to bytes using the given encoding. /// /// Only UTF-8 encoding is supported for encode. Per WHATWG spec, UTF-16 /// encodings are decode-only. pub fn encode(text: &str, encoding: Encoding) -> Result> { match encoding { Encoding::Utf8 => Ok(utf8::encode_utf8(text)), Encoding::Utf16Be => Err(EncodingError::EncodeNotSupported { encoding: "UTF-16BE", }), Encoding::Utf16Le => Err(EncodingError::EncodeNotSupported { encoding: "UTF-16LE", }), } } // --------------------------------------------------------------------------- // Internal helpers // --------------------------------------------------------------------------- /// ASCII whitespace per WHATWG spec: TAB, LF, FF, CR, SPACE. fn trim_ascii_whitespace(s: &str) -> &str { let bytes = s.as_bytes(); let start = bytes .iter() .position(|&b| !is_ascii_whitespace(b)) .unwrap_or(bytes.len()); let end = bytes .iter() .rposition(|&b| !is_ascii_whitespace(b)) .map(|p| p + 1) .unwrap_or(0); if start >= end { return ""; } &s[start..end] } fn is_ascii_whitespace(b: u8) -> bool { matches!(b, 0x09 | 0x0A | 0x0C | 0x0D | 0x20) } fn ascii_eq_ignore_case(a: &str, b: &str) -> bool { a.eq_ignore_ascii_case(b) } // --------------------------------------------------------------------------- // Tests // --------------------------------------------------------------------------- #[cfg(test)] mod tests { use super::*; // -- Encoding enum -- #[test] fn encoding_names() { assert_eq!(Encoding::Utf8.name(), "UTF-8"); assert_eq!(Encoding::Utf16Be.name(), "UTF-16BE"); assert_eq!(Encoding::Utf16Le.name(), "UTF-16LE"); } // -- Label lookup -- #[test] fn lookup_utf8_labels() { assert_eq!(lookup("utf-8"), Some(Encoding::Utf8)); assert_eq!(lookup("UTF-8"), Some(Encoding::Utf8)); assert_eq!(lookup("utf8"), Some(Encoding::Utf8)); assert_eq!(lookup("Utf8"), Some(Encoding::Utf8)); assert_eq!(lookup("unicode-1-1-utf-8"), Some(Encoding::Utf8)); assert_eq!(lookup("x-unicode20utf8"), Some(Encoding::Utf8)); } #[test] fn lookup_utf16_labels() { assert_eq!(lookup("utf-16be"), Some(Encoding::Utf16Be)); assert_eq!(lookup("UTF-16BE"), Some(Encoding::Utf16Be)); assert_eq!(lookup("unicodefffe"), Some(Encoding::Utf16Be)); assert_eq!(lookup("utf-16le"), Some(Encoding::Utf16Le)); assert_eq!(lookup("utf-16"), Some(Encoding::Utf16Le)); assert_eq!(lookup("unicode"), Some(Encoding::Utf16Le)); assert_eq!(lookup("ucs-2"), Some(Encoding::Utf16Le)); assert_eq!(lookup("iso-10646-ucs-2"), Some(Encoding::Utf16Le)); } #[test] fn lookup_with_whitespace() { assert_eq!(lookup(" utf-8 "), Some(Encoding::Utf8)); assert_eq!(lookup("\tutf-8\n"), Some(Encoding::Utf8)); assert_eq!(lookup("\r\nutf-16le\r\n"), Some(Encoding::Utf16Le)); } #[test] fn lookup_unknown() { assert_eq!(lookup("latin1"), None); assert_eq!(lookup(""), None); assert_eq!(lookup(" "), None); assert_eq!(lookup("utf-99"), None); } // -- BOM sniffing -- #[test] fn bom_utf8() { let (enc, rest) = bom_sniff(&[0xEF, 0xBB, 0xBF, 0x41]); assert_eq!(enc, Some(Encoding::Utf8)); assert_eq!(rest, &[0x41]); } #[test] fn bom_utf16be() { let (enc, rest) = bom_sniff(&[0xFE, 0xFF, 0x00, 0x41]); assert_eq!(enc, Some(Encoding::Utf16Be)); assert_eq!(rest, &[0x00, 0x41]); } #[test] fn bom_utf16le() { let (enc, rest) = bom_sniff(&[0xFF, 0xFE, 0x41, 0x00]); assert_eq!(enc, Some(Encoding::Utf16Le)); assert_eq!(rest, &[0x41, 0x00]); } #[test] fn bom_none() { let data = [0x41, 0x42, 0x43]; let (enc, rest) = bom_sniff(&data); assert_eq!(enc, None); assert_eq!(rest, &data); } #[test] fn bom_empty() { let (enc, rest) = bom_sniff(&[]); assert_eq!(enc, None); assert_eq!(rest, &[] as &[u8]); } #[test] fn bom_short() { let (enc, rest) = bom_sniff(&[0xEF, 0xBB]); assert_eq!(enc, None); assert_eq!(rest, &[0xEF, 0xBB]); } // -- Top-level decode -- #[test] fn decode_utf8_basic() { assert_eq!(decode(b"Hello", Encoding::Utf8), "Hello"); } #[test] fn decode_utf8_invalid_replaces() { assert_eq!(decode(&[0xFF], Encoding::Utf8), "\u{FFFD}"); } #[test] fn decode_utf16le_basic() { assert_eq!(decode(&[0x41, 0x00], Encoding::Utf16Le), "A"); } #[test] fn decode_utf16be_basic() { assert_eq!(decode(&[0x00, 0x41], Encoding::Utf16Be), "A"); } // -- Top-level decode_strict -- #[test] fn decode_strict_valid() { assert_eq!(decode_strict(b"Hello", Encoding::Utf8).unwrap(), "Hello"); } #[test] fn decode_strict_invalid() { assert!(decode_strict(&[0xFF], Encoding::Utf8).is_err()); } // -- Top-level encode -- #[test] fn encode_utf8_basic() { assert_eq!(encode("Hello", Encoding::Utf8).unwrap(), b"Hello"); } #[test] fn encode_utf16_not_supported() { assert!(matches!( encode("Hello", Encoding::Utf16Le), Err(EncodingError::EncodeNotSupported { encoding: "UTF-16LE" }) )); assert!(matches!( encode("Hello", Encoding::Utf16Be), Err(EncodingError::EncodeNotSupported { encoding: "UTF-16BE" }) )); } // -- Trim helpers -- #[test] fn trim_ascii_whitespace_basic() { assert_eq!(trim_ascii_whitespace(" hello "), "hello"); assert_eq!(trim_ascii_whitespace("hello"), "hello"); assert_eq!(trim_ascii_whitespace(""), ""); assert_eq!(trim_ascii_whitespace(" "), ""); assert_eq!(trim_ascii_whitespace("\t\nhello\r\n"), "hello"); } #[test] fn ascii_eq_ignore_case_basic() { assert!(ascii_eq_ignore_case("utf-8", "UTF-8")); assert!(ascii_eq_ignore_case("Utf-8", "utf-8")); assert!(!ascii_eq_ignore_case("utf-8", "utf-9")); assert!(!ascii_eq_ignore_case("utf-8", "utf-8x")); } }