//! WHATWG Encoding Standard — UTF-8, UTF-16, and legacy single-byte codecs, pure Rust. pub mod error; mod single_byte; pub mod sniff; mod utf16; mod utf8; use error::{EncodingError, Result}; use utf8::ErrorMode; // --------------------------------------------------------------------------- // Encoding enum // --------------------------------------------------------------------------- /// Supported text encodings per WHATWG Encoding Standard. #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] pub enum Encoding { Utf8, Utf16Be, Utf16Le, // Single-byte encodings Ibm866, Iso8859_2, Iso8859_3, Iso8859_4, Iso8859_5, Iso8859_6, Iso8859_7, Iso8859_8, Iso8859_8I, Iso8859_10, Iso8859_13, Iso8859_14, Iso8859_15, Iso8859_16, Koi8R, Koi8U, Macintosh, Windows874, Windows1250, Windows1251, Windows1252, Windows1253, Windows1254, Windows1255, Windows1256, Windows1257, Windows1258, XMacCyrillic, } impl Encoding { /// Canonical name per WHATWG spec. pub fn name(&self) -> &'static str { match self { Self::Utf8 => "UTF-8", Self::Utf16Be => "UTF-16BE", Self::Utf16Le => "UTF-16LE", Self::Ibm866 => "IBM866", Self::Iso8859_2 => "ISO-8859-2", Self::Iso8859_3 => "ISO-8859-3", Self::Iso8859_4 => "ISO-8859-4", Self::Iso8859_5 => "ISO-8859-5", Self::Iso8859_6 => "ISO-8859-6", Self::Iso8859_7 => "ISO-8859-7", Self::Iso8859_8 => "ISO-8859-8", Self::Iso8859_8I => "ISO-8859-8-I", Self::Iso8859_10 => "ISO-8859-10", Self::Iso8859_13 => "ISO-8859-13", Self::Iso8859_14 => "ISO-8859-14", Self::Iso8859_15 => "ISO-8859-15", Self::Iso8859_16 => "ISO-8859-16", Self::Koi8R => "KOI8-R", Self::Koi8U => "KOI8-U", Self::Macintosh => "macintosh", Self::Windows874 => "windows-874", Self::Windows1250 => "windows-1250", Self::Windows1251 => "windows-1251", Self::Windows1252 => "windows-1252", Self::Windows1253 => "windows-1253", Self::Windows1254 => "windows-1254", Self::Windows1255 => "windows-1255", Self::Windows1256 => "windows-1256", Self::Windows1257 => "windows-1257", Self::Windows1258 => "windows-1258", 
Self::XMacCyrillic => "x-mac-cyrillic", } } } // --------------------------------------------------------------------------- // Label lookup (WHATWG Encoding Standard §4.2) // --------------------------------------------------------------------------- /// WHATWG encoding label mappings. /// Labels are stored in lowercase; lookup normalizes input to lowercase. const ENCODING_LABELS: &[(&str, Encoding)] = &[ // UTF-8 labels ("unicode-1-1-utf-8", Encoding::Utf8), ("unicode11utf8", Encoding::Utf8), ("unicode20utf8", Encoding::Utf8), ("utf-8", Encoding::Utf8), ("utf8", Encoding::Utf8), ("x-unicode20utf8", Encoding::Utf8), // UTF-16BE labels ("unicodefffe", Encoding::Utf16Be), ("utf-16be", Encoding::Utf16Be), // UTF-16LE labels ("csunicode", Encoding::Utf16Le), ("iso-10646-ucs-2", Encoding::Utf16Le), ("ucs-2", Encoding::Utf16Le), ("unicode", Encoding::Utf16Le), ("unicodefeff", Encoding::Utf16Le), ("utf-16", Encoding::Utf16Le), ("utf-16le", Encoding::Utf16Le), // IBM866 labels ("866", Encoding::Ibm866), ("cp866", Encoding::Ibm866), ("csibm866", Encoding::Ibm866), ("ibm866", Encoding::Ibm866), // ISO-8859-2 labels ("csisolatin2", Encoding::Iso8859_2), ("iso-8859-2", Encoding::Iso8859_2), ("iso-ir-101", Encoding::Iso8859_2), ("iso8859-2", Encoding::Iso8859_2), ("iso88592", Encoding::Iso8859_2), ("iso_8859-2", Encoding::Iso8859_2), ("iso_8859-2:1987", Encoding::Iso8859_2), ("l2", Encoding::Iso8859_2), ("latin2", Encoding::Iso8859_2), // ISO-8859-3 labels ("csisolatin3", Encoding::Iso8859_3), ("iso-8859-3", Encoding::Iso8859_3), ("iso-ir-109", Encoding::Iso8859_3), ("iso8859-3", Encoding::Iso8859_3), ("iso88593", Encoding::Iso8859_3), ("iso_8859-3", Encoding::Iso8859_3), ("iso_8859-3:1988", Encoding::Iso8859_3), ("l3", Encoding::Iso8859_3), ("latin3", Encoding::Iso8859_3), // ISO-8859-4 labels ("csisolatin4", Encoding::Iso8859_4), ("iso-8859-4", Encoding::Iso8859_4), ("iso-ir-110", Encoding::Iso8859_4), ("iso8859-4", Encoding::Iso8859_4), ("iso88594", Encoding::Iso8859_4), 
("iso_8859-4", Encoding::Iso8859_4), ("iso_8859-4:1988", Encoding::Iso8859_4), ("l4", Encoding::Iso8859_4), ("latin4", Encoding::Iso8859_4), // ISO-8859-5 labels ("csisolatincyrillic", Encoding::Iso8859_5), ("cyrillic", Encoding::Iso8859_5), ("iso-8859-5", Encoding::Iso8859_5), ("iso-ir-144", Encoding::Iso8859_5), ("iso8859-5", Encoding::Iso8859_5), ("iso88595", Encoding::Iso8859_5), ("iso_8859-5", Encoding::Iso8859_5), ("iso_8859-5:1988", Encoding::Iso8859_5), // ISO-8859-6 labels ("arabic", Encoding::Iso8859_6), ("asmo-708", Encoding::Iso8859_6), ("csiso88596e", Encoding::Iso8859_6), ("csiso88596i", Encoding::Iso8859_6), ("csisolatinarabic", Encoding::Iso8859_6), ("ecma-114", Encoding::Iso8859_6), ("iso-8859-6", Encoding::Iso8859_6), ("iso-8859-6-e", Encoding::Iso8859_6), ("iso-8859-6-i", Encoding::Iso8859_6), ("iso-ir-127", Encoding::Iso8859_6), ("iso8859-6", Encoding::Iso8859_6), ("iso88596", Encoding::Iso8859_6), ("iso_8859-6", Encoding::Iso8859_6), ("iso_8859-6:1987", Encoding::Iso8859_6), // ISO-8859-7 labels ("csisolatingreek", Encoding::Iso8859_7), ("ecma-118", Encoding::Iso8859_7), ("elot_928", Encoding::Iso8859_7), ("greek", Encoding::Iso8859_7), ("greek8", Encoding::Iso8859_7), ("iso-8859-7", Encoding::Iso8859_7), ("iso-ir-126", Encoding::Iso8859_7), ("iso8859-7", Encoding::Iso8859_7), ("iso88597", Encoding::Iso8859_7), ("iso_8859-7", Encoding::Iso8859_7), ("iso_8859-7:1987", Encoding::Iso8859_7), ("sun_eu_greek", Encoding::Iso8859_7), // ISO-8859-8 labels ("csiso88598e", Encoding::Iso8859_8), ("csisolatinhebrew", Encoding::Iso8859_8), ("hebrew", Encoding::Iso8859_8), ("iso-8859-8", Encoding::Iso8859_8), ("iso-8859-8-e", Encoding::Iso8859_8), ("iso-ir-138", Encoding::Iso8859_8), ("iso8859-8", Encoding::Iso8859_8), ("iso88598", Encoding::Iso8859_8), ("iso_8859-8", Encoding::Iso8859_8), ("iso_8859-8:1988", Encoding::Iso8859_8), ("visual", Encoding::Iso8859_8), // ISO-8859-8-I labels ("csiso88598i", Encoding::Iso8859_8I), ("iso-8859-8-i", 
Encoding::Iso8859_8I), ("logical", Encoding::Iso8859_8I), // ISO-8859-10 labels ("csisolatin6", Encoding::Iso8859_10), ("iso-8859-10", Encoding::Iso8859_10), ("iso-ir-157", Encoding::Iso8859_10), ("iso8859-10", Encoding::Iso8859_10), ("iso885910", Encoding::Iso8859_10), ("l6", Encoding::Iso8859_10), ("latin6", Encoding::Iso8859_10), // ISO-8859-13 labels ("iso-8859-13", Encoding::Iso8859_13), ("iso8859-13", Encoding::Iso8859_13), ("iso885913", Encoding::Iso8859_13), // ISO-8859-14 labels ("iso-8859-14", Encoding::Iso8859_14), ("iso8859-14", Encoding::Iso8859_14), ("iso885914", Encoding::Iso8859_14), // ISO-8859-15 labels ("csisolatin9", Encoding::Iso8859_15), ("iso-8859-15", Encoding::Iso8859_15), ("iso8859-15", Encoding::Iso8859_15), ("iso885915", Encoding::Iso8859_15), ("iso_8859-15", Encoding::Iso8859_15), ("l9", Encoding::Iso8859_15), // ISO-8859-16 labels ("iso-8859-16", Encoding::Iso8859_16), // KOI8-R labels ("cskoi8r", Encoding::Koi8R), ("koi", Encoding::Koi8R), ("koi8", Encoding::Koi8R), ("koi8-r", Encoding::Koi8R), ("koi8_r", Encoding::Koi8R), // KOI8-U labels ("koi8-ru", Encoding::Koi8U), ("koi8-u", Encoding::Koi8U), // macintosh labels ("csmacintosh", Encoding::Macintosh), ("mac", Encoding::Macintosh), ("macintosh", Encoding::Macintosh), ("x-mac-roman", Encoding::Macintosh), // windows-874 labels ("dos-874", Encoding::Windows874), ("iso-8859-11", Encoding::Windows874), ("iso8859-11", Encoding::Windows874), ("iso885911", Encoding::Windows874), ("tis-620", Encoding::Windows874), ("windows-874", Encoding::Windows874), // windows-1250 labels ("cp1250", Encoding::Windows1250), ("windows-1250", Encoding::Windows1250), ("x-cp1250", Encoding::Windows1250), // windows-1251 labels ("cp1251", Encoding::Windows1251), ("windows-1251", Encoding::Windows1251), ("x-cp1251", Encoding::Windows1251), // windows-1252 labels (also serves as ISO-8859-1 and US-ASCII per WHATWG) ("ansi_x3.4-1968", Encoding::Windows1252), ("ascii", Encoding::Windows1252), ("cp1252", 
Encoding::Windows1252), ("cp819", Encoding::Windows1252), ("csisolatin1", Encoding::Windows1252), ("ibm819", Encoding::Windows1252), ("iso-8859-1", Encoding::Windows1252), ("iso-ir-100", Encoding::Windows1252), ("iso8859-1", Encoding::Windows1252), ("iso88591", Encoding::Windows1252), ("iso_8859-1", Encoding::Windows1252), ("iso_8859-1:1987", Encoding::Windows1252), ("l1", Encoding::Windows1252), ("latin1", Encoding::Windows1252), ("us-ascii", Encoding::Windows1252), ("windows-1252", Encoding::Windows1252), ("x-cp1252", Encoding::Windows1252), // windows-1253 labels ("cp1253", Encoding::Windows1253), ("windows-1253", Encoding::Windows1253), ("x-cp1253", Encoding::Windows1253), // windows-1254 labels ("cp1254", Encoding::Windows1254), ("csisolatin5", Encoding::Windows1254), ("iso-8859-9", Encoding::Windows1254), ("iso-ir-148", Encoding::Windows1254), ("iso8859-9", Encoding::Windows1254), ("iso88599", Encoding::Windows1254), ("iso_8859-9", Encoding::Windows1254), ("iso_8859-9:1989", Encoding::Windows1254), ("l5", Encoding::Windows1254), ("latin5", Encoding::Windows1254), ("windows-1254", Encoding::Windows1254), ("x-cp1254", Encoding::Windows1254), // windows-1255 labels ("cp1255", Encoding::Windows1255), ("windows-1255", Encoding::Windows1255), ("x-cp1255", Encoding::Windows1255), // windows-1256 labels ("cp1256", Encoding::Windows1256), ("windows-1256", Encoding::Windows1256), ("x-cp1256", Encoding::Windows1256), // windows-1257 labels ("cp1257", Encoding::Windows1257), ("windows-1257", Encoding::Windows1257), ("x-cp1257", Encoding::Windows1257), // windows-1258 labels ("cp1258", Encoding::Windows1258), ("windows-1258", Encoding::Windows1258), ("x-cp1258", Encoding::Windows1258), // x-mac-cyrillic labels ("x-mac-cyrillic", Encoding::XMacCyrillic), ("x-mac-ukrainian", Encoding::XMacCyrillic), ]; /// Look up an encoding by its WHATWG label. /// /// Strips leading/trailing ASCII whitespace and compares case-insensitively, /// per the WHATWG Encoding Standard. 
pub fn lookup(label: &str) -> Option { let trimmed = trim_ascii_whitespace(label); if trimmed.is_empty() { return None; } for &(name, enc) in ENCODING_LABELS { if ascii_eq_ignore_case(trimmed, name) { return Some(enc); } } None } /// Sniff BOM from the start of a byte slice. /// /// Returns the detected encoding (if any) and the remaining bytes after the BOM. pub fn bom_sniff(bytes: &[u8]) -> (Option, &[u8]) { if bytes.len() >= 3 && bytes[0] == 0xEF && bytes[1] == 0xBB && bytes[2] == 0xBF { (Some(Encoding::Utf8), &bytes[3..]) } else if bytes.len() >= 2 && bytes[0] == 0xFE && bytes[1] == 0xFF { (Some(Encoding::Utf16Be), &bytes[2..]) } else if bytes.len() >= 2 && bytes[0] == 0xFF && bytes[1] == 0xFE { (Some(Encoding::Utf16Le), &bytes[2..]) } else { (None, bytes) } } // --------------------------------------------------------------------------- // Public API // --------------------------------------------------------------------------- /// Decode bytes to a `String` using the given encoding. /// /// Invalid sequences are replaced with U+FFFD (replacement mode per WHATWG spec). pub fn decode(bytes: &[u8], encoding: Encoding) -> String { // Replacement mode never fails match encoding { Encoding::Utf8 => utf8::decode_utf8(bytes, ErrorMode::Replacement).unwrap(), Encoding::Utf16Le => utf16::decode_utf16le(bytes, ErrorMode::Replacement).unwrap(), Encoding::Utf16Be => utf16::decode_utf16be(bytes, ErrorMode::Replacement).unwrap(), enc => { let table = single_byte::table_for(&enc).unwrap(); single_byte::decode_single_byte(bytes, table, enc.name(), ErrorMode::Replacement) .unwrap() } } } /// Decode bytes to a `String`, returning an error on any invalid sequence. /// /// Fatal mode per WHATWG spec — returns `Err` on the first invalid byte sequence. 
pub fn decode_strict(bytes: &[u8], encoding: Encoding) -> Result<String> {
    match encoding {
        Encoding::Utf8 => utf8::decode_utf8(bytes, ErrorMode::Fatal),
        Encoding::Utf16Le => utf16::decode_utf16le(bytes, ErrorMode::Fatal),
        Encoding::Utf16Be => utf16::decode_utf16be(bytes, ErrorMode::Fatal),
        // Everything that is not UTF-8/UTF-16 is decoded through a lookup table.
        enc => {
            // A missing table is a bug in this crate, not a property of the
            // input, so it panics instead of surfacing as a decode error.
            let table = single_byte::table_for(&enc)
                .expect("a single-byte table must exist for every non-UTF encoding");
            single_byte::decode_single_byte(bytes, table, enc.name(), ErrorMode::Fatal)
        }
    }
}

/// Encode a string to bytes using the given encoding.
///
/// Only the UTF-8 encoder is available through this function.
///
/// # Errors
///
/// Returns [`EncodingError::EncodeNotSupported`] (carrying the encoding's
/// canonical name) for every encoding other than [`Encoding::Utf8`].
// NOTE(review): an earlier comment here said all non-UTF-8 encodings are
// decode-only per the WHATWG spec; that looks too broad for the single-byte
// encodings — confirm against the standard before relying on it.
pub fn encode(text: &str, encoding: Encoding) -> Result<Vec<u8>> {
    match encoding {
        Encoding::Utf8 => Ok(utf8::encode_utf8(text)),
        other => Err(EncodingError::EncodeNotSupported {
            encoding: other.name(),
        }),
    }
}

// ---------------------------------------------------------------------------
// Internal helpers
// ---------------------------------------------------------------------------

/// Strips leading and trailing ASCII whitespace as defined by the WHATWG spec:
/// TAB, LF, FF, CR, SPACE.
fn trim_ascii_whitespace(s: &str) -> &str {
    // Only ASCII characters can be in the whitespace set, so the `as u8`
    // cast is guarded by `is_ascii` and never truncates.
    s.trim_matches(|c: char| c.is_ascii() && is_ascii_whitespace(c as u8))
}

/// Returns `true` for the five whitespace bytes: TAB, LF, FF, CR and SPACE.
fn is_ascii_whitespace(b: u8) -> bool {
    matches!(b, b'\t' | b'\n' | b'\x0C' | b'\r' | b' ')
}

/// Compares two strings for equality, ignoring ASCII case differences only.
fn ascii_eq_ignore_case(a: &str, b: &str) -> bool {
    a.as_bytes().eq_ignore_ascii_case(b.as_bytes())
}

// ---------------------------------------------------------------------------
// Tests
// ---------------------------------------------------------------------------

#[cfg(test)]
mod tests {
    use super::*;

    // -- Encoding enum --

    #[test]
    fn encoding_names() {
        assert_eq!(Encoding::Utf8.name(), "UTF-8");
        assert_eq!(Encoding::Utf16Be.name(), "UTF-16BE");
        assert_eq!(Encoding::Utf16Le.name(), "UTF-16LE");
        assert_eq!(Encoding::Windows1252.name(), "windows-1252");
        assert_eq!(Encoding::Iso8859_2.name(), "ISO-8859-2");
        assert_eq!(Encoding::Koi8R.name(), "KOI8-R");
        assert_eq!(Encoding::Macintosh.name(), "macintosh");
    }

    // -- Label lookup --

    #[test]
    fn lookup_utf8_labels() {
        assert_eq!(lookup("utf-8"), Some(Encoding::Utf8));
        assert_eq!(lookup("UTF-8"), Some(Encoding::Utf8));
        assert_eq!(lookup("utf8"), Some(Encoding::Utf8));
        assert_eq!(lookup("Utf8"), Some(Encoding::Utf8));
        assert_eq!(lookup("unicode-1-1-utf-8"), Some(Encoding::Utf8));
        assert_eq!(lookup("x-unicode20utf8"), Some(Encoding::Utf8));
    }

    #[test]
    fn lookup_utf16_labels() {
        assert_eq!(lookup("utf-16be"), Some(Encoding::Utf16Be));
        assert_eq!(lookup("UTF-16BE"), Some(Encoding::Utf16Be));
        assert_eq!(lookup("unicodefffe"), Some(Encoding::Utf16Be));
        assert_eq!(lookup("utf-16le"), Some(Encoding::Utf16Le));
        assert_eq!(lookup("utf-16"), Some(Encoding::Utf16Le));
        assert_eq!(lookup("unicode"), Some(Encoding::Utf16Le));
        assert_eq!(lookup("ucs-2"), Some(Encoding::Utf16Le));
        assert_eq!(lookup("iso-10646-ucs-2"), Some(Encoding::Utf16Le));
    }

    #[test]
    fn lookup_windows_1252_labels() {
        // windows-1252 is THE most important single-byte encoding
        assert_eq!(lookup("windows-1252"), Some(Encoding::Windows1252));
        assert_eq!(lookup("cp1252"), Some(Encoding::Windows1252));
        assert_eq!(lookup("x-cp1252"), Some(Encoding::Windows1252));
        // ISO-8859-1 maps to windows-1252 per WHATWG
        assert_eq!(lookup("iso-8859-1"), Some(Encoding::Windows1252));
        assert_eq!(lookup("latin1"), Some(Encoding::Windows1252));
        assert_eq!(lookup("l1"), Some(Encoding::Windows1252));
        // US-ASCII maps to windows-1252 per WHATWG
        assert_eq!(lookup("us-ascii"), Some(Encoding::Windows1252));
        assert_eq!(lookup("ascii"), Some(Encoding::Windows1252));
    }

    #[test]
    fn lookup_legacy_labels() {
        assert_eq!(lookup("iso-8859-2"), Some(Encoding::Iso8859_2));
        assert_eq!(lookup("latin2"), Some(Encoding::Iso8859_2));
        assert_eq!(lookup("iso-8859-5"), Some(Encoding::Iso8859_5));
        assert_eq!(lookup("cyrillic"), Some(Encoding::Iso8859_5));
        assert_eq!(lookup("iso-8859-7"), Some(Encoding::Iso8859_7));
        assert_eq!(lookup("greek"), Some(Encoding::Iso8859_7));
        assert_eq!(lookup("iso-8859-15"), Some(Encoding::Iso8859_15));
        assert_eq!(lookup("koi8-r"), Some(Encoding::Koi8R));
        assert_eq!(lookup("koi8-u"), Some(Encoding::Koi8U));
        assert_eq!(lookup("macintosh"), Some(Encoding::Macintosh));
        assert_eq!(lookup("ibm866"), Some(Encoding::Ibm866));
        assert_eq!(lookup("windows-1251"), Some(Encoding::Windows1251));
        assert_eq!(lookup("windows-874"), Some(Encoding::Windows874));
        assert_eq!(lookup("iso-8859-9"), Some(Encoding::Windows1254));
        assert_eq!(lookup("x-mac-cyrillic"), Some(Encoding::XMacCyrillic));
    }

    #[test]
    fn lookup_with_whitespace() {
        assert_eq!(lookup(" utf-8 "), Some(Encoding::Utf8));
        assert_eq!(lookup("\tutf-8\n"), Some(Encoding::Utf8));
        assert_eq!(lookup("\r\nutf-16le\r\n"), Some(Encoding::Utf16Le));
        assert_eq!(lookup(" windows-1252 "), Some(Encoding::Windows1252));
    }

    #[test]
    fn lookup_unknown() {
        assert_eq!(lookup(""), None);
        assert_eq!(lookup(" "), None);
        assert_eq!(lookup("utf-99"), None);
        assert_eq!(lookup("bogus-encoding"), None);
    }

    // -- BOM sniffing --

    #[test]
    fn bom_utf8() {
        let (enc, rest) = bom_sniff(&[0xEF, 0xBB, 0xBF, 0x41]);
        assert_eq!(enc, Some(Encoding::Utf8));
        assert_eq!(rest, &[0x41]);
    }

    #[test]
    fn bom_utf16be() {
        let (enc, rest) = bom_sniff(&[0xFE, 0xFF, 0x00, 0x41]);
        assert_eq!(enc, Some(Encoding::Utf16Be));
        assert_eq!(rest, &[0x00, 0x41]);
    }

    #[test]
    fn bom_utf16le() {
        let (enc, rest) = bom_sniff(&[0xFF, 0xFE, 0x41, 0x00]);
        assert_eq!(enc, Some(Encoding::Utf16Le));
        assert_eq!(rest, &[0x41, 0x00]);
    }

    #[test]
    fn bom_none() {
        let data = [0x41, 0x42, 0x43];
        let (enc, rest) = bom_sniff(&data);
        assert_eq!(enc, None);
        assert_eq!(rest, &data);
    }

    #[test]
    fn bom_empty() {
        let (enc, rest) = bom_sniff(&[]);
        assert_eq!(enc, None);
        assert_eq!(rest, &[] as &[u8]);
    }

    #[test]
    fn bom_short() {
        let (enc, rest) = bom_sniff(&[0xEF, 0xBB]);
        assert_eq!(enc, None);
        assert_eq!(rest, &[0xEF, 0xBB]);
    }

    // -- Top-level decode --

    #[test]
    fn decode_utf8_basic() {
        assert_eq!(decode(b"Hello", Encoding::Utf8), "Hello");
    }

    #[test]
    fn decode_utf8_invalid_replaces() {
        assert_eq!(decode(&[0xFF], Encoding::Utf8), "\u{FFFD}");
    }

    #[test]
    fn decode_utf16le_basic() {
        assert_eq!(decode(&[0x41, 0x00], Encoding::Utf16Le), "A");
    }

    #[test]
    fn decode_utf16be_basic() {
        assert_eq!(decode(&[0x00, 0x41], Encoding::Utf16Be), "A");
    }

    #[test]
    fn decode_windows_1252_euro() {
        assert_eq!(decode(&[0x80], Encoding::Windows1252), "\u{20AC}");
    }

    #[test]
    fn decode_windows_1252_cafe() {
        // "Café" in windows-1252
        assert_eq!(
            decode(&[0x43, 0x61, 0x66, 0xE9], Encoding::Windows1252),
            "Caf\u{00E9}"
        );
    }

    #[test]
    fn decode_iso_8859_2() {
        // 0xA1 → Ą
        assert_eq!(decode(&[0xA1], Encoding::Iso8859_2), "\u{0104}");
    }

    #[test]
    fn decode_koi8r_cyrillic() {
        // 0xE1 → А (U+0410)
        assert_eq!(decode(&[0xE1], Encoding::Koi8R), "\u{0410}");
    }

    #[test]
    fn decode_windows_1251_cyrillic() {
        // 0xC0 → А (U+0410), 0xE0 → а (U+0430)
        assert_eq!(
            decode(&[0xC0, 0xE0], Encoding::Windows1251),
            "\u{0410}\u{0430}"
        );
    }

    // -- Top-level decode_strict --

    #[test]
    fn decode_strict_valid() {
        assert_eq!(decode_strict(b"Hello", Encoding::Utf8).unwrap(), "Hello");
    }

    #[test]
    fn decode_strict_invalid() {
        assert!(decode_strict(&[0xFF], Encoding::Utf8).is_err());
    }

    #[test]
    fn decode_strict_single_byte_unmapped() {
        // ISO-8859-3 byte 0xA5 is unmapped
        assert!(decode_strict(&[0xA5], Encoding::Iso8859_3).is_err());
    }

    #[test]
    fn decode_strict_single_byte_valid() {
        assert_eq!(
            decode_strict(&[0x80], Encoding::Windows1252).unwrap(),
            "\u{20AC}"
        );
    }

    // -- Top-level encode --

    #[test]
    fn encode_utf8_basic() {
        assert_eq!(encode("Hello", Encoding::Utf8).unwrap(), b"Hello");
    }

    #[test]
    fn encode_non_utf8_not_supported() {
        assert!(matches!(
            encode("Hello", Encoding::Utf16Le),
            Err(EncodingError::EncodeNotSupported { .. })
        ));
        assert!(matches!(
            encode("Hello", Encoding::Utf16Be),
            Err(EncodingError::EncodeNotSupported { .. })
        ));
        assert!(matches!(
            encode("Hello", Encoding::Windows1252),
            Err(EncodingError::EncodeNotSupported { .. })
        ));
    }

    // -- Trim helpers --

    #[test]
    fn trim_ascii_whitespace_basic() {
        assert_eq!(trim_ascii_whitespace(" hello "), "hello");
        assert_eq!(trim_ascii_whitespace("hello"), "hello");
        assert_eq!(trim_ascii_whitespace(""), "");
        assert_eq!(trim_ascii_whitespace(" "), "");
        assert_eq!(trim_ascii_whitespace("\t\nhello\r\n"), "hello");
    }

    #[test]
    fn ascii_eq_ignore_case_basic() {
        assert!(ascii_eq_ignore_case("utf-8", "UTF-8"));
        assert!(ascii_eq_ignore_case("Utf-8", "utf-8"));
        assert!(!ascii_eq_ignore_case("utf-8", "utf-9"));
        assert!(!ascii_eq_ignore_case("utf-8", "utf-8x"));
    }
}