//! UTF-16 decoder per WHATWG Encoding Standard. use crate::error::{EncodingError, Result}; use crate::utf8::ErrorMode; /// Decode a byte slice as UTF-16LE. pub(crate) fn decode_utf16le(bytes: &[u8], mode: ErrorMode) -> Result { decode_utf16(bytes, false, mode) } /// Decode a byte slice as UTF-16BE. pub(crate) fn decode_utf16be(bytes: &[u8], mode: ErrorMode) -> Result { decode_utf16(bytes, true, mode) } /// Shared UTF-16 decoder (WHATWG Encoding Standard §14.2). fn decode_utf16(bytes: &[u8], big_endian: bool, mode: ErrorMode) -> Result { let mut output = String::with_capacity(bytes.len() / 2); let mut i = 0; let mut lead_surrogate: Option = None; let mut bom_checked = false; while i + 1 < bytes.len() { let code_unit = if big_endian { ((bytes[i] as u16) << 8) | (bytes[i + 1] as u16) } else { ((bytes[i + 1] as u16) << 8) | (bytes[i] as u16) }; i += 2; // BOM handling: strip BOM matching our endianness at the start if !bom_checked { bom_checked = true; if code_unit == 0xFEFF { // BOM matches our endianness — consume it continue; } // 0xFFFE is NOT treated as a BOM — fall through to normal processing } if is_lead_surrogate(code_unit) { // If we already have an unpaired lead, emit error for it if let Some(_prev) = lead_surrogate { if mode == ErrorMode::Fatal { return Err(EncodingError::InvalidSequence { encoding: encoding_name(big_endian), position: i - 4, // position of the previous unpaired lead }); } output.push('\u{FFFD}'); } lead_surrogate = Some(code_unit); } else if is_trail_surrogate(code_unit) { if let Some(lead) = lead_surrogate.take() { // Valid surrogate pair — compute supplementary code point let cp = 0x10000 + ((lead as u32 - 0xD800) << 10) + (code_unit as u32 - 0xDC00); let ch = char::from_u32(cp).unwrap_or('\u{FFFD}'); output.push(ch); } else { // Trail surrogate without lead if mode == ErrorMode::Fatal { return Err(EncodingError::InvalidSequence { encoding: encoding_name(big_endian), position: i - 2, }); } output.push('\u{FFFD}'); } } else { // Regular BMP character if let Some(_lead) = lead_surrogate.take() { // Unpaired lead surrogate before this code unit if mode == ErrorMode::Fatal { return Err(EncodingError::InvalidSequence { encoding: encoding_name(big_endian), position: i - 4, }); } output.push('\u{FFFD}'); } let ch = char::from_u32(code_unit as u32).unwrap_or('\u{FFFD}'); output.push(ch); } } // Handle trailing single byte (odd byte count) if i < bytes.len() { // Flush any pending lead surrogate first if lead_surrogate.take().is_some() { if mode == ErrorMode::Fatal { return Err(EncodingError::InvalidSequence { encoding: encoding_name(big_endian), position: i - 2, }); } output.push('\u{FFFD}'); } if mode == ErrorMode::Fatal { return Err(EncodingError::InvalidSequence { encoding: encoding_name(big_endian), position: i, }); } output.push('\u{FFFD}'); } else if lead_surrogate.is_some() { // Unpaired lead surrogate at end of input if mode == ErrorMode::Fatal { return Err(EncodingError::InvalidSequence { encoding: encoding_name(big_endian), position: i - 2, }); } output.push('\u{FFFD}'); } Ok(output) } fn is_lead_surrogate(cu: u16) -> bool { (0xD800..=0xDBFF).contains(&cu) } fn is_trail_surrogate(cu: u16) -> bool { (0xDC00..=0xDFFF).contains(&cu) } fn encoding_name(big_endian: bool) -> &'static str { if big_endian { "UTF-16BE" } else { "UTF-16LE" } } // --------------------------------------------------------------------------- // Tests // --------------------------------------------------------------------------- #[cfg(test)] mod tests { use super::*; fn le(bytes: &[u8]) -> String { decode_utf16le(bytes, ErrorMode::Replacement).unwrap() } fn be(bytes: &[u8]) -> String { decode_utf16be(bytes, ErrorMode::Replacement).unwrap() } // -- Basic ASCII -- #[test] fn le_ascii() { assert_eq!(le(&[0x41, 0x00]), "A"); } #[test] fn be_ascii() { assert_eq!(be(&[0x00, 0x41]), "A"); } #[test] fn le_hello() { assert_eq!(le(&[0x48, 0x00, 0x69, 0x00]), "Hi"); } #[test] fn be_hello() { assert_eq!(be(&[0x00, 0x48, 0x00, 0x69]), "Hi"); } // -- BMP characters -- #[test] fn le_bmp() { // U+00E9 (e with acute) = 0xE9 0x00 in LE assert_eq!(le(&[0xE9, 0x00]), "\u{00E9}"); } #[test] fn be_bmp() { // U+00E9 in BE = 0x00 0xE9 assert_eq!(be(&[0x00, 0xE9]), "\u{00E9}"); } #[test] fn le_cjk() { // U+4E16 = 0x16 0x4E in LE assert_eq!(le(&[0x16, 0x4E]), "\u{4E16}"); } // -- Surrogate pairs -- #[test] fn le_surrogate_pair() { // U+1F600 = D83D DE00 in UTF-16 // LE: 3D D8 00 DE assert_eq!(le(&[0x3D, 0xD8, 0x00, 0xDE]), "\u{1F600}"); } #[test] fn be_surrogate_pair() { // U+1F600 = D83D DE00 in UTF-16 // BE: D8 3D DE 00 assert_eq!(be(&[0xD8, 0x3D, 0xDE, 0x00]), "\u{1F600}"); } #[test] fn le_supplementary_u10000() { // U+10000 = D800 DC00 // LE: 00 D8 00 DC assert_eq!(le(&[0x00, 0xD8, 0x00, 0xDC]), "\u{10000}"); } #[test] fn le_supplementary_u10ffff() { // U+10FFFF = DBFF DFFF // LE: FF DB FF DF assert_eq!(le(&[0xFF, 0xDB, 0xFF, 0xDF]), "\u{10FFFF}"); } // -- Unpaired surrogates -- #[test] fn le_unpaired_lead() { // Lead surrogate D800 followed by non-surrogate 0041 // LE: 00 D8 41 00 assert_eq!(le(&[0x00, 0xD8, 0x41, 0x00]), "\u{FFFD}A"); } #[test] fn le_unpaired_trail() { // Trail surrogate DC00 without lead // LE: 00 DC assert_eq!(le(&[0x00, 0xDC]), "\u{FFFD}"); } #[test] fn le_lead_at_end() { // Lead surrogate at end of input assert_eq!(le(&[0x00, 0xD8]), "\u{FFFD}"); } #[test] fn le_two_leads_in_a_row() { // Two lead surrogates: D800 D801 — first is unpaired, second is unpaired at end // LE: 00 D8 01 D8 assert_eq!(le(&[0x00, 0xD8, 0x01, 0xD8]), "\u{FFFD}\u{FFFD}"); } // -- BOM handling -- #[test] fn le_bom_stripped() { // UTF-16LE BOM: FF FE assert_eq!(le(&[0xFF, 0xFE, 0x41, 0x00]), "A"); } #[test] fn be_bom_stripped() { // UTF-16BE BOM: FE FF assert_eq!(be(&[0xFE, 0xFF, 0x00, 0x41]), "A"); } #[test] fn le_wrong_bom_not_stripped() { // FE FF is NOT the LE BOM — it's U+FEFF (ZWNBSP) assert_eq!(le(&[0xFE, 0xFF]), "\u{FFFE}"); } #[test] fn be_wrong_bom_not_stripped() { // FF FE is NOT the BE BOM — it's U+FFFE assert_eq!(be(&[0xFF, 0xFE]), "\u{FFFE}"); } #[test] fn le_bom_only() { assert_eq!(le(&[0xFF, 0xFE]), ""); } #[test] fn be_bom_only() { assert_eq!(be(&[0xFE, 0xFF]), ""); } // -- Odd byte count -- #[test] fn le_odd_byte() { assert_eq!(le(&[0x41, 0x00, 0x42]), "A\u{FFFD}"); } #[test] fn be_odd_byte() { assert_eq!(be(&[0x00, 0x41, 0x42]), "A\u{FFFD}"); } #[test] fn single_byte() { assert_eq!(le(&[0x41]), "\u{FFFD}"); } // -- Empty input -- #[test] fn empty_le() { assert_eq!(le(&[]), ""); } #[test] fn empty_be() { assert_eq!(be(&[]), ""); } // -- Fatal mode -- #[test] fn fatal_valid_le() { assert_eq!( decode_utf16le(&[0x41, 0x00], ErrorMode::Fatal).unwrap(), "A" ); } #[test] fn fatal_unpaired_lead_le() { let err = decode_utf16le(&[0x00, 0xD8, 0x41, 0x00], ErrorMode::Fatal).unwrap_err(); assert!(matches!( err, EncodingError::InvalidSequence { encoding: "UTF-16LE", .. } )); } #[test] fn fatal_unpaired_trail_le() { let err = decode_utf16le(&[0x00, 0xDC], ErrorMode::Fatal).unwrap_err(); assert!(matches!( err, EncodingError::InvalidSequence { encoding: "UTF-16LE", .. } )); } #[test] fn fatal_odd_byte_le() { let err = decode_utf16le(&[0x41, 0x00, 0x42], ErrorMode::Fatal).unwrap_err(); assert!(matches!( err, EncodingError::InvalidSequence { encoding: "UTF-16LE", .. } )); } // -- Mixed content -- #[test] fn le_mixed_bmp_and_supplementary() { // "A" + U+1F600 + "B" // LE: 41 00 | 3D D8 00 DE | 42 00 assert_eq!( le(&[0x41, 0x00, 0x3D, 0xD8, 0x00, 0xDE, 0x42, 0x00]), "A\u{1F600}B" ); } #[test] fn be_mixed_bmp_and_supplementary() { // "A" + U+1F600 + "B" // BE: 00 41 | D8 3D DE 00 | 00 42 assert_eq!( be(&[0x00, 0x41, 0xD8, 0x3D, 0xDE, 0x00, 0x00, 0x42]), "A\u{1F600}B" ); } #[test] fn le_null_character() { // U+0000 = 00 00 in LE assert_eq!(le(&[0x00, 0x00]), "\0"); } }