//! UTF-8 decoder and encoder per WHATWG Encoding Standard. use crate::error::{EncodingError, Result}; /// Error handling mode. #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub(crate) enum ErrorMode { Replacement, Fatal, } /// Decode a byte slice as UTF-8. /// /// In replacement mode, invalid sequences are replaced with U+FFFD. /// In fatal mode, the first invalid sequence causes an error. pub(crate) fn decode_utf8(bytes: &[u8], mode: ErrorMode) -> Result { // Strip UTF-8 BOM if present let bytes = if bytes.len() >= 3 && bytes[0] == 0xEF && bytes[1] == 0xBB && bytes[2] == 0xBF { &bytes[3..] } else { bytes }; let mut output = String::with_capacity(bytes.len()); let mut decoder = Utf8Decoder::new(); let mut i = 0; while i < bytes.len() { match decoder.process_byte(bytes[i]) { DecoderResult::CodePoint(ch) => { output.push(ch); i += 1; } DecoderResult::Error(error_pos) => { if mode == ErrorMode::Fatal { return Err(EncodingError::InvalidSequence { encoding: "UTF-8", position: error_pos, }); } output.push('\u{FFFD}'); i += 1; } DecoderResult::ErrorPrepend(error_pos) => { if mode == ErrorMode::Fatal { return Err(EncodingError::InvalidSequence { encoding: "UTF-8", position: error_pos, }); } output.push('\u{FFFD}'); // Do NOT advance i — re-process this byte } DecoderResult::Continue => { i += 1; } } } // Handle incomplete sequence at end of input if decoder.bytes_needed > 0 { if mode == ErrorMode::Fatal { return Err(EncodingError::InvalidSequence { encoding: "UTF-8", position: bytes.len().saturating_sub(decoder.bytes_seen as usize), }); } output.push('\u{FFFD}'); } Ok(output) } /// Encode a string as UTF-8 bytes. /// /// Since Rust strings are already valid UTF-8, this is a straightforward copy. pub(crate) fn encode_utf8(text: &str) -> Vec { text.as_bytes().to_vec() } // --------------------------------------------------------------------------- // Streaming UTF-8 decoder (WHATWG Encoding Standard §8.1.1) // --------------------------------------------------------------------------- enum DecoderResult { /// A valid code point was decoded. CodePoint(char), /// An error occurred at the given byte position; advance to next byte. Error(usize), /// An error occurred at the given byte position; re-process current byte. ErrorPrepend(usize), /// More bytes needed; continue feeding. Continue, } struct Utf8Decoder { code_point: u32, bytes_seen: u8, bytes_needed: u8, lower_boundary: u8, upper_boundary: u8, /// Position of the start of the current multi-byte sequence. sequence_start: usize, /// Total bytes processed so far. position: usize, } impl Utf8Decoder { fn new() -> Self { Self { code_point: 0, bytes_seen: 0, bytes_needed: 0, lower_boundary: 0x80, upper_boundary: 0xBF, sequence_start: 0, position: 0, } } fn process_byte(&mut self, byte: u8) -> DecoderResult { let pos = self.position; self.position += 1; if self.bytes_needed == 0 { match byte { 0x00..=0x7F => DecoderResult::CodePoint(byte as char), 0xC2..=0xDF => { self.bytes_needed = 1; self.code_point = (byte & 0x1F) as u32; self.sequence_start = pos; DecoderResult::Continue } 0xE0 => { self.bytes_needed = 2; self.lower_boundary = 0xA0; self.code_point = (byte & 0x0F) as u32; self.sequence_start = pos; DecoderResult::Continue } 0xE1..=0xEC | 0xEE..=0xEF => { self.bytes_needed = 2; self.code_point = (byte & 0x0F) as u32; self.sequence_start = pos; DecoderResult::Continue } 0xED => { self.bytes_needed = 2; self.upper_boundary = 0x9F; self.code_point = (byte & 0x0F) as u32; self.sequence_start = pos; DecoderResult::Continue } 0xF0 => { self.bytes_needed = 3; self.lower_boundary = 0x90; self.code_point = (byte & 0x07) as u32; self.sequence_start = pos; DecoderResult::Continue } 0xF1..=0xF3 => { self.bytes_needed = 3; self.code_point = (byte & 0x07) as u32; self.sequence_start = pos; DecoderResult::Continue } 0xF4 => { self.bytes_needed = 3; self.upper_boundary = 0x8F; self.code_point = (byte & 0x07) as u32; self.sequence_start = pos; DecoderResult::Continue } _ => { // 0x80..=0xC1, 0xF5..=0xFF: invalid lead byte DecoderResult::Error(pos) } } } else { // Expecting continuation byte if byte < self.lower_boundary || byte > self.upper_boundary { // Invalid continuation — reset and prepend byte let err_pos = self.sequence_start; self.reset(); self.position -= 1; // will be re-processed return DecoderResult::ErrorPrepend(err_pos); } // Valid continuation byte self.lower_boundary = 0x80; self.upper_boundary = 0xBF; self.code_point = (self.code_point << 6) | (byte & 0x3F) as u32; self.bytes_seen += 1; if self.bytes_seen == self.bytes_needed { let cp = self.code_point; self.reset(); // The WHATWG state machine guarantees valid scalar values here, // but use fallback for defense-in-depth. let ch = char::from_u32(cp).unwrap_or('\u{FFFD}'); DecoderResult::CodePoint(ch) } else { DecoderResult::Continue } } } fn reset(&mut self) { self.code_point = 0; self.bytes_seen = 0; self.bytes_needed = 0; self.lower_boundary = 0x80; self.upper_boundary = 0xBF; } } // --------------------------------------------------------------------------- // Tests // --------------------------------------------------------------------------- #[cfg(test)] mod tests { use super::*; fn decode_replace(bytes: &[u8]) -> String { decode_utf8(bytes, ErrorMode::Replacement).unwrap() } fn decode_fatal(bytes: &[u8]) -> Result { decode_utf8(bytes, ErrorMode::Fatal) } // -- Basic ASCII -- #[test] fn ascii_roundtrip() { assert_eq!(decode_replace(b"Hello, world!"), "Hello, world!"); } #[test] fn empty_input() { assert_eq!(decode_replace(b""), ""); } #[test] fn null_byte() { assert_eq!(decode_replace(&[0x00]), "\0"); } // -- Multi-byte sequences -- #[test] fn two_byte_sequence() { // U+00E9 (e with acute) = 0xC3 0xA9 assert_eq!(decode_replace(&[0xC3, 0xA9]), "\u{00E9}"); } #[test] fn three_byte_sequence() { // U+4E16 (CJK character) = 0xE4 0xB8 0x96 assert_eq!(decode_replace(&[0xE4, 0xB8, 0x96]), "\u{4E16}"); } #[test] fn four_byte_sequence() { // U+1F600 (grinning face) = 0xF0 0x9F 0x98 0x80 assert_eq!(decode_replace(&[0xF0, 0x9F, 0x98, 0x80]), "\u{1F600}"); } #[test] fn mixed_ascii_and_multibyte() { // "Caf\u{00E9}" = [0x43, 0x61, 0x66, 0xC3, 0xA9] assert_eq!( decode_replace(&[0x43, 0x61, 0x66, 0xC3, 0xA9]), "Caf\u{00E9}" ); } // -- BOM handling -- #[test] fn bom_stripped() { // UTF-8 BOM + "A" assert_eq!(decode_replace(&[0xEF, 0xBB, 0xBF, 0x41]), "A"); } #[test] fn bom_only() { assert_eq!(decode_replace(&[0xEF, 0xBB, 0xBF]), ""); } // -- Invalid sequences (replacement mode) -- #[test] fn invalid_byte_ff() { assert_eq!(decode_replace(&[0xFF]), "\u{FFFD}"); } #[test] fn invalid_byte_fe() { assert_eq!(decode_replace(&[0xFE]), "\u{FFFD}"); } #[test] fn invalid_continuation_byte_standalone() { // 0x80 without a lead byte assert_eq!(decode_replace(&[0x80]), "\u{FFFD}"); } #[test] fn overlong_two_byte() { // 0xC0 0xAF is an overlong encoding of U+002F ('/') // 0xC0 is always invalid (lead byte rejected), 0xAF is a continuation // byte without a lead (also invalid) — both produce U+FFFD assert_eq!(decode_replace(&[0xC0, 0xAF]), "\u{FFFD}\u{FFFD}"); } #[test] fn truncated_two_byte() { // 0xC3 without continuation assert_eq!(decode_replace(&[0xC3]), "\u{FFFD}"); } #[test] fn truncated_three_byte() { // 0xE4 0xB8 without third byte assert_eq!(decode_replace(&[0xE4, 0xB8]), "\u{FFFD}"); } #[test] fn truncated_four_byte() { // 0xF0 0x9F 0x98 without fourth byte assert_eq!(decode_replace(&[0xF0, 0x9F, 0x98]), "\u{FFFD}"); } #[test] fn surrogate_half_rejected() { // U+D800 would encode as 0xED 0xA0 0x80, but surrogates are invalid in UTF-8 // 0xED with upper_boundary 0x9F rejects 0xA0 assert_eq!( decode_replace(&[0xED, 0xA0, 0x80]), "\u{FFFD}\u{FFFD}\u{FFFD}" ); } #[test] fn invalid_continuation_mid_sequence() { // 0xE4 expects continuation, but 0x41 is ASCII — error + prepend assert_eq!(decode_replace(&[0xE4, 0x41]), "\u{FFFD}A"); } #[test] fn invalid_between_valid() { // Valid 'A', invalid 0xFF, valid 'B' assert_eq!(decode_replace(&[0x41, 0xFF, 0x42]), "A\u{FFFD}B"); } #[test] fn multiple_errors_in_a_row() { assert_eq!( decode_replace(&[0xFE, 0xFF, 0xFE]), "\u{FFFD}\u{FFFD}\u{FFFD}" ); } // -- Fatal mode -- #[test] fn fatal_valid() { assert_eq!(decode_fatal(b"Hello").unwrap(), "Hello"); } #[test] fn fatal_invalid() { let err = decode_fatal(&[0x41, 0xFF]).unwrap_err(); assert!(matches!( err, EncodingError::InvalidSequence { encoding: "UTF-8", position: 1 } )); } #[test] fn fatal_truncated() { let err = decode_fatal(&[0xC3]).unwrap_err(); assert!(matches!( err, EncodingError::InvalidSequence { encoding: "UTF-8", .. } )); } // -- Encoder -- #[test] fn encode_ascii() { assert_eq!(encode_utf8("Hello"), b"Hello"); } #[test] fn encode_multibyte() { assert_eq!(encode_utf8("\u{00E9}"), &[0xC3, 0xA9]); } #[test] fn encode_emoji() { assert_eq!(encode_utf8("\u{1F600}"), &[0xF0, 0x9F, 0x98, 0x80]); } #[test] fn encode_empty() { assert_eq!(encode_utf8(""), b""); } #[test] fn roundtrip() { let original = "Hello \u{4E16}\u{754C} \u{1F600}"; let encoded = encode_utf8(original); let decoded = decode_replace(&encoded); assert_eq!(decoded, original); } // -- Edge cases -- #[test] fn max_two_byte() { // U+07FF = 0xDF 0xBF assert_eq!(decode_replace(&[0xDF, 0xBF]), "\u{07FF}"); } #[test] fn min_three_byte() { // U+0800 = 0xE0 0xA0 0x80 assert_eq!(decode_replace(&[0xE0, 0xA0, 0x80]), "\u{0800}"); } #[test] fn max_three_byte() { // U+FFFF = 0xEF 0xBF 0xBF assert_eq!(decode_replace(&[0xEF, 0xBF, 0xBF]), "\u{FFFF}"); } #[test] fn min_four_byte() { // U+10000 = 0xF0 0x90 0x80 0x80 assert_eq!(decode_replace(&[0xF0, 0x90, 0x80, 0x80]), "\u{10000}"); } #[test] fn max_unicode() { // U+10FFFF = 0xF4 0x8F 0xBF 0xBF assert_eq!(decode_replace(&[0xF4, 0x8F, 0xBF, 0xBF]), "\u{10FFFF}"); } #[test] fn above_max_unicode_rejected() { // 0xF4 0x90 would start U+110000, which is above max // 0xF4 has upper_boundary = 0x8F, so 0x90 is rejected assert_eq!( decode_replace(&[0xF4, 0x90, 0x80, 0x80]), "\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}" ); } #[test] fn overlong_three_byte_rejected() { // 0xE0 requires lower_boundary = 0xA0, so 0xE0 0x80 0x80 is rejected assert_eq!( decode_replace(&[0xE0, 0x80, 0x80]), "\u{FFFD}\u{FFFD}\u{FFFD}" ); } #[test] fn overlong_four_byte_rejected() { // 0xF0 requires lower_boundary = 0x90, so 0xF0 0x80 0x80 0x80 is rejected assert_eq!( decode_replace(&[0xF0, 0x80, 0x80, 0x80]), "\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}" ); } }