we (web engine): Experimental web browser project to understand the limits of Claude
at promise 486 lines 14 kB view raw
1//! UTF-8 decoder and encoder per WHATWG Encoding Standard. 2 3use crate::error::{EncodingError, Result}; 4 5/// Error handling mode. 6#[derive(Debug, Clone, Copy, PartialEq, Eq)] 7pub(crate) enum ErrorMode { 8 Replacement, 9 Fatal, 10} 11 12/// Decode a byte slice as UTF-8. 13/// 14/// In replacement mode, invalid sequences are replaced with U+FFFD. 15/// In fatal mode, the first invalid sequence causes an error. 16pub(crate) fn decode_utf8(bytes: &[u8], mode: ErrorMode) -> Result<String> { 17 // Strip UTF-8 BOM if present 18 let bytes = if bytes.len() >= 3 && bytes[0] == 0xEF && bytes[1] == 0xBB && bytes[2] == 0xBF { 19 &bytes[3..] 20 } else { 21 bytes 22 }; 23 24 let mut output = String::with_capacity(bytes.len()); 25 let mut decoder = Utf8Decoder::new(); 26 let mut i = 0; 27 28 while i < bytes.len() { 29 match decoder.process_byte(bytes[i]) { 30 DecoderResult::CodePoint(ch) => { 31 output.push(ch); 32 i += 1; 33 } 34 DecoderResult::Error(error_pos) => { 35 if mode == ErrorMode::Fatal { 36 return Err(EncodingError::InvalidSequence { 37 encoding: "UTF-8", 38 position: error_pos, 39 }); 40 } 41 output.push('\u{FFFD}'); 42 i += 1; 43 } 44 DecoderResult::ErrorPrepend(error_pos) => { 45 if mode == ErrorMode::Fatal { 46 return Err(EncodingError::InvalidSequence { 47 encoding: "UTF-8", 48 position: error_pos, 49 }); 50 } 51 output.push('\u{FFFD}'); 52 // Do NOT advance i — re-process this byte 53 } 54 DecoderResult::Continue => { 55 i += 1; 56 } 57 } 58 } 59 60 // Handle incomplete sequence at end of input 61 if decoder.bytes_needed > 0 { 62 if mode == ErrorMode::Fatal { 63 return Err(EncodingError::InvalidSequence { 64 encoding: "UTF-8", 65 position: bytes.len().saturating_sub(decoder.bytes_seen as usize), 66 }); 67 } 68 output.push('\u{FFFD}'); 69 } 70 71 Ok(output) 72} 73 74/// Encode a string as UTF-8 bytes. 75/// 76/// Since Rust strings are already valid UTF-8, this is a straightforward copy. 77pub(crate) fn encode_utf8(text: &str) -> Vec<u8> { 78 text.as_bytes().to_vec() 79} 80 81// --------------------------------------------------------------------------- 82// Streaming UTF-8 decoder (WHATWG Encoding Standard §8.1.1) 83// --------------------------------------------------------------------------- 84 85enum DecoderResult { 86 /// A valid code point was decoded. 87 CodePoint(char), 88 /// An error occurred at the given byte position; advance to next byte. 89 Error(usize), 90 /// An error occurred at the given byte position; re-process current byte. 91 ErrorPrepend(usize), 92 /// More bytes needed; continue feeding. 93 Continue, 94} 95 96struct Utf8Decoder { 97 code_point: u32, 98 bytes_seen: u8, 99 bytes_needed: u8, 100 lower_boundary: u8, 101 upper_boundary: u8, 102 /// Position of the start of the current multi-byte sequence. 103 sequence_start: usize, 104 /// Total bytes processed so far. 105 position: usize, 106} 107 108impl Utf8Decoder { 109 fn new() -> Self { 110 Self { 111 code_point: 0, 112 bytes_seen: 0, 113 bytes_needed: 0, 114 lower_boundary: 0x80, 115 upper_boundary: 0xBF, 116 sequence_start: 0, 117 position: 0, 118 } 119 } 120 121 fn process_byte(&mut self, byte: u8) -> DecoderResult { 122 let pos = self.position; 123 self.position += 1; 124 125 if self.bytes_needed == 0 { 126 match byte { 127 0x00..=0x7F => DecoderResult::CodePoint(byte as char), 128 0xC2..=0xDF => { 129 self.bytes_needed = 1; 130 self.code_point = (byte & 0x1F) as u32; 131 self.sequence_start = pos; 132 DecoderResult::Continue 133 } 134 0xE0 => { 135 self.bytes_needed = 2; 136 self.lower_boundary = 0xA0; 137 self.code_point = (byte & 0x0F) as u32; 138 self.sequence_start = pos; 139 DecoderResult::Continue 140 } 141 0xE1..=0xEC | 0xEE..=0xEF => { 142 self.bytes_needed = 2; 143 self.code_point = (byte & 0x0F) as u32; 144 self.sequence_start = pos; 145 DecoderResult::Continue 146 } 147 0xED => { 148 self.bytes_needed = 2; 149 self.upper_boundary = 0x9F; 150 self.code_point = (byte & 0x0F) as u32; 151 self.sequence_start = pos; 152 DecoderResult::Continue 153 } 154 0xF0 => { 155 self.bytes_needed = 3; 156 self.lower_boundary = 0x90; 157 self.code_point = (byte & 0x07) as u32; 158 self.sequence_start = pos; 159 DecoderResult::Continue 160 } 161 0xF1..=0xF3 => { 162 self.bytes_needed = 3; 163 self.code_point = (byte & 0x07) as u32; 164 self.sequence_start = pos; 165 DecoderResult::Continue 166 } 167 0xF4 => { 168 self.bytes_needed = 3; 169 self.upper_boundary = 0x8F; 170 self.code_point = (byte & 0x07) as u32; 171 self.sequence_start = pos; 172 DecoderResult::Continue 173 } 174 _ => { 175 // 0x80..=0xC1, 0xF5..=0xFF: invalid lead byte 176 DecoderResult::Error(pos) 177 } 178 } 179 } else { 180 // Expecting continuation byte 181 if byte < self.lower_boundary || byte > self.upper_boundary { 182 // Invalid continuation — reset and prepend byte 183 let err_pos = self.sequence_start; 184 self.reset(); 185 self.position -= 1; // will be re-processed 186 return DecoderResult::ErrorPrepend(err_pos); 187 } 188 189 // Valid continuation byte 190 self.lower_boundary = 0x80; 191 self.upper_boundary = 0xBF; 192 self.code_point = (self.code_point << 6) | (byte & 0x3F) as u32; 193 self.bytes_seen += 1; 194 195 if self.bytes_seen == self.bytes_needed { 196 let cp = self.code_point; 197 self.reset(); 198 // The WHATWG state machine guarantees valid scalar values here, 199 // but use fallback for defense-in-depth. 200 let ch = char::from_u32(cp).unwrap_or('\u{FFFD}'); 201 DecoderResult::CodePoint(ch) 202 } else { 203 DecoderResult::Continue 204 } 205 } 206 } 207 208 fn reset(&mut self) { 209 self.code_point = 0; 210 self.bytes_seen = 0; 211 self.bytes_needed = 0; 212 self.lower_boundary = 0x80; 213 self.upper_boundary = 0xBF; 214 } 215} 216 217// --------------------------------------------------------------------------- 218// Tests 219// --------------------------------------------------------------------------- 220 221#[cfg(test)] 222mod tests { 223 use super::*; 224 225 fn decode_replace(bytes: &[u8]) -> String { 226 decode_utf8(bytes, ErrorMode::Replacement).unwrap() 227 } 228 229 fn decode_fatal(bytes: &[u8]) -> Result<String> { 230 decode_utf8(bytes, ErrorMode::Fatal) 231 } 232 233 // -- Basic ASCII -- 234 235 #[test] 236 fn ascii_roundtrip() { 237 assert_eq!(decode_replace(b"Hello, world!"), "Hello, world!"); 238 } 239 240 #[test] 241 fn empty_input() { 242 assert_eq!(decode_replace(b""), ""); 243 } 244 245 #[test] 246 fn null_byte() { 247 assert_eq!(decode_replace(&[0x00]), "\0"); 248 } 249 250 // -- Multi-byte sequences -- 251 252 #[test] 253 fn two_byte_sequence() { 254 // U+00E9 (e with acute) = 0xC3 0xA9 255 assert_eq!(decode_replace(&[0xC3, 0xA9]), "\u{00E9}"); 256 } 257 258 #[test] 259 fn three_byte_sequence() { 260 // U+4E16 (CJK character) = 0xE4 0xB8 0x96 261 assert_eq!(decode_replace(&[0xE4, 0xB8, 0x96]), "\u{4E16}"); 262 } 263 264 #[test] 265 fn four_byte_sequence() { 266 // U+1F600 (grinning face) = 0xF0 0x9F 0x98 0x80 267 assert_eq!(decode_replace(&[0xF0, 0x9F, 0x98, 0x80]), "\u{1F600}"); 268 } 269 270 #[test] 271 fn mixed_ascii_and_multibyte() { 272 // "Caf\u{00E9}" = [0x43, 0x61, 0x66, 0xC3, 0xA9] 273 assert_eq!( 274 decode_replace(&[0x43, 0x61, 0x66, 0xC3, 0xA9]), 275 "Caf\u{00E9}" 276 ); 277 } 278 279 // -- BOM handling -- 280 281 #[test] 282 fn bom_stripped() { 283 // UTF-8 BOM + "A" 284 assert_eq!(decode_replace(&[0xEF, 0xBB, 0xBF, 0x41]), "A"); 285 } 286 287 #[test] 288 fn bom_only() { 289 assert_eq!(decode_replace(&[0xEF, 0xBB, 0xBF]), ""); 290 } 291 292 // -- Invalid sequences (replacement mode) -- 293 294 #[test] 295 fn invalid_byte_ff() { 296 assert_eq!(decode_replace(&[0xFF]), "\u{FFFD}"); 297 } 298 299 #[test] 300 fn invalid_byte_fe() { 301 assert_eq!(decode_replace(&[0xFE]), "\u{FFFD}"); 302 } 303 304 #[test] 305 fn invalid_continuation_byte_standalone() { 306 // 0x80 without a lead byte 307 assert_eq!(decode_replace(&[0x80]), "\u{FFFD}"); 308 } 309 310 #[test] 311 fn overlong_two_byte() { 312 // 0xC0 0xAF is an overlong encoding of U+002F ('/') 313 // 0xC0 is always invalid (lead byte rejected), 0xAF is a continuation 314 // byte without a lead (also invalid) — both produce U+FFFD 315 assert_eq!(decode_replace(&[0xC0, 0xAF]), "\u{FFFD}\u{FFFD}"); 316 } 317 318 #[test] 319 fn truncated_two_byte() { 320 // 0xC3 without continuation 321 assert_eq!(decode_replace(&[0xC3]), "\u{FFFD}"); 322 } 323 324 #[test] 325 fn truncated_three_byte() { 326 // 0xE4 0xB8 without third byte 327 assert_eq!(decode_replace(&[0xE4, 0xB8]), "\u{FFFD}"); 328 } 329 330 #[test] 331 fn truncated_four_byte() { 332 // 0xF0 0x9F 0x98 without fourth byte 333 assert_eq!(decode_replace(&[0xF0, 0x9F, 0x98]), "\u{FFFD}"); 334 } 335 336 #[test] 337 fn surrogate_half_rejected() { 338 // U+D800 would encode as 0xED 0xA0 0x80, but surrogates are invalid in UTF-8 339 // 0xED with upper_boundary 0x9F rejects 0xA0 340 assert_eq!( 341 decode_replace(&[0xED, 0xA0, 0x80]), 342 "\u{FFFD}\u{FFFD}\u{FFFD}" 343 ); 344 } 345 346 #[test] 347 fn invalid_continuation_mid_sequence() { 348 // 0xE4 expects continuation, but 0x41 is ASCII — error + prepend 349 assert_eq!(decode_replace(&[0xE4, 0x41]), "\u{FFFD}A"); 350 } 351 352 #[test] 353 fn invalid_between_valid() { 354 // Valid 'A', invalid 0xFF, valid 'B' 355 assert_eq!(decode_replace(&[0x41, 0xFF, 0x42]), "A\u{FFFD}B"); 356 } 357 358 #[test] 359 fn multiple_errors_in_a_row() { 360 assert_eq!( 361 decode_replace(&[0xFE, 0xFF, 0xFE]), 362 "\u{FFFD}\u{FFFD}\u{FFFD}" 363 ); 364 } 365 366 // -- Fatal mode -- 367 368 #[test] 369 fn fatal_valid() { 370 assert_eq!(decode_fatal(b"Hello").unwrap(), "Hello"); 371 } 372 373 #[test] 374 fn fatal_invalid() { 375 let err = decode_fatal(&[0x41, 0xFF]).unwrap_err(); 376 assert!(matches!( 377 err, 378 EncodingError::InvalidSequence { 379 encoding: "UTF-8", 380 position: 1 381 } 382 )); 383 } 384 385 #[test] 386 fn fatal_truncated() { 387 let err = decode_fatal(&[0xC3]).unwrap_err(); 388 assert!(matches!( 389 err, 390 EncodingError::InvalidSequence { 391 encoding: "UTF-8", 392 .. 393 } 394 )); 395 } 396 397 // -- Encoder -- 398 399 #[test] 400 fn encode_ascii() { 401 assert_eq!(encode_utf8("Hello"), b"Hello"); 402 } 403 404 #[test] 405 fn encode_multibyte() { 406 assert_eq!(encode_utf8("\u{00E9}"), &[0xC3, 0xA9]); 407 } 408 409 #[test] 410 fn encode_emoji() { 411 assert_eq!(encode_utf8("\u{1F600}"), &[0xF0, 0x9F, 0x98, 0x80]); 412 } 413 414 #[test] 415 fn encode_empty() { 416 assert_eq!(encode_utf8(""), b""); 417 } 418 419 #[test] 420 fn roundtrip() { 421 let original = "Hello \u{4E16}\u{754C} \u{1F600}"; 422 let encoded = encode_utf8(original); 423 let decoded = decode_replace(&encoded); 424 assert_eq!(decoded, original); 425 } 426 427 // -- Edge cases -- 428 429 #[test] 430 fn max_two_byte() { 431 // U+07FF = 0xDF 0xBF 432 assert_eq!(decode_replace(&[0xDF, 0xBF]), "\u{07FF}"); 433 } 434 435 #[test] 436 fn min_three_byte() { 437 // U+0800 = 0xE0 0xA0 0x80 438 assert_eq!(decode_replace(&[0xE0, 0xA0, 0x80]), "\u{0800}"); 439 } 440 441 #[test] 442 fn max_three_byte() { 443 // U+FFFF = 0xEF 0xBF 0xBF 444 assert_eq!(decode_replace(&[0xEF, 0xBF, 0xBF]), "\u{FFFF}"); 445 } 446 447 #[test] 448 fn min_four_byte() { 449 // U+10000 = 0xF0 0x90 0x80 0x80 450 assert_eq!(decode_replace(&[0xF0, 0x90, 0x80, 0x80]), "\u{10000}"); 451 } 452 453 #[test] 454 fn max_unicode() { 455 // U+10FFFF = 0xF4 0x8F 0xBF 0xBF 456 assert_eq!(decode_replace(&[0xF4, 0x8F, 0xBF, 0xBF]), "\u{10FFFF}"); 457 } 458 459 #[test] 460 fn above_max_unicode_rejected() { 461 // 0xF4 0x90 would start U+110000, which is above max 462 // 0xF4 has upper_boundary = 0x8F, so 0x90 is rejected 463 assert_eq!( 464 decode_replace(&[0xF4, 0x90, 0x80, 0x80]), 465 "\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}" 466 ); 467 } 468 469 #[test] 470 fn overlong_three_byte_rejected() { 471 // 0xE0 requires lower_boundary = 0xA0, so 0xE0 0x80 0x80 is rejected 472 assert_eq!( 473 decode_replace(&[0xE0, 0x80, 0x80]), 474 "\u{FFFD}\u{FFFD}\u{FFFD}" 475 ); 476 } 477 478 #[test] 479 fn overlong_four_byte_rejected() { 480 // 0xF0 requires lower_boundary = 0x90, so 0xF0 0x80 0x80 0x80 is rejected 481 assert_eq!( 482 decode_replace(&[0xF0, 0x80, 0x80, 0x80]), 483 "\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}" 484 ); 485 } 486}