we (web engine): Experimental web browser project to understand the limits of Claude
at promise 390 lines 10 kB view raw
1//! UTF-16 decoder per WHATWG Encoding Standard. 2 3use crate::error::{EncodingError, Result}; 4use crate::utf8::ErrorMode; 5 6/// Decode a byte slice as UTF-16LE. 7pub(crate) fn decode_utf16le(bytes: &[u8], mode: ErrorMode) -> Result<String> { 8 decode_utf16(bytes, false, mode) 9} 10 11/// Decode a byte slice as UTF-16BE. 12pub(crate) fn decode_utf16be(bytes: &[u8], mode: ErrorMode) -> Result<String> { 13 decode_utf16(bytes, true, mode) 14} 15 16/// Shared UTF-16 decoder (WHATWG Encoding Standard §14.2). 17fn decode_utf16(bytes: &[u8], big_endian: bool, mode: ErrorMode) -> Result<String> { 18 let mut output = String::with_capacity(bytes.len() / 2); 19 let mut i = 0; 20 let mut lead_surrogate: Option<u16> = None; 21 let mut bom_checked = false; 22 23 while i + 1 < bytes.len() { 24 let code_unit = if big_endian { 25 ((bytes[i] as u16) << 8) | (bytes[i + 1] as u16) 26 } else { 27 ((bytes[i + 1] as u16) << 8) | (bytes[i] as u16) 28 }; 29 i += 2; 30 31 // BOM handling: strip BOM matching our endianness at the start 32 if !bom_checked { 33 bom_checked = true; 34 if code_unit == 0xFEFF { 35 // BOM matches our endianness — consume it 36 continue; 37 } 38 // 0xFFFE is NOT treated as a BOM — fall through to normal processing 39 } 40 41 if is_lead_surrogate(code_unit) { 42 // If we already have an unpaired lead, emit error for it 43 if let Some(_prev) = lead_surrogate { 44 if mode == ErrorMode::Fatal { 45 return Err(EncodingError::InvalidSequence { 46 encoding: encoding_name(big_endian), 47 position: i - 4, // position of the previous unpaired lead 48 }); 49 } 50 output.push('\u{FFFD}'); 51 } 52 lead_surrogate = Some(code_unit); 53 } else if is_trail_surrogate(code_unit) { 54 if let Some(lead) = lead_surrogate.take() { 55 // Valid surrogate pair — compute supplementary code point 56 let cp = 0x10000 + ((lead as u32 - 0xD800) << 10) + (code_unit as u32 - 0xDC00); 57 let ch = char::from_u32(cp).unwrap_or('\u{FFFD}'); 58 output.push(ch); 59 } else { 60 // Trail surrogate without lead 61 if mode == ErrorMode::Fatal { 62 return Err(EncodingError::InvalidSequence { 63 encoding: encoding_name(big_endian), 64 position: i - 2, 65 }); 66 } 67 output.push('\u{FFFD}'); 68 } 69 } else { 70 // Regular BMP character 71 if let Some(_lead) = lead_surrogate.take() { 72 // Unpaired lead surrogate before this code unit 73 if mode == ErrorMode::Fatal { 74 return Err(EncodingError::InvalidSequence { 75 encoding: encoding_name(big_endian), 76 position: i - 4, 77 }); 78 } 79 output.push('\u{FFFD}'); 80 } 81 let ch = char::from_u32(code_unit as u32).unwrap_or('\u{FFFD}'); 82 output.push(ch); 83 } 84 } 85 86 // Handle trailing single byte (odd byte count) 87 if i < bytes.len() { 88 // Flush any pending lead surrogate first 89 if lead_surrogate.take().is_some() { 90 if mode == ErrorMode::Fatal { 91 return Err(EncodingError::InvalidSequence { 92 encoding: encoding_name(big_endian), 93 position: i - 2, 94 }); 95 } 96 output.push('\u{FFFD}'); 97 } 98 if mode == ErrorMode::Fatal { 99 return Err(EncodingError::InvalidSequence { 100 encoding: encoding_name(big_endian), 101 position: i, 102 }); 103 } 104 output.push('\u{FFFD}'); 105 } else if lead_surrogate.is_some() { 106 // Unpaired lead surrogate at end of input 107 if mode == ErrorMode::Fatal { 108 return Err(EncodingError::InvalidSequence { 109 encoding: encoding_name(big_endian), 110 position: i - 2, 111 }); 112 } 113 output.push('\u{FFFD}'); 114 } 115 116 Ok(output) 117} 118 119fn is_lead_surrogate(cu: u16) -> bool { 120 (0xD800..=0xDBFF).contains(&cu) 121} 122 123fn is_trail_surrogate(cu: u16) -> bool { 124 (0xDC00..=0xDFFF).contains(&cu) 125} 126 127fn encoding_name(big_endian: bool) -> &'static str { 128 if big_endian { 129 "UTF-16BE" 130 } else { 131 "UTF-16LE" 132 } 133} 134 135// --------------------------------------------------------------------------- 136// Tests 137// --------------------------------------------------------------------------- 138 139#[cfg(test)] 140mod tests { 141 use super::*; 142 143 fn le(bytes: &[u8]) -> String { 144 decode_utf16le(bytes, ErrorMode::Replacement).unwrap() 145 } 146 147 fn be(bytes: &[u8]) -> String { 148 decode_utf16be(bytes, ErrorMode::Replacement).unwrap() 149 } 150 151 // -- Basic ASCII -- 152 153 #[test] 154 fn le_ascii() { 155 assert_eq!(le(&[0x41, 0x00]), "A"); 156 } 157 158 #[test] 159 fn be_ascii() { 160 assert_eq!(be(&[0x00, 0x41]), "A"); 161 } 162 163 #[test] 164 fn le_hello() { 165 assert_eq!(le(&[0x48, 0x00, 0x69, 0x00]), "Hi"); 166 } 167 168 #[test] 169 fn be_hello() { 170 assert_eq!(be(&[0x00, 0x48, 0x00, 0x69]), "Hi"); 171 } 172 173 // -- BMP characters -- 174 175 #[test] 176 fn le_bmp() { 177 // U+00E9 (e with acute) = 0xE9 0x00 in LE 178 assert_eq!(le(&[0xE9, 0x00]), "\u{00E9}"); 179 } 180 181 #[test] 182 fn be_bmp() { 183 // U+00E9 in BE = 0x00 0xE9 184 assert_eq!(be(&[0x00, 0xE9]), "\u{00E9}"); 185 } 186 187 #[test] 188 fn le_cjk() { 189 // U+4E16 = 0x16 0x4E in LE 190 assert_eq!(le(&[0x16, 0x4E]), "\u{4E16}"); 191 } 192 193 // -- Surrogate pairs -- 194 195 #[test] 196 fn le_surrogate_pair() { 197 // U+1F600 = D83D DE00 in UTF-16 198 // LE: 3D D8 00 DE 199 assert_eq!(le(&[0x3D, 0xD8, 0x00, 0xDE]), "\u{1F600}"); 200 } 201 202 #[test] 203 fn be_surrogate_pair() { 204 // U+1F600 = D83D DE00 in UTF-16 205 // BE: D8 3D DE 00 206 assert_eq!(be(&[0xD8, 0x3D, 0xDE, 0x00]), "\u{1F600}"); 207 } 208 209 #[test] 210 fn le_supplementary_u10000() { 211 // U+10000 = D800 DC00 212 // LE: 00 D8 00 DC 213 assert_eq!(le(&[0x00, 0xD8, 0x00, 0xDC]), "\u{10000}"); 214 } 215 216 #[test] 217 fn le_supplementary_u10ffff() { 218 // U+10FFFF = DBFF DFFF 219 // LE: FF DB FF DF 220 assert_eq!(le(&[0xFF, 0xDB, 0xFF, 0xDF]), "\u{10FFFF}"); 221 } 222 223 // -- Unpaired surrogates -- 224 225 #[test] 226 fn le_unpaired_lead() { 227 // Lead surrogate D800 followed by non-surrogate 0041 228 // LE: 00 D8 41 00 229 assert_eq!(le(&[0x00, 0xD8, 0x41, 0x00]), "\u{FFFD}A"); 230 } 231 232 #[test] 233 fn le_unpaired_trail() { 234 // Trail surrogate DC00 without lead 235 // LE: 00 DC 236 assert_eq!(le(&[0x00, 0xDC]), "\u{FFFD}"); 237 } 238 239 #[test] 240 fn le_lead_at_end() { 241 // Lead surrogate at end of input 242 assert_eq!(le(&[0x00, 0xD8]), "\u{FFFD}"); 243 } 244 245 #[test] 246 fn le_two_leads_in_a_row() { 247 // Two lead surrogates: D800 D801 — first is unpaired, second is unpaired at end 248 // LE: 00 D8 01 D8 249 assert_eq!(le(&[0x00, 0xD8, 0x01, 0xD8]), "\u{FFFD}\u{FFFD}"); 250 } 251 252 // -- BOM handling -- 253 254 #[test] 255 fn le_bom_stripped() { 256 // UTF-16LE BOM: FF FE 257 assert_eq!(le(&[0xFF, 0xFE, 0x41, 0x00]), "A"); 258 } 259 260 #[test] 261 fn be_bom_stripped() { 262 // UTF-16BE BOM: FE FF 263 assert_eq!(be(&[0xFE, 0xFF, 0x00, 0x41]), "A"); 264 } 265 266 #[test] 267 fn le_wrong_bom_not_stripped() { 268 // FE FF is NOT the LE BOM — it's U+FEFF (ZWNBSP) 269 assert_eq!(le(&[0xFE, 0xFF]), "\u{FFFE}"); 270 } 271 272 #[test] 273 fn be_wrong_bom_not_stripped() { 274 // FF FE is NOT the BE BOM — it's U+FFFE 275 assert_eq!(be(&[0xFF, 0xFE]), "\u{FFFE}"); 276 } 277 278 #[test] 279 fn le_bom_only() { 280 assert_eq!(le(&[0xFF, 0xFE]), ""); 281 } 282 283 #[test] 284 fn be_bom_only() { 285 assert_eq!(be(&[0xFE, 0xFF]), ""); 286 } 287 288 // -- Odd byte count -- 289 290 #[test] 291 fn le_odd_byte() { 292 assert_eq!(le(&[0x41, 0x00, 0x42]), "A\u{FFFD}"); 293 } 294 295 #[test] 296 fn be_odd_byte() { 297 assert_eq!(be(&[0x00, 0x41, 0x42]), "A\u{FFFD}"); 298 } 299 300 #[test] 301 fn single_byte() { 302 assert_eq!(le(&[0x41]), "\u{FFFD}"); 303 } 304 305 // -- Empty input -- 306 307 #[test] 308 fn empty_le() { 309 assert_eq!(le(&[]), ""); 310 } 311 312 #[test] 313 fn empty_be() { 314 assert_eq!(be(&[]), ""); 315 } 316 317 // -- Fatal mode -- 318 319 #[test] 320 fn fatal_valid_le() { 321 assert_eq!( 322 decode_utf16le(&[0x41, 0x00], ErrorMode::Fatal).unwrap(), 323 "A" 324 ); 325 } 326 327 #[test] 328 fn fatal_unpaired_lead_le() { 329 let err = decode_utf16le(&[0x00, 0xD8, 0x41, 0x00], ErrorMode::Fatal).unwrap_err(); 330 assert!(matches!( 331 err, 332 EncodingError::InvalidSequence { 333 encoding: "UTF-16LE", 334 .. 335 } 336 )); 337 } 338 339 #[test] 340 fn fatal_unpaired_trail_le() { 341 let err = decode_utf16le(&[0x00, 0xDC], ErrorMode::Fatal).unwrap_err(); 342 assert!(matches!( 343 err, 344 EncodingError::InvalidSequence { 345 encoding: "UTF-16LE", 346 .. 347 } 348 )); 349 } 350 351 #[test] 352 fn fatal_odd_byte_le() { 353 let err = decode_utf16le(&[0x41, 0x00, 0x42], ErrorMode::Fatal).unwrap_err(); 354 assert!(matches!( 355 err, 356 EncodingError::InvalidSequence { 357 encoding: "UTF-16LE", 358 .. 359 } 360 )); 361 } 362 363 // -- Mixed content -- 364 365 #[test] 366 fn le_mixed_bmp_and_supplementary() { 367 // "A" + U+1F600 + "B" 368 // LE: 41 00 | 3D D8 00 DE | 42 00 369 assert_eq!( 370 le(&[0x41, 0x00, 0x3D, 0xD8, 0x00, 0xDE, 0x42, 0x00]), 371 "A\u{1F600}B" 372 ); 373 } 374 375 #[test] 376 fn be_mixed_bmp_and_supplementary() { 377 // "A" + U+1F600 + "B" 378 // BE: 00 41 | D8 3D DE 00 | 00 42 379 assert_eq!( 380 be(&[0x00, 0x41, 0xD8, 0x3D, 0xDE, 0x00, 0x00, 0x42]), 381 "A\u{1F600}B" 382 ); 383 } 384 385 #[test] 386 fn le_null_character() { 387 // U+0000 = 00 00 in LE 388 assert_eq!(le(&[0x00, 0x00]), "\0"); 389 } 390}