we (web engine): Experimental web browser project to understand the limits of Claude
1//! UTF-16 decoder per WHATWG Encoding Standard.
2
3use crate::error::{EncodingError, Result};
4use crate::utf8::ErrorMode;
5
6/// Decode a byte slice as UTF-16LE.
7pub(crate) fn decode_utf16le(bytes: &[u8], mode: ErrorMode) -> Result<String> {
8 decode_utf16(bytes, false, mode)
9}
10
11/// Decode a byte slice as UTF-16BE.
12pub(crate) fn decode_utf16be(bytes: &[u8], mode: ErrorMode) -> Result<String> {
13 decode_utf16(bytes, true, mode)
14}
15
16/// Shared UTF-16 decoder (WHATWG Encoding Standard §14.2).
17fn decode_utf16(bytes: &[u8], big_endian: bool, mode: ErrorMode) -> Result<String> {
18 let mut output = String::with_capacity(bytes.len() / 2);
19 let mut i = 0;
20 let mut lead_surrogate: Option<u16> = None;
21 let mut bom_checked = false;
22
23 while i + 1 < bytes.len() {
24 let code_unit = if big_endian {
25 ((bytes[i] as u16) << 8) | (bytes[i + 1] as u16)
26 } else {
27 ((bytes[i + 1] as u16) << 8) | (bytes[i] as u16)
28 };
29 i += 2;
30
31 // BOM handling: strip BOM matching our endianness at the start
32 if !bom_checked {
33 bom_checked = true;
34 if code_unit == 0xFEFF {
35 // BOM matches our endianness — consume it
36 continue;
37 }
38 // 0xFFFE is NOT treated as a BOM — fall through to normal processing
39 }
40
41 if is_lead_surrogate(code_unit) {
42 // If we already have an unpaired lead, emit error for it
43 if let Some(_prev) = lead_surrogate {
44 if mode == ErrorMode::Fatal {
45 return Err(EncodingError::InvalidSequence {
46 encoding: encoding_name(big_endian),
47 position: i - 4, // position of the previous unpaired lead
48 });
49 }
50 output.push('\u{FFFD}');
51 }
52 lead_surrogate = Some(code_unit);
53 } else if is_trail_surrogate(code_unit) {
54 if let Some(lead) = lead_surrogate.take() {
55 // Valid surrogate pair — compute supplementary code point
56 let cp = 0x10000 + ((lead as u32 - 0xD800) << 10) + (code_unit as u32 - 0xDC00);
57 let ch = char::from_u32(cp).unwrap_or('\u{FFFD}');
58 output.push(ch);
59 } else {
60 // Trail surrogate without lead
61 if mode == ErrorMode::Fatal {
62 return Err(EncodingError::InvalidSequence {
63 encoding: encoding_name(big_endian),
64 position: i - 2,
65 });
66 }
67 output.push('\u{FFFD}');
68 }
69 } else {
70 // Regular BMP character
71 if let Some(_lead) = lead_surrogate.take() {
72 // Unpaired lead surrogate before this code unit
73 if mode == ErrorMode::Fatal {
74 return Err(EncodingError::InvalidSequence {
75 encoding: encoding_name(big_endian),
76 position: i - 4,
77 });
78 }
79 output.push('\u{FFFD}');
80 }
81 let ch = char::from_u32(code_unit as u32).unwrap_or('\u{FFFD}');
82 output.push(ch);
83 }
84 }
85
86 // Handle trailing single byte (odd byte count)
87 if i < bytes.len() {
88 // Flush any pending lead surrogate first
89 if lead_surrogate.take().is_some() {
90 if mode == ErrorMode::Fatal {
91 return Err(EncodingError::InvalidSequence {
92 encoding: encoding_name(big_endian),
93 position: i - 2,
94 });
95 }
96 output.push('\u{FFFD}');
97 }
98 if mode == ErrorMode::Fatal {
99 return Err(EncodingError::InvalidSequence {
100 encoding: encoding_name(big_endian),
101 position: i,
102 });
103 }
104 output.push('\u{FFFD}');
105 } else if lead_surrogate.is_some() {
106 // Unpaired lead surrogate at end of input
107 if mode == ErrorMode::Fatal {
108 return Err(EncodingError::InvalidSequence {
109 encoding: encoding_name(big_endian),
110 position: i - 2,
111 });
112 }
113 output.push('\u{FFFD}');
114 }
115
116 Ok(output)
117}
118
/// Returns `true` when `cu` lies in the lead (high) surrogate range
/// 0xD800..=0xDBFF.
fn is_lead_surrogate(cu: u16) -> bool {
    matches!(cu, 0xD800..=0xDBFF)
}
122
/// Returns `true` when `cu` lies in the trail (low) surrogate range
/// 0xDC00..=0xDFFF.
fn is_trail_surrogate(cu: u16) -> bool {
    matches!(cu, 0xDC00..=0xDFFF)
}
126
/// Maps the endianness flag to the encoding label used in error values:
/// `"UTF-16BE"` when `big_endian` is set, `"UTF-16LE"` otherwise.
fn encoding_name(big_endian: bool) -> &'static str {
    match big_endian {
        true => "UTF-16BE",
        false => "UTF-16LE",
    }
}
134
135// ---------------------------------------------------------------------------
136// Tests
137// ---------------------------------------------------------------------------
138
#[cfg(test)]
mod tests {
    use super::*;

    /// Decode as UTF-16LE in replacement mode, panicking on error.
    fn le(bytes: &[u8]) -> String {
        decode_utf16le(bytes, ErrorMode::Replacement).unwrap()
    }

    /// Decode as UTF-16BE in replacement mode, panicking on error.
    fn be(bytes: &[u8]) -> String {
        decode_utf16be(bytes, ErrorMode::Replacement).unwrap()
    }

    // -- Basic ASCII --

    #[test]
    fn le_ascii() {
        assert_eq!(le(&[0x41, 0x00]), "A");
    }

    #[test]
    fn be_ascii() {
        assert_eq!(be(&[0x00, 0x41]), "A");
    }

    #[test]
    fn le_hello() {
        assert_eq!(le(&[0x48, 0x00, 0x69, 0x00]), "Hi");
    }

    #[test]
    fn be_hello() {
        assert_eq!(be(&[0x00, 0x48, 0x00, 0x69]), "Hi");
    }

    // -- BMP characters --

    #[test]
    fn le_bmp() {
        // U+00E9 (e with acute) = 0xE9 0x00 in LE
        assert_eq!(le(&[0xE9, 0x00]), "\u{00E9}");
    }

    #[test]
    fn be_bmp() {
        // U+00E9 in BE = 0x00 0xE9
        assert_eq!(be(&[0x00, 0xE9]), "\u{00E9}");
    }

    #[test]
    fn le_cjk() {
        // U+4E16 = 0x16 0x4E in LE
        assert_eq!(le(&[0x16, 0x4E]), "\u{4E16}");
    }

    // -- Surrogate pairs --

    #[test]
    fn le_surrogate_pair() {
        // U+1F600 = D83D DE00 in UTF-16
        // LE: 3D D8 00 DE
        assert_eq!(le(&[0x3D, 0xD8, 0x00, 0xDE]), "\u{1F600}");
    }

    #[test]
    fn be_surrogate_pair() {
        // U+1F600 = D83D DE00 in UTF-16
        // BE: D8 3D DE 00
        assert_eq!(be(&[0xD8, 0x3D, 0xDE, 0x00]), "\u{1F600}");
    }

    #[test]
    fn le_supplementary_u10000() {
        // U+10000 = D800 DC00
        // LE: 00 D8 00 DC
        assert_eq!(le(&[0x00, 0xD8, 0x00, 0xDC]), "\u{10000}");
    }

    #[test]
    fn le_supplementary_u10ffff() {
        // U+10FFFF = DBFF DFFF
        // LE: FF DB FF DF
        assert_eq!(le(&[0xFF, 0xDB, 0xFF, 0xDF]), "\u{10FFFF}");
    }

    // -- Unpaired surrogates --

    #[test]
    fn le_unpaired_lead() {
        // Lead surrogate D800 followed by non-surrogate 0041
        // LE: 00 D8 41 00
        assert_eq!(le(&[0x00, 0xD8, 0x41, 0x00]), "\u{FFFD}A");
    }

    #[test]
    fn be_unpaired_lead() {
        // Lead surrogate D800 followed by non-surrogate 0041
        // BE: D8 00 00 41
        assert_eq!(be(&[0xD8, 0x00, 0x00, 0x41]), "\u{FFFD}A");
    }

    #[test]
    fn le_unpaired_trail() {
        // Trail surrogate DC00 without lead
        // LE: 00 DC
        assert_eq!(le(&[0x00, 0xDC]), "\u{FFFD}");
    }

    #[test]
    fn le_lead_at_end() {
        // Lead surrogate at end of input
        assert_eq!(le(&[0x00, 0xD8]), "\u{FFFD}");
    }

    #[test]
    fn le_two_leads_in_a_row() {
        // Two lead surrogates: D800 D801 — first is unpaired, second is unpaired at end
        // LE: 00 D8 01 D8
        assert_eq!(le(&[0x00, 0xD8, 0x01, 0xD8]), "\u{FFFD}\u{FFFD}");
    }

    // -- BOM handling --

    #[test]
    fn le_bom_stripped() {
        // UTF-16LE BOM: FF FE
        assert_eq!(le(&[0xFF, 0xFE, 0x41, 0x00]), "A");
    }

    #[test]
    fn be_bom_stripped() {
        // UTF-16BE BOM: FE FF
        assert_eq!(be(&[0xFE, 0xFF, 0x00, 0x41]), "A");
    }

    #[test]
    fn le_wrong_bom_not_stripped() {
        // FE FF is NOT the LE BOM — read little-endian it is the code unit
        // 0xFFFE, which decodes to the noncharacter U+FFFE
        assert_eq!(le(&[0xFE, 0xFF]), "\u{FFFE}");
    }

    #[test]
    fn be_wrong_bom_not_stripped() {
        // FF FE is NOT the BE BOM — it's U+FFFE
        assert_eq!(be(&[0xFF, 0xFE]), "\u{FFFE}");
    }

    #[test]
    fn le_bom_only() {
        assert_eq!(le(&[0xFF, 0xFE]), "");
    }

    #[test]
    fn be_bom_only() {
        assert_eq!(be(&[0xFE, 0xFF]), "");
    }

    #[test]
    fn le_bom_after_first_unit_is_kept() {
        // Only the very first code unit is checked for a BOM; a later U+FEFF
        // is ordinary text.
        assert_eq!(le(&[0x41, 0x00, 0xFF, 0xFE]), "A\u{FEFF}");
    }

    #[test]
    fn le_only_first_bom_is_stripped() {
        // Two consecutive FF FE units: the first is the BOM, the second stays.
        assert_eq!(le(&[0xFF, 0xFE, 0xFF, 0xFE]), "\u{FEFF}");
    }

    // -- Odd byte count --

    #[test]
    fn le_odd_byte() {
        assert_eq!(le(&[0x41, 0x00, 0x42]), "A\u{FFFD}");
    }

    #[test]
    fn be_odd_byte() {
        assert_eq!(be(&[0x00, 0x41, 0x42]), "A\u{FFFD}");
    }

    #[test]
    fn single_byte() {
        assert_eq!(le(&[0x41]), "\u{FFFD}");
    }

    // -- Empty input --

    #[test]
    fn empty_le() {
        assert_eq!(le(&[]), "");
    }

    #[test]
    fn empty_be() {
        assert_eq!(be(&[]), "");
    }

    // -- Fatal mode --

    #[test]
    fn fatal_valid_le() {
        assert_eq!(
            decode_utf16le(&[0x41, 0x00], ErrorMode::Fatal).unwrap(),
            "A"
        );
    }

    #[test]
    fn fatal_unpaired_lead_le() {
        let err = decode_utf16le(&[0x00, 0xD8, 0x41, 0x00], ErrorMode::Fatal).unwrap_err();
        assert!(matches!(
            err,
            EncodingError::InvalidSequence {
                encoding: "UTF-16LE",
                ..
            }
        ));
    }

    #[test]
    fn fatal_unpaired_lead_be() {
        let err = decode_utf16be(&[0xD8, 0x00, 0x00, 0x41], ErrorMode::Fatal).unwrap_err();
        assert!(matches!(
            err,
            EncodingError::InvalidSequence {
                encoding: "UTF-16BE",
                ..
            }
        ));
    }

    #[test]
    fn fatal_unpaired_trail_le() {
        let err = decode_utf16le(&[0x00, 0xDC], ErrorMode::Fatal).unwrap_err();
        assert!(matches!(
            err,
            EncodingError::InvalidSequence {
                encoding: "UTF-16LE",
                ..
            }
        ));
    }

    #[test]
    fn fatal_odd_byte_le() {
        let err = decode_utf16le(&[0x41, 0x00, 0x42], ErrorMode::Fatal).unwrap_err();
        assert!(matches!(
            err,
            EncodingError::InvalidSequence {
                encoding: "UTF-16LE",
                ..
            }
        ));
    }

    #[test]
    fn fatal_lead_at_end_le() {
        let err = decode_utf16le(&[0x41, 0x00, 0x00, 0xD8], ErrorMode::Fatal).unwrap_err();
        assert!(matches!(
            err,
            EncodingError::InvalidSequence {
                encoding: "UTF-16LE",
                ..
            }
        ));
    }

    #[test]
    fn fatal_positions_are_byte_offsets_of_the_bad_sequence() {
        // Unpaired lead at offset 2, detected when the following "B" is read.
        let err =
            decode_utf16le(&[0x41, 0x00, 0x00, 0xD8, 0x42, 0x00], ErrorMode::Fatal).unwrap_err();
        assert!(matches!(
            err,
            EncodingError::InvalidSequence { position: 2, .. }
        ));

        // Lone trail surrogate at offset 2.
        let err = decode_utf16le(&[0x41, 0x00, 0x00, 0xDC], ErrorMode::Fatal).unwrap_err();
        assert!(matches!(
            err,
            EncodingError::InvalidSequence { position: 2, .. }
        ));

        // Dangling odd byte at offset 2.
        let err = decode_utf16le(&[0x41, 0x00, 0x42], ErrorMode::Fatal).unwrap_err();
        assert!(matches!(
            err,
            EncodingError::InvalidSequence { position: 2, .. }
        ));
    }

    // -- Mixed content --

    #[test]
    fn le_mixed_bmp_and_supplementary() {
        // "A" + U+1F600 + "B"
        // LE: 41 00 | 3D D8 00 DE | 42 00
        assert_eq!(
            le(&[0x41, 0x00, 0x3D, 0xD8, 0x00, 0xDE, 0x42, 0x00]),
            "A\u{1F600}B"
        );
    }

    #[test]
    fn be_mixed_bmp_and_supplementary() {
        // "A" + U+1F600 + "B"
        // BE: 00 41 | D8 3D DE 00 | 00 42
        assert_eq!(
            be(&[0x00, 0x41, 0xD8, 0x3D, 0xDE, 0x00, 0x00, 0x42]),
            "A\u{1F600}B"
        );
    }

    #[test]
    fn le_null_character() {
        // U+0000 = 00 00 in LE
        assert_eq!(le(&[0x00, 0x00]), "\0");
    }
}