crates/encoding/src/utf16.rs at promise

pierrelf.com / we
fork atom
we (web engine): Experimental web browser project to understand the limits of Claude
fork atom
we / crates / encoding / src / utf16.rs
at promise 390 lines 10 kB view raw
wrap content
pierrelf.com Implement WHATWG Encoding: UTF-8 and UTF-16 codecs 7d ago
db9d5fbf
  1//! UTF-16 decoder per WHATWG Encoding Standard.
  2
  3use crate::error::{EncodingError, Result};
  4use crate::utf8::ErrorMode;
  5
  6/// Decode a byte slice as UTF-16LE.
  7pub(crate) fn decode_utf16le(bytes: &[u8], mode: ErrorMode) -> Result<String> {
  8    decode_utf16(bytes, false, mode)
  9}
 10
 11/// Decode a byte slice as UTF-16BE.
 12pub(crate) fn decode_utf16be(bytes: &[u8], mode: ErrorMode) -> Result<String> {
 13    decode_utf16(bytes, true, mode)
 14}
 15
 16/// Shared UTF-16 decoder (WHATWG Encoding Standard §14.2).
 17fn decode_utf16(bytes: &[u8], big_endian: bool, mode: ErrorMode) -> Result<String> {
 18    let mut output = String::with_capacity(bytes.len() / 2);
 19    let mut i = 0;
 20    let mut lead_surrogate: Option<u16> = None;
 21    let mut bom_checked = false;
 22
 23    while i + 1 < bytes.len() {
 24        let code_unit = if big_endian {
 25            ((bytes[i] as u16) << 8) | (bytes[i + 1] as u16)
 26        } else {
 27            ((bytes[i + 1] as u16) << 8) | (bytes[i] as u16)
 28        };
 29        i += 2;
 30
 31        // BOM handling: strip BOM matching our endianness at the start
 32        if !bom_checked {
 33            bom_checked = true;
 34            if code_unit == 0xFEFF {
 35                // BOM matches our endianness — consume it
 36                continue;
 37            }
 38            // 0xFFFE is NOT treated as a BOM — fall through to normal processing
 39        }
 40
 41        if is_lead_surrogate(code_unit) {
 42            // If we already have an unpaired lead, emit error for it
 43            if let Some(_prev) = lead_surrogate {
 44                if mode == ErrorMode::Fatal {
 45                    return Err(EncodingError::InvalidSequence {
 46                        encoding: encoding_name(big_endian),
 47                        position: i - 4, // position of the previous unpaired lead
 48                    });
 49                }
 50                output.push('\u{FFFD}');
 51            }
 52            lead_surrogate = Some(code_unit);
 53        } else if is_trail_surrogate(code_unit) {
 54            if let Some(lead) = lead_surrogate.take() {
 55                // Valid surrogate pair — compute supplementary code point
 56                let cp = 0x10000 + ((lead as u32 - 0xD800) << 10) + (code_unit as u32 - 0xDC00);
 57                let ch = char::from_u32(cp).unwrap_or('\u{FFFD}');
 58                output.push(ch);
 59            } else {
 60                // Trail surrogate without lead
 61                if mode == ErrorMode::Fatal {
 62                    return Err(EncodingError::InvalidSequence {
 63                        encoding: encoding_name(big_endian),
 64                        position: i - 2,
 65                    });
 66                }
 67                output.push('\u{FFFD}');
 68            }
 69        } else {
 70            // Regular BMP character
 71            if let Some(_lead) = lead_surrogate.take() {
 72                // Unpaired lead surrogate before this code unit
 73                if mode == ErrorMode::Fatal {
 74                    return Err(EncodingError::InvalidSequence {
 75                        encoding: encoding_name(big_endian),
 76                        position: i - 4,
 77                    });
 78                }
 79                output.push('\u{FFFD}');
 80            }
 81            let ch = char::from_u32(code_unit as u32).unwrap_or('\u{FFFD}');
 82            output.push(ch);
 83        }
 84    }
 85
 86    // Handle trailing single byte (odd byte count)
 87    if i < bytes.len() {
 88        // Flush any pending lead surrogate first
 89        if lead_surrogate.take().is_some() {
 90            if mode == ErrorMode::Fatal {
 91                return Err(EncodingError::InvalidSequence {
 92                    encoding: encoding_name(big_endian),
 93                    position: i - 2,
 94                });
 95            }
 96            output.push('\u{FFFD}');
 97        }
 98        if mode == ErrorMode::Fatal {
 99            return Err(EncodingError::InvalidSequence {
100                encoding: encoding_name(big_endian),
101                position: i,
102            });
103        }
104        output.push('\u{FFFD}');
105    } else if lead_surrogate.is_some() {
106        // Unpaired lead surrogate at end of input
107        if mode == ErrorMode::Fatal {
108            return Err(EncodingError::InvalidSequence {
109                encoding: encoding_name(big_endian),
110                position: i - 2,
111            });
112        }
113        output.push('\u{FFFD}');
114    }
115
116    Ok(output)
117}
118
/// Returns `true` when `cu` is a lead (high) surrogate, i.e. lies in
/// `0xD800..=0xDBFF`.
fn is_lead_surrogate(cu: u16) -> bool {
    matches!(cu, 0xD800..=0xDBFF)
}
122
/// Returns `true` when `cu` is a trail (low) surrogate, i.e. lies in
/// `0xDC00..=0xDFFF`.
fn is_trail_surrogate(cu: u16) -> bool {
    matches!(cu, 0xDC00..=0xDFFF)
}
126
/// Maps the byte-order flag to the encoding label used in error values:
/// `"UTF-16BE"` for big-endian, `"UTF-16LE"` otherwise.
fn encoding_name(big_endian: bool) -> &'static str {
    match big_endian {
        true => "UTF-16BE",
        false => "UTF-16LE",
    }
}
134
135// ---------------------------------------------------------------------------
136// Tests
137// ---------------------------------------------------------------------------
138
#[cfg(test)]
mod tests {
    //! Unit tests for the UTF-16LE / UTF-16BE decoders.
    //!
    //! Replacement-mode tests compare the decoded string; fatal-mode tests
    //! check the encoding label and the byte offset carried by the error.

    use super::*;

    /// Decodes as UTF-16LE in replacement mode; panics if decoding fails.
    fn le(bytes: &[u8]) -> String {
        decode_utf16le(bytes, ErrorMode::Replacement).unwrap()
    }

    /// Decodes as UTF-16BE in replacement mode; panics if decoding fails.
    fn be(bytes: &[u8]) -> String {
        decode_utf16be(bytes, ErrorMode::Replacement).unwrap()
    }

    /// Decodes as UTF-16LE in fatal mode and returns the error; panics if
    /// decoding succeeds.
    fn fatal_le(bytes: &[u8]) -> EncodingError {
        decode_utf16le(bytes, ErrorMode::Fatal).unwrap_err()
    }

    /// Decodes as UTF-16BE in fatal mode and returns the error; panics if
    /// decoding succeeds.
    fn fatal_be(bytes: &[u8]) -> EncodingError {
        decode_utf16be(bytes, ErrorMode::Fatal).unwrap_err()
    }

    // -- Basic ASCII --

    #[test]
    fn le_ascii() {
        assert_eq!(le(&[0x41, 0x00]), "A");
    }

    #[test]
    fn be_ascii() {
        assert_eq!(be(&[0x00, 0x41]), "A");
    }

    #[test]
    fn le_hello() {
        assert_eq!(le(&[0x48, 0x00, 0x69, 0x00]), "Hi");
    }

    #[test]
    fn be_hello() {
        assert_eq!(be(&[0x00, 0x48, 0x00, 0x69]), "Hi");
    }

    // -- BMP characters --

    #[test]
    fn le_bmp() {
        // U+00E9 (e with acute) = 0xE9 0x00 in LE
        assert_eq!(le(&[0xE9, 0x00]), "\u{00E9}");
    }

    #[test]
    fn be_bmp() {
        // U+00E9 in BE = 0x00 0xE9
        assert_eq!(be(&[0x00, 0xE9]), "\u{00E9}");
    }

    #[test]
    fn le_cjk() {
        // U+4E16 = 0x16 0x4E in LE
        assert_eq!(le(&[0x16, 0x4E]), "\u{4E16}");
    }

    // -- Surrogate pairs --

    #[test]
    fn le_surrogate_pair() {
        // U+1F600 = D83D DE00 in UTF-16
        // LE: 3D D8 00 DE
        assert_eq!(le(&[0x3D, 0xD8, 0x00, 0xDE]), "\u{1F600}");
    }

    #[test]
    fn be_surrogate_pair() {
        // U+1F600 = D83D DE00 in UTF-16
        // BE: D8 3D DE 00
        assert_eq!(be(&[0xD8, 0x3D, 0xDE, 0x00]), "\u{1F600}");
    }

    #[test]
    fn le_supplementary_u10000() {
        // U+10000 = D800 DC00
        // LE: 00 D8 00 DC
        assert_eq!(le(&[0x00, 0xD8, 0x00, 0xDC]), "\u{10000}");
    }

    #[test]
    fn le_supplementary_u10ffff() {
        // U+10FFFF = DBFF DFFF
        // LE: FF DB FF DF
        assert_eq!(le(&[0xFF, 0xDB, 0xFF, 0xDF]), "\u{10FFFF}");
    }

    // -- Unpaired surrogates --

    #[test]
    fn le_unpaired_lead() {
        // Lead surrogate D800 followed by non-surrogate 0041
        // LE: 00 D8 41 00
        assert_eq!(le(&[0x00, 0xD8, 0x41, 0x00]), "\u{FFFD}A");
    }

    #[test]
    fn le_unpaired_trail() {
        // Trail surrogate DC00 without lead
        // LE: 00 DC
        assert_eq!(le(&[0x00, 0xDC]), "\u{FFFD}");
    }

    #[test]
    fn le_lead_at_end() {
        // Lead surrogate at end of input
        assert_eq!(le(&[0x00, 0xD8]), "\u{FFFD}");
    }

    #[test]
    fn le_two_leads_in_a_row() {
        // Two lead surrogates: D800 D801 — first is unpaired, second is unpaired at end
        // LE: 00 D8 01 D8
        assert_eq!(le(&[0x00, 0xD8, 0x01, 0xD8]), "\u{FFFD}\u{FFFD}");
    }

    // -- BOM handling --

    #[test]
    fn le_bom_stripped() {
        // UTF-16LE BOM: FF FE
        assert_eq!(le(&[0xFF, 0xFE, 0x41, 0x00]), "A");
    }

    #[test]
    fn be_bom_stripped() {
        // UTF-16BE BOM: FE FF
        assert_eq!(be(&[0xFE, 0xFF, 0x00, 0x41]), "A");
    }

    #[test]
    fn le_wrong_bom_not_stripped() {
        // FE FF is NOT the LE BOM — read little-endian it is code unit
        // 0xFFFE, which is decoded as the ordinary character U+FFFE.
        assert_eq!(le(&[0xFE, 0xFF]), "\u{FFFE}");
    }

    #[test]
    fn be_wrong_bom_not_stripped() {
        // FF FE is NOT the BE BOM — read big-endian it is code unit 0xFFFE,
        // which is decoded as the ordinary character U+FFFE.
        assert_eq!(be(&[0xFF, 0xFE]), "\u{FFFE}");
    }

    #[test]
    fn le_bom_only() {
        assert_eq!(le(&[0xFF, 0xFE]), "");
    }

    #[test]
    fn be_bom_only() {
        assert_eq!(be(&[0xFE, 0xFF]), "");
    }

    // -- Odd byte count --

    #[test]
    fn le_odd_byte() {
        assert_eq!(le(&[0x41, 0x00, 0x42]), "A\u{FFFD}");
    }

    #[test]
    fn be_odd_byte() {
        assert_eq!(be(&[0x00, 0x41, 0x42]), "A\u{FFFD}");
    }

    #[test]
    fn single_byte() {
        assert_eq!(le(&[0x41]), "\u{FFFD}");
    }

    // -- Empty input --

    #[test]
    fn empty_le() {
        assert_eq!(le(&[]), "");
    }

    #[test]
    fn empty_be() {
        assert_eq!(be(&[]), "");
    }

    // -- Fatal mode --

    #[test]
    fn fatal_valid_le() {
        assert_eq!(
            decode_utf16le(&[0x41, 0x00], ErrorMode::Fatal).unwrap(),
            "A"
        );
    }

    #[test]
    fn fatal_valid_be() {
        assert_eq!(
            decode_utf16be(&[0x00, 0x41], ErrorMode::Fatal).unwrap(),
            "A"
        );
    }

    #[test]
    fn fatal_unpaired_lead_le() {
        // The error points at the unpaired lead (offset 0), not at the
        // code unit that revealed it.
        assert!(matches!(
            fatal_le(&[0x00, 0xD8, 0x41, 0x00]),
            EncodingError::InvalidSequence {
                encoding: "UTF-16LE",
                position: 0,
            }
        ));
    }

    #[test]
    fn fatal_unpaired_lead_be() {
        assert!(matches!(
            fatal_be(&[0xD8, 0x00, 0x00, 0x41]),
            EncodingError::InvalidSequence {
                encoding: "UTF-16BE",
                position: 0,
            }
        ));
    }

    #[test]
    fn fatal_unpaired_trail_le() {
        assert!(matches!(
            fatal_le(&[0x00, 0xDC]),
            EncodingError::InvalidSequence {
                encoding: "UTF-16LE",
                position: 0,
            }
        ));
    }

    #[test]
    fn fatal_unpaired_trail_be() {
        // 'A' occupies bytes 0..2, so the lone trail starts at offset 2.
        assert!(matches!(
            fatal_be(&[0x00, 0x41, 0xDC, 0x00]),
            EncodingError::InvalidSequence {
                encoding: "UTF-16BE",
                position: 2,
            }
        ));
    }

    #[test]
    fn fatal_lead_at_end_le() {
        assert!(matches!(
            fatal_le(&[0x00, 0xD8]),
            EncodingError::InvalidSequence {
                encoding: "UTF-16LE",
                position: 0,
            }
        ));
    }

    #[test]
    fn fatal_two_leads_le() {
        // The first lead is the offending sequence.
        assert!(matches!(
            fatal_le(&[0x00, 0xD8, 0x01, 0xD8]),
            EncodingError::InvalidSequence {
                encoding: "UTF-16LE",
                position: 0,
            }
        ));
    }

    #[test]
    fn fatal_odd_byte_le() {
        // The dangling byte sits at offset 2, after the complete 'A'.
        assert!(matches!(
            fatal_le(&[0x41, 0x00, 0x42]),
            EncodingError::InvalidSequence {
                encoding: "UTF-16LE",
                position: 2,
            }
        ));
    }

    #[test]
    fn fatal_position_counts_stripped_bom_le() {
        // The BOM is dropped from the output but still counts towards the
        // byte offset reported in the error.
        assert!(matches!(
            fatal_le(&[0xFF, 0xFE, 0x00, 0xDC]),
            EncodingError::InvalidSequence {
                encoding: "UTF-16LE",
                position: 2,
            }
        ));
    }

    // -- Mixed content --

    #[test]
    fn le_mixed_bmp_and_supplementary() {
        // "A" + U+1F600 + "B"
        // LE: 41 00 | 3D D8 00 DE | 42 00
        assert_eq!(
            le(&[0x41, 0x00, 0x3D, 0xD8, 0x00, 0xDE, 0x42, 0x00]),
            "A\u{1F600}B"
        );
    }

    #[test]
    fn be_mixed_bmp_and_supplementary() {
        // "A" + U+1F600 + "B"
        // BE: 00 41 | D8 3D DE 00 | 00 42
        assert_eq!(
            be(&[0x00, 0x41, 0xD8, 0x3D, 0xDE, 0x00, 0x00, 0x42]),
            "A\u{1F600}B"
        );
    }

    #[test]
    fn le_null_character() {
        // U+0000 = 00 00 in LE
        assert_eq!(le(&[0x00, 0x00]), "\0");
    }
}