crates/encoding/src/utf8.rs at promise

pierrelf.com / we
fork atom
we (web engine): Experimental web browser project to understand the limits of Claude
fork atom
we / crates / encoding / src / utf8.rs
at promise 486 lines 14 kB view raw
wrap content
pierrelf.com Implement WHATWG Encoding: UTF-8 and UTF-16 codecs 7d ago
db9d5fbf
  1//! UTF-8 decoder and encoder per WHATWG Encoding Standard.
  2
  3use crate::error::{EncodingError, Result};
  4
  5/// Error handling mode.
  6#[derive(Debug, Clone, Copy, PartialEq, Eq)]
  7pub(crate) enum ErrorMode {
  8    Replacement,
  9    Fatal,
 10}
 11
 12/// Decode a byte slice as UTF-8.
 13///
 14/// In replacement mode, invalid sequences are replaced with U+FFFD.
 15/// In fatal mode, the first invalid sequence causes an error.
 16pub(crate) fn decode_utf8(bytes: &[u8], mode: ErrorMode) -> Result<String> {
 17    // Strip UTF-8 BOM if present
 18    let bytes = if bytes.len() >= 3 && bytes[0] == 0xEF && bytes[1] == 0xBB && bytes[2] == 0xBF {
 19        &bytes[3..]
 20    } else {
 21        bytes
 22    };
 23
 24    let mut output = String::with_capacity(bytes.len());
 25    let mut decoder = Utf8Decoder::new();
 26    let mut i = 0;
 27
 28    while i < bytes.len() {
 29        match decoder.process_byte(bytes[i]) {
 30            DecoderResult::CodePoint(ch) => {
 31                output.push(ch);
 32                i += 1;
 33            }
 34            DecoderResult::Error(error_pos) => {
 35                if mode == ErrorMode::Fatal {
 36                    return Err(EncodingError::InvalidSequence {
 37                        encoding: "UTF-8",
 38                        position: error_pos,
 39                    });
 40                }
 41                output.push('\u{FFFD}');
 42                i += 1;
 43            }
 44            DecoderResult::ErrorPrepend(error_pos) => {
 45                if mode == ErrorMode::Fatal {
 46                    return Err(EncodingError::InvalidSequence {
 47                        encoding: "UTF-8",
 48                        position: error_pos,
 49                    });
 50                }
 51                output.push('\u{FFFD}');
 52                // Do NOT advance i — re-process this byte
 53            }
 54            DecoderResult::Continue => {
 55                i += 1;
 56            }
 57        }
 58    }
 59
 60    // Handle incomplete sequence at end of input
 61    if decoder.bytes_needed > 0 {
 62        if mode == ErrorMode::Fatal {
 63            return Err(EncodingError::InvalidSequence {
 64                encoding: "UTF-8",
 65                position: bytes.len().saturating_sub(decoder.bytes_seen as usize),
 66            });
 67        }
 68        output.push('\u{FFFD}');
 69    }
 70
 71    Ok(output)
 72}
 73
 74/// Encode a string as UTF-8 bytes.
 75///
 76/// Since Rust strings are already valid UTF-8, this is a straightforward copy.
 77pub(crate) fn encode_utf8(text: &str) -> Vec<u8> {
 78    text.as_bytes().to_vec()
 79}
 80
 81// ---------------------------------------------------------------------------
 82// Streaming UTF-8 decoder (WHATWG Encoding Standard §8.1.1)
 83// ---------------------------------------------------------------------------
 84
 85enum DecoderResult {
 86    /// A valid code point was decoded.
 87    CodePoint(char),
 88    /// An error occurred at the given byte position; advance to next byte.
 89    Error(usize),
 90    /// An error occurred at the given byte position; re-process current byte.
 91    ErrorPrepend(usize),
 92    /// More bytes needed; continue feeding.
 93    Continue,
 94}
 95
 96struct Utf8Decoder {
 97    code_point: u32,
 98    bytes_seen: u8,
 99    bytes_needed: u8,
100    lower_boundary: u8,
101    upper_boundary: u8,
102    /// Position of the start of the current multi-byte sequence.
103    sequence_start: usize,
104    /// Total bytes processed so far.
105    position: usize,
106}
107
108impl Utf8Decoder {
109    fn new() -> Self {
110        Self {
111            code_point: 0,
112            bytes_seen: 0,
113            bytes_needed: 0,
114            lower_boundary: 0x80,
115            upper_boundary: 0xBF,
116            sequence_start: 0,
117            position: 0,
118        }
119    }
120
121    fn process_byte(&mut self, byte: u8) -> DecoderResult {
122        let pos = self.position;
123        self.position += 1;
124
125        if self.bytes_needed == 0 {
126            match byte {
127                0x00..=0x7F => DecoderResult::CodePoint(byte as char),
128                0xC2..=0xDF => {
129                    self.bytes_needed = 1;
130                    self.code_point = (byte & 0x1F) as u32;
131                    self.sequence_start = pos;
132                    DecoderResult::Continue
133                }
134                0xE0 => {
135                    self.bytes_needed = 2;
136                    self.lower_boundary = 0xA0;
137                    self.code_point = (byte & 0x0F) as u32;
138                    self.sequence_start = pos;
139                    DecoderResult::Continue
140                }
141                0xE1..=0xEC | 0xEE..=0xEF => {
142                    self.bytes_needed = 2;
143                    self.code_point = (byte & 0x0F) as u32;
144                    self.sequence_start = pos;
145                    DecoderResult::Continue
146                }
147                0xED => {
148                    self.bytes_needed = 2;
149                    self.upper_boundary = 0x9F;
150                    self.code_point = (byte & 0x0F) as u32;
151                    self.sequence_start = pos;
152                    DecoderResult::Continue
153                }
154                0xF0 => {
155                    self.bytes_needed = 3;
156                    self.lower_boundary = 0x90;
157                    self.code_point = (byte & 0x07) as u32;
158                    self.sequence_start = pos;
159                    DecoderResult::Continue
160                }
161                0xF1..=0xF3 => {
162                    self.bytes_needed = 3;
163                    self.code_point = (byte & 0x07) as u32;
164                    self.sequence_start = pos;
165                    DecoderResult::Continue
166                }
167                0xF4 => {
168                    self.bytes_needed = 3;
169                    self.upper_boundary = 0x8F;
170                    self.code_point = (byte & 0x07) as u32;
171                    self.sequence_start = pos;
172                    DecoderResult::Continue
173                }
174                _ => {
175                    // 0x80..=0xC1, 0xF5..=0xFF: invalid lead byte
176                    DecoderResult::Error(pos)
177                }
178            }
179        } else {
180            // Expecting continuation byte
181            if byte < self.lower_boundary || byte > self.upper_boundary {
182                // Invalid continuation — reset and prepend byte
183                let err_pos = self.sequence_start;
184                self.reset();
185                self.position -= 1; // will be re-processed
186                return DecoderResult::ErrorPrepend(err_pos);
187            }
188
189            // Valid continuation byte
190            self.lower_boundary = 0x80;
191            self.upper_boundary = 0xBF;
192            self.code_point = (self.code_point << 6) | (byte & 0x3F) as u32;
193            self.bytes_seen += 1;
194
195            if self.bytes_seen == self.bytes_needed {
196                let cp = self.code_point;
197                self.reset();
198                // The WHATWG state machine guarantees valid scalar values here,
199                // but use fallback for defense-in-depth.
200                let ch = char::from_u32(cp).unwrap_or('\u{FFFD}');
201                DecoderResult::CodePoint(ch)
202            } else {
203                DecoderResult::Continue
204            }
205        }
206    }
207
208    fn reset(&mut self) {
209        self.code_point = 0;
210        self.bytes_seen = 0;
211        self.bytes_needed = 0;
212        self.lower_boundary = 0x80;
213        self.upper_boundary = 0xBF;
214    }
215}
216
217// ---------------------------------------------------------------------------
218// Tests
219// ---------------------------------------------------------------------------
220
#[cfg(test)]
mod tests {
    use super::*;

    /// U+FFFD, the character emitted for each invalid sequence.
    const REPLACEMENT: &str = "\u{FFFD}";

    /// Decodes in replacement mode, which is expected never to fail.
    fn lossy(input: &[u8]) -> String {
        decode_utf8(input, ErrorMode::Replacement).unwrap()
    }

    /// Decodes in fatal mode, handing back the raw result.
    fn strict(input: &[u8]) -> Result<String> {
        decode_utf8(input, ErrorMode::Fatal)
    }

    // -- Basic ASCII --

    #[test]
    fn ascii_roundtrip() {
        let text = "Hello, world!";
        assert_eq!(lossy(text.as_bytes()), text);
    }

    #[test]
    fn empty_input() {
        assert_eq!(lossy(b""), "");
    }

    #[test]
    fn null_byte() {
        assert_eq!(lossy(b"\x00"), "\0");
    }

    // -- Multi-byte sequences --

    #[test]
    fn two_byte_sequence() {
        // C3 A9 encodes U+00E9 (e with acute).
        assert_eq!(lossy(b"\xC3\xA9"), "\u{00E9}");
    }

    #[test]
    fn three_byte_sequence() {
        // E4 B8 96 encodes U+4E16 (a CJK character).
        assert_eq!(lossy(b"\xE4\xB8\x96"), "\u{4E16}");
    }

    #[test]
    fn four_byte_sequence() {
        // F0 9F 98 80 encodes U+1F600 (grinning face).
        assert_eq!(lossy(b"\xF0\x9F\x98\x80"), "\u{1F600}");
    }

    #[test]
    fn mixed_ascii_and_multibyte() {
        // "Caf" followed by the two-byte encoding of U+00E9.
        assert_eq!(lossy(b"Caf\xC3\xA9"), "Caf\u{00E9}");
    }

    // -- BOM handling --

    #[test]
    fn bom_stripped() {
        // UTF-8 BOM followed by "A".
        assert_eq!(lossy(b"\xEF\xBB\xBF\x41"), "A");
    }

    #[test]
    fn bom_only() {
        assert_eq!(lossy(b"\xEF\xBB\xBF"), "");
    }

    // -- Invalid sequences (replacement mode) --

    #[test]
    fn invalid_byte_ff() {
        assert_eq!(lossy(b"\xFF"), REPLACEMENT);
    }

    #[test]
    fn invalid_byte_fe() {
        assert_eq!(lossy(b"\xFE"), REPLACEMENT);
    }

    #[test]
    fn invalid_continuation_byte_standalone() {
        // A continuation byte with no lead byte before it.
        assert_eq!(lossy(b"\x80"), REPLACEMENT);
    }

    #[test]
    fn overlong_two_byte() {
        // C0 AF is an overlong form of U+002F ('/'). C0 is never a valid lead
        // byte, and AF is then a continuation byte without a lead, so each
        // byte yields its own U+FFFD.
        assert_eq!(lossy(b"\xC0\xAF"), REPLACEMENT.repeat(2));
    }

    #[test]
    fn truncated_two_byte() {
        // Lead byte C3 with its continuation missing.
        assert_eq!(lossy(b"\xC3"), REPLACEMENT);
    }

    #[test]
    fn truncated_three_byte() {
        // E4 B8 with the third byte missing.
        assert_eq!(lossy(b"\xE4\xB8"), REPLACEMENT);
    }

    #[test]
    fn truncated_four_byte() {
        // F0 9F 98 with the fourth byte missing.
        assert_eq!(lossy(b"\xF0\x9F\x98"), REPLACEMENT);
    }

    #[test]
    fn surrogate_half_rejected() {
        // ED A0 80 would be U+D800, but surrogates are not valid in UTF-8:
        // after ED the upper boundary is 0x9F, so A0 is refused.
        assert_eq!(lossy(b"\xED\xA0\x80"), REPLACEMENT.repeat(3));
    }

    #[test]
    fn invalid_continuation_mid_sequence() {
        // E4 wants a continuation byte but gets ASCII 'A': one error, then
        // the 'A' is decoded normally.
        assert_eq!(lossy(b"\xE4\x41"), "\u{FFFD}A");
    }

    #[test]
    fn invalid_between_valid() {
        // 'A', an invalid FF, then 'B'.
        assert_eq!(lossy(b"\x41\xFF\x42"), "A\u{FFFD}B");
    }

    #[test]
    fn multiple_errors_in_a_row() {
        assert_eq!(lossy(b"\xFE\xFF\xFE"), REPLACEMENT.repeat(3));
    }

    // -- Fatal mode --

    #[test]
    fn fatal_valid() {
        let decoded = strict(b"Hello").unwrap();
        assert_eq!(decoded, "Hello");
    }

    #[test]
    fn fatal_invalid() {
        let outcome = strict(b"\x41\xFF");
        assert!(matches!(
            outcome,
            Err(EncodingError::InvalidSequence {
                encoding: "UTF-8",
                position: 1
            })
        ));
    }

    #[test]
    fn fatal_truncated() {
        let outcome = strict(b"\xC3");
        assert!(matches!(
            outcome,
            Err(EncodingError::InvalidSequence {
                encoding: "UTF-8",
                ..
            })
        ));
    }

    // -- Encoder --

    #[test]
    fn encode_ascii() {
        assert_eq!(encode_utf8("Hello"), b"Hello");
    }

    #[test]
    fn encode_multibyte() {
        assert_eq!(encode_utf8("\u{00E9}"), b"\xC3\xA9");
    }

    #[test]
    fn encode_emoji() {
        assert_eq!(encode_utf8("\u{1F600}"), b"\xF0\x9F\x98\x80");
    }

    #[test]
    fn encode_empty() {
        assert_eq!(encode_utf8(""), b"");
    }

    #[test]
    fn roundtrip() {
        let original = "Hello \u{4E16}\u{754C} \u{1F600}";
        assert_eq!(lossy(&encode_utf8(original)), original);
    }

    // -- Edge cases --

    #[test]
    fn max_two_byte() {
        // DF BF encodes U+07FF.
        assert_eq!(lossy(b"\xDF\xBF"), "\u{07FF}");
    }

    #[test]
    fn min_three_byte() {
        // E0 A0 80 encodes U+0800.
        assert_eq!(lossy(b"\xE0\xA0\x80"), "\u{0800}");
    }

    #[test]
    fn max_three_byte() {
        // EF BF BF encodes U+FFFF.
        assert_eq!(lossy(b"\xEF\xBF\xBF"), "\u{FFFF}");
    }

    #[test]
    fn min_four_byte() {
        // F0 90 80 80 encodes U+10000.
        assert_eq!(lossy(b"\xF0\x90\x80\x80"), "\u{10000}");
    }

    #[test]
    fn max_unicode() {
        // F4 8F BF BF encodes U+10FFFF.
        assert_eq!(lossy(b"\xF4\x8F\xBF\xBF"), "\u{10FFFF}");
    }

    #[test]
    fn above_max_unicode_rejected() {
        // F4 90 would begin U+110000, past the Unicode maximum: after F4 the
        // upper boundary is 0x8F, so 90 is refused.
        assert_eq!(lossy(b"\xF4\x90\x80\x80"), REPLACEMENT.repeat(4));
    }

    #[test]
    fn overlong_three_byte_rejected() {
        // After E0 the lower boundary is 0xA0, so E0 80 80 is refused.
        assert_eq!(lossy(b"\xE0\x80\x80"), REPLACEMENT.repeat(3));
    }

    #[test]
    fn overlong_four_byte_rejected() {
        // After F0 the lower boundary is 0x90, so F0 80 80 80 is refused.
        assert_eq!(lossy(b"\xF0\x80\x80\x80"), REPLACEMENT.repeat(4));
    }
}