we (web engine): Experimental web browser project to understand the limits of Claude
1//! UTF-8 decoder and encoder per WHATWG Encoding Standard.
2
3use crate::error::{EncodingError, Result};
4
/// How the decoder reacts when the input contains a malformed byte sequence.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub(crate) enum ErrorMode {
    /// Substitute U+FFFD for each malformed sequence and keep decoding.
    Replacement,
    /// Stop at the first malformed sequence and report it as an error.
    Fatal,
}
11
12/// Decode a byte slice as UTF-8.
13///
14/// In replacement mode, invalid sequences are replaced with U+FFFD.
15/// In fatal mode, the first invalid sequence causes an error.
16pub(crate) fn decode_utf8(bytes: &[u8], mode: ErrorMode) -> Result<String> {
17 // Strip UTF-8 BOM if present
18 let bytes = if bytes.len() >= 3 && bytes[0] == 0xEF && bytes[1] == 0xBB && bytes[2] == 0xBF {
19 &bytes[3..]
20 } else {
21 bytes
22 };
23
24 let mut output = String::with_capacity(bytes.len());
25 let mut decoder = Utf8Decoder::new();
26 let mut i = 0;
27
28 while i < bytes.len() {
29 match decoder.process_byte(bytes[i]) {
30 DecoderResult::CodePoint(ch) => {
31 output.push(ch);
32 i += 1;
33 }
34 DecoderResult::Error(error_pos) => {
35 if mode == ErrorMode::Fatal {
36 return Err(EncodingError::InvalidSequence {
37 encoding: "UTF-8",
38 position: error_pos,
39 });
40 }
41 output.push('\u{FFFD}');
42 i += 1;
43 }
44 DecoderResult::ErrorPrepend(error_pos) => {
45 if mode == ErrorMode::Fatal {
46 return Err(EncodingError::InvalidSequence {
47 encoding: "UTF-8",
48 position: error_pos,
49 });
50 }
51 output.push('\u{FFFD}');
52 // Do NOT advance i — re-process this byte
53 }
54 DecoderResult::Continue => {
55 i += 1;
56 }
57 }
58 }
59
60 // Handle incomplete sequence at end of input
61 if decoder.bytes_needed > 0 {
62 if mode == ErrorMode::Fatal {
63 return Err(EncodingError::InvalidSequence {
64 encoding: "UTF-8",
65 position: bytes.len().saturating_sub(decoder.bytes_seen as usize),
66 });
67 }
68 output.push('\u{FFFD}');
69 }
70
71 Ok(output)
72}
73
/// Encode a string as UTF-8 bytes.
///
/// A `&str` already holds valid UTF-8, so encoding amounts to copying its bytes into a
/// freshly allocated vector.
pub(crate) fn encode_utf8(text: &str) -> Vec<u8> {
    Vec::from(text.as_bytes())
}
80
81// ---------------------------------------------------------------------------
82// Streaming UTF-8 decoder (WHATWG Encoding Standard §8.1.1)
83// ---------------------------------------------------------------------------
84
/// Outcome of feeding one byte to the streaming decoder.
enum DecoderResult {
    /// The byte completed a code point, carried here.
    CodePoint(char),
    /// The byte is invalid and has been consumed; carries the byte position of the
    /// error. The caller moves on to the next byte.
    Error(usize),
    /// The sequence in progress is invalid and the byte was not consumed; carries the
    /// byte position of the error. The caller feeds the same byte again.
    ErrorPrepend(usize),
    /// The byte was accepted but more bytes are needed to finish the code point.
    Continue,
}
95
/// State of the byte-at-a-time UTF-8 decoder.
struct Utf8Decoder {
    /// Bits of the code point accumulated so far for the sequence in progress.
    code_point: u32,
    /// Continuation bytes accepted so far for the sequence in progress.
    bytes_seen: u8,
    /// Continuation bytes the sequence in progress requires; zero when no sequence is
    /// in progress.
    bytes_needed: u8,
    /// Smallest value accepted for the next continuation byte.
    lower_boundary: u8,
    /// Largest value accepted for the next continuation byte.
    upper_boundary: u8,
    /// Position of the start of the current multi-byte sequence.
    sequence_start: usize,
    /// Total bytes processed so far.
    position: usize,
}
107
108impl Utf8Decoder {
109 fn new() -> Self {
110 Self {
111 code_point: 0,
112 bytes_seen: 0,
113 bytes_needed: 0,
114 lower_boundary: 0x80,
115 upper_boundary: 0xBF,
116 sequence_start: 0,
117 position: 0,
118 }
119 }
120
121 fn process_byte(&mut self, byte: u8) -> DecoderResult {
122 let pos = self.position;
123 self.position += 1;
124
125 if self.bytes_needed == 0 {
126 match byte {
127 0x00..=0x7F => DecoderResult::CodePoint(byte as char),
128 0xC2..=0xDF => {
129 self.bytes_needed = 1;
130 self.code_point = (byte & 0x1F) as u32;
131 self.sequence_start = pos;
132 DecoderResult::Continue
133 }
134 0xE0 => {
135 self.bytes_needed = 2;
136 self.lower_boundary = 0xA0;
137 self.code_point = (byte & 0x0F) as u32;
138 self.sequence_start = pos;
139 DecoderResult::Continue
140 }
141 0xE1..=0xEC | 0xEE..=0xEF => {
142 self.bytes_needed = 2;
143 self.code_point = (byte & 0x0F) as u32;
144 self.sequence_start = pos;
145 DecoderResult::Continue
146 }
147 0xED => {
148 self.bytes_needed = 2;
149 self.upper_boundary = 0x9F;
150 self.code_point = (byte & 0x0F) as u32;
151 self.sequence_start = pos;
152 DecoderResult::Continue
153 }
154 0xF0 => {
155 self.bytes_needed = 3;
156 self.lower_boundary = 0x90;
157 self.code_point = (byte & 0x07) as u32;
158 self.sequence_start = pos;
159 DecoderResult::Continue
160 }
161 0xF1..=0xF3 => {
162 self.bytes_needed = 3;
163 self.code_point = (byte & 0x07) as u32;
164 self.sequence_start = pos;
165 DecoderResult::Continue
166 }
167 0xF4 => {
168 self.bytes_needed = 3;
169 self.upper_boundary = 0x8F;
170 self.code_point = (byte & 0x07) as u32;
171 self.sequence_start = pos;
172 DecoderResult::Continue
173 }
174 _ => {
175 // 0x80..=0xC1, 0xF5..=0xFF: invalid lead byte
176 DecoderResult::Error(pos)
177 }
178 }
179 } else {
180 // Expecting continuation byte
181 if byte < self.lower_boundary || byte > self.upper_boundary {
182 // Invalid continuation — reset and prepend byte
183 let err_pos = self.sequence_start;
184 self.reset();
185 self.position -= 1; // will be re-processed
186 return DecoderResult::ErrorPrepend(err_pos);
187 }
188
189 // Valid continuation byte
190 self.lower_boundary = 0x80;
191 self.upper_boundary = 0xBF;
192 self.code_point = (self.code_point << 6) | (byte & 0x3F) as u32;
193 self.bytes_seen += 1;
194
195 if self.bytes_seen == self.bytes_needed {
196 let cp = self.code_point;
197 self.reset();
198 // The WHATWG state machine guarantees valid scalar values here,
199 // but use fallback for defense-in-depth.
200 let ch = char::from_u32(cp).unwrap_or('\u{FFFD}');
201 DecoderResult::CodePoint(ch)
202 } else {
203 DecoderResult::Continue
204 }
205 }
206 }
207
208 fn reset(&mut self) {
209 self.code_point = 0;
210 self.bytes_seen = 0;
211 self.bytes_needed = 0;
212 self.lower_boundary = 0x80;
213 self.upper_boundary = 0xBF;
214 }
215}
216
217// ---------------------------------------------------------------------------
218// Tests
219// ---------------------------------------------------------------------------
220
#[cfg(test)]
mod tests {
    use super::*;

    /// Decodes in replacement mode, which is expected never to fail.
    fn lossy(input: &[u8]) -> String {
        decode_utf8(input, ErrorMode::Replacement).unwrap()
    }

    /// Decodes in fatal mode.
    fn strict(input: &[u8]) -> Result<String> {
        decode_utf8(input, ErrorMode::Fatal)
    }

    /// A string made of `count` U+FFFD replacement characters.
    fn replacements(count: usize) -> String {
        "\u{FFFD}".repeat(count)
    }

    // -- Basic ASCII --

    #[test]
    fn ascii_roundtrip() {
        let text = "Hello, world!";
        assert_eq!(lossy(text.as_bytes()), text);
    }

    #[test]
    fn empty_input() {
        assert_eq!(lossy(b""), String::new());
    }

    #[test]
    fn null_byte() {
        assert_eq!(lossy(b"\x00"), "\0");
    }

    // -- Multi-byte sequences --

    #[test]
    fn two_byte_sequence() {
        // C3 A9 encodes U+00E9 (e with acute).
        assert_eq!(lossy(b"\xC3\xA9"), "\u{00E9}");
    }

    #[test]
    fn three_byte_sequence() {
        // E4 B8 96 encodes U+4E16 (a CJK character).
        assert_eq!(lossy(b"\xE4\xB8\x96"), "\u{4E16}");
    }

    #[test]
    fn four_byte_sequence() {
        // F0 9F 98 80 encodes U+1F600 (grinning face).
        assert_eq!(lossy(b"\xF0\x9F\x98\x80"), "\u{1F600}");
    }

    #[test]
    fn mixed_ascii_and_multibyte() {
        // "Caf" followed by the two-byte encoding of U+00E9.
        assert_eq!(lossy(b"Caf\xC3\xA9"), "Caf\u{00E9}");
    }

    // -- BOM handling --

    #[test]
    fn bom_stripped() {
        // The UTF-8 BOM in front of "A" must not show up in the output.
        assert_eq!(lossy(b"\xEF\xBB\xBFA"), "A");
    }

    #[test]
    fn bom_only() {
        assert_eq!(lossy(b"\xEF\xBB\xBF"), "");
    }

    // -- Invalid sequences (replacement mode) --

    #[test]
    fn invalid_byte_ff() {
        assert_eq!(lossy(b"\xFF"), replacements(1));
    }

    #[test]
    fn invalid_byte_fe() {
        assert_eq!(lossy(b"\xFE"), replacements(1));
    }

    #[test]
    fn invalid_continuation_byte_standalone() {
        // A continuation byte with no lead byte in front of it.
        assert_eq!(lossy(b"\x80"), replacements(1));
    }

    #[test]
    fn overlong_two_byte() {
        // C0 AF is an overlong encoding of U+002F ('/'). C0 is never a valid lead
        // byte, and AF is then a continuation byte without a lead, so each of the two
        // bytes yields its own U+FFFD.
        assert_eq!(lossy(b"\xC0\xAF"), replacements(2));
    }

    #[test]
    fn truncated_two_byte() {
        // Lead byte C3 with its continuation byte missing.
        assert_eq!(lossy(b"\xC3"), replacements(1));
    }

    #[test]
    fn truncated_three_byte() {
        // E4 B8 with the third byte missing.
        assert_eq!(lossy(b"\xE4\xB8"), replacements(1));
    }

    #[test]
    fn truncated_four_byte() {
        // F0 9F 98 with the fourth byte missing.
        assert_eq!(lossy(b"\xF0\x9F\x98"), replacements(1));
    }

    #[test]
    fn surrogate_half_rejected() {
        // ED A0 80 would be U+D800, but surrogates are not valid in UTF-8: after ED
        // the upper boundary is 0x9F, so A0 is rejected.
        assert_eq!(lossy(b"\xED\xA0\x80"), replacements(3));
    }

    #[test]
    fn invalid_continuation_mid_sequence() {
        // E4 expects a continuation byte but gets ASCII 'A': one error, then the 'A'
        // is decoded on its own.
        assert_eq!(lossy(b"\xE4A"), "\u{FFFD}A");
    }

    #[test]
    fn invalid_between_valid() {
        // Valid 'A', invalid FF, valid 'B'.
        assert_eq!(lossy(b"A\xFFB"), "A\u{FFFD}B");
    }

    #[test]
    fn multiple_errors_in_a_row() {
        assert_eq!(lossy(b"\xFE\xFF\xFE"), replacements(3));
    }

    // -- Fatal mode --

    #[test]
    fn fatal_valid() {
        let decoded = strict(b"Hello").unwrap();
        assert_eq!(decoded, "Hello");
    }

    #[test]
    fn fatal_invalid() {
        let outcome = strict(b"A\xFF");
        assert!(matches!(
            outcome,
            Err(EncodingError::InvalidSequence {
                encoding: "UTF-8",
                position: 1
            })
        ));
    }

    #[test]
    fn fatal_truncated() {
        let outcome = strict(b"\xC3");
        assert!(matches!(
            outcome,
            Err(EncodingError::InvalidSequence {
                encoding: "UTF-8",
                ..
            })
        ));
    }

    // -- Encoder --

    #[test]
    fn encode_ascii() {
        assert_eq!(encode_utf8("Hello"), b"Hello".to_vec());
    }

    #[test]
    fn encode_multibyte() {
        assert_eq!(encode_utf8("\u{00E9}"), vec![0xC3, 0xA9]);
    }

    #[test]
    fn encode_emoji() {
        assert_eq!(encode_utf8("\u{1F600}"), vec![0xF0, 0x9F, 0x98, 0x80]);
    }

    #[test]
    fn encode_empty() {
        assert_eq!(encode_utf8(""), Vec::<u8>::new());
    }

    #[test]
    fn roundtrip() {
        let original = "Hello \u{4E16}\u{754C} \u{1F600}";
        assert_eq!(lossy(&encode_utf8(original)), original);
    }

    // -- Edge cases --

    #[test]
    fn max_two_byte() {
        // DF BF encodes U+07FF.
        assert_eq!(lossy(b"\xDF\xBF"), "\u{07FF}");
    }

    #[test]
    fn min_three_byte() {
        // E0 A0 80 encodes U+0800.
        assert_eq!(lossy(b"\xE0\xA0\x80"), "\u{0800}");
    }

    #[test]
    fn max_three_byte() {
        // EF BF BF encodes U+FFFF.
        assert_eq!(lossy(b"\xEF\xBF\xBF"), "\u{FFFF}");
    }

    #[test]
    fn min_four_byte() {
        // F0 90 80 80 encodes U+10000.
        assert_eq!(lossy(b"\xF0\x90\x80\x80"), "\u{10000}");
    }

    #[test]
    fn max_unicode() {
        // F4 8F BF BF encodes U+10FFFF.
        assert_eq!(lossy(b"\xF4\x8F\xBF\xBF"), "\u{10FFFF}");
    }

    #[test]
    fn above_max_unicode_rejected() {
        // F4 90 would begin U+110000, beyond the Unicode range: after F4 the upper
        // boundary is 0x8F, so 90 is rejected.
        assert_eq!(lossy(b"\xF4\x90\x80\x80"), replacements(4));
    }

    #[test]
    fn overlong_three_byte_rejected() {
        // After E0 the lower boundary is 0xA0, so E0 80 80 is rejected.
        assert_eq!(lossy(b"\xE0\x80\x80"), replacements(3));
    }

    #[test]
    fn overlong_four_byte_rejected() {
        // After F0 the lower boundary is 0x90, so F0 80 80 80 is rejected.
        assert_eq!(lossy(b"\xF0\x80\x80\x80"), replacements(4));
    }
}