we (web engine): Experimental web browser project to understand the limits of Claude
1//! WHATWG Encoding Standard — UTF-8 and UTF-16 codecs, pure Rust.
2
3pub mod error;
4mod utf16;
5mod utf8;
6
7use error::{EncodingError, Result};
8use utf8::ErrorMode;
9
10// ---------------------------------------------------------------------------
11// Encoding enum
12// ---------------------------------------------------------------------------
13
/// A text encoding known to this crate, named after the WHATWG Encoding Standard.
///
/// Only the Unicode encodings are modelled: UTF-8 and both UTF-16 byte orders.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Encoding {
    /// UTF-8.
    Utf8,
    /// UTF-16, big-endian byte order.
    Utf16Be,
    /// UTF-16, little-endian byte order.
    Utf16Le,
}
21
22impl Encoding {
23 /// Canonical name per WHATWG spec.
24 pub fn name(&self) -> &'static str {
25 match self {
26 Self::Utf8 => "UTF-8",
27 Self::Utf16Be => "UTF-16BE",
28 Self::Utf16Le => "UTF-16LE",
29 }
30 }
31}
32
33// ---------------------------------------------------------------------------
34// Label lookup (WHATWG Encoding Standard §4.2)
35// ---------------------------------------------------------------------------
36
37/// WHATWG encoding label mappings.
38/// Labels are stored in lowercase; lookup normalizes input to lowercase.
39const ENCODING_LABELS: &[(&str, Encoding)] = &[
40 // UTF-8 labels
41 ("unicode-1-1-utf-8", Encoding::Utf8),
42 ("unicode11utf8", Encoding::Utf8),
43 ("unicode20utf8", Encoding::Utf8),
44 ("utf-8", Encoding::Utf8),
45 ("utf8", Encoding::Utf8),
46 ("x-unicode20utf8", Encoding::Utf8),
47 // UTF-16BE labels
48 ("unicodefffe", Encoding::Utf16Be),
49 ("utf-16be", Encoding::Utf16Be),
50 // UTF-16LE labels
51 ("csunicode", Encoding::Utf16Le),
52 ("iso-10646-ucs-2", Encoding::Utf16Le),
53 ("ucs-2", Encoding::Utf16Le),
54 ("unicode", Encoding::Utf16Le),
55 ("unicodefeff", Encoding::Utf16Le),
56 ("utf-16", Encoding::Utf16Le),
57 ("utf-16le", Encoding::Utf16Le),
58];
59
60/// Look up an encoding by its WHATWG label.
61///
62/// Strips leading/trailing ASCII whitespace and compares case-insensitively,
63/// per the WHATWG Encoding Standard.
64pub fn lookup(label: &str) -> Option<Encoding> {
65 let trimmed = trim_ascii_whitespace(label);
66 if trimmed.is_empty() {
67 return None;
68 }
69 for &(name, enc) in ENCODING_LABELS {
70 if ascii_eq_ignore_case(trimmed, name) {
71 return Some(enc);
72 }
73 }
74 None
75}
76
77/// Sniff BOM from the start of a byte slice.
78///
79/// Returns the detected encoding (if any) and the remaining bytes after the BOM.
80pub fn bom_sniff(bytes: &[u8]) -> (Option<Encoding>, &[u8]) {
81 if bytes.len() >= 3 && bytes[0] == 0xEF && bytes[1] == 0xBB && bytes[2] == 0xBF {
82 (Some(Encoding::Utf8), &bytes[3..])
83 } else if bytes.len() >= 2 && bytes[0] == 0xFE && bytes[1] == 0xFF {
84 (Some(Encoding::Utf16Be), &bytes[2..])
85 } else if bytes.len() >= 2 && bytes[0] == 0xFF && bytes[1] == 0xFE {
86 (Some(Encoding::Utf16Le), &bytes[2..])
87 } else {
88 (None, bytes)
89 }
90}
91
92// ---------------------------------------------------------------------------
93// Public API
94// ---------------------------------------------------------------------------
95
96/// Decode bytes to a `String` using the given encoding.
97///
98/// Invalid sequences are replaced with U+FFFD (replacement mode per WHATWG spec).
99pub fn decode(bytes: &[u8], encoding: Encoding) -> String {
100 // Replacement mode never fails
101 match encoding {
102 Encoding::Utf8 => utf8::decode_utf8(bytes, ErrorMode::Replacement).unwrap(),
103 Encoding::Utf16Le => utf16::decode_utf16le(bytes, ErrorMode::Replacement).unwrap(),
104 Encoding::Utf16Be => utf16::decode_utf16be(bytes, ErrorMode::Replacement).unwrap(),
105 }
106}
107
108/// Decode bytes to a `String`, returning an error on any invalid sequence.
109///
110/// Fatal mode per WHATWG spec — returns `Err` on the first invalid byte sequence.
111pub fn decode_strict(bytes: &[u8], encoding: Encoding) -> Result<String> {
112 match encoding {
113 Encoding::Utf8 => utf8::decode_utf8(bytes, ErrorMode::Fatal),
114 Encoding::Utf16Le => utf16::decode_utf16le(bytes, ErrorMode::Fatal),
115 Encoding::Utf16Be => utf16::decode_utf16be(bytes, ErrorMode::Fatal),
116 }
117}
118
119/// Encode a string to bytes using the given encoding.
120///
121/// Only UTF-8 encoding is supported for encode. Per WHATWG spec, UTF-16
122/// encodings are decode-only.
123pub fn encode(text: &str, encoding: Encoding) -> Result<Vec<u8>> {
124 match encoding {
125 Encoding::Utf8 => Ok(utf8::encode_utf8(text)),
126 Encoding::Utf16Be => Err(EncodingError::EncodeNotSupported {
127 encoding: "UTF-16BE",
128 }),
129 Encoding::Utf16Le => Err(EncodingError::EncodeNotSupported {
130 encoding: "UTF-16LE",
131 }),
132 }
133}
134
135// ---------------------------------------------------------------------------
136// Internal helpers
137// ---------------------------------------------------------------------------
138
139/// ASCII whitespace per WHATWG spec: TAB, LF, FF, CR, SPACE.
140fn trim_ascii_whitespace(s: &str) -> &str {
141 let bytes = s.as_bytes();
142 let start = bytes
143 .iter()
144 .position(|&b| !is_ascii_whitespace(b))
145 .unwrap_or(bytes.len());
146 let end = bytes
147 .iter()
148 .rposition(|&b| !is_ascii_whitespace(b))
149 .map(|p| p + 1)
150 .unwrap_or(0);
151 if start >= end {
152 return "";
153 }
154 &s[start..end]
155}
156
/// Returns `true` when `b` is TAB (0x09), LF (0x0A), FF (0x0C), CR (0x0D)
/// or SPACE (0x20).
fn is_ascii_whitespace(b: u8) -> bool {
    // The standard library method accepts exactly these five bytes. It
    // excludes vertical tab (0x0B).
    b.is_ascii_whitespace()
}
160
/// Compares two strings for equality, treating ASCII letters `A-Z` and `a-z`
/// as equal. Non-ASCII bytes must match exactly.
fn ascii_eq_ignore_case(a: &str, b: &str) -> bool {
    let (lhs, rhs) = (a.as_bytes(), b.as_bytes());
    lhs.eq_ignore_ascii_case(rhs)
}
164
165// ---------------------------------------------------------------------------
166// Tests
167// ---------------------------------------------------------------------------
168
#[cfg(test)]
mod tests {
    // Unit tests for label lookup, BOM sniffing, the top-level decode/encode
    // entry points, and the private ASCII helpers.

    use super::*;

    // -- Encoding enum --

    #[test]
    fn encoding_names() {
        assert_eq!(Encoding::Utf8.name(), "UTF-8");
        assert_eq!(Encoding::Utf16Be.name(), "UTF-16BE");
        assert_eq!(Encoding::Utf16Le.name(), "UTF-16LE");
    }

    #[test]
    fn encoding_name_round_trips_through_lookup() {
        // Each canonical name must also be an accepted label.
        for enc in [Encoding::Utf8, Encoding::Utf16Be, Encoding::Utf16Le] {
            assert_eq!(lookup(enc.name()), Some(enc));
        }
    }

    // -- Label lookup --

    #[test]
    fn lookup_utf8_labels() {
        assert_eq!(lookup("utf-8"), Some(Encoding::Utf8));
        assert_eq!(lookup("UTF-8"), Some(Encoding::Utf8));
        assert_eq!(lookup("utf8"), Some(Encoding::Utf8));
        assert_eq!(lookup("Utf8"), Some(Encoding::Utf8));
        assert_eq!(lookup("unicode-1-1-utf-8"), Some(Encoding::Utf8));
        assert_eq!(lookup("x-unicode20utf8"), Some(Encoding::Utf8));
    }

    #[test]
    fn lookup_utf16_labels() {
        assert_eq!(lookup("utf-16be"), Some(Encoding::Utf16Be));
        assert_eq!(lookup("UTF-16BE"), Some(Encoding::Utf16Be));
        assert_eq!(lookup("unicodefffe"), Some(Encoding::Utf16Be));
        assert_eq!(lookup("utf-16le"), Some(Encoding::Utf16Le));
        assert_eq!(lookup("utf-16"), Some(Encoding::Utf16Le));
        assert_eq!(lookup("unicode"), Some(Encoding::Utf16Le));
        assert_eq!(lookup("ucs-2"), Some(Encoding::Utf16Le));
        assert_eq!(lookup("iso-10646-ucs-2"), Some(Encoding::Utf16Le));
    }

    #[test]
    fn lookup_every_table_label() {
        // Covers labels the hand-written cases skip (csunicode, unicodefeff,
        // unicode11utf8, unicode20utf8, ...), in both lower and upper case.
        for &(label, enc) in ENCODING_LABELS {
            assert_eq!(lookup(label), Some(enc), "label {:?}", label);
            let upper = label.to_ascii_uppercase();
            assert_eq!(lookup(&upper), Some(enc), "label {:?}", upper);
        }
    }

    #[test]
    fn lookup_with_whitespace() {
        assert_eq!(lookup(" utf-8 "), Some(Encoding::Utf8));
        assert_eq!(lookup("\tutf-8\n"), Some(Encoding::Utf8));
        assert_eq!(lookup("\r\nutf-16le\r\n"), Some(Encoding::Utf16Le));
        // Form feed is ASCII whitespace too.
        assert_eq!(lookup("\x0Cutf-8\x0C"), Some(Encoding::Utf8));
    }

    #[test]
    fn lookup_does_not_trim_other_whitespace() {
        // Vertical tab and non-ASCII spaces are not ASCII whitespace here.
        assert_eq!(lookup("\x0Butf-8"), None);
        assert_eq!(lookup("utf-8\u{00A0}"), None);
        // Interior whitespace is never removed.
        assert_eq!(lookup("utf -8"), None);
    }

    #[test]
    fn lookup_unknown() {
        assert_eq!(lookup("latin1"), None);
        assert_eq!(lookup(""), None);
        assert_eq!(lookup(" "), None);
        assert_eq!(lookup("utf-99"), None);
    }

    // -- BOM sniffing --

    #[test]
    fn bom_utf8() {
        let (enc, rest) = bom_sniff(&[0xEF, 0xBB, 0xBF, 0x41]);
        assert_eq!(enc, Some(Encoding::Utf8));
        assert_eq!(rest, &[0x41]);
    }

    #[test]
    fn bom_utf16be() {
        let (enc, rest) = bom_sniff(&[0xFE, 0xFF, 0x00, 0x41]);
        assert_eq!(enc, Some(Encoding::Utf16Be));
        assert_eq!(rest, &[0x00, 0x41]);
    }

    #[test]
    fn bom_utf16le() {
        let (enc, rest) = bom_sniff(&[0xFF, 0xFE, 0x41, 0x00]);
        assert_eq!(enc, Some(Encoding::Utf16Le));
        assert_eq!(rest, &[0x41, 0x00]);
    }

    #[test]
    fn bom_only() {
        // A BOM with nothing after it yields an empty remainder.
        let (enc, rest) = bom_sniff(&[0xEF, 0xBB, 0xBF]);
        assert_eq!(enc, Some(Encoding::Utf8));
        assert!(rest.is_empty());

        let (enc, rest) = bom_sniff(&[0xFE, 0xFF]);
        assert_eq!(enc, Some(Encoding::Utf16Be));
        assert!(rest.is_empty());

        let (enc, rest) = bom_sniff(&[0xFF, 0xFE]);
        assert_eq!(enc, Some(Encoding::Utf16Le));
        assert!(rest.is_empty());
    }

    #[test]
    fn bom_none() {
        let data = [0x41, 0x42, 0x43];
        let (enc, rest) = bom_sniff(&data);
        assert_eq!(enc, None);
        assert_eq!(rest, &data);
    }

    #[test]
    fn bom_empty() {
        let (enc, rest) = bom_sniff(&[]);
        assert_eq!(enc, None);
        assert_eq!(rest, &[] as &[u8]);
    }

    #[test]
    fn bom_short() {
        let (enc, rest) = bom_sniff(&[0xEF, 0xBB]);
        assert_eq!(enc, None);
        assert_eq!(rest, &[0xEF, 0xBB]);

        // A single leading BOM byte is not a BOM.
        for first in [0xEFu8, 0xFE, 0xFF] {
            let data = [first];
            let (enc, rest) = bom_sniff(&data);
            assert_eq!(enc, None);
            assert_eq!(rest, &data);
        }
    }

    // -- Top-level decode --

    #[test]
    fn decode_utf8_basic() {
        assert_eq!(decode(b"Hello", Encoding::Utf8), "Hello");
    }

    #[test]
    fn decode_utf8_invalid_replaces() {
        assert_eq!(decode(&[0xFF], Encoding::Utf8), "\u{FFFD}");
    }

    #[test]
    fn decode_utf16le_basic() {
        assert_eq!(decode(&[0x41, 0x00], Encoding::Utf16Le), "A");
    }

    #[test]
    fn decode_utf16be_basic() {
        assert_eq!(decode(&[0x00, 0x41], Encoding::Utf16Be), "A");
    }

    // -- Top-level decode_strict --

    #[test]
    fn decode_strict_valid() {
        assert_eq!(decode_strict(b"Hello", Encoding::Utf8).unwrap(), "Hello");
    }

    #[test]
    fn decode_strict_invalid() {
        assert!(decode_strict(&[0xFF], Encoding::Utf8).is_err());
    }

    // -- Top-level encode --

    #[test]
    fn encode_utf8_basic() {
        assert_eq!(encode("Hello", Encoding::Utf8).unwrap(), b"Hello");
    }

    #[test]
    fn encode_utf16_not_supported() {
        assert!(matches!(
            encode("Hello", Encoding::Utf16Le),
            Err(EncodingError::EncodeNotSupported {
                encoding: "UTF-16LE"
            })
        ));
        assert!(matches!(
            encode("Hello", Encoding::Utf16Be),
            Err(EncodingError::EncodeNotSupported {
                encoding: "UTF-16BE"
            })
        ));
    }

    // -- Trim helpers --

    #[test]
    fn trim_ascii_whitespace_basic() {
        assert_eq!(trim_ascii_whitespace(" hello "), "hello");
        assert_eq!(trim_ascii_whitespace("hello"), "hello");
        assert_eq!(trim_ascii_whitespace(""), "");
        assert_eq!(trim_ascii_whitespace(" "), "");
        assert_eq!(trim_ascii_whitespace("\t\nhello\r\n"), "hello");
    }

    #[test]
    fn trim_ascii_whitespace_edge_cases() {
        assert_eq!(trim_ascii_whitespace("\x0Chello\x0C"), "hello");
        assert_eq!(trim_ascii_whitespace("  a b  "), "a b");
        // Only the five ASCII whitespace bytes are stripped.
        assert_eq!(trim_ascii_whitespace("\x0Bhello"), "\x0Bhello");
        assert_eq!(
            trim_ascii_whitespace("\u{00A0}hello\u{00A0}"),
            "\u{00A0}hello\u{00A0}"
        );
        // Multi-byte content next to trimmed whitespace stays intact.
        assert_eq!(trim_ascii_whitespace(" \u{00E9} "), "\u{00E9}");
    }

    #[test]
    fn is_ascii_whitespace_exact_set() {
        for b in 0u8..=u8::MAX {
            let expected = [0x09u8, 0x0A, 0x0C, 0x0D, 0x20].contains(&b);
            assert_eq!(is_ascii_whitespace(b), expected, "byte {:#04x}", b);
        }
    }

    #[test]
    fn ascii_eq_ignore_case_basic() {
        assert!(ascii_eq_ignore_case("utf-8", "UTF-8"));
        assert!(ascii_eq_ignore_case("Utf-8", "utf-8"));
        assert!(!ascii_eq_ignore_case("utf-8", "utf-9"));
        assert!(!ascii_eq_ignore_case("utf-8", "utf-8x"));
    }

    #[test]
    fn ascii_eq_ignore_case_non_ascii() {
        // Case folding applies to ASCII letters only.
        assert!(ascii_eq_ignore_case("", ""));
        assert!(ascii_eq_ignore_case("\u{00E9}X", "\u{00E9}x"));
        assert!(!ascii_eq_ignore_case("\u{00C9}", "\u{00E9}"));
        assert!(!ascii_eq_ignore_case("K", "\u{212A}"));
    }
}