we (web engine): Experimental web browser project to understand the limits of Claude
1//! WHATWG Encoding Standard — UTF-8, UTF-16, and legacy single-byte codecs, pure Rust.
2
3pub mod error;
4mod single_byte;
5pub mod sniff;
6mod utf16;
7mod utf8;
8
9use error::{EncodingError, Result};
10use utf8::ErrorMode;
11
12// ---------------------------------------------------------------------------
13// Encoding enum
14// ---------------------------------------------------------------------------
15
/// The text encodings this module knows about, following the encodings of the
/// WHATWG Encoding Standard.
///
/// Use [`Encoding::name`] to obtain the canonical name of a variant and
/// [`lookup`] to resolve a label string to a variant.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub enum Encoding {
    // --- Unicode encodings ---
    /// UTF-8.
    Utf8,
    /// UTF-16, big-endian.
    Utf16Be,
    /// UTF-16, little-endian.
    Utf16Le,

    // --- Legacy single-byte encodings ---
    /// IBM866.
    Ibm866,
    /// ISO-8859-2.
    Iso8859_2,
    /// ISO-8859-3.
    Iso8859_3,
    /// ISO-8859-4.
    Iso8859_4,
    /// ISO-8859-5.
    Iso8859_5,
    /// ISO-8859-6.
    Iso8859_6,
    /// ISO-8859-7.
    Iso8859_7,
    /// ISO-8859-8.
    Iso8859_8,
    /// ISO-8859-8-I.
    Iso8859_8I,
    /// ISO-8859-10.
    Iso8859_10,
    /// ISO-8859-13.
    Iso8859_13,
    /// ISO-8859-14.
    Iso8859_14,
    /// ISO-8859-15.
    Iso8859_15,
    /// ISO-8859-16.
    Iso8859_16,
    /// KOI8-R.
    Koi8R,
    /// KOI8-U.
    Koi8U,
    /// macintosh.
    Macintosh,
    /// windows-874.
    Windows874,
    /// windows-1250.
    Windows1250,
    /// windows-1251.
    Windows1251,
    /// windows-1252.
    Windows1252,
    /// windows-1253.
    Windows1253,
    /// windows-1254.
    Windows1254,
    /// windows-1255.
    Windows1255,
    /// windows-1256.
    Windows1256,
    /// windows-1257.
    Windows1257,
    /// windows-1258.
    Windows1258,
    /// x-mac-cyrillic.
    XMacCyrillic,
}
52
53impl Encoding {
54 /// Canonical name per WHATWG spec.
55 pub fn name(&self) -> &'static str {
56 match self {
57 Self::Utf8 => "UTF-8",
58 Self::Utf16Be => "UTF-16BE",
59 Self::Utf16Le => "UTF-16LE",
60 Self::Ibm866 => "IBM866",
61 Self::Iso8859_2 => "ISO-8859-2",
62 Self::Iso8859_3 => "ISO-8859-3",
63 Self::Iso8859_4 => "ISO-8859-4",
64 Self::Iso8859_5 => "ISO-8859-5",
65 Self::Iso8859_6 => "ISO-8859-6",
66 Self::Iso8859_7 => "ISO-8859-7",
67 Self::Iso8859_8 => "ISO-8859-8",
68 Self::Iso8859_8I => "ISO-8859-8-I",
69 Self::Iso8859_10 => "ISO-8859-10",
70 Self::Iso8859_13 => "ISO-8859-13",
71 Self::Iso8859_14 => "ISO-8859-14",
72 Self::Iso8859_15 => "ISO-8859-15",
73 Self::Iso8859_16 => "ISO-8859-16",
74 Self::Koi8R => "KOI8-R",
75 Self::Koi8U => "KOI8-U",
76 Self::Macintosh => "macintosh",
77 Self::Windows874 => "windows-874",
78 Self::Windows1250 => "windows-1250",
79 Self::Windows1251 => "windows-1251",
80 Self::Windows1252 => "windows-1252",
81 Self::Windows1253 => "windows-1253",
82 Self::Windows1254 => "windows-1254",
83 Self::Windows1255 => "windows-1255",
84 Self::Windows1256 => "windows-1256",
85 Self::Windows1257 => "windows-1257",
86 Self::Windows1258 => "windows-1258",
87 Self::XMacCyrillic => "x-mac-cyrillic",
88 }
89 }
90}
91
92// ---------------------------------------------------------------------------
93// Label lookup (WHATWG Encoding Standard §4.2)
94// ---------------------------------------------------------------------------
95
96/// WHATWG encoding label mappings.
97/// Labels are stored in lowercase; lookup normalizes input to lowercase.
98const ENCODING_LABELS: &[(&str, Encoding)] = &[
99 // UTF-8 labels
100 ("unicode-1-1-utf-8", Encoding::Utf8),
101 ("unicode11utf8", Encoding::Utf8),
102 ("unicode20utf8", Encoding::Utf8),
103 ("utf-8", Encoding::Utf8),
104 ("utf8", Encoding::Utf8),
105 ("x-unicode20utf8", Encoding::Utf8),
106 // UTF-16BE labels
107 ("unicodefffe", Encoding::Utf16Be),
108 ("utf-16be", Encoding::Utf16Be),
109 // UTF-16LE labels
110 ("csunicode", Encoding::Utf16Le),
111 ("iso-10646-ucs-2", Encoding::Utf16Le),
112 ("ucs-2", Encoding::Utf16Le),
113 ("unicode", Encoding::Utf16Le),
114 ("unicodefeff", Encoding::Utf16Le),
115 ("utf-16", Encoding::Utf16Le),
116 ("utf-16le", Encoding::Utf16Le),
117 // IBM866 labels
118 ("866", Encoding::Ibm866),
119 ("cp866", Encoding::Ibm866),
120 ("csibm866", Encoding::Ibm866),
121 ("ibm866", Encoding::Ibm866),
122 // ISO-8859-2 labels
123 ("csisolatin2", Encoding::Iso8859_2),
124 ("iso-8859-2", Encoding::Iso8859_2),
125 ("iso-ir-101", Encoding::Iso8859_2),
126 ("iso8859-2", Encoding::Iso8859_2),
127 ("iso88592", Encoding::Iso8859_2),
128 ("iso_8859-2", Encoding::Iso8859_2),
129 ("iso_8859-2:1987", Encoding::Iso8859_2),
130 ("l2", Encoding::Iso8859_2),
131 ("latin2", Encoding::Iso8859_2),
132 // ISO-8859-3 labels
133 ("csisolatin3", Encoding::Iso8859_3),
134 ("iso-8859-3", Encoding::Iso8859_3),
135 ("iso-ir-109", Encoding::Iso8859_3),
136 ("iso8859-3", Encoding::Iso8859_3),
137 ("iso88593", Encoding::Iso8859_3),
138 ("iso_8859-3", Encoding::Iso8859_3),
139 ("iso_8859-3:1988", Encoding::Iso8859_3),
140 ("l3", Encoding::Iso8859_3),
141 ("latin3", Encoding::Iso8859_3),
142 // ISO-8859-4 labels
143 ("csisolatin4", Encoding::Iso8859_4),
144 ("iso-8859-4", Encoding::Iso8859_4),
145 ("iso-ir-110", Encoding::Iso8859_4),
146 ("iso8859-4", Encoding::Iso8859_4),
147 ("iso88594", Encoding::Iso8859_4),
148 ("iso_8859-4", Encoding::Iso8859_4),
149 ("iso_8859-4:1988", Encoding::Iso8859_4),
150 ("l4", Encoding::Iso8859_4),
151 ("latin4", Encoding::Iso8859_4),
152 // ISO-8859-5 labels
153 ("csisolatincyrillic", Encoding::Iso8859_5),
154 ("cyrillic", Encoding::Iso8859_5),
155 ("iso-8859-5", Encoding::Iso8859_5),
156 ("iso-ir-144", Encoding::Iso8859_5),
157 ("iso8859-5", Encoding::Iso8859_5),
158 ("iso88595", Encoding::Iso8859_5),
159 ("iso_8859-5", Encoding::Iso8859_5),
160 ("iso_8859-5:1988", Encoding::Iso8859_5),
161 // ISO-8859-6 labels
162 ("arabic", Encoding::Iso8859_6),
163 ("asmo-708", Encoding::Iso8859_6),
164 ("csiso88596e", Encoding::Iso8859_6),
165 ("csiso88596i", Encoding::Iso8859_6),
166 ("csisolatinarabic", Encoding::Iso8859_6),
167 ("ecma-114", Encoding::Iso8859_6),
168 ("iso-8859-6", Encoding::Iso8859_6),
169 ("iso-8859-6-e", Encoding::Iso8859_6),
170 ("iso-8859-6-i", Encoding::Iso8859_6),
171 ("iso-ir-127", Encoding::Iso8859_6),
172 ("iso8859-6", Encoding::Iso8859_6),
173 ("iso88596", Encoding::Iso8859_6),
174 ("iso_8859-6", Encoding::Iso8859_6),
175 ("iso_8859-6:1987", Encoding::Iso8859_6),
176 // ISO-8859-7 labels
177 ("csisolatingreek", Encoding::Iso8859_7),
178 ("ecma-118", Encoding::Iso8859_7),
179 ("elot_928", Encoding::Iso8859_7),
180 ("greek", Encoding::Iso8859_7),
181 ("greek8", Encoding::Iso8859_7),
182 ("iso-8859-7", Encoding::Iso8859_7),
183 ("iso-ir-126", Encoding::Iso8859_7),
184 ("iso8859-7", Encoding::Iso8859_7),
185 ("iso88597", Encoding::Iso8859_7),
186 ("iso_8859-7", Encoding::Iso8859_7),
187 ("iso_8859-7:1987", Encoding::Iso8859_7),
188 ("sun_eu_greek", Encoding::Iso8859_7),
189 // ISO-8859-8 labels
190 ("csiso88598e", Encoding::Iso8859_8),
191 ("csisolatinhebrew", Encoding::Iso8859_8),
192 ("hebrew", Encoding::Iso8859_8),
193 ("iso-8859-8", Encoding::Iso8859_8),
194 ("iso-8859-8-e", Encoding::Iso8859_8),
195 ("iso-ir-138", Encoding::Iso8859_8),
196 ("iso8859-8", Encoding::Iso8859_8),
197 ("iso88598", Encoding::Iso8859_8),
198 ("iso_8859-8", Encoding::Iso8859_8),
199 ("iso_8859-8:1988", Encoding::Iso8859_8),
200 ("visual", Encoding::Iso8859_8),
201 // ISO-8859-8-I labels
202 ("csiso88598i", Encoding::Iso8859_8I),
203 ("iso-8859-8-i", Encoding::Iso8859_8I),
204 ("logical", Encoding::Iso8859_8I),
205 // ISO-8859-10 labels
206 ("csisolatin6", Encoding::Iso8859_10),
207 ("iso-8859-10", Encoding::Iso8859_10),
208 ("iso-ir-157", Encoding::Iso8859_10),
209 ("iso8859-10", Encoding::Iso8859_10),
210 ("iso885910", Encoding::Iso8859_10),
211 ("l6", Encoding::Iso8859_10),
212 ("latin6", Encoding::Iso8859_10),
213 // ISO-8859-13 labels
214 ("iso-8859-13", Encoding::Iso8859_13),
215 ("iso8859-13", Encoding::Iso8859_13),
216 ("iso885913", Encoding::Iso8859_13),
217 // ISO-8859-14 labels
218 ("iso-8859-14", Encoding::Iso8859_14),
219 ("iso8859-14", Encoding::Iso8859_14),
220 ("iso885914", Encoding::Iso8859_14),
221 // ISO-8859-15 labels
222 ("csisolatin9", Encoding::Iso8859_15),
223 ("iso-8859-15", Encoding::Iso8859_15),
224 ("iso8859-15", Encoding::Iso8859_15),
225 ("iso885915", Encoding::Iso8859_15),
226 ("iso_8859-15", Encoding::Iso8859_15),
227 ("l9", Encoding::Iso8859_15),
228 // ISO-8859-16 labels
229 ("iso-8859-16", Encoding::Iso8859_16),
230 // KOI8-R labels
231 ("cskoi8r", Encoding::Koi8R),
232 ("koi", Encoding::Koi8R),
233 ("koi8", Encoding::Koi8R),
234 ("koi8-r", Encoding::Koi8R),
235 ("koi8_r", Encoding::Koi8R),
236 // KOI8-U labels
237 ("koi8-ru", Encoding::Koi8U),
238 ("koi8-u", Encoding::Koi8U),
239 // macintosh labels
240 ("csmacintosh", Encoding::Macintosh),
241 ("mac", Encoding::Macintosh),
242 ("macintosh", Encoding::Macintosh),
243 ("x-mac-roman", Encoding::Macintosh),
244 // windows-874 labels
245 ("dos-874", Encoding::Windows874),
246 ("iso-8859-11", Encoding::Windows874),
247 ("iso8859-11", Encoding::Windows874),
248 ("iso885911", Encoding::Windows874),
249 ("tis-620", Encoding::Windows874),
250 ("windows-874", Encoding::Windows874),
251 // windows-1250 labels
252 ("cp1250", Encoding::Windows1250),
253 ("windows-1250", Encoding::Windows1250),
254 ("x-cp1250", Encoding::Windows1250),
255 // windows-1251 labels
256 ("cp1251", Encoding::Windows1251),
257 ("windows-1251", Encoding::Windows1251),
258 ("x-cp1251", Encoding::Windows1251),
259 // windows-1252 labels (also serves as ISO-8859-1 and US-ASCII per WHATWG)
260 ("ansi_x3.4-1968", Encoding::Windows1252),
261 ("ascii", Encoding::Windows1252),
262 ("cp1252", Encoding::Windows1252),
263 ("cp819", Encoding::Windows1252),
264 ("csisolatin1", Encoding::Windows1252),
265 ("ibm819", Encoding::Windows1252),
266 ("iso-8859-1", Encoding::Windows1252),
267 ("iso-ir-100", Encoding::Windows1252),
268 ("iso8859-1", Encoding::Windows1252),
269 ("iso88591", Encoding::Windows1252),
270 ("iso_8859-1", Encoding::Windows1252),
271 ("iso_8859-1:1987", Encoding::Windows1252),
272 ("l1", Encoding::Windows1252),
273 ("latin1", Encoding::Windows1252),
274 ("us-ascii", Encoding::Windows1252),
275 ("windows-1252", Encoding::Windows1252),
276 ("x-cp1252", Encoding::Windows1252),
277 // windows-1253 labels
278 ("cp1253", Encoding::Windows1253),
279 ("windows-1253", Encoding::Windows1253),
280 ("x-cp1253", Encoding::Windows1253),
281 // windows-1254 labels
282 ("cp1254", Encoding::Windows1254),
283 ("csisolatin5", Encoding::Windows1254),
284 ("iso-8859-9", Encoding::Windows1254),
285 ("iso-ir-148", Encoding::Windows1254),
286 ("iso8859-9", Encoding::Windows1254),
287 ("iso88599", Encoding::Windows1254),
288 ("iso_8859-9", Encoding::Windows1254),
289 ("iso_8859-9:1989", Encoding::Windows1254),
290 ("l5", Encoding::Windows1254),
291 ("latin5", Encoding::Windows1254),
292 ("windows-1254", Encoding::Windows1254),
293 ("x-cp1254", Encoding::Windows1254),
294 // windows-1255 labels
295 ("cp1255", Encoding::Windows1255),
296 ("windows-1255", Encoding::Windows1255),
297 ("x-cp1255", Encoding::Windows1255),
298 // windows-1256 labels
299 ("cp1256", Encoding::Windows1256),
300 ("windows-1256", Encoding::Windows1256),
301 ("x-cp1256", Encoding::Windows1256),
302 // windows-1257 labels
303 ("cp1257", Encoding::Windows1257),
304 ("windows-1257", Encoding::Windows1257),
305 ("x-cp1257", Encoding::Windows1257),
306 // windows-1258 labels
307 ("cp1258", Encoding::Windows1258),
308 ("windows-1258", Encoding::Windows1258),
309 ("x-cp1258", Encoding::Windows1258),
310 // x-mac-cyrillic labels
311 ("x-mac-cyrillic", Encoding::XMacCyrillic),
312 ("x-mac-ukrainian", Encoding::XMacCyrillic),
313];
314
315/// Look up an encoding by its WHATWG label.
316///
317/// Strips leading/trailing ASCII whitespace and compares case-insensitively,
318/// per the WHATWG Encoding Standard.
319pub fn lookup(label: &str) -> Option<Encoding> {
320 let trimmed = trim_ascii_whitespace(label);
321 if trimmed.is_empty() {
322 return None;
323 }
324 for &(name, enc) in ENCODING_LABELS {
325 if ascii_eq_ignore_case(trimmed, name) {
326 return Some(enc);
327 }
328 }
329 None
330}
331
332/// Sniff BOM from the start of a byte slice.
333///
334/// Returns the detected encoding (if any) and the remaining bytes after the BOM.
335pub fn bom_sniff(bytes: &[u8]) -> (Option<Encoding>, &[u8]) {
336 if bytes.len() >= 3 && bytes[0] == 0xEF && bytes[1] == 0xBB && bytes[2] == 0xBF {
337 (Some(Encoding::Utf8), &bytes[3..])
338 } else if bytes.len() >= 2 && bytes[0] == 0xFE && bytes[1] == 0xFF {
339 (Some(Encoding::Utf16Be), &bytes[2..])
340 } else if bytes.len() >= 2 && bytes[0] == 0xFF && bytes[1] == 0xFE {
341 (Some(Encoding::Utf16Le), &bytes[2..])
342 } else {
343 (None, bytes)
344 }
345}
346
347// ---------------------------------------------------------------------------
348// Public API
349// ---------------------------------------------------------------------------
350
351/// Decode bytes to a `String` using the given encoding.
352///
353/// Invalid sequences are replaced with U+FFFD (replacement mode per WHATWG spec).
354pub fn decode(bytes: &[u8], encoding: Encoding) -> String {
355 // Replacement mode never fails
356 match encoding {
357 Encoding::Utf8 => utf8::decode_utf8(bytes, ErrorMode::Replacement).unwrap(),
358 Encoding::Utf16Le => utf16::decode_utf16le(bytes, ErrorMode::Replacement).unwrap(),
359 Encoding::Utf16Be => utf16::decode_utf16be(bytes, ErrorMode::Replacement).unwrap(),
360 enc => {
361 let table = single_byte::table_for(&enc).unwrap();
362 single_byte::decode_single_byte(bytes, table, enc.name(), ErrorMode::Replacement)
363 .unwrap()
364 }
365 }
366}
367
368/// Decode bytes to a `String`, returning an error on any invalid sequence.
369///
370/// Fatal mode per WHATWG spec — returns `Err` on the first invalid byte sequence.
371pub fn decode_strict(bytes: &[u8], encoding: Encoding) -> Result<String> {
372 match encoding {
373 Encoding::Utf8 => utf8::decode_utf8(bytes, ErrorMode::Fatal),
374 Encoding::Utf16Le => utf16::decode_utf16le(bytes, ErrorMode::Fatal),
375 Encoding::Utf16Be => utf16::decode_utf16be(bytes, ErrorMode::Fatal),
376 enc => {
377 let table = single_byte::table_for(&enc).unwrap();
378 single_byte::decode_single_byte(bytes, table, enc.name(), ErrorMode::Fatal)
379 }
380 }
381}
382
383/// Encode a string to bytes using the given encoding.
384///
385/// Only UTF-8 encoding is supported for encode. Per WHATWG spec, all other
386/// encodings are decode-only.
387pub fn encode(text: &str, encoding: Encoding) -> Result<Vec<u8>> {
388 match encoding {
389 Encoding::Utf8 => Ok(utf8::encode_utf8(text)),
390 other => Err(EncodingError::EncodeNotSupported {
391 encoding: other.name(),
392 }),
393 }
394}
395
396// ---------------------------------------------------------------------------
397// Internal helpers
398// ---------------------------------------------------------------------------
399
400/// ASCII whitespace per WHATWG spec: TAB, LF, FF, CR, SPACE.
401fn trim_ascii_whitespace(s: &str) -> &str {
402 let bytes = s.as_bytes();
403 let start = bytes
404 .iter()
405 .position(|&b| !is_ascii_whitespace(b))
406 .unwrap_or(bytes.len());
407 let end = bytes
408 .iter()
409 .rposition(|&b| !is_ascii_whitespace(b))
410 .map(|p| p + 1)
411 .unwrap_or(0);
412 if start >= end {
413 return "";
414 }
415 &s[start..end]
416}
417
/// Reports whether `b` is one of TAB (0x09), LF (0x0A), FF (0x0C), CR (0x0D)
/// or SPACE (0x20).
fn is_ascii_whitespace(b: u8) -> bool {
    // The standard library's definition covers exactly these five bytes
    // (notably, vertical tab 0x0B is excluded).
    u8::is_ascii_whitespace(&b)
}
421
/// Compares two strings for equality, treating ASCII letters `A`–`Z` and
/// `a`–`z` as equal to their other-case counterparts. Non-ASCII bytes must
/// match exactly.
fn ascii_eq_ignore_case(a: &str, b: &str) -> bool {
    let (left, right) = (a.as_bytes(), b.as_bytes());
    left.len() == right.len()
        && left
            .iter()
            .zip(right)
            .all(|(x, y)| x.to_ascii_lowercase() == y.to_ascii_lowercase())
}
425
426// ---------------------------------------------------------------------------
427// Tests
428// ---------------------------------------------------------------------------
429
#[cfg(test)]
mod tests {
    use super::*;

    // -- Shared helpers --

    /// Asserts that every label resolves to the encoding paired with it.
    fn assert_lookups(cases: &[(&str, Encoding)]) {
        for &(label, expected) in cases {
            assert_eq!(lookup(label), Some(expected), "label {:?}", label);
        }
    }

    /// Asserts that `bytes` decode (replacement mode) to `expected`.
    fn assert_decodes(bytes: &[u8], encoding: Encoding, expected: &str) {
        assert_eq!(decode(bytes, encoding), expected);
    }

    // -- Encoding enum --

    #[test]
    fn encoding_names() {
        let expectations = [
            (Encoding::Utf8, "UTF-8"),
            (Encoding::Utf16Be, "UTF-16BE"),
            (Encoding::Utf16Le, "UTF-16LE"),
            (Encoding::Windows1252, "windows-1252"),
            (Encoding::Iso8859_2, "ISO-8859-2"),
            (Encoding::Koi8R, "KOI8-R"),
            (Encoding::Macintosh, "macintosh"),
        ];
        for &(encoding, canonical) in &expectations {
            assert_eq!(encoding.name(), canonical);
        }
    }

    // -- Label lookup --

    #[test]
    fn lookup_utf8_labels() {
        assert_lookups(&[
            ("utf-8", Encoding::Utf8),
            ("UTF-8", Encoding::Utf8),
            ("utf8", Encoding::Utf8),
            ("Utf8", Encoding::Utf8),
            ("unicode-1-1-utf-8", Encoding::Utf8),
            ("x-unicode20utf8", Encoding::Utf8),
        ]);
    }

    #[test]
    fn lookup_utf16_labels() {
        assert_lookups(&[
            ("utf-16be", Encoding::Utf16Be),
            ("UTF-16BE", Encoding::Utf16Be),
            ("unicodefffe", Encoding::Utf16Be),
            ("utf-16le", Encoding::Utf16Le),
            ("utf-16", Encoding::Utf16Le),
            ("unicode", Encoding::Utf16Le),
            ("ucs-2", Encoding::Utf16Le),
            ("iso-10646-ucs-2", Encoding::Utf16Le),
        ]);
    }

    #[test]
    fn lookup_windows_1252_labels() {
        assert_lookups(&[
            // Native windows-1252 labels.
            ("windows-1252", Encoding::Windows1252),
            ("cp1252", Encoding::Windows1252),
            ("x-cp1252", Encoding::Windows1252),
            // ISO-8859-1 labels resolve to windows-1252.
            ("iso-8859-1", Encoding::Windows1252),
            ("latin1", Encoding::Windows1252),
            ("l1", Encoding::Windows1252),
            // US-ASCII labels resolve to windows-1252 as well.
            ("us-ascii", Encoding::Windows1252),
            ("ascii", Encoding::Windows1252),
        ]);
    }

    #[test]
    fn lookup_legacy_labels() {
        assert_lookups(&[
            ("iso-8859-2", Encoding::Iso8859_2),
            ("latin2", Encoding::Iso8859_2),
            ("iso-8859-5", Encoding::Iso8859_5),
            ("cyrillic", Encoding::Iso8859_5),
            ("iso-8859-7", Encoding::Iso8859_7),
            ("greek", Encoding::Iso8859_7),
            ("iso-8859-15", Encoding::Iso8859_15),
            ("koi8-r", Encoding::Koi8R),
            ("koi8-u", Encoding::Koi8U),
            ("macintosh", Encoding::Macintosh),
            ("ibm866", Encoding::Ibm866),
            ("windows-1251", Encoding::Windows1251),
            ("windows-874", Encoding::Windows874),
            ("iso-8859-9", Encoding::Windows1254),
            ("x-mac-cyrillic", Encoding::XMacCyrillic),
        ]);
    }

    #[test]
    fn lookup_with_whitespace() {
        assert_lookups(&[
            (" utf-8 ", Encoding::Utf8),
            ("\tutf-8\n", Encoding::Utf8),
            ("\r\nutf-16le\r\n", Encoding::Utf16Le),
            (" windows-1252 ", Encoding::Windows1252),
        ]);
    }

    #[test]
    fn lookup_unknown() {
        for &label in &["", " ", "utf-99", "bogus-encoding"] {
            assert_eq!(lookup(label), None, "label {:?}", label);
        }
    }

    // -- BOM sniffing --

    #[test]
    fn bom_utf8() {
        let input: &[u8] = &[0xEF, 0xBB, 0xBF, 0x41];
        let tail: &[u8] = &[0x41];
        assert_eq!(bom_sniff(input), (Some(Encoding::Utf8), tail));
    }

    #[test]
    fn bom_utf16be() {
        let input: &[u8] = &[0xFE, 0xFF, 0x00, 0x41];
        let tail: &[u8] = &[0x00, 0x41];
        assert_eq!(bom_sniff(input), (Some(Encoding::Utf16Be), tail));
    }

    #[test]
    fn bom_utf16le() {
        let input: &[u8] = &[0xFF, 0xFE, 0x41, 0x00];
        let tail: &[u8] = &[0x41, 0x00];
        assert_eq!(bom_sniff(input), (Some(Encoding::Utf16Le), tail));
    }

    #[test]
    fn bom_none() {
        let input: &[u8] = &[0x41, 0x42, 0x43];
        assert_eq!(bom_sniff(input), (None, input));
    }

    #[test]
    fn bom_empty() {
        let input: &[u8] = &[];
        assert_eq!(bom_sniff(input), (None, input));
    }

    #[test]
    fn bom_short() {
        // Two of the three UTF-8 BOM bytes are not a BOM.
        let input: &[u8] = &[0xEF, 0xBB];
        assert_eq!(bom_sniff(input), (None, input));
    }

    // -- Top-level decode --

    #[test]
    fn decode_utf8_basic() {
        assert_decodes(b"Hello", Encoding::Utf8, "Hello");
    }

    #[test]
    fn decode_utf8_invalid_replaces() {
        assert_decodes(&[0xFF], Encoding::Utf8, "\u{FFFD}");
    }

    #[test]
    fn decode_utf16le_basic() {
        assert_decodes(&[0x41, 0x00], Encoding::Utf16Le, "A");
    }

    #[test]
    fn decode_utf16be_basic() {
        assert_decodes(&[0x00, 0x41], Encoding::Utf16Be, "A");
    }

    #[test]
    fn decode_windows_1252_euro() {
        // 0x80 → € (U+20AC)
        assert_decodes(&[0x80], Encoding::Windows1252, "\u{20AC}");
    }

    #[test]
    fn decode_windows_1252_cafe() {
        // "Café" in windows-1252
        assert_decodes(&[0x43, 0x61, 0x66, 0xE9], Encoding::Windows1252, "Caf\u{00E9}");
    }

    #[test]
    fn decode_iso_8859_2() {
        // 0xA1 → Ą (U+0104)
        assert_decodes(&[0xA1], Encoding::Iso8859_2, "\u{0104}");
    }

    #[test]
    fn decode_koi8r_cyrillic() {
        // 0xE1 → А (U+0410)
        assert_decodes(&[0xE1], Encoding::Koi8R, "\u{0410}");
    }

    #[test]
    fn decode_windows_1251_cyrillic() {
        // 0xC0 → А (U+0410), 0xE0 → а (U+0430)
        assert_decodes(&[0xC0, 0xE0], Encoding::Windows1251, "\u{0410}\u{0430}");
    }

    // -- Top-level decode_strict --

    #[test]
    fn decode_strict_valid() {
        let decoded = decode_strict(b"Hello", Encoding::Utf8);
        assert_eq!(decoded.unwrap(), "Hello");
    }

    #[test]
    fn decode_strict_invalid() {
        let decoded = decode_strict(&[0xFF], Encoding::Utf8);
        assert!(decoded.is_err());
    }

    #[test]
    fn decode_strict_single_byte_unmapped() {
        // ISO-8859-3 byte 0xA5 is unmapped
        let decoded = decode_strict(&[0xA5], Encoding::Iso8859_3);
        assert!(decoded.is_err());
    }

    #[test]
    fn decode_strict_single_byte_valid() {
        let decoded = decode_strict(&[0x80], Encoding::Windows1252);
        assert_eq!(decoded.unwrap(), "\u{20AC}");
    }

    // -- Top-level encode --

    #[test]
    fn encode_utf8_basic() {
        let bytes = encode("Hello", Encoding::Utf8).unwrap();
        assert_eq!(bytes, b"Hello");
    }

    #[test]
    fn encode_non_utf8_not_supported() {
        let unsupported = [Encoding::Utf16Le, Encoding::Utf16Be, Encoding::Windows1252];
        for &target in &unsupported {
            assert!(matches!(
                encode("Hello", target),
                Err(EncodingError::EncodeNotSupported { .. })
            ));
        }
    }

    // -- Trim helpers --

    #[test]
    fn trim_ascii_whitespace_basic() {
        let cases = [
            (" hello ", "hello"),
            ("hello", "hello"),
            ("", ""),
            (" ", ""),
            ("\t\nhello\r\n", "hello"),
        ];
        for &(input, expected) in &cases {
            assert_eq!(trim_ascii_whitespace(input), expected);
        }
    }

    #[test]
    fn ascii_eq_ignore_case_basic() {
        // Equal up to ASCII case.
        assert!(ascii_eq_ignore_case("utf-8", "UTF-8"));
        assert!(ascii_eq_ignore_case("Utf-8", "utf-8"));
        // Different content or different length.
        assert!(!ascii_eq_ignore_case("utf-8", "utf-9"));
        assert!(!ascii_eq_ignore_case("utf-8", "utf-8x"));
    }
}
690}