we (web engine): Experimental web browser project to understand the limits of Claude
at main 690 lines 24 kB view raw
1//! WHATWG Encoding Standard — UTF-8, UTF-16, and legacy single-byte codecs, pure Rust. 2 3pub mod error; 4mod single_byte; 5pub mod sniff; 6mod utf16; 7mod utf8; 8 9use error::{EncodingError, Result}; 10use utf8::ErrorMode; 11 12// --------------------------------------------------------------------------- 13// Encoding enum 14// --------------------------------------------------------------------------- 15 16/// Supported text encodings per WHATWG Encoding Standard. 17#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] 18pub enum Encoding { 19 Utf8, 20 Utf16Be, 21 Utf16Le, 22 // Single-byte encodings 23 Ibm866, 24 Iso8859_2, 25 Iso8859_3, 26 Iso8859_4, 27 Iso8859_5, 28 Iso8859_6, 29 Iso8859_7, 30 Iso8859_8, 31 Iso8859_8I, 32 Iso8859_10, 33 Iso8859_13, 34 Iso8859_14, 35 Iso8859_15, 36 Iso8859_16, 37 Koi8R, 38 Koi8U, 39 Macintosh, 40 Windows874, 41 Windows1250, 42 Windows1251, 43 Windows1252, 44 Windows1253, 45 Windows1254, 46 Windows1255, 47 Windows1256, 48 Windows1257, 49 Windows1258, 50 XMacCyrillic, 51} 52 53impl Encoding { 54 /// Canonical name per WHATWG spec. 
55 pub fn name(&self) -> &'static str { 56 match self { 57 Self::Utf8 => "UTF-8", 58 Self::Utf16Be => "UTF-16BE", 59 Self::Utf16Le => "UTF-16LE", 60 Self::Ibm866 => "IBM866", 61 Self::Iso8859_2 => "ISO-8859-2", 62 Self::Iso8859_3 => "ISO-8859-3", 63 Self::Iso8859_4 => "ISO-8859-4", 64 Self::Iso8859_5 => "ISO-8859-5", 65 Self::Iso8859_6 => "ISO-8859-6", 66 Self::Iso8859_7 => "ISO-8859-7", 67 Self::Iso8859_8 => "ISO-8859-8", 68 Self::Iso8859_8I => "ISO-8859-8-I", 69 Self::Iso8859_10 => "ISO-8859-10", 70 Self::Iso8859_13 => "ISO-8859-13", 71 Self::Iso8859_14 => "ISO-8859-14", 72 Self::Iso8859_15 => "ISO-8859-15", 73 Self::Iso8859_16 => "ISO-8859-16", 74 Self::Koi8R => "KOI8-R", 75 Self::Koi8U => "KOI8-U", 76 Self::Macintosh => "macintosh", 77 Self::Windows874 => "windows-874", 78 Self::Windows1250 => "windows-1250", 79 Self::Windows1251 => "windows-1251", 80 Self::Windows1252 => "windows-1252", 81 Self::Windows1253 => "windows-1253", 82 Self::Windows1254 => "windows-1254", 83 Self::Windows1255 => "windows-1255", 84 Self::Windows1256 => "windows-1256", 85 Self::Windows1257 => "windows-1257", 86 Self::Windows1258 => "windows-1258", 87 Self::XMacCyrillic => "x-mac-cyrillic", 88 } 89 } 90} 91 92// --------------------------------------------------------------------------- 93// Label lookup (WHATWG Encoding Standard §4.2) 94// --------------------------------------------------------------------------- 95 96/// WHATWG encoding label mappings. 97/// Labels are stored in lowercase; lookup normalizes input to lowercase. 
98const ENCODING_LABELS: &[(&str, Encoding)] = &[ 99 // UTF-8 labels 100 ("unicode-1-1-utf-8", Encoding::Utf8), 101 ("unicode11utf8", Encoding::Utf8), 102 ("unicode20utf8", Encoding::Utf8), 103 ("utf-8", Encoding::Utf8), 104 ("utf8", Encoding::Utf8), 105 ("x-unicode20utf8", Encoding::Utf8), 106 // UTF-16BE labels 107 ("unicodefffe", Encoding::Utf16Be), 108 ("utf-16be", Encoding::Utf16Be), 109 // UTF-16LE labels 110 ("csunicode", Encoding::Utf16Le), 111 ("iso-10646-ucs-2", Encoding::Utf16Le), 112 ("ucs-2", Encoding::Utf16Le), 113 ("unicode", Encoding::Utf16Le), 114 ("unicodefeff", Encoding::Utf16Le), 115 ("utf-16", Encoding::Utf16Le), 116 ("utf-16le", Encoding::Utf16Le), 117 // IBM866 labels 118 ("866", Encoding::Ibm866), 119 ("cp866", Encoding::Ibm866), 120 ("csibm866", Encoding::Ibm866), 121 ("ibm866", Encoding::Ibm866), 122 // ISO-8859-2 labels 123 ("csisolatin2", Encoding::Iso8859_2), 124 ("iso-8859-2", Encoding::Iso8859_2), 125 ("iso-ir-101", Encoding::Iso8859_2), 126 ("iso8859-2", Encoding::Iso8859_2), 127 ("iso88592", Encoding::Iso8859_2), 128 ("iso_8859-2", Encoding::Iso8859_2), 129 ("iso_8859-2:1987", Encoding::Iso8859_2), 130 ("l2", Encoding::Iso8859_2), 131 ("latin2", Encoding::Iso8859_2), 132 // ISO-8859-3 labels 133 ("csisolatin3", Encoding::Iso8859_3), 134 ("iso-8859-3", Encoding::Iso8859_3), 135 ("iso-ir-109", Encoding::Iso8859_3), 136 ("iso8859-3", Encoding::Iso8859_3), 137 ("iso88593", Encoding::Iso8859_3), 138 ("iso_8859-3", Encoding::Iso8859_3), 139 ("iso_8859-3:1988", Encoding::Iso8859_3), 140 ("l3", Encoding::Iso8859_3), 141 ("latin3", Encoding::Iso8859_3), 142 // ISO-8859-4 labels 143 ("csisolatin4", Encoding::Iso8859_4), 144 ("iso-8859-4", Encoding::Iso8859_4), 145 ("iso-ir-110", Encoding::Iso8859_4), 146 ("iso8859-4", Encoding::Iso8859_4), 147 ("iso88594", Encoding::Iso8859_4), 148 ("iso_8859-4", Encoding::Iso8859_4), 149 ("iso_8859-4:1988", Encoding::Iso8859_4), 150 ("l4", Encoding::Iso8859_4), 151 ("latin4", Encoding::Iso8859_4), 152 // 
ISO-8859-5 labels 153 ("csisolatincyrillic", Encoding::Iso8859_5), 154 ("cyrillic", Encoding::Iso8859_5), 155 ("iso-8859-5", Encoding::Iso8859_5), 156 ("iso-ir-144", Encoding::Iso8859_5), 157 ("iso8859-5", Encoding::Iso8859_5), 158 ("iso88595", Encoding::Iso8859_5), 159 ("iso_8859-5", Encoding::Iso8859_5), 160 ("iso_8859-5:1988", Encoding::Iso8859_5), 161 // ISO-8859-6 labels 162 ("arabic", Encoding::Iso8859_6), 163 ("asmo-708", Encoding::Iso8859_6), 164 ("csiso88596e", Encoding::Iso8859_6), 165 ("csiso88596i", Encoding::Iso8859_6), 166 ("csisolatinarabic", Encoding::Iso8859_6), 167 ("ecma-114", Encoding::Iso8859_6), 168 ("iso-8859-6", Encoding::Iso8859_6), 169 ("iso-8859-6-e", Encoding::Iso8859_6), 170 ("iso-8859-6-i", Encoding::Iso8859_6), 171 ("iso-ir-127", Encoding::Iso8859_6), 172 ("iso8859-6", Encoding::Iso8859_6), 173 ("iso88596", Encoding::Iso8859_6), 174 ("iso_8859-6", Encoding::Iso8859_6), 175 ("iso_8859-6:1987", Encoding::Iso8859_6), 176 // ISO-8859-7 labels 177 ("csisolatingreek", Encoding::Iso8859_7), 178 ("ecma-118", Encoding::Iso8859_7), 179 ("elot_928", Encoding::Iso8859_7), 180 ("greek", Encoding::Iso8859_7), 181 ("greek8", Encoding::Iso8859_7), 182 ("iso-8859-7", Encoding::Iso8859_7), 183 ("iso-ir-126", Encoding::Iso8859_7), 184 ("iso8859-7", Encoding::Iso8859_7), 185 ("iso88597", Encoding::Iso8859_7), 186 ("iso_8859-7", Encoding::Iso8859_7), 187 ("iso_8859-7:1987", Encoding::Iso8859_7), 188 ("sun_eu_greek", Encoding::Iso8859_7), 189 // ISO-8859-8 labels 190 ("csiso88598e", Encoding::Iso8859_8), 191 ("csisolatinhebrew", Encoding::Iso8859_8), 192 ("hebrew", Encoding::Iso8859_8), 193 ("iso-8859-8", Encoding::Iso8859_8), 194 ("iso-8859-8-e", Encoding::Iso8859_8), 195 ("iso-ir-138", Encoding::Iso8859_8), 196 ("iso8859-8", Encoding::Iso8859_8), 197 ("iso88598", Encoding::Iso8859_8), 198 ("iso_8859-8", Encoding::Iso8859_8), 199 ("iso_8859-8:1988", Encoding::Iso8859_8), 200 ("visual", Encoding::Iso8859_8), 201 // ISO-8859-8-I labels 202 ("csiso88598i", 
Encoding::Iso8859_8I), 203 ("iso-8859-8-i", Encoding::Iso8859_8I), 204 ("logical", Encoding::Iso8859_8I), 205 // ISO-8859-10 labels 206 ("csisolatin6", Encoding::Iso8859_10), 207 ("iso-8859-10", Encoding::Iso8859_10), 208 ("iso-ir-157", Encoding::Iso8859_10), 209 ("iso8859-10", Encoding::Iso8859_10), 210 ("iso885910", Encoding::Iso8859_10), 211 ("l6", Encoding::Iso8859_10), 212 ("latin6", Encoding::Iso8859_10), 213 // ISO-8859-13 labels 214 ("iso-8859-13", Encoding::Iso8859_13), 215 ("iso8859-13", Encoding::Iso8859_13), 216 ("iso885913", Encoding::Iso8859_13), 217 // ISO-8859-14 labels 218 ("iso-8859-14", Encoding::Iso8859_14), 219 ("iso8859-14", Encoding::Iso8859_14), 220 ("iso885914", Encoding::Iso8859_14), 221 // ISO-8859-15 labels 222 ("csisolatin9", Encoding::Iso8859_15), 223 ("iso-8859-15", Encoding::Iso8859_15), 224 ("iso8859-15", Encoding::Iso8859_15), 225 ("iso885915", Encoding::Iso8859_15), 226 ("iso_8859-15", Encoding::Iso8859_15), 227 ("l9", Encoding::Iso8859_15), 228 // ISO-8859-16 labels 229 ("iso-8859-16", Encoding::Iso8859_16), 230 // KOI8-R labels 231 ("cskoi8r", Encoding::Koi8R), 232 ("koi", Encoding::Koi8R), 233 ("koi8", Encoding::Koi8R), 234 ("koi8-r", Encoding::Koi8R), 235 ("koi8_r", Encoding::Koi8R), 236 // KOI8-U labels 237 ("koi8-ru", Encoding::Koi8U), 238 ("koi8-u", Encoding::Koi8U), 239 // macintosh labels 240 ("csmacintosh", Encoding::Macintosh), 241 ("mac", Encoding::Macintosh), 242 ("macintosh", Encoding::Macintosh), 243 ("x-mac-roman", Encoding::Macintosh), 244 // windows-874 labels 245 ("dos-874", Encoding::Windows874), 246 ("iso-8859-11", Encoding::Windows874), 247 ("iso8859-11", Encoding::Windows874), 248 ("iso885911", Encoding::Windows874), 249 ("tis-620", Encoding::Windows874), 250 ("windows-874", Encoding::Windows874), 251 // windows-1250 labels 252 ("cp1250", Encoding::Windows1250), 253 ("windows-1250", Encoding::Windows1250), 254 ("x-cp1250", Encoding::Windows1250), 255 // windows-1251 labels 256 ("cp1251", 
Encoding::Windows1251), 257 ("windows-1251", Encoding::Windows1251), 258 ("x-cp1251", Encoding::Windows1251), 259 // windows-1252 labels (also serves as ISO-8859-1 and US-ASCII per WHATWG) 260 ("ansi_x3.4-1968", Encoding::Windows1252), 261 ("ascii", Encoding::Windows1252), 262 ("cp1252", Encoding::Windows1252), 263 ("cp819", Encoding::Windows1252), 264 ("csisolatin1", Encoding::Windows1252), 265 ("ibm819", Encoding::Windows1252), 266 ("iso-8859-1", Encoding::Windows1252), 267 ("iso-ir-100", Encoding::Windows1252), 268 ("iso8859-1", Encoding::Windows1252), 269 ("iso88591", Encoding::Windows1252), 270 ("iso_8859-1", Encoding::Windows1252), 271 ("iso_8859-1:1987", Encoding::Windows1252), 272 ("l1", Encoding::Windows1252), 273 ("latin1", Encoding::Windows1252), 274 ("us-ascii", Encoding::Windows1252), 275 ("windows-1252", Encoding::Windows1252), 276 ("x-cp1252", Encoding::Windows1252), 277 // windows-1253 labels 278 ("cp1253", Encoding::Windows1253), 279 ("windows-1253", Encoding::Windows1253), 280 ("x-cp1253", Encoding::Windows1253), 281 // windows-1254 labels 282 ("cp1254", Encoding::Windows1254), 283 ("csisolatin5", Encoding::Windows1254), 284 ("iso-8859-9", Encoding::Windows1254), 285 ("iso-ir-148", Encoding::Windows1254), 286 ("iso8859-9", Encoding::Windows1254), 287 ("iso88599", Encoding::Windows1254), 288 ("iso_8859-9", Encoding::Windows1254), 289 ("iso_8859-9:1989", Encoding::Windows1254), 290 ("l5", Encoding::Windows1254), 291 ("latin5", Encoding::Windows1254), 292 ("windows-1254", Encoding::Windows1254), 293 ("x-cp1254", Encoding::Windows1254), 294 // windows-1255 labels 295 ("cp1255", Encoding::Windows1255), 296 ("windows-1255", Encoding::Windows1255), 297 ("x-cp1255", Encoding::Windows1255), 298 // windows-1256 labels 299 ("cp1256", Encoding::Windows1256), 300 ("windows-1256", Encoding::Windows1256), 301 ("x-cp1256", Encoding::Windows1256), 302 // windows-1257 labels 303 ("cp1257", Encoding::Windows1257), 304 ("windows-1257", Encoding::Windows1257), 305 
("x-cp1257", Encoding::Windows1257), 306 // windows-1258 labels 307 ("cp1258", Encoding::Windows1258), 308 ("windows-1258", Encoding::Windows1258), 309 ("x-cp1258", Encoding::Windows1258), 310 // x-mac-cyrillic labels 311 ("x-mac-cyrillic", Encoding::XMacCyrillic), 312 ("x-mac-ukrainian", Encoding::XMacCyrillic), 313]; 314 315/// Look up an encoding by its WHATWG label. 316/// 317/// Strips leading/trailing ASCII whitespace and compares case-insensitively, 318/// per the WHATWG Encoding Standard. 319pub fn lookup(label: &str) -> Option<Encoding> { 320 let trimmed = trim_ascii_whitespace(label); 321 if trimmed.is_empty() { 322 return None; 323 } 324 for &(name, enc) in ENCODING_LABELS { 325 if ascii_eq_ignore_case(trimmed, name) { 326 return Some(enc); 327 } 328 } 329 None 330} 331 332/// Sniff BOM from the start of a byte slice. 333/// 334/// Returns the detected encoding (if any) and the remaining bytes after the BOM. 335pub fn bom_sniff(bytes: &[u8]) -> (Option<Encoding>, &[u8]) { 336 if bytes.len() >= 3 && bytes[0] == 0xEF && bytes[1] == 0xBB && bytes[2] == 0xBF { 337 (Some(Encoding::Utf8), &bytes[3..]) 338 } else if bytes.len() >= 2 && bytes[0] == 0xFE && bytes[1] == 0xFF { 339 (Some(Encoding::Utf16Be), &bytes[2..]) 340 } else if bytes.len() >= 2 && bytes[0] == 0xFF && bytes[1] == 0xFE { 341 (Some(Encoding::Utf16Le), &bytes[2..]) 342 } else { 343 (None, bytes) 344 } 345} 346 347// --------------------------------------------------------------------------- 348// Public API 349// --------------------------------------------------------------------------- 350 351/// Decode bytes to a `String` using the given encoding. 352/// 353/// Invalid sequences are replaced with U+FFFD (replacement mode per WHATWG spec). 
354pub fn decode(bytes: &[u8], encoding: Encoding) -> String { 355 // Replacement mode never fails 356 match encoding { 357 Encoding::Utf8 => utf8::decode_utf8(bytes, ErrorMode::Replacement).unwrap(), 358 Encoding::Utf16Le => utf16::decode_utf16le(bytes, ErrorMode::Replacement).unwrap(), 359 Encoding::Utf16Be => utf16::decode_utf16be(bytes, ErrorMode::Replacement).unwrap(), 360 enc => { 361 let table = single_byte::table_for(&enc).unwrap(); 362 single_byte::decode_single_byte(bytes, table, enc.name(), ErrorMode::Replacement) 363 .unwrap() 364 } 365 } 366} 367 368/// Decode bytes to a `String`, returning an error on any invalid sequence. 369/// 370/// Fatal mode per WHATWG spec — returns `Err` on the first invalid byte sequence. 371pub fn decode_strict(bytes: &[u8], encoding: Encoding) -> Result<String> { 372 match encoding { 373 Encoding::Utf8 => utf8::decode_utf8(bytes, ErrorMode::Fatal), 374 Encoding::Utf16Le => utf16::decode_utf16le(bytes, ErrorMode::Fatal), 375 Encoding::Utf16Be => utf16::decode_utf16be(bytes, ErrorMode::Fatal), 376 enc => { 377 let table = single_byte::table_for(&enc).unwrap(); 378 single_byte::decode_single_byte(bytes, table, enc.name(), ErrorMode::Fatal) 379 } 380 } 381} 382 383/// Encode a string to bytes using the given encoding. 384/// 385/// Only UTF-8 encoding is supported for encode. Per WHATWG spec, all other 386/// encodings are decode-only. 387pub fn encode(text: &str, encoding: Encoding) -> Result<Vec<u8>> { 388 match encoding { 389 Encoding::Utf8 => Ok(utf8::encode_utf8(text)), 390 other => Err(EncodingError::EncodeNotSupported { 391 encoding: other.name(), 392 }), 393 } 394} 395 396// --------------------------------------------------------------------------- 397// Internal helpers 398// --------------------------------------------------------------------------- 399 400/// ASCII whitespace per WHATWG spec: TAB, LF, FF, CR, SPACE. 
fn trim_ascii_whitespace(s: &str) -> &str {
    // Strips WHATWG ASCII whitespace from both ends and returns the inner
    // slice of `s`. Only ASCII code points can qualify, so the `as u8` cast is
    // guarded by `is_ascii` and never truncates.
    s.trim_matches(|c: char| c.is_ascii() && is_ascii_whitespace(c as u8))
}

/// Returns `true` for the five WHATWG ASCII whitespace bytes:
/// TAB (0x09), LF (0x0A), FF (0x0C), CR (0x0D) and SPACE (0x20).
fn is_ascii_whitespace(b: u8) -> bool {
    matches!(b, 0x09 | 0x0A | 0x0C | 0x0D | 0x20)
}

/// Compares two strings for equality, ignoring ASCII case only.
fn ascii_eq_ignore_case(a: &str, b: &str) -> bool {
    a.eq_ignore_ascii_case(b)
}

// ---------------------------------------------------------------------------
// Tests
// ---------------------------------------------------------------------------

#[cfg(test)]
mod tests {
    use super::*;

    // -- Encoding enum --

    #[test]
    fn encoding_names() {
        assert_eq!(Encoding::Utf8.name(), "UTF-8");
        assert_eq!(Encoding::Utf16Be.name(), "UTF-16BE");
        assert_eq!(Encoding::Utf16Le.name(), "UTF-16LE");
        assert_eq!(Encoding::Windows1252.name(), "windows-1252");
        assert_eq!(Encoding::Iso8859_2.name(), "ISO-8859-2");
        assert_eq!(Encoding::Koi8R.name(), "KOI8-R");
        assert_eq!(Encoding::Macintosh.name(), "macintosh");
    }

    // -- Label lookup --

    #[test]
    fn lookup_utf8_labels() {
        assert_eq!(lookup("utf-8"), Some(Encoding::Utf8));
        assert_eq!(lookup("UTF-8"), Some(Encoding::Utf8));
        assert_eq!(lookup("utf8"), Some(Encoding::Utf8));
        assert_eq!(lookup("Utf8"), Some(Encoding::Utf8));
        assert_eq!(lookup("unicode-1-1-utf-8"), Some(Encoding::Utf8));
        assert_eq!(lookup("x-unicode20utf8"), Some(Encoding::Utf8));
    }

    #[test]
    fn lookup_utf16_labels() {
        assert_eq!(lookup("utf-16be"), Some(Encoding::Utf16Be));
        assert_eq!(lookup("UTF-16BE"), Some(Encoding::Utf16Be));
        assert_eq!(lookup("unicodefffe"), Some(Encoding::Utf16Be));
        assert_eq!(lookup("utf-16le"), Some(Encoding::Utf16Le));
        assert_eq!(lookup("utf-16"), Some(Encoding::Utf16Le));
        assert_eq!(lookup("unicode"), Some(Encoding::Utf16Le));
        assert_eq!(lookup("ucs-2"), Some(Encoding::Utf16Le));
        assert_eq!(lookup("iso-10646-ucs-2"), Some(Encoding::Utf16Le));
    }

    #[test]
    fn lookup_windows_1252_labels() {
        // windows-1252 is THE most important single-byte encoding
        assert_eq!(lookup("windows-1252"), Some(Encoding::Windows1252));
        assert_eq!(lookup("cp1252"), Some(Encoding::Windows1252));
        assert_eq!(lookup("x-cp1252"), Some(Encoding::Windows1252));
        // ISO-8859-1 maps to windows-1252 per WHATWG
        assert_eq!(lookup("iso-8859-1"), Some(Encoding::Windows1252));
        assert_eq!(lookup("latin1"), Some(Encoding::Windows1252));
        assert_eq!(lookup("l1"), Some(Encoding::Windows1252));
        // US-ASCII maps to windows-1252 per WHATWG
        assert_eq!(lookup("us-ascii"), Some(Encoding::Windows1252));
        assert_eq!(lookup("ascii"), Some(Encoding::Windows1252));
    }

    #[test]
    fn lookup_legacy_labels() {
        assert_eq!(lookup("iso-8859-2"), Some(Encoding::Iso8859_2));
        assert_eq!(lookup("latin2"), Some(Encoding::Iso8859_2));
        assert_eq!(lookup("iso-8859-5"), Some(Encoding::Iso8859_5));
        assert_eq!(lookup("cyrillic"), Some(Encoding::Iso8859_5));
        assert_eq!(lookup("iso-8859-7"), Some(Encoding::Iso8859_7));
        assert_eq!(lookup("greek"), Some(Encoding::Iso8859_7));
        assert_eq!(lookup("iso-8859-15"), Some(Encoding::Iso8859_15));
        assert_eq!(lookup("koi8-r"), Some(Encoding::Koi8R));
        assert_eq!(lookup("koi8-u"), Some(Encoding::Koi8U));
        assert_eq!(lookup("macintosh"), Some(Encoding::Macintosh));
        assert_eq!(lookup("ibm866"), Some(Encoding::Ibm866));
        assert_eq!(lookup("windows-1251"), Some(Encoding::Windows1251));
        assert_eq!(lookup("windows-874"), Some(Encoding::Windows874));
        assert_eq!(lookup("iso-8859-9"), Some(Encoding::Windows1254));
        assert_eq!(lookup("x-mac-cyrillic"), Some(Encoding::XMacCyrillic));
    }

    #[test]
    fn lookup_with_whitespace() {
        assert_eq!(lookup(" utf-8 "), Some(Encoding::Utf8));
        assert_eq!(lookup("\tutf-8\n"), Some(Encoding::Utf8));
        assert_eq!(lookup("\r\nutf-16le\r\n"), Some(Encoding::Utf16Le));
        assert_eq!(lookup(" windows-1252 "), Some(Encoding::Windows1252));
    }

    #[test]
    fn lookup_unknown() {
        assert_eq!(lookup(""), None);
        assert_eq!(lookup(" "), None);
        assert_eq!(lookup("utf-99"), None);
        assert_eq!(lookup("bogus-encoding"), None);
    }

    #[test]
    fn lookup_does_not_trim_non_ascii_whitespace() {
        // NBSP is not ASCII whitespace, so the label no longer matches.
        assert_eq!(lookup("\u{00A0}utf-8"), None);
        assert_eq!(lookup("utf-8\u{00A0}"), None);
    }

    // -- BOM sniffing --

    #[test]
    fn bom_utf8() {
        let (enc, rest) = bom_sniff(&[0xEF, 0xBB, 0xBF, 0x41]);
        assert_eq!(enc, Some(Encoding::Utf8));
        assert_eq!(rest, &[0x41]);
    }

    #[test]
    fn bom_utf16be() {
        let (enc, rest) = bom_sniff(&[0xFE, 0xFF, 0x00, 0x41]);
        assert_eq!(enc, Some(Encoding::Utf16Be));
        assert_eq!(rest, &[0x00, 0x41]);
    }

    #[test]
    fn bom_utf16le() {
        let (enc, rest) = bom_sniff(&[0xFF, 0xFE, 0x41, 0x00]);
        assert_eq!(enc, Some(Encoding::Utf16Le));
        assert_eq!(rest, &[0x41, 0x00]);
    }

    #[test]
    fn bom_none() {
        let data = [0x41, 0x42, 0x43];
        let (enc, rest) = bom_sniff(&data);
        assert_eq!(enc, None);
        assert_eq!(rest, &data);
    }

    #[test]
    fn bom_empty() {
        let (enc, rest) = bom_sniff(&[]);
        assert_eq!(enc, None);
        assert_eq!(rest, &[] as &[u8]);
    }

    #[test]
    fn bom_short() {
        let (enc, rest) = bom_sniff(&[0xEF, 0xBB]);
        assert_eq!(enc, None);
        assert_eq!(rest, &[0xEF, 0xBB]);
    }

    // -- Top-level decode --

    #[test]
    fn decode_utf8_basic() {
        assert_eq!(decode(b"Hello", Encoding::Utf8), "Hello");
    }

    #[test]
    fn decode_utf8_invalid_replaces() {
        assert_eq!(decode(&[0xFF], Encoding::Utf8), "\u{FFFD}");
    }

    #[test]
    fn decode_utf16le_basic() {
        assert_eq!(decode(&[0x41, 0x00], Encoding::Utf16Le), "A");
    }

    #[test]
    fn decode_utf16be_basic() {
        assert_eq!(decode(&[0x00, 0x41], Encoding::Utf16Be), "A");
    }

    #[test]
    fn decode_windows_1252_euro() {
        assert_eq!(decode(&[0x80], Encoding::Windows1252), "\u{20AC}");
    }

    #[test]
    fn decode_windows_1252_cafe() {
        // "Café" in windows-1252
        assert_eq!(
            decode(&[0x43, 0x61, 0x66, 0xE9], Encoding::Windows1252),
            "Caf\u{00E9}"
        );
    }

    #[test]
    fn decode_iso_8859_2() {
        // 0xA1 → Ą
        assert_eq!(decode(&[0xA1], Encoding::Iso8859_2), "\u{0104}");
    }

    #[test]
    fn decode_koi8r_cyrillic() {
        // 0xE1 → А (U+0410)
        assert_eq!(decode(&[0xE1], Encoding::Koi8R), "\u{0410}");
    }

    #[test]
    fn decode_windows_1251_cyrillic() {
        // 0xC0 → А (U+0410), 0xE0 → а (U+0430)
        assert_eq!(
            decode(&[0xC0, 0xE0], Encoding::Windows1251),
            "\u{0410}\u{0430}"
        );
    }

    // -- Top-level decode_strict --

    #[test]
    fn decode_strict_valid() {
        assert_eq!(decode_strict(b"Hello", Encoding::Utf8).unwrap(), "Hello");
    }

    #[test]
    fn decode_strict_invalid() {
        assert!(decode_strict(&[0xFF], Encoding::Utf8).is_err());
    }

    #[test]
    fn decode_strict_single_byte_unmapped() {
        // ISO-8859-3 byte 0xA5 is unmapped
        assert!(decode_strict(&[0xA5], Encoding::Iso8859_3).is_err());
    }

    #[test]
    fn decode_strict_single_byte_valid() {
        assert_eq!(
            decode_strict(&[0x80], Encoding::Windows1252).unwrap(),
            "\u{20AC}"
        );
    }

    // -- Top-level encode --

    #[test]
    fn encode_utf8_basic() {
        assert_eq!(encode("Hello", Encoding::Utf8).unwrap(), b"Hello");
    }

    #[test]
    fn encode_non_utf8_not_supported() {
        assert!(matches!(
            encode("Hello", Encoding::Utf16Le),
            Err(EncodingError::EncodeNotSupported { .. })
        ));
        assert!(matches!(
            encode("Hello", Encoding::Utf16Be),
            Err(EncodingError::EncodeNotSupported { .. })
        ));
        assert!(matches!(
            encode("Hello", Encoding::Windows1252),
            Err(EncodingError::EncodeNotSupported { .. })
        ));
    }

    // -- Trim helpers --

    #[test]
    fn trim_ascii_whitespace_basic() {
        assert_eq!(trim_ascii_whitespace(" hello "), "hello");
        assert_eq!(trim_ascii_whitespace("hello"), "hello");
        assert_eq!(trim_ascii_whitespace(""), "");
        assert_eq!(trim_ascii_whitespace(" "), "");
        assert_eq!(trim_ascii_whitespace("\t\nhello\r\n"), "hello");
    }

    #[test]
    fn trim_ascii_whitespace_spec_set_only() {
        // Form feed is WHATWG whitespace; vertical tab and NBSP are not.
        assert_eq!(trim_ascii_whitespace("\x0Chello\x0C"), "hello");
        assert_eq!(
            trim_ascii_whitespace("\u{000B}hello\u{000B}"),
            "\u{000B}hello\u{000B}"
        );
        assert_eq!(trim_ascii_whitespace("\u{00A0}hello"), "\u{00A0}hello");
        // Interior whitespace is preserved.
        assert_eq!(trim_ascii_whitespace(" a b "), "a b");
    }

    #[test]
    fn ascii_eq_ignore_case_basic() {
        assert!(ascii_eq_ignore_case("utf-8", "UTF-8"));
        assert!(ascii_eq_ignore_case("Utf-8", "utf-8"));
        assert!(!ascii_eq_ignore_case("utf-8", "utf-9"));
        assert!(!ascii_eq_ignore_case("utf-8", "utf-8x"));
    }
}