we (web engine): Experimental web browser project to understand the limits of Claude
at utf-codecs 341 lines 10 kB view raw
1//! WHATWG Encoding Standard — UTF-8 and UTF-16 codecs, pure Rust. 2 3pub mod error; 4mod utf16; 5mod utf8; 6 7use error::{EncodingError, Result}; 8use utf8::ErrorMode; 9 10// --------------------------------------------------------------------------- 11// Encoding enum 12// --------------------------------------------------------------------------- 13 14/// Supported text encodings per WHATWG Encoding Standard. 15#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] 16pub enum Encoding { 17 Utf8, 18 Utf16Be, 19 Utf16Le, 20} 21 22impl Encoding { 23 /// Canonical name per WHATWG spec. 24 pub fn name(&self) -> &'static str { 25 match self { 26 Self::Utf8 => "UTF-8", 27 Self::Utf16Be => "UTF-16BE", 28 Self::Utf16Le => "UTF-16LE", 29 } 30 } 31} 32 33// --------------------------------------------------------------------------- 34// Label lookup (WHATWG Encoding Standard §4.2) 35// --------------------------------------------------------------------------- 36 37/// WHATWG encoding label mappings. 38/// Labels are stored in lowercase; lookup normalizes input to lowercase. 39const ENCODING_LABELS: &[(&str, Encoding)] = &[ 40 // UTF-8 labels 41 ("unicode-1-1-utf-8", Encoding::Utf8), 42 ("unicode11utf8", Encoding::Utf8), 43 ("unicode20utf8", Encoding::Utf8), 44 ("utf-8", Encoding::Utf8), 45 ("utf8", Encoding::Utf8), 46 ("x-unicode20utf8", Encoding::Utf8), 47 // UTF-16BE labels 48 ("unicodefffe", Encoding::Utf16Be), 49 ("utf-16be", Encoding::Utf16Be), 50 // UTF-16LE labels 51 ("csunicode", Encoding::Utf16Le), 52 ("iso-10646-ucs-2", Encoding::Utf16Le), 53 ("ucs-2", Encoding::Utf16Le), 54 ("unicode", Encoding::Utf16Le), 55 ("unicodefeff", Encoding::Utf16Le), 56 ("utf-16", Encoding::Utf16Le), 57 ("utf-16le", Encoding::Utf16Le), 58]; 59 60/// Look up an encoding by its WHATWG label. 61/// 62/// Strips leading/trailing ASCII whitespace and compares case-insensitively, 63/// per the WHATWG Encoding Standard. 64pub fn lookup(label: &str) -> Option<Encoding> { 65 let trimmed = trim_ascii_whitespace(label); 66 if trimmed.is_empty() { 67 return None; 68 } 69 for &(name, enc) in ENCODING_LABELS { 70 if ascii_eq_ignore_case(trimmed, name) { 71 return Some(enc); 72 } 73 } 74 None 75} 76 77/// Sniff BOM from the start of a byte slice. 78/// 79/// Returns the detected encoding (if any) and the remaining bytes after the BOM. 80pub fn bom_sniff(bytes: &[u8]) -> (Option<Encoding>, &[u8]) { 81 if bytes.len() >= 3 && bytes[0] == 0xEF && bytes[1] == 0xBB && bytes[2] == 0xBF { 82 (Some(Encoding::Utf8), &bytes[3..]) 83 } else if bytes.len() >= 2 && bytes[0] == 0xFE && bytes[1] == 0xFF { 84 (Some(Encoding::Utf16Be), &bytes[2..]) 85 } else if bytes.len() >= 2 && bytes[0] == 0xFF && bytes[1] == 0xFE { 86 (Some(Encoding::Utf16Le), &bytes[2..]) 87 } else { 88 (None, bytes) 89 } 90} 91 92// --------------------------------------------------------------------------- 93// Public API 94// --------------------------------------------------------------------------- 95 96/// Decode bytes to a `String` using the given encoding. 97/// 98/// Invalid sequences are replaced with U+FFFD (replacement mode per WHATWG spec). 99pub fn decode(bytes: &[u8], encoding: Encoding) -> String { 100 // Replacement mode never fails 101 match encoding { 102 Encoding::Utf8 => utf8::decode_utf8(bytes, ErrorMode::Replacement).unwrap(), 103 Encoding::Utf16Le => utf16::decode_utf16le(bytes, ErrorMode::Replacement).unwrap(), 104 Encoding::Utf16Be => utf16::decode_utf16be(bytes, ErrorMode::Replacement).unwrap(), 105 } 106} 107 108/// Decode bytes to a `String`, returning an error on any invalid sequence. 109/// 110/// Fatal mode per WHATWG spec — returns `Err` on the first invalid byte sequence. 111pub fn decode_strict(bytes: &[u8], encoding: Encoding) -> Result<String> { 112 match encoding { 113 Encoding::Utf8 => utf8::decode_utf8(bytes, ErrorMode::Fatal), 114 Encoding::Utf16Le => utf16::decode_utf16le(bytes, ErrorMode::Fatal), 115 Encoding::Utf16Be => utf16::decode_utf16be(bytes, ErrorMode::Fatal), 116 } 117} 118 119/// Encode a string to bytes using the given encoding. 120/// 121/// Only UTF-8 encoding is supported for encode. Per WHATWG spec, UTF-16 122/// encodings are decode-only. 123pub fn encode(text: &str, encoding: Encoding) -> Result<Vec<u8>> { 124 match encoding { 125 Encoding::Utf8 => Ok(utf8::encode_utf8(text)), 126 Encoding::Utf16Be => Err(EncodingError::EncodeNotSupported { 127 encoding: "UTF-16BE", 128 }), 129 Encoding::Utf16Le => Err(EncodingError::EncodeNotSupported { 130 encoding: "UTF-16LE", 131 }), 132 } 133} 134 135// --------------------------------------------------------------------------- 136// Internal helpers 137// --------------------------------------------------------------------------- 138 139/// ASCII whitespace per WHATWG spec: TAB, LF, FF, CR, SPACE. 140fn trim_ascii_whitespace(s: &str) -> &str { 141 let bytes = s.as_bytes(); 142 let start = bytes 143 .iter() 144 .position(|&b| !is_ascii_whitespace(b)) 145 .unwrap_or(bytes.len()); 146 let end = bytes 147 .iter() 148 .rposition(|&b| !is_ascii_whitespace(b)) 149 .map(|p| p + 1) 150 .unwrap_or(0); 151 if start >= end { 152 return ""; 153 } 154 &s[start..end] 155} 156 157fn is_ascii_whitespace(b: u8) -> bool { 158 matches!(b, 0x09 | 0x0A | 0x0C | 0x0D | 0x20) 159} 160 161fn ascii_eq_ignore_case(a: &str, b: &str) -> bool { 162 a.eq_ignore_ascii_case(b) 163} 164 165// --------------------------------------------------------------------------- 166// Tests 167// --------------------------------------------------------------------------- 168 169#[cfg(test)] 170mod tests { 171 use super::*; 172 173 // -- Encoding enum -- 174 175 #[test] 176 fn encoding_names() { 177 assert_eq!(Encoding::Utf8.name(), "UTF-8"); 178 assert_eq!(Encoding::Utf16Be.name(), "UTF-16BE"); 179 assert_eq!(Encoding::Utf16Le.name(), "UTF-16LE"); 180 } 181 182 // -- Label lookup -- 183 184 #[test] 185 fn lookup_utf8_labels() { 186 assert_eq!(lookup("utf-8"), Some(Encoding::Utf8)); 187 assert_eq!(lookup("UTF-8"), Some(Encoding::Utf8)); 188 assert_eq!(lookup("utf8"), Some(Encoding::Utf8)); 189 assert_eq!(lookup("Utf8"), Some(Encoding::Utf8)); 190 assert_eq!(lookup("unicode-1-1-utf-8"), Some(Encoding::Utf8)); 191 assert_eq!(lookup("x-unicode20utf8"), Some(Encoding::Utf8)); 192 } 193 194 #[test] 195 fn lookup_utf16_labels() { 196 assert_eq!(lookup("utf-16be"), Some(Encoding::Utf16Be)); 197 assert_eq!(lookup("UTF-16BE"), Some(Encoding::Utf16Be)); 198 assert_eq!(lookup("unicodefffe"), Some(Encoding::Utf16Be)); 199 assert_eq!(lookup("utf-16le"), Some(Encoding::Utf16Le)); 200 assert_eq!(lookup("utf-16"), Some(Encoding::Utf16Le)); 201 assert_eq!(lookup("unicode"), Some(Encoding::Utf16Le)); 202 assert_eq!(lookup("ucs-2"), Some(Encoding::Utf16Le)); 203 assert_eq!(lookup("iso-10646-ucs-2"), Some(Encoding::Utf16Le)); 204 } 205 206 #[test] 207 fn lookup_with_whitespace() { 208 assert_eq!(lookup(" utf-8 "), Some(Encoding::Utf8)); 209 assert_eq!(lookup("\tutf-8\n"), Some(Encoding::Utf8)); 210 assert_eq!(lookup("\r\nutf-16le\r\n"), Some(Encoding::Utf16Le)); 211 } 212 213 #[test] 214 fn lookup_unknown() { 215 assert_eq!(lookup("latin1"), None); 216 assert_eq!(lookup(""), None); 217 assert_eq!(lookup(" "), None); 218 assert_eq!(lookup("utf-99"), None); 219 } 220 221 // -- BOM sniffing -- 222 223 #[test] 224 fn bom_utf8() { 225 let (enc, rest) = bom_sniff(&[0xEF, 0xBB, 0xBF, 0x41]); 226 assert_eq!(enc, Some(Encoding::Utf8)); 227 assert_eq!(rest, &[0x41]); 228 } 229 230 #[test] 231 fn bom_utf16be() { 232 let (enc, rest) = bom_sniff(&[0xFE, 0xFF, 0x00, 0x41]); 233 assert_eq!(enc, Some(Encoding::Utf16Be)); 234 assert_eq!(rest, &[0x00, 0x41]); 235 } 236 237 #[test] 238 fn bom_utf16le() { 239 let (enc, rest) = bom_sniff(&[0xFF, 0xFE, 0x41, 0x00]); 240 assert_eq!(enc, Some(Encoding::Utf16Le)); 241 assert_eq!(rest, &[0x41, 0x00]); 242 } 243 244 #[test] 245 fn bom_none() { 246 let data = [0x41, 0x42, 0x43]; 247 let (enc, rest) = bom_sniff(&data); 248 assert_eq!(enc, None); 249 assert_eq!(rest, &data); 250 } 251 252 #[test] 253 fn bom_empty() { 254 let (enc, rest) = bom_sniff(&[]); 255 assert_eq!(enc, None); 256 assert_eq!(rest, &[] as &[u8]); 257 } 258 259 #[test] 260 fn bom_short() { 261 let (enc, rest) = bom_sniff(&[0xEF, 0xBB]); 262 assert_eq!(enc, None); 263 assert_eq!(rest, &[0xEF, 0xBB]); 264 } 265 266 // -- Top-level decode -- 267 268 #[test] 269 fn decode_utf8_basic() { 270 assert_eq!(decode(b"Hello", Encoding::Utf8), "Hello"); 271 } 272 273 #[test] 274 fn decode_utf8_invalid_replaces() { 275 assert_eq!(decode(&[0xFF], Encoding::Utf8), "\u{FFFD}"); 276 } 277 278 #[test] 279 fn decode_utf16le_basic() { 280 assert_eq!(decode(&[0x41, 0x00], Encoding::Utf16Le), "A"); 281 } 282 283 #[test] 284 fn decode_utf16be_basic() { 285 assert_eq!(decode(&[0x00, 0x41], Encoding::Utf16Be), "A"); 286 } 287 288 // -- Top-level decode_strict -- 289 290 #[test] 291 fn decode_strict_valid() { 292 assert_eq!(decode_strict(b"Hello", Encoding::Utf8).unwrap(), "Hello"); 293 } 294 295 #[test] 296 fn decode_strict_invalid() { 297 assert!(decode_strict(&[0xFF], Encoding::Utf8).is_err()); 298 } 299 300 // -- Top-level encode -- 301 302 #[test] 303 fn encode_utf8_basic() { 304 assert_eq!(encode("Hello", Encoding::Utf8).unwrap(), b"Hello"); 305 } 306 307 #[test] 308 fn encode_utf16_not_supported() { 309 assert!(matches!( 310 encode("Hello", Encoding::Utf16Le), 311 Err(EncodingError::EncodeNotSupported { 312 encoding: "UTF-16LE" 313 }) 314 )); 315 assert!(matches!( 316 encode("Hello", Encoding::Utf16Be), 317 Err(EncodingError::EncodeNotSupported { 318 encoding: "UTF-16BE" 319 }) 320 )); 321 } 322 323 // -- Trim helpers -- 324 325 #[test] 326 fn trim_ascii_whitespace_basic() { 327 assert_eq!(trim_ascii_whitespace(" hello "), "hello"); 328 assert_eq!(trim_ascii_whitespace("hello"), "hello"); 329 assert_eq!(trim_ascii_whitespace(""), ""); 330 assert_eq!(trim_ascii_whitespace(" "), ""); 331 assert_eq!(trim_ascii_whitespace("\t\nhello\r\n"), "hello"); 332 } 333 334 #[test] 335 fn ascii_eq_ignore_case_basic() { 336 assert!(ascii_eq_ignore_case("utf-8", "UTF-8")); 337 assert!(ascii_eq_ignore_case("Utf-8", "utf-8")); 338 assert!(!ascii_eq_ignore_case("utf-8", "utf-9")); 339 assert!(!ascii_eq_ignore_case("utf-8", "utf-8x")); 340 } 341}