crates/encoding/src/lib.rs at utf-codecs

pierrelf.com / we
fork atom
we (web engine): Experimental web browser project to understand the limits of Claude
fork atom
we / crates / encoding / src / lib.rs
at utf-codecs 341 lines 10 kB view raw
wrap content
pierrelf.com Implement WHATWG Encoding: UTF-8 and UTF-16 codecs 12d ago
1af54bc7
  1//! WHATWG Encoding Standard — UTF-8 and UTF-16 codecs, pure Rust.
  2
  3pub mod error;
  4mod utf16;
  5mod utf8;
  6
  7use error::{EncodingError, Result};
  8use utf8::ErrorMode;
  9
 10// ---------------------------------------------------------------------------
 11// Encoding enum
 12// ---------------------------------------------------------------------------
 13
 14/// Supported text encodings per WHATWG Encoding Standard.
 15#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
 16pub enum Encoding {
 17    Utf8,
 18    Utf16Be,
 19    Utf16Le,
 20}
 21
 22impl Encoding {
 23    /// Canonical name per WHATWG spec.
 24    pub fn name(&self) -> &'static str {
 25        match self {
 26            Self::Utf8 => "UTF-8",
 27            Self::Utf16Be => "UTF-16BE",
 28            Self::Utf16Le => "UTF-16LE",
 29        }
 30    }
 31}
 32
 33// ---------------------------------------------------------------------------
 34// Label lookup (WHATWG Encoding Standard §4.2)
 35// ---------------------------------------------------------------------------
 36
/// WHATWG encoding label mappings: each entry pairs a label with the
/// [`Encoding`] it selects.
///
/// Labels are stored in lowercase. `lookup` does not lowercase its input;
/// it compares each entry ASCII case-insensitively, so the table itself must
/// stay lowercase only by convention.
const ENCODING_LABELS: &[(&str, Encoding)] = &[
    // UTF-8 labels
    ("unicode-1-1-utf-8", Encoding::Utf8),
    ("unicode11utf8", Encoding::Utf8),
    ("unicode20utf8", Encoding::Utf8),
    ("utf-8", Encoding::Utf8),
    ("utf8", Encoding::Utf8),
    ("x-unicode20utf8", Encoding::Utf8),
    // UTF-16BE labels
    ("unicodefffe", Encoding::Utf16Be),
    ("utf-16be", Encoding::Utf16Be),
    // UTF-16LE labels (note: the bare "utf-16" label maps to little-endian)
    ("csunicode", Encoding::Utf16Le),
    ("iso-10646-ucs-2", Encoding::Utf16Le),
    ("ucs-2", Encoding::Utf16Le),
    ("unicode", Encoding::Utf16Le),
    ("unicodefeff", Encoding::Utf16Le),
    ("utf-16", Encoding::Utf16Le),
    ("utf-16le", Encoding::Utf16Le),
];
 59
 60/// Look up an encoding by its WHATWG label.
 61///
 62/// Strips leading/trailing ASCII whitespace and compares case-insensitively,
 63/// per the WHATWG Encoding Standard.
 64pub fn lookup(label: &str) -> Option<Encoding> {
 65    let trimmed = trim_ascii_whitespace(label);
 66    if trimmed.is_empty() {
 67        return None;
 68    }
 69    for &(name, enc) in ENCODING_LABELS {
 70        if ascii_eq_ignore_case(trimmed, name) {
 71            return Some(enc);
 72        }
 73    }
 74    None
 75}
 76
 77/// Sniff BOM from the start of a byte slice.
 78///
 79/// Returns the detected encoding (if any) and the remaining bytes after the BOM.
 80pub fn bom_sniff(bytes: &[u8]) -> (Option<Encoding>, &[u8]) {
 81    if bytes.len() >= 3 && bytes[0] == 0xEF && bytes[1] == 0xBB && bytes[2] == 0xBF {
 82        (Some(Encoding::Utf8), &bytes[3..])
 83    } else if bytes.len() >= 2 && bytes[0] == 0xFE && bytes[1] == 0xFF {
 84        (Some(Encoding::Utf16Be), &bytes[2..])
 85    } else if bytes.len() >= 2 && bytes[0] == 0xFF && bytes[1] == 0xFE {
 86        (Some(Encoding::Utf16Le), &bytes[2..])
 87    } else {
 88        (None, bytes)
 89    }
 90}
 91
 92// ---------------------------------------------------------------------------
 93// Public API
 94// ---------------------------------------------------------------------------
 95
 96/// Decode bytes to a `String` using the given encoding.
 97///
 98/// Invalid sequences are replaced with U+FFFD (replacement mode per WHATWG spec).
 99pub fn decode(bytes: &[u8], encoding: Encoding) -> String {
100    // Replacement mode never fails
101    match encoding {
102        Encoding::Utf8 => utf8::decode_utf8(bytes, ErrorMode::Replacement).unwrap(),
103        Encoding::Utf16Le => utf16::decode_utf16le(bytes, ErrorMode::Replacement).unwrap(),
104        Encoding::Utf16Be => utf16::decode_utf16be(bytes, ErrorMode::Replacement).unwrap(),
105    }
106}
107
108/// Decode bytes to a `String`, returning an error on any invalid sequence.
109///
110/// Fatal mode per WHATWG spec — returns `Err` on the first invalid byte sequence.
111pub fn decode_strict(bytes: &[u8], encoding: Encoding) -> Result<String> {
112    match encoding {
113        Encoding::Utf8 => utf8::decode_utf8(bytes, ErrorMode::Fatal),
114        Encoding::Utf16Le => utf16::decode_utf16le(bytes, ErrorMode::Fatal),
115        Encoding::Utf16Be => utf16::decode_utf16be(bytes, ErrorMode::Fatal),
116    }
117}
118
119/// Encode a string to bytes using the given encoding.
120///
121/// Only UTF-8 encoding is supported for encode. Per WHATWG spec, UTF-16
122/// encodings are decode-only.
123pub fn encode(text: &str, encoding: Encoding) -> Result<Vec<u8>> {
124    match encoding {
125        Encoding::Utf8 => Ok(utf8::encode_utf8(text)),
126        Encoding::Utf16Be => Err(EncodingError::EncodeNotSupported {
127            encoding: "UTF-16BE",
128        }),
129        Encoding::Utf16Le => Err(EncodingError::EncodeNotSupported {
130            encoding: "UTF-16LE",
131        }),
132    }
133}
134
135// ---------------------------------------------------------------------------
136// Internal helpers
137// ---------------------------------------------------------------------------
138
139/// ASCII whitespace per WHATWG spec: TAB, LF, FF, CR, SPACE.
140fn trim_ascii_whitespace(s: &str) -> &str {
141    let bytes = s.as_bytes();
142    let start = bytes
143        .iter()
144        .position(|&b| !is_ascii_whitespace(b))
145        .unwrap_or(bytes.len());
146    let end = bytes
147        .iter()
148        .rposition(|&b| !is_ascii_whitespace(b))
149        .map(|p| p + 1)
150        .unwrap_or(0);
151    if start >= end {
152        return "";
153    }
154    &s[start..end]
155}
156
/// Returns `true` if `b` is one of TAB, LF, FF, CR or SPACE.
fn is_ascii_whitespace(b: u8) -> bool {
    matches!(b, b'\t' | b'\n' | b'\x0C' | b'\r' | b' ')
}
160
/// Returns `true` if `a` and `b` are equal when ASCII letters are compared
/// without regard to case; non-ASCII bytes must match exactly.
fn ascii_eq_ignore_case(a: &str, b: &str) -> bool {
    a.as_bytes().eq_ignore_ascii_case(b.as_bytes())
}
164
165// ---------------------------------------------------------------------------
166// Tests
167// ---------------------------------------------------------------------------
168
// Unit tests for the crate's top-level API and its private helpers.
#[cfg(test)]
mod tests {
    use super::*;

    // -- Encoding enum --

    // Each variant reports its canonical WHATWG name.
    #[test]
    fn encoding_names() {
        assert_eq!(Encoding::Utf8.name(), "UTF-8");
        assert_eq!(Encoding::Utf16Be.name(), "UTF-16BE");
        assert_eq!(Encoding::Utf16Le.name(), "UTF-16LE");
    }

    // -- Label lookup --

    // UTF-8 labels resolve regardless of letter case.
    #[test]
    fn lookup_utf8_labels() {
        assert_eq!(lookup("utf-8"), Some(Encoding::Utf8));
        assert_eq!(lookup("UTF-8"), Some(Encoding::Utf8));
        assert_eq!(lookup("utf8"), Some(Encoding::Utf8));
        assert_eq!(lookup("Utf8"), Some(Encoding::Utf8));
        assert_eq!(lookup("unicode-1-1-utf-8"), Some(Encoding::Utf8));
        assert_eq!(lookup("x-unicode20utf8"), Some(Encoding::Utf8));
    }

    // UTF-16 labels; the bare "utf-16" label maps to little-endian.
    #[test]
    fn lookup_utf16_labels() {
        assert_eq!(lookup("utf-16be"), Some(Encoding::Utf16Be));
        assert_eq!(lookup("UTF-16BE"), Some(Encoding::Utf16Be));
        assert_eq!(lookup("unicodefffe"), Some(Encoding::Utf16Be));
        assert_eq!(lookup("utf-16le"), Some(Encoding::Utf16Le));
        assert_eq!(lookup("utf-16"), Some(Encoding::Utf16Le));
        assert_eq!(lookup("unicode"), Some(Encoding::Utf16Le));
        assert_eq!(lookup("ucs-2"), Some(Encoding::Utf16Le));
        assert_eq!(lookup("iso-10646-ucs-2"), Some(Encoding::Utf16Le));
    }

    // Surrounding ASCII whitespace is ignored before matching.
    #[test]
    fn lookup_with_whitespace() {
        assert_eq!(lookup("  utf-8  "), Some(Encoding::Utf8));
        assert_eq!(lookup("\tutf-8\n"), Some(Encoding::Utf8));
        assert_eq!(lookup("\r\nutf-16le\r\n"), Some(Encoding::Utf16Le));
    }

    // Unknown, empty and whitespace-only labels yield `None`.
    #[test]
    fn lookup_unknown() {
        assert_eq!(lookup("latin1"), None);
        assert_eq!(lookup(""), None);
        assert_eq!(lookup("   "), None);
        assert_eq!(lookup("utf-99"), None);
    }

    // -- BOM sniffing --

    // A UTF-8 BOM (EF BB BF) is detected and stripped.
    #[test]
    fn bom_utf8() {
        let (enc, rest) = bom_sniff(&[0xEF, 0xBB, 0xBF, 0x41]);
        assert_eq!(enc, Some(Encoding::Utf8));
        assert_eq!(rest, &[0x41]);
    }

    // A UTF-16BE BOM (FE FF) is detected and stripped.
    #[test]
    fn bom_utf16be() {
        let (enc, rest) = bom_sniff(&[0xFE, 0xFF, 0x00, 0x41]);
        assert_eq!(enc, Some(Encoding::Utf16Be));
        assert_eq!(rest, &[0x00, 0x41]);
    }

    // A UTF-16LE BOM (FF FE) is detected and stripped.
    #[test]
    fn bom_utf16le() {
        let (enc, rest) = bom_sniff(&[0xFF, 0xFE, 0x41, 0x00]);
        assert_eq!(enc, Some(Encoding::Utf16Le));
        assert_eq!(rest, &[0x41, 0x00]);
    }

    // Input without a BOM is returned untouched.
    #[test]
    fn bom_none() {
        let data = [0x41, 0x42, 0x43];
        let (enc, rest) = bom_sniff(&data);
        assert_eq!(enc, None);
        assert_eq!(rest, &data);
    }

    // Empty input has no BOM.
    #[test]
    fn bom_empty() {
        let (enc, rest) = bom_sniff(&[]);
        assert_eq!(enc, None);
        assert_eq!(rest, &[] as &[u8]);
    }

    // A truncated UTF-8 BOM (only two of three bytes) is not a BOM.
    #[test]
    fn bom_short() {
        let (enc, rest) = bom_sniff(&[0xEF, 0xBB]);
        assert_eq!(enc, None);
        assert_eq!(rest, &[0xEF, 0xBB]);
    }

    // -- Top-level decode --

    // Plain ASCII decodes unchanged as UTF-8.
    #[test]
    fn decode_utf8_basic() {
        assert_eq!(decode(b"Hello", Encoding::Utf8), "Hello");
    }

    // An invalid UTF-8 byte becomes U+FFFD in replacement mode.
    #[test]
    fn decode_utf8_invalid_replaces() {
        assert_eq!(decode(&[0xFF], Encoding::Utf8), "\u{FFFD}");
    }

    // Little-endian code unit 0x0041 decodes to "A".
    #[test]
    fn decode_utf16le_basic() {
        assert_eq!(decode(&[0x41, 0x00], Encoding::Utf16Le), "A");
    }

    // Big-endian code unit 0x0041 decodes to "A".
    #[test]
    fn decode_utf16be_basic() {
        assert_eq!(decode(&[0x00, 0x41], Encoding::Utf16Be), "A");
    }

    // -- Top-level decode_strict --

    // Valid input decodes successfully in fatal mode.
    #[test]
    fn decode_strict_valid() {
        assert_eq!(decode_strict(b"Hello", Encoding::Utf8).unwrap(), "Hello");
    }

    // Invalid input is an error in fatal mode.
    #[test]
    fn decode_strict_invalid() {
        assert!(decode_strict(&[0xFF], Encoding::Utf8).is_err());
    }

    // -- Top-level encode --

    // ASCII text encodes to the same bytes in UTF-8.
    #[test]
    fn encode_utf8_basic() {
        assert_eq!(encode("Hello", Encoding::Utf8).unwrap(), b"Hello");
    }

    // Both UTF-16 variants are rejected, and the error names the encoding.
    #[test]
    fn encode_utf16_not_supported() {
        assert!(matches!(
            encode("Hello", Encoding::Utf16Le),
            Err(EncodingError::EncodeNotSupported {
                encoding: "UTF-16LE"
            })
        ));
        assert!(matches!(
            encode("Hello", Encoding::Utf16Be),
            Err(EncodingError::EncodeNotSupported {
                encoding: "UTF-16BE"
            })
        ));
    }

    // -- Trim helpers --

    // Trimming handles padded, unpadded, empty and all-whitespace input.
    #[test]
    fn trim_ascii_whitespace_basic() {
        assert_eq!(trim_ascii_whitespace("  hello  "), "hello");
        assert_eq!(trim_ascii_whitespace("hello"), "hello");
        assert_eq!(trim_ascii_whitespace(""), "");
        assert_eq!(trim_ascii_whitespace("   "), "");
        assert_eq!(trim_ascii_whitespace("\t\nhello\r\n"), "hello");
    }

    // Case-insensitive equality still requires equal length and content.
    #[test]
    fn ascii_eq_ignore_case_basic() {
        assert!(ascii_eq_ignore_case("utf-8", "UTF-8"));
        assert!(ascii_eq_ignore_case("Utf-8", "utf-8"));
        assert!(!ascii_eq_ignore_case("utf-8", "utf-9"));
        assert!(!ascii_eq_ignore_case("utf-8", "utf-8x"));
    }
}