crates/encoding/src/sniff.rs at encoding-sniffing

pierrelf.com / we
fork atom
we (web engine): Experimental web browser project to understand the limits of Claude
fork atom
we / crates / encoding / src / sniff.rs
at encoding-sniffing 715 lines 23 kB view raw
wrap content
pierrelf.com Review fix: use break instead of ? in parse_meta_tag attribute loop 4d ago
a82f370b
  1//! Encoding sniffing per WHATWG Encoding Standard and HTML spec.
  2//!
  3//! Detects character encoding from BOM, HTTP Content-Type charset, or HTML meta prescan.
  4
  5use crate::{bom_sniff, lookup, Encoding};
  6
  7/// How the encoding was determined.
  8#[derive(Debug, Clone, Copy, PartialEq, Eq)]
  9pub enum EncodingSource {
 10    /// Byte Order Mark at the start of the byte stream.
 11    Bom,
 12    /// `charset` parameter from the HTTP `Content-Type` header.
 13    HttpHeader,
 14    /// `<meta charset>` or `<meta http-equiv="Content-Type">` prescan.
 15    MetaPrescan,
 16    /// Default fallback (Windows-1252 for HTML).
 17    Default,
 18}
 19
 20/// Sniff the encoding of a byte stream.
 21///
 22/// Priority order per spec: BOM > HTTP Content-Type charset > HTML meta prescan > default.
 23/// The default encoding is Windows-1252 per WHATWG spec for HTML.
 24pub fn sniff_encoding(bytes: &[u8], http_content_type: Option<&str>) -> (Encoding, EncodingSource) {
 25    // 1. BOM sniffing (highest priority)
 26    let (bom_enc, _) = bom_sniff(bytes);
 27    if let Some(enc) = bom_enc {
 28        return (enc, EncodingSource::Bom);
 29    }
 30
 31    // 2. HTTP Content-Type charset
 32    if let Some(ct) = http_content_type {
 33        if let Some(enc) = extract_charset_from_content_type(ct) {
 34            return (enc, EncodingSource::HttpHeader);
 35        }
 36    }
 37
 38    // 3. HTML meta prescan (first 1024 bytes)
 39    if let Some(enc) = meta_prescan(bytes) {
 40        return (enc, EncodingSource::MetaPrescan);
 41    }
 42
 43    // 4. Default: Windows-1252
 44    (Encoding::Windows1252, EncodingSource::Default)
 45}
 46
 47/// Extract charset from an HTTP `Content-Type` header value.
 48///
 49/// Handles formats like:
 50/// - `text/html; charset=utf-8`
 51/// - `text/html; charset="utf-8"`
 52/// - `text/html;charset=utf-8` (no space)
 53///
 54/// Per WHATWG spec, the charset parameter value is looked up via the encoding label table.
 55/// Returns `None` for UTF-16BE/LE from HTTP headers per spec (those are only valid via BOM).
 56fn extract_charset_from_content_type(content_type: &str) -> Option<Encoding> {
 57    let charset_value = extract_charset_value(content_type)?;
 58    let enc = lookup(charset_value)?;
 59    // Per WHATWG: if the encoding from HTTP is UTF-16BE or UTF-16LE, use UTF-8 instead
 60    Some(match enc {
 61        Encoding::Utf16Be | Encoding::Utf16Le => Encoding::Utf8,
 62        other => other,
 63    })
 64}
 65
 66/// Extract the raw charset value from a Content-Type string.
 67fn extract_charset_value(content_type: &str) -> Option<&str> {
 68    // Find "charset" (case-insensitive) after a ';'
 69    let lower = content_type.to_ascii_lowercase();
 70    let idx = lower.find("charset")?;
 71
 72    // Must be preceded by ';' or whitespace (or be in parameters section)
 73    let after_charset = &content_type[idx + 7..];
 74    // Skip optional whitespace then '='
 75    let after_charset = after_charset.trim_start();
 76    let after_eq = after_charset.strip_prefix('=')?;
 77    let after_eq = after_eq.trim_start();
 78
 79    if let Some(inner) = after_eq.strip_prefix('"') {
 80        // Quoted value
 81        let end = inner.find('"')?;
 82        Some(&inner[..end])
 83    } else {
 84        // Unquoted value: terminated by whitespace, ';', or end of string
 85        let end = after_eq
 86            .find(|c: char| c == ';' || c.is_ascii_whitespace())
 87            .unwrap_or(after_eq.len());
 88        if end == 0 {
 89            return None;
 90        }
 91        Some(&after_eq[..end])
 92    }
 93}
 94
 95/// Prescan the first 1024 bytes of an HTML document for encoding declarations.
 96///
 97/// Per the HTML spec "prescan a byte stream to determine its encoding" algorithm.
 98/// Looks for:
 99/// - `<meta charset="...">`
100/// - `<meta http-equiv="Content-Type" content="...;charset=...">`
101fn meta_prescan(bytes: &[u8]) -> Option<Encoding> {
102    let limit = bytes.len().min(1024);
103    let bytes = &bytes[..limit];
104    let mut pos = 0;
105
106    while pos < bytes.len() {
107        // Skip until we find '<'
108        if bytes[pos] != b'<' {
109            pos += 1;
110            continue;
111        }
112        pos += 1;
113        if pos >= bytes.len() {
114            break;
115        }
116
117        // Check for comment "<!--"
118        if bytes[pos..].starts_with(b"!--") {
119            pos += 3;
120            // Skip until "-->"
121            while pos + 2 < bytes.len() {
122                if bytes[pos] == b'-' && bytes[pos + 1] == b'-' && bytes[pos + 2] == b'>' {
123                    pos += 3;
124                    break;
125                }
126                pos += 1;
127            }
128            continue;
129        }
130
131        // Check for "<meta" (case-insensitive)
132        if pos + 4 <= bytes.len() && ascii_ci_eq(&bytes[pos..pos + 4], b"meta") {
133            let after_meta = pos + 4;
134            if after_meta < bytes.len() && is_space_or_slash(bytes[after_meta]) {
135                if let Some((enc, _tag_end)) = parse_meta_tag(bytes, after_meta) {
136                    // Per spec: override UTF-16 from meta to UTF-8
137                    let enc = match enc {
138                        Encoding::Utf16Be | Encoding::Utf16Le => Encoding::Utf8,
139                        other => other,
140                    };
141                    return Some(enc);
142                } else {
143                    pos = skip_tag(bytes, after_meta);
144                    continue;
145                }
146            }
147        }
148
149        // Skip other tags (like <!DOCTYPE>, <html>, etc.)
150        if bytes[pos..].starts_with(b"!") || bytes[pos..].starts_with(b"/") || bytes[pos] == b'?' {
151            pos = skip_tag(bytes, pos);
152            continue;
153        }
154
155        // Check if it's a letter (start of a tag name)
156        if pos < bytes.len() && bytes[pos].is_ascii_alphabetic() {
157            pos = skip_tag(bytes, pos);
158            continue;
159        }
160
161        // Not a tag, continue
162    }
163
164    None
165}
166
167/// Parse attributes of a `<meta` tag looking for charset declarations.
168///
169/// Returns the encoding and position after the tag if found.
170fn parse_meta_tag(bytes: &[u8], start: usize) -> Option<(Encoding, usize)> {
171    let mut pos = start;
172    let mut got_pragma = false;
173    let mut need_pragma: Option<bool> = None;
174    let mut charset: Option<Encoding> = None;
175
176    loop {
177        // Skip whitespace
178        while pos < bytes.len() && bytes[pos].is_ascii_whitespace() {
179            pos += 1;
180        }
181        if pos >= bytes.len() {
182            break;
183        }
184        // End of tag?
185        if bytes[pos] == b'>'
186            || (bytes[pos] == b'/' && pos + 1 < bytes.len() && bytes[pos + 1] == b'>')
187        {
188            break;
189        }
190
191        let Some((attr_name, attr_value, new_pos)) = parse_attribute(bytes, pos) else {
192            break;
193        };
194        pos = new_pos;
195
196        if ascii_ci_eq_str(&attr_name, "http-equiv") {
197            if ascii_ci_eq_str(&attr_value, "content-type") {
198                got_pragma = true;
199            }
200        } else if ascii_ci_eq_str(&attr_name, "content") {
201            if let Some(charset_val) = extract_charset_from_meta_content(&attr_value) {
202                if let Some(enc) = lookup(&charset_val) {
203                    charset = Some(enc);
204                    need_pragma = Some(true);
205                }
206            }
207        } else if ascii_ci_eq_str(&attr_name, "charset") {
208            if let Some(enc) = lookup(&attr_value) {
209                charset = Some(enc);
210                need_pragma = Some(false);
211            }
212        }
213    }
214
215    // Determine result per spec
216    match (need_pragma, charset) {
217        (Some(true), Some(enc)) if got_pragma => Some((enc, pos)),
218        (Some(false), Some(enc)) => Some((enc, pos)),
219        _ => None,
220    }
221}
222
223/// Parse a single HTML attribute (name=value pair).
224///
225/// Returns (name, value, new_position). Returns None if we hit end of tag or input.
226fn parse_attribute(bytes: &[u8], start: usize) -> Option<(String, String, usize)> {
227    let mut pos = start;
228
229    // Skip whitespace
230    while pos < bytes.len() && bytes[pos].is_ascii_whitespace() {
231        pos += 1;
232    }
233    if pos >= bytes.len() || bytes[pos] == b'>' {
234        return None;
235    }
236
237    // Read attribute name
238    let name_start = pos;
239    while pos < bytes.len()
240        && bytes[pos] != b'='
241        && bytes[pos] != b'>'
242        && !bytes[pos].is_ascii_whitespace()
243        && bytes[pos] != b'/'
244    {
245        pos += 1;
246    }
247    let name = to_ascii_lowercase(&bytes[name_start..pos]);
248    if name.is_empty() {
249        return None;
250    }
251
252    // Skip whitespace
253    while pos < bytes.len() && bytes[pos].is_ascii_whitespace() {
254        pos += 1;
255    }
256
257    // No value
258    if pos >= bytes.len() || bytes[pos] != b'=' {
259        return Some((name, String::new(), pos));
260    }
261    pos += 1; // skip '='
262
263    // Skip whitespace
264    while pos < bytes.len() && bytes[pos].is_ascii_whitespace() {
265        pos += 1;
266    }
267
268    if pos >= bytes.len() {
269        return Some((name, String::new(), pos));
270    }
271
272    // Read value
273    let value;
274    if bytes[pos] == b'"' || bytes[pos] == b'\'' {
275        let quote = bytes[pos];
276        pos += 1;
277        let val_start = pos;
278        while pos < bytes.len() && bytes[pos] != quote {
279            pos += 1;
280        }
281        value = to_ascii_lowercase(&bytes[val_start..pos]);
282        if pos < bytes.len() {
283            pos += 1; // skip closing quote
284        }
285    } else {
286        let val_start = pos;
287        while pos < bytes.len()
288            && !bytes[pos].is_ascii_whitespace()
289            && bytes[pos] != b'>'
290            && bytes[pos] != b';'
291        {
292            pos += 1;
293        }
294        value = to_ascii_lowercase(&bytes[val_start..pos]);
295    }
296
297    Some((name, value, pos))
298}
299
/// Extract the charset label from a `<meta content="...">` attribute value.
///
/// Follows the HTML "extracting a character encoding from a meta element" steps:
/// find `charset` (ASCII case-insensitive), skip whitespace, and require `=`. When
/// an occurrence of `charset` is not followed by `=`, the search resumes after it,
/// so `text/html; charset-name; charset=utf-8` still yields `utf-8`.
///
/// After `=` and optional whitespace the value is either:
/// - quoted with `"` or `'`: the trimmed text up to the matching quote; an unmatched
///   quote yields `None`;
/// - unquoted: the text up to the next `;` or ASCII whitespace.
///
/// Returns `None` when no `charset=` is present or the extracted value is empty.
fn extract_charset_from_meta_content(content: &str) -> Option<String> {
    const KEY: &[u8] = b"charset";
    let bytes = content.as_bytes();
    let mut search_from = 0;

    loop {
        // Case-insensitive byte search; both ends of a match are ASCII, so the
        // resulting indices are valid `str` boundaries.
        let idx = search_from
            + bytes
                .get(search_from..)?
                .windows(KEY.len())
                .position(|w| w.eq_ignore_ascii_case(KEY))?;
        search_from = idx + KEY.len();

        // Not followed by '=': look for a later occurrence instead of giving up.
        let Some(rest) = content[search_from..].trim_start().strip_prefix('=') else {
            continue;
        };
        let rest = rest.trim_start();

        let mut chars = rest.chars();
        return match chars.next() {
            None => None,
            Some(quote @ ('"' | '\'')) => {
                let inner = chars.as_str();
                // An unmatched quote means there is no value at all.
                let end = inner.find(quote)?;
                let val = inner[..end].trim();
                (!val.is_empty()).then(|| val.to_string())
            }
            Some(_) => {
                // Unquoted: terminated by ';', whitespace, or end of string.
                let end = rest
                    .find(|c: char| c == ';' || c.is_ascii_whitespace())
                    .unwrap_or(rest.len());
                (end > 0).then(|| rest[..end].to_string())
            }
        };
    }
}
336
/// Advance past the remainder of a tag.
///
/// Searches `bytes` from `start` for the first `>` and returns the index just after
/// it. When no `>` is found the result is `bytes.len()`, or `start` itself if `start`
/// already lies beyond the end.
fn skip_tag(bytes: &[u8], start: usize) -> usize {
    match bytes.iter().skip(start).position(|&b| b == b'>') {
        Some(offset) => start + offset + 1,
        None => start.max(bytes.len()),
    }
}
349
/// Returns `true` for `/` and for the ASCII whitespace bytes (tab, line feed,
/// form feed, carriage return, space).
fn is_space_or_slash(b: u8) -> bool {
    matches!(b, b'/' | b'\t' | b'\n' | b'\x0C' | b'\r' | b' ')
}
353
/// Compares two byte slices for equality, ignoring ASCII case.
///
/// Slices of different lengths are never equal; non-ASCII bytes must match exactly.
fn ascii_ci_eq(a: &[u8], b: &[u8]) -> bool {
    a.eq_ignore_ascii_case(b)
}
357
/// Compares two strings for equality, ignoring ASCII case.
///
/// Only ASCII letters are folded; every other byte has to match exactly.
fn ascii_ci_eq_str(a: &str, b: &str) -> bool {
    a.len() == b.len()
        && a.bytes()
            .zip(b.bytes())
            .all(|(x, y)| x.eq_ignore_ascii_case(&y))
}
361
/// Builds a `String` from raw bytes, lowercasing ASCII letters.
///
/// Each byte becomes the `char` with the same numeric value, so bytes outside the
/// ASCII range are carried over unchanged as U+0080..=U+00FF.
fn to_ascii_lowercase(bytes: &[u8]) -> String {
    let mut lowered = String::with_capacity(bytes.len());
    for byte in bytes {
        lowered.push(char::from(byte.to_ascii_lowercase()));
    }
    lowered
}
368
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs `sniff_encoding` and asserts both the detected encoding and its source.
    fn expect_sniff(
        bytes: &[u8],
        http_content_type: Option<&str>,
        want_enc: Encoding,
        want_src: EncodingSource,
    ) {
        let (enc, src) = sniff_encoding(bytes, http_content_type);
        assert_eq!(enc, want_enc);
        assert_eq!(src, want_src);
    }

    /// Asserts the outcome of resolving a `Content-Type` header value.
    fn expect_header(content_type: &str, want: Option<Encoding>) {
        assert_eq!(extract_charset_from_content_type(content_type), want);
    }

    /// Asserts the outcome of prescanning an HTML fragment.
    fn expect_prescan(html: &[u8], want: Option<Encoding>) {
        assert_eq!(meta_prescan(html), want);
    }

    // -----------------------------------------------------------------------
    // sniff_encoding: BOM priority
    // -----------------------------------------------------------------------

    #[test]
    fn sniff_bom_utf8() {
        expect_sniff(b"\xEF\xBB\xBFHello", None, Encoding::Utf8, EncodingSource::Bom);
    }

    #[test]
    fn sniff_bom_utf16be() {
        expect_sniff(b"\xFE\xFF\x00A", None, Encoding::Utf16Be, EncodingSource::Bom);
    }

    #[test]
    fn sniff_bom_utf16le() {
        expect_sniff(b"\xFF\xFEA\x00", None, Encoding::Utf16Le, EncodingSource::Bom);
    }

    #[test]
    fn sniff_bom_beats_http_header() {
        expect_sniff(
            b"\xEF\xBB\xBFHello",
            Some("text/html; charset=iso-8859-2"),
            Encoding::Utf8,
            EncodingSource::Bom,
        );
    }

    #[test]
    fn sniff_bom_beats_meta() {
        let mut bytes = vec![0xEF, 0xBB, 0xBF];
        bytes.extend_from_slice(b"<meta charset=\"iso-8859-5\">");
        expect_sniff(&bytes, None, Encoding::Utf8, EncodingSource::Bom);
    }

    // -----------------------------------------------------------------------
    // sniff_encoding: HTTP Content-Type priority
    // -----------------------------------------------------------------------

    #[test]
    fn sniff_http_charset_utf8() {
        expect_sniff(
            b"Hello",
            Some("text/html; charset=utf-8"),
            Encoding::Utf8,
            EncodingSource::HttpHeader,
        );
    }

    #[test]
    fn sniff_http_charset_quoted() {
        expect_sniff(
            b"Hello",
            Some("text/html; charset=\"utf-8\""),
            Encoding::Utf8,
            EncodingSource::HttpHeader,
        );
    }

    #[test]
    fn sniff_http_charset_case_insensitive() {
        expect_sniff(
            b"Hello",
            Some("text/html; Charset=UTF-8"),
            Encoding::Utf8,
            EncodingSource::HttpHeader,
        );
    }

    #[test]
    fn sniff_http_charset_no_space() {
        expect_sniff(
            b"Hello",
            Some("text/html;charset=utf-8"),
            Encoding::Utf8,
            EncodingSource::HttpHeader,
        );
    }

    #[test]
    fn sniff_http_charset_windows_1252() {
        expect_sniff(
            b"Hello",
            Some("text/html; charset=windows-1252"),
            Encoding::Windows1252,
            EncodingSource::HttpHeader,
        );
    }

    #[test]
    fn sniff_http_charset_iso_8859_1_maps_to_1252() {
        expect_sniff(
            b"Hello",
            Some("text/html; charset=iso-8859-1"),
            Encoding::Windows1252,
            EncodingSource::HttpHeader,
        );
    }

    #[test]
    fn sniff_http_utf16_override_to_utf8() {
        // A UTF-16 label in the HTTP header is replaced by UTF-8.
        expect_sniff(
            b"Hello",
            Some("text/html; charset=utf-16le"),
            Encoding::Utf8,
            EncodingSource::HttpHeader,
        );
    }

    #[test]
    fn sniff_http_no_charset() {
        // No charset parameter: falls through to the default.
        expect_sniff(
            b"Hello",
            Some("text/html"),
            Encoding::Windows1252,
            EncodingSource::Default,
        );
    }

    #[test]
    fn sniff_http_beats_meta() {
        expect_sniff(
            b"<meta charset=\"iso-8859-5\">",
            Some("text/html; charset=utf-8"),
            Encoding::Utf8,
            EncodingSource::HttpHeader,
        );
    }

    // -----------------------------------------------------------------------
    // sniff_encoding: meta prescan
    // -----------------------------------------------------------------------

    #[test]
    fn sniff_meta_charset() {
        expect_sniff(
            b"<meta charset=\"utf-8\">",
            None,
            Encoding::Utf8,
            EncodingSource::MetaPrescan,
        );
    }

    #[test]
    fn sniff_meta_charset_single_quotes() {
        expect_sniff(
            b"<meta charset='utf-8'>",
            None,
            Encoding::Utf8,
            EncodingSource::MetaPrescan,
        );
    }

    #[test]
    fn sniff_meta_http_equiv() {
        expect_sniff(
            b"<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\">",
            None,
            Encoding::Utf8,
            EncodingSource::MetaPrescan,
        );
    }

    #[test]
    fn sniff_meta_http_equiv_case_insensitive() {
        expect_sniff(
            b"<META HTTP-EQUIV=\"Content-Type\" CONTENT=\"text/html; charset=UTF-8\">",
            None,
            Encoding::Utf8,
            EncodingSource::MetaPrescan,
        );
    }

    #[test]
    fn sniff_meta_charset_legacy_encoding() {
        expect_sniff(
            b"<meta charset=\"windows-1251\">",
            None,
            Encoding::Windows1251,
            EncodingSource::MetaPrescan,
        );
    }

    #[test]
    fn sniff_meta_utf16_override_to_utf8() {
        expect_sniff(
            b"<meta charset=\"utf-16le\">",
            None,
            Encoding::Utf8,
            EncodingSource::MetaPrescan,
        );
    }

    #[test]
    fn sniff_meta_with_doctype_and_html() {
        expect_sniff(
            b"<!DOCTYPE html><html><head><meta charset=\"utf-8\"></head>",
            None,
            Encoding::Utf8,
            EncodingSource::MetaPrescan,
        );
    }

    #[test]
    fn sniff_meta_with_comment_before() {
        expect_sniff(
            b"<!-- comment --><meta charset=\"utf-8\">",
            None,
            Encoding::Utf8,
            EncodingSource::MetaPrescan,
        );
    }

    #[test]
    fn sniff_meta_beyond_1024_bytes_not_found() {
        let mut html = vec![b' '; 1024];
        html.extend_from_slice(b"<meta charset=\"utf-8\">");
        expect_sniff(&html, None, Encoding::Windows1252, EncodingSource::Default);
    }

    #[test]
    fn sniff_meta_within_1024_bytes() {
        let mut html = vec![b' '; 1000];
        html.extend_from_slice(b"<meta charset=\"utf-8\">");
        expect_sniff(&html, None, Encoding::Utf8, EncodingSource::MetaPrescan);
    }

    // -----------------------------------------------------------------------
    // sniff_encoding: default fallback
    // -----------------------------------------------------------------------

    #[test]
    fn sniff_default_no_signals() {
        expect_sniff(
            b"Hello world",
            None,
            Encoding::Windows1252,
            EncodingSource::Default,
        );
    }

    #[test]
    fn sniff_default_empty() {
        expect_sniff(b"", None, Encoding::Windows1252, EncodingSource::Default);
    }

    // -----------------------------------------------------------------------
    // extract_charset_from_content_type
    // -----------------------------------------------------------------------

    #[test]
    fn extract_charset_basic() {
        expect_header("text/html; charset=utf-8", Some(Encoding::Utf8));
    }

    #[test]
    fn extract_charset_quoted() {
        expect_header("text/html; charset=\"utf-8\"", Some(Encoding::Utf8));
    }

    #[test]
    fn extract_charset_no_space() {
        expect_header("text/html;charset=utf-8", Some(Encoding::Utf8));
    }

    #[test]
    fn extract_charset_uppercase() {
        expect_header("text/html; CHARSET=UTF-8", Some(Encoding::Utf8));
    }

    #[test]
    fn extract_charset_missing() {
        expect_header("text/html", None);
    }

    #[test]
    fn extract_charset_empty_value() {
        expect_header("text/html; charset=", None);
    }

    #[test]
    fn extract_charset_unknown_encoding() {
        expect_header("text/html; charset=bogus", None);
    }

    #[test]
    fn extract_charset_with_extra_params() {
        expect_header(
            "text/html; charset=utf-8; boundary=something",
            Some(Encoding::Utf8),
        );
    }

    // -----------------------------------------------------------------------
    // meta_prescan internals
    // -----------------------------------------------------------------------

    #[test]
    fn meta_prescan_charset_attr() {
        expect_prescan(b"<meta charset=\"iso-8859-2\">", Some(Encoding::Iso8859_2));
    }

    #[test]
    fn meta_prescan_http_equiv_content() {
        expect_prescan(
            b"<meta http-equiv=\"content-type\" content=\"text/html; charset=koi8-r\">",
            Some(Encoding::Koi8R),
        );
    }

    #[test]
    fn meta_prescan_no_meta() {
        expect_prescan(b"<html><head><title>Test</title></head></html>", None);
    }

    #[test]
    fn meta_prescan_meta_without_charset() {
        expect_prescan(
            b"<meta name=\"viewport\" content=\"width=device-width\">",
            None,
        );
    }

    #[test]
    fn meta_prescan_http_equiv_without_content() {
        expect_prescan(b"<meta http-equiv=\"content-type\">", None);
    }

    #[test]
    fn meta_prescan_content_without_http_equiv() {
        // A charset inside `content` needs http-equiv="content-type" on the same tag;
        // without that pragma the declaration is ignored.
        expect_prescan(b"<meta content=\"text/html; charset=utf-8\">", None);
    }

    #[test]
    fn meta_prescan_skips_comments() {
        expect_prescan(
            b"<!-- <meta charset=\"iso-8859-5\"> --><meta charset=\"utf-8\">",
            Some(Encoding::Utf8),
        );
    }

    #[test]
    fn meta_prescan_unquoted_charset() {
        expect_prescan(b"<meta charset=utf-8>", Some(Encoding::Utf8));
    }

    #[test]
    fn meta_prescan_self_closing() {
        expect_prescan(b"<meta charset=\"utf-8\" />", Some(Encoding::Utf8));
    }
}