crates/encoding/src/sniff.rs at flexbox

pierrelf.com / we
fork atom
we (web engine): Experimental web browser project to understand the limits of Claude
fork atom
we / crates / encoding / src / sniff.rs
at flexbox 713 lines 23 kB view raw
wrap content
pierrelf.com Implement encoding sniffing: BOM, HTTP charset, meta prescan 8d ago
276dfeb1
  1//! Encoding sniffing per WHATWG Encoding Standard and HTML spec.
  2//!
  3//! Detects character encoding from BOM, HTTP Content-Type charset, or HTML meta prescan.
  4
  5use crate::{bom_sniff, lookup, Encoding};
  6
  7/// How the encoding was determined.
  8#[derive(Debug, Clone, Copy, PartialEq, Eq)]
  9pub enum EncodingSource {
 10    /// Byte Order Mark at the start of the byte stream.
 11    Bom,
 12    /// `charset` parameter from the HTTP `Content-Type` header.
 13    HttpHeader,
 14    /// `<meta charset>` or `<meta http-equiv="Content-Type">` prescan.
 15    MetaPrescan,
 16    /// Default fallback (Windows-1252 for HTML).
 17    Default,
 18}
 19
 20/// Sniff the encoding of a byte stream.
 21///
 22/// Priority order per spec: BOM > HTTP Content-Type charset > HTML meta prescan > default.
 23/// The default encoding is Windows-1252 per WHATWG spec for HTML.
 24pub fn sniff_encoding(bytes: &[u8], http_content_type: Option<&str>) -> (Encoding, EncodingSource) {
 25    // 1. BOM sniffing (highest priority)
 26    let (bom_enc, _) = bom_sniff(bytes);
 27    if let Some(enc) = bom_enc {
 28        return (enc, EncodingSource::Bom);
 29    }
 30
 31    // 2. HTTP Content-Type charset
 32    if let Some(ct) = http_content_type {
 33        if let Some(enc) = extract_charset_from_content_type(ct) {
 34            return (enc, EncodingSource::HttpHeader);
 35        }
 36    }
 37
 38    // 3. HTML meta prescan (first 1024 bytes)
 39    if let Some(enc) = meta_prescan(bytes) {
 40        return (enc, EncodingSource::MetaPrescan);
 41    }
 42
 43    // 4. Default: Windows-1252
 44    (Encoding::Windows1252, EncodingSource::Default)
 45}
 46
 47/// Extract charset from an HTTP `Content-Type` header value.
 48///
 49/// Handles formats like:
 50/// - `text/html; charset=utf-8`
 51/// - `text/html; charset="utf-8"`
 52/// - `text/html;charset=utf-8` (no space)
 53///
 54/// Per WHATWG spec, the charset parameter value is looked up via the encoding label table.
 55/// Returns `None` for UTF-16BE/LE from HTTP headers per spec (those are only valid via BOM).
 56fn extract_charset_from_content_type(content_type: &str) -> Option<Encoding> {
 57    let charset_value = extract_charset_value(content_type)?;
 58    let enc = lookup(charset_value)?;
 59    // Per WHATWG: if the encoding from HTTP is UTF-16BE or UTF-16LE, use UTF-8 instead
 60    Some(match enc {
 61        Encoding::Utf16Be | Encoding::Utf16Le => Encoding::Utf8,
 62        other => other,
 63    })
 64}
 65
 66/// Extract the raw charset value from a Content-Type string.
 67fn extract_charset_value(content_type: &str) -> Option<&str> {
 68    // Find "charset" (case-insensitive) after a ';'
 69    let lower = content_type.to_ascii_lowercase();
 70    let idx = lower.find("charset")?;
 71
 72    // Must be preceded by ';' or whitespace (or be in parameters section)
 73    let after_charset = &content_type[idx + 7..];
 74    // Skip optional whitespace then '='
 75    let after_charset = after_charset.trim_start();
 76    let after_eq = after_charset.strip_prefix('=')?;
 77    let after_eq = after_eq.trim_start();
 78
 79    if let Some(inner) = after_eq.strip_prefix('"') {
 80        // Quoted value
 81        let end = inner.find('"')?;
 82        Some(&inner[..end])
 83    } else {
 84        // Unquoted value: terminated by whitespace, ';', or end of string
 85        let end = after_eq
 86            .find(|c: char| c == ';' || c.is_ascii_whitespace())
 87            .unwrap_or(after_eq.len());
 88        if end == 0 {
 89            return None;
 90        }
 91        Some(&after_eq[..end])
 92    }
 93}
 94
 95/// Prescan the first 1024 bytes of an HTML document for encoding declarations.
 96///
 97/// Per the HTML spec "prescan a byte stream to determine its encoding" algorithm.
 98/// Looks for:
 99/// - `<meta charset="...">`
100/// - `<meta http-equiv="Content-Type" content="...;charset=...">`
101fn meta_prescan(bytes: &[u8]) -> Option<Encoding> {
102    let limit = bytes.len().min(1024);
103    let bytes = &bytes[..limit];
104    let mut pos = 0;
105
106    while pos < bytes.len() {
107        // Skip until we find '<'
108        if bytes[pos] != b'<' {
109            pos += 1;
110            continue;
111        }
112        pos += 1;
113        if pos >= bytes.len() {
114            break;
115        }
116
117        // Check for comment "<!--"
118        if bytes[pos..].starts_with(b"!--") {
119            pos += 3;
120            // Skip until "-->"
121            while pos + 2 < bytes.len() {
122                if bytes[pos] == b'-' && bytes[pos + 1] == b'-' && bytes[pos + 2] == b'>' {
123                    pos += 3;
124                    break;
125                }
126                pos += 1;
127            }
128            continue;
129        }
130
131        // Check for "<meta" (case-insensitive)
132        if pos + 4 <= bytes.len() && ascii_ci_eq(&bytes[pos..pos + 4], b"meta") {
133            let after_meta = pos + 4;
134            if after_meta < bytes.len() && is_space_or_slash(bytes[after_meta]) {
135                if let Some((enc, _tag_end)) = parse_meta_tag(bytes, after_meta) {
136                    // Per spec: override UTF-16 from meta to UTF-8
137                    let enc = match enc {
138                        Encoding::Utf16Be | Encoding::Utf16Le => Encoding::Utf8,
139                        other => other,
140                    };
141                    return Some(enc);
142                } else {
143                    pos = skip_tag(bytes, after_meta);
144                    continue;
145                }
146            }
147        }
148
149        // Skip other tags (like <!DOCTYPE>, <html>, etc.)
150        if bytes[pos..].starts_with(b"!") || bytes[pos..].starts_with(b"/") || bytes[pos] == b'?' {
151            pos = skip_tag(bytes, pos);
152            continue;
153        }
154
155        // Check if it's a letter (start of a tag name)
156        if pos < bytes.len() && bytes[pos].is_ascii_alphabetic() {
157            pos = skip_tag(bytes, pos);
158            continue;
159        }
160
161        // Not a tag, continue
162    }
163
164    None
165}
166
167/// Parse attributes of a `<meta` tag looking for charset declarations.
168///
169/// Returns the encoding and position after the tag if found.
170fn parse_meta_tag(bytes: &[u8], start: usize) -> Option<(Encoding, usize)> {
171    let mut pos = start;
172    let mut got_pragma = false;
173    let mut need_pragma: Option<bool> = None;
174    let mut charset: Option<Encoding> = None;
175
176    loop {
177        // Skip whitespace
178        while pos < bytes.len() && bytes[pos].is_ascii_whitespace() {
179            pos += 1;
180        }
181        if pos >= bytes.len() {
182            break;
183        }
184        // End of tag?
185        if bytes[pos] == b'>'
186            || (bytes[pos] == b'/' && pos + 1 < bytes.len() && bytes[pos + 1] == b'>')
187        {
188            break;
189        }
190
191        let (attr_name, attr_value, new_pos) = parse_attribute(bytes, pos)?;
192        pos = new_pos;
193
194        if ascii_ci_eq_str(&attr_name, "http-equiv") {
195            if ascii_ci_eq_str(&attr_value, "content-type") {
196                got_pragma = true;
197            }
198        } else if ascii_ci_eq_str(&attr_name, "content") {
199            if let Some(charset_val) = extract_charset_from_meta_content(&attr_value) {
200                if let Some(enc) = lookup(&charset_val) {
201                    charset = Some(enc);
202                    need_pragma = Some(true);
203                }
204            }
205        } else if ascii_ci_eq_str(&attr_name, "charset") {
206            if let Some(enc) = lookup(&attr_value) {
207                charset = Some(enc);
208                need_pragma = Some(false);
209            }
210        }
211    }
212
213    // Determine result per spec
214    match (need_pragma, charset) {
215        (Some(true), Some(enc)) if got_pragma => Some((enc, pos)),
216        (Some(false), Some(enc)) => Some((enc, pos)),
217        _ => None,
218    }
219}
220
221/// Parse a single HTML attribute (name=value pair).
222///
223/// Returns (name, value, new_position). Returns None if we hit end of tag or input.
224fn parse_attribute(bytes: &[u8], start: usize) -> Option<(String, String, usize)> {
225    let mut pos = start;
226
227    // Skip whitespace
228    while pos < bytes.len() && bytes[pos].is_ascii_whitespace() {
229        pos += 1;
230    }
231    if pos >= bytes.len() || bytes[pos] == b'>' {
232        return None;
233    }
234
235    // Read attribute name
236    let name_start = pos;
237    while pos < bytes.len()
238        && bytes[pos] != b'='
239        && bytes[pos] != b'>'
240        && !bytes[pos].is_ascii_whitespace()
241        && bytes[pos] != b'/'
242    {
243        pos += 1;
244    }
245    let name = to_ascii_lowercase(&bytes[name_start..pos]);
246    if name.is_empty() {
247        return None;
248    }
249
250    // Skip whitespace
251    while pos < bytes.len() && bytes[pos].is_ascii_whitespace() {
252        pos += 1;
253    }
254
255    // No value
256    if pos >= bytes.len() || bytes[pos] != b'=' {
257        return Some((name, String::new(), pos));
258    }
259    pos += 1; // skip '='
260
261    // Skip whitespace
262    while pos < bytes.len() && bytes[pos].is_ascii_whitespace() {
263        pos += 1;
264    }
265
266    if pos >= bytes.len() {
267        return Some((name, String::new(), pos));
268    }
269
270    // Read value
271    let value;
272    if bytes[pos] == b'"' || bytes[pos] == b'\'' {
273        let quote = bytes[pos];
274        pos += 1;
275        let val_start = pos;
276        while pos < bytes.len() && bytes[pos] != quote {
277            pos += 1;
278        }
279        value = to_ascii_lowercase(&bytes[val_start..pos]);
280        if pos < bytes.len() {
281            pos += 1; // skip closing quote
282        }
283    } else {
284        let val_start = pos;
285        while pos < bytes.len()
286            && !bytes[pos].is_ascii_whitespace()
287            && bytes[pos] != b'>'
288            && bytes[pos] != b';'
289        {
290            pos += 1;
291        }
292        value = to_ascii_lowercase(&bytes[val_start..pos]);
293    }
294
295    Some((name, value, pos))
296}
297
/// Extract the charset label from the value of a `<meta content="...">`
/// attribute, e.g. `utf-8` from `text/html; charset=utf-8`.
///
/// Follows the HTML spec's "algorithm for extracting a character encoding
/// from a meta element":
/// - "charset" is matched ASCII case-insensitively; an occurrence that is not
///   followed (after optional whitespace) by `=` is skipped and the search
///   continues after it;
/// - a value starting with `"` or `'` runs to the matching quote, and an
///   unmatched quote yields `None`;
/// - any other value runs to the next `;`, ASCII whitespace, or end of input.
///
/// Returns `None` when no `charset=` is found or the extracted value is empty.
/// The label is returned as written (trimmed when quoted); resolving it to an
/// encoding is left to the caller.
fn extract_charset_from_meta_content(content: &str) -> Option<String> {
    const NAME: &[u8] = b"charset";
    let bytes = content.as_bytes();
    let mut search_from = 0;

    loop {
        // `position` yields `None` once no occurrence is left; `?` ends the search.
        let idx = search_from
            + bytes[search_from..]
                .windows(NAME.len())
                .position(|w| w.eq_ignore_ascii_case(NAME))?;
        // Seven ASCII bytes were matched, so this index is a char boundary.
        let after_name = idx + NAME.len();
        search_from = after_name;

        let rest = match content[after_name..].trim_start().strip_prefix('=') {
            Some(rest) => rest.trim_start(),
            // No '=' after this "charset": look for a later occurrence.
            None => continue,
        };

        let value = match rest.chars().next() {
            Some(quote) if quote == '"' || quote == '\'' => {
                let inner = &rest[1..];
                // An unmatched quote means there is no usable value.
                let end = inner.find(quote)?;
                inner[..end].trim()
            }
            _ => {
                let end = rest
                    .find(|c: char| c == ';' || c.is_ascii_whitespace())
                    .unwrap_or(rest.len());
                &rest[..end]
            }
        };

        return if value.is_empty() {
            None
        } else {
            Some(value.to_string())
        };
    }
}
334
/// Return the index just past the first `>` found at or after `start`.
///
/// When no `>` follows, the result is the end of `bytes` (or `start` itself
/// if `start` already lies beyond the end).
fn skip_tag(bytes: &[u8], start: usize) -> usize {
    bytes
        .get(start..)
        .and_then(|tail| tail.iter().position(|&b| b == b'>'))
        .map_or_else(|| start.max(bytes.len()), |offset| start + offset + 1)
}
347
/// True for `/` and for the ASCII whitespace bytes (space, tab, line feed,
/// form feed, carriage return).
fn is_space_or_slash(b: u8) -> bool {
    matches!(b, b'/' | b' ' | b'\t' | b'\n' | b'\r' | 0x0C)
}
351
/// Compare two byte slices for equality, ignoring ASCII case.
fn ascii_ci_eq(a: &[u8], b: &[u8]) -> bool {
    a.eq_ignore_ascii_case(b)
}
355
/// Compare two strings for equality, ignoring ASCII case.
fn ascii_ci_eq_str(a: &str, b: &str) -> bool {
    a.as_bytes().eq_ignore_ascii_case(b.as_bytes())
}
359
/// Build a `String` from raw bytes, ASCII-lowercasing each byte and mapping
/// it to the `char` with the same numeric value.
fn to_ascii_lowercase(bytes: &[u8]) -> String {
    let mut lowered = String::with_capacity(bytes.len());
    for byte in bytes {
        lowered.push(char::from(byte.to_ascii_lowercase()));
    }
    lowered
}
366
#[cfg(test)]
mod tests {
    use super::*;

    /// Run the sniffer and check both the encoding and the reported source.
    #[track_caller]
    fn assert_sniff(
        bytes: &[u8],
        content_type: Option<&str>,
        expected_enc: Encoding,
        expected_src: EncodingSource,
    ) {
        let (enc, src) = sniff_encoding(bytes, content_type);
        assert_eq!(enc, expected_enc);
        assert_eq!(src, expected_src);
    }

    /// `padding` spaces followed by a `<meta charset="utf-8">` tag.
    fn padded_meta(padding: usize) -> Vec<u8> {
        let mut html = vec![b' '; padding];
        html.extend_from_slice(br#"<meta charset="utf-8">"#);
        html
    }

    // ---- sniff_encoding: BOM has the highest priority ----

    #[test]
    fn sniff_bom_utf8() {
        assert_sniff(b"\xEF\xBB\xBFHello", None, Encoding::Utf8, EncodingSource::Bom);
    }

    #[test]
    fn sniff_bom_utf16be() {
        assert_sniff(b"\xFE\xFF\x00A", None, Encoding::Utf16Be, EncodingSource::Bom);
    }

    #[test]
    fn sniff_bom_utf16le() {
        assert_sniff(b"\xFF\xFEA\x00", None, Encoding::Utf16Le, EncodingSource::Bom);
    }

    #[test]
    fn sniff_bom_beats_http_header() {
        assert_sniff(
            b"\xEF\xBB\xBFHello",
            Some("text/html; charset=iso-8859-2"),
            Encoding::Utf8,
            EncodingSource::Bom,
        );
    }

    #[test]
    fn sniff_bom_beats_meta() {
        assert_sniff(
            b"\xEF\xBB\xBF<meta charset=\"iso-8859-5\">",
            None,
            Encoding::Utf8,
            EncodingSource::Bom,
        );
    }

    // ---- sniff_encoding: HTTP Content-Type comes second ----

    #[test]
    fn sniff_http_charset_utf8() {
        assert_sniff(
            b"Hello",
            Some("text/html; charset=utf-8"),
            Encoding::Utf8,
            EncodingSource::HttpHeader,
        );
    }

    #[test]
    fn sniff_http_charset_quoted() {
        assert_sniff(
            b"Hello",
            Some(r#"text/html; charset="utf-8""#),
            Encoding::Utf8,
            EncodingSource::HttpHeader,
        );
    }

    #[test]
    fn sniff_http_charset_case_insensitive() {
        assert_sniff(
            b"Hello",
            Some("text/html; Charset=UTF-8"),
            Encoding::Utf8,
            EncodingSource::HttpHeader,
        );
    }

    #[test]
    fn sniff_http_charset_no_space() {
        assert_sniff(
            b"Hello",
            Some("text/html;charset=utf-8"),
            Encoding::Utf8,
            EncodingSource::HttpHeader,
        );
    }

    #[test]
    fn sniff_http_charset_windows_1252() {
        assert_sniff(
            b"Hello",
            Some("text/html; charset=windows-1252"),
            Encoding::Windows1252,
            EncodingSource::HttpHeader,
        );
    }

    #[test]
    fn sniff_http_charset_iso_8859_1_maps_to_1252() {
        assert_sniff(
            b"Hello",
            Some("text/html; charset=iso-8859-1"),
            Encoding::Windows1252,
            EncodingSource::HttpHeader,
        );
    }

    #[test]
    fn sniff_http_utf16_override_to_utf8() {
        // A UTF-16 label in the HTTP header is replaced by UTF-8.
        assert_sniff(
            b"Hello",
            Some("text/html; charset=utf-16le"),
            Encoding::Utf8,
            EncodingSource::HttpHeader,
        );
    }

    #[test]
    fn sniff_http_no_charset() {
        // Without a charset parameter the header contributes nothing.
        assert_sniff(
            b"Hello",
            Some("text/html"),
            Encoding::Windows1252,
            EncodingSource::Default,
        );
    }

    #[test]
    fn sniff_http_beats_meta() {
        assert_sniff(
            br#"<meta charset="iso-8859-5">"#,
            Some("text/html; charset=utf-8"),
            Encoding::Utf8,
            EncodingSource::HttpHeader,
        );
    }

    // ---- sniff_encoding: meta prescan comes third ----

    #[test]
    fn sniff_meta_charset() {
        assert_sniff(
            br#"<meta charset="utf-8">"#,
            None,
            Encoding::Utf8,
            EncodingSource::MetaPrescan,
        );
    }

    #[test]
    fn sniff_meta_charset_single_quotes() {
        assert_sniff(
            b"<meta charset='utf-8'>",
            None,
            Encoding::Utf8,
            EncodingSource::MetaPrescan,
        );
    }

    #[test]
    fn sniff_meta_http_equiv() {
        assert_sniff(
            br#"<meta http-equiv="Content-Type" content="text/html; charset=utf-8">"#,
            None,
            Encoding::Utf8,
            EncodingSource::MetaPrescan,
        );
    }

    #[test]
    fn sniff_meta_http_equiv_case_insensitive() {
        assert_sniff(
            br#"<META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=UTF-8">"#,
            None,
            Encoding::Utf8,
            EncodingSource::MetaPrescan,
        );
    }

    #[test]
    fn sniff_meta_charset_legacy_encoding() {
        assert_sniff(
            br#"<meta charset="windows-1251">"#,
            None,
            Encoding::Windows1251,
            EncodingSource::MetaPrescan,
        );
    }

    #[test]
    fn sniff_meta_utf16_override_to_utf8() {
        assert_sniff(
            br#"<meta charset="utf-16le">"#,
            None,
            Encoding::Utf8,
            EncodingSource::MetaPrescan,
        );
    }

    #[test]
    fn sniff_meta_with_doctype_and_html() {
        assert_sniff(
            br#"<!DOCTYPE html><html><head><meta charset="utf-8"></head>"#,
            None,
            Encoding::Utf8,
            EncodingSource::MetaPrescan,
        );
    }

    #[test]
    fn sniff_meta_with_comment_before() {
        assert_sniff(
            br#"<!-- comment --><meta charset="utf-8">"#,
            None,
            Encoding::Utf8,
            EncodingSource::MetaPrescan,
        );
    }

    #[test]
    fn sniff_meta_beyond_1024_bytes_not_found() {
        assert_sniff(
            &padded_meta(1024),
            None,
            Encoding::Windows1252,
            EncodingSource::Default,
        );
    }

    #[test]
    fn sniff_meta_within_1024_bytes() {
        assert_sniff(
            &padded_meta(1000),
            None,
            Encoding::Utf8,
            EncodingSource::MetaPrescan,
        );
    }

    // ---- sniff_encoding: default fallback ----

    #[test]
    fn sniff_default_no_signals() {
        assert_sniff(
            b"Hello world",
            None,
            Encoding::Windows1252,
            EncodingSource::Default,
        );
    }

    #[test]
    fn sniff_default_empty() {
        assert_sniff(b"", None, Encoding::Windows1252, EncodingSource::Default);
    }

    // ---- extract_charset_from_content_type ----

    #[test]
    fn extract_charset_basic() {
        let found = extract_charset_from_content_type("text/html; charset=utf-8");
        assert_eq!(found, Some(Encoding::Utf8));
    }

    #[test]
    fn extract_charset_quoted() {
        let found = extract_charset_from_content_type(r#"text/html; charset="utf-8""#);
        assert_eq!(found, Some(Encoding::Utf8));
    }

    #[test]
    fn extract_charset_no_space() {
        let found = extract_charset_from_content_type("text/html;charset=utf-8");
        assert_eq!(found, Some(Encoding::Utf8));
    }

    #[test]
    fn extract_charset_uppercase() {
        let found = extract_charset_from_content_type("text/html; CHARSET=UTF-8");
        assert_eq!(found, Some(Encoding::Utf8));
    }

    #[test]
    fn extract_charset_missing() {
        let found = extract_charset_from_content_type("text/html");
        assert_eq!(found, None);
    }

    #[test]
    fn extract_charset_empty_value() {
        let found = extract_charset_from_content_type("text/html; charset=");
        assert_eq!(found, None);
    }

    #[test]
    fn extract_charset_unknown_encoding() {
        let found = extract_charset_from_content_type("text/html; charset=bogus");
        assert_eq!(found, None);
    }

    #[test]
    fn extract_charset_with_extra_params() {
        let found =
            extract_charset_from_content_type("text/html; charset=utf-8; boundary=something");
        assert_eq!(found, Some(Encoding::Utf8));
    }

    // ---- meta_prescan internals ----

    #[test]
    fn meta_prescan_charset_attr() {
        let found = meta_prescan(br#"<meta charset="iso-8859-2">"#);
        assert_eq!(found, Some(Encoding::Iso8859_2));
    }

    #[test]
    fn meta_prescan_http_equiv_content() {
        let found = meta_prescan(
            br#"<meta http-equiv="content-type" content="text/html; charset=koi8-r">"#,
        );
        assert_eq!(found, Some(Encoding::Koi8R));
    }

    #[test]
    fn meta_prescan_no_meta() {
        let found = meta_prescan(b"<html><head><title>Test</title></head></html>");
        assert_eq!(found, None);
    }

    #[test]
    fn meta_prescan_meta_without_charset() {
        let found = meta_prescan(br#"<meta name="viewport" content="width=device-width">"#);
        assert_eq!(found, None);
    }

    #[test]
    fn meta_prescan_http_equiv_without_content() {
        let found = meta_prescan(br#"<meta http-equiv="content-type">"#);
        assert_eq!(found, None);
    }

    #[test]
    fn meta_prescan_content_without_http_equiv() {
        // The charset sits in `content`, but without http-equiv="content-type"
        // the required pragma is missing, so nothing is reported.
        let found = meta_prescan(br#"<meta content="text/html; charset=utf-8">"#);
        assert_eq!(found, None);
    }

    #[test]
    fn meta_prescan_skips_comments() {
        let found =
            meta_prescan(br#"<!-- <meta charset="iso-8859-5"> --><meta charset="utf-8">"#);
        assert_eq!(found, Some(Encoding::Utf8));
    }

    #[test]
    fn meta_prescan_unquoted_charset() {
        let found = meta_prescan(b"<meta charset=utf-8>");
        assert_eq!(found, Some(Encoding::Utf8));
    }

    #[test]
    fn meta_prescan_self_closing() {
        let found = meta_prescan(br#"<meta charset="utf-8" />"#);
        assert_eq!(found, Some(Encoding::Utf8));
    }
}