we (web engine): Experimental web browser project to understand the limits of Claude
1//! Encoding sniffing per WHATWG Encoding Standard and HTML spec.
2//!
3//! Detects character encoding from BOM, HTTP Content-Type charset, or HTML meta prescan.
4
5use crate::{bom_sniff, lookup, Encoding};
6
/// Records which signal decided the encoding returned by [`sniff_encoding`].
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum EncodingSource {
    /// A Byte Order Mark was found at the very start of the byte stream.
    Bom,
    /// The `charset` parameter of the HTTP `Content-Type` header supplied it.
    HttpHeader,
    /// The prescan found a `<meta charset>` or
    /// `<meta http-equiv="Content-Type">` declaration.
    MetaPrescan,
    /// Nothing matched; the fallback (Windows-1252 for HTML) was used.
    Default,
}
19
20/// Sniff the encoding of a byte stream.
21///
22/// Priority order per spec: BOM > HTTP Content-Type charset > HTML meta prescan > default.
23/// The default encoding is Windows-1252 per WHATWG spec for HTML.
24pub fn sniff_encoding(bytes: &[u8], http_content_type: Option<&str>) -> (Encoding, EncodingSource) {
25 // 1. BOM sniffing (highest priority)
26 let (bom_enc, _) = bom_sniff(bytes);
27 if let Some(enc) = bom_enc {
28 return (enc, EncodingSource::Bom);
29 }
30
31 // 2. HTTP Content-Type charset
32 if let Some(ct) = http_content_type {
33 if let Some(enc) = extract_charset_from_content_type(ct) {
34 return (enc, EncodingSource::HttpHeader);
35 }
36 }
37
38 // 3. HTML meta prescan (first 1024 bytes)
39 if let Some(enc) = meta_prescan(bytes) {
40 return (enc, EncodingSource::MetaPrescan);
41 }
42
43 // 4. Default: Windows-1252
44 (Encoding::Windows1252, EncodingSource::Default)
45}
46
47/// Extract charset from an HTTP `Content-Type` header value.
48///
49/// Handles formats like:
50/// - `text/html; charset=utf-8`
51/// - `text/html; charset="utf-8"`
52/// - `text/html;charset=utf-8` (no space)
53///
54/// Per WHATWG spec, the charset parameter value is looked up via the encoding label table.
55/// Returns `None` for UTF-16BE/LE from HTTP headers per spec (those are only valid via BOM).
56fn extract_charset_from_content_type(content_type: &str) -> Option<Encoding> {
57 let charset_value = extract_charset_value(content_type)?;
58 let enc = lookup(charset_value)?;
59 // Per WHATWG: if the encoding from HTTP is UTF-16BE or UTF-16LE, use UTF-8 instead
60 Some(match enc {
61 Encoding::Utf16Be | Encoding::Utf16Le => Encoding::Utf8,
62 other => other,
63 })
64}
65
/// Extract the raw (unresolved) `charset` parameter value from a Content-Type string.
///
/// The parameter name is matched ASCII case-insensitively. An occurrence of
/// `charset` only counts as the parameter name when it
///
/// - sits at the start of the string or directly after `;` or ASCII
///   whitespace, and
/// - is followed (after optional whitespace) by `=`.
///
/// Occurrences that fail either check (for example inside the media type
/// `text/x-charset`, or as another parameter's value) are skipped and the
/// search continues with the next occurrence.
///
/// For the first real `charset=` parameter:
/// - a value starting with `"` runs to the next `"`; `None` if it is unterminated;
/// - an unquoted value runs to the next `;`, ASCII whitespace, or end of
///   string; `None` if it is empty.
///
/// The returned slice borrows from `content_type` and keeps its original case.
fn extract_charset_value(content_type: &str) -> Option<&str> {
    const NAME: &str = "charset";

    // ASCII lowercasing never changes byte lengths, so offsets found in
    // `lower` are valid (and on char boundaries) in `content_type` as well.
    let lower = content_type.to_ascii_lowercase();
    let raw = content_type.as_bytes();
    let mut search_from = 0;

    while let Some(rel) = lower[search_from..].find(NAME) {
        let name_start = search_from + rel;
        let name_end = name_start + NAME.len();
        search_from = name_end;

        // Reject matches embedded in a longer token such as `x-charset`.
        let at_boundary = name_start == 0 || {
            let prev = raw[name_start - 1];
            prev == b';' || prev.is_ascii_whitespace()
        };
        if !at_boundary {
            continue;
        }

        // Optional whitespace, then a mandatory '='; otherwise this was not
        // the parameter name, so keep looking.
        let Some(after_eq) = content_type[name_end..].trim_start().strip_prefix('=') else {
            continue;
        };
        let after_eq = after_eq.trim_start();

        return if let Some(inner) = after_eq.strip_prefix('"') {
            // Quoted value: everything up to the closing quote.
            inner.find('"').map(|end| &inner[..end])
        } else {
            // Unquoted value: terminated by whitespace, ';', or end of string.
            let end = after_eq
                .find(|c: char| c == ';' || c.is_ascii_whitespace())
                .unwrap_or(after_eq.len());
            (end > 0).then(|| &after_eq[..end])
        };
    }

    None
}
94
95/// Prescan the first 1024 bytes of an HTML document for encoding declarations.
96///
97/// Per the HTML spec "prescan a byte stream to determine its encoding" algorithm.
98/// Looks for:
99/// - `<meta charset="...">`
100/// - `<meta http-equiv="Content-Type" content="...;charset=...">`
101fn meta_prescan(bytes: &[u8]) -> Option<Encoding> {
102 let limit = bytes.len().min(1024);
103 let bytes = &bytes[..limit];
104 let mut pos = 0;
105
106 while pos < bytes.len() {
107 // Skip until we find '<'
108 if bytes[pos] != b'<' {
109 pos += 1;
110 continue;
111 }
112 pos += 1;
113 if pos >= bytes.len() {
114 break;
115 }
116
117 // Check for comment "<!--"
118 if bytes[pos..].starts_with(b"!--") {
119 pos += 3;
120 // Skip until "-->"
121 while pos + 2 < bytes.len() {
122 if bytes[pos] == b'-' && bytes[pos + 1] == b'-' && bytes[pos + 2] == b'>' {
123 pos += 3;
124 break;
125 }
126 pos += 1;
127 }
128 continue;
129 }
130
131 // Check for "<meta" (case-insensitive)
132 if pos + 4 <= bytes.len() && ascii_ci_eq(&bytes[pos..pos + 4], b"meta") {
133 let after_meta = pos + 4;
134 if after_meta < bytes.len() && is_space_or_slash(bytes[after_meta]) {
135 if let Some((enc, _tag_end)) = parse_meta_tag(bytes, after_meta) {
136 // Per spec: override UTF-16 from meta to UTF-8
137 let enc = match enc {
138 Encoding::Utf16Be | Encoding::Utf16Le => Encoding::Utf8,
139 other => other,
140 };
141 return Some(enc);
142 } else {
143 pos = skip_tag(bytes, after_meta);
144 continue;
145 }
146 }
147 }
148
149 // Skip other tags (like <!DOCTYPE>, <html>, etc.)
150 if bytes[pos..].starts_with(b"!") || bytes[pos..].starts_with(b"/") || bytes[pos] == b'?' {
151 pos = skip_tag(bytes, pos);
152 continue;
153 }
154
155 // Check if it's a letter (start of a tag name)
156 if pos < bytes.len() && bytes[pos].is_ascii_alphabetic() {
157 pos = skip_tag(bytes, pos);
158 continue;
159 }
160
161 // Not a tag, continue
162 }
163
164 None
165}
166
167/// Parse attributes of a `<meta` tag looking for charset declarations.
168///
169/// Returns the encoding and position after the tag if found.
170fn parse_meta_tag(bytes: &[u8], start: usize) -> Option<(Encoding, usize)> {
171 let mut pos = start;
172 let mut got_pragma = false;
173 let mut need_pragma: Option<bool> = None;
174 let mut charset: Option<Encoding> = None;
175
176 loop {
177 // Skip whitespace
178 while pos < bytes.len() && bytes[pos].is_ascii_whitespace() {
179 pos += 1;
180 }
181 if pos >= bytes.len() {
182 break;
183 }
184 // End of tag?
185 if bytes[pos] == b'>'
186 || (bytes[pos] == b'/' && pos + 1 < bytes.len() && bytes[pos + 1] == b'>')
187 {
188 break;
189 }
190
191 let Some((attr_name, attr_value, new_pos)) = parse_attribute(bytes, pos) else {
192 break;
193 };
194 pos = new_pos;
195
196 if ascii_ci_eq_str(&attr_name, "http-equiv") {
197 if ascii_ci_eq_str(&attr_value, "content-type") {
198 got_pragma = true;
199 }
200 } else if ascii_ci_eq_str(&attr_name, "content") {
201 if let Some(charset_val) = extract_charset_from_meta_content(&attr_value) {
202 if let Some(enc) = lookup(&charset_val) {
203 charset = Some(enc);
204 need_pragma = Some(true);
205 }
206 }
207 } else if ascii_ci_eq_str(&attr_name, "charset") {
208 if let Some(enc) = lookup(&attr_value) {
209 charset = Some(enc);
210 need_pragma = Some(false);
211 }
212 }
213 }
214
215 // Determine result per spec
216 match (need_pragma, charset) {
217 (Some(true), Some(enc)) if got_pragma => Some((enc, pos)),
218 (Some(false), Some(enc)) => Some((enc, pos)),
219 _ => None,
220 }
221}
222
223/// Parse a single HTML attribute (name=value pair).
224///
225/// Returns (name, value, new_position). Returns None if we hit end of tag or input.
226fn parse_attribute(bytes: &[u8], start: usize) -> Option<(String, String, usize)> {
227 let mut pos = start;
228
229 // Skip whitespace
230 while pos < bytes.len() && bytes[pos].is_ascii_whitespace() {
231 pos += 1;
232 }
233 if pos >= bytes.len() || bytes[pos] == b'>' {
234 return None;
235 }
236
237 // Read attribute name
238 let name_start = pos;
239 while pos < bytes.len()
240 && bytes[pos] != b'='
241 && bytes[pos] != b'>'
242 && !bytes[pos].is_ascii_whitespace()
243 && bytes[pos] != b'/'
244 {
245 pos += 1;
246 }
247 let name = to_ascii_lowercase(&bytes[name_start..pos]);
248 if name.is_empty() {
249 return None;
250 }
251
252 // Skip whitespace
253 while pos < bytes.len() && bytes[pos].is_ascii_whitespace() {
254 pos += 1;
255 }
256
257 // No value
258 if pos >= bytes.len() || bytes[pos] != b'=' {
259 return Some((name, String::new(), pos));
260 }
261 pos += 1; // skip '='
262
263 // Skip whitespace
264 while pos < bytes.len() && bytes[pos].is_ascii_whitespace() {
265 pos += 1;
266 }
267
268 if pos >= bytes.len() {
269 return Some((name, String::new(), pos));
270 }
271
272 // Read value
273 let value;
274 if bytes[pos] == b'"' || bytes[pos] == b'\'' {
275 let quote = bytes[pos];
276 pos += 1;
277 let val_start = pos;
278 while pos < bytes.len() && bytes[pos] != quote {
279 pos += 1;
280 }
281 value = to_ascii_lowercase(&bytes[val_start..pos]);
282 if pos < bytes.len() {
283 pos += 1; // skip closing quote
284 }
285 } else {
286 let val_start = pos;
287 while pos < bytes.len()
288 && !bytes[pos].is_ascii_whitespace()
289 && bytes[pos] != b'>'
290 && bytes[pos] != b';'
291 {
292 pos += 1;
293 }
294 value = to_ascii_lowercase(&bytes[val_start..pos]);
295 }
296
297 Some((name, value, pos))
298}
299
/// Extract the charset label from a `<meta content="...">` attribute value.
///
/// Implements the HTML spec's "algorithm for extracting a character encoding
/// from a meta element" up to, but not including, the label lookup:
///
/// 1. Find `charset` (ASCII case-insensitive).
/// 2. Skip ASCII whitespace; if the next character is not `=`, resume the
///    search after this occurrence.
/// 3. Skip ASCII whitespace after the `=`.
/// 4. If the value starts with `"` or `'`, it runs to the matching quote; an
///    unmatched quote yields `None`. Otherwise it runs to the next `;`, ASCII
///    whitespace, or end of string.
///
/// Returns `None` when no `charset=` is found or the value is empty. Quoted
/// values are returned with surrounding whitespace trimmed.
fn extract_charset_from_meta_content(content: &str) -> Option<String> {
    const NAME: &str = "charset";
    let is_ws = |c: char| c.is_ascii_whitespace();

    // ASCII lowercasing preserves byte offsets, so indices found in `lower`
    // can be used to slice `content`.
    let lower = content.to_ascii_lowercase();
    let mut search_from = 0;

    loop {
        let name_start = search_from + lower[search_from..].find(NAME)?;
        search_from = name_start + NAME.len();

        // Not followed by '=': this occurrence is not a declaration, try the next.
        let Some(rest) = content[search_from..]
            .trim_start_matches(is_ws)
            .strip_prefix('=')
        else {
            continue;
        };
        let rest = rest.trim_start_matches(is_ws);

        let mut chars = rest.chars();
        return match chars.next() {
            None => None,
            Some(quote @ ('"' | '\'')) => {
                let inner = chars.as_str();
                // An unmatched quote means there is no value at all.
                let end = inner.find(quote)?;
                let label = inner[..end].trim();
                (!label.is_empty()).then(|| label.to_string())
            }
            Some(_) => {
                let end = rest
                    .find(|c: char| c == ';' || c.is_ascii_whitespace())
                    .unwrap_or(rest.len());
                (end > 0).then(|| rest[..end].to_string())
            }
        };
    }
}
336
/// Return the index just past the first `>` at or after `start`.
///
/// When no `>` follows, the result is `bytes.len()`; a `start` that already
/// lies beyond the slice is returned unchanged.
fn skip_tag(bytes: &[u8], start: usize) -> usize {
    match bytes.get(start..) {
        Some(tail) => match tail.iter().position(|&b| b == b'>') {
            Some(offset) => start + offset + 1,
            None => bytes.len(),
        },
        None => start,
    }
}
349
/// True for `/` and for the five ASCII whitespace bytes
/// (space, tab, line feed, form feed, carriage return).
fn is_space_or_slash(b: u8) -> bool {
    matches!(b, b'/' | b' ' | b'\t' | b'\n' | b'\x0C' | b'\r')
}
353
/// Compare two byte slices for equality, ignoring ASCII case.
///
/// Slices of different lengths are never equal; non-ASCII bytes must match exactly.
fn ascii_ci_eq(a: &[u8], b: &[u8]) -> bool {
    a.eq_ignore_ascii_case(b)
}
357
/// Compare two strings for equality, ignoring ASCII case.
///
/// Only ASCII letters are folded; every other byte must match exactly.
fn ascii_ci_eq_str(a: &str, b: &str) -> bool {
    let (lhs, rhs) = (a.as_bytes(), b.as_bytes());
    lhs.len() == rhs.len()
        && lhs
            .iter()
            .zip(rhs)
            .all(|(x, y)| x.to_ascii_lowercase() == y.to_ascii_lowercase())
}
361
/// Build a `String` from raw bytes, lowercasing ASCII letters.
///
/// Each byte becomes the `char` with the same numeric value, so bytes
/// 0x80..=0xFF turn into U+0080..=U+00FF rather than being UTF-8 decoded.
fn to_ascii_lowercase(bytes: &[u8]) -> String {
    let mut lowered = String::with_capacity(bytes.len());
    for &byte in bytes {
        lowered.push(char::from(byte.to_ascii_lowercase()));
    }
    lowered
}
368
// Unit tests for `sniff_encoding` and the private helpers it is built from.
#[cfg(test)]
mod tests {
    use super::*;

    // -----------------------------------------------------------------------
    // sniff_encoding — BOM priority
    // -----------------------------------------------------------------------

    // A UTF-8 BOM selects UTF-8 and is reported as the source.
    #[test]
    fn sniff_bom_utf8() {
        let bytes = b"\xEF\xBB\xBFHello";
        let (enc, src) = sniff_encoding(bytes, None);
        assert_eq!(enc, Encoding::Utf8);
        assert_eq!(src, EncodingSource::Bom);
    }

    #[test]
    fn sniff_bom_utf16be() {
        let bytes = b"\xFE\xFF\x00A";
        let (enc, src) = sniff_encoding(bytes, None);
        assert_eq!(enc, Encoding::Utf16Be);
        assert_eq!(src, EncodingSource::Bom);
    }

    #[test]
    fn sniff_bom_utf16le() {
        let bytes = b"\xFF\xFEA\x00";
        let (enc, src) = sniff_encoding(bytes, None);
        assert_eq!(enc, Encoding::Utf16Le);
        assert_eq!(src, EncodingSource::Bom);
    }

    // The BOM wins even when the HTTP header names a different encoding.
    #[test]
    fn sniff_bom_beats_http_header() {
        let bytes = b"\xEF\xBB\xBFHello";
        let (enc, src) = sniff_encoding(bytes, Some("text/html; charset=iso-8859-2"));
        assert_eq!(enc, Encoding::Utf8);
        assert_eq!(src, EncodingSource::Bom);
    }

    // The BOM wins even when a <meta> declaration names a different encoding.
    #[test]
    fn sniff_bom_beats_meta() {
        let mut bytes = vec![0xEF, 0xBB, 0xBF];
        bytes.extend_from_slice(b"<meta charset=\"iso-8859-5\">");
        let (enc, src) = sniff_encoding(&bytes, None);
        assert_eq!(enc, Encoding::Utf8);
        assert_eq!(src, EncodingSource::Bom);
    }

    // -----------------------------------------------------------------------
    // sniff_encoding — HTTP Content-Type priority
    // -----------------------------------------------------------------------

    #[test]
    fn sniff_http_charset_utf8() {
        let (enc, src) = sniff_encoding(b"Hello", Some("text/html; charset=utf-8"));
        assert_eq!(enc, Encoding::Utf8);
        assert_eq!(src, EncodingSource::HttpHeader);
    }

    #[test]
    fn sniff_http_charset_quoted() {
        let (enc, src) = sniff_encoding(b"Hello", Some("text/html; charset=\"utf-8\""));
        assert_eq!(enc, Encoding::Utf8);
        assert_eq!(src, EncodingSource::HttpHeader);
    }

    // Both the parameter name and the label are matched case-insensitively.
    #[test]
    fn sniff_http_charset_case_insensitive() {
        let (enc, src) = sniff_encoding(b"Hello", Some("text/html; Charset=UTF-8"));
        assert_eq!(enc, Encoding::Utf8);
        assert_eq!(src, EncodingSource::HttpHeader);
    }

    #[test]
    fn sniff_http_charset_no_space() {
        let (enc, src) = sniff_encoding(b"Hello", Some("text/html;charset=utf-8"));
        assert_eq!(enc, Encoding::Utf8);
        assert_eq!(src, EncodingSource::HttpHeader);
    }

    #[test]
    fn sniff_http_charset_windows_1252() {
        let (enc, src) = sniff_encoding(b"Hello", Some("text/html; charset=windows-1252"));
        assert_eq!(enc, Encoding::Windows1252);
        assert_eq!(src, EncodingSource::HttpHeader);
    }

    // The label `iso-8859-1` resolves to Windows-1252 through the label table.
    #[test]
    fn sniff_http_charset_iso_8859_1_maps_to_1252() {
        let (enc, src) = sniff_encoding(b"Hello", Some("text/html; charset=iso-8859-1"));
        assert_eq!(enc, Encoding::Windows1252);
        assert_eq!(src, EncodingSource::HttpHeader);
    }

    #[test]
    fn sniff_http_utf16_override_to_utf8() {
        // Pins the current behavior: a UTF-16 label from the HTTP header is
        // replaced with UTF-8.
        // NOTE(review): the original comment attributed this to the WHATWG
        // spec; confirm the override is specified for HTTP and not only for
        // the <meta> prescan.
        let (enc, src) = sniff_encoding(b"Hello", Some("text/html; charset=utf-16le"));
        assert_eq!(enc, Encoding::Utf8);
        assert_eq!(src, EncodingSource::HttpHeader);
    }

    #[test]
    fn sniff_http_no_charset() {
        let (enc, src) = sniff_encoding(b"Hello", Some("text/html"));
        // Falls through to default
        assert_eq!(enc, Encoding::Windows1252);
        assert_eq!(src, EncodingSource::Default);
    }

    // The HTTP header wins over a conflicting <meta> declaration.
    #[test]
    fn sniff_http_beats_meta() {
        let html = b"<meta charset=\"iso-8859-5\">";
        let (enc, src) = sniff_encoding(html, Some("text/html; charset=utf-8"));
        assert_eq!(enc, Encoding::Utf8);
        assert_eq!(src, EncodingSource::HttpHeader);
    }

    // -----------------------------------------------------------------------
    // sniff_encoding — meta prescan
    // -----------------------------------------------------------------------

    #[test]
    fn sniff_meta_charset() {
        let html = b"<meta charset=\"utf-8\">";
        let (enc, src) = sniff_encoding(html, None);
        assert_eq!(enc, Encoding::Utf8);
        assert_eq!(src, EncodingSource::MetaPrescan);
    }

    #[test]
    fn sniff_meta_charset_single_quotes() {
        let html = b"<meta charset='utf-8'>";
        let (enc, src) = sniff_encoding(html, None);
        assert_eq!(enc, Encoding::Utf8);
        assert_eq!(src, EncodingSource::MetaPrescan);
    }

    #[test]
    fn sniff_meta_http_equiv() {
        let html = b"<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\">";
        let (enc, src) = sniff_encoding(html, None);
        assert_eq!(enc, Encoding::Utf8);
        assert_eq!(src, EncodingSource::MetaPrescan);
    }

    // Tag name, attribute names, and the label may all be upper case.
    #[test]
    fn sniff_meta_http_equiv_case_insensitive() {
        let html = b"<META HTTP-EQUIV=\"Content-Type\" CONTENT=\"text/html; charset=UTF-8\">";
        let (enc, src) = sniff_encoding(html, None);
        assert_eq!(enc, Encoding::Utf8);
        assert_eq!(src, EncodingSource::MetaPrescan);
    }

    #[test]
    fn sniff_meta_charset_legacy_encoding() {
        let html = b"<meta charset=\"windows-1251\">";
        let (enc, src) = sniff_encoding(html, None);
        assert_eq!(enc, Encoding::Windows1251);
        assert_eq!(src, EncodingSource::MetaPrescan);
    }

    // A UTF-16 label declared in <meta> is reported as UTF-8.
    #[test]
    fn sniff_meta_utf16_override_to_utf8() {
        let html = b"<meta charset=\"utf-16le\">";
        let (enc, src) = sniff_encoding(html, None);
        assert_eq!(enc, Encoding::Utf8);
        assert_eq!(src, EncodingSource::MetaPrescan);
    }

    // Tags preceding the <meta> are skipped.
    #[test]
    fn sniff_meta_with_doctype_and_html() {
        let html = b"<!DOCTYPE html><html><head><meta charset=\"utf-8\"></head>";
        let (enc, src) = sniff_encoding(html, None);
        assert_eq!(enc, Encoding::Utf8);
        assert_eq!(src, EncodingSource::MetaPrescan);
    }

    #[test]
    fn sniff_meta_with_comment_before() {
        let html = b"<!-- comment --><meta charset=\"utf-8\">";
        let (enc, src) = sniff_encoding(html, None);
        assert_eq!(enc, Encoding::Utf8);
        assert_eq!(src, EncodingSource::MetaPrescan);
    }

    // A declaration that starts at byte 1024 lies outside the prescan window.
    #[test]
    fn sniff_meta_beyond_1024_bytes_not_found() {
        let mut html = vec![b' '; 1024];
        html.extend_from_slice(b"<meta charset=\"utf-8\">");
        let (enc, src) = sniff_encoding(&html, None);
        assert_eq!(enc, Encoding::Windows1252);
        assert_eq!(src, EncodingSource::Default);
    }

    // A declaration that starts at byte 1000 still fits in the prescan window.
    #[test]
    fn sniff_meta_within_1024_bytes() {
        let mut html = vec![b' '; 1000];
        html.extend_from_slice(b"<meta charset=\"utf-8\">");
        let (enc, src) = sniff_encoding(&html, None);
        assert_eq!(enc, Encoding::Utf8);
        assert_eq!(src, EncodingSource::MetaPrescan);
    }

    // -----------------------------------------------------------------------
    // sniff_encoding — default fallback
    // -----------------------------------------------------------------------

    #[test]
    fn sniff_default_no_signals() {
        let (enc, src) = sniff_encoding(b"Hello world", None);
        assert_eq!(enc, Encoding::Windows1252);
        assert_eq!(src, EncodingSource::Default);
    }

    #[test]
    fn sniff_default_empty() {
        let (enc, src) = sniff_encoding(b"", None);
        assert_eq!(enc, Encoding::Windows1252);
        assert_eq!(src, EncodingSource::Default);
    }

    // -----------------------------------------------------------------------
    // extract_charset_from_content_type
    // -----------------------------------------------------------------------

    #[test]
    fn extract_charset_basic() {
        assert_eq!(
            extract_charset_from_content_type("text/html; charset=utf-8"),
            Some(Encoding::Utf8)
        );
    }

    #[test]
    fn extract_charset_quoted() {
        assert_eq!(
            extract_charset_from_content_type("text/html; charset=\"utf-8\""),
            Some(Encoding::Utf8)
        );
    }

    #[test]
    fn extract_charset_no_space() {
        assert_eq!(
            extract_charset_from_content_type("text/html;charset=utf-8"),
            Some(Encoding::Utf8)
        );
    }

    #[test]
    fn extract_charset_uppercase() {
        assert_eq!(
            extract_charset_from_content_type("text/html; CHARSET=UTF-8"),
            Some(Encoding::Utf8)
        );
    }

    #[test]
    fn extract_charset_missing() {
        assert_eq!(extract_charset_from_content_type("text/html"), None);
    }

    // `charset=` with nothing after it yields no encoding.
    #[test]
    fn extract_charset_empty_value() {
        assert_eq!(
            extract_charset_from_content_type("text/html; charset="),
            None
        );
    }

    // A label that is not in the label table yields no encoding.
    #[test]
    fn extract_charset_unknown_encoding() {
        assert_eq!(
            extract_charset_from_content_type("text/html; charset=bogus"),
            None
        );
    }

    // Parameters following the charset do not leak into its value.
    #[test]
    fn extract_charset_with_extra_params() {
        assert_eq!(
            extract_charset_from_content_type("text/html; charset=utf-8; boundary=something"),
            Some(Encoding::Utf8)
        );
    }

    // -----------------------------------------------------------------------
    // meta_prescan internals
    // -----------------------------------------------------------------------

    #[test]
    fn meta_prescan_charset_attr() {
        let html = b"<meta charset=\"iso-8859-2\">";
        assert_eq!(meta_prescan(html), Some(Encoding::Iso8859_2));
    }

    #[test]
    fn meta_prescan_http_equiv_content() {
        let html = b"<meta http-equiv=\"content-type\" content=\"text/html; charset=koi8-r\">";
        assert_eq!(meta_prescan(html), Some(Encoding::Koi8R));
    }

    #[test]
    fn meta_prescan_no_meta() {
        let html = b"<html><head><title>Test</title></head></html>";
        assert_eq!(meta_prescan(html), None);
    }

    // A <meta> whose content carries no charset declares nothing.
    #[test]
    fn meta_prescan_meta_without_charset() {
        let html = b"<meta name=\"viewport\" content=\"width=device-width\">";
        assert_eq!(meta_prescan(html), None);
    }

    #[test]
    fn meta_prescan_http_equiv_without_content() {
        let html = b"<meta http-equiv=\"content-type\">";
        assert_eq!(meta_prescan(html), None);
    }

    #[test]
    fn meta_prescan_content_without_http_equiv() {
        // charset in content but no http-equiv="content-type" -> need_pragma is true but got_pragma is false
        let html = b"<meta content=\"text/html; charset=utf-8\">";
        assert_eq!(meta_prescan(html), None);
    }

    // A <meta> inside a comment is ignored; the one after the comment is used.
    #[test]
    fn meta_prescan_skips_comments() {
        let html = b"<!-- <meta charset=\"iso-8859-5\"> --><meta charset=\"utf-8\">";
        assert_eq!(meta_prescan(html), Some(Encoding::Utf8));
    }

    #[test]
    fn meta_prescan_unquoted_charset() {
        let html = b"<meta charset=utf-8>";
        assert_eq!(meta_prescan(html), Some(Encoding::Utf8));
    }

    #[test]
    fn meta_prescan_self_closing() {
        let html = b"<meta charset=\"utf-8\" />";
        assert_eq!(meta_prescan(html), Some(Encoding::Utf8));
    }
}
715}