we (web engine): Experimental web browser project to understand the limits of Claude
1//! Encoding sniffing per WHATWG Encoding Standard and HTML spec.
2//!
3//! Detects character encoding from BOM, HTTP Content-Type charset, or HTML meta prescan.
4
5use crate::{bom_sniff, lookup, Encoding};
6
/// Records which signal decided the encoding returned by `sniff_encoding`.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum EncodingSource {
    /// A Byte Order Mark was found at the very start of the bytes.
    Bom,
    /// The `charset` parameter of the HTTP `Content-Type` header was used.
    HttpHeader,
    /// A `<meta charset>` or `<meta http-equiv="Content-Type">` declaration
    /// was found while prescanning the document.
    MetaPrescan,
    /// Nothing matched; the fallback encoding (Windows-1252 for HTML) was used.
    Default,
}
19
20/// Sniff the encoding of a byte stream.
21///
22/// Priority order per spec: BOM > HTTP Content-Type charset > HTML meta prescan > default.
23/// The default encoding is Windows-1252 per WHATWG spec for HTML.
24pub fn sniff_encoding(bytes: &[u8], http_content_type: Option<&str>) -> (Encoding, EncodingSource) {
25 // 1. BOM sniffing (highest priority)
26 let (bom_enc, _) = bom_sniff(bytes);
27 if let Some(enc) = bom_enc {
28 return (enc, EncodingSource::Bom);
29 }
30
31 // 2. HTTP Content-Type charset
32 if let Some(ct) = http_content_type {
33 if let Some(enc) = extract_charset_from_content_type(ct) {
34 return (enc, EncodingSource::HttpHeader);
35 }
36 }
37
38 // 3. HTML meta prescan (first 1024 bytes)
39 if let Some(enc) = meta_prescan(bytes) {
40 return (enc, EncodingSource::MetaPrescan);
41 }
42
43 // 4. Default: Windows-1252
44 (Encoding::Windows1252, EncodingSource::Default)
45}
46
47/// Extract charset from an HTTP `Content-Type` header value.
48///
49/// Handles formats like:
50/// - `text/html; charset=utf-8`
51/// - `text/html; charset="utf-8"`
52/// - `text/html;charset=utf-8` (no space)
53///
54/// Per WHATWG spec, the charset parameter value is looked up via the encoding label table.
55/// Returns `None` for UTF-16BE/LE from HTTP headers per spec (those are only valid via BOM).
56fn extract_charset_from_content_type(content_type: &str) -> Option<Encoding> {
57 let charset_value = extract_charset_value(content_type)?;
58 let enc = lookup(charset_value)?;
59 // Per WHATWG: if the encoding from HTTP is UTF-16BE or UTF-16LE, use UTF-8 instead
60 Some(match enc {
61 Encoding::Utf16Be | Encoding::Utf16Le => Encoding::Utf8,
62 other => other,
63 })
64}
65
/// Extract the raw (not yet looked-up) `charset` parameter value from a
/// `Content-Type` header value.
///
/// The parameter name is matched ASCII case-insensitively and must begin a
/// parameter, i.e. be directly preceded by `;` or ASCII whitespace. Substrings
/// such as the `charset` inside `x-charset-test` or `xcharset=` are ignored,
/// and an occurrence that is not followed by `=` is skipped so that a later
/// real `charset=` parameter is still found.
///
/// Returns the value with surrounding double quotes removed. Returns `None`
/// when no `charset` parameter exists, its value is empty, or a quoted value
/// is missing its closing quote.
fn extract_charset_value(content_type: &str) -> Option<&str> {
    const NAME: &str = "charset";

    // ASCII lowercasing never changes byte lengths, so offsets found in
    // `lower` are valid (and on char boundaries) in `content_type`.
    let lower = content_type.to_ascii_lowercase();
    let mut search_from = 0;

    while let Some(rel) = lower[search_from..].find(NAME) {
        let name_start = search_from + rel;
        let name_end = name_start + NAME.len();
        search_from = name_end;

        // The name must start a parameter: preceded by ';' or whitespace.
        let starts_parameter = content_type.as_bytes()[..name_start]
            .last()
            .map_or(false, |&b| b == b';' || b.is_ascii_whitespace());
        if !starts_parameter {
            continue;
        }

        // Optional whitespace, then '='; otherwise try the next occurrence.
        let after_eq = match content_type[name_end..].trim_start().strip_prefix('=') {
            Some(rest) => rest.trim_start(),
            None => continue,
        };

        return if let Some(quoted) = after_eq.strip_prefix('"') {
            // Quoted value: everything up to the closing quote.
            quoted.find('"').map(|end| &quoted[..end])
        } else {
            // Unquoted value: terminated by whitespace, ';', or end of string.
            let end = after_eq
                .find(|c: char| c == ';' || c.is_ascii_whitespace())
                .unwrap_or(after_eq.len());
            if end == 0 {
                None
            } else {
                Some(&after_eq[..end])
            }
        };
    }

    None
}
94
95/// Prescan the first 1024 bytes of an HTML document for encoding declarations.
96///
97/// Per the HTML spec "prescan a byte stream to determine its encoding" algorithm.
98/// Looks for:
99/// - `<meta charset="...">`
100/// - `<meta http-equiv="Content-Type" content="...;charset=...">`
101fn meta_prescan(bytes: &[u8]) -> Option<Encoding> {
102 let limit = bytes.len().min(1024);
103 let bytes = &bytes[..limit];
104 let mut pos = 0;
105
106 while pos < bytes.len() {
107 // Skip until we find '<'
108 if bytes[pos] != b'<' {
109 pos += 1;
110 continue;
111 }
112 pos += 1;
113 if pos >= bytes.len() {
114 break;
115 }
116
117 // Check for comment "<!--"
118 if bytes[pos..].starts_with(b"!--") {
119 pos += 3;
120 // Skip until "-->"
121 while pos + 2 < bytes.len() {
122 if bytes[pos] == b'-' && bytes[pos + 1] == b'-' && bytes[pos + 2] == b'>' {
123 pos += 3;
124 break;
125 }
126 pos += 1;
127 }
128 continue;
129 }
130
131 // Check for "<meta" (case-insensitive)
132 if pos + 4 <= bytes.len() && ascii_ci_eq(&bytes[pos..pos + 4], b"meta") {
133 let after_meta = pos + 4;
134 if after_meta < bytes.len() && is_space_or_slash(bytes[after_meta]) {
135 if let Some((enc, _tag_end)) = parse_meta_tag(bytes, after_meta) {
136 // Per spec: override UTF-16 from meta to UTF-8
137 let enc = match enc {
138 Encoding::Utf16Be | Encoding::Utf16Le => Encoding::Utf8,
139 other => other,
140 };
141 return Some(enc);
142 } else {
143 pos = skip_tag(bytes, after_meta);
144 continue;
145 }
146 }
147 }
148
149 // Skip other tags (like <!DOCTYPE>, <html>, etc.)
150 if bytes[pos..].starts_with(b"!") || bytes[pos..].starts_with(b"/") || bytes[pos] == b'?' {
151 pos = skip_tag(bytes, pos);
152 continue;
153 }
154
155 // Check if it's a letter (start of a tag name)
156 if pos < bytes.len() && bytes[pos].is_ascii_alphabetic() {
157 pos = skip_tag(bytes, pos);
158 continue;
159 }
160
161 // Not a tag, continue
162 }
163
164 None
165}
166
167/// Parse attributes of a `<meta` tag looking for charset declarations.
168///
169/// Returns the encoding and position after the tag if found.
170fn parse_meta_tag(bytes: &[u8], start: usize) -> Option<(Encoding, usize)> {
171 let mut pos = start;
172 let mut got_pragma = false;
173 let mut need_pragma: Option<bool> = None;
174 let mut charset: Option<Encoding> = None;
175
176 loop {
177 // Skip whitespace
178 while pos < bytes.len() && bytes[pos].is_ascii_whitespace() {
179 pos += 1;
180 }
181 if pos >= bytes.len() {
182 break;
183 }
184 // End of tag?
185 if bytes[pos] == b'>'
186 || (bytes[pos] == b'/' && pos + 1 < bytes.len() && bytes[pos + 1] == b'>')
187 {
188 break;
189 }
190
191 let (attr_name, attr_value, new_pos) = parse_attribute(bytes, pos)?;
192 pos = new_pos;
193
194 if ascii_ci_eq_str(&attr_name, "http-equiv") {
195 if ascii_ci_eq_str(&attr_value, "content-type") {
196 got_pragma = true;
197 }
198 } else if ascii_ci_eq_str(&attr_name, "content") {
199 if let Some(charset_val) = extract_charset_from_meta_content(&attr_value) {
200 if let Some(enc) = lookup(&charset_val) {
201 charset = Some(enc);
202 need_pragma = Some(true);
203 }
204 }
205 } else if ascii_ci_eq_str(&attr_name, "charset") {
206 if let Some(enc) = lookup(&attr_value) {
207 charset = Some(enc);
208 need_pragma = Some(false);
209 }
210 }
211 }
212
213 // Determine result per spec
214 match (need_pragma, charset) {
215 (Some(true), Some(enc)) if got_pragma => Some((enc, pos)),
216 (Some(false), Some(enc)) => Some((enc, pos)),
217 _ => None,
218 }
219}
220
221/// Parse a single HTML attribute (name=value pair).
222///
223/// Returns (name, value, new_position). Returns None if we hit end of tag or input.
224fn parse_attribute(bytes: &[u8], start: usize) -> Option<(String, String, usize)> {
225 let mut pos = start;
226
227 // Skip whitespace
228 while pos < bytes.len() && bytes[pos].is_ascii_whitespace() {
229 pos += 1;
230 }
231 if pos >= bytes.len() || bytes[pos] == b'>' {
232 return None;
233 }
234
235 // Read attribute name
236 let name_start = pos;
237 while pos < bytes.len()
238 && bytes[pos] != b'='
239 && bytes[pos] != b'>'
240 && !bytes[pos].is_ascii_whitespace()
241 && bytes[pos] != b'/'
242 {
243 pos += 1;
244 }
245 let name = to_ascii_lowercase(&bytes[name_start..pos]);
246 if name.is_empty() {
247 return None;
248 }
249
250 // Skip whitespace
251 while pos < bytes.len() && bytes[pos].is_ascii_whitespace() {
252 pos += 1;
253 }
254
255 // No value
256 if pos >= bytes.len() || bytes[pos] != b'=' {
257 return Some((name, String::new(), pos));
258 }
259 pos += 1; // skip '='
260
261 // Skip whitespace
262 while pos < bytes.len() && bytes[pos].is_ascii_whitespace() {
263 pos += 1;
264 }
265
266 if pos >= bytes.len() {
267 return Some((name, String::new(), pos));
268 }
269
270 // Read value
271 let value;
272 if bytes[pos] == b'"' || bytes[pos] == b'\'' {
273 let quote = bytes[pos];
274 pos += 1;
275 let val_start = pos;
276 while pos < bytes.len() && bytes[pos] != quote {
277 pos += 1;
278 }
279 value = to_ascii_lowercase(&bytes[val_start..pos]);
280 if pos < bytes.len() {
281 pos += 1; // skip closing quote
282 }
283 } else {
284 let val_start = pos;
285 while pos < bytes.len()
286 && !bytes[pos].is_ascii_whitespace()
287 && bytes[pos] != b'>'
288 && bytes[pos] != b';'
289 {
290 pos += 1;
291 }
292 value = to_ascii_lowercase(&bytes[val_start..pos]);
293 }
294
295 Some((name, value, pos))
296}
297
/// Extract the charset label from a `<meta ... content="...">` attribute value.
///
/// Implements the HTML Standard's "extracting a character encoding from a
/// meta element" steps for strings such as `text/html; charset=utf-8`:
///
/// - `charset` is matched ASCII case-insensitively; an occurrence that is not
///   followed (after optional whitespace) by `=` is skipped and the search
///   continues with the next occurrence;
/// - a value opened with `"` or `'` runs to the matching closing quote; if the
///   quote is never closed, nothing is returned;
/// - an unquoted value runs to the first `;`, ASCII whitespace, or the end.
///
/// Returns the label as an owned string, or `None` if no non-empty label is found.
fn extract_charset_from_meta_content(content: &str) -> Option<String> {
    const NEEDLE: &str = "charset";

    // ASCII lowercasing keeps byte offsets aligned with `content`.
    let lower = content.to_ascii_lowercase();
    let mut search_from = 0;

    // Find the first "charset" that is actually followed by '='.
    let value = loop {
        let found = search_from + lower[search_from..].find(NEEDLE)?;
        search_from = found + NEEDLE.len();
        if let Some(after_eq) = content[search_from..].trim_start().strip_prefix('=') {
            break after_eq.trim_start();
        }
    };

    let first = value.chars().next()?;
    if first == '"' || first == '\'' {
        let inner = &value[1..];
        // An unmatched quote yields nothing, per spec.
        let end = inner.find(first)?;
        let label = inner[..end].trim();
        if label.is_empty() {
            None
        } else {
            Some(label.to_string())
        }
    } else {
        let end = value
            .find(|c: char| c == ';' || c.is_ascii_whitespace())
            .unwrap_or(value.len());
        if end == 0 {
            None
        } else {
            Some(value[..end].to_string())
        }
    }
}
334
/// Advance past the current tag by locating its closing `>`.
///
/// Returns the index just after the first `>` at or after `start`. If there
/// is no such byte, returns the end of `bytes` (or `start` itself when it
/// already lies beyond the end).
fn skip_tag(bytes: &[u8], start: usize) -> usize {
    let closing = bytes
        .get(start..)
        .and_then(|tail| tail.iter().position(|&byte| byte == b'>'));

    match closing {
        Some(offset) => start + offset + 1,
        None => start.max(bytes.len()),
    }
}
347
/// True for the bytes that may directly follow a tag name: ASCII whitespace
/// (space, tab, LF, FF, CR) or `/`.
fn is_space_or_slash(b: u8) -> bool {
    matches!(b, b'/' | b' ' | b'\t' | b'\n' | b'\r' | 0x0C)
}
351
/// Compare two byte slices for equality, ignoring ASCII letter case.
/// Slices of different length never match.
fn ascii_ci_eq(a: &[u8], b: &[u8]) -> bool {
    a.eq_ignore_ascii_case(b)
}
355
/// Compare two strings for equality, ignoring ASCII letter case only
/// (non-ASCII characters must match exactly).
fn ascii_ci_eq_str(a: &str, b: &str) -> bool {
    a.as_bytes().eq_ignore_ascii_case(b.as_bytes())
}
359
/// Build a `String` from raw bytes, lowercasing ASCII letters.
///
/// Each byte becomes the `char` with the same numeric value, so bytes
/// `0x80..=0xFF` map to U+0080..=U+00FF rather than being decoded as UTF-8.
fn to_ascii_lowercase(bytes: &[u8]) -> String {
    let mut lowered = String::with_capacity(bytes.len());
    for &byte in bytes {
        lowered.push(char::from(byte.to_ascii_lowercase()));
    }
    lowered
}
366
#[cfg(test)]
mod tests {
    use super::*;

    /// Run `sniff_encoding` and check both the encoding and where it came from.
    fn assert_sniffed(
        bytes: &[u8],
        content_type: Option<&str>,
        expected_enc: Encoding,
        expected_src: EncodingSource,
    ) {
        let (enc, src) = sniff_encoding(bytes, content_type);
        assert_eq!(enc, expected_enc);
        assert_eq!(src, expected_src);
    }

    /// `padding` spaces followed by a `<meta charset="utf-8">` declaration.
    fn padded_meta(padding: usize) -> Vec<u8> {
        let mut html = vec![b' '; padding];
        html.extend_from_slice(br#"<meta charset="utf-8">"#);
        html
    }

    // -----------------------------------------------------------------------
    // sniff_encoding: BOM priority
    // -----------------------------------------------------------------------

    #[test]
    fn sniff_bom_utf8() {
        assert_sniffed(b"\xEF\xBB\xBFHello", None, Encoding::Utf8, EncodingSource::Bom);
    }

    #[test]
    fn sniff_bom_utf16be() {
        assert_sniffed(b"\xFE\xFF\x00A", None, Encoding::Utf16Be, EncodingSource::Bom);
    }

    #[test]
    fn sniff_bom_utf16le() {
        assert_sniffed(b"\xFF\xFEA\x00", None, Encoding::Utf16Le, EncodingSource::Bom);
    }

    #[test]
    fn sniff_bom_beats_http_header() {
        assert_sniffed(
            b"\xEF\xBB\xBFHello",
            Some("text/html; charset=iso-8859-2"),
            Encoding::Utf8,
            EncodingSource::Bom,
        );
    }

    #[test]
    fn sniff_bom_beats_meta() {
        let bytes = [&[0xEF, 0xBB, 0xBF][..], &br#"<meta charset="iso-8859-5">"#[..]].concat();
        assert_sniffed(&bytes, None, Encoding::Utf8, EncodingSource::Bom);
    }

    // -----------------------------------------------------------------------
    // sniff_encoding: HTTP Content-Type priority
    // -----------------------------------------------------------------------

    #[test]
    fn sniff_http_charset_utf8() {
        assert_sniffed(
            b"Hello",
            Some("text/html; charset=utf-8"),
            Encoding::Utf8,
            EncodingSource::HttpHeader,
        );
    }

    #[test]
    fn sniff_http_charset_quoted() {
        assert_sniffed(
            b"Hello",
            Some("text/html; charset=\"utf-8\""),
            Encoding::Utf8,
            EncodingSource::HttpHeader,
        );
    }

    #[test]
    fn sniff_http_charset_case_insensitive() {
        assert_sniffed(
            b"Hello",
            Some("text/html; Charset=UTF-8"),
            Encoding::Utf8,
            EncodingSource::HttpHeader,
        );
    }

    #[test]
    fn sniff_http_charset_no_space() {
        assert_sniffed(
            b"Hello",
            Some("text/html;charset=utf-8"),
            Encoding::Utf8,
            EncodingSource::HttpHeader,
        );
    }

    #[test]
    fn sniff_http_charset_windows_1252() {
        assert_sniffed(
            b"Hello",
            Some("text/html; charset=windows-1252"),
            Encoding::Windows1252,
            EncodingSource::HttpHeader,
        );
    }

    #[test]
    fn sniff_http_charset_iso_8859_1_maps_to_1252() {
        assert_sniffed(
            b"Hello",
            Some("text/html; charset=iso-8859-1"),
            Encoding::Windows1252,
            EncodingSource::HttpHeader,
        );
    }

    #[test]
    fn sniff_http_utf16_override_to_utf8() {
        // A UTF-16 label in the HTTP header is reported as UTF-8.
        assert_sniffed(
            b"Hello",
            Some("text/html; charset=utf-16le"),
            Encoding::Utf8,
            EncodingSource::HttpHeader,
        );
    }

    #[test]
    fn sniff_http_no_charset() {
        // No charset parameter: falls through to the default.
        assert_sniffed(
            b"Hello",
            Some("text/html"),
            Encoding::Windows1252,
            EncodingSource::Default,
        );
    }

    #[test]
    fn sniff_http_beats_meta() {
        assert_sniffed(
            br#"<meta charset="iso-8859-5">"#,
            Some("text/html; charset=utf-8"),
            Encoding::Utf8,
            EncodingSource::HttpHeader,
        );
    }

    // -----------------------------------------------------------------------
    // sniff_encoding: meta prescan
    // -----------------------------------------------------------------------

    #[test]
    fn sniff_meta_charset() {
        assert_sniffed(
            br#"<meta charset="utf-8">"#,
            None,
            Encoding::Utf8,
            EncodingSource::MetaPrescan,
        );
    }

    #[test]
    fn sniff_meta_charset_single_quotes() {
        assert_sniffed(
            b"<meta charset='utf-8'>",
            None,
            Encoding::Utf8,
            EncodingSource::MetaPrescan,
        );
    }

    #[test]
    fn sniff_meta_http_equiv() {
        assert_sniffed(
            br#"<meta http-equiv="Content-Type" content="text/html; charset=utf-8">"#,
            None,
            Encoding::Utf8,
            EncodingSource::MetaPrescan,
        );
    }

    #[test]
    fn sniff_meta_http_equiv_case_insensitive() {
        assert_sniffed(
            br#"<META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=UTF-8">"#,
            None,
            Encoding::Utf8,
            EncodingSource::MetaPrescan,
        );
    }

    #[test]
    fn sniff_meta_charset_legacy_encoding() {
        assert_sniffed(
            br#"<meta charset="windows-1251">"#,
            None,
            Encoding::Windows1251,
            EncodingSource::MetaPrescan,
        );
    }

    #[test]
    fn sniff_meta_utf16_override_to_utf8() {
        assert_sniffed(
            br#"<meta charset="utf-16le">"#,
            None,
            Encoding::Utf8,
            EncodingSource::MetaPrescan,
        );
    }

    #[test]
    fn sniff_meta_with_doctype_and_html() {
        assert_sniffed(
            br#"<!DOCTYPE html><html><head><meta charset="utf-8"></head>"#,
            None,
            Encoding::Utf8,
            EncodingSource::MetaPrescan,
        );
    }

    #[test]
    fn sniff_meta_with_comment_before() {
        assert_sniffed(
            br#"<!-- comment --><meta charset="utf-8">"#,
            None,
            Encoding::Utf8,
            EncodingSource::MetaPrescan,
        );
    }

    #[test]
    fn sniff_meta_beyond_1024_bytes_not_found() {
        assert_sniffed(
            &padded_meta(1024),
            None,
            Encoding::Windows1252,
            EncodingSource::Default,
        );
    }

    #[test]
    fn sniff_meta_within_1024_bytes() {
        assert_sniffed(
            &padded_meta(1000),
            None,
            Encoding::Utf8,
            EncodingSource::MetaPrescan,
        );
    }

    // -----------------------------------------------------------------------
    // sniff_encoding: default fallback
    // -----------------------------------------------------------------------

    #[test]
    fn sniff_default_no_signals() {
        assert_sniffed(
            b"Hello world",
            None,
            Encoding::Windows1252,
            EncodingSource::Default,
        );
    }

    #[test]
    fn sniff_default_empty() {
        assert_sniffed(b"", None, Encoding::Windows1252, EncodingSource::Default);
    }

    // -----------------------------------------------------------------------
    // extract_charset_from_content_type
    // -----------------------------------------------------------------------

    #[test]
    fn extract_charset_basic() {
        let found = extract_charset_from_content_type("text/html; charset=utf-8");
        assert_eq!(found, Some(Encoding::Utf8));
    }

    #[test]
    fn extract_charset_quoted() {
        let found = extract_charset_from_content_type("text/html; charset=\"utf-8\"");
        assert_eq!(found, Some(Encoding::Utf8));
    }

    #[test]
    fn extract_charset_no_space() {
        let found = extract_charset_from_content_type("text/html;charset=utf-8");
        assert_eq!(found, Some(Encoding::Utf8));
    }

    #[test]
    fn extract_charset_uppercase() {
        let found = extract_charset_from_content_type("text/html; CHARSET=UTF-8");
        assert_eq!(found, Some(Encoding::Utf8));
    }

    #[test]
    fn extract_charset_missing() {
        let found = extract_charset_from_content_type("text/html");
        assert_eq!(found, None);
    }

    #[test]
    fn extract_charset_empty_value() {
        let found = extract_charset_from_content_type("text/html; charset=");
        assert_eq!(found, None);
    }

    #[test]
    fn extract_charset_unknown_encoding() {
        let found = extract_charset_from_content_type("text/html; charset=bogus");
        assert_eq!(found, None);
    }

    #[test]
    fn extract_charset_with_extra_params() {
        let found =
            extract_charset_from_content_type("text/html; charset=utf-8; boundary=something");
        assert_eq!(found, Some(Encoding::Utf8));
    }

    // -----------------------------------------------------------------------
    // meta_prescan internals
    // -----------------------------------------------------------------------

    #[test]
    fn meta_prescan_charset_attr() {
        let found = meta_prescan(br#"<meta charset="iso-8859-2">"#);
        assert_eq!(found, Some(Encoding::Iso8859_2));
    }

    #[test]
    fn meta_prescan_http_equiv_content() {
        let found = meta_prescan(
            br#"<meta http-equiv="content-type" content="text/html; charset=koi8-r">"#,
        );
        assert_eq!(found, Some(Encoding::Koi8R));
    }

    #[test]
    fn meta_prescan_no_meta() {
        let found = meta_prescan(b"<html><head><title>Test</title></head></html>");
        assert_eq!(found, None);
    }

    #[test]
    fn meta_prescan_meta_without_charset() {
        let found = meta_prescan(br#"<meta name="viewport" content="width=device-width">"#);
        assert_eq!(found, None);
    }

    #[test]
    fn meta_prescan_http_equiv_without_content() {
        let found = meta_prescan(br#"<meta http-equiv="content-type">"#);
        assert_eq!(found, None);
    }

    #[test]
    fn meta_prescan_content_without_http_equiv() {
        // The content attribute names a charset, but without
        // http-equiv="content-type" the pragma requirement is not met.
        let found = meta_prescan(br#"<meta content="text/html; charset=utf-8">"#);
        assert_eq!(found, None);
    }

    #[test]
    fn meta_prescan_skips_comments() {
        let found =
            meta_prescan(br#"<!-- <meta charset="iso-8859-5"> --><meta charset="utf-8">"#);
        assert_eq!(found, Some(Encoding::Utf8));
    }

    #[test]
    fn meta_prescan_unquoted_charset() {
        let found = meta_prescan(b"<meta charset=utf-8>");
        assert_eq!(found, Some(Encoding::Utf8));
    }

    #[test]
    fn meta_prescan_self_closing() {
        let found = meta_prescan(br#"<meta charset="utf-8" />"#);
        assert_eq!(found, Some(Encoding::Utf8));
    }
}