we (web engine): Experimental web browser project to understand the limits of Claude
at encoding-sniffing 715 lines 23 kB view raw
1//! Encoding sniffing per WHATWG Encoding Standard and HTML spec. 2//! 3//! Detects character encoding from BOM, HTTP Content-Type charset, or HTML meta prescan. 4 5use crate::{bom_sniff, lookup, Encoding}; 6 7/// How the encoding was determined. 8#[derive(Debug, Clone, Copy, PartialEq, Eq)] 9pub enum EncodingSource { 10 /// Byte Order Mark at the start of the byte stream. 11 Bom, 12 /// `charset` parameter from the HTTP `Content-Type` header. 13 HttpHeader, 14 /// `<meta charset>` or `<meta http-equiv="Content-Type">` prescan. 15 MetaPrescan, 16 /// Default fallback (Windows-1252 for HTML). 17 Default, 18} 19 20/// Sniff the encoding of a byte stream. 21/// 22/// Priority order per spec: BOM > HTTP Content-Type charset > HTML meta prescan > default. 23/// The default encoding is Windows-1252 per WHATWG spec for HTML. 24pub fn sniff_encoding(bytes: &[u8], http_content_type: Option<&str>) -> (Encoding, EncodingSource) { 25 // 1. BOM sniffing (highest priority) 26 let (bom_enc, _) = bom_sniff(bytes); 27 if let Some(enc) = bom_enc { 28 return (enc, EncodingSource::Bom); 29 } 30 31 // 2. HTTP Content-Type charset 32 if let Some(ct) = http_content_type { 33 if let Some(enc) = extract_charset_from_content_type(ct) { 34 return (enc, EncodingSource::HttpHeader); 35 } 36 } 37 38 // 3. HTML meta prescan (first 1024 bytes) 39 if let Some(enc) = meta_prescan(bytes) { 40 return (enc, EncodingSource::MetaPrescan); 41 } 42 43 // 4. Default: Windows-1252 44 (Encoding::Windows1252, EncodingSource::Default) 45} 46 47/// Extract charset from an HTTP `Content-Type` header value. 48/// 49/// Handles formats like: 50/// - `text/html; charset=utf-8` 51/// - `text/html; charset="utf-8"` 52/// - `text/html;charset=utf-8` (no space) 53/// 54/// Per WHATWG spec, the charset parameter value is looked up via the encoding label table. 55/// Returns `None` for UTF-16BE/LE from HTTP headers per spec (those are only valid via BOM). 
56fn extract_charset_from_content_type(content_type: &str) -> Option<Encoding> { 57 let charset_value = extract_charset_value(content_type)?; 58 let enc = lookup(charset_value)?; 59 // Per WHATWG: if the encoding from HTTP is UTF-16BE or UTF-16LE, use UTF-8 instead 60 Some(match enc { 61 Encoding::Utf16Be | Encoding::Utf16Le => Encoding::Utf8, 62 other => other, 63 }) 64} 65 66/// Extract the raw charset value from a Content-Type string. 67fn extract_charset_value(content_type: &str) -> Option<&str> { 68 // Find "charset" (case-insensitive) after a ';' 69 let lower = content_type.to_ascii_lowercase(); 70 let idx = lower.find("charset")?; 71 72 // Must be preceded by ';' or whitespace (or be in parameters section) 73 let after_charset = &content_type[idx + 7..]; 74 // Skip optional whitespace then '=' 75 let after_charset = after_charset.trim_start(); 76 let after_eq = after_charset.strip_prefix('=')?; 77 let after_eq = after_eq.trim_start(); 78 79 if let Some(inner) = after_eq.strip_prefix('"') { 80 // Quoted value 81 let end = inner.find('"')?; 82 Some(&inner[..end]) 83 } else { 84 // Unquoted value: terminated by whitespace, ';', or end of string 85 let end = after_eq 86 .find(|c: char| c == ';' || c.is_ascii_whitespace()) 87 .unwrap_or(after_eq.len()); 88 if end == 0 { 89 return None; 90 } 91 Some(&after_eq[..end]) 92 } 93} 94 95/// Prescan the first 1024 bytes of an HTML document for encoding declarations. 96/// 97/// Per the HTML spec "prescan a byte stream to determine its encoding" algorithm. 
98/// Looks for: 99/// - `<meta charset="...">` 100/// - `<meta http-equiv="Content-Type" content="...;charset=...">` 101fn meta_prescan(bytes: &[u8]) -> Option<Encoding> { 102 let limit = bytes.len().min(1024); 103 let bytes = &bytes[..limit]; 104 let mut pos = 0; 105 106 while pos < bytes.len() { 107 // Skip until we find '<' 108 if bytes[pos] != b'<' { 109 pos += 1; 110 continue; 111 } 112 pos += 1; 113 if pos >= bytes.len() { 114 break; 115 } 116 117 // Check for comment "<!--" 118 if bytes[pos..].starts_with(b"!--") { 119 pos += 3; 120 // Skip until "-->" 121 while pos + 2 < bytes.len() { 122 if bytes[pos] == b'-' && bytes[pos + 1] == b'-' && bytes[pos + 2] == b'>' { 123 pos += 3; 124 break; 125 } 126 pos += 1; 127 } 128 continue; 129 } 130 131 // Check for "<meta" (case-insensitive) 132 if pos + 4 <= bytes.len() && ascii_ci_eq(&bytes[pos..pos + 4], b"meta") { 133 let after_meta = pos + 4; 134 if after_meta < bytes.len() && is_space_or_slash(bytes[after_meta]) { 135 if let Some((enc, _tag_end)) = parse_meta_tag(bytes, after_meta) { 136 // Per spec: override UTF-16 from meta to UTF-8 137 let enc = match enc { 138 Encoding::Utf16Be | Encoding::Utf16Le => Encoding::Utf8, 139 other => other, 140 }; 141 return Some(enc); 142 } else { 143 pos = skip_tag(bytes, after_meta); 144 continue; 145 } 146 } 147 } 148 149 // Skip other tags (like <!DOCTYPE>, <html>, etc.) 150 if bytes[pos..].starts_with(b"!") || bytes[pos..].starts_with(b"/") || bytes[pos] == b'?' { 151 pos = skip_tag(bytes, pos); 152 continue; 153 } 154 155 // Check if it's a letter (start of a tag name) 156 if pos < bytes.len() && bytes[pos].is_ascii_alphabetic() { 157 pos = skip_tag(bytes, pos); 158 continue; 159 } 160 161 // Not a tag, continue 162 } 163 164 None 165} 166 167/// Parse attributes of a `<meta` tag looking for charset declarations. 168/// 169/// Returns the encoding and position after the tag if found. 
170fn parse_meta_tag(bytes: &[u8], start: usize) -> Option<(Encoding, usize)> { 171 let mut pos = start; 172 let mut got_pragma = false; 173 let mut need_pragma: Option<bool> = None; 174 let mut charset: Option<Encoding> = None; 175 176 loop { 177 // Skip whitespace 178 while pos < bytes.len() && bytes[pos].is_ascii_whitespace() { 179 pos += 1; 180 } 181 if pos >= bytes.len() { 182 break; 183 } 184 // End of tag? 185 if bytes[pos] == b'>' 186 || (bytes[pos] == b'/' && pos + 1 < bytes.len() && bytes[pos + 1] == b'>') 187 { 188 break; 189 } 190 191 let Some((attr_name, attr_value, new_pos)) = parse_attribute(bytes, pos) else { 192 break; 193 }; 194 pos = new_pos; 195 196 if ascii_ci_eq_str(&attr_name, "http-equiv") { 197 if ascii_ci_eq_str(&attr_value, "content-type") { 198 got_pragma = true; 199 } 200 } else if ascii_ci_eq_str(&attr_name, "content") { 201 if let Some(charset_val) = extract_charset_from_meta_content(&attr_value) { 202 if let Some(enc) = lookup(&charset_val) { 203 charset = Some(enc); 204 need_pragma = Some(true); 205 } 206 } 207 } else if ascii_ci_eq_str(&attr_name, "charset") { 208 if let Some(enc) = lookup(&attr_value) { 209 charset = Some(enc); 210 need_pragma = Some(false); 211 } 212 } 213 } 214 215 // Determine result per spec 216 match (need_pragma, charset) { 217 (Some(true), Some(enc)) if got_pragma => Some((enc, pos)), 218 (Some(false), Some(enc)) => Some((enc, pos)), 219 _ => None, 220 } 221} 222 223/// Parse a single HTML attribute (name=value pair). 224/// 225/// Returns (name, value, new_position). Returns None if we hit end of tag or input. 
226fn parse_attribute(bytes: &[u8], start: usize) -> Option<(String, String, usize)> { 227 let mut pos = start; 228 229 // Skip whitespace 230 while pos < bytes.len() && bytes[pos].is_ascii_whitespace() { 231 pos += 1; 232 } 233 if pos >= bytes.len() || bytes[pos] == b'>' { 234 return None; 235 } 236 237 // Read attribute name 238 let name_start = pos; 239 while pos < bytes.len() 240 && bytes[pos] != b'=' 241 && bytes[pos] != b'>' 242 && !bytes[pos].is_ascii_whitespace() 243 && bytes[pos] != b'/' 244 { 245 pos += 1; 246 } 247 let name = to_ascii_lowercase(&bytes[name_start..pos]); 248 if name.is_empty() { 249 return None; 250 } 251 252 // Skip whitespace 253 while pos < bytes.len() && bytes[pos].is_ascii_whitespace() { 254 pos += 1; 255 } 256 257 // No value 258 if pos >= bytes.len() || bytes[pos] != b'=' { 259 return Some((name, String::new(), pos)); 260 } 261 pos += 1; // skip '=' 262 263 // Skip whitespace 264 while pos < bytes.len() && bytes[pos].is_ascii_whitespace() { 265 pos += 1; 266 } 267 268 if pos >= bytes.len() { 269 return Some((name, String::new(), pos)); 270 } 271 272 // Read value 273 let value; 274 if bytes[pos] == b'"' || bytes[pos] == b'\'' { 275 let quote = bytes[pos]; 276 pos += 1; 277 let val_start = pos; 278 while pos < bytes.len() && bytes[pos] != quote { 279 pos += 1; 280 } 281 value = to_ascii_lowercase(&bytes[val_start..pos]); 282 if pos < bytes.len() { 283 pos += 1; // skip closing quote 284 } 285 } else { 286 let val_start = pos; 287 while pos < bytes.len() 288 && !bytes[pos].is_ascii_whitespace() 289 && bytes[pos] != b'>' 290 && bytes[pos] != b';' 291 { 292 pos += 1; 293 } 294 value = to_ascii_lowercase(&bytes[val_start..pos]); 295 } 296 297 Some((name, value, pos)) 298} 299 300/// Extract charset value from a meta content attribute value. 301/// 302/// Looks for `charset=` in strings like `text/html; charset=utf-8`. 
fn extract_charset_from_meta_content(content: &str) -> Option<String> {
    const NAME: &str = "charset";

    // ASCII lowercasing keeps byte offsets identical, so offsets found in `lower`
    // are valid in `content`.
    let lower = content.to_ascii_lowercase();
    let skip_ascii_ws = |s: &'_ str| -> usize {
        s.len() - s.trim_start_matches(|c: char| c.is_ascii_whitespace()).len()
    };
    let mut search_from = 0;

    loop {
        // No (further) "charset" occurrence: nothing to extract.
        let name_end = search_from + lower[search_from..].find(NAME)? + NAME.len();

        // Optional whitespace, then '='. Without the '=', this occurrence is not a
        // declaration; keep looking for a later one.
        let after_name = &content[name_end..];
        let after_name = &after_name[skip_ascii_ws(after_name)..];
        let Some(after_eq) = after_name.strip_prefix('=') else {
            search_from = name_end;
            continue;
        };
        let rest = &after_eq[skip_ascii_ws(after_eq)..];

        let mut chars = rest.chars();
        return match chars.next() {
            // "charset=" at the very end.
            None => None,
            // Quoted value: the text up to the matching quote. An unmatched quote
            // yields nothing.
            Some(quote @ ('"' | '\'')) => {
                let inner = chars.as_str();
                let end = inner.find(quote)?;
                let label = inner[..end].trim();
                (!label.is_empty()).then(|| label.to_string())
            }
            // Unquoted value: terminated by ';', ASCII whitespace, or end of string.
            Some(_) => {
                let end = rest
                    .find(|c: char| c == ';' || c.is_ascii_whitespace())
                    .unwrap_or(rest.len());
                (end > 0).then(|| rest[..end].to_string())
            }
        };
    }
}

/// Return the position just past the next `>` at or after `start`.
///
/// When there is no `>`, the result is the end of `bytes` (or `start` itself if it
/// already lies beyond the end).
fn skip_tag(bytes: &[u8], start: usize) -> usize {
    match bytes
        .get(start..)
        .and_then(|tail| tail.iter().position(|&b| b == b'>'))
    {
        Some(offset) => start + offset + 1,
        None => start.max(bytes.len()),
    }
}

/// True for ASCII whitespace and for `/`.
fn is_space_or_slash(b: u8) -> bool {
    b == b'/' || b.is_ascii_whitespace()
}

/// ASCII case-insensitive equality of two byte slices.
fn ascii_ci_eq(a: &[u8], b: &[u8]) -> bool {
    a.eq_ignore_ascii_case(b)
}

/// ASCII case-insensitive equality of two strings.
fn ascii_ci_eq_str(a: &str, b: &str) -> bool {
    a.eq_ignore_ascii_case(b)
}

/// Build a `String` from `bytes`, lowercasing ASCII letters and mapping every byte to
/// the `char` with the same code point.
fn to_ascii_lowercase(bytes: &[u8]) -> String {
    bytes
        .iter()
        .map(|byte| char::from(byte.to_ascii_lowercase()))
        .collect()
}

#[cfg(test)]
mod tests {
    use super::*;

    // -----------------------------------------------------------------------
    // sniff_encoding — BOM priority
    // -----------------------------------------------------------------------

    #[test]
    fn sniff_bom_utf8() {
        let bytes = b"\xEF\xBB\xBFHello";
        let (enc, src) = sniff_encoding(bytes, None);
        assert_eq!(enc, Encoding::Utf8);
        assert_eq!(src, EncodingSource::Bom);
    }

    #[test]
    fn sniff_bom_utf16be() {
        let bytes = b"\xFE\xFF\x00A";
        let (enc, src) = sniff_encoding(bytes, None);
        assert_eq!(enc, Encoding::Utf16Be);
        assert_eq!(src, EncodingSource::Bom);
    }

    #[test]
    fn sniff_bom_utf16le() {
        let bytes = b"\xFF\xFEA\x00";
        let (enc, src) = sniff_encoding(bytes, None);
        assert_eq!(enc, Encoding::Utf16Le);
        assert_eq!(src, EncodingSource::Bom);
    }

    #[test]
    fn sniff_bom_beats_http_header() {
        let bytes = b"\xEF\xBB\xBFHello";
        let (enc, src) = sniff_encoding(bytes, Some("text/html; charset=iso-8859-2"));
        assert_eq!(enc, Encoding::Utf8);
        assert_eq!(src, EncodingSource::Bom);
    }

    #[test]
    fn sniff_bom_beats_meta() {
        let mut bytes = vec![0xEF, 0xBB, 0xBF];
        bytes.extend_from_slice(b"<meta charset=\"iso-8859-5\">");
        let (enc, src) = sniff_encoding(&bytes, None);
        assert_eq!(enc, Encoding::Utf8);
        assert_eq!(src, EncodingSource::Bom);
    }

    // -----------------------------------------------------------------------
    // sniff_encoding — HTTP Content-Type priority
    // -----------------------------------------------------------------------

    #[test]
    fn sniff_http_charset_utf8() {
        let (enc, src) = sniff_encoding(b"Hello", Some("text/html; charset=utf-8"));
        assert_eq!(enc, Encoding::Utf8);
        assert_eq!(src, EncodingSource::HttpHeader);
    }

    #[test]
    fn sniff_http_charset_quoted() {
        let (enc, src) = sniff_encoding(b"Hello", Some("text/html; charset=\"utf-8\""));
        assert_eq!(enc, Encoding::Utf8);
        assert_eq!(src, EncodingSource::HttpHeader);
    }

    #[test]
    fn sniff_http_charset_case_insensitive() {
        let (enc, src) = sniff_encoding(b"Hello", Some("text/html; Charset=UTF-8"));
        assert_eq!(enc, Encoding::Utf8);
        assert_eq!(src, EncodingSource::HttpHeader);
    }

    #[test]
    fn sniff_http_charset_no_space() {
        let (enc, src) = sniff_encoding(b"Hello", Some("text/html;charset=utf-8"));
        assert_eq!(enc, Encoding::Utf8);
        assert_eq!(src, EncodingSource::HttpHeader);
    }

    #[test]
    fn sniff_http_charset_windows_1252() {
        let (enc, src) = sniff_encoding(b"Hello", Some("text/html; charset=windows-1252"));
        assert_eq!(enc, Encoding::Windows1252);
        assert_eq!(src, EncodingSource::HttpHeader);
    }

    #[test]
    fn sniff_http_charset_iso_8859_1_maps_to_1252() {
        let (enc, src) = sniff_encoding(b"Hello", Some("text/html; charset=iso-8859-1"));
        assert_eq!(enc, Encoding::Windows1252);
        assert_eq!(src, EncodingSource::HttpHeader);
    }

    #[test]
    fn sniff_http_utf16_override_to_utf8() {
        // A UTF-16 label from the HTTP header is reported as UTF-8.
        let (enc, src) = sniff_encoding(b"Hello", Some("text/html; charset=utf-16le"));
        assert_eq!(enc, Encoding::Utf8);
        assert_eq!(src, EncodingSource::HttpHeader);
    }

    #[test]
    fn sniff_http_no_charset() {
        let (enc, src) = sniff_encoding(b"Hello", Some("text/html"));
        // Falls through to default
        assert_eq!(enc, Encoding::Windows1252);
        assert_eq!(src, EncodingSource::Default);
    }

    #[test]
    fn sniff_http_beats_meta() {
        let html = b"<meta charset=\"iso-8859-5\">";
        let (enc, src) = sniff_encoding(html, Some("text/html; charset=utf-8"));
        assert_eq!(enc, Encoding::Utf8);
        assert_eq!(src, EncodingSource::HttpHeader);
    }

    // -----------------------------------------------------------------------
    // sniff_encoding — meta prescan
    // -----------------------------------------------------------------------

    #[test]
    fn sniff_meta_charset() {
        let html = b"<meta charset=\"utf-8\">";
        let (enc, src) = sniff_encoding(html, None);
        assert_eq!(enc, Encoding::Utf8);
        assert_eq!(src, EncodingSource::MetaPrescan);
    }

    #[test]
    fn sniff_meta_charset_single_quotes() {
        let html = b"<meta charset='utf-8'>";
        let (enc, src) = sniff_encoding(html, None);
        assert_eq!(enc, Encoding::Utf8);
        assert_eq!(src, EncodingSource::MetaPrescan);
    }

    #[test]
    fn sniff_meta_http_equiv() {
        let html = b"<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\">";
        let (enc, src) = sniff_encoding(html, None);
        assert_eq!(enc, Encoding::Utf8);
        assert_eq!(src, EncodingSource::MetaPrescan);
    }

    #[test]
    fn sniff_meta_http_equiv_case_insensitive() {
        let html = b"<META HTTP-EQUIV=\"Content-Type\" CONTENT=\"text/html; charset=UTF-8\">";
        let (enc, src) = sniff_encoding(html, None);
        assert_eq!(enc, Encoding::Utf8);
        assert_eq!(src, EncodingSource::MetaPrescan);
    }

    #[test]
    fn sniff_meta_charset_legacy_encoding() {
        let html = b"<meta charset=\"windows-1251\">";
        let (enc, src) = sniff_encoding(html, None);
        assert_eq!(enc, Encoding::Windows1251);
        assert_eq!(src, EncodingSource::MetaPrescan);
    }

    #[test]
    fn sniff_meta_utf16_override_to_utf8() {
        let html = b"<meta charset=\"utf-16le\">";
        let (enc, src) = sniff_encoding(html, None);
        assert_eq!(enc, Encoding::Utf8);
        assert_eq!(src, EncodingSource::MetaPrescan);
    }

    #[test]
    fn sniff_meta_with_doctype_and_html() {
        let html = b"<!DOCTYPE html><html><head><meta charset=\"utf-8\"></head>";
        let (enc, src) = sniff_encoding(html, None);
        assert_eq!(enc, Encoding::Utf8);
        assert_eq!(src, EncodingSource::MetaPrescan);
    }

    #[test]
    fn sniff_meta_with_comment_before() {
        let html = b"<!-- comment --><meta charset=\"utf-8\">";
        let (enc, src) = sniff_encoding(html, None);
        assert_eq!(enc, Encoding::Utf8);
        assert_eq!(src, EncodingSource::MetaPrescan);
    }

    #[test]
    fn sniff_meta_beyond_1024_bytes_not_found() {
        let mut html = vec![b' '; 1024];
        html.extend_from_slice(b"<meta charset=\"utf-8\">");
        let (enc, src) = sniff_encoding(&html, None);
        assert_eq!(enc, Encoding::Windows1252);
        assert_eq!(src, EncodingSource::Default);
    }

    #[test]
    fn sniff_meta_within_1024_bytes() {
        let mut html = vec![b' '; 1000];
        html.extend_from_slice(b"<meta charset=\"utf-8\">");
        let (enc, src) = sniff_encoding(&html, None);
        assert_eq!(enc, Encoding::Utf8);
        assert_eq!(src, EncodingSource::MetaPrescan);
    }

    // -----------------------------------------------------------------------
    // sniff_encoding — default fallback
    // -----------------------------------------------------------------------

    #[test]
    fn sniff_default_no_signals() {
        let (enc, src) = sniff_encoding(b"Hello world", None);
        assert_eq!(enc, Encoding::Windows1252);
        assert_eq!(src, EncodingSource::Default);
    }

    #[test]
    fn sniff_default_empty() {
        let (enc, src) = sniff_encoding(b"", None);
        assert_eq!(enc, Encoding::Windows1252);
        assert_eq!(src, EncodingSource::Default);
    }

    // -----------------------------------------------------------------------
    // extract_charset_from_content_type
    // -----------------------------------------------------------------------

    #[test]
    fn extract_charset_basic() {
        assert_eq!(
            extract_charset_from_content_type("text/html; charset=utf-8"),
            Some(Encoding::Utf8)
        );
    }

    #[test]
    fn extract_charset_quoted() {
        assert_eq!(
            extract_charset_from_content_type("text/html; charset=\"utf-8\""),
            Some(Encoding::Utf8)
        );
    }

    #[test]
    fn extract_charset_no_space() {
        assert_eq!(
            extract_charset_from_content_type("text/html;charset=utf-8"),
            Some(Encoding::Utf8)
        );
    }

    #[test]
    fn extract_charset_uppercase() {
        assert_eq!(
            extract_charset_from_content_type("text/html; CHARSET=UTF-8"),
            Some(Encoding::Utf8)
        );
    }

    #[test]
    fn extract_charset_missing() {
        assert_eq!(extract_charset_from_content_type("text/html"), None);
    }

    #[test]
    fn extract_charset_empty_value() {
        assert_eq!(
            extract_charset_from_content_type("text/html; charset="),
            None
        );
    }

    #[test]
    fn extract_charset_unknown_encoding() {
        assert_eq!(
            extract_charset_from_content_type("text/html; charset=bogus"),
            None
        );
    }

    #[test]
    fn extract_charset_with_extra_params() {
        assert_eq!(
            extract_charset_from_content_type("text/html; charset=utf-8; boundary=something"),
            Some(Encoding::Utf8)
        );
    }

    // -----------------------------------------------------------------------
    // extract_charset_from_meta_content
    // -----------------------------------------------------------------------

    #[test]
    fn meta_content_unquoted_and_quoted() {
        assert_eq!(
            extract_charset_from_meta_content("text/html; charset=utf-8"),
            Some("utf-8".to_string())
        );
        assert_eq!(
            extract_charset_from_meta_content("text/html; charset = 'koi8-r'; x=y"),
            Some("koi8-r".to_string())
        );
        assert_eq!(extract_charset_from_meta_content("text/html"), None);
        assert_eq!(extract_charset_from_meta_content("text/html; charset="), None);
    }

    #[test]
    fn meta_content_skips_charset_without_equals() {
        assert_eq!(
            extract_charset_from_meta_content("charset-info; charset=utf-8"),
            Some("utf-8".to_string())
        );
    }

    #[test]
    fn meta_content_unmatched_quote_yields_nothing() {
        assert_eq!(
            extract_charset_from_meta_content("text/html; charset=\"utf-8"),
            None
        );
    }

    // -----------------------------------------------------------------------
    // meta_prescan internals
    // -----------------------------------------------------------------------

    #[test]
    fn meta_prescan_charset_attr() {
        let html = b"<meta charset=\"iso-8859-2\">";
        assert_eq!(meta_prescan(html), Some(Encoding::Iso8859_2));
    }

    #[test]
    fn meta_prescan_http_equiv_content() {
        let html = b"<meta http-equiv=\"content-type\" content=\"text/html; charset=koi8-r\">";
        assert_eq!(meta_prescan(html), Some(Encoding::Koi8R));
    }

    #[test]
    fn meta_prescan_no_meta() {
        let html = b"<html><head><title>Test</title></head></html>";
        assert_eq!(meta_prescan(html), None);
    }

    #[test]
    fn meta_prescan_meta_without_charset() {
        let html = b"<meta name=\"viewport\" content=\"width=device-width\">";
        assert_eq!(meta_prescan(html), None);
    }

    #[test]
    fn meta_prescan_http_equiv_without_content() {
        let html = b"<meta http-equiv=\"content-type\">";
        assert_eq!(meta_prescan(html), None);
    }

    #[test]
    fn meta_prescan_content_without_http_equiv() {
        // charset in content but no http-equiv="content-type" -> need_pragma is true but got_pragma is false
        let html = b"<meta content=\"text/html; charset=utf-8\">";
        assert_eq!(meta_prescan(html), None);
    }

    #[test]
    fn meta_prescan_skips_comments() {
        let html = b"<!-- <meta charset=\"iso-8859-5\"> --><meta charset=\"utf-8\">";
        assert_eq!(meta_prescan(html), Some(Encoding::Utf8));
    }

    #[test]
    fn meta_prescan_unquoted_charset() {
        let html = b"<meta charset=utf-8>";
        assert_eq!(meta_prescan(html), Some(Encoding::Utf8));
    }

    #[test]
    fn meta_prescan_self_closing() {
        let html = b"<meta charset=\"utf-8\" />";
        assert_eq!(meta_prescan(html), Some(Encoding::Utf8));
    }
}