we (web engine): Experimental web browser project to understand the limits of Claude
at js-parser 713 lines 23 kB view raw
1//! Encoding sniffing per WHATWG Encoding Standard and HTML spec. 2//! 3//! Detects character encoding from BOM, HTTP Content-Type charset, or HTML meta prescan. 4 5use crate::{bom_sniff, lookup, Encoding}; 6 7/// How the encoding was determined. 8#[derive(Debug, Clone, Copy, PartialEq, Eq)] 9pub enum EncodingSource { 10 /// Byte Order Mark at the start of the byte stream. 11 Bom, 12 /// `charset` parameter from the HTTP `Content-Type` header. 13 HttpHeader, 14 /// `<meta charset>` or `<meta http-equiv="Content-Type">` prescan. 15 MetaPrescan, 16 /// Default fallback (Windows-1252 for HTML). 17 Default, 18} 19 20/// Sniff the encoding of a byte stream. 21/// 22/// Priority order per spec: BOM > HTTP Content-Type charset > HTML meta prescan > default. 23/// The default encoding is Windows-1252 per WHATWG spec for HTML. 24pub fn sniff_encoding(bytes: &[u8], http_content_type: Option<&str>) -> (Encoding, EncodingSource) { 25 // 1. BOM sniffing (highest priority) 26 let (bom_enc, _) = bom_sniff(bytes); 27 if let Some(enc) = bom_enc { 28 return (enc, EncodingSource::Bom); 29 } 30 31 // 2. HTTP Content-Type charset 32 if let Some(ct) = http_content_type { 33 if let Some(enc) = extract_charset_from_content_type(ct) { 34 return (enc, EncodingSource::HttpHeader); 35 } 36 } 37 38 // 3. HTML meta prescan (first 1024 bytes) 39 if let Some(enc) = meta_prescan(bytes) { 40 return (enc, EncodingSource::MetaPrescan); 41 } 42 43 // 4. Default: Windows-1252 44 (Encoding::Windows1252, EncodingSource::Default) 45} 46 47/// Extract charset from an HTTP `Content-Type` header value. 48/// 49/// Handles formats like: 50/// - `text/html; charset=utf-8` 51/// - `text/html; charset="utf-8"` 52/// - `text/html;charset=utf-8` (no space) 53/// 54/// Per WHATWG spec, the charset parameter value is looked up via the encoding label table. 55/// Returns `None` for UTF-16BE/LE from HTTP headers per spec (those are only valid via BOM). 
56fn extract_charset_from_content_type(content_type: &str) -> Option<Encoding> { 57 let charset_value = extract_charset_value(content_type)?; 58 let enc = lookup(charset_value)?; 59 // Per WHATWG: if the encoding from HTTP is UTF-16BE or UTF-16LE, use UTF-8 instead 60 Some(match enc { 61 Encoding::Utf16Be | Encoding::Utf16Le => Encoding::Utf8, 62 other => other, 63 }) 64} 65 66/// Extract the raw charset value from a Content-Type string. 67fn extract_charset_value(content_type: &str) -> Option<&str> { 68 // Find "charset" (case-insensitive) after a ';' 69 let lower = content_type.to_ascii_lowercase(); 70 let idx = lower.find("charset")?; 71 72 // Must be preceded by ';' or whitespace (or be in parameters section) 73 let after_charset = &content_type[idx + 7..]; 74 // Skip optional whitespace then '=' 75 let after_charset = after_charset.trim_start(); 76 let after_eq = after_charset.strip_prefix('=')?; 77 let after_eq = after_eq.trim_start(); 78 79 if let Some(inner) = after_eq.strip_prefix('"') { 80 // Quoted value 81 let end = inner.find('"')?; 82 Some(&inner[..end]) 83 } else { 84 // Unquoted value: terminated by whitespace, ';', or end of string 85 let end = after_eq 86 .find(|c: char| c == ';' || c.is_ascii_whitespace()) 87 .unwrap_or(after_eq.len()); 88 if end == 0 { 89 return None; 90 } 91 Some(&after_eq[..end]) 92 } 93} 94 95/// Prescan the first 1024 bytes of an HTML document for encoding declarations. 96/// 97/// Per the HTML spec "prescan a byte stream to determine its encoding" algorithm. 
98/// Looks for: 99/// - `<meta charset="...">` 100/// - `<meta http-equiv="Content-Type" content="...;charset=...">` 101fn meta_prescan(bytes: &[u8]) -> Option<Encoding> { 102 let limit = bytes.len().min(1024); 103 let bytes = &bytes[..limit]; 104 let mut pos = 0; 105 106 while pos < bytes.len() { 107 // Skip until we find '<' 108 if bytes[pos] != b'<' { 109 pos += 1; 110 continue; 111 } 112 pos += 1; 113 if pos >= bytes.len() { 114 break; 115 } 116 117 // Check for comment "<!--" 118 if bytes[pos..].starts_with(b"!--") { 119 pos += 3; 120 // Skip until "-->" 121 while pos + 2 < bytes.len() { 122 if bytes[pos] == b'-' && bytes[pos + 1] == b'-' && bytes[pos + 2] == b'>' { 123 pos += 3; 124 break; 125 } 126 pos += 1; 127 } 128 continue; 129 } 130 131 // Check for "<meta" (case-insensitive) 132 if pos + 4 <= bytes.len() && ascii_ci_eq(&bytes[pos..pos + 4], b"meta") { 133 let after_meta = pos + 4; 134 if after_meta < bytes.len() && is_space_or_slash(bytes[after_meta]) { 135 if let Some((enc, _tag_end)) = parse_meta_tag(bytes, after_meta) { 136 // Per spec: override UTF-16 from meta to UTF-8 137 let enc = match enc { 138 Encoding::Utf16Be | Encoding::Utf16Le => Encoding::Utf8, 139 other => other, 140 }; 141 return Some(enc); 142 } else { 143 pos = skip_tag(bytes, after_meta); 144 continue; 145 } 146 } 147 } 148 149 // Skip other tags (like <!DOCTYPE>, <html>, etc.) 150 if bytes[pos..].starts_with(b"!") || bytes[pos..].starts_with(b"/") || bytes[pos] == b'?' { 151 pos = skip_tag(bytes, pos); 152 continue; 153 } 154 155 // Check if it's a letter (start of a tag name) 156 if pos < bytes.len() && bytes[pos].is_ascii_alphabetic() { 157 pos = skip_tag(bytes, pos); 158 continue; 159 } 160 161 // Not a tag, continue 162 } 163 164 None 165} 166 167/// Parse attributes of a `<meta` tag looking for charset declarations. 168/// 169/// Returns the encoding and position after the tag if found. 
170fn parse_meta_tag(bytes: &[u8], start: usize) -> Option<(Encoding, usize)> { 171 let mut pos = start; 172 let mut got_pragma = false; 173 let mut need_pragma: Option<bool> = None; 174 let mut charset: Option<Encoding> = None; 175 176 loop { 177 // Skip whitespace 178 while pos < bytes.len() && bytes[pos].is_ascii_whitespace() { 179 pos += 1; 180 } 181 if pos >= bytes.len() { 182 break; 183 } 184 // End of tag? 185 if bytes[pos] == b'>' 186 || (bytes[pos] == b'/' && pos + 1 < bytes.len() && bytes[pos + 1] == b'>') 187 { 188 break; 189 } 190 191 let (attr_name, attr_value, new_pos) = parse_attribute(bytes, pos)?; 192 pos = new_pos; 193 194 if ascii_ci_eq_str(&attr_name, "http-equiv") { 195 if ascii_ci_eq_str(&attr_value, "content-type") { 196 got_pragma = true; 197 } 198 } else if ascii_ci_eq_str(&attr_name, "content") { 199 if let Some(charset_val) = extract_charset_from_meta_content(&attr_value) { 200 if let Some(enc) = lookup(&charset_val) { 201 charset = Some(enc); 202 need_pragma = Some(true); 203 } 204 } 205 } else if ascii_ci_eq_str(&attr_name, "charset") { 206 if let Some(enc) = lookup(&attr_value) { 207 charset = Some(enc); 208 need_pragma = Some(false); 209 } 210 } 211 } 212 213 // Determine result per spec 214 match (need_pragma, charset) { 215 (Some(true), Some(enc)) if got_pragma => Some((enc, pos)), 216 (Some(false), Some(enc)) => Some((enc, pos)), 217 _ => None, 218 } 219} 220 221/// Parse a single HTML attribute (name=value pair). 222/// 223/// Returns (name, value, new_position). Returns None if we hit end of tag or input. 
224fn parse_attribute(bytes: &[u8], start: usize) -> Option<(String, String, usize)> { 225 let mut pos = start; 226 227 // Skip whitespace 228 while pos < bytes.len() && bytes[pos].is_ascii_whitespace() { 229 pos += 1; 230 } 231 if pos >= bytes.len() || bytes[pos] == b'>' { 232 return None; 233 } 234 235 // Read attribute name 236 let name_start = pos; 237 while pos < bytes.len() 238 && bytes[pos] != b'=' 239 && bytes[pos] != b'>' 240 && !bytes[pos].is_ascii_whitespace() 241 && bytes[pos] != b'/' 242 { 243 pos += 1; 244 } 245 let name = to_ascii_lowercase(&bytes[name_start..pos]); 246 if name.is_empty() { 247 return None; 248 } 249 250 // Skip whitespace 251 while pos < bytes.len() && bytes[pos].is_ascii_whitespace() { 252 pos += 1; 253 } 254 255 // No value 256 if pos >= bytes.len() || bytes[pos] != b'=' { 257 return Some((name, String::new(), pos)); 258 } 259 pos += 1; // skip '=' 260 261 // Skip whitespace 262 while pos < bytes.len() && bytes[pos].is_ascii_whitespace() { 263 pos += 1; 264 } 265 266 if pos >= bytes.len() { 267 return Some((name, String::new(), pos)); 268 } 269 270 // Read value 271 let value; 272 if bytes[pos] == b'"' || bytes[pos] == b'\'' { 273 let quote = bytes[pos]; 274 pos += 1; 275 let val_start = pos; 276 while pos < bytes.len() && bytes[pos] != quote { 277 pos += 1; 278 } 279 value = to_ascii_lowercase(&bytes[val_start..pos]); 280 if pos < bytes.len() { 281 pos += 1; // skip closing quote 282 } 283 } else { 284 let val_start = pos; 285 while pos < bytes.len() 286 && !bytes[pos].is_ascii_whitespace() 287 && bytes[pos] != b'>' 288 && bytes[pos] != b';' 289 { 290 pos += 1; 291 } 292 value = to_ascii_lowercase(&bytes[val_start..pos]); 293 } 294 295 Some((name, value, pos)) 296} 297 298/// Extract charset value from a meta content attribute value. 299/// 300/// Looks for `charset=` in strings like `text/html; charset=utf-8`. 
fn extract_charset_from_meta_content(content: &str) -> Option<String> {
    const NAME: &[u8] = b"charset";
    let raw = content.as_bytes();
    let mut search_from = 0;

    loop {
        // Case-insensitive search without allocating a lowercased copy.
        let hit = raw[search_from..]
            .windows(NAME.len())
            .position(|w| w.eq_ignore_ascii_case(NAME))?;
        // The matched bytes are ASCII, so `name_end` is a valid `str` slice boundary.
        let name_end = search_from + hit + NAME.len();
        search_from = name_end;

        // "charset" must be followed by optional whitespace and '='; otherwise keep looking
        // for a later occurrence instead of giving up.
        let rest = match content[name_end..].trim_start().strip_prefix('=') {
            Some(after_eq) => after_eq.trim_start(),
            None => continue,
        };

        let mut chars = rest.chars();
        return match chars.next() {
            None => None,
            Some(quote) if quote == '"' || quote == '\'' => {
                // Quoted value: an opening quote without a matching close yields nothing.
                let inner = chars.as_str();
                let end = inner.find(quote)?;
                let val = inner[..end].trim();
                if val.is_empty() {
                    None
                } else {
                    Some(val.to_string())
                }
            }
            Some(_) => {
                // The value is terminated by ';', whitespace, or end
                let end = rest
                    .find(|c: char| c == ';' || c.is_ascii_whitespace())
                    .unwrap_or(rest.len());
                if end == 0 {
                    None
                } else {
                    Some(rest[..end].to_string())
                }
            }
        };
    }
}

/// Skip a tag (find the closing '>').
///
/// Returns the index just past the first `>` at or after `start`, or the end of `bytes` when
/// there is none.
fn skip_tag(bytes: &[u8], start: usize) -> usize {
    let mut pos = start;
    while pos < bytes.len() && bytes[pos] != b'>' {
        pos += 1;
    }
    if pos < bytes.len() {
        pos + 1
    } else {
        pos
    }
}

/// True for ASCII whitespace and `/`.
fn is_space_or_slash(b: u8) -> bool {
    b.is_ascii_whitespace() || b == b'/'
}

/// ASCII case-insensitive equality of two byte slices.
fn ascii_ci_eq(a: &[u8], b: &[u8]) -> bool {
    a.len() == b.len() && a.iter().zip(b).all(|(x, y)| x.eq_ignore_ascii_case(y))
}

/// ASCII case-insensitive equality of two strings.
fn ascii_ci_eq_str(a: &str, b: &str) -> bool {
    a.eq_ignore_ascii_case(b)
}

/// Build a `String` from `bytes`, lowercasing ASCII letters and mapping every byte to the
/// `char` with the same numeric value.
fn to_ascii_lowercase(bytes: &[u8]) -> String {
    bytes
        .iter()
        .map(|&b| b.to_ascii_lowercase() as char)
        .collect()
}

#[cfg(test)]
mod tests {
    use super::*;

    // -----------------------------------------------------------------------
    // sniff_encoding — BOM priority
    // -----------------------------------------------------------------------

    #[test]
    fn sniff_bom_utf8() {
        let bytes = b"\xEF\xBB\xBFHello";
        let (enc, src) = sniff_encoding(bytes, None);
        assert_eq!(enc, Encoding::Utf8);
        assert_eq!(src, EncodingSource::Bom);
    }

    #[test]
    fn sniff_bom_utf16be() {
        let bytes = b"\xFE\xFF\x00A";
        let (enc, src) = sniff_encoding(bytes, None);
        assert_eq!(enc, Encoding::Utf16Be);
        assert_eq!(src, EncodingSource::Bom);
    }

    #[test]
    fn sniff_bom_utf16le() {
        let bytes = b"\xFF\xFEA\x00";
        let (enc, src) = sniff_encoding(bytes, None);
        assert_eq!(enc, Encoding::Utf16Le);
        assert_eq!(src, EncodingSource::Bom);
    }

    #[test]
    fn sniff_bom_beats_http_header() {
        let bytes = b"\xEF\xBB\xBFHello";
        let (enc, src) = sniff_encoding(bytes, Some("text/html; charset=iso-8859-2"));
        assert_eq!(enc, Encoding::Utf8);
        assert_eq!(src, EncodingSource::Bom);
    }

    #[test]
    fn sniff_bom_beats_meta() {
        let mut bytes = vec![0xEF, 0xBB, 0xBF];
        bytes.extend_from_slice(b"<meta charset=\"iso-8859-5\">");
        let (enc, src) = sniff_encoding(&bytes, None);
        assert_eq!(enc, Encoding::Utf8);
        assert_eq!(src, EncodingSource::Bom);
    }

    // -----------------------------------------------------------------------
    // sniff_encoding — HTTP Content-Type priority
    // -----------------------------------------------------------------------

    #[test]
    fn sniff_http_charset_utf8() {
        let (enc, src) = sniff_encoding(b"Hello", Some("text/html; charset=utf-8"));
        assert_eq!(enc, Encoding::Utf8);
        assert_eq!(src, EncodingSource::HttpHeader);
    }

    #[test]
    fn sniff_http_charset_quoted() {
        let (enc, src) = sniff_encoding(b"Hello", Some("text/html; charset=\"utf-8\""));
        assert_eq!(enc, Encoding::Utf8);
        assert_eq!(src, EncodingSource::HttpHeader);
    }

    #[test]
    fn sniff_http_charset_case_insensitive() {
        let (enc, src) = sniff_encoding(b"Hello", Some("text/html; Charset=UTF-8"));
        assert_eq!(enc, Encoding::Utf8);
        assert_eq!(src, EncodingSource::HttpHeader);
    }

    #[test]
    fn sniff_http_charset_no_space() {
        let (enc, src) = sniff_encoding(b"Hello", Some("text/html;charset=utf-8"));
        assert_eq!(enc, Encoding::Utf8);
        assert_eq!(src, EncodingSource::HttpHeader);
    }

    #[test]
    fn sniff_http_charset_windows_1252() {
        let (enc, src) = sniff_encoding(b"Hello", Some("text/html; charset=windows-1252"));
        assert_eq!(enc, Encoding::Windows1252);
        assert_eq!(src, EncodingSource::HttpHeader);
    }

    #[test]
    fn sniff_http_charset_iso_8859_1_maps_to_1252() {
        let (enc, src) = sniff_encoding(b"Hello", Some("text/html; charset=iso-8859-1"));
        assert_eq!(enc, Encoding::Windows1252);
        assert_eq!(src, EncodingSource::HttpHeader);
    }

    #[test]
    fn sniff_http_utf16_override_to_utf8() {
        // Per WHATWG spec: UTF-16 from HTTP becomes UTF-8
        let (enc, src) = sniff_encoding(b"Hello", Some("text/html; charset=utf-16le"));
        assert_eq!(enc, Encoding::Utf8);
        assert_eq!(src, EncodingSource::HttpHeader);
    }

    #[test]
    fn sniff_http_no_charset() {
        let (enc, src) = sniff_encoding(b"Hello", Some("text/html"));
        // Falls through to default
        assert_eq!(enc, Encoding::Windows1252);
        assert_eq!(src, EncodingSource::Default);
    }

    #[test]
    fn sniff_http_beats_meta() {
        let html = b"<meta charset=\"iso-8859-5\">";
        let (enc, src) = sniff_encoding(html, Some("text/html; charset=utf-8"));
        assert_eq!(enc, Encoding::Utf8);
        assert_eq!(src, EncodingSource::HttpHeader);
    }

    // -----------------------------------------------------------------------
    // sniff_encoding — meta prescan
    // -----------------------------------------------------------------------

    #[test]
    fn sniff_meta_charset() {
        let html = b"<meta charset=\"utf-8\">";
        let (enc, src) = sniff_encoding(html, None);
        assert_eq!(enc, Encoding::Utf8);
        assert_eq!(src, EncodingSource::MetaPrescan);
    }

    #[test]
    fn sniff_meta_charset_single_quotes() {
        let html = b"<meta charset='utf-8'>";
        let (enc, src) = sniff_encoding(html, None);
        assert_eq!(enc, Encoding::Utf8);
        assert_eq!(src, EncodingSource::MetaPrescan);
    }

    #[test]
    fn sniff_meta_http_equiv() {
        let html = b"<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\">";
        let (enc, src) = sniff_encoding(html, None);
        assert_eq!(enc, Encoding::Utf8);
        assert_eq!(src, EncodingSource::MetaPrescan);
    }

    #[test]
    fn sniff_meta_http_equiv_case_insensitive() {
        let html = b"<META HTTP-EQUIV=\"Content-Type\" CONTENT=\"text/html; charset=UTF-8\">";
        let (enc, src) = sniff_encoding(html, None);
        assert_eq!(enc, Encoding::Utf8);
        assert_eq!(src, EncodingSource::MetaPrescan);
    }

    #[test]
    fn sniff_meta_charset_legacy_encoding() {
        let html = b"<meta charset=\"windows-1251\">";
        let (enc, src) = sniff_encoding(html, None);
        assert_eq!(enc, Encoding::Windows1251);
        assert_eq!(src, EncodingSource::MetaPrescan);
    }

    #[test]
    fn sniff_meta_utf16_override_to_utf8() {
        let html = b"<meta charset=\"utf-16le\">";
        let (enc, src) = sniff_encoding(html, None);
        assert_eq!(enc, Encoding::Utf8);
        assert_eq!(src, EncodingSource::MetaPrescan);
    }

    #[test]
    fn sniff_meta_with_doctype_and_html() {
        let html = b"<!DOCTYPE html><html><head><meta charset=\"utf-8\"></head>";
        let (enc, src) = sniff_encoding(html, None);
        assert_eq!(enc, Encoding::Utf8);
        assert_eq!(src, EncodingSource::MetaPrescan);
    }

    #[test]
    fn sniff_meta_with_comment_before() {
        let html = b"<!-- comment --><meta charset=\"utf-8\">";
        let (enc, src) = sniff_encoding(html, None);
        assert_eq!(enc, Encoding::Utf8);
        assert_eq!(src, EncodingSource::MetaPrescan);
    }

    #[test]
    fn sniff_meta_beyond_1024_bytes_not_found() {
        let mut html = vec![b' '; 1024];
        html.extend_from_slice(b"<meta charset=\"utf-8\">");
        let (enc, src) = sniff_encoding(&html, None);
        assert_eq!(enc, Encoding::Windows1252);
        assert_eq!(src, EncodingSource::Default);
    }

    #[test]
    fn sniff_meta_within_1024_bytes() {
        let mut html = vec![b' '; 1000];
        html.extend_from_slice(b"<meta charset=\"utf-8\">");
        let (enc, src) = sniff_encoding(&html, None);
        assert_eq!(enc, Encoding::Utf8);
        assert_eq!(src, EncodingSource::MetaPrescan);
    }

    // -----------------------------------------------------------------------
    // sniff_encoding — default fallback
    // -----------------------------------------------------------------------

    #[test]
    fn sniff_default_no_signals() {
        let (enc, src) = sniff_encoding(b"Hello world", None);
        assert_eq!(enc, Encoding::Windows1252);
        assert_eq!(src, EncodingSource::Default);
    }

    #[test]
    fn sniff_default_empty() {
        let (enc, src) = sniff_encoding(b"", None);
        assert_eq!(enc, Encoding::Windows1252);
        assert_eq!(src, EncodingSource::Default);
    }

    // -----------------------------------------------------------------------
    // extract_charset_from_content_type
    // -----------------------------------------------------------------------

    #[test]
    fn extract_charset_basic() {
        assert_eq!(
            extract_charset_from_content_type("text/html; charset=utf-8"),
            Some(Encoding::Utf8)
        );
    }

    #[test]
    fn extract_charset_quoted() {
        assert_eq!(
            extract_charset_from_content_type("text/html; charset=\"utf-8\""),
            Some(Encoding::Utf8)
        );
    }

    #[test]
    fn extract_charset_no_space() {
        assert_eq!(
            extract_charset_from_content_type("text/html;charset=utf-8"),
            Some(Encoding::Utf8)
        );
    }

    #[test]
    fn extract_charset_uppercase() {
        assert_eq!(
            extract_charset_from_content_type("text/html; CHARSET=UTF-8"),
            Some(Encoding::Utf8)
        );
    }

    #[test]
    fn extract_charset_missing() {
        assert_eq!(extract_charset_from_content_type("text/html"), None);
    }

    #[test]
    fn extract_charset_empty_value() {
        assert_eq!(
            extract_charset_from_content_type("text/html; charset="),
            None
        );
    }

    #[test]
    fn extract_charset_unknown_encoding() {
        assert_eq!(
            extract_charset_from_content_type("text/html; charset=bogus"),
            None
        );
    }

    #[test]
    fn extract_charset_with_extra_params() {
        assert_eq!(
            extract_charset_from_content_type("text/html; charset=utf-8; boundary=something"),
            Some(Encoding::Utf8)
        );
    }

    // -----------------------------------------------------------------------
    // meta_prescan internals
    // -----------------------------------------------------------------------

    #[test]
    fn meta_prescan_charset_attr() {
        let html = b"<meta charset=\"iso-8859-2\">";
        assert_eq!(meta_prescan(html), Some(Encoding::Iso8859_2));
    }

    #[test]
    fn meta_prescan_http_equiv_content() {
        let html = b"<meta http-equiv=\"content-type\" content=\"text/html; charset=koi8-r\">";
        assert_eq!(meta_prescan(html), Some(Encoding::Koi8R));
    }

    #[test]
    fn meta_prescan_no_meta() {
        let html = b"<html><head><title>Test</title></head></html>";
        assert_eq!(meta_prescan(html), None);
    }

    #[test]
    fn meta_prescan_meta_without_charset() {
        let html = b"<meta name=\"viewport\" content=\"width=device-width\">";
        assert_eq!(meta_prescan(html), None);
    }

    #[test]
    fn meta_prescan_http_equiv_without_content() {
        let html = b"<meta http-equiv=\"content-type\">";
        assert_eq!(meta_prescan(html), None);
    }

    #[test]
    fn meta_prescan_content_without_http_equiv() {
        // charset in content but no http-equiv="content-type" -> need_pragma is true but got_pragma is false
        let html = b"<meta content=\"text/html; charset=utf-8\">";
        assert_eq!(meta_prescan(html), None);
    }

    #[test]
    fn meta_prescan_skips_comments() {
        let html = b"<!-- <meta charset=\"iso-8859-5\"> --><meta charset=\"utf-8\">";
        assert_eq!(meta_prescan(html), Some(Encoding::Utf8));
    }

    #[test]
    fn meta_prescan_unquoted_charset() {
        let html = b"<meta charset=utf-8>";
        assert_eq!(meta_prescan(html), Some(Encoding::Utf8));
    }

    #[test]
    fn meta_prescan_self_closing() {
        let html = b"<meta charset=\"utf-8\" />";
        assert_eq!(meta_prescan(html), Some(Encoding::Utf8));
    }

    // -----------------------------------------------------------------------
    // extract_charset_from_meta_content
    // -----------------------------------------------------------------------

    #[test]
    fn meta_content_charset_basic() {
        assert_eq!(
            extract_charset_from_meta_content("text/html; charset=utf-8"),
            Some("utf-8".to_string())
        );
    }

    #[test]
    fn meta_content_charset_quoted() {
        assert_eq!(
            extract_charset_from_meta_content("text/html; charset='utf-8'"),
            Some("utf-8".to_string())
        );
    }

    #[test]
    fn meta_content_charset_retries_after_non_assignment() {
        assert_eq!(
            extract_charset_from_meta_content("charset; charset=utf-8"),
            Some("utf-8".to_string())
        );
    }

    #[test]
    fn meta_content_charset_unmatched_quote() {
        assert_eq!(
            extract_charset_from_meta_content("text/html; charset='utf-8"),
            None
        );
    }

    #[test]
    fn meta_content_charset_missing_or_empty() {
        assert_eq!(extract_charset_from_meta_content("text/html"), None);
        assert_eq!(extract_charset_from_meta_content("text/html; charset="), None);
    }
}