//! Encoding sniffing per WHATWG Encoding Standard and HTML spec. //! //! Detects character encoding from BOM, HTTP Content-Type charset, or HTML meta prescan. use crate::{bom_sniff, lookup, Encoding}; /// How the encoding was determined. #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum EncodingSource { /// Byte Order Mark at the start of the byte stream. Bom, /// `charset` parameter from the HTTP `Content-Type` header. HttpHeader, /// `` or `` prescan. MetaPrescan, /// Default fallback (Windows-1252 for HTML). Default, } /// Sniff the encoding of a byte stream. /// /// Priority order per spec: BOM > HTTP Content-Type charset > HTML meta prescan > default. /// The default encoding is Windows-1252 per WHATWG spec for HTML. pub fn sniff_encoding(bytes: &[u8], http_content_type: Option<&str>) -> (Encoding, EncodingSource) { // 1. BOM sniffing (highest priority) let (bom_enc, _) = bom_sniff(bytes); if let Some(enc) = bom_enc { return (enc, EncodingSource::Bom); } // 2. HTTP Content-Type charset if let Some(ct) = http_content_type { if let Some(enc) = extract_charset_from_content_type(ct) { return (enc, EncodingSource::HttpHeader); } } // 3. HTML meta prescan (first 1024 bytes) if let Some(enc) = meta_prescan(bytes) { return (enc, EncodingSource::MetaPrescan); } // 4. Default: Windows-1252 (Encoding::Windows1252, EncodingSource::Default) } /// Extract charset from an HTTP `Content-Type` header value. /// /// Handles formats like: /// - `text/html; charset=utf-8` /// - `text/html; charset="utf-8"` /// - `text/html;charset=utf-8` (no space) /// /// Per WHATWG spec, the charset parameter value is looked up via the encoding label table. /// Returns `None` for UTF-16BE/LE from HTTP headers per spec (those are only valid via BOM). 
fn extract_charset_from_content_type(content_type: &str) -> Option { let charset_value = extract_charset_value(content_type)?; let enc = lookup(charset_value)?; // Per WHATWG: if the encoding from HTTP is UTF-16BE or UTF-16LE, use UTF-8 instead Some(match enc { Encoding::Utf16Be | Encoding::Utf16Le => Encoding::Utf8, other => other, }) } /// Extract the raw charset value from a Content-Type string. fn extract_charset_value(content_type: &str) -> Option<&str> { // Find "charset" (case-insensitive) after a ';' let lower = content_type.to_ascii_lowercase(); let idx = lower.find("charset")?; // Must be preceded by ';' or whitespace (or be in parameters section) let after_charset = &content_type[idx + 7..]; // Skip optional whitespace then '=' let after_charset = after_charset.trim_start(); let after_eq = after_charset.strip_prefix('=')?; let after_eq = after_eq.trim_start(); if let Some(inner) = after_eq.strip_prefix('"') { // Quoted value let end = inner.find('"')?; Some(&inner[..end]) } else { // Unquoted value: terminated by whitespace, ';', or end of string let end = after_eq .find(|c: char| c == ';' || c.is_ascii_whitespace()) .unwrap_or(after_eq.len()); if end == 0 { return None; } Some(&after_eq[..end]) } } /// Prescan the first 1024 bytes of an HTML document for encoding declarations. /// /// Per the HTML spec "prescan a byte stream to determine its encoding" algorithm. 
/// Looks for: /// - `` /// - `` fn meta_prescan(bytes: &[u8]) -> Option { let limit = bytes.len().min(1024); let bytes = &bytes[..limit]; let mut pos = 0; while pos < bytes.len() { // Skip until we find '<' if bytes[pos] != b'<' { pos += 1; continue; } pos += 1; if pos >= bytes.len() { break; } // Check for comment "" while pos + 2 < bytes.len() { if bytes[pos] == b'-' && bytes[pos + 1] == b'-' && bytes[pos + 2] == b'>' { pos += 3; break; } pos += 1; } continue; } // Check for " Encoding::Utf8, other => other, }; return Some(enc); } else { pos = skip_tag(bytes, after_meta); continue; } } } // Skip other tags (like , , etc.) if bytes[pos..].starts_with(b"!") || bytes[pos..].starts_with(b"/") || bytes[pos] == b'?' { pos = skip_tag(bytes, pos); continue; } // Check if it's a letter (start of a tag name) if pos < bytes.len() && bytes[pos].is_ascii_alphabetic() { pos = skip_tag(bytes, pos); continue; } // Not a tag, continue } None } /// Parse attributes of a ` Option<(Encoding, usize)> { let mut pos = start; let mut got_pragma = false; let mut need_pragma: Option = None; let mut charset: Option = None; loop { // Skip whitespace while pos < bytes.len() && bytes[pos].is_ascii_whitespace() { pos += 1; } if pos >= bytes.len() { break; } // End of tag? 
if bytes[pos] == b'>' || (bytes[pos] == b'/' && pos + 1 < bytes.len() && bytes[pos + 1] == b'>') { break; } let (attr_name, attr_value, new_pos) = parse_attribute(bytes, pos)?; pos = new_pos; if ascii_ci_eq_str(&attr_name, "http-equiv") { if ascii_ci_eq_str(&attr_value, "content-type") { got_pragma = true; } } else if ascii_ci_eq_str(&attr_name, "content") { if let Some(charset_val) = extract_charset_from_meta_content(&attr_value) { if let Some(enc) = lookup(&charset_val) { charset = Some(enc); need_pragma = Some(true); } } } else if ascii_ci_eq_str(&attr_name, "charset") { if let Some(enc) = lookup(&attr_value) { charset = Some(enc); need_pragma = Some(false); } } } // Determine result per spec match (need_pragma, charset) { (Some(true), Some(enc)) if got_pragma => Some((enc, pos)), (Some(false), Some(enc)) => Some((enc, pos)), _ => None, } } /// Parse a single HTML attribute (name=value pair). /// /// Returns (name, value, new_position). Returns None if we hit end of tag or input. fn parse_attribute(bytes: &[u8], start: usize) -> Option<(String, String, usize)> { let mut pos = start; // Skip whitespace while pos < bytes.len() && bytes[pos].is_ascii_whitespace() { pos += 1; } if pos >= bytes.len() || bytes[pos] == b'>' { return None; } // Read attribute name let name_start = pos; while pos < bytes.len() && bytes[pos] != b'=' && bytes[pos] != b'>' && !bytes[pos].is_ascii_whitespace() && bytes[pos] != b'/' { pos += 1; } let name = to_ascii_lowercase(&bytes[name_start..pos]); if name.is_empty() { return None; } // Skip whitespace while pos < bytes.len() && bytes[pos].is_ascii_whitespace() { pos += 1; } // No value if pos >= bytes.len() || bytes[pos] != b'=' { return Some((name, String::new(), pos)); } pos += 1; // skip '=' // Skip whitespace while pos < bytes.len() && bytes[pos].is_ascii_whitespace() { pos += 1; } if pos >= bytes.len() { return Some((name, String::new(), pos)); } // Read value let value; if bytes[pos] == b'"' || bytes[pos] == b'\'' { let quote = 
bytes[pos]; pos += 1; let val_start = pos; while pos < bytes.len() && bytes[pos] != quote { pos += 1; } value = to_ascii_lowercase(&bytes[val_start..pos]); if pos < bytes.len() { pos += 1; // skip closing quote } } else { let val_start = pos; while pos < bytes.len() && !bytes[pos].is_ascii_whitespace() && bytes[pos] != b'>' && bytes[pos] != b';' { pos += 1; } value = to_ascii_lowercase(&bytes[val_start..pos]); } Some((name, value, pos)) } /// Extract charset value from a meta content attribute value. /// /// Looks for `charset=` in strings like `text/html; charset=utf-8`. fn extract_charset_from_meta_content(content: &str) -> Option { let lower = content.to_ascii_lowercase(); let idx = lower.find("charset")?; let rest = &content[idx + 7..]; // Skip whitespace let rest = rest.trim_start(); let rest = rest.strip_prefix('=')?; let rest = rest.trim_start(); if rest.is_empty() { return None; } // The value is terminated by ';', whitespace, or end if rest.starts_with('"') || rest.starts_with('\'') { let quote = rest.as_bytes()[0]; let inner = &rest[1..]; let end = inner.find(quote as char).unwrap_or(inner.len()); let val = inner[..end].trim(); if val.is_empty() { return None; } Some(val.to_string()) } else { let end = rest .find(|c: char| c == ';' || c.is_ascii_whitespace()) .unwrap_or(rest.len()); if end == 0 { return None; } Some(rest[..end].to_string()) } } /// Skip a tag (find the closing '>'). 
fn skip_tag(bytes: &[u8], start: usize) -> usize {
    // Position just past the first '>' at or after `start`; when there is none, the
    // end of the input (or `start` itself if it already lies beyond the end).
    match bytes
        .get(start..)
        .and_then(|tail| tail.iter().position(|&b| b == b'>'))
    {
        Some(offset) => start + offset + 1,
        None => start.max(bytes.len()),
    }
}

/// True for ASCII whitespace and for `/`.
fn is_space_or_slash(b: u8) -> bool {
    b == b'/' || b.is_ascii_whitespace()
}

/// ASCII case-insensitive equality of two byte slices.
fn ascii_ci_eq(a: &[u8], b: &[u8]) -> bool {
    a.eq_ignore_ascii_case(b)
}

/// ASCII case-insensitive equality of two strings.
fn ascii_ci_eq_str(a: &str, b: &str) -> bool {
    a.eq_ignore_ascii_case(b)
}

/// Build a `String` from `bytes`, lowercasing ASCII letters; every byte becomes the
/// `char` with the same numeric value.
fn to_ascii_lowercase(bytes: &[u8]) -> String {
    bytes
        .iter()
        .map(|b| char::from(b.to_ascii_lowercase()))
        .collect()
}

#[cfg(test)]
mod tests {
    use super::*;

    // -----------------------------------------------------------------------
    // sniff_encoding — BOM priority
    // -----------------------------------------------------------------------

    #[test]
    fn sniff_bom_utf8() {
        let bytes = b"\xEF\xBB\xBFHello";
        let (enc, src) = sniff_encoding(bytes, None);
        assert_eq!(enc, Encoding::Utf8);
        assert_eq!(src, EncodingSource::Bom);
    }

    #[test]
    fn sniff_bom_utf16be() {
        let bytes = b"\xFE\xFF\x00A";
        let (enc, src) = sniff_encoding(bytes, None);
        assert_eq!(enc, Encoding::Utf16Be);
        assert_eq!(src, EncodingSource::Bom);
    }

    #[test]
    fn sniff_bom_utf16le() {
        let bytes = b"\xFF\xFEA\x00";
        let (enc, src) = sniff_encoding(bytes, None);
        assert_eq!(enc, Encoding::Utf16Le);
        assert_eq!(src, EncodingSource::Bom);
    }

    #[test]
    fn sniff_bom_beats_http_header() {
        let bytes = b"\xEF\xBB\xBFHello";
        let (enc, src) = sniff_encoding(bytes, Some("text/html; charset=iso-8859-2"));
        assert_eq!(enc, Encoding::Utf8);
        assert_eq!(src, EncodingSource::Bom);
    }

    #[test]
    fn sniff_bom_beats_meta() {
        let mut bytes = vec![0xEF, 0xBB, 0xBF];
        bytes.extend_from_slice(br#"<meta charset="iso-8859-2">"#);
        let (enc, src) = sniff_encoding(&bytes, None);
        assert_eq!(enc, Encoding::Utf8);
        assert_eq!(src, EncodingSource::Bom);
    }

    // -----------------------------------------------------------------------
    // sniff_encoding — HTTP Content-Type priority
    // -----------------------------------------------------------------------

    #[test]
    fn sniff_http_charset_utf8() {
        let (enc, src) = sniff_encoding(b"Hello", Some("text/html; charset=utf-8"));
        assert_eq!(enc, Encoding::Utf8);
        assert_eq!(src, EncodingSource::HttpHeader);
    }

    #[test]
    fn sniff_http_charset_quoted() {
        let (enc, src) = sniff_encoding(b"Hello", Some("text/html; charset=\"utf-8\""));
        assert_eq!(enc, Encoding::Utf8);
        assert_eq!(src, EncodingSource::HttpHeader);
    }

    #[test]
    fn sniff_http_charset_case_insensitive() {
        let (enc, src) = sniff_encoding(b"Hello", Some("text/html; Charset=UTF-8"));
        assert_eq!(enc, Encoding::Utf8);
        assert_eq!(src, EncodingSource::HttpHeader);
    }

    #[test]
    fn sniff_http_charset_no_space() {
        let (enc, src) = sniff_encoding(b"Hello", Some("text/html;charset=utf-8"));
        assert_eq!(enc, Encoding::Utf8);
        assert_eq!(src, EncodingSource::HttpHeader);
    }

    #[test]
    fn sniff_http_charset_windows_1252() {
        let (enc, src) = sniff_encoding(b"Hello", Some("text/html; charset=windows-1252"));
        assert_eq!(enc, Encoding::Windows1252);
        assert_eq!(src, EncodingSource::HttpHeader);
    }

    #[test]
    fn sniff_http_charset_iso_8859_1_maps_to_1252() {
        let (enc, src) = sniff_encoding(b"Hello", Some("text/html; charset=iso-8859-1"));
        assert_eq!(enc, Encoding::Windows1252);
        assert_eq!(src, EncodingSource::HttpHeader);
    }

    #[test]
    fn sniff_http_utf16_override_to_utf8() {
        // A UTF-16 label from the HTTP header is replaced by UTF-8.
        let (enc, src) = sniff_encoding(b"Hello", Some("text/html; charset=utf-16le"));
        assert_eq!(enc, Encoding::Utf8);
        assert_eq!(src, EncodingSource::HttpHeader);
    }

    #[test]
    fn sniff_http_no_charset() {
        let (enc, src) = sniff_encoding(b"Hello", Some("text/html"));
        // Falls through to default
        assert_eq!(enc, Encoding::Windows1252);
        assert_eq!(src, EncodingSource::Default);
    }

    #[test]
    fn sniff_http_beats_meta() {
        let html = br#"<meta charset="iso-8859-2">"#;
        let (enc, src) = sniff_encoding(html, Some("text/html; charset=utf-8"));
        assert_eq!(enc, Encoding::Utf8);
        assert_eq!(src, EncodingSource::HttpHeader);
    }

    // -----------------------------------------------------------------------
    // sniff_encoding — meta prescan
    // -----------------------------------------------------------------------

    #[test]
    fn sniff_meta_charset() {
        let html = br#"<meta charset="utf-8">"#;
        let (enc, src) = sniff_encoding(html, None);
        assert_eq!(enc, Encoding::Utf8);
        assert_eq!(src, EncodingSource::MetaPrescan);
    }

    #[test]
    fn sniff_meta_charset_single_quotes() {
        let html = b"<meta charset='utf-8'>";
        let (enc, src) = sniff_encoding(html, None);
        assert_eq!(enc, Encoding::Utf8);
        assert_eq!(src, EncodingSource::MetaPrescan);
    }

    #[test]
    fn sniff_meta_http_equiv() {
        let html = br#"<meta http-equiv="Content-Type" content="text/html; charset=utf-8">"#;
        let (enc, src) = sniff_encoding(html, None);
        assert_eq!(enc, Encoding::Utf8);
        assert_eq!(src, EncodingSource::MetaPrescan);
    }

    #[test]
    fn sniff_meta_http_equiv_case_insensitive() {
        let html = br#"<META HTTP-EQUIV="content-type" CONTENT="text/html; CHARSET=UTF-8">"#;
        let (enc, src) = sniff_encoding(html, None);
        assert_eq!(enc, Encoding::Utf8);
        assert_eq!(src, EncodingSource::MetaPrescan);
    }

    #[test]
    fn sniff_meta_charset_legacy_encoding() {
        let html = br#"<meta charset="windows-1251">"#;
        let (enc, src) = sniff_encoding(html, None);
        assert_eq!(enc, Encoding::Windows1251);
        assert_eq!(src, EncodingSource::MetaPrescan);
    }

    #[test]
    fn sniff_meta_utf16_override_to_utf8() {
        let html = br#"<meta charset="utf-16le">"#;
        let (enc, src) = sniff_encoding(html, None);
        assert_eq!(enc, Encoding::Utf8);
        assert_eq!(src, EncodingSource::MetaPrescan);
    }

    #[test]
    fn sniff_meta_with_doctype_and_html() {
        let html = br#"<!DOCTYPE html><html><head><meta charset="utf-8"></head></html>"#;
        let (enc, src) = sniff_encoding(html, None);
        assert_eq!(enc, Encoding::Utf8);
        assert_eq!(src, EncodingSource::MetaPrescan);
    }

    #[test]
    fn sniff_meta_with_comment_before() {
        let html = br#"<!-- comment --><meta charset="utf-8">"#;
        let (enc, src) = sniff_encoding(html, None);
        assert_eq!(enc, Encoding::Utf8);
        assert_eq!(src, EncodingSource::MetaPrescan);
    }

    #[test]
    fn sniff_meta_beyond_1024_bytes_not_found() {
        let mut html = vec![b' '; 1024];
        html.extend_from_slice(br#"<meta charset="utf-8">"#);
        let (enc, src) = sniff_encoding(&html, None);
        assert_eq!(enc, Encoding::Windows1252);
        assert_eq!(src, EncodingSource::Default);
    }

    #[test]
    fn sniff_meta_within_1024_bytes() {
        // 1000 bytes of padding + 22 bytes of markup stays inside the 1024-byte window.
        let mut html = vec![b' '; 1000];
        html.extend_from_slice(br#"<meta charset="utf-8">"#);
        let (enc, src) = sniff_encoding(&html, None);
        assert_eq!(enc, Encoding::Utf8);
        assert_eq!(src, EncodingSource::MetaPrescan);
    }

    // -----------------------------------------------------------------------
    // sniff_encoding — default fallback
    // -----------------------------------------------------------------------

    #[test]
    fn sniff_default_no_signals() {
        let (enc, src) = sniff_encoding(b"Hello world", None);
        assert_eq!(enc, Encoding::Windows1252);
        assert_eq!(src, EncodingSource::Default);
    }

    #[test]
    fn sniff_default_empty() {
        let (enc, src) = sniff_encoding(b"", None);
        assert_eq!(enc, Encoding::Windows1252);
        assert_eq!(src, EncodingSource::Default);
    }

    // -----------------------------------------------------------------------
    // extract_charset_from_content_type
    // -----------------------------------------------------------------------

    #[test]
    fn extract_charset_basic() {
        assert_eq!(
            extract_charset_from_content_type("text/html; charset=utf-8"),
            Some(Encoding::Utf8)
        );
    }

    #[test]
    fn extract_charset_quoted() {
        assert_eq!(
            extract_charset_from_content_type("text/html; charset=\"utf-8\""),
            Some(Encoding::Utf8)
        );
    }

    #[test]
    fn extract_charset_no_space() {
        assert_eq!(
            extract_charset_from_content_type("text/html;charset=utf-8"),
            Some(Encoding::Utf8)
        );
    }

    #[test]
    fn extract_charset_uppercase() {
        assert_eq!(
            extract_charset_from_content_type("text/html; CHARSET=UTF-8"),
            Some(Encoding::Utf8)
        );
    }

    #[test]
    fn extract_charset_missing() {
        assert_eq!(extract_charset_from_content_type("text/html"), None);
    }

    #[test]
    fn extract_charset_empty_value() {
        assert_eq!(
            extract_charset_from_content_type("text/html; charset="),
            None
        );
    }

    #[test]
    fn extract_charset_unknown_encoding() {
        assert_eq!(
            extract_charset_from_content_type("text/html; charset=bogus"),
            None
        );
    }

    #[test]
    fn extract_charset_with_extra_params() {
        assert_eq!(
            extract_charset_from_content_type("text/html; charset=utf-8; boundary=something"),
            Some(Encoding::Utf8)
        );
    }

    // -----------------------------------------------------------------------
    // meta_prescan internals
    // -----------------------------------------------------------------------

    #[test]
    fn meta_prescan_charset_attr() {
        let html = br#"<meta charset="iso-8859-2">"#;
        assert_eq!(meta_prescan(html), Some(Encoding::Iso8859_2));
    }

    #[test]
    fn meta_prescan_http_equiv_content() {
        let html = br#"<meta http-equiv="Content-Type" content="text/html; charset=koi8-r">"#;
        assert_eq!(meta_prescan(html), Some(Encoding::Koi8R));
    }

    #[test]
    fn meta_prescan_no_meta() {
        let html = b"<html><head><title>Test</title></head></html>";
        assert_eq!(meta_prescan(html), None);
    }

    #[test]
    fn meta_prescan_meta_without_charset() {
        let html = br#"<meta name="viewport" content="width=device-width">"#;
        assert_eq!(meta_prescan(html), None);
    }

    #[test]
    fn meta_prescan_http_equiv_without_content() {
        let html = br#"<meta http-equiv="Content-Type">"#;
        assert_eq!(meta_prescan(html), None);
    }

    #[test]
    fn meta_prescan_content_without_http_equiv() {
        // charset in content but no http-equiv="content-type" -> need_pragma is true but
        // got_pragma is false
        let html = br#"<meta content="text/html; charset=utf-8">"#;
        assert_eq!(meta_prescan(html), None);
    }

    #[test]
    fn meta_prescan_skips_comments() {
        let html = br#"<!-- <meta charset="iso-8859-2"> --><meta charset="utf-8">"#;
        assert_eq!(meta_prescan(html), Some(Encoding::Utf8));
    }

    #[test]
    fn meta_prescan_unquoted_charset() {
        let html = b"<meta charset=utf-8>";
        assert_eq!(meta_prescan(html), Some(Encoding::Utf8));
    }

    #[test]
    fn meta_prescan_self_closing() {
        let html = br#"<meta charset="utf-8" />"#;
        assert_eq!(meta_prescan(html), Some(Encoding::Utf8));
    }
}