//! Encoding sniffing per WHATWG Encoding Standard and HTML spec.
//!
//! Detects character encoding from BOM, HTTP Content-Type charset, or HTML meta prescan.
use crate::{bom_sniff, lookup, Encoding};
/// How the encoding was determined.
///
/// One variant per signal that `sniff_encoding` consults.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum EncodingSource {
/// Byte Order Mark at the start of the byte stream.
Bom,
/// `charset` parameter from the HTTP `Content-Type` header.
HttpHeader,
/// `<meta charset>` or `<meta http-equiv="Content-Type">` declaration found by the HTML meta prescan.
MetaPrescan,
/// Default fallback (Windows-1252 for HTML).
Default,
}
/// Sniff the encoding of a byte stream.
///
/// Priority order per spec: BOM > HTTP Content-Type charset > HTML meta prescan > default.
/// The default encoding is Windows-1252 per WHATWG spec for HTML.
pub fn sniff_encoding(bytes: &[u8], http_content_type: Option<&str>) -> (Encoding, EncodingSource) {
// 1. BOM sniffing (highest priority)
let (bom_enc, _) = bom_sniff(bytes);
if let Some(enc) = bom_enc {
return (enc, EncodingSource::Bom);
}
// 2. HTTP Content-Type charset
if let Some(ct) = http_content_type {
if let Some(enc) = extract_charset_from_content_type(ct) {
return (enc, EncodingSource::HttpHeader);
}
}
// 3. HTML meta prescan (first 1024 bytes)
if let Some(enc) = meta_prescan(bytes) {
return (enc, EncodingSource::MetaPrescan);
}
// 4. Default: Windows-1252
(Encoding::Windows1252, EncodingSource::Default)
}
/// Extract charset from an HTTP `Content-Type` header value.
///
/// Handles formats like:
/// - `text/html; charset=utf-8`
/// - `text/html; charset="utf-8"`
/// - `text/html;charset=utf-8` (no space)
///
/// Per WHATWG spec, the charset parameter value is looked up via the encoding label table.
/// Returns `None` for UTF-16BE/LE from HTTP headers per spec (those are only valid via BOM).
fn extract_charset_from_content_type(content_type: &str) -> Option {
let charset_value = extract_charset_value(content_type)?;
let enc = lookup(charset_value)?;
// Per WHATWG: if the encoding from HTTP is UTF-16BE or UTF-16LE, use UTF-8 instead
Some(match enc {
Encoding::Utf16Be | Encoding::Utf16Le => Encoding::Utf8,
other => other,
})
}
/// Extract the raw `charset` value from a `Content-Type` string.
///
/// The keyword is matched ASCII case-insensitively. Every occurrence of
/// `charset` is tried in turn: one that is not followed by `=` (optionally
/// surrounded by whitespace) is skipped, so a media type or parameter name
/// that merely contains the word does not hide a real `charset=` parameter.
///
/// The first occurrence that is followed by `=` decides the result:
/// - a double-quoted value yields the text between the quotes, or `None` if
///   the closing quote is missing;
/// - an unquoted value runs up to the next `;`, ASCII whitespace, or the end
///   of the string, and yields `None` if it is empty.
///
/// The returned slice borrows from `content_type` and keeps its original case.
fn extract_charset_value(content_type: &str) -> Option<&str> {
    const KEYWORD: &str = "charset";

    // ASCII lowercasing never changes byte lengths, so offsets found in
    // `lower` are valid offsets into `content_type` as well.
    let lower = content_type.to_ascii_lowercase();
    let mut from = 0;

    while let Some(found) = lower[from..].find(KEYWORD) {
        // Resume any later search just past this occurrence.
        from += found + KEYWORD.len();

        let after_eq = match content_type[from..].trim_start().strip_prefix('=') {
            Some(rest) => rest.trim_start(),
            // "charset" without '=': not the parameter, try the next occurrence.
            None => continue,
        };

        return match after_eq.strip_prefix('"') {
            // Quoted value: requires a closing quote.
            Some(inner) => inner.find('"').map(|end| &inner[..end]),
            // Unquoted value: terminated by whitespace, ';', or end of string.
            None => {
                let end = after_eq
                    .find(|c: char| c == ';' || c.is_ascii_whitespace())
                    .unwrap_or(after_eq.len());
                if end == 0 {
                    None
                } else {
                    Some(&after_eq[..end])
                }
            }
        };
    }

    None
}
/// Prescan the first 1024 bytes of an HTML document for encoding declarations.
///
/// Per the HTML spec "prescan a byte stream to determine its encoding" algorithm.
/// Looks for:
/// - ``
/// - ``
fn meta_prescan(bytes: &[u8]) -> Option {
let limit = bytes.len().min(1024);
let bytes = &bytes[..limit];
let mut pos = 0;
while pos < bytes.len() {
// Skip until we find '<'
if bytes[pos] != b'<' {
pos += 1;
continue;
}
pos += 1;
if pos >= bytes.len() {
break;
}
// Check for comment ""
while pos + 2 < bytes.len() {
if bytes[pos] == b'-' && bytes[pos + 1] == b'-' && bytes[pos + 2] == b'>' {
pos += 3;
break;
}
pos += 1;
}
continue;
}
// Check for " Encoding::Utf8,
other => other,
};
return Some(enc);
} else {
pos = skip_tag(bytes, after_meta);
continue;
}
}
}
// Skip other tags (like , , etc.)
if bytes[pos..].starts_with(b"!") || bytes[pos..].starts_with(b"/") || bytes[pos] == b'?' {
pos = skip_tag(bytes, pos);
continue;
}
// Check if it's a letter (start of a tag name)
if pos < bytes.len() && bytes[pos].is_ascii_alphabetic() {
pos = skip_tag(bytes, pos);
continue;
}
// Not a tag, continue
}
None
}
/// Parse attributes of a ` Option<(Encoding, usize)> {
let mut pos = start;
let mut got_pragma = false;
let mut need_pragma: Option = None;
let mut charset: Option = None;
loop {
// Skip whitespace
while pos < bytes.len() && bytes[pos].is_ascii_whitespace() {
pos += 1;
}
if pos >= bytes.len() {
break;
}
// End of tag?
if bytes[pos] == b'>'
|| (bytes[pos] == b'/' && pos + 1 < bytes.len() && bytes[pos + 1] == b'>')
{
break;
}
let (attr_name, attr_value, new_pos) = parse_attribute(bytes, pos)?;
pos = new_pos;
if ascii_ci_eq_str(&attr_name, "http-equiv") {
if ascii_ci_eq_str(&attr_value, "content-type") {
got_pragma = true;
}
} else if ascii_ci_eq_str(&attr_name, "content") {
if let Some(charset_val) = extract_charset_from_meta_content(&attr_value) {
if let Some(enc) = lookup(&charset_val) {
charset = Some(enc);
need_pragma = Some(true);
}
}
} else if ascii_ci_eq_str(&attr_name, "charset") {
if let Some(enc) = lookup(&attr_value) {
charset = Some(enc);
need_pragma = Some(false);
}
}
}
// Determine result per spec
match (need_pragma, charset) {
(Some(true), Some(enc)) if got_pragma => Some((enc, pos)),
(Some(false), Some(enc)) => Some((enc, pos)),
_ => None,
}
}
/// Read one HTML attribute (`name`, optionally followed by `= value`) beginning at `start`.
///
/// Leading ASCII whitespace is skipped first. The result is
/// `(name, value, next_position)` with name and value ASCII-lowercased; an
/// attribute without a value gets an empty string. `None` is returned when the
/// input is exhausted, the next byte is `>`, or no name byte could be read.
///
/// Values may be wrapped in single or double quotes. An unquoted value ends at
/// ASCII whitespace, `>`, or `;`.
fn parse_attribute(bytes: &[u8], start: usize) -> Option<(String, String, usize)> {
    /// Advance from `from` while `keep` accepts the current byte.
    fn scan(bytes: &[u8], from: usize, keep: impl Fn(u8) -> bool) -> usize {
        let mut idx = from;
        while idx < bytes.len() && keep(bytes[idx]) {
            idx += 1;
        }
        idx
    }
    let is_ws = |b: u8| b.is_ascii_whitespace();

    // Nothing to parse at end of input or end of tag.
    let name_begin = scan(bytes, start, is_ws);
    match bytes.get(name_begin) {
        None | Some(&b'>') => return None,
        Some(_) => {}
    }

    // The name runs up to '=', '>', '/', or whitespace.
    let name_end = scan(bytes, name_begin, |b| {
        !matches!(b, b'=' | b'>' | b'/') && !b.is_ascii_whitespace()
    });
    if name_end == name_begin {
        return None;
    }
    let name = to_ascii_lowercase(&bytes[name_begin..name_end]);

    // Without an '=' the attribute has no value.
    let eq_at = scan(bytes, name_end, is_ws);
    if bytes.get(eq_at) != Some(&b'=') {
        return Some((name, String::new(), eq_at));
    }

    // Input may end right after the '='.
    let value_begin = scan(bytes, eq_at + 1, is_ws);
    let opener = match bytes.get(value_begin) {
        Some(&b) => b,
        None => return Some((name, String::new(), value_begin)),
    };

    if opener == b'"' || opener == b'\'' {
        let inner_begin = value_begin + 1;
        let inner_end = scan(bytes, inner_begin, |b| b != opener);
        let value = to_ascii_lowercase(&bytes[inner_begin..inner_end]);
        // Step over the closing quote when there is one.
        let next = if inner_end < bytes.len() {
            inner_end + 1
        } else {
            inner_end
        };
        Some((name, value, next))
    } else {
        let value_end = scan(bytes, value_begin, |b| {
            !b.is_ascii_whitespace() && b != b'>' && b != b';'
        });
        let value = to_ascii_lowercase(&bytes[value_begin..value_end]);
        Some((name, value, value_end))
    }
}
/// Extract the charset label from a `<meta content="...">` attribute value.
///
/// Looks for `charset=` in strings like `text/html; charset=utf-8`. The keyword
/// is matched ASCII case-insensitively, and every occurrence is tried in turn:
/// one that is not followed by `=` (optionally surrounded by whitespace) is
/// skipped and the search continues, as in the HTML spec's algorithm for
/// extracting a character encoding from a `meta` element.
///
/// The first occurrence followed by `=` decides the result:
/// - a value opening with `"` or `'` runs to the matching quote (or to the end
///   of the string if it is missing) and is trimmed;
/// - an unquoted value runs to the next `;`, ASCII whitespace, or end of string.
///
/// Returns `None` when no `charset=` is present or the value is empty.
fn extract_charset_from_meta_content(content: &str) -> Option<String> {
    const KEYWORD: &str = "charset";

    // ASCII lowercasing never changes byte lengths, so offsets found in
    // `lower` are valid offsets into `content` as well.
    let lower = content.to_ascii_lowercase();
    let mut from = 0;

    while let Some(found) = lower[from..].find(KEYWORD) {
        // Resume any later search just past this occurrence.
        from += found + KEYWORD.len();

        let rest = match content[from..].trim_start().strip_prefix('=') {
            Some(after_eq) => after_eq.trim_start(),
            // "charset" without '=': try the next occurrence.
            None => continue,
        };

        // Nothing after the '=' means there is no value at all.
        let first = rest.chars().next()?;
        let value = if first == '"' || first == '\'' {
            let inner = &rest[1..];
            let end = inner.find(first).unwrap_or(inner.len());
            inner[..end].trim()
        } else {
            let end = rest
                .find(|c: char| c == ';' || c.is_ascii_whitespace())
                .unwrap_or(rest.len());
            &rest[..end]
        };

        return if value.is_empty() {
            None
        } else {
            Some(value.to_string())
        };
    }

    None
}
/// Return the index just past the first `>` found at or after `start`.
///
/// If no `>` remains, the result is the end of `bytes` (or `start` itself when
/// `start` already lies beyond the end).
fn skip_tag(bytes: &[u8], start: usize) -> usize {
    match bytes.iter().skip(start).position(|&b| b == b'>') {
        Some(offset) => start + offset + 1,
        None => start.max(bytes.len()),
    }
}
/// True for `/` and for the ASCII whitespace bytes (space, tab, LF, FF, CR).
fn is_space_or_slash(b: u8) -> bool {
    matches!(b, b'/' | b' ' | b'\t' | b'\n' | b'\x0C' | b'\r')
}
/// Compare two byte slices for equality, ignoring ASCII case.
fn ascii_ci_eq(a: &[u8], b: &[u8]) -> bool {
    a.eq_ignore_ascii_case(b)
}
/// Compare two strings for equality, ignoring ASCII case.
fn ascii_ci_eq_str(a: &str, b: &str) -> bool {
    a.len() == b.len()
        && a.bytes()
            .zip(b.bytes())
            .all(|(left, right)| left.eq_ignore_ascii_case(&right))
}
/// Build a `String` from `bytes`, lowercasing ASCII letters.
///
/// Each byte becomes the `char` with the same numeric value, so bytes at or
/// above 0x80 map to U+0080..=U+00FF rather than being decoded as UTF-8.
fn to_ascii_lowercase(bytes: &[u8]) -> String {
    let mut lowered = String::with_capacity(bytes.len());
    for &byte in bytes {
        lowered.push(char::from(byte.to_ascii_lowercase()));
    }
    lowered
}
// NOTE(review): the HTML fixtures below were rebuilt from the test names and
// expected values after the original byte literals were lost to markup
// stripping — confirm against version control.
#[cfg(test)]
mod tests {
    use super::*;

    // -----------------------------------------------------------------------
    // sniff_encoding — BOM priority
    // -----------------------------------------------------------------------
    #[test]
    fn sniff_bom_utf8() {
        let bytes = b"\xEF\xBB\xBFHello";
        let (enc, src) = sniff_encoding(bytes, None);
        assert_eq!(enc, Encoding::Utf8);
        assert_eq!(src, EncodingSource::Bom);
    }

    #[test]
    fn sniff_bom_utf16be() {
        let bytes = b"\xFE\xFF\x00A";
        let (enc, src) = sniff_encoding(bytes, None);
        assert_eq!(enc, Encoding::Utf16Be);
        assert_eq!(src, EncodingSource::Bom);
    }

    #[test]
    fn sniff_bom_utf16le() {
        let bytes = b"\xFF\xFEA\x00";
        let (enc, src) = sniff_encoding(bytes, None);
        assert_eq!(enc, Encoding::Utf16Le);
        assert_eq!(src, EncodingSource::Bom);
    }

    #[test]
    fn sniff_bom_beats_http_header() {
        let bytes = b"\xEF\xBB\xBFHello";
        let (enc, src) = sniff_encoding(bytes, Some("text/html; charset=iso-8859-2"));
        assert_eq!(enc, Encoding::Utf8);
        assert_eq!(src, EncodingSource::Bom);
    }

    #[test]
    fn sniff_bom_beats_meta() {
        let mut bytes = vec![0xEF, 0xBB, 0xBF];
        bytes.extend_from_slice(b"<meta charset=\"iso-8859-2\">");
        let (enc, src) = sniff_encoding(&bytes, None);
        assert_eq!(enc, Encoding::Utf8);
        assert_eq!(src, EncodingSource::Bom);
    }

    // -----------------------------------------------------------------------
    // sniff_encoding — HTTP Content-Type priority
    // -----------------------------------------------------------------------
    #[test]
    fn sniff_http_charset_utf8() {
        let (enc, src) = sniff_encoding(b"Hello", Some("text/html; charset=utf-8"));
        assert_eq!(enc, Encoding::Utf8);
        assert_eq!(src, EncodingSource::HttpHeader);
    }

    #[test]
    fn sniff_http_charset_quoted() {
        let (enc, src) = sniff_encoding(b"Hello", Some("text/html; charset=\"utf-8\""));
        assert_eq!(enc, Encoding::Utf8);
        assert_eq!(src, EncodingSource::HttpHeader);
    }

    #[test]
    fn sniff_http_charset_case_insensitive() {
        let (enc, src) = sniff_encoding(b"Hello", Some("text/html; Charset=UTF-8"));
        assert_eq!(enc, Encoding::Utf8);
        assert_eq!(src, EncodingSource::HttpHeader);
    }

    #[test]
    fn sniff_http_charset_no_space() {
        let (enc, src) = sniff_encoding(b"Hello", Some("text/html;charset=utf-8"));
        assert_eq!(enc, Encoding::Utf8);
        assert_eq!(src, EncodingSource::HttpHeader);
    }

    #[test]
    fn sniff_http_charset_windows_1252() {
        let (enc, src) = sniff_encoding(b"Hello", Some("text/html; charset=windows-1252"));
        assert_eq!(enc, Encoding::Windows1252);
        assert_eq!(src, EncodingSource::HttpHeader);
    }

    #[test]
    fn sniff_http_charset_iso_8859_1_maps_to_1252() {
        let (enc, src) = sniff_encoding(b"Hello", Some("text/html; charset=iso-8859-1"));
        assert_eq!(enc, Encoding::Windows1252);
        assert_eq!(src, EncodingSource::HttpHeader);
    }

    #[test]
    fn sniff_http_utf16_override_to_utf8() {
        // UTF-16 named by the HTTP header is reported as UTF-8.
        let (enc, src) = sniff_encoding(b"Hello", Some("text/html; charset=utf-16le"));
        assert_eq!(enc, Encoding::Utf8);
        assert_eq!(src, EncodingSource::HttpHeader);
    }

    #[test]
    fn sniff_http_no_charset() {
        let (enc, src) = sniff_encoding(b"Hello", Some("text/html"));
        // Falls through to default
        assert_eq!(enc, Encoding::Windows1252);
        assert_eq!(src, EncodingSource::Default);
    }

    #[test]
    fn sniff_http_beats_meta() {
        let html = b"<meta charset=\"iso-8859-2\">";
        let (enc, src) = sniff_encoding(html, Some("text/html; charset=utf-8"));
        assert_eq!(enc, Encoding::Utf8);
        assert_eq!(src, EncodingSource::HttpHeader);
    }

    // -----------------------------------------------------------------------
    // sniff_encoding — meta prescan
    // -----------------------------------------------------------------------
    #[test]
    fn sniff_meta_charset() {
        let html = b"<meta charset=\"utf-8\">";
        let (enc, src) = sniff_encoding(html, None);
        assert_eq!(enc, Encoding::Utf8);
        assert_eq!(src, EncodingSource::MetaPrescan);
    }

    #[test]
    fn sniff_meta_charset_single_quotes() {
        let html = b"<meta charset='utf-8'>";
        let (enc, src) = sniff_encoding(html, None);
        assert_eq!(enc, Encoding::Utf8);
        assert_eq!(src, EncodingSource::MetaPrescan);
    }

    #[test]
    fn sniff_meta_http_equiv() {
        let html = b"<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\">";
        let (enc, src) = sniff_encoding(html, None);
        assert_eq!(enc, Encoding::Utf8);
        assert_eq!(src, EncodingSource::MetaPrescan);
    }

    #[test]
    fn sniff_meta_http_equiv_case_insensitive() {
        let html = b"<META HTTP-EQUIV=\"CONTENT-TYPE\" CONTENT=\"text/html; CHARSET=UTF-8\">";
        let (enc, src) = sniff_encoding(html, None);
        assert_eq!(enc, Encoding::Utf8);
        assert_eq!(src, EncodingSource::MetaPrescan);
    }

    #[test]
    fn sniff_meta_charset_legacy_encoding() {
        let html = b"<meta charset=\"windows-1251\">";
        let (enc, src) = sniff_encoding(html, None);
        assert_eq!(enc, Encoding::Windows1251);
        assert_eq!(src, EncodingSource::MetaPrescan);
    }

    #[test]
    fn sniff_meta_utf16_override_to_utf8() {
        let html = b"<meta charset=\"utf-16le\">";
        let (enc, src) = sniff_encoding(html, None);
        assert_eq!(enc, Encoding::Utf8);
        assert_eq!(src, EncodingSource::MetaPrescan);
    }

    #[test]
    fn sniff_meta_with_doctype_and_html() {
        let html = b"<!DOCTYPE html><html><head><meta charset=\"utf-8\"></head></html>";
        let (enc, src) = sniff_encoding(html, None);
        assert_eq!(enc, Encoding::Utf8);
        assert_eq!(src, EncodingSource::MetaPrescan);
    }

    #[test]
    fn sniff_meta_with_comment_before() {
        let html = b"<!-- comment --><meta charset=\"utf-8\">";
        let (enc, src) = sniff_encoding(html, None);
        assert_eq!(enc, Encoding::Utf8);
        assert_eq!(src, EncodingSource::MetaPrescan);
    }

    #[test]
    fn sniff_meta_beyond_1024_bytes_not_found() {
        let mut html = vec![b' '; 1024];
        html.extend_from_slice(b"<meta charset=\"utf-8\">");
        let (enc, src) = sniff_encoding(&html, None);
        assert_eq!(enc, Encoding::Windows1252);
        assert_eq!(src, EncodingSource::Default);
    }

    #[test]
    fn sniff_meta_within_1024_bytes() {
        // 1000 bytes of padding + a 22-byte tag stays inside the prescan window.
        let mut html = vec![b' '; 1000];
        html.extend_from_slice(b"<meta charset=\"utf-8\">");
        let (enc, src) = sniff_encoding(&html, None);
        assert_eq!(enc, Encoding::Utf8);
        assert_eq!(src, EncodingSource::MetaPrescan);
    }

    // -----------------------------------------------------------------------
    // sniff_encoding — default fallback
    // -----------------------------------------------------------------------
    #[test]
    fn sniff_default_no_signals() {
        let (enc, src) = sniff_encoding(b"Hello world", None);
        assert_eq!(enc, Encoding::Windows1252);
        assert_eq!(src, EncodingSource::Default);
    }

    #[test]
    fn sniff_default_empty() {
        let (enc, src) = sniff_encoding(b"", None);
        assert_eq!(enc, Encoding::Windows1252);
        assert_eq!(src, EncodingSource::Default);
    }

    // -----------------------------------------------------------------------
    // extract_charset_from_content_type
    // -----------------------------------------------------------------------
    #[test]
    fn extract_charset_basic() {
        assert_eq!(
            extract_charset_from_content_type("text/html; charset=utf-8"),
            Some(Encoding::Utf8)
        );
    }

    #[test]
    fn extract_charset_quoted() {
        assert_eq!(
            extract_charset_from_content_type("text/html; charset=\"utf-8\""),
            Some(Encoding::Utf8)
        );
    }

    #[test]
    fn extract_charset_no_space() {
        assert_eq!(
            extract_charset_from_content_type("text/html;charset=utf-8"),
            Some(Encoding::Utf8)
        );
    }

    #[test]
    fn extract_charset_uppercase() {
        assert_eq!(
            extract_charset_from_content_type("text/html; CHARSET=UTF-8"),
            Some(Encoding::Utf8)
        );
    }

    #[test]
    fn extract_charset_missing() {
        assert_eq!(extract_charset_from_content_type("text/html"), None);
    }

    #[test]
    fn extract_charset_empty_value() {
        assert_eq!(
            extract_charset_from_content_type("text/html; charset="),
            None
        );
    }

    #[test]
    fn extract_charset_unknown_encoding() {
        assert_eq!(
            extract_charset_from_content_type("text/html; charset=bogus"),
            None
        );
    }

    #[test]
    fn extract_charset_with_extra_params() {
        assert_eq!(
            extract_charset_from_content_type("text/html; charset=utf-8; boundary=something"),
            Some(Encoding::Utf8)
        );
    }

    // -----------------------------------------------------------------------
    // meta_prescan internals
    // -----------------------------------------------------------------------
    #[test]
    fn meta_prescan_charset_attr() {
        let html = b"<meta charset=\"iso-8859-2\">";
        assert_eq!(meta_prescan(html), Some(Encoding::Iso8859_2));
    }

    #[test]
    fn meta_prescan_http_equiv_content() {
        let html = b"<meta http-equiv=\"content-type\" content=\"text/html; charset=koi8-r\">";
        assert_eq!(meta_prescan(html), Some(Encoding::Koi8R));
    }

    #[test]
    fn meta_prescan_no_meta() {
        let html = b"<html><head><title>Test</title></head></html>";
        assert_eq!(meta_prescan(html), None);
    }

    #[test]
    fn meta_prescan_meta_without_charset() {
        let html = b"<meta name=\"viewport\" content=\"width=device-width\">";
        assert_eq!(meta_prescan(html), None);
    }

    #[test]
    fn meta_prescan_http_equiv_without_content() {
        let html = b"<meta http-equiv=\"content-type\">";
        assert_eq!(meta_prescan(html), None);
    }

    #[test]
    fn meta_prescan_content_without_http_equiv() {
        // charset in content but no http-equiv="content-type" -> need_pragma is true but got_pragma is false
        let html = b"<meta content=\"text/html; charset=utf-8\">";
        assert_eq!(meta_prescan(html), None);
    }

    #[test]
    fn meta_prescan_skips_comments() {
        let html = b"<!-- <meta charset=\"iso-8859-2\"> --><meta charset=\"utf-8\">";
        assert_eq!(meta_prescan(html), Some(Encoding::Utf8));
    }

    #[test]
    fn meta_prescan_unquoted_charset() {
        let html = b"<meta charset=utf-8>";
        assert_eq!(meta_prescan(html), Some(Encoding::Utf8));
    }

    #[test]
    fn meta_prescan_self_closing() {
        let html = b"<meta charset=\"utf-8\" />";
        assert_eq!(meta_prescan(html), Some(Encoding::Utf8));
    }
}