OCaml HTML5 parser/serialiser based on Python's JustHTML
(* Encoding label normalization, loosely following the WHATWG Encoding
   Standard; only a small subset of labels is recognized. *)
2
(** [normalize_label label] maps an encoding label to the encoding it names.

    The label is passed through [Astring.String.trim] and then
    ASCII-lowercased before being compared against the known labels.

    @return [Some enc] for a recognized label, or [None] when the label is
    empty, becomes empty after trimming, or is not one of the labels listed
    below. *)
let normalize_label label =
  let trimmed = Astring.String.trim label in
  match Astring.String.Ascii.lowercase trimmed with
  | "" -> None
  (* Security: UTF-7 is never honored; its labels map to windows-1252. *)
  | "utf-7" | "utf7" | "x-utf-7" -> Some Encoding_types.Windows_1252
  | "utf-8" | "utf8" -> Some Encoding_types.Utf8
  (* HTML treats latin-1 labels as windows-1252. *)
  | "iso-8859-1" | "iso8859-1" | "latin1" | "latin-1" | "l1" | "cp819"
  | "ibm819" ->
    Some Encoding_types.Windows_1252
  | "windows-1252" | "windows1252" | "cp1252" | "x-cp1252" ->
    Some Encoding_types.Windows_1252
  | "iso-8859-2" | "iso8859-2" | "latin2" | "latin-2" ->
    Some Encoding_types.Iso_8859_2
  | "euc-jp" | "eucjp" -> Some Encoding_types.Euc_jp
  (* A bare "utf-16" does not say which byte order; default to LE. *)
  | "utf-16" | "utf16" -> Some Encoding_types.Utf16le
  | "utf-16le" | "utf16le" -> Some Encoding_types.Utf16le
  | "utf-16be" | "utf16be" -> Some Encoding_types.Utf16be
  | _ -> None
32
(** [normalize_meta_declared label] normalizes an encoding label that was
    declared in a [<meta>] element.

    The label is first resolved with [normalize_label]. Per HTML meta charset
    handling, a UTF-16 declaration (either byte order) is not honored and is
    reported as UTF-8 instead; every other result, including [None] for an
    unrecognized label, is returned unchanged. *)
let normalize_meta_declared label =
  match normalize_label label with
  | Some (Encoding_types.Utf16le | Encoding_types.Utf16be) ->
    Some Encoding_types.Utf8
  | (None | Some _) as resolved -> resolved