(* OCaml HTML5 parser/serialiser based on Python's JustHTML *)
(* Encoding label normalization per WHATWG Encoding Standard *)

(** [normalize_label label] resolves a charset label to an encoding.

    The label is trimmed with [Astring.String.trim] and ASCII-lowercased
    before being compared against the table below. The result is [None]
    when the label is empty once trimmed, or is not listed here. *)
let normalize_label label =
  let trimmed = Astring.String.trim label in
  match Astring.String.Ascii.lowercase trimmed with
  | "" -> None
  (* Security: a UTF-7 label is never honoured; windows-1252 is returned
     in its place. *)
  | "utf-7" | "utf7" | "x-utf-7" -> Some Encoding_types.Windows_1252
  | "utf-8" | "utf8" -> Some Encoding_types.Utf8
  (* HTML treats latin-1 labels as windows-1252. *)
  | "iso-8859-1" | "iso8859-1" | "latin1" | "latin-1" | "l1" | "cp819"
  | "ibm819" ->
    Some Encoding_types.Windows_1252
  | "windows-1252" | "windows1252" | "cp1252" | "x-cp1252" ->
    Some Encoding_types.Windows_1252
  | "iso-8859-2" | "iso8859-2" | "latin2" | "latin-2" ->
    Some Encoding_types.Iso_8859_2
  | "euc-jp" | "eucjp" -> Some Encoding_types.Euc_jp
  (* A bare "utf-16" does not name a byte order; little-endian is used. *)
  | "utf-16" | "utf16" -> Some Encoding_types.Utf16le
  | "utf-16le" | "utf16le" -> Some Encoding_types.Utf16le
  | "utf-16be" | "utf16be" -> Some Encoding_types.Utf16be
  | _ -> None

(** [normalize_meta_declared label] resolves a label taken from a
    [<meta>] charset declaration.

    It behaves like {!normalize_label}, except that a label resolving to
    either UTF-16 variant is reported as UTF-8. Every other outcome,
    including [None], is returned unchanged. *)
let normalize_meta_declared label =
  match normalize_label label with
  | Some (Encoding_types.Utf16le | Encoding_types.Utf16be) ->
    (* Per HTML meta charset handling, a UTF-16 declaration is treated as
       UTF-8. *)
    Some Encoding_types.Utf8
  | resolved -> resolved