OCaml HTML5 parser/serialiser based on Python's JustHTML
1(* HTML5 encoding detection and decoding *)
2
3(* UTF-8 replacement character *)
4let replacement_char = Uchar.of_int 0xFFFD
5
6let decode_utf16 data ~is_le ~bom_len =
7 let len = Bytes.length data in
8 let buf = Buffer.create len in
9 let i = ref bom_len in
10
11 while !i + 1 < len do
12 let b0 = Char.code (Bytes.get data !i) in
13 let b1 = Char.code (Bytes.get data (!i + 1)) in
14 let code_unit =
15 if is_le then b0 lor (b1 lsl 8)
16 else (b0 lsl 8) lor b1
17 in
18 i := !i + 2;
19
20 (* Handle surrogate pairs *)
21 if code_unit >= 0xD800 && code_unit <= 0xDBFF && !i + 1 < len then begin
22 (* High surrogate, look for low surrogate *)
23 let b2 = Char.code (Bytes.get data !i) in
24 let b3 = Char.code (Bytes.get data (!i + 1)) in
25 let code_unit2 =
26 if is_le then b2 lor (b3 lsl 8)
27 else (b2 lsl 8) lor b3
28 in
29 if code_unit2 >= 0xDC00 && code_unit2 <= 0xDFFF then begin
30 i := !i + 2;
31 let high = code_unit - 0xD800 in
32 let low = code_unit2 - 0xDC00 in
33 let cp = 0x10000 + (high lsl 10) lor low in
34 Uutf.Buffer.add_utf_8 buf (Uchar.of_int cp)
35 end else begin
36 (* Invalid surrogate, output replacement *)
37 Uutf.Buffer.add_utf_8 buf replacement_char
38 end
39 end else if code_unit >= 0xD800 && code_unit <= 0xDFFF then begin
40 (* Lone surrogate *)
41 Uutf.Buffer.add_utf_8 buf replacement_char
42 end else begin
43 Uutf.Buffer.add_utf_8 buf (Uchar.of_int code_unit)
44 end
45 done;
46
47 (* Odd trailing byte *)
48 if !i < len then Uutf.Buffer.add_utf_8 buf replacement_char;
49
50 Buffer.contents buf
51
52let decode_with_encoding data enc ~bom_len =
53 match enc with
54 | Encoding_types.Utf8 ->
55 (* UTF-8: Just validate and replace errors with replacement character *)
56 let len = Bytes.length data in
57 let buf = Buffer.create len in
58 let decoder = Uutf.decoder ~encoding:`UTF_8 (`String (Bytes.to_string data)) in
59 (* Skip BOM if present *)
60 let _ =
61 if bom_len > 0 then begin
62 for _ = 1 to bom_len do
63 ignore (Uutf.decode decoder)
64 done
65 end
66 in
67 let rec loop () =
68 match Uutf.decode decoder with
69 | `Uchar u -> Uutf.Buffer.add_utf_8 buf u; loop ()
70 | `Malformed _ -> Buffer.add_string buf "\xEF\xBF\xBD"; loop ()
71 | `End -> ()
72 | `Await -> assert false
73 in
74 loop ();
75 Buffer.contents buf
76
77 | Encoding_types.Utf16le -> decode_utf16 data ~is_le:true ~bom_len
78 | Encoding_types.Utf16be -> decode_utf16 data ~is_le:false ~bom_len
79
80 | Encoding_types.Windows_1252 ->
81 (* Windows-1252 mapping table for 0x80-0x9F range *)
82 let len = Bytes.length data in
83 let buf = Buffer.create len in
84 let table = [|
85 (* 0x80-0x9F *)
86 0x20AC; 0x0081; 0x201A; 0x0192; 0x201E; 0x2026; 0x2020; 0x2021;
87 0x02C6; 0x2030; 0x0160; 0x2039; 0x0152; 0x008D; 0x017D; 0x008F;
88 0x0090; 0x2018; 0x2019; 0x201C; 0x201D; 0x2022; 0x2013; 0x2014;
89 0x02DC; 0x2122; 0x0161; 0x203A; 0x0153; 0x009D; 0x017E; 0x0178;
90 |] in
91 for i = bom_len to len - 1 do
92 let b = Char.code (Bytes.get data i) in
93 let cp =
94 if b >= 0x80 && b <= 0x9F then table.(b - 0x80)
95 else b
96 in
97 Uutf.Buffer.add_utf_8 buf (Uchar.of_int cp)
98 done;
99 Buffer.contents buf
100
101 | Encoding_types.Iso_8859_2 ->
102 (* Use uuuu for ISO-8859-2 decoding *)
103 let len = Bytes.length data in
104 let buf = Buffer.create len in
105 let s = Bytes.sub_string data bom_len (len - bom_len) in
106 Uuuu.String.fold `ISO_8859_2 (fun () _pos -> function
107 | `Uchar u -> Uutf.Buffer.add_utf_8 buf u
108 | `Malformed _ -> Uutf.Buffer.add_utf_8 buf replacement_char
109 ) () s;
110 Buffer.contents buf
111
112 | Encoding_types.Euc_jp ->
113 (* For EUC-JP, use uutf with best effort *)
114 let len = Bytes.length data in
115 let buf = Buffer.create len in
116 let s = Bytes.sub_string data bom_len (len - bom_len) in
117 (* EUC-JP not directly supported by uutf, fall back to treating high bytes as replacement *)
118 (* This is a simplification - full EUC-JP would need a separate decoder *)
119 String.iter (fun c ->
120 if Char.code c <= 0x7F then
121 Buffer.add_char buf c
122 else
123 Buffer.add_string buf "\xEF\xBF\xBD"
124 ) s;
125 Buffer.contents buf
126
127let decode data ?transport_encoding () =
128 (* Step 1: Check for BOM *)
129 let bom_result = Encoding_bom.sniff data in
130 match bom_result with
131 | Some (enc, bom_len) ->
132 (decode_with_encoding data enc ~bom_len, enc)
133 | None ->
134 (* Step 2: Check transport encoding (e.g., HTTP Content-Type) *)
135 let enc_from_transport =
136 match transport_encoding with
137 | Some te -> Encoding_labels.normalize_label te
138 | None -> None
139 in
140 match enc_from_transport with
141 | Some enc -> (decode_with_encoding data enc ~bom_len:0, enc)
142 | None ->
143 (* Step 3: Prescan for meta charset *)
144 match Encoding_prescan.prescan_for_meta_charset data with
145 | Some enc -> (decode_with_encoding data enc ~bom_len:0, enc)
146 | None ->
147 (* Default to Windows-1252 per HTML5 spec when no encoding detected *)
148 (decode_with_encoding data Encoding_types.Windows_1252 ~bom_len:0, Encoding_types.Windows_1252)