OCaml HTML5 parser/serialiser based on Python's JustHTML
1(* HTML5 encoding detection and decoding *) 2 3(* UTF-8 replacement character *) 4let replacement_char = Uchar.of_int 0xFFFD 5 6let decode_utf16 data ~is_le ~bom_len = 7 let len = Bytes.length data in 8 let buf = Buffer.create len in 9 let i = ref bom_len in 10 11 while !i + 1 < len do 12 let b0 = Char.code (Bytes.get data !i) in 13 let b1 = Char.code (Bytes.get data (!i + 1)) in 14 let code_unit = 15 if is_le then b0 lor (b1 lsl 8) 16 else (b0 lsl 8) lor b1 17 in 18 i := !i + 2; 19 20 (* Handle surrogate pairs *) 21 if code_unit >= 0xD800 && code_unit <= 0xDBFF && !i + 1 < len then begin 22 (* High surrogate, look for low surrogate *) 23 let b2 = Char.code (Bytes.get data !i) in 24 let b3 = Char.code (Bytes.get data (!i + 1)) in 25 let code_unit2 = 26 if is_le then b2 lor (b3 lsl 8) 27 else (b2 lsl 8) lor b3 28 in 29 if code_unit2 >= 0xDC00 && code_unit2 <= 0xDFFF then begin 30 i := !i + 2; 31 let high = code_unit - 0xD800 in 32 let low = code_unit2 - 0xDC00 in 33 let cp = 0x10000 + (high lsl 10) lor low in 34 Uutf.Buffer.add_utf_8 buf (Uchar.of_int cp) 35 end else begin 36 (* Invalid surrogate, output replacement *) 37 Uutf.Buffer.add_utf_8 buf replacement_char 38 end 39 end else if code_unit >= 0xD800 && code_unit <= 0xDFFF then begin 40 (* Lone surrogate *) 41 Uutf.Buffer.add_utf_8 buf replacement_char 42 end else begin 43 Uutf.Buffer.add_utf_8 buf (Uchar.of_int code_unit) 44 end 45 done; 46 47 (* Odd trailing byte *) 48 if !i < len then Uutf.Buffer.add_utf_8 buf replacement_char; 49 50 Buffer.contents buf 51 52let decode_with_encoding data enc ~bom_len = 53 match enc with 54 | Encoding_types.Utf8 -> 55 (* UTF-8: Just validate and replace errors with replacement character *) 56 let len = Bytes.length data in 57 let buf = Buffer.create len in 58 let decoder = Uutf.decoder ~encoding:`UTF_8 (`String (Bytes.to_string data)) in 59 (* Skip BOM if present *) 60 let _ = 61 if bom_len > 0 then begin 62 for _ = 1 to bom_len do 63 ignore (Uutf.decode decoder) 64 done 65 end 66 in 67 let rec loop () = 68 match Uutf.decode decoder with 69 | `Uchar u -> Uutf.Buffer.add_utf_8 buf u; loop () 70 | `Malformed _ -> Buffer.add_string buf "\xEF\xBF\xBD"; loop () 71 | `End -> () 72 | `Await -> assert false 73 in 74 loop (); 75 Buffer.contents buf 76 77 | Encoding_types.Utf16le -> decode_utf16 data ~is_le:true ~bom_len 78 | Encoding_types.Utf16be -> decode_utf16 data ~is_le:false ~bom_len 79 80 | Encoding_types.Windows_1252 -> 81 (* Windows-1252 mapping table for 0x80-0x9F range *) 82 let len = Bytes.length data in 83 let buf = Buffer.create len in 84 let table = [| 85 (* 0x80-0x9F *) 86 0x20AC; 0x0081; 0x201A; 0x0192; 0x201E; 0x2026; 0x2020; 0x2021; 87 0x02C6; 0x2030; 0x0160; 0x2039; 0x0152; 0x008D; 0x017D; 0x008F; 88 0x0090; 0x2018; 0x2019; 0x201C; 0x201D; 0x2022; 0x2013; 0x2014; 89 0x02DC; 0x2122; 0x0161; 0x203A; 0x0153; 0x009D; 0x017E; 0x0178; 90 |] in 91 for i = bom_len to len - 1 do 92 let b = Char.code (Bytes.get data i) in 93 let cp = 94 if b >= 0x80 && b <= 0x9F then table.(b - 0x80) 95 else b 96 in 97 Uutf.Buffer.add_utf_8 buf (Uchar.of_int cp) 98 done; 99 Buffer.contents buf 100 101 | Encoding_types.Iso_8859_2 -> 102 (* Use uuuu for ISO-8859-2 decoding *) 103 let len = Bytes.length data in 104 let buf = Buffer.create len in 105 let s = Bytes.sub_string data bom_len (len - bom_len) in 106 Uuuu.String.fold `ISO_8859_2 (fun () _pos -> function 107 | `Uchar u -> Uutf.Buffer.add_utf_8 buf u 108 | `Malformed _ -> Uutf.Buffer.add_utf_8 buf replacement_char 109 ) () s; 110 Buffer.contents buf 111 112 | Encoding_types.Euc_jp -> 113 (* For EUC-JP, use uutf with best effort *) 114 let len = Bytes.length data in 115 let buf = Buffer.create len in 116 let s = Bytes.sub_string data bom_len (len - bom_len) in 117 (* EUC-JP not directly supported by uutf, fall back to treating high bytes as replacement *) 118 (* This is a simplification - full EUC-JP would need a separate decoder *) 119 String.iter (fun c -> 120 if Char.code c <= 0x7F then 121 Buffer.add_char buf c 122 else 123 Buffer.add_string buf "\xEF\xBF\xBD" 124 ) s; 125 Buffer.contents buf 126 127let decode data ?transport_encoding () = 128 (* Step 1: Check for BOM *) 129 let bom_result = Encoding_bom.sniff data in 130 match bom_result with 131 | Some (enc, bom_len) -> 132 (decode_with_encoding data enc ~bom_len, enc) 133 | None -> 134 (* Step 2: Check transport encoding (e.g., HTTP Content-Type) *) 135 let enc_from_transport = 136 match transport_encoding with 137 | Some te -> Encoding_labels.normalize_label te 138 | None -> None 139 in 140 match enc_from_transport with 141 | Some enc -> (decode_with_encoding data enc ~bom_len:0, enc) 142 | None -> 143 (* Step 3: Prescan for meta charset *) 144 match Encoding_prescan.prescan_for_meta_charset data with 145 | Some enc -> (decode_with_encoding data enc ~bom_len:0, enc) 146 | None -> 147 (* Default to Windows-1252 per HTML5 spec when no encoding detected *) 148 (decode_with_encoding data Encoding_types.Windows_1252 ~bom_len:0, Encoding_types.Windows_1252)