lib/html5rw/encoding/encoding_decode.ml at e51bb723f481ed8ef0938e71bdb6e3a2370cbc1f · anil.recoil.org/ocaml-html5rw

OCaml HTML5 parser/serialiser based on Python's JustHTML
ocaml-html5rw / lib / html5rw / encoding / encoding_decode.ml
at e51bb723f481ed8ef0938e71bdb6e3a2370cbc1f 5.2 kB view raw
  1(* HTML5 encoding detection and decoding *)
  2
  3(* UTF-8 replacement character *)
  4let replacement_char = Uchar.of_int 0xFFFD
  5
  6let decode_utf16 data ~is_le ~bom_len =
  7  let len = Bytes.length data in
  8  let buf = Buffer.create len in
  9  let i = ref bom_len in
 10
 11  while !i + 1 < len do
 12    let b0 = Char.code (Bytes.get data !i) in
 13    let b1 = Char.code (Bytes.get data (!i + 1)) in
 14    let code_unit =
 15      if is_le then b0 lor (b1 lsl 8)
 16      else (b0 lsl 8) lor b1
 17    in
 18    i := !i + 2;
 19
 20    (* Handle surrogate pairs *)
 21    if code_unit >= 0xD800 && code_unit <= 0xDBFF && !i + 1 < len then begin
 22      (* High surrogate, look for low surrogate *)
 23      let b2 = Char.code (Bytes.get data !i) in
 24      let b3 = Char.code (Bytes.get data (!i + 1)) in
 25      let code_unit2 =
 26        if is_le then b2 lor (b3 lsl 8)
 27        else (b2 lsl 8) lor b3
 28      in
 29      if code_unit2 >= 0xDC00 && code_unit2 <= 0xDFFF then begin
 30        i := !i + 2;
 31        let high = code_unit - 0xD800 in
 32        let low = code_unit2 - 0xDC00 in
 33        let cp = 0x10000 + (high lsl 10) lor low in
 34        Uutf.Buffer.add_utf_8 buf (Uchar.of_int cp)
 35      end else begin
 36        (* Invalid surrogate, output replacement *)
 37        Uutf.Buffer.add_utf_8 buf replacement_char
 38      end
 39    end else if code_unit >= 0xD800 && code_unit <= 0xDFFF then begin
 40      (* Lone surrogate *)
 41      Uutf.Buffer.add_utf_8 buf replacement_char
 42    end else begin
 43      Uutf.Buffer.add_utf_8 buf (Uchar.of_int code_unit)
 44    end
 45  done;
 46
 47  (* Odd trailing byte *)
 48  if !i < len then Uutf.Buffer.add_utf_8 buf replacement_char;
 49
 50  Buffer.contents buf
 51
 52let decode_with_encoding data enc ~bom_len =
 53  match enc with
 54  | Encoding_types.Utf8 ->
 55    (* UTF-8: Just validate and replace errors with replacement character *)
 56    let len = Bytes.length data in
 57    let buf = Buffer.create len in
 58    let decoder = Uutf.decoder ~encoding:`UTF_8 (`String (Bytes.to_string data)) in
 59    (* Skip BOM if present *)
 60    let _ =
 61      if bom_len > 0 then begin
 62        for _ = 1 to bom_len do
 63          ignore (Uutf.decode decoder)
 64        done
 65      end
 66    in
 67    let rec loop () =
 68      match Uutf.decode decoder with
 69      | `Uchar u -> Uutf.Buffer.add_utf_8 buf u; loop ()
 70      | `Malformed _ -> Buffer.add_string buf "\xEF\xBF\xBD"; loop ()
 71      | `End -> ()
 72      | `Await -> assert false
 73    in
 74    loop ();
 75    Buffer.contents buf
 76
 77  | Encoding_types.Utf16le -> decode_utf16 data ~is_le:true ~bom_len
 78  | Encoding_types.Utf16be -> decode_utf16 data ~is_le:false ~bom_len
 79
 80  | Encoding_types.Windows_1252 ->
 81    (* Windows-1252 mapping table for 0x80-0x9F range *)
 82    let len = Bytes.length data in
 83    let buf = Buffer.create len in
 84    let table = [|
 85      (* 0x80-0x9F *)
 86      0x20AC; 0x0081; 0x201A; 0x0192; 0x201E; 0x2026; 0x2020; 0x2021;
 87      0x02C6; 0x2030; 0x0160; 0x2039; 0x0152; 0x008D; 0x017D; 0x008F;
 88      0x0090; 0x2018; 0x2019; 0x201C; 0x201D; 0x2022; 0x2013; 0x2014;
 89      0x02DC; 0x2122; 0x0161; 0x203A; 0x0153; 0x009D; 0x017E; 0x0178;
 90    |] in
 91    for i = bom_len to len - 1 do
 92      let b = Char.code (Bytes.get data i) in
 93      let cp =
 94        if b >= 0x80 && b <= 0x9F then table.(b - 0x80)
 95        else b
 96      in
 97      Uutf.Buffer.add_utf_8 buf (Uchar.of_int cp)
 98    done;
 99    Buffer.contents buf
100
101  | Encoding_types.Iso_8859_2 ->
102    (* Use uuuu for ISO-8859-2 decoding *)
103    let len = Bytes.length data in
104    let buf = Buffer.create len in
105    let s = Bytes.sub_string data bom_len (len - bom_len) in
106    Uuuu.String.fold `ISO_8859_2 (fun () _pos -> function
107      | `Uchar u -> Uutf.Buffer.add_utf_8 buf u
108      | `Malformed _ -> Uutf.Buffer.add_utf_8 buf replacement_char
109    ) () s;
110    Buffer.contents buf
111
112  | Encoding_types.Euc_jp ->
113    (* For EUC-JP, use uutf with best effort *)
114    let len = Bytes.length data in
115    let buf = Buffer.create len in
116    let s = Bytes.sub_string data bom_len (len - bom_len) in
117    (* EUC-JP not directly supported by uutf, fall back to treating high bytes as replacement *)
118    (* This is a simplification - full EUC-JP would need a separate decoder *)
119    String.iter (fun c ->
120      if Char.code c <= 0x7F then
121        Buffer.add_char buf c
122      else
123        Buffer.add_string buf "\xEF\xBF\xBD"
124    ) s;
125    Buffer.contents buf
126
127let decode data ?transport_encoding () =
128  (* Step 1: Check for BOM *)
129  let bom_result = Encoding_bom.sniff data in
130  match bom_result with
131  | Some (enc, bom_len) ->
132    (decode_with_encoding data enc ~bom_len, enc)
133  | None ->
134    (* Step 2: Check transport encoding (e.g., HTTP Content-Type) *)
135    let enc_from_transport =
136      match transport_encoding with
137      | Some te -> Encoding_labels.normalize_label te
138      | None -> None
139    in
140    match enc_from_transport with
141    | Some enc -> (decode_with_encoding data enc ~bom_len:0, enc)
142    | None ->
143      (* Step 3: Prescan for meta charset *)
144      match Encoding_prescan.prescan_for_meta_charset data with
145      | Some enc -> (decode_with_encoding data enc ~bom_len:0, enc)
146      | None ->
147        (* Default to Windows-1252 per HTML5 spec when no encoding detected *)
148        (decode_with_encoding data Encoding_types.Windows_1252 ~bom_len:0, Encoding_types.Windows_1252)