(* OCaml HTML5 parser/serialiser based on Python's JustHTML *)
1open Bytesrw
2
3(* Encoding detection example *)
4
(* Example driver. First runs the high-level parser on two byte inputs (one
   starting with a UTF-8 byte-order mark, one carrying a <meta charset>
   declaration) and prints the encoding attached to each parse result along
   with its extracted text. Then calls the lower-level BOM-sniffing and
   meta-prescan functions directly and prints what they return. *)
let () =
  (* Prints the encoding reported for a parse result, if there is one. *)
  let report_encoding = function
    | None -> print_string "No encoding detected\n"
    | Some enc ->
        Printf.printf "Detected encoding: %s\n"
          (Html5rw.Encoding.encoding_to_string enc)
  in
  (* Parses [input], reports its encoding, then prints its text content. *)
  let parse_and_report input =
    let parsed = Html5rw.parse_bytes input in
    report_encoding (Html5rw.encoding parsed);
    Printf.printf "Text: %s\n\n" (Html5rw.to_text parsed)
  in

  print_string "=== Encoding Detection ===\n\n";

  (* Input that begins with the three-byte UTF-8 BOM. *)
  parse_and_report
    (Bytes.of_string "\xEF\xBB\xBF<html><body>UTF-8 with BOM</body></html>");

  (* Input that declares its encoding through a meta charset attribute. *)
  parse_and_report
    (Bytes.of_string {|
 <html>
 <head><meta charset="utf-8"></head>
 <body>Encoding from meta tag</body>
 </html>
 |});

  (* The same checks, driven through the low-level encoding functions. *)
  print_string "=== Low-level Encoding API ===\n\n";

  (* sniff_bom yields an encoding paired with an integer, which is printed
     here as the number of bytes to skip. *)
  let bom_sample = Bytes.of_string "\xEF\xBB\xBFHello" in
  begin
    match Html5rw.Encoding.sniff_bom bom_sample with
    | None -> print_string "No BOM detected\n"
    | Some (enc, offset) ->
        Printf.printf "BOM sniffing result: %s (skip %d bytes)\n"
          (Html5rw.Encoding.encoding_to_string enc)
          offset
  end;

  (* Prescan a fragment whose charset is given via an http-equiv meta tag. *)
  let meta_fragment =
    Bytes.of_string
      {|<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">|}
  in
  match Html5rw.Encoding.prescan_for_meta_charset meta_fragment with
  | None -> print_string "No charset in prescan\n"
  | Some enc ->
      Printf.printf "Prescan found: %s\n"
        (Html5rw.Encoding.encoding_to_string enc)