OCaml HTML5 parser/serialiser based on Python's JustHTML
at main 1.7 kB view raw
open Bytesrw

(* Encoding detection example *)

let () =
  (* Report the encoding recorded on a parse result (if any), followed by
     the text extracted from that result. Shared by both parsing demos. *)
  let show_result doc =
    (match Html5rw.encoding doc with
     | None -> print_string "No encoding detected\n"
     | Some enc ->
       Printf.printf "Detected encoding: %s\n"
         (Html5rw.Encoding.encoding_to_string enc));
    Printf.printf "Text: %s\n\n" (Html5rw.to_text doc)
  in

  print_string "=== Encoding Detection ===\n\n";

  (* Parse UTF-8 bytes with BOM: the input starts with EF BB BF. *)
  let with_bom =
    Bytes.of_string "\xEF\xBB\xBF<html><body>UTF-8 with BOM</body></html>"
  in
  show_result (Html5rw.parse_bytes with_bom);

  (* Parse with meta charset: no BOM, the charset is declared in <head>. *)
  let with_meta = Bytes.of_string {|
    <html>
    <head><meta charset="utf-8"></head>
    <body>Encoding from meta tag</body>
    </html>
  |} in
  show_result (Html5rw.parse_bytes with_meta);

  (* Using low-level encoding functions directly, without a full parse. *)
  print_string "=== Low-level Encoding API ===\n\n";

  (* BOM sniffing yields the encoding together with an integer offset,
     which is printed here as the number of bytes to skip. *)
  let bom_sample = Bytes.of_string "\xEF\xBB\xBFHello" in
  (match Html5rw.Encoding.sniff_bom bom_sample with
   | None -> print_string "No BOM detected\n"
   | Some (enc, offset) ->
     Printf.printf "BOM sniffing result: %s (skip %d bytes)\n"
       (Html5rw.Encoding.encoding_to_string enc) offset);

  (* Prescan a fragment whose charset is given through an http-equiv
     Content-Type meta element. *)
  let meta_sample =
    Bytes.of_string
      {|<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">|}
  in
  match Html5rw.Encoding.prescan_for_meta_charset meta_sample with
  | None -> print_string "No charset in prescan\n"
  | Some enc ->
    Printf.printf "Prescan found: %s\n"
      (Html5rw.Encoding.encoding_to_string enc)