OCaml HTML5 parser/serialiser based on Python's JustHTML
1open Bytesrw
2
(* Example: error handling and recovery when parsing malformed HTML. *)
4
(* Deliberately broken markup used as the parser input below. It contains:
   - a <title> that is never closed;
   - two <p> elements with no closing tags;
   - a <span>/<div> pair closed in the wrong order;
   - a <td> with no closing tag inside a table row;
   - a comment that is opened but never terminated. *)
let malformed_html = {html|
<html>
<head>
 <title>Unclosed title
 <meta charset="utf-8">
</head>
<body>
 <div>
 <p>Unclosed paragraph
 <p>Another paragraph (implicitly closes the previous one)
 <span><div>Misnested tags</span></div>
 </div>
 <table>
 <tr><td>Cell 1<td>Cell 2</td>
 </table>
 <!-- Unclosed comment
</body>
</html>
|html}
24
(* Entry point. Parses [malformed_html] with error collection switched on,
   lists every reported parse error, prints the serialised parse result, and
   finally counts the matches of two selectors against that result. *)
let () =
  print_string "=== Parsing Malformed HTML ===\n\n";

  (* Feed the string through a Bytesrw reader; [~collect_errors:true] asks
     the parser to record the errors it encounters. *)
  let reader = Bytes.Reader.of_string malformed_html in
  let doc = Html5rw.parse ~collect_errors:true reader in

  (* Report how many errors were collected, then one line per error giving
     its line, column and error code rendered as a string. *)
  let parse_errors = Html5rw.errors doc in
  Printf.printf "Parse errors: %d\n\n" (List.length parse_errors);
  let report e =
    let line = Html5rw.error_line e in
    let col = Html5rw.error_column e in
    let code = Html5rw.Parse_error_code.to_string (Html5rw.error_code e) in
    Printf.printf " Line %d, Col %d: %s\n" line col code
  in
  List.iter report parse_errors;

  (* Despite the errors a parse result is still available: serialise it
     pretty-printed with a 2-space indent. *)
  print_string "\n=== Recovered DOM Tree ===\n";
  Html5rw.to_string ~pretty:true ~indent_size:2 doc |> Printf.printf "%s\n";

  (* Run selector queries against the same parse result and print how many
     nodes each one matched. *)
  print_string "\n=== Query Results ===\n";
  let count selector = List.length (Html5rw.query doc selector) in
  Printf.printf "Found %d paragraphs\n" (count "p");
  Printf.printf "Found %d table cells\n" (count "td")