OCaml HTML5 parser/serialiser based on Python's JustHTML
at main 1.4 kB view raw
open Bytesrw

(* Example: feeding deliberately broken markup to the parser, listing the
   parse errors it reports, and then inspecting the tree it produced. *)

(* Sample input containing several kinds of malformed markup: unclosed
   title and paragraph elements, misnested span/div tags, a table cell
   without an end tag, and a comment that is never terminated. *)
let malformed_html = {|
<html>
<head>
  <title>Unclosed title
  <meta charset="utf-8">
</head>
<body>
  <div>
    <p>Unclosed paragraph
    <p>Another paragraph (implicitly closes the previous one)
    <span><div>Misnested tags</span></div>
  </div>
  <table>
    <tr><td>Cell 1<td>Cell 2</td>
  </table>
  <!-- Unclosed comment
</body>
</html>
|}

(* Entry point: parse the sample, print every collected error, print the
   serialised tree, then print how many elements two selectors match. *)
let () =
  print_string "=== Parsing Malformed HTML ===\n\n";

  (* Parse from an in-memory reader, asking the parser to collect errors. *)
  let reader = Bytes.Reader.of_string malformed_html in
  let doc = Html5rw.parse ~collect_errors:true reader in

  (* Report the number of collected errors, then one line per error with
     its position and error code. *)
  let parse_errors = Html5rw.errors doc in
  Printf.printf "Parse errors: %d\n\n" (List.length parse_errors);
  let print_error e =
    let line = Html5rw.error_line e in
    let column = Html5rw.error_column e in
    let code = Html5rw.Parse_error_code.to_string (Html5rw.error_code e) in
    Printf.printf " Line %d, Col %d: %s\n" line column code
  in
  List.iter print_error parse_errors;

  (* Serialise the parse result with pretty-printing, two spaces per
     indent level, and print it. *)
  print_string "\n=== Recovered DOM Tree ===\n";
  let serialised = Html5rw.to_string ~pretty:true ~indent_size:2 doc in
  Printf.printf "%s\n" serialised;

  (* Count the nodes matched by a selector in the parse result. *)
  let count selector = List.length (Html5rw.query doc selector) in
  print_string "\n=== Query Results ===\n";
  Printf.printf "Found %d paragraphs\n" (count "p");
  Printf.printf "Found %d table cells\n" (count "td")