OCaml HTML5 parser/serialiser based on Python's JustHTML
(*---------------------------------------------------------------------------
   Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
   SPDX-License-Identifier: MIT
  ---------------------------------------------------------------------------*)

(** [of_parse_error ?system_id err] builds an error-level message from the
    parse error [err] by calling [Message.error].

    The message location is built with [Message.make_location] from the line
    and column reported for [err], together with the optional [system_id].
    The message code is the string form of the parse error code. The
    human-readable text is a dedicated sentence for a trailing solidus on a
    non-void element, and ["Parse error: <code>"] for every other code. *)
let of_parse_error ?system_id err =
  let module Code = Html5rw.Parse_error_code in
  let code = Html5rw.error_code err in
  let line = Html5rw.error_line err in
  let column = Html5rw.error_column err in
  let location = Message.make_location ~line ~column ?system_id () in
  let code_str = Code.to_string code in
  let message =
    match code with
    | Code.Non_void_html_element_start_tag_with_trailing_solidus ->
      "Self-closing syntax (\"/>\") used on a non-void HTML element. Ignoring the slash and treating as a start tag."
    | _ -> Printf.sprintf "Parse error: %s" code_str
  in
  Message.error ~message ~code:code_str ~location ()

(** [collect_parse_errors ?system_id result] converts the parse errors that
    [Html5rw.errors] reports for [result] into messages, one per retained
    error, using {!of_parse_error}.

    When [system_id] is longer than the [".xhtml"] suffix and ends with it
    (case-sensitive comparison), two kinds of error are dropped before
    conversion: the ["expected-doctype-but-got-other"] tree construction
    error and the trailing-solidus-on-non-void-element error. In every other
    case, including an absent [system_id], all errors are kept. *)
let collect_parse_errors ?system_id result =
  let module Code = Html5rw.Parse_error_code in
  let errors = Html5rw.errors result in
  let xhtml_suffix = ".xhtml" in
  let suffix_len = String.length xhtml_suffix in
  let is_xhtml =
    match system_id with
    | None -> false
    | Some id ->
      let id_len = String.length id in
      (* Strictly longer than the suffix: an id that is exactly ".xhtml"
         is not treated as XHTML. *)
      id_len > suffix_len
      && String.equal (String.sub id (id_len - suffix_len) suffix_len)
           xhtml_suffix
  in
  (* XHTML differs from HTML here: a DOCTYPE is not required, and
     self-closing syntax is valid on every element, so neither condition
     is reported for XHTML input. *)
  let reported_for_xhtml err =
    match Html5rw.error_code err with
    | Code.Tree_construction_error "expected-doctype-but-got-other"
    | Code.Non_void_html_element_start_tag_with_trailing_solidus -> false
    | _ -> true
  in
  let retained =
    if is_xhtml then List.filter reported_for_xhtml errors else errors
  in
  List.map (fun err -> of_parse_error ?system_id err) retained