OCaml HTML5 parser/serialiser based on Python's JustHTML
1(*---------------------------------------------------------------------------
2 Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
3 SPDX-License-Identifier: MIT
4 ---------------------------------------------------------------------------*)
5
(** [of_parse_error ?system_id err] turns the parse error [err] into a
    message built with [Message.error].

    - The location is made from the error's line and column, together with
      [system_id] when it is supplied.
    - The message code is the string form of the error's parse error code.
    - The message text is a dedicated explanation for a trailing solidus on a
      non-void HTML element, and ["Parse error: <code>"] for every other
      code. *)
let of_parse_error ?system_id err =
  let module Code = Html5rw.Parse_error_code in
  let kind = Html5rw.error_code err in
  let row = Html5rw.error_line err in
  let col = Html5rw.error_column err in
  let location = Message.make_location ~line:row ~column:col ?system_id () in
  let code = Code.to_string kind in
  let message =
    match kind with
    | Code.Non_void_html_element_start_tag_with_trailing_solidus ->
      "Self-closing syntax (\"/>\") used on a non-void HTML element. Ignoring the slash and treating as a start tag."
    | _ ->
      (* Generic fallback: expose the raw code string to the reader. *)
      "Parse error: " ^ code
  in
  Message.error ~message ~code ~location ()
24
(** [collect_parse_errors ?system_id result] converts the parse errors
    recorded in [result] into messages, applying [of_parse_error] with the
    same [system_id] to each one.

    The input is treated as XHTML when [system_id] is strictly longer than
    the suffix [".xhtml"] and ends with it (case-sensitive). In that case two
    kinds of error are dropped before conversion, because XHTML requires no
    DOCTYPE and allows self-closing syntax on every element. *)
let collect_parse_errors ?system_id result =
  let module Code = Html5rw.Parse_error_code in
  let all_errors = Html5rw.errors result in
  let suffix = ".xhtml" in
  let suffix_len = String.length suffix in
  let is_xhtml =
    match system_id with
    | None -> false
    | Some id ->
      let id_len = String.length id in
      (* Strictly longer: a system id that is exactly ".xhtml" does not
         count as XHTML. *)
      id_len > suffix_len
      && String.sub id (id_len - suffix_len) suffix_len = suffix
  in
  (* Predicate that rejects the errors that are not errors in XHTML. *)
  let relevant_for_xhtml err =
    match Html5rw.error_code err with
    | Code.Tree_construction_error "expected-doctype-but-got-other"
    | Code.Non_void_html_element_start_tag_with_trailing_solidus -> false
    | _ -> true
  in
  let kept =
    if is_xhtml then List.filter relevant_for_xhtml all_errors
    else all_errors
  in
  List.map (of_parse_error ?system_id) kept