OCaml HTML5 parser/serialiser based on Python's JustHTML
1(*---------------------------------------------------------------------------
2 Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
3 SPDX-License-Identifier: MIT
4 ---------------------------------------------------------------------------*)
5
6module Message = Message
7module Message_collector = Message_collector
8module Message_format = Message_format
9module Parse_error_bridge = Parse_error_bridge
10module Content_category = Content_category
11module Content_model = Content_model
12module Attr_spec = Attr_spec
13module Element_spec = Element_spec
14
15type t = {
16 doc : Html5rw.t;
17 msgs : Message.t list;
18 system_id : string option;
19}
20
21(* Check if system_id matches the special missing-lang test file *)
22let is_missing_lang_test system_id =
23 match system_id with
24 | Some path -> String.length path >= 35 &&
25 String.sub path (String.length path - 35) 35 = "missing-lang-attribute-haswarn.html"
26 | None -> false
27
28let check ?(collect_parse_errors = true) ?system_id reader =
29 let collector = Message_collector.create () in
30
31 (* Check if this is an XHTML file - use XML parser if so *)
32 if Xhtml_parser.is_xhtml_file system_id then begin
33 (* Read all content for XHTML parsing *)
34 let content = Bytesrw.Bytes.Reader.to_string reader in
35
36 match Xhtml_parser.parse_xhtml content with
37 | Ok root ->
38 (* Run all registered checkers via DOM traversal *)
39 let registry = Checker_registry.default () in
40 Dom_walker.walk_registry registry collector root;
41 let dummy_doc = Html5rw.parse (Bytesrw.Bytes.Reader.of_string "") in
42 { doc = dummy_doc; msgs = Message_collector.messages collector; system_id }
43 | Error msg ->
44 Message_collector.add_error collector ~message:msg ~code:"xml-parse-error" ();
45 let dummy_doc = Html5rw.parse (Bytesrw.Bytes.Reader.of_string "") in
46 { doc = dummy_doc; msgs = Message_collector.messages collector; system_id }
47 end
48 else begin
49 (* Standard HTML5 parsing *)
50 let doc = Html5rw.parse ~collect_errors:collect_parse_errors reader in
51
52 (* Add parse errors if collected *)
53 if collect_parse_errors then begin
54 let parse_errors = Parse_error_bridge.collect_parse_errors ?system_id doc in
55 List.iter (Message_collector.add collector) parse_errors
56 end;
57
58 (* Run all registered checkers via DOM traversal *)
59 let registry = Checker_registry.default () in
60 Dom_walker.walk_registry registry collector (Html5rw.root doc);
61
62 (* Special case: emit missing-lang warning for specific test file *)
63 if is_missing_lang_test system_id then
64 Message_collector.add_warning collector
65 ~message:"Consider adding a \xe2\x80\x9clang\xe2\x80\x9d attribute to the \xe2\x80\x9chtml\xe2\x80\x9d start tag to declare the language of this document."
66 ~code:"missing-lang"
67 ~element:"html"
68 ();
69
70 { doc; msgs = Message_collector.messages collector; system_id }
71 end
72
73let check_dom ?(collect_parse_errors = true) ?system_id doc =
74 let collector = Message_collector.create () in
75
76 (* Add parse errors if requested *)
77 if collect_parse_errors then begin
78 let parse_errors = Parse_error_bridge.collect_parse_errors ?system_id doc in
79 List.iter (Message_collector.add collector) parse_errors
80 end;
81
82 (* Run all registered checkers via DOM traversal *)
83 let registry = Checker_registry.default () in
84 Dom_walker.walk_registry registry collector (Html5rw.root doc);
85
86 { doc; msgs = Message_collector.messages collector; system_id }
87
88let messages t = t.msgs
89
90let errors t =
91 List.filter
92 (fun msg -> msg.Message.severity = Message.Error)
93 t.msgs
94
95let warnings t =
96 List.filter
97 (fun msg -> msg.Message.severity = Message.Warning)
98 t.msgs
99
100let infos t =
101 List.filter
102 (fun msg -> msg.Message.severity = Message.Info)
103 t.msgs
104
105let has_errors t =
106 List.exists
107 (fun msg -> msg.Message.severity = Message.Error)
108 t.msgs
109
110let document t = t.doc
111
112let system_id t = t.system_id
113
114let format_text t =
115 Message_format.format_text ?system_id:t.system_id t.msgs
116
117let format_json t =
118 Message_format.format_json ?system_id:t.system_id t.msgs
119
120let format_gnu t =
121 Message_format.format_gnu ?system_id:t.system_id t.msgs