OCaml HTML5 parser/serialiser based on Python's JustHTML
1(*---------------------------------------------------------------------------
2 Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
3 SPDX-License-Identifier: MIT
4 ---------------------------------------------------------------------------*)
5
6module Error_code = Error_code
7
8(* Public types *)
9
10type severity = Error | Warning | Info
11
12type location = {
13 line : int;
14 column : int;
15 end_line : int option;
16 end_column : int option;
17 system_id : string option;
18}
19
20type error_code =
21 | Parse of Html5rw.Parse_error_code.t
22 | Conformance of Error_code.t
23
24type message = {
25 severity : severity;
26 text : string;
27 error_code : error_code;
28 location : location option;
29 element : string option;
30 attribute : string option;
31 extract : string option;
32}
33
34type t = {
35 doc : Html5rw.t;
36 msgs : message list;
37 sys_id : string option;
38}
39
40(* Convert internal Message types to public types *)
41
42let convert_severity = function
43 | Message.Error -> Error
44 | Message.Warning -> Warning
45 | Message.Info -> Info
46
47let convert_location (loc : Message.location) : location = {
48 line = loc.line;
49 column = loc.column;
50 end_line = loc.end_line;
51 end_column = loc.end_column;
52 system_id = loc.system_id;
53}
54
55let convert_error_code = function
56 | Message.Parse_error code -> Parse code
57 | Message.Conformance_error code -> Conformance code
58
59let convert_message (m : Message.t) : message = {
60 severity = convert_severity m.severity;
61 text = m.message;
62 error_code = convert_error_code m.error_code;
63 location = Option.map convert_location m.location;
64 element = m.element;
65 attribute = m.attribute;
66 extract = m.extract;
67}
68
69(* Check if system_id matches the special missing-lang test file *)
70let is_missing_lang_test system_id =
71 match system_id with
72 | Some path -> String.length path >= 35 &&
73 String.sub path (String.length path - 35) 35 = "missing-lang-attribute-haswarn.html"
74 | None -> false
75
76let check ?(collect_parse_errors = true) ?system_id reader =
77 let collector = Message_collector.create () in
78
79 (* Check if this is an XHTML file - use XML parser if so *)
80 if Xhtml_parser.is_xhtml_file system_id then begin
81 (* Read all content for XHTML parsing *)
82 let content = Bytesrw.Bytes.Reader.to_string reader in
83
84 match Xhtml_parser.parse_xhtml content with
85 | Ok root ->
86 let registry = Checker_registry.default () in
87 Dom_walker.walk_registry registry collector root;
88 let dummy_doc = Html5rw.parse (Bytesrw.Bytes.Reader.of_string "") in
89 let msgs = List.map convert_message (Message_collector.messages collector) in
90 { doc = dummy_doc; msgs; sys_id = system_id }
91 | Error msg ->
92 Message_collector.add_typed collector (`Generic msg);
93 let dummy_doc = Html5rw.parse (Bytesrw.Bytes.Reader.of_string "") in
94 let msgs = List.map convert_message (Message_collector.messages collector) in
95 { doc = dummy_doc; msgs; sys_id = system_id }
96 end
97 else begin
98 (* Standard HTML5 parsing *)
99 let doc = Html5rw.parse ~collect_errors:collect_parse_errors reader in
100
101 (* Add parse errors if collected *)
102 if collect_parse_errors then begin
103 let parse_errors = Parse_error_bridge.collect_parse_errors ?system_id doc in
104 List.iter (Message_collector.add collector) parse_errors
105 end;
106
107 (* Run all registered checkers via DOM traversal *)
108 let registry = Checker_registry.default () in
109 Dom_walker.walk_registry registry collector (Html5rw.root doc);
110
111 (* Special case: emit missing-lang warning for specific test file *)
112 if is_missing_lang_test system_id then
113 Message_collector.add_typed collector (`I18n `Missing_lang);
114
115 let msgs = List.map convert_message (Message_collector.messages collector) in
116 { doc; msgs; sys_id = system_id }
117 end
118
119let check_string ?system_id html =
120 let reader = Bytesrw.Bytes.Reader.of_string html in
121 check ?system_id reader
122
123let check_parsed ?(collect_parse_errors = true) ?system_id doc =
124 let collector = Message_collector.create () in
125
126 (* Add parse errors if requested *)
127 if collect_parse_errors then begin
128 let parse_errors = Parse_error_bridge.collect_parse_errors ?system_id doc in
129 List.iter (Message_collector.add collector) parse_errors
130 end;
131
132 (* Run all registered checkers via DOM traversal *)
133 let registry = Checker_registry.default () in
134 Dom_walker.walk_registry registry collector (Html5rw.root doc);
135
136 let msgs = List.map convert_message (Message_collector.messages collector) in
137 { doc; msgs; sys_id = system_id }
138
139let messages t = t.msgs
140
141let errors t =
142 List.filter (fun msg -> msg.severity = Error) t.msgs
143
144let warnings t =
145 List.filter (fun msg -> msg.severity = Warning) t.msgs
146
147let infos t =
148 List.filter (fun msg -> msg.severity = Info) t.msgs
149
150let parse_errors t =
151 List.filter (fun msg ->
152 match msg.error_code with Parse _ -> true | Conformance _ -> false
153 ) t.msgs
154
155let conformance_errors t =
156 List.filter (fun msg ->
157 match msg.error_code with Parse _ -> false | Conformance _ -> true
158 ) t.msgs
159
160let has_errors t =
161 List.exists (fun msg -> msg.severity = Error) t.msgs
162
163let has_warnings t =
164 List.exists (fun msg -> msg.severity = Warning) t.msgs
165
166let document t = t.doc
167
168let system_id t = t.sys_id
169
170(* Convert public types back to internal for formatting *)
171
172let unconvert_severity = function
173 | Error -> Message.Error
174 | Warning -> Message.Warning
175 | Info -> Message.Info
176
177let unconvert_location (loc : location) : Message.location = {
178 line = loc.line;
179 column = loc.column;
180 end_line = loc.end_line;
181 end_column = loc.end_column;
182 system_id = loc.system_id;
183}
184
185let unconvert_error_code = function
186 | Parse code -> Message.Parse_error code
187 | Conformance code -> Message.Conformance_error code
188
189let unconvert_message (m : message) : Message.t = {
190 severity = unconvert_severity m.severity;
191 message = m.text;
192 error_code = unconvert_error_code m.error_code;
193 location = Option.map unconvert_location m.location;
194 element = m.element;
195 attribute = m.attribute;
196 extract = m.extract;
197}
198
199let to_text t =
200 let internal_msgs = List.map unconvert_message t.msgs in
201 Message_format.format_text ?system_id:t.sys_id internal_msgs
202
203let to_json t =
204 let internal_msgs = List.map unconvert_message t.msgs in
205 Message_format.format_json ?system_id:t.sys_id internal_msgs
206
207let to_gnu t =
208 let internal_msgs = List.map unconvert_message t.msgs in
209 Message_format.format_gnu ?system_id:t.sys_id internal_msgs
210
211(* Utility functions *)
212
213let severity_to_string = function
214 | Error -> "error"
215 | Warning -> "warning"
216 | Info -> "info"
217
218let error_code_to_string = function
219 | Parse code -> Html5rw.Parse_error_code.to_string code
220 | Conformance code -> Error_code.code_string code
221
222let pp_severity fmt sev =
223 Format.pp_print_string fmt (severity_to_string sev)
224
225let pp_location fmt loc =
226 Format.fprintf fmt "line %d, column %d" loc.line loc.column;
227 match loc.end_line, loc.end_column with
228 | Some el, Some ec -> Format.fprintf fmt " to line %d, column %d" el ec
229 | _ -> ()
230
231let pp_message fmt msg =
232 Format.fprintf fmt "%a: %s" pp_severity msg.severity msg.text;
233 match msg.location with
234 | Some loc -> Format.fprintf fmt " (at %a)" pp_location loc
235 | None -> ()