(* OCaml HTML5 parser/serialiser based on Python's JustHTML *)
(*---------------------------------------------------------------------------
   Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
   SPDX-License-Identifier: MIT
  ---------------------------------------------------------------------------*)

(** HTML5 Conformance Checker

    Validates HTML5 documents against the
    {{:https://html.spec.whatwg.org/} WHATWG HTML Living Standard}.

    {2 Quick Start}

    {[
      let result = Htmlrw_check.check_string "<html><body><img></body></html>" in

      if Htmlrw_check.has_errors result then
        print_endline (Htmlrw_check.to_text result)
      else
        print_endline "Valid HTML5!"
    ]}

    {2 Handling Specific Errors}

    Use pattern matching on [error_code] for fine-grained control:

    {[
      List.iter (fun msg ->
        match msg.Htmlrw_check.error_code with
        | Parse code ->
            Printf.printf "Syntax error: %s\n"
              (Html5rw.Parse_error_code.to_string code)
        | Conformance code ->
            match code with
            | `Img `Missing_alt ->
                Printf.printf "Accessibility: %s needs alt text\n"
                  (Option.value ~default:"image" msg.element)
            | `Attr (`Duplicate_id _) ->
                Printf.printf "Duplicate ID found\n"
            | _ ->
                Printf.printf "Error: %s\n" msg.text
      ) (Htmlrw_check.errors result)
    ]}

    {2 CI Integration}

    {[
      let validate_file path =
        let ic = open_in path in
        let reader = Bytesrw.Bytes.Reader.of_in_channel ic in
        let result = Htmlrw_check.check ~system_id:path reader in
        close_in ic;
        if Htmlrw_check.has_errors result then begin
          print_string (Htmlrw_check.to_gnu result);
          exit 1
        end
    ]}

    {2 What Gets Checked}

    - {b Parse errors}: Malformed syntax per WHATWG parsing specification
    - {b Content model}: Invalid element nesting (e.g., [<div>] inside [<p>])
    - {b Attributes}: Missing required, disallowed, or invalid attributes
    - {b Accessibility}: ARIA misuse, missing alt text, form labeling
    - {b Structure}: Missing DOCTYPE, duplicate IDs, heading hierarchy
    - {b Internationalization}: Missing or mismatched lang attributes

    @see <https://html.spec.whatwg.org/> WHATWG HTML Living Standard
    @see <https://validator.w3.org/nu/> Nu HTML Checker *)


(** {1:types Types} *)

(** Severity level of a validation {!message}.

    {!severity_to_string} gives the textual form of each level. *)
type severity =
  | Error  (** Conformance violation - the document is invalid. *)
  | Warning  (** Likely problem - may be intentional. *)
  | Info  (** Suggestion for improvement. *)

(** Source location in the document. Line and column are 1-indexed. *)
type location = {
  line : int;  (** Line of the location (1-indexed). *)
  column : int;  (** Column of the location (1-indexed). *)
  end_line : int option;  (** End line, when available. *)
  end_column : int option;  (** End column, when available. *)
  system_id : string option;  (** File path or URL if provided. *)
}

(** Typed error code. Pattern match to handle specific errors.

    {[
      match msg.error_code with
      | Parse Html5rw.Parse_error_code.Eof_in_tag ->
          (* Unclosed tag at end of file *)
          ()
      | Conformance (`Img `Missing_alt) ->
          (* Image without alt attribute *)
          ()
      | _ -> ()
    ]} *)
type error_code =
  | Parse of Html5rw.Parse_error_code.t
      (** Syntax error from the HTML5 parser.
          @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-errors> *)
  | Conformance of Error_code.t
      (** Semantic error from conformance checking. *)

(** A validation message.

    Messages are retrieved from a validation result with {!messages} and
    the related accessors. *)
type message = {
  severity : severity;  (** Severity level of the message. *)
  text : string;  (** Human-readable description. *)
  error_code : error_code;  (** Typed code for pattern matching. *)
  location : location option;  (** Source location if available. *)
  element : string option;  (** Relevant element (lowercase). *)
  attribute : string option;  (** Relevant attribute (lowercase). *)
  extract : string option;  (** Source excerpt for context. *)
}

(** Validation result.

    The type is abstract: values are produced by {!check_string}, {!check}
    and {!check_parsed}; use the accessors below to inspect them. *)
type t


(** {1:validation Validation} *)

(** [check_string ?system_id html] validates the HTML held in the string
    [html].

    {[
      let result = Htmlrw_check.check_string html in
      if Htmlrw_check.has_errors result then
        prerr_endline (Htmlrw_check.to_text result)
    ]}

    @param system_id File path or URL for error messages. *)
val check_string : ?system_id:string -> string -> t

(** [check ?collect_parse_errors ?system_id reader] validates HTML read
    from [reader].

    @param collect_parse_errors Include syntax errors (default: [true]).
    @param system_id File path or URL for error messages. *)
val check :
  ?collect_parse_errors:bool ->
  ?system_id:string ->
  Bytesrw.Bytes.Reader.t ->
  t

(** [check_parsed ?collect_parse_errors ?system_id doc] validates an
    already-parsed document.

    Useful when you've parsed the HTML separately and want to run
    conformance checks without re-parsing.

    @param collect_parse_errors Whether syntax errors are included in the
      result. The default is not stated here; presumably [true] as for
      {!check} - confirm against the implementation.
    @param system_id File path or URL for error messages. *)
val check_parsed :
  ?collect_parse_errors:bool ->
  ?system_id:string ->
  Html5rw.t ->
  t


(** {1:results Results} *)

(** [messages t] is every message of the result, in document order. *)
val messages : t -> message list

(** [errors t] is only the error-severity messages. *)
val errors : t -> message list

(** [warnings t] is only the warning-severity messages. *)
val warnings : t -> message list

(** [infos t] is only the info-severity messages. *)
val infos : t -> message list

(** [parse_errors t] is only the syntax errors from the parser. *)
val parse_errors : t -> message list

(** [conformance_errors t] is only the semantic errors from conformance
    checking. *)
val conformance_errors : t -> message list

(** [has_errors t] is [true] if any errors were found. *)
val has_errors : t -> bool

(** [has_warnings t] is [true] if any warnings were found. *)
val has_warnings : t -> bool

(** [document t] is the parsed document. *)
val document : t -> Html5rw.t

(** [system_id t] is the system identifier (file path or URL) if one was
    provided, [None] otherwise. *)
val system_id : t -> string option


(** {1:formatting Output Formatting} *)

(** [to_text t] renders the result in a human-readable text format.

{v
file.html:5.3: error [missing-alt]: Element "img" is missing required attribute "alt".
v} *)
val to_text : t -> string

(** [to_json t] renders the result as JSON compatible with the Nu HTML
    Validator.

{v
{"messages":[{"type":"error","message":"...","firstLine":5,"firstColumn":3}]}
v} *)
val to_json : t -> string

(** [to_gnu t] renders the result in GNU error format for IDE integration.

{v
file.html:5:3: error: Element "img" is missing required attribute "alt".
v} *)
val to_gnu : t -> string


(** {1:utilities Utilities} *)

(** [severity_to_string s] is ["error"], ["warning"], or ["info"]. *)
val severity_to_string : severity -> string

(** [error_code_to_string c] is a string representation of the error
    code [c]. *)
val error_code_to_string : error_code -> string

(** Pretty-printer for {!severity}. *)
val pp_severity : Format.formatter -> severity -> unit

(** Pretty-printer for {!location}. *)
val pp_location : Format.formatter -> location -> unit

(** Pretty-printer for {!message}. *)
val pp_message : Format.formatter -> message -> unit


(** {1:error_codes Error Code Types}

    For pattern matching on conformance errors. Parse errors use
    {!Html5rw.Parse_error_code}.

    {[
      match code with
      | `Attr (`Missing_required_attr _) -> ...
      | `Img `Missing_alt -> ...
      | `Aria _ -> ... (* Any ARIA error *)
      | _ -> ...
    ]} *)
module Error_code = Error_code