OCaml HTML5 parser/serialiser based on Python's JustHTML
1(*---------------------------------------------------------------------------
2 Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
3 SPDX-License-Identifier: MIT
4 ---------------------------------------------------------------------------*)
5
6(** Parse error codes as defined by the WHATWG HTML5 specification.
7
8 The HTML5 parser never fails - it always produces a DOM tree. However,
9 the specification defines these error codes for conformance checkers to
10 report issues in HTML documents.
11
12 Each error code corresponds to a specific condition in the WHATWG
13 specification's parsing algorithm.
14
15 @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-errors>
16 WHATWG: Parse errors *)
17
(** The type of HTML5 parse error codes: one constructor per error code
    defined by the WHATWG specification, plus [Tree_construction_error]
    for errors that the specification does not define. *)
18type t =
19 | Abrupt_closing_of_empty_comment
20 (** Parser encounters [<!-->] or [<!--->]; comment is treated as
21 correctly closed.
22
23 @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-abrupt-closing-of-empty-comment> *)
24
25 | Abrupt_doctype_public_identifier
26 (** [>] found in DOCTYPE public identifier before closing quote;
27 sets document to quirks mode.
28
29 @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-abrupt-doctype-public-identifier> *)
30
31 | Abrupt_doctype_system_identifier
32 (** [>] found in DOCTYPE system identifier before closing quote;
33 sets document to quirks mode.
34
35 @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-abrupt-doctype-system-identifier> *)
36
37 | Absence_of_digits_in_numeric_character_reference
38 (** Numeric character reference has no digits (e.g., [&#qux;]);
39 the reference is not resolved.
40
41 @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-absence-of-digits-in-numeric-character-reference> *)
42
43 | Cdata_in_html_content
44 (** CDATA section found outside SVG or MathML foreign content;
45 treated as a bogus comment.
46
47 @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-cdata-in-html-content> *)
48
49 | Character_reference_outside_unicode_range
50 (** Numeric reference exceeds U+10FFFF; resolves to U+FFFD
51 REPLACEMENT CHARACTER.
52
53 @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-character-reference-outside-unicode-range> *)
54
55 | Control_character_in_input_stream
56 (** Control code point (other than ASCII whitespace or NULL)
57 appears in the input; parsed as-is.
58
59 @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-control-character-in-input-stream> *)
60
61 | Control_character_reference
62 (** Numeric reference to a control code point that is not ASCII
63 whitespace, or to U+000D CARRIAGE RETURN; resolved as-is, except
   that C1 control references are replaced as directed by the
   numeric character reference end state of the specification.
64
65 @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-control-character-reference> *)
66
67 | Duplicate_attribute
68 (** Tag contains duplicate attribute names; later duplicates
69 are removed.
70
71 @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-duplicate-attribute> *)
72
73 | End_tag_with_attributes
74 (** End tag includes attributes; attributes are ignored.
75
76 @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-end-tag-with-attributes> *)
77
78 | End_tag_with_trailing_solidus
79 (** End tag has [/] before [>] (like [</br/>]); treated as
80 regular end tag.
81
82 @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-end-tag-with-trailing-solidus> *)
83
84 | Eof_before_tag_name
85 (** End of input where tag name expected; [<] or [</] is
86 treated as text.
87
88 @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-eof-before-tag-name> *)
89
90 | Eof_in_cdata
91 (** End of input within CDATA section; treated as immediately closed.
92
93 @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-eof-in-cdata> *)
94
95 | Eof_in_comment
96 (** End of input within comment; comment is treated as
97 immediately closed.
98
99 @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-eof-in-comment> *)
100
101 | Eof_in_doctype
102 (** End of input within DOCTYPE; sets document to quirks mode.
103
104 @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-eof-in-doctype> *)
105
106 | Eof_in_script_html_comment_like_text
107 (** End of input within HTML-like comment syntax inside a script element.
108
109 @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-eof-in-script-html-comment-like-text> *)
110
111 | Eof_in_tag
112 (** End of input within a start or end tag; the tag is ignored.
113
114 @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-eof-in-tag> *)
115
116 | Incorrectly_closed_comment
117 (** Comment closed by [--!>] instead of [-->]; treated as
118 correctly closed.
119
120 @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-incorrectly-closed-comment> *)
121
122 | Incorrectly_opened_comment
123 (** [<!] not followed by [--] and not starting a DOCTYPE or CDATA
124 section (e.g., [<!ELEMENT]); content is treated as a bogus comment.
125
126 @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-incorrectly-opened-comment> *)
127
128 | Invalid_character_sequence_after_doctype_name
129 (** Neither "PUBLIC" nor "SYSTEM" after DOCTYPE name; sets
130 document to quirks mode.
131
132 @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-invalid-character-sequence-after-doctype-name> *)
133
134 | Invalid_first_character_of_tag_name
135 (** Non-ASCII-alpha character where tag name start expected;
136 [<] is treated as text.
137
138 @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-invalid-first-character-of-tag-name> *)
139
140 | Missing_attribute_value
141 (** [>] where attribute value expected (e.g., [<div id=>]);
142 attribute gets empty string value.
143
144 @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-missing-attribute-value> *)
145
146 | Missing_doctype_name
147 (** DOCTYPE has no name; sets document to quirks mode.
148
149 @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-missing-doctype-name> *)
150
151 | Missing_doctype_public_identifier
152 (** [>] where public identifier expected; sets document to quirks mode.
153
154 @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-missing-doctype-public-identifier> *)
155
156 | Missing_doctype_system_identifier
157 (** [>] where system identifier expected; sets document to quirks mode.
158
159 @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-missing-doctype-system-identifier> *)
160
161 | Missing_end_tag_name
162 (** [>] where end tag name expected ([</>]); sequence is ignored.
163
164 @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-missing-end-tag-name> *)
165
166 | Missing_quote_before_doctype_public_identifier
167 (** Public identifier lacks preceding quote; sets document to quirks mode.
168
169 @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-missing-quote-before-doctype-public-identifier> *)
170
171 | Missing_quote_before_doctype_system_identifier
172 (** System identifier lacks preceding quote; sets document to quirks mode.
173
174 @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-missing-quote-before-doctype-system-identifier> *)
175
176 | Missing_semicolon_after_character_reference
177 (** Character reference lacks terminating [;]; behaves as if
178 semicolon were present.
179
180 @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-missing-semicolon-after-character-reference> *)
181
182 | Missing_whitespace_after_doctype_public_keyword
183 (** No whitespace between "PUBLIC" and identifier; treated as
184 if whitespace were present.
185
186 @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-missing-whitespace-after-doctype-public-keyword> *)
187
188 | Missing_whitespace_after_doctype_system_keyword
189 (** No whitespace between "SYSTEM" and identifier; treated as
190 if whitespace were present.
191
192 @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-missing-whitespace-after-doctype-system-keyword> *)
193
194 | Missing_whitespace_before_doctype_name
195 (** No whitespace between "DOCTYPE" and name; treated as if
196 whitespace were present.
197
198 @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-missing-whitespace-before-doctype-name> *)
199
200 | Missing_whitespace_between_attributes
201 (** Adjacent attributes lack separating whitespace; treated as
202 if whitespace were present.
203
204 @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-missing-whitespace-between-attributes> *)
205
206 | Missing_whitespace_between_doctype_public_and_system_identifiers
207 (** Public and system identifiers not separated by whitespace;
208 treated as if whitespace were present.
209
210 @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-missing-whitespace-between-doctype-public-and-system-identifiers> *)
211
212 | Nested_comment
213 (** Nested [<!--] detected within comment; comment still closes
214 at first [-->].
215
216 @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-nested-comment> *)
217
218 | Noncharacter_character_reference
219 (** Numeric reference to a Unicode noncharacter; resolved as-is
220 (not replaced).
221
222 @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-noncharacter-character-reference> *)
223
224 | Noncharacter_in_input_stream
225 (** Unicode noncharacter code point in input; parsed as-is.
226
227 @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-noncharacter-in-input-stream> *)
228
229 | Non_void_html_element_start_tag_with_trailing_solidus
230 (** Non-void element start tag has [/] before [>] (like
231 [<div/>]); the [/] is ignored.
232
233 @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-non-void-html-element-start-tag-with-trailing-solidus> *)
234
235 | Null_character_reference
236 (** Numeric reference to U+0000 (NULL); resolves to U+FFFD
237 REPLACEMENT CHARACTER.
238
239 @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-null-character-reference> *)
240
241 | Surrogate_character_reference
242 (** Numeric reference to a surrogate code point (U+D800-U+DFFF);
243 resolves to U+FFFD REPLACEMENT CHARACTER.
244
245 @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-surrogate-character-reference> *)
246
247 | Surrogate_in_input_stream
248 (** Surrogate code point in input stream; parsed as-is.
249
250 @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-surrogate-in-input-stream> *)
251
252 | Unexpected_character_after_doctype_system_identifier
253 (** Non-whitespace/non-[>] character after system identifier;
254 the character is ignored.
255
256 @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-unexpected-character-after-doctype-system-identifier> *)
257
258 | Unexpected_character_in_attribute_name
259 (** Double quote, single quote, or less-than sign in attribute name;
260 included in the attribute name.
261
262 @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-unexpected-character-in-attribute-name> *)
263
264 | Unexpected_character_in_unquoted_attribute_value
265 (** Double quote, single quote, less-than sign, equals sign, or
266 backtick in unquoted attribute value; included in the value.
267
268 @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-unexpected-character-in-unquoted-attribute-value> *)
269
270 | Unexpected_equals_sign_before_attribute_name
271 (** [=] where attribute name expected; treated as first
272 character of attribute name.
273
274 @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-unexpected-equals-sign-before-attribute-name> *)
275
276 | Unexpected_null_character
277 (** U+0000 (NULL) in various positions; ignored or replaced
278 with U+FFFD depending on context.
279
280 @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-unexpected-null-character> *)
281
282 | Unexpected_question_mark_instead_of_tag_name
283 (** [?] where tag name expected (like [<?xml]); treated as
284 start of bogus comment.
285
286 @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-unexpected-question-mark-instead-of-tag-name> *)
287
288 | Unexpected_solidus_in_tag
289 (** [/] in tag not immediately before [>]; treated as
290 whitespace.
291
292 @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-unexpected-solidus-in-tag> *)
293
294 | Unknown_named_character_reference
295 (** Ambiguous ampersand: [&] followed by characters that don't
296 match any named reference; not resolved as reference.
297
298 @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-unknown-named-character-reference> *)
299
300 | Tree_construction_error of string
301 (** Tree construction error not defined in the WHATWG specification.
302
303 These are informative errors produced during tree construction
304 to indicate various issues like unexpected tags, missing closing
305 tags, etc. The string contains a descriptive error code.

   This is also the constructor [of_string] returns for any string
   that is not a known WHATWG error code, and the only one for which
   [is_whatwg_standard] returns [false]. *)
306
307val to_string : t -> string
308(** [to_string code] converts [code] to its WHATWG specification string
   representation.
309
310 The returned string is lowercase with hyphens, matching the WHATWG
311 specification naming convention. For example:
312 - [Abrupt_closing_of_empty_comment] becomes ["abrupt-closing-of-empty-comment"]
313 - [Eof_in_tag] becomes ["eof-in-tag"]

   NOTE(review): the result for [Tree_construction_error s] is not
   stated in this interface; presumably it is [s] itself, mirroring
   [of_string]. Confirm against the implementation. *)
314
315val of_string : string -> t
316(** [of_string s] parses an error code from its WHATWG specification
   string representation.
317
318 If the string matches a known WHATWG error code, returns that variant.
319 Otherwise, returns [Tree_construction_error s], so every input string
   maps to some error code. *)
320
321val of_string_opt : string -> t option
322(** [of_string_opt s] parses an error code from its WHATWG specification
   string representation, wrapped in an option.
323
324 Always returns [Some code]. For unrecognized strings, returns
325 [Some (Tree_construction_error s)].

   Because the result is never [None], it cannot be used to detect
   unknown codes; apply [is_whatwg_standard] to the returned code to
   tell a specification-defined code from the fallback. *)
326
327val is_whatwg_standard : t -> bool
328(** [is_whatwg_standard code] tells whether [code] is defined in the
   WHATWG specification.
329
330 Returns [false] for [Tree_construction_error _], [true] for all others.
   In particular it is [false] for the fallback value that [of_string]
   returns on an unrecognized string. *)
331
332val pp : Format.formatter -> t -> unit
333(** [pp ppf code] pretty-prints [code] on the formatter [ppf] using the
   WHATWG specification string format.

   NOTE(review): presumably this prints the same text that
   [to_string code] returns; confirm against the implementation. *)