OCaml HTML5 parser/serialiser based on Python's JustHTML
1(*---------------------------------------------------------------------------
2 Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
3 SPDX-License-Identifier: MIT
4 ---------------------------------------------------------------------------*)
5
6(** Parse error codes as defined by the WHATWG HTML5 specification.
7
8 @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-errors>
9*)
10
(** A parse error code.

    Each constant constructor has a lowercase, hyphen-separated string
    form that {!to_string} produces and {!of_string} recognises; for
    example [Eof_in_tag] corresponds to ["eof-in-tag"].

    Constructors are listed alphabetically, with the payload-carrying
    [Tree_construction_error] last. *)
type t =
  | Abrupt_closing_of_empty_comment
  | Abrupt_doctype_public_identifier
  | Abrupt_doctype_system_identifier
  | Absence_of_digits_in_numeric_character_reference
  | Cdata_in_html_content
  | Character_reference_outside_unicode_range
  | Control_character_in_input_stream
  | Control_character_reference
  | Duplicate_attribute
  | End_tag_with_attributes
  | End_tag_with_trailing_solidus
  | Eof_before_tag_name
  | Eof_in_cdata
  | Eof_in_comment
  | Eof_in_doctype
  | Eof_in_script_html_comment_like_text
  | Eof_in_tag
  | Incorrectly_closed_comment
  | Incorrectly_opened_comment
  | Invalid_character_sequence_after_doctype_name
  | Invalid_first_character_of_tag_name
  | Missing_attribute_value
  | Missing_doctype_name
  | Missing_doctype_public_identifier
  | Missing_doctype_system_identifier
  | Missing_end_tag_name
  | Missing_quote_before_doctype_public_identifier
  | Missing_quote_before_doctype_system_identifier
  | Missing_semicolon_after_character_reference
  | Missing_whitespace_after_doctype_public_keyword
  | Missing_whitespace_after_doctype_system_keyword
  | Missing_whitespace_before_doctype_name
  | Missing_whitespace_between_attributes
  | Missing_whitespace_between_doctype_public_and_system_identifiers
  | Nested_comment
  | Noncharacter_character_reference
  | Noncharacter_in_input_stream
  | Non_void_html_element_start_tag_with_trailing_solidus
  | Null_character_reference
  | Surrogate_character_reference
  | Surrogate_in_input_stream
  | Unexpected_character_after_doctype_system_identifier
  | Unexpected_character_in_attribute_name
  | Unexpected_character_in_unquoted_attribute_value
  | Unexpected_equals_sign_before_attribute_name
  | Unexpected_null_character
  | Unexpected_question_mark_instead_of_tag_name
  | Unexpected_solidus_in_tag
  | Unknown_named_character_reference
  | Tree_construction_error of string
      (** A code with no dedicated constructor; the payload is the code
          string itself. {!to_string} returns the payload unchanged,
          {!of_string} builds this case for every string it does not
          recognise, and {!is_whatwg_standard} is [false] only for this
          case. *)
62
(** [to_string code] is the textual form of [code].

    A constant constructor maps to its lowercase, hyphen-separated
    spelling; [Tree_construction_error msg] yields [msg] unchanged.

    The arms are grouped by the construct named in the code rather than
    alphabetically. All patterns are disjoint, so arm order has no
    effect on the result. *)
let to_string code =
  match code with
  (* Payload case: the carried string already is the code. *)
  | Tree_construction_error msg -> msg
  (* Input stream. *)
  | Control_character_in_input_stream -> "control-character-in-input-stream"
  | Noncharacter_in_input_stream -> "noncharacter-in-input-stream"
  | Surrogate_in_input_stream -> "surrogate-in-input-stream"
  (* Character references. *)
  | Absence_of_digits_in_numeric_character_reference ->
      "absence-of-digits-in-numeric-character-reference"
  | Character_reference_outside_unicode_range ->
      "character-reference-outside-unicode-range"
  | Control_character_reference -> "control-character-reference"
  | Missing_semicolon_after_character_reference ->
      "missing-semicolon-after-character-reference"
  | Noncharacter_character_reference -> "noncharacter-character-reference"
  | Null_character_reference -> "null-character-reference"
  | Surrogate_character_reference -> "surrogate-character-reference"
  | Unknown_named_character_reference -> "unknown-named-character-reference"
  (* Comments. *)
  | Abrupt_closing_of_empty_comment -> "abrupt-closing-of-empty-comment"
  | Eof_in_comment -> "eof-in-comment"
  | Incorrectly_closed_comment -> "incorrectly-closed-comment"
  | Incorrectly_opened_comment -> "incorrectly-opened-comment"
  | Nested_comment -> "nested-comment"
  (* Doctype. *)
  | Abrupt_doctype_public_identifier -> "abrupt-doctype-public-identifier"
  | Abrupt_doctype_system_identifier -> "abrupt-doctype-system-identifier"
  | Eof_in_doctype -> "eof-in-doctype"
  | Invalid_character_sequence_after_doctype_name ->
      "invalid-character-sequence-after-doctype-name"
  | Missing_doctype_name -> "missing-doctype-name"
  | Missing_doctype_public_identifier -> "missing-doctype-public-identifier"
  | Missing_doctype_system_identifier -> "missing-doctype-system-identifier"
  | Missing_quote_before_doctype_public_identifier ->
      "missing-quote-before-doctype-public-identifier"
  | Missing_quote_before_doctype_system_identifier ->
      "missing-quote-before-doctype-system-identifier"
  | Missing_whitespace_after_doctype_public_keyword ->
      "missing-whitespace-after-doctype-public-keyword"
  | Missing_whitespace_after_doctype_system_keyword ->
      "missing-whitespace-after-doctype-system-keyword"
  | Missing_whitespace_before_doctype_name ->
      "missing-whitespace-before-doctype-name"
  | Missing_whitespace_between_doctype_public_and_system_identifiers ->
      "missing-whitespace-between-doctype-public-and-system-identifiers"
  | Unexpected_character_after_doctype_system_identifier ->
      "unexpected-character-after-doctype-system-identifier"
  (* Tags. *)
  | End_tag_with_attributes -> "end-tag-with-attributes"
  | End_tag_with_trailing_solidus -> "end-tag-with-trailing-solidus"
  | Eof_before_tag_name -> "eof-before-tag-name"
  | Eof_in_tag -> "eof-in-tag"
  | Invalid_first_character_of_tag_name ->
      "invalid-first-character-of-tag-name"
  | Missing_end_tag_name -> "missing-end-tag-name"
  | Non_void_html_element_start_tag_with_trailing_solidus ->
      "non-void-html-element-start-tag-with-trailing-solidus"
  | Unexpected_question_mark_instead_of_tag_name ->
      "unexpected-question-mark-instead-of-tag-name"
  | Unexpected_solidus_in_tag -> "unexpected-solidus-in-tag"
  (* Attributes. *)
  | Duplicate_attribute -> "duplicate-attribute"
  | Missing_attribute_value -> "missing-attribute-value"
  | Missing_whitespace_between_attributes ->
      "missing-whitespace-between-attributes"
  | Unexpected_character_in_attribute_name ->
      "unexpected-character-in-attribute-name"
  | Unexpected_character_in_unquoted_attribute_value ->
      "unexpected-character-in-unquoted-attribute-value"
  | Unexpected_equals_sign_before_attribute_name ->
      "unexpected-equals-sign-before-attribute-name"
  (* Remaining codes: CDATA, script text and NUL. *)
  | Cdata_in_html_content -> "cdata-in-html-content"
  | Eof_in_cdata -> "eof-in-cdata"
  | Eof_in_script_html_comment_like_text ->
      "eof-in-script-html-comment-like-text"
  | Unexpected_null_character -> "unexpected-null-character"
133
(** [of_string name] is the error code whose string form is [name].

    Matching is exact: only the literal spellings listed below select a
    constant constructor. Every other string [s], including the empty
    string, becomes [Tree_construction_error s], so this function never
    fails.

    The arms are grouped by the construct named in the code. The literal
    patterns are disjoint, so only the final catch-all depends on
    position. *)
let of_string name =
  match name with
  (* Input stream. *)
  | "control-character-in-input-stream" -> Control_character_in_input_stream
  | "noncharacter-in-input-stream" -> Noncharacter_in_input_stream
  | "surrogate-in-input-stream" -> Surrogate_in_input_stream
  (* Character references. *)
  | "absence-of-digits-in-numeric-character-reference" ->
      Absence_of_digits_in_numeric_character_reference
  | "character-reference-outside-unicode-range" ->
      Character_reference_outside_unicode_range
  | "control-character-reference" -> Control_character_reference
  | "missing-semicolon-after-character-reference" ->
      Missing_semicolon_after_character_reference
  | "noncharacter-character-reference" -> Noncharacter_character_reference
  | "null-character-reference" -> Null_character_reference
  | "surrogate-character-reference" -> Surrogate_character_reference
  | "unknown-named-character-reference" -> Unknown_named_character_reference
  (* Comments. *)
  | "abrupt-closing-of-empty-comment" -> Abrupt_closing_of_empty_comment
  | "eof-in-comment" -> Eof_in_comment
  | "incorrectly-closed-comment" -> Incorrectly_closed_comment
  | "incorrectly-opened-comment" -> Incorrectly_opened_comment
  | "nested-comment" -> Nested_comment
  (* Doctype. *)
  | "abrupt-doctype-public-identifier" -> Abrupt_doctype_public_identifier
  | "abrupt-doctype-system-identifier" -> Abrupt_doctype_system_identifier
  | "eof-in-doctype" -> Eof_in_doctype
  | "invalid-character-sequence-after-doctype-name" ->
      Invalid_character_sequence_after_doctype_name
  | "missing-doctype-name" -> Missing_doctype_name
  | "missing-doctype-public-identifier" -> Missing_doctype_public_identifier
  | "missing-doctype-system-identifier" -> Missing_doctype_system_identifier
  | "missing-quote-before-doctype-public-identifier" ->
      Missing_quote_before_doctype_public_identifier
  | "missing-quote-before-doctype-system-identifier" ->
      Missing_quote_before_doctype_system_identifier
  | "missing-whitespace-after-doctype-public-keyword" ->
      Missing_whitespace_after_doctype_public_keyword
  | "missing-whitespace-after-doctype-system-keyword" ->
      Missing_whitespace_after_doctype_system_keyword
  | "missing-whitespace-before-doctype-name" ->
      Missing_whitespace_before_doctype_name
  | "missing-whitespace-between-doctype-public-and-system-identifiers" ->
      Missing_whitespace_between_doctype_public_and_system_identifiers
  | "unexpected-character-after-doctype-system-identifier" ->
      Unexpected_character_after_doctype_system_identifier
  (* Tags. *)
  | "end-tag-with-attributes" -> End_tag_with_attributes
  | "end-tag-with-trailing-solidus" -> End_tag_with_trailing_solidus
  | "eof-before-tag-name" -> Eof_before_tag_name
  | "eof-in-tag" -> Eof_in_tag
  | "invalid-first-character-of-tag-name" ->
      Invalid_first_character_of_tag_name
  | "missing-end-tag-name" -> Missing_end_tag_name
  | "non-void-html-element-start-tag-with-trailing-solidus" ->
      Non_void_html_element_start_tag_with_trailing_solidus
  | "unexpected-question-mark-instead-of-tag-name" ->
      Unexpected_question_mark_instead_of_tag_name
  | "unexpected-solidus-in-tag" -> Unexpected_solidus_in_tag
  (* Attributes. *)
  | "duplicate-attribute" -> Duplicate_attribute
  | "missing-attribute-value" -> Missing_attribute_value
  | "missing-whitespace-between-attributes" ->
      Missing_whitespace_between_attributes
  | "unexpected-character-in-attribute-name" ->
      Unexpected_character_in_attribute_name
  | "unexpected-character-in-unquoted-attribute-value" ->
      Unexpected_character_in_unquoted_attribute_value
  | "unexpected-equals-sign-before-attribute-name" ->
      Unexpected_equals_sign_before_attribute_name
  (* Remaining codes: CDATA, script text and NUL. *)
  | "cdata-in-html-content" -> Cdata_in_html_content
  | "eof-in-cdata" -> Eof_in_cdata
  | "eof-in-script-html-comment-like-text" ->
      Eof_in_script_html_comment_like_text
  | "unexpected-null-character" -> Unexpected_null_character
  (* Must stay last: anything unrecognised is kept verbatim. *)
  | other -> Tree_construction_error other
204
(** [of_string_opt s] is [Some (of_string s)].

    [of_string] accepts every string (unrecognised input becomes
    [Tree_construction_error s]), so the result is always [Some _] and
    [None] is never returned.

    NOTE(review): callers that expect [None] for unrecognised codes
    should test the result with [is_whatwg_standard] instead; confirm
    this is the intended contract. *)
let of_string_opt s =
  let code = of_string s in
  Some code
206
(** [is_whatwg_standard code] is [false] when [code] is a
    [Tree_construction_error _] and [true] for every other constructor.

    The payload is not inspected: a [Tree_construction_error] carrying a
    string that [of_string] would recognise still yields [false]. *)
let is_whatwg_standard code =
  match code with
  | Tree_construction_error _ -> false
  | _ -> true
210
(** [pp ppf code] prints the [to_string] form of [code] on the formatter
    [ppf], with no surrounding quotes or trailing newline. It can be used
    with the [%a] conversion of [Format.fprintf]. *)
let pp ppf code = code |> to_string |> Format.pp_print_string ppf