(* OCaml HTML5 parser/serialiser based on Python's JustHTML *)
(*---------------------------------------------------------------------------
   Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
   SPDX-License-Identifier: MIT
  ---------------------------------------------------------------------------*)
5
(** Typed HTML5 element representation.

    This module combines tags and attributes into a complete typed element
    representation. Elements are created from raw input (tag name, namespace,
    attributes) and provide typed accessors for validation and manipulation.

    {2 Design Philosophy}

    An element in this module represents a complete typed view of an HTML
    element, including:

    - The element's tag (typed via {!Tag.element_tag})
    - Typed attributes (via {!Attr.t} list)
    - Raw attributes (for fallback access)

    This dual representation allows checkers to use typed pattern matching
    for common cases while falling back to raw strings when needed.

    {2 Usage Example}

    {[
      let elem =
        Element.create ~name:"input" ~namespace:None
          ~attrs:[ ("type", "email"); ("required", ""); ("class", "form-input") ]
      in
      match elem.tag with
      | Tag.Html `Input ->
          if Element.has_required elem then
            (* Validate required input *)
            ()
      | _ -> ()
    ]}

    See {!Tag} for element tag types and {!Attr} for attribute types. *)
43
(** {1 Element Type} *)
45
(** A typed HTML element.

    Holds both the typed view of the element (tag and typed attributes) and
    the raw attribute pairs it was built from. *)
type t = {
  tag : Tag.element_tag;  (** The element's tag classification. *)
  attrs : Attr.t list;  (** Typed attributes parsed from raw input. *)
  raw_attrs : (string * string) list;
      (** Original attribute name-value pairs, kept for fallback access. *)
}
56
(** {1 Construction} *)
58
val create :
  name:string -> namespace:string option -> attrs:(string * string) list -> t
(** [create ~name ~namespace ~attrs] creates a typed element from raw input.

    @param name The element's tag name
    @param namespace Optional namespace URI (for SVG/MathML)
    @param attrs Raw attribute name-value pairs
    @return A typed element

    {b Example:}
    {[
      let div =
        Element.create ~name:"div" ~namespace:None
          ~attrs:[ ("class", "container"); ("id", "main") ]
    ]} *)
72
(** {1 Tag Accessors} *)
74
val tag : t -> Tag.element_tag
(** [tag elem] returns the tag classification of [elem]. *)
77
val tag_name : t -> string
(** [tag_name elem] returns the tag name of [elem] as a string. *)
80
val is_html_tag : Tag.html_tag -> t -> bool
(** [is_html_tag expected elem] checks whether [elem] is the specific HTML
    tag [expected].

    @param expected The expected HTML tag variant
    @param elem The element to check
    @return [true] if the element matches *)
87
val as_html_tag : t -> Tag.html_tag option
(** [as_html_tag elem] extracts the HTML tag if [elem] is an HTML element.

    @return [Some tag] for HTML elements, [None] for
      SVG/MathML/Custom/Unknown elements *)
92
(** {1 Attribute Accessors} *)
94
val attrs : t -> Attr.t list
(** [attrs elem] returns the typed attributes of [elem]. *)
97
val raw_attrs : t -> (string * string) list
(** [raw_attrs elem] returns the original raw attributes of [elem] as
    name-value pairs. *)
100
val get_id : t -> string option
(** [get_id elem] extracts the value of the [id] attribute, if any. *)
103
val get_class : t -> string option
(** [get_class elem] extracts the value of the [class] attribute, if any. *)
106
val get_href : t -> string option
(** [get_href elem] extracts the value of the [href] attribute, if any. *)
109
val get_src : t -> string option
(** [get_src elem] extracts the value of the [src] attribute, if any. *)
112
val get_alt : t -> string option
(** [get_alt elem] extracts the value of the [alt] attribute, if any. *)
115
val get_name : t -> string option
(** [get_name elem] extracts the value of the [name] attribute, if any. *)
118
val get_value : t -> string option
(** [get_value elem] extracts the value of the [value] attribute, if any. *)
121
val get_role : t -> string option
(** [get_role elem] extracts the value of the [role] attribute, if any. *)
124
val get_aria : string -> t -> string option
(** [get_aria name elem] extracts the value of a specific [aria-*] attribute.

    @param name The aria attribute name without the ["aria-"] prefix
      (e.g. ["label"] for [aria-label])
    @param elem The element to inspect *)
129
val get_data : string -> t -> string option
(** [get_data name elem] extracts the value of a specific [data-*] attribute.

    @param name The data attribute name without the ["data-"] prefix
      (e.g. ["id"] for [data-id])
    @param elem The element to inspect *)
134
val has_disabled : t -> bool
(** [has_disabled elem] is [true] if the [disabled] attribute is present. *)
137
val has_required : t -> bool
(** [has_required elem] is [true] if the [required] attribute is present. *)
140
val has_readonly : t -> bool
(** [has_readonly elem] is [true] if the [readonly] attribute is present. *)
143
val has_checked : t -> bool
(** [has_checked elem] is [true] if the [checked] attribute is present. *)
146
val has_autofocus : t -> bool
(** [has_autofocus elem] is [true] if the [autofocus] attribute is present. *)
149
val has_hidden : t -> bool
(** [has_hidden elem] is [true] if the [hidden] attribute is present. *)
152
val has_inert : t -> bool
(** [has_inert elem] is [true] if the [inert] attribute is present. *)
155
val has_open : t -> bool
(** [has_open elem] is [true] if the [open] attribute is present. *)
158
val get_all_aria : t -> (string * string) list
(** [get_all_aria elem] extracts all [aria-*] attributes of [elem] as
    name-value pairs.

    NOTE(review): this interface does not state whether the returned names
    keep the ["aria-"] prefix; confirm against the implementation. *)
161
val get_all_data : t -> (string * string) list
(** [get_all_data elem] extracts all [data-*] attributes of [elem] as
    name-value pairs.

    NOTE(review): this interface does not state whether the returned names
    keep the ["data-"] prefix; confirm against the implementation. *)
164
(** {1 Raw Attribute Fallback} *)
166
val get_raw_attr : string -> t -> string option
(** [get_raw_attr name elem] gets a raw attribute value by name.

    This is useful when the typed representation doesn't capture a specific
    attribute or when you need the exact original value.

    @param name The attribute name (case-insensitive)
    @param elem The element
    @return [Some value] if the attribute exists, [None] otherwise *)
176
val has_raw_attr : string -> t -> bool
(** [has_raw_attr name elem] checks whether a raw attribute exists.

    @param name The attribute name (case-insensitive)
    @param elem The element
    @return [true] if the attribute is present *)
183
(** {1 Category Checks}

    These predicates check element categories based on the HTML5 content
    model. *)
187
val is_void : t -> bool
(** [is_void elem] checks whether [elem] is a void element (one that cannot
    have children).

    @return [true] for br, hr, img, input, etc. *)
192
val is_heading : t -> bool
(** [is_heading elem] checks whether [elem] is a heading element.

    @return [true] for h1-h6 *)
197
val heading_level : t -> int option
(** [heading_level elem] gets the heading level (1-6) of [elem], if
    applicable.

    @return [Some level] for h1-h6, [None] otherwise *)
202
val is_sectioning : t -> bool
(** [is_sectioning elem] checks whether [elem] is sectioning content.

    @return [true] for article, aside, nav, section *)
207
val is_sectioning_root : t -> bool
(** [is_sectioning_root elem] checks whether [elem] is a sectioning root.

    @return [true] for blockquote, body, details, dialog, fieldset, figure,
      td *)
212
val is_embedded : t -> bool
(** [is_embedded elem] checks whether [elem] is embedded content.

    @return [true] for audio, canvas, embed, iframe, img, object, picture,
      video *)
217
val is_interactive : t -> bool
(** [is_interactive elem] checks whether [elem] is interactive content.

    @return [true] for focusable/activatable elements *)
222
val is_form_associated : t -> bool
(** [is_form_associated elem] checks whether [elem] is form-associated.

    @return [true] for elements that can belong to a form *)
227
val is_labelable : t -> bool
(** [is_labelable elem] checks whether [elem] can be associated with a
    label.

    @return [true] for button, input, meter, output, progress, select,
      textarea *)
232
val is_submittable : t -> bool
(** [is_submittable elem] checks whether [elem] is a submittable form
    element.

    @return [true] for button, input, select, textarea *)
237
val is_table_element : t -> bool
(** [is_table_element elem] checks whether [elem] is a table-related
    element.

    @return [true] for table, tr, td, th, etc. *)
242
val is_media : t -> bool
(** [is_media elem] checks whether [elem] is a media element.

    @return [true] for audio, video *)
247
val is_list_container : t -> bool
(** [is_list_container elem] checks whether [elem] is a list container.

    @return [true] for ul, ol, menu, dl *)
252
val is_transparent : t -> bool
(** [is_transparent elem] checks whether [elem] has a transparent content
    model.

    NOTE(review): an earlier revision of this comment listed abbr among the
    examples. In the HTML standard, abbr has a phrasing content model rather
    than a transparent one; confirm what the implementation actually returns
    for abbr.

    @return [true] for a, audio, canvas, del, ins, map, noscript, etc. *)
257
val is_phrasing : t -> bool
(** [is_phrasing elem] checks whether [elem] is phrasing content.

    @return [true] for inline-level elements *)
262
val is_flow : t -> bool
(** [is_flow elem] checks whether [elem] is flow content.

    @return [true] for most body-level elements *)
267
val is_obsolete : t -> bool
(** [is_obsolete elem] checks whether [elem] is an obsolete (deprecated)
    element.

    @return [true] for applet, font, marquee, etc. *)
272
val is_svg : t -> bool
(** [is_svg elem] checks whether [elem] is an SVG element.

    @return [true] if the element is in the SVG namespace *)
277
val is_mathml : t -> bool
(** [is_mathml elem] checks whether [elem] is a MathML element.

    @return [true] if the element is in the MathML namespace *)
282
val is_custom : t -> bool
(** [is_custom elem] checks whether [elem] is a custom element.

    @return [true] if the element name contains a hyphen *)
287
val is_unknown : t -> bool
(** [is_unknown elem] checks whether [elem] is an unknown element.

    @return [true] if the element is not recognized *)
292
(** {1 Input Type Utilities} *)
294
val get_input_type : t -> Attr.input_type option
(** [get_input_type elem] gets the input type for input elements.

    @return [Some type] for input elements with a type, [None] otherwise *)
299
val get_button_type : t -> Attr.button_type option
(** [get_button_type elem] gets the button type for button elements.

    @return [Some type] for button elements with a type, [None] otherwise *)
304
val is_input_type : Attr.input_type -> t -> bool
(** [is_input_type expected elem] checks whether [elem] is an input with a
    specific type.

    @param expected The expected input type
    @param elem The element to check
    @return [true] if this is an input with the specified type *)
311
(** {1 Pattern Matching Helpers} *)
313
val match_html : t -> (Tag.html_tag -> 'a) -> 'a option
(** [match_html elem f] applies [f] to the HTML tag of [elem], if it has one.

    @param elem The element
    @param f Function to apply to the HTML tag
    @return [Some (f tag)] for HTML elements, [None] otherwise *)
320
val when_html_tag : Tag.html_tag -> t -> (unit -> 'a) -> 'a option
(** [when_html_tag expected elem f] calls [f] if [elem] is the HTML tag
    [expected].

    @param expected The expected HTML tag
    @param elem The element to check
    @param f Function to call if the element matches
    @return [Some (f ())] if matched, [None] otherwise *)
328
(** {1 Internal} *)
330
val parse_type_attr : Tag.html_tag -> string -> Attr.t
(** [parse_type_attr tag value] parses a [type] attribute for an element.

    Different elements have different valid [type] values. This function
    handles context-dependent parsing.

    @param tag The element's HTML tag
    @param value The [type] attribute value
    @return The parsed attribute variant *)
340
val parse_attrs_for_tag :
  Tag.element_tag -> (string * string) list -> Attr.t list
(** [parse_attrs_for_tag tag raw_attrs] parses attributes with element
    context.

    The [type] attribute is parsed differently depending on the element tag.

    @param tag The element's tag
    @param raw_attrs Raw attribute name-value pairs
    @return List of typed attributes *)