OCaml HTML5 parser/serialiser based on Python's JustHTML
1(*---------------------------------------------------------------------------
2 Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
3 SPDX-License-Identifier: MIT
4 ---------------------------------------------------------------------------*)
5
6(** Typed HTML5 element representation.
7
8 This module combines tags and attributes into a complete typed element
9 representation. Elements are created from raw input (tag name, namespace,
10 attributes) and provide typed accessors for validation and manipulation.
11
12 {2 Design Philosophy}
13
14 An element in this module represents a complete typed view of an HTML
15 element, including:
16
17 - The element's tag (typed via {!Tag.element_tag})
18 - Typed attributes (via {!Attr.t} list)
19 - Raw attributes (for fallback access)
20
21 This dual representation allows checkers to use typed pattern matching
22 for common cases while falling back to raw strings when needed.
23
24 {2 Usage Example}
25
26 {[
27 let elem = Element.create
28 ~name:"input"
29 ~namespace:None
30 ~attrs:[("type", "email"); ("required", ""); ("class", "form-input")]
31 in
32 match elem.tag with
33 | Tag.Html `Input ->
34 if Element.has_required elem then
35 (* Validate required input *)
36 ()
37 | _ -> ()
38 ]}
39
40 See {!Tag} for element tag types
41 and {!Attr} for attribute types.
42*)
43
44(** {1 Element Type} *)
45
46(** A typed HTML element.
47
48 - [tag]: The element's tag classification
49 - [attrs]: Typed attributes parsed from raw input
50 - [raw_attrs]: Original attribute name-value pairs for fallback

 The record is exposed concretely (it is neither abstract nor [private]),
 so its fields can be read directly, e.g. [elem.tag], as well as through
 the accessor functions {!tag}, {!attrs} and {!raw_attrs}. It also means
 a value can be built without going through {!create}; in that case
 nothing in the type itself keeps [attrs] and [raw_attrs] in agreement. *)
51type t = {
52 tag : Tag.element_tag;
53 attrs : Attr.t list;
54 raw_attrs : (string * string) list;
55}
56
57(** {1 Construction} *)
58
59val create : name:string -> namespace:string option -> attrs:(string * string) list -> t
60(** [create ~name ~namespace ~attrs] creates a typed element.
61
 All three arguments are labelled and required. In particular
 [namespace] is a labelled argument of type [string option], not an
 optional ([?namespace]) argument, so it must always be supplied; the
 example below passes [~namespace:None] for a [div].

62 @param name The element's tag name
63 @param namespace Optional namespace URI (for SVG/MathML)
64 @param attrs Raw attribute name-value pairs
65 @return A typed element
66
67 {b Example:}
68 {[
69 let div = Element.create ~name:"div" ~namespace:None
70 ~attrs:[("class", "container"); ("id", "main")]
71 ]} *)
72
73(** {1 Tag Accessors} *)
74
75val tag : t -> Tag.element_tag
76(** [tag elem] returns the element's tag.

 Function-form counterpart of the [tag] field of {!t} (presumably the
 same value as [elem.tag]; confirm in the implementation). Useful where
 a function is needed, e.g. as an argument to [List.map]. *)
77
78val tag_name : t -> string
79(** [tag_name elem] returns the element's tag name as a string.

 NOTE(review): this interface does not say whether the result is the
 [name] originally passed to {!create} or a normalised (e.g. lowercased)
 form; check the implementation before comparing it against literals. *)
80
81val is_html_tag : Tag.html_tag -> t -> bool
82(** [is_html_tag expected elem] checks if the element is a specific HTML tag.
83
 The expected tag comes first and the element last, so the predicate
 can be partially applied, e.g. [List.filter (is_html_tag `Input) elems].

84 @param expected The expected HTML tag variant
85 @param elem The element to check
86 @return [true] if the element matches; otherwise [false] (presumably
 including every non-HTML element; confirm in the implementation) *)
87
88val as_html_tag : t -> Tag.html_tag option
89(** [as_html_tag elem] extracts the HTML tag if this is an HTML element.
90
 The four [None] cases listed below presumably line up with the
 {!is_svg}, {!is_mathml}, {!is_custom} and {!is_unknown} predicates.
 See also {!match_html}, which is documented to apply a function to the
 same tag and return [None] in the same situations.

91 @return [Some tag] for HTML elements, [None] for SVG/MathML/Custom/Unknown *)
92
93(** {1 Attribute Accessors} *)
94
95val attrs : t -> Attr.t list
96(** [attrs elem] returns the typed attributes.

 Function-form counterpart of the [attrs] field of {!t} (presumably the
 same list as [elem.attrs]; confirm in the implementation). *)
97
98val raw_attrs : t -> (string * string) list
99(** [raw_attrs elem] returns the original raw attributes.

 Each pair is [(name, value)], the same shape that {!create} accepts
 for its [attrs] argument. Function-form counterpart of the [raw_attrs]
 field of {!t} (presumably the same list; confirm in the
 implementation). *)
100
(* The eight accessors in this group all have type [t -> string option].
   Their docs do not state when [None] is returned; presumably it means
   the attribute is absent from the element. Confirm against the
   implementation before relying on it (e.g. for empty-valued
   attributes). *)
101val get_id : t -> string option
102(** [get_id elem] extracts the [id] attribute value. *)
103
104val get_class : t -> string option
105(** [get_class elem] extracts the [class] attribute value as a single,
 unsplit string. See {!get_class_list} for the individual class names. *)
106
107val get_href : t -> string option
108(** [get_href elem] extracts the [href] attribute value. *)
109
110val get_src : t -> string option
111(** [get_src elem] extracts the [src] attribute value. *)
112
113val get_alt : t -> string option
114(** [get_alt elem] extracts the [alt] attribute value. *)
115
116val get_name : t -> string option
117(** [get_name elem] extracts the [name] attribute value. This is the
 [name] attribute, not the element's tag name; see {!tag_name} for that. *)
118
119val get_value : t -> string option
120(** [get_value elem] extracts the [value] attribute value. *)
121
122val get_role : t -> string option
123(** [get_role elem] extracts the [role] attribute value. *)
124
125val get_aria : string -> t -> string option
126(** [get_aria name elem] extracts a specific aria-* attribute value.
127
 Because [name] excludes the prefix, [get_aria "label" elem] refers to
 the [aria-label] attribute. The element is the last argument, so
 [get_aria "label"] can be partially applied.

128 @param name The aria attribute name without the "aria-" prefix *)
129
130val get_data : string -> t -> string option
131(** [get_data name elem] extracts a specific data-* attribute value.
132
 Because [name] excludes the prefix, [get_data "id" elem] refers to the
 [data-id] attribute. The element is the last argument, so
 [get_data "id"] can be partially applied.

133 @param name The data attribute name without the "data-" prefix *)
134
(* Presence checks for boolean attributes, all of type [t -> bool].
   The usage example in the module header builds an element with
   [("required", "")] before calling [has_required], so presence
   presumably does not depend on the attribute's value. Confirm in the
   implementation. *)
135val has_disabled : t -> bool
136(** [has_disabled elem] checks if the [disabled] attribute is present. *)
137
138val has_required : t -> bool
139(** [has_required elem] checks if the [required] attribute is present. *)
140
141val has_readonly : t -> bool
142(** [has_readonly elem] checks if the [readonly] attribute is present. *)
143
144val has_checked : t -> bool
145(** [has_checked elem] checks if the [checked] attribute is present. *)
146
147val has_autofocus : t -> bool
148(** [has_autofocus elem] checks if the [autofocus] attribute is present. *)
149
150val has_hidden : t -> bool
151(** [has_hidden elem] checks if the [hidden] attribute is present. *)
152
153val has_inert : t -> bool
154(** [has_inert elem] checks if the [inert] attribute is present. *)
155
156val has_open : t -> bool
157(** [has_open elem] checks if the [open] attribute is present. *)
158
159val get_all_aria : t -> (string * string) list
160(** [get_all_aria elem] extracts all aria-* attributes.

 NOTE(review): the interface does not say whether the first component
 of each pair keeps the "aria-" prefix. {!get_aria} takes names without
 it, so check the implementation before feeding these keys back in. *)
161
162val get_all_data : t -> (string * string) list
163(** [get_all_data elem] extracts all data-* attributes.

 NOTE(review): the interface does not say whether the first component
 of each pair keeps the "data-" prefix. {!get_data} takes names without
 it, so check the implementation before feeding these keys back in. *)
164
165(** {1 Space-Separated List Accessors}
166
167 These functions return attribute values as parsed lists, splitting on
168 whitespace per HTML5 spec. *)
169
(* These accessors return a plain [string list] rather than an option,
   so a missing attribute cannot be told apart from an empty one through
   this API; presumably both give [[]]. Confirm in the implementation.
   Use {!get_raw_attr} or {!has_raw_attr} when the distinction matters. *)
170val get_class_list : t -> string list
171(** [get_class_list elem] returns class names as a list. The unsplit
 string is available from {!get_class}. *)
172
173val get_rel_list : t -> string list
174(** [get_rel_list elem] returns link types as a list. *)
175
176val get_headers_list : t -> string list
177(** [get_headers_list elem] returns header IDs as a list (for td/th). *)
178
179val get_itemref_list : t -> string list
180(** [get_itemref_list elem] returns itemref IDs as a list. *)
181
182val get_itemprop_list : t -> string list
183(** [get_itemprop_list elem] returns itemprop names as a list. *)
184
185val get_itemtype_list : t -> string list
186(** [get_itemtype_list elem] returns itemtype URLs as a list. *)
187
188val get_aria_list : string -> t -> string list
189(** [get_aria_list name elem] returns space-separated ARIA values as a list.

 @param name The aria attribute name; presumably given without the
 "aria-" prefix, as for {!get_aria}, but this is not stated here, so
 confirm in the implementation
 @param elem The element to read from *)
190
191(** {1 Raw Attribute Fallback} *)
192
193val get_raw_attr : string -> t -> string option
194(** [get_raw_attr name elem] gets a raw attribute value by name.
195
196 This is useful when the typed representation doesn't capture a specific
197 attribute or when you need the exact original value.
198
 Unlike {!get_aria} and {!get_data}, [name] here is the full attribute
 name. NOTE(review): if the raw attribute list contains the same name
 more than once, which value is returned is not specified here.

199 @param name The attribute name (case-insensitive)
200 @param elem The element
201 @return [Some value] if the attribute exists, [None] otherwise *)
202
203val has_raw_attr : string -> t -> bool
204(** [has_raw_attr name elem] checks if a raw attribute exists.
205
 Boolean companion of {!get_raw_attr}; presumably [true] exactly when
 [get_raw_attr name elem] is [Some _] (confirm in the implementation).

206 @param name The attribute name (case-insensitive)
207 @param elem The element
208 @return [true] if the attribute is present *)
209
210(** {1 Category Checks}
211
212 These predicates check element categories based on the HTML5 content model. *)
213
214val is_void : t -> bool
215(** [is_void elem] checks if this is a void element (cannot have children).
216
 For reference, the HTML Standard's list of void elements is area,
 base, br, col, embed, hr, img, input, link, meta, source, track and
 wbr. NOTE(review): the exact set recognised here is decided by the
 implementation; confirm that it matches.

217 @return [true] for br, hr, img, input, etc. *)
218
219val is_heading : t -> bool
220(** [is_heading elem] checks if this is a heading element.
221
 Going by the two documented contracts, this holds exactly when
 {!heading_level} returns [Some _].

222 @return [true] for h1-h6 *)
223
224val heading_level : t -> int option
225(** [heading_level elem] gets the heading level (1-6) if applicable.
226
 The level is the digit in the tag name, so an [h2] is documented to
 give [Some 2]. Use this rather than {!is_heading} when the rank is
 needed, e.g. to check for skipped heading levels.

227 @return [Some level] for h1-h6, [None] otherwise *)
228
229val is_sectioning : t -> bool
230(** [is_sectioning elem] checks if this is sectioning content.
231
 Not to be confused with {!is_sectioning_root}: the two documented
 element sets have no member in common.

232 @return [true] for article, aside, nav, section *)
233
234val is_sectioning_root : t -> bool
235(** [is_sectioning_root elem] checks if this is a sectioning root.
236
 Not to be confused with {!is_sectioning}: the two documented element
 sets have no member in common.

237 @return [true] for blockquote, body, details, dialog, fieldset, figure, td *)
238
239val is_embedded : t -> bool
240(** [is_embedded elem] checks if this is embedded content.
241
 NOTE(review): the HTML Standard also counts [math] and [svg] as
 embedded content. They are not in the list below, and this interface
 classifies SVG/MathML elements separately (see {!is_svg} and
 {!is_mathml}); confirm how the implementation treats them.

242 @return [true] for audio, canvas, embed, iframe, img, object, picture, video *)
243
244val is_interactive : t -> bool
245(** [is_interactive elem] checks if this is interactive content.
246
 NOTE(review): in the HTML Standard several elements are interactive
 only under an attribute condition (for example [a] with [href], or
 [audio]/[video] with [controls]). This function receives the whole
 element, so it could take attributes into account, but the interface
 does not say whether it does; confirm in the implementation.

247 @return [true] for focusable/activatable elements *)
248
249val is_form_associated : t -> bool
250(** [is_form_associated elem] checks if this is form-associated.
251
 The member elements are not enumerated here. {!is_labelable} and
 {!is_submittable} document narrower, explicit sets.

252 @return [true] for elements that can belong to a form *)
253
254val is_labelable : t -> bool
255(** [is_labelable elem] checks if this can be associated with a label.
256
 NOTE(review): the HTML Standard excludes [input] elements in the
 hidden state from labelable elements; whether the [type] attribute is
 consulted here is not stated, so confirm in the implementation.

257 @return [true] for button, input, meter, output, progress, select, textarea *)
258
259val is_submittable : t -> bool
260(** [is_submittable elem] checks if this is a submittable form element.
261
 Every element in the documented set below also appears in the
 documented set of {!is_labelable}.

262 @return [true] for button, input, select, textarea *)
263
264val is_table_element : t -> bool
265(** [is_table_element elem] checks if this is a table-related element.
266
 NOTE(review): the list below ends in "etc."; the full set (for example
 whether caption, colgroup, col, thead, tbody and tfoot are included)
 is decided by the implementation.

267 @return [true] for table, tr, td, th, etc. *)
268
269val is_media : t -> bool
270(** [is_media elem] checks if this is a media element.
271
 Both elements listed below also appear in the documented set of
 {!is_embedded}.

272 @return [true] for audio, video *)
273
274val is_list_container : t -> bool
275(** [is_list_container elem] checks if this is a list container.
276
 Only the containers are listed below; item elements such as li, dt and
 dd are not part of the documented set.

277 @return [true] for ul, ol, menu, dl *)
278
279val is_transparent : t -> bool
280(** [is_transparent elem] checks if this has a transparent content model.
281
 NOTE(review): the list below names [abbr], whose content model in the
 HTML Standard is phrasing content, not transparent. Check whether the
 implementation really includes it or whether this list is inaccurate.
 The Standard's transparent elements also include object, slot and
 video, which may be what "etc." stands for.

282 @return [true] for a, abbr, audio, canvas, del, ins, map, noscript, etc. *)
283
284val is_phrasing : t -> bool
285(** [is_phrasing elem] checks if this is phrasing content.
286
 The member elements are not enumerated here; "inline-level" is an
 informal description. See the implementation for the exact set.

287 @return [true] for inline-level elements *)
288
289val is_flow : t -> bool
290(** [is_flow elem] checks if this is flow content.
291
 The member elements are not enumerated here; "most body-level
 elements" is an informal description. See the implementation for the
 exact set.

292 @return [true] for most body-level elements *)
293
294val is_obsolete : t -> bool
295(** [is_obsolete elem] checks if this is an obsolete element, i.e. one
 of the legacy elements that the HTML Standard lists as obsolete and
 non-conforming (described informally as "deprecated").
296
 NOTE(review): the list below ends in "etc."; the full set is decided
 by the implementation.

297 @return [true] for applet, font, marquee, etc. *)
298
(* The four predicates in this group cover the non-HTML classifications
   for which {!as_html_tag} is documented to return [None]
   (SVG/MathML/Custom/Unknown). *)
299val is_svg : t -> bool
300(** [is_svg elem] checks if this is an SVG element.
301
302 @return [true] if the element is in the SVG namespace *)
303
304val is_mathml : t -> bool
305(** [is_mathml elem] checks if this is a MathML element.
306
307 @return [true] if the element is in the MathML namespace *)
308
309val is_custom : t -> bool
310(** [is_custom elem] checks if this is a custom element.
311
 NOTE(review): the HTML Standard's definition of a valid custom element
 name is stricter than "contains a hyphen" (it reserves names such as
 [annotation-xml] and [font-face]); confirm whether the implementation
 excludes those.

312 @return [true] if the element name contains a hyphen *)
313
314val is_unknown : t -> bool
315(** [is_unknown elem] checks if this is an unknown element.
316
317 @return [true] if the element is not recognized *)
318
319(** {1 Input Type Utilities} *)
320
321val get_input_type : t -> Attr.input_type option
322(** [get_input_type elem] gets the input type for input elements.
323
 As documented below, [None] covers two cases: the element is not an
 [input], or it is an [input] without a type. No default is substituted
 for a missing type, so callers that want the HTML default must apply
 it themselves.

324 @return [Some type] for input elements with a type, [None] otherwise *)
325
326val get_button_type : t -> Attr.button_type option
327(** [get_button_type elem] gets the button type for button elements.
328
 As documented below, [None] covers two cases: the element is not a
 [button], or it is a [button] without a type. No default is
 substituted for a missing type.

329 @return [Some type] for button elements with a type, [None] otherwise *)
330
331val is_input_type : Attr.input_type -> t -> bool
332(** [is_input_type expected elem] checks if an input has a specific type.
333
 The expected type comes first and the element last, so the predicate
 can be partially applied. Presumably this agrees with
 {!get_input_type}, in which case an [input] with no type attribute
 matches no [expected] value; confirm in the implementation.

334 @param expected The expected input type
335 @param elem The element to check
336 @return [true] if this is an input with the specified type *)
337
338(** {1 Pattern Matching Helpers} *)
339
340val match_html : t -> (Tag.html_tag -> 'a) -> 'a option
341(** [match_html elem f] applies [f] to the HTML tag if present.
342
 Going by the documented contracts, this is {!as_html_tag} followed by
 a map over the option. Note that the element comes first here, unlike
 {!is_html_tag} and {!when_html_tag}, where it follows the expected tag.

 {b Example:}
 {[
 Element.match_html elem (function `Input -> "input" | _ -> "other")
 ]}

343 @param elem The element
344 @param f Function to apply to the HTML tag
345 @return [Some (f tag)] for HTML elements, [None] otherwise *)
346
347val when_html_tag : Tag.html_tag -> t -> (unit -> 'a) -> 'a option
348(** [when_html_tag expected elem f] applies [f] if the element matches.
349
 [f] is a thunk, so whatever it computes is only needed when the tag
 matches. If [f] itself returns an option the result is nested: for
 example [when_html_tag `Input elem (fun () -> Element.get_name elem)]
 has type [string option option].

350 @param expected The expected HTML tag
351 @param elem The element to check
352 @param f Function to call if the element matches
353 @return [Some (f ())] if matched, [None] otherwise *)
354
355(** {1 Internal} *)
356
357val parse_type_attr : Tag.html_tag -> string -> Attr.t
358(** [parse_type_attr tag value] parses a type attribute for an element.
359
360 Different elements have different valid type values. This function
361 handles context-dependent parsing.
362
 This takes a [Tag.html_tag], so it applies to HTML elements only,
 whereas {!parse_attrs_for_tag} accepts any [Tag.element_tag]. The
 result is always an [Attr.t], not an option or result. NOTE(review):
 how an unrecognised [value] is represented is not stated here.

 Listed under "Internal": presumably exposed for use by {!create} and
 for testing rather than as stable API (confirm with the maintainers).

363 @param tag The element's HTML tag
364 @param value The type attribute value
365 @return The parsed attribute variant *)
366
367val parse_attrs_for_tag : Tag.element_tag -> (string * string) list -> Attr.t list
368(** [parse_attrs_for_tag tag raw_attrs] parses attributes with element context.
369
370 The type attribute is parsed differently depending on the element tag.
371
 The input has the same [(name, value)] shape as the [attrs] argument
 of {!create} and the [raw_attrs] field of {!t}. NOTE(review): whether
 the result has one entry per input pair, in the same order, is not
 stated here.

 Listed under "Internal": presumably this is what {!create} uses to
 fill the [attrs] field (confirm in the implementation).

372 @param tag The element's tag
373 @param raw_attrs Raw attribute name-value pairs
374 @return List of typed attributes *)