(* OCaml HTML5 parser/serialiser based on Python's JustHTML *)
(*---------------------------------------------------------------------------
   Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
   SPDX-License-Identifier: MIT
  ---------------------------------------------------------------------------*)

(** Typed HTML5 attribute representations using polymorphic variants.

    This module provides typed representations for HTML attributes with
    proper value types for enumerated attributes. Parsing raw attribute
    name-value pairs produces typed variants that can be pattern-matched
    with exhaustiveness checking.

    {2 Design Philosophy}

    HTML5 attributes have specific value constraints that this module
    encodes in the type system:

    - Boolean attributes: Present means true (e.g., [disabled], [checked])
    - Enumerated attributes: Fixed set of valid values (e.g., [dir], [method])
    - Numeric attributes: Integer or float values (e.g., [tabindex], [colspan])
    - URL attributes: String values representing URLs (e.g., [href], [src])
    - Free-form attributes: Any string value (e.g., [class], [title])

    {2 Parsing Strategy}

    Attributes are parsed with validation:
    - Known attributes are parsed into typed variants
    - Invalid values for enumerated attributes fall back to [Unknown_attr]
    - Unknown attribute names are captured as [Unknown_attr]
    - Special handling for [data-*] and [aria-*] prefixed attributes

    @see <https://html.spec.whatwg.org/multipage/dom.html#global-attributes>
    HTML Standard: Global attributes
*)

(** {1 Attribute Value Types}

    These types represent the valid values for enumerated HTML attributes.
    Each is a closed polymorphic variant, so a [match] over it is checked
    for exhaustiveness. *)

(** Values of the [dir] (text direction) attribute. *)
type dir_value = [ `Ltr | `Rtl | `Auto ]

(** Keyword values of the [hidden] attribute. The [`Hidden] constructor of
    {!t} carries this type wrapped in an [option]. *)
type hidden_value = [ `Hidden | `Until_found ]

(** Keyword values of the [popover] attribute. The [`Popover] constructor of
    {!t} carries this type wrapped in an [option]. *)
type popover_value = [ `Auto | `Manual | `Hint ]

(** Values of the [target] attribute: the four keyword targets, or
    [`Named name] for any other browsing-context name. *)
type target_value = [ `Self | `Blank | `Parent | `Top | `Named of string ]

(** Values of the [loading] attribute (image/resource loading behavior). *)
type loading_value = [ `Eager | `Lazy ]

(** Values of the [decoding] attribute (image decoding hint). *)
type decoding_value = [ `Sync | `Async | `Auto ]

(** Values of the [fetchpriority] attribute (fetch priority hint). *)
type fetchpriority_value = [ `High | `Low | `Auto ]

(** Keyword values of the [crossorigin] attribute (CORS settings). The
    [`Crossorigin] constructor of {!t} carries this type wrapped in an
    [option]. *)
type crossorigin_value = [ `Anonymous | `Use_credentials ]

(** Values of the [preload] attribute (media preload hint). *)
type preload_value = [ `None | `Metadata | `Auto ]

(** Form submission method values. Used by both [`Method] and
    [`Formmethod] in {!t}. *)
type method_value = [ `Get | `Post | `Dialog ]

(** Form encoding type values. Used by both [`Enctype] and [`Formenctype]
    in {!t}. *)
type enctype_value = [ `Urlencoded | `Multipart | `Plain ]

(** Values of the textarea [wrap] attribute. *)
type wrap_value = [ `Soft | `Hard ]

(** Values of the table cell [scope] attribute. *)
type scope_value = [ `Row | `Col | `Rowgroup | `Colgroup ]

(** Values of the [type] attribute on input elements. *)
type input_type = [
  | `Hidden | `Text | `Search | `Tel | `Url | `Email | `Password
  | `Date | `Month | `Week | `Time | `Datetime_local | `Number
  | `Range | `Color | `Checkbox | `Radio | `File | `Submit
  | `Image | `Reset | `Button
]

(** Values of the [type] attribute on button elements. Every tag here also
    appears in {!input_type}, so a [button_type] can be coerced to an
    [input_type] with [:>]. *)
type button_type = [ `Submit | `Reset | `Button ]

(** Values of the [referrerpolicy] attribute. *)
type referrerpolicy_value = [
  | `No_referrer | `No_referrer_when_downgrade | `Origin
  | `Origin_when_cross_origin | `Same_origin | `Strict_origin
  | `Strict_origin_when_cross_origin | `Unsafe_url
]

(** A single iframe [sandbox] flag. The attribute as a whole is a list of
    these (see the [`Sandbox] constructor of {!t}). *)
type sandbox_flag = [
  | `Allow_downloads | `Allow_forms | `Allow_modals | `Allow_orientation_lock
  | `Allow_pointer_lock | `Allow_popups | `Allow_popups_to_escape_sandbox
  | `Allow_presentation | `Allow_same_origin | `Allow_scripts
  | `Allow_top_navigation | `Allow_top_navigation_by_user_activation
  | `Allow_top_navigation_to_custom_protocols
]

(** Values of the [enterkeyhint] attribute (enter-key label hint for
    virtual keyboards). *)
type enterkeyhint_value = [
  | `Enter | `Done | `Go | `Next | `Previous | `Search | `Send
]

(** Values of the [inputmode] attribute (input mode hint for virtual
    keyboards). *)
type inputmode_value = [
  | `None | `Text | `Decimal | `Numeric | `Tel | `Search | `Email | `Url
]

(** Keyword values of the [contenteditable] attribute. The
    [`Contenteditable] constructor of {!t} carries this type wrapped in an
    [option]. *)
type contenteditable_value = [ `True | `False | `Plaintext_only ]

(** Values of the [autocapitalize] attribute. *)
type autocapitalize_value = [
  | `Off | `None | `On | `Sentences | `Words | `Characters
]

(** Values of the image map [shape] attribute. *)
type shape_value = [ `Rect | `Circle | `Poly | `Default ]

(** Values of the [capture] attribute on file inputs. *)
type capture_value = [ `User | `Environment ]

(** Values of the ordered list [type] attribute (list marker style). *)
type list_type_value = [
  | `Decimal | `Lower_alpha | `Upper_alpha | `Lower_roman | `Upper_roman
]

(** Values of the track element [kind] attribute. *)
type kind_value = [
  | `Subtitles | `Captions | `Descriptions | `Chapters | `Metadata
]

(** {1 Typed Attribute Variant} *)

(** Typed attribute representation.

    This type covers all HTML5 attributes with appropriate value types.
    Attributes are organized into logical groups.

    Conventions visible in the constructors below:
    - a constructor without payload is a boolean attribute (presence only);
    - a payload of an enumerated [*_value] type is a validated keyword;
    - an [option] payload leaves room for the attribute appearing without a
      recognised keyword (NOTE(review): the exact meaning of [None] is
      decided by the implementation -- confirm there);
    - [`Unknown_attr (name, value)] is the fallback for everything else.

    NOTE(review): several attributes are split into element-specific
    constructors ([`Type_link], [`Type_input], [`Type_button],
    [`Type_script], [`Type_list]; [`For] / [`For_output]), yet
    [parse_attr] receives only a name and a value. How the parser chooses
    among them is not visible in this interface. *)
type t = [
  (* Global attributes *)
  | `Id of string
  | `Class of string
  | `Style of string
  | `Title of string
  | `Lang of string
  | `Dir of dir_value
  | `Hidden of hidden_value option
  | `Tabindex of int
  | `Accesskey of string
  | `Autocapitalize of autocapitalize_value
  | `Autofocus
  | `Contenteditable of contenteditable_value option
  | `Draggable of bool
  | `Enterkeyhint of enterkeyhint_value
  | `Inert
  | `Inputmode of inputmode_value
  | `Is of string
  | `Nonce of string
  | `Popover of popover_value option
  | `Slot of string
  | `Spellcheck of bool option
  | `Translate of bool
  | `Exportparts of string
  | `Part of string

  (* Microdata *)
  | `Itemscope
  | `Itemtype of string
  | `Itemprop of string
  | `Itemid of string
  | `Itemref of string

  (* ARIA: [`Aria (name, value)]; presumably [name] omits the "aria-"
     prefix, as the [get_aria] accessor expects -- confirm in parse_attr *)
  | `Role of string
  | `Aria of string * string

  (* Event handlers: a pair of strings, presumably (name, handler source) *)
  | `Event of string * string

  (* Link/navigation attributes *)
  | `Href of string
  | `Target of target_value
  | `Rel of string
  | `Download of string option
  | `Hreflang of string
  | `Ping of string
  | `Referrerpolicy of referrerpolicy_value
  | `Type_link of string

  (* Media/resource attributes *)
  | `Src of string
  | `Srcset of string
  | `Sizes of string
  | `Alt of string
  | `Width of string
  | `Height of string
  | `Loading of loading_value
  | `Decoding of decoding_value
  | `Fetchpriority of fetchpriority_value
  | `Crossorigin of crossorigin_value option
  | `Ismap
  | `Usemap of string
  | `Media of string

  (* Audio/Video specific *)
  | `Controls
  | `Autoplay
  | `Loop
  | `Muted
  | `Preload of preload_value
  | `Poster of string
  | `Playsinline

  (* Image map *)
  | `Coords of string
  | `Shape of shape_value

  (* iframe *)
  | `Sandbox of sandbox_flag list option
  | `Allow of string
  | `Allowfullscreen
  | `Srcdoc of string
  | `Csp of string

  (* Form attributes *)
  | `Action of string
  | `Method of method_value
  | `Enctype of enctype_value
  | `Novalidate
  | `Accept_charset of string
  | `Autocomplete of string
  | `Name of string
  | `Form of string

  (* Form control attributes *)
  | `Value of string
  | `Type_input of input_type
  | `Type_button of button_type
  | `Disabled
  | `Readonly
  | `Required
  | `Checked
  | `Selected
  | `Multiple
  | `Placeholder of string
  | `Min of string
  | `Max of string
  | `Step of string
  | `Minlength of int
  | `Maxlength of int
  | `Pattern of string
  | `Size of int
  | `Cols of int
  | `Rows of int
  | `Wrap of wrap_value
  | `Accept of string
  | `Capture of capture_value
  | `Dirname of string
  | `For of string
  | `List of string

  (* Form submission attributes *)
  | `Formaction of string
  | `Formmethod of method_value
  | `Formenctype of enctype_value
  | `Formnovalidate
  | `Formtarget of target_value

  (* Table attributes *)
  | `Colspan of int
  | `Rowspan of int
  | `Headers of string
  | `Scope of scope_value
  | `Span of int

  (* Details/Dialog *)
  | `Open

  (* Script *)
  | `Async
  | `Defer
  | `Integrity of string
  | `Nomodule
  | `Blocking of string
  | `Type_script of string

  (* Meta *)
  | `Charset of string
  | `Content of string
  | `Http_equiv of string

  (* Link element *)
  | `As of string
  | `Imagesizes of string
  | `Imagesrcset of string

  (* Object/Embed *)
  | `Data_object of string

  (* Output *)
  | `For_output of string

  (* Meter/Progress *)
  | `Low of float
  | `High of float
  | `Optimum of float

  (* Time *)
  | `Datetime of string

  (* Ol *)
  | `Start of int
  | `Reversed
  | `Type_list of list_type_value

  (* Track *)
  | `Kind of kind_value
  | `Srclang of string
  | `Default

  (* Td/Th *)
  | `Abbr of string

  (* Data attributes: [`Data_attr (name, value)] where [name] omits the
     "data-" prefix, e.g. data-id="123" is [`Data_attr ("id", "123")] *)
  | `Data_attr of string * string

  (* RDFa *)
  | `Property of string
  | `Typeof of string
  | `Resource of string
  | `Prefix of string
  | `Vocab of string
  | `About of string
  | `Datatype of string
  | `Inlist
  | `Rev of string

  (* Escape hatch: [`Unknown_attr (name, value)] *)
  | `Unknown_attr of string * string
]

(** {1 Parsing Functions}

    One parser per enumerated attribute value type. Most return an
    [option]; NOTE(review): presumably [None] means the string is not a
    recognised keyword, and case/whitespace handling is decided by the
    implementation -- neither is visible in this interface. *)

val parse_dir : string -> dir_value option
(** [parse_dir value] parses a [dir] (direction) attribute value into a
    {!dir_value}. *)

val parse_target : string -> target_value
(** [parse_target value] parses a [target] attribute value into a
    {!target_value}. Unlike the other parsers this one is total: the result
    is not an [option], since {!target_value} has a [`Named] case carrying
    an arbitrary string. *)

val parse_loading : string -> loading_value option
(** [parse_loading value] parses a [loading] attribute value into a
    {!loading_value}. *)

val parse_decoding : string -> decoding_value option
(** [parse_decoding value] parses a [decoding] attribute value into a
    {!decoding_value}. *)

val parse_fetchpriority : string -> fetchpriority_value option
(** [parse_fetchpriority value] parses a [fetchpriority] attribute value
    into a {!fetchpriority_value}. *)

val parse_crossorigin : string -> crossorigin_value option
(** [parse_crossorigin value] parses a [crossorigin] attribute value into a
    {!crossorigin_value}. *)

val parse_preload : string -> preload_value option
(** [parse_preload value] parses a [preload] attribute value into a
    {!preload_value}. *)

val parse_method : string -> method_value option
(** [parse_method value] parses a form [method] attribute value into a
    {!method_value}. *)

val parse_enctype : string -> enctype_value option
(** [parse_enctype value] parses a form [enctype] attribute value into an
    {!enctype_value}. *)

val parse_wrap : string -> wrap_value option
(** [parse_wrap value] parses a textarea [wrap] attribute value into a
    {!wrap_value}. *)

val parse_scope : string -> scope_value option
(** [parse_scope value] parses a table [scope] attribute value into a
    {!scope_value}. *)

val parse_input_type : string -> input_type option
(** [parse_input_type value] parses an input [type] attribute value into an
    {!input_type}. *)

val parse_button_type : string -> button_type option
(** [parse_button_type value] parses a button [type] attribute value into a
    {!button_type}. *)

val parse_shape : string -> shape_value option
(** [parse_shape value] parses an area [shape] attribute value into a
    {!shape_value}. *)

val parse_capture : string -> capture_value option
(** [parse_capture value] parses an input [capture] attribute value into a
    {!capture_value}. *)

val parse_list_type : string -> list_type_value option
(** [parse_list_type value] parses an ordered list [type] attribute value
    into a {!list_type_value}. *)

val parse_kind : string -> kind_value option
(** [parse_kind value] parses a track [kind] attribute value into a
    {!kind_value}. *)

val parse_referrerpolicy : string -> referrerpolicy_value option
(** [parse_referrerpolicy value] parses a [referrerpolicy] attribute value
    into a {!referrerpolicy_value}. *)

val parse_sandbox_flag : string -> sandbox_flag option
(** [parse_sandbox_flag value] parses a single sandbox flag token into a
    {!sandbox_flag}. *)

val parse_sandbox : string -> sandbox_flag list option
(** [parse_sandbox value] parses a space-separated [sandbox] attribute value
    into a list of {!sandbox_flag}s.

    NOTE(review): when the result is [None] (for example empty input, or
    any unrecognised token) is not visible in this interface -- confirm
    against the implementation. *)

val parse_enterkeyhint : string -> enterkeyhint_value option
(** [parse_enterkeyhint value] parses an [enterkeyhint] attribute value into
    an {!enterkeyhint_value}. *)

val parse_inputmode : string -> inputmode_value option
(** [parse_inputmode value] parses an [inputmode] attribute value into an
    {!inputmode_value}. *)

val parse_contenteditable : string -> contenteditable_value option
(** [parse_contenteditable value] parses a [contenteditable] attribute value
    into a {!contenteditable_value}. *)

val parse_autocapitalize : string -> autocapitalize_value option
(** [parse_autocapitalize value] parses an [autocapitalize] attribute value
    into an {!autocapitalize_value}. *)

val parse_hidden : string -> hidden_value option
(** [parse_hidden value] parses a [hidden] attribute value into a
    {!hidden_value}. *)

val parse_popover : string -> popover_value option
(** [parse_popover value] parses a [popover] attribute value into a
    {!popover_value}. *)

val parse_int : string -> int option
(** [parse_int value] attempts to parse an integer from a string; the
    [option] result covers strings that cannot be parsed. *)

val parse_float : string -> float option
(** [parse_float value] attempts to parse a float from a string; the
    [option] result covers strings that cannot be parsed. *)

val parse_bool : string -> bool option
(** [parse_bool value] parses a boolean attribute value.

    NOTE(review): the accepted spellings are defined by the implementation
    and are not visible in this interface. *)

val parse_attr : string -> string -> t
(** [parse_attr name value] parses a single attribute name-value pair.

    The function is total: per the module's parsing strategy, unknown names
    and invalid enumerated values are returned as [`Unknown_attr].

    @param name The attribute name
    @param value The attribute value
    @return A typed attribute variant

    {b Example:}
    {[
      parse_attr "class" "container"  (* `Class "container" *)
      parse_attr "disabled" ""        (* `Disabled *)
      parse_attr "data-id" "123"      (* `Data_attr ("id", "123") *)
    ]} *)

val parse_attrs : (string * string) list -> t list
(** [parse_attrs attrs] parses multiple attributes.

    @param attrs List of (name, value) pairs
    @return List of typed attributes *)

(** {1 Accessor Functions}

    Lookups over a list of typed attributes. The [option]-returning getters
    yield [None] when the attribute is not present. *)

val get_id : t list -> string option
(** [get_id attrs] extracts the [id] attribute value if present. *)

val get_class : t list -> string option
(** [get_class attrs] extracts the [class] attribute value as a raw,
    unsplit string. See {!get_class_list} for the split form. *)

val get_class_list : t list -> string list
(** [get_class_list attrs] extracts the [class] attribute as a list of class
    names. Returns empty list if not present. Space-separated values are
    split. *)

val get_href : t list -> string option
(** [get_href attrs] extracts the [href] attribute value if present. *)

val get_src : t list -> string option
(** [get_src attrs] extracts the [src] attribute value if present. *)

val get_alt : t list -> string option
(** [get_alt attrs] extracts the [alt] attribute value if present. *)

val get_name : t list -> string option
(** [get_name attrs] extracts the [name] attribute value if present. *)

val get_value : t list -> string option
(** [get_value attrs] extracts the [value] attribute value if present. *)

val get_role : t list -> string option
(** [get_role attrs] extracts the [role] attribute value if present. *)

val get_aria : string -> t list -> string option
(** [get_aria name attrs] extracts a specific [aria-*] attribute value.

    @param name The aria attribute name without the "aria-" prefix *)

val get_data : string -> t list -> string option
(** [get_data name attrs] extracts a specific [data-*] attribute value.

    @param name The data attribute name without the "data-" prefix *)

val has_disabled : t list -> bool
(** [has_disabled attrs] checks if the [disabled] attribute is present. *)

val has_required : t list -> bool
(** [has_required attrs] checks if the [required] attribute is present. *)

val has_readonly : t list -> bool
(** [has_readonly attrs] checks if the [readonly] attribute is present. *)

val has_checked : t list -> bool
(** [has_checked attrs] checks if the [checked] attribute is present. *)

val has_autofocus : t list -> bool
(** [has_autofocus attrs] checks if the [autofocus] attribute is present. *)

val has_hidden : t list -> bool
(** [has_hidden attrs] checks if the [hidden] attribute is present. *)

val has_inert : t list -> bool
(** [has_inert attrs] checks if the [inert] attribute is present. *)

val has_open : t list -> bool
(** [has_open attrs] checks if the [open] attribute is present. *)

val get_all_aria : t list -> (string * string) list
(** [get_all_aria attrs] extracts all [aria-*] attributes as a list of
    pairs of strings. *)

val get_all_data : t list -> (string * string) list
(** [get_all_data attrs] extracts all [data-*] attributes as a list of
    pairs of strings. *)

(** {2 Space-Separated Attribute List Getters}

    Each attribute here has two getters: one returning the raw string and a
    [_list] variant returning the space-separated tokens. *)

val get_rel : t list -> string option
(** [get_rel attrs] extracts the [rel] attribute value as a raw string. *)

val get_rel_list : t list -> string list
(** [get_rel_list attrs] extracts the [rel] attribute as a list of link
    types. Returns empty list if not present. Space-separated values are
    split. *)

val get_headers : t list -> string option
(** [get_headers attrs] extracts the [headers] attribute value as a raw
    string. *)

val get_headers_list : t list -> string list
(** [get_headers_list attrs] extracts the [headers] attribute as a list of
    IDs. Returns empty list if not present. Space-separated values are
    split. *)

val get_itemref : t list -> string option
(** [get_itemref attrs] extracts the [itemref] attribute value as a raw
    string. *)

val get_itemref_list : t list -> string list
(** [get_itemref_list attrs] extracts the [itemref] attribute as a list of
    IDs. Returns empty list if not present. Space-separated values are
    split. *)

val get_itemprop : t list -> string option
(** [get_itemprop attrs] extracts the [itemprop] attribute value as a raw
    string. *)

val get_itemprop_list : t list -> string list
(** [get_itemprop_list attrs] extracts the [itemprop] attribute as a list of
    property names. Returns empty list if not present. Space-separated
    values are split. *)

val get_itemtype : t list -> string option
(** [get_itemtype attrs] extracts the [itemtype] attribute value as a raw
    string. *)

val get_itemtype_list : t list -> string list
(** [get_itemtype_list attrs] extracts the [itemtype] attribute as a list of
    URLs. Returns empty list if not present. Space-separated values are
    split. *)

val get_aria_list : string -> t list -> string list
(** [get_aria_list name attrs] extracts a specific [aria-*] attribute as a
    list. Useful for space-separated aria values like [aria-labelledby],
    [aria-describedby]. Returns empty list if not present.

    NOTE(review): [name] presumably omits the "aria-" prefix, as it does
    for [get_aria] -- confirm against the implementation. *)

val find : (t -> 'a option) -> t list -> 'a option
(** [find f attrs] returns the first [Some _] result obtained by applying
    [f] to the attributes in [attrs], or [None] if there is none. Note that
    [f] is a partial projection ([t -> 'a option]), not a boolean
    predicate. *)

val exists : (t -> bool) -> t list -> bool
(** [exists f attrs] checks if any attribute matches predicate [f]. *)

val filter : (t -> bool) -> t list -> t list
(** [filter f attrs] filters attributes matching predicate [f]. *)