OCaml HTML5 parser/serialiser based on Python's JustHTML
at main 18 kB view raw
(*---------------------------------------------------------------------------
   Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
   SPDX-License-Identifier: MIT
  ---------------------------------------------------------------------------*)

(** Typed HTML5 attribute representations using polymorphic variants.

    Raw attribute name-value pairs are parsed into typed variants, with
    dedicated value types for enumerated attributes, so that attributes can
    be pattern-matched with exhaustiveness checking.

    {2 Design Philosophy}

    The value constraints of HTML5 attributes are encoded in the type system:

    - Boolean attributes: presence means true (e.g., [disabled], [checked])
    - Enumerated attributes: a fixed set of valid values (e.g., [dir], [method])
    - Numeric attributes: integer or float values (e.g., [tabindex], [colspan])
    - URL attributes: string values representing URLs (e.g., [href], [src])
    - Free-form attributes: any string value (e.g., [class], [title])

    {2 Parsing Strategy}

    Attributes are validated while they are parsed:
    - Known attributes become typed variants
    - Invalid values for enumerated attributes fall back to [Unknown_attr]
    - Unknown attribute names are captured as [Unknown_attr]
    - [data-*] and [aria-*] prefixed attributes receive special handling

    @see <https://html.spec.whatwg.org/multipage/dom.html#global-attributes>
    HTML Standard: Global attributes *)

(** {1 Attribute Value Types}

    Each type below enumerates the values accepted by one enumerated HTML
    attribute. *)

(** Values of the [dir] (text direction) attribute. *)
type dir_value = [ `Ltr | `Rtl | `Auto ]

(** Values of the [hidden] attribute. *)
type hidden_value = [ `Hidden | `Until_found ]

(** Values of the [popover] attribute. *)
type popover_value = [ `Auto | `Manual | `Hint ]

(** Values of link [target] attributes; [`Named] carries a target name. *)
type target_value = [ `Self | `Blank | `Parent | `Top | `Named of string ]

(** Loading behaviour for images and other resources. *)
type loading_value = [ `Eager | `Lazy ]

(** Decoding hint for images. *)
type decoding_value = [ `Sync | `Async | `Auto ]

(** Priority hint for fetches. *)
type fetchpriority_value = [ `High | `Low | `Auto ]

(** CORS settings. *)
type crossorigin_value = [ `Anonymous | `Use_credentials ]

(** Preload hint for media. *)
type preload_value = [ `None | `Metadata | `Auto ]

(** Form submission methods. *)
type method_value = [ `Get | `Post | `Dialog ]

(** Form encoding types. *)
type enctype_value = [ `Urlencoded | `Multipart | `Plain ]

(** Wrap modes of a textarea. *)
type wrap_value = [ `Soft | `Hard ]

(** Scopes of a table cell. *)
type scope_value = [ `Row | `Col | `Rowgroup | `Colgroup ]

(** Types of the input element. *)
type input_type = [
  | `Hidden
  | `Text
  | `Search
  | `Tel
  | `Url
  | `Email
  | `Password
  | `Date
  | `Month
  | `Week
  | `Time
  | `Datetime_local
  | `Number
  | `Range
  | `Color
  | `Checkbox
  | `Radio
  | `File
  | `Submit
  | `Image
  | `Reset
  | `Button
]

(** Types of the button element. *)
type button_type = [ `Submit | `Reset | `Button ]

(** Referrer policies. *)
type referrerpolicy_value = [
  | `No_referrer
  | `No_referrer_when_downgrade
  | `Origin
  | `Origin_when_cross_origin
  | `Same_origin
  | `Strict_origin
  | `Strict_origin_when_cross_origin
  | `Unsafe_url
]

(** Individual flags of the iframe [sandbox] attribute. *)
type sandbox_flag = [
  | `Allow_downloads
  | `Allow_forms
  | `Allow_modals
  | `Allow_orientation_lock
  | `Allow_pointer_lock
  | `Allow_popups
  | `Allow_popups_to_escape_sandbox
  | `Allow_presentation
  | `Allow_same_origin
  | `Allow_scripts
  | `Allow_top_navigation
  | `Allow_top_navigation_by_user_activation
  | `Allow_top_navigation_to_custom_protocols
]

(** Enter-key hints for virtual keyboards. *)
type enterkeyhint_value = [
  | `Enter
  | `Done
  | `Go
  | `Next
  | `Previous
  | `Search
  | `Send
]

(** Input-mode hints for virtual keyboards. *)
type inputmode_value = [
  | `None
  | `Text
  | `Decimal
  | `Numeric
  | `Tel
  | `Search
  | `Email
  | `Url
]

(** Values of the [contenteditable] attribute. *)
type contenteditable_value = [ `True | `False | `Plaintext_only ]

(** Values of the [autocapitalize] attribute. *)
type autocapitalize_value = [
  | `Off
  | `None
  | `On
  | `Sentences
  | `Words
  | `Characters
]

(** Shapes of an image map area. *)
type shape_value = [ `Rect | `Circle | `Poly | `Default ]

(** Capture sources for file inputs. *)
type capture_value = [ `User | `Environment ]

(** Numbering types of an ordered list. *)
type list_type_value = [
  | `Decimal
  | `Lower_alpha
  | `Upper_alpha
  | `Lower_roman
  | `Upper_roman
]

(** Kinds of the track element. *)
type kind_value = [
  | `Subtitles
  | `Captions
  | `Descriptions
  | `Chapters
  | `Metadata
]

(** {1 Typed Attribute Variant} *)

(** A single typed attribute.

    The variant covers all HTML5 attributes, each with a payload type suited
    to its value. Tags are arranged in logical groups; attributes whose
    meaning depends on the element have one tag per use (e.g. [`Type_link],
    [`Type_input], [`Type_button], [`Type_script], [`Type_list]). *)
type t = [
  (* Global attributes *)
  | `Id of string
  | `Class of string
  | `Style of string
  | `Title of string
  | `Lang of string
  | `Dir of dir_value
  | `Hidden of hidden_value option
  | `Tabindex of int
  | `Accesskey of string
  | `Autocapitalize of autocapitalize_value
  | `Autofocus
  | `Contenteditable of contenteditable_value option
  | `Draggable of bool
  | `Enterkeyhint of enterkeyhint_value
  | `Inert
  | `Inputmode of inputmode_value
  | `Is of string
  | `Nonce of string
  | `Popover of popover_value option
  | `Slot of string
  | `Spellcheck of bool option
  | `Translate of bool
  | `Exportparts of string
  | `Part of string

  (* Microdata *)
  | `Itemscope
  | `Itemtype of string
  | `Itemprop of string
  | `Itemid of string
  | `Itemref of string

  (* ARIA *)
  | `Role of string
  | `Aria of string * string

  (* Event handlers *)
  | `Event of string * string

  (* Link/navigation attributes *)
  | `Href of string
  | `Target of target_value
  | `Rel of string
  | `Download of string option
  | `Hreflang of string
  | `Ping of string
  | `Referrerpolicy of referrerpolicy_value
  | `Type_link of string

  (* Media/resource attributes *)
  | `Src of string
  | `Srcset of string
  | `Sizes of string
  | `Alt of string
  | `Width of string
  | `Height of string
  | `Loading of loading_value
  | `Decoding of decoding_value
  | `Fetchpriority of fetchpriority_value
  | `Crossorigin of crossorigin_value option
  | `Ismap
  | `Usemap of string
  | `Media of string

  (* Audio/Video specific *)
  | `Controls
  | `Autoplay
  | `Loop
  | `Muted
  | `Preload of preload_value
  | `Poster of string
  | `Playsinline

  (* Image map *)
  | `Coords of string
  | `Shape of shape_value

  (* iframe *)
  | `Sandbox of sandbox_flag list option
  | `Allow of string
  | `Allowfullscreen
  | `Srcdoc of string
  | `Csp of string

  (* Form attributes *)
  | `Action of string
  | `Method of method_value
  | `Enctype of enctype_value
  | `Novalidate
  | `Accept_charset of string
  | `Autocomplete of string
  | `Name of string
  | `Form of string

  (* Form control attributes *)
  | `Value of string
  | `Type_input of input_type
  | `Type_button of button_type
  | `Disabled
  | `Readonly
  | `Required
  | `Checked
  | `Selected
  | `Multiple
  | `Placeholder of string
  | `Min of string
  | `Max of string
  | `Step of string
  | `Minlength of int
  | `Maxlength of int
  | `Pattern of string
  | `Size of int
  | `Cols of int
  | `Rows of int
  | `Wrap of wrap_value
  | `Accept of string
  | `Capture of capture_value
  | `Dirname of string
  | `For of string
  | `List of string

  (* Form submission attributes *)
  | `Formaction of string
  | `Formmethod of method_value
  | `Formenctype of enctype_value
  | `Formnovalidate
  | `Formtarget of target_value

  (* Table attributes *)
  | `Colspan of int
  | `Rowspan of int
  | `Headers of string
  | `Scope of scope_value
  | `Span of int

  (* Details/Dialog *)
  | `Open

  (* Script *)
  | `Async
  | `Defer
  | `Integrity of string
  | `Nomodule
  | `Blocking of string
  | `Type_script of string

  (* Meta *)
  | `Charset of string
  | `Content of string
  | `Http_equiv of string

  (* Link element *)
  | `As of string
  | `Imagesizes of string
  | `Imagesrcset of string

  (* Object/Embed *)
  | `Data_object of string

  (* Output *)
  | `For_output of string

  (* Meter/Progress *)
  | `Low of float
  | `High of float
  | `Optimum of float

  (* Time *)
  | `Datetime of string

  (* Ol *)
  | `Start of int
  | `Reversed
  | `Type_list of list_type_value

  (* Track *)
  | `Kind of kind_value
  | `Srclang of string
  | `Default

  (* Td/Th *)
  | `Abbr of string

  (* Data attributes *)
  | `Data_attr of string * string

  (* RDFa *)
  | `Property of string
  | `Typeof of string
  | `Resource of string
  | `Prefix of string
  | `Vocab of string
  | `About of string
  | `Datatype of string
  | `Inlist
  | `Rev of string

  (* Escape hatch *)
  | `Unknown_attr of string * string
]

(** {1 Parsing Functions} *)

val parse_dir : string -> dir_value option
(** [parse_dir value] reads [value] as a [dir] attribute value. *)

val parse_target : string -> target_value
(** [parse_target value] reads [value] as a [target] attribute value.
    Unlike most parsers in this module, the result is not an option. *)

val parse_loading : string -> loading_value option
(** [parse_loading value] reads [value] as a [loading] attribute value. *)

val parse_decoding : string -> decoding_value option
(** [parse_decoding value] reads [value] as a [decoding] attribute value. *)

val parse_fetchpriority : string -> fetchpriority_value option
(** [parse_fetchpriority value] reads [value] as a [fetchpriority] attribute
    value. *)

val parse_crossorigin : string -> crossorigin_value option
(** [parse_crossorigin value] reads [value] as a [crossorigin] attribute
    value. *)

val parse_preload : string -> preload_value option
(** [parse_preload value] reads [value] as a [preload] attribute value. *)

val parse_method : string -> method_value option
(** [parse_method value] reads [value] as a form [method] attribute value. *)

val parse_enctype : string -> enctype_value option
(** [parse_enctype value] reads [value] as a form [enctype] attribute
    value. *)

val parse_wrap : string -> wrap_value option
(** [parse_wrap value] reads [value] as a textarea [wrap] attribute value. *)

val parse_scope : string -> scope_value option
(** [parse_scope value] reads [value] as a table [scope] attribute value. *)

val parse_input_type : string -> input_type option
(** [parse_input_type value] reads [value] as the [type] of an input
    element. *)

val parse_button_type : string -> button_type option
(** [parse_button_type value] reads [value] as the [type] of a button
    element. *)

val parse_shape : string -> shape_value option
(** [parse_shape value] reads [value] as an area [shape] attribute value. *)

val parse_capture : string -> capture_value option
(** [parse_capture value] reads [value] as an input [capture] attribute
    value. *)

val parse_list_type : string -> list_type_value option
(** [parse_list_type value] reads [value] as the [type] of an ordered
    list. *)

val parse_kind : string -> kind_value option
(** [parse_kind value] reads [value] as a track [kind] attribute value. *)

val parse_referrerpolicy : string -> referrerpolicy_value option
(** [parse_referrerpolicy value] reads [value] as a referrer policy. *)

val parse_sandbox_flag : string -> sandbox_flag option
(** [parse_sandbox_flag value] reads [value] as one sandbox flag token. *)

val parse_sandbox : string -> sandbox_flag list option
(** [parse_sandbox value] reads [value] as a space-separated [sandbox]
    attribute value. *)

val parse_enterkeyhint : string -> enterkeyhint_value option
(** [parse_enterkeyhint value] reads [value] as an [enterkeyhint] attribute
    value. *)

val parse_inputmode : string -> inputmode_value option
(** [parse_inputmode value] reads [value] as an [inputmode] attribute
    value. *)

val parse_contenteditable : string -> contenteditable_value option
(** [parse_contenteditable value] reads [value] as a [contenteditable]
    attribute value. *)

val parse_autocapitalize : string -> autocapitalize_value option
(** [parse_autocapitalize value] reads [value] as an [autocapitalize]
    attribute value. *)

val parse_hidden : string -> hidden_value option
(** [parse_hidden value] reads [value] as a [hidden] attribute value. *)

val parse_popover : string -> popover_value option
(** [parse_popover value] reads [value] as a [popover] attribute value. *)

val parse_int : string -> int option
(** [parse_int value] tries to read an integer from [value]. *)

val parse_float : string -> float option
(** [parse_float value] tries to read a float from [value]. *)

val parse_bool : string -> bool option
(** [parse_bool value] reads [value] as a boolean attribute value. *)

val parse_attr : string -> string -> t
(** [parse_attr name value] turns one attribute name-value pair into its
    typed representation.

    @param name The attribute name
    @param value The attribute value
    @return A typed attribute variant

    {b Example:}
    {[
      parse_attr "class" "container"  (* `Class "container" *)
      parse_attr "disabled" ""        (* `Disabled *)
      parse_attr "data-id" "123"      (* `Data_attr ("id", "123") *)
    ]} *)

val parse_attrs : (string * string) list -> t list
(** [parse_attrs attrs] parses a list of attributes.

    @param attrs List of (name, value) pairs
    @return List of typed attributes *)

(** {1 Accessor Functions} *)

val get_id : t list -> string option
(** [get_id attrs] is the value of the id attribute, if present. *)

val get_class : t list -> string option
(** [get_class attrs] is the class attribute value as a raw string, if
    present. *)

val get_class_list : t list -> string list
(** [get_class_list attrs] is the class attribute split into class names.
    The space-separated value is split; the result is the empty list when
    the attribute is not present. *)

val get_href : t list -> string option
(** [get_href attrs] is the value of the href attribute, if present. *)

val get_src : t list -> string option
(** [get_src attrs] is the value of the src attribute, if present. *)

val get_alt : t list -> string option
(** [get_alt attrs] is the value of the alt attribute, if present. *)

val get_name : t list -> string option
(** [get_name attrs] is the value of the name attribute, if present. *)

val get_value : t list -> string option
(** [get_value attrs] is the value of the value attribute, if present. *)

val get_role : t list -> string option
(** [get_role attrs] is the value of the role attribute, if present. *)

val get_aria : string -> t list -> string option
(** [get_aria name attrs] is the value of one specific aria-* attribute.

    @param name The aria attribute name without the "aria-" prefix *)

val get_data : string -> t list -> string option
(** [get_data name attrs] is the value of one specific data-* attribute.

    @param name The data attribute name without the "data-" prefix *)

val has_disabled : t list -> bool
(** [has_disabled attrs] tells whether the disabled attribute is present. *)

val has_required : t list -> bool
(** [has_required attrs] tells whether the required attribute is present. *)

val has_readonly : t list -> bool
(** [has_readonly attrs] tells whether the readonly attribute is present. *)

val has_checked : t list -> bool
(** [has_checked attrs] tells whether the checked attribute is present. *)

val has_autofocus : t list -> bool
(** [has_autofocus attrs] tells whether the autofocus attribute is
    present. *)

val has_hidden : t list -> bool
(** [has_hidden attrs] tells whether the hidden attribute is present. *)

val has_inert : t list -> bool
(** [has_inert attrs] tells whether the inert attribute is present. *)

val has_open : t list -> bool
(** [has_open attrs] tells whether the open attribute is present. *)

val get_all_aria : t list -> (string * string) list
(** [get_all_aria attrs] collects every aria-* attribute of [attrs]. *)

val get_all_data : t list -> (string * string) list
(** [get_all_data attrs] collects every data-* attribute of [attrs]. *)

(** {2 Space-Separated Attribute List Getters} *)

val get_rel : t list -> string option
(** [get_rel attrs] is the rel attribute value as a raw string, if
    present. *)

val get_rel_list : t list -> string list
(** [get_rel_list attrs] is the rel attribute split into link types.
    The space-separated value is split; the result is the empty list when
    the attribute is not present. *)

val get_headers : t list -> string option
(** [get_headers attrs] is the headers attribute value as a raw string, if
    present. *)

val get_headers_list : t list -> string list
(** [get_headers_list attrs] is the headers attribute split into IDs.
    The space-separated value is split; the result is the empty list when
    the attribute is not present. *)

val get_itemref : t list -> string option
(** [get_itemref attrs] is the itemref attribute value as a raw string, if
    present. *)

val get_itemref_list : t list -> string list
(** [get_itemref_list attrs] is the itemref attribute split into IDs.
    The space-separated value is split; the result is the empty list when
    the attribute is not present. *)

val get_itemprop : t list -> string option
(** [get_itemprop attrs] is the itemprop attribute value as a raw string, if
    present. *)

val get_itemprop_list : t list -> string list
(** [get_itemprop_list attrs] is the itemprop attribute split into property
    names. The space-separated value is split; the result is the empty list
    when the attribute is not present. *)

val get_itemtype : t list -> string option
(** [get_itemtype attrs] is the itemtype attribute value as a raw string, if
    present. *)

val get_itemtype_list : t list -> string list
(** [get_itemtype_list attrs] is the itemtype attribute split into URLs.
    The space-separated value is split; the result is the empty list when
    the attribute is not present. *)

val get_aria_list : string -> t list -> string list
(** [get_aria_list name attrs] is one specific aria-* attribute as a list.
    Useful for space-separated aria values like aria-labelledby,
    aria-describedby. The result is the empty list when the attribute is not
    present. *)

val find : (t -> 'a option) -> t list -> 'a option
(** [find f attrs] finds the first attribute of [attrs] selected by [f].
    As the type shows, [f] is an option-returning selector rather than a
    boolean predicate; presumably the result is the first [Some _] that [f]
    produces, and [None] when no attribute is selected. *)

val exists : (t -> bool) -> t list -> bool
(** [exists f attrs] tells whether some attribute of [attrs] satisfies the
    predicate [f]. *)

val filter : (t -> bool) -> t list -> t list
(** [filter f attrs] keeps the attributes of [attrs] that satisfy the
    predicate [f]. *)