OCaml HTML5 parser/serialiser based on Python's JustHTML
at main 12 kB view raw
(*---------------------------------------------------------------------------
   Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
   SPDX-License-Identifier: MIT
  ---------------------------------------------------------------------------*)

(** Typed HTML5 element representation.

    This module combines tags and attributes into a complete typed element
    representation. Elements are created from raw input (tag name, namespace,
    attributes) and provide typed accessors for validation and manipulation.

    {2 Design Philosophy}

    An element in this module represents a complete typed view of an HTML
    element, including:

    - The element's tag (typed via {!Tag.element_tag})
    - Typed attributes (via {!Attr.t} list)
    - Raw attributes (for fallback access)

    This dual representation allows checkers to use typed pattern matching
    for common cases while falling back to raw strings when needed.

    {2 Usage Example}

    {[
      let elem = Element.create
        ~name:"input"
        ~namespace:None
        ~attrs:[("type", "email"); ("required", ""); ("class", "form-input")]
      in
      match elem.tag with
      | Tag.Html `Input ->
          if Element.has_required elem then
            (* Validate required input *)
            ()
      | _ -> ()
    ]}

    See {!module:Tag} for element tag types and {!module:Attr} for attribute
    types. *)

(** {1 Element Type} *)

(** A typed HTML element.

    - [tag]: The element's tag classification
    - [attrs]: Typed attributes parsed from raw input
    - [raw_attrs]: Original attribute name-value pairs for fallback *)
type t = {
  tag : Tag.element_tag;
  attrs : Attr.t list;
  raw_attrs : (string * string) list;
}

(** {1 Construction} *)

val create :
  name:string -> namespace:string option -> attrs:(string * string) list -> t
(** [create ~name ~namespace ~attrs] creates a typed element.

    @param name The element's tag name
    @param namespace Optional namespace URI (for SVG/MathML)
    @param attrs Raw attribute name-value pairs
    @return A typed element

    {b Example:}
    {[
      let div = Element.create ~name:"div" ~namespace:None
        ~attrs:[("class", "container"); ("id", "main")]
    ]} *)

(** {1 Tag Accessors} *)

val tag : t -> Tag.element_tag
(** [tag elem] returns the element's tag. *)

val tag_name : t -> string
(** [tag_name elem] returns the element's tag name as a string. *)

val is_html_tag : Tag.html_tag -> t -> bool
(** [is_html_tag expected elem] checks if the element is a specific HTML tag.

    @param expected The expected HTML tag variant
    @param elem The element to check
    @return [true] if the element matches *)

val as_html_tag : t -> Tag.html_tag option
(** [as_html_tag elem] extracts the HTML tag if this is an HTML element.

    @return [Some tag] for HTML elements, [None] for SVG/MathML/Custom/Unknown *)

(** {1 Attribute Accessors} *)

val attrs : t -> Attr.t list
(** [attrs elem] returns the typed attributes. *)

val raw_attrs : t -> (string * string) list
(** [raw_attrs elem] returns the original raw attributes. *)

val get_id : t -> string option
(** [get_id elem] extracts the id attribute value. *)

val get_class : t -> string option
(** [get_class elem] extracts the class attribute value. *)

val get_href : t -> string option
(** [get_href elem] extracts the href attribute value. *)

val get_src : t -> string option
(** [get_src elem] extracts the src attribute value. *)

val get_alt : t -> string option
(** [get_alt elem] extracts the alt attribute value. *)

val get_name : t -> string option
(** [get_name elem] extracts the name attribute value. *)

val get_value : t -> string option
(** [get_value elem] extracts the value attribute value. *)

val get_role : t -> string option
(** [get_role elem] extracts the role attribute value. *)

val get_aria : string -> t -> string option
(** [get_aria name elem] extracts a specific aria-* attribute value.

    @param name The aria attribute name without the "aria-" prefix *)

val get_data : string -> t -> string option
(** [get_data name elem] extracts a specific data-* attribute value.

    @param name The data attribute name without the "data-" prefix *)

val has_disabled : t -> bool
(** [has_disabled elem] checks if the disabled attribute is present. *)

val has_required : t -> bool
(** [has_required elem] checks if the required attribute is present. *)

val has_readonly : t -> bool
(** [has_readonly elem] checks if the readonly attribute is present. *)

val has_checked : t -> bool
(** [has_checked elem] checks if the checked attribute is present. *)

val has_autofocus : t -> bool
(** [has_autofocus elem] checks if the autofocus attribute is present. *)

val has_hidden : t -> bool
(** [has_hidden elem] checks if the hidden attribute is present. *)

val has_inert : t -> bool
(** [has_inert elem] checks if the inert attribute is present. *)

val has_open : t -> bool
(** [has_open elem] checks if the open attribute is present. *)

val get_all_aria : t -> (string * string) list
(** [get_all_aria elem] extracts all aria-* attributes. *)

val get_all_data : t -> (string * string) list
(** [get_all_data elem] extracts all data-* attributes. *)

(** {1 Space-Separated List Accessors}

    These functions return attribute values as parsed lists, splitting on
    whitespace per HTML5 spec. *)

val get_class_list : t -> string list
(** [get_class_list elem] returns class names as a list. *)

val get_rel_list : t -> string list
(** [get_rel_list elem] returns link types as a list. *)

val get_headers_list : t -> string list
(** [get_headers_list elem] returns header IDs as a list (for td/th). *)

val get_itemref_list : t -> string list
(** [get_itemref_list elem] returns itemref IDs as a list. *)

val get_itemprop_list : t -> string list
(** [get_itemprop_list elem] returns itemprop names as a list. *)

val get_itemtype_list : t -> string list
(** [get_itemtype_list elem] returns itemtype URLs as a list. *)

val get_aria_list : string -> t -> string list
(** [get_aria_list name elem] returns space-separated ARIA values as a list. *)

(** {1 Raw Attribute Fallback} *)

val get_raw_attr : string -> t -> string option
(** [get_raw_attr name elem] gets a raw attribute value by name.

    This is useful when the typed representation doesn't capture a specific
    attribute or when you need the exact original value.

    @param name The attribute name (case-insensitive)
    @param elem The element
    @return [Some value] if the attribute exists *)

val has_raw_attr : string -> t -> bool
(** [has_raw_attr name elem] checks if a raw attribute exists.

    @param name The attribute name (case-insensitive)
    @param elem The element
    @return [true] if the attribute is present *)

(** {1 Category Checks}

    These predicates check element categories based on the HTML5 content
    model. *)

val is_void : t -> bool
(** [is_void elem] checks if this is a void element (cannot have children).

    @return [true] for br, hr, img, input, etc. *)

val is_heading : t -> bool
(** [is_heading elem] checks if this is a heading element.

    @return [true] for h1-h6 *)

val heading_level : t -> int option
(** [heading_level elem] gets the heading level (1-6) if applicable.

    @return [Some level] for h1-h6, [None] otherwise *)

val is_sectioning : t -> bool
(** [is_sectioning elem] checks if this is sectioning content.

    @return [true] for article, aside, nav, section *)

val is_sectioning_root : t -> bool
(** [is_sectioning_root elem] checks if this is a sectioning root.

    @return [true] for blockquote, body, details, dialog, fieldset, figure, td *)

val is_embedded : t -> bool
(** [is_embedded elem] checks if this is embedded content.

    @return [true] for audio, canvas, embed, iframe, img, object, picture, video *)

val is_interactive : t -> bool
(** [is_interactive elem] checks if this is interactive content.

    @return [true] for focusable/activatable elements *)

val is_form_associated : t -> bool
(** [is_form_associated elem] checks if this is form-associated.

    @return [true] for elements that can belong to a form *)

val is_labelable : t -> bool
(** [is_labelable elem] checks if this can be associated with a label.

    @return [true] for button, input, meter, output, progress, select, textarea *)

val is_submittable : t -> bool
(** [is_submittable elem] checks if this is a submittable form element.

    @return [true] for button, input, select, textarea *)

val is_table_element : t -> bool
(** [is_table_element elem] checks if this is a table-related element.

    @return [true] for table, tr, td, th, etc. *)

val is_media : t -> bool
(** [is_media elem] checks if this is a media element.

    @return [true] for audio, video *)

val is_list_container : t -> bool
(** [is_list_container elem] checks if this is a list container.

    @return [true] for ul, ol, menu, dl *)

val is_transparent : t -> bool
(** [is_transparent elem] checks if this has a transparent content model.

    @return [true] for a, abbr, audio, canvas, del, ins, map, noscript, etc. *)

(* NOTE(review): the HTML Standard gives [abbr] a phrasing content model, not
   a transparent one. Confirm whether the implementation really classifies it
   as transparent or whether the list documented above contains a typo. *)

val is_phrasing : t -> bool
(** [is_phrasing elem] checks if this is phrasing content.

    @return [true] for inline-level elements *)

val is_flow : t -> bool
(** [is_flow elem] checks if this is flow content.

    @return [true] for most body-level elements *)

val is_obsolete : t -> bool
(** [is_obsolete elem] checks if this is a deprecated element.

    @return [true] for applet, font, marquee, etc. *)

val is_svg : t -> bool
(** [is_svg elem] checks if this is an SVG element.

    @return [true] if the element is in the SVG namespace *)

val is_mathml : t -> bool
(** [is_mathml elem] checks if this is a MathML element.

    @return [true] if the element is in the MathML namespace *)

val is_custom : t -> bool
(** [is_custom elem] checks if this is a custom element.

    @return [true] if the element name contains a hyphen *)

val is_unknown : t -> bool
(** [is_unknown elem] checks if this is an unknown element.

    @return [true] if the element is not recognized *)

(** {1 Input Type Utilities} *)

val get_input_type : t -> Attr.input_type option
(** [get_input_type elem] gets the input type for input elements.

    @return [Some type] for input elements with a type, [None] otherwise *)

val get_button_type : t -> Attr.button_type option
(** [get_button_type elem] gets the button type for button elements.

    @return [Some type] for button elements with a type, [None] otherwise *)

val is_input_type : Attr.input_type -> t -> bool
(** [is_input_type expected elem] checks if an input has a specific type.

    @param expected The expected input type
    @param elem The element to check
    @return [true] if this is an input with the specified type *)

(** {1 Pattern Matching Helpers} *)

val match_html : t -> (Tag.html_tag -> 'a) -> 'a option
(** [match_html elem f] applies [f] to the HTML tag if present.

    @param elem The element
    @param f Function to apply to the HTML tag
    @return [Some (f tag)] for HTML elements, [None] otherwise *)

val when_html_tag : Tag.html_tag -> t -> (unit -> 'a) -> 'a option
(** [when_html_tag expected elem f] applies [f] if the element matches.

    @param expected The expected HTML tag
    @param elem The element to check
    @param f Function to call if the element matches
    @return [Some (f ())] if matched, [None] otherwise *)

(** {1 Internal} *)

val parse_type_attr : Tag.html_tag -> string -> Attr.t
(** [parse_type_attr tag value] parses a type attribute for an element.

    Different elements have different valid type values. This function
    handles context-dependent parsing.

    @param tag The element's HTML tag
    @param value The type attribute value
    @return The parsed attribute variant *)

val parse_attrs_for_tag :
  Tag.element_tag -> (string * string) list -> Attr.t list
(** [parse_attrs_for_tag tag raw_attrs] parses attributes with element context.

    The type attribute is parsed differently depending on the element tag.

    @param tag The element's tag
    @param raw_attrs Raw attribute name-value pairs
    @return List of typed attributes *)