OCaml HTML5 parser/serialiser based on Python's JustHTML
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

at f43716550893e38b395c9f53b53036796ad853b5 348 lines 11 kB view raw
(*---------------------------------------------------------------------------
   Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
   SPDX-License-Identifier: MIT
  ---------------------------------------------------------------------------*)

(** Typed HTML5 element representation.

    This module combines tags and attributes into a complete typed element
    representation. Elements are created from raw input (tag name, namespace,
    attributes) and provide typed accessors for validation and manipulation.

    {2 Design Philosophy}

    An element in this module represents a complete typed view of an HTML
    element, including:

    - The element's tag (typed via {!Tag.element_tag})
    - Typed attributes (via {!Attr.t} list)
    - Raw attributes (for fallback access)

    This dual representation allows checkers to use typed pattern matching
    for common cases while falling back to raw strings when needed.

    {2 Usage Example}

    {[
      let elem = Element.create
        ~name:"input"
        ~namespace:None
        ~attrs:[("type", "email"); ("required", ""); ("class", "form-input")]
      in
      match elem.tag with
      | Tag.Html `Input ->
          if Element.has_required elem then
            (* Validate required input *)
            ()
      | _ -> ()
    ]}

    See {!Tag} for element tag types and {!Attr} for attribute types. *)

(** {1 Element Type} *)

(** A typed HTML element. *)
type t = {
  tag : Tag.element_tag;
      (** The element's tag classification. *)
  attrs : Attr.t list;
      (** Typed attributes parsed from raw input. *)
  raw_attrs : (string * string) list;
      (** Original attribute name-value pairs, kept for fallback access. *)
}

(** {1 Construction} *)

val create :
  name:string -> namespace:string option -> attrs:(string * string) list -> t
(** [create ~name ~namespace ~attrs] creates a typed element.

    @param name The element's tag name
    @param namespace Optional namespace URI (for SVG/MathML)
    @param attrs Raw attribute name-value pairs
    @return A typed element

    {b Example:}
    {[
      let div = Element.create ~name:"div" ~namespace:None
        ~attrs:[("class", "container"); ("id", "main")]
    ]} *)

(** {1 Tag Accessors} *)

val tag : t -> Tag.element_tag
(** [tag elem] returns the element's tag. *)

val tag_name : t -> string
(** [tag_name elem] returns the element's tag name as a string. *)

val is_html_tag : Tag.html_tag -> t -> bool
(** [is_html_tag expected elem] checks if the element is a specific HTML tag.

    @param expected The expected HTML tag variant
    @param elem The element to check
    @return [true] if the element matches *)

val as_html_tag : t -> Tag.html_tag option
(** [as_html_tag elem] extracts the HTML tag if this is an HTML element.

    @return [Some tag] for HTML elements, [None] for SVG/MathML/Custom/Unknown *)

(** {1 Attribute Accessors} *)

val attrs : t -> Attr.t list
(** [attrs elem] returns the typed attributes. *)

val raw_attrs : t -> (string * string) list
(** [raw_attrs elem] returns the original raw attributes. *)

val get_id : t -> string option
(** [get_id elem] extracts the [id] attribute value. *)

val get_class : t -> string option
(** [get_class elem] extracts the [class] attribute value. *)

val get_href : t -> string option
(** [get_href elem] extracts the [href] attribute value. *)

val get_src : t -> string option
(** [get_src elem] extracts the [src] attribute value. *)

val get_alt : t -> string option
(** [get_alt elem] extracts the [alt] attribute value. *)

val get_name : t -> string option
(** [get_name elem] extracts the [name] attribute value. *)

val get_value : t -> string option
(** [get_value elem] extracts the [value] attribute value. *)

val get_role : t -> string option
(** [get_role elem] extracts the [role] attribute value. *)

val get_aria : string -> t -> string option
(** [get_aria name elem] extracts a specific [aria-*] attribute value.

    For example, [get_aria "label" elem] looks up [aria-label].

    @param name The aria attribute name without the "aria-" prefix *)

val get_data : string -> t -> string option
(** [get_data name elem] extracts a specific [data-*] attribute value.

    For example, [get_data "id" elem] looks up [data-id].

    @param name The data attribute name without the "data-" prefix *)

val has_disabled : t -> bool
(** [has_disabled elem] checks if the [disabled] attribute is present. *)

val has_required : t -> bool
(** [has_required elem] checks if the [required] attribute is present. *)

val has_readonly : t -> bool
(** [has_readonly elem] checks if the [readonly] attribute is present. *)

val has_checked : t -> bool
(** [has_checked elem] checks if the [checked] attribute is present. *)

val has_autofocus : t -> bool
(** [has_autofocus elem] checks if the [autofocus] attribute is present. *)

val has_hidden : t -> bool
(** [has_hidden elem] checks if the [hidden] attribute is present. *)

val has_inert : t -> bool
(** [has_inert elem] checks if the [inert] attribute is present. *)

val has_open : t -> bool
(** [has_open elem] checks if the [open] attribute is present. *)

val get_all_aria : t -> (string * string) list
(** [get_all_aria elem] extracts all [aria-*] attributes. *)

val get_all_data : t -> (string * string) list
(** [get_all_data elem] extracts all [data-*] attributes. *)

(** {1 Raw Attribute Fallback} *)

val get_raw_attr : string -> t -> string option
(** [get_raw_attr name elem] gets a raw attribute value by name.

    This is useful when the typed representation doesn't capture a specific
    attribute or when you need the exact original value.

    @param name The attribute name (case-insensitive)
    @param elem The element
    @return [Some value] if the attribute exists *)

val has_raw_attr : string -> t -> bool
(** [has_raw_attr name elem] checks if a raw attribute exists.

    @param name The attribute name (case-insensitive)
    @param elem The element
    @return [true] if the attribute is present *)

(** {1 Category Checks}

    These predicates check element categories based on the HTML5 content
    model. *)

val is_void : t -> bool
(** [is_void elem] checks if this is a void element (cannot have children).

    @return [true] for br, hr, img, input, etc. *)

val is_heading : t -> bool
(** [is_heading elem] checks if this is a heading element.

    @return [true] for h1-h6 *)

val heading_level : t -> int option
(** [heading_level elem] gets the heading level (1-6) if applicable.

    @return [Some level] for h1-h6, [None] otherwise *)

val is_sectioning : t -> bool
(** [is_sectioning elem] checks if this is sectioning content.

    @return [true] for article, aside, nav, section *)

val is_sectioning_root : t -> bool
(** [is_sectioning_root elem] checks if this is a sectioning root.

    @return [true] for blockquote, body, details, dialog, fieldset, figure, td *)

val is_embedded : t -> bool
(** [is_embedded elem] checks if this is embedded content.

    @return [true] for audio, canvas, embed, iframe, img, object, picture, video *)

val is_interactive : t -> bool
(** [is_interactive elem] checks if this is interactive content.

    @return [true] for focusable/activatable elements *)

val is_form_associated : t -> bool
(** [is_form_associated elem] checks if this is form-associated.

    @return [true] for elements that can belong to a form *)

val is_labelable : t -> bool
(** [is_labelable elem] checks if this can be associated with a label.

    @return [true] for button, input, meter, output, progress, select, textarea *)

val is_submittable : t -> bool
(** [is_submittable elem] checks if this is a submittable form element.

    @return [true] for button, input, select, textarea *)

val is_table_element : t -> bool
(** [is_table_element elem] checks if this is a table-related element.

    @return [true] for table, tr, td, th, etc. *)

val is_media : t -> bool
(** [is_media elem] checks if this is a media element.

    @return [true] for audio, video *)

val is_list_container : t -> bool
(** [is_list_container elem] checks if this is a list container.

    @return [true] for ul, ol, menu, dl *)

val is_transparent : t -> bool
(** [is_transparent elem] checks if this has a transparent content model.

    @return [true] for a, abbr, audio, canvas, del, ins, map, noscript, etc. *)
(* NOTE(review): [abbr] is not listed as having a transparent content model in
   the HTML Standard; confirm the element list above against the
   implementation. *)

val is_phrasing : t -> bool
(** [is_phrasing elem] checks if this is phrasing content.

    @return [true] for inline-level elements *)

val is_flow : t -> bool
(** [is_flow elem] checks if this is flow content.

    @return [true] for most body-level elements *)

val is_obsolete : t -> bool
(** [is_obsolete elem] checks if this is a deprecated element.

    @return [true] for applet, font, marquee, etc. *)

val is_svg : t -> bool
(** [is_svg elem] checks if this is an SVG element.

    @return [true] if the element is in the SVG namespace *)

val is_mathml : t -> bool
(** [is_mathml elem] checks if this is a MathML element.

    @return [true] if the element is in the MathML namespace *)

val is_custom : t -> bool
(** [is_custom elem] checks if this is a custom element.

    @return [true] if the element name contains a hyphen *)

val is_unknown : t -> bool
(** [is_unknown elem] checks if this is an unknown element.

    @return [true] if the element is not recognized *)

(** {1 Input Type Utilities} *)

val get_input_type : t -> Attr.input_type option
(** [get_input_type elem] gets the input type for input elements.

    @return [Some type] for input elements with a type, [None] otherwise *)

val get_button_type : t -> Attr.button_type option
(** [get_button_type elem] gets the button type for button elements.

    @return [Some type] for button elements with a type, [None] otherwise *)

val is_input_type : Attr.input_type -> t -> bool
(** [is_input_type expected elem] checks if an input has a specific type.

    @param expected The expected input type
    @param elem The element to check
    @return [true] if this is an input with the specified type *)

(** {1 Pattern Matching Helpers} *)

val match_html : t -> (Tag.html_tag -> 'a) -> 'a option
(** [match_html elem f] applies [f] to the HTML tag if present.

    @param elem The element
    @param f Function to apply to the HTML tag
    @return [Some (f tag)] for HTML elements, [None] otherwise *)

val when_html_tag : Tag.html_tag -> t -> (unit -> 'a) -> 'a option
(** [when_html_tag expected elem f] applies [f] if the element matches.

    @param expected The expected HTML tag
    @param elem The element to check
    @param f Function to call if the element matches
    @return [Some (f ())] if matched, [None] otherwise *)

(** {1 Internal} *)

val parse_type_attr : Tag.html_tag -> string -> Attr.t
(** [parse_type_attr tag value] parses a type attribute for an element.

    Different elements have different valid type values. This function
    handles context-dependent parsing.

    @param tag The element's HTML tag
    @param value The type attribute value
    @return The parsed attribute variant *)

val parse_attrs_for_tag :
  Tag.element_tag -> (string * string) list -> Attr.t list
(** [parse_attrs_for_tag tag raw_attrs] parses attributes with element context.

    The type attribute is parsed differently depending on the element tag.

    @param tag The element's tag
    @param raw_attrs Raw attribute name-value pairs
    @return List of typed attributes *)