OCaml HTML5 parser/serialiser based on Python's JustHTML

more

+55
lib/htmlrw_check/attr_utils.mli
···
(*---------------------------------------------------------------------------
   Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
   SPDX-License-Identifier: MIT
  ---------------------------------------------------------------------------*)

(** Common attribute utilities used across checkers.

    This module provides simple helper functions for working with raw
    attribute lists (name-value pairs). These utilities are used by
    checkers that need to inspect attributes without full typed parsing.

    For typed attribute access, see the {!Attr} module.
*)

(** {1 Types} *)

type attrs = (string * string) list
(** Raw attribute list as name-value pairs. *)

(** {1 Attribute Lookup} *)

val has_attr : string -> attrs -> bool
(** [has_attr name attrs] checks if an attribute exists.

    The comparison is case-insensitive.

    @param name The attribute name to look for (lowercase)
    @param attrs The attribute list
    @return [true] if the attribute is present *)

val get_attr : string -> attrs -> string option
(** [get_attr name attrs] gets an attribute value.

    The comparison is case-insensitive.

    @param name The attribute name to look for (lowercase)
    @param attrs The attribute list
    @return [Some value] if found, [None] otherwise *)

val get_attr_or : string -> default:string -> attrs -> string
(** [get_attr_or name ~default attrs] gets an attribute value with a default.

    @param name The attribute name to look for (lowercase)
    @param default The default value if not found
    @param attrs The attribute list
    @return The attribute value or the default *)

val is_non_empty_attr : string -> attrs -> bool
(** [is_non_empty_attr name attrs] checks if an attribute exists with
    non-empty value.

    The value is considered non-empty if it contains non-whitespace
    characters.

    @param name The attribute name to look for (lowercase)
    @param attrs The attribute list
    @return [true] if the attribute exists and has a non-empty value *)
+531
lib/htmlrw_check/element/attr.mli
···
(*---------------------------------------------------------------------------
   Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
   SPDX-License-Identifier: MIT
  ---------------------------------------------------------------------------*)

(** Typed HTML5 attribute representations using polymorphic variants.

    This module provides typed representations for HTML attributes with
    proper value types for enumerated attributes. Parsing raw attribute
    name-value pairs produces typed variants that can be pattern-matched
    with exhaustiveness checking.

    {2 Design Philosophy}

    HTML5 attributes have specific value constraints that this module
    encodes in the type system:

    - Boolean attributes: Present means true (e.g., [disabled], [checked])
    - Enumerated attributes: Fixed set of valid values (e.g., [dir], [method])
    - Numeric attributes: Integer or float values (e.g., [tabindex], [colspan])
    - URL attributes: String values representing URLs (e.g., [href], [src])
    - Free-form attributes: Any string value (e.g., [class], [title])

    {2 Parsing Strategy}

    Attributes are parsed with validation:
    - Known attributes are parsed into typed variants
    - Invalid values for enumerated attributes fall back to [Unknown_attr]
    - Unknown attribute names are captured as [Unknown_attr]
    - Special handling for [data-*] and [aria-*] prefixed attributes

    @see <https://html.spec.whatwg.org/multipage/dom.html#global-attributes>
      HTML Standard: Global attributes
*)

(** {1 Attribute Value Types}

    These types represent the valid values for enumerated HTML attributes. *)

(** Direction attribute values for [dir]. *)
type dir_value = [ `Ltr | `Rtl | `Auto ]

(** Hidden attribute values. *)
type hidden_value = [ `Hidden | `Until_found ]

(** Popover attribute values. *)
type popover_value = [ `Auto | `Manual | `Hint ]

(** Link target attribute values. *)
type target_value = [ `Self | `Blank | `Parent | `Top | `Named of string ]

(** Image/resource loading behavior. *)
type loading_value = [ `Eager | `Lazy ]

(** Image decoding hint. *)
type decoding_value = [ `Sync | `Async | `Auto ]

(** Fetch priority hint. *)
type fetchpriority_value = [ `High | `Low | `Auto ]

(** CORS settings. *)
type crossorigin_value = [ `Anonymous | `Use_credentials ]

(** Media preload hint. *)
type preload_value = [ `None | `Metadata | `Auto ]

(** Form method values. *)
type method_value = [ `Get | `Post | `Dialog ]

(** Form encoding type values. *)
type enctype_value = [ `Urlencoded | `Multipart | `Plain ]

(** Textarea wrap mode. *)
type wrap_value = [ `Soft | `Hard ]

(** Table cell scope. *)
type scope_value = [ `Row | `Col | `Rowgroup | `Colgroup ]

(** Input element type values. *)
type input_type = [
  | `Hidden | `Text | `Search | `Tel | `Url | `Email | `Password
  | `Date | `Month | `Week | `Time | `Datetime_local | `Number
  | `Range | `Color | `Checkbox | `Radio | `File | `Submit
  | `Image | `Reset | `Button
]

(** Button element type values. *)
type button_type = [ `Submit | `Reset | `Button ]

(** Referrer policy values. *)
type referrerpolicy_value = [
  | `No_referrer | `No_referrer_when_downgrade | `Origin
  | `Origin_when_cross_origin | `Same_origin | `Strict_origin
  | `Strict_origin_when_cross_origin | `Unsafe_url
]

(** Iframe sandbox flags. *)
type sandbox_flag = [
  | `Allow_downloads | `Allow_forms | `Allow_modals | `Allow_orientation_lock
  | `Allow_pointer_lock | `Allow_popups | `Allow_popups_to_escape_sandbox
  | `Allow_presentation | `Allow_same_origin | `Allow_scripts
  | `Allow_top_navigation | `Allow_top_navigation_by_user_activation
  | `Allow_top_navigation_to_custom_protocols
]

(** Enter key hint values for virtual keyboards. *)
type enterkeyhint_value = [
  | `Enter | `Done | `Go | `Next | `Previous | `Search | `Send
]

(** Input mode hint for virtual keyboards. *)
type inputmode_value = [
  | `None | `Text | `Decimal | `Numeric | `Tel | `Search | `Email | `Url
]

(** Content editable values. *)
type contenteditable_value = [ `True | `False | `Plaintext_only ]

(** Autocapitalize values. *)
type autocapitalize_value = [
  | `Off | `None | `On | `Sentences | `Words | `Characters
]

(** Image map shape values. *)
type shape_value = [ `Rect | `Circle | `Poly | `Default ]

(** Input capture values for file inputs. *)
type capture_value = [ `User | `Environment ]

(** Ordered list type values. *)
type list_type_value = [
  | `Decimal | `Lower_alpha | `Upper_alpha | `Lower_roman | `Upper_roman
]

(** Track element kind values. *)
type kind_value = [
  | `Subtitles | `Captions | `Descriptions | `Chapters | `Metadata
]

(** {1 Typed Attribute Variant} *)

(** Typed attribute representation.

    This type covers all HTML5 attributes with appropriate value types.
    Attributes are organized into logical groups. *)
type t = [
  (* Global attributes *)
  | `Id of string
  | `Class of string
  | `Style of string
  | `Title of string
  | `Lang of string
  | `Dir of dir_value
  | `Hidden of hidden_value option
  | `Tabindex of int
  | `Accesskey of string
  | `Autocapitalize of autocapitalize_value
  | `Autofocus
  | `Contenteditable of contenteditable_value option
  | `Draggable of bool
  | `Enterkeyhint of enterkeyhint_value
  | `Inert
  | `Inputmode of inputmode_value
  | `Is of string
  | `Nonce of string
  | `Popover of popover_value option
  | `Slot of string
  | `Spellcheck of bool option
  | `Translate of bool
  | `Exportparts of string
  | `Part of string

  (* Microdata *)
  | `Itemscope
  | `Itemtype of string
  | `Itemprop of string
  | `Itemid of string
  | `Itemref of string

  (* ARIA *)
  | `Role of string
  | `Aria of string * string

  (* Event handlers *)
  | `Event of string * string

  (* Link/navigation attributes *)
  | `Href of string
  | `Target of target_value
  | `Rel of string
  | `Download of string option
  | `Hreflang of string
  | `Ping of string
  | `Referrerpolicy of referrerpolicy_value
  | `Type_link of string

  (* Media/resource attributes *)
  | `Src of string
  | `Srcset of string
  | `Sizes of string
  | `Alt of string
  | `Width of string
  | `Height of string
  | `Loading of loading_value
  | `Decoding of decoding_value
  | `Fetchpriority of fetchpriority_value
  | `Crossorigin of crossorigin_value option
  | `Ismap
  | `Usemap of string
  | `Media of string

  (* Audio/Video specific *)
  | `Controls
  | `Autoplay
  | `Loop
  | `Muted
  | `Preload of preload_value
  | `Poster of string
  | `Playsinline

  (* Image map *)
  | `Coords of string
  | `Shape of shape_value

  (* iframe *)
  | `Sandbox of sandbox_flag list option
  | `Allow of string
  | `Allowfullscreen
  | `Srcdoc of string
  | `Csp of string

  (* Form attributes *)
  | `Action of string
  | `Method of method_value
  | `Enctype of enctype_value
  | `Novalidate
  | `Accept_charset of string
  | `Autocomplete of string
  | `Name of string
  | `Form of string

  (* Form control attributes *)
  | `Value of string
  | `Type_input of input_type
  | `Type_button of button_type
  | `Disabled
  | `Readonly
  | `Required
  | `Checked
  | `Selected
  | `Multiple
  | `Placeholder of string
  | `Min of string
  | `Max of string
  | `Step of string
  | `Minlength of int
  | `Maxlength of int
  | `Pattern of string
  | `Size of int
  | `Cols of int
  | `Rows of int
  | `Wrap of wrap_value
  | `Accept of string
  | `Capture of capture_value
  | `Dirname of string
  | `For of string
  | `List of string

  (* Form submission attributes *)
  | `Formaction of string
  | `Formmethod of method_value
  | `Formenctype of enctype_value
  | `Formnovalidate
  | `Formtarget of target_value

  (* Table attributes *)
  | `Colspan of int
  | `Rowspan of int
  | `Headers of string
  | `Scope of scope_value
  | `Span of int

  (* Details/Dialog *)
  | `Open

  (* Script *)
  | `Async
  | `Defer
  | `Integrity of string
  | `Nomodule
  | `Blocking of string
  | `Type_script of string

  (* Meta *)
  | `Charset of string
  | `Content of string
  | `Http_equiv of string

  (* Link element *)
  | `As of string
  | `Imagesizes of string
  | `Imagesrcset of string

  (* Object/Embed *)
  | `Data_object of string

  (* Output *)
  | `For_output of string

  (* Meter/Progress *)
  | `Low of float
  | `High of float
  | `Optimum of float

  (* Time *)
  | `Datetime of string

  (* Ol *)
  | `Start of int
  | `Reversed
  | `Type_list of list_type_value

  (* Track *)
  | `Kind of kind_value
  | `Srclang of string
  | `Default

  (* Td/Th *)
  | `Abbr of string

  (* Data attributes *)
  | `Data_attr of string * string

  (* RDFa *)
  | `Property of string
  | `Typeof of string
  | `Resource of string
  | `Prefix of string
  | `Vocab of string
  | `About of string
  | `Datatype of string
  | `Inlist
  | `Rev of string

  (* Escape hatch *)
  | `Unknown_attr of string * string
]

(** {1 Parsing Functions} *)

val parse_dir : string -> dir_value option
(** [parse_dir value] parses a direction attribute value. *)

val parse_target : string -> target_value
(** [parse_target value] parses a target attribute value. *)

val parse_loading : string -> loading_value option
(** [parse_loading value] parses a loading attribute value. *)

val parse_decoding : string -> decoding_value option
(** [parse_decoding value] parses a decoding attribute value. *)

val parse_fetchpriority : string -> fetchpriority_value option
(** [parse_fetchpriority value] parses a fetchpriority attribute value. *)

val parse_crossorigin : string -> crossorigin_value option
(** [parse_crossorigin value] parses a crossorigin attribute value. *)

val parse_preload : string -> preload_value option
(** [parse_preload value] parses a preload attribute value. *)

val parse_method : string -> method_value option
(** [parse_method value] parses a form method attribute value. *)

val parse_enctype : string -> enctype_value option
(** [parse_enctype value] parses a form enctype attribute value. *)

val parse_wrap : string -> wrap_value option
(** [parse_wrap value] parses a textarea wrap attribute value. *)

val parse_scope : string -> scope_value option
(** [parse_scope value] parses a table scope attribute value. *)

val parse_input_type : string -> input_type option
(** [parse_input_type value] parses an input type attribute value. *)

val parse_button_type : string -> button_type option
(** [parse_button_type value] parses a button type attribute value. *)

val parse_shape : string -> shape_value option
(** [parse_shape value] parses an area shape attribute value. *)

val parse_capture : string -> capture_value option
(** [parse_capture value] parses an input capture attribute value. *)

val parse_list_type : string -> list_type_value option
(** [parse_list_type value] parses an ordered list type attribute value. *)

val parse_kind : string -> kind_value option
(** [parse_kind value] parses a track kind attribute value. *)

val parse_referrerpolicy : string -> referrerpolicy_value option
(** [parse_referrerpolicy value] parses a referrer policy attribute value. *)

val parse_sandbox_flag : string -> sandbox_flag option
(** [parse_sandbox_flag value] parses a single sandbox flag token. *)

val parse_sandbox : string -> sandbox_flag list option
(** [parse_sandbox value] parses a space-separated sandbox attribute value. *)

val parse_enterkeyhint : string -> enterkeyhint_value option
(** [parse_enterkeyhint value] parses an enterkeyhint attribute value. *)

val parse_inputmode : string -> inputmode_value option
(** [parse_inputmode value] parses an inputmode attribute value. *)

val parse_contenteditable : string -> contenteditable_value option
(** [parse_contenteditable value] parses a contenteditable attribute value. *)

val parse_autocapitalize : string -> autocapitalize_value option
(** [parse_autocapitalize value] parses an autocapitalize attribute value. *)

val parse_hidden : string -> hidden_value option
(** [parse_hidden value] parses a hidden attribute value. *)

val parse_popover : string -> popover_value option
(** [parse_popover value] parses a popover attribute value. *)

val parse_int : string -> int option
(** [parse_int value] attempts to parse an integer from a string. *)

val parse_float : string -> float option
(** [parse_float value] attempts to parse a float from a string. *)

val parse_bool : string -> bool option
(** [parse_bool value] parses a boolean attribute value. *)

val parse_attr : string -> string -> t
(** [parse_attr name value] parses a single attribute name-value pair.

    @param name The attribute name
    @param value The attribute value
    @return A typed attribute variant

    {b Example:}
    {[
      parse_attr "class" "container"  (* `Class "container" *)
      parse_attr "disabled" ""        (* `Disabled *)
      parse_attr "data-id" "123"      (* `Data_attr ("id", "123") *)
    ]} *)

val parse_attrs : (string * string) list -> t list
(** [parse_attrs attrs] parses multiple attributes.

    @param attrs List of (name, value) pairs
    @return List of typed attributes *)

(** {1 Accessor Functions} *)

val get_id : t list -> string option
(** [get_id attrs] extracts the id attribute value if present. *)

val get_class : t list -> string option
(** [get_class attrs] extracts the class attribute value if present. *)

val get_href : t list -> string option
(** [get_href attrs] extracts the href attribute value if present. *)

val get_src : t list -> string option
(** [get_src attrs] extracts the src attribute value if present. *)

val get_alt : t list -> string option
(** [get_alt attrs] extracts the alt attribute value if present. *)

val get_name : t list -> string option
(** [get_name attrs] extracts the name attribute value if present. *)

val get_value : t list -> string option
(** [get_value attrs] extracts the value attribute value if present. *)

val get_role : t list -> string option
(** [get_role attrs] extracts the role attribute value if present. *)

val get_aria : string -> t list -> string option
(** [get_aria name attrs] extracts a specific aria-* attribute value.

    @param name The aria attribute name without the "aria-" prefix *)

val get_data : string -> t list -> string option
(** [get_data name attrs] extracts a specific data-* attribute value.

    @param name The data attribute name without the "data-" prefix *)

val has_disabled : t list -> bool
(** [has_disabled attrs] checks if the disabled attribute is present. *)

val has_required : t list -> bool
(** [has_required attrs] checks if the required attribute is present. *)

val has_readonly : t list -> bool
(** [has_readonly attrs] checks if the readonly attribute is present. *)

val has_checked : t list -> bool
(** [has_checked attrs] checks if the checked attribute is present. *)

val has_autofocus : t list -> bool
(** [has_autofocus attrs] checks if the autofocus attribute is present. *)

val has_hidden : t list -> bool
(** [has_hidden attrs] checks if the hidden attribute is present. *)

val has_inert : t list -> bool
(** [has_inert attrs] checks if the inert attribute is present. *)

val has_open : t list -> bool
(** [has_open attrs] checks if the open attribute is present. *)

val get_all_aria : t list -> (string * string) list
(** [get_all_aria attrs] extracts all aria-* attributes. *)

val get_all_data : t list -> (string * string) list
(** [get_all_data attrs] extracts all data-* attributes. *)

val find : (t -> 'a option) -> t list -> 'a option
(** [find f attrs] returns the result of [f] for the first attribute on
    which [f] yields [Some _], or [None] if there is no such attribute.

    Note that [f] is a partial projection returning an option, not a
    boolean predicate; use {!exists} or {!filter} for predicates. *)

val exists : (t -> bool) -> t list -> bool
(** [exists f attrs] checks if any attribute matches predicate [f]. *)

val filter : (t -> bool) -> t list -> t list
(** [filter f attrs] filters attributes matching predicate [f]. *)
+348
lib/htmlrw_check/element/element.mli
···
(*---------------------------------------------------------------------------
   Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
   SPDX-License-Identifier: MIT
  ---------------------------------------------------------------------------*)

(** Typed HTML5 element representation.

    This module combines tags and attributes into a complete typed element
    representation. Elements are created from raw input (tag name, namespace,
    attributes) and provide typed accessors for validation and manipulation.

    {2 Design Philosophy}

    An element in this module represents a complete typed view of an HTML
    element, including:

    - The element's tag (typed via {!Tag.element_tag})
    - Typed attributes (via {!Attr.t} list)
    - Raw attributes (for fallback access)

    This dual representation allows checkers to use typed pattern matching
    for common cases while falling back to raw strings when needed.

    {2 Usage Example}

    {[
      let elem = Element.create
        ~name:"input"
        ~namespace:None
        ~attrs:[("type", "email"); ("required", ""); ("class", "form-input")]
      in
      match elem.tag with
      | Tag.Html `Input ->
          if Element.has_required elem then
            (* Validate required input *)
            ()
      | _ -> ()
    ]}

    See {!Tag} for element tag types and {!Attr} for attribute types.
*)

(** {1 Element Type} *)

(** A typed HTML element. *)
type t = {
  tag : Tag.element_tag;  (** The element's tag classification. *)
  attrs : Attr.t list;  (** Typed attributes parsed from raw input. *)
  raw_attrs : (string * string) list;
      (** Original attribute name-value pairs for fallback. *)
}

(** {1 Construction} *)

val create : name:string -> namespace:string option -> attrs:(string * string) list -> t
(** [create ~name ~namespace ~attrs] creates a typed element.

    @param name The element's tag name
    @param namespace Optional namespace URI (for SVG/MathML)
    @param attrs Raw attribute name-value pairs
    @return A typed element

    {b Example:}
    {[
      let div = Element.create ~name:"div" ~namespace:None
        ~attrs:[("class", "container"); ("id", "main")]
    ]} *)

(** {1 Tag Accessors} *)

val tag : t -> Tag.element_tag
(** [tag elem] returns the element's tag. *)

val tag_name : t -> string
(** [tag_name elem] returns the element's tag name as a string. *)

val is_html_tag : Tag.html_tag -> t -> bool
(** [is_html_tag expected elem] checks if the element is a specific HTML tag.

    @param expected The expected HTML tag variant
    @param elem The element to check
    @return [true] if the element matches *)

val as_html_tag : t -> Tag.html_tag option
(** [as_html_tag elem] extracts the HTML tag if this is an HTML element.

    @return [Some tag] for HTML elements, [None] for SVG/MathML/Custom/Unknown *)

(** {1 Attribute Accessors} *)

val attrs : t -> Attr.t list
(** [attrs elem] returns the typed attributes. *)

val raw_attrs : t -> (string * string) list
(** [raw_attrs elem] returns the original raw attributes. *)

val get_id : t -> string option
(** [get_id elem] extracts the id attribute value. *)

val get_class : t -> string option
(** [get_class elem] extracts the class attribute value. *)

val get_href : t -> string option
(** [get_href elem] extracts the href attribute value. *)

val get_src : t -> string option
(** [get_src elem] extracts the src attribute value. *)

val get_alt : t -> string option
(** [get_alt elem] extracts the alt attribute value. *)

val get_name : t -> string option
(** [get_name elem] extracts the name attribute value. *)

val get_value : t -> string option
(** [get_value elem] extracts the value attribute value. *)

val get_role : t -> string option
(** [get_role elem] extracts the role attribute value. *)

val get_aria : string -> t -> string option
(** [get_aria name elem] extracts a specific aria-* attribute value.

    @param name The aria attribute name without the "aria-" prefix *)

val get_data : string -> t -> string option
(** [get_data name elem] extracts a specific data-* attribute value.

    @param name The data attribute name without the "data-" prefix *)

val has_disabled : t -> bool
(** [has_disabled elem] checks if the disabled attribute is present. *)

val has_required : t -> bool
(** [has_required elem] checks if the required attribute is present. *)

val has_readonly : t -> bool
(** [has_readonly elem] checks if the readonly attribute is present. *)

val has_checked : t -> bool
(** [has_checked elem] checks if the checked attribute is present. *)

val has_autofocus : t -> bool
(** [has_autofocus elem] checks if the autofocus attribute is present. *)

val has_hidden : t -> bool
(** [has_hidden elem] checks if the hidden attribute is present. *)

val has_inert : t -> bool
(** [has_inert elem] checks if the inert attribute is present. *)

val has_open : t -> bool
(** [has_open elem] checks if the open attribute is present. *)

val get_all_aria : t -> (string * string) list
(** [get_all_aria elem] extracts all aria-* attributes. *)

val get_all_data : t -> (string * string) list
(** [get_all_data elem] extracts all data-* attributes. *)

(** {1 Raw Attribute Fallback} *)

val get_raw_attr : string -> t -> string option
(** [get_raw_attr name elem] gets a raw attribute value by name.

    This is useful when the typed representation doesn't capture a specific
    attribute or when you need the exact original value.

    @param name The attribute name (case-insensitive)
    @param elem The element
    @return [Some value] if the attribute exists *)

val has_raw_attr : string -> t -> bool
(** [has_raw_attr name elem] checks if a raw attribute exists.

    @param name The attribute name (case-insensitive)
    @param elem The element
    @return [true] if the attribute is present *)

(** {1 Category Checks}

    These predicates check element categories based on the HTML5 content
    model. *)

val is_void : t -> bool
(** [is_void elem] checks if this is a void element (cannot have children).

    @return [true] for br, hr, img, input, etc. *)

val is_heading : t -> bool
(** [is_heading elem] checks if this is a heading element.

    @return [true] for h1-h6 *)

val heading_level : t -> int option
(** [heading_level elem] gets the heading level (1-6) if applicable.

    @return [Some level] for h1-h6, [None] otherwise *)

val is_sectioning : t -> bool
(** [is_sectioning elem] checks if this is sectioning content.

    @return [true] for article, aside, nav, section *)

val is_sectioning_root : t -> bool
(** [is_sectioning_root elem] checks if this is a sectioning root.

    @return [true] for blockquote, body, details, dialog, fieldset, figure, td *)

val is_embedded : t -> bool
(** [is_embedded elem] checks if this is embedded content.

    @return [true] for audio, canvas, embed, iframe, img, object, picture, video *)

val is_interactive : t -> bool
(** [is_interactive elem] checks if this is interactive content.

    @return [true] for focusable/activatable elements *)

val is_form_associated : t -> bool
(** [is_form_associated elem] checks if this is form-associated.

    @return [true] for elements that can belong to a form *)

val is_labelable : t -> bool
(** [is_labelable elem] checks if this can be associated with a label.

    @return [true] for button, input, meter, output, progress, select, textarea *)

val is_submittable : t -> bool
(** [is_submittable elem] checks if this is a submittable form element.

    @return [true] for button, input, select, textarea *)

val is_table_element : t -> bool
(** [is_table_element elem] checks if this is a table-related element.

    @return [true] for table, tr, td, th, etc. *)

val is_media : t -> bool
(** [is_media elem] checks if this is a media element.

    @return [true] for audio, video *)

val is_list_container : t -> bool
(** [is_list_container elem] checks if this is a list container.

    @return [true] for ul, ol, menu, dl *)

val is_transparent : t -> bool
(** [is_transparent elem] checks if this has a transparent content model.

    @return [true] for a, abbr, audio, canvas, del, ins, map, noscript, etc. *)

val is_phrasing : t -> bool
(** [is_phrasing elem] checks if this is phrasing content.

    @return [true] for inline-level elements *)

val is_flow : t -> bool
(** [is_flow elem] checks if this is flow content.

    @return [true] for most body-level elements *)

val is_obsolete : t -> bool
(** [is_obsolete elem] checks if this is a deprecated element.

    @return [true] for applet, font, marquee, etc. *)

val is_svg : t -> bool
(** [is_svg elem] checks if this is an SVG element.

    @return [true] if the element is in the SVG namespace *)

val is_mathml : t -> bool
(** [is_mathml elem] checks if this is a MathML element.

    @return [true] if the element is in the MathML namespace *)

val is_custom : t -> bool
(** [is_custom elem] checks if this is a custom element.

    @return [true] if the element name contains a hyphen *)

val is_unknown : t -> bool
(** [is_unknown elem] checks if this is an unknown element.

    @return [true] if the element is not recognized *)

(** {1 Input Type Utilities} *)

val get_input_type : t -> Attr.input_type option
(** [get_input_type elem] gets the input type for input elements.

    @return [Some type] for input elements with a type, [None] otherwise *)

val get_button_type : t -> Attr.button_type option
(** [get_button_type elem] gets the button type for button elements.

    @return [Some type] for button elements with a type, [None] otherwise *)

val is_input_type : Attr.input_type -> t -> bool
(** [is_input_type expected elem] checks if an input has a specific type.

    @param expected The expected input type
    @param elem The element to check
    @return [true] if this is an input with the specified type *)

(** {1 Pattern Matching Helpers} *)

val match_html : t -> (Tag.html_tag -> 'a) -> 'a option
(** [match_html elem f] applies [f] to the HTML tag if present.

    @param elem The element
    @param f Function to apply to the HTML tag
    @return [Some (f tag)] for HTML elements, [None] otherwise *)

val when_html_tag : Tag.html_tag -> t -> (unit -> 'a) -> 'a option
(** [when_html_tag expected elem f] applies [f] if the element matches.

    @param expected The expected HTML tag
    @param elem The element to check
    @param f Function to call if the element matches
    @return [Some (f ())] if matched, [None] otherwise *)

(** {1 Internal} *)

val parse_type_attr : Tag.html_tag -> string -> Attr.t
(** [parse_type_attr tag value] parses a type attribute for an element.

    Different elements have different valid type values. This function
    handles context-dependent parsing.

    @param tag The element's HTML tag
    @param value The type attribute value
    @return The parsed attribute variant *)

val parse_attrs_for_tag : Tag.element_tag -> (string * string) list -> Attr.t list
(** [parse_attrs_for_tag tag raw_attrs] parses attributes with element context.

    The type attribute is parsed differently depending on the element tag.

    @param tag The element's tag
    @param raw_attrs Raw attribute name-value pairs
    @return List of typed attributes *)
+439
lib/htmlrw_check/element/tag.mli
···
··· 1 + (*--------------------------------------------------------------------------- 2 + Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved. 3 + SPDX-License-Identifier: MIT 4 + ---------------------------------------------------------------------------*) 5 + 6 + (** Typed HTML5 tag representations using polymorphic variants. 7 + 8 + This module provides compile-time type safety for HTML elements while 9 + maintaining escape hatches for unknown/custom elements. Tags are 10 + represented using polymorphic variants, enabling pattern matching with 11 + exhaustiveness checking while avoiding the overhead of explicit 12 + constructors. 13 + 14 + {2 Design Philosophy} 15 + 16 + HTML5 defines over 100 standard elements with specific categories and 17 + content models. This module: 18 + 19 + - Provides typed representations for all standard elements 20 + - Supports SVG and MathML namespaced elements 21 + - Recognizes custom elements (containing hyphens) 22 + - Falls back to [Unknown] for unrecognized elements 23 + 24 + {2 Element Categories} 25 + 26 + HTML5 categorizes elements into content categories that define where 27 + elements can appear and what they can contain. This module provides 28 + predicates for common categories: 29 + 30 + - {!is_void} - Elements that cannot have children 31 + - {!is_heading} - Heading elements (h1-h6) 32 + - {!is_sectioning} - Elements that create document sections 33 + - {!is_phrasing} - Inline/phrasing content elements 34 + - {!is_flow} - Block/flow content elements 35 + 36 + @see <https://html.spec.whatwg.org/multipage/dom.html#content-models> 37 + HTML Standard: Content models 38 + *) 39 + 40 + (** {1 HTML Tag Types} *) 41 + 42 + (** All standard HTML5 elements plus deprecated elements needed by the validator. 43 + 44 + This type covers: 45 + - Document metadata elements (html, head, title, etc.) 46 + - Sectioning elements (article, section, nav, etc.) 
47 + - Heading elements (h1-h6) 48 + - Grouping content (div, p, ul, ol, etc.) 49 + - Text-level semantics (a, em, strong, span, etc.) 50 + - Embedded content (img, video, audio, etc.) 51 + - Table elements (table, tr, td, th, etc.) 52 + - Form elements (form, input, button, etc.) 53 + - Interactive elements (details, dialog, summary) 54 + - Scripting elements (script, noscript, template) 55 + - Deprecated/obsolete elements (font, center, marquee, etc.) *) 56 + type html_tag = [ 57 + (* Document metadata *) 58 + | `Html | `Head | `Title | `Base | `Link | `Meta | `Style 59 + 60 + (* Sectioning root *) 61 + | `Body 62 + 63 + (* Content sectioning *) 64 + | `Address | `Article | `Aside | `Footer | `Header | `Hgroup 65 + | `Main | `Nav | `Search | `Section 66 + 67 + (* Heading content *) 68 + | `H1 | `H2 | `H3 | `H4 | `H5 | `H6 69 + 70 + (* Grouping content *) 71 + | `Blockquote | `Dd | `Div | `Dl | `Dt | `Figcaption | `Figure 72 + | `Hr | `Li | `Menu | `Ol | `P | `Pre | `Ul 73 + 74 + (* Text-level semantics *) 75 + | `A | `Abbr | `B | `Bdi | `Bdo | `Br | `Cite | `Code | `Data 76 + | `Dfn | `Em | `I | `Kbd | `Mark | `Q | `Rp | `Rt | `Ruby 77 + | `S | `Samp | `Small | `Span | `Strong | `Sub | `Sup | `Time 78 + | `U | `Var | `Wbr 79 + 80 + (* Edits *) 81 + | `Del | `Ins 82 + 83 + (* Embedded content *) 84 + | `Area | `Audio | `Canvas | `Embed | `Iframe | `Img | `Map | `Object 85 + | `Picture | `Source | `Track | `Video 86 + 87 + (* Tabular data *) 88 + | `Caption | `Col | `Colgroup | `Table | `Tbody | `Td | `Tfoot 89 + | `Th | `Thead | `Tr 90 + 91 + (* Forms *) 92 + | `Button | `Datalist | `Fieldset | `Form | `Input | `Label 93 + | `Legend | `Meter | `Optgroup | `Option | `Output | `Progress 94 + | `Select | `Textarea 95 + 96 + (* Interactive elements *) 97 + | `Details | `Dialog | `Summary 98 + 99 + (* Scripting *) 100 + | `Noscript | `Script | `Slot | `Template 101 + 102 + (* Web Components / Misc *) 103 + | `Portal | `Param 104 + 105 + (* Deprecated/obsolete elements 
*) 106 + | `Applet | `Acronym | `Bgsound | `Dir | `Frame | `Frameset 107 + | `Noframes | `Isindex | `Keygen | `Listing | `Menuitem | `Nextid 108 + | `Noembed | `Plaintext | `Rb | `Rtc | `Strike | `Xmp 109 + | `Basefont | `Big | `Blink | `Center | `Font | `Marquee 110 + | `Multicol | `Nobr | `Spacer | `Tt | `Image 111 + ] 112 + 113 + (** {1 Category Types} 114 + 115 + Type aliases for element subsets, enabling functions that only accept 116 + specific categories with compile-time checking. *) 117 + 118 + (** Void elements - cannot have children (e.g., br, hr, img, input). *) 119 + type void_tag = [ 120 + | `Area | `Base | `Br | `Col | `Embed | `Hr | `Img | `Input 121 + | `Link | `Meta | `Source | `Track | `Wbr 122 + | `Basefont | `Frame | `Isindex | `Keygen | `Param 123 + ] 124 + 125 + (** Heading elements (h1-h6). *) 126 + type heading_tag = [ `H1 | `H2 | `H3 | `H4 | `H5 | `H6 ] 127 + 128 + (** Sectioning content elements that establish document sections. *) 129 + type sectioning_tag = [ `Article | `Aside | `Nav | `Section ] 130 + 131 + (** Sectioning roots that establish their own outline context. *) 132 + type sectioning_root_tag = [ 133 + | `Blockquote | `Body | `Details | `Dialog | `Fieldset | `Figure | `Td 134 + ] 135 + 136 + (** Embedded content elements. *) 137 + type embedded_tag = [ 138 + | `Audio | `Canvas | `Embed | `Iframe | `Img | `Object | `Picture | `Video 139 + ] 140 + 141 + (** Interactive content elements (focusable/activatable). *) 142 + type interactive_tag = [ 143 + | `A | `Audio | `Button | `Details | `Embed | `Iframe | `Img 144 + | `Input | `Label | `Select | `Textarea | `Video 145 + ] 146 + 147 + (** Form-associated elements that can belong to a form. *) 148 + type form_associated_tag = [ 149 + | `Button | `Fieldset | `Input | `Label | `Object | `Output 150 + | `Select | `Textarea | `Meter | `Progress 151 + ] 152 + 153 + (** Labelable elements that can be associated with a label. 
*) 154 + type labelable_tag = [ 155 + | `Button | `Input | `Meter | `Output | `Progress | `Select | `Textarea 156 + ] 157 + 158 + (** Submittable form elements. *) 159 + type submittable_tag = [ 160 + | `Button | `Input | `Select | `Textarea 161 + ] 162 + 163 + (** Resettable form elements. *) 164 + type resettable_tag = [ 165 + | `Input | `Output | `Select | `Textarea 166 + ] 167 + 168 + (** Table-related elements. *) 169 + type table_tag = [ 170 + | `Caption | `Col | `Colgroup | `Table | `Tbody | `Td | `Tfoot 171 + | `Th | `Thead | `Tr 172 + ] 173 + 174 + (** Media elements (audio and video). *) 175 + type media_tag = [ `Audio | `Video ] 176 + 177 + (** List container elements. *) 178 + type list_container_tag = [ `Ul | `Ol | `Menu | `Dl ] 179 + 180 + (** List item elements. *) 181 + type list_item_tag = [ `Li | `Dd | `Dt ] 182 + 183 + (** Script-supporting elements. *) 184 + type script_supporting_tag = [ `Script | `Template ] 185 + 186 + (** Metadata content elements. *) 187 + type metadata_tag = [ `Base | `Link | `Meta | `Noscript | `Script | `Style | `Template | `Title ] 188 + 189 + (** {1 Top-Level Element Type} *) 190 + 191 + (** Top-level element classification. 192 + 193 + Elements are classified by namespace and recognition status: 194 + - [Html tag] - A known HTML5 element 195 + - [Svg name] - An SVG element (preserves original case) 196 + - [MathML name] - A MathML element (preserves original case) 197 + - [Custom name] - A custom element (contains hyphen) 198 + - [Unknown name] - An unrecognized element *) 199 + type element_tag = 200 + | Html of html_tag 201 + | Svg of string 202 + | MathML of string 203 + | Custom of string 204 + | Unknown of string 205 + 206 + (** {1 Namespace Constants} *) 207 + 208 + val svg_namespace : string 209 + (** The SVG namespace URI: ["http://www.w3.org/2000/svg"]. *) 210 + 211 + val mathml_namespace : string 212 + (** The MathML namespace URI: ["http://www.w3.org/1998/Math/MathML"]. 
*) 213 + 214 + (** {1 Conversion Functions} *) 215 + 216 + val html_tag_of_string_opt : string -> html_tag option 217 + (** [html_tag_of_string_opt name] converts a lowercase tag name to an [html_tag]. 218 + 219 + @param name The lowercase tag name (e.g., ["div"], ["span"]) 220 + @return [Some tag] if recognized, [None] otherwise 221 + 222 + {b Example:} 223 + {[ 224 + html_tag_of_string_opt "div" (* Some `Div *) 225 + html_tag_of_string_opt "xyz" (* None *) 226 + ]} *) 227 + 228 + val is_custom_element_name : string -> bool 229 + (** [is_custom_element_name name] checks if a name is a valid custom element name. 230 + 231 + A valid custom element name must contain a hyphen and not be reserved 232 + (e.g., not start with "xml" or be "annotation-xml"). 233 + 234 + @param name The element name to check 235 + @return [true] if the name is a valid custom element name *) 236 + 237 + val is_svg_namespace : string -> bool 238 + (** [is_svg_namespace ns] checks if a namespace string represents SVG. 239 + 240 + Accepts both the short form ["svg"] and the full URI. *) 241 + 242 + val is_mathml_namespace : string -> bool 243 + (** [is_mathml_namespace ns] checks if a namespace string represents MathML. 244 + 245 + Accepts both the short form ["mathml"] and the full URI. *) 246 + 247 + val tag_of_string : ?namespace:string -> string -> element_tag 248 + (** [tag_of_string ?namespace name] converts a tag name to an [element_tag]. 249 + 250 + @param namespace Optional namespace URI or short form 251 + @param name The element name 252 + @return The classified element tag 253 + 254 + {b Example:} 255 + {[ 256 + tag_of_string "div" (* Html `Div *) 257 + tag_of_string ~namespace:"svg" "circle" (* Svg "circle" *) 258 + tag_of_string "my-component" (* Custom "my-component" *) 259 + tag_of_string "xyz" (* Unknown "xyz" *) 260 + ]} *) 261 + 262 + val html_tag_to_string : html_tag -> string 263 + (** [html_tag_to_string tag] converts an [html_tag] to its lowercase string name. 
264 + 265 + @param tag The HTML tag variant 266 + @return The lowercase tag name (e.g., ["div"], ["span"]) *) 267 + 268 + val tag_to_string : element_tag -> string 269 + (** [tag_to_string tag] converts any [element_tag] to its string name. 270 + 271 + @param tag The element tag 272 + @return The tag name (lowercase for HTML, original case for SVG/MathML) *) 273 + 274 + (** {1 Category Predicates} *) 275 + 276 + val is_void : html_tag -> bool 277 + (** [is_void tag] checks if an element is a void element (cannot have children). 278 + 279 + @param tag The HTML tag to check 280 + @return [true] if the element is void (br, hr, img, input, etc.) *) 281 + 282 + val is_heading : html_tag -> bool 283 + (** [is_heading tag] checks if an element is a heading element. 284 + 285 + @param tag The HTML tag to check 286 + @return [true] if the element is h1-h6 *) 287 + 288 + val heading_level : html_tag -> int option 289 + (** [heading_level tag] gets the heading level (1-6) if applicable. 290 + 291 + @param tag The HTML tag to check 292 + @return [Some level] for h1-h6, [None] for other elements *) 293 + 294 + val is_sectioning : html_tag -> bool 295 + (** [is_sectioning tag] checks if an element is sectioning content. 296 + 297 + @param tag The HTML tag to check 298 + @return [true] if the element is article, aside, nav, or section *) 299 + 300 + val is_sectioning_root : html_tag -> bool 301 + (** [is_sectioning_root tag] checks if an element is a sectioning root. 302 + 303 + Sectioning roots establish their own outline context. 304 + 305 + @param tag The HTML tag to check 306 + @return [true] if the element is blockquote, body, details, dialog, 307 + fieldset, figure, or td *) 308 + 309 + val is_embedded : html_tag -> bool 310 + (** [is_embedded tag] checks if an element is embedded content. 
311 + 312 + @param tag The HTML tag to check 313 + @return [true] if the element is audio, canvas, embed, iframe, img, 314 + object, picture, or video *) 315 + 316 + val is_interactive : html_tag -> bool 317 + (** [is_interactive tag] checks if an element is interactive content. 318 + 319 + @param tag The HTML tag to check 320 + @return [true] if the element is focusable or activatable *) 321 + 322 + val is_form_associated : html_tag -> bool 323 + (** [is_form_associated tag] checks if an element is form-associated. 324 + 325 + @param tag The HTML tag to check 326 + @return [true] if the element can belong to a form *) 327 + 328 + val is_labelable : html_tag -> bool 329 + (** [is_labelable tag] checks if an element can be associated with a label. 330 + 331 + @param tag The HTML tag to check 332 + @return [true] if the element is labelable *) 333 + 334 + val is_submittable : html_tag -> bool 335 + (** [is_submittable tag] checks if an element is a submittable form element. 336 + 337 + @param tag The HTML tag to check 338 + @return [true] if the element is button, input, select, or textarea *) 339 + 340 + val is_resettable : html_tag -> bool 341 + (** [is_resettable tag] checks if an element is a resettable form element. 342 + 343 + @param tag The HTML tag to check 344 + @return [true] if the element is input, output, select, or textarea *) 345 + 346 + val is_transparent : html_tag -> bool 347 + (** [is_transparent tag] checks if an element has a transparent content model. 348 + 349 + Transparent elements inherit their content model from their parent. 350 + 351 + @param tag The HTML tag to check 352 + @return [true] if the element is transparent (a, audio, canvas, etc.) *) 353 + 354 + val is_script_supporting : html_tag -> bool 355 + (** [is_script_supporting tag] checks if an element is script-supporting. 
356 + 357 + @param tag The HTML tag to check 358 + @return [true] if the element is script or template *) 359 + 360 + val is_table_element : html_tag -> bool 361 + (** [is_table_element tag] checks if an element is a table-related element. 362 + 363 + @param tag The HTML tag to check 364 + @return [true] if the element is table, tr, td, th, etc. *) 365 + 366 + val is_media : html_tag -> bool 367 + (** [is_media tag] checks if an element is a media element. 368 + 369 + @param tag The HTML tag to check 370 + @return [true] if the element is audio or video *) 371 + 372 + val is_list_container : html_tag -> bool 373 + (** [is_list_container tag] checks if an element is a list container. 374 + 375 + @param tag The HTML tag to check 376 + @return [true] if the element is ul, ol, menu, or dl *) 377 + 378 + val is_list_item : html_tag -> bool 379 + (** [is_list_item tag] checks if an element is a list item. 380 + 381 + @param tag The HTML tag to check 382 + @return [true] if the element is li, dd, or dt *) 383 + 384 + val is_metadata : html_tag -> bool 385 + (** [is_metadata tag] checks if an element is metadata content. 386 + 387 + @param tag The HTML tag to check 388 + @return [true] if the element is base, link, meta, etc. *) 389 + 390 + val is_obsolete : html_tag -> bool 391 + (** [is_obsolete tag] checks if an element is deprecated/obsolete. 392 + 393 + @param tag The HTML tag to check 394 + @return [true] if the element is applet, font, marquee, etc. *) 395 + 396 + val is_raw_text : html_tag -> bool 397 + (** [is_raw_text tag] checks if an element is a raw text element. 398 + 399 + Raw text elements contain unparsed text content. 400 + 401 + @param tag The HTML tag to check 402 + @return [true] if the element is script or style *) 403 + 404 + val is_escapable_raw_text : html_tag -> bool 405 + (** [is_escapable_raw_text tag] checks if an element is escapable raw text. 
406 + 407 + @param tag The HTML tag to check 408 + @return [true] if the element is textarea or title *) 409 + 410 + val is_phrasing : html_tag -> bool 411 + (** [is_phrasing tag] checks if an element is phrasing content. 412 + 413 + Phrasing content is inline-level content that forms paragraphs. 414 + 415 + @param tag The HTML tag to check 416 + @return [true] if the element is phrasing content *) 417 + 418 + val is_flow : html_tag -> bool 419 + (** [is_flow tag] checks if an element is flow content. 420 + 421 + Flow content is most elements that can appear in the body. 422 + 423 + @param tag The HTML tag to check 424 + @return [true] if the element is flow content *) 425 + 426 + (** {1 Pattern Matching Helpers} *) 427 + 428 + val as_html_tag : element_tag -> html_tag option 429 + (** [as_html_tag tag] extracts the HTML tag if present. 430 + 431 + @param tag The element tag 432 + @return [Some html_tag] if [tag] is [Html html_tag], [None] otherwise *) 433 + 434 + val is_html_tag : html_tag -> element_tag -> bool 435 + (** [is_html_tag expected tag] checks if [tag] matches the expected HTML tag. 436 + 437 + @param expected The expected HTML tag variant 438 + @param tag The element tag to check 439 + @return [true] if [tag] is [Html expected] *)
+5
lib/htmlrw_check/error_code.ml
··· 119 | `For_id_mismatch 120 | `Role_on_ancestor 121 | `Role_on_for 122 | `Aria_label_on_for 123 ] 124 ··· 309 | `Label `For_id_mismatch -> "label-for-mismatch" 310 | `Label `Role_on_ancestor -> "role-on-label" 311 | `Label `Role_on_for -> "role-on-label" 312 | `Label `Aria_label_on_for -> "aria-label-on-label" 313 314 (* Input errors *) ··· 624 | `Label `Role_on_for -> 625 Printf.sprintf "The %s attribute must not be used on any %s element that is associated with a labelable element." 626 (q "role") (q "label") 627 | `Label `Aria_label_on_for -> 628 Printf.sprintf "The %s attribute must not be used on any %s element that is associated with a labelable element." 629 (q "aria-label") (q "label")
··· 119 | `For_id_mismatch 120 | `Role_on_ancestor 121 | `Role_on_for 122 + | `Aria_label_on_ancestor 123 | `Aria_label_on_for 124 ] 125 ··· 310 | `Label `For_id_mismatch -> "label-for-mismatch" 311 | `Label `Role_on_ancestor -> "role-on-label" 312 | `Label `Role_on_for -> "role-on-label" 313 + | `Label `Aria_label_on_ancestor -> "aria-label-on-label" 314 | `Label `Aria_label_on_for -> "aria-label-on-label" 315 316 (* Input errors *) ··· 626 | `Label `Role_on_for -> 627 Printf.sprintf "The %s attribute must not be used on any %s element that is associated with a labelable element." 628 (q "role") (q "label") 629 + | `Label `Aria_label_on_ancestor -> 630 + Printf.sprintf "The %s attribute must not be used on any %s element that is an ancestor of a labelable element." 631 + (q "aria-label") (q "label") 632 | `Label `Aria_label_on_for -> 633 Printf.sprintf "The %s attribute must not be used on any %s element that is associated with a labelable element." 634 (q "aria-label") (q "label")
+5
lib/htmlrw_check/error_code.mli
··· 527 Adding [role] to a label that wraps a form control 528 breaks the implicit label association. *) 529 530 | `Role_on_for 531 (** [<label>] with role uses [for] association. 532 Labels with explicit [for] association must not have [role]. *)
··· 527 Adding [role] to a label that wraps a form control 528 breaks the implicit label association. *) 529 530 + | `Aria_label_on_ancestor 531 + (** [<label>] with [aria-label] is ancestor of labelable element. 532 + [aria-label] on a label that wraps a form control creates 533 + conflicting accessible names. *) 534 + 535 | `Role_on_for 536 (** [<label>] with role uses [for] association. 537 Labels with explicit [for] association must not have [role]. *)
+31
lib/htmlrw_check/semantic/autofocus_checker.mli
···
··· 1 + (*--------------------------------------------------------------------------- 2 + Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved. 3 + SPDX-License-Identifier: MIT 4 + ---------------------------------------------------------------------------*) 5 + 6 + (** Autofocus attribute validation checker. 7 + 8 + This checker validates that at most one element with the [autofocus] attribute 9 + exists within each dialog or popover context. HTML5 specifies that there 10 + should be at most one autofocused element per autofocus scope. 11 + 12 + {2 Validation Rules} 13 + 14 + - Within each dialog element, at most one descendant may have [autofocus] 15 + - Within each popover element, at most one descendant may have [autofocus] 16 + - Nested dialogs and popovers create separate scopes 17 + 18 + {2 Error Messages} 19 + 20 + Reports [Multiple_autofocus] when more than one autofocus attribute is 21 + found within the same scope. 22 + 23 + @see <https://html.spec.whatwg.org/multipage/interaction.html#the-autofocus-attribute> 24 + HTML Standard: The autofocus attribute 25 + *) 26 + 27 + val checker : Checker.t 28 + (** The autofocus checker instance. 29 + 30 + This checker can be registered with the checker registry and will be 31 + invoked during DOM traversal to validate autofocus attribute usage. *)
+2 -19
lib/htmlrw_check/semantic/lang_detecting_checker.ml
··· 6 type state = { 7 mutable html_lang : string option; 8 mutable html_dir : string option; 9 - mutable html_locator : (int * int) option; (* line, column *) 10 mutable in_body : bool; 11 mutable skip_depth : int; (* depth in elements to skip *) 12 mutable foreign_depth : int; (* depth in SVG/MathML content to skip *) 13 - mutable text_buffer : Buffer.t; 14 mutable char_count : int; 15 } 16 ··· 30 let create () = { 31 html_lang = None; 32 html_dir = None; 33 - html_locator = None; 34 in_body = false; 35 skip_depth = 0; 36 foreign_depth = 0; ··· 41 let reset state = 42 state.html_lang <- None; 43 state.html_dir <- None; 44 - state.html_locator <- None; 45 state.in_body <- false; 46 state.skip_depth <- 0; 47 state.foreign_depth <- 0; 48 Buffer.clear state.text_buffer; 49 state.char_count <- 0 50 - 51 - (* Namespaces to skip for language detection *) 52 - let svg_namespace = "http://www.w3.org/2000/svg" 53 - let mathml_namespace = "http://www.w3.org/1998/Math/MathML" 54 - 55 - let is_foreign_namespace ns = 56 - ns = svg_namespace || ns = mathml_namespace 57 - 58 - (* Element names that start foreign content (for when namespace isn't set) *) 59 - let is_foreign_element name = 60 - let n = String.lowercase_ascii name in 61 - n = "svg" || n = "math" 62 63 let get_lang_code lang = 64 (* Extract primary language subtag *) ··· 221 match element.tag with 222 | Tag.Html `Html -> 223 state.html_lang <- Attr_utils.get_attr "lang" attrs; 224 - state.html_dir <- Attr_utils.get_attr "dir" attrs; 225 - (* TODO: get line/column from locator *) 226 - state.html_locator <- Some (1, 1) 227 | Tag.Html `Body -> 228 state.in_body <- true 229 | Tag.Svg _ | Tag.MathML _ ->
··· 6 type state = { 7 mutable html_lang : string option; 8 mutable html_dir : string option; 9 mutable in_body : bool; 10 mutable skip_depth : int; (* depth in elements to skip *) 11 mutable foreign_depth : int; (* depth in SVG/MathML content to skip *) 12 + text_buffer : Buffer.t; (* buffer contents are mutated, not the field itself *) 13 mutable char_count : int; 14 } 15 ··· 29 let create () = { 30 html_lang = None; 31 html_dir = None; 32 in_body = false; 33 skip_depth = 0; 34 foreign_depth = 0; ··· 39 let reset state = 40 state.html_lang <- None; 41 state.html_dir <- None; 42 state.in_body <- false; 43 state.skip_depth <- 0; 44 state.foreign_depth <- 0; 45 Buffer.clear state.text_buffer; 46 state.char_count <- 0 47 48 let get_lang_code lang = 49 (* Extract primary language subtag *) ··· 206 match element.tag with 207 | Tag.Html `Html -> 208 state.html_lang <- Attr_utils.get_attr "lang" attrs; 209 + state.html_dir <- Attr_utils.get_attr "dir" attrs 210 | Tag.Html `Body -> 211 state.in_body <- true 212 | Tag.Svg _ | Tag.MathML _ ->
+41
lib/htmlrw_check/semantic/lang_detecting_checker.mli
···
··· 1 + (*--------------------------------------------------------------------------- 2 + Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved. 3 + SPDX-License-Identifier: MIT 4 + ---------------------------------------------------------------------------*) 5 + 6 + (** Language detection and validation checker. 7 + 8 + This checker validates that the document's [lang] attribute matches the 9 + detected language of the content, and that the [dir] attribute is correct 10 + for right-to-left (RTL) languages. 11 + 12 + {2 Detection Algorithm} 13 + 14 + The checker: 15 + 1. Collects text content from the document body (up to 30720 characters) 16 + 2. Skips text from certain elements (scripts, navigation, form controls) 17 + 3. Skips foreign namespace content (SVG, MathML) 18 + 4. Uses statistical language detection with >90% confidence threshold 19 + 5. Handles Traditional vs Simplified Chinese detection 20 + 21 + {2 Validation Rules} 22 + 23 + - Documents should have a [lang] attribute on the [<html>] element 24 + - The declared language should match the detected content language 25 + - RTL languages (Arabic, Hebrew, Persian, Urdu, etc.) should have [dir="rtl"] 26 + 27 + {2 Error Messages} 28 + 29 + - [Wrong_lang]: The declared language doesn't match detected content 30 + - [Missing_dir_rtl]: An RTL language is detected but no [dir] attribute 31 + - [Wrong_dir]: The [dir] attribute doesn't match the detected RTL language 32 + 33 + @see <https://html.spec.whatwg.org/multipage/dom.html#the-lang-and-xml:lang-attributes> 34 + HTML Standard: The lang attribute 35 + *) 36 + 37 + val checker : Checker.t 38 + (** The language detection checker instance. 39 + 40 + This checker collects text during DOM traversal and performs language 41 + detection at document end. *)
+2 -1
lib/htmlrw_check/semantic/option_checker.ml
··· 49 (match state.option_stack with 50 | ctx :: rest -> 51 state.option_stack <- rest; 52 - if not ctx.has_text && not ctx.has_label then 53 Message_collector.add_typed collector (`Misc `Option_empty_without_label) 54 | [] -> ()) 55 | _ -> ()
··· 49 (match state.option_stack with 50 | ctx :: rest -> 51 state.option_stack <- rest; 52 + (* Empty label attribute doesn't count as a valid label *) 53 + if not ctx.has_text && (not ctx.has_label || ctx.label_empty) then 54 Message_collector.add_typed collector (`Misc `Option_empty_without_label) 55 | [] -> ()) 56 | _ -> ()
+32
lib/htmlrw_check/semantic/option_checker.mli
···
··· 1 + (*--------------------------------------------------------------------------- 2 + Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved. 3 + SPDX-License-Identifier: MIT 4 + ---------------------------------------------------------------------------*) 5 + 6 + (** Option element validation checker. 7 + 8 + This checker validates that [<option>] elements have text content or 9 + a non-empty [label] attribute. Empty options without labels can be confusing for 10 + users, especially those using assistive technologies. 11 + 12 + {2 Validation Rules} 13 + 14 + - An [<option>] element must have either: 15 + - Non-whitespace text content, OR 16 + - A non-empty [label] attribute 17 + - Empty [label] attribute values are reported as errors 18 + - Options inside [<template>] elements are not checked 19 + 20 + {2 Error Messages} 21 + 22 + - [Option_empty_without_label]: Option has no text and no non-empty [label] attribute 23 + - [Bad_value] for label: The label attribute value is empty 24 + 25 + @see <https://html.spec.whatwg.org/multipage/form-elements.html#the-option-element> 26 + HTML Standard: The option element 27 + *) 28 + 29 + val checker : Checker.t 30 + (** The option element checker instance. 31 + 32 + This checker validates option elements during DOM traversal. *)
+31
lib/htmlrw_check/specialized/attr_restrictions_checker.mli
···
··· 1 + (*--------------------------------------------------------------------------- 2 + Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved. 3 + SPDX-License-Identifier: MIT 4 + ---------------------------------------------------------------------------*) 5 + 6 + (** Attribute restrictions checker. 7 + 8 + This checker validates that certain attributes are not used on elements 9 + where they are not allowed. It catches common misuses such as: 10 + 11 + - RDFa-style [href] on elements like [<img>], [<p>], [<div>] 12 + - [src] or [media] on [<a>] elements 13 + - [srcset] on [<audio>], [<video>], and [<object>] elements 14 + 15 + {2 Validation Rules} 16 + 17 + The checker maintains a list of (element, disallowed_attributes) pairs 18 + for both HTML and SVG elements. When an element is encountered with 19 + a disallowed attribute, an error is reported. 20 + 21 + {2 Error Messages} 22 + 23 + Reports [Not_allowed] when an attribute is used on an element where 24 + it is not permitted. 25 + 26 + @see <https://html.spec.whatwg.org/multipage/dom.html#element-definitions> 27 + HTML Standard: Element definitions 28 + *) 29 + 30 + val checker : Checker.t 31 + (** The attribute restrictions checker instance. *)
+28
lib/htmlrw_check/specialized/base_checker.mli
···
··· 1 + (*--------------------------------------------------------------------------- 2 + Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved. 3 + SPDX-License-Identifier: MIT 4 + ---------------------------------------------------------------------------*) 5 + 6 + (** Base element ordering checker. 7 + 8 + This checker validates that the [<base>] element appears before any 9 + elements that may use URLs resolved against the base URL. Specifically, 10 + [<base>] should appear before [<link>] and [<script>] elements. 11 + 12 + {2 Validation Rules} 13 + 14 + - [<base>] must appear before any [<link>] elements 15 + - [<base>] must appear before any [<script>] elements 16 + - The order is significant for URL resolution in the document 17 + 18 + {2 Error Messages} 19 + 20 + Reports [Base_after_link_script] when a [<base>] element is found 21 + after [<link>] or [<script>] elements. 22 + 23 + @see <https://html.spec.whatwg.org/multipage/semantics.html#the-base-element> 24 + HTML Standard: The base element 25 + *) 26 + 27 + val checker : Checker.t 28 + (** The base element ordering checker instance. *)
-3
lib/htmlrw_check/specialized/datetime_checker.ml
··· 5 (** Elements that have datetime attribute *) 6 let datetime_elements = ["del"; "ins"; "time"] 7 8 - (** Helper: check if char is digit *) 9 - let is_digit c = c >= '0' && c <= '9' 10 - 11 (** Parse int safely *) 12 let parse_int s = 13 try Some (int_of_string s) with _ -> None
(** Elements that have datetime attribute *)
let datetime_elements = ["del"; "ins"; "time"]

(** [parse_int s] converts [s] to an integer without raising.

    @param s The string to convert
    @return [Some n] if [s] is accepted by the standard integer parser,
            [None] otherwise (including for the empty string)

    NOTE(review): the standard parser also accepts OCaml literal forms such
    as a leading ['-'], ["0x1F"], ["0b101"] and ["1_000"]; confirm that
    callers only pass digit-only substrings if those must be rejected. *)
let parse_int s =
  (* [int_of_string_opt] gives the same results as wrapping [int_of_string]
     in a handler, without a catch-all [with _] that could mask unrelated
     exceptions. *)
  int_of_string_opt s
+43
lib/htmlrw_check/specialized/datetime_checker.mli
···
··· 1 + (*--------------------------------------------------------------------------- 2 + Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved. 3 + SPDX-License-Identifier: MIT 4 + ---------------------------------------------------------------------------*) 5 + 6 + (** Datetime attribute validation checker. 7 + 8 + This checker validates the [datetime] attribute on [<del>], [<ins>], 9 + and [<time>] elements. The datetime value must conform to a valid 10 + date, time, or datetime format as specified by HTML5. 11 + 12 + {2 Supported Formats} 13 + 14 + The checker validates these datetime formats: 15 + - Date: [YYYY-MM-DD] (e.g., "2025-12-19") 16 + - Month: [YYYY-MM] (e.g., "2025-12") 17 + - Year: [YYYY] (e.g., "2025") 18 + - Week: [YYYY-Www] (e.g., "2025-W51") 19 + - Time: [HH:MM] or [HH:MM:SS] (e.g., "14:30:00") 20 + - Datetime: Date followed by time with separator (e.g., "2025-12-19T14:30") 21 + - Timezone offsets: [+HH:MM] or [-HH:MM] or [Z] 22 + - Duration: [P] prefix followed by duration components 23 + 24 + {2 Validation Rules} 25 + 26 + - Month values must be 01-12 27 + - Day values must be valid for the given month 28 + - Leap years are correctly handled for February 29th 29 + - Hour values must be 00-23 30 + - Minute and second values must be 00-59 31 + - Week numbers must be 01-53 32 + 33 + {2 Error Messages} 34 + 35 + Reports [Bad_value] when the datetime attribute contains an invalid 36 + format or out-of-range values. 37 + 38 + @see <https://html.spec.whatwg.org/multipage/common-microsyntaxes.html#dates-and-times> 39 + HTML Standard: Dates and times 40 + *) 41 + 42 + val checker : Checker.t 43 + (** The datetime attribute checker instance. *)
+37
lib/htmlrw_check/specialized/dl_checker.mli
···
··· 1 + (*--------------------------------------------------------------------------- 2 + Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved. 3 + SPDX-License-Identifier: MIT 4 + ---------------------------------------------------------------------------*) 5 + 6 + (** DL element content model validation checker. 7 + 8 + This checker validates that [<dl>] (description list) elements follow 9 + the HTML5 content model requirements. Description lists must contain 10 + [<dt>] (term) and [<dd>] (description) elements in the correct order. 11 + 12 + {2 Content Model} 13 + 14 + A [<dl>] element may contain: 15 + - Zero or more groups of [<dt>] followed by [<dd>] elements 16 + - [<div>] elements wrapping [<dt>]/[<dd>] groups (for styling) 17 + - [<template>] and [<script>] elements (script-supporting) 18 + 19 + {2 Validation Rules} 20 + 21 + - [<dd>] should not appear before any [<dt>] (terms should come first) 22 + - [<dl>] should not be empty (should contain at least one term/description) 23 + - When using [<div>] wrappers, mixing wrapped and unwrapped content 24 + is discouraged 25 + - Each [<div>] in a [<dl>] should contain at least one [<dt>]/[<dd>] group 26 + 27 + {2 Error Messages} 28 + 29 + - [Dl_empty]: The [<dl>] element has no content 30 + - [Dd_before_dt]: A [<dd>] appears before any [<dt>] element 31 + 32 + @see <https://html.spec.whatwg.org/multipage/grouping-content.html#the-dl-element> 33 + HTML Standard: The dl element 34 + *) 35 + 36 + val checker : Checker.t 37 + (** The description list content model checker instance. *)
+35
lib/htmlrw_check/specialized/h1_checker.mli
···
··· 1 + (*--------------------------------------------------------------------------- 2 + Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved. 3 + SPDX-License-Identifier: MIT 4 + ---------------------------------------------------------------------------*) 5 + 6 + (** H1 element counter and validator. 7 + 8 + This checker warns about multiple [<h1>] elements in a document. 9 + While HTML5 technically allows multiple [<h1>] elements when using 10 + the document outline algorithm, this algorithm was never implemented 11 + by browsers and has been removed from the specification. 12 + 13 + {2 Best Practice} 14 + 15 + Documents should have exactly one [<h1>] element that represents the 16 + main heading of the page. Multiple [<h1>] elements can confuse users 17 + and assistive technologies about the document's structure. 18 + 19 + {2 Special Cases} 20 + 21 + - [<h1>] elements inside [<svg>] content (e.g., in [<foreignObject>]) 22 + are not counted, as they may represent different content contexts 23 + - The checker reports a warning after the second [<h1>] is encountered 24 + 25 + {2 Error Messages} 26 + 27 + Reports [Multiple_h1] when more than one [<h1>] element is found 28 + in the document. 29 + 30 + @see <https://html.spec.whatwg.org/multipage/sections.html#the-h1,-h2,-h3,-h4,-h5,-and-h6-elements> 31 + HTML Standard: The h1-h6 elements 32 + *) 33 + 34 + val checker : Checker.t 35 + (** The h1 element counter/validator instance. *)
+2
lib/htmlrw_check/specialized/label_checker.ml
··· 110 | Tag.Html `Label when state.label_depth = 0 -> 111 if state.label_has_role && state.labelable_count > 0 then 112 Message_collector.add_typed collector (`Label `Role_on_ancestor); 113 state.in_label <- false; 114 state.labelable_count <- 0; 115 state.label_for_value <- None;
··· 110 | Tag.Html `Label when state.label_depth = 0 -> 111 if state.label_has_role && state.labelable_count > 0 then 112 Message_collector.add_typed collector (`Label `Role_on_ancestor); 113 + if state.label_has_aria_label && state.labelable_count > 0 then 114 + Message_collector.add_typed collector (`Label `Aria_label_on_ancestor); 115 state.in_label <- false; 116 state.labelable_count <- 0; 117 state.label_for_value <- None;
+41
lib/htmlrw_check/specialized/label_checker.mli
···
··· 1 + (*--------------------------------------------------------------------------- 2 + Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved. 3 + SPDX-License-Identifier: MIT 4 + ---------------------------------------------------------------------------*) 5 + 6 + (** Label element content model validation checker. 7 + 8 + This checker validates that [<label>] elements follow the HTML5 9 + content model requirements. Labels associate text with form controls 10 + and must be used correctly for accessibility. 11 + 12 + {2 Validation Rules} 13 + 14 + - A [<label>] element may contain at most one labelable element 15 + (button, input, meter, output, progress, select, textarea) 16 + - When using the [for] attribute, it should reference an existing 17 + element ID in the document 18 + - Nested labelable elements are not counted (only direct descendants) 19 + 20 + {2 Labelable Elements} 21 + 22 + The following elements can be labeled: 23 + - [<button>] 24 + - [<input>] (except type="hidden") 25 + - [<meter>] 26 + - [<output>] 27 + - [<progress>] 28 + - [<select>] 29 + - [<textarea>] 30 + 31 + {2 Error Messages} 32 + 33 + - Multiple labelable elements inside a single [<label>] 34 + - [for] attribute references a non-existent ID 35 + 36 + @see <https://html.spec.whatwg.org/multipage/forms.html#the-label-element> 37 + HTML Standard: The label element 38 + *) 39 + 40 + val checker : Checker.t 41 + (** The label element content model checker instance. *)
+42
lib/htmlrw_check/specialized/picture_checker.mli
···
··· 1 + (*--------------------------------------------------------------------------- 2 + Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved. 3 + SPDX-License-Identifier: MIT 4 + ---------------------------------------------------------------------------*) 5 + 6 + (** Picture element content model and attribute validation checker. 7 + 8 + This checker validates that [<picture>] elements follow the HTML5 9 + content model requirements and that attributes are used correctly. 10 + 11 + {2 Content Model} 12 + 13 + A [<picture>] element may contain: 14 + - Zero or more [<source>] elements (must come before [<img>]) 15 + - Exactly one [<img>] element (required) 16 + - [<script>] and [<template>] elements (script-supporting) 17 + 18 + {2 Attribute Restrictions} 19 + 20 + The [<picture>] element should not have image-related attributes 21 + directly on it (these belong on the [<img>] child): 22 + - [src], [srcset], [sizes], [alt], [width], [height] 23 + - [crossorigin], [loading], [decoding] 24 + - Legacy attributes like [align], [border], [hspace], etc. 25 + 26 + {2 Source Restrictions in Picture} 27 + 28 + When [<source>] is a child of [<picture>]: 29 + - It must have [srcset] attribute (required) 30 + - It should not have [src] attribute 31 + 32 + {2 Error Messages} 33 + 34 + - Disallowed attributes on [<picture>] or [<source>] in picture context 35 + - Invalid parent elements for [<picture>] 36 + 37 + @see <https://html.spec.whatwg.org/multipage/embedded-content.html#the-picture-element> 38 + HTML Standard: The picture element 39 + *) 40 + 41 + val checker : Checker.t 42 + (** The picture element checker instance. *)
+36
lib/htmlrw_check/specialized/ruby_checker.mli
···
··· 1 + (*--------------------------------------------------------------------------- 2 + Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved. 3 + SPDX-License-Identifier: MIT 4 + ---------------------------------------------------------------------------*) 5 + 6 + (** Ruby element content model validation checker. 7 + 8 + This checker validates that [<ruby>] elements follow the HTML5 9 + content model requirements. Ruby annotations are used for East Asian 10 + typography to show pronunciation or meaning of characters. 11 + 12 + {2 Content Model} 13 + 14 + A [<ruby>] element must contain: 15 + - Phrasing content (the base text) 16 + - One or more [<rt>] elements (the ruby text/annotation) 17 + - Optional [<rp>] elements (fallback parentheses) 18 + 19 + {2 Validation Rules} 20 + 21 + - [<ruby>] must contain at least one [<rt>] element 22 + - There should be phrasing content before the first [<rt>] 23 + - [<rp>] elements should surround [<rt>] for fallback rendering 24 + - Nested [<ruby>] elements are handled correctly 25 + 26 + {2 Error Messages} 27 + 28 + - Ruby element without any [<rt>] child 29 + - Missing base text before ruby annotation 30 + 31 + @see <https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-ruby-element> 32 + HTML Standard: The ruby element 33 + *) 34 + 35 + val checker : Checker.t 36 + (** The ruby element content model checker instance. *)
+34
lib/htmlrw_check/specialized/source_checker.mli
···
··· 1 + (*--------------------------------------------------------------------------- 2 + Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved. 3 + SPDX-License-Identifier: MIT 4 + ---------------------------------------------------------------------------*) 5 + 6 + (** Source element context validation checker. 7 + 8 + This checker validates that [<source>] element attributes are appropriate 9 + for the parent context. The allowed attributes differ based on whether 10 + the source is inside [<picture>], [<video>], or [<audio>]. 11 + 12 + {2 Context-Dependent Rules} 13 + 14 + In [<picture>] context: 15 + - [srcset] is required 16 + - [src] is not allowed 17 + - [media] and [type] are allowed 18 + 19 + In [<video>] or [<audio>] context: 20 + - [src] is required 21 + - [srcset] and [sizes] are not allowed 22 + - [type] is allowed for MIME type hints 23 + 24 + {2 Error Messages} 25 + 26 + - Missing required attributes for the context 27 + - Attributes not allowed in the current context 28 + 29 + @see <https://html.spec.whatwg.org/multipage/embedded-content.html#the-source-element> 30 + HTML Standard: The source element 31 + *) 32 + 33 + val checker : Checker.t 34 + (** The source element context checker instance. *)
-15
lib/htmlrw_check/specialized/srcset_sizes_checker.ml
··· 61 let split_on_space_respecting_parens s = 62 split_respecting_parens ~sep:' ' s |> List.filter (fun s -> s <> "") 63 64 - (** Check if string contains only whitespace *) 65 - let is_whitespace_only s = 66 - String.for_all (fun c -> c = ' ' || c = '\t' || c = '\n' || c = '\r') s 67 - 68 (** Invalid units that are not CSS lengths but might be confused for them *) 69 let invalid_size_units = [ 70 "deg"; "grad"; "rad"; "turn"; (* angle units *) ··· 154 NoCommentError 155 end 156 end 157 - 158 - (** For backward compatibility *) 159 - let has_invalid_css_comment s = 160 - match check_css_comment_position s with 161 - | NoCommentError -> false 162 - | _ -> true 163 164 (** Check if scientific notation has invalid exponent (like 1e+1.5 - decimal in exponent) *) 165 let has_invalid_scientific_notation s = ··· 280 end 281 end 282 end 283 - 284 - let has_valid_size_unit size_value = 285 - match check_size_value size_value with 286 - | Valid -> true 287 - | InvalidUnit (_, _) | NegativeValue | CssCommentAfterSign (_, _) | CssCommentBeforeUnit (_, _) | BadScientificNotation | BadCssNumber (_, _) -> false 288 289 (** Check if a sizes entry has a media condition (starts with '(') *) 290 let has_media_condition entry =
··· 61 let split_on_space_respecting_parens s = 62 split_respecting_parens ~sep:' ' s |> List.filter (fun s -> s <> "") 63 64 (** Invalid units that are not CSS lengths but might be confused for them *) 65 let invalid_size_units = [ 66 "deg"; "grad"; "rad"; "turn"; (* angle units *) ··· 150 NoCommentError 151 end 152 end 153 154 (** Check if scientific notation has invalid exponent (like 1e+1.5 - decimal in exponent) *) 155 let has_invalid_scientific_notation s = ··· 270 end 271 end 272 end 273 274 (** Check if a sizes entry has a media condition (starts with '(') *) 275 let has_media_condition entry =
+50
lib/htmlrw_check/specialized/srcset_sizes_checker.mli
···
··· 1 + (*--------------------------------------------------------------------------- 2 + Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved. 3 + SPDX-License-Identifier: MIT 4 + ---------------------------------------------------------------------------*) 5 + 6 + (** Srcset and sizes attribute validation checker. 7 + 8 + This checker validates the [srcset] and [sizes] attributes on [<img>] 9 + and [<source>] elements. These attributes use a specialized microsyntax 10 + for responsive images. 11 + 12 + {2 Srcset Syntax} 13 + 14 + The [srcset] attribute contains a comma-separated list of image 15 + candidates, each with: 16 + - A URL 17 + - An optional width descriptor ([Nw], e.g., "800w") 18 + - Or an optional pixel density descriptor ([Nx], e.g., "2x") 19 + 20 + Width and pixel density descriptors cannot be mixed in the same srcset. 21 + 22 + {2 Sizes Syntax} 23 + 24 + The [sizes] attribute contains a comma-separated list of: 25 + - Media conditions (optional) 26 + - Source sizes (CSS lengths) 27 + 28 + The last entry should not have a media condition (it's the default). 29 + 30 + {2 Validation Rules} 31 + 32 + - URLs in srcset must be valid 33 + - Width descriptors must be positive integers 34 + - Pixel density descriptors must be positive numbers 35 + - Sizes must use valid CSS length units 36 + - Duplicate descriptors are flagged 37 + 38 + {2 Error Messages} 39 + 40 + - Invalid srcset syntax 41 + - Invalid sizes syntax 42 + - Missing sizes when srcset uses width descriptors 43 + - Invalid CSS length units 44 + 45 + @see <https://html.spec.whatwg.org/multipage/images.html#srcset-attributes> 46 + HTML Standard: Srcset attributes 47 + *) 48 + 49 + val checker : Checker.t 50 + (** The srcset/sizes attribute checker instance. *)
+1 -4
lib/htmlrw_check/specialized/title_checker.ml
··· 6 mutable in_title : bool; 7 mutable title_has_content : bool; 8 mutable title_depth : int; 9 - mutable is_iframe_srcdoc : bool; 10 } 11 12 let create () = { ··· 15 in_title = false; 16 title_has_content = false; 17 title_depth = 0; 18 - is_iframe_srcdoc = false; 19 } 20 21 let reset state = ··· 23 state.has_title <- false; 24 state.in_title <- false; 25 state.title_has_content <- false; 26 - state.title_depth <- 0; 27 - state.is_iframe_srcdoc <- false 28 29 let start_element state ~element _collector = 30 (match element.Element.tag with
··· 6 mutable in_title : bool; 7 mutable title_has_content : bool; 8 mutable title_depth : int; 9 } 10 11 let create () = { ··· 14 in_title = false; 15 title_has_content = false; 16 title_depth = 0; 17 } 18 19 let reset state = ··· 21 state.has_title <- false; 22 state.in_title <- false; 23 state.title_has_content <- false; 24 + state.title_depth <- 0 25 26 let start_element state ~element _collector = 27 (match element.Element.tag with
+28
lib/htmlrw_check/specialized/title_checker.mli
···
··· 1 + (*--------------------------------------------------------------------------- 2 + Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved. 3 + SPDX-License-Identifier: MIT 4 + ---------------------------------------------------------------------------*) 5 + 6 + (** Title element validation checker. 7 + 8 + This checker validates that documents have a proper [<title>] element 9 + with meaningful content. The title is important for accessibility, 10 + SEO, and browser tab identification. 11 + 12 + {2 Validation Rules} 13 + 14 + - Documents should have exactly one [<title>] element in the [<head>] 15 + - The [<title>] element should contain non-whitespace text 16 + - Empty titles are flagged as errors 17 + 18 + {2 Error Messages} 19 + 20 + - [Empty_title]: The title element is empty or contains only whitespace 21 + - [Missing_title]: No title element found in the document head 22 + 23 + @see <https://html.spec.whatwg.org/multipage/semantics.html#the-title-element> 24 + HTML Standard: The title element 25 + *) 26 + 27 + val checker : Checker.t 28 + (** The title element checker instance. *)
+4 -55
lib/htmlrw_check/specialized/unknown_element_checker.ml
··· 1 (** Unknown HTML element checker. 2 3 Detects elements that are not in the HTML5 specification and produces 4 - appropriate error messages. Custom elements (with hyphens) are allowed. *) 5 - 6 - (** Set of all known HTML5 element names. *) 7 - let known_elements = 8 - let elements = [ 9 - (* Document metadata *) 10 - "html"; "head"; "title"; "base"; "link"; "meta"; "style"; 11 - 12 - (* Sections *) 13 - "body"; "article"; "section"; "nav"; "aside"; "h1"; "h2"; "h3"; "h4"; "h5"; "h6"; 14 - "hgroup"; "header"; "footer"; "address"; "main"; 15 - 16 - (* Grouping content *) 17 - "p"; "hr"; "pre"; "blockquote"; "ol"; "ul"; "menu"; "li"; "dl"; "dt"; "dd"; 18 - "figure"; "figcaption"; "div"; 19 20 - (* Text-level semantics *) 21 - "a"; "em"; "strong"; "small"; "s"; "cite"; "q"; "dfn"; "abbr"; "ruby"; "rt"; "rp"; 22 - "data"; "time"; "code"; "var"; "samp"; "kbd"; "sub"; "sup"; "i"; "b"; "u"; "mark"; 23 - "bdi"; "bdo"; "span"; "br"; "wbr"; "search"; 24 - 25 - (* Edits *) 26 - "ins"; "del"; 27 - 28 - (* Embedded content *) 29 - "picture"; "source"; "img"; "iframe"; "embed"; "object"; "video"; "audio"; 30 - "track"; "map"; "area"; "math"; "svg"; 31 - 32 - (* Tables *) 33 - "table"; "caption"; "colgroup"; "col"; "tbody"; "thead"; "tfoot"; "tr"; "td"; "th"; 34 - 35 - (* Forms *) 36 - "form"; "label"; "input"; "button"; "select"; "datalist"; "optgroup"; "option"; 37 - "textarea"; "output"; "progress"; "meter"; "fieldset"; "legend"; 38 - 39 - (* Interactive *) 40 - "details"; "summary"; "dialog"; 41 - 42 - (* Scripting *) 43 - "script"; "noscript"; "template"; "slot"; "canvas"; 44 - 45 - (* Deprecated but still recognized *) 46 - "param"; 47 - ] in 48 - let tbl = Hashtbl.create (List.length elements) in 49 - List.iter (fun el -> Hashtbl.add tbl el ()) elements; 50 - tbl 51 - 52 - (** Check if an element name is a custom element (contains hyphen). *) 53 - let is_custom_element name = 54 - String.contains name '-' 55 - 56 - (** Check if an element name is known. 
*) 57 - let is_known_element name = 58 - let name_lower = String.lowercase_ascii name in 59 - Hashtbl.mem known_elements name_lower || is_custom_element name_lower 60 61 type state = { 62 mutable stack : string list; (* Parent element stack *)
··· 1 (** Unknown HTML element checker. 2 3 Detects elements that are not in the HTML5 specification and produces 4 + appropriate error messages. Custom elements (with hyphens) are allowed. 5 6 + Note: Unknown element detection is performed by the parser, which marks 7 + unrecognized elements as [Tag.Unknown]. This checker produces appropriate 8 + error messages for those elements. *) 9 10 type state = { 11 mutable stack : string list; (* Parent element stack *)
+40
lib/htmlrw_check/specialized/unknown_element_checker.mli
···
··· 1 + (*--------------------------------------------------------------------------- 2 + Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved. 3 + SPDX-License-Identifier: MIT 4 + ---------------------------------------------------------------------------*) 5 + 6 + (** Unknown HTML element checker. 7 + 8 + This checker detects elements that are not in the HTML5 specification 9 + and produces appropriate error messages. Custom elements (names 10 + containing hyphens) are allowed per the Web Components specification. 11 + 12 + {2 Recognized Elements} 13 + 14 + Recognition is performed by the parser, which marks unrecognized elements as [Tag.Unknown]. All standard HTML5 elements are recognized, including: 15 + - Document metadata (html, head, title, etc.) 16 + - Sections (body, article, section, nav, etc.) 17 + - Grouping content (p, div, ul, ol, etc.) 18 + - Text-level semantics (a, em, strong, span, etc.) 19 + - Embedded content (img, video, audio, iframe, etc.) 20 + - Tabular data (table, tr, td, th, etc.) 21 + - Forms (form, input, button, select, etc.) 22 + - Interactive elements (details, dialog, summary) 23 + - Scripting (script, noscript, template) 24 + 25 + {2 Custom Elements} 26 + 27 + Element names containing a hyphen are treated as custom elements 28 + and are allowed without warning (e.g., [<my-component>], [<app-header>]). 29 + 30 + {2 Error Messages} 31 + 32 + Reports [Unknown_element] for unrecognized element names that are 33 + not valid custom elements. 34 + 35 + @see <https://html.spec.whatwg.org/multipage/custom-elements.html> 36 + HTML Standard: Custom elements 37 + *) 38 + 39 + val checker : Checker.t 40 + (** The unknown element checker instance. *)
+68
lib/htmlrw_check/specialized/url_checker.mli
···
··· 1 + (*--------------------------------------------------------------------------- 2 + Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved. 3 + SPDX-License-Identifier: MIT 4 + ---------------------------------------------------------------------------*) 5 + 6 + (** URL validation checker. 7 + 8 + This checker validates URL attributes ([href], [src], [action], etc.) 9 + on HTML elements. It checks for common URL issues and security concerns. 10 + 11 + {2 Validated Attributes} 12 + 13 + The checker validates URLs in these attributes: 14 + - [href] on [<a>], [<area>], [<base>], [<link>] 15 + - [src] on [<audio>], [<embed>], [<iframe>], [<img>], [<input>], 16 + [<script>], [<source>], [<track>], [<video>] 17 + - [action] on [<form>], and [formaction] on [<button>] 18 + - [cite] on [<blockquote>], [<del>], [<ins>], [<q>] 19 + - [data] on [<object>] 20 + - [poster] on [<video>] 21 + - [value] on [<input type="url">] 22 + 23 + {2 Validation Rules} 24 + 25 + - URLs should be well-formed (parseable) 26 + - Relative URLs are allowed 27 + - Fragment-only URLs ([#anchor]) are valid 28 + - Data URLs are validated for proper structure 29 + - javascript: URLs may trigger warnings 30 + - Empty URLs are flagged on elements that require them 31 + 32 + {2 Error Messages} 33 + 34 + - [Bad_url]: Malformed URL that cannot be parsed 35 + - [Empty_url]: Required URL attribute is empty 36 + - Various URL-specific validation errors 37 + 38 + @see <https://url.spec.whatwg.org/> 39 + URL Standard 40 + *) 41 + 42 + (** {1 URL Parsing Utilities} *) 43 + 44 + val extract_scheme : string -> string option 45 + (** [extract_scheme url] extracts the scheme (protocol) from a URL. 
46 + 47 + @param url The URL to parse 48 + @return [Some scheme] if a valid scheme is found (e.g., "http", "https"), 49 + [None] if no scheme is present or the URL is relative *) 50 + 51 + val validate_url : string -> string -> string -> string option 52 + (** [validate_url url element_name attr_name] validates a URL. 53 + 54 + Performs comprehensive validation including: 55 + - Checking for empty URLs on elements that require them 56 + - Validating scheme, host, port, path, query, and fragment 57 + - Checking for illegal characters and encoding issues 58 + - Validating special schemes (http, https, etc.) 59 + 60 + @param url The URL to validate 61 + @param element_name The element containing the URL attribute 62 + @param attr_name The attribute name 63 + @return [Some error_message] if the URL is invalid, [None] if valid *) 64 + 65 + (** {1 Checker} *) 66 + 67 + val checker : Checker.t 68 + (** The URL validation checker instance. *)
+56
lib/htmlrw_check/xhtml_parser.mli
···
··· 1 + (*--------------------------------------------------------------------------- 2 + Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved. 3 + SPDX-License-Identifier: MIT 4 + ---------------------------------------------------------------------------*) 5 + 6 + (** XHTML parser using xmlm for proper XML parsing. 7 + 8 + This module provides XML parsing for XHTML files. While the HTML5 parser 9 + handles most content, XHTML requires proper XML parsing to correctly handle: 10 + 11 + - Self-closing tags on non-void elements (e.g., [<div/>]) 12 + - XML namespaces for SVG and MathML 13 + - Strict XML well-formedness requirements 14 + 15 + {2 Usage} 16 + 17 + {[ 18 + if Xhtml_parser.is_xhtml_file (Some "page.xhtml") then 19 + match Xhtml_parser.parse_xhtml content with 20 + | Ok root -> (* Process the XHTML document root *) 21 + | Error msg -> (* Handle parse error *) 22 + ]} 23 + *) 24 + 25 + (** {1 Types} *) 26 + 27 + type xhtml_doc = { 28 + root : Html5rw.Dom.node; 29 + (** The document root node. *) 30 + errors : Html5rw.Error.t list; 31 + (** Parse errors (empty for valid XML). *) 32 + } 33 + (** An XHTML document representation. *) 34 + 35 + (** {1 Parsing} *) 36 + 37 + val parse_xhtml : string -> (Html5rw.Dom.node, string) result 38 + (** [parse_xhtml content] parses XHTML content using xmlm. 39 + 40 + @param content The XHTML content as a string 41 + @return [Ok root] with the document root on success, 42 + [Error message] with parse error details on failure *) 43 + 44 + val is_xhtml_file : string option -> bool 45 + (** [is_xhtml_file system_id] checks if a system_id indicates an XHTML file. 46 + 47 + @param system_id The optional file path or identifier 48 + @return [true] if the path ends with ".xhtml" *) 49 + 50 + (** {1 Document Access} *) 51 + 52 + val xhtml_root : xhtml_doc -> Html5rw.Dom.node 53 + (** [xhtml_root doc] returns the document root node. 
*) 54 + 55 + val xhtml_errors : xhtml_doc -> Html5rw.Error.t list 56 + (** [xhtml_errors doc] returns the parse errors (empty for valid XML). *)